\documentclass[12pt]{amsart}
\usepackage{fullpage}
\usepackage{amssymb}
\usepackage{url}
\usepackage{comment}
\pagestyle{empty}
\title{The MPFR Library: Algorithms and Proofs}
\author{The MPFR team}
\date{\tt www.mpfr.org}
\def\O{{\mathcal O}}
\def\n{\textnormal}
\def\pinf{\bigtriangleup}
\def\minf{\bigtriangledown}
\def\q{\hspace*{5mm}}
\def\ulp{{\rm ulp}}
\def\Exp{{\rm EXP}}
\def\prec{{\rm prec}}
\def\sign{{\rm sign}}
\def\Paragraph#1{\noindent {\sc #1}}
\def\Z{{\mathcal Z}}
\def\Q{{\mathbb Q}}
\def\N{{\mathcal N}}
\def\If{{\bf if}}
\def\then{{\bf then}}
\def\Else{{\bf else}}
\def\elif{{\bf elif}}
\def\for{{\bf for}}
\def\to{{\bf to}}
\def\while{{\bf while}}
\def\err{{\rm err}}
\newcommand{\U}[1]{\quad \mbox{[Rule~\ref{#1}]}}
\newtheorem{Rule}{Rule}
\newtheorem{lemma}{Lemma}
\begin{document}
\maketitle
\tableofcontents

\section{Error calculus}

Let $n$ --- the working precision --- be a positive integer (considered fixed in the following). We write any nonzero real number $x$ in the form $x = m \cdot 2^e$ with $\frac{1}{2} \le |m| < 1$ and $e := {\rm EXP}(x)$, and we define $\ulp(x) := 2^{{\rm EXP}(x) - n}$.

\subsection{Ulp calculus}

\begin{Rule} \label{R1}
$2^{-n} |x| < \ulp(x) \le 2^{-n+1} |x|$.
\end{Rule}
\begin{proof}
Obvious from $x = m \cdot 2^e$ with $\frac{1}{2} \le |m| < 1$.
\end{proof}

\begin{Rule} \label{R2}
If $a$ and $b$ have the same precision $n$, and $|a| \le |b|$, then $\ulp(a) \le \ulp(b)$.
\end{Rule}
\begin{proof}
Write $a = m_a \cdot 2^{e_a}$ and $b = m_b \cdot 2^{e_b}$. Then $|a| \le |b|$ implies $e_a \le e_b$, thus $\ulp(a) = 2^{e_a-n} \le 2^{e_b-n} = \ulp(b)$.
\end{proof}

\begin{Rule} \label{R3}
Let $x$ be a real number, and $y = \circ(x)$. Then $|x - y| \leq \frac{1}{2} {\rm min}(\ulp(x), \ulp(y))$ in rounding to nearest, and $|x - y| \leq {\rm min}(\ulp(x), \ulp(y))$ for the other rounding modes.
\end{Rule}
\begin{proof}
First consider rounding to nearest. By definition, we have $|x-y| \leq \frac{1}{2} \ulp(y)$. If $\ulp(y) \leq \ulp(x)$, then $|x-y| \leq \frac{1}{2} \ulp(y) \leq \frac{1}{2} \ulp(x)$. The only difficult case is when $\ulp(x) < \ulp(y)$, but then necessarily $y$ is a power of two; since in that case $y - \frac{1}{2} \ulp(y)$ is exactly representable, the maximal possible difference between $x$ and $y$ is $\frac{1}{4} \ulp(y) = \frac{1}{2} \ulp(x)$, which concludes the proof in the rounding to nearest case.

In rounding to zero, we always have $\ulp(y) \leq \ulp(x)$, so the rule holds. In rounding away from zero, the only difficult case is when $\ulp(x) < \ulp(y)$, but then $y$ is a power of two, and since $y - \frac{1}{2} \ulp(y)$ is exactly representable, the maximal possible difference between $x$ and $y$ is $\frac{1}{2} \ulp(y) = \ulp(x)$, which concludes the proof.
\end{proof}

\begin{Rule} \label{R4}
$\frac{1}{2} |a| \cdot \ulp(b) < \ulp(a b) < 2 |a| \cdot \ulp(b)$.
\end{Rule}
\begin{proof}
Write $a = m_a 2^{e_a}$, $b = m_b \cdot 2^{e_b}$, and $a b = m 2^e$ with $\frac{1}{2} \le m_a, m_b, m < 1$, then $\frac{1}{4} 2^{e_a+e_b} \le |a b| < 2^{e_a+e_b}$, thus $e = e_a + e_b$ or $e = e_a + e_b - 1$, which implies $\frac{1}{2} 2^{e_a} \ulp(b) \le \ulp(a b) \le 2^{e_a} \ulp(b)$ using $2^{e_b-n} = \ulp(b)$, and the rule follows from the fact that $|a| < 2^{e_a} \le 2|a|$ (equality on the right side can occur only if $e = e_a + e_b$ and $m_a = \frac{1}{2}$, which are incompatible).
\end{proof}

\begin{Rule} \label{R5}
$\ulp(2^k a) = 2^k \ulp(a)$.
\end{Rule}
\begin{proof}
Easy: if $a = m_a \cdot 2^{e_a}$, then $2^k a = m_a \cdot 2^{e_a+k}$.
\end{proof}
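In code, these definitions translate directly: $\ulp(x)$ is a power of two read off the exponent field. The following C sketch (our helper, {\em not} MPFR source code) computes $\ulp(x) = 2^{\Exp(x)-n}$ with the public MPFR API, assuming $x$ is a regular nonzero number:
\begin{verbatim}
#include <stdio.h>
#include <gmp.h>
#include <mpfr.h>

/* Sketch (not MPFR source): rop <- ulp(x) = 2^(EXP(x)-n), n = prec(x).
   Assumes x is a regular nonzero number. */
static void
ulp_of (mpfr_t rop, mpfr_srcptr x)
{
  mpfr_set_ui_2exp (rop, 1, mpfr_get_exp (x) - mpfr_get_prec (x),
                    MPFR_RNDN);   /* exact: a power of two */
}

int main (void)
{
  mpfr_t x, u;
  mpfr_init2 (x, 24);
  mpfr_init2 (u, 24);
  mpfr_set_d (x, 3.141592, MPFR_RNDN);
  ulp_of (u, x);                 /* EXP(x) = 2, so u = 2^(2-24) */
  mpfr_printf ("ulp(x) = %Re\n", u);
  mpfr_clears (x, u, (mpfr_ptr) 0);
  return 0;
}
\end{verbatim}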
\begin{Rule} \label{R6}
Let $x > 0$, $o(\cdot)$ be any rounding, and $u := o(x)$, then $\frac{1}{2} u < x < 2 u$.
\end{Rule}
\begin{proof}
Assume $x \geq 2 u$; then $2u$ is another representable number, which is closer to $x$ than $u$ is, a contradiction. The same argument proves $\frac{1}{2} u < x$.
\end{proof}

\begin{Rule} \label{R7}
$\frac{1}{2} |a| \cdot \ulp(1) < \ulp(a) \leq |a| \cdot \ulp(1)$.
\end{Rule}
\begin{proof}
The left inequality comes from Rule~\ref{R4} with $b=1$, and the right one from $|a| \ulp(1) \geq \frac{1}{2} 2^{e_a} 2^{1-n} =\ulp(a)$.
\end{proof}

\begin{Rule} \label{R8}
For any $x \neq 0$ and any rounding mode $o(\cdot)$, we have $\ulp(x) \leq \ulp(o(x))$, and equality holds when rounding towards zero, towards $-\infty$ for $x>0$, or towards $+\infty$ for $x<0$.
\end{Rule}
\begin{proof}
Without loss of generality, assume $x > 0$. Let $x = m \cdot 2^{e_x}$ with $\frac{1}{2} \leq m < 1$. As $\frac{1}{2} 2^{e_x}$ is a machine number, necessarily $o(x) \geq \frac{1}{2} 2^{e_x}$, thus by Rule~\ref{R2}, $\ulp(o(x)) \geq 2^{e_x - n} = \ulp(x)$. If we round towards zero, then $o(x) \leq x$, and by Rule~\ref{R2} again, $\ulp(o(x)) \leq \ulp(x)$.
\end{proof}

\begin{Rule} \label{R9}
\begin{eqnarray}\nonumber
&&\n{For}\;\; error(u) \leq k_u \ulp(u),\;\; u \cdot c_u^- \leq x \leq u \cdot c_u^+\\\nonumber
&&\n{with}\;\; c_u^{-}=1- k_u 2^{1-p} \n{ and } c_u^{+}=1+ k_u 2^{1-p},
\end{eqnarray}
where $p$ is the working precision.
\begin{eqnarray}\nonumber
&&\n{For}\;\; u=o(x),\;\; u \cdot c_u^- \leq x \leq u \cdot c_u^+\\\nonumber
&&\n{if}\;\; u=\pinf(x),\n{ then } c_u^+=1\\\nonumber
&&\n{if}\;\; u=\minf(x),\n{ then } c_u^-=1\\\nonumber
&&\n{if}\;\; x<0 \n{ and } u=Z(x),\n{ then } c_u^+=1 \\\nonumber
&&\n{if}\;\; x>0 \n{ and } u=Z(x),\n{ then } c_u^-=1 \\\nonumber
&&\n{else}\;\; c_u^{-}=1-2^{1-p} \n{ and } c_u^{+}=1+2^{1-p}
\end{eqnarray}
\end{Rule}

\subsection{Relative error analysis}

Another way to get a bound on the error is to bound the relative error. This is sometimes easier than using the ``ulp calculus'', especially when performing only multiplications or divisions.

\begin{Rule} \label{R10}
If $u := \circ_p(x)$, then we can write both:
\[ u = x (1 + \theta_1), \qquad x = u (1 + \theta_2), \]
where $|\theta_i| \leq 2^{-p}$ for rounding to nearest, and $|\theta_i| < 2^{1-p}$ for directed rounding.
\end{Rule}
\begin{proof}
This is a simple consequence of Rule~\ref{R3}. For rounding to nearest, we have $|u-x| \leq \frac{1}{2} \ulp(t)$ for $t=u$ or $t=x$, hence by Rule~\ref{R1} $|u-x| \leq 2^{-p} |t|$.
\end{proof}

\begin{Rule} \label{R11}
Assume $x_1, \ldots, x_n$ are $n$ floating-point numbers in precision $p$, and we compute an approximation of their product with the following sequence of operations: $u_1 = x_1, u_2 = \circ(u_1 x_2), \ldots, u_n = \circ(u_{n-1} x_n)$. If rounding away from zero, the total rounding error is bounded by $2(n-1) \ulp(u_n)$.
\end{Rule}
\begin{proof}
We can write $u_1 x_2 = u_2 (1 - \theta_2), \ldots, u_{n-1} x_n = u_n (1 - \theta_n)$, where $0 \leq \theta_i \leq 2^{1-p}$. We get $x_1 x_2 \ldots x_n = u_n (1 - \theta_2) \ldots (1 - \theta_n)$, which we can write $u_n (1-\theta)^{n-1}$ for some $0 \leq \theta \leq 2^{1-p}$ by the intermediate value theorem. Since $1-nt \leq (1-t)^n \leq 1$, we get $|x_1 x_2 \ldots x_n - u_n| \leq (n-1) 2^{1-p} |u_n| \leq 2 (n-1) \ulp(u_n)$ by Rule~\ref{R1}.
\end{proof}
%
\section{Generic error on basic operations}

\subsection{Generic error of addition/subtraction}\label{generic:sous}

We want to compute the generic error of the subtraction $w = o(u - v)$; the following bounds apply to the addition too.
\begin{eqnarray}\nonumber
\textnormal{Note:}&& error(u) \leq k_u \, \ulp(u), \;\; error(v) \leq k_v \, \ulp(v)
\end{eqnarray}
\begin{eqnarray}\nonumber
\textnormal{Note:}&& \ulp(w)=2^{e_w-p}, \;\; \ulp(u)=2^{e_u-p},\;\; \ulp(v)=2^{e_v-p}\;\;\textnormal{with} \; p \; \textnormal{the working precision} \\\nonumber
&& \ulp(u)=2^{d+e_w-p}, \;\; \ulp(v)=2^{d^{'}+e_w-p},\;\;\textnormal{with} \;\;d=e_u-e_w, \;\; d^{'}=e_v-e_w
\end{eqnarray}
\begin{eqnarray}\nonumber
error(w)& \leq &c_w \ulp(w) + k_u \ulp(u) + k_v \ulp(v) \\\nonumber
&\leq&(c_w+k_u 2^d+ k_v 2^{d^{'}}) \ulp(w)
\end{eqnarray}
\begin{eqnarray}\nonumber
&&\textnormal{If} \;\; ( u \geq 0 \;\;\textnormal{and}\;\; v \geq 0) \;\;\textnormal{or}\;\; (u \leq 0 \;\;\textnormal{and}\;\; v \leq 0)
\end{eqnarray}
\begin{eqnarray}\nonumber
error(w)& \leq&(c_w + k_u + k_v) \, \ulp(w)
\end{eqnarray}
\begin{eqnarray}\nonumber
\textnormal{Note:}&&\textnormal{If}\;\; w=N(u \pm v) \;\;\textnormal{Then}\;\; c_w =\frac{1}{2} \;\;\textnormal{else}\;\; c_w =1\\\nonumber
\end{eqnarray}

\subsection{Generic error of multiplication}\label{generic:mul}

We want to compute the generic error of the multiplication, where $u$ and $v$ are approximations of the exact values $x$ and $y$. We assume here $u, v > 0$.
\begin{eqnarray}\nonumber
w&=&o(u \cdot v) \\\nonumber
\textnormal{Note:}&& error(u) \leq k_u \, \ulp(u), \;\; error(v) \leq k_v \, \ulp(v)
\end{eqnarray}
\begin{eqnarray}\nonumber
error(w)& = &|w - x y| \\\nonumber
& \leq &|w - u v| +|u v - x y| \\\nonumber
& \leq & c_w \ulp(w) + \frac{1}{2} [|u v-u y|+|u y-x y|+|u v-x v|+|x v-x y|]\\\nonumber
& \leq & c_w \ulp(w) + \frac{u+x}{2} k_v \ulp(v) + \frac{v+y}{2} k_u \ulp(u)\\\nonumber
& \leq & c_w \ulp(w) + \frac{u(1+c_u^+)}{2} k_v \ulp(v) + \frac{v(1+c_v^+)}{2} k_u \ulp(u) \U{R9}\\\nonumber
& \leq & c_w \ulp(w) + (1+c_u^+) k_v \ulp(u v) + (1+c_v^+) k_u \ulp(u v) \U{R4}\\\nonumber
& \leq & [ c_w + (1+c_u^+) k_v + (1+c_v^+) k_u ] \ulp(w)\U{R8}\\\nonumber
\end{eqnarray}
\begin{eqnarray}\nonumber
\textnormal{Note:}&&\textnormal{If}\;\; w=N(u \cdot v) \;\;\textnormal{Then}\;\; c_w =\frac{1}{2} \;\;\textnormal{else}\;\; c_w =1
\end{eqnarray}

\subsection{Generic error of inverse}\label{generic:inv}

We want to compute the generic error of the inverse. We assume $u > 0$.
\begin{eqnarray}\nonumber
w&=&o(\frac{1}{u}) \\\nonumber
\textnormal{Note:}&& error(u) \leq k_u \, \ulp(u)
\end{eqnarray}
\begin{eqnarray}\nonumber
error(w)& = &|w - \frac{1}{x}| \\\nonumber
& \leq &|w - \frac{1}{u}| +|\frac{1}{u} - \frac{1}{x}| \\\nonumber
& \leq & c_w \ulp(w) + \frac{1}{ux}|u-x| \\\nonumber
& \leq & c_w \ulp(w) + \frac{k_u}{ux} \ulp(u)
\end{eqnarray}
\begin{eqnarray}\nonumber
\textnormal{Note:}&& \frac{u}{c_u} \leq x \U{R6}\\\nonumber
&&\n{with } c_u=1 \n{ for } u=\minf(x), \n{ else } c_u=2\\\nonumber
&& \n{then: } \frac{1}{x} \leq c_u \frac{1}{u}
\end{eqnarray}
\begin{eqnarray}\nonumber
error(w)& \leq & c_w \ulp(w) + c_u\frac{k_u}{u^2} \ulp(u)\\\nonumber
& \leq & c_w \ulp(w) + 2 c_u k_u \, \ulp(\frac{u}{u^2}) \U{R4}\\\nonumber
& \leq & [c_w + 2 c_u k_u] \, \ulp(w) \U{R8}
\end{eqnarray}
\begin{eqnarray}\nonumber
\textnormal{Note:}&&\textnormal{If}\;\; w=N(\frac{1}{u}) \;\;\textnormal{Then}\;\; c_w =\frac{1}{2} \;\;\textnormal{else}\;\; c_w =1\\\nonumber
\end{eqnarray}
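As a concrete instance of this bound, take $k_u = 1$ and $w = N(\frac{1}{u})$, so that $c_w = \frac{1}{2}$ and (in the worst case) $c_u = 2$:
\[ error(w) \leq [\textstyle\frac{1}{2} + 2 \cdot 2 \cdot 1] \, \ulp(w) = 4.5 \, \ulp(w), \]
so about $\lceil \log_2 4.5 \rceil = 3$ extra bits of working precision absorb the propagated error.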
\subsection{Generic error of division}\label{generic:div}

We want to compute the generic error of the division. Without loss of generality, we assume all variables are positive.
\begin{eqnarray}\nonumber
w&=&o(\frac{u}{v}) \\\nonumber
\textnormal{Note:}&& error(u) \leq k_u \, \ulp(u), \;\; error(v) \leq k_v \, \ulp(v)
\end{eqnarray}
\begin{eqnarray}\nonumber
error(w)& = &|w - \frac{x}{y}| \\\nonumber
& \leq &|w - \frac{u}{v}| +|\frac{u}{v} - \frac{x}{y}| \\\nonumber
& \leq & c_w \ulp(w) + \frac{1}{vy}|uy-vx| \\\nonumber
& \leq & c_w \ulp(w) + \frac{1}{vy}[|uy-xy|+|xy-vx| ]\\\nonumber
& \leq & c_w \ulp(w) + \frac{1}{vy}[y k_u \ulp(u)+ x k_v \ulp(v)]\\\nonumber
& \leq & c_w \ulp(w) + \frac{k_u}{v} \ulp(u)+ \frac{k_v x}{vy} \ulp(v)
\end{eqnarray}
\begin{eqnarray}\nonumber
\textnormal{Note:}&& \frac{\ulp(u)}{v} \leq 2 \ulp(\frac{u}{v}) \U{R4}\\\nonumber
&& 2 \ulp(\frac{u}{v}) \leq 2 \ulp(w) \U{R8}
\end{eqnarray}
\begin{eqnarray}\nonumber
\textnormal{Note:}&& x \leq c_u u \textnormal{ and } \frac{v}{c_v} \leq y \U{R6}\\\nonumber
&&\n{with } c_u=1 \n{ for } u=\pinf(x), \n{ else } c_u=2,\\\nonumber
&&\n{and } c_v=1 \n{ for } v=\minf(y), \n{ else } c_v=2\\\nonumber
&& \n{then: } \frac{x}{y} \leq c_u c_v \frac{u}{v}
\end{eqnarray}
\begin{eqnarray}\nonumber
error(w)& \leq & c_w \ulp(w) + 2 k_u \ulp(w)+ c_u c_v \frac{k_v u}{v^2} \ulp(v)\\\nonumber
& \leq & c_w \ulp(w) + 2 k_u \ulp(w)+ 2 c_u c_v k_v \, \ulp(\frac{u v}{v^2}) \U{R4}\\\nonumber
& \leq & [c_w + 2 k_u + 2 c_u c_v k_v] \, \ulp(w) \U{R8}
\end{eqnarray}
\begin{eqnarray}\nonumber
\textnormal{Note:}&&\textnormal{If}\;\; w=N(\frac{u}{v}) \;\;\textnormal{Then}\;\; c_w =\frac{1}{2} \;\;\textnormal{else}\;\; c_w =1
\end{eqnarray}
Note that we can obtain a slightly different result by writing $uy-vx = (uy - uv) + (uv - vx)$ instead of $(uy-xy)+(xy-vx)$.

Another result can be obtained using a relative error analysis. Assume $x = u (1 + \theta_u)$ and $y = v (1 + \theta_v)$. Then $|\frac{u}{v} - \frac{x}{y}| \leq \frac{1}{vy} |uy - uv| + \frac{1}{vy} |uv - xv| = \frac{u}{y} (|\theta_u|+|\theta_v|)$. If $v \leq y$ and $\frac{u}{v} \leq w$, this is bounded by $w (|\theta_u|+|\theta_v|)$.

\subsection{Generic error of square root}\label{generic:sqrt}

We want to compute the generic error of the square root of a floating-point number $u$, itself an approximation to a real $x$, with $|u - x| \leq k_u \ulp(u)$. If $v = o(\sqrt{u})$, then:
\begin{eqnarray}\nonumber
error(v) := |v - \sqrt{x}| & \leq &|v - \sqrt{u}| +|\sqrt{u} - \sqrt{x}| \\\nonumber
& \leq & c_v \ulp(v) + \frac{1}{\sqrt{u} + \sqrt{x}}|u-x| \\\nonumber
& \leq & c_v \ulp(v) + \frac{1}{\sqrt{u} + \sqrt{x}} k_u \ulp(u) \\\nonumber
\end{eqnarray}
Since by Rule~\ref{R9} we have $u \cdot c_u^- \leq x$, it follows $\frac{1}{\sqrt{x}+\sqrt{u}} \leq \frac{1}{\sqrt{u}\,(1+\sqrt{c_u^-})}$:
\begin{eqnarray}\nonumber
error(v)& \leq & c_v \ulp(v) + \frac{1}{\sqrt{u}\,(1+\sqrt{c_u^-})} k_u \ulp(u) \\\nonumber
& \leq & c_v \ulp(v) + \frac{2}{1+\sqrt{c_u^-}} k_u \ulp(\sqrt{u}) \U{R4}\\\nonumber
& \leq & (c_v + \frac{2k_u}{1+\sqrt{c_u^-}}) \ulp(v). \U{R8}\\\nonumber
\end{eqnarray}
If $u \leq x$, we have $c_u^-=1$ and we get the simpler formula $|v-\sqrt{x}| \leq (c_v + k_u) \ulp(v)$.
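For instance, if $u$ is obtained from $x$ by a single rounding towards zero ($k_u = 1$ and $u \leq x$, hence $c_u^- = 1$) and $v = N(\sqrt{u})$ (so $c_v = \frac{1}{2}$), the simpler formula specializes to
\[ |v - \sqrt{x}| \leq (\textstyle\frac{1}{2} + 1) \, \ulp(v) = \textstyle\frac{3}{2} \, \ulp(v). \]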
\subsection{Generic error of the exponential}\label{generic:exp}

We want to compute the generic error of the exponential.
\begin{eqnarray}\nonumber
v&=&o(e^{u}) \\\nonumber
\textnormal{Note:}&& error(u) \leq k_u \, \ulp(u)\\\nonumber
\end{eqnarray}
\begin{eqnarray}\nonumber
error(v)& = &|v - e^{x}| \\\nonumber
& \leq &|v - e^{u}| +|e^{u} - e^{x}| \\\nonumber
& \leq & c_v \ulp(v) + e^t |u-x| \n{ by the mean value theorem, for some } t \n{ between } x \n{ and } u
\end{eqnarray}
\begin{eqnarray}\nonumber
error(v)& \leq & c_v \ulp(v) + c_u^* e^u k_u \ulp(u) \\\nonumber
& \leq & c_v \ulp(v) + 2 c_u^* u k_u \ulp(e^u) \U{R4}\\\nonumber
& \leq & (c_v + 2 c_u^* u k_u )\ulp(v) \U{R8}\\\nonumber
& \leq & (c_v + c_u^* 2^{\Exp(u)+1} k_u )\ulp(v)
\end{eqnarray}
\begin{eqnarray}\nonumber
\textnormal{Note:}&& u= m_u 2^{e_u} \n{ and }\ulp(u)=2^{e_u-p} \n{ with } p \n{ the working precision} \\\nonumber
\n{ Case }&x \leq u& c_u^* =1 \\\nonumber
\n{ Case }&u \leq x& \\\nonumber
&& x \leq u + k_u \ulp(u)\\\nonumber
&& e^x \leq e^u e^{k_u \ulp(u)}\\\nonumber
&&e^x \leq e^u e^{k_u 2^{e_u-p}}\\\nonumber
&\n{then}& c_u^* = e^{k_u 2^{\Exp(u)-p}}\\\nonumber
\end{eqnarray}

\subsection{Generic error of the logarithm}\label{generic:log}

We want to compute the generic error of the logarithm.
\begin{eqnarray}\nonumber
v&=&o(\log{u}) \\\nonumber
\textnormal{Note:}&& error(u) \leq k_u \, \ulp(u)\\\nonumber
\end{eqnarray}
\begin{eqnarray}\nonumber
error(v)& = &|v - \log{x}| \\\nonumber
& \leq &|v - \log{u}| +|\log{u} - \log{x}| \\\nonumber
& \leq & c_v \ulp(v) + |\log{\frac{x}{u}}| \\\nonumber
& \leq & c_v \ulp(v) + \frac{|x-u|}{u} \\\nonumber
& \leq & c_v \ulp(v) + \frac{k_u \, \ulp(u)}{u}\\\nonumber
& \leq & c_v \ulp(v) + k_u \, \ulp(1) \U{R7}\\\nonumber
\end{eqnarray}
\begin{eqnarray}\nonumber
\textnormal{Note:}&& \ulp(1)=2^{1-p}, \n{ and } \ulp(v)=2^{e_v-p} \n{ with } p \n{ the working precision} \\\nonumber
&& \ulp(1)= 2^{1-e_v+e_v-p}=2^{1-e_v} \ulp(v)
\end{eqnarray}
\begin{eqnarray}\nonumber
error(v)& \leq & c_v \ulp(v) + k_u 2^{1-e_v} \ulp(v)\\\nonumber
& \leq & (c_v + k_u 2^{1-e_v} )\ulp(v)\\\nonumber
\end{eqnarray}

\subsection{Ulp calculus vs relative error}

The error in ulp (ulp-error) and the relative error are related as follows. Let $n$ be the working precision. Consider $u = \circ(x)$; then the error on $u$ is at most $\ulp(u) = 2^{\Exp(u)-n} \leq |u| \cdot 2^{1-n}$, thus the relative error is $\leq 2^{1-n}$. Conversely, if the relative error is $\leq \delta$, then the error is at most $\delta |u| \leq \delta 2^n \ulp(u)$. (Going from the ulp-error to the relative error and back, we lose a factor of two.)

It is sometimes more convenient to use the relative error instead of the error in ulp (ulp-error), in particular when only multiplications or divisions are made. In that case, Higham \cite{Higham02} proposes the following framework: we associate to each variable the cumulated number $k$ of roundings that were made. The $i$th rounding introduces a relative error of $\delta_i$, with $|\delta_i| \leq 2^{1-n}$, i.e.\ the computed result is $1+\delta_i$ times the exact result. Hence $k$ successive roundings give an error factor of $(1+\delta_1) (1+\delta_2) \cdots (1+\delta_k)$, which is between $(1-\varepsilon)^k$ and $(1+\varepsilon)^k$ with $\varepsilon = 2^{1-n}$. In particular, if all roundings are away from zero, the final relative error is at most $k \varepsilon = k \cdot 2^{1-n}$, thus at most $2k$ ulps.
\begin{lemma} \label{rel_ulp}
If a value is computed by $k$ successive multiplications or divisions, each with rounding away from zero, and precision $n$, then the final error is bounded by $2k$ ulps.
\end{lemma}
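The following sketch (our test harness, {\em not} MPFR code) checks Lemma~\ref{rel_ulp} numerically: it multiplies $k+1$ random numbers in precision $n$ with rounding away from zero ({\tt MPFR\_RNDA}, which requires MPFR~$\geq 3.0$), recomputes the product exactly in a much higher precision, and verifies that the difference is below $2k \, \ulp(u)$:
\begin{verbatim}
#include <stdio.h>
#include <gmp.h>
#include <mpfr.h>

int main (void)
{
  mpfr_prec_t n = 53;
  int k = 10;                     /* number of rounded multiplications */
  mpfr_t u, exact, err, ulp;
  gmp_randstate_t state;
  int i;

  gmp_randinit_default (state);
  mpfr_init2 (u, n);
  /* (k+2)*n bits make every reference multiplication exact */
  mpfr_inits2 ((k + 2) * n, exact, err, ulp, (mpfr_ptr) 0);

  mpfr_urandomb (u, state);                 /* u = x_1 */
  mpfr_set (exact, u, MPFR_RNDN);           /* exact copy */
  for (i = 0; i < k; i++)
    {
      mpfr_t x;
      mpfr_init2 (x, n);
      mpfr_urandomb (x, state);
      mpfr_mul (u, u, x, MPFR_RNDA);        /* u_{i+1} = o(u_i x), away */
      mpfr_mul (exact, exact, x, MPFR_RNDN);/* exact: precision suffices */
      mpfr_clear (x);
    }
  mpfr_sub (err, exact, u, MPFR_RNDA);
  mpfr_abs (err, err, MPFR_RNDA);
  /* ulp(u) = 2^(EXP(u) - n), then the bound 2k ulp(u) */
  mpfr_set_ui_2exp (ulp, 1, mpfr_get_exp (u) - n, MPFR_RNDN);
  mpfr_mul_ui (ulp, ulp, 2 * k, MPFR_RNDN);
  printf ("error below 2k ulp: %s\n",
          mpfr_cmp (err, ulp) <= 0 ? "yes" : "no");
  mpfr_clears (u, exact, err, ulp, (mpfr_ptr) 0);
  gmp_randclear (state);
  return 0;
}
\end{verbatim}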
\section{Low level functions}

\subsection{The {\tt mpfr\_add} function}

\begin{verbatim}
mpfr_add (A, B, C, rnd)
   /* we assume B and C have the same sign, and EXP(B) >= EXP(C) */
0. d = EXP(B) - EXP(C) /* d >= 0 by hypothesis */
1. Let B1 be the prec(A) most significant bits of B, and B0 the rest;
   let C1 be the bits of C corresponding to B1, and C0 the rest
   /* B0, C1, C0 may be empty, but not B1 */

     <----------- A ---------->
     <----------- B1 ---------><------ B0 ----->
        <---------- C1 -------><------------ C0 ----------->

2. A <- B1 + (C1 >> d)
3. q <- compute_carry (B0, C0, rnd)
4. A <- A + q
\end{verbatim}

\subsection{The {\tt mpfr\_cmp2} function}

This function computes the exponent shift when subtracting $c > 0$ from $b \ge c$. In other terms, if ${\rm EXP}(x) := \lfloor \frac{\log x}{\log 2} \rfloor$, it returns ${\rm EXP}(b) - {\rm EXP}(b-c)$.

This function admits the following specification in terms of the binary representation of the mantissa of $b$ and $c$: if $b = u 1 0^n r$ and $c = u 0 1^n s$, where $u$ is the longest common prefix to $b$ and $c$, and $(r,s)$ do not start with $(0, 1)$, then ${\tt mpfr\_cmp2}(b,c)$ returns $|u| + n$ if $r \ge s$, and $|u| + n + 1$ otherwise, where $|u|$ is the number of bits of $u$.

As it is not very efficient to compare $b$ and $c$ bit by bit, we propose the following algorithm, which compares $b$ and $c$ word by word. Here $b[n]$ represents the $n$th word of the mantissa of $b$, starting from the most significant word $b[0]$, which has its most significant bit set. The values $c[n]$ represent the words of $c$, after a possible shift if the exponent of $c$ is smaller than that of $b$.
\begin{verbatim}
n = 0; res = 0;
while (b[n] == c[n]) {
  n++;
  res += BITS_PER_MP_LIMB;
}
/* now b[n] > c[n] and the first res bits coincide */
dif = b[n] - c[n];
while (dif == 1) {
  n++;
  dif = (dif << BITS_PER_MP_LIMB) + b[n] - c[n];
  res += BITS_PER_MP_LIMB;
}
/* now dif > 1 */
res += BITS_PER_MP_LIMB - number_of_bits(dif);
if (!is_power_of_two(dif))
  return res;
/* otherwise result is res + (low(b) < low(c)) */
do
  n++;
while (b[n] == c[n]);
return res + (b[n] < c[n]);
\end{verbatim}
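For instance, take $b = 0.1011_2$ and $c = 0.1001_2$: the longest common prefix is $u = 10$, then $b$ carries a $1$ and $c$ a $0$, with $n = 0$ and $r = 1 \geq s = 1$, so {\tt mpfr\_cmp2} returns $|u| + n = 2$. Indeed $b - c = 0.0010_2$, and ${\rm EXP}(b) - {\rm EXP}(b-c) = -1 - (-3) = 2$.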
\subsection{The {\tt mpfr\_sub} function}

The algorithm used is as follows, where $w$ denotes the number of bits per word. We assume that $a$, $b$ and $c$ denote different variables (if $a:=b$ or $a:=c$, we have first to copy $b$ or $c$), and that the rounding mode is either $\N$ (nearest), $\Z$ (towards zero), or $\infty$ (away from zero).

\begin{quote}
Algorithm {\tt mpfr\_sub}. \\
Input: $b$, $c$ of opposite sign with $b > c > 0$, a rounding mode $\circ \in \{ \N, \Z, \infty \}$ \\
Side effect: store in $a$ the value of $\circ(b - c)$ \\
Output: $0$ if $\circ(b - c) = b-c$, $1$ if $\circ(b - c) > b-c$, and $-1$ if $\circ(b - c) < b-c$ \\
${\tt an} \leftarrow \lceil \frac{\prec(a)}{w} \rceil$, ${\tt bn} \leftarrow \lceil \frac{\prec(b)}{w} \rceil$, ${\tt cn} \leftarrow \lceil \frac{\prec(c)}{w} \rceil$ \\
${\tt cancel} \leftarrow {\tt mpfr\_cmp2}(b, c)$; \quad ${\tt diff\_exp} \leftarrow \Exp(b)-\Exp(c)$ \\
${\tt shift_b} \leftarrow (-{\tt cancel}) \bmod w$; \quad ${\tt cancel_b} \leftarrow ({\tt cancel} + {\tt shift_b})/w$ \\
\If\ ${\tt shift_b} > 0$ \then\ ${\tt b}[0 \dots \mbox{\tt bn}] \leftarrow {\tt mpn\_rshift}({\tt b}[0 \dots {\tt bn}-1], {\tt shift_b})$; ${\tt bn} \leftarrow {\tt bn} + 1$ \\
${\tt shift_c} \leftarrow ({\tt diff\_exp}-{\tt cancel}) \bmod w$; \quad ${\tt cancel_c} \leftarrow ({\tt cancel} + {\tt shift_c}-{\tt diff\_exp})/w$ \\
\If\ ${\tt shift_c} > 0$ \then\ ${\tt c}[0 \dots \mbox{\tt cn}] \leftarrow {\tt mpn\_rshift}({\tt c}[0 \dots {\tt cn}-1], {\tt shift_c})$; ${\tt cn} \leftarrow {\tt cn} + 1$ \\
$\Exp(a) \leftarrow \Exp(b) - {\tt cancel}$; \quad $\sign(a) \leftarrow \sign(b)$ \\
$a[0 \dots {\tt an}-1] \leftarrow b[{\tt bn} - {\tt cancel_b} - {\tt an} \dots {\tt bn} - {\tt cancel_b} - 1]$ \\
$a[0 \dots {\tt an}-1] \leftarrow a[0 \dots {\tt an}-1] - c[{\tt cn} - {\tt cancel_c} - {\tt an} \dots {\tt cn} - {\tt cancel_c} - 1]$ \\
${\tt sh} \leftarrow {\tt an} \cdot w - \prec(a)$; \quad $r \leftarrow a[0] \bmod 2^{\tt sh}$; \quad $a[0] \leftarrow a[0] - r$ \\
\If\ $\circ = \N$ and ${\tt sh} > 0$ \then \\
\q \If\ $r > 2^{{\tt sh}-1}$ \then\ $a \leftarrow a + \ulp(a)$; return $1$ \elif\ $0 < r < 2^{{\tt sh}-1}$ \then\ return $-1$ \\
\elif\ $\circ \in \{ \Z, \infty \}$ and $r > 0$ \then \\
\q \If\ $\circ = \Z$ return $-1$ \Else\ $a \leftarrow a + \ulp(a)$; return $1$ \\
${\tt bl} \leftarrow {\tt bn} - {\tt an} - {\tt cancel_b}$ \\
${\tt cl} \leftarrow {\tt cn} - {\tt an} - {\tt cancel_c}$ \\
\for\ $k=0$ \while\ ${\tt bl} > 0$ and ${\tt cl} > 0$ {\bf do} \\
\q ${\tt bl} \leftarrow {\tt bl} - 1$; ${\tt bp} \leftarrow b[{\tt bl}]$ \\
\q ${\tt cl} \leftarrow {\tt cl} - 1$; ${\tt cp} \leftarrow c[{\tt cl}]$ \\
\q \If\ $\circ = \N$ and $k=0$ and ${\tt sh}=0$ \then \\
\q \q \If\ ${\tt cp} \ge 2^{w-1}$ \then\ return $-1$ \\
\q \q $r \leftarrow {\tt bp} - {\tt cp}$; \quad ${\tt cp} \leftarrow {\tt cp} + 2^{w-1}$ \\
\q \If\ ${\tt bp} < {\tt cp}$ \then \\
\q \q \If\ $\circ = \Z$ \then\ $a \leftarrow a - \ulp(a)$; \quad \If\ $\circ = \infty$ \then\ return $1$ \Else\ return $-1$ \\
\q \If\ ${\tt bp} > {\tt cp}$ \then \\
\q \q \If\ $\circ = \Z$ \then\ return $-1$ \Else\ $a \leftarrow a + \ulp(a)$; return $1$ \\
\If\ $\circ = \N$ and $r > 0$ \then \\
\q \If\ $a[0] \, {\rm div} \, 2^{\tt sh}$ is odd \then\ $a \leftarrow a + \ulp(a)$; return $1$ \Else\ return $-1$ \\
Return $0$.
\end{quote}
where $b[i]$ and $c[i]$ are meant as $0$ for negative $i$, and $c[i]$ is meant as $0$ for $i \ge {\tt cn}$ (${\tt cancel_b} \ge 0$, but ${\tt cancel_c}$ may be negative).
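The $0/1/{-1}$ output above is what MPFR exposes as the \emph{ternary value} returned by {\tt mpfr\_sub}. A minimal illustration with the public API (the sign convention of the returned {\tt int} matches the algorithm's output specification; the example values are arbitrary):
\begin{verbatim}
#include <stdio.h>
#include <gmp.h>
#include <mpfr.h>

int main (void)
{
  mpfr_t a, b, c;
  int inex;

  mpfr_init2 (b, 53);
  mpfr_init2 (c, 53);
  mpfr_init2 (a, 10);            /* low precision forces rounding */
  mpfr_set_ui (b, 1000003, MPFR_RNDN);
  mpfr_set_ui (c, 3, MPFR_RNDN);
  inex = mpfr_sub (a, b, c, MPFR_RNDN);
  /* inex = 0 if a = b-c exactly, > 0 if a > b-c, < 0 if a < b-c;
     here b-c = 1000000 needs 14 bits, so inex != 0 */
  printf ("ternary value: %d\n", inex);
  mpfr_clears (a, b, c, (mpfr_ptr) 0);
  return 0;
}
\end{verbatim}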
\subsection{The {\tt mpfr\_mul} function}

{\tt mpfr\_mul} uses two algorithms: if the precision of the operands is small enough, a plain multiplication using {\tt mpn\_mul} is used (there is no error, except in the final rounding); otherwise it uses {\tt mpfr\_mulhigh\_n}.

In this case, it truncates the two operands to $m$ limbs: $1/2 \leq b < 1$ and $1/2 \leq c < 1$, writing $b = bh+bl$ and $c = ch+cl$, where $bh$ and $ch$ consist of the $m$ most significant limbs and $B=2^{32}$ or $2^{64}$ is the limb base. The error comes from:
\begin{itemize}
\item Truncation: $\leq bl \cdot ch + bh \cdot cl + bl \cdot cl \leq bl + cl \leq 2 B^{-m}$.
\item Mulder's short product: assuming ${\rm error}({\rm Mulder}(n)) \leq {\rm error}({\rm mulhigh\_basecase}(n))$,
\begin{eqnarray*}
{\rm error}({\rm mulhigh}(n)) &\leq& (n-1) (B-1)^2 B^{-n-2} + \cdots + 1 \cdot (B-1)^2 B^{-2 n} \\
&=& \sum_{i=1}^{n-1}{(n-i) (B-1)^2 B^{-n-1-i}}
 = (B-1)^2 B^{-n-1} \sum_{i=1}^{n-1}{(n-i) B^{-i}} \\
&=& (B-1)^2 B^{-n-1} \frac{B^{1-n}-n+n B-B}{(1-B)^2}
 \leq n B^{-n}.
\end{eqnarray*}
\end{itemize}
Total error: $\leq (m+2) B^{-m}$.

\subsection{The {\tt mpfr\_div} function}

The goals of the code of the {\tt mpfr\_div} function include the fact that the complexity should, while preserving correct rounding, depend on the precision required on the result rather than on the precision given on the operands. Let $u$ be the dividend, $v$ the divisor, and $p$ the target precision for the quotient. We denote by $q$ the real quotient $u/v$, with infinite precision, and $n \geq p$ the working precision. The idea --- as in the square root algorithm below --- is to use GMP's integer division: dividing the most significant $2n$ or $2n-1$ bits of $u$ by the most significant $n$ bits of $v$ gives a good approximation of the quotient's integer significand. The main difficulties arise when $u$ and $v$ have a larger precision than $2n$ and $n$ respectively, since we have to truncate them. We distinguish two cases: whether the divisor is truncated or not.

\subsubsection{Full divisor.}
This is the easy case. Write $u = u_1 + u_0$ where $u_0$ is the truncated part, and $v = v_1$. Without loss of generality we can assume that $\ulp(u_1)=\ulp(v_1)=1$, thus $u_1$ and $v_1$ are integers, and $0 \leq u_0 < 1$. Since $v_1$ has $n$ significant bits, we have $2^{n-1} \leq v_1 < 2^n$. (We normalize $u$ so that the integer quotient gives exactly $n$ bits; this is easy by comparing the most significant bits of $u$ and $v$, thus $2^{2n-2} \leq u_1 < 2^{2n}$.) The integer division of $u_1$ by $v_1$ yields $q_1$ and $r$ such that $u_1 = q_1 v_1 + r$, with $0 \leq r < v_1$, and $q_1$ having exactly $n$ bits. In that case we have
\[ q_1 \leq q=\frac{u}{v} < q_1 + 1. \]
Indeed, $q = \frac{u}{v} \geq \frac{u_1}{v_1} = \frac{q_1 v_1 + r}{v_1}$, and $q \leq \frac{u_1+u_0}{v_1} \leq q_1 + \frac{r + u_0}{v_1} < q_1 + 1$, since $r + u_0 < r+1 \leq v_1$.

\subsubsection{Truncated divisor.}
This is the hard case. Write $u = u_1 + u_0$, and $v = v_1 + v_0$, where $0 \leq u_0, v_0 < 1$ with the same conventions as above. We prove in that case that:
\begin{equation} \label{diveq}
q_1-2 < q = \frac{u}{v} < q_1 + 1.
\end{equation}
The upper bound holds as above. For the lower bound, we have $u - (q_1-2) v > u_1 - (q_1-2) (v_1+1) \geq q_1 v_1 - (q_1-2) (v_1+1) = 2 (v_1+1) - q_1 \geq 2^n - q_1 > 0$. This lower bound is the best possible, since $q_1-1$ would be wrong; indeed, consider $n=3$, $v_1=4$, $v_0 = 7/8$, $u=24$: this gives $q_1 = 6$, but $u/v = 64/13 < q_1-1 = 5$.

As a consequence of Eq.~(\ref{diveq}), if the open interval $(q_1-2, q_1+1)$ contains no rounding boundary for the target precision, we can deduce the correct rounding of $u/v$ just from the value of $q_1$. In other words, for directed rounding, the only two ``bad cases'' are when the binary representation of $q_1$ ends with $\underbrace{0000}_{n-p}$ or $\underbrace{0001}_{n-p}$. We can even decide if rounding is correct, since when $q_1$ ends with $0010$, the exact value cannot end with $0000$, and similarly when $q_1$ ends with $1111$. Hence if $n=p+k$, i.e.~if we use $k$ extra bits with respect to the target precision $p$, the failure probability is $2^{1-k}$.
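A sketch of this detection at the integer level, using GMP's {\tt mpz} layer (MPFR works on {\tt mpn} limb arrays, but the logic is the same; the example values of $u_1$ and $v_1$ are arbitrary):
\begin{verbatim}
#include <stdio.h>
#include <gmp.h>

int main (void)
{
  mpz_t u1, v1, q1, r, low;
  unsigned long n = 24, p = 16;     /* working and target precisions */

  mpz_inits (u1, v1, q1, r, low, NULL);
  /* u1: a 48-bit truncation of the dividend,
     v1: a 24-bit truncation of the divisor (arbitrary examples) */
  mpz_ui_pow_ui (u1, 2, 47);
  mpz_add_ui (u1, u1, 12345);
  mpz_set_ui (v1, 9000001);
  mpz_tdiv_qr (q1, r, u1, v1);      /* u1 = q1 * v1 + r, 0 <= r < v1 */
  mpz_fdiv_r_2exp (low, q1, n - p); /* the low n-p bits of q1 */
  if (mpz_cmp_ui (low, 1) <= 0)     /* q1 ends in 00...00 or 00...01 */
    printf ("bad case: q1 alone does not determine the rounding\n");
  else
    printf ("directed rounding can be decided from q1\n");
  mpz_clears (u1, v1, q1, r, low, NULL);
  return 0;
}
\end{verbatim}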
\subsubsection{Avoiding Ziv's strategy.}
In the failure case ($q_1$ ending with $000\ldots 000x$ with directed rounding, or $100 \ldots 000x$ with rounding to nearest), we could try again with a larger working precision $n$. However, we then need to perform a second division, and we are not sure this new computation will enable us to conclude. In fact, we can conclude directly. Recall that $u_1 = q_1 v_1 + r$. Thus $u = q_1 v + (r + u_0 - q_1 v_0)$. We have to decide which of the following five cases holds: (a) $q_1 - 2 < q < q_1 - 1$, (b) $q = q_1-1$, (c) $q_1-1 < q < q_1$, (d) $q=q_1$, (e) $q_1 < q < q_1+1$.
\begin{quote}
$s \leftarrow q_1 v_0$ \\
\textbf{if} $s < r + u_0$ \textbf{then} $q \in (q_1, q_1+1)$ \\
\textbf{elif} $s = r + u_0$ \textbf{then} $q = q_1$ \\
\textbf{else} \\
% u = (q1-1) v + (r + u_0 - s + v)
\q $t \leftarrow s - (r + u_0)$ \\
% u = (q1-1) v + (v - t)
\q \textbf{if} $t < v$ \textbf{then} $q \in (q_1 - 1, q_1)$ \\
\q \textbf{elif} $t=v$ \textbf{then} $q = q_1-1$ \\
\q \textbf{else} $q \in (q_1 - 2, q_1-1)$
\end{quote}
\begin{comment}
Let $u = u_n 2^{u_e}$, $v = v_n 2^{v_e}$, where $u_n$ and $v_n$ are in $[1/2, 1[$. Let $q_p$ be the precision required on $q$. Put $b_p = \min(v_p, q_p + \varepsilon_p)$, $a_p = b_p + q_p + \varepsilon_p$, where $\varepsilon_p$ is a small value to be chosen. First, an integer division of $u_{hi} = \lfloor u_n 2^{a_p} \rfloor$ by $v_{hi} = \lfloor v_n 2^{b_p} \rfloor$ is performed. Write $u_{hi} = \tilde{q} v_{hi} + \tilde{r}$. If this division is not a full one, to obtain the real value of the quotient, if $\delta = max(u_p, v_p)$, we have to divide $u_n 2^{q_p + \varepsilon_p + \delta}$ by $v_n 2^{\delta}$. In that case, $2^{q_p + \varepsilon_p + \delta} u_n = \tilde{q}v_n 2^{\delta} + \tilde{r} 2^{\delta - q_p - \varepsilon_p} + u_{lo} - \tilde{q}v_{lo}$, with obvious notations. A positive correction on $q$ has to come from the contribution of $\tilde{r} 2^{\delta - q_p - \varepsilon_p} + u_{lo}$. The first term is at most $v_{hi} 2^{\delta - q_p - \varepsilon_p}$. As for $u_{lo}$, we have $u_{lo} < 2^{\delta-q_p-\varepsilon_p}$. Hence, the sum $u_{lo} + \tilde{r} 2^{\delta - q_p - \varepsilon_p} < 2v$, and the positive correction is at most 1. We now have to estimate $\tilde{q}v_{lo}$. It is easily seen that $\tilde{q} < 2^{q_p + \varepsilon_p + 1}$. As for $v_{lo}$, we have $v_{lo} < 2^{\delta - q_p - \varepsilon_p}$, so that $\tilde{q} v_{lo} < 2^{\delta + 1}$, to be compared with $v_n 2^{\delta}$, so that a negative correction is at most 3. As a consequence, to be able to decide rounding after the first stage, one should choose $\varepsilon_p \geq 3$ (to include the round-to-nearest case).
\end{comment}

\subsection{The {\tt mpfr\_sqrt} function}

The \texttt{mpfr\_sqrt} implementation uses the \texttt{mpn\_sqrtrem} function from GMP's \texttt{mpn} level: given a positive integer $m$, it computes $s$ and $r$ such that $m = s^2 + r$ with $s^2 \leq m < (s+1)^2$, or equivalently $0 \leq r \leq 2s$. In other words, $s$ is the integer square root of $m$, rounded towards zero.
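For illustration, the same operation is available at GMP's {\tt mpz} level (a sketch, not the code path MPFR actually uses):
\begin{verbatim}
#include <stdio.h>
#include <gmp.h>

int main (void)
{
  mpz_t m, s, r;
  mpz_inits (m, s, r, NULL);
  mpz_set_ui (m, 1000001);
  mpz_sqrtrem (s, r, m);   /* s = floor(sqrt(m)), r = m - s^2 */
  gmp_printf ("m = %Zd, s = %Zd, r = %Zd\n", m, s, r);
  /* prints s = 1000, r = 1: a nonzero remainder, so for directed
     rounding the result significand is s or s+1 */
  mpz_clears (m, s, r, NULL);
  return 0;
}
\end{verbatim}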
The idea is to multiply the input significand by some power of two, in order to obtain an integer significand $m$ whose integer square root $s$ will have exactly $p$ bits, where $p$ is the target precision. This is easy: $m$ should have either $2p$ or $2p-1$ bits. For directed rounding, we then know that the result significand will be either $s$ or $s+1$, depending on the square root remainder $r$ being zero or not.
\begin{quote}
Algorithm $\texttt{FPSqrt}$. \\
Input: $x = m \cdot 2^e$, a target precision $p$, a rounding mode $\circ$ \\
Output: $y = \circ_p(\sqrt{x})$ \\
If $e$ is odd, $(m', f) \leftarrow (2m, e-1)$, else $(m',f) \leftarrow (m,e)$ \\
Write $m' := m_1 2^{2k} + m_0$, $m_1$ having $2p$ or $2p-1$ bits, $0 \leq m_0 < 2^{2k}$ \\
$(s, r) \leftarrow \texttt{SqrtRem}(m_1)$ \\
If round to zero or down or $r=m_0=0$, return $s \cdot 2^{k+f/2}$ \\
else return $(s+1) \cdot 2^{k+f/2}$.
\end{quote}
In case the input has more than $2p$ or $2p-1$ bits, it needs to be truncated, but the crucial point is that the truncated part will not overlap with the remainder $r$ from the integer square root, so the \emph{sticky bit} is simply zero when both parts are zero.

For rounding to nearest, the simplest way is to ask $p+1$ bits for the integer square root --- thus $m$ has now $2p+1$ or $2p+2$ bits. In such a way, we directly get the rounding bit, which is the parity bit of $s$, and the sticky bit is determined as above. Otherwise, we have to compare the value of the whole remainder, i.e.~$r$ plus the possible truncated input, with $s + 1/4$, since $(s+1/2)^2 = s^2 + s + 1/4$. Note that equality can occur --- i.e.~the ``nearest even rounding rule'' --- only when the input has at least $2p+1$ bits; in particular it cannot happen in the common case when input and output have the same precision.

\section{High level functions}

\subsection{The cosine function}

To evaluate $\cos x$ with a target precision of $n$ bits, we use the following algorithm with working precision $m$:
\begin{quote}
$k \leftarrow \lfloor \sqrt{n/2} \rfloor$ \\
$r \leftarrow x^2$ rounded up \\ % err <= ulp(r)
$r \leftarrow r/2^{2k}$ \\ % err <= ulp(r)
$s \leftarrow 1, t \leftarrow 1$ \\ % err = 0
{\bf for} $l$ {\bf from} $1$ {\bf while} ${\rm EXP}(t) \ge -m$ \\
\q $t \leftarrow t \cdot r$ rounded up \\ % err <= (3*l-1)*ulp(t)
\q $t \leftarrow \frac{t}{(2l-1)(2l)}$ rounded up \\ % err <= 3*l*ulp(t)
\q $s \leftarrow s + (-1)^l t$ rounded down \\ % err <= l/2^m
{\bf do} $k$ times \\
\q $s \leftarrow 2 s^2$ rounded up \\
\q $s \leftarrow s - 1$ \\
return $s$ \\
\end{quote}
The error on $r$ after $r \leftarrow x^2$ is at most $1 \ulp(r)$ and remains $1 \ulp(r)$ after $r \leftarrow r/2^{2k}$ since that division is just an exponent shift. By induction, the error on $t$ after step $l$ of the for-loop is at most $3 l \ulp(t)$. Hence as long as $3 l \ulp(t)$ remains less than $2^{-m}$ during that loop (this is possible as soon as $r < 1/\sqrt{2}$) and the loop goes to $l_0$, the error on $s$ after the for-loop is at most $2 l_0 2^{-m}$ (for $|r| < 1$, it is easy to check that $s$ will remain in the interval $[\frac{1}{2}, 1[$, thus $\ulp(s) = 2^{-m}$). (An additional $2^{-m}$ term represents the truncation error, but for $l=1$ the value of $t$ is exact, giving $(2 l_0 - 1) + 1 = 2 l_0$.)

Denoting by $\epsilon_i$ the maximal error on $s$ after the $i$th step in the do-loop, we have $\epsilon_0 = 2 l_0 2^{-m}$ and $\epsilon_{k+1} \le 4 \epsilon_k + 2^{-m}$, giving $\epsilon_k \le (2 l_0+1/3) 2^{2k-m}$.
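The loop above translates almost literally into C with the public MPFR API. The following is a sketch of that algorithm for $0 < x < 1$ (not the actual {\tt mpfr\_cos} source, which contains further refinements); the working precision $m$ is taken from the precision of the output variable:
\begin{verbatim}
#include <math.h>
#include <stdio.h>
#include <gmp.h>
#include <mpfr.h>

/* Sketch of the cosine algorithm above for 0 < x < 1, target n bits;
   the working precision m is the precision of s. */
static void
cos_sketch (mpfr_t s, const mpfr_t x, mpfr_prec_t n)
{
  mpfr_prec_t m = mpfr_get_prec (s);
  unsigned long k = (unsigned long) sqrt (n / 2.0), l;
  mpfr_t r, t;

  mpfr_inits2 (m, r, t, (mpfr_ptr) 0);
  mpfr_sqr (r, x, MPFR_RNDU);              /* r <- x^2, rounded up   */
  mpfr_div_2ui (r, r, 2 * k, MPFR_RNDU);   /* r <- r/2^(2k), exact   */
  mpfr_set_ui (s, 1, MPFR_RNDN);
  mpfr_set_ui (t, 1, MPFR_RNDN);
  for (l = 1; mpfr_get_exp (t) >= - (mpfr_exp_t) m; l++)
    {
      mpfr_mul (t, t, r, MPFR_RNDU);       /* t <- t*r, rounded up   */
      /* (2l-1)(2l) assumed to fit an unsigned long */
      mpfr_div_ui (t, t, (2*l - 1) * (2*l), MPFR_RNDU);
      if (l % 2)  mpfr_sub (s, s, t, MPFR_RNDD); /* s+(-1)^l t, down */
      else        mpfr_add (s, s, t, MPFR_RNDD);
    }
  for (; k > 0; k--)
    {
      mpfr_sqr (s, s, MPFR_RNDU);
      mpfr_mul_2ui (s, s, 1, MPFR_RNDU);   /* s <- 2 s^2, rounded up */
      mpfr_sub_ui (s, s, 1, MPFR_RNDN);    /* s <- s - 1             */
    }
  mpfr_clears (r, t, (mpfr_ptr) 0);
}

int main (void)
{
  mpfr_t x, s;
  mpfr_init2 (x, 64);
  mpfr_init2 (s, 80);                      /* m = 80 > n = 64 */
  mpfr_set_d (x, 0.625, MPFR_RNDN);
  cos_sketch (s, x, 64);
  mpfr_printf ("cos(0.625) ~ %.15Rf\n", s); /* ~ 0.8109631... */
  mpfr_clears (x, s, (mpfr_ptr) 0);
  return 0;
}
\end{verbatim}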
\subsection{The sine function}

The sine function is computed from the cosine, with a working precision of $m$ bits:
\begin{quote}
$c \leftarrow \cos x$ rounded away \\
$t \leftarrow c^2$ rounded away \\
$u \leftarrow 1 - t$ rounded to zero \\
$s \leftarrow {\rm sign}(x) \sqrt{u}$ rounded to zero \\
\end{quote}
This algorithm ensures that the approximation $s$ is between zero and $\sin x$.

Since all variables are in $[-1, 1]$, where $\ulp(\cdot) \leq 2^{-m}$, all absolute errors are less than $2^{-m}$. We denote by $\epsilon_i$ a generic error with $0 \leq \epsilon_i < 2^{-m}$. We have $c = \cos x + \epsilon_1$; $t = c^2 + \epsilon_2 = \cos^2 x + 4 \epsilon_3$; $u = 1 - t - \epsilon_4 = 1 - \cos^2 x - 5 \epsilon_5$; $|s| = \sqrt{u} - \epsilon_6 = \sqrt{1 - \cos^2 x - 5 \epsilon_5} - \epsilon_6 \geq |\sin x| - \frac{5 \epsilon_5}{2 |s|} - \epsilon_6$ (by the mean value theorem, $|\sqrt{u} - \sqrt{u'}| \le \frac{1}{2 \sqrt{v}} |u-u'|$ for some $v \in [u, u']$; we apply it here with $u=1 - \cos^2 x - 5 \epsilon_5$, $u'=1 - \cos^2 x$.) Therefore, if $2^{e-1} \leq |s| < 2^e$, the absolute error on $s$ is bounded by $2^{-m} (\frac{5}{2} 2^{1-e}+1) \leq 2^{3-m-e}$.

\subsubsection{An asymptotically fast algorithm for sin and cos.}
We extend here the algorithm proposed by Brent for the exponential function to the simultaneous computation of sin and cos. The idea is the following. We first reduce the input $x$ to the range $0 < x < 1/2$. Then we decompose $x$ as follows:
\[ x = \sum_{i=1}^{k} \frac{r_i}{2^{2^i}}, \]
where $r_i$ is an integer, $0 \leq r_i < 2^{2^{i-1}}$. We define $x_j = \sum_{i=j}^{k} \frac{r_i}{2^{2^i}}$; then $x = x_1$, and we can write $x_j = \frac{r_j}{2^{2^j}} + x_{j+1}$. Thus with $S_j := \sin \frac{r_j}{2^{2^j}}$ and $C_j := \cos \frac{r_j}{2^{2^j}}$:
\[ \sin x_j = S_j \cos x_{j+1} + C_j \sin x_{j+1}, \quad
   \cos x_j = C_j \cos x_{j+1} - S_j \sin x_{j+1}. \]
The $2k$ values $S_j$ and $C_j$ can be computed by a binary splitting algorithm, each one in $\O(M(n) \log n)$. Then each pair $(\sin x_j, \cos x_j)$ can be computed from $(\sin x_{j+1}, \cos x_{j+1})$ with four multiplies and two additions or subtractions.

\paragraph{Error analysis.}
We use here Higham's method. We assume that the values of $S_j$ and $C_j$ are approximated up to a multiplicative factor of the form $(1+u)^3$, where $|u| \leq 2^{-p}$, $p \geq 4$ being the working precision. We also assume that $\cos x_{j+1}$ and $\sin x_{j+1}$ are approximated with a factor of the form $(1+u)^{k_j}$. With rounding to nearest, the values of $S_j \cos x_{j+1}$, $C_j \sin x_{j+1}$, $C_j \cos x_{j+1}$ and $S_j \sin x_{j+1}$ are thus approximated with a factor $(1+u)^{k_j+4}$. The value of $\sin x_j$ is approximated with a factor $(1+u)^{k_j+5}$ since all terms there are nonnegative.

We now analyze the effect of the cancellation in $C_j \cos x_{j+1} - S_j \sin x_{j+1}$. We have $\frac{r_j}{2^{2^j}} < 2^{-2^{j-1}}$, and for simplicity we define $l := 2^{j-1}$; thus $0 \leq S_j \leq 2^{-l}$, and $1-2^{-2l-1} \leq C_j \leq 1$. Similarly we have $x_{j+1} < 2^{-2l}$, thus $0 \leq \sin x_{j+1} \leq 2^{-2l}$, and $1-2^{-4l-1} \leq \cos x_{j+1} \leq 1$. The error is multiplied by a maximal ratio of
\[ \frac{C_j \cos x_{j+1} + S_j \sin x_{j+1}}
        {C_j \cos x_{j+1} - S_j \sin x_{j+1}}
   \leq \frac{1+2^{-l} \cdot 2^{-2l}}{(1-2^{-2l-1})(1-2^{-4l-1})-2^{-l} \cdot 2^{-2l}}, \]
which we can bound by
\[ \frac{1+2^{-3l}}{1-2^{-2l}} \leq \frac{1}{(1-2^{-2l})(1-2^{-3l})}
   \leq \frac{1}{1-2^{-2l+1}}.
\]
The product of all those factors for $j \geq 1$ is bounded by $3$ (remember $l := 2^{j-1}$).

In summary, the maximal error is of the form $3 [(1+u)^{5k}-1]$, where $2^{2^{k-1}} < p \leq 2^{2^k}$. For $p \geq 4$, $5k \cdot 2^{-p}$ is bounded by $5/16$, and $(1+2^{-p})^{5k} - 1 \leq e^{5k \cdot 2^{-p}} - 1 \leq \frac{6}{5} \cdot 5k \cdot 2^{-p} = 6k \cdot 2^{-p}$. Thus the final relative error bound is $18k \cdot 2^{-p}$. Since $k \leq 6$ for $p \leq 2^{64}$, this gives a uniform relative error bound of $2^{-p+7}$.

\subsection{The tangent function}

The tangent function is computed from the cosine, using $\tan x = {\rm sign}(x) \sqrt{\frac{1}{\cos^2 x} - 1}$, with a working precision of $m$ bits:
\begin{quote}
$c \leftarrow \cos x$ rounded down \\ % c <= cos(x) <= 1
$t \leftarrow c^2$ rounded down \\ % t <= cos(x)^2 <= 1
$v \leftarrow 1/t$ rounded up \\ % v >= 1/cos(x)^2 >= 1
$u \leftarrow v - 1$ rounded up \\ % u >= 1/cos(x)^2 - 1
$s \leftarrow {\rm sign}(x) \sqrt{u}$ rounded away from $0$ \\
\end{quote}
The absolute error on $c$ is at most $\ulp(c)$. Hence the error on $t$ is at most $\ulp(t) + 2 c \ulp(c) \le 5 \ulp(t)$,
% err(t) <= ulp(t) + |c^2-c'^2| <= ulp(t) + |c-c'|*|c+c'|
%        <= ulp(t) + 2*ulp(c)*c <= 5*ulp(t)
that on $v$ is at most $\ulp(v) + 5 \ulp(t)/t^2 \le \ulp(v) + 10 \ulp(1/t) \le 11 \ulp(v)$,
% err(v) <= ulp(v) + |1/t-1/t'| <= ulp(v) + |t-t'|/t/t'
%        <= ulp(v) + 5*ulp(t)/t^2 <= ulp(v) + 10*ulp(1/t) <= 11*ulp(v)
that on $u$ is at most $\ulp(u) + 11 \ulp(v) \le (1 + 11 \cdot 2^e) \ulp(u)$ where $e$ is the exponent difference between $v$ and $u$.
% err(u) <= ulp(u) + err(v) <= ulp(v) + 11*ulp(v) <= (1+11*2^e)*ulp(u)
The final error on $s$ is $\le \ulp(s) + (1+11 \cdot 2^e) \ulp(u)/2/\sqrt{u} \le \ulp(s) + (1+11 \cdot 2^e) \ulp(u/\sqrt{u}) \le (2 + 11 \cdot 2^e) \ulp(s)$.
% err(s) <= ulp(s) + |u-u'|/2/sqrt(u) <= ulp(s) + (1+11*2^e)*ulp(u)/2/sqrt(u)
%        <= ulp(s) + (1+11*2^e)*ulp(u/sqrt(u))
%        <= (2+11*2^e)*ulp(s)

\subsection{The exponential function}

The {\tt mpfr\_exp} function implements three different algorithms. For very large precision, it uses a $\O(M(n) \log^2 n)$ algorithm based on binary splitting, using the generic implementation for hypergeometric functions in the file {\tt generic.c} (see \cite{Jeandel00}). Currently this algorithm is used only for precision greater than $13000$ bits.

For smaller precisions, it uses Brent's method; if $r = (x - n \log 2)/2^k$ with $0 \le r \cdot 2^k < \log 2$, then
\[ \exp(x) = 2^n \cdot \exp(r)^{2^k} \]
and $\exp(r)$ is computed using the Taylor expansion:
\[ \exp(r) = 1 + r + \frac{r^2}{2!} + \frac{r^3}{3!} + \cdots \]
As $r < 2^{-k}$, if the target precision is $n$ bits, then only about $l = n/k$ terms of the Taylor expansion are needed. This method thus requires the evaluation of the Taylor series to order $n/k$, and $k$ squares to compute $\exp(r)^{2^k}$. If the Taylor series is evaluated in a na\"{\i}ve way, the optimal value of $k$ is about $n^{1/2}$, giving a complexity of $\O(n^{1/2} M(n))$. This is what is implemented in {\tt mpfr\_exp2\_aux}. If we use a baby step/giant step approach, the Taylor series can be evaluated in $\O(l^{1/2})$ operations, thus the evaluation requires $(n/k)^{1/2} + k$ multiplications, and the optimal $k$ is now about $n^{1/3}$, giving a total complexity of $\O(n^{1/3} M(n))$. This is implemented in the function {\tt mpfr\_exp2\_aux2}.
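The two optimal values of $k$ follow from balancing the series cost against the $k$ squarings. With the na\"{\i}ve evaluation the number of multiplications is about
\[ f(k) = \frac{n}{k} + k, \qquad f'(k) = -\frac{n}{k^2} + 1 = 0
   \;\Longrightarrow\; k = n^{1/2}, \]
giving $f(n^{1/2}) = 2 n^{1/2}$; with baby step/giant step it is about
\[ g(k) = \left(\frac{n}{k}\right)^{1/2} + k, \qquad
   g'(k) = -\frac{1}{2} n^{1/2} k^{-3/2} + 1 = 0
   \;\Longrightarrow\; k = (n/4)^{1/3} = \Theta(n^{1/3}), \]
whence the $\O(n^{1/3} M(n))$ bound.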
\subsection{The error function}

Let $n$ be the target precision, and $x$ be the input value. For $|x| \geq \sqrt{n \log 2}$, we have $|{\rm erf} \, x| = 1$ or $1^{-}$ according to the rounding mode. Otherwise we use the Taylor expansion.

\subsubsection{Taylor expansion}
\[ {\rm erf} \, z = \frac{2}{\sqrt{\pi}} \sum_{k=0}^{\infty}
   \frac{(-1)^k}{k! (2k+1)} z^{2k+1} \]
\begin{quote}
\verb|erf_0|$(z, n)$, assumes $z^2 \le n/e$ \\
working precision is $m$ \\
$y \leftarrow \circ (z^2)$ [rounded up] \\
$s \leftarrow 1$ \\
$t \leftarrow 1$ \\
{\bf for} $k$ {\bf from} $1$ {\bf do} \\
\q $t \leftarrow \circ (y t)$ [rounded up] \\
\q $t \leftarrow \circ (t/k)$ [rounded up] \\
\q $u \leftarrow \circ (\frac{t}{2k+1})$ [rounded up] \\
\q $s \leftarrow \circ (s + (-1)^k u)$ [nearest] \\
\q {\bf if} ${\rm EXP}(u) < {\rm EXP}(s) - m$ and $k \geq z^2$ {\bf then} break \\
$r \leftarrow 2 \circ (z s)$ [rounded up] \\
$p \leftarrow \circ (\pi)$ [rounded down] \\
$p \leftarrow \circ (\sqrt{p})$ [rounded down] \\
$r \leftarrow \circ (r/p)$ [nearest]
\end{quote}
Let $\varepsilon_k$ be the ulp-error on $t$ (denoted $t_k$) after the loop with index $k$. According to Lemma~\ref{rel_ulp}, since $t_k$ is computed after $2k$ roundings ($t_0=1$ is exact), we have $\varepsilon_k \leq 4k$. The error on $u$ at loop $k$ is thus at most $1+2\varepsilon_k \leq 1+8k$. Let $\sigma_k$ and $\nu_k$ be the exponent shifts between the new value of $s$ at step $k$ and respectively the old value of $s$, and $u$. Writing $s_k$ and $u_k$ for the values of $s$ and $u$ at the end of step $k$, we have $\sigma_k := {\rm EXP}(s_{k-1}) - {\rm EXP}(s_k)$ and $\nu_k := {\rm EXP}(u_k) - {\rm EXP}(s_k)$. The ulp-error $\tau_k$ on $s_k$ satisfies $\tau_k \leq \frac{1}{2} + \tau_{k-1} 2^{\sigma_k} + (1+8k) 2^{\nu_k}$.

The halting condition $k \geq z^2$ ensures that $u_j \leq u_{j-1}$ for $j \geq k$, thus the tail $\sum_{j=k}^{\infty} (-1)^j u_j$ is an alternating series, and the truncated part is bounded by its first term $|u_k| < {\rm ulp}(s_k)$. So the ulp-error between $s_k$ and $\sum_{k=0}^{\infty} \frac{(-1)^k z^{2k}}{k! (2k+1)}$ is bounded by $1+\tau_k$. Now the error after $r \leftarrow 2 \circ (z s)$ is bounded by $1 + 2 (1+\tau_k) = 2 \tau_k + 3$. That on $p$ after $p \leftarrow \circ (\pi)$ is $1$ ulp, and after $p \leftarrow \circ (\sqrt{p})$ we get $2$ ulps (since $p \leftarrow \circ (\pi)$ was rounded down). The final error on $r$ is thus at most $1 + 2 (2 \tau_k + 3) + 4 = 4 \tau_k + 11$ (since $r$ is rounded up and $p$ is rounded down).

\subsubsection{Very large arguments}
Since ${\rm erfc} \, x \leq \frac{1}{\sqrt{\pi} x e^{x^2}}$, we have for $x^2 \geq n \log 2$ (which implies $x \geq 1$) that ${\rm erfc} \, x \leq 2^{-n}$, thus ${\rm erf} \, x = 1$ or ${\rm erf} \, x = 1 - 2^{-n}$ according to the rounding mode.

\subsection{The hyperbolic cosine function}

The {\tt mpfr\_cosh} ($\cosh{x}$) function implements the hyperbolic cosine as:
$$ \cosh x = \frac{1}{2} \left( e^{x} + \frac{1}{e^x} \right) $$
The algorithm used for the calculation of the hyperbolic cosine is as follows\footnote{Here $o()$ represents a rounding, and ${\rm error}(u)$ the error associated with the calculation of $u$.}:
\begin{eqnarray}\nonumber
u&\leftarrow&o(e^x)\\\label{coshalgo1}
v&\leftarrow&o({u}^{-1})\\\label{coshalgo2}
w&\leftarrow&o(u+v)\\\label{coshalgo3}
s&\leftarrow&\frac{1}{2} w\\\label{coshalgo4}
\end{eqnarray}
Now, we have to bound the rounding error for each step of this algorithm. First, by the parity of the hyperbolic cosine ($\cosh(-x)=\cosh(x)$), the problem is reduced to computing $\cosh x$ with $x \geq 0$. We can deduce $e^x \geq 1$ and $0 \leq e^{-x} \leq 1$.
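The scheme above is short enough to be transcribed directly with the public MPFR API; the following sketch (not the actual {\tt mpfr\_cosh} source) uses the rounding directions assumed by the error analysis that follows, with a working precision $m$ chosen by the caller:
\begin{verbatim}
#include <stdio.h>
#include <gmp.h>
#include <mpfr.h>

/* Sketch of the cosh algorithm above for x >= 0, working precision m.
   The rounding directions match the error analysis in the text. */
static void
cosh_sketch (mpfr_t s, const mpfr_t x, mpfr_prec_t m)
{
  mpfr_t u, v, w;
  mpfr_inits2 (m, u, v, w, (mpfr_ptr) 0);
  mpfr_exp (u, x, MPFR_RNDD);        /* u <- o(e^x), towards -inf  */
  mpfr_ui_div (v, 1, u, MPFR_RNDU);  /* v <- o(1/u), towards +inf  */
  mpfr_add (w, u, v, MPFR_RNDN);     /* w <- o(u + v)              */
  mpfr_div_2ui (s, w, 1, MPFR_RNDN); /* s <- w/2, an exact shift   */
  mpfr_clears (u, v, w, (mpfr_ptr) 0);
}

int main (void)
{
  mpfr_t x, s;
  mpfr_init2 (x, 64);
  mpfr_init2 (s, 64);
  mpfr_set_ui (x, 1, MPFR_RNDN);
  cosh_sketch (s, x, 64 + 3);        /* 3 guard bits, as derived below */
  mpfr_printf ("cosh(1) ~ %.10Rf\n", s); /* ~ 1.5430806348 */
  mpfr_clears (x, s, (mpfr_ptr) 0);
  return 0;
}
\end{verbatim}
Each step of this scheme is now bounded in turn: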
\begin{center}
\begin{tabular}{l l l}
\begin{minipage}{2.5cm}
${\textnormal{error}}(u)$
$u \leftarrow o(e^x)$\\
$-\infty \;\; (\bullet)$
\end{minipage} &
\begin{minipage}{7.5cm}
\begin{eqnarray}\nonumber
|u-e^x| &\leq& \ulp(u)\\\nonumber
\end{eqnarray}
\end{minipage} &
\begin{minipage}{6cm}
{\hspace{7cm}}
\end{minipage}\\%\hline
\begin{minipage}{2.5cm}
${\textnormal{error}}(v)$
$v \leftarrow o({u}^{-1}) $\\
$+\infty \;\; (\bullet\bullet)$
\end{minipage} &
\begin{minipage}{7.5cm}
\begin{eqnarray}\nonumber
&&|v-e^{-x}| \\\nonumber
& \leq& |v - u^{-1}| + |u^{-1} - e^{-x}|\\\nonumber
& \leq& \ulp(v) + \frac{1}{u \cdot e^x} |u-e^{x}|\\\nonumber
& \leq& \ulp(v) + \frac{1}{u^2} \ulp(u)\;\;(\star)\\\nonumber
& \leq& \ulp(v) + 2 \ulp(\frac{1}{u})\;\;(\star\star)\\\nonumber
& \leq& 3 \, \ulp(v)\;\;(\star\star\star)
\end{eqnarray}
\end{minipage} &
\begin{minipage}{6cm}
$(\star)$ With $\frac{1}{e^x} \leq \frac{1}{u}$;\\
for that we must have $u \leq e^x$,\\
which is ensured by rounding\\
$u$ towards $-\infty \;\; (\bullet)$

$(\star\star)$ From Rule~\ref{R4},
\[ a \cdot \ulp(b) \leq 2 \cdot \ulp(a \cdot b);\]
if $a =\frac{1}{u^2},\;b = u$ then
\[ \frac{1}{u^2} \ulp(u) \leq 2 \ulp(\frac{1}{u})\]

$(\star\star\star)$ With $\ulp(\frac{1}{u}) \leq \ulp(v)$,\\
which is ensured by rounding\\
$v$ towards $+\infty \;\; (\bullet\bullet)$\\
\end{minipage}\\%\hline
\begin{minipage}{2.5cm}
${\textnormal{error}}(w)$
$w \leftarrow o(u+v) $
\end{minipage} &
\begin{minipage}{7.5cm}
\begin{eqnarray}\nonumber
&&|w-(e^{x}+e^{-x})| \\\nonumber
& \leq& |w - (u+v)|+|u - e^x|+|v - e^{-x}|\\\nonumber
& \leq& \ulp(w) + \ulp(u) + 3\ulp(v)\\\nonumber
& \leq& \ulp(w) + 4\ulp(u)\;\;(\star)\\\nonumber
& \leq& 5\ulp(w)\;\;(\star\star)\\\nonumber
\end{eqnarray}
\end{minipage} &
\begin{minipage}{6cm}
$(\star)$ With $v \leq 1\leq u$ then $\ulp(v) \leq \ulp(u)$

$(\star\star)$ With $u \leq w$ then $\ulp(u) \leq \ulp(w)$
\end{minipage}\\%\hline
\begin{minipage}{2.5cm}
${\textnormal{error}}(s)$
$s \leftarrow o(\frac{w}{2}) $
\end{minipage} &
\begin{minipage}{7.5cm}
\begin{center}
\begin{eqnarray}\nonumber
{\textnormal{error}}(s) & = & {\textnormal{error}}(w)\\\nonumber
& \leq & 5\ulp(s)
\end{eqnarray}
\end{center}
\end{minipage} &
\begin{minipage}{6cm}
\end{minipage}
\end{tabular}
\end{center}
That shows the rounding error on the calculation of $\cosh x$ can be bounded by $5 \; \ulp$ on the result. So, to calculate the size of the intermediary variables, we have to add at least $\lceil \log_2 5 \rceil=3$ bits to the wanted precision.

\subsection{The inverse hyperbolic cosine function}

The {\tt mpfr\_acosh} function implements the inverse hyperbolic cosine as $\n{acosh} \, x = \log ( \sqrt{x^2-1} + x )$, using the following algorithm:
\begin{quote}
$q \leftarrow \circ(x^2)$ [down] \\
$r \leftarrow \circ(q-1)$ [down] \\
$s \leftarrow \circ(\sqrt{r})$ [nearest] \\
$t \leftarrow \circ(s + x)$ [nearest] \\
$u \leftarrow \circ(\log t)$ [nearest]
\end{quote}
The error on $q$ is at most $1$ ulp, thus that on $r$ is at most $\ulp(r) + \ulp(q) = (1+2^{\Exp(q)-\Exp(r)}) \ulp(r)$. Since $r$ is smaller than $x^2-1$, we can use the simpler formula for the error on the square root, which gives as bound $(\frac{3}{2} + 2^{\Exp(q)-\Exp(r)}) \ulp(s)$ for the error on $s$, and $(2 + 2^{\Exp(q)-\Exp(r)}) \ulp(t)$ for that on $t$. This gives a final bound of $[\frac{1}{2} + (2 + 2^{\Exp(q)-\Exp(r)}) 2^{1-\Exp(u)}] \, \ulp(u)$ for the error on $u$.
Since $2 + 2^{\Exp(q)-\Exp(r)} \leq 2^{1 + {\rm max}(1, \Exp(q)-\Exp(r))}$, that shows the rounding error on the calculation of $\n{acosh} \, x$ can be bounded by $[\frac{1}{2} + 2^{2 + {\rm max}(1, \Exp(q)-\Exp(r)) - \Exp(u)}] \, \ulp(u)$.

\subsection{The hyperbolic sine function}

The {\tt mpfr\_sinh} ($\sinh{x}$) function implements the hyperbolic sine as:
$$ \sinh x = \frac{1}{2} \left( e^{x} - \frac{1}{e^x} \right) $$
The algorithm used for the calculation of the hyperbolic sine is as follows\footnote{Here $o()$ represents a rounding, and ${\rm error}(u)$ the error associated with the calculation of $u$.}:
\begin{eqnarray}\nonumber
u&\leftarrow&o(e^x)\\\nonumber
v&\leftarrow&o({u}^{-1})\\\nonumber
w&\leftarrow&o(u-v)\\\nonumber
s&\leftarrow&\frac{1}{2} w
\end{eqnarray}
Now, we have to bound the rounding error for each step of this algorithm. First, by the parity of the hyperbolic sine ($\sinh(-x)=-\sinh(x)$), the problem is reduced to computing $\sinh x$ with $x \geq 0$. We can deduce $e^x \geq 1$ and $0 \leq e^{-x} \leq 1$.
\begin{center}
\begin{tabular}{l l l}
\begin{minipage}{2.5cm}
${\textnormal{error}}(u)$
$u \leftarrow \minf(e^x)$\\
$(\bullet)$
\end{minipage} &
\begin{minipage}{7.5cm}
\begin{eqnarray}\nonumber
|u-e^x| &\leq& \ulp(u)\\\nonumber
\end{eqnarray}
\end{minipage} &
\begin{minipage}{6cm}
{\hspace{7cm}}
\end{minipage}\\%\hline
\begin{minipage}{2.5cm}
${\textnormal{error}}(v)$
$v \leftarrow \pinf({u}^{-1}) $\\
$(\bullet\bullet)$
\end{minipage} &
\begin{minipage}{7.5cm}
\begin{eqnarray}\nonumber
&&|v-e^{-x}| \\\nonumber
& \leq& |v - u^{-1}| + |u^{-1} - e^{-x}|\\\nonumber
& \leq& \ulp(v) + \frac{1}{u \cdot e^x} |u-e^{x}|\\\nonumber
& \leq& \ulp(v) + \frac{1}{u^2} \ulp(u)\;\;(\star)\\\nonumber
& \leq& \ulp(v) + 2 \ulp(\frac{1}{u})\;\;(\star\star)\\\nonumber
& \leq& 3 \, \ulp(v)\;\;(\star\star\star)
\end{eqnarray}
\end{minipage} &
\begin{minipage}{6cm}
$(\star)$ With $\frac{1}{u} \leq \frac{1}{e^x}$;\\
for that we must have $e^x \leq u$,\\
which is ensured by $u=\minf(e^x)$ $(\bullet)$

$(\star\star)$ From Rule~\ref{R4},
\[ a \cdot \ulp(b) \leq 2 \cdot \ulp(a \cdot b);\]
if $a =\frac{1}{u^2},\;b = u$ then
\[ \frac{1}{u^2} \ulp(u) \leq 2 \ulp(\frac{1}{u})\]

$(\star\star\star)$ With $\ulp(\frac{1}{u}) \leq \ulp(v)$,\\
which is ensured by $v=\pinf(u^{-1})$ $(\bullet\bullet)$
\end{minipage}\\%\hline
\begin{minipage}{2.5cm}
${\textnormal{error}}(w)$
$w \leftarrow o(u-v) $
\end{minipage} &
\begin{minipage}{7.8cm}
\begin{eqnarray}\nonumber
&&|w-(e^{x}-e^{-x})| \\\nonumber
& \leq& |w - (u-v)|+|u - e^x|+|-v + e^{-x}|\\\nonumber
& \leq& \ulp(w) + \ulp(u) + 3\ulp(v)\\\nonumber
& \leq& \ulp(w) + 4\ulp(u)\;\;(\star)\\\nonumber
& \leq& (1+ 4 \cdot 2^{\Exp(u)-\Exp(w)}) \ulp(w)\;\;(\star\star)\\\nonumber
\end{eqnarray}
\end{minipage} &
\begin{minipage}{6cm}
$(\star)$ With $v \leq 1\leq u$ then $\ulp(v) \leq \ulp(u)$

$(\star\star)$ see subsection \ref{generic:sous}
\end{minipage}\\%\hline
\begin{minipage}{2.5cm}
${\textnormal{error}}(s)$
$s \leftarrow o(\frac{w}{2}) $
\end{minipage} &
\begin{minipage}{7.5cm}
\begin{center}
\begin{eqnarray}\nonumber
{\textnormal{error}}(s) & = & {\textnormal{error}}(w)\\\nonumber
& \leq & (1+ 4 \cdot 2^{\Exp(u)-\Exp(w)}) \ulp(w)
\end{eqnarray}
\end{center}
\end{minipage} &
\begin{minipage}{6cm}
\end{minipage}
\end{tabular}
\end{center}
That shows the rounding error on the calculation of $\sinh x$ can be bounded by $(1+ 4 \cdot 2^{\Exp(u)-\Exp(w)}) \ulp(w)$; the number of bits to add to the wanted accuracy for the intermediary variables is thus:
\[ N_t=\lceil \log_2(1+ 4 \cdot 2^{\Exp(u)-\Exp(w)}) \rceil \]
\subsection{The inverse hyperbolic sine function}

The {\tt mpfr\_asinh} ($\n{asinh}\,{x}$) function implements the inverse hyperbolic sine as:
$$ \n{asinh} \, x = \log \left( \sqrt{x^2+1} + x \right) $$
The algorithm used for the calculation of the inverse hyperbolic sine is as follows:
\begin{eqnarray}\nonumber
s&\leftarrow&o(x^2)\\\nonumber
t&\leftarrow&o(s+1)\\\nonumber
u&\leftarrow&o(\sqrt{t})\\\nonumber
v&\leftarrow&o(u+x)\\\nonumber
w&\leftarrow&o(\log v)
\end{eqnarray}
Now, we have to bound the rounding error for each step of this algorithm. First, by the parity of the inverse hyperbolic sine ($\n{asinh}(-x)=-\n{asinh}(x)$), the problem is reduced to computing $\n{asinh} \, x$ with $x \geq 0$.
\begin{center}
\begin{tabular}{l l l}
\begin{minipage}{2.5cm}
${\textnormal{error}}(s)$
$s \leftarrow o(x^2) $
\end{minipage} &
\begin{minipage}{7.5cm}
\begin{eqnarray}\nonumber
&&|s-x^2| \\\nonumber
& \leq& \ulp(s)\;\;(\star)\\\nonumber
\end{eqnarray}
\end{minipage} &
\begin{minipage}{6cm}
\end{minipage}\\%\hline
\begin{minipage}{2.5cm}
${\textnormal{error}}(t)$
$t \leftarrow \minf(s+1) $
$(\bullet)$
\end{minipage} &
\begin{minipage}{7.5cm}
\begin{eqnarray}\nonumber
&&|t-(x^2+1)| \\\nonumber
& \leq& 2 \ulp(t) \;\;(\star)\\\nonumber
\end{eqnarray}
\end{minipage} &
\begin{minipage}{6cm}
($\star$) see subsection \ref{generic:sous}
\end{minipage}\\%\hline
\begin{minipage}{2.5cm}
${\textnormal{error}}(u)$
$u \leftarrow o(\sqrt{t}) $
\end{minipage} &
\begin{minipage}{7.5cm}
\begin{eqnarray}\nonumber
&&|u-\sqrt{x^2+1}| \\\nonumber
& \leq& 3 \ulp(u) \;(\star)
\end{eqnarray}
\end{minipage} &
\begin{minipage}{6cm}
($\star$) see subsection \ref{generic:sqrt} with ($\bullet$)
\end{minipage}\\%\hline
\begin{minipage}{2.5cm}
${\textnormal{error}}(v)$
$v \leftarrow o(u+x) $
\end{minipage} &
\begin{minipage}{7.5cm}
\begin{eqnarray}\nonumber
&&|v-(\sqrt{x^2+1}+x)| \\\nonumber
& \leq& 5 \ulp(v) \;(\star)
\end{eqnarray}
\end{minipage} &
\begin{minipage}{6cm}
($\star$) see subsection \ref{generic:sous}
\end{minipage}\\%\hline
\begin{minipage}{2.5cm}
${\textnormal{error}}(w)$
$w \leftarrow o(\log v) $
\end{minipage} &
\begin{minipage}{7.5cm}
\begin{eqnarray}\nonumber
&&|w-\log(\sqrt{x^2+1}+x)| \\\nonumber
& \leq& (1+5 \cdot 2^{1-\Exp(w)}) \ulp(w) \;(\star)
\end{eqnarray}
\end{minipage} &
\begin{minipage}{6cm}
($\star$) see subsection \ref{generic:log}
\end{minipage}
\end{tabular}
\end{center}
That shows the rounding error on the calculation of $\n{asinh} \, x$ can be bounded by $(1+5 \cdot 2^{1-\Exp(w)}) \, \ulp(w)$ on the result. So, to calculate the size of the intermediary variables, we have to add at least $\lceil \log_2 (1+5 \cdot 2^{1-\Exp(w)}) \rceil$ bits to the wanted precision.

\subsection{The hyperbolic tangent function}

The {\tt mpfr\_tanh} ($\tanh{x}$) function implements the hyperbolic tangent as:
$$ \tanh x = \frac{ e^{2 \cdot x} -1 }{ e^{2 \cdot x} +1} $$
The algorithm used for the calculation of the hyperbolic tangent is as follows\footnote{Here $o()$ represents a rounding, and ${\rm error}(u)$ the error associated with the calculation of $u$.}:
\begin{eqnarray}\nonumber
u&\leftarrow&o(2 \cdot x)\\\nonumber
v&\leftarrow&o(e^u)\\\nonumber
w&\leftarrow&o(v+1)\\\nonumber
r&\leftarrow&o(v-1)\\\nonumber
s&\leftarrow&o(\frac{r}{w})
\end{eqnarray}
Now, we have to bound the rounding error for each step of this algorithm. First, by the parity of the hyperbolic tangent ($\tanh(-x)=-\tanh(x)$), the problem is reduced to computing $\tanh x$ with $x \geq 0$. We can deduce $e^x \geq 1$.
\begin{center}
\begin{tabular}{l l l}
\begin{minipage}{2.5cm}
${\textnormal{error}}(u)$
$u \leftarrow o(2 \cdot x)$
\end{minipage} &
\begin{minipage}{7.5cm}
exact
\end{minipage} &
\begin{minipage}{6cm}
{\hspace{7cm}}
\end{minipage}\\%\hline
\begin{minipage}{2.5cm}
${\textnormal{error}}(v)$
$v \leftarrow o(e^{u}) $
\end{minipage} &
\begin{minipage}{7.5cm}
\begin{eqnarray}\nonumber
&&|v-e^{2 \cdot x}| \\\nonumber
& \leq& |v - e^{u}| + |e^{u} - e^{2 \cdot x}|\\\nonumber
& \leq& \ulp(v)
\end{eqnarray}
\end{minipage} &
\begin{minipage}{6cm}
($u = 2x$ is exact)
\end{minipage}\\%\hline
\begin{minipage}{2.5cm}
${\textnormal{error}}(w)$
$w \leftarrow \minf(v+1) $
$(\bullet)$
\end{minipage} &
\begin{minipage}{7.5cm}
\begin{eqnarray}\nonumber
&&|w-(e^{2 \cdot x}+1)| \\\nonumber
&\leq& |w - (v+1)|+|(v+1) - (e^{2 \cdot x}+1)|\\\nonumber
&\leq& \ulp(w) + \ulp(v)\\\nonumber
&\leq& 2 \cdot \ulp(w)\;(\star)\\\nonumber
\end{eqnarray}
\end{minipage} &
\begin{minipage}{6cm}
$(\star)$ With $v \leq w$ then $\ulp(v) \leq \ulp(w)$
\end{minipage}\\%\hline
\begin{minipage}{2.5cm}
${\textnormal{error}}(r)$
$r \leftarrow \pinf(v-1) $
$(\bullet\bullet)$
\end{minipage} &
\begin{minipage}{7.5cm}
\begin{eqnarray}\nonumber
&&|r-(e^{2 \cdot x}-1)| \\\nonumber
&\leq& |r - (v-1)|+|(v-1) - (e^{2 \cdot x}-1)|\\\nonumber
&\leq& \ulp(r) + \ulp(v)\\\nonumber
&\leq& (1+2^{\Exp(v)-\Exp(r)}) \ulp(r)\;(\star)\\\nonumber
\end{eqnarray}
\end{minipage} &
\begin{minipage}{6cm}
$(\star)$ see subsection \ref{generic:sous}
\end{minipage}\\%\hline
\begin{minipage}{2.5cm}
${\textnormal{error}}(s)$
$s \leftarrow o(\frac{r}{w}) $
\end{minipage} &
\begin{minipage}{7.5cm}
\begin{eqnarray}\nonumber
&&|s-\frac{e^{2x}-1}{e^{2x}+1}| \\\nonumber
&\leq& (1+2 \times 2+ 2(1+2^{\Exp(v)-\Exp(r)})) \ulp(s) \;\;(\star)\\\nonumber
\end{eqnarray}
\end{minipage} &
\begin{minipage}{6cm}
$(\star)$ see subsection \ref{generic:div} with $(\bullet)$ and $(\bullet\bullet)$
\end{minipage}
\end{tabular}
\end{center}
That shows the rounding error on the calculation of $\tanh x$ can be bounded by $(1+2 \times 2+2(1+2^{\Exp(v)-\Exp(r)})) \ulp(s) = (7+2^{\Exp(v)-\Exp(r)+1}) \ulp(s)$; the number of bits to add to the wanted accuracy for the intermediary variables is thus:
\[ N_t=\lceil \log_2(7+2^{\Exp(v)-\Exp(r)+1}) \rceil \]

\subsection{The inverse hyperbolic tangent function}

The {\tt mpfr\_atanh} ($\n{atanh}\,{x}$) function implements the inverse hyperbolic tangent as:
$$ \n{atanh} \, x = \frac{1}{2} \log \frac{1+x}{1-x} $$
The algorithm used for the calculation of the inverse hyperbolic tangent is as follows:
\begin{eqnarray}\nonumber
s&\leftarrow&o(1+x)\\\nonumber
t&\leftarrow&o(1-x)\\\nonumber
u&\leftarrow&o(\frac{s}{t})\\\nonumber
v&\leftarrow&o(\log u)\\\nonumber
w&\leftarrow&o(\frac{1}{2} v)
\end{eqnarray}
Now, we have to bound the rounding error for each step of this algorithm. First, by the parity of the inverse hyperbolic tangent ($\n{atanh}(-x)=-\n{atanh}(x)$), the problem is reduced to computing $\n{atanh} \, x$ with $x \geq 0$.
\subsection{The inverse hyperbolic tangent function}

The {\tt mpfr\_atanh} ($\n{atanh}(x)$) function implements the inverse hyperbolic tangent as:
$$ \n{atanh}\,x = \frac{1}{2} \log \frac{1+x}{1-x}. $$
The algorithm used for the calculation of the inverse hyperbolic tangent is as follows:
\begin{eqnarray}\nonumber
s&\leftarrow&o(1+x)\\\nonumber
t&\leftarrow&o(1-x)\\\nonumber
u&\leftarrow&o(\frac{s}{t})\\\nonumber
v&\leftarrow&o(\log u)\\\nonumber
w&\leftarrow&o(\frac{1}{2} v)
\end{eqnarray}
Now we have to bound the rounding error for each step of this algorithm. First, by the parity of the inverse hyperbolic tangent ($\n{atanh}(-x)=-\n{atanh}(x)$), the problem is reduced to computing $\n{atanh}\,x$ with $x \geq 0$.
\begin{center}
\begin{tabular}{l l l}
\begin{minipage}{2.5cm}
${\textnormal{error}}(s)$
$s \leftarrow \pinf(1+x) $
$(\bullet)$
\end{minipage} &
\begin{minipage}{7.5cm}
\begin{eqnarray}\nonumber
&&|s-(1+x)| \\\nonumber
& \leq& 2 \ulp(s)\;\;(\star)\\\nonumber
\end{eqnarray}
\end{minipage} &
\begin{minipage}{6cm}
($\star$) see subsection \ref{generic:sous}
\end{minipage}\\%\hline
\begin{minipage}{2.5cm}
${\textnormal{error}}(t)$
$t \leftarrow \minf(1-x) $
$(\bullet\bullet)$
\end{minipage} &
\begin{minipage}{7.5cm}
\begin{eqnarray}\nonumber
&&|t-(1-x)| \\\nonumber
& \leq& (1+2^{\Exp(x)-\Exp(t)}) \ulp(t) \;\;(\star)\\\nonumber
\end{eqnarray}
\end{minipage} &
\begin{minipage}{6cm}
($\star$) see subsection \ref{generic:sous}
\end{minipage}\\%\hline
\begin{minipage}{2.5cm}
${\textnormal{error}}(u)$
$u \leftarrow o(\frac{s}{t}) $
\end{minipage} &
\begin{minipage}{7.5cm}
\begin{eqnarray}\nonumber
&&|u-\frac{1+x}{1-x}| \\\nonumber
& \leq& (1+ 2 \times 2 + 2 (1+2^{\Exp(x)-\Exp(t)}))\,\ulp(u) \;(\star)\\\nonumber
& \leq& (7+2^{\Exp(x)-\Exp(t)+1})\,\ulp(u)
\end{eqnarray}
\end{minipage} &
\begin{minipage}{6cm}
($\star$) see subsection \ref{generic:inv} with ($\bullet$) and ($\bullet\bullet$)
\end{minipage}\\%\hline
\begin{minipage}{2.5cm}
${\textnormal{error}}(v)$
$v \leftarrow o(\log(u)) $
\end{minipage} &
\begin{minipage}{7.5cm}
\begin{eqnarray}\nonumber
&&|v-\log{\frac{1+x}{1-x}}| \\\nonumber
& \leq& (1+(7+2^{\Exp(x)-\Exp(t)+1}) \cdot 2^{1-\Exp(v)}) \ulp(v)\;(\star)\\\nonumber
& \leq& (1+7 \times 2^{1-\Exp(v)} + 2^{\Exp(x)-\Exp(t)-\Exp(v)+2}) \ulp(v)
\end{eqnarray}
\end{minipage} &
\begin{minipage}{6cm}
($\star$) see subsection \ref{generic:log}
\end{minipage}\\%\hline
\begin{minipage}{2.5cm}
${\textnormal{error}}(w)$
$w \leftarrow o(\frac{1}{2} v) $
\end{minipage} &
\begin{minipage}{7.5cm}
\begin{eqnarray}\nonumber
&&|w-\frac{1}{2}\log{\frac{1+x}{1-x}}| \\\nonumber
& \leq& (1+7 \times 2^{1-\Exp(v)} + 2^{\Exp(x)-\Exp(t)-\Exp(v)+2}) \ulp(w) \;(\star)
\end{eqnarray}
\end{minipage} &
\begin{minipage}{6cm}
($\star$) the multiplication by $\frac{1}{2}$ is exact
\end{minipage}
\end{tabular}
\end{center}
This shows that the rounding error on the calculation of $\n{atanh}\,x$ can be bounded by $(1+7 \times 2^{1-\Exp(v)} + 2^{\Exp(x)-\Exp(t)-\Exp(v)+2})\;\ulp$ on the result. So, to choose the size of the intermediate variables, we have to add at least $\lceil \log_2 (1+7 \times 2^{1-\Exp(v)} + 2^{\Exp(x)-\Exp(t)-\Exp(v)+2}) \rceil$ bits to the wanted precision.
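The three functions above all follow the same pattern: evaluate the formula with some guard bits, and check whether the approximation determines the correctly rounded result, retrying at a larger precision otherwise. Below is a minimal sketch of such a driver, using MPFR's \texttt{mpfr\_can\_round}; the callback \texttt{eval} and the error bound \texttt{err\_bits} (the $\lceil \log_2(\cdot) \rceil$ quantity of the preceding subsections) are assumptions of this sketch, not MPFR API:
\begin{verbatim}
/* Sketch of the generic driver: evaluate f at increasing precision
   until the approximation can be correctly rounded. */
#include <mpfr.h>

void eval_with_guard (mpfr_t y, mpfr_srcptr x, mpfr_prec_t target,
                      mpfr_rnd_t rnd,
                      void (*eval) (mpfr_t, mpfr_srcptr),
                      mpfr_exp_t err_bits)
{
  mpfr_t w;
  mpfr_prec_t wprec = target + err_bits + 10; /* initial guess */
  mpfr_init2 (w, wprec);
  for (;;)
    {
      eval (w, x); /* |w - f(x)| <= 2^(EXP(w) - (wprec - err_bits)) */
      if (mpfr_can_round (w, wprec - err_bits, MPFR_RNDN, rnd, target))
        break;
      wprec += wprec / 2;          /* retry with more precision */
      mpfr_set_prec (w, wprec);
    }
  mpfr_set (y, w, rnd);
  mpfr_clear (w);
}
\end{verbatim}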
\subsection{The arc-sine function}

\begin{enumerate}
\item We use the formula $\arcsin x=\arctan \frac{x}{\sqrt{1-x^2}}$.
\item Note that when $x$ is near $1$ we will experience accuracy problems.
\item If $x=a(1+\delta)$ with $\delta$ the relative error, then we have
\begin{equation*}
1-x=1-a-a\delta=(1-a)\left[1-\frac{a}{1-a}\delta\right],
\end{equation*}
and so when using the arctangent routine we need to take this decrease of precision into account.
\end{enumerate}

\subsection{The arc-cosine function}
% from Mathieu Dutour
\begin{enumerate}
\item Obviously, we use the formula
\begin{equation*}
\arccos x=\frac{\pi}{2}-\arcsin x.
\end{equation*}
\item The problem with $\arccos$ is that it vanishes at $1$, so we have a cancellation problem to treat near $1$.
\item (Suppose $x\geq 0$; this is where the problem happens.) The derivative of $\arccos$ is $\frac{-1}{\sqrt{1-x^2}}$ and we have
\begin{equation*}
\frac{1}{2\sqrt{1-x}} \leq \left|\frac{-1}{\sqrt{1-x^2}}\right|=\frac{1}{\sqrt{(1-x)(1+x)}} \leq \frac{1}{\sqrt{1-x}}.
\end{equation*}
So, integrating the above inequality on $[x,1]$, we get
\begin{equation*}
\sqrt{1-x}\leq \arccos x\leq 2\sqrt{1-x}.
\end{equation*}
\item The important part is the lower bound, which gives an upper bound on the cancellation that will occur: the terms that are canceled are $\pi/2$ and $\arcsin x$, whose order of magnitude is $2$. The number of canceled bits is thus
\begin{verbatim}
1-1/2*MPFR_EXP(1-x)
\end{verbatim}
\end{enumerate}

\subsection{The arc-tangent function}
% from Mathieu Dutour
\begin{comment}
Argument reduction $\frac{x}{1+\sqrt{1+x^2}}$: {Assume} $0 \leq y < 1$, where $y$ is an approximation of some real $x$ such that $x = y (1 + \delta)$. We approximate $\frac{x}{1+\sqrt{1+x^2}}$ using the following algorithm:
\begin{quote}
$s \leftarrow \circ(x^2)$ [nearest] \\
$t \leftarrow \circ(1+s)$ [nearest] \\
$u \leftarrow \circ(\sqrt{t})$ [nearest] \\
$v \leftarrow \circ(1+u)$ [nearest] \\
$w \leftarrow \circ(\frac{x}{v})$ [nearest]
\end{quote}
\end{comment}

\subsubsection{Binary splitting}

\noindent The Taylor series for $\arctan$ is suitable for evaluation by binary splitting.
\par This method is detailed for example in ``Pi and The AGM'', p.~334. It is efficient for rational arguments but inefficient for irrational ones, so its usefulness seems quite limited at first, and one can wonder how to use it for irrational arguments.
\par Using the formulas
\begin{equation*}
\arctan(-x)=-\arctan x\;\;\mbox{and}\;\;\arctan x+\arctan \frac{1}{x}=\frac{\pi}{2}{\rm sign}(x),
\end{equation*}
we can restrict ourselves to $0\leq x\leq 1$.
\par Writing
\begin{equation*}
x=\sum_{i=1}^{\infty} \frac{u_i}{2^i}\;\;\mbox{with}\;\;u_i\in\{0,1\}
\end{equation*}
or
\begin{equation*}
x=\sum_{i=1}^{\infty} \frac{u_i}{2^{2^i}}\;\;\mbox{with}\;\;u_i\in\{0,1,\dots,2^{2^{i-1}}-1\}\mbox{~if~}i>1\mbox{~and~}u_1\in \{0,1\},
\end{equation*}
we can compute $\cos$, $\sin$ or $\exp$ using the formulas
\begin{equation*}
\begin{array}{c}
\cos(a+b)=\cos a\cos b-\sin a\sin b\\
\sin(a+b)=\sin a\cos b+\cos a\sin b\\
\exp(a+b)=(\exp a)(\exp b).
\end{array}
\end{equation*}
Unfortunately for $\arctan$ there is no similar formula. The only known formula is
\begin{equation*}
\arctan x+\arctan y=\arctan \frac{x+y}{1-xy}+k\pi\;\;\mbox{with}\;\;k\in\Z,
\end{equation*}
so we will use
\begin{equation*}
\arctan x=\arctan y+\arctan \frac{x-y}{1+xy}
\end{equation*}
with $x,y>0$ and $y < x$.

\subsection{The power function}

We want to detect whether $x^y$ is exactly representable as a floating-point number $e \cdot 2^f$. Write $x = a \cdot 2^b$ and $y = c \cdot 2^d$ with $a, c$ odd integers, and assume first $x, y > 0$. As $a$ is an odd integer, necessarily we have $a^{c 2^d} = e$ and $2^{b c 2^d} = 2^f$, thus $b c 2^d = f$. If $d \geq 0$, then $a^c$ must be an integer: this is true if $c \geq 0$, and false for $c < 0$ since then $a^{c 2^d} = \frac{1}{a^{-c 2^d}} < 1$ cannot be a positive integer. In addition $a^{c 2^d}$ must be representable in the given precision. Assume now $d < 0$, and write $a^{c 2^d} = (a^c)^{1/2^{d'}}$ with $d'=-d$; then $a^c = e^{2^{d'}}$, thus $a^c$ must be a $2^{d'}$-th power of an integer. Since $c$ is odd, $a$ itself must be a $2^{d'}$-th power. We therefore propose the following algorithm:
\begin{quote}
Algorithm CheckExactPower. \\
Input: $x=a 2^b$, $y=c 2^d$, $a, c$ odd integers \\
Output: \emph{true} if $x^y$ is an exact power $e 2^f$, {\em false} otherwise\\
{\bf if} $x < 0$ {\bf then} \\
\q {\bf if} $y$ is an integer {\bf then} return ${\rm CheckExactPower}(-x,y)$\\
\q {\bf else} return {\em false} \\
{\bf if} $y < 0$ {\bf then} \\
\q {\bf if} $a=1$ {\bf then} return {\em true} {\bf else} return {\em false}\\
{\bf if} $d < 0$ {\bf then} \\
\q {\bf if} $a 2^b$ is not a $2^{-d}$-th power {\bf then} return {\em false} \\
return {\em true}
\end{quote}
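The only nontrivial test in CheckExactPower is whether $a 2^b$ is a $2^{-d}$-th power. For $a \cdot 2^b$ to be the $2^{d'}$-th power of a dyadic number $e \cdot 2^g$ with $e$ odd, $a$ must be an exact $2^{d'}$-th root and $b$ must be divisible by $2^{d'}$. A minimal sketch with GMP integers (the function name is illustrative):
\begin{verbatim}
/* Sketch: is a*2^b a (2^dp)-th power?  a an odd mpz_t, b a long. */
#include <gmp.h>

int is_pow2k_power (const mpz_t a, long b, unsigned long dp)
{
  mpz_t root;
  unsigned long k = 1UL << dp;   /* k = 2^dp, assumed to fit     */
  int exact;
  if (b % (long) k != 0)         /* 2^b must be a k-th power     */
    return 0;
  mpz_init (root);
  exact = mpz_root (root, a, k); /* nonzero iff a = root^k       */
  mpz_clear (root);
  return exact;
}
\end{verbatim}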
Detecting if the result is exactly representable is not enough, since it may be exact, but with a precision larger than the target precision. Thus we propose the following: modify Algorithm CheckExactPower so that it also returns an upper bound $p$ for the number of significant bits of $x^y$ when it is exactly representable, i.e.~$x^y = m \cdot 2^e$ with $|m| < 2^p$. Then, if the relative error on the approximation of $x^y$ is less than $\frac{1}{2}$ ulp, rounding that approximation to nearest gives $x^y$ exactly.

\subsection{The real cube root}

The \texttt{mpfr\_cbrt} function computes the real cube root of $x$. Since for $x<0$ we have $\sqrt[3]{x} = - \sqrt[3]{-x}$, we can focus on $x > 0$.

Let $n$ be the number of wanted bits of the result. We write $x = m \cdot 2^{3e}$ where $m$ is a positive integer with $m \geq 2^{3n-3}$. Then we compute the integer cube root of $m$: write $m=s^3+r$ with $0 \leq r$ and $m < (s+1)^3$. Let $k$ be the number of bits of $s$: since $m \geq 2^{3n-3}$, we have $s \geq 2^{n-1}$, thus $k \geq n$. If $k > n$, we replace $s$ by $\lfloor s 2^{n-k} \rfloor$, $e$ by $e+(k-n)$, and update $r$ accordingly so that $x = (s^3+r) 2^{3e}$ still holds (be careful that $r$ may no longer be an integer in that case). Then the correct rounding of $\sqrt[3]{x}$ is:
\begin{eqnarray*}
s 2^e && \mbox{if $r=0$, or if rounding down, or if rounding to nearest and $r < \frac{3}{2} s^2 +\frac{3}{4}s+ \frac{1}{8}$,} \\
(s+1) 2^e && \mbox{otherwise}.
\end{eqnarray*}
Note: for rounding to nearest, one may consider $m \geq 2^{3n}$ instead of $m \geq 2^{3n-3}$, i.e.~taking $n+1$ instead of $n$. In that case, there is no need to compare the remainder $r$ to $\frac{3}{2} s^2 +\frac{3}{4}s+ \frac{1}{8}$: we just need to know whether $r=0$ or not. The even rounding rule is needed only when the input $x$ has at least $3n+1$ bits, since the cube of an odd number of $n+1$ bits has at least $3n+1$ bits.
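The integer cube root with remainder is provided directly by GMP. Below is a minimal sketch of the core step, including the truncation of $s$ to $n$ bits (normalization of $m$ and the final rounding decision are omitted; names are illustrative):
\begin{verbatim}
/* Sketch: s = floor(cbrt(m)), r = m - s^3, then truncate s to
   n bits when it has k > n bits, adjusting the exponent e. */
#include <gmp.h>

void cbrt_core (mpz_t s, mpz_t r, const mpz_t m, unsigned long n,
                long *e)
{
  size_t k;
  mpz_rootrem (s, r, m, 3);          /* s^3 + r = m, r >= 0      */
  k = mpz_sizeinbase (s, 2);         /* number of bits of s      */
  if (k > n)
    {
      mpz_fdiv_q_2exp (s, s, k - n); /* s = floor(s / 2^(k-n))   */
      *e += (long) (k - n);          /* adjust the exponent      */
      /* r is not maintained here; the real code updates it so
         that the rounding test above can still be applied. */
    }
}
\end{verbatim}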
\subsection{The exponential integral}

The exponential integral \verb|mpfr_eint| is defined as in \cite[formula 5.1.10]{AbSt73}: for $x > 0$,
\[ {\rm Ei}(x) = \gamma + \log x + \sum_{k=1}^{\infty} \frac{x^k}{k \, k!}, \]
and for $x < 0$ it gives NaN.

We use the following integer-based algorithm to evaluate $\sum_{k=1}^{\infty} \frac{x^k}{k \, k!}$, using working precision $w$. For any real $v$, we denote by ${\rm trunc}(v)$ the nearest integer towards zero.
\begin{quote}
Decompose $x$ into $m\cdot2^e$ with $m$ integer [exact] \\
If necessary, truncate $m$ to $w$ bits and adjust $e$ \\
$s \leftarrow 0$ \\ % sum
$t \leftarrow 2^w$ \\ % current term x^k/k!
{\bf for} $k := 1, 2, \ldots$ {\bf do} \\
\q $t \leftarrow {\rm trunc}(t m 2^e/k)$ \\
\q $u \leftarrow {\rm trunc}(t/k)$ \\
\q $s \leftarrow s + u$ \\
Return $s \cdot 2^{-w}$.
\end{quote}
Note: in $t \leftarrow {\rm trunc}(t m 2^e/k)$, we first compute $tm$ exactly, then if $e$ is negative, we first divide by $2^{-e}$ and truncate, then divide by $k$ and truncate; this gives the same answer as dividing once by $k 2^{-e}$, but it is more efficient.

Let $\epsilon_k$ be the absolute difference between $t$ and $2^w x^k/k!$ at step $k$. We have $\epsilon_0=0$, and $\epsilon_k \leq 1 + \epsilon_{k-1} m 2^e/k + t_{k-1} m 2^{e+1-w}/k$, since the error when approximating $x$ by $m 2^e$ is less than $m 2^{e+1-w}$. Similarly, the absolute error on $u$ at step $k$ is at most $\nu_k \leq 1 + \epsilon_k/k$, and that on $s$ at most $\tau_k \leq \tau_{k-1} + \nu_k$. We compute all these error bounds dynamically (using MPFR with a small precision), and we stop when $|t|$ is smaller than the bound $\tau_k$ on the error on $s$ made so far. At that time, the truncation error when neglecting the terms of index $k+1$ to $\infty$ can be bounded by
\[ \frac{|t| + \epsilon_k}{k} \left( \frac{|x|}{k} + \frac{|x|^2}{k^2} + \cdots \right) \leq \frac{(|t| + \epsilon_k)\, |x|}{k (k-|x|)}. \]

\subsection{Summary}

\begin{table}
\begin{center}
\begin{tabular}{|c|c|c|} \hline
$w$ & $\err(w)/\ulp(w) \le c_w + \ldots$ & special case\\ \hline\hline
$o(u+v)$ & $k_u 2^{e_u-e_w} + k_v 2^{e_v-e_w}$ & $k_u + k_v$ if $u v \ge 0$\\
$o(u \cdot v)$ & $(1+c^{+}_u)k_u + (1+c^{+}_v)k_v$ & $2k_u + 2k_v$ if $u \ge x$, $v \ge y$\\
$o(1/v)$ & $4 k_v$ & $2 k_v$ if $v \le y$ \\
$o(u/v)$ & $4 k_u + 4 k_v$ & $2 k_u + 2 k_v$ if $v \le y$ \\
$o(\sqrt{u})$ & $2 k_u/(1+\sqrt{c^{-}_u})$ & $k_u$ if $u \le x$ \\
$o(e^u)$ & $e^{k_u 2^{e_u-p}} 2^{e_u+1} k_u$ & $2^{e_u+1} k_u$ if $u \ge x$ \\
$o(\log u)$ & $k_u 2^{1-e_w}$ & \\ \hline
\end{tabular}
\end{center}
\caption{Generic error for several operations, assuming all variables have a mantissa of $p$ bits, and no overflow/underflow occurs. The inputs $u$ and $v$ are approximations of $x$ and $y$ with $|u-x| \le k_u \ulp(u)$ and $|v-y| \le k_v \ulp(v)$. The additional rounding error $c_w$ is $1/2$ for rounding to nearest, and $1$ otherwise. The value $c^{\pm}_u$ equals $1 \pm k_u 2^{1-p}$.}
\end{table}

Remark 1: in the addition case, when $u v > 0$, necessarily one of $2^{e_u-e_w}$ and $2^{e_v-e_w}$ is less than $1/2$, thus $\err(w)/\ulp(w) \le c_w + {\rm max}(k_u + k_v/2, k_u/2 + k_v) \le c_w + \frac{3}{2} {\rm max}(k_u, k_v)$.

\subsection{The gamma function}
% See http://oldmill.uchicago.edu/~wilder/Code/gamma

The gamma function is computed by Spouge's method \cite{Spouge94}:
\[ \Gamma(z+1) \approx (z+a)^{z+1/2} e^{-z-a} \left[ \sqrt{2 \pi} + \sum_{k=1}^{\lceil a \rceil - 1} \frac{c_k(a)}{z+k} \right], \]
which is valid for $\Re(z+a) > 0$, where
\[ c_k(a) = \frac{(-1)^{k-1}}{(k-1)!} (a-k)^{k-1/2} e^{a-k}. \]
Here, we choose the free parameter $a$ to be an integer. According to \cite[Section 2.6]{Pugh04}, the relative error is bounded by $a^{-1/2} (2\pi)^{-a-1/2}$ for $a \ge 3$ and $\Re(z) \ge 0$.
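To see how the free parameter scales with the target precision: a relative error of $2^{-p}$ is ensured as soon as $(2\pi)^{-a-1/2} \leq 2^{-p}$, since the extra factor $a^{-1/2}$ only helps; this amounts to
\[ a \geq \frac{p}{\log_2 (2\pi)} - \frac{1}{2} \approx 0.3771\, p, \]
i.e.~roughly one term of the sum for every $2.65$ bits of target accuracy.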
\section{Mathematical constants}

\subsection{The constant $\pi$}

The computation of $\pi$ uses the AGM formula
\[ \pi = \frac{\mu^2}{D}, \]
where $\mu = {\rm AGM}(\frac{1}{\sqrt{2}}, 1)$ is the common limit of the sequences $a_0=1, b_0 = \frac{1}{\sqrt{2}}, a_{k+1} = (a_k+b_k)/2, b_{k+1} = \sqrt{a_k b_k}$, and $D=\frac{1}{4} - \sum_{k=1}^{\infty} 2^{k-1} (a_k^2-b_k^2)$.
This formula can be evaluated efficiently as shown in \cite{ScGrVe94}, starting from $a_0 = A_0 = 1, B_0 = 1/2, D_0 = 1/4$, where $A_k$ and $B_k$ represent respectively $a_k^2$ and $b_k^2$:
\begin{quote}
$S_{k+1} \leftarrow (A_k + B_k)/4$ \\
$b_k \leftarrow \sqrt{B_k}$ \\
$a_{k+1} \leftarrow (a_k + b_k)/2$ \\
$A_{k+1} \leftarrow a_{k+1}^2$ \\
$B_{k+1} \leftarrow 2 (A_{k+1} - S_{k+1})$ \\
$D_{k+1} \leftarrow D_k - 2^k (A_{k+1} - B_{k+1})$
\end{quote}
For each variable $x$ approximating a true value $\tilde{x}$, denote by $\epsilon(x)$ the exponent of the maximal error, i.e.~$x = \tilde{x} (1\pm\delta)^e$ with $|e| \leq \epsilon(x)$, and $\delta = 2^{-p}$ for precision $p$ (we assume all roundings to nearest). We can prove by an easy induction that $\epsilon(a_k) = 3 \cdot 2^{k-1} - 1$ for $k \geq 1$, $\epsilon(A_k) = 3 \cdot 2^k - 1$, and $\epsilon(B_k) = 6 \cdot 2^k - 6$. Thus the relative error on $B_k$ is at most $12 \cdot 2^{k-p}$ (assuming $12 \cdot 2^{k-p} \leq 1$), which is at most $12 \cdot 2^k \ulp(B_k)$, since $1/2 \leq B_k$.

If we stop when $|A_k-B_k| \leq 2^{k-p}$ where $p$ is the working precision, then $|\mu^2 - B_k| \leq 13 \cdot 2^{k-p}$. The error on $D$ is bounded by $\sum_{j=0}^k 2^{j} (6 \cdot 2^{k-p} + 12 \cdot 2^{k-p}) \leq 6 \cdot 2^{2k-p+2}$, which gives a relative error less than $2^{2k-p+7}$ since $D_k \geq 3/16$.

Thus we have $\pi = \frac{B_k (1+\epsilon)}{D (1+\epsilon')}$ with $|\epsilon| \leq 13 \cdot 2^{k-p}$ and $|\epsilon'| \leq 2^{2k-p+7}$. This gives $\pi = \frac{B_k}{D} (1+\epsilon'')$ with $|\epsilon''| \leq 2 |\epsilon| + |\epsilon'| \leq (26 + 2^{k+7}) 2^{k-p} \leq 2^{2k-p+8}$, assuming $|\epsilon'| \leq 1$.
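One iteration of this scheme costs one square root and one squaring at full precision, plus additions and exact scalings by powers of two. A minimal sketch of one step with MPFR, mirroring the variable names above (all roundings to nearest, as in the error analysis; the function name is illustrative):
\begin{verbatim}
/* Sketch: one step of the pi iteration, updating a, A, B, D
   in place; k is the iteration index (for the 2^k factor). */
#include <mpfr.h>

void agm_step (mpfr_t a, mpfr_t A, mpfr_t B, mpfr_t D,
               unsigned long k, mpfr_prec_t p)
{
  mpfr_t S, b;
  mpfr_inits2 (p, S, b, (mpfr_ptr) 0);
  mpfr_add (S, A, B, MPFR_RNDN);
  mpfr_div_2ui (S, S, 2, MPFR_RNDN); /* S = (A+B)/4        */
  mpfr_sqrt (b, B, MPFR_RNDN);       /* b = sqrt(B)        */
  mpfr_add (a, a, b, MPFR_RNDN);
  mpfr_div_2ui (a, a, 1, MPFR_RNDN); /* a = (a+b)/2        */
  mpfr_sqr (A, a, MPFR_RNDN);        /* A = a^2            */
  mpfr_sub (B, A, S, MPFR_RNDN);
  mpfr_mul_2ui (B, B, 1, MPFR_RNDN); /* B = 2(A-S)         */
  mpfr_sub (S, A, B, MPFR_RNDN);     /* reuse S = A-B      */
  mpfr_mul_2ui (S, S, k, MPFR_RNDN); /* S = 2^k (A-B)      */
  mpfr_sub (D, D, S, MPFR_RNDN);     /* D -= 2^k (A-B)     */
  mpfr_clears (S, b, (mpfr_ptr) 0);
}
\end{verbatim}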
\subsection{Euler's constant}

Euler's constant is computed using the formula $\gamma = S(n) - R(n) - \log n$ where:
\[ S(n) = \sum_{k=1}^{\infty} \frac{n^k (-1)^{k-1}}{k! k}, \quad R(n) = \int_n^{\infty} \frac{\exp(-u)}{u} du \sim \frac{\exp(-n)}{n} \sum_{k=0}^{\infty} \frac{k!}{(-n)^k}. \]
This identity is attributed to Sweeney by Brent \cite{Brent78}. We have $S(n) = n \cdot {}_2F_2(1,1;2,2;-n)$ and $R(n) = {\rm Ei}(1, n)$.

\medskip
\Paragraph{Evaluation of $S(n)$.} As in \cite{Brent78}, let $\alpha \sim 4.319136566$ be the positive root of $\alpha + 2 = \alpha \log \alpha$, and $N = \lceil \alpha n \rceil$. We approximate $S(n)$ by
\[ S'(n) = \sum_{k=1}^{N} \frac{n^k (-1)^{k-1}}{k! k}. \]
% = \frac{1}{N!} \sum_{k=1}^N \frac{a_k}{k},
% where $a_k = n^k (-1)^{k-1} N!/k!$ is an integer.
% Therefore $a_k$ is exactly computed, and when dividing it by $k$
% (integer division)
% the error is at most $1$, and thus the absolute error on $S'(n)$ is
% at most $N/N! = 1/(N-1)!$.
The remainder term $S(n) - S'(n)$ is bounded by:
\[ |S(n) - S'(n)| \le \sum_{k=N+1}^{\infty} \frac{n^k}{k!}. \]
Since $k! \ge (k/e)^k$, and $k \ge N+1 \ge \alpha n$, we have:
\[ |S(n) - S'(n)| \le \sum_{k=N+1}^{\infty} \left( \frac{n e}{k} \right)^k \le \sum_{k=N+1}^{\infty} \left( \frac{e}{\alpha} \right)^k \le 2 \left( \frac{e}{\alpha} \right)^N \le 2 e^{-2n} \]
since $(e/\alpha)^{\alpha} = e^{-2}$.

To approximate $S'(n)$, we use the binary splitting method, which computes integers $T$ and $Q$ such that $S'(n) = \frac{T}{Q}$ exactly, then we compute $t = \circ(T)$ and $s = \circ(t/Q)$, both with rounding to nearest. If the working precision is $w$, we have $t = T (1+\theta_1)$ and $s = t/Q \, (1+\theta_2)$ where $|\theta_i| \leq 2^{-w}$. It follows $s = T/Q \, (1+\theta_1)(1+\theta_2)$, thus the error on $s$ is less than $3$ ulps, since $(1+2^{-w})^2 \leq 1 + 3 \cdot 2^{-w}$.
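The final rounding of the exact fraction $T/Q$ produced by binary splitting can be sketched as follows with MPFR (computing $T$ and $Q$ with GMP integers is assumed done elsewhere; the function name is illustrative):
\begin{verbatim}
/* Sketch: s = o(o(T)/Q) at working precision w, rounding to
   nearest, as in the 3-ulp analysis above. */
#include <gmp.h>
#include <mpfr.h>

void round_fraction (mpfr_t s, const mpz_t T, const mpz_t Q,
                     mpfr_prec_t w)
{
  mpfr_t t;
  mpfr_init2 (t, w);
  mpfr_set_z (t, T, MPFR_RNDN);    /* t = o(T):   |theta_1| <= 2^-w */
  mpfr_div_z (s, t, Q, MPFR_RNDN); /* s = o(t/Q): |theta_2| <= 2^-w */
  mpfr_clear (t);
}
\end{verbatim}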
\begin{comment}
To approximate $S'(n)$, we use the following algorithm, where $m$ is the working precision, and $a, s, t$ are integer variables:
\begin{quote}
$a \leftarrow 2^m$ \\
$s \leftarrow 0$ \\
{\bf for} $k$ {\bf from} $1$ {\bf to} $N$ {\bf do} \\
\q $a \leftarrow \lfloor \frac{n a}{k} \rfloor$ \\
\q $t \leftarrow \lfloor \frac{a}{k} \rfloor$ \\
\q $s \leftarrow s + (-1)^{k-1} t$ \\
return $x = s/2^m$
\end{quote}
The absolute error $\epsilon_k$ on $a$ at step $k$ satisfies $\epsilon_k \le 1 + n/k \, \epsilon_{k-1}$ with $\epsilon_0=0$. The maximum error is $\epsilon_n \le \frac{n^n}{n!} \le e^n$. Thus the error on $t$ at step $k$ is less than $1 + e^n/k$, and the total error on $s$ is bounded by $N (e^n + 1)$. Hence to get a precision of $n$ bits, we need to use $m \ge n (1 + \frac{1}{\log 2})$. In such a case, the value $s/2^m$ converted to a floating-point number will have an error of at most $\ulp(x)$.
\end{comment}

\medskip
\Paragraph{Evaluation of $R(n)$.} We estimate $R(n)$ using the terms up to $k=n-2$, again as in \cite{Brent78}:
\[ R'(n) = \frac{e^{-n}}{n} \sum_{k=0}^{n-2} \frac{k!}{(-n)^k}. \]
% = \frac{\exp(-n)}{n^{n-1}} \sum_{k=0}^{n-2} (-1)^k \frac{k!}{n^{n-2-k}}.
% Here again, the integer sum is computed exactly, converting it to a
% floating-point number introduces at most one ulp of error,
% $\exp(-n)$ is computed within one ulp,
% and $n^{n-1}$ within at most $n-2$ ulps.
% The division by $n^{n-1}$ and the multiplication introduce one more ulp of
% error, thus the total error on $R'(n)$ is at most $n+2$ ulps.
Let us introduce $I_k = \int_n^{\infty} \frac{e^{-u}}{u^k} du$. We have $R(n) = I_1$ and the recurrence $I_k = \frac{e^{-n}}{n^k} - k I_{k+1}$, which gives
\[ R(n) = \frac{e^{-n}}{n} \sum_{k=0}^{n-2} \frac{k!}{(-n)^k} + (-1)^{n-1} (n-1)! \, I_n. \]
Bounding $n!$ by $(n/e)^n \sqrt{2 \pi (n+1)}$, which holds\footnote{Formula 6.1.38 from \cite{AbSt73} gives $x! = \sqrt{2\pi} x^{x+1/2} e^{-x+\frac{\theta}{12x}}$ for $x>0$ and $0 < \theta < 1$. Using it for $x \ge 1$, we have $0 < \frac{\theta}{6x} < 1$, and $e^t < 1+2t$ for $0 < t < 1$, thus $(x!)^2 \le 2\pi x^{2x} e^{-2x} (x+\frac{1}{3})$.} for $n \ge 1$, we have:
\[ |R(n) - R'(n)| = (n-1)! \, I_n \le \frac{n!}{n} \int_n^{\infty} \frac{e^{-n}}{u^n} du \le \frac{n^{n-1}}{e^n} \sqrt{2 \pi (n+1)} \, \frac{e^{-n}}{(n-1) n^{n-1}}, \]
and since $\sqrt{2 \pi (n+1)}/(n-1) \le 1$ for $n \ge 9$:
\[ |R(n) - R'(n)| \le e^{-2n} \quad \mbox{for $n \ge 9$}. \]
Thus we have:
\[ |\gamma - (S'(n) - R'(n) - \log n)| \le 3 e^{-2n} \quad \mbox{for $n\ge 9$}.\]
% If the working precision is $p$, then choose $n \ge \frac{\log 2}{2} (p+2)$
% such that $3 e^{-2n}$ represents at most one ulp.
To approximate $R'(n)$, we use the following:
\begin{quote}
$m \leftarrow {\rm prec}(x) - \lfloor \frac{n}{\log 2} \rfloor$ \\
$a \leftarrow 2^m$ \\
$s \leftarrow 1$ \\
{\bf for} $k$ {\bf from} $1$ {\bf to} $n$ {\bf do} \\
\q $a \leftarrow \lfloor \frac{k a}{n} \rfloor$ \\
\q $s \leftarrow s + (-1)^{k} a$ \\
$t \leftarrow \lfloor s/n \rfloor$ \\
$x \leftarrow t/2^m$ \\
return $r = e^{-n} x$
\end{quote}
The absolute error $\epsilon_k$ on $a$ at step $k$ satisfies $\epsilon_k \le 1 + \frac{k}{n} \epsilon_{k-1}$ with $\epsilon_0=0$. As $k/n \le 1$, we have $\epsilon_k \le k$, whence the error on $s$ is bounded by $n(n+1)/2$, and that on $t$ by $1 + (n+1)/2 \le n+1$ since $n \ge 1$. The operation $x \leftarrow t/2^m$ is exact as soon as ${\rm prec}(x)$ is large enough, thus the error on $x$ is at most $(n+1) \frac{e^n}{2^{{\rm prec}(x)}}$. If $e^{-n}$ is computed with $m$ bits, then the error on it is at most $e^{-n} 2^{1-m}$. The error on $r$ is then $(n + 1 + 2/n) 2^{-{\rm prec}(x)} + \ulp(r)$. Since $x \ge \frac{2}{3} n$ for $n \ge 2$, and $x \, 2^{-{\rm prec}(x)} < \ulp(x)$, this gives an error bounded by $\ulp(r) + (n + 1 + 2/n) \frac{3}{2n} \ulp(r) \le 4 \ulp(r)$ for $n \ge 2$ --- if ${\rm prec}(x) = {\rm prec}(r)$. Now since $r \le \frac{e^{-n}}{n} \le \frac{1}{8}$, that error is less than $\ulp(1/2)$.

\medskip
\Paragraph{Final computation.} We use the formula $\gamma = S(n) - R(n) - \log n$ with $n$ such that $e^{-2n} \le \ulp(1/2) = 2^{-{\rm prec}}$, i.e.~$n \ge {\rm prec} \, \frac{\log 2}{2}$:
\begin{quote}
$s \leftarrow S'(n)$ \\
$l \leftarrow \log(n)$ \\
$r \leftarrow R'(n)$ \\
return $(s - l) - r$
\end{quote}
Since the final result is in $[\frac{1}{2}, 1]$, and $R'(n) \le \frac{e^{-n}}{n}$ is negligible, $S'(n)$ is close to $\log n$, so the subtraction $s - l$ involves a large cancellation. If the target precision is $m$, and we use $m + \lceil \log_2({\rm prec}) \rceil$ bits to evaluate $s$ and $l$, then the error on $s-l$ will be at most $3 \ulp(1/2)$, so the error on $(s - l) - r$ is at most $5 \ulp(1/2)$, and adding the $3 e^{-2n}$ truncation error, we get a bound of $8 \ulp(1/2)$. [\textbf{FIXME: check with new method to compute S}]

\subsubsection{A faster formula}

Brent and McMillan give in \cite{BrMc80} a faster algorithm (B2) using the modified Bessel functions, which was used by Gourdon and Demichel to compute 108,000,000 digits of $\gamma$ in October 1999:
\[ \gamma = \frac{S_0 - K_0}{I_0} - \log n, \]
where $S_0 = \sum_{k=1}^{\infty} \frac{n^{2k}}{(k!)^2} H_k$, $H_k = 1 + \frac{1}{2} + \cdots + \frac{1}{k}$ being the $k$-th harmonic number, $K_0 = \sqrt{\frac{\pi}{4n}} e^{-2n} \sum_{k=0}^{\infty} (-1)^k \frac{[(2k)!]^2}{(k!)^3 (64n)^k}$, and $I_0 = \sum_{k=0}^{\infty} \frac{n^{2k}}{(k!)^2}$.

We have $I_0 \ge \frac{e^{2n}}{\sqrt{4 \pi n}}$ (see \cite{BrMc80}, page 306). From the remark following formula 9.7.2 of \cite{AbSt73}, the remainder in the truncated expansion for $K_0$ up to $k$ does not exceed the $(k+1)$-th term, whence $K_0 \le \sqrt{\frac{\pi}{4n}} e^{-2n}$ and $\frac{K_0}{I_0} \le \pi e^{-4n}$, as in formula (5) of \cite{BrMc80}. Let $I'_0 = \sum_{k=0}^{K-1} \frac{n^{2k}}{(k!)^2}$ with $K = \lceil \beta n \rceil$, where $\beta$ is the root of $\beta (\log \beta - 1) = 3$ ($\beta \sim 4.971\ldots$); then
\[ |I_0 - I'_0| \le \frac{\beta}{2 \pi (\beta^2-1)} \frac{e^{-6n}}{n}. \]
Let $K'_0 = \sqrt{\frac{\pi}{4n}} e^{-2n} \sum_{k=0}^{4n-1} (-1)^k \frac{[(2k)!]^2}{(k!)^3 (64n)^k}$; then, bounding by the next term:
\[ |K_0 - K'_0| \le \frac{(8n+1)}{16 \sqrt{2} n} \frac{e^{-6n}}{n} \le \frac{1}{2} \frac{e^{-6n}}{n}. \]
We get from this
\[ \left| \frac{K_0}{I_0} - \frac{K'_0}{I'_0} \right| \le \frac{1}{2 I_0} \frac{e^{-6n}}{n} \le \sqrt{\frac{\pi}{n}} e^{-8n}. \]
Let $S'_0 = \sum_{k=1}^{K-1} \frac{n^{2k}}{(k!)^2} H_k$; then, using $\frac{H_{k+1}}{H_k} \le \frac{k+1}{k}$ and the same bound $K$ as for $I'_0$ ($4n \le K \le 5n$), we get:
\[ |S_0 - S'_0| \le \frac{\beta}{2 \pi (\beta^2-1)} H_K \frac{e^{-6n}}{n}. \]
We deduce:
\[ \left| \frac{S_0}{I_0} - \frac{S'_0}{I'_0} \right| \le e^{-8n} H_K \frac{\sqrt{4 \pi n}}{\pi (\beta^2-1)} \frac{\beta}{n} \le e^{-8n}. \]
Hence we have
\[ \left| \gamma - \left( \frac{S'_0 - K'_0}{I'_0} - \log n \right) \right| \le \left(1 + \sqrt{\frac{\pi}{n}}\right) e^{-8n} \le 3 e^{-8n}. \]
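To make the $3 e^{-8n}$ bound smaller than $2^{-w}$ for a working precision $w$, it suffices (since $\log_2 3 < 1.6$) to take
\[ n \geq \frac{(w + 1.6) \log 2}{8}, \]
i.e.~$n$ grows like $w/11.5$, to be compared with $n \geq {\rm prec} \, \frac{\log 2}{2} \approx {\rm prec}/2.89$ for the basic formula above.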
\subsection{The $\log 2$ constant}

This constant is used in the exponential function, and in the base $2$ exponential and logarithm.

We use the following formula (formula (30) from \cite{GoSe04}):
\[ \log 2 = \frac{3}{4} \sum_{n \geq 0} (-1)^n \frac{{n!}^2}{2^n (2n+1)!}. \]
Let $w$ be the working precision. We take $N = \lfloor w/3 \rfloor + 1$, and evaluate exactly using binary splitting:
\[ \frac{T}{Q} = \frac{3}{4} \sum_{n = 0}^{N-1} (-1)^n \frac{{n!}^2}{2^n (2n+1)!}, \]
where $T$ and $Q$ are integers. Since the series has alternating signs with decreasing absolute values, the truncation error is bounded by the first neglected term, which is less than $2^{-3N-1}$ for $N \geq 2$; since $N \geq (w+1)/3$, this error is bounded by $2^{-w-2}$.

We then use the following algorithm:
\begin{quote}
$t \leftarrow \circ(T)$ [rounded to nearest] \\
$q \leftarrow \circ(Q)$ [rounded to nearest] \\
$x \leftarrow \circ(t/q)$ [rounded to nearest]
\end{quote}
Using Higham's notation, we write $t = T (1+\theta_1)$, $q = Q (1+\theta_2)$, $x = t/q \, (1+\theta_3)$ with $|\theta_i| \leq 2^{-w}$. We thus have $x = T/Q \, (1+\theta)^3$ with $|\theta| \leq 2^{-w}$. Since $T/Q \leq 1$, the total absolute error on $x$ is thus bounded by $3 |\theta| + 3 \theta^2 + |\theta|^3 + 2^{-w-2} < 2^{-w+2}$ as long as $w \geq 3$.

\subsection{Catalan's constant}

Catalan's constant is defined by
\[ G = \sum_{k=0}^{\infty} \frac{(-1)^k}{(2k+1)^2}. \]
We compute it using formula (31) of Victor Adamchik's document ``33 representations for Catalan's constant''\footnote{\url{http://www-2.cs.cmu.edu/~adamchik/articles/catalan/catalan.htm}}:
\[ G = \frac{\pi}{8} \log(2 + \sqrt{3}) + \frac{3}{8} \sum_{k=0}^{\infty} \frac{(k!)^2}{(2k)! (2k+1)^2}. \]
Let $f(k) = \frac{(k!)^2}{(2k)! (2k+1)^2}$, $S(0,K) = \sum_{k=0}^{K-1} f(k)$, and $S = S(0, \infty)$. We compute $S(0,K)$ exactly by binary splitting. Since $f(k)/f(k-1) = \frac{k (2k-1)}{2 (2k+1)^2} \leq 1/4$, the truncation error on $S$ is bounded by $\frac{4}{3} f(K) \leq \frac{4}{3} \cdot 4^{-K}$. Since $S$ is multiplied by $3/8$, the corresponding contribution to the absolute error on $G$ is $2^{-2K-1}$. As long as $2K + 1$ is greater than or equal to the working precision $w$, this truncation error is less than one ulp of the final result.
\begin{quote}
$K \leftarrow \lceil \frac{w-1}{2} \rceil$ \\
$T/Q \leftarrow S(0,K)$ [exact, rational] \\
$T \leftarrow 3T$ [exact, integer] \\
$t \leftarrow \circ(T)$ [up] \\
$q \leftarrow \circ(Q)$ [down] \\
$s \leftarrow \circ(t/q)$ [nearest] \\
$x \leftarrow \circ(\sqrt{3})$ [up] \\
$y \leftarrow \circ(2 + x)$ [up] \\
$z \leftarrow \circ(\log y)$ [up] \\
$u \leftarrow \circ(\pi)$ [up] \\
$v \leftarrow \circ(u z)$ [nearest] \\
$g \leftarrow \circ(v + s)$ [nearest] \\
Return $g/8$ [exact].
\end{quote}
The error on $t$ and $q$ is less than one ulp; using the generic error on the division, since $t \geq T$ and $q \leq Q$, the error on $s$ is at most $9/2$ ulps. The error on $x$ is at most $1$ ulp; since $1 < x < 2$ --- assuming $w \geq 2$ --- we have $\ulp(x) = \frac{1}{2} \ulp(y)$, thus the error on $y$ is at most $\frac{3}{2} \ulp(y)$. The generic error on the logarithm gives an error bound of $1 + \frac{3}{2} \cdot 2^{1-\Exp(z)} = \frac{5}{2}$ ulps for $z$ (since $3 \leq y < 4$, we have $1 \leq z < 2$, so $\Exp(z) = 1$). The error on $u$ is at most $1$ ulp; thus, using the generic error on the multiplication, since both $u$ and $z$ are upper approximations, the error on $v$ is at most $8$ ulps. Finally the error on $g$ is at most $8 + 9/2 = 25/2$ ulps. Taking into account the truncation error, this gives $27/2$ ulps.

\nocite{BoBo98,DeHi02}
\bibliographystyle{acm}
\bibliography{algorithms}
\end{document}