path: root/mpn/generic/div_qr_2.c
/* mpn_div_qr_2 -- Divide natural numbers, producing both remainder and
   quotient.  The divisor is two limbs.

   Contributed to the GNU project by Torbjorn Granlund and Niels Möller

   THIS FILE CONTAINS INTERNAL FUNCTIONS WITH MUTABLE INTERFACES.  IT IS ONLY
   SAFE TO REACH THEM THROUGH DOCUMENTED INTERFACES.  IN FACT, IT IS ALMOST
   GUARANTEED THAT THEY'LL CHANGE OR DISAPPEAR IN A FUTURE GNU MP RELEASE.


Copyright 1993-1996, 1999-2002, 2011, 2017 Free Software Foundation, Inc.

This file is part of the GNU MP Library.

The GNU MP Library is free software; you can redistribute it and/or modify
it under the terms of either:

  * the GNU Lesser General Public License as published by the Free
    Software Foundation; either version 3 of the License, or (at your
    option) any later version.

or

  * the GNU General Public License as published by the Free Software
    Foundation; either version 2 of the License, or (at your option) any
    later version.

or both in parallel, as here.

The GNU MP Library is distributed in the hope that it will be useful, but
WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
for more details.

You should have received copies of the GNU General Public License and the
GNU Lesser General Public License along with the GNU MP Library.  If not,
see https://www.gnu.org/licenses/.  */

#include "gmp-impl.h"
#include "longlong.h"

#ifndef DIV_QR_2_PI2_THRESHOLD
/* Disabled unless explicitly tuned. */
#define DIV_QR_2_PI2_THRESHOLD MP_LIMB_T_MAX
#endif

#ifndef SANITY_CHECK
#define SANITY_CHECK 0
#endif

/* Define a longlong.h-style macro, but for a wider operation:
   * add_sssaaaa is like longlong.h's add_ssaaaa but propagating carry-out into
     an additional sum operand.
*/
#if defined (__GNUC__)  && ! defined (__INTEL_COMPILER) && ! defined (NO_ASM)

#if HAVE_HOST_CPU_FAMILY_x86 && W_TYPE_SIZE == 32
#define add_sssaaaa(s2, s1, s0, a1, a0, b1, b0)				\
  __asm__ ("add\t%7, %k2\n\tadc\t%5, %k1\n\tadc\t$0, %k0"		\
	   : "=r" (s2), "=&r" (s1), "=&r" (s0)				\
	   : "0"  ((USItype)(s2)),					\
	     "1"  ((USItype)(a1)), "g" ((USItype)(b1)),			\
	     "%2" ((USItype)(a0)), "g" ((USItype)(b0)))
#endif

#if defined (__amd64__) && W_TYPE_SIZE == 64
#define add_sssaaaa(s2, s1, s0, a1, a0, b1, b0)				\
  __asm__ ("add\t%7, %q2\n\tadc\t%5, %q1\n\tadc\t$0, %q0"		\
	   : "=r" (s2), "=&r" (s1), "=&r" (s0)				\
	   : "0"  ((UDItype)(s2)),					\
	     "1"  ((UDItype)(a1)), "rme" ((UDItype)(b1)),		\
	     "%2" ((UDItype)(a0)), "rme" ((UDItype)(b0)))
#endif

#if defined (__aarch64__) && W_TYPE_SIZE == 64
#define add_sssaaaa(s2, s1, s0, a1, a0, b1, b0)				\
  __asm__ ("adds\t%2, %x6, %7\n\tadcs\t%1, %x4, %x5\n\tadc\t%0, %3, xzr"\
	   : "=r" (s2), "=&r" (s1), "=&r" (s0)				\
	   : "rZ" (s2), "%rZ"  (a1), "rZ" (b1), "%rZ" (a0), "rI" (b0)	\
	     __CLOBBER_CC)
#endif

#if HAVE_HOST_CPU_FAMILY_powerpc && !defined (_LONG_LONG_LIMB)
/* This works fine for 32-bit and 64-bit limbs, except for 64-bit limbs with a
   processor running in 32-bit mode, since the carry flag then gets the 32-bit
   carry.  */
#define add_sssaaaa(s2, s1, s0, a1, a0, b1, b0)				\
  __asm__ ("add%I7c\t%2,%6,%7\n\tadde\t%1,%4,%5\n\taddze\t%0,%3"	\
	   : "=r" (s2), "=&r" (s1), "=&r" (s0)				\
	   : "r"  (s2), "r"  (a1), "r" (b1), "%r" (a0), "rI" (b0)	\
	     __CLOBBER_CC)
#endif

#endif /* __GNUC__ */

#ifndef add_sssaaaa
#define add_sssaaaa(s2, s1, s0, a1, a0, b1, b0)				\
  do {									\
    UWtype __s0, __s1, __c0, __c1;					\
    __s0 = (a0) + (b0);							\
    __s1 = (a1) + (b1);							\
    __c0 = __s0 < (a0);							\
    __c1 = __s1 < (a1);							\
    (s0) = __s0;							\
    __s1 = __s1 + __c0;							\
    (s1) = __s1;							\
    (s2) += __c1 + (__s1 < __c0);					\
  } while (0)
#endif
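
/* Minimal usage sketch of add_sssaaaa, kept under #if 0 so it is never
   built.  The _demo function name and the operand values are this
   comment's own, not part of GMP.  It adds the 2-limb value <1,1> into
   the 3-limb accumulator <0, B-1, B-1>, with the typical aliasing of
   sum and first addend; the expected result is <1,1,0>.  */
#if 0
static void
add_sssaaaa_demo (void)
{
  mp_limb_t s2 = 0, s1 = MP_LIMB_T_MAX, s0 = MP_LIMB_T_MAX;
  add_sssaaaa (s2, s1, s0, s1, s0, CNST_LIMB(1), CNST_LIMB(1));
  /* <0, B-1, B-1> + <1, 1> = B^2 + B = <1, 1, 0>; note that the carry
     is added into s2's previous contents.  */
  ASSERT_ALWAYS (s2 == 1 && s1 == 1 && s0 == 0);
}
#endif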

/* Typically used with r1, r0 same as n3, n2. Other types of overlap
   between inputs and outputs are not supported. */
#define udiv_qr_4by2(q1,q0, r1,r0, n3,n2,n1,n0, d1,d0, di1,di0)		\
  do {									\
    mp_limb_t _q3, _q2a, _q2, _q1, _q2c, _q1c, _q1d, _q0;		\
    mp_limb_t _t1, _t0;							\
    mp_limb_t _mask;							\
									\
    /* [q3,q2,q1,q0] = [n3,n2]*[di1,di0] + [n3,n2,n1,n0] + [0,1,0,0] */	\
    umul_ppmm (_q2,_q1, n2, di1);					\
    umul_ppmm (_q3,_q2a, n3, di1);					\
    ++_q2;	/* _q2 cannot overflow */				\
    add_ssaaaa (_q3,_q2, _q3,_q2, n3,_q2a);				\
    umul_ppmm (_q2c,_q1c, n3, di0);					\
    add_sssaaaa (_q3,_q2,_q1, _q2,_q1, n2,_q1c);			\
    umul_ppmm (_q1d,_q0, n2, di0);					\
    add_sssaaaa (_q2c,_q1,_q0, _q1,_q0, n1,n0); /* _q2c cannot overflow */ \
    add_sssaaaa (_q3,_q2,_q1, _q2,_q1, _q2c,_q1d);			\
									\
    umul_ppmm (_t1,_t0, _q2, d0);					\
    _t1 += _q2 * d1 + _q3 * d0;						\
									\
    sub_ddmmss (r1, r0, n1, n0, _t1, _t0);				\
									\
    _mask = -(mp_limb_t) ((r1 >= _q1) & ((r1 > _q1) | (r0 >= _q0)));  /* (r1,r0) >= (q1,q0) */  \
    add_ssaaaa (r1, r0, r1, r0, d1 & _mask, d0 & _mask);		\
    sub_ddmmss (_q3, _q2, _q3, _q2, CNST_LIMB(0), -_mask);		\
									\
    if (UNLIKELY (r1 >= d1))						\
      {									\
	if (r1 > d1 || r0 >= d0)					\
	  {								\
	    sub_ddmmss (r1, r0, r1, r0, d1, d0);			\
	    add_ssaaaa (_q3, _q2, _q3, _q2, CNST_LIMB(0), CNST_LIMB(1));\
	  }								\
      }									\
    (q1) = _q3;								\
    (q0) = _q2;								\
  } while (0)

static void
invert_4by2 (mp_ptr di, mp_limb_t d1, mp_limb_t d0)
{
  mp_limb_t v1, v0, p1, t1, t0, p0, mask;
  invert_limb (v1, d1);
  p1 = d1 * v1;
  /* <1, v1> * d1 = <B-1, p1> */
  p1 += d0;
  if (p1 < d0)
    {
      v1--;
      mask = -(mp_limb_t) (p1 >= d1);
      p1 -= d1;
      v1 += mask;
      p1 -= mask & d1;
    }
  /* <1, v1> * d1 + d0 = <B-1, p1> */
  umul_ppmm (t1, p0, d0, v1);
  p1 += t1;
  if (p1 < t1)
    {
      if (UNLIKELY (p1 >= d1))
	{
	  if (p1 > d1 || p0 >= d0)
	    {
	      sub_ddmmss (p1, p0, p1, p0, d1, d0);
	      v1--;
	    }
	}
      sub_ddmmss (p1, p0, p1, p0, d1, d0);
      v1--;
    }
  /* Now v1 is the 3/2 inverse, <1, v1> * <d1, d0> = <B-1, p1, p0>,
   * with <p1, p0> + <d1, d0> >= B^2.
   *
   * The 4/2 inverse is (B^4 - 1) / <d1, d0> = <1, v1, v0>. The
   * partial remainder after <1, v1> is
   *
   * B^4 - 1 - B <1, v1> <d1, d0> = <B-1, B-1, B-1, B-1> - <B-1, p1, p0, 0>
   *                              = <~p1, ~p0, B-1>
   */
  udiv_qr_3by2 (v0, t1, t0, ~p1, ~p0, MP_LIMB_T_MAX, d1, d0, v1);
  di[0] = v0;
  di[1] = v1;

#if SANITY_CHECK
  {
    mp_limb_t tp[4];
    mp_limb_t dp[2];
    dp[0] = d0;
    dp[1] = d1;
    mpn_mul_n (tp, dp, di, 2);
    ASSERT_ALWAYS (mpn_add_n (tp+2, tp+2, dp, 2) == 0);
    ASSERT_ALWAYS (tp[2] == MP_LIMB_T_MAX);
    ASSERT_ALWAYS (tp[3] == MP_LIMB_T_MAX);
    ASSERT_ALWAYS (mpn_add_n (tp, tp, dp, 2) == 1);
  }
#endif
}
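
/* Minimal self-check sketch for invert_4by2 and udiv_qr_4by2, kept
   under #if 0 so it is never built.  The _demo function name and the
   concrete operands are this comment's own, not part of GMP, and
   nail-free limbs (GMP_NAIL_BITS == 0) are assumed.  One division step
   reduces <1,0,0,5> = B^3 + 5 by <d1,d0> = B^2/2; the expected
   quotient is <2,0> = 2B with remainder <0,5>.  */
#if 0
static void
udiv_qr_4by2_demo (void)
{
  mp_limb_t di[2], q1, q0, r1, r0;
  mp_limb_t d1 = GMP_NUMB_HIGHBIT, d0 = 0;
  invert_4by2 (di, d1, d0);
  /* The high numerator half <1,0> plays the role of the running
     remainder and must be smaller than <d1,d0>.  */
  udiv_qr_4by2 (q1, q0, r1, r0,
		CNST_LIMB(1), CNST_LIMB(0), CNST_LIMB(0), CNST_LIMB(5),
		d1, d0, di[1], di[0]);
  ASSERT_ALWAYS (q1 == 2 && q0 == 0);
  ASSERT_ALWAYS (r1 == 0 && r0 == 5);
}
#endif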

static mp_limb_t
mpn_div_qr_2n_pi2 (mp_ptr qp, mp_ptr rp, mp_srcptr np, mp_size_t nn,
		   mp_limb_t d1, mp_limb_t d0, mp_limb_t di1, mp_limb_t di0)
{
  mp_limb_t qh;
  mp_size_t i;
  mp_limb_t r1, r0;

  ASSERT (nn >= 2);
  ASSERT (d1 & GMP_NUMB_HIGHBIT);

  r1 = np[nn-1];
  r0 = np[nn-2];

  qh = 0;
  if (r1 >= d1 && (r1 > d1 || r0 >= d0))
    {
#if GMP_NAIL_BITS == 0
      sub_ddmmss (r1, r0, r1, r0, d1, d0);
#else
      r0 = r0 - d0;
      r1 = r1 - d1 - (r0 >> (GMP_LIMB_BITS - 1));
      r0 &= GMP_NUMB_MASK;
#endif
      qh = 1;
    }

  for (i = nn - 2; i >= 2; i -= 2)
    {
      mp_limb_t n1, n0, q1, q0;
      n1 = np[i-1];
      n0 = np[i-2];
      udiv_qr_4by2 (q1, q0, r1, r0, r1, r0, n1, n0, d1, d0, di1, di0);
      qp[i-1] = q1;
      qp[i-2] = q0;
    }

  if (i > 0)
    {
      mp_limb_t q;
      udiv_qr_3by2 (q, r1, r0, r1, r0, np[0], d1, d0, di1);
      qp[0] = q;
    }
  rp[1] = r1;
  rp[0] = r0;

  return qh;
}
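
/* Odd-length usage sketch for mpn_div_qr_2n_pi2, kept under #if 0 so
   it is never built.  The _demo function name and the operands are
   this comment's own, not part of GMP; nail-free limbs are assumed.
   With nn = 3 the two-limbs-per-iteration loop is skipped entirely and
   only the final udiv_qr_3by2 step runs: <1,0,7> = B^2 + 7 divided by
   B^2/2 gives quotient <qh, qp[0]> = <0, 2> and remainder <0,7>.  */
#if 0
static void
div_qr_2n_pi2_demo (void)
{
  mp_limb_t np[3] = { 7, 0, 1 };
  mp_limb_t qp[1], rp[2], di[2], qh;
  mp_limb_t d1 = GMP_NUMB_HIGHBIT, d0 = 0;
  invert_4by2 (di, d1, d0);
  qh = mpn_div_qr_2n_pi2 (qp, rp, np, 3, d1, d0, di[1], di[0]);
  ASSERT_ALWAYS (qh == 0 && qp[0] == 2);
  ASSERT_ALWAYS (rp[1] == 0 && rp[0] == 7);
}
#endif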


/* Divide num {np,nn} by den {dp,2} and write the nn-2 least
   significant quotient limbs at qp and the 2-limb remainder at rp.
   Return the most significant limb of the quotient.

   Preconditions:
   1. qp must either not overlap with the other operands at all, or
      qp >= np + 2 must hold true.  (This means that it's possible to put
      the quotient in the high part of {np,nn}, right above the remainder.)
   2. nn >= 2.  */

mp_limb_t
mpn_div_qr_2 (mp_ptr qp, mp_ptr rp, mp_srcptr np, mp_size_t nn,
	      mp_srcptr dp)
{
  mp_limb_t d1;
  mp_limb_t d0;
  gmp_pi1_t dinv;

  ASSERT (nn >= 2);
  ASSERT (! MPN_OVERLAP_P (qp, nn-2, np, nn) || qp >= np + 2);
  ASSERT_MPN (np, nn);
  ASSERT_MPN (dp, 2);

  d1 = dp[1]; d0 = dp[0];

  ASSERT (d1 > 0);

  if (UNLIKELY (d1 & GMP_NUMB_HIGHBIT))
    {
      if (BELOW_THRESHOLD (nn, DIV_QR_2_PI2_THRESHOLD))
	{
	  invert_pi1 (dinv, d1, d0);
	  return mpn_div_qr_2n_pi1 (qp, rp, np, nn, d1, d0, dinv.inv32);
	}
      else
	{
	  mp_limb_t di[2];
	  invert_4by2 (di, d1, d0);
	  return mpn_div_qr_2n_pi2 (qp, rp, np, nn, d1, d0, di[1], di[0]);
	}
    }
  else
    {
      int shift;
      count_leading_zeros (shift, d1);
      d1 = (d1 << shift) | (d0 >> (GMP_LIMB_BITS - shift));
      d0 <<= shift;
      invert_pi1 (dinv, d1, d0);
      return mpn_div_qr_2u_pi1 (qp, rp, np, nn, d1, d0, shift, dinv.inv32);
    }
}
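
/* Usage sketch for mpn_div_qr_2, kept under #if 0 so it is never
   built.  The _demo function name and the operands are this comment's
   own, not part of GMP; nail-free limbs are assumed, and in normal use
   this internal function is reached only through documented
   interfaces.  Divides {np,4} = B^3 + 5 by {dp,2} = B^2/2; since d1
   has its high bit set, the normalized path is taken.  The full
   quotient is <qh, qp[1], qp[0]> = <0, 2, 0> = 2B, remainder <0,5>.  */
#if 0
static void
div_qr_2_demo (void)
{
  mp_limb_t np[4] = { 5, 0, 0, 1 };
  mp_limb_t dp[2] = { 0, GMP_NUMB_HIGHBIT };
  mp_limb_t qp[2], rp[2], qh;
  qh = mpn_div_qr_2 (qp, rp, np, 4, dp);
  ASSERT_ALWAYS (qh == 0 && qp[1] == 2 && qp[0] == 0);
  ASSERT_ALWAYS (rp[1] == 0 && rp[0] == 5);
}
#endif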