sqrtrem.c


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229

/* mpn_sqrtrem_new -- integer square root with remainder
   (should be directly integrated in a future release of GNU MP)

Copyright 1999, 2000, 2001 Free Software Foundation, Inc.

This file is part of the MPFR Library.

The MPFR Library is free software; you can redistribute it and/or modify
it under the terms of the GNU Lesser General Public License as published by
the Free Software Foundation; either version 2.1 of the License, or (at your
option) any later version.

The MPFR Library is distributed in the hope that it will be useful, but
WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
License for more details.

You should have received a copy of the GNU Lesser General Public License
along with the MPFR Library; see the file COPYING.LIB.  If not, write to
the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
MA 02111-1307, USA. */

#include <stdio.h> /* for NULL */
#include "gmp.h"
#include "gmp-impl.h"
#include "longlong.h"

#define MP_LIMB_T_ONE ((mp_limb_t) 1)

static mp_size_t mpn_sqrtrem1 (mp_ptr, mp_ptr, mp_srcptr);
static mp_limb_t mpn_sqrtrem2 (mp_ptr, mp_ptr, mp_srcptr);
static mp_limb_t mpn_dq_sqrtrem (mp_ptr, mp_ptr, mp_size_t);
mp_size_t mpn_sqrtrem_new (mp_ptr, mp_ptr, mp_srcptr, mp_size_t);

/* table generated by isqrt(2^8*i) $ i=64..255 in MuPAD */
static const unsigned char approx_tab[192] = {
128, 128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141,
   142, 143, 144, 144, 145, 146, 147, 148, 149, 150, 150, 151, 152, 153,
   154, 155, 155, 156, 157, 158, 159, 160, 160, 161, 162, 163, 163, 164,
   165, 166, 167, 167, 168, 169, 170, 170, 171, 172, 173, 173, 174, 175,
   176, 176, 177, 178, 178, 179, 180, 181, 181, 182, 183, 183, 184, 185,
   185, 186, 187, 187, 188, 189, 189, 190, 191, 192, 192, 193, 193, 194,
   195, 195, 196, 197, 197, 198, 199, 199, 200, 201, 201, 202, 203, 203,
   204, 204, 205, 206, 206, 207, 208, 208, 209, 209, 210, 211, 211, 212,
   212, 213, 214, 214, 215, 215, 216, 217, 217, 218, 218, 219, 219, 220,
   221, 221, 222, 222, 223, 224, 224, 225, 225, 226, 226, 227, 227, 228,
   229, 229, 230, 230, 231, 231, 232, 232, 233, 234, 234, 235, 235, 236,
   236, 237, 237, 238, 238, 239, 240, 240, 241, 241, 242, 242, 243, 243,
   244, 244, 245, 245, 246, 246, 247, 247, 248, 248, 249, 249, 250, 250,
   251, 251, 252, 252, 253, 253, 254, 254, 255};

/* same as mpn_sqrtrem, but for size=1 and {np, 1} normalized */
static mp_size_t
mpn_sqrtrem1 (mp_ptr sp, mp_ptr rp, mp_srcptr np)
{
  mp_limb_t np0, s, r, q, u;
  int prec;

  /* first compute a 8-bit approximation from the high 8-bits of np[0] */
  np0 = np[0];
  q = np0 >> (BITS_PER_MP_LIMB - 8);
  /* 2^6 = 64 <= q < 256 = 2^8 */
  s = approx_tab[q - 64]; /* 128 <= s < 255 */
  r = (np0 >> (BITS_PER_MP_LIMB - 16)) - s * s; /* r <= 2*s */
  if (r > 2 * s) { r -= 2 * s + 1; s++; }
#ifdef DEBUG
  if (r > 2*s) {
    fprintf(stderr, "Error: r > 2*s, np[0]=%lu r=%lu s=%lu\n", np[0], r, s); exit(1);
  }
#endif

  prec = 8;
  np0 <<= 2 * prec;
  while (2*prec < BITS_PER_MP_LIMB) {
    /* invariant: s has prec bits, and r <= 2*s */
    r = (r << prec) + (np0 >> (BITS_PER_MP_LIMB - prec));
    np0 <<= prec;
    u = 2 * s;
    q = r / u;
    u = r - q * u;
    s = (s << prec) + q;
    u = (u << prec) + (np0 >> (BITS_PER_MP_LIMB - prec));
    q = q * q;
    r = u - q;
    if (u < q) {
      r += 2 * s - 1;
      s --;
    }
    np0 <<= prec;
    prec = 2 * prec;
  }
  sp[0] = s;
  if (rp != NULL) rp[0] = r;
  return (r) ? 1 : 0;
}

#define Prec (BITS_PER_MP_LIMB >> 1)

/* same as mpn_sqrtrem, but for size=2 and {np, 2} normalized
   return cc such that {np, 2} = sp[0]^2 + cc*2^BITS+PER_MP_LIMB + rp[0] */
static mp_limb_t
mpn_sqrtrem2 (mp_ptr sp, mp_ptr rp, mp_srcptr np)
{
  mp_limb_t qhl, q, u, np0;
  int cc;

  np0 = np[0];
  mpn_sqrtrem1(sp, rp, np+1);
  qhl = 0;
  while (rp[0] >= sp[0]) {
    qhl ++;
    rp[0] -= sp[0];
  }
  /* now rp[0] < sp[0] < 2^Prec */
  rp[0] = (rp[0] << Prec) + (np0 >> Prec);
  u = 2 * sp[0];
  q = rp[0] / u;
  u = rp[0] - q * u;
  q += (qhl & 1) << (Prec-1);
  qhl >>= 1; /* if qhl=1, necessary q=0 as qhl*2^Prec + q <= 2^Prec */
  /* now we have (initial rp[0])<<Prec + np0>>Prec = (qhl<<Prec + q) * (2sp[0]) + u */
  sp[0] = ((sp[0]+qhl) << Prec) + q;
  cc = u >> Prec;
  rp[0] = (u << Prec) + (np0 & ((MP_LIMB_T_ONE << Prec) - MP_LIMB_T_ONE));
  /* subtract q * q or qhl*2^(2*Prec) from rp */
  cc -= mpn_sub_1(rp, rp, 1, q * q) + qhl;
  /* now subtract 2*q*2^Prec + 2^(2*Prec) if qhl is set */
  if (cc < 0) {
    cc += (sp[0]) ? mpn_add_1(rp, rp, 1, sp[0]) : 1;
    cc += mpn_add_1(rp, rp, 1, --sp[0]);
  }
  return cc;
}

/* writes in {sp, n} the square root (rounded towards zero) of {np, 2n},
   and in {np, n} the low n limbs of the remainder, returns the high
   limb of the remainder (which is 0 or 1).
   Assumes {np, 2n} is normalized, i.e. np[2n-1] >= B/4
   where B=2^BITS_PER_MP_LIMB.
*/
static mp_limb_t
mpn_dq_sqrtrem (mp_ptr sp, mp_ptr np, mp_size_t n)
{
  mp_limb_t q; /* carry out of {sp, n} */
  int c, b; /* carry out of remainder */
  mp_size_t l, h;

  if (n == 1) return mpn_sqrtrem2 (sp, np, np);
    
  l = n / 2;
  h = n - l;
  q = mpn_dq_sqrtrem (sp + l, np + 2 * l, h);
  if (q) mpn_sub_n (np + 2 * l, np + 2 * l, sp + l, h);
  q += mpn_divrem (sp, 0, np + l, n, sp + l, h);
  c = sp[0] & 1;
  mpn_rshift (sp, sp, l, 1);
  sp[l-1] |= q << (BITS_PER_MP_LIMB - 1);
  q >>= 1;
  if (c) c = mpn_add_n (np + l, np + l, sp + l, h);
  mpn_sqr_n (np + n, sp, l);
  b = q + mpn_sub_n (np, np, np + n, 2 * l);
  c -= (l == h) ? b : mpn_sub_1 (np + 2 * l, np + 2 * l, 1, b);
  q = mpn_add_1 (sp + l, sp + l, h, q);

  if (c < 0) {
    c += mpn_addmul_1 (np, sp, n, 2) + 2 * q;
    c -= mpn_sub_1 (np, np, n, 1);
    q -= mpn_sub_1 (sp, sp, n, 1);
  }

  return c;
}

/* main function */
mp_size_t
mpn_sqrtrem_new (mp_ptr sp, mp_ptr rp, mp_srcptr np, mp_size_t nn)
{
  mp_limb_t *tp, s0[1], cc, high, rl;
  int c;
  mp_size_t rn, tn;
  TMP_DECL (marker);

  /* If OP is zero, both results are zero.  */
  if (nn == 0)
    return 0;

  high = np[nn-1];
  if (nn == 1 && (high & MP_LIMB_T_HIGHBIT))
    return mpn_sqrtrem1 (sp, rp, np);
  count_leading_zeros(c, high);

  c = c / 2; /* we have to shift left by 2c bits to normalize {np, nn} */
  tn = (nn+1) / 2; /* 2*tn is the smallest even integer >= nn */

  TMP_MARK (marker);
  if ((nn % 2) || (c > 0)) {
    tp = (mp_limb_t*) TMP_ALLOC (2 * tn * BYTES_PER_MP_LIMB);
    tp[0] = 0; /* needed only when 2*tn > nn, but saves a test */
    if (c) mpn_lshift(tp + 2*tn - nn, np, nn, 2 * c);
    else MPN_COPY (tp + 2*tn - nn, np, nn);
    rl = mpn_dq_sqrtrem (sp, tp, tn);
    /* we have 2^(2k)*N = S^2 + R where k = c + (2tn-nn)*BITS_PER_MP_LIMB/2,
       thus 2^(2k)*N = (S-s0)^2 + 2*S*s0 - s0^2 + R where s0=S mod 2^k */
    c += (nn % 2) * BITS_PER_MP_LIMB / 2; /* c now represents k */
    s0[0] = sp[0] & ((MP_LIMB_T_ONE << c) - MP_LIMB_T_ONE); /* S mod 2^k */
    rl += mpn_addmul_1 (tp, sp, tn, 2 * s0[0]); /* R = R + 2*s0*S */
    cc = mpn_submul_1 (tp, s0, 1, s0[0]);
    rl -= (tn > 1) ? mpn_sub_1(tp + 1, tp + 1, tn - 1, cc) : cc;
    mpn_rshift (sp, sp, tn, c);
    tp[tn] = rl;
    if (rp == NULL) rp = tp;
    c = c << 1;
    if (c < BITS_PER_MP_LIMB) tn++; else { tp++; c -= BITS_PER_MP_LIMB; }
    if (c) mpn_rshift (rp, tp, tn, c); else MPN_COPY(rp, tp, tn);
    rn = tn;
  }
  else {
    if (rp == NULL)
      rp = (mp_limb_t*) TMP_ALLOC (nn * BYTES_PER_MP_LIMB);
    if (rp != np) MPN_COPY (rp, np, nn);
    rn = tn + (rp[tn] = mpn_dq_sqrtrem (sp, rp, tn));
  }
  
  MPN_NORMALIZE (rp, rn);

  TMP_FREE (marker);
  return rn;
}