/* __gmpfr_invert_limb -- implement GMP's invert_limb (which is not in GMP API)

Copyright 2016-2021 Free Software Foundation, Inc.
Contributed by the AriC and Caramba projects, INRIA.

This file is part of the GNU MPFR Library.

The GNU MPFR Library is free software; you can redistribute it and/or modify
it under the terms of the GNU Lesser General Public License as published by
the Free Software Foundation; either version 3 of the License, or (at your
option) any later version.

The GNU MPFR Library is distributed in the hope that it will be useful, but
WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
License for more details.

You should have received a copy of the GNU Lesser General Public License
along with the GNU MPFR Library; see the file COPYING.LESSER.  If not, see
https://www.gnu.org/licenses/ or write to the Free Software Foundation, Inc.,
51 Franklin St, Fifth Floor, Boston, MA 02110-1301, USA. */

#define MPFR_NEED_LONGLONG_H
#include "mpfr-impl.h"

/* for now, we only provide __gmpfr_invert_limb for 64-bit limb */
#if GMP_NUMB_BITS == 64

/* umul_hi(h, x, y) puts in h the high part of x*y */
#ifdef HAVE_MULX_U64
#include <immintrin.h>
#define umul_hi(h, x, y) _mulx_u64 (x, y, (unsigned long long *) &(h))
#else
#define umul_hi(h, x, y)                        \
  do {                                          \
    mp_limb_t _l;                               \
    umul_ppmm (h, _l, x, y);                    \
    (void) _l;  /* unused variable */           \
  } while (0)
#endif

/* for 256 <= d9 < 512, invert_limb_table[d9-256] = floor((2^19-3*2^8)/d9) */
static const unsigned short invert_limb_table[256] =
  { 2045, 2037, 2029, 2021, 2013, 2005, 1998, 1990,
    1983, 1975, 1968, 1960, 1953, 1946, 1938, 1931,
    1924, 1917, 1910, 1903, 1896, 1889, 1883, 1876,
    1869, 1863, 1856, 1849, 1843, 1836, 1830, 1824,
    1817, 1811, 1805, 1799, 1792, 1786, 1780, 1774,
    1768, 1762, 1756, 1750, 1745, 1739, 1733, 1727,
    1722, 1716, 1710, 1705, 1699, 1694, 1688, 1683,
    1677, 1672, 1667, 1661, 1656, 1651, 1646, 1641,
    1636, 1630, 1625, 1620, 1615, 1610, 1605, 1600,
    1596, 1591, 1586, 1581, 1576, 1572, 1567, 1562,
    1558, 1553, 1548, 1544, 1539, 1535, 1530, 1526,
    1521, 1517, 1513, 1508, 1504, 1500, 1495, 1491,
    1487, 1483, 1478, 1474, 1470, 1466, 1462, 1458,
    1454, 1450, 1446, 1442, 1438, 1434, 1430, 1426,
    1422, 1418, 1414, 1411, 1407, 1403, 1399, 1396,
    1392, 1388, 1384, 1381, 1377, 1374, 1370, 1366,
    1363, 1359, 1356, 1352, 1349, 1345, 1342, 1338,
    1335, 1332, 1328, 1325, 1322, 1318, 1315, 1312,
    1308, 1305, 1302, 1299, 1295, 1292, 1289, 1286,
    1283, 1280, 1276, 1273, 1270, 1267, 1264, 1261,
    1258, 1255, 1252, 1249, 1246, 1243, 1240, 1237,
    1234, 1231, 1228, 1226, 1223, 1220, 1217, 1214,
    1211, 1209, 1206, 1203, 1200, 1197, 1195, 1192,
    1189, 1187, 1184, 1181, 1179, 1176, 1173, 1171,
    1168, 1165, 1163, 1160, 1158, 1155, 1153, 1150,
    1148, 1145, 1143, 1140, 1138, 1135, 1133, 1130,
    1128, 1125, 1123, 1121, 1118, 1116, 1113, 1111,
    1109, 1106, 1104, 1102, 1099, 1097, 1095, 1092,
    1090, 1088, 1086, 1083, 1081, 1079, 1077, 1074,
    1072, 1070, 1068, 1066, 1064, 1061, 1059, 1057,
    1055, 1053, 1051, 1049, 1047, 1044, 1042, 1040,
    1038, 1036, 1034, 1032, 1030, 1028, 1026, 1024 };

/* for 256 <= d9 < 512, invert_limb_table2[d9-256] = floor((2^19-3*2^8)/d9)^2
 *
 * Note: This table requires 4182025 to be representable in unsigned int,
 * thus disallows 16-bit int, for instance. However, this code is under
 * "#if GMP_NUMB_BITS == 64", and a system with int smaller than 32 bits
 * should better be used with GMP_NUMB_BITS == 32 (or 16 if supported).
 * This constraint is checked with MPFR_STAT_STATIC_ASSERT below.
 */
static const unsigned int invert_limb_table2[256] =
  { 4182025, 4149369, 4116841, 4084441, 4052169, 4020025, 3992004, 3960100, 3932289,
    3900625, 3873024, 3841600, 3814209, 3786916, 3755844, 3728761, 3701776, 3674889,
    3648100, 3621409, 3594816, 3568321, 3545689, 3519376, 3493161, 3470769, 3444736,
    3418801, 3396649, 3370896, 3348900, 3326976, 3301489, 3279721, 3258025, 3236401,
    3211264, 3189796, 3168400, 3147076, 3125824, 3104644, 3083536, 3062500, 3045025,
    3024121, 3003289, 2982529, 2965284, 2944656, 2924100, 2907025, 2886601, 2869636,
    2849344, 2832489, 2812329, 2795584, 2778889, 2758921, 2742336, 2725801, 2709316,
    2692881, 2676496, 2656900, 2640625, 2624400, 2608225, 2592100, 2576025, 2560000,
    2547216, 2531281, 2515396, 2499561, 2483776, 2471184, 2455489, 2439844, 2427364,
    2411809, 2396304, 2383936, 2368521, 2356225, 2340900, 2328676, 2313441, 2301289,
    2289169, 2274064, 2262016, 2250000, 2235025, 2223081, 2211169, 2199289, 2184484,
    2172676, 2160900, 2149156, 2137444, 2125764, 2114116, 2102500, 2090916, 2079364,
    2067844, 2056356, 2044900, 2033476, 2022084, 2010724, 1999396, 1990921, 1979649,
    1968409, 1957201, 1948816, 1937664, 1926544, 1915456, 1907161, 1896129, 1887876,
    1876900, 1865956, 1857769, 1846881, 1838736, 1827904, 1819801, 1809025, 1800964,
    1790244, 1782225, 1774224, 1763584, 1755625, 1747684, 1737124, 1729225, 1721344,
    1710864, 1703025, 1695204, 1687401, 1677025, 1669264, 1661521, 1653796, 1646089,
    1638400, 1628176, 1620529, 1612900, 1605289, 1597696, 1590121, 1582564, 1575025,
    1567504, 1560001, 1552516, 1545049, 1537600, 1530169, 1522756, 1515361, 1507984,
    1503076, 1495729, 1488400, 1481089, 1473796, 1466521, 1461681, 1454436, 1447209,
    1440000, 1432809, 1428025, 1420864, 1413721, 1408969, 1401856, 1394761, 1390041,
    1382976, 1375929, 1371241, 1364224, 1357225, 1352569, 1345600, 1340964, 1334025,
    1329409, 1322500, 1317904, 1311025, 1306449, 1299600, 1295044, 1288225, 1283689,
    1276900, 1272384, 1265625, 1261129, 1256641, 1249924, 1245456, 1238769, 1234321,
    1229881, 1223236, 1218816, 1214404, 1207801, 1203409, 1199025, 1192464, 1188100,
    1183744, 1179396, 1172889, 1168561, 1164241, 1159929, 1153476, 1149184, 1144900,
    1140624, 1136356, 1132096, 1125721, 1121481, 1117249, 1113025, 1108809, 1104601,
    1100401, 1096209, 1089936, 1085764, 1081600, 1077444, 1073296, 1069156, 1065024,
    1060900, 1056784, 1052676, 1048576 };

/* Implements Algorithm 2 from "Improved Division by Invariant Integers",
   Niels Möller and Torbjörn Granlund, IEEE Transactions on Computers,
   volume 60, number 2, pages 165-175, 2011. */
#define __gmpfr_invert_limb(r, d)                                       \
    do {                                                                \
      mp_limb_t _d, _d0, _i, _d40, _d63, _v0, _v1, _v2, _e, _v3, _h, _l; \
      MPFR_STAT_STATIC_ASSERT (4182025 <= UINT_MAX);                    \
      _d = (d);                                                         \
      _i = (_d >> 55) - 256; /* i = d9 - 256 */                         \
      /* the shift by 11 is for free since it is hidden in the */       \
      /* invert_limb_table2[_i] * _d40 multiplication latency */        \
      _v0 = (mp_limb_t) invert_limb_table[_i] << 11;                    \
      _d40 = (_d >> 24) + 1;                                            \
      _v1 = _v0 - ((invert_limb_table2[_i] * _d40) >> 40) - 1;          \
      _v2 = (_v1 << 13) +                                               \
        ((_v1 * ((MPFR_LIMB_ONE << 60) - _v1 * _d40)) >> 47);           \
      _d0 = _d & 1;                                                     \
      _d63 = ((_d - 1) >> 1) + 1;                                       \
      _e = - _v2 * _d63 + ((_v2 & -_d0) >> 1);                          \
      umul_hi (_h, _v2, _e);                                            \
      _v3 = (_v2 << 31) + (_h >> 1);                                    \
      umul_ppmm (_h, _l, _v3, _d);                                      \
      /* v3 is too small iff (h+d)*2^64+l+d < 2^128 */                  \
      add_ssaaaa(_h, _l, _h, _l, _d, _d);                               \
      MPFR_ASSERTD(_h == MPFR_LIMB_ZERO || -_h == MPFR_LIMB_ONE);       \
      (r) = _v3 - _h;                                                   \
    } while (0)

/* same algorithm, but return the value v3, which is such that
   v3 <= invert_limb (d) <= v3 + 1 */
#define __gmpfr_invert_limb_approx(r, d)                                \
    do {                                                                \
      mp_limb_t _d, _d0, _i, _d40, _d63, _v0, _v1, _v2, _e, _h;         \
      MPFR_STAT_STATIC_ASSERT (4182025 <= UINT_MAX);                    \
      _d = (d);                                                         \
      _i = (_d >> 55) - 256; /* i = d9 - 256 */                         \
      _v0 = (mp_limb_t) invert_limb_table[_i] << 11;                    \
      _d40 = (_d >> 24) + 1;                                            \
      _v1 = _v0 - ((invert_limb_table2[_i] * _d40) >> 40) - 1;          \
      _v2 = (_v1 << 13) +                                               \
        ((_v1 * ((MPFR_LIMB_ONE << 60) - _v1 * _d40)) >> 47);           \
      _d0 = _d & 1;                                                     \
      _d63 = ((_d - 1) >> 1) + 1;                                       \
      _e = - _v2 * _d63 + ((_v2 & -_d0) >> 1);                          \
      umul_hi (_h, _v2, _e);                                            \
      (r) = (_v2 << 31) + (_h >> 1);                                    \
    } while (0)

#elif GMP_NUMB_BITS == 32

/* for 512 <= d10 < 1024, l[d10-512] = floor((2^24-2^14+2^9)/d10) */
static const unsigned short invert_limb_table[512] =
  { 32737, 32673, 32609, 32546, 32483, 32420, 32357, 32295, 32233, 32171,
    32109, 32048, 31987, 31926, 31865, 31805, 31744, 31684, 31625, 31565,
    31506, 31447, 31388, 31329, 31271, 31212, 31154, 31097, 31039, 30982,
    30924, 30868, 30811, 30754, 30698, 30642, 30586, 30530, 30475, 30419,
    30364, 30309, 30255, 30200, 30146, 30092, 30038, 29984, 29930, 29877,
    29824, 29771, 29718, 29666, 29613, 29561, 29509, 29457, 29405, 29354,
    29303, 29251, 29200, 29150, 29099, 29049, 28998, 28948, 28898, 28849,
    28799, 28750, 28700, 28651, 28602, 28554, 28505, 28457, 28409, 28360,
    28313, 28265, 28217, 28170, 28123, 28075, 28029, 27982, 27935, 27889,
    27842, 27796, 27750, 27704, 27658, 27613, 27568, 27522, 27477, 27432,
    27387, 27343, 27298, 27254, 27209, 27165, 27121, 27078, 27034, 26990,
    26947, 26904, 26861, 26818, 26775, 26732, 26690, 26647, 26605, 26563,
    26521, 26479, 26437, 26395, 26354, 26312, 26271, 26230, 26189, 26148,
    26108, 26067, 26026, 25986, 25946, 25906, 25866, 25826, 25786, 25747,
    25707, 25668, 25628, 25589, 25550, 25511, 25473, 25434, 25395, 25357,
    25319, 25281, 25242, 25205, 25167, 25129, 25091, 25054, 25016, 24979,
    24942, 24905, 24868, 24831, 24794, 24758, 24721, 24685, 24649, 24612,
    24576, 24540, 24504, 24469, 24433, 24397, 24362, 24327, 24291, 24256,
    24221, 24186, 24151, 24117, 24082, 24047, 24013, 23979, 23944, 23910,
    23876, 23842, 23808, 23774, 23741, 23707, 23674, 23640, 23607, 23574,
    23541, 23508, 23475, 23442, 23409, 23377, 23344, 23312, 23279, 23247,
    23215, 23183, 23151, 23119, 23087, 23055, 23023, 22992, 22960, 22929,
    22898, 22866, 22835, 22804, 22773, 22742, 22711, 22681, 22650, 22619,
    22589, 22559, 22528, 22498, 22468, 22438, 22408, 22378, 22348, 22318,
    22289, 22259, 22229, 22200, 22171, 22141, 22112, 22083, 22054, 22025,
    21996, 21967, 21938, 21910, 21881, 21853, 21824, 21796, 21767, 21739,
    21711, 21683, 21655, 21627, 21599, 21571, 21544, 21516, 21488, 21461,
    21433, 21406, 21379, 21352, 21324, 21297, 21270, 21243, 21216, 21190,
    21163, 21136, 21110, 21083, 21056, 21030, 21004, 20977, 20951, 20925,
    20899, 20873, 20847, 20821, 20795, 20769, 20744, 20718, 20693, 20667,
    20642, 20616, 20591, 20566, 20540, 20515, 20490, 20465, 20440, 20415,
    20390, 20366, 20341, 20316, 20292, 20267, 20243, 20218, 20194, 20170,
    20145, 20121, 20097, 20073, 20049, 20025, 20001, 19977, 19953, 19930,
    19906, 19882, 19859, 19835, 19812, 19789, 19765, 19742, 19719, 19696,
    19672, 19649, 19626, 19603, 19581, 19558, 19535, 19512, 19489, 19467,
    19444, 19422, 19399, 19377, 19354, 19332, 19310, 19288, 19265, 19243,
    19221, 19199, 19177, 19155, 19133, 19112, 19090, 19068, 19046, 19025,
    19003, 18982, 18960, 18939, 18917, 18896, 18875, 18854, 18832, 18811,
    18790, 18769, 18748, 18727, 18706, 18686, 18665, 18644, 18623, 18603,
    18582, 18561, 18541, 18520, 18500, 18479, 18459, 18439, 18419, 18398,
    18378, 18358, 18338, 18318, 18298, 18278, 18258, 18238, 18218, 18199,
    18179, 18159, 18139, 18120, 18100, 18081, 18061, 18042, 18022, 18003,
    17984, 17964, 17945, 17926, 17907, 17888, 17869, 17850, 17831, 17812,
    17793, 17774, 17755, 17736, 17718, 17699, 17680, 17662, 17643, 17624,
    17606, 17587, 17569, 17551, 17532, 17514, 17496, 17477, 17459, 17441,
    17423, 17405, 17387, 17369, 17351, 17333, 17315, 17297, 17279, 17261,
    17244, 17226, 17208, 17191, 17173, 17155, 17138, 17120, 17103, 17085,
    17068, 17051, 17033, 17016, 16999, 16982, 16964, 16947, 16930, 16913,
    16896, 16879, 16862, 16845, 16828, 16811, 16794, 16778, 16761, 16744,
    16727, 16711, 16694, 16677, 16661, 16644, 16628, 16611, 16595, 16578,
    16562, 16546, 16529, 16513, 16497, 16481, 16464, 16448, 16432, 16416,
    16400, 16384 };

/* Implements Algorithm 3 from "Improved Division by Invariant Integers",
   Niels Möller and Torbjörn Granlund, IEEE Transactions on Computers,
   volume 60, number 2, pages 165-175, 2011. */
#define __gmpfr_invert_limb(r, d)                                       \
    do {                                                                \
      mp_limb_t _d, _d0, _d10, _d21, _d31, _v0, _v1, _v2, _e, _h, _l;   \
      _d = (d);                                                         \
      _d0 = _d & 1;                                                     \
      _d10 = _d >> 22;                                                  \
      _d21 = (_d >> 11) + 1;                                            \
      _d31 = ((_d - 1) >> 1) + 1;                                       \
      _v0 = invert_limb_table[_d10 - 512];                              \
      umul_ppmm (_h, _l, _v0 * _v0, _d21);                              \
      _v1 = (_v0 << 4) - _h - 1;                                        \
      _e = - _v1 * _d31 + ((_v1 & - _d0) >> 1);                         \
      umul_ppmm (_h, _l, _v1, _e);                                      \
      _v2 = (_v1 << 15) + (_h >> 1);                                    \
      umul_ppmm (_h, _l, _v2, _d);                                      \
      /* v2 is too small iff (h+d)*2^32+l+d < 2^64 */                   \
      add_ssaaaa(_h, _l, _h, _l, _d, _d);                               \
      MPFR_ASSERTD(_h == MPFR_LIMB_ZERO || -_h == MPFR_LIMB_ONE);       \
      (r) = _v2 - _h;                                                   \
    } while (0)

#endif /* GMP_NUMB_BITS == 64 or 32 */