Diffstat (limited to 'deps/v8/src/bigint')
-rw-r--r-- | deps/v8/src/bigint/OWNERS               |   1
-rw-r--r-- | deps/v8/src/bigint/bigint-internal.cc   |  68
-rw-r--r-- | deps/v8/src/bigint/bigint-internal.h    |  62
-rw-r--r-- | deps/v8/src/bigint/bigint.h             |  60
-rw-r--r-- | deps/v8/src/bigint/digit-arithmetic.h   | 108
-rw-r--r-- | deps/v8/src/bigint/div-burnikel.cc      | 279
-rw-r--r-- | deps/v8/src/bigint/div-helpers.cc       |  68
-rw-r--r-- | deps/v8/src/bigint/div-helpers.h        |  82
-rw-r--r-- | deps/v8/src/bigint/div-schoolbook.cc    | 190
-rw-r--r-- | deps/v8/src/bigint/mul-fft.cc           | 790
-rw-r--r-- | deps/v8/src/bigint/mul-karatsuba.cc     | 205
-rw-r--r-- | deps/v8/src/bigint/mul-schoolbook.cc    |   2
-rw-r--r-- | deps/v8/src/bigint/mul-toom.cc          | 223
-rw-r--r-- | deps/v8/src/bigint/tostring.cc          | 303
-rw-r--r-- | deps/v8/src/bigint/util.h               |  75
-rw-r--r-- | deps/v8/src/bigint/vector-arithmetic.cc | 118
-rw-r--r-- | deps/v8/src/bigint/vector-arithmetic.h  |  40
17 files changed, 2655 insertions, 19 deletions
diff --git a/deps/v8/src/bigint/OWNERS b/deps/v8/src/bigint/OWNERS index f8c2fcc47a..8a2ddcc49d 100644 --- a/deps/v8/src/bigint/OWNERS +++ b/deps/v8/src/bigint/OWNERS @@ -1,2 +1,3 @@ jkummerow@chromium.org +mslekova@chromium.org thibaudm@chromium.org diff --git a/deps/v8/src/bigint/bigint-internal.cc b/deps/v8/src/bigint/bigint-internal.cc index 6630c6c4c9..828a450e8a 100644 --- a/deps/v8/src/bigint/bigint-internal.cc +++ b/deps/v8/src/bigint/bigint-internal.cc @@ -30,7 +30,61 @@ void ProcessorImpl::Multiply(RWDigits Z, Digits X, Digits Y) { if (X.len() == 0 || Y.len() == 0) return Z.Clear(); if (X.len() < Y.len()) std::swap(X, Y); if (Y.len() == 1) return MultiplySingle(Z, X, Y[0]); - return MultiplySchoolbook(Z, X, Y); + if (Y.len() < kKaratsubaThreshold) return MultiplySchoolbook(Z, X, Y); +#if !V8_ADVANCED_BIGINT_ALGORITHMS + return MultiplyKaratsuba(Z, X, Y); +#else + if (Y.len() < kToomThreshold) return MultiplyKaratsuba(Z, X, Y); + if (Y.len() < kFftThreshold) return MultiplyToomCook(Z, X, Y); + return MultiplyFFT(Z, X, Y); +#endif +} + +void ProcessorImpl::Divide(RWDigits Q, Digits A, Digits B) { + A.Normalize(); + B.Normalize(); + DCHECK(B.len() > 0); // NOLINT(readability/check) + int cmp = Compare(A, B); + if (cmp < 0) return Q.Clear(); + if (cmp == 0) { + Q[0] = 1; + for (int i = 1; i < Q.len(); i++) Q[i] = 0; + return; + } + if (B.len() == 1) { + digit_t remainder; + return DivideSingle(Q, &remainder, A, B[0]); + } + if (B.len() < kBurnikelThreshold) { + return DivideSchoolbook(Q, RWDigits(nullptr, 0), A, B); + } + return DivideBurnikelZiegler(Q, RWDigits(nullptr, 0), A, B); +} + +void ProcessorImpl::Modulo(RWDigits R, Digits A, Digits B) { + A.Normalize(); + B.Normalize(); + DCHECK(B.len() > 0); // NOLINT(readability/check) + int cmp = Compare(A, B); + if (cmp < 0) { + for (int i = 0; i < B.len(); i++) R[i] = B[i]; + for (int i = B.len(); i < R.len(); i++) R[i] = 0; + return; + } + if (cmp == 0) return R.Clear(); + if (B.len() == 1) { + digit_t remainder; + DivideSingle(RWDigits(nullptr, 0), &remainder, A, B[0]); + R[0] = remainder; + for (int i = 1; i < R.len(); i++) R[i] = 0; + return; + } + if (B.len() < kBurnikelThreshold) { + return DivideSchoolbook(RWDigits(nullptr, 0), R, A, B); + } + int q_len = DivideResultLength(A, B); + ScratchDigits Q(q_len); + return DivideBurnikelZiegler(Q, R, A, B); } Status Processor::Multiply(RWDigits Z, Digits X, Digits Y) { @@ -39,5 +93,17 @@ Status Processor::Multiply(RWDigits Z, Digits X, Digits Y) { return impl->get_and_clear_status(); } +Status Processor::Divide(RWDigits Q, Digits A, Digits B) { + ProcessorImpl* impl = static_cast<ProcessorImpl*>(this); + impl->Divide(Q, A, B); + return impl->get_and_clear_status(); +} + +Status Processor::Modulo(RWDigits R, Digits A, Digits B) { + ProcessorImpl* impl = static_cast<ProcessorImpl*>(this); + impl->Modulo(R, A, B); + return impl->get_and_clear_status(); +} + } // namespace bigint } // namespace v8 diff --git a/deps/v8/src/bigint/bigint-internal.h b/deps/v8/src/bigint/bigint-internal.h index efe63a06a5..41ef9526e5 100644 --- a/deps/v8/src/bigint/bigint-internal.h +++ b/deps/v8/src/bigint/bigint-internal.h @@ -5,11 +5,20 @@ #ifndef V8_BIGINT_BIGINT_INTERNAL_H_ #define V8_BIGINT_BIGINT_INTERNAL_H_ +#include <memory> + #include "src/bigint/bigint.h" namespace v8 { namespace bigint { +constexpr int kKaratsubaThreshold = 34; +constexpr int kToomThreshold = 193; +constexpr int kFftThreshold = 1500; +constexpr int kFftInnerThreshold = 200; + +constexpr int kBurnikelThreshold = 57; + class 
ProcessorImpl : public Processor { public: explicit ProcessorImpl(Platform* platform); @@ -21,7 +30,31 @@ class ProcessorImpl : public Processor { void MultiplySingle(RWDigits Z, Digits X, digit_t y); void MultiplySchoolbook(RWDigits Z, Digits X, Digits Y); - private: + void MultiplyKaratsuba(RWDigits Z, Digits X, Digits Y); + void KaratsubaStart(RWDigits Z, Digits X, Digits Y, RWDigits scratch, int k); + void KaratsubaChunk(RWDigits Z, Digits X, Digits Y, RWDigits scratch); + void KaratsubaMain(RWDigits Z, Digits X, Digits Y, RWDigits scratch, int n); + + void Divide(RWDigits Q, Digits A, Digits B); + void DivideSingle(RWDigits Q, digit_t* remainder, Digits A, digit_t b); + void DivideSchoolbook(RWDigits Q, RWDigits R, Digits A, Digits B); + void DivideBurnikelZiegler(RWDigits Q, RWDigits R, Digits A, Digits B); + + void Modulo(RWDigits R, Digits A, Digits B); + +#if V8_ADVANCED_BIGINT_ALGORITHMS + void MultiplyToomCook(RWDigits Z, Digits X, Digits Y); + void Toom3Main(RWDigits Z, Digits X, Digits Y); + + void MultiplyFFT(RWDigits Z, Digits X, Digits Y); +#endif // V8_ADVANCED_BIGINT_ALGORITHMS + + // {out_length} initially contains the allocated capacity of {out}, and + // upon return will be set to the actual length of the result string. + void ToString(char* out, int* out_length, Digits X, int radix, bool sign); + + bool should_terminate() { return status_ == Status::kInterrupted; } + // Each unit is supposed to represent approximately one CPU {mul} instruction. // Doesn't need to be accurate; we just want to make sure to check for // interrupt requests every now and then (roughly every 10-100 ms; often @@ -39,8 +72,7 @@ class ProcessorImpl : public Processor { } } - bool should_terminate() { return status_ == Status::kInterrupted; } - + private: uintptr_t work_estimate_{0}; Status status_{Status::kOk}; Platform* platform_; @@ -59,6 +91,30 @@ class ProcessorImpl : public Processor { #define DCHECK(cond) (void(0)) #endif +#define USE(var) ((void)var) + +// RAII memory for a Digits array. +class Storage { + public: + explicit Storage(int count) : ptr_(new digit_t[count]) {} + + digit_t* get() { return ptr_.get(); } + + private: + std::unique_ptr<digit_t[]> ptr_; +}; + +// A writable Digits array with attached storage. +class ScratchDigits : public RWDigits { + public: + explicit ScratchDigits(int len) : RWDigits(nullptr, len), storage_(len) { + digits_ = storage_.get(); + } + + private: + Storage storage_; +}; + } // namespace bigint } // namespace v8 diff --git a/deps/v8/src/bigint/bigint.h b/deps/v8/src/bigint/bigint.h index a365359c53..6d3790808c 100644 --- a/deps/v8/src/bigint/bigint.h +++ b/deps/v8/src/bigint/bigint.h @@ -101,7 +101,7 @@ class Digits { const digit_t* digits() const { return digits_; } protected: - friend class TemporaryLeftShift; + friend class ShiftedDigits; digit_t* digits_; int len_; @@ -201,12 +201,35 @@ class Platform { // // The operations are divided into two groups: "fast" (O(n) with small // coefficient) operations are exposed directly as free functions, "slow" -// operations are methods on a {BigIntProcessor} object, which provides +// operations are methods on a {Processor} object, which provides // support for interrupting execution via the {Platform}'s {InterruptRequested} // mechanism when it takes too long. These functions return a {Status} value. // Returns r such that r < 0 if A < B; r > 0 if A > B; r == 0 if A == B. -int Compare(Digits A, Digits B); +// Defined here to be inlineable, which helps ia32 a lot (64-bit platforms +// don't care). 
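As an illustration of how a caller might drive the new Processor entry points (a sketch, not part of the patch; it assumes the pointer-and-length Digits/RWDigits constructors from this header and an embedder-provided Platform), the result-length helpers size the output buffers exactly as Divide and Modulo expect:

#include <algorithm>
#include <memory>
#include <vector>

#include "src/bigint/bigint.h"

namespace {

using namespace v8::bigint;

// Hypothetical helper: {platform} comes from the embedder (Processor::New takes
// ownership of it), A and B are views over caller-owned digit buffers, B != 0.
Status DivMod(Platform* platform, Digits A, Digits B,
              std::vector<digit_t>* q_mem, std::vector<digit_t>* r_mem) {
  std::unique_ptr<Processor, Processor::Destroyer> proc(
      Processor::New(platform));
  // Size the outputs with the result-length helpers; guard against A < B.
  q_mem->resize(std::max(DivideResultLength(A, B), 1));
  r_mem->resize(ModuloResultLength(B));
  RWDigits Q(q_mem->data(), static_cast<int>(q_mem->size()));
  RWDigits R(r_mem->data(), static_cast<int>(r_mem->size()));
  Status status = proc->Divide(Q, A, B);
  if (status != Status::kOk) return status;  // e.g. interrupted via the Platform
  return proc->Modulo(R, A, B);
}

}  // namespace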
+inline int Compare(Digits A, Digits B) { + A.Normalize(); + B.Normalize(); + int diff = A.len() - B.len(); + if (diff != 0) return diff; + int i = A.len() - 1; + while (i >= 0 && A[i] == B[i]) i--; + if (i < 0) return 0; + return A[i] > B[i] ? 1 : -1; +} + +// Z := X + Y +void Add(RWDigits Z, Digits X, Digits Y); +// Addition of signed integers. Returns true if the result is negative. +bool AddSigned(RWDigits Z, Digits X, bool x_negative, Digits Y, + bool y_negative); + +// Z := X - Y. Requires X >= Y. +void Subtract(RWDigits Z, Digits X, Digits Y); +// Subtraction of signed integers. Returns true if the result is negative. +bool SubtractSigned(RWDigits Z, Digits X, bool x_negative, Digits Y, + bool y_negative); enum class Status { kOk, kInterrupted }; @@ -215,7 +238,7 @@ class Processor { // Takes ownership of {platform}. static Processor* New(Platform* platform); - // Use this for any std::unique_ptr holding an instance of BigIntProcessor. + // Use this for any std::unique_ptr holding an instance of {Processor}. class Destroyer { public: void operator()(Processor* proc) { proc->Destroy(); } @@ -225,11 +248,40 @@ class Processor { // Z := X * Y Status Multiply(RWDigits Z, Digits X, Digits Y); + // Q := A / B + Status Divide(RWDigits Q, Digits A, Digits B); + // R := A % B + Status Modulo(RWDigits R, Digits A, Digits B); + + // {out_length} initially contains the allocated capacity of {out}, and + // upon return will be set to the actual length of the result string. + Status ToString(char* out, int* out_length, Digits X, int radix, bool sign); }; +inline int AddResultLength(int x_length, int y_length) { + return std::max(x_length, y_length) + 1; +} +inline int AddSignedResultLength(int x_length, int y_length, bool same_sign) { + return same_sign ? AddResultLength(x_length, y_length) + : std::max(x_length, y_length); +} +inline int SubtractResultLength(int x_length, int y_length) { return x_length; } +inline int SubtractSignedResultLength(int x_length, int y_length, + bool same_sign) { + return same_sign ? std::max(x_length, y_length) + : AddResultLength(x_length, y_length); +} inline int MultiplyResultLength(Digits X, Digits Y) { return X.len() + Y.len(); } +inline int DivideResultLength(Digits A, Digits B) { + return A.len() - B.len() + 1; +} +inline int ModuloResultLength(Digits B) { return B.len(); } + +int ToStringResultLength(Digits X, int radix, bool sign); +// In DEBUG builds, the result of {ToString} will be initialized to this value. +constexpr char kStringZapValue = '?'; } // namespace bigint } // namespace v8 diff --git a/deps/v8/src/bigint/digit-arithmetic.h b/deps/v8/src/bigint/digit-arithmetic.h index 1c5c93c035..96ac949eb7 100644 --- a/deps/v8/src/bigint/digit-arithmetic.h +++ b/deps/v8/src/bigint/digit-arithmetic.h @@ -8,6 +8,7 @@ #define V8_BIGINT_DIGIT_ARITHMETIC_H_ #include "src/bigint/bigint.h" +#include "src/bigint/util.h" namespace v8 { namespace bigint { @@ -45,6 +46,36 @@ inline digit_t digit_add3(digit_t a, digit_t b, digit_t c, digit_t* carry) { #endif } +// {borrow} will be set to 0 or 1. +inline digit_t digit_sub(digit_t a, digit_t b, digit_t* borrow) { +#if HAVE_TWODIGIT_T + twodigit_t result = twodigit_t{a} - b; + *borrow = (result >> kDigitBits) & 1; + return static_cast<digit_t>(result); +#else + digit_t result = a - b; + *borrow = (result > a) ? 1 : 0; + return result; +#endif +} + +// {borrow_out} will be set to 0 or 1. 
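As an aside (a sketch, not the actual vector-arithmetic.cc code), these single-digit primitives compose into the vector-level Subtract declared above by threading the borrow from one digit to the next:

// Z := X - Y, assuming X >= Y and Z.len() >= X.len().
void SubtractSketch(RWDigits Z, Digits X, Digits Y) {
  digit_t borrow = 0;
  int i = 0;
  for (; i < Y.len(); i++) Z[i] = digit_sub2(X[i], Y[i], borrow, &borrow);
  for (; i < X.len(); i++) Z[i] = digit_sub(X[i], borrow, &borrow);
  // X >= Y guarantees that the final borrow is zero.
  for (; i < Z.len(); i++) Z[i] = 0;
}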
+inline digit_t digit_sub2(digit_t a, digit_t b, digit_t borrow_in, + digit_t* borrow_out) { +#if HAVE_TWODIGIT_T + twodigit_t subtrahend = twodigit_t{b} + borrow_in; + twodigit_t result = twodigit_t{a} - subtrahend; + *borrow_out = (result >> kDigitBits) & 1; + return static_cast<digit_t>(result); +#else + digit_t result = a - b; + *borrow_out = (result > a) ? 1 : 0; + if (result < borrow_in) *borrow_out += 1; + result -= borrow_in; + return result; +#endif +} + // Returns the low half of the result. High half is in {high}. inline digit_t digit_mul(digit_t a, digit_t b, digit_t* high) { #if HAVE_TWODIGIT_T @@ -81,6 +112,83 @@ inline digit_t digit_mul(digit_t a, digit_t b, digit_t* high) { #endif } +// Returns the quotient. +// quotient = (high << kDigitBits + low - remainder) / divisor +static inline digit_t digit_div(digit_t high, digit_t low, digit_t divisor, + digit_t* remainder) { +#if defined(DCHECK) + DCHECK(high < divisor); + DCHECK(divisor != 0); // NOLINT(readability/check) +#endif +#if __x86_64__ && (__GNUC__ || __clang__) + digit_t quotient; + digit_t rem; + __asm__("divq %[divisor]" + // Outputs: {quotient} will be in rax, {rem} in rdx. + : "=a"(quotient), "=d"(rem) + // Inputs: put {high} into rdx, {low} into rax, and {divisor} into + // any register or stack slot. + : "d"(high), "a"(low), [divisor] "rm"(divisor)); + *remainder = rem; + return quotient; +#elif __i386__ && (__GNUC__ || __clang__) + digit_t quotient; + digit_t rem; + __asm__("divl %[divisor]" + // Outputs: {quotient} will be in eax, {rem} in edx. + : "=a"(quotient), "=d"(rem) + // Inputs: put {high} into edx, {low} into eax, and {divisor} into + // any register or stack slot. + : "d"(high), "a"(low), [divisor] "rm"(divisor)); + *remainder = rem; + return quotient; +#else + // Adapted from Warren, Hacker's Delight, p. 152. + int s = CountLeadingZeros(divisor); +#if defined(DCHECK) + DCHECK(s != kDigitBits); // {divisor} is not 0. +#endif + divisor <<= s; + + digit_t vn1 = divisor >> kHalfDigitBits; + digit_t vn0 = divisor & kHalfDigitMask; + // {s} can be 0. {low >> kDigitBits} would be undefined behavior, so + // we mask the shift amount with {kShiftMask}, and the result with + // {s_zero_mask} which is 0 if s == 0 and all 1-bits otherwise. 
+ static_assert(sizeof(intptr_t) == sizeof(digit_t), + "intptr_t and digit_t must have the same size"); + const int kShiftMask = kDigitBits - 1; + digit_t s_zero_mask = + static_cast<digit_t>(static_cast<intptr_t>(-s) >> (kDigitBits - 1)); + digit_t un32 = + (high << s) | ((low >> ((kDigitBits - s) & kShiftMask)) & s_zero_mask); + digit_t un10 = low << s; + digit_t un1 = un10 >> kHalfDigitBits; + digit_t un0 = un10 & kHalfDigitMask; + digit_t q1 = un32 / vn1; + digit_t rhat = un32 - q1 * vn1; + + while (q1 >= kHalfDigitBase || q1 * vn0 > rhat * kHalfDigitBase + un1) { + q1--; + rhat += vn1; + if (rhat >= kHalfDigitBase) break; + } + + digit_t un21 = un32 * kHalfDigitBase + un1 - q1 * divisor; + digit_t q0 = un21 / vn1; + rhat = un21 - q0 * vn1; + + while (q0 >= kHalfDigitBase || q0 * vn0 > rhat * kHalfDigitBase + un0) { + q0--; + rhat += vn1; + if (rhat >= kHalfDigitBase) break; + } + + *remainder = (un21 * kHalfDigitBase + un0 - q0 * divisor) >> s; + return q1 * kHalfDigitBase + q0; +#endif +} + } // namespace bigint } // namespace v8 diff --git a/deps/v8/src/bigint/div-burnikel.cc b/deps/v8/src/bigint/div-burnikel.cc new file mode 100644 index 0000000000..0caedb1cc1 --- /dev/null +++ b/deps/v8/src/bigint/div-burnikel.cc @@ -0,0 +1,279 @@ +// Copyright 2021 the V8 project authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. + +// Burnikel-Ziegler division. +// Reference: "Fast Recursive Division" by Christoph Burnikel and Joachim +// Ziegler, found at http://cr.yp.to/bib/1998/burnikel.ps + +#include <string.h> + +#include "src/bigint/bigint-internal.h" +#include "src/bigint/digit-arithmetic.h" +#include "src/bigint/div-helpers.h" +#include "src/bigint/util.h" +#include "src/bigint/vector-arithmetic.h" + +namespace v8 { +namespace bigint { + +namespace { + +// Compares [a_high, A] with B. +// Returns: +// - a value < 0 if [a_high, A] < B +// - 0 if [a_high, A] == B +// - a value > 0 if [a_high, A] > B. +int SpecialCompare(digit_t a_high, Digits A, Digits B) { + B.Normalize(); + int a_len; + if (a_high == 0) { + A.Normalize(); + a_len = A.len(); + } else { + a_len = A.len() + 1; + } + int diff = a_len - B.len(); + if (diff != 0) return diff; + int i = a_len - 1; + if (a_high != 0) { + if (a_high > B[i]) return 1; + if (a_high < B[i]) return -1; + i--; + } + while (i >= 0 && A[i] == B[i]) i--; + if (i < 0) return 0; + return A[i] > B[i] ? 1 : -1; +} + +void SetOnes(RWDigits X) { + memset(X.digits(), 0xFF, X.len() * sizeof(digit_t)); +} + +// Since the Burnikel-Ziegler method is inherently recursive, we put +// non-changing data into a container object. +class BZ { + public: + BZ(ProcessorImpl* proc, int scratch_space) + : proc_(proc), + scratch_mem_(scratch_space >= kBurnikelThreshold ? scratch_space : 0) {} + + void DivideBasecase(RWDigits Q, RWDigits R, Digits A, Digits B); + void D3n2n(RWDigits Q, RWDigits R, Digits A1A2, Digits A3, Digits B); + void D2n1n(RWDigits Q, RWDigits R, Digits A, Digits B); + + private: + ProcessorImpl* proc_; + Storage scratch_mem_; +}; + +void BZ::DivideBasecase(RWDigits Q, RWDigits R, Digits A, Digits B) { + A.Normalize(); + B.Normalize(); + DCHECK(B.len() > 0); // NOLINT(readability/check) + int cmp = Compare(A, B); + if (cmp <= 0) { + Q.Clear(); + if (cmp == 0) { + // If A == B, then Q=1, R=0. + R.Clear(); + Q[0] = 1; + } else { + // If A < B, then Q=0, R=A. 
+ PutAt(R, A, R.len()); + } + return; + } + if (B.len() == 1) { + return proc_->DivideSingle(Q, R.digits(), A, B[0]); + } + return proc_->DivideSchoolbook(Q, R, A, B); +} + +// Algorithm 2 from the paper. Variable names same as there. +// Returns Q(uotient) and R(emainder) for A/B, with B having two thirds +// the size of A = [A1, A2, A3]. +void BZ::D3n2n(RWDigits Q, RWDigits R, Digits A1A2, Digits A3, Digits B) { + DCHECK((B.len() & 1) == 0); // NOLINT(readability/check) + int n = B.len() / 2; + DCHECK(A1A2.len() == 2 * n); + // Actual condition is stricter than length: A < B * 2^(kDigitBits * n) + DCHECK(Compare(A1A2, B) < 0); // NOLINT(readability/check) + DCHECK(A3.len() == n); + DCHECK(Q.len() == n); + DCHECK(R.len() == 2 * n); + // 1. Split A into three parts A = [A1, A2, A3] with Ai < 2^(kDigitBits * n). + Digits A1(A1A2, n, n); + // 2. Split B into two parts B = [B1, B2] with Bi < 2^(kDigitBits * n). + Digits B1(B, n, n); + Digits B2(B, 0, n); + // 3. Distinguish the cases A1 < B1 or A1 >= B1. + RWDigits Qhat = Q; + RWDigits R1(R, n, n); + digit_t r1_high = 0; + if (Compare(A1, B1) < 0) { + // 3a. If A1 < B1, compute Qhat = floor([A1, A2] / B1) with remainder R1 + // using algorithm D2n1n. + D2n1n(Qhat, R1, A1A2, B1); + if (proc_->should_terminate()) return; + } else { + // 3b. If A1 >= B1, set Qhat = 2^(kDigitBits * n) - 1 and set + // R1 = [A1, A2] - [B1, 0] + [0, B1] + SetOnes(Qhat); + // Step 1: compute A1 - B1, which can't underflow because of the comparison + // guarding this else-branch, and always has a one-digit result because + // of this function's preconditions. + RWDigits temp = R1; + Subtract(temp, A1, B1); + temp.Normalize(); + DCHECK(temp.len() <= 1); // NOLINT(readability/check) + if (temp.len() > 0) r1_high = temp[0]; + // Step 2: compute A2 + B1. + Digits A2(A1A2, 0, n); + r1_high += AddAndReturnCarry(R1, A2, B1); + } + // 4. Compute D = Qhat * B2 using (Karatsuba) multiplication. + RWDigits D(scratch_mem_.get(), 2 * n); + proc_->Multiply(D, Qhat, B2); + if (proc_->should_terminate()) return; + + // 5. Compute Rhat = R1*2^(kDigitBits * n) + A3 - D = [R1, A3] - D. + PutAt(R, A3, n); + // 6. As long as Rhat < 0, repeat: + while (SpecialCompare(r1_high, R, D) < 0) { + // 6a. Rhat = Rhat + B + r1_high += AddAndReturnCarry(R, R, B); + // 6b. Qhat = Qhat - 1 + Subtract(Qhat, 1); + } + // 5. Compute Rhat = R1*2^(kDigitBits * n) + A3 - D = [R1, A3] - D. + digit_t borrow = SubtractAndReturnBorrow(R, R, D); + DCHECK(borrow == r1_high); + DCHECK(Compare(R, B) < 0); // NOLINT(readability/check) + (void)borrow; + // 7. Return R = Rhat, Q = Qhat. +} + +// Algorithm 1 from the paper. Variable names same as there. +// Returns Q(uotient) and (R)emainder for A/B, with A twice the size of B. +void BZ::D2n1n(RWDigits Q, RWDigits R, Digits A, Digits B) { + int n = B.len(); + DCHECK(A.len() <= 2 * n); + // A < B * 2^(kDigitsBits * n) + DCHECK(Compare(Digits(A, n, n), B) < 0); // NOLINT(readability/check) + DCHECK(Q.len() == n); + DCHECK(R.len() == n); + // 1. If n is odd or smaller than some convenient constant, compute Q and R + // by school division and return. + if ((n & 1) == 1 || n < kBurnikelThreshold) { + return DivideBasecase(Q, R, A, B); + } + // 2. Split A into four parts A = [A1, ..., A4] with + // Ai < 2^(kDigitBits * n / 2). Split B into two parts [B2, B1] with + // Bi < 2^(kDigitBits * n / 2). + Digits A1A2(A, n, n); + Digits A3(A, n / 2, n / 2); + Digits A4(A, 0, n / 2); + // 3. 
Compute the high part Q1 of floor(A/B) as + // Q1 = floor([A1, A2, A3] / [B1, B2]) with remainder R1 = [R11, R12], + // using algorithm D3n2n. + RWDigits Q1(Q, n / 2, n / 2); + ScratchDigits R1(n); + D3n2n(Q1, R1, A1A2, A3, B); + if (proc_->should_terminate()) return; + // 4. Compute the low part Q2 of floor(A/B) as + // Q2 = floor([R11, R12, A4] / [B1, B2]) with remainder R, using + // algorithm D3n2n. + RWDigits Q2(Q, 0, n / 2); + D3n2n(Q2, R, R1, A4, B); + // 5. Return Q = [Q1, Q2] and R. +} + +} // namespace + +// Algorithm 3 from the paper. Variables names same as there. +// Returns Q(uotient) and R(emainder) for A/B (no size restrictions). +// R is optional, Q is not. +void ProcessorImpl::DivideBurnikelZiegler(RWDigits Q, RWDigits R, Digits A, + Digits B) { + DCHECK(A.len() >= B.len()); + DCHECK(R.len() == 0 || R.len() >= B.len()); + DCHECK(Q.len() > A.len() - B.len()); + int r = A.len(); + int s = B.len(); + // The requirements are: + // - n >= s, n as small as possible. + // - m must be a power of two. + // 1. Set m = min {2^k | 2^k * kBurnikelThreshold > s}. + int m = 1 << BitLength(s / kBurnikelThreshold); + // 2. Set j = roundup(s/m) and n = j * m. + int j = DIV_CEIL(s, m); + int n = j * m; + // 3. Set sigma = max{tao | 2^tao * B < 2^(kDigitBits * n)}. + int sigma = CountLeadingZeros(B[s - 1]); + int digit_shift = n - s; + // 4. Set B = B * 2^sigma to normalize B. Shift A by the same amount. + ScratchDigits B_shifted(n); + LeftShift(B_shifted + digit_shift, B, sigma); + for (int i = 0; i < digit_shift; i++) B_shifted[i] = 0; + B = B_shifted; + // We need an extra digit if A's top digit does not have enough space for + // the left-shift by {sigma}. Additionally, the top bit of A must be 0 + // (see "-1" in step 5 below), which combined with B being normalized (i.e. + // B's top bit is 1) ensures the preconditions of the helper functions. + int extra_digit = CountLeadingZeros(A[r - 1]) < (sigma + 1) ? 1 : 0; + r = A.len() + digit_shift + extra_digit; + ScratchDigits A_shifted(r); + LeftShift(A_shifted + digit_shift, A, sigma); + for (int i = 0; i < digit_shift; i++) A_shifted[i] = 0; + A = A_shifted; + // 5. Set t = min{t >= 2 | A < 2^(kDigitBits * t * n - 1)}. + int t = std::max(DIV_CEIL(r, n), 2); + // 6. Split A conceptually into t blocks. + // 7. Set Z_(t-2) = [A_(t-1), A_(t-2)]. + int z_len = n * 2; + ScratchDigits Z(z_len); + PutAt(Z, A + n * (t - 2), z_len); + // 8. For i from t-2 downto 0 do: + BZ bz(this, n); + ScratchDigits Ri(n); + { + // First iteration unrolled and specialized. + // We might not have n digits at the top of Q, so use temporary storage + // for Qi... + ScratchDigits Qi(n); + bz.D2n1n(Qi, Ri, Z, B); + if (should_terminate()) return; + // ...but there *will* be enough space for any non-zero result digits! + Qi.Normalize(); + RWDigits target = Q + n * (t - 2); + DCHECK(Qi.len() <= target.len()); + PutAt(target, Qi, target.len()); + } + // Now loop over any remaining iterations. + for (int i = t - 3; i >= 0; i--) { + // 8b. If i > 0, set Z_(i-1) = [Ri, A_(i-1)]. + // (De-duped with unrolled first iteration, hence reading A_(i).) + PutAt(Z + n, Ri, n); + PutAt(Z, A + n * i, n); + // 8a. Using algorithm D2n1n compute Qi, Ri such that Zi = B*Qi + Ri. + RWDigits Qi(Q, i * n, n); + bz.D2n1n(Qi, Ri, Z, B); + if (should_terminate()) return; + } + // 9. Return Q = [Q_(t-2), ..., Q_0] and R = R_0 * 2^(-sigma). 
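To make the block-size computation above concrete (illustrative numbers, not from the patch): with kBurnikelThreshold = 57 and a divisor of s = 130 digits, s / 57 = 2, so m = 1 << BitLength(2) = 4 is the smallest power of two with m * 57 > s. Then j = ceil(130 / 4) = 33 and n = j * m = 132, so B is padded with digit_shift = n - s = 2 zero digits on top of the bit-level shift by sigma. A dividend of roughly twice that size, say 260 digits, gives t = max(ceil(r / n), 2) = 2 blocks, i.e. only the unrolled first D2n1n call runs.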
+#if DEBUG + for (int i = 0; i < digit_shift; i++) { + DCHECK(Ri[i] == 0); // NOLINT(readability/check) + } +#endif + if (R.len() != 0) { + Digits Ri_part(Ri, digit_shift, Ri.len()); + Ri_part.Normalize(); + DCHECK(Ri_part.len() <= R.len()); + RightShift(R, Ri_part, sigma); + } +} + +} // namespace bigint +} // namespace v8 diff --git a/deps/v8/src/bigint/div-helpers.cc b/deps/v8/src/bigint/div-helpers.cc new file mode 100644 index 0000000000..0dfca0b02c --- /dev/null +++ b/deps/v8/src/bigint/div-helpers.cc @@ -0,0 +1,68 @@ +// Copyright 2021 the V8 project authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. + +#include "src/bigint/div-helpers.h" + +#include "src/bigint/bigint-internal.h" + +namespace v8 { +namespace bigint { + +namespace { + +void Copy(RWDigits Z, Digits X) { + if (Z == X) return; + int i = 0; + for (; i < X.len(); i++) Z[i] = X[i]; + for (; i < Z.len(); i++) Z[i] = 0; +} + +} // namespace + +// Z := X << shift +// Z and X may alias for an in-place shift. +void LeftShift(RWDigits Z, Digits X, int shift) { + DCHECK(shift >= 0); // NOLINT(readability/check) + DCHECK(shift < kDigitBits); + DCHECK(Z.len() >= X.len()); + if (shift == 0) return Copy(Z, X); + digit_t carry = 0; + int i = 0; + for (; i < X.len(); i++) { + digit_t d = X[i]; + Z[i] = (d << shift) | carry; + carry = d >> (kDigitBits - shift); + } + if (i < Z.len()) { + Z[i++] = carry; + } else { + DCHECK(carry == 0); // NOLINT(readability/check) + } + for (; i < Z.len(); i++) Z[i] = 0; +} + +// Z := X >> shift +// Z and X may alias for an in-place shift. +void RightShift(RWDigits Z, Digits X, int shift) { + DCHECK(shift >= 0); // NOLINT(readability/check) + DCHECK(shift < kDigitBits); + X.Normalize(); + DCHECK(Z.len() >= X.len()); + if (shift == 0) return Copy(Z, X); + int i = 0; + if (X.len() > 0) { + digit_t carry = X[0] >> shift; + int last = X.len() - 1; + for (; i < last; i++) { + digit_t d = X[i + 1]; + Z[i] = (d << (kDigitBits - shift)) | carry; + carry = d >> shift; + } + Z[i++] = carry; + } + for (; i < Z.len(); i++) Z[i] = 0; +} + +} // namespace bigint +} // namespace v8 diff --git a/deps/v8/src/bigint/div-helpers.h b/deps/v8/src/bigint/div-helpers.h new file mode 100644 index 0000000000..db5da3e128 --- /dev/null +++ b/deps/v8/src/bigint/div-helpers.h @@ -0,0 +1,82 @@ +// Copyright 2021 the V8 project authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. + +#ifndef V8_BIGINT_DIV_HELPERS_H_ +#define V8_BIGINT_DIV_HELPERS_H_ + +#include <memory> + +#include "src/bigint/bigint.h" +#include "src/bigint/util.h" + +namespace v8 { +namespace bigint { + +void LeftShift(RWDigits Z, Digits X, int shift); +void RightShift(RWDigits Z, Digits X, int shift); + +inline void PutAt(RWDigits Z, Digits A, int count) { + int len = std::min(A.len(), count); + int i = 0; + for (; i < len; i++) Z[i] = A[i]; + for (; i < count; i++) Z[i] = 0; +} + +// Division algorithms typically need to left-shift their inputs into +// "bit-normalized" form (i.e. top bit is set). The inputs are considered +// read-only, and V8 relies on that by allowing concurrent reads from them, +// so by default, {ShiftedDigits} allocate temporary storage for their +// contents. In-place modification is opt-in for cases where callers can +// guarantee that it is safe. +// When callers allow in-place shifting and wish to undo it, they have to do +// so manually using {Reset()}. 
+// If {shift} is not given, it is auto-detected from {original}'s +// leading zeros. +class ShiftedDigits : public Digits { + public: + explicit ShiftedDigits(Digits& original, int shift = -1, + bool allow_inplace = false) + : Digits(original.digits_, original.len_) { + int leading_zeros = CountLeadingZeros(original.msd()); + if (shift < 0) { + shift = leading_zeros; + } else if (shift > leading_zeros) { + allow_inplace = false; + len_++; + } + shift_ = shift; + if (shift == 0) { + inplace_ = true; + return; + } + inplace_ = allow_inplace; + if (!inplace_) { + digit_t* digits = new digit_t[len_]; + storage_.reset(digits); + digits_ = digits; + } + RWDigits rw_view(digits_, len_); + LeftShift(rw_view, original, shift_); + } + ~ShiftedDigits() = default; + + void Reset() { + if (inplace_) { + RWDigits rw_view(digits_, len_); + RightShift(rw_view, rw_view, shift_); + } + } + + int shift() { return shift_; } + + private: + int shift_; + bool inplace_; + std::unique_ptr<digit_t[]> storage_; +}; + +} // namespace bigint +} // namespace v8 + +#endif // V8_BIGINT_DIV_HELPERS_H_ diff --git a/deps/v8/src/bigint/div-schoolbook.cc b/deps/v8/src/bigint/div-schoolbook.cc new file mode 100644 index 0000000000..a6295c573c --- /dev/null +++ b/deps/v8/src/bigint/div-schoolbook.cc @@ -0,0 +1,190 @@ +// Copyright 2021 the V8 project authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. + +// "Schoolbook" division. This is loosely based on Go's implementation +// found at https://golang.org/src/math/big/nat.go, licensed as follows: +// +// Copyright 2009 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file [1]. +// +// [1] https://golang.org/LICENSE + +#include <limits> + +#include "src/bigint/bigint-internal.h" +#include "src/bigint/digit-arithmetic.h" +#include "src/bigint/div-helpers.h" +#include "src/bigint/util.h" +#include "src/bigint/vector-arithmetic.h" + +namespace v8 { +namespace bigint { + +// Computes Q(uotient) and remainder for A/b, such that +// Q = (A - remainder) / b, with 0 <= remainder < b. +// If Q.len == 0, only the remainder will be returned. +// Q may be the same as A for an in-place division. +void ProcessorImpl::DivideSingle(RWDigits Q, digit_t* remainder, Digits A, + digit_t b) { + DCHECK(b != 0); // NOLINT(readability/check) + DCHECK(A.len() > 0); // NOLINT(readability/check) + *remainder = 0; + int length = A.len(); + if (Q.len() != 0) { + if (A[length - 1] >= b) { + DCHECK(Q.len() >= A.len()); + for (int i = length - 1; i >= 0; i--) { + Q[i] = digit_div(*remainder, A[i], b, remainder); + } + for (int i = length; i < Q.len(); i++) Q[i] = 0; + } else { + DCHECK(Q.len() >= A.len() - 1); + *remainder = A[length - 1]; + for (int i = length - 2; i >= 0; i--) { + Q[i] = digit_div(*remainder, A[i], b, remainder); + } + for (int i = length - 1; i < Q.len(); i++) Q[i] = 0; + } + } else { + for (int i = length - 1; i >= 0; i--) { + digit_div(*remainder, A[i], b, remainder); + } + } +} + +// Z += X. Returns the "carry" (0 or 1) after adding all of X's digits. +inline digit_t InplaceAdd(RWDigits Z, Digits X) { + return AddAndReturnCarry(Z, Z, X); +} + +// Z -= X. Returns the "borrow" (0 or 1) after subtracting all of X's digits. +inline digit_t InplaceSub(RWDigits Z, Digits X) { + return SubtractAndReturnBorrow(Z, Z, X); +} + +// Returns whether (factor1 * factor2) > (high << kDigitBits) + low. 
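A brief usage sketch (illustration only, not from the patch) of the opt-in in-place mode of ShiftedDigits described above:

ShiftedDigits b_shifted(B, -1 /* auto-detect shift */, true /* allow in place */);
// ... use b_shifted (and b_shifted.shift()) as a bit-normalized divisor ...
b_shifted.Reset();  // shifts the digits back so B's storage is unchanged again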
+bool ProductGreaterThan(digit_t factor1, digit_t factor2, digit_t high, + digit_t low) { + digit_t result_high; + digit_t result_low = digit_mul(factor1, factor2, &result_high); + return result_high > high || (result_high == high && result_low > low); +} + +#if DEBUG +bool QLengthOK(Digits Q, Digits A, Digits B) { + // If A's top B.len digits are greater than or equal to B, then the division + // result will be greater than A.len - B.len, otherwise it will be that + // difference. Intuitively: 100/10 has 2 digits, 100/11 has 1. + if (GreaterThanOrEqual(Digits(A, A.len() - B.len(), B.len()), B)) { + return Q.len() >= A.len() - B.len() + 1; + } + return Q.len() >= A.len() - B.len(); +} +#endif + +// Computes Q(uotient) and R(emainder) for A/B, such that +// Q = (A - R) / B, with 0 <= R < B. +// Both Q and R are optional: callers that are only interested in one of them +// can pass the other with len == 0. +// If Q is present, its length must be at least A.len - B.len + 1. +// If R is present, its length must be at least B.len. +// See Knuth, Volume 2, section 4.3.1, Algorithm D. +void ProcessorImpl::DivideSchoolbook(RWDigits Q, RWDigits R, Digits A, + Digits B) { + // NOLINTNEXTLINE(readability/check) + DCHECK(B.len() >= 2); // Use DivideSingle otherwise. + DCHECK(A.len() >= B.len()); // No-op otherwise. + DCHECK(Q.len() == 0 || QLengthOK(Q, A, B)); + DCHECK(R.len() == 0 || R.len() >= B.len()); + // The unusual variable names inside this function are consistent with + // Knuth's book, as well as with Go's implementation of this algorithm. + // Maintaining this consistency is probably more useful than trying to + // come up with more descriptive names for them. + const int n = B.len(); + const int m = A.len() - n; + + // In each iteration, {qhatv} holds {divisor} * {current quotient digit}. + // "v" is the book's name for {divisor}, "qhat" the current quotient digit. + ScratchDigits qhatv(n + 1); + + // D1. + // Left-shift inputs so that the divisor's MSB is set. This is necessary + // to prevent the digit-wise divisions (see digit_div call below) from + // overflowing (they take a two digits wide input, and return a one digit + // result). + ShiftedDigits b_normalized(B); + B = b_normalized; + // U holds the (continuously updated) remaining part of the dividend, which + // eventually becomes the remainder. + ScratchDigits U(A.len() + 1); + LeftShift(U, A, b_normalized.shift()); + + // D2. + // Iterate over the dividend's digits (like the "grad school" algorithm). + // {vn1} is the divisor's most significant digit. + digit_t vn1 = B[n - 1]; + for (int j = m; j >= 0; j--) { + // D3. + // Estimate the current iteration's quotient digit (see Knuth for details). + // {qhat} is the current quotient digit. + digit_t qhat = std::numeric_limits<digit_t>::max(); + // {ujn} is the dividend's most significant remaining digit. + digit_t ujn = U[j + n]; + if (ujn != vn1) { + // {rhat} is the current iteration's remainder. + digit_t rhat = 0; + // Estimate the current quotient digit by dividing the most significant + // digits of dividend and divisor. The result will not be too small, + // but could be a bit too large. + qhat = digit_div(ujn, U[j + n - 1], vn1, &rhat); + + // Decrement the quotient estimate as needed by looking at the next + // digit, i.e. by testing whether + // qhat * v_{n-2} > (rhat << kDigitBits) + u_{j+n-2}. 
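The estimate-and-correct step below is easiest to follow in base 10 (the code does the same thing in base 2^kDigitBits). Illustration: divisor v = 59 (v1 = 5, v0 = 9, already normalized), dividend window u = 2, 8, 9:

  qhat = floor(28 / 5) = 5, rhat = 28 - 5*5 = 3
  test: qhat*v0 = 45 > rhat*10 + 9 = 39, so qhat is too big: qhat = 4, rhat = 3 + 5 = 8
  re-test: 4*9 = 36 > 8*10 + 9 = 89? No, so keep qhat = 4
  and indeed floor(289 / 59) = 4.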
+ digit_t vn2 = B[n - 2]; + digit_t ujn2 = U[j + n - 2]; + while (ProductGreaterThan(qhat, vn2, rhat, ujn2)) { + qhat--; + digit_t prev_rhat = rhat; + rhat += vn1; + // v[n-1] >= 0, so this tests for overflow. + if (rhat < prev_rhat) break; + } + } + + // D4. + // Multiply the divisor with the current quotient digit, and subtract + // it from the dividend. If there was "borrow", then the quotient digit + // was one too high, so we must correct it and undo one subtraction of + // the (shifted) divisor. + if (qhat == 0) { + qhatv.Clear(); + } else { + MultiplySingle(qhatv, B, qhat); + } + digit_t c = InplaceSub(U + j, qhatv); + if (c != 0) { + c = InplaceAdd(U + j, B); + U[j + n] = U[j + n] + c; + qhat--; + } + + if (Q.len() != 0) { + if (j >= Q.len()) { + DCHECK(qhat == 0); // NOLINT(readability/check) + } else { + Q[j] = qhat; + } + } + } + if (R.len() != 0) { + RightShift(R, U, b_normalized.shift()); + } + // If Q has extra storage, clear it. + for (int i = m + 1; i < Q.len(); i++) Q[i] = 0; +} + +} // namespace bigint +} // namespace v8 diff --git a/deps/v8/src/bigint/mul-fft.cc b/deps/v8/src/bigint/mul-fft.cc new file mode 100644 index 0000000000..a3971a7276 --- /dev/null +++ b/deps/v8/src/bigint/mul-fft.cc @@ -0,0 +1,790 @@ +// Copyright 2021 the V8 project authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. + +// FFT-based multiplication, due to Schönhage and Strassen. +// This implementation mostly follows the description given in: +// Christoph Lüders: Fast Multiplication of Large Integers, +// http://arxiv.org/abs/1503.04955 + +#include "src/bigint/bigint-internal.h" +#include "src/bigint/digit-arithmetic.h" +#include "src/bigint/util.h" +#include "src/bigint/vector-arithmetic.h" + +namespace v8 { +namespace bigint { + +namespace { + +//////////////////////////////////////////////////////////////////////////////// +// Part 1: Functions for "mod F_n" arithmetic. +// F_n is of the shape 2^K + 1, and for convenience we use K to count the +// number of digits rather than the number of bits, so F_n (or K) are implicit +// and deduced from the length {len} of the digits array. + +// Helper function for {ModFn} below. +void ModFn_Helper(digit_t* x, int len, signed_digit_t high) { + if (high > 0) { + digit_t borrow = high; + x[len - 1] = 0; + for (int i = 0; i < len; i++) { + x[i] = digit_sub(x[i], borrow, &borrow); + if (borrow == 0) break; + } + } else { + digit_t carry = -high; + x[len - 1] = 0; + for (int i = 0; i < len; i++) { + x[i] = digit_add2(x[i], carry, &carry); + if (carry == 0) break; + } + } +} + +// {x} := {x} mod F_n, assuming that {x} is "slightly" larger than F_n (e.g. +// after addition of two numbers that were mod-F_n-normalized before). +void ModFn(digit_t* x, int len) { + int K = len - 1; + signed_digit_t high = x[K]; + if (high == 0) return; + ModFn_Helper(x, len, high); + high = x[K]; + if (high == 0) return; + DCHECK(high == 1 || high == -1); + ModFn_Helper(x, len, high); + high = x[K]; + if (high == -1) ModFn_Helper(x, len, high); +} + +// {dest} := {src} mod F_n, assuming that {src} is about twice as long as F_n +// (e.g. after multiplication of two numbers that were mod-F_n-normalized +// before). +// {len} is length of {dest}; {src} is twice as long. 
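Why this digit-wise subtraction yields the remainder: 2^K ≡ -1 (mod 2^K + 1), so a double-width value hi*2^K + lo is congruent to lo - hi. Toy check with K written in bits and kept tiny (modulus 2^8 + 1 = 257): 60000 = 234*256 + 96, and 96 - 234 = -138 ≡ 119 (mod 257), which is exactly 60000 mod 257. A possible underflow shows up as the borrow that the final ModFn call then absorbs.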
+void ModFnDoubleWidth(digit_t* dest, const digit_t* src, int len) { + int K = len - 1; + digit_t borrow = 0; + for (int i = 0; i < K; i++) { + dest[i] = digit_sub2(src[i], src[i + K], borrow, &borrow); + } + dest[K] = digit_sub2(0, src[2 * K], borrow, &borrow); + // {borrow} may be non-zero here, that's OK as {ModFn} will take care of it. + ModFn(dest, len); +} + +// Sets {sum} := {a} + {b} and {diff} := {a} - {b}, which is more efficient +// than computing sum and difference separately. Applies "mod F_n" normalization +// to both results. +void SumDiff(digit_t* sum, digit_t* diff, const digit_t* a, const digit_t* b, + int len) { + digit_t carry = 0; + digit_t borrow = 0; + for (int i = 0; i < len; i++) { + // Read both values first, because inputs and outputs can overlap. + digit_t ai = a[i]; + digit_t bi = b[i]; + sum[i] = digit_add3(ai, bi, carry, &carry); + diff[i] = digit_sub2(ai, bi, borrow, &borrow); + } + ModFn(sum, len); + ModFn(diff, len); +} + +// {result} := ({input} << shift) mod F_n, where shift >= K. +void ShiftModFn_Large(digit_t* result, const digit_t* input, int digit_shift, + int bits_shift, int K) { + // If {digit_shift} is greater than K, we use the following transformation + // (where, since everything is mod 2^K + 1, we are allowed to add or + // subtract any multiple of 2^K + 1 at any time): + // x * 2^{K+m} mod 2^K + 1 + // == x * 2^K * 2^m - (2^K + 1)*(x * 2^m) mod 2^K + 1 + // == x * 2^K * 2^m - x * 2^K * 2^m - x * 2^m mod 2^K + 1 + // == -x * 2^m mod 2^K + 1 + // So the flow is the same as for m < K, but we invert the subtraction's + // operands. In order to avoid underflow, we virtually initialize the + // result to 2^K + 1: + // input = [ iK ][iK-1] .... .... [ i1 ][ i0 ] + // result = [ 1][0000] .... .... [0000][0001] + // + [ iK ] .... [ iX ] + // - [iX-1] .... [ i0 ] + DCHECK(digit_shift >= K); + digit_shift -= K; + digit_t borrow = 0; + if (bits_shift == 0) { + digit_t carry = 1; + for (int i = 0; i < digit_shift; i++) { + result[i] = digit_add2(input[i + K - digit_shift], carry, &carry); + } + result[digit_shift] = digit_sub(input[K] + carry, input[0], &borrow); + for (int i = digit_shift + 1; i < K; i++) { + digit_t d = input[i - digit_shift]; + result[i] = digit_sub2(0, d, borrow, &borrow); + } + } else { + digit_t add_carry = 1; + digit_t input_carry = + input[K - digit_shift - 1] >> (kDigitBits - bits_shift); + for (int i = 0; i < digit_shift; i++) { + digit_t d = input[i + K - digit_shift]; + digit_t summand = (d << bits_shift) | input_carry; + result[i] = digit_add2(summand, add_carry, &add_carry); + input_carry = d >> (kDigitBits - bits_shift); + } + { + // result[digit_shift] = (add_carry + iK_part) - i0_part + digit_t d = input[K]; + digit_t iK_part = (d << bits_shift) | input_carry; + digit_t iK_carry = d >> (kDigitBits - bits_shift); + digit_t sum = digit_add2(add_carry, iK_part, &add_carry); + // {iK_carry} is less than a full digit, so we can merge {add_carry} + // into it without overflow. 
+ iK_carry += add_carry; + d = input[0]; + digit_t i0_part = d << bits_shift; + result[digit_shift] = digit_sub(sum, i0_part, &borrow); + input_carry = d >> (kDigitBits - bits_shift); + if (digit_shift + 1 < K) { + digit_t d = input[1]; + digit_t subtrahend = (d << bits_shift) | input_carry; + result[digit_shift + 1] = + digit_sub2(iK_carry, subtrahend, borrow, &borrow); + input_carry = d >> (kDigitBits - bits_shift); + } + } + for (int i = digit_shift + 2; i < K; i++) { + digit_t d = input[i - digit_shift]; + digit_t subtrahend = (d << bits_shift) | input_carry; + result[i] = digit_sub2(0, subtrahend, borrow, &borrow); + input_carry = d >> (kDigitBits - bits_shift); + } + } + // The virtual 1 in result[K] should be eliminated by {borrow}. If there + // is no borrow, then the virtual initialization was too much. Subtract + // 2^K + 1. + result[K] = 0; + if (borrow != 1) { + borrow = 1; + for (int i = 0; i < K; i++) { + result[i] = digit_sub(result[i], borrow, &borrow); + if (borrow == 0) break; + } + if (borrow != 0) { + // The result must be 2^K. + for (int i = 0; i < K; i++) result[i] = 0; + result[K] = 1; + } + } +} + +// Sets {result} := {input} * 2^{power_of_two} mod 2^{K} + 1. +// This function is highly relevant for overall performance. +void ShiftModFn(digit_t* result, const digit_t* input, int power_of_two, int K, + int zero_above = 0x7FFFFFFF) { + // The modulo-reduction amounts to a subtraction, which we combine + // with the shift as follows: + // input = [ iK ][iK-1] .... .... [ i1 ][ i0 ] + // result = [iX-1] .... [ i0 ] <<<<<<<<<<< shift by {power_of_two} + // - [ iK ] .... [ iX ] + // where "X" is the index "K - digit_shift". + int digit_shift = power_of_two / kDigitBits; + int bits_shift = power_of_two % kDigitBits; + // By an analogous construction to the "digit_shift >= K" case, + // it turns out that: + // x * 2^{2K+m} == x * 2^m mod 2^K + 1. + while (digit_shift >= 2 * K) digit_shift -= 2 * K; // Faster than '%'! + if (digit_shift >= K) { + return ShiftModFn_Large(result, input, digit_shift, bits_shift, K); + } + digit_t borrow = 0; + if (bits_shift == 0) { + // We do a single pass over {input}, starting by copying digits [i1] to + // [iX-1] to result indices digit_shift+1 to K-1. + int i = 1; + // Read input digits unless we know they are zero. + int cap = std::min(K - digit_shift, zero_above); + for (; i < cap; i++) { + result[i + digit_shift] = input[i]; + } + // Any remaining work can hard-code the knowledge that input[i] == 0. + for (; i < K - digit_shift; i++) { + DCHECK(input[i] == 0); // NOLINT(readability/check) + result[i + digit_shift] = 0; + } + // Second phase: subtract input digits [iX] to [iK] from (virtually) zero- + // initialized result indices 0 to digit_shift-1. + cap = std::min(K, zero_above); + for (; i < cap; i++) { + digit_t d = input[i]; + result[i - K + digit_shift] = digit_sub2(0, d, borrow, &borrow); + } + // Any remaining work can hard-code the knowledge that input[i] == 0. + for (; i < K; i++) { + DCHECK(input[i] == 0); // NOLINT(readability/check) + result[i - K + digit_shift] = digit_sub(0, borrow, &borrow); + } + // Last step: subtract [iK] from [i0] and store at result index digit_shift. + result[digit_shift] = digit_sub2(input[0], input[K], borrow, &borrow); + } else { + // Same flow as before, but taking bits_shift != 0 into account. + // First phase: result indices digit_shift+1 to K. + digit_t carry = 0; + int i = 0; + // Read input digits unless we know they are zero. 
+ int cap = std::min(K - digit_shift, zero_above); + for (; i < cap; i++) { + digit_t d = input[i]; + result[i + digit_shift] = (d << bits_shift) | carry; + carry = d >> (kDigitBits - bits_shift); + } + // Any remaining work can hard-code the knowledge that input[i] == 0. + for (; i < K - digit_shift; i++) { + DCHECK(input[i] == 0); // NOLINT(readability/check) + result[i + digit_shift] = carry; + carry = 0; + } + // Second phase: result indices 0 to digit_shift - 1. + cap = std::min(K, zero_above); + for (; i < cap; i++) { + digit_t d = input[i]; + result[i - K + digit_shift] = + digit_sub2(0, (d << bits_shift) | carry, borrow, &borrow); + carry = d >> (kDigitBits - bits_shift); + } + // Any remaining work can hard-code the knowledge that input[i] == 0. + if (i < K) { + DCHECK(input[i] == 0); // NOLINT(readability/check) + result[i - K + digit_shift] = digit_sub2(0, carry, borrow, &borrow); + carry = 0; + i++; + } + for (; i < K; i++) { + DCHECK(input[i] == 0); // NOLINT(readability/check) + result[i - K + digit_shift] = digit_sub(0, borrow, &borrow); + } + // Last step: compute result[digit_shift]. + digit_t d = input[K]; + result[digit_shift] = digit_sub2( + result[digit_shift], (d << bits_shift) | carry, borrow, &borrow); + // No carry left. + DCHECK((d >> (kDigitBits - bits_shift)) == 0); // NOLINT(readability/check) + } + result[K] = 0; + for (int i = digit_shift + 1; i <= K && borrow > 0; i++) { + result[i] = digit_sub(result[i], borrow, &borrow); + } + if (borrow > 0) { + // Underflow means we subtracted too much. Add 2^K + 1. + digit_t carry = 1; + for (int i = 0; i <= K; i++) { + result[i] = digit_add2(result[i], carry, &carry); + if (carry == 0) break; + } + result[K] = digit_add2(result[K], 1, &carry); + } +} + +//////////////////////////////////////////////////////////////////////////////// +// Part 2: FFT-based multiplication is very sensitive to appropriate choice +// of parameters. The following functions choose the parameters that the +// subsequent actual computation will use. This is partially based on formal +// constraints and partially on experimentally-determined heuristics. + +struct Parameters { + // We never use the default values, but skipping zero-initialization + // of these fields saddens and confuses MSan. + int m{0}; + int K{0}; + int n{0}; + int s{0}; + int r{0}; +}; + +// Computes parameters for the main calculation, given a bit length {N} and +// an {m}. See the paper for details. +void ComputeParameters(int N, int m, Parameters* params) { + N *= kDigitBits; + int n = 1 << m; // 2^m + int nhalf = n >> 1; + int s = (N + n - 1) >> m; // ceil(N/n) + s = RoundUp(s, kDigitBits); + int K = m + 2 * s + 1; // K must be at least this big... + K = RoundUp(K, nhalf); // ...and a multiple of n/2. + int r = K >> (m - 1); // Which multiple? + + // We want recursive calls to make progress, so force K to be a multiple + // of 8 if it's above the recursion threshold. Otherwise, K must be a + // multiple of kDigitBits. + const int threshold = (K + 1 >= kFftInnerThreshold * kDigitBits) + ? 3 + kLog2DigitBits + : kLog2DigitBits; + int K_tz = CountTrailingZeros(K); + while (K_tz < threshold) { + K += (1 << K_tz); + r = K >> (m - 1); + K_tz = CountTrailingZeros(K); + } + + DCHECK(K % kDigitBits == 0); // NOLINT(readability/check) + DCHECK(s % kDigitBits == 0); // NOLINT(readability/check) + params->K = K / kDigitBits; + params->s = s / kDigitBits; + params->n = n; + params->r = r; +} + +// Computes parameters for recursive invocations ("inner layer"). 
+void ComputeParameters_Inner(int N, Parameters* params) { + int max_m = CountTrailingZeros(N); + int N_bits = BitLength(N); + int m = N_bits - 4; // Don't let s get too small. + m = std::min(max_m, m); + N *= kDigitBits; + int n = 1 << m; // 2^m + // We can't round up s in the inner layer, because N = n*s is fixed. + int s = N >> m; + DCHECK(N == s * n); + int K = m + 2 * s + 1; // K must be at least this big... + K = RoundUp(K, n); // ...and a multiple of n and kDigitBits. + K = RoundUp(K, kDigitBits); + params->r = K >> m; // Which multiple? + DCHECK(K % kDigitBits == 0); // NOLINT(readability/check) + DCHECK(s % kDigitBits == 0); // NOLINT(readability/check) + params->K = K / kDigitBits; + params->s = s / kDigitBits; + params->n = n; + params->m = m; +} + +int PredictInnerK(int N) { + Parameters params; + ComputeParameters_Inner(N, ¶ms); + return params.K; +} + +// Applies heuristics to decide whether {m} should be decremented, by looking +// at what would happen to {K} and {s} if {m} was decremented. +bool ShouldDecrementM(const Parameters& current, const Parameters& next, + const Parameters& after_next) { + // K == 64 seems to work particularly well. + if (current.K == 64 && next.K >= 112) return false; + // Small values for s are never efficient. + if (current.s < 6) return true; + // The time is roughly determined by K * n. When we decrement m, then + // n always halves, and K usually gets bigger, by up to 2x. + // For not-quite-so-small s, look at how much bigger K would get: if + // the K increase is small enough, making n smaller is worth it. + // Empirically, it's most meaningful to look at the K *after* next. + // The specific threshold values have been chosen by running many + // benchmarks on inputs of many sizes, and manually selecting thresholds + // that seemed to produce good results. + double factor = static_cast<double>(after_next.K) / current.K; + if ((current.s == 6 && factor < 3.85) || // -- + (current.s == 7 && factor < 3.73) || // -- + (current.s == 8 && factor < 3.55) || // -- + (current.s == 9 && factor < 3.50) || // -- + factor < 3.4) { + return true; + } + // If K is just below the recursion threshold, make sure we do recurse, + // unless doing so would be particularly inefficient (large inner_K). + // If K is just above the recursion threshold, doubling it often makes + // the inner call more efficient. + if (current.K >= 160 && current.K < 250 && PredictInnerK(next.K) < 28) { + return true; + } + // If we found no reason to decrement, keep m as large as possible. + return false; +} + +// Decides what parameters to use for a given input bit length {N}. +// Returns the chosen m. +int GetParameters(int N, Parameters* params) { + int N_bits = BitLength(N); + int max_m = N_bits - 3; // Larger m make s too small. + max_m = std::max(kLog2DigitBits, max_m); // Smaller m break the logic below. + int m = max_m; + Parameters current; + ComputeParameters(N, m, ¤t); + Parameters next; + ComputeParameters(N, m - 1, &next); + while (m > 2) { + Parameters after_next; + ComputeParameters(N, m - 2, &after_next); + if (ShouldDecrementM(current, next, after_next)) { + m--; + current = next; + next = after_next; + } else { + break; + } + } + *params = current; + return m; +} + +//////////////////////////////////////////////////////////////////////////////// +// Part 3: Fast Fourier Transformation. + +class FFTContainer { + public: + // {n} is the number of chunks, whose length is {K}+1. + // {K} determines F_n = 2^(K * kDigitBits) + 1. 
+ FFTContainer(int n, int K, ProcessorImpl* processor) + : n_(n), K_(K), length_(K + 1), processor_(processor) { + storage_ = new digit_t[length_ * n_]; + part_ = new digit_t*[n_]; + digit_t* ptr = storage_; + for (int i = 0; i < n; i++, ptr += length_) { + part_[i] = ptr; + } + temp_ = new digit_t[length_ * 2]; + } + FFTContainer() = delete; + FFTContainer(const FFTContainer&) = delete; + FFTContainer& operator=(const FFTContainer&) = delete; + + ~FFTContainer() { + delete[] storage_; + delete[] part_; + delete[] temp_; + } + + void Start_Default(Digits X, int chunk_size, int theta, int omega); + void Start(Digits X, int chunk_size, int theta, int omega); + + void NormalizeAndRecombine(int omega, int m, RWDigits Z, int chunk_size); + void CounterWeightAndRecombine(int theta, int m, RWDigits Z, int chunk_size); + + void FFT_ReturnShuffledThreadsafe(int start, int len, int omega, + digit_t* temp); + void FFT_Recurse(int start, int half, int omega, digit_t* temp); + + void BackwardFFT(int start, int len, int omega); + void BackwardFFT_Threadsafe(int start, int len, int omega, digit_t* temp); + + void PointwiseMultiply(const FFTContainer& other); + void DoPointwiseMultiplication(const FFTContainer& other, int start, int end, + digit_t* temp); + + int length() const { return length_; } + + private: + const int n_; // Number of parts. + const int K_; // Always length_ - 1. + const int length_; // Length of each part, in digits. + ProcessorImpl* processor_; + digit_t* storage_; // Combined storage of all parts. + digit_t** part_; // Pointers to each part. + digit_t* temp_; // Temporary storage with size 2 * length_. +}; + +inline void CopyAndZeroExtend(digit_t* dst, const digit_t* src, + int digits_to_copy, size_t total_bytes) { + size_t bytes_to_copy = digits_to_copy * sizeof(digit_t); + memcpy(dst, src, bytes_to_copy); + memset(dst + digits_to_copy, 0, total_bytes - bytes_to_copy); +} + +// Reads {X} into the FFTContainer's internal storage, dividing it into chunks +// while doing so; then performs the forward FFT. +void FFTContainer::Start_Default(Digits X, int chunk_size, int theta, + int omega) { + int len = X.len(); + const digit_t* pointer = X.digits(); + const size_t part_length_in_bytes = length_ * sizeof(digit_t); + int current_theta = 0; + int i = 0; + for (; i < n_ && len > 0; i++, current_theta += theta) { + chunk_size = std::min(chunk_size, len); + // For invocations via MultiplyFFT_Inner, X.len() == n_ * chunk_size + 1, + // because the outer layer's "K" is passed as the inner layer's "N". + // Since X is (mod Fn)-normalized on the outer layer, there is the rare + // corner case where X[n_ * chunk_size] == 1. Detect that case, and handle + // the extra bit as part of the last chunk; we always have the space. + if (i == n_ - 1 && len == chunk_size + 1) { + DCHECK(X[n_ * chunk_size] <= 1); // NOLINT(readability/check) + DCHECK(length_ >= chunk_size + 1); + chunk_size++; + } + if (current_theta != 0) { + // Multiply with theta^i, and reduce modulo 2^K + 1. + // We pass theta as a shift amount; it really means 2^theta. 
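The reason multiplying by theta^i reduces to a call to ShiftModFn: in the ring mod F_n = 2^(K*kDigitBits) + 1, the number 2 is a 2*K*kDigitBits-th root of unity, because 2^(K*kDigitBits) ≡ -1 and squaring that gives 1. Toy check with an 8-bit modulus (257): 2^8 = 256 ≡ -1 and 2^16 = 65536 = 255*257 + 1 ≡ 1. All twiddle factors are therefore powers of two, and multiplying by them is a modular bit shift, which is exactly what ShiftModFn performs.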
+ CopyAndZeroExtend(temp_, pointer, chunk_size, part_length_in_bytes); + ShiftModFn(part_[i], temp_, current_theta, K_, chunk_size); + } else { + CopyAndZeroExtend(part_[i], pointer, chunk_size, part_length_in_bytes); + } + pointer += chunk_size; + len -= chunk_size; + } + DCHECK(len == 0); // NOLINT(readability/check) + for (; i < n_; i++) { + memset(part_[i], 0, part_length_in_bytes); + } + FFT_ReturnShuffledThreadsafe(0, n_, omega, temp_); +} + +// This version of Start is optimized for the case where ~half of the +// container will be filled with padding zeros. +void FFTContainer::Start(Digits X, int chunk_size, int theta, int omega) { + int len = X.len(); + if (len > n_ * chunk_size / 2) { + return Start_Default(X, chunk_size, theta, omega); + } + DCHECK(theta == 0); // NOLINT(readability/check) + const digit_t* pointer = X.digits(); + const size_t part_length_in_bytes = length_ * sizeof(digit_t); + int nhalf = n_ / 2; + // Unrolled first iteration. + CopyAndZeroExtend(part_[0], pointer, chunk_size, part_length_in_bytes); + CopyAndZeroExtend(part_[nhalf], pointer, chunk_size, part_length_in_bytes); + pointer += chunk_size; + len -= chunk_size; + int i = 1; + for (; i < nhalf && len > 0; i++) { + chunk_size = std::min(chunk_size, len); + CopyAndZeroExtend(part_[i], pointer, chunk_size, part_length_in_bytes); + int w = omega * i; + ShiftModFn(part_[i + nhalf], part_[i], w, K_, chunk_size); + pointer += chunk_size; + len -= chunk_size; + } + for (; i < nhalf; i++) { + memset(part_[i], 0, part_length_in_bytes); + memset(part_[i + nhalf], 0, part_length_in_bytes); + } + FFT_Recurse(0, nhalf, omega, temp_); +} + +// Forward transformation. +// We use the "DIF" aka "decimation in frequency" transform, because it +// leaves the result in "bit reversed" order, which is precisely what we +// need as input for the "DIT" aka "decimation in time" backwards transform. +void FFTContainer::FFT_ReturnShuffledThreadsafe(int start, int len, int omega, + digit_t* temp) { + DCHECK((len & 1) == 0); // {len} must be even. NOLINT(readability/check) + int half = len / 2; + SumDiff(part_[start], part_[start + half], part_[start], part_[start + half], + length_); + for (int k = 1; k < half; k++) { + SumDiff(part_[start + k], temp, part_[start + k], part_[start + half + k], + length_); + int w = omega * k; + ShiftModFn(part_[start + half + k], temp, w, K_); + } + FFT_Recurse(start, half, omega, temp); +} + +// Recursive step of the above, factored out for additional callers. +void FFTContainer::FFT_Recurse(int start, int half, int omega, digit_t* temp) { + if (half > 1) { + FFT_ReturnShuffledThreadsafe(start, half, 2 * omega, temp); + FFT_ReturnShuffledThreadsafe(start + half, half, 2 * omega, temp); + } +} + +// Backward transformation. +// We use the "DIT" aka "decimation in time" transform here, because it +// turns bit-reversed input into normally sorted output. +void FFTContainer::BackwardFFT(int start, int len, int omega) { + BackwardFFT_Threadsafe(start, len, omega, temp_); +} + +void FFTContainer::BackwardFFT_Threadsafe(int start, int len, int omega, + digit_t* temp) { + DCHECK((len & 1) == 0); // {len} must be even. NOLINT(readability/check) + int half = len / 2; + // Don't recurse for half == 2, as PointwiseMultiply already performed + // the first level of the backwards FFT. 
+ if (half > 2) { + BackwardFFT_Threadsafe(start, half, 2 * omega, temp); + BackwardFFT_Threadsafe(start + half, half, 2 * omega, temp); + } + SumDiff(part_[start], part_[start + half], part_[start], part_[start + half], + length_); + for (int k = 1; k < half; k++) { + int w = omega * (len - k); + ShiftModFn(temp, part_[start + half + k], w, K_); + SumDiff(part_[start + k], part_[start + half + k], part_[start + k], temp, + length_); + } +} + +// Recombines the result's parts into {Z}, after backwards FFT. +void FFTContainer::NormalizeAndRecombine(int omega, int m, RWDigits Z, + int chunk_size) { + Z.Clear(); + int z_index = 0; + const int shift = n_ * omega - m; + for (int i = 0; i < n_; i++, z_index += chunk_size) { + digit_t* part = part_[i]; + ShiftModFn(temp_, part, shift, K_); + digit_t carry = 0; + int zi = z_index; + int j = 0; + for (; j < length_ && zi < Z.len(); j++, zi++) { + Z[zi] = digit_add3(Z[zi], temp_[j], carry, &carry); + } + for (; j < length_; j++) { + DCHECK(temp_[j] == 0); // NOLINT(readability/check) + } + if (carry != 0) { + DCHECK(zi < Z.len()); + Z[zi] = carry; + } + } +} + +// Helper function for {CounterWeightAndRecombine} below. +bool ShouldBeNegative(const digit_t* x, int xlen, digit_t threshold, int s) { + if (x[2 * s] >= threshold) return true; + for (int i = 2 * s + 1; i < xlen; i++) { + if (x[i] > 0) return true; + } + return false; +} + +// Same as {NormalizeAndRecombine} above, but for the needs of the recursive +// invocation ("inner layer") of FFT multiplication, where an additional +// counter-weighting step is required. +void FFTContainer::CounterWeightAndRecombine(int theta, int m, RWDigits Z, + int s) { + Z.Clear(); + int z_index = 0; + for (int k = 0; k < n_; k++, z_index += s) { + int shift = -theta * k - m; + if (shift < 0) shift += 2 * n_ * theta; + DCHECK(shift >= 0); // NOLINT(readability/check) + digit_t* input = part_[k]; + ShiftModFn(temp_, input, shift, K_); + int remaining_z = Z.len() - z_index; + if (ShouldBeNegative(temp_, length_, k + 1, s)) { + // Subtract F_n from input before adding to result. We use the following + // transformation (knowing that X < F_n): + // Z + (X - F_n) == Z - (F_n - X) + digit_t borrow_z = 0; + digit_t borrow_Fn = 0; + { + // i == 0: + digit_t d = digit_sub(1, temp_[0], &borrow_Fn); + Z[z_index] = digit_sub(Z[z_index], d, &borrow_z); + } + int i = 1; + for (; i < K_ && i < remaining_z; i++) { + digit_t d = digit_sub2(0, temp_[i], borrow_Fn, &borrow_Fn); + Z[z_index + i] = digit_sub2(Z[z_index + i], d, borrow_z, &borrow_z); + } + DCHECK(i == K_ && K_ == length_ - 1); + for (; i < length_ && i < remaining_z; i++) { + digit_t d = digit_sub2(1, temp_[i], borrow_Fn, &borrow_Fn); + Z[z_index + i] = digit_sub2(Z[z_index + i], d, borrow_z, &borrow_z); + } + DCHECK(borrow_Fn == 0); // NOLINT(readability/check) + for (; borrow_z > 0 && i < remaining_z; i++) { + Z[z_index + i] = digit_sub(Z[z_index + i], borrow_z, &borrow_z); + } + } else { + digit_t carry = 0; + int i = 0; + for (; i < length_ && i < remaining_z; i++) { + Z[z_index + i] = digit_add3(Z[z_index + i], temp_[i], carry, &carry); + } + for (; i < length_; i++) { + DCHECK(temp_[i] == 0); // NOLINT(readability/check) + } + for (; carry > 0 && i < remaining_z; i++) { + Z[z_index + i] = digit_add2(Z[z_index + i], carry, &carry); + } + // {carry} might be != 0 here if Z was negative before. That's fine. + } + } +} + +// Main FFT function for recursive invocations ("inner layer"). 
+void MultiplyFFT_Inner(RWDigits Z, Digits X, Digits Y, const Parameters& params,
+                       ProcessorImpl* processor) {
+  int omega = 2 * params.r;  // really: 2^(2r)
+  int theta = params.r;      // really: 2^r
+
+  FFTContainer a(params.n, params.K, processor);
+  a.Start_Default(X, params.s, theta, omega);
+  FFTContainer b(params.n, params.K, processor);
+  b.Start_Default(Y, params.s, theta, omega);
+
+  a.PointwiseMultiply(b);
+  if (processor->should_terminate()) return;
+
+  FFTContainer& c = a;
+  c.BackwardFFT(0, params.n, omega);
+
+  c.CounterWeightAndRecombine(theta, params.m, Z, params.s);
+}
+
+// Actual implementation of pointwise multiplications.
+void FFTContainer::DoPointwiseMultiplication(const FFTContainer& other,
+                                             int start, int end,
+                                             digit_t* temp) {
+  // The (K_ & 3) != 0 condition makes sure that the inner FFT gets
+  // to split the work into at least 4 chunks.
+  bool use_fft = length_ >= kFftInnerThreshold && (K_ & 3) == 0;
+  Parameters params;
+  if (use_fft) ComputeParameters_Inner(K_, &params);
+  RWDigits result(temp, 2 * length_);
+  for (int i = start; i < end; i++) {
+    Digits A(part_[i], length_);
+    Digits B(other.part_[i], length_);
+    if (use_fft) {
+      MultiplyFFT_Inner(result, A, B, params, processor_);
+    } else {
+      processor_->Multiply(result, A, B);
+    }
+    if (processor_->should_terminate()) return;
+    ModFnDoubleWidth(part_[i], result.digits(), length_);
+    // To improve cache friendliness, we perform the first level of the
+    // backwards FFT here.
+    if ((i & 1) == 1) {
+      SumDiff(part_[i - 1], part_[i], part_[i - 1], part_[i], length_);
+    }
+  }
+}
+
+// Convenient entry point for pointwise multiplications.
+void FFTContainer::PointwiseMultiply(const FFTContainer& other) {
+  DCHECK(n_ == other.n_);
+  DoPointwiseMultiplication(other, 0, n_, temp_);
+}
+
+}  // namespace
+
+////////////////////////////////////////////////////////////////////////////////
+// Part 4: Tying everything together into a multiplication algorithm.
+
+// TODO(jkummerow): Consider doing a "Mersenne transform" and CRT reconstruction
+// of the final result. Might yield a few percent of perf improvement.
+
+// TODO(jkummerow): Consider implementing the "sqrt(2) trick".
+// Gaudry/Kruppa/Zimmerman report that it saved them around 10%.
+
+void ProcessorImpl::MultiplyFFT(RWDigits Z, Digits X, Digits Y) {
+  Parameters params;
+  int m = GetParameters(X.len() + Y.len(), &params);
+  int omega = params.r;  // really: 2^r
+
+  FFTContainer a(params.n, params.K, this);
+  a.Start(X, params.s, 0, omega);
+  if (X == Y) {
+    // Squaring.
+    a.PointwiseMultiply(a);
+  } else {
+    FFTContainer b(params.n, params.K, this);
+    b.Start(Y, params.s, 0, omega);
+    a.PointwiseMultiply(b);
+  }
+  if (should_terminate()) return;
+
+  a.BackwardFFT(0, params.n, omega);
+  a.NormalizeAndRecombine(omega, m, Z, params.s);
+}
+
+}  // namespace bigint
+}  // namespace v8
diff --git a/deps/v8/src/bigint/mul-karatsuba.cc b/deps/v8/src/bigint/mul-karatsuba.cc
new file mode 100644
index 0000000000..2a141f213c
--- /dev/null
+++ b/deps/v8/src/bigint/mul-karatsuba.cc
@@ -0,0 +1,205 @@
+// Copyright 2021 the V8 project authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+// Karatsuba multiplication. This is loosely based on Go's implementation
+// found at https://golang.org/src/math/big/nat.go, licensed as follows:
+//
+// Copyright 2009 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file [1].
+// +// [1] https://golang.org/LICENSE + +#include <algorithm> +#include <utility> + +#include "src/bigint/bigint-internal.h" +#include "src/bigint/digit-arithmetic.h" +#include "src/bigint/util.h" +#include "src/bigint/vector-arithmetic.h" + +namespace v8 { +namespace bigint { + +// If Karatsuba is the best supported algorithm, then it must check for +// termination requests. If there are more advanced algorithms available +// for larger inputs, then Karatsuba will only be used for sufficiently +// small chunks that checking for termination requests is not necessary. +#if V8_ADVANCED_BIGINT_ALGORITHMS +#define MAYBE_TERMINATE +#else +#define MAYBE_TERMINATE \ + if (should_terminate()) return; +#endif + +namespace { + +// The Karatsuba algorithm sometimes finishes more quickly when the +// input length is rounded up a bit. This method encodes some heuristics +// to accomplish this. The details have been determined experimentally. +int RoundUpLen(int len) { + if (len <= 36) return RoundUp(len, 2); + // Keep the 4 or 5 most significant non-zero bits. + int shift = BitLength(len) - 5; + if ((len >> shift) >= 0x18) { + shift++; + } + // Round up, unless we're only just above the threshold. This smoothes + // the steps by which time goes up as input size increases. + int additive = ((1 << shift) - 1); + if (shift >= 2 && (len & additive) < (1 << (shift - 2))) { + return len; + } + return ((len + additive) >> shift) << shift; +} + +// This method makes the final decision how much to bump up the input size. +int KaratsubaLength(int n) { + n = RoundUpLen(n); + int i = 0; + while (n > kKaratsubaThreshold) { + n >>= 1; + i++; + } + return n << i; +} + +// Performs the specific subtraction required by {KaratsubaMain} below. +void KaratsubaSubtractionHelper(RWDigits result, Digits X, Digits Y, + int* sign) { + X.Normalize(); + Y.Normalize(); + digit_t borrow = 0; + int i = 0; + if (!GreaterThanOrEqual(X, Y)) { + *sign = -(*sign); + std::swap(X, Y); + } + for (; i < Y.len(); i++) { + result[i] = digit_sub2(X[i], Y[i], borrow, &borrow); + } + for (; i < X.len(); i++) { + result[i] = digit_sub(X[i], borrow, &borrow); + } + DCHECK(borrow == 0); // NOLINT(readability/check) + for (; i < result.len(); i++) result[i] = 0; +} + +} // namespace + +void ProcessorImpl::MultiplyKaratsuba(RWDigits Z, Digits X, Digits Y) { + DCHECK(X.len() >= Y.len()); + DCHECK(Y.len() >= kKaratsubaThreshold); + DCHECK(Z.len() >= X.len() + Y.len()); + int k = KaratsubaLength(Y.len()); + int scratch_len = 4 * k; + ScratchDigits scratch(scratch_len); + KaratsubaStart(Z, X, Y, scratch, k); +} + +// Entry point for Karatsuba-based multiplication, takes care of inputs +// with unequal lengths by chopping the larger into chunks. +void ProcessorImpl::KaratsubaStart(RWDigits Z, Digits X, Digits Y, + RWDigits scratch, int k) { + KaratsubaMain(Z, X, Y, scratch, k); + MAYBE_TERMINATE + for (int i = 2 * k; i < Z.len(); i++) Z[i] = 0; + if (k < Y.len() || X.len() != Y.len()) { + ScratchDigits T(2 * k); + // Add X0 * Y1 * b. + Digits X0(X, 0, k); + Digits Y1 = Y + std::min(k, Y.len()); + if (Y1.len() > 0) { + KaratsubaChunk(T, X0, Y1, scratch); + MAYBE_TERMINATE + AddAndReturnOverflow(Z + k, T); // Can't overflow. + } + + // Add Xi * Y0 << i and Xi * Y1 * b << (i + k). + Digits Y0(Y, 0, k); + for (int i = k; i < X.len(); i += k) { + Digits Xi(X, i, k); + KaratsubaChunk(T, Xi, Y0, scratch); + MAYBE_TERMINATE + AddAndReturnOverflow(Z + i, T); // Can't overflow. 
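+      // Y1 is empty when Y fits into a single chunk of size k; in that case
+      // only the Xi * Y0 contribution above is needed.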
+ if (Y1.len() > 0) { + KaratsubaChunk(T, Xi, Y1, scratch); + MAYBE_TERMINATE + AddAndReturnOverflow(Z + (i + k), T); // Can't overflow. + } + } + } +} + +// Entry point for chunk-wise multiplications, selects an appropriate +// algorithm for the inputs based on their sizes. +void ProcessorImpl::KaratsubaChunk(RWDigits Z, Digits X, Digits Y, + RWDigits scratch) { + X.Normalize(); + Y.Normalize(); + if (X.len() == 0 || Y.len() == 0) return Z.Clear(); + if (X.len() < Y.len()) std::swap(X, Y); + if (Y.len() == 1) return MultiplySingle(Z, X, Y[0]); + if (Y.len() < kKaratsubaThreshold) return MultiplySchoolbook(Z, X, Y); + int k = KaratsubaLength(Y.len()); + DCHECK(scratch.len() >= 4 * k); + return KaratsubaStart(Z, X, Y, scratch, k); +} + +// The main recursive Karatsuba method. +void ProcessorImpl::KaratsubaMain(RWDigits Z, Digits X, Digits Y, + RWDigits scratch, int n) { + if (n < kKaratsubaThreshold) { + X.Normalize(); + Y.Normalize(); + if (X.len() >= Y.len()) { + return MultiplySchoolbook(RWDigits(Z, 0, 2 * n), X, Y); + } else { + return MultiplySchoolbook(RWDigits(Z, 0, 2 * n), Y, X); + } + } + DCHECK(scratch.len() >= 4 * n); + DCHECK((n & 1) == 0); // NOLINT(readability/check) + int n2 = n >> 1; + Digits X0(X, 0, n2); + Digits X1(X, n2, n2); + Digits Y0(Y, 0, n2); + Digits Y1(Y, n2, n2); + RWDigits scratch_for_recursion(scratch, 2 * n, 2 * n); + RWDigits P0(scratch, 0, n); + KaratsubaMain(P0, X0, Y0, scratch_for_recursion, n2); + MAYBE_TERMINATE + for (int i = 0; i < n; i++) Z[i] = P0[i]; + RWDigits P2(scratch, n, n); + KaratsubaMain(P2, X1, Y1, scratch_for_recursion, n2); + MAYBE_TERMINATE + RWDigits Z2 = Z + n; + int end = std::min(Z2.len(), P2.len()); + for (int i = 0; i < end; i++) Z2[i] = P2[i]; + for (int i = end; i < n; i++) { + DCHECK(P2[i] == 0); // NOLINT(readability/check) + } + // The intermediate result can be one digit too large; the subtraction + // below will fix this. + digit_t overflow = AddAndReturnOverflow(Z + n2, P0); + overflow += AddAndReturnOverflow(Z + n2, P2); + RWDigits X_diff(scratch, 0, n2); + RWDigits Y_diff(scratch, n2, n2); + int sign = 1; + KaratsubaSubtractionHelper(X_diff, X1, X0, &sign); + KaratsubaSubtractionHelper(Y_diff, Y0, Y1, &sign); + RWDigits P1(scratch, n, n); + KaratsubaMain(P1, X_diff, Y_diff, scratch_for_recursion, n2); + if (sign > 0) { + overflow += AddAndReturnOverflow(Z + n2, P1); + } else { + overflow -= SubAndReturnBorrow(Z + n2, P1); + } + // The intermediate result may have been bigger, but the final result fits. + DCHECK(overflow == 0); // NOLINT(readability/check) + USE(overflow); +} + +} // namespace bigint +} // namespace v8 diff --git a/deps/v8/src/bigint/mul-schoolbook.cc b/deps/v8/src/bigint/mul-schoolbook.cc index 8e10685018..9222e1e675 100644 --- a/deps/v8/src/bigint/mul-schoolbook.cc +++ b/deps/v8/src/bigint/mul-schoolbook.cc @@ -71,7 +71,6 @@ void ProcessorImpl::MultiplySchoolbook(RWDigits Z, Digits X, Digits Y) { next_carry = 0; BODY(0, i); AddWorkEstimate(i); - if (should_terminate()) return; } // Last part: i exceeds Y now, we have to be careful about bounds. int loop_end = X.len() + Y.len() - 2; @@ -85,7 +84,6 @@ void ProcessorImpl::MultiplySchoolbook(RWDigits Z, Digits X, Digits Y) { next_carry = 0; BODY(min_x_index, max_x_index); AddWorkEstimate(max_x_index - min_x_index); - if (should_terminate()) return; } // Write the last digit, and zero out any extra space in Z. 
Z[i++] = digit_add2(next, carry, &carry); diff --git a/deps/v8/src/bigint/mul-toom.cc b/deps/v8/src/bigint/mul-toom.cc new file mode 100644 index 0000000000..76a4fdcdfb --- /dev/null +++ b/deps/v8/src/bigint/mul-toom.cc @@ -0,0 +1,223 @@ +// Copyright 2021 the V8 project authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. + +// Toom-Cook multiplication. +// Reference: https://en.wikipedia.org/wiki/Toom%E2%80%93Cook_multiplication + +#include <algorithm> + +#include "src/bigint/bigint-internal.h" +#include "src/bigint/digit-arithmetic.h" +#include "src/bigint/vector-arithmetic.h" + +namespace v8 { +namespace bigint { + +namespace { + +void TimesTwo(RWDigits X) { + digit_t carry = 0; + for (int i = 0; i < X.len(); i++) { + digit_t d = X[i]; + X[i] = (d << 1) | carry; + carry = d >> (kDigitBits - 1); + } +} + +void DivideByTwo(RWDigits X) { + digit_t carry = 0; + for (int i = X.len() - 1; i >= 0; i--) { + digit_t d = X[i]; + X[i] = (d >> 1) | carry; + carry = d << (kDigitBits - 1); + } +} + +void DivideByThree(RWDigits X) { + digit_t remainder = 0; + for (int i = X.len() - 1; i >= 0; i--) { + digit_t d = X[i]; + digit_t upper = (remainder << kHalfDigitBits) | (d >> kHalfDigitBits); + digit_t u_result = upper / 3; + remainder = upper - 3 * u_result; + digit_t lower = (remainder << kHalfDigitBits) | (d & kHalfDigitMask); + digit_t l_result = lower / 3; + remainder = lower - 3 * l_result; + X[i] = (u_result << kHalfDigitBits) | l_result; + } +} + +} // namespace + +#if DEBUG +// Set {len_} to 1 rather than 0 so that attempts to access the first digit +// will crash. +#define MARK_INVALID(D) D = RWDigits(nullptr, 1) +#else +#define MARK_INVALID(D) (void(0)) +#endif + +void ProcessorImpl::Toom3Main(RWDigits Z, Digits X, Digits Y) { + DCHECK(Z.len() >= X.len() + Y.len()); + // Phase 1: Splitting. + int i = DIV_CEIL(std::max(X.len(), Y.len()), 3); + Digits X0(X, 0, i); + Digits X1(X, i, i); + Digits X2(X, 2 * i, i); + Digits Y0(Y, 0, i); + Digits Y1(Y, i, i); + Digits Y2(Y, 2 * i, i); + + // Temporary storage. + int p_len = i + 1; // For all px, qx below. + int r_len = 2 * p_len; // For all r_x, Rx below. + Storage temp_storage(4 * r_len); + // We will use the same variable names as the Wikipedia article, as much as + // C++ lets us: our "p_m1" is their "p(-1)" etc. For consistency with other + // algorithms, we use X and Y where Wikipedia uses m and n. + // We will use and re-use the temporary storage as follows: + // + // chunk | -------- time -----------> + // [0 .. i] |( po )( p_m1 ) ( r_m2 ) + // [i+1 .. rlen-1] |( qo )( q_m1 ) ( r_m2 ) + // [rlen .. rlen+i] | (p_1 ) ( p_m2 ) (r_inf) + // [rlen+i+1 .. 2*rlen-1] | (q_1 ) ( q_m2 ) (r_inf) + // [2*rlen .. 3*rlen-1] | ( r_1 ) + // [3*rlen .. 4*rlen-1] | ( r_m1 ) + // + // This requires interleaving phases 2 and 3 a bit: after computing + // r_1 = p_1 * q_1, we can re-use p_1's storage for p_m2, and so on. + digit_t* t = temp_storage.get(); + RWDigits po(t, p_len); + RWDigits qo(t + p_len, p_len); + RWDigits p_1(t + r_len, p_len); + RWDigits q_1(t + r_len + p_len, p_len); + RWDigits r_1(t + 2 * r_len, r_len); + RWDigits r_m1(t + 3 * r_len, r_len); + + // We can also share the backing stores of Z, r_0, R0. + DCHECK(Z.len() >= r_len); + RWDigits r_0(Z, 0, r_len); + + // Phase 2a: Evaluation, steps 0, 1, m1. 
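+  // Toom-3 evaluates both polynomials at the five points 0, 1, -1, -2, and
+  // infinity (the last two in phase 2b below):
+  //   p(0) = X0,  p(1) = X0 + X1 + X2,  p(-1) = X0 - X1 + X2,
+  //   p(-2) = X0 - 2*X1 + 4*X2,  p(inf) = X2,
+  // and analogously q(...) for Y.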
+ // po = X0 + X2 + Add(po, X0, X2); + // p_0 = X0 + // p_1 = po + X1 + Add(p_1, po, X1); + // p_m1 = po - X1 + RWDigits p_m1 = po; + bool p_m1_sign = SubtractSigned(p_m1, po, false, X1, false); + MARK_INVALID(po); + + // qo = Y0 + Y2 + Add(qo, Y0, Y2); + // q_0 = Y0 + // q_1 = qo + Y1 + Add(q_1, qo, Y1); + // q_m1 = qo - Y1 + RWDigits q_m1 = qo; + bool q_m1_sign = SubtractSigned(q_m1, qo, false, Y1, false); + MARK_INVALID(qo); + + // Phase 3a: Pointwise multiplication, steps 0, 1, m1. + Multiply(r_0, X0, Y0); + Multiply(r_1, p_1, q_1); + Multiply(r_m1, p_m1, q_m1); + bool r_m1_sign = p_m1_sign != q_m1_sign; + + // Phase 2b: Evaluation, steps m2 and inf. + // p_m2 = (p_m1 + X2) * 2 - X0 + RWDigits p_m2 = p_1; + MARK_INVALID(p_1); + bool p_m2_sign = AddSigned(p_m2, p_m1, p_m1_sign, X2, false); + TimesTwo(p_m2); + p_m2_sign = SubtractSigned(p_m2, p_m2, p_m2_sign, X0, false); + // p_inf = X2 + + // q_m2 = (q_m1 + Y2) * 2 - Y0 + RWDigits q_m2 = q_1; + MARK_INVALID(q_1); + bool q_m2_sign = AddSigned(q_m2, q_m1, q_m1_sign, Y2, false); + TimesTwo(q_m2); + q_m2_sign = SubtractSigned(q_m2, q_m2, q_m2_sign, Y0, false); + // q_inf = Y2 + + // Phase 3b: Pointwise multiplication, steps m2 and inf. + RWDigits r_m2(t, r_len); + MARK_INVALID(p_m1); + MARK_INVALID(q_m1); + Multiply(r_m2, p_m2, q_m2); + bool r_m2_sign = p_m2_sign != q_m2_sign; + + RWDigits r_inf(t + r_len, r_len); + MARK_INVALID(p_m2); + MARK_INVALID(q_m2); + Multiply(r_inf, X2, Y2); + + // Phase 4: Interpolation. + Digits R0 = r_0; + Digits R4 = r_inf; + // R3 <- (r_m2 - r_1) / 3 + RWDigits R3 = r_m2; + bool R3_sign = SubtractSigned(R3, r_m2, r_m2_sign, r_1, false); + DivideByThree(R3); + // R1 <- (r_1 - r_m1) / 2 + RWDigits R1 = r_1; + bool R1_sign = SubtractSigned(R1, r_1, false, r_m1, r_m1_sign); + DivideByTwo(R1); + // R2 <- r_m1 - r_0 + RWDigits R2 = r_m1; + bool R2_sign = SubtractSigned(R2, r_m1, r_m1_sign, R0, false); + // R3 <- (R2 - R3) / 2 + 2 * r_inf + R3_sign = SubtractSigned(R3, R2, R2_sign, R3, R3_sign); + DivideByTwo(R3); + // TODO(jkummerow): Would it be a measurable improvement to write an + // "AddTwice" helper? + R3_sign = AddSigned(R3, R3, R3_sign, r_inf, false); + R3_sign = AddSigned(R3, R3, R3_sign, r_inf, false); + // R2 <- R2 + R1 - R4 + R2_sign = AddSigned(R2, R2, R2_sign, R1, R1_sign); + R2_sign = SubtractSigned(R2, R2, R2_sign, R4, false); + // R1 <- R1 - R3 + R1_sign = SubtractSigned(R1, R1, R1_sign, R3, R3_sign); + +#if DEBUG + R1.Normalize(); + R2.Normalize(); + R3.Normalize(); + DCHECK(R1_sign == false || R1.len() == 0); + DCHECK(R2_sign == false || R2.len() == 0); + DCHECK(R3_sign == false || R3.len() == 0); +#endif + + // Phase 5: Recomposition. R0 is already in place. Overflow can't happen. + for (int j = R0.len(); j < Z.len(); j++) Z[j] = 0; + AddAndReturnOverflow(Z + i, R1); + AddAndReturnOverflow(Z + 2 * i, R2); + AddAndReturnOverflow(Z + 3 * i, R3); + AddAndReturnOverflow(Z + 4 * i, R4); +} + +void ProcessorImpl::MultiplyToomCook(RWDigits Z, Digits X, Digits Y) { + DCHECK(X.len() >= Y.len()); + int k = Y.len(); + // TODO(jkummerow): Would it be a measurable improvement to share the + // scratch memory for several invocations? + Digits X0(X, 0, k); + Toom3Main(Z, X0, Y); + if (X.len() > Y.len()) { + ScratchDigits T(2 * k); + for (int i = k; i < X.len(); i += k) { + Digits Xi(X, i, k); + // TODO(jkummerow): would it be a measurable improvement to craft a + // "ToomChunk" method in the style of {KaratsubaChunk}? + Toom3Main(T, Xi, Y); + AddAndReturnOverflow(Z + i, T); // Can't overflow. 
+ } + } +} + +} // namespace bigint +} // namespace v8 diff --git a/deps/v8/src/bigint/tostring.cc b/deps/v8/src/bigint/tostring.cc new file mode 100644 index 0000000000..b426c864cd --- /dev/null +++ b/deps/v8/src/bigint/tostring.cc @@ -0,0 +1,303 @@ +// Copyright 2021 the V8 project authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. + +#include <cstring> +#include <limits> + +#include "src/bigint/bigint-internal.h" +#include "src/bigint/digit-arithmetic.h" +#include "src/bigint/div-helpers.h" +#include "src/bigint/util.h" +#include "src/bigint/vector-arithmetic.h" + +namespace v8 { +namespace bigint { + +namespace { + +// Lookup table for the maximum number of bits required per character of a +// base-N string representation of a number. To increase accuracy, the array +// value is the actual value multiplied by 32. To generate this table: +// for (var i = 0; i <= 36; i++) { print(Math.ceil(Math.log2(i) * 32) + ","); } +constexpr uint8_t kMaxBitsPerChar[] = { + 0, 0, 32, 51, 64, 75, 83, 90, 96, // 0..8 + 102, 107, 111, 115, 119, 122, 126, 128, // 9..16 + 131, 134, 136, 139, 141, 143, 145, 147, // 17..24 + 149, 151, 153, 154, 156, 158, 159, 160, // 25..32 + 162, 163, 165, 166, // 33..36 +}; + +static const int kBitsPerCharTableShift = 5; +static const size_t kBitsPerCharTableMultiplier = 1u << kBitsPerCharTableShift; + +static const char kConversionChars[] = "0123456789abcdefghijklmnopqrstuvwxyz"; + +// Raises {base} to the power of {exponent}. Does not check for overflow. +digit_t digit_pow(digit_t base, digit_t exponent) { + digit_t result = 1ull; + while (exponent > 0) { + if (exponent & 1) { + result *= base; + } + exponent >>= 1; + base *= base; + } + return result; +} + +// Compile-time version of the above. +constexpr digit_t digit_pow_rec(digit_t base, digit_t exponent) { + return exponent == 1 ? base : base * digit_pow_rec(base, exponent - 1); +} + +// A variant of ToStringFormatter::BasecaseLast, specialized for a radix +// known at compile-time. +template <int radix> +char* BasecaseFixedLast(digit_t chunk, char* out) { + while (chunk != 0) { + DCHECK(*(out - 1) == kStringZapValue); // NOLINT(readability/check) + if (radix <= 10) { + *(--out) = '0' + (chunk % radix); + } else { + *(--out) = kConversionChars[chunk % radix]; + } + chunk /= radix; + } + return out; +} + +// By making {radix} a compile-time constant and computing {chunk_divisor} +// as another compile-time constant from it, we allow the compiler to emit +// an optimized instruction sequence based on multiplications with "magic" +// numbers (modular multiplicative inverses) instead of actual divisions. +// The price we pay is having to work on half digits; the technique doesn't +// work with twodigit_t-by-digit_t divisions. +// Includes an equivalent of ToStringFormatter::BasecaseMiddle, accordingly +// specialized for a radix known at compile time. 
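+// For example, with 64-bit digits (kHalfDigitBits == 32) and radix 10,
+// kMaxBitsPerChar[10] == 107, so chunk_chars == 32 * 32 / 107 == 9 and
+// chunk_divisor == 10^9, a constant the compiler can divide by using the
+// "magic number" multiplication mentioned above.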
+template <digit_t radix> +char* DivideByMagic(RWDigits rest, Digits input, char* output) { + constexpr uint8_t max_bits_per_char = kMaxBitsPerChar[radix]; + constexpr int chunk_chars = + kHalfDigitBits * kBitsPerCharTableMultiplier / max_bits_per_char; + constexpr digit_t chunk_divisor = digit_pow_rec(radix, chunk_chars); + digit_t remainder = 0; + for (int i = input.len() - 1; i >= 0; i--) { + digit_t d = input[i]; + digit_t upper = (remainder << kHalfDigitBits) | (d >> kHalfDigitBits); + digit_t u_result = upper / chunk_divisor; + remainder = upper % chunk_divisor; + digit_t lower = (remainder << kHalfDigitBits) | (d & kHalfDigitMask); + digit_t l_result = lower / chunk_divisor; + remainder = lower % chunk_divisor; + rest[i] = (u_result << kHalfDigitBits) | l_result; + } + // {remainder} is now the current chunk to be written out. + for (int i = 0; i < chunk_chars; i++) { + DCHECK(*(output - 1) == kStringZapValue); // NOLINT(readability/check) + if (radix <= 10) { + *(--output) = '0' + (remainder % radix); + } else { + *(--output) = kConversionChars[remainder % radix]; + } + remainder /= radix; + } + DCHECK(remainder == 0); // NOLINT(readability/check) + return output; +} + +class ToStringFormatter { + public: + ToStringFormatter(Digits X, int radix, bool sign, char* out, + int chars_available, ProcessorImpl* processor) + : digits_(X), + radix_(radix), + sign_(sign), + out_start_(out), + out_end_(out + chars_available), + out_(out_end_), + processor_(processor) { + DCHECK(chars_available >= ToStringResultLength(digits_, radix_, sign_)); + } + + void Start(); + int Finish(); + + void Classic() { + if (digits_.len() == 0) { + *(--out_) = '0'; + return; + } + if (digits_.len() == 1) { + out_ = BasecaseLast(digits_[0], out_); + return; + } + // {rest} holds the part of the BigInt that we haven't looked at yet. + // Not to be confused with "remainder"! + ScratchDigits rest(digits_.len()); + // In the first round, divide the input, allocating a new BigInt for + // the result == rest; from then on divide the rest in-place. + Digits dividend = digits_; + do { + if (radix_ == 10) { + // Faster but costs binary size, so we optimize the most common case. + out_ = DivideByMagic<10>(rest, dividend, out_); + processor_->AddWorkEstimate(rest.len() * 2); + } else { + digit_t chunk; + processor_->DivideSingle(rest, &chunk, dividend, chunk_divisor_); + out_ = BasecaseMiddle(chunk, out_); + // Assume that a division is about ten times as expensive as a + // multiplication. + processor_->AddWorkEstimate(rest.len() * 10); + } + if (processor_->should_terminate()) return; + rest.Normalize(); + dividend = rest; + } while (rest.len() > 1); + out_ = BasecaseLast(rest[0], out_); + } + + void BasePowerOfTwo(); + + private: + // When processing the last (most significant) digit, don't write leading + // zeros. + char* BasecaseLast(digit_t digit, char* out) { + if (radix_ == 10) return BasecaseFixedLast<10>(digit, out); + do { + DCHECK(*(out - 1) == kStringZapValue); // NOLINT(readability/check) + *(--out) = kConversionChars[digit % radix_]; + digit /= radix_; + } while (digit > 0); + return out; + } + + // When processing a middle (non-most significant) digit, always write the + // same number of characters (as many '0' as necessary). 
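+  // Each middle chunk stands for exactly chunk_chars_ characters of the
+  // output, so interior zeros are significant; only the most significant
+  // chunk (handled by BasecaseLast above) may drop leading zeros.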
+ char* BasecaseMiddle(digit_t digit, char* out) { + for (int i = 0; i < chunk_chars_; i++) { + DCHECK(*(out - 1) == kStringZapValue); // NOLINT(readability/check) + *(--out) = kConversionChars[digit % radix_]; + digit /= radix_; + } + DCHECK(digit == 0); // NOLINT(readability/check) + return out; + } + + Digits digits_; + int radix_; + int max_bits_per_char_ = 0; + int chunk_chars_ = 0; + bool sign_; + char* out_start_; + char* out_end_; + char* out_; + digit_t chunk_divisor_ = 0; + ProcessorImpl* processor_; +}; + +// Prepares data for {Classic}. Not needed for {BasePowerOfTwo}. +void ToStringFormatter::Start() { + max_bits_per_char_ = kMaxBitsPerChar[radix_]; + chunk_chars_ = kDigitBits * kBitsPerCharTableMultiplier / max_bits_per_char_; + chunk_divisor_ = digit_pow(radix_, chunk_chars_); + // By construction of chunk_chars_, there can't have been overflow. + DCHECK(chunk_divisor_ != 0); // NOLINT(readability/check) +} + +int ToStringFormatter::Finish() { + DCHECK(out_ >= out_start_); + DCHECK(out_ < out_end_); // At least one character was written. + while (out_ < out_end_ && *out_ == '0') out_++; + if (sign_) *(--out_) = '-'; + int excess = 0; + if (out_ > out_start_) { + size_t actual_length = out_end_ - out_; + excess = static_cast<int>(out_ - out_start_); + std::memmove(out_start_, out_, actual_length); + } + return excess; +} + +void ToStringFormatter::BasePowerOfTwo() { + const int bits_per_char = CountTrailingZeros(radix_); + const int char_mask = radix_ - 1; + digit_t digit = 0; + // Keeps track of how many unprocessed bits there are in {digit}. + int available_bits = 0; + for (int i = 0; i < digits_.len() - 1; i++) { + digit_t new_digit = digits_[i]; + // Take any leftover bits from the last iteration into account. + int current = (digit | (new_digit << available_bits)) & char_mask; + *(--out_) = kConversionChars[current]; + int consumed_bits = bits_per_char - available_bits; + digit = new_digit >> consumed_bits; + available_bits = kDigitBits - consumed_bits; + while (available_bits >= bits_per_char) { + *(--out_) = kConversionChars[digit & char_mask]; + digit >>= bits_per_char; + available_bits -= bits_per_char; + } + } + // Take any leftover bits from the last iteration into account. 
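+  // The most significant digit is handled separately below so that no
+  // leading zero characters are emitted for it.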
+ digit_t msd = digits_.msd(); + int current = (digit | (msd << available_bits)) & char_mask; + *(--out_) = kConversionChars[current]; + digit = msd >> (bits_per_char - available_bits); + while (digit != 0) { + *(--out_) = kConversionChars[digit & char_mask]; + digit >>= bits_per_char; + } +} + +} // namespace + +void ProcessorImpl::ToString(char* out, int* out_length, Digits X, int radix, + bool sign) { +#if DEBUG + for (int i = 0; i < *out_length; i++) out[i] = kStringZapValue; +#endif + ToStringFormatter formatter(X, radix, sign, out, *out_length, this); + if (IsPowerOfTwo(radix)) { + formatter.BasePowerOfTwo(); + } else { + formatter.Start(); + formatter.Classic(); + } + int excess = formatter.Finish(); + *out_length -= excess; +} + +Status Processor::ToString(char* out, int* out_length, Digits X, int radix, + bool sign) { + ProcessorImpl* impl = static_cast<ProcessorImpl*>(this); + impl->ToString(out, out_length, X, radix, sign); + return impl->get_and_clear_status(); +} + +int ToStringResultLength(Digits X, int radix, bool sign) { + const int bit_length = BitLength(X); + int result; + if (IsPowerOfTwo(radix)) { + const int bits_per_char = CountTrailingZeros(radix); + result = DIV_CEIL(bit_length, bits_per_char) + sign; + } else { + // Maximum number of bits we can represent with one character. + const uint8_t max_bits_per_char = kMaxBitsPerChar[radix]; + // For estimating the result length, we have to be pessimistic and work with + // the minimum number of bits one character can represent. + const uint8_t min_bits_per_char = max_bits_per_char - 1; + // Perform the following computation with uint64_t to avoid overflows. + uint64_t chars_required = bit_length; + chars_required *= kBitsPerCharTableMultiplier; + chars_required = DIV_CEIL(chars_required, min_bits_per_char); + DCHECK(chars_required < std::numeric_limits<int>::max()); + result = static_cast<int>(chars_required); + } + result += sign; + return result; +} + +} // namespace bigint +} // namespace v8 diff --git a/deps/v8/src/bigint/util.h b/deps/v8/src/bigint/util.h new file mode 100644 index 0000000000..3d98e734dc --- /dev/null +++ b/deps/v8/src/bigint/util.h @@ -0,0 +1,75 @@ +// Copyright 2021 the V8 project authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. + +// "Generic" helper functions (not specific to BigInts). + +#include <stdint.h> + +#include <type_traits> + +#ifdef _MSC_VER +#include <intrin.h> // For _BitScanReverse. +#endif + +#ifndef V8_BIGINT_UTIL_H_ +#define V8_BIGINT_UTIL_H_ + +// Integer division, rounding up. +#define DIV_CEIL(x, y) (((x)-1) / (y) + 1) + +namespace v8 { +namespace bigint { + +// Rounds up x to a multiple of y. +inline constexpr int RoundUp(int x, int y) { return (x + y - 1) & -y; } + +// Different environments disagree on how 64-bit uintptr_t and uint64_t are +// defined, so we have to use templates to be generic. +template <typename T, typename = typename std::enable_if< + std::is_unsigned<T>::value && sizeof(T) == 8>::type> +constexpr int CountLeadingZeros(T value) { +#if __GNUC__ || __clang__ + return value == 0 ? 64 : __builtin_clzll(value); +#elif _MSC_VER + unsigned long index = 0; // NOLINT(runtime/int). MSVC insists. + return _BitScanReverse64(&index, value) ? 63 - index : 64; +#else +#error Unsupported compiler. +#endif +} + +constexpr int CountLeadingZeros(uint32_t value) { +#if __GNUC__ || __clang__ + return value == 0 ? 
32 : __builtin_clz(value); +#elif _MSC_VER + unsigned long index = 0; // NOLINT(runtime/int). MSVC insists. + return _BitScanReverse(&index, value) ? 31 - index : 32; +#else +#error Unsupported compiler. +#endif +} + +inline constexpr int CountTrailingZeros(uint32_t value) { +#if __GNUC__ || __clang__ + return value == 0 ? 32 : __builtin_ctz(value); +#elif _MSC_VER + unsigned long index = 0; // NOLINT(runtime/int). + return _BitScanForward(&index, value) ? index : 32; +#else +#error Unsupported compiler. +#endif +} + +inline constexpr int BitLength(int n) { + return 32 - CountLeadingZeros(static_cast<uint32_t>(n)); +} + +inline constexpr bool IsPowerOfTwo(int value) { + return value > 0 && (value & (value - 1)) == 0; +} + +} // namespace bigint +} // namespace v8 + +#endif // V8_BIGINT_UTIL_H_ diff --git a/deps/v8/src/bigint/vector-arithmetic.cc b/deps/v8/src/bigint/vector-arithmetic.cc index 734b443911..4191755bc9 100644 --- a/deps/v8/src/bigint/vector-arithmetic.cc +++ b/deps/v8/src/bigint/vector-arithmetic.cc @@ -4,18 +4,118 @@ #include "src/bigint/vector-arithmetic.h" +#include "src/bigint/bigint-internal.h" +#include "src/bigint/digit-arithmetic.h" + namespace v8 { namespace bigint { -int Compare(Digits A, Digits B) { - A.Normalize(); - B.Normalize(); - int diff = A.len() - B.len(); - if (diff != 0) return diff; - int i = A.len() - 1; - while (i >= 0 && A[i] == B[i]) i--; - if (i < 0) return 0; - return A[i] > B[i] ? 1 : -1; +digit_t AddAndReturnOverflow(RWDigits Z, Digits X) { + X.Normalize(); + if (X.len() == 0) return 0; + digit_t carry = 0; + int i = 0; + for (; i < X.len(); i++) { + Z[i] = digit_add3(Z[i], X[i], carry, &carry); + } + for (; i < Z.len() && carry != 0; i++) { + Z[i] = digit_add2(Z[i], carry, &carry); + } + return carry; +} + +digit_t SubAndReturnBorrow(RWDigits Z, Digits X) { + X.Normalize(); + if (X.len() == 0) return 0; + digit_t borrow = 0; + int i = 0; + for (; i < X.len(); i++) { + Z[i] = digit_sub2(Z[i], X[i], borrow, &borrow); + } + for (; i < Z.len() && borrow != 0; i++) { + Z[i] = digit_sub(Z[i], borrow, &borrow); + } + return borrow; +} + +void Add(RWDigits Z, Digits X, Digits Y) { + if (X.len() < Y.len()) { + return Add(Z, Y, X); + } + int i = 0; + digit_t carry = 0; + for (; i < Y.len(); i++) { + Z[i] = digit_add3(X[i], Y[i], carry, &carry); + } + for (; i < X.len(); i++) { + Z[i] = digit_add2(X[i], carry, &carry); + } + for (; i < Z.len(); i++) { + Z[i] = carry; + carry = 0; + } +} + +void Subtract(RWDigits Z, Digits X, Digits Y) { + X.Normalize(); + Y.Normalize(); + DCHECK(X.len() >= Y.len()); + int i = 0; + digit_t borrow = 0; + for (; i < Y.len(); i++) { + Z[i] = digit_sub2(X[i], Y[i], borrow, &borrow); + } + for (; i < X.len(); i++) { + Z[i] = digit_sub(X[i], borrow, &borrow); + } + DCHECK(borrow == 0); // NOLINT(readability/check) + for (; i < Z.len(); i++) Z[i] = 0; +} + +digit_t AddAndReturnCarry(RWDigits Z, Digits X, Digits Y) { + DCHECK(Z.len() >= Y.len() && X.len() >= Y.len()); + digit_t carry = 0; + for (int i = 0; i < Y.len(); i++) { + Z[i] = digit_add3(X[i], Y[i], carry, &carry); + } + return carry; +} + +digit_t SubtractAndReturnBorrow(RWDigits Z, Digits X, Digits Y) { + DCHECK(Z.len() >= Y.len() && X.len() >= Y.len()); + digit_t borrow = 0; + for (int i = 0; i < Y.len(); i++) { + Z[i] = digit_sub2(X[i], Y[i], borrow, &borrow); + } + return borrow; +} + +bool AddSigned(RWDigits Z, Digits X, bool x_negative, Digits Y, + bool y_negative) { + if (x_negative == y_negative) { + Add(Z, X, Y); + return x_negative; + } + if (GreaterThanOrEqual(X, 
Y)) { + Subtract(Z, X, Y); + return x_negative; + } + Subtract(Z, Y, X); + return !x_negative; +} + +bool SubtractSigned(RWDigits Z, Digits X, bool x_negative, Digits Y, + bool y_negative) { + if (x_negative != y_negative) { + Add(Z, X, Y); + return x_negative; + } + if (GreaterThanOrEqual(X, Y)) { + Subtract(Z, X, Y); + return x_negative; + } + Subtract(Z, Y, X); + return !x_negative; } } // namespace bigint diff --git a/deps/v8/src/bigint/vector-arithmetic.h b/deps/v8/src/bigint/vector-arithmetic.h index 617cb20b55..3247660f95 100644 --- a/deps/v8/src/bigint/vector-arithmetic.h +++ b/deps/v8/src/bigint/vector-arithmetic.h @@ -8,12 +8,52 @@ #define V8_BIGINT_VECTOR_ARITHMETIC_H_ #include "src/bigint/bigint.h" +#include "src/bigint/digit-arithmetic.h" namespace v8 { namespace bigint { +// Z += X. Returns carry on overflow. +digit_t AddAndReturnOverflow(RWDigits Z, Digits X); + +// Z -= X. Returns borrow on overflow. +digit_t SubAndReturnBorrow(RWDigits Z, Digits X); + +// X += y. +inline void Add(RWDigits X, digit_t y) { + digit_t carry = y; + int i = 0; + do { + X[i] = digit_add2(X[i], carry, &carry); + i++; + } while (carry != 0); +} + +// X -= y. +inline void Subtract(RWDigits X, digit_t y) { + digit_t borrow = y; + int i = 0; + do { + X[i] = digit_sub(X[i], borrow, &borrow); + i++; + } while (borrow != 0); +} + +// These add exactly Y's digits to the matching digits in X, storing the +// result in (part of) Z, and return the carry/borrow. +digit_t AddAndReturnCarry(RWDigits Z, Digits X, Digits Y); +digit_t SubtractAndReturnBorrow(RWDigits Z, Digits X, Digits Y); + inline bool IsDigitNormalized(Digits X) { return X.len() == 0 || X.msd() != 0; } +inline bool GreaterThanOrEqual(Digits A, Digits B) { + return Compare(A, B) >= 0; +} + +inline int BitLength(Digits X) { + return X.len() * kDigitBits - CountLeadingZeros(X.msd()); +} + } // namespace bigint } // namespace v8 |