Diffstat (limited to 'deps/v8/src/bigint')
-rw-r--r--  deps/v8/src/bigint/OWNERS                |   1
-rw-r--r--  deps/v8/src/bigint/bigint-internal.cc    |  68
-rw-r--r--  deps/v8/src/bigint/bigint-internal.h     |  62
-rw-r--r--  deps/v8/src/bigint/bigint.h              |  60
-rw-r--r--  deps/v8/src/bigint/digit-arithmetic.h    | 108
-rw-r--r--  deps/v8/src/bigint/div-burnikel.cc       | 279
-rw-r--r--  deps/v8/src/bigint/div-helpers.cc        |  68
-rw-r--r--  deps/v8/src/bigint/div-helpers.h         |  82
-rw-r--r--  deps/v8/src/bigint/div-schoolbook.cc     | 190
-rw-r--r--  deps/v8/src/bigint/mul-fft.cc            | 790
-rw-r--r--  deps/v8/src/bigint/mul-karatsuba.cc      | 205
-rw-r--r--  deps/v8/src/bigint/mul-schoolbook.cc     |   2
-rw-r--r--  deps/v8/src/bigint/mul-toom.cc           | 223
-rw-r--r--  deps/v8/src/bigint/tostring.cc           | 303
-rw-r--r--  deps/v8/src/bigint/util.h                |  75
-rw-r--r--  deps/v8/src/bigint/vector-arithmetic.cc  | 118
-rw-r--r--  deps/v8/src/bigint/vector-arithmetic.h   |  40
17 files changed, 2655 insertions, 19 deletions
diff --git a/deps/v8/src/bigint/OWNERS b/deps/v8/src/bigint/OWNERS
index f8c2fcc47a..8a2ddcc49d 100644
--- a/deps/v8/src/bigint/OWNERS
+++ b/deps/v8/src/bigint/OWNERS
@@ -1,2 +1,3 @@
jkummerow@chromium.org
+mslekova@chromium.org
thibaudm@chromium.org
diff --git a/deps/v8/src/bigint/bigint-internal.cc b/deps/v8/src/bigint/bigint-internal.cc
index 6630c6c4c9..828a450e8a 100644
--- a/deps/v8/src/bigint/bigint-internal.cc
+++ b/deps/v8/src/bigint/bigint-internal.cc
@@ -30,7 +30,61 @@ void ProcessorImpl::Multiply(RWDigits Z, Digits X, Digits Y) {
if (X.len() == 0 || Y.len() == 0) return Z.Clear();
if (X.len() < Y.len()) std::swap(X, Y);
if (Y.len() == 1) return MultiplySingle(Z, X, Y[0]);
- return MultiplySchoolbook(Z, X, Y);
+ if (Y.len() < kKaratsubaThreshold) return MultiplySchoolbook(Z, X, Y);
+#if !V8_ADVANCED_BIGINT_ALGORITHMS
+ return MultiplyKaratsuba(Z, X, Y);
+#else
+ if (Y.len() < kToomThreshold) return MultiplyKaratsuba(Z, X, Y);
+ if (Y.len() < kFftThreshold) return MultiplyToomCook(Z, X, Y);
+ return MultiplyFFT(Z, X, Y);
+#endif
+}
+
+void ProcessorImpl::Divide(RWDigits Q, Digits A, Digits B) {
+ A.Normalize();
+ B.Normalize();
+ DCHECK(B.len() > 0); // NOLINT(readability/check)
+ int cmp = Compare(A, B);
+ if (cmp < 0) return Q.Clear();
+ if (cmp == 0) {
+ Q[0] = 1;
+ for (int i = 1; i < Q.len(); i++) Q[i] = 0;
+ return;
+ }
+ if (B.len() == 1) {
+ digit_t remainder;
+ return DivideSingle(Q, &remainder, A, B[0]);
+ }
+ if (B.len() < kBurnikelThreshold) {
+ return DivideSchoolbook(Q, RWDigits(nullptr, 0), A, B);
+ }
+ return DivideBurnikelZiegler(Q, RWDigits(nullptr, 0), A, B);
+}
+
+void ProcessorImpl::Modulo(RWDigits R, Digits A, Digits B) {
+ A.Normalize();
+ B.Normalize();
+ DCHECK(B.len() > 0); // NOLINT(readability/check)
+ int cmp = Compare(A, B);
+ if (cmp < 0) {
+    for (int i = 0; i < A.len(); i++) R[i] = A[i];
+    for (int i = A.len(); i < R.len(); i++) R[i] = 0;
+ return;
+ }
+ if (cmp == 0) return R.Clear();
+ if (B.len() == 1) {
+ digit_t remainder;
+ DivideSingle(RWDigits(nullptr, 0), &remainder, A, B[0]);
+ R[0] = remainder;
+ for (int i = 1; i < R.len(); i++) R[i] = 0;
+ return;
+ }
+ if (B.len() < kBurnikelThreshold) {
+ return DivideSchoolbook(RWDigits(nullptr, 0), R, A, B);
+ }
+ int q_len = DivideResultLength(A, B);
+ ScratchDigits Q(q_len);
+ return DivideBurnikelZiegler(Q, R, A, B);
}
Status Processor::Multiply(RWDigits Z, Digits X, Digits Y) {
@@ -39,5 +93,17 @@ Status Processor::Multiply(RWDigits Z, Digits X, Digits Y) {
return impl->get_and_clear_status();
}
+Status Processor::Divide(RWDigits Q, Digits A, Digits B) {
+ ProcessorImpl* impl = static_cast<ProcessorImpl*>(this);
+ impl->Divide(Q, A, B);
+ return impl->get_and_clear_status();
+}
+
+Status Processor::Modulo(RWDigits R, Digits A, Digits B) {
+ ProcessorImpl* impl = static_cast<ProcessorImpl*>(this);
+ impl->Modulo(R, A, B);
+ return impl->get_and_clear_status();
+}
+
} // namespace bigint
} // namespace v8
diff --git a/deps/v8/src/bigint/bigint-internal.h b/deps/v8/src/bigint/bigint-internal.h
index efe63a06a5..41ef9526e5 100644
--- a/deps/v8/src/bigint/bigint-internal.h
+++ b/deps/v8/src/bigint/bigint-internal.h
@@ -5,11 +5,20 @@
#ifndef V8_BIGINT_BIGINT_INTERNAL_H_
#define V8_BIGINT_BIGINT_INTERNAL_H_
+#include <memory>
+
#include "src/bigint/bigint.h"
namespace v8 {
namespace bigint {
+constexpr int kKaratsubaThreshold = 34;
+constexpr int kToomThreshold = 193;
+constexpr int kFftThreshold = 1500;
+constexpr int kFftInnerThreshold = 200;
+
+constexpr int kBurnikelThreshold = 57;
+
class ProcessorImpl : public Processor {
public:
explicit ProcessorImpl(Platform* platform);
@@ -21,7 +30,31 @@ class ProcessorImpl : public Processor {
void MultiplySingle(RWDigits Z, Digits X, digit_t y);
void MultiplySchoolbook(RWDigits Z, Digits X, Digits Y);
- private:
+ void MultiplyKaratsuba(RWDigits Z, Digits X, Digits Y);
+ void KaratsubaStart(RWDigits Z, Digits X, Digits Y, RWDigits scratch, int k);
+ void KaratsubaChunk(RWDigits Z, Digits X, Digits Y, RWDigits scratch);
+ void KaratsubaMain(RWDigits Z, Digits X, Digits Y, RWDigits scratch, int n);
+
+ void Divide(RWDigits Q, Digits A, Digits B);
+ void DivideSingle(RWDigits Q, digit_t* remainder, Digits A, digit_t b);
+ void DivideSchoolbook(RWDigits Q, RWDigits R, Digits A, Digits B);
+ void DivideBurnikelZiegler(RWDigits Q, RWDigits R, Digits A, Digits B);
+
+ void Modulo(RWDigits R, Digits A, Digits B);
+
+#if V8_ADVANCED_BIGINT_ALGORITHMS
+ void MultiplyToomCook(RWDigits Z, Digits X, Digits Y);
+ void Toom3Main(RWDigits Z, Digits X, Digits Y);
+
+ void MultiplyFFT(RWDigits Z, Digits X, Digits Y);
+#endif // V8_ADVANCED_BIGINT_ALGORITHMS
+
+ // {out_length} initially contains the allocated capacity of {out}, and
+ // upon return will be set to the actual length of the result string.
+ void ToString(char* out, int* out_length, Digits X, int radix, bool sign);
+
+ bool should_terminate() { return status_ == Status::kInterrupted; }
+
// Each unit is supposed to represent approximately one CPU {mul} instruction.
// Doesn't need to be accurate; we just want to make sure to check for
// interrupt requests every now and then (roughly every 10-100 ms; often
@@ -39,8 +72,7 @@ class ProcessorImpl : public Processor {
}
}
- bool should_terminate() { return status_ == Status::kInterrupted; }
-
+ private:
uintptr_t work_estimate_{0};
Status status_{Status::kOk};
Platform* platform_;
@@ -59,6 +91,30 @@ class ProcessorImpl : public Processor {
#define DCHECK(cond) (void(0))
#endif
+#define USE(var) ((void)var)
+
+// RAII memory for a Digits array.
+class Storage {
+ public:
+ explicit Storage(int count) : ptr_(new digit_t[count]) {}
+
+ digit_t* get() { return ptr_.get(); }
+
+ private:
+ std::unique_ptr<digit_t[]> ptr_;
+};
+
+// A writable Digits array with attached storage.
+class ScratchDigits : public RWDigits {
+ public:
+ explicit ScratchDigits(int len) : RWDigits(nullptr, len), storage_(len) {
+ digits_ = storage_.get();
+ }
+
+ private:
+ Storage storage_;
+};
+
} // namespace bigint
} // namespace v8
diff --git a/deps/v8/src/bigint/bigint.h b/deps/v8/src/bigint/bigint.h
index a365359c53..6d3790808c 100644
--- a/deps/v8/src/bigint/bigint.h
+++ b/deps/v8/src/bigint/bigint.h
@@ -101,7 +101,7 @@ class Digits {
const digit_t* digits() const { return digits_; }
protected:
- friend class TemporaryLeftShift;
+ friend class ShiftedDigits;
digit_t* digits_;
int len_;
@@ -201,12 +201,35 @@ class Platform {
//
// The operations are divided into two groups: "fast" (O(n) with small
// coefficient) operations are exposed directly as free functions, "slow"
-// operations are methods on a {BigIntProcessor} object, which provides
+// operations are methods on a {Processor} object, which provides
// support for interrupting execution via the {Platform}'s {InterruptRequested}
// mechanism when it takes too long. These functions return a {Status} value.
// Returns r such that r < 0 if A < B; r > 0 if A > B; r == 0 if A == B.
-int Compare(Digits A, Digits B);
+// Defined here to be inlineable, which helps ia32 a lot (64-bit platforms
+// don't care).
+inline int Compare(Digits A, Digits B) {
+ A.Normalize();
+ B.Normalize();
+ int diff = A.len() - B.len();
+ if (diff != 0) return diff;
+ int i = A.len() - 1;
+ while (i >= 0 && A[i] == B[i]) i--;
+ if (i < 0) return 0;
+ return A[i] > B[i] ? 1 : -1;
+}
+
+// Z := X + Y
+void Add(RWDigits Z, Digits X, Digits Y);
+// Addition of signed integers. Returns true if the result is negative.
+bool AddSigned(RWDigits Z, Digits X, bool x_negative, Digits Y,
+ bool y_negative);
+
+// Z := X - Y. Requires X >= Y.
+void Subtract(RWDigits Z, Digits X, Digits Y);
+// Subtraction of signed integers. Returns true if the result is negative.
+bool SubtractSigned(RWDigits Z, Digits X, bool x_negative, Digits Y,
+ bool y_negative);
enum class Status { kOk, kInterrupted };
@@ -215,7 +238,7 @@ class Processor {
// Takes ownership of {platform}.
static Processor* New(Platform* platform);
- // Use this for any std::unique_ptr holding an instance of BigIntProcessor.
+ // Use this for any std::unique_ptr holding an instance of {Processor}.
class Destroyer {
public:
void operator()(Processor* proc) { proc->Destroy(); }
@@ -225,11 +248,40 @@ class Processor {
// Z := X * Y
Status Multiply(RWDigits Z, Digits X, Digits Y);
+ // Q := A / B
+ Status Divide(RWDigits Q, Digits A, Digits B);
+ // R := A % B
+ Status Modulo(RWDigits R, Digits A, Digits B);
+
+ // {out_length} initially contains the allocated capacity of {out}, and
+ // upon return will be set to the actual length of the result string.
+ Status ToString(char* out, int* out_length, Digits X, int radix, bool sign);
};
+inline int AddResultLength(int x_length, int y_length) {
+ return std::max(x_length, y_length) + 1;
+}
+inline int AddSignedResultLength(int x_length, int y_length, bool same_sign) {
+ return same_sign ? AddResultLength(x_length, y_length)
+ : std::max(x_length, y_length);
+}
+inline int SubtractResultLength(int x_length, int y_length) { return x_length; }
+inline int SubtractSignedResultLength(int x_length, int y_length,
+ bool same_sign) {
+ return same_sign ? std::max(x_length, y_length)
+ : AddResultLength(x_length, y_length);
+}
inline int MultiplyResultLength(Digits X, Digits Y) {
return X.len() + Y.len();
}
+inline int DivideResultLength(Digits A, Digits B) {
+ return A.len() - B.len() + 1;
+}
+inline int ModuloResultLength(Digits B) { return B.len(); }
+
+int ToStringResultLength(Digits X, int radix, bool sign);
+// In DEBUG builds, the result of {ToString} will be initialized to this value.
+constexpr char kStringZapValue = '?';
} // namespace bigint
} // namespace v8
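Note: a minimal usage sketch of the new public Divide/Modulo entry points together with the result-length helpers declared above. This is not part of the change: DummyPlatform, the std::vector-based digit storage, and the assumption that Platform's only hook is the virtual InterruptRequested() mentioned above are illustrative choices, and it further assumes A.len() >= B.len().

#include <memory>
#include <vector>

#include "src/bigint/bigint.h"

using namespace v8::bigint;

// Hypothetical Platform that never requests an interrupt.
class DummyPlatform : public Platform {
 public:
  bool InterruptRequested() override { return false; }
};

void DivModExample(Digits A, Digits B) {
  // {Processor::New} takes ownership of the platform; {Destroyer} is the
  // deleter recommended above for unique_ptr.
  std::unique_ptr<Processor, Processor::Destroyer> proc(
      Processor::New(new DummyPlatform()));
  // Size the outputs with the helpers declared above.
  std::vector<digit_t> q_storage(DivideResultLength(A, B));
  std::vector<digit_t> r_storage(ModuloResultLength(B));
  RWDigits Q(q_storage.data(), static_cast<int>(q_storage.size()));
  RWDigits R(r_storage.data(), static_cast<int>(r_storage.size()));
  if (proc->Divide(Q, A, B) == Status::kInterrupted) return;
  if (proc->Modulo(R, A, B) == Status::kInterrupted) return;
  // On success, A == B * Q + R with 0 <= R < B.
}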
diff --git a/deps/v8/src/bigint/digit-arithmetic.h b/deps/v8/src/bigint/digit-arithmetic.h
index 1c5c93c035..96ac949eb7 100644
--- a/deps/v8/src/bigint/digit-arithmetic.h
+++ b/deps/v8/src/bigint/digit-arithmetic.h
@@ -8,6 +8,7 @@
#define V8_BIGINT_DIGIT_ARITHMETIC_H_
#include "src/bigint/bigint.h"
+#include "src/bigint/util.h"
namespace v8 {
namespace bigint {
@@ -45,6 +46,36 @@ inline digit_t digit_add3(digit_t a, digit_t b, digit_t c, digit_t* carry) {
#endif
}
+// {borrow} will be set to 0 or 1.
+inline digit_t digit_sub(digit_t a, digit_t b, digit_t* borrow) {
+#if HAVE_TWODIGIT_T
+ twodigit_t result = twodigit_t{a} - b;
+ *borrow = (result >> kDigitBits) & 1;
+ return static_cast<digit_t>(result);
+#else
+ digit_t result = a - b;
+ *borrow = (result > a) ? 1 : 0;
+ return result;
+#endif
+}
+
+// {borrow_out} will be set to 0 or 1.
+inline digit_t digit_sub2(digit_t a, digit_t b, digit_t borrow_in,
+ digit_t* borrow_out) {
+#if HAVE_TWODIGIT_T
+ twodigit_t subtrahend = twodigit_t{b} + borrow_in;
+ twodigit_t result = twodigit_t{a} - subtrahend;
+ *borrow_out = (result >> kDigitBits) & 1;
+ return static_cast<digit_t>(result);
+#else
+ digit_t result = a - b;
+ *borrow_out = (result > a) ? 1 : 0;
+ if (result < borrow_in) *borrow_out += 1;
+ result -= borrow_in;
+ return result;
+#endif
+}
+
// Returns the low half of the result. High half is in {high}.
inline digit_t digit_mul(digit_t a, digit_t b, digit_t* high) {
#if HAVE_TWODIGIT_T
@@ -81,6 +112,83 @@ inline digit_t digit_mul(digit_t a, digit_t b, digit_t* high) {
#endif
}
+// Returns the quotient.
+// quotient = ((high << kDigitBits) + low - remainder) / divisor
+static inline digit_t digit_div(digit_t high, digit_t low, digit_t divisor,
+ digit_t* remainder) {
+#if defined(DCHECK)
+ DCHECK(high < divisor);
+ DCHECK(divisor != 0); // NOLINT(readability/check)
+#endif
+#if __x86_64__ && (__GNUC__ || __clang__)
+ digit_t quotient;
+ digit_t rem;
+ __asm__("divq %[divisor]"
+ // Outputs: {quotient} will be in rax, {rem} in rdx.
+ : "=a"(quotient), "=d"(rem)
+ // Inputs: put {high} into rdx, {low} into rax, and {divisor} into
+ // any register or stack slot.
+ : "d"(high), "a"(low), [divisor] "rm"(divisor));
+ *remainder = rem;
+ return quotient;
+#elif __i386__ && (__GNUC__ || __clang__)
+ digit_t quotient;
+ digit_t rem;
+ __asm__("divl %[divisor]"
+ // Outputs: {quotient} will be in eax, {rem} in edx.
+ : "=a"(quotient), "=d"(rem)
+ // Inputs: put {high} into edx, {low} into eax, and {divisor} into
+ // any register or stack slot.
+ : "d"(high), "a"(low), [divisor] "rm"(divisor));
+ *remainder = rem;
+ return quotient;
+#else
+ // Adapted from Warren, Hacker's Delight, p. 152.
+ int s = CountLeadingZeros(divisor);
+#if defined(DCHECK)
+ DCHECK(s != kDigitBits); // {divisor} is not 0.
+#endif
+ divisor <<= s;
+
+ digit_t vn1 = divisor >> kHalfDigitBits;
+ digit_t vn0 = divisor & kHalfDigitMask;
+ // {s} can be 0. {low >> kDigitBits} would be undefined behavior, so
+ // we mask the shift amount with {kShiftMask}, and the result with
+ // {s_zero_mask} which is 0 if s == 0 and all 1-bits otherwise.
+ static_assert(sizeof(intptr_t) == sizeof(digit_t),
+ "intptr_t and digit_t must have the same size");
+ const int kShiftMask = kDigitBits - 1;
+ digit_t s_zero_mask =
+ static_cast<digit_t>(static_cast<intptr_t>(-s) >> (kDigitBits - 1));
+ digit_t un32 =
+ (high << s) | ((low >> ((kDigitBits - s) & kShiftMask)) & s_zero_mask);
+ digit_t un10 = low << s;
+ digit_t un1 = un10 >> kHalfDigitBits;
+ digit_t un0 = un10 & kHalfDigitMask;
+ digit_t q1 = un32 / vn1;
+ digit_t rhat = un32 - q1 * vn1;
+
+ while (q1 >= kHalfDigitBase || q1 * vn0 > rhat * kHalfDigitBase + un1) {
+ q1--;
+ rhat += vn1;
+ if (rhat >= kHalfDigitBase) break;
+ }
+
+ digit_t un21 = un32 * kHalfDigitBase + un1 - q1 * divisor;
+ digit_t q0 = un21 / vn1;
+ rhat = un21 - q0 * vn1;
+
+ while (q0 >= kHalfDigitBase || q0 * vn0 > rhat * kHalfDigitBase + un0) {
+ q0--;
+ rhat += vn1;
+ if (rhat >= kHalfDigitBase) break;
+ }
+
+ *remainder = (un21 * kHalfDigitBase + un0 - q0 * divisor) >> s;
+ return q1 * kHalfDigitBase + q0;
+#endif
+}
+
} // namespace bigint
} // namespace v8
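A quick illustration of digit_div's contract (a sketch only, not part of this change; it assumes a 64-bit digit_t, i.e. kDigitBits == 64):

#include <cassert>

#include "src/bigint/digit-arithmetic.h"

void DigitDivExample() {
  using v8::bigint::digit_t;
  digit_t remainder = 0;
  // Divide the two-digit value 2^64 + 2 by 3. The precondition high < divisor
  // holds (1 < 3).
  digit_t quotient = v8::bigint::digit_div(1, 2, 3, &remainder);
  assert(quotient == 6148914691236517206ULL);  // == (2^64 + 2) / 3
  assert(remainder == 0);
}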
diff --git a/deps/v8/src/bigint/div-burnikel.cc b/deps/v8/src/bigint/div-burnikel.cc
new file mode 100644
index 0000000000..0caedb1cc1
--- /dev/null
+++ b/deps/v8/src/bigint/div-burnikel.cc
@@ -0,0 +1,279 @@
+// Copyright 2021 the V8 project authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+// Burnikel-Ziegler division.
+// Reference: "Fast Recursive Division" by Christoph Burnikel and Joachim
+// Ziegler, found at http://cr.yp.to/bib/1998/burnikel.ps
+
+#include <string.h>
+
+#include "src/bigint/bigint-internal.h"
+#include "src/bigint/digit-arithmetic.h"
+#include "src/bigint/div-helpers.h"
+#include "src/bigint/util.h"
+#include "src/bigint/vector-arithmetic.h"
+
+namespace v8 {
+namespace bigint {
+
+namespace {
+
+// Compares [a_high, A] with B.
+// Returns:
+// - a value < 0 if [a_high, A] < B
+// - 0 if [a_high, A] == B
+// - a value > 0 if [a_high, A] > B.
+int SpecialCompare(digit_t a_high, Digits A, Digits B) {
+ B.Normalize();
+ int a_len;
+ if (a_high == 0) {
+ A.Normalize();
+ a_len = A.len();
+ } else {
+ a_len = A.len() + 1;
+ }
+ int diff = a_len - B.len();
+ if (diff != 0) return diff;
+ int i = a_len - 1;
+ if (a_high != 0) {
+ if (a_high > B[i]) return 1;
+ if (a_high < B[i]) return -1;
+ i--;
+ }
+ while (i >= 0 && A[i] == B[i]) i--;
+ if (i < 0) return 0;
+ return A[i] > B[i] ? 1 : -1;
+}
+
+void SetOnes(RWDigits X) {
+ memset(X.digits(), 0xFF, X.len() * sizeof(digit_t));
+}
+
+// Since the Burnikel-Ziegler method is inherently recursive, we put
+// non-changing data into a container object.
+class BZ {
+ public:
+ BZ(ProcessorImpl* proc, int scratch_space)
+ : proc_(proc),
+ scratch_mem_(scratch_space >= kBurnikelThreshold ? scratch_space : 0) {}
+
+ void DivideBasecase(RWDigits Q, RWDigits R, Digits A, Digits B);
+ void D3n2n(RWDigits Q, RWDigits R, Digits A1A2, Digits A3, Digits B);
+ void D2n1n(RWDigits Q, RWDigits R, Digits A, Digits B);
+
+ private:
+ ProcessorImpl* proc_;
+ Storage scratch_mem_;
+};
+
+void BZ::DivideBasecase(RWDigits Q, RWDigits R, Digits A, Digits B) {
+ A.Normalize();
+ B.Normalize();
+ DCHECK(B.len() > 0); // NOLINT(readability/check)
+ int cmp = Compare(A, B);
+ if (cmp <= 0) {
+ Q.Clear();
+ if (cmp == 0) {
+ // If A == B, then Q=1, R=0.
+ R.Clear();
+ Q[0] = 1;
+ } else {
+ // If A < B, then Q=0, R=A.
+ PutAt(R, A, R.len());
+ }
+ return;
+ }
+ if (B.len() == 1) {
+ return proc_->DivideSingle(Q, R.digits(), A, B[0]);
+ }
+ return proc_->DivideSchoolbook(Q, R, A, B);
+}
+
+// Algorithm 2 from the paper. Variable names same as there.
+// Returns Q(uotient) and R(emainder) for A/B, with B having two thirds
+// the size of A = [A1, A2, A3].
+void BZ::D3n2n(RWDigits Q, RWDigits R, Digits A1A2, Digits A3, Digits B) {
+ DCHECK((B.len() & 1) == 0); // NOLINT(readability/check)
+ int n = B.len() / 2;
+ DCHECK(A1A2.len() == 2 * n);
+ // Actual condition is stricter than length: A < B * 2^(kDigitBits * n)
+ DCHECK(Compare(A1A2, B) < 0); // NOLINT(readability/check)
+ DCHECK(A3.len() == n);
+ DCHECK(Q.len() == n);
+ DCHECK(R.len() == 2 * n);
+ // 1. Split A into three parts A = [A1, A2, A3] with Ai < 2^(kDigitBits * n).
+ Digits A1(A1A2, n, n);
+ // 2. Split B into two parts B = [B1, B2] with Bi < 2^(kDigitBits * n).
+ Digits B1(B, n, n);
+ Digits B2(B, 0, n);
+ // 3. Distinguish the cases A1 < B1 or A1 >= B1.
+ RWDigits Qhat = Q;
+ RWDigits R1(R, n, n);
+ digit_t r1_high = 0;
+ if (Compare(A1, B1) < 0) {
+ // 3a. If A1 < B1, compute Qhat = floor([A1, A2] / B1) with remainder R1
+ // using algorithm D2n1n.
+ D2n1n(Qhat, R1, A1A2, B1);
+ if (proc_->should_terminate()) return;
+ } else {
+ // 3b. If A1 >= B1, set Qhat = 2^(kDigitBits * n) - 1 and set
+ // R1 = [A1, A2] - [B1, 0] + [0, B1]
+ SetOnes(Qhat);
+ // Step 1: compute A1 - B1, which can't underflow because of the comparison
+ // guarding this else-branch, and always has a one-digit result because
+ // of this function's preconditions.
+ RWDigits temp = R1;
+ Subtract(temp, A1, B1);
+ temp.Normalize();
+ DCHECK(temp.len() <= 1); // NOLINT(readability/check)
+ if (temp.len() > 0) r1_high = temp[0];
+ // Step 2: compute A2 + B1.
+ Digits A2(A1A2, 0, n);
+ r1_high += AddAndReturnCarry(R1, A2, B1);
+ }
+ // 4. Compute D = Qhat * B2 using (Karatsuba) multiplication.
+ RWDigits D(scratch_mem_.get(), 2 * n);
+ proc_->Multiply(D, Qhat, B2);
+ if (proc_->should_terminate()) return;
+
+ // 5. Compute Rhat = R1*2^(kDigitBits * n) + A3 - D = [R1, A3] - D.
+ PutAt(R, A3, n);
+ // 6. As long as Rhat < 0, repeat:
+ while (SpecialCompare(r1_high, R, D) < 0) {
+ // 6a. Rhat = Rhat + B
+ r1_high += AddAndReturnCarry(R, R, B);
+ // 6b. Qhat = Qhat - 1
+ Subtract(Qhat, 1);
+ }
+ // 5. Compute Rhat = R1*2^(kDigitBits * n) + A3 - D = [R1, A3] - D.
+ digit_t borrow = SubtractAndReturnBorrow(R, R, D);
+ DCHECK(borrow == r1_high);
+ DCHECK(Compare(R, B) < 0); // NOLINT(readability/check)
+ (void)borrow;
+ // 7. Return R = Rhat, Q = Qhat.
+}
+
+// Algorithm 1 from the paper. Variable names same as there.
+// Returns Q(uotient) and (R)emainder for A/B, with A twice the size of B.
+void BZ::D2n1n(RWDigits Q, RWDigits R, Digits A, Digits B) {
+ int n = B.len();
+ DCHECK(A.len() <= 2 * n);
+  // A < B * 2^(kDigitBits * n)
+ DCHECK(Compare(Digits(A, n, n), B) < 0); // NOLINT(readability/check)
+ DCHECK(Q.len() == n);
+ DCHECK(R.len() == n);
+ // 1. If n is odd or smaller than some convenient constant, compute Q and R
+ // by school division and return.
+ if ((n & 1) == 1 || n < kBurnikelThreshold) {
+ return DivideBasecase(Q, R, A, B);
+ }
+ // 2. Split A into four parts A = [A1, ..., A4] with
+ // Ai < 2^(kDigitBits * n / 2). Split B into two parts [B2, B1] with
+ // Bi < 2^(kDigitBits * n / 2).
+ Digits A1A2(A, n, n);
+ Digits A3(A, n / 2, n / 2);
+ Digits A4(A, 0, n / 2);
+ // 3. Compute the high part Q1 of floor(A/B) as
+ // Q1 = floor([A1, A2, A3] / [B1, B2]) with remainder R1 = [R11, R12],
+ // using algorithm D3n2n.
+ RWDigits Q1(Q, n / 2, n / 2);
+ ScratchDigits R1(n);
+ D3n2n(Q1, R1, A1A2, A3, B);
+ if (proc_->should_terminate()) return;
+ // 4. Compute the low part Q2 of floor(A/B) as
+ // Q2 = floor([R11, R12, A4] / [B1, B2]) with remainder R, using
+ // algorithm D3n2n.
+ RWDigits Q2(Q, 0, n / 2);
+ D3n2n(Q2, R, R1, A4, B);
+ // 5. Return Q = [Q1, Q2] and R.
+}
+
+} // namespace
+
+// Algorithm 3 from the paper. Variable names same as there.
+// Returns Q(uotient) and R(emainder) for A/B (no size restrictions).
+// R is optional, Q is not.
+void ProcessorImpl::DivideBurnikelZiegler(RWDigits Q, RWDigits R, Digits A,
+ Digits B) {
+ DCHECK(A.len() >= B.len());
+ DCHECK(R.len() == 0 || R.len() >= B.len());
+ DCHECK(Q.len() > A.len() - B.len());
+ int r = A.len();
+ int s = B.len();
+ // The requirements are:
+ // - n >= s, n as small as possible.
+ // - m must be a power of two.
+ // 1. Set m = min {2^k | 2^k * kBurnikelThreshold > s}.
+ int m = 1 << BitLength(s / kBurnikelThreshold);
+ // 2. Set j = roundup(s/m) and n = j * m.
+ int j = DIV_CEIL(s, m);
+ int n = j * m;
+  // 3. Set sigma = max{tau | 2^tau * B < 2^(kDigitBits * n)}.
+ int sigma = CountLeadingZeros(B[s - 1]);
+ int digit_shift = n - s;
+ // 4. Set B = B * 2^sigma to normalize B. Shift A by the same amount.
+ ScratchDigits B_shifted(n);
+ LeftShift(B_shifted + digit_shift, B, sigma);
+ for (int i = 0; i < digit_shift; i++) B_shifted[i] = 0;
+ B = B_shifted;
+ // We need an extra digit if A's top digit does not have enough space for
+ // the left-shift by {sigma}. Additionally, the top bit of A must be 0
+ // (see "-1" in step 5 below), which combined with B being normalized (i.e.
+ // B's top bit is 1) ensures the preconditions of the helper functions.
+ int extra_digit = CountLeadingZeros(A[r - 1]) < (sigma + 1) ? 1 : 0;
+ r = A.len() + digit_shift + extra_digit;
+ ScratchDigits A_shifted(r);
+ LeftShift(A_shifted + digit_shift, A, sigma);
+ for (int i = 0; i < digit_shift; i++) A_shifted[i] = 0;
+ A = A_shifted;
+ // 5. Set t = min{t >= 2 | A < 2^(kDigitBits * t * n - 1)}.
+ int t = std::max(DIV_CEIL(r, n), 2);
+ // 6. Split A conceptually into t blocks.
+ // 7. Set Z_(t-2) = [A_(t-1), A_(t-2)].
+ int z_len = n * 2;
+ ScratchDigits Z(z_len);
+ PutAt(Z, A + n * (t - 2), z_len);
+ // 8. For i from t-2 downto 0 do:
+ BZ bz(this, n);
+ ScratchDigits Ri(n);
+ {
+ // First iteration unrolled and specialized.
+ // We might not have n digits at the top of Q, so use temporary storage
+ // for Qi...
+ ScratchDigits Qi(n);
+ bz.D2n1n(Qi, Ri, Z, B);
+ if (should_terminate()) return;
+ // ...but there *will* be enough space for any non-zero result digits!
+ Qi.Normalize();
+ RWDigits target = Q + n * (t - 2);
+ DCHECK(Qi.len() <= target.len());
+ PutAt(target, Qi, target.len());
+ }
+ // Now loop over any remaining iterations.
+ for (int i = t - 3; i >= 0; i--) {
+ // 8b. If i > 0, set Z_(i-1) = [Ri, A_(i-1)].
+ // (De-duped with unrolled first iteration, hence reading A_(i).)
+ PutAt(Z + n, Ri, n);
+ PutAt(Z, A + n * i, n);
+ // 8a. Using algorithm D2n1n compute Qi, Ri such that Zi = B*Qi + Ri.
+ RWDigits Qi(Q, i * n, n);
+ bz.D2n1n(Qi, Ri, Z, B);
+ if (should_terminate()) return;
+ }
+ // 9. Return Q = [Q_(t-2), ..., Q_0] and R = R_0 * 2^(-sigma).
+#if DEBUG
+ for (int i = 0; i < digit_shift; i++) {
+ DCHECK(Ri[i] == 0); // NOLINT(readability/check)
+ }
+#endif
+ if (R.len() != 0) {
+ Digits Ri_part(Ri, digit_shift, Ri.len());
+ Ri_part.Normalize();
+ DCHECK(Ri_part.len() <= R.len());
+ RightShift(R, Ri_part, sigma);
+ }
+}
+
+} // namespace bigint
+} // namespace v8
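A worked instance of steps 1-2 above (the concrete divisor size s = 200 is chosen purely for illustration, with kBurnikelThreshold = 57 as defined earlier): BitLength(200 / 57) = BitLength(3) = 2, so m = 1 << 2 = 4, which is indeed the smallest power of two with m * 57 > 200. Then j = ceil(200 / 4) = 50 and n = j * m = 200, so digit_shift = n - s = 0 and only the bit-level normalization by sigma remains to be done.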
diff --git a/deps/v8/src/bigint/div-helpers.cc b/deps/v8/src/bigint/div-helpers.cc
new file mode 100644
index 0000000000..0dfca0b02c
--- /dev/null
+++ b/deps/v8/src/bigint/div-helpers.cc
@@ -0,0 +1,68 @@
+// Copyright 2021 the V8 project authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+#include "src/bigint/div-helpers.h"
+
+#include "src/bigint/bigint-internal.h"
+
+namespace v8 {
+namespace bigint {
+
+namespace {
+
+void Copy(RWDigits Z, Digits X) {
+ if (Z == X) return;
+ int i = 0;
+ for (; i < X.len(); i++) Z[i] = X[i];
+ for (; i < Z.len(); i++) Z[i] = 0;
+}
+
+} // namespace
+
+// Z := X << shift
+// Z and X may alias for an in-place shift.
+void LeftShift(RWDigits Z, Digits X, int shift) {
+ DCHECK(shift >= 0); // NOLINT(readability/check)
+ DCHECK(shift < kDigitBits);
+ DCHECK(Z.len() >= X.len());
+ if (shift == 0) return Copy(Z, X);
+ digit_t carry = 0;
+ int i = 0;
+ for (; i < X.len(); i++) {
+ digit_t d = X[i];
+ Z[i] = (d << shift) | carry;
+ carry = d >> (kDigitBits - shift);
+ }
+ if (i < Z.len()) {
+ Z[i++] = carry;
+ } else {
+ DCHECK(carry == 0); // NOLINT(readability/check)
+ }
+ for (; i < Z.len(); i++) Z[i] = 0;
+}
+
+// Z := X >> shift
+// Z and X may alias for an in-place shift.
+void RightShift(RWDigits Z, Digits X, int shift) {
+ DCHECK(shift >= 0); // NOLINT(readability/check)
+ DCHECK(shift < kDigitBits);
+ X.Normalize();
+ DCHECK(Z.len() >= X.len());
+ if (shift == 0) return Copy(Z, X);
+ int i = 0;
+ if (X.len() > 0) {
+ digit_t carry = X[0] >> shift;
+ int last = X.len() - 1;
+ for (; i < last; i++) {
+ digit_t d = X[i + 1];
+ Z[i] = (d << (kDigitBits - shift)) | carry;
+ carry = d >> shift;
+ }
+ Z[i++] = carry;
+ }
+ for (; i < Z.len(); i++) Z[i] = 0;
+}
+
+} // namespace bigint
+} // namespace v8
diff --git a/deps/v8/src/bigint/div-helpers.h b/deps/v8/src/bigint/div-helpers.h
new file mode 100644
index 0000000000..db5da3e128
--- /dev/null
+++ b/deps/v8/src/bigint/div-helpers.h
@@ -0,0 +1,82 @@
+// Copyright 2021 the V8 project authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+#ifndef V8_BIGINT_DIV_HELPERS_H_
+#define V8_BIGINT_DIV_HELPERS_H_
+
+#include <memory>
+
+#include "src/bigint/bigint.h"
+#include "src/bigint/util.h"
+
+namespace v8 {
+namespace bigint {
+
+void LeftShift(RWDigits Z, Digits X, int shift);
+void RightShift(RWDigits Z, Digits X, int shift);
+
+inline void PutAt(RWDigits Z, Digits A, int count) {
+ int len = std::min(A.len(), count);
+ int i = 0;
+ for (; i < len; i++) Z[i] = A[i];
+ for (; i < count; i++) Z[i] = 0;
+}
+
+// Division algorithms typically need to left-shift their inputs into
+// "bit-normalized" form (i.e. top bit is set). The inputs are considered
+// read-only, and V8 relies on that by allowing concurrent reads from them,
+// so by default, {ShiftedDigits} allocate temporary storage for their
+// contents. In-place modification is opt-in for cases where callers can
+// guarantee that it is safe.
+// When callers allow in-place shifting and wish to undo it, they have to do
+// so manually using {Reset()}.
+// If {shift} is not given, it is auto-detected from {original}'s
+// leading zeros.
+class ShiftedDigits : public Digits {
+ public:
+ explicit ShiftedDigits(Digits& original, int shift = -1,
+ bool allow_inplace = false)
+ : Digits(original.digits_, original.len_) {
+ int leading_zeros = CountLeadingZeros(original.msd());
+ if (shift < 0) {
+ shift = leading_zeros;
+ } else if (shift > leading_zeros) {
+ allow_inplace = false;
+ len_++;
+ }
+ shift_ = shift;
+ if (shift == 0) {
+ inplace_ = true;
+ return;
+ }
+ inplace_ = allow_inplace;
+ if (!inplace_) {
+ digit_t* digits = new digit_t[len_];
+ storage_.reset(digits);
+ digits_ = digits;
+ }
+ RWDigits rw_view(digits_, len_);
+ LeftShift(rw_view, original, shift_);
+ }
+ ~ShiftedDigits() = default;
+
+ void Reset() {
+ if (inplace_) {
+ RWDigits rw_view(digits_, len_);
+ RightShift(rw_view, rw_view, shift_);
+ }
+ }
+
+ int shift() { return shift_; }
+
+ private:
+ int shift_;
+ bool inplace_;
+ std::unique_ptr<digit_t[]> storage_;
+};
+
+} // namespace bigint
+} // namespace v8
+
+#endif // V8_BIGINT_DIV_HELPERS_H_
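The opt-in in-place mode of ShiftedDigits looks roughly like this in use (a hypothetical helper, not part of the library; it requires exclusive ownership of B and a non-zero top digit, and DivideSchoolbook below deliberately sticks to the default copying mode instead):

#include "src/bigint/div-helpers.h"

void NormalizeInPlaceExample(v8::bigint::RWDigits B) {
  // Left-shifts B in place so that its top bit is set; the shift amount is
  // auto-detected from B's leading zeros.
  v8::bigint::ShiftedDigits b_norm(B, -1, /* allow_inplace= */ true);
  // ... hand {b_norm} to division code, shifting the dividend by
  // {b_norm.shift()} as well ...
  b_norm.Reset();  // Undo the in-place left-shift, restoring B.
}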
diff --git a/deps/v8/src/bigint/div-schoolbook.cc b/deps/v8/src/bigint/div-schoolbook.cc
new file mode 100644
index 0000000000..a6295c573c
--- /dev/null
+++ b/deps/v8/src/bigint/div-schoolbook.cc
@@ -0,0 +1,190 @@
+// Copyright 2021 the V8 project authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+// "Schoolbook" division. This is loosely based on Go's implementation
+// found at https://golang.org/src/math/big/nat.go, licensed as follows:
+//
+// Copyright 2009 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file [1].
+//
+// [1] https://golang.org/LICENSE
+
+#include <limits>
+
+#include "src/bigint/bigint-internal.h"
+#include "src/bigint/digit-arithmetic.h"
+#include "src/bigint/div-helpers.h"
+#include "src/bigint/util.h"
+#include "src/bigint/vector-arithmetic.h"
+
+namespace v8 {
+namespace bigint {
+
+// Computes Q(uotient) and remainder for A/b, such that
+// Q = (A - remainder) / b, with 0 <= remainder < b.
+// If Q.len == 0, only the remainder will be returned.
+// Q may be the same as A for an in-place division.
+void ProcessorImpl::DivideSingle(RWDigits Q, digit_t* remainder, Digits A,
+ digit_t b) {
+ DCHECK(b != 0); // NOLINT(readability/check)
+ DCHECK(A.len() > 0); // NOLINT(readability/check)
+ *remainder = 0;
+ int length = A.len();
+ if (Q.len() != 0) {
+ if (A[length - 1] >= b) {
+ DCHECK(Q.len() >= A.len());
+ for (int i = length - 1; i >= 0; i--) {
+ Q[i] = digit_div(*remainder, A[i], b, remainder);
+ }
+ for (int i = length; i < Q.len(); i++) Q[i] = 0;
+ } else {
+ DCHECK(Q.len() >= A.len() - 1);
+ *remainder = A[length - 1];
+ for (int i = length - 2; i >= 0; i--) {
+ Q[i] = digit_div(*remainder, A[i], b, remainder);
+ }
+ for (int i = length - 1; i < Q.len(); i++) Q[i] = 0;
+ }
+ } else {
+ for (int i = length - 1; i >= 0; i--) {
+ digit_div(*remainder, A[i], b, remainder);
+ }
+ }
+}
+
+// Z += X. Returns the "carry" (0 or 1) after adding all of X's digits.
+inline digit_t InplaceAdd(RWDigits Z, Digits X) {
+ return AddAndReturnCarry(Z, Z, X);
+}
+
+// Z -= X. Returns the "borrow" (0 or 1) after subtracting all of X's digits.
+inline digit_t InplaceSub(RWDigits Z, Digits X) {
+ return SubtractAndReturnBorrow(Z, Z, X);
+}
+
+// Returns whether (factor1 * factor2) > (high << kDigitBits) + low.
+bool ProductGreaterThan(digit_t factor1, digit_t factor2, digit_t high,
+ digit_t low) {
+ digit_t result_high;
+ digit_t result_low = digit_mul(factor1, factor2, &result_high);
+ return result_high > high || (result_high == high && result_low > low);
+}
+
+#if DEBUG
+bool QLengthOK(Digits Q, Digits A, Digits B) {
+ // If A's top B.len digits are greater than or equal to B, then the division
+ // result will be greater than A.len - B.len, otherwise it will be that
+ // difference. Intuitively: 100/10 has 2 digits, 100/11 has 1.
+ if (GreaterThanOrEqual(Digits(A, A.len() - B.len(), B.len()), B)) {
+ return Q.len() >= A.len() - B.len() + 1;
+ }
+ return Q.len() >= A.len() - B.len();
+}
+#endif
+
+// Computes Q(uotient) and R(emainder) for A/B, such that
+// Q = (A - R) / B, with 0 <= R < B.
+// Both Q and R are optional: callers that are only interested in one of them
+// can pass the other with len == 0.
+// If Q is present, its length must be at least A.len - B.len + 1.
+// If R is present, its length must be at least B.len.
+// See Knuth, Volume 2, section 4.3.1, Algorithm D.
+void ProcessorImpl::DivideSchoolbook(RWDigits Q, RWDigits R, Digits A,
+ Digits B) {
+ // NOLINTNEXTLINE(readability/check)
+ DCHECK(B.len() >= 2); // Use DivideSingle otherwise.
+ DCHECK(A.len() >= B.len()); // No-op otherwise.
+ DCHECK(Q.len() == 0 || QLengthOK(Q, A, B));
+ DCHECK(R.len() == 0 || R.len() >= B.len());
+ // The unusual variable names inside this function are consistent with
+ // Knuth's book, as well as with Go's implementation of this algorithm.
+ // Maintaining this consistency is probably more useful than trying to
+ // come up with more descriptive names for them.
+ const int n = B.len();
+ const int m = A.len() - n;
+
+ // In each iteration, {qhatv} holds {divisor} * {current quotient digit}.
+ // "v" is the book's name for {divisor}, "qhat" the current quotient digit.
+ ScratchDigits qhatv(n + 1);
+
+ // D1.
+ // Left-shift inputs so that the divisor's MSB is set. This is necessary
+ // to prevent the digit-wise divisions (see digit_div call below) from
+ // overflowing (they take a two digits wide input, and return a one digit
+ // result).
+ ShiftedDigits b_normalized(B);
+ B = b_normalized;
+ // U holds the (continuously updated) remaining part of the dividend, which
+ // eventually becomes the remainder.
+ ScratchDigits U(A.len() + 1);
+ LeftShift(U, A, b_normalized.shift());
+
+ // D2.
+ // Iterate over the dividend's digits (like the "grad school" algorithm).
+ // {vn1} is the divisor's most significant digit.
+ digit_t vn1 = B[n - 1];
+ for (int j = m; j >= 0; j--) {
+ // D3.
+ // Estimate the current iteration's quotient digit (see Knuth for details).
+ // {qhat} is the current quotient digit.
+ digit_t qhat = std::numeric_limits<digit_t>::max();
+ // {ujn} is the dividend's most significant remaining digit.
+ digit_t ujn = U[j + n];
+ if (ujn != vn1) {
+ // {rhat} is the current iteration's remainder.
+ digit_t rhat = 0;
+ // Estimate the current quotient digit by dividing the most significant
+ // digits of dividend and divisor. The result will not be too small,
+ // but could be a bit too large.
+ qhat = digit_div(ujn, U[j + n - 1], vn1, &rhat);
+
+ // Decrement the quotient estimate as needed by looking at the next
+ // digit, i.e. by testing whether
+ // qhat * v_{n-2} > (rhat << kDigitBits) + u_{j+n-2}.
+ digit_t vn2 = B[n - 2];
+ digit_t ujn2 = U[j + n - 2];
+ while (ProductGreaterThan(qhat, vn2, rhat, ujn2)) {
+ qhat--;
+ digit_t prev_rhat = rhat;
+ rhat += vn1;
+ // v[n-1] >= 0, so this tests for overflow.
+ if (rhat < prev_rhat) break;
+ }
+ }
+
+ // D4.
+ // Multiply the divisor with the current quotient digit, and subtract
+ // it from the dividend. If there was "borrow", then the quotient digit
+ // was one too high, so we must correct it and undo one subtraction of
+ // the (shifted) divisor.
+ if (qhat == 0) {
+ qhatv.Clear();
+ } else {
+ MultiplySingle(qhatv, B, qhat);
+ }
+ digit_t c = InplaceSub(U + j, qhatv);
+ if (c != 0) {
+ c = InplaceAdd(U + j, B);
+ U[j + n] = U[j + n] + c;
+ qhat--;
+ }
+
+ if (Q.len() != 0) {
+ if (j >= Q.len()) {
+ DCHECK(qhat == 0); // NOLINT(readability/check)
+ } else {
+ Q[j] = qhat;
+ }
+ }
+ }
+ if (R.len() != 0) {
+ RightShift(R, U, b_normalized.shift());
+ }
+ // If Q has extra storage, clear it.
+ for (int i = m + 1; i < Q.len(); i++) Q[i] = 0;
+}
+
+} // namespace bigint
+} // namespace v8
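For intuition on the D3 estimate and its correction loop above, a base-10 analogy (illustrative numbers only): with normalized divisor 59 (vn1 = 5, vn2 = 9) and current dividend window 450, the first estimate is qhat = floor(45 / 5) = 9 with rhat = 0. Since 9 * 9 = 81 > 0 * 10 + 0, qhat drops to 8 and rhat becomes 5; since 8 * 9 = 72 > 5 * 10 + 0, qhat drops to 7 and rhat becomes 10, and now 7 * 9 = 63 is not greater than 10 * 10 + 0, so the loop ends. Indeed floor(450 / 59) = 7, and the D4 subtraction does not borrow, so no add-back is needed in this case.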
diff --git a/deps/v8/src/bigint/mul-fft.cc b/deps/v8/src/bigint/mul-fft.cc
new file mode 100644
index 0000000000..a3971a7276
--- /dev/null
+++ b/deps/v8/src/bigint/mul-fft.cc
@@ -0,0 +1,790 @@
+// Copyright 2021 the V8 project authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+// FFT-based multiplication, due to Schönhage and Strassen.
+// This implementation mostly follows the description given in:
+// Christoph Lüders: Fast Multiplication of Large Integers,
+// http://arxiv.org/abs/1503.04955
+
+#include "src/bigint/bigint-internal.h"
+#include "src/bigint/digit-arithmetic.h"
+#include "src/bigint/util.h"
+#include "src/bigint/vector-arithmetic.h"
+
+namespace v8 {
+namespace bigint {
+
+namespace {
+
+////////////////////////////////////////////////////////////////////////////////
+// Part 1: Functions for "mod F_n" arithmetic.
+// F_n is of the shape 2^K + 1, and for convenience we use K to count the
+// number of digits rather than the number of bits, so F_n (or K) are implicit
+// and deduced from the length {len} of the digits array.
+
+// Helper function for {ModFn} below.
+void ModFn_Helper(digit_t* x, int len, signed_digit_t high) {
+ if (high > 0) {
+ digit_t borrow = high;
+ x[len - 1] = 0;
+ for (int i = 0; i < len; i++) {
+ x[i] = digit_sub(x[i], borrow, &borrow);
+ if (borrow == 0) break;
+ }
+ } else {
+ digit_t carry = -high;
+ x[len - 1] = 0;
+ for (int i = 0; i < len; i++) {
+ x[i] = digit_add2(x[i], carry, &carry);
+ if (carry == 0) break;
+ }
+ }
+}
+
+// {x} := {x} mod F_n, assuming that {x} is "slightly" larger than F_n (e.g.
+// after addition of two numbers that were mod-F_n-normalized before).
+void ModFn(digit_t* x, int len) {
+ int K = len - 1;
+ signed_digit_t high = x[K];
+ if (high == 0) return;
+ ModFn_Helper(x, len, high);
+ high = x[K];
+ if (high == 0) return;
+ DCHECK(high == 1 || high == -1);
+ ModFn_Helper(x, len, high);
+ high = x[K];
+ if (high == -1) ModFn_Helper(x, len, high);
+}
+
+// {dest} := {src} mod F_n, assuming that {src} is about twice as long as F_n
+// (e.g. after multiplication of two numbers that were mod-F_n-normalized
+// before).
+// {len} is length of {dest}; {src} is twice as long.
+void ModFnDoubleWidth(digit_t* dest, const digit_t* src, int len) {
+ int K = len - 1;
+ digit_t borrow = 0;
+ for (int i = 0; i < K; i++) {
+ dest[i] = digit_sub2(src[i], src[i + K], borrow, &borrow);
+ }
+ dest[K] = digit_sub2(0, src[2 * K], borrow, &borrow);
+ // {borrow} may be non-zero here, that's OK as {ModFn} will take care of it.
+ ModFn(dest, len);
+}
+
+// Sets {sum} := {a} + {b} and {diff} := {a} - {b}, which is more efficient
+// than computing sum and difference separately. Applies "mod F_n" normalization
+// to both results.
+void SumDiff(digit_t* sum, digit_t* diff, const digit_t* a, const digit_t* b,
+ int len) {
+ digit_t carry = 0;
+ digit_t borrow = 0;
+ for (int i = 0; i < len; i++) {
+ // Read both values first, because inputs and outputs can overlap.
+ digit_t ai = a[i];
+ digit_t bi = b[i];
+ sum[i] = digit_add3(ai, bi, carry, &carry);
+ diff[i] = digit_sub2(ai, bi, borrow, &borrow);
+ }
+ ModFn(sum, len);
+ ModFn(diff, len);
+}
+
+// {result} := ({input} << shift) mod F_n, where shift >= K.
+void ShiftModFn_Large(digit_t* result, const digit_t* input, int digit_shift,
+ int bits_shift, int K) {
+ // If {digit_shift} is greater than K, we use the following transformation
+ // (where, since everything is mod 2^K + 1, we are allowed to add or
+ // subtract any multiple of 2^K + 1 at any time):
+ // x * 2^{K+m} mod 2^K + 1
+ // == x * 2^K * 2^m - (2^K + 1)*(x * 2^m) mod 2^K + 1
+ // == x * 2^K * 2^m - x * 2^K * 2^m - x * 2^m mod 2^K + 1
+ // == -x * 2^m mod 2^K + 1
+ // So the flow is the same as for m < K, but we invert the subtraction's
+ // operands. In order to avoid underflow, we virtually initialize the
+ // result to 2^K + 1:
+ // input = [ iK ][iK-1] .... .... [ i1 ][ i0 ]
+ // result = [ 1][0000] .... .... [0000][0001]
+ // + [ iK ] .... [ iX ]
+ // - [iX-1] .... [ i0 ]
+ DCHECK(digit_shift >= K);
+ digit_shift -= K;
+ digit_t borrow = 0;
+ if (bits_shift == 0) {
+ digit_t carry = 1;
+ for (int i = 0; i < digit_shift; i++) {
+ result[i] = digit_add2(input[i + K - digit_shift], carry, &carry);
+ }
+ result[digit_shift] = digit_sub(input[K] + carry, input[0], &borrow);
+ for (int i = digit_shift + 1; i < K; i++) {
+ digit_t d = input[i - digit_shift];
+ result[i] = digit_sub2(0, d, borrow, &borrow);
+ }
+ } else {
+ digit_t add_carry = 1;
+ digit_t input_carry =
+ input[K - digit_shift - 1] >> (kDigitBits - bits_shift);
+ for (int i = 0; i < digit_shift; i++) {
+ digit_t d = input[i + K - digit_shift];
+ digit_t summand = (d << bits_shift) | input_carry;
+ result[i] = digit_add2(summand, add_carry, &add_carry);
+ input_carry = d >> (kDigitBits - bits_shift);
+ }
+ {
+ // result[digit_shift] = (add_carry + iK_part) - i0_part
+ digit_t d = input[K];
+ digit_t iK_part = (d << bits_shift) | input_carry;
+ digit_t iK_carry = d >> (kDigitBits - bits_shift);
+ digit_t sum = digit_add2(add_carry, iK_part, &add_carry);
+ // {iK_carry} is less than a full digit, so we can merge {add_carry}
+ // into it without overflow.
+ iK_carry += add_carry;
+ d = input[0];
+ digit_t i0_part = d << bits_shift;
+ result[digit_shift] = digit_sub(sum, i0_part, &borrow);
+ input_carry = d >> (kDigitBits - bits_shift);
+ if (digit_shift + 1 < K) {
+ digit_t d = input[1];
+ digit_t subtrahend = (d << bits_shift) | input_carry;
+ result[digit_shift + 1] =
+ digit_sub2(iK_carry, subtrahend, borrow, &borrow);
+ input_carry = d >> (kDigitBits - bits_shift);
+ }
+ }
+ for (int i = digit_shift + 2; i < K; i++) {
+ digit_t d = input[i - digit_shift];
+ digit_t subtrahend = (d << bits_shift) | input_carry;
+ result[i] = digit_sub2(0, subtrahend, borrow, &borrow);
+ input_carry = d >> (kDigitBits - bits_shift);
+ }
+ }
+ // The virtual 1 in result[K] should be eliminated by {borrow}. If there
+ // is no borrow, then the virtual initialization was too much. Subtract
+ // 2^K + 1.
+ result[K] = 0;
+ if (borrow != 1) {
+ borrow = 1;
+ for (int i = 0; i < K; i++) {
+ result[i] = digit_sub(result[i], borrow, &borrow);
+ if (borrow == 0) break;
+ }
+ if (borrow != 0) {
+ // The result must be 2^K.
+ for (int i = 0; i < K; i++) result[i] = 0;
+ result[K] = 1;
+ }
+ }
+}
+
+// Sets {result} := {input} * 2^{power_of_two} mod 2^{K} + 1.
+// This function is highly relevant for overall performance.
+void ShiftModFn(digit_t* result, const digit_t* input, int power_of_two, int K,
+ int zero_above = 0x7FFFFFFF) {
+ // The modulo-reduction amounts to a subtraction, which we combine
+ // with the shift as follows:
+ // input = [ iK ][iK-1] .... .... [ i1 ][ i0 ]
+ // result = [iX-1] .... [ i0 ] <<<<<<<<<<< shift by {power_of_two}
+ // - [ iK ] .... [ iX ]
+ // where "X" is the index "K - digit_shift".
+ int digit_shift = power_of_two / kDigitBits;
+ int bits_shift = power_of_two % kDigitBits;
+ // By an analogous construction to the "digit_shift >= K" case,
+ // it turns out that:
+ // x * 2^{2K+m} == x * 2^m mod 2^K + 1.
+ while (digit_shift >= 2 * K) digit_shift -= 2 * K; // Faster than '%'!
+ if (digit_shift >= K) {
+ return ShiftModFn_Large(result, input, digit_shift, bits_shift, K);
+ }
+ digit_t borrow = 0;
+ if (bits_shift == 0) {
+ // We do a single pass over {input}, starting by copying digits [i1] to
+ // [iX-1] to result indices digit_shift+1 to K-1.
+ int i = 1;
+ // Read input digits unless we know they are zero.
+ int cap = std::min(K - digit_shift, zero_above);
+ for (; i < cap; i++) {
+ result[i + digit_shift] = input[i];
+ }
+ // Any remaining work can hard-code the knowledge that input[i] == 0.
+ for (; i < K - digit_shift; i++) {
+ DCHECK(input[i] == 0); // NOLINT(readability/check)
+ result[i + digit_shift] = 0;
+ }
+ // Second phase: subtract input digits [iX] to [iK] from (virtually) zero-
+ // initialized result indices 0 to digit_shift-1.
+ cap = std::min(K, zero_above);
+ for (; i < cap; i++) {
+ digit_t d = input[i];
+ result[i - K + digit_shift] = digit_sub2(0, d, borrow, &borrow);
+ }
+ // Any remaining work can hard-code the knowledge that input[i] == 0.
+ for (; i < K; i++) {
+ DCHECK(input[i] == 0); // NOLINT(readability/check)
+ result[i - K + digit_shift] = digit_sub(0, borrow, &borrow);
+ }
+ // Last step: subtract [iK] from [i0] and store at result index digit_shift.
+ result[digit_shift] = digit_sub2(input[0], input[K], borrow, &borrow);
+ } else {
+ // Same flow as before, but taking bits_shift != 0 into account.
+ // First phase: result indices digit_shift+1 to K.
+ digit_t carry = 0;
+ int i = 0;
+ // Read input digits unless we know they are zero.
+ int cap = std::min(K - digit_shift, zero_above);
+ for (; i < cap; i++) {
+ digit_t d = input[i];
+ result[i + digit_shift] = (d << bits_shift) | carry;
+ carry = d >> (kDigitBits - bits_shift);
+ }
+ // Any remaining work can hard-code the knowledge that input[i] == 0.
+ for (; i < K - digit_shift; i++) {
+ DCHECK(input[i] == 0); // NOLINT(readability/check)
+ result[i + digit_shift] = carry;
+ carry = 0;
+ }
+ // Second phase: result indices 0 to digit_shift - 1.
+ cap = std::min(K, zero_above);
+ for (; i < cap; i++) {
+ digit_t d = input[i];
+ result[i - K + digit_shift] =
+ digit_sub2(0, (d << bits_shift) | carry, borrow, &borrow);
+ carry = d >> (kDigitBits - bits_shift);
+ }
+ // Any remaining work can hard-code the knowledge that input[i] == 0.
+ if (i < K) {
+ DCHECK(input[i] == 0); // NOLINT(readability/check)
+ result[i - K + digit_shift] = digit_sub2(0, carry, borrow, &borrow);
+ carry = 0;
+ i++;
+ }
+ for (; i < K; i++) {
+ DCHECK(input[i] == 0); // NOLINT(readability/check)
+ result[i - K + digit_shift] = digit_sub(0, borrow, &borrow);
+ }
+ // Last step: compute result[digit_shift].
+ digit_t d = input[K];
+ result[digit_shift] = digit_sub2(
+ result[digit_shift], (d << bits_shift) | carry, borrow, &borrow);
+ // No carry left.
+ DCHECK((d >> (kDigitBits - bits_shift)) == 0); // NOLINT(readability/check)
+ }
+ result[K] = 0;
+ for (int i = digit_shift + 1; i <= K && borrow > 0; i++) {
+ result[i] = digit_sub(result[i], borrow, &borrow);
+ }
+ if (borrow > 0) {
+ // Underflow means we subtracted too much. Add 2^K + 1.
+ digit_t carry = 1;
+ for (int i = 0; i <= K; i++) {
+ result[i] = digit_add2(result[i], carry, &carry);
+ if (carry == 0) break;
+ }
+ result[K] = digit_add2(result[K], 1, &carry);
+ }
+}
+
+////////////////////////////////////////////////////////////////////////////////
+// Part 2: FFT-based multiplication is very sensitive to appropriate choice
+// of parameters. The following functions choose the parameters that the
+// subsequent actual computation will use. This is partially based on formal
+// constraints and partially on experimentally-determined heuristics.
+
+struct Parameters {
+ // We never use the default values, but skipping zero-initialization
+ // of these fields saddens and confuses MSan.
+ int m{0};
+ int K{0};
+ int n{0};
+ int s{0};
+ int r{0};
+};
+
+// Computes parameters for the main calculation, given a bit length {N} and
+// an {m}. See the paper for details.
+void ComputeParameters(int N, int m, Parameters* params) {
+ N *= kDigitBits;
+ int n = 1 << m; // 2^m
+ int nhalf = n >> 1;
+ int s = (N + n - 1) >> m; // ceil(N/n)
+ s = RoundUp(s, kDigitBits);
+ int K = m + 2 * s + 1; // K must be at least this big...
+ K = RoundUp(K, nhalf); // ...and a multiple of n/2.
+ int r = K >> (m - 1); // Which multiple?
+
+ // We want recursive calls to make progress, so force K to be a multiple
+ // of 8 if it's above the recursion threshold. Otherwise, K must be a
+ // multiple of kDigitBits.
+ const int threshold = (K + 1 >= kFftInnerThreshold * kDigitBits)
+ ? 3 + kLog2DigitBits
+ : kLog2DigitBits;
+ int K_tz = CountTrailingZeros(K);
+ while (K_tz < threshold) {
+ K += (1 << K_tz);
+ r = K >> (m - 1);
+ K_tz = CountTrailingZeros(K);
+ }
+
+ DCHECK(K % kDigitBits == 0); // NOLINT(readability/check)
+ DCHECK(s % kDigitBits == 0); // NOLINT(readability/check)
+ params->K = K / kDigitBits;
+ params->s = s / kDigitBits;
+ params->n = n;
+ params->r = r;
+}
+
+// Computes parameters for recursive invocations ("inner layer").
+void ComputeParameters_Inner(int N, Parameters* params) {
+ int max_m = CountTrailingZeros(N);
+ int N_bits = BitLength(N);
+ int m = N_bits - 4; // Don't let s get too small.
+ m = std::min(max_m, m);
+ N *= kDigitBits;
+ int n = 1 << m; // 2^m
+ // We can't round up s in the inner layer, because N = n*s is fixed.
+ int s = N >> m;
+ DCHECK(N == s * n);
+ int K = m + 2 * s + 1; // K must be at least this big...
+ K = RoundUp(K, n); // ...and a multiple of n and kDigitBits.
+ K = RoundUp(K, kDigitBits);
+ params->r = K >> m; // Which multiple?
+ DCHECK(K % kDigitBits == 0); // NOLINT(readability/check)
+ DCHECK(s % kDigitBits == 0); // NOLINT(readability/check)
+ params->K = K / kDigitBits;
+ params->s = s / kDigitBits;
+ params->n = n;
+ params->m = m;
+}
+
+int PredictInnerK(int N) {
+ Parameters params;
+ ComputeParameters_Inner(N, &params);
+ return params.K;
+}
+
+// Applies heuristics to decide whether {m} should be decremented, by looking
+// at what would happen to {K} and {s} if {m} was decremented.
+bool ShouldDecrementM(const Parameters& current, const Parameters& next,
+ const Parameters& after_next) {
+ // K == 64 seems to work particularly well.
+ if (current.K == 64 && next.K >= 112) return false;
+ // Small values for s are never efficient.
+ if (current.s < 6) return true;
+ // The time is roughly determined by K * n. When we decrement m, then
+ // n always halves, and K usually gets bigger, by up to 2x.
+ // For not-quite-so-small s, look at how much bigger K would get: if
+ // the K increase is small enough, making n smaller is worth it.
+ // Empirically, it's most meaningful to look at the K *after* next.
+ // The specific threshold values have been chosen by running many
+ // benchmarks on inputs of many sizes, and manually selecting thresholds
+ // that seemed to produce good results.
+ double factor = static_cast<double>(after_next.K) / current.K;
+ if ((current.s == 6 && factor < 3.85) || // --
+ (current.s == 7 && factor < 3.73) || // --
+ (current.s == 8 && factor < 3.55) || // --
+ (current.s == 9 && factor < 3.50) || // --
+ factor < 3.4) {
+ return true;
+ }
+ // If K is just below the recursion threshold, make sure we do recurse,
+ // unless doing so would be particularly inefficient (large inner_K).
+ // If K is just above the recursion threshold, doubling it often makes
+ // the inner call more efficient.
+ if (current.K >= 160 && current.K < 250 && PredictInnerK(next.K) < 28) {
+ return true;
+ }
+ // If we found no reason to decrement, keep m as large as possible.
+ return false;
+}
+
+// Decides what parameters to use for a given input bit length {N}.
+// Returns the chosen m.
+int GetParameters(int N, Parameters* params) {
+ int N_bits = BitLength(N);
+ int max_m = N_bits - 3; // Larger m make s too small.
+ max_m = std::max(kLog2DigitBits, max_m); // Smaller m break the logic below.
+ int m = max_m;
+ Parameters current;
+ ComputeParameters(N, m, &current);
+ Parameters next;
+ ComputeParameters(N, m - 1, &next);
+ while (m > 2) {
+ Parameters after_next;
+ ComputeParameters(N, m - 2, &after_next);
+ if (ShouldDecrementM(current, next, after_next)) {
+ m--;
+ current = next;
+ next = after_next;
+ } else {
+ break;
+ }
+ }
+ *params = current;
+ return m;
+}
+
+////////////////////////////////////////////////////////////////////////////////
+// Part 3: Fast Fourier Transformation.
+
+class FFTContainer {
+ public:
+ // {n} is the number of chunks, whose length is {K}+1.
+ // {K} determines F_n = 2^(K * kDigitBits) + 1.
+ FFTContainer(int n, int K, ProcessorImpl* processor)
+ : n_(n), K_(K), length_(K + 1), processor_(processor) {
+ storage_ = new digit_t[length_ * n_];
+ part_ = new digit_t*[n_];
+ digit_t* ptr = storage_;
+ for (int i = 0; i < n; i++, ptr += length_) {
+ part_[i] = ptr;
+ }
+ temp_ = new digit_t[length_ * 2];
+ }
+ FFTContainer() = delete;
+ FFTContainer(const FFTContainer&) = delete;
+ FFTContainer& operator=(const FFTContainer&) = delete;
+
+ ~FFTContainer() {
+ delete[] storage_;
+ delete[] part_;
+ delete[] temp_;
+ }
+
+ void Start_Default(Digits X, int chunk_size, int theta, int omega);
+ void Start(Digits X, int chunk_size, int theta, int omega);
+
+ void NormalizeAndRecombine(int omega, int m, RWDigits Z, int chunk_size);
+ void CounterWeightAndRecombine(int theta, int m, RWDigits Z, int chunk_size);
+
+ void FFT_ReturnShuffledThreadsafe(int start, int len, int omega,
+ digit_t* temp);
+ void FFT_Recurse(int start, int half, int omega, digit_t* temp);
+
+ void BackwardFFT(int start, int len, int omega);
+ void BackwardFFT_Threadsafe(int start, int len, int omega, digit_t* temp);
+
+ void PointwiseMultiply(const FFTContainer& other);
+ void DoPointwiseMultiplication(const FFTContainer& other, int start, int end,
+ digit_t* temp);
+
+ int length() const { return length_; }
+
+ private:
+ const int n_; // Number of parts.
+ const int K_; // Always length_ - 1.
+ const int length_; // Length of each part, in digits.
+ ProcessorImpl* processor_;
+ digit_t* storage_; // Combined storage of all parts.
+ digit_t** part_; // Pointers to each part.
+ digit_t* temp_; // Temporary storage with size 2 * length_.
+};
+
+inline void CopyAndZeroExtend(digit_t* dst, const digit_t* src,
+ int digits_to_copy, size_t total_bytes) {
+ size_t bytes_to_copy = digits_to_copy * sizeof(digit_t);
+ memcpy(dst, src, bytes_to_copy);
+ memset(dst + digits_to_copy, 0, total_bytes - bytes_to_copy);
+}
+
+// Reads {X} into the FFTContainer's internal storage, dividing it into chunks
+// while doing so; then performs the forward FFT.
+void FFTContainer::Start_Default(Digits X, int chunk_size, int theta,
+ int omega) {
+ int len = X.len();
+ const digit_t* pointer = X.digits();
+ const size_t part_length_in_bytes = length_ * sizeof(digit_t);
+ int current_theta = 0;
+ int i = 0;
+ for (; i < n_ && len > 0; i++, current_theta += theta) {
+ chunk_size = std::min(chunk_size, len);
+ // For invocations via MultiplyFFT_Inner, X.len() == n_ * chunk_size + 1,
+ // because the outer layer's "K" is passed as the inner layer's "N".
+ // Since X is (mod Fn)-normalized on the outer layer, there is the rare
+ // corner case where X[n_ * chunk_size] == 1. Detect that case, and handle
+ // the extra bit as part of the last chunk; we always have the space.
+ if (i == n_ - 1 && len == chunk_size + 1) {
+ DCHECK(X[n_ * chunk_size] <= 1); // NOLINT(readability/check)
+ DCHECK(length_ >= chunk_size + 1);
+ chunk_size++;
+ }
+ if (current_theta != 0) {
+ // Multiply with theta^i, and reduce modulo 2^K + 1.
+ // We pass theta as a shift amount; it really means 2^theta.
+ CopyAndZeroExtend(temp_, pointer, chunk_size, part_length_in_bytes);
+ ShiftModFn(part_[i], temp_, current_theta, K_, chunk_size);
+ } else {
+ CopyAndZeroExtend(part_[i], pointer, chunk_size, part_length_in_bytes);
+ }
+ pointer += chunk_size;
+ len -= chunk_size;
+ }
+ DCHECK(len == 0); // NOLINT(readability/check)
+ for (; i < n_; i++) {
+ memset(part_[i], 0, part_length_in_bytes);
+ }
+ FFT_ReturnShuffledThreadsafe(0, n_, omega, temp_);
+}
+
+// This version of Start is optimized for the case where ~half of the
+// container will be filled with padding zeros.
+void FFTContainer::Start(Digits X, int chunk_size, int theta, int omega) {
+ int len = X.len();
+ if (len > n_ * chunk_size / 2) {
+ return Start_Default(X, chunk_size, theta, omega);
+ }
+ DCHECK(theta == 0); // NOLINT(readability/check)
+ const digit_t* pointer = X.digits();
+ const size_t part_length_in_bytes = length_ * sizeof(digit_t);
+ int nhalf = n_ / 2;
+ // Unrolled first iteration.
+ CopyAndZeroExtend(part_[0], pointer, chunk_size, part_length_in_bytes);
+ CopyAndZeroExtend(part_[nhalf], pointer, chunk_size, part_length_in_bytes);
+ pointer += chunk_size;
+ len -= chunk_size;
+ int i = 1;
+ for (; i < nhalf && len > 0; i++) {
+ chunk_size = std::min(chunk_size, len);
+ CopyAndZeroExtend(part_[i], pointer, chunk_size, part_length_in_bytes);
+ int w = omega * i;
+ ShiftModFn(part_[i + nhalf], part_[i], w, K_, chunk_size);
+ pointer += chunk_size;
+ len -= chunk_size;
+ }
+ for (; i < nhalf; i++) {
+ memset(part_[i], 0, part_length_in_bytes);
+ memset(part_[i + nhalf], 0, part_length_in_bytes);
+ }
+ FFT_Recurse(0, nhalf, omega, temp_);
+}
+
+// Forward transformation.
+// We use the "DIF" aka "decimation in frequency" transform, because it
+// leaves the result in "bit reversed" order, which is precisely what we
+// need as input for the "DIT" aka "decimation in time" backwards transform.
+void FFTContainer::FFT_ReturnShuffledThreadsafe(int start, int len, int omega,
+ digit_t* temp) {
+ DCHECK((len & 1) == 0); // {len} must be even. NOLINT(readability/check)
+ int half = len / 2;
+ SumDiff(part_[start], part_[start + half], part_[start], part_[start + half],
+ length_);
+ for (int k = 1; k < half; k++) {
+ SumDiff(part_[start + k], temp, part_[start + k], part_[start + half + k],
+ length_);
+ int w = omega * k;
+ ShiftModFn(part_[start + half + k], temp, w, K_);
+ }
+ FFT_Recurse(start, half, omega, temp);
+}
+
+// Recursive step of the above, factored out for additional callers.
+void FFTContainer::FFT_Recurse(int start, int half, int omega, digit_t* temp) {
+ if (half > 1) {
+ FFT_ReturnShuffledThreadsafe(start, half, 2 * omega, temp);
+ FFT_ReturnShuffledThreadsafe(start + half, half, 2 * omega, temp);
+ }
+}
+
+// Backward transformation.
+// We use the "DIT" aka "decimation in time" transform here, because it
+// turns bit-reversed input into normally sorted output.
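+// Each level first rotates b = part[k + len/2] by 2^(omega*(len-k)), which is
+// in effect the inverse twiddle factor, and then replaces the pair with
+//   a' = a + b_rot,   b' = a - b_rot   (both mod F_n).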
+void FFTContainer::BackwardFFT(int start, int len, int omega) {
+ BackwardFFT_Threadsafe(start, len, omega, temp_);
+}
+
+void FFTContainer::BackwardFFT_Threadsafe(int start, int len, int omega,
+ digit_t* temp) {
+ DCHECK((len & 1) == 0); // {len} must be even. NOLINT(readability/check)
+ int half = len / 2;
+ // Don't recurse for half == 2, as PointwiseMultiply already performed
+ // the first level of the backwards FFT.
+ if (half > 2) {
+ BackwardFFT_Threadsafe(start, half, 2 * omega, temp);
+ BackwardFFT_Threadsafe(start + half, half, 2 * omega, temp);
+ }
+ SumDiff(part_[start], part_[start + half], part_[start], part_[start + half],
+ length_);
+ for (int k = 1; k < half; k++) {
+ int w = omega * (len - k);
+ ShiftModFn(temp, part_[start + half + k], w, K_);
+ SumDiff(part_[start + k], part_[start + half + k], part_[start + k], temp,
+ length_);
+ }
+}
+
+// Recombines the result's parts into {Z}, after backwards FFT.
+void FFTContainer::NormalizeAndRecombine(int omega, int m, RWDigits Z,
+ int chunk_size) {
+ Z.Clear();
+ int z_index = 0;
+ const int shift = n_ * omega - m;
+ for (int i = 0; i < n_; i++, z_index += chunk_size) {
+ digit_t* part = part_[i];
+ ShiftModFn(temp_, part, shift, K_);
+ digit_t carry = 0;
+ int zi = z_index;
+ int j = 0;
+ for (; j < length_ && zi < Z.len(); j++, zi++) {
+ Z[zi] = digit_add3(Z[zi], temp_[j], carry, &carry);
+ }
+ for (; j < length_; j++) {
+ DCHECK(temp_[j] == 0); // NOLINT(readability/check)
+ }
+ if (carry != 0) {
+ DCHECK(zi < Z.len());
+ Z[zi] = carry;
+ }
+ }
+}
+
+// Helper function for {CounterWeightAndRecombine} below.
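+// Returns true when {x} (an {xlen}-digit number) is at least
+// {threshold} * 2^(2 * s * kDigitBits); such a value can only be the
+// (mod F_n) representation of a negative coefficient.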
+bool ShouldBeNegative(const digit_t* x, int xlen, digit_t threshold, int s) {
+ if (x[2 * s] >= threshold) return true;
+ for (int i = 2 * s + 1; i < xlen; i++) {
+ if (x[i] > 0) return true;
+ }
+ return false;
+}
+
+// Same as {NormalizeAndRecombine} above, but for the needs of the recursive
+// invocation ("inner layer") of FFT multiplication, where an additional
+// counter-weighting step is required.
+void FFTContainer::CounterWeightAndRecombine(int theta, int m, RWDigits Z,
+ int s) {
+ Z.Clear();
+ int z_index = 0;
+ for (int k = 0; k < n_; k++, z_index += s) {
+ int shift = -theta * k - m;
+ if (shift < 0) shift += 2 * n_ * theta;
+ DCHECK(shift >= 0); // NOLINT(readability/check)
+ digit_t* input = part_[k];
+ ShiftModFn(temp_, input, shift, K_);
+ int remaining_z = Z.len() - z_index;
+ if (ShouldBeNegative(temp_, length_, k + 1, s)) {
+ // Subtract F_n from input before adding to result. We use the following
+ // transformation (knowing that X < F_n):
+ // Z + (X - F_n) == Z - (F_n - X)
+ digit_t borrow_z = 0;
+ digit_t borrow_Fn = 0;
+ {
+ // i == 0:
+ digit_t d = digit_sub(1, temp_[0], &borrow_Fn);
+ Z[z_index] = digit_sub(Z[z_index], d, &borrow_z);
+ }
+ int i = 1;
+ for (; i < K_ && i < remaining_z; i++) {
+ digit_t d = digit_sub2(0, temp_[i], borrow_Fn, &borrow_Fn);
+ Z[z_index + i] = digit_sub2(Z[z_index + i], d, borrow_z, &borrow_z);
+ }
+ DCHECK(i == K_ && K_ == length_ - 1);
+ for (; i < length_ && i < remaining_z; i++) {
+ digit_t d = digit_sub2(1, temp_[i], borrow_Fn, &borrow_Fn);
+ Z[z_index + i] = digit_sub2(Z[z_index + i], d, borrow_z, &borrow_z);
+ }
+ DCHECK(borrow_Fn == 0); // NOLINT(readability/check)
+ for (; borrow_z > 0 && i < remaining_z; i++) {
+ Z[z_index + i] = digit_sub(Z[z_index + i], borrow_z, &borrow_z);
+ }
+ } else {
+ digit_t carry = 0;
+ int i = 0;
+ for (; i < length_ && i < remaining_z; i++) {
+ Z[z_index + i] = digit_add3(Z[z_index + i], temp_[i], carry, &carry);
+ }
+ for (; i < length_; i++) {
+ DCHECK(temp_[i] == 0); // NOLINT(readability/check)
+ }
+ for (; carry > 0 && i < remaining_z; i++) {
+ Z[z_index + i] = digit_add2(Z[z_index + i], carry, &carry);
+ }
+ // {carry} might be != 0 here if Z was negative before. That's fine.
+ }
+ }
+}
+
+// Main FFT function for recursive invocations ("inner layer").
+void MultiplyFFT_Inner(RWDigits Z, Digits X, Digits Y, const Parameters& params,
+ ProcessorImpl* processor) {
+ int omega = 2 * params.r; // really: 2^(2r)
+ int theta = params.r; // really: 2^r
+
+ FFTContainer a(params.n, params.K, processor);
+ a.Start_Default(X, params.s, theta, omega);
+ FFTContainer b(params.n, params.K, processor);
+ b.Start_Default(Y, params.s, theta, omega);
+
+ a.PointwiseMultiply(b);
+ if (processor->should_terminate()) return;
+
+ FFTContainer& c = a;
+ c.BackwardFFT(0, params.n, omega);
+
+ c.CounterWeightAndRecombine(theta, params.m, Z, params.s);
+}
+
+// Actual implementation of pointwise multiplications.
+void FFTContainer::DoPointwiseMultiplication(const FFTContainer& other,
+ int start, int end,
+ digit_t* temp) {
+ // The (K_ & 3) != 0 condition makes sure that the inner FFT gets
+ // to split the work into at least 4 chunks.
+ bool use_fft = length_ >= kFftInnerThreshold && (K_ & 3) == 0;
+ Parameters params;
+ if (use_fft) ComputeParameters_Inner(K_, &params);
+ RWDigits result(temp, 2 * length_);
+ for (int i = start; i < end; i++) {
+ Digits A(part_[i], length_);
+ Digits B(other.part_[i], length_);
+ if (use_fft) {
+ MultiplyFFT_Inner(result, A, B, params, processor_);
+ } else {
+ processor_->Multiply(result, A, B);
+ }
+ if (processor_->should_terminate()) return;
+ ModFnDoubleWidth(part_[i], result.digits(), length_);
+ // To improve cache friendliness, we perform the first level of the
+ // backwards FFT here.
+ if ((i & 1) == 1) {
+ SumDiff(part_[i - 1], part_[i], part_[i - 1], part_[i], length_);
+ }
+ }
+}
+
+// Convenient entry point for pointwise multiplications.
+void FFTContainer::PointwiseMultiply(const FFTContainer& other) {
+ DCHECK(n_ == other.n_);
+ DoPointwiseMultiplication(other, 0, n_, temp_);
+}
+
+} // namespace
+
+////////////////////////////////////////////////////////////////////////////////
+// Part 4: Tying everything together into a multiplication algorithm.
+
+// TODO(jkummerow): Consider doing a "Mersenne transform" and CRT reconstruction
+// of the final result. Might yield a few percent of perf improvement.
+
+// TODO(jkummerow): Consider implementing the "sqrt(2) trick".
+// Gaudry/Kruppa/Zimmerman report that it saved them around 10%.
+
+void ProcessorImpl::MultiplyFFT(RWDigits Z, Digits X, Digits Y) {
+ Parameters params;
+ int m = GetParameters(X.len() + Y.len(), &params);
+ int omega = params.r; // really: 2^r
+
+ FFTContainer a(params.n, params.K, this);
+ a.Start(X, params.s, 0, omega);
+ if (X == Y) {
+ // Squaring.
+ a.PointwiseMultiply(a);
+ } else {
+ FFTContainer b(params.n, params.K, this);
+ b.Start(Y, params.s, 0, omega);
+ a.PointwiseMultiply(b);
+ }
+ if (should_terminate()) return;
+
+ a.BackwardFFT(0, params.n, omega);
+ a.NormalizeAndRecombine(omega, m, Z, params.s);
+}
+
+} // namespace bigint
+} // namespace v8
diff --git a/deps/v8/src/bigint/mul-karatsuba.cc b/deps/v8/src/bigint/mul-karatsuba.cc
new file mode 100644
index 0000000000..2a141f213c
--- /dev/null
+++ b/deps/v8/src/bigint/mul-karatsuba.cc
@@ -0,0 +1,205 @@
+// Copyright 2021 the V8 project authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+// Karatsuba multiplication. This is loosely based on Go's implementation
+// found at https://golang.org/src/math/big/nat.go, licensed as follows:
+//
+// Copyright 2009 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file [1].
+//
+// [1] https://golang.org/LICENSE
+
+#include <algorithm>
+#include <utility>
+
+#include "src/bigint/bigint-internal.h"
+#include "src/bigint/digit-arithmetic.h"
+#include "src/bigint/util.h"
+#include "src/bigint/vector-arithmetic.h"
+
+namespace v8 {
+namespace bigint {
+
+// If Karatsuba is the best supported algorithm, then it must check for
+// termination requests. If there are more advanced algorithms available
+// for larger inputs, then Karatsuba will only be used for sufficiently
+// small chunks that checking for termination requests is not necessary.
+#if V8_ADVANCED_BIGINT_ALGORITHMS
+#define MAYBE_TERMINATE
+#else
+#define MAYBE_TERMINATE \
+ if (should_terminate()) return;
+#endif
+
+namespace {
+
+// The Karatsuba algorithm sometimes finishes more quickly when the
+// input length is rounded up a bit. This method encodes some heuristics
+// to accomplish this. The details have been determined experimentally.
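+// For example, len == 100 has BitLength 7, so {shift} starts at 2 and is then
+// bumped to 3 (because 100 >> 2 == 0x19 >= 0x18); with {additive} == 7 the
+// result is ((100 + 7) >> 3) << 3 == 104.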
+int RoundUpLen(int len) {
+ if (len <= 36) return RoundUp(len, 2);
+ // Keep the 4 or 5 most significant non-zero bits.
+ int shift = BitLength(len) - 5;
+ if ((len >> shift) >= 0x18) {
+ shift++;
+ }
+  // Round up, unless we're only just above the threshold. This smooths out
+  // the steps by which running time increases with input size.
+ int additive = ((1 << shift) - 1);
+ if (shift >= 2 && (len & additive) < (1 << (shift - 2))) {
+ return len;
+ }
+ return ((len + additive) >> shift) << shift;
+}
+
+// This method makes the final decision how much to bump up the input size.
+int KaratsubaLength(int n) {
+ n = RoundUpLen(n);
+ int i = 0;
+ while (n > kKaratsubaThreshold) {
+ n >>= 1;
+ i++;
+ }
+ return n << i;
+}
+
+// Performs the specific subtraction required by {KaratsubaMain} below.
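+// Computes result = |X - Y|, flipping {*sign} when X < Y.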
+void KaratsubaSubtractionHelper(RWDigits result, Digits X, Digits Y,
+ int* sign) {
+ X.Normalize();
+ Y.Normalize();
+ digit_t borrow = 0;
+ int i = 0;
+ if (!GreaterThanOrEqual(X, Y)) {
+ *sign = -(*sign);
+ std::swap(X, Y);
+ }
+ for (; i < Y.len(); i++) {
+ result[i] = digit_sub2(X[i], Y[i], borrow, &borrow);
+ }
+ for (; i < X.len(); i++) {
+ result[i] = digit_sub(X[i], borrow, &borrow);
+ }
+ DCHECK(borrow == 0); // NOLINT(readability/check)
+ for (; i < result.len(); i++) result[i] = 0;
+}
+
+} // namespace
+
+void ProcessorImpl::MultiplyKaratsuba(RWDigits Z, Digits X, Digits Y) {
+ DCHECK(X.len() >= Y.len());
+ DCHECK(Y.len() >= kKaratsubaThreshold);
+ DCHECK(Z.len() >= X.len() + Y.len());
+ int k = KaratsubaLength(Y.len());
+ int scratch_len = 4 * k;
+ ScratchDigits scratch(scratch_len);
+ KaratsubaStart(Z, X, Y, scratch, k);
+}
+
+// Entry point for Karatsuba-based multiplication; handles inputs of unequal
+// length by chopping the larger one into chunks.
+void ProcessorImpl::KaratsubaStart(RWDigits Z, Digits X, Digits Y,
+ RWDigits scratch, int k) {
+ KaratsubaMain(Z, X, Y, scratch, k);
+ MAYBE_TERMINATE
+ for (int i = 2 * k; i < Z.len(); i++) Z[i] = 0;
+ if (k < Y.len() || X.len() != Y.len()) {
+ ScratchDigits T(2 * k);
+ // Add X0 * Y1 * b.
+ Digits X0(X, 0, k);
+ Digits Y1 = Y + std::min(k, Y.len());
+ if (Y1.len() > 0) {
+ KaratsubaChunk(T, X0, Y1, scratch);
+ MAYBE_TERMINATE
+ AddAndReturnOverflow(Z + k, T); // Can't overflow.
+ }
+
+ // Add Xi * Y0 << i and Xi * Y1 * b << (i + k).
+ Digits Y0(Y, 0, k);
+ for (int i = k; i < X.len(); i += k) {
+ Digits Xi(X, i, k);
+ KaratsubaChunk(T, Xi, Y0, scratch);
+ MAYBE_TERMINATE
+ AddAndReturnOverflow(Z + i, T); // Can't overflow.
+ if (Y1.len() > 0) {
+ KaratsubaChunk(T, Xi, Y1, scratch);
+ MAYBE_TERMINATE
+ AddAndReturnOverflow(Z + (i + k), T); // Can't overflow.
+ }
+ }
+ }
+}
+
+// Entry point for chunk-wise multiplications; selects an appropriate
+// algorithm for the inputs based on their sizes.
+void ProcessorImpl::KaratsubaChunk(RWDigits Z, Digits X, Digits Y,
+ RWDigits scratch) {
+ X.Normalize();
+ Y.Normalize();
+ if (X.len() == 0 || Y.len() == 0) return Z.Clear();
+ if (X.len() < Y.len()) std::swap(X, Y);
+ if (Y.len() == 1) return MultiplySingle(Z, X, Y[0]);
+ if (Y.len() < kKaratsubaThreshold) return MultiplySchoolbook(Z, X, Y);
+ int k = KaratsubaLength(Y.len());
+ DCHECK(scratch.len() >= 4 * k);
+ return KaratsubaStart(Z, X, Y, scratch, k);
+}
+
+// The main recursive Karatsuba method.
+void ProcessorImpl::KaratsubaMain(RWDigits Z, Digits X, Digits Y,
+ RWDigits scratch, int n) {
+ if (n < kKaratsubaThreshold) {
+ X.Normalize();
+ Y.Normalize();
+ if (X.len() >= Y.len()) {
+ return MultiplySchoolbook(RWDigits(Z, 0, 2 * n), X, Y);
+ } else {
+ return MultiplySchoolbook(RWDigits(Z, 0, 2 * n), Y, X);
+ }
+ }
+ DCHECK(scratch.len() >= 4 * n);
+ DCHECK((n & 1) == 0); // NOLINT(readability/check)
+ int n2 = n >> 1;
+ Digits X0(X, 0, n2);
+ Digits X1(X, n2, n2);
+ Digits Y0(Y, 0, n2);
+ Digits Y1(Y, n2, n2);
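+  // With B := 2^(kDigitBits * n2), the product is assembled as
+  //   X*Y = P0 + (P0 + P2 + sign * P1) * B + P2 * B^2,
+  // where P0 = X0*Y0, P2 = X1*Y1, and P1 = (X1 - X0) * (Y0 - Y1), with {sign}
+  // tracking whether that last product is positive or negative.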
+ RWDigits scratch_for_recursion(scratch, 2 * n, 2 * n);
+ RWDigits P0(scratch, 0, n);
+ KaratsubaMain(P0, X0, Y0, scratch_for_recursion, n2);
+ MAYBE_TERMINATE
+ for (int i = 0; i < n; i++) Z[i] = P0[i];
+ RWDigits P2(scratch, n, n);
+ KaratsubaMain(P2, X1, Y1, scratch_for_recursion, n2);
+ MAYBE_TERMINATE
+ RWDigits Z2 = Z + n;
+ int end = std::min(Z2.len(), P2.len());
+ for (int i = 0; i < end; i++) Z2[i] = P2[i];
+ for (int i = end; i < n; i++) {
+ DCHECK(P2[i] == 0); // NOLINT(readability/check)
+ }
+ // The intermediate result can be one digit too large; the subtraction
+ // below will fix this.
+ digit_t overflow = AddAndReturnOverflow(Z + n2, P0);
+ overflow += AddAndReturnOverflow(Z + n2, P2);
+ RWDigits X_diff(scratch, 0, n2);
+ RWDigits Y_diff(scratch, n2, n2);
+ int sign = 1;
+ KaratsubaSubtractionHelper(X_diff, X1, X0, &sign);
+ KaratsubaSubtractionHelper(Y_diff, Y0, Y1, &sign);
+ RWDigits P1(scratch, n, n);
+ KaratsubaMain(P1, X_diff, Y_diff, scratch_for_recursion, n2);
+ if (sign > 0) {
+ overflow += AddAndReturnOverflow(Z + n2, P1);
+ } else {
+ overflow -= SubAndReturnBorrow(Z + n2, P1);
+ }
+ // The intermediate result may have been bigger, but the final result fits.
+ DCHECK(overflow == 0); // NOLINT(readability/check)
+ USE(overflow);
+}
+
+} // namespace bigint
+} // namespace v8
diff --git a/deps/v8/src/bigint/mul-schoolbook.cc b/deps/v8/src/bigint/mul-schoolbook.cc
index 8e10685018..9222e1e675 100644
--- a/deps/v8/src/bigint/mul-schoolbook.cc
+++ b/deps/v8/src/bigint/mul-schoolbook.cc
@@ -71,7 +71,6 @@ void ProcessorImpl::MultiplySchoolbook(RWDigits Z, Digits X, Digits Y) {
next_carry = 0;
BODY(0, i);
AddWorkEstimate(i);
- if (should_terminate()) return;
}
// Last part: i exceeds Y now, we have to be careful about bounds.
int loop_end = X.len() + Y.len() - 2;
@@ -85,7 +84,6 @@ void ProcessorImpl::MultiplySchoolbook(RWDigits Z, Digits X, Digits Y) {
next_carry = 0;
BODY(min_x_index, max_x_index);
AddWorkEstimate(max_x_index - min_x_index);
- if (should_terminate()) return;
}
// Write the last digit, and zero out any extra space in Z.
Z[i++] = digit_add2(next, carry, &carry);
diff --git a/deps/v8/src/bigint/mul-toom.cc b/deps/v8/src/bigint/mul-toom.cc
new file mode 100644
index 0000000000..76a4fdcdfb
--- /dev/null
+++ b/deps/v8/src/bigint/mul-toom.cc
@@ -0,0 +1,223 @@
+// Copyright 2021 the V8 project authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+// Toom-Cook multiplication.
+// Reference: https://en.wikipedia.org/wiki/Toom%E2%80%93Cook_multiplication
+
+#include <algorithm>
+
+#include "src/bigint/bigint-internal.h"
+#include "src/bigint/digit-arithmetic.h"
+#include "src/bigint/vector-arithmetic.h"
+
+namespace v8 {
+namespace bigint {
+
+namespace {
+
+void TimesTwo(RWDigits X) {
+ digit_t carry = 0;
+ for (int i = 0; i < X.len(); i++) {
+ digit_t d = X[i];
+ X[i] = (d << 1) | carry;
+ carry = d >> (kDigitBits - 1);
+ }
+}
+
+void DivideByTwo(RWDigits X) {
+ digit_t carry = 0;
+ for (int i = X.len() - 1; i >= 0; i--) {
+ digit_t d = X[i];
+ X[i] = (d >> 1) | carry;
+ carry = d << (kDigitBits - 1);
+ }
+}
+
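+// Computes X /= 3, in place, using half-digit-wide divisions so that each
+// partial dividend fits into a digit_t. Any final remainder is dropped;
+// callers only use this on values known to be divisible by 3.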
+void DivideByThree(RWDigits X) {
+ digit_t remainder = 0;
+ for (int i = X.len() - 1; i >= 0; i--) {
+ digit_t d = X[i];
+ digit_t upper = (remainder << kHalfDigitBits) | (d >> kHalfDigitBits);
+ digit_t u_result = upper / 3;
+ remainder = upper - 3 * u_result;
+ digit_t lower = (remainder << kHalfDigitBits) | (d & kHalfDigitMask);
+ digit_t l_result = lower / 3;
+ remainder = lower - 3 * l_result;
+ X[i] = (u_result << kHalfDigitBits) | l_result;
+ }
+}
+
+} // namespace
+
+#if DEBUG
+// Set {len_} to 1 rather than 0 so that attempts to access the first digit
+// will crash.
+#define MARK_INVALID(D) D = RWDigits(nullptr, 1)
+#else
+#define MARK_INVALID(D) (void(0))
+#endif
+
+void ProcessorImpl::Toom3Main(RWDigits Z, Digits X, Digits Y) {
+ DCHECK(Z.len() >= X.len() + Y.len());
+ // Phase 1: Splitting.
+ int i = DIV_CEIL(std::max(X.len(), Y.len()), 3);
+ Digits X0(X, 0, i);
+ Digits X1(X, i, i);
+ Digits X2(X, 2 * i, i);
+ Digits Y0(Y, 0, i);
+ Digits Y1(Y, i, i);
+ Digits Y2(Y, 2 * i, i);
+
+ // Temporary storage.
+ int p_len = i + 1; // For all px, qx below.
+ int r_len = 2 * p_len; // For all r_x, Rx below.
+ Storage temp_storage(4 * r_len);
+ // We will use the same variable names as the Wikipedia article, as much as
+ // C++ lets us: our "p_m1" is their "p(-1)" etc. For consistency with other
+ // algorithms, we use X and Y where Wikipedia uses m and n.
+ // We will use and re-use the temporary storage as follows:
+ //
+ // chunk | -------- time ----------->
+ // [0 .. i] |( po )( p_m1 ) ( r_m2 )
+ // [i+1 .. rlen-1] |( qo )( q_m1 ) ( r_m2 )
+ // [rlen .. rlen+i] | (p_1 ) ( p_m2 ) (r_inf)
+ // [rlen+i+1 .. 2*rlen-1] | (q_1 ) ( q_m2 ) (r_inf)
+ // [2*rlen .. 3*rlen-1] | ( r_1 )
+ // [3*rlen .. 4*rlen-1] | ( r_m1 )
+ //
+ // This requires interleaving phases 2 and 3 a bit: after computing
+ // r_1 = p_1 * q_1, we can re-use p_1's storage for p_m2, and so on.
+ digit_t* t = temp_storage.get();
+ RWDigits po(t, p_len);
+ RWDigits qo(t + p_len, p_len);
+ RWDigits p_1(t + r_len, p_len);
+ RWDigits q_1(t + r_len + p_len, p_len);
+ RWDigits r_1(t + 2 * r_len, r_len);
+ RWDigits r_m1(t + 3 * r_len, r_len);
+
+ // We can also share the backing stores of Z, r_0, R0.
+ DCHECK(Z.len() >= r_len);
+ RWDigits r_0(Z, 0, r_len);
+
+ // Phase 2a: Evaluation, steps 0, 1, m1.
+ // po = X0 + X2
+ Add(po, X0, X2);
+ // p_0 = X0
+ // p_1 = po + X1
+ Add(p_1, po, X1);
+ // p_m1 = po - X1
+ RWDigits p_m1 = po;
+ bool p_m1_sign = SubtractSigned(p_m1, po, false, X1, false);
+ MARK_INVALID(po);
+
+ // qo = Y0 + Y2
+ Add(qo, Y0, Y2);
+ // q_0 = Y0
+ // q_1 = qo + Y1
+ Add(q_1, qo, Y1);
+ // q_m1 = qo - Y1
+ RWDigits q_m1 = qo;
+ bool q_m1_sign = SubtractSigned(q_m1, qo, false, Y1, false);
+ MARK_INVALID(qo);
+
+ // Phase 3a: Pointwise multiplication, steps 0, 1, m1.
+ Multiply(r_0, X0, Y0);
+ Multiply(r_1, p_1, q_1);
+ Multiply(r_m1, p_m1, q_m1);
+ bool r_m1_sign = p_m1_sign != q_m1_sign;
+
+ // Phase 2b: Evaluation, steps m2 and inf.
+ // p_m2 = (p_m1 + X2) * 2 - X0
+ RWDigits p_m2 = p_1;
+ MARK_INVALID(p_1);
+ bool p_m2_sign = AddSigned(p_m2, p_m1, p_m1_sign, X2, false);
+ TimesTwo(p_m2);
+ p_m2_sign = SubtractSigned(p_m2, p_m2, p_m2_sign, X0, false);
+ // p_inf = X2
+
+ // q_m2 = (q_m1 + Y2) * 2 - Y0
+ RWDigits q_m2 = q_1;
+ MARK_INVALID(q_1);
+ bool q_m2_sign = AddSigned(q_m2, q_m1, q_m1_sign, Y2, false);
+ TimesTwo(q_m2);
+ q_m2_sign = SubtractSigned(q_m2, q_m2, q_m2_sign, Y0, false);
+ // q_inf = Y2
+
+ // Phase 3b: Pointwise multiplication, steps m2 and inf.
+ RWDigits r_m2(t, r_len);
+ MARK_INVALID(p_m1);
+ MARK_INVALID(q_m1);
+ Multiply(r_m2, p_m2, q_m2);
+ bool r_m2_sign = p_m2_sign != q_m2_sign;
+
+ RWDigits r_inf(t + r_len, r_len);
+ MARK_INVALID(p_m2);
+ MARK_INVALID(q_m2);
+ Multiply(r_inf, X2, Y2);
+
+ // Phase 4: Interpolation.
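+  // We have the values of the product polynomial at x = 0, 1, -1, -2 and
+  // infinity (r_0, r_1, r_m1, r_m2, r_inf) and recover its coefficients
+  // R0..R4, so that the result is R0 + R1*B + R2*B^2 + R3*B^3 + R4*B^4
+  // with B = 2^(kDigitBits * i).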
+ Digits R0 = r_0;
+ Digits R4 = r_inf;
+ // R3 <- (r_m2 - r_1) / 3
+ RWDigits R3 = r_m2;
+ bool R3_sign = SubtractSigned(R3, r_m2, r_m2_sign, r_1, false);
+ DivideByThree(R3);
+ // R1 <- (r_1 - r_m1) / 2
+ RWDigits R1 = r_1;
+ bool R1_sign = SubtractSigned(R1, r_1, false, r_m1, r_m1_sign);
+ DivideByTwo(R1);
+ // R2 <- r_m1 - r_0
+ RWDigits R2 = r_m1;
+ bool R2_sign = SubtractSigned(R2, r_m1, r_m1_sign, R0, false);
+ // R3 <- (R2 - R3) / 2 + 2 * r_inf
+ R3_sign = SubtractSigned(R3, R2, R2_sign, R3, R3_sign);
+ DivideByTwo(R3);
+ // TODO(jkummerow): Would it be a measurable improvement to write an
+ // "AddTwice" helper?
+ R3_sign = AddSigned(R3, R3, R3_sign, r_inf, false);
+ R3_sign = AddSigned(R3, R3, R3_sign, r_inf, false);
+ // R2 <- R2 + R1 - R4
+ R2_sign = AddSigned(R2, R2, R2_sign, R1, R1_sign);
+ R2_sign = SubtractSigned(R2, R2, R2_sign, R4, false);
+ // R1 <- R1 - R3
+ R1_sign = SubtractSigned(R1, R1, R1_sign, R3, R3_sign);
+
+#if DEBUG
+ R1.Normalize();
+ R2.Normalize();
+ R3.Normalize();
+ DCHECK(R1_sign == false || R1.len() == 0);
+ DCHECK(R2_sign == false || R2.len() == 0);
+ DCHECK(R3_sign == false || R3.len() == 0);
+#endif
+
+ // Phase 5: Recomposition. R0 is already in place. Overflow can't happen.
+ for (int j = R0.len(); j < Z.len(); j++) Z[j] = 0;
+ AddAndReturnOverflow(Z + i, R1);
+ AddAndReturnOverflow(Z + 2 * i, R2);
+ AddAndReturnOverflow(Z + 3 * i, R3);
+ AddAndReturnOverflow(Z + 4 * i, R4);
+}
+
+void ProcessorImpl::MultiplyToomCook(RWDigits Z, Digits X, Digits Y) {
+ DCHECK(X.len() >= Y.len());
+ int k = Y.len();
+ // TODO(jkummerow): Would it be a measurable improvement to share the
+ // scratch memory for several invocations?
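+  // X is processed in chunks of k digits: the product of the first chunk is
+  // written to Z directly, and every later chunk product Xi * Y is
+  // accumulated into Z at offset i.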
+ Digits X0(X, 0, k);
+ Toom3Main(Z, X0, Y);
+ if (X.len() > Y.len()) {
+ ScratchDigits T(2 * k);
+ for (int i = k; i < X.len(); i += k) {
+ Digits Xi(X, i, k);
+ // TODO(jkummerow): would it be a measurable improvement to craft a
+ // "ToomChunk" method in the style of {KaratsubaChunk}?
+ Toom3Main(T, Xi, Y);
+ AddAndReturnOverflow(Z + i, T); // Can't overflow.
+ }
+ }
+}
+
+} // namespace bigint
+} // namespace v8
diff --git a/deps/v8/src/bigint/tostring.cc b/deps/v8/src/bigint/tostring.cc
new file mode 100644
index 0000000000..b426c864cd
--- /dev/null
+++ b/deps/v8/src/bigint/tostring.cc
@@ -0,0 +1,303 @@
+// Copyright 2021 the V8 project authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+#include <cstring>
+#include <limits>
+
+#include "src/bigint/bigint-internal.h"
+#include "src/bigint/digit-arithmetic.h"
+#include "src/bigint/div-helpers.h"
+#include "src/bigint/util.h"
+#include "src/bigint/vector-arithmetic.h"
+
+namespace v8 {
+namespace bigint {
+
+namespace {
+
+// Lookup table for the maximum number of bits required per character of a
+// base-N string representation of a number. To increase accuracy, the array
+// value is the actual value multiplied by 32. To generate this table:
+// for (var i = 1; i <= 36; i++) { print(Math.ceil(Math.log2(i) * 32) + ","); }
+// (The unused entry for radix 0 is set to 0 by hand.)
+constexpr uint8_t kMaxBitsPerChar[] = {
+ 0, 0, 32, 51, 64, 75, 83, 90, 96, // 0..8
+ 102, 107, 111, 115, 119, 122, 126, 128, // 9..16
+ 131, 134, 136, 139, 141, 143, 145, 147, // 17..24
+ 149, 151, 153, 154, 156, 158, 159, 160, // 25..32
+ 162, 163, 165, 166, // 33..36
+};
+
+static const int kBitsPerCharTableShift = 5;
+static const size_t kBitsPerCharTableMultiplier = 1u << kBitsPerCharTableShift;
+
+static const char kConversionChars[] = "0123456789abcdefghijklmnopqrstuvwxyz";
+
+// Raises {base} to the power of {exponent}. Does not check for overflow.
+digit_t digit_pow(digit_t base, digit_t exponent) {
+ digit_t result = 1ull;
+ while (exponent > 0) {
+ if (exponent & 1) {
+ result *= base;
+ }
+ exponent >>= 1;
+ base *= base;
+ }
+ return result;
+}
+
+// Compile-time version of the above.
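+// Only valid for exponent >= 1 (the recursion has no case for exponent == 0).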
+constexpr digit_t digit_pow_rec(digit_t base, digit_t exponent) {
+ return exponent == 1 ? base : base * digit_pow_rec(base, exponent - 1);
+}
+
+// A variant of ToStringFormatter::BasecaseLast, specialized for a radix
+// known at compile-time.
+template <int radix>
+char* BasecaseFixedLast(digit_t chunk, char* out) {
+ while (chunk != 0) {
+ DCHECK(*(out - 1) == kStringZapValue); // NOLINT(readability/check)
+ if (radix <= 10) {
+ *(--out) = '0' + (chunk % radix);
+ } else {
+ *(--out) = kConversionChars[chunk % radix];
+ }
+ chunk /= radix;
+ }
+ return out;
+}
+
+// By making {radix} a compile-time constant and computing {chunk_divisor}
+// as another compile-time constant from it, we allow the compiler to emit
+// an optimized instruction sequence based on multiplications with "magic"
+// numbers (modular multiplicative inverses) instead of actual divisions.
+// The price we pay is having to work on half digits; the technique doesn't
+// work with twodigit_t-by-digit_t divisions.
+// Includes an equivalent of ToStringFormatter::BasecaseMiddle, accordingly
+// specialized for a radix known at compile time.
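+// For example, with 64-bit digits (kHalfDigitBits == 32) and radix == 10,
+// chunk_chars is 9 and chunk_divisor is 10^9, the largest power of 10 that
+// fits into a half digit.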
+template <digit_t radix>
+char* DivideByMagic(RWDigits rest, Digits input, char* output) {
+ constexpr uint8_t max_bits_per_char = kMaxBitsPerChar[radix];
+ constexpr int chunk_chars =
+ kHalfDigitBits * kBitsPerCharTableMultiplier / max_bits_per_char;
+ constexpr digit_t chunk_divisor = digit_pow_rec(radix, chunk_chars);
+ digit_t remainder = 0;
+ for (int i = input.len() - 1; i >= 0; i--) {
+ digit_t d = input[i];
+ digit_t upper = (remainder << kHalfDigitBits) | (d >> kHalfDigitBits);
+ digit_t u_result = upper / chunk_divisor;
+ remainder = upper % chunk_divisor;
+ digit_t lower = (remainder << kHalfDigitBits) | (d & kHalfDigitMask);
+ digit_t l_result = lower / chunk_divisor;
+ remainder = lower % chunk_divisor;
+ rest[i] = (u_result << kHalfDigitBits) | l_result;
+ }
+ // {remainder} is now the current chunk to be written out.
+ for (int i = 0; i < chunk_chars; i++) {
+ DCHECK(*(output - 1) == kStringZapValue); // NOLINT(readability/check)
+ if (radix <= 10) {
+ *(--output) = '0' + (remainder % radix);
+ } else {
+ *(--output) = kConversionChars[remainder % radix];
+ }
+ remainder /= radix;
+ }
+ DCHECK(remainder == 0); // NOLINT(readability/check)
+ return output;
+}
+
+class ToStringFormatter {
+ public:
+ ToStringFormatter(Digits X, int radix, bool sign, char* out,
+ int chars_available, ProcessorImpl* processor)
+ : digits_(X),
+ radix_(radix),
+ sign_(sign),
+ out_start_(out),
+ out_end_(out + chars_available),
+ out_(out_end_),
+ processor_(processor) {
+ DCHECK(chars_available >= ToStringResultLength(digits_, radix_, sign_));
+ }
+
+ void Start();
+ int Finish();
+
+ void Classic() {
+ if (digits_.len() == 0) {
+ *(--out_) = '0';
+ return;
+ }
+ if (digits_.len() == 1) {
+ out_ = BasecaseLast(digits_[0], out_);
+ return;
+ }
+ // {rest} holds the part of the BigInt that we haven't looked at yet.
+ // Not to be confused with "remainder"!
+ ScratchDigits rest(digits_.len());
+ // In the first round, divide the input, allocating a new BigInt for
+ // the result == rest; from then on divide the rest in-place.
+ Digits dividend = digits_;
+ do {
+ if (radix_ == 10) {
+ // Faster but costs binary size, so we optimize the most common case.
+ out_ = DivideByMagic<10>(rest, dividend, out_);
+ processor_->AddWorkEstimate(rest.len() * 2);
+ } else {
+ digit_t chunk;
+ processor_->DivideSingle(rest, &chunk, dividend, chunk_divisor_);
+ out_ = BasecaseMiddle(chunk, out_);
+ // Assume that a division is about ten times as expensive as a
+ // multiplication.
+ processor_->AddWorkEstimate(rest.len() * 10);
+ }
+ if (processor_->should_terminate()) return;
+ rest.Normalize();
+ dividend = rest;
+ } while (rest.len() > 1);
+ out_ = BasecaseLast(rest[0], out_);
+ }
+
+ void BasePowerOfTwo();
+
+ private:
+ // When processing the last (most significant) digit, don't write leading
+ // zeros.
+ char* BasecaseLast(digit_t digit, char* out) {
+ if (radix_ == 10) return BasecaseFixedLast<10>(digit, out);
+ do {
+ DCHECK(*(out - 1) == kStringZapValue); // NOLINT(readability/check)
+ *(--out) = kConversionChars[digit % radix_];
+ digit /= radix_;
+ } while (digit > 0);
+ return out;
+ }
+
+ // When processing a middle (non-most significant) digit, always write the
+ // same number of characters (as many '0' as necessary).
+ char* BasecaseMiddle(digit_t digit, char* out) {
+ for (int i = 0; i < chunk_chars_; i++) {
+ DCHECK(*(out - 1) == kStringZapValue); // NOLINT(readability/check)
+ *(--out) = kConversionChars[digit % radix_];
+ digit /= radix_;
+ }
+ DCHECK(digit == 0); // NOLINT(readability/check)
+ return out;
+ }
+
+ Digits digits_;
+ int radix_;
+ int max_bits_per_char_ = 0;
+ int chunk_chars_ = 0;
+ bool sign_;
+ char* out_start_;
+ char* out_end_;
+ char* out_;
+ digit_t chunk_divisor_ = 0;
+ ProcessorImpl* processor_;
+};
+
+// Prepares data for {Classic}. Not needed for {BasePowerOfTwo}.
+void ToStringFormatter::Start() {
+ max_bits_per_char_ = kMaxBitsPerChar[radix_];
+ chunk_chars_ = kDigitBits * kBitsPerCharTableMultiplier / max_bits_per_char_;
+ chunk_divisor_ = digit_pow(radix_, chunk_chars_);
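+  // For example, with 64-bit digits and radix == 10, chunk_chars_ is 19 and
+  // chunk_divisor_ is 10^19, the largest power of 10 that fits into a digit.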
+ // By construction of chunk_chars_, there can't have been overflow.
+ DCHECK(chunk_divisor_ != 0); // NOLINT(readability/check)
+}
+
+int ToStringFormatter::Finish() {
+ DCHECK(out_ >= out_start_);
+ DCHECK(out_ < out_end_); // At least one character was written.
+ while (out_ < out_end_ && *out_ == '0') out_++;
+ if (sign_) *(--out_) = '-';
+ int excess = 0;
+ if (out_ > out_start_) {
+ size_t actual_length = out_end_ - out_;
+ excess = static_cast<int>(out_ - out_start_);
+ std::memmove(out_start_, out_, actual_length);
+ }
+ return excess;
+}
+
+void ToStringFormatter::BasePowerOfTwo() {
+ const int bits_per_char = CountTrailingZeros(radix_);
+ const int char_mask = radix_ - 1;
+ digit_t digit = 0;
+ // Keeps track of how many unprocessed bits there are in {digit}.
+ int available_bits = 0;
+ for (int i = 0; i < digits_.len() - 1; i++) {
+ digit_t new_digit = digits_[i];
+ // Take any leftover bits from the last iteration into account.
+ int current = (digit | (new_digit << available_bits)) & char_mask;
+ *(--out_) = kConversionChars[current];
+ int consumed_bits = bits_per_char - available_bits;
+ digit = new_digit >> consumed_bits;
+ available_bits = kDigitBits - consumed_bits;
+ while (available_bits >= bits_per_char) {
+ *(--out_) = kConversionChars[digit & char_mask];
+ digit >>= bits_per_char;
+ available_bits -= bits_per_char;
+ }
+ }
+ // Take any leftover bits from the last iteration into account.
+ digit_t msd = digits_.msd();
+ int current = (digit | (msd << available_bits)) & char_mask;
+ *(--out_) = kConversionChars[current];
+ digit = msd >> (bits_per_char - available_bits);
+ while (digit != 0) {
+ *(--out_) = kConversionChars[digit & char_mask];
+ digit >>= bits_per_char;
+ }
+}
+
+} // namespace
+
+void ProcessorImpl::ToString(char* out, int* out_length, Digits X, int radix,
+ bool sign) {
+#if DEBUG
+ for (int i = 0; i < *out_length; i++) out[i] = kStringZapValue;
+#endif
+ ToStringFormatter formatter(X, radix, sign, out, *out_length, this);
+ if (IsPowerOfTwo(radix)) {
+ formatter.BasePowerOfTwo();
+ } else {
+ formatter.Start();
+ formatter.Classic();
+ }
+ int excess = formatter.Finish();
+ *out_length -= excess;
+}
+
+Status Processor::ToString(char* out, int* out_length, Digits X, int radix,
+ bool sign) {
+ ProcessorImpl* impl = static_cast<ProcessorImpl*>(this);
+ impl->ToString(out, out_length, X, radix, sign);
+ return impl->get_and_clear_status();
+}
+
+int ToStringResultLength(Digits X, int radix, bool sign) {
+ const int bit_length = BitLength(X);
+ int result;
+ if (IsPowerOfTwo(radix)) {
+ const int bits_per_char = CountTrailingZeros(radix);
+    result = DIV_CEIL(bit_length, bits_per_char);
+ } else {
+ // Maximum number of bits we can represent with one character.
+ const uint8_t max_bits_per_char = kMaxBitsPerChar[radix];
+ // For estimating the result length, we have to be pessimistic and work with
+ // the minimum number of bits one character can represent.
+ const uint8_t min_bits_per_char = max_bits_per_char - 1;
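+    // For example, an input with bit_length == 64 converted to radix 10 gives
+    // chars_required = DIV_CEIL(64 * 32, 106) == 20, which indeed suffices
+    // for 2^64 - 1 == 18446744073709551615 (20 characters).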
+ // Perform the following computation with uint64_t to avoid overflows.
+ uint64_t chars_required = bit_length;
+ chars_required *= kBitsPerCharTableMultiplier;
+ chars_required = DIV_CEIL(chars_required, min_bits_per_char);
+ DCHECK(chars_required < std::numeric_limits<int>::max());
+ result = static_cast<int>(chars_required);
+ }
+ result += sign;
+ return result;
+}
+
+} // namespace bigint
+} // namespace v8
diff --git a/deps/v8/src/bigint/util.h b/deps/v8/src/bigint/util.h
new file mode 100644
index 0000000000..3d98e734dc
--- /dev/null
+++ b/deps/v8/src/bigint/util.h
@@ -0,0 +1,75 @@
+// Copyright 2021 the V8 project authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+// "Generic" helper functions (not specific to BigInts).
+
+#include <stdint.h>
+
+#include <type_traits>
+
+#ifdef _MSC_VER
+#include <intrin.h> // For _BitScanReverse.
+#endif
+
+#ifndef V8_BIGINT_UTIL_H_
+#define V8_BIGINT_UTIL_H_
+
+// Integer division, rounding up.
+#define DIV_CEIL(x, y) (((x)-1) / (y) + 1)
+
+namespace v8 {
+namespace bigint {
+
+// Rounds up x to a multiple of y (which must be a power of 2).
+inline constexpr int RoundUp(int x, int y) { return (x + y - 1) & -y; }
+
+// Different environments disagree on how 64-bit uintptr_t and uint64_t are
+// defined, so we have to use templates to be generic.
+template <typename T, typename = typename std::enable_if<
+ std::is_unsigned<T>::value && sizeof(T) == 8>::type>
+constexpr int CountLeadingZeros(T value) {
+#if __GNUC__ || __clang__
+ return value == 0 ? 64 : __builtin_clzll(value);
+#elif _MSC_VER
+ unsigned long index = 0; // NOLINT(runtime/int). MSVC insists.
+ return _BitScanReverse64(&index, value) ? 63 - index : 64;
+#else
+#error Unsupported compiler.
+#endif
+}
+
+constexpr int CountLeadingZeros(uint32_t value) {
+#if __GNUC__ || __clang__
+ return value == 0 ? 32 : __builtin_clz(value);
+#elif _MSC_VER
+ unsigned long index = 0; // NOLINT(runtime/int). MSVC insists.
+ return _BitScanReverse(&index, value) ? 31 - index : 32;
+#else
+#error Unsupported compiler.
+#endif
+}
+
+inline constexpr int CountTrailingZeros(uint32_t value) {
+#if __GNUC__ || __clang__
+ return value == 0 ? 32 : __builtin_ctz(value);
+#elif _MSC_VER
+ unsigned long index = 0; // NOLINT(runtime/int).
+ return _BitScanForward(&index, value) ? index : 32;
+#else
+#error Unsupported compiler.
+#endif
+}
+
+inline constexpr int BitLength(int n) {
+ return 32 - CountLeadingZeros(static_cast<uint32_t>(n));
+}
+
+inline constexpr bool IsPowerOfTwo(int value) {
+ return value > 0 && (value & (value - 1)) == 0;
+}
+
+} // namespace bigint
+} // namespace v8
+
+#endif // V8_BIGINT_UTIL_H_
diff --git a/deps/v8/src/bigint/vector-arithmetic.cc b/deps/v8/src/bigint/vector-arithmetic.cc
index 734b443911..4191755bc9 100644
--- a/deps/v8/src/bigint/vector-arithmetic.cc
+++ b/deps/v8/src/bigint/vector-arithmetic.cc
@@ -4,18 +4,118 @@
#include "src/bigint/vector-arithmetic.h"
+#include "src/bigint/bigint-internal.h"
+#include "src/bigint/digit-arithmetic.h"
+
namespace v8 {
namespace bigint {
-int Compare(Digits A, Digits B) {
- A.Normalize();
- B.Normalize();
- int diff = A.len() - B.len();
- if (diff != 0) return diff;
- int i = A.len() - 1;
- while (i >= 0 && A[i] == B[i]) i--;
- if (i < 0) return 0;
- return A[i] > B[i] ? 1 : -1;
+digit_t AddAndReturnOverflow(RWDigits Z, Digits X) {
+ X.Normalize();
+ if (X.len() == 0) return 0;
+ digit_t carry = 0;
+ int i = 0;
+ for (; i < X.len(); i++) {
+ Z[i] = digit_add3(Z[i], X[i], carry, &carry);
+ }
+ for (; i < Z.len() && carry != 0; i++) {
+ Z[i] = digit_add2(Z[i], carry, &carry);
+ }
+ return carry;
+}
+
+digit_t SubAndReturnBorrow(RWDigits Z, Digits X) {
+ X.Normalize();
+ if (X.len() == 0) return 0;
+ digit_t borrow = 0;
+ int i = 0;
+ for (; i < X.len(); i++) {
+ Z[i] = digit_sub2(Z[i], X[i], borrow, &borrow);
+ }
+ for (; i < Z.len() && borrow != 0; i++) {
+ Z[i] = digit_sub(Z[i], borrow, &borrow);
+ }
+ return borrow;
+}
+
+void Add(RWDigits Z, Digits X, Digits Y) {
+ if (X.len() < Y.len()) {
+ return Add(Z, Y, X);
+ }
+ int i = 0;
+ digit_t carry = 0;
+ for (; i < Y.len(); i++) {
+ Z[i] = digit_add3(X[i], Y[i], carry, &carry);
+ }
+ for (; i < X.len(); i++) {
+ Z[i] = digit_add2(X[i], carry, &carry);
+ }
+ for (; i < Z.len(); i++) {
+ Z[i] = carry;
+ carry = 0;
+ }
+}
+
+void Subtract(RWDigits Z, Digits X, Digits Y) {
+ X.Normalize();
+ Y.Normalize();
+ DCHECK(X.len() >= Y.len());
+ int i = 0;
+ digit_t borrow = 0;
+ for (; i < Y.len(); i++) {
+ Z[i] = digit_sub2(X[i], Y[i], borrow, &borrow);
+ }
+ for (; i < X.len(); i++) {
+ Z[i] = digit_sub(X[i], borrow, &borrow);
+ }
+ DCHECK(borrow == 0); // NOLINT(readability/check)
+ for (; i < Z.len(); i++) Z[i] = 0;
+}
+
+digit_t AddAndReturnCarry(RWDigits Z, Digits X, Digits Y) {
+ DCHECK(Z.len() >= Y.len() && X.len() >= Y.len());
+ digit_t carry = 0;
+ for (int i = 0; i < Y.len(); i++) {
+ Z[i] = digit_add3(X[i], Y[i], carry, &carry);
+ }
+ return carry;
+}
+
+digit_t SubtractAndReturnBorrow(RWDigits Z, Digits X, Digits Y) {
+ DCHECK(Z.len() >= Y.len() && X.len() >= Y.len());
+ digit_t borrow = 0;
+ for (int i = 0; i < Y.len(); i++) {
+ Z[i] = digit_sub2(X[i], Y[i], borrow, &borrow);
+ }
+ return borrow;
+}
+
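+// Sign-magnitude helpers: AddSigned computes the signed sum of X and Y, and
+// SubtractSigned their signed difference, where {x_negative} and {y_negative}
+// give the operand signs; Z receives the magnitude of the result and the
+// return value is its sign (true means negative).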
+bool AddSigned(RWDigits Z, Digits X, bool x_negative, Digits Y,
+ bool y_negative) {
+ if (x_negative == y_negative) {
+ Add(Z, X, Y);
+ return x_negative;
+ }
+ if (GreaterThanOrEqual(X, Y)) {
+ Subtract(Z, X, Y);
+ return x_negative;
+ }
+ Subtract(Z, Y, X);
+ return !x_negative;
+}
+
+bool SubtractSigned(RWDigits Z, Digits X, bool x_negative, Digits Y,
+ bool y_negative) {
+ if (x_negative != y_negative) {
+ Add(Z, X, Y);
+ return x_negative;
+ }
+ if (GreaterThanOrEqual(X, Y)) {
+ Subtract(Z, X, Y);
+ return x_negative;
+ }
+ Subtract(Z, Y, X);
+ return !x_negative;
}
} // namespace bigint
diff --git a/deps/v8/src/bigint/vector-arithmetic.h b/deps/v8/src/bigint/vector-arithmetic.h
index 617cb20b55..3247660f95 100644
--- a/deps/v8/src/bigint/vector-arithmetic.h
+++ b/deps/v8/src/bigint/vector-arithmetic.h
@@ -8,12 +8,52 @@
#define V8_BIGINT_VECTOR_ARITHMETIC_H_
#include "src/bigint/bigint.h"
+#include "src/bigint/digit-arithmetic.h"
namespace v8 {
namespace bigint {
+// Z += X. Returns carry on overflow.
+digit_t AddAndReturnOverflow(RWDigits Z, Digits X);
+
+// Z -= X. Returns borrow on overflow.
+digit_t SubAndReturnBorrow(RWDigits Z, Digits X);
+
+// X += y.
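+// The final carry must be absorbed within X: the loop runs until {carry} is
+// zero and does not check X's length.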
+inline void Add(RWDigits X, digit_t y) {
+ digit_t carry = y;
+ int i = 0;
+ do {
+ X[i] = digit_add2(X[i], carry, &carry);
+ i++;
+ } while (carry != 0);
+}
+
+// X -= y.
+inline void Subtract(RWDigits X, digit_t y) {
+ digit_t borrow = y;
+ int i = 0;
+ do {
+ X[i] = digit_sub(X[i], borrow, &borrow);
+ i++;
+ } while (borrow != 0);
+}
+
+// These add/subtract exactly Y's digits to/from the matching digits in X,
+// storing the result in (part of) Z, and return the carry/borrow.
+digit_t AddAndReturnCarry(RWDigits Z, Digits X, Digits Y);
+digit_t SubtractAndReturnBorrow(RWDigits Z, Digits X, Digits Y);
+
inline bool IsDigitNormalized(Digits X) { return X.len() == 0 || X.msd() != 0; }
+inline bool GreaterThanOrEqual(Digits A, Digits B) {
+ return Compare(A, B) >= 0;
+}
+
+inline int BitLength(Digits X) {
+ return X.len() * kDigitBits - CountLeadingZeros(X.msd());
+}
+
} // namespace bigint
} // namespace v8