Diffstat (limited to 'deps/v8/src/bigint')
-rw-r--r-- | deps/v8/src/bigint/OWNERS               |   1
-rw-r--r-- | deps/v8/src/bigint/bigint-internal.cc   |  68
-rw-r--r-- | deps/v8/src/bigint/bigint-internal.h    |  62
-rw-r--r-- | deps/v8/src/bigint/bigint.h             |  60
-rw-r--r-- | deps/v8/src/bigint/digit-arithmetic.h   | 108
-rw-r--r-- | deps/v8/src/bigint/div-burnikel.cc      | 279
-rw-r--r-- | deps/v8/src/bigint/div-helpers.cc       |  68
-rw-r--r-- | deps/v8/src/bigint/div-helpers.h        |  82
-rw-r--r-- | deps/v8/src/bigint/div-schoolbook.cc    | 190
-rw-r--r-- | deps/v8/src/bigint/mul-fft.cc           | 790
-rw-r--r-- | deps/v8/src/bigint/mul-karatsuba.cc     | 205
-rw-r--r-- | deps/v8/src/bigint/mul-schoolbook.cc    |   2
-rw-r--r-- | deps/v8/src/bigint/mul-toom.cc          | 223
-rw-r--r-- | deps/v8/src/bigint/tostring.cc          | 303
-rw-r--r-- | deps/v8/src/bigint/util.h               |  75
-rw-r--r-- | deps/v8/src/bigint/vector-arithmetic.cc | 118
-rw-r--r-- | deps/v8/src/bigint/vector-arithmetic.h  |  40
17 files changed, 2655 insertions, 19 deletions
diff --git a/deps/v8/src/bigint/OWNERS b/deps/v8/src/bigint/OWNERS index f8c2fcc47a..8a2ddcc49d 100644 --- a/deps/v8/src/bigint/OWNERS +++ b/deps/v8/src/bigint/OWNERS @@ -1,2 +1,3 @@ jkummerow@chromium.org +mslekova@chromium.org thibaudm@chromium.org diff --git a/deps/v8/src/bigint/bigint-internal.cc b/deps/v8/src/bigint/bigint-internal.cc index 6630c6c4c9..828a450e8a 100644 --- a/deps/v8/src/bigint/bigint-internal.cc +++ b/deps/v8/src/bigint/bigint-internal.cc @@ -30,7 +30,61 @@ void ProcessorImpl::Multiply(RWDigits Z, Digits X, Digits Y) { if (X.len() == 0 || Y.len() == 0) return Z.Clear(); if (X.len() < Y.len()) std::swap(X, Y); if (Y.len() == 1) return MultiplySingle(Z, X, Y[0]); - return MultiplySchoolbook(Z, X, Y); + if (Y.len() < kKaratsubaThreshold) return MultiplySchoolbook(Z, X, Y); +#if !V8_ADVANCED_BIGINT_ALGORITHMS + return MultiplyKaratsuba(Z, X, Y); +#else + if (Y.len() < kToomThreshold) return MultiplyKaratsuba(Z, X, Y); + if (Y.len() < kFftThreshold) return MultiplyToomCook(Z, X, Y); + return MultiplyFFT(Z, X, Y); +#endif +} + +void ProcessorImpl::Divide(RWDigits Q, Digits A, Digits B) { + A.Normalize(); + B.Normalize(); + DCHECK(B.len() > 0); // NOLINT(readability/check) + int cmp = Compare(A, B); + if (cmp < 0) return Q.Clear(); + if (cmp == 0) { + Q[0] = 1; + for (int i = 1; i < Q.len(); i++) Q[i] = 0; + return; + } + if (B.len() == 1) { + digit_t remainder; + return DivideSingle(Q, &remainder, A, B[0]); + } + if (B.len() < kBurnikelThreshold) { + return DivideSchoolbook(Q, RWDigits(nullptr, 0), A, B); + } + return DivideBurnikelZiegler(Q, RWDigits(nullptr, 0), A, B); +} + +void ProcessorImpl::Modulo(RWDigits R, Digits A, Digits B) { + A.Normalize(); + B.Normalize(); + DCHECK(B.len() > 0); // NOLINT(readability/check) + int cmp = Compare(A, B); + if (cmp < 0) { + for (int i = 0; i < B.len(); i++) R[i] = B[i]; + for (int i = B.len(); i < R.len(); i++) R[i] = 0; + return; + } + if (cmp == 0) return R.Clear(); + if (B.len() == 1) { + digit_t remainder; + DivideSingle(RWDigits(nullptr, 0), &remainder, A, B[0]); + R[0] = remainder; + for (int i = 1; i < R.len(); i++) R[i] = 0; + return; + } + if (B.len() < kBurnikelThreshold) { + return DivideSchoolbook(RWDigits(nullptr, 0), R, A, B); + } + int q_len = DivideResultLength(A, B); + ScratchDigits Q(q_len); + return DivideBurnikelZiegler(Q, R, A, B); } Status Processor::Multiply(RWDigits Z, Digits X, Digits Y) { @@ -39,5 +93,17 @@ Status Processor::Multiply(RWDigits Z, Digits X, Digits Y) { return impl->get_and_clear_status(); } +Status Processor::Divide(RWDigits Q, Digits A, Digits B) { + ProcessorImpl* impl = static_cast<ProcessorImpl*>(this); + impl->Divide(Q, A, B); + return impl->get_and_clear_status(); +} + +Status Processor::Modulo(RWDigits R, Digits A, Digits B) { + ProcessorImpl* impl = static_cast<ProcessorImpl*>(this); + impl->Modulo(R, A, B); + return impl->get_and_clear_status(); +} + } // namespace bigint } // namespace v8 diff --git a/deps/v8/src/bigint/bigint-internal.h b/deps/v8/src/bigint/bigint-internal.h index efe63a06a5..41ef9526e5 100644 --- a/deps/v8/src/bigint/bigint-internal.h +++ b/deps/v8/src/bigint/bigint-internal.h @@ -5,11 +5,20 @@ #ifndef V8_BIGINT_BIGINT_INTERNAL_H_ #define V8_BIGINT_BIGINT_INTERNAL_H_ +#include <memory> + #include "src/bigint/bigint.h" namespace v8 { namespace bigint { +constexpr int kKaratsubaThreshold = 34; +constexpr int kToomThreshold = 193; +constexpr int kFftThreshold = 1500; +constexpr int kFftInnerThreshold = 200; + +constexpr int kBurnikelThreshold = 57; + class 
ProcessorImpl : public Processor { public: explicit ProcessorImpl(Platform* platform); @@ -21,7 +30,31 @@ class ProcessorImpl : public Processor { void MultiplySingle(RWDigits Z, Digits X, digit_t y); void MultiplySchoolbook(RWDigits Z, Digits X, Digits Y); - private: + void MultiplyKaratsuba(RWDigits Z, Digits X, Digits Y); + void KaratsubaStart(RWDigits Z, Digits X, Digits Y, RWDigits scratch, int k); + void KaratsubaChunk(RWDigits Z, Digits X, Digits Y, RWDigits scratch); + void KaratsubaMain(RWDigits Z, Digits X, Digits Y, RWDigits scratch, int n); + + void Divide(RWDigits Q, Digits A, Digits B); + void DivideSingle(RWDigits Q, digit_t* remainder, Digits A, digit_t b); + void DivideSchoolbook(RWDigits Q, RWDigits R, Digits A, Digits B); + void DivideBurnikelZiegler(RWDigits Q, RWDigits R, Digits A, Digits B); + + void Modulo(RWDigits R, Digits A, Digits B); + +#if V8_ADVANCED_BIGINT_ALGORITHMS + void MultiplyToomCook(RWDigits Z, Digits X, Digits Y); + void Toom3Main(RWDigits Z, Digits X, Digits Y); + + void MultiplyFFT(RWDigits Z, Digits X, Digits Y); +#endif // V8_ADVANCED_BIGINT_ALGORITHMS + + // {out_length} initially contains the allocated capacity of {out}, and + // upon return will be set to the actual length of the result string. + void ToString(char* out, int* out_length, Digits X, int radix, bool sign); + + bool should_terminate() { return status_ == Status::kInterrupted; } + // Each unit is supposed to represent approximately one CPU {mul} instruction. // Doesn't need to be accurate; we just want to make sure to check for // interrupt requests every now and then (roughly every 10-100 ms; often @@ -39,8 +72,7 @@ class ProcessorImpl : public Processor { } } - bool should_terminate() { return status_ == Status::kInterrupted; } - + private: uintptr_t work_estimate_{0}; Status status_{Status::kOk}; Platform* platform_; @@ -59,6 +91,30 @@ class ProcessorImpl : public Processor { #define DCHECK(cond) (void(0)) #endif +#define USE(var) ((void)var) + +// RAII memory for a Digits array. +class Storage { + public: + explicit Storage(int count) : ptr_(new digit_t[count]) {} + + digit_t* get() { return ptr_.get(); } + + private: + std::unique_ptr<digit_t[]> ptr_; +}; + +// A writable Digits array with attached storage. +class ScratchDigits : public RWDigits { + public: + explicit ScratchDigits(int len) : RWDigits(nullptr, len), storage_(len) { + digits_ = storage_.get(); + } + + private: + Storage storage_; +}; + } // namespace bigint } // namespace v8 diff --git a/deps/v8/src/bigint/bigint.h b/deps/v8/src/bigint/bigint.h index a365359c53..6d3790808c 100644 --- a/deps/v8/src/bigint/bigint.h +++ b/deps/v8/src/bigint/bigint.h @@ -101,7 +101,7 @@ class Digits { const digit_t* digits() const { return digits_; } protected: - friend class TemporaryLeftShift; + friend class ShiftedDigits; digit_t* digits_; int len_; @@ -201,12 +201,35 @@ class Platform { // // The operations are divided into two groups: "fast" (O(n) with small // coefficient) operations are exposed directly as free functions, "slow" -// operations are methods on a {BigIntProcessor} object, which provides +// operations are methods on a {Processor} object, which provides // support for interrupting execution via the {Platform}'s {InterruptRequested} // mechanism when it takes too long. These functions return a {Status} value. // Returns r such that r < 0 if A < B; r > 0 if A > B; r == 0 if A == B. -int Compare(Digits A, Digits B); +// Defined here to be inlineable, which helps ia32 a lot (64-bit platforms +// don't care). 
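As an illustration of how a caller might drive the new Processor entry points (a sketch, not part of the patch; it assumes the pointer-and-length Digits/RWDigits constructors from this header and an embedder-provided Platform), the result-length helpers size the output buffers exactly as Divide and Modulo expect:

#include <algorithm>
#include <memory>
#include <vector>

#include "src/bigint/bigint.h"

namespace {

using namespace v8::bigint;

// Hypothetical helper: {platform} comes from the embedder (Processor::New takes
// ownership of it), A and B are views over caller-owned digit buffers, B != 0.
Status DivMod(Platform* platform, Digits A, Digits B,
              std::vector<digit_t>* q_mem, std::vector<digit_t>* r_mem) {
  std::unique_ptr<Processor, Processor::Destroyer> proc(
      Processor::New(platform));
  // Size the outputs with the result-length helpers; guard against A < B.
  q_mem->resize(std::max(DivideResultLength(A, B), 1));
  r_mem->resize(ModuloResultLength(B));
  RWDigits Q(q_mem->data(), static_cast<int>(q_mem->size()));
  RWDigits R(r_mem->data(), static_cast<int>(r_mem->size()));
  Status status = proc->Divide(Q, A, B);
  if (status != Status::kOk) return status;  // e.g. interrupted via the Platform
  return proc->Modulo(R, A, B);
}

}  // namespace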
+inline int Compare(Digits A, Digits B) { + A.Normalize(); + B.Normalize(); + int diff = A.len() - B.len(); + if (diff != 0) return diff; + int i = A.len() - 1; + while (i >= 0 && A[i] == B[i]) i--; + if (i < 0) return 0; + return A[i] > B[i] ? 1 : -1; +} + +// Z := X + Y +void Add(RWDigits Z, Digits X, Digits Y); +// Addition of signed integers. Returns true if the result is negative. +bool AddSigned(RWDigits Z, Digits X, bool x_negative, Digits Y, + bool y_negative); + +// Z := X - Y. Requires X >= Y. +void Subtract(RWDigits Z, Digits X, Digits Y); +// Subtraction of signed integers. Returns true if the result is negative. +bool SubtractSigned(RWDigits Z, Digits X, bool x_negative, Digits Y, + bool y_negative); enum class Status { kOk, kInterrupted }; @@ -215,7 +238,7 @@ class Processor { // Takes ownership of {platform}. static Processor* New(Platform* platform); - // Use this for any std::unique_ptr holding an instance of BigIntProcessor. + // Use this for any std::unique_ptr holding an instance of {Processor}. class Destroyer { public: void operator()(Processor* proc) { proc->Destroy(); } @@ -225,11 +248,40 @@ class Processor { // Z := X * Y Status Multiply(RWDigits Z, Digits X, Digits Y); + // Q := A / B + Status Divide(RWDigits Q, Digits A, Digits B); + // R := A % B + Status Modulo(RWDigits R, Digits A, Digits B); + + // {out_length} initially contains the allocated capacity of {out}, and + // upon return will be set to the actual length of the result string. + Status ToString(char* out, int* out_length, Digits X, int radix, bool sign); }; +inline int AddResultLength(int x_length, int y_length) { + return std::max(x_length, y_length) + 1; +} +inline int AddSignedResultLength(int x_length, int y_length, bool same_sign) { + return same_sign ? AddResultLength(x_length, y_length) + : std::max(x_length, y_length); +} +inline int SubtractResultLength(int x_length, int y_length) { return x_length; } +inline int SubtractSignedResultLength(int x_length, int y_length, + bool same_sign) { + return same_sign ? std::max(x_length, y_length) + : AddResultLength(x_length, y_length); +} inline int MultiplyResultLength(Digits X, Digits Y) { return X.len() + Y.len(); } +inline int DivideResultLength(Digits A, Digits B) { + return A.len() - B.len() + 1; +} +inline int ModuloResultLength(Digits B) { return B.len(); } + +int ToStringResultLength(Digits X, int radix, bool sign); +// In DEBUG builds, the result of {ToString} will be initialized to this value. +constexpr char kStringZapValue = '?'; } // namespace bigint } // namespace v8 diff --git a/deps/v8/src/bigint/digit-arithmetic.h b/deps/v8/src/bigint/digit-arithmetic.h index 1c5c93c035..96ac949eb7 100644 --- a/deps/v8/src/bigint/digit-arithmetic.h +++ b/deps/v8/src/bigint/digit-arithmetic.h @@ -8,6 +8,7 @@ #define V8_BIGINT_DIGIT_ARITHMETIC_H_ #include "src/bigint/bigint.h" +#include "src/bigint/util.h" namespace v8 { namespace bigint { @@ -45,6 +46,36 @@ inline digit_t digit_add3(digit_t a, digit_t b, digit_t c, digit_t* carry) { #endif } +// {borrow} will be set to 0 or 1. +inline digit_t digit_sub(digit_t a, digit_t b, digit_t* borrow) { +#if HAVE_TWODIGIT_T + twodigit_t result = twodigit_t{a} - b; + *borrow = (result >> kDigitBits) & 1; + return static_cast<digit_t>(result); +#else + digit_t result = a - b; + *borrow = (result > a) ? 1 : 0; + return result; +#endif +} + +// {borrow_out} will be set to 0 or 1. 
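As an aside (a sketch, not the actual vector-arithmetic.cc code), these single-digit primitives compose into the vector-level Subtract declared above by threading the borrow from one digit to the next:

// Z := X - Y, assuming X >= Y and Z.len() >= X.len().
void SubtractSketch(RWDigits Z, Digits X, Digits Y) {
  digit_t borrow = 0;
  int i = 0;
  for (; i < Y.len(); i++) Z[i] = digit_sub2(X[i], Y[i], borrow, &borrow);
  for (; i < X.len(); i++) Z[i] = digit_sub(X[i], borrow, &borrow);
  // X >= Y guarantees that the final borrow is zero.
  for (; i < Z.len(); i++) Z[i] = 0;
}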
+inline digit_t digit_sub2(digit_t a, digit_t b, digit_t borrow_in, + digit_t* borrow_out) { +#if HAVE_TWODIGIT_T + twodigit_t subtrahend = twodigit_t{b} + borrow_in; + twodigit_t result = twodigit_t{a} - subtrahend; + *borrow_out = (result >> kDigitBits) & 1; + return static_cast<digit_t>(result); +#else + digit_t result = a - b; + *borrow_out = (result > a) ? 1 : 0; + if (result < borrow_in) *borrow_out += 1; + result -= borrow_in; + return result; +#endif +} + // Returns the low half of the result. High half is in {high}. inline digit_t digit_mul(digit_t a, digit_t b, digit_t* high) { #if HAVE_TWODIGIT_T @@ -81,6 +112,83 @@ inline digit_t digit_mul(digit_t a, digit_t b, digit_t* high) { #endif } +// Returns the quotient. +// quotient = (high << kDigitBits + low - remainder) / divisor +static inline digit_t digit_div(digit_t high, digit_t low, digit_t divisor, + digit_t* remainder) { +#if defined(DCHECK) + DCHECK(high < divisor); + DCHECK(divisor != 0); // NOLINT(readability/check) +#endif +#if __x86_64__ && (__GNUC__ || __clang__) + digit_t quotient; + digit_t rem; + __asm__("divq %[divisor]" + // Outputs: {quotient} will be in rax, {rem} in rdx. + : "=a"(quotient), "=d"(rem) + // Inputs: put {high} into rdx, {low} into rax, and {divisor} into + // any register or stack slot. + : "d"(high), "a"(low), [divisor] "rm"(divisor)); + *remainder = rem; + return quotient; +#elif __i386__ && (__GNUC__ || __clang__) + digit_t quotient; + digit_t rem; + __asm__("divl %[divisor]" + // Outputs: {quotient} will be in eax, {rem} in edx. + : "=a"(quotient), "=d"(rem) + // Inputs: put {high} into edx, {low} into eax, and {divisor} into + // any register or stack slot. + : "d"(high), "a"(low), [divisor] "rm"(divisor)); + *remainder = rem; + return quotient; +#else + // Adapted from Warren, Hacker's Delight, p. 152. + int s = CountLeadingZeros(divisor); +#if defined(DCHECK) + DCHECK(s != kDigitBits); // {divisor} is not 0. +#endif + divisor <<= s; + + digit_t vn1 = divisor >> kHalfDigitBits; + digit_t vn0 = divisor & kHalfDigitMask; + // {s} can be 0. {low >> kDigitBits} would be undefined behavior, so + // we mask the shift amount with {kShiftMask}, and the result with + // {s_zero_mask} which is 0 if s == 0 and all 1-bits otherwise. 
+ static_assert(sizeof(intptr_t) == sizeof(digit_t), + "intptr_t and digit_t must have the same size"); + const int kShiftMask = kDigitBits - 1; + digit_t s_zero_mask = + static_cast<digit_t>(static_cast<intptr_t>(-s) >> (kDigitBits - 1)); + digit_t un32 = + (high << s) | ((low >> ((kDigitBits - s) & kShiftMask)) & s_zero_mask); + digit_t un10 = low << s; + digit_t un1 = un10 >> kHalfDigitBits; + digit_t un0 = un10 & kHalfDigitMask; + digit_t q1 = un32 / vn1; + digit_t rhat = un32 - q1 * vn1; + + while (q1 >= kHalfDigitBase || q1 * vn0 > rhat * kHalfDigitBase + un1) { + q1--; + rhat += vn1; + if (rhat >= kHalfDigitBase) break; + } + + digit_t un21 = un32 * kHalfDigitBase + un1 - q1 * divisor; + digit_t q0 = un21 / vn1; + rhat = un21 - q0 * vn1; + + while (q0 >= kHalfDigitBase || q0 * vn0 > rhat * kHalfDigitBase + un0) { + q0--; + rhat += vn1; + if (rhat >= kHalfDigitBase) break; + } + + *remainder = (un21 * kHalfDigitBase + un0 - q0 * divisor) >> s; + return q1 * kHalfDigitBase + q0; +#endif +} + } // namespace bigint } // namespace v8 diff --git a/deps/v8/src/bigint/div-burnikel.cc b/deps/v8/src/bigint/div-burnikel.cc new file mode 100644 index 0000000000..0caedb1cc1 --- /dev/null +++ b/deps/v8/src/bigint/div-burnikel.cc @@ -0,0 +1,279 @@ +// Copyright 2021 the V8 project authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. + +// Burnikel-Ziegler division. +// Reference: "Fast Recursive Division" by Christoph Burnikel and Joachim +// Ziegler, found at http://cr.yp.to/bib/1998/burnikel.ps + +#include <string.h> + +#include "src/bigint/bigint-internal.h" +#include "src/bigint/digit-arithmetic.h" +#include "src/bigint/div-helpers.h" +#include "src/bigint/util.h" +#include "src/bigint/vector-arithmetic.h" + +namespace v8 { +namespace bigint { + +namespace { + +// Compares [a_high, A] with B. +// Returns: +// - a value < 0 if [a_high, A] < B +// - 0 if [a_high, A] == B +// - a value > 0 if [a_high, A] > B. +int SpecialCompare(digit_t a_high, Digits A, Digits B) { + B.Normalize(); + int a_len; + if (a_high == 0) { + A.Normalize(); + a_len = A.len(); + } else { + a_len = A.len() + 1; + } + int diff = a_len - B.len(); + if (diff != 0) return diff; + int i = a_len - 1; + if (a_high != 0) { + if (a_high > B[i]) return 1; + if (a_high < B[i]) return -1; + i--; + } + while (i >= 0 && A[i] == B[i]) i--; + if (i < 0) return 0; + return A[i] > B[i] ? 1 : -1; +} + +void SetOnes(RWDigits X) { + memset(X.digits(), 0xFF, X.len() * sizeof(digit_t)); +} + +// Since the Burnikel-Ziegler method is inherently recursive, we put +// non-changing data into a container object. +class BZ { + public: + BZ(ProcessorImpl* proc, int scratch_space) + : proc_(proc), + scratch_mem_(scratch_space >= kBurnikelThreshold ? scratch_space : 0) {} + + void DivideBasecase(RWDigits Q, RWDigits R, Digits A, Digits B); + void D3n2n(RWDigits Q, RWDigits R, Digits A1A2, Digits A3, Digits B); + void D2n1n(RWDigits Q, RWDigits R, Digits A, Digits B); + + private: + ProcessorImpl* proc_; + Storage scratch_mem_; +}; + +void BZ::DivideBasecase(RWDigits Q, RWDigits R, Digits A, Digits B) { + A.Normalize(); + B.Normalize(); + DCHECK(B.len() > 0); // NOLINT(readability/check) + int cmp = Compare(A, B); + if (cmp <= 0) { + Q.Clear(); + if (cmp == 0) { + // If A == B, then Q=1, R=0. + R.Clear(); + Q[0] = 1; + } else { + // If A < B, then Q=0, R=A. 
+ PutAt(R, A, R.len()); + } + return; + } + if (B.len() == 1) { + return proc_->DivideSingle(Q, R.digits(), A, B[0]); + } + return proc_->DivideSchoolbook(Q, R, A, B); +} + +// Algorithm 2 from the paper. Variable names same as there. +// Returns Q(uotient) and R(emainder) for A/B, with B having two thirds +// the size of A = [A1, A2, A3]. +void BZ::D3n2n(RWDigits Q, RWDigits R, Digits A1A2, Digits A3, Digits B) { + DCHECK((B.len() & 1) == 0); // NOLINT(readability/check) + int n = B.len() / 2; + DCHECK(A1A2.len() == 2 * n); + // Actual condition is stricter than length: A < B * 2^(kDigitBits * n) + DCHECK(Compare(A1A2, B) < 0); // NOLINT(readability/check) + DCHECK(A3.len() == n); + DCHECK(Q.len() == n); + DCHECK(R.len() == 2 * n); + // 1. Split A into three parts A = [A1, A2, A3] with Ai < 2^(kDigitBits * n). + Digits A1(A1A2, n, n); + // 2. Split B into two parts B = [B1, B2] with Bi < 2^(kDigitBits * n). + Digits B1(B, n, n); + Digits B2(B, 0, n); + // 3. Distinguish the cases A1 < B1 or A1 >= B1. + RWDigits Qhat = Q; + RWDigits R1(R, n, n); + digit_t r1_high = 0; + if (Compare(A1, B1) < 0) { + // 3a. If A1 < B1, compute Qhat = floor([A1, A2] / B1) with remainder R1 + // using algorithm D2n1n. + D2n1n(Qhat, R1, A1A2, B1); + if (proc_->should_terminate()) return; + } else { + // 3b. If A1 >= B1, set Qhat = 2^(kDigitBits * n) - 1 and set + // R1 = [A1, A2] - [B1, 0] + [0, B1] + SetOnes(Qhat); + // Step 1: compute A1 - B1, which can't underflow because of the comparison + // guarding this else-branch, and always has a one-digit result because + // of this function's preconditions. + RWDigits temp = R1; + Subtract(temp, A1, B1); + temp.Normalize(); + DCHECK(temp.len() <= 1); // NOLINT(readability/check) + if (temp.len() > 0) r1_high = temp[0]; + // Step 2: compute A2 + B1. + Digits A2(A1A2, 0, n); + r1_high += AddAndReturnCarry(R1, A2, B1); + } + // 4. Compute D = Qhat * B2 using (Karatsuba) multiplication. + RWDigits D(scratch_mem_.get(), 2 * n); + proc_->Multiply(D, Qhat, B2); + if (proc_->should_terminate()) return; + + // 5. Compute Rhat = R1*2^(kDigitBits * n) + A3 - D = [R1, A3] - D. + PutAt(R, A3, n); + // 6. As long as Rhat < 0, repeat: + while (SpecialCompare(r1_high, R, D) < 0) { + // 6a. Rhat = Rhat + B + r1_high += AddAndReturnCarry(R, R, B); + // 6b. Qhat = Qhat - 1 + Subtract(Qhat, 1); + } + // 5. Compute Rhat = R1*2^(kDigitBits * n) + A3 - D = [R1, A3] - D. + digit_t borrow = SubtractAndReturnBorrow(R, R, D); + DCHECK(borrow == r1_high); + DCHECK(Compare(R, B) < 0); // NOLINT(readability/check) + (void)borrow; + // 7. Return R = Rhat, Q = Qhat. +} + +// Algorithm 1 from the paper. Variable names same as there. +// Returns Q(uotient) and (R)emainder for A/B, with A twice the size of B. +void BZ::D2n1n(RWDigits Q, RWDigits R, Digits A, Digits B) { + int n = B.len(); + DCHECK(A.len() <= 2 * n); + // A < B * 2^(kDigitsBits * n) + DCHECK(Compare(Digits(A, n, n), B) < 0); // NOLINT(readability/check) + DCHECK(Q.len() == n); + DCHECK(R.len() == n); + // 1. If n is odd or smaller than some convenient constant, compute Q and R + // by school division and return. + if ((n & 1) == 1 || n < kBurnikelThreshold) { + return DivideBasecase(Q, R, A, B); + } + // 2. Split A into four parts A = [A1, ..., A4] with + // Ai < 2^(kDigitBits * n / 2). Split B into two parts [B2, B1] with + // Bi < 2^(kDigitBits * n / 2). + Digits A1A2(A, n, n); + Digits A3(A, n / 2, n / 2); + Digits A4(A, 0, n / 2); + // 3. 
Compute the high part Q1 of floor(A/B) as + // Q1 = floor([A1, A2, A3] / [B1, B2]) with remainder R1 = [R11, R12], + // using algorithm D3n2n. + RWDigits Q1(Q, n / 2, n / 2); + ScratchDigits R1(n); + D3n2n(Q1, R1, A1A2, A3, B); + if (proc_->should_terminate()) return; + // 4. Compute the low part Q2 of floor(A/B) as + // Q2 = floor([R11, R12, A4] / [B1, B2]) with remainder R, using + // algorithm D3n2n. + RWDigits Q2(Q, 0, n / 2); + D3n2n(Q2, R, R1, A4, B); + // 5. Return Q = [Q1, Q2] and R. +} + +} // namespace + +// Algorithm 3 from the paper. Variables names same as there. +// Returns Q(uotient) and R(emainder) for A/B (no size restrictions). +// R is optional, Q is not. +void ProcessorImpl::DivideBurnikelZiegler(RWDigits Q, RWDigits R, Digits A, + Digits B) { + DCHECK(A.len() >= B.len()); + DCHECK(R.len() == 0 || R.len() >= B.len()); + DCHECK(Q.len() > A.len() - B.len()); + int r = A.len(); + int s = B.len(); + // The requirements are: + // - n >= s, n as small as possible. + // - m must be a power of two. + // 1. Set m = min {2^k | 2^k * kBurnikelThreshold > s}. + int m = 1 << BitLength(s / kBurnikelThreshold); + // 2. Set j = roundup(s/m) and n = j * m. + int j = DIV_CEIL(s, m); + int n = j * m; + // 3. Set sigma = max{tao | 2^tao * B < 2^(kDigitBits * n)}. + int sigma = CountLeadingZeros(B[s - 1]); + int digit_shift = n - s; + // 4. Set B = B * 2^sigma to normalize B. Shift A by the same amount. + ScratchDigits B_shifted(n); + LeftShift(B_shifted + digit_shift, B, sigma); + for (int i = 0; i < digit_shift; i++) B_shifted[i] = 0; + B = B_shifted; + // We need an extra digit if A's top digit does not have enough space for + // the left-shift by {sigma}. Additionally, the top bit of A must be 0 + // (see "-1" in step 5 below), which combined with B being normalized (i.e. + // B's top bit is 1) ensures the preconditions of the helper functions. + int extra_digit = CountLeadingZeros(A[r - 1]) < (sigma + 1) ? 1 : 0; + r = A.len() + digit_shift + extra_digit; + ScratchDigits A_shifted(r); + LeftShift(A_shifted + digit_shift, A, sigma); + for (int i = 0; i < digit_shift; i++) A_shifted[i] = 0; + A = A_shifted; + // 5. Set t = min{t >= 2 | A < 2^(kDigitBits * t * n - 1)}. + int t = std::max(DIV_CEIL(r, n), 2); + // 6. Split A conceptually into t blocks. + // 7. Set Z_(t-2) = [A_(t-1), A_(t-2)]. + int z_len = n * 2; + ScratchDigits Z(z_len); + PutAt(Z, A + n * (t - 2), z_len); + // 8. For i from t-2 downto 0 do: + BZ bz(this, n); + ScratchDigits Ri(n); + { + // First iteration unrolled and specialized. + // We might not have n digits at the top of Q, so use temporary storage + // for Qi... + ScratchDigits Qi(n); + bz.D2n1n(Qi, Ri, Z, B); + if (should_terminate()) return; + // ...but there *will* be enough space for any non-zero result digits! + Qi.Normalize(); + RWDigits target = Q + n * (t - 2); + DCHECK(Qi.len() <= target.len()); + PutAt(target, Qi, target.len()); + } + // Now loop over any remaining iterations. + for (int i = t - 3; i >= 0; i--) { + // 8b. If i > 0, set Z_(i-1) = [Ri, A_(i-1)]. + // (De-duped with unrolled first iteration, hence reading A_(i).) + PutAt(Z + n, Ri, n); + PutAt(Z, A + n * i, n); + // 8a. Using algorithm D2n1n compute Qi, Ri such that Zi = B*Qi + Ri. + RWDigits Qi(Q, i * n, n); + bz.D2n1n(Qi, Ri, Z, B); + if (should_terminate()) return; + } + // 9. Return Q = [Q_(t-2), ..., Q_0] and R = R_0 * 2^(-sigma). 
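To make the block-size computation above concrete (illustrative numbers, not from the patch): with kBurnikelThreshold = 57 and a divisor of s = 130 digits, s / 57 = 2, so m = 1 << BitLength(2) = 4 is the smallest power of two with m * 57 > s. Then j = ceil(130 / 4) = 33 and n = j * m = 132, so B is padded with digit_shift = n - s = 2 zero digits on top of the bit-level shift by sigma. A dividend of roughly twice that size, say 260 digits, gives t = max(ceil(r / n), 2) = 2 blocks, i.e. only the unrolled first D2n1n call runs.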
+#if DEBUG + for (int i = 0; i < digit_shift; i++) { + DCHECK(Ri[i] == 0); // NOLINT(readability/check) + } +#endif + if (R.len() != 0) { + Digits Ri_part(Ri, digit_shift, Ri.len()); + Ri_part.Normalize(); + DCHECK(Ri_part.len() <= R.len()); + RightShift(R, Ri_part, sigma); + } +} + +} // namespace bigint +} // namespace v8 diff --git a/deps/v8/src/bigint/div-helpers.cc b/deps/v8/src/bigint/div-helpers.cc new file mode 100644 index 0000000000..0dfca0b02c --- /dev/null +++ b/deps/v8/src/bigint/div-helpers.cc @@ -0,0 +1,68 @@ +// Copyright 2021 the V8 project authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. + +#include "src/bigint/div-helpers.h" + +#include "src/bigint/bigint-internal.h" + +namespace v8 { +namespace bigint { + +namespace { + +void Copy(RWDigits Z, Digits X) { + if (Z == X) return; + int i = 0; + for (; i < X.len(); i++) Z[i] = X[i]; + for (; i < Z.len(); i++) Z[i] = 0; +} + +} // namespace + +// Z := X << shift +// Z and X may alias for an in-place shift. +void LeftShift(RWDigits Z, Digits X, int shift) { + DCHECK(shift >= 0); // NOLINT(readability/check) + DCHECK(shift < kDigitBits); + DCHECK(Z.len() >= X.len()); + if (shift == 0) return Copy(Z, X); + digit_t carry = 0; + int i = 0; + for (; i < X.len(); i++) { + digit_t d = X[i]; + Z[i] = (d << shift) | carry; + carry = d >> (kDigitBits - shift); + } + if (i < Z.len()) { + Z[i++] = carry; + } else { + DCHECK(carry == 0); // NOLINT(readability/check) + } + for (; i < Z.len(); i++) Z[i] = 0; +} + +// Z := X >> shift +// Z and X may alias for an in-place shift. +void RightShift(RWDigits Z, Digits X, int shift) { + DCHECK(shift >= 0); // NOLINT(readability/check) + DCHECK(shift < kDigitBits); + X.Normalize(); + DCHECK(Z.len() >= X.len()); + if (shift == 0) return Copy(Z, X); + int i = 0; + if (X.len() > 0) { + digit_t carry = X[0] >> shift; + int last = X.len() - 1; + for (; i < last; i++) { + digit_t d = X[i + 1]; + Z[i] = (d << (kDigitBits - shift)) | carry; + carry = d >> shift; + } + Z[i++] = carry; + } + for (; i < Z.len(); i++) Z[i] = 0; +} + +} // namespace bigint +} // namespace v8 diff --git a/deps/v8/src/bigint/div-helpers.h b/deps/v8/src/bigint/div-helpers.h new file mode 100644 index 0000000000..db5da3e128 --- /dev/null +++ b/deps/v8/src/bigint/div-helpers.h @@ -0,0 +1,82 @@ +// Copyright 2021 the V8 project authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. + +#ifndef V8_BIGINT_DIV_HELPERS_H_ +#define V8_BIGINT_DIV_HELPERS_H_ + +#include <memory> + +#include "src/bigint/bigint.h" +#include "src/bigint/util.h" + +namespace v8 { +namespace bigint { + +void LeftShift(RWDigits Z, Digits X, int shift); +void RightShift(RWDigits Z, Digits X, int shift); + +inline void PutAt(RWDigits Z, Digits A, int count) { + int len = std::min(A.len(), count); + int i = 0; + for (; i < len; i++) Z[i] = A[i]; + for (; i < count; i++) Z[i] = 0; +} + +// Division algorithms typically need to left-shift their inputs into +// "bit-normalized" form (i.e. top bit is set). The inputs are considered +// read-only, and V8 relies on that by allowing concurrent reads from them, +// so by default, {ShiftedDigits} allocate temporary storage for their +// contents. In-place modification is opt-in for cases where callers can +// guarantee that it is safe. +// When callers allow in-place shifting and wish to undo it, they have to do +// so manually using {Reset()}. 
+// If {shift} is not given, it is auto-detected from {original}'s +// leading zeros. +class ShiftedDigits : public Digits { + public: + explicit ShiftedDigits(Digits& original, int shift = -1, + bool allow_inplace = false) + : Digits(original.digits_, original.len_) { + int leading_zeros = CountLeadingZeros(original.msd()); + if (shift < 0) { + shift = leading_zeros; + } else if (shift > leading_zeros) { + allow_inplace = false; + len_++; + } + shift_ = shift; + if (shift == 0) { + inplace_ = true; + return; + } + inplace_ = allow_inplace; + if (!inplace_) { + digit_t* digits = new digit_t[len_]; + storage_.reset(digits); + digits_ = digits; + } + RWDigits rw_view(digits_, len_); + LeftShift(rw_view, original, shift_); + } + ~ShiftedDigits() = default; + + void Reset() { + if (inplace_) { + RWDigits rw_view(digits_, len_); + RightShift(rw_view, rw_view, shift_); + } + } + + int shift() { return shift_; } + + private: + int shift_; + bool inplace_; + std::unique_ptr<digit_t[]> storage_; +}; + +} // namespace bigint +} // namespace v8 + +#endif // V8_BIGINT_DIV_HELPERS_H_ diff --git a/deps/v8/src/bigint/div-schoolbook.cc b/deps/v8/src/bigint/div-schoolbook.cc new file mode 100644 index 0000000000..a6295c573c --- /dev/null +++ b/deps/v8/src/bigint/div-schoolbook.cc @@ -0,0 +1,190 @@ +// Copyright 2021 the V8 project authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. + +// "Schoolbook" division. This is loosely based on Go's implementation +// found at https://golang.org/src/math/big/nat.go, licensed as follows: +// +// Copyright 2009 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file [1]. +// +// [1] https://golang.org/LICENSE + +#include <limits> + +#include "src/bigint/bigint-internal.h" +#include "src/bigint/digit-arithmetic.h" +#include "src/bigint/div-helpers.h" +#include "src/bigint/util.h" +#include "src/bigint/vector-arithmetic.h" + +namespace v8 { +namespace bigint { + +// Computes Q(uotient) and remainder for A/b, such that +// Q = (A - remainder) / b, with 0 <= remainder < b. +// If Q.len == 0, only the remainder will be returned. +// Q may be the same as A for an in-place division. +void ProcessorImpl::DivideSingle(RWDigits Q, digit_t* remainder, Digits A, + digit_t b) { + DCHECK(b != 0); // NOLINT(readability/check) + DCHECK(A.len() > 0); // NOLINT(readability/check) + *remainder = 0; + int length = A.len(); + if (Q.len() != 0) { + if (A[length - 1] >= b) { + DCHECK(Q.len() >= A.len()); + for (int i = length - 1; i >= 0; i--) { + Q[i] = digit_div(*remainder, A[i], b, remainder); + } + for (int i = length; i < Q.len(); i++) Q[i] = 0; + } else { + DCHECK(Q.len() >= A.len() - 1); + *remainder = A[length - 1]; + for (int i = length - 2; i >= 0; i--) { + Q[i] = digit_div(*remainder, A[i], b, remainder); + } + for (int i = length - 1; i < Q.len(); i++) Q[i] = 0; + } + } else { + for (int i = length - 1; i >= 0; i--) { + digit_div(*remainder, A[i], b, remainder); + } + } +} + +// Z += X. Returns the "carry" (0 or 1) after adding all of X's digits. +inline digit_t InplaceAdd(RWDigits Z, Digits X) { + return AddAndReturnCarry(Z, Z, X); +} + +// Z -= X. Returns the "borrow" (0 or 1) after subtracting all of X's digits. +inline digit_t InplaceSub(RWDigits Z, Digits X) { + return SubtractAndReturnBorrow(Z, Z, X); +} + +// Returns whether (factor1 * factor2) > (high << kDigitBits) + low. 
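A brief usage sketch (illustration only, not from the patch) of the opt-in in-place mode of ShiftedDigits described above:

ShiftedDigits b_shifted(B, -1 /* auto-detect shift */, true /* allow in place */);
// ... use b_shifted (and b_shifted.shift()) as a bit-normalized divisor ...
b_shifted.Reset();  // shifts the digits back so B's storage is unchanged again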
+bool ProductGreaterThan(digit_t factor1, digit_t factor2, digit_t high, + digit_t low) { + digit_t result_high; + digit_t result_low = digit_mul(factor1, factor2, &result_high); + return result_high > high || (result_high == high && result_low > low); +} + +#if DEBUG +bool QLengthOK(Digits Q, Digits A, Digits B) { + // If A's top B.len digits are greater than or equal to B, then the division + // result will be greater than A.len - B.len, otherwise it will be that + // difference. Intuitively: 100/10 has 2 digits, 100/11 has 1. + if (GreaterThanOrEqual(Digits(A, A.len() - B.len(), B.len()), B)) { + return Q.len() >= A.len() - B.len() + 1; + } + return Q.len() >= A.len() - B.len(); +} +#endif + +// Computes Q(uotient) and R(emainder) for A/B, such that +// Q = (A - R) / B, with 0 <= R < B. +// Both Q and R are optional: callers that are only interested in one of them +// can pass the other with len == 0. +// If Q is present, its length must be at least A.len - B.len + 1. +// If R is present, its length must be at least B.len. +// See Knuth, Volume 2, section 4.3.1, Algorithm D. +void ProcessorImpl::DivideSchoolbook(RWDigits Q, RWDigits R, Digits A, + Digits B) { + // NOLINTNEXTLINE(readability/check) + DCHECK(B.len() >= 2); // Use DivideSingle otherwise. + DCHECK(A.len() >= B.len()); // No-op otherwise. + DCHECK(Q.len() == 0 || QLengthOK(Q, A, B)); + DCHECK(R.len() == 0 || R.len() >= B.len()); + // The unusual variable names inside this function are consistent with + // Knuth's book, as well as with Go's implementation of this algorithm. + // Maintaining this consistency is probably more useful than trying to + // come up with more descriptive names for them. + const int n = B.len(); + const int m = A.len() - n; + + // In each iteration, {qhatv} holds {divisor} * {current quotient digit}. + // "v" is the book's name for {divisor}, "qhat" the current quotient digit. + ScratchDigits qhatv(n + 1); + + // D1. + // Left-shift inputs so that the divisor's MSB is set. This is necessary + // to prevent the digit-wise divisions (see digit_div call below) from + // overflowing (they take a two digits wide input, and return a one digit + // result). + ShiftedDigits b_normalized(B); + B = b_normalized; + // U holds the (continuously updated) remaining part of the dividend, which + // eventually becomes the remainder. + ScratchDigits U(A.len() + 1); + LeftShift(U, A, b_normalized.shift()); + + // D2. + // Iterate over the dividend's digits (like the "grad school" algorithm). + // {vn1} is the divisor's most significant digit. + digit_t vn1 = B[n - 1]; + for (int j = m; j >= 0; j--) { + // D3. + // Estimate the current iteration's quotient digit (see Knuth for details). + // {qhat} is the current quotient digit. + digit_t qhat = std::numeric_limits<digit_t>::max(); + // {ujn} is the dividend's most significant remaining digit. + digit_t ujn = U[j + n]; + if (ujn != vn1) { + // {rhat} is the current iteration's remainder. + digit_t rhat = 0; + // Estimate the current quotient digit by dividing the most significant + // digits of dividend and divisor. The result will not be too small, + // but could be a bit too large. + qhat = digit_div(ujn, U[j + n - 1], vn1, &rhat); + + // Decrement the quotient estimate as needed by looking at the next + // digit, i.e. by testing whether + // qhat * v_{n-2} > (rhat << kDigitBits) + u_{j+n-2}. 
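The estimate-and-correct step below is easiest to follow in base 10 (the code does the same thing in base 2^kDigitBits). Illustration: divisor v = 59 (v1 = 5, v0 = 9, already normalized), dividend window u = 2, 8, 9:

  qhat = floor(28 / 5) = 5, rhat = 28 - 5*5 = 3
  test: qhat*v0 = 45 > rhat*10 + 9 = 39, so qhat is too big: qhat = 4, rhat = 3 + 5 = 8
  re-test: 4*9 = 36 > 8*10 + 9 = 89? No, so keep qhat = 4
  and indeed floor(289 / 59) = 4.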
+ digit_t vn2 = B[n - 2]; + digit_t ujn2 = U[j + n - 2]; + while (ProductGreaterThan(qhat, vn2, rhat, ujn2)) { + qhat--; + digit_t prev_rhat = rhat; + rhat += vn1; + // v[n-1] >= 0, so this tests for overflow. + if (rhat < prev_rhat) break; + } + } + + // D4. + // Multiply the divisor with the current quotient digit, and subtract + // it from the dividend. If there was "borrow", then the quotient digit + // was one too high, so we must correct it and undo one subtraction of + // the (shifted) divisor. + if (qhat == 0) { + qhatv.Clear(); + } else { + MultiplySingle(qhatv, B, qhat); + } + digit_t c = InplaceSub(U + j, qhatv); + if (c != 0) { + c = InplaceAdd(U + j, B); + U[j + n] = U[j + n] + c; + qhat--; + } + + if (Q.len() != 0) { + if (j >= Q.len()) { + DCHECK(qhat == 0); // NOLINT(readability/check) + } else { + Q[j] = qhat; + } + } + } + if (R.len() != 0) { + RightShift(R, U, b_normalized.shift()); + } + // If Q has extra storage, clear it. + for (int i = m + 1; i < Q.len(); i++) Q[i] = 0; +} + +} // namespace bigint +} // namespace v8 diff --git a/deps/v8/src/bigint/mul-fft.cc b/deps/v8/src/bigint/mul-fft.cc new file mode 100644 index 0000000000..a3971a7276 --- /dev/null +++ b/deps/v8/src/bigint/mul-fft.cc @@ -0,0 +1,790 @@ +// Copyright 2021 the V8 project authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. + +// FFT-based multiplication, due to Schönhage and Strassen. +// This implementation mostly follows the description given in: +// Christoph Lüders: Fast Multiplication of Large Integers, +// http://arxiv.org/abs/1503.04955 + +#include "src/bigint/bigint-internal.h" +#include "src/bigint/digit-arithmetic.h" +#include "src/bigint/util.h" +#include "src/bigint/vector-arithmetic.h" + +namespace v8 { +namespace bigint { + +namespace { + +//////////////////////////////////////////////////////////////////////////////// +// Part 1: Functions for "mod F_n" arithmetic. +// F_n is of the shape 2^K + 1, and for convenience we use K to count the +// number of digits rather than the number of bits, so F_n (or K) are implicit +// and deduced from the length {len} of the digits array. + +// Helper function for {ModFn} below. +void ModFn_Helper(digit_t* x, int len, signed_digit_t high) { + if (high > 0) { + digit_t borrow = high; + x[len - 1] = 0; + for (int i = 0; i < len; i++) { + x[i] = digit_sub(x[i], borrow, &borrow); + if (borrow == 0) break; + } + } else { + digit_t carry = -high; + x[len - 1] = 0; + for (int i = 0; i < len; i++) { + x[i] = digit_add2(x[i], carry, &carry); + if (carry == 0) break; + } + } +} + +// {x} := {x} mod F_n, assuming that {x} is "slightly" larger than F_n (e.g. +// after addition of two numbers that were mod-F_n-normalized before). +void ModFn(digit_t* x, int len) { + int K = len - 1; + signed_digit_t high = x[K]; + if (high == 0) return; + ModFn_Helper(x, len, high); + high = x[K]; + if (high == 0) return; + DCHECK(high == 1 || high == -1); + ModFn_Helper(x, len, high); + high = x[K]; + if (high == -1) ModFn_Helper(x, len, high); +} + +// {dest} := {src} mod F_n, assuming that {src} is about twice as long as F_n +// (e.g. after multiplication of two numbers that were mod-F_n-normalized +// before). +// {len} is length of {dest}; {src} is twice as long. 
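Why this digit-wise subtraction yields the remainder: 2^K ≡ -1 (mod 2^K + 1), so a double-width value hi*2^K + lo is congruent to lo - hi. Toy check with K written in bits and kept tiny (modulus 2^8 + 1 = 257): 60000 = 234*256 + 96, and 96 - 234 = -138 ≡ 119 (mod 257), which is exactly 60000 mod 257. A possible underflow shows up as the borrow that the final ModFn call then absorbs.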
+void ModFnDoubleWidth(digit_t* dest, const digit_t* src, int len) { + int K = len - 1; + digit_t borrow = 0; + for (int i = 0; i < K; i++) { + dest[i] = digit_sub2(src[i], src[i + K], borrow, &borrow); + } + dest[K] = digit_sub2(0, src[2 * K], borrow, &borrow); + // {borrow} may be non-zero here, that's OK as {ModFn} will take care of it. + ModFn(dest, len); +} + +// Sets {sum} := {a} + {b} and {diff} := {a} - {b}, which is more efficient +// than computing sum and difference separately. Applies "mod F_n" normalization +// to both results. +void SumDiff(digit_t* sum, digit_t* diff, const digit_t* a, const digit_t* b, + int len) { + digit_t carry = 0; + digit_t borrow = 0; + for (int i = 0; i < len; i++) { + // Read both values first, because inputs and outputs can overlap. + digit_t ai = a[i]; + digit_t bi = b[i]; + sum[i] = digit_add3(ai, bi, carry, &carry); + diff[i] = digit_sub2(ai, bi, borrow, &borrow); + } + ModFn(sum, len); + ModFn(diff, len); +} + +// {result} := ({input} << shift) mod F_n, where shift >= K. +void ShiftModFn_Large(digit_t* result, const digit_t* input, int digit_shift, + int bits_shift, int K) { + // If {digit_shift} is greater than K, we use the following transformation + // (where, since everything is mod 2^K + 1, we are allowed to add or + // subtract any multiple of 2^K + 1 at any time): + // x * 2^{K+m} mod 2^K + 1 + // == x * 2^K * 2^m - (2^K + 1)*(x * 2^m) mod 2^K + 1 + // == x * 2^K * 2^m - x * 2^K * 2^m - x * 2^m mod 2^K + 1 + // == -x * 2^m mod 2^K + 1 + // So the flow is the same as for m < K, but we invert the subtraction's + // operands. In order to avoid underflow, we virtually initialize the + // result to 2^K + 1: + // input = [ iK ][iK-1] .... .... [ i1 ][ i0 ] + // result = [ 1][0000] .... .... [0000][0001] + // + [ iK ] .... [ iX ] + // - [iX-1] .... [ i0 ] + DCHECK(digit_shift >= K); + digit_shift -= K; + digit_t borrow = 0; + if (bits_shift == 0) { + digit_t carry = 1; + for (int i = 0; i < digit_shift; i++) { + result[i] = digit_add2(input[i + K - digit_shift], carry, &carry); + } + result[digit_shift] = digit_sub(input[K] + carry, input[0], &borrow); + for (int i = digit_shift + 1; i < K; i++) { + digit_t d = input[i - digit_shift]; + result[i] = digit_sub2(0, d, borrow, &borrow); + } + } else { + digit_t add_carry = 1; + digit_t input_carry = + input[K - digit_shift - 1] >> (kDigitBits - bits_shift); + for (int i = 0; i < digit_shift; i++) { + digit_t d = input[i + K - digit_shift]; + digit_t summand = (d << bits_shift) | input_carry; + result[i] = digit_add2(summand, add_carry, &add_carry); + input_carry = d >> (kDigitBits - bits_shift); + } + { + // result[digit_shift] = (add_carry + iK_part) - i0_part + digit_t d = input[K]; + digit_t iK_part = (d << bits_shift) | input_carry; + digit_t iK_carry = d >> (kDigitBits - bits_shift); + digit_t sum = digit_add2(add_carry, iK_part, &add_carry); + // {iK_carry} is less than a full digit, so we can merge {add_carry} + // into it without overflow. 
+ iK_carry += add_carry; + d = input[0]; + digit_t i0_part = d << bits_shift; + result[digit_shift] = digit_sub(sum, i0_part, &borrow); + input_carry = d >> (kDigitBits - bits_shift); + if (digit_shift + 1 < K) { + digit_t d = input[1]; + digit_t subtrahend = (d << bits_shift) | input_carry; + result[digit_shift + 1] = + digit_sub2(iK_carry, subtrahend, borrow, &borrow); + input_carry = d >> (kDigitBits - bits_shift); + } + } + for (int i = digit_shift + 2; i < K; i++) { + digit_t d = input[i - digit_shift]; + digit_t subtrahend = (d << bits_shift) | input_carry; + result[i] = digit_sub2(0, subtrahend, borrow, &borrow); + input_carry = d >> (kDigitBits - bits_shift); + } + } + // The virtual 1 in result[K] should be eliminated by {borrow}. If there + // is no borrow, then the virtual initialization was too much. Subtract + // 2^K + 1. + result[K] = 0; + if (borrow != 1) { + borrow = 1; + for (int i = 0; i < K; i++) { + result[i] = digit_sub(result[i], borrow, &borrow); + if (borrow == 0) break; + } + if (borrow != 0) { + // The result must be 2^K. + for (int i = 0; i < K; i++) result[i] = 0; + result[K] = 1; + } + } +} + +// Sets {result} := {input} * 2^{power_of_two} mod 2^{K} + 1. +// This function is highly relevant for overall performance. +void ShiftModFn(digit_t* result, const digit_t* input, int power_of_two, int K, + int zero_above = 0x7FFFFFFF) { + // The modulo-reduction amounts to a subtraction, which we combine + // with the shift as follows: + // input = [ iK ][iK-1] .... .... [ i1 ][ i0 ] + // result = [iX-1] .... [ i0 ] <<<<<<<<<<< shift by {power_of_two} + // - [ iK ] .... [ iX ] + // where "X" is the index "K - digit_shift". + int digit_shift = power_of_two / kDigitBits; + int bits_shift = power_of_two % kDigitBits; + // By an analogous construction to the "digit_shift >= K" case, + // it turns out that: + // x * 2^{2K+m} == x * 2^m mod 2^K + 1. + while (digit_shift >= 2 * K) digit_shift -= 2 * K; // Faster than '%'! + if (digit_shift >= K) { + return ShiftModFn_Large(result, input, digit_shift, bits_shift, K); + } + digit_t borrow = 0; + if (bits_shift == 0) { + // We do a single pass over {input}, starting by copying digits [i1] to + // [iX-1] to result indices digit_shift+1 to K-1. + int i = 1; + // Read input digits unless we know they are zero. + int cap = std::min(K - digit_shift, zero_above); + for (; i < cap; i++) { + result[i + digit_shift] = input[i]; + } + // Any remaining work can hard-code the knowledge that input[i] == 0. + for (; i < K - digit_shift; i++) { + DCHECK(input[i] == 0); // NOLINT(readability/check) + result[i + digit_shift] = 0; + } + // Second phase: subtract input digits [iX] to [iK] from (virtually) zero- + // initialized result indices 0 to digit_shift-1. + cap = std::min(K, zero_above); + for (; i < cap; i++) { + digit_t d = input[i]; + result[i - K + digit_shift] = digit_sub2(0, d, borrow, &borrow); + } + // Any remaining work can hard-code the knowledge that input[i] == 0. + for (; i < K; i++) { + DCHECK(input[i] == 0); // NOLINT(readability/check) + result[i - K + digit_shift] = digit_sub(0, borrow, &borrow); + } + // Last step: subtract [iK] from [i0] and store at result index digit_shift. + result[digit_shift] = digit_sub2(input[0], input[K], borrow, &borrow); + } else { + // Same flow as before, but taking bits_shift != 0 into account. + // First phase: result indices digit_shift+1 to K. + digit_t carry = 0; + int i = 0; + // Read input digits unless we know they are zero. 
+ int cap = std::min(K - digit_shift, zero_above); + for (; i < cap; i++) { + digit_t d = input[i]; + result[i + digit_shift] = (d << bits_shift) | carry; + carry = d >> (kDigitBits - bits_shift); + } + // Any remaining work can hard-code the knowledge that input[i] == 0. + for (; i < K - digit_shift; i++) { + DCHECK(input[i] == 0); // NOLINT(readability/check) + result[i + digit_shift] = carry; + carry = 0; + } + // Second phase: result indices 0 to digit_shift - 1. + cap = std::min(K, zero_above); + for (; i < cap; i++) { + digit_t d = input[i]; + result[i - K + digit_shift] = + digit_sub2(0, (d << bits_shift) | carry, borrow, &borrow); + carry = d >> (kDigitBits - bits_shift); + } + // Any remaining work can hard-code the knowledge that input[i] == 0. + if (i < K) { + DCHECK(input[i] == 0); // NOLINT(readability/check) + result[i - K + digit_shift] = digit_sub2(0, carry, borrow, &borrow); + carry = 0; + i++; + } + for (; i < K; i++) { + DCHECK(input[i] == 0); // NOLINT(readability/check) + result[i - K + digit_shift] = digit_sub(0, borrow, &borrow); + } + // Last step: compute result[digit_shift]. + digit_t d = input[K]; + result[digit_shift] = digit_sub2( + result[digit_shift], (d << bits_shift) | carry, borrow, &borrow); + // No carry left. + DCHECK((d >> (kDigitBits - bits_shift)) == 0); // NOLINT(readability/check) + } + result[K] = 0; + for (int i = digit_shift + 1; i <= K && borrow > 0; i++) { + result[i] = digit_sub(result[i], borrow, &borrow); + } + if (borrow > 0) { + // Underflow means we subtracted too much. Add 2^K + 1. + digit_t carry = 1; + for (int i = 0; i <= K; i++) { + result[i] = digit_add2(result[i], carry, &carry); + if (carry == 0) break; + } + result[K] = digit_add2(result[K], 1, &carry); + } +} + +//////////////////////////////////////////////////////////////////////////////// +// Part 2: FFT-based multiplication is very sensitive to appropriate choice +// of parameters. The following functions choose the parameters that the +// subsequent actual computation will use. This is partially based on formal +// constraints and partially on experimentally-determined heuristics. + +struct Parameters { + // We never use the default values, but skipping zero-initialization + // of these fields saddens and confuses MSan. + int m{0}; + int K{0}; + int n{0}; + int s{0}; + int r{0}; +}; + +// Computes parameters for the main calculation, given a bit length {N} and +// an {m}. See the paper for details. +void ComputeParameters(int N, int m, Parameters* params) { + N *= kDigitBits; + int n = 1 << m; // 2^m + int nhalf = n >> 1; + int s = (N + n - 1) >> m; // ceil(N/n) + s = RoundUp(s, kDigitBits); + int K = m + 2 * s + 1; // K must be at least this big... + K = RoundUp(K, nhalf); // ...and a multiple of n/2. + int r = K >> (m - 1); // Which multiple? + + // We want recursive calls to make progress, so force K to be a multiple + // of 8 if it's above the recursion threshold. Otherwise, K must be a + // multiple of kDigitBits. + const int threshold = (K + 1 >= kFftInnerThreshold * kDigitBits) + ? 3 + kLog2DigitBits + : kLog2DigitBits; + int K_tz = CountTrailingZeros(K); + while (K_tz < threshold) { + K += (1 << K_tz); + r = K >> (m - 1); + K_tz = CountTrailingZeros(K); + } + + DCHECK(K % kDigitBits == 0); // NOLINT(readability/check) + DCHECK(s % kDigitBits == 0); // NOLINT(readability/check) + params->K = K / kDigitBits; + params->s = s / kDigitBits; + params->n = n; + params->r = r; +} + +// Computes parameters for recursive invocations ("inner layer"). 
+void ComputeParameters_Inner(int N, Parameters* params) { + int max_m = CountTrailingZeros(N); + int N_bits = BitLength(N); + int m = N_bits - 4; // Don't let s get too small. + m = std::min(max_m, m); + N *= kDigitBits; + int n = 1 << m; // 2^m + // We can't round up s in the inner layer, because N = n*s is fixed. + int s = N >> m; + DCHECK(N == s * n); + int K = m + 2 * s + 1; // K must be at least this big... + K = RoundUp(K, n); // ...and a multiple of n and kDigitBits. + K = RoundUp(K, kDigitBits); + params->r = K >> m; // Which multiple? + DCHECK(K % kDigitBits == 0); // NOLINT(readability/check) + DCHECK(s % kDigitBits == 0); // NOLINT(readability/check) + params->K = K / kDigitBits; + params->s = s / kDigitBits; + params->n = n; + params->m = m; +} + +int PredictInnerK(int N) { + Parameters params; + ComputeParameters_Inner(N, ¶ms); + return params.K; +} + +// Applies heuristics to decide whether {m} should be decremented, by looking +// at what would happen to {K} and {s} if {m} was decremented. +bool ShouldDecrementM(const Parameters& current, const Parameters& next, + const Parameters& after_next) { + // K == 64 seems to work particularly well. + if (current.K == 64 && next.K >= 112) return false; + // Small values for s are never efficient. + if (current.s < 6) return true; + // The time is roughly determined by K * n. When we decrement m, then + // n always halves, and K usually gets bigger, by up to 2x. + // For not-quite-so-small s, look at how much bigger K would get: if + // the K increase is small enough, making n smaller is worth it. + // Empirically, it's most meaningful to look at the K *after* next. + // The specific threshold values have been chosen by running many + // benchmarks on inputs of many sizes, and manually selecting thresholds + // that seemed to produce good results. + double factor = static_cast<double>(after_next.K) / current.K; + if ((current.s == 6 && factor < 3.85) || // -- + (current.s == 7 && factor < 3.73) || // -- + (current.s == 8 && factor < 3.55) || // -- + (current.s == 9 && factor < 3.50) || // -- + factor < 3.4) { + return true; + } + // If K is just below the recursion threshold, make sure we do recurse, + // unless doing so would be particularly inefficient (large inner_K). + // If K is just above the recursion threshold, doubling it often makes + // the inner call more efficient. + if (current.K >= 160 && current.K < 250 && PredictInnerK(next.K) < 28) { + return true; + } + // If we found no reason to decrement, keep m as large as possible. + return false; +} + +// Decides what parameters to use for a given input bit length {N}. +// Returns the chosen m. +int GetParameters(int N, Parameters* params) { + int N_bits = BitLength(N); + int max_m = N_bits - 3; // Larger m make s too small. + max_m = std::max(kLog2DigitBits, max_m); // Smaller m break the logic below. + int m = max_m; + Parameters current; + ComputeParameters(N, m, ¤t); + Parameters next; + ComputeParameters(N, m - 1, &next); + while (m > 2) { + Parameters after_next; + ComputeParameters(N, m - 2, &after_next); + if (ShouldDecrementM(current, next, after_next)) { + m--; + current = next; + next = after_next; + } else { + break; + } + } + *params = current; + return m; +} + +//////////////////////////////////////////////////////////////////////////////// +// Part 3: Fast Fourier Transformation. + +class FFTContainer { + public: + // {n} is the number of chunks, whose length is {K}+1. + // {K} determines F_n = 2^(K * kDigitBits) + 1. 
+ FFTContainer(int n, int K, ProcessorImpl* processor) + : n_(n), K_(K), length_(K + 1), processor_(processor) { + storage_ = new digit_t[length_ * n_]; + part_ = new digit_t*[n_]; + digit_t* ptr = storage_; + for (int i = 0; i < n; i++, ptr += length_) { + part_[i] = ptr; + } + temp_ = new digit_t[length_ * 2]; + } + FFTContainer() = delete; + FFTContainer(const FFTContainer&) = delete; + FFTContainer& operator=(const FFTContainer&) = delete; + + ~FFTContainer() { + delete[] storage_; + delete[] part_; + delete[] temp_; + } + + void Start_Default(Digits X, int chunk_size, int theta, int omega); + void Start(Digits X, int chunk_size, int theta, int omega); + + void NormalizeAndRecombine(int omega, int m, RWDigits Z, int chunk_size); + void CounterWeightAndRecombine(int theta, int m, RWDigits Z, int chunk_size); + + void FFT_ReturnShuffledThreadsafe(int start, int len, int omega, + digit_t* temp); + void FFT_Recurse(int start, int half, int omega, digit_t* temp); + + void BackwardFFT(int start, int len, int omega); + void BackwardFFT_Threadsafe(int start, int len, int omega, digit_t* temp); + + void PointwiseMultiply(const FFTContainer& other); + void DoPointwiseMultiplication(const FFTContainer& other, int start, int end, + digit_t* temp); + + int length() const { return length_; } + + private: + const int n_; // Number of parts. + const int K_; // Always length_ - 1. + const int length_; // Length of each part, in digits. + ProcessorImpl* processor_; + digit_t* storage_; // Combined storage of all parts. + digit_t** part_; // Pointers to each part. + digit_t* temp_; // Temporary storage with size 2 * length_. +}; + +inline void CopyAndZeroExtend(digit_t* dst, const digit_t* src, + int digits_to_copy, size_t total_bytes) { + size_t bytes_to_copy = digits_to_copy * sizeof(digit_t); + memcpy(dst, src, bytes_to_copy); + memset(dst + digits_to_copy, 0, total_bytes - bytes_to_copy); +} + +// Reads {X} into the FFTContainer's internal storage, dividing it into chunks +// while doing so; then performs the forward FFT. +void FFTContainer::Start_Default(Digits X, int chunk_size, int theta, + int omega) { + int len = X.len(); + const digit_t* pointer = X.digits(); + const size_t part_length_in_bytes = length_ * sizeof(digit_t); + int current_theta = 0; + int i = 0; + for (; i < n_ && len > 0; i++, current_theta += theta) { + chunk_size = std::min(chunk_size, len); + // For invocations via MultiplyFFT_Inner, X.len() == n_ * chunk_size + 1, + // because the outer layer's "K" is passed as the inner layer's "N". + // Since X is (mod Fn)-normalized on the outer layer, there is the rare + // corner case where X[n_ * chunk_size] == 1. Detect that case, and handle + // the extra bit as part of the last chunk; we always have the space. + if (i == n_ - 1 && len == chunk_size + 1) { + DCHECK(X[n_ * chunk_size] <= 1); // NOLINT(readability/check) + DCHECK(length_ >= chunk_size + 1); + chunk_size++; + } + if (current_theta != 0) { + // Multiply with theta^i, and reduce modulo 2^K + 1. + // We pass theta as a shift amount; it really means 2^theta. 
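The reason multiplying by theta^i reduces to a call to ShiftModFn: in the ring mod F_n = 2^(K*kDigitBits) + 1, the number 2 is a 2*K*kDigitBits-th root of unity, because 2^(K*kDigitBits) ≡ -1 and squaring that gives 1. Toy check with an 8-bit modulus (257): 2^8 = 256 ≡ -1 and 2^16 = 65536 = 255*257 + 1 ≡ 1. All twiddle factors are therefore powers of two, and multiplying by them is a modular bit shift, which is exactly what ShiftModFn performs.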
+ CopyAndZeroExtend(temp_, pointer, chunk_size, part_length_in_bytes); + ShiftModFn(part_[i], temp_, current_theta, K_, chunk_size); + } else { + CopyAndZeroExtend(part_[i], pointer, chunk_size, part_length_in_bytes); + } + pointer += chunk_size; + len -= chunk_size; + } + DCHECK(len == 0); // NOLINT(readability/check) + for (; i < n_; i++) { + memset(part_[i], 0, part_length_in_bytes); + } + FFT_ReturnShuffledThreadsafe(0, n_, omega, temp_); +} + +// This version of Start is optimized for the case where ~half of the +// container will be filled with padding zeros. +void FFTContainer::Start(Digits X, int chunk_size, int theta, int omega) { + int len = X.len(); + if (len > n_ * chunk_size / 2) { + return Start_Default(X, chunk_size, theta, omega); + } + DCHECK(theta == 0); // NOLINT(readability/check) + const digit_t* pointer = X.digits(); + const size_t part_length_in_bytes = length_ * sizeof(digit_t); + int nhalf = n_ / 2; + // Unrolled first iteration. + CopyAndZeroExtend(part_[0], pointer, chunk_size, part_length_in_bytes); + CopyAndZeroExtend(part_[nhalf], pointer, chunk_size, part_length_in_bytes); + pointer += chunk_size; + len -= chunk_size; + int i = 1; + for (; i < nhalf && len > 0; i++) { + chunk_size = std::min(chunk_size, len); + CopyAndZeroExtend(part_[i], pointer, chunk_size, part_length_in_bytes); + int w = omega * i; + ShiftModFn(part_[i + nhalf], part_[i], w, K_, chunk_size); + pointer += chunk_size; + len -= chunk_size; + } + for (; i < nhalf; i++) { + memset(part_[i], 0, part_length_in_bytes); + memset(part_[i + nhalf], 0, part_length_in_bytes); + } + FFT_Recurse(0, nhalf, omega, temp_); +} + +// Forward transformation. +// We use the "DIF" aka "decimation in frequency" transform, because it +// leaves the result in "bit reversed" order, which is precisely what we +// need as input for the "DIT" aka "decimation in time" backwards transform. +void FFTContainer::FFT_ReturnShuffledThreadsafe(int start, int len, int omega, + digit_t* temp) { + DCHECK((len & 1) == 0); // {len} must be even. NOLINT(readability/check) + int half = len / 2; + SumDiff(part_[start], part_[start + half], part_[start], part_[start + half], + length_); + for (int k = 1; k < half; k++) { + SumDiff(part_[start + k], temp, part_[start + k], part_[start + half + k], + length_); + int w = omega * k; + ShiftModFn(part_[start + half + k], temp, w, K_); + } + FFT_Recurse(start, half, omega, temp); +} + +// Recursive step of the above, factored out for additional callers. +void FFTContainer::FFT_Recurse(int start, int half, int omega, digit_t* temp) { + if (half > 1) { + FFT_ReturnShuffledThreadsafe(start, half, 2 * omega, temp); + FFT_ReturnShuffledThreadsafe(start + half, half, 2 * omega, temp); + } +} + +// Backward transformation. +// We use the "DIT" aka "decimation in time" transform here, because it +// turns bit-reversed input into normally sorted output. +void FFTContainer::BackwardFFT(int start, int len, int omega) { + BackwardFFT_Threadsafe(start, len, omega, temp_); +} + +void FFTContainer::BackwardFFT_Threadsafe(int start, int len, int omega, + digit_t* temp) { + DCHECK((len & 1) == 0); // {len} must be even. NOLINT(readability/check) + int half = len / 2; + // Don't recurse for half == 2, as PointwiseMultiply already performed + // the first level of the backwards FFT. 
+ if (half > 2) { + BackwardFFT_Threadsafe(start, half, 2 * omega, temp); + BackwardFFT_Threadsafe(start + half, half, 2 * omega, temp); + } + SumDiff(part_[start], part_[start + half], part_[start], part_[start + half], + length_); + for (int k = 1; k < half; k++) { + int w = omega * (len - k); + ShiftModFn(temp, part_[start + half + k], w, K_); + SumDiff(part_[start + k], part_[start + half + k], part_[start + k], temp, + length_); + } +} + +// Recombines the result's parts into {Z}, after backwards FFT. +void FFTContainer::NormalizeAndRecombine(int omega, int m, RWDigits Z, + int chunk_size) { + Z.Clear(); + int z_index = 0; + const int shift = n_ * omega - m; + for (int i = 0; i < n_; i++, z_index += chunk_size) { + digit_t* part = part_[i]; + ShiftModFn(temp_, part, shift, K_); + digit_t carry = 0; + int zi = z_index; + int j = 0; + for (; j < length_ && zi < Z.len(); j++, zi++) { + Z[zi] = digit_add3(Z[zi], temp_[j], carry, &carry); + } + for (; j < length_; j++) { + DCHECK(temp_[j] == 0); // NOLINT(readability/check) + } + if (carry != 0) { + DCHECK(zi < Z.len()); + Z[zi] = carry; + } + } +} + +// Helper function for {CounterWeightAndRecombine} below. +bool ShouldBeNegative(const digit_t* x, int xlen, digit_t threshold, int s) { + if (x[2 * s] >= threshold) return true; + for (int i = 2 * s + 1; i < xlen; i++) { + if (x[i] > 0) return true; + } + return false; +} + +// Same as {NormalizeAndRecombine} above, but for the needs of the recursive +// invocation ("inner layer") of FFT multiplication, where an additional +// counter-weighting step is required. +void FFTContainer::CounterWeightAndRecombine(int theta, int m, RWDigits Z, + int s) { + Z.Clear(); + int z_index = 0; + for (int k = 0; k < n_; k++, z_index += s) { + int shift = -theta * k - m; + if (shift < 0) shift += 2 * n_ * theta; + DCHECK(shift >= 0); // NOLINT(readability/check) + digit_t* input = part_[k]; + ShiftModFn(temp_, input, shift, K_); + int remaining_z = Z.len() - z_index; + if (ShouldBeNegative(temp_, length_, k + 1, s)) { + // Subtract F_n from input before adding to result. We use the following + // transformation (knowing that X < F_n): + // Z + (X - F_n) == Z - (F_n - X) + digit_t borrow_z = 0; + digit_t borrow_Fn = 0; + { + // i == 0: + digit_t d = digit_sub(1, temp_[0], &borrow_Fn); + Z[z_index] = digit_sub(Z[z_index], d, &borrow_z); + } + int i = 1; + for (; i < K_ && i < remaining_z; i++) { + digit_t d = digit_sub2(0, temp_[i], borrow_Fn, &borrow_Fn); + Z[z_index + i] = digit_sub2(Z[z_index + i], d, borrow_z, &borrow_z); + } + DCHECK(i == K_ && K_ == length_ - 1); + for (; i < length_ && i < remaining_z; i++) { + digit_t d = digit_sub2(1, temp_[i], borrow_Fn, &borrow_Fn); + Z[z_index + i] = digit_sub2(Z[z_index + i], d, borrow_z, &borrow_z); + } + DCHECK(borrow_Fn == 0); // NOLINT(readability/check) + for (; borrow_z > 0 && i < remaining_z; i++) { + Z[z_index + i] = digit_sub(Z[z_index + i], borrow_z, &borrow_z); + } + } else { + digit_t carry = 0; + int i = 0; + for (; i < length_ && i < remaining_z; i++) { + Z[z_index + i] = digit_add3(Z[z_index + i], temp_[i], carry, &carry); + } + for (; i < length_; i++) { + DCHECK(temp_[i] == 0); // NOLINT(readability/check) + } + for (; carry > 0 && i < remaining_z; i++) { + Z[z_index + i] = digit_add2(Z[z_index + i], carry, &carry); + } + // {carry} might be != 0 here if Z was negative before. That's fine. + } + } +} + +// Main FFT function for recursive invocations ("inner layer"). 
+void MultiplyFFT_Inner(RWDigits Z, Digits X, Digits Y, const Parameters& params,
+                       ProcessorImpl* processor) {
+  int omega = 2 * params.r;  // really: 2^(2r)
+  int theta = params.r;      // really: 2^r
+
+  FFTContainer a(params.n, params.K, processor);
+  a.Start_Default(X, params.s, theta, omega);
+  FFTContainer b(params.n, params.K, processor);
+  b.Start_Default(Y, params.s, theta, omega);
+
+  a.PointwiseMultiply(b);
+  if (processor->should_terminate()) return;
+
+  FFTContainer& c = a;
+  c.BackwardFFT(0, params.n, omega);
+
+  c.CounterWeightAndRecombine(theta, params.m, Z, params.s);
+}
+
+// Actual implementation of pointwise multiplications.
+void FFTContainer::DoPointwiseMultiplication(const FFTContainer& other,
+                                             int start, int end,
+                                             digit_t* temp) {
+  // The (K_ & 3) != 0 condition makes sure that the inner FFT gets
+  // to split the work into at least 4 chunks.
+  bool use_fft = length_ >= kFftInnerThreshold && (K_ & 3) == 0;
+  Parameters params;
+  if (use_fft) ComputeParameters_Inner(K_, &params);
+  RWDigits result(temp, 2 * length_);
+  for (int i = start; i < end; i++) {
+    Digits A(part_[i], length_);
+    Digits B(other.part_[i], length_);
+    if (use_fft) {
+      MultiplyFFT_Inner(result, A, B, params, processor_);
+    } else {
+      processor_->Multiply(result, A, B);
+    }
+    if (processor_->should_terminate()) return;
+    ModFnDoubleWidth(part_[i], result.digits(), length_);
+    // To improve cache friendliness, we perform the first level of the
+    // backwards FFT here.
+    if ((i & 1) == 1) {
+      SumDiff(part_[i - 1], part_[i], part_[i - 1], part_[i], length_);
+    }
+  }
+}
+
+// Convenient entry point for pointwise multiplications.
+void FFTContainer::PointwiseMultiply(const FFTContainer& other) {
+  DCHECK(n_ == other.n_);
+  DoPointwiseMultiplication(other, 0, n_, temp_);
+}
+
+}  // namespace
+
+////////////////////////////////////////////////////////////////////////////////
+// Part 4: Tying everything together into a multiplication algorithm.
+
+// TODO(jkummerow): Consider doing a "Mersenne transform" and CRT reconstruction
+// of the final result. Might yield a few percent of perf improvement.
+
+// TODO(jkummerow): Consider implementing the "sqrt(2) trick".
+// Gaudry/Kruppa/Zimmerman report that it saved them around 10%.
+
+void ProcessorImpl::MultiplyFFT(RWDigits Z, Digits X, Digits Y) {
+  Parameters params;
+  int m = GetParameters(X.len() + Y.len(), &params);
+  int omega = params.r;  // really: 2^r
+
+  FFTContainer a(params.n, params.K, this);
+  a.Start(X, params.s, 0, omega);
+  if (X == Y) {
+    // Squaring.
+    a.PointwiseMultiply(a);
+  } else {
+    FFTContainer b(params.n, params.K, this);
+    b.Start(Y, params.s, 0, omega);
+    a.PointwiseMultiply(b);
+  }
+  if (should_terminate()) return;
+
+  a.BackwardFFT(0, params.n, omega);
+  a.NormalizeAndRecombine(omega, m, Z, params.s);
+}
+
+}  // namespace bigint
+}  // namespace v8
diff --git a/deps/v8/src/bigint/mul-karatsuba.cc b/deps/v8/src/bigint/mul-karatsuba.cc
new file mode 100644
index 0000000000..2a141f213c
--- /dev/null
+++ b/deps/v8/src/bigint/mul-karatsuba.cc
@@ -0,0 +1,205 @@
+// Copyright 2021 the V8 project authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+// Karatsuba multiplication. This is loosely based on Go's implementation
+// found at https://golang.org/src/math/big/nat.go, licensed as follows:
+//
+// Copyright 2009 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file [1].
+// +// [1] https://golang.org/LICENSE + +#include <algorithm> +#include <utility> + +#include "src/bigint/bigint-internal.h" +#include "src/bigint/digit-arithmetic.h" +#include "src/bigint/util.h" +#include "src/bigint/vector-arithmetic.h" + +namespace v8 { +namespace bigint { + +// If Karatsuba is the best supported algorithm, then it must check for +// termination requests. If there are more advanced algorithms available +// for larger inputs, then Karatsuba will only be used for sufficiently +// small chunks that checking for termination requests is not necessary. +#if V8_ADVANCED_BIGINT_ALGORITHMS +#define MAYBE_TERMINATE +#else +#define MAYBE_TERMINATE \ + if (should_terminate()) return; +#endif + +namespace { + +// The Karatsuba algorithm sometimes finishes more quickly when the +// input length is rounded up a bit. This method encodes some heuristics +// to accomplish this. The details have been determined experimentally. +int RoundUpLen(int len) { + if (len <= 36) return RoundUp(len, 2); + // Keep the 4 or 5 most significant non-zero bits. + int shift = BitLength(len) - 5; + if ((len >> shift) >= 0x18) { + shift++; + } + // Round up, unless we're only just above the threshold. This smoothes + // the steps by which time goes up as input size increases. + int additive = ((1 << shift) - 1); + if (shift >= 2 && (len & additive) < (1 << (shift - 2))) { + return len; + } + return ((len + additive) >> shift) << shift; +} + +// This method makes the final decision how much to bump up the input size. +int KaratsubaLength(int n) { + n = RoundUpLen(n); + int i = 0; + while (n > kKaratsubaThreshold) { + n >>= 1; + i++; + } + return n << i; +} + +// Performs the specific subtraction required by {KaratsubaMain} below. +void KaratsubaSubtractionHelper(RWDigits result, Digits X, Digits Y, + int* sign) { + X.Normalize(); + Y.Normalize(); + digit_t borrow = 0; + int i = 0; + if (!GreaterThanOrEqual(X, Y)) { + *sign = -(*sign); + std::swap(X, Y); + } + for (; i < Y.len(); i++) { + result[i] = digit_sub2(X[i], Y[i], borrow, &borrow); + } + for (; i < X.len(); i++) { + result[i] = digit_sub(X[i], borrow, &borrow); + } + DCHECK(borrow == 0); // NOLINT(readability/check) + for (; i < result.len(); i++) result[i] = 0; +} + +} // namespace + +void ProcessorImpl::MultiplyKaratsuba(RWDigits Z, Digits X, Digits Y) { + DCHECK(X.len() >= Y.len()); + DCHECK(Y.len() >= kKaratsubaThreshold); + DCHECK(Z.len() >= X.len() + Y.len()); + int k = KaratsubaLength(Y.len()); + int scratch_len = 4 * k; + ScratchDigits scratch(scratch_len); + KaratsubaStart(Z, X, Y, scratch, k); +} + +// Entry point for Karatsuba-based multiplication, takes care of inputs +// with unequal lengths by chopping the larger into chunks. +void ProcessorImpl::KaratsubaStart(RWDigits Z, Digits X, Digits Y, + RWDigits scratch, int k) { + KaratsubaMain(Z, X, Y, scratch, k); + MAYBE_TERMINATE + for (int i = 2 * k; i < Z.len(); i++) Z[i] = 0; + if (k < Y.len() || X.len() != Y.len()) { + ScratchDigits T(2 * k); + // Add X0 * Y1 * b. + Digits X0(X, 0, k); + Digits Y1 = Y + std::min(k, Y.len()); + if (Y1.len() > 0) { + KaratsubaChunk(T, X0, Y1, scratch); + MAYBE_TERMINATE + AddAndReturnOverflow(Z + k, T); // Can't overflow. + } + + // Add Xi * Y0 << i and Xi * Y1 * b << (i + k). + Digits Y0(Y, 0, k); + for (int i = k; i < X.len(); i += k) { + Digits Xi(X, i, k); + KaratsubaChunk(T, Xi, Y0, scratch); + MAYBE_TERMINATE + AddAndReturnOverflow(Z + i, T); // Can't overflow. 
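+      // Y1 is empty when Y fits into a single chunk of size k; in that case
+      // only the Xi * Y0 contribution above is needed.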
+ if (Y1.len() > 0) { + KaratsubaChunk(T, Xi, Y1, scratch); + MAYBE_TERMINATE + AddAndReturnOverflow(Z + (i + k), T); // Can't overflow. + } + } + } +} + +// Entry point for chunk-wise multiplications, selects an appropriate +// algorithm for the inputs based on their sizes. +void ProcessorImpl::KaratsubaChunk(RWDigits Z, Digits X, Digits Y, + RWDigits scratch) { + X.Normalize(); + Y.Normalize(); + if (X.len() == 0 || Y.len() == 0) return Z.Clear(); + if (X.len() < Y.len()) std::swap(X, Y); + if (Y.len() == 1) return MultiplySingle(Z, X, Y[0]); + if (Y.len() < kKaratsubaThreshold) return MultiplySchoolbook(Z, X, Y); + int k = KaratsubaLength(Y.len()); + DCHECK(scratch.len() >= 4 * k); + return KaratsubaStart(Z, X, Y, scratch, k); +} + +// The main recursive Karatsuba method. +void ProcessorImpl::KaratsubaMain(RWDigits Z, Digits X, Digits Y, + RWDigits scratch, int n) { + if (n < kKaratsubaThreshold) { + X.Normalize(); + Y.Normalize(); + if (X.len() >= Y.len()) { + return MultiplySchoolbook(RWDigits(Z, 0, 2 * n), X, Y); + } else { + return MultiplySchoolbook(RWDigits(Z, 0, 2 * n), Y, X); + } + } + DCHECK(scratch.len() >= 4 * n); + DCHECK((n & 1) == 0); // NOLINT(readability/check) + int n2 = n >> 1; + Digits X0(X, 0, n2); + Digits X1(X, n2, n2); + Digits Y0(Y, 0, n2); + Digits Y1(Y, n2, n2); + RWDigits scratch_for_recursion(scratch, 2 * n, 2 * n); + RWDigits P0(scratch, 0, n); + KaratsubaMain(P0, X0, Y0, scratch_for_recursion, n2); + MAYBE_TERMINATE + for (int i = 0; i < n; i++) Z[i] = P0[i]; + RWDigits P2(scratch, n, n); + KaratsubaMain(P2, X1, Y1, scratch_for_recursion, n2); + MAYBE_TERMINATE + RWDigits Z2 = Z + n; + int end = std::min(Z2.len(), P2.len()); + for (int i = 0; i < end; i++) Z2[i] = P2[i]; + for (int i = end; i < n; i++) { + DCHECK(P2[i] == 0); // NOLINT(readability/check) + } + // The intermediate result can be one digit too large; the subtraction + // below will fix this. + digit_t overflow = AddAndReturnOverflow(Z + n2, P0); + overflow += AddAndReturnOverflow(Z + n2, P2); + RWDigits X_diff(scratch, 0, n2); + RWDigits Y_diff(scratch, n2, n2); + int sign = 1; + KaratsubaSubtractionHelper(X_diff, X1, X0, &sign); + KaratsubaSubtractionHelper(Y_diff, Y0, Y1, &sign); + RWDigits P1(scratch, n, n); + KaratsubaMain(P1, X_diff, Y_diff, scratch_for_recursion, n2); + if (sign > 0) { + overflow += AddAndReturnOverflow(Z + n2, P1); + } else { + overflow -= SubAndReturnBorrow(Z + n2, P1); + } + // The intermediate result may have been bigger, but the final result fits. + DCHECK(overflow == 0); // NOLINT(readability/check) + USE(overflow); +} + +} // namespace bigint +} // namespace v8 diff --git a/deps/v8/src/bigint/mul-schoolbook.cc b/deps/v8/src/bigint/mul-schoolbook.cc index 8e10685018..9222e1e675 100644 --- a/deps/v8/src/bigint/mul-schoolbook.cc +++ b/deps/v8/src/bigint/mul-schoolbook.cc @@ -71,7 +71,6 @@ void ProcessorImpl::MultiplySchoolbook(RWDigits Z, Digits X, Digits Y) { next_carry = 0; BODY(0, i); AddWorkEstimate(i); - if (should_terminate()) return; } // Last part: i exceeds Y now, we have to be careful about bounds. int loop_end = X.len() + Y.len() - 2; @@ -85,7 +84,6 @@ void ProcessorImpl::MultiplySchoolbook(RWDigits Z, Digits X, Digits Y) { next_carry = 0; BODY(min_x_index, max_x_index); AddWorkEstimate(max_x_index - min_x_index); - if (should_terminate()) return; } // Write the last digit, and zero out any extra space in Z. 
Z[i++] = digit_add2(next, carry, &carry); diff --git a/deps/v8/src/bigint/mul-toom.cc b/deps/v8/src/bigint/mul-toom.cc new file mode 100644 index 0000000000..76a4fdcdfb --- /dev/null +++ b/deps/v8/src/bigint/mul-toom.cc @@ -0,0 +1,223 @@ +// Copyright 2021 the V8 project authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. + +// Toom-Cook multiplication. +// Reference: https://en.wikipedia.org/wiki/Toom%E2%80%93Cook_multiplication + +#include <algorithm> + +#include "src/bigint/bigint-internal.h" +#include "src/bigint/digit-arithmetic.h" +#include "src/bigint/vector-arithmetic.h" + +namespace v8 { +namespace bigint { + +namespace { + +void TimesTwo(RWDigits X) { + digit_t carry = 0; + for (int i = 0; i < X.len(); i++) { + digit_t d = X[i]; + X[i] = (d << 1) | carry; + carry = d >> (kDigitBits - 1); + } +} + +void DivideByTwo(RWDigits X) { + digit_t carry = 0; + for (int i = X.len() - 1; i >= 0; i--) { + digit_t d = X[i]; + X[i] = (d >> 1) | carry; + carry = d << (kDigitBits - 1); + } +} + +void DivideByThree(RWDigits X) { + digit_t remainder = 0; + for (int i = X.len() - 1; i >= 0; i--) { + digit_t d = X[i]; + digit_t upper = (remainder << kHalfDigitBits) | (d >> kHalfDigitBits); + digit_t u_result = upper / 3; + remainder = upper - 3 * u_result; + digit_t lower = (remainder << kHalfDigitBits) | (d & kHalfDigitMask); + digit_t l_result = lower / 3; + remainder = lower - 3 * l_result; + X[i] = (u_result << kHalfDigitBits) | l_result; + } +} + +} // namespace + +#if DEBUG +// Set {len_} to 1 rather than 0 so that attempts to access the first digit +// will crash. +#define MARK_INVALID(D) D = RWDigits(nullptr, 1) +#else +#define MARK_INVALID(D) (void(0)) +#endif + +void ProcessorImpl::Toom3Main(RWDigits Z, Digits X, Digits Y) { + DCHECK(Z.len() >= X.len() + Y.len()); + // Phase 1: Splitting. + int i = DIV_CEIL(std::max(X.len(), Y.len()), 3); + Digits X0(X, 0, i); + Digits X1(X, i, i); + Digits X2(X, 2 * i, i); + Digits Y0(Y, 0, i); + Digits Y1(Y, i, i); + Digits Y2(Y, 2 * i, i); + + // Temporary storage. + int p_len = i + 1; // For all px, qx below. + int r_len = 2 * p_len; // For all r_x, Rx below. + Storage temp_storage(4 * r_len); + // We will use the same variable names as the Wikipedia article, as much as + // C++ lets us: our "p_m1" is their "p(-1)" etc. For consistency with other + // algorithms, we use X and Y where Wikipedia uses m and n. + // We will use and re-use the temporary storage as follows: + // + // chunk | -------- time -----------> + // [0 .. i] |( po )( p_m1 ) ( r_m2 ) + // [i+1 .. rlen-1] |( qo )( q_m1 ) ( r_m2 ) + // [rlen .. rlen+i] | (p_1 ) ( p_m2 ) (r_inf) + // [rlen+i+1 .. 2*rlen-1] | (q_1 ) ( q_m2 ) (r_inf) + // [2*rlen .. 3*rlen-1] | ( r_1 ) + // [3*rlen .. 4*rlen-1] | ( r_m1 ) + // + // This requires interleaving phases 2 and 3 a bit: after computing + // r_1 = p_1 * q_1, we can re-use p_1's storage for p_m2, and so on. + digit_t* t = temp_storage.get(); + RWDigits po(t, p_len); + RWDigits qo(t + p_len, p_len); + RWDigits p_1(t + r_len, p_len); + RWDigits q_1(t + r_len + p_len, p_len); + RWDigits r_1(t + 2 * r_len, r_len); + RWDigits r_m1(t + 3 * r_len, r_len); + + // We can also share the backing stores of Z, r_0, R0. + DCHECK(Z.len() >= r_len); + RWDigits r_0(Z, 0, r_len); + + // Phase 2a: Evaluation, steps 0, 1, m1. 
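+  // Toom-3 evaluates both polynomials at the five points 0, 1, -1, -2, and
+  // infinity (the last two in phase 2b below):
+  //   p(0) = X0,  p(1) = X0 + X1 + X2,  p(-1) = X0 - X1 + X2,
+  //   p(-2) = X0 - 2*X1 + 4*X2,  p(inf) = X2,
+  // and analogously q(...) for Y.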
+ // po = X0 + X2 + Add(po, X0, X2); + // p_0 = X0 + // p_1 = po + X1 + Add(p_1, po, X1); + // p_m1 = po - X1 + RWDigits p_m1 = po; + bool p_m1_sign = SubtractSigned(p_m1, po, false, X1, false); + MARK_INVALID(po); + + // qo = Y0 + Y2 + Add(qo, Y0, Y2); + // q_0 = Y0 + // q_1 = qo + Y1 + Add(q_1, qo, Y1); + // q_m1 = qo - Y1 + RWDigits q_m1 = qo; + bool q_m1_sign = SubtractSigned(q_m1, qo, false, Y1, false); + MARK_INVALID(qo); + + // Phase 3a: Pointwise multiplication, steps 0, 1, m1. + Multiply(r_0, X0, Y0); + Multiply(r_1, p_1, q_1); + Multiply(r_m1, p_m1, q_m1); + bool r_m1_sign = p_m1_sign != q_m1_sign; + + // Phase 2b: Evaluation, steps m2 and inf. + // p_m2 = (p_m1 + X2) * 2 - X0 + RWDigits p_m2 = p_1; + MARK_INVALID(p_1); + bool p_m2_sign = AddSigned(p_m2, p_m1, p_m1_sign, X2, false); + TimesTwo(p_m2); + p_m2_sign = SubtractSigned(p_m2, p_m2, p_m2_sign, X0, false); + // p_inf = X2 + + // q_m2 = (q_m1 + Y2) * 2 - Y0 + RWDigits q_m2 = q_1; + MARK_INVALID(q_1); + bool q_m2_sign = AddSigned(q_m2, q_m1, q_m1_sign, Y2, false); + TimesTwo(q_m2); + q_m2_sign = SubtractSigned(q_m2, q_m2, q_m2_sign, Y0, false); + // q_inf = Y2 + + // Phase 3b: Pointwise multiplication, steps m2 and inf. + RWDigits r_m2(t, r_len); + MARK_INVALID(p_m1); + MARK_INVALID(q_m1); + Multiply(r_m2, p_m2, q_m2); + bool r_m2_sign = p_m2_sign != q_m2_sign; + + RWDigits r_inf(t + r_len, r_len); + MARK_INVALID(p_m2); + MARK_INVALID(q_m2); + Multiply(r_inf, X2, Y2); + + // Phase 4: Interpolation. + Digits R0 = r_0; + Digits R4 = r_inf; + // R3 <- (r_m2 - r_1) / 3 + RWDigits R3 = r_m2; + bool R3_sign = SubtractSigned(R3, r_m2, r_m2_sign, r_1, false); + DivideByThree(R3); + // R1 <- (r_1 - r_m1) / 2 + RWDigits R1 = r_1; + bool R1_sign = SubtractSigned(R1, r_1, false, r_m1, r_m1_sign); + DivideByTwo(R1); + // R2 <- r_m1 - r_0 + RWDigits R2 = r_m1; + bool R2_sign = SubtractSigned(R2, r_m1, r_m1_sign, R0, false); + // R3 <- (R2 - R3) / 2 + 2 * r_inf + R3_sign = SubtractSigned(R3, R2, R2_sign, R3, R3_sign); + DivideByTwo(R3); + // TODO(jkummerow): Would it be a measurable improvement to write an + // "AddTwice" helper? + R3_sign = AddSigned(R3, R3, R3_sign, r_inf, false); + R3_sign = AddSigned(R3, R3, R3_sign, r_inf, false); + // R2 <- R2 + R1 - R4 + R2_sign = AddSigned(R2, R2, R2_sign, R1, R1_sign); + R2_sign = SubtractSigned(R2, R2, R2_sign, R4, false); + // R1 <- R1 - R3 + R1_sign = SubtractSigned(R1, R1, R1_sign, R3, R3_sign); + +#if DEBUG + R1.Normalize(); + R2.Normalize(); + R3.Normalize(); + DCHECK(R1_sign == false || R1.len() == 0); + DCHECK(R2_sign == false || R2.len() == 0); + DCHECK(R3_sign == false || R3.len() == 0); +#endif + + // Phase 5: Recomposition. R0 is already in place. Overflow can't happen. + for (int j = R0.len(); j < Z.len(); j++) Z[j] = 0; + AddAndReturnOverflow(Z + i, R1); + AddAndReturnOverflow(Z + 2 * i, R2); + AddAndReturnOverflow(Z + 3 * i, R3); + AddAndReturnOverflow(Z + 4 * i, R4); +} + +void ProcessorImpl::MultiplyToomCook(RWDigits Z, Digits X, Digits Y) { + DCHECK(X.len() >= Y.len()); + int k = Y.len(); + // TODO(jkummerow): Would it be a measurable improvement to share the + // scratch memory for several invocations? + Digits X0(X, 0, k); + Toom3Main(Z, X0, Y); + if (X.len() > Y.len()) { + ScratchDigits T(2 * k); + for (int i = k; i < X.len(); i += k) { + Digits Xi(X, i, k); + // TODO(jkummerow): would it be a measurable improvement to craft a + // "ToomChunk" method in the style of {KaratsubaChunk}? + Toom3Main(T, Xi, Y); + AddAndReturnOverflow(Z + i, T); // Can't overflow. 
+ } + } +} + +} // namespace bigint +} // namespace v8 diff --git a/deps/v8/src/bigint/tostring.cc b/deps/v8/src/bigint/tostring.cc new file mode 100644 index 0000000000..b426c864cd --- /dev/null +++ b/deps/v8/src/bigint/tostring.cc @@ -0,0 +1,303 @@ +// Copyright 2021 the V8 project authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. + +#include <cstring> +#include <limits> + +#include "src/bigint/bigint-internal.h" +#include "src/bigint/digit-arithmetic.h" +#include "src/bigint/div-helpers.h" +#include "src/bigint/util.h" +#include "src/bigint/vector-arithmetic.h" + +namespace v8 { +namespace bigint { + +namespace { + +// Lookup table for the maximum number of bits required per character of a +// base-N string representation of a number. To increase accuracy, the array +// value is the actual value multiplied by 32. To generate this table: +// for (var i = 0; i <= 36; i++) { print(Math.ceil(Math.log2(i) * 32) + ","); } +constexpr uint8_t kMaxBitsPerChar[] = { + 0, 0, 32, 51, 64, 75, 83, 90, 96, // 0..8 + 102, 107, 111, 115, 119, 122, 126, 128, // 9..16 + 131, 134, 136, 139, 141, 143, 145, 147, // 17..24 + 149, 151, 153, 154, 156, 158, 159, 160, // 25..32 + 162, 163, 165, 166, // 33..36 +}; + +static const int kBitsPerCharTableShift = 5; +static const size_t kBitsPerCharTableMultiplier = 1u << kBitsPerCharTableShift; + +static const char kConversionChars[] = "0123456789abcdefghijklmnopqrstuvwxyz"; + +// Raises {base} to the power of {exponent}. Does not check for overflow. +digit_t digit_pow(digit_t base, digit_t exponent) { + digit_t result = 1ull; + while (exponent > 0) { + if (exponent & 1) { + result *= base; + } + exponent >>= 1; + base *= base; + } + return result; +} + +// Compile-time version of the above. +constexpr digit_t digit_pow_rec(digit_t base, digit_t exponent) { + return exponent == 1 ? base : base * digit_pow_rec(base, exponent - 1); +} + +// A variant of ToStringFormatter::BasecaseLast, specialized for a radix +// known at compile-time. +template <int radix> +char* BasecaseFixedLast(digit_t chunk, char* out) { + while (chunk != 0) { + DCHECK(*(out - 1) == kStringZapValue); // NOLINT(readability/check) + if (radix <= 10) { + *(--out) = '0' + (chunk % radix); + } else { + *(--out) = kConversionChars[chunk % radix]; + } + chunk /= radix; + } + return out; +} + +// By making {radix} a compile-time constant and computing {chunk_divisor} +// as another compile-time constant from it, we allow the compiler to emit +// an optimized instruction sequence based on multiplications with "magic" +// numbers (modular multiplicative inverses) instead of actual divisions. +// The price we pay is having to work on half digits; the technique doesn't +// work with twodigit_t-by-digit_t divisions. +// Includes an equivalent of ToStringFormatter::BasecaseMiddle, accordingly +// specialized for a radix known at compile time. 
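+// For example, with 64-bit digits (kHalfDigitBits == 32) and radix 10,
+// kMaxBitsPerChar[10] == 107, so chunk_chars == 32 * 32 / 107 == 9 and
+// chunk_divisor == 10^9, a constant the compiler can divide by using the
+// "magic number" multiplication mentioned above.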
+template <digit_t radix> +char* DivideByMagic(RWDigits rest, Digits input, char* output) { + constexpr uint8_t max_bits_per_char = kMaxBitsPerChar[radix]; + constexpr int chunk_chars = + kHalfDigitBits * kBitsPerCharTableMultiplier / max_bits_per_char; + constexpr digit_t chunk_divisor = digit_pow_rec(radix, chunk_chars); + digit_t remainder = 0; + for (int i = input.len() - 1; i >= 0; i--) { + digit_t d = input[i]; + digit_t upper = (remainder << kHalfDigitBits) | (d >> kHalfDigitBits); + digit_t u_result = upper / chunk_divisor; + remainder = upper % chunk_divisor; + digit_t lower = (remainder << kHalfDigitBits) | (d & kHalfDigitMask); + digit_t l_result = lower / chunk_divisor; + remainder = lower % chunk_divisor; + rest[i] = (u_result << kHalfDigitBits) | l_result; + } + // {remainder} is now the current chunk to be written out. + for (int i = 0; i < chunk_chars; i++) { + DCHECK(*(output - 1) == kStringZapValue); // NOLINT(readability/check) + if (radix <= 10) { + *(--output) = '0' + (remainder % radix); + } else { + *(--output) = kConversionChars[remainder % radix]; + } + remainder /= radix; + } + DCHECK(remainder == 0); // NOLINT(readability/check) + return output; +} + +class ToStringFormatter { + public: + ToStringFormatter(Digits X, int radix, bool sign, char* out, + int chars_available, ProcessorImpl* processor) + : digits_(X), + radix_(radix), + sign_(sign), + out_start_(out), + out_end_(out + chars_available), + out_(out_end_), + processor_(processor) { + DCHECK(chars_available >= ToStringResultLength(digits_, radix_, sign_)); + } + + void Start(); + int Finish(); + + void Classic() { + if (digits_.len() == 0) { + *(--out_) = '0'; + return; + } + if (digits_.len() == 1) { + out_ = BasecaseLast(digits_[0], out_); + return; + } + // {rest} holds the part of the BigInt that we haven't looked at yet. + // Not to be confused with "remainder"! + ScratchDigits rest(digits_.len()); + // In the first round, divide the input, allocating a new BigInt for + // the result == rest; from then on divide the rest in-place. + Digits dividend = digits_; + do { + if (radix_ == 10) { + // Faster but costs binary size, so we optimize the most common case. + out_ = DivideByMagic<10>(rest, dividend, out_); + processor_->AddWorkEstimate(rest.len() * 2); + } else { + digit_t chunk; + processor_->DivideSingle(rest, &chunk, dividend, chunk_divisor_); + out_ = BasecaseMiddle(chunk, out_); + // Assume that a division is about ten times as expensive as a + // multiplication. + processor_->AddWorkEstimate(rest.len() * 10); + } + if (processor_->should_terminate()) return; + rest.Normalize(); + dividend = rest; + } while (rest.len() > 1); + out_ = BasecaseLast(rest[0], out_); + } + + void BasePowerOfTwo(); + + private: + // When processing the last (most significant) digit, don't write leading + // zeros. + char* BasecaseLast(digit_t digit, char* out) { + if (radix_ == 10) return BasecaseFixedLast<10>(digit, out); + do { + DCHECK(*(out - 1) == kStringZapValue); // NOLINT(readability/check) + *(--out) = kConversionChars[digit % radix_]; + digit /= radix_; + } while (digit > 0); + return out; + } + + // When processing a middle (non-most significant) digit, always write the + // same number of characters (as many '0' as necessary). 
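+  // Each middle chunk stands for exactly chunk_chars_ characters of the
+  // output, so interior zeros are significant; only the most significant
+  // chunk (handled by BasecaseLast above) may drop leading zeros.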
+ char* BasecaseMiddle(digit_t digit, char* out) { + for (int i = 0; i < chunk_chars_; i++) { + DCHECK(*(out - 1) == kStringZapValue); // NOLINT(readability/check) + *(--out) = kConversionChars[digit % radix_]; + digit /= radix_; + } + DCHECK(digit == 0); // NOLINT(readability/check) + return out; + } + + Digits digits_; + int radix_; + int max_bits_per_char_ = 0; + int chunk_chars_ = 0; + bool sign_; + char* out_start_; + char* out_end_; + char* out_; + digit_t chunk_divisor_ = 0; + ProcessorImpl* processor_; +}; + +// Prepares data for {Classic}. Not needed for {BasePowerOfTwo}. +void ToStringFormatter::Start() { + max_bits_per_char_ = kMaxBitsPerChar[radix_]; + chunk_chars_ = kDigitBits * kBitsPerCharTableMultiplier / max_bits_per_char_; + chunk_divisor_ = digit_pow(radix_, chunk_chars_); + // By construction of chunk_chars_, there can't have been overflow. + DCHECK(chunk_divisor_ != 0); // NOLINT(readability/check) +} + +int ToStringFormatter::Finish() { + DCHECK(out_ >= out_start_); + DCHECK(out_ < out_end_); // At least one character was written. + while (out_ < out_end_ && *out_ == '0') out_++; + if (sign_) *(--out_) = '-'; + int excess = 0; + if (out_ > out_start_) { + size_t actual_length = out_end_ - out_; + excess = static_cast<int>(out_ - out_start_); + std::memmove(out_start_, out_, actual_length); + } + return excess; +} + +void ToStringFormatter::BasePowerOfTwo() { + const int bits_per_char = CountTrailingZeros(radix_); + const int char_mask = radix_ - 1; + digit_t digit = 0; + // Keeps track of how many unprocessed bits there are in {digit}. + int available_bits = 0; + for (int i = 0; i < digits_.len() - 1; i++) { + digit_t new_digit = digits_[i]; + // Take any leftover bits from the last iteration into account. + int current = (digit | (new_digit << available_bits)) & char_mask; + *(--out_) = kConversionChars[current]; + int consumed_bits = bits_per_char - available_bits; + digit = new_digit >> consumed_bits; + available_bits = kDigitBits - consumed_bits; + while (available_bits >= bits_per_char) { + *(--out_) = kConversionChars[digit & char_mask]; + digit >>= bits_per_char; + available_bits -= bits_per_char; + } + } + // Take any leftover bits from the last iteration into account. 
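+  // The most significant digit is handled separately below so that no
+  // leading zero characters are emitted for it.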
+ digit_t msd = digits_.msd(); + int current = (digit | (msd << available_bits)) & char_mask; + *(--out_) = kConversionChars[current]; + digit = msd >> (bits_per_char - available_bits); + while (digit != 0) { + *(--out_) = kConversionChars[digit & char_mask]; + digit >>= bits_per_char; + } +} + +} // namespace + +void ProcessorImpl::ToString(char* out, int* out_length, Digits X, int radix, + bool sign) { +#if DEBUG + for (int i = 0; i < *out_length; i++) out[i] = kStringZapValue; +#endif + ToStringFormatter formatter(X, radix, sign, out, *out_length, this); + if (IsPowerOfTwo(radix)) { + formatter.BasePowerOfTwo(); + } else { + formatter.Start(); + formatter.Classic(); + } + int excess = formatter.Finish(); + *out_length -= excess; +} + +Status Processor::ToString(char* out, int* out_length, Digits X, int radix, + bool sign) { + ProcessorImpl* impl = static_cast<ProcessorImpl*>(this); + impl->ToString(out, out_length, X, radix, sign); + return impl->get_and_clear_status(); +} + +int ToStringResultLength(Digits X, int radix, bool sign) { + const int bit_length = BitLength(X); + int result; + if (IsPowerOfTwo(radix)) { + const int bits_per_char = CountTrailingZeros(radix); + result = DIV_CEIL(bit_length, bits_per_char) + sign; + } else { + // Maximum number of bits we can represent with one character. + const uint8_t max_bits_per_char = kMaxBitsPerChar[radix]; + // For estimating the result length, we have to be pessimistic and work with + // the minimum number of bits one character can represent. + const uint8_t min_bits_per_char = max_bits_per_char - 1; + // Perform the following computation with uint64_t to avoid overflows. + uint64_t chars_required = bit_length; + chars_required *= kBitsPerCharTableMultiplier; + chars_required = DIV_CEIL(chars_required, min_bits_per_char); + DCHECK(chars_required < std::numeric_limits<int>::max()); + result = static_cast<int>(chars_required); + } + result += sign; + return result; +} + +} // namespace bigint +} // namespace v8 diff --git a/deps/v8/src/bigint/util.h b/deps/v8/src/bigint/util.h new file mode 100644 index 0000000000..3d98e734dc --- /dev/null +++ b/deps/v8/src/bigint/util.h @@ -0,0 +1,75 @@ +// Copyright 2021 the V8 project authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. + +// "Generic" helper functions (not specific to BigInts). + +#include <stdint.h> + +#include <type_traits> + +#ifdef _MSC_VER +#include <intrin.h> // For _BitScanReverse. +#endif + +#ifndef V8_BIGINT_UTIL_H_ +#define V8_BIGINT_UTIL_H_ + +// Integer division, rounding up. +#define DIV_CEIL(x, y) (((x)-1) / (y) + 1) + +namespace v8 { +namespace bigint { + +// Rounds up x to a multiple of y. +inline constexpr int RoundUp(int x, int y) { return (x + y - 1) & -y; } + +// Different environments disagree on how 64-bit uintptr_t and uint64_t are +// defined, so we have to use templates to be generic. +template <typename T, typename = typename std::enable_if< + std::is_unsigned<T>::value && sizeof(T) == 8>::type> +constexpr int CountLeadingZeros(T value) { +#if __GNUC__ || __clang__ + return value == 0 ? 64 : __builtin_clzll(value); +#elif _MSC_VER + unsigned long index = 0; // NOLINT(runtime/int). MSVC insists. + return _BitScanReverse64(&index, value) ? 63 - index : 64; +#else +#error Unsupported compiler. +#endif +} + +constexpr int CountLeadingZeros(uint32_t value) { +#if __GNUC__ || __clang__ + return value == 0 ? 
32 : __builtin_clz(value); +#elif _MSC_VER + unsigned long index = 0; // NOLINT(runtime/int). MSVC insists. + return _BitScanReverse(&index, value) ? 31 - index : 32; +#else +#error Unsupported compiler. +#endif +} + +inline constexpr int CountTrailingZeros(uint32_t value) { +#if __GNUC__ || __clang__ + return value == 0 ? 32 : __builtin_ctz(value); +#elif _MSC_VER + unsigned long index = 0; // NOLINT(runtime/int). + return _BitScanForward(&index, value) ? index : 32; +#else +#error Unsupported compiler. +#endif +} + +inline constexpr int BitLength(int n) { + return 32 - CountLeadingZeros(static_cast<uint32_t>(n)); +} + +inline constexpr bool IsPowerOfTwo(int value) { + return value > 0 && (value & (value - 1)) == 0; +} + +} // namespace bigint +} // namespace v8 + +#endif // V8_BIGINT_UTIL_H_ diff --git a/deps/v8/src/bigint/vector-arithmetic.cc b/deps/v8/src/bigint/vector-arithmetic.cc index 734b443911..4191755bc9 100644 --- a/deps/v8/src/bigint/vector-arithmetic.cc +++ b/deps/v8/src/bigint/vector-arithmetic.cc @@ -4,18 +4,118 @@ #include "src/bigint/vector-arithmetic.h" +#include "src/bigint/bigint-internal.h" +#include "src/bigint/digit-arithmetic.h" + namespace v8 { namespace bigint { -int Compare(Digits A, Digits B) { - A.Normalize(); - B.Normalize(); - int diff = A.len() - B.len(); - if (diff != 0) return diff; - int i = A.len() - 1; - while (i >= 0 && A[i] == B[i]) i--; - if (i < 0) return 0; - return A[i] > B[i] ? 1 : -1; +digit_t AddAndReturnOverflow(RWDigits Z, Digits X) { + X.Normalize(); + if (X.len() == 0) return 0; + digit_t carry = 0; + int i = 0; + for (; i < X.len(); i++) { + Z[i] = digit_add3(Z[i], X[i], carry, &carry); + } + for (; i < Z.len() && carry != 0; i++) { + Z[i] = digit_add2(Z[i], carry, &carry); + } + return carry; +} + +digit_t SubAndReturnBorrow(RWDigits Z, Digits X) { + X.Normalize(); + if (X.len() == 0) return 0; + digit_t borrow = 0; + int i = 0; + for (; i < X.len(); i++) { + Z[i] = digit_sub2(Z[i], X[i], borrow, &borrow); + } + for (; i < Z.len() && borrow != 0; i++) { + Z[i] = digit_sub(Z[i], borrow, &borrow); + } + return borrow; +} + +void Add(RWDigits Z, Digits X, Digits Y) { + if (X.len() < Y.len()) { + return Add(Z, Y, X); + } + int i = 0; + digit_t carry = 0; + for (; i < Y.len(); i++) { + Z[i] = digit_add3(X[i], Y[i], carry, &carry); + } + for (; i < X.len(); i++) { + Z[i] = digit_add2(X[i], carry, &carry); + } + for (; i < Z.len(); i++) { + Z[i] = carry; + carry = 0; + } +} + +void Subtract(RWDigits Z, Digits X, Digits Y) { + X.Normalize(); + Y.Normalize(); + DCHECK(X.len() >= Y.len()); + int i = 0; + digit_t borrow = 0; + for (; i < Y.len(); i++) { + Z[i] = digit_sub2(X[i], Y[i], borrow, &borrow); + } + for (; i < X.len(); i++) { + Z[i] = digit_sub(X[i], borrow, &borrow); + } + DCHECK(borrow == 0); // NOLINT(readability/check) + for (; i < Z.len(); i++) Z[i] = 0; +} + +digit_t AddAndReturnCarry(RWDigits Z, Digits X, Digits Y) { + DCHECK(Z.len() >= Y.len() && X.len() >= Y.len()); + digit_t carry = 0; + for (int i = 0; i < Y.len(); i++) { + Z[i] = digit_add3(X[i], Y[i], carry, &carry); + } + return carry; +} + +digit_t SubtractAndReturnBorrow(RWDigits Z, Digits X, Digits Y) { + DCHECK(Z.len() >= Y.len() && X.len() >= Y.len()); + digit_t borrow = 0; + for (int i = 0; i < Y.len(); i++) { + Z[i] = digit_sub2(X[i], Y[i], borrow, &borrow); + } + return borrow; +} + +bool AddSigned(RWDigits Z, Digits X, bool x_negative, Digits Y, + bool y_negative) { + if (x_negative == y_negative) { + Add(Z, X, Y); + return x_negative; + } + if (GreaterThanOrEqual(X, 
Y)) { + Subtract(Z, X, Y); + return x_negative; + } + Subtract(Z, Y, X); + return !x_negative; +} + +bool SubtractSigned(RWDigits Z, Digits X, bool x_negative, Digits Y, + bool y_negative) { + if (x_negative != y_negative) { + Add(Z, X, Y); + return x_negative; + } + if (GreaterThanOrEqual(X, Y)) { + Subtract(Z, X, Y); + return x_negative; + } + Subtract(Z, Y, X); + return !x_negative; } } // namespace bigint diff --git a/deps/v8/src/bigint/vector-arithmetic.h b/deps/v8/src/bigint/vector-arithmetic.h index 617cb20b55..3247660f95 100644 --- a/deps/v8/src/bigint/vector-arithmetic.h +++ b/deps/v8/src/bigint/vector-arithmetic.h @@ -8,12 +8,52 @@ #define V8_BIGINT_VECTOR_ARITHMETIC_H_ #include "src/bigint/bigint.h" +#include "src/bigint/digit-arithmetic.h" namespace v8 { namespace bigint { +// Z += X. Returns carry on overflow. +digit_t AddAndReturnOverflow(RWDigits Z, Digits X); + +// Z -= X. Returns borrow on overflow. +digit_t SubAndReturnBorrow(RWDigits Z, Digits X); + +// X += y. +inline void Add(RWDigits X, digit_t y) { + digit_t carry = y; + int i = 0; + do { + X[i] = digit_add2(X[i], carry, &carry); + i++; + } while (carry != 0); +} + +// X -= y. +inline void Subtract(RWDigits X, digit_t y) { + digit_t borrow = y; + int i = 0; + do { + X[i] = digit_sub(X[i], borrow, &borrow); + i++; + } while (borrow != 0); +} + +// These add exactly Y's digits to the matching digits in X, storing the +// result in (part of) Z, and return the carry/borrow. +digit_t AddAndReturnCarry(RWDigits Z, Digits X, Digits Y); +digit_t SubtractAndReturnBorrow(RWDigits Z, Digits X, Digits Y); + inline bool IsDigitNormalized(Digits X) { return X.len() == 0 || X.msd() != 0; } +inline bool GreaterThanOrEqual(Digits A, Digits B) { + return Compare(A, B) >= 0; +} + +inline int BitLength(Digits X) { + return X.len() * kDigitBits - CountLeadingZeros(X.msd()); +} + } // namespace bigint } // namespace v8 |