Diffstat (limited to 'chromium/v8/src/wasm/baseline/ia32')
-rw-r--r-- | chromium/v8/src/wasm/baseline/ia32/liftoff-assembler-ia32.h | 625 |
1 file changed, 591 insertions, 34 deletions
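The bulk of the insertions in the diff below add ia32 Liftoff support for saturating float-to-int conversions (kExprI32SConvertSatF32 and friends), a lock-prefixed AtomicAdd, and a batch of SIMD shift, shuffle, bitmask, and anytrue/alltrue operations. As a standalone reader aid (not part of the patch), the following portable C++ sketch models the semantics that the new liftoff::EmitSatTruncateFloatToInt helper emits inline with SSE4.1: NaN becomes 0, out-of-range values clamp to the destination type's min/max, and everything else truncates toward zero. The SatTruncate name and the scalar formulation are illustrative assumptions, not code from the patch.

#include <cmath>
#include <cstdint>
#include <iostream>
#include <limits>

// Scalar model of the saturating truncation the patch lowers to SSE4.1:
// NaN -> 0, out-of-range -> clamp, otherwise round toward zero.
template <typename Dst, typename Src>
Dst SatTruncate(Src value) {
  if (std::isnan(value)) return Dst{0};                       // NaN -> 0
  // Compare in the source floating-point type so the cast below is in range.
  if (value <= static_cast<Src>(std::numeric_limits<Dst>::min()))
    return std::numeric_limits<Dst>::min();                   // clamp low
  if (value >= static_cast<Src>(std::numeric_limits<Dst>::max()))
    return std::numeric_limits<Dst>::max();                   // clamp high
  return static_cast<Dst>(std::trunc(value));                 // round to zero
}

int main() {
  std::cout << SatTruncate<int32_t, float>(-1.9f) << "\n";           // -1
  std::cout << SatTruncate<uint32_t, float>(-1.0f) << "\n";          // 0
  std::cout << SatTruncate<int32_t, double>(1e12) << "\n";           // 2147483647
  std::cout << SatTruncate<int32_t, float>(std::nanf("")) << "\n";   // 0
}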
diff --git a/chromium/v8/src/wasm/baseline/ia32/liftoff-assembler-ia32.h b/chromium/v8/src/wasm/baseline/ia32/liftoff-assembler-ia32.h index 7a1d629bf2d..468450aef66 100644 --- a/chromium/v8/src/wasm/baseline/ia32/liftoff-assembler-ia32.h +++ b/chromium/v8/src/wasm/baseline/ia32/liftoff-assembler-ia32.h @@ -130,7 +130,7 @@ inline Register GetTmpByteRegister(LiftoffAssembler* assm, Register candidate) { if (candidate.is_byte_register()) return candidate; // {GetUnusedRegister()} may insert move instructions to spill registers to // the stack. This is OK because {mov} does not change the status flags. - return assm->GetUnusedRegister(liftoff::kByteRegs).gp(); + return assm->GetUnusedRegister(liftoff::kByteRegs, {}).gp(); } inline void MoveStackValue(LiftoffAssembler* assm, const Operand& src, @@ -336,8 +336,6 @@ void LiftoffAssembler::Load(LiftoffRegister dst, Register src_addr, case LoadType::kS128Load: movdqu(dst.fp(), src_op); break; - default: - UNREACHABLE(); } } @@ -405,8 +403,6 @@ void LiftoffAssembler::Store(Register dst_addr, Register offset_reg, case StoreType::kS128Store: Movdqu(dst_op, src.fp()); break; - default: - UNREACHABLE(); } } @@ -494,7 +490,56 @@ void LiftoffAssembler::AtomicStore(Register dst_addr, Register offset_reg, void LiftoffAssembler::AtomicAdd(Register dst_addr, Register offset_reg, uint32_t offset_imm, LiftoffRegister value, LiftoffRegister result, StoreType type) { - bailout(kAtomics, "AtomicAdd"); + if (type.value() == StoreType::kI64Store) { + bailout(kAtomics, "AtomicAdd"); + return; + } + + DCHECK_EQ(value, result); + DCHECK(!cache_state()->is_used(result)); + bool is_64_bit_op = type.value_type() == kWasmI64; + + Register value_reg = is_64_bit_op ? value.low_gp() : value.gp(); + Register result_reg = is_64_bit_op ? result.low_gp() : result.gp(); + + bool is_byte_store = type.size() == 1; + LiftoffRegList pinned = + LiftoffRegList::ForRegs(dst_addr, value_reg, offset_reg); + + // Ensure that {value_reg} is a valid register. + if (is_byte_store && !liftoff::kByteRegs.has(value_reg)) { + Register safe_value_reg = + GetUnusedRegister(liftoff::kByteRegs, pinned).gp(); + mov(safe_value_reg, value_reg); + value_reg = safe_value_reg; + } + + Operand dst_op = Operand(dst_addr, offset_reg, times_1, offset_imm); + lock(); + switch (type.value()) { + case StoreType::kI64Store8: + case StoreType::kI32Store8: + xadd_b(dst_op, value_reg); + movzx_b(result_reg, value_reg); + break; + case StoreType::kI64Store16: + case StoreType::kI32Store16: + xadd_w(dst_op, value_reg); + movzx_w(result_reg, value_reg); + break; + case StoreType::kI64Store32: + case StoreType::kI32Store: + xadd(dst_op, value_reg); + if (value_reg != result_reg) { + mov(result_reg, value_reg); + } + break; + default: + UNREACHABLE(); + } + if (is_64_bit_op) { + xor_(result.high_gp(), result.high_gp()); + } } void LiftoffAssembler::AtomicSub(Register dst_addr, Register offset_reg, @@ -1349,7 +1394,7 @@ inline void EmitFloatMinOrMax(LiftoffAssembler* assm, DoubleRegister dst, // We need one tmp register to extract the sign bit. Get it right at the // beginning, such that the spilling code is not accidentially jumped over. - Register tmp = assm->GetUnusedRegister(kGpReg).gp(); + Register tmp = assm->GetUnusedRegister(kGpReg, {}).gp(); #define dop(name, ...) 
\ do { \ @@ -1412,9 +1457,9 @@ void LiftoffAssembler::emit_f32_max(DoubleRegister dst, DoubleRegister lhs, void LiftoffAssembler::emit_f32_copysign(DoubleRegister dst, DoubleRegister lhs, DoubleRegister rhs) { static constexpr int kF32SignBit = 1 << 31; - Register scratch = GetUnusedRegister(kGpReg).gp(); - Register scratch2 = - GetUnusedRegister(kGpReg, LiftoffRegList::ForRegs(scratch)).gp(); + LiftoffRegList pinned; + Register scratch = pinned.set(GetUnusedRegister(kGpReg, pinned)).gp(); + Register scratch2 = GetUnusedRegister(kGpReg, pinned).gp(); Movd(scratch, lhs); // move {lhs} into {scratch}. and_(scratch, Immediate(~kF32SignBit)); // clear sign bit in {scratch}. Movd(scratch2, rhs); // move {rhs} into {scratch2}. @@ -1541,9 +1586,9 @@ void LiftoffAssembler::emit_f64_copysign(DoubleRegister dst, DoubleRegister lhs, static constexpr int kF32SignBit = 1 << 31; // On ia32, we cannot hold the whole f64 value in a gp register, so we just // operate on the upper half (UH). - Register scratch = GetUnusedRegister(kGpReg).gp(); - Register scratch2 = - GetUnusedRegister(kGpReg, LiftoffRegList::ForRegs(scratch)).gp(); + LiftoffRegList pinned; + Register scratch = pinned.set(GetUnusedRegister(kGpReg, pinned)).gp(); + Register scratch2 = GetUnusedRegister(kGpReg, pinned).gp(); Pextrd(scratch, lhs, 1); // move UH of {lhs} into {scratch}. and_(scratch, Immediate(~kF32SignBit)); // clear sign bit in {scratch}. @@ -1612,6 +1657,7 @@ void LiftoffAssembler::emit_f64_sqrt(DoubleRegister dst, DoubleRegister src) { } namespace liftoff { +#define __ assm-> // Used for float to int conversions. If the value in {converted_back} equals // {src} afterwards, the conversion succeeded. template <typename dst_type, typename src_type> @@ -1621,21 +1667,21 @@ inline void ConvertFloatToIntAndBack(LiftoffAssembler* assm, Register dst, LiftoffRegList pinned) { if (std::is_same<double, src_type>::value) { // f64 if (std::is_signed<dst_type>::value) { // f64 -> i32 - assm->cvttsd2si(dst, src); - assm->Cvtsi2sd(converted_back, dst); + __ cvttsd2si(dst, src); + __ Cvtsi2sd(converted_back, dst); } else { // f64 -> u32 - assm->Cvttsd2ui(dst, src, liftoff::kScratchDoubleReg); - assm->Cvtui2sd(converted_back, dst, - assm->GetUnusedRegister(kGpReg, pinned).gp()); + __ Cvttsd2ui(dst, src, liftoff::kScratchDoubleReg); + __ Cvtui2sd(converted_back, dst, + __ GetUnusedRegister(kGpReg, pinned).gp()); } } else { // f32 if (std::is_signed<dst_type>::value) { // f32 -> i32 - assm->cvttss2si(dst, src); - assm->Cvtsi2ss(converted_back, dst); + __ cvttss2si(dst, src); + __ Cvtsi2ss(converted_back, dst); } else { // f32 -> u32 - assm->Cvttss2ui(dst, src, liftoff::kScratchDoubleReg); - assm->Cvtui2ss(converted_back, dst, - assm->GetUnusedRegister(kGpReg, pinned).gp()); + __ Cvttss2ui(dst, src, liftoff::kScratchDoubleReg); + __ Cvtui2ss(converted_back, dst, + __ GetUnusedRegister(kGpReg, pinned).gp()); } } } @@ -1644,36 +1690,101 @@ template <typename dst_type, typename src_type> inline bool EmitTruncateFloatToInt(LiftoffAssembler* assm, Register dst, DoubleRegister src, Label* trap) { if (!CpuFeatures::IsSupported(SSE4_1)) { - assm->bailout(kMissingCPUFeature, "no SSE4.1"); + __ bailout(kMissingCPUFeature, "no SSE4.1"); return true; } CpuFeatureScope feature(assm, SSE4_1); LiftoffRegList pinned = LiftoffRegList::ForRegs(src, dst); DoubleRegister rounded = - pinned.set(assm->GetUnusedRegister(kFpReg, pinned)).fp(); + pinned.set(__ GetUnusedRegister(kFpReg, pinned)).fp(); DoubleRegister converted_back = - 
pinned.set(assm->GetUnusedRegister(kFpReg, pinned)).fp(); + pinned.set(__ GetUnusedRegister(kFpReg, pinned)).fp(); if (std::is_same<double, src_type>::value) { // f64 - assm->roundsd(rounded, src, kRoundToZero); + __ roundsd(rounded, src, kRoundToZero); } else { // f32 - assm->roundss(rounded, src, kRoundToZero); + __ roundss(rounded, src, kRoundToZero); } ConvertFloatToIntAndBack<dst_type, src_type>(assm, dst, rounded, converted_back, pinned); if (std::is_same<double, src_type>::value) { // f64 - assm->ucomisd(converted_back, rounded); + __ ucomisd(converted_back, rounded); } else { // f32 - assm->ucomiss(converted_back, rounded); + __ ucomiss(converted_back, rounded); } // Jump to trap if PF is 0 (one of the operands was NaN) or they are not // equal. - assm->j(parity_even, trap); - assm->j(not_equal, trap); + __ j(parity_even, trap); + __ j(not_equal, trap); return true; } + +template <typename dst_type, typename src_type> +inline bool EmitSatTruncateFloatToInt(LiftoffAssembler* assm, Register dst, + DoubleRegister src) { + if (!CpuFeatures::IsSupported(SSE4_1)) { + __ bailout(kMissingCPUFeature, "no SSE4.1"); + return true; + } + CpuFeatureScope feature(assm, SSE4_1); + + Label done; + Label not_nan; + Label src_positive; + + LiftoffRegList pinned = LiftoffRegList::ForRegs(src, dst); + DoubleRegister rounded = + pinned.set(__ GetUnusedRegister(kFpReg, pinned)).fp(); + DoubleRegister converted_back = + pinned.set(__ GetUnusedRegister(kFpReg, pinned)).fp(); + DoubleRegister zero_reg = + pinned.set(__ GetUnusedRegister(kFpReg, pinned)).fp(); + + if (std::is_same<double, src_type>::value) { // f64 + __ roundsd(rounded, src, kRoundToZero); + } else { // f32 + __ roundss(rounded, src, kRoundToZero); + } + + ConvertFloatToIntAndBack<dst_type, src_type>(assm, dst, rounded, + converted_back, pinned); + if (std::is_same<double, src_type>::value) { // f64 + __ ucomisd(converted_back, rounded); + } else { // f32 + __ ucomiss(converted_back, rounded); + } + + // Return 0 if PF is 0 (one of the operands was NaN) + __ j(parity_odd, ¬_nan); + __ xor_(dst, dst); + __ jmp(&done); + + __ bind(¬_nan); + // If rounding is as expected, return result + __ j(equal, &done); + + __ Xorpd(zero_reg, zero_reg); + + // if out-of-bounds, check if src is positive + if (std::is_same<double, src_type>::value) { // f64 + __ ucomisd(src, zero_reg); + } else { // f32 + __ ucomiss(src, zero_reg); + } + __ j(above, &src_positive); + __ mov(dst, Immediate(std::numeric_limits<dst_type>::min())); + __ jmp(&done); + + __ bind(&src_positive); + + __ mov(dst, Immediate(std::numeric_limits<dst_type>::max())); + + __ bind(&done); + return true; +} +#undef __ } // namespace liftoff bool LiftoffAssembler::emit_type_conversion(WasmOpcode opcode, @@ -1695,6 +1806,18 @@ bool LiftoffAssembler::emit_type_conversion(WasmOpcode opcode, case kExprI32UConvertF64: return liftoff::EmitTruncateFloatToInt<uint32_t, double>(this, dst.gp(), src.fp(), trap); + case kExprI32SConvertSatF32: + return liftoff::EmitSatTruncateFloatToInt<int32_t, float>(this, dst.gp(), + src.fp()); + case kExprI32UConvertSatF32: + return liftoff::EmitSatTruncateFloatToInt<uint32_t, float>(this, dst.gp(), + src.fp()); + case kExprI32SConvertSatF64: + return liftoff::EmitSatTruncateFloatToInt<int32_t, double>(this, dst.gp(), + src.fp()); + case kExprI32UConvertSatF64: + return liftoff::EmitSatTruncateFloatToInt<uint32_t, double>( + this, dst.gp(), src.fp()); case kExprI32ReinterpretF32: Movd(dst.gp(), src.fp()); return true; @@ -2017,8 +2140,164 @@ void 
EmitSimdShiftOpImm(LiftoffAssembler* assm, LiftoffRegister dst, (assm->*sse_op)(dst.fp(), shift); } } + +enum class ShiftSignedness { kSigned, kUnsigned }; + +template <bool is_signed> +void EmitI8x16Shr(LiftoffAssembler* assm, LiftoffRegister dst, + LiftoffRegister lhs, LiftoffRegister rhs) { + // Same algorithm is used for both signed and unsigned shifts, the only + // difference is the actual shift and pack in the end. This is the same + // algorithm as used in code-generator-ia32.cc + Register tmp = + assm->GetUnusedRegister(kGpReg, LiftoffRegList::ForRegs(rhs)).gp(); + XMMRegister tmp_simd = + assm->GetUnusedRegister(kFpReg, LiftoffRegList::ForRegs(dst, lhs)).fp(); + + // Unpack the bytes into words, do logical shifts, and repack. + assm->Punpckhbw(liftoff::kScratchDoubleReg, lhs.fp()); + assm->Punpcklbw(dst.fp(), lhs.fp()); + assm->mov(tmp, rhs.gp()); + // Take shift value modulo 8. + assm->and_(tmp, 7); + assm->add(tmp, Immediate(8)); + assm->Movd(tmp_simd, tmp); + if (is_signed) { + assm->Psraw(liftoff::kScratchDoubleReg, liftoff::kScratchDoubleReg, + tmp_simd); + assm->Psraw(dst.fp(), dst.fp(), tmp_simd); + assm->Packsswb(dst.fp(), liftoff::kScratchDoubleReg); + } else { + assm->Psrlw(liftoff::kScratchDoubleReg, liftoff::kScratchDoubleReg, + tmp_simd); + assm->Psrlw(dst.fp(), dst.fp(), tmp_simd); + assm->Packuswb(dst.fp(), liftoff::kScratchDoubleReg); + } +} + +inline void EmitAnyTrue(LiftoffAssembler* assm, LiftoffRegister dst, + LiftoffRegister src) { + Register tmp = + assm->GetUnusedRegister(kGpReg, LiftoffRegList::ForRegs(dst)).gp(); + assm->xor_(tmp, tmp); + assm->mov(dst.gp(), Immediate(1)); + assm->Ptest(src.fp(), src.fp()); + assm->cmov(zero, dst.gp(), tmp); +} + +template <void (TurboAssembler::*pcmp)(XMMRegister, XMMRegister)> +inline void EmitAllTrue(LiftoffAssembler* assm, LiftoffRegister dst, + LiftoffRegister src) { + Register tmp = + assm->GetUnusedRegister(kGpReg, LiftoffRegList::ForRegs(dst)).gp(); + XMMRegister tmp_simd = liftoff::kScratchDoubleReg; + assm->mov(tmp, Immediate(1)); + assm->xor_(dst.gp(), dst.gp()); + assm->Pxor(tmp_simd, tmp_simd); + (assm->*pcmp)(tmp_simd, src.fp()); + assm->Ptest(tmp_simd, tmp_simd); + assm->cmov(zero, dst.gp(), tmp); +} + } // namespace liftoff +void LiftoffAssembler::LoadTransform(LiftoffRegister dst, Register src_addr, + Register offset_reg, uint32_t offset_imm, + LoadType type, + LoadTransformationKind transform, + uint32_t* protected_load_pc) { + DCHECK_LE(offset_imm, std::numeric_limits<int32_t>::max()); + Operand src_op{src_addr, offset_reg, times_1, + static_cast<int32_t>(offset_imm)}; + *protected_load_pc = pc_offset(); + + MachineType memtype = type.mem_type(); + if (transform == LoadTransformationKind::kExtend) { + if (memtype == MachineType::Int8()) { + Pmovsxbw(dst.fp(), src_op); + } else if (memtype == MachineType::Uint8()) { + Pmovzxbw(dst.fp(), src_op); + } else if (memtype == MachineType::Int16()) { + Pmovsxwd(dst.fp(), src_op); + } else if (memtype == MachineType::Uint16()) { + Pmovzxwd(dst.fp(), src_op); + } else if (memtype == MachineType::Int32()) { + Pmovsxdq(dst.fp(), src_op); + } else if (memtype == MachineType::Uint32()) { + Pmovzxdq(dst.fp(), src_op); + } + } else { + DCHECK_EQ(LoadTransformationKind::kSplat, transform); + if (memtype == MachineType::Int8()) { + Pinsrb(dst.fp(), src_op, 0); + Pxor(liftoff::kScratchDoubleReg, liftoff::kScratchDoubleReg); + Pshufb(dst.fp(), liftoff::kScratchDoubleReg); + } else if (memtype == MachineType::Int16()) { + Pinsrw(dst.fp(), src_op, 0); + Pshuflw(dst.fp(), 
dst.fp(), uint8_t{0}); + Punpcklqdq(dst.fp(), dst.fp()); + } else if (memtype == MachineType::Int32()) { + Vbroadcastss(dst.fp(), src_op); + } else if (memtype == MachineType::Int64()) { + Movddup(dst.fp(), src_op); + } + } +} + +void LiftoffAssembler::emit_s8x16_shuffle(LiftoffRegister dst, + LiftoffRegister lhs, + LiftoffRegister rhs, + const uint8_t shuffle[16]) { + LiftoffRegister tmp = GetUnusedRegister(kGpReg, {}); + // Prepare 16 byte aligned buffer for shuffle control mask. + mov(tmp.gp(), esp); + and_(esp, -16); + movups(liftoff::kScratchDoubleReg, lhs.fp()); + + for (int i = 3; i >= 0; i--) { + uint32_t mask = 0; + for (int j = 3; j >= 0; j--) { + uint8_t lane = shuffle[i * 4 + j]; + mask <<= 8; + mask |= lane < kSimd128Size ? lane : 0x80; + } + push(Immediate(mask)); + } + Pshufb(liftoff::kScratchDoubleReg, Operand(esp, 0)); + + for (int i = 3; i >= 0; i--) { + uint32_t mask = 0; + for (int j = 3; j >= 0; j--) { + uint8_t lane = shuffle[i * 4 + j]; + mask <<= 8; + mask |= lane >= kSimd128Size ? (lane & 0x0F) : 0x80; + } + push(Immediate(mask)); + } + if (dst.fp() != rhs.fp()) { + movups(dst.fp(), rhs.fp()); + } + Pshufb(dst.fp(), Operand(esp, 0)); + Por(dst.fp(), liftoff::kScratchDoubleReg); + mov(esp, tmp.gp()); +} + +void LiftoffAssembler::emit_s8x16_swizzle(LiftoffRegister dst, + LiftoffRegister lhs, + LiftoffRegister rhs) { + XMMRegister mask = + GetUnusedRegister(kFpReg, LiftoffRegList::ForRegs(lhs, rhs)).fp(); + // Out-of-range indices should return 0, add 112 (0x70) so that any value > 15 + // saturates to 128 (top bit set), so pshufb will zero that lane. + TurboAssembler::Move(mask, uint32_t{0x70707070}); + Pshufd(mask, mask, uint8_t{0x0}); + Paddusb(mask, rhs.fp()); + if (lhs != dst) { + Movaps(dst.fp(), lhs.fp()); + } + Pshufb(dst.fp(), mask); +} + void LiftoffAssembler::emit_i8x16_splat(LiftoffRegister dst, LiftoffRegister src) { Movd(dst.fp(), src.gp()); @@ -2350,6 +2629,21 @@ void LiftoffAssembler::emit_i8x16_neg(LiftoffRegister dst, } } +void LiftoffAssembler::emit_v8x16_anytrue(LiftoffRegister dst, + LiftoffRegister src) { + liftoff::EmitAnyTrue(this, dst, src); +} + +void LiftoffAssembler::emit_v8x16_alltrue(LiftoffRegister dst, + LiftoffRegister src) { + liftoff::EmitAllTrue<&TurboAssembler::Pcmpeqb>(this, dst, src); +} + +void LiftoffAssembler::emit_i8x16_bitmask(LiftoffRegister dst, + LiftoffRegister src) { + Pmovmskb(dst.gp(), src.fp()); +} + void LiftoffAssembler::emit_i8x16_shl(LiftoffRegister dst, LiftoffRegister lhs, LiftoffRegister rhs) { static constexpr RegClass tmp_rc = reg_class_for(ValueType::kI32); @@ -2381,7 +2675,7 @@ void LiftoffAssembler::emit_i8x16_shl(LiftoffRegister dst, LiftoffRegister lhs, void LiftoffAssembler::emit_i8x16_shli(LiftoffRegister dst, LiftoffRegister lhs, int32_t rhs) { static constexpr RegClass tmp_rc = reg_class_for(ValueType::kI32); - LiftoffRegister tmp = GetUnusedRegister(tmp_rc); + LiftoffRegister tmp = GetUnusedRegister(tmp_rc, {}); byte shift = static_cast<byte>(rhs & 0x7); if (CpuFeatures::IsSupported(AVX)) { CpuFeatureScope scope(this, AVX); @@ -2399,6 +2693,43 @@ void LiftoffAssembler::emit_i8x16_shli(LiftoffRegister dst, LiftoffRegister lhs, Pand(dst.fp(), liftoff::kScratchDoubleReg); } +void LiftoffAssembler::emit_i8x16_shr_s(LiftoffRegister dst, + LiftoffRegister lhs, + LiftoffRegister rhs) { + liftoff::EmitI8x16Shr</*is_signed=*/true>(this, dst, lhs, rhs); +} + +void LiftoffAssembler::emit_i8x16_shri_s(LiftoffRegister dst, + LiftoffRegister lhs, int32_t rhs) { + Punpckhbw(liftoff::kScratchDoubleReg, lhs.fp()); 
+ Punpcklbw(dst.fp(), lhs.fp()); + uint8_t shift = (rhs & 7) + 8; + Psraw(liftoff::kScratchDoubleReg, shift); + Psraw(dst.fp(), shift); + Packsswb(dst.fp(), liftoff::kScratchDoubleReg); +} + +void LiftoffAssembler::emit_i8x16_shr_u(LiftoffRegister dst, + LiftoffRegister lhs, + LiftoffRegister rhs) { + liftoff::EmitI8x16Shr</*is_signed=*/false>(this, dst, lhs, rhs); +} + +void LiftoffAssembler::emit_i8x16_shri_u(LiftoffRegister dst, + LiftoffRegister lhs, int32_t rhs) { + Register tmp = GetUnusedRegister(kGpReg, {}).gp(); + // Perform 16-bit shift, then mask away high bits. + uint8_t shift = rhs & 7; + Psrlw(dst.fp(), lhs.fp(), byte{shift}); + + uint8_t bmask = 0xff >> shift; + uint32_t mask = bmask << 24 | bmask << 16 | bmask << 8 | bmask; + mov(tmp, mask); + Movd(liftoff::kScratchDoubleReg, tmp); + Pshufd(liftoff::kScratchDoubleReg, liftoff::kScratchDoubleReg, 0); + Pand(dst.fp(), liftoff::kScratchDoubleReg); +} + void LiftoffAssembler::emit_i8x16_add(LiftoffRegister dst, LiftoffRegister lhs, LiftoffRegister rhs) { liftoff::EmitSimdCommutativeBinOp<&Assembler::vpaddb, &Assembler::paddb>( @@ -2541,6 +2872,24 @@ void LiftoffAssembler::emit_i16x8_neg(LiftoffRegister dst, } } +void LiftoffAssembler::emit_v16x8_anytrue(LiftoffRegister dst, + LiftoffRegister src) { + liftoff::EmitAnyTrue(this, dst, src); +} + +void LiftoffAssembler::emit_v16x8_alltrue(LiftoffRegister dst, + LiftoffRegister src) { + liftoff::EmitAllTrue<&TurboAssembler::Pcmpeqw>(this, dst, src); +} + +void LiftoffAssembler::emit_i16x8_bitmask(LiftoffRegister dst, + LiftoffRegister src) { + XMMRegister tmp = liftoff::kScratchDoubleReg; + Packsswb(tmp, src.fp()); + Pmovmskb(dst.gp(), tmp); + shr(dst.gp(), 8); +} + void LiftoffAssembler::emit_i16x8_shl(LiftoffRegister dst, LiftoffRegister lhs, LiftoffRegister rhs) { liftoff::EmitSimdShiftOp<&Assembler::vpsllw, &Assembler::psllw, 4>(this, dst, @@ -2553,6 +2902,32 @@ void LiftoffAssembler::emit_i16x8_shli(LiftoffRegister dst, LiftoffRegister lhs, this, dst, lhs, rhs); } +void LiftoffAssembler::emit_i16x8_shr_s(LiftoffRegister dst, + LiftoffRegister lhs, + LiftoffRegister rhs) { + liftoff::EmitSimdShiftOp<&Assembler::vpsraw, &Assembler::psraw, 4>(this, dst, + lhs, rhs); +} + +void LiftoffAssembler::emit_i16x8_shri_s(LiftoffRegister dst, + LiftoffRegister lhs, int32_t rhs) { + liftoff::EmitSimdShiftOpImm<&Assembler::vpsraw, &Assembler::psraw, 4>( + this, dst, lhs, rhs); +} + +void LiftoffAssembler::emit_i16x8_shr_u(LiftoffRegister dst, + LiftoffRegister lhs, + LiftoffRegister rhs) { + liftoff::EmitSimdShiftOp<&Assembler::vpsrlw, &Assembler::psrlw, 4>(this, dst, + lhs, rhs); +} + +void LiftoffAssembler::emit_i16x8_shri_u(LiftoffRegister dst, + LiftoffRegister lhs, int32_t rhs) { + liftoff::EmitSimdShiftOpImm<&Assembler::vpsrlw, &Assembler::psrlw, 4>( + this, dst, lhs, rhs); +} + void LiftoffAssembler::emit_i16x8_add(LiftoffRegister dst, LiftoffRegister lhs, LiftoffRegister rhs) { liftoff::EmitSimdCommutativeBinOp<&Assembler::vpaddw, &Assembler::paddw>( @@ -2639,6 +3014,21 @@ void LiftoffAssembler::emit_i32x4_neg(LiftoffRegister dst, } } +void LiftoffAssembler::emit_v32x4_anytrue(LiftoffRegister dst, + LiftoffRegister src) { + liftoff::EmitAnyTrue(this, dst, src); +} + +void LiftoffAssembler::emit_v32x4_alltrue(LiftoffRegister dst, + LiftoffRegister src) { + liftoff::EmitAllTrue<&TurboAssembler::Pcmpeqd>(this, dst, src); +} + +void LiftoffAssembler::emit_i32x4_bitmask(LiftoffRegister dst, + LiftoffRegister src) { + Movmskps(dst.gp(), src.fp()); +} + void 
LiftoffAssembler::emit_i32x4_shl(LiftoffRegister dst, LiftoffRegister lhs, LiftoffRegister rhs) { liftoff::EmitSimdShiftOp<&Assembler::vpslld, &Assembler::pslld, 5>(this, dst, @@ -2651,6 +3041,32 @@ void LiftoffAssembler::emit_i32x4_shli(LiftoffRegister dst, LiftoffRegister lhs, this, dst, lhs, rhs); } +void LiftoffAssembler::emit_i32x4_shr_s(LiftoffRegister dst, + LiftoffRegister lhs, + LiftoffRegister rhs) { + liftoff::EmitSimdShiftOp<&Assembler::vpsrad, &Assembler::psrad, 5>(this, dst, + lhs, rhs); +} + +void LiftoffAssembler::emit_i32x4_shri_s(LiftoffRegister dst, + LiftoffRegister lhs, int32_t rhs) { + liftoff::EmitSimdShiftOpImm<&Assembler::vpsrad, &Assembler::psrad, 5>( + this, dst, lhs, rhs); +} + +void LiftoffAssembler::emit_i32x4_shr_u(LiftoffRegister dst, + LiftoffRegister lhs, + LiftoffRegister rhs) { + liftoff::EmitSimdShiftOp<&Assembler::vpsrld, &Assembler::psrld, 5>(this, dst, + lhs, rhs); +} + +void LiftoffAssembler::emit_i32x4_shri_u(LiftoffRegister dst, + LiftoffRegister lhs, int32_t rhs) { + liftoff::EmitSimdShiftOpImm<&Assembler::vpsrld, &Assembler::psrld, 5>( + this, dst, lhs, rhs); +} + void LiftoffAssembler::emit_i32x4_add(LiftoffRegister dst, LiftoffRegister lhs, LiftoffRegister rhs) { liftoff::EmitSimdCommutativeBinOp<&Assembler::vpaddd, &Assembler::paddd>( @@ -2723,6 +3139,56 @@ void LiftoffAssembler::emit_i64x2_shli(LiftoffRegister dst, LiftoffRegister lhs, this, dst, lhs, rhs); } +void LiftoffAssembler::emit_i64x2_shr_s(LiftoffRegister dst, + LiftoffRegister lhs, + LiftoffRegister rhs) { + XMMRegister shift = liftoff::kScratchDoubleReg; + XMMRegister tmp = + GetUnusedRegister(RegClass::kFpReg, LiftoffRegList::ForRegs(dst, lhs)) + .fp(); + + // Take shift value modulo 64. + and_(rhs.gp(), Immediate(63)); + Movd(shift, rhs.gp()); + + // Set up a mask [0x80000000,0,0x80000000,0]. + Pcmpeqb(tmp, tmp); + Psllq(tmp, tmp, 63); + + Psrlq(tmp, tmp, shift); + Psrlq(dst.fp(), lhs.fp(), shift); + Pxor(dst.fp(), tmp); + Psubq(dst.fp(), tmp); +} + +void LiftoffAssembler::emit_i64x2_shri_s(LiftoffRegister dst, + LiftoffRegister lhs, int32_t rhs) { + XMMRegister tmp = liftoff::kScratchDoubleReg; + int32_t shift = rhs & 63; + + // Set up a mask [0x80000000,0,0x80000000,0]. 
+ Pcmpeqb(tmp, tmp); + Psllq(tmp, tmp, 63); + + Psrlq(tmp, tmp, shift); + Psrlq(dst.fp(), lhs.fp(), shift); + Pxor(dst.fp(), tmp); + Psubq(dst.fp(), tmp); +} + +void LiftoffAssembler::emit_i64x2_shr_u(LiftoffRegister dst, + LiftoffRegister lhs, + LiftoffRegister rhs) { + liftoff::EmitSimdShiftOp<&Assembler::vpsrlq, &Assembler::psrlq, 6>(this, dst, + lhs, rhs); +} + +void LiftoffAssembler::emit_i64x2_shri_u(LiftoffRegister dst, + LiftoffRegister lhs, int32_t rhs) { + liftoff::EmitSimdShiftOpImm<&Assembler::vpsrlq, &Assembler::psrlq, 6>( + this, dst, lhs, rhs); +} + void LiftoffAssembler::emit_i64x2_add(LiftoffRegister dst, LiftoffRegister lhs, LiftoffRegister rhs) { liftoff::EmitSimdCommutativeBinOp<&Assembler::vpaddq, &Assembler::paddq>( @@ -2990,6 +3456,97 @@ void LiftoffAssembler::emit_f64x2_max(LiftoffRegister dst, LiftoffRegister lhs, Andnpd(dst.fp(), liftoff::kScratchDoubleReg); } +void LiftoffAssembler::emit_i32x4_sconvert_f32x4(LiftoffRegister dst, + LiftoffRegister src) { + // NAN->0 + if (CpuFeatures::IsSupported(AVX)) { + CpuFeatureScope scope(this, AVX); + vcmpeqps(liftoff::kScratchDoubleReg, src.fp(), src.fp()); + vpand(dst.fp(), src.fp(), liftoff::kScratchDoubleReg); + } else { + movaps(liftoff::kScratchDoubleReg, src.fp()); + cmpeqps(liftoff::kScratchDoubleReg, liftoff::kScratchDoubleReg); + if (dst.fp() != src.fp()) movaps(dst.fp(), src.fp()); + pand(dst.fp(), liftoff::kScratchDoubleReg); + } + // Set top bit if >= 0 (but not -0.0!). + Pxor(liftoff::kScratchDoubleReg, dst.fp()); + // Convert to int. + Cvttps2dq(dst.fp(), dst.fp()); + // Set top bit if >=0 is now < 0. + Pand(liftoff::kScratchDoubleReg, dst.fp()); + Psrad(liftoff::kScratchDoubleReg, liftoff::kScratchDoubleReg, byte{31}); + // Set positive overflow lanes to 0x7FFFFFFF. + Pxor(dst.fp(), liftoff::kScratchDoubleReg); +} + +void LiftoffAssembler::emit_i32x4_uconvert_f32x4(LiftoffRegister dst, + LiftoffRegister src) { + static constexpr RegClass tmp_rc = reg_class_for(ValueType::kS128); + DoubleRegister tmp = + GetUnusedRegister(tmp_rc, LiftoffRegList::ForRegs(dst, src)).fp(); + // NAN->0, negative->0. + Pxor(liftoff::kScratchDoubleReg, liftoff::kScratchDoubleReg); + if (CpuFeatures::IsSupported(AVX)) { + CpuFeatureScope scope(this, AVX); + vmaxps(dst.fp(), src.fp(), liftoff::kScratchDoubleReg); + } else { + if (dst.fp() != src.fp()) movaps(dst.fp(), src.fp()); + maxps(dst.fp(), liftoff::kScratchDoubleReg); + } + // scratch: float representation of max_signed. + Pcmpeqd(liftoff::kScratchDoubleReg, liftoff::kScratchDoubleReg); + Psrld(liftoff::kScratchDoubleReg, liftoff::kScratchDoubleReg, + uint8_t{1}); // 0x7fffffff + Cvtdq2ps(liftoff::kScratchDoubleReg, + liftoff::kScratchDoubleReg); // 0x4f000000 + // tmp: convert (src-max_signed). + // Set positive overflow lanes to 0x7FFFFFFF. + // Set negative lanes to 0. + if (CpuFeatures::IsSupported(AVX)) { + CpuFeatureScope scope(this, AVX); + vsubps(tmp, dst.fp(), liftoff::kScratchDoubleReg); + } else { + movaps(tmp, dst.fp()); + subps(tmp, liftoff::kScratchDoubleReg); + } + Cmpleps(liftoff::kScratchDoubleReg, liftoff::kScratchDoubleReg, tmp); + Cvttps2dq(tmp, tmp); + Pxor(tmp, liftoff::kScratchDoubleReg); + Pxor(liftoff::kScratchDoubleReg, liftoff::kScratchDoubleReg); + Pmaxsd(tmp, liftoff::kScratchDoubleReg); + // Convert to int. Overflow lanes above max_signed will be 0x80000000. + Cvttps2dq(dst.fp(), dst.fp()); + // Add (src-max_signed) for overflow lanes. 
+ Paddd(dst.fp(), dst.fp(), tmp); +} + +void LiftoffAssembler::emit_f32x4_sconvert_i32x4(LiftoffRegister dst, + LiftoffRegister src) { + Cvtdq2ps(dst.fp(), src.fp()); +} + +void LiftoffAssembler::emit_f32x4_uconvert_i32x4(LiftoffRegister dst, + LiftoffRegister src) { + Pxor(liftoff::kScratchDoubleReg, liftoff::kScratchDoubleReg); // Zeros. + Pblendw(liftoff::kScratchDoubleReg, src.fp(), + uint8_t{0x55}); // Get lo 16 bits. + if (CpuFeatures::IsSupported(AVX)) { + CpuFeatureScope scope(this, AVX); + vpsubd(dst.fp(), src.fp(), liftoff::kScratchDoubleReg); // Get hi 16 bits. + } else { + if (dst.fp() != src.fp()) movaps(dst.fp(), src.fp()); + psubd(dst.fp(), liftoff::kScratchDoubleReg); + } + Cvtdq2ps(liftoff::kScratchDoubleReg, + liftoff::kScratchDoubleReg); // Convert lo exactly. + Psrld(dst.fp(), dst.fp(), byte{1}); // Divide by 2 to get in unsigned range. + Cvtdq2ps(dst.fp(), dst.fp()); // Convert hi, exactly. + Addps(dst.fp(), dst.fp(), dst.fp()); // Double hi, exactly. + Addps(dst.fp(), dst.fp(), + liftoff::kScratchDoubleReg); // Add hi and lo, may round. +} + void LiftoffAssembler::emit_i8x16_sconvert_i16x8(LiftoffRegister dst, LiftoffRegister lhs, LiftoffRegister rhs) { @@ -3270,7 +3827,7 @@ void LiftoffAssembler::StackCheck(Label* ool_code, Register limit_address) { } void LiftoffAssembler::CallTrapCallbackForTesting() { - PrepareCallCFunction(0, GetUnusedRegister(kGpReg).gp()); + PrepareCallCFunction(0, GetUnusedRegister(kGpReg, {}).gp()); CallCFunction(ExternalReference::wasm_call_trap_callback_for_testing(), 0); } |
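As a second standalone sketch (again not part of the patch), here is a scalar model of the trick used by the new emit_s8x16_swizzle above: adding 0x70 (112) to each lane index with unsigned byte saturation (Paddusb) pushes any index greater than 15 into the 0x80..0xFF range, and pshufb writes zero to a destination byte whenever the top bit of its index byte is set. The AddSatU8, Pshufb, and Swizzle helpers below are illustrative models, not V8 APIs.

#include <array>
#include <cstdint>
#include <cstdio>

using Bytes16 = std::array<uint8_t, 16>;

// Model of PADDUSB: byte-wise add with unsigned saturation.
Bytes16 AddSatU8(const Bytes16& a, uint8_t b) {
  Bytes16 r;
  for (int i = 0; i < 16; ++i) {
    int sum = a[i] + b;
    r[i] = sum > 0xFF ? 0xFF : static_cast<uint8_t>(sum);
  }
  return r;
}

// Model of PSHUFB: top bit set -> write 0, otherwise select src[index & 0x0F].
Bytes16 Pshufb(const Bytes16& src, const Bytes16& mask) {
  Bytes16 r;
  for (int i = 0; i < 16; ++i)
    r[i] = (mask[i] & 0x80) ? 0 : src[mask[i] & 0x0F];
  return r;
}

Bytes16 Swizzle(const Bytes16& lhs, const Bytes16& indices) {
  // Indices 0..15 stay below 0x80 after +0x70; 16..255 saturate to >= 0x80,
  // so pshufb zeroes those lanes, matching the out-of-range semantics.
  return Pshufb(lhs, AddSatU8(indices, 0x70));
}

int main() {
  Bytes16 data{};
  for (int i = 0; i < 16; ++i) data[i] = static_cast<uint8_t>(100 + i);
  Bytes16 idx{0, 15, 16, 255, 3, 3, 3, 3, 0, 0, 0, 0, 0, 0, 0, 0};
  Bytes16 out = Swizzle(data, idx);
  for (uint8_t b : out) std::printf("%d ", b);   // 100 115 0 0 103 ...
  std::printf("\n");
}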