; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=riscv64 -mattr=+m,+v -riscv-v-vector-bits-min=256 | FileCheck %s --check-prefixes=CHECK,V
; RUN: llc < %s -mtriple=riscv64 -mattr=+m,+f,+zve32f -riscv-v-vector-bits-min=256 | FileCheck %s --check-prefixes=CHECK,ZVE32F
; RUN: llc < %s -mtriple=riscv64 -mattr=+m,+v,+no-optimized-zero-stride-load -riscv-v-vector-bits-min=256 | FileCheck %s --check-prefixes=CHECK,NOT-OPTIMIZED
; RUN: llc < %s -mtriple=riscv64 -mattr=+m,+f,+zve32f,+no-optimized-zero-stride-load -riscv-v-vector-bits-min=256 | FileCheck %s --check-prefixes=CHECK,NOT-OPTIMIZED

%struct.foo = type { i32, i32, i32, i32 }

; void gather(signed char * __restrict A, signed char * __restrict B) {
;   for (int i = 0; i != 1024; ++i)
;     A[i] += B[i * 5];
; }
define void @gather(ptr noalias nocapture %A, ptr noalias nocapture readonly %B) {
; CHECK-LABEL: gather:
; CHECK:       # %bb.0: # %entry
; CHECK-NEXT:    li a2, 1024
; CHECK-NEXT:    li a3, 32
; CHECK-NEXT:    li a4, 5
; CHECK-NEXT:  .LBB0_1: # %vector.body
; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vsetvli zero, a3, e8, m1, ta, ma
; CHECK-NEXT:    vlse8.v v8, (a1), a4
; CHECK-NEXT:    vle8.v v9, (a0)
; CHECK-NEXT:    vadd.vv v8, v9, v8
; CHECK-NEXT:    vse8.v v8, (a0)
; CHECK-NEXT:    addi a2, a2, -32
; CHECK-NEXT:    addi a0, a0, 32
; CHECK-NEXT:    addi a1, a1, 160
; CHECK-NEXT:    bnez a2, .LBB0_1
; CHECK-NEXT:  # %bb.2: # %for.cond.cleanup
; CHECK-NEXT:    ret
entry:
  br label %vector.body

vector.body:                                      ; preds = %vector.body, %entry
  %index = phi i64 [ 0, %entry ], [ %index.next, %vector.body ]
  %vec.ind = phi <32 x i64> [ <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7, i64 8, i64 9, i64 10, i64 11, i64 12, i64 13, i64 14, i64 15, i64 16, i64 17, i64 18, i64 19, i64 20, i64 21, i64 22, i64 23, i64 24, i64 25, i64 26, i64 27, i64 28, i64 29, i64 30, i64 31>, %entry ], [ %vec.ind.next, %vector.body ]
  %i = mul nuw nsw <32 x i64> %vec.ind, splat (i64 5)
  %i1 = getelementptr inbounds i8, ptr %B, <32 x i64> %i
  %wide.masked.gather = call <32 x i8> @llvm.masked.gather.v32i8.v32p0(<32 x ptr> %i1, i32 1, <32 x i1> splat (i1 true), <32 x i8> undef)
  %i2 = getelementptr inbounds i8, ptr %A, i64 %index
  %wide.load = load <32 x i8>, ptr %i2, align 1
  %i4 = add <32 x i8> %wide.load, %wide.masked.gather
  store <32 x i8> %i4, ptr %i2, align 1
  %index.next = add nuw i64 %index, 32
  %vec.ind.next = add <32 x i64> %vec.ind, splat (i64 32)
  %i6 = icmp eq i64 %index.next, 1024
  br i1 %i6, label %for.cond.cleanup, label %vector.body

for.cond.cleanup:                                 ; preds = %vector.body
  ret void
}
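
; Like @gather, but the strided load is masked: the mask constant 0xf02d5369
; is materialized into v0 via lui/addiw, the load stays a vlse8.v with v0.t,
; and the %maskedoff passthru is carried in with vmv1r.v.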
define void @gather_masked(ptr noalias nocapture %A, ptr noalias nocapture readonly %B, <32 x i8> %maskedoff) {
; V-LABEL: gather_masked:
; V:       # %bb.0: # %entry
; V-NEXT:    li a2, 1024
; V-NEXT:    lui a3, 983765
; V-NEXT:    addiw a3, a3, 873
; V-NEXT:    vsetivli zero, 1, e32, mf2, ta, ma
; V-NEXT:    vmv.s.x v0, a3
; V-NEXT:    li a3, 32
; V-NEXT:    li a4, 5
; V-NEXT:  .LBB1_1: # %vector.body
; V-NEXT:    # =>This Inner Loop Header: Depth=1
; V-NEXT:    vsetvli zero, a3, e8, m1, ta, mu
; V-NEXT:    vmv1r.v v9, v8
; V-NEXT:    vlse8.v v9, (a1), a4, v0.t
; V-NEXT:    vle8.v v10, (a0)
; V-NEXT:    vadd.vv v9, v10, v9
; V-NEXT:    vse8.v v9, (a0)
; V-NEXT:    addi a2, a2, -32
; V-NEXT:    addi a0, a0, 32
; V-NEXT:    addi a1, a1, 160
; V-NEXT:    bnez a2, .LBB1_1
; V-NEXT:  # %bb.2: # %for.cond.cleanup
; V-NEXT:    ret
;
; ZVE32F-LABEL: gather_masked:
; ZVE32F:       # %bb.0: # %entry
; ZVE32F-NEXT:    li a2, 1024
; ZVE32F-NEXT:    lui a3, 983765
; ZVE32F-NEXT:    addiw a3, a3, 873
; ZVE32F-NEXT:    vsetivli zero, 1, e32, m1, ta, ma
; ZVE32F-NEXT:    vmv.s.x v0, a3
; ZVE32F-NEXT:    li a3, 32
; ZVE32F-NEXT:    li a4, 5
; ZVE32F-NEXT:  .LBB1_1: # %vector.body
; ZVE32F-NEXT:    # =>This Inner Loop Header: Depth=1
; ZVE32F-NEXT:    vsetvli zero, a3, e8, m1, ta, mu
; ZVE32F-NEXT:    vmv1r.v v9, v8
; ZVE32F-NEXT:    vlse8.v v9, (a1), a4, v0.t
; ZVE32F-NEXT:    vle8.v v10, (a0)
; ZVE32F-NEXT:    vadd.vv v9, v10, v9
; ZVE32F-NEXT:    vse8.v v9, (a0)
; ZVE32F-NEXT:    addi a2, a2, -32
; ZVE32F-NEXT:    addi a0, a0, 32
; ZVE32F-NEXT:    addi a1, a1, 160
; ZVE32F-NEXT:    bnez a2, .LBB1_1
; ZVE32F-NEXT:  # %bb.2: # %for.cond.cleanup
; ZVE32F-NEXT:    ret
entry:
  br label %vector.body

vector.body:                                      ; preds = %vector.body, %entry
  %index = phi i64 [ 0, %entry ], [ %index.next, %vector.body ]
  %vec.ind = phi <32 x i64> [ <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7, i64 8, i64 9, i64 10, i64 11, i64 12, i64 13, i64 14, i64 15, i64 16, i64 17, i64 18, i64 19, i64 20, i64 21, i64 22, i64 23, i64 24, i64 25, i64 26, i64 27, i64 28, i64 29, i64 30, i64 31>, %entry ], [ %vec.ind.next, %vector.body ]
  %i = mul nuw nsw <32 x i64> %vec.ind, splat (i64 5)
  %i1 = getelementptr inbounds i8, ptr %B, <32 x i64> %i
  %wide.masked.gather = call <32 x i8> @llvm.masked.gather.v32i8.v32p0(<32 x ptr> %i1, i32 1, <32 x i1> <i1 true, i1 false, i1 false, i1 true, i1 false, i1 true, i1 true, i1 false, i1 true, i1 true, i1 false, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 true, i1 false, i1 true, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 true, i1 true, i1 true, i1 true>, <32 x i8> %maskedoff)
  %i2 = getelementptr inbounds i8, ptr %A, i64 %index
  %wide.load = load <32 x i8>, ptr %i2, align 1
  %i4 = add <32 x i8> %wide.load, %wide.masked.gather
  store <32 x i8> %i4, ptr %i2, align 1
  %index.next = add nuw i64 %index, 32
  %vec.ind.next = add <32 x i64> %vec.ind, splat (i64 32)
  %i6 = icmp eq i64 %index.next, 1024
  br i1 %i6, label %for.cond.cleanup, label %vector.body

for.cond.cleanup:                                 ; preds = %vector.body
  ret void
}
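
; Same pattern as @gather, but the induction vector counts down from 31, so
; the gather becomes a strided load with stride -5 starting at B + 31 * 5.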
define void @gather_negative_stride(ptr noalias nocapture %A, ptr noalias nocapture readonly %B) {
; CHECK-LABEL: gather_negative_stride:
; CHECK:       # %bb.0: # %entry
; CHECK-NEXT:    addi a1, a1, 155
; CHECK-NEXT:    li a2, 1024
; CHECK-NEXT:    li a3, 32
; CHECK-NEXT:    li a4, -5
; CHECK-NEXT:  .LBB2_1: # %vector.body
; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vsetvli zero, a3, e8, m1, ta, ma
; CHECK-NEXT:    vlse8.v v8, (a1), a4
; CHECK-NEXT:    vle8.v v9, (a0)
; CHECK-NEXT:    vadd.vv v8, v9, v8
; CHECK-NEXT:    vse8.v v8, (a0)
; CHECK-NEXT:    addi a2, a2, -32
; CHECK-NEXT:    addi a0, a0, 32
; CHECK-NEXT:    addi a1, a1, 160
; CHECK-NEXT:    bnez a2, .LBB2_1
; CHECK-NEXT:  # %bb.2: # %for.cond.cleanup
; CHECK-NEXT:    ret
entry:
  br label %vector.body

vector.body:                                      ; preds = %vector.body, %entry
  %index = phi i64 [ 0, %entry ], [ %index.next, %vector.body ]
  %vec.ind = phi <32 x i64> [ <i64 31, i64 30, i64 29, i64 28, i64 27, i64 26, i64 25, i64 24, i64 23, i64 22, i64 21, i64 20, i64 19, i64 18, i64 17, i64 16, i64 15, i64 14, i64 13, i64 12, i64 11, i64 10, i64 9, i64 8, i64 7, i64 6, i64 5, i64 4, i64 3, i64 2, i64 1, i64 0>, %entry ], [ %vec.ind.next, %vector.body ]
  %i = mul nuw nsw <32 x i64> %vec.ind, splat (i64 5)
  %i1 = getelementptr inbounds i8, ptr %B, <32 x i64> %i
  %wide.masked.gather = call <32 x i8> @llvm.masked.gather.v32i8.v32p0(<32 x ptr> %i1, i32 1, <32 x i1> splat (i1 true), <32 x i8> undef)
  %i2 = getelementptr inbounds i8, ptr %A, i64 %index
  %wide.load = load <32 x i8>, ptr %i2, align 1
  %i4 = add <32 x i8> %wide.load, %wide.masked.gather
  store <32 x i8> %i4, ptr %i2, align 1
  %index.next = add nuw i64 %index, 32
  %vec.ind.next = add <32 x i64> %vec.ind, splat (i64 32)
  %i6 = icmp eq i64 %index.next, 1024
  br i1 %i6, label %for.cond.cleanup, label %vector.body

for.cond.cleanup:                                 ; preds = %vector.body
  ret void
}
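
; The induction vector is a splat, so every lane reads B[0] within one
; iteration; the broadcast folds into the add and we get a scalar lbu plus
; vadd.vx instead of a gather.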
define void @gather_zero_stride(ptr noalias nocapture %A, ptr noalias nocapture readonly %B) {
; CHECK-LABEL: gather_zero_stride:
; CHECK:       # %bb.0: # %entry
; CHECK-NEXT:    li a2, 1024
; CHECK-NEXT:    li a3, 32
; CHECK-NEXT:  .LBB3_1: # %vector.body
; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    lbu a4, 0(a1)
; CHECK-NEXT:    vsetvli zero, a3, e8, m1, ta, ma
; CHECK-NEXT:    vle8.v v8, (a0)
; CHECK-NEXT:    vadd.vx v8, v8, a4
; CHECK-NEXT:    vse8.v v8, (a0)
; CHECK-NEXT:    addi a2, a2, -32
; CHECK-NEXT:    addi a0, a0, 32
; CHECK-NEXT:    addi a1, a1, 160
; CHECK-NEXT:    bnez a2, .LBB3_1
; CHECK-NEXT:  # %bb.2: # %for.cond.cleanup
; CHECK-NEXT:    ret
entry:
  br label %vector.body

vector.body:                                      ; preds = %vector.body, %entry
  %index = phi i64 [ 0, %entry ], [ %index.next, %vector.body ]
  %vec.ind = phi <32 x i64> [ zeroinitializer, %entry ], [ %vec.ind.next, %vector.body ]
  %i = mul nuw nsw <32 x i64> %vec.ind, splat (i64 5)
  %i1 = getelementptr inbounds i8, ptr %B, <32 x i64> %i
  %wide.masked.gather = call <32 x i8> @llvm.masked.gather.v32i8.v32p0(<32 x ptr> %i1, i32 1, <32 x i1> splat (i1 true), <32 x i8> undef)
  %i2 = getelementptr inbounds i8, ptr %A, i64 %index
  %wide.load = load <32 x i8>, ptr %i2, align 1
  %i4 = add <32 x i8> %wide.load, %wide.masked.gather
  store <32 x i8> %i4, ptr %i2, align 1
  %index.next = add nuw i64 %index, 32
  %vec.ind.next = add <32 x i64> %vec.ind, splat (i64 32)
  %i6 = icmp eq i64 %index.next, 1024
  br i1 %i6, label %for.cond.cleanup, label %vector.body

for.cond.cleanup:                                 ; preds = %vector.body
  ret void
}
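
; As above, every lane reads the same address, but the gather feeds a udiv
; with the loaded value as the dividend, so the splat cannot fold into a .vx
; op: with optimized zero-stride loads we keep a stride-zero vlse8.v, and
; under +no-optimized-zero-stride-load it is unfolded into a scalar load plus
; vmv.v.x.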
define void @gather_zero_stride_unfold(ptr noalias nocapture %A, ptr noalias nocapture readonly %B) {
; V-LABEL: gather_zero_stride_unfold:
; V:       # %bb.0: # %entry
; V-NEXT:    li a2, 1024
; V-NEXT:    li a3, 32
; V-NEXT:  .LBB4_1: # %vector.body
; V-NEXT:    # =>This Inner Loop Header: Depth=1
; V-NEXT:    vsetvli zero, a3, e8, m1, ta, ma
; V-NEXT:    vlse8.v v8, (a1), zero
; V-NEXT:    vle8.v v9, (a0)
; V-NEXT:    vdivu.vv v8, v8, v9
; V-NEXT:    vse8.v v8, (a0)
; V-NEXT:    addi a2, a2, -32
; V-NEXT:    addi a0, a0, 32
; V-NEXT:    addi a1, a1, 160
; V-NEXT:    bnez a2, .LBB4_1
; V-NEXT:  # %bb.2: # %for.cond.cleanup
; V-NEXT:    ret
;
; ZVE32F-LABEL: gather_zero_stride_unfold:
; ZVE32F:       # %bb.0: # %entry
; ZVE32F-NEXT:    li a2, 1024
; ZVE32F-NEXT:    li a3, 32
; ZVE32F-NEXT:  .LBB4_1: # %vector.body
; ZVE32F-NEXT:    # =>This Inner Loop Header: Depth=1
; ZVE32F-NEXT:    vsetvli zero, a3, e8, m1, ta, ma
; ZVE32F-NEXT:    vlse8.v v8, (a1), zero
; ZVE32F-NEXT:    vle8.v v9, (a0)
; ZVE32F-NEXT:    vdivu.vv v8, v8, v9
; ZVE32F-NEXT:    vse8.v v8, (a0)
; ZVE32F-NEXT:    addi a2, a2, -32
; ZVE32F-NEXT:    addi a0, a0, 32
; ZVE32F-NEXT:    addi a1, a1, 160
; ZVE32F-NEXT:    bnez a2, .LBB4_1
; ZVE32F-NEXT:  # %bb.2: # %for.cond.cleanup
; ZVE32F-NEXT:    ret
;
; NOT-OPTIMIZED-LABEL: gather_zero_stride_unfold:
; NOT-OPTIMIZED:       # %bb.0: # %entry
; NOT-OPTIMIZED-NEXT:    li a2, 1024
; NOT-OPTIMIZED-NEXT:    li a3, 32
; NOT-OPTIMIZED-NEXT:  .LBB4_1: # %vector.body
; NOT-OPTIMIZED-NEXT:    # =>This Inner Loop Header: Depth=1
; NOT-OPTIMIZED-NEXT:    lbu a4, 0(a1)
; NOT-OPTIMIZED-NEXT:    vsetvli zero, a3, e8, m1, ta, ma
; NOT-OPTIMIZED-NEXT:    vle8.v v8, (a0)
; NOT-OPTIMIZED-NEXT:    vmv.v.x v9, a4
; NOT-OPTIMIZED-NEXT:    vdivu.vv v8, v9, v8
; NOT-OPTIMIZED-NEXT:    vse8.v v8, (a0)
; NOT-OPTIMIZED-NEXT:    addi a2, a2, -32
; NOT-OPTIMIZED-NEXT:    addi a0, a0, 32
; NOT-OPTIMIZED-NEXT:    addi a1, a1, 160
; NOT-OPTIMIZED-NEXT:    bnez a2, .LBB4_1
; NOT-OPTIMIZED-NEXT:  # %bb.2: # %for.cond.cleanup
; NOT-OPTIMIZED-NEXT:    ret
entry:
  br label %vector.body

vector.body:                                      ; preds = %vector.body, %entry
  %index = phi i64 [ 0, %entry ], [ %index.next, %vector.body ]
  %vec.ind = phi <32 x i64> [ zeroinitializer, %entry ], [ %vec.ind.next, %vector.body ]
  %i = mul nuw nsw <32 x i64> %vec.ind, splat (i64 5)
  %i1 = getelementptr inbounds i8, ptr %B, <32 x i64> %i
  %wide.masked.gather = call <32 x i8> @llvm.masked.gather.v32i8.v32p0(<32 x ptr> %i1, i32 1, <32 x i1> splat (i1 true), <32 x i8> undef)
  %i2 = getelementptr inbounds i8, ptr %A, i64 %index
  %wide.load = load <32 x i8>, ptr %i2, align 1
  %i4 = udiv <32 x i8> %wide.masked.gather, %wide.load
  store <32 x i8> %i4, ptr %i2, align 1
  %index.next = add nuw i64 %index, 32
  %vec.ind.next = add <32 x i64> %vec.ind, splat (i64 32)
  %i6 = icmp eq i64 %index.next, 1024
  br i1 %i6, label %for.cond.cleanup, label %vector.body

for.cond.cleanup:                                 ; preds = %vector.body
  ret void
}

;void scatter(signed char * __restrict A, signed char * __restrict B) {
;  for (int i = 0; i < 1024; ++i)
;    A[i * 5] += B[i];
;}
define void @scatter(ptr noalias nocapture %A, ptr noalias nocapture readonly %B) {
; CHECK-LABEL: scatter:
; CHECK:       # %bb.0: # %entry
; CHECK-NEXT:    li a2, 1024
; CHECK-NEXT:    li a3, 32
; CHECK-NEXT:    li a4, 5
; CHECK-NEXT:  .LBB5_1: # %vector.body
; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vsetvli zero, a3, e8, m1, ta, ma
; CHECK-NEXT:    vle8.v v8, (a1)
; CHECK-NEXT:    vlse8.v v9, (a0), a4
; CHECK-NEXT:    vadd.vv v8, v9, v8
; CHECK-NEXT:    vsse8.v v8, (a0), a4
; CHECK-NEXT:    addi a2, a2, -32
; CHECK-NEXT:    addi a1, a1, 32
; CHECK-NEXT:    addi a0, a0, 160
; CHECK-NEXT:    bnez a2, .LBB5_1
; CHECK-NEXT:  # %bb.2: # %for.cond.cleanup
; CHECK-NEXT:    ret
entry:
  br label %vector.body

vector.body:                                      ; preds = %vector.body, %entry
  %index = phi i64 [ 0, %entry ], [ %index.next, %vector.body ]
  %vec.ind = phi <32 x i64> [ <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7, i64 8, i64 9, i64 10, i64 11, i64 12, i64 13, i64 14, i64 15, i64 16, i64 17, i64 18, i64 19, i64 20, i64 21, i64 22, i64 23, i64 24, i64 25, i64 26, i64 27, i64 28, i64 29, i64 30, i64 31>, %entry ], [ %vec.ind.next, %vector.body ]
  %i = getelementptr inbounds i8, ptr %B, i64 %index
  %wide.load = load <32 x i8>, ptr %i, align 1
  %i2 = mul nuw nsw <32 x i64> %vec.ind, splat (i64 5)
  %i3 = getelementptr inbounds i8, ptr %A, <32 x i64> %i2
  %wide.masked.gather = call <32 x i8> @llvm.masked.gather.v32i8.v32p0(<32 x ptr> %i3, i32 1, <32 x i1> splat (i1 true), <32 x i8> undef)
  %i4 = add <32 x i8> %wide.masked.gather, %wide.load
  call void @llvm.masked.scatter.v32i8.v32p0(<32 x i8> %i4, <32 x ptr> %i3, i32 1, <32 x i1> splat (i1 true))
  %index.next = add nuw i64 %index, 32
  %vec.ind.next = add <32 x i64> %vec.ind, splat (i64 32)
  %i5 = icmp eq i64 %index.next, 1024
  br i1 %i5, label %for.cond.cleanup, label %vector.body

for.cond.cleanup:                                 ; preds = %vector.body
  ret void
}
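
; Masked variant of @scatter, reusing the 0xf02d5369 mask from @gather_masked;
; both the strided load and the strided store keep the v0.t mask.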
define void @scatter_masked(ptr noalias nocapture %A, ptr noalias nocapture readonly %B, <32 x i8> %maskedoff) {
; V-LABEL: scatter_masked:
; V:       # %bb.0: # %entry
; V-NEXT:    li a2, 1024
; V-NEXT:    li a3, 32
; V-NEXT:    lui a4, 983765
; V-NEXT:    addiw a4, a4, 873
; V-NEXT:    vsetivli zero, 1, e32, mf2, ta, ma
; V-NEXT:    vmv.s.x v0, a4
; V-NEXT:    li a4, 5
; V-NEXT:  .LBB6_1: # %vector.body
; V-NEXT:    # =>This Inner Loop Header: Depth=1
; V-NEXT:    vsetvli zero, a3, e8, m1, ta, mu
; V-NEXT:    vle8.v v9, (a1)
; V-NEXT:    vmv1r.v v10, v8
; V-NEXT:    vlse8.v v10, (a0), a4, v0.t
; V-NEXT:    vadd.vv v9, v10, v9
; V-NEXT:    vsse8.v v9, (a0), a4, v0.t
; V-NEXT:    addi a2, a2, -32
; V-NEXT:    addi a1, a1, 32
; V-NEXT:    addi a0, a0, 160
; V-NEXT:    bnez a2, .LBB6_1
; V-NEXT:  # %bb.2: # %for.cond.cleanup
; V-NEXT:    ret
;
; ZVE32F-LABEL: scatter_masked:
; ZVE32F:       # %bb.0: # %entry
; ZVE32F-NEXT:    li a2, 1024
; ZVE32F-NEXT:    li a3, 32
; ZVE32F-NEXT:    lui a4, 983765
; ZVE32F-NEXT:    addiw a4, a4, 873
; ZVE32F-NEXT:    vsetivli zero, 1, e32, m1, ta, ma
; ZVE32F-NEXT:    vmv.s.x v0, a4
; ZVE32F-NEXT:    li a4, 5
; ZVE32F-NEXT:  .LBB6_1: # %vector.body
; ZVE32F-NEXT:    # =>This Inner Loop Header: Depth=1
; ZVE32F-NEXT:    vsetvli zero, a3, e8, m1, ta, mu
; ZVE32F-NEXT:    vle8.v v9, (a1)
; ZVE32F-NEXT:    vmv1r.v v10, v8
; ZVE32F-NEXT:    vlse8.v v10, (a0), a4, v0.t
; ZVE32F-NEXT:    vadd.vv v9, v10, v9
; ZVE32F-NEXT:    vsse8.v v9, (a0), a4, v0.t
; ZVE32F-NEXT:    addi a2, a2, -32
; ZVE32F-NEXT:    addi a1, a1, 32
; ZVE32F-NEXT:    addi a0, a0, 160
; ZVE32F-NEXT:    bnez a2, .LBB6_1
; ZVE32F-NEXT:  # %bb.2: # %for.cond.cleanup
; ZVE32F-NEXT:    ret
entry:
  br label %vector.body

vector.body:                                      ; preds = %vector.body, %entry
  %index = phi i64 [ 0, %entry ], [ %index.next, %vector.body ]
  %vec.ind = phi <32 x i64> [ <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7, i64 8, i64 9, i64 10, i64 11, i64 12, i64 13, i64 14, i64 15, i64 16, i64 17, i64 18, i64 19, i64 20, i64 21, i64 22, i64 23, i64 24, i64 25, i64 26, i64 27, i64 28, i64 29, i64 30, i64 31>, %entry ], [ %vec.ind.next, %vector.body ]
  %i = getelementptr inbounds i8, ptr %B, i64 %index
  %wide.load = load <32 x i8>, ptr %i, align 1
  %i2 = mul nuw nsw <32 x i64> %vec.ind, splat (i64 5)
  %i3 = getelementptr inbounds i8, ptr %A, <32 x i64> %i2
  %wide.masked.gather = call <32 x i8> @llvm.masked.gather.v32i8.v32p0(<32 x ptr> %i3, i32 1, <32 x i1> <i1 true, i1 false, i1 false, i1 true, i1 false, i1 true, i1 true, i1 false, i1 true, i1 true, i1 false, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 true, i1 false, i1 true, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 true, i1 true, i1 true, i1 true>, <32 x i8> %maskedoff)
  %i4 = add <32 x i8> %wide.masked.gather, %wide.load
  call void @llvm.masked.scatter.v32i8.v32p0(<32 x i8> %i4, <32 x ptr> %i3, i32 1, <32 x i1> <i1 true, i1 false, i1 false, i1 true, i1 false, i1 true, i1 true, i1 false, i1 true, i1 true, i1 false, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 true, i1 false, i1 true, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 true, i1 true, i1 true, i1 true>)
  %index.next = add nuw i64 %index, 32
  %vec.ind.next = add <32 x i64> %vec.ind, splat (i64 32)
  %i5 = icmp eq i64 %index.next, 1024
  br i1 %i5, label %for.cond.cleanup, label %vector.body

for.cond.cleanup:                                 ; preds = %vector.body
  ret void
}

; void gather_pow2(signed char * __restrict A, signed char * __restrict B) {
;   for (int i = 0; i != 1024; ++i)
;     A[i] += B[i * 4];
; }
define void @gather_pow2(ptr noalias nocapture %A, ptr noalias nocapture readonly %B) {
; CHECK-LABEL: gather_pow2:
; CHECK:       # %bb.0: # %entry
; CHECK-NEXT:    li a2, 1024
; CHECK-NEXT:    li a3, 16
; CHECK-NEXT:    li a4, 32
; CHECK-NEXT:  .LBB7_1: # %vector.body
; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vsetivli zero, 8, e32, m1, ta, ma
; CHECK-NEXT:    vlse32.v v8, (a1), a3
; CHECK-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
; CHECK-NEXT:    vle8.v v9, (a0)
; CHECK-NEXT:    vsetivli zero, 8, e32, m1, ta, ma
; CHECK-NEXT:    vadd.vv v8, v9, v8
; CHECK-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
; CHECK-NEXT:    vse8.v v8, (a0)
; CHECK-NEXT:    addi a2, a2, -8
; CHECK-NEXT:    addi a0, a0, 32
; CHECK-NEXT:    addi a1, a1, 128
; CHECK-NEXT:    bnez a2, .LBB7_1
; CHECK-NEXT:  # %bb.2: # %for.cond.cleanup
; CHECK-NEXT:    ret
entry:
  br label %vector.body

vector.body:                                      ; preds = %vector.body, %entry
  %index = phi i64 [ 0, %entry ], [ %index.next, %vector.body ]
  %vec.ind = phi <8 x i64> [ <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7>, %entry ], [ %vec.ind.next, %vector.body ]
  %i = shl nsw <8 x i64> %vec.ind, splat (i64 2)
  %i1 = getelementptr inbounds i32, ptr %B, <8 x i64> %i
  %wide.masked.gather = call <8 x i32> @llvm.masked.gather.v8i32.v8p0(<8 x ptr> %i1, i32 4, <8 x i1> splat (i1 true), <8 x i32> undef)
  %i2 = getelementptr inbounds i32, ptr %A, i64 %index
  %wide.load = load <8 x i32>, ptr %i2, align 1
  %i4 = add <8 x i32> %wide.load, %wide.masked.gather
  store <8 x i32> %i4, ptr %i2, align 1
  %index.next = add nuw i64 %index, 8
  %vec.ind.next = add <8 x i64> %vec.ind, splat (i64 8)
  %i6 = icmp eq i64 %index.next, 1024
  br i1 %i6, label %for.cond.cleanup, label %vector.body

for.cond.cleanup:                                 ; preds = %vector.body
  ret void
}
;void scatter_pow2(signed char * __restrict A, signed char * __restrict B) {
;  for (int i = 0; i < 1024; ++i)
;    A[i * 4] += B[i];
;}
define void @scatter_pow2(ptr noalias nocapture %A, ptr noalias nocapture readonly %B) {
; CHECK-LABEL: scatter_pow2:
; CHECK:       # %bb.0: # %entry
; CHECK-NEXT:    li a2, 1024
; CHECK-NEXT:    li a3, 32
; CHECK-NEXT:    li a4, 16
; CHECK-NEXT:  .LBB8_1: # %vector.body
; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vsetvli zero, a3, e8, m1, ta, ma
; CHECK-NEXT:    vle8.v v8, (a1)
; CHECK-NEXT:    vsetivli zero, 8, e32, m1, ta, ma
; CHECK-NEXT:    vlse32.v v9, (a0), a4
; CHECK-NEXT:    vadd.vv v8, v9, v8
; CHECK-NEXT:    vsse32.v v8, (a0), a4
; CHECK-NEXT:    addi a2, a2, -8
; CHECK-NEXT:    addi a1, a1, 32
; CHECK-NEXT:    addi a0, a0, 128
; CHECK-NEXT:    bnez a2, .LBB8_1
; CHECK-NEXT:  # %bb.2: # %for.cond.cleanup
; CHECK-NEXT:    ret
entry:
  br label %vector.body

vector.body:                                      ; preds = %vector.body, %entry
  %index = phi i64 [ 0, %entry ], [ %index.next, %vector.body ]
  %vec.ind = phi <8 x i64> [ <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7>, %entry ], [ %vec.ind.next, %vector.body ]
  %i = getelementptr inbounds i32, ptr %B, i64 %index
  %wide.load = load <8 x i32>, ptr %i, align 1
  %i2 = shl nuw nsw <8 x i64> %vec.ind, splat (i64 2)
  %i3 = getelementptr inbounds i32, ptr %A, <8 x i64> %i2
  %wide.masked.gather = call <8 x i32> @llvm.masked.gather.v8i32.v8p0(<8 x ptr> %i3, i32 4, <8 x i1> splat (i1 true), <8 x i32> undef)
  %i4 = add <8 x i32> %wide.masked.gather, %wide.load
  call void @llvm.masked.scatter.v8i32.v8p0(<8 x i32> %i4, <8 x ptr> %i3, i32 4, <8 x i1> splat (i1 true))
  %index.next = add nuw i64 %index, 8
  %vec.ind.next = add <8 x i64> %vec.ind, splat (i64 8)
  %i5 = icmp eq i64 %index.next, 1024
  br i1 %i5, label %for.cond.cleanup, label %vector.body

for.cond.cleanup:                                 ; preds = %vector.body
  ret void
}

;struct foo {
;  int a, b, c, d;
;};
;
;void struct_gather(int * __restrict A, struct foo * __restrict B) {
;  for (int i = 0; i < 1024; ++i)
;    A[i] += B[i].b;
;}
define void @struct_gather(ptr noalias nocapture %A, ptr noalias nocapture readonly %B) {
; CHECK-LABEL: struct_gather:
; CHECK:       # %bb.0: # %entry
; CHECK-NEXT:    addi a1, a1, 132
; CHECK-NEXT:    li a2, 1024
; CHECK-NEXT:    li a3, 16
; CHECK-NEXT:    vsetivli zero, 8, e32, m1, ta, ma
; CHECK-NEXT:  .LBB9_1: # %vector.body
; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    addi a4, a0, 32
; CHECK-NEXT:    addi a5, a1, -128
; CHECK-NEXT:    vlse32.v v8, (a5), a3
; CHECK-NEXT:    vlse32.v v9, (a1), a3
; CHECK-NEXT:    vle32.v v10, (a0)
; CHECK-NEXT:    vle32.v v11, (a4)
; CHECK-NEXT:    vadd.vv v8, v10, v8
; CHECK-NEXT:    vadd.vv v9, v11, v9
; CHECK-NEXT:    vse32.v v8, (a0)
; CHECK-NEXT:    vse32.v v9, (a4)
; CHECK-NEXT:    addi a2, a2, -16
; CHECK-NEXT:    addi a0, a0, 64
; CHECK-NEXT:    addi a1, a1, 256
; CHECK-NEXT:    bnez a2, .LBB9_1
; CHECK-NEXT:  # %bb.2: # %for.cond.cleanup
; CHECK-NEXT:    ret
entry:
  br label %vector.body

vector.body:                                      ; preds = %vector.body, %entry
  %index = phi i64 [ 0, %entry ], [ %index.next, %vector.body ]
  %vec.ind = phi <8 x i64> [ <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7>, %entry ], [ %vec.ind.next, %vector.body ]
  %step.add = add <8 x i64> %vec.ind, splat (i64 8)
  %i = getelementptr inbounds %struct.foo, ptr %B, <8 x i64> %vec.ind, i32 1
  %i1 = getelementptr inbounds %struct.foo, ptr %B, <8 x i64> %step.add, i32 1
  %wide.masked.gather = call <8 x i32> @llvm.masked.gather.v8i32.v8p0(<8 x ptr> %i, i32 4, <8 x i1> splat (i1 true), <8 x i32> undef)
  %wide.masked.gather9 = call <8 x i32> @llvm.masked.gather.v8i32.v8p0(<8 x ptr> %i1, i32 4, <8 x i1> splat (i1 true), <8 x i32> undef)
  %i2 = getelementptr inbounds i32, ptr %A, i64 %index
  %wide.load = load <8 x i32>, ptr %i2, align 4
  %i4 = getelementptr inbounds i32, ptr %i2, i64 8
  %wide.load10 = load <8 x i32>, ptr %i4, align 4
  %i6 = add nsw <8 x i32> %wide.load, %wide.masked.gather
  %i7 = add nsw <8 x i32> %wide.load10, %wide.masked.gather9
  store <8 x i32> %i6, ptr %i2, align 4
  store <8 x i32> %i7, ptr %i4, align 4
  %index.next = add nuw i64 %index, 16
  %vec.ind.next = add <8 x i64> %vec.ind, splat (i64 16)
  %i10 = icmp eq i64 %index.next, 1024
  br i1 %i10, label %for.cond.cleanup, label %vector.body

for.cond.cleanup:                                 ; preds = %vector.body
  ret void
}

;void gather_unroll(int * __restrict A, int * __restrict B) {
;  for (int i = 0; i < 1024; i+= 4 ) {
;    A[i] += B[i * 4];
;    A[i+1] += B[(i+1) * 4];
;    A[i+2] += B[(i+2) * 4];
;    A[i+3] += B[(i+3) * 4];
;  }
;}
define void @gather_unroll(ptr noalias nocapture %A, ptr noalias nocapture readonly %B) {
; CHECK-LABEL: gather_unroll:
; CHECK:       # %bb.0: # %entry
; CHECK-NEXT:    li a2, 256
; CHECK-NEXT:    li a3, 64
; CHECK-NEXT:    li a4, 16
; CHECK-NEXT:    vsetivli zero, 8, e32, m1, ta, ma
; CHECK-NEXT:  .LBB10_1: # %vector.body
; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vlse32.v v8, (a1), a3
; CHECK-NEXT:    vlse32.v v9, (a0), a4
; CHECK-NEXT:    vadd.vv v8, v9, v8
; CHECK-NEXT:    vsse32.v v8, (a0), a4
; CHECK-NEXT:    addi a5, a1, 16
; CHECK-NEXT:    vlse32.v v8, (a5), a3
; CHECK-NEXT:    addi a5, a0, 4
; CHECK-NEXT:    vlse32.v v9, (a5), a4
; CHECK-NEXT:    vadd.vv v8, v9, v8
; CHECK-NEXT:    vsse32.v v8, (a5), a4
; CHECK-NEXT:    addi a5, a1, 32
; CHECK-NEXT:    vlse32.v v8, (a5), a3
; CHECK-NEXT:    addi a5, a0, 8
; CHECK-NEXT:    vlse32.v v9, (a5), a4
; CHECK-NEXT:    vadd.vv v8, v9, v8
; CHECK-NEXT:    vsse32.v v8, (a5), a4
; CHECK-NEXT:    addi a5, a1, 48
; CHECK-NEXT:    vlse32.v v8, (a5), a3
; CHECK-NEXT:    addi a5, a0, 12
; CHECK-NEXT:    vlse32.v v9, (a5), a4
; CHECK-NEXT:    vadd.vv v8, v9, v8
; CHECK-NEXT:    vsse32.v v8, (a5), a4
; CHECK-NEXT:    addi a2, a2, -8
; CHECK-NEXT:    addi a1, a1, 512
; CHECK-NEXT:    addi a0, a0, 128
; CHECK-NEXT:    bnez a2, .LBB10_1
; CHECK-NEXT:  # %bb.2: # %for.cond.cleanup
; CHECK-NEXT:    ret
entry:
  br label %vector.body

vector.body:                                      ; preds = %vector.body, %entry
  %index = phi i64 [ 0, %entry ], [ %index.next, %vector.body ]
  %vec.ind = phi <8 x i64> [ <i64 0, i64 4, i64 8, i64 12, i64 16, i64 20, i64 24, i64 28>, %entry ], [ %vec.ind.next, %vector.body ]
  %i = shl nuw nsw <8 x i64> %vec.ind, splat (i64 2)
  %i1 = getelementptr inbounds i32, ptr %B, <8 x i64> %i
  %wide.masked.gather = call <8 x i32> @llvm.masked.gather.v8i32.v8p0(<8 x ptr> %i1, i32 4, <8 x i1> splat (i1 true), <8 x i32> undef)
  %i2 = getelementptr inbounds i32, ptr %A, <8 x i64> %vec.ind
  %wide.masked.gather52 = call <8 x i32> @llvm.masked.gather.v8i32.v8p0(<8 x ptr> %i2, i32 4, <8 x i1> splat (i1 true), <8 x i32> undef)
  %i3 = add nsw <8 x i32> %wide.masked.gather52, %wide.masked.gather
  call void @llvm.masked.scatter.v8i32.v8p0(<8 x i32> %i3, <8 x ptr> %i2, i32 4, <8 x i1> splat (i1 true))
  %i4 = or <8 x i64> %vec.ind, splat (i64 1)
  %i5 = shl nsw <8 x i64> %i4, splat (i64 2)
  %i6 = getelementptr inbounds i32, ptr %B, <8 x i64> %i5
  %wide.masked.gather53 = call <8 x i32> @llvm.masked.gather.v8i32.v8p0(<8 x ptr> %i6, i32 4, <8 x i1> splat (i1 true), <8 x i32> undef)
  %i7 = getelementptr inbounds i32, ptr %A, <8 x i64> %i4
  %wide.masked.gather54 = call <8 x i32> @llvm.masked.gather.v8i32.v8p0(<8 x ptr> %i7, i32 4, <8 x i1> splat (i1 true), <8 x i32> undef)
  %i8 = add nsw <8 x i32> %wide.masked.gather54, %wide.masked.gather53
  call void @llvm.masked.scatter.v8i32.v8p0(<8 x i32> %i8, <8 x ptr> %i7, i32 4, <8 x i1> splat (i1 true))
  %i9 = or <8 x i64> %vec.ind, splat (i64 2)
  %i10 = shl nsw <8 x i64> %i9, splat (i64 2)
  %i11 = getelementptr inbounds i32, ptr %B, <8 x i64> %i10
  %wide.masked.gather55 = call <8 x i32> @llvm.masked.gather.v8i32.v8p0(<8 x ptr> %i11, i32 4, <8 x i1> splat (i1 true), <8 x i32> undef)
  %i12 = getelementptr inbounds i32, ptr %A, <8 x i64> %i9
  %wide.masked.gather56 = call <8 x i32> @llvm.masked.gather.v8i32.v8p0(<8 x ptr> %i12, i32 4, <8 x i1> splat (i1 true), <8 x i32> undef)
  %i13 = add nsw <8 x i32> %wide.masked.gather56, %wide.masked.gather55
  call void @llvm.masked.scatter.v8i32.v8p0(<8 x i32> %i13, <8 x ptr> %i12, i32 4, <8 x i1> splat (i1 true))
  %i14 = or <8 x i64> %vec.ind, splat (i64 3)
  %i15 = shl nsw <8 x i64> %i14, splat (i64 2)
  %i16 = getelementptr inbounds i32, ptr %B, <8 x i64> %i15
  %wide.masked.gather57 = call <8 x i32> @llvm.masked.gather.v8i32.v8p0(<8 x ptr> %i16, i32 4, <8 x i1> splat (i1 true), <8 x i32> undef)
  %i17 = getelementptr inbounds i32, ptr %A, <8 x i64> %i14
  %wide.masked.gather58 = call <8 x i32> @llvm.masked.gather.v8i32.v8p0(<8 x ptr> %i17, i32 4, <8 x i1> splat (i1 true), <8 x i32> undef)
  %i18 = add nsw <8 x i32> %wide.masked.gather58, %wide.masked.gather57
  call void @llvm.masked.scatter.v8i32.v8p0(<8 x i32> %i18, <8 x ptr> %i17, i32 4, <8 x i1> splat (i1 true))
  %index.next = add nuw i64 %index, 8
  %vec.ind.next = add <8 x i64> %vec.ind, splat (i64 32)
  %i19 = icmp eq i64 %index.next, 256
  br i1 %i19, label %for.cond.cleanup, label %vector.body

for.cond.cleanup:                                 ; preds = %vector.body
  ret void
}

declare <32 x i8> @llvm.masked.gather.v32i8.v32p0(<32 x ptr>, i32 immarg, <32 x i1>, <32 x i8>)
declare <8 x i32> @llvm.masked.gather.v8i32.v8p0(<8 x ptr>, i32 immarg, <8 x i1>, <8 x i32>)
declare void @llvm.masked.scatter.v32i8.v32p0(<32 x i8>, <32 x ptr>, i32 immarg, <32 x i1>)
declare void @llvm.masked.scatter.v8i32.v8p0(<8 x i32>, <8 x ptr>, i32 immarg, <8 x i1>)

; Make sure we don't crash in getTgtMemIntrinsic for a vector of pointers.
define void @gather_of_pointers(ptr noalias nocapture %arg, ptr noalias nocapture readonly %arg1) {
; V-LABEL: gather_of_pointers:
; V:       # %bb.0: # %bb
; V-NEXT:    li a2, 1024
; V-NEXT:    li a3, 40
; V-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
; V-NEXT:  .LBB11_1: # %bb2
; V-NEXT:    # =>This Inner Loop Header: Depth=1
; V-NEXT:    addi a4, a1, 80
; V-NEXT:    vlse64.v v8, (a1), a3
; V-NEXT:    vlse64.v v9, (a4), a3
; V-NEXT:    addi a4, a0, 16
; V-NEXT:    vse64.v v8, (a0)
; V-NEXT:    vse64.v v9, (a4)
; V-NEXT:    addi a2, a2, -4
; V-NEXT:    addi a0, a0, 32
; V-NEXT:    addi a1, a1, 160
; V-NEXT:    bnez a2, .LBB11_1
; V-NEXT:  # %bb.2: # %bb18
; V-NEXT:    ret
;
; ZVE32F-LABEL: gather_of_pointers:
; ZVE32F:       # %bb.0: # %bb
; ZVE32F-NEXT:    li a2, 0
; ZVE32F-NEXT:    li a3, 1
; ZVE32F-NEXT:    li a4, 1024
; ZVE32F-NEXT:    li a5, 40
; ZVE32F-NEXT:  .LBB11_1: # %bb2
; ZVE32F-NEXT:    # =>This Inner Loop Header: Depth=1
; ZVE32F-NEXT:    mul a6, a3, a5
; ZVE32F-NEXT:    add a6, a1, a6
; ZVE32F-NEXT:    mul a7, a2, a5
; ZVE32F-NEXT:    add a7, a1, a7
; ZVE32F-NEXT:    ld t0, 0(a6)
; ZVE32F-NEXT:    ld t1, 0(a7)
; ZVE32F-NEXT:    ld a6, 80(a6)
; ZVE32F-NEXT:    ld a7, 80(a7)
; ZVE32F-NEXT:    sd t0, 8(a0)
; ZVE32F-NEXT:    sd t1, 0(a0)
; ZVE32F-NEXT:    sd a6, 24(a0)
; ZVE32F-NEXT:    sd a7, 16(a0)
; ZVE32F-NEXT:    addi a2, a2, 4
; ZVE32F-NEXT:    addi a3, a3, 4
; ZVE32F-NEXT:    addi a4, a4, -4
; ZVE32F-NEXT:    addi a0, a0, 32
; ZVE32F-NEXT:    bnez a4, .LBB11_1
; ZVE32F-NEXT:  # %bb.2: # %bb18
; ZVE32F-NEXT:    ret
bb:
  br label %bb2

bb2:                                              ; preds = %bb2, %bb
  %i = phi i64 [ 0, %bb ], [ %i15, %bb2 ]
  %i3 = phi <2 x i64> [ <i64 0, i64 1>, %bb ], [ %i16, %bb2 ]
  %i4 = mul nuw nsw <2 x i64> %i3, splat (i64 5)
  %i5 = mul <2 x i64> %i3, splat (i64 5)
  %i6 = add <2 x i64> %i5, splat (i64 10)
  %i7 = getelementptr inbounds ptr, ptr %arg1, <2 x i64> %i4
  %i8 = getelementptr inbounds ptr, ptr %arg1, <2 x i64> %i6
  %i9 = call <2 x ptr> @llvm.masked.gather.v2p0.v2p0(<2 x ptr> %i7, i32 8, <2 x i1> splat (i1 true), <2 x ptr> undef)
  %i10 = call <2 x ptr> @llvm.masked.gather.v2p0.v2p0(<2 x ptr> %i8, i32 8, <2 x i1> splat (i1 true), <2 x ptr> undef)
  %i11 = getelementptr inbounds ptr, ptr %arg, i64 %i
  store <2 x ptr> %i9, ptr %i11, align 8
  %i13 = getelementptr inbounds ptr, ptr %i11, i64 2
  store <2 x ptr> %i10, ptr %i13, align 8
  %i15 = add nuw i64 %i, 4
  %i16 = add <2 x i64> %i3, splat (i64 4)
  %i17 = icmp eq i64 %i15, 1024
  br i1 %i17, label %bb18, label %bb2

bb18:                                             ; preds = %bb2
  ret void
}

declare <2 x ptr> @llvm.masked.gather.v2p0.v2p0(<2 x ptr>, i32 immarg, <2 x i1>, <2 x ptr>)

; Make sure we don't crash in getTgtMemIntrinsic for a vector of pointers.
define void @scatter_of_pointers(ptr noalias nocapture %arg, ptr noalias nocapture readonly %arg1) {
; V-LABEL: scatter_of_pointers:
; V:       # %bb.0: # %bb
; V-NEXT:    li a2, 1024
; V-NEXT:    li a3, 40
; V-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
; V-NEXT:  .LBB12_1: # %bb2
; V-NEXT:    # =>This Inner Loop Header: Depth=1
; V-NEXT:    addi a4, a1, 16
; V-NEXT:    vle64.v v8, (a1)
; V-NEXT:    vle64.v v9, (a4)
; V-NEXT:    addi a4, a0, 80
; V-NEXT:    vsse64.v v8, (a0), a3
; V-NEXT:    vsse64.v v9, (a4), a3
; V-NEXT:    addi a2, a2, -4
; V-NEXT:    addi a1, a1, 32
; V-NEXT:    addi a0, a0, 160
; V-NEXT:    bnez a2, .LBB12_1
; V-NEXT:  # %bb.2: # %bb18
; V-NEXT:    ret
;
; ZVE32F-LABEL: scatter_of_pointers:
; ZVE32F:       # %bb.0: # %bb
; ZVE32F-NEXT:    li a2, 0
; ZVE32F-NEXT:    li a3, 1
; ZVE32F-NEXT:    li a4, 1024
; ZVE32F-NEXT:    li a5, 40
; ZVE32F-NEXT:  .LBB12_1: # %bb2
; ZVE32F-NEXT:    # =>This Inner Loop Header: Depth=1
; ZVE32F-NEXT:    ld a6, 8(a1)
; ZVE32F-NEXT:    ld a7, 0(a1)
; ZVE32F-NEXT:    ld t0, 24(a1)
; ZVE32F-NEXT:    ld t1, 16(a1)
; ZVE32F-NEXT:    mul t2, a3, a5
; ZVE32F-NEXT:    add t2, a0, t2
; ZVE32F-NEXT:    mul t3, a2, a5
; ZVE32F-NEXT:    add t3, a0, t3
; ZVE32F-NEXT:    sd a7, 0(t3)
; ZVE32F-NEXT:    sd a6, 0(t2)
; ZVE32F-NEXT:    sd t1, 80(t3)
; ZVE32F-NEXT:    sd t0, 80(t2)
; ZVE32F-NEXT:    addi a2, a2, 4
; ZVE32F-NEXT:    addi a3, a3, 4
; ZVE32F-NEXT:    addi a4, a4, -4
; ZVE32F-NEXT:    addi a1, a1, 32
; ZVE32F-NEXT:    bnez a4, .LBB12_1
; ZVE32F-NEXT:  # %bb.2: # %bb18
; ZVE32F-NEXT:    ret
bb:
  br label %bb2

bb2:                                              ; preds = %bb2, %bb
  %i = phi i64 [ 0, %bb ], [ %i15, %bb2 ]
  %i3 = phi <2 x i64> [ <i64 0, i64 1>, %bb ], [ %i16, %bb2 ]
  %i4 = getelementptr inbounds ptr, ptr %arg1, i64 %i
  %i6 = load <2 x ptr>, ptr %i4, align 8
  %i7 = getelementptr inbounds ptr, ptr %i4, i64 2
  %i9 = load <2 x ptr>, ptr %i7, align 8
  %i10 = mul nuw nsw <2 x i64> %i3, splat (i64 5)
  %i11 = mul <2 x i64> %i3, splat (i64 5)
  %i12 = add <2 x i64> %i11, splat (i64 10)
  %i13 = getelementptr inbounds ptr, ptr %arg, <2 x i64> %i10
  %i14 = getelementptr inbounds ptr, ptr %arg, <2 x i64> %i12
  call void @llvm.masked.scatter.v2p0.v2p0(<2 x ptr> %i6, <2 x ptr> %i13, i32 8, <2 x i1> splat (i1 true))
  call void @llvm.masked.scatter.v2p0.v2p0(<2 x ptr> %i9, <2 x ptr> %i14, i32 8, <2 x i1> splat (i1 true))
  %i15 = add nuw i64 %i, 4
  %i16 = add <2 x i64> %i3, splat (i64 4)
  %i17 = icmp eq i64 %i15, 1024
  br i1 %i17, label %bb18, label %bb2

bb18:                                             ; preds = %bb2
  ret void
}

declare void @llvm.masked.scatter.v2p0.v2p0(<2 x ptr>, <2 x ptr>, i32 immarg, <2 x i1>)
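
; The strided loop starts at a runtime offset %arg2 instead of 0; the
; vectorized body still uses a stride-5 vlse8.v, and a scalar loop handles
; the remainder iterations.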
define void @strided_load_startval_add_with_splat(ptr noalias nocapture %arg, ptr noalias nocapture readonly %arg1, i32 signext %arg2) {
; CHECK-LABEL: strided_load_startval_add_with_splat:
; CHECK:       # %bb.0: # %bb
; CHECK-NEXT:    li a3, 1024
; CHECK-NEXT:    beq a2, a3, .LBB13_7
; CHECK-NEXT:  # %bb.1: # %bb3
; CHECK-NEXT:    li a4, 1023
; CHECK-NEXT:    subw a4, a4, a2
; CHECK-NEXT:    li a5, 31
; CHECK-NEXT:    mv a3, a2
; CHECK-NEXT:    bltu a4, a5, .LBB13_5
; CHECK-NEXT:  # %bb.2: # %bb9
; CHECK-NEXT:    slli a4, a4, 32
; CHECK-NEXT:    srli a4, a4, 32
; CHECK-NEXT:    addi a4, a4, 1
; CHECK-NEXT:    andi a5, a4, -32
; CHECK-NEXT:    add a3, a5, a2
; CHECK-NEXT:    slli a7, a2, 2
; CHECK-NEXT:    add a6, a0, a2
; CHECK-NEXT:    add a2, a1, a2
; CHECK-NEXT:    add a2, a2, a7
; CHECK-NEXT:    li a7, 32
; CHECK-NEXT:    li t0, 5
; CHECK-NEXT:    mv t1, a5
; CHECK-NEXT:  .LBB13_3: # %bb15
; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vsetvli zero, a7, e8, m1, ta, ma
; CHECK-NEXT:    vlse8.v v8, (a2), t0
; CHECK-NEXT:    vle8.v v9, (a6)
; CHECK-NEXT:    vadd.vv v8, v9, v8
; CHECK-NEXT:    vse8.v v8, (a6)
; CHECK-NEXT:    addi t1, t1, -32
; CHECK-NEXT:    addi a6, a6, 32
; CHECK-NEXT:    addi a2, a2, 160
; CHECK-NEXT:    bnez t1, .LBB13_3
; CHECK-NEXT:  # %bb.4: # %bb30
; CHECK-NEXT:    beq a4, a5, .LBB13_7
; CHECK-NEXT:  .LBB13_5: # %bb32
; CHECK-NEXT:    addiw a2, a3, -1024
; CHECK-NEXT:    add a0, a0, a3
; CHECK-NEXT:    slli a4, a3, 2
; CHECK-NEXT:    add a1, a1, a3
; CHECK-NEXT:    add a1, a1, a4
; CHECK-NEXT:  .LBB13_6: # %bb35
; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    lbu a3, 0(a1)
; CHECK-NEXT:    lbu a4, 0(a0)
; CHECK-NEXT:    add a3, a4, a3
; CHECK-NEXT:    sb a3, 0(a0)
; CHECK-NEXT:    addiw a2, a2, 1
; CHECK-NEXT:    addi a0, a0, 1
; CHECK-NEXT:    addi a1, a1, 5
; CHECK-NEXT:    bnez a2, .LBB13_6
; CHECK-NEXT:  .LBB13_7: # %bb34
; CHECK-NEXT:    ret
bb:
  %i = icmp eq i32 %arg2, 1024
  br i1 %i, label %bb34, label %bb3

bb3:                                              ; preds = %bb
  %i4 = sext i32 %arg2 to i64
  %i5 = sub i32 1023, %arg2
  %i6 = zext i32 %i5 to i64
  %i7 = add nuw nsw i64 %i6, 1
  %i8 = icmp ult i32 %i5, 31
  br i1 %i8, label %bb32, label %bb9

bb9:                                              ; preds = %bb3
  %i10 = and i64 %i7, 8589934560
  %i11 = add nsw i64 %i10, %i4
  %i12 = insertelement <32 x i64> poison, i64 %i4, i64 0
  %i13 = shufflevector <32 x i64> %i12, <32 x i64> poison, <32 x i32> zeroinitializer
  %i14 = add <32 x i64> %i13, <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7, i64 8, i64 9, i64 10, i64 11, i64 12, i64 13, i64 14, i64 15, i64 16, i64 17, i64 18, i64 19, i64 20, i64 21, i64 22, i64 23, i64 24, i64 25, i64 26, i64 27, i64 28, i64 29, i64 30, i64 31>
  br label %bb15

bb15:                                             ; preds = %bb15, %bb9
  %i16 = phi i64 [ 0, %bb9 ], [ %i27, %bb15 ]
  %i17 = phi <32 x i64> [ %i14, %bb9 ], [ %i28, %bb15 ]
  %i18 = add i64 %i16, %i4
  %i19 = mul nsw <32 x i64> %i17, splat (i64 5)
  %i20 = getelementptr inbounds i8, ptr %arg1, <32 x i64> %i19
  %i21 = call <32 x i8> @llvm.masked.gather.v32i8.v32p0(<32 x ptr> %i20, i32 1, <32 x i1> splat (i1 true), <32 x i8> undef)
  %i22 = getelementptr inbounds i8, ptr %arg, i64 %i18
  %i24 = load <32 x i8>, ptr %i22, align 1
  %i25 = add <32 x i8> %i24, %i21
  store <32 x i8> %i25, ptr %i22, align 1
  %i27 = add nuw i64 %i16, 32
  %i28 = add <32 x i64> %i17, splat (i64 32)
  %i29 = icmp eq i64 %i27, %i10
  br i1 %i29, label %bb30, label %bb15

bb30:                                             ; preds = %bb15
  %i31 = icmp eq i64 %i7, %i10
  br i1 %i31, label %bb34, label %bb32

bb32:                                             ; preds = %bb30, %bb3
  %i33 = phi i64 [ %i4, %bb3 ], [ %i11, %bb30 ]
  br label %bb35

bb34:                                             ; preds = %bb35, %bb30, %bb
  ret void

bb35:                                             ; preds = %bb35, %bb32
  %i36 = phi i64 [ %i43, %bb35 ], [ %i33, %bb32 ]
  %i37 = mul nsw i64 %i36, 5
  %i38 = getelementptr inbounds i8, ptr %arg1, i64 %i37
  %i39 = load i8, ptr %i38, align 1
  %i40 = getelementptr inbounds i8, ptr %arg, i64 %i36
  %i41 = load i8, ptr %i40, align 1
  %i42 = add i8 %i41, %i39
  store i8 %i42, ptr %i40, align 1
  %i43 = add nsw i64 %i36, 1
  %i44 = trunc i64 %i43 to i32
  %i45 = icmp eq i32 %i44, 1024
  br i1 %i45, label %bb34, label %bb35
}

declare <16 x i8> @llvm.masked.gather.v16i8.v16p0(<16 x ptr>, i32 immarg, <16 x i1>, <16 x i8>)
declare void @llvm.masked.scatter.v16i8.v16p0(<16 x i8>, <16 x ptr>, i32 immarg, <16 x i1>)
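
; The trip count (%arg2 << 4) is always a multiple of the vector length, so
; there is no scalar remainder loop; the gather becomes a stride-5 vlse8.v at
; e8/mf2.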
define void @gather_no_scalar_remainder(ptr noalias nocapture noundef %arg, ptr noalias nocapture noundef readonly %arg1, i64 noundef %arg2) {
; CHECK-LABEL: gather_no_scalar_remainder:
; CHECK:       # %bb.0: # %bb
; CHECK-NEXT:    slli a2, a2, 4
; CHECK-NEXT:    beqz a2, .LBB14_3
; CHECK-NEXT:  # %bb.1: # %bb2
; CHECK-NEXT:    li a3, 5
; CHECK-NEXT:    vsetivli zero, 16, e8, mf2, ta, ma
; CHECK-NEXT:  .LBB14_2: # %bb4
; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vlse8.v v8, (a1), a3
; CHECK-NEXT:    vle8.v v9, (a0)
; CHECK-NEXT:    vadd.vv v8, v9, v8
; CHECK-NEXT:    vse8.v v8, (a0)
; CHECK-NEXT:    addi a2, a2, -16
; CHECK-NEXT:    addi a0, a0, 16
; CHECK-NEXT:    addi a1, a1, 80
; CHECK-NEXT:    bnez a2, .LBB14_2
; CHECK-NEXT:  .LBB14_3: # %bb16
; CHECK-NEXT:    ret
bb:
  %i = shl i64 %arg2, 4
  %i3 = icmp eq i64 %i, 0
  br i1 %i3, label %bb16, label %bb2

bb2:                                              ; preds = %bb
  br label %bb4

bb4:                                              ; preds = %bb4, %bb2
  %i5 = phi i64 [ %i13, %bb4 ], [ 0, %bb2 ]
  %i6 = phi <16 x i64> [ %i14, %bb4 ], [ <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7, i64 8, i64 9, i64 10, i64 11, i64 12, i64 13, i64 14, i64 15>, %bb2 ]
  %i7 = mul <16 x i64> %i6, splat (i64 5)
  %i8 = getelementptr inbounds i8, ptr %arg1, <16 x i64> %i7
  %i9 = call <16 x i8> @llvm.masked.gather.v16i8.v16p0(<16 x ptr> %i8, i32 1, <16 x i1> splat (i1 true), <16 x i8> undef)
  %i10 = getelementptr inbounds i8, ptr %arg, i64 %i5
  %i11 = load <16 x i8>, ptr %i10, align 1
  %i12 = add <16 x i8> %i11, %i9
  store <16 x i8> %i12, ptr %i10, align 1
  %i13 = add nuw i64 %i5, 16
  %i14 = add <16 x i64> %i6, splat (i64 16)
  %i15 = icmp eq i64 %i13, %i
  br i1 %i15, label %bb16, label %bb4

bb16:                                             ; preds = %bb4, %bb
  ret void
}