; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
; RUN: opt < %s -mtriple=x86_64-unknown -passes=slp-vectorizer -S | FileCheck %s
; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=corei7 -passes=slp-vectorizer -S | FileCheck %s
; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=corei7-avx -passes=slp-vectorizer -S | FileCheck %s
; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=core-avx2 -passes=slp-vectorizer -S | FileCheck %s

; SLP vectorizer tests for small, fully unrolled dot products of 4, 3 and 2
; elements, in double and float, with and without the 'fast' flag on the adds.
; All four invocations above use the default check prefix, so every target
; configuration is held to the same expected output.

;
; dot4(ptr x, ptr y) - ((x[0]*y[0])+(x[1]*y[1])+(x[2]*y[2])+(x[3]*y[3]))
;

; Four doubles, adds without fast-math flags. Expected output: the loads and
; multiplies are done as two <2 x double> halves, while the three adds stay
; scalar and are fed by extractelement.
define double @dot4f64(ptr dereferenceable(32) %ptrx, ptr dereferenceable(32) %ptry) {
; CHECK-LABEL: @dot4f64(
; CHECK-NEXT: [[PTRX2:%.*]] = getelementptr inbounds double, ptr [[PTRX:%.*]], i64 2
; CHECK-NEXT: [[PTRY2:%.*]] = getelementptr inbounds double, ptr [[PTRY:%.*]], i64 2
; CHECK-NEXT: [[TMP2:%.*]] = load <2 x double>, ptr [[PTRX]], align 4
; CHECK-NEXT: [[TMP4:%.*]] = load <2 x double>, ptr [[PTRY]], align 4
; CHECK-NEXT: [[TMP5:%.*]] = fmul <2 x double> [[TMP2]], [[TMP4]]
; CHECK-NEXT: [[TMP7:%.*]] = load <2 x double>, ptr [[PTRX2]], align 4
; CHECK-NEXT: [[TMP9:%.*]] = load <2 x double>, ptr [[PTRY2]], align 4
; CHECK-NEXT: [[TMP10:%.*]] = fmul <2 x double> [[TMP7]], [[TMP9]]
; CHECK-NEXT: [[TMP11:%.*]] = extractelement <2 x double> [[TMP5]], i32 0
; CHECK-NEXT: [[TMP12:%.*]] = extractelement <2 x double> [[TMP5]], i32 1
; CHECK-NEXT: [[DOT01:%.*]] = fadd double [[TMP11]], [[TMP12]]
; CHECK-NEXT: [[TMP13:%.*]] = extractelement <2 x double> [[TMP10]], i32 0
; CHECK-NEXT: [[DOT012:%.*]] = fadd double [[DOT01]], [[TMP13]]
; CHECK-NEXT: [[TMP14:%.*]] = extractelement <2 x double> [[TMP10]], i32 1
; CHECK-NEXT: [[DOT0123:%.*]] = fadd double [[DOT012]], [[TMP14]]
; CHECK-NEXT: ret double [[DOT0123]]
;
  %ptrx1 = getelementptr inbounds double, ptr %ptrx, i64 1
  %ptry1 = getelementptr inbounds double, ptr %ptry, i64 1
  %ptrx2 = getelementptr inbounds double, ptr %ptrx, i64 2
  %ptry2 = getelementptr inbounds double, ptr %ptry, i64 2
  %ptrx3 = getelementptr inbounds double, ptr %ptrx, i64 3
  %ptry3 = getelementptr inbounds double, ptr %ptry, i64 3
  %x0 = load double, ptr %ptrx, align 4
  %y0 = load double, ptr %ptry, align 4
  %x1 = load double, ptr %ptrx1, align 4
  %y1 = load double, ptr %ptry1, align 4
  %x2 = load double, ptr %ptrx2, align 4
  %y2 = load double, ptr %ptry2, align 4
  %x3 = load double, ptr %ptrx3, align 4
  %y3 = load double, ptr %ptry3, align 4
  %mul0 = fmul double %x0, %y0
  %mul1 = fmul double %x1, %y1
  %mul2 = fmul double %x2, %y2
  %mul3 = fmul double %x3, %y3
  %dot01 = fadd double %mul0, %mul1
  %dot012 = fadd double %dot01, %mul2
  %dot0123 = fadd double %dot012, %mul3
  ret double %dot0123
}

; Float counterpart of dot4f64: same expected shape, using <2 x float>.
define float @dot4f32(ptr dereferenceable(16) %ptrx, ptr dereferenceable(16) %ptry) {
; CHECK-LABEL: @dot4f32(
; CHECK-NEXT: [[PTRX2:%.*]] = getelementptr inbounds float, ptr [[PTRX:%.*]], i64 2
; CHECK-NEXT: [[PTRY2:%.*]] = getelementptr inbounds float, ptr [[PTRY:%.*]], i64 2
; CHECK-NEXT: [[TMP2:%.*]] = load <2 x float>, ptr [[PTRX]], align 4
; CHECK-NEXT: [[TMP4:%.*]] = load <2 x float>, ptr [[PTRY]], align 4
; CHECK-NEXT: [[TMP5:%.*]] = fmul <2 x float> [[TMP2]], [[TMP4]]
; CHECK-NEXT: [[TMP7:%.*]] = load <2 x float>, ptr [[PTRX2]], align 4
; CHECK-NEXT: [[TMP9:%.*]] = load <2 x float>, ptr [[PTRY2]], align 4
; CHECK-NEXT: [[TMP10:%.*]] = fmul <2 x float> [[TMP7]], [[TMP9]]
; CHECK-NEXT: [[TMP11:%.*]] = extractelement <2 x float> [[TMP5]], i32 0
; CHECK-NEXT: [[TMP12:%.*]] = extractelement <2 x float> [[TMP5]], i32 1
; CHECK-NEXT: [[DOT01:%.*]] = fadd float [[TMP11]], [[TMP12]]
; CHECK-NEXT: [[TMP13:%.*]] = extractelement <2 x float> [[TMP10]], i32 0
; CHECK-NEXT: [[DOT012:%.*]] = fadd float [[DOT01]], [[TMP13]]
; CHECK-NEXT: [[TMP14:%.*]] = extractelement <2 x float> [[TMP10]], i32 1
; CHECK-NEXT: [[DOT0123:%.*]] = fadd float [[DOT012]], [[TMP14]]
; CHECK-NEXT: ret float [[DOT0123]]
;
  %ptrx1 = getelementptr inbounds float, ptr %ptrx, i64 1
  %ptry1 = getelementptr inbounds float, ptr %ptry, i64 1
  %ptrx2 = getelementptr inbounds float, ptr %ptrx, i64 2
  %ptry2 = getelementptr inbounds float, ptr %ptry, i64 2
  %ptrx3 = getelementptr inbounds float, ptr %ptrx, i64 3
  %ptry3 = getelementptr inbounds float, ptr %ptry, i64 3
  %x0 = load float, ptr %ptrx, align 4
  %y0 = load float, ptr %ptry, align 4
  %x1 = load float, ptr %ptrx1, align 4
  %y1 = load float, ptr %ptry1, align 4
  %x2 = load float, ptr %ptrx2, align 4
  %y2 = load float, ptr %ptry2, align 4
  %x3 = load float, ptr %ptrx3, align 4
  %y3 = load float, ptr %ptry3, align 4
  %mul0 = fmul float %x0, %y0
  %mul1 = fmul float %x1, %y1
  %mul2 = fmul float %x2, %y2
  %mul3 = fmul float %x3, %y3
  %dot01 = fadd float %mul0, %mul1
  %dot012 = fadd float %dot01, %mul2
  %dot0123 = fadd float %dot012, %mul3
  ret float %dot0123
}

; Four doubles, adds marked 'fast' (the multiplies carry no flags). Expected
; output: one <4 x double> load per operand, one <4 x double> multiply, and a
; single llvm.vector.reduce.fadd call with a -0.0 start value.
define double @dot4f64_fast(ptr dereferenceable(32) %ptrx, ptr dereferenceable(32) %ptry) {
; CHECK-LABEL: @dot4f64_fast(
; CHECK-NEXT: [[TMP2:%.*]] = load <4 x double>, ptr [[PTRX:%.*]], align 4
; CHECK-NEXT: [[TMP4:%.*]] = load <4 x double>, ptr [[PTRY:%.*]], align 4
; CHECK-NEXT: [[TMP5:%.*]] = fmul <4 x double> [[TMP2]], [[TMP4]]
; CHECK-NEXT: [[TMP6:%.*]] = call fast double @llvm.vector.reduce.fadd.v4f64(double -0.000000e+00, <4 x double> [[TMP5]])
; CHECK-NEXT: ret double [[TMP6]]
;
  %ptrx1 = getelementptr inbounds double, ptr %ptrx, i64 1
  %ptry1 = getelementptr inbounds double, ptr %ptry, i64 1
  %ptrx2 = getelementptr inbounds double, ptr %ptrx, i64 2
  %ptry2 = getelementptr inbounds double, ptr %ptry, i64 2
  %ptrx3 = getelementptr inbounds double, ptr %ptrx, i64 3
  %ptry3 = getelementptr inbounds double, ptr %ptry, i64 3
  %x0 = load double, ptr %ptrx, align 4
  %y0 = load double, ptr %ptry, align 4
  %x1 = load double, ptr %ptrx1, align 4
  %y1 = load double, ptr %ptry1, align 4
  %x2 = load double, ptr %ptrx2, align 4
  %y2 = load double, ptr %ptry2, align 4
  %x3 = load double, ptr %ptrx3, align 4
  %y3 = load double, ptr %ptry3, align 4
  %mul0 = fmul double %x0, %y0
  %mul1 = fmul double %x1, %y1
  %mul2 = fmul double %x2, %y2
  %mul3 = fmul double %x3, %y3
  %dot01 = fadd fast double %mul0, %mul1
  %dot012 = fadd fast double %dot01, %mul2
  %dot0123 = fadd fast double %dot012, %mul3
  ret double %dot0123
}

; Float counterpart of dot4f64_fast: <4 x float> loads and multiply followed by
; llvm.vector.reduce.fadd.v4f32.
define float @dot4f32_fast(ptr dereferenceable(16) %ptrx, ptr dereferenceable(16) %ptry) {
; CHECK-LABEL: @dot4f32_fast(
; CHECK-NEXT: [[TMP2:%.*]] = load <4 x float>, ptr [[PTRX:%.*]], align 4
; CHECK-NEXT: [[TMP4:%.*]] = load <4 x float>, ptr [[PTRY:%.*]], align 4
; CHECK-NEXT: [[TMP5:%.*]] = fmul <4 x float> [[TMP2]], [[TMP4]]
; CHECK-NEXT: [[TMP6:%.*]] = call fast float @llvm.vector.reduce.fadd.v4f32(float -0.000000e+00, <4 x float> [[TMP5]])
; CHECK-NEXT: ret float [[TMP6]]
;
  %ptrx1 = getelementptr inbounds float, ptr %ptrx, i64 1
  %ptry1 = getelementptr inbounds float, ptr %ptry, i64 1
  %ptrx2 = getelementptr inbounds float, ptr %ptrx, i64 2
  %ptry2 = getelementptr inbounds float, ptr %ptry, i64 2
  %ptrx3 = getelementptr inbounds float, ptr %ptrx, i64 3
  %ptry3 = getelementptr inbounds float, ptr %ptry, i64 3
  %x0 = load float, ptr %ptrx, align 4
  %y0 = load float, ptr %ptry, align 4
  %x1 = load float, ptr %ptrx1, align 4
  %y1 = load float, ptr %ptry1, align 4
  %x2 = load float, ptr %ptrx2, align 4
  %y2 = load float, ptr %ptry2, align 4
  %x3 = load float, ptr %ptrx3, align 4
  %y3 = load float, ptr %ptry3, align 4
  %mul0 = fmul float %x0, %y0
  %mul1 = fmul float %x1, %y1
  %mul2 = fmul float %x2, %y2
  %mul3 = fmul float %x3, %y3
  %dot01 = fadd fast float %mul0, %mul1
  %dot012 = fadd fast float %dot01, %mul2
  %dot0123 = fadd fast float %dot012, %mul3
  ret float %dot0123
}

;
; dot3(ptr x, ptr y) - ((x[0]*y[0])+(x[1]*y[1])+(x[2]*y[2]))
;

; Three doubles, adds without fast-math flags. Expected output: element 0 is
; loaded and multiplied as a scalar, elements 1 and 2 are loaded and multiplied
; as <2 x double>, and both adds stay scalar.
define double @dot3f64(ptr dereferenceable(32) %ptrx, ptr dereferenceable(32) %ptry) {
; CHECK-LABEL: @dot3f64(
; CHECK-NEXT: [[PTRX1:%.*]] = getelementptr inbounds double, ptr [[PTRX:%.*]], i64 1
; CHECK-NEXT: [[PTRY1:%.*]] = getelementptr inbounds double, ptr [[PTRY:%.*]], i64 1
; CHECK-NEXT: [[X0:%.*]] = load double, ptr [[PTRX]], align 4
; CHECK-NEXT: [[Y0:%.*]] = load double, ptr [[PTRY]], align 4
; CHECK-NEXT: [[MUL0:%.*]] = fmul double [[X0]], [[Y0]]
; CHECK-NEXT: [[TMP2:%.*]] = load <2 x double>, ptr [[PTRX1]], align 4
; CHECK-NEXT: [[TMP4:%.*]] = load <2 x double>, ptr [[PTRY1]], align 4
; CHECK-NEXT: [[TMP5:%.*]] = fmul <2 x double> [[TMP2]], [[TMP4]]
; CHECK-NEXT: [[TMP6:%.*]] = extractelement <2 x double> [[TMP5]], i32 0
; CHECK-NEXT: [[DOT01:%.*]] = fadd double [[MUL0]], [[TMP6]]
; CHECK-NEXT: [[TMP7:%.*]] = extractelement <2 x double> [[TMP5]], i32 1
; CHECK-NEXT: [[DOT012:%.*]] = fadd double [[DOT01]], [[TMP7]]
; CHECK-NEXT: ret double [[DOT012]]
;
  %ptrx1 = getelementptr inbounds double, ptr %ptrx, i64 1
  %ptry1 = getelementptr inbounds double, ptr %ptry, i64 1
  %ptrx2 = getelementptr inbounds double, ptr %ptrx, i64 2
  %ptry2 = getelementptr inbounds double, ptr %ptry, i64 2
  %x0 = load double, ptr %ptrx, align 4
  %y0 = load double, ptr %ptry, align 4
  %x1 = load double, ptr %ptrx1, align 4
  %y1 = load double, ptr %ptry1, align 4
  %x2 = load double, ptr %ptrx2, align 4
  %y2 = load double, ptr %ptry2, align 4
  %mul0 = fmul double %x0, %y0
  %mul1 = fmul double %x1, %y1
  %mul2 = fmul double %x2, %y2
  %dot01 = fadd double %mul0, %mul1
  %dot012 = fadd double %dot01, %mul2
  ret double %dot012
}

; Float counterpart of dot3f64: same expected shape, using <2 x float>.
define float @dot3f32(ptr dereferenceable(16) %ptrx, ptr dereferenceable(16) %ptry) {
; CHECK-LABEL: @dot3f32(
; CHECK-NEXT: [[PTRX1:%.*]] = getelementptr inbounds float, ptr [[PTRX:%.*]], i64 1
; CHECK-NEXT: [[PTRY1:%.*]] = getelementptr inbounds float, ptr [[PTRY:%.*]], i64 1
; CHECK-NEXT: [[X0:%.*]] = load float, ptr [[PTRX]], align 4
; CHECK-NEXT: [[Y0:%.*]] = load float, ptr [[PTRY]], align 4
; CHECK-NEXT: [[MUL0:%.*]] = fmul float [[X0]], [[Y0]]
; CHECK-NEXT: [[TMP2:%.*]] = load <2 x float>, ptr [[PTRX1]], align 4
; CHECK-NEXT: [[TMP4:%.*]] = load <2 x float>, ptr [[PTRY1]], align 4
; CHECK-NEXT: [[TMP5:%.*]] = fmul <2 x float> [[TMP2]], [[TMP4]]
; CHECK-NEXT: [[TMP6:%.*]] = extractelement <2 x float> [[TMP5]], i32 0
; CHECK-NEXT: [[DOT01:%.*]] = fadd float [[MUL0]], [[TMP6]]
; CHECK-NEXT: [[TMP7:%.*]] = extractelement <2 x float> [[TMP5]], i32 1
; CHECK-NEXT: [[DOT012:%.*]] = fadd float [[DOT01]], [[TMP7]]
; CHECK-NEXT: ret float [[DOT012]]
;
  %ptrx1 = getelementptr inbounds float, ptr %ptrx, i64 1
  %ptry1 = getelementptr inbounds float, ptr %ptry, i64 1
  %ptrx2 = getelementptr inbounds float, ptr %ptrx, i64 2
  %ptry2 = getelementptr inbounds float, ptr %ptry, i64 2
  %x0 = load float, ptr %ptrx, align 4
  %y0 = load float, ptr %ptry, align 4
  %x1 = load float, ptr %ptrx1, align 4
  %y1 = load float, ptr %ptry1, align 4
  %x2 = load float, ptr %ptrx2, align 4
  %y2 = load float, ptr %ptry2, align 4
  %mul0 = fmul float %x0, %y0
  %mul1 = fmul float %x1, %y1
  %mul2 = fmul float %x2, %y2
  %dot01 = fadd float %mul0, %mul1
  %dot012 = fadd float %dot01, %mul2
  ret float %dot012
}

; Three doubles, adds marked 'fast'. Expected output has the same shape as
; dot3f64 (scalar element 0, <2 x double> for elements 1 and 2), with 'fast'
; kept on the two scalar adds; no reduction intrinsic is expected here.
define double @dot3f64_fast(ptr dereferenceable(32) %ptrx, ptr dereferenceable(32) %ptry) {
; CHECK-LABEL: @dot3f64_fast(
; CHECK-NEXT: [[PTRX1:%.*]] = getelementptr inbounds double, ptr [[PTRX:%.*]], i64 1
; CHECK-NEXT: [[PTRY1:%.*]] = getelementptr inbounds double, ptr [[PTRY:%.*]], i64 1
; CHECK-NEXT: [[X0:%.*]] = load double, ptr [[PTRX]], align 4
; CHECK-NEXT: [[Y0:%.*]] = load double, ptr [[PTRY]], align 4
; CHECK-NEXT: [[MUL0:%.*]] = fmul double [[X0]], [[Y0]]
; CHECK-NEXT: [[TMP2:%.*]] = load <2 x double>, ptr [[PTRX1]], align 4
; CHECK-NEXT: [[TMP4:%.*]] = load <2 x double>, ptr [[PTRY1]], align 4
; CHECK-NEXT: [[TMP5:%.*]] = fmul <2 x double> [[TMP2]], [[TMP4]]
; CHECK-NEXT: [[TMP6:%.*]] = extractelement <2 x double> [[TMP5]], i32 0
; CHECK-NEXT: [[DOT01:%.*]] = fadd fast double [[MUL0]], [[TMP6]]
; CHECK-NEXT: [[TMP7:%.*]] = extractelement <2 x double> [[TMP5]], i32 1
; CHECK-NEXT: [[DOT012:%.*]] = fadd fast double [[DOT01]], [[TMP7]]
; CHECK-NEXT: ret double [[DOT012]]
;
  %ptrx1 = getelementptr inbounds double, ptr %ptrx, i64 1
  %ptry1 = getelementptr inbounds double, ptr %ptry, i64 1
  %ptrx2 = getelementptr inbounds double, ptr %ptrx, i64 2
  %ptry2 = getelementptr inbounds double, ptr %ptry, i64 2
  %x0 = load double, ptr %ptrx, align 4
  %y0 = load double, ptr %ptry, align 4
  %x1 = load double, ptr %ptrx1, align 4
  %y1 = load double, ptr %ptry1, align 4
  %x2 = load double, ptr %ptrx2, align 4
  %y2 = load double, ptr %ptry2, align 4
  %mul0 = fmul double %x0, %y0
  %mul1 = fmul double %x1, %y1
  %mul2 = fmul double %x2, %y2
  %dot01 = fadd fast double %mul0, %mul1
  %dot012 = fadd fast double %dot01, %mul2
  ret double %dot012
}

; Float counterpart of dot3f64_fast: same expected shape, using <2 x float>.
define float @dot3f32_fast(ptr dereferenceable(16) %ptrx, ptr dereferenceable(16) %ptry) {
; CHECK-LABEL: @dot3f32_fast(
; CHECK-NEXT: [[PTRX1:%.*]] = getelementptr inbounds float, ptr [[PTRX:%.*]], i64 1
; CHECK-NEXT: [[PTRY1:%.*]] = getelementptr inbounds float, ptr [[PTRY:%.*]], i64 1
; CHECK-NEXT: [[X0:%.*]] = load float, ptr [[PTRX]], align 4
; CHECK-NEXT: [[Y0:%.*]] = load float, ptr [[PTRY]], align 4
; CHECK-NEXT: [[MUL0:%.*]] = fmul float [[X0]], [[Y0]]
; CHECK-NEXT: [[TMP2:%.*]] = load <2 x float>, ptr [[PTRX1]], align 4
; CHECK-NEXT: [[TMP4:%.*]] = load <2 x float>, ptr [[PTRY1]], align 4
; CHECK-NEXT: [[TMP5:%.*]] = fmul <2 x float> [[TMP2]], [[TMP4]]
; CHECK-NEXT: [[TMP6:%.*]] = extractelement <2 x float> [[TMP5]], i32 0
; CHECK-NEXT: [[DOT01:%.*]] = fadd fast float [[MUL0]], [[TMP6]]
; CHECK-NEXT: [[TMP7:%.*]] = extractelement <2 x float> [[TMP5]], i32 1
; CHECK-NEXT: [[DOT012:%.*]] = fadd fast float [[DOT01]], [[TMP7]]
; CHECK-NEXT: ret float [[DOT012]]
;
  %ptrx1 = getelementptr inbounds float, ptr %ptrx, i64 1
  %ptry1 = getelementptr inbounds float, ptr %ptry, i64 1
  %ptrx2 = getelementptr inbounds float, ptr %ptrx, i64 2
  %ptry2 = getelementptr inbounds float, ptr %ptry, i64 2
  %x0 = load float, ptr %ptrx, align 4
  %y0 = load float, ptr %ptry, align 4
  %x1 = load float, ptr %ptrx1, align 4
  %y1 = load float, ptr %ptry1, align 4
  %x2 = load float, ptr %ptrx2, align 4
  %y2 = load float, ptr %ptry2, align 4
  %mul0 = fmul float %x0, %y0
  %mul1 = fmul float %x1, %y1
  %mul2 = fmul float %x2, %y2
  %dot01 = fadd fast float %mul0, %mul1
  %dot012 = fadd fast float %dot01, %mul2
  ret float %dot012
}

;
; dot2(ptr x, ptr y) - ((x[0]*y[0])+(x[1]*y[1]))
;

; Two doubles, add without fast-math flags. Expected output: <2 x double>
; loads and multiply, then both lanes are extracted and summed by one scalar
; add.
define double @dot2f64(ptr dereferenceable(16) %ptrx, ptr dereferenceable(16) %ptry) {
; CHECK-LABEL: @dot2f64(
; CHECK-NEXT: [[TMP2:%.*]] = load <2 x double>, ptr [[PTRX:%.*]], align 4
; CHECK-NEXT: [[TMP4:%.*]] = load <2 x double>, ptr [[PTRY:%.*]], align 4
; CHECK-NEXT: [[TMP5:%.*]] = fmul <2 x double> [[TMP2]], [[TMP4]]
; CHECK-NEXT: [[TMP6:%.*]] = extractelement <2 x double> [[TMP5]], i32 0
; CHECK-NEXT: [[TMP7:%.*]] = extractelement <2 x double> [[TMP5]], i32 1
; CHECK-NEXT: [[DOT01:%.*]] = fadd double [[TMP6]], [[TMP7]]
; CHECK-NEXT: ret double [[DOT01]]
;
  %ptrx1 = getelementptr inbounds double, ptr %ptrx, i64 1
  %ptry1 = getelementptr inbounds double, ptr %ptry, i64 1
  %x0 = load double, ptr %ptrx, align 4
  %y0 = load double, ptr %ptry, align 4
  %x1 = load double, ptr %ptrx1, align 4
  %y1 = load double, ptr %ptry1, align 4
  %mul0 = fmul double %x0, %y0
  %mul1 = fmul double %x1, %y1
  %dot01 = fadd double %mul0, %mul1
  ret double %dot01
}

; Float counterpart of dot2f64: same expected shape, using <2 x float>.
define float @dot2f32(ptr dereferenceable(16) %ptrx, ptr dereferenceable(16) %ptry) {
; CHECK-LABEL: @dot2f32(
; CHECK-NEXT: [[TMP2:%.*]] = load <2 x float>, ptr [[PTRX:%.*]], align 4
; CHECK-NEXT: [[TMP4:%.*]] = load <2 x float>, ptr [[PTRY:%.*]], align 4
; CHECK-NEXT: [[TMP5:%.*]] = fmul <2 x float> [[TMP2]], [[TMP4]]
; CHECK-NEXT: [[TMP6:%.*]] = extractelement <2 x float> [[TMP5]], i32 0
; CHECK-NEXT: [[TMP7:%.*]] = extractelement <2 x float> [[TMP5]], i32 1
; CHECK-NEXT: [[DOT01:%.*]] = fadd float [[TMP6]], [[TMP7]]
; CHECK-NEXT: ret float [[DOT01]]
;
  %ptrx1 = getelementptr inbounds float, ptr %ptrx, i64 1
  %ptry1 = getelementptr inbounds float, ptr %ptry, i64 1
  %x0 = load float, ptr %ptrx, align 4
  %y0 = load float, ptr %ptry, align 4
  %x1 = load float, ptr %ptrx1, align 4
  %y1 = load float, ptr %ptry1, align 4
  %mul0 = fmul float %x0, %y0
  %mul1 = fmul float %x1, %y1
  %dot01 = fadd float %mul0, %mul1
  ret float %dot01
}

; Two doubles, add marked 'fast'. Expected output has the same shape as
; dot2f64, with 'fast' kept on the single scalar add.
define double @dot2f64_fast(ptr dereferenceable(16) %ptrx, ptr dereferenceable(16) %ptry) {
; CHECK-LABEL: @dot2f64_fast(
; CHECK-NEXT: [[TMP2:%.*]] = load <2 x double>, ptr [[PTRX:%.*]], align 4
; CHECK-NEXT: [[TMP4:%.*]] = load <2 x double>, ptr [[PTRY:%.*]], align 4
; CHECK-NEXT: [[TMP5:%.*]] = fmul <2 x double> [[TMP2]], [[TMP4]]
; CHECK-NEXT: [[TMP6:%.*]] = extractelement <2 x double> [[TMP5]], i32 0
; CHECK-NEXT: [[TMP7:%.*]] = extractelement <2 x double> [[TMP5]], i32 1
; CHECK-NEXT: [[DOT01:%.*]] = fadd fast double [[TMP6]], [[TMP7]]
; CHECK-NEXT: ret double [[DOT01]]
;
  %ptrx1 = getelementptr inbounds double, ptr %ptrx, i64 1
  %ptry1 = getelementptr inbounds double, ptr %ptry, i64 1
  %x0 = load double, ptr %ptrx, align 4
  %y0 = load double, ptr %ptry, align 4
  %x1 = load double, ptr %ptrx1, align 4
  %y1 = load double, ptr %ptry1, align 4
  %mul0 = fmul double %x0, %y0
  %mul1 = fmul double %x1, %y1
  %dot01 = fadd fast double %mul0, %mul1
  ret double %dot01
}

; Float counterpart of dot2f64_fast: same expected shape, using <2 x float>.
define float @dot2f32_fast(ptr dereferenceable(16) %ptrx, ptr dereferenceable(16) %ptry) {
; CHECK-LABEL: @dot2f32_fast(
; CHECK-NEXT: [[TMP2:%.*]] = load <2 x float>, ptr [[PTRX:%.*]], align 4
; CHECK-NEXT: [[TMP4:%.*]] = load <2 x float>, ptr [[PTRY:%.*]], align 4
; CHECK-NEXT: [[TMP5:%.*]] = fmul <2 x float> [[TMP2]], [[TMP4]]
; CHECK-NEXT: [[TMP6:%.*]] = extractelement <2 x float> [[TMP5]], i32 0
; CHECK-NEXT: [[TMP7:%.*]] = extractelement <2 x float> [[TMP5]], i32 1
; CHECK-NEXT: [[DOT01:%.*]] = fadd fast float [[TMP6]], [[TMP7]]
; CHECK-NEXT: ret float [[DOT01]]
;
  %ptrx1 = getelementptr inbounds float, ptr %ptrx, i64 1
  %ptry1 = getelementptr inbounds float, ptr %ptry, i64 1
  %x0 = load float, ptr %ptrx, align 4
  %y0 = load float, ptr %ptry, align 4
  %x1 = load float, ptr %ptrx1, align 4
  %y1 = load float, ptr %ptry1, align 4
  %mul0 = fmul float %x0, %y0
  %mul1 = fmul float %x1, %y1
  %dot01 = fadd fast float %mul0, %mul1
  ret float %dot01
}