; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=aarch64-none-eabif | FileCheck %s

; Each function below is a single loop that performs one wide vector load,
; multiplies the wide vector, and only then splits the product into its
; strided lanes with shufflevector. The expected assembly shows the wide
; load being selected as a structured ld2/ld3/ld4 even though the fmul sits
; between the load and the de-interleaving shuffles.
;
; In every function %numSamples is never referenced; the loop exit is the
; constant comparison %index.next == 1024 with %index advancing by 4.
; NOTE(review): the expected fmul+fmla pairing presumably relies on the
; `fast` flags carried by the fmul/fadd instructions - confirm before
; dropping them.

; vld2: stride-2 case. Per iteration: load 8 floats from %pSrc, square them,
; add the even-lane squares to the odd-lane squares, store 4 floats to %pDst.
; Expected: one post-incremented ld2 (#32 bytes) followed by fmul + fmla.
define void @vld2(float* nocapture readonly %pSrc, float* noalias nocapture %pDst, i32 %numSamples) {
; CHECK-LABEL: vld2:
; CHECK:       // %bb.0: // %entry
; CHECK-NEXT:    mov x8, xzr
; CHECK-NEXT:  .LBB0_1: // %vector.body
; CHECK-NEXT:    // =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    ld2 { v0.4s, v1.4s }, [x0], #32
; CHECK-NEXT:    fmul v2.4s, v0.4s, v0.4s
; CHECK-NEXT:    fmla v2.4s, v1.4s, v1.4s
; CHECK-NEXT:    str q2, [x1, x8]
; CHECK-NEXT:    add x8, x8, #16 // =16
; CHECK-NEXT:    cmp x8, #1, lsl #12 // =4096
; CHECK-NEXT:    b.ne .LBB0_1
; CHECK-NEXT:  // %bb.2: // %while.end
; CHECK-NEXT:    ret
entry:
  br label %vector.body

vector.body:                                      ; preds = %vector.body, %entry
  %index = phi i64 [ 0, %entry ], [ %index.next, %vector.body ]
  ; Source element offset is 2 * %index; destination offset is %index.
  %0 = shl i64 %index, 1
  %next.gep = getelementptr float, float* %pSrc, i64 %0
  %next.gep19 = getelementptr float, float* %pDst, i64 %index
  %1 = bitcast float* %next.gep to <8 x float>*
  %wide.vec = load <8 x float>, <8 x float>* %1, align 4
  ; %2 and %4 are the same wide product computed twice; each copy feeds
  ; exactly one strided shuffle (even lanes, then odd lanes).
  %2 = fmul fast <8 x float> %wide.vec, %wide.vec
  %3 = shufflevector <8 x float> %2, <8 x float> undef, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
  %4 = fmul fast <8 x float> %wide.vec, %wide.vec
  %5 = shufflevector <8 x float> %4, <8 x float> undef, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
  %6 = fadd fast <4 x float> %5, %3
  %7 = bitcast float* %next.gep19 to <4 x float>*
  store <4 x float> %6, <4 x float>* %7, align 4
  %index.next = add i64 %index, 4
  %8 = icmp eq i64 %index.next, 1024
  br i1 %8, label %while.end, label %vector.body

while.end:                                        ; preds = %vector.body
  ret void
}

; vld3: stride-3 case. Per iteration: load 12 floats from %pSrc, square them,
; sum the three strided lane groups, store 4 floats to %pDst.
; Expected: one post-incremented ld3 (#48 bytes) followed by fmul + two fmla.
define void @vld3(float* nocapture readonly %pSrc, float* noalias nocapture %pDst, i32 %numSamples) {
; CHECK-LABEL: vld3:
; CHECK:       // %bb.0: // %entry
; CHECK-NEXT:    mov x8, xzr
; CHECK-NEXT:  .LBB1_1: // %vector.body
; CHECK-NEXT:    // =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    ld3 { v0.4s, v1.4s, v2.4s }, [x0], #48
; CHECK-NEXT:    fmul v3.4s, v0.4s, v0.4s
; CHECK-NEXT:    fmla v3.4s, v1.4s, v1.4s
; CHECK-NEXT:    fmla v3.4s, v2.4s, v2.4s
; CHECK-NEXT:    str q3, [x1, x8]
; CHECK-NEXT:    add x8, x8, #16 // =16
; CHECK-NEXT:    cmp x8, #1, lsl #12 // =4096
; CHECK-NEXT:    b.ne .LBB1_1
; CHECK-NEXT:  // %bb.2: // %while.end
; CHECK-NEXT:    ret
entry:
  br label %vector.body

vector.body:                                      ; preds = %vector.body, %entry
  %index = phi i64 [ 0, %entry ], [ %index.next, %vector.body ]
  ; Source element offset is 3 * %index; destination offset is %index.
  %0 = mul i64 %index, 3
  %next.gep = getelementptr float, float* %pSrc, i64 %0
  %next.gep23 = getelementptr float, float* %pDst, i64 %index
  %1 = bitcast float* %next.gep to <12 x float>*
  %wide.vec = load <12 x float>, <12 x float>* %1, align 4
  ; The wide product is computed three times (%2, %4, %7), once per
  ; stride-3 lane group selected by the following shuffle.
  %2 = fmul fast <12 x float> %wide.vec, %wide.vec
  %3 = shufflevector <12 x float> %2, <12 x float> undef, <4 x i32> <i32 0, i32 3, i32 6, i32 9>
  %4 = fmul fast <12 x float> %wide.vec, %wide.vec
  %5 = shufflevector <12 x float> %4, <12 x float> undef, <4 x i32> <i32 1, i32 4, i32 7, i32 10>
  %6 = fadd fast <4 x float> %5, %3
  %7 = fmul fast <12 x float> %wide.vec, %wide.vec
  %8 = shufflevector <12 x float> %7, <12 x float> undef, <4 x i32> <i32 2, i32 5, i32 8, i32 11>
  %9 = fadd fast <4 x float> %6, %8
  %10 = bitcast float* %next.gep23 to <4 x float>*
  store <4 x float> %9, <4 x float>* %10, align 4
  %index.next = add i64 %index, 4
  %11 = icmp eq i64 %index.next, 1024
  br i1 %11, label %while.end, label %vector.body

while.end:                                        ; preds = %vector.body
  ret void
}

; vld4: stride-4 case with an interleaved store. Per iteration: load 16 floats
; from %pSrc, square them, add lane group 0 to group 1 and group 2 to group 3,
; then interleave the two 4-float sums and store 8 floats to %pDst.
; Expected: one post-incremented ld4 (#64 bytes), two fmul/fmla pairs, and a
; single st2 for the interleaved store.
define void @vld4(float* nocapture readonly %pSrc, float* noalias nocapture %pDst, i32 %numSamples) {
; CHECK-LABEL: vld4:
; CHECK:       // %bb.0: // %entry
; CHECK-NEXT:    mov x8, xzr
; CHECK-NEXT:  .LBB2_1: // %vector.body
; CHECK-NEXT:    // =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    ld4 { v0.4s, v1.4s, v2.4s, v3.4s }, [x0], #64
; CHECK-NEXT:    add x9, x1, x8
; CHECK-NEXT:    add x8, x8, #32 // =32
; CHECK-NEXT:    cmp x8, #2, lsl #12 // =8192
; CHECK-NEXT:    fmul v4.4s, v0.4s, v0.4s
; CHECK-NEXT:    fmla v4.4s, v1.4s, v1.4s
; CHECK-NEXT:    fmul v5.4s, v2.4s, v2.4s
; CHECK-NEXT:    fmla v5.4s, v3.4s, v3.4s
; CHECK-NEXT:    st2 { v4.4s, v5.4s }, [x9]
; CHECK-NEXT:    b.ne .LBB2_1
; CHECK-NEXT:  // %bb.2: // %while.end
; CHECK-NEXT:    ret
entry:
  br label %vector.body

vector.body:                                      ; preds = %vector.body, %entry
  %index = phi i64 [ 0, %entry ], [ %index.next, %vector.body ]
  ; Source element offset is 4 * %index; destination offset (%1) is 2 * %index.
  %0 = shl i64 %index, 2
  %next.gep = getelementptr float, float* %pSrc, i64 %0
  %1 = shl i64 %index, 1
  %2 = bitcast float* %next.gep to <16 x float>*
  %wide.vec = load <16 x float>, <16 x float>* %2, align 4
  ; The wide product is computed four times (%3, %5, %8, %10), once per
  ; stride-4 lane group selected by the following shuffle.
  %3 = fmul fast <16 x float> %wide.vec, %wide.vec
  %4 = shufflevector <16 x float> %3, <16 x float> undef, <4 x i32> <i32 0, i32 4, i32 8, i32 12>
  %5 = fmul fast <16 x float> %wide.vec, %wide.vec
  %6 = shufflevector <16 x float> %5, <16 x float> undef, <4 x i32> <i32 1, i32 5, i32 9, i32 13>
  %7 = fadd fast <4 x float> %6, %4
  %8 = fmul fast <16 x float> %wide.vec, %wide.vec
  %9 = shufflevector <16 x float> %8, <16 x float> undef, <4 x i32> <i32 2, i32 6, i32 10, i32 14>
  %10 = fmul fast <16 x float> %wide.vec, %wide.vec
  %11 = shufflevector <16 x float> %10, <16 x float> undef, <4 x i32> <i32 3, i32 7, i32 11, i32 15>
  %12 = fadd fast <4 x float> %11, %9
  %13 = getelementptr inbounds float, float* %pDst, i64 %1
  %14 = bitcast float* %13 to <8 x float>*
  ; Alternate elements of %7 and %12 (mask 0,4,1,5,...) to form the 8-float
  ; value that is stored.
  %interleaved.vec = shufflevector <4 x float> %7, <4 x float> %12, <8 x i32> <i32 0, i32 4, i32 1, i32 5, i32 2, i32 6, i32 3, i32 7>
  store <8 x float> %interleaved.vec, <8 x float>* %14, align 4
  %index.next = add i64 %index, 4
  %15 = icmp eq i64 %index.next, 1024
  br i1 %15, label %while.end, label %vector.body

while.end:                                        ; preds = %vector.body
  ret void
}

; twosrc: stride-2 case with two different source operands. Per iteration:
; load 8 floats from %pSrc and 8 from %pSrc2, multiply them element-wise, add
; the even-lane products to the odd-lane products, store 4 floats to %pDst.
; Expected: two ld2 loads (one per source), fmul + fmla on the de-interleaved
; registers, and a post-incremented 16-byte store.
define void @twosrc(float* nocapture readonly %pSrc, float* nocapture readonly %pSrc2, float* noalias nocapture %pDst, i32 %numSamples) {
; CHECK-LABEL: twosrc:
; CHECK:       // %bb.0: // %entry
; CHECK-NEXT:    mov x8, xzr
; CHECK-NEXT:  .LBB3_1: // %vector.body
; CHECK-NEXT:    // =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    add x9, x0, x8
; CHECK-NEXT:    add x10, x1, x8
; CHECK-NEXT:    ld2 { v0.4s, v1.4s }, [x9]
; CHECK-NEXT:    ld2 { v2.4s, v3.4s }, [x10]
; CHECK-NEXT:    add x8, x8, #32 // =32
; CHECK-NEXT:    cmp x8, #2, lsl #12 // =8192
; CHECK-NEXT:    fmul v4.4s, v2.4s, v0.4s
; CHECK-NEXT:    fmla v4.4s, v1.4s, v3.4s
; CHECK-NEXT:    str q4, [x2], #16
; CHECK-NEXT:    b.ne .LBB3_1
; CHECK-NEXT:  // %bb.2: // %while.end
; CHECK-NEXT:    ret
entry:
  br label %vector.body

vector.body:                                      ; preds = %vector.body, %entry
  %index = phi i64 [ 0, %entry ], [ %index.next, %vector.body ]
  ; Both sources are indexed at 2 * %index (%0 and %1 hold the same value);
  ; the destination is indexed at %index.
  %0 = shl i64 %index, 1
  %next.gep = getelementptr float, float* %pSrc, i64 %0
  %1 = shl i64 %index, 1
  %next.gep23 = getelementptr float, float* %pSrc2, i64 %1
  %next.gep24 = getelementptr float, float* %pDst, i64 %index
  %2 = bitcast float* %next.gep to <8 x float>*
  %wide.vec = load <8 x float>, <8 x float>* %2, align 4
  %3 = bitcast float* %next.gep23 to <8 x float>*
  %wide.vec26 = load <8 x float>, <8 x float>* %3, align 4
  ; %4 and %6 are the same wide product computed twice; each copy feeds
  ; exactly one strided shuffle (even lanes, then odd lanes).
  %4 = fmul fast <8 x float> %wide.vec26, %wide.vec
  %5 = shufflevector <8 x float> %4, <8 x float> undef, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
  %6 = fmul fast <8 x float> %wide.vec26, %wide.vec
  %7 = shufflevector <8 x float> %6, <8 x float> undef, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
  %8 = fadd fast <4 x float> %7, %5
  %9 = bitcast float* %next.gep24 to <4 x float>*
  store <4 x float> %8, <4 x float>* %9, align 4
  %index.next = add i64 %index, 4
  %10 = icmp eq i64 %index.next, 1024
  br i1 %10, label %while.end, label %vector.body

while.end:                                        ; preds = %vector.body
  ret void
}