; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=aarch64-none-eabif | FileCheck %s

; vld2: factor-2 de-interleaving load feeding a sum of squares.
;
; Each iteration loads 8 consecutive floats starting at pSrc[2*index],
; squares the wide vector lane-wise, then adds the even lanes (0,2,4,6) to the
; odd lanes (1,3,5,7). For each of the 4 results k = 0..3:
;   pDst[index+k] = pSrc[2*(index+k)]^2 + pSrc[2*(index+k)+1]^2
; %index advances by 4 until it equals 1024; %numSamples is not used.
;
; The multiply is written on the wide <8 x float> value, once per shuffle.
; The autogenerated assertions expect a single post-incremented `ld2` whose
; two result registers feed a 4-lane fmul/fmla pair directly.
define void @vld2(float* nocapture readonly %pSrc, float* noalias nocapture %pDst, i32 %numSamples) {
; CHECK-LABEL: vld2:
; CHECK:       // %bb.0: // %entry
; CHECK-NEXT:    mov x8, xzr
; CHECK-NEXT:  .LBB0_1: // %vector.body
; CHECK-NEXT:    // =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    ld2 { v0.4s, v1.4s }, [x0], #32
; CHECK-NEXT:    fmul v2.4s, v0.4s, v0.4s
; CHECK-NEXT:    fmla v2.4s, v1.4s, v1.4s
; CHECK-NEXT:    str q2, [x1, x8]
; CHECK-NEXT:    add x8, x8, #16 // =16
; CHECK-NEXT:    cmp x8, #1, lsl #12 // =4096
; CHECK-NEXT:    b.ne .LBB0_1
; CHECK-NEXT:  // %bb.2: // %while.end
; CHECK-NEXT:    ret
entry:
  br label %vector.body

vector.body:                                      ; preds = %vector.body, %entry
  %index = phi i64 [ 0, %entry ], [ %index.next, %vector.body ]
  ; Source advances two floats per output element.
  %0 = shl i64 %index, 1
  %next.gep = getelementptr float, float* %pSrc, i64 %0
  %next.gep19 = getelementptr float, float* %pDst, i64 %index
  %1 = bitcast float* %next.gep to <8 x float>*
  %wide.vec = load <8 x float>, <8 x float>* %1, align 4
  ; Even lanes of the squared wide vector.
  %2 = fmul fast <8 x float> %wide.vec, %wide.vec
  %3 = shufflevector <8 x float> %2, <8 x float> undef, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
  ; Odd lanes of the squared wide vector (same multiply, repeated).
  %4 = fmul fast <8 x float> %wide.vec, %wide.vec
  %5 = shufflevector <8 x float> %4, <8 x float> undef, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
  %6 = fadd fast <4 x float> %5, %3
  %7 = bitcast float* %next.gep19 to <4 x float>*
  store <4 x float> %6, <4 x float>* %7, align 4
  %index.next = add i64 %index, 4
  %8 = icmp eq i64 %index.next, 1024
  br i1 %8, label %while.end, label %vector.body

while.end:                                        ; preds = %vector.body
  ret void
}

; vld3: factor-3 de-interleaving load feeding a sum of squares.
;
; Each iteration loads 12 consecutive floats starting at pSrc[3*index],
; squares the wide vector lane-wise, and sums the three stride-3 lane groups
; (0,3,6,9), (1,4,7,10) and (2,5,8,11). For each of the 4 results k = 0..3:
;   pDst[index+k] = pSrc[3*(index+k)]^2 + pSrc[3*(index+k)+1]^2
;                 + pSrc[3*(index+k)+2]^2
; %index advances by 4 until it equals 1024; %numSamples is not used.
;
; The multiply is written on the wide <12 x float> value, once per shuffle.
; The autogenerated assertions expect a single post-incremented `ld3` followed
; by one 4-lane fmul and two fmla instructions.
define void @vld3(float* nocapture readonly %pSrc, float* noalias nocapture %pDst, i32 %numSamples) {
; CHECK-LABEL: vld3:
; CHECK:       // %bb.0: // %entry
; CHECK-NEXT:    mov x8, xzr
; CHECK-NEXT:  .LBB1_1: // %vector.body
; CHECK-NEXT:    // =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    ld3 { v0.4s, v1.4s, v2.4s }, [x0], #48
; CHECK-NEXT:    fmul v3.4s, v0.4s, v0.4s
; CHECK-NEXT:    fmla v3.4s, v1.4s, v1.4s
; CHECK-NEXT:    fmla v3.4s, v2.4s, v2.4s
; CHECK-NEXT:    str q3, [x1, x8]
; CHECK-NEXT:    add x8, x8, #16 // =16
; CHECK-NEXT:    cmp x8, #1, lsl #12 // =4096
; CHECK-NEXT:    b.ne .LBB1_1
; CHECK-NEXT:  // %bb.2: // %while.end
; CHECK-NEXT:    ret
entry:
  br label %vector.body

vector.body:                                      ; preds = %vector.body, %entry
  %index = phi i64 [ 0, %entry ], [ %index.next, %vector.body ]
  ; Source advances three floats per output element.
  %0 = mul i64 %index, 3
  %next.gep = getelementptr float, float* %pSrc, i64 %0
  %next.gep23 = getelementptr float, float* %pDst, i64 %index
  %1 = bitcast float* %next.gep to <12 x float>*
  %wide.vec = load <12 x float>, <12 x float>* %1, align 4
  ; First member of each group of three.
  %2 = fmul fast <12 x float> %wide.vec, %wide.vec
  %3 = shufflevector <12 x float> %2, <12 x float> undef, <4 x i32> <i32 0, i32 3, i32 6, i32 9>
  ; Second member of each group of three.
  %4 = fmul fast <12 x float> %wide.vec, %wide.vec
  %5 = shufflevector <12 x float> %4, <12 x float> undef, <4 x i32> <i32 1, i32 4, i32 7, i32 10>
  %6 = fadd fast <4 x float> %5, %3
  ; Third member of each group of three.
  %7 = fmul fast <12 x float> %wide.vec, %wide.vec
  %8 = shufflevector <12 x float> %7, <12 x float> undef, <4 x i32> <i32 2, i32 5, i32 8, i32 11>
  %9 = fadd fast <4 x float> %6, %8
  %10 = bitcast float* %next.gep23 to <4 x float>*
  store <4 x float> %9, <4 x float>* %10, align 4
  %index.next = add i64 %index, 4
  %11 = icmp eq i64 %index.next, 1024
  br i1 %11, label %while.end, label %vector.body

while.end:                                        ; preds = %vector.body
  ret void
}

; vld4: factor-4 de-interleaving load combined with a factor-2 interleaved
; store.
;
; Each iteration loads 16 consecutive floats starting at pSrc[4*index] and
; squares the wide vector lane-wise. Lane groups (0,4,8,12) and (1,5,9,13) are
; summed into one <4 x float>, groups (2,6,10,14) and (3,7,11,15) into another.
; The two results are interleaved element by element into an <8 x float> that
; is stored at pDst[2*index]. For k = 0..3, with s = pSrc + 4*(index+k):
;   pDst[2*(index+k)]     = s[0]^2 + s[1]^2
;   pDst[2*(index+k) + 1] = s[2]^2 + s[3]^2
; %index advances by 4 until it equals 1024; %numSamples is not used.
;
; The multiply is written on the wide <16 x float> value, once per shuffle.
; The autogenerated assertions expect a single post-incremented `ld4`, two
; 4-lane fmul/fmla pairs, and an `st2` for the interleaved store.
define void @vld4(float* nocapture readonly %pSrc, float* noalias nocapture %pDst, i32 %numSamples) {
; CHECK-LABEL: vld4:
; CHECK:       // %bb.0: // %entry
; CHECK-NEXT:    mov x8, xzr
; CHECK-NEXT:  .LBB2_1: // %vector.body
; CHECK-NEXT:    // =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    ld4 { v0.4s, v1.4s, v2.4s, v3.4s }, [x0], #64
; CHECK-NEXT:    add x9, x1, x8
; CHECK-NEXT:    add x8, x8, #32 // =32
; CHECK-NEXT:    cmp x8, #2, lsl #12 // =8192
; CHECK-NEXT:    fmul v4.4s, v0.4s, v0.4s
; CHECK-NEXT:    fmla v4.4s, v1.4s, v1.4s
; CHECK-NEXT:    fmul v5.4s, v2.4s, v2.4s
; CHECK-NEXT:    fmla v5.4s, v3.4s, v3.4s
; CHECK-NEXT:    st2 { v4.4s, v5.4s }, [x9]
; CHECK-NEXT:    b.ne .LBB2_1
; CHECK-NEXT:  // %bb.2: // %while.end
; CHECK-NEXT:    ret
entry:
  br label %vector.body

vector.body:                                      ; preds = %vector.body, %entry
  %index = phi i64 [ 0, %entry ], [ %index.next, %vector.body ]
  ; Source advances four floats per index step.
  %0 = shl i64 %index, 2
  %next.gep = getelementptr float, float* %pSrc, i64 %0
  ; Destination advances two floats per index step.
  %1 = shl i64 %index, 1
  %2 = bitcast float* %next.gep to <16 x float>*
  %wide.vec = load <16 x float>, <16 x float>* %2, align 4
  ; First pair of stride-4 lane groups: lanes 4k and 4k+1.
  %3 = fmul fast <16 x float> %wide.vec, %wide.vec
  %4 = shufflevector <16 x float> %3, <16 x float> undef, <4 x i32> <i32 0, i32 4, i32 8, i32 12>
  %5 = fmul fast <16 x float> %wide.vec, %wide.vec
  %6 = shufflevector <16 x float> %5, <16 x float> undef, <4 x i32> <i32 1, i32 5, i32 9, i32 13>
  %7 = fadd fast <4 x float> %6, %4
  ; Second pair of stride-4 lane groups: lanes 4k+2 and 4k+3.
  %8 = fmul fast <16 x float> %wide.vec, %wide.vec
  %9 = shufflevector <16 x float> %8, <16 x float> undef, <4 x i32> <i32 2, i32 6, i32 10, i32 14>
  %10 = fmul fast <16 x float> %wide.vec, %wide.vec
  %11 = shufflevector <16 x float> %10, <16 x float> undef, <4 x i32> <i32 3, i32 7, i32 11, i32 15>
  %12 = fadd fast <4 x float> %11, %9
  %13 = getelementptr inbounds float, float* %pDst, i64 %1
  %14 = bitcast float* %13 to <8 x float>*
  ; Interleave the two partial sums: %7[0], %12[0], %7[1], %12[1], ...
  %interleaved.vec = shufflevector <4 x float> %7, <4 x float> %12, <8 x i32> <i32 0, i32 4, i32 1, i32 5, i32 2, i32 6, i32 3, i32 7>
  store <8 x float> %interleaved.vec, <8 x float>* %14, align 4
  %index.next = add i64 %index, 4
  %15 = icmp eq i64 %index.next, 1024
  br i1 %15, label %while.end, label %vector.body

while.end:                                        ; preds = %vector.body
  ret void
}

; twosrc: two factor-2 de-interleaving loads feeding one multiply-accumulate.
;
; Each iteration loads 8 consecutive floats from pSrc[2*index] and 8 from
; pSrc2[2*index], multiplies the two wide vectors lane-wise, then adds the even
; lanes (0,2,4,6) of the product to its odd lanes (1,3,5,7). For each of the
; 4 results k = 0..3, with a = pSrc + 2*(index+k) and b = pSrc2 + 2*(index+k):
;   pDst[index+k] = a[0]*b[0] + a[1]*b[1]
; %index advances by 4 until it equals 1024; %numSamples is not used.
;
; The multiply is written on the wide <8 x float> values, once per shuffle.
; The autogenerated assertions expect one `ld2` per source followed by a
; single 4-lane fmul/fmla pair on the de-interleaved registers.
define void @twosrc(float* nocapture readonly %pSrc, float* nocapture readonly %pSrc2, float* noalias nocapture %pDst, i32 %numSamples) {
; CHECK-LABEL: twosrc:
; CHECK:       // %bb.0: // %entry
; CHECK-NEXT:    mov x8, xzr
; CHECK-NEXT:  .LBB3_1: // %vector.body
; CHECK-NEXT:    // =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    add x9, x0, x8
; CHECK-NEXT:    add x10, x1, x8
; CHECK-NEXT:    ld2 { v0.4s, v1.4s }, [x9]
; CHECK-NEXT:    ld2 { v2.4s, v3.4s }, [x10]
; CHECK-NEXT:    add x8, x8, #32 // =32
; CHECK-NEXT:    cmp x8, #2, lsl #12 // =8192
; CHECK-NEXT:    fmul v4.4s, v2.4s, v0.4s
; CHECK-NEXT:    fmla v4.4s, v1.4s, v3.4s
; CHECK-NEXT:    str q4, [x2], #16
; CHECK-NEXT:    b.ne .LBB3_1
; CHECK-NEXT:  // %bb.2: // %while.end
; CHECK-NEXT:    ret
entry:
  br label %vector.body

vector.body:                                      ; preds = %vector.body, %entry
  %index = phi i64 [ 0, %entry ], [ %index.next, %vector.body ]
  ; Both sources advance two floats per output element; the offset is
  ; computed separately for each pointer.
  %0 = shl i64 %index, 1
  %next.gep = getelementptr float, float* %pSrc, i64 %0
  %1 = shl i64 %index, 1
  %next.gep23 = getelementptr float, float* %pSrc2, i64 %1
  %next.gep24 = getelementptr float, float* %pDst, i64 %index
  %2 = bitcast float* %next.gep to <8 x float>*
  %wide.vec = load <8 x float>, <8 x float>* %2, align 4
  %3 = bitcast float* %next.gep23 to <8 x float>*
  %wide.vec26 = load <8 x float>, <8 x float>* %3, align 4
  ; Even lanes of the lane-wise product.
  %4 = fmul fast <8 x float> %wide.vec26, %wide.vec
  %5 = shufflevector <8 x float> %4, <8 x float> undef, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
  ; Odd lanes of the lane-wise product (same multiply, repeated).
  %6 = fmul fast <8 x float> %wide.vec26, %wide.vec
  %7 = shufflevector <8 x float> %6, <8 x float> undef, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
  %8 = fadd fast <4 x float> %7, %5
  %9 = bitcast float* %next.gep24 to <4 x float>*
  store <4 x float> %8, <4 x float>* %9, align 4
  %index.next = add i64 %index, 4
  %10 = icmp eq i64 %index.next, 1024
  br i1 %10, label %while.end, label %vector.body

while.end:                                        ; preds = %vector.body
  ret void
}
