1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2; RUN: llc -mtriple=thumbv8.1m.main-none-none-eabi -mattr=+mve.fp -verify-machineinstrs -tail-predication=enabled %s -o - | FileCheck %s
3
; fmas1: each vector.body iteration stores llvm.fma(x, y, splat(%a)) for four
; floats to %z. Both loads and the store are masked by
; llvm.get.active.lane.mask(%index, %n). The expected output below is a
; dlstp.32/letp loop containing one vfmas.f32 with r12 as its scalar operand.
4define arm_aapcs_vfpcc void @fmas1(float* nocapture readonly %x, float* nocapture readonly %y, float* noalias nocapture %z, float %a, i32 %n) {
5; CHECK-LABEL: fmas1:
6; CHECK:       @ %bb.0: @ %entry
7; CHECK-NEXT:    .save {r4, lr}
8; CHECK-NEXT:    push {r4, lr}
9; CHECK-NEXT:    cmp r3, #1
10; CHECK-NEXT:    it lt
11; CHECK-NEXT:    poplt {r4, pc}
12; CHECK-NEXT:  .LBB0_1: @ %vector.ph
13; CHECK-NEXT:    vmov r12, s0
14; CHECK-NEXT:    movs r4, #0
15; CHECK-NEXT:    dlstp.32 lr, r3
16; CHECK-NEXT:  .LBB0_2: @ %vector.body
17; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
18; CHECK-NEXT:    adds r4, #4
19; CHECK-NEXT:    vldrw.u32 q0, [r1], #16
20; CHECK-NEXT:    vldrw.u32 q1, [r0], #16
21; CHECK-NEXT:    vfmas.f32 q1, q0, r12
22; CHECK-NEXT:    vstrw.32 q1, [r2], #16
23; CHECK-NEXT:    letp lr, .LBB0_2
24; CHECK-NEXT:  @ %bb.3: @ %for.cond.cleanup
25; CHECK-NEXT:    pop {r4, pc}
26entry:
  ; Skip the loop entirely when %n <= 0.
27  %cmp8 = icmp sgt i32 %n, 0
28  br i1 %cmp8, label %vector.ph, label %for.cond.cleanup
29
30vector.ph:                                        ; preds = %entry
  ; %n.vec is %n rounded up to a multiple of 4; it is the loop exit bound.
31  %n.rnd.up = add i32 %n, 3
32  %n.vec = and i32 %n.rnd.up, -4
33  %trip.count.minus.1 = add i32 %n, -1
34  %broadcast.splatinsert10 = insertelement <4 x i32> undef, i32 %trip.count.minus.1, i32 0
35  %broadcast.splat11 = shufflevector <4 x i32> %broadcast.splatinsert10, <4 x i32> undef, <4 x i32> zeroinitializer
  ; Splat of %a, used as the fma addend in the loop.
36  %broadcast.splatinsert13 = insertelement <4 x float> undef, float %a, i32 0
37  %broadcast.splat14 = shufflevector <4 x float> %broadcast.splatinsert13, <4 x float> undef, <4 x i32> zeroinitializer
38  br label %vector.body
39
40vector.body:                                      ; preds = %vector.body, %vector.ph
41  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
42  %broadcast.splatinsert = insertelement <4 x i32> undef, i32 %index, i32 0
43  %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer
44  %induction = or <4 x i32> %broadcast.splat, <i32 0, i32 1, i32 2, i32 3>
45  %0 = getelementptr inbounds float, float* %x, i32 %index
46
  ; Lane predicate. %induction and %broadcast.splat11 are referenced only by
  ; the commented-out compare; the intrinsic call is what defines %1.
47  ; %1 = icmp ule <4 x i32> %induction, %broadcast.splat11
48  %1 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %n)
49
50  %2 = bitcast float* %0 to <4 x float>*
51  %wide.masked.load = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %2, i32 4, <4 x i1> %1, <4 x float> undef)
52  %3 = getelementptr inbounds float, float* %y, i32 %index
53  %4 = bitcast float* %3 to <4 x float>*
54  %wide.masked.load12 = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %4, i32 4, <4 x i1> %1, <4 x float> undef)
  ; x * y + splat(%a), expressed as a single fma intrinsic call.
55  %5 = call fast <4 x float> @llvm.fma.v4f32(<4 x float> %wide.masked.load, <4 x float> %wide.masked.load12, <4 x float> %broadcast.splat14)
56  %6 = getelementptr inbounds float, float* %z, i32 %index
57  %7 = bitcast float* %6 to <4 x float>*
58  call void @llvm.masked.store.v4f32.p0v4f32(<4 x float> %5, <4 x float>* %7, i32 4, <4 x i1> %1)
  ; Step four elements per iteration and leave once %n.vec is reached.
59  %index.next = add i32 %index, 4
60  %8 = icmp eq i32 %index.next, %n.vec
61  br i1 %8, label %for.cond.cleanup, label %vector.body
62
63for.cond.cleanup:                                 ; preds = %vector.body, %entry
64  ret void
65}
66
; fmas2: each vector.body iteration stores (y * x) + splat(%a) for four floats
; to %z, written as separate fmul fast / fadd fast instructions. Loads and the
; store are masked by llvm.get.active.lane.mask(%index, %n). The expected
; output below is a dlstp.32/letp loop containing one vfmas.f32 with r12 as its
; scalar operand.
67define arm_aapcs_vfpcc void @fmas2(float* nocapture readonly %x, float* nocapture readonly %y, float* noalias nocapture %z, float %a, i32 %n) {
68; CHECK-LABEL: fmas2:
69; CHECK:       @ %bb.0: @ %entry
70; CHECK-NEXT:    .save {r4, lr}
71; CHECK-NEXT:    push {r4, lr}
72; CHECK-NEXT:    cmp r3, #1
73; CHECK-NEXT:    it lt
74; CHECK-NEXT:    poplt {r4, pc}
75; CHECK-NEXT:  .LBB1_1: @ %vector.ph
76; CHECK-NEXT:    vmov r12, s0
77; CHECK-NEXT:    movs r4, #0
78; CHECK-NEXT:    dlstp.32 lr, r3
79; CHECK-NEXT:  .LBB1_2: @ %vector.body
80; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
81; CHECK-NEXT:    adds r4, #4
82; CHECK-NEXT:    vldrw.u32 q0, [r0], #16
83; CHECK-NEXT:    vldrw.u32 q1, [r1], #16
84; CHECK-NEXT:    vfmas.f32 q1, q0, r12
85; CHECK-NEXT:    vstrw.32 q1, [r2], #16
86; CHECK-NEXT:    letp lr, .LBB1_2
87; CHECK-NEXT:  @ %bb.3: @ %for.cond.cleanup
88; CHECK-NEXT:    pop {r4, pc}
89entry:
  ; Skip the loop entirely when %n <= 0.
90  %cmp8 = icmp sgt i32 %n, 0
91  br i1 %cmp8, label %vector.ph, label %for.cond.cleanup
92
93vector.ph:                                        ; preds = %entry
  ; %n.vec is %n rounded up to a multiple of 4; it is the loop exit bound.
94  %n.rnd.up = add i32 %n, 3
95  %n.vec = and i32 %n.rnd.up, -4
96  %trip.count.minus.1 = add i32 %n, -1
97  %broadcast.splatinsert10 = insertelement <4 x i32> undef, i32 %trip.count.minus.1, i32 0
98  %broadcast.splat11 = shufflevector <4 x i32> %broadcast.splatinsert10, <4 x i32> undef, <4 x i32> zeroinitializer
  ; Splat of %a, used as the addend in the loop.
99  %broadcast.splatinsert13 = insertelement <4 x float> undef, float %a, i32 0
100  %broadcast.splat14 = shufflevector <4 x float> %broadcast.splatinsert13, <4 x float> undef, <4 x i32> zeroinitializer
101  br label %vector.body
102
103vector.body:                                      ; preds = %vector.body, %vector.ph
104  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
105  %broadcast.splatinsert = insertelement <4 x i32> undef, i32 %index, i32 0
106  %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer
107  %induction = or <4 x i32> %broadcast.splat, <i32 0, i32 1, i32 2, i32 3>
108  %0 = getelementptr inbounds float, float* %x, i32 %index
109
  ; Lane predicate. %induction and %broadcast.splat11 are referenced only by
  ; the commented-out compare; the intrinsic call is what defines %1.
110  ; %1 = icmp ule <4 x i32> %induction, %broadcast.splat11
111  %1 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %n)
112
113  %2 = bitcast float* %0 to <4 x float>*
114  %wide.masked.load = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %2, i32 4, <4 x i1> %1, <4 x float> undef)
115  %3 = getelementptr inbounds float, float* %y, i32 %index
116  %4 = bitcast float* %3 to <4 x float>*
117  %wide.masked.load12 = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %4, i32 4, <4 x i1> %1, <4 x float> undef)
  ; Unfused form: multiply, then add the splat of %a.
118  %5 = fmul fast <4 x float> %wide.masked.load12, %wide.masked.load
119  %6 = fadd fast <4 x float> %5, %broadcast.splat14
120  %7 = getelementptr inbounds float, float* %z, i32 %index
121  %8 = bitcast float* %7 to <4 x float>*
122  call void @llvm.masked.store.v4f32.p0v4f32(<4 x float> %6, <4 x float>* %8, i32 4, <4 x i1> %1)
  ; Step four elements per iteration and leave once %n.vec is reached.
123  %index.next = add i32 %index, 4
124  %9 = icmp eq i32 %index.next, %n.vec
125  br i1 %9, label %for.cond.cleanup, label %vector.body
126
127for.cond.cleanup:                                 ; preds = %vector.body, %entry
128  ret void
129}
130
; fma1: each vector.body iteration stores llvm.fma(x, splat(%a), y) for four
; floats to %z, i.e. the splat is a multiplicand and the %y load is the addend.
; Loads and the store are masked by llvm.get.active.lane.mask(%index, %n).
; The expected output below is a dlstp.32/letp loop containing one vfma.f32
; with r12 as its scalar operand.
131define arm_aapcs_vfpcc void @fma1(float* nocapture readonly %x, float* nocapture readonly %y, float* noalias nocapture %z, float %a, i32 %n) {
132; CHECK-LABEL: fma1:
133; CHECK:       @ %bb.0: @ %entry
134; CHECK-NEXT:    .save {r4, lr}
135; CHECK-NEXT:    push {r4, lr}
136; CHECK-NEXT:    cmp r3, #1
137; CHECK-NEXT:    it lt
138; CHECK-NEXT:    poplt {r4, pc}
139; CHECK-NEXT:  .LBB2_1: @ %vector.ph
140; CHECK-NEXT:    vmov r12, s0
141; CHECK-NEXT:    movs r4, #0
142; CHECK-NEXT:    dlstp.32 lr, r3
143; CHECK-NEXT:  .LBB2_2: @ %vector.body
144; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
145; CHECK-NEXT:    adds r4, #4
146; CHECK-NEXT:    vldrw.u32 q0, [r0], #16
147; CHECK-NEXT:    vldrw.u32 q1, [r1], #16
148; CHECK-NEXT:    vfma.f32 q1, q0, r12
149; CHECK-NEXT:    vstrw.32 q1, [r2], #16
150; CHECK-NEXT:    letp lr, .LBB2_2
151; CHECK-NEXT:  @ %bb.3: @ %for.cond.cleanup
152; CHECK-NEXT:    pop {r4, pc}
153entry:
  ; Skip the loop entirely when %n <= 0.
154  %cmp8 = icmp sgt i32 %n, 0
155  br i1 %cmp8, label %vector.ph, label %for.cond.cleanup
156
157vector.ph:                                        ; preds = %entry
  ; %n.vec is %n rounded up to a multiple of 4; it is the loop exit bound.
158  %n.rnd.up = add i32 %n, 3
159  %n.vec = and i32 %n.rnd.up, -4
160  %trip.count.minus.1 = add i32 %n, -1
161  %broadcast.splatinsert10 = insertelement <4 x i32> undef, i32 %trip.count.minus.1, i32 0
162  %broadcast.splat11 = shufflevector <4 x i32> %broadcast.splatinsert10, <4 x i32> undef, <4 x i32> zeroinitializer
  ; Splat of %a, used as a multiplicand in the loop.
163  %broadcast.splatinsert13 = insertelement <4 x float> undef, float %a, i32 0
164  %broadcast.splat14 = shufflevector <4 x float> %broadcast.splatinsert13, <4 x float> undef, <4 x i32> zeroinitializer
165  br label %vector.body
166
167vector.body:                                      ; preds = %vector.body, %vector.ph
168  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
169  %broadcast.splatinsert = insertelement <4 x i32> undef, i32 %index, i32 0
170  %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer
171  %induction = or <4 x i32> %broadcast.splat, <i32 0, i32 1, i32 2, i32 3>
172  %0 = getelementptr inbounds float, float* %x, i32 %index
173
  ; Lane predicate. %induction and %broadcast.splat11 are referenced only by
  ; the commented-out compare; the intrinsic call is what defines %1.
174  ; %1 = icmp ule <4 x i32> %induction, %broadcast.splat11
175  %1 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %n)
176
177  %2 = bitcast float* %0 to <4 x float>*
178  %wide.masked.load = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %2, i32 4, <4 x i1> %1, <4 x float> undef)
179  %3 = getelementptr inbounds float, float* %y, i32 %index
180  %4 = bitcast float* %3 to <4 x float>*
181  %wide.masked.load12 = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %4, i32 4, <4 x i1> %1, <4 x float> undef)
  ; x * splat(%a) + y, expressed as a single fma intrinsic call.
182  %5 = call fast <4 x float> @llvm.fma.v4f32(<4 x float> %wide.masked.load, <4 x float> %broadcast.splat14, <4 x float> %wide.masked.load12)
183  %6 = getelementptr inbounds float, float* %z, i32 %index
184  %7 = bitcast float* %6 to <4 x float>*
185  call void @llvm.masked.store.v4f32.p0v4f32(<4 x float> %5, <4 x float>* %7, i32 4, <4 x i1> %1)
  ; Step four elements per iteration and leave once %n.vec is reached.
186  %index.next = add i32 %index, 4
187  %8 = icmp eq i32 %index.next, %n.vec
188  br i1 %8, label %for.cond.cleanup, label %vector.body
189
190for.cond.cleanup:                                 ; preds = %vector.body, %entry
191  ret void
192}
193
; fma2: each vector.body iteration stores (x * splat(%a)) + y for four floats
; to %z, written as separate fmul fast / fadd fast instructions with the %y
; load placed between them. Loads and the store are masked by
; llvm.get.active.lane.mask(%index, %n). The expected output below is a
; dlstp.32/letp loop containing one vfma.f32 with r12 as its scalar operand.
194define arm_aapcs_vfpcc void @fma2(float* nocapture readonly %x, float* nocapture readonly %y, float* noalias nocapture %z, float %a, i32 %n) {
195; CHECK-LABEL: fma2:
196; CHECK:       @ %bb.0: @ %entry
197; CHECK-NEXT:    .save {r4, lr}
198; CHECK-NEXT:    push {r4, lr}
199; CHECK-NEXT:    cmp r3, #1
200; CHECK-NEXT:    it lt
201; CHECK-NEXT:    poplt {r4, pc}
202; CHECK-NEXT:  .LBB3_1: @ %vector.ph
203; CHECK-NEXT:    vmov r12, s0
204; CHECK-NEXT:    movs r4, #0
205; CHECK-NEXT:    dlstp.32 lr, r3
206; CHECK-NEXT:  .LBB3_2: @ %vector.body
207; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
208; CHECK-NEXT:    adds r4, #4
209; CHECK-NEXT:    vldrw.u32 q0, [r0], #16
210; CHECK-NEXT:    vldrw.u32 q1, [r1], #16
211; CHECK-NEXT:    vfma.f32 q1, q0, r12
212; CHECK-NEXT:    vstrw.32 q1, [r2], #16
213; CHECK-NEXT:    letp lr, .LBB3_2
214; CHECK-NEXT:  @ %bb.3: @ %for.cond.cleanup
215; CHECK-NEXT:    pop {r4, pc}
216entry:
  ; Skip the loop entirely when %n <= 0.
217  %cmp8 = icmp sgt i32 %n, 0
218  br i1 %cmp8, label %vector.ph, label %for.cond.cleanup
219
220vector.ph:                                        ; preds = %entry
  ; %n.vec is %n rounded up to a multiple of 4; it is the loop exit bound.
221  %n.rnd.up = add i32 %n, 3
222  %n.vec = and i32 %n.rnd.up, -4
223  %trip.count.minus.1 = add i32 %n, -1
224  %broadcast.splatinsert10 = insertelement <4 x i32> undef, i32 %trip.count.minus.1, i32 0
225  %broadcast.splat11 = shufflevector <4 x i32> %broadcast.splatinsert10, <4 x i32> undef, <4 x i32> zeroinitializer
  ; Splat of %a, used as a multiplicand in the loop.
226  %broadcast.splatinsert12 = insertelement <4 x float> undef, float %a, i32 0
227  %broadcast.splat13 = shufflevector <4 x float> %broadcast.splatinsert12, <4 x float> undef, <4 x i32> zeroinitializer
228  br label %vector.body
229
230vector.body:                                      ; preds = %vector.body, %vector.ph
231  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
232  %broadcast.splatinsert = insertelement <4 x i32> undef, i32 %index, i32 0
233  %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer
234  %induction = or <4 x i32> %broadcast.splat, <i32 0, i32 1, i32 2, i32 3>
235  %0 = getelementptr inbounds float, float* %x, i32 %index
236
  ; Lane predicate. %induction and %broadcast.splat11 are referenced only by
  ; the commented-out compare; the intrinsic call is what defines %1.
237  ; %1 = icmp ule <4 x i32> %induction, %broadcast.splat11
238  %1 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %n)
239
240  %2 = bitcast float* %0 to <4 x float>*
241  %wide.masked.load = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %2, i32 4, <4 x i1> %1, <4 x float> undef)
  ; Unfused form: the multiply by the splat comes before the %y load, the add after it.
242  %3 = fmul fast <4 x float> %wide.masked.load, %broadcast.splat13
243  %4 = getelementptr inbounds float, float* %y, i32 %index
244  %5 = bitcast float* %4 to <4 x float>*
245  %wide.masked.load14 = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %5, i32 4, <4 x i1> %1, <4 x float> undef)
246  %6 = fadd fast <4 x float> %3, %wide.masked.load14
247  %7 = getelementptr inbounds float, float* %z, i32 %index
248  %8 = bitcast float* %7 to <4 x float>*
249  call void @llvm.masked.store.v4f32.p0v4f32(<4 x float> %6, <4 x float>* %8, i32 4, <4 x i1> %1)
  ; Step four elements per iteration and leave once %n.vec is reached.
250  %index.next = add i32 %index, 4
251  %9 = icmp eq i32 %index.next, %n.vec
252  br i1 %9, label %for.cond.cleanup, label %vector.body
253
254for.cond.cleanup:                                 ; preds = %vector.body, %entry
255  ret void
256}
257
; fmss1: %a is negated as a scalar in vector.ph, splatted, and used as the fma
; addend, so each iteration stores llvm.fma(x, y, splat(-%a)) for four floats
; to %z. Loads and the store are masked by
; llvm.get.active.lane.mask(%index, %n). The expected output below has an eor
; of r12 with #-2147483648 ahead of the loop and one vfmas.f32 with r12 as its
; scalar operand inside the dlstp.32/letp loop.
258define arm_aapcs_vfpcc void @fmss1(float* nocapture readonly %x, float* nocapture readonly %y, float* noalias nocapture %z, float %a, i32 %n) {
259; CHECK-LABEL: fmss1:
260; CHECK:       @ %bb.0: @ %entry
261; CHECK-NEXT:    .save {r4, lr}
262; CHECK-NEXT:    push {r4, lr}
263; CHECK-NEXT:    cmp r3, #1
264; CHECK-NEXT:    it lt
265; CHECK-NEXT:    poplt {r4, pc}
266; CHECK-NEXT:  .LBB4_1: @ %vector.ph
267; CHECK-NEXT:    vmov r12, s0
268; CHECK-NEXT:    movs r4, #0
269; CHECK-NEXT:    dlstp.32 lr, r3
270; CHECK-NEXT:    eor r12, r12, #-2147483648
271; CHECK-NEXT:  .LBB4_2: @ %vector.body
272; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
273; CHECK-NEXT:    adds r4, #4
274; CHECK-NEXT:    vldrw.u32 q0, [r1], #16
275; CHECK-NEXT:    vldrw.u32 q1, [r0], #16
276; CHECK-NEXT:    vfmas.f32 q1, q0, r12
277; CHECK-NEXT:    vstrw.32 q1, [r2], #16
278; CHECK-NEXT:    letp lr, .LBB4_2
279; CHECK-NEXT:  @ %bb.3: @ %for.cond.cleanup
280; CHECK-NEXT:    pop {r4, pc}
281entry:
  ; Skip the loop entirely when %n <= 0.
282  %cmp8 = icmp sgt i32 %n, 0
283  br i1 %cmp8, label %vector.ph, label %for.cond.cleanup
284
285vector.ph:                                        ; preds = %entry
  ; The negation is applied to the scalar, outside the loop, before the splat.
286  %fneg = fneg fast float %a
  ; %n.vec is %n rounded up to a multiple of 4; it is the loop exit bound.
287  %n.rnd.up = add i32 %n, 3
288  %n.vec = and i32 %n.rnd.up, -4
289  %trip.count.minus.1 = add i32 %n, -1
290  %broadcast.splatinsert10 = insertelement <4 x i32> undef, i32 %trip.count.minus.1, i32 0
291  %broadcast.splat11 = shufflevector <4 x i32> %broadcast.splatinsert10, <4 x i32> undef, <4 x i32> zeroinitializer
  ; Splat of -%a, used as the fma addend in the loop.
292  %broadcast.splatinsert13 = insertelement <4 x float> undef, float %fneg, i32 0
293  %broadcast.splat14 = shufflevector <4 x float> %broadcast.splatinsert13, <4 x float> undef, <4 x i32> zeroinitializer
294  br label %vector.body
295
296vector.body:                                      ; preds = %vector.body, %vector.ph
297  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
298  %broadcast.splatinsert = insertelement <4 x i32> undef, i32 %index, i32 0
299  %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer
300  %induction = or <4 x i32> %broadcast.splat, <i32 0, i32 1, i32 2, i32 3>
301  %0 = getelementptr inbounds float, float* %x, i32 %index
302
  ; Lane predicate. %induction and %broadcast.splat11 are referenced only by
  ; the commented-out compare; the intrinsic call is what defines %1.
303  ; %1 = icmp ule <4 x i32> %induction, %broadcast.splat11
304  %1 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %n)
305
306  %2 = bitcast float* %0 to <4 x float>*
307  %wide.masked.load = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %2, i32 4, <4 x i1> %1, <4 x float> undef)
308  %3 = getelementptr inbounds float, float* %y, i32 %index
309  %4 = bitcast float* %3 to <4 x float>*
310  %wide.masked.load12 = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %4, i32 4, <4 x i1> %1, <4 x float> undef)
  ; x * y + splat(-%a), expressed as a single fma intrinsic call.
311  %5 = call fast <4 x float> @llvm.fma.v4f32(<4 x float> %wide.masked.load, <4 x float> %wide.masked.load12, <4 x float> %broadcast.splat14)
312  %6 = getelementptr inbounds float, float* %z, i32 %index
313  %7 = bitcast float* %6 to <4 x float>*
314  call void @llvm.masked.store.v4f32.p0v4f32(<4 x float> %5, <4 x float>* %7, i32 4, <4 x i1> %1)
  ; Step four elements per iteration and leave once %n.vec is reached.
315  %index.next = add i32 %index, 4
316  %8 = icmp eq i32 %index.next, %n.vec
317  br i1 %8, label %for.cond.cleanup, label %vector.body
318
319for.cond.cleanup:                                 ; preds = %vector.body, %entry
320  ret void
321}
322
; fmss2: each vector.body iteration stores (y * x) - splat(%a) for four floats
; to %z, written as fmul fast followed by fsub fast. Loads and the store are
; masked by llvm.get.active.lane.mask(%index, %n). The expected output below
; builds the splat with vdup.32 and negates it with vneg.f32 ahead of the
; loop, then uses a vmov copy plus the all-vector vfma.f32 (q3, q2, q1) inside
; the dlstp.32/letp loop.
323define arm_aapcs_vfpcc void @fmss2(float* nocapture readonly %x, float* nocapture readonly %y, float* noalias nocapture %z, float %a, i32 %n) {
324; CHECK-LABEL: fmss2:
325; CHECK:       @ %bb.0: @ %entry
326; CHECK-NEXT:    .save {r4, r5, r6, lr}
327; CHECK-NEXT:    push {r4, r5, r6, lr}
328; CHECK-NEXT:    cmp r3, #1
329; CHECK-NEXT:    blt .LBB5_3
330; CHECK-NEXT:  @ %bb.1: @ %vector.ph
331; CHECK-NEXT:    vmov r6, s0
332; CHECK-NEXT:    vdup.32 q0, r6
333; CHECK-NEXT:    mov.w r12, #0
334; CHECK-NEXT:    vneg.f32 q0, q0
335; CHECK-NEXT:    dlstp.32 lr, r3
336; CHECK-NEXT:  .LBB5_2: @ %vector.body
337; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
338; CHECK-NEXT:    add.w r12, r12, #4
339; CHECK-NEXT:    vmov q3, q0
340; CHECK-NEXT:    vldrw.u32 q1, [r0], #16
341; CHECK-NEXT:    vldrw.u32 q2, [r1], #16
342; CHECK-NEXT:    vfma.f32 q3, q2, q1
343; CHECK-NEXT:    vstrw.32 q3, [r2], #16
344; CHECK-NEXT:    letp lr, .LBB5_2
345; CHECK-NEXT:  .LBB5_3: @ %for.cond.cleanup
346; CHECK-NEXT:    pop {r4, r5, r6, pc}
347entry:
  ; Skip the loop entirely when %n <= 0.
348  %cmp8 = icmp sgt i32 %n, 0
349  br i1 %cmp8, label %vector.ph, label %for.cond.cleanup
350
351vector.ph:                                        ; preds = %entry
  ; %n.vec is %n rounded up to a multiple of 4; it is the loop exit bound.
352  %n.rnd.up = add i32 %n, 3
353  %n.vec = and i32 %n.rnd.up, -4
354  %trip.count.minus.1 = add i32 %n, -1
355  %broadcast.splatinsert10 = insertelement <4 x i32> undef, i32 %trip.count.minus.1, i32 0
356  %broadcast.splat11 = shufflevector <4 x i32> %broadcast.splatinsert10, <4 x i32> undef, <4 x i32> zeroinitializer
  ; Splat of %a (not negated in the IR); it is the subtrahend in the loop.
357  %broadcast.splatinsert13 = insertelement <4 x float> undef, float %a, i32 0
358  %broadcast.splat14 = shufflevector <4 x float> %broadcast.splatinsert13, <4 x float> undef, <4 x i32> zeroinitializer
359  br label %vector.body
360
361vector.body:                                      ; preds = %vector.body, %vector.ph
362  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
363  %broadcast.splatinsert = insertelement <4 x i32> undef, i32 %index, i32 0
364  %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer
365  %induction = or <4 x i32> %broadcast.splat, <i32 0, i32 1, i32 2, i32 3>
366  %0 = getelementptr inbounds float, float* %x, i32 %index
367
  ; Lane predicate. %induction and %broadcast.splat11 are referenced only by
  ; the commented-out compare; the intrinsic call is what defines %1.
368  ; %1 = icmp ule <4 x i32> %induction, %broadcast.splat11
369  %1 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %n)
370
371  %2 = bitcast float* %0 to <4 x float>*
372  %wide.masked.load = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %2, i32 4, <4 x i1> %1, <4 x float> undef)
373  %3 = getelementptr inbounds float, float* %y, i32 %index
374  %4 = bitcast float* %3 to <4 x float>*
375  %wide.masked.load12 = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %4, i32 4, <4 x i1> %1, <4 x float> undef)
  ; Unfused form: product minus the splat of %a.
376  %5 = fmul fast <4 x float> %wide.masked.load12, %wide.masked.load
377  %6 = fsub fast <4 x float> %5, %broadcast.splat14
378  %7 = getelementptr inbounds float, float* %z, i32 %index
379  %8 = bitcast float* %7 to <4 x float>*
380  call void @llvm.masked.store.v4f32.p0v4f32(<4 x float> %6, <4 x float>* %8, i32 4, <4 x i1> %1)
  ; Step four elements per iteration and leave once %n.vec is reached.
381  %index.next = add i32 %index, 4
382  %9 = icmp eq i32 %index.next, %n.vec
383  br i1 %9, label %for.cond.cleanup, label %vector.body
384
385for.cond.cleanup:                                 ; preds = %vector.body, %entry
386  ret void
387}
388
; fmss3: the %y load is negated as a vector, then each vector.body iteration
; stores llvm.fma(x, -y, splat(%a)) for four floats to %z. Loads and the store
; are masked by llvm.get.active.lane.mask(%index, %n). The expected output
; below builds the splat with vdup.32 ahead of the loop, then uses a vmov copy
; plus the all-vector vfms.f32 (q3, q2, q1) inside the dlstp.32/letp loop.
389define arm_aapcs_vfpcc void @fmss3(float* nocapture readonly %x, float* nocapture readonly %y, float* noalias nocapture %z, float %a, i32 %n) {
390; CHECK-LABEL: fmss3:
391; CHECK:       @ %bb.0: @ %entry
392; CHECK-NEXT:    .save {r4, lr}
393; CHECK-NEXT:    push {r4, lr}
394; CHECK-NEXT:    cmp r3, #1
395; CHECK-NEXT:    it lt
396; CHECK-NEXT:    poplt {r4, pc}
397; CHECK-NEXT:  .LBB6_1: @ %vector.ph
398; CHECK-NEXT:    vmov r4, s0
399; CHECK-NEXT:    vdup.32 q0, r4
400; CHECK-NEXT:    mov.w r12, #0
401; CHECK-NEXT:    dlstp.32 lr, r3
402; CHECK-NEXT:  .LBB6_2: @ %vector.body
403; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
404; CHECK-NEXT:    add.w r12, r12, #4
405; CHECK-NEXT:    vmov q3, q0
406; CHECK-NEXT:    vldrw.u32 q1, [r0], #16
407; CHECK-NEXT:    vldrw.u32 q2, [r1], #16
408; CHECK-NEXT:    vfms.f32 q3, q2, q1
409; CHECK-NEXT:    vstrw.32 q3, [r2], #16
410; CHECK-NEXT:    letp lr, .LBB6_2
411; CHECK-NEXT:  @ %bb.3: @ %for.cond.cleanup
412; CHECK-NEXT:    pop {r4, pc}
413entry:
  ; Skip the loop entirely when %n <= 0.
414  %cmp8 = icmp sgt i32 %n, 0
415  br i1 %cmp8, label %vector.ph, label %for.cond.cleanup
416
417vector.ph:                                        ; preds = %entry
  ; %n.vec is %n rounded up to a multiple of 4; it is the loop exit bound.
418  %n.rnd.up = add i32 %n, 3
419  %n.vec = and i32 %n.rnd.up, -4
420  %trip.count.minus.1 = add i32 %n, -1
421  %broadcast.splatinsert10 = insertelement <4 x i32> undef, i32 %trip.count.minus.1, i32 0
422  %broadcast.splat11 = shufflevector <4 x i32> %broadcast.splatinsert10, <4 x i32> undef, <4 x i32> zeroinitializer
  ; Splat of %a, used as the fma addend in the loop.
423  %broadcast.splatinsert13 = insertelement <4 x float> undef, float %a, i32 0
424  %broadcast.splat14 = shufflevector <4 x float> %broadcast.splatinsert13, <4 x float> undef, <4 x i32> zeroinitializer
425  br label %vector.body
426
427vector.body:                                      ; preds = %vector.body, %vector.ph
428  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
429  %broadcast.splatinsert = insertelement <4 x i32> undef, i32 %index, i32 0
430  %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer
431  %induction = or <4 x i32> %broadcast.splat, <i32 0, i32 1, i32 2, i32 3>
432  %0 = getelementptr inbounds float, float* %x, i32 %index
433
  ; Lane predicate. %induction and %broadcast.splat11 are referenced only by
  ; the commented-out compare; the intrinsic call is what defines %1.
434  ; %1 = icmp ule <4 x i32> %induction, %broadcast.splat11
435  %1 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %n)
436
437  %2 = bitcast float* %0 to <4 x float>*
438  %wide.masked.load = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %2, i32 4, <4 x i1> %1, <4 x float> undef)
439  %3 = getelementptr inbounds float, float* %y, i32 %index
440  %4 = bitcast float* %3 to <4 x float>*
441  %wide.masked.load12 = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %4, i32 4, <4 x i1> %1, <4 x float> undef)
  ; Negate one multiplicand, then fuse: x * (-y) + splat(%a).
442  %5 = fneg fast <4 x float> %wide.masked.load12
443  %6 = call fast <4 x float> @llvm.fma.v4f32(<4 x float> %wide.masked.load, <4 x float> %5, <4 x float> %broadcast.splat14)
444  %7 = getelementptr inbounds float, float* %z, i32 %index
445  %8 = bitcast float* %7 to <4 x float>*
446  call void @llvm.masked.store.v4f32.p0v4f32(<4 x float> %6, <4 x float>* %8, i32 4, <4 x i1> %1)
  ; Step four elements per iteration and leave once %n.vec is reached.
447  %index.next = add i32 %index, 4
448  %9 = icmp eq i32 %index.next, %n.vec
449  br i1 %9, label %for.cond.cleanup, label %vector.body
450
451for.cond.cleanup:                                 ; preds = %vector.body, %entry
452  ret void
453}
454
; fmss4: each vector.body iteration stores splat(%a) - (y * x) for four floats
; to %z, written as fmul fast followed by fsub fast with the splat as the
; minuend. Loads and the store are masked by
; llvm.get.active.lane.mask(%index, %n). The expected output below builds the
; splat with vdup.32 ahead of the loop, then uses a vmov copy plus the
; all-vector vfms.f32 (q3, q2, q1) inside the dlstp.32/letp loop.
455define arm_aapcs_vfpcc void @fmss4(float* nocapture readonly %x, float* nocapture readonly %y, float* noalias nocapture %z, float %a, i32 %n) {
456; CHECK-LABEL: fmss4:
457; CHECK:       @ %bb.0: @ %entry
458; CHECK-NEXT:    .save {r4, lr}
459; CHECK-NEXT:    push {r4, lr}
460; CHECK-NEXT:    cmp r3, #1
461; CHECK-NEXT:    it lt
462; CHECK-NEXT:    poplt {r4, pc}
463; CHECK-NEXT:  .LBB7_1: @ %vector.ph
464; CHECK-NEXT:    vmov r4, s0
465; CHECK-NEXT:    vdup.32 q0, r4
466; CHECK-NEXT:    mov.w r12, #0
467; CHECK-NEXT:    dlstp.32 lr, r3
468; CHECK-NEXT:  .LBB7_2: @ %vector.body
469; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
470; CHECK-NEXT:    add.w r12, r12, #4
471; CHECK-NEXT:    vmov q3, q0
472; CHECK-NEXT:    vldrw.u32 q1, [r0], #16
473; CHECK-NEXT:    vldrw.u32 q2, [r1], #16
474; CHECK-NEXT:    vfms.f32 q3, q2, q1
475; CHECK-NEXT:    vstrw.32 q3, [r2], #16
476; CHECK-NEXT:    letp lr, .LBB7_2
477; CHECK-NEXT:  @ %bb.3: @ %for.cond.cleanup
478; CHECK-NEXT:    pop {r4, pc}
479entry:
  ; Skip the loop entirely when %n <= 0.
480  %cmp8 = icmp sgt i32 %n, 0
481  br i1 %cmp8, label %vector.ph, label %for.cond.cleanup
482
483vector.ph:                                        ; preds = %entry
  ; %n.vec is %n rounded up to a multiple of 4; it is the loop exit bound.
484  %n.rnd.up = add i32 %n, 3
485  %n.vec = and i32 %n.rnd.up, -4
486  %trip.count.minus.1 = add i32 %n, -1
487  %broadcast.splatinsert10 = insertelement <4 x i32> undef, i32 %trip.count.minus.1, i32 0
488  %broadcast.splat11 = shufflevector <4 x i32> %broadcast.splatinsert10, <4 x i32> undef, <4 x i32> zeroinitializer
  ; Splat of %a; it is the minuend in the loop.
489  %broadcast.splatinsert13 = insertelement <4 x float> undef, float %a, i32 0
490  %broadcast.splat14 = shufflevector <4 x float> %broadcast.splatinsert13, <4 x float> undef, <4 x i32> zeroinitializer
491  br label %vector.body
492
493vector.body:                                      ; preds = %vector.body, %vector.ph
494  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
495  %broadcast.splatinsert = insertelement <4 x i32> undef, i32 %index, i32 0
496  %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer
497  %induction = or <4 x i32> %broadcast.splat, <i32 0, i32 1, i32 2, i32 3>
498  %0 = getelementptr inbounds float, float* %x, i32 %index
499
  ; Lane predicate. %induction and %broadcast.splat11 are referenced only by
  ; the commented-out compare; the intrinsic call is what defines %1.
500  ; %1 = icmp ule <4 x i32> %induction, %broadcast.splat11
501  %1 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %n)
502
503  %2 = bitcast float* %0 to <4 x float>*
504  %wide.masked.load = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %2, i32 4, <4 x i1> %1, <4 x float> undef)
505  %3 = getelementptr inbounds float, float* %y, i32 %index
506  %4 = bitcast float* %3 to <4 x float>*
507  %wide.masked.load12 = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %4, i32 4, <4 x i1> %1, <4 x float> undef)
  ; Unfused form: splat of %a minus the product.
508  %5 = fmul fast <4 x float> %wide.masked.load12, %wide.masked.load
509  %6 = fsub fast <4 x float> %broadcast.splat14, %5
510  %7 = getelementptr inbounds float, float* %z, i32 %index
511  %8 = bitcast float* %7 to <4 x float>*
512  call void @llvm.masked.store.v4f32.p0v4f32(<4 x float> %6, <4 x float>* %8, i32 4, <4 x i1> %1)
  ; Step four elements per iteration and leave once %n.vec is reached.
513  %index.next = add i32 %index, 4
514  %9 = icmp eq i32 %index.next, %n.vec
515  br i1 %9, label %for.cond.cleanup, label %vector.body
516
517for.cond.cleanup:                                 ; preds = %vector.body, %entry
518  ret void
519}
520
; fms1: %a is negated as a scalar in vector.ph, splatted, and used as an fma
; multiplicand, so each iteration stores llvm.fma(x, splat(-%a), y) for four
; floats to %z. Loads and the store are masked by
; llvm.get.active.lane.mask(%index, %n). The expected output below has an eor
; of r12 with #-2147483648 ahead of the loop and one vfma.f32 with r12 as its
; scalar operand inside the dlstp.32/letp loop.
521define arm_aapcs_vfpcc void @fms1(float* nocapture readonly %x, float* nocapture readonly %y, float* noalias nocapture %z, float %a, i32 %n) {
522; CHECK-LABEL: fms1:
523; CHECK:       @ %bb.0: @ %entry
524; CHECK-NEXT:    .save {r4, lr}
525; CHECK-NEXT:    push {r4, lr}
526; CHECK-NEXT:    cmp r3, #1
527; CHECK-NEXT:    it lt
528; CHECK-NEXT:    poplt {r4, pc}
529; CHECK-NEXT:  .LBB8_1: @ %vector.ph
530; CHECK-NEXT:    vmov r12, s0
531; CHECK-NEXT:    movs r4, #0
532; CHECK-NEXT:    dlstp.32 lr, r3
533; CHECK-NEXT:    eor r12, r12, #-2147483648
534; CHECK-NEXT:  .LBB8_2: @ %vector.body
535; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
536; CHECK-NEXT:    adds r4, #4
537; CHECK-NEXT:    vldrw.u32 q0, [r0], #16
538; CHECK-NEXT:    vldrw.u32 q1, [r1], #16
539; CHECK-NEXT:    vfma.f32 q1, q0, r12
540; CHECK-NEXT:    vstrw.32 q1, [r2], #16
541; CHECK-NEXT:    letp lr, .LBB8_2
542; CHECK-NEXT:  @ %bb.3: @ %for.cond.cleanup
543; CHECK-NEXT:    pop {r4, pc}
544entry:
  ; Skip the loop entirely when %n <= 0.
545  %cmp8 = icmp sgt i32 %n, 0
546  br i1 %cmp8, label %vector.ph, label %for.cond.cleanup
547
548vector.ph:                                        ; preds = %entry
  ; The negation is applied to the scalar, outside the loop, before the splat.
549  %fneg = fneg fast float %a
  ; %n.vec is %n rounded up to a multiple of 4; it is the loop exit bound.
550  %n.rnd.up = add i32 %n, 3
551  %n.vec = and i32 %n.rnd.up, -4
552  %trip.count.minus.1 = add i32 %n, -1
553  %broadcast.splatinsert10 = insertelement <4 x i32> undef, i32 %trip.count.minus.1, i32 0
554  %broadcast.splat11 = shufflevector <4 x i32> %broadcast.splatinsert10, <4 x i32> undef, <4 x i32> zeroinitializer
  ; Splat of -%a, used as a multiplicand in the loop.
555  %broadcast.splatinsert13 = insertelement <4 x float> undef, float %fneg, i32 0
556  %broadcast.splat14 = shufflevector <4 x float> %broadcast.splatinsert13, <4 x float> undef, <4 x i32> zeroinitializer
557  br label %vector.body
558
559vector.body:                                      ; preds = %vector.body, %vector.ph
560  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
561  %broadcast.splatinsert = insertelement <4 x i32> undef, i32 %index, i32 0
562  %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer
563  %induction = or <4 x i32> %broadcast.splat, <i32 0, i32 1, i32 2, i32 3>
564  %0 = getelementptr inbounds float, float* %x, i32 %index
565
  ; Lane predicate. %induction and %broadcast.splat11 are referenced only by
  ; the commented-out compare; the intrinsic call is what defines %1.
566  ; %1 = icmp ule <4 x i32> %induction, %broadcast.splat11
567  %1 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %n)
568
569  %2 = bitcast float* %0 to <4 x float>*
570  %wide.masked.load = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %2, i32 4, <4 x i1> %1, <4 x float> undef)
571  %3 = getelementptr inbounds float, float* %y, i32 %index
572  %4 = bitcast float* %3 to <4 x float>*
573  %wide.masked.load12 = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %4, i32 4, <4 x i1> %1, <4 x float> undef)
  ; x * splat(-%a) + y, expressed as a single fma intrinsic call.
574  %5 = call fast <4 x float> @llvm.fma.v4f32(<4 x float> %wide.masked.load, <4 x float> %broadcast.splat14, <4 x float> %wide.masked.load12)
575  %6 = getelementptr inbounds float, float* %z, i32 %index
576  %7 = bitcast float* %6 to <4 x float>*
577  call void @llvm.masked.store.v4f32.p0v4f32(<4 x float> %5, <4 x float>* %7, i32 4, <4 x i1> %1)
  ; Step four elements per iteration and leave once %n.vec is reached.
578  %index.next = add i32 %index, 4
579  %8 = icmp eq i32 %index.next, %n.vec
580  br i1 %8, label %for.cond.cleanup, label %vector.body
581
582for.cond.cleanup:                                 ; preds = %vector.body, %entry
583  ret void
584}
585
; fms2: each vector.body iteration stores y - (x * splat(%a)) for four floats
; to %z, written as fmul fast followed by fsub fast with the %y load as the
; minuend. Loads and the store are masked by
; llvm.get.active.lane.mask(%index, %n). The expected output below builds the
; splat with vdup.32 ahead of the loop and uses the all-vector vfms.f32
; (q2, q1, q0) inside the dlstp.32/letp loop.
586define arm_aapcs_vfpcc void @fms2(float* nocapture readonly %x, float* nocapture readonly %y, float* noalias nocapture %z, float %a, i32 %n) {
587; CHECK-LABEL: fms2:
588; CHECK:       @ %bb.0: @ %entry
589; CHECK-NEXT:    .save {r4, lr}
590; CHECK-NEXT:    push {r4, lr}
591; CHECK-NEXT:    cmp r3, #1
592; CHECK-NEXT:    it lt
593; CHECK-NEXT:    poplt {r4, pc}
594; CHECK-NEXT:  .LBB9_1: @ %vector.ph
595; CHECK-NEXT:    vmov r4, s0
596; CHECK-NEXT:    vdup.32 q0, r4
597; CHECK-NEXT:    mov.w r12, #0
598; CHECK-NEXT:    dlstp.32 lr, r3
599; CHECK-NEXT:  .LBB9_2: @ %vector.body
600; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
601; CHECK-NEXT:    add.w r12, r12, #4
602; CHECK-NEXT:    vldrw.u32 q1, [r0], #16
603; CHECK-NEXT:    vldrw.u32 q2, [r1], #16
604; CHECK-NEXT:    vfms.f32 q2, q1, q0
605; CHECK-NEXT:    vstrw.32 q2, [r2], #16
606; CHECK-NEXT:    letp lr, .LBB9_2
607; CHECK-NEXT:  @ %bb.3: @ %for.cond.cleanup
608; CHECK-NEXT:    pop {r4, pc}
609entry:
  ; Skip the loop entirely when %n <= 0.
610  %cmp8 = icmp sgt i32 %n, 0
611  br i1 %cmp8, label %vector.ph, label %for.cond.cleanup
612
613vector.ph:                                        ; preds = %entry
  ; %n.vec is %n rounded up to a multiple of 4; it is the loop exit bound.
614  %n.rnd.up = add i32 %n, 3
615  %n.vec = and i32 %n.rnd.up, -4
616  %trip.count.minus.1 = add i32 %n, -1
617  %broadcast.splatinsert10 = insertelement <4 x i32> undef, i32 %trip.count.minus.1, i32 0
618  %broadcast.splat11 = shufflevector <4 x i32> %broadcast.splatinsert10, <4 x i32> undef, <4 x i32> zeroinitializer
  ; Splat of %a, used as a multiplicand in the loop.
619  %broadcast.splatinsert13 = insertelement <4 x float> undef, float %a, i32 0
620  %broadcast.splat14 = shufflevector <4 x float> %broadcast.splatinsert13, <4 x float> undef, <4 x i32> zeroinitializer
621  br label %vector.body
622
623vector.body:                                      ; preds = %vector.body, %vector.ph
624  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
625  %broadcast.splatinsert = insertelement <4 x i32> undef, i32 %index, i32 0
626  %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer
627  %induction = or <4 x i32> %broadcast.splat, <i32 0, i32 1, i32 2, i32 3>
628  %0 = getelementptr inbounds float, float* %x, i32 %index
629
  ; Lane predicate. %induction and %broadcast.splat11 are referenced only by
  ; the commented-out compare; the intrinsic call is what defines %1.
630  ; %1 = icmp ule <4 x i32> %induction, %broadcast.splat11
631  %1 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %n)
632
633  %2 = bitcast float* %0 to <4 x float>*
634  %wide.masked.load = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %2, i32 4, <4 x i1> %1, <4 x float> undef)
635  %3 = getelementptr inbounds float, float* %y, i32 %index
636  %4 = bitcast float* %3 to <4 x float>*
637  %wide.masked.load12 = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %4, i32 4, <4 x i1> %1, <4 x float> undef)
  ; Unfused form: the %y load minus (x * splat of %a).
638  %5 = fmul fast <4 x float> %wide.masked.load, %broadcast.splat14
639  %6 = fsub fast <4 x float> %wide.masked.load12, %5
640  %7 = getelementptr inbounds float, float* %z, i32 %index
641  %8 = bitcast float* %7 to <4 x float>*
642  call void @llvm.masked.store.v4f32.p0v4f32(<4 x float> %6, <4 x float>* %8, i32 4, <4 x i1> %1)
  ; Step four elements per iteration and leave once %n.vec is reached.
643  %index.next = add i32 %index, 4
644  %9 = icmp eq i32 %index.next, %n.vec
645  br i1 %9, label %for.cond.cleanup, label %vector.body
646
647for.cond.cleanup:                                 ; preds = %vector.body, %entry
648  ret void
649}
650
; fms3: predicated <4 x float> loop that stores (x * a) - y to z, written as
; llvm.fma(x, splat(a), fneg(y)). The autogenerated assertions below expect a
; dlstp.32/letp loop containing vneg.f32 followed by vfma.f32 with the scalar
; operand in r12 (no vdup in the preheader).
651define arm_aapcs_vfpcc void @fms3(float* nocapture readonly %x, float* nocapture readonly %y, float* noalias nocapture %z, float %a, i32 %n) {
652; CHECK-LABEL: fms3:
653; CHECK:       @ %bb.0: @ %entry
654; CHECK-NEXT:    .save {r4, lr}
655; CHECK-NEXT:    push {r4, lr}
656; CHECK-NEXT:    cmp r3, #1
657; CHECK-NEXT:    it lt
658; CHECK-NEXT:    poplt {r4, pc}
659; CHECK-NEXT:  .LBB10_1: @ %vector.ph
660; CHECK-NEXT:    vmov r12, s0
661; CHECK-NEXT:    movs r4, #0
662; CHECK-NEXT:    dlstp.32 lr, r3
663; CHECK-NEXT:  .LBB10_2: @ %vector.body
664; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
665; CHECK-NEXT:    vldrw.u32 q0, [r0], #16
666; CHECK-NEXT:    vldrw.u32 q1, [r1], #16
667; CHECK-NEXT:    adds r4, #4
668; CHECK-NEXT:    vneg.f32 q1, q1
669; CHECK-NEXT:    vfma.f32 q1, q0, r12
670; CHECK-NEXT:    vstrw.32 q1, [r2], #16
671; CHECK-NEXT:    letp lr, .LBB10_2
672; CHECK-NEXT:  @ %bb.3: @ %for.cond.cleanup
673; CHECK-NEXT:    pop {r4, pc}
674entry:
  ; Skip the loop entirely unless n > 0 (signed compare).
675  %cmp8 = icmp sgt i32 %n, 0
676  br i1 %cmp8, label %vector.ph, label %for.cond.cleanup
677
678vector.ph:                                        ; preds = %entry
  ; %n.vec = (n + 3) & -4; the loop exits when %index.next equals it.
679  %n.rnd.up = add i32 %n, 3
680  %n.vec = and i32 %n.rnd.up, -4
  ; %broadcast.splat11 (splat of n - 1) is only referenced by the
  ; commented-out icmp in the loop body.
681  %trip.count.minus.1 = add i32 %n, -1
682  %broadcast.splatinsert10 = insertelement <4 x i32> undef, i32 %trip.count.minus.1, i32 0
683  %broadcast.splat11 = shufflevector <4 x i32> %broadcast.splatinsert10, <4 x i32> undef, <4 x i32> zeroinitializer
  ; %broadcast.splat14 = %a replicated into all four lanes.
684  %broadcast.splatinsert13 = insertelement <4 x float> undef, float %a, i32 0
685  %broadcast.splat14 = shufflevector <4 x float> %broadcast.splatinsert13, <4 x float> undef, <4 x i32> zeroinitializer
686  br label %vector.body
687
688vector.body:                                      ; preds = %vector.body, %vector.ph
689  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
  ; %induction = <index, index+1, index+2, index+3>; like %broadcast.splat11 it
  ; is only referenced by the commented-out icmp.
690  %broadcast.splatinsert = insertelement <4 x i32> undef, i32 %index, i32 0
691  %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer
692  %induction = or <4 x i32> %broadcast.splat, <i32 0, i32 1, i32 2, i32 3>
693  %0 = getelementptr inbounds float, float* %x, i32 %index
694
695  ; %1 = icmp ule <4 x i32> %induction, %broadcast.splat11
  ; The lane mask %1 comes from the intrinsic on (%index, %n) instead of the
  ; explicit compare kept in the comment above; it predicates both loads and
  ; the store.
696  %1 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %n)
697
698  %2 = bitcast float* %0 to <4 x float>*
699  %wide.masked.load = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %2, i32 4, <4 x i1> %1, <4 x float> undef)
700  %3 = getelementptr inbounds float, float* %y, i32 %index
701  %4 = bitcast float* %3 to <4 x float>*
702  %wide.masked.load12 = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %4, i32 4, <4 x i1> %1, <4 x float> undef)
  ; %6 = fma(x, splat(a), -y), i.e. (x * a) - y with the negation explicit.
703  %5 = fneg fast <4 x float> %wide.masked.load12
704  %6 = call fast <4 x float> @llvm.fma.v4f32(<4 x float> %wide.masked.load, <4 x float> %broadcast.splat14, <4 x float> %5)
705  %7 = getelementptr inbounds float, float* %z, i32 %index
706  %8 = bitcast float* %7 to <4 x float>*
707  call void @llvm.masked.store.v4f32.p0v4f32(<4 x float> %6, <4 x float>* %8, i32 4, <4 x i1> %1)
  ; Advance by one vector (4 elements) and loop until %n.vec is reached.
708  %index.next = add i32 %index, 4
709  %9 = icmp eq i32 %index.next, %n.vec
710  br i1 %9, label %for.cond.cleanup, label %vector.body
711
712for.cond.cleanup:                                 ; preds = %vector.body, %entry
713  ret void
714}
715
; fms4: predicated <4 x float> loop that stores (x * a) - y to z, written as
; 'fmul fast' followed by 'fsub fast' with the product as the minuend. The
; autogenerated assertions below expect a dlstp.32/letp loop containing
; vneg.f32 followed by vfma.f32 with the scalar operand in r12.
716define arm_aapcs_vfpcc void @fms4(float* nocapture readonly %x, float* nocapture readonly %y, float* noalias nocapture %z, float %a, i32 %n) {
717; CHECK-LABEL: fms4:
718; CHECK:       @ %bb.0: @ %entry
719; CHECK-NEXT:    .save {r4, lr}
720; CHECK-NEXT:    push {r4, lr}
721; CHECK-NEXT:    cmp r3, #1
722; CHECK-NEXT:    it lt
723; CHECK-NEXT:    poplt {r4, pc}
724; CHECK-NEXT:  .LBB11_1: @ %vector.ph
725; CHECK-NEXT:    vmov r12, s0
726; CHECK-NEXT:    movs r4, #0
727; CHECK-NEXT:    dlstp.32 lr, r3
728; CHECK-NEXT:  .LBB11_2: @ %vector.body
729; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
730; CHECK-NEXT:    vldrw.u32 q0, [r0], #16
731; CHECK-NEXT:    vldrw.u32 q1, [r1], #16
732; CHECK-NEXT:    adds r4, #4
733; CHECK-NEXT:    vneg.f32 q1, q1
734; CHECK-NEXT:    vfma.f32 q1, q0, r12
735; CHECK-NEXT:    vstrw.32 q1, [r2], #16
736; CHECK-NEXT:    letp lr, .LBB11_2
737; CHECK-NEXT:  @ %bb.3: @ %for.cond.cleanup
738; CHECK-NEXT:    pop {r4, pc}
739entry:
  ; Skip the loop entirely unless n > 0 (signed compare).
740  %cmp8 = icmp sgt i32 %n, 0
741  br i1 %cmp8, label %vector.ph, label %for.cond.cleanup
742
743vector.ph:                                        ; preds = %entry
  ; %n.vec = (n + 3) & -4; the loop exits when %index.next equals it.
744  %n.rnd.up = add i32 %n, 3
745  %n.vec = and i32 %n.rnd.up, -4
  ; %broadcast.splat11 (splat of n - 1) is only referenced by the
  ; commented-out icmp in the loop body.
746  %trip.count.minus.1 = add i32 %n, -1
747  %broadcast.splatinsert10 = insertelement <4 x i32> undef, i32 %trip.count.minus.1, i32 0
748  %broadcast.splat11 = shufflevector <4 x i32> %broadcast.splatinsert10, <4 x i32> undef, <4 x i32> zeroinitializer
  ; %broadcast.splat13 = %a replicated into all four lanes.
749  %broadcast.splatinsert12 = insertelement <4 x float> undef, float %a, i32 0
750  %broadcast.splat13 = shufflevector <4 x float> %broadcast.splatinsert12, <4 x float> undef, <4 x i32> zeroinitializer
751  br label %vector.body
752
753vector.body:                                      ; preds = %vector.body, %vector.ph
754  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
  ; %induction = <index, index+1, index+2, index+3>; like %broadcast.splat11 it
  ; is only referenced by the commented-out icmp.
755  %broadcast.splatinsert = insertelement <4 x i32> undef, i32 %index, i32 0
756  %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer
757  %induction = or <4 x i32> %broadcast.splat, <i32 0, i32 1, i32 2, i32 3>
758  %0 = getelementptr inbounds float, float* %x, i32 %index
759
760  ; %1 = icmp ule <4 x i32> %induction, %broadcast.splat11
  ; The lane mask %1 comes from the intrinsic on (%index, %n) instead of the
  ; explicit compare kept in the comment above; it predicates both loads and
  ; the store.
761  %1 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %n)
762
763  %2 = bitcast float* %0 to <4 x float>*
764  %wide.masked.load = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %2, i32 4, <4 x i1> %1, <4 x float> undef)
  ; %3 = x * a, computed before y is loaded.
765  %3 = fmul fast <4 x float> %wide.masked.load, %broadcast.splat13
766  %4 = getelementptr inbounds float, float* %y, i32 %index
767  %5 = bitcast float* %4 to <4 x float>*
768  %wide.masked.load14 = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %5, i32 4, <4 x i1> %1, <4 x float> undef)
  ; %6 = (x * a) - y
769  %6 = fsub fast <4 x float> %3, %wide.masked.load14
770  %7 = getelementptr inbounds float, float* %z, i32 %index
771  %8 = bitcast float* %7 to <4 x float>*
772  call void @llvm.masked.store.v4f32.p0v4f32(<4 x float> %6, <4 x float>* %8, i32 4, <4 x i1> %1)
  ; Advance by one vector (4 elements) and loop until %n.vec is reached.
773  %index.next = add i32 %index, 4
774  %9 = icmp eq i32 %index.next, %n.vec
775  br i1 %9, label %for.cond.cleanup, label %vector.body
776
777for.cond.cleanup:                                 ; preds = %vector.body, %entry
778  ret void
779}
780
; Intrinsic declarations referenced by the test functions above: predicated
; <4 x float> load/store, the v4f32 fused multiply-add, and the lane-mask
; generator that supplies the <4 x i1> predicate.
781declare <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>*, i32 immarg, <4 x i1>, <4 x float>)
782declare <4 x float> @llvm.fma.v4f32(<4 x float>, <4 x float>, <4 x float>)
783declare void @llvm.masked.store.v4f32.p0v4f32(<4 x float>, <4 x float>*, i32 immarg, <4 x i1>)
784declare <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32, i32)
785