1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2; RUN: llc -mtriple=thumbv8.1m.main -mattr=+mve.fp,+fp-armv8d16sp,+fp16,+fullfp16 -tail-predication=enabled %s -o - | FileCheck %s
3
; fast_float_mul: element-wise product a[i] = b[i] * c[i] for i in [0, N).
;
; The IR has two implementations selected at run time:
;   * a masked, 4-wide vector loop (vector.ph / vector.body) whose predicate
;     comes from @llvm.get.active.lane.mask(%index, %N);
;   * a scalar loop unrolled by four (for.body) plus a remainder loop
;     (for.body.epil) handling the final N & 3 elements.
; vector.memcheck picks the scalar path when the range of %a overlaps the
; range of %b or of %c.
;
; In the expected output below the vector body is emitted as a dlstp.32/letp
; loop, the unrolled scalar loop as dls/le, and the remainder loop as wls/le.
4define arm_aapcs_vfpcc void @fast_float_mul(float* nocapture %a, float* nocapture readonly %b, float* nocapture readonly %c, i32 %N) {
5; CHECK-LABEL: fast_float_mul:
6; CHECK:       @ %bb.0: @ %entry
7; CHECK-NEXT:    push {r4, r5, r6, r7, lr}
8; CHECK-NEXT:    cmp r3, #0
9; CHECK-NEXT:    beq.w .LBB0_11
10; CHECK-NEXT:  @ %bb.1: @ %vector.memcheck
11; CHECK-NEXT:    add.w r5, r0, r3, lsl #2
12; CHECK-NEXT:    add.w r4, r2, r3, lsl #2
13; CHECK-NEXT:    cmp r5, r2
14; CHECK-NEXT:    cset r12, hi
15; CHECK-NEXT:    cmp r4, r0
16; CHECK-NEXT:    cset lr, hi
17; CHECK-NEXT:    cmp r5, r1
18; CHECK-NEXT:    add.w r5, r1, r3, lsl #2
19; CHECK-NEXT:    cset r4, hi
20; CHECK-NEXT:    cmp r5, r0
21; CHECK-NEXT:    cset r5, hi
22; CHECK-NEXT:    ands r4, r5
23; CHECK-NEXT:    lsls r4, r4, #31
24; CHECK-NEXT:    itt eq
25; CHECK-NEXT:    andeq.w r5, lr, r12
26; CHECK-NEXT:    lslseq.w r5, r5, #31
27; CHECK-NEXT:    beq .LBB0_4
28; CHECK-NEXT:  @ %bb.2: @ %for.body.preheader
29; CHECK-NEXT:    subs r5, r3, #1
30; CHECK-NEXT:    and r7, r3, #3
31; CHECK-NEXT:    cmp r5, #3
32; CHECK-NEXT:    bhs .LBB0_6
33; CHECK-NEXT:  @ %bb.3:
34; CHECK-NEXT:    mov.w r12, #0
35; CHECK-NEXT:    b .LBB0_8
36; CHECK-NEXT:  .LBB0_4: @ %vector.ph
37; CHECK-NEXT:    mov.w r12, #0
38; CHECK-NEXT:    dlstp.32 lr, r3
39; CHECK-NEXT:  .LBB0_5: @ %vector.body
40; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
41; CHECK-NEXT:    add.w r12, r12, #4
42; CHECK-NEXT:    vldrw.u32 q0, [r1], #16
43; CHECK-NEXT:    vldrw.u32 q1, [r2], #16
44; CHECK-NEXT:    vmul.f32 q0, q1, q0
45; CHECK-NEXT:    vstrw.32 q0, [r0], #16
46; CHECK-NEXT:    letp lr, .LBB0_5
47; CHECK-NEXT:    b .LBB0_11
48; CHECK-NEXT:  .LBB0_6: @ %for.body.preheader.new
49; CHECK-NEXT:    bic r3, r3, #3
50; CHECK-NEXT:    movs r5, #1
51; CHECK-NEXT:    subs r3, #4
52; CHECK-NEXT:    mov.w r12, #0
53; CHECK-NEXT:    add.w lr, r5, r3, lsr #2
54; CHECK-NEXT:    movs r3, #0
55; CHECK-NEXT:    dls lr, lr
56; CHECK-NEXT:  .LBB0_7: @ %for.body
57; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
58; CHECK-NEXT:    adds r4, r1, r3
59; CHECK-NEXT:    adds r5, r2, r3
60; CHECK-NEXT:    adds r6, r0, r3
61; CHECK-NEXT:    adds r3, #16
62; CHECK-NEXT:    vldr s0, [r4]
63; CHECK-NEXT:    add.w r12, r12, #4
64; CHECK-NEXT:    vldr s2, [r5]
65; CHECK-NEXT:    vmul.f32 s0, s2, s0
66; CHECK-NEXT:    vstr s0, [r6]
67; CHECK-NEXT:    vldr s0, [r4, #4]
68; CHECK-NEXT:    vldr s2, [r5, #4]
69; CHECK-NEXT:    vmul.f32 s0, s2, s0
70; CHECK-NEXT:    vstr s0, [r6, #4]
71; CHECK-NEXT:    vldr s0, [r4, #8]
72; CHECK-NEXT:    vldr s2, [r5, #8]
73; CHECK-NEXT:    vmul.f32 s0, s2, s0
74; CHECK-NEXT:    vstr s0, [r6, #8]
75; CHECK-NEXT:    vldr s0, [r4, #12]
76; CHECK-NEXT:    vldr s2, [r5, #12]
77; CHECK-NEXT:    vmul.f32 s0, s2, s0
78; CHECK-NEXT:    vstr s0, [r6, #12]
79; CHECK-NEXT:    le lr, .LBB0_7
80; CHECK-NEXT:  .LBB0_8: @ %for.cond.cleanup.loopexit.unr-lcssa
81; CHECK-NEXT:    wls lr, r7, .LBB0_11
82; CHECK-NEXT:  @ %bb.9: @ %for.body.epil.preheader
83; CHECK-NEXT:    add.w r1, r1, r12, lsl #2
84; CHECK-NEXT:    add.w r2, r2, r12, lsl #2
85; CHECK-NEXT:    add.w r0, r0, r12, lsl #2
86; CHECK-NEXT:    mov lr, r7
87; CHECK-NEXT:  .LBB0_10: @ %for.body.epil
88; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
89; CHECK-NEXT:    vldr s0, [r1]
90; CHECK-NEXT:    adds r1, #4
91; CHECK-NEXT:    vldr s2, [r2]
92; CHECK-NEXT:    adds r2, #4
93; CHECK-NEXT:    vmul.f32 s0, s2, s0
94; CHECK-NEXT:    vstr s0, [r0]
95; CHECK-NEXT:    adds r0, #4
96; CHECK-NEXT:    le lr, .LBB0_10
97; CHECK-NEXT:  .LBB0_11: @ %for.cond.cleanup
98; CHECK-NEXT:    pop {r4, r5, r6, r7, pc}
; Nothing to do when N == 0.
99entry:
100  %cmp8 = icmp eq i32 %N, 0
101  br i1 %cmp8, label %for.cond.cleanup, label %vector.memcheck
102
; Runtime alias check. %found.conflict is true when [a, a+N) overlaps
; [b, b+N); %found.conflict20 is the same test for a versus c. Any overlap
; sends control to the scalar loop; otherwise the vector loop runs.
103vector.memcheck:                                  ; preds = %entry
104  %scevgep = getelementptr float, float* %a, i32 %N
105  %scevgep13 = getelementptr float, float* %b, i32 %N
106  %scevgep16 = getelementptr float, float* %c, i32 %N
107  %bound0 = icmp ugt float* %scevgep13, %a
108  %bound1 = icmp ugt float* %scevgep, %b
109  %found.conflict = and i1 %bound0, %bound1
110  %bound018 = icmp ugt float* %scevgep16, %a
111  %bound119 = icmp ugt float* %scevgep, %c
112  %found.conflict20 = and i1 %bound018, %bound119
113  %conflict.rdx = or i1 %found.conflict, %found.conflict20
114  br i1 %conflict.rdx, label %for.body.preheader, label %vector.ph
115
; Scalar path setup: %xtraiter = N & 3 is the remainder-loop trip count.
; When N - 1 < 3 (unsigned) the unrolled loop is skipped entirely.
116for.body.preheader:                               ; preds = %vector.memcheck
117  %0 = add i32 %N, -1
118  %xtraiter = and i32 %N, 3
119  %1 = icmp ult i32 %0, 3
120  br i1 %1, label %for.cond.cleanup.loopexit.unr-lcssa, label %for.body.preheader.new
121
; %unroll_iter = N rounded down to a multiple of four.
122for.body.preheader.new:                           ; preds = %for.body.preheader
123  %unroll_iter = sub i32 %N, %xtraiter
124  br label %for.body
125
; Vector path setup: %n.vec = N rounded up to a multiple of four.
; %broadcast.splat22 (splat of N - 1) is only referenced by the commented-out
; icmp in vector.body, so it is dead in the live IR.
126vector.ph:                                        ; preds = %vector.memcheck
127  %n.rnd.up = add i32 %N, 3
128  %n.vec = and i32 %n.rnd.up, -4
129  %trip.count.minus.1 = add i32 %N, -1
130  %broadcast.splatinsert21 = insertelement <4 x i32> undef, i32 %trip.count.minus.1, i32 0
131  %broadcast.splat22 = shufflevector <4 x i32> %broadcast.splatinsert21, <4 x i32> undef, <4 x i32> zeroinitializer
132  br label %vector.body
133
; Masked vector loop: four elements per iteration, with the predicate %3
; produced by the active-lane-mask intrinsic instead of the explicit
; induction-vector comparison that is left commented out below.
134vector.body:                                      ; preds = %vector.body, %vector.ph
135  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
136  %broadcast.splatinsert = insertelement <4 x i32> undef, i32 %index, i32 0
137  %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer
138  %induction = add <4 x i32> %broadcast.splat, <i32 0, i32 1, i32 2, i32 3>
139  %2 = getelementptr inbounds float, float* %b, i32 %index
140
141  ; %3 = icmp ule <4 x i32> %induction, %broadcast.splat22
142  %3 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %N)
143
144  %4 = bitcast float* %2 to <4 x float>*
145  %wide.masked.load = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %4, i32 4, <4 x i1> %3, <4 x float> undef)
146  %5 = getelementptr inbounds float, float* %c, i32 %index
147  %6 = bitcast float* %5 to <4 x float>*
148  %wide.masked.load23 = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %6, i32 4, <4 x i1> %3, <4 x float> undef)
149  %7 = fmul fast <4 x float> %wide.masked.load23, %wide.masked.load
150  %8 = getelementptr inbounds float, float* %a, i32 %index
151  %9 = bitcast float* %8 to <4 x float>*
152  call void @llvm.masked.store.v4f32.p0v4f32(<4 x float> %7, <4 x float>* %9, i32 4, <4 x i1> %3)
153  %index.next = add i32 %index, 4
154  %10 = icmp eq i32 %index.next, %n.vec
155  br i1 %10, label %for.cond.cleanup, label %vector.body
156
; Join point after (or instead of) the unrolled scalar loop. %i.09.unr is the
; index at which the remainder loop starts; it is skipped when N & 3 == 0.
157for.cond.cleanup.loopexit.unr-lcssa:              ; preds = %for.body, %for.body.preheader
158  %i.09.unr = phi i32 [ 0, %for.body.preheader ], [ %inc.3, %for.body ]
159  %lcmp.mod = icmp eq i32 %xtraiter, 0
160  br i1 %lcmp.mod, label %for.cond.cleanup, label %for.body.epil
161
; Remainder loop: one element per iteration, %epil.iter counts down from
; %xtraiter to zero.
162for.body.epil:                                    ; preds = %for.cond.cleanup.loopexit.unr-lcssa, %for.body.epil
163  %i.09.epil = phi i32 [ %inc.epil, %for.body.epil ], [ %i.09.unr, %for.cond.cleanup.loopexit.unr-lcssa ]
164  %epil.iter = phi i32 [ %epil.iter.sub, %for.body.epil ], [ %xtraiter, %for.cond.cleanup.loopexit.unr-lcssa ]
165  %arrayidx.epil = getelementptr inbounds float, float* %b, i32 %i.09.epil
166  %11 = load float, float* %arrayidx.epil, align 4
167  %arrayidx1.epil = getelementptr inbounds float, float* %c, i32 %i.09.epil
168  %12 = load float, float* %arrayidx1.epil, align 4
169  %mul.epil = fmul fast float %12, %11
170  %arrayidx2.epil = getelementptr inbounds float, float* %a, i32 %i.09.epil
171  store float %mul.epil, float* %arrayidx2.epil, align 4
172  %inc.epil = add nuw i32 %i.09.epil, 1
173  %epil.iter.sub = add i32 %epil.iter, -1
174  %epil.iter.cmp = icmp eq i32 %epil.iter.sub, 0
175  br i1 %epil.iter.cmp, label %for.cond.cleanup, label %for.body.epil
176
177for.cond.cleanup:                                 ; preds = %vector.body, %for.cond.cleanup.loopexit.unr-lcssa, %for.body.epil, %entry
178  ret void
179
; Scalar loop unrolled by four. %i.09 starts at 0 and advances by 4, so the
; `or` with 1, 2 and 3 yields the indices of the next three elements.
; %niter counts down from %unroll_iter in steps of four.
180for.body:                                         ; preds = %for.body, %for.body.preheader.new
181  %i.09 = phi i32 [ 0, %for.body.preheader.new ], [ %inc.3, %for.body ]
182  %niter = phi i32 [ %unroll_iter, %for.body.preheader.new ], [ %niter.nsub.3, %for.body ]
183  %arrayidx = getelementptr inbounds float, float* %b, i32 %i.09
184  %13 = load float, float* %arrayidx, align 4
185  %arrayidx1 = getelementptr inbounds float, float* %c, i32 %i.09
186  %14 = load float, float* %arrayidx1, align 4
187  %mul = fmul fast float %14, %13
188  %arrayidx2 = getelementptr inbounds float, float* %a, i32 %i.09
189  store float %mul, float* %arrayidx2, align 4
190  %inc = or i32 %i.09, 1
191  %arrayidx.1 = getelementptr inbounds float, float* %b, i32 %inc
192  %15 = load float, float* %arrayidx.1, align 4
193  %arrayidx1.1 = getelementptr inbounds float, float* %c, i32 %inc
194  %16 = load float, float* %arrayidx1.1, align 4
195  %mul.1 = fmul fast float %16, %15
196  %arrayidx2.1 = getelementptr inbounds float, float* %a, i32 %inc
197  store float %mul.1, float* %arrayidx2.1, align 4
198  %inc.1 = or i32 %i.09, 2
199  %arrayidx.2 = getelementptr inbounds float, float* %b, i32 %inc.1
200  %17 = load float, float* %arrayidx.2, align 4
201  %arrayidx1.2 = getelementptr inbounds float, float* %c, i32 %inc.1
202  %18 = load float, float* %arrayidx1.2, align 4
203  %mul.2 = fmul fast float %18, %17
204  %arrayidx2.2 = getelementptr inbounds float, float* %a, i32 %inc.1
205  store float %mul.2, float* %arrayidx2.2, align 4
206  %inc.2 = or i32 %i.09, 3
207  %arrayidx.3 = getelementptr inbounds float, float* %b, i32 %inc.2
208  %19 = load float, float* %arrayidx.3, align 4
209  %arrayidx1.3 = getelementptr inbounds float, float* %c, i32 %inc.2
210  %20 = load float, float* %arrayidx1.3, align 4
211  %mul.3 = fmul fast float %20, %19
212  %arrayidx2.3 = getelementptr inbounds float, float* %a, i32 %inc.2
213  store float %mul.3, float* %arrayidx2.3, align 4
214  %inc.3 = add nuw i32 %i.09, 4
215  %niter.nsub.3 = add i32 %niter, -4
216  %niter.ncmp.3 = icmp eq i32 %niter.nsub.3, 0
217  br i1 %niter.ncmp.3, label %for.cond.cleanup.loopexit.unr-lcssa, label %for.body
218}
219
; fast_float_mac: returns the sum over i in [0, N) of b[i] * c[i], or 0.0
; when N == 0.
;
; The loop is a masked 4-wide multiply-accumulate whose predicate comes from
; @llvm.get.active.lane.mask(%index, %N). After the loop, middle.block merges
; the last iteration's result with the previous accumulator under that
; predicate and reduces the four lanes to a scalar.
;
; In the expected output below the loop is a dls/le loop with an explicit
; vctp.32 and a vpstt block predicating the two loads, followed by a vpsel.
; NOTE(review): presumably the accumulator being live out of the loop is what
; keeps this from becoming a dlstp/letp loop - confirm against the
; tail-predication pass.
220define arm_aapcs_vfpcc float @fast_float_mac(float* nocapture readonly %b, float* nocapture readonly %c, i32 %N) {
221; CHECK-LABEL: fast_float_mac:
222; CHECK:       @ %bb.0: @ %entry
223; CHECK-NEXT:    push {r7, lr}
224; CHECK-NEXT:    cbz r2, .LBB1_4
225; CHECK-NEXT:  @ %bb.1: @ %vector.ph
226; CHECK-NEXT:    adds r3, r2, #3
227; CHECK-NEXT:    mov.w r12, #1
228; CHECK-NEXT:    bic r3, r3, #3
229; CHECK-NEXT:    vmov.i32 q0, #0x0
230; CHECK-NEXT:    subs r3, #4
231; CHECK-NEXT:    add.w lr, r12, r3, lsr #2
232; CHECK-NEXT:    movs r3, #0
233; CHECK-NEXT:    dls lr, lr
234; CHECK-NEXT:  .LBB1_2: @ %vector.body
235; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
236; CHECK-NEXT:    vctp.32 r2
237; CHECK-NEXT:    adds r3, #4
238; CHECK-NEXT:    subs r2, #4
239; CHECK-NEXT:    vmov q1, q0
240; CHECK-NEXT:    vpstt
241; CHECK-NEXT:    vldrwt.u32 q2, [r0], #16
242; CHECK-NEXT:    vldrwt.u32 q3, [r1], #16
243; CHECK-NEXT:    vfma.f32 q0, q3, q2
244; CHECK-NEXT:    le lr, .LBB1_2
245; CHECK-NEXT:  @ %bb.3: @ %middle.block
246; CHECK-NEXT:    vpsel q0, q0, q1
247; CHECK-NEXT:    vmov.f32 s4, s2
248; CHECK-NEXT:    vmov.f32 s5, s3
249; CHECK-NEXT:    vadd.f32 q0, q0, q1
250; CHECK-NEXT:    vmov r0, s1
251; CHECK-NEXT:    vadd.f32 q0, q0, r0
252; CHECK-NEXT:    @ kill: def $s0 killed $s0 killed $q0
253; CHECK-NEXT:    pop {r7, pc}
254; CHECK-NEXT:  .LBB1_4:
255; CHECK-NEXT:    vldr s0, .LCPI1_0
256; CHECK-NEXT:    @ kill: def $s0 killed $s0 killed $q0
257; CHECK-NEXT:    pop {r7, pc}
258; CHECK-NEXT:    .p2align 2
259; CHECK-NEXT:  @ %bb.5:
260; CHECK-NEXT:  .LCPI1_0:
261; CHECK-NEXT:    .long 0x00000000 @ float 0
; N == 0 returns 0.0 through the phi in for.cond.cleanup.
262entry:
263  %cmp8 = icmp eq i32 %N, 0
264  br i1 %cmp8, label %for.cond.cleanup, label %vector.ph
265
; %n.vec = N rounded up to a multiple of four. %broadcast.splat12 (splat of
; N - 1) is only referenced by the commented-out icmp in vector.body.
266vector.ph:                                        ; preds = %entry
267  %n.rnd.up = add i32 %N, 3
268  %n.vec = and i32 %n.rnd.up, -4
269  %trip.count.minus.1 = add i32 %N, -1
270  %broadcast.splatinsert11 = insertelement <4 x i32> undef, i32 %trip.count.minus.1, i32 0
271  %broadcast.splat12 = shufflevector <4 x i32> %broadcast.splatinsert11, <4 x i32> undef, <4 x i32> zeroinitializer
272  br label %vector.body
273
; Masked multiply-accumulate: %vec.phi carries the 4-lane running sum and %1
; is the per-iteration predicate from the active-lane-mask intrinsic (the
; explicit comparison it replaces is left commented out below).
274vector.body:                                      ; preds = %vector.body, %vector.ph
275  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
276  %vec.phi = phi <4 x float> [ zeroinitializer, %vector.ph ], [ %6, %vector.body ]
277  %broadcast.splatinsert = insertelement <4 x i32> undef, i32 %index, i32 0
278  %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer
279  %induction = add <4 x i32> %broadcast.splat, <i32 0, i32 1, i32 2, i32 3>
280  %0 = getelementptr inbounds float, float* %b, i32 %index
281
282;  %1 = icmp ule <4 x i32> %induction, %broadcast.splat12
283  %1 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %N)
284
285  %2 = bitcast float* %0 to <4 x float>*
286  %wide.masked.load = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %2, i32 4, <4 x i1> %1, <4 x float> undef)
287  %3 = getelementptr inbounds float, float* %c, i32 %index
288  %4 = bitcast float* %3 to <4 x float>*
289  %wide.masked.load13 = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %4, i32 4, <4 x i1> %1, <4 x float> undef)
290  %5 = fmul fast <4 x float> %wide.masked.load13, %wide.masked.load
291  %6 = fadd fast <4 x float> %5, %vec.phi
292  %index.next = add i32 %index, 4
293  %7 = icmp eq i32 %index.next, %n.vec
294  br i1 %7, label %middle.block, label %vector.body
295
; The masked loads pass undef for inactive lanes, so the select keeps the new
; sum %6 only where the final predicate %1 is set and the previous
; accumulator elsewhere. The two shuffle+fadd steps then fold lanes 2,3 onto
; 0,1 and lane 1 onto 0; lane 0 holds the scalar result.
296middle.block:                                     ; preds = %vector.body
297  %8 = select <4 x i1> %1, <4 x float> %6, <4 x float> %vec.phi
298  %rdx.shuf = shufflevector <4 x float> %8, <4 x float> undef, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
299  %bin.rdx = fadd fast <4 x float> %8, %rdx.shuf
300  %rdx.shuf14 = shufflevector <4 x float> %bin.rdx, <4 x float> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
301  %bin.rdx15 = fadd fast <4 x float> %bin.rdx, %rdx.shuf14
302  %9 = extractelement <4 x float> %bin.rdx15, i32 0
303  br label %for.cond.cleanup
304
305for.cond.cleanup:                                 ; preds = %middle.block, %entry
306  %a.0.lcssa = phi float [ 0.000000e+00, %entry ], [ %9, %middle.block ]
307  ret float %a.0.lcssa
308}
309
; fast_float_half_mac: returns, as a float, the sum over i in [0, N) of
; b[i] * c[i] where b and c hold half-precision values; returns 0.0 when
; N == 0. Each product is computed in half precision, then extended to float
; before being accumulated.
;
; Unlike the two functions above, the predicate here is the explicit
; comparison `icmp ule %induction, splat(N - 1)` rather than the
; active-lane-mask intrinsic, and the masked loads are <4 x half>.
;
; In the expected output below each masked load is expanded into per-lane
; conditional vldr.16 loads (the cond.load blocks), and the loop counter is
; handled with subs.w lr, lr, #1 / bne instead of a low-overhead-loop
; instruction.
310define arm_aapcs_vfpcc float @fast_float_half_mac(half* nocapture readonly %b, half* nocapture readonly %c, i32 %N) {
311; CHECK-LABEL: fast_float_half_mac:
312; CHECK:       @ %bb.0: @ %entry
313; CHECK-NEXT:    push {r4, r5, r7, lr}
314; CHECK-NEXT:    vpush {d8, d9, d10, d11, d12, d13, d14, d15}
315; CHECK-NEXT:    sub sp, #32
316; CHECK-NEXT:    cmp r2, #0
317; CHECK-NEXT:    beq.w .LBB2_22
318; CHECK-NEXT:  @ %bb.1: @ %vector.ph
319; CHECK-NEXT:    adds r3, r2, #3
320; CHECK-NEXT:    vmov.i32 q5, #0x0
321; CHECK-NEXT:    bic r3, r3, #3
322; CHECK-NEXT:    sub.w r12, r3, #4
323; CHECK-NEXT:    movs r3, #1
324; CHECK-NEXT:    add.w lr, r3, r12, lsr #2
325; CHECK-NEXT:    sub.w r12, r2, #1
326; CHECK-NEXT:    adr r2, .LCPI2_1
327; CHECK-NEXT:    mov lr, lr
328; CHECK-NEXT:    vldrw.u32 q0, [r2]
329; CHECK-NEXT:    movs r3, #0
330; CHECK-NEXT:    vdup.32 q1, r12
331; CHECK-NEXT:    vdup.32 q2, r12
332; CHECK-NEXT:    vstrw.32 q0, [sp] @ 16-byte Spill
333; CHECK-NEXT:    b .LBB2_4
334; CHECK-NEXT:  .LBB2_2: @ %cond.load25
335; CHECK-NEXT:    @ in Loop: Header=BB2_4 Depth=1
336; CHECK-NEXT:    vmovx.f16 s0, s28
337; CHECK-NEXT:    vmov r4, s28
338; CHECK-NEXT:    vmov r2, s0
339; CHECK-NEXT:    vmov.16 q6[0], r4
340; CHECK-NEXT:    vldr.16 s0, [r1, #6]
341; CHECK-NEXT:    vmov.16 q6[1], r2
342; CHECK-NEXT:    vmov r2, s29
343; CHECK-NEXT:    vmov.16 q6[2], r2
344; CHECK-NEXT:    vmov r2, s0
345; CHECK-NEXT:    vmov.16 q6[3], r2
346; CHECK-NEXT:  .LBB2_3: @ %else26
347; CHECK-NEXT:    @ in Loop: Header=BB2_4 Depth=1
348; CHECK-NEXT:    vmul.f16 q0, q6, q5
349; CHECK-NEXT:    adds r0, #8
350; CHECK-NEXT:    vcvtt.f32.f16 s23, s1
351; CHECK-NEXT:    adds r1, #8
352; CHECK-NEXT:    vcvtb.f32.f16 s22, s1
353; CHECK-NEXT:    adds r3, #4
354; CHECK-NEXT:    vcvtt.f32.f16 s21, s0
355; CHECK-NEXT:    subs.w lr, lr, #1
356; CHECK-NEXT:    vcvtb.f32.f16 s20, s0
357; CHECK-NEXT:    vadd.f32 q5, q3, q5
358; CHECK-NEXT:    bne .LBB2_4
359; CHECK-NEXT:    b .LBB2_21
360; CHECK-NEXT:  .LBB2_4: @ %vector.body
361; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
362; CHECK-NEXT:    vldrw.u32 q0, [sp] @ 16-byte Reload
363; CHECK-NEXT:    vmov q3, q5
364; CHECK-NEXT:    @ implicit-def: $q6
365; CHECK-NEXT:    vadd.i32 q4, q0, r3
366; CHECK-NEXT:    vcmp.u32 cs, q1, q4
367; CHECK-NEXT:    vmrs r4, p0
368; CHECK-NEXT:    and r2, r4, #1
369; CHECK-NEXT:    rsbs r5, r2, #0
370; CHECK-NEXT:    movs r2, #0
371; CHECK-NEXT:    bfi r2, r5, #0, #1
372; CHECK-NEXT:    ubfx r5, r4, #4, #1
373; CHECK-NEXT:    rsbs r5, r5, #0
374; CHECK-NEXT:    bfi r2, r5, #1, #1
375; CHECK-NEXT:    ubfx r5, r4, #8, #1
376; CHECK-NEXT:    ubfx r4, r4, #12, #1
377; CHECK-NEXT:    rsbs r5, r5, #0
378; CHECK-NEXT:    bfi r2, r5, #2, #1
379; CHECK-NEXT:    rsbs r4, r4, #0
380; CHECK-NEXT:    bfi r2, r4, #3, #1
381; CHECK-NEXT:    lsls r4, r2, #31
382; CHECK-NEXT:    bne .LBB2_9
383; CHECK-NEXT:  @ %bb.5: @ %else
384; CHECK-NEXT:    @ in Loop: Header=BB2_4 Depth=1
385; CHECK-NEXT:    lsls r4, r2, #30
386; CHECK-NEXT:    bpl .LBB2_10
387; CHECK-NEXT:  .LBB2_6: @ %cond.load6
388; CHECK-NEXT:    @ in Loop: Header=BB2_4 Depth=1
389; CHECK-NEXT:    vldr.16 s20, [r0, #2]
390; CHECK-NEXT:    vmov r5, s24
391; CHECK-NEXT:    vmovx.f16 s24, s25
392; CHECK-NEXT:    vmov r4, s20
393; CHECK-NEXT:    vmov.16 q5[0], r5
394; CHECK-NEXT:    vmov.16 q5[1], r4
395; CHECK-NEXT:    vmov r4, s25
396; CHECK-NEXT:    vmov.16 q5[2], r4
397; CHECK-NEXT:    vmov r4, s24
398; CHECK-NEXT:    vmov.16 q5[3], r4
399; CHECK-NEXT:    lsls r4, r2, #29
400; CHECK-NEXT:    bmi .LBB2_11
401; CHECK-NEXT:  .LBB2_7: @ in Loop: Header=BB2_4 Depth=1
402; CHECK-NEXT:    vmov q6, q5
403; CHECK-NEXT:    lsls r2, r2, #28
404; CHECK-NEXT:    bmi .LBB2_12
405; CHECK-NEXT:  .LBB2_8: @ in Loop: Header=BB2_4 Depth=1
406; CHECK-NEXT:    vmov q5, q6
407; CHECK-NEXT:    b .LBB2_13
408; CHECK-NEXT:  .LBB2_9: @ %cond.load
409; CHECK-NEXT:    @ in Loop: Header=BB2_4 Depth=1
410; CHECK-NEXT:    vldr.16 s24, [r0]
411; CHECK-NEXT:    lsls r4, r2, #30
412; CHECK-NEXT:    bmi .LBB2_6
413; CHECK-NEXT:  .LBB2_10: @ in Loop: Header=BB2_4 Depth=1
414; CHECK-NEXT:    vmov q5, q6
415; CHECK-NEXT:    lsls r4, r2, #29
416; CHECK-NEXT:    bpl .LBB2_7
417; CHECK-NEXT:  .LBB2_11: @ %cond.load9
418; CHECK-NEXT:    @ in Loop: Header=BB2_4 Depth=1
419; CHECK-NEXT:    vmovx.f16 s24, s20
420; CHECK-NEXT:    vmov r4, s20
421; CHECK-NEXT:    vldr.16 s28, [r0, #4]
422; CHECK-NEXT:    vmov r5, s24
423; CHECK-NEXT:    vmov.16 q6[0], r4
424; CHECK-NEXT:    vmovx.f16 s20, s21
425; CHECK-NEXT:    vmov.16 q6[1], r5
426; CHECK-NEXT:    vmov r4, s28
427; CHECK-NEXT:    vmov.16 q6[2], r4
428; CHECK-NEXT:    vmov r4, s20
429; CHECK-NEXT:    vmov.16 q6[3], r4
430; CHECK-NEXT:    lsls r2, r2, #28
431; CHECK-NEXT:    bpl .LBB2_8
432; CHECK-NEXT:  .LBB2_12: @ %cond.load12
433; CHECK-NEXT:    @ in Loop: Header=BB2_4 Depth=1
434; CHECK-NEXT:    vmovx.f16 s20, s24
435; CHECK-NEXT:    vmov r4, s24
436; CHECK-NEXT:    vmov r2, s20
437; CHECK-NEXT:    vmov.16 q5[0], r4
438; CHECK-NEXT:    vmov.16 q5[1], r2
439; CHECK-NEXT:    vmov r2, s25
440; CHECK-NEXT:    vldr.16 s24, [r0, #6]
441; CHECK-NEXT:    vmov.16 q5[2], r2
442; CHECK-NEXT:    vmov r2, s24
443; CHECK-NEXT:    vmov.16 q5[3], r2
444; CHECK-NEXT:  .LBB2_13: @ %else13
445; CHECK-NEXT:    @ in Loop: Header=BB2_4 Depth=1
446; CHECK-NEXT:    vcmp.u32 cs, q2, q4
447; CHECK-NEXT:    @ implicit-def: $q7
448; CHECK-NEXT:    vmrs r4, p0
449; CHECK-NEXT:    and r2, r4, #1
450; CHECK-NEXT:    rsbs r5, r2, #0
451; CHECK-NEXT:    movs r2, #0
452; CHECK-NEXT:    bfi r2, r5, #0, #1
453; CHECK-NEXT:    ubfx r5, r4, #4, #1
454; CHECK-NEXT:    rsbs r5, r5, #0
455; CHECK-NEXT:    bfi r2, r5, #1, #1
456; CHECK-NEXT:    ubfx r5, r4, #8, #1
457; CHECK-NEXT:    ubfx r4, r4, #12, #1
458; CHECK-NEXT:    rsbs r5, r5, #0
459; CHECK-NEXT:    bfi r2, r5, #2, #1
460; CHECK-NEXT:    rsbs r4, r4, #0
461; CHECK-NEXT:    bfi r2, r4, #3, #1
462; CHECK-NEXT:    lsls r4, r2, #31
463; CHECK-NEXT:    bne .LBB2_17
464; CHECK-NEXT:  @ %bb.14: @ %else17
465; CHECK-NEXT:    @ in Loop: Header=BB2_4 Depth=1
466; CHECK-NEXT:    lsls r4, r2, #30
467; CHECK-NEXT:    bpl .LBB2_18
468; CHECK-NEXT:  .LBB2_15: @ %cond.load19
469; CHECK-NEXT:    @ in Loop: Header=BB2_4 Depth=1
470; CHECK-NEXT:    vldr.16 s24, [r1, #2]
471; CHECK-NEXT:    vmov r5, s28
472; CHECK-NEXT:    vmovx.f16 s28, s29
473; CHECK-NEXT:    vmov r4, s24
474; CHECK-NEXT:    vmov.16 q6[0], r5
475; CHECK-NEXT:    vmov.16 q6[1], r4
476; CHECK-NEXT:    vmov r4, s29
477; CHECK-NEXT:    vmov.16 q6[2], r4
478; CHECK-NEXT:    vmov r4, s28
479; CHECK-NEXT:    vmov.16 q6[3], r4
480; CHECK-NEXT:    lsls r4, r2, #29
481; CHECK-NEXT:    bmi .LBB2_19
482; CHECK-NEXT:  .LBB2_16: @ in Loop: Header=BB2_4 Depth=1
483; CHECK-NEXT:    vmov q7, q6
484; CHECK-NEXT:    lsls r2, r2, #28
485; CHECK-NEXT:    bmi.w .LBB2_2
486; CHECK-NEXT:    b .LBB2_20
487; CHECK-NEXT:  .LBB2_17: @ %cond.load16
488; CHECK-NEXT:    @ in Loop: Header=BB2_4 Depth=1
489; CHECK-NEXT:    vldr.16 s28, [r1]
490; CHECK-NEXT:    lsls r4, r2, #30
491; CHECK-NEXT:    bmi .LBB2_15
492; CHECK-NEXT:  .LBB2_18: @ in Loop: Header=BB2_4 Depth=1
493; CHECK-NEXT:    vmov q6, q7
494; CHECK-NEXT:    lsls r4, r2, #29
495; CHECK-NEXT:    bpl .LBB2_16
496; CHECK-NEXT:  .LBB2_19: @ %cond.load22
497; CHECK-NEXT:    @ in Loop: Header=BB2_4 Depth=1
498; CHECK-NEXT:    vmovx.f16 s28, s24
499; CHECK-NEXT:    vmov r4, s24
500; CHECK-NEXT:    vldr.16 s0, [r1, #4]
501; CHECK-NEXT:    vmov r5, s28
502; CHECK-NEXT:    vmov.16 q7[0], r4
503; CHECK-NEXT:    vmov r4, s0
504; CHECK-NEXT:    vmov.16 q7[1], r5
505; CHECK-NEXT:    vmovx.f16 s0, s25
506; CHECK-NEXT:    vmov.16 q7[2], r4
507; CHECK-NEXT:    vmov r4, s0
508; CHECK-NEXT:    vmov.16 q7[3], r4
509; CHECK-NEXT:    lsls r2, r2, #28
510; CHECK-NEXT:    bmi.w .LBB2_2
511; CHECK-NEXT:  .LBB2_20: @ in Loop: Header=BB2_4 Depth=1
512; CHECK-NEXT:    vmov q6, q7
513; CHECK-NEXT:    b .LBB2_3
514; CHECK-NEXT:  .LBB2_21: @ %middle.block
515; CHECK-NEXT:    vdup.32 q0, r12
516; CHECK-NEXT:    vcmp.u32 cs, q0, q4
517; CHECK-NEXT:    vpsel q0, q5, q3
518; CHECK-NEXT:    vmov.f32 s4, s2
519; CHECK-NEXT:    vmov.f32 s5, s3
520; CHECK-NEXT:    vadd.f32 q0, q0, q1
521; CHECK-NEXT:    vmov r0, s1
522; CHECK-NEXT:    vadd.f32 q0, q0, r0
523; CHECK-NEXT:    b .LBB2_23
524; CHECK-NEXT:  .LBB2_22:
525; CHECK-NEXT:    vldr s0, .LCPI2_0
526; CHECK-NEXT:  .LBB2_23: @ %for.cond.cleanup
527; CHECK-NEXT:    @ kill: def $s0 killed $s0 killed $q0
528; CHECK-NEXT:    add sp, #32
529; CHECK-NEXT:    vpop {d8, d9, d10, d11, d12, d13, d14, d15}
530; CHECK-NEXT:    pop {r4, r5, r7, pc}
531; CHECK-NEXT:    .p2align 4
532; CHECK-NEXT:  @ %bb.24:
533; CHECK-NEXT:  .LCPI2_1:
534; CHECK-NEXT:    .long 0 @ 0x0
535; CHECK-NEXT:    .long 1 @ 0x1
536; CHECK-NEXT:    .long 2 @ 0x2
537; CHECK-NEXT:    .long 3 @ 0x3
538; CHECK-NEXT:  .LCPI2_0:
539; CHECK-NEXT:    .long 0x00000000 @ float 0
; N == 0 returns 0.0 through the phi in for.cond.cleanup.
540entry:
541  %cmp8 = icmp eq i32 %N, 0
542  br i1 %cmp8, label %for.cond.cleanup, label %vector.ph
543
; %n.vec = N rounded up to a multiple of four; %broadcast.splat12 is N - 1 in
; every lane and is the right-hand side of the predicate comparison.
544vector.ph:                                        ; preds = %entry
545  %n.rnd.up = add i32 %N, 3
546  %n.vec = and i32 %n.rnd.up, -4
547  %trip.count.minus.1 = add i32 %N, -1
548  %broadcast.splatinsert11 = insertelement <4 x i32> undef, i32 %trip.count.minus.1, i32 0
549  %broadcast.splat12 = shufflevector <4 x i32> %broadcast.splatinsert11, <4 x i32> undef, <4 x i32> zeroinitializer
550  br label %vector.body
551
; %induction = <index, index+1, index+2, index+3>; a lane is active when its
; element index is <= N - 1. Products are formed in half precision (%5) and
; widened with fpext before the float accumulation in %7.
552vector.body:                                      ; preds = %vector.body, %vector.ph
553  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
554  %vec.phi = phi <4 x float> [ zeroinitializer, %vector.ph ], [ %7, %vector.body ]
555  %broadcast.splatinsert = insertelement <4 x i32> undef, i32 %index, i32 0
556  %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer
557  %induction = add <4 x i32> %broadcast.splat, <i32 0, i32 1, i32 2, i32 3>
558  %0 = getelementptr inbounds half, half* %b, i32 %index
559  %1 = icmp ule <4 x i32> %induction, %broadcast.splat12
560  %2 = bitcast half* %0 to <4 x half>*
561  %wide.masked.load = call <4 x half> @llvm.masked.load.v4f16.p0v4f16(<4 x half>* %2, i32 2, <4 x i1> %1, <4 x half> undef)
562  %3 = getelementptr inbounds half, half* %c, i32 %index
563  %4 = bitcast half* %3 to <4 x half>*
564  %wide.masked.load13 = call <4 x half> @llvm.masked.load.v4f16.p0v4f16(<4 x half>* %4, i32 2, <4 x i1> %1, <4 x half> undef)
565  %5 = fmul fast <4 x half> %wide.masked.load13, %wide.masked.load
566  %6 = fpext <4 x half> %5 to <4 x float>
567  %7 = fadd fast <4 x float> %vec.phi, %6
568  %index.next = add i32 %index, 4
569  %8 = icmp eq i32 %index.next, %n.vec
570  br i1 %8, label %middle.block, label %vector.body
571
; Keep the new sum %7 only in lanes where the final predicate %1 is set, fall
; back to the previous accumulator elsewhere, then fold lanes 2,3 onto 0,1
; and lane 1 onto 0; lane 0 holds the scalar result.
572middle.block:                                     ; preds = %vector.body
573  %9 = select <4 x i1> %1, <4 x float> %7, <4 x float> %vec.phi
574  %rdx.shuf = shufflevector <4 x float> %9, <4 x float> undef, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
575  %bin.rdx = fadd fast <4 x float> %9, %rdx.shuf
576  %rdx.shuf14 = shufflevector <4 x float> %bin.rdx, <4 x float> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
577  %bin.rdx15 = fadd fast <4 x float> %bin.rdx, %rdx.shuf14
578  %10 = extractelement <4 x float> %bin.rdx15, i32 0
579  br label %for.cond.cleanup
580
581for.cond.cleanup:                                 ; preds = %middle.block, %entry
582  %a.0.lcssa = phi float [ 0.000000e+00, %entry ], [ %10, %middle.block ]
583  ret float %a.0.lcssa
584}
585
; Declarations of the LLVM intrinsics called by the test functions in this
; file: masked loads of <4 x float> and <4 x half>, the masked store of
; <4 x float>, and the <4 x i1> active-lane-mask generator.
586; Function Attrs: argmemonly nounwind readonly willreturn
587declare <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>*, i32 immarg, <4 x i1>, <4 x float>)
588
589; Function Attrs: argmemonly nounwind willreturn
590declare void @llvm.masked.store.v4f32.p0v4f32(<4 x float>, <4 x float>*, i32 immarg, <4 x i1>)
591
592; Function Attrs: argmemonly nounwind readonly willreturn
593declare <4 x half> @llvm.masked.load.v4f16.p0v4f16(<4 x half>*, i32 immarg, <4 x i1>, <4 x half>)
594
595declare <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32, i32)
596