1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2; RUN: llc -mtriple=thumbv8.1m.main -mattr=+mve -tail-predication=enabled --verify-machineinstrs %s -o - | FileCheck %s
3
; vpsel_mul_reduce_add: vector loop that accumulates
;   select(%cmp, b, c) * a
; into %vec.phi, where %cmp is a splat of ((%index urem 16) == 0). All three
; loads are masked by @llvm.get.active.lane.mask(%index, %N), and middle.block
; selects between %add and %vec.phi with that same mask before the add
; reduction. The expected assembly below keeps a dls/le loop with explicit
; vctp.32 predication and finishes with vpsel + vaddv.u32.
4define dso_local i32 @vpsel_mul_reduce_add(i32* noalias nocapture readonly %a, i32* noalias nocapture readonly %b, i32* noalias nocapture readonly %c, i32 %N) {
5; CHECK-LABEL: vpsel_mul_reduce_add:
6; CHECK:       @ %bb.0: @ %entry
7; CHECK-NEXT:    cmp r3, #0
8; CHECK-NEXT:    itt eq
9; CHECK-NEXT:    moveq r0, #0
10; CHECK-NEXT:    bxeq lr
11; CHECK-NEXT:  .LBB0_1: @ %vector.ph
12; CHECK-NEXT:    push {r4, lr}
13; CHECK-NEXT:    add.w r12, r3, #3
14; CHECK-NEXT:    mov.w lr, #1
15; CHECK-NEXT:    bic r12, r12, #3
16; CHECK-NEXT:    vmov.i32 q1, #0x0
17; CHECK-NEXT:    sub.w r12, r12, #4
18; CHECK-NEXT:    add.w lr, lr, r12, lsr #2
19; CHECK-NEXT:    mov.w r12, #0
20; CHECK-NEXT:    dls lr, lr
21; CHECK-NEXT:  .LBB0_2: @ %vector.body
22; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
23; CHECK-NEXT:    and r4, r12, #15
24; CHECK-NEXT:    vmov q0, q1
25; CHECK-NEXT:    vctp.32 r3
26; CHECK-NEXT:    vpstt
27; CHECK-NEXT:    vldrwt.u32 q1, [r2], #16
28; CHECK-NEXT:    vldrwt.u32 q2, [r1], #16
29; CHECK-NEXT:    vdup.32 q3, r4
30; CHECK-NEXT:    vpt.i32 eq, q3, zr
31; CHECK-NEXT:    vmovt q1, q2
32; CHECK-NEXT:    vctp.32 r3
33; CHECK-NEXT:    vpst
34; CHECK-NEXT:    vldrwt.u32 q2, [r0], #16
35; CHECK-NEXT:    vmul.i32 q1, q1, q2
36; CHECK-NEXT:    add.w r12, r12, #4
37; CHECK-NEXT:    subs r3, #4
38; CHECK-NEXT:    vadd.i32 q1, q1, q0
39; CHECK-NEXT:    le lr, .LBB0_2
40; CHECK-NEXT:  @ %bb.3: @ %middle.block
41; CHECK-NEXT:    vpsel q0, q1, q0
42; CHECK-NEXT:    vaddv.u32 r0, q0
43; CHECK-NEXT:    pop {r4, pc}
44entry:
; Skip the loop entirely (result 0) when %N is zero.
45  %cmp8 = icmp eq i32 %N, 0
46  br i1 %cmp8, label %for.cond.cleanup, label %vector.ph
47
48vector.ph:                                        ; preds = %entry
; %n.vec is %N rounded up to a multiple of 4 and is the loop exit bound.
; %broadcast.splat12 is only referenced by the commented-out compare in
; vector.body.
49  %n.rnd.up = add i32 %N, 3
50  %n.vec = and i32 %n.rnd.up, -4
51  %trip.count.minus.1 = add i32 %N, -1
52  %broadcast.splatinsert11 = insertelement <4 x i32> undef, i32 %trip.count.minus.1, i32 0
53  %broadcast.splat12 = shufflevector <4 x i32> %broadcast.splatinsert11, <4 x i32> undef, <4 x i32> zeroinitializer
54  br label %vector.body
55
56vector.body:                                      ; preds = %vector.body, %vector.ph
57  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
58  %vec.phi = phi <4 x i32> [ zeroinitializer, %vector.ph ], [ %add, %vector.body ]
59  %broadcast.splatinsert = insertelement <4 x i32> undef, i32 %index, i32 0
60  %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer
61  %induction = add <4 x i32> %broadcast.splat, <i32 0, i32 1, i32 2, i32 3>
62  %tmp = getelementptr inbounds i32, i32* %a, i32 %index
63
; Compare-based definition of %tmp1, left commented out; the intrinsic call
; below is the live definition of the lane mask.
64;  %tmp1 = icmp ule <4 x i32> %induction, %broadcast.splat12
65  %tmp1 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %N)
66
67  %tmp2 = bitcast i32* %tmp to <4 x i32>*
68  %wide.masked.load.a = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %tmp2, i32 4, <4 x i1> %tmp1, <4 x i32> undef)
69  %tmp3 = getelementptr inbounds i32, i32* %b, i32 %index
70  %tmp4 = bitcast i32* %tmp3 to <4 x i32>*
71  %wide.masked.load.b = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %tmp4, i32 4, <4 x i1> %tmp1, <4 x i32> undef)
72  %tmp5 = getelementptr inbounds i32, i32* %c, i32 %index
73  %tmp6 = bitcast i32* %tmp5 to <4 x i32>*
74  %wide.masked.load.c = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %tmp6, i32 4, <4 x i1> %tmp1, <4 x i32> undef)
; %cmp is the same in every lane: it is a splat of the scalar
; (%index urem 16) compared against zero, so the select below picks either
; all of b or all of c for this iteration.
75  %rem = urem i32 %index, 16
76  %rem.broadcast.splatinsert = insertelement <4 x i32> undef, i32 %rem, i32 0
77  %rem.broadcast.splat = shufflevector <4 x i32> %rem.broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer
78  %cmp = icmp eq <4 x i32> %rem.broadcast.splat, <i32 0, i32 0, i32 0, i32 0>
79  %wide.masked.load = select <4 x i1> %cmp, <4 x i32> %wide.masked.load.b, <4 x i32> %wide.masked.load.c
80  %mul = mul nsw <4 x i32> %wide.masked.load, %wide.masked.load.a
81  %add = add nsw <4 x i32> %mul, %vec.phi
82  %index.next = add i32 %index, 4
83  %tmp7 = icmp eq i32 %index.next, %n.vec
84  br i1 %tmp7, label %middle.block, label %vector.body
85
86middle.block:                                     ; preds = %vector.body
; Uses the lane mask (%tmp1) computed in the final loop iteration to keep the
; previous accumulator value in lanes that were not active.
87  %tmp8 = select <4 x i1> %tmp1, <4 x i32> %add, <4 x i32> %vec.phi
88  %tmp9 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %tmp8)
89  br label %for.cond.cleanup
90
91for.cond.cleanup:                                 ; preds = %middle.block, %entry
92  %res.0.lcssa = phi i32 [ 0, %entry ], [ %tmp9, %middle.block ]
93  ret i32 %res.0.lcssa
94}
95
; vpsel_mul_reduce_add_2: same loop shape as a select-multiply-accumulate
; reduction, but with four input arrays. Each iteration accumulates
;   select(%cmp, c - d, b) * a
; into %vec.phi, where %cmp is a splat of ((%index urem 16) == 0). All four
; loads are masked by @llvm.get.active.lane.mask(%index, %N); middle.block
; applies the same mask before the add reduction. %N is the fifth argument and
; the expected assembly loads it from the stack. The expected loop is a dls/le
; loop with explicit vctp.32 predication.
; Note: the function signature continues after the autogenerated check lines.
96define dso_local i32 @vpsel_mul_reduce_add_2(i32* noalias nocapture readonly %a, i32* noalias nocapture readonly %b,
97; CHECK-LABEL: vpsel_mul_reduce_add_2:
98; CHECK:       @ %bb.0: @ %entry
99; CHECK-NEXT:    push {r4, r5, r7, lr}
100; CHECK-NEXT:    vpush {d8, d9}
101; CHECK-NEXT:    ldr.w r12, [sp, #32]
102; CHECK-NEXT:    cmp.w r12, #0
103; CHECK-NEXT:    beq .LBB1_4
104; CHECK-NEXT:  @ %bb.1: @ %vector.ph
105; CHECK-NEXT:    add.w r4, r12, #3
106; CHECK-NEXT:    vmov.i32 q1, #0x0
107; CHECK-NEXT:    bic r4, r4, #3
108; CHECK-NEXT:    sub.w lr, r4, #4
109; CHECK-NEXT:    movs r4, #1
110; CHECK-NEXT:    add.w lr, r4, lr, lsr #2
111; CHECK-NEXT:    movs r4, #0
112; CHECK-NEXT:    dls lr, lr
113; CHECK-NEXT:  .LBB1_2: @ %vector.body
114; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
115; CHECK-NEXT:    and r5, r4, #15
116; CHECK-NEXT:    vmov q0, q1
117; CHECK-NEXT:    vctp.32 r12
118; CHECK-NEXT:    vpsttt
119; CHECK-NEXT:    vldrwt.u32 q1, [r1], #16
120; CHECK-NEXT:    vldrwt.u32 q2, [r3], #16
121; CHECK-NEXT:    vldrwt.u32 q3, [r2], #16
122; CHECK-NEXT:    vdup.32 q4, r5
123; CHECK-NEXT:    vpt.i32 eq, q4, zr
124; CHECK-NEXT:    vsubt.i32 q1, q3, q2
125; CHECK-NEXT:    vctp.32 r12
126; CHECK-NEXT:    vpst
127; CHECK-NEXT:    vldrwt.u32 q2, [r0], #16
128; CHECK-NEXT:    vmul.i32 q1, q1, q2
129; CHECK-NEXT:    adds r4, #4
130; CHECK-NEXT:    sub.w r12, r12, #4
131; CHECK-NEXT:    vadd.i32 q1, q1, q0
132; CHECK-NEXT:    le lr, .LBB1_2
133; CHECK-NEXT:  @ %bb.3: @ %middle.block
134; CHECK-NEXT:    vpsel q0, q1, q0
135; CHECK-NEXT:    vaddv.u32 r0, q0
136; CHECK-NEXT:    vpop {d8, d9}
137; CHECK-NEXT:    pop {r4, r5, r7, pc}
138; CHECK-NEXT:  .LBB1_4:
139; CHECK-NEXT:    movs r0, #0
140; CHECK-NEXT:    vpop {d8, d9}
141; CHECK-NEXT:    pop {r4, r5, r7, pc}
142                                         i32* noalias nocapture readonly %c, i32* noalias nocapture readonly %d, i32 %N) {
143entry:
; Skip the loop entirely (result 0) when %N is zero.
144  %cmp8 = icmp eq i32 %N, 0
145  br i1 %cmp8, label %for.cond.cleanup, label %vector.ph
146
147vector.ph:                                        ; preds = %entry
; %n.vec is %N rounded up to a multiple of 4 and is the loop exit bound.
; %broadcast.splat12 is only referenced by the commented-out compare in
; vector.body.
148  %n.rnd.up = add i32 %N, 3
149  %n.vec = and i32 %n.rnd.up, -4
150  %trip.count.minus.1 = add i32 %N, -1
151  %broadcast.splatinsert11 = insertelement <4 x i32> undef, i32 %trip.count.minus.1, i32 0
152  %broadcast.splat12 = shufflevector <4 x i32> %broadcast.splatinsert11, <4 x i32> undef, <4 x i32> zeroinitializer
153  br label %vector.body
154
155vector.body:                                      ; preds = %vector.body, %vector.ph
156  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
157  %vec.phi = phi <4 x i32> [ zeroinitializer, %vector.ph ], [ %add, %vector.body ]
158  %broadcast.splatinsert = insertelement <4 x i32> undef, i32 %index, i32 0
159  %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer
160  %induction = add <4 x i32> %broadcast.splat, <i32 0, i32 1, i32 2, i32 3>
161  %tmp = getelementptr inbounds i32, i32* %a, i32 %index
162
; Compare-based definition of %tmp1, left commented out; the intrinsic call
; below is the live definition of the lane mask.
163;  %tmp1 = icmp ule <4 x i32> %induction, %broadcast.splat12
164  %tmp1 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %N)
165
166  %tmp2 = bitcast i32* %tmp to <4 x i32>*
167  %wide.masked.load.a = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %tmp2, i32 4, <4 x i1> %tmp1, <4 x i32> undef)
168  %tmp3 = getelementptr inbounds i32, i32* %b, i32 %index
169  %tmp4 = bitcast i32* %tmp3 to <4 x i32>*
170  %wide.masked.load.b = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %tmp4, i32 4, <4 x i1> %tmp1, <4 x i32> undef)
171  %tmp5 = getelementptr inbounds i32, i32* %c, i32 %index
172  %tmp6 = bitcast i32* %tmp5 to <4 x i32>*
173  %wide.masked.load.c = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %tmp6, i32 4, <4 x i1> %tmp1, <4 x i32> undef)
174  %tmp7 = getelementptr inbounds i32, i32* %d, i32 %index
175  %tmp8 = bitcast i32* %tmp7 to <4 x i32>*
176  %wide.masked.load.d = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %tmp8, i32 4, <4 x i1> %tmp1, <4 x i32> undef)
177  %sub = sub <4 x i32> %wide.masked.load.c, %wide.masked.load.d
; %cmp is the same in every lane (splat of the scalar %index urem 16 compared
; against zero); it chooses between the c - d difference and the b values.
178  %rem = urem i32 %index, 16
179  %rem.broadcast.splatinsert = insertelement <4 x i32> undef, i32 %rem, i32 0
180  %rem.broadcast.splat = shufflevector <4 x i32> %rem.broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer
181  %cmp = icmp eq <4 x i32> %rem.broadcast.splat, <i32 0, i32 0, i32 0, i32 0>
182  %sel = select <4 x i1> %cmp, <4 x i32> %sub, <4 x i32> %wide.masked.load.b
183  %mul = mul  <4 x i32> %sel, %wide.masked.load.a
184  %add = add  <4 x i32> %mul, %vec.phi
185  %index.next = add i32 %index, 4
186  %cmp.exit = icmp eq i32 %index.next, %n.vec
187  br i1 %cmp.exit, label %middle.block, label %vector.body
188
189middle.block:                                     ; preds = %vector.body
; Uses the lane mask (%tmp1) computed in the final loop iteration to keep the
; previous accumulator value in lanes that were not active.
190  %acc = select <4 x i1> %tmp1, <4 x i32> %add, <4 x i32> %vec.phi
191  %reduce = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %acc)
192  br label %for.cond.cleanup
193
194for.cond.cleanup:                                 ; preds = %middle.block, %entry
195  %res.0.lcssa = phi i32 [ 0, %entry ], [ %reduce, %middle.block ]
196  ret i32 %res.0.lcssa
197}
198
; and_mul_reduce_add: reduction loop whose second pair of loads is predicated
; on a mask derived from data. a and b are loaded under the lane mask from
; @llvm.get.active.lane.mask(%index, %N); %mask is ((a - b) == 0) AND that lane
; mask, and c and d are loaded under %mask. c * d is accumulated into %vec.phi.
; middle.block selects with the plain lane mask (%tmp1), not %mask. In the
; expected assembly the predicate register is spilled inside the loop
; (vstr p0) and reloaded in middle.block (vldr p0) for the final vpsel.
; Note: the function signature continues after the autogenerated check lines.
199define dso_local i32 @and_mul_reduce_add(i32* noalias nocapture readonly %a, i32* noalias nocapture readonly %b,
200; CHECK-LABEL: and_mul_reduce_add:
201; CHECK:       @ %bb.0: @ %entry
202; CHECK-NEXT:    push {r4, lr}
203; CHECK-NEXT:    sub sp, #4
204; CHECK-NEXT:    ldr.w r12, [sp, #12]
205; CHECK-NEXT:    cmp.w r12, #0
206; CHECK-NEXT:    beq .LBB2_4
207; CHECK-NEXT:  @ %bb.1: @ %vector.ph
208; CHECK-NEXT:    add.w r4, r12, #3
209; CHECK-NEXT:    vmov.i32 q1, #0x0
210; CHECK-NEXT:    bic r4, r4, #3
211; CHECK-NEXT:    sub.w lr, r4, #4
212; CHECK-NEXT:    movs r4, #1
213; CHECK-NEXT:    add.w lr, r4, lr, lsr #2
214; CHECK-NEXT:    movs r4, #0
215; CHECK-NEXT:    dls lr, lr
216; CHECK-NEXT:  .LBB2_2: @ %vector.body
217; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
218; CHECK-NEXT:    vctp.32 r12
219; CHECK-NEXT:    vmov q0, q1
220; CHECK-NEXT:    vpstt
221; CHECK-NEXT:    vldrwt.u32 q1, [r1], #16
222; CHECK-NEXT:    vldrwt.u32 q2, [r0], #16
223; CHECK-NEXT:    vstr p0, [sp] @ 4-byte Spill
224; CHECK-NEXT:    vsub.i32 q1, q2, q1
225; CHECK-NEXT:    adds r4, #4
226; CHECK-NEXT:    vpsttt
227; CHECK-NEXT:    vcmpt.i32 eq, q1, zr
228; CHECK-NEXT:    vldrwt.u32 q1, [r3], #16
229; CHECK-NEXT:    vldrwt.u32 q2, [r2], #16
230; CHECK-NEXT:    sub.w r12, r12, #4
231; CHECK-NEXT:    vmul.i32 q1, q2, q1
232; CHECK-NEXT:    vadd.i32 q1, q1, q0
233; CHECK-NEXT:    le lr, .LBB2_2
234; CHECK-NEXT:  @ %bb.3: @ %middle.block
235; CHECK-NEXT:    vldr p0, [sp] @ 4-byte Reload
236; CHECK-NEXT:    vpsel q0, q1, q0
237; CHECK-NEXT:    vaddv.u32 r0, q0
238; CHECK-NEXT:    add sp, #4
239; CHECK-NEXT:    pop {r4, pc}
240; CHECK-NEXT:  .LBB2_4:
241; CHECK-NEXT:    movs r0, #0
242; CHECK-NEXT:    add sp, #4
243; CHECK-NEXT:    pop {r4, pc}
244                                         i32* noalias nocapture readonly %c, i32* noalias nocapture readonly %d, i32 %N) {
245entry:
; Skip the loop entirely (result 0) when %N is zero.
246  %cmp8 = icmp eq i32 %N, 0
247  br i1 %cmp8, label %for.cond.cleanup, label %vector.ph
248
249vector.ph:                                        ; preds = %entry
; %n.vec is %N rounded up to a multiple of 4 and is the loop exit bound.
; %broadcast.splat12 is only referenced by the commented-out compare in
; vector.body.
250  %n.rnd.up = add i32 %N, 3
251  %n.vec = and i32 %n.rnd.up, -4
252  %trip.count.minus.1 = add i32 %N, -1
253  %broadcast.splatinsert11 = insertelement <4 x i32> undef, i32 %trip.count.minus.1, i32 0
254  %broadcast.splat12 = shufflevector <4 x i32> %broadcast.splatinsert11, <4 x i32> undef, <4 x i32> zeroinitializer
255  br label %vector.body
256
257vector.body:                                      ; preds = %vector.body, %vector.ph
258  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
259  %vec.phi = phi <4 x i32> [ zeroinitializer, %vector.ph ], [ %add, %vector.body ]
260  %broadcast.splatinsert = insertelement <4 x i32> undef, i32 %index, i32 0
261  %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer
262  %induction = add <4 x i32> %broadcast.splat, <i32 0, i32 1, i32 2, i32 3>
263  %tmp = getelementptr inbounds i32, i32* %a, i32 %index
264
; Compare-based definition of %tmp1, left commented out; the intrinsic call
; below is the live definition of the lane mask.
265;  %tmp1 = icmp ule <4 x i32> %induction, %broadcast.splat12
266  %tmp1 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %N)
267
268  %tmp2 = bitcast i32* %tmp to <4 x i32>*
269  %wide.masked.load.a = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %tmp2, i32 4, <4 x i1> %tmp1, <4 x i32> undef)
270  %tmp3 = getelementptr inbounds i32, i32* %b, i32 %index
271  %tmp4 = bitcast i32* %tmp3 to <4 x i32>*
272  %wide.masked.load.b = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %tmp4, i32 4, <4 x i1> %tmp1, <4 x i32> undef)
; %mask combines the per-lane data condition (a - b == 0) with the lane mask
; using AND; the c and d loads below are predicated on %mask.
273  %sub = sub <4 x i32> %wide.masked.load.a, %wide.masked.load.b
274  %cmp = icmp eq <4 x i32> %sub, <i32 0, i32 0, i32 0, i32 0>
275  %mask = and <4 x i1> %cmp, %tmp1
276  %tmp5 = getelementptr inbounds i32, i32* %c, i32 %index
277  %tmp6 = bitcast i32* %tmp5 to <4 x i32>*
278  %wide.masked.load.c = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %tmp6, i32 4, <4 x i1> %mask, <4 x i32> undef)
279  %tmp7 = getelementptr inbounds i32, i32* %d, i32 %index
280  %tmp8 = bitcast i32* %tmp7 to <4 x i32>*
281  %wide.masked.load.d = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %tmp8, i32 4, <4 x i1> %mask, <4 x i32> undef)
282  %mul = mul  <4 x i32> %wide.masked.load.c, %wide.masked.load.d
283  %add = add  <4 x i32> %mul, %vec.phi
284  %index.next = add i32 %index, 4
285  %cmp.exit = icmp eq i32 %index.next, %n.vec
286  br i1 %cmp.exit, label %middle.block, label %vector.body
287
288middle.block:                                     ; preds = %vector.body
; The select uses the lane mask %tmp1 from the final iteration, not %mask.
289  %acc = select <4 x i1> %tmp1, <4 x i32> %add, <4 x i32> %vec.phi
290  %reduce = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %acc)
291  br label %for.cond.cleanup
292
293for.cond.cleanup:                                 ; preds = %middle.block, %entry
294  %res.0.lcssa = phi i32 [ 0, %entry ], [ %reduce, %middle.block ]
295  ret i32 %res.0.lcssa
296}
297
; or_mul_reduce_add: reduction loop in which the data-dependent condition is
; combined with the lane mask using OR rather than AND. a and b are loaded
; under the lane mask from @llvm.get.active.lane.mask(%index, %N); %mask is
; ((a - b) == 0) OR that lane mask, and c and d are loaded under %mask. c * d is
; accumulated into %vec.phi. middle.block selects with the plain lane mask
; (%tmp1). The expected assembly forms the predicate with vpnot followed by a
; vpstee block, and spills/reloads p0 around the loop for the final vpsel.
298define dso_local i32 @or_mul_reduce_add(i32* noalias nocapture readonly %a, i32* noalias nocapture readonly %b, i32* noalias nocapture readonly %c, i32* noalias nocapture readonly %d, i32 %N) {
299; CHECK-LABEL: or_mul_reduce_add:
300; CHECK:       @ %bb.0: @ %entry
301; CHECK-NEXT:    push {r4, lr}
302; CHECK-NEXT:    sub sp, #4
303; CHECK-NEXT:    ldr.w r12, [sp, #12]
304; CHECK-NEXT:    cmp.w r12, #0
305; CHECK-NEXT:    beq .LBB3_4
306; CHECK-NEXT:  @ %bb.1: @ %vector.ph
307; CHECK-NEXT:    add.w r4, r12, #3
308; CHECK-NEXT:    vmov.i32 q1, #0x0
309; CHECK-NEXT:    bic r4, r4, #3
310; CHECK-NEXT:    sub.w lr, r4, #4
311; CHECK-NEXT:    movs r4, #1
312; CHECK-NEXT:    add.w lr, r4, lr, lsr #2
313; CHECK-NEXT:    movs r4, #0
314; CHECK-NEXT:    dls lr, lr
315; CHECK-NEXT:  .LBB3_2: @ %vector.body
316; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
317; CHECK-NEXT:    vctp.32 r12
318; CHECK-NEXT:    vmov q0, q1
319; CHECK-NEXT:    vpstt
320; CHECK-NEXT:    vldrwt.u32 q1, [r1], #16
321; CHECK-NEXT:    vldrwt.u32 q2, [r0], #16
322; CHECK-NEXT:    vstr p0, [sp] @ 4-byte Spill
323; CHECK-NEXT:    vsub.i32 q1, q2, q1
324; CHECK-NEXT:    vpnot
325; CHECK-NEXT:    vpstee
326; CHECK-NEXT:    vcmpt.i32 ne, q1, zr
327; CHECK-NEXT:    vldrwe.u32 q1, [r3], #16
328; CHECK-NEXT:    vldrwe.u32 q2, [r2], #16
329; CHECK-NEXT:    adds r4, #4
330; CHECK-NEXT:    vmul.i32 q1, q2, q1
331; CHECK-NEXT:    sub.w r12, r12, #4
332; CHECK-NEXT:    vadd.i32 q1, q1, q0
333; CHECK-NEXT:    le lr, .LBB3_2
334; CHECK-NEXT:  @ %bb.3: @ %middle.block
335; CHECK-NEXT:    vldr p0, [sp] @ 4-byte Reload
336; CHECK-NEXT:    vpsel q0, q1, q0
337; CHECK-NEXT:    vaddv.u32 r0, q0
338; CHECK-NEXT:    add sp, #4
339; CHECK-NEXT:    pop {r4, pc}
340; CHECK-NEXT:  .LBB3_4:
341; CHECK-NEXT:    movs r0, #0
342; CHECK-NEXT:    add sp, #4
343; CHECK-NEXT:    pop {r4, pc}
344entry:
; Skip the loop entirely (result 0) when %N is zero.
345  %cmp8 = icmp eq i32 %N, 0
346  br i1 %cmp8, label %for.cond.cleanup, label %vector.ph
347
348vector.ph:                                        ; preds = %entry
; %n.vec is %N rounded up to a multiple of 4 and is the loop exit bound.
; %broadcast.splat12 is only referenced by the commented-out compare in
; vector.body.
349  %n.rnd.up = add i32 %N, 3
350  %n.vec = and i32 %n.rnd.up, -4
351  %trip.count.minus.1 = add i32 %N, -1
352  %broadcast.splatinsert11 = insertelement <4 x i32> undef, i32 %trip.count.minus.1, i32 0
353  %broadcast.splat12 = shufflevector <4 x i32> %broadcast.splatinsert11, <4 x i32> undef, <4 x i32> zeroinitializer
354  br label %vector.body
355
356vector.body:                                      ; preds = %vector.body, %vector.ph
357  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
358  %vec.phi = phi <4 x i32> [ zeroinitializer, %vector.ph ], [ %add, %vector.body ]
359  %broadcast.splatinsert = insertelement <4 x i32> undef, i32 %index, i32 0
360  %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer
361  %induction = add <4 x i32> %broadcast.splat, <i32 0, i32 1, i32 2, i32 3>
362  %tmp = getelementptr inbounds i32, i32* %a, i32 %index
363
; Compare-based definition of %tmp1, left commented out; the intrinsic call
; below is the live definition of the lane mask.
364;  %tmp1 = icmp ule <4 x i32> %induction, %broadcast.splat12
365  %tmp1 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %N)
366
367  %tmp2 = bitcast i32* %tmp to <4 x i32>*
368  %wide.masked.load.a = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %tmp2, i32 4, <4 x i1> %tmp1, <4 x i32> undef)
369  %tmp3 = getelementptr inbounds i32, i32* %b, i32 %index
370  %tmp4 = bitcast i32* %tmp3 to <4 x i32>*
371  %wide.masked.load.b = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %tmp4, i32 4, <4 x i1> %tmp1, <4 x i32> undef)
; %mask combines the per-lane data condition (a - b == 0) with the lane mask
; using OR; the c and d loads below are predicated on %mask.
372  %sub = sub <4 x i32> %wide.masked.load.a, %wide.masked.load.b
373  %cmp = icmp eq <4 x i32> %sub, <i32 0, i32 0, i32 0, i32 0>
374  %mask = or <4 x i1> %cmp, %tmp1
375  %tmp5 = getelementptr inbounds i32, i32* %c, i32 %index
376  %tmp6 = bitcast i32* %tmp5 to <4 x i32>*
377  %wide.masked.load.c = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %tmp6, i32 4, <4 x i1> %mask, <4 x i32> undef)
378  %tmp7 = getelementptr inbounds i32, i32* %d, i32 %index
379  %tmp8 = bitcast i32* %tmp7 to <4 x i32>*
380  %wide.masked.load.d = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %tmp8, i32 4, <4 x i1> %mask, <4 x i32> undef)
381  %mul = mul  <4 x i32> %wide.masked.load.c, %wide.masked.load.d
382  %add = add  <4 x i32> %mul, %vec.phi
383  %index.next = add i32 %index, 4
384  %cmp.exit = icmp eq i32 %index.next, %n.vec
385  br i1 %cmp.exit, label %middle.block, label %vector.body
386
387middle.block:                                     ; preds = %vector.body
; The select uses the lane mask %tmp1 from the final iteration, not %mask.
388  %acc = select <4 x i1> %tmp1, <4 x i32> %add, <4 x i32> %vec.phi
389  %reduce = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %acc)
390  br label %for.cond.cleanup
391
392for.cond.cleanup:                                 ; preds = %middle.block, %entry
393  %res.0.lcssa = phi i32 [ 0, %entry ], [ %reduce, %middle.block ]
394  ret i32 %res.0.lcssa
395}
396
; continue_on_zero: in-place update loop with no reduction. Each iteration
; loads four elements of %arg1 under the lane mask from
; @llvm.get.active.lane.mask(%tmp10, %arg2), builds %tmp20 as
; (loaded value != 0) AND that lane mask, then loads %arg, multiplies it by the
; %arg1 values and stores the product back to %arg, with both the load and the
; store predicated on %tmp20. The expected assembly uses a dlstp.32/letp loop
; with a vpt block for the non-zero test.
397define dso_local void @continue_on_zero(i32* noalias nocapture %arg, i32* noalias nocapture readonly %arg1, i32 %arg2) {
398; CHECK-LABEL: continue_on_zero:
399; CHECK:       @ %bb.0: @ %bb
400; CHECK-NEXT:    push {r7, lr}
401; CHECK-NEXT:    cmp r2, #0
402; CHECK-NEXT:    it eq
403; CHECK-NEXT:    popeq {r7, pc}
404; CHECK-NEXT:  .LBB4_1: @ %bb3
405; CHECK-NEXT:    movs r3, #0
406; CHECK-NEXT:    dlstp.32 lr, r2
407; CHECK-NEXT:  .LBB4_2: @ %bb9
408; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
409; CHECK-NEXT:    adds r3, #4
410; CHECK-NEXT:    vldrw.u32 q0, [r1], #16
411; CHECK-NEXT:    vpt.i32 ne, q0, zr
412; CHECK-NEXT:    vldrwt.u32 q1, [r0]
413; CHECK-NEXT:    vmul.i32 q0, q1, q0
414; CHECK-NEXT:    vpst
415; CHECK-NEXT:    vstrwt.32 q0, [r0], #16
416; CHECK-NEXT:    letp lr, .LBB4_2
417; CHECK-NEXT:  @ %bb.3: @ %bb27
418; CHECK-NEXT:    pop {r7, pc}
419bb:
; Return immediately when the element count %arg2 is zero.
420  %tmp = icmp eq i32 %arg2, 0
421  br i1 %tmp, label %bb27, label %bb3
422
423bb3:                                              ; preds = %bb
; %tmp5 is %arg2 rounded up to a multiple of 4 and is the loop exit bound.
; %tmp8 is only referenced by the commented-out compare in bb9.
424  %tmp4 = add i32 %arg2, 3
425  %tmp5 = and i32 %tmp4, -4
426  %tmp6 = add i32 %arg2, -1
427  %tmp7 = insertelement <4 x i32> undef, i32 %tmp6, i32 0
428  %tmp8 = shufflevector <4 x i32> %tmp7, <4 x i32> undef, <4 x i32> zeroinitializer
429  br label %bb9
430
431bb9:                                              ; preds = %bb9, %bb3
432  %tmp10 = phi i32 [ 0, %bb3 ], [ %tmp25, %bb9 ]
433  %tmp11 = insertelement <4 x i32> undef, i32 %tmp10, i32 0
434  %tmp12 = shufflevector <4 x i32> %tmp11, <4 x i32> undef, <4 x i32> zeroinitializer
435  %tmp13 = add <4 x i32> %tmp12, <i32 0, i32 1, i32 2, i32 3>
436  %tmp14 = getelementptr inbounds i32, i32* %arg1, i32 %tmp10
437
; Compare-based definition of %tmp15, left commented out; the intrinsic call
; below is the live definition of the lane mask.
438  ; %tmp15 = icmp ule <4 x i32> %tmp13, %tmp8
439  %tmp15 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %tmp10, i32 %arg2)
440
441  %tmp16 = bitcast i32* %tmp14 to <4 x i32>*
442  %tmp17 = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %tmp16, i32 4, <4 x i1> %tmp15, <4 x i32> undef)
; %tmp20 = (arg1 element != 0) AND lane mask; it predicates both the load from
; %arg and the store back to %arg.
443  %tmp18 = icmp ne <4 x i32> %tmp17, zeroinitializer
444  %tmp19 = getelementptr inbounds i32, i32* %arg, i32 %tmp10
445  %tmp20 = and <4 x i1> %tmp18, %tmp15
446  %tmp21 = bitcast i32* %tmp19 to <4 x i32>*
447  %tmp22 = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %tmp21, i32 4, <4 x i1> %tmp20, <4 x i32> undef)
448  %tmp23 = mul nsw <4 x i32> %tmp22, %tmp17
449  %tmp24 = bitcast i32* %tmp19 to <4 x i32>*
450  call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> %tmp23, <4 x i32>* %tmp24, i32 4, <4 x i1> %tmp20)
451  %tmp25 = add i32 %tmp10, 4
452  %tmp26 = icmp eq i32 %tmp25, %tmp5
453  br i1 %tmp26, label %bb27, label %bb9
454
455bb27:                                             ; preds = %bb9, %bb
456  ret void
457}
458
; range_test: in-place update loop predicated on two data conditions. Each
; iteration loads four elements of %arg under the lane mask from
; @llvm.get.active.lane.mask(%tmp13, %arg3), then builds %tmp25 as
; (value != 0) AND (value sle splat(%arg2)) AND that lane mask. %arg1 is loaded
; under %tmp25, multiplied by the %arg values, and the product is stored back to
; %arg under %tmp25. The function uses the arm_aapcs_vfpcc calling convention.
; The expected assembly uses a dlstp.32/letp loop with a vptt block that covers
; the second compare and the %arg1 load.
459define dso_local arm_aapcs_vfpcc void @range_test(i32* noalias nocapture %arg, i32* noalias nocapture readonly %arg1, i32 %arg2, i32 %arg3) {
460; CHECK-LABEL: range_test:
461; CHECK:       @ %bb.0: @ %bb
462; CHECK-NEXT:    push {r7, lr}
463; CHECK-NEXT:    cmp r3, #0
464; CHECK-NEXT:    it eq
465; CHECK-NEXT:    popeq {r7, pc}
466; CHECK-NEXT:  .LBB5_1: @ %bb4
467; CHECK-NEXT:    mov.w r12, #0
468; CHECK-NEXT:    dlstp.32 lr, r3
469; CHECK-NEXT:  .LBB5_2: @ %bb12
470; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
471; CHECK-NEXT:    vldrw.u32 q0, [r0]
472; CHECK-NEXT:    vptt.i32 ne, q0, zr
473; CHECK-NEXT:    vcmpt.s32 le, q0, r2
474; CHECK-NEXT:    vldrwt.u32 q1, [r1], #16
475; CHECK-NEXT:    add.w r12, r12, #4
476; CHECK-NEXT:    vmul.i32 q0, q1, q0
477; CHECK-NEXT:    vpst
478; CHECK-NEXT:    vstrwt.32 q0, [r0], #16
479; CHECK-NEXT:    letp lr, .LBB5_2
480; CHECK-NEXT:  @ %bb.3: @ %bb32
481; CHECK-NEXT:    pop {r7, pc}
482bb:
; Return immediately when the element count %arg3 is zero.
483  %tmp = icmp eq i32 %arg3, 0
484  br i1 %tmp, label %bb32, label %bb4
485
486bb4:                                              ; preds = %bb
; %tmp6 is %arg3 rounded up to a multiple of 4 and is the loop exit bound.
; %tmp9 is only referenced by the commented-out compare in bb12.
; %tmp11 is a splat of %arg2, the upper bound for the signed range compare.
487  %tmp5 = add i32 %arg3, 3
488  %tmp6 = and i32 %tmp5, -4
489  %tmp7 = add i32 %arg3, -1
490  %tmp8 = insertelement <4 x i32> undef, i32 %tmp7, i32 0
491  %tmp9 = shufflevector <4 x i32> %tmp8, <4 x i32> undef, <4 x i32> zeroinitializer
492  %tmp10 = insertelement <4 x i32> undef, i32 %arg2, i32 0
493  %tmp11 = shufflevector <4 x i32> %tmp10, <4 x i32> undef, <4 x i32> zeroinitializer
494  br label %bb12
495
496bb12:                                             ; preds = %bb12, %bb4
497  %tmp13 = phi i32 [ 0, %bb4 ], [ %tmp30, %bb12 ]
498  %tmp14 = insertelement <4 x i32> undef, i32 %tmp13, i32 0
499  %tmp15 = shufflevector <4 x i32> %tmp14, <4 x i32> undef, <4 x i32> zeroinitializer
500  %tmp16 = add <4 x i32> %tmp15, <i32 0, i32 1, i32 2, i32 3>
501  %tmp17 = getelementptr inbounds i32, i32* %arg, i32 %tmp13
502
; Compare-based definition of %tmp18, left commented out; the intrinsic call
; below is the live definition of the lane mask.
503  ; %tmp18 = icmp ule <4 x i32> %tmp16, %tmp9
504  %tmp18= call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %tmp13, i32 %arg3)
505
506  %tmp19 = bitcast i32* %tmp17 to <4 x i32>*
507  %tmp20 = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %tmp19, i32 4, <4 x i1> %tmp18, <4 x i32> undef)
; %tmp25 = (value != 0) AND (value sle %arg2) AND lane mask; it predicates both
; the load from %arg1 and the store back to %arg.
508  %tmp21 = icmp ne <4 x i32> %tmp20, zeroinitializer
509  %tmp22 = icmp sle <4 x i32> %tmp20, %tmp11
510  %tmp23 = getelementptr inbounds i32, i32* %arg1, i32 %tmp13
511  %tmp24 = and <4 x i1> %tmp22, %tmp21
512  %tmp25 = and <4 x i1> %tmp24, %tmp18
513  %tmp26 = bitcast i32* %tmp23 to <4 x i32>*
514  %tmp27 = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %tmp26, i32 4, <4 x i1> %tmp25, <4 x i32> undef)
515  %tmp28 = mul nsw <4 x i32> %tmp27, %tmp20
516  %tmp29 = bitcast i32* %tmp17 to <4 x i32>*
517  call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> %tmp28, <4 x i32>* %tmp29, i32 4, <4 x i1> %tmp25)
518  %tmp30 = add i32 %tmp13, 4
519  %tmp31 = icmp eq i32 %tmp30, %tmp6
520  br i1 %tmp31, label %bb32, label %bb12
521
522bb32:                                             ; preds = %bb12, %bb
523  ret void
524}
525
526; Function Attrs: argmemonly nounwind readonly willreturn
; Masked <4 x i32> load and store intrinsics; the functions above pass their
; <4 x i1> lane/data masks as the mask operand and undef as the load passthru.
527declare <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>*, i32 immarg, <4 x i1>, <4 x i32>)
528declare void @llvm.masked.store.v4i32.p0v4i32(<4 x i32>, <4 x i32>*, i32, <4 x i1>)
529
530; Function Attrs: nounwind readnone willreturn
; Add reduction from <4 x i32> to i32, called from the middle.block of the
; reduction tests above.
531declare i32 @llvm.vector.reduce.add.v4i32(<4 x i32>)
532
; Produces the <4 x i1> lane mask; the loops above call it with the current
; induction value and the element count.
533declare <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32, i32)
534