; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=thumbv8.1m.main -mattr=+mve -tail-predication=enabled %s -o - | FileCheck %s

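; The i8 add/add reduction keeps its select and vaddv inside the vector body;
; the generated code is expected to use a tail-predicated loop (dlstp.8/letp).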
define dso_local arm_aapcs_vfpcc zeroext i8 @one_loop_add_add_v16i8(i8* nocapture readonly %a, i8* nocapture readonly %b, i32 %N) local_unnamed_addr {
; CHECK-LABEL: one_loop_add_add_v16i8:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    push {r7, lr}
; CHECK-NEXT:    cbz r2, .LBB0_4
; CHECK-NEXT:  @ %bb.1: @ %vector.ph
; CHECK-NEXT:    vmov.i32 q0, #0x0
; CHECK-NEXT:    dlstp.8 lr, r2
; CHECK-NEXT:  .LBB0_2: @ %vector.body
; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vldrb.u8 q1, [r1], #16
; CHECK-NEXT:    vldrb.u8 q2, [r0], #16
; CHECK-NEXT:    vadd.i8 q0, q2, q1
; CHECK-NEXT:    vaddv.u8 r12, q0
; CHECK-NEXT:    letp lr, .LBB0_2
; CHECK-NEXT:  @ %bb.3: @ %for.cond.cleanup
; CHECK-NEXT:    uxtb.w r0, r12
; CHECK-NEXT:    pop {r7, pc}
; CHECK-NEXT:  .LBB0_4:
; CHECK-NEXT:    mov.w r12, #0
; CHECK-NEXT:    uxtb.w r0, r12
; CHECK-NEXT:    pop {r7, pc}
entry:
  %cmp11 = icmp eq i32 %N, 0
  br i1 %cmp11, label %for.cond.cleanup, label %vector.ph

vector.ph:                                        ; preds = %entry
  %n.rnd.up = add i32 %N, 15
  %n.vec = and i32 %n.rnd.up, -16
  %trip.count.minus.1 = add i32 %N, -1
  br label %vector.body

vector.body:                                      ; preds = %vector.body, %vector.ph
  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
  %vec.phi = phi <16 x i8> [ zeroinitializer, %vector.ph ], [ %i5, %vector.body ]
  %i = getelementptr inbounds i8, i8* %a, i32 %index
  %active.lane.mask = call <16 x i1> @llvm.get.active.lane.mask.v16i1.i32(i32 %index, i32 %N)
  %i1 = bitcast i8* %i to <16 x i8>*
  %wide.masked.load = call <16 x i8> @llvm.masked.load.v16i8.p0v16i8(<16 x i8>* %i1, i32 1, <16 x i1> %active.lane.mask, <16 x i8> undef)
  %i2 = getelementptr inbounds i8, i8* %b, i32 %index
  %i3 = bitcast i8* %i2 to <16 x i8>*
  %wide.masked.load16 = call <16 x i8> @llvm.masked.load.v16i8.p0v16i8(<16 x i8>* %i3, i32 1, <16 x i1> %active.lane.mask, <16 x i8> undef)
  %i4 = add <16 x i8> %wide.masked.load, %wide.masked.load16
  %i5 = select <16 x i1> %active.lane.mask, <16 x i8> %i4, <16 x i8> %vec.phi
  %i6 = call i8 @llvm.vector.reduce.add.v16i8(<16 x i8> %i5)
  %index.next = add i32 %index, 16
  %i7 = icmp eq i32 %index.next, %n.vec
  br i1 %i7, label %middle.block, label %vector.body

middle.block:                                     ; preds = %vector.body
  br label %for.cond.cleanup

for.cond.cleanup:                                 ; preds = %middle.block, %entry
  %res.0.lcssa = phi i8 [ 0, %entry ], [ %i6, %middle.block ]
  ret i8 %res.0.lcssa
}

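; The same add/add reduction widened to an i16 accumulator via zext loads; the
; reduction is completed in the middle block, and the checks expect an explicit
; vctp.16 predicated body inside a plain dls/le low-overhead loop.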
define dso_local arm_aapcs_vfpcc signext i16 @one_loop_add_add_v8i16(i8* nocapture readonly %a, i8* nocapture readonly %b, i32 %N) local_unnamed_addr {
; CHECK-LABEL: one_loop_add_add_v8i16:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    cmp r2, #0
; CHECK-NEXT:    ittt eq
; CHECK-NEXT:    moveq r0, #0
; CHECK-NEXT:    sxtheq r0, r0
; CHECK-NEXT:    bxeq lr
; CHECK-NEXT:  .LBB1_1: @ %vector.ph
; CHECK-NEXT:    push {r7, lr}
; CHECK-NEXT:    adds r3, r2, #7
; CHECK-NEXT:    vmov.i32 q0, #0x0
; CHECK-NEXT:    bic r3, r3, #7
; CHECK-NEXT:    sub.w r12, r3, #8
; CHECK-NEXT:    movs r3, #1
; CHECK-NEXT:    add.w lr, r3, r12, lsr #3
; CHECK-NEXT:    dls lr, lr
; CHECK-NEXT:  .LBB1_2: @ %vector.body
; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vctp.16 r2
; CHECK-NEXT:    vmov q1, q0
; CHECK-NEXT:    vpst
; CHECK-NEXT:    vldrbt.u16 q0, [r0], #8
; CHECK-NEXT:    subs r2, #8
; CHECK-NEXT:    vadd.i16 q0, q1, q0
; CHECK-NEXT:    vpst
; CHECK-NEXT:    vldrbt.u16 q2, [r1], #8
; CHECK-NEXT:    vadd.i16 q0, q0, q2
; CHECK-NEXT:    le lr, .LBB1_2
; CHECK-NEXT:  @ %bb.3: @ %middle.block
; CHECK-NEXT:    vpsel q0, q0, q1
; CHECK-NEXT:    vaddv.u16 r0, q0
; CHECK-NEXT:    pop.w {r7, lr}
; CHECK-NEXT:    sxth r0, r0
; CHECK-NEXT:    bx lr
entry:
  %cmp12 = icmp eq i32 %N, 0
  br i1 %cmp12, label %for.cond.cleanup, label %vector.ph

vector.ph:                                        ; preds = %entry
  %n.rnd.up = add i32 %N, 7
  %n.vec = and i32 %n.rnd.up, -8
  %trip.count.minus.1 = add i32 %N, -1
  br label %vector.body

vector.body:                                      ; preds = %vector.body, %vector.ph
  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
  %vec.phi = phi <8 x i16> [ zeroinitializer, %vector.ph ], [ %i7, %vector.body ]
  %i = getelementptr inbounds i8, i8* %a, i32 %index
  %active.lane.mask = call <8 x i1> @llvm.get.active.lane.mask.v8i1.i32(i32 %index, i32 %N)
  %i1 = bitcast i8* %i to <8 x i8>*
  %wide.masked.load = call <8 x i8> @llvm.masked.load.v8i8.p0v8i8(<8 x i8>* %i1, i32 1, <8 x i1> %active.lane.mask, <8 x i8> undef)
  %i2 = zext <8 x i8> %wide.masked.load to <8 x i16>
  %i3 = getelementptr inbounds i8, i8* %b, i32 %index
  %i4 = bitcast i8* %i3 to <8 x i8>*
  %wide.masked.load17 = call <8 x i8> @llvm.masked.load.v8i8.p0v8i8(<8 x i8>* %i4, i32 1, <8 x i1> %active.lane.mask, <8 x i8> undef)
  %i5 = zext <8 x i8> %wide.masked.load17 to <8 x i16>
  %i6 = add <8 x i16> %vec.phi, %i2
  %i7 = add <8 x i16> %i6, %i5
  %index.next = add i32 %index, 8
  %i8 = icmp eq i32 %index.next, %n.vec
  br i1 %i8, label %middle.block, label %vector.body

middle.block:                                     ; preds = %vector.body
  %i9 = select <8 x i1> %active.lane.mask, <8 x i16> %i7, <8 x i16> %vec.phi
  %i10 = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %i9)
  br label %for.cond.cleanup

for.cond.cleanup:                                 ; preds = %middle.block, %entry
  %res.0.lcssa = phi i16 [ 0, %entry ], [ %i10, %middle.block ]
  ret i16 %res.0.lcssa
}

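; An i8 sub feeding the add reduction; the checks expect a vctp.8 predicated
; body with dls/le rather than dlstp/letp.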
define dso_local arm_aapcs_vfpcc zeroext i8 @one_loop_sub_add_v16i8(i8* nocapture readonly %a, i8* nocapture readonly %b, i32 %N) local_unnamed_addr {
; CHECK-LABEL: one_loop_sub_add_v16i8:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    cmp r2, #0
; CHECK-NEXT:    ittt eq
; CHECK-NEXT:    moveq r0, #0
; CHECK-NEXT:    uxtbeq r0, r0
; CHECK-NEXT:    bxeq lr
; CHECK-NEXT:  .LBB2_1: @ %vector.ph
; CHECK-NEXT:    push {r7, lr}
; CHECK-NEXT:    add.w r3, r2, #15
; CHECK-NEXT:    vmov.i32 q0, #0x0
; CHECK-NEXT:    bic r3, r3, #15
; CHECK-NEXT:    sub.w r12, r3, #16
; CHECK-NEXT:    movs r3, #1
; CHECK-NEXT:    add.w lr, r3, r12, lsr #4
; CHECK-NEXT:    dls lr, lr
; CHECK-NEXT:  .LBB2_2: @ %vector.body
; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vctp.8 r2
; CHECK-NEXT:    vmov q1, q0
; CHECK-NEXT:    vpstt
; CHECK-NEXT:    vldrbt.u8 q0, [r1], #16
; CHECK-NEXT:    vldrbt.u8 q2, [r0], #16
; CHECK-NEXT:    subs r2, #16
; CHECK-NEXT:    vsub.i8 q0, q2, q0
; CHECK-NEXT:    vadd.i8 q0, q0, q1
; CHECK-NEXT:    le lr, .LBB2_2
; CHECK-NEXT:  @ %bb.3: @ %middle.block
; CHECK-NEXT:    vpsel q0, q0, q1
; CHECK-NEXT:    vaddv.u8 r0, q0
; CHECK-NEXT:    pop.w {r7, lr}
; CHECK-NEXT:    uxtb r0, r0
; CHECK-NEXT:    bx lr
entry:
  %cmp11 = icmp eq i32 %N, 0
  br i1 %cmp11, label %for.cond.cleanup, label %vector.ph

vector.ph:                                        ; preds = %entry
  %n.rnd.up = add i32 %N, 15
  %n.vec = and i32 %n.rnd.up, -16
  %trip.count.minus.1 = add i32 %N, -1
  br label %vector.body

vector.body:                                      ; preds = %vector.body, %vector.ph
  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
  %vec.phi = phi <16 x i8> [ zeroinitializer, %vector.ph ], [ %i5, %vector.body ]
  %i = getelementptr inbounds i8, i8* %a, i32 %index
  %active.lane.mask = call <16 x i1> @llvm.get.active.lane.mask.v16i1.i32(i32 %index, i32 %N)
  %i1 = bitcast i8* %i to <16 x i8>*
  %wide.masked.load = call <16 x i8> @llvm.masked.load.v16i8.p0v16i8(<16 x i8>* %i1, i32 1, <16 x i1> %active.lane.mask, <16 x i8> undef)
  %i2 = getelementptr inbounds i8, i8* %b, i32 %index
  %i3 = bitcast i8* %i2 to <16 x i8>*
  %wide.masked.load16 = call <16 x i8> @llvm.masked.load.v16i8.p0v16i8(<16 x i8>* %i3, i32 1, <16 x i1> %active.lane.mask, <16 x i8> undef)
  %i4 = sub <16 x i8> %wide.masked.load, %wide.masked.load16
  %i5 = add <16 x i8> %i4, %vec.phi
  %index.next = add i32 %index, 16
  %i6 = icmp eq i32 %index.next, %n.vec
  br i1 %i6, label %middle.block, label %vector.body

middle.block:                                     ; preds = %vector.body
  %i7 = select <16 x i1> %active.lane.mask, <16 x i8> %i5, <16 x i8> %vec.phi
  %i8 = call i8 @llvm.vector.reduce.add.v16i8(<16 x i8> %i7)
  br label %for.cond.cleanup

for.cond.cleanup:                                 ; preds = %middle.block, %entry
  %res.0.lcssa = phi i8 [ 0, %entry ], [ %i8, %middle.block ]
  ret i8 %res.0.lcssa
}

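; The i16 (zext-widened) variant of the sub/add reduction; again an explicit
; vctp.16 body with dls/le is expected.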
define dso_local arm_aapcs_vfpcc signext i16 @one_loop_sub_add_v8i16(i8* nocapture readonly %a, i8* nocapture readonly %b, i32 %N) local_unnamed_addr {
; CHECK-LABEL: one_loop_sub_add_v8i16:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    cmp r2, #0
; CHECK-NEXT:    ittt eq
; CHECK-NEXT:    moveq r0, #0
; CHECK-NEXT:    sxtheq r0, r0
; CHECK-NEXT:    bxeq lr
; CHECK-NEXT:  .LBB3_1: @ %vector.ph
; CHECK-NEXT:    push {r7, lr}
; CHECK-NEXT:    adds r3, r2, #7
; CHECK-NEXT:    vmov.i32 q0, #0x0
; CHECK-NEXT:    bic r3, r3, #7
; CHECK-NEXT:    sub.w r12, r3, #8
; CHECK-NEXT:    movs r3, #1
; CHECK-NEXT:    add.w lr, r3, r12, lsr #3
; CHECK-NEXT:    dls lr, lr
; CHECK-NEXT:  .LBB3_2: @ %vector.body
; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vctp.16 r2
; CHECK-NEXT:    vmov q1, q0
; CHECK-NEXT:    vpstt
; CHECK-NEXT:    vldrbt.u16 q0, [r0], #8
; CHECK-NEXT:    vldrbt.u16 q2, [r1], #8
; CHECK-NEXT:    subs r2, #8
; CHECK-NEXT:    vsub.i16 q0, q2, q0
; CHECK-NEXT:    vadd.i16 q0, q0, q1
; CHECK-NEXT:    le lr, .LBB3_2
; CHECK-NEXT:  @ %bb.3: @ %middle.block
; CHECK-NEXT:    vpsel q0, q0, q1
; CHECK-NEXT:    vaddv.u16 r0, q0
; CHECK-NEXT:    pop.w {r7, lr}
; CHECK-NEXT:    sxth r0, r0
; CHECK-NEXT:    bx lr
entry:
  %cmp12 = icmp eq i32 %N, 0
  br i1 %cmp12, label %for.cond.cleanup, label %vector.ph

vector.ph:                                        ; preds = %entry
  %n.rnd.up = add i32 %N, 7
  %n.vec = and i32 %n.rnd.up, -8
  %trip.count.minus.1 = add i32 %N, -1
  br label %vector.body

vector.body:                                      ; preds = %vector.body, %vector.ph
  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
  %vec.phi = phi <8 x i16> [ zeroinitializer, %vector.ph ], [ %i7, %vector.body ]
  %i = getelementptr inbounds i8, i8* %a, i32 %index
  %active.lane.mask = call <8 x i1> @llvm.get.active.lane.mask.v8i1.i32(i32 %index, i32 %N)
  %i1 = bitcast i8* %i to <8 x i8>*
  %wide.masked.load = call <8 x i8> @llvm.masked.load.v8i8.p0v8i8(<8 x i8>* %i1, i32 1, <8 x i1> %active.lane.mask, <8 x i8> undef)
  %i2 = zext <8 x i8> %wide.masked.load to <8 x i16>
  %i3 = getelementptr inbounds i8, i8* %b, i32 %index
  %i4 = bitcast i8* %i3 to <8 x i8>*
  %wide.masked.load17 = call <8 x i8> @llvm.masked.load.v8i8.p0v8i8(<8 x i8>* %i4, i32 1, <8 x i1> %active.lane.mask, <8 x i8> undef)
  %i5 = zext <8 x i8> %wide.masked.load17 to <8 x i16>
  %i6 = sub <8 x i16> %i5, %i2
  %i7 = add <8 x i16> %i6, %vec.phi
  %index.next = add i32 %index, 8
  %i8 = icmp eq i32 %index.next, %n.vec
  br i1 %i8, label %middle.block, label %vector.body

middle.block:                                     ; preds = %vector.body
  %i9 = select <8 x i1> %active.lane.mask, <8 x i16> %i7, <8 x i16> %vec.phi
  %i10 = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %i9)
  br label %for.cond.cleanup

for.cond.cleanup:                                 ; preds = %middle.block, %entry
  %res.0.lcssa = phi i16 [ 0, %entry ], [ %i10, %middle.block ]
  ret i16 %res.0.lcssa
}

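; An i8 mul feeding the add reduction; the checks expect a vctp.8 body with
; dls/le.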
define dso_local arm_aapcs_vfpcc zeroext i8 @one_loop_mul_add_v16i8(i8* nocapture readonly %a, i8* nocapture readonly %b, i32 %N) local_unnamed_addr {
; CHECK-LABEL: one_loop_mul_add_v16i8:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    cmp r2, #0
; CHECK-NEXT:    ittt eq
; CHECK-NEXT:    moveq r0, #0
; CHECK-NEXT:    uxtbeq r0, r0
; CHECK-NEXT:    bxeq lr
; CHECK-NEXT:  .LBB4_1: @ %vector.ph
; CHECK-NEXT:    push {r7, lr}
; CHECK-NEXT:    add.w r3, r2, #15
; CHECK-NEXT:    vmov.i32 q0, #0x0
; CHECK-NEXT:    bic r3, r3, #15
; CHECK-NEXT:    sub.w r12, r3, #16
; CHECK-NEXT:    movs r3, #1
; CHECK-NEXT:    add.w lr, r3, r12, lsr #4
; CHECK-NEXT:    dls lr, lr
; CHECK-NEXT:  .LBB4_2: @ %vector.body
; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vctp.8 r2
; CHECK-NEXT:    vmov q1, q0
; CHECK-NEXT:    vpstt
; CHECK-NEXT:    vldrbt.u8 q0, [r0], #16
; CHECK-NEXT:    vldrbt.u8 q2, [r1], #16
; CHECK-NEXT:    subs r2, #16
; CHECK-NEXT:    vmul.i8 q0, q2, q0
; CHECK-NEXT:    vadd.i8 q0, q0, q1
; CHECK-NEXT:    le lr, .LBB4_2
; CHECK-NEXT:  @ %bb.3: @ %middle.block
; CHECK-NEXT:    vpsel q0, q0, q1
; CHECK-NEXT:    vaddv.u8 r0, q0
; CHECK-NEXT:    pop.w {r7, lr}
; CHECK-NEXT:    uxtb r0, r0
; CHECK-NEXT:    bx lr
entry:
  %cmp10 = icmp eq i32 %N, 0
  br i1 %cmp10, label %for.cond.cleanup, label %vector.ph

vector.ph:                                        ; preds = %entry
  %n.rnd.up = add i32 %N, 15
  %n.vec = and i32 %n.rnd.up, -16
  %trip.count.minus.1 = add i32 %N, -1
  br label %vector.body

vector.body:                                      ; preds = %vector.body, %vector.ph
  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
  %vec.phi = phi <16 x i8> [ zeroinitializer, %vector.ph ], [ %i5, %vector.body ]
  %i = getelementptr inbounds i8, i8* %a, i32 %index
  %active.lane.mask = call <16 x i1> @llvm.get.active.lane.mask.v16i1.i32(i32 %index, i32 %N)
  %i1 = bitcast i8* %i to <16 x i8>*
  %wide.masked.load = call <16 x i8> @llvm.masked.load.v16i8.p0v16i8(<16 x i8>* %i1, i32 1, <16 x i1> %active.lane.mask, <16 x i8> undef)
  %i2 = getelementptr inbounds i8, i8* %b, i32 %index
  %i3 = bitcast i8* %i2 to <16 x i8>*
  %wide.masked.load15 = call <16 x i8> @llvm.masked.load.v16i8.p0v16i8(<16 x i8>* %i3, i32 1, <16 x i1> %active.lane.mask, <16 x i8> undef)
  %i4 = mul <16 x i8> %wide.masked.load15, %wide.masked.load
  %i5 = add <16 x i8> %i4, %vec.phi
  %index.next = add i32 %index, 16
  %i6 = icmp eq i32 %index.next, %n.vec
  br i1 %i6, label %middle.block, label %vector.body

middle.block:                                     ; preds = %vector.body
  %i7 = select <16 x i1> %active.lane.mask, <16 x i8> %i5, <16 x i8> %vec.phi
  %i8 = call i8 @llvm.vector.reduce.add.v16i8(<16 x i8> %i7)
  br label %for.cond.cleanup

for.cond.cleanup:                                 ; preds = %middle.block, %entry
  %res.0.lcssa = phi i8 [ 0, %entry ], [ %i8, %middle.block ]
  ret i8 %res.0.lcssa
}

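; The i16 (zext-widened) variant of the mul/add reduction; a vctp.16 body with
; dls/le is expected.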
define dso_local arm_aapcs_vfpcc signext i16 @one_loop_mul_add_v8i16(i8* nocapture readonly %a, i8* nocapture readonly %b, i32 %N) local_unnamed_addr {
; CHECK-LABEL: one_loop_mul_add_v8i16:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    cmp r2, #0
; CHECK-NEXT:    ittt eq
; CHECK-NEXT:    moveq r0, #0
; CHECK-NEXT:    sxtheq r0, r0
; CHECK-NEXT:    bxeq lr
; CHECK-NEXT:  .LBB5_1: @ %vector.ph
; CHECK-NEXT:    push {r7, lr}
; CHECK-NEXT:    adds r3, r2, #7
; CHECK-NEXT:    vmov.i32 q0, #0x0
; CHECK-NEXT:    bic r3, r3, #7
; CHECK-NEXT:    sub.w r12, r3, #8
; CHECK-NEXT:    movs r3, #1
; CHECK-NEXT:    add.w lr, r3, r12, lsr #3
; CHECK-NEXT:    dls lr, lr
; CHECK-NEXT:  .LBB5_2: @ %vector.body
; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vctp.16 r2
; CHECK-NEXT:    vmov q1, q0
; CHECK-NEXT:    vpstt
; CHECK-NEXT:    vldrbt.u16 q0, [r0], #8
; CHECK-NEXT:    vldrbt.u16 q2, [r1], #8
; CHECK-NEXT:    subs r2, #8
; CHECK-NEXT:    vmul.i16 q0, q2, q0
; CHECK-NEXT:    vadd.i16 q0, q0, q1
; CHECK-NEXT:    le lr, .LBB5_2
; CHECK-NEXT:  @ %bb.3: @ %middle.block
; CHECK-NEXT:    vpsel q0, q0, q1
; CHECK-NEXT:    vaddv.u16 r0, q0
; CHECK-NEXT:    pop.w {r7, lr}
; CHECK-NEXT:    sxth r0, r0
; CHECK-NEXT:    bx lr
entry:
  %cmp12 = icmp eq i32 %N, 0
  br i1 %cmp12, label %for.cond.cleanup, label %vector.ph

vector.ph:                                        ; preds = %entry
  %n.rnd.up = add i32 %N, 7
  %n.vec = and i32 %n.rnd.up, -8
  %trip.count.minus.1 = add i32 %N, -1
  br label %vector.body

vector.body:                                      ; preds = %vector.body, %vector.ph
  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
  %vec.phi = phi <8 x i16> [ zeroinitializer, %vector.ph ], [ %i7, %vector.body ]
  %i = getelementptr inbounds i8, i8* %a, i32 %index
  %active.lane.mask = call <8 x i1> @llvm.get.active.lane.mask.v8i1.i32(i32 %index, i32 %N)
  %i1 = bitcast i8* %i to <8 x i8>*
  %wide.masked.load = call <8 x i8> @llvm.masked.load.v8i8.p0v8i8(<8 x i8>* %i1, i32 1, <8 x i1> %active.lane.mask, <8 x i8> undef)
  %i2 = zext <8 x i8> %wide.masked.load to <8 x i16>
  %i3 = getelementptr inbounds i8, i8* %b, i32 %index
  %i4 = bitcast i8* %i3 to <8 x i8>*
  %wide.masked.load17 = call <8 x i8> @llvm.masked.load.v8i8.p0v8i8(<8 x i8>* %i4, i32 1, <8 x i1> %active.lane.mask, <8 x i8> undef)
  %i5 = zext <8 x i8> %wide.masked.load17 to <8 x i16>
  %i6 = mul <8 x i16> %i5, %i2
  %i7 = add <8 x i16> %i6, %vec.phi
  %index.next = add i32 %index, 8
  %i8 = icmp eq i32 %index.next, %n.vec
  br i1 %i8, label %middle.block, label %vector.body

middle.block:                                     ; preds = %vector.body
  %i9 = select <8 x i1> %active.lane.mask, <8 x i16> %i7, <8 x i16> %vec.phi
  %i10 = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %i9)
  br label %for.cond.cleanup

for.cond.cleanup:                                 ; preds = %middle.block, %entry
  %res.0.lcssa = phi i16 [ 0, %entry ], [ %i10, %middle.block ]
  ret i16 %res.0.lcssa
}

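; Two consecutive mul/add reduction loops over the same buffers, with the
; second accumulator seeded from the first loop's result in lane 0; both loops
; are expected to keep an explicit vctp.32 with dls/le.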
define dso_local arm_aapcs_vfpcc i32 @two_loops_mul_add_v4i32(i8* nocapture readonly %a, i8* nocapture readonly %b, i32 %N) local_unnamed_addr {
; CHECK-LABEL: two_loops_mul_add_v4i32:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    push {r4, r5, r6, lr}
; CHECK-NEXT:    cmp r2, #0
; CHECK-NEXT:    beq .LBB6_8
; CHECK-NEXT:  @ %bb.1: @ %vector.ph
; CHECK-NEXT:    adds r3, r2, #3
; CHECK-NEXT:    vmov.i32 q0, #0x0
; CHECK-NEXT:    bic r3, r3, #3
; CHECK-NEXT:    mov r4, r0
; CHECK-NEXT:    subs r6, r3, #4
; CHECK-NEXT:    movs r3, #1
; CHECK-NEXT:    mov r5, r1
; CHECK-NEXT:    add.w lr, r3, r6, lsr #2
; CHECK-NEXT:    mov r3, r2
; CHECK-NEXT:    dls lr, lr
; CHECK-NEXT:  .LBB6_2: @ %vector.body
; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vctp.32 r3
; CHECK-NEXT:    vmov q1, q0
; CHECK-NEXT:    vpstt
; CHECK-NEXT:    vldrbt.u32 q0, [r4], #4
; CHECK-NEXT:    vldrbt.u32 q2, [r5], #4
; CHECK-NEXT:    subs r3, #4
; CHECK-NEXT:    vmul.i32 q0, q2, q0
; CHECK-NEXT:    vadd.i32 q0, q0, q1
; CHECK-NEXT:    le lr, .LBB6_2
; CHECK-NEXT:  @ %bb.3: @ %middle.block
; CHECK-NEXT:    vpsel q0, q0, q1
; CHECK-NEXT:    vaddv.u32 r12, q0
; CHECK-NEXT:    cbz r2, .LBB6_7
; CHECK-NEXT:  @ %bb.4: @ %vector.ph47
; CHECK-NEXT:    movs r3, #0
; CHECK-NEXT:    vdup.32 q0, r3
; CHECK-NEXT:    movs r3, #1
; CHECK-NEXT:    add.w lr, r3, r6, lsr #2
; CHECK-NEXT:    vmov.32 q0[0], r12
; CHECK-NEXT:    dls lr, lr
; CHECK-NEXT:  .LBB6_5: @ %vector.body46
; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vctp.32 r2
; CHECK-NEXT:    vmov q1, q0
; CHECK-NEXT:    vpstt
; CHECK-NEXT:    vldrbt.u32 q0, [r0], #4
; CHECK-NEXT:    vldrbt.u32 q2, [r1], #4
; CHECK-NEXT:    subs r2, #4
; CHECK-NEXT:    vmul.i32 q0, q2, q0
; CHECK-NEXT:    vadd.i32 q0, q0, q1
; CHECK-NEXT:    le lr, .LBB6_5
; CHECK-NEXT:  @ %bb.6: @ %middle.block44
; CHECK-NEXT:    vpsel q0, q0, q1
; CHECK-NEXT:    vaddv.u32 r12, q0
; CHECK-NEXT:  .LBB6_7: @ %for.cond.cleanup7
; CHECK-NEXT:    mov r0, r12
; CHECK-NEXT:    pop {r4, r5, r6, pc}
; CHECK-NEXT:  .LBB6_8:
; CHECK-NEXT:    movs r0, #0
; CHECK-NEXT:    pop {r4, r5, r6, pc}
entry:
  %cmp35 = icmp eq i32 %N, 0
  br i1 %cmp35, label %for.cond.cleanup7, label %vector.ph

vector.ph:                                        ; preds = %entry
  %n.rnd.up = add i32 %N, 3
  %n.vec = and i32 %n.rnd.up, -4
  %trip.count.minus.1 = add i32 %N, -1
  br label %vector.body

vector.body:                                      ; preds = %vector.body, %vector.ph
  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
  %vec.phi = phi <4 x i32> [ zeroinitializer, %vector.ph ], [ %i7, %vector.body ]
  %i = getelementptr inbounds i8, i8* %a, i32 %index
  %active.lane.mask = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %N)
  %i1 = bitcast i8* %i to <4 x i8>*
  %wide.masked.load = call <4 x i8> @llvm.masked.load.v4i8.p0v4i8(<4 x i8>* %i1, i32 1, <4 x i1> %active.lane.mask, <4 x i8> undef)
  %i2 = zext <4 x i8> %wide.masked.load to <4 x i32>
  %i3 = getelementptr inbounds i8, i8* %b, i32 %index
  %i4 = bitcast i8* %i3 to <4 x i8>*
  %wide.masked.load43 = call <4 x i8> @llvm.masked.load.v4i8.p0v4i8(<4 x i8>* %i4, i32 1, <4 x i1> %active.lane.mask, <4 x i8> undef)
  %i5 = zext <4 x i8> %wide.masked.load43 to <4 x i32>
  %i6 = mul nuw nsw <4 x i32> %i5, %i2
  %i7 = add <4 x i32> %i6, %vec.phi
  %index.next = add i32 %index, 4
  %i8 = icmp eq i32 %index.next, %n.vec
  br i1 %i8, label %middle.block, label %vector.body

middle.block:                                     ; preds = %vector.body
  %i9 = select <4 x i1> %active.lane.mask, <4 x i32> %i7, <4 x i32> %vec.phi
  %i10 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %i9)
  br i1 %cmp35, label %for.cond.cleanup7, label %vector.ph47

vector.ph47:                                      ; preds = %middle.block
  %n.rnd.up48 = add i32 %N, 3
  %n.vec50 = and i32 %n.rnd.up48, -4
  %trip.count.minus.154 = add i32 %N, -1
  %i11 = insertelement <4 x i32> <i32 undef, i32 0, i32 0, i32 0>, i32 %i10, i32 0
  br label %vector.body46

vector.body46:                                    ; preds = %vector.body46, %vector.ph47
  %index51 = phi i32 [ 0, %vector.ph47 ], [ %index.next52, %vector.body46 ]
  %vec.phi60 = phi <4 x i32> [ %i11, %vector.ph47 ], [ %i19, %vector.body46 ]
  %i12 = getelementptr inbounds i8, i8* %a, i32 %index51
  %active.lane.mask61 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index51, i32 %N)
  %i13 = bitcast i8* %i12 to <4 x i8>*
  %wide.masked.load62 = call <4 x i8> @llvm.masked.load.v4i8.p0v4i8(<4 x i8>* %i13, i32 1, <4 x i1> %active.lane.mask61, <4 x i8> undef)
  %i14 = zext <4 x i8> %wide.masked.load62 to <4 x i32>
  %i15 = getelementptr inbounds i8, i8* %b, i32 %index51
  %i16 = bitcast i8* %i15 to <4 x i8>*
  %wide.masked.load63 = call <4 x i8> @llvm.masked.load.v4i8.p0v4i8(<4 x i8>* %i16, i32 1, <4 x i1> %active.lane.mask61, <4 x i8> undef)
  %i17 = zext <4 x i8> %wide.masked.load63 to <4 x i32>
  %i18 = mul nuw nsw <4 x i32> %i17, %i14
  %i19 = add <4 x i32> %i18, %vec.phi60
  %index.next52 = add i32 %index51, 4
  %i20 = icmp eq i32 %index.next52, %n.vec50
  br i1 %i20, label %middle.block44, label %vector.body46

middle.block44:                                   ; preds = %vector.body46
  %i21 = select <4 x i1> %active.lane.mask61, <4 x i32> %i19, <4 x i32> %vec.phi60
  %i22 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %i21)
  br label %for.cond.cleanup7

for.cond.cleanup7:                                ; preds = %middle.block44, %middle.block, %entry
  %res.1.lcssa = phi i32 [ %i10, %middle.block ], [ 0, %entry ], [ %i22, %middle.block44 ]
  ret i32 %res.1.lcssa
}

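; A single loop computing two reductions (mul/add and sub/add) into separate
; accumulators, both completed in the middle block; the checks expect one
; vctp.16 predicated body inside a dls/le loop.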
define dso_local arm_aapcs_vfpcc void @two_reductions_mul_add_v8i16(i8* nocapture readonly %a, i8* nocapture readonly %b, i32 %N) local_unnamed_addr {
; CHECK-LABEL: two_reductions_mul_add_v8i16:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    push {r4, lr}
; CHECK-NEXT:    vpush {d8, d9}
; CHECK-NEXT:    cbz r2, .LBB7_4
; CHECK-NEXT:  @ %bb.1: @ %vector.ph
; CHECK-NEXT:    adds r3, r2, #7
; CHECK-NEXT:    vmov.i32 q0, #0x0
; CHECK-NEXT:    bic r3, r3, #7
; CHECK-NEXT:    movs r4, #1
; CHECK-NEXT:    subs r3, #8
; CHECK-NEXT:    vmov q3, q0
; CHECK-NEXT:    add.w lr, r4, r3, lsr #3
; CHECK-NEXT:    mov r3, r0
; CHECK-NEXT:    dls lr, lr
; CHECK-NEXT:    mov r4, r1
; CHECK-NEXT:  .LBB7_2: @ %vector.body
; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vctp.16 r2
; CHECK-NEXT:    vmov q1, q0
; CHECK-NEXT:    vpstt
; CHECK-NEXT:    vldrbt.u16 q0, [r3], #8
; CHECK-NEXT:    vldrbt.u16 q4, [r4], #8
; CHECK-NEXT:    vmov q2, q3
; CHECK-NEXT:    vsub.i16 q3, q4, q0
; CHECK-NEXT:    vmul.i16 q0, q4, q0
; CHECK-NEXT:    subs r2, #8
; CHECK-NEXT:    vadd.i16 q3, q3, q2
; CHECK-NEXT:    vadd.i16 q0, q0, q1
; CHECK-NEXT:    le lr, .LBB7_2
; CHECK-NEXT:  @ %bb.3: @ %middle.block
; CHECK-NEXT:    vpsel q2, q3, q2
; CHECK-NEXT:    vpsel q0, q0, q1
; CHECK-NEXT:    vaddv.u16 r4, q2
; CHECK-NEXT:    vaddv.u16 r2, q0
; CHECK-NEXT:    b .LBB7_5
; CHECK-NEXT:  .LBB7_4:
; CHECK-NEXT:    movs r2, #0
; CHECK-NEXT:    movs r4, #0
; CHECK-NEXT:  .LBB7_5: @ %for.cond.cleanup
; CHECK-NEXT:    strb r2, [r0]
; CHECK-NEXT:    strb r4, [r1]
; CHECK-NEXT:    vpop {d8, d9}
; CHECK-NEXT:    pop {r4, pc}
entry:
  %cmp12 = icmp eq i32 %N, 0
  br i1 %cmp12, label %for.cond.cleanup, label %vector.ph

vector.ph:                                        ; preds = %entry
  %n.rnd.up = add i32 %N, 7
  %n.vec = and i32 %n.rnd.up, -8
  %trip.count.minus.1 = add i32 %N, -1
  br label %vector.body

vector.body:                                      ; preds = %vector.body, %vector.ph
  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
  %vec.phi = phi <8 x i16> [ zeroinitializer, %vector.ph ], [ %i8, %vector.body ]
  %vec.phi.1 = phi <8 x i16> [ zeroinitializer, %vector.ph ], [ %i9, %vector.body ]
  %i = getelementptr inbounds i8, i8* %a, i32 %index
  %active.lane.mask = call <8 x i1> @llvm.get.active.lane.mask.v8i1.i32(i32 %index, i32 %N)
  %i1 = bitcast i8* %i to <8 x i8>*
  %wide.masked.load = call <8 x i8> @llvm.masked.load.v8i8.p0v8i8(<8 x i8>* %i1, i32 1, <8 x i1> %active.lane.mask, <8 x i8> undef)
  %i2 = zext <8 x i8> %wide.masked.load to <8 x i16>
  %i3 = getelementptr inbounds i8, i8* %b, i32 %index
  %i4 = bitcast i8* %i3 to <8 x i8>*
  %wide.masked.load17 = call <8 x i8> @llvm.masked.load.v8i8.p0v8i8(<8 x i8>* %i4, i32 1, <8 x i1> %active.lane.mask, <8 x i8> undef)
  %i5 = zext <8 x i8> %wide.masked.load17 to <8 x i16>
  %i6 = mul <8 x i16> %i5, %i2
  %i7 = sub <8 x i16> %i5, %i2
  %i8 = add <8 x i16> %i6, %vec.phi
  %i9 = add <8 x i16> %i7, %vec.phi.1
  %index.next = add i32 %index, 8
  %i10 = icmp eq i32 %index.next, %n.vec
  br i1 %i10, label %middle.block, label %vector.body

middle.block:                                     ; preds = %vector.body
  %i11 = select <8 x i1> %active.lane.mask, <8 x i16> %i8, <8 x i16> %vec.phi
  %i12 = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %i11)
  %i13 = select <8 x i1> %active.lane.mask, <8 x i16> %i9, <8 x i16> %vec.phi.1
  %i14 = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %i13)
  br label %for.cond.cleanup

for.cond.cleanup:                                 ; preds = %middle.block, %entry
  %res.0.lcssa = phi i16 [ 0, %entry ], [ %i12, %middle.block ]
  %res.1.lcssa = phi i16 [ 0, %entry ], [ %i14, %middle.block ]
  %trunc.res.0 = trunc i16 %res.0.lcssa to i8
  store i8 %trunc.res.0, i8* %a
  %trunc.res.1 = trunc i16 %res.1.lcssa to i8
  store i8 %trunc.res.1, i8* %b
  ret void
}

%struct.date = type { i32, i32, i32, i32 }
@days = internal unnamed_addr constant [2 x [13 x i32]] [[13 x i32] [i32 0, i32 31, i32 28, i32 31, i32 30, i32 31, i32 30, i32 31, i32 31, i32 30, i32 31, i32 30, i32 31], [13 x i32] [i32 0, i32 31, i32 29, i32 31, i32 30, i32 31, i32 30, i32 31, i32 31, i32 30, i32 31, i32 30, i32 31]], align 4
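; Sums a row of the constant days table, with the live-in day value seeded
; into lane 0 of the accumulator; the checks expect a vctp.32 predicated load
; inside a dls/le loop.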
define i32 @wrongop(%struct.date* nocapture readonly %pd) {
; CHECK-LABEL: wrongop:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    push {r4, lr}
; CHECK-NEXT:    mov r1, r0
; CHECK-NEXT:    movw r12, #47184
; CHECK-NEXT:    movw r3, #23593
; CHECK-NEXT:    ldrd r2, lr, [r1, #4]
; CHECK-NEXT:    movt r12, #1310
; CHECK-NEXT:    movt r3, #49807
; CHECK-NEXT:    mla r3, lr, r3, r12
; CHECK-NEXT:    movw r1, #55051
; CHECK-NEXT:    movw r4, #23593
; CHECK-NEXT:    movt r1, #163
; CHECK-NEXT:    ldr r0, [r0]
; CHECK-NEXT:    movt r4, #655
; CHECK-NEXT:    ror.w r12, r3, #4
; CHECK-NEXT:    cmp r12, r1
; CHECK-NEXT:    cset r1, lo
; CHECK-NEXT:    ror.w r3, r3, #2
; CHECK-NEXT:    mov.w r12, #1
; CHECK-NEXT:    cmp r3, r4
; CHECK-NEXT:    csel r3, r1, r12, lo
; CHECK-NEXT:    lsls.w r4, lr, #30
; CHECK-NEXT:    csel r1, r1, r3, ne
; CHECK-NEXT:    cmp r2, #1
; CHECK-NEXT:    it lt
; CHECK-NEXT:    poplt {r4, pc}
; CHECK-NEXT:  .LBB8_1: @ %vector.ph
; CHECK-NEXT:    movw r3, :lower16:days
; CHECK-NEXT:    movs r4, #52
; CHECK-NEXT:    movt r3, :upper16:days
; CHECK-NEXT:    mla r1, r1, r4, r3
; CHECK-NEXT:    movs r3, #0
; CHECK-NEXT:    vdup.32 q0, r3
; CHECK-NEXT:    vmov.32 q0[0], r0
; CHECK-NEXT:    adds r0, r2, #3
; CHECK-NEXT:    bic r0, r0, #3
; CHECK-NEXT:    subs r0, #4
; CHECK-NEXT:    add.w lr, r12, r0, lsr #2
; CHECK-NEXT:    dls lr, lr
; CHECK-NEXT:  .LBB8_2: @ %vector.body
; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vctp.32 r2
; CHECK-NEXT:    vmov q1, q0
; CHECK-NEXT:    vpst
; CHECK-NEXT:    vldrwt.u32 q0, [r1], #16
; CHECK-NEXT:    subs r2, #4
; CHECK-NEXT:    vadd.i32 q0, q0, q1
; CHECK-NEXT:    le lr, .LBB8_2
; CHECK-NEXT:  @ %bb.3: @ %middle.block
; CHECK-NEXT:    vpsel q0, q0, q1
; CHECK-NEXT:    vaddv.u32 r0, q0
; CHECK-NEXT:    pop {r4, pc}
entry:
  %day1 = getelementptr inbounds %struct.date, %struct.date* %pd, i32 0, i32 0
  %0 = load i32, i32* %day1, align 4
  %year = getelementptr inbounds %struct.date, %struct.date* %pd, i32 0, i32 2
  %1 = load i32, i32* %year, align 4
  %2 = and i32 %1, 3
  %cmp = icmp ne i32 %2, 0
  %rem3 = srem i32 %1, 100
  %cmp4.not = icmp eq i32 %rem3, 0
  %or.cond = or i1 %cmp, %cmp4.not
  br i1 %or.cond, label %lor.rhs, label %lor.end

lor.rhs:                                          ; preds = %entry
  %rem6 = srem i32 %1, 400
  %cmp7 = icmp eq i32 %rem6, 0
  %phi.cast = zext i1 %cmp7 to i32
  br label %lor.end

lor.end:                                          ; preds = %entry, %lor.rhs
  %3 = phi i32 [ %phi.cast, %lor.rhs ], [ 1, %entry ]
  %month = getelementptr inbounds %struct.date, %struct.date* %pd, i32 0, i32 1
  %4 = load i32, i32* %month, align 4
  %cmp820 = icmp sgt i32 %4, 0
  br i1 %cmp820, label %vector.ph, label %for.end

vector.ph:                                        ; preds = %lor.end
  %n.rnd.up = add i32 %4, 3
  %n.vec = and i32 %n.rnd.up, -4
  %trip.count.minus.1 = add i32 %4, -1
  %5 = insertelement <4 x i32> <i32 undef, i32 0, i32 0, i32 0>, i32 %0, i32 0
  br label %vector.body

vector.body:                                      ; preds = %vector.body, %vector.ph
  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
  %vec.phi = phi <4 x i32> [ %5, %vector.ph ], [ %8, %vector.body ]
  %6 = getelementptr inbounds [2 x [13 x i32]], [2 x [13 x i32]]* @days, i32 0, i32 %3, i32 %index
  %active.lane.mask = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %4)
  %7 = bitcast i32* %6 to <4 x i32>*
  %wide.masked.load = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* nonnull %7, i32 4, <4 x i1> %active.lane.mask, <4 x i32> undef)
  %8 = add <4 x i32> %wide.masked.load, %vec.phi
  %index.next = add i32 %index, 4
  %9 = icmp eq i32 %index.next, %n.vec
  br i1 %9, label %middle.block, label %vector.body

middle.block:                                     ; preds = %vector.body
  %10 = select <4 x i1> %active.lane.mask, <4 x i32> %8, <4 x i32> %vec.phi
  %11 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %10)
  br label %for.end

for.end:                                          ; preds = %middle.block, %lor.end
  %day.0.lcssa = phi i32 [ %0, %lor.end ], [ %11, %middle.block ]
  ret i32 %day.0.lcssa
}

declare <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>*, i32 immarg, <4 x i1>, <4 x i32>)
declare <16 x i1> @llvm.get.active.lane.mask.v16i1.i32(i32, i32)
declare <16 x i8> @llvm.masked.load.v16i8.p0v16i8(<16 x i8>*, i32 immarg, <16 x i1>, <16 x i8>)
declare i8 @llvm.vector.reduce.add.v16i8(<16 x i8>)
declare <8 x i1> @llvm.get.active.lane.mask.v8i1.i32(i32, i32)
declare <8 x i8> @llvm.masked.load.v8i8.p0v8i8(<8 x i8>*, i32 immarg, <8 x i1>, <8 x i8>)
declare i16 @llvm.vector.reduce.add.v8i16(<8 x i16>)
declare <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32, i32)
declare <4 x i8> @llvm.masked.load.v4i8.p0v4i8(<4 x i8>*, i32 immarg, <4 x i1>, <4 x i8>)
declare i32 @llvm.vector.reduce.add.v4i32(<4 x i32>)