; RUN: opt -mtriple=thumbv8.1m.main -mve-tail-predication -tail-predication=enabled -mattr=+mve,+lob %s -S -o - | FileCheck %s
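;
; Checks that the tail-predication pass replaces @llvm.get.active.lane.mask
; with the @llvm.arm.mve.vctp intrinsics where the loop is supported, and
; leaves the lane mask in place for the negative tests further down.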

; CHECK-LABEL: mul_v16i8
; CHECK-NOT: %num.elements = add i32 %trip.count.minus.1, 1
; CHECK: vector.body:
; CHECK: %index = phi i32
; CHECK: [[ELEMS:%[^ ]+]] = phi i32 [ %N, %vector.ph ], [ [[REMAINING:%[^ ]+]], %vector.body ]
; CHECK: [[VCTP:%[^ ]+]] = call <16 x i1> @llvm.arm.mve.vctp8(i32 [[ELEMS]])
; CHECK: [[REMAINING]] = sub i32 [[ELEMS]], 16
; CHECK: [[LD0:%[^ ]+]] = tail call <16 x i8> @llvm.masked.load.v16i8.p0v16i8(<16 x i8>* {{.*}}, i32 4, <16 x i1> [[VCTP]], <16 x i8> undef)
; CHECK: [[LD1:%[^ ]+]] = tail call <16 x i8> @llvm.masked.load.v16i8.p0v16i8(<16 x i8>* {{.*}}, i32 4, <16 x i1> [[VCTP]], <16 x i8> undef)
; CHECK: tail call void @llvm.masked.store.v16i8.p0v16i8(<16 x i8> {{.*}}, <16 x i8>* {{.*}}, i32 4, <16 x i1> [[VCTP]])
define dso_local arm_aapcs_vfpcc void @mul_v16i8(i8* noalias nocapture readonly %a, i8* noalias nocapture readonly %b, i8* noalias nocapture %c, i32 %N) {
entry:
  %cmp8 = icmp eq i32 %N, 0
  %tmp8 = add i32 %N, 15
  %tmp9 = lshr i32 %tmp8, 4
  %tmp10 = shl nuw i32 %tmp9, 4
  %tmp11 = add i32 %tmp10, -16
  %tmp12 = lshr i32 %tmp11, 4
  %tmp13 = add nuw nsw i32 %tmp12, 1
  br i1 %cmp8, label %for.cond.cleanup, label %vector.ph

vector.ph:                                        ; preds = %entry
  %trip.count.minus.1 = add i32 %N, -1
  %broadcast.splatinsert10 = insertelement <16 x i32> undef, i32 %trip.count.minus.1, i32 0
  %broadcast.splat11 = shufflevector <16 x i32> %broadcast.splatinsert10, <16 x i32> undef, <16 x i32> zeroinitializer
  %start = call i32 @llvm.start.loop.iterations.i32(i32 %tmp13)
  br label %vector.body

vector.body:                                      ; preds = %vector.body, %vector.ph
  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
  %tmp14 = phi i32 [ %start, %vector.ph ], [ %tmp15, %vector.body ]
  %broadcast.splatinsert = insertelement <16 x i32> undef, i32 %index, i32 0
  %broadcast.splat = shufflevector <16 x i32> %broadcast.splatinsert, <16 x i32> undef, <16 x i32> zeroinitializer
  %induction = or <16 x i32> %broadcast.splat, <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
  %tmp = getelementptr inbounds i8, i8* %a, i32 %index

;  %tmp1 = icmp ule <16 x i32> %induction, %broadcast.splat11
  %active.lane.mask = call <16 x i1> @llvm.get.active.lane.mask.v16i1.i32(i32 %index, i32 %N)
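; get.active.lane.mask(%index, %N) sets lane i when %index + i < %N, i.e. it
; computes the same predicate as the commented-out ule compare above.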

  %tmp2 = bitcast i8* %tmp to <16 x i8>*
  %wide.masked.load = tail call <16 x i8> @llvm.masked.load.v16i8.p0v16i8(<16 x i8>* %tmp2, i32 4, <16 x i1> %active.lane.mask, <16 x i8> undef)
  %tmp3 = getelementptr inbounds i8, i8* %b, i32 %index
  %tmp4 = bitcast i8* %tmp3 to <16 x i8>*
  %wide.masked.load2 = tail call <16 x i8> @llvm.masked.load.v16i8.p0v16i8(<16 x i8>* %tmp4, i32 4, <16 x i1> %active.lane.mask, <16 x i8> undef)
  %mul = mul nsw <16 x i8> %wide.masked.load2, %wide.masked.load
  %tmp6 = getelementptr inbounds i8, i8* %c, i32 %index
  %tmp7 = bitcast i8* %tmp6 to <16 x i8>*
  tail call void @llvm.masked.store.v16i8.p0v16i8(<16 x i8> %mul, <16 x i8>* %tmp7, i32 4, <16 x i1> %active.lane.mask)
  %index.next = add i32 %index, 16
  %tmp15 = call i32 @llvm.loop.decrement.reg.i32(i32 %tmp14, i32 1)
  %tmp16 = icmp ne i32 %tmp15, 0
  br i1 %tmp16, label %vector.body, label %for.cond.cleanup

for.cond.cleanup:                                 ; preds = %vector.body, %entry
  ret void
}

; CHECK-LABEL: mul_v8i16
; CHECK-NOT: %num.elements = add i32 %trip.count.minus.1, 1
; CHECK: vector.body:
; CHECK: %index = phi i32
; CHECK: [[ELEMS:%[^ ]+]] = phi i32 [ %N, %vector.ph ], [ [[REMAINING:%[^ ]+]], %vector.body ]
; CHECK: [[VCTP:%[^ ]+]] = call <8 x i1> @llvm.arm.mve.vctp16(i32 [[ELEMS]])
; CHECK: [[REMAINING]] = sub i32 [[ELEMS]], 8
; CHECK: [[LD0:%[^ ]+]] = tail call <8 x i16> @llvm.masked.load.v8i16.p0v8i16(<8 x i16>* {{.*}}, i32 4, <8 x i1> [[VCTP]], <8 x i16> undef)
; CHECK: [[LD1:%[^ ]+]] = tail call <8 x i16> @llvm.masked.load.v8i16.p0v8i16(<8 x i16>* {{.*}}, i32 4, <8 x i1> [[VCTP]], <8 x i16> undef)
; CHECK: tail call void @llvm.masked.store.v8i16.p0v8i16(<8 x i16> {{.*}}, <8 x i16>* {{.*}}, i32 4, <8 x i1> [[VCTP]])
define dso_local arm_aapcs_vfpcc void @mul_v8i16(i16* noalias nocapture readonly %a, i16* noalias nocapture readonly %b, i16* noalias nocapture %c, i32 %N) {
entry:
  %cmp8 = icmp eq i32 %N, 0
  %tmp8 = add i32 %N, 7
  %tmp9 = lshr i32 %tmp8, 3
  %tmp10 = shl nuw i32 %tmp9, 3
  %tmp11 = add i32 %tmp10, -8
  %tmp12 = lshr i32 %tmp11, 3
  %tmp13 = add nuw nsw i32 %tmp12, 1
  br i1 %cmp8, label %for.cond.cleanup, label %vector.ph

vector.ph:                                        ; preds = %entry
  %trip.count.minus.1 = add i32 %N, -1
  %broadcast.splatinsert10 = insertelement <8 x i32> undef, i32 %trip.count.minus.1, i32 0
  %broadcast.splat11 = shufflevector <8 x i32> %broadcast.splatinsert10, <8 x i32> undef, <8 x i32> zeroinitializer
  %start = call i32 @llvm.start.loop.iterations.i32(i32 %tmp13)
  br label %vector.body

vector.body:                                      ; preds = %vector.body, %vector.ph
  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
  %tmp14 = phi i32 [ %start, %vector.ph ], [ %tmp15, %vector.body ]
  %broadcast.splatinsert = insertelement <8 x i32> undef, i32 %index, i32 0
  %broadcast.splat = shufflevector <8 x i32> %broadcast.splatinsert, <8 x i32> undef, <8 x i32> zeroinitializer
  %induction = add <8 x i32> %broadcast.splat, <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  %tmp = getelementptr inbounds i16, i16* %a, i32 %index

;  %tmp1 = icmp ule <8 x i32> %induction, %broadcast.splat11
  %active.lane.mask = call <8 x i1> @llvm.get.active.lane.mask.v8i1.i32(i32 %index, i32 %N)

  %tmp2 = bitcast i16* %tmp to <8 x i16>*
  %wide.masked.load = tail call <8 x i16> @llvm.masked.load.v8i16.p0v8i16(<8 x i16>* %tmp2, i32 4, <8 x i1> %active.lane.mask, <8 x i16> undef)
  %tmp3 = getelementptr inbounds i16, i16* %b, i32 %index
  %tmp4 = bitcast i16* %tmp3 to <8 x i16>*
  %wide.masked.load2 = tail call <8 x i16> @llvm.masked.load.v8i16.p0v8i16(<8 x i16>* %tmp4, i32 4, <8 x i1> %active.lane.mask, <8 x i16> undef)
  %mul = mul nsw <8 x i16> %wide.masked.load2, %wide.masked.load
  %tmp6 = getelementptr inbounds i16, i16* %c, i32 %index
  %tmp7 = bitcast i16* %tmp6 to <8 x i16>*
  tail call void @llvm.masked.store.v8i16.p0v8i16(<8 x i16> %mul, <8 x i16>* %tmp7, i32 4, <8 x i1> %active.lane.mask)
  %index.next = add i32 %index, 8
  %tmp15 = call i32 @llvm.loop.decrement.reg.i32(i32 %tmp14, i32 1)
  %tmp16 = icmp ne i32 %tmp15, 0
  br i1 %tmp16, label %vector.body, label %for.cond.cleanup

for.cond.cleanup:                                 ; preds = %vector.body, %entry
  ret void
}

; CHECK-LABEL: mul_v4i32
; CHECK-NOT: %num.elements = add i32 %trip.count.minus.1, 1
; CHECK: vector.body:
; CHECK: [[ELEMS:%[^ ]+]] = phi i32 [ %N, %vector.ph ], [ [[REMAINING:%[^ ]+]], %vector.body ]
; CHECK: [[VCTP:%[^ ]+]] = call <4 x i1> @llvm.arm.mve.vctp32(i32 [[ELEMS]])
; CHECK: [[REMAINING]] = sub i32 [[ELEMS]], 4
; CHECK: [[LD0:%[^ ]+]] = tail call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* {{.*}}, i32 4, <4 x i1> [[VCTP]], <4 x i32> undef)
; CHECK: [[LD1:%[^ ]+]] = tail call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* {{.*}}, i32 4, <4 x i1> [[VCTP]], <4 x i32> undef)
; CHECK: tail call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> {{.*}}, <4 x i32>* {{.*}}, i32 4, <4 x i1> [[VCTP]])
define dso_local arm_aapcs_vfpcc void @mul_v4i32(i32* noalias nocapture readonly %a, i32* noalias nocapture readonly %b, i32* noalias nocapture %c, i32 %N) {
entry:
  %cmp8 = icmp eq i32 %N, 0
  %tmp8 = add i32 %N, 3
  %tmp9 = lshr i32 %tmp8, 2
  %tmp10 = shl nuw i32 %tmp9, 2
  %tmp11 = add i32 %tmp10, -4
  %tmp12 = lshr i32 %tmp11, 2
  %tmp13 = add nuw nsw i32 %tmp12, 1
  br i1 %cmp8, label %for.cond.cleanup, label %vector.ph

vector.ph:                                        ; preds = %entry
  %trip.count.minus.1 = add i32 %N, -1
  %broadcast.splatinsert10 = insertelement <4 x i32> undef, i32 %trip.count.minus.1, i32 0
  %broadcast.splat11 = shufflevector <4 x i32> %broadcast.splatinsert10, <4 x i32> undef, <4 x i32> zeroinitializer
  %start = call i32 @llvm.start.loop.iterations.i32(i32 %tmp13)
  br label %vector.body

vector.body:                                      ; preds = %vector.body, %vector.ph
  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
  %tmp14 = phi i32 [ %start, %vector.ph ], [ %tmp15, %vector.body ]
  %broadcast.splatinsert = insertelement <4 x i32> undef, i32 %index, i32 0
  %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer
  %induction = or <4 x i32> %broadcast.splat, <i32 0, i32 1, i32 2, i32 3>
  %tmp = getelementptr inbounds i32, i32* %a, i32 %index
;  %tmp1 = icmp ule <4 x i32> %induction, %broadcast.splat11
  %tmp2 = bitcast i32* %tmp to <4 x i32>*
  %active.lane.mask = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %N)
  %wide.masked.load = tail call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %tmp2, i32 4, <4 x i1> %active.lane.mask, <4 x i32> undef)
  %tmp3 = getelementptr inbounds i32, i32* %b, i32 %index
  %tmp4 = bitcast i32* %tmp3 to <4 x i32>*
  %wide.masked.load2 = tail call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %tmp4, i32 4, <4 x i1> %active.lane.mask, <4 x i32> undef)
  %mul = mul nsw <4 x i32> %wide.masked.load2, %wide.masked.load
  %tmp6 = getelementptr inbounds i32, i32* %c, i32 %index
  %tmp7 = bitcast i32* %tmp6 to <4 x i32>*
  tail call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> %mul, <4 x i32>* %tmp7, i32 4, <4 x i1> %active.lane.mask)
  %index.next = add i32 %index, 4
  %tmp15 = call i32 @llvm.loop.decrement.reg.i32(i32 %tmp14, i32 1)
  %tmp16 = icmp ne i32 %tmp15, 0
  br i1 %tmp16, label %vector.body, label %for.cond.cleanup

for.cond.cleanup:                                 ; preds = %vector.body, %entry
  ret void
}

; CHECK-LABEL: split_vector
; CHECK-NOT: %num.elements = add i32 %trip.count.minus.1, 1
; CHECK: vector.body:
; CHECK: %index = phi i32
; CHECK: [[ELEMS:%[^ ]+]] = phi i32 [ %N, %vector.ph ], [ [[REMAINING:%[^ ]+]], %vector.body ]
; CHECK: [[VCTP:%[^ ]+]] = call <4 x i1> @llvm.arm.mve.vctp32(i32 [[ELEMS]])
; CHECK: [[REMAINING]] = sub i32 [[ELEMS]], 4
; CHECK: [[LD0:%[^ ]+]] = tail call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* {{.*}}, i32 4, <4 x i1> [[VCTP]], <4 x i32> undef)
; CHECK: [[LD1:%[^ ]+]] = tail call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* {{.*}}, i32 4, <4 x i1> [[VCTP]], <4 x i32> undef)
; CHECK: tail call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> {{.*}}, <4 x i32>* {{.*}}, i32 4, <4 x i1> [[VCTP]])
define dso_local arm_aapcs_vfpcc void @split_vector(i32* noalias nocapture readonly %a, i32* noalias nocapture readonly %b, i32* noalias nocapture %c, i32 %N) {
entry:
  %cmp8 = icmp eq i32 %N, 0
  %tmp8 = add i32 %N, 3
  %tmp9 = lshr i32 %tmp8, 2
  %tmp10 = shl nuw i32 %tmp9, 2
  %tmp11 = add i32 %tmp10, -4
  %tmp12 = lshr i32 %tmp11, 2
  %tmp13 = add nuw nsw i32 %tmp12, 1
  br i1 %cmp8, label %for.cond.cleanup, label %vector.ph

vector.ph:                                        ; preds = %entry
  %trip.count.minus.1 = add i32 %N, -1
  %broadcast.splatinsert10 = insertelement <4 x i32> undef, i32 %trip.count.minus.1, i32 0
  %broadcast.splat11 = shufflevector <4 x i32> %broadcast.splatinsert10, <4 x i32> undef, <4 x i32> zeroinitializer
  %start = call i32 @llvm.start.loop.iterations.i32(i32 %tmp13)
  br label %vector.body

vector.body:                                      ; preds = %vector.body, %vector.ph
  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
  %tmp14 = phi i32 [ %start, %vector.ph ], [ %tmp15, %vector.body ]
  %broadcast.splatinsert = insertelement <4 x i32> undef, i32 %index, i32 0
  %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer
  %induction = add <4 x i32> %broadcast.splat, <i32 0, i32 1, i32 2, i32 3>
  %tmp = getelementptr inbounds i32, i32* %a, i32 %index
;  %tmp1 = icmp ule <4 x i32> %induction, %broadcast.splat11
  %active.lane.mask = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %N)
  %tmp2 = bitcast i32* %tmp to <4 x i32>*
  %wide.masked.load = tail call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %tmp2, i32 4, <4 x i1> %active.lane.mask, <4 x i32> undef)
  %extract.1.low = shufflevector <4 x i32> %wide.masked.load, <4 x i32> undef, <2 x i32> <i32 0, i32 2>
  %extract.1.high = shufflevector <4 x i32> %wide.masked.load, <4 x i32> undef, <2 x i32> <i32 1, i32 3>
  %tmp3 = getelementptr inbounds i32, i32* %b, i32 %index
  %tmp4 = bitcast i32* %tmp3 to <4 x i32>*
  %wide.masked.load2 = tail call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %tmp4, i32 4, <4 x i1> %active.lane.mask, <4 x i32> undef)
  %extract.2.low = shufflevector <4 x i32> %wide.masked.load2, <4 x i32> undef, <2 x i32> <i32 0, i32 2>
  %extract.2.high = shufflevector <4 x i32> %wide.masked.load2, <4 x i32> undef, <2 x i32> <i32 1, i32 3>
  %mul = mul nsw <2 x i32> %extract.1.low, %extract.2.low
  %sub = sub nsw <2 x i32> %extract.1.high, %extract.2.high
  %combine = shufflevector <2 x i32> %mul, <2 x i32> %sub, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %tmp6 = getelementptr inbounds i32, i32* %c, i32 %index
  %tmp7 = bitcast i32* %tmp6 to <4 x i32>*
  tail call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> %combine, <4 x i32>* %tmp7, i32 4, <4 x i1> %active.lane.mask)
  %index.next = add i32 %index, 4
  %tmp15 = call i32 @llvm.loop.decrement.reg.i32(i32 %tmp14, i32 1)
  %tmp16 = icmp ne i32 %tmp15, 0
  br i1 %tmp16, label %vector.body, label %for.cond.cleanup

for.cond.cleanup:                                 ; preds = %vector.body, %entry
  ret void
}

; One of the loads uses an ult predicate (%wrong) instead of the lane mask;
; the loop is still tail-predicated, but that load must keep its original
; predicate.
; CHECK-LABEL: mismatch_load_pred
; CHECK: [[ELEMS:%[^ ]+]] = phi i32 [ %N, %vector.ph ], [ [[REMAINING:%[^ ]+]], %vector.body ]
; CHECK: [[VCTP:%[^ ]+]] = call <4 x i1> @llvm.arm.mve.vctp32(i32 [[ELEMS]])
; CHECK: [[REMAINING]] = sub i32 [[ELEMS]], 4
; CHECK: [[LD0:%[^ ]+]] = tail call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* {{.*}}, i32 4, <4 x i1> [[VCTP]], <4 x i32> undef)
; CHECK: [[LD1:%[^ ]+]] = tail call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* {{.*}}, i32 4, <4 x i1> %wrong, <4 x i32> undef)
; CHECK: tail call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> {{.*}}, <4 x i32>* {{.*}}, i32 4, <4 x i1> [[VCTP]])
define dso_local arm_aapcs_vfpcc void @mismatch_load_pred(i32* noalias nocapture readonly %a, i32* noalias nocapture readonly %b, i32* noalias nocapture %c, i32 %N) {
entry:
  %cmp8 = icmp eq i32 %N, 0
  %tmp8 = add i32 %N, 3
  %tmp9 = lshr i32 %tmp8, 2
  %tmp10 = shl nuw i32 %tmp9, 2
  %tmp11 = add i32 %tmp10, -4
  %tmp12 = lshr i32 %tmp11, 2
  %tmp13 = add nuw nsw i32 %tmp12, 1
  br i1 %cmp8, label %for.cond.cleanup, label %vector.ph

vector.ph:                                        ; preds = %entry
  %trip.count.minus.1 = add i32 %N, -1
  %broadcast.splatinsert10 = insertelement <4 x i32> undef, i32 %trip.count.minus.1, i32 0
  %broadcast.splat11 = shufflevector <4 x i32> %broadcast.splatinsert10, <4 x i32> undef, <4 x i32> zeroinitializer
  %start = call i32 @llvm.start.loop.iterations.i32(i32 %tmp13)
  br label %vector.body

vector.body:                                      ; preds = %vector.body, %vector.ph
  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
  %tmp14 = phi i32 [ %start, %vector.ph ], [ %tmp15, %vector.body ]
  %broadcast.splatinsert = insertelement <4 x i32> undef, i32 %index, i32 0
  %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer
  %induction = add <4 x i32> %broadcast.splat, <i32 0, i32 1, i32 2, i32 3>
  %tmp = getelementptr inbounds i32, i32* %a, i32 %index

;  %tmp1 = icmp ule <4 x i32> %induction, %broadcast.splat11
  %active.lane.mask = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %N)

  %wrong = icmp ult <4 x i32> %induction, %broadcast.splat11
  %tmp2 = bitcast i32* %tmp to <4 x i32>*
  %wide.masked.load = tail call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %tmp2, i32 4, <4 x i1> %active.lane.mask, <4 x i32> undef)
  %tmp3 = getelementptr inbounds i32, i32* %b, i32 %index
  %tmp4 = bitcast i32* %tmp3 to <4 x i32>*
  %wide.masked.load12 = tail call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %tmp4, i32 4, <4 x i1> %wrong, <4 x i32> undef)
  %tmp5 = mul nsw <4 x i32> %wide.masked.load12, %wide.masked.load
  %tmp6 = getelementptr inbounds i32, i32* %c, i32 %index
  %tmp7 = bitcast i32* %tmp6 to <4 x i32>*
  tail call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> %tmp5, <4 x i32>* %tmp7, i32 4, <4 x i1> %active.lane.mask)
  %index.next = add i32 %index, 4
  %tmp15 = call i32 @llvm.loop.decrement.reg.i32(i32 %tmp14, i32 1)
  %tmp16 = icmp ne i32 %tmp15, 0
  br i1 %tmp16, label %vector.body, label %for.cond.cleanup

for.cond.cleanup:                                 ; preds = %vector.body, %entry
  ret void
}

; The store uses an ult predicate (%wrong) instead of the lane mask; the loop
; is still tail-predicated, but the store must keep its original predicate.
; CHECK-LABEL: mismatch_store_pred
; CHECK-NOT: %num.elements = add i32 %trip.count.minus.1, 1
; CHECK: vector.body:
; CHECK: %index = phi i32
; CHECK: [[ELEMS:%[^ ]+]] = phi i32 [ %N, %vector.ph ], [ [[REMAINING:%[^ ]+]], %vector.body ]
; CHECK: [[VCTP:%[^ ]+]] = call <4 x i1> @llvm.arm.mve.vctp32(i32 [[ELEMS]])
; CHECK: [[REMAINING]] = sub i32 [[ELEMS]], 4
; CHECK: [[LD0:%[^ ]+]] = tail call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* {{.*}}, i32 4, <4 x i1> [[VCTP]], <4 x i32> undef)
; CHECK: [[LD1:%[^ ]+]] = tail call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* {{.*}}, i32 4, <4 x i1> [[VCTP]], <4 x i32> undef)
; CHECK: tail call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> {{.*}}, <4 x i32>* {{.*}}, i32 4, <4 x i1> %wrong)
define dso_local arm_aapcs_vfpcc void @mismatch_store_pred(i32* noalias nocapture readonly %a, i32* noalias nocapture readonly %b, i32* noalias nocapture %c, i32 %N) {
entry:
  %cmp8 = icmp eq i32 %N, 0
  %tmp8 = add i32 %N, 3
  %tmp9 = lshr i32 %tmp8, 2
  %tmp10 = shl nuw i32 %tmp9, 2
  %tmp11 = add i32 %tmp10, -4
  %tmp12 = lshr i32 %tmp11, 2
  %tmp13 = add nuw nsw i32 %tmp12, 1
  br i1 %cmp8, label %for.cond.cleanup, label %vector.ph

vector.ph:                                        ; preds = %entry
  %trip.count.minus.1 = add i32 %N, -1
  %broadcast.splatinsert10 = insertelement <4 x i32> undef, i32 %trip.count.minus.1, i32 0
  %broadcast.splat11 = shufflevector <4 x i32> %broadcast.splatinsert10, <4 x i32> undef, <4 x i32> zeroinitializer
  %start = call i32 @llvm.start.loop.iterations.i32(i32 %tmp13)
  br label %vector.body

vector.body:                                      ; preds = %vector.body, %vector.ph
  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
  %tmp14 = phi i32 [ %start, %vector.ph ], [ %tmp15, %vector.body ]
  %broadcast.splatinsert = insertelement <4 x i32> undef, i32 %index, i32 0
  %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer
  %induction = add <4 x i32> %broadcast.splat, <i32 0, i32 1, i32 2, i32 3>
  %tmp = getelementptr inbounds i32, i32* %a, i32 %index

;  %tmp1 = icmp ule <4 x i32> %induction, %broadcast.splat11
  %active.lane.mask = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %N)

  %wrong = icmp ult <4 x i32> %induction, %broadcast.splat11
  %tmp2 = bitcast i32* %tmp to <4 x i32>*
  %wide.masked.load = tail call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %tmp2, i32 4, <4 x i1> %active.lane.mask, <4 x i32> undef)
  %tmp3 = getelementptr inbounds i32, i32* %b, i32 %index
  %tmp4 = bitcast i32* %tmp3 to <4 x i32>*
  %wide.masked.load12 = tail call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %tmp4, i32 4, <4 x i1> %active.lane.mask, <4 x i32> undef)
  %tmp5 = mul nsw <4 x i32> %wide.masked.load12, %wide.masked.load
  %tmp6 = getelementptr inbounds i32, i32* %c, i32 %index
  %tmp7 = bitcast i32* %tmp6 to <4 x i32>*
  tail call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> %tmp5, <4 x i32>* %tmp7, i32 4, <4 x i1> %wrong)
  %index.next = add i32 %index, 4
  %tmp15 = call i32 @llvm.loop.decrement.reg.i32(i32 %tmp14, i32 1)
  %tmp16 = icmp ne i32 %tmp15, 0
  br i1 %tmp16, label %vector.body, label %for.cond.cleanup

for.cond.cleanup:                                 ; preds = %vector.body, %entry
  ret void
}

; TODO: Multiple get.active.lane.mask intrinsics are not yet supported.
; This loop is currently rejected because the vector body has been unrolled,
; so the induction step no longer matches the vector width:
;
;   Step value 16 doesn't match vector width 4
;
; CHECK-LABEL: interleave4
; CHECK: vector.body:
; CHECK:  %active.lane.mask = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %N)
; CHECK:  %active.lane.mask{{.*}} = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %v7, i32 %N)
; CHECK:  %active.lane.mask{{.*}} = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %v8, i32 %N)
; CHECK:  %active.lane.mask{{.*}} = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %v9, i32 %N)
;
define dso_local void @interleave4(i32* noalias nocapture %A, i32* noalias nocapture readonly %B, i32* noalias nocapture readonly %C, i32 %N) local_unnamed_addr #0 {
entry:
  %cmp8 = icmp sgt i32 %N, 0
  %v0 = add i32 %N, 15
  %v1 = lshr i32 %v0, 4
  %v2 = shl nuw i32 %v1, 4
  %v3 = add i32 %v2, -16
  %v4 = lshr i32 %v3, 4
  %v5 = add nuw nsw i32 %v4, 1
  br i1 %cmp8, label %vector.ph, label %for.cond.cleanup

vector.ph:
  %trip.count.minus.1 = add i32 %N, -1
  %scevgep = getelementptr i32, i32* %A, i32 8
  %scevgep30 = getelementptr i32, i32* %C, i32 8
  %scevgep37 = getelementptr i32, i32* %B, i32 8
  %start = call i32 @llvm.start.loop.iterations.i32(i32 %v5)
  br label %vector.body

vector.body:
  %lsr.iv38 = phi i32* [ %scevgep39, %vector.body ], [ %scevgep37, %vector.ph ]
  %lsr.iv31 = phi i32* [ %scevgep32, %vector.body ], [ %scevgep30, %vector.ph ]
  %lsr.iv = phi i32* [ %scevgep25, %vector.body ], [ %scevgep, %vector.ph ]
  %index = phi i32 [ 0, %vector.ph ], [ %v14, %vector.body ]
  %v6 = phi i32 [ %start, %vector.ph ], [ %v15, %vector.body ]
  %lsr.iv3840 = bitcast i32* %lsr.iv38 to <4 x i32>*
  %lsr.iv3133 = bitcast i32* %lsr.iv31 to <4 x i32>*
  %lsr.iv26 = bitcast i32* %lsr.iv to <4 x i32>*
  %active.lane.mask = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %N)
  %v7 = add i32 %index, 4
  %active.lane.mask15 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %v7, i32 %N)
  %v8 = add i32 %v7, 4
  %active.lane.mask16 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %v8, i32 %N)
  %v9 = add i32 %v8, 4
  %active.lane.mask17 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %v9, i32 %N)
  %scevgep42 = getelementptr <4 x i32>, <4 x i32>* %lsr.iv3840, i32 -2
  %wide.masked.load = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %scevgep42, i32 4, <4 x i1> %active.lane.mask, <4 x i32> undef)
  %scevgep43 = getelementptr <4 x i32>, <4 x i32>* %lsr.iv3840, i32 -1
  %wide.masked.load18 = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* nonnull %scevgep43, i32 4, <4 x i1> %active.lane.mask15, <4 x i32> undef)
  %wide.masked.load19 = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* nonnull %lsr.iv3840, i32 4, <4 x i1> %active.lane.mask16, <4 x i32> undef)
  %scevgep41 = getelementptr <4 x i32>, <4 x i32>* %lsr.iv3840, i32 1
  %wide.masked.load20 = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* nonnull %scevgep41, i32 4, <4 x i1> %active.lane.mask17, <4 x i32> undef)
  %scevgep34 = getelementptr <4 x i32>, <4 x i32>* %lsr.iv3133, i32 -2
  %wide.masked.load21 = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %scevgep34, i32 4, <4 x i1> %active.lane.mask, <4 x i32> undef)
  %scevgep35 = getelementptr <4 x i32>, <4 x i32>* %lsr.iv3133, i32 -1
  %wide.masked.load22 = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* nonnull %scevgep35, i32 4, <4 x i1> %active.lane.mask15, <4 x i32> undef)
  %wide.masked.load23 = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* nonnull %lsr.iv3133, i32 4, <4 x i1> %active.lane.mask16, <4 x i32> undef)
  %scevgep36 = getelementptr <4 x i32>, <4 x i32>* %lsr.iv3133, i32 1
  %wide.masked.load24 = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* nonnull %scevgep36, i32 4, <4 x i1> %active.lane.mask17, <4 x i32> undef)
  %v10 = add nsw <4 x i32> %wide.masked.load21, %wide.masked.load
  %v11 = add nsw <4 x i32> %wide.masked.load22, %wide.masked.load18
  %v12 = add nsw <4 x i32> %wide.masked.load23, %wide.masked.load19
  %v13 = add nsw <4 x i32> %wide.masked.load24, %wide.masked.load20
  %scevgep27 = getelementptr <4 x i32>, <4 x i32>* %lsr.iv26, i32 -2
  call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> %v10, <4 x i32>* %scevgep27, i32 4, <4 x i1> %active.lane.mask)
  %scevgep28 = getelementptr <4 x i32>, <4 x i32>* %lsr.iv26, i32 -1
  call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> %v11, <4 x i32>* %scevgep28, i32 4, <4 x i1> %active.lane.mask15)
  call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> %v12, <4 x i32>* %lsr.iv26, i32 4, <4 x i1> %active.lane.mask16)
  %scevgep29 = getelementptr <4 x i32>, <4 x i32>* %lsr.iv26, i32 1
  call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> %v13, <4 x i32>* %scevgep29, i32 4, <4 x i1> %active.lane.mask17)
  %scevgep25 = getelementptr i32, i32* %lsr.iv, i32 16
  %scevgep32 = getelementptr i32, i32* %lsr.iv31, i32 16
  %scevgep39 = getelementptr i32, i32* %lsr.iv38, i32 16
  %v14 = add i32 %v9, 4
  %v15 = call i32 @llvm.loop.decrement.reg.i32(i32 %v6, i32 1)
  %v16 = icmp ne i32 %v15, 0
  br i1 %v16, label %vector.body, label %for.cond.cleanup

for.cond.cleanup:
  ret void
}

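; Negative test: the element count passed to get.active.lane.mask is the
; constant 42 rather than a value derived from the loop tripcount, so the
; pass cannot verify it against the number of elements processed and must
; leave the loop unpredicated.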
; CHECK-LABEL: const_expected_in_set_loop
; CHECK:       call <4 x i1> @llvm.get.active.lane.mask
; CHECK-NOT:   vctp
; CHECK:       ret void
;
define dso_local void @const_expected_in_set_loop(i32* noalias nocapture %A, i32* noalias nocapture readonly %B, i32* noalias nocapture readonly %C, i32 %N) local_unnamed_addr #0 {
entry:
  %cmp8 = icmp sgt i32 %N, 0
  %0 = add i32 %N, 3
  %1 = lshr i32 %0, 2
  %2 = shl nuw i32 %1, 2
  %3 = add i32 %2, -4
  %4 = lshr i32 %3, 2
  %5 = add nuw nsw i32 %4, 1
  br i1 %cmp8, label %vector.ph, label %for.cond.cleanup

vector.ph:
  %start = call i32 @llvm.start.loop.iterations.i32(i32 %5)
  br label %vector.body

vector.body:                                      ; preds = %vector.body, %vector.ph
  %lsr.iv17 = phi i32* [ %scevgep18, %vector.body ], [ %A, %vector.ph ]
  %lsr.iv14 = phi i32* [ %scevgep15, %vector.body ], [ %C, %vector.ph ]
  %lsr.iv = phi i32* [ %scevgep, %vector.body ], [ %B, %vector.ph ]
  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
  %6 = phi i32 [ %start, %vector.ph ], [ %8, %vector.body ]
  %lsr.iv13 = bitcast i32* %lsr.iv to <4 x i32>*
  %lsr.iv1416 = bitcast i32* %lsr.iv14 to <4 x i32>*
  %lsr.iv1719 = bitcast i32* %lsr.iv17 to <4 x i32>*

  %active.lane.mask = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 42)

  %wide.masked.load = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %lsr.iv13, i32 4, <4 x i1> %active.lane.mask, <4 x i32> undef)
  %wide.masked.load12 = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %lsr.iv1416, i32 4, <4 x i1> %active.lane.mask, <4 x i32> undef)
  %7 = add nsw <4 x i32> %wide.masked.load12, %wide.masked.load
  call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> %7, <4 x i32>* %lsr.iv1719, i32 4, <4 x i1> %active.lane.mask)
  %index.next = add i32 %index, 4
  %scevgep = getelementptr i32, i32* %lsr.iv, i32 4
  %scevgep15 = getelementptr i32, i32* %lsr.iv14, i32 4
  %scevgep18 = getelementptr i32, i32* %lsr.iv17, i32 4
  %8 = call i32 @llvm.loop.decrement.reg.i32(i32 %6, i32 1)
  %9 = icmp ne i32 %8, 0
  br i1 %9, label %vector.body, label %for.cond.cleanup

for.cond.cleanup:                                 ; preds = %vector.body, %entry
  ret void
}

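; Negative test: the element count operand of get.active.lane.mask is %index,
; which changes on every iteration rather than being loop-invariant, so no
; vctp can be formed.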
; CHECK-LABEL: tripcount_arg_not_invariant
; CHECK:       call <4 x i1> @llvm.get.active.lane.mask
; CHECK-NOT:   vctp
; CHECK:       ret void
;
define dso_local void @tripcount_arg_not_invariant(i32* noalias nocapture %A, i32* noalias nocapture readonly %B, i32* noalias nocapture readonly %C, i32 %N) local_unnamed_addr #0 {
entry:
  %cmp8 = icmp sgt i32 %N, 0
  %0 = add i32 %N, 3
  %1 = lshr i32 %0, 2
  %2 = shl nuw i32 %1, 2
  %3 = add i32 %2, -4
  %4 = lshr i32 %3, 2
  %5 = add nuw nsw i32 %4, 1
  br i1 %cmp8, label %vector.ph, label %for.cond.cleanup

vector.ph:                                        ; preds = %entry
  %trip.count.minus.1 = add i32 %N, -1
  %start = call i32 @llvm.start.loop.iterations.i32(i32 %5)
  br label %vector.body

vector.body:                                      ; preds = %vector.body, %vector.ph
  %lsr.iv17 = phi i32* [ %scevgep18, %vector.body ], [ %A, %vector.ph ]
  %lsr.iv14 = phi i32* [ %scevgep15, %vector.body ], [ %C, %vector.ph ]
  %lsr.iv = phi i32* [ %scevgep, %vector.body ], [ %B, %vector.ph ]
  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
  %6 = phi i32 [ %start, %vector.ph ], [ %8, %vector.body ]

  %lsr.iv13 = bitcast i32* %lsr.iv to <4 x i32>*
  %lsr.iv1416 = bitcast i32* %lsr.iv14 to <4 x i32>*
  %lsr.iv1719 = bitcast i32* %lsr.iv17 to <4 x i32>*

  %active.lane.mask = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %index)

  %wide.masked.load = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %lsr.iv13, i32 4, <4 x i1> %active.lane.mask, <4 x i32> undef)
  %wide.masked.load12 = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %lsr.iv1416, i32 4, <4 x i1> %active.lane.mask, <4 x i32> undef)
  %7 = add nsw <4 x i32> %wide.masked.load12, %wide.masked.load
  call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> %7, <4 x i32>* %lsr.iv1719, i32 4, <4 x i1> %active.lane.mask)
  %index.next = add i32 %index, 4
  %scevgep = getelementptr i32, i32* %lsr.iv, i32 4
  %scevgep15 = getelementptr i32, i32* %lsr.iv14, i32 4
  %scevgep18 = getelementptr i32, i32* %lsr.iv17, i32 4
  %8 = call i32 @llvm.loop.decrement.reg.i32(i32 %6, i32 1)
  %9 = icmp ne i32 %8, 0
  ;br i1 %9, label %vector.body, label %for.cond.cleanup
  br i1 %9, label %vector.body, label %vector.ph

for.cond.cleanup:                                 ; preds = %vector.body, %entry
  ret void
}

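; Negative test: the induction variable starts at 1 rather than 0 (see the
; phi and the comment below), so the lane mask does not describe the
; {0,+,step} AddRec the pass expects and the loop is left unpredicated.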
; CHECK-LABEL: addrec_base_not_zero
; CHECK:       call <4 x i1> @llvm.get.active.lane.mask
; CHECK-NOT:   vctp
; CHECK:       ret void
;
define dso_local void @addrec_base_not_zero(i32* noalias nocapture %A, i32* noalias nocapture readonly %B, i32* noalias nocapture readonly %C, i32 %N) local_unnamed_addr #0 {
entry:
  %cmp8 = icmp sgt i32 %N, 0
  %0 = add i32 %N, 3
  %1 = lshr i32 %0, 2
  %2 = shl nuw i32 %1, 2
  %3 = add i32 %2, -4
  %4 = lshr i32 %3, 2
  %5 = add nuw nsw i32 %4, 1
  br i1 %cmp8, label %vector.ph, label %for.cond.cleanup

vector.ph:                                        ; preds = %entry
  %trip.count.minus.1 = add i32 %N, -1
  %start = call i32 @llvm.start.loop.iterations.i32(i32 %5)
  br label %vector.body

vector.body:                                      ; preds = %vector.body, %vector.ph
  %lsr.iv17 = phi i32* [ %scevgep18, %vector.body ], [ %A, %vector.ph ]
  %lsr.iv14 = phi i32* [ %scevgep15, %vector.body ], [ %C, %vector.ph ]
  %lsr.iv = phi i32* [ %scevgep, %vector.body ], [ %B, %vector.ph ]

; AddRec base is not 0:
  %index = phi i32 [ 1, %vector.ph ], [ %index.next, %vector.body ]

  %6 = phi i32 [ %start, %vector.ph ], [ %8, %vector.body ]
  %lsr.iv13 = bitcast i32* %lsr.iv to <4 x i32>*
  %lsr.iv1416 = bitcast i32* %lsr.iv14 to <4 x i32>*
  %lsr.iv1719 = bitcast i32* %lsr.iv17 to <4 x i32>*
  %active.lane.mask = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %N)
  %wide.masked.load = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %lsr.iv13, i32 4, <4 x i1> %active.lane.mask, <4 x i32> undef)
  %wide.masked.load12 = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %lsr.iv1416, i32 4, <4 x i1> %active.lane.mask, <4 x i32> undef)
  %7 = add nsw <4 x i32> %wide.masked.load12, %wide.masked.load
  call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> %7, <4 x i32>* %lsr.iv1719, i32 4, <4 x i1> %active.lane.mask)
  %index.next = add i32 %index, 4
  %scevgep = getelementptr i32, i32* %lsr.iv, i32 4
  %scevgep15 = getelementptr i32, i32* %lsr.iv14, i32 4
  %scevgep18 = getelementptr i32, i32* %lsr.iv17, i32 4
  %8 = call i32 @llvm.loop.decrement.reg.i32(i32 %6, i32 1)
  %9 = icmp ne i32 %8, 0
  ;br i1 %9, label %vector.body, label %for.cond.cleanup
  br i1 %9, label %vector.body, label %vector.ph

for.cond.cleanup:                                 ; preds = %vector.body, %entry
  ret void
}

declare <16 x i8> @llvm.masked.load.v16i8.p0v16i8(<16 x i8>*, i32 immarg, <16 x i1>, <16 x i8>)
declare void @llvm.masked.store.v16i8.p0v16i8(<16 x i8>, <16 x i8>*, i32 immarg, <16 x i1>)
declare <8 x i16> @llvm.masked.load.v8i16.p0v8i16(<8 x i16>*, i32 immarg, <8 x i1>, <8 x i16>)
declare void @llvm.masked.store.v8i16.p0v8i16(<8 x i16>, <8 x i16>*, i32 immarg, <8 x i1>)
declare <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>*, i32 immarg, <4 x i1>, <4 x i32>)
declare void @llvm.masked.store.v2i64.p0v2i64(<2 x i64>, <2 x i64>*, i32 immarg, <2 x i1>)
declare <2 x i64> @llvm.masked.load.v2i64.p0v2i64(<2 x i64>*, i32 immarg, <2 x i1>, <2 x i64>)
declare void @llvm.masked.store.v4i32.p0v4i32(<4 x i32>, <4 x i32>*, i32 immarg, <4 x i1>)
declare i32 @llvm.start.loop.iterations.i32(i32)
declare i32 @llvm.loop.decrement.reg.i32(i32, i32)
declare <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32, i32)
declare <8 x i1> @llvm.get.active.lane.mask.v8i1.i32(i32, i32)
declare <16 x i1> @llvm.get.active.lane.mask.v16i1.i32(i32, i32)