; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
; RUN: opt -mtriple=thumbv8.1m.main -mve-tail-predication -tail-predication=enabled -mattr=+mve %s -S -o - | FileCheck %s

define dso_local void @foo(i32* noalias nocapture %A, i32* noalias nocapture readonly %B, i32* noalias nocapture readonly %C, i32* noalias nocapture readnone %D, i32 %N) local_unnamed_addr #0 {
; CHECK-LABEL: @foo(
; CHECK-NEXT:  entry:
; CHECK-NEXT:    [[START:%.*]] = call i32 @llvm.start.loop.iterations.i32(i32 8001)
; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
; CHECK:       vector.body:
; CHECK-NEXT:    [[LSR_IV14:%.*]] = phi i32* [ [[SCEVGEP15:%.*]], [[VECTOR_BODY]] ], [ [[A:%.*]], [[ENTRY:%.*]] ]
; CHECK-NEXT:    [[LSR_IV11:%.*]] = phi i32* [ [[SCEVGEP12:%.*]], [[VECTOR_BODY]] ], [ [[C:%.*]], [[ENTRY]] ]
; CHECK-NEXT:    [[LSR_IV:%.*]] = phi i32* [ [[SCEVGEP:%.*]], [[VECTOR_BODY]] ], [ [[B:%.*]], [[ENTRY]] ]
; CHECK-NEXT:    [[INDEX:%.*]] = phi i32 [ 0, [[ENTRY]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT:    [[TMP0:%.*]] = phi i32 [ [[START]], [[ENTRY]] ], [ [[TMP5:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT:    [[TMP1:%.*]] = phi i32 [ 32003, [[ENTRY]] ], [ [[TMP3:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT:    [[LSR_IV1416:%.*]] = bitcast i32* [[LSR_IV14]] to <4 x i32>*
; CHECK-NEXT:    [[LSR_IV1113:%.*]] = bitcast i32* [[LSR_IV11]] to <4 x i32>*
; CHECK-NEXT:    [[LSR_IV10:%.*]] = bitcast i32* [[LSR_IV]] to <4 x i32>*
; CHECK-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i32> undef, i32 [[INDEX]], i32 0
; CHECK-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT]], <4 x i32> undef, <4 x i32> zeroinitializer
; CHECK-NEXT:    [[INDUCTION:%.*]] = add <4 x i32> [[BROADCAST_SPLAT]], <i32 0, i32 1, i32 2, i32 3>
; CHECK-NEXT:    [[TMP2:%.*]] = call <4 x i1> @llvm.arm.mve.vctp32(i32 [[TMP1]])
; CHECK-NEXT:    [[TMP3]] = sub i32 [[TMP1]], 4
; CHECK-NEXT:    [[WIDE_MASKED_LOAD:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* [[LSR_IV10]], i32 4, <4 x i1> [[TMP2]], <4 x i32> undef)
; CHECK-NEXT:    [[WIDE_MASKED_LOAD9:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* [[LSR_IV1113]], i32 4, <4 x i1> [[TMP2]], <4 x i32> undef)
; CHECK-NEXT:    [[TMP4:%.*]] = add nsw <4 x i32> [[WIDE_MASKED_LOAD9]], [[WIDE_MASKED_LOAD]]
; CHECK-NEXT:    call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> [[TMP4]], <4 x i32>* [[LSR_IV1416]], i32 4, <4 x i1> [[TMP2]])
; CHECK-NEXT:    [[INDEX_NEXT]] = add i32 [[INDEX]], 4
; CHECK-NEXT:    [[SCEVGEP]] = getelementptr i32, i32* [[LSR_IV]], i32 4
; CHECK-NEXT:    [[SCEVGEP12]] = getelementptr i32, i32* [[LSR_IV11]], i32 4
; CHECK-NEXT:    [[SCEVGEP15]] = getelementptr i32, i32* [[LSR_IV14]], i32 4
; CHECK-NEXT:    [[TMP5]] = call i32 @llvm.loop.decrement.reg.i32(i32 [[TMP0]], i32 1)
; CHECK-NEXT:    [[TMP6:%.*]] = icmp ne i32 [[TMP5]], 0
; CHECK-NEXT:    br i1 [[TMP6]], label [[VECTOR_BODY]], label [[FOR_COND_CLEANUP:%.*]]
; CHECK:       for.cond.cleanup:
; CHECK-NEXT:    ret void
;
entry:
  %start = call i32 @llvm.start.loop.iterations.i32(i32 8001)
  br label %vector.body

vector.body:
  %lsr.iv14 = phi i32* [ %scevgep15, %vector.body ], [ %A, %entry ]
  %lsr.iv11 = phi i32* [ %scevgep12, %vector.body ], [ %C, %entry ]
  %lsr.iv = phi i32* [ %scevgep, %vector.body ], [ %B, %entry ]
  %index = phi i32 [ 0, %entry ], [ %index.next, %vector.body ]
  %0 = phi i32 [ %start, %entry ], [ %3, %vector.body ]
  %lsr.iv1416 = bitcast i32* %lsr.iv14 to <4 x i32>*
  %lsr.iv1113 = bitcast i32* %lsr.iv11 to <4 x i32>*
  %lsr.iv10 = bitcast i32* %lsr.iv to <4 x i32>*
  %broadcast.splatinsert = insertelement <4 x i32> undef, i32 %index, i32 0
  %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer
  %induction = add <4 x i32> %broadcast.splat, <i32 0, i32 1, i32 2, i32 3>

  ; %1 = icmp ult <4 x i32> %induction, <i32 32002, i32 32002, i32 32002, i32 32002>
  %1 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 32003)
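; Note on the lane mask above (per the intrinsic's LangRef semantics): lane i of
; @llvm.get.active.lane.mask(%index, 32003) is active iff %index + i < 32003 (unsigned).
; With 32003 elements and a VF of 4 the loop runs ceil(32003 / 4) = 8001 iterations,
; which matches the hardware-loop count in the entry block, and only 3 lanes are active
; in the final iteration. This is the shape the pass lowers to @llvm.arm.mve.vctp32 on a
; counter that starts at 32003 and is decremented by 4, as the CHECK lines above show.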

  %wide.masked.load = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %lsr.iv10, i32 4, <4 x i1> %1, <4 x i32> undef)
  %wide.masked.load9 = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %lsr.iv1113, i32 4, <4 x i1> %1, <4 x i32> undef)
  %2 = add nsw <4 x i32> %wide.masked.load9, %wide.masked.load
  call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> %2, <4 x i32>* %lsr.iv1416, i32 4, <4 x i1> %1)
  %index.next = add i32 %index, 4
  %scevgep = getelementptr i32, i32* %lsr.iv, i32 4
  %scevgep12 = getelementptr i32, i32* %lsr.iv11, i32 4
  %scevgep15 = getelementptr i32, i32* %lsr.iv14, i32 4
  %3 = call i32 @llvm.loop.decrement.reg.i32(i32 %0, i32 1)
  %4 = icmp ne i32 %3, 0
  br i1 %4, label %vector.body, label %for.cond.cleanup

for.cond.cleanup:
  ret void
}

; Sanity-check test case: the loop count is constant and a multiple of the
; vectorisation factor, so the vectoriser should not produce masked loads/stores
; and there is nothing to tail-predicate here.
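; For the record: 2000 hardware-loop iterations * VF 4 = 8000 elements, an exact
; multiple, so every vector iteration is full and no lane mask or vctp is needed.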
define dso_local void @foo2(i32* noalias nocapture %A, i32* noalias nocapture readonly %B, i32* noalias nocapture readonly %C, i32* noalias nocapture readnone %D, i32 %N) local_unnamed_addr #0 {
; CHECK-LABEL: @foo2(
; CHECK-NEXT:  entry:
; CHECK-NEXT:    [[START:%.*]] = call i32 @llvm.start.loop.iterations.i32(i32 2000)
; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
; CHECK:       vector.body:
; CHECK-NEXT:    [[LSR_IV14:%.*]] = phi i32* [ [[SCEVGEP15:%.*]], [[VECTOR_BODY]] ], [ [[A:%.*]], [[ENTRY:%.*]] ]
; CHECK-NEXT:    [[LSR_IV11:%.*]] = phi i32* [ [[SCEVGEP12:%.*]], [[VECTOR_BODY]] ], [ [[C:%.*]], [[ENTRY]] ]
; CHECK-NEXT:    [[LSR_IV:%.*]] = phi i32* [ [[SCEVGEP:%.*]], [[VECTOR_BODY]] ], [ [[B:%.*]], [[ENTRY]] ]
; CHECK-NEXT:    [[TMP0:%.*]] = phi i32 [ [[START]], [[ENTRY]] ], [ [[TMP2:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT:    [[LSR_IV1416:%.*]] = bitcast i32* [[LSR_IV14]] to <4 x i32>*
; CHECK-NEXT:    [[LSR_IV1113:%.*]] = bitcast i32* [[LSR_IV11]] to <4 x i32>*
; CHECK-NEXT:    [[LSR_IV10:%.*]] = bitcast i32* [[LSR_IV]] to <4 x i32>*
; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x i32>, <4 x i32>* [[LSR_IV10]], align 4
; CHECK-NEXT:    [[WIDE_LOAD9:%.*]] = load <4 x i32>, <4 x i32>* [[LSR_IV1113]], align 4
; CHECK-NEXT:    [[TMP1:%.*]] = add nsw <4 x i32> [[WIDE_LOAD9]], [[WIDE_LOAD]]
; CHECK-NEXT:    store <4 x i32> [[TMP1]], <4 x i32>* [[LSR_IV1416]], align 4
; CHECK-NEXT:    [[SCEVGEP]] = getelementptr i32, i32* [[LSR_IV]], i32 4
; CHECK-NEXT:    [[SCEVGEP12]] = getelementptr i32, i32* [[LSR_IV11]], i32 4
; CHECK-NEXT:    [[SCEVGEP15]] = getelementptr i32, i32* [[LSR_IV14]], i32 4
; CHECK-NEXT:    [[TMP2]] = call i32 @llvm.loop.decrement.reg.i32(i32 [[TMP0]], i32 1)
; CHECK-NEXT:    [[TMP3:%.*]] = icmp ne i32 [[TMP2]], 0
; CHECK-NEXT:    br i1 [[TMP3]], label [[VECTOR_BODY]], label [[FOR_COND_CLEANUP:%.*]]
; CHECK:       for.cond.cleanup:
; CHECK-NEXT:    ret void
;
entry:
  %start = call i32 @llvm.start.loop.iterations.i32(i32 2000)
  br label %vector.body

vector.body:
  %lsr.iv14 = phi i32* [ %scevgep15, %vector.body ], [ %A, %entry ]
  %lsr.iv11 = phi i32* [ %scevgep12, %vector.body ], [ %C, %entry ]
  %lsr.iv = phi i32* [ %scevgep, %vector.body ], [ %B, %entry ]
  %0 = phi i32 [ %start, %entry ], [ %2, %vector.body ]
  %lsr.iv1416 = bitcast i32* %lsr.iv14 to <4 x i32>*
  %lsr.iv1113 = bitcast i32* %lsr.iv11 to <4 x i32>*
  %lsr.iv10 = bitcast i32* %lsr.iv to <4 x i32>*
  %wide.load = load <4 x i32>, <4 x i32>* %lsr.iv10, align 4
  %wide.load9 = load <4 x i32>, <4 x i32>* %lsr.iv1113, align 4
  %1 = add nsw <4 x i32> %wide.load9, %wide.load
  store <4 x i32> %1, <4 x i32>* %lsr.iv1416, align 4
  %scevgep = getelementptr i32, i32* %lsr.iv, i32 4
  %scevgep12 = getelementptr i32, i32* %lsr.iv11, i32 4
  %scevgep15 = getelementptr i32, i32* %lsr.iv14, i32 4
  %2 = call i32 @llvm.loop.decrement.reg.i32(i32 %0, i32 1)
  %3 = icmp ne i32 %2, 0
  br i1 %3, label %vector.body, label %for.cond.cleanup

for.cond.cleanup:
  ret void
}

; Check that the icmp must be a ULT: the UGT below is not treated as a tail predicate
; and is left unchanged.
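; Roughly speaking, a tail predicate enables a leading run of lanes and disables the
; trailing ones, e.g. <1, 1, 1, 0> when 3 elements remain; a UGT compare against a
; uniform constant does the opposite within a vector (it enables only trailing lanes),
; so it can never be rewritten into a vctp.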
define dso_local void @foo3(i32* noalias nocapture %A, i32* noalias nocapture readonly %B, i32* noalias nocapture readonly %C, i32* noalias nocapture readnone %D, i32 %N) local_unnamed_addr #0 {
; CHECK-LABEL: @foo3(
; CHECK-NEXT:  entry:
; CHECK-NEXT:    [[START:%.*]] = call i32 @llvm.start.loop.iterations.i32(i32 8001)
; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
; CHECK:       vector.body:
; CHECK-NEXT:    [[LSR_IV14:%.*]] = phi i32* [ [[SCEVGEP15:%.*]], [[VECTOR_BODY]] ], [ [[A:%.*]], [[ENTRY:%.*]] ]
; CHECK-NEXT:    [[LSR_IV11:%.*]] = phi i32* [ [[SCEVGEP12:%.*]], [[VECTOR_BODY]] ], [ [[C:%.*]], [[ENTRY]] ]
; CHECK-NEXT:    [[LSR_IV:%.*]] = phi i32* [ [[SCEVGEP:%.*]], [[VECTOR_BODY]] ], [ [[B:%.*]], [[ENTRY]] ]
; CHECK-NEXT:    [[INDEX:%.*]] = phi i32 [ 0, [[ENTRY]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT:    [[TMP0:%.*]] = phi i32 [ [[START]], [[ENTRY]] ], [ [[TMP3:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT:    [[LSR_IV1416:%.*]] = bitcast i32* [[LSR_IV14]] to <4 x i32>*
; CHECK-NEXT:    [[LSR_IV1113:%.*]] = bitcast i32* [[LSR_IV11]] to <4 x i32>*
; CHECK-NEXT:    [[LSR_IV10:%.*]] = bitcast i32* [[LSR_IV]] to <4 x i32>*
; CHECK-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i32> undef, i32 [[INDEX]], i32 0
; CHECK-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT]], <4 x i32> undef, <4 x i32> zeroinitializer
; CHECK-NEXT:    [[INDUCTION:%.*]] = add <4 x i32> [[BROADCAST_SPLAT]], <i32 0, i32 1, i32 2, i32 3>
; CHECK-NEXT:    [[TMP1:%.*]] = icmp ugt <4 x i32> [[INDUCTION]], <i32 32002, i32 32002, i32 32002, i32 32002>
; CHECK-NEXT:    [[WIDE_MASKED_LOAD:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* [[LSR_IV10]], i32 4, <4 x i1> [[TMP1]], <4 x i32> undef)
; CHECK-NEXT:    [[WIDE_MASKED_LOAD9:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* [[LSR_IV1113]], i32 4, <4 x i1> [[TMP1]], <4 x i32> undef)
; CHECK-NEXT:    [[TMP2:%.*]] = add nsw <4 x i32> [[WIDE_MASKED_LOAD9]], [[WIDE_MASKED_LOAD]]
; CHECK-NEXT:    call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> [[TMP2]], <4 x i32>* [[LSR_IV1416]], i32 4, <4 x i1> [[TMP1]])
; CHECK-NEXT:    [[INDEX_NEXT]] = add i32 [[INDEX]], 4
; CHECK-NEXT:    [[SCEVGEP]] = getelementptr i32, i32* [[LSR_IV]], i32 4
; CHECK-NEXT:    [[SCEVGEP12]] = getelementptr i32, i32* [[LSR_IV11]], i32 4
; CHECK-NEXT:    [[SCEVGEP15]] = getelementptr i32, i32* [[LSR_IV14]], i32 4
; CHECK-NEXT:    [[TMP3]] = call i32 @llvm.loop.decrement.reg.i32(i32 [[TMP0]], i32 1)
; CHECK-NEXT:    [[TMP4:%.*]] = icmp ne i32 [[TMP3]], 0
; CHECK-NEXT:    br i1 [[TMP4]], label [[VECTOR_BODY]], label [[FOR_COND_CLEANUP:%.*]]
; CHECK:       for.cond.cleanup:
; CHECK-NEXT:    ret void
;
entry:
  %start = call i32 @llvm.start.loop.iterations.i32(i32 8001)
  br label %vector.body

vector.body:
  %lsr.iv14 = phi i32* [ %scevgep15, %vector.body ], [ %A, %entry ]
  %lsr.iv11 = phi i32* [ %scevgep12, %vector.body ], [ %C, %entry ]
  %lsr.iv = phi i32* [ %scevgep, %vector.body ], [ %B, %entry ]
  %index = phi i32 [ 0, %entry ], [ %index.next, %vector.body ]
  %0 = phi i32 [ %start, %entry ], [ %3, %vector.body ]
  %lsr.iv1416 = bitcast i32* %lsr.iv14 to <4 x i32>*
  %lsr.iv1113 = bitcast i32* %lsr.iv11 to <4 x i32>*
  %lsr.iv10 = bitcast i32* %lsr.iv to <4 x i32>*
  %broadcast.splatinsert = insertelement <4 x i32> undef, i32 %index, i32 0
  %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer
  %induction = add <4 x i32> %broadcast.splat, <i32 0, i32 1, i32 2, i32 3>

; UGT here:
  %1 = icmp ugt <4 x i32> %induction, <i32 32002, i32 32002, i32 32002, i32 32002>

  %wide.masked.load = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %lsr.iv10, i32 4, <4 x i1> %1, <4 x i32> undef)
  %wide.masked.load9 = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %lsr.iv1113, i32 4, <4 x i1> %1, <4 x i32> undef)
  %2 = add nsw <4 x i32> %wide.masked.load9, %wide.masked.load
  call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> %2, <4 x i32>* %lsr.iv1416, i32 4, <4 x i1> %1)
  %index.next = add i32 %index, 4
  %scevgep = getelementptr i32, i32* %lsr.iv, i32 4
  %scevgep12 = getelementptr i32, i32* %lsr.iv11, i32 4
  %scevgep15 = getelementptr i32, i32* %lsr.iv14, i32 4
  %3 = call i32 @llvm.loop.decrement.reg.i32(i32 %0, i32 1)
  %4 = icmp ne i32 %3, 0
  br i1 %4, label %vector.body, label %for.cond.cleanup

for.cond.cleanup:
  ret void
}

define dso_local void @foo5(i32* noalias nocapture %A, i32* noalias nocapture readonly %B, i32* noalias nocapture readonly %C, i32* noalias nocapture readnone %D, i32 %N) local_unnamed_addr #0 {
; CHECK-LABEL: @foo5(
; CHECK-NEXT:  entry:
; CHECK-NEXT:    [[START:%.*]] = call i32 @llvm.start.loop.iterations.i32(i32 8001)
; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
; CHECK:       vector.body:
; CHECK-NEXT:    [[LSR_IV14:%.*]] = phi i32* [ [[SCEVGEP15:%.*]], [[VECTOR_BODY]] ], [ [[A:%.*]], [[ENTRY:%.*]] ]
; CHECK-NEXT:    [[LSR_IV11:%.*]] = phi i32* [ [[SCEVGEP12:%.*]], [[VECTOR_BODY]] ], [ [[C:%.*]], [[ENTRY]] ]
; CHECK-NEXT:    [[LSR_IV:%.*]] = phi i32* [ [[SCEVGEP:%.*]], [[VECTOR_BODY]] ], [ [[B:%.*]], [[ENTRY]] ]
; CHECK-NEXT:    [[INDEX:%.*]] = phi i32 [ 0, [[ENTRY]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT:    [[TMP0:%.*]] = phi i32 [ [[START]], [[ENTRY]] ], [ [[TMP3:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT:    [[LSR_IV1416:%.*]] = bitcast i32* [[LSR_IV14]] to <4 x i32>*
; CHECK-NEXT:    [[LSR_IV1113:%.*]] = bitcast i32* [[LSR_IV11]] to <4 x i32>*
; CHECK-NEXT:    [[LSR_IV10:%.*]] = bitcast i32* [[LSR_IV]] to <4 x i32>*
; CHECK-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i32> undef, i32 [[INDEX]], i32 0
; CHECK-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT]], <4 x i32> undef, <4 x i32> zeroinitializer
; CHECK-NEXT:    [[INDUCTION:%.*]] = add <4 x i32> [[BROADCAST_SPLAT]], <i32 0, i32 1, i32 2, i32 3>
; CHECK-NEXT:    [[TMP1:%.*]] = icmp ult <4 x i32> [[INDUCTION]], <i32 0, i32 3200, i32 32002, i32 32002>
; CHECK-NEXT:    [[WIDE_MASKED_LOAD:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* [[LSR_IV10]], i32 4, <4 x i1> [[TMP1]], <4 x i32> undef)
; CHECK-NEXT:    [[WIDE_MASKED_LOAD9:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* [[LSR_IV1113]], i32 4, <4 x i1> [[TMP1]], <4 x i32> undef)
; CHECK-NEXT:    [[TMP2:%.*]] = add nsw <4 x i32> [[WIDE_MASKED_LOAD9]], [[WIDE_MASKED_LOAD]]
; CHECK-NEXT:    call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> [[TMP2]], <4 x i32>* [[LSR_IV1416]], i32 4, <4 x i1> [[TMP1]])
; CHECK-NEXT:    [[INDEX_NEXT]] = add i32 [[INDEX]], 4
; CHECK-NEXT:    [[SCEVGEP]] = getelementptr i32, i32* [[LSR_IV]], i32 4
; CHECK-NEXT:    [[SCEVGEP12]] = getelementptr i32, i32* [[LSR_IV11]], i32 4
; CHECK-NEXT:    [[SCEVGEP15]] = getelementptr i32, i32* [[LSR_IV14]], i32 4
; CHECK-NEXT:    [[TMP3]] = call i32 @llvm.loop.decrement.reg.i32(i32 [[TMP0]], i32 1)
; CHECK-NEXT:    [[TMP4:%.*]] = icmp ne i32 [[TMP3]], 0
; CHECK-NEXT:    br i1 [[TMP4]], label [[VECTOR_BODY]], label [[FOR_COND_CLEANUP:%.*]]
; CHECK:       for.cond.cleanup:
; CHECK-NEXT:    ret void
;
entry:
  %start = call i32 @llvm.start.loop.iterations.i32(i32 8001)
  br label %vector.body

vector.body:
  %lsr.iv14 = phi i32* [ %scevgep15, %vector.body ], [ %A, %entry ]
  %lsr.iv11 = phi i32* [ %scevgep12, %vector.body ], [ %C, %entry ]
  %lsr.iv = phi i32* [ %scevgep, %vector.body ], [ %B, %entry ]
  %index = phi i32 [ 0, %entry ], [ %index.next, %vector.body ]
  %0 = phi i32 [ %start, %entry ], [ %3, %vector.body ]
  %lsr.iv1416 = bitcast i32* %lsr.iv14 to <4 x i32>*
  %lsr.iv1113 = bitcast i32* %lsr.iv11 to <4 x i32>*
  %lsr.iv10 = bitcast i32* %lsr.iv to <4 x i32>*
  %broadcast.splatinsert = insertelement <4 x i32> undef, i32 %index, i32 0
  %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer
  %induction = add <4 x i32> %broadcast.splat, <i32 0, i32 1, i32 2, i32 3>

; Non-uniform constant vector here. This can't be represented with
; @llvm.get.active.lane.mask, but let's keep this test as a sanity check:
  %1 = icmp ult <4 x i32> %induction, <i32 0, i32 3200, i32 32002, i32 32002>
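; Note: @llvm.get.active.lane.mask(%base, %n) always compares the lanes against a single
; uniform bound %n, so no (%base, %n) pair reproduces the mask computed from
; <i32 0, i32 3200, i32 32002, i32 32002>; the compare and the masked operations it feeds
; therefore survive unchanged, as the CHECK lines above confirm.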

  %wide.masked.load = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %lsr.iv10, i32 4, <4 x i1> %1, <4 x i32> undef)
  %wide.masked.load9 = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %lsr.iv1113, i32 4, <4 x i1> %1, <4 x i32> undef)
  %2 = add nsw <4 x i32> %wide.masked.load9, %wide.masked.load
  call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> %2, <4 x i32>* %lsr.iv1416, i32 4, <4 x i1> %1)
  %index.next = add i32 %index, 4
  %scevgep = getelementptr i32, i32* %lsr.iv, i32 4
  %scevgep12 = getelementptr i32, i32* %lsr.iv11, i32 4
  %scevgep15 = getelementptr i32, i32* %lsr.iv14, i32 4
  %3 = call i32 @llvm.loop.decrement.reg.i32(i32 %0, i32 1)
  %4 = icmp ne i32 %3, 0
  br i1 %4, label %vector.body, label %for.cond.cleanup

for.cond.cleanup:
  ret void
}

; CHECK-LABEL: @inconsistent_tripcounts(
; CHECK:       vector.body:
; CHECK-NOT:   @llvm.arm.mve.vctp32
; CHECK:       @llvm.get.active.lane.mask
; CHECK:       ret void
;
define dso_local void @inconsistent_tripcounts(i32* noalias nocapture %A, i32* noalias nocapture readonly %B, i32* noalias nocapture readonly %C, i32* noalias nocapture readnone %D, i32 %N) local_unnamed_addr #0 {
entry:
  %start = call i32 @llvm.start.loop.iterations.i32(i32 8001)
  br label %vector.body

vector.body:
  %lsr.iv14 = phi i32* [ %scevgep15, %vector.body ], [ %A, %entry ]
  %lsr.iv11 = phi i32* [ %scevgep12, %vector.body ], [ %C, %entry ]
  %lsr.iv = phi i32* [ %scevgep, %vector.body ], [ %B, %entry ]
  %index = phi i32 [ 0, %entry ], [ %index.next, %vector.body ]
  %0 = phi i32 [ %start, %entry ], [ %3, %vector.body ]
  %lsr.iv1416 = bitcast i32* %lsr.iv14 to <4 x i32>*
  %lsr.iv1113 = bitcast i32* %lsr.iv11 to <4 x i32>*
  %lsr.iv10 = bitcast i32* %lsr.iv to <4 x i32>*
  %broadcast.splatinsert = insertelement <4 x i32> undef, i32 %index, i32 0
  %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer
  %induction = add <4 x i32> %broadcast.splat, <i32 0, i32 1, i32 2, i32 3>

; BTC = UINT_MAX, and scalar trip count BTC + 1 would overflow:
  %1 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 4294967295)
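; Note: in i32 arithmetic the implied element count BTC + 1 = 4294967295 + 1 wraps to 0,
; so it cannot be reconciled with the 8001 hardware-loop iterations set up above and the
; lane mask is not lowered to a vctp (hence the CHECK-NOT line above).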

  %wide.masked.load = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %lsr.iv10, i32 4, <4 x i1> %1, <4 x i32> undef)
  %wide.masked.load9 = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %lsr.iv1113, i32 4, <4 x i1> %1, <4 x i32> undef)
  %2 = add nsw <4 x i32> %wide.masked.load9, %wide.masked.load
  call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> %2, <4 x i32>* %lsr.iv1416, i32 4, <4 x i1> %1)
  %index.next = add i32 %index, 4
  %scevgep = getelementptr i32, i32* %lsr.iv, i32 4
  %scevgep12 = getelementptr i32, i32* %lsr.iv11, i32 4
  %scevgep15 = getelementptr i32, i32* %lsr.iv14, i32 4
  %3 = call i32 @llvm.loop.decrement.reg.i32(i32 %0, i32 1)
  %4 = icmp ne i32 %3, 0
  br i1 %4, label %vector.body, label %for.cond.cleanup

for.cond.cleanup:
  ret void
}

; CHECK-LABEL: @overflow_in_sub(
; CHECK:       vector.body:
; CHECK-NOT:   @llvm.arm.mve.vctp32
; CHECK:       @llvm.get.active.lane.mask
; CHECK:       ret void
;
define dso_local void @overflow_in_sub(i32* noalias nocapture %A, i32* noalias nocapture readonly %B, i32* noalias nocapture readonly %C, i32* noalias nocapture readnone %D, i32 %N) local_unnamed_addr #0 {
entry:
  %start = call i32 @llvm.start.loop.iterations.i32(i32 1073741824)
  br label %vector.body

vector.body:
  %lsr.iv14 = phi i32* [ %scevgep15, %vector.body ], [ %A, %entry ]
  %lsr.iv11 = phi i32* [ %scevgep12, %vector.body ], [ %C, %entry ]
  %lsr.iv = phi i32* [ %scevgep, %vector.body ], [ %B, %entry ]
  %index = phi i32 [ 0, %entry ], [ %index.next, %vector.body ]
  %0 = phi i32 [ %start, %entry ], [ %3, %vector.body ]
  %lsr.iv1416 = bitcast i32* %lsr.iv14 to <4 x i32>*
  %lsr.iv1113 = bitcast i32* %lsr.iv11 to <4 x i32>*
  %lsr.iv10 = bitcast i32* %lsr.iv to <4 x i32>*
  %broadcast.splatinsert = insertelement <4 x i32> undef, i32 %index, i32 0
  %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer
  %induction = add <4 x i32> %broadcast.splat, <i32 0, i32 1, i32 2, i32 3>

  %1 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 32003)

  %wide.masked.load = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %lsr.iv10, i32 4, <4 x i1> %1, <4 x i32> undef)
  %wide.masked.load9 = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %lsr.iv1113, i32 4, <4 x i1> %1, <4 x i32> undef)
  %2 = add nsw <4 x i32> %wide.masked.load9, %wide.masked.load
  call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> %2, <4 x i32>* %lsr.iv1416, i32 4, <4 x i1> %1)
  %index.next = add i32 %index, 4
  %scevgep = getelementptr i32, i32* %lsr.iv, i32 4
  %scevgep12 = getelementptr i32, i32* %lsr.iv11, i32 4
  %scevgep15 = getelementptr i32, i32* %lsr.iv14, i32 4
  %3 = call i32 @llvm.loop.decrement.reg.i32(i32 %0, i32 1)
  %4 = icmp ne i32 %3, 0
  br i1 %4, label %vector.body, label %for.cond.cleanup

for.cond.cleanup:
  ret void
}


; CHECK-LABEL: @IV_not_an_induction(
; CHECK:       vector.body:
; CHECK-NOT:   @llvm.arm.mve.vctp32
; CHECK:       @llvm.get.active.lane.mask
; CHECK:       ret void
;
define dso_local void @IV_not_an_induction(i32* noalias nocapture %A, i32* noalias nocapture readonly %B, i32* noalias nocapture readonly %C, i32* noalias nocapture readnone %D, i32 %N) local_unnamed_addr #0 {
entry:
  %start = call i32 @llvm.start.loop.iterations.i32(i32 8001)
  br label %vector.body

vector.body:
  %lsr.iv14 = phi i32* [ %scevgep15, %vector.body ], [ %A, %entry ]
  %lsr.iv11 = phi i32* [ %scevgep12, %vector.body ], [ %C, %entry ]
  %lsr.iv = phi i32* [ %scevgep, %vector.body ], [ %B, %entry ]
  %index = phi i32 [ 0, %entry ], [ %index.next, %vector.body ]
  %0 = phi i32 [ %start, %entry ], [ %3, %vector.body ]
  %lsr.iv1416 = bitcast i32* %lsr.iv14 to <4 x i32>*
  %lsr.iv1113 = bitcast i32* %lsr.iv11 to <4 x i32>*
  %lsr.iv10 = bitcast i32* %lsr.iv to <4 x i32>*
  %broadcast.splatinsert = insertelement <4 x i32> undef, i32 %index, i32 0
  %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer
  %induction = add <4 x i32> %broadcast.splat, <i32 0, i32 1, i32 2, i32 3>

; %N is a function argument, i.e. loop-invariant and not an induction variable:
  %1 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %N, i32 32003)

  %wide.masked.load = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %lsr.iv10, i32 4, <4 x i1> %1, <4 x i32> undef)
  %wide.masked.load9 = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %lsr.iv1113, i32 4, <4 x i1> %1, <4 x i32> undef)
  %2 = add nsw <4 x i32> %wide.masked.load9, %wide.masked.load
  call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> %2, <4 x i32>* %lsr.iv1416, i32 4, <4 x i1> %1)
  %index.next = add i32 %index, 4
  %scevgep = getelementptr i32, i32* %lsr.iv, i32 4
  %scevgep12 = getelementptr i32, i32* %lsr.iv11, i32 4
  %scevgep15 = getelementptr i32, i32* %lsr.iv14, i32 4
  %3 = call i32 @llvm.loop.decrement.reg.i32(i32 %0, i32 1)
  %4 = icmp ne i32 %3, 0
  br i1 %4, label %vector.body, label %for.cond.cleanup

for.cond.cleanup:
  ret void
}

; CHECK-LABEL: @IV_wrong_step(
; CHECK:       vector.body:
; CHECK-NOT:   @llvm.arm.mve.vctp32
; CHECK:       @llvm.get.active.lane.mask
; CHECK:       ret void
;
define dso_local void @IV_wrong_step(i32* noalias nocapture %A, i32* noalias nocapture readonly %B, i32* noalias nocapture readonly %C, i32* noalias nocapture readnone %D, i32 %N) local_unnamed_addr #0 {
entry:
  %start = call i32 @llvm.start.loop.iterations.i32(i32 8001)
  br label %vector.body

vector.body:
  %lsr.iv14 = phi i32* [ %scevgep15, %vector.body ], [ %A, %entry ]
  %lsr.iv11 = phi i32* [ %scevgep12, %vector.body ], [ %C, %entry ]
  %lsr.iv = phi i32* [ %scevgep, %vector.body ], [ %B, %entry ]
  %index = phi i32 [ 0, %entry ], [ %index.next, %vector.body ]
  %0 = phi i32 [ %start, %entry ], [ %3, %vector.body ]
  %lsr.iv1416 = bitcast i32* %lsr.iv14 to <4 x i32>*
  %lsr.iv1113 = bitcast i32* %lsr.iv11 to <4 x i32>*
  %lsr.iv10 = bitcast i32* %lsr.iv to <4 x i32>*
  %broadcast.splatinsert = insertelement <4 x i32> undef, i32 %index, i32 0
  %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer
  %induction = add <4 x i32> %broadcast.splat, <i32 0, i32 1, i32 2, i32 3>

  %1 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 32003)

  %wide.masked.load = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %lsr.iv10, i32 4, <4 x i1> %1, <4 x i32> undef)
  %wide.masked.load9 = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %lsr.iv1113, i32 4, <4 x i1> %1, <4 x i32> undef)
  %2 = add nsw <4 x i32> %wide.masked.load9, %wide.masked.load
  call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> %2, <4 x i32>* %lsr.iv1416, i32 4, <4 x i1> %1)

; %index is incremented by 3 rather than by 4, which is the vectorisation factor
; expected here:
  %index.next = add i32 %index, 3
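; Note: with a step of 3 the lanes tested in iteration k are 3*k + <0,1,2,3>, which
; overlap the lanes of iteration k-1, so the mask no longer means "the next 4 of the
; remaining elements" and cannot be implemented by a vctp that subtracts 4 per iteration.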

  %scevgep = getelementptr i32, i32* %lsr.iv, i32 4
  %scevgep12 = getelementptr i32, i32* %lsr.iv11, i32 4
  %scevgep15 = getelementptr i32, i32* %lsr.iv14, i32 4
  %3 = call i32 @llvm.loop.decrement.reg.i32(i32 %0, i32 1)
  %4 = icmp ne i32 %3, 0
  br i1 %4, label %vector.body, label %for.cond.cleanup

for.cond.cleanup:
  ret void
}

; CHECK-LABEL: @IV_step_not_constant(
; CHECK:       vector.body:
; CHECK-NOT:   @llvm.arm.mve.vctp32
; CHECK:       @llvm.get.active.lane.mask
; CHECK:       ret void
;
define dso_local void @IV_step_not_constant(i32* noalias nocapture %A, i32* noalias nocapture readonly %B, i32* noalias nocapture readonly %C, i32* noalias nocapture readnone %D, i32 %N) local_unnamed_addr #0 {
entry:
  %start = call i32 @llvm.start.loop.iterations.i32(i32 8001)
  br label %vector.body

vector.body:
  %lsr.iv14 = phi i32* [ %scevgep15, %vector.body ], [ %A, %entry ]
  %lsr.iv11 = phi i32* [ %scevgep12, %vector.body ], [ %C, %entry ]
  %lsr.iv = phi i32* [ %scevgep, %vector.body ], [ %B, %entry ]
  %index = phi i32 [ 0, %entry ], [ %index.next, %vector.body ]
  %0 = phi i32 [ %start, %entry ], [ %3, %vector.body ]
  %lsr.iv1416 = bitcast i32* %lsr.iv14 to <4 x i32>*
  %lsr.iv1113 = bitcast i32* %lsr.iv11 to <4 x i32>*
  %lsr.iv10 = bitcast i32* %lsr.iv to <4 x i32>*
  %broadcast.splatinsert = insertelement <4 x i32> undef, i32 %index, i32 0
  %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer
  %induction = add <4 x i32> %broadcast.splat, <i32 0, i32 1, i32 2, i32 3>
  %1 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 32003)
  %wide.masked.load = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %lsr.iv10, i32 4, <4 x i1> %1, <4 x i32> undef)
  %wide.masked.load9 = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %lsr.iv1113, i32 4, <4 x i1> %1, <4 x i32> undef)
  %2 = add nsw <4 x i32> %wide.masked.load9, %wide.masked.load
  call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> %2, <4 x i32>* %lsr.iv1416, i32 4, <4 x i1> %1)

; %index is incremented by some runtime value, i.e. not by a constant:
  %index.next = add i32 %index, %N

  %scevgep = getelementptr i32, i32* %lsr.iv, i32 4
  %scevgep12 = getelementptr i32, i32* %lsr.iv11, i32 4
  %scevgep15 = getelementptr i32, i32* %lsr.iv14, i32 4
  %3 = call i32 @llvm.loop.decrement.reg.i32(i32 %0, i32 1)
  %4 = icmp ne i32 %3, 0
  br i1 %4, label %vector.body, label %for.cond.cleanup

for.cond.cleanup:
  ret void
}

; CHECK-LABEL: @outerloop_phi(
; CHECK:       vector.body:
; CHECK-NOT:   @llvm.arm.mve.vctp32
; CHECK:       @llvm.get.active.lane.mask
; CHECK:       ret void
;
define dso_local void @outerloop_phi(i32* noalias nocapture %A, i32* noalias nocapture readonly %B, i32* noalias nocapture readonly %C, i32 %N) local_unnamed_addr #0 {
entry:
  %cmp24 = icmp eq i32 %N, 0
  br i1 %cmp24, label %for.cond.cleanup, label %vector.ph.preheader

vector.ph.preheader:                              ; preds = %entry
  br label %vector.ph

vector.ph:                                        ; preds = %vector.ph.preheader, %for.cond.cleanup3
  %lsr.iv36 = phi i32* [ %B, %vector.ph.preheader ], [ %scevgep37, %for.cond.cleanup3 ]
  %lsr.iv31 = phi i32* [ %C, %vector.ph.preheader ], [ %scevgep32, %for.cond.cleanup3 ]
  %lsr.iv = phi i32* [ %A, %vector.ph.preheader ], [ %scevgep, %for.cond.cleanup3 ]
  %j.025 = phi i32 [ %inc11, %for.cond.cleanup3 ], [ 0, %vector.ph.preheader ]
  %start = call i32 @llvm.start.loop.iterations.i32(i32 1025)
  br label %vector.body

vector.body:                                      ; preds = %vector.body, %vector.ph
  %lsr.iv38 = phi i32* [ %scevgep39, %vector.body ], [ %lsr.iv36, %vector.ph ]
  %lsr.iv33 = phi i32* [ %scevgep34, %vector.body ], [ %lsr.iv31, %vector.ph ]
  %lsr.iv28 = phi i32* [ %scevgep29, %vector.body ], [ %lsr.iv, %vector.ph ]
  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
  %0 = phi i32 [ %start, %vector.ph ], [ %2, %vector.body ]
  %lsr.iv3840 = bitcast i32* %lsr.iv38 to <4 x i32>*
  %lsr.iv3335 = bitcast i32* %lsr.iv33 to <4 x i32>*
  %lsr.iv2830 = bitcast i32* %lsr.iv28 to <4 x i32>*

; The lane mask uses %j.025, the induction variable of the outer loop:
  %active.lane.mask = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %j.025, i32 4096)
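; Note: %j.025 does not change inside vector.body, so the mask is invariant in the inner
; loop and does not describe a per-iteration tail; no vctp is emitted (see the CHECK-NOT
; line above).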

  %wide.masked.load = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %lsr.iv3840, i32 4, <4 x i1> %active.lane.mask, <4 x i32> undef)
  %wide.masked.load27 = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %lsr.iv3335, i32 4, <4 x i1> %active.lane.mask, <4 x i32> undef)
  %1 = add nsw <4 x i32> %wide.masked.load27, %wide.masked.load
  call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> %1, <4 x i32>* %lsr.iv2830, i32 4, <4 x i1> %active.lane.mask)
  %index.next = add i32 %index, 4
  %scevgep29 = getelementptr i32, i32* %lsr.iv28, i32 4
  %scevgep34 = getelementptr i32, i32* %lsr.iv33, i32 4
  %scevgep39 = getelementptr i32, i32* %lsr.iv38, i32 4
  %2 = call i32 @llvm.loop.decrement.reg.i32(i32 %0, i32 1)
  %3 = icmp ne i32 %2, 0
  br i1 %3, label %vector.body, label %for.cond.cleanup3

for.cond.cleanup:                                 ; preds = %for.cond.cleanup3, %entry
  ret void

for.cond.cleanup3:                                ; preds = %vector.body
  %inc11 = add nuw i32 %j.025, 1
  %scevgep = getelementptr i32, i32* %lsr.iv, i32 1
  %scevgep32 = getelementptr i32, i32* %lsr.iv31, i32 1
  %scevgep37 = getelementptr i32, i32* %lsr.iv36, i32 1
  %exitcond26 = icmp eq i32 %inc11, %N
  br i1 %exitcond26, label %for.cond.cleanup, label %vector.ph
}


declare <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>*, i32 immarg, <4 x i1>, <4 x i32>) #1
declare void @llvm.masked.store.v4i32.p0v4i32(<4 x i32>, <4 x i32>*, i32 immarg, <4 x i1>) #2
declare i32 @llvm.loop.decrement.reg.i32(i32, i32)
declare i32 @llvm.start.loop.iterations.i32(i32)
declare <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32, i32)
