; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
; RUN: opt -loop-vectorize -force-vector-width=4 -mtriple=thumbv8.1m.main-none-none-eabi -mattr=+mve.fp -tail-predication=force-enabled -S %s -o - | FileCheck %s

; Stride 1: tail-folded — masked load/store predicated by llvm.get.active.lane.mask.
define void @test_stride1_4i32(i32* readonly %data, i32* noalias nocapture %dst, i32 %n) {
; CHECK-LABEL: @test_stride1_4i32(
; CHECK-NEXT:  entry:
; CHECK-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
; CHECK:       vector.ph:
; CHECK-NEXT:    [[N_RND_UP:%.*]] = add i32 [[N:%.*]], 3
; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i32 [[N_RND_UP]], 4
; CHECK-NEXT:    [[N_VEC:%.*]] = sub i32 [[N_RND_UP]], [[N_MOD_VF]]
; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
; CHECK:       vector.body:
; CHECK-NEXT:    [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i32> undef, i32 [[INDEX]], i32 0
; CHECK-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT]], <4 x i32> undef, <4 x i32> zeroinitializer
; CHECK-NEXT:    [[INDUCTION:%.*]] = add <4 x i32> [[BROADCAST_SPLAT]], <i32 0, i32 1, i32 2, i32 3>
; CHECK-NEXT:    [[TMP0:%.*]] = add i32 [[INDEX]], 0
; CHECK-NEXT:    [[ACTIVE_LANE_MASK:%.*]] = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 [[TMP0]], i32 [[N]])
; CHECK-NEXT:    [[TMP1:%.*]] = mul nuw nsw i32 [[TMP0]], 1
; CHECK-NEXT:    [[TMP2:%.*]] = add nuw nsw i32 [[TMP1]], 2
; CHECK-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i32, i32* [[DATA:%.*]], i32 [[TMP2]]
; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr inbounds i32, i32* [[TMP3]], i32 0
; CHECK-NEXT:    [[TMP5:%.*]] = bitcast i32* [[TMP4]] to <4 x i32>*
; CHECK-NEXT:    [[WIDE_MASKED_LOAD:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* [[TMP5]], i32 4, <4 x i1> [[ACTIVE_LANE_MASK]], <4 x i32> undef)
; CHECK-NEXT:    [[TMP6:%.*]] = add nsw <4 x i32> <i32 5, i32 5, i32 5, i32 5>, [[WIDE_MASKED_LOAD]]
; CHECK-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i32, i32* [[DST:%.*]], i32 [[TMP0]]
; CHECK-NEXT:    [[TMP8:%.*]] = getelementptr inbounds i32, i32* [[TMP7]], i32 0
; CHECK-NEXT:    [[TMP9:%.*]] = bitcast i32* [[TMP8]] to <4 x i32>*
; CHECK-NEXT:    call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> [[TMP6]], <4 x i32>* [[TMP9]], i32 4, <4 x i1> [[ACTIVE_LANE_MASK]])
; CHECK-NEXT:    [[INDEX_NEXT]] = add i32 [[INDEX]], 4
; CHECK-NEXT:    [[TMP10:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
; CHECK-NEXT:    br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], [[LOOP0:!llvm.loop !.*]]
; CHECK:       middle.block:
; CHECK-NEXT:    br i1 true, label [[END:%.*]], label [[SCALAR_PH]]
; CHECK:       scalar.ph:
; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i32 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
; CHECK:       for.body:
; CHECK-NEXT:    [[I_023:%.*]] = phi i32 [ [[INC:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
; CHECK-NEXT:    [[MUL:%.*]] = mul nuw nsw i32 [[I_023]], 1
; CHECK-NEXT:    [[ADD5:%.*]] = add nuw nsw i32 [[MUL]], 2
; CHECK-NEXT:    [[ARRAYIDX6:%.*]] = getelementptr inbounds i32, i32* [[DATA]], i32 [[ADD5]]
; CHECK-NEXT:    [[TMP11:%.*]] = load i32, i32* [[ARRAYIDX6]], align 4
; CHECK-NEXT:    [[ADD7:%.*]] = add nsw i32 5, [[TMP11]]
; CHECK-NEXT:    [[ARRAYIDX9:%.*]] = getelementptr inbounds i32, i32* [[DST]], i32 [[I_023]]
; CHECK-NEXT:    store i32 [[ADD7]], i32* [[ARRAYIDX9]], align 4
; CHECK-NEXT:    [[INC]] = add nuw nsw i32 [[I_023]], 1
; CHECK-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i32 [[INC]], [[N]]
; CHECK-NEXT:    br i1 [[EXITCOND_NOT]], label [[END]], label [[FOR_BODY]], [[LOOP2:!llvm.loop !.*]]
; CHECK:       end:
; CHECK-NEXT:    ret void
;
entry:
  br label %for.body
for.body:                                         ; preds = %entry, %for.body
  %i.023 = phi i32 [ %inc, %for.body ], [ 0, %entry ]
  %mul = mul nuw nsw i32 %i.023, 1
  %add5 = add nuw nsw i32 %mul, 2
  %arrayidx6 = getelementptr inbounds i32, i32* %data, i32 %add5
  %0 = load i32, i32* %arrayidx6, align 4
  %add7 = add nsw i32 5, %0
  %arrayidx9 = getelementptr inbounds i32, i32* %dst, i32 %i.023
  store i32 %add7, i32* %arrayidx9, align 4
  %inc = add nuw nsw i32 %i.023, 1
  %exitcond.not = icmp eq i32 %inc, %n
  br i1 %exitcond.not, label %end, label %for.body
end:                                 ; preds = %for.body
  ret void
}
; Stride -1: not tail-folded — SCEV overflow check, then reversed wide load (shufflevector) in the vector body.
define void @test_stride-1_4i32(i32* readonly %data, i32* noalias nocapture %dst, i32 %n) {
; CHECK-LABEL: @test_stride-1_4i32(
; CHECK-NEXT:  entry:
; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i32 [[N:%.*]], 4
; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_SCEVCHECK:%.*]]
; CHECK:       vector.scevcheck:
; CHECK-NEXT:    [[TMP0:%.*]] = add i32 [[N]], -1
; CHECK-NEXT:    [[MUL1:%.*]] = call { i32, i1 } @llvm.umul.with.overflow.i32(i32 1, i32 [[TMP0]])
; CHECK-NEXT:    [[MUL_RESULT:%.*]] = extractvalue { i32, i1 } [[MUL1]], 0
; CHECK-NEXT:    [[MUL_OVERFLOW:%.*]] = extractvalue { i32, i1 } [[MUL1]], 1
; CHECK-NEXT:    [[TMP1:%.*]] = add i32 2, [[MUL_RESULT]]
; CHECK-NEXT:    [[TMP2:%.*]] = sub i32 2, [[MUL_RESULT]]
; CHECK-NEXT:    [[TMP3:%.*]] = icmp sgt i32 [[TMP2]], 2
; CHECK-NEXT:    [[TMP4:%.*]] = icmp slt i32 [[TMP1]], 2
; CHECK-NEXT:    [[TMP5:%.*]] = select i1 true, i1 [[TMP3]], i1 [[TMP4]]
; CHECK-NEXT:    [[TMP6:%.*]] = or i1 [[TMP5]], [[MUL_OVERFLOW]]
; CHECK-NEXT:    [[TMP7:%.*]] = or i1 false, [[TMP6]]
; CHECK-NEXT:    br i1 [[TMP7]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]]
; CHECK:       vector.ph:
; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i32 [[N]], 4
; CHECK-NEXT:    [[N_VEC:%.*]] = sub i32 [[N]], [[N_MOD_VF]]
; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
; CHECK:       vector.body:
; CHECK-NEXT:    [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT:    [[TMP8:%.*]] = add i32 [[INDEX]], 0
; CHECK-NEXT:    [[TMP9:%.*]] = mul nuw nsw i32 [[TMP8]], -1
; CHECK-NEXT:    [[TMP10:%.*]] = add nuw nsw i32 [[TMP9]], 2
; CHECK-NEXT:    [[TMP11:%.*]] = getelementptr inbounds i32, i32* [[DATA:%.*]], i32 [[TMP10]]
; CHECK-NEXT:    [[TMP12:%.*]] = getelementptr inbounds i32, i32* [[TMP11]], i32 0
; CHECK-NEXT:    [[TMP13:%.*]] = getelementptr inbounds i32, i32* [[TMP12]], i32 -3
; CHECK-NEXT:    [[TMP14:%.*]] = bitcast i32* [[TMP13]] to <4 x i32>*
; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x i32>, <4 x i32>* [[TMP14]], align 4
; CHECK-NEXT:    [[REVERSE:%.*]] = shufflevector <4 x i32> [[WIDE_LOAD]], <4 x i32> undef, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
; CHECK-NEXT:    [[TMP15:%.*]] = add nsw <4 x i32> <i32 5, i32 5, i32 5, i32 5>, [[REVERSE]]
; CHECK-NEXT:    [[TMP16:%.*]] = getelementptr inbounds i32, i32* [[DST:%.*]], i32 [[TMP8]]
; CHECK-NEXT:    [[TMP17:%.*]] = getelementptr inbounds i32, i32* [[TMP16]], i32 0
; CHECK-NEXT:    [[TMP18:%.*]] = bitcast i32* [[TMP17]] to <4 x i32>*
; CHECK-NEXT:    store <4 x i32> [[TMP15]], <4 x i32>* [[TMP18]], align 4
; CHECK-NEXT:    [[INDEX_NEXT]] = add i32 [[INDEX]], 4
; CHECK-NEXT:    [[TMP19:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
; CHECK-NEXT:    br i1 [[TMP19]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], [[LOOP4:!llvm.loop !.*]]
; CHECK:       middle.block:
; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i32 [[N]], [[N_VEC]]
; CHECK-NEXT:    br i1 [[CMP_N]], label [[END:%.*]], label [[SCALAR_PH]]
; CHECK:       scalar.ph:
; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i32 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ], [ 0, [[VECTOR_SCEVCHECK]] ]
; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
; CHECK:       for.body:
; CHECK-NEXT:    [[I_023:%.*]] = phi i32 [ [[INC:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
; CHECK-NEXT:    [[MUL:%.*]] = mul nuw nsw i32 [[I_023]], -1
; CHECK-NEXT:    [[ADD5:%.*]] = add nuw nsw i32 [[MUL]], 2
; CHECK-NEXT:    [[ARRAYIDX6:%.*]] = getelementptr inbounds i32, i32* [[DATA]], i32 [[ADD5]]
; CHECK-NEXT:    [[TMP20:%.*]] = load i32, i32* [[ARRAYIDX6]], align 4
; CHECK-NEXT:    [[ADD7:%.*]] = add nsw i32 5, [[TMP20]]
; CHECK-NEXT:    [[ARRAYIDX9:%.*]] = getelementptr inbounds i32, i32* [[DST]], i32 [[I_023]]
; CHECK-NEXT:    store i32 [[ADD7]], i32* [[ARRAYIDX9]], align 4
; CHECK-NEXT:    [[INC]] = add nuw nsw i32 [[I_023]], 1
; CHECK-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i32 [[INC]], [[N]]
; CHECK-NEXT:    br i1 [[EXITCOND_NOT]], label [[END]], label [[FOR_BODY]], [[LOOP5:!llvm.loop !.*]]
; CHECK:       end:
; CHECK-NEXT:    ret void
;
entry:
  br label %for.body
for.body:                                         ; preds = %entry, %for.body
  %i.023 = phi i32 [ %inc, %for.body ], [ 0, %entry ]
  %mul = mul nuw nsw i32 %i.023, -1
  %add5 = add nuw nsw i32 %mul, 2
  %arrayidx6 = getelementptr inbounds i32, i32* %data, i32 %add5
  %0 = load i32, i32* %arrayidx6, align 4
  %add7 = add nsw i32 5, %0
  %arrayidx9 = getelementptr inbounds i32, i32* %dst, i32 %i.023
  store i32 %add7, i32* %arrayidx9, align 4
  %inc = add nuw nsw i32 %i.023, 1
  %exitcond.not = icmp eq i32 %inc, %n
  br i1 %exitcond.not, label %end, label %for.body
end:                                 ; preds = %for.body
  ret void
}
; Stride 2: not tail-folded — interleaved <8 x i32> wide load, deinterleaved by shufflevector; scalar epilogue kept.
define void @test_stride2_4i32(i32* readonly %data, i32* noalias nocapture %dst, i32 %n) {
;
; CHECK-LABEL: @test_stride2_4i32(
; CHECK-NEXT:  entry:
; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ule i32 [[N:%.*]], 4
; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
; CHECK:       vector.ph:
; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i32 [[N]], 4
; CHECK-NEXT:    [[TMP0:%.*]] = icmp eq i32 [[N_MOD_VF]], 0
; CHECK-NEXT:    [[TMP1:%.*]] = select i1 [[TMP0]], i32 4, i32 [[N_MOD_VF]]
; CHECK-NEXT:    [[N_VEC:%.*]] = sub i32 [[N]], [[TMP1]]
; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
; CHECK:       vector.body:
; CHECK-NEXT:    [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT:    [[TMP2:%.*]] = add i32 [[INDEX]], 0
; CHECK-NEXT:    [[TMP3:%.*]] = mul nuw nsw i32 [[TMP2]], 2
; CHECK-NEXT:    [[TMP4:%.*]] = add nuw nsw i32 [[TMP3]], 2
; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i32, i32* [[DATA:%.*]], i32 [[TMP4]]
; CHECK-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i32, i32* [[TMP5]], i32 0
; CHECK-NEXT:    [[TMP7:%.*]] = bitcast i32* [[TMP6]] to <8 x i32>*
; CHECK-NEXT:    [[WIDE_VEC:%.*]] = load <8 x i32>, <8 x i32>* [[TMP7]], align 4
; CHECK-NEXT:    [[STRIDED_VEC:%.*]] = shufflevector <8 x i32> [[WIDE_VEC]], <8 x i32> undef, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
; CHECK-NEXT:    [[TMP8:%.*]] = add nsw <4 x i32> <i32 5, i32 5, i32 5, i32 5>, [[STRIDED_VEC]]
; CHECK-NEXT:    [[TMP9:%.*]] = getelementptr inbounds i32, i32* [[DST:%.*]], i32 [[TMP2]]
; CHECK-NEXT:    [[TMP10:%.*]] = getelementptr inbounds i32, i32* [[TMP9]], i32 0
; CHECK-NEXT:    [[TMP11:%.*]] = bitcast i32* [[TMP10]] to <4 x i32>*
; CHECK-NEXT:    store <4 x i32> [[TMP8]], <4 x i32>* [[TMP11]], align 4
; CHECK-NEXT:    [[INDEX_NEXT]] = add i32 [[INDEX]], 4
; CHECK-NEXT:    [[TMP12:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
; CHECK-NEXT:    br i1 [[TMP12]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], [[LOOP6:!llvm.loop !.*]]
; CHECK:       middle.block:
; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i32 [[N]], [[N_VEC]]
; CHECK-NEXT:    br i1 [[CMP_N]], label [[END:%.*]], label [[SCALAR_PH]]
; CHECK:       scalar.ph:
; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i32 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
; CHECK:       for.body:
; CHECK-NEXT:    [[I_023:%.*]] = phi i32 [ [[INC:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
; CHECK-NEXT:    [[MUL:%.*]] = mul nuw nsw i32 [[I_023]], 2
; CHECK-NEXT:    [[ADD5:%.*]] = add nuw nsw i32 [[MUL]], 2
; CHECK-NEXT:    [[ARRAYIDX6:%.*]] = getelementptr inbounds i32, i32* [[DATA]], i32 [[ADD5]]
; CHECK-NEXT:    [[TMP13:%.*]] = load i32, i32* [[ARRAYIDX6]], align 4
; CHECK-NEXT:    [[ADD7:%.*]] = add nsw i32 5, [[TMP13]]
; CHECK-NEXT:    [[ARRAYIDX9:%.*]] = getelementptr inbounds i32, i32* [[DST]], i32 [[I_023]]
; CHECK-NEXT:    store i32 [[ADD7]], i32* [[ARRAYIDX9]], align 4
; CHECK-NEXT:    [[INC]] = add nuw nsw i32 [[I_023]], 1
; CHECK-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i32 [[INC]], [[N]]
; CHECK-NEXT:    br i1 [[EXITCOND_NOT]], label [[END]], label [[FOR_BODY]], [[LOOP7:!llvm.loop !.*]]
; CHECK:       end:
; CHECK-NEXT:    ret void
;
entry:
  br label %for.body
for.body:                                         ; preds = %entry, %for.body
  %i.023 = phi i32 [ %inc, %for.body ], [ 0, %entry ]
  %mul = mul nuw nsw i32 %i.023, 2
  %add5 = add nuw nsw i32 %mul, 2
  %arrayidx6 = getelementptr inbounds i32, i32* %data, i32 %add5
  %0 = load i32, i32* %arrayidx6, align 4
  %add7 = add nsw i32 5, %0
  %arrayidx9 = getelementptr inbounds i32, i32* %dst, i32 %i.023
  store i32 %add7, i32* %arrayidx9, align 4
  %inc = add nuw nsw i32 %i.023, 1
  %exitcond.not = icmp eq i32 %inc, %n
  br i1 %exitcond.not, label %end, label %for.body
end:                                 ; preds = %for.body
  ret void
}
; Stride 3: tail-folded — loads become a masked gather predicated by the active lane mask; store stays a masked contiguous store.
define void @test_stride3_4i32(i32* readonly %data, i32* noalias nocapture %dst, i32 %n) {
; CHECK-LABEL: @test_stride3_4i32(
; CHECK-NEXT:  entry:
; CHECK-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
; CHECK:       vector.ph:
; CHECK-NEXT:    [[N_RND_UP:%.*]] = add i32 [[N:%.*]], 3
; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i32 [[N_RND_UP]], 4
; CHECK-NEXT:    [[N_VEC:%.*]] = sub i32 [[N_RND_UP]], [[N_MOD_VF]]
; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
; CHECK:       vector.body:
; CHECK-NEXT:    [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT:    [[VEC_IND:%.*]] = phi <4 x i32> [ <i32 0, i32 1, i32 2, i32 3>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT:    [[TMP0:%.*]] = add i32 [[INDEX]], 0
; CHECK-NEXT:    [[TMP1:%.*]] = add i32 [[INDEX]], 1
; CHECK-NEXT:    [[TMP2:%.*]] = add i32 [[INDEX]], 2
; CHECK-NEXT:    [[TMP3:%.*]] = add i32 [[INDEX]], 3
; CHECK-NEXT:    [[ACTIVE_LANE_MASK:%.*]] = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 [[TMP0]], i32 [[N]])
; CHECK-NEXT:    [[TMP4:%.*]] = mul nuw nsw <4 x i32> [[VEC_IND]], <i32 3, i32 3, i32 3, i32 3>
; CHECK-NEXT:    [[TMP5:%.*]] = add nuw nsw <4 x i32> [[TMP4]], <i32 2, i32 2, i32 2, i32 2>
; CHECK-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i32, i32* [[DATA:%.*]], <4 x i32> [[TMP5]]
; CHECK-NEXT:    [[WIDE_MASKED_GATHER:%.*]] = call <4 x i32> @llvm.masked.gather.v4i32.v4p0i32(<4 x i32*> [[TMP6]], i32 4, <4 x i1> [[ACTIVE_LANE_MASK]], <4 x i32> undef)
; CHECK-NEXT:    [[TMP7:%.*]] = add nsw <4 x i32> <i32 5, i32 5, i32 5, i32 5>, [[WIDE_MASKED_GATHER]]
; CHECK-NEXT:    [[TMP8:%.*]] = getelementptr inbounds i32, i32* [[DST:%.*]], i32 [[TMP0]]
; CHECK-NEXT:    [[TMP9:%.*]] = getelementptr inbounds i32, i32* [[TMP8]], i32 0
; CHECK-NEXT:    [[TMP10:%.*]] = bitcast i32* [[TMP9]] to <4 x i32>*
; CHECK-NEXT:    call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> [[TMP7]], <4 x i32>* [[TMP10]], i32 4, <4 x i1> [[ACTIVE_LANE_MASK]])
; CHECK-NEXT:    [[INDEX_NEXT]] = add i32 [[INDEX]], 4
; CHECK-NEXT:    [[VEC_IND_NEXT]] = add <4 x i32> [[VEC_IND]], <i32 4, i32 4, i32 4, i32 4>
; CHECK-NEXT:    [[TMP11:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
; CHECK-NEXT:    br i1 [[TMP11]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], [[LOOP8:!llvm.loop !.*]]
; CHECK:       middle.block:
; CHECK-NEXT:    br i1 true, label [[END:%.*]], label [[SCALAR_PH]]
; CHECK:       scalar.ph:
; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i32 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
; CHECK:       for.body:
; CHECK-NEXT:    [[I_023:%.*]] = phi i32 [ [[INC:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
; CHECK-NEXT:    [[MUL:%.*]] = mul nuw nsw i32 [[I_023]], 3
; CHECK-NEXT:    [[ADD5:%.*]] = add nuw nsw i32 [[MUL]], 2
; CHECK-NEXT:    [[ARRAYIDX6:%.*]] = getelementptr inbounds i32, i32* [[DATA]], i32 [[ADD5]]
; CHECK-NEXT:    [[TMP12:%.*]] = load i32, i32* [[ARRAYIDX6]], align 4
; CHECK-NEXT:    [[ADD7:%.*]] = add nsw i32 5, [[TMP12]]
; CHECK-NEXT:    [[ARRAYIDX9:%.*]] = getelementptr inbounds i32, i32* [[DST]], i32 [[I_023]]
; CHECK-NEXT:    store i32 [[ADD7]], i32* [[ARRAYIDX9]], align 4
; CHECK-NEXT:    [[INC]] = add nuw nsw i32 [[I_023]], 1
; CHECK-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i32 [[INC]], [[N]]
; CHECK-NEXT:    br i1 [[EXITCOND_NOT]], label [[END]], label [[FOR_BODY]], [[LOOP9:!llvm.loop !.*]]
; CHECK:       end:
; CHECK-NEXT:    ret void
;
entry:
  br label %for.body
for.body:                                         ; preds = %entry, %for.body
  %i.023 = phi i32 [ %inc, %for.body ], [ 0, %entry ]
  %mul = mul nuw nsw i32 %i.023, 3
  %add5 = add nuw nsw i32 %mul, 2
  %arrayidx6 = getelementptr inbounds i32, i32* %data, i32 %add5
  %0 = load i32, i32* %arrayidx6, align 4
  %add7 = add nsw i32 5, %0
  %arrayidx9 = getelementptr inbounds i32, i32* %dst, i32 %i.023
  store i32 %add7, i32* %arrayidx9, align 4
  %inc = add nuw nsw i32 %i.023, 1
  %exitcond.not = icmp eq i32 %inc, %n
  br i1 %exitcond.not, label %end, label %for.body
end:                                 ; preds = %for.body
  ret void
}
; Stride 4: tail-folded — same shape as stride 3: masked gather for the strided load, masked contiguous store.
define void @test_stride4_4i32(i32* readonly %data, i32* noalias nocapture %dst, i32 %n) {
; CHECK-LABEL: @test_stride4_4i32(
; CHECK-NEXT:  entry:
; CHECK-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
; CHECK:       vector.ph:
; CHECK-NEXT:    [[N_RND_UP:%.*]] = add i32 [[N:%.*]], 3
; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i32 [[N_RND_UP]], 4
; CHECK-NEXT:    [[N_VEC:%.*]] = sub i32 [[N_RND_UP]], [[N_MOD_VF]]
; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
; CHECK:       vector.body:
; CHECK-NEXT:    [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT:    [[VEC_IND:%.*]] = phi <4 x i32> [ <i32 0, i32 1, i32 2, i32 3>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT:    [[TMP0:%.*]] = add i32 [[INDEX]], 0
; CHECK-NEXT:    [[TMP1:%.*]] = add i32 [[INDEX]], 1
; CHECK-NEXT:    [[TMP2:%.*]] = add i32 [[INDEX]], 2
; CHECK-NEXT:    [[TMP3:%.*]] = add i32 [[INDEX]], 3
; CHECK-NEXT:    [[ACTIVE_LANE_MASK:%.*]] = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 [[TMP0]], i32 [[N]])
; CHECK-NEXT:    [[TMP4:%.*]] = mul nuw nsw <4 x i32> [[VEC_IND]], <i32 4, i32 4, i32 4, i32 4>
; CHECK-NEXT:    [[TMP5:%.*]] = add nuw nsw <4 x i32> [[TMP4]], <i32 2, i32 2, i32 2, i32 2>
; CHECK-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i32, i32* [[DATA:%.*]], <4 x i32> [[TMP5]]
; CHECK-NEXT:    [[WIDE_MASKED_GATHER:%.*]] = call <4 x i32> @llvm.masked.gather.v4i32.v4p0i32(<4 x i32*> [[TMP6]], i32 4, <4 x i1> [[ACTIVE_LANE_MASK]], <4 x i32> undef)
; CHECK-NEXT:    [[TMP7:%.*]] = add nsw <4 x i32> <i32 5, i32 5, i32 5, i32 5>, [[WIDE_MASKED_GATHER]]
; CHECK-NEXT:    [[TMP8:%.*]] = getelementptr inbounds i32, i32* [[DST:%.*]], i32 [[TMP0]]
; CHECK-NEXT:    [[TMP9:%.*]] = getelementptr inbounds i32, i32* [[TMP8]], i32 0
; CHECK-NEXT:    [[TMP10:%.*]] = bitcast i32* [[TMP9]] to <4 x i32>*
; CHECK-NEXT:    call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> [[TMP7]], <4 x i32>* [[TMP10]], i32 4, <4 x i1> [[ACTIVE_LANE_MASK]])
; CHECK-NEXT:    [[INDEX_NEXT]] = add i32 [[INDEX]], 4
; CHECK-NEXT:    [[VEC_IND_NEXT]] = add <4 x i32> [[VEC_IND]], <i32 4, i32 4, i32 4, i32 4>
; CHECK-NEXT:    [[TMP11:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
; CHECK-NEXT:    br i1 [[TMP11]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], [[LOOP10:!llvm.loop !.*]]
; CHECK:       middle.block:
; CHECK-NEXT:    br i1 true, label [[END:%.*]], label [[SCALAR_PH]]
; CHECK:       scalar.ph:
; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i32 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
; CHECK:       for.body:
; CHECK-NEXT:    [[I_023:%.*]] = phi i32 [ [[INC:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
; CHECK-NEXT:    [[MUL:%.*]] = mul nuw nsw i32 [[I_023]], 4
; CHECK-NEXT:    [[ADD5:%.*]] = add nuw nsw i32 [[MUL]], 2
; CHECK-NEXT:    [[ARRAYIDX6:%.*]] = getelementptr inbounds i32, i32* [[DATA]], i32 [[ADD5]]
; CHECK-NEXT:    [[TMP12:%.*]] = load i32, i32* [[ARRAYIDX6]], align 4
; CHECK-NEXT:    [[ADD7:%.*]] = add nsw i32 5, [[TMP12]]
; CHECK-NEXT:    [[ARRAYIDX9:%.*]] = getelementptr inbounds i32, i32* [[DST]], i32 [[I_023]]
; CHECK-NEXT:    store i32 [[ADD7]], i32* [[ARRAYIDX9]], align 4
; CHECK-NEXT:    [[INC]] = add nuw nsw i32 [[I_023]], 1
; CHECK-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i32 [[INC]], [[N]]
; CHECK-NEXT:    br i1 [[EXITCOND_NOT]], label [[END]], label [[FOR_BODY]], [[LOOP11:!llvm.loop !.*]]
; CHECK:       end:
; CHECK-NEXT:    ret void
;
entry:
  br label %for.body
for.body:                                         ; preds = %entry, %for.body
  %i.023 = phi i32 [ %inc, %for.body ], [ 0, %entry ]
  %mul = mul nuw nsw i32 %i.023, 4
  %add5 = add nuw nsw i32 %mul, 2
  %arrayidx6 = getelementptr inbounds i32, i32* %data, i32 %add5
  %0 = load i32, i32* %arrayidx6, align 4
  %add7 = add nsw i32 5, %0
  %arrayidx9 = getelementptr inbounds i32, i32* %dst, i32 %i.023
  store i32 %add7, i32* %arrayidx9, align 4
  %inc = add nuw nsw i32 %i.023, 1
  %exitcond.not = icmp eq i32 %inc, %n
  br i1 %exitcond.not, label %end, label %for.body
end:                                 ; preds = %for.body
  ret void
}
; Loop-invariant stride: SCEV ident check guards stride == 1, then the tail-folded masked load/store path is used.
define void @test_stride_loopinvar_4i32(i32* readonly %data, i32* noalias nocapture %dst, i32 %n, i32 %stride) {
; CHECK-LABEL: @test_stride_loopinvar_4i32(
; CHECK-NEXT:  entry:
; CHECK-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_SCEVCHECK:%.*]]
; CHECK:       vector.scevcheck:
; CHECK-NEXT:    [[IDENT_CHECK:%.*]] = icmp ne i32 [[STRIDE:%.*]], 1
; CHECK-NEXT:    [[TMP0:%.*]] = or i1 false, [[IDENT_CHECK]]
; CHECK-NEXT:    br i1 [[TMP0]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]]
; CHECK:       vector.ph:
; CHECK-NEXT:    [[N_RND_UP:%.*]] = add i32 [[N:%.*]], 3
; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i32 [[N_RND_UP]], 4
; CHECK-NEXT:    [[N_VEC:%.*]] = sub i32 [[N_RND_UP]], [[N_MOD_VF]]
; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
; CHECK:       vector.body:
; CHECK-NEXT:    [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i32> undef, i32 [[INDEX]], i32 0
; CHECK-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT]], <4 x i32> undef, <4 x i32> zeroinitializer
; CHECK-NEXT:    [[INDUCTION:%.*]] = add <4 x i32> [[BROADCAST_SPLAT]], <i32 0, i32 1, i32 2, i32 3>
; CHECK-NEXT:    [[TMP1:%.*]] = add i32 [[INDEX]], 0
; CHECK-NEXT:    [[ACTIVE_LANE_MASK:%.*]] = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 [[TMP1]], i32 [[N]])
; CHECK-NEXT:    [[TMP2:%.*]] = mul nuw nsw i32 [[TMP1]], [[STRIDE]]
; CHECK-NEXT:    [[TMP3:%.*]] = add nuw nsw i32 [[TMP2]], 2
; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr inbounds i32, i32* [[DATA:%.*]], i32 [[TMP3]]
; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i32, i32* [[TMP4]], i32 0
; CHECK-NEXT:    [[TMP6:%.*]] = bitcast i32* [[TMP5]] to <4 x i32>*
; CHECK-NEXT:    [[WIDE_MASKED_LOAD:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* [[TMP6]], i32 4, <4 x i1> [[ACTIVE_LANE_MASK]], <4 x i32> undef)
; CHECK-NEXT:    [[TMP7:%.*]] = add nsw <4 x i32> <i32 5, i32 5, i32 5, i32 5>, [[WIDE_MASKED_LOAD]]
; CHECK-NEXT:    [[TMP8:%.*]] = getelementptr inbounds i32, i32* [[DST:%.*]], i32 [[TMP1]]
; CHECK-NEXT:    [[TMP9:%.*]] = getelementptr inbounds i32, i32* [[TMP8]], i32 0
; CHECK-NEXT:    [[TMP10:%.*]] = bitcast i32* [[TMP9]] to <4 x i32>*
; CHECK-NEXT:    call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> [[TMP7]], <4 x i32>* [[TMP10]], i32 4, <4 x i1> [[ACTIVE_LANE_MASK]])
; CHECK-NEXT:    [[INDEX_NEXT]] = add i32 [[INDEX]], 4
; CHECK-NEXT:    [[TMP11:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
; CHECK-NEXT:    br i1 [[TMP11]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], [[LOOP12:!llvm.loop !.*]]
; CHECK:       middle.block:
; CHECK-NEXT:    br i1 true, label [[END:%.*]], label [[SCALAR_PH]]
; CHECK:       scalar.ph:
; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i32 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ], [ 0, [[VECTOR_SCEVCHECK]] ]
; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
; CHECK:       for.body:
; CHECK-NEXT:    [[I_023:%.*]] = phi i32 [ [[INC:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
; CHECK-NEXT:    [[MUL:%.*]] = mul nuw nsw i32 [[I_023]], [[STRIDE]]
; CHECK-NEXT:    [[ADD5:%.*]] = add nuw nsw i32 [[MUL]], 2
; CHECK-NEXT:    [[ARRAYIDX6:%.*]] = getelementptr inbounds i32, i32* [[DATA]], i32 [[ADD5]]
; CHECK-NEXT:    [[TMP12:%.*]] = load i32, i32* [[ARRAYIDX6]], align 4
; CHECK-NEXT:    [[ADD7:%.*]] = add nsw i32 5, [[TMP12]]
; CHECK-NEXT:    [[ARRAYIDX9:%.*]] = getelementptr inbounds i32, i32* [[DST]], i32 [[I_023]]
; CHECK-NEXT:    store i32 [[ADD7]], i32* [[ARRAYIDX9]], align 4
; CHECK-NEXT:    [[INC]] = add nuw nsw i32 [[I_023]], 1
; CHECK-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i32 [[INC]], [[N]]
; CHECK-NEXT:    br i1 [[EXITCOND_NOT]], label [[END]], label [[FOR_BODY]], [[LOOP13:!llvm.loop !.*]]
; CHECK:       end:
; CHECK-NEXT:    ret void
;
entry:
  br label %for.body
for.body:                                         ; preds = %entry, %for.body
  %i.023 = phi i32 [ %inc, %for.body ], [ 0, %entry ]
  %mul = mul nuw nsw i32 %i.023, %stride
  %add5 = add nuw nsw i32 %mul, 2
  %arrayidx6 = getelementptr inbounds i32, i32* %data, i32 %add5
  %0 = load i32, i32* %arrayidx6, align 4
  %add7 = add nsw i32 5, %0
  %arrayidx9 = getelementptr inbounds i32, i32* %dst, i32 %i.023
  store i32 %add7, i32* %arrayidx9, align 4
  %inc = add nuw nsw i32 %i.023, 1
  %exitcond.not = icmp eq i32 %inc, %n
  br i1 %exitcond.not, label %end, label %for.body
end:                                 ; preds = %for.body
  ret void
}
; Non-invariant stride (secondary induction: starts at 3, +8 per iteration): vectorized with an all-true masked gather and a scalar epilogue — not tail-folded.
define void @test_stride_noninvar_4i32(i32* readonly %data, i32* noalias nocapture %dst, i32 %n) {
; CHECK-LABEL: @test_stride_noninvar_4i32(
; CHECK-NEXT:  entry:
; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i32 [[N:%.*]], 4
; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
; CHECK:       vector.ph:
; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i32 [[N]], 4
; CHECK-NEXT:    [[N_VEC:%.*]] = sub i32 [[N]], [[N_MOD_VF]]
; CHECK-NEXT:    [[TMP0:%.*]] = mul i32 [[N_VEC]], 8
; CHECK-NEXT:    [[IND_END:%.*]] = add i32 3, [[TMP0]]
; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
; CHECK:       vector.body:
; CHECK-NEXT:    [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT:    [[VEC_IND:%.*]] = phi <4 x i32> [ <i32 0, i32 1, i32 2, i32 3>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT:    [[VEC_IND2:%.*]] = phi <4 x i32> [ <i32 3, i32 11, i32 19, i32 27>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT3:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT:    [[TMP1:%.*]] = add i32 [[INDEX]], 0
; CHECK-NEXT:    [[TMP2:%.*]] = add i32 [[INDEX]], 1
; CHECK-NEXT:    [[TMP3:%.*]] = add i32 [[INDEX]], 2
; CHECK-NEXT:    [[TMP4:%.*]] = add i32 [[INDEX]], 3
; CHECK-NEXT:    [[TMP5:%.*]] = mul nuw nsw <4 x i32> [[VEC_IND]], [[VEC_IND2]]
; CHECK-NEXT:    [[TMP6:%.*]] = add nuw nsw <4 x i32> [[TMP5]], <i32 2, i32 2, i32 2, i32 2>
; CHECK-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i32, i32* [[DATA:%.*]], <4 x i32> [[TMP6]]
; CHECK-NEXT:    [[WIDE_MASKED_GATHER:%.*]] = call <4 x i32> @llvm.masked.gather.v4i32.v4p0i32(<4 x i32*> [[TMP7]], i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x i32> undef)
; CHECK-NEXT:    [[TMP8:%.*]] = add nsw <4 x i32> <i32 5, i32 5, i32 5, i32 5>, [[WIDE_MASKED_GATHER]]
; CHECK-NEXT:    [[TMP9:%.*]] = getelementptr inbounds i32, i32* [[DST:%.*]], i32 [[TMP1]]
; CHECK-NEXT:    [[TMP10:%.*]] = getelementptr inbounds i32, i32* [[TMP9]], i32 0
; CHECK-NEXT:    [[TMP11:%.*]] = bitcast i32* [[TMP10]] to <4 x i32>*
; CHECK-NEXT:    store <4 x i32> [[TMP8]], <4 x i32>* [[TMP11]], align 4
; CHECK-NEXT:    [[INDEX_NEXT]] = add i32 [[INDEX]], 4
; CHECK-NEXT:    [[VEC_IND_NEXT]] = add <4 x i32> [[VEC_IND]], <i32 4, i32 4, i32 4, i32 4>
; CHECK-NEXT:    [[VEC_IND_NEXT3]] = add <4 x i32> [[VEC_IND2]], <i32 32, i32 32, i32 32, i32 32>
; CHECK-NEXT:    [[TMP12:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
; CHECK-NEXT:    br i1 [[TMP12]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], [[LOOP14:!llvm.loop !.*]]
; CHECK:       middle.block:
; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i32 [[N]], [[N_VEC]]
; CHECK-NEXT:    br i1 [[CMP_N]], label [[END:%.*]], label [[SCALAR_PH]]
; CHECK:       scalar.ph:
; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i32 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
; CHECK-NEXT:    [[BC_RESUME_VAL1:%.*]] = phi i32 [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ 3, [[ENTRY]] ]
; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
; CHECK:       for.body:
; CHECK-NEXT:    [[I_023:%.*]] = phi i32 [ [[INC:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
; CHECK-NEXT:    [[STRIDE:%.*]] = phi i32 [ [[NEXT_STRIDE:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL1]], [[SCALAR_PH]] ]
; CHECK-NEXT:    [[MUL:%.*]] = mul nuw nsw i32 [[I_023]], [[STRIDE]]
; CHECK-NEXT:    [[ADD5:%.*]] = add nuw nsw i32 [[MUL]], 2
; CHECK-NEXT:    [[ARRAYIDX6:%.*]] = getelementptr inbounds i32, i32* [[DATA]], i32 [[ADD5]]
; CHECK-NEXT:    [[TMP13:%.*]] = load i32, i32* [[ARRAYIDX6]], align 4
; CHECK-NEXT:    [[ADD7:%.*]] = add nsw i32 5, [[TMP13]]
; CHECK-NEXT:    [[ARRAYIDX9:%.*]] = getelementptr inbounds i32, i32* [[DST]], i32 [[I_023]]
; CHECK-NEXT:    store i32 [[ADD7]], i32* [[ARRAYIDX9]], align 4
; CHECK-NEXT:    [[INC]] = add nuw nsw i32 [[I_023]], 1
; CHECK-NEXT:    [[NEXT_STRIDE]] = add nuw nsw i32 [[STRIDE]], 8
; CHECK-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i32 [[INC]], [[N]]
; CHECK-NEXT:    br i1 [[EXITCOND_NOT]], label [[END]], label [[FOR_BODY]], [[LOOP15:!llvm.loop !.*]]
; CHECK:       end:
; CHECK-NEXT:    ret void
;
entry:
  br label %for.body
for.body:                                         ; preds = %entry, %for.body
  %i.023 = phi i32 [ %inc, %for.body ], [ 0, %entry ]
  %stride = phi i32 [ %next.stride, %for.body ], [ 3, %entry ]
  %mul = mul nuw nsw i32 %i.023, %stride
  %add5 = add nuw nsw i32 %mul, 2
  %arrayidx6 = getelementptr inbounds i32, i32* %data, i32 %add5
  %0 = load i32, i32* %arrayidx6, align 4
  %add7 = add nsw i32 5, %0
  %arrayidx9 = getelementptr inbounds i32, i32* %dst, i32 %i.023
  store i32 %add7, i32* %arrayidx9, align 4
  %inc = add nuw nsw i32 %i.023, 1
  %next.stride = add nuw nsw i32 %stride, 8
  %exitcond.not = icmp eq i32 %inc, %n
  br i1 %exitcond.not, label %end, label %for.body
end:                                 ; preds = %for.body
  ret void
}
505
; Non-invariant stride that evolves geometrically: the secondary IV starts at 3
; and is multiplied by 8 every iteration. The CHECK lines below contain no
; vector.body -- the loop is left scalar (presumably because a multiplicative
; stride is not an affine induction the vectorizer can model; confirm against
; LoopVectorize's induction handling if this starts vectorizing).
define void @test_stride_noninvar2_4i32(i32* readonly %data, i32* noalias nocapture %dst, i32 %n) {
; CHECK-LABEL: @test_stride_noninvar2_4i32(
; CHECK-NEXT:  entry:
; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
; CHECK:       for.body:
; CHECK-NEXT:    [[I_023:%.*]] = phi i32 [ [[INC:%.*]], [[FOR_BODY]] ], [ 0, [[ENTRY:%.*]] ]
; CHECK-NEXT:    [[STRIDE:%.*]] = phi i32 [ [[NEXT_STRIDE:%.*]], [[FOR_BODY]] ], [ 3, [[ENTRY]] ]
; CHECK-NEXT:    [[MUL:%.*]] = mul nuw nsw i32 [[I_023]], [[STRIDE]]
; CHECK-NEXT:    [[ADD5:%.*]] = add nuw nsw i32 [[MUL]], 2
; CHECK-NEXT:    [[ARRAYIDX6:%.*]] = getelementptr inbounds i32, i32* [[DATA:%.*]], i32 [[ADD5]]
; CHECK-NEXT:    [[TMP0:%.*]] = load i32, i32* [[ARRAYIDX6]], align 4
; CHECK-NEXT:    [[ADD7:%.*]] = add nsw i32 5, [[TMP0]]
; CHECK-NEXT:    [[ARRAYIDX9:%.*]] = getelementptr inbounds i32, i32* [[DST:%.*]], i32 [[I_023]]
; CHECK-NEXT:    store i32 [[ADD7]], i32* [[ARRAYIDX9]], align 4
; CHECK-NEXT:    [[INC]] = add nuw nsw i32 [[I_023]], 1
; CHECK-NEXT:    [[NEXT_STRIDE]] = mul nuw nsw i32 [[STRIDE]], 8
; CHECK-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i32 [[INC]], [[N:%.*]]
; CHECK-NEXT:    br i1 [[EXITCOND_NOT]], label [[END:%.*]], label [[FOR_BODY]]
; CHECK:       end:
; CHECK-NEXT:    ret void
;
entry:
  br label %for.body
for.body:                                         ; preds = %entry, %for.body
  %i.023 = phi i32 [ %inc, %for.body ], [ 0, %entry ]
  %stride = phi i32 [ %next.stride, %for.body ], [ 3, %entry ]
  %mul = mul nuw nsw i32 %i.023, %stride
  %add5 = add nuw nsw i32 %mul, 2
  %arrayidx6 = getelementptr inbounds i32, i32* %data, i32 %add5
  %0 = load i32, i32* %arrayidx6, align 4
  %add7 = add nsw i32 5, %0
  %arrayidx9 = getelementptr inbounds i32, i32* %dst, i32 %i.023
  store i32 %add7, i32* %arrayidx9, align 4
  %inc = add nuw nsw i32 %i.023, 1
  ; Geometric update of the secondary IV (stride *= 8) -- the non-affine case
  ; this test exists to cover.
  %next.stride = mul nuw nsw i32 %stride, 8
  %exitcond.not = icmp eq i32 %inc, %n
  br i1 %exitcond.not, label %end, label %for.body
end:                                              ; preds = %for.body
  ret void
}
546
; Non-invariant stride with a loop-invariant step: the secondary IV starts at 3
; and is incremented by %x every iteration, i.e. an affine induction. The CHECK
; lines show the loop IS vectorized at VF=4: the strided load becomes
; llvm.masked.gather, the store stays a contiguous vector store, and a scalar
; remainder loop follows (note this one gets a min-iters check rather than the
; tail-folded form used elsewhere in this file).
define void @test_stride_noninvar3_4i32(i32* readonly %data, i32* noalias nocapture %dst, i32 %n, i32 %x) {
; CHECK-LABEL: @test_stride_noninvar3_4i32(
; CHECK-NEXT:  entry:
; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i32 [[N:%.*]], 4
; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
; CHECK:       vector.ph:
; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i32 [[N]], 4
; CHECK-NEXT:    [[N_VEC:%.*]] = sub i32 [[N]], [[N_MOD_VF]]
; CHECK-NEXT:    [[TMP0:%.*]] = mul i32 [[N_VEC]], [[X:%.*]]
; CHECK-NEXT:    [[IND_END:%.*]] = add i32 3, [[TMP0]]
; CHECK-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <4 x i32> undef, i32 [[X]], i32 0
; CHECK-NEXT:    [[DOTSPLAT:%.*]] = shufflevector <4 x i32> [[DOTSPLATINSERT]], <4 x i32> undef, <4 x i32> zeroinitializer
; CHECK-NEXT:    [[TMP1:%.*]] = mul <4 x i32> <i32 0, i32 1, i32 2, i32 3>, [[DOTSPLAT]]
; CHECK-NEXT:    [[INDUCTION:%.*]] = add <4 x i32> <i32 3, i32 3, i32 3, i32 3>, [[TMP1]]
; CHECK-NEXT:    [[TMP2:%.*]] = mul i32 [[X]], 4
; CHECK-NEXT:    [[DOTSPLATINSERT2:%.*]] = insertelement <4 x i32> undef, i32 [[TMP2]], i32 0
; CHECK-NEXT:    [[DOTSPLAT3:%.*]] = shufflevector <4 x i32> [[DOTSPLATINSERT2]], <4 x i32> undef, <4 x i32> zeroinitializer
; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
; CHECK:       vector.body:
; CHECK-NEXT:    [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT:    [[VEC_IND:%.*]] = phi <4 x i32> [ <i32 0, i32 1, i32 2, i32 3>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT:    [[VEC_IND4:%.*]] = phi <4 x i32> [ [[INDUCTION]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT5:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT:    [[TMP3:%.*]] = add i32 [[INDEX]], 0
; CHECK-NEXT:    [[TMP4:%.*]] = add i32 [[INDEX]], 1
; CHECK-NEXT:    [[TMP5:%.*]] = add i32 [[INDEX]], 2
; CHECK-NEXT:    [[TMP6:%.*]] = add i32 [[INDEX]], 3
; CHECK-NEXT:    [[TMP7:%.*]] = mul nuw nsw <4 x i32> [[VEC_IND]], [[VEC_IND4]]
; CHECK-NEXT:    [[TMP8:%.*]] = add nuw nsw <4 x i32> [[TMP7]], <i32 2, i32 2, i32 2, i32 2>
; CHECK-NEXT:    [[TMP9:%.*]] = getelementptr inbounds i32, i32* [[DATA:%.*]], <4 x i32> [[TMP8]]
; CHECK-NEXT:    [[WIDE_MASKED_GATHER:%.*]] = call <4 x i32> @llvm.masked.gather.v4i32.v4p0i32(<4 x i32*> [[TMP9]], i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x i32> undef)
; CHECK-NEXT:    [[TMP10:%.*]] = add nsw <4 x i32> <i32 5, i32 5, i32 5, i32 5>, [[WIDE_MASKED_GATHER]]
; CHECK-NEXT:    [[TMP11:%.*]] = getelementptr inbounds i32, i32* [[DST:%.*]], i32 [[TMP3]]
; CHECK-NEXT:    [[TMP12:%.*]] = getelementptr inbounds i32, i32* [[TMP11]], i32 0
; CHECK-NEXT:    [[TMP13:%.*]] = bitcast i32* [[TMP12]] to <4 x i32>*
; CHECK-NEXT:    store <4 x i32> [[TMP10]], <4 x i32>* [[TMP13]], align 4
; CHECK-NEXT:    [[INDEX_NEXT]] = add i32 [[INDEX]], 4
; CHECK-NEXT:    [[VEC_IND_NEXT]] = add <4 x i32> [[VEC_IND]], <i32 4, i32 4, i32 4, i32 4>
; CHECK-NEXT:    [[VEC_IND_NEXT5]] = add <4 x i32> [[VEC_IND4]], [[DOTSPLAT3]]
; CHECK-NEXT:    [[TMP14:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
; CHECK-NEXT:    br i1 [[TMP14]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], [[LOOP16:!llvm.loop !.*]]
; CHECK:       middle.block:
; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i32 [[N]], [[N_VEC]]
; CHECK-NEXT:    br i1 [[CMP_N]], label [[END:%.*]], label [[SCALAR_PH]]
; CHECK:       scalar.ph:
; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i32 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
; CHECK-NEXT:    [[BC_RESUME_VAL1:%.*]] = phi i32 [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ 3, [[ENTRY]] ]
; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
; CHECK:       for.body:
; CHECK-NEXT:    [[I_023:%.*]] = phi i32 [ [[INC:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
; CHECK-NEXT:    [[STRIDE:%.*]] = phi i32 [ [[NEXT_STRIDE:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL1]], [[SCALAR_PH]] ]
; CHECK-NEXT:    [[MUL:%.*]] = mul nuw nsw i32 [[I_023]], [[STRIDE]]
; CHECK-NEXT:    [[ADD5:%.*]] = add nuw nsw i32 [[MUL]], 2
; CHECK-NEXT:    [[ARRAYIDX6:%.*]] = getelementptr inbounds i32, i32* [[DATA]], i32 [[ADD5]]
; CHECK-NEXT:    [[TMP15:%.*]] = load i32, i32* [[ARRAYIDX6]], align 4
; CHECK-NEXT:    [[ADD7:%.*]] = add nsw i32 5, [[TMP15]]
; CHECK-NEXT:    [[ARRAYIDX9:%.*]] = getelementptr inbounds i32, i32* [[DST]], i32 [[I_023]]
; CHECK-NEXT:    store i32 [[ADD7]], i32* [[ARRAYIDX9]], align 4
; CHECK-NEXT:    [[INC]] = add nuw nsw i32 [[I_023]], 1
; CHECK-NEXT:    [[NEXT_STRIDE]] = add nuw nsw i32 [[STRIDE]], [[X]]
; CHECK-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i32 [[INC]], [[N]]
; CHECK-NEXT:    br i1 [[EXITCOND_NOT]], label [[END]], label [[FOR_BODY]], [[LOOP17:!llvm.loop !.*]]
; CHECK:       end:
; CHECK-NEXT:    ret void
;
entry:
  br label %for.body
for.body:                                         ; preds = %entry, %for.body
  %i.023 = phi i32 [ %inc, %for.body ], [ 0, %entry ]
  %stride = phi i32 [ %next.stride, %for.body ], [ 3, %entry ]
  %mul = mul nuw nsw i32 %i.023, %stride
  %add5 = add nuw nsw i32 %mul, 2
  %arrayidx6 = getelementptr inbounds i32, i32* %data, i32 %add5
  %0 = load i32, i32* %arrayidx6, align 4
  %add7 = add nsw i32 5, %0
  %arrayidx9 = getelementptr inbounds i32, i32* %dst, i32 %i.023
  store i32 %add7, i32* %arrayidx9, align 4
  %inc = add nuw nsw i32 %i.023, 1
  ; Affine update of the secondary IV (stride += %x, %x loop-invariant) --
  ; the vectorizable case this test exists to cover.
  %next.stride = add nuw nsw i32 %stride, %x
  %exitcond.not = icmp eq i32 %inc, %n
  br i1 %exitcond.not, label %end, label %for.body
end:                                              ; preds = %for.body
  ret void
}
630
; Intrinsic declarations referenced by the vectorized output matched above
; (masked gather/load and the active-lane-mask used for tail predication).
declare i32 @llvm.vector.reduce.add.v4i32(<4 x i32>)
declare <4 x i32> @llvm.masked.gather.v4i32.v4p0i32(<4 x i32*>, i32, <4 x i1>, <4 x i32>)
declare <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32, i32)
declare <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>*, i32, <4 x i1>, <4 x i32>)
declare void @llvm.masked.scatter.v4i32.v4p0i32(<4 x i32>, <4 x i32*>, i32, <4 x i1>)
636