1; RUN: opt -S -loop-vectorize -instcombine -force-vector-width=4 -force-vector-interleave=1 -enable-interleaved-mem-accesses=true -runtime-memory-check-threshold=24 < %s | FileCheck %s
2
3target datalayout = "e-m:e-i64:64-i128:128-n32:64-S128"
4
5; Check vectorization on an interleaved load group of factor 2 and an interleaved
6; store group of factor 2.
7
8; int AB[1024];
9; int CD[1024];
10;  void test_array_load2_store2(int C, int D) {
11;   for (int i = 0; i < 1024; i+=2) {
12;     int A = AB[i];
13;     int B = AB[i+1];
14;     CD[i] = A + C;
15;     CD[i+1] = B * D;
16;   }
17; }
18
19; CHECK-LABEL: @test_array_load2_store2(
20; CHECK: %wide.vec = load <8 x i32>, <8 x i32>* %{{.*}}, align 4
21; CHECK: shufflevector <8 x i32> %wide.vec, <8 x i32> undef, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
22; CHECK: shufflevector <8 x i32> %wide.vec, <8 x i32> undef, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
23; CHECK: add nsw <4 x i32>
24; CHECK: mul nsw <4 x i32>
25; CHECK: %interleaved.vec = shufflevector <4 x i32> {{.*}}, <8 x i32> <i32 0, i32 4, i32 1, i32 5, i32 2, i32 6, i32 3, i32 7>
26; CHECK: store <8 x i32> %interleaved.vec, <8 x i32>* %{{.*}}, align 4
27
28@AB = common global [1024 x i32] zeroinitializer, align 4
29@CD = common global [1024 x i32] zeroinitializer, align 4
30
; Factor-2 interleaved load group (@AB) feeding a factor-2 interleaved store
; group (@CD); the induction variable advances by 2 per iteration.
31define void @test_array_load2_store2(i32 %C, i32 %D) {
32entry:
33  br label %for.body
34
35for.body:                                         ; preds = %for.body, %entry
36  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
37  %arrayidx0 = getelementptr inbounds [1024 x i32], [1024 x i32]* @AB, i64 0, i64 %indvars.iv
38  %tmp = load i32, i32* %arrayidx0, align 4       ; A = AB[i]
39  %tmp1 = or i64 %indvars.iv, 1                   ; i+1 (iv is always even, so 'or 1' == add 1)
40  %arrayidx1 = getelementptr inbounds [1024 x i32], [1024 x i32]* @AB, i64 0, i64 %tmp1
41  %tmp2 = load i32, i32* %arrayidx1, align 4      ; B = AB[i+1]
42  %add = add nsw i32 %tmp, %C
43  %mul = mul nsw i32 %tmp2, %D
44  %arrayidx2 = getelementptr inbounds [1024 x i32], [1024 x i32]* @CD, i64 0, i64 %indvars.iv
45  store i32 %add, i32* %arrayidx2, align 4        ; CD[i] = A + C
46  %arrayidx3 = getelementptr inbounds [1024 x i32], [1024 x i32]* @CD, i64 0, i64 %tmp1
47  store i32 %mul, i32* %arrayidx3, align 4        ; CD[i+1] = B * D
48  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 2
49  %cmp = icmp slt i64 %indvars.iv.next, 1024
50  br i1 %cmp, label %for.body, label %for.end
51
52for.end:                                          ; preds = %for.body
53  ret void
54}
55
56; int A[3072];
57; struct ST S[1024];
58; void test_struct_array_load3_store3() {
59;   int *ptr = A;
60;   for (int i = 0; i < 1024; i++) {
61;     int X1 = *ptr++;
62;     int X2 = *ptr++;
63;     int X3 = *ptr++;
64;     S[i].x = X1 + 1;
65;     S[i].y = X2 + 2;
66;     S[i].z = X3 + 3;
67;   }
68; }
69
70; CHECK-LABEL: @test_struct_array_load3_store3(
71; CHECK: %wide.vec = load <12 x i32>, <12 x i32>* {{.*}}, align 4
72; CHECK: shufflevector <12 x i32> %wide.vec, <12 x i32> undef, <4 x i32> <i32 0, i32 3, i32 6, i32 9>
73; CHECK: shufflevector <12 x i32> %wide.vec, <12 x i32> undef, <4 x i32> <i32 1, i32 4, i32 7, i32 10>
74; CHECK: shufflevector <12 x i32> %wide.vec, <12 x i32> undef, <4 x i32> <i32 2, i32 5, i32 8, i32 11>
75; CHECK: add nsw <4 x i32> {{.*}}, <i32 1, i32 1, i32 1, i32 1>
76; CHECK: add nsw <4 x i32> {{.*}}, <i32 2, i32 2, i32 2, i32 2>
77; CHECK: add nsw <4 x i32> {{.*}}, <i32 3, i32 3, i32 3, i32 3>
78; CHECK: shufflevector <4 x i32> {{.*}}, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
79; CHECK: shufflevector <4 x i32> {{.*}}, <4 x i32> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef>
80; CHECK: %interleaved.vec = shufflevector <8 x i32> {{.*}}, <12 x i32> <i32 0, i32 4, i32 8, i32 1, i32 5, i32 9, i32 2, i32 6, i32 10, i32 3, i32 7, i32 11>
81; CHECK: store <12 x i32> %interleaved.vec, <12 x i32>* {{.*}}, align 4
82
83%struct.ST3 = type { i32, i32, i32 }
84@A = common global [3072 x i32] zeroinitializer, align 4
85@S = common global [1024 x %struct.ST3] zeroinitializer, align 4
86
; Walks @A with a pointer phi that advances by 3 each iteration (a factor-3
; interleaved load group) and stores into the three fields of each @S element
; (a factor-3 interleaved store group).
87define void @test_struct_array_load3_store3() {
88entry:
89  br label %for.body
90
91for.body:                                         ; preds = %for.body, %entry
92  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
93  %ptr.016 = phi i32* [ getelementptr inbounds ([3072 x i32], [3072 x i32]* @A, i64 0, i64 0), %entry ], [ %incdec.ptr2, %for.body ]
94  %incdec.ptr = getelementptr inbounds i32, i32* %ptr.016, i64 1
95  %tmp = load i32, i32* %ptr.016, align 4         ; X1 = ptr[0]
96  %incdec.ptr1 = getelementptr inbounds i32, i32* %ptr.016, i64 2
97  %tmp1 = load i32, i32* %incdec.ptr, align 4     ; X2 = ptr[1]
98  %incdec.ptr2 = getelementptr inbounds i32, i32* %ptr.016, i64 3
99  %tmp2 = load i32, i32* %incdec.ptr1, align 4    ; X3 = ptr[2]
100  %add = add nsw i32 %tmp, 1
101  %x = getelementptr inbounds [1024 x %struct.ST3], [1024 x %struct.ST3]* @S, i64 0, i64 %indvars.iv, i32 0
102  store i32 %add, i32* %x, align 4               ; S[i].x = X1 + 1
103  %add3 = add nsw i32 %tmp1, 2
104  %y = getelementptr inbounds [1024 x %struct.ST3], [1024 x %struct.ST3]* @S, i64 0, i64 %indvars.iv, i32 1
105  store i32 %add3, i32* %y, align 4              ; S[i].y = X2 + 2
106  %add6 = add nsw i32 %tmp2, 3
107  %z = getelementptr inbounds [1024 x %struct.ST3], [1024 x %struct.ST3]* @S, i64 0, i64 %indvars.iv, i32 2
108  store i32 %add6, i32* %z, align 4              ; S[i].z = X3 + 3
109  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
110  %exitcond = icmp eq i64 %indvars.iv.next, 1024
111  br i1 %exitcond, label %for.end, label %for.body
112
113for.end:                                          ; preds = %for.body
114  ret void
115}
116
117; Check vectorization on an interleaved load group of factor 4.
118
119; struct ST4{
120;   int x;
121;   int y;
122;   int z;
123;   int w;
124; };
125; int test_struct_load4(struct ST4 *S) {
126;   int r = 0;
127;   for (int i = 0; i < 1024; i++) {
128;      r += S[i].x;
129;      r -= S[i].y;
130;      r += S[i].z;
131;      r -= S[i].w;
132;   }
133;   return r;
134; }
135
136; CHECK-LABEL: @test_struct_load4(
137; CHECK: %wide.vec = load <16 x i32>, <16 x i32>* {{.*}}, align 4
138; CHECK: shufflevector <16 x i32> %wide.vec, <16 x i32> undef, <4 x i32> <i32 0, i32 4, i32 8, i32 12>
139; CHECK: shufflevector <16 x i32> %wide.vec, <16 x i32> undef, <4 x i32> <i32 1, i32 5, i32 9, i32 13>
140; CHECK: shufflevector <16 x i32> %wide.vec, <16 x i32> undef, <4 x i32> <i32 2, i32 6, i32 10, i32 14>
141; CHECK: shufflevector <16 x i32> %wide.vec, <16 x i32> undef, <4 x i32> <i32 3, i32 7, i32 11, i32 15>
142; CHECK: add nsw <4 x i32>
143; CHECK: sub <4 x i32>
144; CHECK: add nsw <4 x i32>
145; CHECK: sub <4 x i32>
146
147%struct.ST4 = type { i32, i32, i32, i32 }
148
; Factor-4 interleaved load group: all four fields of each ST4 element are
; read and folded into a single scalar add/sub reduction returned on exit.
149define i32 @test_struct_load4(%struct.ST4* nocapture readonly %S) {
150entry:
151  br label %for.body
152
153for.body:                                         ; preds = %for.body, %entry
154  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
155  %r.022 = phi i32 [ 0, %entry ], [ %sub8, %for.body ]   ; running reduction r
156  %x = getelementptr inbounds %struct.ST4, %struct.ST4* %S, i64 %indvars.iv, i32 0
157  %tmp = load i32, i32* %x, align 4
158  %add = add nsw i32 %tmp, %r.022                 ; r += S[i].x
159  %y = getelementptr inbounds %struct.ST4, %struct.ST4* %S, i64 %indvars.iv, i32 1
160  %tmp1 = load i32, i32* %y, align 4
161  %sub = sub i32 %add, %tmp1                      ; r -= S[i].y
162  %z = getelementptr inbounds %struct.ST4, %struct.ST4* %S, i64 %indvars.iv, i32 2
163  %tmp2 = load i32, i32* %z, align 4
164  %add5 = add nsw i32 %sub, %tmp2                 ; r += S[i].z
165  %w = getelementptr inbounds %struct.ST4, %struct.ST4* %S, i64 %indvars.iv, i32 3
166  %tmp3 = load i32, i32* %w, align 4
167  %sub8 = sub i32 %add5, %tmp3                    ; r -= S[i].w
168  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
169  %exitcond = icmp eq i64 %indvars.iv.next, 1024
170  br i1 %exitcond, label %for.end, label %for.body
171
172for.end:                                          ; preds = %for.body
173  ret i32 %sub8
174}
175
176; Check vectorization on an interleaved store group of factor 4.
177
178; void test_struct_store4(int *A, struct ST4 *B) {
179;   int *ptr = A;
180;   for (int i = 0; i < 1024; i++) {
181;     int X = *ptr++;
182;     B[i].x = X + 1;
183;     B[i].y = X * 2;
184;     B[i].z = X + 3;
185;     B[i].w = X + 4;
186;   }
187; }
188
189; CHECK-LABEL: @test_struct_store4(
190; CHECK: %[[LD:.*]] = load <4 x i32>, <4 x i32>*
191; CHECK: add nsw <4 x i32> %[[LD]], <i32 1, i32 1, i32 1, i32 1>
192; CHECK: shl nsw <4 x i32> %[[LD]], <i32 1, i32 1, i32 1, i32 1>
193; CHECK: add nsw <4 x i32> %[[LD]], <i32 3, i32 3, i32 3, i32 3>
194; CHECK: add nsw <4 x i32> %[[LD]], <i32 4, i32 4, i32 4, i32 4>
195; CHECK: shufflevector <4 x i32> {{.*}}, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
196; CHECK: shufflevector <4 x i32> {{.*}}, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
197; CHECK: %interleaved.vec = shufflevector <8 x i32> {{.*}}, <16 x i32> <i32 0, i32 4, i32 8, i32 12, i32 1, i32 5, i32 9, i32 13, i32 2, i32 6, i32 10, i32 14, i32 3, i32 7, i32 11, i32 15>
198; CHECK: store <16 x i32> %interleaved.vec, <16 x i32>* {{.*}}, align 4
199
; A single contiguous load from %A feeds a factor-4 interleaved store group
; covering the four fields of each ST4 element of %B.
200define void @test_struct_store4(i32* noalias nocapture readonly %A, %struct.ST4* noalias nocapture %B) {
201entry:
202  br label %for.body
203
204for.cond.cleanup:                                 ; preds = %for.body
205  ret void
206
207for.body:                                         ; preds = %for.body, %entry
208  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
209  %ptr.024 = phi i32* [ %A, %entry ], [ %incdec.ptr, %for.body ]
210  %incdec.ptr = getelementptr inbounds i32, i32* %ptr.024, i64 1
211  %tmp = load i32, i32* %ptr.024, align 4         ; X = *ptr++
212  %add = add nsw i32 %tmp, 1
213  %x = getelementptr inbounds %struct.ST4, %struct.ST4* %B, i64 %indvars.iv, i32 0
214  store i32 %add, i32* %x, align 4                ; B[i].x = X + 1
215  %mul = shl nsw i32 %tmp, 1                      ; X * 2 strength-reduced to a shift
216  %y = getelementptr inbounds %struct.ST4, %struct.ST4* %B, i64 %indvars.iv, i32 1
217  store i32 %mul, i32* %y, align 4                ; B[i].y = X * 2
218  %add3 = add nsw i32 %tmp, 3
219  %z = getelementptr inbounds %struct.ST4, %struct.ST4* %B, i64 %indvars.iv, i32 2
220  store i32 %add3, i32* %z, align 4               ; B[i].z = X + 3
221  %add6 = add nsw i32 %tmp, 4
222  %w = getelementptr inbounds %struct.ST4, %struct.ST4* %B, i64 %indvars.iv, i32 3
223  store i32 %add6, i32* %w, align 4               ; B[i].w = X + 4
224  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
225  %exitcond = icmp eq i64 %indvars.iv.next, 1024
226  br i1 %exitcond, label %for.cond.cleanup, label %for.body
227}
228
229; Check vectorization on a reverse interleaved load group of factor 2 and
230; a reverse interleaved store group of factor 2.
231
232; struct ST2 {
233;  int x;
234;  int y;
235; };
236;
237; void test_reversed_load2_store2(struct ST2 *A, struct ST2 *B) {
238;   for (int i = 1023; i >= 0; i--) {
239;     int a = A[i].x + i;  // interleaved load of index 0
240;     int b = A[i].y - i;  // interleaved load of index 1
241;     B[i].x = a;          // interleaved store of index 0
242;     B[i].y = b;          // interleaved store of index 1
243;   }
244; }
245
246; CHECK-LABEL: @test_reversed_load2_store2(
247; CHECK: %wide.vec = load <8 x i32>, <8 x i32>* {{.*}}, align 4
248; CHECK: shufflevector <8 x i32> %wide.vec, <8 x i32> undef, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
249; CHECK: shufflevector <4 x i32> {{.*}}, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
250; CHECK: shufflevector <8 x i32> %wide.vec, <8 x i32> undef, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
251; CHECK: shufflevector <4 x i32> {{.*}}, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
252; CHECK: add nsw <4 x i32>
253; CHECK: sub nsw <4 x i32>
254; CHECK: shufflevector <4 x i32> {{.*}}, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
255; CHECK: shufflevector <4 x i32> {{.*}}, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
256; CHECK: %interleaved.vec = shufflevector <4 x i32> {{.*}}, <8 x i32> <i32 0, i32 4, i32 1, i32 5, i32 2, i32 6, i32 3, i32 7>
257; CHECK: store <8 x i32> %interleaved.vec, <8 x i32>* %{{.*}}, align 4
258
259%struct.ST2 = type { i32, i32 }
260
; Loop counts down from 1023: a reverse factor-2 interleaved load group from
; %A and a reverse factor-2 interleaved store group to %B.
261define void @test_reversed_load2_store2(%struct.ST2* noalias nocapture readonly %A, %struct.ST2* noalias nocapture %B) {
262entry:
263  br label %for.body
264
265for.cond.cleanup:                                 ; preds = %for.body
266  ret void
267
268for.body:                                         ; preds = %for.body, %entry
269  %indvars.iv = phi i64 [ 1023, %entry ], [ %indvars.iv.next, %for.body ]
270  %x = getelementptr inbounds %struct.ST2, %struct.ST2* %A, i64 %indvars.iv, i32 0
271  %tmp = load i32, i32* %x, align 4
272  %tmp1 = trunc i64 %indvars.iv to i32            ; i as i32
273  %add = add nsw i32 %tmp, %tmp1                  ; a = A[i].x + i
274  %y = getelementptr inbounds %struct.ST2, %struct.ST2* %A, i64 %indvars.iv, i32 1
275  %tmp2 = load i32, i32* %y, align 4
276  %sub = sub nsw i32 %tmp2, %tmp1                 ; b = A[i].y - i
277  %x5 = getelementptr inbounds %struct.ST2, %struct.ST2* %B, i64 %indvars.iv, i32 0
278  store i32 %add, i32* %x5, align 4               ; B[i].x = a
279  %y8 = getelementptr inbounds %struct.ST2, %struct.ST2* %B, i64 %indvars.iv, i32 1
280  store i32 %sub, i32* %y8, align 4               ; B[i].y = b
281  %indvars.iv.next = add nsw i64 %indvars.iv, -1
282  %cmp = icmp sgt i64 %indvars.iv, 0              ; continue while i > 0, so i runs 1023..0
283  br i1 %cmp, label %for.body, label %for.cond.cleanup
284}
285
286; Check vectorization on an interleaved load group of factor 2 with 1 gap
287; (missing the load of odd elements). Because the vectorized loop would
288; speculatively access memory out-of-bounds, we must execute at least one
289; iteration of the scalar loop.
290
291; void even_load_static_tc(int *A, int *B) {
292;  for (unsigned i = 0; i < 1024; i+=2)
293;     B[i/2] = A[i] * 2;
294; }
295
296; CHECK-LABEL: @even_load_static_tc(
297; CHECK: vector.body:
298; CHECK:   %wide.vec = load <8 x i32>, <8 x i32>* %{{.*}}, align 4
299; CHECK:   %strided.vec = shufflevector <8 x i32> %wide.vec, <8 x i32> undef, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
300; CHECK:   icmp eq i64 %index.next, 508
301; CHECK: middle.block:
302; CHECK:   br i1 false, label %for.cond.cleanup, label %scalar.ph
303
; Factor-2 load group with a gap: only the even elements of %A are read, so a
; wide load would touch odd elements too; the vector loop must leave a scalar
; epilogue iteration to stay in bounds even with this known trip count.
304define void @even_load_static_tc(i32* noalias nocapture readonly %A, i32* noalias nocapture %B) {
305entry:
306  br label %for.body
307
308for.cond.cleanup:                                 ; preds = %for.body
309  ret void
310
311for.body:                                         ; preds = %for.body, %entry
312  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
313  %arrayidx = getelementptr inbounds i32, i32* %A, i64 %indvars.iv
314  %tmp = load i32, i32* %arrayidx, align 4        ; A[i]; A[i+1] is never read (the gap)
315  %mul = shl nsw i32 %tmp, 1                      ; A[i] * 2 as a shift
316  %tmp1 = lshr exact i64 %indvars.iv, 1           ; i/2 (iv is even, hence 'exact')
317  %arrayidx2 = getelementptr inbounds i32, i32* %B, i64 %tmp1
318  store i32 %mul, i32* %arrayidx2, align 4        ; B[i/2] = A[i] * 2
319  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 2
320  %cmp = icmp ult i64 %indvars.iv.next, 1024
321  br i1 %cmp, label %for.body, label %for.cond.cleanup
322}
323
324; Check vectorization on an interleaved load group of factor 2 with 1 gap
325; (missing the load of odd elements). Because the vectorized loop would
326; speculatively access memory out-of-bounds, we must execute at least one
327; iteration of the scalar loop.
328
329; void even_load_dynamic_tc(int *A, int *B, unsigned N) {
330;  for (unsigned i = 0; i < N; i+=2)
331;     B[i/2] = A[i] * 2;
332; }
333
334; CHECK-LABEL: @even_load_dynamic_tc(
335; CHECK: min.iters.checked:
336; CHECK:   %n.mod.vf = and i64 %[[N:[a-zA-Z0-9]+]], 3
337; CHECK:   %[[IsZero:[a-zA-Z0-9]+]] = icmp eq i64 %n.mod.vf, 0
338; CHECK:   %[[R:[a-zA-Z0-9]+]] = select i1 %[[IsZero]], i64 4, i64 %n.mod.vf
339; CHECK:   %n.vec = sub i64 %[[N]], %[[R]]
340; CHECK: vector.body:
341; CHECK:   %wide.vec = load <8 x i32>, <8 x i32>* %{{.*}}, align 4
342; CHECK:   %strided.vec = shufflevector <8 x i32> %wide.vec, <8 x i32> undef, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
343; CHECK:   icmp eq i64 %index.next, %n.vec
344; CHECK: middle.block:
345; CHECK:   br i1 false, label %for.cond.cleanup, label %scalar.ph
346
; Same gapped factor-2 load as even_load_static_tc, but with a runtime trip
; count %N, so the vector trip count is computed from N minus a non-zero
; remainder that guarantees at least one scalar iteration.
347define void @even_load_dynamic_tc(i32* noalias nocapture readonly %A, i32* noalias nocapture %B, i64 %N) {
348entry:
349  br label %for.body
350
351for.cond.cleanup:                                 ; preds = %for.body
352  ret void
353
354for.body:                                         ; preds = %for.body, %entry
355  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
356  %arrayidx = getelementptr inbounds i32, i32* %A, i64 %indvars.iv
357  %tmp = load i32, i32* %arrayidx, align 4        ; A[i]; odd elements never read (the gap)
358  %mul = shl nsw i32 %tmp, 1                      ; A[i] * 2 as a shift
359  %tmp1 = lshr exact i64 %indvars.iv, 1           ; i/2 (iv is even, hence 'exact')
360  %arrayidx2 = getelementptr inbounds i32, i32* %B, i64 %tmp1
361  store i32 %mul, i32* %arrayidx2, align 4        ; B[i/2] = A[i] * 2
362  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 2
363  %cmp = icmp ult i64 %indvars.iv.next, %N
364  br i1 %cmp, label %for.body, label %for.cond.cleanup
365}
366
367; Check vectorization on a reverse interleaved load group of factor 2 with 1
368; gap and a reverse interleaved store group of factor 2. The interleaved load
369; group should be removed since it has a gap and is reverse.
370
371; struct pair {
372;  int x;
373;  int y;
374; };
375;
376; void load_gap_reverse(struct pair *P1, struct pair *P2, int X) {
377;   for (int i = 1023; i >= 0; i--) {
378;     int a = X + i;
379;     int b = P2[i].y - i;
380;     P1[i].x = a;
381;     P2[i].y = b;
382;   }
383; }
384
385; CHECK-LABEL: @load_gap_reverse(
386; CHECK-NOT: %wide.vec = load <8 x i64>, <8 x i64>* %{{.*}}, align 8
387; CHECK-NOT: %strided.vec = shufflevector <8 x i64> %wide.vec, <8 x i64> undef, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
388
389%pair = type { i64, i64 }
; Reverse-counting loop whose only load is P2[i].y (P1 is store-only), i.e. a
; reverse load group with a gap; the CHECK-NOTs above assert no wide load /
; strided shuffle is formed for it.
390define void @load_gap_reverse(%pair* noalias nocapture readonly %P1, %pair* noalias nocapture readonly %P2, i64 %X) {
391entry:
392  br label %for.body
393
394for.body:
395  %i = phi i64 [ 1023, %entry ], [ %i.next, %for.body ]
396  %0 = add nsw i64 %X, %i                         ; a = X + i
397  %1 = getelementptr inbounds %pair, %pair* %P1, i64 %i, i32 0
398  %2 = getelementptr inbounds %pair, %pair* %P2, i64 %i, i32 1
399  %3 = load i64, i64* %2, align 8                 ; P2[i].y
400  %4 = sub nsw i64 %3, %i                         ; b = P2[i].y - i
401  store i64 %0, i64* %1, align 8                  ; P1[i].x = a
402  store i64 %4, i64* %2, align 8                  ; P2[i].y = b
403  %i.next = add nsw i64 %i, -1
404  %cond = icmp sgt i64 %i, 0                      ; continue while i > 0, so i runs 1023..0
405  br i1 %cond, label %for.body, label %for.exit
406
407for.exit:
408  ret void
409}
410
411; Check vectorization on interleaved access groups identified from mixed
412; loads/stores.
413; void mixed_load2_store2(int *A, int *B) {
414;   for (unsigned i = 0; i < 1024; i+=2)  {
415;     B[i] = A[i] * A[i+1];
416;     B[i+1] = A[i] + A[i+1];
417;   }
418; }
419
420; CHECK-LABEL: @mixed_load2_store2(
421; CHECK: %wide.vec = load <8 x i32>, <8 x i32>* {{.*}}, align 4
422; CHECK: shufflevector <8 x i32> %wide.vec, <8 x i32> undef, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
423; CHECK: shufflevector <8 x i32> %wide.vec, <8 x i32> undef, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
424; CHECK: %interleaved.vec = shufflevector <4 x i32> %{{.*}}, <8 x i32> <i32 0, i32 4, i32 1, i32 5, i32 2, i32 6, i32 3, i32 7>
425; CHECK: store <8 x i32> %interleaved.vec
426
; Loads of A[i]/A[i+1] and stores of B[i]/B[i+1] are interleaved with each
; other in the source order; both still form factor-2 groups.
427define void @mixed_load2_store2(i32* noalias nocapture readonly %A, i32* noalias nocapture %B) {
428entry:
429  br label %for.body
430
431for.cond.cleanup:                                 ; preds = %for.body
432  ret void
433
434for.body:                                         ; preds = %for.body, %entry
435  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
436  %arrayidx = getelementptr inbounds i32, i32* %A, i64 %indvars.iv
437  %tmp = load i32, i32* %arrayidx, align 4        ; A[i]
438  %tmp1 = or i64 %indvars.iv, 1                   ; i+1 (iv is always even, so 'or 1' == add 1)
439  %arrayidx2 = getelementptr inbounds i32, i32* %A, i64 %tmp1
440  %tmp2 = load i32, i32* %arrayidx2, align 4      ; A[i+1]
441  %mul = mul nsw i32 %tmp2, %tmp
442  %arrayidx4 = getelementptr inbounds i32, i32* %B, i64 %indvars.iv
443  store i32 %mul, i32* %arrayidx4, align 4        ; B[i] = A[i] * A[i+1]
444  %tmp3 = load i32, i32* %arrayidx, align 4       ; A[i] re-loaded after the store to B[i]
445  %tmp4 = load i32, i32* %arrayidx2, align 4      ; A[i+1] re-loaded likewise
446  %add10 = add nsw i32 %tmp4, %tmp3
447  %arrayidx13 = getelementptr inbounds i32, i32* %B, i64 %tmp1
448  store i32 %add10, i32* %arrayidx13, align 4     ; B[i+1] = A[i] + A[i+1]
449  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 2
450  %cmp = icmp ult i64 %indvars.iv.next, 1024
451  br i1 %cmp, label %for.body, label %for.cond.cleanup
452}
453
454; Check vectorization on interleaved access groups identified from mixed
455; loads/stores.
456; void mixed_load3_store3(int *A) {
457;   for (unsigned i = 0; i < 1024; i++)  {
458;     *A++ += i;
459;     *A++ += i;
460;     *A++ += i;
461;   }
462; }
463
464; CHECK-LABEL: @mixed_load3_store3(
465; CHECK: %wide.vec = load <12 x i32>, <12 x i32>* {{.*}}, align 4
466; CHECK: shufflevector <12 x i32> %wide.vec, <12 x i32> undef, <4 x i32> <i32 0, i32 3, i32 6, i32 9>
467; CHECK: shufflevector <12 x i32> %wide.vec, <12 x i32> undef, <4 x i32> <i32 1, i32 4, i32 7, i32 10>
468; CHECK: shufflevector <12 x i32> %wide.vec, <12 x i32> undef, <4 x i32> <i32 2, i32 5, i32 8, i32 11>
469; CHECK: %interleaved.vec = shufflevector <8 x i32> %{{.*}}, <12 x i32> <i32 0, i32 4, i32 8, i32 1, i32 5, i32 9, i32 2, i32 6, i32 10, i32 3, i32 7, i32 11>
470; CHECK: store <12 x i32> %interleaved.vec, <12 x i32>* %{{.*}}, align 4
471
; Read-modify-write of three consecutive elements per iteration through a
; pointer phi advancing by 3: the loads and stores form mixed factor-3 groups.
472define void @mixed_load3_store3(i32* nocapture %A) {
473entry:
474  br label %for.body
475
476for.cond.cleanup:                                 ; preds = %for.body
477  ret void
478
479for.body:                                         ; preds = %for.body, %entry
480  %i.013 = phi i32 [ 0, %entry ], [ %inc, %for.body ]
481  %A.addr.012 = phi i32* [ %A, %entry ], [ %incdec.ptr3, %for.body ]
482  %incdec.ptr = getelementptr inbounds i32, i32* %A.addr.012, i64 1
483  %tmp = load i32, i32* %A.addr.012, align 4
484  %add = add i32 %tmp, %i.013
485  store i32 %add, i32* %A.addr.012, align 4       ; ptr[0] += i
486  %incdec.ptr1 = getelementptr inbounds i32, i32* %A.addr.012, i64 2
487  %tmp1 = load i32, i32* %incdec.ptr, align 4
488  %add2 = add i32 %tmp1, %i.013
489  store i32 %add2, i32* %incdec.ptr, align 4      ; ptr[1] += i
490  %incdec.ptr3 = getelementptr inbounds i32, i32* %A.addr.012, i64 3
491  %tmp2 = load i32, i32* %incdec.ptr1, align 4
492  %add4 = add i32 %tmp2, %i.013
493  store i32 %add4, i32* %incdec.ptr1, align 4     ; ptr[2] += i
494  %inc = add nuw nsw i32 %i.013, 1
495  %exitcond = icmp eq i32 %inc, 1024
496  br i1 %exitcond, label %for.cond.cleanup, label %for.body
497}
498
499; Check vectorization on interleaved access groups with members having different
500; kinds of type.
501
502; struct IntFloat {
503;   int a;
504;   float b;
505; };
506;
507; int SA;
508; float SB;
509;
510; void int_float_struct(struct IntFloat *A) {
511;   int SumA;
512;   float SumB;
513;   for (unsigned i = 0; i < 1024; i++)  {
514;     SumA += A[i].a;
515;     SumB += A[i].b;
516;   }
517;   SA = SumA;
518;   SB = SumB;
519; }
520
521; CHECK-LABEL: @int_float_struct(
522; CHECK: %wide.vec = load <8 x i32>, <8 x i32>* %{{.*}}, align 4
523; CHECK: %[[V0:.*]] = shufflevector <8 x i32> %wide.vec, <8 x i32> undef, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
524; CHECK: %[[V1:.*]] = shufflevector <8 x i32> %wide.vec, <8 x i32> undef, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
525; CHECK: bitcast <4 x i32> %[[V1]] to <4 x float>
526; CHECK: add nsw <4 x i32>
527; CHECK: fadd fast <4 x float>
528
529%struct.IntFloat = type { i32, float }
530
531@SA = common global i32 0, align 4
532@SB = common global float 0.000000e+00, align 4
533
; Factor-2 load group whose members have different types (i32 and float);
; both fields feed reductions whose results are stored to globals on exit.
534define void @int_float_struct(%struct.IntFloat* nocapture readonly %A) #0 {
535entry:
536  br label %for.body
537
538for.cond.cleanup:                                 ; preds = %for.body
539  store i32 %add, i32* @SA, align 4               ; SA = SumA
540  store float %add3, float* @SB, align 4          ; SB = SumB
541  ret void
542
543for.body:                                         ; preds = %for.body, %entry
544  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
545  %SumB.014 = phi float [ undef, %entry ], [ %add3, %for.body ]   ; SumB starts uninitialized, as in the C source
546  %SumA.013 = phi i32 [ undef, %entry ], [ %add, %for.body ]      ; SumA likewise
547  %a = getelementptr inbounds %struct.IntFloat, %struct.IntFloat* %A, i64 %indvars.iv, i32 0
548  %tmp = load i32, i32* %a, align 4
549  %add = add nsw i32 %tmp, %SumA.013              ; SumA += A[i].a
550  %b = getelementptr inbounds %struct.IntFloat, %struct.IntFloat* %A, i64 %indvars.iv, i32 1
551  %tmp1 = load float, float* %b, align 4
552  %add3 = fadd fast float %SumB.014, %tmp1        ; SumB += A[i].b (fast-math reduction)
553  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
554  %exitcond = icmp eq i64 %indvars.iv.next, 1024
555  br i1 %exitcond, label %for.cond.cleanup, label %for.body
556}
557
558; Check vectorization of interleaved access groups in the presence of
559; dependences (PR27626). The following tests check that we don't reorder
560; dependent loads and stores when generating code for interleaved access
561; groups. Stores should be scalarized because the required code motion would
562; break dependences, and the remaining interleaved load groups should have
563; gaps.
564
565; PR27626_0: Ensure a strided store is not moved after a dependent (zero
566;            distance) strided load.
567
568; void PR27626_0(struct pair *p, int z, int n) {
569;   for (int i = 0; i < n; i++) {
570;     p[i].x = z;
571;     p[i].y = p[i].x;
572;   }
573; }
574
575; CHECK-LABEL: @PR27626_0(
576; CHECK: min.iters.checked:
577; CHECK:   %n.mod.vf = and i64 %[[N:.+]], 3
578; CHECK:   %[[IsZero:[a-zA-Z0-9]+]] = icmp eq i64 %n.mod.vf, 0
579; CHECK:   %[[R:[a-zA-Z0-9]+]] = select i1 %[[IsZero]], i64 4, i64 %n.mod.vf
580; CHECK:   %n.vec = sub i64 %[[N]], %[[R]]
581; CHECK: vector.body:
582; CHECK:   %[[L1:.+]] = load <8 x i32>, <8 x i32>* {{.*}}
583; CHECK:   %[[X1:.+]] = extractelement <8 x i32> %[[L1]], i32 0
584; CHECK:   store i32 %[[X1]], {{.*}}
585; CHECK:   %[[X2:.+]] = extractelement <8 x i32> %[[L1]], i32 2
586; CHECK:   store i32 %[[X2]], {{.*}}
587; CHECK:   %[[X3:.+]] = extractelement <8 x i32> %[[L1]], i32 4
588; CHECK:   store i32 %[[X3]], {{.*}}
589; CHECK:   %[[X4:.+]] = extractelement <8 x i32> %[[L1]], i32 6
590; CHECK:   store i32 %[[X4]], {{.*}}
591
592%pair.i32 = type { i32, i32 }
; PR27626 case 0: the store to p[i].x must not sink past the zero-distance
; dependent load of p[i].x, so the stores are scalarized and the load group
; keeps a gap (see the extract/store CHECK pattern above).
593define void @PR27626_0(%pair.i32 *%p, i32 %z, i64 %n) {
594entry:
595  br label %for.body
596
597for.body:
598  %i = phi i64 [ %i.next, %for.body ], [ 0, %entry ]
599  %p_i.x = getelementptr inbounds %pair.i32, %pair.i32* %p, i64 %i, i32 0
600  %p_i.y = getelementptr inbounds %pair.i32, %pair.i32* %p, i64 %i, i32 1
601  store i32 %z, i32* %p_i.x, align 4              ; p[i].x = z
602  %0 = load i32, i32* %p_i.x, align 4             ; must observe the store just above
603  store i32 %0, i32 *%p_i.y, align 4              ; p[i].y = p[i].x
604  %i.next = add nuw nsw i64 %i, 1
605  %cond = icmp slt i64 %i.next, %n
606  br i1 %cond, label %for.body, label %for.end
607
608for.end:
609  ret void
610}
611
612; PR27626_1: Ensure a strided load is not moved before a dependent (zero
613;            distance) strided store.
614
615; void PR27626_1(struct pair *p, int n) {
616;   int s = 0;
617;   for (int i = 0; i < n; i++) {
618;     p[i].y = p[i].x;
619;     s += p[i].y
620;   }
621; }
622
623; CHECK-LABEL: @PR27626_1(
624; CHECK: min.iters.checked:
625; CHECK:   %n.mod.vf = and i64 %[[N:.+]], 3
626; CHECK:   %[[IsZero:[a-zA-Z0-9]+]] = icmp eq i64 %n.mod.vf, 0
627; CHECK:   %[[R:[a-zA-Z0-9]+]] = select i1 %[[IsZero]], i64 4, i64 %n.mod.vf
628; CHECK:   %n.vec = sub i64 %[[N]], %[[R]]
629; CHECK: vector.body:
630; CHECK:   %[[Phi:.+]] = phi <4 x i32> [ zeroinitializer, %vector.ph ], [ {{.*}}, %vector.body ]
631; CHECK:   %[[L1:.+]] = load <8 x i32>, <8 x i32>* {{.*}}
632; CHECK:   %[[X1:.+]] = extractelement <8 x i32> %[[L1:.+]], i32 0
633; CHECK:   store i32 %[[X1:.+]], {{.*}}
634; CHECK:   %[[X2:.+]] = extractelement <8 x i32> %[[L1:.+]], i32 2
635; CHECK:   store i32 %[[X2:.+]], {{.*}}
636; CHECK:   %[[X3:.+]] = extractelement <8 x i32> %[[L1:.+]], i32 4
637; CHECK:   store i32 %[[X3:.+]], {{.*}}
638; CHECK:   %[[X4:.+]] = extractelement <8 x i32> %[[L1:.+]], i32 6
639; CHECK:   store i32 %[[X4:.+]], {{.*}}
640; CHECK:   %[[L2:.+]] = load <8 x i32>, <8 x i32>* {{.*}}
641; CHECK:   %[[S1:.+]] = shufflevector <8 x i32> %[[L2]], <8 x i32> undef, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
642; CHECK:   add nsw <4 x i32> %[[S1]], %[[Phi]]
643
; PR27626 case 1: the load of p[i].y must not hoist above the zero-distance
; dependent store to p[i].y; the result feeds an integer sum reduction.
644define i32 @PR27626_1(%pair.i32 *%p, i64 %n) {
645entry:
646  br label %for.body
647
648for.body:
649  %i = phi i64 [ %i.next, %for.body ], [ 0, %entry ]
650  %s = phi i32 [ %2, %for.body ], [ 0, %entry ]   ; running sum s
651  %p_i.x = getelementptr inbounds %pair.i32, %pair.i32* %p, i64 %i, i32 0
652  %p_i.y = getelementptr inbounds %pair.i32, %pair.i32* %p, i64 %i, i32 1
653  %0 = load i32, i32* %p_i.x, align 4
654  store i32 %0, i32* %p_i.y, align 4              ; p[i].y = p[i].x
655  %1 = load i32, i32* %p_i.y, align 4             ; must observe the store just above
656  %2 = add nsw i32 %1, %s                         ; s += p[i].y
657  %i.next = add nuw nsw i64 %i, 1
658  %cond = icmp slt i64 %i.next, %n
659  br i1 %cond, label %for.body, label %for.end
660
661for.end:
662  %3 = phi i32 [ %2, %for.body ]
663  ret i32 %3
664}
665
666; PR27626_2: Ensure a strided store is not moved after a dependent (negative
667;            distance) strided load.
668
669; void PR27626_2(struct pair *p, int z, int n) {
670;   for (int i = 0; i < n; i++) {
671;     p[i].x = z;
672;     p[i].y = p[i - 1].x;
673;   }
674; }
675
676; CHECK-LABEL: @PR27626_2(
677; CHECK: min.iters.checked:
678; CHECK:   %n.mod.vf = and i64 %[[N:.+]], 3
679; CHECK:   %[[IsZero:[a-zA-Z0-9]+]] = icmp eq i64 %n.mod.vf, 0
680; CHECK:   %[[R:[a-zA-Z0-9]+]] = select i1 %[[IsZero]], i64 4, i64 %n.mod.vf
681; CHECK:   %n.vec = sub i64 %[[N]], %[[R]]
682; CHECK: vector.body:
683; CHECK:   %[[L1:.+]] = load <8 x i32>, <8 x i32>* {{.*}}
684; CHECK:   %[[X1:.+]] = extractelement <8 x i32> %[[L1]], i32 0
685; CHECK:   store i32 %[[X1]], {{.*}}
686; CHECK:   %[[X2:.+]] = extractelement <8 x i32> %[[L1]], i32 2
687; CHECK:   store i32 %[[X2]], {{.*}}
688; CHECK:   %[[X3:.+]] = extractelement <8 x i32> %[[L1]], i32 4
689; CHECK:   store i32 %[[X3]], {{.*}}
690; CHECK:   %[[X4:.+]] = extractelement <8 x i32> %[[L1]], i32 6
691; CHECK:   store i32 %[[X4]], {{.*}}
692
; PR27626 case 2: the store to p[i].x must not sink past the dependent load
; of p[i-1].x (negative-distance dependence), so stores stay scalarized.
693define void @PR27626_2(%pair.i32 *%p, i64 %n, i32 %z) {
694entry:
695  br label %for.body
696
697for.body:
698  %i = phi i64 [ %i.next, %for.body ], [ 0, %entry ]
699  %i_minus_1 = add nuw nsw i64 %i, -1
700  %p_i.x = getelementptr inbounds %pair.i32, %pair.i32* %p, i64 %i, i32 0
701  %p_i_minus_1.x = getelementptr inbounds %pair.i32, %pair.i32* %p, i64 %i_minus_1, i32 0
702  %p_i.y = getelementptr inbounds %pair.i32, %pair.i32* %p, i64 %i, i32 1
703  store i32 %z, i32* %p_i.x, align 4              ; p[i].x = z
704  %0 = load i32, i32* %p_i_minus_1.x, align 4     ; p[i-1].x, written by the previous iteration
705  store i32 %0, i32 *%p_i.y, align 4              ; p[i].y = p[i-1].x
706  %i.next = add nuw nsw i64 %i, 1
707  %cond = icmp slt i64 %i.next, %n
708  br i1 %cond, label %for.body, label %for.end
709
710for.end:
711  ret void
712}
713
714; PR27626_3: Ensure a strided load is not moved before a dependent (negative
715;            distance) strided store.
716
717; void PR27626_3(struct pair *p, int z, int n) {
718;   for (int i = 0; i < n; i++) {
719;     p[i + 1].y = p[i].x;
720;     s += p[i].y;
721;   }
722; }
723
724; CHECK-LABEL: @PR27626_3(
725; CHECK: min.iters.checked:
726; CHECK:   %n.mod.vf = and i64 %[[N:.+]], 3
727; CHECK:   %[[IsZero:[a-zA-Z0-9]+]] = icmp eq i64 %n.mod.vf, 0
728; CHECK:   %[[R:[a-zA-Z0-9]+]] = select i1 %[[IsZero]], i64 4, i64 %n.mod.vf
729; CHECK:   %n.vec = sub i64 %[[N]], %[[R]]
730; CHECK: vector.body:
731; CHECK:   %[[Phi:.+]] = phi <4 x i32> [ zeroinitializer, %vector.ph ], [ {{.*}}, %vector.body ]
732; CHECK:   %[[L1:.+]] = load <8 x i32>, <8 x i32>* {{.*}}
733; CHECK:   %[[X1:.+]] = extractelement <8 x i32> %[[L1:.+]], i32 0
734; CHECK:   store i32 %[[X1:.+]], {{.*}}
735; CHECK:   %[[X2:.+]] = extractelement <8 x i32> %[[L1:.+]], i32 2
736; CHECK:   store i32 %[[X2:.+]], {{.*}}
737; CHECK:   %[[X3:.+]] = extractelement <8 x i32> %[[L1:.+]], i32 4
738; CHECK:   store i32 %[[X3:.+]], {{.*}}
739; CHECK:   %[[X4:.+]] = extractelement <8 x i32> %[[L1:.+]], i32 6
740; CHECK:   store i32 %[[X4:.+]], {{.*}}
741; CHECK:   %[[L2:.+]] = load <8 x i32>, <8 x i32>* {{.*}}
742; CHECK:   %[[S1:.+]] = shufflevector <8 x i32> %[[L2]], <8 x i32> undef, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
743; CHECK:   add nsw <4 x i32> %[[S1]], %[[Phi]]
744
; Each iteration stores p[i].x into p[i+1].y (a negative-distance dependence
; between the two strided access groups) and then reads p[i].y — which the
; previous iteration may have just written — accumulating it into the sum
; that is returned after the loop. The load of p[i].y must therefore not be
; hoisted above the store to p[i+1].y when the loop is vectorized.
define i32 @PR27626_3(%pair.i32 *%p, i64 %n, i32 %z) {
entry:
  br label %for.body

for.body:
  %i = phi i64 [ %i.next, %for.body ], [ 0, %entry ]  ; induction variable, step 1
  %s = phi i32 [ %2, %for.body ], [ 0, %entry ]       ; running sum of p[i].y
  %i_plus_1 = add nuw nsw i64 %i, 1
  %p_i.x = getelementptr inbounds %pair.i32, %pair.i32* %p, i64 %i, i32 0                  ; &p[i].x
  %p_i.y = getelementptr inbounds %pair.i32, %pair.i32* %p, i64 %i, i32 1                  ; &p[i].y
  %p_i_plus_1.y = getelementptr inbounds %pair.i32, %pair.i32* %p, i64 %i_plus_1, i32 1    ; &p[i+1].y
  %0 = load i32, i32* %p_i.x, align 4
  store i32 %0, i32* %p_i_plus_1.y, align 4           ; p[i+1].y = p[i].x
  %1 = load i32, i32* %p_i.y, align 4                 ; reads the value stored on the previous iteration
  %2 = add nsw i32 %1, %s                             ; s += p[i].y
  %i.next = add nuw nsw i64 %i, 1
  %cond = icmp slt i64 %i.next, %n
  br i1 %cond, label %for.body, label %for.end

for.end:
  %3 = phi i32 [ %2, %for.body ]                      ; final value of the sum
  ret i32 %3
}
768
769; PR27626_4: Ensure we form an interleaved group for strided stores in the
770;            presence of a write-after-write dependence. We create a group for
771;            (2) and (3) while excluding (1).
772
773; void PR27626_4(int *a, int x, int y, int z, int n) {
774;   for (int i = 0; i < n; i += 2) {
775;     a[i] = x;      // (1)
776;     a[i] = y;      // (2)
777;     a[i + 1] = z;  // (3)
778;   }
779; }
780
781; CHECK-LABEL: @PR27626_4(
782; CHECK: vector.ph:
783; CHECK:   %[[INS_Y:.+]] = insertelement <4 x i32> undef, i32 %y, i32 0
784; CHECK:   %[[SPLAT_Y:.+]] = shufflevector <4 x i32> %[[INS_Y]], <4 x i32> undef, <4 x i32> zeroinitializer
785; CHECK:   %[[INS_Z:.+]] = insertelement <4 x i32> undef, i32 %z, i32 0
786; CHECK:   %[[SPLAT_Z:.+]] = shufflevector <4 x i32> %[[INS_Z]], <4 x i32> undef, <4 x i32> zeroinitializer
787; CHECK: vector.body:
788; CHECK:   store i32 %x, {{.*}}
789; CHECK:   store i32 %x, {{.*}}
790; CHECK:   store i32 %x, {{.*}}
791; CHECK:   store i32 %x, {{.*}}
792; CHECK:   %[[VEC:.+]] = shufflevector <4 x i32> %[[SPLAT_Y]], <4 x i32> %[[SPLAT_Z]], <8 x i32> <i32 0, i32 4, i32 1, i32 5, i32 2, i32 6, i32 3, i32 7>
793; CHECK:   store <8 x i32> %[[VEC]], {{.*}}
794
; Stride-2 store loop with a write-after-write dependence: store (1) is to the
; same address as store (2) and is immediately overwritten by it. The
; vectorizer should build an interleave group from (2) and (3) — the last
; writes to a[i] and a[i+1] — while keeping (1) out of the group.
define void @PR27626_4(i32 *%a, i32 %x, i32 %y, i32 %z, i64 %n) {
entry:
  br label %for.body

for.body:
  %i = phi i64 [ %i.next, %for.body ], [ 0, %entry ]  ; induction variable, step 2
  %i_plus_1 = add i64 %i, 1
  %a_i = getelementptr inbounds i32, i32* %a, i64 %i              ; &a[i]
  %a_i_plus_1 = getelementptr inbounds i32, i32* %a, i64 %i_plus_1 ; &a[i+1]
  store i32 %x, i32* %a_i, align 4                    ; (1) overwritten by (2) below
  store i32 %y, i32* %a_i, align 4                    ; (2) a[i] = y
  store i32 %z, i32* %a_i_plus_1, align 4             ; (3) a[i+1] = z
  %i.next = add nuw nsw i64 %i, 2
  %cond = icmp slt i64 %i.next, %n
  br i1 %cond, label %for.body, label %for.end

for.end:
  ret void
}
814
815; PR27626_5: Ensure we do not form an interleaved group for strided stores in
816;            the presence of a write-after-write dependence.
817
818; void PR27626_5(int *a, int x, int y, int z, int n) {
819;   for (int i = 3; i < n; i += 2) {
820;     a[i - 1] = x;
821;     a[i - 3] = y;
822;     a[i] = z;
823;   }
824; }
825
826; CHECK-LABEL: @PR27626_5(
827; CHECK: vector.body:
828; CHECK:   store i32 %x, {{.*}}
829; CHECK:   store i32 %x, {{.*}}
830; CHECK:   store i32 %x, {{.*}}
831; CHECK:   store i32 %x, {{.*}}
832; CHECK:   store i32 %y, {{.*}}
833; CHECK:   store i32 %y, {{.*}}
834; CHECK:   store i32 %y, {{.*}}
835; CHECK:   store i32 %y, {{.*}}
836; CHECK:   store i32 %z, {{.*}}
837; CHECK:   store i32 %z, {{.*}}
838; CHECK:   store i32 %z, {{.*}}
839; CHECK:   store i32 %z, {{.*}}
840
; Stride-2 store loop where the accesses at i-1 and i-3 overlap across
; iterations (a[i-1] of one iteration aliases a[(i+2)-3] of the next): a
; write-after-write dependence that must prevent the vectorizer from forming
; an interleave group — the CHECK lines above verify the stores stay scalar.
define void @PR27626_5(i32 *%a, i32 %x, i32 %y, i32 %z, i64 %n) {
entry:
  br label %for.body

for.body:
  %i = phi i64 [ %i.next, %for.body ], [ 3, %entry ]  ; induction variable, starts at 3, step 2
  %i_minus_1 = sub i64 %i, 1
  %i_minus_3 = sub i64 %i_minus_1, 2                  ; i - 3, computed via i - 1 - 2
  %a_i = getelementptr inbounds i32, i32* %a, i64 %i              ; &a[i]
  %a_i_minus_1 = getelementptr inbounds i32, i32* %a, i64 %i_minus_1 ; &a[i-1]
  %a_i_minus_3 = getelementptr inbounds i32, i32* %a, i64 %i_minus_3 ; &a[i-3]
  store i32 %x, i32* %a_i_minus_1, align 4            ; a[i-1] = x
  store i32 %y, i32* %a_i_minus_3, align 4            ; a[i-3] = y (overwrites a prior iteration's a[i-1])
  store i32 %z, i32* %a_i, align 4                    ; a[i] = z
  %i.next = add nuw nsw i64 %i, 2
  %cond = icmp slt i64 %i.next, %n
  br i1 %cond, label %for.body, label %for.end

for.end:
  ret void
}
862
; Attribute group #0; not attached to any function visible in this chunk —
; presumably referenced by a function elsewhere in the file.
attributes #0 = { "unsafe-fp-math"="true" }
864