; RUN: opt < %s -loop-vectorize -force-vector-width=2 -force-vector-interleave=1 -S | FileCheck %s
; RUN: opt < %s -loop-vectorize -force-vector-width=4 -force-vector-interleave=1 -S | FileCheck %s -check-prefix=WIDTH

target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"

; Vectorization with dependence checks.
; The first invocation forces a vector width of 2 and is verified with the
; default prefix; the second forces a vector width of 4 and is verified with
; the alternate prefix selected on its command line. Interleaving is forced to
; 1 in both so only the vector width varies.

; No plausible dependence - can be vectorized.
; Each iteration reads one element ahead of the element it writes, so no
; iteration reads a value that an earlier iteration stored.
;  for (i = 0; i < 1024; ++i)
;    A[i] = A[i + 1] + 1;

; CHECK-LABEL: @f1_vec(
; CHECK: <2 x i32>

define void @f1_vec(i32* %A) {
entry:
  br label %for.body

for.body:
  ; i starts at 0 and advances by 1 per iteration.
  %indvars.iv = phi i32 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
  %indvars.iv.next = add i32 %indvars.iv, 1
  ; Load A[i + 1].
  %arrayidx = getelementptr inbounds i32, i32* %A, i32 %indvars.iv.next
  %0 = load i32, i32* %arrayidx, align 4
  %add1 = add nsw i32 %0, 1
  ; Store the incremented value to A[i].
  %arrayidx3 = getelementptr inbounds i32, i32* %A, i32 %indvars.iv
  store i32 %add1, i32* %arrayidx3, align 4
  ; Leave the loop once i + 1 reaches 1024.
  %exitcond = icmp ne i32 %indvars.iv.next, 1024
  br i1 %exitcond, label %for.body, label %for.end

for.end:
  ret void
}

; Plausible dependence of distance 1 - can't be vectorized.
; The value stored to A[i + 1] is the value loaded as A[i] by the very next
; iteration.
;  for (i = 0; i < 1024; ++i)
;    A[i+1] = A[i] + 1;

; CHECK-LABEL: @f2_novec(
; CHECK-NOT: <2 x i32>

define void @f2_novec(i32* %A) {
entry:
  br label %for.body

for.body:
  ; i starts at 0 and advances by 1 per iteration.
  %indvars.iv = phi i32 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
  ; Load A[i].
  %arrayidx = getelementptr inbounds i32, i32* %A, i32 %indvars.iv
  %0 = load i32, i32* %arrayidx, align 4
  %add = add nsw i32 %0, 1
  %indvars.iv.next = add i32 %indvars.iv, 1
  ; Store the incremented value to A[i + 1].
  %arrayidx3 = getelementptr inbounds i32, i32* %A, i32 %indvars.iv.next
  store i32 %add, i32* %arrayidx3, align 4
  ; Leave the loop once i + 1 reaches 1024.
  %exitcond = icmp ne i32 %indvars.iv.next, 1024
  br i1 %exitcond, label %for.body, label %for.end

for.end:
  ret void
}

; Plausible dependence of distance 2 - can be vectorized with a width of 2.
; The invocation that forces a vector width of 4 must leave the loop scalar.
;  for (i = 0; i < 1024; ++i)
;    A[i+2] = A[i] + 1;

; CHECK-LABEL: @f3_vec_len(
; CHECK: <2 x i32>

; WIDTH-LABEL: @f3_vec_len(
; WIDTH-NOT: <4 x i32>

define void @f3_vec_len(i32* %A) {
entry:
  br label %for.body

for.body:
  ; i starts at 0 and advances by 1 per iteration.
  %i.01 = phi i32 [ 0, %entry ], [ %inc, %for.body ]
  ; Load A[i]; the i32 index is sign-extended to i64 for addressing.
  %idxprom = sext i32 %i.01 to i64
  %arrayidx = getelementptr inbounds i32, i32* %A, i64 %idxprom
  %0 = load i32, i32* %arrayidx, align 4
  %add = add nsw i32 %0, 1
  ; Store the incremented value to A[i + 2].
  %add1 = add nsw i32 %i.01, 2
  %idxprom2 = sext i32 %add1 to i64
  %arrayidx3 = getelementptr inbounds i32, i32* %A, i64 %idxprom2
  store i32 %add, i32* %arrayidx3, align 4
  ; Continue while i + 1 < 1024 (signed compare).
  %inc = add nsw i32 %i.01, 1
  %cmp = icmp slt i32 %inc, 1024
  br i1 %cmp, label %for.body, label %for.end

for.end:
  ret void
}

; Plausible dependence of distance 1 - cannot be vectorized (without reordering
; accesses). The store to B[i] comes before the load of B[i + 1] in the loop
; body, and the next iteration stores to that same B[i + 1] element.
;   for (i = 0; i < 1024; ++i) {
;     B[i] = A[i];
;     A[i] = B[i + 1];
;   }

; CHECK-LABEL: @f5(
; CHECK-NOT: <2 x i32>

define void @f5(i32*  %A, i32* %B) {
entry:
  br label %for.body

for.body:
  ; i starts at 0 and advances by 1 per iteration.
  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
  ; B[i] = A[i]
  %arrayidx = getelementptr inbounds i32, i32* %A, i64 %indvars.iv
  %0 = load i32, i32* %arrayidx, align 4
  %arrayidx2 = getelementptr inbounds i32, i32* %B, i64 %indvars.iv
  store i32 %0, i32* %arrayidx2, align 4
  %indvars.iv.next = add nsw i64 %indvars.iv, 1
  ; A[i] = B[i + 1]; the address of A[i] computed above is reused.
  %arrayidx4 = getelementptr inbounds i32, i32* %B, i64 %indvars.iv.next
  %1 = load i32, i32* %arrayidx4, align 4
  store i32 %1, i32* %arrayidx, align 4
  ; The exit test compares the i64 counter truncated to i32 against 1024.
  %lftr.wideiv = trunc i64 %indvars.iv.next to i32
  %exitcond = icmp ne i32 %lftr.wideiv, 1024
  br i1 %exitcond, label %for.body, label %for.end

for.end:
  ret void
}

; Dependence through a phi node - must not vectorize.
; The value stored in one iteration is the value loaded from a[i] in the
; previous iteration, carried around the loop by a phi.
;   for (i = 0; i < 1024; ++i) {
;     a[i+1] = tmp;
;     tmp = a[i];
;   }

; CHECK-LABEL: @f6(
; CHECK-NOT: <2 x i32>

define i32 @f6(i32* %a, i32 %tmp) {
entry:
  br label %for.body

for.body:
  ; i starts at 0 and advances by 1 per iteration.
  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
  ; Carried value: the %tmp argument on entry, afterwards the previous
  ; iteration's load of a[i].
  %tmp.addr.08 = phi i32 [ %tmp, %entry ], [ %0, %for.body ]
  %indvars.iv.next = add nsw i64 %indvars.iv, 1
  ; a[i + 1] = carried value
  %arrayidx = getelementptr inbounds i32, i32* %a, i64 %indvars.iv.next
  store i32 %tmp.addr.08, i32* %arrayidx, align 4
  ; Reload a[i] to feed the phi on the next iteration.
  %arrayidx3 = getelementptr inbounds i32, i32* %a, i64 %indvars.iv
  %0 = load i32, i32* %arrayidx3, align 4
  ; The exit test compares the i64 counter truncated to i32 against 1024.
  %lftr.wideiv = trunc i64 %indvars.iv.next to i32
  %exitcond = icmp ne i32 %lftr.wideiv, 1024
  br i1 %exitcond, label %for.body, label %for.end

for.end:
  ; The result is deliberately left undefined.
  ret i32 undef
}

; Don't vectorize true loop carried dependencies that are not a multiple of the
; vector width.
; Example:
;   for (int i = 16; i < 128; ++i) {
;     a[i] = a[i-3] + a[i+4];
;   }
; It is a bad idea to vectorize this loop because store-load forwarding will not
; happen.
;

; CHECK-LABEL: @nostoreloadforward(
; CHECK-NOT: <2 x i32>

define void @nostoreloadforward(i32* %A) {
entry:
  br label %for.body

for.body:
  ; i starts at 16 and advances by 1 per iteration.
  %indvars.iv = phi i64 [ 16, %entry ], [ %indvars.iv.next, %for.body ]
  ; Load A[i - 3]: the element stored three iterations earlier.
  %0 = add nsw i64 %indvars.iv, -3
  %arrayidx = getelementptr inbounds i32, i32* %A, i64 %0
  %1 = load i32, i32* %arrayidx, align 4
  ; Load A[i + 4]: an element that has not been stored to yet.
  %2 = add nsw i64 %indvars.iv, 4
  %arrayidx2 = getelementptr inbounds i32, i32* %A, i64 %2
  %3 = load i32, i32* %arrayidx2, align 4
  %add3 = add nsw i32 %3, %1
  ; A[i] = A[i - 3] + A[i + 4]
  %arrayidx5 = getelementptr inbounds i32, i32* %A, i64 %indvars.iv
  store i32 %add3, i32* %arrayidx5, align 4
  %indvars.iv.next = add i64 %indvars.iv, 1
  ; The exit test compares the i64 counter truncated to i32 against 128.
  %lftr.wideiv = trunc i64 %indvars.iv.next to i32
  %exitcond = icmp ne i32 %lftr.wideiv, 128
  br i1 %exitcond, label %for.body, label %for.end

for.end:
  ret void
}

; Example:
;   for (int i = 16; i < 128; ++i) {
;     a[i] = b[i];
;     c[i] = a[i-3];
;   }
; It is a bad idea to vectorize this loop because store-load forwarding will not
; happen.
; The three pointers are marked noalias, so the only dependence left is
; between the store to a[i] and the later load of a[i-3].
;

; CHECK-LABEL: @nostoreloadforward2(
; CHECK-NOT: <2 x i32>

define void @nostoreloadforward2(i32* noalias %A, i32* noalias %B, i32* noalias %C) {
entry:
  br label %for.body

for.body:
  ; i starts at 16 and advances by 1 per iteration.
  %indvars.iv = phi i64 [ 16, %entry ], [ %indvars.iv.next, %for.body ]
  ; A[i] = B[i]
  %arrayidx = getelementptr inbounds i32, i32* %B, i64 %indvars.iv
  %0 = load i32, i32* %arrayidx, align 4
  %arrayidx2 = getelementptr inbounds i32, i32* %A, i64 %indvars.iv
  store i32 %0, i32* %arrayidx2, align 4
  ; C[i] = A[i - 3]: reads the element stored three iterations earlier.
  %1 = add nsw i64 %indvars.iv, -3
  %arrayidx4 = getelementptr inbounds i32, i32* %A, i64 %1
  %2 = load i32, i32* %arrayidx4, align 4
  %arrayidx6 = getelementptr inbounds i32, i32* %C, i64 %indvars.iv
  store i32 %2, i32* %arrayidx6, align 4
  %indvars.iv.next = add i64 %indvars.iv, 1
  ; The exit test compares the i64 counter truncated to i32 against 128.
  %lftr.wideiv = trunc i64 %indvars.iv.next to i32
  %exitcond = icmp ne i32 %lftr.wideiv, 128
  br i1 %exitcond, label %for.body, label %for.end

for.end:
  ret void
}
