; RUN: opt < %s -loop-vectorize -enable-vplan-native-path -debug-only=loop-vectorize -S 2>&1 | FileCheck %s
; REQUIRES: asserts

; Verify that outer loops annotated only with the expected explicit
; vectorization hints are collected for vectorization instead of inner loops.

; Root C/C++ source code for all the test cases
; void foo(int *a, int *b, int N, int M)
; {
;   int i, j;
; #pragma clang loop vectorize(enable)
;   for (i = 0; i < N; i++) {
;     for (j = 0; j < M; j++) {
;       a[i*M+j] = b[i*M+j] * b[i*M+j];
;     }
;   }
; }

; Case 1: Annotated outer loop WITH vector width information must be collected.

; CHECK-LABEL: vector_width
; CHECK: LV: Loop hints: force=enabled width=4 unroll=0
; CHECK: LV: We can vectorize this outer loop!
; CHECK: LV: Using user VF 4 to build VPlans.
; CHECK-NOT: LV: Loop hints: force=?
; CHECK-NOT: LV: Found a loop: inner.body

; Module-wide data layout used by every test function in this file.
target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"

; Case 1 input: a doubly nested loop computing a[i*M+j] = b[i*M+j] * b[i*M+j].
; The outer-loop latch branch carries !llvm.loop !6, i.e.
; llvm.loop.vectorize.enable = true together with llvm.loop.vectorize.width = 4.
define void @vector_width(i32* nocapture %a, i32* nocapture readonly %b, i32 %N, i32 %M) local_unnamed_addr {
entry:
  ; Skip the whole loop nest when N <= 0.
  %cmp32 = icmp sgt i32 %N, 0
  br i1 %cmp32, label %outer.ph, label %for.end15

outer.ph:                                         ; preds = %entry
  ; Values invariant across the nest: the M > 0 guard for the inner loop,
  ; M sign-extended for the row offset, and both trip counts widened to i64.
  %cmp230 = icmp sgt i32 %M, 0
  %0 = sext i32 %M to i64
  %wide.trip.count = zext i32 %M to i64
  %wide.trip.count38 = zext i32 %N to i64
  br label %outer.body

outer.body:                                       ; preds = %outer.inc, %outer.ph
  ; %indvars.iv35 is the outer induction variable (i).
  %indvars.iv35 = phi i64 [ 0, %outer.ph ], [ %indvars.iv.next36, %outer.inc ]
  br i1 %cmp230, label %inner.ph, label %outer.inc

inner.ph:                                         ; preds = %outer.body
  ; Row offset i * M, computed once per outer iteration.
  %1 = mul nsw i64 %indvars.iv35, %0
  br label %inner.body

inner.body:                                       ; preds = %inner.body, %inner.ph
  ; %indvars.iv is the inner induction variable (j); %2 = i * M + j.
  %indvars.iv = phi i64 [ 0, %inner.ph ], [ %indvars.iv.next, %inner.body ]
  %2 = add nsw i64 %indvars.iv, %1
  %arrayidx = getelementptr inbounds i32, i32* %b, i64 %2
  %3 = load i32, i32* %arrayidx, align 4, !tbaa !2
  %mul8 = mul nsw i32 %3, %3
  %arrayidx12 = getelementptr inbounds i32, i32* %a, i64 %2
  store i32 %mul8, i32* %arrayidx12, align 4, !tbaa !2
  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
  %exitcond = icmp eq i64 %indvars.iv.next, %wide.trip.count
  br i1 %exitcond, label %outer.inc, label %inner.body

outer.inc:                                        ; preds = %inner.body, %outer.body
  ; Outer latch: the loop hints under test are attached to this back-edge.
  %indvars.iv.next36 = add nuw nsw i64 %indvars.iv35, 1
  %exitcond39 = icmp eq i64 %indvars.iv.next36, %wide.trip.count38
  br i1 %exitcond39, label %for.end15, label %outer.body, !llvm.loop !6

for.end15:                                        ; preds = %outer.inc, %entry
  ret void
}

; Case 2: Annotated outer loop WITHOUT vector width information must be collected.

; CHECK-LABEL: case2
; CHECK: LV: Loop hints: force=enabled width=0 unroll=0
; CHECK: LV: We can vectorize this outer loop!
; CHECK: LV: Using VF 1 to build VPlans.

; Case 2 input: a doubly nested loop computing a[i*M+j] = b[i*M+j] * b[i*M+j].
; The outer-loop latch branch carries !llvm.loop !9, which holds only
; llvm.loop.vectorize.enable = true (no vector width).
define void @case2(i32* nocapture %a, i32* nocapture readonly %b, i32 %N, i32 %M) local_unnamed_addr {
entry:
  ; Skip the whole loop nest when N <= 0.
  %cmp32 = icmp sgt i32 %N, 0
  br i1 %cmp32, label %outer.ph, label %for.end15

outer.ph:                                         ; preds = %entry
  ; Values invariant across the nest: the M > 0 guard for the inner loop,
  ; M sign-extended for the row offset, and both trip counts widened to i64.
  %cmp230 = icmp sgt i32 %M, 0
  %0 = sext i32 %M to i64
  %wide.trip.count = zext i32 %M to i64
  %wide.trip.count38 = zext i32 %N to i64
  br label %outer.body

outer.body:                                       ; preds = %outer.inc, %outer.ph
  ; %indvars.iv35 is the outer induction variable (i).
  %indvars.iv35 = phi i64 [ 0, %outer.ph ], [ %indvars.iv.next36, %outer.inc ]
  br i1 %cmp230, label %inner.ph, label %outer.inc

inner.ph:                                         ; preds = %outer.body
  ; Row offset i * M, computed once per outer iteration.
  %1 = mul nsw i64 %indvars.iv35, %0
  br label %inner.body

inner.body:                                       ; preds = %inner.body, %inner.ph
  ; %indvars.iv is the inner induction variable (j); %2 = i * M + j.
  %indvars.iv = phi i64 [ 0, %inner.ph ], [ %indvars.iv.next, %inner.body ]
  %2 = add nsw i64 %indvars.iv, %1
  %arrayidx = getelementptr inbounds i32, i32* %b, i64 %2
  %3 = load i32, i32* %arrayidx, align 4, !tbaa !2
  %mul8 = mul nsw i32 %3, %3
  %arrayidx12 = getelementptr inbounds i32, i32* %a, i64 %2
  store i32 %mul8, i32* %arrayidx12, align 4, !tbaa !2
  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
  %exitcond = icmp eq i64 %indvars.iv.next, %wide.trip.count
  br i1 %exitcond, label %outer.inc, label %inner.body

outer.inc:                                        ; preds = %inner.body, %outer.body
  ; Outer latch: the loop hint under test is attached to this back-edge.
  %indvars.iv.next36 = add nuw nsw i64 %indvars.iv35, 1
  %exitcond39 = icmp eq i64 %indvars.iv.next36, %wide.trip.count38
  br i1 %exitcond39, label %for.end15, label %outer.body, !llvm.loop !9

for.end15:                                        ; preds = %outer.inc, %entry
  ret void
}

; Case 3: Annotated outer loop WITH vector width and interleave information
; must NOT be collected; the inner loop is processed instead.

; CHECK-LABEL: case3
; CHECK-NOT: LV: Loop hints: force=enabled
; CHECK-NOT: LV: We can vectorize this outer loop!
; CHECK: LV: Loop hints: force=?
; CHECK: LV: Found a loop: inner.body

; Case 3 input: a doubly nested loop computing a[i*M+j] = b[i*M+j] * b[i*M+j].
; The outer-loop latch branch carries !llvm.loop !11, which combines
; llvm.loop.vectorize.width = 4, llvm.loop.interleave.count = 2 and
; llvm.loop.vectorize.enable = true.
define void @case3(i32* nocapture %a, i32* nocapture readonly %b, i32 %N, i32 %M) local_unnamed_addr {
entry:
  ; Skip the whole loop nest when N <= 0.
  %cmp32 = icmp sgt i32 %N, 0
  br i1 %cmp32, label %outer.ph, label %for.end15

outer.ph:                                         ; preds = %entry
  ; Values invariant across the nest: the M > 0 guard for the inner loop,
  ; M sign-extended for the row offset, and both trip counts widened to i64.
  %cmp230 = icmp sgt i32 %M, 0
  %0 = sext i32 %M to i64
  %wide.trip.count = zext i32 %M to i64
  %wide.trip.count38 = zext i32 %N to i64
  br label %outer.body

outer.body:                                       ; preds = %outer.inc, %outer.ph
  ; %indvars.iv35 is the outer induction variable (i).
  %indvars.iv35 = phi i64 [ 0, %outer.ph ], [ %indvars.iv.next36, %outer.inc ]
  br i1 %cmp230, label %inner.ph, label %outer.inc

inner.ph:                                         ; preds = %outer.body
  ; Row offset i * M, computed once per outer iteration.
  %1 = mul nsw i64 %indvars.iv35, %0
  br label %inner.body

inner.body:                                       ; preds = %inner.body, %inner.ph
  ; %indvars.iv is the inner induction variable (j); %2 = i * M + j.
  %indvars.iv = phi i64 [ 0, %inner.ph ], [ %indvars.iv.next, %inner.body ]
  %2 = add nsw i64 %indvars.iv, %1
  %arrayidx = getelementptr inbounds i32, i32* %b, i64 %2
  %3 = load i32, i32* %arrayidx, align 4, !tbaa !2
  %mul8 = mul nsw i32 %3, %3
  %arrayidx12 = getelementptr inbounds i32, i32* %a, i64 %2
  store i32 %mul8, i32* %arrayidx12, align 4, !tbaa !2
  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
  %exitcond = icmp eq i64 %indvars.iv.next, %wide.trip.count
  br i1 %exitcond, label %outer.inc, label %inner.body

outer.inc:                                        ; preds = %inner.body, %outer.body
  ; Outer latch: the loop hints under test are attached to this back-edge.
  %indvars.iv.next36 = add nuw nsw i64 %indvars.iv35, 1
  %exitcond39 = icmp eq i64 %indvars.iv.next36, %wide.trip.count38
  br i1 %exitcond39, label %for.end15, label %outer.body, !llvm.loop !11

for.end15:                                        ; preds = %outer.inc, %entry
  ret void
}

; Case 4: Outer loop without any explicit vectorization annotation must NOT be
; collected; the inner loop is processed instead.

; CHECK-LABEL: case4
; CHECK-NOT: LV: Loop hints: force=enabled
; CHECK-NOT: LV: We can vectorize this outer loop!
; CHECK: LV: Loop hints: force=?
; CHECK: LV: Found a loop: inner.body

; Case 4 input: a doubly nested loop computing a[i*M+j] = b[i*M+j] * b[i*M+j].
; Unlike the other cases, the outer-loop latch branch has no !llvm.loop
; attachment, so the loop carries no explicit vectorization hint.
define void @case4(i32* nocapture %a, i32* nocapture readonly %b, i32 %N, i32 %M) local_unnamed_addr {
entry:
  ; Skip the whole loop nest when N <= 0.
  %cmp32 = icmp sgt i32 %N, 0
  br i1 %cmp32, label %outer.ph, label %for.end15

outer.ph:                                         ; preds = %entry
  ; Values invariant across the nest: the M > 0 guard for the inner loop,
  ; M sign-extended for the row offset, and both trip counts widened to i64.
  %cmp230 = icmp sgt i32 %M, 0
  %0 = sext i32 %M to i64
  %wide.trip.count = zext i32 %M to i64
  %wide.trip.count38 = zext i32 %N to i64
  br label %outer.body

outer.body:                                       ; preds = %outer.inc, %outer.ph
  ; %indvars.iv35 is the outer induction variable (i).
  %indvars.iv35 = phi i64 [ 0, %outer.ph ], [ %indvars.iv.next36, %outer.inc ]
  br i1 %cmp230, label %inner.ph, label %outer.inc

inner.ph:                                         ; preds = %outer.body
  ; Row offset i * M, computed once per outer iteration.
  %1 = mul nsw i64 %indvars.iv35, %0
  br label %inner.body

inner.body:                                       ; preds = %inner.body, %inner.ph
  ; %indvars.iv is the inner induction variable (j); %2 = i * M + j.
  %indvars.iv = phi i64 [ 0, %inner.ph ], [ %indvars.iv.next, %inner.body ]
  %2 = add nsw i64 %indvars.iv, %1
  %arrayidx = getelementptr inbounds i32, i32* %b, i64 %2
  %3 = load i32, i32* %arrayidx, align 4, !tbaa !2
  %mul8 = mul nsw i32 %3, %3
  %arrayidx12 = getelementptr inbounds i32, i32* %a, i64 %2
  store i32 %mul8, i32* %arrayidx12, align 4, !tbaa !2
  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
  %exitcond = icmp eq i64 %indvars.iv.next, %wide.trip.count
  br i1 %exitcond, label %outer.inc, label %inner.body

outer.inc:                                        ; preds = %inner.body, %outer.body
  ; Outer latch: deliberately left without loop metadata.
  %indvars.iv.next36 = add nuw nsw i64 %indvars.iv35, 1
  %exitcond39 = icmp eq i64 %indvars.iv.next36, %wide.trip.count38
  br i1 %exitcond39, label %for.end15, label %outer.body

for.end15:                                        ; preds = %outer.inc, %entry
  ret void
}

; Module-level named metadata.
!llvm.module.flags = !{!0}
!llvm.ident = !{!1}

!0 = !{i32 1, !"wchar_size", i32 4}
!1 = !{!"clang version 6.0.0"}
; TBAA nodes: !2 is the access tag attached to every load/store of `int`
; in the test functions; !3-!5 are the type descriptors it refers to.
!2 = !{!3, !3, i64 0}
!3 = !{!"int", !4, i64 0}
!4 = !{!"omnipotent char", !5, i64 0}
!5 = !{!"Simple C/C++ TBAA"}
; Case 1: vectorize.width = 4 and vectorize.enable = true.
!6 = distinct !{!6, !7, !8}
!7 = !{!"llvm.loop.vectorize.width", i32 4}
!8 = !{!"llvm.loop.vectorize.enable", i1 true}
; Case 2: vectorize.enable = true only.
!9 = distinct !{!9, !8}
; Case 3: vectorize.width = 4, interleave.count = 2 and vectorize.enable = true.
!10 = !{!"llvm.loop.interleave.count", i32 2}
!11 = distinct !{!11, !7, !10, !8}
