1; RUN: opt < %s -basicaa -loop-vectorize -force-vector-interleave=1 -force-vector-width=4 -loop-vectorize-with-block-frequency -dce -instcombine -S | FileCheck %s
2
; Target description used for every function below: little-endian ('e'),
; 64-bit pointers (p:64:64:64), native integer widths of 8/16/32/64 bits
; (n8:16:32:64) and 128-bit stack alignment (S128). The triple selects
; x86_64 macOS 10.8.
3target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
4target triple = "x86_64-apple-macosx10.8.0"
5
; Zero-initialized global arrays with common linkage, each aligned to 16 bytes.
; Within the functions visible in this section only @a, @b and @c are
; referenced (by @example1 and @example2); the remaining arrays are not used
; by any function shown here.
6@b = common global [2048 x i32] zeroinitializer, align 16
7@c = common global [2048 x i32] zeroinitializer, align 16
8@a = common global [2048 x i32] zeroinitializer, align 16
9@G = common global [32 x [1024 x i32]] zeroinitializer, align 16
10@ub = common global [1024 x i32] zeroinitializer, align 16
11@uc = common global [1024 x i32] zeroinitializer, align 16
12@d = common global [2048 x i32] zeroinitializer, align 16
13@fa = common global [1024 x float] zeroinitializer, align 16
14@fb = common global [1024 x float] zeroinitializer, align 16
15@ic = common global [1024 x i32] zeroinitializer, align 16
16@da = common global [1024 x float] zeroinitializer, align 16
17@db = common global [1024 x float] zeroinitializer, align 16
18@dc = common global [1024 x float] zeroinitializer, align 16
19@dd = common global [1024 x float] zeroinitializer, align 16
20@dj = common global [1024 x i32] zeroinitializer, align 16
21
22; We can vectorize this loop without a scalar tail: the constant trip count (256) is a multiple of the forced vector width (4).
23;CHECK-LABEL: @example1(
24;CHECK: load <4 x i32>
25;CHECK: add nsw <4 x i32>
26;CHECK: store <4 x i32>
27;CHECK: ret void
; @example1 -- computes a[i] = b[i] + c[i] over the global i32 arrays @a, @b
; and @c. The index starts at 0 and steps by 1; the loop exits once the
; incremented index, truncated to i32, equals 256. The trip count is therefore
; the constant 256, a multiple of the vector width of 4 forced on the opt
; command line. The function is marked optsize.
28define void @example1() optsize {
29  br label %1
30
31; <label>:1                                       ; preds = %1, %0
32  %indvars.iv = phi i64 [ 0, %0 ], [ %indvars.iv.next, %1 ]  ; i: 0 on entry, i+1 on the back edge
33  %2 = getelementptr inbounds [2048 x i32], [2048 x i32]* @b, i64 0, i64 %indvars.iv  ; &b[i]
34  %3 = load i32, i32* %2, align 4
35  %4 = getelementptr inbounds [2048 x i32], [2048 x i32]* @c, i64 0, i64 %indvars.iv  ; &c[i]
36  %5 = load i32, i32* %4, align 4
37  %6 = add nsw i32 %5, %3  ; c[i] + b[i]
38  %7 = getelementptr inbounds [2048 x i32], [2048 x i32]* @a, i64 0, i64 %indvars.iv  ; &a[i]
39  store i32 %6, i32* %7, align 4
40  %indvars.iv.next = add i64 %indvars.iv, 1
41  %lftr.wideiv = trunc i64 %indvars.iv.next to i32  ; exit test is done on the 32-bit truncation of i+1
42  %exitcond = icmp eq i32 %lftr.wideiv, 256
43  br i1 %exitcond, label %8, label %1
44
45; <label>:8                                       ; preds = %1
46  ret void
47}
48
49; Can't vectorize in 'optsize' mode because we need a tail.
50;CHECK-LABEL: @example2(
51;CHECK-NOT: store <4 x i32>
52;CHECK: ret void
; @example2 -- two consecutive loops whose trip counts depend on the runtime
; argument %n; the function is marked optsize.
;   * .lr.ph5 (entered only when %n > 0): stores %x into b[i] for i starting
;     at 0, exiting once the truncated i+1 equals %n.
;   * .lr.ph (entered only when %n != 0): computes a[i] = b[i] & c[i]. The
;     index starts at %i.0.lcssa (sext of %n when the first loop ran, 0
;     otherwise) and a separate down-counter starting at %n ends the loop
;     when it reaches 0.
; Neither trip count is a compile-time constant, which is why the test expects
; no <4 x i32> store for this function.
53define void @example2(i32 %n, i32 %x) optsize {
54  %1 = icmp sgt i32 %n, 0
55  br i1 %1, label %.lr.ph5, label %.preheader  ; skip the first loop when n <= 0
56
57..preheader_crit_edge:                            ; preds = %.lr.ph5
58  %phitmp = sext i32 %n to i64  ; index at which the first loop stopped
59  br label %.preheader
60
61.preheader:                                       ; preds = %..preheader_crit_edge, %0
62  %i.0.lcssa = phi i64 [ %phitmp, %..preheader_crit_edge ], [ 0, %0 ]  ; start index of the second loop
63  %2 = icmp eq i32 %n, 0
64  br i1 %2, label %._crit_edge, label %.lr.ph  ; skip the second loop when n == 0
65
66.lr.ph5:                                          ; preds = %0, %.lr.ph5
67  %indvars.iv6 = phi i64 [ %indvars.iv.next7, %.lr.ph5 ], [ 0, %0 ]
68  %3 = getelementptr inbounds [2048 x i32], [2048 x i32]* @b, i64 0, i64 %indvars.iv6  ; &b[i]
69  store i32 %x, i32* %3, align 4  ; b[i] = x
70  %indvars.iv.next7 = add i64 %indvars.iv6, 1
71  %lftr.wideiv = trunc i64 %indvars.iv.next7 to i32
72  %exitcond = icmp eq i32 %lftr.wideiv, %n
73  br i1 %exitcond, label %..preheader_crit_edge, label %.lr.ph5
74
75.lr.ph:                                           ; preds = %.preheader, %.lr.ph
76  %indvars.iv = phi i64 [ %indvars.iv.next, %.lr.ph ], [ %i.0.lcssa, %.preheader ]  ; array index
77  %.02 = phi i32 [ %4, %.lr.ph ], [ %n, %.preheader ]  ; remaining-iterations counter, starts at n
78  %4 = add nsw i32 %.02, -1
79  %5 = getelementptr inbounds [2048 x i32], [2048 x i32]* @b, i64 0, i64 %indvars.iv  ; &b[i]
80  %6 = load i32, i32* %5, align 4
81  %7 = getelementptr inbounds [2048 x i32], [2048 x i32]* @c, i64 0, i64 %indvars.iv  ; &c[i]
82  %8 = load i32, i32* %7, align 4
83  %9 = and i32 %8, %6  ; c[i] & b[i]
84  %10 = getelementptr inbounds [2048 x i32], [2048 x i32]* @a, i64 0, i64 %indvars.iv  ; &a[i]
85  store i32 %9, i32* %10, align 4
86  %indvars.iv.next = add i64 %indvars.iv, 1
87  %11 = icmp eq i32 %4, 0  ; loop ends when the decremented counter hits 0
88  br i1 %11, label %._crit_edge, label %.lr.ph
89
90._crit_edge:                                      ; preds = %.lr.ph, %.preheader
91  ret void
92}
93
94; N is unknown, we need a tail. Can't vectorize.
95;CHECK-LABEL: @example3(
96;CHECK-NOT: <4 x i32>
97;CHECK: ret void
; @example3 -- copies i32 elements from %q to %p, advancing both pointers by
; one element per iteration. A counter starting at %n is decremented each
; iteration and the loop ends when it reaches 0; the loop is skipped entirely
; when %n == 0. Both pointers are noalias and the function is marked optsize.
; The trip count is the runtime value %n, so it is not known to be a multiple
; of the vector width.
; NOTE(review): every load/store claims align 16 although both pointers
; advance by a single i32 per iteration -- confirm this is intentional.
98define void @example3(i32 %n, i32* noalias nocapture %p, i32* noalias nocapture %q) optsize {
99  %1 = icmp eq i32 %n, 0
100  br i1 %1, label %._crit_edge, label %.lr.ph  ; nothing to copy when n == 0
101
102.lr.ph:                                           ; preds = %0, %.lr.ph
103  %.05 = phi i32 [ %2, %.lr.ph ], [ %n, %0 ]  ; remaining-iterations counter
104  %.014 = phi i32* [ %5, %.lr.ph ], [ %p, %0 ]  ; current destination pointer
105  %.023 = phi i32* [ %3, %.lr.ph ], [ %q, %0 ]  ; current source pointer
106  %2 = add nsw i32 %.05, -1
107  %3 = getelementptr inbounds i32, i32* %.023, i64 1  ; next source element
108  %4 = load i32, i32* %.023, align 16
109  %5 = getelementptr inbounds i32, i32* %.014, i64 1  ; next destination element
110  store i32 %4, i32* %.014, align 16
111  %6 = icmp eq i32 %2, 0
112  br i1 %6, label %._crit_edge, label %.lr.ph
113
114._crit_edge:                                      ; preds = %.lr.ph, %0
115  ret void
116}
117
118; N is unknown, we need a tail. Can't vectorize because the loop is cold.
119;CHECK-LABEL: @example4(
120;CHECK-NOT: <4 x i32>
121;CHECK: ret void
; @example4 -- the same pointer-increment copy loop as @example3 (copy %n i32
; elements from %q to %p, skipped when %n == 0), but without the optsize
; attribute. Instead the entry branch carries profile metadata (!prof !0,
; branch weights 64 and 4 in successor order), which weights the loop-skipping
; edge to %._crit_edge far more heavily than the edge into the loop. Combined
; with the block-frequency option on the opt command line, the test expects
; the loop to be treated as cold and left scalar.
; NOTE(review): every load/store claims align 16 although both pointers
; advance by a single i32 per iteration -- confirm this is intentional.
122define void @example4(i32 %n, i32* noalias nocapture %p, i32* noalias nocapture %q) {
123  %1 = icmp eq i32 %n, 0
124  br i1 %1, label %._crit_edge, label %.lr.ph, !prof !0  ; profile data attached to this branch
125
126.lr.ph:                                           ; preds = %0, %.lr.ph
127  %.05 = phi i32 [ %2, %.lr.ph ], [ %n, %0 ]  ; remaining-iterations counter
128  %.014 = phi i32* [ %5, %.lr.ph ], [ %p, %0 ]  ; current destination pointer
129  %.023 = phi i32* [ %3, %.lr.ph ], [ %q, %0 ]  ; current source pointer
130  %2 = add nsw i32 %.05, -1
131  %3 = getelementptr inbounds i32, i32* %.023, i64 1  ; next source element
132  %4 = load i32, i32* %.023, align 16
133  %5 = getelementptr inbounds i32, i32* %.014, i64 1  ; next destination element
134  store i32 %4, i32* %.014, align 16
135  %6 = icmp eq i32 %2, 0
136  br i1 %6, label %._crit_edge, label %.lr.ph
137
138._crit_edge:                                      ; preds = %.lr.ph, %0
139  ret void
140}
141
; Branch weights referenced as !prof !0 by the entry branch of @example4:
; 64 for its first successor (%._crit_edge, which bypasses the loop) and 4 for
; its second successor (%.lr.ph, the loop itself).
142!0 = !{!"branch_weights", i32 64, i32 4}
143
144; We can't vectorize this one because we need a runtime ptr check.
145;CHECK-LABEL: @example23(
146;CHECK-NOT: <4 x i32>
147;CHECK: ret void
; @example23 -- for 256 iterations, loads an i16 from %src, zero-extends it to
; i32, shifts it left by 7 and stores the result to %dst, advancing both
; pointers by one element each iteration. The counter starts at 0 and the loop
; exits when the incremented counter equals 256. The pointer arguments are
; nocapture but NOT noalias, and the function is marked optsize.
148define void @example23(i16* nocapture %src, i32* nocapture %dst) optsize {
149  br label %1
150
151; <label>:1                                       ; preds = %1, %0
152  %.04 = phi i16* [ %src, %0 ], [ %2, %1 ]  ; current source pointer
153  %.013 = phi i32* [ %dst, %0 ], [ %6, %1 ]  ; current destination pointer
154  %i.02 = phi i32 [ 0, %0 ], [ %7, %1 ]  ; iteration counter
155  %2 = getelementptr inbounds i16, i16* %.04, i64 1  ; next source element
156  %3 = load i16, i16* %.04, align 2
157  %4 = zext i16 %3 to i32
158  %5 = shl nuw nsw i32 %4, 7  ; widened value << 7
159  %6 = getelementptr inbounds i32, i32* %.013, i64 1  ; next destination element
160  store i32 %5, i32* %.013, align 4
161  %7 = add nsw i32 %i.02, 1
162  %exitcond = icmp eq i32 %7, 256
163  br i1 %exitcond, label %8, label %1
164
165; <label>:8                                       ; preds = %1
166  ret void
167}
168
169
170; We CAN vectorize this example because the pointers are marked as noalias.
171;CHECK-LABEL: @example23b(
172;CHECK: <4 x i32>
173;CHECK: ret void
; @example23b -- same loop body as @example23 (256 iterations of
; *dst++ = zext(*src++) << 7, optsize), except that both pointer arguments are
; additionally marked noalias. The test expects <4 x i32> operations in the
; output for this variant.
174define void @example23b(i16* noalias nocapture %src, i32* noalias nocapture %dst) optsize {
175  br label %1
176
177; <label>:1                                       ; preds = %1, %0
178  %.04 = phi i16* [ %src, %0 ], [ %2, %1 ]  ; current source pointer
179  %.013 = phi i32* [ %dst, %0 ], [ %6, %1 ]  ; current destination pointer
180  %i.02 = phi i32 [ 0, %0 ], [ %7, %1 ]  ; iteration counter
181  %2 = getelementptr inbounds i16, i16* %.04, i64 1  ; next source element
182  %3 = load i16, i16* %.04, align 2
183  %4 = zext i16 %3 to i32
184  %5 = shl nuw nsw i32 %4, 7  ; widened value << 7
185  %6 = getelementptr inbounds i32, i32* %.013, i64 1  ; next destination element
186  store i32 %5, i32* %.013, align 4
187  %7 = add nsw i32 %i.02, 1
188  %exitcond = icmp eq i32 %7, 256
189  br i1 %exitcond, label %8, label %1
190
191; <label>:8                                       ; preds = %1
192  ret void
193}
194
195
196