1; RUN: opt < %s  -loop-vectorize -force-vector-interleave=1 -force-vector-width=4 -dce -instcombine -S | FileCheck %s
2
3target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
4target triple = "x86_64-apple-macosx10.8.0"
5
; Integer add reduction. Each iteration adds the truncated induction variable,
; A[i] and B[i] into an accumulator that starts at 0.
; The patterns below expect a <4 x i32> accumulator phi, a vector load and a
; vector add, followed by a shuffle/add sequence and an extract of lane 0.
6;CHECK-LABEL: @reduction_sum(
7;CHECK: phi <4 x i32>
8;CHECK: load <4 x i32>
9;CHECK: add <4 x i32>
10;CHECK: shufflevector <4 x i32> %{{.*}}, <4 x i32> undef, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
11;CHECK: add <4 x i32>
12;CHECK: shufflevector <4 x i32> %{{.*}}, <4 x i32> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
13;CHECK: add <4 x i32>
14;CHECK: extractelement <4 x i32> %{{.*}}, i32 0
15;CHECK: ret i32
16define i32 @reduction_sum(i32 %n, i32* noalias nocapture %A, i32* noalias nocapture %B) nounwind uwtable readonly noinline ssp {
17  %1 = icmp sgt i32 %n, 0 ; skip the loop entirely when n <= 0
18  br i1 %1, label %.lr.ph, label %._crit_edge
19
20.lr.ph:                                           ; preds = %0, %.lr.ph
21  %indvars.iv = phi i64 [ %indvars.iv.next, %.lr.ph ], [ 0, %0 ]
22  %sum.02 = phi i32 [ %9, %.lr.ph ], [ 0, %0 ] ; accumulator, initial value 0
23  %2 = getelementptr inbounds i32, i32* %A, i64 %indvars.iv
24  %3 = load i32, i32* %2, align 4
25  %4 = getelementptr inbounds i32, i32* %B, i64 %indvars.iv
26  %5 = load i32, i32* %4, align 4
27  %6 = trunc i64 %indvars.iv to i32
28  %7 = add i32 %sum.02, %6 ; sum + i
29  %8 = add i32 %7, %3 ; + A[i]
30  %9 = add i32 %8, %5 ; + B[i]; fed back into the accumulator phi
31  %indvars.iv.next = add i64 %indvars.iv, 1
32  %lftr.wideiv = trunc i64 %indvars.iv.next to i32
33  %exitcond = icmp eq i32 %lftr.wideiv, %n
34  br i1 %exitcond, label %._crit_edge, label %.lr.ph
35
36._crit_edge:                                      ; preds = %.lr.ph, %0
37  %sum.0.lcssa = phi i32 [ 0, %0 ], [ %9, %.lr.ph ]
38  ret i32 %sum.0.lcssa
39}
40
; Integer multiply reduction. Each iteration multiplies the accumulator
; (initial value 1) by the truncated induction variable, A[i] and B[i].
; The patterns below expect a <4 x i32> phi, a vector load and vector mul,
; followed by a shuffle/mul sequence and an extract of lane 0.
41;CHECK-LABEL: @reduction_prod(
42;CHECK: phi <4 x i32>
43;CHECK: load <4 x i32>
44;CHECK: mul <4 x i32>
45;CHECK: shufflevector <4 x i32> %{{.*}}, <4 x i32> undef, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
46;CHECK: mul <4 x i32>
47;CHECK: shufflevector <4 x i32> %{{.*}}, <4 x i32> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
48;CHECK: mul <4 x i32>
49;CHECK: extractelement <4 x i32> %{{.*}}, i32 0
50;CHECK: ret i32
51define i32 @reduction_prod(i32 %n, i32* noalias nocapture %A, i32* noalias nocapture %B) nounwind uwtable readonly noinline ssp {
52  %1 = icmp sgt i32 %n, 0 ; skip the loop entirely when n <= 0
53  br i1 %1, label %.lr.ph, label %._crit_edge
54
55.lr.ph:                                           ; preds = %0, %.lr.ph
56  %indvars.iv = phi i64 [ %indvars.iv.next, %.lr.ph ], [ 0, %0 ]
57  %prod.02 = phi i32 [ %9, %.lr.ph ], [ 1, %0 ] ; accumulator, initial value 1
58  %2 = getelementptr inbounds i32, i32* %A, i64 %indvars.iv
59  %3 = load i32, i32* %2, align 4
60  %4 = getelementptr inbounds i32, i32* %B, i64 %indvars.iv
61  %5 = load i32, i32* %4, align 4
62  %6 = trunc i64 %indvars.iv to i32
63  %7 = mul i32 %prod.02, %6 ; prod * i
64  %8 = mul i32 %7, %3 ; * A[i]
65  %9 = mul i32 %8, %5 ; * B[i]; fed back into the accumulator phi
66  %indvars.iv.next = add i64 %indvars.iv, 1
67  %lftr.wideiv = trunc i64 %indvars.iv.next to i32
68  %exitcond = icmp eq i32 %lftr.wideiv, %n
69  br i1 %exitcond, label %._crit_edge, label %.lr.ph
70
71._crit_edge:                                      ; preds = %.lr.ph, %0
72  %prod.0.lcssa = phi i32 [ 1, %0 ], [ %9, %.lr.ph ]
73  ret i32 %prod.0.lcssa
74}
75
; Add reduction whose per-iteration term contains a multiply: the accumulator
; (initial value 0) receives the truncated induction variable plus B[i] * A[i].
; The multiply is not part of the accumulator chain, so the patterns below
; expect one vector "mul nsw" and then add-based shuffle reduction steps.
76;CHECK-LABEL: @reduction_mix(
77;CHECK: phi <4 x i32>
78;CHECK: load <4 x i32>
79;CHECK: mul nsw <4 x i32>
80;CHECK: shufflevector <4 x i32> %{{.*}}, <4 x i32> undef, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
81;CHECK: add <4 x i32>
82;CHECK: shufflevector <4 x i32> %{{.*}}, <4 x i32> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
83;CHECK: add <4 x i32>
84;CHECK: extractelement <4 x i32> %{{.*}}, i32 0
85;CHECK: ret i32
86define i32 @reduction_mix(i32 %n, i32* noalias nocapture %A, i32* noalias nocapture %B) nounwind uwtable readonly noinline ssp {
87  %1 = icmp sgt i32 %n, 0 ; skip the loop entirely when n <= 0
88  br i1 %1, label %.lr.ph, label %._crit_edge
89
90.lr.ph:                                           ; preds = %0, %.lr.ph
91  %indvars.iv = phi i64 [ %indvars.iv.next, %.lr.ph ], [ 0, %0 ]
92  %sum.02 = phi i32 [ %9, %.lr.ph ], [ 0, %0 ] ; accumulator, initial value 0
93  %2 = getelementptr inbounds i32, i32* %A, i64 %indvars.iv
94  %3 = load i32, i32* %2, align 4
95  %4 = getelementptr inbounds i32, i32* %B, i64 %indvars.iv
96  %5 = load i32, i32* %4, align 4
97  %6 = mul nsw i32 %5, %3 ; B[i] * A[i], computed outside the accumulator chain
98  %7 = trunc i64 %indvars.iv to i32
99  %8 = add i32 %sum.02, %7 ; sum + i
100  %9 = add i32 %8, %6 ; + product; fed back into the accumulator phi
101  %indvars.iv.next = add i64 %indvars.iv, 1
102  %lftr.wideiv = trunc i64 %indvars.iv.next to i32
103  %exitcond = icmp eq i32 %lftr.wideiv, %n
104  br i1 %exitcond, label %._crit_edge, label %.lr.ph
105
106._crit_edge:                                      ; preds = %.lr.ph, %0
107  %sum.0.lcssa = phi i32 [ 0, %0 ], [ %9, %.lr.ph ]
108  ret i32 %sum.0.lcssa
109}
110
; Multiply reduction with a non-identity start value: the accumulator starts
; at 19 and is multiplied each iteration by (A[i] + i + B[i]).
; The patterns below expect a vector mul followed by a shuffle/mul sequence
; and an extract of lane 0.
; NOTE(review): the exit phi yields 0 (not the 19 start value) when the loop
; is skipped; this differs from the accumulator's initial value. Confirm that
; it is intentional before relying on it.
111;CHECK-LABEL: @reduction_mul(
112;CHECK: mul <4 x i32>
113;CHECK: shufflevector <4 x i32> %{{.*}}, <4 x i32> undef, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
114;CHECK: mul <4 x i32>
115;CHECK: shufflevector <4 x i32> %{{.*}}, <4 x i32> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
116;CHECK: mul <4 x i32>
117;CHECK: extractelement <4 x i32> %{{.*}}, i32 0
118;CHECK: ret i32
119define i32 @reduction_mul(i32 %n, i32* noalias nocapture %A, i32* noalias nocapture %B) nounwind uwtable readonly noinline ssp {
120  %1 = icmp sgt i32 %n, 0 ; skip the loop entirely when n <= 0
121  br i1 %1, label %.lr.ph, label %._crit_edge
122
123.lr.ph:                                           ; preds = %0, %.lr.ph
124  %indvars.iv = phi i64 [ %indvars.iv.next, %.lr.ph ], [ 0, %0 ]
125  %sum.02 = phi i32 [ %9, %.lr.ph ], [ 19, %0 ] ; accumulator, initial value 19
126  %2 = getelementptr inbounds i32, i32* %A, i64 %indvars.iv
127  %3 = load i32, i32* %2, align 4
128  %4 = getelementptr inbounds i32, i32* %B, i64 %indvars.iv
129  %5 = load i32, i32* %4, align 4
130  %6 = trunc i64 %indvars.iv to i32
131  %7 = add i32 %3, %6 ; A[i] + i
132  %8 = add i32 %7, %5 ; + B[i]
133  %9 = mul i32 %8, %sum.02 ; the only operation on the accumulator chain
134  %indvars.iv.next = add i64 %indvars.iv, 1
135  %lftr.wideiv = trunc i64 %indvars.iv.next to i32
136  %exitcond = icmp eq i32 %lftr.wideiv, %n
137  br i1 %exitcond, label %._crit_edge, label %.lr.ph
138
139._crit_edge:                                      ; preds = %.lr.ph, %0
140  %sum.0.lcssa = phi i32 [ 0, %0 ], [ %9, %.lr.ph ]
141  ret i32 %sum.0.lcssa
142}
143
; Add reduction whose accumulator starts at 120 instead of 0; each iteration
; adds coeff[i] * in[i].
; The patterns below expect the start value to appear in lane 0 only
; (<120, 0, 0, 0>), followed by the shuffle/add sequence and an extract of
; lane 0.
144;CHECK-LABEL: @start_at_non_zero(
145;CHECK: phi <4 x i32>
146;CHECK: <i32 120, i32 0, i32 0, i32 0>
147;CHECK: shufflevector <4 x i32> %{{.*}}, <4 x i32> undef, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
148;CHECK: add <4 x i32>
149;CHECK: shufflevector <4 x i32> %{{.*}}, <4 x i32> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
150;CHECK: add <4 x i32>
151;CHECK: extractelement <4 x i32> %{{.*}}, i32 0
152;CHECK: ret i32
153define i32 @start_at_non_zero(i32* nocapture %in, i32* nocapture %coeff, i32* nocapture %out, i32 %n) nounwind uwtable readonly ssp {
154entry:
155  %cmp7 = icmp sgt i32 %n, 0 ; skip the loop entirely when n <= 0
156  br i1 %cmp7, label %for.body, label %for.end
157
158for.body:                                         ; preds = %entry, %for.body
159  %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %entry ]
160  %sum.09 = phi i32 [ %add, %for.body ], [ 120, %entry ] ; accumulator, initial value 120
161  %arrayidx = getelementptr inbounds i32, i32* %in, i64 %indvars.iv
162  %0 = load i32, i32* %arrayidx, align 4
163  %arrayidx2 = getelementptr inbounds i32, i32* %coeff, i64 %indvars.iv
164  %1 = load i32, i32* %arrayidx2, align 4
165  %mul = mul nsw i32 %1, %0 ; coeff[i] * in[i]
166  %add = add nsw i32 %mul, %sum.09 ; accumulate; fed back into the phi
167  %indvars.iv.next = add i64 %indvars.iv, 1
168  %lftr.wideiv = trunc i64 %indvars.iv.next to i32
169  %exitcond = icmp eq i32 %lftr.wideiv, %n
170  br i1 %exitcond, label %for.end, label %for.body
171
172for.end:                                          ; preds = %for.body, %entry
173  %sum.0.lcssa = phi i32 [ 120, %entry ], [ %add, %for.body ]
174  ret i32 %sum.0.lcssa
175}
176
; Bitwise AND reduction: the accumulator starts at -1 (all bits set) and is
; and-ed each iteration with (B[i] + A[i]).
; The patterns below expect an all -1 vector constant, a vector "and", then
; the shuffle/and sequence and an extract of lane 0.
177;CHECK-LABEL: @reduction_and(
178;CHECK: <i32 -1, i32 -1, i32 -1, i32 -1>
179;CHECK: and <4 x i32>
180;CHECK: shufflevector <4 x i32> %{{.*}}, <4 x i32> undef, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
181;CHECK: and <4 x i32>
182;CHECK: shufflevector <4 x i32> %{{.*}}, <4 x i32> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
183;CHECK: and <4 x i32>
184;CHECK: extractelement <4 x i32> %{{.*}}, i32 0
185;CHECK: ret i32
186define i32 @reduction_and(i32 %n, i32* nocapture %A, i32* nocapture %B) nounwind uwtable readonly {
187entry:
188  %cmp7 = icmp sgt i32 %n, 0 ; skip the loop entirely when n <= 0
189  br i1 %cmp7, label %for.body, label %for.end
190
191for.body:                                         ; preds = %entry, %for.body
192  %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %entry ]
193  %result.08 = phi i32 [ %and, %for.body ], [ -1, %entry ] ; accumulator, initial value -1
194  %arrayidx = getelementptr inbounds i32, i32* %A, i64 %indvars.iv
195  %0 = load i32, i32* %arrayidx, align 4
196  %arrayidx2 = getelementptr inbounds i32, i32* %B, i64 %indvars.iv
197  %1 = load i32, i32* %arrayidx2, align 4
198  %add = add nsw i32 %1, %0 ; B[i] + A[i]
199  %and = and i32 %add, %result.08 ; accumulate; fed back into the phi
200  %indvars.iv.next = add i64 %indvars.iv, 1
201  %lftr.wideiv = trunc i64 %indvars.iv.next to i32
202  %exitcond = icmp eq i32 %lftr.wideiv, %n
203  br i1 %exitcond, label %for.end, label %for.body
204
205for.end:                                          ; preds = %for.body, %entry
206  %result.0.lcssa = phi i32 [ -1, %entry ], [ %and, %for.body ]
207  ret i32 %result.0.lcssa
208}
209
; Bitwise OR reduction: the accumulator starts at 0 and is or-ed each
; iteration with (B[i] + A[i]).
; The patterns below expect a vector "or", then the shuffle/or sequence and
; an extract of lane 0.
210;CHECK-LABEL: @reduction_or(
211;CHECK: or <4 x i32>
212;CHECK: shufflevector <4 x i32> %{{.*}}, <4 x i32> undef, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
213;CHECK: or <4 x i32>
214;CHECK: shufflevector <4 x i32> %{{.*}}, <4 x i32> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
215;CHECK: or <4 x i32>
216;CHECK: extractelement <4 x i32> %{{.*}}, i32 0
217;CHECK: ret i32
218define i32 @reduction_or(i32 %n, i32* nocapture %A, i32* nocapture %B) nounwind uwtable readonly {
219entry:
220  %cmp7 = icmp sgt i32 %n, 0 ; skip the loop entirely when n <= 0
221  br i1 %cmp7, label %for.body, label %for.end
222
223for.body:                                         ; preds = %entry, %for.body
224  %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %entry ]
225  %result.08 = phi i32 [ %or, %for.body ], [ 0, %entry ] ; accumulator, initial value 0
226  %arrayidx = getelementptr inbounds i32, i32* %A, i64 %indvars.iv
227  %0 = load i32, i32* %arrayidx, align 4
228  %arrayidx2 = getelementptr inbounds i32, i32* %B, i64 %indvars.iv
229  %1 = load i32, i32* %arrayidx2, align 4
230  %add = add nsw i32 %1, %0 ; B[i] + A[i]
231  %or = or i32 %add, %result.08 ; accumulate; fed back into the phi
232  %indvars.iv.next = add i64 %indvars.iv, 1
233  %lftr.wideiv = trunc i64 %indvars.iv.next to i32
234  %exitcond = icmp eq i32 %lftr.wideiv, %n
235  br i1 %exitcond, label %for.end, label %for.body
236
237for.end:                                          ; preds = %for.body, %entry
238  %result.0.lcssa = phi i32 [ 0, %entry ], [ %or, %for.body ]
239  ret i32 %result.0.lcssa
240}
241
; Bitwise XOR reduction: the accumulator starts at 0 and is xor-ed each
; iteration with (B[i] + A[i]).
; The patterns below expect a vector "xor", then the shuffle/xor sequence and
; an extract of lane 0.
242;CHECK-LABEL: @reduction_xor(
243;CHECK: xor <4 x i32>
244;CHECK: shufflevector <4 x i32> %{{.*}}, <4 x i32> undef, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
245;CHECK: xor <4 x i32>
246;CHECK: shufflevector <4 x i32> %{{.*}}, <4 x i32> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
247;CHECK: xor <4 x i32>
248;CHECK: extractelement <4 x i32> %{{.*}}, i32 0
249;CHECK: ret i32
250define i32 @reduction_xor(i32 %n, i32* nocapture %A, i32* nocapture %B) nounwind uwtable readonly {
251entry:
252  %cmp7 = icmp sgt i32 %n, 0 ; skip the loop entirely when n <= 0
253  br i1 %cmp7, label %for.body, label %for.end
254
255for.body:                                         ; preds = %entry, %for.body
256  %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %entry ]
257  %result.08 = phi i32 [ %xor, %for.body ], [ 0, %entry ] ; accumulator, initial value 0
258  %arrayidx = getelementptr inbounds i32, i32* %A, i64 %indvars.iv
259  %0 = load i32, i32* %arrayidx, align 4
260  %arrayidx2 = getelementptr inbounds i32, i32* %B, i64 %indvars.iv
261  %1 = load i32, i32* %arrayidx2, align 4
262  %add = add nsw i32 %1, %0 ; B[i] + A[i]
263  %xor = xor i32 %add, %result.08 ; accumulate; fed back into the phi
264  %indvars.iv.next = add i64 %indvars.iv, 1
265  %lftr.wideiv = trunc i64 %indvars.iv.next to i32
266  %exitcond = icmp eq i32 %lftr.wideiv, %n
267  br i1 %exitcond, label %for.end, label %for.body
268
269for.end:                                          ; preds = %for.body, %entry
270  %result.0.lcssa = phi i32 [ 0, %entry ], [ %xor, %for.body ]
271  ret i32 %result.0.lcssa
272}
273
274; In this code the accumulator %x.05 is the RHS (the subtrahend) of the sub,
; i.e. x = A[i] - x. The negative checks below expect that no <4 x i32> phi
; and no vector "sub nsw" are produced for this function.
275;CHECK-LABEL: @reduction_sub_rhs(
276;CHECK-NOT: phi <4 x i32>
277;CHECK-NOT: sub nsw <4 x i32>
278;CHECK: ret i32
279define i32 @reduction_sub_rhs(i32 %n, i32* noalias nocapture %A) nounwind uwtable readonly {
280entry:
281  %cmp4 = icmp sgt i32 %n, 0 ; skip the loop entirely when n <= 0
282  br i1 %cmp4, label %for.body, label %for.end
283
284for.body:                                         ; preds = %entry, %for.body
285  %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %entry ]
286  %x.05 = phi i32 [ %sub, %for.body ], [ 0, %entry ] ; accumulator, initial value 0
287  %arrayidx = getelementptr inbounds i32, i32* %A, i64 %indvars.iv
288  %0 = load i32, i32* %arrayidx, align 4
289  %sub = sub nsw i32 %0, %x.05 ; accumulator is the second operand here
290  %indvars.iv.next = add i64 %indvars.iv, 1
291  %lftr.wideiv = trunc i64 %indvars.iv.next to i32
292  %exitcond = icmp eq i32 %lftr.wideiv, %n
293  br i1 %exitcond, label %for.end, label %for.body
294
295for.end:                                          ; preds = %for.body, %entry
296  %x.0.lcssa = phi i32 [ 0, %entry ], [ %sub, %for.body ]
297  ret i32 %x.0.lcssa
298}
299
300
301; In this test the reduction variable is on the LHS and we can vectorize it.
; The loop computes x = x - A[i] with x starting at 0; the patterns below
; expect a <4 x i32> accumulator phi and a vector "sub nsw".
302;CHECK-LABEL: @reduction_sub_lhs(
303;CHECK: phi <4 x i32>
304;CHECK: sub nsw <4 x i32>
305;CHECK: ret i32
306define i32 @reduction_sub_lhs(i32 %n, i32* noalias nocapture %A) nounwind uwtable readonly {
307entry:
308  %cmp4 = icmp sgt i32 %n, 0 ; skip the loop entirely when n <= 0
309  br i1 %cmp4, label %for.body, label %for.end
310
311for.body:                                         ; preds = %entry, %for.body
312  %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %entry ]
313  %x.05 = phi i32 [ %sub, %for.body ], [ 0, %entry ] ; accumulator, initial value 0
314  %arrayidx = getelementptr inbounds i32, i32* %A, i64 %indvars.iv
315  %0 = load i32, i32* %arrayidx, align 4
316  %sub = sub nsw i32 %x.05, %0 ; accumulator is the first operand here
317  %indvars.iv.next = add i64 %indvars.iv, 1
318  %lftr.wideiv = trunc i64 %indvars.iv.next to i32
319  %exitcond = icmp eq i32 %lftr.wideiv, %n
320  br i1 %exitcond, label %for.end, label %for.body
321
322for.end:                                          ; preds = %for.body, %entry
323  %x.0.lcssa = phi i32 [ 0, %entry ], [ %sub, %for.body ]
324  ret i32 %x.0.lcssa
325}
326
327; We can vectorize conditional reductions with multi-input phis.
; The float accumulator starts at %S. Depending on two nested comparisons an
; iteration adds A[i], adds B[i], or leaves the accumulator unchanged; the four
; outcomes are merged by a single phi in %for.inc. The loop runs 128 times.
; The function name is matched with a label directive so that the vector fadd
; pattern is only searched for inside this function's output.
328; CHECK-LABEL: @reduction_conditional(
329; CHECK: fadd <4 x float>
330
331define float @reduction_conditional(float* %A, float* %B, float* %C, float %S) {
332entry:
333  br label %for.body
334
335for.body:
336  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.inc ]
337  %sum.033 = phi float [ %S, %entry ], [ %sum.1, %for.inc ] ; accumulator, initial value %S
338  %arrayidx = getelementptr inbounds float, float* %A, i64 %indvars.iv
339  %0 = load float, float* %arrayidx, align 4
340  %arrayidx2 = getelementptr inbounds float, float* %B, i64 %indvars.iv
341  %1 = load float, float* %arrayidx2, align 4
342  %cmp3 = fcmp ogt float %0, %1 ; A[i] > B[i] ?
343  br i1 %cmp3, label %if.then, label %for.inc
344
345if.then:
346  %cmp6 = fcmp ogt float %1, 1.000000e+00 ; B[i] > 1.0 ?
347  br i1 %cmp6, label %if.then8, label %if.else
348
349if.then8:
350  %add = fadd fast float %sum.033, %0 ; sum + A[i]
351  br label %for.inc
352
353if.else:
354  %cmp14 = fcmp ogt float %0, 2.000000e+00 ; A[i] > 2.0 ?
355  br i1 %cmp14, label %if.then16, label %for.inc
356
357if.then16:
358  %add19 = fadd fast float %sum.033, %1 ; sum + B[i]
359  br label %for.inc
360
361for.inc:
  ; Every incoming value is either an update of the accumulator or the
  ; unchanged accumulator itself.
362  %sum.1 = phi float [ %add, %if.then8 ], [ %add19, %if.then16 ], [ %sum.033, %if.else ], [ %sum.033, %for.body ]
363  %indvars.iv.next = add i64 %indvars.iv, 1
364  %lftr.wideiv = trunc i64 %indvars.iv.next to i32
365  %exitcond = icmp ne i32 %lftr.wideiv, 128
366  br i1 %exitcond, label %for.body, label %for.end
367
368for.end:
369  %sum.1.lcssa = phi float [ %sum.1, %for.inc ]
370  ret float %sum.1.lcssa
371}
372
373; We can't vectorize reductions with phi inputs from outside the reduction.
; Same control flow as the conditional float reduction test, except that the
; merge phi in %for.inc receives the constant 0.0 on the edge from %if.else
; instead of the accumulator.
; The function name is matched with a label directive so that the negative
; check is scoped from this function's definition onwards.
374; CHECK-LABEL: @noreduction_phi(
375; CHECK-NOT: fadd <4 x float>
376define float @noreduction_phi(float* %A, float* %B, float* %C, float %S) {
377entry:
378  br label %for.body
379
380for.body:
381  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.inc ]
382  %sum.033 = phi float [ %S, %entry ], [ %sum.1, %for.inc ] ; accumulator, initial value %S
383  %arrayidx = getelementptr inbounds float, float* %A, i64 %indvars.iv
384  %0 = load float, float* %arrayidx, align 4
385  %arrayidx2 = getelementptr inbounds float, float* %B, i64 %indvars.iv
386  %1 = load float, float* %arrayidx2, align 4
387  %cmp3 = fcmp ogt float %0, %1 ; A[i] > B[i] ?
388  br i1 %cmp3, label %if.then, label %for.inc
389
390if.then:
391  %cmp6 = fcmp ogt float %1, 1.000000e+00 ; B[i] > 1.0 ?
392  br i1 %cmp6, label %if.then8, label %if.else
393
394if.then8:
395  %add = fadd fast float %sum.033, %0 ; sum + A[i]
396  br label %for.inc
397
398if.else:
399  %cmp14 = fcmp ogt float %0, 2.000000e+00 ; A[i] > 2.0 ?
400  br i1 %cmp14, label %if.then16, label %for.inc
401
402if.then16:
403  %add19 = fadd fast float %sum.033, %1 ; sum + B[i]
404  br label %for.inc
405
406for.inc:
  ; The 0.0 incoming value from %if.else is not derived from the accumulator.
407  %sum.1 = phi float [ %add, %if.then8 ], [ %add19, %if.then16 ], [ 0.000000e+00, %if.else ], [ %sum.033, %for.body ]
408  %indvars.iv.next = add i64 %indvars.iv, 1
409  %lftr.wideiv = trunc i64 %indvars.iv.next to i32
410  %exitcond = icmp ne i32 %lftr.wideiv, 128
411  br i1 %exitcond, label %for.body, label %for.end
412
413for.end:
414  %sum.1.lcssa = phi float [ %sum.1, %for.inc ]
415  ret float %sum.1.lcssa
416}
417
418; We can't vectorize reductions that feed another header PHI.
; Two float accumulators live in the loop header: %sum.08 (starts at %S) adds
; B[i], and %sum2.09 (starts at 0.0) adds the freshly updated value of the
; first accumulator. The loop runs 128 times and both results are summed after
; the loop.
; The function name is matched with a label directive so that the negative
; check is scoped from this function's definition onwards.
419; CHECK-LABEL: @noredux_header_phi(
420; CHECK-NOT: fadd <4 x float>
421
422define float @noredux_header_phi(float* %A, float* %B, float* %C, float %S)  {
423entry:
424  br label %for.body
425
426for.body:
427  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
428  %sum2.09 = phi float [ 0.000000e+00, %entry ], [ %add1, %for.body ] ; second accumulator
429  %sum.08 = phi float [ %S, %entry ], [ %add, %for.body ] ; first accumulator
430  %arrayidx = getelementptr inbounds float, float* %B, i64 %indvars.iv
431  %0 = load float, float* %arrayidx, align 4
432  %add = fadd fast float %sum.08, %0 ; sum + B[i]
433  %add1 = fadd fast float %sum2.09, %add ; uses %add, so %add feeds the other header phi
434  %indvars.iv.next = add i64 %indvars.iv, 1
435  %lftr.wideiv = trunc i64 %indvars.iv.next to i32
436  %exitcond = icmp ne i32 %lftr.wideiv, 128
437  br i1 %exitcond, label %for.body, label %for.end
438
439for.end:
440  %add1.lcssa = phi float [ %add1, %for.body ]
441  %add.lcssa = phi float [ %add, %for.body ]
442  %add2 = fadd fast float %add.lcssa, %add1.lcssa
443  ret float %add2
444}
445
446
447; When vectorizing a reduction whose loop header phi value is used outside the
448; loop special care must be taken. Otherwise, the reduced value feeding into the
449; outside user misses a few iterations (VF-1) of the loop.
450; PR16522
; The loop below flips all bits of %p (xor with -1) for 16 iterations and the
; function returns the header phi %p.addr.02 itself rather than %xor. The
; negative check expects no "x i32>" vector type in this function's output.
451
452; CHECK-LABEL: @phivalueredux(
453; CHECK-NOT: x i32>
454
455define i32 @phivalueredux(i32 %p) {
456entry:
457  br label %for.body
458
459for.body:
460  %t.03 = phi i32 [ 0, %entry ], [ %inc, %for.body ] ; iteration counter
461  %p.addr.02 = phi i32 [ %p, %entry ], [ %xor, %for.body ] ; header phi, also used after the loop
462  %xor = xor i32 %p.addr.02, -1
463  %inc = add nsw i32 %t.03, 1
464  %exitcond = icmp eq i32 %inc, 16
465  br i1 %exitcond, label %for.end, label %for.body
466
467for.end:
468  ret i32 %p.addr.02 ; returns the phi value, not the last %xor
469}
470
471; Don't vectorize a reduction value that is not the last in a reduction cycle. We
472; would lose iterations (VF-1) on the operations after that use.
473; PR17498
; The accumulator chain in the loop is two adds long (%inc511.1.inc4.1, then
; %inc5.1), and the value used after the loop is the first add, not the one
; that feeds the header phi. The loop runs 22 times.
; The label directive is anchored with "@" and "(" so it matches only the
; function definition.
474
475; CHECK-LABEL: @not_last_operation(
476; CHECK-NOT: x i32>
477define i32 @not_last_operation(i32 %p, i32 %val) {
478entry:
479  %tobool = icmp eq i32 %p, 0
480  br label %for.body
481
482for.body:
483  %inc613.1 = phi i32 [ 0, %entry ], [ %inc6.1, %for.body ] ; iteration counter
484  %inc511.1 = phi i32 [ %val, %entry ], [ %inc5.1, %for.body ] ; accumulator, initial value %val
485  %0 = zext i1 %tobool to i32
486  %inc4.1 = xor i32 %0, 1 ; 1 when %p != 0, otherwise 0
487  %inc511.1.inc4.1 = add nsw i32 %inc511.1, %inc4.1 ; first add of the chain; used after the loop
488  %inc5.1 = add nsw i32 %inc511.1.inc4.1, 1 ; last add of the chain; feeds the header phi
489  %inc6.1 = add nsw i32 %inc613.1, 1
490  %exitcond.1 = icmp eq i32 %inc6.1, 22
491  br i1 %exitcond.1, label %exit, label %for.body
492
493exit:
494  %inc.2 = add nsw i32 %inc511.1.inc4.1, 2
495  ret i32 %inc.2
496}
497