; RUN: opt -slp-vectorizer -slp-vectorize-hor -S <  %s -mtriple=x86_64-apple-macosx -mcpu=corei7-avx | FileCheck %s --check-prefix=NOSTORE
;
; -slp-vectorize-hor is passed explicitly on the line above, as it is on the
; later STORE run in this file, so that the horizontal-reduction checks do not
; depend on the option's default value.

target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"

; #include <stdint.h>
;
; int foo(float *A, int n) {
;   float sum = 0;
;   for (intptr_t i=0; i < n; ++i) {
;     sum += 7*A[i*4  ] +
;            7*A[i*4+1] +
;            7*A[i*4+2] +
;            7*A[i*4+3];
;   }
;   return sum;
; }
;
; add_red is the scalar IR for the C loop above. Each iteration loads the four
; floats A[4*i] .. A[4*i+3], multiplies each by 7.0, sums the four products
; with 'fadd fast' and adds that sum to the loop-carried accumulator. Only the
; fadd instructions carry the 'fast' flag; the fmul instructions do not.
; The directives below (enabled by the first run line's NOSTORE prefix) expect
; the output to contain a <4 x float> fmul followed by a <4 x float>
; shufflevector.

; NOSTORE-LABEL: add_red
; NOSTORE: fmul <4 x float>
; NOSTORE: shufflevector <4 x float>

define i32 @add_red(float* %A, i32 %n) {
entry:
  ; Enter the loop only when n > 0; otherwise for.end returns 0.
  %cmp31 = icmp sgt i32 %n, 0
  br i1 %cmp31, label %for.body.lr.ph, label %for.end

for.body.lr.ph:
  ; Sign-extend n once so it can be compared with the i64 loop counter.
  %0 = sext i32 %n to i64
  br label %for.body

for.body:
  ; %i.033 is the loop counter i; %sum.032 is the running sum (0.0 on entry).
  %i.033 = phi i64 [ 0, %for.body.lr.ph ], [ %inc, %for.body ]
  %sum.032 = phi float [ 0.000000e+00, %for.body.lr.ph ], [ %add17, %for.body ]
  ; %mul = 4*i. Its low two bits are zero, so the 'or' with 1, 2 and 3 below
  ; produces 4*i+1, 4*i+2 and 4*i+3.
  %mul = shl nsw i64 %i.033, 2
  ; 7 * A[4*i]
  %arrayidx = getelementptr inbounds float, float* %A, i64 %mul
  %1 = load float, float* %arrayidx, align 4
  %mul2 = fmul float %1, 7.000000e+00
  ; 7 * A[4*i+1]
  %add28 = or i64 %mul, 1
  %arrayidx4 = getelementptr inbounds float, float* %A, i64 %add28
  %2 = load float, float* %arrayidx4, align 4
  %mul5 = fmul float %2, 7.000000e+00
  %add6 = fadd fast float %mul2, %mul5
  ; 7 * A[4*i+2]
  %add829 = or i64 %mul, 2
  %arrayidx9 = getelementptr inbounds float, float* %A, i64 %add829
  %3 = load float, float* %arrayidx9, align 4
  %mul10 = fmul float %3, 7.000000e+00
  %add11 = fadd fast float %add6, %mul10
  ; 7 * A[4*i+3]
  %add1330 = or i64 %mul, 3
  %arrayidx14 = getelementptr inbounds float, float* %A, i64 %add1330
  %4 = load float, float* %arrayidx14, align 4
  %mul15 = fmul float %4, 7.000000e+00
  %add16 = fadd fast float %add11, %mul15
  ; Fold this iteration's four-term sum into the accumulator.
  %add17 = fadd fast float %sum.032, %add16
  %inc = add nsw i64 %i.033, 1
  %exitcond = icmp eq i64 %inc, %0
  br i1 %exitcond, label %for.cond.for.end_crit_edge, label %for.body

for.cond.for.end_crit_edge:
  ; Convert the final float sum to the i32 return value.
  %phitmp = fptosi float %add17 to i32
  br label %for.end

for.end:
  %sum.0.lcssa = phi i32 [ %phitmp, %for.cond.for.end_crit_edge ], [ 0, %entry ]
  ret i32 %sum.0.lcssa
}

; int foo(float * restrict A, float * restrict B, int n) {
;   float sum = 0;
;   for (intptr_t i=0; i < n; ++i) {
;     sum *= B[0]*A[i*4  ] +
;       B[1]*A[i*4+1] +
;       B[2]*A[i*4+2] +
;       B[3]*A[i*4+3];
;   }
;   return sum;
; }
;
; mul_red is the scalar IR for the C loop above. B[0..3] are loaded once in
; the loop preheader. Each iteration forms the four products B[k]*A[4*i+k],
; sums them with 'fadd fast', and multiplies the loop-carried accumulator by
; that sum. None of the fmul instructions carry fast-math flags.
;
; NOTE(review): the directives below use the default CHECK prefix, but both
; run lines visible in this file pass an explicit --check-prefix (NOSTORE and
; STORE), so FileCheck does not evaluate them as written. Confirm whether they
; were meant to use the NOSTORE prefix.

; CHECK-LABEL: mul_red
; CHECK: fmul <4 x float>
; CHECK: shufflevector <4 x float>

define i32 @mul_red(float* noalias %A, float* noalias %B, i32 %n) {
entry:
  ; Enter the loop only when n > 0; otherwise for.end returns 0.
  %cmp38 = icmp sgt i32 %n, 0
  br i1 %cmp38, label %for.body.lr.ph, label %for.end

for.body.lr.ph:
  ; Load the four B coefficients once, outside the loop.
  %0 = load float, float* %B, align 4
  %arrayidx4 = getelementptr inbounds float, float* %B, i64 1
  %1 = load float, float* %arrayidx4, align 4
  %arrayidx9 = getelementptr inbounds float, float* %B, i64 2
  %2 = load float, float* %arrayidx9, align 4
  %arrayidx15 = getelementptr inbounds float, float* %B, i64 3
  %3 = load float, float* %arrayidx15, align 4
  ; Sign-extend n once so it can be compared with the i64 loop counter.
  %4 = sext i32 %n to i64
  br label %for.body

for.body:
  ; %i.040 is the loop counter i; %sum.039 is the accumulator (0.0 on entry).
  %i.040 = phi i64 [ 0, %for.body.lr.ph ], [ %inc, %for.body ]
  %sum.039 = phi float [ 0.000000e+00, %for.body.lr.ph ], [ %mul21, %for.body ]
  ; %mul = 4*i. Its low two bits are zero, so the 'or' with 1, 2 and 3 below
  ; produces 4*i+1, 4*i+2 and 4*i+3.
  %mul = shl nsw i64 %i.040, 2
  ; B[0] * A[4*i]
  %arrayidx2 = getelementptr inbounds float, float* %A, i64 %mul
  %5 = load float, float* %arrayidx2, align 4
  %mul3 = fmul float %0, %5
  ; B[1] * A[4*i+1]
  %add35 = or i64 %mul, 1
  %arrayidx6 = getelementptr inbounds float, float* %A, i64 %add35
  %6 = load float, float* %arrayidx6, align 4
  %mul7 = fmul float %1, %6
  %add8 = fadd fast float %mul3, %mul7
  ; B[2] * A[4*i+2]
  %add1136 = or i64 %mul, 2
  %arrayidx12 = getelementptr inbounds float, float* %A, i64 %add1136
  %7 = load float, float* %arrayidx12, align 4
  %mul13 = fmul float %2, %7
  %add14 = fadd fast float %add8, %mul13
  ; B[3] * A[4*i+3]
  %add1737 = or i64 %mul, 3
  %arrayidx18 = getelementptr inbounds float, float* %A, i64 %add1737
  %8 = load float, float* %arrayidx18, align 4
  %mul19 = fmul float %3, %8
  %add20 = fadd fast float %add14, %mul19
  ; Multiply the accumulator by this iteration's four-term sum.
  %mul21 = fmul float %sum.039, %add20
  %inc = add nsw i64 %i.040, 1
  %exitcond = icmp eq i64 %inc, %4
  br i1 %exitcond, label %for.cond.for.end_crit_edge, label %for.body

for.cond.for.end_crit_edge:
  ; Convert the final float value to the i32 return value.
  %phitmp = fptosi float %mul21 to i32
  br label %for.end

for.end:
  %sum.0.lcssa = phi i32 [ %phitmp, %for.cond.for.end_crit_edge ], [ 0, %entry ]
  ret i32 %sum.0.lcssa
}

; int foo(float * restrict A, float * restrict B, int n) {
;   float sum = 0;
;   for (intptr_t i=0; i < n; ++i) {
;     sum += B[0]*A[i*6  ] +
;            B[1]*A[i*6+1] +
;            B[2]*A[i*6+2] +
;            B[3]*A[i*6+3] +
;            B[4]*A[i*6+4] +
;            B[5]*A[i*6+5] +
;            B[6]*A[i*6+6] +
;            B[7]*A[i*6+7] +
;            B[8]*A[i*6+8];
;   }
;   return sum;
; }
;
; long_red is the scalar IR for the C loop above: a nine-term reduction per
; iteration. B[0..8] are loaded once in the loop preheader. Each iteration
; forms the products B[k]*A[6*i+k] for k = 0..8 with 'fmul fast', sums them
; with 'fadd fast', and adds the sum to the loop-carried accumulator. The A
; index advances by 6 per iteration while nine elements are read, so
; consecutive iterations read overlapping elements of A.
;
; NOTE(review): the directives below use the default CHECK prefix, but both
; run lines visible in this file pass an explicit --check-prefix (NOSTORE and
; STORE), so FileCheck does not evaluate them as written. Confirm whether they
; were meant to use the NOSTORE prefix.

; CHECK-LABEL: long_red
; CHECK: fmul fast <4 x float>
; CHECK: shufflevector <4 x float>

define i32 @long_red(float* noalias %A, float* noalias %B, i32 %n) {
entry:
  ; Enter the loop only when n > 0; otherwise for.end returns 0.
  %cmp81 = icmp sgt i32 %n, 0
  br i1 %cmp81, label %for.body.lr.ph, label %for.end

for.body.lr.ph:
  ; Load the nine B coefficients once, outside the loop.
  %0 = load float, float* %B, align 4
  %arrayidx4 = getelementptr inbounds float, float* %B, i64 1
  %1 = load float, float* %arrayidx4, align 4
  %arrayidx9 = getelementptr inbounds float, float* %B, i64 2
  %2 = load float, float* %arrayidx9, align 4
  %arrayidx15 = getelementptr inbounds float, float* %B, i64 3
  %3 = load float, float* %arrayidx15, align 4
  %arrayidx21 = getelementptr inbounds float, float* %B, i64 4
  %4 = load float, float* %arrayidx21, align 4
  %arrayidx27 = getelementptr inbounds float, float* %B, i64 5
  %5 = load float, float* %arrayidx27, align 4
  %arrayidx33 = getelementptr inbounds float, float* %B, i64 6
  %6 = load float, float* %arrayidx33, align 4
  %arrayidx39 = getelementptr inbounds float, float* %B, i64 7
  %7 = load float, float* %arrayidx39, align 4
  %arrayidx45 = getelementptr inbounds float, float* %B, i64 8
  %8 = load float, float* %arrayidx45, align 4
  ; Sign-extend n once so it can be compared with the i64 loop counter.
  %9 = sext i32 %n to i64
  br label %for.body

for.body:
  ; %i.083 is the loop counter i; %sum.082 is the running sum (0.0 on entry).
  %i.083 = phi i64 [ 0, %for.body.lr.ph ], [ %inc, %for.body ]
  %sum.082 = phi float [ 0.000000e+00, %for.body.lr.ph ], [ %add51, %for.body ]
  ; %mul = 6*i, the base index into A for this iteration.
  %mul = mul nsw i64 %i.083, 6
  ; B[0] * A[6*i]
  %arrayidx2 = getelementptr inbounds float, float* %A, i64 %mul
  %10 = load float, float* %arrayidx2, align 4
  %mul3 = fmul fast float %0, %10
  ; B[1] * A[6*i+1]; 6*i is even, so 'or' with 1 equals adding 1.
  %add80 = or i64 %mul, 1
  %arrayidx6 = getelementptr inbounds float, float* %A, i64 %add80
  %11 = load float, float* %arrayidx6, align 4
  %mul7 = fmul fast float %1, %11
  %add8 = fadd fast float %mul3, %mul7
  ; B[2] * A[6*i+2]
  %add11 = add nsw i64 %mul, 2
  %arrayidx12 = getelementptr inbounds float, float* %A, i64 %add11
  %12 = load float, float* %arrayidx12, align 4
  %mul13 = fmul fast float %2, %12
  %add14 = fadd fast float %add8, %mul13
  ; B[3] * A[6*i+3]
  %add17 = add nsw i64 %mul, 3
  %arrayidx18 = getelementptr inbounds float, float* %A, i64 %add17
  %13 = load float, float* %arrayidx18, align 4
  %mul19 = fmul fast float %3, %13
  %add20 = fadd fast float %add14, %mul19
  ; B[4] * A[6*i+4]
  %add23 = add nsw i64 %mul, 4
  %arrayidx24 = getelementptr inbounds float, float* %A, i64 %add23
  %14 = load float, float* %arrayidx24, align 4
  %mul25 = fmul fast float %4, %14
  %add26 = fadd fast float %add20, %mul25
  ; B[5] * A[6*i+5]
  %add29 = add nsw i64 %mul, 5
  %arrayidx30 = getelementptr inbounds float, float* %A, i64 %add29
  %15 = load float, float* %arrayidx30, align 4
  %mul31 = fmul fast float %5, %15
  %add32 = fadd fast float %add26, %mul31
  ; B[6] * A[6*i+6]
  %add35 = add nsw i64 %mul, 6
  %arrayidx36 = getelementptr inbounds float, float* %A, i64 %add35
  %16 = load float, float* %arrayidx36, align 4
  %mul37 = fmul fast float %6, %16
  %add38 = fadd fast float %add32, %mul37
  ; B[7] * A[6*i+7]
  %add41 = add nsw i64 %mul, 7
  %arrayidx42 = getelementptr inbounds float, float* %A, i64 %add41
  %17 = load float, float* %arrayidx42, align 4
  %mul43 = fmul fast float %7, %17
  %add44 = fadd fast float %add38, %mul43
  ; B[8] * A[6*i+8]
  %add47 = add nsw i64 %mul, 8
  %arrayidx48 = getelementptr inbounds float, float* %A, i64 %add47
  %18 = load float, float* %arrayidx48, align 4
  %mul49 = fmul fast float %8, %18
  %add50 = fadd fast float %add44, %mul49
  ; Fold this iteration's nine-term sum into the accumulator.
  %add51 = fadd fast float %sum.082, %add50
  %inc = add nsw i64 %i.083, 1
  %exitcond = icmp eq i64 %inc, %9
  br i1 %exitcond, label %for.cond.for.end_crit_edge, label %for.body

for.cond.for.end_crit_edge:
  ; Convert the final float sum to the i32 return value.
  %phitmp = fptosi float %add51 to i32
  br label %for.end

for.end:
  %sum.0.lcssa = phi i32 [ %phitmp, %for.cond.for.end_crit_edge ], [ 0, %entry ]
  ret i32 %sum.0.lcssa
}

; int foo(float * restrict A, float * restrict B, int n) {
;   float sum = 0;
;   for (intptr_t i=0; i < n; ++i) {
;     sum += B[0]*A[i*4  ];
;     sum += B[1]*A[i*4+1];
;     sum += B[2]*A[i*4+2];
;     sum += B[3]*A[i*4+3];
;   }
;   return sum;
; }
;
; chain_red is the scalar IR for the C loop above. Unlike the earlier
; functions in this file, the loop-carried accumulator is the first operand of
; the 'fadd fast' chain: each of the four products B[k]*A[4*i+k] is added to
; the running sum in turn, and the last fadd feeds the accumulator phi.
; B[0..3] are loaded once in the loop preheader.
;
; NOTE(review): the directives below use the default CHECK prefix, but both
; run lines visible in this file pass an explicit --check-prefix (NOSTORE and
; STORE), so FileCheck does not evaluate them as written. Confirm whether they
; were meant to use the NOSTORE prefix.

; CHECK-LABEL: chain_red
; CHECK: fmul fast <4 x float>
; CHECK: shufflevector <4 x float>

define i32 @chain_red(float* noalias %A, float* noalias %B, i32 %n) {
entry:
  ; Enter the loop only when n > 0; otherwise for.end returns 0.
  %cmp41 = icmp sgt i32 %n, 0
  br i1 %cmp41, label %for.body.lr.ph, label %for.end

for.body.lr.ph:
  ; Load the four B coefficients once, outside the loop.
  %0 = load float, float* %B, align 4
  %arrayidx4 = getelementptr inbounds float, float* %B, i64 1
  %1 = load float, float* %arrayidx4, align 4
  %arrayidx10 = getelementptr inbounds float, float* %B, i64 2
  %2 = load float, float* %arrayidx10, align 4
  %arrayidx16 = getelementptr inbounds float, float* %B, i64 3
  %3 = load float, float* %arrayidx16, align 4
  ; Sign-extend n once so it can be compared with the i64 loop counter.
  %4 = sext i32 %n to i64
  br label %for.body

for.body:
  ; %i.043 is the loop counter i; %sum.042 is the running sum (0.0 on entry).
  %i.043 = phi i64 [ 0, %for.body.lr.ph ], [ %inc, %for.body ]
  %sum.042 = phi float [ 0.000000e+00, %for.body.lr.ph ], [ %add21, %for.body ]
  ; %mul = 4*i. Its low two bits are zero, so the 'or' with 1, 2 and 3 below
  ; produces 4*i+1, 4*i+2 and 4*i+3.
  %mul = shl nsw i64 %i.043, 2
  ; sum += B[0] * A[4*i]
  %arrayidx2 = getelementptr inbounds float, float* %A, i64 %mul
  %5 = load float, float* %arrayidx2, align 4
  %mul3 = fmul fast float %0, %5
  %add = fadd fast float %sum.042, %mul3
  ; sum += B[1] * A[4*i+1]
  %add638 = or i64 %mul, 1
  %arrayidx7 = getelementptr inbounds float, float* %A, i64 %add638
  %6 = load float, float* %arrayidx7, align 4
  %mul8 = fmul fast float %1, %6
  %add9 = fadd fast float %add, %mul8
  ; sum += B[2] * A[4*i+2]
  %add1239 = or i64 %mul, 2
  %arrayidx13 = getelementptr inbounds float, float* %A, i64 %add1239
  %7 = load float, float* %arrayidx13, align 4
  %mul14 = fmul fast float %2, %7
  %add15 = fadd fast float %add9, %mul14
  ; sum += B[3] * A[4*i+3]
  %add1840 = or i64 %mul, 3
  %arrayidx19 = getelementptr inbounds float, float* %A, i64 %add1840
  %8 = load float, float* %arrayidx19, align 4
  %mul20 = fmul fast float %3, %8
  %add21 = fadd fast float %add15, %mul20
  %inc = add nsw i64 %i.043, 1
  %exitcond = icmp eq i64 %inc, %4
  br i1 %exitcond, label %for.cond.for.end_crit_edge, label %for.body

for.cond.for.end_crit_edge:
  ; Convert the final float sum to the i32 return value.
  %phitmp = fptosi float %add21 to i32
  br label %for.end

for.end:
  %sum.0.lcssa = phi i32 [ %phitmp, %for.cond.for.end_crit_edge ], [ 0, %entry ]
  ret i32 %sum.0.lcssa
}

; int foo(float * restrict A, float * restrict B, float * restrict C, int n) {
;   float sum = 0;
;   for (intptr_t i=0; i < n; ++i) {
;     C[i] = B[0] *A[i*4  ] +
;          B[1] *A[i*4+1] +
;          B[2] *A[i*4+2] +
;          B[3] *A[i*4+3];
;   }
;   return sum;
; }
;
; store_red is the scalar IR for the C loop above. Here the four-term sum is
; not accumulated across iterations; it is stored to C on every iteration.
; Only the addresses of B[1..3] are computed in the preheader; the loads of
; B[0..3] themselves happen inside the loop body. C is walked with a pointer
; phi that advances by one float per iteration. The function always returns 0.
;
; NOTE(review): the directives below use the default CHECK prefix, but both
; run lines visible in this file pass an explicit --check-prefix (NOSTORE and
; STORE), so FileCheck does not evaluate them as written. Confirm which prefix
; they were meant to use.

; CHECK-LABEL: store_red
; CHECK: fmul fast <4 x float>
; CHECK: shufflevector <4 x float>

define i32 @store_red(float* noalias %A, float* noalias %B, float* noalias %C, i32 %n) {
entry:
  ; Enter the loop only when n > 0; otherwise go straight to for.end.
  %cmp37 = icmp sgt i32 %n, 0
  br i1 %cmp37, label %for.body.lr.ph, label %for.end

for.body.lr.ph:
  ; Addresses of B[1], B[2] and B[3]; the loads are done in the loop body.
  %arrayidx4 = getelementptr inbounds float, float* %B, i64 1
  %arrayidx9 = getelementptr inbounds float, float* %B, i64 2
  %arrayidx15 = getelementptr inbounds float, float* %B, i64 3
  ; Sign-extend n once so it can be compared with the i64 loop counter.
  %0 = sext i32 %n to i64
  br label %for.body

for.body:
  ; %i.039 is the loop counter i; %C.addr.038 is the current output slot in C.
  %i.039 = phi i64 [ 0, %for.body.lr.ph ], [ %inc, %for.body ]
  %C.addr.038 = phi float* [ %C, %for.body.lr.ph ], [ %incdec.ptr, %for.body ]
  ; B[0] * A[4*i]
  %1 = load float, float* %B, align 4
  ; %mul = 4*i. Its low two bits are zero, so the 'or' with 1, 2 and 3 below
  ; produces 4*i+1, 4*i+2 and 4*i+3.
  %mul = shl nsw i64 %i.039, 2
  %arrayidx2 = getelementptr inbounds float, float* %A, i64 %mul
  %2 = load float, float* %arrayidx2, align 4
  %mul3 = fmul fast float %1, %2
  ; B[1] * A[4*i+1]
  %3 = load float, float* %arrayidx4, align 4
  %add34 = or i64 %mul, 1
  %arrayidx6 = getelementptr inbounds float, float* %A, i64 %add34
  %4 = load float, float* %arrayidx6, align 4
  %mul7 = fmul fast float %3, %4
  %add8 = fadd fast float %mul3, %mul7
  ; B[2] * A[4*i+2]
  %5 = load float, float* %arrayidx9, align 4
  %add1135 = or i64 %mul, 2
  %arrayidx12 = getelementptr inbounds float, float* %A, i64 %add1135
  %6 = load float, float* %arrayidx12, align 4
  %mul13 = fmul fast float %5, %6
  %add14 = fadd fast float %add8, %mul13
  ; B[3] * A[4*i+3]
  %7 = load float, float* %arrayidx15, align 4
  %add1736 = or i64 %mul, 3
  %arrayidx18 = getelementptr inbounds float, float* %A, i64 %add1736
  %8 = load float, float* %arrayidx18, align 4
  %mul19 = fmul fast float %7, %8
  %add20 = fadd fast float %add14, %mul19
  ; Store the four-term sum and advance the output pointer by one float.
  store float %add20, float* %C.addr.038, align 4
  %incdec.ptr = getelementptr inbounds float, float* %C.addr.038, i64 1
  %inc = add nsw i64 %i.039, 1
  %exitcond = icmp eq i64 %inc, %0
  br i1 %exitcond, label %for.end, label %for.body

for.end:
  ret i32 0
}


; RUN: opt -slp-vectorizer -slp-vectorize-hor -slp-vectorize-hor-store -S <  %s -mtriple=x86_64-apple-macosx -mcpu=corei7-avx | FileCheck %s --check-prefix=STORE

; void foo(double * restrict A, double * restrict B, double * restrict C,
;          int n) {
;   for (intptr_t i=0; i < n; ++i) {
;     C[i] = B[0] *A[i*4  ] + B[1] *A[i*4+1];
;   }
; }
;
; store_red_double is the scalar IR for the C loop above: a two-term sum of
; double products that is stored to C[i] on every iteration. B[0] and B[1] are
; loaded once in the loop preheader. The directives below are enabled by the
; STORE prefix of the run line above (which also passes
; -slp-vectorize-hor-store) and expect a <2 x double> 'fmul fast' followed by
; two <2 x double> extractelement instructions.

; STORE-LABEL: store_red_double
; STORE: fmul fast <2 x double>
; STORE: extractelement <2 x double>
; STORE: extractelement <2 x double>

define void @store_red_double(double* noalias %A, double* noalias %B, double* noalias %C, i32 %n) {
entry:
  ; Enter the loop only when n > 0; otherwise return immediately.
  %cmp17 = icmp sgt i32 %n, 0
  br i1 %cmp17, label %for.body.lr.ph, label %for.end

for.body.lr.ph:
  ; Load B[0] and B[1] once, outside the loop.
  %0 = load double, double* %B, align 8
  %arrayidx4 = getelementptr inbounds double, double* %B, i64 1
  %1 = load double, double* %arrayidx4, align 8
  ; Sign-extend n once so it can be compared with the i64 loop counter.
  %2 = sext i32 %n to i64
  br label %for.body

for.body:
  ; %i.018 is the loop counter i.
  %i.018 = phi i64 [ 0, %for.body.lr.ph ], [ %inc, %for.body ]
  ; %mul = 4*i. Its low bits are zero, so the 'or' with 1 below yields 4*i+1.
  %mul = shl nsw i64 %i.018, 2
  ; B[0] * A[4*i]
  %arrayidx2 = getelementptr inbounds double, double* %A, i64 %mul
  %3 = load double, double* %arrayidx2, align 8
  %mul3 = fmul fast double %0, %3
  ; B[1] * A[4*i+1]
  %add16 = or i64 %mul, 1
  %arrayidx6 = getelementptr inbounds double, double* %A, i64 %add16
  %4 = load double, double* %arrayidx6, align 8
  %mul7 = fmul fast double %1, %4
  %add8 = fadd fast double %mul3, %mul7
  ; C[i] = two-term sum
  %arrayidx9 = getelementptr inbounds double, double* %C, i64 %i.018
  store double %add8, double* %arrayidx9, align 8
  %inc = add nsw i64 %i.018, 1
  %exitcond = icmp eq i64 %inc, %2
  br i1 %exitcond, label %for.end, label %for.body

for.end:
  ret void
}
