1; RUN: opt < %s -basicaa -slp-vectorizer -slp-threshold=-999 -dce -S -mtriple=x86_64-apple-macosx10.8.0 -mcpu=corei7-avx | FileCheck %s
2
3target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
4target triple = "x86_64-apple-macosx10.8.0"
5
6declare double @llvm.fabs.f64(double) nounwind readnone
7
; vec_fabs_f64: c[i] = fabs(a[i] * b[i]) for the two adjacent lanes i = 0, 1.
; The directives below expect both lanes to be fused: two <2 x double> loads,
; one <2 x double> fabs call and one <2 x double> store.
; CHECK-LABEL: @vec_fabs_f64(
; CHECK: load <2 x double>
; CHECK: load <2 x double>
; CHECK: call <2 x double> @llvm.fabs.v2f64
; CHECK: store <2 x double>
; CHECK: ret
define void @vec_fabs_f64(double* %a, double* %b, double* %c) {
entry:
  ; Lane 0: |a[0] * b[0]|
  %a.lane0 = load double, double* %a, align 8
  %b.lane0 = load double, double* %b, align 8
  %prod.lane0 = fmul double %a.lane0, %b.lane0
  %abs.lane0 = tail call double @llvm.fabs.f64(double %prod.lane0) nounwind readnone
  ; Lane 1: |a[1] * b[1]|
  %a.ptr1 = getelementptr inbounds double, double* %a, i64 1
  %a.lane1 = load double, double* %a.ptr1, align 8
  %b.ptr1 = getelementptr inbounds double, double* %b, i64 1
  %b.lane1 = load double, double* %b.ptr1, align 8
  %prod.lane1 = fmul double %a.lane1, %b.lane1
  %abs.lane1 = tail call double @llvm.fabs.f64(double %prod.lane1) nounwind readnone
  ; Both results are written back to back into c[0] and c[1].
  store double %abs.lane0, double* %c, align 8
  %c.ptr1 = getelementptr inbounds double, double* %c, i64 1
  store double %abs.lane1, double* %c.ptr1, align 8
  ret void
}
31
32declare float @llvm.copysign.f32(float, float) nounwind readnone
33
; vec_copysign_f32: c[i] = copysign(a[i], b[i]) for four adjacent lanes
; i = 0..3, with each lane's store issued right after its call. %c is marked
; noalias. The directives below expect a single <4 x float> copysign call fed
; by two <4 x float> loads and followed by one <4 x float> store.
; CHECK-LABEL: @vec_copysign_f32(
; CHECK: load <4 x float>
; CHECK: load <4 x float>
; CHECK: call <4 x float> @llvm.copysign.v4f32
; CHECK: store <4 x float>
; CHECK: ret
define void @vec_copysign_f32(float* %a, float* %b, float* noalias %c) {
entry:
  ; Lane 0
  %mag0 = load float, float* %a, align 4
  %sgn0 = load float, float* %b, align 4
  %res0 = tail call float @llvm.copysign.f32(float %mag0, float %sgn0) nounwind readnone
  store float %res0, float* %c, align 4

  ; Lane 1
  %a.ptr1 = getelementptr inbounds float, float* %a, i64 1
  %mag1 = load float, float* %a.ptr1, align 4
  %b.ptr1 = getelementptr inbounds float, float* %b, i64 1
  %sgn1 = load float, float* %b.ptr1, align 4
  %res1 = tail call float @llvm.copysign.f32(float %mag1, float %sgn1) nounwind readnone
  %c.ptr1 = getelementptr inbounds float, float* %c, i64 1
  store float %res1, float* %c.ptr1, align 4

  ; Lane 2
  %a.ptr2 = getelementptr inbounds float, float* %a, i64 2
  %mag2 = load float, float* %a.ptr2, align 4
  %b.ptr2 = getelementptr inbounds float, float* %b, i64 2
  %sgn2 = load float, float* %b.ptr2, align 4
  %res2 = tail call float @llvm.copysign.f32(float %mag2, float %sgn2) nounwind readnone
  %c.ptr2 = getelementptr inbounds float, float* %c, i64 2
  store float %res2, float* %c.ptr2, align 4

  ; Lane 3
  %a.ptr3 = getelementptr inbounds float, float* %a, i64 3
  %mag3 = load float, float* %a.ptr3, align 4
  %b.ptr3 = getelementptr inbounds float, float* %b, i64 3
  %sgn3 = load float, float* %b.ptr3, align 4
  %res3 = tail call float @llvm.copysign.f32(float %mag3, float %sgn3) nounwind readnone
  %c.ptr3 = getelementptr inbounds float, float* %c, i64 3
  store float %res3, float* %c.ptr3, align 4

  ret void
}
73
74declare i32 @llvm.bswap.i32(i32) nounwind readnone
75
; vec_bswap_i32: c[i] = bswap(a[i] + b[i]) for four adjacent i32 lanes
; i = 0..3; all four results are stored at the end of the block. The
; directives below expect one <4 x i32> bswap call between two <4 x i32>
; loads and one <4 x i32> store.
; CHECK-LABEL: @vec_bswap_i32(
; CHECK: load <4 x i32>
; CHECK: load <4 x i32>
; CHECK: call <4 x i32> @llvm.bswap.v4i32
; CHECK: store <4 x i32>
; CHECK: ret
define void @vec_bswap_i32(i32* %a, i32* %b, i32* %c) {
entry:
  ; Lane 0
  %a.val0 = load i32, i32* %a, align 4
  %b.val0 = load i32, i32* %b, align 4
  %sum0 = add i32 %a.val0, %b.val0
  %swapped0 = tail call i32 @llvm.bswap.i32(i32 %sum0) nounwind readnone

  ; Lane 1
  %a.ptr1 = getelementptr inbounds i32, i32* %a, i32 1
  %a.val1 = load i32, i32* %a.ptr1, align 4
  %b.ptr1 = getelementptr inbounds i32, i32* %b, i32 1
  %b.val1 = load i32, i32* %b.ptr1, align 4
  %sum1 = add i32 %a.val1, %b.val1
  %swapped1 = tail call i32 @llvm.bswap.i32(i32 %sum1) nounwind readnone

  ; Lane 2
  %a.ptr2 = getelementptr inbounds i32, i32* %a, i32 2
  %a.val2 = load i32, i32* %a.ptr2, align 4
  %b.ptr2 = getelementptr inbounds i32, i32* %b, i32 2
  %b.val2 = load i32, i32* %b.ptr2, align 4
  %sum2 = add i32 %a.val2, %b.val2
  %swapped2 = tail call i32 @llvm.bswap.i32(i32 %sum2) nounwind readnone

  ; Lane 3
  %a.ptr3 = getelementptr inbounds i32, i32* %a, i32 3
  %a.val3 = load i32, i32* %a.ptr3, align 4
  %b.ptr3 = getelementptr inbounds i32, i32* %b, i32 3
  %b.val3 = load i32, i32* %b.ptr3, align 4
  %sum3 = add i32 %a.val3, %b.val3
  %swapped3 = tail call i32 @llvm.bswap.i32(i32 %sum3) nounwind readnone

  ; Write c[0..3].
  store i32 %swapped0, i32* %c, align 4
  %c.ptr1 = getelementptr inbounds i32, i32* %c, i32 1
  store i32 %swapped1, i32* %c.ptr1, align 4
  %c.ptr2 = getelementptr inbounds i32, i32* %c, i32 2
  store i32 %swapped2, i32* %c.ptr2, align 4
  %c.ptr3 = getelementptr inbounds i32, i32* %c, i32 3
  store i32 %swapped3, i32* %c.ptr3, align 4
  ret void
}
120
121declare i32 @llvm.ctlz.i32(i32,i1) nounwind readnone
122
; vec_ctlz_i32: c[i] = ctlz(a[i] + b[i], true) for four adjacent i32 lanes
; i = 0..3. Every lane passes the same i1 flag (true). The trailing unnamed
; i1 parameter is not used by the body. The directives below expect one
; <4 x i32> ctlz call between two <4 x i32> loads and one <4 x i32> store.
; CHECK-LABEL: @vec_ctlz_i32(
; CHECK: load <4 x i32>
; CHECK: load <4 x i32>
; CHECK: call <4 x i32> @llvm.ctlz.v4i32
; CHECK: store <4 x i32>
; CHECK: ret
define void @vec_ctlz_i32(i32* %a, i32* %b, i32* %c, i1) {
entry:
  ; Lane 0
  %a.val0 = load i32, i32* %a, align 4
  %b.val0 = load i32, i32* %b, align 4
  %sum0 = add i32 %a.val0, %b.val0
  %lz0 = tail call i32 @llvm.ctlz.i32(i32 %sum0, i1 true) nounwind readnone

  ; Lane 1
  %a.ptr1 = getelementptr inbounds i32, i32* %a, i32 1
  %a.val1 = load i32, i32* %a.ptr1, align 4
  %b.ptr1 = getelementptr inbounds i32, i32* %b, i32 1
  %b.val1 = load i32, i32* %b.ptr1, align 4
  %sum1 = add i32 %a.val1, %b.val1
  %lz1 = tail call i32 @llvm.ctlz.i32(i32 %sum1, i1 true) nounwind readnone

  ; Lane 2
  %a.ptr2 = getelementptr inbounds i32, i32* %a, i32 2
  %a.val2 = load i32, i32* %a.ptr2, align 4
  %b.ptr2 = getelementptr inbounds i32, i32* %b, i32 2
  %b.val2 = load i32, i32* %b.ptr2, align 4
  %sum2 = add i32 %a.val2, %b.val2
  %lz2 = tail call i32 @llvm.ctlz.i32(i32 %sum2, i1 true) nounwind readnone

  ; Lane 3
  %a.ptr3 = getelementptr inbounds i32, i32* %a, i32 3
  %a.val3 = load i32, i32* %a.ptr3, align 4
  %b.ptr3 = getelementptr inbounds i32, i32* %b, i32 3
  %b.val3 = load i32, i32* %b.ptr3, align 4
  %sum3 = add i32 %a.val3, %b.val3
  %lz3 = tail call i32 @llvm.ctlz.i32(i32 %sum3, i1 true) nounwind readnone

  ; Write c[0..3].
  store i32 %lz0, i32* %c, align 4
  %c.ptr1 = getelementptr inbounds i32, i32* %c, i32 1
  store i32 %lz1, i32* %c.ptr1, align 4
  %c.ptr2 = getelementptr inbounds i32, i32* %c, i32 2
  store i32 %lz2, i32* %c.ptr2, align 4
  %c.ptr3 = getelementptr inbounds i32, i32* %c, i32 3
  store i32 %lz3, i32* %c.ptr3, align 4
  ret void
}
167
; vec_ctlz_i32_neg: negative counterpart of the four-lane ctlz test. The
; arithmetic is the same, c[i] = ctlz(a[i] + b[i], flag), but the i1 flag
; alternates per lane (true, false, true, false). The directive below
; requires that no <4 x i32> ctlz call is produced for this function.
; NOTE(review): presumably the mismatching flag operand is what must block
; the vector call - confirm against the vectorizer's intrinsic handling.
; CHECK-LABEL: @vec_ctlz_i32_neg(
; CHECK-NOT: call <4 x i32> @llvm.ctlz.v4i32
define void @vec_ctlz_i32_neg(i32* %a, i32* %b, i32* %c, i1) {
entry:
  ; Lane 0 (flag = true)
  %a.val0 = load i32, i32* %a, align 4
  %b.val0 = load i32, i32* %b, align 4
  %sum0 = add i32 %a.val0, %b.val0
  %lz0 = tail call i32 @llvm.ctlz.i32(i32 %sum0, i1 true) nounwind readnone

  ; Lane 1 (flag = false)
  %a.ptr1 = getelementptr inbounds i32, i32* %a, i32 1
  %a.val1 = load i32, i32* %a.ptr1, align 4
  %b.ptr1 = getelementptr inbounds i32, i32* %b, i32 1
  %b.val1 = load i32, i32* %b.ptr1, align 4
  %sum1 = add i32 %a.val1, %b.val1
  %lz1 = tail call i32 @llvm.ctlz.i32(i32 %sum1, i1 false) nounwind readnone

  ; Lane 2 (flag = true)
  %a.ptr2 = getelementptr inbounds i32, i32* %a, i32 2
  %a.val2 = load i32, i32* %a.ptr2, align 4
  %b.ptr2 = getelementptr inbounds i32, i32* %b, i32 2
  %b.val2 = load i32, i32* %b.ptr2, align 4
  %sum2 = add i32 %a.val2, %b.val2
  %lz2 = tail call i32 @llvm.ctlz.i32(i32 %sum2, i1 true) nounwind readnone

  ; Lane 3 (flag = false)
  %a.ptr3 = getelementptr inbounds i32, i32* %a, i32 3
  %a.val3 = load i32, i32* %a.ptr3, align 4
  %b.ptr3 = getelementptr inbounds i32, i32* %b, i32 3
  %b.val3 = load i32, i32* %b.ptr3, align 4
  %sum3 = add i32 %a.val3, %b.val3
  %lz3 = tail call i32 @llvm.ctlz.i32(i32 %sum3, i1 false) nounwind readnone

  ; Write c[0..3].
  store i32 %lz0, i32* %c, align 4
  %c.ptr1 = getelementptr inbounds i32, i32* %c, i32 1
  store i32 %lz1, i32* %c.ptr1, align 4
  %c.ptr2 = getelementptr inbounds i32, i32* %c, i32 2
  store i32 %lz2, i32* %c.ptr2, align 4
  %c.ptr3 = getelementptr inbounds i32, i32* %c, i32 3
  store i32 %lz3, i32* %c.ptr3, align 4
  ret void
}
209
210
211declare i32 @llvm.cttz.i32(i32,i1) nounwind readnone
212
; vec_cttz_i32: c[i] = cttz(a[i] + b[i], true) for four adjacent i32 lanes
; i = 0..3. Every lane passes the same i1 flag (true). The trailing unnamed
; i1 parameter is not used by the body. The directives below expect one
; <4 x i32> cttz call between two <4 x i32> loads and one <4 x i32> store.
; CHECK-LABEL: @vec_cttz_i32(
; CHECK: load <4 x i32>
; CHECK: load <4 x i32>
; CHECK: call <4 x i32> @llvm.cttz.v4i32
; CHECK: store <4 x i32>
; CHECK: ret
define void @vec_cttz_i32(i32* %a, i32* %b, i32* %c, i1) {
entry:
  ; Lane 0
  %a.val0 = load i32, i32* %a, align 4
  %b.val0 = load i32, i32* %b, align 4
  %sum0 = add i32 %a.val0, %b.val0
  %tz0 = tail call i32 @llvm.cttz.i32(i32 %sum0, i1 true) nounwind readnone

  ; Lane 1
  %a.ptr1 = getelementptr inbounds i32, i32* %a, i32 1
  %a.val1 = load i32, i32* %a.ptr1, align 4
  %b.ptr1 = getelementptr inbounds i32, i32* %b, i32 1
  %b.val1 = load i32, i32* %b.ptr1, align 4
  %sum1 = add i32 %a.val1, %b.val1
  %tz1 = tail call i32 @llvm.cttz.i32(i32 %sum1, i1 true) nounwind readnone

  ; Lane 2
  %a.ptr2 = getelementptr inbounds i32, i32* %a, i32 2
  %a.val2 = load i32, i32* %a.ptr2, align 4
  %b.ptr2 = getelementptr inbounds i32, i32* %b, i32 2
  %b.val2 = load i32, i32* %b.ptr2, align 4
  %sum2 = add i32 %a.val2, %b.val2
  %tz2 = tail call i32 @llvm.cttz.i32(i32 %sum2, i1 true) nounwind readnone

  ; Lane 3
  %a.ptr3 = getelementptr inbounds i32, i32* %a, i32 3
  %a.val3 = load i32, i32* %a.ptr3, align 4
  %b.ptr3 = getelementptr inbounds i32, i32* %b, i32 3
  %b.val3 = load i32, i32* %b.ptr3, align 4
  %sum3 = add i32 %a.val3, %b.val3
  %tz3 = tail call i32 @llvm.cttz.i32(i32 %sum3, i1 true) nounwind readnone

  ; Write c[0..3].
  store i32 %tz0, i32* %c, align 4
  %c.ptr1 = getelementptr inbounds i32, i32* %c, i32 1
  store i32 %tz1, i32* %c.ptr1, align 4
  %c.ptr2 = getelementptr inbounds i32, i32* %c, i32 2
  store i32 %tz2, i32* %c.ptr2, align 4
  %c.ptr3 = getelementptr inbounds i32, i32* %c, i32 3
  store i32 %tz3, i32* %c.ptr3, align 4
  ret void
}
257
; vec_cttz_i32_neg: negative counterpart of the four-lane cttz test. The
; arithmetic is the same, c[i] = cttz(a[i] + b[i], flag), but the i1 flag
; alternates per lane (true, false, true, false). The directive below
; requires that no <4 x i32> cttz call is produced for this function.
; NOTE(review): presumably the mismatching flag operand is what must block
; the vector call - confirm against the vectorizer's intrinsic handling.
; CHECK-LABEL: @vec_cttz_i32_neg(
; CHECK-NOT: call <4 x i32> @llvm.cttz.v4i32
define void @vec_cttz_i32_neg(i32* %a, i32* %b, i32* %c, i1) {
entry:
  ; Lane 0 (flag = true)
  %a.val0 = load i32, i32* %a, align 4
  %b.val0 = load i32, i32* %b, align 4
  %sum0 = add i32 %a.val0, %b.val0
  %tz0 = tail call i32 @llvm.cttz.i32(i32 %sum0, i1 true) nounwind readnone

  ; Lane 1 (flag = false)
  %a.ptr1 = getelementptr inbounds i32, i32* %a, i32 1
  %a.val1 = load i32, i32* %a.ptr1, align 4
  %b.ptr1 = getelementptr inbounds i32, i32* %b, i32 1
  %b.val1 = load i32, i32* %b.ptr1, align 4
  %sum1 = add i32 %a.val1, %b.val1
  %tz1 = tail call i32 @llvm.cttz.i32(i32 %sum1, i1 false) nounwind readnone

  ; Lane 2 (flag = true)
  %a.ptr2 = getelementptr inbounds i32, i32* %a, i32 2
  %a.val2 = load i32, i32* %a.ptr2, align 4
  %b.ptr2 = getelementptr inbounds i32, i32* %b, i32 2
  %b.val2 = load i32, i32* %b.ptr2, align 4
  %sum2 = add i32 %a.val2, %b.val2
  %tz2 = tail call i32 @llvm.cttz.i32(i32 %sum2, i1 true) nounwind readnone

  ; Lane 3 (flag = false)
  %a.ptr3 = getelementptr inbounds i32, i32* %a, i32 3
  %a.val3 = load i32, i32* %a.ptr3, align 4
  %b.ptr3 = getelementptr inbounds i32, i32* %b, i32 3
  %b.val3 = load i32, i32* %b.ptr3, align 4
  %sum3 = add i32 %a.val3, %b.val3
  %tz3 = tail call i32 @llvm.cttz.i32(i32 %sum3, i1 false) nounwind readnone

  ; Write c[0..3].
  store i32 %tz0, i32* %c, align 4
  %c.ptr1 = getelementptr inbounds i32, i32* %c, i32 1
  store i32 %tz1, i32* %c.ptr1, align 4
  %c.ptr2 = getelementptr inbounds i32, i32* %c, i32 2
  store i32 %tz2, i32* %c.ptr2, align 4
  %c.ptr3 = getelementptr inbounds i32, i32* %c, i32 3
  store i32 %tz3, i32* %c.ptr3, align 4
  ret void
}
298
299
300declare float @llvm.powi.f32(float, i32)
; vec_powi_f32: c[i] = powi(a[i] + b[i], P) for four adjacent float lanes
; i = 0..3. Every lane raises to the same exponent %P. The directives below
; expect one <4 x float> powi call between two <4 x float> loads and one
; <4 x float> store.
; CHECK-LABEL: @vec_powi_f32(
; CHECK: load <4 x float>
; CHECK: load <4 x float>
; CHECK: call <4 x float> @llvm.powi.v4f32
; CHECK: store <4 x float>
; CHECK: ret
define void @vec_powi_f32(float* %a, float* %b, float* %c, i32 %P) {
entry:
  ; Lane 0
  %a.val0 = load float, float* %a, align 4
  %b.val0 = load float, float* %b, align 4
  %sum0 = fadd float %a.val0, %b.val0
  %pow0 = tail call float @llvm.powi.f32(float %sum0, i32 %P) nounwind readnone

  ; Lane 1
  %a.ptr1 = getelementptr inbounds float, float* %a, i32 1
  %a.val1 = load float, float* %a.ptr1, align 4
  %b.ptr1 = getelementptr inbounds float, float* %b, i32 1
  %b.val1 = load float, float* %b.ptr1, align 4
  %sum1 = fadd float %a.val1, %b.val1
  %pow1 = tail call float @llvm.powi.f32(float %sum1, i32 %P) nounwind readnone

  ; Lane 2
  %a.ptr2 = getelementptr inbounds float, float* %a, i32 2
  %a.val2 = load float, float* %a.ptr2, align 4
  %b.ptr2 = getelementptr inbounds float, float* %b, i32 2
  %b.val2 = load float, float* %b.ptr2, align 4
  %sum2 = fadd float %a.val2, %b.val2
  %pow2 = tail call float @llvm.powi.f32(float %sum2, i32 %P) nounwind readnone

  ; Lane 3
  %a.ptr3 = getelementptr inbounds float, float* %a, i32 3
  %a.val3 = load float, float* %a.ptr3, align 4
  %b.ptr3 = getelementptr inbounds float, float* %b, i32 3
  %b.val3 = load float, float* %b.ptr3, align 4
  %sum3 = fadd float %a.val3, %b.val3
  %pow3 = tail call float @llvm.powi.f32(float %sum3, i32 %P) nounwind readnone

  ; Write c[0..3].
  store float %pow0, float* %c, align 4
  %c.ptr1 = getelementptr inbounds float, float* %c, i32 1
  store float %pow1, float* %c.ptr1, align 4
  %c.ptr2 = getelementptr inbounds float, float* %c, i32 2
  store float %pow2, float* %c.ptr2, align 4
  %c.ptr3 = getelementptr inbounds float, float* %c, i32 3
  store float %pow3, float* %c.ptr3, align 4
  ret void
}
345
346
; vec_powi_f32_neg: negative counterpart of the four-lane powi test. The
; arithmetic is the same, c[i] = powi(a[i] + b[i], exp), but the exponent
; alternates per lane (%P, %Q, %P, %Q). The directive below requires that
; no <4 x float> powi call is produced for this function.
; NOTE(review): presumably the differing exponent operand is what must block
; the vector call - confirm against the vectorizer's intrinsic handling.
; CHECK-LABEL: @vec_powi_f32_neg(
; CHECK-NOT: call <4 x float> @llvm.powi.v4f32
define void @vec_powi_f32_neg(float* %a, float* %b, float* %c, i32 %P, i32 %Q) {
entry:
  ; Lane 0 (exponent %P)
  %a.val0 = load float, float* %a, align 4
  %b.val0 = load float, float* %b, align 4
  %sum0 = fadd float %a.val0, %b.val0
  %pow0 = tail call float @llvm.powi.f32(float %sum0, i32 %P) nounwind readnone

  ; Lane 1 (exponent %Q)
  %a.ptr1 = getelementptr inbounds float, float* %a, i32 1
  %a.val1 = load float, float* %a.ptr1, align 4
  %b.ptr1 = getelementptr inbounds float, float* %b, i32 1
  %b.val1 = load float, float* %b.ptr1, align 4
  %sum1 = fadd float %a.val1, %b.val1
  %pow1 = tail call float @llvm.powi.f32(float %sum1, i32 %Q) nounwind readnone

  ; Lane 2 (exponent %P)
  %a.ptr2 = getelementptr inbounds float, float* %a, i32 2
  %a.val2 = load float, float* %a.ptr2, align 4
  %b.ptr2 = getelementptr inbounds float, float* %b, i32 2
  %b.val2 = load float, float* %b.ptr2, align 4
  %sum2 = fadd float %a.val2, %b.val2
  %pow2 = tail call float @llvm.powi.f32(float %sum2, i32 %P) nounwind readnone

  ; Lane 3 (exponent %Q)
  %a.ptr3 = getelementptr inbounds float, float* %a, i32 3
  %a.val3 = load float, float* %a.ptr3, align 4
  %b.ptr3 = getelementptr inbounds float, float* %b, i32 3
  %b.val3 = load float, float* %b.ptr3, align 4
  %sum3 = fadd float %a.val3, %b.val3
  %pow3 = tail call float @llvm.powi.f32(float %sum3, i32 %Q) nounwind readnone

  ; Write c[0..3].
  store float %pow0, float* %c, align 4
  %c.ptr1 = getelementptr inbounds float, float* %c, i32 1
  store float %pow1, float* %c.ptr1, align 4
  %c.ptr2 = getelementptr inbounds float, float* %c, i32 2
  store float %pow2, float* %c.ptr2, align 4
  %c.ptr3 = getelementptr inbounds float, float* %c, i32 3
  store float %pow3, float* %c.ptr3, align 4
  ret void
}
387