; RUN: opt -mtriple armv7-linux-gnueabihf -loop-vectorize -S %s -debug-only=loop-vectorize -o /dev/null 2>&1 | FileCheck %s --check-prefix=CHECK --check-prefix=LINUX
; RUN: opt -mtriple armv8-linux-gnu -loop-vectorize -S %s -debug-only=loop-vectorize -o /dev/null 2>&1 | FileCheck %s --check-prefix=CHECK --check-prefix=LINUX
; RUN: opt -mtriple armv8.1.m-none-eabi -mattr=+mve.fp -loop-vectorize -S %s -debug-only=loop-vectorize -o /dev/null 2>&1 | FileCheck %s --check-prefix=CHECK --check-prefix=MVE
; RUN: opt -mtriple armv7-unknown-darwin -loop-vectorize -S %s -debug-only=loop-vectorize -o /dev/null 2>&1 | FileCheck %s --check-prefix=CHECK --check-prefix=DARWIN
; REQUIRES: asserts

; Test the loop vectorizer's ability to tell when SIMD is safe or not with
; respect to the IEEE 754 standard.
; On Linux, we only want the vectorizer to kick in when the -ffast-math flag is
; set, because NEON is not IEEE compliant.
; Darwin, on the other hand, does not support subnormals, so all optimizations
; are allowed there, even without -ffast-math.
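; Note: assuming typical clang output (an assumption, not something this test
; asserts), -ffast-math materializes in the IR below as the "fast" flag on the
; floating-point instructions and calls of the *_fast functions, and as
; function attributes such as "no-nans-fp-math"="true" and
; "unsafe-fp-math"="true" (see attributes #2 at the end of the file).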

; Integer loops are always vectorizable
; CHECK: Checking a loop in "sumi"
; CHECK: We can vectorize this loop!
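; A minimal C sketch of the kind of source @sumi below could come from
; (illustrative only, not part of the original test):
;
;   void sumi(const int *A, const int *B, int *C, int N) {
;     for (int i = 0; i < N; i++)
;       C[i] = A[i] * B[i];
;   }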
define void @sumi(i32* noalias nocapture readonly %A, i32* noalias nocapture readonly %B, i32* noalias nocapture %C, i32 %N) {
entry:
  %cmp5 = icmp eq i32 %N, 0
  br i1 %cmp5, label %for.end, label %for.body.preheader

for.body.preheader:                               ; preds = %entry
  br label %for.body

for.body:                                         ; preds = %for.body.preheader, %for.body
  %i.06 = phi i32 [ %inc, %for.body ], [ 0, %for.body.preheader ]
  %arrayidx = getelementptr inbounds i32, i32* %A, i32 %i.06
  %0 = load i32, i32* %arrayidx, align 4
  %arrayidx1 = getelementptr inbounds i32, i32* %B, i32 %i.06
  %1 = load i32, i32* %arrayidx1, align 4
  %mul = mul nsw i32 %1, %0
  %arrayidx2 = getelementptr inbounds i32, i32* %C, i32 %i.06
  store i32 %mul, i32* %arrayidx2, align 4
  %inc = add nuw nsw i32 %i.06, 1
  %exitcond = icmp eq i32 %inc, %N
  br i1 %exitcond, label %for.end.loopexit, label %for.body

for.end.loopexit:                                 ; preds = %for.body
  br label %for.end

for.end:                                          ; preds = %for.end.loopexit, %entry
  ret void
}

; Floating-point loops need fast-math to be vectorizable
; LINUX: Checking a loop in "sumf"
; LINUX: Potentially unsafe FP op prevents vectorization
; MVE: Checking a loop in "sumf"
; MVE: We can vectorize this loop!
; DARWIN: Checking a loop in "sumf"
; DARWIN: We can vectorize this loop!
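; A minimal C sketch of the kind of source @sumf below could come from
; (illustrative only): the same kernel as sumi, but on floats, so strict IEEE
; semantics apply unless fast-math is enabled.
;
;   void sumf(const float *A, const float *B, float *C, int N) {
;     for (int i = 0; i < N; i++)
;       C[i] = A[i] * B[i];
;   }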
define void @sumf(float* noalias nocapture readonly %A, float* noalias nocapture readonly %B, float* noalias nocapture %C, i32 %N) {
entry:
  %cmp5 = icmp eq i32 %N, 0
  br i1 %cmp5, label %for.end, label %for.body.preheader

for.body.preheader:                               ; preds = %entry
  br label %for.body

for.body:                                         ; preds = %for.body.preheader, %for.body
  %i.06 = phi i32 [ %inc, %for.body ], [ 0, %for.body.preheader ]
  %arrayidx = getelementptr inbounds float, float* %A, i32 %i.06
  %0 = load float, float* %arrayidx, align 4
  %arrayidx1 = getelementptr inbounds float, float* %B, i32 %i.06
  %1 = load float, float* %arrayidx1, align 4
  %mul = fmul float %0, %1
  %arrayidx2 = getelementptr inbounds float, float* %C, i32 %i.06
  store float %mul, float* %arrayidx2, align 4
  %inc = add nuw nsw i32 %i.06, 1
  %exitcond = icmp eq i32 %inc, %N
  br i1 %exitcond, label %for.end.loopexit, label %for.body

for.end.loopexit:                                 ; preds = %for.body
  br label %for.end

for.end:                                          ; preds = %for.end.loopexit, %entry
  ret void
}

; Integer loops are always vectorizable
; CHECK: Checking a loop in "redi"
; CHECK: We can vectorize this loop!
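; A minimal C sketch of the kind of source @redi below could come from
; (illustrative only; note that the IR below seeds the reduction with undef
; rather than with an explicit initial value):
;
;   int redi(const int *a, const int *b, int N) {
;     int red = 0; // the IR uses undef as the incoming value instead
;     for (int i = 0; i < N; i++)
;       red += a[i] * b[i];
;     return red;
;   }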
define i32 @redi(i32* noalias nocapture readonly %a, i32* noalias nocapture readonly %b, i32 %N) {
entry:
  %cmp5 = icmp eq i32 %N, 0
  br i1 %cmp5, label %for.end, label %for.body.preheader

for.body.preheader:                               ; preds = %entry
  br label %for.body

for.body:                                         ; preds = %for.body.preheader, %for.body
  %i.07 = phi i32 [ %inc, %for.body ], [ 0, %for.body.preheader ]
  %Red.06 = phi i32 [ %add, %for.body ], [ undef, %for.body.preheader ]
  %arrayidx = getelementptr inbounds i32, i32* %a, i32 %i.07
  %0 = load i32, i32* %arrayidx, align 4
  %arrayidx1 = getelementptr inbounds i32, i32* %b, i32 %i.07
  %1 = load i32, i32* %arrayidx1, align 4
  %mul = mul nsw i32 %1, %0
  %add = add nsw i32 %mul, %Red.06
  %inc = add nuw nsw i32 %i.07, 1
  %exitcond = icmp eq i32 %inc, %N
  br i1 %exitcond, label %for.end.loopexit, label %for.body

for.end.loopexit:                                 ; preds = %for.body
  %add.lcssa = phi i32 [ %add, %for.body ]
  br label %for.end

for.end:                                          ; preds = %for.end.loopexit, %entry
  %Red.0.lcssa = phi i32 [ undef, %entry ], [ %add.lcssa, %for.end.loopexit ]
  ret i32 %Red.0.lcssa
}

; Floating-point loops need fast-math to be vectorizable
; LINUX: Checking a loop in "redf"
; LINUX: Potentially unsafe FP op prevents vectorization
; MVE: Checking a loop in "redf"
; MVE: We can vectorize this loop!
; DARWIN: Checking a loop in "redf"
; DARWIN: We can vectorize this loop!
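; A minimal C sketch of the kind of source @redf below could come from
; (illustrative only): a floating-point dot-product reduction, shown here
; without fast-math flags.
;
;   float redf(const float *a, const float *b, int N) {
;     float red = 0.0f; // the IR uses undef as the incoming value instead
;     for (int i = 0; i < N; i++)
;       red += a[i] * b[i];
;     return red;
;   }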
define float @redf(float* noalias nocapture readonly %a, float* noalias nocapture readonly %b, i32 %N) {
entry:
  %cmp5 = icmp eq i32 %N, 0
  br i1 %cmp5, label %for.end, label %for.body.preheader

for.body.preheader:                               ; preds = %entry
  br label %for.body

for.body:                                         ; preds = %for.body.preheader, %for.body
  %i.07 = phi i32 [ %inc, %for.body ], [ 0, %for.body.preheader ]
  %Red.06 = phi float [ %add, %for.body ], [ undef, %for.body.preheader ]
  %arrayidx = getelementptr inbounds float, float* %a, i32 %i.07
  %0 = load float, float* %arrayidx, align 4
  %arrayidx1 = getelementptr inbounds float, float* %b, i32 %i.07
  %1 = load float, float* %arrayidx1, align 4
  %mul = fmul float %0, %1
  %add = fadd float %Red.06, %mul
  %inc = add nuw nsw i32 %i.07, 1
  %exitcond = icmp eq i32 %inc, %N
  br i1 %exitcond, label %for.end.loopexit, label %for.body

for.end.loopexit:                                 ; preds = %for.body
  %add.lcssa = phi float [ %add, %for.body ]
  br label %for.end

for.end:                                          ; preds = %for.end.loopexit, %entry
  %Red.0.lcssa = phi float [ undef, %entry ], [ %add.lcssa, %for.end.loopexit ]
  ret float %Red.0.lcssa
}

; Make sure calls that turn into builtins are also covered
; LINUX: Checking a loop in "fabs"
; LINUX: Potentially unsafe FP op prevents vectorization
; DARWIN: Checking a loop in "fabs"
; DARWIN: We can vectorize this loop!
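; A minimal C sketch of the kind of source @fabs below could come from
; (illustrative only; the name fabs_kernel is made up here to avoid clashing
; with the libm fabs):
;
;   #include <math.h>
;   void fabs_kernel(const float *A, const float *B, float *C, int N) {
;     for (int i = 0; i < N; i++)
;       C[i] = A[i] * fabsf(B[i]);
;   }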
define void @fabs(float* noalias nocapture readonly %A, float* noalias nocapture readonly %B, float* noalias nocapture %C, i32 %N) {
entry:
  %cmp10 = icmp eq i32 %N, 0
  br i1 %cmp10, label %for.end, label %for.body

for.body:                                         ; preds = %entry, %for.body
  %i.011 = phi i32 [ %inc, %for.body ], [ 0, %entry ]
  %arrayidx = getelementptr inbounds float, float* %A, i32 %i.011
  %0 = load float, float* %arrayidx, align 4
  %arrayidx1 = getelementptr inbounds float, float* %B, i32 %i.011
  %1 = load float, float* %arrayidx1, align 4
  %fabsf = tail call float @fabsf(float %1) #1
  %conv3 = fmul float %0, %fabsf
  %arrayidx4 = getelementptr inbounds float, float* %C, i32 %i.011
  store float %conv3, float* %arrayidx4, align 4
  %inc = add nuw nsw i32 %i.011, 1
  %exitcond = icmp eq i32 %inc, %N
  br i1 %exitcond, label %for.end, label %for.body

for.end:                                          ; preds = %for.body, %entry
  ret void
}

; Integer loops are always vectorizable
; CHECK: Checking a loop in "sumi_fast"
; CHECK: We can vectorize this loop!
define void @sumi_fast(i32* noalias nocapture readonly %A, i32* noalias nocapture readonly %B, i32* noalias nocapture %C, i32 %N) {
entry:
  %cmp5 = icmp eq i32 %N, 0
  br i1 %cmp5, label %for.end, label %for.body.preheader

for.body.preheader:                               ; preds = %entry
  br label %for.body

for.body:                                         ; preds = %for.body.preheader, %for.body
  %i.06 = phi i32 [ %inc, %for.body ], [ 0, %for.body.preheader ]
  %arrayidx = getelementptr inbounds i32, i32* %A, i32 %i.06
  %0 = load i32, i32* %arrayidx, align 4
  %arrayidx1 = getelementptr inbounds i32, i32* %B, i32 %i.06
  %1 = load i32, i32* %arrayidx1, align 4
  %mul = mul nsw i32 %1, %0
  %arrayidx2 = getelementptr inbounds i32, i32* %C, i32 %i.06
  store i32 %mul, i32* %arrayidx2, align 4
  %inc = add nuw nsw i32 %i.06, 1
  %exitcond = icmp eq i32 %inc, %N
  br i1 %exitcond, label %for.end.loopexit, label %for.body

for.end.loopexit:                                 ; preds = %for.body
  br label %for.end

for.end:                                          ; preds = %for.end.loopexit, %entry
  ret void
}

; Floating-point loops can be vectorized with fast-math
; CHECK: Checking a loop in "sumf_fast"
; CHECK: We can vectorize this loop!
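; The @sumf_fast kernel below is the same as @sumf above; the only difference
; is the "fast" flag on the fmul (what -ffast-math would typically produce,
; stated here as an assumption), which is what allows vectorization on all of
; the targets exercised by this file.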
define void @sumf_fast(float* noalias nocapture readonly %A, float* noalias nocapture readonly %B, float* noalias nocapture %C, i32 %N) {
entry:
  %cmp5 = icmp eq i32 %N, 0
  br i1 %cmp5, label %for.end, label %for.body.preheader

for.body.preheader:                               ; preds = %entry
  br label %for.body

for.body:                                         ; preds = %for.body.preheader, %for.body
  %i.06 = phi i32 [ %inc, %for.body ], [ 0, %for.body.preheader ]
  %arrayidx = getelementptr inbounds float, float* %A, i32 %i.06
  %0 = load float, float* %arrayidx, align 4
  %arrayidx1 = getelementptr inbounds float, float* %B, i32 %i.06
  %1 = load float, float* %arrayidx1, align 4
  %mul = fmul fast float %1, %0
  %arrayidx2 = getelementptr inbounds float, float* %C, i32 %i.06
  store float %mul, float* %arrayidx2, align 4
  %inc = add nuw nsw i32 %i.06, 1
  %exitcond = icmp eq i32 %inc, %N
  br i1 %exitcond, label %for.end.loopexit, label %for.body

for.end.loopexit:                                 ; preds = %for.body
  br label %for.end

for.end:                                          ; preds = %for.end.loopexit, %entry
  ret void
}

; Integer loops are always vectorizable
; CHECK: Checking a loop in "redi_fast"
; CHECK: We can vectorize this loop!
define i32 @redi_fast(i32* noalias nocapture readonly %a, i32* noalias nocapture readonly %b, i32 %N) {
entry:
  %cmp5 = icmp eq i32 %N, 0
  br i1 %cmp5, label %for.end, label %for.body.preheader

for.body.preheader:                               ; preds = %entry
  br label %for.body

for.body:                                         ; preds = %for.body.preheader, %for.body
  %i.07 = phi i32 [ %inc, %for.body ], [ 0, %for.body.preheader ]
  %Red.06 = phi i32 [ %add, %for.body ], [ undef, %for.body.preheader ]
  %arrayidx = getelementptr inbounds i32, i32* %a, i32 %i.07
  %0 = load i32, i32* %arrayidx, align 4
  %arrayidx1 = getelementptr inbounds i32, i32* %b, i32 %i.07
  %1 = load i32, i32* %arrayidx1, align 4
  %mul = mul nsw i32 %1, %0
  %add = add nsw i32 %mul, %Red.06
  %inc = add nuw nsw i32 %i.07, 1
  %exitcond = icmp eq i32 %inc, %N
  br i1 %exitcond, label %for.end.loopexit, label %for.body

for.end.loopexit:                                 ; preds = %for.body
  %add.lcssa = phi i32 [ %add, %for.body ]
  br label %for.end

for.end:                                          ; preds = %for.end.loopexit, %entry
  %Red.0.lcssa = phi i32 [ undef, %entry ], [ %add.lcssa, %for.end.loopexit ]
  ret i32 %Red.0.lcssa
}

; Floating-point loops can be vectorized with fast-math
; CHECK: Checking a loop in "redf_fast"
; CHECK: We can vectorize this loop!
define float @redf_fast(float* noalias nocapture readonly %a, float* noalias nocapture readonly %b, i32 %N) {
entry:
  %cmp5 = icmp eq i32 %N, 0
  br i1 %cmp5, label %for.end, label %for.body.preheader

for.body.preheader:                               ; preds = %entry
  br label %for.body

for.body:                                         ; preds = %for.body.preheader, %for.body
  %i.07 = phi i32 [ %inc, %for.body ], [ 0, %for.body.preheader ]
  %Red.06 = phi float [ %add, %for.body ], [ undef, %for.body.preheader ]
  %arrayidx = getelementptr inbounds float, float* %a, i32 %i.07
  %0 = load float, float* %arrayidx, align 4
  %arrayidx1 = getelementptr inbounds float, float* %b, i32 %i.07
  %1 = load float, float* %arrayidx1, align 4
  %mul = fmul fast float %1, %0
  %add = fadd fast float %mul, %Red.06
  %inc = add nuw nsw i32 %i.07, 1
  %exitcond = icmp eq i32 %inc, %N
  br i1 %exitcond, label %for.end.loopexit, label %for.body

for.end.loopexit:                                 ; preds = %for.body
  %add.lcssa = phi float [ %add, %for.body ]
  br label %for.end

for.end:                                          ; preds = %for.end.loopexit, %entry
  %Red.0.lcssa = phi float [ undef, %entry ], [ %add.lcssa, %for.end.loopexit ]
  ret float %Red.0.lcssa
}

; Make sure calls that turn into builtins are also covered
; CHECK: Checking a loop in "fabs_fast"
; CHECK: We can vectorize this loop!
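; In @fabs_fast below, unlike @fabs above, the call carries the "fast" flag and
; references attributes #2, which sets "no-nans-fp-math", "no-infs-fp-math" and
; "unsafe-fp-math" to "true"; that is the difference that lets the vectorizer
; accept the fabsf libcall here on every target.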
define void @fabs_fast(float* noalias nocapture readonly %A, float* noalias nocapture readonly %B, float* noalias nocapture %C, i32 %N) {
entry:
  %cmp10 = icmp eq i32 %N, 0
  br i1 %cmp10, label %for.end, label %for.body

for.body:                                         ; preds = %entry, %for.body
  %i.011 = phi i32 [ %inc, %for.body ], [ 0, %entry ]
  %arrayidx = getelementptr inbounds float, float* %A, i32 %i.011
  %0 = load float, float* %arrayidx, align 4
  %arrayidx1 = getelementptr inbounds float, float* %B, i32 %i.011
  %1 = load float, float* %arrayidx1, align 4
  %fabsf = tail call fast float @fabsf(float %1) #2
  %conv3 = fmul fast float %fabsf, %0
  %arrayidx4 = getelementptr inbounds float, float* %C, i32 %i.011
  store float %conv3, float* %arrayidx4, align 4
  %inc = add nuw nsw i32 %i.011, 1
  %exitcond = icmp eq i32 %inc, %N
  br i1 %exitcond, label %for.end, label %for.body

for.end:                                          ; preds = %for.body, %entry
  ret void
}

declare float @fabsf(float)

attributes #1 = { nounwind readnone "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="cortex-a8" "target-features"="+dsp,+neon,+vfp3" "unsafe-fp-math"="false" "use-soft-float"="false" }
attributes #2 = { nounwind readnone "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="true" "no-nans-fp-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="cortex-a8" "target-features"="+dsp,+neon,+vfp3" "unsafe-fp-math"="true" "use-soft-float"="false" }