; SLP vectorizer regression test for "tiny" trees: straight-line groups of
; loads feeding stores (or a single scalar feeding several stores) with no
; arithmetic in between. The module is piped through opt with the SLP
; vectorizer enabled and the printed IR is matched by FileCheck.
target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
target triple = "x86_64-apple-macosx10.8.0"
; RUN: opt < %s -basicaa -slp-vectorizer -S -mtriple=x86_64-apple-macosx10.8.0 -mcpu=corei7 | FileCheck %s


; Each loop iteration copies two doubles from %src to %dst. Both loads and
; both stores address adjacent elements (offsets 0 and 1), so the whole
; load -> store tree is expected to collapse into one <2 x double> load and
; one <2 x double> store.
; The label is anchored on "@name(" because the bare name is also a prefix of
; the next function's name.
; CHECK-LABEL: @tiny_tree_fully_vectorizable(
; CHECK: load <2 x double>
; CHECK: store <2 x double>
; CHECK: ret

define void @tiny_tree_fully_vectorizable(double* noalias nocapture %dst, double* noalias nocapture readonly %src, i64 %count) #0 {
entry:
  ; Skip the loop entirely when %count is zero.
  %cmp12 = icmp eq i64 %count, 0
  br i1 %cmp12, label %for.end, label %for.body

for.body:                                         ; preds = %entry, %for.body
  %i.015 = phi i64 [ %inc, %for.body ], [ 0, %entry ]
  %dst.addr.014 = phi double* [ %add.ptr4, %for.body ], [ %dst, %entry ]
  %src.addr.013 = phi double* [ %add.ptr, %for.body ], [ %src, %entry ]
  ; Element 0: src[0] -> dst[0].
  %0 = load double, double* %src.addr.013, align 8
  store double %0, double* %dst.addr.014, align 8
  ; Element 1: src[1] -> dst[1], adjacent to element 0 on both sides.
  %arrayidx2 = getelementptr inbounds double, double* %src.addr.013, i64 1
  %1 = load double, double* %arrayidx2, align 8
  %arrayidx3 = getelementptr inbounds double, double* %dst.addr.014, i64 1
  store double %1, double* %arrayidx3, align 8
  ; Both cursors move forward by the current induction value, not by a
  ; constant stride.
  %add.ptr = getelementptr inbounds double, double* %src.addr.013, i64 %i.015
  %add.ptr4 = getelementptr inbounds double, double* %dst.addr.014, i64 %i.015
  %inc = add i64 %i.015, 1
  %exitcond = icmp eq i64 %inc, %count
  br i1 %exitcond, label %for.end, label %for.body

for.end:                                          ; preds = %for.body, %entry
  ret void
}

; Float variant of the fully vectorizable case: each iteration copies four
; floats whose source and destination offsets are 0, 1, 2 and 3, so the
; expected output is one <4 x float> load and one <4 x float> store.
; CHECK-LABEL: @tiny_tree_fully_vectorizable2(
; CHECK: load <4 x float>
; CHECK: store <4 x float>
; CHECK: ret

define void @tiny_tree_fully_vectorizable2(float* noalias nocapture %dst, float* noalias nocapture readonly %src, i64 %count) #0 {
entry:
  ; Skip the loop entirely when %count is zero.
  %cmp20 = icmp eq i64 %count, 0
  br i1 %cmp20, label %for.end, label %for.body

for.body:                                         ; preds = %entry, %for.body
  %i.023 = phi i64 [ %inc, %for.body ], [ 0, %entry ]
  %dst.addr.022 = phi float* [ %add.ptr8, %for.body ], [ %dst, %entry ]
  %src.addr.021 = phi float* [ %add.ptr, %for.body ], [ %src, %entry ]
  ; Element 0: src[0] -> dst[0].
  %0 = load float, float* %src.addr.021, align 4
  store float %0, float* %dst.addr.022, align 4
  ; Element 1: src[1] -> dst[1].
  %arrayidx2 = getelementptr inbounds float, float* %src.addr.021, i64 1
  %1 = load float, float* %arrayidx2, align 4
  %arrayidx3 = getelementptr inbounds float, float* %dst.addr.022, i64 1
  store float %1, float* %arrayidx3, align 4
  ; Element 2: src[2] -> dst[2].
  %arrayidx4 = getelementptr inbounds float, float* %src.addr.021, i64 2
  %2 = load float, float* %arrayidx4, align 4
  %arrayidx5 = getelementptr inbounds float, float* %dst.addr.022, i64 2
  store float %2, float* %arrayidx5, align 4
  ; Element 3: src[3] -> dst[3].
  %arrayidx6 = getelementptr inbounds float, float* %src.addr.021, i64 3
  %3 = load float, float* %arrayidx6, align 4
  %arrayidx7 = getelementptr inbounds float, float* %dst.addr.022, i64 3
  store float %3, float* %arrayidx7, align 4
  ; Both cursors move forward by the current induction value, not by a
  ; constant stride.
  %add.ptr = getelementptr inbounds float, float* %src.addr.021, i64 %i.023
  %add.ptr8 = getelementptr inbounds float, float* %dst.addr.022, i64 %i.023
  %inc = add i64 %i.023, 1
  %exitcond = icmp eq i64 %inc, %count
  br i1 %exitcond, label %for.end, label %for.body

for.end:                                          ; preds = %for.body, %entry
  ret void
}

; A tiny tree that is not fully vectorizable must be left scalar. Here the
; two stores are adjacent (dst offsets 0 and 1) but the two loads are not
; (src offsets 0 and 2), so no <2 x double> operation may appear in the
; output.
; CHECK-LABEL: @tiny_tree_not_fully_vectorizable(
; CHECK-NOT: <2 x double>
; CHECK: ret

define void @tiny_tree_not_fully_vectorizable(double* noalias nocapture %dst, double* noalias nocapture readonly %src, i64 %count) #0 {
entry:
  ; Skip the loop entirely when %count is zero.
  %cmp12 = icmp eq i64 %count, 0
  br i1 %cmp12, label %for.end, label %for.body

for.body:                                         ; preds = %entry, %for.body
  %i.015 = phi i64 [ %inc, %for.body ], [ 0, %entry ]
  %dst.addr.014 = phi double* [ %add.ptr4, %for.body ], [ %dst, %entry ]
  %src.addr.013 = phi double* [ %add.ptr, %for.body ], [ %src, %entry ]
  ; Element 0: src[0] -> dst[0].
  %0 = load double, double* %src.addr.013, align 8
  store double %0, double* %dst.addr.014, align 8
  ; Element 1: src[2] -> dst[1]. The source offset of 2 leaves a gap after
  ; src[0], which is what makes the loads non-consecutive.
  %arrayidx2 = getelementptr inbounds double, double* %src.addr.013, i64 2
  %1 = load double, double* %arrayidx2, align 8
  %arrayidx3 = getelementptr inbounds double, double* %dst.addr.014, i64 1
  store double %1, double* %arrayidx3, align 8
  ; Both cursors move forward by the current induction value, not by a
  ; constant stride.
  %add.ptr = getelementptr inbounds double, double* %src.addr.013, i64 %i.015
  %add.ptr4 = getelementptr inbounds double, double* %dst.addr.014, i64 %i.015
  %inc = add i64 %i.015, 1
  %exitcond = icmp eq i64 %inc, %count
  br i1 %exitcond, label %for.end, label %for.body

for.end:                                          ; preds = %for.body, %entry
  ret void
}


; Float variant of the not-fully-vectorizable case. The four stores hit
; consecutive slots (dst offsets 0, 1, 2, 3) but the loads read src offsets
; 0, 4, 2, 3, so they do not form one contiguous group and the four-wide tree
; must stay scalar.
; The negative pattern names the float vector type this function could
; produce; a double vector pattern could never match here.
; CHECK-LABEL: @tiny_tree_not_fully_vectorizable2(
; CHECK-NOT: <4 x float>
; CHECK: ret

define void @tiny_tree_not_fully_vectorizable2(float* noalias nocapture %dst, float* noalias nocapture readonly %src, i64 %count) #0 {
entry:
  ; Skip the loop entirely when %count is zero.
  %cmp20 = icmp eq i64 %count, 0
  br i1 %cmp20, label %for.end, label %for.body

for.body:                                         ; preds = %entry, %for.body
  %i.023 = phi i64 [ %inc, %for.body ], [ 0, %entry ]
  %dst.addr.022 = phi float* [ %add.ptr8, %for.body ], [ %dst, %entry ]
  %src.addr.021 = phi float* [ %add.ptr, %for.body ], [ %src, %entry ]
  ; Element 0: src[0] -> dst[0].
  %0 = load float, float* %src.addr.021, align 4
  store float %0, float* %dst.addr.022, align 4
  ; Element 1: src[4] -> dst[1]. The source offset of 4 (instead of 1) is
  ; what breaks the contiguous load group.
  %arrayidx2 = getelementptr inbounds float, float* %src.addr.021, i64 4
  %1 = load float, float* %arrayidx2, align 4
  %arrayidx3 = getelementptr inbounds float, float* %dst.addr.022, i64 1
  store float %1, float* %arrayidx3, align 4
  ; Element 2: src[2] -> dst[2].
  %arrayidx4 = getelementptr inbounds float, float* %src.addr.021, i64 2
  %2 = load float, float* %arrayidx4, align 4
  %arrayidx5 = getelementptr inbounds float, float* %dst.addr.022, i64 2
  store float %2, float* %arrayidx5, align 4
  ; Element 3: src[3] -> dst[3].
  %arrayidx6 = getelementptr inbounds float, float* %src.addr.021, i64 3
  %3 = load float, float* %arrayidx6, align 4
  %arrayidx7 = getelementptr inbounds float, float* %dst.addr.022, i64 3
  store float %3, float* %arrayidx7, align 4
  ; Both cursors move forward by the current induction value, not by a
  ; constant stride.
  %add.ptr = getelementptr inbounds float, float* %src.addr.021, i64 %i.023
  %add.ptr8 = getelementptr inbounds float, float* %dst.addr.022, i64 %i.023
  %inc = add i64 %i.023, 1
  %exitcond = icmp eq i64 %inc, %count
  br i1 %exitcond, label %for.end, label %for.body

for.end:                                          ; preds = %for.body, %entry
  ret void
}


; The same scalar argument (%1) is stored to four consecutive float slots
; starting at the pointer argument (%0). No loads are involved; the four
; stores are expected to be merged into a single <4 x float> store.
; CHECK-LABEL: @store_splat(
; CHECK: store <4 x float>
define void @store_splat(float*, float) {
  ; Slot 0.
  %3 = getelementptr inbounds float, float* %0, i64 0
  store float %1, float* %3, align 4
  ; Slot 1.
  %4 = getelementptr inbounds float, float* %0, i64 1
  store float %1, float* %4, align 4
  ; Slot 2.
  %5 = getelementptr inbounds float, float* %0, i64 2
  store float %1, float* %5, align 4
  ; Slot 3.
  %6 = getelementptr inbounds float, float* %0, i64 3
  store float %1, float* %6, align 4
  ret void
}
