1; RUN: llc < %s -debug-only=isel -o /dev/null 2>&1 | FileCheck %s
2; REQUIRES: asserts
3
4@a = global [1024 x i32] zeroinitializer, align 16 ; array read by @reduce_add and @reduce_and
5
6define i32 @reduce_add() {
; Integer add reduction over the global @a, written as an already-vectorized
; loop: two <4 x i32> accumulators are each updated five times per iteration,
; then combined and reduced to a single i32 in middle.block.
; The FileCheck directives below require the isel debug output for this
; function to report "Detected a reduction operation" on an add eleven times.
7; CHECK-LABEL: reduce_add
8; CHECK:       Detected a reduction operation: {{.*}} add
9; CHECK:       Detected a reduction operation: {{.*}} add
10; CHECK:       Detected a reduction operation: {{.*}} add
11; CHECK:       Detected a reduction operation: {{.*}} add
12; CHECK:       Detected a reduction operation: {{.*}} add
13; CHECK:       Detected a reduction operation: {{.*}} add
14; CHECK:       Detected a reduction operation: {{.*}} add
15; CHECK:       Detected a reduction operation: {{.*}} add
16; CHECK:       Detected a reduction operation: {{.*}} add
17; CHECK:       Detected a reduction operation: {{.*}} add
18; CHECK:       Detected a reduction operation: {{.*}} add
19
20min.iters.checked:
21  br label %vector.body
22
23vector.body:
  ; %index counts i32 elements of @a; it grows by 40 per iteration and the loop
  ; exits when it equals 1000. Both accumulators start at zero.
24  %index = phi i64 [ 0, %min.iters.checked ], [ %index.next.4, %vector.body ]
25  %vec.phi = phi <4 x i32> [ zeroinitializer, %min.iters.checked ], [ %28, %vector.body ]
26  %vec.phi4 = phi <4 x i32> [ zeroinitializer, %min.iters.checked ], [ %29, %vector.body ]
  ; Step 0: load the vectors at elements %index and %index + 4 and add each one
  ; into its own accumulator.
27  %0 = getelementptr inbounds [1024 x i32], [1024 x i32]* @a, i64 0, i64 %index
28  %1 = bitcast i32* %0 to <4 x i32>*
29  %wide.load = load <4 x i32>, <4 x i32>* %1, align 16
30  %2 = getelementptr i32, i32* %0, i64 4
31  %3 = bitcast i32* %2 to <4 x i32>*
32  %wide.load5 = load <4 x i32>, <4 x i32>* %3, align 16
33  %4 = add nsw <4 x i32> %wide.load, %vec.phi
34  %5 = add nsw <4 x i32> %wide.load5, %vec.phi4
  ; Step 1: same load/add pair, starting at element %index + 8.
35  %index.next = add nuw nsw i64 %index, 8
36  %6 = getelementptr inbounds [1024 x i32], [1024 x i32]* @a, i64 0, i64 %index.next
37  %7 = bitcast i32* %6 to <4 x i32>*
38  %wide.load.1 = load <4 x i32>, <4 x i32>* %7, align 16
39  %8 = getelementptr i32, i32* %6, i64 4
40  %9 = bitcast i32* %8 to <4 x i32>*
41  %wide.load5.1 = load <4 x i32>, <4 x i32>* %9, align 16
42  %10 = add nsw <4 x i32> %wide.load.1, %4
43  %11 = add nsw <4 x i32> %wide.load5.1, %5
  ; Step 2: same load/add pair, starting at element %index + 16.
44  %index.next.1 = add nsw i64 %index, 16
45  %12 = getelementptr inbounds [1024 x i32], [1024 x i32]* @a, i64 0, i64 %index.next.1
46  %13 = bitcast i32* %12 to <4 x i32>*
47  %wide.load.2 = load <4 x i32>, <4 x i32>* %13, align 16
48  %14 = getelementptr i32, i32* %12, i64 4
49  %15 = bitcast i32* %14 to <4 x i32>*
50  %wide.load5.2 = load <4 x i32>, <4 x i32>* %15, align 16
51  %16 = add nsw <4 x i32> %wide.load.2, %10
52  %17 = add nsw <4 x i32> %wide.load5.2, %11
  ; Step 3: same load/add pair, starting at element %index + 24.
53  %index.next.2 = add nsw i64 %index, 24
54  %18 = getelementptr inbounds [1024 x i32], [1024 x i32]* @a, i64 0, i64 %index.next.2
55  %19 = bitcast i32* %18 to <4 x i32>*
56  %wide.load.3 = load <4 x i32>, <4 x i32>* %19, align 16
57  %20 = getelementptr i32, i32* %18, i64 4
58  %21 = bitcast i32* %20 to <4 x i32>*
59  %wide.load5.3 = load <4 x i32>, <4 x i32>* %21, align 16
60  %22 = add nsw <4 x i32> %wide.load.3, %16
61  %23 = add nsw <4 x i32> %wide.load5.3, %17
  ; Step 4: same load/add pair, starting at element %index + 32. The results
  ; %28 and %29 feed the accumulator phis on the back edge.
62  %index.next.3 = add nsw i64 %index, 32
63  %24 = getelementptr inbounds [1024 x i32], [1024 x i32]* @a, i64 0, i64 %index.next.3
64  %25 = bitcast i32* %24 to <4 x i32>*
65  %wide.load.4 = load <4 x i32>, <4 x i32>* %25, align 16
66  %26 = getelementptr i32, i32* %24, i64 4
67  %27 = bitcast i32* %26 to <4 x i32>*
68  %wide.load5.4 = load <4 x i32>, <4 x i32>* %27, align 16
69  %28 = add nsw <4 x i32> %wide.load.4, %22
70  %29 = add nsw <4 x i32> %wide.load5.4, %23
  ; Loop control: advance by 40 elements and leave once 1000 is reached.
71  %index.next.4 = add nsw i64 %index, 40
72  %30 = icmp eq i64 %index.next.4, 1000
73  br i1 %30, label %middle.block, label %vector.body
74
75middle.block:
  ; Single-predecessor phis carrying the final accumulator values out of the loop.
76  %.lcssa10 = phi <4 x i32> [ %29, %vector.body ]
77  %.lcssa = phi <4 x i32> [ %28, %vector.body ]
  ; Merge the two accumulators, then reduce across lanes: lanes 2,3 are added
  ; onto lanes 0,1, then lane 1 onto lane 0. Lane 0 is the returned sum.
78  %bin.rdx = add <4 x i32> %.lcssa10, %.lcssa
79  %rdx.shuf = shufflevector <4 x i32> %bin.rdx, <4 x i32> undef, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
80  %bin.rdx6 = add <4 x i32> %bin.rdx, %rdx.shuf
81  %rdx.shuf7 = shufflevector <4 x i32> %bin.rdx6, <4 x i32> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
82  %bin.rdx8 = add <4 x i32> %bin.rdx6, %rdx.shuf7
83  %31 = extractelement <4 x i32> %bin.rdx8, i32 0
84  ret i32 %31
85}
86
87define i32 @reduce_and() {
; Bitwise-and reduction over the global @a, written as an already-vectorized
; loop: two <4 x i32> accumulators are each updated four times per iteration,
; then combined and reduced to a single i32 in middle.block.
; The FileCheck directives below require the isel debug output for this
; function to report "Detected a reduction operation" on an and nine times.
88; CHECK-LABEL: reduce_and
89; CHECK:       Detected a reduction operation: {{.*}} and
90; CHECK:       Detected a reduction operation: {{.*}} and
91; CHECK:       Detected a reduction operation: {{.*}} and
92; CHECK:       Detected a reduction operation: {{.*}} and
93; CHECK:       Detected a reduction operation: {{.*}} and
94; CHECK:       Detected a reduction operation: {{.*}} and
95; CHECK:       Detected a reduction operation: {{.*}} and
96; CHECK:       Detected a reduction operation: {{.*}} and
97; CHECK:       Detected a reduction operation: {{.*}} and
98
99entry:
100  br label %vector.body
101
102vector.body:
  ; %lsr.iv is a byte offset applied to @a (viewed as i8*); it starts at -4096,
  ; grows by 128 per iteration, and the loop exits when it reaches 0.
  ; Both accumulators start with -1 in every lane (all bits set), the identity
  ; value for and.
  ; NOTE(review): the lsr.iv/uglygep/scevgep names suggest this IR was captured
  ; after loop strength reduction -- confirm if it matters.
103  %lsr.iv = phi i64 [ %lsr.iv.next, %vector.body ], [ -4096, %entry ]
104  %vec.phi = phi <4 x i32> [ <i32 -1, i32 -1, i32 -1, i32 -1>, %entry ], [ %6, %vector.body ]
105  %vec.phi9 = phi <4 x i32> [ <i32 -1, i32 -1, i32 -1, i32 -1>, %entry ], [ %7, %vector.body ]
  ; Step 0: load the vectors at <4 x i32> indices 256 and 257 relative to
  ; (@a + %lsr.iv) and fold each into its own accumulator.
  ; NOTE(review): assuming <4 x i32> occupies 16 bytes, index 256 is +4096
  ; bytes, so the first iteration reads from the start of @a.
106  %uglygep33 = getelementptr i8, i8* bitcast ([1024 x i32]* @a to i8*), i64 %lsr.iv
107  %uglygep3334 = bitcast i8* %uglygep33 to <4 x i32>*
108  %scevgep35 = getelementptr <4 x i32>, <4 x i32>* %uglygep3334, i64 256
109  %wide.load = load <4 x i32>, <4 x i32>* %scevgep35, align 16
110  %scevgep36 = getelementptr <4 x i32>, <4 x i32>* %uglygep3334, i64 257
111  %wide.load10 = load <4 x i32>, <4 x i32>* %scevgep36, align 16
112  %0 = and <4 x i32> %wide.load, %vec.phi
113  %1 = and <4 x i32> %wide.load10, %vec.phi9
  ; Step 1: same pattern at vector indices 258 and 259; the base address is
  ; recomputed from @a and %lsr.iv before each load.
114  %uglygep30 = getelementptr i8, i8* bitcast ([1024 x i32]* @a to i8*), i64 %lsr.iv
115  %uglygep3031 = bitcast i8* %uglygep30 to <4 x i32>*
116  %scevgep32 = getelementptr <4 x i32>, <4 x i32>* %uglygep3031, i64 258
117  %wide.load.1 = load <4 x i32>, <4 x i32>* %scevgep32, align 16
118  %uglygep27 = getelementptr i8, i8* bitcast ([1024 x i32]* @a to i8*), i64 %lsr.iv
119  %uglygep2728 = bitcast i8* %uglygep27 to <4 x i32>*
120  %scevgep29 = getelementptr <4 x i32>, <4 x i32>* %uglygep2728, i64 259
121  %wide.load10.1 = load <4 x i32>, <4 x i32>* %scevgep29, align 16
122  %2 = and <4 x i32> %wide.load.1, %0
123  %3 = and <4 x i32> %wide.load10.1, %1
  ; Step 2: same pattern at vector indices 260 and 261.
124  %uglygep24 = getelementptr i8, i8* bitcast ([1024 x i32]* @a to i8*), i64 %lsr.iv
125  %uglygep2425 = bitcast i8* %uglygep24 to <4 x i32>*
126  %scevgep26 = getelementptr <4 x i32>, <4 x i32>* %uglygep2425, i64 260
127  %wide.load.2 = load <4 x i32>, <4 x i32>* %scevgep26, align 16
128  %uglygep21 = getelementptr i8, i8* bitcast ([1024 x i32]* @a to i8*), i64 %lsr.iv
129  %uglygep2122 = bitcast i8* %uglygep21 to <4 x i32>*
130  %scevgep23 = getelementptr <4 x i32>, <4 x i32>* %uglygep2122, i64 261
131  %wide.load10.2 = load <4 x i32>, <4 x i32>* %scevgep23, align 16
132  %4 = and <4 x i32> %wide.load.2, %2
133  %5 = and <4 x i32> %wide.load10.2, %3
  ; Step 3: same pattern at vector indices 262 and 263. The results %6 and %7
  ; feed the accumulator phis on the back edge.
134  %uglygep18 = getelementptr i8, i8* bitcast ([1024 x i32]* @a to i8*), i64 %lsr.iv
135  %uglygep1819 = bitcast i8* %uglygep18 to <4 x i32>*
136  %scevgep20 = getelementptr <4 x i32>, <4 x i32>* %uglygep1819, i64 262
137  %wide.load.3 = load <4 x i32>, <4 x i32>* %scevgep20, align 16
138  %uglygep = getelementptr i8, i8* bitcast ([1024 x i32]* @a to i8*), i64 %lsr.iv
139  %uglygep17 = bitcast i8* %uglygep to <4 x i32>*
140  %scevgep = getelementptr <4 x i32>, <4 x i32>* %uglygep17, i64 263
141  %wide.load10.3 = load <4 x i32>, <4 x i32>* %scevgep, align 16
142  %6 = and <4 x i32> %wide.load.3, %4
143  %7 = and <4 x i32> %wide.load10.3, %5
  ; Loop control: advance the byte offset by 128 and leave once it reaches 0.
144  %lsr.iv.next = add nsw i64 %lsr.iv, 128
145  %8 = icmp eq i64 %lsr.iv.next, 0
146  br i1 %8, label %middle.block, label %vector.body
147
148middle.block:
  ; Merge the two accumulators, then reduce across lanes: lanes 2,3 are anded
  ; onto lanes 0,1, then lane 1 onto lane 0. Lane 0 is the returned value.
149  %bin.rdx = and <4 x i32> %7, %6
150  %rdx.shuf = shufflevector <4 x i32> %bin.rdx, <4 x i32> undef, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
151  %bin.rdx11 = and <4 x i32> %bin.rdx, %rdx.shuf
152  %rdx.shuf12 = shufflevector <4 x i32> %bin.rdx11, <4 x i32> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
153  %bin.rdx13 = and <4 x i32> %bin.rdx11, %rdx.shuf12
154  %9 = extractelement <4 x i32> %bin.rdx13, i32 0
155  ret i32 %9
156}
157
158define float @reduce_add_float(float* nocapture readonly %a) {
; Floating-point add reduction over the buffer passed in %a (the argument, not
; the global @a), written as an already-vectorized loop: two <4 x float>
; accumulators are each updated five times per iteration with "fadd fast",
; then combined and reduced to a single float in middle.block.
; The FileCheck directives below require the isel debug output for this
; function to report "Detected a reduction operation" on an fadd fast eleven
; times.
159; CHECK-LABEL: reduce_add_float
160; CHECK:       Detected a reduction operation: {{.*}} fadd fast
161; CHECK:       Detected a reduction operation: {{.*}} fadd fast
162; CHECK:       Detected a reduction operation: {{.*}} fadd fast
163; CHECK:       Detected a reduction operation: {{.*}} fadd fast
164; CHECK:       Detected a reduction operation: {{.*}} fadd fast
165; CHECK:       Detected a reduction operation: {{.*}} fadd fast
166; CHECK:       Detected a reduction operation: {{.*}} fadd fast
167; CHECK:       Detected a reduction operation: {{.*}} fadd fast
168; CHECK:       Detected a reduction operation: {{.*}} fadd fast
169; CHECK:       Detected a reduction operation: {{.*}} fadd fast
170; CHECK:       Detected a reduction operation: {{.*}} fadd fast
171;
172entry:
173  br label %vector.body
174
175vector.body:
  ; %index counts float elements of %a; it grows by 40 per iteration and the
  ; loop exits when it equals 1000. Both accumulators start at zero.
176  %index = phi i64 [ 0, %entry ], [ %index.next.4, %vector.body ]
177  %vec.phi = phi <4 x float> [ zeroinitializer, %entry ], [ %28, %vector.body ]
178  %vec.phi9 = phi <4 x float> [ zeroinitializer, %entry ], [ %29, %vector.body ]
  ; Step 0: load the vectors at elements %index and %index + 4 (loads are only
  ; 4-byte aligned) and add each one into its own accumulator.
179  %0 = getelementptr inbounds float, float* %a, i64 %index
180  %1 = bitcast float* %0 to <4 x float>*
181  %wide.load = load <4 x float>, <4 x float>* %1, align 4
182  %2 = getelementptr float, float* %0, i64 4
183  %3 = bitcast float* %2 to <4 x float>*
184  %wide.load10 = load <4 x float>, <4 x float>* %3, align 4
185  %4 = fadd fast <4 x float> %wide.load, %vec.phi
186  %5 = fadd fast <4 x float> %wide.load10, %vec.phi9
  ; Step 1: same load/fadd pair, starting at element %index + 8.
187  %index.next = add nuw nsw i64 %index, 8
188  %6 = getelementptr inbounds float, float* %a, i64 %index.next
189  %7 = bitcast float* %6 to <4 x float>*
190  %wide.load.1 = load <4 x float>, <4 x float>* %7, align 4
191  %8 = getelementptr float, float* %6, i64 4
192  %9 = bitcast float* %8 to <4 x float>*
193  %wide.load10.1 = load <4 x float>, <4 x float>* %9, align 4
194  %10 = fadd fast <4 x float> %wide.load.1, %4
195  %11 = fadd fast <4 x float> %wide.load10.1, %5
  ; Step 2: same load/fadd pair, starting at element %index + 16.
196  %index.next.1 = add nsw i64 %index, 16
197  %12 = getelementptr inbounds float, float* %a, i64 %index.next.1
198  %13 = bitcast float* %12 to <4 x float>*
199  %wide.load.2 = load <4 x float>, <4 x float>* %13, align 4
200  %14 = getelementptr float, float* %12, i64 4
201  %15 = bitcast float* %14 to <4 x float>*
202  %wide.load10.2 = load <4 x float>, <4 x float>* %15, align 4
203  %16 = fadd fast <4 x float> %wide.load.2, %10
204  %17 = fadd fast <4 x float> %wide.load10.2, %11
  ; Step 3: same load/fadd pair, starting at element %index + 24.
205  %index.next.2 = add nsw i64 %index, 24
206  %18 = getelementptr inbounds float, float* %a, i64 %index.next.2
207  %19 = bitcast float* %18 to <4 x float>*
208  %wide.load.3 = load <4 x float>, <4 x float>* %19, align 4
209  %20 = getelementptr float, float* %18, i64 4
210  %21 = bitcast float* %20 to <4 x float>*
211  %wide.load10.3 = load <4 x float>, <4 x float>* %21, align 4
212  %22 = fadd fast <4 x float> %wide.load.3, %16
213  %23 = fadd fast <4 x float> %wide.load10.3, %17
  ; Step 4: same load/fadd pair, starting at element %index + 32. The results
  ; %28 and %29 feed the accumulator phis on the back edge.
214  %index.next.3 = add nsw i64 %index, 32
215  %24 = getelementptr inbounds float, float* %a, i64 %index.next.3
216  %25 = bitcast float* %24 to <4 x float>*
217  %wide.load.4 = load <4 x float>, <4 x float>* %25, align 4
218  %26 = getelementptr float, float* %24, i64 4
219  %27 = bitcast float* %26 to <4 x float>*
220  %wide.load10.4 = load <4 x float>, <4 x float>* %27, align 4
221  %28 = fadd fast <4 x float> %wide.load.4, %22
222  %29 = fadd fast <4 x float> %wide.load10.4, %23
  ; Loop control: advance by 40 elements and leave once 1000 is reached.
223  %index.next.4 = add nsw i64 %index, 40
224  %30 = icmp eq i64 %index.next.4, 1000
225  br i1 %30, label %middle.block, label %vector.body
226
227middle.block:
  ; Single-predecessor phis carrying the final accumulator values out of the loop.
228  %.lcssa15 = phi <4 x float> [ %29, %vector.body ]
229  %.lcssa = phi <4 x float> [ %28, %vector.body ]
  ; Merge the two accumulators, then reduce across lanes: lanes 2,3 are added
  ; onto lanes 0,1, then lane 1 onto lane 0. Lane 0 is the returned sum.
230  %bin.rdx = fadd fast <4 x float> %.lcssa15, %.lcssa
231  %rdx.shuf = shufflevector <4 x float> %bin.rdx, <4 x float> undef, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
232  %bin.rdx11 = fadd fast <4 x float> %bin.rdx, %rdx.shuf
233  %rdx.shuf12 = shufflevector <4 x float> %bin.rdx11, <4 x float> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
234  %bin.rdx13 = fadd fast <4 x float> %bin.rdx11, %rdx.shuf12
235  %31 = extractelement <4 x float> %bin.rdx13, i32 0
236  ret float %31
237}
238