; RUN: opt < %s -cost-model -costmodel-reduxcost=true -analyze -mcpu=core2 -mtriple=x86_64-apple-darwin | FileCheck %s
; RUN: opt < %s -cost-model -costmodel-reduxcost=true -analyze -mcpu=corei7 -mtriple=x86_64-apple-darwin | FileCheck %s --check-prefix=SSE3
; RUN: opt < %s -cost-model -costmodel-reduxcost=true -analyze -mcpu=corei7-avx -mtriple=x86_64-apple-darwin | FileCheck %s --check-prefix=AVX
; RUN: opt < %s -cost-model -costmodel-reduxcost=true -analyze -mcpu=core-avx2 -mtriple=x86_64-apple-darwin | FileCheck %s --check-prefix=AVX2

; Split-in-half (non-pairwise) fadd reduction of a <4 x float> vector.
; Lanes 2,3 are folded onto lanes 0,1, then lane 1 is folded onto lane 0,
; and the scalar result is read back from lane 0.
define fastcc float @reduction_cost_float(<4 x float> %rdx) {
  %rdx.shuf = shufflevector <4 x float> %rdx, <4 x float> undef, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
  %bin.rdx = fadd <4 x float> %rdx, %rdx.shuf
  %rdx.shuf7 = shufflevector <4 x float> %bin.rdx, <4 x float> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
  %bin.rdx8 = fadd <4 x float> %bin.rdx, %rdx.shuf7

; Check that we recognize the tree starting at the extractelement as a
; reduction.
; CHECK-LABEL: reduction_cost
; CHECK:  cost of 9 {{.*}} extractelement

  %r = extractelement <4 x float> %bin.rdx8, i32 0
  ret float %r
}

; Split-in-half (non-pairwise) integer add reduction of an <8 x i32> vector in
; three levels: lanes 4-7 onto 0-3, lanes 2,3 onto 0,1, then lane 1 onto
; lane 0. The reduction cost is checked on the final extractelement.
define fastcc i32 @reduction_cost_int(<8 x i32> %rdx) {
  %rdx.shuf = shufflevector <8 x i32> %rdx, <8 x i32> undef,
   <8 x i32> <i32 4    , i32     5, i32     6, i32     7,
              i32 undef, i32 undef, i32 undef, i32 undef>
  %bin.rdx = add <8 x i32> %rdx, %rdx.shuf
  %rdx.shuf.2 = shufflevector <8 x i32> %bin.rdx, <8 x i32> undef,
   <8 x i32> <i32 2    , i32 3,     i32 undef, i32 undef,
              i32 undef, i32 undef, i32 undef, i32 undef>
  %bin.rdx.2 = add <8 x i32> %bin.rdx, %rdx.shuf.2
  %rdx.shuf.3 = shufflevector <8 x i32> %bin.rdx.2, <8 x i32> undef,
   <8 x i32> <i32 1    , i32 undef, i32 undef, i32 undef,
              i32 undef, i32 undef, i32 undef, i32 undef>
  %bin.rdx.3 = add <8 x i32> %bin.rdx.2, %rdx.shuf.3

; CHECK-LABEL: reduction_cost_int
; CHECK:  cost of 17 {{.*}} extractelement

  %r = extractelement <8 x i32> %bin.rdx.3, i32 0
  ret i32 %r
}

; Pairwise fadd reduction of a <4 x float> vector: each level adds the
; even-lane shuffle to the odd-lane shuffle. The extracted scalar is then
; added to %f1, so the extractelement is not the last user of the value.
define fastcc float @pairwise_hadd(<4 x float> %rdx, float %f1) {
  %rdx.shuf.0.0 = shufflevector <4 x float> %rdx, <4 x float> undef,
        <4 x i32> <i32 0, i32 2 , i32 undef, i32 undef>
  %rdx.shuf.0.1 = shufflevector <4 x float> %rdx, <4 x float> undef,
        <4 x i32> <i32 1, i32 3, i32 undef, i32 undef>
  %bin.rdx.0 = fadd <4 x float> %rdx.shuf.0.0, %rdx.shuf.0.1
  %rdx.shuf.1.0 = shufflevector <4 x float> %bin.rdx.0, <4 x float> undef,
        <4 x i32> <i32 0, i32 undef, i32 undef, i32 undef>
  %rdx.shuf.1.1 = shufflevector <4 x float> %bin.rdx.0, <4 x float> undef,
        <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
  %bin.rdx.1 = fadd <4 x float> %rdx.shuf.1.0, %rdx.shuf.1.1

; CHECK-LABEL: pairwise_hadd
; CHECK: cost of 11 {{.*}} extractelement

  %r = extractelement <4 x float> %bin.rdx.1, i32 0
  %r2 = fadd float %r, %f1
  ret float %r2
}

; Same pairwise fadd reduction as pairwise_hadd, except that the operands of
; the first-level fadd are swapped (odd-lane shuffle first). The expected cost
; is the same as for the unswapped form.
define fastcc float @pairwise_hadd_assoc(<4 x float> %rdx, float %f1) {
  %rdx.shuf.0.0 = shufflevector <4 x float> %rdx, <4 x float> undef,
        <4 x i32> <i32 0, i32 2 , i32 undef, i32 undef>
  %rdx.shuf.0.1 = shufflevector <4 x float> %rdx, <4 x float> undef,
        <4 x i32> <i32 1, i32 3, i32 undef, i32 undef>
  %bin.rdx.0 = fadd <4 x float> %rdx.shuf.0.1, %rdx.shuf.0.0
  %rdx.shuf.1.0 = shufflevector <4 x float> %bin.rdx.0, <4 x float> undef,
        <4 x i32> <i32 0, i32 undef, i32 undef, i32 undef>
  %rdx.shuf.1.1 = shufflevector <4 x float> %bin.rdx.0, <4 x float> undef,
        <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
  %bin.rdx.1 = fadd <4 x float> %rdx.shuf.1.0, %rdx.shuf.1.1

; CHECK-LABEL: pairwise_hadd_assoc
; CHECK: cost of 11 {{.*}} extractelement

  %r = extractelement <4 x float> %bin.rdx.1, i32 0
  %r2 = fadd float %r, %f1
  ret float %r2
}

; Pairwise fadd reduction of a <4 x float> vector where the last level omits
; the lane-0 shuffle: %bin.rdx.0 is used directly as the left operand and only
; the lane-1 shuffle is materialized. The expected cost matches pairwise_hadd.
define fastcc float @pairwise_hadd_skip_first(<4 x float> %rdx, float %f1) {
  %rdx.shuf.0.0 = shufflevector <4 x float> %rdx, <4 x float> undef,
        <4 x i32> <i32 0, i32 2 , i32 undef, i32 undef>
  %rdx.shuf.0.1 = shufflevector <4 x float> %rdx, <4 x float> undef,
        <4 x i32> <i32 1, i32 3, i32 undef, i32 undef>
  %bin.rdx.0 = fadd <4 x float> %rdx.shuf.0.0, %rdx.shuf.0.1
  %rdx.shuf.1.1 = shufflevector <4 x float> %bin.rdx.0, <4 x float> undef,
        <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
  %bin.rdx.1 = fadd <4 x float> %bin.rdx.0, %rdx.shuf.1.1

; CHECK-LABEL: pairwise_hadd_skip_first
; CHECK: cost of 11 {{.*}} extractelement

  %r = extractelement <4 x float> %bin.rdx.1, i32 0
  %r2 = fadd float %r, %f1
  ret float %r2
}

; Split-in-half (non-pairwise) fadd reduction of a <2 x double> vector: a
; single level folds lane 1 onto lane 0. %f1 is unused. The per-prefix label
; lines pin each cost check to this function's section of the output.
define fastcc double @no_pairwise_reduction2double(<2 x double> %rdx, double %f1) {
  %rdx.shuf = shufflevector <2 x double> %rdx, <2 x double> undef, <2 x i32> <i32 1, i32 undef>
  %bin.rdx = fadd <2 x double> %rdx, %rdx.shuf

; SSE3-LABEL: 'no_pairwise_reduction2double'
; SSE3:  cost of 2 {{.*}} extractelement
; AVX-LABEL: 'no_pairwise_reduction2double'
; AVX:  cost of 2 {{.*}} extractelement
; AVX2-LABEL: 'no_pairwise_reduction2double'
; AVX2:  cost of 2 {{.*}} extractelement

  %r = extractelement <2 x double> %bin.rdx, i32 0
  ret double %r
}

; Split-in-half (non-pairwise) fadd reduction of a <4 x float> vector: lanes
; 2,3 onto 0,1, then lane 1 onto lane 0. %f1 is unused. The per-prefix label
; lines pin each cost check to this function's section of the output.
define fastcc float @no_pairwise_reduction4float(<4 x float> %rdx, float %f1) {
  %rdx.shuf = shufflevector <4 x float> %rdx, <4 x float> undef, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
  %bin.rdx = fadd <4 x float> %rdx, %rdx.shuf
  %rdx.shuf7 = shufflevector <4 x float> %bin.rdx, <4 x float> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
  %bin.rdx8 = fadd <4 x float> %bin.rdx, %rdx.shuf7

; SSE3-LABEL: 'no_pairwise_reduction4float'
; SSE3:  cost of 4 {{.*}} extractelement
; AVX-LABEL: 'no_pairwise_reduction4float'
; AVX:  cost of 3 {{.*}} extractelement
; AVX2-LABEL: 'no_pairwise_reduction4float'
; AVX2:  cost of 3 {{.*}} extractelement

  %r = extractelement <4 x float> %bin.rdx8, i32 0
  ret float %r
}

; Split-in-half (non-pairwise) fadd reduction of a <4 x double> vector: lanes
; 2,3 onto 0,1, then lane 1 onto lane 0. %f1 is unused. Only the two AVX runs
; check this function; the label lines pin their checks to its output section.
define fastcc double @no_pairwise_reduction4double(<4 x double> %rdx, double %f1) {
  %rdx.shuf = shufflevector <4 x double> %rdx, <4 x double> undef, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
  %bin.rdx = fadd <4 x double> %rdx, %rdx.shuf
  %rdx.shuf7 = shufflevector <4 x double> %bin.rdx, <4 x double> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
  %bin.rdx8 = fadd <4 x double> %bin.rdx, %rdx.shuf7

; AVX-LABEL: 'no_pairwise_reduction4double'
; AVX:  cost of 3 {{.*}} extractelement
; AVX2-LABEL: 'no_pairwise_reduction4double'
; AVX2:  cost of 3 {{.*}} extractelement

  %r = extractelement <4 x double> %bin.rdx8, i32 0
  ret double %r
}

; Split-in-half (non-pairwise) fadd reduction of an <8 x float> vector in three
; levels: lanes 4-7 onto 0-3, lanes 2,3 onto 0,1, then lane 1 onto lane 0.
; %f1 is unused. Only the two AVX runs check this function; the label lines
; pin their checks to its output section.
define fastcc float @no_pairwise_reduction8float(<8 x float> %rdx, float %f1) {
  %rdx.shuf3 = shufflevector <8 x float> %rdx, <8 x float> undef, <8 x i32> <i32 4, i32 5, i32 6, i32 7,i32 undef, i32 undef, i32 undef, i32 undef>
  %bin.rdx4 = fadd <8 x float> %rdx, %rdx.shuf3
  %rdx.shuf = shufflevector <8 x float> %bin.rdx4, <8 x float> undef, <8 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  %bin.rdx = fadd <8 x float> %bin.rdx4, %rdx.shuf
  %rdx.shuf7 = shufflevector <8 x float> %bin.rdx, <8 x float> undef, <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  %bin.rdx8 = fadd <8 x float> %bin.rdx, %rdx.shuf7

; AVX-LABEL: 'no_pairwise_reduction8float'
; AVX:  cost of 4 {{.*}} extractelement
; AVX2-LABEL: 'no_pairwise_reduction8float'
; AVX2:  cost of 4 {{.*}} extractelement

  %r = extractelement <8 x float> %bin.rdx8, i32 0
  ret float %r
}

; Split-in-half (non-pairwise) integer add reduction of a <2 x i64> vector: a
; single level folds lane 1 onto lane 0. %f1 is unused. The per-prefix label
; lines pin each cost check to this function's section of the output.
define fastcc i64 @no_pairwise_reduction2i64(<2 x i64> %rdx, i64 %f1) {
  %rdx.shuf = shufflevector <2 x i64> %rdx, <2 x i64> undef, <2 x i32> <i32 1, i32 undef>
  %bin.rdx = add <2 x i64> %rdx, %rdx.shuf

; SSE3-LABEL: 'no_pairwise_reduction2i64'
; SSE3:  cost of 2 {{.*}} extractelement
; AVX-LABEL: 'no_pairwise_reduction2i64'
; AVX:  cost of 1 {{.*}} extractelement
; AVX2-LABEL: 'no_pairwise_reduction2i64'
; AVX2:  cost of 1 {{.*}} extractelement

  %r = extractelement <2 x i64> %bin.rdx, i32 0
  ret i64 %r
}

; Split-in-half (non-pairwise) integer add reduction of a <4 x i32> vector:
; lanes 2,3 onto 0,1, then lane 1 onto lane 0. %f1 is unused. The per-prefix
; label lines pin each cost check to this function's section of the output.
define fastcc i32 @no_pairwise_reduction4i32(<4 x i32> %rdx, i32 %f1) {
  %rdx.shuf = shufflevector <4 x i32> %rdx, <4 x i32> undef, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
  %bin.rdx = add <4 x i32> %rdx, %rdx.shuf
  %rdx.shuf7 = shufflevector <4 x i32> %bin.rdx, <4 x i32> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
  %bin.rdx8 = add <4 x i32> %bin.rdx, %rdx.shuf7

; SSE3-LABEL: 'no_pairwise_reduction4i32'
; SSE3:  cost of 3 {{.*}} extractelement
; AVX-LABEL: 'no_pairwise_reduction4i32'
; AVX:  cost of 3 {{.*}} extractelement
; AVX2-LABEL: 'no_pairwise_reduction4i32'
; AVX2:  cost of 3 {{.*}} extractelement

  %r = extractelement <4 x i32> %bin.rdx8, i32 0
  ret i32 %r
}

; Split-in-half (non-pairwise) integer add reduction of a <4 x i64> vector:
; lanes 2,3 onto 0,1, then lane 1 onto lane 0. %f1 is unused. Only the two AVX
; runs check this function; the label lines pin their checks to its section.
define fastcc i64 @no_pairwise_reduction4i64(<4 x i64> %rdx, i64 %f1) {
  %rdx.shuf = shufflevector <4 x i64> %rdx, <4 x i64> undef, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
  %bin.rdx = add <4 x i64> %rdx, %rdx.shuf
  %rdx.shuf7 = shufflevector <4 x i64> %bin.rdx, <4 x i64> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
  %bin.rdx8 = add <4 x i64> %bin.rdx, %rdx.shuf7

; AVX-LABEL: 'no_pairwise_reduction4i64'
; AVX:  cost of 3 {{.*}} extractelement
; AVX2-LABEL: 'no_pairwise_reduction4i64'
; AVX2:  cost of 3 {{.*}} extractelement

  %r = extractelement <4 x i64> %bin.rdx8, i32 0
  ret i64 %r
}

; Split-in-half (non-pairwise) integer add reduction of an <8 x i16> vector in
; three levels: lanes 4-7 onto 0-3, lanes 2,3 onto 0,1, then lane 1 onto
; lane 0. %f1 is unused. The per-prefix label lines pin each cost check to
; this function's section of the output.
define fastcc i16 @no_pairwise_reduction8i16(<8 x i16> %rdx, i16 %f1) {
  %rdx.shuf3 = shufflevector <8 x i16> %rdx, <8 x i16> undef, <8 x i32> <i32 4, i32 5, i32 6, i32 7,i32 undef, i32 undef, i32 undef, i32 undef>
  %bin.rdx4 = add <8 x i16> %rdx, %rdx.shuf3
  %rdx.shuf = shufflevector <8 x i16> %bin.rdx4, <8 x i16> undef, <8 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  %bin.rdx = add <8 x i16> %bin.rdx4, %rdx.shuf
  %rdx.shuf7 = shufflevector <8 x i16> %bin.rdx, <8 x i16> undef, <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  %bin.rdx8 = add <8 x i16> %bin.rdx, %rdx.shuf7

; SSE3-LABEL: 'no_pairwise_reduction8i16'
; SSE3:  cost of 4 {{.*}} extractelement
; AVX-LABEL: 'no_pairwise_reduction8i16'
; AVX:  cost of 4 {{.*}} extractelement
; AVX2-LABEL: 'no_pairwise_reduction8i16'
; AVX2:  cost of 4 {{.*}} extractelement

  %r = extractelement <8 x i16> %bin.rdx8, i32 0
  ret i16 %r
}

; Split-in-half (non-pairwise) integer add reduction of an <8 x i32> vector in
; three levels: lanes 4-7 onto 0-3, lanes 2,3 onto 0,1, then lane 1 onto
; lane 0. %f1 is unused. Only the two AVX runs check this function; the label
; lines pin their checks to its output section.
define fastcc i32 @no_pairwise_reduction8i32(<8 x i32> %rdx, i32 %f1) {
  %rdx.shuf3 = shufflevector <8 x i32> %rdx, <8 x i32> undef, <8 x i32> <i32 4, i32 5, i32 6, i32 7,i32 undef, i32 undef, i32 undef, i32 undef>
  %bin.rdx4 = add <8 x i32> %rdx, %rdx.shuf3
  %rdx.shuf = shufflevector <8 x i32> %bin.rdx4, <8 x i32> undef, <8 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  %bin.rdx = add <8 x i32> %bin.rdx4, %rdx.shuf
  %rdx.shuf7 = shufflevector <8 x i32> %bin.rdx, <8 x i32> undef, <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  %bin.rdx8 = add <8 x i32> %bin.rdx, %rdx.shuf7

; AVX-LABEL: 'no_pairwise_reduction8i32'
; AVX:  cost of 5 {{.*}} extractelement
; AVX2-LABEL: 'no_pairwise_reduction8i32'
; AVX2:  cost of 5 {{.*}} extractelement

  %r = extractelement <8 x i32> %bin.rdx8, i32 0
  ret i32 %r
}

; Pairwise fadd reduction of a <2 x double> vector: one level adds the lane-0
; shuffle to the lane-1 shuffle. %f1 is unused. The quoted per-prefix labels
; pin each cost check to this function (the quote keeps the label from
; matching the longer no_pairwise_ name).
define fastcc double @pairwise_reduction2double(<2 x double> %rdx, double %f1) {
  %rdx.shuf.1.0 = shufflevector <2 x double> %rdx, <2 x double> undef, <2 x i32> <i32 0, i32 undef>
  %rdx.shuf.1.1 = shufflevector <2 x double> %rdx, <2 x double> undef, <2 x i32> <i32 1, i32 undef>
  %bin.rdx8 = fadd <2 x double> %rdx.shuf.1.0, %rdx.shuf.1.1

; SSE3-LABEL: 'pairwise_reduction2double'
; SSE3:  cost of 2 {{.*}} extractelement
; AVX-LABEL: 'pairwise_reduction2double'
; AVX:  cost of 2 {{.*}} extractelement
; AVX2-LABEL: 'pairwise_reduction2double'
; AVX2:  cost of 2 {{.*}} extractelement

  %r = extractelement <2 x double> %bin.rdx8, i32 0
  ret double %r
}

; Pairwise fadd reduction of a <4 x float> vector in two levels: even lanes
; {0,2} plus odd lanes {1,3}, then lane 0 plus lane 1. %f1 is unused. The
; quoted per-prefix labels pin each cost check to this function (the quote
; keeps the label from matching the longer no_pairwise_ name).
define fastcc float @pairwise_reduction4float(<4 x float> %rdx, float %f1) {
  %rdx.shuf.0.0 = shufflevector <4 x float> %rdx, <4 x float> undef, <4 x i32> <i32 0, i32 2, i32 undef, i32 undef>
  %rdx.shuf.0.1 = shufflevector <4 x float> %rdx, <4 x float> undef, <4 x i32> <i32 1, i32 3, i32 undef, i32 undef>
  %bin.rdx = fadd <4 x float> %rdx.shuf.0.0, %rdx.shuf.0.1
  %rdx.shuf.1.0 = shufflevector <4 x float> %bin.rdx, <4 x float> undef, <4 x i32> <i32 0, i32 undef, i32 undef, i32 undef>
  %rdx.shuf.1.1 = shufflevector <4 x float> %bin.rdx, <4 x float> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
  %bin.rdx8 = fadd <4 x float> %rdx.shuf.1.0, %rdx.shuf.1.1

; SSE3-LABEL: 'pairwise_reduction4float'
; SSE3:  cost of 4 {{.*}} extractelement
; AVX-LABEL: 'pairwise_reduction4float'
; AVX:  cost of 4 {{.*}} extractelement
; AVX2-LABEL: 'pairwise_reduction4float'
; AVX2:  cost of 4 {{.*}} extractelement

  %r = extractelement <4 x float> %bin.rdx8, i32 0
  ret float %r
}

; Pairwise fadd reduction of a <4 x double> vector in two levels: even lanes
; {0,2} plus odd lanes {1,3}, then lane 0 plus lane 1. %f1 is unused. Only the
; two AVX runs check this function; the quoted labels pin their checks to it
; (the quote keeps the label from matching the longer no_pairwise_ name).
define fastcc double @pairwise_reduction4double(<4 x double> %rdx, double %f1) {
  %rdx.shuf.0.0 = shufflevector <4 x double> %rdx, <4 x double> undef, <4 x i32> <i32 0, i32 2, i32 undef, i32 undef>
  %rdx.shuf.0.1 = shufflevector <4 x double> %rdx, <4 x double> undef, <4 x i32> <i32 1, i32 3, i32 undef, i32 undef>
  %bin.rdx = fadd <4 x double> %rdx.shuf.0.0, %rdx.shuf.0.1
  %rdx.shuf.1.0 = shufflevector <4 x double> %bin.rdx, <4 x double> undef, <4 x i32> <i32 0, i32 undef, i32 undef, i32 undef>
  %rdx.shuf.1.1 = shufflevector <4 x double> %bin.rdx, <4 x double> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
  %bin.rdx8 = fadd <4 x double> %rdx.shuf.1.0, %rdx.shuf.1.1

; AVX-LABEL: 'pairwise_reduction4double'
; AVX:  cost of 5 {{.*}} extractelement
; AVX2-LABEL: 'pairwise_reduction4double'
; AVX2:  cost of 5 {{.*}} extractelement

  %r = extractelement <4 x double> %bin.rdx8, i32 0
  ret double %r
}

; Pairwise fadd reduction of an <8 x float> vector in three levels: even lanes
; {0,2,4,6} plus odd lanes {1,3,5,7}, then {0,2} plus {1,3}, then lane 0 plus
; lane 1. %f1 is unused. Only the two AVX runs check this function; the quoted
; labels pin their checks to it (the quote keeps the label from matching the
; longer no_pairwise_ name).
define fastcc float @pairwise_reduction8float(<8 x float> %rdx, float %f1) {
  %rdx.shuf.0.0 = shufflevector <8 x float> %rdx, <8 x float> undef, <8 x i32> <i32 0, i32 2, i32 4, i32 6,i32 undef, i32 undef, i32 undef, i32 undef>
  %rdx.shuf.0.1 = shufflevector <8 x float> %rdx, <8 x float> undef, <8 x i32> <i32 1, i32 3, i32 5, i32 7,i32 undef, i32 undef, i32 undef, i32 undef>
  %bin.rdx = fadd <8 x float> %rdx.shuf.0.0, %rdx.shuf.0.1
  %rdx.shuf.1.0 = shufflevector <8 x float> %bin.rdx, <8 x float> undef,<8 x i32> <i32 0, i32 2, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  %rdx.shuf.1.1 = shufflevector <8 x float> %bin.rdx, <8 x float> undef,<8 x i32> <i32 1, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  %bin.rdx8 = fadd <8 x float> %rdx.shuf.1.0, %rdx.shuf.1.1
  %rdx.shuf.2.0 = shufflevector <8 x float> %bin.rdx8, <8 x float> undef,<8 x i32> <i32 0, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  %rdx.shuf.2.1 = shufflevector <8 x float> %bin.rdx8, <8 x float> undef,<8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  %bin.rdx9 = fadd <8 x float> %rdx.shuf.2.0, %rdx.shuf.2.1

; AVX-LABEL: 'pairwise_reduction8float'
; AVX:  cost of 7 {{.*}} extractelement
; AVX2-LABEL: 'pairwise_reduction8float'
; AVX2:  cost of 7 {{.*}} extractelement

  %r = extractelement <8 x float> %bin.rdx9, i32 0
  ret float %r
}

; Pairwise integer add reduction of a <2 x i64> vector: one level adds the
; lane-0 shuffle to the lane-1 shuffle. %f1 is unused. The quoted per-prefix
; labels pin each cost check to this function (the quote keeps the label from
; matching the longer no_pairwise_ name).
define fastcc i64 @pairwise_reduction2i64(<2 x i64> %rdx, i64 %f1) {
  %rdx.shuf.1.0 = shufflevector <2 x i64> %rdx, <2 x i64> undef, <2 x i32> <i32 0, i32 undef>
  %rdx.shuf.1.1 = shufflevector <2 x i64> %rdx, <2 x i64> undef, <2 x i32> <i32 1, i32 undef>
  %bin.rdx8 = add <2 x i64> %rdx.shuf.1.0, %rdx.shuf.1.1

; SSE3-LABEL: 'pairwise_reduction2i64'
; SSE3:  cost of 2 {{.*}} extractelement
; AVX-LABEL: 'pairwise_reduction2i64'
; AVX:  cost of 1 {{.*}} extractelement
; AVX2-LABEL: 'pairwise_reduction2i64'
; AVX2:  cost of 1 {{.*}} extractelement

  %r = extractelement <2 x i64> %bin.rdx8, i32 0
  ret i64 %r
}

; Pairwise integer add reduction of a <4 x i32> vector in two levels: even
; lanes {0,2} plus odd lanes {1,3}, then lane 0 plus lane 1. %f1 is unused.
; The quoted per-prefix labels pin each cost check to this function (the quote
; keeps the label from matching the longer no_pairwise_ name).
define fastcc i32 @pairwise_reduction4i32(<4 x i32> %rdx, i32 %f1) {
  %rdx.shuf.0.0 = shufflevector <4 x i32> %rdx, <4 x i32> undef, <4 x i32> <i32 0, i32 2, i32 undef, i32 undef>
  %rdx.shuf.0.1 = shufflevector <4 x i32> %rdx, <4 x i32> undef, <4 x i32> <i32 1, i32 3, i32 undef, i32 undef>
  %bin.rdx = add <4 x i32> %rdx.shuf.0.0, %rdx.shuf.0.1
  %rdx.shuf.1.0 = shufflevector <4 x i32> %bin.rdx, <4 x i32> undef, <4 x i32> <i32 0, i32 undef, i32 undef, i32 undef>
  %rdx.shuf.1.1 = shufflevector <4 x i32> %bin.rdx, <4 x i32> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
  %bin.rdx8 = add <4 x i32> %rdx.shuf.1.0, %rdx.shuf.1.1

; SSE3-LABEL: 'pairwise_reduction4i32'
; SSE3:  cost of 3 {{.*}} extractelement
; AVX-LABEL: 'pairwise_reduction4i32'
; AVX:  cost of 3 {{.*}} extractelement
; AVX2-LABEL: 'pairwise_reduction4i32'
; AVX2:  cost of 3 {{.*}} extractelement

  %r = extractelement <4 x i32> %bin.rdx8, i32 0
  ret i32 %r
}

; Pairwise integer add reduction of a <4 x i64> vector in two levels: even
; lanes {0,2} plus odd lanes {1,3}, then lane 0 plus lane 1. %f1 is unused.
; Only the two AVX runs check this function; the quoted labels pin their
; checks to it (the quote keeps the label from matching the longer
; no_pairwise_ name).
define fastcc i64 @pairwise_reduction4i64(<4 x i64> %rdx, i64 %f1) {
  %rdx.shuf.0.0 = shufflevector <4 x i64> %rdx, <4 x i64> undef, <4 x i32> <i32 0, i32 2, i32 undef, i32 undef>
  %rdx.shuf.0.1 = shufflevector <4 x i64> %rdx, <4 x i64> undef, <4 x i32> <i32 1, i32 3, i32 undef, i32 undef>
  %bin.rdx = add <4 x i64> %rdx.shuf.0.0, %rdx.shuf.0.1
  %rdx.shuf.1.0 = shufflevector <4 x i64> %bin.rdx, <4 x i64> undef, <4 x i32> <i32 0, i32 undef, i32 undef, i32 undef>
  %rdx.shuf.1.1 = shufflevector <4 x i64> %bin.rdx, <4 x i64> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
  %bin.rdx8 = add <4 x i64> %rdx.shuf.1.0, %rdx.shuf.1.1

; AVX-LABEL: 'pairwise_reduction4i64'
; AVX:  cost of 5 {{.*}} extractelement
; AVX2-LABEL: 'pairwise_reduction4i64'
; AVX2:  cost of 5 {{.*}} extractelement

  %r = extractelement <4 x i64> %bin.rdx8, i32 0
  ret i64 %r
}

; Pairwise integer add reduction of an <8 x i16> vector in three levels: even
; lanes {0,2,4,6} plus odd lanes {1,3,5,7}, then {0,2} plus {1,3}, then lane 0
; plus lane 1. %f1 is unused. The quoted per-prefix labels pin each cost check
; to this function (the quote keeps the label from matching the longer
; no_pairwise_ name).
define fastcc i16 @pairwise_reduction8i16(<8 x i16> %rdx, i16 %f1) {
  %rdx.shuf.0.0 = shufflevector <8 x i16> %rdx, <8 x i16> undef, <8 x i32> <i32 0, i32 2, i32 4, i32 6,i32 undef, i32 undef, i32 undef, i32 undef>
  %rdx.shuf.0.1 = shufflevector <8 x i16> %rdx, <8 x i16> undef, <8 x i32> <i32 1, i32 3, i32 5, i32 7,i32 undef, i32 undef, i32 undef, i32 undef>
  %bin.rdx = add <8 x i16> %rdx.shuf.0.0, %rdx.shuf.0.1
  %rdx.shuf.1.0 = shufflevector <8 x i16> %bin.rdx, <8 x i16> undef,<8 x i32> <i32 0, i32 2, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  %rdx.shuf.1.1 = shufflevector <8 x i16> %bin.rdx, <8 x i16> undef,<8 x i32> <i32 1, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  %bin.rdx8 = add <8 x i16> %rdx.shuf.1.0, %rdx.shuf.1.1
  %rdx.shuf.2.0 = shufflevector <8 x i16> %bin.rdx8, <8 x i16> undef,<8 x i32> <i32 0, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  %rdx.shuf.2.1 = shufflevector <8 x i16> %bin.rdx8, <8 x i16> undef,<8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  %bin.rdx9 = add <8 x i16> %rdx.shuf.2.0, %rdx.shuf.2.1

; SSE3-LABEL: 'pairwise_reduction8i16'
; SSE3:  cost of 5 {{.*}} extractelement
; AVX-LABEL: 'pairwise_reduction8i16'
; AVX:  cost of 5 {{.*}} extractelement
; AVX2-LABEL: 'pairwise_reduction8i16'
; AVX2:  cost of 5 {{.*}} extractelement

  %r = extractelement <8 x i16> %bin.rdx9, i32 0
  ret i16 %r
}

; Pairwise integer add reduction of an <8 x i32> vector in three levels: even
; lanes {0,2,4,6} plus odd lanes {1,3,5,7}, then {0,2} plus {1,3}, then lane 0
; plus lane 1. %f1 is unused. Only the two AVX runs check this function; the
; quoted labels pin their checks to it (the quote keeps the label from
; matching the longer no_pairwise_ name).
define fastcc i32 @pairwise_reduction8i32(<8 x i32> %rdx, i32 %f1) {
  %rdx.shuf.0.0 = shufflevector <8 x i32> %rdx, <8 x i32> undef, <8 x i32> <i32 0, i32 2, i32 4, i32 6,i32 undef, i32 undef, i32 undef, i32 undef>
  %rdx.shuf.0.1 = shufflevector <8 x i32> %rdx, <8 x i32> undef, <8 x i32> <i32 1, i32 3, i32 5, i32 7,i32 undef, i32 undef, i32 undef, i32 undef>
  %bin.rdx = add <8 x i32> %rdx.shuf.0.0, %rdx.shuf.0.1
  %rdx.shuf.1.0 = shufflevector <8 x i32> %bin.rdx, <8 x i32> undef,<8 x i32> <i32 0, i32 2, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  %rdx.shuf.1.1 = shufflevector <8 x i32> %bin.rdx, <8 x i32> undef,<8 x i32> <i32 1, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  %bin.rdx8 = add <8 x i32> %rdx.shuf.1.0, %rdx.shuf.1.1
  %rdx.shuf.2.0 = shufflevector <8 x i32> %bin.rdx8, <8 x i32> undef,<8 x i32> <i32 0, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  %rdx.shuf.2.1 = shufflevector <8 x i32> %bin.rdx8, <8 x i32> undef,<8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  %bin.rdx9 = add <8 x i32> %rdx.shuf.2.0, %rdx.shuf.2.1

; AVX-LABEL: 'pairwise_reduction8i32'
; AVX:  cost of 5 {{.*}} extractelement
; AVX2-LABEL: 'pairwise_reduction8i32'
; AVX2:  cost of 5 {{.*}} extractelement

  %r = extractelement <8 x i32> %bin.rdx9, i32 0
  ret i32 %r
}
