; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefixes=SSE,SSE2
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefixes=SSE,SSE41
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefixes=AVX,AVX1-SLOW
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx,+fast-hops | FileCheck %s --check-prefixes=AVX,AVX1-FAST
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=AVX,AVX2
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512bw | FileCheck %s --check-prefix=AVX512
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512bw,+avx512vl | FileCheck %s --check-prefix=AVX512
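
; These reductions carry no 'reassoc' fast-math flag, so the backend must
; preserve the sequential (in-order) association: each element is combined
; with a separate scalar addss/addsd instead of a log2 shuffle-and-add tree.
; With +fast-hops (AVX1-FAST), only the first step uses vhaddps/vhaddpd.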

;
; vXf32 (accum)
;

define float @test_v2f32(float %a0, <2 x float> %a1) {
; SSE2-LABEL: test_v2f32:
; SSE2:       # %bb.0:
; SSE2-NEXT:    addss %xmm1, %xmm0
; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[1,1,1,1]
; SSE2-NEXT:    addss %xmm1, %xmm0
; SSE2-NEXT:    retq
;
; SSE41-LABEL: test_v2f32:
; SSE41:       # %bb.0:
; SSE41-NEXT:    addss %xmm1, %xmm0
; SSE41-NEXT:    movshdup {{.*#+}} xmm1 = xmm1[1,1,3,3]
; SSE41-NEXT:    addss %xmm1, %xmm0
; SSE41-NEXT:    retq
;
; AVX-LABEL: test_v2f32:
; AVX:       # %bb.0:
; AVX-NEXT:    vaddss %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm1[1,1,3,3]
; AVX-NEXT:    vaddss %xmm1, %xmm0, %xmm0
; AVX-NEXT:    retq
;
; AVX512-LABEL: test_v2f32:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vaddss %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm1[1,1,3,3]
; AVX512-NEXT:    vaddss %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    retq
  %1 = call float @llvm.vector.reduce.fadd.f32.v2f32(float %a0, <2 x float> %a1)
  ret float %1
}

define float @test_v4f32(float %a0, <4 x float> %a1) {
; SSE2-LABEL: test_v4f32:
; SSE2:       # %bb.0:
; SSE2-NEXT:    addss %xmm1, %xmm0
; SSE2-NEXT:    movaps %xmm1, %xmm2
; SSE2-NEXT:    shufps {{.*#+}} xmm2 = xmm2[1,1],xmm1[1,1]
; SSE2-NEXT:    addss %xmm2, %xmm0
; SSE2-NEXT:    movaps %xmm1, %xmm2
; SSE2-NEXT:    unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm1[1]
; SSE2-NEXT:    addss %xmm2, %xmm0
; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[3,3,3,3]
; SSE2-NEXT:    addss %xmm1, %xmm0
; SSE2-NEXT:    retq
;
; SSE41-LABEL: test_v4f32:
; SSE41:       # %bb.0:
; SSE41-NEXT:    addss %xmm1, %xmm0
; SSE41-NEXT:    movshdup {{.*#+}} xmm2 = xmm1[1,1,3,3]
; SSE41-NEXT:    addss %xmm2, %xmm0
; SSE41-NEXT:    movaps %xmm1, %xmm2
; SSE41-NEXT:    unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm1[1]
; SSE41-NEXT:    addss %xmm2, %xmm0
; SSE41-NEXT:    shufps {{.*#+}} xmm1 = xmm1[3,3,3,3]
; SSE41-NEXT:    addss %xmm1, %xmm0
; SSE41-NEXT:    retq
;
; AVX-LABEL: test_v4f32:
; AVX:       # %bb.0:
; AVX-NEXT:    vaddss %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vmovshdup {{.*#+}} xmm2 = xmm1[1,1,3,3]
; AVX-NEXT:    vaddss %xmm2, %xmm0, %xmm0
; AVX-NEXT:    vpermilpd {{.*#+}} xmm2 = xmm1[1,0]
; AVX-NEXT:    vaddss %xmm2, %xmm0, %xmm0
; AVX-NEXT:    vpermilps {{.*#+}} xmm1 = xmm1[3,3,3,3]
; AVX-NEXT:    vaddss %xmm1, %xmm0, %xmm0
; AVX-NEXT:    retq
;
; AVX512-LABEL: test_v4f32:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vaddss %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    vmovshdup {{.*#+}} xmm2 = xmm1[1,1,3,3]
; AVX512-NEXT:    vaddss %xmm2, %xmm0, %xmm0
; AVX512-NEXT:    vpermilpd {{.*#+}} xmm2 = xmm1[1,0]
; AVX512-NEXT:    vaddss %xmm2, %xmm0, %xmm0
; AVX512-NEXT:    vpermilps {{.*#+}} xmm1 = xmm1[3,3,3,3]
; AVX512-NEXT:    vaddss %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    retq
  %1 = call float @llvm.vector.reduce.fadd.f32.v4f32(float %a0, <4 x float> %a1)
  ret float %1
}

define float @test_v8f32(float %a0, <8 x float> %a1) {
; SSE2-LABEL: test_v8f32:
; SSE2:       # %bb.0:
; SSE2-NEXT:    addss %xmm1, %xmm0
; SSE2-NEXT:    movaps %xmm1, %xmm3
; SSE2-NEXT:    shufps {{.*#+}} xmm3 = xmm3[1,1],xmm1[1,1]
; SSE2-NEXT:    addss %xmm3, %xmm0
; SSE2-NEXT:    movaps %xmm1, %xmm3
; SSE2-NEXT:    unpckhpd {{.*#+}} xmm3 = xmm3[1],xmm1[1]
; SSE2-NEXT:    addss %xmm3, %xmm0
; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[3,3,3,3]
; SSE2-NEXT:    addss %xmm1, %xmm0
; SSE2-NEXT:    addss %xmm2, %xmm0
; SSE2-NEXT:    movaps %xmm2, %xmm1
; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[1,1],xmm2[1,1]
; SSE2-NEXT:    addss %xmm1, %xmm0
; SSE2-NEXT:    movaps %xmm2, %xmm1
; SSE2-NEXT:    unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm2[1]
; SSE2-NEXT:    addss %xmm1, %xmm0
; SSE2-NEXT:    shufps {{.*#+}} xmm2 = xmm2[3,3,3,3]
; SSE2-NEXT:    addss %xmm2, %xmm0
; SSE2-NEXT:    retq
;
; SSE41-LABEL: test_v8f32:
; SSE41:       # %bb.0:
; SSE41-NEXT:    addss %xmm1, %xmm0
; SSE41-NEXT:    movshdup {{.*#+}} xmm3 = xmm1[1,1,3,3]
; SSE41-NEXT:    addss %xmm3, %xmm0
; SSE41-NEXT:    movaps %xmm1, %xmm3
; SSE41-NEXT:    unpckhpd {{.*#+}} xmm3 = xmm3[1],xmm1[1]
; SSE41-NEXT:    addss %xmm3, %xmm0
; SSE41-NEXT:    shufps {{.*#+}} xmm1 = xmm1[3,3,3,3]
; SSE41-NEXT:    addss %xmm1, %xmm0
; SSE41-NEXT:    addss %xmm2, %xmm0
; SSE41-NEXT:    movshdup {{.*#+}} xmm1 = xmm2[1,1,3,3]
; SSE41-NEXT:    addss %xmm1, %xmm0
; SSE41-NEXT:    movaps %xmm2, %xmm1
; SSE41-NEXT:    unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm2[1]
; SSE41-NEXT:    addss %xmm1, %xmm0
; SSE41-NEXT:    shufps {{.*#+}} xmm2 = xmm2[3,3,3,3]
; SSE41-NEXT:    addss %xmm2, %xmm0
; SSE41-NEXT:    retq
;
; AVX-LABEL: test_v8f32:
; AVX:       # %bb.0:
; AVX-NEXT:    vaddss %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vmovshdup {{.*#+}} xmm2 = xmm1[1,1,3,3]
; AVX-NEXT:    vaddss %xmm2, %xmm0, %xmm0
; AVX-NEXT:    vpermilpd {{.*#+}} xmm2 = xmm1[1,0]
; AVX-NEXT:    vaddss %xmm2, %xmm0, %xmm0
; AVX-NEXT:    vpermilps {{.*#+}} xmm2 = xmm1[3,3,3,3]
; AVX-NEXT:    vaddss %xmm2, %xmm0, %xmm0
; AVX-NEXT:    vextractf128 $1, %ymm1, %xmm1
; AVX-NEXT:    vaddss %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vmovshdup {{.*#+}} xmm2 = xmm1[1,1,3,3]
; AVX-NEXT:    vaddss %xmm2, %xmm0, %xmm0
; AVX-NEXT:    vpermilpd {{.*#+}} xmm2 = xmm1[1,0]
; AVX-NEXT:    vaddss %xmm2, %xmm0, %xmm0
; AVX-NEXT:    vpermilps {{.*#+}} xmm1 = xmm1[3,3,3,3]
; AVX-NEXT:    vaddss %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vzeroupper
; AVX-NEXT:    retq
;
; AVX512-LABEL: test_v8f32:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vaddss %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    vmovshdup {{.*#+}} xmm2 = xmm1[1,1,3,3]
; AVX512-NEXT:    vaddss %xmm2, %xmm0, %xmm0
; AVX512-NEXT:    vpermilpd {{.*#+}} xmm2 = xmm1[1,0]
; AVX512-NEXT:    vaddss %xmm2, %xmm0, %xmm0
; AVX512-NEXT:    vpermilps {{.*#+}} xmm2 = xmm1[3,3,3,3]
; AVX512-NEXT:    vaddss %xmm2, %xmm0, %xmm0
; AVX512-NEXT:    vextractf128 $1, %ymm1, %xmm1
; AVX512-NEXT:    vaddss %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    vmovshdup {{.*#+}} xmm2 = xmm1[1,1,3,3]
; AVX512-NEXT:    vaddss %xmm2, %xmm0, %xmm0
; AVX512-NEXT:    vpermilpd {{.*#+}} xmm2 = xmm1[1,0]
; AVX512-NEXT:    vaddss %xmm2, %xmm0, %xmm0
; AVX512-NEXT:    vpermilps {{.*#+}} xmm1 = xmm1[3,3,3,3]
; AVX512-NEXT:    vaddss %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    vzeroupper
; AVX512-NEXT:    retq
  %1 = call float @llvm.vector.reduce.fadd.f32.v8f32(float %a0, <8 x float> %a1)
  ret float %1
}

define float @test_v16f32(float %a0, <16 x float> %a1) {
; SSE2-LABEL: test_v16f32:
; SSE2:       # %bb.0:
; SSE2-NEXT:    addss %xmm1, %xmm0
; SSE2-NEXT:    movaps %xmm1, %xmm5
; SSE2-NEXT:    shufps {{.*#+}} xmm5 = xmm5[1,1],xmm1[1,1]
; SSE2-NEXT:    addss %xmm5, %xmm0
; SSE2-NEXT:    movaps %xmm1, %xmm5
; SSE2-NEXT:    unpckhpd {{.*#+}} xmm5 = xmm5[1],xmm1[1]
; SSE2-NEXT:    addss %xmm5, %xmm0
; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[3,3,3,3]
; SSE2-NEXT:    addss %xmm1, %xmm0
; SSE2-NEXT:    addss %xmm2, %xmm0
; SSE2-NEXT:    movaps %xmm2, %xmm1
; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[1,1],xmm2[1,1]
; SSE2-NEXT:    addss %xmm1, %xmm0
; SSE2-NEXT:    movaps %xmm2, %xmm1
; SSE2-NEXT:    unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm2[1]
; SSE2-NEXT:    addss %xmm1, %xmm0
; SSE2-NEXT:    shufps {{.*#+}} xmm2 = xmm2[3,3,3,3]
; SSE2-NEXT:    addss %xmm2, %xmm0
; SSE2-NEXT:    addss %xmm3, %xmm0
; SSE2-NEXT:    movaps %xmm3, %xmm1
; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[1,1],xmm3[1,1]
; SSE2-NEXT:    addss %xmm1, %xmm0
; SSE2-NEXT:    movaps %xmm3, %xmm1
; SSE2-NEXT:    unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm3[1]
; SSE2-NEXT:    addss %xmm1, %xmm0
; SSE2-NEXT:    shufps {{.*#+}} xmm3 = xmm3[3,3,3,3]
; SSE2-NEXT:    addss %xmm3, %xmm0
; SSE2-NEXT:    addss %xmm4, %xmm0
; SSE2-NEXT:    movaps %xmm4, %xmm1
; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[1,1],xmm4[1,1]
; SSE2-NEXT:    addss %xmm1, %xmm0
; SSE2-NEXT:    movaps %xmm4, %xmm1
; SSE2-NEXT:    unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm4[1]
; SSE2-NEXT:    addss %xmm1, %xmm0
; SSE2-NEXT:    shufps {{.*#+}} xmm4 = xmm4[3,3,3,3]
; SSE2-NEXT:    addss %xmm4, %xmm0
; SSE2-NEXT:    retq
;
; SSE41-LABEL: test_v16f32:
; SSE41:       # %bb.0:
; SSE41-NEXT:    addss %xmm1, %xmm0
; SSE41-NEXT:    movshdup {{.*#+}} xmm5 = xmm1[1,1,3,3]
; SSE41-NEXT:    addss %xmm5, %xmm0
; SSE41-NEXT:    movaps %xmm1, %xmm5
; SSE41-NEXT:    unpckhpd {{.*#+}} xmm5 = xmm5[1],xmm1[1]
; SSE41-NEXT:    addss %xmm5, %xmm0
; SSE41-NEXT:    shufps {{.*#+}} xmm1 = xmm1[3,3,3,3]
; SSE41-NEXT:    addss %xmm1, %xmm0
; SSE41-NEXT:    addss %xmm2, %xmm0
; SSE41-NEXT:    movshdup {{.*#+}} xmm1 = xmm2[1,1,3,3]
; SSE41-NEXT:    addss %xmm1, %xmm0
; SSE41-NEXT:    movaps %xmm2, %xmm1
; SSE41-NEXT:    unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm2[1]
; SSE41-NEXT:    addss %xmm1, %xmm0
; SSE41-NEXT:    shufps {{.*#+}} xmm2 = xmm2[3,3,3,3]
; SSE41-NEXT:    addss %xmm2, %xmm0
; SSE41-NEXT:    addss %xmm3, %xmm0
; SSE41-NEXT:    movshdup {{.*#+}} xmm1 = xmm3[1,1,3,3]
; SSE41-NEXT:    addss %xmm1, %xmm0
; SSE41-NEXT:    movaps %xmm3, %xmm1
; SSE41-NEXT:    unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm3[1]
; SSE41-NEXT:    addss %xmm1, %xmm0
; SSE41-NEXT:    shufps {{.*#+}} xmm3 = xmm3[3,3,3,3]
; SSE41-NEXT:    addss %xmm3, %xmm0
; SSE41-NEXT:    addss %xmm4, %xmm0
; SSE41-NEXT:    movshdup {{.*#+}} xmm1 = xmm4[1,1,3,3]
; SSE41-NEXT:    addss %xmm1, %xmm0
; SSE41-NEXT:    movaps %xmm4, %xmm1
; SSE41-NEXT:    unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm4[1]
; SSE41-NEXT:    addss %xmm1, %xmm0
; SSE41-NEXT:    shufps {{.*#+}} xmm4 = xmm4[3,3,3,3]
; SSE41-NEXT:    addss %xmm4, %xmm0
; SSE41-NEXT:    retq
;
; AVX-LABEL: test_v16f32:
; AVX:       # %bb.0:
; AVX-NEXT:    vaddss %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vmovshdup {{.*#+}} xmm3 = xmm1[1,1,3,3]
; AVX-NEXT:    vaddss %xmm3, %xmm0, %xmm0
; AVX-NEXT:    vpermilpd {{.*#+}} xmm3 = xmm1[1,0]
; AVX-NEXT:    vaddss %xmm3, %xmm0, %xmm0
; AVX-NEXT:    vpermilps {{.*#+}} xmm3 = xmm1[3,3,3,3]
; AVX-NEXT:    vaddss %xmm3, %xmm0, %xmm0
; AVX-NEXT:    vextractf128 $1, %ymm1, %xmm1
; AVX-NEXT:    vaddss %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vmovshdup {{.*#+}} xmm3 = xmm1[1,1,3,3]
; AVX-NEXT:    vaddss %xmm3, %xmm0, %xmm0
; AVX-NEXT:    vpermilpd {{.*#+}} xmm3 = xmm1[1,0]
; AVX-NEXT:    vaddss %xmm3, %xmm0, %xmm0
; AVX-NEXT:    vpermilps {{.*#+}} xmm1 = xmm1[3,3,3,3]
; AVX-NEXT:    vaddss %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vaddss %xmm2, %xmm0, %xmm0
; AVX-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm2[1,1,3,3]
; AVX-NEXT:    vaddss %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm2[1,0]
; AVX-NEXT:    vaddss %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vpermilps {{.*#+}} xmm1 = xmm2[3,3,3,3]
; AVX-NEXT:    vaddss %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vextractf128 $1, %ymm2, %xmm1
; AVX-NEXT:    vaddss %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vmovshdup {{.*#+}} xmm2 = xmm1[1,1,3,3]
; AVX-NEXT:    vaddss %xmm2, %xmm0, %xmm0
; AVX-NEXT:    vpermilpd {{.*#+}} xmm2 = xmm1[1,0]
; AVX-NEXT:    vaddss %xmm2, %xmm0, %xmm0
; AVX-NEXT:    vpermilps {{.*#+}} xmm1 = xmm1[3,3,3,3]
; AVX-NEXT:    vaddss %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vzeroupper
; AVX-NEXT:    retq
;
; AVX512-LABEL: test_v16f32:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vaddss %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    vmovshdup {{.*#+}} xmm2 = xmm1[1,1,3,3]
; AVX512-NEXT:    vaddss %xmm2, %xmm0, %xmm0
; AVX512-NEXT:    vpermilpd {{.*#+}} xmm2 = xmm1[1,0]
; AVX512-NEXT:    vaddss %xmm2, %xmm0, %xmm0
; AVX512-NEXT:    vpermilps {{.*#+}} xmm2 = xmm1[3,3,3,3]
; AVX512-NEXT:    vaddss %xmm2, %xmm0, %xmm0
; AVX512-NEXT:    vextractf128 $1, %ymm1, %xmm2
; AVX512-NEXT:    vaddss %xmm2, %xmm0, %xmm0
; AVX512-NEXT:    vmovshdup {{.*#+}} xmm3 = xmm2[1,1,3,3]
; AVX512-NEXT:    vaddss %xmm3, %xmm0, %xmm0
; AVX512-NEXT:    vpermilpd {{.*#+}} xmm3 = xmm2[1,0]
; AVX512-NEXT:    vaddss %xmm3, %xmm0, %xmm0
; AVX512-NEXT:    vpermilps {{.*#+}} xmm2 = xmm2[3,3,3,3]
; AVX512-NEXT:    vaddss %xmm2, %xmm0, %xmm0
; AVX512-NEXT:    vextractf32x4 $2, %zmm1, %xmm2
; AVX512-NEXT:    vaddss %xmm2, %xmm0, %xmm0
; AVX512-NEXT:    vmovshdup {{.*#+}} xmm3 = xmm2[1,1,3,3]
; AVX512-NEXT:    vaddss %xmm3, %xmm0, %xmm0
; AVX512-NEXT:    vpermilpd {{.*#+}} xmm3 = xmm2[1,0]
; AVX512-NEXT:    vaddss %xmm3, %xmm0, %xmm0
; AVX512-NEXT:    vpermilps {{.*#+}} xmm2 = xmm2[3,3,3,3]
; AVX512-NEXT:    vaddss %xmm2, %xmm0, %xmm0
; AVX512-NEXT:    vextractf32x4 $3, %zmm1, %xmm1
; AVX512-NEXT:    vaddss %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    vmovshdup {{.*#+}} xmm2 = xmm1[1,1,3,3]
; AVX512-NEXT:    vaddss %xmm2, %xmm0, %xmm0
; AVX512-NEXT:    vpermilpd {{.*#+}} xmm2 = xmm1[1,0]
; AVX512-NEXT:    vaddss %xmm2, %xmm0, %xmm0
; AVX512-NEXT:    vpermilps {{.*#+}} xmm1 = xmm1[3,3,3,3]
; AVX512-NEXT:    vaddss %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    vzeroupper
; AVX512-NEXT:    retq
  %1 = call float @llvm.vector.reduce.fadd.f32.v16f32(float %a0, <16 x float> %a1)
  ret float %1
}

;
; vXf32 (zero)
;

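; A start value of -0.0 is the identity for fadd, so the accumulator operand
; folds away and the reduction begins directly with the first element.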
define float @test_v2f32_zero(<2 x float> %a0) {
; SSE2-LABEL: test_v2f32_zero:
; SSE2:       # %bb.0:
; SSE2-NEXT:    movaps %xmm0, %xmm1
; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[1,1],xmm0[1,1]
; SSE2-NEXT:    addss %xmm0, %xmm1
; SSE2-NEXT:    movaps %xmm1, %xmm0
; SSE2-NEXT:    retq
;
; SSE41-LABEL: test_v2f32_zero:
; SSE41:       # %bb.0:
; SSE41-NEXT:    movshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; SSE41-NEXT:    addss %xmm1, %xmm0
; SSE41-NEXT:    retq
;
; AVX1-SLOW-LABEL: test_v2f32_zero:
; AVX1-SLOW:       # %bb.0:
; AVX1-SLOW-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; AVX1-SLOW-NEXT:    vaddss %xmm1, %xmm0, %xmm0
; AVX1-SLOW-NEXT:    retq
;
; AVX1-FAST-LABEL: test_v2f32_zero:
; AVX1-FAST:       # %bb.0:
; AVX1-FAST-NEXT:    vhaddps %xmm0, %xmm0, %xmm0
; AVX1-FAST-NEXT:    retq
;
; AVX2-LABEL: test_v2f32_zero:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; AVX2-NEXT:    vaddss %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    retq
;
; AVX512-LABEL: test_v2f32_zero:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; AVX512-NEXT:    vaddss %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    retq
  %1 = call float @llvm.vector.reduce.fadd.f32.v2f32(float -0.0, <2 x float> %a0)
  ret float %1
}

define float @test_v4f32_zero(<4 x float> %a0) {
; SSE2-LABEL: test_v4f32_zero:
; SSE2:       # %bb.0:
; SSE2-NEXT:    movaps %xmm0, %xmm1
; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[1,1],xmm0[1,1]
; SSE2-NEXT:    addss %xmm0, %xmm1
; SSE2-NEXT:    movaps %xmm0, %xmm2
; SSE2-NEXT:    unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm0[1]
; SSE2-NEXT:    addss %xmm1, %xmm2
; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[3,3,3,3]
; SSE2-NEXT:    addss %xmm2, %xmm0
; SSE2-NEXT:    retq
;
; SSE41-LABEL: test_v4f32_zero:
; SSE41:       # %bb.0:
; SSE41-NEXT:    movshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; SSE41-NEXT:    addss %xmm0, %xmm1
; SSE41-NEXT:    movaps %xmm0, %xmm2
; SSE41-NEXT:    unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm0[1]
; SSE41-NEXT:    addss %xmm1, %xmm2
; SSE41-NEXT:    shufps {{.*#+}} xmm0 = xmm0[3,3,3,3]
; SSE41-NEXT:    addss %xmm2, %xmm0
; SSE41-NEXT:    retq
;
; AVX1-SLOW-LABEL: test_v4f32_zero:
; AVX1-SLOW:       # %bb.0:
; AVX1-SLOW-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; AVX1-SLOW-NEXT:    vaddss %xmm1, %xmm0, %xmm1
; AVX1-SLOW-NEXT:    vpermilpd {{.*#+}} xmm2 = xmm0[1,0]
; AVX1-SLOW-NEXT:    vaddss %xmm2, %xmm1, %xmm1
; AVX1-SLOW-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[3,3,3,3]
; AVX1-SLOW-NEXT:    vaddss %xmm0, %xmm1, %xmm0
; AVX1-SLOW-NEXT:    retq
;
; AVX1-FAST-LABEL: test_v4f32_zero:
; AVX1-FAST:       # %bb.0:
; AVX1-FAST-NEXT:    vhaddps %xmm0, %xmm0, %xmm1
; AVX1-FAST-NEXT:    vpermilpd {{.*#+}} xmm2 = xmm0[1,0]
; AVX1-FAST-NEXT:    vaddss %xmm2, %xmm1, %xmm1
; AVX1-FAST-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[3,3,3,3]
; AVX1-FAST-NEXT:    vaddss %xmm0, %xmm1, %xmm0
; AVX1-FAST-NEXT:    retq
;
; AVX2-LABEL: test_v4f32_zero:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; AVX2-NEXT:    vaddss %xmm1, %xmm0, %xmm1
; AVX2-NEXT:    vpermilpd {{.*#+}} xmm2 = xmm0[1,0]
; AVX2-NEXT:    vaddss %xmm2, %xmm1, %xmm1
; AVX2-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[3,3,3,3]
; AVX2-NEXT:    vaddss %xmm0, %xmm1, %xmm0
; AVX2-NEXT:    retq
;
; AVX512-LABEL: test_v4f32_zero:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; AVX512-NEXT:    vaddss %xmm1, %xmm0, %xmm1
; AVX512-NEXT:    vpermilpd {{.*#+}} xmm2 = xmm0[1,0]
; AVX512-NEXT:    vaddss %xmm2, %xmm1, %xmm1
; AVX512-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[3,3,3,3]
; AVX512-NEXT:    vaddss %xmm0, %xmm1, %xmm0
; AVX512-NEXT:    retq
  %1 = call float @llvm.vector.reduce.fadd.f32.v4f32(float -0.0, <4 x float> %a0)
  ret float %1
}

define float @test_v8f32_zero(<8 x float> %a0) {
; SSE2-LABEL: test_v8f32_zero:
; SSE2:       # %bb.0:
; SSE2-NEXT:    movaps %xmm0, %xmm2
; SSE2-NEXT:    shufps {{.*#+}} xmm2 = xmm2[1,1],xmm0[1,1]
; SSE2-NEXT:    addss %xmm0, %xmm2
; SSE2-NEXT:    movaps %xmm0, %xmm3
; SSE2-NEXT:    unpckhpd {{.*#+}} xmm3 = xmm3[1],xmm0[1]
; SSE2-NEXT:    addss %xmm2, %xmm3
; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[3,3,3,3]
; SSE2-NEXT:    addss %xmm3, %xmm0
; SSE2-NEXT:    addss %xmm1, %xmm0
; SSE2-NEXT:    movaps %xmm1, %xmm2
; SSE2-NEXT:    shufps {{.*#+}} xmm2 = xmm2[1,1],xmm1[1,1]
; SSE2-NEXT:    addss %xmm2, %xmm0
; SSE2-NEXT:    movaps %xmm1, %xmm2
; SSE2-NEXT:    unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm1[1]
; SSE2-NEXT:    addss %xmm2, %xmm0
; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[3,3,3,3]
; SSE2-NEXT:    addss %xmm1, %xmm0
; SSE2-NEXT:    retq
;
; SSE41-LABEL: test_v8f32_zero:
; SSE41:       # %bb.0:
; SSE41-NEXT:    movshdup {{.*#+}} xmm2 = xmm0[1,1,3,3]
; SSE41-NEXT:    addss %xmm0, %xmm2
; SSE41-NEXT:    movaps %xmm0, %xmm3
; SSE41-NEXT:    unpckhpd {{.*#+}} xmm3 = xmm3[1],xmm0[1]
; SSE41-NEXT:    addss %xmm2, %xmm3
; SSE41-NEXT:    shufps {{.*#+}} xmm0 = xmm0[3,3,3,3]
; SSE41-NEXT:    addss %xmm3, %xmm0
; SSE41-NEXT:    addss %xmm1, %xmm0
; SSE41-NEXT:    movshdup {{.*#+}} xmm2 = xmm1[1,1,3,3]
; SSE41-NEXT:    addss %xmm2, %xmm0
; SSE41-NEXT:    movaps %xmm1, %xmm2
; SSE41-NEXT:    unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm1[1]
; SSE41-NEXT:    addss %xmm2, %xmm0
; SSE41-NEXT:    shufps {{.*#+}} xmm1 = xmm1[3,3,3,3]
; SSE41-NEXT:    addss %xmm1, %xmm0
; SSE41-NEXT:    retq
;
; AVX1-SLOW-LABEL: test_v8f32_zero:
; AVX1-SLOW:       # %bb.0:
; AVX1-SLOW-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; AVX1-SLOW-NEXT:    vaddss %xmm1, %xmm0, %xmm1
; AVX1-SLOW-NEXT:    vpermilpd {{.*#+}} xmm2 = xmm0[1,0]
; AVX1-SLOW-NEXT:    vaddss %xmm2, %xmm1, %xmm1
; AVX1-SLOW-NEXT:    vpermilps {{.*#+}} xmm2 = xmm0[3,3,3,3]
; AVX1-SLOW-NEXT:    vaddss %xmm2, %xmm1, %xmm1
; AVX1-SLOW-NEXT:    vextractf128 $1, %ymm0, %xmm0
; AVX1-SLOW-NEXT:    vaddss %xmm0, %xmm1, %xmm1
; AVX1-SLOW-NEXT:    vmovshdup {{.*#+}} xmm2 = xmm0[1,1,3,3]
; AVX1-SLOW-NEXT:    vaddss %xmm2, %xmm1, %xmm1
; AVX1-SLOW-NEXT:    vpermilpd {{.*#+}} xmm2 = xmm0[1,0]
; AVX1-SLOW-NEXT:    vaddss %xmm2, %xmm1, %xmm1
; AVX1-SLOW-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[3,3,3,3]
; AVX1-SLOW-NEXT:    vaddss %xmm0, %xmm1, %xmm0
; AVX1-SLOW-NEXT:    vzeroupper
; AVX1-SLOW-NEXT:    retq
;
; AVX1-FAST-LABEL: test_v8f32_zero:
; AVX1-FAST:       # %bb.0:
; AVX1-FAST-NEXT:    vhaddps %xmm0, %xmm0, %xmm1
; AVX1-FAST-NEXT:    vpermilpd {{.*#+}} xmm2 = xmm0[1,0]
; AVX1-FAST-NEXT:    vaddss %xmm2, %xmm1, %xmm1
; AVX1-FAST-NEXT:    vpermilps {{.*#+}} xmm2 = xmm0[3,3,3,3]
; AVX1-FAST-NEXT:    vaddss %xmm2, %xmm1, %xmm1
; AVX1-FAST-NEXT:    vextractf128 $1, %ymm0, %xmm0
; AVX1-FAST-NEXT:    vaddss %xmm0, %xmm1, %xmm1
; AVX1-FAST-NEXT:    vmovshdup {{.*#+}} xmm2 = xmm0[1,1,3,3]
; AVX1-FAST-NEXT:    vaddss %xmm2, %xmm1, %xmm1
; AVX1-FAST-NEXT:    vpermilpd {{.*#+}} xmm2 = xmm0[1,0]
; AVX1-FAST-NEXT:    vaddss %xmm2, %xmm1, %xmm1
; AVX1-FAST-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[3,3,3,3]
; AVX1-FAST-NEXT:    vaddss %xmm0, %xmm1, %xmm0
; AVX1-FAST-NEXT:    vzeroupper
; AVX1-FAST-NEXT:    retq
;
; AVX2-LABEL: test_v8f32_zero:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; AVX2-NEXT:    vaddss %xmm1, %xmm0, %xmm1
; AVX2-NEXT:    vpermilpd {{.*#+}} xmm2 = xmm0[1,0]
; AVX2-NEXT:    vaddss %xmm2, %xmm1, %xmm1
; AVX2-NEXT:    vpermilps {{.*#+}} xmm2 = xmm0[3,3,3,3]
; AVX2-NEXT:    vaddss %xmm2, %xmm1, %xmm1
; AVX2-NEXT:    vextractf128 $1, %ymm0, %xmm0
; AVX2-NEXT:    vaddss %xmm0, %xmm1, %xmm1
; AVX2-NEXT:    vmovshdup {{.*#+}} xmm2 = xmm0[1,1,3,3]
; AVX2-NEXT:    vaddss %xmm2, %xmm1, %xmm1
; AVX2-NEXT:    vpermilpd {{.*#+}} xmm2 = xmm0[1,0]
; AVX2-NEXT:    vaddss %xmm2, %xmm1, %xmm1
; AVX2-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[3,3,3,3]
; AVX2-NEXT:    vaddss %xmm0, %xmm1, %xmm0
; AVX2-NEXT:    vzeroupper
; AVX2-NEXT:    retq
;
; AVX512-LABEL: test_v8f32_zero:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; AVX512-NEXT:    vaddss %xmm1, %xmm0, %xmm1
; AVX512-NEXT:    vpermilpd {{.*#+}} xmm2 = xmm0[1,0]
; AVX512-NEXT:    vaddss %xmm2, %xmm1, %xmm1
; AVX512-NEXT:    vpermilps {{.*#+}} xmm2 = xmm0[3,3,3,3]
; AVX512-NEXT:    vaddss %xmm2, %xmm1, %xmm1
; AVX512-NEXT:    vextractf128 $1, %ymm0, %xmm0
; AVX512-NEXT:    vaddss %xmm0, %xmm1, %xmm1
; AVX512-NEXT:    vmovshdup {{.*#+}} xmm2 = xmm0[1,1,3,3]
; AVX512-NEXT:    vaddss %xmm2, %xmm1, %xmm1
; AVX512-NEXT:    vpermilpd {{.*#+}} xmm2 = xmm0[1,0]
; AVX512-NEXT:    vaddss %xmm2, %xmm1, %xmm1
; AVX512-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[3,3,3,3]
; AVX512-NEXT:    vaddss %xmm0, %xmm1, %xmm0
; AVX512-NEXT:    vzeroupper
; AVX512-NEXT:    retq
  %1 = call float @llvm.vector.reduce.fadd.f32.v8f32(float -0.0, <8 x float> %a0)
  ret float %1
}

define float @test_v16f32_zero(<16 x float> %a0) {
; SSE2-LABEL: test_v16f32_zero:
; SSE2:       # %bb.0:
; SSE2-NEXT:    movaps %xmm0, %xmm4
; SSE2-NEXT:    shufps {{.*#+}} xmm4 = xmm4[1,1],xmm0[1,1]
; SSE2-NEXT:    addss %xmm0, %xmm4
; SSE2-NEXT:    movaps %xmm0, %xmm5
; SSE2-NEXT:    unpckhpd {{.*#+}} xmm5 = xmm5[1],xmm0[1]
; SSE2-NEXT:    addss %xmm4, %xmm5
; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[3,3,3,3]
; SSE2-NEXT:    addss %xmm5, %xmm0
; SSE2-NEXT:    addss %xmm1, %xmm0
; SSE2-NEXT:    movaps %xmm1, %xmm4
; SSE2-NEXT:    shufps {{.*#+}} xmm4 = xmm4[1,1],xmm1[1,1]
; SSE2-NEXT:    addss %xmm4, %xmm0
; SSE2-NEXT:    movaps %xmm1, %xmm4
; SSE2-NEXT:    unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm1[1]
; SSE2-NEXT:    addss %xmm4, %xmm0
; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[3,3,3,3]
; SSE2-NEXT:    addss %xmm1, %xmm0
; SSE2-NEXT:    addss %xmm2, %xmm0
; SSE2-NEXT:    movaps %xmm2, %xmm1
; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[1,1],xmm2[1,1]
; SSE2-NEXT:    addss %xmm1, %xmm0
; SSE2-NEXT:    movaps %xmm2, %xmm1
; SSE2-NEXT:    unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm2[1]
; SSE2-NEXT:    addss %xmm1, %xmm0
; SSE2-NEXT:    shufps {{.*#+}} xmm2 = xmm2[3,3,3,3]
; SSE2-NEXT:    addss %xmm2, %xmm0
; SSE2-NEXT:    addss %xmm3, %xmm0
; SSE2-NEXT:    movaps %xmm3, %xmm1
; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[1,1],xmm3[1,1]
; SSE2-NEXT:    addss %xmm1, %xmm0
; SSE2-NEXT:    movaps %xmm3, %xmm1
; SSE2-NEXT:    unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm3[1]
; SSE2-NEXT:    addss %xmm1, %xmm0
; SSE2-NEXT:    shufps {{.*#+}} xmm3 = xmm3[3,3,3,3]
; SSE2-NEXT:    addss %xmm3, %xmm0
; SSE2-NEXT:    retq
;
; SSE41-LABEL: test_v16f32_zero:
; SSE41:       # %bb.0:
; SSE41-NEXT:    movshdup {{.*#+}} xmm4 = xmm0[1,1,3,3]
; SSE41-NEXT:    addss %xmm0, %xmm4
; SSE41-NEXT:    movaps %xmm0, %xmm5
; SSE41-NEXT:    unpckhpd {{.*#+}} xmm5 = xmm5[1],xmm0[1]
; SSE41-NEXT:    addss %xmm4, %xmm5
; SSE41-NEXT:    shufps {{.*#+}} xmm0 = xmm0[3,3,3,3]
; SSE41-NEXT:    addss %xmm5, %xmm0
; SSE41-NEXT:    addss %xmm1, %xmm0
; SSE41-NEXT:    movshdup {{.*#+}} xmm4 = xmm1[1,1,3,3]
; SSE41-NEXT:    addss %xmm4, %xmm0
; SSE41-NEXT:    movaps %xmm1, %xmm4
; SSE41-NEXT:    unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm1[1]
; SSE41-NEXT:    addss %xmm4, %xmm0
; SSE41-NEXT:    shufps {{.*#+}} xmm1 = xmm1[3,3,3,3]
; SSE41-NEXT:    addss %xmm1, %xmm0
; SSE41-NEXT:    addss %xmm2, %xmm0
; SSE41-NEXT:    movshdup {{.*#+}} xmm1 = xmm2[1,1,3,3]
; SSE41-NEXT:    addss %xmm1, %xmm0
; SSE41-NEXT:    movaps %xmm2, %xmm1
; SSE41-NEXT:    unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm2[1]
; SSE41-NEXT:    addss %xmm1, %xmm0
; SSE41-NEXT:    shufps {{.*#+}} xmm2 = xmm2[3,3,3,3]
; SSE41-NEXT:    addss %xmm2, %xmm0
; SSE41-NEXT:    addss %xmm3, %xmm0
; SSE41-NEXT:    movshdup {{.*#+}} xmm1 = xmm3[1,1,3,3]
; SSE41-NEXT:    addss %xmm1, %xmm0
; SSE41-NEXT:    movaps %xmm3, %xmm1
; SSE41-NEXT:    unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm3[1]
; SSE41-NEXT:    addss %xmm1, %xmm0
; SSE41-NEXT:    shufps {{.*#+}} xmm3 = xmm3[3,3,3,3]
; SSE41-NEXT:    addss %xmm3, %xmm0
; SSE41-NEXT:    retq
;
; AVX1-SLOW-LABEL: test_v16f32_zero:
; AVX1-SLOW:       # %bb.0:
; AVX1-SLOW-NEXT:    vmovshdup {{.*#+}} xmm2 = xmm0[1,1,3,3]
; AVX1-SLOW-NEXT:    vaddss %xmm2, %xmm0, %xmm2
; AVX1-SLOW-NEXT:    vpermilpd {{.*#+}} xmm3 = xmm0[1,0]
; AVX1-SLOW-NEXT:    vaddss %xmm3, %xmm2, %xmm2
; AVX1-SLOW-NEXT:    vpermilps {{.*#+}} xmm3 = xmm0[3,3,3,3]
; AVX1-SLOW-NEXT:    vaddss %xmm3, %xmm2, %xmm2
; AVX1-SLOW-NEXT:    vextractf128 $1, %ymm0, %xmm0
; AVX1-SLOW-NEXT:    vaddss %xmm0, %xmm2, %xmm2
; AVX1-SLOW-NEXT:    vmovshdup {{.*#+}} xmm3 = xmm0[1,1,3,3]
; AVX1-SLOW-NEXT:    vaddss %xmm3, %xmm2, %xmm2
; AVX1-SLOW-NEXT:    vpermilpd {{.*#+}} xmm3 = xmm0[1,0]
; AVX1-SLOW-NEXT:    vaddss %xmm3, %xmm2, %xmm2
; AVX1-SLOW-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[3,3,3,3]
; AVX1-SLOW-NEXT:    vaddss %xmm0, %xmm2, %xmm0
; AVX1-SLOW-NEXT:    vaddss %xmm1, %xmm0, %xmm0
; AVX1-SLOW-NEXT:    vmovshdup {{.*#+}} xmm2 = xmm1[1,1,3,3]
; AVX1-SLOW-NEXT:    vaddss %xmm2, %xmm0, %xmm0
; AVX1-SLOW-NEXT:    vpermilpd {{.*#+}} xmm2 = xmm1[1,0]
; AVX1-SLOW-NEXT:    vaddss %xmm2, %xmm0, %xmm0
; AVX1-SLOW-NEXT:    vpermilps {{.*#+}} xmm2 = xmm1[3,3,3,3]
; AVX1-SLOW-NEXT:    vaddss %xmm2, %xmm0, %xmm0
; AVX1-SLOW-NEXT:    vextractf128 $1, %ymm1, %xmm1
; AVX1-SLOW-NEXT:    vaddss %xmm1, %xmm0, %xmm0
; AVX1-SLOW-NEXT:    vmovshdup {{.*#+}} xmm2 = xmm1[1,1,3,3]
; AVX1-SLOW-NEXT:    vaddss %xmm2, %xmm0, %xmm0
; AVX1-SLOW-NEXT:    vpermilpd {{.*#+}} xmm2 = xmm1[1,0]
; AVX1-SLOW-NEXT:    vaddss %xmm2, %xmm0, %xmm0
; AVX1-SLOW-NEXT:    vpermilps {{.*#+}} xmm1 = xmm1[3,3,3,3]
; AVX1-SLOW-NEXT:    vaddss %xmm1, %xmm0, %xmm0
; AVX1-SLOW-NEXT:    vzeroupper
; AVX1-SLOW-NEXT:    retq
;
; AVX1-FAST-LABEL: test_v16f32_zero:
; AVX1-FAST:       # %bb.0:
; AVX1-FAST-NEXT:    vhaddps %xmm0, %xmm0, %xmm2
; AVX1-FAST-NEXT:    vpermilpd {{.*#+}} xmm3 = xmm0[1,0]
; AVX1-FAST-NEXT:    vaddss %xmm3, %xmm2, %xmm2
; AVX1-FAST-NEXT:    vpermilps {{.*#+}} xmm3 = xmm0[3,3,3,3]
; AVX1-FAST-NEXT:    vaddss %xmm3, %xmm2, %xmm2
; AVX1-FAST-NEXT:    vextractf128 $1, %ymm0, %xmm0
; AVX1-FAST-NEXT:    vaddss %xmm0, %xmm2, %xmm2
; AVX1-FAST-NEXT:    vmovshdup {{.*#+}} xmm3 = xmm0[1,1,3,3]
; AVX1-FAST-NEXT:    vaddss %xmm3, %xmm2, %xmm2
; AVX1-FAST-NEXT:    vpermilpd {{.*#+}} xmm3 = xmm0[1,0]
; AVX1-FAST-NEXT:    vaddss %xmm3, %xmm2, %xmm2
; AVX1-FAST-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[3,3,3,3]
; AVX1-FAST-NEXT:    vaddss %xmm0, %xmm2, %xmm0
; AVX1-FAST-NEXT:    vaddss %xmm1, %xmm0, %xmm0
; AVX1-FAST-NEXT:    vmovshdup {{.*#+}} xmm2 = xmm1[1,1,3,3]
; AVX1-FAST-NEXT:    vaddss %xmm2, %xmm0, %xmm0
; AVX1-FAST-NEXT:    vpermilpd {{.*#+}} xmm2 = xmm1[1,0]
; AVX1-FAST-NEXT:    vaddss %xmm2, %xmm0, %xmm0
; AVX1-FAST-NEXT:    vpermilps {{.*#+}} xmm2 = xmm1[3,3,3,3]
; AVX1-FAST-NEXT:    vaddss %xmm2, %xmm0, %xmm0
; AVX1-FAST-NEXT:    vextractf128 $1, %ymm1, %xmm1
; AVX1-FAST-NEXT:    vaddss %xmm1, %xmm0, %xmm0
; AVX1-FAST-NEXT:    vmovshdup {{.*#+}} xmm2 = xmm1[1,1,3,3]
; AVX1-FAST-NEXT:    vaddss %xmm2, %xmm0, %xmm0
; AVX1-FAST-NEXT:    vpermilpd {{.*#+}} xmm2 = xmm1[1,0]
; AVX1-FAST-NEXT:    vaddss %xmm2, %xmm0, %xmm0
; AVX1-FAST-NEXT:    vpermilps {{.*#+}} xmm1 = xmm1[3,3,3,3]
; AVX1-FAST-NEXT:    vaddss %xmm1, %xmm0, %xmm0
; AVX1-FAST-NEXT:    vzeroupper
; AVX1-FAST-NEXT:    retq
;
; AVX2-LABEL: test_v16f32_zero:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vmovshdup {{.*#+}} xmm2 = xmm0[1,1,3,3]
; AVX2-NEXT:    vaddss %xmm2, %xmm0, %xmm2
; AVX2-NEXT:    vpermilpd {{.*#+}} xmm3 = xmm0[1,0]
; AVX2-NEXT:    vaddss %xmm3, %xmm2, %xmm2
; AVX2-NEXT:    vpermilps {{.*#+}} xmm3 = xmm0[3,3,3,3]
; AVX2-NEXT:    vaddss %xmm3, %xmm2, %xmm2
; AVX2-NEXT:    vextractf128 $1, %ymm0, %xmm0
; AVX2-NEXT:    vaddss %xmm0, %xmm2, %xmm2
; AVX2-NEXT:    vmovshdup {{.*#+}} xmm3 = xmm0[1,1,3,3]
; AVX2-NEXT:    vaddss %xmm3, %xmm2, %xmm2
; AVX2-NEXT:    vpermilpd {{.*#+}} xmm3 = xmm0[1,0]
; AVX2-NEXT:    vaddss %xmm3, %xmm2, %xmm2
; AVX2-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[3,3,3,3]
; AVX2-NEXT:    vaddss %xmm0, %xmm2, %xmm0
; AVX2-NEXT:    vaddss %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    vmovshdup {{.*#+}} xmm2 = xmm1[1,1,3,3]
; AVX2-NEXT:    vaddss %xmm2, %xmm0, %xmm0
; AVX2-NEXT:    vpermilpd {{.*#+}} xmm2 = xmm1[1,0]
; AVX2-NEXT:    vaddss %xmm2, %xmm0, %xmm0
; AVX2-NEXT:    vpermilps {{.*#+}} xmm2 = xmm1[3,3,3,3]
; AVX2-NEXT:    vaddss %xmm2, %xmm0, %xmm0
; AVX2-NEXT:    vextractf128 $1, %ymm1, %xmm1
; AVX2-NEXT:    vaddss %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    vmovshdup {{.*#+}} xmm2 = xmm1[1,1,3,3]
; AVX2-NEXT:    vaddss %xmm2, %xmm0, %xmm0
; AVX2-NEXT:    vpermilpd {{.*#+}} xmm2 = xmm1[1,0]
; AVX2-NEXT:    vaddss %xmm2, %xmm0, %xmm0
; AVX2-NEXT:    vpermilps {{.*#+}} xmm1 = xmm1[3,3,3,3]
; AVX2-NEXT:    vaddss %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    vzeroupper
; AVX2-NEXT:    retq
;
; AVX512-LABEL: test_v16f32_zero:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; AVX512-NEXT:    vaddss %xmm1, %xmm0, %xmm1
; AVX512-NEXT:    vpermilpd {{.*#+}} xmm2 = xmm0[1,0]
; AVX512-NEXT:    vaddss %xmm2, %xmm1, %xmm1
; AVX512-NEXT:    vpermilps {{.*#+}} xmm2 = xmm0[3,3,3,3]
; AVX512-NEXT:    vaddss %xmm2, %xmm1, %xmm1
; AVX512-NEXT:    vextractf128 $1, %ymm0, %xmm2
; AVX512-NEXT:    vaddss %xmm2, %xmm1, %xmm1
; AVX512-NEXT:    vmovshdup {{.*#+}} xmm3 = xmm2[1,1,3,3]
; AVX512-NEXT:    vaddss %xmm3, %xmm1, %xmm1
; AVX512-NEXT:    vpermilpd {{.*#+}} xmm3 = xmm2[1,0]
; AVX512-NEXT:    vaddss %xmm3, %xmm1, %xmm1
; AVX512-NEXT:    vpermilps {{.*#+}} xmm2 = xmm2[3,3,3,3]
; AVX512-NEXT:    vaddss %xmm2, %xmm1, %xmm1
; AVX512-NEXT:    vextractf32x4 $2, %zmm0, %xmm2
; AVX512-NEXT:    vaddss %xmm2, %xmm1, %xmm1
; AVX512-NEXT:    vmovshdup {{.*#+}} xmm3 = xmm2[1,1,3,3]
; AVX512-NEXT:    vaddss %xmm3, %xmm1, %xmm1
; AVX512-NEXT:    vpermilpd {{.*#+}} xmm3 = xmm2[1,0]
; AVX512-NEXT:    vaddss %xmm3, %xmm1, %xmm1
; AVX512-NEXT:    vpermilps {{.*#+}} xmm2 = xmm2[3,3,3,3]
; AVX512-NEXT:    vaddss %xmm2, %xmm1, %xmm1
; AVX512-NEXT:    vextractf32x4 $3, %zmm0, %xmm0
; AVX512-NEXT:    vaddss %xmm0, %xmm1, %xmm1
; AVX512-NEXT:    vmovshdup {{.*#+}} xmm2 = xmm0[1,1,3,3]
; AVX512-NEXT:    vaddss %xmm2, %xmm1, %xmm1
; AVX512-NEXT:    vpermilpd {{.*#+}} xmm2 = xmm0[1,0]
; AVX512-NEXT:    vaddss %xmm2, %xmm1, %xmm1
; AVX512-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[3,3,3,3]
; AVX512-NEXT:    vaddss %xmm0, %xmm1, %xmm0
; AVX512-NEXT:    vzeroupper
; AVX512-NEXT:    retq
  %1 = call float @llvm.vector.reduce.fadd.f32.v16f32(float -0.0, <16 x float> %a0)
  ret float %1
}

;
; vXf32 (undef)
;

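; An undef start value lets the backend pick an arbitrary accumulator; it
; shows up as a constant-pool operand folded into the first addss/vaddss.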
define float @test_v2f32_undef(<2 x float> %a0) {
; SSE2-LABEL: test_v2f32_undef:
; SSE2:       # %bb.0:
; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[1,1,1,1]
; SSE2-NEXT:    addss {{.*}}(%rip), %xmm0
; SSE2-NEXT:    retq
;
; SSE41-LABEL: test_v2f32_undef:
; SSE41:       # %bb.0:
; SSE41-NEXT:    movshdup {{.*#+}} xmm0 = xmm0[1,1,3,3]
; SSE41-NEXT:    addss {{.*}}(%rip), %xmm0
; SSE41-NEXT:    retq
;
; AVX-LABEL: test_v2f32_undef:
; AVX:       # %bb.0:
; AVX-NEXT:    vmovshdup {{.*#+}} xmm0 = xmm0[1,1,3,3]
; AVX-NEXT:    vaddss {{.*}}(%rip), %xmm0, %xmm0
; AVX-NEXT:    retq
;
; AVX512-LABEL: test_v2f32_undef:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vmovshdup {{.*#+}} xmm0 = xmm0[1,1,3,3]
; AVX512-NEXT:    vaddss {{.*}}(%rip), %xmm0, %xmm0
; AVX512-NEXT:    retq
  %1 = call float @llvm.vector.reduce.fadd.f32.v2f32(float undef, <2 x float> %a0)
  ret float %1
}

define float @test_v4f32_undef(<4 x float> %a0) {
; SSE2-LABEL: test_v4f32_undef:
; SSE2:       # %bb.0:
; SSE2-NEXT:    movaps %xmm0, %xmm1
; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[1,1],xmm0[1,1]
; SSE2-NEXT:    addss {{.*}}(%rip), %xmm1
; SSE2-NEXT:    movaps %xmm0, %xmm2
; SSE2-NEXT:    unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm0[1]
; SSE2-NEXT:    addss %xmm1, %xmm2
; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[3,3,3,3]
; SSE2-NEXT:    addss %xmm2, %xmm0
; SSE2-NEXT:    retq
;
; SSE41-LABEL: test_v4f32_undef:
; SSE41:       # %bb.0:
; SSE41-NEXT:    movshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; SSE41-NEXT:    addss {{.*}}(%rip), %xmm1
; SSE41-NEXT:    movaps %xmm0, %xmm2
; SSE41-NEXT:    unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm0[1]
; SSE41-NEXT:    addss %xmm1, %xmm2
; SSE41-NEXT:    shufps {{.*#+}} xmm0 = xmm0[3,3,3,3]
; SSE41-NEXT:    addss %xmm2, %xmm0
; SSE41-NEXT:    retq
;
; AVX-LABEL: test_v4f32_undef:
; AVX:       # %bb.0:
; AVX-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; AVX-NEXT:    vaddss {{.*}}(%rip), %xmm1, %xmm1
; AVX-NEXT:    vpermilpd {{.*#+}} xmm2 = xmm0[1,0]
; AVX-NEXT:    vaddss %xmm2, %xmm1, %xmm1
; AVX-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[3,3,3,3]
; AVX-NEXT:    vaddss %xmm0, %xmm1, %xmm0
; AVX-NEXT:    retq
;
; AVX512-LABEL: test_v4f32_undef:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; AVX512-NEXT:    vaddss {{.*}}(%rip), %xmm1, %xmm1
; AVX512-NEXT:    vpermilpd {{.*#+}} xmm2 = xmm0[1,0]
; AVX512-NEXT:    vaddss %xmm2, %xmm1, %xmm1
; AVX512-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[3,3,3,3]
; AVX512-NEXT:    vaddss %xmm0, %xmm1, %xmm0
; AVX512-NEXT:    retq
  %1 = call float @llvm.vector.reduce.fadd.f32.v4f32(float undef, <4 x float> %a0)
  ret float %1
}

define float @test_v8f32_undef(<8 x float> %a0) {
; SSE2-LABEL: test_v8f32_undef:
; SSE2:       # %bb.0:
; SSE2-NEXT:    movaps %xmm0, %xmm2
; SSE2-NEXT:    shufps {{.*#+}} xmm2 = xmm2[1,1],xmm0[1,1]
; SSE2-NEXT:    addss {{.*}}(%rip), %xmm2
; SSE2-NEXT:    movaps %xmm0, %xmm3
; SSE2-NEXT:    unpckhpd {{.*#+}} xmm3 = xmm3[1],xmm0[1]
; SSE2-NEXT:    addss %xmm2, %xmm3
; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[3,3,3,3]
; SSE2-NEXT:    addss %xmm3, %xmm0
; SSE2-NEXT:    addss %xmm1, %xmm0
; SSE2-NEXT:    movaps %xmm1, %xmm2
; SSE2-NEXT:    shufps {{.*#+}} xmm2 = xmm2[1,1],xmm1[1,1]
; SSE2-NEXT:    addss %xmm2, %xmm0
; SSE2-NEXT:    movaps %xmm1, %xmm2
; SSE2-NEXT:    unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm1[1]
; SSE2-NEXT:    addss %xmm2, %xmm0
; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[3,3,3,3]
; SSE2-NEXT:    addss %xmm1, %xmm0
; SSE2-NEXT:    retq
;
; SSE41-LABEL: test_v8f32_undef:
; SSE41:       # %bb.0:
; SSE41-NEXT:    movshdup {{.*#+}} xmm2 = xmm0[1,1,3,3]
; SSE41-NEXT:    addss {{.*}}(%rip), %xmm2
; SSE41-NEXT:    movaps %xmm0, %xmm3
; SSE41-NEXT:    unpckhpd {{.*#+}} xmm3 = xmm3[1],xmm0[1]
; SSE41-NEXT:    addss %xmm2, %xmm3
; SSE41-NEXT:    shufps {{.*#+}} xmm0 = xmm0[3,3,3,3]
; SSE41-NEXT:    addss %xmm3, %xmm0
; SSE41-NEXT:    addss %xmm1, %xmm0
; SSE41-NEXT:    movshdup {{.*#+}} xmm2 = xmm1[1,1,3,3]
; SSE41-NEXT:    addss %xmm2, %xmm0
; SSE41-NEXT:    movaps %xmm1, %xmm2
; SSE41-NEXT:    unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm1[1]
; SSE41-NEXT:    addss %xmm2, %xmm0
; SSE41-NEXT:    shufps {{.*#+}} xmm1 = xmm1[3,3,3,3]
; SSE41-NEXT:    addss %xmm1, %xmm0
; SSE41-NEXT:    retq
;
; AVX-LABEL: test_v8f32_undef:
; AVX:       # %bb.0:
; AVX-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; AVX-NEXT:    vaddss {{.*}}(%rip), %xmm1, %xmm1
; AVX-NEXT:    vpermilpd {{.*#+}} xmm2 = xmm0[1,0]
; AVX-NEXT:    vaddss %xmm2, %xmm1, %xmm1
; AVX-NEXT:    vpermilps {{.*#+}} xmm2 = xmm0[3,3,3,3]
; AVX-NEXT:    vaddss %xmm2, %xmm1, %xmm1
; AVX-NEXT:    vextractf128 $1, %ymm0, %xmm0
; AVX-NEXT:    vaddss %xmm0, %xmm1, %xmm1
; AVX-NEXT:    vmovshdup {{.*#+}} xmm2 = xmm0[1,1,3,3]
; AVX-NEXT:    vaddss %xmm2, %xmm1, %xmm1
; AVX-NEXT:    vpermilpd {{.*#+}} xmm2 = xmm0[1,0]
; AVX-NEXT:    vaddss %xmm2, %xmm1, %xmm1
; AVX-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[3,3,3,3]
; AVX-NEXT:    vaddss %xmm0, %xmm1, %xmm0
; AVX-NEXT:    vzeroupper
; AVX-NEXT:    retq
;
; AVX512-LABEL: test_v8f32_undef:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; AVX512-NEXT:    vaddss {{.*}}(%rip), %xmm1, %xmm1
; AVX512-NEXT:    vpermilpd {{.*#+}} xmm2 = xmm0[1,0]
; AVX512-NEXT:    vaddss %xmm2, %xmm1, %xmm1
; AVX512-NEXT:    vpermilps {{.*#+}} xmm2 = xmm0[3,3,3,3]
; AVX512-NEXT:    vaddss %xmm2, %xmm1, %xmm1
; AVX512-NEXT:    vextractf128 $1, %ymm0, %xmm0
; AVX512-NEXT:    vaddss %xmm0, %xmm1, %xmm1
; AVX512-NEXT:    vmovshdup {{.*#+}} xmm2 = xmm0[1,1,3,3]
; AVX512-NEXT:    vaddss %xmm2, %xmm1, %xmm1
; AVX512-NEXT:    vpermilpd {{.*#+}} xmm2 = xmm0[1,0]
; AVX512-NEXT:    vaddss %xmm2, %xmm1, %xmm1
; AVX512-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[3,3,3,3]
; AVX512-NEXT:    vaddss %xmm0, %xmm1, %xmm0
; AVX512-NEXT:    vzeroupper
; AVX512-NEXT:    retq
  %1 = call float @llvm.vector.reduce.fadd.f32.v8f32(float undef, <8 x float> %a0)
  ret float %1
}

define float @test_v16f32_undef(<16 x float> %a0) {
; SSE2-LABEL: test_v16f32_undef:
; SSE2:       # %bb.0:
; SSE2-NEXT:    movaps %xmm0, %xmm4
; SSE2-NEXT:    shufps {{.*#+}} xmm4 = xmm4[1,1],xmm0[1,1]
; SSE2-NEXT:    addss {{.*}}(%rip), %xmm4
; SSE2-NEXT:    movaps %xmm0, %xmm5
; SSE2-NEXT:    unpckhpd {{.*#+}} xmm5 = xmm5[1],xmm0[1]
; SSE2-NEXT:    addss %xmm4, %xmm5
; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[3,3,3,3]
; SSE2-NEXT:    addss %xmm5, %xmm0
; SSE2-NEXT:    addss %xmm1, %xmm0
; SSE2-NEXT:    movaps %xmm1, %xmm4
; SSE2-NEXT:    shufps {{.*#+}} xmm4 = xmm4[1,1],xmm1[1,1]
; SSE2-NEXT:    addss %xmm4, %xmm0
; SSE2-NEXT:    movaps %xmm1, %xmm4
; SSE2-NEXT:    unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm1[1]
; SSE2-NEXT:    addss %xmm4, %xmm0
; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[3,3,3,3]
; SSE2-NEXT:    addss %xmm1, %xmm0
; SSE2-NEXT:    addss %xmm2, %xmm0
; SSE2-NEXT:    movaps %xmm2, %xmm1
; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[1,1],xmm2[1,1]
; SSE2-NEXT:    addss %xmm1, %xmm0
; SSE2-NEXT:    movaps %xmm2, %xmm1
; SSE2-NEXT:    unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm2[1]
; SSE2-NEXT:    addss %xmm1, %xmm0
; SSE2-NEXT:    shufps {{.*#+}} xmm2 = xmm2[3,3,3,3]
; SSE2-NEXT:    addss %xmm2, %xmm0
; SSE2-NEXT:    addss %xmm3, %xmm0
; SSE2-NEXT:    movaps %xmm3, %xmm1
; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[1,1],xmm3[1,1]
; SSE2-NEXT:    addss %xmm1, %xmm0
; SSE2-NEXT:    movaps %xmm3, %xmm1
; SSE2-NEXT:    unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm3[1]
; SSE2-NEXT:    addss %xmm1, %xmm0
; SSE2-NEXT:    shufps {{.*#+}} xmm3 = xmm3[3,3,3,3]
; SSE2-NEXT:    addss %xmm3, %xmm0
; SSE2-NEXT:    retq
;
; SSE41-LABEL: test_v16f32_undef:
; SSE41:       # %bb.0:
; SSE41-NEXT:    movshdup {{.*#+}} xmm4 = xmm0[1,1,3,3]
; SSE41-NEXT:    addss {{.*}}(%rip), %xmm4
; SSE41-NEXT:    movaps %xmm0, %xmm5
; SSE41-NEXT:    unpckhpd {{.*#+}} xmm5 = xmm5[1],xmm0[1]
; SSE41-NEXT:    addss %xmm4, %xmm5
; SSE41-NEXT:    shufps {{.*#+}} xmm0 = xmm0[3,3,3,3]
; SSE41-NEXT:    addss %xmm5, %xmm0
; SSE41-NEXT:    addss %xmm1, %xmm0
; SSE41-NEXT:    movshdup {{.*#+}} xmm4 = xmm1[1,1,3,3]
; SSE41-NEXT:    addss %xmm4, %xmm0
; SSE41-NEXT:    movaps %xmm1, %xmm4
; SSE41-NEXT:    unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm1[1]
; SSE41-NEXT:    addss %xmm4, %xmm0
; SSE41-NEXT:    shufps {{.*#+}} xmm1 = xmm1[3,3,3,3]
; SSE41-NEXT:    addss %xmm1, %xmm0
; SSE41-NEXT:    addss %xmm2, %xmm0
; SSE41-NEXT:    movshdup {{.*#+}} xmm1 = xmm2[1,1,3,3]
; SSE41-NEXT:    addss %xmm1, %xmm0
; SSE41-NEXT:    movaps %xmm2, %xmm1
; SSE41-NEXT:    unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm2[1]
; SSE41-NEXT:    addss %xmm1, %xmm0
; SSE41-NEXT:    shufps {{.*#+}} xmm2 = xmm2[3,3,3,3]
; SSE41-NEXT:    addss %xmm2, %xmm0
; SSE41-NEXT:    addss %xmm3, %xmm0
; SSE41-NEXT:    movshdup {{.*#+}} xmm1 = xmm3[1,1,3,3]
; SSE41-NEXT:    addss %xmm1, %xmm0
; SSE41-NEXT:    movaps %xmm3, %xmm1
; SSE41-NEXT:    unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm3[1]
; SSE41-NEXT:    addss %xmm1, %xmm0
; SSE41-NEXT:    shufps {{.*#+}} xmm3 = xmm3[3,3,3,3]
; SSE41-NEXT:    addss %xmm3, %xmm0
; SSE41-NEXT:    retq
;
; AVX-LABEL: test_v16f32_undef:
; AVX:       # %bb.0:
; AVX-NEXT:    vmovshdup {{.*#+}} xmm2 = xmm0[1,1,3,3]
; AVX-NEXT:    vaddss {{.*}}(%rip), %xmm2, %xmm2
; AVX-NEXT:    vpermilpd {{.*#+}} xmm3 = xmm0[1,0]
; AVX-NEXT:    vaddss %xmm3, %xmm2, %xmm2
; AVX-NEXT:    vpermilps {{.*#+}} xmm3 = xmm0[3,3,3,3]
; AVX-NEXT:    vaddss %xmm3, %xmm2, %xmm2
; AVX-NEXT:    vextractf128 $1, %ymm0, %xmm0
; AVX-NEXT:    vaddss %xmm0, %xmm2, %xmm2
; AVX-NEXT:    vmovshdup {{.*#+}} xmm3 = xmm0[1,1,3,3]
; AVX-NEXT:    vaddss %xmm3, %xmm2, %xmm2
; AVX-NEXT:    vpermilpd {{.*#+}} xmm3 = xmm0[1,0]
; AVX-NEXT:    vaddss %xmm3, %xmm2, %xmm2
; AVX-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[3,3,3,3]
; AVX-NEXT:    vaddss %xmm0, %xmm2, %xmm0
; AVX-NEXT:    vaddss %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vmovshdup {{.*#+}} xmm2 = xmm1[1,1,3,3]
; AVX-NEXT:    vaddss %xmm2, %xmm0, %xmm0
; AVX-NEXT:    vpermilpd {{.*#+}} xmm2 = xmm1[1,0]
; AVX-NEXT:    vaddss %xmm2, %xmm0, %xmm0
; AVX-NEXT:    vpermilps {{.*#+}} xmm2 = xmm1[3,3,3,3]
; AVX-NEXT:    vaddss %xmm2, %xmm0, %xmm0
; AVX-NEXT:    vextractf128 $1, %ymm1, %xmm1
; AVX-NEXT:    vaddss %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vmovshdup {{.*#+}} xmm2 = xmm1[1,1,3,3]
; AVX-NEXT:    vaddss %xmm2, %xmm0, %xmm0
; AVX-NEXT:    vpermilpd {{.*#+}} xmm2 = xmm1[1,0]
; AVX-NEXT:    vaddss %xmm2, %xmm0, %xmm0
; AVX-NEXT:    vpermilps {{.*#+}} xmm1 = xmm1[3,3,3,3]
; AVX-NEXT:    vaddss %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vzeroupper
; AVX-NEXT:    retq
;
; AVX512-LABEL: test_v16f32_undef:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; AVX512-NEXT:    vaddss {{.*}}(%rip), %xmm1, %xmm1
; AVX512-NEXT:    vpermilpd {{.*#+}} xmm2 = xmm0[1,0]
; AVX512-NEXT:    vaddss %xmm2, %xmm1, %xmm1
; AVX512-NEXT:    vpermilps {{.*#+}} xmm2 = xmm0[3,3,3,3]
; AVX512-NEXT:    vaddss %xmm2, %xmm1, %xmm1
; AVX512-NEXT:    vextractf128 $1, %ymm0, %xmm2
; AVX512-NEXT:    vaddss %xmm2, %xmm1, %xmm1
; AVX512-NEXT:    vmovshdup {{.*#+}} xmm3 = xmm2[1,1,3,3]
; AVX512-NEXT:    vaddss %xmm3, %xmm1, %xmm1
; AVX512-NEXT:    vpermilpd {{.*#+}} xmm3 = xmm2[1,0]
; AVX512-NEXT:    vaddss %xmm3, %xmm1, %xmm1
; AVX512-NEXT:    vpermilps {{.*#+}} xmm2 = xmm2[3,3,3,3]
; AVX512-NEXT:    vaddss %xmm2, %xmm1, %xmm1
; AVX512-NEXT:    vextractf32x4 $2, %zmm0, %xmm2
; AVX512-NEXT:    vaddss %xmm2, %xmm1, %xmm1
; AVX512-NEXT:    vmovshdup {{.*#+}} xmm3 = xmm2[1,1,3,3]
; AVX512-NEXT:    vaddss %xmm3, %xmm1, %xmm1
; AVX512-NEXT:    vpermilpd {{.*#+}} xmm3 = xmm2[1,0]
; AVX512-NEXT:    vaddss %xmm3, %xmm1, %xmm1
; AVX512-NEXT:    vpermilps {{.*#+}} xmm2 = xmm2[3,3,3,3]
; AVX512-NEXT:    vaddss %xmm2, %xmm1, %xmm1
; AVX512-NEXT:    vextractf32x4 $3, %zmm0, %xmm0
; AVX512-NEXT:    vaddss %xmm0, %xmm1, %xmm1
; AVX512-NEXT:    vmovshdup {{.*#+}} xmm2 = xmm0[1,1,3,3]
; AVX512-NEXT:    vaddss %xmm2, %xmm1, %xmm1
; AVX512-NEXT:    vpermilpd {{.*#+}} xmm2 = xmm0[1,0]
; AVX512-NEXT:    vaddss %xmm2, %xmm1, %xmm1
; AVX512-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[3,3,3,3]
; AVX512-NEXT:    vaddss %xmm0, %xmm1, %xmm0
; AVX512-NEXT:    vzeroupper
; AVX512-NEXT:    retq
  %1 = call float @llvm.vector.reduce.fadd.f32.v16f32(float undef, <16 x float> %a0)
  ret float %1
}

;
; vXf64 (accum)
;

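; The f64 reductions follow the same sequential pattern with addsd/vaddsd,
; using unpckhpd/vpermilpd to extract the high element of each 128-bit chunk.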
define double @test_v2f64(double %a0, <2 x double> %a1) {
; SSE-LABEL: test_v2f64:
; SSE:       # %bb.0:
; SSE-NEXT:    addsd %xmm1, %xmm0
; SSE-NEXT:    unpckhpd {{.*#+}} xmm1 = xmm1[1,1]
; SSE-NEXT:    addsd %xmm1, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: test_v2f64:
; AVX:       # %bb.0:
; AVX-NEXT:    vaddsd %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm1[1,0]
; AVX-NEXT:    vaddsd %xmm1, %xmm0, %xmm0
; AVX-NEXT:    retq
;
; AVX512-LABEL: test_v2f64:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vaddsd %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm1[1,0]
; AVX512-NEXT:    vaddsd %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    retq
  %1 = call double @llvm.vector.reduce.fadd.f64.v2f64(double %a0, <2 x double> %a1)
  ret double %1
}

define double @test_v4f64(double %a0, <4 x double> %a1) {
; SSE-LABEL: test_v4f64:
; SSE:       # %bb.0:
; SSE-NEXT:    addsd %xmm1, %xmm0
; SSE-NEXT:    unpckhpd {{.*#+}} xmm1 = xmm1[1,1]
; SSE-NEXT:    addsd %xmm1, %xmm0
; SSE-NEXT:    addsd %xmm2, %xmm0
; SSE-NEXT:    unpckhpd {{.*#+}} xmm2 = xmm2[1,1]
; SSE-NEXT:    addsd %xmm2, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: test_v4f64:
; AVX:       # %bb.0:
; AVX-NEXT:    vaddsd %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vpermilpd {{.*#+}} xmm2 = xmm1[1,0]
; AVX-NEXT:    vaddsd %xmm2, %xmm0, %xmm0
; AVX-NEXT:    vextractf128 $1, %ymm1, %xmm1
; AVX-NEXT:    vaddsd %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm1[1,0]
; AVX-NEXT:    vaddsd %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vzeroupper
; AVX-NEXT:    retq
;
; AVX512-LABEL: test_v4f64:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vaddsd %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    vpermilpd {{.*#+}} xmm2 = xmm1[1,0]
; AVX512-NEXT:    vaddsd %xmm2, %xmm0, %xmm0
; AVX512-NEXT:    vextractf128 $1, %ymm1, %xmm1
; AVX512-NEXT:    vaddsd %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm1[1,0]
; AVX512-NEXT:    vaddsd %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    vzeroupper
; AVX512-NEXT:    retq
  %1 = call double @llvm.vector.reduce.fadd.f64.v4f64(double %a0, <4 x double> %a1)
  ret double %1
}

define double @test_v8f64(double %a0, <8 x double> %a1) {
; SSE-LABEL: test_v8f64:
; SSE:       # %bb.0:
; SSE-NEXT:    addsd %xmm1, %xmm0
; SSE-NEXT:    unpckhpd {{.*#+}} xmm1 = xmm1[1,1]
; SSE-NEXT:    addsd %xmm1, %xmm0
; SSE-NEXT:    addsd %xmm2, %xmm0
; SSE-NEXT:    unpckhpd {{.*#+}} xmm2 = xmm2[1,1]
; SSE-NEXT:    addsd %xmm2, %xmm0
; SSE-NEXT:    addsd %xmm3, %xmm0
; SSE-NEXT:    unpckhpd {{.*#+}} xmm3 = xmm3[1,1]
; SSE-NEXT:    addsd %xmm3, %xmm0
; SSE-NEXT:    addsd %xmm4, %xmm0
; SSE-NEXT:    unpckhpd {{.*#+}} xmm4 = xmm4[1,1]
; SSE-NEXT:    addsd %xmm4, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: test_v8f64:
; AVX:       # %bb.0:
; AVX-NEXT:    vaddsd %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vpermilpd {{.*#+}} xmm3 = xmm1[1,0]
; AVX-NEXT:    vaddsd %xmm3, %xmm0, %xmm0
; AVX-NEXT:    vextractf128 $1, %ymm1, %xmm1
; AVX-NEXT:    vaddsd %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm1[1,0]
; AVX-NEXT:    vaddsd %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vaddsd %xmm2, %xmm0, %xmm0
; AVX-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm2[1,0]
; AVX-NEXT:    vaddsd %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vextractf128 $1, %ymm2, %xmm1
; AVX-NEXT:    vaddsd %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm1[1,0]
; AVX-NEXT:    vaddsd %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vzeroupper
; AVX-NEXT:    retq
;
; AVX512-LABEL: test_v8f64:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vaddsd %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    vpermilpd {{.*#+}} xmm2 = xmm1[1,0]
; AVX512-NEXT:    vaddsd %xmm2, %xmm0, %xmm0
; AVX512-NEXT:    vextractf128 $1, %ymm1, %xmm2
; AVX512-NEXT:    vaddsd %xmm2, %xmm0, %xmm0
; AVX512-NEXT:    vpermilpd {{.*#+}} xmm2 = xmm2[1,0]
; AVX512-NEXT:    vaddsd %xmm2, %xmm0, %xmm0
; AVX512-NEXT:    vextractf32x4 $2, %zmm1, %xmm2
; AVX512-NEXT:    vaddsd %xmm2, %xmm0, %xmm0
; AVX512-NEXT:    vpermilpd {{.*#+}} xmm2 = xmm2[1,0]
; AVX512-NEXT:    vaddsd %xmm2, %xmm0, %xmm0
; AVX512-NEXT:    vextractf32x4 $3, %zmm1, %xmm1
; AVX512-NEXT:    vaddsd %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm1[1,0]
; AVX512-NEXT:    vaddsd %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    vzeroupper
; AVX512-NEXT:    retq
  %1 = call double @llvm.vector.reduce.fadd.f64.v8f64(double %a0, <8 x double> %a1)
  ret double %1
}

define double @test_v16f64(double %a0, <16 x double> %a1) {
; SSE2-LABEL: test_v16f64:
; SSE2:       # %bb.0:
; SSE2-NEXT:    movapd {{[0-9]+}}(%rsp), %xmm8
; SSE2-NEXT:    addsd %xmm1, %xmm0
; SSE2-NEXT:    unpckhpd {{.*#+}} xmm1 = xmm1[1,1]
; SSE2-NEXT:    addsd %xmm1, %xmm0
; SSE2-NEXT:    addsd %xmm2, %xmm0
; SSE2-NEXT:    unpckhpd {{.*#+}} xmm2 = xmm2[1,1]
; SSE2-NEXT:    addsd %xmm2, %xmm0
; SSE2-NEXT:    addsd %xmm3, %xmm0
; SSE2-NEXT:    unpckhpd {{.*#+}} xmm3 = xmm3[1,1]
; SSE2-NEXT:    addsd %xmm3, %xmm0
; SSE2-NEXT:    addsd %xmm4, %xmm0
; SSE2-NEXT:    unpckhpd {{.*#+}} xmm4 = xmm4[1,1]
; SSE2-NEXT:    addsd %xmm4, %xmm0
; SSE2-NEXT:    addsd %xmm5, %xmm0
; SSE2-NEXT:    unpckhpd {{.*#+}} xmm5 = xmm5[1,1]
; SSE2-NEXT:    addsd %xmm5, %xmm0
; SSE2-NEXT:    addsd %xmm6, %xmm0
; SSE2-NEXT:    unpckhpd {{.*#+}} xmm6 = xmm6[1,1]
; SSE2-NEXT:    addsd %xmm6, %xmm0
; SSE2-NEXT:    addsd %xmm7, %xmm0
; SSE2-NEXT:    unpckhpd {{.*#+}} xmm7 = xmm7[1,1]
; SSE2-NEXT:    addsd %xmm7, %xmm0
; SSE2-NEXT:    addsd %xmm8, %xmm0
; SSE2-NEXT:    unpckhpd {{.*#+}} xmm8 = xmm8[1,1]
; SSE2-NEXT:    addsd %xmm8, %xmm0
; SSE2-NEXT:    retq
;
; SSE41-LABEL: test_v16f64:
; SSE41:       # %bb.0:
; SSE41-NEXT:    addsd %xmm1, %xmm0
; SSE41-NEXT:    unpckhpd {{.*#+}} xmm1 = xmm1[1,1]
; SSE41-NEXT:    addsd %xmm1, %xmm0
; SSE41-NEXT:    addsd %xmm2, %xmm0
; SSE41-NEXT:    unpckhpd {{.*#+}} xmm2 = xmm2[1,1]
; SSE41-NEXT:    addsd %xmm2, %xmm0
; SSE41-NEXT:    addsd %xmm3, %xmm0
; SSE41-NEXT:    unpckhpd {{.*#+}} xmm3 = xmm3[1,1]
; SSE41-NEXT:    addsd %xmm3, %xmm0
; SSE41-NEXT:    addsd %xmm4, %xmm0
; SSE41-NEXT:    unpckhpd {{.*#+}} xmm4 = xmm4[1,1]
; SSE41-NEXT:    addsd %xmm4, %xmm0
; SSE41-NEXT:    addsd %xmm5, %xmm0
; SSE41-NEXT:    unpckhpd {{.*#+}} xmm5 = xmm5[1,1]
; SSE41-NEXT:    addsd %xmm5, %xmm0
; SSE41-NEXT:    addsd %xmm6, %xmm0
; SSE41-NEXT:    unpckhpd {{.*#+}} xmm6 = xmm6[1,1]
; SSE41-NEXT:    addsd %xmm6, %xmm0
; SSE41-NEXT:    addsd %xmm7, %xmm0
; SSE41-NEXT:    unpckhpd {{.*#+}} xmm7 = xmm7[1,1]
; SSE41-NEXT:    addsd %xmm7, %xmm0
; SSE41-NEXT:    addsd {{[0-9]+}}(%rsp), %xmm0
; SSE41-NEXT:    addsd {{[0-9]+}}(%rsp), %xmm0
; SSE41-NEXT:    retq
;
; AVX-LABEL: test_v16f64:
; AVX:       # %bb.0:
; AVX-NEXT:    vaddsd %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vpermilpd {{.*#+}} xmm5 = xmm1[1,0]
; AVX-NEXT:    vaddsd %xmm5, %xmm0, %xmm0
; AVX-NEXT:    vextractf128 $1, %ymm1, %xmm1
; AVX-NEXT:    vaddsd %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm1[1,0]
; AVX-NEXT:    vaddsd %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vaddsd %xmm2, %xmm0, %xmm0
; AVX-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm2[1,0]
; AVX-NEXT:    vaddsd %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vextractf128 $1, %ymm2, %xmm1
; AVX-NEXT:    vaddsd %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm1[1,0]
; AVX-NEXT:    vaddsd %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vaddsd %xmm3, %xmm0, %xmm0
; AVX-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm3[1,0]
; AVX-NEXT:    vaddsd %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vextractf128 $1, %ymm3, %xmm1
; AVX-NEXT:    vaddsd %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm1[1,0]
; AVX-NEXT:    vaddsd %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vaddsd %xmm4, %xmm0, %xmm0
; AVX-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm4[1,0]
; AVX-NEXT:    vaddsd %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vextractf128 $1, %ymm4, %xmm1
; AVX-NEXT:    vaddsd %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm1[1,0]
; AVX-NEXT:    vaddsd %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vzeroupper
; AVX-NEXT:    retq
;
; AVX512-LABEL: test_v16f64:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vaddsd %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    vpermilpd {{.*#+}} xmm3 = xmm1[1,0]
; AVX512-NEXT:    vaddsd %xmm3, %xmm0, %xmm0
; AVX512-NEXT:    vextractf128 $1, %ymm1, %xmm3
; AVX512-NEXT:    vaddsd %xmm3, %xmm0, %xmm0
; AVX512-NEXT:    vpermilpd {{.*#+}} xmm3 = xmm3[1,0]
; AVX512-NEXT:    vaddsd %xmm3, %xmm0, %xmm0
; AVX512-NEXT:    vextractf32x4 $2, %zmm1, %xmm3
; AVX512-NEXT:    vaddsd %xmm3, %xmm0, %xmm0
; AVX512-NEXT:    vpermilpd {{.*#+}} xmm3 = xmm3[1,0]
; AVX512-NEXT:    vaddsd %xmm3, %xmm0, %xmm0
; AVX512-NEXT:    vextractf32x4 $3, %zmm1, %xmm1
; AVX512-NEXT:    vaddsd %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm1[1,0]
; AVX512-NEXT:    vaddsd %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    vaddsd %xmm2, %xmm0, %xmm0
; AVX512-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm2[1,0]
; AVX512-NEXT:    vaddsd %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    vextractf128 $1, %ymm2, %xmm1
; AVX512-NEXT:    vaddsd %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm1[1,0]
; AVX512-NEXT:    vaddsd %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    vextractf32x4 $2, %zmm2, %xmm1
; AVX512-NEXT:    vaddsd %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm1[1,0]
; AVX512-NEXT:    vaddsd %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    vextractf32x4 $3, %zmm2, %xmm1
; AVX512-NEXT:    vaddsd %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm1[1,0]
; AVX512-NEXT:    vaddsd %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    vzeroupper
; AVX512-NEXT:    retq
  %1 = call double @llvm.vector.reduce.fadd.f64.v16f64(double %a0, <16 x double> %a1)
  ret double %1
}

1340;
1341; vXf64 (zero)
1342;
1343
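; NOTE: -0.0 is the identity for fadd (-0.0 + x == x for every x, including
; +0.0 and NaN), so the backend can fold the accumulator away entirely; and
; since (-0.0 + x0) + x1 == x0 + x1, the first two lanes may be combined in a
; single step (e.g. the lone vhaddpd under +fast-hops below). With no
; fast-math flags on the call the remaining lanes must still be added in
; strict left-to-right order, hence the scalar addsd chains. As a sketch
; (with %x0/%x1 naming the two lanes), the v2f64 case is equivalent to:
;   %t0 = fadd double -0.0, %x0   ; folds to %x0
;   %t1 = fadd double %t0, %x1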
define double @test_v2f64_zero(<2 x double> %a0) {
; SSE-LABEL: test_v2f64_zero:
; SSE:       # %bb.0:
; SSE-NEXT:    movapd %xmm0, %xmm1
; SSE-NEXT:    unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
; SSE-NEXT:    addsd %xmm0, %xmm1
; SSE-NEXT:    movapd %xmm1, %xmm0
; SSE-NEXT:    retq
;
; AVX1-SLOW-LABEL: test_v2f64_zero:
; AVX1-SLOW:       # %bb.0:
; AVX1-SLOW-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX1-SLOW-NEXT:    vaddsd %xmm1, %xmm0, %xmm0
; AVX1-SLOW-NEXT:    retq
;
; AVX1-FAST-LABEL: test_v2f64_zero:
; AVX1-FAST:       # %bb.0:
; AVX1-FAST-NEXT:    vhaddpd %xmm0, %xmm0, %xmm0
; AVX1-FAST-NEXT:    retq
;
; AVX2-LABEL: test_v2f64_zero:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX2-NEXT:    vaddsd %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    retq
;
; AVX512-LABEL: test_v2f64_zero:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX512-NEXT:    vaddsd %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    retq
  %1 = call double @llvm.vector.reduce.fadd.f64.v2f64(double -0.0, <2 x double> %a0)
  ret double %1
}

define double @test_v4f64_zero(<4 x double> %a0) {
; SSE-LABEL: test_v4f64_zero:
; SSE:       # %bb.0:
; SSE-NEXT:    movapd %xmm0, %xmm2
; SSE-NEXT:    unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm0[1]
; SSE-NEXT:    addsd %xmm0, %xmm2
; SSE-NEXT:    addsd %xmm1, %xmm2
; SSE-NEXT:    unpckhpd {{.*#+}} xmm1 = xmm1[1,1]
; SSE-NEXT:    addsd %xmm1, %xmm2
; SSE-NEXT:    movapd %xmm2, %xmm0
; SSE-NEXT:    retq
;
; AVX1-SLOW-LABEL: test_v4f64_zero:
; AVX1-SLOW:       # %bb.0:
; AVX1-SLOW-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX1-SLOW-NEXT:    vaddsd %xmm1, %xmm0, %xmm1
; AVX1-SLOW-NEXT:    vextractf128 $1, %ymm0, %xmm0
; AVX1-SLOW-NEXT:    vaddsd %xmm0, %xmm1, %xmm1
; AVX1-SLOW-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
; AVX1-SLOW-NEXT:    vaddsd %xmm0, %xmm1, %xmm0
; AVX1-SLOW-NEXT:    vzeroupper
; AVX1-SLOW-NEXT:    retq
;
; AVX1-FAST-LABEL: test_v4f64_zero:
; AVX1-FAST:       # %bb.0:
; AVX1-FAST-NEXT:    vhaddpd %xmm0, %xmm0, %xmm1
; AVX1-FAST-NEXT:    vextractf128 $1, %ymm0, %xmm0
; AVX1-FAST-NEXT:    vaddsd %xmm0, %xmm1, %xmm1
; AVX1-FAST-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
; AVX1-FAST-NEXT:    vaddsd %xmm0, %xmm1, %xmm0
; AVX1-FAST-NEXT:    vzeroupper
; AVX1-FAST-NEXT:    retq
;
; AVX2-LABEL: test_v4f64_zero:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX2-NEXT:    vaddsd %xmm1, %xmm0, %xmm1
; AVX2-NEXT:    vextractf128 $1, %ymm0, %xmm0
; AVX2-NEXT:    vaddsd %xmm0, %xmm1, %xmm1
; AVX2-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
; AVX2-NEXT:    vaddsd %xmm0, %xmm1, %xmm0
; AVX2-NEXT:    vzeroupper
; AVX2-NEXT:    retq
;
; AVX512-LABEL: test_v4f64_zero:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX512-NEXT:    vaddsd %xmm1, %xmm0, %xmm1
; AVX512-NEXT:    vextractf128 $1, %ymm0, %xmm0
; AVX512-NEXT:    vaddsd %xmm0, %xmm1, %xmm1
; AVX512-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
; AVX512-NEXT:    vaddsd %xmm0, %xmm1, %xmm0
; AVX512-NEXT:    vzeroupper
; AVX512-NEXT:    retq
  %1 = call double @llvm.vector.reduce.fadd.f64.v4f64(double -0.0, <4 x double> %a0)
  ret double %1
}

define double @test_v8f64_zero(<8 x double> %a0) {
; SSE-LABEL: test_v8f64_zero:
; SSE:       # %bb.0:
; SSE-NEXT:    movapd %xmm0, %xmm4
; SSE-NEXT:    unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm0[1]
; SSE-NEXT:    addsd %xmm0, %xmm4
; SSE-NEXT:    addsd %xmm1, %xmm4
; SSE-NEXT:    unpckhpd {{.*#+}} xmm1 = xmm1[1,1]
; SSE-NEXT:    addsd %xmm1, %xmm4
; SSE-NEXT:    addsd %xmm2, %xmm4
; SSE-NEXT:    unpckhpd {{.*#+}} xmm2 = xmm2[1,1]
; SSE-NEXT:    addsd %xmm2, %xmm4
; SSE-NEXT:    addsd %xmm3, %xmm4
; SSE-NEXT:    unpckhpd {{.*#+}} xmm3 = xmm3[1,1]
; SSE-NEXT:    addsd %xmm3, %xmm4
; SSE-NEXT:    movapd %xmm4, %xmm0
; SSE-NEXT:    retq
;
; AVX1-SLOW-LABEL: test_v8f64_zero:
; AVX1-SLOW:       # %bb.0:
; AVX1-SLOW-NEXT:    vpermilpd {{.*#+}} xmm2 = xmm0[1,0]
; AVX1-SLOW-NEXT:    vaddsd %xmm2, %xmm0, %xmm2
; AVX1-SLOW-NEXT:    vextractf128 $1, %ymm0, %xmm0
; AVX1-SLOW-NEXT:    vaddsd %xmm0, %xmm2, %xmm2
; AVX1-SLOW-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
; AVX1-SLOW-NEXT:    vaddsd %xmm0, %xmm2, %xmm0
; AVX1-SLOW-NEXT:    vaddsd %xmm1, %xmm0, %xmm0
; AVX1-SLOW-NEXT:    vpermilpd {{.*#+}} xmm2 = xmm1[1,0]
; AVX1-SLOW-NEXT:    vaddsd %xmm2, %xmm0, %xmm0
; AVX1-SLOW-NEXT:    vextractf128 $1, %ymm1, %xmm1
; AVX1-SLOW-NEXT:    vaddsd %xmm1, %xmm0, %xmm0
; AVX1-SLOW-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm1[1,0]
; AVX1-SLOW-NEXT:    vaddsd %xmm1, %xmm0, %xmm0
; AVX1-SLOW-NEXT:    vzeroupper
; AVX1-SLOW-NEXT:    retq
;
; AVX1-FAST-LABEL: test_v8f64_zero:
; AVX1-FAST:       # %bb.0:
; AVX1-FAST-NEXT:    vhaddpd %xmm0, %xmm0, %xmm2
; AVX1-FAST-NEXT:    vextractf128 $1, %ymm0, %xmm0
; AVX1-FAST-NEXT:    vaddsd %xmm0, %xmm2, %xmm2
; AVX1-FAST-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
; AVX1-FAST-NEXT:    vaddsd %xmm0, %xmm2, %xmm0
; AVX1-FAST-NEXT:    vaddsd %xmm1, %xmm0, %xmm0
; AVX1-FAST-NEXT:    vpermilpd {{.*#+}} xmm2 = xmm1[1,0]
; AVX1-FAST-NEXT:    vaddsd %xmm2, %xmm0, %xmm0
; AVX1-FAST-NEXT:    vextractf128 $1, %ymm1, %xmm1
; AVX1-FAST-NEXT:    vaddsd %xmm1, %xmm0, %xmm0
; AVX1-FAST-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm1[1,0]
; AVX1-FAST-NEXT:    vaddsd %xmm1, %xmm0, %xmm0
; AVX1-FAST-NEXT:    vzeroupper
; AVX1-FAST-NEXT:    retq
;
; AVX2-LABEL: test_v8f64_zero:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpermilpd {{.*#+}} xmm2 = xmm0[1,0]
; AVX2-NEXT:    vaddsd %xmm2, %xmm0, %xmm2
; AVX2-NEXT:    vextractf128 $1, %ymm0, %xmm0
; AVX2-NEXT:    vaddsd %xmm0, %xmm2, %xmm2
; AVX2-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
; AVX2-NEXT:    vaddsd %xmm0, %xmm2, %xmm0
; AVX2-NEXT:    vaddsd %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    vpermilpd {{.*#+}} xmm2 = xmm1[1,0]
; AVX2-NEXT:    vaddsd %xmm2, %xmm0, %xmm0
; AVX2-NEXT:    vextractf128 $1, %ymm1, %xmm1
; AVX2-NEXT:    vaddsd %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm1[1,0]
; AVX2-NEXT:    vaddsd %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    vzeroupper
; AVX2-NEXT:    retq
;
; AVX512-LABEL: test_v8f64_zero:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX512-NEXT:    vaddsd %xmm1, %xmm0, %xmm1
; AVX512-NEXT:    vextractf128 $1, %ymm0, %xmm2
; AVX512-NEXT:    vaddsd %xmm2, %xmm1, %xmm1
; AVX512-NEXT:    vpermilpd {{.*#+}} xmm2 = xmm2[1,0]
; AVX512-NEXT:    vaddsd %xmm2, %xmm1, %xmm1
; AVX512-NEXT:    vextractf32x4 $2, %zmm0, %xmm2
; AVX512-NEXT:    vaddsd %xmm2, %xmm1, %xmm1
; AVX512-NEXT:    vpermilpd {{.*#+}} xmm2 = xmm2[1,0]
; AVX512-NEXT:    vaddsd %xmm2, %xmm1, %xmm1
; AVX512-NEXT:    vextractf32x4 $3, %zmm0, %xmm0
; AVX512-NEXT:    vaddsd %xmm0, %xmm1, %xmm1
; AVX512-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
; AVX512-NEXT:    vaddsd %xmm0, %xmm1, %xmm0
; AVX512-NEXT:    vzeroupper
; AVX512-NEXT:    retq
  %1 = call double @llvm.vector.reduce.fadd.f64.v8f64(double -0.0, <8 x double> %a0)
  ret double %1
}

define double @test_v16f64_zero(<16 x double> %a0) {
; SSE-LABEL: test_v16f64_zero:
; SSE:       # %bb.0:
; SSE-NEXT:    movapd %xmm0, %xmm8
; SSE-NEXT:    unpckhpd {{.*#+}} xmm0 = xmm0[1,1]
; SSE-NEXT:    addsd %xmm8, %xmm0
; SSE-NEXT:    addsd %xmm1, %xmm0
; SSE-NEXT:    unpckhpd {{.*#+}} xmm1 = xmm1[1,1]
; SSE-NEXT:    addsd %xmm1, %xmm0
; SSE-NEXT:    addsd %xmm2, %xmm0
; SSE-NEXT:    unpckhpd {{.*#+}} xmm2 = xmm2[1,1]
; SSE-NEXT:    addsd %xmm2, %xmm0
; SSE-NEXT:    addsd %xmm3, %xmm0
; SSE-NEXT:    unpckhpd {{.*#+}} xmm3 = xmm3[1,1]
; SSE-NEXT:    addsd %xmm3, %xmm0
; SSE-NEXT:    addsd %xmm4, %xmm0
; SSE-NEXT:    unpckhpd {{.*#+}} xmm4 = xmm4[1,1]
; SSE-NEXT:    addsd %xmm4, %xmm0
; SSE-NEXT:    addsd %xmm5, %xmm0
; SSE-NEXT:    unpckhpd {{.*#+}} xmm5 = xmm5[1,1]
; SSE-NEXT:    addsd %xmm5, %xmm0
; SSE-NEXT:    addsd %xmm6, %xmm0
; SSE-NEXT:    unpckhpd {{.*#+}} xmm6 = xmm6[1,1]
; SSE-NEXT:    addsd %xmm6, %xmm0
; SSE-NEXT:    addsd %xmm7, %xmm0
; SSE-NEXT:    unpckhpd {{.*#+}} xmm7 = xmm7[1,1]
; SSE-NEXT:    addsd %xmm7, %xmm0
; SSE-NEXT:    retq
;
; AVX1-SLOW-LABEL: test_v16f64_zero:
; AVX1-SLOW:       # %bb.0:
; AVX1-SLOW-NEXT:    vpermilpd {{.*#+}} xmm4 = xmm0[1,0]
; AVX1-SLOW-NEXT:    vaddsd %xmm4, %xmm0, %xmm4
; AVX1-SLOW-NEXT:    vextractf128 $1, %ymm0, %xmm0
; AVX1-SLOW-NEXT:    vaddsd %xmm0, %xmm4, %xmm4
; AVX1-SLOW-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
; AVX1-SLOW-NEXT:    vaddsd %xmm0, %xmm4, %xmm0
; AVX1-SLOW-NEXT:    vaddsd %xmm1, %xmm0, %xmm0
; AVX1-SLOW-NEXT:    vpermilpd {{.*#+}} xmm4 = xmm1[1,0]
; AVX1-SLOW-NEXT:    vaddsd %xmm4, %xmm0, %xmm0
; AVX1-SLOW-NEXT:    vextractf128 $1, %ymm1, %xmm1
; AVX1-SLOW-NEXT:    vaddsd %xmm1, %xmm0, %xmm0
; AVX1-SLOW-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm1[1,0]
; AVX1-SLOW-NEXT:    vaddsd %xmm1, %xmm0, %xmm0
; AVX1-SLOW-NEXT:    vaddsd %xmm2, %xmm0, %xmm0
; AVX1-SLOW-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm2[1,0]
; AVX1-SLOW-NEXT:    vaddsd %xmm1, %xmm0, %xmm0
; AVX1-SLOW-NEXT:    vextractf128 $1, %ymm2, %xmm1
; AVX1-SLOW-NEXT:    vaddsd %xmm1, %xmm0, %xmm0
; AVX1-SLOW-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm1[1,0]
; AVX1-SLOW-NEXT:    vaddsd %xmm1, %xmm0, %xmm0
; AVX1-SLOW-NEXT:    vaddsd %xmm3, %xmm0, %xmm0
; AVX1-SLOW-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm3[1,0]
; AVX1-SLOW-NEXT:    vaddsd %xmm1, %xmm0, %xmm0
; AVX1-SLOW-NEXT:    vextractf128 $1, %ymm3, %xmm1
; AVX1-SLOW-NEXT:    vaddsd %xmm1, %xmm0, %xmm0
; AVX1-SLOW-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm1[1,0]
; AVX1-SLOW-NEXT:    vaddsd %xmm1, %xmm0, %xmm0
; AVX1-SLOW-NEXT:    vzeroupper
; AVX1-SLOW-NEXT:    retq
;
; AVX1-FAST-LABEL: test_v16f64_zero:
; AVX1-FAST:       # %bb.0:
; AVX1-FAST-NEXT:    vhaddpd %xmm0, %xmm0, %xmm4
; AVX1-FAST-NEXT:    vextractf128 $1, %ymm0, %xmm0
; AVX1-FAST-NEXT:    vaddsd %xmm0, %xmm4, %xmm4
; AVX1-FAST-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
; AVX1-FAST-NEXT:    vaddsd %xmm0, %xmm4, %xmm0
; AVX1-FAST-NEXT:    vaddsd %xmm1, %xmm0, %xmm0
; AVX1-FAST-NEXT:    vpermilpd {{.*#+}} xmm4 = xmm1[1,0]
; AVX1-FAST-NEXT:    vaddsd %xmm4, %xmm0, %xmm0
; AVX1-FAST-NEXT:    vextractf128 $1, %ymm1, %xmm1
; AVX1-FAST-NEXT:    vaddsd %xmm1, %xmm0, %xmm0
; AVX1-FAST-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm1[1,0]
; AVX1-FAST-NEXT:    vaddsd %xmm1, %xmm0, %xmm0
; AVX1-FAST-NEXT:    vaddsd %xmm2, %xmm0, %xmm0
; AVX1-FAST-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm2[1,0]
; AVX1-FAST-NEXT:    vaddsd %xmm1, %xmm0, %xmm0
; AVX1-FAST-NEXT:    vextractf128 $1, %ymm2, %xmm1
; AVX1-FAST-NEXT:    vaddsd %xmm1, %xmm0, %xmm0
; AVX1-FAST-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm1[1,0]
; AVX1-FAST-NEXT:    vaddsd %xmm1, %xmm0, %xmm0
; AVX1-FAST-NEXT:    vaddsd %xmm3, %xmm0, %xmm0
; AVX1-FAST-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm3[1,0]
; AVX1-FAST-NEXT:    vaddsd %xmm1, %xmm0, %xmm0
; AVX1-FAST-NEXT:    vextractf128 $1, %ymm3, %xmm1
; AVX1-FAST-NEXT:    vaddsd %xmm1, %xmm0, %xmm0
; AVX1-FAST-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm1[1,0]
; AVX1-FAST-NEXT:    vaddsd %xmm1, %xmm0, %xmm0
; AVX1-FAST-NEXT:    vzeroupper
; AVX1-FAST-NEXT:    retq
;
; AVX2-LABEL: test_v16f64_zero:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpermilpd {{.*#+}} xmm4 = xmm0[1,0]
; AVX2-NEXT:    vaddsd %xmm4, %xmm0, %xmm4
; AVX2-NEXT:    vextractf128 $1, %ymm0, %xmm0
; AVX2-NEXT:    vaddsd %xmm0, %xmm4, %xmm4
; AVX2-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
; AVX2-NEXT:    vaddsd %xmm0, %xmm4, %xmm0
; AVX2-NEXT:    vaddsd %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    vpermilpd {{.*#+}} xmm4 = xmm1[1,0]
; AVX2-NEXT:    vaddsd %xmm4, %xmm0, %xmm0
; AVX2-NEXT:    vextractf128 $1, %ymm1, %xmm1
; AVX2-NEXT:    vaddsd %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm1[1,0]
; AVX2-NEXT:    vaddsd %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    vaddsd %xmm2, %xmm0, %xmm0
; AVX2-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm2[1,0]
; AVX2-NEXT:    vaddsd %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    vextractf128 $1, %ymm2, %xmm1
; AVX2-NEXT:    vaddsd %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm1[1,0]
; AVX2-NEXT:    vaddsd %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    vaddsd %xmm3, %xmm0, %xmm0
; AVX2-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm3[1,0]
; AVX2-NEXT:    vaddsd %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    vextractf128 $1, %ymm3, %xmm1
; AVX2-NEXT:    vaddsd %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm1[1,0]
; AVX2-NEXT:    vaddsd %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    vzeroupper
; AVX2-NEXT:    retq
;
; AVX512-LABEL: test_v16f64_zero:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vpermilpd {{.*#+}} xmm2 = xmm0[1,0]
; AVX512-NEXT:    vaddsd %xmm2, %xmm0, %xmm2
; AVX512-NEXT:    vextractf128 $1, %ymm0, %xmm3
; AVX512-NEXT:    vaddsd %xmm3, %xmm2, %xmm2
; AVX512-NEXT:    vpermilpd {{.*#+}} xmm3 = xmm3[1,0]
; AVX512-NEXT:    vaddsd %xmm3, %xmm2, %xmm2
; AVX512-NEXT:    vextractf32x4 $2, %zmm0, %xmm3
; AVX512-NEXT:    vaddsd %xmm3, %xmm2, %xmm2
; AVX512-NEXT:    vpermilpd {{.*#+}} xmm3 = xmm3[1,0]
; AVX512-NEXT:    vaddsd %xmm3, %xmm2, %xmm2
; AVX512-NEXT:    vextractf32x4 $3, %zmm0, %xmm0
; AVX512-NEXT:    vaddsd %xmm0, %xmm2, %xmm2
; AVX512-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
; AVX512-NEXT:    vaddsd %xmm0, %xmm2, %xmm0
; AVX512-NEXT:    vaddsd %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    vpermilpd {{.*#+}} xmm2 = xmm1[1,0]
; AVX512-NEXT:    vaddsd %xmm2, %xmm0, %xmm0
; AVX512-NEXT:    vextractf128 $1, %ymm1, %xmm2
; AVX512-NEXT:    vaddsd %xmm2, %xmm0, %xmm0
; AVX512-NEXT:    vpermilpd {{.*#+}} xmm2 = xmm2[1,0]
; AVX512-NEXT:    vaddsd %xmm2, %xmm0, %xmm0
; AVX512-NEXT:    vextractf32x4 $2, %zmm1, %xmm2
; AVX512-NEXT:    vaddsd %xmm2, %xmm0, %xmm0
; AVX512-NEXT:    vpermilpd {{.*#+}} xmm2 = xmm2[1,0]
; AVX512-NEXT:    vaddsd %xmm2, %xmm0, %xmm0
; AVX512-NEXT:    vextractf32x4 $3, %zmm1, %xmm1
; AVX512-NEXT:    vaddsd %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm1[1,0]
; AVX512-NEXT:    vaddsd %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    vzeroupper
; AVX512-NEXT:    retq
  %1 = call double @llvm.vector.reduce.fadd.f64.v16f64(double -0.0, <16 x double> %a0)
  ret double %1
}

;
; vXf64 (undef)
;
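; NOTE: with an undef start value the first accumulation step is
; unconstrained, so the backend may materialize it however is convenient; in
; the checks below it becomes a single addsd/vaddsd against a constant-pool
; operand ({{.*}}(%rip)) instead of an add of an incoming scalar. The
; remaining lanes are still reduced in strict order, as in the -0.0 variants
; above.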
define double @test_v2f64_undef(<2 x double> %a0) {
; SSE-LABEL: test_v2f64_undef:
; SSE:       # %bb.0:
; SSE-NEXT:    unpckhpd {{.*#+}} xmm0 = xmm0[1,1]
; SSE-NEXT:    addsd {{.*}}(%rip), %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: test_v2f64_undef:
; AVX:       # %bb.0:
; AVX-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
; AVX-NEXT:    vaddsd {{.*}}(%rip), %xmm0, %xmm0
; AVX-NEXT:    retq
;
; AVX512-LABEL: test_v2f64_undef:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
; AVX512-NEXT:    vaddsd {{.*}}(%rip), %xmm0, %xmm0
; AVX512-NEXT:    retq
  %1 = call double @llvm.vector.reduce.fadd.f64.v2f64(double undef, <2 x double> %a0)
  ret double %1
}

define double @test_v4f64_undef(<4 x double> %a0) {
; SSE-LABEL: test_v4f64_undef:
; SSE:       # %bb.0:
; SSE-NEXT:    unpckhpd {{.*#+}} xmm0 = xmm0[1,1]
; SSE-NEXT:    addsd {{.*}}(%rip), %xmm0
; SSE-NEXT:    addsd %xmm1, %xmm0
; SSE-NEXT:    unpckhpd {{.*#+}} xmm1 = xmm1[1,1]
; SSE-NEXT:    addsd %xmm1, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: test_v4f64_undef:
; AVX:       # %bb.0:
; AVX-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX-NEXT:    vaddsd {{.*}}(%rip), %xmm1, %xmm1
; AVX-NEXT:    vextractf128 $1, %ymm0, %xmm0
; AVX-NEXT:    vaddsd %xmm0, %xmm1, %xmm1
; AVX-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
; AVX-NEXT:    vaddsd %xmm0, %xmm1, %xmm0
; AVX-NEXT:    vzeroupper
; AVX-NEXT:    retq
;
; AVX512-LABEL: test_v4f64_undef:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX512-NEXT:    vaddsd {{.*}}(%rip), %xmm1, %xmm1
; AVX512-NEXT:    vextractf128 $1, %ymm0, %xmm0
; AVX512-NEXT:    vaddsd %xmm0, %xmm1, %xmm1
; AVX512-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
; AVX512-NEXT:    vaddsd %xmm0, %xmm1, %xmm0
; AVX512-NEXT:    vzeroupper
; AVX512-NEXT:    retq
  %1 = call double @llvm.vector.reduce.fadd.f64.v4f64(double undef, <4 x double> %a0)
  ret double %1
}

define double @test_v8f64_undef(<8 x double> %a0) {
; SSE-LABEL: test_v8f64_undef:
; SSE:       # %bb.0:
; SSE-NEXT:    unpckhpd {{.*#+}} xmm0 = xmm0[1,1]
; SSE-NEXT:    addsd {{.*}}(%rip), %xmm0
; SSE-NEXT:    addsd %xmm1, %xmm0
; SSE-NEXT:    unpckhpd {{.*#+}} xmm1 = xmm1[1,1]
; SSE-NEXT:    addsd %xmm1, %xmm0
; SSE-NEXT:    addsd %xmm2, %xmm0
; SSE-NEXT:    unpckhpd {{.*#+}} xmm2 = xmm2[1,1]
; SSE-NEXT:    addsd %xmm2, %xmm0
; SSE-NEXT:    addsd %xmm3, %xmm0
; SSE-NEXT:    unpckhpd {{.*#+}} xmm3 = xmm3[1,1]
; SSE-NEXT:    addsd %xmm3, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: test_v8f64_undef:
; AVX:       # %bb.0:
; AVX-NEXT:    vpermilpd {{.*#+}} xmm2 = xmm0[1,0]
; AVX-NEXT:    vaddsd {{.*}}(%rip), %xmm2, %xmm2
; AVX-NEXT:    vextractf128 $1, %ymm0, %xmm0
; AVX-NEXT:    vaddsd %xmm0, %xmm2, %xmm2
; AVX-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
; AVX-NEXT:    vaddsd %xmm0, %xmm2, %xmm0
; AVX-NEXT:    vaddsd %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vpermilpd {{.*#+}} xmm2 = xmm1[1,0]
; AVX-NEXT:    vaddsd %xmm2, %xmm0, %xmm0
; AVX-NEXT:    vextractf128 $1, %ymm1, %xmm1
; AVX-NEXT:    vaddsd %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm1[1,0]
; AVX-NEXT:    vaddsd %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vzeroupper
; AVX-NEXT:    retq
;
; AVX512-LABEL: test_v8f64_undef:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX512-NEXT:    vaddsd {{.*}}(%rip), %xmm1, %xmm1
; AVX512-NEXT:    vextractf128 $1, %ymm0, %xmm2
; AVX512-NEXT:    vaddsd %xmm2, %xmm1, %xmm1
; AVX512-NEXT:    vpermilpd {{.*#+}} xmm2 = xmm2[1,0]
; AVX512-NEXT:    vaddsd %xmm2, %xmm1, %xmm1
; AVX512-NEXT:    vextractf32x4 $2, %zmm0, %xmm2
; AVX512-NEXT:    vaddsd %xmm2, %xmm1, %xmm1
; AVX512-NEXT:    vpermilpd {{.*#+}} xmm2 = xmm2[1,0]
; AVX512-NEXT:    vaddsd %xmm2, %xmm1, %xmm1
; AVX512-NEXT:    vextractf32x4 $3, %zmm0, %xmm0
; AVX512-NEXT:    vaddsd %xmm0, %xmm1, %xmm1
; AVX512-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
; AVX512-NEXT:    vaddsd %xmm0, %xmm1, %xmm0
; AVX512-NEXT:    vzeroupper
; AVX512-NEXT:    retq
  %1 = call double @llvm.vector.reduce.fadd.f64.v8f64(double undef, <8 x double> %a0)
  ret double %1
}

define double @test_v16f64_undef(<16 x double> %a0) {
; SSE-LABEL: test_v16f64_undef:
; SSE:       # %bb.0:
; SSE-NEXT:    unpckhpd {{.*#+}} xmm0 = xmm0[1,1]
; SSE-NEXT:    addsd {{.*}}(%rip), %xmm0
; SSE-NEXT:    addsd %xmm1, %xmm0
; SSE-NEXT:    unpckhpd {{.*#+}} xmm1 = xmm1[1,1]
; SSE-NEXT:    addsd %xmm1, %xmm0
; SSE-NEXT:    addsd %xmm2, %xmm0
; SSE-NEXT:    unpckhpd {{.*#+}} xmm2 = xmm2[1,1]
; SSE-NEXT:    addsd %xmm2, %xmm0
; SSE-NEXT:    addsd %xmm3, %xmm0
; SSE-NEXT:    unpckhpd {{.*#+}} xmm3 = xmm3[1,1]
; SSE-NEXT:    addsd %xmm3, %xmm0
; SSE-NEXT:    addsd %xmm4, %xmm0
; SSE-NEXT:    unpckhpd {{.*#+}} xmm4 = xmm4[1,1]
; SSE-NEXT:    addsd %xmm4, %xmm0
; SSE-NEXT:    addsd %xmm5, %xmm0
; SSE-NEXT:    unpckhpd {{.*#+}} xmm5 = xmm5[1,1]
; SSE-NEXT:    addsd %xmm5, %xmm0
; SSE-NEXT:    addsd %xmm6, %xmm0
; SSE-NEXT:    unpckhpd {{.*#+}} xmm6 = xmm6[1,1]
; SSE-NEXT:    addsd %xmm6, %xmm0
; SSE-NEXT:    addsd %xmm7, %xmm0
; SSE-NEXT:    unpckhpd {{.*#+}} xmm7 = xmm7[1,1]
; SSE-NEXT:    addsd %xmm7, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: test_v16f64_undef:
; AVX:       # %bb.0:
; AVX-NEXT:    vpermilpd {{.*#+}} xmm4 = xmm0[1,0]
; AVX-NEXT:    vaddsd {{.*}}(%rip), %xmm4, %xmm4
; AVX-NEXT:    vextractf128 $1, %ymm0, %xmm0
; AVX-NEXT:    vaddsd %xmm0, %xmm4, %xmm4
; AVX-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
; AVX-NEXT:    vaddsd %xmm0, %xmm4, %xmm0
; AVX-NEXT:    vaddsd %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vpermilpd {{.*#+}} xmm4 = xmm1[1,0]
; AVX-NEXT:    vaddsd %xmm4, %xmm0, %xmm0
; AVX-NEXT:    vextractf128 $1, %ymm1, %xmm1
; AVX-NEXT:    vaddsd %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm1[1,0]
; AVX-NEXT:    vaddsd %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vaddsd %xmm2, %xmm0, %xmm0
; AVX-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm2[1,0]
; AVX-NEXT:    vaddsd %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vextractf128 $1, %ymm2, %xmm1
; AVX-NEXT:    vaddsd %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm1[1,0]
; AVX-NEXT:    vaddsd %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vaddsd %xmm3, %xmm0, %xmm0
; AVX-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm3[1,0]
; AVX-NEXT:    vaddsd %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vextractf128 $1, %ymm3, %xmm1
; AVX-NEXT:    vaddsd %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm1[1,0]
; AVX-NEXT:    vaddsd %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vzeroupper
; AVX-NEXT:    retq
;
; AVX512-LABEL: test_v16f64_undef:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vpermilpd {{.*#+}} xmm2 = xmm0[1,0]
; AVX512-NEXT:    vaddsd {{.*}}(%rip), %xmm2, %xmm2
; AVX512-NEXT:    vextractf128 $1, %ymm0, %xmm3
; AVX512-NEXT:    vaddsd %xmm3, %xmm2, %xmm2
; AVX512-NEXT:    vpermilpd {{.*#+}} xmm3 = xmm3[1,0]
; AVX512-NEXT:    vaddsd %xmm3, %xmm2, %xmm2
; AVX512-NEXT:    vextractf32x4 $2, %zmm0, %xmm3
; AVX512-NEXT:    vaddsd %xmm3, %xmm2, %xmm2
; AVX512-NEXT:    vpermilpd {{.*#+}} xmm3 = xmm3[1,0]
; AVX512-NEXT:    vaddsd %xmm3, %xmm2, %xmm2
; AVX512-NEXT:    vextractf32x4 $3, %zmm0, %xmm0
; AVX512-NEXT:    vaddsd %xmm0, %xmm2, %xmm2
; AVX512-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
; AVX512-NEXT:    vaddsd %xmm0, %xmm2, %xmm0
; AVX512-NEXT:    vaddsd %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    vpermilpd {{.*#+}} xmm2 = xmm1[1,0]
; AVX512-NEXT:    vaddsd %xmm2, %xmm0, %xmm0
; AVX512-NEXT:    vextractf128 $1, %ymm1, %xmm2
; AVX512-NEXT:    vaddsd %xmm2, %xmm0, %xmm0
; AVX512-NEXT:    vpermilpd {{.*#+}} xmm2 = xmm2[1,0]
; AVX512-NEXT:    vaddsd %xmm2, %xmm0, %xmm0
; AVX512-NEXT:    vextractf32x4 $2, %zmm1, %xmm2
; AVX512-NEXT:    vaddsd %xmm2, %xmm0, %xmm0
; AVX512-NEXT:    vpermilpd {{.*#+}} xmm2 = xmm2[1,0]
; AVX512-NEXT:    vaddsd %xmm2, %xmm0, %xmm0
; AVX512-NEXT:    vextractf32x4 $3, %zmm1, %xmm1
; AVX512-NEXT:    vaddsd %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm1[1,0]
; AVX512-NEXT:    vaddsd %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    vzeroupper
; AVX512-NEXT:    retq
  %1 = call double @llvm.vector.reduce.fadd.f64.v16f64(double undef, <16 x double> %a0)
  ret double %1
}
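; NOTE: none of the calls above carry fast-math flags, so per the LangRef
; llvm.vector.reduce.fadd must lower as an ordered, sequential reduction; a
; call with the 'reassoc' flag set would instead permit a tree-wise (e.g.
; haddpd-based) lowering.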
declare float @llvm.vector.reduce.fadd.f32.v2f32(float, <2 x float>)
declare float @llvm.vector.reduce.fadd.f32.v4f32(float, <4 x float>)
declare float @llvm.vector.reduce.fadd.f32.v8f32(float, <8 x float>)
declare float @llvm.vector.reduce.fadd.f32.v16f32(float, <16 x float>)

declare double @llvm.vector.reduce.fadd.f64.v2f64(double, <2 x double>)
declare double @llvm.vector.reduce.fadd.f64.v4f64(double, <4 x double>)
declare double @llvm.vector.reduce.fadd.f64.v8f64(double, <8 x double>)
declare double @llvm.vector.reduce.fadd.f64.v16f64(double, <16 x double>)