; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefixes=SSE,SSE2
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefixes=SSE,SSE41
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefix=AVX
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=AVX
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512bw | FileCheck %s --check-prefix=AVX512
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512bw,+avx512vl | FileCheck %s --check-prefix=AVX512

;
; vXf32
;

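; With nnan on the reduction there is no NaN-propagation (cmpunord/blend)
; sequence: the upper element is shuffled down and combined with a single
; maxss. SSE4.1+ can use movshdup instead of the movaps+shufps pair.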
define float @test_v2f32(<2 x float> %a0) {
; SSE2-LABEL: test_v2f32:
; SSE2:       # %bb.0:
; SSE2-NEXT:    movaps %xmm0, %xmm1
; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[1,1],xmm0[1,1]
; SSE2-NEXT:    maxss %xmm1, %xmm0
; SSE2-NEXT:    retq
;
; SSE41-LABEL: test_v2f32:
; SSE41:       # %bb.0:
; SSE41-NEXT:    movshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; SSE41-NEXT:    maxss %xmm1, %xmm0
; SSE41-NEXT:    retq
;
; AVX-LABEL: test_v2f32:
; AVX:       # %bb.0:
; AVX-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; AVX-NEXT:    vmaxss %xmm1, %xmm0, %xmm0
; AVX-NEXT:    retq
;
; AVX512-LABEL: test_v2f32:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; AVX512-NEXT:    vmaxss %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    retq
  %1 = call nnan float @llvm.vector.reduce.fmax.v2f32(<2 x float> %a0)
  ret float %1
}

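; A <4 x float> reduction becomes three shuffles feeding a linear chain of
; scalar maxss ops; a single 128-bit input needs no packed maxps step.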
define float @test_v4f32(<4 x float> %a0) {
; SSE2-LABEL: test_v4f32:
; SSE2:       # %bb.0:
; SSE2-NEXT:    movaps %xmm0, %xmm1
; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[3,3],xmm0[3,3]
; SSE2-NEXT:    movaps %xmm0, %xmm2
; SSE2-NEXT:    unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm0[1]
; SSE2-NEXT:    movaps %xmm0, %xmm3
; SSE2-NEXT:    shufps {{.*#+}} xmm3 = xmm3[1,1],xmm0[1,1]
; SSE2-NEXT:    maxss %xmm3, %xmm0
; SSE2-NEXT:    maxss %xmm2, %xmm0
; SSE2-NEXT:    maxss %xmm1, %xmm0
; SSE2-NEXT:    retq
;
; SSE41-LABEL: test_v4f32:
; SSE41:       # %bb.0:
; SSE41-NEXT:    movaps %xmm0, %xmm1
; SSE41-NEXT:    shufps {{.*#+}} xmm1 = xmm1[3,3],xmm0[3,3]
; SSE41-NEXT:    movaps %xmm0, %xmm2
; SSE41-NEXT:    unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm0[1]
; SSE41-NEXT:    movshdup {{.*#+}} xmm3 = xmm0[1,1,3,3]
; SSE41-NEXT:    maxss %xmm3, %xmm0
; SSE41-NEXT:    maxss %xmm2, %xmm0
; SSE41-NEXT:    maxss %xmm1, %xmm0
; SSE41-NEXT:    retq
;
; AVX-LABEL: test_v4f32:
; AVX:       # %bb.0:
; AVX-NEXT:    vpermilps {{.*#+}} xmm1 = xmm0[3,3,3,3]
; AVX-NEXT:    vpermilpd {{.*#+}} xmm2 = xmm0[1,0]
; AVX-NEXT:    vmovshdup {{.*#+}} xmm3 = xmm0[1,1,3,3]
; AVX-NEXT:    vmaxss %xmm3, %xmm0, %xmm0
; AVX-NEXT:    vmaxss %xmm2, %xmm0, %xmm0
; AVX-NEXT:    vmaxss %xmm1, %xmm0, %xmm0
; AVX-NEXT:    retq
;
; AVX512-LABEL: test_v4f32:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vpermilps {{.*#+}} xmm1 = xmm0[3,3,3,3]
; AVX512-NEXT:    vpermilpd {{.*#+}} xmm2 = xmm0[1,0]
; AVX512-NEXT:    vmovshdup {{.*#+}} xmm3 = xmm0[1,1,3,3]
; AVX512-NEXT:    vmaxss %xmm3, %xmm0, %xmm0
; AVX512-NEXT:    vmaxss %xmm2, %xmm0, %xmm0
; AVX512-NEXT:    vmaxss %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    retq
  %1 = call nnan float @llvm.vector.reduce.fmax.v4f32(<4 x float> %a0)
  ret float %1
}

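; At 256 bits the SSE lowering first folds the two 128-bit halves with one
; packed maxps, then finishes with the scalar maxss chain. The AVX/AVX512
; output here instead extracts the high half and stays fully scalar.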
define float @test_v8f32(<8 x float> %a0) {
; SSE2-LABEL: test_v8f32:
; SSE2:       # %bb.0:
; SSE2-NEXT:    maxps %xmm1, %xmm0
; SSE2-NEXT:    movaps %xmm0, %xmm2
; SSE2-NEXT:    shufps {{.*#+}} xmm2 = xmm2[1,1],xmm0[1,1]
; SSE2-NEXT:    movaps %xmm0, %xmm1
; SSE2-NEXT:    maxss %xmm2, %xmm1
; SSE2-NEXT:    movaps %xmm0, %xmm2
; SSE2-NEXT:    unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm0[1]
; SSE2-NEXT:    maxss %xmm2, %xmm1
; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[3,3,3,3]
; SSE2-NEXT:    maxss %xmm0, %xmm1
; SSE2-NEXT:    movaps %xmm1, %xmm0
; SSE2-NEXT:    retq
;
; SSE41-LABEL: test_v8f32:
; SSE41:       # %bb.0:
; SSE41-NEXT:    maxps %xmm1, %xmm0
; SSE41-NEXT:    movshdup {{.*#+}} xmm2 = xmm0[1,1,3,3]
; SSE41-NEXT:    movaps %xmm0, %xmm1
; SSE41-NEXT:    maxss %xmm2, %xmm1
; SSE41-NEXT:    movaps %xmm0, %xmm2
; SSE41-NEXT:    unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm0[1]
; SSE41-NEXT:    maxss %xmm2, %xmm1
; SSE41-NEXT:    shufps {{.*#+}} xmm0 = xmm0[3,3,3,3]
; SSE41-NEXT:    maxss %xmm0, %xmm1
; SSE41-NEXT:    movaps %xmm1, %xmm0
; SSE41-NEXT:    retq
;
; AVX-LABEL: test_v8f32:
; AVX:       # %bb.0:
; AVX-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVX-NEXT:    vpermilps {{.*#+}} xmm2 = xmm1[3,3,3,3]
; AVX-NEXT:    vpermilpd {{.*#+}} xmm3 = xmm1[1,0]
; AVX-NEXT:    vmovshdup {{.*#+}} xmm4 = xmm1[1,1,3,3]
; AVX-NEXT:    vpermilps {{.*#+}} xmm5 = xmm0[3,3,3,3]
; AVX-NEXT:    vpermilpd {{.*#+}} xmm6 = xmm0[1,0]
; AVX-NEXT:    vmovshdup {{.*#+}} xmm7 = xmm0[1,1,3,3]
; AVX-NEXT:    vmaxss %xmm7, %xmm0, %xmm0
; AVX-NEXT:    vmaxss %xmm6, %xmm0, %xmm0
; AVX-NEXT:    vmaxss %xmm5, %xmm0, %xmm0
; AVX-NEXT:    vmaxss %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vmaxss %xmm4, %xmm0, %xmm0
; AVX-NEXT:    vmaxss %xmm3, %xmm0, %xmm0
; AVX-NEXT:    vmaxss %xmm2, %xmm0, %xmm0
; AVX-NEXT:    vzeroupper
; AVX-NEXT:    retq
;
; AVX512-LABEL: test_v8f32:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVX512-NEXT:    vpermilps {{.*#+}} xmm2 = xmm1[3,3,3,3]
; AVX512-NEXT:    vpermilpd {{.*#+}} xmm3 = xmm1[1,0]
; AVX512-NEXT:    vmovshdup {{.*#+}} xmm4 = xmm1[1,1,3,3]
; AVX512-NEXT:    vpermilps {{.*#+}} xmm5 = xmm0[3,3,3,3]
; AVX512-NEXT:    vpermilpd {{.*#+}} xmm6 = xmm0[1,0]
; AVX512-NEXT:    vmovshdup {{.*#+}} xmm7 = xmm0[1,1,3,3]
; AVX512-NEXT:    vmaxss %xmm7, %xmm0, %xmm0
; AVX512-NEXT:    vmaxss %xmm6, %xmm0, %xmm0
; AVX512-NEXT:    vmaxss %xmm5, %xmm0, %xmm0
; AVX512-NEXT:    vmaxss %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    vmaxss %xmm4, %xmm0, %xmm0
; AVX512-NEXT:    vmaxss %xmm3, %xmm0, %xmm0
; AVX512-NEXT:    vmaxss %xmm2, %xmm0, %xmm0
; AVX512-NEXT:    vzeroupper
; AVX512-NEXT:    retq
  %1 = call nnan float @llvm.vector.reduce.fmax.v8f32(<8 x float> %a0)
  ret float %1
}

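; At 512 bits SSE reduces with a tree of three maxps ops and AVX with one
; 256-bit vmaxps; AVX512 extracts every 128-bit lane up front and chains
; fifteen vmaxss ops.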
define float @test_v16f32(<16 x float> %a0) {
; SSE2-LABEL: test_v16f32:
; SSE2:       # %bb.0:
; SSE2-NEXT:    maxps %xmm3, %xmm1
; SSE2-NEXT:    maxps %xmm2, %xmm0
; SSE2-NEXT:    maxps %xmm1, %xmm0
; SSE2-NEXT:    movaps %xmm0, %xmm2
; SSE2-NEXT:    shufps {{.*#+}} xmm2 = xmm2[1,1],xmm0[1,1]
; SSE2-NEXT:    movaps %xmm0, %xmm1
; SSE2-NEXT:    maxss %xmm2, %xmm1
; SSE2-NEXT:    movaps %xmm0, %xmm2
; SSE2-NEXT:    unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm0[1]
; SSE2-NEXT:    maxss %xmm2, %xmm1
; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[3,3,3,3]
; SSE2-NEXT:    maxss %xmm0, %xmm1
; SSE2-NEXT:    movaps %xmm1, %xmm0
; SSE2-NEXT:    retq
;
; SSE41-LABEL: test_v16f32:
; SSE41:       # %bb.0:
; SSE41-NEXT:    maxps %xmm3, %xmm1
; SSE41-NEXT:    maxps %xmm2, %xmm0
; SSE41-NEXT:    maxps %xmm1, %xmm0
; SSE41-NEXT:    movshdup {{.*#+}} xmm2 = xmm0[1,1,3,3]
; SSE41-NEXT:    movaps %xmm0, %xmm1
; SSE41-NEXT:    maxss %xmm2, %xmm1
; SSE41-NEXT:    movaps %xmm0, %xmm2
; SSE41-NEXT:    unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm0[1]
; SSE41-NEXT:    maxss %xmm2, %xmm1
; SSE41-NEXT:    shufps {{.*#+}} xmm0 = xmm0[3,3,3,3]
; SSE41-NEXT:    maxss %xmm0, %xmm1
; SSE41-NEXT:    movaps %xmm1, %xmm0
; SSE41-NEXT:    retq
;
; AVX-LABEL: test_v16f32:
; AVX:       # %bb.0:
; AVX-NEXT:    vmaxps %ymm1, %ymm0, %ymm0
; AVX-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; AVX-NEXT:    vmaxss %xmm1, %xmm0, %xmm1
; AVX-NEXT:    vpermilpd {{.*#+}} xmm2 = xmm0[1,0]
; AVX-NEXT:    vmaxss %xmm2, %xmm1, %xmm1
; AVX-NEXT:    vpermilps {{.*#+}} xmm2 = xmm0[3,3,3,3]
; AVX-NEXT:    vmaxss %xmm2, %xmm1, %xmm1
; AVX-NEXT:    vextractf128 $1, %ymm0, %xmm0
; AVX-NEXT:    vmaxss %xmm0, %xmm1, %xmm1
; AVX-NEXT:    vmovshdup {{.*#+}} xmm2 = xmm0[1,1,3,3]
; AVX-NEXT:    vmaxss %xmm2, %xmm1, %xmm1
; AVX-NEXT:    vpermilpd {{.*#+}} xmm2 = xmm0[1,0]
; AVX-NEXT:    vmaxss %xmm2, %xmm1, %xmm1
; AVX-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[3,3,3,3]
; AVX-NEXT:    vmaxss %xmm0, %xmm1, %xmm0
; AVX-NEXT:    vzeroupper
; AVX-NEXT:    retq
;
; AVX512-LABEL: test_v16f32:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vextractf32x4 $3, %zmm0, %xmm1
; AVX512-NEXT:    vpermilps {{.*#+}} xmm8 = xmm1[3,3,3,3]
; AVX512-NEXT:    vpermilpd {{.*#+}} xmm9 = xmm1[1,0]
; AVX512-NEXT:    vmovshdup {{.*#+}} xmm10 = xmm1[1,1,3,3]
; AVX512-NEXT:    vextractf32x4 $2, %zmm0, %xmm5
; AVX512-NEXT:    vpermilps {{.*#+}} xmm11 = xmm5[3,3,3,3]
; AVX512-NEXT:    vpermilpd {{.*#+}} xmm12 = xmm5[1,0]
; AVX512-NEXT:    vmovshdup {{.*#+}} xmm13 = xmm5[1,1,3,3]
; AVX512-NEXT:    vextractf128 $1, %ymm0, %xmm3
; AVX512-NEXT:    vpermilps {{.*#+}} xmm14 = xmm3[3,3,3,3]
; AVX512-NEXT:    vpermilpd {{.*#+}} xmm15 = xmm3[1,0]
; AVX512-NEXT:    vmovshdup {{.*#+}} xmm7 = xmm3[1,1,3,3]
; AVX512-NEXT:    vpermilps {{.*#+}} xmm2 = xmm0[3,3,3,3]
; AVX512-NEXT:    vpermilpd {{.*#+}} xmm4 = xmm0[1,0]
; AVX512-NEXT:    vmovshdup {{.*#+}} xmm6 = xmm0[1,1,3,3]
; AVX512-NEXT:    vmaxss %xmm6, %xmm0, %xmm0
; AVX512-NEXT:    vmaxss %xmm4, %xmm0, %xmm0
; AVX512-NEXT:    vmaxss %xmm2, %xmm0, %xmm0
; AVX512-NEXT:    vmaxss %xmm3, %xmm0, %xmm0
; AVX512-NEXT:    vmaxss %xmm7, %xmm0, %xmm0
; AVX512-NEXT:    vmaxss %xmm15, %xmm0, %xmm0
; AVX512-NEXT:    vmaxss %xmm14, %xmm0, %xmm0
; AVX512-NEXT:    vmaxss %xmm5, %xmm0, %xmm0
; AVX512-NEXT:    vmaxss %xmm13, %xmm0, %xmm0
; AVX512-NEXT:    vmaxss %xmm12, %xmm0, %xmm0
; AVX512-NEXT:    vmaxss %xmm11, %xmm0, %xmm0
; AVX512-NEXT:    vmaxss %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    vmaxss %xmm10, %xmm0, %xmm0
; AVX512-NEXT:    vmaxss %xmm9, %xmm0, %xmm0
; AVX512-NEXT:    vmaxss %xmm8, %xmm0, %xmm0
; AVX512-NEXT:    vzeroupper
; AVX512-NEXT:    retq
  %1 = call nnan float @llvm.vector.reduce.fmax.v16f32(<16 x float> %a0)
  ret float %1
}

;
; vXf64
;

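; The <2 x double> case mirrors v2f32: move lane 1 down (unpckhpd or
; vpermilpd) and apply one maxsd.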
define double @test_v2f64(<2 x double> %a0) {
; SSE-LABEL: test_v2f64:
; SSE:       # %bb.0:
; SSE-NEXT:    movapd %xmm0, %xmm1
; SSE-NEXT:    unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
; SSE-NEXT:    maxsd %xmm1, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: test_v2f64:
; AVX:       # %bb.0:
; AVX-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX-NEXT:    vmaxsd %xmm1, %xmm0, %xmm0
; AVX-NEXT:    retq
;
; AVX512-LABEL: test_v2f64:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX512-NEXT:    vmaxsd %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    retq
  %1 = call nnan double @llvm.vector.reduce.fmax.v2f64(<2 x double> %a0)
  ret double %1
}

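; Non-power-of-two element count: SSE packs two elements with unpcklpd and
; blends the third against what appears to be a padding lane from memory
; before the packed maxpd; AVX keeps the value in ymm0 and uses two scalar
; vmaxsd ops.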
define double @test_v3f64(<3 x double> %a0) {
; SSE2-LABEL: test_v3f64:
; SSE2:       # %bb.0:
; SSE2-NEXT:    unpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; SSE2-NEXT:    shufpd {{.*#+}} xmm2 = xmm2[0],mem[1]
; SSE2-NEXT:    maxpd %xmm2, %xmm0
; SSE2-NEXT:    movapd %xmm0, %xmm1
; SSE2-NEXT:    unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
; SSE2-NEXT:    maxsd %xmm1, %xmm0
; SSE2-NEXT:    retq
;
; SSE41-LABEL: test_v3f64:
; SSE41:       # %bb.0:
; SSE41-NEXT:    unpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; SSE41-NEXT:    blendpd {{.*#+}} xmm2 = xmm2[0],mem[1]
; SSE41-NEXT:    maxpd %xmm2, %xmm0
; SSE41-NEXT:    movapd %xmm0, %xmm1
; SSE41-NEXT:    unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
; SSE41-NEXT:    maxsd %xmm1, %xmm0
; SSE41-NEXT:    retq
;
; AVX-LABEL: test_v3f64:
; AVX:       # %bb.0:
; AVX-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX-NEXT:    vmaxsd %xmm1, %xmm0, %xmm1
; AVX-NEXT:    vextractf128 $1, %ymm0, %xmm0
; AVX-NEXT:    vmaxsd %xmm0, %xmm1, %xmm0
; AVX-NEXT:    vzeroupper
; AVX-NEXT:    retq
;
; AVX512-LABEL: test_v3f64:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX512-NEXT:    vmaxsd %xmm1, %xmm0, %xmm1
; AVX512-NEXT:    vextractf128 $1, %ymm0, %xmm0
; AVX512-NEXT:    vmaxsd %xmm0, %xmm1, %xmm0
; AVX512-NEXT:    vzeroupper
; AVX512-NEXT:    retq
  %1 = call nnan double @llvm.vector.reduce.fmax.v3f64(<3 x double> %a0)
  ret double %1
}

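; <4 x double>: SSE folds the halves with one maxpd; AVX/AVX512 extract the
; upper 128 bits and finish with three scalar vmaxsd ops.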
define double @test_v4f64(<4 x double> %a0) {
; SSE-LABEL: test_v4f64:
; SSE:       # %bb.0:
; SSE-NEXT:    maxpd %xmm1, %xmm0
; SSE-NEXT:    movapd %xmm0, %xmm1
; SSE-NEXT:    unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
; SSE-NEXT:    maxsd %xmm1, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: test_v4f64:
; AVX:       # %bb.0:
; AVX-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVX-NEXT:    vpermilpd {{.*#+}} xmm2 = xmm1[1,0]
; AVX-NEXT:    vpermilpd {{.*#+}} xmm3 = xmm0[1,0]
; AVX-NEXT:    vmaxsd %xmm3, %xmm0, %xmm0
; AVX-NEXT:    vmaxsd %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vmaxsd %xmm2, %xmm0, %xmm0
; AVX-NEXT:    vzeroupper
; AVX-NEXT:    retq
;
; AVX512-LABEL: test_v4f64:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVX512-NEXT:    vpermilpd {{.*#+}} xmm2 = xmm1[1,0]
; AVX512-NEXT:    vpermilpd {{.*#+}} xmm3 = xmm0[1,0]
; AVX512-NEXT:    vmaxsd %xmm3, %xmm0, %xmm0
; AVX512-NEXT:    vmaxsd %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    vmaxsd %xmm2, %xmm0, %xmm0
; AVX512-NEXT:    vzeroupper
; AVX512-NEXT:    retq
  %1 = call nnan double @llvm.vector.reduce.fmax.v4f64(<4 x double> %a0)
  ret double %1
}

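; <8 x double>: SSE uses a maxpd tree and AVX a single 256-bit vmaxpd,
; while AVX512 again goes fully scalar after extracting each 128-bit lane.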
define double @test_v8f64(<8 x double> %a0) {
; SSE-LABEL: test_v8f64:
; SSE:       # %bb.0:
; SSE-NEXT:    maxpd %xmm3, %xmm1
; SSE-NEXT:    maxpd %xmm2, %xmm0
; SSE-NEXT:    maxpd %xmm1, %xmm0
; SSE-NEXT:    movapd %xmm0, %xmm1
; SSE-NEXT:    unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
; SSE-NEXT:    maxsd %xmm1, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: test_v8f64:
; AVX:       # %bb.0:
; AVX-NEXT:    vmaxpd %ymm1, %ymm0, %ymm0
; AVX-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX-NEXT:    vmaxsd %xmm1, %xmm0, %xmm1
; AVX-NEXT:    vextractf128 $1, %ymm0, %xmm0
; AVX-NEXT:    vmaxsd %xmm0, %xmm1, %xmm1
; AVX-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
; AVX-NEXT:    vmaxsd %xmm0, %xmm1, %xmm0
; AVX-NEXT:    vzeroupper
; AVX-NEXT:    retq
;
; AVX512-LABEL: test_v8f64:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vextractf32x4 $3, %zmm0, %xmm1
; AVX512-NEXT:    vpermilpd {{.*#+}} xmm2 = xmm1[1,0]
; AVX512-NEXT:    vextractf32x4 $2, %zmm0, %xmm3
; AVX512-NEXT:    vpermilpd {{.*#+}} xmm4 = xmm3[1,0]
; AVX512-NEXT:    vextractf128 $1, %ymm0, %xmm5
; AVX512-NEXT:    vpermilpd {{.*#+}} xmm6 = xmm5[1,0]
; AVX512-NEXT:    vpermilpd {{.*#+}} xmm7 = xmm0[1,0]
; AVX512-NEXT:    vmaxsd %xmm7, %xmm0, %xmm0
; AVX512-NEXT:    vmaxsd %xmm5, %xmm0, %xmm0
; AVX512-NEXT:    vmaxsd %xmm6, %xmm0, %xmm0
; AVX512-NEXT:    vmaxsd %xmm3, %xmm0, %xmm0
; AVX512-NEXT:    vmaxsd %xmm4, %xmm0, %xmm0
; AVX512-NEXT:    vmaxsd %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    vmaxsd %xmm2, %xmm0, %xmm0
; AVX512-NEXT:    vzeroupper
; AVX512-NEXT:    retq
  %1 = call nnan double @llvm.vector.reduce.fmax.v8f64(<8 x double> %a0)
  ret double %1
}

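; <16 x double>: AVX512 folds the two zmm halves with one packed vmaxpd
; before switching to the extract-and-vmaxsd pattern.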
define double @test_v16f64(<16 x double> %a0) {
; SSE-LABEL: test_v16f64:
; SSE:       # %bb.0:
; SSE-NEXT:    maxpd %xmm7, %xmm3
; SSE-NEXT:    maxpd %xmm5, %xmm1
; SSE-NEXT:    maxpd %xmm3, %xmm1
; SSE-NEXT:    maxpd %xmm6, %xmm2
; SSE-NEXT:    maxpd %xmm4, %xmm0
; SSE-NEXT:    maxpd %xmm2, %xmm0
; SSE-NEXT:    maxpd %xmm1, %xmm0
; SSE-NEXT:    movapd %xmm0, %xmm1
; SSE-NEXT:    unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
; SSE-NEXT:    maxsd %xmm1, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: test_v16f64:
; AVX:       # %bb.0:
; AVX-NEXT:    vmaxpd %ymm3, %ymm1, %ymm1
; AVX-NEXT:    vmaxpd %ymm2, %ymm0, %ymm0
; AVX-NEXT:    vmaxpd %ymm1, %ymm0, %ymm0
; AVX-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX-NEXT:    vmaxsd %xmm1, %xmm0, %xmm1
; AVX-NEXT:    vextractf128 $1, %ymm0, %xmm0
; AVX-NEXT:    vmaxsd %xmm0, %xmm1, %xmm1
; AVX-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
; AVX-NEXT:    vmaxsd %xmm0, %xmm1, %xmm0
; AVX-NEXT:    vzeroupper
; AVX-NEXT:    retq
;
; AVX512-LABEL: test_v16f64:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vmaxpd %zmm1, %zmm0, %zmm0
; AVX512-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX512-NEXT:    vmaxsd %xmm1, %xmm0, %xmm1
; AVX512-NEXT:    vextractf128 $1, %ymm0, %xmm2
; AVX512-NEXT:    vmaxsd %xmm2, %xmm1, %xmm1
; AVX512-NEXT:    vpermilpd {{.*#+}} xmm2 = xmm2[1,0]
; AVX512-NEXT:    vmaxsd %xmm2, %xmm1, %xmm1
; AVX512-NEXT:    vextractf32x4 $2, %zmm0, %xmm2
; AVX512-NEXT:    vmaxsd %xmm2, %xmm1, %xmm1
; AVX512-NEXT:    vpermilpd {{.*#+}} xmm2 = xmm2[1,0]
; AVX512-NEXT:    vmaxsd %xmm2, %xmm1, %xmm1
; AVX512-NEXT:    vextractf32x4 $3, %zmm0, %xmm0
; AVX512-NEXT:    vmaxsd %xmm0, %xmm1, %xmm1
; AVX512-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
; AVX512-NEXT:    vmaxsd %xmm0, %xmm1, %xmm0
; AVX512-NEXT:    vzeroupper
; AVX512-NEXT:    retq
  %1 = call nnan double @llvm.vector.reduce.fmax.v16f64(<16 x double> %a0)
  ret double %1
}

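; half is promoted to float: via __gnu_h2f_ieee/__gnu_f2h_ieee libcalls on
; SSE/AVX and via vcvtph2ps/vcvtps2ph on AVX512. The cmpunordss/blend
; sequences below suggest the nnan flag is lost in that promotion, so the
; NaN-aware max pattern is still emitted.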
define half @test_v2f16(<2 x half> %a0) nounwind {
; SSE-LABEL: test_v2f16:
; SSE:       # %bb.0:
; SSE-NEXT:    pushq %rbx
; SSE-NEXT:    subq $16, %rsp
; SSE-NEXT:    movl %edi, %ebx
; SSE-NEXT:    movzwl %si, %edi
; SSE-NEXT:    callq __gnu_h2f_ieee
; SSE-NEXT:    movaps %xmm0, (%rsp) # 16-byte Spill
; SSE-NEXT:    movzwl %bx, %edi
; SSE-NEXT:    callq __gnu_h2f_ieee
; SSE-NEXT:    movaps %xmm0, %xmm1
; SSE-NEXT:    cmpunordss %xmm0, %xmm1
; SSE-NEXT:    movaps %xmm1, %xmm2
; SSE-NEXT:    movaps (%rsp), %xmm3 # 16-byte Reload
; SSE-NEXT:    andps %xmm3, %xmm2
; SSE-NEXT:    maxss %xmm0, %xmm3
; SSE-NEXT:    andnps %xmm3, %xmm1
; SSE-NEXT:    orps %xmm2, %xmm1
; SSE-NEXT:    movaps %xmm1, %xmm0
; SSE-NEXT:    callq __gnu_f2h_ieee
; SSE-NEXT:    addq $16, %rsp
; SSE-NEXT:    popq %rbx
; SSE-NEXT:    retq
;
; AVX-LABEL: test_v2f16:
; AVX:       # %bb.0:
; AVX-NEXT:    pushq %rbx
; AVX-NEXT:    subq $16, %rsp
; AVX-NEXT:    movl %esi, %ebx
; AVX-NEXT:    movzwl %di, %edi
; AVX-NEXT:    callq __gnu_h2f_ieee
; AVX-NEXT:    vmovss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
; AVX-NEXT:    movzwl %bx, %edi
; AVX-NEXT:    callq __gnu_h2f_ieee
; AVX-NEXT:    vmovss {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 4-byte Reload
; AVX-NEXT:    # xmm2 = mem[0],zero,zero,zero
; AVX-NEXT:    vmaxss %xmm2, %xmm0, %xmm1
; AVX-NEXT:    vcmpunordss %xmm2, %xmm2, %xmm2
; AVX-NEXT:    vblendvps %xmm2, %xmm0, %xmm1, %xmm0
; AVX-NEXT:    callq __gnu_f2h_ieee
; AVX-NEXT:    addq $16, %rsp
; AVX-NEXT:    popq %rbx
; AVX-NEXT:    retq
;
; AVX512-LABEL: test_v2f16:
; AVX512:       # %bb.0:
; AVX512-NEXT:    movzwl %di, %eax
; AVX512-NEXT:    vmovd %eax, %xmm0
; AVX512-NEXT:    vcvtph2ps %xmm0, %xmm0
; AVX512-NEXT:    movzwl %si, %eax
; AVX512-NEXT:    vmovd %eax, %xmm1
; AVX512-NEXT:    vcvtph2ps %xmm1, %xmm1
; AVX512-NEXT:    vmaxss %xmm0, %xmm1, %xmm2
; AVX512-NEXT:    vcmpunordss %xmm0, %xmm0, %k1
; AVX512-NEXT:    vmovss %xmm1, %xmm2, %xmm2 {%k1}
; AVX512-NEXT:    vcvtps2ph $4, %xmm2, %xmm0
; AVX512-NEXT:    vmovd %xmm0, %eax
; AVX512-NEXT:    # kill: def $ax killed $ax killed $eax
; AVX512-NEXT:    retq
  %1 = call nnan half @llvm.vector.reduce.fmax.v2f16(<2 x half> %a0)
  ret half %1
}
declare float @llvm.vector.reduce.fmax.v2f32(<2 x float>)
declare float @llvm.vector.reduce.fmax.v4f32(<4 x float>)
declare float @llvm.vector.reduce.fmax.v8f32(<8 x float>)
declare float @llvm.vector.reduce.fmax.v16f32(<16 x float>)

declare double @llvm.vector.reduce.fmax.v2f64(<2 x double>)
declare double @llvm.vector.reduce.fmax.v3f64(<3 x double>)
declare double @llvm.vector.reduce.fmax.v4f64(<4 x double>)
declare double @llvm.vector.reduce.fmax.v8f64(<8 x double>)
declare double @llvm.vector.reduce.fmax.v16f64(<16 x double>)

declare half @llvm.vector.reduce.fmax.v2f16(<2 x half>)