; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefixes=SSE,SSE2
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefixes=SSE,SSE41
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefix=AVX
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=AVX
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512bw | FileCheck %s --check-prefix=AVX512
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512bw,+avx512vl | FileCheck %s --check-prefix=AVX512

; These tests are identical to corresponding tests in the 'nnan' versions
; of the files except that they use 'fast' FMF. If things are working as
; expected, the 'nnan' codegen should be the same as 'fast'.
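;
; For reference, a minimal sketch of the two call forms being compared; the
; 'nnan' variants live in the companion test files, and the value names below
; (%v, %r.nnan, %r.fast) are illustrative only and are not checked by FileCheck:
;
;   %r.nnan = call nnan float @llvm.vector.reduce.fmin.v2f32(<2 x float> %v)
;   %r.fast = call fast float @llvm.vector.reduce.fmin.v2f32(<2 x float> %v)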

;
; vXf32
;

define float @test_v2f32(<2 x float> %a0) {
; SSE2-LABEL: test_v2f32:
; SSE2:       # %bb.0:
; SSE2-NEXT:    movaps %xmm0, %xmm1
; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[1,1],xmm0[1,1]
; SSE2-NEXT:    minss %xmm1, %xmm0
; SSE2-NEXT:    retq
;
; SSE41-LABEL: test_v2f32:
; SSE41:       # %bb.0:
; SSE41-NEXT:    movshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; SSE41-NEXT:    minss %xmm1, %xmm0
; SSE41-NEXT:    retq
;
; AVX-LABEL: test_v2f32:
; AVX:       # %bb.0:
; AVX-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; AVX-NEXT:    vminss %xmm1, %xmm0, %xmm0
; AVX-NEXT:    retq
;
; AVX512-LABEL: test_v2f32:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; AVX512-NEXT:    vminss %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    retq
  %1 = call fast float @llvm.vector.reduce.fmin.v2f32(<2 x float> %a0)
  ret float %1
}

define float @test_v4f32(<4 x float> %a0) {
; SSE2-LABEL: test_v4f32:
; SSE2:       # %bb.0:
; SSE2-NEXT:    movaps %xmm0, %xmm1
; SSE2-NEXT:    unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
; SSE2-NEXT:    maxps %xmm1, %xmm0
; SSE2-NEXT:    movaps %xmm0, %xmm1
; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[1,1],xmm0[1,1]
; SSE2-NEXT:    maxss %xmm1, %xmm0
; SSE2-NEXT:    retq
;
; SSE41-LABEL: test_v4f32:
; SSE41:       # %bb.0:
; SSE41-NEXT:    movaps %xmm0, %xmm1
; SSE41-NEXT:    unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
; SSE41-NEXT:    maxps %xmm1, %xmm0
; SSE41-NEXT:    movshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; SSE41-NEXT:    maxss %xmm1, %xmm0
; SSE41-NEXT:    retq
;
; AVX-LABEL: test_v4f32:
; AVX:       # %bb.0:
; AVX-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX-NEXT:    vmaxps %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; AVX-NEXT:    vmaxss %xmm1, %xmm0, %xmm0
; AVX-NEXT:    retq
;
; AVX512-LABEL: test_v4f32:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX512-NEXT:    vmaxps %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; AVX512-NEXT:    vmaxss %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    retq
  %1 = call fast float @llvm.vector.reduce.fmax.v4f32(<4 x float> %a0)
  ret float %1
}

define float @test_v8f32(<8 x float> %a0) {
; SSE2-LABEL: test_v8f32:
; SSE2:       # %bb.0:
; SSE2-NEXT:    minps %xmm1, %xmm0
; SSE2-NEXT:    movaps %xmm0, %xmm1
; SSE2-NEXT:    unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
; SSE2-NEXT:    minps %xmm1, %xmm0
; SSE2-NEXT:    movaps %xmm0, %xmm1
; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[1,1],xmm0[1,1]
; SSE2-NEXT:    minss %xmm1, %xmm0
; SSE2-NEXT:    retq
;
; SSE41-LABEL: test_v8f32:
; SSE41:       # %bb.0:
; SSE41-NEXT:    minps %xmm1, %xmm0
; SSE41-NEXT:    movaps %xmm0, %xmm1
; SSE41-NEXT:    unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
; SSE41-NEXT:    minps %xmm1, %xmm0
; SSE41-NEXT:    movshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; SSE41-NEXT:    minss %xmm1, %xmm0
; SSE41-NEXT:    retq
;
; AVX-LABEL: test_v8f32:
; AVX:       # %bb.0:
; AVX-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVX-NEXT:    vminps %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX-NEXT:    vminps %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; AVX-NEXT:    vminss %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vzeroupper
; AVX-NEXT:    retq
;
; AVX512-LABEL: test_v8f32:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVX512-NEXT:    vminps %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX512-NEXT:    vminps %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; AVX512-NEXT:    vminss %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    vzeroupper
; AVX512-NEXT:    retq
  %1 = call fast float @llvm.vector.reduce.fmin.v8f32(<8 x float> %a0)
  ret float %1
}

define float @test_v16f32(<16 x float> %a0) {
; SSE2-LABEL: test_v16f32:
; SSE2:       # %bb.0:
; SSE2-NEXT:    maxps %xmm3, %xmm1
; SSE2-NEXT:    maxps %xmm2, %xmm0
; SSE2-NEXT:    maxps %xmm1, %xmm0
; SSE2-NEXT:    movaps %xmm0, %xmm1
; SSE2-NEXT:    unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
; SSE2-NEXT:    maxps %xmm1, %xmm0
; SSE2-NEXT:    movaps %xmm0, %xmm1
; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[1,1],xmm0[1,1]
; SSE2-NEXT:    maxss %xmm1, %xmm0
; SSE2-NEXT:    retq
;
; SSE41-LABEL: test_v16f32:
; SSE41:       # %bb.0:
; SSE41-NEXT:    maxps %xmm3, %xmm1
; SSE41-NEXT:    maxps %xmm2, %xmm0
; SSE41-NEXT:    maxps %xmm1, %xmm0
; SSE41-NEXT:    movaps %xmm0, %xmm1
; SSE41-NEXT:    unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
; SSE41-NEXT:    maxps %xmm1, %xmm0
; SSE41-NEXT:    movshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; SSE41-NEXT:    maxss %xmm1, %xmm0
; SSE41-NEXT:    retq
;
; AVX-LABEL: test_v16f32:
; AVX:       # %bb.0:
; AVX-NEXT:    vmaxps %ymm1, %ymm0, %ymm0
; AVX-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVX-NEXT:    vmaxps %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX-NEXT:    vmaxps %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; AVX-NEXT:    vmaxss %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vzeroupper
; AVX-NEXT:    retq
;
; AVX512-LABEL: test_v16f32:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vextractf64x4 $1, %zmm0, %ymm1
; AVX512-NEXT:    vmaxps %zmm1, %zmm0, %zmm0
; AVX512-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVX512-NEXT:    vmaxps %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX512-NEXT:    vmaxps %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; AVX512-NEXT:    vmaxss %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    vzeroupper
; AVX512-NEXT:    retq
  %1 = call fast float @llvm.vector.reduce.fmax.v16f32(<16 x float> %a0)
  ret float %1
}

;
; vXf64
;

define double @test_v2f64(<2 x double> %a0) {
; SSE-LABEL: test_v2f64:
; SSE:       # %bb.0:
; SSE-NEXT:    movapd %xmm0, %xmm1
; SSE-NEXT:    unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
; SSE-NEXT:    minsd %xmm1, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: test_v2f64:
; AVX:       # %bb.0:
; AVX-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX-NEXT:    vminsd %xmm1, %xmm0, %xmm0
; AVX-NEXT:    retq
;
; AVX512-LABEL: test_v2f64:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX512-NEXT:    vminsd %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    retq
  %1 = call fast double @llvm.vector.reduce.fmin.v2f64(<2 x double> %a0)
  ret double %1
}

define double @test_v4f64(<4 x double> %a0) {
; SSE-LABEL: test_v4f64:
; SSE:       # %bb.0:
; SSE-NEXT:    maxpd %xmm1, %xmm0
; SSE-NEXT:    movapd %xmm0, %xmm1
; SSE-NEXT:    unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
; SSE-NEXT:    maxsd %xmm1, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: test_v4f64:
; AVX:       # %bb.0:
; AVX-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVX-NEXT:    vmaxpd %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX-NEXT:    vmaxsd %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vzeroupper
; AVX-NEXT:    retq
;
; AVX512-LABEL: test_v4f64:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVX512-NEXT:    vmaxpd %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX512-NEXT:    vmaxsd %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    vzeroupper
; AVX512-NEXT:    retq
  %1 = call fast double @llvm.vector.reduce.fmax.v4f64(<4 x double> %a0)
  ret double %1
}

define double @test_v8f64(<8 x double> %a0) {
; SSE-LABEL: test_v8f64:
; SSE:       # %bb.0:
; SSE-NEXT:    minpd %xmm3, %xmm1
; SSE-NEXT:    minpd %xmm2, %xmm0
; SSE-NEXT:    minpd %xmm1, %xmm0
; SSE-NEXT:    movapd %xmm0, %xmm1
; SSE-NEXT:    unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
; SSE-NEXT:    minsd %xmm1, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: test_v8f64:
; AVX:       # %bb.0:
; AVX-NEXT:    vminpd %ymm1, %ymm0, %ymm0
; AVX-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVX-NEXT:    vminpd %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX-NEXT:    vminsd %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vzeroupper
; AVX-NEXT:    retq
;
; AVX512-LABEL: test_v8f64:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vextractf64x4 $1, %zmm0, %ymm1
; AVX512-NEXT:    vminpd %zmm1, %zmm0, %zmm0
; AVX512-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVX512-NEXT:    vminpd %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX512-NEXT:    vminsd %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    vzeroupper
; AVX512-NEXT:    retq
  %1 = call fast double @llvm.vector.reduce.fmin.v8f64(<8 x double> %a0)
  ret double %1
}

define double @test_v16f64(<16 x double> %a0) {
; SSE-LABEL: test_v16f64:
; SSE:       # %bb.0:
; SSE-NEXT:    maxpd %xmm6, %xmm2
; SSE-NEXT:    maxpd %xmm4, %xmm0
; SSE-NEXT:    maxpd %xmm2, %xmm0
; SSE-NEXT:    maxpd %xmm7, %xmm3
; SSE-NEXT:    maxpd %xmm5, %xmm1
; SSE-NEXT:    maxpd %xmm3, %xmm1
; SSE-NEXT:    maxpd %xmm1, %xmm0
; SSE-NEXT:    movapd %xmm0, %xmm1
; SSE-NEXT:    unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
; SSE-NEXT:    maxsd %xmm1, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: test_v16f64:
; AVX:       # %bb.0:
; AVX-NEXT:    vmaxpd %ymm3, %ymm1, %ymm1
; AVX-NEXT:    vmaxpd %ymm2, %ymm0, %ymm0
; AVX-NEXT:    vmaxpd %ymm1, %ymm0, %ymm0
; AVX-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVX-NEXT:    vmaxpd %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX-NEXT:    vmaxsd %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vzeroupper
; AVX-NEXT:    retq
;
; AVX512-LABEL: test_v16f64:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vmaxpd %zmm1, %zmm0, %zmm0
; AVX512-NEXT:    vextractf64x4 $1, %zmm0, %ymm1
; AVX512-NEXT:    vmaxpd %zmm1, %zmm0, %zmm0
; AVX512-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVX512-NEXT:    vmaxpd %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX512-NEXT:    vmaxsd %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    vzeroupper
; AVX512-NEXT:    retq
  %1 = call fast double @llvm.vector.reduce.fmax.v16f64(<16 x double> %a0)
  ret double %1
}

declare float @llvm.vector.reduce.fmin.v2f32(<2 x float>)
declare float @llvm.vector.reduce.fmax.v4f32(<4 x float>)
declare float @llvm.vector.reduce.fmin.v8f32(<8 x float>)
declare float @llvm.vector.reduce.fmax.v16f32(<16 x float>)

declare double @llvm.vector.reduce.fmin.v2f64(<2 x double>)
declare double @llvm.vector.reduce.fmax.v4f64(<4 x double>)
declare double @llvm.vector.reduce.fmin.v8f64(<8 x double>)
declare double @llvm.vector.reduce.fmax.v16f64(<16 x double>)