; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse2    | FileCheck %s --check-prefixes=CHECK,SSE,SSE2
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.1  | FileCheck %s --check-prefixes=CHECK,SSE,SSE4
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx     | FileCheck %s --check-prefixes=CHECK,AVX,AVX1
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f | FileCheck %s --check-prefixes=CHECK,AVX,AVX512

declare float @fmaxf(float, float)
declare double @fmax(double, double)
declare x86_fp80 @fmaxl(x86_fp80, x86_fp80)
declare float @llvm.maxnum.f32(float, float)
declare double @llvm.maxnum.f64(double, double)
declare x86_fp80 @llvm.maxnum.f80(x86_fp80, x86_fp80)

declare <2 x float> @llvm.maxnum.v2f32(<2 x float>, <2 x float>)
declare <4 x float> @llvm.maxnum.v4f32(<4 x float>, <4 x float>)
declare <8 x float> @llvm.maxnum.v8f32(<8 x float>, <8 x float>)
declare <16 x float> @llvm.maxnum.v16f32(<16 x float>, <16 x float>)
declare <2 x double> @llvm.maxnum.v2f64(<2 x double>, <2 x double>)
declare <4 x double> @llvm.maxnum.v4f64(<4 x double>, <4 x double>)
declare <8 x double> @llvm.maxnum.v8f64(<8 x double>, <8 x double>)

; FIXME: As the vector tests show, the SSE run shouldn't need this many moves.

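; maxnum(x, y) returns the non-NaN operand when exactly one input is NaN, but
; the x86 max instructions return the second source operand whenever an input
; is NaN. Without nnan, the lowering must therefore compare %x against itself
; for unordered and blend in %y when %x is NaN.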
define float @test_fmaxf(float %x, float %y) {
; SSE-LABEL: test_fmaxf:
; SSE:       # %bb.0:
; SSE-NEXT:    movaps %xmm0, %xmm2
; SSE-NEXT:    cmpunordss %xmm0, %xmm2
; SSE-NEXT:    movaps %xmm2, %xmm3
; SSE-NEXT:    andps %xmm1, %xmm3
; SSE-NEXT:    maxss %xmm0, %xmm1
; SSE-NEXT:    andnps %xmm1, %xmm2
; SSE-NEXT:    orps %xmm3, %xmm2
; SSE-NEXT:    movaps %xmm2, %xmm0
; SSE-NEXT:    retq
;
; AVX1-LABEL: test_fmaxf:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vmaxss %xmm0, %xmm1, %xmm2
; AVX1-NEXT:    vcmpunordss %xmm0, %xmm0, %xmm0
; AVX1-NEXT:    vblendvps %xmm0, %xmm1, %xmm2, %xmm0
; AVX1-NEXT:    retq
;
; AVX512-LABEL: test_fmaxf:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vmaxss %xmm0, %xmm1, %xmm2
; AVX512-NEXT:    vcmpunordss %xmm0, %xmm0, %k1
; AVX512-NEXT:    vmovss %xmm1, %xmm2, %xmm2 {%k1}
; AVX512-NEXT:    vmovaps %xmm2, %xmm0
; AVX512-NEXT:    retq
  %z = call float @fmaxf(float %x, float %y) readnone
  ret float %z
}

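; With minsize, the compare-and-blend expansion is not worth the extra code
; size, so this lowers to a tail call to the fmaxf libcall.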
define float @test_fmaxf_minsize(float %x, float %y) minsize {
; CHECK-LABEL: test_fmaxf_minsize:
; CHECK:       # %bb.0:
; CHECK-NEXT:    jmp fmaxf@PLT # TAILCALL
  %z = call float @fmaxf(float %x, float %y) readnone
  ret float %z
}

; FIXME: As the vector tests show, the SSE run shouldn't need this many moves.

define double @test_fmax(double %x, double %y) {
; SSE-LABEL: test_fmax:
; SSE:       # %bb.0:
; SSE-NEXT:    movapd %xmm0, %xmm2
; SSE-NEXT:    cmpunordsd %xmm0, %xmm2
; SSE-NEXT:    movapd %xmm2, %xmm3
; SSE-NEXT:    andpd %xmm1, %xmm3
; SSE-NEXT:    maxsd %xmm0, %xmm1
; SSE-NEXT:    andnpd %xmm1, %xmm2
; SSE-NEXT:    orpd %xmm3, %xmm2
; SSE-NEXT:    movapd %xmm2, %xmm0
; SSE-NEXT:    retq
;
; AVX1-LABEL: test_fmax:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vmaxsd %xmm0, %xmm1, %xmm2
; AVX1-NEXT:    vcmpunordsd %xmm0, %xmm0, %xmm0
; AVX1-NEXT:    vblendvpd %xmm0, %xmm1, %xmm2, %xmm0
; AVX1-NEXT:    retq
;
; AVX512-LABEL: test_fmax:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vmaxsd %xmm0, %xmm1, %xmm2
; AVX512-NEXT:    vcmpunordsd %xmm0, %xmm0, %k1
; AVX512-NEXT:    vmovsd %xmm1, %xmm2, %xmm2 {%k1}
; AVX512-NEXT:    vmovapd %xmm2, %xmm0
; AVX512-NEXT:    retq
  %z = call double @fmax(double %x, double %y) readnone
  ret double %z
}

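; x86_fp80 is not legal in SSE registers and has no max instruction, so the
; arguments are staged back onto the stack and this remains a call to fmaxl.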
define x86_fp80 @test_fmaxl(x86_fp80 %x, x86_fp80 %y) {
; CHECK-LABEL: test_fmaxl:
; CHECK:       # %bb.0:
; CHECK-NEXT:    subq $40, %rsp
; CHECK-NEXT:    .cfi_def_cfa_offset 48
; CHECK-NEXT:    fldt {{[0-9]+}}(%rsp)
; CHECK-NEXT:    fldt {{[0-9]+}}(%rsp)
; CHECK-NEXT:    fstpt {{[0-9]+}}(%rsp)
; CHECK-NEXT:    fstpt (%rsp)
; CHECK-NEXT:    callq fmaxl
; CHECK-NEXT:    addq $40, %rsp
; CHECK-NEXT:    .cfi_def_cfa_offset 8
; CHECK-NEXT:    retq
  %z = call x86_fp80 @fmaxl(x86_fp80 %x, x86_fp80 %y) readnone
  ret x86_fp80 %z
}

define float @test_intrinsic_fmaxf(float %x, float %y) {
; SSE-LABEL: test_intrinsic_fmaxf:
; SSE:       # %bb.0:
; SSE-NEXT:    movaps %xmm0, %xmm2
; SSE-NEXT:    cmpunordss %xmm0, %xmm2
; SSE-NEXT:    movaps %xmm2, %xmm3
; SSE-NEXT:    andps %xmm1, %xmm3
; SSE-NEXT:    maxss %xmm0, %xmm1
; SSE-NEXT:    andnps %xmm1, %xmm2
; SSE-NEXT:    orps %xmm3, %xmm2
; SSE-NEXT:    movaps %xmm2, %xmm0
; SSE-NEXT:    retq
;
; AVX1-LABEL: test_intrinsic_fmaxf:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vmaxss %xmm0, %xmm1, %xmm2
; AVX1-NEXT:    vcmpunordss %xmm0, %xmm0, %xmm0
; AVX1-NEXT:    vblendvps %xmm0, %xmm1, %xmm2, %xmm0
; AVX1-NEXT:    retq
;
; AVX512-LABEL: test_intrinsic_fmaxf:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vmaxss %xmm0, %xmm1, %xmm2
; AVX512-NEXT:    vcmpunordss %xmm0, %xmm0, %k1
; AVX512-NEXT:    vmovss %xmm1, %xmm2, %xmm2 {%k1}
; AVX512-NEXT:    vmovaps %xmm2, %xmm0
; AVX512-NEXT:    retq
  %z = call float @llvm.maxnum.f32(float %x, float %y) readnone
  ret float %z
}

define double @test_intrinsic_fmax(double %x, double %y) {
; SSE-LABEL: test_intrinsic_fmax:
; SSE:       # %bb.0:
; SSE-NEXT:    movapd %xmm0, %xmm2
; SSE-NEXT:    cmpunordsd %xmm0, %xmm2
; SSE-NEXT:    movapd %xmm2, %xmm3
; SSE-NEXT:    andpd %xmm1, %xmm3
; SSE-NEXT:    maxsd %xmm0, %xmm1
; SSE-NEXT:    andnpd %xmm1, %xmm2
; SSE-NEXT:    orpd %xmm3, %xmm2
; SSE-NEXT:    movapd %xmm2, %xmm0
; SSE-NEXT:    retq
;
; AVX1-LABEL: test_intrinsic_fmax:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vmaxsd %xmm0, %xmm1, %xmm2
; AVX1-NEXT:    vcmpunordsd %xmm0, %xmm0, %xmm0
; AVX1-NEXT:    vblendvpd %xmm0, %xmm1, %xmm2, %xmm0
; AVX1-NEXT:    retq
;
; AVX512-LABEL: test_intrinsic_fmax:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vmaxsd %xmm0, %xmm1, %xmm2
; AVX512-NEXT:    vcmpunordsd %xmm0, %xmm0, %k1
; AVX512-NEXT:    vmovsd %xmm1, %xmm2, %xmm2 {%k1}
; AVX512-NEXT:    vmovapd %xmm2, %xmm0
; AVX512-NEXT:    retq
  %z = call double @llvm.maxnum.f64(double %x, double %y) readnone
  ret double %z
}

define x86_fp80 @test_intrinsic_fmaxl(x86_fp80 %x, x86_fp80 %y) {
; CHECK-LABEL: test_intrinsic_fmaxl:
; CHECK:       # %bb.0:
; CHECK-NEXT:    subq $40, %rsp
; CHECK-NEXT:    .cfi_def_cfa_offset 48
; CHECK-NEXT:    fldt {{[0-9]+}}(%rsp)
; CHECK-NEXT:    fldt {{[0-9]+}}(%rsp)
; CHECK-NEXT:    fstpt {{[0-9]+}}(%rsp)
; CHECK-NEXT:    fstpt (%rsp)
; CHECK-NEXT:    callq fmaxl
; CHECK-NEXT:    addq $40, %rsp
; CHECK-NEXT:    .cfi_def_cfa_offset 8
; CHECK-NEXT:    retq
  %z = call x86_fp80 @llvm.maxnum.f80(x86_fp80 %x, x86_fp80 %y) readnone
  ret x86_fp80 %z
}

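; The vector lowerings below use the same unordered-compare + blend idiom, but
; the packed forms avoid the extra register copies noted in the scalar FIXMEs.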
define <2 x float> @test_intrinsic_fmax_v2f32(<2 x float> %x, <2 x float> %y) {
; SSE2-LABEL: test_intrinsic_fmax_v2f32:
; SSE2:       # %bb.0:
; SSE2-NEXT:    movaps %xmm1, %xmm2
; SSE2-NEXT:    maxps %xmm0, %xmm2
; SSE2-NEXT:    cmpunordps %xmm0, %xmm0
; SSE2-NEXT:    andps %xmm0, %xmm1
; SSE2-NEXT:    andnps %xmm2, %xmm0
; SSE2-NEXT:    orps %xmm1, %xmm0
; SSE2-NEXT:    retq
;
; SSE4-LABEL: test_intrinsic_fmax_v2f32:
; SSE4:       # %bb.0:
; SSE4-NEXT:    movaps %xmm1, %xmm2
; SSE4-NEXT:    maxps %xmm0, %xmm2
; SSE4-NEXT:    cmpunordps %xmm0, %xmm0
; SSE4-NEXT:    blendvps %xmm0, %xmm1, %xmm2
; SSE4-NEXT:    movaps %xmm2, %xmm0
; SSE4-NEXT:    retq
;
; AVX-LABEL: test_intrinsic_fmax_v2f32:
; AVX:       # %bb.0:
; AVX-NEXT:    vmaxps %xmm0, %xmm1, %xmm2
; AVX-NEXT:    vcmpunordps %xmm0, %xmm0, %xmm0
; AVX-NEXT:    vblendvps %xmm0, %xmm1, %xmm2, %xmm0
; AVX-NEXT:    retq
  %z = call <2 x float> @llvm.maxnum.v2f32(<2 x float> %x, <2 x float> %y) readnone
  ret <2 x float> %z
}

define <4 x float> @test_intrinsic_fmax_v4f32(<4 x float> %x, <4 x float> %y) {
; SSE2-LABEL: test_intrinsic_fmax_v4f32:
; SSE2:       # %bb.0:
; SSE2-NEXT:    movaps %xmm1, %xmm2
; SSE2-NEXT:    maxps %xmm0, %xmm2
; SSE2-NEXT:    cmpunordps %xmm0, %xmm0
; SSE2-NEXT:    andps %xmm0, %xmm1
; SSE2-NEXT:    andnps %xmm2, %xmm0
; SSE2-NEXT:    orps %xmm1, %xmm0
; SSE2-NEXT:    retq
;
; SSE4-LABEL: test_intrinsic_fmax_v4f32:
; SSE4:       # %bb.0:
; SSE4-NEXT:    movaps %xmm1, %xmm2
; SSE4-NEXT:    maxps %xmm0, %xmm2
; SSE4-NEXT:    cmpunordps %xmm0, %xmm0
; SSE4-NEXT:    blendvps %xmm0, %xmm1, %xmm2
; SSE4-NEXT:    movaps %xmm2, %xmm0
; SSE4-NEXT:    retq
;
; AVX-LABEL: test_intrinsic_fmax_v4f32:
; AVX:       # %bb.0:
; AVX-NEXT:    vmaxps %xmm0, %xmm1, %xmm2
; AVX-NEXT:    vcmpunordps %xmm0, %xmm0, %xmm0
; AVX-NEXT:    vblendvps %xmm0, %xmm1, %xmm2, %xmm0
; AVX-NEXT:    retq
  %z = call <4 x float> @llvm.maxnum.v4f32(<4 x float> %x, <4 x float> %y) readnone
  ret <4 x float> %z
}

define <8 x float> @test_intrinsic_fmax_v8f32(<8 x float> %x, <8 x float> %y) {
; SSE2-LABEL: test_intrinsic_fmax_v8f32:
; SSE2:       # %bb.0:
; SSE2-NEXT:    movaps %xmm2, %xmm4
; SSE2-NEXT:    maxps %xmm0, %xmm4
; SSE2-NEXT:    cmpunordps %xmm0, %xmm0
; SSE2-NEXT:    andps %xmm0, %xmm2
; SSE2-NEXT:    andnps %xmm4, %xmm0
; SSE2-NEXT:    orps %xmm2, %xmm0
; SSE2-NEXT:    movaps %xmm3, %xmm2
; SSE2-NEXT:    maxps %xmm1, %xmm2
; SSE2-NEXT:    cmpunordps %xmm1, %xmm1
; SSE2-NEXT:    andps %xmm1, %xmm3
; SSE2-NEXT:    andnps %xmm2, %xmm1
; SSE2-NEXT:    orps %xmm3, %xmm1
; SSE2-NEXT:    retq
;
; SSE4-LABEL: test_intrinsic_fmax_v8f32:
; SSE4:       # %bb.0:
; SSE4-NEXT:    movaps %xmm1, %xmm5
; SSE4-NEXT:    movaps %xmm2, %xmm4
; SSE4-NEXT:    maxps %xmm0, %xmm4
; SSE4-NEXT:    cmpunordps %xmm0, %xmm0
; SSE4-NEXT:    blendvps %xmm0, %xmm2, %xmm4
; SSE4-NEXT:    movaps %xmm3, %xmm1
; SSE4-NEXT:    maxps %xmm5, %xmm1
; SSE4-NEXT:    cmpunordps %xmm5, %xmm5
; SSE4-NEXT:    movaps %xmm5, %xmm0
; SSE4-NEXT:    blendvps %xmm0, %xmm3, %xmm1
; SSE4-NEXT:    movaps %xmm4, %xmm0
; SSE4-NEXT:    retq
;
; AVX-LABEL: test_intrinsic_fmax_v8f32:
; AVX:       # %bb.0:
; AVX-NEXT:    vmaxps %ymm0, %ymm1, %ymm2
; AVX-NEXT:    vcmpunordps %ymm0, %ymm0, %ymm0
; AVX-NEXT:    vblendvps %ymm0, %ymm1, %ymm2, %ymm0
; AVX-NEXT:    retq
  %z = call <8 x float> @llvm.maxnum.v8f32(<8 x float> %x, <8 x float> %y) readnone
  ret <8 x float> %z
}

define <16 x float> @test_intrinsic_fmax_v16f32(<16 x float> %x, <16 x float> %y) {
; SSE2-LABEL: test_intrinsic_fmax_v16f32:
; SSE2:       # %bb.0:
; SSE2-NEXT:    movaps %xmm4, %xmm8
; SSE2-NEXT:    maxps %xmm0, %xmm8
; SSE2-NEXT:    cmpunordps %xmm0, %xmm0
; SSE2-NEXT:    andps %xmm0, %xmm4
; SSE2-NEXT:    andnps %xmm8, %xmm0
; SSE2-NEXT:    orps %xmm4, %xmm0
; SSE2-NEXT:    movaps %xmm5, %xmm4
; SSE2-NEXT:    maxps %xmm1, %xmm4
; SSE2-NEXT:    cmpunordps %xmm1, %xmm1
; SSE2-NEXT:    andps %xmm1, %xmm5
; SSE2-NEXT:    andnps %xmm4, %xmm1
; SSE2-NEXT:    orps %xmm5, %xmm1
; SSE2-NEXT:    movaps %xmm6, %xmm4
; SSE2-NEXT:    maxps %xmm2, %xmm4
; SSE2-NEXT:    cmpunordps %xmm2, %xmm2
; SSE2-NEXT:    andps %xmm2, %xmm6
; SSE2-NEXT:    andnps %xmm4, %xmm2
; SSE2-NEXT:    orps %xmm6, %xmm2
; SSE2-NEXT:    movaps %xmm7, %xmm4
; SSE2-NEXT:    maxps %xmm3, %xmm4
; SSE2-NEXT:    cmpunordps %xmm3, %xmm3
; SSE2-NEXT:    andps %xmm3, %xmm7
; SSE2-NEXT:    andnps %xmm4, %xmm3
; SSE2-NEXT:    orps %xmm7, %xmm3
; SSE2-NEXT:    retq
;
; SSE4-LABEL: test_intrinsic_fmax_v16f32:
; SSE4:       # %bb.0:
; SSE4-NEXT:    movaps %xmm3, %xmm8
; SSE4-NEXT:    movaps %xmm2, %xmm9
; SSE4-NEXT:    movaps %xmm1, %xmm2
; SSE4-NEXT:    movaps %xmm4, %xmm10
; SSE4-NEXT:    maxps %xmm0, %xmm10
; SSE4-NEXT:    cmpunordps %xmm0, %xmm0
; SSE4-NEXT:    blendvps %xmm0, %xmm4, %xmm10
; SSE4-NEXT:    movaps %xmm5, %xmm1
; SSE4-NEXT:    maxps %xmm2, %xmm1
; SSE4-NEXT:    cmpunordps %xmm2, %xmm2
; SSE4-NEXT:    movaps %xmm2, %xmm0
; SSE4-NEXT:    blendvps %xmm0, %xmm5, %xmm1
; SSE4-NEXT:    movaps %xmm6, %xmm2
; SSE4-NEXT:    maxps %xmm9, %xmm2
; SSE4-NEXT:    cmpunordps %xmm9, %xmm9
; SSE4-NEXT:    movaps %xmm9, %xmm0
; SSE4-NEXT:    blendvps %xmm0, %xmm6, %xmm2
; SSE4-NEXT:    movaps %xmm7, %xmm3
; SSE4-NEXT:    maxps %xmm8, %xmm3
; SSE4-NEXT:    cmpunordps %xmm8, %xmm8
; SSE4-NEXT:    movaps %xmm8, %xmm0
; SSE4-NEXT:    blendvps %xmm0, %xmm7, %xmm3
; SSE4-NEXT:    movaps %xmm10, %xmm0
; SSE4-NEXT:    retq
;
; AVX1-LABEL: test_intrinsic_fmax_v16f32:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vmaxps %ymm0, %ymm2, %ymm4
; AVX1-NEXT:    vcmpunordps %ymm0, %ymm0, %ymm0
; AVX1-NEXT:    vblendvps %ymm0, %ymm2, %ymm4, %ymm0
; AVX1-NEXT:    vmaxps %ymm1, %ymm3, %ymm2
; AVX1-NEXT:    vcmpunordps %ymm1, %ymm1, %ymm1
; AVX1-NEXT:    vblendvps %ymm1, %ymm3, %ymm2, %ymm1
; AVX1-NEXT:    retq
;
; AVX512-LABEL: test_intrinsic_fmax_v16f32:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vmaxps %zmm0, %zmm1, %zmm2
; AVX512-NEXT:    vcmpunordps %zmm0, %zmm0, %k1
; AVX512-NEXT:    vmovaps %zmm1, %zmm2 {%k1}
; AVX512-NEXT:    vmovaps %zmm2, %zmm0
; AVX512-NEXT:    retq
  %z = call <16 x float> @llvm.maxnum.v16f32(<16 x float> %x, <16 x float> %y) readnone
  ret <16 x float> %z
}

define <2 x double> @test_intrinsic_fmax_v2f64(<2 x double> %x, <2 x double> %y) {
; SSE2-LABEL: test_intrinsic_fmax_v2f64:
; SSE2:       # %bb.0:
; SSE2-NEXT:    movapd %xmm1, %xmm2
; SSE2-NEXT:    maxpd %xmm0, %xmm2
; SSE2-NEXT:    cmpunordpd %xmm0, %xmm0
; SSE2-NEXT:    andpd %xmm0, %xmm1
; SSE2-NEXT:    andnpd %xmm2, %xmm0
; SSE2-NEXT:    orpd %xmm1, %xmm0
; SSE2-NEXT:    retq
;
; SSE4-LABEL: test_intrinsic_fmax_v2f64:
; SSE4:       # %bb.0:
; SSE4-NEXT:    movapd %xmm1, %xmm2
; SSE4-NEXT:    maxpd %xmm0, %xmm2
; SSE4-NEXT:    cmpunordpd %xmm0, %xmm0
; SSE4-NEXT:    blendvpd %xmm0, %xmm1, %xmm2
; SSE4-NEXT:    movapd %xmm2, %xmm0
; SSE4-NEXT:    retq
;
; AVX-LABEL: test_intrinsic_fmax_v2f64:
; AVX:       # %bb.0:
; AVX-NEXT:    vmaxpd %xmm0, %xmm1, %xmm2
; AVX-NEXT:    vcmpunordpd %xmm0, %xmm0, %xmm0
; AVX-NEXT:    vblendvpd %xmm0, %xmm1, %xmm2, %xmm0
; AVX-NEXT:    retq
  %z = call <2 x double> @llvm.maxnum.v2f64(<2 x double> %x, <2 x double> %y) readnone
  ret <2 x double> %z
}

define <4 x double> @test_intrinsic_fmax_v4f64(<4 x double> %x, <4 x double> %y) {
; SSE2-LABEL: test_intrinsic_fmax_v4f64:
; SSE2:       # %bb.0:
; SSE2-NEXT:    movapd %xmm2, %xmm4
; SSE2-NEXT:    maxpd %xmm0, %xmm4
; SSE2-NEXT:    cmpunordpd %xmm0, %xmm0
; SSE2-NEXT:    andpd %xmm0, %xmm2
; SSE2-NEXT:    andnpd %xmm4, %xmm0
; SSE2-NEXT:    orpd %xmm2, %xmm0
; SSE2-NEXT:    movapd %xmm3, %xmm2
; SSE2-NEXT:    maxpd %xmm1, %xmm2
; SSE2-NEXT:    cmpunordpd %xmm1, %xmm1
; SSE2-NEXT:    andpd %xmm1, %xmm3
; SSE2-NEXT:    andnpd %xmm2, %xmm1
; SSE2-NEXT:    orpd %xmm3, %xmm1
; SSE2-NEXT:    retq
;
; SSE4-LABEL: test_intrinsic_fmax_v4f64:
; SSE4:       # %bb.0:
; SSE4-NEXT:    movapd %xmm1, %xmm5
; SSE4-NEXT:    movapd %xmm2, %xmm4
; SSE4-NEXT:    maxpd %xmm0, %xmm4
; SSE4-NEXT:    cmpunordpd %xmm0, %xmm0
; SSE4-NEXT:    blendvpd %xmm0, %xmm2, %xmm4
; SSE4-NEXT:    movapd %xmm3, %xmm1
; SSE4-NEXT:    maxpd %xmm5, %xmm1
; SSE4-NEXT:    cmpunordpd %xmm5, %xmm5
; SSE4-NEXT:    movapd %xmm5, %xmm0
; SSE4-NEXT:    blendvpd %xmm0, %xmm3, %xmm1
; SSE4-NEXT:    movapd %xmm4, %xmm0
; SSE4-NEXT:    retq
;
; AVX-LABEL: test_intrinsic_fmax_v4f64:
; AVX:       # %bb.0:
; AVX-NEXT:    vmaxpd %ymm0, %ymm1, %ymm2
; AVX-NEXT:    vcmpunordpd %ymm0, %ymm0, %ymm0
; AVX-NEXT:    vblendvpd %ymm0, %ymm1, %ymm2, %ymm0
; AVX-NEXT:    retq
  %z = call <4 x double> @llvm.maxnum.v4f64(<4 x double> %x, <4 x double> %y) readnone
  ret <4 x double> %z
}

define <8 x double> @test_intrinsic_fmax_v8f64(<8 x double> %x, <8 x double> %y) {
; SSE2-LABEL: test_intrinsic_fmax_v8f64:
; SSE2:       # %bb.0:
; SSE2-NEXT:    movapd %xmm4, %xmm8
; SSE2-NEXT:    maxpd %xmm0, %xmm8
; SSE2-NEXT:    cmpunordpd %xmm0, %xmm0
; SSE2-NEXT:    andpd %xmm0, %xmm4
; SSE2-NEXT:    andnpd %xmm8, %xmm0
; SSE2-NEXT:    orpd %xmm4, %xmm0
; SSE2-NEXT:    movapd %xmm5, %xmm4
; SSE2-NEXT:    maxpd %xmm1, %xmm4
; SSE2-NEXT:    cmpunordpd %xmm1, %xmm1
; SSE2-NEXT:    andpd %xmm1, %xmm5
; SSE2-NEXT:    andnpd %xmm4, %xmm1
; SSE2-NEXT:    orpd %xmm5, %xmm1
; SSE2-NEXT:    movapd %xmm6, %xmm4
; SSE2-NEXT:    maxpd %xmm2, %xmm4
; SSE2-NEXT:    cmpunordpd %xmm2, %xmm2
; SSE2-NEXT:    andpd %xmm2, %xmm6
; SSE2-NEXT:    andnpd %xmm4, %xmm2
; SSE2-NEXT:    orpd %xmm6, %xmm2
; SSE2-NEXT:    movapd %xmm7, %xmm4
; SSE2-NEXT:    maxpd %xmm3, %xmm4
; SSE2-NEXT:    cmpunordpd %xmm3, %xmm3
; SSE2-NEXT:    andpd %xmm3, %xmm7
; SSE2-NEXT:    andnpd %xmm4, %xmm3
; SSE2-NEXT:    orpd %xmm7, %xmm3
; SSE2-NEXT:    retq
;
; SSE4-LABEL: test_intrinsic_fmax_v8f64:
; SSE4:       # %bb.0:
; SSE4-NEXT:    movapd %xmm3, %xmm8
; SSE4-NEXT:    movapd %xmm2, %xmm9
; SSE4-NEXT:    movapd %xmm1, %xmm2
; SSE4-NEXT:    movapd %xmm4, %xmm10
; SSE4-NEXT:    maxpd %xmm0, %xmm10
; SSE4-NEXT:    cmpunordpd %xmm0, %xmm0
; SSE4-NEXT:    blendvpd %xmm0, %xmm4, %xmm10
; SSE4-NEXT:    movapd %xmm5, %xmm1
; SSE4-NEXT:    maxpd %xmm2, %xmm1
; SSE4-NEXT:    cmpunordpd %xmm2, %xmm2
; SSE4-NEXT:    movapd %xmm2, %xmm0
; SSE4-NEXT:    blendvpd %xmm0, %xmm5, %xmm1
; SSE4-NEXT:    movapd %xmm6, %xmm2
; SSE4-NEXT:    maxpd %xmm9, %xmm2
; SSE4-NEXT:    cmpunordpd %xmm9, %xmm9
; SSE4-NEXT:    movapd %xmm9, %xmm0
; SSE4-NEXT:    blendvpd %xmm0, %xmm6, %xmm2
; SSE4-NEXT:    movapd %xmm7, %xmm3
; SSE4-NEXT:    maxpd %xmm8, %xmm3
; SSE4-NEXT:    cmpunordpd %xmm8, %xmm8
; SSE4-NEXT:    movapd %xmm8, %xmm0
; SSE4-NEXT:    blendvpd %xmm0, %xmm7, %xmm3
; SSE4-NEXT:    movapd %xmm10, %xmm0
; SSE4-NEXT:    retq
;
; AVX1-LABEL: test_intrinsic_fmax_v8f64:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vmaxpd %ymm0, %ymm2, %ymm4
; AVX1-NEXT:    vcmpunordpd %ymm0, %ymm0, %ymm0
; AVX1-NEXT:    vblendvpd %ymm0, %ymm2, %ymm4, %ymm0
; AVX1-NEXT:    vmaxpd %ymm1, %ymm3, %ymm2
; AVX1-NEXT:    vcmpunordpd %ymm1, %ymm1, %ymm1
; AVX1-NEXT:    vblendvpd %ymm1, %ymm3, %ymm2, %ymm1
; AVX1-NEXT:    retq
;
; AVX512-LABEL: test_intrinsic_fmax_v8f64:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vmaxpd %zmm0, %zmm1, %zmm2
; AVX512-NEXT:    vcmpunordpd %zmm0, %zmm0, %k1
; AVX512-NEXT:    vmovapd %zmm1, %zmm2 {%k1}
; AVX512-NEXT:    vmovapd %zmm2, %zmm0
; AVX512-NEXT:    retq
  %z = call <8 x double> @llvm.maxnum.v8f64(<8 x double> %x, <8 x double> %y) readnone
  ret <8 x double> %z
}

; The IR-level fast-math flags (FMF) propagate to the SelectionDAG node. With
; nnan, no NaN check is needed, so there's no blend.

define double @maxnum_intrinsic_nnan_fmf_f64(double %a, double %b) {
; SSE-LABEL: maxnum_intrinsic_nnan_fmf_f64:
; SSE:       # %bb.0:
; SSE-NEXT:    maxsd %xmm1, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: maxnum_intrinsic_nnan_fmf_f64:
; AVX:       # %bb.0:
; AVX-NEXT:    vmaxsd %xmm1, %xmm0, %xmm0
; AVX-NEXT:    retq
  %r = tail call nnan double @llvm.maxnum.f64(double %a, double %b)
  ret double %r
}

; Make sure vectors work too.

define <4 x float> @maxnum_intrinsic_nnan_fmf_v4f32(<4 x float> %a, <4 x float> %b) {
; SSE-LABEL: maxnum_intrinsic_nnan_fmf_v4f32:
; SSE:       # %bb.0:
; SSE-NEXT:    maxps %xmm1, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: maxnum_intrinsic_nnan_fmf_v4f32:
; AVX:       # %bb.0:
; AVX-NEXT:    vmaxps %xmm1, %xmm0, %xmm0
; AVX-NEXT:    retq
  %r = tail call nnan <4 x float> @llvm.maxnum.v4f32(<4 x float> %a, <4 x float> %b)
  ret <4 x float> %r
}

; Currently (though this may become legacy someday), a function-level
; attribute should also enable the fold.

define float @maxnum_intrinsic_nnan_attr_f32(float %a, float %b) #0 {
; SSE-LABEL: maxnum_intrinsic_nnan_attr_f32:
; SSE:       # %bb.0:
; SSE-NEXT:    maxss %xmm1, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: maxnum_intrinsic_nnan_attr_f32:
; AVX:       # %bb.0:
; AVX-NEXT:    vmaxss %xmm1, %xmm0, %xmm0
; AVX-NEXT:    retq
  %r = tail call float @llvm.maxnum.f32(float %a, float %b)
  ret float %r
}

; Make sure vectors work too.

define <2 x double> @maxnum_intrinsic_nnan_attr_f64(<2 x double> %a, <2 x double> %b) #0 {
; SSE-LABEL: maxnum_intrinsic_nnan_attr_f64:
; SSE:       # %bb.0:
; SSE-NEXT:    maxpd %xmm1, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: maxnum_intrinsic_nnan_attr_f64:
; AVX:       # %bb.0:
; AVX-NEXT:    vmaxpd %xmm1, %xmm0, %xmm0
; AVX-NEXT:    retq
  %r = tail call <2 x double> @llvm.maxnum.v2f64(<2 x double> %a, <2 x double> %b)
  ret <2 x double> %r
}

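; When one operand is a constant known not to be NaN, no blend is needed: the
; constant is commuted into the source position that the x86 max instruction
; returns on a NaN input, which matches maxnum's semantics. Operand order does
; not matter here because maxnum is commutative.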
define float @test_maxnum_const_op1(float %x) {
; SSE-LABEL: test_maxnum_const_op1:
; SSE:       # %bb.0:
; SSE-NEXT:    maxss {{.*}}(%rip), %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: test_maxnum_const_op1:
; AVX:       # %bb.0:
; AVX-NEXT:    vmaxss {{.*}}(%rip), %xmm0, %xmm0
; AVX-NEXT:    retq
  %r = call float @llvm.maxnum.f32(float 1.0, float %x)
  ret float %r
}

define float @test_maxnum_const_op2(float %x) {
; SSE-LABEL: test_maxnum_const_op2:
; SSE:       # %bb.0:
; SSE-NEXT:    maxss {{.*}}(%rip), %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: test_maxnum_const_op2:
; AVX:       # %bb.0:
; AVX-NEXT:    vmaxss {{.*}}(%rip), %xmm0, %xmm0
; AVX-NEXT:    retq
  %r = call float @llvm.maxnum.f32(float %x, float 1.0)
  ret float %r
}

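; maxnum with a NaN constant operand folds to the other operand, so this
; compiles to a plain return.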
define float @test_maxnum_const_nan(float %x) {
; CHECK-LABEL: test_maxnum_const_nan:
; CHECK:       # %bb.0:
; CHECK-NEXT:    retq
  %r = call float @llvm.maxnum.f32(float %x, float 0x7fff000000000000)
  ret float %r
}

attributes #0 = { "no-nans-fp-math"="true" }