1; NOTE: Assertions have been autogenerated by update_llc_test_checks.py
2; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
3; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx,+fma -fp-contract=fast | FileCheck %s --check-prefix=ALL --check-prefix=FMA
4; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx,+fma4,+fma -fp-contract=fast | FileCheck %s --check-prefix=ALL --check-prefix=FMA4
5; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx,+fma4 -fp-contract=fast | FileCheck %s --check-prefix=ALL --check-prefix=FMA4
6; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512dq,+avx512vl -fp-contract=fast | FileCheck %s --check-prefix=ALL --check-prefix=AVX512
7
8;
9; Pattern: (fadd (fmul x, y), z) -> (fmadd x, y, z)
10;
11
; Scalar float: (a0 * a1) + a2 is expected to be emitted as a single fused
; multiply-add. The AVX-512 expectation additionally has a vmovaps register
; copy after the fused instruction.
12define float @test_f32_fmadd(float %a0, float %a1, float %a2) {
13; FMA-LABEL: test_f32_fmadd:
14; FMA:       # BB#0:
15; FMA-NEXT:    vfmadd213ss %xmm2, %xmm1, %xmm0
16; FMA-NEXT:    retq
17;
18; FMA4-LABEL: test_f32_fmadd:
19; FMA4:       # BB#0:
20; FMA4-NEXT:    vfmaddss %xmm2, %xmm1, %xmm0, %xmm0
21; FMA4-NEXT:    retq
22;
23; AVX512-LABEL: test_f32_fmadd:
24; AVX512:       # BB#0:
25; AVX512-NEXT:    vfmadd213ss %xmm2, %xmm0, %xmm1
26; AVX512-NEXT:    vmovaps %zmm1, %zmm0
27; AVX512-NEXT:    retq
28  %x = fmul float %a0, %a1
29  %res = fadd float %x, %a2
30  ret float %res
31}
32
; <4 x float>: (a0 * a1) + a2 is expected to be emitted as a single packed
; fused multiply-add on xmm registers in every run configuration.
33define <4 x float> @test_4f32_fmadd(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2) {
34; FMA-LABEL: test_4f32_fmadd:
35; FMA:       # BB#0:
36; FMA-NEXT:    vfmadd213ps %xmm2, %xmm1, %xmm0
37; FMA-NEXT:    retq
38;
39; FMA4-LABEL: test_4f32_fmadd:
40; FMA4:       # BB#0:
41; FMA4-NEXT:    vfmaddps %xmm2, %xmm1, %xmm0, %xmm0
42; FMA4-NEXT:    retq
43;
44; AVX512-LABEL: test_4f32_fmadd:
45; AVX512:       # BB#0:
46; AVX512-NEXT:    vfmadd213ps %xmm2, %xmm1, %xmm0
47; AVX512-NEXT:    retq
48  %x = fmul <4 x float> %a0, %a1
49  %res = fadd <4 x float> %x, %a2
50  ret <4 x float> %res
51}
52
; <8 x float>: (a0 * a1) + a2 is expected to be emitted as a single packed
; fused multiply-add on ymm registers in every run configuration.
53define <8 x float> @test_8f32_fmadd(<8 x float> %a0, <8 x float> %a1, <8 x float> %a2) {
54; FMA-LABEL: test_8f32_fmadd:
55; FMA:       # BB#0:
56; FMA-NEXT:    vfmadd213ps %ymm2, %ymm1, %ymm0
57; FMA-NEXT:    retq
58;
59; FMA4-LABEL: test_8f32_fmadd:
60; FMA4:       # BB#0:
61; FMA4-NEXT:    vfmaddps %ymm2, %ymm1, %ymm0, %ymm0
62; FMA4-NEXT:    retq
63;
64; AVX512-LABEL: test_8f32_fmadd:
65; AVX512:       # BB#0:
66; AVX512-NEXT:    vfmadd213ps %ymm2, %ymm1, %ymm0
67; AVX512-NEXT:    retq
68  %x = fmul <8 x float> %a0, %a1
69  %res = fadd <8 x float> %x, %a2
70  ret <8 x float> %res
71}
72
; Scalar double: (a0 * a1) + a2 is expected to be emitted as a single fused
; multiply-add. The AVX-512 expectation additionally has a vmovaps register
; copy after the fused instruction.
73define double @test_f64_fmadd(double %a0, double %a1, double %a2) {
74; FMA-LABEL: test_f64_fmadd:
75; FMA:       # BB#0:
76; FMA-NEXT:    vfmadd213sd %xmm2, %xmm1, %xmm0
77; FMA-NEXT:    retq
78;
79; FMA4-LABEL: test_f64_fmadd:
80; FMA4:       # BB#0:
81; FMA4-NEXT:    vfmaddsd %xmm2, %xmm1, %xmm0, %xmm0
82; FMA4-NEXT:    retq
83;
84; AVX512-LABEL: test_f64_fmadd:
85; AVX512:       # BB#0:
86; AVX512-NEXT:    vfmadd213sd %xmm2, %xmm0, %xmm1
87; AVX512-NEXT:    vmovaps %zmm1, %zmm0
88; AVX512-NEXT:    retq
89  %x = fmul double %a0, %a1
90  %res = fadd double %x, %a2
91  ret double %res
92}
93
; <2 x double>: (a0 * a1) + a2 is expected to be emitted as a single packed
; fused multiply-add on xmm registers in every run configuration.
94define <2 x double> @test_2f64_fmadd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2) {
95; FMA-LABEL: test_2f64_fmadd:
96; FMA:       # BB#0:
97; FMA-NEXT:    vfmadd213pd %xmm2, %xmm1, %xmm0
98; FMA-NEXT:    retq
99;
100; FMA4-LABEL: test_2f64_fmadd:
101; FMA4:       # BB#0:
102; FMA4-NEXT:    vfmaddpd %xmm2, %xmm1, %xmm0, %xmm0
103; FMA4-NEXT:    retq
104;
105; AVX512-LABEL: test_2f64_fmadd:
106; AVX512:       # BB#0:
107; AVX512-NEXT:    vfmadd213pd %xmm2, %xmm1, %xmm0
108; AVX512-NEXT:    retq
109  %x = fmul <2 x double> %a0, %a1
110  %res = fadd <2 x double> %x, %a2
111  ret <2 x double> %res
112}
113
; <4 x double>: (a0 * a1) + a2 is expected to be emitted as a single packed
; fused multiply-add on ymm registers in every run configuration.
114define <4 x double> @test_4f64_fmadd(<4 x double> %a0, <4 x double> %a1, <4 x double> %a2) {
115; FMA-LABEL: test_4f64_fmadd:
116; FMA:       # BB#0:
117; FMA-NEXT:    vfmadd213pd %ymm2, %ymm1, %ymm0
118; FMA-NEXT:    retq
119;
120; FMA4-LABEL: test_4f64_fmadd:
121; FMA4:       # BB#0:
122; FMA4-NEXT:    vfmaddpd %ymm2, %ymm1, %ymm0, %ymm0
123; FMA4-NEXT:    retq
124;
125; AVX512-LABEL: test_4f64_fmadd:
126; AVX512:       # BB#0:
127; AVX512-NEXT:    vfmadd213pd %ymm2, %ymm1, %ymm0
128; AVX512-NEXT:    retq
129  %x = fmul <4 x double> %a0, %a1
130  %res = fadd <4 x double> %x, %a2
131  ret <4 x double> %res
132}
133
134;
135; Pattern: (fsub (fmul x, y), z) -> (fmsub x, y, z)
136;
137
; Scalar float: (a0 * a1) - a2 is expected to be emitted as a single fused
; multiply-subtract. The AVX-512 expectation additionally has a vmovaps
; register copy after the fused instruction.
138define float @test_f32_fmsub(float %a0, float %a1, float %a2) {
139; FMA-LABEL: test_f32_fmsub:
140; FMA:       # BB#0:
141; FMA-NEXT:    vfmsub213ss %xmm2, %xmm1, %xmm0
142; FMA-NEXT:    retq
143;
144; FMA4-LABEL: test_f32_fmsub:
145; FMA4:       # BB#0:
146; FMA4-NEXT:    vfmsubss %xmm2, %xmm1, %xmm0, %xmm0
147; FMA4-NEXT:    retq
148;
149; AVX512-LABEL: test_f32_fmsub:
150; AVX512:       # BB#0:
151; AVX512-NEXT:    vfmsub213ss %xmm2, %xmm0, %xmm1
152; AVX512-NEXT:    vmovaps %zmm1, %zmm0
153; AVX512-NEXT:    retq
154  %x = fmul float %a0, %a1
155  %res = fsub float %x, %a2
156  ret float %res
157}
158
; <4 x float>: (a0 * a1) - a2 is expected to be emitted as a single packed
; fused multiply-subtract on xmm registers in every run configuration.
159define <4 x float> @test_4f32_fmsub(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2) {
160; FMA-LABEL: test_4f32_fmsub:
161; FMA:       # BB#0:
162; FMA-NEXT:    vfmsub213ps %xmm2, %xmm1, %xmm0
163; FMA-NEXT:    retq
164;
165; FMA4-LABEL: test_4f32_fmsub:
166; FMA4:       # BB#0:
167; FMA4-NEXT:    vfmsubps %xmm2, %xmm1, %xmm0, %xmm0
168; FMA4-NEXT:    retq
169;
170; AVX512-LABEL: test_4f32_fmsub:
171; AVX512:       # BB#0:
172; AVX512-NEXT:    vfmsub213ps %xmm2, %xmm1, %xmm0
173; AVX512-NEXT:    retq
174  %x = fmul <4 x float> %a0, %a1
175  %res = fsub <4 x float> %x, %a2
176  ret <4 x float> %res
177}
178
; <8 x float>: (a0 * a1) - a2 is expected to be emitted as a single packed
; fused multiply-subtract on ymm registers in every run configuration.
179define <8 x float> @test_8f32_fmsub(<8 x float> %a0, <8 x float> %a1, <8 x float> %a2) {
180; FMA-LABEL: test_8f32_fmsub:
181; FMA:       # BB#0:
182; FMA-NEXT:    vfmsub213ps %ymm2, %ymm1, %ymm0
183; FMA-NEXT:    retq
184;
185; FMA4-LABEL: test_8f32_fmsub:
186; FMA4:       # BB#0:
187; FMA4-NEXT:    vfmsubps %ymm2, %ymm1, %ymm0, %ymm0
188; FMA4-NEXT:    retq
189;
190; AVX512-LABEL: test_8f32_fmsub:
191; AVX512:       # BB#0:
192; AVX512-NEXT:    vfmsub213ps %ymm2, %ymm1, %ymm0
193; AVX512-NEXT:    retq
194  %x = fmul <8 x float> %a0, %a1
195  %res = fsub <8 x float> %x, %a2
196  ret <8 x float> %res
197}
198
; Scalar double: (a0 * a1) - a2 is expected to be emitted as a single fused
; multiply-subtract. The AVX-512 expectation additionally has a vmovaps
; register copy after the fused instruction.
199define double @test_f64_fmsub(double %a0, double %a1, double %a2) {
200; FMA-LABEL: test_f64_fmsub:
201; FMA:       # BB#0:
202; FMA-NEXT:    vfmsub213sd %xmm2, %xmm1, %xmm0
203; FMA-NEXT:    retq
204;
205; FMA4-LABEL: test_f64_fmsub:
206; FMA4:       # BB#0:
207; FMA4-NEXT:    vfmsubsd %xmm2, %xmm1, %xmm0, %xmm0
208; FMA4-NEXT:    retq
209;
210; AVX512-LABEL: test_f64_fmsub:
211; AVX512:       # BB#0:
212; AVX512-NEXT:    vfmsub213sd %xmm2, %xmm0, %xmm1
213; AVX512-NEXT:    vmovaps %zmm1, %zmm0
214; AVX512-NEXT:    retq
215  %x = fmul double %a0, %a1
216  %res = fsub double %x, %a2
217  ret double %res
218}
219
; <2 x double>: (a0 * a1) - a2 is expected to be emitted as a single packed
; fused multiply-subtract on xmm registers in every run configuration.
220define <2 x double> @test_2f64_fmsub(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2) {
221; FMA-LABEL: test_2f64_fmsub:
222; FMA:       # BB#0:
223; FMA-NEXT:    vfmsub213pd %xmm2, %xmm1, %xmm0
224; FMA-NEXT:    retq
225;
226; FMA4-LABEL: test_2f64_fmsub:
227; FMA4:       # BB#0:
228; FMA4-NEXT:    vfmsubpd %xmm2, %xmm1, %xmm0, %xmm0
229; FMA4-NEXT:    retq
230;
231; AVX512-LABEL: test_2f64_fmsub:
232; AVX512:       # BB#0:
233; AVX512-NEXT:    vfmsub213pd %xmm2, %xmm1, %xmm0
234; AVX512-NEXT:    retq
235  %x = fmul <2 x double> %a0, %a1
236  %res = fsub <2 x double> %x, %a2
237  ret <2 x double> %res
238}
239
; <4 x double>: (a0 * a1) - a2 is expected to be emitted as a single packed
; fused multiply-subtract on ymm registers in every run configuration.
240define <4 x double> @test_4f64_fmsub(<4 x double> %a0, <4 x double> %a1, <4 x double> %a2) {
241; FMA-LABEL: test_4f64_fmsub:
242; FMA:       # BB#0:
243; FMA-NEXT:    vfmsub213pd %ymm2, %ymm1, %ymm0
244; FMA-NEXT:    retq
245;
246; FMA4-LABEL: test_4f64_fmsub:
247; FMA4:       # BB#0:
248; FMA4-NEXT:    vfmsubpd %ymm2, %ymm1, %ymm0, %ymm0
249; FMA4-NEXT:    retq
250;
251; AVX512-LABEL: test_4f64_fmsub:
252; AVX512:       # BB#0:
253; AVX512-NEXT:    vfmsub213pd %ymm2, %ymm1, %ymm0
254; AVX512-NEXT:    retq
255  %x = fmul <4 x double> %a0, %a1
256  %res = fsub <4 x double> %x, %a2
257  ret <4 x double> %res
258}
259
260;
261; Pattern: (fsub z, (fmul x, y)) -> (fnmadd x, y, z)
262;
263
; Scalar float: a2 - (a0 * a1) is expected to be emitted as a single fused
; negated-multiply-add. The AVX-512 expectation additionally has a vmovaps
; register copy after the fused instruction.
264define float @test_f32_fnmadd(float %a0, float %a1, float %a2) {
265; FMA-LABEL: test_f32_fnmadd:
266; FMA:       # BB#0:
267; FMA-NEXT:    vfnmadd213ss %xmm2, %xmm1, %xmm0
268; FMA-NEXT:    retq
269;
270; FMA4-LABEL: test_f32_fnmadd:
271; FMA4:       # BB#0:
272; FMA4-NEXT:    vfnmaddss %xmm2, %xmm1, %xmm0, %xmm0
273; FMA4-NEXT:    retq
274;
275; AVX512-LABEL: test_f32_fnmadd:
276; AVX512:       # BB#0:
277; AVX512-NEXT:    vfnmadd213ss %xmm2, %xmm0, %xmm1
278; AVX512-NEXT:    vmovaps %zmm1, %zmm0
279; AVX512-NEXT:    retq
280  %x = fmul float %a0, %a1
281  %res = fsub float %a2, %x
282  ret float %res
283}
284
; <4 x float>: a2 - (a0 * a1) is expected to be emitted as a single packed
; fused negated-multiply-add on xmm registers in every run configuration.
285define <4 x float> @test_4f32_fnmadd(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2) {
286; FMA-LABEL: test_4f32_fnmadd:
287; FMA:       # BB#0:
288; FMA-NEXT:    vfnmadd213ps %xmm2, %xmm1, %xmm0
289; FMA-NEXT:    retq
290;
291; FMA4-LABEL: test_4f32_fnmadd:
292; FMA4:       # BB#0:
293; FMA4-NEXT:    vfnmaddps %xmm2, %xmm1, %xmm0, %xmm0
294; FMA4-NEXT:    retq
295;
296; AVX512-LABEL: test_4f32_fnmadd:
297; AVX512:       # BB#0:
298; AVX512-NEXT:    vfnmadd213ps %xmm2, %xmm1, %xmm0
299; AVX512-NEXT:    retq
300  %x = fmul <4 x float> %a0, %a1
301  %res = fsub <4 x float> %a2, %x
302  ret <4 x float> %res
303}
304
; <8 x float>: a2 - (a0 * a1) is expected to be emitted as a single packed
; fused negated-multiply-add on ymm registers in every run configuration.
305define <8 x float> @test_8f32_fnmadd(<8 x float> %a0, <8 x float> %a1, <8 x float> %a2) {
306; FMA-LABEL: test_8f32_fnmadd:
307; FMA:       # BB#0:
308; FMA-NEXT:    vfnmadd213ps %ymm2, %ymm1, %ymm0
309; FMA-NEXT:    retq
310;
311; FMA4-LABEL: test_8f32_fnmadd:
312; FMA4:       # BB#0:
313; FMA4-NEXT:    vfnmaddps %ymm2, %ymm1, %ymm0, %ymm0
314; FMA4-NEXT:    retq
315;
316; AVX512-LABEL: test_8f32_fnmadd:
317; AVX512:       # BB#0:
318; AVX512-NEXT:    vfnmadd213ps %ymm2, %ymm1, %ymm0
319; AVX512-NEXT:    retq
320  %x = fmul <8 x float> %a0, %a1
321  %res = fsub <8 x float> %a2, %x
322  ret <8 x float> %res
323}
324
; Scalar double: a2 - (a0 * a1) is expected to be emitted as a single fused
; negated-multiply-add. The AVX-512 expectation additionally has a vmovaps
; register copy after the fused instruction.
325define double @test_f64_fnmadd(double %a0, double %a1, double %a2) {
326; FMA-LABEL: test_f64_fnmadd:
327; FMA:       # BB#0:
328; FMA-NEXT:    vfnmadd213sd %xmm2, %xmm1, %xmm0
329; FMA-NEXT:    retq
330;
331; FMA4-LABEL: test_f64_fnmadd:
332; FMA4:       # BB#0:
333; FMA4-NEXT:    vfnmaddsd %xmm2, %xmm1, %xmm0, %xmm0
334; FMA4-NEXT:    retq
335;
336; AVX512-LABEL: test_f64_fnmadd:
337; AVX512:       # BB#0:
338; AVX512-NEXT:    vfnmadd213sd %xmm2, %xmm0, %xmm1
339; AVX512-NEXT:    vmovaps %zmm1, %zmm0
340; AVX512-NEXT:    retq
341  %x = fmul double %a0, %a1
342  %res = fsub double %a2, %x
343  ret double %res
344}
345
; <2 x double>: a2 - (a0 * a1) is expected to be emitted as a single packed
; fused negated-multiply-add on xmm registers in every run configuration.
346define <2 x double> @test_2f64_fnmadd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2) {
347; FMA-LABEL: test_2f64_fnmadd:
348; FMA:       # BB#0:
349; FMA-NEXT:    vfnmadd213pd %xmm2, %xmm1, %xmm0
350; FMA-NEXT:    retq
351;
352; FMA4-LABEL: test_2f64_fnmadd:
353; FMA4:       # BB#0:
354; FMA4-NEXT:    vfnmaddpd %xmm2, %xmm1, %xmm0, %xmm0
355; FMA4-NEXT:    retq
356;
357; AVX512-LABEL: test_2f64_fnmadd:
358; AVX512:       # BB#0:
359; AVX512-NEXT:    vfnmadd213pd %xmm2, %xmm1, %xmm0
360; AVX512-NEXT:    retq
361  %x = fmul <2 x double> %a0, %a1
362  %res = fsub <2 x double> %a2, %x
363  ret <2 x double> %res
364}
365
; <4 x double>: a2 - (a0 * a1) is expected to be emitted as a single packed
; fused negated-multiply-add on ymm registers in every run configuration.
366define <4 x double> @test_4f64_fnmadd(<4 x double> %a0, <4 x double> %a1, <4 x double> %a2) {
367; FMA-LABEL: test_4f64_fnmadd:
368; FMA:       # BB#0:
369; FMA-NEXT:    vfnmadd213pd %ymm2, %ymm1, %ymm0
370; FMA-NEXT:    retq
371;
372; FMA4-LABEL: test_4f64_fnmadd:
373; FMA4:       # BB#0:
374; FMA4-NEXT:    vfnmaddpd %ymm2, %ymm1, %ymm0, %ymm0
375; FMA4-NEXT:    retq
376;
377; AVX512-LABEL: test_4f64_fnmadd:
378; AVX512:       # BB#0:
379; AVX512-NEXT:    vfnmadd213pd %ymm2, %ymm1, %ymm0
380; AVX512-NEXT:    retq
381  %x = fmul <4 x double> %a0, %a1
382  %res = fsub <4 x double> %a2, %x
383  ret <4 x double> %res
384}
385
386;
387; Pattern: (fsub (fneg (fmul x, y)), z) -> (fnmsub x, y, z)
388;
389
; Scalar float: (-0.0 - (a0 * a1)) - a2, i.e. the negated product minus a2,
; is expected to be emitted as a single fused negated-multiply-subtract.
; The AVX-512 expectation additionally has a vmovaps register copy.
390define float @test_f32_fnmsub(float %a0, float %a1, float %a2) {
391; FMA-LABEL: test_f32_fnmsub:
392; FMA:       # BB#0:
393; FMA-NEXT:    vfnmsub213ss %xmm2, %xmm1, %xmm0
394; FMA-NEXT:    retq
395;
396; FMA4-LABEL: test_f32_fnmsub:
397; FMA4:       # BB#0:
398; FMA4-NEXT:    vfnmsubss %xmm2, %xmm1, %xmm0, %xmm0
399; FMA4-NEXT:    retq
400;
401; AVX512-LABEL: test_f32_fnmsub:
402; AVX512:       # BB#0:
403; AVX512-NEXT:    vfnmsub213ss %xmm2, %xmm0, %xmm1
404; AVX512-NEXT:    vmovaps %zmm1, %zmm0
405; AVX512-NEXT:    retq
406  %x = fmul float %a0, %a1
407  %y = fsub float -0.000000e+00, %x
408  %res = fsub float %y, %a2
409  ret float %res
410}
411
; <4 x float>: (-0.0 - (a0 * a1)) - a2 is expected to be emitted as a single
; packed fused negated-multiply-subtract on xmm registers. The negation is
; written in the IR as a subtraction from a -0.0 splat.
412define <4 x float> @test_4f32_fnmsub(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2) {
413; FMA-LABEL: test_4f32_fnmsub:
414; FMA:       # BB#0:
415; FMA-NEXT:    vfnmsub213ps %xmm2, %xmm1, %xmm0
416; FMA-NEXT:    retq
417;
418; FMA4-LABEL: test_4f32_fnmsub:
419; FMA4:       # BB#0:
420; FMA4-NEXT:    vfnmsubps %xmm2, %xmm1, %xmm0, %xmm0
421; FMA4-NEXT:    retq
422;
423; AVX512-LABEL: test_4f32_fnmsub:
424; AVX512:       # BB#0:
425; AVX512-NEXT:    vfnmsub213ps %xmm2, %xmm1, %xmm0
426; AVX512-NEXT:    retq
427  %x = fmul <4 x float> %a0, %a1
428  %y = fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %x
429  %res = fsub <4 x float> %y, %a2
430  ret <4 x float> %res
431}
432
; <8 x float>: (-0.0 - (a0 * a1)) - a2 is expected to be emitted as a single
; packed fused negated-multiply-subtract on ymm registers. The negation is
; written in the IR as a subtraction from a -0.0 splat.
433define <8 x float> @test_8f32_fnmsub(<8 x float> %a0, <8 x float> %a1, <8 x float> %a2) {
434; FMA-LABEL: test_8f32_fnmsub:
435; FMA:       # BB#0:
436; FMA-NEXT:    vfnmsub213ps %ymm2, %ymm1, %ymm0
437; FMA-NEXT:    retq
438;
439; FMA4-LABEL: test_8f32_fnmsub:
440; FMA4:       # BB#0:
441; FMA4-NEXT:    vfnmsubps %ymm2, %ymm1, %ymm0, %ymm0
442; FMA4-NEXT:    retq
443;
444; AVX512-LABEL: test_8f32_fnmsub:
445; AVX512:       # BB#0:
446; AVX512-NEXT:    vfnmsub213ps %ymm2, %ymm1, %ymm0
447; AVX512-NEXT:    retq
448  %x = fmul <8 x float> %a0, %a1
449  %y = fsub <8 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %x
450  %res = fsub <8 x float> %y, %a2
451  ret <8 x float> %res
452}
453
; Scalar double: (-0.0 - (a0 * a1)) - a2, i.e. the negated product minus a2,
; is expected to be emitted as a single fused negated-multiply-subtract.
; The AVX-512 expectation additionally has a vmovaps register copy.
454define double @test_f64_fnmsub(double %a0, double %a1, double %a2) {
455; FMA-LABEL: test_f64_fnmsub:
456; FMA:       # BB#0:
457; FMA-NEXT:    vfnmsub213sd %xmm2, %xmm1, %xmm0
458; FMA-NEXT:    retq
459;
460; FMA4-LABEL: test_f64_fnmsub:
461; FMA4:       # BB#0:
462; FMA4-NEXT:    vfnmsubsd %xmm2, %xmm1, %xmm0, %xmm0
463; FMA4-NEXT:    retq
464;
465; AVX512-LABEL: test_f64_fnmsub:
466; AVX512:       # BB#0:
467; AVX512-NEXT:    vfnmsub213sd %xmm2, %xmm0, %xmm1
468; AVX512-NEXT:    vmovaps %zmm1, %zmm0
469; AVX512-NEXT:    retq
470  %x = fmul double %a0, %a1
471  %y = fsub double -0.000000e+00, %x
472  %res = fsub double %y, %a2
473  ret double %res
474}
475
; <2 x double>: (-0.0 - (a0 * a1)) - a2 is expected to be emitted as a single
; packed fused negated-multiply-subtract on xmm registers. The negation is
; written in the IR as a subtraction from a -0.0 splat.
476define <2 x double> @test_2f64_fnmsub(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2) {
477; FMA-LABEL: test_2f64_fnmsub:
478; FMA:       # BB#0:
479; FMA-NEXT:    vfnmsub213pd %xmm2, %xmm1, %xmm0
480; FMA-NEXT:    retq
481;
482; FMA4-LABEL: test_2f64_fnmsub:
483; FMA4:       # BB#0:
484; FMA4-NEXT:    vfnmsubpd %xmm2, %xmm1, %xmm0, %xmm0
485; FMA4-NEXT:    retq
486;
487; AVX512-LABEL: test_2f64_fnmsub:
488; AVX512:       # BB#0:
489; AVX512-NEXT:    vfnmsub213pd %xmm2, %xmm1, %xmm0
490; AVX512-NEXT:    retq
491  %x = fmul <2 x double> %a0, %a1
492  %y = fsub <2 x double> <double -0.000000e+00, double -0.000000e+00>, %x
493  %res = fsub <2 x double> %y, %a2
494  ret <2 x double> %res
495}
496
; <4 x double>: (-0.0 - (a0 * a1)) - a2 is expected to be emitted as a single
; packed fused negated-multiply-subtract on ymm registers. The negation is
; written in the IR as a subtraction from a -0.0 splat.
497define <4 x double> @test_4f64_fnmsub(<4 x double> %a0, <4 x double> %a1, <4 x double> %a2) {
498; FMA-LABEL: test_4f64_fnmsub:
499; FMA:       # BB#0:
500; FMA-NEXT:    vfnmsub213pd %ymm2, %ymm1, %ymm0
501; FMA-NEXT:    retq
502;
503; FMA4-LABEL: test_4f64_fnmsub:
504; FMA4:       # BB#0:
505; FMA4-NEXT:    vfnmsubpd %ymm2, %ymm1, %ymm0, %ymm0
506; FMA4-NEXT:    retq
507;
508; AVX512-LABEL: test_4f64_fnmsub:
509; AVX512:       # BB#0:
510; AVX512-NEXT:    vfnmsub213pd %ymm2, %ymm1, %ymm0
511; AVX512-NEXT:    retq
512  %x = fmul <4 x double> %a0, %a1
513  %y = fsub <4 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %x
514  %res = fsub <4 x double> %y, %a2
515  ret <4 x double> %res
516}
517
518;
519; Load Folding Patterns
520;
521
; One multiplicand is loaded from the pointer %a0: (load(a0) * a1) + a2.
; The +fma and +fma4 expectations use (%rdi) directly as a memory operand of
; the fused instruction; the AVX-512 expectation instead records a separate
; vmovaps load followed by a register-only fused op and a register copy.
522define <4 x float> @test_4f32_fmadd_load(<4 x float>* %a0, <4 x float> %a1, <4 x float> %a2) {
523; FMA-LABEL: test_4f32_fmadd_load:
524; FMA:       # BB#0:
525; FMA-NEXT:    vfmadd132ps (%rdi), %xmm1, %xmm0
526; FMA-NEXT:    retq
527;
528; FMA4-LABEL: test_4f32_fmadd_load:
529; FMA4:       # BB#0:
530; FMA4-NEXT:    vfmaddps %xmm1, (%rdi), %xmm0, %xmm0
531; FMA4-NEXT:    retq
532;
533; AVX512-LABEL: test_4f32_fmadd_load:
534; AVX512:       # BB#0:
535; AVX512-NEXT:    vmovaps (%rdi), %xmm2
536; AVX512-NEXT:    vfmadd213ps %xmm1, %xmm0, %xmm2
537; AVX512-NEXT:    vmovaps %zmm2, %zmm0
538; AVX512-NEXT:    retq
539  %x = load <4 x float>, <4 x float>* %a0
540  %y = fmul <4 x float> %x, %a1
541  %res = fadd <4 x float> %y, %a2
542  ret <4 x float> %res
543}
544
; One multiplicand is loaded from the pointer %a0: (load(a0) * a1) - a2.
; The +fma and +fma4 expectations use (%rdi) directly as a memory operand of
; the fused instruction; the AVX-512 expectation instead records a separate
; vmovapd load followed by a register-only fused op and a register copy.
545define <2 x double> @test_2f64_fmsub_load(<2 x double>* %a0, <2 x double> %a1, <2 x double> %a2) {
546; FMA-LABEL: test_2f64_fmsub_load:
547; FMA:       # BB#0:
548; FMA-NEXT:    vfmsub132pd (%rdi), %xmm1, %xmm0
549; FMA-NEXT:    retq
550;
551; FMA4-LABEL: test_2f64_fmsub_load:
552; FMA4:       # BB#0:
553; FMA4-NEXT:    vfmsubpd %xmm1, (%rdi), %xmm0, %xmm0
554; FMA4-NEXT:    retq
555;
556; AVX512-LABEL: test_2f64_fmsub_load:
557; AVX512:       # BB#0:
558; AVX512-NEXT:    vmovapd (%rdi), %xmm2
559; AVX512-NEXT:    vfmsub213pd %xmm1, %xmm0, %xmm2
560; AVX512-NEXT:    vmovaps %zmm2, %zmm0
561; AVX512-NEXT:    retq
562  %x = load <2 x double>, <2 x double>* %a0
563  %y = fmul <2 x double> %x, %a1
564  %res = fsub <2 x double> %y, %a2
565  ret <2 x double> %res
566}
567
568;
569; Patterns (+ fneg variants): mul(add(1.0,x),y), mul(sub(1.0,x),y), mul(sub(x,1.0),y)
570;
571
; (x + 1.0) * y is expected to be emitted as one fused multiply-add that
; computes x * y + y (y appears as both multiplicand and addend).
572define <4 x float> @test_v4f32_mul_add_x_one_y(<4 x float> %x, <4 x float> %y) {
573; FMA-LABEL: test_v4f32_mul_add_x_one_y:
574; FMA:       # BB#0:
575; FMA-NEXT:    vfmadd213ps %xmm1, %xmm1, %xmm0
576; FMA-NEXT:    retq
577;
578; FMA4-LABEL: test_v4f32_mul_add_x_one_y:
579; FMA4:       # BB#0:
580; FMA4-NEXT:    vfmaddps %xmm1, %xmm1, %xmm0, %xmm0
581; FMA4-NEXT:    retq
582;
583; AVX512-LABEL: test_v4f32_mul_add_x_one_y:
584; AVX512:       # BB#0:
585; AVX512-NEXT:    vfmadd213ps %xmm1, %xmm1, %xmm0
586; AVX512-NEXT:    retq
587  %a = fadd <4 x float> %x, <float 1.0, float 1.0, float 1.0, float 1.0>
588  %m = fmul <4 x float> %a, %y
589  ret <4 x float> %m
590}
591
; Commuted multiply: y * (x + 1.0) is expected to produce the same single
; fused multiply-add (x * y + y) as the non-commuted form.
592define <4 x float> @test_v4f32_mul_y_add_x_one(<4 x float> %x, <4 x float> %y) {
593; FMA-LABEL: test_v4f32_mul_y_add_x_one:
594; FMA:       # BB#0:
595; FMA-NEXT:    vfmadd213ps %xmm1, %xmm1, %xmm0
596; FMA-NEXT:    retq
597;
598; FMA4-LABEL: test_v4f32_mul_y_add_x_one:
599; FMA4:       # BB#0:
600; FMA4-NEXT:    vfmaddps %xmm1, %xmm1, %xmm0, %xmm0
601; FMA4-NEXT:    retq
602;
603; AVX512-LABEL: test_v4f32_mul_y_add_x_one:
604; AVX512:       # BB#0:
605; AVX512-NEXT:    vfmadd213ps %xmm1, %xmm1, %xmm0
606; AVX512-NEXT:    retq
607  %a = fadd <4 x float> %x, <float 1.0, float 1.0, float 1.0, float 1.0>
608  %m = fmul <4 x float> %y, %a
609  ret <4 x float> %m
610}
611
; (x + -1.0) * y is expected to be emitted as one fused multiply-subtract
; that computes x * y - y.
612define <4 x float> @test_v4f32_mul_add_x_negone_y(<4 x float> %x, <4 x float> %y) {
613; FMA-LABEL: test_v4f32_mul_add_x_negone_y:
614; FMA:       # BB#0:
615; FMA-NEXT:    vfmsub213ps %xmm1, %xmm1, %xmm0
616; FMA-NEXT:    retq
617;
618; FMA4-LABEL: test_v4f32_mul_add_x_negone_y:
619; FMA4:       # BB#0:
620; FMA4-NEXT:    vfmsubps %xmm1, %xmm1, %xmm0, %xmm0
621; FMA4-NEXT:    retq
622;
623; AVX512-LABEL: test_v4f32_mul_add_x_negone_y:
624; AVX512:       # BB#0:
625; AVX512-NEXT:    vfmsub213ps %xmm1, %xmm1, %xmm0
626; AVX512-NEXT:    retq
627  %a = fadd <4 x float> %x, <float -1.0, float -1.0, float -1.0, float -1.0>
628  %m = fmul <4 x float> %a, %y
629  ret <4 x float> %m
630}
631
; Commuted multiply: y * (x + -1.0) is expected to produce the same single
; fused multiply-subtract (x * y - y) as the non-commuted form.
632define <4 x float> @test_v4f32_mul_y_add_x_negone(<4 x float> %x, <4 x float> %y) {
633; FMA-LABEL: test_v4f32_mul_y_add_x_negone:
634; FMA:       # BB#0:
635; FMA-NEXT:    vfmsub213ps %xmm1, %xmm1, %xmm0
636; FMA-NEXT:    retq
637;
638; FMA4-LABEL: test_v4f32_mul_y_add_x_negone:
639; FMA4:       # BB#0:
640; FMA4-NEXT:    vfmsubps %xmm1, %xmm1, %xmm0, %xmm0
641; FMA4-NEXT:    retq
642;
643; AVX512-LABEL: test_v4f32_mul_y_add_x_negone:
644; AVX512:       # BB#0:
645; AVX512-NEXT:    vfmsub213ps %xmm1, %xmm1, %xmm0
646; AVX512-NEXT:    retq
647  %a = fadd <4 x float> %x, <float -1.0, float -1.0, float -1.0, float -1.0>
648  %m = fmul <4 x float> %y, %a
649  ret <4 x float> %m
650}
651
; (1.0 - x) * y is expected to be emitted as one fused negated-multiply-add
; that computes -(x * y) + y.
652define <4 x float> @test_v4f32_mul_sub_one_x_y(<4 x float> %x, <4 x float> %y) {
653; FMA-LABEL: test_v4f32_mul_sub_one_x_y:
654; FMA:       # BB#0:
655; FMA-NEXT:    vfnmadd213ps %xmm1, %xmm1, %xmm0
656; FMA-NEXT:    retq
657;
658; FMA4-LABEL: test_v4f32_mul_sub_one_x_y:
659; FMA4:       # BB#0:
660; FMA4-NEXT:    vfnmaddps %xmm1, %xmm1, %xmm0, %xmm0
661; FMA4-NEXT:    retq
662;
663; AVX512-LABEL: test_v4f32_mul_sub_one_x_y:
664; AVX512:       # BB#0:
665; AVX512-NEXT:    vfnmadd213ps %xmm1, %xmm1, %xmm0
666; AVX512-NEXT:    retq
667  %s = fsub <4 x float> <float 1.0, float 1.0, float 1.0, float 1.0>, %x
668  %m = fmul <4 x float> %s, %y
669  ret <4 x float> %m
670}
671
; Commuted multiply: y * (1.0 - x) is expected to produce the same single
; fused negated-multiply-add (-(x * y) + y) as the non-commuted form.
672define <4 x float> @test_v4f32_mul_y_sub_one_x(<4 x float> %x, <4 x float> %y) {
673; FMA-LABEL: test_v4f32_mul_y_sub_one_x:
674; FMA:       # BB#0:
675; FMA-NEXT:    vfnmadd213ps %xmm1, %xmm1, %xmm0
676; FMA-NEXT:    retq
677;
678; FMA4-LABEL: test_v4f32_mul_y_sub_one_x:
679; FMA4:       # BB#0:
680; FMA4-NEXT:    vfnmaddps %xmm1, %xmm1, %xmm0, %xmm0
681; FMA4-NEXT:    retq
682;
683; AVX512-LABEL: test_v4f32_mul_y_sub_one_x:
684; AVX512:       # BB#0:
685; AVX512-NEXT:    vfnmadd213ps %xmm1, %xmm1, %xmm0
686; AVX512-NEXT:    retq
687  %s = fsub <4 x float> <float 1.0, float 1.0, float 1.0, float 1.0>, %x
688  %m = fmul <4 x float> %y, %s
689  ret <4 x float> %m
690}
691
; (-1.0 - x) * y is expected to be emitted as one fused
; negated-multiply-subtract that computes -(x * y) - y.
692define <4 x float> @test_v4f32_mul_sub_negone_x_y(<4 x float> %x, <4 x float> %y) {
693; FMA-LABEL: test_v4f32_mul_sub_negone_x_y:
694; FMA:       # BB#0:
695; FMA-NEXT:    vfnmsub213ps %xmm1, %xmm1, %xmm0
696; FMA-NEXT:    retq
697;
698; FMA4-LABEL: test_v4f32_mul_sub_negone_x_y:
699; FMA4:       # BB#0:
700; FMA4-NEXT:    vfnmsubps %xmm1, %xmm1, %xmm0, %xmm0
701; FMA4-NEXT:    retq
702;
703; AVX512-LABEL: test_v4f32_mul_sub_negone_x_y:
704; AVX512:       # BB#0:
705; AVX512-NEXT:    vfnmsub213ps %xmm1, %xmm1, %xmm0
706; AVX512-NEXT:    retq
707  %s = fsub <4 x float> <float -1.0, float -1.0, float -1.0, float -1.0>, %x
708  %m = fmul <4 x float> %s, %y
709  ret <4 x float> %m
710}
711
; Commuted multiply: y * (-1.0 - x) is expected to produce the same single
; fused negated-multiply-subtract (-(x * y) - y) as the non-commuted form.
712define <4 x float> @test_v4f32_mul_y_sub_negone_x(<4 x float> %x, <4 x float> %y) {
713; FMA-LABEL: test_v4f32_mul_y_sub_negone_x:
714; FMA:       # BB#0:
715; FMA-NEXT:    vfnmsub213ps %xmm1, %xmm1, %xmm0
716; FMA-NEXT:    retq
717;
718; FMA4-LABEL: test_v4f32_mul_y_sub_negone_x:
719; FMA4:       # BB#0:
720; FMA4-NEXT:    vfnmsubps %xmm1, %xmm1, %xmm0, %xmm0
721; FMA4-NEXT:    retq
722;
723; AVX512-LABEL: test_v4f32_mul_y_sub_negone_x:
724; AVX512:       # BB#0:
725; AVX512-NEXT:    vfnmsub213ps %xmm1, %xmm1, %xmm0
726; AVX512-NEXT:    retq
727  %s = fsub <4 x float> <float -1.0, float -1.0, float -1.0, float -1.0>, %x
728  %m = fmul <4 x float> %y, %s
729  ret <4 x float> %m
730}
731
; (x - 1.0) * y is expected to be emitted as one fused multiply-subtract
; that computes x * y - y.
732define <4 x float> @test_v4f32_mul_sub_x_one_y(<4 x float> %x, <4 x float> %y) {
733; FMA-LABEL: test_v4f32_mul_sub_x_one_y:
734; FMA:       # BB#0:
735; FMA-NEXT:    vfmsub213ps %xmm1, %xmm1, %xmm0
736; FMA-NEXT:    retq
737;
738; FMA4-LABEL: test_v4f32_mul_sub_x_one_y:
739; FMA4:       # BB#0:
740; FMA4-NEXT:    vfmsubps %xmm1, %xmm1, %xmm0, %xmm0
741; FMA4-NEXT:    retq
742;
743; AVX512-LABEL: test_v4f32_mul_sub_x_one_y:
744; AVX512:       # BB#0:
745; AVX512-NEXT:    vfmsub213ps %xmm1, %xmm1, %xmm0
746; AVX512-NEXT:    retq
747  %s = fsub <4 x float> %x, <float 1.0, float 1.0, float 1.0, float 1.0>
748  %m = fmul <4 x float> %s, %y
749  ret <4 x float> %m
750}
751
; Commuted multiply: y * (x - 1.0) is expected to produce the same single
; fused multiply-subtract (x * y - y) as the non-commuted form.
752define <4 x float> @test_v4f32_mul_y_sub_x_one(<4 x float> %x, <4 x float> %y) {
753; FMA-LABEL: test_v4f32_mul_y_sub_x_one:
754; FMA:       # BB#0:
755; FMA-NEXT:    vfmsub213ps %xmm1, %xmm1, %xmm0
756; FMA-NEXT:    retq
757;
758; FMA4-LABEL: test_v4f32_mul_y_sub_x_one:
759; FMA4:       # BB#0:
760; FMA4-NEXT:    vfmsubps %xmm1, %xmm1, %xmm0, %xmm0
761; FMA4-NEXT:    retq
762;
763; AVX512-LABEL: test_v4f32_mul_y_sub_x_one:
764; AVX512:       # BB#0:
765; AVX512-NEXT:    vfmsub213ps %xmm1, %xmm1, %xmm0
766; AVX512-NEXT:    retq
767  %s = fsub <4 x float> %x, <float 1.0, float 1.0, float 1.0, float 1.0>
768  %m = fmul <4 x float> %y, %s
769  ret <4 x float> %m
770}
771
; (x - -1.0) * y is expected to be emitted as one fused multiply-add that
; computes x * y + y.
772define <4 x float> @test_v4f32_mul_sub_x_negone_y(<4 x float> %x, <4 x float> %y) {
773; FMA-LABEL: test_v4f32_mul_sub_x_negone_y:
774; FMA:       # BB#0:
775; FMA-NEXT:    vfmadd213ps %xmm1, %xmm1, %xmm0
776; FMA-NEXT:    retq
777;
778; FMA4-LABEL: test_v4f32_mul_sub_x_negone_y:
779; FMA4:       # BB#0:
780; FMA4-NEXT:    vfmaddps %xmm1, %xmm1, %xmm0, %xmm0
781; FMA4-NEXT:    retq
782;
783; AVX512-LABEL: test_v4f32_mul_sub_x_negone_y:
784; AVX512:       # BB#0:
785; AVX512-NEXT:    vfmadd213ps %xmm1, %xmm1, %xmm0
786; AVX512-NEXT:    retq
787  %s = fsub <4 x float> %x, <float -1.0, float -1.0, float -1.0, float -1.0>
788  %m = fmul <4 x float> %s, %y
789  ret <4 x float> %m
790}
791
; Commuted multiply: y * (x - -1.0) is expected to produce the same single
; fused multiply-add (x * y + y) as the non-commuted form.
792define <4 x float> @test_v4f32_mul_y_sub_x_negone(<4 x float> %x, <4 x float> %y) {
793; FMA-LABEL: test_v4f32_mul_y_sub_x_negone:
794; FMA:       # BB#0:
795; FMA-NEXT:    vfmadd213ps %xmm1, %xmm1, %xmm0
796; FMA-NEXT:    retq
797;
798; FMA4-LABEL: test_v4f32_mul_y_sub_x_negone:
799; FMA4:       # BB#0:
800; FMA4-NEXT:    vfmaddps %xmm1, %xmm1, %xmm0, %xmm0
801; FMA4-NEXT:    retq
802;
803; AVX512-LABEL: test_v4f32_mul_y_sub_x_negone:
804; AVX512:       # BB#0:
805; AVX512-NEXT:    vfmadd213ps %xmm1, %xmm1, %xmm0
806; AVX512-NEXT:    retq
807  %s = fsub <4 x float> %x, <float -1.0, float -1.0, float -1.0, float -1.0>
808  %m = fmul <4 x float> %y, %s
809  ret <4 x float> %m
810}
811
812;
813; Interpolation Patterns: add(mul(x,t),mul(sub(1.0,t),y))
814;
815
; Scalar float linear interpolation: x * t + y * (1.0 - t). The expectations
; contain two fused ops: a negated-multiply-add forming y - t * y, then a
; multiply-add adding x * t. The AVX-512 expectation ends with an extra
; vmovaps register copy.
816define float @test_f32_interp(float %x, float %y, float %t) {
817; FMA-LABEL: test_f32_interp:
818; FMA:       # BB#0:
819; FMA-NEXT:    vfnmadd213ss %xmm1, %xmm2, %xmm1
820; FMA-NEXT:    vfmadd213ss %xmm1, %xmm2, %xmm0
821; FMA-NEXT:    retq
822;
823; FMA4-LABEL: test_f32_interp:
824; FMA4:       # BB#0:
825; FMA4-NEXT:    vfnmaddss %xmm1, %xmm1, %xmm2, %xmm1
826; FMA4-NEXT:    vfmaddss %xmm1, %xmm2, %xmm0, %xmm0
827; FMA4-NEXT:    retq
828;
829; AVX512-LABEL: test_f32_interp:
830; AVX512:       # BB#0:
831; AVX512-NEXT:    vfnmadd213ss %xmm1, %xmm2, %xmm1
832; AVX512-NEXT:    vfmadd213ss %xmm1, %xmm0, %xmm2
833; AVX512-NEXT:    vmovaps %zmm2, %zmm0
834; AVX512-NEXT:    retq
835  %t1 = fsub float 1.0, %t
836  %tx = fmul float %x, %t
837  %ty = fmul float %y, %t1
838  %r = fadd float %tx, %ty
839  ret float %r
840}
841
; <4 x float> linear interpolation: x * t + y * (1.0 - t). The expectations
; contain two packed fused ops: a negated-multiply-add forming y - t * y,
; then a multiply-add adding x * t. The AVX-512 expectation first copies t
; into a scratch register with vmovaps.
842define <4 x float> @test_v4f32_interp(<4 x float> %x, <4 x float> %y, <4 x float> %t) {
843; FMA-LABEL: test_v4f32_interp:
844; FMA:       # BB#0:
845; FMA-NEXT:    vfnmadd213ps %xmm1, %xmm2, %xmm1
846; FMA-NEXT:    vfmadd213ps %xmm1, %xmm2, %xmm0
847; FMA-NEXT:    retq
848;
849; FMA4-LABEL: test_v4f32_interp:
850; FMA4:       # BB#0:
851; FMA4-NEXT:    vfnmaddps %xmm1, %xmm1, %xmm2, %xmm1
852; FMA4-NEXT:    vfmaddps %xmm1, %xmm2, %xmm0, %xmm0
853; FMA4-NEXT:    retq
854;
855; AVX512-LABEL: test_v4f32_interp:
856; AVX512:       # BB#0:
857; AVX512-NEXT:    vmovaps %zmm2, %zmm3
858; AVX512-NEXT:    vfnmadd213ps %xmm1, %xmm1, %xmm3
859; AVX512-NEXT:    vfmadd213ps %xmm3, %xmm2, %xmm0
860; AVX512-NEXT:    retq
861  %t1 = fsub <4 x float> <float 1.0, float 1.0, float 1.0, float 1.0>, %t
862  %tx = fmul <4 x float> %x, %t
863  %ty = fmul <4 x float> %y, %t1
864  %r = fadd <4 x float> %tx, %ty
865  ret <4 x float> %r
866}
867
; <8 x float> linear interpolation: x * t + y * (1.0 - t). The expectations
; contain two packed fused ops on ymm registers: a negated-multiply-add
; forming y - t * y, then a multiply-add adding x * t. The AVX-512
; expectation first copies t into a scratch register with vmovaps.
868define <8 x float> @test_v8f32_interp(<8 x float> %x, <8 x float> %y, <8 x float> %t) {
869; FMA-LABEL: test_v8f32_interp:
870; FMA:       # BB#0:
871; FMA-NEXT:    vfnmadd213ps %ymm1, %ymm2, %ymm1
872; FMA-NEXT:    vfmadd213ps %ymm1, %ymm2, %ymm0
873; FMA-NEXT:    retq
874;
875; FMA4-LABEL: test_v8f32_interp:
876; FMA4:       # BB#0:
877; FMA4-NEXT:    vfnmaddps %ymm1, %ymm1, %ymm2, %ymm1
878; FMA4-NEXT:    vfmaddps %ymm1, %ymm2, %ymm0, %ymm0
879; FMA4-NEXT:    retq
880;
881; AVX512-LABEL: test_v8f32_interp:
882; AVX512:       # BB#0:
883; AVX512-NEXT:    vmovaps %zmm2, %zmm3
884; AVX512-NEXT:    vfnmadd213ps %ymm1, %ymm1, %ymm3
885; AVX512-NEXT:    vfmadd213ps %ymm3, %ymm2, %ymm0
886; AVX512-NEXT:    retq
887  %t1 = fsub <8 x float> <float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0>, %t
888  %tx = fmul <8 x float> %x, %t
889  %ty = fmul <8 x float> %y, %t1
890  %r = fadd <8 x float> %tx, %ty
891  ret <8 x float> %r
892}
893
; Scalar double linear interpolation: x * t + y * (1.0 - t). The expectations
; contain two fused ops: a negated-multiply-add forming y - t * y, then a
; multiply-add adding x * t. The AVX-512 expectation ends with an extra
; vmovaps register copy.
894define double @test_f64_interp(double %x, double %y, double %t) {
895; FMA-LABEL: test_f64_interp:
896; FMA:       # BB#0:
897; FMA-NEXT:    vfnmadd213sd %xmm1, %xmm2, %xmm1
898; FMA-NEXT:    vfmadd213sd %xmm1, %xmm2, %xmm0
899; FMA-NEXT:    retq
900;
901; FMA4-LABEL: test_f64_interp:
902; FMA4:       # BB#0:
903; FMA4-NEXT:    vfnmaddsd %xmm1, %xmm1, %xmm2, %xmm1
904; FMA4-NEXT:    vfmaddsd %xmm1, %xmm2, %xmm0, %xmm0
905; FMA4-NEXT:    retq
906;
907; AVX512-LABEL: test_f64_interp:
908; AVX512:       # BB#0:
909; AVX512-NEXT:    vfnmadd213sd %xmm1, %xmm2, %xmm1
910; AVX512-NEXT:    vfmadd213sd %xmm1, %xmm0, %xmm2
911; AVX512-NEXT:    vmovaps %zmm2, %zmm0
912; AVX512-NEXT:    retq
913  %t1 = fsub double 1.0, %t
914  %tx = fmul double %x, %t
915  %ty = fmul double %y, %t1
916  %r = fadd double %tx, %ty
917  ret double %r
918}
919
; <2 x double> linear interpolation: x * t + y * (1.0 - t). The expectations
; contain two packed fused ops: a negated-multiply-add forming y - t * y,
; then a multiply-add adding x * t. The AVX-512 expectation first copies t
; into a scratch register with vmovaps.
920define <2 x double> @test_v2f64_interp(<2 x double> %x, <2 x double> %y, <2 x double> %t) {
921; FMA-LABEL: test_v2f64_interp:
922; FMA:       # BB#0:
923; FMA-NEXT:    vfnmadd213pd %xmm1, %xmm2, %xmm1
924; FMA-NEXT:    vfmadd213pd %xmm1, %xmm2, %xmm0
925; FMA-NEXT:    retq
926;
927; FMA4-LABEL: test_v2f64_interp:
928; FMA4:       # BB#0:
929; FMA4-NEXT:    vfnmaddpd %xmm1, %xmm1, %xmm2, %xmm1
930; FMA4-NEXT:    vfmaddpd %xmm1, %xmm2, %xmm0, %xmm0
931; FMA4-NEXT:    retq
932;
933; AVX512-LABEL: test_v2f64_interp:
934; AVX512:       # BB#0:
935; AVX512-NEXT:    vmovaps %zmm2, %zmm3
936; AVX512-NEXT:    vfnmadd213pd %xmm1, %xmm1, %xmm3
937; AVX512-NEXT:    vfmadd213pd %xmm3, %xmm2, %xmm0
938; AVX512-NEXT:    retq
939  %t1 = fsub <2 x double> <double 1.0, double 1.0>, %t
940  %tx = fmul <2 x double> %x, %t
941  %ty = fmul <2 x double> %y, %t1
942  %r = fadd <2 x double> %tx, %ty
943  ret <2 x double> %r
944}
945
; <4 x double> linear interpolation: x * t + y * (1.0 - t). The expectations
; contain two packed fused ops on ymm registers: a negated-multiply-add
; forming y - t * y, then a multiply-add adding x * t. The AVX-512
; expectation first copies t into a scratch register with vmovaps.
946define <4 x double> @test_v4f64_interp(<4 x double> %x, <4 x double> %y, <4 x double> %t) {
947; FMA-LABEL: test_v4f64_interp:
948; FMA:       # BB#0:
949; FMA-NEXT:    vfnmadd213pd %ymm1, %ymm2, %ymm1
950; FMA-NEXT:    vfmadd213pd %ymm1, %ymm2, %ymm0
951; FMA-NEXT:    retq
952;
953; FMA4-LABEL: test_v4f64_interp:
954; FMA4:       # BB#0:
955; FMA4-NEXT:    vfnmaddpd %ymm1, %ymm1, %ymm2, %ymm1
956; FMA4-NEXT:    vfmaddpd %ymm1, %ymm2, %ymm0, %ymm0
957; FMA4-NEXT:    retq
958;
959; AVX512-LABEL: test_v4f64_interp:
960; AVX512:       # BB#0:
961; AVX512-NEXT:    vmovaps %zmm2, %zmm3
962; AVX512-NEXT:    vfnmadd213pd %ymm1, %ymm1, %ymm3
963; AVX512-NEXT:    vfmadd213pd %ymm3, %ymm2, %ymm0
964; AVX512-NEXT:    retq
965  %t1 = fsub <4 x double> <double 1.0, double 1.0, double 1.0, double 1.0>, %t
966  %tx = fmul <4 x double> %x, %t
967  %ty = fmul <4 x double> %y, %t1
968  %r = fadd <4 x double> %tx, %ty
969  ret <4 x double> %r
970}
971
972;
973; Pattern: (fneg (fma x, y, z)) -> (fma x, -y, -z)
974;
975
; Computes -(a0*a1 + a2) on <4 x float>; the negation is spelled as an fsub
; from a -0.0 splat. The function carries attribute group #0. Every run line
; is expected to select a single fnmsub-form instruction, with no separate
; negation.
976define <4 x float> @test_v4f32_fneg_fmadd(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2) #0 {
977; FMA-LABEL: test_v4f32_fneg_fmadd:
978; FMA:       # BB#0:
979; FMA-NEXT:    vfnmsub213ps %xmm2, %xmm1, %xmm0
980; FMA-NEXT:    retq
981;
982; FMA4-LABEL: test_v4f32_fneg_fmadd:
983; FMA4:       # BB#0:
984; FMA4-NEXT:    vfnmsubps %xmm2, %xmm1, %xmm0, %xmm0
985; FMA4-NEXT:    retq
986;
987; AVX512-LABEL: test_v4f32_fneg_fmadd:
988; AVX512:       # BB#0:
989; AVX512-NEXT:    vfnmsub213ps %xmm2, %xmm1, %xmm0
990; AVX512-NEXT:    retq
991  %mul = fmul <4 x float> %a0, %a1
992  %add = fadd <4 x float> %mul, %a2
; fneg written as (-0.0 - %add).
993  %neg = fsub <4 x float> <float -0.0, float -0.0, float -0.0, float -0.0>, %add
994  ret <4 x float> %neg
995}
996
; Computes -(a0*a1 - a2) on <4 x double>; the negation is spelled as an fsub
; from a -0.0 splat. The function carries attribute group #0. Every run line
; is expected to select a single fnmadd-form instruction, with no separate
; negation.
997define <4 x double> @test_v4f64_fneg_fmsub(<4 x double> %a0, <4 x double> %a1, <4 x double> %a2) #0 {
998; FMA-LABEL: test_v4f64_fneg_fmsub:
999; FMA:       # BB#0:
1000; FMA-NEXT:    vfnmadd213pd %ymm2, %ymm1, %ymm0
1001; FMA-NEXT:    retq
1002;
1003; FMA4-LABEL: test_v4f64_fneg_fmsub:
1004; FMA4:       # BB#0:
1005; FMA4-NEXT:    vfnmaddpd %ymm2, %ymm1, %ymm0, %ymm0
1006; FMA4-NEXT:    retq
1007;
1008; AVX512-LABEL: test_v4f64_fneg_fmsub:
1009; AVX512:       # BB#0:
1010; AVX512-NEXT:    vfnmadd213pd %ymm2, %ymm1, %ymm0
1011; AVX512-NEXT:    retq
1012  %mul = fmul <4 x double> %a0, %a1
1013  %sub = fsub <4 x double> %mul, %a2
; fneg written as (-0.0 - %sub).
1014  %neg = fsub <4 x double> <double -0.0, double -0.0, double -0.0, double -0.0>, %sub
1015  ret <4 x double> %neg
1016}
1017
; Computes -(-(a0*a1) + a2) on <4 x float>; both negations are spelled as
; fsub from a -0.0 splat. The function carries attribute group #0. Every run
; line is expected to select a single fmsub-form instruction, with no
; separate negation.
1018define <4 x float> @test_v4f32_fneg_fnmadd(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2) #0 {
1019; FMA-LABEL: test_v4f32_fneg_fnmadd:
1020; FMA:       # BB#0:
1021; FMA-NEXT:    vfmsub213ps %xmm2, %xmm1, %xmm0
1022; FMA-NEXT:    retq
1023;
1024; FMA4-LABEL: test_v4f32_fneg_fnmadd:
1025; FMA4:       # BB#0:
1026; FMA4-NEXT:    vfmsubps %xmm2, %xmm1, %xmm0, %xmm0
1027; FMA4-NEXT:    retq
1028;
1029; AVX512-LABEL: test_v4f32_fneg_fnmadd:
1030; AVX512:       # BB#0:
1031; AVX512-NEXT:    vfmsub213ps %xmm2, %xmm1, %xmm0
1032; AVX512-NEXT:    retq
1033  %mul = fmul <4 x float> %a0, %a1
; Inner negation: -(a0*a1).
1034  %neg0 = fsub <4 x float> <float -0.0, float -0.0, float -0.0, float -0.0>, %mul
1035  %add = fadd <4 x float> %neg0, %a2
; Outer negation of the whole sum.
1036  %neg1 = fsub <4 x float> <float -0.0, float -0.0, float -0.0, float -0.0>, %add
1037  ret <4 x float> %neg1
1038}
1039
; Computes -(-(a0*a1) - a2) on <4 x double>; both negations are spelled as
; fsub from a -0.0 splat. The function carries attribute group #0. Every run
; line is expected to select a single plain fmadd-form instruction, with no
; separate negation.
1040define <4 x double> @test_v4f64_fneg_fnmsub(<4 x double> %a0, <4 x double> %a1, <4 x double> %a2) #0 {
1041; FMA-LABEL: test_v4f64_fneg_fnmsub:
1042; FMA:       # BB#0:
1043; FMA-NEXT:    vfmadd213pd %ymm2, %ymm1, %ymm0
1044; FMA-NEXT:    retq
1045;
1046; FMA4-LABEL: test_v4f64_fneg_fnmsub:
1047; FMA4:       # BB#0:
1048; FMA4-NEXT:    vfmaddpd %ymm2, %ymm1, %ymm0, %ymm0
1049; FMA4-NEXT:    retq
1050;
1051; AVX512-LABEL: test_v4f64_fneg_fnmsub:
1052; AVX512:       # BB#0:
1053; AVX512-NEXT:    vfmadd213pd %ymm2, %ymm1, %ymm0
1054; AVX512-NEXT:    retq
1055  %mul = fmul <4 x double> %a0, %a1
; Inner negation: -(a0*a1).
1056  %neg0 = fsub <4 x double> <double -0.0, double -0.0, double -0.0, double -0.0>, %mul
1057  %sub = fsub <4 x double> %neg0, %a2
; Outer negation of the whole difference.
1058  %neg1 = fsub <4 x double> <double -0.0, double -0.0, double -0.0, double -0.0>, %sub
1059  ret <4 x double> %neg1
1060}
1061
1062;
1063; Pattern: (fma x, c1, (fmul x, c2)) -> (fmul x, c1+c2)
1064;
1065
; Computes x*<1,2,3,4> + x*<4,3,2,1> on <4 x float>. The function carries
; attribute group #0. Every run line is expected to emit a single vmulps by a
; RIP-relative constant, i.e. no add and no fused instruction. The two
; constants sum to 5.0 in every lane, which is consistent with the AVX512 run
; expecting the {1to4} broadcast form of the memory operand.
1066define <4 x float> @test_v4f32_fma_x_c1_fmul_x_c2(<4 x float> %x) #0 {
1067; FMA-LABEL: test_v4f32_fma_x_c1_fmul_x_c2:
1068; FMA:       # BB#0:
1069; FMA-NEXT:    vmulps {{.*}}(%rip), %xmm0, %xmm0
1070; FMA-NEXT:    retq
1071;
1072; FMA4-LABEL: test_v4f32_fma_x_c1_fmul_x_c2:
1073; FMA4:       # BB#0:
1074; FMA4-NEXT:    vmulps {{.*}}(%rip), %xmm0, %xmm0
1075; FMA4-NEXT:    retq
1076;
1077; AVX512-LABEL: test_v4f32_fma_x_c1_fmul_x_c2:
1078; AVX512:       # BB#0:
1079; AVX512-NEXT:    vmulps {{.*}}(%rip){1to4}, %xmm0, %xmm0
1080; AVX512-NEXT:    retq
; Both products share the operand %x and differ only in the constant.
1081  %m0 = fmul <4 x float> %x, <float 1.0, float 2.0, float 3.0, float 4.0>
1082  %m1 = fmul <4 x float> %x, <float 4.0, float 3.0, float 2.0, float 1.0>
1083  %a  = fadd <4 x float> %m0, %m1
1084  ret <4 x float> %a
1085}
1086
1087;
1088; Pattern: (fma (fmul x, c1), c2, y) -> (fma x, c1*c2, y)
1089;
1090
; Computes (x*<1,2,3,4>)*<4,3,2,1> + y on <4 x float>. The function carries
; attribute group #0. Every run line is expected to emit a single fused
; multiply-add that takes one RIP-relative constant operand, i.e. no
; standalone multiply remains. The AVX512 run additionally expects a vmovaps
; from zmm1 to zmm0 after the fused instruction.
1091define <4 x float> @test_v4f32_fma_fmul_x_c1_c2_y(<4 x float> %x, <4 x float> %y) #0 {
1092; FMA-LABEL: test_v4f32_fma_fmul_x_c1_c2_y:
1093; FMA:       # BB#0:
1094; FMA-NEXT:    vfmadd132ps {{.*}}(%rip), %xmm1, %xmm0
1095; FMA-NEXT:    retq
1096;
1097; FMA4-LABEL: test_v4f32_fma_fmul_x_c1_c2_y:
1098; FMA4:       # BB#0:
1099; FMA4-NEXT:    vfmaddps %xmm1, {{.*}}(%rip), %xmm0, %xmm0
1100; FMA4-NEXT:    retq
1101;
1102; AVX512-LABEL: test_v4f32_fma_fmul_x_c1_c2_y:
1103; AVX512:       # BB#0:
1104; AVX512-NEXT:    vfmadd231ps {{.*}}(%rip), %xmm0, %xmm1
1105; AVX512-NEXT:    vmovaps %zmm1, %zmm0
1106; AVX512-NEXT:    retq
; Two chained constant multiplies, then the add of %y.
1107  %m0 = fmul <4 x float> %x,  <float 1.0, float 2.0, float 3.0, float 4.0>
1108  %m1 = fmul <4 x float> %m0, <float 4.0, float 3.0, float 2.0, float 1.0>
1109  %a  = fadd <4 x float> %m1, %y
1110  ret <4 x float> %a
1111}
1112
1113; Pattern: (fneg (fmul x, y)) -> (fnmsub x, y, 0), only when the fmul is marked nsz
1114
; Scalar double: computes -(x*y), where the fmul carries the nsz flag and the
; negation is spelled as (-0.0 - %m). The function carries attribute group #0.
; Every run line is expected to zero a register with an xor and then use a
; scalar fnmsub-form instruction with that zero as the addend. The AVX512 run
; additionally expects a vmovaps from zmm1 to zmm0 afterwards.
1115define double @test_f64_fneg_fmul(double %x, double %y) #0 {
1116; FMA-LABEL: test_f64_fneg_fmul:
1117; FMA:       # BB#0:
1118; FMA-NEXT:    vxorpd %xmm2, %xmm2, %xmm2
1119; FMA-NEXT:    vfnmsub213sd %xmm2, %xmm1, %xmm0
1120; FMA-NEXT:    retq
1121;
1122; FMA4-LABEL: test_f64_fneg_fmul:
1123; FMA4:       # BB#0:
1124; FMA4-NEXT:    vxorpd %xmm2, %xmm2, %xmm2
1125; FMA4-NEXT:    vfnmsubsd %xmm2, %xmm1, %xmm0, %xmm0
1126; FMA4-NEXT:    retq
1127;
1128; AVX512-LABEL: test_f64_fneg_fmul:
1129; AVX512:       # BB#0:
1130; AVX512-NEXT:    vxorps %xmm2, %xmm2, %xmm2
1131; AVX512-NEXT:    vfnmsub213sd %xmm2, %xmm0, %xmm1
1132; AVX512-NEXT:    vmovaps %zmm1, %zmm0
1133; AVX512-NEXT:    retq
; nsz appears on the multiply only; the fsub below has no fast-math flags.
1134  %m = fmul nsz double %x, %y
1135  %n = fsub double -0.0, %m
1136  ret double %n
1137}
1138
; <4 x float>: computes -(x*y), where the fmul carries the nsz flag and the
; negation is spelled as an fsub from a -0.0 splat. The function carries
; attribute group #0. Every run line is expected to zero a register with an
; xor (vxorps, or vpxord on the AVX512 run) and then use a packed fnmsub-form
; instruction with that zero as the addend.
1139define <4 x float> @test_v4f32_fneg_fmul(<4 x float> %x, <4 x float> %y) #0 {
1140; FMA-LABEL: test_v4f32_fneg_fmul:
1141; FMA:       # BB#0:
1142; FMA-NEXT:    vxorps %xmm2, %xmm2, %xmm2
1143; FMA-NEXT:    vfnmsub213ps %xmm2, %xmm1, %xmm0
1144; FMA-NEXT:    retq
1145;
1146; FMA4-LABEL: test_v4f32_fneg_fmul:
1147; FMA4:       # BB#0:
1148; FMA4-NEXT:    vxorps %xmm2, %xmm2, %xmm2
1149; FMA4-NEXT:    vfnmsubps %xmm2, %xmm1, %xmm0, %xmm0
1150; FMA4-NEXT:    retq
1151;
1152; AVX512-LABEL: test_v4f32_fneg_fmul:
1153; AVX512:       # BB#0:
1154; AVX512-NEXT:    vpxord %xmm2, %xmm2, %xmm2
1155; AVX512-NEXT:    vfnmsub213ps %xmm2, %xmm1, %xmm0
1156; AVX512-NEXT:    retq
; nsz appears on the multiply only; the fsub below has no fast-math flags.
1157  %m = fmul nsz <4 x float> %x, %y
1158  %n = fsub <4 x float> <float -0.0, float -0.0, float -0.0, float -0.0>, %m
1159  ret <4 x float> %n
1160}
1161
; <4 x double>: computes -(x*y), where the fmul carries the nsz flag and the
; negation is spelled as an fsub from a -0.0 splat. The function carries
; attribute group #0. Every run line is expected to zero a ymm register with
; an xor (vxorpd, or vpxord on the AVX512 run) and then use a packed
; fnmsub-form instruction with that zero as the addend.
1162define <4 x double> @test_v4f64_fneg_fmul(<4 x double> %x, <4 x double> %y) #0 {
1163; FMA-LABEL: test_v4f64_fneg_fmul:
1164; FMA:       # BB#0:
1165; FMA-NEXT:    vxorpd %ymm2, %ymm2, %ymm2
1166; FMA-NEXT:    vfnmsub213pd %ymm2, %ymm1, %ymm0
1167; FMA-NEXT:    retq
1168;
1169; FMA4-LABEL: test_v4f64_fneg_fmul:
1170; FMA4:       # BB#0:
1171; FMA4-NEXT:    vxorpd %ymm2, %ymm2, %ymm2
1172; FMA4-NEXT:    vfnmsubpd %ymm2, %ymm1, %ymm0, %ymm0
1173; FMA4-NEXT:    retq
1174;
1175; AVX512-LABEL: test_v4f64_fneg_fmul:
1176; AVX512:       # BB#0:
1177; AVX512-NEXT:    vpxord %ymm2, %ymm2, %ymm2
1178; AVX512-NEXT:    vfnmsub213pd %ymm2, %ymm1, %ymm0
1179; AVX512-NEXT:    retq
; nsz appears on the multiply only; the fsub below has no fast-math flags.
1180  %m = fmul nsz <4 x double> %x, %y
1181  %n = fsub <4 x double> <double -0.0, double -0.0, double -0.0, double -0.0>, %m
1182  ret <4 x double> %n
1183}
1184
; Negative test: the same -(x*y) computation on <4 x double>, but the fmul
; has no nsz flag (the function still carries attribute group #0). Under the
; shared ALL prefix, every run line is expected to keep a plain vmulpd
; followed by a vxorpd with a RIP-relative constant, i.e. no fused
; instruction is formed.
1185define <4 x double> @test_v4f64_fneg_fmul_no_nsz(<4 x double> %x, <4 x double> %y) #0 {
1186; ALL-LABEL: test_v4f64_fneg_fmul_no_nsz:
1187; ALL:       # BB#0:
1188; ALL-NEXT:    vmulpd %ymm1, %ymm0, %ymm0
1189; ALL-NEXT:    vxorpd {{.*}}(%rip), %ymm0, %ymm0
1190; ALL-NEXT:    retq
; No fast-math flags on either instruction.
1191  %m = fmul <4 x double> %x, %y
1192  %n = fsub <4 x double> <double -0.0, double -0.0, double -0.0, double -0.0>, %m
1193  ret <4 x double> %n
1194}
1195
; Attribute group #0, referenced by the functions above that are declared
; with `#0`: sets the "unsafe-fp-math" function attribute to "true".
1196attributes #0 = { "unsafe-fp-math"="true" }
1197