; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -O3 -mtriple=x86_64-pc-linux < %s | FileCheck %s --check-prefix=NOFMA
; RUN: llc -O3 -mtriple=x86_64-pc-linux -mattr=+fma < %s | FileCheck %s --check-prefixes=FMA,FMA-AVX1
; RUN: llc -O3 -mtriple=x86_64-pc-linux -mattr=+fma4 < %s | FileCheck %s --check-prefix=FMA4
; RUN: llc -O3 -mtriple=x86_64-pc-linux -mattr=+avx512f < %s | FileCheck %s --check-prefixes=FMA,FMA-AVX512

; Verify fneg of the first multiplicand folds into a negated-multiply FMA (fnmadd).
define float @f1(float %0, float %1, float %2) #0 {
; NOFMA-LABEL: f1:
; NOFMA:       # %bb.0: # %entry
; NOFMA-NEXT:    pushq %rax
; NOFMA-NEXT:    .cfi_def_cfa_offset 16
; NOFMA-NEXT:    xorps {{.*}}(%rip), %xmm0
; NOFMA-NEXT:    callq fmaf
; NOFMA-NEXT:    popq %rax
; NOFMA-NEXT:    .cfi_def_cfa_offset 8
; NOFMA-NEXT:    retq
;
; FMA-LABEL: f1:
; FMA:       # %bb.0: # %entry
; FMA-NEXT:    vfnmadd213ss {{.*#+}} xmm0 = -(xmm1 * xmm0) + xmm2
; FMA-NEXT:    retq
;
; FMA4-LABEL: f1:
; FMA4:       # %bb.0: # %entry
; FMA4-NEXT:    vfnmaddss {{.*#+}} xmm0 = -(xmm0 * xmm1) + xmm2
; FMA4-NEXT:    retq
entry:
  %3 = fneg float %0
  %result = call float @llvm.experimental.constrained.fma.f32(float %3, float %1, float %2,
                                                              metadata !"round.dynamic",
                                                              metadata !"fpexcept.strict") #0
  ret float %result
}

; Same as f1 but for f64: fneg of the first multiplicand folds into fnmadd.
define double @f2(double %0, double %1, double %2) #0 {
; NOFMA-LABEL: f2:
; NOFMA:       # %bb.0: # %entry
; NOFMA-NEXT:    pushq %rax
; NOFMA-NEXT:    .cfi_def_cfa_offset 16
; NOFMA-NEXT:    xorps {{.*}}(%rip), %xmm0
; NOFMA-NEXT:    callq fma
; NOFMA-NEXT:    popq %rax
; NOFMA-NEXT:    .cfi_def_cfa_offset 8
; NOFMA-NEXT:    retq
;
; FMA-LABEL: f2:
; FMA:       # %bb.0: # %entry
; FMA-NEXT:    vfnmadd213sd {{.*#+}} xmm0 = -(xmm1 * xmm0) + xmm2
; FMA-NEXT:    retq
;
; FMA4-LABEL: f2:
; FMA4:       # %bb.0: # %entry
; FMA4-NEXT:    vfnmaddsd {{.*#+}} xmm0 = -(xmm0 * xmm1) + xmm2
; FMA4-NEXT:    retq
entry:
  %3 = fneg double %0
  %result = call double @llvm.experimental.constrained.fma.f64(double %3, double %1, double %2,
                                                               metadata !"round.dynamic",
                                                               metadata !"fpexcept.strict") #0
  ret double %result
}

; Verify fneg of the addend folds into a fused multiply-subtract (fmsub).
define float @f3(float %0, float %1, float %2) #0 {
; NOFMA-LABEL: f3:
; NOFMA:       # %bb.0: # %entry
; NOFMA-NEXT:    pushq %rax
; NOFMA-NEXT:    .cfi_def_cfa_offset 16
; NOFMA-NEXT:    xorps {{.*}}(%rip), %xmm2
; NOFMA-NEXT:    callq fmaf
; NOFMA-NEXT:    popq %rax
; NOFMA-NEXT:    .cfi_def_cfa_offset 8
; NOFMA-NEXT:    retq
;
; FMA-LABEL: f3:
; FMA:       # %bb.0: # %entry
; FMA-NEXT:    vfmsub213ss {{.*#+}} xmm0 = (xmm1 * xmm0) - xmm2
; FMA-NEXT:    retq
;
; FMA4-LABEL: f3:
; FMA4:       # %bb.0: # %entry
; FMA4-NEXT:    vfmsubss {{.*#+}} xmm0 = (xmm0 * xmm1) - xmm2
; FMA4-NEXT:    retq
entry:
  %3 = fneg float %2
  %result = call float @llvm.experimental.constrained.fma.f32(float %0, float %1, float %3,
                                                              metadata !"round.dynamic",
                                                              metadata !"fpexcept.strict") #0
  ret float %result
}

; Same as f3 but for f64: fneg of the addend folds into fmsub.
define double @f4(double %0, double %1, double %2) #0 {
; NOFMA-LABEL: f4:
; NOFMA:       # %bb.0: # %entry
; NOFMA-NEXT:    pushq %rax
; NOFMA-NEXT:    .cfi_def_cfa_offset 16
; NOFMA-NEXT:    xorps {{.*}}(%rip), %xmm2
; NOFMA-NEXT:    callq fma
; NOFMA-NEXT:    popq %rax
; NOFMA-NEXT:    .cfi_def_cfa_offset 8
; NOFMA-NEXT:    retq
;
; FMA-LABEL: f4:
; FMA:       # %bb.0: # %entry
; FMA-NEXT:    vfmsub213sd {{.*#+}} xmm0 = (xmm1 * xmm0) - xmm2
; FMA-NEXT:    retq
;
; FMA4-LABEL: f4:
; FMA4:       # %bb.0: # %entry
; FMA4-NEXT:    vfmsubsd {{.*#+}} xmm0 = (xmm0 * xmm1) - xmm2
; FMA4-NEXT:    retq
entry:
  %3 = fneg double %2
  %result = call double @llvm.experimental.constrained.fma.f64(double %0, double %1, double %3,
                                                               metadata !"round.dynamic",
                                                               metadata !"fpexcept.strict") #0
  ret double %result
}

; Verify fneg of both the multiplicand and the addend folds into fnmsub.
define float @f5(float %0, float %1, float %2) #0 {
; NOFMA-LABEL: f5:
; NOFMA:       # %bb.0: # %entry
; NOFMA-NEXT:    pushq %rax
; NOFMA-NEXT:    .cfi_def_cfa_offset 16
; NOFMA-NEXT:    movaps {{.*#+}} xmm3 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0]
; NOFMA-NEXT:    xorps %xmm3, %xmm0
; NOFMA-NEXT:    xorps %xmm3, %xmm2
; NOFMA-NEXT:    callq fmaf
; NOFMA-NEXT:    popq %rax
; NOFMA-NEXT:    .cfi_def_cfa_offset 8
; NOFMA-NEXT:    retq
;
; FMA-LABEL: f5:
; FMA:       # %bb.0: # %entry
; FMA-NEXT:    vfnmsub213ss {{.*#+}} xmm0 = -(xmm1 * xmm0) - xmm2
; FMA-NEXT:    retq
;
; FMA4-LABEL: f5:
; FMA4:       # %bb.0: # %entry
; FMA4-NEXT:    vfnmsubss {{.*#+}} xmm0 = -(xmm0 * xmm1) - xmm2
; FMA4-NEXT:    retq
entry:
  %3 = fneg float %0
  %4 = fneg float %2
  %result = call float @llvm.experimental.constrained.fma.f32(float %3, float %1, float %4,
                                                              metadata !"round.dynamic",
                                                              metadata !"fpexcept.strict") #0
  ret float %result
}

; Same as f5 but for f64: fneg of multiplicand and addend folds into fnmsub.
define double @f6(double %0, double %1, double %2) #0 {
; NOFMA-LABEL: f6:
; NOFMA:       # %bb.0: # %entry
; NOFMA-NEXT:    pushq %rax
; NOFMA-NEXT:    .cfi_def_cfa_offset 16
; NOFMA-NEXT:    movaps {{.*#+}} xmm3 = [-0.0E+0,-0.0E+0]
; NOFMA-NEXT:    xorps %xmm3, %xmm0
; NOFMA-NEXT:    xorps %xmm3, %xmm2
; NOFMA-NEXT:    callq fma
; NOFMA-NEXT:    popq %rax
; NOFMA-NEXT:    .cfi_def_cfa_offset 8
; NOFMA-NEXT:    retq
;
; FMA-LABEL: f6:
; FMA:       # %bb.0: # %entry
; FMA-NEXT:    vfnmsub213sd {{.*#+}} xmm0 = -(xmm1 * xmm0) - xmm2
; FMA-NEXT:    retq
;
; FMA4-LABEL: f6:
; FMA4:       # %bb.0: # %entry
; FMA4-NEXT:    vfnmsubsd {{.*#+}} xmm0 = -(xmm0 * xmm1) - xmm2
; FMA4-NEXT:    retq
entry:
  %3 = fneg double %0
  %4 = fneg double %2
  %result = call double @llvm.experimental.constrained.fma.f64(double %3, double %1, double %4,
                                                               metadata !"round.dynamic",
                                                               metadata !"fpexcept.strict") #0
  ret double %result
}

; Verify fneg of the FMA result is NOT folded into the strict FMA; it stays a
; separate sign-flip (xorps) after the fused operation.
define float @f7(float %0, float %1, float %2) #0 {
; NOFMA-LABEL: f7:
; NOFMA:       # %bb.0: # %entry
; NOFMA-NEXT:    pushq %rax
; NOFMA-NEXT:    .cfi_def_cfa_offset 16
; NOFMA-NEXT:    callq fmaf
; NOFMA-NEXT:    xorps {{.*}}(%rip), %xmm0
; NOFMA-NEXT:    popq %rax
; NOFMA-NEXT:    .cfi_def_cfa_offset 8
; NOFMA-NEXT:    retq
;
; FMA-AVX1-LABEL: f7:
; FMA-AVX1:       # %bb.0: # %entry
; FMA-AVX1-NEXT:    vfmadd213ss {{.*#+}} xmm0 = (xmm1 * xmm0) + xmm2
; FMA-AVX1-NEXT:    vxorps {{.*}}(%rip), %xmm0, %xmm0
; FMA-AVX1-NEXT:    retq
;
; FMA4-LABEL: f7:
; FMA4:       # %bb.0: # %entry
; FMA4-NEXT:    vfmaddss {{.*#+}} xmm0 = (xmm0 * xmm1) + xmm2
; FMA4-NEXT:    vxorps {{.*}}(%rip), %xmm0, %xmm0
; FMA4-NEXT:    retq
;
; FMA-AVX512-LABEL: f7:
; FMA-AVX512:       # %bb.0: # %entry
; FMA-AVX512-NEXT:    vfmadd213ss {{.*#+}} xmm0 = (xmm1 * xmm0) + xmm2
; FMA-AVX512-NEXT:    vbroadcastss {{.*#+}} xmm1 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0]
; FMA-AVX512-NEXT:    vxorps %xmm1, %xmm0, %xmm0
; FMA-AVX512-NEXT:    retq
entry:
  %3 = call float @llvm.experimental.constrained.fma.f32(float %0, float %1, float %2,
                                                         metadata !"round.dynamic",
                                                         metadata !"fpexcept.strict") #0
  %result = fneg float %3
  ret float %result
}

; Same as f7 but for f64: fneg of the result stays a separate xorpd.
define double @f8(double %0, double %1, double %2) #0 {
; NOFMA-LABEL: f8:
; NOFMA:       # %bb.0: # %entry
; NOFMA-NEXT:    pushq %rax
; NOFMA-NEXT:    .cfi_def_cfa_offset 16
; NOFMA-NEXT:    callq fma
; NOFMA-NEXT:    xorps {{.*}}(%rip), %xmm0
; NOFMA-NEXT:    popq %rax
; NOFMA-NEXT:    .cfi_def_cfa_offset 8
; NOFMA-NEXT:    retq
;
; FMA-LABEL: f8:
; FMA:       # %bb.0: # %entry
; FMA-NEXT:    vfmadd213sd {{.*#+}} xmm0 = (xmm1 * xmm0) + xmm2
; FMA-NEXT:    vxorpd {{.*}}(%rip), %xmm0, %xmm0
; FMA-NEXT:    retq
;
; FMA4-LABEL: f8:
; FMA4:       # %bb.0: # %entry
; FMA4-NEXT:    vfmaddsd {{.*#+}} xmm0 = (xmm0 * xmm1) + xmm2
; FMA4-NEXT:    vxorpd {{.*}}(%rip), %xmm0, %xmm0
; FMA4-NEXT:    retq
entry:
  %3 = call double @llvm.experimental.constrained.fma.f64(double %0, double %1, double %2,
                                                          metadata !"round.dynamic",
                                                          metadata !"fpexcept.strict") #0
  %result = fneg double %3
  ret double %result
}

; Verify the operand fnegs fold into fnmsub while the fneg of the result still
; lowers to a separate xor of the sign bit.
define float @f9(float %0, float %1, float %2) #0 {
; NOFMA-LABEL: f9:
; NOFMA:       # %bb.0: # %entry
; NOFMA-NEXT:    pushq %rax
; NOFMA-NEXT:    .cfi_def_cfa_offset 16
; NOFMA-NEXT:    movaps {{.*#+}} xmm3 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0]
; NOFMA-NEXT:    xorps %xmm3, %xmm0
; NOFMA-NEXT:    xorps %xmm3, %xmm2
; NOFMA-NEXT:    callq fmaf
; NOFMA-NEXT:    xorps {{.*}}(%rip), %xmm0
; NOFMA-NEXT:    popq %rax
; NOFMA-NEXT:    .cfi_def_cfa_offset 8
; NOFMA-NEXT:    retq
;
; FMA-AVX1-LABEL: f9:
; FMA-AVX1:       # %bb.0: # %entry
; FMA-AVX1-NEXT:    vfnmsub213ss {{.*#+}} xmm0 = -(xmm1 * xmm0) - xmm2
; FMA-AVX1-NEXT:    vxorps {{.*}}(%rip), %xmm0, %xmm0
; FMA-AVX1-NEXT:    retq
;
; FMA4-LABEL: f9:
; FMA4:       # %bb.0: # %entry
; FMA4-NEXT:    vfnmsubss {{.*#+}} xmm0 = -(xmm0 * xmm1) - xmm2
; FMA4-NEXT:    vxorps {{.*}}(%rip), %xmm0, %xmm0
; FMA4-NEXT:    retq
;
; FMA-AVX512-LABEL: f9:
; FMA-AVX512:       # %bb.0: # %entry
; FMA-AVX512-NEXT:    vfnmsub213ss {{.*#+}} xmm0 = -(xmm1 * xmm0) - xmm2
; FMA-AVX512-NEXT:    vbroadcastss {{.*#+}} xmm1 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0]
; FMA-AVX512-NEXT:    vxorps %xmm1, %xmm0, %xmm0
; FMA-AVX512-NEXT:    retq
entry:
  %3 = fneg float %0
  %4 = fneg float %2
  %5 = call float @llvm.experimental.constrained.fma.f32(float %3, float %1, float %4,
                                                         metadata !"round.dynamic",
                                                         metadata !"fpexcept.strict") #0
  %result = fneg float %5
  ret float %result
}

; Same as f9 but for f64: operand fnegs fold; the result fneg stays separate.
define double @f10(double %0, double %1, double %2) #0 {
; NOFMA-LABEL: f10:
; NOFMA:       # %bb.0: # %entry
; NOFMA-NEXT:    pushq %rax
; NOFMA-NEXT:    .cfi_def_cfa_offset 16
; NOFMA-NEXT:    movaps {{.*#+}} xmm3 = [-0.0E+0,-0.0E+0]
; NOFMA-NEXT:    xorps %xmm3, %xmm0
; NOFMA-NEXT:    xorps %xmm3, %xmm2
; NOFMA-NEXT:    callq fma
; NOFMA-NEXT:    xorps {{.*}}(%rip), %xmm0
; NOFMA-NEXT:    popq %rax
; NOFMA-NEXT:    .cfi_def_cfa_offset 8
; NOFMA-NEXT:    retq
;
; FMA-LABEL: f10:
; FMA:       # %bb.0: # %entry
; FMA-NEXT:    vfnmsub213sd {{.*#+}} xmm0 = -(xmm1 * xmm0) - xmm2
; FMA-NEXT:    vxorpd {{.*}}(%rip), %xmm0, %xmm0
; FMA-NEXT:    retq
;
; FMA4-LABEL: f10:
; FMA4:       # %bb.0: # %entry
; FMA4-NEXT:    vfnmsubsd {{.*#+}} xmm0 = -(xmm0 * xmm1) - xmm2
; FMA4-NEXT:    vxorpd {{.*}}(%rip), %xmm0, %xmm0
; FMA4-NEXT:    retq
entry:
  %3 = fneg double %0
  %4 = fneg double %2
  %5 = call double @llvm.experimental.constrained.fma.f64(double %3, double %1, double %4,
                                                          metadata !"round.dynamic",
                                                          metadata !"fpexcept.strict") #0
  %result = fneg double %5
  ret double %result
}

; Verify constrained fmul and fadd aren't fused.
define float @f11(float %0, float %1, float %2) #0 {
; NOFMA-LABEL: f11:
; NOFMA:       # %bb.0: # %entry
; NOFMA-NEXT:    mulss %xmm1, %xmm0
; NOFMA-NEXT:    addss %xmm2, %xmm0
; NOFMA-NEXT:    retq
;
; FMA-LABEL: f11:
; FMA:       # %bb.0: # %entry
; FMA-NEXT:    vmulss %xmm1, %xmm0, %xmm0
; FMA-NEXT:    vaddss %xmm2, %xmm0, %xmm0
; FMA-NEXT:    retq
;
; FMA4-LABEL: f11:
; FMA4:       # %bb.0: # %entry
; FMA4-NEXT:    vmulss %xmm1, %xmm0, %xmm0
; FMA4-NEXT:    vaddss %xmm2, %xmm0, %xmm0
; FMA4-NEXT:    retq
entry:
  %3 = call float @llvm.experimental.constrained.fmul.f32(float %0, float %1,
                                                          metadata !"round.dynamic",
                                                          metadata !"fpexcept.strict") #0
  %4 = call float @llvm.experimental.constrained.fadd.f32(float %3, float %2,
                                                          metadata !"round.dynamic",
                                                          metadata !"fpexcept.strict") #0
  ret float %4
}

; Verify constrained fmul and fadd aren't fused.
define double @f12(double %0, double %1, double %2) #0 {
; NOFMA-LABEL: f12:
; NOFMA:       # %bb.0: # %entry
; NOFMA-NEXT:    mulsd %xmm1, %xmm0
; NOFMA-NEXT:    addsd %xmm2, %xmm0
; NOFMA-NEXT:    retq
;
; FMA-LABEL: f12:
; FMA:       # %bb.0: # %entry
; FMA-NEXT:    vmulsd %xmm1, %xmm0, %xmm0
; FMA-NEXT:    vaddsd %xmm2, %xmm0, %xmm0
; FMA-NEXT:    retq
;
; FMA4-LABEL: f12:
; FMA4:       # %bb.0: # %entry
; FMA4-NEXT:    vmulsd %xmm1, %xmm0, %xmm0
; FMA4-NEXT:    vaddsd %xmm2, %xmm0, %xmm0
; FMA4-NEXT:    retq
entry:
  %3 = call double @llvm.experimental.constrained.fmul.f64(double %0, double %1,
                                                           metadata !"round.dynamic",
                                                           metadata !"fpexcept.strict") #0
  %4 = call double @llvm.experimental.constrained.fadd.f64(double %3, double %2,
                                                           metadata !"round.dynamic",
                                                           metadata !"fpexcept.strict") #0
  ret double %4
}

; Verify that fmuladd(3.5) isn't simplified when the rounding mode is
; unknown.
define float @f15() #0 {
; NOFMA-LABEL: f15:
; NOFMA:       # %bb.0: # %entry
; NOFMA-NEXT:    movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
; NOFMA-NEXT:    movaps %xmm1, %xmm0
; NOFMA-NEXT:    mulss %xmm1, %xmm0
; NOFMA-NEXT:    addss %xmm1, %xmm0
; NOFMA-NEXT:    retq
;
; FMA-LABEL: f15:
; FMA:       # %bb.0: # %entry
; FMA-NEXT:    vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; FMA-NEXT:    vfmadd213ss {{.*#+}} xmm0 = (xmm0 * xmm0) + xmm0
; FMA-NEXT:    retq
;
; FMA4-LABEL: f15:
; FMA4:       # %bb.0: # %entry
; FMA4-NEXT:    vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; FMA4-NEXT:    vfmaddss {{.*#+}} xmm0 = (xmm0 * xmm0) + xmm0
; FMA4-NEXT:    retq
entry:
  %result = call float @llvm.experimental.constrained.fmuladd.f32(
                                               float 3.5,
                                               float 3.5,
                                               float 3.5,
                                               metadata !"round.dynamic",
                                               metadata !"fpexcept.strict") #0
  ret float %result
}

; Verify that fmuladd(42.1) isn't simplified when the rounding mode is
; unknown.
define double @f16() #0 {
; NOFMA-LABEL: f16:
; NOFMA:       # %bb.0: # %entry
; NOFMA-NEXT:    movsd {{.*#+}} xmm1 = mem[0],zero
; NOFMA-NEXT:    movapd %xmm1, %xmm0
; NOFMA-NEXT:    mulsd %xmm1, %xmm0
; NOFMA-NEXT:    addsd %xmm1, %xmm0
; NOFMA-NEXT:    retq
;
; FMA-LABEL: f16:
; FMA:       # %bb.0: # %entry
; FMA-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
; FMA-NEXT:    vfmadd213sd {{.*#+}} xmm0 = (xmm0 * xmm0) + xmm0
; FMA-NEXT:    retq
;
; FMA4-LABEL: f16:
; FMA4:       # %bb.0: # %entry
; FMA4-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
; FMA4-NEXT:    vfmaddsd {{.*#+}} xmm0 = (xmm0 * xmm0) + xmm0
; FMA4-NEXT:    retq
entry:
  %result = call double @llvm.experimental.constrained.fmuladd.f64(
                                               double 42.1,
                                               double 42.1,
                                               double 42.1,
                                               metadata !"round.dynamic",
                                               metadata !"fpexcept.strict") #0
  ret double %result
}

; Verify that fma(3.5) isn't simplified when the rounding mode is
; unknown.
define float @f17() #0 {
; NOFMA-LABEL: f17:
; NOFMA:       # %bb.0: # %entry
; NOFMA-NEXT:    pushq %rax
; NOFMA-NEXT:    .cfi_def_cfa_offset 16
; NOFMA-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; NOFMA-NEXT:    movaps %xmm0, %xmm1
; NOFMA-NEXT:    movaps %xmm0, %xmm2
; NOFMA-NEXT:    callq fmaf
; NOFMA-NEXT:    popq %rax
; NOFMA-NEXT:    .cfi_def_cfa_offset 8
; NOFMA-NEXT:    retq
;
; FMA-LABEL: f17:
; FMA:       # %bb.0: # %entry
; FMA-NEXT:    vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; FMA-NEXT:    vfmadd213ss {{.*#+}} xmm0 = (xmm0 * xmm0) + xmm0
; FMA-NEXT:    retq
;
; FMA4-LABEL: f17:
; FMA4:       # %bb.0: # %entry
; FMA4-NEXT:    vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; FMA4-NEXT:    vfmaddss {{.*#+}} xmm0 = (xmm0 * xmm0) + xmm0
; FMA4-NEXT:    retq
entry:
  %result = call float @llvm.experimental.constrained.fma.f32(
                                               float 3.5,
                                               float 3.5,
                                               float 3.5,
                                               metadata !"round.dynamic",
                                               metadata !"fpexcept.strict") #0
  ret float %result
}

; Verify that fma(42.1) isn't simplified when the rounding mode is
; unknown.
define double @f18() #0 {
; NOFMA-LABEL: f18:
; NOFMA:       # %bb.0: # %entry
; NOFMA-NEXT:    pushq %rax
; NOFMA-NEXT:    .cfi_def_cfa_offset 16
; NOFMA-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
; NOFMA-NEXT:    movaps %xmm0, %xmm1
; NOFMA-NEXT:    movaps %xmm0, %xmm2
; NOFMA-NEXT:    callq fma
; NOFMA-NEXT:    popq %rax
; NOFMA-NEXT:    .cfi_def_cfa_offset 8
; NOFMA-NEXT:    retq
;
; FMA-LABEL: f18:
; FMA:       # %bb.0: # %entry
; FMA-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
; FMA-NEXT:    vfmadd213sd {{.*#+}} xmm0 = (xmm0 * xmm0) + xmm0
; FMA-NEXT:    retq
;
; FMA4-LABEL: f18:
; FMA4:       # %bb.0: # %entry
; FMA4-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
; FMA4-NEXT:    vfmaddsd {{.*#+}} xmm0 = (xmm0 * xmm0) + xmm0
; FMA4-NEXT:    retq
entry:
  %result = call double @llvm.experimental.constrained.fma.f64(
                                               double 42.1,
                                               double 42.1,
                                               double 42.1,
                                               metadata !"round.dynamic",
                                               metadata !"fpexcept.strict") #0
  ret double %result
}

; Vector form of f1: fneg of the first operand folds into fnmadd (v4f32).
; Without FMA the vector op is scalarized into four libcalls to fmaf.
define <4 x float> @f19(<4 x float> %0, <4 x float> %1, <4 x float> %2) #0 {
; NOFMA-LABEL: f19:
; NOFMA:       # %bb.0: # %entry
; NOFMA-NEXT:    subq $88, %rsp
; NOFMA-NEXT:    .cfi_def_cfa_offset 96
; NOFMA-NEXT:    movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; NOFMA-NEXT:    movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; NOFMA-NEXT:    pxor {{.*}}(%rip), %xmm0
; NOFMA-NEXT:    movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; NOFMA-NEXT:    shufps {{.*#+}} xmm1 = xmm1[3,3,3,3]
; NOFMA-NEXT:    shufps {{.*#+}} xmm2 = xmm2[3,3,3,3]
; NOFMA-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[3,3,3,3]
; NOFMA-NEXT:    callq fmaf
; NOFMA-NEXT:    movdqa %xmm0, (%rsp) # 16-byte Spill
; NOFMA-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; NOFMA-NEXT:    movhlps {{.*#+}} xmm1 = xmm1[1,1]
; NOFMA-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
; NOFMA-NEXT:    movhlps {{.*#+}} xmm2 = xmm2[1,1]
; NOFMA-NEXT:    pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; NOFMA-NEXT:    # xmm0 = mem[2,3,2,3]
; NOFMA-NEXT:    callq fmaf
; NOFMA-NEXT:    punpckldq (%rsp), %xmm0 # 16-byte Folded Reload
; NOFMA-NEXT:    # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
; NOFMA-NEXT:    movdqa %xmm0, (%rsp) # 16-byte Spill
; NOFMA-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; NOFMA-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; NOFMA-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
; NOFMA-NEXT:    callq fmaf
; NOFMA-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; NOFMA-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; NOFMA-NEXT:    shufps {{.*#+}} xmm1 = xmm1[1,1,1,1]
; NOFMA-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
; NOFMA-NEXT:    shufps {{.*#+}} xmm2 = xmm2[1,1,1,1]
; NOFMA-NEXT:    pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; NOFMA-NEXT:    # xmm0 = mem[1,1,1,1]
; NOFMA-NEXT:    callq fmaf
; NOFMA-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; NOFMA-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
; NOFMA-NEXT:    punpcklqdq (%rsp), %xmm1 # 16-byte Folded Reload
; NOFMA-NEXT:    # xmm1 = xmm1[0],mem[0]
; NOFMA-NEXT:    movdqa %xmm1, %xmm0
; NOFMA-NEXT:    addq $88, %rsp
; NOFMA-NEXT:    .cfi_def_cfa_offset 8
; NOFMA-NEXT:    retq
;
; FMA-LABEL: f19:
; FMA:       # %bb.0: # %entry
; FMA-NEXT:    vfnmadd213ps {{.*#+}} xmm0 = -(xmm1 * xmm0) + xmm2
; FMA-NEXT:    retq
;
; FMA4-LABEL: f19:
; FMA4:       # %bb.0: # %entry
; FMA4-NEXT:    vfnmaddps {{.*#+}} xmm0 = -(xmm0 * xmm1) + xmm2
; FMA4-NEXT:    retq
entry:
  %3 = fneg <4 x float> %0
  %result = call <4 x float> @llvm.experimental.constrained.fma.v4f32(<4 x float> %3, <4 x float> %1, <4 x float> %2,
                                                                      metadata !"round.dynamic",
                                                                      metadata !"fpexcept.strict") #0
  ret <4 x float> %result
}

; Vector form of f2: fneg of the first operand folds into fnmadd (v2f64).
; Without FMA the vector op is scalarized into two libcalls to fma.
define <2 x double> @f20(<2 x double> %0, <2 x double> %1, <2 x double> %2) #0 {
; NOFMA-LABEL: f20:
; NOFMA:       # %bb.0: # %entry
; NOFMA-NEXT:    subq $72, %rsp
; NOFMA-NEXT:    .cfi_def_cfa_offset 80
; NOFMA-NEXT:    movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; NOFMA-NEXT:    movaps %xmm1, (%rsp) # 16-byte Spill
; NOFMA-NEXT:    xorps {{.*}}(%rip), %xmm0
; NOFMA-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; NOFMA-NEXT:    callq fma
; NOFMA-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; NOFMA-NEXT:    movaps (%rsp), %xmm1 # 16-byte Reload
; NOFMA-NEXT:    movhlps {{.*#+}} xmm1 = xmm1[1,1]
; NOFMA-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
; NOFMA-NEXT:    movhlps {{.*#+}} xmm2 = xmm2[1,1]
; NOFMA-NEXT:    pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; NOFMA-NEXT:    # xmm0 = mem[2,3,2,3]
; NOFMA-NEXT:    callq fma
; NOFMA-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; NOFMA-NEXT:    punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0]
; NOFMA-NEXT:    movdqa %xmm1, %xmm0
; NOFMA-NEXT:    addq $72, %rsp
; NOFMA-NEXT:    .cfi_def_cfa_offset 8
; NOFMA-NEXT:    retq
;
; FMA-LABEL: f20:
; FMA:       # %bb.0: # %entry
; FMA-NEXT:    vfnmadd213pd {{.*#+}} xmm0 = -(xmm1 * xmm0) + xmm2
; FMA-NEXT:    retq
;
; FMA4-LABEL: f20:
; FMA4:       # %bb.0: # %entry
; FMA4-NEXT:    vfnmaddpd {{.*#+}} xmm0 = -(xmm0 * xmm1) + xmm2
; FMA4-NEXT:    retq
entry:
  %3 = fneg <2 x double> %0
  %result = call <2 x double> @llvm.experimental.constrained.fma.v2f64(<2 x double> %3, <2 x double> %1, <2 x double> %2,
                                                                       metadata !"round.dynamic",
                                                                       metadata !"fpexcept.strict") #0
  ret <2 x double> %result
}

; Vector form of f3: fneg of the addend folds into fmsub (v4f32).
define <4 x float> @f21(<4 x float> %0, <4 x float> %1, <4 x float> %2) #0 {
; NOFMA-LABEL: f21:
; NOFMA:       # %bb.0: # %entry
; NOFMA-NEXT:    subq $88, %rsp
; NOFMA-NEXT:    .cfi_def_cfa_offset 96
; NOFMA-NEXT:    movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; NOFMA-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; NOFMA-NEXT:    pxor {{.*}}(%rip), %xmm2
; NOFMA-NEXT:    movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; NOFMA-NEXT:    shufps {{.*#+}} xmm0 = xmm0[3,3,3,3]
; NOFMA-NEXT:    shufps {{.*#+}} xmm1 = xmm1[3,3,3,3]
; NOFMA-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[3,3,3,3]
; NOFMA-NEXT:    callq fmaf
; NOFMA-NEXT:    movaps %xmm0, (%rsp) # 16-byte Spill
; NOFMA-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; NOFMA-NEXT:    movhlps {{.*#+}} xmm0 = xmm0[1,1]
; NOFMA-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; NOFMA-NEXT:    movhlps {{.*#+}} xmm1 = xmm1[1,1]
; NOFMA-NEXT:    pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload
; NOFMA-NEXT:    # xmm2 = mem[2,3,2,3]
; NOFMA-NEXT:    callq fmaf
; NOFMA-NEXT:    unpcklps (%rsp), %xmm0 # 16-byte Folded Reload
; NOFMA-NEXT:    # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
; NOFMA-NEXT:    movaps %xmm0, (%rsp) # 16-byte Spill
; NOFMA-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; NOFMA-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; NOFMA-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
; NOFMA-NEXT:    callq fmaf
; NOFMA-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; NOFMA-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; NOFMA-NEXT:    shufps {{.*#+}} xmm0 = xmm0[1,1,1,1]
; NOFMA-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; NOFMA-NEXT:    shufps {{.*#+}} xmm1 = xmm1[1,1,1,1]
; NOFMA-NEXT:    pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload
; NOFMA-NEXT:    # xmm2 = mem[1,1,1,1]
; NOFMA-NEXT:    callq fmaf
; NOFMA-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; NOFMA-NEXT:    unpcklps {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
; NOFMA-NEXT:    unpcklpd (%rsp), %xmm1 # 16-byte Folded Reload
; NOFMA-NEXT:    # xmm1 = xmm1[0],mem[0]
; NOFMA-NEXT:    movaps %xmm1, %xmm0
; NOFMA-NEXT:    addq $88, %rsp
; NOFMA-NEXT:    .cfi_def_cfa_offset 8
; NOFMA-NEXT:    retq
;
; FMA-LABEL: f21:
; FMA:       # %bb.0: # %entry
; FMA-NEXT:    vfmsub213ps {{.*#+}} xmm0 = (xmm1 * xmm0) - xmm2
; FMA-NEXT:    retq
;
; FMA4-LABEL: f21:
; FMA4:       # %bb.0: # %entry
; FMA4-NEXT:    vfmsubps {{.*#+}} xmm0 = (xmm0 * xmm1) - xmm2
; FMA4-NEXT:    retq
entry:
  %3 = fneg <4 x float> %2
  %result = call <4 x float> @llvm.experimental.constrained.fma.v4f32(<4 x float> %0, <4 x float> %1, <4 x float> %3,
                                                                      metadata !"round.dynamic",
                                                                      metadata !"fpexcept.strict") #0
  ret <4 x float> %result
}

; Vector form of f4: fneg of the addend folds into fmsub (v2f64).
define <2 x double> @f22(<2 x double> %0, <2 x double> %1, <2 x double> %2) #0 {
; NOFMA-LABEL: f22:
; NOFMA:       # %bb.0: # %entry
; NOFMA-NEXT:    subq $72, %rsp
; NOFMA-NEXT:    .cfi_def_cfa_offset 80
; NOFMA-NEXT:    movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; NOFMA-NEXT:    movaps %xmm0, (%rsp) # 16-byte Spill
; NOFMA-NEXT:    xorps {{.*}}(%rip), %xmm2
; NOFMA-NEXT:    movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; NOFMA-NEXT:    callq fma
; NOFMA-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; NOFMA-NEXT:    movaps (%rsp), %xmm0 # 16-byte Reload
; NOFMA-NEXT:    movhlps {{.*#+}} xmm0 = xmm0[1,1]
; NOFMA-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; NOFMA-NEXT:    movhlps {{.*#+}} xmm1 = xmm1[1,1]
; NOFMA-NEXT:    pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload
; NOFMA-NEXT:    # xmm2 = mem[2,3,2,3]
; NOFMA-NEXT:    callq fma
; NOFMA-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; NOFMA-NEXT:    movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0]
; NOFMA-NEXT:    movaps %xmm1, %xmm0
; NOFMA-NEXT:    addq $72, %rsp
; NOFMA-NEXT:    .cfi_def_cfa_offset 8
; NOFMA-NEXT:    retq
;
; FMA-LABEL: f22:
; FMA:       # %bb.0: # %entry
; FMA-NEXT:    vfmsub213pd {{.*#+}} xmm0 = (xmm1 * xmm0) - xmm2
; FMA-NEXT:    retq
;
; FMA4-LABEL: f22:
; FMA4:       # %bb.0: # %entry
; FMA4-NEXT:    vfmsubpd {{.*#+}} xmm0 = (xmm0 * xmm1) - xmm2
; FMA4-NEXT:    retq
entry:
  %3 = fneg <2 x double> %2
  %result = call <2 x double> @llvm.experimental.constrained.fma.v2f64(<2 x double> %0, <2 x double> %1, <2 x double> %3,
                                                                       metadata !"round.dynamic",
                                                                       metadata !"fpexcept.strict") #0
  ret <2 x double> %result
}

; Vector form of f5: fneg of multiplicand and addend folds into fnmsub (v4f32).
define <4 x float> @f23(<4 x float> %0, <4 x float> %1, <4 x float> %2) #0 {
; NOFMA-LABEL: f23:
; NOFMA:       # %bb.0: # %entry
; NOFMA-NEXT:    subq $88, %rsp
; NOFMA-NEXT:    .cfi_def_cfa_offset 96
; NOFMA-NEXT:    movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; NOFMA-NEXT:    movdqa {{.*#+}} xmm3 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0]
; NOFMA-NEXT:    pxor %xmm3, %xmm0
; NOFMA-NEXT:    movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; NOFMA-NEXT:    pxor %xmm3, %xmm2
; NOFMA-NEXT:    movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; NOFMA-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[3,3,3,3]
; NOFMA-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[3,3,3,3]
; NOFMA-NEXT:    shufps {{.*#+}} xmm1 = xmm1[3,3,3,3]
; NOFMA-NEXT:    callq fmaf
; NOFMA-NEXT:    movdqa %xmm0, (%rsp) # 16-byte Spill
; NOFMA-NEXT:    pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; NOFMA-NEXT:    # xmm0 = mem[2,3,2,3]
; NOFMA-NEXT:    pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload
; NOFMA-NEXT:    # xmm2 = mem[2,3,2,3]
; NOFMA-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; NOFMA-NEXT:    movhlps {{.*#+}} xmm1 = xmm1[1,1]
; NOFMA-NEXT:    callq fmaf
; NOFMA-NEXT:    punpckldq (%rsp), %xmm0 # 16-byte Folded Reload
; NOFMA-NEXT:    # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
; NOFMA-NEXT:    movdqa %xmm0, (%rsp) # 16-byte Spill
; NOFMA-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; NOFMA-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; NOFMA-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
; NOFMA-NEXT:    callq fmaf
; NOFMA-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; NOFMA-NEXT:    pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; NOFMA-NEXT:    # xmm0 = mem[1,1,1,1]
; NOFMA-NEXT:    pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload
; NOFMA-NEXT:    # xmm2 = mem[1,1,1,1]
; NOFMA-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; NOFMA-NEXT:    shufps {{.*#+}} xmm1 = xmm1[1,1,1,1]
; NOFMA-NEXT:    callq fmaf
; NOFMA-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; NOFMA-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
; NOFMA-NEXT:    punpcklqdq (%rsp), %xmm1 # 16-byte Folded Reload
; NOFMA-NEXT:    # xmm1 = xmm1[0],mem[0]
; NOFMA-NEXT:    movdqa %xmm1, %xmm0
; NOFMA-NEXT:    addq $88, %rsp
; NOFMA-NEXT:    .cfi_def_cfa_offset 8
; NOFMA-NEXT:    retq
;
; FMA-LABEL: f23:
; FMA:       # %bb.0: # %entry
; FMA-NEXT:    vfnmsub213ps {{.*#+}} xmm0 = -(xmm1 * xmm0) - xmm2
; FMA-NEXT:    retq
;
; FMA4-LABEL: f23:
; FMA4:       # %bb.0: # %entry
; FMA4-NEXT:    vfnmsubps {{.*#+}} xmm0 = -(xmm0 * xmm1) - xmm2
; FMA4-NEXT:    retq
entry:
  %3 = fneg <4 x float> %0
  %4 = fneg <4 x float> %2
  %result = call <4 x float> @llvm.experimental.constrained.fma.v4f32(<4 x float> %3, <4 x float> %1, <4 x float> %4,
                                                                      metadata !"round.dynamic",
                                                                      metadata !"fpexcept.strict") #0
  ret <4 x float> %result
}

; f24: <2 x double> version of f23 — strict FMA with the first multiplicand
; (%0) and the addend (%2) negated.  With FMA/FMA4 this folds into a single
; FNMSUB; without FMA the signs are flipped with a -0.0 XOR mask and the
; vector is split into two fma libcalls.
define <2 x double> @f24(<2 x double> %0, <2 x double> %1, <2 x double> %2) #0 {
; NOFMA-LABEL: f24:
; NOFMA:       # %bb.0: # %entry
; NOFMA-NEXT:    subq $72, %rsp
; NOFMA-NEXT:    .cfi_def_cfa_offset 80
; NOFMA-NEXT:    movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; NOFMA-NEXT:    movaps {{.*#+}} xmm3 = [-0.0E+0,-0.0E+0]
; NOFMA-NEXT:    xorps %xmm3, %xmm0
; NOFMA-NEXT:    movaps %xmm0, (%rsp) # 16-byte Spill
; NOFMA-NEXT:    xorps %xmm3, %xmm2
; NOFMA-NEXT:    movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; NOFMA-NEXT:    callq fma
; NOFMA-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; NOFMA-NEXT:    pshufd $238, (%rsp), %xmm0 # 16-byte Folded Reload
; NOFMA-NEXT:    # xmm0 = mem[2,3,2,3]
; NOFMA-NEXT:    pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload
; NOFMA-NEXT:    # xmm2 = mem[2,3,2,3]
; NOFMA-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; NOFMA-NEXT:    movhlps {{.*#+}} xmm1 = xmm1[1,1]
; NOFMA-NEXT:    callq fma
; NOFMA-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; NOFMA-NEXT:    punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0]
; NOFMA-NEXT:    movdqa %xmm1, %xmm0
; NOFMA-NEXT:    addq $72, %rsp
; NOFMA-NEXT:    .cfi_def_cfa_offset 8
; NOFMA-NEXT:    retq
;
; FMA-LABEL: f24:
; FMA:       # %bb.0: # %entry
; FMA-NEXT:    vfnmsub213pd {{.*#+}} xmm0 = -(xmm1 * xmm0) - xmm2
; FMA-NEXT:    retq
;
; FMA4-LABEL: f24:
; FMA4:       # %bb.0: # %entry
; FMA4-NEXT:    vfnmsubpd {{.*#+}} xmm0 = -(xmm0 * xmm1) - xmm2
; FMA4-NEXT:    retq
entry:
  ; -(%0) * %1 + -(%2)  ==  -(%0 * %1 + %2), matched as FNMSUB.
  %3 = fneg <2 x double> %0
  %4 = fneg <2 x double> %2
  %result = call <2 x double> @llvm.experimental.constrained.fma.v2f64(<2 x double> %3, <2 x double> %1, <2 x double> %4,
                                                                       metadata !"round.dynamic",
                                                                       metadata !"fpexcept.strict") #0
  ret <2 x double> %result
}
836
; f25: the RESULT of the strict FMA is negated.  The negation is not folded
; into the FMA opcode: all targets emit a plain FMADD (or, without FMA,
; four fmaf libcalls) followed by an XOR with the sign-bit mask.  AVX512
; materializes the mask with vbroadcastss instead of a full-width load.
define <4 x float> @f25(<4 x float> %0, <4 x float> %1, <4 x float> %2) #0 {
; NOFMA-LABEL: f25:
; NOFMA:       # %bb.0: # %entry
; NOFMA-NEXT:    subq $88, %rsp
; NOFMA-NEXT:    .cfi_def_cfa_offset 96
; NOFMA-NEXT:    movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; NOFMA-NEXT:    movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; NOFMA-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; NOFMA-NEXT:    shufps {{.*#+}} xmm0 = xmm0[3,3,3,3]
; NOFMA-NEXT:    shufps {{.*#+}} xmm1 = xmm1[3,3,3,3]
; NOFMA-NEXT:    shufps {{.*#+}} xmm2 = xmm2[3,3,3,3]
; NOFMA-NEXT:    callq fmaf
; NOFMA-NEXT:    movaps %xmm0, (%rsp) # 16-byte Spill
; NOFMA-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; NOFMA-NEXT:    movhlps {{.*#+}} xmm0 = xmm0[1,1]
; NOFMA-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; NOFMA-NEXT:    movhlps {{.*#+}} xmm1 = xmm1[1,1]
; NOFMA-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
; NOFMA-NEXT:    movhlps {{.*#+}} xmm2 = xmm2[1,1]
; NOFMA-NEXT:    callq fmaf
; NOFMA-NEXT:    unpcklps (%rsp), %xmm0 # 16-byte Folded Reload
; NOFMA-NEXT:    # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
; NOFMA-NEXT:    movaps %xmm0, (%rsp) # 16-byte Spill
; NOFMA-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; NOFMA-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; NOFMA-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
; NOFMA-NEXT:    callq fmaf
; NOFMA-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; NOFMA-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; NOFMA-NEXT:    shufps {{.*#+}} xmm0 = xmm0[1,1,1,1]
; NOFMA-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; NOFMA-NEXT:    shufps {{.*#+}} xmm1 = xmm1[1,1,1,1]
; NOFMA-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
; NOFMA-NEXT:    shufps {{.*#+}} xmm2 = xmm2[1,1,1,1]
; NOFMA-NEXT:    callq fmaf
; NOFMA-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; NOFMA-NEXT:    unpcklps {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
; NOFMA-NEXT:    unpcklpd (%rsp), %xmm1 # 16-byte Folded Reload
; NOFMA-NEXT:    # xmm1 = xmm1[0],mem[0]
; NOFMA-NEXT:    xorps {{.*}}(%rip), %xmm1
; NOFMA-NEXT:    movaps %xmm1, %xmm0
; NOFMA-NEXT:    addq $88, %rsp
; NOFMA-NEXT:    .cfi_def_cfa_offset 8
; NOFMA-NEXT:    retq
;
; FMA-AVX1-LABEL: f25:
; FMA-AVX1:       # %bb.0: # %entry
; FMA-AVX1-NEXT:    vfmadd213ps {{.*#+}} xmm0 = (xmm1 * xmm0) + xmm2
; FMA-AVX1-NEXT:    vxorps {{.*}}(%rip), %xmm0, %xmm0
; FMA-AVX1-NEXT:    retq
;
; FMA4-LABEL: f25:
; FMA4:       # %bb.0: # %entry
; FMA4-NEXT:    vfmaddps {{.*#+}} xmm0 = (xmm0 * xmm1) + xmm2
; FMA4-NEXT:    vxorps {{.*}}(%rip), %xmm0, %xmm0
; FMA4-NEXT:    retq
;
; FMA-AVX512-LABEL: f25:
; FMA-AVX512:       # %bb.0: # %entry
; FMA-AVX512-NEXT:    vfmadd213ps {{.*#+}} xmm0 = (xmm1 * xmm0) + xmm2
; FMA-AVX512-NEXT:    vbroadcastss {{.*#+}} xmm1 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0]
; FMA-AVX512-NEXT:    vxorps %xmm1, %xmm0, %xmm0
; FMA-AVX512-NEXT:    retq
entry:
  %3 = call <4 x float> @llvm.experimental.constrained.fma.v4f32(<4 x float> %0, <4 x float> %1, <4 x float> %2,
                                                                 metadata !"round.dynamic",
                                                                 metadata !"fpexcept.strict") #0
  ; Negation of the FMA result stays a separate XOR in the output.
  %result = fneg <4 x float> %3
  ret <4 x float> %result
}
907
; f26: <2 x double> version of f25 — negate the RESULT of the strict FMA.
; All targets keep the negation as a separate XOR with the sign-bit mask
; after the FMADD (or after the two fma libcalls when FMA is unavailable).
define <2 x double> @f26(<2 x double> %0, <2 x double> %1, <2 x double> %2) #0 {
; NOFMA-LABEL: f26:
; NOFMA:       # %bb.0: # %entry
; NOFMA-NEXT:    subq $72, %rsp
; NOFMA-NEXT:    .cfi_def_cfa_offset 80
; NOFMA-NEXT:    movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; NOFMA-NEXT:    movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; NOFMA-NEXT:    movaps %xmm0, (%rsp) # 16-byte Spill
; NOFMA-NEXT:    callq fma
; NOFMA-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; NOFMA-NEXT:    movaps (%rsp), %xmm0 # 16-byte Reload
; NOFMA-NEXT:    movhlps {{.*#+}} xmm0 = xmm0[1,1]
; NOFMA-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; NOFMA-NEXT:    movhlps {{.*#+}} xmm1 = xmm1[1,1]
; NOFMA-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
; NOFMA-NEXT:    movhlps {{.*#+}} xmm2 = xmm2[1,1]
; NOFMA-NEXT:    callq fma
; NOFMA-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; NOFMA-NEXT:    movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0]
; NOFMA-NEXT:    xorps {{.*}}(%rip), %xmm1
; NOFMA-NEXT:    movaps %xmm1, %xmm0
; NOFMA-NEXT:    addq $72, %rsp
; NOFMA-NEXT:    .cfi_def_cfa_offset 8
; NOFMA-NEXT:    retq
;
; FMA-LABEL: f26:
; FMA:       # %bb.0: # %entry
; FMA-NEXT:    vfmadd213pd {{.*#+}} xmm0 = (xmm1 * xmm0) + xmm2
; FMA-NEXT:    vxorpd {{.*}}(%rip), %xmm0, %xmm0
; FMA-NEXT:    retq
;
; FMA4-LABEL: f26:
; FMA4:       # %bb.0: # %entry
; FMA4-NEXT:    vfmaddpd {{.*#+}} xmm0 = (xmm0 * xmm1) + xmm2
; FMA4-NEXT:    vxorpd {{.*}}(%rip), %xmm0, %xmm0
; FMA4-NEXT:    retq
entry:
  %3 = call <2 x double> @llvm.experimental.constrained.fma.v2f64(<2 x double> %0, <2 x double> %1, <2 x double> %2,
                                                                  metadata !"round.dynamic",
                                                                  metadata !"fpexcept.strict") #0
  ; Negation of the FMA result stays a separate XOR in the output.
  %result = fneg <2 x double> %3
  ret <2 x double> %result
}
951
; f27: combines f23 and f25 — the first multiplicand (%0) and the addend
; (%2) are negated AND the FMA result is negated again.  The operand
; negations fold into FNMSUB, while the result negation remains a trailing
; XOR with the sign-bit mask on every target.
define <4 x float> @f27(<4 x float> %0, <4 x float> %1, <4 x float> %2) #0 {
; NOFMA-LABEL: f27:
; NOFMA:       # %bb.0: # %entry
; NOFMA-NEXT:    subq $88, %rsp
; NOFMA-NEXT:    .cfi_def_cfa_offset 96
; NOFMA-NEXT:    movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; NOFMA-NEXT:    movdqa {{.*#+}} xmm3 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0]
; NOFMA-NEXT:    pxor %xmm3, %xmm0
; NOFMA-NEXT:    movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; NOFMA-NEXT:    pxor %xmm3, %xmm2
; NOFMA-NEXT:    movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; NOFMA-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[3,3,3,3]
; NOFMA-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[3,3,3,3]
; NOFMA-NEXT:    shufps {{.*#+}} xmm1 = xmm1[3,3,3,3]
; NOFMA-NEXT:    callq fmaf
; NOFMA-NEXT:    movdqa %xmm0, (%rsp) # 16-byte Spill
; NOFMA-NEXT:    pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; NOFMA-NEXT:    # xmm0 = mem[2,3,2,3]
; NOFMA-NEXT:    pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload
; NOFMA-NEXT:    # xmm2 = mem[2,3,2,3]
; NOFMA-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; NOFMA-NEXT:    movhlps {{.*#+}} xmm1 = xmm1[1,1]
; NOFMA-NEXT:    callq fmaf
; NOFMA-NEXT:    punpckldq (%rsp), %xmm0 # 16-byte Folded Reload
; NOFMA-NEXT:    # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
; NOFMA-NEXT:    movdqa %xmm0, (%rsp) # 16-byte Spill
; NOFMA-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; NOFMA-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; NOFMA-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
; NOFMA-NEXT:    callq fmaf
; NOFMA-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; NOFMA-NEXT:    pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; NOFMA-NEXT:    # xmm0 = mem[1,1,1,1]
; NOFMA-NEXT:    pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload
; NOFMA-NEXT:    # xmm2 = mem[1,1,1,1]
; NOFMA-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; NOFMA-NEXT:    shufps {{.*#+}} xmm1 = xmm1[1,1,1,1]
; NOFMA-NEXT:    callq fmaf
; NOFMA-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; NOFMA-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
; NOFMA-NEXT:    punpcklqdq (%rsp), %xmm1 # 16-byte Folded Reload
; NOFMA-NEXT:    # xmm1 = xmm1[0],mem[0]
; NOFMA-NEXT:    pxor {{.*}}(%rip), %xmm1
; NOFMA-NEXT:    movdqa %xmm1, %xmm0
; NOFMA-NEXT:    addq $88, %rsp
; NOFMA-NEXT:    .cfi_def_cfa_offset 8
; NOFMA-NEXT:    retq
;
; FMA-AVX1-LABEL: f27:
; FMA-AVX1:       # %bb.0: # %entry
; FMA-AVX1-NEXT:    vfnmsub213ps {{.*#+}} xmm0 = -(xmm1 * xmm0) - xmm2
; FMA-AVX1-NEXT:    vxorps {{.*}}(%rip), %xmm0, %xmm0
; FMA-AVX1-NEXT:    retq
;
; FMA4-LABEL: f27:
; FMA4:       # %bb.0: # %entry
; FMA4-NEXT:    vfnmsubps {{.*#+}} xmm0 = -(xmm0 * xmm1) - xmm2
; FMA4-NEXT:    vxorps {{.*}}(%rip), %xmm0, %xmm0
; FMA4-NEXT:    retq
;
; FMA-AVX512-LABEL: f27:
; FMA-AVX512:       # %bb.0: # %entry
; FMA-AVX512-NEXT:    vfnmsub213ps {{.*#+}} xmm0 = -(xmm1 * xmm0) - xmm2
; FMA-AVX512-NEXT:    vbroadcastss {{.*#+}} xmm1 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0]
; FMA-AVX512-NEXT:    vxorps %xmm1, %xmm0, %xmm0
; FMA-AVX512-NEXT:    retq
entry:
  ; Operand negations fold into the FMA opcode (FNMSUB) ...
  %3 = fneg <4 x float> %0
  %4 = fneg <4 x float> %2
  %5 = call <4 x float> @llvm.experimental.constrained.fma.v4f32(<4 x float> %3, <4 x float> %1, <4 x float> %4,
                                                                 metadata !"round.dynamic",
                                                                 metadata !"fpexcept.strict") #0
  ; ... but the result negation stays a separate XOR.
  %result = fneg <4 x float> %5
  ret <4 x float> %result
}
1027
; f28: <2 x double> version of f27 — negate %0, %2, and the FMA result.
; The operand negations fold into FNMSUB; the result negation remains a
; trailing XOR with the sign-bit mask on every target.
define <2 x double> @f28(<2 x double> %0, <2 x double> %1, <2 x double> %2) #0 {
; NOFMA-LABEL: f28:
; NOFMA:       # %bb.0: # %entry
; NOFMA-NEXT:    subq $72, %rsp
; NOFMA-NEXT:    .cfi_def_cfa_offset 80
; NOFMA-NEXT:    movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; NOFMA-NEXT:    movaps {{.*#+}} xmm3 = [-0.0E+0,-0.0E+0]
; NOFMA-NEXT:    xorps %xmm3, %xmm0
; NOFMA-NEXT:    movaps %xmm0, (%rsp) # 16-byte Spill
; NOFMA-NEXT:    xorps %xmm3, %xmm2
; NOFMA-NEXT:    movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; NOFMA-NEXT:    callq fma
; NOFMA-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; NOFMA-NEXT:    pshufd $238, (%rsp), %xmm0 # 16-byte Folded Reload
; NOFMA-NEXT:    # xmm0 = mem[2,3,2,3]
; NOFMA-NEXT:    pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload
; NOFMA-NEXT:    # xmm2 = mem[2,3,2,3]
; NOFMA-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; NOFMA-NEXT:    movhlps {{.*#+}} xmm1 = xmm1[1,1]
; NOFMA-NEXT:    callq fma
; NOFMA-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; NOFMA-NEXT:    punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0]
; NOFMA-NEXT:    pxor {{.*}}(%rip), %xmm1
; NOFMA-NEXT:    movdqa %xmm1, %xmm0
; NOFMA-NEXT:    addq $72, %rsp
; NOFMA-NEXT:    .cfi_def_cfa_offset 8
; NOFMA-NEXT:    retq
;
; FMA-LABEL: f28:
; FMA:       # %bb.0: # %entry
; FMA-NEXT:    vfnmsub213pd {{.*#+}} xmm0 = -(xmm1 * xmm0) - xmm2
; FMA-NEXT:    vxorpd {{.*}}(%rip), %xmm0, %xmm0
; FMA-NEXT:    retq
;
; FMA4-LABEL: f28:
; FMA4:       # %bb.0: # %entry
; FMA4-NEXT:    vfnmsubpd {{.*#+}} xmm0 = -(xmm0 * xmm1) - xmm2
; FMA4-NEXT:    vxorpd {{.*}}(%rip), %xmm0, %xmm0
; FMA4-NEXT:    retq
entry:
  ; Operand negations fold into the FMA opcode (FNMSUB) ...
  %3 = fneg <2 x double> %0
  %4 = fneg <2 x double> %2
  %5 = call <2 x double> @llvm.experimental.constrained.fma.v2f64(<2 x double> %3, <2 x double> %1, <2 x double> %4,
                                                                  metadata !"round.dynamic",
                                                                  metadata !"fpexcept.strict") #0
  ; ... but the result negation stays a separate XOR.
  %result = fneg <2 x double> %5
  ret <2 x double> %result
}
1076
; Every test function carries #0, so all operations above use strict
; floating-point semantics (no reassociation/folding that could change
; rounding or exception behavior).
attributes #0 = { strictfp }

; Declarations of the constrained FP intrinsics used by the tests.
; NOTE(review): the constrained fmul/fadd/fmuladd declarations are not
; referenced by any function visible in this part of the file; presumably
; they are used by tests earlier in the file — verify before removing.
declare float @llvm.experimental.constrained.fmul.f32(float, float, metadata, metadata)
declare float @llvm.experimental.constrained.fadd.f32(float, float, metadata, metadata)
declare double @llvm.experimental.constrained.fmul.f64(double, double, metadata, metadata)
declare double @llvm.experimental.constrained.fadd.f64(double, double, metadata, metadata)
declare float @llvm.experimental.constrained.fma.f32(float, float, float, metadata, metadata)
declare double @llvm.experimental.constrained.fma.f64(double, double, double, metadata, metadata)
declare <4 x float> @llvm.experimental.constrained.fma.v4f32(<4 x float>, <4 x float>, <4 x float>, metadata, metadata)
declare <2 x double> @llvm.experimental.constrained.fma.v2f64(<2 x double>, <2 x double>, <2 x double>, metadata, metadata)
declare float @llvm.experimental.constrained.fmuladd.f32(float, float, float, metadata, metadata)
declare double @llvm.experimental.constrained.fmuladd.f64(double, double, double, metadata, metadata)
1089