1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+sse2 -O3 | FileCheck %s --check-prefixes=SSE,SSE-X86
3; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse2 -O3 | FileCheck %s --check-prefixes=SSE,SSE-X64
4; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+fma -O3 | FileCheck %s --check-prefixes=AVX
5; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+fma -O3 | FileCheck %s --check-prefixes=AVX
6; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+avx512f -mattr=+avx512vl -O3 | FileCheck %s --check-prefixes=AVX
7; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f -mattr=+avx512vl -O3 | FileCheck %s --check-prefixes=AVX
8
; Declarations of the constrained floating-point intrinsics exercised by the
; test functions in this file. The fpext declarations take a single metadata
; operand; every other intrinsic declared here takes two metadata operands.
9declare <2 x double> @llvm.experimental.constrained.fadd.v2f64(<2 x double>, <2 x double>, metadata, metadata)
10declare <4 x float> @llvm.experimental.constrained.fadd.v4f32(<4 x float>, <4 x float>, metadata, metadata)
11declare <2 x double> @llvm.experimental.constrained.fsub.v2f64(<2 x double>, <2 x double>, metadata, metadata)
12declare <4 x float> @llvm.experimental.constrained.fsub.v4f32(<4 x float>, <4 x float>, metadata, metadata)
13declare <2 x double> @llvm.experimental.constrained.fmul.v2f64(<2 x double>, <2 x double>, metadata, metadata)
14declare <4 x float> @llvm.experimental.constrained.fmul.v4f32(<4 x float>, <4 x float>, metadata, metadata)
15declare <2 x double> @llvm.experimental.constrained.fdiv.v2f64(<2 x double>, <2 x double>, metadata, metadata)
16declare <4 x float> @llvm.experimental.constrained.fdiv.v4f32(<4 x float>, <4 x float>, metadata, metadata)
17declare <2 x double> @llvm.experimental.constrained.sqrt.v2f64(<2 x double>, metadata, metadata)
18declare <4 x float> @llvm.experimental.constrained.sqrt.v4f32(<4 x float>, metadata, metadata)
19declare float @llvm.experimental.constrained.fptrunc.f32.f64(double, metadata, metadata)
20declare <2 x float> @llvm.experimental.constrained.fptrunc.v2f32.v2f64(<2 x double>, metadata, metadata)
21declare double @llvm.experimental.constrained.fpext.f64.f32(float, metadata)
22declare <2 x double> @llvm.experimental.constrained.fpext.v2f64.v2f32(<2 x float>, metadata)
23declare <2 x double> @llvm.experimental.constrained.fma.v2f64(<2 x double>, <2 x double>, <2 x double>, metadata, metadata)
24declare <4 x float> @llvm.experimental.constrained.fma.v4f32(<4 x float>, <4 x float>, <4 x float>, metadata, metadata)
25
; f1 - constrained fadd on <2 x double> with round.dynamic / fpexcept.strict.
; The checks expect a single packed add: addpd on the sse2 runs and vaddpd on
; the fma / avx512 runs.
26define <2 x double> @f1(<2 x double> %a, <2 x double> %b) #0 {
27; SSE-LABEL: f1:
28; SSE:       # %bb.0:
29; SSE-NEXT:    addpd %xmm1, %xmm0
30; SSE-NEXT:    ret{{[l|q]}}
31;
32; AVX-LABEL: f1:
33; AVX:       # %bb.0:
34; AVX-NEXT:    vaddpd %xmm1, %xmm0, %xmm0
35; AVX-NEXT:    ret{{[l|q]}}
36  %ret = call <2 x double> @llvm.experimental.constrained.fadd.v2f64(<2 x double> %a, <2 x double> %b,
37                                                                     metadata !"round.dynamic",
38                                                                     metadata !"fpexcept.strict") #0
39  ret <2 x double> %ret
40}
41
; f2 - constrained fadd on <4 x float> with round.dynamic / fpexcept.strict.
; The checks expect a single packed add: addps on the sse2 runs and vaddps on
; the fma / avx512 runs.
42define <4 x float> @f2(<4 x float> %a, <4 x float> %b) #0 {
43; SSE-LABEL: f2:
44; SSE:       # %bb.0:
45; SSE-NEXT:    addps %xmm1, %xmm0
46; SSE-NEXT:    ret{{[l|q]}}
47;
48; AVX-LABEL: f2:
49; AVX:       # %bb.0:
50; AVX-NEXT:    vaddps %xmm1, %xmm0, %xmm0
51; AVX-NEXT:    ret{{[l|q]}}
52  %ret = call <4 x float> @llvm.experimental.constrained.fadd.v4f32(<4 x float> %a, <4 x float> %b,
53                                                                    metadata !"round.dynamic",
54                                                                    metadata !"fpexcept.strict") #0
55  ret <4 x float> %ret
56}
57
; f3 - constrained fsub on <2 x double> with round.dynamic / fpexcept.strict.
; The checks expect a single packed subtract: subpd on the sse2 runs and
; vsubpd on the fma / avx512 runs.
58define <2 x double> @f3(<2 x double> %a, <2 x double> %b) #0 {
59; SSE-LABEL: f3:
60; SSE:       # %bb.0:
61; SSE-NEXT:    subpd %xmm1, %xmm0
62; SSE-NEXT:    ret{{[l|q]}}
63;
64; AVX-LABEL: f3:
65; AVX:       # %bb.0:
66; AVX-NEXT:    vsubpd %xmm1, %xmm0, %xmm0
67; AVX-NEXT:    ret{{[l|q]}}
68  %ret = call <2 x double> @llvm.experimental.constrained.fsub.v2f64(<2 x double> %a, <2 x double> %b,
69                                                                     metadata !"round.dynamic",
70                                                                     metadata !"fpexcept.strict") #0
71  ret <2 x double> %ret
72}
73
; f4 - constrained fsub on <4 x float> with round.dynamic / fpexcept.strict.
; The checks expect a single packed subtract: subps on the sse2 runs and
; vsubps on the fma / avx512 runs.
74define <4 x float> @f4(<4 x float> %a, <4 x float> %b) #0 {
75; SSE-LABEL: f4:
76; SSE:       # %bb.0:
77; SSE-NEXT:    subps %xmm1, %xmm0
78; SSE-NEXT:    ret{{[l|q]}}
79;
80; AVX-LABEL: f4:
81; AVX:       # %bb.0:
82; AVX-NEXT:    vsubps %xmm1, %xmm0, %xmm0
83; AVX-NEXT:    ret{{[l|q]}}
84  %ret = call <4 x float> @llvm.experimental.constrained.fsub.v4f32(<4 x float> %a, <4 x float> %b,
85                                                                    metadata !"round.dynamic",
86                                                                    metadata !"fpexcept.strict") #0
87  ret <4 x float> %ret
88}
89
; f5 - constrained fmul on <2 x double> with round.dynamic / fpexcept.strict.
; The checks expect a single packed multiply: mulpd on the sse2 runs and
; vmulpd on the fma / avx512 runs.
90define <2 x double> @f5(<2 x double> %a, <2 x double> %b) #0 {
91; SSE-LABEL: f5:
92; SSE:       # %bb.0:
93; SSE-NEXT:    mulpd %xmm1, %xmm0
94; SSE-NEXT:    ret{{[l|q]}}
95;
96; AVX-LABEL: f5:
97; AVX:       # %bb.0:
98; AVX-NEXT:    vmulpd %xmm1, %xmm0, %xmm0
99; AVX-NEXT:    ret{{[l|q]}}
100  %ret = call <2 x double> @llvm.experimental.constrained.fmul.v2f64(<2 x double> %a, <2 x double> %b,
101                                                                     metadata !"round.dynamic",
102                                                                     metadata !"fpexcept.strict") #0
103  ret <2 x double> %ret
104}
105
; f6 - constrained fmul on <4 x float> with round.dynamic / fpexcept.strict.
; The checks expect a single packed multiply: mulps on the sse2 runs and
; vmulps on the fma / avx512 runs.
106define <4 x float> @f6(<4 x float> %a, <4 x float> %b) #0 {
107; SSE-LABEL: f6:
108; SSE:       # %bb.0:
109; SSE-NEXT:    mulps %xmm1, %xmm0
110; SSE-NEXT:    ret{{[l|q]}}
111;
112; AVX-LABEL: f6:
113; AVX:       # %bb.0:
114; AVX-NEXT:    vmulps %xmm1, %xmm0, %xmm0
115; AVX-NEXT:    ret{{[l|q]}}
116  %ret = call <4 x float> @llvm.experimental.constrained.fmul.v4f32(<4 x float> %a, <4 x float> %b,
117                                                                    metadata !"round.dynamic",
118                                                                    metadata !"fpexcept.strict") #0
119  ret <4 x float> %ret
120}
121
; f7 - constrained fdiv on <2 x double> with round.dynamic / fpexcept.strict.
; The checks expect a single packed divide: divpd on the sse2 runs and vdivpd
; on the fma / avx512 runs.
122define <2 x double> @f7(<2 x double> %a, <2 x double> %b) #0 {
123; SSE-LABEL: f7:
124; SSE:       # %bb.0:
125; SSE-NEXT:    divpd %xmm1, %xmm0
126; SSE-NEXT:    ret{{[l|q]}}
127;
128; AVX-LABEL: f7:
129; AVX:       # %bb.0:
130; AVX-NEXT:    vdivpd %xmm1, %xmm0, %xmm0
131; AVX-NEXT:    ret{{[l|q]}}
132  %ret = call <2 x double> @llvm.experimental.constrained.fdiv.v2f64(<2 x double> %a, <2 x double> %b,
133                                                                     metadata !"round.dynamic",
134                                                                     metadata !"fpexcept.strict") #0
135  ret <2 x double> %ret
136}
137
; f8 - constrained fdiv on <4 x float> with round.dynamic / fpexcept.strict.
; The checks expect a single packed divide: divps on the sse2 runs and vdivps
; on the fma / avx512 runs.
138define <4 x float> @f8(<4 x float> %a, <4 x float> %b) #0 {
139; SSE-LABEL: f8:
140; SSE:       # %bb.0:
141; SSE-NEXT:    divps %xmm1, %xmm0
142; SSE-NEXT:    ret{{[l|q]}}
143;
144; AVX-LABEL: f8:
145; AVX:       # %bb.0:
146; AVX-NEXT:    vdivps %xmm1, %xmm0, %xmm0
147; AVX-NEXT:    ret{{[l|q]}}
148  %ret = call <4 x float> @llvm.experimental.constrained.fdiv.v4f32(<4 x float> %a, <4 x float> %b,
149                                                                    metadata !"round.dynamic",
150                                                                    metadata !"fpexcept.strict") #0
151  ret <4 x float> %ret
152}
153
; f9 - constrained sqrt on <2 x double> with round.dynamic / fpexcept.strict.
; The checks expect a single packed square root: sqrtpd on the sse2 runs and
; vsqrtpd on the fma / avx512 runs.
154define <2 x double> @f9(<2 x double> %a) #0 {
155; SSE-LABEL: f9:
156; SSE:       # %bb.0:
157; SSE-NEXT:    sqrtpd %xmm0, %xmm0
158; SSE-NEXT:    ret{{[l|q]}}
159;
160; AVX-LABEL: f9:
161; AVX:       # %bb.0:
162; AVX-NEXT:    vsqrtpd %xmm0, %xmm0
163; AVX-NEXT:    ret{{[l|q]}}
164  %sqrt = call <2 x double> @llvm.experimental.constrained.sqrt.v2f64(
165                              <2 x double> %a,
166                              metadata !"round.dynamic",
167                              metadata !"fpexcept.strict") #0
168  ret <2 x double> %sqrt
169}
170
; f10 - constrained sqrt on <4 x float> with round.dynamic / fpexcept.strict.
; The checks expect a single packed square root: sqrtps on the sse2 runs and
; vsqrtps on the fma / avx512 runs.
171define <4 x float> @f10(<4 x float> %a) #0 {
172; SSE-LABEL: f10:
173; SSE:       # %bb.0:
174; SSE-NEXT:    sqrtps %xmm0, %xmm0
175; SSE-NEXT:    ret{{[l|q]}}
176;
177; AVX-LABEL: f10:
178; AVX:       # %bb.0:
179; AVX-NEXT:    vsqrtps %xmm0, %xmm0
180; AVX-NEXT:    ret{{[l|q]}}
181  %sqrt = call <4 x float> @llvm.experimental.constrained.sqrt.v4f32(
182                              <4 x float> %a,
183                              metadata !"round.dynamic",
184                              metadata !"fpexcept.strict") #0
185  ret <4 x float > %sqrt
186}
187
; f11 - scalar constrained fptrunc (double -> float) applied to element 0 of
; %a0, with the result inserted into element 0 of %a1. The checks expect
; cvtsd2ss followed by a register move on the sse2 runs, and a single
; three-operand vcvtsd2ss on the fma / avx512 runs.
188define <4 x float> @f11(<2 x double> %a0, <4 x float> %a1) #0 {
189; SSE-LABEL: f11:
190; SSE:       # %bb.0:
191; SSE-NEXT:    cvtsd2ss %xmm0, %xmm1
192; SSE-NEXT:    movaps %xmm1, %xmm0
193; SSE-NEXT:    ret{{[l|q]}}
194;
195; AVX-LABEL: f11:
196; AVX:       # %bb.0:
197; AVX-NEXT:    vcvtsd2ss %xmm0, %xmm1, %xmm0
198; AVX-NEXT:    ret{{[l|q]}}
199  %ext = extractelement <2 x double> %a0, i32 0
200  %cvt = call float @llvm.experimental.constrained.fptrunc.f32.f64(double %ext,
201                                                                   metadata !"round.dynamic",
202                                                                   metadata !"fpexcept.strict") #0
203  %res = insertelement <4 x float> %a1, float %cvt, i32 0
204  ret <4 x float> %res
205}
206
; f12 - scalar constrained fpext (float -> double) applied to element 0 of
; %a1, with the result inserted into element 0 of %a0. Only the
; fpexcept.strict metadata operand is passed to the intrinsic. The checks
; expect a single cvtss2sd on the sse2 runs and vcvtss2sd on the fma / avx512
; runs.
207define <2 x double> @f12(<2 x double> %a0, <4 x float> %a1) #0 {
208; SSE-LABEL: f12:
209; SSE:       # %bb.0:
210; SSE-NEXT:    cvtss2sd %xmm1, %xmm0
211; SSE-NEXT:    ret{{[l|q]}}
212;
213; AVX-LABEL: f12:
214; AVX:       # %bb.0:
215; AVX-NEXT:    vcvtss2sd %xmm1, %xmm0, %xmm0
216; AVX-NEXT:    ret{{[l|q]}}
217  %ext = extractelement <4 x float> %a1, i32 0
218  %cvt = call double @llvm.experimental.constrained.fpext.f64.f32(float %ext,
219                                                                  metadata !"fpexcept.strict") #0
220  %res = insertelement <2 x double> %a0, double %cvt, i32 0
221  ret <2 x double> %res
222}
223
; f13 - constrained fma on <4 x float> with round.dynamic / fpexcept.strict.
; On the sse2-only runs the checks expect the operation to be expanded into
; four calls to fmaf, with the operands spilled/reloaded around the calls and
; the four results recombined with unpack/move instructions; the i686 and
; x86_64 sequences differ and are checked under separate prefixes. On the
; fma / avx512 runs the checks expect a single vfmadd213ps.
224define <4 x float> @f13(<4 x float> %a, <4 x float> %b, <4 x float> %c) #0 {
225; SSE-X86-LABEL: f13:
226; SSE-X86:       # %bb.0:
227; SSE-X86-NEXT:    subl $108, %esp
228; SSE-X86-NEXT:    .cfi_def_cfa_offset 112
229; SSE-X86-NEXT:    movups %xmm2, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
230; SSE-X86-NEXT:    movups %xmm1, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
231; SSE-X86-NEXT:    movups %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
232; SSE-X86-NEXT:    movss %xmm2, {{[0-9]+}}(%esp)
233; SSE-X86-NEXT:    movss %xmm1, {{[0-9]+}}(%esp)
234; SSE-X86-NEXT:    movss %xmm0, (%esp)
235; SSE-X86-NEXT:    calll fmaf
236; SSE-X86-NEXT:    fstpt {{[-0-9]+}}(%e{{[sb]}}p) # 10-byte Folded Spill
237; SSE-X86-NEXT:    wait
238; SSE-X86-NEXT:    movups {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload
239; SSE-X86-NEXT:    movhlps {{.*#+}} xmm0 = xmm0[1,1]
240; SSE-X86-NEXT:    movss %xmm0, {{[0-9]+}}(%esp)
241; SSE-X86-NEXT:    movups {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload
242; SSE-X86-NEXT:    movhlps {{.*#+}} xmm0 = xmm0[1,1]
243; SSE-X86-NEXT:    movss %xmm0, {{[0-9]+}}(%esp)
244; SSE-X86-NEXT:    movups {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload
245; SSE-X86-NEXT:    movhlps {{.*#+}} xmm0 = xmm0[1,1]
246; SSE-X86-NEXT:    movss %xmm0, (%esp)
247; SSE-X86-NEXT:    calll fmaf
248; SSE-X86-NEXT:    fstpt {{[-0-9]+}}(%e{{[sb]}}p) # 10-byte Folded Spill
249; SSE-X86-NEXT:    wait
250; SSE-X86-NEXT:    movups {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload
251; SSE-X86-NEXT:    shufps {{.*#+}} xmm0 = xmm0[3,3,3,3]
252; SSE-X86-NEXT:    movss %xmm0, {{[0-9]+}}(%esp)
253; SSE-X86-NEXT:    movups {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload
254; SSE-X86-NEXT:    shufps {{.*#+}} xmm0 = xmm0[3,3,3,3]
255; SSE-X86-NEXT:    movss %xmm0, {{[0-9]+}}(%esp)
256; SSE-X86-NEXT:    movups {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload
257; SSE-X86-NEXT:    shufps {{.*#+}} xmm0 = xmm0[3,3,3,3]
258; SSE-X86-NEXT:    movss %xmm0, (%esp)
259; SSE-X86-NEXT:    calll fmaf
260; SSE-X86-NEXT:    movups {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload
261; SSE-X86-NEXT:    shufps {{.*#+}} xmm0 = xmm0[1,1,1,1]
262; SSE-X86-NEXT:    movss %xmm0, {{[0-9]+}}(%esp)
263; SSE-X86-NEXT:    movups {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload
264; SSE-X86-NEXT:    shufps {{.*#+}} xmm0 = xmm0[1,1,1,1]
265; SSE-X86-NEXT:    movss %xmm0, {{[0-9]+}}(%esp)
266; SSE-X86-NEXT:    movups {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload
267; SSE-X86-NEXT:    shufps {{.*#+}} xmm0 = xmm0[1,1,1,1]
268; SSE-X86-NEXT:    movss %xmm0, (%esp)
269; SSE-X86-NEXT:    fstps {{[0-9]+}}(%esp)
270; SSE-X86-NEXT:    fldt {{[-0-9]+}}(%e{{[sb]}}p) # 10-byte Folded Reload
271; SSE-X86-NEXT:    fstps {{[0-9]+}}(%esp)
272; SSE-X86-NEXT:    fldt {{[-0-9]+}}(%e{{[sb]}}p) # 10-byte Folded Reload
273; SSE-X86-NEXT:    fstps {{[0-9]+}}(%esp)
274; SSE-X86-NEXT:    wait
275; SSE-X86-NEXT:    calll fmaf
276; SSE-X86-NEXT:    fstps {{[0-9]+}}(%esp)
277; SSE-X86-NEXT:    wait
278; SSE-X86-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
279; SSE-X86-NEXT:    movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
280; SSE-X86-NEXT:    unpcklps {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
281; SSE-X86-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
282; SSE-X86-NEXT:    movss {{.*#+}} xmm2 = mem[0],zero,zero,zero
283; SSE-X86-NEXT:    unpcklps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
284; SSE-X86-NEXT:    movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
285; SSE-X86-NEXT:    addl $108, %esp
286; SSE-X86-NEXT:    .cfi_def_cfa_offset 4
287; SSE-X86-NEXT:    retl
288;
289; SSE-X64-LABEL: f13:
290; SSE-X64:       # %bb.0:
291; SSE-X64-NEXT:    subq $88, %rsp
292; SSE-X64-NEXT:    .cfi_def_cfa_offset 96
293; SSE-X64-NEXT:    movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
294; SSE-X64-NEXT:    movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
295; SSE-X64-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
296; SSE-X64-NEXT:    shufps {{.*#+}} xmm0 = xmm0[3,3,3,3]
297; SSE-X64-NEXT:    shufps {{.*#+}} xmm1 = xmm1[3,3,3,3]
298; SSE-X64-NEXT:    shufps {{.*#+}} xmm2 = xmm2[3,3,3,3]
299; SSE-X64-NEXT:    callq fmaf
300; SSE-X64-NEXT:    movaps %xmm0, (%rsp) # 16-byte Spill
301; SSE-X64-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
302; SSE-X64-NEXT:    movhlps {{.*#+}} xmm0 = xmm0[1,1]
303; SSE-X64-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
304; SSE-X64-NEXT:    movhlps {{.*#+}} xmm1 = xmm1[1,1]
305; SSE-X64-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
306; SSE-X64-NEXT:    movhlps {{.*#+}} xmm2 = xmm2[1,1]
307; SSE-X64-NEXT:    callq fmaf
308; SSE-X64-NEXT:    unpcklps (%rsp), %xmm0 # 16-byte Folded Reload
309; SSE-X64-NEXT:    # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
310; SSE-X64-NEXT:    movaps %xmm0, (%rsp) # 16-byte Spill
311; SSE-X64-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
312; SSE-X64-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
313; SSE-X64-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
314; SSE-X64-NEXT:    callq fmaf
315; SSE-X64-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
316; SSE-X64-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
317; SSE-X64-NEXT:    shufps {{.*#+}} xmm0 = xmm0[1,1,1,1]
318; SSE-X64-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
319; SSE-X64-NEXT:    shufps {{.*#+}} xmm1 = xmm1[1,1,1,1]
320; SSE-X64-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
321; SSE-X64-NEXT:    shufps {{.*#+}} xmm2 = xmm2[1,1,1,1]
322; SSE-X64-NEXT:    callq fmaf
323; SSE-X64-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
324; SSE-X64-NEXT:    unpcklps {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
325; SSE-X64-NEXT:    unpcklpd (%rsp), %xmm1 # 16-byte Folded Reload
326; SSE-X64-NEXT:    # xmm1 = xmm1[0],mem[0]
327; SSE-X64-NEXT:    movaps %xmm1, %xmm0
328; SSE-X64-NEXT:    addq $88, %rsp
329; SSE-X64-NEXT:    .cfi_def_cfa_offset 8
330; SSE-X64-NEXT:    retq
331;
332; AVX-LABEL: f13:
333; AVX:       # %bb.0:
334; AVX-NEXT:    vfmadd213ps {{.*#+}} xmm0 = (xmm1 * xmm0) + xmm2
335; AVX-NEXT:    ret{{[l|q]}}
336  %res = call <4 x float> @llvm.experimental.constrained.fma.v4f32(<4 x float> %a, <4 x float> %b, <4 x float> %c,
337                                                                   metadata !"round.dynamic",
338                                                                   metadata !"fpexcept.strict") #0
339  ret <4 x float> %res
340}
341
; f14 - constrained fma on <2 x double> with round.dynamic / fpexcept.strict.
; On the sse2-only runs the checks expect the operation to be expanded into
; two calls to fma, one per element, with the results recombined afterwards;
; the i686 and x86_64 sequences differ and are checked under separate
; prefixes. On the fma / avx512 runs the checks expect a single vfmadd213pd.
342define <2 x double> @f14(<2 x double> %a, <2 x double> %b, <2 x double> %c) #0 {
343; SSE-X86-LABEL: f14:
344; SSE-X86:       # %bb.0:
345; SSE-X86-NEXT:    pushl %ebp
346; SSE-X86-NEXT:    .cfi_def_cfa_offset 8
347; SSE-X86-NEXT:    .cfi_offset %ebp, -8
348; SSE-X86-NEXT:    movl %esp, %ebp
349; SSE-X86-NEXT:    .cfi_def_cfa_register %ebp
350; SSE-X86-NEXT:    andl $-16, %esp
351; SSE-X86-NEXT:    subl $112, %esp
352; SSE-X86-NEXT:    movaps %xmm2, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
353; SSE-X86-NEXT:    movaps %xmm1, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
354; SSE-X86-NEXT:    movaps %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
355; SSE-X86-NEXT:    movlps %xmm2, {{[0-9]+}}(%esp)
356; SSE-X86-NEXT:    movlps %xmm1, {{[0-9]+}}(%esp)
357; SSE-X86-NEXT:    movlps %xmm0, (%esp)
358; SSE-X86-NEXT:    calll fma
359; SSE-X86-NEXT:    movaps {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload
360; SSE-X86-NEXT:    movhps %xmm0, {{[0-9]+}}(%esp)
361; SSE-X86-NEXT:    movaps {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload
362; SSE-X86-NEXT:    movhps %xmm0, {{[0-9]+}}(%esp)
363; SSE-X86-NEXT:    movaps {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload
364; SSE-X86-NEXT:    movhps %xmm0, (%esp)
365; SSE-X86-NEXT:    fstpl {{[0-9]+}}(%esp)
366; SSE-X86-NEXT:    wait
367; SSE-X86-NEXT:    calll fma
368; SSE-X86-NEXT:    fstpl {{[0-9]+}}(%esp)
369; SSE-X86-NEXT:    wait
370; SSE-X86-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
371; SSE-X86-NEXT:    movhps {{.*#+}} xmm0 = xmm0[0,1],mem[0,1]
372; SSE-X86-NEXT:    movl %ebp, %esp
373; SSE-X86-NEXT:    popl %ebp
374; SSE-X86-NEXT:    .cfi_def_cfa %esp, 4
375; SSE-X86-NEXT:    retl
376;
377; SSE-X64-LABEL: f14:
378; SSE-X64:       # %bb.0:
379; SSE-X64-NEXT:    subq $72, %rsp
380; SSE-X64-NEXT:    .cfi_def_cfa_offset 80
381; SSE-X64-NEXT:    movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
382; SSE-X64-NEXT:    movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
383; SSE-X64-NEXT:    movaps %xmm0, (%rsp) # 16-byte Spill
384; SSE-X64-NEXT:    callq fma
385; SSE-X64-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
386; SSE-X64-NEXT:    movaps (%rsp), %xmm0 # 16-byte Reload
387; SSE-X64-NEXT:    movhlps {{.*#+}} xmm0 = xmm0[1,1]
388; SSE-X64-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
389; SSE-X64-NEXT:    movhlps {{.*#+}} xmm1 = xmm1[1,1]
390; SSE-X64-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
391; SSE-X64-NEXT:    movhlps {{.*#+}} xmm2 = xmm2[1,1]
392; SSE-X64-NEXT:    callq fma
393; SSE-X64-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
394; SSE-X64-NEXT:    movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0]
395; SSE-X64-NEXT:    movaps %xmm1, %xmm0
396; SSE-X64-NEXT:    addq $72, %rsp
397; SSE-X64-NEXT:    .cfi_def_cfa_offset 8
398; SSE-X64-NEXT:    retq
399;
400; AVX-LABEL: f14:
401; AVX:       # %bb.0:
402; AVX-NEXT:    vfmadd213pd {{.*#+}} xmm0 = (xmm1 * xmm0) + xmm2
403; AVX-NEXT:    ret{{[l|q]}}
404  %res = call <2 x double> @llvm.experimental.constrained.fma.v2f64(<2 x double> %a, <2 x double> %b, <2 x double> %c,
405                                                                    metadata !"round.dynamic",
406                                                                    metadata !"fpexcept.strict") #0
407  ret <2 x double> %res
408}
409
; f15 - vector constrained fpext from <2 x float> to <2 x double>. Only the
; fpexcept.strict metadata operand is passed to the intrinsic. The checks
; expect a single cvtps2pd on the sse2 runs and vcvtps2pd on the fma / avx512
; runs.
410define <2 x double> @f15(<2 x float> %a) #0 {
411; SSE-LABEL: f15:
412; SSE:       # %bb.0:
413; SSE-NEXT:    cvtps2pd %xmm0, %xmm0
414; SSE-NEXT:    ret{{[l|q]}}
415;
416; AVX-LABEL: f15:
417; AVX:       # %bb.0:
418; AVX-NEXT:    vcvtps2pd %xmm0, %xmm0
419; AVX-NEXT:    ret{{[l|q]}}
420  %ret = call <2 x double> @llvm.experimental.constrained.fpext.v2f64.v2f32(
421                                <2 x float> %a,
422                                metadata !"fpexcept.strict") #0
423  ret <2 x double> %ret
424}
425
; f16 - vector constrained fptrunc from <2 x double> to <2 x float> with
; round.dynamic / fpexcept.strict. The checks expect a single cvtpd2ps on the
; sse2 runs and vcvtpd2ps on the fma / avx512 runs.
426define <2 x float> @f16(<2 x double> %a) #0 {
427; SSE-LABEL: f16:
428; SSE:       # %bb.0:
429; SSE-NEXT:    cvtpd2ps %xmm0, %xmm0
430; SSE-NEXT:    ret{{[l|q]}}
431;
432; AVX-LABEL: f16:
433; AVX:       # %bb.0:
434; AVX-NEXT:    vcvtpd2ps %xmm0, %xmm0
435; AVX-NEXT:    ret{{[l|q]}}
436  %ret = call <2 x float> @llvm.experimental.constrained.fptrunc.v2f32.v2f64(
437                                <2 x double> %a,
438                                metadata !"round.dynamic",
439                                metadata !"fpexcept.strict") #0
440  ret <2 x float> %ret
441}
442
443
; Attribute group #0 (strictfp) is referenced by every function definition and
; every constrained-intrinsic call site visible in this file.
444attributes #0 = { strictfp }
445