; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mattr=+avx512f -mattr=+fma -show-mc-encoding | FileCheck %s

; Merge-masked scalar f32 fmadd. The <2 x double> arguments are reinterpreted
; as <4 x float>; lane 0 of the result is (b[0] * a[0]) + c[0] when bit 0 of %k
; is set and the original a[0] otherwise, with the remaining lanes taken from %a.
; The assertions expect the fast fmul/fadd + select + insertelement chain to
; fold into one vfmadd213ss with a {%k1} write mask.
define <2 x double> @combine_scalar_mask_fmadd_f32(<2 x double> %a, i8 zeroext %k, <2 x double> %b, <2 x double> %c) {
; CHECK-LABEL: combine_scalar_mask_fmadd_f32:
; CHECK:       # %bb.0: # %entry
; CHECK-NEXT:    kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf]
; CHECK-NEXT:    vfmadd213ss %xmm2, %xmm1, %xmm0 {%k1} # encoding: [0x62,0xf2,0x75,0x09,0xa9,0xc2]
; CHECK-NEXT:    # xmm0 {%k1} = (xmm1 * xmm0) + xmm2
; CHECK-NEXT:    retq # encoding: [0xc3]
entry:
  %0 = bitcast <2 x double> %a to <4 x float>
  %1 = bitcast <2 x double> %b to <4 x float>
  %2 = bitcast <2 x double> %c to <4 x float>
  %3 = extractelement <4 x float> %0, i64 0
  %4 = extractelement <4 x float> %1, i64 0
  %5 = extractelement <4 x float> %2, i64 0
  %6 = fmul fast float %4, %3
  %7 = fadd fast float %6, %5
  %8 = bitcast i8 %k to <8 x i1>
  %9 = extractelement <8 x i1> %8, i64 0
  %10 = select i1 %9, float %7, float %3
  %11 = insertelement <4 x float> %0, float %10, i64 0
  %12 = bitcast <4 x float> %11 to <2 x double>
  ret <2 x double> %12
}

; Merge-masked scalar f64 fmadd. Lane 0 of the result is (b[0] * a[0]) + c[0]
; when bit 0 of %k is set and the original a[0] otherwise; lane 1 comes from %a.
; The assertions expect the fast fmul/fadd + select + insertelement chain to
; fold into one vfmadd213sd with a {%k1} write mask.
define <2 x double> @combine_scalar_mask_fmadd_f64(<2 x double> %a, i8 zeroext %k, <2 x double> %b, <2 x double> %c) {
; CHECK-LABEL: combine_scalar_mask_fmadd_f64:
; CHECK:       # %bb.0: # %entry
; CHECK-NEXT:    kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf]
; CHECK-NEXT:    vfmadd213sd %xmm2, %xmm1, %xmm0 {%k1} # encoding: [0x62,0xf2,0xf5,0x09,0xa9,0xc2]
; CHECK-NEXT:    # xmm0 {%k1} = (xmm1 * xmm0) + xmm2
; CHECK-NEXT:    retq # encoding: [0xc3]
entry:
  %0 = extractelement <2 x double> %a, i64 0
  %1 = extractelement <2 x double> %b, i64 0
  %2 = extractelement <2 x double> %c, i64 0
  %3 = fmul fast double %1, %0
  %4 = fadd fast double %3, %2
  %5 = bitcast i8 %k to <8 x i1>
  %6 = extractelement <8 x i1> %5, i64 0
  %7 = select i1 %6, double %4, double %0
  %8 = insertelement <2 x double> %a, double %7, i64 0
  ret <2 x double> %8
}

; Zero-masked scalar f32 fmadd. The <2 x double> arguments are reinterpreted
; as <4 x float>; lane 0 of the result is (b[0] * a[0]) + c[0] when bit 0 of %k
; is set and 0.0 otherwise, with the remaining lanes taken from %a.
; The assertions expect a single vfmadd213ss with {%k1} {z} masking.
define <2 x double> @combine_scalar_maskz_fmadd_32(i8 zeroext %k, <2 x double> %a, <2 x double> %b, <2 x double> %c) {
; CHECK-LABEL: combine_scalar_maskz_fmadd_32:
; CHECK:       # %bb.0: # %entry
; CHECK-NEXT:    kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf]
; CHECK-NEXT:    vfmadd213ss %xmm2, %xmm1, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0x75,0x89,0xa9,0xc2]
; CHECK-NEXT:    # xmm0 {%k1} {z} = (xmm1 * xmm0) + xmm2
; CHECK-NEXT:    retq # encoding: [0xc3]
entry:
  %0 = bitcast <2 x double> %a to <4 x float>
  %1 = bitcast <2 x double> %b to <4 x float>
  %2 = bitcast <2 x double> %c to <4 x float>
  %3 = extractelement <4 x float> %0, i64 0
  %4 = extractelement <4 x float> %1, i64 0
  %5 = extractelement <4 x float> %2, i64 0
  %6 = fmul fast float %4, %3
  %7 = fadd fast float %6, %5
  %8 = bitcast i8 %k to <8 x i1>
  %9 = extractelement <8 x i1> %8, i64 0
  %10 = select i1 %9, float %7, float 0.000000e+00
  %11 = insertelement <4 x float> %0, float %10, i64 0
  %12 = bitcast <4 x float> %11 to <2 x double>
  ret <2 x double> %12
}

; Zero-masked scalar f64 fmadd. Lane 0 of the result is (b[0] * a[0]) + c[0]
; when bit 0 of %k is set and 0.0 otherwise; lane 1 comes from %a.
; The assertions expect a single vfmadd213sd with {%k1} {z} masking.
define <2 x double> @combine_scalar_maskz_fmadd_64(i8 zeroext %k, <2 x double> %a, <2 x double> %b, <2 x double> %c) {
; CHECK-LABEL: combine_scalar_maskz_fmadd_64:
; CHECK:       # %bb.0: # %entry
; CHECK-NEXT:    kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf]
; CHECK-NEXT:    vfmadd213sd %xmm2, %xmm1, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0xf5,0x89,0xa9,0xc2]
; CHECK-NEXT:    # xmm0 {%k1} {z} = (xmm1 * xmm0) + xmm2
; CHECK-NEXT:    retq # encoding: [0xc3]
entry:
  %0 = extractelement <2 x double> %a, i64 0
  %1 = extractelement <2 x double> %b, i64 0
  %2 = extractelement <2 x double> %c, i64 0
  %3 = fmul fast double %1, %0
  %4 = fadd fast double %3, %2
  %5 = bitcast i8 %k to <8 x i1>
  %6 = extractelement <8 x i1> %5, i64 0
  %7 = select i1 %6, double %4, double 0.000000e+00
  %8 = insertelement <2 x double> %a, double %7, i64 0
  ret <2 x double> %8
}

; "mask3" scalar f32 fmadd: the pass-through operand is the addend %c.
; The <2 x double> arguments are reinterpreted as <4 x float>; lane 0 of the
; result is (b[0] * a[0]) + c[0] when bit 0 of %k is set and the original c[0]
; otherwise, with the remaining lanes taken from %c.
; The assertions expect a merge-masked vfmadd231ss that accumulates into the
; register holding %c, followed by a move of that register into xmm0.
define <2 x double> @combine_scalar_mask3_fmadd_32(<2 x double> %a, <2 x double> %b, <2 x double> %c, i8 zeroext %k) {
; CHECK-LABEL: combine_scalar_mask3_fmadd_32:
; CHECK:       # %bb.0: # %entry
; CHECK-NEXT:    kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf]
; CHECK-NEXT:    vfmadd231ss %xmm0, %xmm1, %xmm2 {%k1} # encoding: [0x62,0xf2,0x75,0x09,0xb9,0xd0]
; CHECK-NEXT:    # xmm2 {%k1} = (xmm1 * xmm0) + xmm2
; CHECK-NEXT:    vmovaps %xmm2, %xmm0 # encoding: [0xc5,0xf8,0x28,0xc2]
; CHECK-NEXT:    retq # encoding: [0xc3]
entry:
  %0 = bitcast <2 x double> %a to <4 x float>
  %1 = bitcast <2 x double> %b to <4 x float>
  %2 = bitcast <2 x double> %c to <4 x float>
  %3 = extractelement <4 x float> %0, i64 0
  %4 = extractelement <4 x float> %1, i64 0
  %5 = extractelement <4 x float> %2, i64 0
  %6 = fmul fast float %4, %3
  %7 = fadd fast float %6, %5
  %8 = bitcast i8 %k to <8 x i1>
  %9 = extractelement <8 x i1> %8, i64 0
  %10 = select i1 %9, float %7, float %5
  %11 = insertelement <4 x float> %2, float %10, i64 0
  %12 = bitcast <4 x float> %11 to <2 x double>
  ret <2 x double> %12
}

; "mask3" scalar f64 fmadd: the pass-through operand is the addend %c.
; Lane 0 of the result is (b[0] * a[0]) + c[0] when bit 0 of %k is set and the
; original c[0] otherwise; lane 1 comes from %c.
; The assertions expect a merge-masked vfmadd231sd that accumulates into the
; register holding %c, followed by a move of that register into xmm0.
define <2 x double> @combine_scalar_mask3_fmadd_64(<2 x double> %a, <2 x double> %b, <2 x double> %c, i8 zeroext %k) {
; CHECK-LABEL: combine_scalar_mask3_fmadd_64:
; CHECK:       # %bb.0: # %entry
; CHECK-NEXT:    kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf]
; CHECK-NEXT:    vfmadd231sd %xmm0, %xmm1, %xmm2 {%k1} # encoding: [0x62,0xf2,0xf5,0x09,0xb9,0xd0]
; CHECK-NEXT:    # xmm2 {%k1} = (xmm1 * xmm0) + xmm2
; CHECK-NEXT:    vmovapd %xmm2, %xmm0 # encoding: [0xc5,0xf9,0x28,0xc2]
; CHECK-NEXT:    retq # encoding: [0xc3]
entry:
  %0 = extractelement <2 x double> %a, i64 0
  %1 = extractelement <2 x double> %b, i64 0
  %2 = extractelement <2 x double> %c, i64 0
  %3 = fmul fast double %1, %0
  %4 = fadd fast double %3, %2
  %5 = bitcast i8 %k to <8 x i1>
  %6 = extractelement <8 x i1> %5, i64 0
  %7 = select i1 %6, double %4, double %2
  %8 = insertelement <2 x double> %c, double %7, i64 0
  ret <2 x double> %8
}

; Merge-masked scalar f32 fmsub. The <2 x double> arguments are reinterpreted
; as <4 x float>; lane 0 of the result is (b[0] * a[0]) - c[0] when bit 0 of %k
; is set and the original a[0] otherwise, with the remaining lanes taken from %a.
; The assertions expect a single vfmsub213ss with a {%k1} write mask.
define <2 x double> @combine_scalar_mask_fmsub_f32(<2 x double> %a, i8 zeroext %k, <2 x double> %b, <2 x double> %c) {
; CHECK-LABEL: combine_scalar_mask_fmsub_f32:
; CHECK:       # %bb.0: # %entry
; CHECK-NEXT:    kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf]
; CHECK-NEXT:    vfmsub213ss %xmm2, %xmm1, %xmm0 {%k1} # encoding: [0x62,0xf2,0x75,0x09,0xab,0xc2]
; CHECK-NEXT:    # xmm0 {%k1} = (xmm1 * xmm0) - xmm2
; CHECK-NEXT:    retq # encoding: [0xc3]
entry:
  %0 = bitcast <2 x double> %a to <4 x float>
  %1 = bitcast <2 x double> %b to <4 x float>
  %2 = bitcast <2 x double> %c to <4 x float>
  %3 = extractelement <4 x float> %0, i64 0
  %4 = extractelement <4 x float> %1, i64 0
  %5 = extractelement <4 x float> %2, i64 0
  %6 = fmul fast float %4, %3
  %7 = fsub fast float %6, %5
  %8 = bitcast i8 %k to <8 x i1>
  %9 = extractelement <8 x i1> %8, i64 0
  %10 = select i1 %9, float %7, float %3
  %11 = insertelement <4 x float> %0, float %10, i64 0
  %12 = bitcast <4 x float> %11 to <2 x double>
  ret <2 x double> %12
}

; Merge-masked scalar f64 fmsub. Lane 0 of the result is (b[0] * a[0]) - c[0]
; when bit 0 of %k is set and the original a[0] otherwise; lane 1 comes from %a.
; The assertions expect a single vfmsub213sd with a {%k1} write mask.
define <2 x double> @combine_scalar_mask_fmsub_f64(<2 x double> %a, i8 zeroext %k, <2 x double> %b, <2 x double> %c) {
; CHECK-LABEL: combine_scalar_mask_fmsub_f64:
; CHECK:       # %bb.0: # %entry
; CHECK-NEXT:    kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf]
; CHECK-NEXT:    vfmsub213sd %xmm2, %xmm1, %xmm0 {%k1} # encoding: [0x62,0xf2,0xf5,0x09,0xab,0xc2]
; CHECK-NEXT:    # xmm0 {%k1} = (xmm1 * xmm0) - xmm2
; CHECK-NEXT:    retq # encoding: [0xc3]
entry:
  %0 = extractelement <2 x double> %a, i64 0
  %1 = extractelement <2 x double> %b, i64 0
  %2 = extractelement <2 x double> %c, i64 0
  %3 = fmul fast double %1, %0
  %4 = fsub fast double %3, %2
  %5 = bitcast i8 %k to <8 x i1>
  %6 = extractelement <8 x i1> %5, i64 0
  %7 = select i1 %6, double %4, double %0
  %8 = insertelement <2 x double> %a, double %7, i64 0
  ret <2 x double> %8
}

; Zero-masked scalar f32 fmsub. The <2 x double> arguments are reinterpreted
; as <4 x float>; lane 0 of the result is (b[0] * a[0]) - c[0] when bit 0 of %k
; is set and 0.0 otherwise, with the remaining lanes taken from %a.
; The assertions expect a single vfmsub213ss with {%k1} {z} masking.
define <2 x double> @combine_scalar_maskz_fmsub_32(i8 zeroext %k, <2 x double> %a, <2 x double> %b, <2 x double> %c) {
; CHECK-LABEL: combine_scalar_maskz_fmsub_32:
; CHECK:       # %bb.0: # %entry
; CHECK-NEXT:    kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf]
; CHECK-NEXT:    vfmsub213ss %xmm2, %xmm1, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0x75,0x89,0xab,0xc2]
; CHECK-NEXT:    # xmm0 {%k1} {z} = (xmm1 * xmm0) - xmm2
; CHECK-NEXT:    retq # encoding: [0xc3]
entry:
  %0 = bitcast <2 x double> %a to <4 x float>
  %1 = bitcast <2 x double> %b to <4 x float>
  %2 = bitcast <2 x double> %c to <4 x float>
  %3 = extractelement <4 x float> %0, i64 0
  %4 = extractelement <4 x float> %1, i64 0
  %5 = extractelement <4 x float> %2, i64 0
  %6 = fmul fast float %4, %3
  %7 = fsub fast float %6, %5
  %8 = bitcast i8 %k to <8 x i1>
  %9 = extractelement <8 x i1> %8, i64 0
  %10 = select i1 %9, float %7, float 0.000000e+00
  %11 = insertelement <4 x float> %0, float %10, i64 0
  %12 = bitcast <4 x float> %11 to <2 x double>
  ret <2 x double> %12
}

; Zero-masked scalar f64 fmsub. Lane 0 of the result is (b[0] * a[0]) - c[0]
; when bit 0 of %k is set and 0.0 otherwise; lane 1 comes from %a.
; The assertions expect a single vfmsub213sd with {%k1} {z} masking.
define <2 x double> @combine_scalar_maskz_fmsub_64(i8 zeroext %k, <2 x double> %a, <2 x double> %b, <2 x double> %c) {
; CHECK-LABEL: combine_scalar_maskz_fmsub_64:
; CHECK:       # %bb.0: # %entry
; CHECK-NEXT:    kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf]
; CHECK-NEXT:    vfmsub213sd %xmm2, %xmm1, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0xf5,0x89,0xab,0xc2]
; CHECK-NEXT:    # xmm0 {%k1} {z} = (xmm1 * xmm0) - xmm2
; CHECK-NEXT:    retq # encoding: [0xc3]
entry:
  %0 = extractelement <2 x double> %a, i64 0
  %1 = extractelement <2 x double> %b, i64 0
  %2 = extractelement <2 x double> %c, i64 0
  %3 = fmul fast double %1, %0
  %4 = fsub fast double %3, %2
  %5 = bitcast i8 %k to <8 x i1>
  %6 = extractelement <8 x i1> %5, i64 0
  %7 = select i1 %6, double %4, double 0.000000e+00
  %8 = insertelement <2 x double> %a, double %7, i64 0
  ret <2 x double> %8
}

; "mask3" scalar f32 fmsub: the pass-through operand is the subtrahend %c.
; The <2 x double> arguments are reinterpreted as <4 x float>; lane 0 of the
; result is (b[0] * a[0]) - c[0] when bit 0 of %k is set and the original c[0]
; otherwise, with the remaining lanes taken from %c.
; The assertions expect a merge-masked vfmsub231ss that writes the register
; holding %c, followed by a move of that register into xmm0.
define <2 x double> @combine_scalar_mask3_fmsub_32(<2 x double> %a, <2 x double> %b, <2 x double> %c, i8 zeroext %k) {
; CHECK-LABEL: combine_scalar_mask3_fmsub_32:
; CHECK:       # %bb.0: # %entry
; CHECK-NEXT:    kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf]
; CHECK-NEXT:    vfmsub231ss %xmm0, %xmm1, %xmm2 {%k1} # encoding: [0x62,0xf2,0x75,0x09,0xbb,0xd0]
; CHECK-NEXT:    # xmm2 {%k1} = (xmm1 * xmm0) - xmm2
; CHECK-NEXT:    vmovaps %xmm2, %xmm0 # encoding: [0xc5,0xf8,0x28,0xc2]
; CHECK-NEXT:    retq # encoding: [0xc3]
entry:
  %0 = bitcast <2 x double> %a to <4 x float>
  %1 = bitcast <2 x double> %b to <4 x float>
  %2 = bitcast <2 x double> %c to <4 x float>
  %3 = extractelement <4 x float> %0, i64 0
  %4 = extractelement <4 x float> %1, i64 0
  %5 = extractelement <4 x float> %2, i64 0
  %6 = fmul fast float %4, %3
  %7 = fsub fast float %6, %5
  %8 = bitcast i8 %k to <8 x i1>
  %9 = extractelement <8 x i1> %8, i64 0
  %10 = select i1 %9, float %7, float %5
  %11 = insertelement <4 x float> %2, float %10, i64 0
  %12 = bitcast <4 x float> %11 to <2 x double>
  ret <2 x double> %12
}

; "mask3" scalar f64 fmsub: the pass-through operand is the subtrahend %c.
; Lane 0 of the result is (b[0] * a[0]) - c[0] when bit 0 of %k is set and the
; original c[0] otherwise; lane 1 comes from %c.
; The assertions expect a merge-masked vfmsub231sd that writes the register
; holding %c, followed by a move of that register into xmm0.
define <2 x double> @combine_scalar_mask3_fmsub_64(<2 x double> %a, <2 x double> %b, <2 x double> %c, i8 zeroext %k) {
; CHECK-LABEL: combine_scalar_mask3_fmsub_64:
; CHECK:       # %bb.0: # %entry
; CHECK-NEXT:    kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf]
; CHECK-NEXT:    vfmsub231sd %xmm0, %xmm1, %xmm2 {%k1} # encoding: [0x62,0xf2,0xf5,0x09,0xbb,0xd0]
; CHECK-NEXT:    # xmm2 {%k1} = (xmm1 * xmm0) - xmm2
; CHECK-NEXT:    vmovapd %xmm2, %xmm0 # encoding: [0xc5,0xf9,0x28,0xc2]
; CHECK-NEXT:    retq # encoding: [0xc3]
entry:
  %0 = extractelement <2 x double> %a, i64 0
  %1 = extractelement <2 x double> %b, i64 0
  %2 = extractelement <2 x double> %c, i64 0
  %3 = fmul fast double %1, %0
  %4 = fsub fast double %3, %2
  %5 = bitcast i8 %k to <8 x i1>
  %6 = extractelement <8 x i1> %5, i64 0
  %7 = select i1 %6, double %4, double %2
  %8 = insertelement <2 x double> %c, double %7, i64 0
  ret <2 x double> %8
}

; Merge-masked scalar f32 fnmadd. The <2 x double> arguments are reinterpreted
; as <4 x float>; lane 0 of the result is c[0] - (b[0] * a[0]) when bit 0 of %k
; is set and the original a[0] otherwise, with the remaining lanes taken from %a.
; The assertions expect a single vfnmadd213ss with a {%k1} write mask.
define <2 x double> @combine_scalar_mask_fnmadd_f32(<2 x double> %a, i8 zeroext %k, <2 x double> %b, <2 x double> %c) {
; CHECK-LABEL: combine_scalar_mask_fnmadd_f32:
; CHECK:       # %bb.0: # %entry
; CHECK-NEXT:    kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf]
; CHECK-NEXT:    vfnmadd213ss %xmm2, %xmm1, %xmm0 {%k1} # encoding: [0x62,0xf2,0x75,0x09,0xad,0xc2]
; CHECK-NEXT:    # xmm0 {%k1} = -(xmm1 * xmm0) + xmm2
; CHECK-NEXT:    retq # encoding: [0xc3]
entry:
  %0 = bitcast <2 x double> %a to <4 x float>
  %1 = bitcast <2 x double> %b to <4 x float>
  %2 = bitcast <2 x double> %c to <4 x float>
  %3 = extractelement <4 x float> %0, i64 0
  %4 = extractelement <4 x float> %1, i64 0
  %5 = extractelement <4 x float> %2, i64 0
  %6 = fmul fast float %4, %3
  %7 = fsub fast float %5, %6
  %8 = bitcast i8 %k to <8 x i1>
  %9 = extractelement <8 x i1> %8, i64 0
  %10 = select i1 %9, float %7, float %3
  %11 = insertelement <4 x float> %0, float %10, i64 0
  %12 = bitcast <4 x float> %11 to <2 x double>
  ret <2 x double> %12
}

; Merge-masked scalar f64 fnmadd. Lane 0 of the result is c[0] - (b[0] * a[0])
; when bit 0 of %k is set and the original a[0] otherwise; lane 1 comes from %a.
; The assertions expect a single vfnmadd213sd with a {%k1} write mask.
define <2 x double> @combine_scalar_mask_fnmadd_f64(<2 x double> %a, i8 zeroext %k, <2 x double> %b, <2 x double> %c) {
; CHECK-LABEL: combine_scalar_mask_fnmadd_f64:
; CHECK:       # %bb.0: # %entry
; CHECK-NEXT:    kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf]
; CHECK-NEXT:    vfnmadd213sd %xmm2, %xmm1, %xmm0 {%k1} # encoding: [0x62,0xf2,0xf5,0x09,0xad,0xc2]
; CHECK-NEXT:    # xmm0 {%k1} = -(xmm1 * xmm0) + xmm2
; CHECK-NEXT:    retq # encoding: [0xc3]
entry:
  %0 = extractelement <2 x double> %a, i64 0
  %1 = extractelement <2 x double> %b, i64 0
  %2 = extractelement <2 x double> %c, i64 0
  %3 = fmul fast double %1, %0
  %4 = fsub fast double %2, %3
  %5 = bitcast i8 %k to <8 x i1>
  %6 = extractelement <8 x i1> %5, i64 0
  %7 = select i1 %6, double %4, double %0
  %8 = insertelement <2 x double> %a, double %7, i64 0
  ret <2 x double> %8
}

; Zero-masked scalar f32 fnmadd. The <2 x double> arguments are reinterpreted
; as <4 x float>; lane 0 of the result is c[0] - (b[0] * a[0]) when bit 0 of %k
; is set and 0.0 otherwise, with the remaining lanes taken from %a.
; The assertions expect a single vfnmadd213ss with {%k1} {z} masking.
define <2 x double> @combine_scalar_maskz_fnmadd_32(i8 zeroext %k, <2 x double> %a, <2 x double> %b, <2 x double> %c) {
; CHECK-LABEL: combine_scalar_maskz_fnmadd_32:
; CHECK:       # %bb.0: # %entry
; CHECK-NEXT:    kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf]
; CHECK-NEXT:    vfnmadd213ss %xmm2, %xmm1, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0x75,0x89,0xad,0xc2]
; CHECK-NEXT:    # xmm0 {%k1} {z} = -(xmm1 * xmm0) + xmm2
; CHECK-NEXT:    retq # encoding: [0xc3]
entry:
  %0 = bitcast <2 x double> %a to <4 x float>
  %1 = bitcast <2 x double> %b to <4 x float>
  %2 = bitcast <2 x double> %c to <4 x float>
  %3 = extractelement <4 x float> %0, i64 0
  %4 = extractelement <4 x float> %1, i64 0
  %5 = extractelement <4 x float> %2, i64 0
  %6 = fmul fast float %4, %3
  %7 = fsub fast float %5, %6
  %8 = bitcast i8 %k to <8 x i1>
  %9 = extractelement <8 x i1> %8, i64 0
  %10 = select i1 %9, float %7, float 0.000000e+00
  %11 = insertelement <4 x float> %0, float %10, i64 0
  %12 = bitcast <4 x float> %11 to <2 x double>
  ret <2 x double> %12
}

; Zero-masked scalar f64 fnmadd. Lane 0 of the result is c[0] - (b[0] * a[0])
; when bit 0 of %k is set and 0.0 otherwise; lane 1 comes from %a.
; The assertions expect a single vfnmadd213sd with {%k1} {z} masking.
define <2 x double> @combine_scalar_maskz_fnmadd_64(i8 zeroext %k, <2 x double> %a, <2 x double> %b, <2 x double> %c) {
; CHECK-LABEL: combine_scalar_maskz_fnmadd_64:
; CHECK:       # %bb.0: # %entry
; CHECK-NEXT:    kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf]
; CHECK-NEXT:    vfnmadd213sd %xmm2, %xmm1, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0xf5,0x89,0xad,0xc2]
; CHECK-NEXT:    # xmm0 {%k1} {z} = -(xmm1 * xmm0) + xmm2
; CHECK-NEXT:    retq # encoding: [0xc3]
entry:
  %0 = extractelement <2 x double> %a, i64 0
  %1 = extractelement <2 x double> %b, i64 0
  %2 = extractelement <2 x double> %c, i64 0
  %3 = fmul fast double %1, %0
  %4 = fsub fast double %2, %3
  %5 = bitcast i8 %k to <8 x i1>
  %6 = extractelement <8 x i1> %5, i64 0
  %7 = select i1 %6, double %4, double 0.000000e+00
  %8 = insertelement <2 x double> %a, double %7, i64 0
  ret <2 x double> %8
}

; "mask3" scalar f32 fnmadd: the pass-through operand is the addend %c.
; The <2 x double> arguments are reinterpreted as <4 x float>; lane 0 of the
; result is c[0] - (b[0] * a[0]) when bit 0 of %k is set and the original c[0]
; otherwise, with the remaining lanes taken from %c.
; The assertions expect a merge-masked vfnmadd231ss that writes the register
; holding %c, followed by a move of that register into xmm0.
define <2 x double> @combine_scalar_mask3_fnmadd_32(<2 x double> %a, <2 x double> %b, <2 x double> %c, i8 zeroext %k) {
; CHECK-LABEL: combine_scalar_mask3_fnmadd_32:
; CHECK:       # %bb.0: # %entry
; CHECK-NEXT:    kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf]
; CHECK-NEXT:    vfnmadd231ss %xmm0, %xmm1, %xmm2 {%k1} # encoding: [0x62,0xf2,0x75,0x09,0xbd,0xd0]
; CHECK-NEXT:    # xmm2 {%k1} = -(xmm1 * xmm0) + xmm2
; CHECK-NEXT:    vmovaps %xmm2, %xmm0 # encoding: [0xc5,0xf8,0x28,0xc2]
; CHECK-NEXT:    retq # encoding: [0xc3]
entry:
  %0 = bitcast <2 x double> %a to <4 x float>
  %1 = bitcast <2 x double> %b to <4 x float>
  %2 = bitcast <2 x double> %c to <4 x float>
  %3 = extractelement <4 x float> %0, i64 0
  %4 = extractelement <4 x float> %1, i64 0
  %5 = extractelement <4 x float> %2, i64 0
  %6 = fmul fast float %4, %3
  %7 = fsub fast float %5, %6
  %8 = bitcast i8 %k to <8 x i1>
  %9 = extractelement <8 x i1> %8, i64 0
  %10 = select i1 %9, float %7, float %5
  %11 = insertelement <4 x float> %2, float %10, i64 0
  %12 = bitcast <4 x float> %11 to <2 x double>
  ret <2 x double> %12
}

; "mask3" scalar f64 fnmadd: the pass-through operand is the addend %c.
; Lane 0 of the result is c[0] - (b[0] * a[0]) when bit 0 of %k is set and the
; original c[0] otherwise; lane 1 comes from %c.
; The assertions expect a merge-masked vfnmadd231sd that writes the register
; holding %c, followed by a move of that register into xmm0.
define <2 x double> @combine_scalar_mask3_fnmadd_64(<2 x double> %a, <2 x double> %b, <2 x double> %c, i8 zeroext %k) {
; CHECK-LABEL: combine_scalar_mask3_fnmadd_64:
; CHECK:       # %bb.0: # %entry
; CHECK-NEXT:    kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf]
; CHECK-NEXT:    vfnmadd231sd %xmm0, %xmm1, %xmm2 {%k1} # encoding: [0x62,0xf2,0xf5,0x09,0xbd,0xd0]
; CHECK-NEXT:    # xmm2 {%k1} = -(xmm1 * xmm0) + xmm2
; CHECK-NEXT:    vmovapd %xmm2, %xmm0 # encoding: [0xc5,0xf9,0x28,0xc2]
; CHECK-NEXT:    retq # encoding: [0xc3]
entry:
  %0 = extractelement <2 x double> %a, i64 0
  %1 = extractelement <2 x double> %b, i64 0
  %2 = extractelement <2 x double> %c, i64 0
  %3 = fmul fast double %1, %0
  %4 = fsub fast double %2, %3
  %5 = bitcast i8 %k to <8 x i1>
  %6 = extractelement <8 x i1> %5, i64 0
  %7 = select i1 %6, double %4, double %2
  %8 = insertelement <2 x double> %c, double %7, i64 0
  ret <2 x double> %8
}

; Merge-masked scalar f32 fnmsub. The <2 x double> arguments are reinterpreted
; as <4 x float>; lane 0 of the result is (-c[0]) - (b[0] * a[0]) when bit 0 of
; %k is set and the original a[0] otherwise, with the remaining lanes taken
; from %a. The negation of c[0] is spelled as `fsub fast -0.0, c[0]`.
; The assertions expect a single vfnmsub213ss with a {%k1} write mask.
define <2 x double> @combine_scalar_mask_fnmsub_f32(<2 x double> %a, i8 zeroext %k, <2 x double> %b, <2 x double> %c) {
; CHECK-LABEL: combine_scalar_mask_fnmsub_f32:
; CHECK:       # %bb.0: # %entry
; CHECK-NEXT:    kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf]
; CHECK-NEXT:    vfnmsub213ss %xmm2, %xmm1, %xmm0 {%k1} # encoding: [0x62,0xf2,0x75,0x09,0xaf,0xc2]
; CHECK-NEXT:    # xmm0 {%k1} = -(xmm1 * xmm0) - xmm2
; CHECK-NEXT:    retq # encoding: [0xc3]
entry:
  %0 = bitcast <2 x double> %a to <4 x float>
  %1 = bitcast <2 x double> %b to <4 x float>
  %2 = bitcast <2 x double> %c to <4 x float>
  %3 = extractelement <4 x float> %0, i64 0
  %4 = extractelement <4 x float> %1, i64 0
  %5 = extractelement <4 x float> %2, i64 0
  %sub = fsub fast float -0.000000e+00, %5
  %6 = fmul fast float %4, %3
  %7 = fsub fast float %sub, %6
  %8 = bitcast i8 %k to <8 x i1>
  %9 = extractelement <8 x i1> %8, i64 0
  %10 = select i1 %9, float %7, float %3
  %11 = insertelement <4 x float> %0, float %10, i64 0
  %12 = bitcast <4 x float> %11 to <2 x double>
  ret <2 x double> %12
}

; Merge-masked scalar f64 fnmsub. Lane 0 of the result is
; (-c[0]) - (b[0] * a[0]) when bit 0 of %k is set and the original a[0]
; otherwise; lane 1 comes from %a. The negation of c[0] is spelled as
; `fsub fast -0.0, c[0]`.
; The assertions expect a single vfnmsub213sd with a {%k1} write mask.
define <2 x double> @combine_scalar_mask_fnmsub_f64(<2 x double> %a, i8 zeroext %k, <2 x double> %b, <2 x double> %c) {
; CHECK-LABEL: combine_scalar_mask_fnmsub_f64:
; CHECK:       # %bb.0: # %entry
; CHECK-NEXT:    kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf]
; CHECK-NEXT:    vfnmsub213sd %xmm2, %xmm1, %xmm0 {%k1} # encoding: [0x62,0xf2,0xf5,0x09,0xaf,0xc2]
; CHECK-NEXT:    # xmm0 {%k1} = -(xmm1 * xmm0) - xmm2
; CHECK-NEXT:    retq # encoding: [0xc3]
entry:
  %0 = extractelement <2 x double> %a, i64 0
  %1 = extractelement <2 x double> %b, i64 0
  %2 = extractelement <2 x double> %c, i64 0
  %sub = fsub fast double -0.000000e+00, %2
  %3 = fmul fast double %1, %0
  %4 = fsub fast double %sub, %3
  %5 = bitcast i8 %k to <8 x i1>
  %6 = extractelement <8 x i1> %5, i64 0
  %7 = select i1 %6, double %4, double %0
  %8 = insertelement <2 x double> %a, double %7, i64 0
  ret <2 x double> %8
}

; Zero-masked scalar f32 fnmsub. The <2 x double> arguments are reinterpreted
; as <4 x float>; lane 0 of the result is (-c[0]) - (b[0] * a[0]) when bit 0 of
; %k is set and 0.0 otherwise, with the remaining lanes taken from %a. The
; negation of c[0] is spelled as `fsub fast -0.0, c[0]`.
; The assertions expect a single vfnmsub213ss with {%k1} {z} masking.
define <2 x double> @combine_scalar_maskz_fnmsub_32(i8 zeroext %k, <2 x double> %a, <2 x double> %b, <2 x double> %c) {
; CHECK-LABEL: combine_scalar_maskz_fnmsub_32:
; CHECK:       # %bb.0: # %entry
; CHECK-NEXT:    kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf]
; CHECK-NEXT:    vfnmsub213ss %xmm2, %xmm1, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0x75,0x89,0xaf,0xc2]
; CHECK-NEXT:    # xmm0 {%k1} {z} = -(xmm1 * xmm0) - xmm2
; CHECK-NEXT:    retq # encoding: [0xc3]
entry:
  %0 = bitcast <2 x double> %a to <4 x float>
  %1 = bitcast <2 x double> %b to <4 x float>
  %2 = bitcast <2 x double> %c to <4 x float>
  %3 = extractelement <4 x float> %0, i64 0
  %4 = extractelement <4 x float> %1, i64 0
  %5 = extractelement <4 x float> %2, i64 0
  %sub = fsub fast float -0.000000e+00, %5
  %6 = fmul fast float %4, %3
  %7 = fsub fast float %sub, %6
  %8 = bitcast i8 %k to <8 x i1>
  %9 = extractelement <8 x i1> %8, i64 0
  %10 = select i1 %9, float %7, float 0.000000e+00
  %11 = insertelement <4 x float> %0, float %10, i64 0
  %12 = bitcast <4 x float> %11 to <2 x double>
  ret <2 x double> %12
}

; Zero-masked scalar f64 fnmsub. Lane 0 of the result is
; (-c[0]) - (b[0] * a[0]) when bit 0 of %k is set and 0.0 otherwise; lane 1
; comes from %a. The negation of c[0] is spelled as `fsub fast -0.0, c[0]`.
; The assertions expect a single vfnmsub213sd with {%k1} {z} masking.
define <2 x double> @combine_scalar_maskz_fnmsub_64(i8 zeroext %k, <2 x double> %a, <2 x double> %b, <2 x double> %c) {
; CHECK-LABEL: combine_scalar_maskz_fnmsub_64:
; CHECK:       # %bb.0: # %entry
; CHECK-NEXT:    kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf]
; CHECK-NEXT:    vfnmsub213sd %xmm2, %xmm1, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0xf5,0x89,0xaf,0xc2]
; CHECK-NEXT:    # xmm0 {%k1} {z} = -(xmm1 * xmm0) - xmm2
; CHECK-NEXT:    retq # encoding: [0xc3]
entry:
  %0 = extractelement <2 x double> %a, i64 0
  %1 = extractelement <2 x double> %b, i64 0
  %2 = extractelement <2 x double> %c, i64 0
  %sub = fsub fast double -0.000000e+00, %2
  %3 = fmul fast double %1, %0
  %4 = fsub fast double %sub, %3
  %5 = bitcast i8 %k to <8 x i1>
  %6 = extractelement <8 x i1> %5, i64 0
  %7 = select i1 %6, double %4, double 0.000000e+00
  %8 = insertelement <2 x double> %a, double %7, i64 0
  ret <2 x double> %8
}

; "mask3" scalar f32 fnmsub: the pass-through operand is %c.
; The <2 x double> arguments are reinterpreted as <4 x float>; lane 0 of the
; result is (-c[0]) - (b[0] * a[0]) when bit 0 of %k is set and the original
; c[0] otherwise, with the remaining lanes taken from %c. The negation of c[0]
; is spelled as `fsub fast -0.0, c[0]`.
; The assertions expect a merge-masked vfnmsub231ss that writes the register
; holding %c, followed by a move of that register into xmm0.
define <2 x double> @combine_scalar_mask3_fnmsub_32(<2 x double> %a, <2 x double> %b, <2 x double> %c, i8 zeroext %k) {
; CHECK-LABEL: combine_scalar_mask3_fnmsub_32:
; CHECK:       # %bb.0: # %entry
; CHECK-NEXT:    kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf]
; CHECK-NEXT:    vfnmsub231ss %xmm0, %xmm1, %xmm2 {%k1} # encoding: [0x62,0xf2,0x75,0x09,0xbf,0xd0]
; CHECK-NEXT:    # xmm2 {%k1} = -(xmm1 * xmm0) - xmm2
; CHECK-NEXT:    vmovaps %xmm2, %xmm0 # encoding: [0xc5,0xf8,0x28,0xc2]
; CHECK-NEXT:    retq # encoding: [0xc3]
entry:
  %0 = bitcast <2 x double> %a to <4 x float>
  %1 = bitcast <2 x double> %b to <4 x float>
  %2 = bitcast <2 x double> %c to <4 x float>
  %3 = extractelement <4 x float> %0, i64 0
  %4 = extractelement <4 x float> %1, i64 0
  %5 = extractelement <4 x float> %2, i64 0
  %sub = fsub fast float -0.000000e+00, %5
  %6 = fmul fast float %4, %3
  %7 = fsub fast float %sub, %6
  %8 = bitcast i8 %k to <8 x i1>
  %9 = extractelement <8 x i1> %8, i64 0
  %10 = select i1 %9, float %7, float %5
  %11 = insertelement <4 x float> %2, float %10, i64 0
  %12 = bitcast <4 x float> %11 to <2 x double>
  ret <2 x double> %12
}

; "mask3" scalar f64 fnmsub: the pass-through operand is %c.
; Lane 0 of the result is (-c[0]) - (b[0] * a[0]) when bit 0 of %k is set and
; the original c[0] otherwise; lane 1 comes from %c. The negation of c[0] is
; spelled as `fsub fast -0.0, c[0]`.
; The assertions expect a merge-masked vfnmsub231sd that writes the register
; holding %c, followed by a move of that register into xmm0.
define <2 x double> @combine_scalar_mask3_fnmsub_64(<2 x double> %a, <2 x double> %b, <2 x double> %c, i8 zeroext %k) {
; CHECK-LABEL: combine_scalar_mask3_fnmsub_64:
; CHECK:       # %bb.0: # %entry
; CHECK-NEXT:    kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf]
; CHECK-NEXT:    vfnmsub231sd %xmm0, %xmm1, %xmm2 {%k1} # encoding: [0x62,0xf2,0xf5,0x09,0xbf,0xd0]
; CHECK-NEXT:    # xmm2 {%k1} = -(xmm1 * xmm0) - xmm2
; CHECK-NEXT:    vmovapd %xmm2, %xmm0 # encoding: [0xc5,0xf9,0x28,0xc2]
; CHECK-NEXT:    retq # encoding: [0xc3]
entry:
  %0 = extractelement <2 x double> %a, i64 0
  %1 = extractelement <2 x double> %b, i64 0
  %2 = extractelement <2 x double> %c, i64 0
  %sub = fsub fast double -0.000000e+00, %2
  %3 = fmul fast double %1, %0
  %4 = fsub fast double %sub, %3
  %5 = bitcast i8 %k to <8 x i1>
  %6 = extractelement <8 x i1> %5, i64 0
  %7 = select i1 %6, double %4, double %2
  %8 = insertelement <2 x double> %c, double %7, i64 0
  ret <2 x double> %8
}

; Don't fold into (fmul x, c1+c2) if reassoc not set.
; Both multiplies and the add carry only the `contract` flag, so the
; assertions expect one multiply to stay a separate vmulss and the other to be
; fused with the add into vfmadd132ss; the two constants (10.0 and 11.0) must
; not be combined into a single multiply.
define float @fma_const_fmul(float %x) {
; CHECK-LABEL: fma_const_fmul:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmulss {{.*}}(%rip), %xmm0, %xmm1 # EVEX TO VEX Compression encoding: [0xc5,0xfa,0x59,0x0d,A,A,A,A]
; CHECK-NEXT:    # fixup A - offset: 4, value: {{\.LCPI.*}}-4, kind: reloc_riprel_4byte
; CHECK-NEXT:    vfmadd132ss {{.*}}(%rip), %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x71,0x99,0x05,A,A,A,A]
; CHECK-NEXT:    # fixup A - offset: 5, value: {{\.LCPI.*}}-4, kind: reloc_riprel_4byte
; CHECK-NEXT:    # xmm0 = (xmm0 * mem) + xmm1
; CHECK-NEXT:    retq # encoding: [0xc3]
  %mul1 = fmul contract float %x, 10.0
  %mul2 = fmul contract float %x, 11.0
  %add1 = fadd contract float %mul1, %mul2
  ret float %add1
}
