; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+avx512vl --show-mc-encoding | FileCheck %s --check-prefix=X86
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vl --show-mc-encoding | FileCheck %s --check-prefix=X64

; NOTE: This should use IR equivalent to what is generated by clang/test/CodeGen/avx512vl-builtins.c
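;
; For reference, each function below mirrors one of the clang masked-FMA
; intrinsics. As an illustrative sketch (not part of the test, and assuming
; the usual immintrin.h signature for _mm_mask_fmadd_pd), the first function
; corresponds to C along these lines:
;   __m128d test_mm_mask_fmadd_pd(__m128d __A, __mmask8 __U, __m128d __B, __m128d __C) {
;     return _mm_mask_fmadd_pd(__A, __U, __B, __C);
;   }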

define <2 x double> @test_mm_mask_fmadd_pd(<2 x double> %__A, i8 zeroext %__U, <2 x double> %__B, <2 x double> %__C) {
; X86-LABEL: test_mm_mask_fmadd_pd:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04]
; X86-NEXT:    kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8]
; X86-NEXT:    vfmadd132pd %xmm1, %xmm2, %xmm0 {%k1} # encoding: [0x62,0xf2,0xed,0x09,0x98,0xc1]
; X86-NEXT:    # xmm0 {%k1} = (xmm0 * xmm1) + xmm2
; X86-NEXT:    retl # encoding: [0xc3]
;
; X64-LABEL: test_mm_mask_fmadd_pd:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf]
; X64-NEXT:    vfmadd132pd %xmm1, %xmm2, %xmm0 {%k1} # encoding: [0x62,0xf2,0xed,0x09,0x98,0xc1]
; X64-NEXT:    # xmm0 {%k1} = (xmm0 * xmm1) + xmm2
; X64-NEXT:    retq # encoding: [0xc3]
entry:
  %0 = tail call <2 x double> @llvm.fma.v2f64(<2 x double> %__A, <2 x double> %__B, <2 x double> %__C) #9
  %1 = bitcast i8 %__U to <8 x i1>
  %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <2 x i32> <i32 0, i32 1>
  %2 = select <2 x i1> %extract.i, <2 x double> %0, <2 x double> %__A
  ret <2 x double> %2
}
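
; Each masked test in this file follows the same IR idiom: the i8 mask is
; bitcast to <8 x i1>, the low lanes are extracted with a shufflevector,
; and a select blends the llvm.fma result with the passthru (or zero)
; vector. The CHECK lines verify that the backend folds this pattern into
; a single {%k1}-masked FMA instruction.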

define <2 x double> @test_mm_mask_fmsub_pd(<2 x double> %__A, i8 zeroext %__U, <2 x double> %__B, <2 x double> %__C) {
; X86-LABEL: test_mm_mask_fmsub_pd:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04]
; X86-NEXT:    kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8]
; X86-NEXT:    vfmsub132pd %xmm1, %xmm2, %xmm0 {%k1} # encoding: [0x62,0xf2,0xed,0x09,0x9a,0xc1]
; X86-NEXT:    # xmm0 {%k1} = (xmm0 * xmm1) - xmm2
; X86-NEXT:    retl # encoding: [0xc3]
;
; X64-LABEL: test_mm_mask_fmsub_pd:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf]
; X64-NEXT:    vfmsub132pd %xmm1, %xmm2, %xmm0 {%k1} # encoding: [0x62,0xf2,0xed,0x09,0x9a,0xc1]
; X64-NEXT:    # xmm0 {%k1} = (xmm0 * xmm1) - xmm2
; X64-NEXT:    retq # encoding: [0xc3]
entry:
  %sub.i = fsub <2 x double> <double -0.000000e+00, double -0.000000e+00>, %__C
  %0 = tail call <2 x double> @llvm.fma.v2f64(<2 x double> %__A, <2 x double> %__B, <2 x double> %sub.i) #9
  %1 = bitcast i8 %__U to <8 x i1>
  %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <2 x i32> <i32 0, i32 1>
  %2 = select <2 x i1> %extract.i, <2 x double> %0, <2 x double> %__A
  ret <2 x double> %2
}

define <2 x double> @test_mm_mask3_fmadd_pd(<2 x double> %__A, <2 x double> %__B, <2 x double> %__C, i8 zeroext %__U) {
; X86-LABEL: test_mm_mask3_fmadd_pd:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04]
; X86-NEXT:    kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8]
; X86-NEXT:    vfmadd231pd %xmm1, %xmm0, %xmm2 {%k1} # encoding: [0x62,0xf2,0xfd,0x09,0xb8,0xd1]
; X86-NEXT:    # xmm2 {%k1} = (xmm0 * xmm1) + xmm2
; X86-NEXT:    vmovapd %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x28,0xc2]
; X86-NEXT:    retl # encoding: [0xc3]
;
; X64-LABEL: test_mm_mask3_fmadd_pd:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf]
; X64-NEXT:    vfmadd231pd %xmm1, %xmm0, %xmm2 {%k1} # encoding: [0x62,0xf2,0xfd,0x09,0xb8,0xd1]
; X64-NEXT:    # xmm2 {%k1} = (xmm0 * xmm1) + xmm2
; X64-NEXT:    vmovapd %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x28,0xc2]
; X64-NEXT:    retq # encoding: [0xc3]
entry:
  %0 = tail call <2 x double> @llvm.fma.v2f64(<2 x double> %__A, <2 x double> %__B, <2 x double> %__C) #9
  %1 = bitcast i8 %__U to <8 x i1>
  %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <2 x i32> <i32 0, i32 1>
  %2 = select <2 x i1> %extract.i, <2 x double> %0, <2 x double> %__C
  ret <2 x double> %2
}

define <2 x double> @test_mm_mask3_fnmadd_pd(<2 x double> %__A, <2 x double> %__B, <2 x double> %__C, i8 zeroext %__U) {
; X86-LABEL: test_mm_mask3_fnmadd_pd:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04]
; X86-NEXT:    kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8]
; X86-NEXT:    vfnmadd231pd %xmm1, %xmm0, %xmm2 {%k1} # encoding: [0x62,0xf2,0xfd,0x09,0xbc,0xd1]
; X86-NEXT:    # xmm2 {%k1} = -(xmm0 * xmm1) + xmm2
; X86-NEXT:    vmovapd %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x28,0xc2]
; X86-NEXT:    retl # encoding: [0xc3]
;
; X64-LABEL: test_mm_mask3_fnmadd_pd:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf]
; X64-NEXT:    vfnmadd231pd %xmm1, %xmm0, %xmm2 {%k1} # encoding: [0x62,0xf2,0xfd,0x09,0xbc,0xd1]
; X64-NEXT:    # xmm2 {%k1} = -(xmm0 * xmm1) + xmm2
; X64-NEXT:    vmovapd %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x28,0xc2]
; X64-NEXT:    retq # encoding: [0xc3]
entry:
  %sub.i = fsub <2 x double> <double -0.000000e+00, double -0.000000e+00>, %__A
  %0 = tail call <2 x double> @llvm.fma.v2f64(<2 x double> %sub.i, <2 x double> %__B, <2 x double> %__C) #9
  %1 = bitcast i8 %__U to <8 x i1>
  %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <2 x i32> <i32 0, i32 1>
  %2 = select <2 x i1> %extract.i, <2 x double> %0, <2 x double> %__C
  ret <2 x double> %2
}

define <2 x double> @test_mm_maskz_fmadd_pd(i8 zeroext %__U, <2 x double> %__A, <2 x double> %__B, <2 x double> %__C) {
; X86-LABEL: test_mm_maskz_fmadd_pd:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04]
; X86-NEXT:    kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8]
; X86-NEXT:    vfmadd213pd %xmm2, %xmm1, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0xf5,0x89,0xa8,0xc2]
; X86-NEXT:    # xmm0 {%k1} {z} = (xmm1 * xmm0) + xmm2
; X86-NEXT:    retl # encoding: [0xc3]
;
; X64-LABEL: test_mm_maskz_fmadd_pd:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf]
; X64-NEXT:    vfmadd213pd %xmm2, %xmm1, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0xf5,0x89,0xa8,0xc2]
; X64-NEXT:    # xmm0 {%k1} {z} = (xmm1 * xmm0) + xmm2
; X64-NEXT:    retq # encoding: [0xc3]
entry:
  %0 = tail call <2 x double> @llvm.fma.v2f64(<2 x double> %__A, <2 x double> %__B, <2 x double> %__C) #9
  %1 = bitcast i8 %__U to <8 x i1>
  %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <2 x i32> <i32 0, i32 1>
  %2 = select <2 x i1> %extract.i, <2 x double> %0, <2 x double> zeroinitializer
  ret <2 x double> %2
}

define <2 x double> @test_mm_maskz_fmsub_pd(i8 zeroext %__U, <2 x double> %__A, <2 x double> %__B, <2 x double> %__C) {
; X86-LABEL: test_mm_maskz_fmsub_pd:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04]
; X86-NEXT:    kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8]
; X86-NEXT:    vfmsub213pd %xmm2, %xmm1, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0xf5,0x89,0xaa,0xc2]
; X86-NEXT:    # xmm0 {%k1} {z} = (xmm1 * xmm0) - xmm2
; X86-NEXT:    retl # encoding: [0xc3]
;
; X64-LABEL: test_mm_maskz_fmsub_pd:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf]
; X64-NEXT:    vfmsub213pd %xmm2, %xmm1, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0xf5,0x89,0xaa,0xc2]
; X64-NEXT:    # xmm0 {%k1} {z} = (xmm1 * xmm0) - xmm2
; X64-NEXT:    retq # encoding: [0xc3]
entry:
  %sub.i = fsub <2 x double> <double -0.000000e+00, double -0.000000e+00>, %__C
  %0 = tail call <2 x double> @llvm.fma.v2f64(<2 x double> %__A, <2 x double> %__B, <2 x double> %sub.i) #9
  %1 = bitcast i8 %__U to <8 x i1>
  %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <2 x i32> <i32 0, i32 1>
  %2 = select <2 x i1> %extract.i, <2 x double> %0, <2 x double> zeroinitializer
  ret <2 x double> %2
}

define <2 x double> @test_mm_maskz_fnmadd_pd(i8 zeroext %__U, <2 x double> %__A, <2 x double> %__B, <2 x double> %__C) {
; X86-LABEL: test_mm_maskz_fnmadd_pd:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04]
; X86-NEXT:    kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8]
; X86-NEXT:    vfnmadd213pd %xmm2, %xmm1, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0xf5,0x89,0xac,0xc2]
; X86-NEXT:    # xmm0 {%k1} {z} = -(xmm1 * xmm0) + xmm2
; X86-NEXT:    retl # encoding: [0xc3]
;
; X64-LABEL: test_mm_maskz_fnmadd_pd:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf]
; X64-NEXT:    vfnmadd213pd %xmm2, %xmm1, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0xf5,0x89,0xac,0xc2]
; X64-NEXT:    # xmm0 {%k1} {z} = -(xmm1 * xmm0) + xmm2
; X64-NEXT:    retq # encoding: [0xc3]
entry:
  %sub.i = fsub <2 x double> <double -0.000000e+00, double -0.000000e+00>, %__A
  %0 = tail call <2 x double> @llvm.fma.v2f64(<2 x double> %sub.i, <2 x double> %__B, <2 x double> %__C) #9
  %1 = bitcast i8 %__U to <8 x i1>
  %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <2 x i32> <i32 0, i32 1>
  %2 = select <2 x i1> %extract.i, <2 x double> %0, <2 x double> zeroinitializer
  ret <2 x double> %2
}

define <2 x double> @test_mm_maskz_fnmsub_pd(i8 zeroext %__U, <2 x double> %__A, <2 x double> %__B, <2 x double> %__C) {
; X86-LABEL: test_mm_maskz_fnmsub_pd:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04]
; X86-NEXT:    kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8]
; X86-NEXT:    vfnmsub213pd %xmm2, %xmm1, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0xf5,0x89,0xae,0xc2]
; X86-NEXT:    # xmm0 {%k1} {z} = -(xmm1 * xmm0) - xmm2
; X86-NEXT:    retl # encoding: [0xc3]
;
; X64-LABEL: test_mm_maskz_fnmsub_pd:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf]
; X64-NEXT:    vfnmsub213pd %xmm2, %xmm1, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0xf5,0x89,0xae,0xc2]
; X64-NEXT:    # xmm0 {%k1} {z} = -(xmm1 * xmm0) - xmm2
; X64-NEXT:    retq # encoding: [0xc3]
entry:
  %sub.i = fsub <2 x double> <double -0.000000e+00, double -0.000000e+00>, %__A
  %sub1.i = fsub <2 x double> <double -0.000000e+00, double -0.000000e+00>, %__C
  %0 = tail call <2 x double> @llvm.fma.v2f64(<2 x double> %sub.i, <2 x double> %__B, <2 x double> %sub1.i) #9
  %1 = bitcast i8 %__U to <8 x i1>
  %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <2 x i32> <i32 0, i32 1>
  %2 = select <2 x i1> %extract.i, <2 x double> %0, <2 x double> zeroinitializer
  ret <2 x double> %2
}

define <4 x double> @test_mm256_mask_fmadd_pd(<4 x double> %__A, i8 zeroext %__U, <4 x double> %__B, <4 x double> %__C) {
; X86-LABEL: test_mm256_mask_fmadd_pd:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04]
; X86-NEXT:    kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8]
; X86-NEXT:    vfmadd132pd %ymm1, %ymm2, %ymm0 {%k1} # encoding: [0x62,0xf2,0xed,0x29,0x98,0xc1]
; X86-NEXT:    # ymm0 {%k1} = (ymm0 * ymm1) + ymm2
; X86-NEXT:    retl # encoding: [0xc3]
;
; X64-LABEL: test_mm256_mask_fmadd_pd:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf]
; X64-NEXT:    vfmadd132pd %ymm1, %ymm2, %ymm0 {%k1} # encoding: [0x62,0xf2,0xed,0x29,0x98,0xc1]
; X64-NEXT:    # ymm0 {%k1} = (ymm0 * ymm1) + ymm2
; X64-NEXT:    retq # encoding: [0xc3]
entry:
  %0 = tail call <4 x double> @llvm.fma.v4f64(<4 x double> %__A, <4 x double> %__B, <4 x double> %__C) #9
  %1 = bitcast i8 %__U to <8 x i1>
  %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %2 = select <4 x i1> %extract.i, <4 x double> %0, <4 x double> %__A
  ret <4 x double> %2
}

define <4 x double> @test_mm256_mask_fmsub_pd(<4 x double> %__A, i8 zeroext %__U, <4 x double> %__B, <4 x double> %__C) {
; X86-LABEL: test_mm256_mask_fmsub_pd:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04]
; X86-NEXT:    kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8]
; X86-NEXT:    vfmsub132pd %ymm1, %ymm2, %ymm0 {%k1} # encoding: [0x62,0xf2,0xed,0x29,0x9a,0xc1]
; X86-NEXT:    # ymm0 {%k1} = (ymm0 * ymm1) - ymm2
; X86-NEXT:    retl # encoding: [0xc3]
;
; X64-LABEL: test_mm256_mask_fmsub_pd:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf]
; X64-NEXT:    vfmsub132pd %ymm1, %ymm2, %ymm0 {%k1} # encoding: [0x62,0xf2,0xed,0x29,0x9a,0xc1]
; X64-NEXT:    # ymm0 {%k1} = (ymm0 * ymm1) - ymm2
; X64-NEXT:    retq # encoding: [0xc3]
entry:
  %sub.i = fsub <4 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %__C
  %0 = tail call <4 x double> @llvm.fma.v4f64(<4 x double> %__A, <4 x double> %__B, <4 x double> %sub.i) #9
  %1 = bitcast i8 %__U to <8 x i1>
  %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %2 = select <4 x i1> %extract.i, <4 x double> %0, <4 x double> %__A
  ret <4 x double> %2
}

define <4 x double> @test_mm256_mask3_fmadd_pd(<4 x double> %__A, <4 x double> %__B, <4 x double> %__C, i8 zeroext %__U) {
; X86-LABEL: test_mm256_mask3_fmadd_pd:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04]
; X86-NEXT:    kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8]
; X86-NEXT:    vfmadd231pd %ymm1, %ymm0, %ymm2 {%k1} # encoding: [0x62,0xf2,0xfd,0x29,0xb8,0xd1]
; X86-NEXT:    # ymm2 {%k1} = (ymm0 * ymm1) + ymm2
; X86-NEXT:    vmovapd %ymm2, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x28,0xc2]
; X86-NEXT:    retl # encoding: [0xc3]
;
; X64-LABEL: test_mm256_mask3_fmadd_pd:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf]
; X64-NEXT:    vfmadd231pd %ymm1, %ymm0, %ymm2 {%k1} # encoding: [0x62,0xf2,0xfd,0x29,0xb8,0xd1]
; X64-NEXT:    # ymm2 {%k1} = (ymm0 * ymm1) + ymm2
; X64-NEXT:    vmovapd %ymm2, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x28,0xc2]
; X64-NEXT:    retq # encoding: [0xc3]
entry:
  %0 = tail call <4 x double> @llvm.fma.v4f64(<4 x double> %__A, <4 x double> %__B, <4 x double> %__C) #9
  %1 = bitcast i8 %__U to <8 x i1>
  %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %2 = select <4 x i1> %extract.i, <4 x double> %0, <4 x double> %__C
  ret <4 x double> %2
}

define <4 x double> @test_mm256_mask3_fnmadd_pd(<4 x double> %__A, <4 x double> %__B, <4 x double> %__C, i8 zeroext %__U) {
; X86-LABEL: test_mm256_mask3_fnmadd_pd:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04]
; X86-NEXT:    kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8]
; X86-NEXT:    vfnmadd231pd %ymm1, %ymm0, %ymm2 {%k1} # encoding: [0x62,0xf2,0xfd,0x29,0xbc,0xd1]
; X86-NEXT:    # ymm2 {%k1} = -(ymm0 * ymm1) + ymm2
; X86-NEXT:    vmovapd %ymm2, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x28,0xc2]
; X86-NEXT:    retl # encoding: [0xc3]
;
; X64-LABEL: test_mm256_mask3_fnmadd_pd:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf]
; X64-NEXT:    vfnmadd231pd %ymm1, %ymm0, %ymm2 {%k1} # encoding: [0x62,0xf2,0xfd,0x29,0xbc,0xd1]
; X64-NEXT:    # ymm2 {%k1} = -(ymm0 * ymm1) + ymm2
; X64-NEXT:    vmovapd %ymm2, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x28,0xc2]
; X64-NEXT:    retq # encoding: [0xc3]
entry:
  %sub.i = fsub <4 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %__A
  %0 = tail call <4 x double> @llvm.fma.v4f64(<4 x double> %sub.i, <4 x double> %__B, <4 x double> %__C) #9
  %1 = bitcast i8 %__U to <8 x i1>
  %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %2 = select <4 x i1> %extract.i, <4 x double> %0, <4 x double> %__C
  ret <4 x double> %2
}

define <4 x double> @test_mm256_maskz_fmadd_pd(i8 zeroext %__U, <4 x double> %__A, <4 x double> %__B, <4 x double> %__C) {
; X86-LABEL: test_mm256_maskz_fmadd_pd:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04]
; X86-NEXT:    kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8]
; X86-NEXT:    vfmadd213pd %ymm2, %ymm1, %ymm0 {%k1} {z} # encoding: [0x62,0xf2,0xf5,0xa9,0xa8,0xc2]
; X86-NEXT:    # ymm0 {%k1} {z} = (ymm1 * ymm0) + ymm2
; X86-NEXT:    retl # encoding: [0xc3]
;
; X64-LABEL: test_mm256_maskz_fmadd_pd:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf]
; X64-NEXT:    vfmadd213pd %ymm2, %ymm1, %ymm0 {%k1} {z} # encoding: [0x62,0xf2,0xf5,0xa9,0xa8,0xc2]
; X64-NEXT:    # ymm0 {%k1} {z} = (ymm1 * ymm0) + ymm2
; X64-NEXT:    retq # encoding: [0xc3]
entry:
  %0 = tail call <4 x double> @llvm.fma.v4f64(<4 x double> %__A, <4 x double> %__B, <4 x double> %__C) #9
  %1 = bitcast i8 %__U to <8 x i1>
  %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %2 = select <4 x i1> %extract.i, <4 x double> %0, <4 x double> zeroinitializer
  ret <4 x double> %2
}

define <4 x double> @test_mm256_maskz_fmsub_pd(i8 zeroext %__U, <4 x double> %__A, <4 x double> %__B, <4 x double> %__C) {
; X86-LABEL: test_mm256_maskz_fmsub_pd:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04]
; X86-NEXT:    kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8]
; X86-NEXT:    vfmsub213pd %ymm2, %ymm1, %ymm0 {%k1} {z} # encoding: [0x62,0xf2,0xf5,0xa9,0xaa,0xc2]
; X86-NEXT:    # ymm0 {%k1} {z} = (ymm1 * ymm0) - ymm2
; X86-NEXT:    retl # encoding: [0xc3]
;
; X64-LABEL: test_mm256_maskz_fmsub_pd:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf]
; X64-NEXT:    vfmsub213pd %ymm2, %ymm1, %ymm0 {%k1} {z} # encoding: [0x62,0xf2,0xf5,0xa9,0xaa,0xc2]
; X64-NEXT:    # ymm0 {%k1} {z} = (ymm1 * ymm0) - ymm2
; X64-NEXT:    retq # encoding: [0xc3]
entry:
  %sub.i = fsub <4 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %__C
  %0 = tail call <4 x double> @llvm.fma.v4f64(<4 x double> %__A, <4 x double> %__B, <4 x double> %sub.i) #9
  %1 = bitcast i8 %__U to <8 x i1>
  %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %2 = select <4 x i1> %extract.i, <4 x double> %0, <4 x double> zeroinitializer
  ret <4 x double> %2
}

define <4 x double> @test_mm256_maskz_fnmadd_pd(i8 zeroext %__U, <4 x double> %__A, <4 x double> %__B, <4 x double> %__C) {
; X86-LABEL: test_mm256_maskz_fnmadd_pd:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04]
; X86-NEXT:    kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8]
; X86-NEXT:    vfnmadd213pd %ymm2, %ymm1, %ymm0 {%k1} {z} # encoding: [0x62,0xf2,0xf5,0xa9,0xac,0xc2]
; X86-NEXT:    # ymm0 {%k1} {z} = -(ymm1 * ymm0) + ymm2
; X86-NEXT:    retl # encoding: [0xc3]
;
; X64-LABEL: test_mm256_maskz_fnmadd_pd:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf]
; X64-NEXT:    vfnmadd213pd %ymm2, %ymm1, %ymm0 {%k1} {z} # encoding: [0x62,0xf2,0xf5,0xa9,0xac,0xc2]
; X64-NEXT:    # ymm0 {%k1} {z} = -(ymm1 * ymm0) + ymm2
; X64-NEXT:    retq # encoding: [0xc3]
entry:
  %sub.i = fsub <4 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %__A
  %0 = tail call <4 x double> @llvm.fma.v4f64(<4 x double> %sub.i, <4 x double> %__B, <4 x double> %__C) #9
  %1 = bitcast i8 %__U to <8 x i1>
  %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %2 = select <4 x i1> %extract.i, <4 x double> %0, <4 x double> zeroinitializer
  ret <4 x double> %2
}

define <4 x double> @test_mm256_maskz_fnmsub_pd(i8 zeroext %__U, <4 x double> %__A, <4 x double> %__B, <4 x double> %__C) {
; X86-LABEL: test_mm256_maskz_fnmsub_pd:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04]
; X86-NEXT:    kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8]
; X86-NEXT:    vfnmsub213pd %ymm2, %ymm1, %ymm0 {%k1} {z} # encoding: [0x62,0xf2,0xf5,0xa9,0xae,0xc2]
; X86-NEXT:    # ymm0 {%k1} {z} = -(ymm1 * ymm0) - ymm2
; X86-NEXT:    retl # encoding: [0xc3]
;
; X64-LABEL: test_mm256_maskz_fnmsub_pd:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf]
; X64-NEXT:    vfnmsub213pd %ymm2, %ymm1, %ymm0 {%k1} {z} # encoding: [0x62,0xf2,0xf5,0xa9,0xae,0xc2]
; X64-NEXT:    # ymm0 {%k1} {z} = -(ymm1 * ymm0) - ymm2
; X64-NEXT:    retq # encoding: [0xc3]
entry:
  %sub.i = fsub <4 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %__A
  %sub1.i = fsub <4 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %__C
  %0 = tail call <4 x double> @llvm.fma.v4f64(<4 x double> %sub.i, <4 x double> %__B, <4 x double> %sub1.i) #9
  %1 = bitcast i8 %__U to <8 x i1>
  %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %2 = select <4 x i1> %extract.i, <4 x double> %0, <4 x double> zeroinitializer
  ret <4 x double> %2
}

define <4 x float> @test_mm_mask_fmadd_ps(<4 x float> %__A, i8 zeroext %__U, <4 x float> %__B, <4 x float> %__C) {
; X86-LABEL: test_mm_mask_fmadd_ps:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04]
; X86-NEXT:    kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8]
; X86-NEXT:    vfmadd132ps %xmm1, %xmm2, %xmm0 {%k1} # encoding: [0x62,0xf2,0x6d,0x09,0x98,0xc1]
; X86-NEXT:    # xmm0 {%k1} = (xmm0 * xmm1) + xmm2
; X86-NEXT:    retl # encoding: [0xc3]
;
; X64-LABEL: test_mm_mask_fmadd_ps:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf]
; X64-NEXT:    vfmadd132ps %xmm1, %xmm2, %xmm0 {%k1} # encoding: [0x62,0xf2,0x6d,0x09,0x98,0xc1]
; X64-NEXT:    # xmm0 {%k1} = (xmm0 * xmm1) + xmm2
; X64-NEXT:    retq # encoding: [0xc3]
entry:
  %0 = tail call <4 x float> @llvm.fma.v4f32(<4 x float> %__A, <4 x float> %__B, <4 x float> %__C) #9
  %1 = bitcast i8 %__U to <8 x i1>
  %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %2 = select <4 x i1> %extract.i, <4 x float> %0, <4 x float> %__A
  ret <4 x float> %2
}

define <4 x float> @test_mm_mask_fmsub_ps(<4 x float> %__A, i8 zeroext %__U, <4 x float> %__B, <4 x float> %__C) {
; X86-LABEL: test_mm_mask_fmsub_ps:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04]
; X86-NEXT:    kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8]
; X86-NEXT:    vfmsub132ps %xmm1, %xmm2, %xmm0 {%k1} # encoding: [0x62,0xf2,0x6d,0x09,0x9a,0xc1]
; X86-NEXT:    # xmm0 {%k1} = (xmm0 * xmm1) - xmm2
; X86-NEXT:    retl # encoding: [0xc3]
;
; X64-LABEL: test_mm_mask_fmsub_ps:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf]
; X64-NEXT:    vfmsub132ps %xmm1, %xmm2, %xmm0 {%k1} # encoding: [0x62,0xf2,0x6d,0x09,0x9a,0xc1]
; X64-NEXT:    # xmm0 {%k1} = (xmm0 * xmm1) - xmm2
; X64-NEXT:    retq # encoding: [0xc3]
entry:
  %sub.i = fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__C
  %0 = tail call <4 x float> @llvm.fma.v4f32(<4 x float> %__A, <4 x float> %__B, <4 x float> %sub.i) #9
  %1 = bitcast i8 %__U to <8 x i1>
  %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %2 = select <4 x i1> %extract.i, <4 x float> %0, <4 x float> %__A
  ret <4 x float> %2
}

define <4 x float> @test_mm_mask3_fmadd_ps(<4 x float> %__A, <4 x float> %__B, <4 x float> %__C, i8 zeroext %__U) {
; X86-LABEL: test_mm_mask3_fmadd_ps:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04]
; X86-NEXT:    kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8]
; X86-NEXT:    vfmadd231ps %xmm1, %xmm0, %xmm2 {%k1} # encoding: [0x62,0xf2,0x7d,0x09,0xb8,0xd1]
; X86-NEXT:    # xmm2 {%k1} = (xmm0 * xmm1) + xmm2
; X86-NEXT:    vmovaps %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0xc2]
; X86-NEXT:    retl # encoding: [0xc3]
;
; X64-LABEL: test_mm_mask3_fmadd_ps:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf]
; X64-NEXT:    vfmadd231ps %xmm1, %xmm0, %xmm2 {%k1} # encoding: [0x62,0xf2,0x7d,0x09,0xb8,0xd1]
; X64-NEXT:    # xmm2 {%k1} = (xmm0 * xmm1) + xmm2
; X64-NEXT:    vmovaps %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0xc2]
; X64-NEXT:    retq # encoding: [0xc3]
entry:
  %0 = tail call <4 x float> @llvm.fma.v4f32(<4 x float> %__A, <4 x float> %__B, <4 x float> %__C) #9
  %1 = bitcast i8 %__U to <8 x i1>
  %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %2 = select <4 x i1> %extract.i, <4 x float> %0, <4 x float> %__C
  ret <4 x float> %2
}

define <4 x float> @test_mm_mask3_fnmadd_ps(<4 x float> %__A, <4 x float> %__B, <4 x float> %__C, i8 zeroext %__U) {
; X86-LABEL: test_mm_mask3_fnmadd_ps:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04]
; X86-NEXT:    kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8]
; X86-NEXT:    vfnmadd231ps %xmm1, %xmm0, %xmm2 {%k1} # encoding: [0x62,0xf2,0x7d,0x09,0xbc,0xd1]
; X86-NEXT:    # xmm2 {%k1} = -(xmm0 * xmm1) + xmm2
; X86-NEXT:    vmovaps %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0xc2]
; X86-NEXT:    retl # encoding: [0xc3]
;
; X64-LABEL: test_mm_mask3_fnmadd_ps:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf]
; X64-NEXT:    vfnmadd231ps %xmm1, %xmm0, %xmm2 {%k1} # encoding: [0x62,0xf2,0x7d,0x09,0xbc,0xd1]
; X64-NEXT:    # xmm2 {%k1} = -(xmm0 * xmm1) + xmm2
; X64-NEXT:    vmovaps %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0xc2]
; X64-NEXT:    retq # encoding: [0xc3]
entry:
  %sub.i = fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__A
  %0 = tail call <4 x float> @llvm.fma.v4f32(<4 x float> %sub.i, <4 x float> %__B, <4 x float> %__C) #9
  %1 = bitcast i8 %__U to <8 x i1>
  %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %2 = select <4 x i1> %extract.i, <4 x float> %0, <4 x float> %__C
  ret <4 x float> %2
}

define <4 x float> @test_mm_maskz_fmadd_ps(i8 zeroext %__U, <4 x float> %__A, <4 x float> %__B, <4 x float> %__C) {
; X86-LABEL: test_mm_maskz_fmadd_ps:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04]
; X86-NEXT:    kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8]
; X86-NEXT:    vfmadd213ps %xmm2, %xmm1, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0x75,0x89,0xa8,0xc2]
; X86-NEXT:    # xmm0 {%k1} {z} = (xmm1 * xmm0) + xmm2
; X86-NEXT:    retl # encoding: [0xc3]
;
; X64-LABEL: test_mm_maskz_fmadd_ps:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf]
; X64-NEXT:    vfmadd213ps %xmm2, %xmm1, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0x75,0x89,0xa8,0xc2]
; X64-NEXT:    # xmm0 {%k1} {z} = (xmm1 * xmm0) + xmm2
; X64-NEXT:    retq # encoding: [0xc3]
entry:
  %0 = tail call <4 x float> @llvm.fma.v4f32(<4 x float> %__A, <4 x float> %__B, <4 x float> %__C) #9
  %1 = bitcast i8 %__U to <8 x i1>
  %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %2 = select <4 x i1> %extract.i, <4 x float> %0, <4 x float> zeroinitializer
  ret <4 x float> %2
}

define <4 x float> @test_mm_maskz_fmsub_ps(i8 zeroext %__U, <4 x float> %__A, <4 x float> %__B, <4 x float> %__C) {
; X86-LABEL: test_mm_maskz_fmsub_ps:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04]
; X86-NEXT:    kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8]
; X86-NEXT:    vfmsub213ps %xmm2, %xmm1, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0x75,0x89,0xaa,0xc2]
; X86-NEXT:    # xmm0 {%k1} {z} = (xmm1 * xmm0) - xmm2
; X86-NEXT:    retl # encoding: [0xc3]
;
; X64-LABEL: test_mm_maskz_fmsub_ps:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf]
; X64-NEXT:    vfmsub213ps %xmm2, %xmm1, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0x75,0x89,0xaa,0xc2]
; X64-NEXT:    # xmm0 {%k1} {z} = (xmm1 * xmm0) - xmm2
; X64-NEXT:    retq # encoding: [0xc3]
entry:
  %sub.i = fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__C
  %0 = tail call <4 x float> @llvm.fma.v4f32(<4 x float> %__A, <4 x float> %__B, <4 x float> %sub.i) #9
  %1 = bitcast i8 %__U to <8 x i1>
  %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %2 = select <4 x i1> %extract.i, <4 x float> %0, <4 x float> zeroinitializer
  ret <4 x float> %2
}

define <4 x float> @test_mm_maskz_fnmadd_ps(i8 zeroext %__U, <4 x float> %__A, <4 x float> %__B, <4 x float> %__C) {
; X86-LABEL: test_mm_maskz_fnmadd_ps:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04]
; X86-NEXT:    kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8]
; X86-NEXT:    vfnmadd213ps %xmm2, %xmm1, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0x75,0x89,0xac,0xc2]
; X86-NEXT:    # xmm0 {%k1} {z} = -(xmm1 * xmm0) + xmm2
; X86-NEXT:    retl # encoding: [0xc3]
;
; X64-LABEL: test_mm_maskz_fnmadd_ps:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf]
; X64-NEXT:    vfnmadd213ps %xmm2, %xmm1, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0x75,0x89,0xac,0xc2]
; X64-NEXT:    # xmm0 {%k1} {z} = -(xmm1 * xmm0) + xmm2
; X64-NEXT:    retq # encoding: [0xc3]
entry:
  %sub.i = fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__A
  %0 = tail call <4 x float> @llvm.fma.v4f32(<4 x float> %sub.i, <4 x float> %__B, <4 x float> %__C) #9
  %1 = bitcast i8 %__U to <8 x i1>
  %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %2 = select <4 x i1> %extract.i, <4 x float> %0, <4 x float> zeroinitializer
  ret <4 x float> %2
}

define <4 x float> @test_mm_maskz_fnmsub_ps(i8 zeroext %__U, <4 x float> %__A, <4 x float> %__B, <4 x float> %__C) {
; X86-LABEL: test_mm_maskz_fnmsub_ps:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04]
; X86-NEXT:    kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8]
; X86-NEXT:    vfnmsub213ps %xmm2, %xmm1, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0x75,0x89,0xae,0xc2]
; X86-NEXT:    # xmm0 {%k1} {z} = -(xmm1 * xmm0) - xmm2
; X86-NEXT:    retl # encoding: [0xc3]
;
; X64-LABEL: test_mm_maskz_fnmsub_ps:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf]
; X64-NEXT:    vfnmsub213ps %xmm2, %xmm1, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0x75,0x89,0xae,0xc2]
; X64-NEXT:    # xmm0 {%k1} {z} = -(xmm1 * xmm0) - xmm2
; X64-NEXT:    retq # encoding: [0xc3]
entry:
  %sub.i = fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__A
  %sub1.i = fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__C
  %0 = tail call <4 x float> @llvm.fma.v4f32(<4 x float> %sub.i, <4 x float> %__B, <4 x float> %sub1.i) #9
  %1 = bitcast i8 %__U to <8 x i1>
  %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %2 = select <4 x i1> %extract.i, <4 x float> %0, <4 x float> zeroinitializer
  ret <4 x float> %2
}

define <8 x float> @test_mm256_mask_fmadd_ps(<8 x float> %__A, i8 zeroext %__U, <8 x float> %__B, <8 x float> %__C) {
; X86-LABEL: test_mm256_mask_fmadd_ps:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04]
; X86-NEXT:    kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8]
; X86-NEXT:    vfmadd132ps %ymm1, %ymm2, %ymm0 {%k1} # encoding: [0x62,0xf2,0x6d,0x29,0x98,0xc1]
; X86-NEXT:    # ymm0 {%k1} = (ymm0 * ymm1) + ymm2
; X86-NEXT:    retl # encoding: [0xc3]
;
; X64-LABEL: test_mm256_mask_fmadd_ps:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf]
; X64-NEXT:    vfmadd132ps %ymm1, %ymm2, %ymm0 {%k1} # encoding: [0x62,0xf2,0x6d,0x29,0x98,0xc1]
; X64-NEXT:    # ymm0 {%k1} = (ymm0 * ymm1) + ymm2
; X64-NEXT:    retq # encoding: [0xc3]
entry:
  %0 = tail call <8 x float> @llvm.fma.v8f32(<8 x float> %__A, <8 x float> %__B, <8 x float> %__C) #9
  %1 = bitcast i8 %__U to <8 x i1>
  %2 = select <8 x i1> %1, <8 x float> %0, <8 x float> %__A
  ret <8 x float> %2
}

define <8 x float> @test_mm256_mask_fmsub_ps(<8 x float> %__A, i8 zeroext %__U, <8 x float> %__B, <8 x float> %__C) {
; X86-LABEL: test_mm256_mask_fmsub_ps:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04]
; X86-NEXT:    kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8]
; X86-NEXT:    vfmsub132ps %ymm1, %ymm2, %ymm0 {%k1} # encoding: [0x62,0xf2,0x6d,0x29,0x9a,0xc1]
; X86-NEXT:    # ymm0 {%k1} = (ymm0 * ymm1) - ymm2
; X86-NEXT:    retl # encoding: [0xc3]
;
; X64-LABEL: test_mm256_mask_fmsub_ps:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf]
; X64-NEXT:    vfmsub132ps %ymm1, %ymm2, %ymm0 {%k1} # encoding: [0x62,0xf2,0x6d,0x29,0x9a,0xc1]
; X64-NEXT:    # ymm0 {%k1} = (ymm0 * ymm1) - ymm2
; X64-NEXT:    retq # encoding: [0xc3]
entry:
  %sub.i = fsub <8 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__C
  %0 = tail call <8 x float> @llvm.fma.v8f32(<8 x float> %__A, <8 x float> %__B, <8 x float> %sub.i) #9
  %1 = bitcast i8 %__U to <8 x i1>
  %2 = select <8 x i1> %1, <8 x float> %0, <8 x float> %__A
  ret <8 x float> %2
}

define <8 x float> @test_mm256_mask3_fmadd_ps(<8 x float> %__A, <8 x float> %__B, <8 x float> %__C, i8 zeroext %__U) {
; X86-LABEL: test_mm256_mask3_fmadd_ps:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04]
; X86-NEXT:    kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8]
; X86-NEXT:    vfmadd231ps %ymm1, %ymm0, %ymm2 {%k1} # encoding: [0x62,0xf2,0x7d,0x29,0xb8,0xd1]
; X86-NEXT:    # ymm2 {%k1} = (ymm0 * ymm1) + ymm2
; X86-NEXT:    vmovaps %ymm2, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfc,0x28,0xc2]
; X86-NEXT:    retl # encoding: [0xc3]
;
; X64-LABEL: test_mm256_mask3_fmadd_ps:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf]
; X64-NEXT:    vfmadd231ps %ymm1, %ymm0, %ymm2 {%k1} # encoding: [0x62,0xf2,0x7d,0x29,0xb8,0xd1]
; X64-NEXT:    # ymm2 {%k1} = (ymm0 * ymm1) + ymm2
; X64-NEXT:    vmovaps %ymm2, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfc,0x28,0xc2]
; X64-NEXT:    retq # encoding: [0xc3]
entry:
  %0 = tail call <8 x float> @llvm.fma.v8f32(<8 x float> %__A, <8 x float> %__B, <8 x float> %__C) #9
  %1 = bitcast i8 %__U to <8 x i1>
  %2 = select <8 x i1> %1, <8 x float> %0, <8 x float> %__C
  ret <8 x float> %2
}

define <8 x float> @test_mm256_mask3_fnmadd_ps(<8 x float> %__A, <8 x float> %__B, <8 x float> %__C, i8 zeroext %__U) {
; X86-LABEL: test_mm256_mask3_fnmadd_ps:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04]
; X86-NEXT:    kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8]
; X86-NEXT:    vfnmadd231ps %ymm1, %ymm0, %ymm2 {%k1} # encoding: [0x62,0xf2,0x7d,0x29,0xbc,0xd1]
; X86-NEXT:    # ymm2 {%k1} = -(ymm0 * ymm1) + ymm2
; X86-NEXT:    vmovaps %ymm2, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfc,0x28,0xc2]
; X86-NEXT:    retl # encoding: [0xc3]
;
; X64-LABEL: test_mm256_mask3_fnmadd_ps:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf]
; X64-NEXT:    vfnmadd231ps %ymm1, %ymm0, %ymm2 {%k1} # encoding: [0x62,0xf2,0x7d,0x29,0xbc,0xd1]
; X64-NEXT:    # ymm2 {%k1} = -(ymm0 * ymm1) + ymm2
; X64-NEXT:    vmovaps %ymm2, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfc,0x28,0xc2]
; X64-NEXT:    retq # encoding: [0xc3]
entry:
  %sub.i = fsub <8 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__A
  %0 = tail call <8 x float> @llvm.fma.v8f32(<8 x float> %sub.i, <8 x float> %__B, <8 x float> %__C) #9
  %1 = bitcast i8 %__U to <8 x i1>
  %2 = select <8 x i1> %1, <8 x float> %0, <8 x float> %__C
  ret <8 x float> %2
}

define <8 x float> @test_mm256_maskz_fmadd_ps(i8 zeroext %__U, <8 x float> %__A, <8 x float> %__B, <8 x float> %__C) {
; X86-LABEL: test_mm256_maskz_fmadd_ps:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04]
; X86-NEXT:    kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8]
; X86-NEXT:    vfmadd213ps %ymm2, %ymm1, %ymm0 {%k1} {z} # encoding: [0x62,0xf2,0x75,0xa9,0xa8,0xc2]
; X86-NEXT:    # ymm0 {%k1} {z} = (ymm1 * ymm0) + ymm2
; X86-NEXT:    retl # encoding: [0xc3]
;
; X64-LABEL: test_mm256_maskz_fmadd_ps:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf]
; X64-NEXT:    vfmadd213ps %ymm2, %ymm1, %ymm0 {%k1} {z} # encoding: [0x62,0xf2,0x75,0xa9,0xa8,0xc2]
; X64-NEXT:    # ymm0 {%k1} {z} = (ymm1 * ymm0) + ymm2
; X64-NEXT:    retq # encoding: [0xc3]
entry:
  %0 = tail call <8 x float> @llvm.fma.v8f32(<8 x float> %__A, <8 x float> %__B, <8 x float> %__C) #9
  %1 = bitcast i8 %__U to <8 x i1>
  %2 = select <8 x i1> %1, <8 x float> %0, <8 x float> zeroinitializer
  ret <8 x float> %2
}

define <8 x float> @test_mm256_maskz_fmsub_ps(i8 zeroext %__U, <8 x float> %__A, <8 x float> %__B, <8 x float> %__C) {
; X86-LABEL: test_mm256_maskz_fmsub_ps:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04]
; X86-NEXT:    kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8]
; X86-NEXT:    vfmsub213ps %ymm2, %ymm1, %ymm0 {%k1} {z} # encoding: [0x62,0xf2,0x75,0xa9,0xaa,0xc2]
; X86-NEXT:    # ymm0 {%k1} {z} = (ymm1 * ymm0) - ymm2
; X86-NEXT:    retl # encoding: [0xc3]
;
; X64-LABEL: test_mm256_maskz_fmsub_ps:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf]
; X64-NEXT:    vfmsub213ps %ymm2, %ymm1, %ymm0 {%k1} {z} # encoding: [0x62,0xf2,0x75,0xa9,0xaa,0xc2]
; X64-NEXT:    # ymm0 {%k1} {z} = (ymm1 * ymm0) - ymm2
; X64-NEXT:    retq # encoding: [0xc3]
entry:
  %sub.i = fsub <8 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__C
  %0 = tail call <8 x float> @llvm.fma.v8f32(<8 x float> %__A, <8 x float> %__B, <8 x float> %sub.i) #9
  %1 = bitcast i8 %__U to <8 x i1>
  %2 = select <8 x i1> %1, <8 x float> %0, <8 x float> zeroinitializer
  ret <8 x float> %2
}

define <8 x float> @test_mm256_maskz_fnmadd_ps(i8 zeroext %__U, <8 x float> %__A, <8 x float> %__B, <8 x float> %__C) {
; X86-LABEL: test_mm256_maskz_fnmadd_ps:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04]
; X86-NEXT:    kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8]
; X86-NEXT:    vfnmadd213ps %ymm2, %ymm1, %ymm0 {%k1} {z} # encoding: [0x62,0xf2,0x75,0xa9,0xac,0xc2]
; X86-NEXT:    # ymm0 {%k1} {z} = -(ymm1 * ymm0) + ymm2
; X86-NEXT:    retl # encoding: [0xc3]
;
; X64-LABEL: test_mm256_maskz_fnmadd_ps:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf]
; X64-NEXT:    vfnmadd213ps %ymm2, %ymm1, %ymm0 {%k1} {z} # encoding: [0x62,0xf2,0x75,0xa9,0xac,0xc2]
; X64-NEXT:    # ymm0 {%k1} {z} = -(ymm1 * ymm0) + ymm2
; X64-NEXT:    retq # encoding: [0xc3]
entry:
  %sub.i = fsub <8 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__A
  %0 = tail call <8 x float> @llvm.fma.v8f32(<8 x float> %sub.i, <8 x float> %__B, <8 x float> %__C) #9
  %1 = bitcast i8 %__U to <8 x i1>
  %2 = select <8 x i1> %1, <8 x float> %0, <8 x float> zeroinitializer
  ret <8 x float> %2
}

define <8 x float> @test_mm256_maskz_fnmsub_ps(i8 zeroext %__U, <8 x float> %__A, <8 x float> %__B, <8 x float> %__C) {
; X86-LABEL: test_mm256_maskz_fnmsub_ps:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04]
; X86-NEXT:    kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8]
; X86-NEXT:    vfnmsub213ps %ymm2, %ymm1, %ymm0 {%k1} {z} # encoding: [0x62,0xf2,0x75,0xa9,0xae,0xc2]
; X86-NEXT:    # ymm0 {%k1} {z} = -(ymm1 * ymm0) - ymm2
; X86-NEXT:    retl # encoding: [0xc3]
;
; X64-LABEL: test_mm256_maskz_fnmsub_ps:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf]
; X64-NEXT:    vfnmsub213ps %ymm2, %ymm1, %ymm0 {%k1} {z} # encoding: [0x62,0xf2,0x75,0xa9,0xae,0xc2]
; X64-NEXT:    # ymm0 {%k1} {z} = -(ymm1 * ymm0) - ymm2
; X64-NEXT:    retq # encoding: [0xc3]
entry:
  %sub.i = fsub <8 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__A
  %sub1.i = fsub <8 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__C
  %0 = tail call <8 x float> @llvm.fma.v8f32(<8 x float> %sub.i, <8 x float> %__B, <8 x float> %sub1.i) #9
  %1 = bitcast i8 %__U to <8 x i1>
  %2 = select <8 x i1> %1, <8 x float> %0, <8 x float> zeroinitializer
  ret <8 x float> %2
}

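; The fmaddsub/fmsubadd tests below have no single llvm.fma form: the IR
; computes both the (a*b)+c and (a*b)-c results and interleaves their lanes
; with a shufflevector. The CHECK lines verify that the backend matches this
; pair-plus-shuffle pattern to one vfmaddsub/vfmsubadd instruction.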
775define <2 x double> @test_mm_mask_fmaddsub_pd(<2 x double> %__A, i8 zeroext %__U, <2 x double> %__B, <2 x double> %__C) {
776; X86-LABEL: test_mm_mask_fmaddsub_pd:
777; X86:       # %bb.0: # %entry
778; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04]
779; X86-NEXT:    kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8]
780; X86-NEXT:    vfmaddsub132pd %xmm1, %xmm2, %xmm0 {%k1} # encoding: [0x62,0xf2,0xed,0x09,0x96,0xc1]
781; X86-NEXT:    # xmm0 {%k1} = (xmm0 * xmm1) +/- xmm2
782; X86-NEXT:    retl # encoding: [0xc3]
783;
784; X64-LABEL: test_mm_mask_fmaddsub_pd:
785; X64:       # %bb.0: # %entry
786; X64-NEXT:    kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf]
787; X64-NEXT:    vfmaddsub132pd %xmm1, %xmm2, %xmm0 {%k1} # encoding: [0x62,0xf2,0xed,0x09,0x96,0xc1]
788; X64-NEXT:    # xmm0 {%k1} = (xmm0 * xmm1) +/- xmm2
789; X64-NEXT:    retq # encoding: [0xc3]
790entry:
791  %0 = tail call <2 x double> @llvm.fma.v2f64(<2 x double> %__A, <2 x double> %__B, <2 x double> %__C) #9
792  %1 = fsub <2 x double> <double -0.000000e+00, double -0.000000e+00>, %__C
793  %2 = tail call <2 x double> @llvm.fma.v2f64(<2 x double> %__A, <2 x double> %__B, <2 x double> %1) #9
794  %3 = shufflevector <2 x double> %2, <2 x double> %0, <2 x i32> <i32 0, i32 3>
795  %4 = bitcast i8 %__U to <8 x i1>
796  %extract.i = shufflevector <8 x i1> %4, <8 x i1> undef, <2 x i32> <i32 0, i32 1>
797  %5 = select <2 x i1> %extract.i, <2 x double> %3, <2 x double> %__A
798  ret <2 x double> %5
799}
800
801define <2 x double> @test_mm_mask_fmsubadd_pd(<2 x double> %__A, i8 zeroext %__U, <2 x double> %__B, <2 x double> %__C) {
802; X86-LABEL: test_mm_mask_fmsubadd_pd:
803; X86:       # %bb.0: # %entry
804; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04]
805; X86-NEXT:    kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8]
806; X86-NEXT:    vfmsubadd132pd %xmm1, %xmm2, %xmm0 {%k1} # encoding: [0x62,0xf2,0xed,0x09,0x97,0xc1]
807; X86-NEXT:    # xmm0 {%k1} = (xmm0 * xmm1) -/+ xmm2
808; X86-NEXT:    retl # encoding: [0xc3]
809;
810; X64-LABEL: test_mm_mask_fmsubadd_pd:
811; X64:       # %bb.0: # %entry
812; X64-NEXT:    kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf]
813; X64-NEXT:    vfmsubadd132pd %xmm1, %xmm2, %xmm0 {%k1} # encoding: [0x62,0xf2,0xed,0x09,0x97,0xc1]
814; X64-NEXT:    # xmm0 {%k1} = (xmm0 * xmm1) -/+ xmm2
815; X64-NEXT:    retq # encoding: [0xc3]
816entry:
817  %sub.i = fsub <2 x double> <double -0.000000e+00, double -0.000000e+00>, %__C
818  %0 = tail call <2 x double> @llvm.fma.v2f64(<2 x double> %__A, <2 x double> %__B, <2 x double> %sub.i) #9
819  %1 = tail call <2 x double> @llvm.fma.v2f64(<2 x double> %__A, <2 x double> %__B, <2 x double> %__C) #9
820  %2 = shufflevector <2 x double> %1, <2 x double> %0, <2 x i32> <i32 0, i32 3>
821  %3 = bitcast i8 %__U to <8 x i1>
822  %extract.i = shufflevector <8 x i1> %3, <8 x i1> undef, <2 x i32> <i32 0, i32 1>
823  %4 = select <2 x i1> %extract.i, <2 x double> %2, <2 x double> %__A
824  ret <2 x double> %4
825}
826
827define <2 x double> @test_mm_mask3_fmaddsub_pd(<2 x double> %__A, <2 x double> %__B, <2 x double> %__C, i8 zeroext %__U) {
828; X86-LABEL: test_mm_mask3_fmaddsub_pd:
829; X86:       # %bb.0: # %entry
830; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04]
831; X86-NEXT:    kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8]
832; X86-NEXT:    vfmaddsub231pd %xmm1, %xmm0, %xmm2 {%k1} # encoding: [0x62,0xf2,0xfd,0x09,0xb6,0xd1]
833; X86-NEXT:    # xmm2 {%k1} = (xmm0 * xmm1) +/- xmm2
834; X86-NEXT:    vmovapd %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x28,0xc2]
835; X86-NEXT:    retl # encoding: [0xc3]
836;
837; X64-LABEL: test_mm_mask3_fmaddsub_pd:
838; X64:       # %bb.0: # %entry
839; X64-NEXT:    kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf]
840; X64-NEXT:    vfmaddsub231pd %xmm1, %xmm0, %xmm2 {%k1} # encoding: [0x62,0xf2,0xfd,0x09,0xb6,0xd1]
841; X64-NEXT:    # xmm2 {%k1} = (xmm0 * xmm1) +/- xmm2
842; X64-NEXT:    vmovapd %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x28,0xc2]
843; X64-NEXT:    retq # encoding: [0xc3]
844entry:
845  %0 = tail call <2 x double> @llvm.fma.v2f64(<2 x double> %__A, <2 x double> %__B, <2 x double> %__C) #9
846  %1 = fsub <2 x double> <double -0.000000e+00, double -0.000000e+00>, %__C
847  %2 = tail call <2 x double> @llvm.fma.v2f64(<2 x double> %__A, <2 x double> %__B, <2 x double> %1) #9
848  %3 = shufflevector <2 x double> %2, <2 x double> %0, <2 x i32> <i32 0, i32 3>
849  %4 = bitcast i8 %__U to <8 x i1>
850  %extract.i = shufflevector <8 x i1> %4, <8 x i1> undef, <2 x i32> <i32 0, i32 1>
851  %5 = select <2 x i1> %extract.i, <2 x double> %3, <2 x double> %__C
852  ret <2 x double> %5
853}
854
855define <2 x double> @test_mm_maskz_fmaddsub_pd(i8 zeroext %__U, <2 x double> %__A, <2 x double> %__B, <2 x double> %__C) {
856; X86-LABEL: test_mm_maskz_fmaddsub_pd:
857; X86:       # %bb.0: # %entry
858; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04]
859; X86-NEXT:    kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8]
860; X86-NEXT:    vfmaddsub213pd %xmm2, %xmm1, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0xf5,0x89,0xa6,0xc2]
861; X86-NEXT:    # xmm0 {%k1} {z} = (xmm1 * xmm0) +/- xmm2
862; X86-NEXT:    retl # encoding: [0xc3]
863;
864; X64-LABEL: test_mm_maskz_fmaddsub_pd:
865; X64:       # %bb.0: # %entry
866; X64-NEXT:    kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf]
867; X64-NEXT:    vfmaddsub213pd %xmm2, %xmm1, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0xf5,0x89,0xa6,0xc2]
868; X64-NEXT:    # xmm0 {%k1} {z} = (xmm1 * xmm0) +/- xmm2
869; X64-NEXT:    retq # encoding: [0xc3]
870entry:
871  %0 = tail call <2 x double> @llvm.fma.v2f64(<2 x double> %__A, <2 x double> %__B, <2 x double> %__C) #9
872  %1 = fsub <2 x double> <double -0.000000e+00, double -0.000000e+00>, %__C
873  %2 = tail call <2 x double> @llvm.fma.v2f64(<2 x double> %__A, <2 x double> %__B, <2 x double> %1) #9
874  %3 = shufflevector <2 x double> %2, <2 x double> %0, <2 x i32> <i32 0, i32 3>
875  %4 = bitcast i8 %__U to <8 x i1>
876  %extract.i = shufflevector <8 x i1> %4, <8 x i1> undef, <2 x i32> <i32 0, i32 1>
877  %5 = select <2 x i1> %extract.i, <2 x double> %3, <2 x double> zeroinitializer
878  ret <2 x double> %5
879}
880
881define <2 x double> @test_mm_maskz_fmsubadd_pd(i8 zeroext %__U, <2 x double> %__A, <2 x double> %__B, <2 x double> %__C) {
882; X86-LABEL: test_mm_maskz_fmsubadd_pd:
883; X86:       # %bb.0: # %entry
884; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04]
885; X86-NEXT:    kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8]
886; X86-NEXT:    vfmsubadd213pd %xmm2, %xmm1, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0xf5,0x89,0xa7,0xc2]
887; X86-NEXT:    # xmm0 {%k1} {z} = (xmm1 * xmm0) -/+ xmm2
888; X86-NEXT:    retl # encoding: [0xc3]
889;
890; X64-LABEL: test_mm_maskz_fmsubadd_pd:
891; X64:       # %bb.0: # %entry
892; X64-NEXT:    kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf]
893; X64-NEXT:    vfmsubadd213pd %xmm2, %xmm1, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0xf5,0x89,0xa7,0xc2]
894; X64-NEXT:    # xmm0 {%k1} {z} = (xmm1 * xmm0) -/+ xmm2
895; X64-NEXT:    retq # encoding: [0xc3]
896entry:
897  %sub.i = fsub <2 x double> <double -0.000000e+00, double -0.000000e+00>, %__C
898  %0 = tail call <2 x double> @llvm.fma.v2f64(<2 x double> %__A, <2 x double> %__B, <2 x double> %sub.i) #9
899  %1 = tail call <2 x double> @llvm.fma.v2f64(<2 x double> %__A, <2 x double> %__B, <2 x double> %__C) #9
900  %2 = shufflevector <2 x double> %1, <2 x double> %0, <2 x i32> <i32 0, i32 3>
901  %3 = bitcast i8 %__U to <8 x i1>
902  %extract.i = shufflevector <8 x i1> %3, <8 x i1> undef, <2 x i32> <i32 0, i32 1>
903  %4 = select <2 x i1> %extract.i, <2 x double> %2, <2 x double> zeroinitializer
904  ret <2 x double> %4
905}
906
907define <4 x double> @test_mm256_mask_fmaddsub_pd(<4 x double> %__A, i8 zeroext %__U, <4 x double> %__B, <4 x double> %__C) {
908; X86-LABEL: test_mm256_mask_fmaddsub_pd:
909; X86:       # %bb.0: # %entry
910; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04]
911; X86-NEXT:    kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8]
912; X86-NEXT:    vfmaddsub132pd %ymm1, %ymm2, %ymm0 {%k1} # encoding: [0x62,0xf2,0xed,0x29,0x96,0xc1]
913; X86-NEXT:    # ymm0 {%k1} = (ymm0 * ymm1) +/- ymm2
914; X86-NEXT:    retl # encoding: [0xc3]
915;
916; X64-LABEL: test_mm256_mask_fmaddsub_pd:
917; X64:       # %bb.0: # %entry
918; X64-NEXT:    kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf]
919; X64-NEXT:    vfmaddsub132pd %ymm1, %ymm2, %ymm0 {%k1} # encoding: [0x62,0xf2,0xed,0x29,0x96,0xc1]
920; X64-NEXT:    # ymm0 {%k1} = (ymm0 * ymm1) +/- ymm2
921; X64-NEXT:    retq # encoding: [0xc3]
922entry:
923  %0 = tail call <4 x double> @llvm.fma.v4f64(<4 x double> %__A, <4 x double> %__B, <4 x double> %__C) #9
924  %1 = fsub <4 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %__C
925  %2 = tail call <4 x double> @llvm.fma.v4f64(<4 x double> %__A, <4 x double> %__B, <4 x double> %1) #9
926  %3 = shufflevector <4 x double> %2, <4 x double> %0, <4 x i32> <i32 0, i32 5, i32 2, i32 7>
927  %4 = bitcast i8 %__U to <8 x i1>
928  %extract.i = shufflevector <8 x i1> %4, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
929  %5 = select <4 x i1> %extract.i, <4 x double> %3, <4 x double> %__A
930  ret <4 x double> %5
931}
932
933define <4 x double> @test_mm256_mask_fmsubadd_pd(<4 x double> %__A, i8 zeroext %__U, <4 x double> %__B, <4 x double> %__C) {
934; X86-LABEL: test_mm256_mask_fmsubadd_pd:
935; X86:       # %bb.0: # %entry
936; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04]
937; X86-NEXT:    kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8]
938; X86-NEXT:    vfmsubadd132pd %ymm1, %ymm2, %ymm0 {%k1} # encoding: [0x62,0xf2,0xed,0x29,0x97,0xc1]
939; X86-NEXT:    # ymm0 {%k1} = (ymm0 * ymm1) -/+ ymm2
940; X86-NEXT:    retl # encoding: [0xc3]
941;
942; X64-LABEL: test_mm256_mask_fmsubadd_pd:
943; X64:       # %bb.0: # %entry
944; X64-NEXT:    kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf]
945; X64-NEXT:    vfmsubadd132pd %ymm1, %ymm2, %ymm0 {%k1} # encoding: [0x62,0xf2,0xed,0x29,0x97,0xc1]
946; X64-NEXT:    # ymm0 {%k1} = (ymm0 * ymm1) -/+ ymm2
947; X64-NEXT:    retq # encoding: [0xc3]
948entry:
949  %sub.i = fsub <4 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %__C
950  %0 = tail call <4 x double> @llvm.fma.v4f64(<4 x double> %__A, <4 x double> %__B, <4 x double> %sub.i) #9
951  %1 = tail call <4 x double> @llvm.fma.v4f64(<4 x double> %__A, <4 x double> %__B, <4 x double> %__C) #9
952  %2 = shufflevector <4 x double> %1, <4 x double> %0, <4 x i32> <i32 0, i32 5, i32 2, i32 7>
953  %3 = bitcast i8 %__U to <8 x i1>
954  %extract.i = shufflevector <8 x i1> %3, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
955  %4 = select <4 x i1> %extract.i, <4 x double> %2, <4 x double> %__A
956  ret <4 x double> %4
957}
958
define <4 x double> @test_mm256_mask3_fmaddsub_pd(<4 x double> %__A, <4 x double> %__B, <4 x double> %__C, i8 zeroext %__U) {
; X86-LABEL: test_mm256_mask3_fmaddsub_pd:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04]
; X86-NEXT:    kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8]
; X86-NEXT:    vfmaddsub231pd %ymm1, %ymm0, %ymm2 {%k1} # encoding: [0x62,0xf2,0xfd,0x29,0xb6,0xd1]
; X86-NEXT:    # ymm2 {%k1} = (ymm0 * ymm1) +/- ymm2
; X86-NEXT:    vmovapd %ymm2, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x28,0xc2]
; X86-NEXT:    retl # encoding: [0xc3]
;
; X64-LABEL: test_mm256_mask3_fmaddsub_pd:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf]
; X64-NEXT:    vfmaddsub231pd %ymm1, %ymm0, %ymm2 {%k1} # encoding: [0x62,0xf2,0xfd,0x29,0xb6,0xd1]
; X64-NEXT:    # ymm2 {%k1} = (ymm0 * ymm1) +/- ymm2
; X64-NEXT:    vmovapd %ymm2, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x28,0xc2]
; X64-NEXT:    retq # encoding: [0xc3]
entry:
  %0 = tail call <4 x double> @llvm.fma.v4f64(<4 x double> %__A, <4 x double> %__B, <4 x double> %__C) #9
  %1 = fsub <4 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %__C
  %2 = tail call <4 x double> @llvm.fma.v4f64(<4 x double> %__A, <4 x double> %__B, <4 x double> %1) #9
  %3 = shufflevector <4 x double> %2, <4 x double> %0, <4 x i32> <i32 0, i32 5, i32 2, i32 7>
  %4 = bitcast i8 %__U to <8 x i1>
  %extract.i = shufflevector <8 x i1> %4, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %5 = select <4 x i1> %extract.i, <4 x double> %3, <4 x double> %__C
  ret <4 x double> %5
}

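; NOTE: The maskz variants select against zeroinitializer, which should fold
; into zeroing masking ({z}) on the FMA itself rather than a separate blend.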
define <4 x double> @test_mm256_maskz_fmaddsub_pd(i8 zeroext %__U, <4 x double> %__A, <4 x double> %__B, <4 x double> %__C) {
; X86-LABEL: test_mm256_maskz_fmaddsub_pd:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04]
; X86-NEXT:    kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8]
; X86-NEXT:    vfmaddsub213pd %ymm2, %ymm1, %ymm0 {%k1} {z} # encoding: [0x62,0xf2,0xf5,0xa9,0xa6,0xc2]
; X86-NEXT:    # ymm0 {%k1} {z} = (ymm1 * ymm0) +/- ymm2
; X86-NEXT:    retl # encoding: [0xc3]
;
; X64-LABEL: test_mm256_maskz_fmaddsub_pd:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf]
; X64-NEXT:    vfmaddsub213pd %ymm2, %ymm1, %ymm0 {%k1} {z} # encoding: [0x62,0xf2,0xf5,0xa9,0xa6,0xc2]
; X64-NEXT:    # ymm0 {%k1} {z} = (ymm1 * ymm0) +/- ymm2
; X64-NEXT:    retq # encoding: [0xc3]
entry:
  %0 = tail call <4 x double> @llvm.fma.v4f64(<4 x double> %__A, <4 x double> %__B, <4 x double> %__C) #9
  %1 = fsub <4 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %__C
  %2 = tail call <4 x double> @llvm.fma.v4f64(<4 x double> %__A, <4 x double> %__B, <4 x double> %1) #9
  %3 = shufflevector <4 x double> %2, <4 x double> %0, <4 x i32> <i32 0, i32 5, i32 2, i32 7>
  %4 = bitcast i8 %__U to <8 x i1>
  %extract.i = shufflevector <8 x i1> %4, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %5 = select <4 x i1> %extract.i, <4 x double> %3, <4 x double> zeroinitializer
  ret <4 x double> %5
}

define <4 x double> @test_mm256_maskz_fmsubadd_pd(i8 zeroext %__U, <4 x double> %__A, <4 x double> %__B, <4 x double> %__C) {
; X86-LABEL: test_mm256_maskz_fmsubadd_pd:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04]
; X86-NEXT:    kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8]
; X86-NEXT:    vfmsubadd213pd %ymm2, %ymm1, %ymm0 {%k1} {z} # encoding: [0x62,0xf2,0xf5,0xa9,0xa7,0xc2]
; X86-NEXT:    # ymm0 {%k1} {z} = (ymm1 * ymm0) -/+ ymm2
; X86-NEXT:    retl # encoding: [0xc3]
;
; X64-LABEL: test_mm256_maskz_fmsubadd_pd:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf]
; X64-NEXT:    vfmsubadd213pd %ymm2, %ymm1, %ymm0 {%k1} {z} # encoding: [0x62,0xf2,0xf5,0xa9,0xa7,0xc2]
; X64-NEXT:    # ymm0 {%k1} {z} = (ymm1 * ymm0) -/+ ymm2
; X64-NEXT:    retq # encoding: [0xc3]
entry:
  %sub.i = fsub <4 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %__C
  %0 = tail call <4 x double> @llvm.fma.v4f64(<4 x double> %__A, <4 x double> %__B, <4 x double> %sub.i) #9
  %1 = tail call <4 x double> @llvm.fma.v4f64(<4 x double> %__A, <4 x double> %__B, <4 x double> %__C) #9
  %2 = shufflevector <4 x double> %1, <4 x double> %0, <4 x i32> <i32 0, i32 5, i32 2, i32 7>
  %3 = bitcast i8 %__U to <8 x i1>
  %extract.i = shufflevector <8 x i1> %3, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %4 = select <4 x i1> %extract.i, <4 x double> %2, <4 x double> zeroinitializer
  ret <4 x double> %4
}

define <4 x float> @test_mm_mask_fmaddsub_ps(<4 x float> %__A, i8 zeroext %__U, <4 x float> %__B, <4 x float> %__C) {
; X86-LABEL: test_mm_mask_fmaddsub_ps:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04]
; X86-NEXT:    kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8]
; X86-NEXT:    vfmaddsub132ps %xmm1, %xmm2, %xmm0 {%k1} # encoding: [0x62,0xf2,0x6d,0x09,0x96,0xc1]
; X86-NEXT:    # xmm0 {%k1} = (xmm0 * xmm1) +/- xmm2
; X86-NEXT:    retl # encoding: [0xc3]
;
; X64-LABEL: test_mm_mask_fmaddsub_ps:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf]
; X64-NEXT:    vfmaddsub132ps %xmm1, %xmm2, %xmm0 {%k1} # encoding: [0x62,0xf2,0x6d,0x09,0x96,0xc1]
; X64-NEXT:    # xmm0 {%k1} = (xmm0 * xmm1) +/- xmm2
; X64-NEXT:    retq # encoding: [0xc3]
entry:
  %0 = tail call <4 x float> @llvm.fma.v4f32(<4 x float> %__A, <4 x float> %__B, <4 x float> %__C) #9
  %1 = fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__C
  %2 = tail call <4 x float> @llvm.fma.v4f32(<4 x float> %__A, <4 x float> %__B, <4 x float> %1) #9
  %3 = shufflevector <4 x float> %2, <4 x float> %0, <4 x i32> <i32 0, i32 5, i32 2, i32 7>
  %4 = bitcast i8 %__U to <8 x i1>
  %extract.i = shufflevector <8 x i1> %4, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %5 = select <4 x i1> %extract.i, <4 x float> %3, <4 x float> %__A
  ret <4 x float> %5
}

define <4 x float> @test_mm_mask_fmsubadd_ps(<4 x float> %__A, i8 zeroext %__U, <4 x float> %__B, <4 x float> %__C) {
; X86-LABEL: test_mm_mask_fmsubadd_ps:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04]
; X86-NEXT:    kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8]
; X86-NEXT:    vfmsubadd132ps %xmm1, %xmm2, %xmm0 {%k1} # encoding: [0x62,0xf2,0x6d,0x09,0x97,0xc1]
; X86-NEXT:    # xmm0 {%k1} = (xmm0 * xmm1) -/+ xmm2
; X86-NEXT:    retl # encoding: [0xc3]
;
; X64-LABEL: test_mm_mask_fmsubadd_ps:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf]
; X64-NEXT:    vfmsubadd132ps %xmm1, %xmm2, %xmm0 {%k1} # encoding: [0x62,0xf2,0x6d,0x09,0x97,0xc1]
; X64-NEXT:    # xmm0 {%k1} = (xmm0 * xmm1) -/+ xmm2
; X64-NEXT:    retq # encoding: [0xc3]
entry:
  %sub.i = fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__C
  %0 = tail call <4 x float> @llvm.fma.v4f32(<4 x float> %__A, <4 x float> %__B, <4 x float> %sub.i) #9
  %1 = tail call <4 x float> @llvm.fma.v4f32(<4 x float> %__A, <4 x float> %__B, <4 x float> %__C) #9
  %2 = shufflevector <4 x float> %1, <4 x float> %0, <4 x i32> <i32 0, i32 5, i32 2, i32 7>
  %3 = bitcast i8 %__U to <8 x i1>
  %extract.i = shufflevector <8 x i1> %3, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %4 = select <4 x i1> %extract.i, <4 x float> %2, <4 x float> %__A
  ret <4 x float> %4
}

define <4 x float> @test_mm_mask3_fmaddsub_ps(<4 x float> %__A, <4 x float> %__B, <4 x float> %__C, i8 zeroext %__U) {
; X86-LABEL: test_mm_mask3_fmaddsub_ps:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04]
; X86-NEXT:    kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8]
; X86-NEXT:    vfmaddsub231ps %xmm1, %xmm0, %xmm2 {%k1} # encoding: [0x62,0xf2,0x7d,0x09,0xb6,0xd1]
; X86-NEXT:    # xmm2 {%k1} = (xmm0 * xmm1) +/- xmm2
; X86-NEXT:    vmovaps %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0xc2]
; X86-NEXT:    retl # encoding: [0xc3]
;
; X64-LABEL: test_mm_mask3_fmaddsub_ps:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf]
; X64-NEXT:    vfmaddsub231ps %xmm1, %xmm0, %xmm2 {%k1} # encoding: [0x62,0xf2,0x7d,0x09,0xb6,0xd1]
; X64-NEXT:    # xmm2 {%k1} = (xmm0 * xmm1) +/- xmm2
; X64-NEXT:    vmovaps %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0xc2]
; X64-NEXT:    retq # encoding: [0xc3]
entry:
  %0 = tail call <4 x float> @llvm.fma.v4f32(<4 x float> %__A, <4 x float> %__B, <4 x float> %__C) #9
  %1 = fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__C
  %2 = tail call <4 x float> @llvm.fma.v4f32(<4 x float> %__A, <4 x float> %__B, <4 x float> %1) #9
  %3 = shufflevector <4 x float> %2, <4 x float> %0, <4 x i32> <i32 0, i32 5, i32 2, i32 7>
  %4 = bitcast i8 %__U to <8 x i1>
  %extract.i = shufflevector <8 x i1> %4, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %5 = select <4 x i1> %extract.i, <4 x float> %3, <4 x float> %__C
  ret <4 x float> %5
}

define <4 x float> @test_mm_maskz_fmaddsub_ps(i8 zeroext %__U, <4 x float> %__A, <4 x float> %__B, <4 x float> %__C) {
; X86-LABEL: test_mm_maskz_fmaddsub_ps:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04]
; X86-NEXT:    kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8]
; X86-NEXT:    vfmaddsub213ps %xmm2, %xmm1, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0x75,0x89,0xa6,0xc2]
; X86-NEXT:    # xmm0 {%k1} {z} = (xmm1 * xmm0) +/- xmm2
; X86-NEXT:    retl # encoding: [0xc3]
;
; X64-LABEL: test_mm_maskz_fmaddsub_ps:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf]
; X64-NEXT:    vfmaddsub213ps %xmm2, %xmm1, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0x75,0x89,0xa6,0xc2]
; X64-NEXT:    # xmm0 {%k1} {z} = (xmm1 * xmm0) +/- xmm2
; X64-NEXT:    retq # encoding: [0xc3]
entry:
  %0 = tail call <4 x float> @llvm.fma.v4f32(<4 x float> %__A, <4 x float> %__B, <4 x float> %__C) #9
  %1 = fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__C
  %2 = tail call <4 x float> @llvm.fma.v4f32(<4 x float> %__A, <4 x float> %__B, <4 x float> %1) #9
  %3 = shufflevector <4 x float> %2, <4 x float> %0, <4 x i32> <i32 0, i32 5, i32 2, i32 7>
  %4 = bitcast i8 %__U to <8 x i1>
  %extract.i = shufflevector <8 x i1> %4, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %5 = select <4 x i1> %extract.i, <4 x float> %3, <4 x float> zeroinitializer
  ret <4 x float> %5
}

define <4 x float> @test_mm_maskz_fmsubadd_ps(i8 zeroext %__U, <4 x float> %__A, <4 x float> %__B, <4 x float> %__C) {
; X86-LABEL: test_mm_maskz_fmsubadd_ps:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04]
; X86-NEXT:    kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8]
; X86-NEXT:    vfmsubadd213ps %xmm2, %xmm1, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0x75,0x89,0xa7,0xc2]
; X86-NEXT:    # xmm0 {%k1} {z} = (xmm1 * xmm0) -/+ xmm2
; X86-NEXT:    retl # encoding: [0xc3]
;
; X64-LABEL: test_mm_maskz_fmsubadd_ps:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf]
; X64-NEXT:    vfmsubadd213ps %xmm2, %xmm1, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0x75,0x89,0xa7,0xc2]
; X64-NEXT:    # xmm0 {%k1} {z} = (xmm1 * xmm0) -/+ xmm2
; X64-NEXT:    retq # encoding: [0xc3]
entry:
  %sub.i = fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__C
  %0 = tail call <4 x float> @llvm.fma.v4f32(<4 x float> %__A, <4 x float> %__B, <4 x float> %sub.i) #9
  %1 = tail call <4 x float> @llvm.fma.v4f32(<4 x float> %__A, <4 x float> %__B, <4 x float> %__C) #9
  %2 = shufflevector <4 x float> %1, <4 x float> %0, <4 x i32> <i32 0, i32 5, i32 2, i32 7>
  %3 = bitcast i8 %__U to <8 x i1>
  %extract.i = shufflevector <8 x i1> %3, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %4 = select <4 x i1> %extract.i, <4 x float> %2, <4 x float> zeroinitializer
  ret <4 x float> %4
}

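; NOTE: For the 8 x float variants below, the i8 mask bitcasts directly to
; <8 x i1>, so no shufflevector extract of the low mask bits is needed.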
define <8 x float> @test_mm256_mask_fmaddsub_ps(<8 x float> %__A, i8 zeroext %__U, <8 x float> %__B, <8 x float> %__C) {
; X86-LABEL: test_mm256_mask_fmaddsub_ps:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04]
; X86-NEXT:    kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8]
; X86-NEXT:    vfmaddsub132ps %ymm1, %ymm2, %ymm0 {%k1} # encoding: [0x62,0xf2,0x6d,0x29,0x96,0xc1]
; X86-NEXT:    # ymm0 {%k1} = (ymm0 * ymm1) +/- ymm2
; X86-NEXT:    retl # encoding: [0xc3]
;
; X64-LABEL: test_mm256_mask_fmaddsub_ps:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf]
; X64-NEXT:    vfmaddsub132ps %ymm1, %ymm2, %ymm0 {%k1} # encoding: [0x62,0xf2,0x6d,0x29,0x96,0xc1]
; X64-NEXT:    # ymm0 {%k1} = (ymm0 * ymm1) +/- ymm2
; X64-NEXT:    retq # encoding: [0xc3]
entry:
  %0 = tail call <8 x float> @llvm.fma.v8f32(<8 x float> %__A, <8 x float> %__B, <8 x float> %__C) #9
  %1 = fsub <8 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__C
  %2 = tail call <8 x float> @llvm.fma.v8f32(<8 x float> %__A, <8 x float> %__B, <8 x float> %1) #9
  %3 = shufflevector <8 x float> %2, <8 x float> %0, <8 x i32> <i32 0, i32 9, i32 2, i32 11, i32 4, i32 13, i32 6, i32 15>
  %4 = bitcast i8 %__U to <8 x i1>
  %5 = select <8 x i1> %4, <8 x float> %3, <8 x float> %__A
  ret <8 x float> %5
}

define <8 x float> @test_mm256_mask_fmsubadd_ps(<8 x float> %__A, i8 zeroext %__U, <8 x float> %__B, <8 x float> %__C) {
; X86-LABEL: test_mm256_mask_fmsubadd_ps:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04]
; X86-NEXT:    kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8]
; X86-NEXT:    vfmsubadd132ps %ymm1, %ymm2, %ymm0 {%k1} # encoding: [0x62,0xf2,0x6d,0x29,0x97,0xc1]
; X86-NEXT:    # ymm0 {%k1} = (ymm0 * ymm1) -/+ ymm2
; X86-NEXT:    retl # encoding: [0xc3]
;
; X64-LABEL: test_mm256_mask_fmsubadd_ps:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf]
; X64-NEXT:    vfmsubadd132ps %ymm1, %ymm2, %ymm0 {%k1} # encoding: [0x62,0xf2,0x6d,0x29,0x97,0xc1]
; X64-NEXT:    # ymm0 {%k1} = (ymm0 * ymm1) -/+ ymm2
; X64-NEXT:    retq # encoding: [0xc3]
entry:
  %sub.i = fsub <8 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__C
  %0 = tail call <8 x float> @llvm.fma.v8f32(<8 x float> %__A, <8 x float> %__B, <8 x float> %sub.i) #9
  %1 = tail call <8 x float> @llvm.fma.v8f32(<8 x float> %__A, <8 x float> %__B, <8 x float> %__C) #9
  %2 = shufflevector <8 x float> %1, <8 x float> %0, <8 x i32> <i32 0, i32 9, i32 2, i32 11, i32 4, i32 13, i32 6, i32 15>
  %3 = bitcast i8 %__U to <8 x i1>
  %4 = select <8 x i1> %3, <8 x float> %2, <8 x float> %__A
  ret <8 x float> %4
}

define <8 x float> @test_mm256_mask3_fmaddsub_ps(<8 x float> %__A, <8 x float> %__B, <8 x float> %__C, i8 zeroext %__U) {
; X86-LABEL: test_mm256_mask3_fmaddsub_ps:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04]
; X86-NEXT:    kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8]
; X86-NEXT:    vfmaddsub231ps %ymm1, %ymm0, %ymm2 {%k1} # encoding: [0x62,0xf2,0x7d,0x29,0xb6,0xd1]
; X86-NEXT:    # ymm2 {%k1} = (ymm0 * ymm1) +/- ymm2
; X86-NEXT:    vmovaps %ymm2, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfc,0x28,0xc2]
; X86-NEXT:    retl # encoding: [0xc3]
;
; X64-LABEL: test_mm256_mask3_fmaddsub_ps:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf]
; X64-NEXT:    vfmaddsub231ps %ymm1, %ymm0, %ymm2 {%k1} # encoding: [0x62,0xf2,0x7d,0x29,0xb6,0xd1]
; X64-NEXT:    # ymm2 {%k1} = (ymm0 * ymm1) +/- ymm2
; X64-NEXT:    vmovaps %ymm2, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfc,0x28,0xc2]
; X64-NEXT:    retq # encoding: [0xc3]
entry:
  %0 = tail call <8 x float> @llvm.fma.v8f32(<8 x float> %__A, <8 x float> %__B, <8 x float> %__C) #9
  %1 = fsub <8 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__C
  %2 = tail call <8 x float> @llvm.fma.v8f32(<8 x float> %__A, <8 x float> %__B, <8 x float> %1) #9
  %3 = shufflevector <8 x float> %2, <8 x float> %0, <8 x i32> <i32 0, i32 9, i32 2, i32 11, i32 4, i32 13, i32 6, i32 15>
  %4 = bitcast i8 %__U to <8 x i1>
  %5 = select <8 x i1> %4, <8 x float> %3, <8 x float> %__C
  ret <8 x float> %5
}

define <8 x float> @test_mm256_maskz_fmaddsub_ps(i8 zeroext %__U, <8 x float> %__A, <8 x float> %__B, <8 x float> %__C) {
; X86-LABEL: test_mm256_maskz_fmaddsub_ps:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04]
; X86-NEXT:    kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8]
; X86-NEXT:    vfmaddsub213ps %ymm2, %ymm1, %ymm0 {%k1} {z} # encoding: [0x62,0xf2,0x75,0xa9,0xa6,0xc2]
; X86-NEXT:    # ymm0 {%k1} {z} = (ymm1 * ymm0) +/- ymm2
; X86-NEXT:    retl # encoding: [0xc3]
;
; X64-LABEL: test_mm256_maskz_fmaddsub_ps:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf]
; X64-NEXT:    vfmaddsub213ps %ymm2, %ymm1, %ymm0 {%k1} {z} # encoding: [0x62,0xf2,0x75,0xa9,0xa6,0xc2]
; X64-NEXT:    # ymm0 {%k1} {z} = (ymm1 * ymm0) +/- ymm2
; X64-NEXT:    retq # encoding: [0xc3]
entry:
  %0 = tail call <8 x float> @llvm.fma.v8f32(<8 x float> %__A, <8 x float> %__B, <8 x float> %__C) #9
  %1 = fsub <8 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__C
  %2 = tail call <8 x float> @llvm.fma.v8f32(<8 x float> %__A, <8 x float> %__B, <8 x float> %1) #9
  %3 = shufflevector <8 x float> %2, <8 x float> %0, <8 x i32> <i32 0, i32 9, i32 2, i32 11, i32 4, i32 13, i32 6, i32 15>
  %4 = bitcast i8 %__U to <8 x i1>
  %5 = select <8 x i1> %4, <8 x float> %3, <8 x float> zeroinitializer
  ret <8 x float> %5
}

define <8 x float> @test_mm256_maskz_fmsubadd_ps(i8 zeroext %__U, <8 x float> %__A, <8 x float> %__B, <8 x float> %__C) {
; X86-LABEL: test_mm256_maskz_fmsubadd_ps:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04]
; X86-NEXT:    kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8]
; X86-NEXT:    vfmsubadd213ps %ymm2, %ymm1, %ymm0 {%k1} {z} # encoding: [0x62,0xf2,0x75,0xa9,0xa7,0xc2]
; X86-NEXT:    # ymm0 {%k1} {z} = (ymm1 * ymm0) -/+ ymm2
; X86-NEXT:    retl # encoding: [0xc3]
;
; X64-LABEL: test_mm256_maskz_fmsubadd_ps:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf]
; X64-NEXT:    vfmsubadd213ps %ymm2, %ymm1, %ymm0 {%k1} {z} # encoding: [0x62,0xf2,0x75,0xa9,0xa7,0xc2]
; X64-NEXT:    # ymm0 {%k1} {z} = (ymm1 * ymm0) -/+ ymm2
; X64-NEXT:    retq # encoding: [0xc3]
entry:
  %sub.i = fsub <8 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__C
  %0 = tail call <8 x float> @llvm.fma.v8f32(<8 x float> %__A, <8 x float> %__B, <8 x float> %sub.i) #9
  %1 = tail call <8 x float> @llvm.fma.v8f32(<8 x float> %__A, <8 x float> %__B, <8 x float> %__C) #9
  %2 = shufflevector <8 x float> %1, <8 x float> %0, <8 x i32> <i32 0, i32 9, i32 2, i32 11, i32 4, i32 13, i32 6, i32 15>
  %3 = bitcast i8 %__U to <8 x i1>
  %4 = select <8 x i1> %3, <8 x float> %2, <8 x float> zeroinitializer
  ret <8 x float> %4
}

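; NOTE: fmsub is modeled as llvm.fma with a negated addend; the fsub from
; -0.0 below is simply a negation of %__C.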
define <2 x double> @test_mm_mask3_fmsub_pd(<2 x double> %__A, <2 x double> %__B, <2 x double> %__C, i8 zeroext %__U) {
; X86-LABEL: test_mm_mask3_fmsub_pd:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04]
; X86-NEXT:    kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8]
; X86-NEXT:    vfmsub231pd %xmm1, %xmm0, %xmm2 {%k1} # encoding: [0x62,0xf2,0xfd,0x09,0xba,0xd1]
; X86-NEXT:    # xmm2 {%k1} = (xmm0 * xmm1) - xmm2
; X86-NEXT:    vmovapd %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x28,0xc2]
; X86-NEXT:    retl # encoding: [0xc3]
;
; X64-LABEL: test_mm_mask3_fmsub_pd:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf]
; X64-NEXT:    vfmsub231pd %xmm1, %xmm0, %xmm2 {%k1} # encoding: [0x62,0xf2,0xfd,0x09,0xba,0xd1]
; X64-NEXT:    # xmm2 {%k1} = (xmm0 * xmm1) - xmm2
; X64-NEXT:    vmovapd %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x28,0xc2]
; X64-NEXT:    retq # encoding: [0xc3]
entry:
  %sub.i = fsub <2 x double> <double -0.000000e+00, double -0.000000e+00>, %__C
  %0 = tail call <2 x double> @llvm.fma.v2f64(<2 x double> %__A, <2 x double> %__B, <2 x double> %sub.i) #9
  %1 = bitcast i8 %__U to <8 x i1>
  %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <2 x i32> <i32 0, i32 1>
  %2 = select <2 x i1> %extract.i, <2 x double> %0, <2 x double> %__C
  ret <2 x double> %2
}

define <4 x double> @test_mm256_mask3_fmsub_pd(<4 x double> %__A, <4 x double> %__B, <4 x double> %__C, i8 zeroext %__U) {
; X86-LABEL: test_mm256_mask3_fmsub_pd:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04]
; X86-NEXT:    kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8]
; X86-NEXT:    vfmsub231pd %ymm1, %ymm0, %ymm2 {%k1} # encoding: [0x62,0xf2,0xfd,0x29,0xba,0xd1]
; X86-NEXT:    # ymm2 {%k1} = (ymm0 * ymm1) - ymm2
; X86-NEXT:    vmovapd %ymm2, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x28,0xc2]
; X86-NEXT:    retl # encoding: [0xc3]
;
; X64-LABEL: test_mm256_mask3_fmsub_pd:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf]
; X64-NEXT:    vfmsub231pd %ymm1, %ymm0, %ymm2 {%k1} # encoding: [0x62,0xf2,0xfd,0x29,0xba,0xd1]
; X64-NEXT:    # ymm2 {%k1} = (ymm0 * ymm1) - ymm2
; X64-NEXT:    vmovapd %ymm2, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x28,0xc2]
; X64-NEXT:    retq # encoding: [0xc3]
entry:
  %sub.i = fsub <4 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %__C
  %0 = tail call <4 x double> @llvm.fma.v4f64(<4 x double> %__A, <4 x double> %__B, <4 x double> %sub.i) #9
  %1 = bitcast i8 %__U to <8 x i1>
  %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %2 = select <4 x i1> %extract.i, <4 x double> %0, <4 x double> %__C
  ret <4 x double> %2
}

define <4 x float> @test_mm_mask3_fmsub_ps(<4 x float> %__A, <4 x float> %__B, <4 x float> %__C, i8 zeroext %__U) {
; X86-LABEL: test_mm_mask3_fmsub_ps:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04]
; X86-NEXT:    kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8]
; X86-NEXT:    vfmsub231ps %xmm1, %xmm0, %xmm2 {%k1} # encoding: [0x62,0xf2,0x7d,0x09,0xba,0xd1]
; X86-NEXT:    # xmm2 {%k1} = (xmm0 * xmm1) - xmm2
; X86-NEXT:    vmovaps %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0xc2]
; X86-NEXT:    retl # encoding: [0xc3]
;
; X64-LABEL: test_mm_mask3_fmsub_ps:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf]
; X64-NEXT:    vfmsub231ps %xmm1, %xmm0, %xmm2 {%k1} # encoding: [0x62,0xf2,0x7d,0x09,0xba,0xd1]
; X64-NEXT:    # xmm2 {%k1} = (xmm0 * xmm1) - xmm2
; X64-NEXT:    vmovaps %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0xc2]
; X64-NEXT:    retq # encoding: [0xc3]
entry:
  %sub.i = fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__C
  %0 = tail call <4 x float> @llvm.fma.v4f32(<4 x float> %__A, <4 x float> %__B, <4 x float> %sub.i) #9
  %1 = bitcast i8 %__U to <8 x i1>
  %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %2 = select <4 x i1> %extract.i, <4 x float> %0, <4 x float> %__C
  ret <4 x float> %2
}

define <8 x float> @test_mm256_mask3_fmsub_ps(<8 x float> %__A, <8 x float> %__B, <8 x float> %__C, i8 zeroext %__U) {
; X86-LABEL: test_mm256_mask3_fmsub_ps:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04]
; X86-NEXT:    kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8]
; X86-NEXT:    vfmsub231ps %ymm1, %ymm0, %ymm2 {%k1} # encoding: [0x62,0xf2,0x7d,0x29,0xba,0xd1]
; X86-NEXT:    # ymm2 {%k1} = (ymm0 * ymm1) - ymm2
; X86-NEXT:    vmovaps %ymm2, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfc,0x28,0xc2]
; X86-NEXT:    retl # encoding: [0xc3]
;
; X64-LABEL: test_mm256_mask3_fmsub_ps:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf]
; X64-NEXT:    vfmsub231ps %ymm1, %ymm0, %ymm2 {%k1} # encoding: [0x62,0xf2,0x7d,0x29,0xba,0xd1]
; X64-NEXT:    # ymm2 {%k1} = (ymm0 * ymm1) - ymm2
; X64-NEXT:    vmovaps %ymm2, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfc,0x28,0xc2]
; X64-NEXT:    retq # encoding: [0xc3]
entry:
  %sub.i = fsub <8 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__C
  %0 = tail call <8 x float> @llvm.fma.v8f32(<8 x float> %__A, <8 x float> %__B, <8 x float> %sub.i) #9
  %1 = bitcast i8 %__U to <8 x i1>
  %2 = select <8 x i1> %1, <8 x float> %0, <8 x float> %__C
  ret <8 x float> %2
}

define <2 x double> @test_mm_mask3_fmsubadd_pd(<2 x double> %__A, <2 x double> %__B, <2 x double> %__C, i8 zeroext %__U) {
; X86-LABEL: test_mm_mask3_fmsubadd_pd:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04]
; X86-NEXT:    kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8]
; X86-NEXT:    vfmsubadd231pd %xmm1, %xmm0, %xmm2 {%k1} # encoding: [0x62,0xf2,0xfd,0x09,0xb7,0xd1]
; X86-NEXT:    # xmm2 {%k1} = (xmm0 * xmm1) -/+ xmm2
; X86-NEXT:    vmovapd %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x28,0xc2]
; X86-NEXT:    retl # encoding: [0xc3]
;
; X64-LABEL: test_mm_mask3_fmsubadd_pd:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf]
; X64-NEXT:    vfmsubadd231pd %xmm1, %xmm0, %xmm2 {%k1} # encoding: [0x62,0xf2,0xfd,0x09,0xb7,0xd1]
; X64-NEXT:    # xmm2 {%k1} = (xmm0 * xmm1) -/+ xmm2
; X64-NEXT:    vmovapd %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x28,0xc2]
; X64-NEXT:    retq # encoding: [0xc3]
entry:
  %sub.i = fsub <2 x double> <double -0.000000e+00, double -0.000000e+00>, %__C
  %0 = tail call <2 x double> @llvm.fma.v2f64(<2 x double> %__A, <2 x double> %__B, <2 x double> %sub.i) #9
  %1 = tail call <2 x double> @llvm.fma.v2f64(<2 x double> %__A, <2 x double> %__B, <2 x double> %__C) #9
  %2 = shufflevector <2 x double> %1, <2 x double> %0, <2 x i32> <i32 0, i32 3>
  %3 = bitcast i8 %__U to <8 x i1>
  %extract.i = shufflevector <8 x i1> %3, <8 x i1> undef, <2 x i32> <i32 0, i32 1>
  %4 = select <2 x i1> %extract.i, <2 x double> %2, <2 x double> %__C
  ret <2 x double> %4
}

define <4 x double> @test_mm256_mask3_fmsubadd_pd(<4 x double> %__A, <4 x double> %__B, <4 x double> %__C, i8 zeroext %__U) {
; X86-LABEL: test_mm256_mask3_fmsubadd_pd:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04]
; X86-NEXT:    kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8]
; X86-NEXT:    vfmsubadd231pd %ymm1, %ymm0, %ymm2 {%k1} # encoding: [0x62,0xf2,0xfd,0x29,0xb7,0xd1]
; X86-NEXT:    # ymm2 {%k1} = (ymm0 * ymm1) -/+ ymm2
; X86-NEXT:    vmovapd %ymm2, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x28,0xc2]
; X86-NEXT:    retl # encoding: [0xc3]
;
; X64-LABEL: test_mm256_mask3_fmsubadd_pd:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf]
; X64-NEXT:    vfmsubadd231pd %ymm1, %ymm0, %ymm2 {%k1} # encoding: [0x62,0xf2,0xfd,0x29,0xb7,0xd1]
; X64-NEXT:    # ymm2 {%k1} = (ymm0 * ymm1) -/+ ymm2
; X64-NEXT:    vmovapd %ymm2, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x28,0xc2]
; X64-NEXT:    retq # encoding: [0xc3]
entry:
  %sub.i = fsub <4 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %__C
  %0 = tail call <4 x double> @llvm.fma.v4f64(<4 x double> %__A, <4 x double> %__B, <4 x double> %sub.i) #9
  %1 = tail call <4 x double> @llvm.fma.v4f64(<4 x double> %__A, <4 x double> %__B, <4 x double> %__C) #9
  %2 = shufflevector <4 x double> %1, <4 x double> %0, <4 x i32> <i32 0, i32 5, i32 2, i32 7>
  %3 = bitcast i8 %__U to <8 x i1>
  %extract.i = shufflevector <8 x i1> %3, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %4 = select <4 x i1> %extract.i, <4 x double> %2, <4 x double> %__C
  ret <4 x double> %4
}

define <4 x float> @test_mm_mask3_fmsubadd_ps(<4 x float> %__A, <4 x float> %__B, <4 x float> %__C, i8 zeroext %__U) {
; X86-LABEL: test_mm_mask3_fmsubadd_ps:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04]
; X86-NEXT:    kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8]
; X86-NEXT:    vfmsubadd231ps %xmm1, %xmm0, %xmm2 {%k1} # encoding: [0x62,0xf2,0x7d,0x09,0xb7,0xd1]
; X86-NEXT:    # xmm2 {%k1} = (xmm0 * xmm1) -/+ xmm2
; X86-NEXT:    vmovaps %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0xc2]
; X86-NEXT:    retl # encoding: [0xc3]
;
; X64-LABEL: test_mm_mask3_fmsubadd_ps:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf]
; X64-NEXT:    vfmsubadd231ps %xmm1, %xmm0, %xmm2 {%k1} # encoding: [0x62,0xf2,0x7d,0x09,0xb7,0xd1]
; X64-NEXT:    # xmm2 {%k1} = (xmm0 * xmm1) -/+ xmm2
; X64-NEXT:    vmovaps %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0xc2]
; X64-NEXT:    retq # encoding: [0xc3]
entry:
  %sub.i = fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__C
  %0 = tail call <4 x float> @llvm.fma.v4f32(<4 x float> %__A, <4 x float> %__B, <4 x float> %sub.i) #9
  %1 = tail call <4 x float> @llvm.fma.v4f32(<4 x float> %__A, <4 x float> %__B, <4 x float> %__C) #9
  %2 = shufflevector <4 x float> %1, <4 x float> %0, <4 x i32> <i32 0, i32 5, i32 2, i32 7>
  %3 = bitcast i8 %__U to <8 x i1>
  %extract.i = shufflevector <8 x i1> %3, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %4 = select <4 x i1> %extract.i, <4 x float> %2, <4 x float> %__C
  ret <4 x float> %4
}

define <8 x float> @test_mm256_mask3_fmsubadd_ps(<8 x float> %__A, <8 x float> %__B, <8 x float> %__C, i8 zeroext %__U) {
; X86-LABEL: test_mm256_mask3_fmsubadd_ps:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04]
; X86-NEXT:    kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8]
; X86-NEXT:    vfmsubadd231ps %ymm1, %ymm0, %ymm2 {%k1} # encoding: [0x62,0xf2,0x7d,0x29,0xb7,0xd1]
; X86-NEXT:    # ymm2 {%k1} = (ymm0 * ymm1) -/+ ymm2
; X86-NEXT:    vmovaps %ymm2, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfc,0x28,0xc2]
; X86-NEXT:    retl # encoding: [0xc3]
;
; X64-LABEL: test_mm256_mask3_fmsubadd_ps:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf]
; X64-NEXT:    vfmsubadd231ps %ymm1, %ymm0, %ymm2 {%k1} # encoding: [0x62,0xf2,0x7d,0x29,0xb7,0xd1]
; X64-NEXT:    # ymm2 {%k1} = (ymm0 * ymm1) -/+ ymm2
; X64-NEXT:    vmovaps %ymm2, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfc,0x28,0xc2]
; X64-NEXT:    retq # encoding: [0xc3]
entry:
  %sub.i = fsub <8 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__C
  %0 = tail call <8 x float> @llvm.fma.v8f32(<8 x float> %__A, <8 x float> %__B, <8 x float> %sub.i) #9
  %1 = tail call <8 x float> @llvm.fma.v8f32(<8 x float> %__A, <8 x float> %__B, <8 x float> %__C) #9
  %2 = shufflevector <8 x float> %1, <8 x float> %0, <8 x i32> <i32 0, i32 9, i32 2, i32 11, i32 4, i32 13, i32 6, i32 15>
  %3 = bitcast i8 %__U to <8 x i1>
  %4 = select <8 x i1> %3, <8 x float> %2, <8 x float> %__C
  ret <8 x float> %4
}

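; NOTE: fnmadd negates the multiplicand instead of the addend:
; fma(%__A, -%__B, %__C) computes -(a*b) + c.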
define <2 x double> @test_mm_mask_fnmadd_pd(<2 x double> %__A, i8 zeroext %__U, <2 x double> %__B, <2 x double> %__C) {
; X86-LABEL: test_mm_mask_fnmadd_pd:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04]
; X86-NEXT:    kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8]
; X86-NEXT:    vfnmadd132pd %xmm1, %xmm2, %xmm0 {%k1} # encoding: [0x62,0xf2,0xed,0x09,0x9c,0xc1]
; X86-NEXT:    # xmm0 {%k1} = -(xmm0 * xmm1) + xmm2
; X86-NEXT:    retl # encoding: [0xc3]
;
; X64-LABEL: test_mm_mask_fnmadd_pd:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf]
; X64-NEXT:    vfnmadd132pd %xmm1, %xmm2, %xmm0 {%k1} # encoding: [0x62,0xf2,0xed,0x09,0x9c,0xc1]
; X64-NEXT:    # xmm0 {%k1} = -(xmm0 * xmm1) + xmm2
; X64-NEXT:    retq # encoding: [0xc3]
entry:
  %sub.i = fsub <2 x double> <double -0.000000e+00, double -0.000000e+00>, %__B
  %0 = tail call <2 x double> @llvm.fma.v2f64(<2 x double> %__A, <2 x double> %sub.i, <2 x double> %__C) #9
  %1 = bitcast i8 %__U to <8 x i1>
  %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <2 x i32> <i32 0, i32 1>
  %2 = select <2 x i1> %extract.i, <2 x double> %0, <2 x double> %__A
  ret <2 x double> %2
}

define <4 x double> @test_mm256_mask_fnmadd_pd(<4 x double> %__A, i8 zeroext %__U, <4 x double> %__B, <4 x double> %__C) {
; X86-LABEL: test_mm256_mask_fnmadd_pd:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04]
; X86-NEXT:    kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8]
; X86-NEXT:    vfnmadd132pd %ymm1, %ymm2, %ymm0 {%k1} # encoding: [0x62,0xf2,0xed,0x29,0x9c,0xc1]
; X86-NEXT:    # ymm0 {%k1} = -(ymm0 * ymm1) + ymm2
; X86-NEXT:    retl # encoding: [0xc3]
;
; X64-LABEL: test_mm256_mask_fnmadd_pd:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf]
; X64-NEXT:    vfnmadd132pd %ymm1, %ymm2, %ymm0 {%k1} # encoding: [0x62,0xf2,0xed,0x29,0x9c,0xc1]
; X64-NEXT:    # ymm0 {%k1} = -(ymm0 * ymm1) + ymm2
; X64-NEXT:    retq # encoding: [0xc3]
entry:
  %sub.i = fsub <4 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %__B
  %0 = tail call <4 x double> @llvm.fma.v4f64(<4 x double> %__A, <4 x double> %sub.i, <4 x double> %__C) #9
  %1 = bitcast i8 %__U to <8 x i1>
  %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %2 = select <4 x i1> %extract.i, <4 x double> %0, <4 x double> %__A
  ret <4 x double> %2
}

define <4 x float> @test_mm_mask_fnmadd_ps(<4 x float> %__A, i8 zeroext %__U, <4 x float> %__B, <4 x float> %__C) {
; X86-LABEL: test_mm_mask_fnmadd_ps:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04]
; X86-NEXT:    kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8]
; X86-NEXT:    vfnmadd132ps %xmm1, %xmm2, %xmm0 {%k1} # encoding: [0x62,0xf2,0x6d,0x09,0x9c,0xc1]
; X86-NEXT:    # xmm0 {%k1} = -(xmm0 * xmm1) + xmm2
; X86-NEXT:    retl # encoding: [0xc3]
;
; X64-LABEL: test_mm_mask_fnmadd_ps:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf]
; X64-NEXT:    vfnmadd132ps %xmm1, %xmm2, %xmm0 {%k1} # encoding: [0x62,0xf2,0x6d,0x09,0x9c,0xc1]
; X64-NEXT:    # xmm0 {%k1} = -(xmm0 * xmm1) + xmm2
; X64-NEXT:    retq # encoding: [0xc3]
entry:
  %sub.i = fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__B
  %0 = tail call <4 x float> @llvm.fma.v4f32(<4 x float> %__A, <4 x float> %sub.i, <4 x float> %__C) #9
  %1 = bitcast i8 %__U to <8 x i1>
  %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %2 = select <4 x i1> %extract.i, <4 x float> %0, <4 x float> %__A
  ret <4 x float> %2
}

define <8 x float> @test_mm256_mask_fnmadd_ps(<8 x float> %__A, i8 zeroext %__U, <8 x float> %__B, <8 x float> %__C) {
; X86-LABEL: test_mm256_mask_fnmadd_ps:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04]
; X86-NEXT:    kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8]
; X86-NEXT:    vfnmadd132ps %ymm1, %ymm2, %ymm0 {%k1} # encoding: [0x62,0xf2,0x6d,0x29,0x9c,0xc1]
; X86-NEXT:    # ymm0 {%k1} = -(ymm0 * ymm1) + ymm2
; X86-NEXT:    retl # encoding: [0xc3]
;
; X64-LABEL: test_mm256_mask_fnmadd_ps:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf]
; X64-NEXT:    vfnmadd132ps %ymm1, %ymm2, %ymm0 {%k1} # encoding: [0x62,0xf2,0x6d,0x29,0x9c,0xc1]
; X64-NEXT:    # ymm0 {%k1} = -(ymm0 * ymm1) + ymm2
; X64-NEXT:    retq # encoding: [0xc3]
entry:
  %sub.i = fsub <8 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__B
  %0 = tail call <8 x float> @llvm.fma.v8f32(<8 x float> %__A, <8 x float> %sub.i, <8 x float> %__C) #9
  %1 = bitcast i8 %__U to <8 x i1>
  %2 = select <8 x i1> %1, <8 x float> %0, <8 x float> %__A
  ret <8 x float> %2
}

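; NOTE: fnmsub negates both the multiplicand and the addend, computing
; -(a*b) - c.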
define <2 x double> @test_mm_mask_fnmsub_pd(<2 x double> %__A, i8 zeroext %__U, <2 x double> %__B, <2 x double> %__C) {
; X86-LABEL: test_mm_mask_fnmsub_pd:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04]
; X86-NEXT:    kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8]
; X86-NEXT:    vfnmsub132pd %xmm1, %xmm2, %xmm0 {%k1} # encoding: [0x62,0xf2,0xed,0x09,0x9e,0xc1]
; X86-NEXT:    # xmm0 {%k1} = -(xmm0 * xmm1) - xmm2
; X86-NEXT:    retl # encoding: [0xc3]
;
; X64-LABEL: test_mm_mask_fnmsub_pd:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf]
; X64-NEXT:    vfnmsub132pd %xmm1, %xmm2, %xmm0 {%k1} # encoding: [0x62,0xf2,0xed,0x09,0x9e,0xc1]
; X64-NEXT:    # xmm0 {%k1} = -(xmm0 * xmm1) - xmm2
; X64-NEXT:    retq # encoding: [0xc3]
entry:
  %sub.i = fsub <2 x double> <double -0.000000e+00, double -0.000000e+00>, %__B
  %sub1.i = fsub <2 x double> <double -0.000000e+00, double -0.000000e+00>, %__C
  %0 = tail call <2 x double> @llvm.fma.v2f64(<2 x double> %__A, <2 x double> %sub.i, <2 x double> %sub1.i) #9
  %1 = bitcast i8 %__U to <8 x i1>
  %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <2 x i32> <i32 0, i32 1>
  %2 = select <2 x i1> %extract.i, <2 x double> %0, <2 x double> %__A
  ret <2 x double> %2
}

define <2 x double> @test_mm_mask3_fnmsub_pd(<2 x double> %__A, <2 x double> %__B, <2 x double> %__C, i8 zeroext %__U) {
; X86-LABEL: test_mm_mask3_fnmsub_pd:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04]
; X86-NEXT:    kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8]
; X86-NEXT:    vfnmsub231pd %xmm1, %xmm0, %xmm2 {%k1} # encoding: [0x62,0xf2,0xfd,0x09,0xbe,0xd1]
; X86-NEXT:    # xmm2 {%k1} = -(xmm0 * xmm1) - xmm2
; X86-NEXT:    vmovapd %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x28,0xc2]
; X86-NEXT:    retl # encoding: [0xc3]
;
; X64-LABEL: test_mm_mask3_fnmsub_pd:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf]
; X64-NEXT:    vfnmsub231pd %xmm1, %xmm0, %xmm2 {%k1} # encoding: [0x62,0xf2,0xfd,0x09,0xbe,0xd1]
; X64-NEXT:    # xmm2 {%k1} = -(xmm0 * xmm1) - xmm2
; X64-NEXT:    vmovapd %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x28,0xc2]
; X64-NEXT:    retq # encoding: [0xc3]
entry:
  %sub.i = fsub <2 x double> <double -0.000000e+00, double -0.000000e+00>, %__B
  %sub1.i = fsub <2 x double> <double -0.000000e+00, double -0.000000e+00>, %__C
  %0 = tail call <2 x double> @llvm.fma.v2f64(<2 x double> %__A, <2 x double> %sub.i, <2 x double> %sub1.i) #9
  %1 = bitcast i8 %__U to <8 x i1>
  %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <2 x i32> <i32 0, i32 1>
  %2 = select <2 x i1> %extract.i, <2 x double> %0, <2 x double> %__C
  ret <2 x double> %2
}

define <4 x double> @test_mm256_mask_fnmsub_pd(<4 x double> %__A, i8 zeroext %__U, <4 x double> %__B, <4 x double> %__C) {
; X86-LABEL: test_mm256_mask_fnmsub_pd:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04]
; X86-NEXT:    kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8]
; X86-NEXT:    vfnmsub132pd %ymm1, %ymm2, %ymm0 {%k1} # encoding: [0x62,0xf2,0xed,0x29,0x9e,0xc1]
; X86-NEXT:    # ymm0 {%k1} = -(ymm0 * ymm1) - ymm2
; X86-NEXT:    retl # encoding: [0xc3]
;
; X64-LABEL: test_mm256_mask_fnmsub_pd:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf]
; X64-NEXT:    vfnmsub132pd %ymm1, %ymm2, %ymm0 {%k1} # encoding: [0x62,0xf2,0xed,0x29,0x9e,0xc1]
; X64-NEXT:    # ymm0 {%k1} = -(ymm0 * ymm1) - ymm2
; X64-NEXT:    retq # encoding: [0xc3]
entry:
  %sub.i = fsub <4 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %__B
  %sub1.i = fsub <4 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %__C
  %0 = tail call <4 x double> @llvm.fma.v4f64(<4 x double> %__A, <4 x double> %sub.i, <4 x double> %sub1.i) #9
  %1 = bitcast i8 %__U to <8 x i1>
  %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %2 = select <4 x i1> %extract.i, <4 x double> %0, <4 x double> %__A
  ret <4 x double> %2
}

define <4 x double> @test_mm256_mask3_fnmsub_pd(<4 x double> %__A, <4 x double> %__B, <4 x double> %__C, i8 zeroext %__U) {
; X86-LABEL: test_mm256_mask3_fnmsub_pd:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04]
; X86-NEXT:    kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8]
; X86-NEXT:    vfnmsub231pd %ymm1, %ymm0, %ymm2 {%k1} # encoding: [0x62,0xf2,0xfd,0x29,0xbe,0xd1]
; X86-NEXT:    # ymm2 {%k1} = -(ymm0 * ymm1) - ymm2
; X86-NEXT:    vmovapd %ymm2, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x28,0xc2]
; X86-NEXT:    retl # encoding: [0xc3]
;
; X64-LABEL: test_mm256_mask3_fnmsub_pd:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf]
; X64-NEXT:    vfnmsub231pd %ymm1, %ymm0, %ymm2 {%k1} # encoding: [0x62,0xf2,0xfd,0x29,0xbe,0xd1]
; X64-NEXT:    # ymm2 {%k1} = -(ymm0 * ymm1) - ymm2
; X64-NEXT:    vmovapd %ymm2, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x28,0xc2]
; X64-NEXT:    retq # encoding: [0xc3]
entry:
  %sub.i = fsub <4 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %__B
  %sub1.i = fsub <4 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %__C
  %0 = tail call <4 x double> @llvm.fma.v4f64(<4 x double> %__A, <4 x double> %sub.i, <4 x double> %sub1.i) #9
  %1 = bitcast i8 %__U to <8 x i1>
  %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %2 = select <4 x i1> %extract.i, <4 x double> %0, <4 x double> %__C
  ret <4 x double> %2
}

define <4 x float> @test_mm_mask_fnmsub_ps(<4 x float> %__A, i8 zeroext %__U, <4 x float> %__B, <4 x float> %__C) {
; X86-LABEL: test_mm_mask_fnmsub_ps:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04]
; X86-NEXT:    kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8]
; X86-NEXT:    vfnmsub132ps %xmm1, %xmm2, %xmm0 {%k1} # encoding: [0x62,0xf2,0x6d,0x09,0x9e,0xc1]
; X86-NEXT:    # xmm0 {%k1} = -(xmm0 * xmm1) - xmm2
; X86-NEXT:    retl # encoding: [0xc3]
;
; X64-LABEL: test_mm_mask_fnmsub_ps:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf]
; X64-NEXT:    vfnmsub132ps %xmm1, %xmm2, %xmm0 {%k1} # encoding: [0x62,0xf2,0x6d,0x09,0x9e,0xc1]
; X64-NEXT:    # xmm0 {%k1} = -(xmm0 * xmm1) - xmm2
; X64-NEXT:    retq # encoding: [0xc3]
entry:
  %sub.i = fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__B
  %sub1.i = fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__C
  %0 = tail call <4 x float> @llvm.fma.v4f32(<4 x float> %__A, <4 x float> %sub.i, <4 x float> %sub1.i) #9
  %1 = bitcast i8 %__U to <8 x i1>
  %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %2 = select <4 x i1> %extract.i, <4 x float> %0, <4 x float> %__A
  ret <4 x float> %2
}

define <4 x float> @test_mm_mask3_fnmsub_ps(<4 x float> %__A, <4 x float> %__B, <4 x float> %__C, i8 zeroext %__U) {
; X86-LABEL: test_mm_mask3_fnmsub_ps:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04]
; X86-NEXT:    kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8]
; X86-NEXT:    vfnmsub231ps %xmm1, %xmm0, %xmm2 {%k1} # encoding: [0x62,0xf2,0x7d,0x09,0xbe,0xd1]
; X86-NEXT:    # xmm2 {%k1} = -(xmm0 * xmm1) - xmm2
; X86-NEXT:    vmovaps %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0xc2]
; X86-NEXT:    retl # encoding: [0xc3]
;
; X64-LABEL: test_mm_mask3_fnmsub_ps:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf]
; X64-NEXT:    vfnmsub231ps %xmm1, %xmm0, %xmm2 {%k1} # encoding: [0x62,0xf2,0x7d,0x09,0xbe,0xd1]
; X64-NEXT:    # xmm2 {%k1} = -(xmm0 * xmm1) - xmm2
; X64-NEXT:    vmovaps %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0xc2]
; X64-NEXT:    retq # encoding: [0xc3]
entry:
  %sub.i = fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__B
  %sub1.i = fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__C
  %0 = tail call <4 x float> @llvm.fma.v4f32(<4 x float> %__A, <4 x float> %sub.i, <4 x float> %sub1.i) #9
  %1 = bitcast i8 %__U to <8 x i1>
  %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %2 = select <4 x i1> %extract.i, <4 x float> %0, <4 x float> %__C
  ret <4 x float> %2
}

define <8 x float> @test_mm256_mask_fnmsub_ps(<8 x float> %__A, i8 zeroext %__U, <8 x float> %__B, <8 x float> %__C) {
; X86-LABEL: test_mm256_mask_fnmsub_ps:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04]
; X86-NEXT:    kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8]
; X86-NEXT:    vfnmsub132ps %ymm1, %ymm2, %ymm0 {%k1} # encoding: [0x62,0xf2,0x6d,0x29,0x9e,0xc1]
; X86-NEXT:    # ymm0 {%k1} = -(ymm0 * ymm1) - ymm2
; X86-NEXT:    retl # encoding: [0xc3]
;
; X64-LABEL: test_mm256_mask_fnmsub_ps:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf]
; X64-NEXT:    vfnmsub132ps %ymm1, %ymm2, %ymm0 {%k1} # encoding: [0x62,0xf2,0x6d,0x29,0x9e,0xc1]
; X64-NEXT:    # ymm0 {%k1} = -(ymm0 * ymm1) - ymm2
; X64-NEXT:    retq # encoding: [0xc3]
entry:
  %sub.i = fsub <8 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__B
  %sub1.i = fsub <8 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__C
  %0 = tail call <8 x float> @llvm.fma.v8f32(<8 x float> %__A, <8 x float> %sub.i, <8 x float> %sub1.i) #9
  %1 = bitcast i8 %__U to <8 x i1>
  %2 = select <8 x i1> %1, <8 x float> %0, <8 x float> %__A
  ret <8 x float> %2
}

define <8 x float> @test_mm256_mask3_fnmsub_ps(<8 x float> %__A, <8 x float> %__B, <8 x float> %__C, i8 zeroext %__U) {
; X86-LABEL: test_mm256_mask3_fnmsub_ps:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04]
; X86-NEXT:    kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8]
; X86-NEXT:    vfnmsub231ps %ymm1, %ymm0, %ymm2 {%k1} # encoding: [0x62,0xf2,0x7d,0x29,0xbe,0xd1]
; X86-NEXT:    # ymm2 {%k1} = -(ymm0 * ymm1) - ymm2
; X86-NEXT:    vmovaps %ymm2, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfc,0x28,0xc2]
; X86-NEXT:    retl # encoding: [0xc3]
;
; X64-LABEL: test_mm256_mask3_fnmsub_ps:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf]
; X64-NEXT:    vfnmsub231ps %ymm1, %ymm0, %ymm2 {%k1} # encoding: [0x62,0xf2,0x7d,0x29,0xbe,0xd1]
; X64-NEXT:    # ymm2 {%k1} = -(ymm0 * ymm1) - ymm2
; X64-NEXT:    vmovaps %ymm2, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfc,0x28,0xc2]
; X64-NEXT:    retq # encoding: [0xc3]
entry:
  %sub.i = fsub <8 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__B
  %sub1.i = fsub <8 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__C
  %0 = tail call <8 x float> @llvm.fma.v8f32(<8 x float> %__A, <8 x float> %sub.i, <8 x float> %sub1.i) #9
  %1 = bitcast i8 %__U to <8 x i1>
  %2 = select <8 x i1> %1, <8 x float> %0, <8 x float> %__C
  ret <8 x float> %2
}

declare <2 x double> @llvm.fma.v2f64(<2 x double>, <2 x double>, <2 x double>) #8
declare <4 x double> @llvm.fma.v4f64(<4 x double>, <4 x double>, <4 x double>) #8
declare <4 x float> @llvm.fma.v4f32(<4 x float>, <4 x float>, <4 x float>) #8
declare <8 x float> @llvm.fma.v8f32(<8 x float>, <8 x float>, <8 x float>) #8
