//===-- X86InstrSSE.td - SSE Instruction Set ---------------*- tablegen -*-===//
//
//                     The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
// This file describes the X86 SSE instruction set, defining the instructions,
// and properties of the instructions which are needed for code generation,
// machine code emission, and analysis.
//
//===----------------------------------------------------------------------===//

//===----------------------------------------------------------------------===//
// SSE 1 & 2 Instruction Classes
//===----------------------------------------------------------------------===//

/// sse12_fp_scalar - SSE 1 & 2 scalar instructions class
multiclass sse12_fp_scalar<bits<8> opc, string OpcodeStr, SDNode OpNode,
                           RegisterClass RC, X86MemOperand x86memop,
                           Domain d, X86FoldableSchedWrite sched,
                           bit Is2Addr = 1> {
  let isCommutable = 1 in {
    def rr : SI<opc, MRMSrcReg, (outs RC:$dst), (ins RC:$src1, RC:$src2),
       !if(Is2Addr,
           !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
           !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
       [(set RC:$dst, (OpNode RC:$src1, RC:$src2))], d>,
       Sched<[sched]>;
  }
  def rm : SI<opc, MRMSrcMem, (outs RC:$dst), (ins RC:$src1, x86memop:$src2),
       !if(Is2Addr,
           !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
           !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
       [(set RC:$dst, (OpNode RC:$src1, (load addr:$src2)))], d>,
       Sched<[sched.Folded, ReadAfterLd]>;
}
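// Illustrative instantiation of sse12_fp_scalar (hypothetical; the real
// arithmetic definitions appear later in this file with their actual
// scheduling classes):
//   defm ADD : sse12_fp_scalar<0x58, "addss", fadd, FR32, f32mem,
//                              SSEPackedSingle, SchedWriteFAdd.Scl>, XS;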

/// sse12_fp_scalar_int - SSE 1 & 2 scalar instruction intrinsics class
multiclass sse12_fp_scalar_int<bits<8> opc, string OpcodeStr,
                               SDPatternOperator OpNode, RegisterClass RC,
                               ValueType VT, string asm, Operand memopr,
                               ComplexPattern mem_cpat, Domain d,
                               X86FoldableSchedWrite sched, bit Is2Addr = 1> {
let isCodeGenOnly = 1, hasSideEffects = 0 in {
  def rr_Int : SI_Int<opc, MRMSrcReg, (outs RC:$dst), (ins RC:$src1, RC:$src2),
       !if(Is2Addr,
           !strconcat(asm, "\t{$src2, $dst|$dst, $src2}"),
           !strconcat(asm, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
       [(set RC:$dst, (VT (OpNode RC:$src1, RC:$src2)))], d>,
       Sched<[sched]>;
  let mayLoad = 1 in
  def rm_Int : SI_Int<opc, MRMSrcMem, (outs RC:$dst), (ins RC:$src1, memopr:$src2),
       !if(Is2Addr,
           !strconcat(asm, "\t{$src2, $dst|$dst, $src2}"),
           !strconcat(asm, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
       [(set RC:$dst, (VT (OpNode RC:$src1, mem_cpat:$src2)))], d>,
       Sched<[sched.Folded, ReadAfterLd]>;
}
}

/// sse12_fp_packed - SSE 1 & 2 packed instructions class
multiclass sse12_fp_packed<bits<8> opc, string OpcodeStr, SDNode OpNode,
                           RegisterClass RC, ValueType vt,
                           X86MemOperand x86memop, PatFrag mem_frag,
                           Domain d, X86FoldableSchedWrite sched,
                           bit Is2Addr = 1> {
  let isCommutable = 1 in
    def rr : PI<opc, MRMSrcReg, (outs RC:$dst), (ins RC:$src1, RC:$src2),
       !if(Is2Addr,
           !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
           !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
       [(set RC:$dst, (vt (OpNode RC:$src1, RC:$src2)))], d>,
       Sched<[sched]>;
  let mayLoad = 1 in
    def rm : PI<opc, MRMSrcMem, (outs RC:$dst), (ins RC:$src1, x86memop:$src2),
       !if(Is2Addr,
           !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
           !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
       [(set RC:$dst, (OpNode RC:$src1, (mem_frag addr:$src2)))],
          d>,
       Sched<[sched.Folded, ReadAfterLd]>;
}
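// Illustrative instantiation of sse12_fp_packed (hypothetical; the real
// packed-FP definitions appear later with their actual types and schedules):
//   defm ADD : sse12_fp_packed<0x58, "addps", fadd, VR128, v4f32, f128mem,
//                              memopv4f32, SSEPackedSingle,
//                              SchedWriteFAdd.XMM>, PS;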

/// sse12_fp_packed_logical_rm - SSE 1 & 2 packed instructions class
multiclass sse12_fp_packed_logical_rm<bits<8> opc, RegisterClass RC, Domain d,
                                      string OpcodeStr, X86MemOperand x86memop,
                                      X86FoldableSchedWrite sched,
                                      list<dag> pat_rr, list<dag> pat_rm,
                                      bit Is2Addr = 1> {
  let isCommutable = 1, hasSideEffects = 0 in
    def rr : PI<opc, MRMSrcReg, (outs RC:$dst), (ins RC:$src1, RC:$src2),
       !if(Is2Addr,
           !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
           !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
       pat_rr, d>,
       Sched<[sched]>;
  let hasSideEffects = 0, mayLoad = 1 in
  def rm : PI<opc, MRMSrcMem, (outs RC:$dst), (ins RC:$src1, x86memop:$src2),
       !if(Is2Addr,
           !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
           !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
       pat_rm, d>,
       Sched<[sched.Folded, ReadAfterLd]>;
}


// Alias instructions that map fld0 to xorps for sse or vxorps for avx.
// This is expanded by ExpandPostRAPseudos.
let isReMaterializable = 1, isAsCheapAsAMove = 1, canFoldAsLoad = 1,
    isPseudo = 1, SchedRW = [WriteZero] in {
  def FsFLD0SS : I<0, Pseudo, (outs FR32:$dst), (ins), "",
                   [(set FR32:$dst, fp32imm0)]>, Requires<[HasSSE1, NoAVX512]>;
  def FsFLD0SD : I<0, Pseudo, (outs FR64:$dst), (ins), "",
                   [(set FR64:$dst, fpimm0)]>, Requires<[HasSSE2, NoAVX512]>;
}
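// For reference, ExpandPostRAPseudos turns these into a self-XOR; with a
// hypothetical register choice the emitted code looks like:
//   xorps %xmm0, %xmm0           # SSE
//   vxorps %xmm0, %xmm0, %xmm0   # AVX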

//===----------------------------------------------------------------------===//
// AVX & SSE - Zero/One Vectors
//===----------------------------------------------------------------------===//

// Alias instruction that maps zero vector to pxor / xorp* for sse.
// This is expanded by ExpandPostRAPseudos to an xorps / vxorps, and then
// swizzled by ExecutionDomainFix to pxor.
// We set canFoldAsLoad because this can be converted to a constant-pool
// load of an all-zeros value if folding it would be beneficial.
let isReMaterializable = 1, isAsCheapAsAMove = 1, canFoldAsLoad = 1,
    isPseudo = 1, SchedRW = [WriteZero] in {
def V_SET0 : I<0, Pseudo, (outs VR128:$dst), (ins), "",
               [(set VR128:$dst, (v4f32 immAllZerosV))]>;
}

let Predicates = [NoAVX512] in
def : Pat<(v4i32 immAllZerosV), (V_SET0)>;


// The same as done above but for AVX. The 256-bit AVX1 ISA doesn't support PI,
// and doesn't need it because on Sandy Bridge the register is set to zero
// at the rename stage without using any execution unit, so SET0PSY
// and SET0PDY can be used for vector int instructions without penalty.
let isReMaterializable = 1, isAsCheapAsAMove = 1, canFoldAsLoad = 1,
    isPseudo = 1, Predicates = [NoAVX512], SchedRW = [WriteZero] in {
def AVX_SET0 : I<0, Pseudo, (outs VR256:$dst), (ins), "",
                 [(set VR256:$dst, (v8i32 immAllZerosV))]>;
}

// We set canFoldAsLoad because this can be converted to a constant-pool
// load of an all-ones value if folding it would be beneficial.
let isReMaterializable = 1, isAsCheapAsAMove = 1, canFoldAsLoad = 1,
    isPseudo = 1, SchedRW = [WriteZero] in {
  def V_SETALLONES : I<0, Pseudo, (outs VR128:$dst), (ins), "",
                       [(set VR128:$dst, (v4i32 immAllOnesV))]>;
  let Predicates = [HasAVX1Only, OptForMinSize] in {
  def AVX1_SETALLONES: I<0, Pseudo, (outs VR256:$dst), (ins), "",
                          [(set VR256:$dst, (v8i32 immAllOnesV))]>;
  }
  let Predicates = [HasAVX2] in
  def AVX2_SETALLONES : I<0, Pseudo, (outs VR256:$dst), (ins), "",
                          [(set VR256:$dst, (v8i32 immAllOnesV))]>;
}
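// For reference, the all-ones pseudos expand to a compare-equal of a register
// with itself (register choices here are hypothetical):
//   pcmpeqd %xmm0, %xmm0           # V_SETALLONES
//   vpcmpeqd %ymm0, %ymm0, %ymm0   # AVX2_SETALLONES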

//===----------------------------------------------------------------------===//
// SSE 1 & 2 - Move FP Scalar Instructions
//
// Move Instructions. Register-to-register movss/movsd is not used for FR32/64
// register copies because it's a partial register update; register-to-register
// movss/movsd is not modeled as an INSERT_SUBREG because INSERT_SUBREG requires
// that the insert be implementable in terms of a copy, and, as just mentioned, we
// don't use movss/movsd for copies.
//===----------------------------------------------------------------------===//

multiclass sse12_move_rr<SDNode OpNode, ValueType vt,
                         X86MemOperand x86memop, string base_opc,
                         string asm_opr, Domain d, string Name> {
  let isCommutable = 1 in
  def rr : SI<0x10, MRMSrcReg, (outs VR128:$dst),
              (ins VR128:$src1, VR128:$src2),
              !strconcat(base_opc, asm_opr),
              [(set VR128:$dst, (vt (OpNode VR128:$src1, VR128:$src2)))], d>,
              Sched<[SchedWriteFShuffle.XMM]>;

  // For the disassembler
  let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0 in
  def rr_REV : SI<0x11, MRMDestReg, (outs VR128:$dst),
                  (ins VR128:$src1, VR128:$src2),
                  !strconcat(base_opc, asm_opr), []>,
                  Sched<[SchedWriteFShuffle.XMM]>, FoldGenData<Name#rr>;
}

multiclass sse12_move<RegisterClass RC, SDNode OpNode, ValueType vt,
                      X86MemOperand x86memop, string OpcodeStr,
                      Domain d, string Name, Predicate pred> {
  // AVX
  let Predicates = [UseAVX, OptForSize] in
  defm V#NAME : sse12_move_rr<OpNode, vt, x86memop, OpcodeStr,
                              "\t{$src2, $src1, $dst|$dst, $src1, $src2}", d,
                              "V"#Name>,
                              VEX_4V, VEX_LIG, VEX_WIG;

  def V#NAME#mr : SI<0x11, MRMDestMem, (outs), (ins x86memop:$dst, RC:$src),
                     !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
                     [(store RC:$src, addr:$dst)], d>,
                     VEX, VEX_LIG, Sched<[WriteFStore]>, VEX_WIG;
  // SSE1 & 2
  let Constraints = "$src1 = $dst" in {
    let Predicates = [pred, NoSSE41_Or_OptForSize] in
    defm NAME : sse12_move_rr<OpNode, vt, x86memop, OpcodeStr,
                              "\t{$src2, $dst|$dst, $src2}", d, Name>;
  }

  def NAME#mr   : SI<0x11, MRMDestMem, (outs), (ins x86memop:$dst, RC:$src),
                     !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
                     [(store RC:$src, addr:$dst)], d>,
                     Sched<[WriteFStore]>;

  def : InstAlias<"v"#OpcodeStr#".s\t{$src2, $src1, $dst|$dst, $src1, $src2}",
                  (!cast<Instruction>("V"#NAME#"rr_REV")
                   VR128:$dst, VR128:$src1, VR128:$src2), 0>;
  def : InstAlias<OpcodeStr#".s\t{$src2, $dst|$dst, $src2}",
                  (!cast<Instruction>(NAME#"rr_REV")
                   VR128:$dst, VR128:$src2), 0>;
}

// Loading from memory automatically zeroes the upper bits.
multiclass sse12_move_rm<RegisterClass RC, X86MemOperand x86memop,
                         PatFrag mem_pat, string OpcodeStr, Domain d> {
  def V#NAME#rm : SI<0x10, MRMSrcMem, (outs RC:$dst), (ins x86memop:$src),
                     !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
                     [(set RC:$dst, (mem_pat addr:$src))], d>,
                     VEX, VEX_LIG, Sched<[WriteFLoad]>, VEX_WIG;
  def NAME#rm   : SI<0x10, MRMSrcMem, (outs RC:$dst), (ins x86memop:$src),
                     !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
                     [(set RC:$dst, (mem_pat addr:$src))], d>,
                     Sched<[WriteFLoad]>;
}
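// For example, "movss (%rax), %xmm0" writes xmm0[31:0] from memory and zeroes
// xmm0[127:32], whereas the register-to-register form merges into the
// destination's upper bits (register/address choices here are illustrative).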

defm MOVSS : sse12_move<FR32, X86Movss, v4f32, f32mem, "movss",
                        SSEPackedSingle, "MOVSS", UseSSE1>, XS;
defm MOVSD : sse12_move<FR64, X86Movsd, v2f64, f64mem, "movsd",
                        SSEPackedDouble, "MOVSD", UseSSE2>, XD;

let canFoldAsLoad = 1, isReMaterializable = 1 in {
  defm MOVSS : sse12_move_rm<FR32, f32mem, loadf32, "movss",
                             SSEPackedSingle>, XS;
  defm MOVSD : sse12_move_rm<FR64, f64mem, loadf64, "movsd",
                             SSEPackedDouble>, XD;
}

// Patterns
let Predicates = [UseAVX] in {
  // MOVSSrm zeros the high parts of the register; represent this
  // with SUBREG_TO_REG. The AVX versions also write: DST[255:128] <- 0
  def : Pat<(v4f32 (X86vzmovl (v4f32 (scalar_to_vector (loadf32 addr:$src))))),
            (COPY_TO_REGCLASS (VMOVSSrm addr:$src), VR128)>;
  def : Pat<(v4f32 (X86vzmovl (loadv4f32 addr:$src))),
            (COPY_TO_REGCLASS (VMOVSSrm addr:$src), VR128)>;
  def : Pat<(v4f32 (X86vzload addr:$src)),
            (COPY_TO_REGCLASS (VMOVSSrm addr:$src), VR128)>;

  // MOVSDrm zeros the high parts of the register; represent this
  // with SUBREG_TO_REG. The AVX versions also write: DST[255:128] <- 0
  def : Pat<(v2f64 (X86vzmovl (v2f64 (scalar_to_vector (loadf64 addr:$src))))),
            (COPY_TO_REGCLASS (VMOVSDrm addr:$src), VR128)>;
  def : Pat<(v2f64 (X86vzmovl (loadv2f64 addr:$src))),
            (COPY_TO_REGCLASS (VMOVSDrm addr:$src), VR128)>;
  def : Pat<(v2f64 (X86vzmovl (bc_v2f64 (loadv4f32 addr:$src)))),
            (COPY_TO_REGCLASS (VMOVSDrm addr:$src), VR128)>;
  def : Pat<(v2f64 (X86vzload addr:$src)),
            (COPY_TO_REGCLASS (VMOVSDrm addr:$src), VR128)>;

  // Represent the same patterns above but in the form they appear for
  // 256-bit types
  def : Pat<(v8f32 (X86vzmovl (insert_subvector undef,
                   (v4f32 (scalar_to_vector (loadf32 addr:$src))), (iPTR 0)))),
            (SUBREG_TO_REG (i32 0), (VMOVSSrm addr:$src), sub_xmm)>;
  def : Pat<(v8f32 (X86vzload addr:$src)),
            (SUBREG_TO_REG (i32 0), (VMOVSSrm addr:$src), sub_xmm)>;
  def : Pat<(v4f64 (X86vzmovl (insert_subvector undef,
                   (v2f64 (scalar_to_vector (loadf64 addr:$src))), (iPTR 0)))),
            (SUBREG_TO_REG (i32 0), (VMOVSDrm addr:$src), sub_xmm)>;
  def : Pat<(v4f64 (X86vzload addr:$src)),
            (SUBREG_TO_REG (i32 0), (VMOVSDrm addr:$src), sub_xmm)>;

  // Extract and store.
  def : Pat<(store (f32 (extractelt (v4f32 VR128:$src), (iPTR 0))),
                   addr:$dst),
            (VMOVSSmr addr:$dst, (COPY_TO_REGCLASS (v4f32 VR128:$src), FR32))>;
}

let Predicates = [UseAVX, OptForSize] in {
  // Move scalar to XMM zero-extended: zero a VR128, then do a
  // MOVSS to the lower bits.
  def : Pat<(v4f32 (X86vzmovl (v4f32 VR128:$src))),
            (VMOVSSrr (v4f32 (V_SET0)), VR128:$src)>;
  def : Pat<(v4i32 (X86vzmovl (v4i32 VR128:$src))),
            (VMOVSSrr (v4i32 (V_SET0)), VR128:$src)>;

  // Move low f32 and clear high bits.
  def : Pat<(v8f32 (X86vzmovl (v8f32 VR256:$src))),
            (SUBREG_TO_REG (i32 0),
             (v4f32 (VMOVSSrr (v4f32 (V_SET0)),
              (v4f32 (EXTRACT_SUBREG (v8f32 VR256:$src), sub_xmm)))), sub_xmm)>;
  def : Pat<(v8i32 (X86vzmovl (v8i32 VR256:$src))),
            (SUBREG_TO_REG (i32 0),
             (v4i32 (VMOVSSrr (v4i32 (V_SET0)),
              (v4i32 (EXTRACT_SUBREG (v8i32 VR256:$src), sub_xmm)))), sub_xmm)>;

  def : Pat<(v4f64 (X86vzmovl (v4f64 VR256:$src))),
            (SUBREG_TO_REG (i32 0),
             (v2f64 (VMOVSDrr (v2f64 (V_SET0)),
                       (v2f64 (EXTRACT_SUBREG (v4f64 VR256:$src), sub_xmm)))),
             sub_xmm)>;
  def : Pat<(v4i64 (X86vzmovl (v4i64 VR256:$src))),
            (SUBREG_TO_REG (i32 0),
             (v2i64 (VMOVSDrr (v2i64 (V_SET0)),
                       (v2i64 (EXTRACT_SUBREG (v4i64 VR256:$src), sub_xmm)))),
             sub_xmm)>;
}

let Predicates = [UseSSE1] in {
  let Predicates = [UseSSE1, NoSSE41_Or_OptForSize] in {
  // Move scalar to XMM zero-extended: zero a VR128, then do a
  // MOVSS to the lower bits.
  def : Pat<(v4f32 (X86vzmovl (v4f32 VR128:$src))),
            (MOVSSrr (v4f32 (V_SET0)), VR128:$src)>;
  def : Pat<(v4i32 (X86vzmovl (v4i32 VR128:$src))),
            (MOVSSrr (v4i32 (V_SET0)), VR128:$src)>;
  }

  // MOVSSrm already zeros the high parts of the register.
  def : Pat<(v4f32 (X86vzmovl (v4f32 (scalar_to_vector (loadf32 addr:$src))))),
            (COPY_TO_REGCLASS (MOVSSrm addr:$src), VR128)>;
  def : Pat<(v4f32 (X86vzmovl (loadv4f32 addr:$src))),
            (COPY_TO_REGCLASS (MOVSSrm addr:$src), VR128)>;
  def : Pat<(v4f32 (X86vzload addr:$src)),
            (COPY_TO_REGCLASS (MOVSSrm addr:$src), VR128)>;

  // Extract and store.
  def : Pat<(store (f32 (extractelt (v4f32 VR128:$src), (iPTR 0))),
                   addr:$dst),
            (MOVSSmr addr:$dst, (COPY_TO_REGCLASS VR128:$src, FR32))>;
}

let Predicates = [UseSSE2] in {
  // MOVSDrm already zeros the high parts of the register.
  def : Pat<(v2f64 (X86vzmovl (v2f64 (scalar_to_vector (loadf64 addr:$src))))),
            (COPY_TO_REGCLASS (MOVSDrm addr:$src), VR128)>;
  def : Pat<(v2f64 (X86vzmovl (loadv2f64 addr:$src))),
            (COPY_TO_REGCLASS (MOVSDrm addr:$src), VR128)>;
  def : Pat<(v2f64 (X86vzmovl (bc_v2f64 (loadv4f32 addr:$src)))),
            (COPY_TO_REGCLASS (MOVSDrm addr:$src), VR128)>;
  def : Pat<(v2f64 (X86vzload addr:$src)),
            (COPY_TO_REGCLASS (MOVSDrm addr:$src), VR128)>;
}

// Aliases to help the assembler pick two-byte VEX encodings by swapping the
// operands relative to the normal instructions to use VEX.R instead of VEX.B.
def : InstAlias<"vmovss\t{$src2, $src1, $dst|$dst, $src1, $src2}",
                (VMOVSSrr_REV VR128L:$dst, VR128:$src1, VR128H:$src2), 0>;
def : InstAlias<"vmovsd\t{$src2, $src1, $dst|$dst, $src1, $src2}",
                (VMOVSDrr_REV VR128L:$dst, VR128:$src1, VR128H:$src2), 0>;

//===----------------------------------------------------------------------===//
// SSE 1 & 2 - Move Aligned/Unaligned FP Instructions
//===----------------------------------------------------------------------===//

multiclass sse12_mov_packed<bits<8> opc, RegisterClass RC,
                            X86MemOperand x86memop, PatFrag ld_frag,
                            string asm, Domain d,
                            X86SchedWriteMoveLS sched> {
let hasSideEffects = 0, isMoveReg = 1 in
  def rr : PI<opc, MRMSrcReg, (outs RC:$dst), (ins RC:$src),
              !strconcat(asm, "\t{$src, $dst|$dst, $src}"), [], d>,
           Sched<[sched.RR]>;
let canFoldAsLoad = 1, isReMaterializable = 1 in
  def rm : PI<opc, MRMSrcMem, (outs RC:$dst), (ins x86memop:$src),
              !strconcat(asm, "\t{$src, $dst|$dst, $src}"),
                   [(set RC:$dst, (ld_frag addr:$src))], d>,
           Sched<[sched.RM]>;
}

let Predicates = [HasAVX, NoVLX] in {
defm VMOVAPS : sse12_mov_packed<0x28, VR128, f128mem, alignedloadv4f32, "movaps",
                                SSEPackedSingle, SchedWriteFMoveLS.XMM>,
                                PS, VEX, VEX_WIG;
defm VMOVAPD : sse12_mov_packed<0x28, VR128, f128mem, alignedloadv2f64, "movapd",
                                SSEPackedDouble, SchedWriteFMoveLS.XMM>,
                                PD, VEX, VEX_WIG;
defm VMOVUPS : sse12_mov_packed<0x10, VR128, f128mem, loadv4f32, "movups",
                                SSEPackedSingle, SchedWriteFMoveLS.XMM>,
                                PS, VEX, VEX_WIG;
defm VMOVUPD : sse12_mov_packed<0x10, VR128, f128mem, loadv2f64, "movupd",
                                SSEPackedDouble, SchedWriteFMoveLS.XMM>,
                                PD, VEX, VEX_WIG;

defm VMOVAPSY : sse12_mov_packed<0x28, VR256, f256mem, alignedloadv8f32, "movaps",
                                 SSEPackedSingle, SchedWriteFMoveLS.YMM>,
                                 PS, VEX, VEX_L, VEX_WIG;
defm VMOVAPDY : sse12_mov_packed<0x28, VR256, f256mem, alignedloadv4f64, "movapd",
                                 SSEPackedDouble, SchedWriteFMoveLS.YMM>,
                                 PD, VEX, VEX_L, VEX_WIG;
defm VMOVUPSY : sse12_mov_packed<0x10, VR256, f256mem, loadv8f32, "movups",
                                 SSEPackedSingle, SchedWriteFMoveLS.YMM>,
                                 PS, VEX, VEX_L, VEX_WIG;
defm VMOVUPDY : sse12_mov_packed<0x10, VR256, f256mem, loadv4f64, "movupd",
                                 SSEPackedDouble, SchedWriteFMoveLS.YMM>,
                                 PD, VEX, VEX_L, VEX_WIG;
}

let Predicates = [UseSSE1] in {
defm MOVAPS : sse12_mov_packed<0x28, VR128, f128mem, alignedloadv4f32, "movaps",
                               SSEPackedSingle, SchedWriteFMoveLS.XMM>,
                               PS;
defm MOVUPS : sse12_mov_packed<0x10, VR128, f128mem, loadv4f32, "movups",
                               SSEPackedSingle, SchedWriteFMoveLS.XMM>,
                               PS;
}
let Predicates = [UseSSE2] in {
defm MOVAPD : sse12_mov_packed<0x28, VR128, f128mem, alignedloadv2f64, "movapd",
                               SSEPackedDouble, SchedWriteFMoveLS.XMM>,
                               PD;
defm MOVUPD : sse12_mov_packed<0x10, VR128, f128mem, loadv2f64, "movupd",
                               SSEPackedDouble, SchedWriteFMoveLS.XMM>,
                               PD;
}

let Predicates = [HasAVX, NoVLX] in {
let SchedRW = [SchedWriteFMoveLS.XMM.MR] in {
def VMOVAPSmr : VPSI<0x29, MRMDestMem, (outs), (ins f128mem:$dst, VR128:$src),
                   "movaps\t{$src, $dst|$dst, $src}",
                   [(alignedstore (v4f32 VR128:$src), addr:$dst)]>,
                   VEX, VEX_WIG;
def VMOVAPDmr : VPDI<0x29, MRMDestMem, (outs), (ins f128mem:$dst, VR128:$src),
                   "movapd\t{$src, $dst|$dst, $src}",
                   [(alignedstore (v2f64 VR128:$src), addr:$dst)]>,
                   VEX, VEX_WIG;
def VMOVUPSmr : VPSI<0x11, MRMDestMem, (outs), (ins f128mem:$dst, VR128:$src),
                   "movups\t{$src, $dst|$dst, $src}",
                   [(store (v4f32 VR128:$src), addr:$dst)]>,
                   VEX, VEX_WIG;
def VMOVUPDmr : VPDI<0x11, MRMDestMem, (outs), (ins f128mem:$dst, VR128:$src),
                   "movupd\t{$src, $dst|$dst, $src}",
                   [(store (v2f64 VR128:$src), addr:$dst)]>,
                   VEX, VEX_WIG;
} // SchedRW

let SchedRW = [SchedWriteFMoveLS.YMM.MR] in {
def VMOVAPSYmr : VPSI<0x29, MRMDestMem, (outs), (ins f256mem:$dst, VR256:$src),
                   "movaps\t{$src, $dst|$dst, $src}",
                   [(alignedstore (v8f32 VR256:$src), addr:$dst)]>,
                   VEX, VEX_L, VEX_WIG;
def VMOVAPDYmr : VPDI<0x29, MRMDestMem, (outs), (ins f256mem:$dst, VR256:$src),
                   "movapd\t{$src, $dst|$dst, $src}",
                   [(alignedstore (v4f64 VR256:$src), addr:$dst)]>,
                   VEX, VEX_L, VEX_WIG;
def VMOVUPSYmr : VPSI<0x11, MRMDestMem, (outs), (ins f256mem:$dst, VR256:$src),
                   "movups\t{$src, $dst|$dst, $src}",
                   [(store (v8f32 VR256:$src), addr:$dst)]>,
                   VEX, VEX_L, VEX_WIG;
def VMOVUPDYmr : VPDI<0x11, MRMDestMem, (outs), (ins f256mem:$dst, VR256:$src),
                   "movupd\t{$src, $dst|$dst, $src}",
                   [(store (v4f64 VR256:$src), addr:$dst)]>,
                   VEX, VEX_L, VEX_WIG;
} // SchedRW
} // Predicate

// For the disassembler
let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0,
    isMoveReg = 1 in {
let SchedRW = [SchedWriteFMoveLS.XMM.RR] in {
  def VMOVAPSrr_REV : VPSI<0x29, MRMDestReg, (outs VR128:$dst),
                          (ins VR128:$src),
                          "movaps\t{$src, $dst|$dst, $src}", []>,
                          VEX, VEX_WIG, FoldGenData<"VMOVAPSrr">;
  def VMOVAPDrr_REV : VPDI<0x29, MRMDestReg, (outs VR128:$dst),
                           (ins VR128:$src),
                           "movapd\t{$src, $dst|$dst, $src}", []>,
                           VEX, VEX_WIG, FoldGenData<"VMOVAPDrr">;
  def VMOVUPSrr_REV : VPSI<0x11, MRMDestReg, (outs VR128:$dst),
                           (ins VR128:$src),
                           "movups\t{$src, $dst|$dst, $src}", []>,
                           VEX, VEX_WIG, FoldGenData<"VMOVUPSrr">;
  def VMOVUPDrr_REV : VPDI<0x11, MRMDestReg, (outs VR128:$dst),
                           (ins VR128:$src),
                           "movupd\t{$src, $dst|$dst, $src}", []>,
                           VEX, VEX_WIG, FoldGenData<"VMOVUPDrr">;
} // SchedRW

let SchedRW = [SchedWriteFMoveLS.YMM.RR] in {
  def VMOVAPSYrr_REV : VPSI<0x29, MRMDestReg, (outs VR256:$dst),
                            (ins VR256:$src),
                            "movaps\t{$src, $dst|$dst, $src}", []>,
                            VEX, VEX_L, VEX_WIG, FoldGenData<"VMOVAPSYrr">;
  def VMOVAPDYrr_REV : VPDI<0x29, MRMDestReg, (outs VR256:$dst),
                            (ins VR256:$src),
                            "movapd\t{$src, $dst|$dst, $src}", []>,
                            VEX, VEX_L, VEX_WIG, FoldGenData<"VMOVAPDYrr">;
  def VMOVUPSYrr_REV : VPSI<0x11, MRMDestReg, (outs VR256:$dst),
                            (ins VR256:$src),
                            "movups\t{$src, $dst|$dst, $src}", []>,
                            VEX, VEX_L, VEX_WIG, FoldGenData<"VMOVUPSYrr">;
  def VMOVUPDYrr_REV : VPDI<0x11, MRMDestReg, (outs VR256:$dst),
                            (ins VR256:$src),
                            "movupd\t{$src, $dst|$dst, $src}", []>,
                            VEX, VEX_L, VEX_WIG, FoldGenData<"VMOVUPDYrr">;
} // SchedRW
} // Predicate

// Aliases to help the assembler pick two-byte VEX encodings by swapping the
// operands relative to the normal instructions to use VEX.R instead of VEX.B.
def : InstAlias<"vmovaps\t{$src, $dst|$dst, $src}",
                (VMOVAPSrr_REV VR128L:$dst, VR128H:$src), 0>;
def : InstAlias<"vmovapd\t{$src, $dst|$dst, $src}",
                (VMOVAPDrr_REV VR128L:$dst, VR128H:$src), 0>;
def : InstAlias<"vmovups\t{$src, $dst|$dst, $src}",
                (VMOVUPSrr_REV VR128L:$dst, VR128H:$src), 0>;
def : InstAlias<"vmovupd\t{$src, $dst|$dst, $src}",
                (VMOVUPDrr_REV VR128L:$dst, VR128H:$src), 0>;
def : InstAlias<"vmovaps\t{$src, $dst|$dst, $src}",
                (VMOVAPSYrr_REV VR256L:$dst, VR256H:$src), 0>;
def : InstAlias<"vmovapd\t{$src, $dst|$dst, $src}",
                (VMOVAPDYrr_REV VR256L:$dst, VR256H:$src), 0>;
def : InstAlias<"vmovups\t{$src, $dst|$dst, $src}",
                (VMOVUPSYrr_REV VR256L:$dst, VR256H:$src), 0>;
def : InstAlias<"vmovupd\t{$src, $dst|$dst, $src}",
                (VMOVUPDYrr_REV VR256L:$dst, VR256H:$src), 0>;

// Reversed version with ".s" suffix for GAS compatibility.
def : InstAlias<"vmovaps.s\t{$src, $dst|$dst, $src}",
                (VMOVAPSrr_REV VR128:$dst, VR128:$src), 0>;
def : InstAlias<"vmovapd.s\t{$src, $dst|$dst, $src}",
                (VMOVAPDrr_REV VR128:$dst, VR128:$src), 0>;
def : InstAlias<"vmovups.s\t{$src, $dst|$dst, $src}",
                (VMOVUPSrr_REV VR128:$dst, VR128:$src), 0>;
def : InstAlias<"vmovupd.s\t{$src, $dst|$dst, $src}",
                (VMOVUPDrr_REV VR128:$dst, VR128:$src), 0>;
def : InstAlias<"vmovaps.s\t{$src, $dst|$dst, $src}",
                (VMOVAPSYrr_REV VR256:$dst, VR256:$src), 0>;
def : InstAlias<"vmovapd.s\t{$src, $dst|$dst, $src}",
                (VMOVAPDYrr_REV VR256:$dst, VR256:$src), 0>;
def : InstAlias<"vmovups.s\t{$src, $dst|$dst, $src}",
                (VMOVUPSYrr_REV VR256:$dst, VR256:$src), 0>;
def : InstAlias<"vmovupd.s\t{$src, $dst|$dst, $src}",
                (VMOVUPDYrr_REV VR256:$dst, VR256:$src), 0>;

let SchedRW = [SchedWriteFMoveLS.XMM.MR] in {
def MOVAPSmr : PSI<0x29, MRMDestMem, (outs), (ins f128mem:$dst, VR128:$src),
                   "movaps\t{$src, $dst|$dst, $src}",
                   [(alignedstore (v4f32 VR128:$src), addr:$dst)]>;
def MOVAPDmr : PDI<0x29, MRMDestMem, (outs), (ins f128mem:$dst, VR128:$src),
                   "movapd\t{$src, $dst|$dst, $src}",
                   [(alignedstore (v2f64 VR128:$src), addr:$dst)]>;
def MOVUPSmr : PSI<0x11, MRMDestMem, (outs), (ins f128mem:$dst, VR128:$src),
                   "movups\t{$src, $dst|$dst, $src}",
                   [(store (v4f32 VR128:$src), addr:$dst)]>;
def MOVUPDmr : PDI<0x11, MRMDestMem, (outs), (ins f128mem:$dst, VR128:$src),
                   "movupd\t{$src, $dst|$dst, $src}",
                   [(store (v2f64 VR128:$src), addr:$dst)]>;
} // SchedRW

// For the disassembler
let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0,
    isMoveReg = 1, SchedRW = [SchedWriteFMoveLS.XMM.RR] in {
  def MOVAPSrr_REV : PSI<0x29, MRMDestReg, (outs VR128:$dst), (ins VR128:$src),
                         "movaps\t{$src, $dst|$dst, $src}", []>,
                         FoldGenData<"MOVAPSrr">;
  def MOVAPDrr_REV : PDI<0x29, MRMDestReg, (outs VR128:$dst), (ins VR128:$src),
                         "movapd\t{$src, $dst|$dst, $src}", []>,
                         FoldGenData<"MOVAPDrr">;
  def MOVUPSrr_REV : PSI<0x11, MRMDestReg, (outs VR128:$dst), (ins VR128:$src),
                         "movups\t{$src, $dst|$dst, $src}", []>,
                         FoldGenData<"MOVUPSrr">;
  def MOVUPDrr_REV : PDI<0x11, MRMDestReg, (outs VR128:$dst), (ins VR128:$src),
                         "movupd\t{$src, $dst|$dst, $src}", []>,
                         FoldGenData<"MOVUPDrr">;
}

// Reversed version with ".s" suffix for GAS compatibility.
def : InstAlias<"movaps.s\t{$src, $dst|$dst, $src}",
                (MOVAPSrr_REV VR128:$dst, VR128:$src), 0>;
def : InstAlias<"movapd.s\t{$src, $dst|$dst, $src}",
                (MOVAPDrr_REV VR128:$dst, VR128:$src), 0>;
def : InstAlias<"movups.s\t{$src, $dst|$dst, $src}",
                (MOVUPSrr_REV VR128:$dst, VR128:$src), 0>;
def : InstAlias<"movupd.s\t{$src, $dst|$dst, $src}",
                (MOVUPDrr_REV VR128:$dst, VR128:$src), 0>;

let Predicates = [HasAVX, NoVLX] in {
  // 256-bit load/store need to use floating point load/store in case we don't
  // have AVX2. Execution domain fixing will convert to integer if AVX2 is
  // available and changing the domain is beneficial.
  def : Pat<(alignedloadv4i64 addr:$src),
            (VMOVAPSYrm addr:$src)>;
  def : Pat<(loadv4i64 addr:$src),
            (VMOVUPSYrm addr:$src)>;
  def : Pat<(alignedstore (v4i64 VR256:$src), addr:$dst),
            (VMOVAPSYmr addr:$dst, VR256:$src)>;
  def : Pat<(alignedstore (v8i32 VR256:$src), addr:$dst),
            (VMOVAPSYmr addr:$dst, VR256:$src)>;
  def : Pat<(alignedstore (v16i16 VR256:$src), addr:$dst),
            (VMOVAPSYmr addr:$dst, VR256:$src)>;
  def : Pat<(alignedstore (v32i8 VR256:$src), addr:$dst),
            (VMOVAPSYmr addr:$dst, VR256:$src)>;
  def : Pat<(store (v4i64 VR256:$src), addr:$dst),
            (VMOVUPSYmr addr:$dst, VR256:$src)>;
  def : Pat<(store (v8i32 VR256:$src), addr:$dst),
            (VMOVUPSYmr addr:$dst, VR256:$src)>;
  def : Pat<(store (v16i16 VR256:$src), addr:$dst),
            (VMOVUPSYmr addr:$dst, VR256:$src)>;
  def : Pat<(store (v32i8 VR256:$src), addr:$dst),
            (VMOVUPSYmr addr:$dst, VR256:$src)>;
}

// Use movaps / movups for SSE integer load / store (one byte shorter).
// The instructions selected below are then converted to MOVDQA/MOVDQU
// during the SSE domain pass.
let Predicates = [UseSSE1] in {
  def : Pat<(alignedloadv2i64 addr:$src),
            (MOVAPSrm addr:$src)>;
  def : Pat<(loadv2i64 addr:$src),
            (MOVUPSrm addr:$src)>;

  def : Pat<(alignedstore (v2i64 VR128:$src), addr:$dst),
            (MOVAPSmr addr:$dst, VR128:$src)>;
  def : Pat<(alignedstore (v4i32 VR128:$src), addr:$dst),
            (MOVAPSmr addr:$dst, VR128:$src)>;
  def : Pat<(alignedstore (v8i16 VR128:$src), addr:$dst),
            (MOVAPSmr addr:$dst, VR128:$src)>;
  def : Pat<(alignedstore (v16i8 VR128:$src), addr:$dst),
            (MOVAPSmr addr:$dst, VR128:$src)>;
  def : Pat<(store (v2i64 VR128:$src), addr:$dst),
            (MOVUPSmr addr:$dst, VR128:$src)>;
  def : Pat<(store (v4i32 VR128:$src), addr:$dst),
            (MOVUPSmr addr:$dst, VR128:$src)>;
  def : Pat<(store (v8i16 VR128:$src), addr:$dst),
            (MOVUPSmr addr:$dst, VR128:$src)>;
  def : Pat<(store (v16i8 VR128:$src), addr:$dst),
            (MOVUPSmr addr:$dst, VR128:$src)>;
}
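// For example, "movaps (%rax), %xmm0" (0F 28) encodes one byte shorter than
// "movdqa (%rax), %xmm0" (66 0F 6F); the domain-fix pass may still rewrite it
// back to the integer form when that avoids a domain-crossing penalty.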

//===----------------------------------------------------------------------===//
// SSE 1 & 2 - Move Low packed FP Instructions
//===----------------------------------------------------------------------===//

multiclass sse12_mov_hilo_packed_base<bits<8> opc, SDNode pdnode,
                                      string base_opc, string asm_opr> {
  // No patterns, as they need to be special-cased between high and low.
  let hasSideEffects = 0, mayLoad = 1 in
  def PSrm : PI<opc, MRMSrcMem,
                (outs VR128:$dst), (ins VR128:$src1, f64mem:$src2),
                !strconcat(base_opc, "s", asm_opr),
                [], SSEPackedSingle>, PS,
                Sched<[SchedWriteFShuffle.XMM.Folded, ReadAfterLd]>;

  def PDrm : PI<opc, MRMSrcMem,
                (outs VR128:$dst), (ins VR128:$src1, f64mem:$src2),
                !strconcat(base_opc, "d", asm_opr),
                [(set VR128:$dst, (v2f64 (pdnode VR128:$src1,
                                  (scalar_to_vector (loadf64 addr:$src2)))))],
                SSEPackedDouble>, PD,
                Sched<[SchedWriteFShuffle.XMM.Folded, ReadAfterLd]>;
}

multiclass sse12_mov_hilo_packed<bits<8> opc, SDPatternOperator pdnode,
                                 string base_opc> {
  let Predicates = [UseAVX] in
    defm V#NAME : sse12_mov_hilo_packed_base<opc, pdnode, base_opc,
                                    "\t{$src2, $src1, $dst|$dst, $src1, $src2}">,
                                    VEX_4V, VEX_WIG;

  let Constraints = "$src1 = $dst" in
    defm NAME : sse12_mov_hilo_packed_base<opc, pdnode, base_opc,
                                    "\t{$src2, $dst|$dst, $src2}">;
}

defm MOVL : sse12_mov_hilo_packed<0x12, X86Movsd, "movlp">;

let SchedRW = [WriteFStore] in {
let Predicates = [UseAVX] in {
def VMOVLPSmr : VPSI<0x13, MRMDestMem, (outs), (ins f64mem:$dst, VR128:$src),
                     "movlps\t{$src, $dst|$dst, $src}",
                     [(store (f64 (extractelt (bc_v2f64 (v4f32 VR128:$src)),
                                   (iPTR 0))), addr:$dst)]>,
                     VEX, VEX_WIG;
def VMOVLPDmr : VPDI<0x13, MRMDestMem, (outs), (ins f64mem:$dst, VR128:$src),
                     "movlpd\t{$src, $dst|$dst, $src}",
                     [(store (f64 (extractelt (v2f64 VR128:$src),
                                   (iPTR 0))), addr:$dst)]>,
                     VEX, VEX_WIG;
} // UseAVX
def MOVLPSmr : PSI<0x13, MRMDestMem, (outs), (ins f64mem:$dst, VR128:$src),
                   "movlps\t{$src, $dst|$dst, $src}",
                   [(store (f64 (extractelt (bc_v2f64 (v4f32 VR128:$src)),
                                 (iPTR 0))), addr:$dst)]>;
def MOVLPDmr : PDI<0x13, MRMDestMem, (outs), (ins f64mem:$dst, VR128:$src),
                   "movlpd\t{$src, $dst|$dst, $src}",
                   [(store (f64 (extractelt (v2f64 VR128:$src),
                                 (iPTR 0))), addr:$dst)]>;
} // SchedRW

let Predicates = [UseSSE1] in {
  // (store (vector_shuffle (load addr), v2, <4, 5, 2, 3>), addr) using MOVLPS
  def : Pat<(store (i64 (extractelt (bc_v2i64 (v4f32 VR128:$src2)),
                                 (iPTR 0))), addr:$src1),
            (MOVLPSmr addr:$src1, VR128:$src2)>;

  // This pattern helps select MOVLPS on SSE1 only targets. With SSE2 we'll
  // end up with a movsd or blend instead of shufp.
  // No need for aligned load, we're only loading 64-bits.
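  // Note: (i8 -28) below is the shuffle immediate 0xE4, which takes elements
  // 0-1 from the loaded vector and 2-3 from $src1 -- exactly MOVLPS's effect
  // of replacing only the low 64 bits.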
  def : Pat<(X86Shufp (loadv4f32 addr:$src2), VR128:$src1, (i8 -28)),
            (MOVLPSrm VR128:$src1, addr:$src2)>;
}

//===----------------------------------------------------------------------===//
// SSE 1 & 2 - Move Hi packed FP Instructions
//===----------------------------------------------------------------------===//

defm MOVH : sse12_mov_hilo_packed<0x16, X86Unpckl, "movhp">;

let SchedRW = [WriteFStore] in {
// v2f64 extract element 1 is always custom lowered to unpack high to low
// and extract element 0, so the non-store version isn't too horrible.
let Predicates = [UseAVX] in {
def VMOVHPSmr : VPSI<0x17, MRMDestMem, (outs), (ins f64mem:$dst, VR128:$src),
                   "movhps\t{$src, $dst|$dst, $src}",
                   [(store (f64 (extractelt
                                 (X86Unpckh (bc_v2f64 (v4f32 VR128:$src)),
                                            (bc_v2f64 (v4f32 VR128:$src))),
                                 (iPTR 0))), addr:$dst)]>, VEX, VEX_WIG;
def VMOVHPDmr : VPDI<0x17, MRMDestMem, (outs), (ins f64mem:$dst, VR128:$src),
                   "movhpd\t{$src, $dst|$dst, $src}",
                   [(store (f64 (extractelt
                                 (v2f64 (X86Unpckh VR128:$src, VR128:$src)),
                                 (iPTR 0))), addr:$dst)]>, VEX, VEX_WIG;
} // UseAVX
def MOVHPSmr : PSI<0x17, MRMDestMem, (outs), (ins f64mem:$dst, VR128:$src),
                   "movhps\t{$src, $dst|$dst, $src}",
                   [(store (f64 (extractelt
                                 (X86Unpckh (bc_v2f64 (v4f32 VR128:$src)),
                                            (bc_v2f64 (v4f32 VR128:$src))),
                                 (iPTR 0))), addr:$dst)]>;
def MOVHPDmr : PDI<0x17, MRMDestMem, (outs), (ins f64mem:$dst, VR128:$src),
                   "movhpd\t{$src, $dst|$dst, $src}",
                   [(store (f64 (extractelt
                                 (v2f64 (X86Unpckh VR128:$src, VR128:$src)),
                                 (iPTR 0))), addr:$dst)]>;
} // SchedRW

let Predicates = [UseAVX] in {
  // Also handle an i64 load because that may get selected as a faster way to
  // load the data.
  def : Pat<(v2f64 (X86Unpckl VR128:$src1,
                      (bc_v2f64 (v2i64 (scalar_to_vector (loadi64 addr:$src2)))))),
            (VMOVHPDrm VR128:$src1, addr:$src2)>;

  def : Pat<(store (f64 (extractelt
                          (v2f64 (X86VPermilpi VR128:$src, (i8 1))),
                          (iPTR 0))), addr:$dst),
            (VMOVHPDmr addr:$dst, VR128:$src)>;
}

let Predicates = [UseSSE1] in {
  // This pattern helps select MOVHPS on SSE1 only targets. With SSE2 we'll
  // end up with a movsd or blend instead of shufp.
  // No need for aligned load, we're only loading 64-bits.
  def : Pat<(X86Movlhps VR128:$src1, (loadv4f32 addr:$src2)),
            (MOVHPSrm VR128:$src1, addr:$src2)>;
}

let Predicates = [UseSSE2] in {
  // MOVHPD patterns

  // Also handle an i64 load because that may get selected as a faster way to
  // load the data.
  def : Pat<(v2f64 (X86Unpckl VR128:$src1,
                      (bc_v2f64 (v2i64 (scalar_to_vector (loadi64 addr:$src2)))))),
            (MOVHPDrm VR128:$src1, addr:$src2)>;

  def : Pat<(store (f64 (extractelt
                          (v2f64 (X86Shufp VR128:$src, VR128:$src, (i8 1))),
                          (iPTR 0))), addr:$dst),
            (MOVHPDmr addr:$dst, VR128:$src)>;
}

//===----------------------------------------------------------------------===//
// SSE 1 & 2 - Move Low to High and High to Low packed FP Instructions
//===----------------------------------------------------------------------===//

let Predicates = [UseAVX] in {
  def VMOVLHPSrr : VPSI<0x16, MRMSrcReg, (outs VR128:$dst),
                                       (ins VR128:$src1, VR128:$src2),
                      "movlhps\t{$src2, $src1, $dst|$dst, $src1, $src2}",
                      [(set VR128:$dst,
                        (v4f32 (X86Movlhps VR128:$src1, VR128:$src2)))]>,
                      VEX_4V, Sched<[SchedWriteFShuffle.XMM]>, VEX_WIG;
  let isCommutable = 1 in
  def VMOVHLPSrr : VPSI<0x12, MRMSrcReg, (outs VR128:$dst),
                                       (ins VR128:$src1, VR128:$src2),
                      "movhlps\t{$src2, $src1, $dst|$dst, $src1, $src2}",
                      [(set VR128:$dst,
                        (v4f32 (X86Movhlps VR128:$src1, VR128:$src2)))]>,
                      VEX_4V, Sched<[SchedWriteFShuffle.XMM]>, VEX_WIG,
                      NotMemoryFoldable;
}
let Constraints = "$src1 = $dst" in {
  def MOVLHPSrr : PSI<0x16, MRMSrcReg, (outs VR128:$dst),
                                       (ins VR128:$src1, VR128:$src2),
                      "movlhps\t{$src2, $dst|$dst, $src2}",
                      [(set VR128:$dst,
                        (v4f32 (X86Movlhps VR128:$src1, VR128:$src2)))]>,
                      Sched<[SchedWriteFShuffle.XMM]>;
  let isCommutable = 1 in
  def MOVHLPSrr : PSI<0x12, MRMSrcReg, (outs VR128:$dst),
                                       (ins VR128:$src1, VR128:$src2),
                      "movhlps\t{$src2, $dst|$dst, $src2}",
                      [(set VR128:$dst,
                        (v4f32 (X86Movhlps VR128:$src1, VR128:$src2)))]>,
                      Sched<[SchedWriteFShuffle.XMM]>, NotMemoryFoldable;
}

// TODO: This is largely to trick fastisel into ignoring the pattern.
def UnpckhUnary : PatFrag<(ops node:$src1, node:$src2),
                          (X86Unpckh node:$src1, node:$src2), [{
  return N->getOperand(0) == N->getOperand(1);
}]>;

let Predicates = [UseSSE2] in {
  // TODO: This is a hack pattern to allow lowering to emit unpckh instead of
  // movhlps for sse2 without changing a bunch of tests.
  def : Pat<(v2f64 (UnpckhUnary VR128:$src, VR128:$src)),
            (MOVHLPSrr VR128:$src, VR128:$src)>;
}

//===----------------------------------------------------------------------===//
// SSE 1 & 2 - Conversion Instructions
//===----------------------------------------------------------------------===//

multiclass sse12_cvt_s<bits<8> opc, RegisterClass SrcRC, RegisterClass DstRC,
                     SDNode OpNode, X86MemOperand x86memop, PatFrag ld_frag,
                     string asm, X86FoldableSchedWrite sched> {
  def rr : SI<opc, MRMSrcReg, (outs DstRC:$dst), (ins SrcRC:$src), asm,
                        [(set DstRC:$dst, (OpNode SrcRC:$src))]>,
                        Sched<[sched]>;
  def rm : SI<opc, MRMSrcMem, (outs DstRC:$dst), (ins x86memop:$src), asm,
                        [(set DstRC:$dst, (OpNode (ld_frag addr:$src)))]>,
                        Sched<[sched.Folded]>;
}

multiclass sse12_cvt_p<bits<8> opc, RegisterClass RC, X86MemOperand x86memop,
                       ValueType DstTy, ValueType SrcTy, PatFrag ld_frag,
                       string asm, Domain d, X86FoldableSchedWrite sched> {
let hasSideEffects = 0 in {
  def rr : I<opc, MRMSrcReg, (outs RC:$dst), (ins RC:$src), asm,
             [(set RC:$dst, (DstTy (sint_to_fp (SrcTy RC:$src))))], d>,
             Sched<[sched]>;
  let mayLoad = 1 in
  def rm : I<opc, MRMSrcMem, (outs RC:$dst), (ins x86memop:$src), asm,
             [(set RC:$dst, (DstTy (sint_to_fp
                                    (SrcTy (bitconvert (ld_frag addr:$src))))))], d>,
             Sched<[sched.Folded]>;
}
}

multiclass sse12_vcvt_avx<bits<8> opc, RegisterClass SrcRC, RegisterClass DstRC,
                          X86MemOperand x86memop, string asm,
                          X86FoldableSchedWrite sched> {
let hasSideEffects = 0, Predicates = [UseAVX] in {
  def rr : SI<opc, MRMSrcReg, (outs DstRC:$dst), (ins DstRC:$src1, SrcRC:$src),
              !strconcat(asm,"\t{$src, $src1, $dst|$dst, $src1, $src}"), []>,
              Sched<[sched]>;
  let mayLoad = 1 in
  def rm : SI<opc, MRMSrcMem, (outs DstRC:$dst),
              (ins DstRC:$src1, x86memop:$src),
              !strconcat(asm,"\t{$src, $src1, $dst|$dst, $src1, $src}"), []>,
           Sched<[sched.Folded, ReadAfterLd]>;
} // hasSideEffects = 0
}

let Predicates = [UseAVX] in {
defm VCVTTSS2SI   : sse12_cvt_s<0x2C, FR32, GR32, fp_to_sint, f32mem, loadf32,
                                "cvttss2si\t{$src, $dst|$dst, $src}",
                                WriteCvtSS2I>,
                                XS, VEX, VEX_LIG;
defm VCVTTSS2SI64 : sse12_cvt_s<0x2C, FR32, GR64, fp_to_sint, f32mem, loadf32,
                                "cvttss2si\t{$src, $dst|$dst, $src}",
                                WriteCvtSS2I>,
                                XS, VEX, VEX_W, VEX_LIG;
defm VCVTTSD2SI   : sse12_cvt_s<0x2C, FR64, GR32, fp_to_sint, f64mem, loadf64,
                                "cvttsd2si\t{$src, $dst|$dst, $src}",
                                WriteCvtSD2I>,
                                XD, VEX, VEX_LIG;
defm VCVTTSD2SI64 : sse12_cvt_s<0x2C, FR64, GR64, fp_to_sint, f64mem, loadf64,
                                "cvttsd2si\t{$src, $dst|$dst, $src}",
                                WriteCvtSD2I>,
                                XD, VEX, VEX_W, VEX_LIG;

def : InstAlias<"vcvttss2si{l}\t{$src, $dst|$dst, $src}",
                (VCVTTSS2SIrr GR32:$dst, FR32:$src), 0, "att">;
def : InstAlias<"vcvttss2si{l}\t{$src, $dst|$dst, $src}",
                (VCVTTSS2SIrm GR32:$dst, f32mem:$src), 0, "att">;
def : InstAlias<"vcvttsd2si{l}\t{$src, $dst|$dst, $src}",
                (VCVTTSD2SIrr GR32:$dst, FR64:$src), 0, "att">;
def : InstAlias<"vcvttsd2si{l}\t{$src, $dst|$dst, $src}",
                (VCVTTSD2SIrm GR32:$dst, f64mem:$src), 0, "att">;
def : InstAlias<"vcvttss2si{q}\t{$src, $dst|$dst, $src}",
                (VCVTTSS2SI64rr GR64:$dst, FR32:$src), 0, "att">;
def : InstAlias<"vcvttss2si{q}\t{$src, $dst|$dst, $src}",
                (VCVTTSS2SI64rm GR64:$dst, f32mem:$src), 0, "att">;
def : InstAlias<"vcvttsd2si{q}\t{$src, $dst|$dst, $src}",
                (VCVTTSD2SI64rr GR64:$dst, FR64:$src), 0, "att">;
def : InstAlias<"vcvttsd2si{q}\t{$src, $dst|$dst, $src}",
                (VCVTTSD2SI64rm GR64:$dst, f64mem:$src), 0, "att">;
}
// The assembler can recognize rr 64-bit instructions by seeing an rxx
// register, but the same isn't true when only using memory operands;
// provide other assembly "l" and "q" forms to address this explicitly
// where appropriate.
defm VCVTSI2SS   : sse12_vcvt_avx<0x2A, GR32, FR32, i32mem, "cvtsi2ss{l}",
                                  WriteCvtI2SS>, XS, VEX_4V, VEX_LIG;
defm VCVTSI642SS : sse12_vcvt_avx<0x2A, GR64, FR32, i64mem, "cvtsi2ss{q}",
                                  WriteCvtI2SS>, XS, VEX_4V, VEX_W, VEX_LIG;
defm VCVTSI2SD   : sse12_vcvt_avx<0x2A, GR32, FR64, i32mem, "cvtsi2sd{l}",
                                  WriteCvtI2SD>, XD, VEX_4V, VEX_LIG;
defm VCVTSI642SD : sse12_vcvt_avx<0x2A, GR64, FR64, i64mem, "cvtsi2sd{q}",
                                  WriteCvtI2SD>, XD, VEX_4V, VEX_W, VEX_LIG;

let Predicates = [UseAVX] in {
  def : InstAlias<"vcvtsi2ss\t{$src, $src1, $dst|$dst, $src1, $src}",
                (VCVTSI2SSrm FR64:$dst, FR64:$src1, i32mem:$src), 0, "att">;
  def : InstAlias<"vcvtsi2sd\t{$src, $src1, $dst|$dst, $src1, $src}",
                (VCVTSI2SDrm FR64:$dst, FR64:$src1, i32mem:$src), 0, "att">;

  def : Pat<(f32 (sint_to_fp (loadi32 addr:$src))),
            (VCVTSI2SSrm (f32 (IMPLICIT_DEF)), addr:$src)>;
  def : Pat<(f32 (sint_to_fp (loadi64 addr:$src))),
            (VCVTSI642SSrm (f32 (IMPLICIT_DEF)), addr:$src)>;
  def : Pat<(f64 (sint_to_fp (loadi32 addr:$src))),
            (VCVTSI2SDrm (f64 (IMPLICIT_DEF)), addr:$src)>;
  def : Pat<(f64 (sint_to_fp (loadi64 addr:$src))),
            (VCVTSI642SDrm (f64 (IMPLICIT_DEF)), addr:$src)>;

  def : Pat<(f32 (sint_to_fp GR32:$src)),
            (VCVTSI2SSrr (f32 (IMPLICIT_DEF)), GR32:$src)>;
  def : Pat<(f32 (sint_to_fp GR64:$src)),
            (VCVTSI642SSrr (f32 (IMPLICIT_DEF)), GR64:$src)>;
  def : Pat<(f64 (sint_to_fp GR32:$src)),
            (VCVTSI2SDrr (f64 (IMPLICIT_DEF)), GR32:$src)>;
  def : Pat<(f64 (sint_to_fp GR64:$src)),
            (VCVTSI642SDrr (f64 (IMPLICIT_DEF)), GR64:$src)>;
}
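// The (IMPLICIT_DEF) first operand above supplies the tied/VEX.vvvv source
// without creating a real dependence: e.g. a (hypothetical) selection
// "vcvtsi2ss %eax, %xmm7, %xmm0" reads %xmm7 only to fill bits [127:32]
// of the destination.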

defm CVTTSS2SI : sse12_cvt_s<0x2C, FR32, GR32, fp_to_sint, f32mem, loadf32,
                      "cvttss2si\t{$src, $dst|$dst, $src}",
                      WriteCvtSS2I>, XS;
defm CVTTSS2SI64 : sse12_cvt_s<0x2C, FR32, GR64, fp_to_sint, f32mem, loadf32,
                      "cvttss2si\t{$src, $dst|$dst, $src}",
                      WriteCvtSS2I>, XS, REX_W;
defm CVTTSD2SI : sse12_cvt_s<0x2C, FR64, GR32, fp_to_sint, f64mem, loadf64,
                      "cvttsd2si\t{$src, $dst|$dst, $src}",
                      WriteCvtSD2I>, XD;
defm CVTTSD2SI64 : sse12_cvt_s<0x2C, FR64, GR64, fp_to_sint, f64mem, loadf64,
                      "cvttsd2si\t{$src, $dst|$dst, $src}",
                      WriteCvtSD2I>, XD, REX_W;
defm CVTSI2SS  : sse12_cvt_s<0x2A, GR32, FR32, sint_to_fp, i32mem, loadi32,
                      "cvtsi2ss{l}\t{$src, $dst|$dst, $src}",
                      WriteCvtI2SS>, XS;
defm CVTSI642SS : sse12_cvt_s<0x2A, GR64, FR32, sint_to_fp, i64mem, loadi64,
                      "cvtsi2ss{q}\t{$src, $dst|$dst, $src}",
                      WriteCvtI2SS>, XS, REX_W;
defm CVTSI2SD  : sse12_cvt_s<0x2A, GR32, FR64, sint_to_fp, i32mem, loadi32,
                      "cvtsi2sd{l}\t{$src, $dst|$dst, $src}",
                      WriteCvtI2SD>, XD;
defm CVTSI642SD : sse12_cvt_s<0x2A, GR64, FR64, sint_to_fp, i64mem, loadi64,
                      "cvtsi2sd{q}\t{$src, $dst|$dst, $src}",
                      WriteCvtI2SD>, XD, REX_W;

def : InstAlias<"cvttss2si{l}\t{$src, $dst|$dst, $src}",
                (CVTTSS2SIrr GR32:$dst, FR32:$src), 0, "att">;
def : InstAlias<"cvttss2si{l}\t{$src, $dst|$dst, $src}",
                (CVTTSS2SIrm GR32:$dst, f32mem:$src), 0, "att">;
def : InstAlias<"cvttsd2si{l}\t{$src, $dst|$dst, $src}",
                (CVTTSD2SIrr GR32:$dst, FR64:$src), 0, "att">;
def : InstAlias<"cvttsd2si{l}\t{$src, $dst|$dst, $src}",
                (CVTTSD2SIrm GR32:$dst, f64mem:$src), 0, "att">;
def : InstAlias<"cvttss2si{q}\t{$src, $dst|$dst, $src}",
                (CVTTSS2SI64rr GR64:$dst, FR32:$src), 0, "att">;
def : InstAlias<"cvttss2si{q}\t{$src, $dst|$dst, $src}",
                (CVTTSS2SI64rm GR64:$dst, f32mem:$src), 0, "att">;
def : InstAlias<"cvttsd2si{q}\t{$src, $dst|$dst, $src}",
                (CVTTSD2SI64rr GR64:$dst, FR64:$src), 0, "att">;
def : InstAlias<"cvttsd2si{q}\t{$src, $dst|$dst, $src}",
                (CVTTSD2SI64rm GR64:$dst, f64mem:$src), 0, "att">;

def : InstAlias<"cvtsi2ss\t{$src, $dst|$dst, $src}",
                (CVTSI2SSrm FR64:$dst, i32mem:$src), 0, "att">;
def : InstAlias<"cvtsi2sd\t{$src, $dst|$dst, $src}",
                (CVTSI2SDrm FR64:$dst, i32mem:$src), 0, "att">;

// Conversion Instructions Intrinsics - Match intrinsics which expect MM
// and/or XMM operand(s).

// FIXME: We probably want to match the rm form only when optimizing for
// size, to avoid false dependencies (see sse_fp_unop_s for details).
multiclass sse12_cvt_sint<bits<8> opc, RegisterClass SrcRC, RegisterClass DstRC,
                          Intrinsic Int, Operand memop, ComplexPattern mem_cpat,
                          string asm, X86FoldableSchedWrite sched> {
  def rr_Int : SI<opc, MRMSrcReg, (outs DstRC:$dst), (ins SrcRC:$src),
                  !strconcat(asm, "\t{$src, $dst|$dst, $src}"),
                  [(set DstRC:$dst, (Int SrcRC:$src))]>,
               Sched<[sched]>;
  def rm_Int : SI<opc, MRMSrcMem, (outs DstRC:$dst), (ins memop:$src),
                  !strconcat(asm, "\t{$src, $dst|$dst, $src}"),
                  [(set DstRC:$dst, (Int mem_cpat:$src))]>,
               Sched<[sched.Folded]>;
}

multiclass sse12_cvt_sint_3addr<bits<8> opc, RegisterClass SrcRC,
                    RegisterClass DstRC, X86MemOperand x86memop,
                    string asm, X86FoldableSchedWrite sched,
                    bit Is2Addr = 1> {
let hasSideEffects = 0 in {
  def rr_Int : SI<opc, MRMSrcReg, (outs DstRC:$dst), (ins DstRC:$src1, SrcRC:$src2),
                  !if(Is2Addr,
                      !strconcat(asm, "\t{$src2, $dst|$dst, $src2}"),
                      !strconcat(asm, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
                  []>, Sched<[sched]>;
  let mayLoad = 1 in
  def rm_Int : SI<opc, MRMSrcMem, (outs DstRC:$dst),
                  (ins DstRC:$src1, x86memop:$src2),
                  !if(Is2Addr,
                      !strconcat(asm, "\t{$src2, $dst|$dst, $src2}"),
                      !strconcat(asm, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
                  []>, Sched<[sched.Folded, ReadAfterLd]>;
}
}

let Predicates = [UseAVX] in {
defm VCVTSD2SI : sse12_cvt_sint<0x2D, VR128, GR32,
                  int_x86_sse2_cvtsd2si, sdmem, sse_load_f64, "cvtsd2si",
                  WriteCvtSD2I>, XD, VEX, VEX_LIG;
defm VCVTSD2SI64 : sse12_cvt_sint<0x2D, VR128, GR64,
                    int_x86_sse2_cvtsd2si64, sdmem, sse_load_f64, "cvtsd2si",
                    WriteCvtSD2I>, XD, VEX, VEX_W, VEX_LIG;
}
defm CVTSD2SI : sse12_cvt_sint<0x2D, VR128, GR32, int_x86_sse2_cvtsd2si,
                 sdmem, sse_load_f64, "cvtsd2si", WriteCvtSD2I>, XD;
defm CVTSD2SI64 : sse12_cvt_sint<0x2D, VR128, GR64, int_x86_sse2_cvtsd2si64,
                   sdmem, sse_load_f64, "cvtsd2si", WriteCvtSD2I>, XD, REX_W;


let isCodeGenOnly = 1 in {
  let Predicates = [UseAVX] in {
  defm VCVTSI2SS : sse12_cvt_sint_3addr<0x2A, GR32, VR128,
            i32mem, "cvtsi2ss{l}", WriteCvtI2SS, 0>, XS, VEX_4V;
  defm VCVTSI642SS : sse12_cvt_sint_3addr<0x2A, GR64, VR128,
            i64mem, "cvtsi2ss{q}", WriteCvtI2SS, 0>, XS, VEX_4V, VEX_W;
  defm VCVTSI2SD : sse12_cvt_sint_3addr<0x2A, GR32, VR128,
            i32mem, "cvtsi2sd{l}", WriteCvtI2SD, 0>, XD, VEX_4V;
  defm VCVTSI642SD : sse12_cvt_sint_3addr<0x2A, GR64, VR128,
            i64mem, "cvtsi2sd{q}", WriteCvtI2SD, 0>, XD, VEX_4V, VEX_W;
  }
  let Constraints = "$src1 = $dst" in {
    defm CVTSI2SS : sse12_cvt_sint_3addr<0x2A, GR32, VR128,
                          i32mem, "cvtsi2ss{l}", WriteCvtI2SS>, XS;
    defm CVTSI642SS : sse12_cvt_sint_3addr<0x2A, GR64, VR128,
                          i64mem, "cvtsi2ss{q}", WriteCvtI2SS>, XS, REX_W;
    defm CVTSI2SD : sse12_cvt_sint_3addr<0x2A, GR32, VR128,
                          i32mem, "cvtsi2sd{l}", WriteCvtI2SD>, XD;
    defm CVTSI642SD : sse12_cvt_sint_3addr<0x2A, GR64, VR128,
                          i64mem, "cvtsi2sd{q}", WriteCvtI2SD>, XD, REX_W;
  }
} // isCodeGenOnly = 1

/// SSE 1 Only

// Aliases for intrinsics
let isCodeGenOnly = 1 in {
let Predicates = [UseAVX] in {
defm VCVTTSS2SI : sse12_cvt_sint<0x2C, VR128, GR32, int_x86_sse_cvttss2si,
                                ssmem, sse_load_f32, "cvttss2si",
                                WriteCvtSS2I>, XS, VEX;
defm VCVTTSS2SI64 : sse12_cvt_sint<0x2C, VR128, GR64,
                               int_x86_sse_cvttss2si64, ssmem, sse_load_f32,
                               "cvttss2si", WriteCvtSS2I>,
                               XS, VEX, VEX_W;
defm VCVTTSD2SI : sse12_cvt_sint<0x2C, VR128, GR32, int_x86_sse2_cvttsd2si,
                                sdmem, sse_load_f64, "cvttsd2si",
                                WriteCvtSD2I>, XD, VEX;
defm VCVTTSD2SI64 : sse12_cvt_sint<0x2C, VR128, GR64,
                              int_x86_sse2_cvttsd2si64, sdmem, sse_load_f64,
                              "cvttsd2si", WriteCvtSD2I>,
                              XD, VEX, VEX_W;
}
defm CVTTSS2SI : sse12_cvt_sint<0x2C, VR128, GR32, int_x86_sse_cvttss2si,
                                    ssmem, sse_load_f32, "cvttss2si",
                                    WriteCvtSS2I>, XS;
defm CVTTSS2SI64 : sse12_cvt_sint<0x2C, VR128, GR64,
                                   int_x86_sse_cvttss2si64, ssmem, sse_load_f32,
                                   "cvttss2si", WriteCvtSS2I>, XS, REX_W;
defm CVTTSD2SI : sse12_cvt_sint<0x2C, VR128, GR32, int_x86_sse2_cvttsd2si,
                                    sdmem, sse_load_f64, "cvttsd2si",
                                    WriteCvtSD2I>, XD;
defm CVTTSD2SI64 : sse12_cvt_sint<0x2C, VR128, GR64,
                                  int_x86_sse2_cvttsd2si64, sdmem, sse_load_f64,
                                  "cvttsd2si", WriteCvtSD2I>, XD, REX_W;
} // isCodeGenOnly = 1

let Predicates = [UseAVX] in {
defm VCVTSS2SI   : sse12_cvt_sint<0x2D, VR128, GR32, int_x86_sse_cvtss2si,
                                  ssmem, sse_load_f32, "cvtss2si",
                                  WriteCvtSS2I>, XS, VEX, VEX_LIG;
defm VCVTSS2SI64 : sse12_cvt_sint<0x2D, VR128, GR64, int_x86_sse_cvtss2si64,
                                  ssmem, sse_load_f32, "cvtss2si",
                                  WriteCvtSS2I>, XS, VEX, VEX_W, VEX_LIG;
}
defm CVTSS2SI : sse12_cvt_sint<0x2D, VR128, GR32, int_x86_sse_cvtss2si,
                               ssmem, sse_load_f32, "cvtss2si",
                               WriteCvtSS2I>, XS;
defm CVTSS2SI64 : sse12_cvt_sint<0x2D, VR128, GR64, int_x86_sse_cvtss2si64,
                                 ssmem, sse_load_f32, "cvtss2si",
                                 WriteCvtSS2I>, XS, REX_W;

defm VCVTDQ2PS   : sse12_cvt_p<0x5B, VR128, i128mem, v4f32, v4i32, loadv2i64,
                               "vcvtdq2ps\t{$src, $dst|$dst, $src}",
                               SSEPackedSingle, WriteCvtI2PS>,
                               PS, VEX, Requires<[HasAVX, NoVLX]>, VEX_WIG;
defm VCVTDQ2PSY  : sse12_cvt_p<0x5B, VR256, i256mem, v8f32, v8i32, loadv4i64,
                               "vcvtdq2ps\t{$src, $dst|$dst, $src}",
                               SSEPackedSingle, WriteCvtI2PSY>,
                               PS, VEX, VEX_L, Requires<[HasAVX, NoVLX]>, VEX_WIG;

defm CVTDQ2PS : sse12_cvt_p<0x5B, VR128, i128mem, v4f32, v4i32, memopv2i64,
                            "cvtdq2ps\t{$src, $dst|$dst, $src}",
                            SSEPackedSingle, WriteCvtI2PS>,
                            PS, Requires<[UseSSE2]>;

let Predicates = [UseAVX] in {
def : InstAlias<"vcvtss2si{l}\t{$src, $dst|$dst, $src}",
                (VCVTSS2SIrr_Int GR32:$dst, VR128:$src), 0, "att">;
def : InstAlias<"vcvtss2si{l}\t{$src, $dst|$dst, $src}",
                (VCVTSS2SIrm_Int GR32:$dst, ssmem:$src), 0, "att">;
def : InstAlias<"vcvtsd2si{l}\t{$src, $dst|$dst, $src}",
                (VCVTSD2SIrr_Int GR32:$dst, VR128:$src), 0, "att">;
def : InstAlias<"vcvtsd2si{l}\t{$src, $dst|$dst, $src}",
                (VCVTSD2SIrm_Int GR32:$dst, sdmem:$src), 0, "att">;
def : InstAlias<"vcvtss2si{q}\t{$src, $dst|$dst, $src}",
                (VCVTSS2SI64rr_Int GR64:$dst, VR128:$src), 0, "att">;
def : InstAlias<"vcvtss2si{q}\t{$src, $dst|$dst, $src}",
                (VCVTSS2SI64rm_Int GR64:$dst, ssmem:$src), 0, "att">;
def : InstAlias<"vcvtsd2si{q}\t{$src, $dst|$dst, $src}",
                (VCVTSD2SI64rr_Int GR64:$dst, VR128:$src), 0, "att">;
def : InstAlias<"vcvtsd2si{q}\t{$src, $dst|$dst, $src}",
                (VCVTSD2SI64rm_Int GR64:$dst, sdmem:$src), 0, "att">;
}

def : InstAlias<"cvtss2si{l}\t{$src, $dst|$dst, $src}",
                (CVTSS2SIrr_Int GR32:$dst, VR128:$src), 0, "att">;
def : InstAlias<"cvtss2si{l}\t{$src, $dst|$dst, $src}",
                (CVTSS2SIrm_Int GR32:$dst, ssmem:$src), 0, "att">;
def : InstAlias<"cvtsd2si{l}\t{$src, $dst|$dst, $src}",
                (CVTSD2SIrr_Int GR32:$dst, VR128:$src), 0, "att">;
def : InstAlias<"cvtsd2si{l}\t{$src, $dst|$dst, $src}",
                (CVTSD2SIrm_Int GR32:$dst, sdmem:$src), 0, "att">;
def : InstAlias<"cvtss2si{q}\t{$src, $dst|$dst, $src}",
                (CVTSS2SI64rr_Int GR64:$dst, VR128:$src), 0, "att">;
def : InstAlias<"cvtss2si{q}\t{$src, $dst|$dst, $src}",
                (CVTSS2SI64rm_Int GR64:$dst, ssmem:$src), 0, "att">;
def : InstAlias<"cvtsd2si{q}\t{$src, $dst|$dst, $src}",
                (CVTSD2SI64rr_Int GR64:$dst, VR128:$src), 0, "att">;
def : InstAlias<"cvtsd2si{q}\t{$src, $dst|$dst, $src}",
                (CVTSD2SI64rm_Int GR64:$dst, sdmem:$src), 0, "att">;
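// The aliases above are AT&T-syntax only ("att"): they let the parser accept
// the suffixed cvtss2si{l,q}/cvtsd2si{l,q} spellings, mapping each onto the
// GR32 or GR64 intrinsic form as the suffix dictates.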

/// SSE 2 Only

// Convert scalar double to scalar single
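// The AVX encodings take an extra source operand ($src1) purely to supply
// the upper destination bits that the conversion does not write, which is
// why they are three-operand while the SSE forms are destructive.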
let hasSideEffects = 0, Predicates = [UseAVX] in {
def VCVTSD2SSrr  : VSDI<0x5A, MRMSrcReg, (outs FR32:$dst),
                        (ins FR32:$src1, FR64:$src2),
                        "cvtsd2ss\t{$src2, $src1, $dst|$dst, $src1, $src2}", []>,
                        VEX_4V, VEX_LIG, VEX_WIG,
                        Sched<[WriteCvtSD2SS]>;
let mayLoad = 1 in
def VCVTSD2SSrm  : I<0x5A, MRMSrcMem, (outs FR32:$dst),
                     (ins FR32:$src1, f64mem:$src2),
                     "vcvtsd2ss\t{$src2, $src1, $dst|$dst, $src1, $src2}", []>,
                     XD, VEX_4V, VEX_LIG, VEX_WIG,
                     Sched<[WriteCvtSD2SS.Folded, ReadAfterLd]>;
}

def : Pat<(f32 (fpround FR64:$src)),
            (VCVTSD2SSrr (f32 (IMPLICIT_DEF)), FR64:$src)>,
          Requires<[UseAVX]>;

def CVTSD2SSrr  : SDI<0x5A, MRMSrcReg, (outs FR32:$dst), (ins FR64:$src),
                      "cvtsd2ss\t{$src, $dst|$dst, $src}",
                      [(set FR32:$dst, (fpround FR64:$src))]>,
                      Sched<[WriteCvtSD2SS]>;
def CVTSD2SSrm  : I<0x5A, MRMSrcMem, (outs FR32:$dst), (ins f64mem:$src),
                    "cvtsd2ss\t{$src, $dst|$dst, $src}",
                    [(set FR32:$dst, (fpround (loadf64 addr:$src)))]>,
                    XD, Requires<[UseSSE2, OptForSize]>,
                    Sched<[WriteCvtSD2SS.Folded]>;

let isCodeGenOnly = 1 in {
def VCVTSD2SSrr_Int: I<0x5A, MRMSrcReg,
                       (outs VR128:$dst), (ins VR128:$src1, VR128:$src2),
                       "vcvtsd2ss\t{$src2, $src1, $dst|$dst, $src1, $src2}",
                       [(set VR128:$dst,
                         (int_x86_sse2_cvtsd2ss VR128:$src1, VR128:$src2))]>,
                       XD, VEX_4V, VEX_WIG, Requires<[HasAVX]>,
                       Sched<[WriteCvtSD2SS]>;
def VCVTSD2SSrm_Int: I<0x5A, MRMSrcMem,
                       (outs VR128:$dst), (ins VR128:$src1, sdmem:$src2),
                       "vcvtsd2ss\t{$src2, $src1, $dst|$dst, $src1, $src2}",
                       [(set VR128:$dst, (int_x86_sse2_cvtsd2ss
                                          VR128:$src1, sse_load_f64:$src2))]>,
                       XD, VEX_4V, VEX_WIG, Requires<[HasAVX]>,
                       Sched<[WriteCvtSD2SS.Folded, ReadAfterLd]>;
let Constraints = "$src1 = $dst" in {
def CVTSD2SSrr_Int: I<0x5A, MRMSrcReg,
                       (outs VR128:$dst), (ins VR128:$src1, VR128:$src2),
                       "cvtsd2ss\t{$src2, $dst|$dst, $src2}",
                       [(set VR128:$dst,
                         (int_x86_sse2_cvtsd2ss VR128:$src1, VR128:$src2))]>,
                       XD, Requires<[UseSSE2]>, Sched<[WriteCvtSD2SS]>;
def CVTSD2SSrm_Int: I<0x5A, MRMSrcMem,
                       (outs VR128:$dst), (ins VR128:$src1, sdmem:$src2),
                       "cvtsd2ss\t{$src2, $dst|$dst, $src2}",
                       [(set VR128:$dst, (int_x86_sse2_cvtsd2ss
                                          VR128:$src1, sse_load_f64:$src2))]>,
                       XD, Requires<[UseSSE2]>,
                       Sched<[WriteCvtSD2SS.Folded, ReadAfterLd]>;
}
} // isCodeGenOnly = 1

// Convert scalar single to scalar double
// SSE2 instructions with XS prefix
let hasSideEffects = 0 in {
def VCVTSS2SDrr : I<0x5A, MRMSrcReg, (outs FR64:$dst),
                    (ins FR64:$src1, FR32:$src2),
                    "vcvtss2sd\t{$src2, $src1, $dst|$dst, $src1, $src2}", []>,
                    XS, VEX_4V, VEX_LIG, VEX_WIG,
                    Sched<[WriteCvtSS2SD]>, Requires<[UseAVX]>;
let mayLoad = 1 in
def VCVTSS2SDrm : I<0x5A, MRMSrcMem, (outs FR64:$dst),
                    (ins FR64:$src1, f32mem:$src2),
                    "vcvtss2sd\t{$src2, $src1, $dst|$dst, $src1, $src2}", []>,
                    XS, VEX_4V, VEX_LIG, VEX_WIG,
                    Sched<[WriteCvtSS2SD.Folded, ReadAfterLd]>,
                    Requires<[UseAVX, OptForSize]>;
}

def : Pat<(f64 (fpextend FR32:$src)),
    (VCVTSS2SDrr (f64 (IMPLICIT_DEF)), FR32:$src)>, Requires<[UseAVX]>;
def : Pat<(fpextend (loadf32 addr:$src)),
    (VCVTSS2SDrm (f64 (IMPLICIT_DEF)), addr:$src)>, Requires<[UseAVX, OptForSize]>;

def : Pat<(extloadf32 addr:$src),
    (VCVTSS2SDrm (f64 (IMPLICIT_DEF)), addr:$src)>,
    Requires<[UseAVX, OptForSize]>;
def : Pat<(extloadf32 addr:$src),
    (VCVTSS2SDrr (f64 (IMPLICIT_DEF)), (VMOVSSrm addr:$src))>,
    Requires<[UseAVX, OptForSpeed]>;

def CVTSS2SDrr : I<0x5A, MRMSrcReg, (outs FR64:$dst), (ins FR32:$src),
                   "cvtss2sd\t{$src, $dst|$dst, $src}",
                   [(set FR64:$dst, (fpextend FR32:$src))]>,
                   XS, Requires<[UseSSE2]>, Sched<[WriteCvtSS2SD]>;
def CVTSS2SDrm : I<0x5A, MRMSrcMem, (outs FR64:$dst), (ins f32mem:$src),
                   "cvtss2sd\t{$src, $dst|$dst, $src}",
                   [(set FR64:$dst, (extloadf32 addr:$src))]>,
                   XS, Requires<[UseSSE2, OptForSize]>,
                   Sched<[WriteCvtSS2SD.Folded]>;

// extload f32 -> f64. This matches load+fpextend because we have a hack in
// the isel (PreprocessForFPConvert) that can introduce loads after dag
// combine. Since these loads aren't folded into the fpextend, we have to
// match it explicitly here.
def : Pat<(fpextend (loadf32 addr:$src)),
          (CVTSS2SDrm addr:$src)>, Requires<[UseSSE2, OptForSize]>;
def : Pat<(extloadf32 addr:$src),
          (CVTSS2SDrr (MOVSSrm addr:$src))>, Requires<[UseSSE2, OptForSpeed]>;
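// Under OptForSpeed the load goes through MOVSSrm first: MOVSSrm zeroes the
// upper bits of its destination, so the subsequent reg-reg convert starts
// from a fully-defined register rather than merging into a stale one (the
// usual partial-register-update concern with the folded rm form).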

let isCodeGenOnly = 1, hasSideEffects = 0 in {
def VCVTSS2SDrr_Int: I<0x5A, MRMSrcReg,
                      (outs VR128:$dst), (ins VR128:$src1, VR128:$src2),
                    "vcvtss2sd\t{$src2, $src1, $dst|$dst, $src1, $src2}",
                    []>, XS, VEX_4V, VEX_WIG,
                    Requires<[HasAVX]>, Sched<[WriteCvtSS2SD]>;
let mayLoad = 1 in
def VCVTSS2SDrm_Int: I<0x5A, MRMSrcMem,
                      (outs VR128:$dst), (ins VR128:$src1, ssmem:$src2),
                    "vcvtss2sd\t{$src2, $src1, $dst|$dst, $src1, $src2}",
                    []>, XS, VEX_4V, VEX_WIG, Requires<[HasAVX]>,
                    Sched<[WriteCvtSS2SD.Folded, ReadAfterLd]>;
let Constraints = "$src1 = $dst" in { // SSE2 instructions with XS prefix
def CVTSS2SDrr_Int: I<0x5A, MRMSrcReg,
                      (outs VR128:$dst), (ins VR128:$src1, VR128:$src2),
                    "cvtss2sd\t{$src2, $dst|$dst, $src2}",
                    []>, XS, Requires<[UseSSE2]>,
                    Sched<[WriteCvtSS2SD]>;
let mayLoad = 1 in
def CVTSS2SDrm_Int: I<0x5A, MRMSrcMem,
                      (outs VR128:$dst), (ins VR128:$src1, ssmem:$src2),
                    "cvtss2sd\t{$src2, $dst|$dst, $src2}",
                    []>, XS, Requires<[UseSSE2]>,
                    Sched<[WriteCvtSS2SD.Folded, ReadAfterLd]>;
}
} // isCodeGenOnly = 1

// Patterns used to match the (v)cvtsi2ss, (v)cvtsi2sd, (v)cvtsd2ss and
// (v)cvtss2sd intrinsic sequences from clang, which would otherwise produce
// unnecessary vmovs{s,d} instructions.
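// For example, clang expands _mm_cvtsd_ss(a, b) into a scalar convert whose
// result is reinserted into element 0 of a via a movss-style shuffle; the
// first pattern below recognizes that whole dag and emits a single
// (V)CVTSD2SSrr_Int with no separate move.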
let Predicates = [UseAVX] in {
def : Pat<(v4f32 (X86Movss
                   (v4f32 VR128:$dst),
                   (v4f32 (scalar_to_vector
                     (f32 (fpround (f64 (extractelt VR128:$src, (iPTR 0))))))))),
          (VCVTSD2SSrr_Int VR128:$dst, VR128:$src)>;

def : Pat<(v2f64 (X86Movsd
                   (v2f64 VR128:$dst),
                   (v2f64 (scalar_to_vector
                     (f64 (fpextend (f32 (extractelt VR128:$src, (iPTR 0))))))))),
          (VCVTSS2SDrr_Int VR128:$dst, VR128:$src)>;

def : Pat<(v4f32 (X86Movss
                   (v4f32 VR128:$dst),
                   (v4f32 (scalar_to_vector (f32 (sint_to_fp GR64:$src)))))),
          (VCVTSI642SSrr_Int VR128:$dst, GR64:$src)>;

def : Pat<(v4f32 (X86Movss
                   (v4f32 VR128:$dst),
                   (v4f32 (scalar_to_vector (f32 (sint_to_fp (loadi64 addr:$src))))))),
          (VCVTSI642SSrm_Int VR128:$dst, addr:$src)>;

def : Pat<(v4f32 (X86Movss
                   (v4f32 VR128:$dst),
                   (v4f32 (scalar_to_vector (f32 (sint_to_fp GR32:$src)))))),
          (VCVTSI2SSrr_Int VR128:$dst, GR32:$src)>;

def : Pat<(v4f32 (X86Movss
                   (v4f32 VR128:$dst),
                   (v4f32 (scalar_to_vector (f32 (sint_to_fp (loadi32 addr:$src))))))),
          (VCVTSI2SSrm_Int VR128:$dst, addr:$src)>;

def : Pat<(v2f64 (X86Movsd
                   (v2f64 VR128:$dst),
                   (v2f64 (scalar_to_vector (f64 (sint_to_fp GR64:$src)))))),
          (VCVTSI642SDrr_Int VR128:$dst, GR64:$src)>;

def : Pat<(v2f64 (X86Movsd
                   (v2f64 VR128:$dst),
                   (v2f64 (scalar_to_vector (f64 (sint_to_fp (loadi64 addr:$src))))))),
          (VCVTSI642SDrm_Int VR128:$dst, addr:$src)>;

def : Pat<(v2f64 (X86Movsd
                   (v2f64 VR128:$dst),
                   (v2f64 (scalar_to_vector (f64 (sint_to_fp GR32:$src)))))),
          (VCVTSI2SDrr_Int VR128:$dst, GR32:$src)>;

def : Pat<(v2f64 (X86Movsd
                   (v2f64 VR128:$dst),
                   (v2f64 (scalar_to_vector (f64 (sint_to_fp (loadi32 addr:$src))))))),
          (VCVTSI2SDrm_Int VR128:$dst, addr:$src)>;
} // Predicates = [UseAVX]

let Predicates = [UseSSE2] in {
def : Pat<(v4f32 (X86Movss
                   (v4f32 VR128:$dst),
                   (v4f32 (scalar_to_vector
                     (f32 (fpround (f64 (extractelt VR128:$src, (iPTR 0))))))))),
          (CVTSD2SSrr_Int VR128:$dst, VR128:$src)>;

def : Pat<(v2f64 (X86Movsd
                   (v2f64 VR128:$dst),
                   (v2f64 (scalar_to_vector
                     (f64 (fpextend (f32 (extractelt VR128:$src, (iPTR 0))))))))),
          (CVTSS2SDrr_Int VR128:$dst, VR128:$src)>;

def : Pat<(v2f64 (X86Movsd
                   (v2f64 VR128:$dst),
                   (v2f64 (scalar_to_vector (f64 (sint_to_fp GR64:$src)))))),
          (CVTSI642SDrr_Int VR128:$dst, GR64:$src)>;

def : Pat<(v2f64 (X86Movsd
                   (v2f64 VR128:$dst),
                   (v2f64 (scalar_to_vector (f64 (sint_to_fp (loadi64 addr:$src))))))),
          (CVTSI642SDrm_Int VR128:$dst, addr:$src)>;

def : Pat<(v2f64 (X86Movsd
                   (v2f64 VR128:$dst),
                   (v2f64 (scalar_to_vector (f64 (sint_to_fp GR32:$src)))))),
          (CVTSI2SDrr_Int VR128:$dst, GR32:$src)>;

def : Pat<(v2f64 (X86Movsd
                   (v2f64 VR128:$dst),
                   (v2f64 (scalar_to_vector (f64 (sint_to_fp (loadi32 addr:$src))))))),
          (CVTSI2SDrm_Int VR128:$dst, addr:$src)>;
} // Predicates = [UseSSE2]

let Predicates = [UseSSE1] in {
def : Pat<(v4f32 (X86Movss
                   (v4f32 VR128:$dst),
                   (v4f32 (scalar_to_vector (f32 (sint_to_fp GR64:$src)))))),
          (CVTSI642SSrr_Int VR128:$dst, GR64:$src)>;

def : Pat<(v4f32 (X86Movss
                   (v4f32 VR128:$dst),
                   (v4f32 (scalar_to_vector (f32 (sint_to_fp (loadi64 addr:$src))))))),
          (CVTSI642SSrm_Int VR128:$dst, addr:$src)>;

def : Pat<(v4f32 (X86Movss
                   (v4f32 VR128:$dst),
                   (v4f32 (scalar_to_vector (f32 (sint_to_fp GR32:$src)))))),
          (CVTSI2SSrr_Int VR128:$dst, GR32:$src)>;

def : Pat<(v4f32 (X86Movss
                   (v4f32 VR128:$dst),
                   (v4f32 (scalar_to_vector (f32 (sint_to_fp (loadi32 addr:$src))))))),
          (CVTSI2SSrm_Int VR128:$dst, addr:$src)>;
} // Predicates = [UseSSE1]

// Convert packed single fp to doubleword
let Predicates = [HasAVX, NoVLX] in {
def VCVTPS2DQrr : VPDI<0x5B, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
                       "cvtps2dq\t{$src, $dst|$dst, $src}",
                       [(set VR128:$dst, (v4i32 (X86cvtp2Int (v4f32 VR128:$src))))]>,
                       VEX, Sched<[WriteCvtPS2I]>, VEX_WIG;
def VCVTPS2DQrm : VPDI<0x5B, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
                       "cvtps2dq\t{$src, $dst|$dst, $src}",
                       [(set VR128:$dst,
                         (v4i32 (X86cvtp2Int (loadv4f32 addr:$src))))]>,
                       VEX, Sched<[WriteCvtPS2ILd]>, VEX_WIG;
def VCVTPS2DQYrr : VPDI<0x5B, MRMSrcReg, (outs VR256:$dst), (ins VR256:$src),
                        "cvtps2dq\t{$src, $dst|$dst, $src}",
                        [(set VR256:$dst,
                          (v8i32 (X86cvtp2Int (v8f32 VR256:$src))))]>,
                        VEX, VEX_L, Sched<[WriteCvtPS2IY]>, VEX_WIG;
def VCVTPS2DQYrm : VPDI<0x5B, MRMSrcMem, (outs VR256:$dst), (ins f256mem:$src),
                        "cvtps2dq\t{$src, $dst|$dst, $src}",
                        [(set VR256:$dst,
                          (v8i32 (X86cvtp2Int (loadv8f32 addr:$src))))]>,
                        VEX, VEX_L, Sched<[WriteCvtPS2IYLd]>, VEX_WIG;
}
def CVTPS2DQrr : PDI<0x5B, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
                     "cvtps2dq\t{$src, $dst|$dst, $src}",
                     [(set VR128:$dst, (v4i32 (X86cvtp2Int (v4f32 VR128:$src))))]>,
                     Sched<[WriteCvtPS2I]>;
def CVTPS2DQrm : PDI<0x5B, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
                     "cvtps2dq\t{$src, $dst|$dst, $src}",
                     [(set VR128:$dst,
                       (v4i32 (X86cvtp2Int (memopv4f32 addr:$src))))]>,
                     Sched<[WriteCvtPS2ILd]>;

// Convert Packed Double FP to Packed DW Integers
let Predicates = [HasAVX, NoVLX] in {
// The assembler can recognize rr 256-bit instructions by seeing a ymm
// register, but the same isn't true when using memory operands instead.
// Provide other assembly rr and rm forms to address this explicitly.
def VCVTPD2DQrr  : SDI<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
                       "vcvtpd2dq\t{$src, $dst|$dst, $src}",
                       [(set VR128:$dst,
                         (v4i32 (X86cvtp2Int (v2f64 VR128:$src))))]>,
                       VEX, Sched<[WriteCvtPD2I]>, VEX_WIG;

// XMM only
def : InstAlias<"vcvtpd2dqx\t{$src, $dst|$dst, $src}",
                (VCVTPD2DQrr VR128:$dst, VR128:$src), 0>;
def VCVTPD2DQrm : SDI<0xE6, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
                      "vcvtpd2dq{x}\t{$src, $dst|$dst, $src}",
                      [(set VR128:$dst,
                        (v4i32 (X86cvtp2Int (loadv2f64 addr:$src))))]>, VEX,
                      Sched<[WriteCvtPD2ILd]>, VEX_WIG;
def : InstAlias<"vcvtpd2dqx\t{$src, $dst|$dst, $src}",
                (VCVTPD2DQrm VR128:$dst, f128mem:$src), 0, "intel">;

// YMM only
def VCVTPD2DQYrr : SDI<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR256:$src),
                       "vcvtpd2dq\t{$src, $dst|$dst, $src}",
                       [(set VR128:$dst,
                         (v4i32 (X86cvtp2Int (v4f64 VR256:$src))))]>,
                       VEX, VEX_L, Sched<[WriteCvtPD2IY]>, VEX_WIG;
def VCVTPD2DQYrm : SDI<0xE6, MRMSrcMem, (outs VR128:$dst), (ins f256mem:$src),
                       "vcvtpd2dq{y}\t{$src, $dst|$dst, $src}",
                       [(set VR128:$dst,
                         (v4i32 (X86cvtp2Int (loadv4f64 addr:$src))))]>,
                       VEX, VEX_L, Sched<[WriteCvtPD2IYLd]>, VEX_WIG;
def : InstAlias<"vcvtpd2dqy\t{$src, $dst|$dst, $src}",
                (VCVTPD2DQYrr VR128:$dst, VR256:$src), 0>;
def : InstAlias<"vcvtpd2dqy\t{$src, $dst|$dst, $src}",
                (VCVTPD2DQYrm VR128:$dst, f256mem:$src), 0, "intel">;
}

def CVTPD2DQrm  : SDI<0xE6, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
                      "cvtpd2dq\t{$src, $dst|$dst, $src}",
                      [(set VR128:$dst,
                        (v4i32 (X86cvtp2Int (memopv2f64 addr:$src))))]>,
                      Sched<[WriteCvtPD2ILd]>;
def CVTPD2DQrr  : SDI<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
                      "cvtpd2dq\t{$src, $dst|$dst, $src}",
                      [(set VR128:$dst,
                        (v4i32 (X86cvtp2Int (v2f64 VR128:$src))))]>,
                      Sched<[WriteCvtPD2I]>;

// Convert with truncation packed single/double fp to doubleword
// SSE2 packed instructions with XS prefix
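// Unlike cvtps2dq above, which rounds according to MXCSR.RC, the cvtt forms
// always truncate toward zero; out-of-range inputs produce the integer
// indefinite value 0x80000000 in both cases.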
let Predicates = [HasAVX, NoVLX] in {
def VCVTTPS2DQrr : VS2SI<0x5B, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
                         "cvttps2dq\t{$src, $dst|$dst, $src}",
                         [(set VR128:$dst,
                           (v4i32 (X86cvttp2si (v4f32 VR128:$src))))]>,
                         VEX, Sched<[WriteCvtPS2I]>, VEX_WIG;
def VCVTTPS2DQrm : VS2SI<0x5B, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
                         "cvttps2dq\t{$src, $dst|$dst, $src}",
                         [(set VR128:$dst,
                           (v4i32 (X86cvttp2si (loadv4f32 addr:$src))))]>,
                         VEX, Sched<[WriteCvtPS2ILd]>, VEX_WIG;
def VCVTTPS2DQYrr : VS2SI<0x5B, MRMSrcReg, (outs VR256:$dst), (ins VR256:$src),
                          "cvttps2dq\t{$src, $dst|$dst, $src}",
                          [(set VR256:$dst,
                            (v8i32 (X86cvttp2si (v8f32 VR256:$src))))]>,
                          VEX, VEX_L, Sched<[WriteCvtPS2IY]>, VEX_WIG;
def VCVTTPS2DQYrm : VS2SI<0x5B, MRMSrcMem, (outs VR256:$dst), (ins f256mem:$src),
                          "cvttps2dq\t{$src, $dst|$dst, $src}",
                          [(set VR256:$dst,
                            (v8i32 (X86cvttp2si (loadv8f32 addr:$src))))]>,
                          VEX, VEX_L,
                          Sched<[WriteCvtPS2IYLd]>, VEX_WIG;
}

let Predicates = [HasAVX, NoVLX] in {
  def : Pat<(v4i32 (fp_to_sint (v4f32 VR128:$src))),
            (VCVTTPS2DQrr VR128:$src)>;
  def : Pat<(v4i32 (fp_to_sint (loadv4f32 addr:$src))),
            (VCVTTPS2DQrm addr:$src)>;
  def : Pat<(v8i32 (fp_to_sint (v8f32 VR256:$src))),
            (VCVTTPS2DQYrr VR256:$src)>;
  def : Pat<(v8i32 (fp_to_sint (loadv8f32 addr:$src))),
            (VCVTTPS2DQYrm addr:$src)>;
}

def CVTTPS2DQrr : S2SI<0x5B, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
                       "cvttps2dq\t{$src, $dst|$dst, $src}",
                       [(set VR128:$dst,
                         (v4i32 (X86cvttp2si (v4f32 VR128:$src))))]>,
                       Sched<[WriteCvtPS2I]>;
def CVTTPS2DQrm : S2SI<0x5B, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
                       "cvttps2dq\t{$src, $dst|$dst, $src}",
                       [(set VR128:$dst,
                         (v4i32 (X86cvttp2si (memopv4f32 addr:$src))))]>,
                       Sched<[WriteCvtPS2ILd]>;

let Predicates = [UseSSE2] in {
  def : Pat<(v4i32 (fp_to_sint (v4f32 VR128:$src))),
            (CVTTPS2DQrr VR128:$src)>;
  def : Pat<(v4i32 (fp_to_sint (memopv4f32 addr:$src))),
            (CVTTPS2DQrm addr:$src)>;
}

let Predicates = [HasAVX, NoVLX] in
def VCVTTPD2DQrr : VPDI<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
                        "cvttpd2dq\t{$src, $dst|$dst, $src}",
                        [(set VR128:$dst,
                          (v4i32 (X86cvttp2si (v2f64 VR128:$src))))]>,
                        VEX, Sched<[WriteCvtPD2I]>, VEX_WIG;

// The assembler can recognize rr 256-bit instructions by seeing a ymm
// register, but the same isn't true when using memory operands instead.
// Provide other assembly rr and rm forms to address this explicitly.

// XMM only
def : InstAlias<"vcvttpd2dqx\t{$src, $dst|$dst, $src}",
                (VCVTTPD2DQrr VR128:$dst, VR128:$src), 0>;

let Predicates = [HasAVX, NoVLX] in
def VCVTTPD2DQrm : VPDI<0xE6, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
                        "cvttpd2dq{x}\t{$src, $dst|$dst, $src}",
                        [(set VR128:$dst,
                          (v4i32 (X86cvttp2si (loadv2f64 addr:$src))))]>,
                        VEX, Sched<[WriteCvtPD2ILd]>, VEX_WIG;
def : InstAlias<"vcvttpd2dqx\t{$src, $dst|$dst, $src}",
                (VCVTTPD2DQrm VR128:$dst, f128mem:$src), 0, "intel">;

// YMM only
let Predicates = [HasAVX, NoVLX] in {
def VCVTTPD2DQYrr : VPDI<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR256:$src),
                         "cvttpd2dq\t{$src, $dst|$dst, $src}",
                         [(set VR128:$dst,
                           (v4i32 (X86cvttp2si (v4f64 VR256:$src))))]>,
                         VEX, VEX_L, Sched<[WriteCvtPD2IY]>, VEX_WIG;
def VCVTTPD2DQYrm : VPDI<0xE6, MRMSrcMem, (outs VR128:$dst), (ins f256mem:$src),
                         "cvttpd2dq{y}\t{$src, $dst|$dst, $src}",
                         [(set VR128:$dst,
                           (v4i32 (X86cvttp2si (loadv4f64 addr:$src))))]>,
                         VEX, VEX_L, Sched<[WriteCvtPD2IYLd]>, VEX_WIG;
}
def : InstAlias<"vcvttpd2dqy\t{$src, $dst|$dst, $src}",
                (VCVTTPD2DQYrr VR128:$dst, VR256:$src), 0>;
def : InstAlias<"vcvttpd2dqy\t{$src, $dst|$dst, $src}",
                (VCVTTPD2DQYrm VR128:$dst, f256mem:$src), 0, "intel">;

let Predicates = [HasAVX, NoVLX] in {
  def : Pat<(v4i32 (fp_to_sint (v4f64 VR256:$src))),
            (VCVTTPD2DQYrr VR256:$src)>;
  def : Pat<(v4i32 (fp_to_sint (loadv4f64 addr:$src))),
            (VCVTTPD2DQYrm addr:$src)>;
}

let Predicates = [HasAVX, NoVLX] in {
  def : Pat<(X86vzmovl (v2i64 (bitconvert
                               (v4i32 (X86cvtp2Int (v2f64 VR128:$src)))))),
            (VCVTPD2DQrr VR128:$src)>;
  def : Pat<(X86vzmovl (v2i64 (bitconvert
                               (v4i32 (X86cvtp2Int (loadv2f64 addr:$src)))))),
            (VCVTPD2DQrm addr:$src)>;
  def : Pat<(X86vzmovl (v2i64 (bitconvert
                               (v4i32 (X86cvttp2si (v2f64 VR128:$src)))))),
            (VCVTTPD2DQrr VR128:$src)>;
  def : Pat<(X86vzmovl (v2i64 (bitconvert
                               (v4i32 (X86cvttp2si (loadv2f64 addr:$src)))))),
            (VCVTTPD2DQrm addr:$src)>;
} // Predicates = [HasAVX, NoVLX]

def CVTTPD2DQrr : PDI<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
                      "cvttpd2dq\t{$src, $dst|$dst, $src}",
                      [(set VR128:$dst,
                        (v4i32 (X86cvttp2si (v2f64 VR128:$src))))]>,
                      Sched<[WriteCvtPD2I]>;
def CVTTPD2DQrm : PDI<0xE6, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
                      "cvttpd2dq\t{$src, $dst|$dst, $src}",
                      [(set VR128:$dst,
                        (v4i32 (X86cvttp2si (memopv2f64 addr:$src))))]>,
                      Sched<[WriteCvtPD2ILd]>;

let Predicates = [UseSSE2] in {
  def : Pat<(X86vzmovl (v2i64 (bitconvert
                               (v4i32 (X86cvtp2Int (v2f64 VR128:$src)))))),
            (CVTPD2DQrr VR128:$src)>;
  def : Pat<(X86vzmovl (v2i64 (bitconvert
                               (v4i32 (X86cvtp2Int (memopv2f64 addr:$src)))))),
            (CVTPD2DQrm addr:$src)>;
  def : Pat<(X86vzmovl (v2i64 (bitconvert
                               (v4i32 (X86cvttp2si (v2f64 VR128:$src)))))),
            (CVTTPD2DQrr VR128:$src)>;
  def : Pat<(X86vzmovl (v2i64 (bitconvert
                               (v4i32 (X86cvttp2si (memopv2f64 addr:$src)))))),
            (CVTTPD2DQrm addr:$src)>;
} // Predicates = [UseSSE2]

// Convert packed single to packed double
let Predicates = [HasAVX, NoVLX] in {
// SSE2 instructions without OpSize prefix
def VCVTPS2PDrr : I<0x5A, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
                    "vcvtps2pd\t{$src, $dst|$dst, $src}",
                    [(set VR128:$dst, (v2f64 (X86vfpext (v4f32 VR128:$src))))]>,
                    PS, VEX, Sched<[WriteCvtPS2PD]>, VEX_WIG;
def VCVTPS2PDrm : I<0x5A, MRMSrcMem, (outs VR128:$dst), (ins f64mem:$src),
                    "vcvtps2pd\t{$src, $dst|$dst, $src}",
                    [(set VR128:$dst, (v2f64 (extloadv2f32 addr:$src)))]>,
                    PS, VEX, Sched<[WriteCvtPS2PD.Folded]>, VEX_WIG;
def VCVTPS2PDYrr : I<0x5A, MRMSrcReg, (outs VR256:$dst), (ins VR128:$src),
                     "vcvtps2pd\t{$src, $dst|$dst, $src}",
                     [(set VR256:$dst, (v4f64 (fpextend (v4f32 VR128:$src))))]>,
                     PS, VEX, VEX_L, Sched<[WriteCvtPS2PDY]>, VEX_WIG;
def VCVTPS2PDYrm : I<0x5A, MRMSrcMem, (outs VR256:$dst), (ins f128mem:$src),
                     "vcvtps2pd\t{$src, $dst|$dst, $src}",
                     [(set VR256:$dst, (v4f64 (extloadv4f32 addr:$src)))]>,
                     PS, VEX, VEX_L, Sched<[WriteCvtPS2PDY.Folded]>, VEX_WIG;
}

let Predicates = [UseSSE2] in {
def CVTPS2PDrr : I<0x5A, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
                   "cvtps2pd\t{$src, $dst|$dst, $src}",
                   [(set VR128:$dst, (v2f64 (X86vfpext (v4f32 VR128:$src))))]>,
                   PS, Sched<[WriteCvtPS2PD]>;
def CVTPS2PDrm : I<0x5A, MRMSrcMem, (outs VR128:$dst), (ins f64mem:$src),
                   "cvtps2pd\t{$src, $dst|$dst, $src}",
                   [(set VR128:$dst, (v2f64 (extloadv2f32 addr:$src)))]>,
                   PS, Sched<[WriteCvtPS2PD.Folded]>;
}

// Convert Packed DW Integers to Packed Double FP
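// The 128-bit memory form only reads 64 bits (two doublewords), hence the
// i64mem operand; the patterns further below also fold the common 64-bit
// scalar_to_vector and zero-extending load idioms into it.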
let Predicates = [HasAVX, NoVLX] in {
let hasSideEffects = 0, mayLoad = 1 in
def VCVTDQ2PDrm  : S2SI<0xE6, MRMSrcMem, (outs VR128:$dst), (ins i64mem:$src),
                        "vcvtdq2pd\t{$src, $dst|$dst, $src}",
                        [(set VR128:$dst,
                          (v2f64 (X86VSintToFP (bc_v4i32 (loadv2i64 addr:$src)))))]>,
                        VEX, Sched<[WriteCvtI2PDLd]>, VEX_WIG;
def VCVTDQ2PDrr  : S2SI<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
                        "vcvtdq2pd\t{$src, $dst|$dst, $src}",
                        [(set VR128:$dst,
                          (v2f64 (X86VSintToFP (v4i32 VR128:$src))))]>,
                        VEX, Sched<[WriteCvtI2PD]>, VEX_WIG;
def VCVTDQ2PDYrm  : S2SI<0xE6, MRMSrcMem, (outs VR256:$dst), (ins i128mem:$src),
                         "vcvtdq2pd\t{$src, $dst|$dst, $src}",
                         [(set VR256:$dst,
                           (v4f64 (sint_to_fp (bc_v4i32 (loadv2i64 addr:$src)))))]>,
                         VEX, VEX_L, Sched<[WriteCvtI2PDYLd]>,
                         VEX_WIG;
def VCVTDQ2PDYrr  : S2SI<0xE6, MRMSrcReg, (outs VR256:$dst), (ins VR128:$src),
                         "vcvtdq2pd\t{$src, $dst|$dst, $src}",
                         [(set VR256:$dst,
                           (v4f64 (sint_to_fp (v4i32 VR128:$src))))]>,
                         VEX, VEX_L, Sched<[WriteCvtI2PDY]>, VEX_WIG;
}

let hasSideEffects = 0, mayLoad = 1 in
def CVTDQ2PDrm  : S2SI<0xE6, MRMSrcMem, (outs VR128:$dst), (ins i64mem:$src),
                       "cvtdq2pd\t{$src, $dst|$dst, $src}",
                       [(set VR128:$dst,
                         (v2f64 (X86VSintToFP (bc_v4i32 (loadv2i64 addr:$src)))))]>,
                       Sched<[WriteCvtI2PDLd]>;
def CVTDQ2PDrr  : S2SI<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
                       "cvtdq2pd\t{$src, $dst|$dst, $src}",
                       [(set VR128:$dst,
                         (v2f64 (X86VSintToFP (v4i32 VR128:$src))))]>,
                       Sched<[WriteCvtI2PD]>;

// AVX register conversion load patterns
let Predicates = [HasAVX, NoVLX] in {
  def : Pat<(v2f64 (X86VSintToFP (bc_v4i32 (v2i64 (scalar_to_vector (loadi64 addr:$src)))))),
            (VCVTDQ2PDrm addr:$src)>;
  def : Pat<(v2f64 (X86VSintToFP (bc_v4i32 (v2i64 (X86vzload addr:$src))))),
            (VCVTDQ2PDrm addr:$src)>;
} // Predicates = [HasAVX, NoVLX]

// SSE2 register conversion load patterns
let Predicates = [UseSSE2] in {
  def : Pat<(v2f64 (X86VSintToFP (bc_v4i32 (v2i64 (scalar_to_vector (loadi64 addr:$src)))))),
            (CVTDQ2PDrm addr:$src)>;
  def : Pat<(v2f64 (X86VSintToFP (bc_v4i32 (v2i64 (X86vzload addr:$src))))),
            (CVTDQ2PDrm addr:$src)>;
} // Predicates = [UseSSE2]

// Convert packed double to packed single
// The assembler can recognize rr 256-bit instructions by seeing a ymm
// register, but the same isn't true when using memory operands instead.
// Provide other assembly rr and rm forms to address this explicitly.
let Predicates = [HasAVX, NoVLX] in
def VCVTPD2PSrr : VPDI<0x5A, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
                       "cvtpd2ps\t{$src, $dst|$dst, $src}",
                       [(set VR128:$dst, (X86vfpround (v2f64 VR128:$src)))]>,
                       VEX, Sched<[WriteCvtPD2PS]>, VEX_WIG;

// XMM only
def : InstAlias<"vcvtpd2psx\t{$src, $dst|$dst, $src}",
                (VCVTPD2PSrr VR128:$dst, VR128:$src), 0>;
let Predicates = [HasAVX, NoVLX] in
def VCVTPD2PSrm : VPDI<0x5A, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
                       "cvtpd2ps{x}\t{$src, $dst|$dst, $src}",
                       [(set VR128:$dst, (X86vfpround (loadv2f64 addr:$src)))]>,
                       VEX, Sched<[WriteCvtPD2PS.Folded]>, VEX_WIG;
def : InstAlias<"vcvtpd2psx\t{$src, $dst|$dst, $src}",
                (VCVTPD2PSrm VR128:$dst, f128mem:$src), 0, "intel">;

// YMM only
let Predicates = [HasAVX, NoVLX] in {
def VCVTPD2PSYrr : VPDI<0x5A, MRMSrcReg, (outs VR128:$dst), (ins VR256:$src),
                        "cvtpd2ps\t{$src, $dst|$dst, $src}",
                        [(set VR128:$dst, (fpround VR256:$src))]>,
                        VEX, VEX_L, Sched<[WriteCvtPD2PSY]>, VEX_WIG;
def VCVTPD2PSYrm : VPDI<0x5A, MRMSrcMem, (outs VR128:$dst), (ins f256mem:$src),
                        "cvtpd2ps{y}\t{$src, $dst|$dst, $src}",
                        [(set VR128:$dst, (fpround (loadv4f64 addr:$src)))]>,
                        VEX, VEX_L, Sched<[WriteCvtPD2PSY.Folded]>, VEX_WIG;
}
def : InstAlias<"vcvtpd2psy\t{$src, $dst|$dst, $src}",
                (VCVTPD2PSYrr VR128:$dst, VR256:$src), 0>;
def : InstAlias<"vcvtpd2psy\t{$src, $dst|$dst, $src}",
                (VCVTPD2PSYrm VR128:$dst, f256mem:$src), 0, "intel">;

def CVTPD2PSrr : PDI<0x5A, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
                     "cvtpd2ps\t{$src, $dst|$dst, $src}",
                     [(set VR128:$dst, (X86vfpround (v2f64 VR128:$src)))]>,
                     Sched<[WriteCvtPD2PS]>;
def CVTPD2PSrm : PDI<0x5A, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
                     "cvtpd2ps\t{$src, $dst|$dst, $src}",
                     [(set VR128:$dst, (X86vfpround (memopv2f64 addr:$src)))]>,
                     Sched<[WriteCvtPD2PS.Folded]>;

// AVX zero-extending cvtpd2ps patterns
// FIXME: Migrate SSE conversion intrinsics matching to use patterns as below
// whenever possible to avoid declaring two versions of each one.

let Predicates = [HasAVX, NoVLX] in {
  // Match the zero-extended (X86vzmovl) result of a 128-bit cvtpd2ps.
  def : Pat<(X86vzmovl (v2f64 (bitconvert
                               (v4f32 (X86vfpround (v2f64 VR128:$src)))))),
            (VCVTPD2PSrr VR128:$src)>;
  def : Pat<(X86vzmovl (v2f64 (bitconvert
                               (v4f32 (X86vfpround (loadv2f64 addr:$src)))))),
            (VCVTPD2PSrm addr:$src)>;
}

let Predicates = [UseSSE2] in {
  // Match the zero-extended (X86vzmovl) result of a 128-bit cvtpd2ps.
  def : Pat<(X86vzmovl (v2f64 (bitconvert
                               (v4f32 (X86vfpround (v2f64 VR128:$src)))))),
            (CVTPD2PSrr VR128:$src)>;
  def : Pat<(X86vzmovl (v2f64 (bitconvert
                               (v4f32 (X86vfpround (memopv2f64 addr:$src)))))),
            (CVTPD2PSrm addr:$src)>;
}

//===----------------------------------------------------------------------===//
// SSE 1 & 2 - Compare Instructions
//===----------------------------------------------------------------------===//

// sse12_cmp_scalar - sse 1 & 2 compare scalar instructions
multiclass sse12_cmp_scalar<RegisterClass RC, X86MemOperand x86memop,
                            Operand CC, SDNode OpNode, ValueType VT,
                            PatFrag ld_frag, string asm, string asm_alt,
                            X86FoldableSchedWrite sched> {
  let isCommutable = 1 in
  def rr : SIi8<0xC2, MRMSrcReg,
                (outs RC:$dst), (ins RC:$src1, RC:$src2, CC:$cc), asm,
                [(set RC:$dst, (OpNode (VT RC:$src1), RC:$src2, imm:$cc))]>,
                Sched<[sched]>;
  def rm : SIi8<0xC2, MRMSrcMem,
                (outs RC:$dst), (ins RC:$src1, x86memop:$src2, CC:$cc), asm,
                [(set RC:$dst, (OpNode (VT RC:$src1),
                                         (ld_frag addr:$src2), imm:$cc))]>,
                Sched<[sched.Folded, ReadAfterLd]>;

  // Accept explicit immediate argument form instead of comparison code.
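  // For example, "cmpss $3, %xmm1, %xmm0" assembles to the same encoding as
  // "cmpunordss %xmm1, %xmm0"; these *_alt defs exist purely so the parser
  // accepts the raw-immediate spelling.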
  let isAsmParserOnly = 1, hasSideEffects = 0 in {
    def rr_alt : SIi8<0xC2, MRMSrcReg, (outs RC:$dst),
                      (ins RC:$src1, RC:$src2, u8imm:$cc), asm_alt, []>,
                      Sched<[sched]>, NotMemoryFoldable;
    let mayLoad = 1 in
    def rm_alt : SIi8<0xC2, MRMSrcMem, (outs RC:$dst),
                      (ins RC:$src1, x86memop:$src2, u8imm:$cc), asm_alt, []>,
                      Sched<[sched.Folded, ReadAfterLd]>, NotMemoryFoldable;
  }
}

let ExeDomain = SSEPackedSingle in
defm VCMPSS : sse12_cmp_scalar<FR32, f32mem, AVXCC, X86cmps, f32, loadf32,
                 "cmp${cc}ss\t{$src2, $src1, $dst|$dst, $src1, $src2}",
                 "cmpss\t{$cc, $src2, $src1, $dst|$dst, $src1, $src2, $cc}",
                 SchedWriteFCmpSizes.PS.Scl>, XS, VEX_4V, VEX_LIG, VEX_WIG;
let ExeDomain = SSEPackedDouble in
defm VCMPSD : sse12_cmp_scalar<FR64, f64mem, AVXCC, X86cmps, f64, loadf64,
                 "cmp${cc}sd\t{$src2, $src1, $dst|$dst, $src1, $src2}",
                 "cmpsd\t{$cc, $src2, $src1, $dst|$dst, $src1, $src2, $cc}",
                 SchedWriteFCmpSizes.PD.Scl>,
                 XD, VEX_4V, VEX_LIG, VEX_WIG;

let Constraints = "$src1 = $dst" in {
  let ExeDomain = SSEPackedSingle in
  defm CMPSS : sse12_cmp_scalar<FR32, f32mem, SSECC, X86cmps, f32, loadf32,
                  "cmp${cc}ss\t{$src2, $dst|$dst, $src2}",
                  "cmpss\t{$cc, $src2, $dst|$dst, $src2, $cc}",
                  SchedWriteFCmpSizes.PS.Scl>, XS;
  let ExeDomain = SSEPackedDouble in
  defm CMPSD : sse12_cmp_scalar<FR64, f64mem, SSECC, X86cmps, f64, loadf64,
                  "cmp${cc}sd\t{$src2, $dst|$dst, $src2}",
                  "cmpsd\t{$cc, $src2, $dst|$dst, $src2, $cc}",
                  SchedWriteFCmpSizes.PD.Scl>, XD;
}

multiclass sse12_cmp_scalar_int<Operand memop, Operand CC,
                         Intrinsic Int, string asm, X86FoldableSchedWrite sched,
                         ComplexPattern mem_cpat> {
  def rr_Int : SIi8<0xC2, MRMSrcReg, (outs VR128:$dst),
                      (ins VR128:$src1, VR128:$src, CC:$cc), asm,
                        [(set VR128:$dst, (Int VR128:$src1,
                                               VR128:$src, imm:$cc))]>,
           Sched<[sched]>;
let mayLoad = 1 in
  def rm_Int : SIi8<0xC2, MRMSrcMem, (outs VR128:$dst),
                      (ins VR128:$src1, memop:$src, CC:$cc), asm,
                        [(set VR128:$dst, (Int VR128:$src1,
                                               mem_cpat:$src, imm:$cc))]>,
           Sched<[sched.Folded, ReadAfterLd]>;
}

let isCodeGenOnly = 1 in {
  // Aliases to match intrinsics which expect XMM operand(s).
  let ExeDomain = SSEPackedSingle in
  defm VCMPSS  : sse12_cmp_scalar_int<ssmem, AVXCC, int_x86_sse_cmp_ss,
                       "cmp${cc}ss\t{$src, $src1, $dst|$dst, $src1, $src}",
                       SchedWriteFCmpSizes.PS.Scl, sse_load_f32>, XS, VEX_4V;
  let ExeDomain = SSEPackedDouble in
  defm VCMPSD  : sse12_cmp_scalar_int<sdmem, AVXCC, int_x86_sse2_cmp_sd,
                       "cmp${cc}sd\t{$src, $src1, $dst|$dst, $src1, $src}",
                       SchedWriteFCmpSizes.PD.Scl, sse_load_f64>,
                       XD, VEX_4V;
  let Constraints = "$src1 = $dst" in {
    let ExeDomain = SSEPackedSingle in
    defm CMPSS  : sse12_cmp_scalar_int<ssmem, SSECC, int_x86_sse_cmp_ss,
                         "cmp${cc}ss\t{$src, $dst|$dst, $src}",
                         SchedWriteFCmpSizes.PS.Scl, sse_load_f32>, XS;
    let ExeDomain = SSEPackedDouble in
    defm CMPSD  : sse12_cmp_scalar_int<sdmem, SSECC, int_x86_sse2_cmp_sd,
                         "cmp${cc}sd\t{$src, $dst|$dst, $src}",
                         SchedWriteFCmpSizes.PD.Scl, sse_load_f64>, XD;
  }
}

// sse12_ord_cmp - Unordered/Ordered scalar fp compare and set EFLAGS
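// Both flavors write ZF/PF/CF (unordered sets all three, equal sets ZF,
// less-than sets CF) and clear OF/SF/AF. COMI differs from UCOMI only in
// raising the invalid-operation exception on quiet NaNs as well as
// signaling ones.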
multiclass sse12_ord_cmp<bits<8> opc, RegisterClass RC, SDNode OpNode,
                         ValueType vt, X86MemOperand x86memop,
                         PatFrag ld_frag, string OpcodeStr,
                         X86FoldableSchedWrite sched> {
let hasSideEffects = 0 in {
  def rr: SI<opc, MRMSrcReg, (outs), (ins RC:$src1, RC:$src2),
                     !strconcat(OpcodeStr, "\t{$src2, $src1|$src1, $src2}"),
                     [(set EFLAGS, (OpNode (vt RC:$src1), RC:$src2))]>,
          Sched<[sched]>;
let mayLoad = 1 in
  def rm: SI<opc, MRMSrcMem, (outs), (ins RC:$src1, x86memop:$src2),
                     !strconcat(OpcodeStr, "\t{$src2, $src1|$src1, $src2}"),
                     [(set EFLAGS, (OpNode (vt RC:$src1),
                                           (ld_frag addr:$src2)))]>,
          Sched<[sched.Folded, ReadAfterLd]>;
}
}

// sse12_ord_cmp_int - Intrinsic version of sse12_ord_cmp
multiclass sse12_ord_cmp_int<bits<8> opc, RegisterClass RC, SDNode OpNode,
                             ValueType vt, Operand memop,
                             ComplexPattern mem_cpat, string OpcodeStr,
                             X86FoldableSchedWrite sched> {
  def rr_Int: SI<opc, MRMSrcReg, (outs), (ins RC:$src1, RC:$src2),
                     !strconcat(OpcodeStr, "\t{$src2, $src1|$src1, $src2}"),
                     [(set EFLAGS, (OpNode (vt RC:$src1), RC:$src2))]>,
          Sched<[sched]>;
let mayLoad = 1 in
  def rm_Int: SI<opc, MRMSrcMem, (outs), (ins RC:$src1, memop:$src2),
                     !strconcat(OpcodeStr, "\t{$src2, $src1|$src1, $src2}"),
                     [(set EFLAGS, (OpNode (vt RC:$src1),
                                           mem_cpat:$src2))]>,
          Sched<[sched.Folded, ReadAfterLd]>;
}

let Defs = [EFLAGS] in {
  defm VUCOMISS : sse12_ord_cmp<0x2E, FR32, X86cmp, f32, f32mem, loadf32,
                               "ucomiss", WriteFCom>, PS, VEX, VEX_LIG, VEX_WIG;
  defm VUCOMISD : sse12_ord_cmp<0x2E, FR64, X86cmp, f64, f64mem, loadf64,
                               "ucomisd", WriteFCom>, PD, VEX, VEX_LIG, VEX_WIG;
  let Pattern = []<dag> in {
    defm VCOMISS  : sse12_ord_cmp<0x2F, FR32, undef, f32, f32mem, loadf32,
                                "comiss", WriteFCom>, PS, VEX, VEX_LIG, VEX_WIG;
    defm VCOMISD  : sse12_ord_cmp<0x2F, FR64, undef, f64, f64mem, loadf64,
                                "comisd", WriteFCom>, PD, VEX, VEX_LIG, VEX_WIG;
  }

  let isCodeGenOnly = 1 in {
    defm VUCOMISS  : sse12_ord_cmp_int<0x2E, VR128, X86ucomi, v4f32, ssmem,
                      sse_load_f32, "ucomiss", WriteFCom>, PS, VEX, VEX_WIG;
    defm VUCOMISD  : sse12_ord_cmp_int<0x2E, VR128, X86ucomi, v2f64, sdmem,
                      sse_load_f64, "ucomisd", WriteFCom>, PD, VEX, VEX_WIG;

    defm VCOMISS  : sse12_ord_cmp_int<0x2F, VR128, X86comi, v4f32, ssmem,
                       sse_load_f32, "comiss", WriteFCom>, PS, VEX, VEX_WIG;
    defm VCOMISD  : sse12_ord_cmp_int<0x2F, VR128, X86comi, v2f64, sdmem,
                       sse_load_f64, "comisd", WriteFCom>, PD, VEX, VEX_WIG;
  }
  defm UCOMISS  : sse12_ord_cmp<0x2E, FR32, X86cmp, f32, f32mem, loadf32,
                                  "ucomiss", WriteFCom>, PS;
  defm UCOMISD  : sse12_ord_cmp<0x2E, FR64, X86cmp, f64, f64mem, loadf64,
                                  "ucomisd", WriteFCom>, PD;

  let Pattern = []<dag> in {
    defm COMISS  : sse12_ord_cmp<0x2F, FR32, undef, f32, f32mem, loadf32,
                                    "comiss", WriteFCom>, PS;
    defm COMISD  : sse12_ord_cmp<0x2F, FR64, undef, f64, f64mem, loadf64,
                                    "comisd", WriteFCom>, PD;
  }

  let isCodeGenOnly = 1 in {
    defm UCOMISS  : sse12_ord_cmp_int<0x2E, VR128, X86ucomi, v4f32, ssmem,
                            sse_load_f32, "ucomiss", WriteFCom>, PS;
    defm UCOMISD  : sse12_ord_cmp_int<0x2E, VR128, X86ucomi, v2f64, sdmem,
                            sse_load_f64, "ucomisd", WriteFCom>, PD;

    defm COMISS  : sse12_ord_cmp_int<0x2F, VR128, X86comi, v4f32, ssmem,
                                sse_load_f32, "comiss", WriteFCom>, PS;
    defm COMISD  : sse12_ord_cmp_int<0x2F, VR128, X86comi, v2f64, sdmem,
                                    sse_load_f64, "comisd", WriteFCom>, PD;
  }
} // Defs = [EFLAGS]

// sse12_cmp_packed - sse 1 & 2 compare packed instructions
multiclass sse12_cmp_packed<RegisterClass RC, X86MemOperand x86memop,
                            Operand CC,  ValueType VT, string asm,
                            string asm_alt, X86FoldableSchedWrite sched,
                            Domain d, PatFrag ld_frag> {
  let isCommutable = 1 in
  def rri : PIi8<0xC2, MRMSrcReg,
             (outs RC:$dst), (ins RC:$src1, RC:$src2, CC:$cc), asm,
             [(set RC:$dst, (VT (X86cmpp RC:$src1, RC:$src2, imm:$cc)))], d>,
            Sched<[sched]>;
  def rmi : PIi8<0xC2, MRMSrcMem,
             (outs RC:$dst), (ins RC:$src1, x86memop:$src2, CC:$cc), asm,
             [(set RC:$dst,
               (VT (X86cmpp RC:$src1, (ld_frag addr:$src2), imm:$cc)))], d>,
            Sched<[sched.Folded, ReadAfterLd]>;

  // Accept explicit immediate argument form instead of comparison code.
  let isAsmParserOnly = 1, hasSideEffects = 0 in {
    def rri_alt : PIi8<0xC2, MRMSrcReg,
               (outs RC:$dst), (ins RC:$src1, RC:$src2, u8imm:$cc),
               asm_alt, [], d>, Sched<[sched]>, NotMemoryFoldable;
    let mayLoad = 1 in
    def rmi_alt : PIi8<0xC2, MRMSrcMem,
               (outs RC:$dst), (ins RC:$src1, x86memop:$src2, u8imm:$cc),
               asm_alt, [], d>, Sched<[sched.Folded, ReadAfterLd]>,
               NotMemoryFoldable;
  }
}

defm VCMPPS : sse12_cmp_packed<VR128, f128mem, AVXCC, v4f32,
               "cmp${cc}ps\t{$src2, $src1, $dst|$dst, $src1, $src2}",
               "cmpps\t{$cc, $src2, $src1, $dst|$dst, $src1, $src2, $cc}",
               SchedWriteFCmpSizes.PS.XMM, SSEPackedSingle, loadv4f32>, PS, VEX_4V, VEX_WIG;
defm VCMPPD : sse12_cmp_packed<VR128, f128mem, AVXCC, v2f64,
               "cmp${cc}pd\t{$src2, $src1, $dst|$dst, $src1, $src2}",
               "cmppd\t{$cc, $src2, $src1, $dst|$dst, $src1, $src2, $cc}",
               SchedWriteFCmpSizes.PD.XMM, SSEPackedDouble, loadv2f64>, PD, VEX_4V, VEX_WIG;
defm VCMPPSY : sse12_cmp_packed<VR256, f256mem, AVXCC, v8f32,
               "cmp${cc}ps\t{$src2, $src1, $dst|$dst, $src1, $src2}",
               "cmpps\t{$cc, $src2, $src1, $dst|$dst, $src1, $src2, $cc}",
               SchedWriteFCmpSizes.PS.YMM, SSEPackedSingle, loadv8f32>, PS, VEX_4V, VEX_L, VEX_WIG;
defm VCMPPDY : sse12_cmp_packed<VR256, f256mem, AVXCC, v4f64,
               "cmp${cc}pd\t{$src2, $src1, $dst|$dst, $src1, $src2}",
               "cmppd\t{$cc, $src2, $src1, $dst|$dst, $src1, $src2, $cc}",
               SchedWriteFCmpSizes.PD.YMM, SSEPackedDouble, loadv4f64>, PD, VEX_4V, VEX_L, VEX_WIG;
let Constraints = "$src1 = $dst" in {
  defm CMPPS : sse12_cmp_packed<VR128, f128mem, SSECC, v4f32,
                 "cmp${cc}ps\t{$src2, $dst|$dst, $src2}",
                 "cmpps\t{$cc, $src2, $dst|$dst, $src2, $cc}",
                 SchedWriteFCmpSizes.PS.XMM, SSEPackedSingle, memopv4f32>, PS;
  defm CMPPD : sse12_cmp_packed<VR128, f128mem, SSECC, v2f64,
                 "cmp${cc}pd\t{$src2, $dst|$dst, $src2}",
                 "cmppd\t{$cc, $src2, $dst|$dst, $src2, $cc}",
                 SchedWriteFCmpSizes.PD.XMM, SSEPackedDouble, memopv2f64>, PD;
}

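// Immediates 0-7 encode EQ, LT, LE, UNORD, NEQ, NLT, NLE and ORD; of these
// only EQ (0x00), UNORD (0x03), NEQ (0x04) and ORD (0x07) are symmetric in
// their operands, so only those allow the commuted load-folding below.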
def CommutableCMPCC : PatLeaf<(imm), [{
  uint64_t Imm = N->getZExtValue() & 0x7;
  return (Imm == 0x00 || Imm == 0x03 || Imm == 0x04 || Imm == 0x07);
}]>;

// Patterns to select compares with loads in first operand.
let Predicates = [HasAVX] in {
  def : Pat<(v4f64 (X86cmpp (loadv4f64 addr:$src2), VR256:$src1,
                            CommutableCMPCC:$cc)),
            (VCMPPDYrmi VR256:$src1, addr:$src2, imm:$cc)>;

  def : Pat<(v8f32 (X86cmpp (loadv8f32 addr:$src2), VR256:$src1,
                            CommutableCMPCC:$cc)),
            (VCMPPSYrmi VR256:$src1, addr:$src2, imm:$cc)>;

  def : Pat<(v2f64 (X86cmpp (loadv2f64 addr:$src2), VR128:$src1,
                            CommutableCMPCC:$cc)),
            (VCMPPDrmi VR128:$src1, addr:$src2, imm:$cc)>;

  def : Pat<(v4f32 (X86cmpp (loadv4f32 addr:$src2), VR128:$src1,
                            CommutableCMPCC:$cc)),
            (VCMPPSrmi VR128:$src1, addr:$src2, imm:$cc)>;

  def : Pat<(f64 (X86cmps (loadf64 addr:$src2), FR64:$src1,
                          CommutableCMPCC:$cc)),
            (VCMPSDrm FR64:$src1, addr:$src2, imm:$cc)>;

  def : Pat<(f32 (X86cmps (loadf32 addr:$src2), FR32:$src1,
                          CommutableCMPCC:$cc)),
            (VCMPSSrm FR32:$src1, addr:$src2, imm:$cc)>;
}

let Predicates = [UseSSE2] in {
  def : Pat<(v2f64 (X86cmpp (memopv2f64 addr:$src2), VR128:$src1,
                            CommutableCMPCC:$cc)),
            (CMPPDrmi VR128:$src1, addr:$src2, imm:$cc)>;

  def : Pat<(f64 (X86cmps (loadf64 addr:$src2), FR64:$src1,
                          CommutableCMPCC:$cc)),
            (CMPSDrm FR64:$src1, addr:$src2, imm:$cc)>;
}

let Predicates = [UseSSE1] in {
  def : Pat<(v4f32 (X86cmpp (memopv4f32 addr:$src2), VR128:$src1,
                            CommutableCMPCC:$cc)),
            (CMPPSrmi VR128:$src1, addr:$src2, imm:$cc)>;

  def : Pat<(f32 (X86cmps (loadf32 addr:$src2), FR32:$src1,
                          CommutableCMPCC:$cc)),
            (CMPSSrm FR32:$src1, addr:$src2, imm:$cc)>;
}

//===----------------------------------------------------------------------===//
// SSE 1 & 2 - Shuffle Instructions
//===----------------------------------------------------------------------===//

/// sse12_shuffle - sse 1 & 2 fp shuffle instructions
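/// The shuffle immediate selects two elements of $src1 for the low half of
/// the result and two elements of $src2 for the high half (two selector bits
/// per element for shufps, one bit per element for shufpd).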
multiclass sse12_shuffle<RegisterClass RC, X86MemOperand x86memop,
                         ValueType vt, string asm, PatFrag mem_frag,
                         X86FoldableSchedWrite sched, Domain d> {
  def rmi : PIi8<0xC6, MRMSrcMem, (outs RC:$dst),
                   (ins RC:$src1, x86memop:$src2, u8imm:$src3), asm,
                   [(set RC:$dst, (vt (X86Shufp RC:$src1, (mem_frag addr:$src2),
                                       (i8 imm:$src3))))], d>,
            Sched<[sched.Folded, ReadAfterLd]>;
  def rri : PIi8<0xC6, MRMSrcReg, (outs RC:$dst),
                 (ins RC:$src1, RC:$src2, u8imm:$src3), asm,
                 [(set RC:$dst, (vt (X86Shufp RC:$src1, RC:$src2,
                                     (i8 imm:$src3))))], d>,
            Sched<[sched]>;
}

let Predicates = [HasAVX, NoVLX] in {
  defm VSHUFPS  : sse12_shuffle<VR128, f128mem, v4f32,
           "shufps\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
           loadv4f32, SchedWriteFShuffle.XMM, SSEPackedSingle>,
           PS, VEX_4V, VEX_WIG;
  defm VSHUFPSY : sse12_shuffle<VR256, f256mem, v8f32,
           "shufps\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
           loadv8f32, SchedWriteFShuffle.YMM, SSEPackedSingle>,
           PS, VEX_4V, VEX_L, VEX_WIG;
  defm VSHUFPD  : sse12_shuffle<VR128, f128mem, v2f64,
           "shufpd\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
           loadv2f64, SchedWriteFShuffle.XMM, SSEPackedDouble>,
           PD, VEX_4V, VEX_WIG;
  defm VSHUFPDY : sse12_shuffle<VR256, f256mem, v4f64,
           "shufpd\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
           loadv4f64, SchedWriteFShuffle.YMM, SSEPackedDouble>,
           PD, VEX_4V, VEX_L, VEX_WIG;
}
let Constraints = "$src1 = $dst" in {
  defm SHUFPS : sse12_shuffle<VR128, f128mem, v4f32,
                    "shufps\t{$src3, $src2, $dst|$dst, $src2, $src3}",
                    memopv4f32, SchedWriteFShuffle.XMM, SSEPackedSingle>, PS;
  defm SHUFPD : sse12_shuffle<VR128, f128mem, v2f64,
                    "shufpd\t{$src3, $src2, $dst|$dst, $src2, $src3}",
                    memopv2f64, SchedWriteFShuffle.XMM, SSEPackedDouble>, PD;
}

//===----------------------------------------------------------------------===//
// SSE 1 & 2 - Unpack FP Instructions
//===----------------------------------------------------------------------===//

/// sse12_unpack_interleave - SSE 1 & 2 FP unpack and interleave
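/// For the 128-bit single-precision forms:
///   unpcklps: dst = { src1[0], src2[0], src1[1], src2[1] }
///   unpckhps: dst = { src1[2], src2[2], src1[3], src2[3] }
/// The 256-bit forms repeat the interleave within each 128-bit lane.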
multiclass sse12_unpack_interleave<bits<8> opc, SDNode OpNode, ValueType vt,
                                   PatFrag mem_frag, RegisterClass RC,
                                   X86MemOperand x86memop, string asm,
                                   X86FoldableSchedWrite sched, Domain d,
                                   bit IsCommutable = 0> {
    let isCommutable = IsCommutable in
    def rr : PI<opc, MRMSrcReg,
                (outs RC:$dst), (ins RC:$src1, RC:$src2),
                asm, [(set RC:$dst,
                           (vt (OpNode RC:$src1, RC:$src2)))], d>,
                Sched<[sched]>;
    def rm : PI<opc, MRMSrcMem,
                (outs RC:$dst), (ins RC:$src1, x86memop:$src2),
                asm, [(set RC:$dst,
                           (vt (OpNode RC:$src1,
                                       (mem_frag addr:$src2))))], d>,
             Sched<[sched.Folded, ReadAfterLd]>;
}

let Predicates = [HasAVX, NoVLX] in {
defm VUNPCKHPS: sse12_unpack_interleave<0x15, X86Unpckh, v4f32, loadv4f32,
      VR128, f128mem, "unpckhps\t{$src2, $src1, $dst|$dst, $src1, $src2}",
                     SchedWriteFShuffle.XMM, SSEPackedSingle>, PS, VEX_4V, VEX_WIG;
defm VUNPCKHPD: sse12_unpack_interleave<0x15, X86Unpckh, v2f64, loadv2f64,
      VR128, f128mem, "unpckhpd\t{$src2, $src1, $dst|$dst, $src1, $src2}",
                     SchedWriteFShuffle.XMM, SSEPackedDouble, 1>, PD, VEX_4V, VEX_WIG;
defm VUNPCKLPS: sse12_unpack_interleave<0x14, X86Unpckl, v4f32, loadv4f32,
      VR128, f128mem, "unpcklps\t{$src2, $src1, $dst|$dst, $src1, $src2}",
                     SchedWriteFShuffle.XMM, SSEPackedSingle>, PS, VEX_4V, VEX_WIG;
defm VUNPCKLPD: sse12_unpack_interleave<0x14, X86Unpckl, v2f64, loadv2f64,
      VR128, f128mem, "unpcklpd\t{$src2, $src1, $dst|$dst, $src1, $src2}",
                     SchedWriteFShuffle.XMM, SSEPackedDouble>, PD, VEX_4V, VEX_WIG;

defm VUNPCKHPSY: sse12_unpack_interleave<0x15, X86Unpckh, v8f32, loadv8f32,
      VR256, f256mem, "unpckhps\t{$src2, $src1, $dst|$dst, $src1, $src2}",
                     SchedWriteFShuffle.YMM, SSEPackedSingle>, PS, VEX_4V, VEX_L, VEX_WIG;
defm VUNPCKHPDY: sse12_unpack_interleave<0x15, X86Unpckh, v4f64, loadv4f64,
      VR256, f256mem, "unpckhpd\t{$src2, $src1, $dst|$dst, $src1, $src2}",
                     SchedWriteFShuffle.YMM, SSEPackedDouble>, PD, VEX_4V, VEX_L, VEX_WIG;
defm VUNPCKLPSY: sse12_unpack_interleave<0x14, X86Unpckl, v8f32, loadv8f32,
      VR256, f256mem, "unpcklps\t{$src2, $src1, $dst|$dst, $src1, $src2}",
                     SchedWriteFShuffle.YMM, SSEPackedSingle>, PS, VEX_4V, VEX_L, VEX_WIG;
defm VUNPCKLPDY: sse12_unpack_interleave<0x14, X86Unpckl, v4f64, loadv4f64,
      VR256, f256mem, "unpcklpd\t{$src2, $src1, $dst|$dst, $src1, $src2}",
                     SchedWriteFShuffle.YMM, SSEPackedDouble>, PD, VEX_4V, VEX_L, VEX_WIG;
} // Predicates = [HasAVX, NoVLX]

let Constraints = "$src1 = $dst" in {
  defm UNPCKHPS: sse12_unpack_interleave<0x15, X86Unpckh, v4f32, memopv4f32,
        VR128, f128mem, "unpckhps\t{$src2, $dst|$dst, $src2}",
                       SchedWriteFShuffle.XMM, SSEPackedSingle>, PS;
  defm UNPCKHPD: sse12_unpack_interleave<0x15, X86Unpckh, v2f64, memopv2f64,
        VR128, f128mem, "unpckhpd\t{$src2, $dst|$dst, $src2}",
                       SchedWriteFShuffle.XMM, SSEPackedDouble, 1>, PD;
  defm UNPCKLPS: sse12_unpack_interleave<0x14, X86Unpckl, v4f32, memopv4f32,
        VR128, f128mem, "unpcklps\t{$src2, $dst|$dst, $src2}",
                       SchedWriteFShuffle.XMM, SSEPackedSingle>, PS;
  defm UNPCKLPD: sse12_unpack_interleave<0x14, X86Unpckl, v2f64, memopv2f64,
        VR128, f128mem, "unpcklpd\t{$src2, $dst|$dst, $src2}",
                       SchedWriteFShuffle.XMM, SSEPackedDouble>, PD;
} // Constraints = "$src1 = $dst"

let Predicates = [HasAVX1Only] in {
  def : Pat<(v8i32 (X86Unpckl VR256:$src1, (bc_v8i32 (loadv4i64 addr:$src2)))),
            (VUNPCKLPSYrm VR256:$src1, addr:$src2)>;
  def : Pat<(v8i32 (X86Unpckl VR256:$src1, VR256:$src2)),
            (VUNPCKLPSYrr VR256:$src1, VR256:$src2)>;
  def : Pat<(v8i32 (X86Unpckh VR256:$src1, (bc_v8i32 (loadv4i64 addr:$src2)))),
            (VUNPCKHPSYrm VR256:$src1, addr:$src2)>;
  def : Pat<(v8i32 (X86Unpckh VR256:$src1, VR256:$src2)),
            (VUNPCKHPSYrr VR256:$src1, VR256:$src2)>;

  def : Pat<(v4i64 (X86Unpckl VR256:$src1, (loadv4i64 addr:$src2))),
            (VUNPCKLPDYrm VR256:$src1, addr:$src2)>;
  def : Pat<(v4i64 (X86Unpckl VR256:$src1, VR256:$src2)),
            (VUNPCKLPDYrr VR256:$src1, VR256:$src2)>;
  def : Pat<(v4i64 (X86Unpckh VR256:$src1, (loadv4i64 addr:$src2))),
            (VUNPCKHPDYrm VR256:$src1, addr:$src2)>;
  def : Pat<(v4i64 (X86Unpckh VR256:$src1, VR256:$src2)),
            (VUNPCKHPDYrr VR256:$src1, VR256:$src2)>;
}

//===----------------------------------------------------------------------===//
// SSE 1 & 2 - Extract Floating-Point Sign Mask
//===----------------------------------------------------------------------===//

/// sse12_extr_sign_mask - SSE 1 & 2 FP sign mask extraction
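/// movmskps/movmskpd collect the sign bit of each packed element into the
/// low bits of a GPR (4 bits for v4f32, 2 for v2f64, 8 for v8f32), zeroing
/// the remaining bits.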
multiclass sse12_extr_sign_mask<RegisterClass RC, ValueType vt,
                                string asm, Domain d> {
  def rr : PI<0x50, MRMSrcReg, (outs GR32orGR64:$dst), (ins RC:$src),
              !strconcat(asm, "\t{$src, $dst|$dst, $src}"),
              [(set GR32orGR64:$dst, (X86movmsk (vt RC:$src)))], d>,
              Sched<[WriteFMOVMSK]>;
}

let Predicates = [HasAVX] in {
  defm VMOVMSKPS : sse12_extr_sign_mask<VR128, v4f32, "movmskps",
                                        SSEPackedSingle>, PS, VEX, VEX_WIG;
  defm VMOVMSKPD : sse12_extr_sign_mask<VR128, v2f64, "movmskpd",
                                        SSEPackedDouble>, PD, VEX, VEX_WIG;
  defm VMOVMSKPSY : sse12_extr_sign_mask<VR256, v8f32, "movmskps",
                                         SSEPackedSingle>, PS, VEX, VEX_L, VEX_WIG;
  defm VMOVMSKPDY : sse12_extr_sign_mask<VR256, v4f64, "movmskpd",
                                         SSEPackedDouble>, PD, VEX, VEX_L, VEX_WIG;
}

defm MOVMSKPS : sse12_extr_sign_mask<VR128, v4f32, "movmskps",
                                     SSEPackedSingle>, PS;
defm MOVMSKPD : sse12_extr_sign_mask<VR128, v2f64, "movmskpd",
                                     SSEPackedDouble>, PD;

//===---------------------------------------------------------------------===//
// SSE2 - Packed Integer Logical Instructions
//===---------------------------------------------------------------------===//

let ExeDomain = SSEPackedInt in { // SSE integer instructions

/// PDI_binop_rm - Simple SSE2 binary operator.
multiclass PDI_binop_rm<bits<8> opc, string OpcodeStr, SDNode OpNode,
                        ValueType OpVT, RegisterClass RC, PatFrag memop_frag,
                        X86MemOperand x86memop, X86FoldableSchedWrite sched,
                        bit IsCommutable, bit Is2Addr> {
  let isCommutable = IsCommutable in
  def rr : PDI<opc, MRMSrcReg, (outs RC:$dst),
       (ins RC:$src1, RC:$src2),
       !if(Is2Addr,
           !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
           !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
       [(set RC:$dst, (OpVT (OpNode RC:$src1, RC:$src2)))]>,
       Sched<[sched]>;
  def rm : PDI<opc, MRMSrcMem, (outs RC:$dst),
       (ins RC:$src1, x86memop:$src2),
       !if(Is2Addr,
           !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
           !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
       [(set RC:$dst, (OpVT (OpNode RC:$src1,
                                     (bitconvert (memop_frag addr:$src2)))))]>,
       Sched<[sched.Folded, ReadAfterLd]>;
}
} // ExeDomain = SSEPackedInt

multiclass PDI_binop_all<bits<8> opc, string OpcodeStr, SDNode Opcode,
                         ValueType OpVT128, ValueType OpVT256,
                         X86SchedWriteWidths sched, bit IsCommutable,
                         Predicate prd> {
let Predicates = [HasAVX, prd] in
  defm V#NAME : PDI_binop_rm<opc, !strconcat("v", OpcodeStr), Opcode, OpVT128,
                             VR128, loadv2i64, i128mem, sched.XMM,
                             IsCommutable, 0>, VEX_4V, VEX_WIG;

let Constraints = "$src1 = $dst" in
  defm NAME : PDI_binop_rm<opc, OpcodeStr, Opcode, OpVT128, VR128,
                           memopv2i64, i128mem, sched.XMM, IsCommutable, 1>;

let Predicates = [HasAVX2, prd] in
  defm V#NAME#Y : PDI_binop_rm<opc, !strconcat("v", OpcodeStr), Opcode,
                               OpVT256, VR256, loadv4i64, i256mem, sched.YMM,
                               IsCommutable, 0>, VEX_4V, VEX_L, VEX_WIG;
}

// These are ordered here for pattern ordering requirements with the fp
// versions.

defm PAND  : PDI_binop_all<0xDB, "pand", and, v2i64, v4i64,
                           SchedWriteVecLogic, 1, NoVLX>;
defm POR   : PDI_binop_all<0xEB, "por", or, v2i64, v4i64,
                           SchedWriteVecLogic, 1, NoVLX>;
defm PXOR  : PDI_binop_all<0xEF, "pxor", xor, v2i64, v4i64,
                           SchedWriteVecLogic, 1, NoVLX>;
defm PANDN : PDI_binop_all<0xDF, "pandn", X86andnp, v2i64, v4i64,
                           SchedWriteVecLogic, 0, NoVLX>;
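
// Note that pandn computes (NOT src1) AND src2, which is why PANDN alone is
// defined with IsCommutable = 0 above.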

//===----------------------------------------------------------------------===//
// SSE 1 & 2 - Logical Instructions
//===----------------------------------------------------------------------===//

/// sse12_fp_packed_logical - SSE 1 & 2 packed FP logical ops
///
/// There are no patterns here because isel prefers integer versions for SSE2
/// and later. There are SSE1 v4f32 patterns later.
multiclass sse12_fp_packed_logical<bits<8> opc, string OpcodeStr,
                                   SDNode OpNode, X86SchedWriteWidths sched> {
  let Predicates = [HasAVX, NoVLX] in {
  defm V#NAME#PSY : sse12_fp_packed_logical_rm<opc, VR256, SSEPackedSingle,
        !strconcat(OpcodeStr, "ps"), f256mem, sched.YMM,
        [], [], 0>, PS, VEX_4V, VEX_L, VEX_WIG;

  defm V#NAME#PDY : sse12_fp_packed_logical_rm<opc, VR256, SSEPackedDouble,
        !strconcat(OpcodeStr, "pd"), f256mem, sched.YMM,
        [], [], 0>, PD, VEX_4V, VEX_L, VEX_WIG;

  defm V#NAME#PS : sse12_fp_packed_logical_rm<opc, VR128, SSEPackedSingle,
       !strconcat(OpcodeStr, "ps"), f128mem, sched.XMM,
       [], [], 0>, PS, VEX_4V, VEX_WIG;

  defm V#NAME#PD : sse12_fp_packed_logical_rm<opc, VR128, SSEPackedDouble,
       !strconcat(OpcodeStr, "pd"), f128mem, sched.XMM,
       [], [], 0>, PD, VEX_4V, VEX_WIG;
  }

  let Constraints = "$src1 = $dst" in {
    defm PS : sse12_fp_packed_logical_rm<opc, VR128, SSEPackedSingle,
         !strconcat(OpcodeStr, "ps"), f128mem, sched.XMM,
         [], []>, PS;

    defm PD : sse12_fp_packed_logical_rm<opc, VR128, SSEPackedDouble,
         !strconcat(OpcodeStr, "pd"), f128mem, sched.XMM,
         [], []>, PD;
  }
}

defm AND  : sse12_fp_packed_logical<0x54, "and", and, SchedWriteFLogic>;
defm OR   : sse12_fp_packed_logical<0x56, "or", or, SchedWriteFLogic>;
defm XOR  : sse12_fp_packed_logical<0x57, "xor", xor, SchedWriteFLogic>;
let isCommutable = 0 in
  defm ANDN : sse12_fp_packed_logical<0x55, "andn", X86andnp, SchedWriteFLogic>;

// If only AVX1 is supported, we need to handle integer operations with
// floating point instructions since the integer versions aren't available.
let Predicates = [HasAVX1Only] in {
  def : Pat<(v4i64 (and VR256:$src1, VR256:$src2)),
            (VANDPSYrr VR256:$src1, VR256:$src2)>;
  def : Pat<(v4i64 (or VR256:$src1, VR256:$src2)),
            (VORPSYrr VR256:$src1, VR256:$src2)>;
  def : Pat<(v4i64 (xor VR256:$src1, VR256:$src2)),
            (VXORPSYrr VR256:$src1, VR256:$src2)>;
  def : Pat<(v4i64 (X86andnp VR256:$src1, VR256:$src2)),
            (VANDNPSYrr VR256:$src1, VR256:$src2)>;

  def : Pat<(and VR256:$src1, (loadv4i64 addr:$src2)),
            (VANDPSYrm VR256:$src1, addr:$src2)>;
  def : Pat<(or VR256:$src1, (loadv4i64 addr:$src2)),
            (VORPSYrm VR256:$src1, addr:$src2)>;
  def : Pat<(xor VR256:$src1, (loadv4i64 addr:$src2)),
            (VXORPSYrm VR256:$src1, addr:$src2)>;
  def : Pat<(X86andnp VR256:$src1, (loadv4i64 addr:$src2)),
            (VANDNPSYrm VR256:$src1, addr:$src2)>;
}

let Predicates = [HasAVX, NoVLX_Or_NoDQI] in {
  // Use packed logical operations for scalar ops.
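  // There is no scalar form of these instructions (no "andss"/"andsd"), so
  // the scalar value is moved into a vector register, operated on with the
  // packed instruction, and copied back out.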
  def : Pat<(f64 (X86fand FR64:$src1, FR64:$src2)),
            (COPY_TO_REGCLASS
             (v2f64 (VANDPDrr (v2f64 (COPY_TO_REGCLASS FR64:$src1, VR128)),
                              (v2f64 (COPY_TO_REGCLASS FR64:$src2, VR128)))),
             FR64)>;
  def : Pat<(f64 (X86for FR64:$src1, FR64:$src2)),
            (COPY_TO_REGCLASS
             (v2f64 (VORPDrr (v2f64 (COPY_TO_REGCLASS FR64:$src1, VR128)),
                             (v2f64 (COPY_TO_REGCLASS FR64:$src2, VR128)))),
             FR64)>;
  def : Pat<(f64 (X86fxor FR64:$src1, FR64:$src2)),
            (COPY_TO_REGCLASS
             (v2f64 (VXORPDrr (v2f64 (COPY_TO_REGCLASS FR64:$src1, VR128)),
                              (v2f64 (COPY_TO_REGCLASS FR64:$src2, VR128)))),
             FR64)>;
  def : Pat<(f64 (X86fandn FR64:$src1, FR64:$src2)),
            (COPY_TO_REGCLASS
             (v2f64 (VANDNPDrr (v2f64 (COPY_TO_REGCLASS FR64:$src1, VR128)),
                               (v2f64 (COPY_TO_REGCLASS FR64:$src2, VR128)))),
             FR64)>;

  def : Pat<(f32 (X86fand FR32:$src1, FR32:$src2)),
            (COPY_TO_REGCLASS
             (v4f32 (VANDPSrr (v4f32 (COPY_TO_REGCLASS FR32:$src1, VR128)),
                              (v4f32 (COPY_TO_REGCLASS FR32:$src2, VR128)))),
             FR32)>;
  def : Pat<(f32 (X86for FR32:$src1, FR32:$src2)),
            (COPY_TO_REGCLASS
             (v4f32 (VORPSrr (v4f32 (COPY_TO_REGCLASS FR32:$src1, VR128)),
                             (v4f32 (COPY_TO_REGCLASS FR32:$src2, VR128)))),
             FR32)>;
  def : Pat<(f32 (X86fxor FR32:$src1, FR32:$src2)),
            (COPY_TO_REGCLASS
             (v4f32 (VXORPSrr (v4f32 (COPY_TO_REGCLASS FR32:$src1, VR128)),
                              (v4f32 (COPY_TO_REGCLASS FR32:$src2, VR128)))),
             FR32)>;
  def : Pat<(f32 (X86fandn FR32:$src1, FR32:$src2)),
            (COPY_TO_REGCLASS
             (v4f32 (VANDNPSrr (v4f32 (COPY_TO_REGCLASS FR32:$src1, VR128)),
                               (v4f32 (COPY_TO_REGCLASS FR32:$src2, VR128)))),
             FR32)>;
}

let Predicates = [UseSSE1] in {
  // Use packed logical operations for scalar ops.
  def : Pat<(f32 (X86fand FR32:$src1, FR32:$src2)),
            (COPY_TO_REGCLASS
             (v4f32 (ANDPSrr (v4f32 (COPY_TO_REGCLASS FR32:$src1, VR128)),
                             (v4f32 (COPY_TO_REGCLASS FR32:$src2, VR128)))),
             FR32)>;
  def : Pat<(f32 (X86for FR32:$src1, FR32:$src2)),
            (COPY_TO_REGCLASS
             (v4f32 (ORPSrr (v4f32 (COPY_TO_REGCLASS FR32:$src1, VR128)),
                            (v4f32 (COPY_TO_REGCLASS FR32:$src2, VR128)))),
             FR32)>;
  def : Pat<(f32 (X86fxor FR32:$src1, FR32:$src2)),
            (COPY_TO_REGCLASS
             (v4f32 (XORPSrr (v4f32 (COPY_TO_REGCLASS FR32:$src1, VR128)),
                             (v4f32 (COPY_TO_REGCLASS FR32:$src2, VR128)))),
             FR32)>;
  def : Pat<(f32 (X86fandn FR32:$src1, FR32:$src2)),
            (COPY_TO_REGCLASS
             (v4f32 (ANDNPSrr (v4f32 (COPY_TO_REGCLASS FR32:$src1, VR128)),
                              (v4f32 (COPY_TO_REGCLASS FR32:$src2, VR128)))),
             FR32)>;
}

let Predicates = [UseSSE2] in {
  // Use packed logical operations for scalar ops.
  def : Pat<(f64 (X86fand FR64:$src1, FR64:$src2)),
            (COPY_TO_REGCLASS
             (v2f64 (ANDPDrr (v2f64 (COPY_TO_REGCLASS FR64:$src1, VR128)),
                             (v2f64 (COPY_TO_REGCLASS FR64:$src2, VR128)))),
             FR64)>;
  def : Pat<(f64 (X86for FR64:$src1, FR64:$src2)),
            (COPY_TO_REGCLASS
             (v2f64 (ORPDrr (v2f64 (COPY_TO_REGCLASS FR64:$src1, VR128)),
                            (v2f64 (COPY_TO_REGCLASS FR64:$src2, VR128)))),
             FR64)>;
  def : Pat<(f64 (X86fxor FR64:$src1, FR64:$src2)),
            (COPY_TO_REGCLASS
             (v2f64 (XORPDrr (v2f64 (COPY_TO_REGCLASS FR64:$src1, VR128)),
                             (v2f64 (COPY_TO_REGCLASS FR64:$src2, VR128)))),
             FR64)>;
  def : Pat<(f64 (X86fandn FR64:$src1, FR64:$src2)),
            (COPY_TO_REGCLASS
             (v2f64 (ANDNPDrr (v2f64 (COPY_TO_REGCLASS FR64:$src1, VR128)),
                              (v2f64 (COPY_TO_REGCLASS FR64:$src2, VR128)))),
             FR64)>;
}

// Patterns for packed operations when the integer type isn't available.
def : Pat<(v4f32 (X86fand VR128:$src1, VR128:$src2)),
          (ANDPSrr VR128:$src1, VR128:$src2)>;
def : Pat<(v4f32 (X86for VR128:$src1, VR128:$src2)),
          (ORPSrr VR128:$src1, VR128:$src2)>;
def : Pat<(v4f32 (X86fxor VR128:$src1, VR128:$src2)),
          (XORPSrr VR128:$src1, VR128:$src2)>;
def : Pat<(v4f32 (X86fandn VR128:$src1, VR128:$src2)),
          (ANDNPSrr VR128:$src1, VR128:$src2)>;

def : Pat<(X86fand VR128:$src1, (memopv4f32 addr:$src2)),
          (ANDPSrm VR128:$src1, addr:$src2)>;
def : Pat<(X86for VR128:$src1, (memopv4f32 addr:$src2)),
          (ORPSrm VR128:$src1, addr:$src2)>;
def : Pat<(X86fxor VR128:$src1, (memopv4f32 addr:$src2)),
          (XORPSrm VR128:$src1, addr:$src2)>;
def : Pat<(X86fandn VR128:$src1, (memopv4f32 addr:$src2)),
          (ANDNPSrm VR128:$src1, addr:$src2)>;

//===----------------------------------------------------------------------===//
// SSE 1 & 2 - Arithmetic Instructions
//===----------------------------------------------------------------------===//

/// basic_sse12_fp_binop_xxx - SSE 1 & 2 binops come in both scalar and
/// vector forms.
///
/// In addition, we also have a special variant of the scalar form here to
/// represent the associated intrinsic operation.  This form is unlike the
/// plain scalar form, in that it takes an entire vector (instead of a scalar)
/// and leaves the top elements unmodified (therefore these cannot be commuted).
///
/// These three forms can each be reg+reg or reg+mem.
///
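/// For example, 'addps' updates all four packed single-precision elements of
/// the destination, while 'addss' updates only the low element and leaves the
/// other three unchanged.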

/// FIXME: once all 256-bit intrinsics are matched, clean up and refactor those
/// classes below
multiclass basic_sse12_fp_binop_p<bits<8> opc, string OpcodeStr,
                                  SDNode OpNode, X86SchedWriteSizes sched> {
  let Predicates = [HasAVX, NoVLX] in {
  defm V#NAME#PS : sse12_fp_packed<opc, !strconcat(OpcodeStr, "ps"), OpNode,
                               VR128, v4f32, f128mem, loadv4f32,
                               SSEPackedSingle, sched.PS.XMM, 0>, PS, VEX_4V, VEX_WIG;
  defm V#NAME#PD : sse12_fp_packed<opc, !strconcat(OpcodeStr, "pd"), OpNode,
                               VR128, v2f64, f128mem, loadv2f64,
                               SSEPackedDouble, sched.PD.XMM, 0>, PD, VEX_4V, VEX_WIG;

  defm V#NAME#PSY : sse12_fp_packed<opc, !strconcat(OpcodeStr, "ps"),
                        OpNode, VR256, v8f32, f256mem, loadv8f32,
                        SSEPackedSingle, sched.PS.YMM, 0>, PS, VEX_4V, VEX_L, VEX_WIG;
  defm V#NAME#PDY : sse12_fp_packed<opc, !strconcat(OpcodeStr, "pd"),
                        OpNode, VR256, v4f64, f256mem, loadv4f64,
                        SSEPackedDouble, sched.PD.YMM, 0>, PD, VEX_4V, VEX_L, VEX_WIG;
  }

  let Constraints = "$src1 = $dst" in {
    defm PS : sse12_fp_packed<opc, !strconcat(OpcodeStr, "ps"), OpNode, VR128,
                              v4f32, f128mem, memopv4f32, SSEPackedSingle,
                              sched.PS.XMM>, PS;
    defm PD : sse12_fp_packed<opc, !strconcat(OpcodeStr, "pd"), OpNode, VR128,
                              v2f64, f128mem, memopv2f64, SSEPackedDouble,
                              sched.PD.XMM>, PD;
  }
}

multiclass basic_sse12_fp_binop_s<bits<8> opc, string OpcodeStr, SDNode OpNode,
                                  X86SchedWriteSizes sched> {
  defm V#NAME#SS : sse12_fp_scalar<opc, !strconcat(OpcodeStr, "ss"),
                         OpNode, FR32, f32mem, SSEPackedSingle, sched.PS.Scl, 0>,
                         XS, VEX_4V, VEX_LIG, VEX_WIG;
  defm V#NAME#SD : sse12_fp_scalar<opc, !strconcat(OpcodeStr, "sd"),
                         OpNode, FR64, f64mem, SSEPackedDouble, sched.PD.Scl, 0>,
                         XD, VEX_4V, VEX_LIG, VEX_WIG;

  let Constraints = "$src1 = $dst" in {
    defm SS : sse12_fp_scalar<opc, !strconcat(OpcodeStr, "ss"),
                              OpNode, FR32, f32mem, SSEPackedSingle,
                              sched.PS.Scl>, XS;
    defm SD : sse12_fp_scalar<opc, !strconcat(OpcodeStr, "sd"),
                              OpNode, FR64, f64mem, SSEPackedDouble,
                              sched.PD.Scl>, XD;
  }
}

multiclass basic_sse12_fp_binop_s_int<bits<8> opc, string OpcodeStr,
                                      SDPatternOperator OpNode,
                                      X86SchedWriteSizes sched> {
  defm V#NAME#SS : sse12_fp_scalar_int<opc, OpcodeStr, OpNode, VR128, v4f32,
                   !strconcat(OpcodeStr, "ss"), ssmem, sse_load_f32,
                   SSEPackedSingle, sched.PS.Scl, 0>, XS, VEX_4V, VEX_LIG, VEX_WIG;
  defm V#NAME#SD : sse12_fp_scalar_int<opc, OpcodeStr, OpNode, VR128, v2f64,
                   !strconcat(OpcodeStr, "sd"), sdmem, sse_load_f64,
                   SSEPackedDouble, sched.PD.Scl, 0>, XD, VEX_4V, VEX_LIG, VEX_WIG;

  let Constraints = "$src1 = $dst" in {
    defm SS : sse12_fp_scalar_int<opc, OpcodeStr, OpNode, VR128, v4f32,
                   !strconcat(OpcodeStr, "ss"), ssmem, sse_load_f32,
                   SSEPackedSingle, sched.PS.Scl>, XS;
    defm SD : sse12_fp_scalar_int<opc, OpcodeStr, OpNode, VR128, v2f64,
                   !strconcat(OpcodeStr, "sd"), sdmem, sse_load_f64,
                   SSEPackedDouble, sched.PD.Scl>, XD;
  }
}

// Binary Arithmetic instructions
defm ADD : basic_sse12_fp_binop_p<0x58, "add", fadd, SchedWriteFAddSizes>,
           basic_sse12_fp_binop_s<0x58, "add", fadd, SchedWriteFAddSizes>,
           basic_sse12_fp_binop_s_int<0x58, "add", null_frag, SchedWriteFAddSizes>;
defm MUL : basic_sse12_fp_binop_p<0x59, "mul", fmul, SchedWriteFMulSizes>,
           basic_sse12_fp_binop_s<0x59, "mul", fmul, SchedWriteFMulSizes>,
           basic_sse12_fp_binop_s_int<0x59, "mul", null_frag, SchedWriteFMulSizes>;
let isCommutable = 0 in {
  defm SUB : basic_sse12_fp_binop_p<0x5C, "sub", fsub, SchedWriteFAddSizes>,
             basic_sse12_fp_binop_s<0x5C, "sub", fsub, SchedWriteFAddSizes>,
             basic_sse12_fp_binop_s_int<0x5C, "sub", null_frag, SchedWriteFAddSizes>;
  defm DIV : basic_sse12_fp_binop_p<0x5E, "div", fdiv, SchedWriteFDivSizes>,
             basic_sse12_fp_binop_s<0x5E, "div", fdiv, SchedWriteFDivSizes>,
             basic_sse12_fp_binop_s_int<0x5E, "div", null_frag, SchedWriteFDivSizes>;
  defm MAX : basic_sse12_fp_binop_p<0x5F, "max", X86fmax, SchedWriteFCmpSizes>,
             basic_sse12_fp_binop_s<0x5F, "max", X86fmax, SchedWriteFCmpSizes>,
             basic_sse12_fp_binop_s_int<0x5F, "max", X86fmaxs, SchedWriteFCmpSizes>;
  defm MIN : basic_sse12_fp_binop_p<0x5D, "min", X86fmin, SchedWriteFCmpSizes>,
             basic_sse12_fp_binop_s<0x5D, "min", X86fmin, SchedWriteFCmpSizes>,
             basic_sse12_fp_binop_s_int<0x5D, "min", X86fmins, SchedWriteFCmpSizes>;
}

let isCodeGenOnly = 1 in {
  defm MAXC: basic_sse12_fp_binop_p<0x5F, "max", X86fmaxc, SchedWriteFCmpSizes>,
             basic_sse12_fp_binop_s<0x5F, "max", X86fmaxc, SchedWriteFCmpSizes>;
  defm MINC: basic_sse12_fp_binop_p<0x5D, "min", X86fminc, SchedWriteFCmpSizes>,
             basic_sse12_fp_binop_s<0x5D, "min", X86fminc, SchedWriteFCmpSizes>;
}

// Patterns used to select SSE scalar fp arithmetic instructions from
// either:
//
// (1) a scalar fp operation followed by a blend
//
// The effect is that the backend no longer emits unnecessary vector
// insert instructions immediately after SSE scalar fp instructions
// like addss or mulss.
//
// For example, given the following code:
//   __m128 foo(__m128 A, __m128 B) {
//     A[0] += B[0];
//     return A;
//   }
//
// Previously we generated:
//   addss %xmm0, %xmm1
//   movss %xmm1, %xmm0
//
// We now generate:
//   addss %xmm1, %xmm0
//
// (2) a vector packed single/double fp operation followed by a vector insert
//
// The effect is that the backend converts the packed fp instruction
// followed by a vector insert into a single SSE scalar fp instruction.
//
// For example, given the following code:
//   __m128 foo(__m128 A, __m128 B) {
//     __m128 C = A + B;
//     return (__m128) {C[0], A[1], A[2], A[3]};
//   }
//
// Previously we generated:
//   addps %xmm0, %xmm1
//   movss %xmm1, %xmm0
//
// We now generate:
//   addss %xmm1, %xmm0

// TODO: Some canonicalization in lowering would simplify the number of
// patterns we have to try to match.
multiclass scalar_math_patterns<SDNode Op, string OpcPrefix, SDNode Move,
                                    ValueType VT, ValueType EltTy,
                                    RegisterClass RC, Predicate BasePredicate> {
  let Predicates = [BasePredicate] in {
    // extracted scalar math op with insert via movss/movsd
    def : Pat<(VT (Move (VT VR128:$dst),
                        (VT (scalar_to_vector
                             (Op (EltTy (extractelt (VT VR128:$dst), (iPTR 0))),
                                 RC:$src))))),
              (!cast<Instruction>(OpcPrefix#rr_Int) VT:$dst,
               (VT (COPY_TO_REGCLASS RC:$src, VR128)))>;
  }

  // Repeat for AVX versions of the instructions.
  let Predicates = [UseAVX] in {
    // extracted scalar math op with insert via movss/movsd
    def : Pat<(VT (Move (VT VR128:$dst),
                        (VT (scalar_to_vector
                             (Op (EltTy (extractelt (VT VR128:$dst), (iPTR 0))),
                                 RC:$src))))),
              (!cast<Instruction>("V"#OpcPrefix#rr_Int) VT:$dst,
               (VT (COPY_TO_REGCLASS RC:$src, VR128)))>;
  }
}

defm : scalar_math_patterns<fadd, "ADDSS", X86Movss, v4f32, f32, FR32, UseSSE1>;
defm : scalar_math_patterns<fsub, "SUBSS", X86Movss, v4f32, f32, FR32, UseSSE1>;
defm : scalar_math_patterns<fmul, "MULSS", X86Movss, v4f32, f32, FR32, UseSSE1>;
defm : scalar_math_patterns<fdiv, "DIVSS", X86Movss, v4f32, f32, FR32, UseSSE1>;

defm : scalar_math_patterns<fadd, "ADDSD", X86Movsd, v2f64, f64, FR64, UseSSE2>;
defm : scalar_math_patterns<fsub, "SUBSD", X86Movsd, v2f64, f64, FR64, UseSSE2>;
defm : scalar_math_patterns<fmul, "MULSD", X86Movsd, v2f64, f64, FR64, UseSSE2>;
defm : scalar_math_patterns<fdiv, "DIVSD", X86Movsd, v2f64, f64, FR64, UseSSE2>;

/// Unop Arithmetic
/// In addition, we also have a special variant of the scalar form here to
/// represent the associated intrinsic operation.  This form is unlike the
/// plain scalar form, in that it takes an entire vector (instead of a
/// scalar) and leaves the top elements undefined.
///
/// And we have a special variant for the full-vector intrinsic form.

/// sse_fp_unop_s - SSE1 unops in scalar form
/// For the non-AVX defs, we need $src1 to be tied to $dst because
/// the HW instructions are 2 operand / destructive.
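/// For example, 'sqrtss %xmm1, %xmm0' overwrites only the low element of
/// %xmm0 and leaves its upper elements intact.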
multiclass sse_fp_unop_s<bits<8> opc, string OpcodeStr, RegisterClass RC,
                          ValueType ScalarVT, X86MemOperand x86memop,
                          Operand intmemop, SDNode OpNode, Domain d,
                          X86FoldableSchedWrite sched, Predicate target> {
  let hasSideEffects = 0 in {
  def r : I<opc, MRMSrcReg, (outs RC:$dst), (ins RC:$src1),
              !strconcat(OpcodeStr, "\t{$src1, $dst|$dst, $src1}"),
            [(set RC:$dst, (OpNode RC:$src1))], d>, Sched<[sched]>,
            Requires<[target]>;
  let mayLoad = 1 in
  def m : I<opc, MRMSrcMem, (outs RC:$dst), (ins x86memop:$src1),
            !strconcat(OpcodeStr, "\t{$src1, $dst|$dst, $src1}"),
            [(set RC:$dst, (OpNode (load addr:$src1)))], d>,
            Sched<[sched.Folded, ReadAfterLd]>,
            Requires<[target, OptForSize]>;

  let isCodeGenOnly = 1, Constraints = "$src1 = $dst", ExeDomain = d in {
  def r_Int : I<opc, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src1, VR128:$src2),
                !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"), []>,
                Sched<[sched]>;
  let mayLoad = 1 in
  def m_Int : I<opc, MRMSrcMem, (outs VR128:$dst), (ins VR128:$src1, intmemop:$src2),
                !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"), []>,
                Sched<[sched.Folded, ReadAfterLd]>;
  }
  }

}

multiclass sse_fp_unop_s_intr<RegisterClass RC, ValueType vt,
                              ComplexPattern int_cpat, Intrinsic Intr,
                              Predicate target, string Suffix> {
  let Predicates = [target] in {
  // These are unary operations, but they are modeled as having 2 source operands
  // because the high elements of the destination are unchanged in SSE.
  def : Pat<(Intr VR128:$src),
            (!cast<Instruction>(NAME#r_Int) VR128:$src, VR128:$src)>;
  }
  // We don't want to fold scalar loads into these instructions unless
  // optimizing for size. This is because the folded instruction will have a
  // partial register update, while the unfolded sequence will not, e.g.
  // movss mem, %xmm0
  // rcpss %xmm0, %xmm0
  // which has a clobber before the rcp, vs.
  // rcpss mem, %xmm0
  let Predicates = [target, OptForSize] in {
    def : Pat<(Intr int_cpat:$src2),
               (!cast<Instruction>(NAME#m_Int)
                      (vt (IMPLICIT_DEF)), addr:$src2)>;
  }
}

multiclass avx_fp_unop_s_intr<RegisterClass RC, ValueType vt, ComplexPattern int_cpat,
                              Intrinsic Intr, Predicate target> {
  let Predicates = [target] in {
   def : Pat<(Intr VR128:$src),
             (!cast<Instruction>(NAME#r_Int) VR128:$src,
                                 VR128:$src)>;
  }
  let Predicates = [target, OptForSize] in {
    def : Pat<(Intr int_cpat:$src2),
              (!cast<Instruction>(NAME#m_Int)
                    (vt (IMPLICIT_DEF)), addr:$src2)>;
  }
}

multiclass avx_fp_unop_s<bits<8> opc, string OpcodeStr, RegisterClass RC,
                          ValueType ScalarVT, X86MemOperand x86memop,
                          Operand intmemop, SDNode OpNode, Domain d,
                          X86FoldableSchedWrite sched, Predicate target> {
  let hasSideEffects = 0 in {
  def r : I<opc, MRMSrcReg, (outs RC:$dst), (ins RC:$src1, RC:$src2),
            !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
            [], d>, Sched<[sched]>;
  let mayLoad = 1 in
  def m : I<opc, MRMSrcMem, (outs RC:$dst), (ins RC:$src1, x86memop:$src2),
             !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
            [], d>, Sched<[sched.Folded, ReadAfterLd]>;
  let isCodeGenOnly = 1, ExeDomain = d in {
  def r_Int : I<opc, MRMSrcReg, (outs VR128:$dst),
                (ins VR128:$src1, VR128:$src2),
             !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
             []>, Sched<[sched]>;
  let mayLoad = 1 in
  def m_Int : I<opc, MRMSrcMem, (outs VR128:$dst),
                (ins VR128:$src1, intmemop:$src2),
             !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
             []>, Sched<[sched.Folded, ReadAfterLd]>;
  }
  }

  // We don't want to fold scalar loads into these instructions unless
  // optimizing for size. This is because the folded instruction will have a
  // partial register update, while the unfolded sequence will not, e.g.
  // vmovss mem, %xmm0
  // vrcpss %xmm0, %xmm0, %xmm0
  // which has a clobber before the rcp, vs.
  // vrcpss mem, %xmm0, %xmm0
  // TODO: In theory, we could fold the load, and avoid the stall caused by
  // the partial register store, either in BreakFalseDeps or with smarter RA.
  let Predicates = [target] in {
   def : Pat<(OpNode RC:$src),  (!cast<Instruction>(NAME#r)
                                (ScalarVT (IMPLICIT_DEF)), RC:$src)>;
  }
  let Predicates = [target, OptForSize] in {
    def : Pat<(ScalarVT (OpNode (load addr:$src))),
              (!cast<Instruction>(NAME#m) (ScalarVT (IMPLICIT_DEF)),
            addr:$src)>;
  }
}

/// sse1_fp_unop_p - SSE1 unops in packed form.
multiclass sse1_fp_unop_p<bits<8> opc, string OpcodeStr, SDNode OpNode,
                          X86SchedWriteWidths sched, list<Predicate> prds> {
let Predicates = prds in {
  def V#NAME#PSr : PSI<opc, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
                       !strconcat("v", OpcodeStr,
                                  "ps\t{$src, $dst|$dst, $src}"),
                       [(set VR128:$dst, (v4f32 (OpNode VR128:$src)))]>,
                       VEX, Sched<[sched.XMM]>, VEX_WIG;
  def V#NAME#PSm : PSI<opc, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
                       !strconcat("v", OpcodeStr,
                                  "ps\t{$src, $dst|$dst, $src}"),
                       [(set VR128:$dst, (OpNode (loadv4f32 addr:$src)))]>,
                       VEX, Sched<[sched.XMM.Folded]>, VEX_WIG;
  def V#NAME#PSYr : PSI<opc, MRMSrcReg, (outs VR256:$dst), (ins VR256:$src),
                        !strconcat("v", OpcodeStr,
                                   "ps\t{$src, $dst|$dst, $src}"),
                        [(set VR256:$dst, (v8f32 (OpNode VR256:$src)))]>,
                        VEX, VEX_L, Sched<[sched.YMM]>, VEX_WIG;
  def V#NAME#PSYm : PSI<opc, MRMSrcMem, (outs VR256:$dst), (ins f256mem:$src),
                        !strconcat("v", OpcodeStr,
                                   "ps\t{$src, $dst|$dst, $src}"),
                        [(set VR256:$dst, (OpNode (loadv8f32 addr:$src)))]>,
                        VEX, VEX_L, Sched<[sched.YMM.Folded]>, VEX_WIG;
}

  def PSr : PSI<opc, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
                !strconcat(OpcodeStr, "ps\t{$src, $dst|$dst, $src}"),
                [(set VR128:$dst, (v4f32 (OpNode VR128:$src)))]>,
                Sched<[sched.XMM]>;
  def PSm : PSI<opc, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
                !strconcat(OpcodeStr, "ps\t{$src, $dst|$dst, $src}"),
                [(set VR128:$dst, (OpNode (memopv4f32 addr:$src)))]>,
                Sched<[sched.XMM.Folded]>;
}

/// sse2_fp_unop_p - SSE2 unops in vector forms.
multiclass sse2_fp_unop_p<bits<8> opc, string OpcodeStr,
                          SDNode OpNode, X86SchedWriteWidths sched> {
let Predicates = [HasAVX, NoVLX] in {
  def V#NAME#PDr : PDI<opc, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
                       !strconcat("v", OpcodeStr,
                                  "pd\t{$src, $dst|$dst, $src}"),
                       [(set VR128:$dst, (v2f64 (OpNode VR128:$src)))]>,
                       VEX, Sched<[sched.XMM]>, VEX_WIG;
  def V#NAME#PDm : PDI<opc, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
                       !strconcat("v", OpcodeStr,
                                  "pd\t{$src, $dst|$dst, $src}"),
                       [(set VR128:$dst, (OpNode (loadv2f64 addr:$src)))]>,
                       VEX, Sched<[sched.XMM.Folded]>, VEX_WIG;
  def V#NAME#PDYr : PDI<opc, MRMSrcReg, (outs VR256:$dst), (ins VR256:$src),
                        !strconcat("v", OpcodeStr,
                                   "pd\t{$src, $dst|$dst, $src}"),
                        [(set VR256:$dst, (v4f64 (OpNode VR256:$src)))]>,
                        VEX, VEX_L, Sched<[sched.YMM]>, VEX_WIG;
  def V#NAME#PDYm : PDI<opc, MRMSrcMem, (outs VR256:$dst), (ins f256mem:$src),
                        !strconcat("v", OpcodeStr,
                                   "pd\t{$src, $dst|$dst, $src}"),
                        [(set VR256:$dst, (OpNode (loadv4f64 addr:$src)))]>,
                        VEX, VEX_L, Sched<[sched.YMM.Folded]>, VEX_WIG;
}

  def PDr : PDI<opc, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
                !strconcat(OpcodeStr, "pd\t{$src, $dst|$dst, $src}"),
                [(set VR128:$dst, (v2f64 (OpNode VR128:$src)))]>,
                Sched<[sched.XMM]>;
  def PDm : PDI<opc, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
                !strconcat(OpcodeStr, "pd\t{$src, $dst|$dst, $src}"),
                [(set VR128:$dst, (OpNode (memopv2f64 addr:$src)))]>,
                Sched<[sched.XMM.Folded]>;
}

multiclass sse1_fp_unop_s_intr<bits<8> opc, string OpcodeStr, SDNode OpNode,
                          X86SchedWriteWidths sched, Predicate AVXTarget> {
  defm SS        :  sse_fp_unop_s_intr<FR32, v4f32, sse_load_f32,
                      !cast<Intrinsic>("int_x86_sse_"##OpcodeStr##_ss),
                      UseSSE1, "SS">, XS;
  defm V#NAME#SS  : avx_fp_unop_s_intr<FR32, v4f32, sse_load_f32,
                      !cast<Intrinsic>("int_x86_sse_"##OpcodeStr##_ss),
                      AVXTarget>,
                      XS, VEX_4V, VEX_LIG, VEX_WIG, NotMemoryFoldable;
}

multiclass sse1_fp_unop_s<bits<8> opc, string OpcodeStr, SDNode OpNode,
                          X86SchedWriteWidths sched, Predicate AVXTarget> {
  defm SS        :  sse_fp_unop_s<opc, OpcodeStr##ss, FR32, f32, f32mem,
                      ssmem, OpNode, SSEPackedSingle, sched.Scl, UseSSE1>, XS;
  defm V#NAME#SS  : avx_fp_unop_s<opc, "v"#OpcodeStr##ss, FR32, f32,
                      f32mem, ssmem, OpNode, SSEPackedSingle, sched.Scl, AVXTarget>,
                       XS, VEX_4V, VEX_LIG, VEX_WIG;
}

multiclass sse2_fp_unop_s<bits<8> opc, string OpcodeStr, SDNode OpNode,
                          X86SchedWriteWidths sched, Predicate AVXTarget> {
  defm SD         : sse_fp_unop_s<opc, OpcodeStr##sd, FR64, f64, f64mem,
                         sdmem, OpNode, SSEPackedDouble, sched.Scl, UseSSE2>, XD;
  defm V#NAME#SD  : avx_fp_unop_s<opc, "v"#OpcodeStr##sd, FR64, f64,
                         f64mem, sdmem, OpNode, SSEPackedDouble, sched.Scl, AVXTarget>,
                         XD, VEX_4V, VEX_LIG, VEX_WIG;
}

// Square root.
defm SQRT  : sse1_fp_unop_s<0x51, "sqrt", fsqrt, SchedWriteFSqrt, UseAVX>,
             sse1_fp_unop_p<0x51, "sqrt", fsqrt, SchedWriteFSqrt, [HasAVX, NoVLX]>,
             sse2_fp_unop_s<0x51, "sqrt", fsqrt, SchedWriteFSqrt64, UseAVX>,
             sse2_fp_unop_p<0x51, "sqrt", fsqrt, SchedWriteFSqrt64>;

// Reciprocal approximations. Note that these typically require refinement
// in order to obtain suitable precision.
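// A single Newton-Raphson step is the usual refinement (a sketch; any such
// refinement is emitted separately by the backend when it is required):
//   rcp:   x1 = x0 * (2.0 - d * x0)
//   rsqrt: x1 = x0 * (1.5 - 0.5 * d * x0 * x0)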
defm RSQRT : sse1_fp_unop_s<0x52, "rsqrt", X86frsqrt, SchedWriteFRsqrt, HasAVX>,
             sse1_fp_unop_s_intr<0x52, "rsqrt", X86frsqrt, SchedWriteFRsqrt, HasAVX>,
             sse1_fp_unop_p<0x52, "rsqrt", X86frsqrt, SchedWriteFRsqrt, [HasAVX]>;
defm RCP   : sse1_fp_unop_s<0x53, "rcp", X86frcp, SchedWriteFRcp, HasAVX>,
             sse1_fp_unop_s_intr<0x53, "rcp", X86frcp, SchedWriteFRcp, HasAVX>,
             sse1_fp_unop_p<0x53, "rcp", X86frcp, SchedWriteFRcp, [HasAVX]>;

// There is no f64 version of the reciprocal approximation instructions.

multiclass scalar_unary_math_patterns<SDNode OpNode, string OpcPrefix, SDNode Move,
                                      ValueType VT, Predicate BasePredicate> {
  let Predicates = [BasePredicate] in {
    def : Pat<(VT (Move VT:$dst, (scalar_to_vector
                                  (OpNode (extractelt VT:$src, 0))))),
              (!cast<Instruction>(OpcPrefix#r_Int) VT:$dst, VT:$src)>;
  }

  // Repeat for AVX versions of the instructions.
  let Predicates = [UseAVX] in {
    def : Pat<(VT (Move VT:$dst, (scalar_to_vector
                                  (OpNode (extractelt VT:$src, 0))))),
              (!cast<Instruction>("V"#OpcPrefix#r_Int) VT:$dst, VT:$src)>;
  }
}

multiclass scalar_unary_math_imm_patterns<SDNode OpNode, string OpcPrefix, SDNode Move,
                                          ValueType VT, bits<8> ImmV,
                                          Predicate BasePredicate> {
  let Predicates = [BasePredicate] in {
    def : Pat<(VT (Move VT:$dst, (scalar_to_vector
                                  (OpNode (extractelt VT:$src, 0))))),
              (!cast<Instruction>(OpcPrefix#r_Int) VT:$dst, VT:$src, (i32 ImmV))>;
  }

  // Repeat for AVX versions of the instructions.
  let Predicates = [UseAVX] in {
    def : Pat<(VT (Move VT:$dst, (scalar_to_vector
                                  (OpNode (extractelt VT:$src, 0))))),
              (!cast<Instruction>("V"#OpcPrefix#r_Int) VT:$dst, VT:$src, (i32 ImmV))>;
  }
}

defm : scalar_unary_math_patterns<fsqrt, "SQRTSS", X86Movss, v4f32, UseSSE1>;
defm : scalar_unary_math_patterns<fsqrt, "SQRTSD", X86Movsd, v2f64, UseSSE2>;

multiclass scalar_unary_math_intr_patterns<Intrinsic Intr, string OpcPrefix,
                                           SDNode Move, ValueType VT,
                                           Predicate BasePredicate> {
  let Predicates = [BasePredicate] in {
    def : Pat<(VT (Move VT:$dst, (Intr VT:$src))),
              (!cast<Instruction>(OpcPrefix#r_Int) VT:$dst, VT:$src)>;
  }

  // Repeat for AVX versions of the instructions.
  let Predicates = [HasAVX] in {
    def : Pat<(VT (Move VT:$dst, (Intr VT:$src))),
              (!cast<Instruction>("V"#OpcPrefix#r_Int) VT:$dst, VT:$src)>;
  }
}

defm : scalar_unary_math_intr_patterns<int_x86_sse_rcp_ss, "RCPSS", X86Movss,
                                       v4f32, UseSSE1>;
defm : scalar_unary_math_intr_patterns<int_x86_sse_rsqrt_ss, "RSQRTSS", X86Movss,
                                       v4f32, UseSSE1>;


//===----------------------------------------------------------------------===//
// SSE 1 & 2 - Non-temporal stores
//===----------------------------------------------------------------------===//
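// Non-temporal stores hint that the stored data will not be reused soon, so
// the hardware can write it out while minimizing cache pollution; the
// AddedComplexity below makes isel prefer these forms for stores marked
// non-temporal.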

let AddedComplexity = 400 in { // Prefer non-temporal versions
let Predicates = [HasAVX, NoVLX] in {
let SchedRW = [SchedWriteFMoveLSNT.XMM.MR] in {
def VMOVNTPSmr : VPSI<0x2B, MRMDestMem, (outs),
                     (ins f128mem:$dst, VR128:$src),
                     "movntps\t{$src, $dst|$dst, $src}",
                     [(alignednontemporalstore (v4f32 VR128:$src),
                                               addr:$dst)]>, VEX, VEX_WIG;
def VMOVNTPDmr : VPDI<0x2B, MRMDestMem, (outs),
                     (ins f128mem:$dst, VR128:$src),
                     "movntpd\t{$src, $dst|$dst, $src}",
                     [(alignednontemporalstore (v2f64 VR128:$src),
                                               addr:$dst)]>, VEX, VEX_WIG;
} // SchedRW

let SchedRW = [SchedWriteFMoveLSNT.YMM.MR] in {
def VMOVNTPSYmr : VPSI<0x2B, MRMDestMem, (outs),
                     (ins f256mem:$dst, VR256:$src),
                     "movntps\t{$src, $dst|$dst, $src}",
                     [(alignednontemporalstore (v8f32 VR256:$src),
                                               addr:$dst)]>, VEX, VEX_L, VEX_WIG;
def VMOVNTPDYmr : VPDI<0x2B, MRMDestMem, (outs),
                     (ins f256mem:$dst, VR256:$src),
                     "movntpd\t{$src, $dst|$dst, $src}",
                     [(alignednontemporalstore (v4f64 VR256:$src),
                                               addr:$dst)]>, VEX, VEX_L, VEX_WIG;
} // SchedRW

let ExeDomain = SSEPackedInt in {
def VMOVNTDQmr    : VPDI<0xE7, MRMDestMem, (outs),
                         (ins i128mem:$dst, VR128:$src),
                         "movntdq\t{$src, $dst|$dst, $src}",
                         [(alignednontemporalstore (v2i64 VR128:$src),
                                                   addr:$dst)]>, VEX, VEX_WIG,
                         Sched<[SchedWriteVecMoveLSNT.XMM.MR]>;
def VMOVNTDQYmr : VPDI<0xE7, MRMDestMem, (outs),
                    (ins i256mem:$dst, VR256:$src),
                    "movntdq\t{$src, $dst|$dst, $src}",
                    [(alignednontemporalstore (v4i64 VR256:$src),
                                              addr:$dst)]>, VEX, VEX_L, VEX_WIG,
                    Sched<[SchedWriteVecMoveLSNT.YMM.MR]>;
} // ExeDomain
} // Predicates

let SchedRW = [SchedWriteFMoveLSNT.XMM.MR] in {
def MOVNTPSmr : PSI<0x2B, MRMDestMem, (outs), (ins f128mem:$dst, VR128:$src),
                    "movntps\t{$src, $dst|$dst, $src}",
                    [(alignednontemporalstore (v4f32 VR128:$src), addr:$dst)]>;
def MOVNTPDmr : PDI<0x2B, MRMDestMem, (outs), (ins f128mem:$dst, VR128:$src),
                    "movntpd\t{$src, $dst|$dst, $src}",
                    [(alignednontemporalstore (v2f64 VR128:$src), addr:$dst)]>;
} // SchedRW

let ExeDomain = SSEPackedInt, SchedRW = [SchedWriteVecMoveLSNT.XMM.MR] in
def MOVNTDQmr : PDI<0xE7, MRMDestMem, (outs), (ins f128mem:$dst, VR128:$src),
                    "movntdq\t{$src, $dst|$dst, $src}",
                    [(alignednontemporalstore (v2i64 VR128:$src), addr:$dst)]>;

let SchedRW = [WriteStoreNT] in {
// There is no AVX form for instructions below this point.
def MOVNTImr : I<0xC3, MRMDestMem, (outs), (ins i32mem:$dst, GR32:$src),
                 "movnti{l}\t{$src, $dst|$dst, $src}",
                 [(nontemporalstore (i32 GR32:$src), addr:$dst)]>,
               PS, Requires<[HasSSE2]>;
def MOVNTI_64mr : RI<0xC3, MRMDestMem, (outs), (ins i64mem:$dst, GR64:$src),
                     "movnti{q}\t{$src, $dst|$dst, $src}",
                     [(nontemporalstore (i64 GR64:$src), addr:$dst)]>,
                  PS, Requires<[HasSSE2]>;
} // SchedRW = [WriteStoreNT]

let Predicates = [HasAVX, NoVLX] in {
  def : Pat<(alignednontemporalstore (v8i32 VR256:$src), addr:$dst),
            (VMOVNTDQYmr addr:$dst, VR256:$src)>;
  def : Pat<(alignednontemporalstore (v16i16 VR256:$src), addr:$dst),
            (VMOVNTDQYmr addr:$dst, VR256:$src)>;
  def : Pat<(alignednontemporalstore (v32i8 VR256:$src), addr:$dst),
            (VMOVNTDQYmr addr:$dst, VR256:$src)>;

  def : Pat<(alignednontemporalstore (v4i32 VR128:$src), addr:$dst),
            (VMOVNTDQmr addr:$dst, VR128:$src)>;
  def : Pat<(alignednontemporalstore (v8i16 VR128:$src), addr:$dst),
            (VMOVNTDQmr addr:$dst, VR128:$src)>;
  def : Pat<(alignednontemporalstore (v16i8 VR128:$src), addr:$dst),
            (VMOVNTDQmr addr:$dst, VR128:$src)>;
}

let Predicates = [UseSSE2] in {
  def : Pat<(alignednontemporalstore (v4i32 VR128:$src), addr:$dst),
            (MOVNTDQmr addr:$dst, VR128:$src)>;
  def : Pat<(alignednontemporalstore (v8i16 VR128:$src), addr:$dst),
            (MOVNTDQmr addr:$dst, VR128:$src)>;
  def : Pat<(alignednontemporalstore (v16i8 VR128:$src), addr:$dst),
            (MOVNTDQmr addr:$dst, VR128:$src)>;
}

} // AddedComplexity

//===----------------------------------------------------------------------===//
// SSE 1 & 2 - Prefetch and memory fence
//===----------------------------------------------------------------------===//

// Prefetch intrinsic.
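// The pattern operands are (prefetch addr, rw, locality, cache-type):
// locality 3 down to 0 selects t0/t1/t2/nta, and cache-type 1 is the data
// cache.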
let Predicates = [HasSSEPrefetch], SchedRW = [WriteLoad] in {
def PREFETCHT0   : I<0x18, MRM1m, (outs), (ins i8mem:$src),
    "prefetcht0\t$src", [(prefetch addr:$src, imm, (i32 3), (i32 1))]>, TB;
def PREFETCHT1   : I<0x18, MRM2m, (outs), (ins i8mem:$src),
    "prefetcht1\t$src", [(prefetch addr:$src, imm, (i32 2), (i32 1))]>, TB;
def PREFETCHT2   : I<0x18, MRM3m, (outs), (ins i8mem:$src),
    "prefetcht2\t$src", [(prefetch addr:$src, imm, (i32 1), (i32 1))]>, TB;
def PREFETCHNTA  : I<0x18, MRM0m, (outs), (ins i8mem:$src),
    "prefetchnta\t$src", [(prefetch addr:$src, imm, (i32 0), (i32 1))]>, TB;
}

// FIXME: How should the flush instruction be modeled?
let SchedRW = [WriteLoad] in {
// Flush cache
def CLFLUSH : I<0xAE, MRM7m, (outs), (ins i8mem:$src),
               "clflush\t$src", [(int_x86_sse2_clflush addr:$src)]>,
               PS, Requires<[HasSSE2]>;
}

let SchedRW = [WriteNop] in {
// Pause. This "instruction" is encoded as "rep; nop", so even though it
// was introduced with SSE2, it's backward compatible.
def PAUSE : I<0x90, RawFrm, (outs), (ins),
              "pause", [(int_x86_sse2_pause)]>, OBXS;
}

let SchedRW = [WriteFence] in {
// Load, store, and memory fence
// TODO: As with mfence, we may want to ease the availability of sfence/lfence
// to include any 64-bit target.
3125def SFENCE : I<0xAE, MRM_F8, (outs), (ins), "sfence", [(int_x86_sse_sfence)]>,
3126               PS, Requires<[HasSSE1]>;
3127def LFENCE : I<0xAE, MRM_E8, (outs), (ins), "lfence", [(int_x86_sse2_lfence)]>,
3128               PS, Requires<[HasSSE2]>;
3129def MFENCE : I<0xAE, MRM_F0, (outs), (ins), "mfence", [(int_x86_sse2_mfence)]>,
3130               PS, Requires<[HasMFence]>;
3131} // SchedRW
3132
3133def : Pat<(X86MFence), (MFENCE)>;
3134
3135//===----------------------------------------------------------------------===//
3136// SSE 1 & 2 - Load/Store XCSR register
3137//===----------------------------------------------------------------------===//
3138
3139def VLDMXCSR : VPSI<0xAE, MRM2m, (outs), (ins i32mem:$src),
3140               "ldmxcsr\t$src", [(int_x86_sse_ldmxcsr addr:$src)]>,
3141               VEX, Sched<[WriteLDMXCSR]>, VEX_WIG;
3142def VSTMXCSR : VPSI<0xAE, MRM3m, (outs), (ins i32mem:$dst),
3143               "stmxcsr\t$dst", [(int_x86_sse_stmxcsr addr:$dst)]>,
3144               VEX, Sched<[WriteSTMXCSR]>, VEX_WIG;
3145
3146def LDMXCSR : I<0xAE, MRM2m, (outs), (ins i32mem:$src),
3147              "ldmxcsr\t$src", [(int_x86_sse_ldmxcsr addr:$src)]>,
3148              TB, Sched<[WriteLDMXCSR]>;
3149def STMXCSR : I<0xAE, MRM3m, (outs), (ins i32mem:$dst),
3150              "stmxcsr\t$dst", [(int_x86_sse_stmxcsr addr:$dst)]>,
3151              TB, Sched<[WriteSTMXCSR]>;
3152
3153//===---------------------------------------------------------------------===//
3154// SSE2 - Move Aligned/Unaligned Packed Integer Instructions
3155//===---------------------------------------------------------------------===//
3156
3157let ExeDomain = SSEPackedInt in { // SSE integer instructions

let hasSideEffects = 0 in {
def VMOVDQArr  : VPDI<0x6F, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
                      "movdqa\t{$src, $dst|$dst, $src}", []>,
                      Sched<[SchedWriteVecMoveLS.XMM.RR]>, VEX, VEX_WIG;
def VMOVDQUrr  : VSSI<0x6F, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
                      "movdqu\t{$src, $dst|$dst, $src}", []>,
                      Sched<[SchedWriteVecMoveLS.XMM.RR]>, VEX, VEX_WIG;
def VMOVDQAYrr : VPDI<0x6F, MRMSrcReg, (outs VR256:$dst), (ins VR256:$src),
                      "movdqa\t{$src, $dst|$dst, $src}", []>,
                      Sched<[SchedWriteVecMoveLS.YMM.RR]>, VEX, VEX_L, VEX_WIG;
def VMOVDQUYrr : VSSI<0x6F, MRMSrcReg, (outs VR256:$dst), (ins VR256:$src),
                      "movdqu\t{$src, $dst|$dst, $src}", []>,
                      Sched<[SchedWriteVecMoveLS.YMM.RR]>, VEX, VEX_L, VEX_WIG;
}

// For Disassembler
let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0 in {
def VMOVDQArr_REV  : VPDI<0x7F, MRMDestReg, (outs VR128:$dst), (ins VR128:$src),
                          "movdqa\t{$src, $dst|$dst, $src}", []>,
                          Sched<[SchedWriteVecMoveLS.XMM.RR]>,
                          VEX, VEX_WIG, FoldGenData<"VMOVDQArr">;
def VMOVDQAYrr_REV : VPDI<0x7F, MRMDestReg, (outs VR256:$dst), (ins VR256:$src),
                          "movdqa\t{$src, $dst|$dst, $src}", []>,
                          Sched<[SchedWriteVecMoveLS.YMM.RR]>,
                          VEX, VEX_L, VEX_WIG, FoldGenData<"VMOVDQAYrr">;
def VMOVDQUrr_REV  : VSSI<0x7F, MRMDestReg, (outs VR128:$dst), (ins VR128:$src),
                          "movdqu\t{$src, $dst|$dst, $src}", []>,
                          Sched<[SchedWriteVecMoveLS.XMM.RR]>,
                          VEX, VEX_WIG, FoldGenData<"VMOVDQUrr">;
def VMOVDQUYrr_REV : VSSI<0x7F, MRMDestReg, (outs VR256:$dst), (ins VR256:$src),
                          "movdqu\t{$src, $dst|$dst, $src}", []>,
                          Sched<[SchedWriteVecMoveLS.YMM.RR]>,
                          VEX, VEX_L, VEX_WIG, FoldGenData<"VMOVDQUYrr">;
}

let canFoldAsLoad = 1, mayLoad = 1, isReMaterializable = 1,
    hasSideEffects = 0, Predicates = [HasAVX,NoVLX] in {
def VMOVDQArm  : VPDI<0x6F, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src),
                      "movdqa\t{$src, $dst|$dst, $src}",
                      [(set VR128:$dst, (alignedloadv2i64 addr:$src))]>,
                      Sched<[SchedWriteVecMoveLS.XMM.RM]>, VEX, VEX_WIG;
def VMOVDQAYrm : VPDI<0x6F, MRMSrcMem, (outs VR256:$dst), (ins i256mem:$src),
                      "movdqa\t{$src, $dst|$dst, $src}", []>,
                      Sched<[SchedWriteVecMoveLS.YMM.RM]>,
                      VEX, VEX_L, VEX_WIG;
def VMOVDQUrm  : I<0x6F, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src),
                   "vmovdqu\t{$src, $dst|$dst, $src}",
                   [(set VR128:$dst, (loadv2i64 addr:$src))]>,
                   Sched<[SchedWriteVecMoveLS.XMM.RM]>,
                   XS, VEX, VEX_WIG;
def VMOVDQUYrm : I<0x6F, MRMSrcMem, (outs VR256:$dst), (ins i256mem:$src),
                   "vmovdqu\t{$src, $dst|$dst, $src}", []>,
                   Sched<[SchedWriteVecMoveLS.YMM.RM]>,
                   XS, VEX, VEX_L, VEX_WIG;
}

let mayStore = 1, hasSideEffects = 0, Predicates = [HasAVX,NoVLX] in {
def VMOVDQAmr  : VPDI<0x7F, MRMDestMem, (outs),
                      (ins i128mem:$dst, VR128:$src),
                      "movdqa\t{$src, $dst|$dst, $src}",
                      [(alignedstore (v2i64 VR128:$src), addr:$dst)]>,
                      Sched<[SchedWriteVecMoveLS.XMM.MR]>, VEX, VEX_WIG;
def VMOVDQAYmr : VPDI<0x7F, MRMDestMem, (outs),
                      (ins i256mem:$dst, VR256:$src),
                      "movdqa\t{$src, $dst|$dst, $src}", []>,
                      Sched<[SchedWriteVecMoveLS.YMM.MR]>, VEX, VEX_L, VEX_WIG;
def VMOVDQUmr  : I<0x7F, MRMDestMem, (outs), (ins i128mem:$dst, VR128:$src),
                   "vmovdqu\t{$src, $dst|$dst, $src}",
                   [(store (v2i64 VR128:$src), addr:$dst)]>,
                   Sched<[SchedWriteVecMoveLS.XMM.MR]>, XS, VEX, VEX_WIG;
def VMOVDQUYmr : I<0x7F, MRMDestMem, (outs), (ins i256mem:$dst, VR256:$src),
                   "vmovdqu\t{$src, $dst|$dst, $src}", []>,
                   Sched<[SchedWriteVecMoveLS.YMM.MR]>, XS, VEX, VEX_L, VEX_WIG;
}

let SchedRW = [SchedWriteVecMoveLS.XMM.RR] in {
let hasSideEffects = 0 in {
def MOVDQArr : PDI<0x6F, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
                   "movdqa\t{$src, $dst|$dst, $src}", []>;

def MOVDQUrr :   I<0x6F, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
                   "movdqu\t{$src, $dst|$dst, $src}", []>,
                   XS, Requires<[UseSSE2]>;
}

// For Disassembler
let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0 in {
def MOVDQArr_REV : PDI<0x7F, MRMDestReg, (outs VR128:$dst), (ins VR128:$src),
                       "movdqa\t{$src, $dst|$dst, $src}", []>,
                       FoldGenData<"MOVDQArr">;

def MOVDQUrr_REV :   I<0x7F, MRMDestReg, (outs VR128:$dst), (ins VR128:$src),
                       "movdqu\t{$src, $dst|$dst, $src}", []>,
                       XS, Requires<[UseSSE2]>, FoldGenData<"MOVDQUrr">;
}
} // SchedRW

let canFoldAsLoad = 1, mayLoad = 1, isReMaterializable = 1,
    hasSideEffects = 0, SchedRW = [SchedWriteVecMoveLS.XMM.RM] in {
def MOVDQArm : PDI<0x6F, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src),
                   "movdqa\t{$src, $dst|$dst, $src}",
                   [/*(set VR128:$dst, (alignedloadv2i64 addr:$src))*/]>;
def MOVDQUrm :   I<0x6F, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src),
                   "movdqu\t{$src, $dst|$dst, $src}",
                   [/*(set VR128:$dst, (loadv2i64 addr:$src))*/]>,
                 XS, Requires<[UseSSE2]>;
}

let mayStore = 1, hasSideEffects = 0,
    SchedRW = [SchedWriteVecMoveLS.XMM.MR] in {
def MOVDQAmr : PDI<0x7F, MRMDestMem, (outs), (ins i128mem:$dst, VR128:$src),
                   "movdqa\t{$src, $dst|$dst, $src}",
                   [/*(alignedstore (v2i64 VR128:$src), addr:$dst)*/]>;
def MOVDQUmr :   I<0x7F, MRMDestMem, (outs), (ins i128mem:$dst, VR128:$src),
                   "movdqu\t{$src, $dst|$dst, $src}",
                   [/*(store (v2i64 VR128:$src), addr:$dst)*/]>,
                 XS, Requires<[UseSSE2]>;
}

} // ExeDomain = SSEPackedInt

// Aliases to help the assembler pick two-byte VEX encodings by swapping the
// operands relative to the normal instructions to use VEX.R instead of VEX.B.
def : InstAlias<"vmovdqa\t{$src, $dst|$dst, $src}",
                (VMOVDQArr_REV VR128L:$dst, VR128H:$src), 0>;
def : InstAlias<"vmovdqa\t{$src, $dst|$dst, $src}",
                (VMOVDQAYrr_REV VR256L:$dst, VR256H:$src), 0>;
def : InstAlias<"vmovdqu\t{$src, $dst|$dst, $src}",
                (VMOVDQUrr_REV VR128L:$dst, VR128H:$src), 0>;
def : InstAlias<"vmovdqu\t{$src, $dst|$dst, $src}",
                (VMOVDQUYrr_REV VR256L:$dst, VR256H:$src), 0>;

// Reversed version with ".s" suffix for GAS compatibility.
def : InstAlias<"vmovdqa.s\t{$src, $dst|$dst, $src}",
                (VMOVDQArr_REV VR128:$dst, VR128:$src), 0>;
def : InstAlias<"vmovdqa.s\t{$src, $dst|$dst, $src}",
                (VMOVDQAYrr_REV VR256:$dst, VR256:$src), 0>;
def : InstAlias<"vmovdqu.s\t{$src, $dst|$dst, $src}",
                (VMOVDQUrr_REV VR128:$dst, VR128:$src), 0>;
def : InstAlias<"vmovdqu.s\t{$src, $dst|$dst, $src}",
                (VMOVDQUYrr_REV VR256:$dst, VR256:$src), 0>;

// Reversed version with ".s" suffix for GAS compatibility.
def : InstAlias<"movdqa.s\t{$src, $dst|$dst, $src}",
                (MOVDQArr_REV VR128:$dst, VR128:$src), 0>;
def : InstAlias<"movdqu.s\t{$src, $dst|$dst, $src}",
                (MOVDQUrr_REV VR128:$dst, VR128:$src), 0>;

let Predicates = [HasAVX, NoVLX] in {
  // Additional patterns for other integer sizes.
  def : Pat<(alignedstore (v4i32 VR128:$src), addr:$dst),
            (VMOVDQAmr addr:$dst, VR128:$src)>;
  def : Pat<(alignedstore (v8i16 VR128:$src), addr:$dst),
            (VMOVDQAmr addr:$dst, VR128:$src)>;
  def : Pat<(alignedstore (v16i8 VR128:$src), addr:$dst),
            (VMOVDQAmr addr:$dst, VR128:$src)>;
  def : Pat<(store (v4i32 VR128:$src), addr:$dst),
            (VMOVDQUmr addr:$dst, VR128:$src)>;
  def : Pat<(store (v8i16 VR128:$src), addr:$dst),
            (VMOVDQUmr addr:$dst, VR128:$src)>;
  def : Pat<(store (v16i8 VR128:$src), addr:$dst),
            (VMOVDQUmr addr:$dst, VR128:$src)>;
}

//===---------------------------------------------------------------------===//
// SSE2 - Packed Integer Arithmetic Instructions
//===---------------------------------------------------------------------===//

let ExeDomain = SSEPackedInt in { // SSE integer instructions

/// PDI_binop_rm2 - Simple SSE2 binary operator with different src and dst types
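/// (e.g. the psadbw instantiations below produce a v2i64 result from v16i8
/// sources).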
multiclass PDI_binop_rm2<bits<8> opc, string OpcodeStr, SDNode OpNode,
                         ValueType DstVT, ValueType SrcVT, RegisterClass RC,
                         PatFrag memop_frag, X86MemOperand x86memop,
                         X86FoldableSchedWrite sched, bit Is2Addr = 1> {
  let isCommutable = 1 in
  def rr : PDI<opc, MRMSrcReg, (outs RC:$dst),
       (ins RC:$src1, RC:$src2),
       !if(Is2Addr,
           !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
           !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
       [(set RC:$dst, (DstVT (OpNode (SrcVT RC:$src1), RC:$src2)))]>,
       Sched<[sched]>;
  def rm : PDI<opc, MRMSrcMem, (outs RC:$dst),
       (ins RC:$src1, x86memop:$src2),
       !if(Is2Addr,
           !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
           !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
       [(set RC:$dst, (DstVT (OpNode (SrcVT RC:$src1),
                                     (bitconvert (memop_frag addr:$src2)))))]>,
       Sched<[sched.Folded, ReadAfterLd]>;
}
} // ExeDomain = SSEPackedInt

defm PADDB   : PDI_binop_all<0xFC, "paddb", add, v16i8, v32i8,
                             SchedWriteVecALU, 1, NoVLX_Or_NoBWI>;
defm PADDW   : PDI_binop_all<0xFD, "paddw", add, v8i16, v16i16,
                             SchedWriteVecALU, 1, NoVLX_Or_NoBWI>;
defm PADDD   : PDI_binop_all<0xFE, "paddd", add, v4i32, v8i32,
                             SchedWriteVecALU, 1, NoVLX>;
defm PADDQ   : PDI_binop_all<0xD4, "paddq", add, v2i64, v4i64,
                             SchedWriteVecALU, 1, NoVLX>;
defm PADDSB  : PDI_binop_all<0xEC, "paddsb", X86adds, v16i8, v32i8,
                             SchedWriteVecALU, 1, NoVLX_Or_NoBWI>;
defm PADDSW  : PDI_binop_all<0xED, "paddsw", X86adds, v8i16, v16i16,
                             SchedWriteVecALU, 1, NoVLX_Or_NoBWI>;
defm PADDUSB : PDI_binop_all<0xDC, "paddusb", X86addus, v16i8, v32i8,
                             SchedWriteVecALU, 1, NoVLX_Or_NoBWI>;
defm PADDUSW : PDI_binop_all<0xDD, "paddusw", X86addus, v8i16, v16i16,
                             SchedWriteVecALU, 1, NoVLX_Or_NoBWI>;
defm PMULLW  : PDI_binop_all<0xD5, "pmullw", mul, v8i16, v16i16,
                             SchedWriteVecIMul, 1, NoVLX_Or_NoBWI>;
defm PMULHUW : PDI_binop_all<0xE4, "pmulhuw", mulhu, v8i16, v16i16,
                             SchedWriteVecIMul, 1, NoVLX_Or_NoBWI>;
defm PMULHW  : PDI_binop_all<0xE5, "pmulhw", mulhs, v8i16, v16i16,
                             SchedWriteVecIMul, 1, NoVLX_Or_NoBWI>;
defm PSUBB   : PDI_binop_all<0xF8, "psubb", sub, v16i8, v32i8,
                             SchedWriteVecALU, 0, NoVLX_Or_NoBWI>;
defm PSUBW   : PDI_binop_all<0xF9, "psubw", sub, v8i16, v16i16,
                             SchedWriteVecALU, 0, NoVLX_Or_NoBWI>;
defm PSUBD   : PDI_binop_all<0xFA, "psubd", sub, v4i32, v8i32,
                             SchedWriteVecALU, 0, NoVLX>;
defm PSUBQ   : PDI_binop_all<0xFB, "psubq", sub, v2i64, v4i64,
                             SchedWriteVecALU, 0, NoVLX>;
defm PSUBSB  : PDI_binop_all<0xE8, "psubsb", X86subs, v16i8, v32i8,
                             SchedWriteVecALU, 0, NoVLX_Or_NoBWI>;
defm PSUBSW  : PDI_binop_all<0xE9, "psubsw", X86subs, v8i16, v16i16,
                             SchedWriteVecALU, 0, NoVLX_Or_NoBWI>;
defm PSUBUSB : PDI_binop_all<0xD8, "psubusb", X86subus, v16i8, v32i8,
                             SchedWriteVecALU, 0, NoVLX_Or_NoBWI>;
defm PSUBUSW : PDI_binop_all<0xD9, "psubusw", X86subus, v8i16, v16i16,
                             SchedWriteVecALU, 0, NoVLX_Or_NoBWI>;
defm PMINUB  : PDI_binop_all<0xDA, "pminub", umin, v16i8, v32i8,
                             SchedWriteVecALU, 1, NoVLX_Or_NoBWI>;
defm PMINSW  : PDI_binop_all<0xEA, "pminsw", smin, v8i16, v16i16,
                             SchedWriteVecALU, 1, NoVLX_Or_NoBWI>;
defm PMAXUB  : PDI_binop_all<0xDE, "pmaxub", umax, v16i8, v32i8,
                             SchedWriteVecALU, 1, NoVLX_Or_NoBWI>;
defm PMAXSW  : PDI_binop_all<0xEE, "pmaxsw", smax, v8i16, v16i16,
                             SchedWriteVecALU, 1, NoVLX_Or_NoBWI>;
defm PAVGB   : PDI_binop_all<0xE0, "pavgb", X86avg, v16i8, v32i8,
                             SchedWriteVecALU, 1, NoVLX_Or_NoBWI>;
defm PAVGW   : PDI_binop_all<0xE3, "pavgw", X86avg, v8i16, v16i16,
                             SchedWriteVecALU, 1, NoVLX_Or_NoBWI>;
defm PMULUDQ : PDI_binop_all<0xF4, "pmuludq", X86pmuludq, v2i64, v4i64,
                             SchedWriteVecIMul, 1, NoVLX>;

let Predicates = [HasAVX, NoVLX_Or_NoBWI] in
defm VPMADDWD : PDI_binop_rm2<0xF5, "vpmaddwd", X86vpmaddwd, v4i32, v8i16, VR128,
                              loadv2i64, i128mem, SchedWriteVecIMul.XMM, 0>,
                              VEX_4V, VEX_WIG;

let Predicates = [HasAVX2, NoVLX_Or_NoBWI] in
defm VPMADDWDY : PDI_binop_rm2<0xF5, "vpmaddwd", X86vpmaddwd, v8i32, v16i16,
                               VR256, loadv4i64, i256mem, SchedWriteVecIMul.YMM,
                               0>, VEX_4V, VEX_L, VEX_WIG;
let Constraints = "$src1 = $dst" in
defm PMADDWD : PDI_binop_rm2<0xF5, "pmaddwd", X86vpmaddwd, v4i32, v8i16, VR128,
                             memopv2i64, i128mem, SchedWriteVecIMul.XMM>;

let Predicates = [HasAVX, NoVLX_Or_NoBWI] in
defm VPSADBW : PDI_binop_rm2<0xF6, "vpsadbw", X86psadbw, v2i64, v16i8, VR128,
                             loadv2i64, i128mem, SchedWritePSADBW.XMM, 0>,
                             VEX_4V, VEX_WIG;
let Predicates = [HasAVX2, NoVLX_Or_NoBWI] in
defm VPSADBWY : PDI_binop_rm2<0xF6, "vpsadbw", X86psadbw, v4i64, v32i8, VR256,
                             loadv4i64, i256mem, SchedWritePSADBW.YMM, 0>,
                             VEX_4V, VEX_L, VEX_WIG;
let Constraints = "$src1 = $dst" in
defm PSADBW : PDI_binop_rm2<0xF6, "psadbw", X86psadbw, v2i64, v16i8, VR128,
                            memopv2i64, i128mem, SchedWritePSADBW.XMM>;

//===---------------------------------------------------------------------===//
// SSE2 - Packed Integer Logical Instructions
//===---------------------------------------------------------------------===//
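// The shift multiclasses below provide three forms: rr takes the shift count
// in the low quadword of an XMM register, rm loads it from a 128-bit memory
// operand, and ri encodes the count as an 8-bit immediate.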

multiclass PDI_binop_rmi<bits<8> opc, bits<8> opc2, Format ImmForm,
                         string OpcodeStr, SDNode OpNode,
                         SDNode OpNode2, RegisterClass RC,
                         X86FoldableSchedWrite sched,
                         X86FoldableSchedWrite schedImm,
                         ValueType DstVT, ValueType SrcVT,
                         PatFrag ld_frag, bit Is2Addr = 1> {
  // The shift amount (src2) is always a full 128-bit vector.
  def rr : PDI<opc, MRMSrcReg, (outs RC:$dst),
       (ins RC:$src1, VR128:$src2),
       !if(Is2Addr,
           !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
           !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
       [(set RC:$dst, (DstVT (OpNode RC:$src1, (SrcVT VR128:$src2))))]>,
       Sched<[sched]>;
  def rm : PDI<opc, MRMSrcMem, (outs RC:$dst),
       (ins RC:$src1, i128mem:$src2),
       !if(Is2Addr,
           !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
           !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
       [(set RC:$dst, (DstVT (OpNode RC:$src1,
                       (SrcVT (bitconvert (ld_frag addr:$src2))))))]>,
       Sched<[sched.Folded, ReadAfterLd]>;
  def ri : PDIi8<opc2, ImmForm, (outs RC:$dst),
       (ins RC:$src1, u8imm:$src2),
       !if(Is2Addr,
           !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
           !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
       [(set RC:$dst, (DstVT (OpNode2 RC:$src1, (i8 imm:$src2))))]>,
       Sched<[schedImm]>;
}

multiclass PDI_binop_rmi_all<bits<8> opc, bits<8> opc2, Format ImmForm,
                             string OpcodeStr, SDNode OpNode,
                             SDNode OpNode2, ValueType DstVT128,
                             ValueType DstVT256, ValueType SrcVT,
                             X86SchedWriteWidths sched,
                             X86SchedWriteWidths schedImm, Predicate prd> {
let Predicates = [HasAVX, prd] in
  defm V#NAME : PDI_binop_rmi<opc, opc2, ImmForm, !strconcat("v", OpcodeStr),
                              OpNode, OpNode2, VR128, sched.XMM, schedImm.XMM,
                              DstVT128, SrcVT, loadv2i64, 0>, VEX_4V, VEX_WIG;
let Predicates = [HasAVX2, prd] in
  defm V#NAME#Y : PDI_binop_rmi<opc, opc2, ImmForm, !strconcat("v", OpcodeStr),
                                OpNode, OpNode2, VR256, sched.YMM, schedImm.YMM,
                                DstVT256, SrcVT, loadv2i64, 0>, VEX_4V, VEX_L,
                                VEX_WIG;
let Constraints = "$src1 = $dst" in
  defm NAME : PDI_binop_rmi<opc, opc2, ImmForm, OpcodeStr, OpNode, OpNode2,
                            VR128, sched.XMM, schedImm.XMM, DstVT128, SrcVT,
                            memopv2i64>;
}
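// As a sketch of how these expand: "defm PSLLW : PDI_binop_rmi_all<...>" below
// yields the tied-operand SSE forms PSLLWrr/PSLLWrm/PSLLWri plus the
// three-operand AVX forms VPSLLWrr/VPSLLWrm/VPSLLWri and the 256-bit
// VPSLLWYrr/VPSLLWYrm/VPSLLWYri variants.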

multiclass PDI_binop_ri<bits<8> opc, Format ImmForm, string OpcodeStr,
                        SDNode OpNode, RegisterClass RC, ValueType VT,
                        X86FoldableSchedWrite sched, bit Is2Addr = 1> {
  def ri : PDIi8<opc, ImmForm, (outs RC:$dst), (ins RC:$src1, u8imm:$src2),
       !if(Is2Addr,
           !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
           !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
       [(set RC:$dst, (VT (OpNode RC:$src1, (i8 imm:$src2))))]>,
       Sched<[sched]>;
}

multiclass PDI_binop_ri_all<bits<8> opc, Format ImmForm, string OpcodeStr,
                            SDNode OpNode, X86SchedWriteWidths sched> {
let Predicates = [HasAVX, NoVLX_Or_NoBWI] in
  defm V#NAME : PDI_binop_ri<opc, ImmForm, !strconcat("v", OpcodeStr), OpNode,
                             VR128, v16i8, sched.XMM, 0>, VEX_4V, VEX_WIG;
let Predicates = [HasAVX2, NoVLX_Or_NoBWI] in
  defm V#NAME#Y : PDI_binop_ri<opc, ImmForm, !strconcat("v", OpcodeStr), OpNode,
                               VR256, v32i8, sched.YMM, 0>,
                               VEX_4V, VEX_L, VEX_WIG;
let Constraints = "$src1 = $dst" in
  defm NAME : PDI_binop_ri<opc, ImmForm, OpcodeStr, OpNode, VR128, v16i8,
                           sched.XMM>;
}
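// These immediate-only multiclasses serve the whole-vector byte shifts
// pslldq/psrldq below, which have no register or memory count forms.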

let ExeDomain = SSEPackedInt in {
  defm PSLLW : PDI_binop_rmi_all<0xF1, 0x71, MRM6r, "psllw", X86vshl, X86vshli,
                                 v8i16, v16i16, v8i16, SchedWriteVecShift,
                                 SchedWriteVecShiftImm, NoVLX_Or_NoBWI>;
  defm PSLLD : PDI_binop_rmi_all<0xF2, 0x72, MRM6r, "pslld", X86vshl, X86vshli,
                                 v4i32, v8i32, v4i32, SchedWriteVecShift,
                                 SchedWriteVecShiftImm, NoVLX>;
  defm PSLLQ : PDI_binop_rmi_all<0xF3, 0x73, MRM6r, "psllq", X86vshl, X86vshli,
                                 v2i64, v4i64, v2i64, SchedWriteVecShift,
                                 SchedWriteVecShiftImm, NoVLX>;

  defm PSRLW : PDI_binop_rmi_all<0xD1, 0x71, MRM2r, "psrlw", X86vsrl, X86vsrli,
                                 v8i16, v16i16, v8i16, SchedWriteVecShift,
                                 SchedWriteVecShiftImm, NoVLX_Or_NoBWI>;
  defm PSRLD : PDI_binop_rmi_all<0xD2, 0x72, MRM2r, "psrld", X86vsrl, X86vsrli,
                                 v4i32, v8i32, v4i32, SchedWriteVecShift,
                                 SchedWriteVecShiftImm, NoVLX>;
  defm PSRLQ : PDI_binop_rmi_all<0xD3, 0x73, MRM2r, "psrlq", X86vsrl, X86vsrli,
                                 v2i64, v4i64, v2i64, SchedWriteVecShift,
                                 SchedWriteVecShiftImm, NoVLX>;

  defm PSRAW : PDI_binop_rmi_all<0xE1, 0x71, MRM4r, "psraw", X86vsra, X86vsrai,
                                 v8i16, v16i16, v8i16, SchedWriteVecShift,
                                 SchedWriteVecShiftImm, NoVLX_Or_NoBWI>;
  defm PSRAD : PDI_binop_rmi_all<0xE2, 0x72, MRM4r, "psrad", X86vsra, X86vsrai,
                                 v4i32, v8i32, v4i32, SchedWriteVecShift,
                                 SchedWriteVecShiftImm, NoVLX>;

  defm PSLLDQ : PDI_binop_ri_all<0x73, MRM7r, "pslldq", X86vshldq,
                                 SchedWriteShuffle>;
  defm PSRLDQ : PDI_binop_ri_all<0x73, MRM3r, "psrldq", X86vshrdq,
                                 SchedWriteShuffle>;
} // ExeDomain = SSEPackedInt

//===---------------------------------------------------------------------===//
// SSE2 - Packed Integer Comparison Instructions
//===---------------------------------------------------------------------===//

defm PCMPEQB : PDI_binop_all<0x74, "pcmpeqb", X86pcmpeq, v16i8, v32i8,
                             SchedWriteVecALU, 1, TruePredicate>;
defm PCMPEQW : PDI_binop_all<0x75, "pcmpeqw", X86pcmpeq, v8i16, v16i16,
                             SchedWriteVecALU, 1, TruePredicate>;
defm PCMPEQD : PDI_binop_all<0x76, "pcmpeqd", X86pcmpeq, v4i32, v8i32,
                             SchedWriteVecALU, 1, TruePredicate>;
defm PCMPGTB : PDI_binop_all<0x64, "pcmpgtb", X86pcmpgt, v16i8, v32i8,
                             SchedWriteVecALU, 0, TruePredicate>;
defm PCMPGTW : PDI_binop_all<0x65, "pcmpgtw", X86pcmpgt, v8i16, v16i16,
                             SchedWriteVecALU, 0, TruePredicate>;
defm PCMPGTD : PDI_binop_all<0x66, "pcmpgtd", X86pcmpgt, v4i32, v8i32,
                             SchedWriteVecALU, 0, TruePredicate>;

//===---------------------------------------------------------------------===//
// SSE2 - Packed Integer Shuffle Instructions
//===---------------------------------------------------------------------===//

let ExeDomain = SSEPackedInt in {
multiclass sse2_pshuffle<string OpcodeStr, ValueType vt128, ValueType vt256,
                         SDNode OpNode, X86SchedWriteWidths sched,
                         Predicate prd> {
let Predicates = [HasAVX, prd] in {
  def V#NAME#ri : Ii8<0x70, MRMSrcReg, (outs VR128:$dst),
                      (ins VR128:$src1, u8imm:$src2),
                      !strconcat("v", OpcodeStr,
                                 "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
                      [(set VR128:$dst,
                        (vt128 (OpNode VR128:$src1, (i8 imm:$src2))))]>,
                      VEX, Sched<[sched.XMM]>, VEX_WIG;
  def V#NAME#mi : Ii8<0x70, MRMSrcMem, (outs VR128:$dst),
                      (ins i128mem:$src1, u8imm:$src2),
                      !strconcat("v", OpcodeStr,
                                 "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
                     [(set VR128:$dst,
                       (vt128 (OpNode (bitconvert (loadv2i64 addr:$src1)),
                        (i8 imm:$src2))))]>, VEX,
                  Sched<[sched.XMM.Folded]>, VEX_WIG;
}

let Predicates = [HasAVX2, prd] in {
  def V#NAME#Yri : Ii8<0x70, MRMSrcReg, (outs VR256:$dst),
                       (ins VR256:$src1, u8imm:$src2),
                       !strconcat("v", OpcodeStr,
                                  "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
                       [(set VR256:$dst,
                         (vt256 (OpNode VR256:$src1, (i8 imm:$src2))))]>,
                       VEX, VEX_L, Sched<[sched.YMM]>, VEX_WIG;
  def V#NAME#Ymi : Ii8<0x70, MRMSrcMem, (outs VR256:$dst),
                       (ins i256mem:$src1, u8imm:$src2),
                       !strconcat("v", OpcodeStr,
                                  "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
                      [(set VR256:$dst,
                        (vt256 (OpNode (bitconvert (loadv4i64 addr:$src1)),
                         (i8 imm:$src2))))]>, VEX, VEX_L,
                   Sched<[sched.YMM.Folded]>, VEX_WIG;
}

let Predicates = [UseSSE2] in {
  def ri : Ii8<0x70, MRMSrcReg,
               (outs VR128:$dst), (ins VR128:$src1, u8imm:$src2),
               !strconcat(OpcodeStr,
                          "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
               [(set VR128:$dst,
                 (vt128 (OpNode VR128:$src1, (i8 imm:$src2))))]>,
               Sched<[sched.XMM]>;
  def mi : Ii8<0x70, MRMSrcMem,
               (outs VR128:$dst), (ins i128mem:$src1, u8imm:$src2),
               !strconcat(OpcodeStr,
                          "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
               [(set VR128:$dst,
                 (vt128 (OpNode (bitconvert (memopv2i64 addr:$src1)),
                        (i8 imm:$src2))))]>,
               Sched<[sched.XMM.Folded]>;
}
}
} // ExeDomain = SSEPackedInt
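// Each defm below expands to an SSE ri/mi pair plus the AVX Vri/Vmi and
// 256-bit VYri/VYmi forms, e.g. PSHUFDri/PSHUFDmi, VPSHUFDri/VPSHUFDmi and
// VPSHUFDYri/VPSHUFDYmi.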

defm PSHUFD  : sse2_pshuffle<"pshufd", v4i32, v8i32, X86PShufd,
                             SchedWriteShuffle, NoVLX>, PD;
defm PSHUFHW : sse2_pshuffle<"pshufhw", v8i16, v16i16, X86PShufhw,
                             SchedWriteShuffle, NoVLX_Or_NoBWI>, XS;
defm PSHUFLW : sse2_pshuffle<"pshuflw", v8i16, v16i16, X86PShuflw,
                             SchedWriteShuffle, NoVLX_Or_NoBWI>, XD;

//===---------------------------------------------------------------------===//
// Packed Integer Pack Instructions (SSE & AVX)
//===---------------------------------------------------------------------===//

let ExeDomain = SSEPackedInt in {
multiclass sse2_pack<bits<8> opc, string OpcodeStr, ValueType OutVT,
                     ValueType ArgVT, SDNode OpNode, RegisterClass RC,
                     X86MemOperand x86memop, X86FoldableSchedWrite sched,
                     PatFrag ld_frag, bit Is2Addr = 1> {
  def rr : PDI<opc, MRMSrcReg,
               (outs RC:$dst), (ins RC:$src1, RC:$src2),
               !if(Is2Addr,
                   !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
                   !strconcat(OpcodeStr,
                              "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
               [(set RC:$dst,
                     (OutVT (OpNode (ArgVT RC:$src1), RC:$src2)))]>,
               Sched<[sched]>;
  def rm : PDI<opc, MRMSrcMem,
               (outs RC:$dst), (ins RC:$src1, x86memop:$src2),
               !if(Is2Addr,
                   !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
                   !strconcat(OpcodeStr,
                              "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
               [(set RC:$dst,
                     (OutVT (OpNode (ArgVT RC:$src1),
                                    (bitconvert (ld_frag addr:$src2)))))]>,
               Sched<[sched.Folded, ReadAfterLd]>;
}

multiclass sse4_pack<bits<8> opc, string OpcodeStr, ValueType OutVT,
                     ValueType ArgVT, SDNode OpNode, RegisterClass RC,
                     X86MemOperand x86memop, X86FoldableSchedWrite sched,
                     PatFrag ld_frag, bit Is2Addr = 1> {
  def rr : SS48I<opc, MRMSrcReg,
                 (outs RC:$dst), (ins RC:$src1, RC:$src2),
                 !if(Is2Addr,
                     !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
                     !strconcat(OpcodeStr,
                                "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
                 [(set RC:$dst,
                       (OutVT (OpNode (ArgVT RC:$src1), RC:$src2)))]>,
                 Sched<[sched]>;
  def rm : SS48I<opc, MRMSrcMem,
                 (outs RC:$dst), (ins RC:$src1, x86memop:$src2),
                 !if(Is2Addr,
                     !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
                     !strconcat(OpcodeStr,
                                "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
                 [(set RC:$dst,
                       (OutVT (OpNode (ArgVT RC:$src1),
                                      (bitconvert (ld_frag addr:$src2)))))]>,
                 Sched<[sched.Folded, ReadAfterLd]>;
}
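// The pack instructions narrow two source vectors into one destination with
// saturation; e.g. packsswb converts two v8i16 inputs into a single v16i8
// result using signed saturation, while packuswb/packusdw saturate to
// unsigned ranges.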

let Predicates = [HasAVX, NoVLX_Or_NoBWI] in {
  defm VPACKSSWB : sse2_pack<0x63, "vpacksswb", v16i8, v8i16, X86Packss, VR128,
                             i128mem, SchedWriteShuffle.XMM, loadv2i64, 0>,
                             VEX_4V, VEX_WIG;
  defm VPACKSSDW : sse2_pack<0x6B, "vpackssdw", v8i16, v4i32, X86Packss, VR128,
                             i128mem, SchedWriteShuffle.XMM, loadv2i64, 0>,
                             VEX_4V, VEX_WIG;

  defm VPACKUSWB : sse2_pack<0x67, "vpackuswb", v16i8, v8i16, X86Packus, VR128,
                             i128mem, SchedWriteShuffle.XMM, loadv2i64, 0>,
                             VEX_4V, VEX_WIG;
  defm VPACKUSDW : sse4_pack<0x2B, "vpackusdw", v8i16, v4i32, X86Packus, VR128,
                             i128mem, SchedWriteShuffle.XMM, loadv2i64, 0>,
                             VEX_4V;
}

let Predicates = [HasAVX2, NoVLX_Or_NoBWI] in {
  defm VPACKSSWBY : sse2_pack<0x63, "vpacksswb", v32i8, v16i16, X86Packss, VR256,
                              i256mem, SchedWriteShuffle.YMM, loadv4i64, 0>,
                              VEX_4V, VEX_L, VEX_WIG;
  defm VPACKSSDWY : sse2_pack<0x6B, "vpackssdw", v16i16, v8i32, X86Packss, VR256,
                              i256mem, SchedWriteShuffle.YMM, loadv4i64, 0>,
                              VEX_4V, VEX_L, VEX_WIG;

  defm VPACKUSWBY : sse2_pack<0x67, "vpackuswb", v32i8, v16i16, X86Packus, VR256,
                              i256mem, SchedWriteShuffle.YMM, loadv4i64, 0>,
                              VEX_4V, VEX_L, VEX_WIG;
  defm VPACKUSDWY : sse4_pack<0x2B, "vpackusdw", v16i16, v8i32, X86Packus, VR256,
                              i256mem, SchedWriteShuffle.YMM, loadv4i64, 0>,
                              VEX_4V, VEX_L;
}

let Constraints = "$src1 = $dst" in {
  defm PACKSSWB : sse2_pack<0x63, "packsswb", v16i8, v8i16, X86Packss, VR128,
                            i128mem, SchedWriteShuffle.XMM, memopv2i64>;
  defm PACKSSDW : sse2_pack<0x6B, "packssdw", v8i16, v4i32, X86Packss, VR128,
                            i128mem, SchedWriteShuffle.XMM, memopv2i64>;

  defm PACKUSWB : sse2_pack<0x67, "packuswb", v16i8, v8i16, X86Packus, VR128,
                            i128mem, SchedWriteShuffle.XMM, memopv2i64>;

  defm PACKUSDW : sse4_pack<0x2B, "packusdw", v8i16, v4i32, X86Packus, VR128,
                            i128mem, SchedWriteShuffle.XMM, memopv2i64>;
}
} // ExeDomain = SSEPackedInt

//===---------------------------------------------------------------------===//
// SSE2 - Packed Integer Unpack Instructions
//===---------------------------------------------------------------------===//

let ExeDomain = SSEPackedInt in {
multiclass sse2_unpack<bits<8> opc, string OpcodeStr, ValueType vt,
                       SDNode OpNode, RegisterClass RC, X86MemOperand x86memop,
                       X86FoldableSchedWrite sched, PatFrag ld_frag,
                       bit Is2Addr = 1> {
  def rr : PDI<opc, MRMSrcReg,
      (outs RC:$dst), (ins RC:$src1, RC:$src2),
      !if(Is2Addr,
          !strconcat(OpcodeStr,"\t{$src2, $dst|$dst, $src2}"),
          !strconcat(OpcodeStr,"\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
      [(set RC:$dst, (vt (OpNode RC:$src1, RC:$src2)))]>,
      Sched<[sched]>;
  def rm : PDI<opc, MRMSrcMem,
      (outs RC:$dst), (ins RC:$src1, x86memop:$src2),
      !if(Is2Addr,
          !strconcat(OpcodeStr,"\t{$src2, $dst|$dst, $src2}"),
          !strconcat(OpcodeStr,"\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
      [(set RC:$dst, (vt (OpNode RC:$src1,
                                  (bitconvert (ld_frag addr:$src2)))))]>,
      Sched<[sched.Folded, ReadAfterLd]>;
}
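// The unpack instructions interleave elements from the low (punpckl*) or
// high (punpckh*) halves of the two source vectors.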

let Predicates = [HasAVX, NoVLX_Or_NoBWI] in {
  defm VPUNPCKLBW  : sse2_unpack<0x60, "vpunpcklbw", v16i8, X86Unpckl, VR128,
                                 i128mem, SchedWriteShuffle.XMM, loadv2i64, 0>,
                                 VEX_4V, VEX_WIG;
  defm VPUNPCKLWD  : sse2_unpack<0x61, "vpunpcklwd", v8i16, X86Unpckl, VR128,
                                 i128mem, SchedWriteShuffle.XMM, loadv2i64, 0>,
                                 VEX_4V, VEX_WIG;
  defm VPUNPCKHBW  : sse2_unpack<0x68, "vpunpckhbw", v16i8, X86Unpckh, VR128,
                                 i128mem, SchedWriteShuffle.XMM, loadv2i64, 0>,
                                 VEX_4V, VEX_WIG;
  defm VPUNPCKHWD  : sse2_unpack<0x69, "vpunpckhwd", v8i16, X86Unpckh, VR128,
                                 i128mem, SchedWriteShuffle.XMM, loadv2i64, 0>,
                                 VEX_4V, VEX_WIG;
}

let Predicates = [HasAVX, NoVLX] in {
  defm VPUNPCKLDQ  : sse2_unpack<0x62, "vpunpckldq", v4i32, X86Unpckl, VR128,
                                 i128mem, SchedWriteShuffle.XMM, loadv2i64, 0>,
                                 VEX_4V, VEX_WIG;
  defm VPUNPCKLQDQ : sse2_unpack<0x6C, "vpunpcklqdq", v2i64, X86Unpckl, VR128,
                                 i128mem, SchedWriteShuffle.XMM, loadv2i64, 0>,
                                 VEX_4V, VEX_WIG;
  defm VPUNPCKHDQ  : sse2_unpack<0x6A, "vpunpckhdq", v4i32, X86Unpckh, VR128,
                                 i128mem, SchedWriteShuffle.XMM, loadv2i64, 0>,
                                 VEX_4V, VEX_WIG;
  defm VPUNPCKHQDQ : sse2_unpack<0x6D, "vpunpckhqdq", v2i64, X86Unpckh, VR128,
                                 i128mem, SchedWriteShuffle.XMM, loadv2i64, 0>,
                                 VEX_4V, VEX_WIG;
}

let Predicates = [HasAVX2, NoVLX_Or_NoBWI] in {
  defm VPUNPCKLBWY  : sse2_unpack<0x60, "vpunpcklbw", v32i8, X86Unpckl, VR256,
                                  i256mem, SchedWriteShuffle.YMM, loadv4i64, 0>,
                                  VEX_4V, VEX_L, VEX_WIG;
  defm VPUNPCKLWDY  : sse2_unpack<0x61, "vpunpcklwd", v16i16, X86Unpckl, VR256,
                                  i256mem, SchedWriteShuffle.YMM, loadv4i64, 0>,
                                  VEX_4V, VEX_L, VEX_WIG;
  defm VPUNPCKHBWY  : sse2_unpack<0x68, "vpunpckhbw", v32i8, X86Unpckh, VR256,
                                  i256mem, SchedWriteShuffle.YMM, loadv4i64, 0>,
                                  VEX_4V, VEX_L, VEX_WIG;
  defm VPUNPCKHWDY  : sse2_unpack<0x69, "vpunpckhwd", v16i16, X86Unpckh, VR256,
                                  i256mem, SchedWriteShuffle.YMM, loadv4i64, 0>,
                                  VEX_4V, VEX_L, VEX_WIG;
}

let Predicates = [HasAVX2, NoVLX] in {
  defm VPUNPCKLDQY  : sse2_unpack<0x62, "vpunpckldq", v8i32, X86Unpckl, VR256,
                                  i256mem, SchedWriteShuffle.YMM, loadv4i64, 0>,
                                  VEX_4V, VEX_L, VEX_WIG;
  defm VPUNPCKLQDQY : sse2_unpack<0x6C, "vpunpcklqdq", v4i64, X86Unpckl, VR256,
                                  i256mem, SchedWriteShuffle.YMM, loadv4i64, 0>,
                                  VEX_4V, VEX_L, VEX_WIG;
  defm VPUNPCKHDQY  : sse2_unpack<0x6A, "vpunpckhdq", v8i32, X86Unpckh, VR256,
                                  i256mem, SchedWriteShuffle.YMM, loadv4i64, 0>,
                                  VEX_4V, VEX_L, VEX_WIG;
  defm VPUNPCKHQDQY : sse2_unpack<0x6D, "vpunpckhqdq", v4i64, X86Unpckh, VR256,
                                  i256mem, SchedWriteShuffle.YMM, loadv4i64, 0>,
                                  VEX_4V, VEX_L, VEX_WIG;
}

let Constraints = "$src1 = $dst" in {
  defm PUNPCKLBW  : sse2_unpack<0x60, "punpcklbw", v16i8, X86Unpckl, VR128,
                                i128mem, SchedWriteShuffle.XMM, memopv2i64>;
  defm PUNPCKLWD  : sse2_unpack<0x61, "punpcklwd", v8i16, X86Unpckl, VR128,
                                i128mem, SchedWriteShuffle.XMM, memopv2i64>;
  defm PUNPCKLDQ  : sse2_unpack<0x62, "punpckldq", v4i32, X86Unpckl, VR128,
                                i128mem, SchedWriteShuffle.XMM, memopv2i64>;
  defm PUNPCKLQDQ : sse2_unpack<0x6C, "punpcklqdq", v2i64, X86Unpckl, VR128,
                                i128mem, SchedWriteShuffle.XMM, memopv2i64>;

  defm PUNPCKHBW  : sse2_unpack<0x68, "punpckhbw", v16i8, X86Unpckh, VR128,
                                i128mem, SchedWriteShuffle.XMM, memopv2i64>;
  defm PUNPCKHWD  : sse2_unpack<0x69, "punpckhwd", v8i16, X86Unpckh, VR128,
                                i128mem, SchedWriteShuffle.XMM, memopv2i64>;
  defm PUNPCKHDQ  : sse2_unpack<0x6A, "punpckhdq", v4i32, X86Unpckh, VR128,
                                i128mem, SchedWriteShuffle.XMM, memopv2i64>;
  defm PUNPCKHQDQ : sse2_unpack<0x6D, "punpckhqdq", v2i64, X86Unpckh, VR128,
                                i128mem, SchedWriteShuffle.XMM, memopv2i64>;
}
} // ExeDomain = SSEPackedInt

//===---------------------------------------------------------------------===//
// SSE2 - Packed Integer Extract and Insert
//===---------------------------------------------------------------------===//

let ExeDomain = SSEPackedInt in {
multiclass sse2_pinsrw<bit Is2Addr = 1> {
  def rr : Ii8<0xC4, MRMSrcReg,
       (outs VR128:$dst), (ins VR128:$src1,
        GR32orGR64:$src2, u8imm:$src3),
       !if(Is2Addr,
           "pinsrw\t{$src3, $src2, $dst|$dst, $src2, $src3}",
           "vpinsrw\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
       [(set VR128:$dst,
         (X86pinsrw VR128:$src1, GR32orGR64:$src2, imm:$src3))]>,
       Sched<[WriteVecInsert]>;
  def rm : Ii8<0xC4, MRMSrcMem,
                      (outs VR128:$dst), (ins VR128:$src1,
                       i16mem:$src2, u8imm:$src3),
       !if(Is2Addr,
           "pinsrw\t{$src3, $src2, $dst|$dst, $src2, $src3}",
           "vpinsrw\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
       [(set VR128:$dst,
         (X86pinsrw VR128:$src1, (extloadi16 addr:$src2),
                    imm:$src3))]>,
       Sched<[WriteVecInsertLd, ReadAfterLd]>;
}
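// pinsrw replaces the word element selected by the immediate with the low
// 16 bits of a GPR (rr form) or with a word loaded from memory (rm form).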

// Extract
let Predicates = [HasAVX, NoBWI] in
def VPEXTRWrr : Ii8<0xC5, MRMSrcReg,
                    (outs GR32orGR64:$dst), (ins VR128:$src1, u8imm:$src2),
                    "vpextrw\t{$src2, $src1, $dst|$dst, $src1, $src2}",
                    [(set GR32orGR64:$dst, (X86pextrw (v8i16 VR128:$src1),
                                            imm:$src2))]>,
                PD, VEX, Sched<[WriteVecExtract]>;
def PEXTRWrr : PDIi8<0xC5, MRMSrcReg,
                    (outs GR32orGR64:$dst), (ins VR128:$src1, u8imm:$src2),
                    "pextrw\t{$src2, $src1, $dst|$dst, $src1, $src2}",
                    [(set GR32orGR64:$dst, (X86pextrw (v8i16 VR128:$src1),
                                            imm:$src2))]>,
               Sched<[WriteVecExtract]>;

// Insert
let Predicates = [HasAVX, NoBWI] in
defm VPINSRW : sse2_pinsrw<0>, PD, VEX_4V;

let Predicates = [UseSSE2], Constraints = "$src1 = $dst" in
defm PINSRW : sse2_pinsrw, PD;

} // ExeDomain = SSEPackedInt

//===---------------------------------------------------------------------===//
// SSE2 - Packed Mask Creation
//===---------------------------------------------------------------------===//

let ExeDomain = SSEPackedInt in {
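// pmovmskb builds a bitmask in a GPR from the most significant bit of each
// byte element of the source vector.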

def VPMOVMSKBrr  : VPDI<0xD7, MRMSrcReg, (outs GR32orGR64:$dst),
           (ins VR128:$src),
           "pmovmskb\t{$src, $dst|$dst, $src}",
           [(set GR32orGR64:$dst, (X86movmsk (v16i8 VR128:$src)))]>,
           Sched<[WriteVecMOVMSK]>, VEX, VEX_WIG;

let Predicates = [HasAVX2] in {
def VPMOVMSKBYrr  : VPDI<0xD7, MRMSrcReg, (outs GR32orGR64:$dst),
           (ins VR256:$src),
           "pmovmskb\t{$src, $dst|$dst, $src}",
           [(set GR32orGR64:$dst, (X86movmsk (v32i8 VR256:$src)))]>,
           Sched<[WriteVecMOVMSKY]>, VEX, VEX_L, VEX_WIG;
}

def PMOVMSKBrr : PDI<0xD7, MRMSrcReg, (outs GR32orGR64:$dst), (ins VR128:$src),
           "pmovmskb\t{$src, $dst|$dst, $src}",
           [(set GR32orGR64:$dst, (X86movmsk (v16i8 VR128:$src)))]>,
           Sched<[WriteVecMOVMSK]>;

} // ExeDomain = SSEPackedInt

//===---------------------------------------------------------------------===//
// SSE2 - Conditional Store
//===---------------------------------------------------------------------===//

let ExeDomain = SSEPackedInt, SchedRW = [SchedWriteVecMoveLS.XMM.MR] in {
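// maskmovdqu stores only the bytes of $src whose corresponding byte in $mask
// has its most significant bit set, to the address held in EDI/RDI.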
let Uses = [EDI], Predicates = [HasAVX,Not64BitMode] in
def VMASKMOVDQU : VPDI<0xF7, MRMSrcReg, (outs),
           (ins VR128:$src, VR128:$mask),
           "maskmovdqu\t{$mask, $src|$src, $mask}",
           [(int_x86_sse2_maskmov_dqu VR128:$src, VR128:$mask, EDI)]>,
           VEX, VEX_WIG;
let Uses = [RDI], Predicates = [HasAVX,In64BitMode] in
def VMASKMOVDQU64 : VPDI<0xF7, MRMSrcReg, (outs),
           (ins VR128:$src, VR128:$mask),
           "maskmovdqu\t{$mask, $src|$src, $mask}",
           [(int_x86_sse2_maskmov_dqu VR128:$src, VR128:$mask, RDI)]>,
           VEX, VEX_WIG;

let Uses = [EDI], Predicates = [UseSSE2,Not64BitMode] in
def MASKMOVDQU : PDI<0xF7, MRMSrcReg, (outs), (ins VR128:$src, VR128:$mask),
           "maskmovdqu\t{$mask, $src|$src, $mask}",
           [(int_x86_sse2_maskmov_dqu VR128:$src, VR128:$mask, EDI)]>;
let Uses = [RDI], Predicates = [UseSSE2,In64BitMode] in
def MASKMOVDQU64 : PDI<0xF7, MRMSrcReg, (outs), (ins VR128:$src, VR128:$mask),
           "maskmovdqu\t{$mask, $src|$src, $mask}",
           [(int_x86_sse2_maskmov_dqu VR128:$src, VR128:$mask, RDI)]>;

} // ExeDomain = SSEPackedInt

//===---------------------------------------------------------------------===//
// SSE2 - Move Doubleword/Quadword
//===---------------------------------------------------------------------===//

//===---------------------------------------------------------------------===//
// Move Int Doubleword to Packed Doubleword Int
//
let ExeDomain = SSEPackedInt in {
def VMOVDI2PDIrr : VS2I<0x6E, MRMSrcReg, (outs VR128:$dst), (ins GR32:$src),
                        "movd\t{$src, $dst|$dst, $src}",
                        [(set VR128:$dst,
                          (v4i32 (scalar_to_vector GR32:$src)))]>,
                          VEX, Sched<[WriteVecMoveFromGpr]>;
def VMOVDI2PDIrm : VS2I<0x6E, MRMSrcMem, (outs VR128:$dst), (ins i32mem:$src),
                        "movd\t{$src, $dst|$dst, $src}",
                        [(set VR128:$dst,
                          (v4i32 (scalar_to_vector (loadi32 addr:$src))))]>,
                        VEX, Sched<[WriteVecLoad]>;
def VMOV64toPQIrr : VRS2I<0x6E, MRMSrcReg, (outs VR128:$dst), (ins GR64:$src),
                          "movq\t{$src, $dst|$dst, $src}",
                          [(set VR128:$dst,
                            (v2i64 (scalar_to_vector GR64:$src)))]>,
                          VEX, Sched<[WriteVecMoveFromGpr]>;
let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0, mayLoad = 1 in
def VMOV64toPQIrm : VRS2I<0x6E, MRMSrcMem, (outs VR128:$dst), (ins i64mem:$src),
                          "movq\t{$src, $dst|$dst, $src}", []>,
                          VEX, Sched<[WriteVecLoad]>;
let isCodeGenOnly = 1 in
def VMOV64toSDrr : VRS2I<0x6E, MRMSrcReg, (outs FR64:$dst), (ins GR64:$src),
                         "movq\t{$src, $dst|$dst, $src}",
                         [(set FR64:$dst, (bitconvert GR64:$src))]>,
                         VEX, Sched<[WriteVecMoveFromGpr]>;

def MOVDI2PDIrr : S2I<0x6E, MRMSrcReg, (outs VR128:$dst), (ins GR32:$src),
                      "movd\t{$src, $dst|$dst, $src}",
                      [(set VR128:$dst,
                        (v4i32 (scalar_to_vector GR32:$src)))]>,
                      Sched<[WriteVecMoveFromGpr]>;
def MOVDI2PDIrm : S2I<0x6E, MRMSrcMem, (outs VR128:$dst), (ins i32mem:$src),
                      "movd\t{$src, $dst|$dst, $src}",
                      [(set VR128:$dst,
                        (v4i32 (scalar_to_vector (loadi32 addr:$src))))]>,
                      Sched<[WriteVecLoad]>;
def MOV64toPQIrr : RS2I<0x6E, MRMSrcReg, (outs VR128:$dst), (ins GR64:$src),
                        "movq\t{$src, $dst|$dst, $src}",
                        [(set VR128:$dst,
                          (v2i64 (scalar_to_vector GR64:$src)))]>,
                        Sched<[WriteVecMoveFromGpr]>;
let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0, mayLoad = 1 in
def MOV64toPQIrm : RS2I<0x6E, MRMSrcMem, (outs VR128:$dst), (ins i64mem:$src),
                        "movq\t{$src, $dst|$dst, $src}", []>,
                        Sched<[WriteVecLoad]>;
let isCodeGenOnly = 1 in
def MOV64toSDrr : RS2I<0x6E, MRMSrcReg, (outs FR64:$dst), (ins GR64:$src),
                       "movq\t{$src, $dst|$dst, $src}",
                       [(set FR64:$dst, (bitconvert GR64:$src))]>,
                       Sched<[WriteVecMoveFromGpr]>;
} // ExeDomain = SSEPackedInt

//===---------------------------------------------------------------------===//
// Move Int Doubleword to Single Scalar
//
let ExeDomain = SSEPackedInt, isCodeGenOnly = 1 in {
  def VMOVDI2SSrr  : VS2I<0x6E, MRMSrcReg, (outs FR32:$dst), (ins GR32:$src),
                        "movd\t{$src, $dst|$dst, $src}",
                        [(set FR32:$dst, (bitconvert GR32:$src))]>,
                        VEX, Sched<[WriteVecMoveFromGpr]>;

  def VMOVDI2SSrm  : VS2I<0x6E, MRMSrcMem, (outs FR32:$dst), (ins i32mem:$src),
                        "movd\t{$src, $dst|$dst, $src}",
                        [(set FR32:$dst, (bitconvert (loadi32 addr:$src)))]>,
                        VEX, Sched<[WriteVecLoad]>;
  def MOVDI2SSrr  : S2I<0x6E, MRMSrcReg, (outs FR32:$dst), (ins GR32:$src),
                        "movd\t{$src, $dst|$dst, $src}",
                        [(set FR32:$dst, (bitconvert GR32:$src))]>,
                        Sched<[WriteVecMoveFromGpr]>;

  def MOVDI2SSrm  : S2I<0x6E, MRMSrcMem, (outs FR32:$dst), (ins i32mem:$src),
                        "movd\t{$src, $dst|$dst, $src}",
                        [(set FR32:$dst, (bitconvert (loadi32 addr:$src)))]>,
                        Sched<[WriteVecLoad]>;
} // ExeDomain = SSEPackedInt, isCodeGenOnly = 1

//===---------------------------------------------------------------------===//
// Move Packed Doubleword Int first element to Doubleword Int
//
let ExeDomain = SSEPackedInt in {
def VMOVPDI2DIrr  : VS2I<0x7E, MRMDestReg, (outs GR32:$dst), (ins VR128:$src),
                         "movd\t{$src, $dst|$dst, $src}",
                         [(set GR32:$dst, (extractelt (v4i32 VR128:$src),
                                          (iPTR 0)))]>, VEX,
                         Sched<[WriteVecMoveToGpr]>;
def VMOVPDI2DImr  : VS2I<0x7E, MRMDestMem, (outs),
                         (ins i32mem:$dst, VR128:$src),
                         "movd\t{$src, $dst|$dst, $src}",
                         [(store (i32 (extractelt (v4i32 VR128:$src),
                                       (iPTR 0))), addr:$dst)]>,
                         VEX, Sched<[WriteVecStore]>;
def MOVPDI2DIrr  : S2I<0x7E, MRMDestReg, (outs GR32:$dst), (ins VR128:$src),
                       "movd\t{$src, $dst|$dst, $src}",
                       [(set GR32:$dst, (extractelt (v4i32 VR128:$src),
                                        (iPTR 0)))]>,
                   Sched<[WriteVecMoveToGpr]>;
def MOVPDI2DImr  : S2I<0x7E, MRMDestMem, (outs), (ins i32mem:$dst, VR128:$src),
                       "movd\t{$src, $dst|$dst, $src}",
                       [(store (i32 (extractelt (v4i32 VR128:$src),
                                     (iPTR 0))), addr:$dst)]>,
                       Sched<[WriteVecStore]>;
} // ExeDomain = SSEPackedInt

//===---------------------------------------------------------------------===//
// Move Packed Quadword Int first element to Quadword Int
//
let ExeDomain = SSEPackedInt in {
let SchedRW = [WriteVecMoveToGpr] in {
def VMOVPQIto64rr : VRS2I<0x7E, MRMDestReg, (outs GR64:$dst), (ins VR128:$src),
                          "movq\t{$src, $dst|$dst, $src}",
                          [(set GR64:$dst, (extractelt (v2i64 VR128:$src),
                                                        (iPTR 0)))]>,
                      VEX;

def MOVPQIto64rr : RS2I<0x7E, MRMDestReg, (outs GR64:$dst), (ins VR128:$src),
                        "movq\t{$src, $dst|$dst, $src}",
                        [(set GR64:$dst, (extractelt (v2i64 VR128:$src),
                                                         (iPTR 0)))]>;
} // SchedRW

let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0, mayStore = 1 in
def VMOVPQIto64mr : VRS2I<0x7E, MRMDestMem, (outs),
                          (ins i64mem:$dst, VR128:$src),
                          "movq\t{$src, $dst|$dst, $src}", []>,
                          VEX, Sched<[WriteVecStore]>;
let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0, mayStore = 1 in
def MOVPQIto64mr : RS2I<0x7E, MRMDestMem, (outs), (ins i64mem:$dst, VR128:$src),
                        "movq\t{$src, $dst|$dst, $src}", []>,
                        Sched<[WriteVecStore]>;
} // ExeDomain = SSEPackedInt

//===---------------------------------------------------------------------===//
// Bitcast FR64 <-> GR64
//
let ExeDomain = SSEPackedInt, isCodeGenOnly = 1 in {
  let Predicates = [UseAVX] in
  def VMOV64toSDrm : VS2SI<0x7E, MRMSrcMem, (outs FR64:$dst), (ins i64mem:$src),
                          "movq\t{$src, $dst|$dst, $src}",
                          [(set FR64:$dst, (bitconvert (loadi64 addr:$src)))]>,
                          VEX, Sched<[WriteVecLoad]>;
  def VMOVSDto64rr : VRS2I<0x7E, MRMDestReg, (outs GR64:$dst), (ins FR64:$src),
                           "movq\t{$src, $dst|$dst, $src}",
                           [(set GR64:$dst, (bitconvert FR64:$src))]>,
                           VEX, Sched<[WriteVecMoveToGpr]>;
  def VMOVSDto64mr : VRS2I<0x7E, MRMDestMem, (outs), (ins i64mem:$dst, FR64:$src),
                           "movq\t{$src, $dst|$dst, $src}",
                           [(store (i64 (bitconvert FR64:$src)), addr:$dst)]>,
                           VEX, Sched<[WriteVecStore]>;

  def MOV64toSDrm : S2SI<0x7E, MRMSrcMem, (outs FR64:$dst), (ins i64mem:$src),
                         "movq\t{$src, $dst|$dst, $src}",
                         [(set FR64:$dst, (bitconvert (loadi64 addr:$src)))]>,
                         Sched<[WriteVecLoad]>;
  def MOVSDto64rr : RS2I<0x7E, MRMDestReg, (outs GR64:$dst), (ins FR64:$src),
                         "movq\t{$src, $dst|$dst, $src}",
                         [(set GR64:$dst, (bitconvert FR64:$src))]>,
                         Sched<[WriteVecMoveToGpr]>;
  def MOVSDto64mr : RS2I<0x7E, MRMDestMem, (outs), (ins i64mem:$dst, FR64:$src),
                         "movq\t{$src, $dst|$dst, $src}",
                         [(store (i64 (bitconvert FR64:$src)), addr:$dst)]>,
                         Sched<[WriteVecStore]>;
} // ExeDomain = SSEPackedInt, isCodeGenOnly = 1

//===---------------------------------------------------------------------===//
// Move Scalar Single to Doubleword Int
//
let ExeDomain = SSEPackedInt, isCodeGenOnly = 1 in {
  def VMOVSS2DIrr  : VS2I<0x7E, MRMDestReg, (outs GR32:$dst), (ins FR32:$src),
                        "movd\t{$src, $dst|$dst, $src}",
                        [(set GR32:$dst, (bitconvert FR32:$src))]>,
                        VEX, Sched<[WriteVecMoveToGpr]>;
  def VMOVSS2DImr  : VS2I<0x7E, MRMDestMem, (outs), (ins i32mem:$dst, FR32:$src),
                        "movd\t{$src, $dst|$dst, $src}",
                        [(store (i32 (bitconvert FR32:$src)), addr:$dst)]>,
                        VEX, Sched<[WriteVecStore]>;
  def MOVSS2DIrr  : S2I<0x7E, MRMDestReg, (outs GR32:$dst), (ins FR32:$src),
                        "movd\t{$src, $dst|$dst, $src}",
                        [(set GR32:$dst, (bitconvert FR32:$src))]>,
                        Sched<[WriteVecMoveToGpr]>;
  def MOVSS2DImr  : S2I<0x7E, MRMDestMem, (outs), (ins i32mem:$dst, FR32:$src),
                        "movd\t{$src, $dst|$dst, $src}",
                        [(store (i32 (bitconvert FR32:$src)), addr:$dst)]>,
                        Sched<[WriteVecStore]>;
} // ExeDomain = SSEPackedInt, isCodeGenOnly = 1

let Predicates = [UseAVX] in {
  def : Pat<(v4i32 (X86vzmovl (v4i32 (scalar_to_vector GR32:$src)))),
            (VMOVDI2PDIrr GR32:$src)>;

  def : Pat<(v2i64 (X86vzmovl (v2i64 (scalar_to_vector GR64:$src)))),
            (VMOV64toPQIrr GR64:$src)>;

  def : Pat<(v4i64 (X86vzmovl (insert_subvector undef,
              (v2i64 (scalar_to_vector GR64:$src)),(iPTR 0)))),
            (SUBREG_TO_REG (i64 0), (v2i64 (VMOV64toPQIrr GR64:$src)), sub_xmm)>;
  // AVX 128-bit movd/movq instructions write zeros in the high elements of
  // the destination XMM register and also zero the upper 128 bits of the
  // corresponding 256-bit register.
  def : Pat<(v2i64 (X86vzmovl (v2i64 (scalar_to_vector (zextloadi64i32 addr:$src))))),
            (VMOVDI2PDIrm addr:$src)>;
  def : Pat<(v4i32 (X86vzmovl (v4i32 (scalar_to_vector (loadi32 addr:$src))))),
            (VMOVDI2PDIrm addr:$src)>;
  def : Pat<(v4i32 (X86vzmovl (bc_v4i32 (loadv2i64 addr:$src)))),
            (VMOVDI2PDIrm addr:$src)>;
  def : Pat<(v4i32 (X86vzload addr:$src)),
            (VMOVDI2PDIrm addr:$src)>;
  def : Pat<(v8i32 (X86vzmovl (insert_subvector undef,
              (v4i32 (scalar_to_vector (loadi32 addr:$src))), (iPTR 0)))),
            (SUBREG_TO_REG (i32 0), (v4i32 (VMOVDI2PDIrm addr:$src)), sub_xmm)>;
  def : Pat<(v8i32 (X86vzload addr:$src)),
            (SUBREG_TO_REG (i64 0), (v4i32 (VMOVDI2PDIrm addr:$src)), sub_xmm)>;
  // Use regular 128-bit instructions to match 256-bit scalar_to_vec+zext.
  def : Pat<(v8i32 (X86vzmovl (insert_subvector undef,
                               (v4i32 (scalar_to_vector GR32:$src)),(iPTR 0)))),
            (SUBREG_TO_REG (i32 0), (v4i32 (VMOVDI2PDIrr GR32:$src)), sub_xmm)>;
}

let Predicates = [UseSSE2] in {
  def : Pat<(v4i32 (X86vzmovl (v4i32 (scalar_to_vector GR32:$src)))),
            (MOVDI2PDIrr GR32:$src)>;

  def : Pat<(v2i64 (X86vzmovl (v2i64 (scalar_to_vector GR64:$src)))),
            (MOV64toPQIrr GR64:$src)>;
  def : Pat<(v2i64 (X86vzmovl (v2i64 (scalar_to_vector (zextloadi64i32 addr:$src))))),
            (MOVDI2PDIrm addr:$src)>;
  def : Pat<(v4i32 (X86vzmovl (v4i32 (scalar_to_vector (loadi32 addr:$src))))),
            (MOVDI2PDIrm addr:$src)>;
  def : Pat<(v4i32 (X86vzmovl (bc_v4i32 (loadv2i64 addr:$src)))),
            (MOVDI2PDIrm addr:$src)>;
  def : Pat<(v4i32 (X86vzload addr:$src)),
            (MOVDI2PDIrm addr:$src)>;
}

// Before the MC layer of LLVM existed, clang emitted "movd" assembly instead
// of "movq" due to a MacOS parsing limitation. In order to parse old assembly,
// we add these aliases.
4192def : InstAlias<"movd\t{$src, $dst|$dst, $src}",
4193                (MOV64toPQIrr VR128:$dst, GR64:$src), 0>;
4194def : InstAlias<"movd\t{$src, $dst|$dst, $src}",
4195                (MOVPQIto64rr GR64:$dst, VR128:$src), 0>;
4196// Allow "vmovd" but print "vmovq" since we don't need compatibility for AVX.
4197def : InstAlias<"vmovd\t{$src, $dst|$dst, $src}",
4198                (VMOV64toPQIrr VR128:$dst, GR64:$src), 0>;
4199def : InstAlias<"vmovd\t{$src, $dst|$dst, $src}",
4200                (VMOVPQIto64rr GR64:$dst, VR128:$src), 0>;
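// For example, these aliases let the parser still accept the legacy form
//   movd %rax, %xmm0
// and match it to MOV64toPQIrr; the trailing 0 (emit priority) keeps the
// printer producing "movq" for these 64-bit GPR forms.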

//===---------------------------------------------------------------------===//
// SSE2 - Move Quadword
//===---------------------------------------------------------------------===//

//===---------------------------------------------------------------------===//
// Move Quadword Int to Packed Quadword Int
//

let ExeDomain = SSEPackedInt, SchedRW = [WriteVecLoad] in {
def VMOVQI2PQIrm : I<0x7E, MRMSrcMem, (outs VR128:$dst), (ins i64mem:$src),
                    "vmovq\t{$src, $dst|$dst, $src}",
                    [(set VR128:$dst,
                      (v2i64 (scalar_to_vector (loadi64 addr:$src))))]>, XS,
                    VEX, Requires<[UseAVX]>, VEX_WIG;
def MOVQI2PQIrm : I<0x7E, MRMSrcMem, (outs VR128:$dst), (ins i64mem:$src),
                    "movq\t{$src, $dst|$dst, $src}",
                    [(set VR128:$dst,
                      (v2i64 (scalar_to_vector (loadi64 addr:$src))))]>,
                    XS, Requires<[UseSSE2]>; // SSE2 instruction with XS Prefix
} // ExeDomain, SchedRW

//===---------------------------------------------------------------------===//
// Move Packed Quadword Int to Quadword Int
//
let ExeDomain = SSEPackedInt, SchedRW = [WriteVecStore] in {
def VMOVPQI2QImr : VS2I<0xD6, MRMDestMem, (outs), (ins i64mem:$dst, VR128:$src),
                        "movq\t{$src, $dst|$dst, $src}",
                        [(store (i64 (extractelt (v2i64 VR128:$src),
                                      (iPTR 0))), addr:$dst)]>,
                        VEX, VEX_WIG;
def MOVPQI2QImr : S2I<0xD6, MRMDestMem, (outs), (ins i64mem:$dst, VR128:$src),
                      "movq\t{$src, $dst|$dst, $src}",
                      [(store (i64 (extractelt (v2i64 VR128:$src),
                                    (iPTR 0))), addr:$dst)]>;
} // ExeDomain, SchedRW

// For disassembler only
let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0,
    SchedRW = [SchedWriteVecLogic.XMM] in {
def VMOVPQI2QIrr : VS2I<0xD6, MRMDestReg, (outs VR128:$dst), (ins VR128:$src),
                     "movq\t{$src, $dst|$dst, $src}", []>, VEX, VEX_WIG;
def MOVPQI2QIrr : S2I<0xD6, MRMDestReg, (outs VR128:$dst), (ins VR128:$src),
                      "movq\t{$src, $dst|$dst, $src}", []>;
}

// Aliases to help the assembler pick two-byte VEX encodings by swapping the
// operands relative to the normal instructions, using VEX.R instead of VEX.B.
def : InstAlias<"vmovq\t{$src, $dst|$dst, $src}",
                (VMOVPQI2QIrr VR128L:$dst, VR128H:$src), 0>;

def : InstAlias<"vmovq.s\t{$src, $dst|$dst, $src}",
                (VMOVPQI2QIrr VR128:$dst, VR128:$src), 0>;
def : InstAlias<"movq.s\t{$src, $dst|$dst, $src}",
                (MOVPQI2QIrr VR128:$dst, VR128:$src), 0>;
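// The ".s" suffix forces the MRMDestReg (0xD6) encoding: e.g. "movq.s
// %xmm1, %xmm0" assembles to the store-form opcode rather than the default
// XS-prefixed 0x7E form, which is handy for exercising both encodings.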

let Predicates = [UseAVX] in {
  def : Pat<(v2i64 (X86vzmovl (loadv2i64 addr:$src))),
            (VMOVQI2PQIrm addr:$src)>;
  def : Pat<(v2i64 (X86vzload addr:$src)),
            (VMOVQI2PQIrm addr:$src)>;
  def : Pat<(v4i64 (X86vzmovl (insert_subvector undef,
              (v2i64 (scalar_to_vector (loadi64 addr:$src))), (iPTR 0)))),
            (SUBREG_TO_REG (i64 0), (v2i64 (VMOVQI2PQIrm addr:$src)), sub_xmm)>;
  def : Pat<(v4i64 (X86vzload addr:$src)),
            (SUBREG_TO_REG (i64 0), (v2i64 (VMOVQI2PQIrm addr:$src)), sub_xmm)>;
}

let Predicates = [UseSSE2] in {
  def : Pat<(v2i64 (X86vzmovl (loadv2i64 addr:$src))),
            (MOVQI2PQIrm addr:$src)>;
  def : Pat<(v2i64 (X86vzload addr:$src)), (MOVQI2PQIrm addr:$src)>;
}

//===---------------------------------------------------------------------===//
// Moving from XMM to XMM and clearing the upper 64 bits. Note: there is a bug
// in the IA-32 documentation; movq xmm1, xmm2 does clear the high bits.
//
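// For example, "movq %xmm1, %xmm0" copies xmm1[63:0] into xmm0[63:0] and
// zeroes xmm0[127:64], which is exactly the (X86vzmovl (v2i64 ...)) node
// matched by the patterns below.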
let ExeDomain = SSEPackedInt, SchedRW = [SchedWriteVecLogic.XMM] in {
def VMOVZPQILo2PQIrr : I<0x7E, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
                        "vmovq\t{$src, $dst|$dst, $src}",
                    [(set VR128:$dst, (v2i64 (X86vzmovl (v2i64 VR128:$src))))]>,
                         XS, VEX, Requires<[UseAVX]>, VEX_WIG;
def MOVZPQILo2PQIrr : I<0x7E, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
                        "movq\t{$src, $dst|$dst, $src}",
                    [(set VR128:$dst, (v2i64 (X86vzmovl (v2i64 VR128:$src))))]>,
                        XS, Requires<[UseSSE2]>;
} // ExeDomain, SchedRW

let Predicates = [UseAVX] in {
  def : Pat<(v2f64 (X86vzmovl (v2f64 VR128:$src))),
            (VMOVZPQILo2PQIrr VR128:$src)>;
}
let Predicates = [UseSSE2] in {
  def : Pat<(v2f64 (X86vzmovl (v2f64 VR128:$src))),
            (MOVZPQILo2PQIrr VR128:$src)>;
}

//===---------------------------------------------------------------------===//
// SSE3 - Replicate Single FP - MOVSHDUP and MOVSLDUP
//===---------------------------------------------------------------------===//

multiclass sse3_replicate_sfp<bits<8> op, SDNode OpNode, string OpcodeStr,
                              ValueType vt, RegisterClass RC, PatFrag mem_frag,
                              X86MemOperand x86memop, X86FoldableSchedWrite sched> {
def rr : S3SI<op, MRMSrcReg, (outs RC:$dst), (ins RC:$src),
                    !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
                      [(set RC:$dst, (vt (OpNode RC:$src)))]>,
                      Sched<[sched]>;
def rm : S3SI<op, MRMSrcMem, (outs RC:$dst), (ins x86memop:$src),
                    !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
                      [(set RC:$dst, (OpNode (mem_frag addr:$src)))]>,
                      Sched<[sched.Folded]>;
}

let Predicates = [HasAVX, NoVLX] in {
  defm VMOVSHDUP  : sse3_replicate_sfp<0x16, X86Movshdup, "vmovshdup",
                                       v4f32, VR128, loadv4f32, f128mem,
                                       SchedWriteFShuffle.XMM>, VEX, VEX_WIG;
  defm VMOVSLDUP  : sse3_replicate_sfp<0x12, X86Movsldup, "vmovsldup",
                                       v4f32, VR128, loadv4f32, f128mem,
                                       SchedWriteFShuffle.XMM>, VEX, VEX_WIG;
  defm VMOVSHDUPY : sse3_replicate_sfp<0x16, X86Movshdup, "vmovshdup",
                                       v8f32, VR256, loadv8f32, f256mem,
                                       SchedWriteFShuffle.YMM>, VEX, VEX_L, VEX_WIG;
  defm VMOVSLDUPY : sse3_replicate_sfp<0x12, X86Movsldup, "vmovsldup",
                                       v8f32, VR256, loadv8f32, f256mem,
                                       SchedWriteFShuffle.YMM>, VEX, VEX_L, VEX_WIG;
}
defm MOVSHDUP : sse3_replicate_sfp<0x16, X86Movshdup, "movshdup", v4f32, VR128,
                                   memopv4f32, f128mem, SchedWriteFShuffle.XMM>;
defm MOVSLDUP : sse3_replicate_sfp<0x12, X86Movsldup, "movsldup", v4f32, VR128,
                                   memopv4f32, f128mem, SchedWriteFShuffle.XMM>;

let Predicates = [HasAVX, NoVLX] in {
  def : Pat<(v4i32 (X86Movshdup VR128:$src)),
            (VMOVSHDUPrr VR128:$src)>;
  def : Pat<(v4i32 (X86Movshdup (bc_v4i32 (loadv2i64 addr:$src)))),
            (VMOVSHDUPrm addr:$src)>;
  def : Pat<(v4i32 (X86Movsldup VR128:$src)),
            (VMOVSLDUPrr VR128:$src)>;
  def : Pat<(v4i32 (X86Movsldup (bc_v4i32 (loadv2i64 addr:$src)))),
            (VMOVSLDUPrm addr:$src)>;
  def : Pat<(v8i32 (X86Movshdup VR256:$src)),
            (VMOVSHDUPYrr VR256:$src)>;
  def : Pat<(v8i32 (X86Movshdup (bc_v8i32 (loadv4i64 addr:$src)))),
            (VMOVSHDUPYrm addr:$src)>;
  def : Pat<(v8i32 (X86Movsldup VR256:$src)),
            (VMOVSLDUPYrr VR256:$src)>;
  def : Pat<(v8i32 (X86Movsldup (bc_v8i32 (loadv4i64 addr:$src)))),
            (VMOVSLDUPYrm addr:$src)>;
}

let Predicates = [UseSSE3] in {
  def : Pat<(v4i32 (X86Movshdup VR128:$src)),
            (MOVSHDUPrr VR128:$src)>;
  def : Pat<(v4i32 (X86Movshdup (bc_v4i32 (memopv2i64 addr:$src)))),
            (MOVSHDUPrm addr:$src)>;
  def : Pat<(v4i32 (X86Movsldup VR128:$src)),
            (MOVSLDUPrr VR128:$src)>;
  def : Pat<(v4i32 (X86Movsldup (bc_v4i32 (memopv2i64 addr:$src)))),
            (MOVSLDUPrm addr:$src)>;
}

//===---------------------------------------------------------------------===//
// SSE3 - Replicate Double FP - MOVDDUP
//===---------------------------------------------------------------------===//

multiclass sse3_replicate_dfp<string OpcodeStr, X86SchedWriteWidths sched> {
def rr  : S3DI<0x12, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
                    !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
                    [(set VR128:$dst, (v2f64 (X86Movddup VR128:$src)))]>,
                    Sched<[sched.XMM]>;
def rm  : S3DI<0x12, MRMSrcMem, (outs VR128:$dst), (ins f64mem:$src),
                    !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
                    [(set VR128:$dst,
                      (v2f64 (X86Movddup
                              (scalar_to_vector (loadf64 addr:$src)))))]>,
                    Sched<[sched.XMM.Folded]>;
}

// FIXME: Merge with above classes when there are patterns for the ymm version
multiclass sse3_replicate_dfp_y<string OpcodeStr, X86SchedWriteWidths sched> {
def rr  : S3DI<0x12, MRMSrcReg, (outs VR256:$dst), (ins VR256:$src),
                    !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
                    [(set VR256:$dst, (v4f64 (X86Movddup VR256:$src)))]>,
                    Sched<[sched.YMM]>;
def rm  : S3DI<0x12, MRMSrcMem, (outs VR256:$dst), (ins f256mem:$src),
                    !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
                    [(set VR256:$dst,
                      (v4f64 (X86Movddup (loadv4f64 addr:$src))))]>,
                    Sched<[sched.YMM.Folded]>;
}

let Predicates = [HasAVX, NoVLX] in {
  defm VMOVDDUP  : sse3_replicate_dfp<"vmovddup", SchedWriteFShuffle>,
                                      VEX, VEX_WIG;
  defm VMOVDDUPY : sse3_replicate_dfp_y<"vmovddup", SchedWriteFShuffle>,
                                        VEX, VEX_L, VEX_WIG;
}

defm MOVDDUP : sse3_replicate_dfp<"movddup", SchedWriteFShuffle>;


let Predicates = [HasAVX, NoVLX] in {
  def : Pat<(X86Movddup (loadv2f64 addr:$src)),
            (VMOVDDUPrm addr:$src)>;
}

let Predicates = [UseSSE3] in {
  // No need for aligned memory as this only loads 64 bits.
  def : Pat<(X86Movddup (loadv2f64 addr:$src)),
            (MOVDDUPrm addr:$src)>;
}

//===---------------------------------------------------------------------===//
// SSE3 - Move Unaligned Integer
//===---------------------------------------------------------------------===//

let Predicates = [HasAVX] in {
  def VLDDQUrm : S3DI<0xF0, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src),
                      "vlddqu\t{$src, $dst|$dst, $src}",
                      [(set VR128:$dst, (int_x86_sse3_ldu_dq addr:$src))]>,
                      Sched<[SchedWriteVecMoveLS.XMM.RM]>, VEX, VEX_WIG;
  def VLDDQUYrm : S3DI<0xF0, MRMSrcMem, (outs VR256:$dst), (ins i256mem:$src),
                       "vlddqu\t{$src, $dst|$dst, $src}",
                       [(set VR256:$dst, (int_x86_avx_ldu_dq_256 addr:$src))]>,
                       Sched<[SchedWriteVecMoveLS.YMM.RM]>, VEX, VEX_L, VEX_WIG;
} // Predicates

def LDDQUrm : S3DI<0xF0, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src),
                   "lddqu\t{$src, $dst|$dst, $src}",
                   [(set VR128:$dst, (int_x86_sse3_ldu_dq addr:$src))]>,
                   Sched<[SchedWriteVecMoveLS.XMM.RM]>;

//===---------------------------------------------------------------------===//
// SSE3 - Arithmetic
//===---------------------------------------------------------------------===//

multiclass sse3_addsub<string OpcodeStr, ValueType vt, RegisterClass RC,
                       X86MemOperand x86memop, X86FoldableSchedWrite sched,
                       PatFrag ld_frag, bit Is2Addr = 1> {
  def rr : I<0xD0, MRMSrcReg,
       (outs RC:$dst), (ins RC:$src1, RC:$src2),
       !if(Is2Addr,
           !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
           !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
       [(set RC:$dst, (vt (X86Addsub RC:$src1, RC:$src2)))]>,
       Sched<[sched]>;
  def rm : I<0xD0, MRMSrcMem,
       (outs RC:$dst), (ins RC:$src1, x86memop:$src2),
       !if(Is2Addr,
           !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
           !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
       [(set RC:$dst, (vt (X86Addsub RC:$src1, (ld_frag addr:$src2))))]>,
       Sched<[sched.Folded, ReadAfterLd]>;
}
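
// For reference, X86Addsub alternates per lane: even lanes subtract, odd
// lanes add. E.g. for v4f32, addsubps computes
//   dst = { a0-b0, a1+b1, a2-b2, a3+b3 }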

let Predicates = [HasAVX] in {
  let ExeDomain = SSEPackedSingle in {
    defm VADDSUBPS : sse3_addsub<"vaddsubps", v4f32, VR128, f128mem,
                                 SchedWriteFAddSizes.PS.XMM, loadv4f32, 0>,
                                 XD, VEX_4V, VEX_WIG;
    defm VADDSUBPSY : sse3_addsub<"vaddsubps", v8f32, VR256, f256mem,
                                  SchedWriteFAddSizes.PS.YMM, loadv8f32, 0>,
                                  XD, VEX_4V, VEX_L, VEX_WIG;
  }
  let ExeDomain = SSEPackedDouble in {
    defm VADDSUBPD : sse3_addsub<"vaddsubpd", v2f64, VR128, f128mem,
                                 SchedWriteFAddSizes.PD.XMM, loadv2f64, 0>,
                                 PD, VEX_4V, VEX_WIG;
    defm VADDSUBPDY : sse3_addsub<"vaddsubpd", v4f64, VR256, f256mem,
                                  SchedWriteFAddSizes.PD.YMM, loadv4f64, 0>,
                                  PD, VEX_4V, VEX_L, VEX_WIG;
  }
}
let Constraints = "$src1 = $dst", Predicates = [UseSSE3] in {
  let ExeDomain = SSEPackedSingle in
  defm ADDSUBPS : sse3_addsub<"addsubps", v4f32, VR128, f128mem,
                              SchedWriteFAddSizes.PS.XMM, memopv4f32>, XD;
  let ExeDomain = SSEPackedDouble in
  defm ADDSUBPD : sse3_addsub<"addsubpd", v2f64, VR128, f128mem,
                              SchedWriteFAddSizes.PD.XMM, memopv2f64>, PD;
}

//===---------------------------------------------------------------------===//
// SSE3 Instructions
//===---------------------------------------------------------------------===//

// Horizontal ops
multiclass S3D_Int<bits<8> o, string OpcodeStr, ValueType vt, RegisterClass RC,
                   X86MemOperand x86memop, SDNode OpNode,
                   X86FoldableSchedWrite sched, PatFrag ld_frag,
                   bit Is2Addr = 1> {
  def rr : S3DI<o, MRMSrcReg, (outs RC:$dst), (ins RC:$src1, RC:$src2),
       !if(Is2Addr,
         !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
         !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
      [(set RC:$dst, (vt (OpNode RC:$src1, RC:$src2)))]>,
      Sched<[sched]>;

  def rm : S3DI<o, MRMSrcMem, (outs RC:$dst), (ins RC:$src1, x86memop:$src2),
       !if(Is2Addr,
         !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
         !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
      [(set RC:$dst, (vt (OpNode RC:$src1, (ld_frag addr:$src2))))]>,
      Sched<[sched.Folded, ReadAfterLd]>;
}
multiclass S3_Int<bits<8> o, string OpcodeStr, ValueType vt, RegisterClass RC,
                  X86MemOperand x86memop, SDNode OpNode,
                  X86FoldableSchedWrite sched, PatFrag ld_frag,
                  bit Is2Addr = 1> {
  def rr : S3I<o, MRMSrcReg, (outs RC:$dst), (ins RC:$src1, RC:$src2),
       !if(Is2Addr,
         !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
         !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
      [(set RC:$dst, (vt (OpNode RC:$src1, RC:$src2)))]>,
        Sched<[sched]>;

  def rm : S3I<o, MRMSrcMem, (outs RC:$dst), (ins RC:$src1, x86memop:$src2),
       !if(Is2Addr,
         !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
         !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
      [(set RC:$dst, (vt (OpNode RC:$src1, (ld_frag addr:$src2))))]>,
        Sched<[sched.Folded, ReadAfterLd]>;
}

let Predicates = [HasAVX] in {
  let ExeDomain = SSEPackedSingle in {
    defm VHADDPS  : S3D_Int<0x7C, "vhaddps", v4f32, VR128, f128mem,
                            X86fhadd, WriteFHAdd, loadv4f32, 0>, VEX_4V, VEX_WIG;
    defm VHSUBPS  : S3D_Int<0x7D, "vhsubps", v4f32, VR128, f128mem,
                            X86fhsub, WriteFHAdd, loadv4f32, 0>, VEX_4V, VEX_WIG;
    defm VHADDPSY : S3D_Int<0x7C, "vhaddps", v8f32, VR256, f256mem,
                            X86fhadd, WriteFHAddY, loadv8f32, 0>, VEX_4V, VEX_L, VEX_WIG;
    defm VHSUBPSY : S3D_Int<0x7D, "vhsubps", v8f32, VR256, f256mem,
                            X86fhsub, WriteFHAddY, loadv8f32, 0>, VEX_4V, VEX_L, VEX_WIG;
  }
  let ExeDomain = SSEPackedDouble in {
    defm VHADDPD  : S3_Int<0x7C, "vhaddpd", v2f64, VR128, f128mem,
                           X86fhadd, WriteFHAdd, loadv2f64, 0>, VEX_4V, VEX_WIG;
    defm VHSUBPD  : S3_Int<0x7D, "vhsubpd", v2f64, VR128, f128mem,
                           X86fhsub, WriteFHAdd, loadv2f64, 0>, VEX_4V, VEX_WIG;
    defm VHADDPDY : S3_Int<0x7C, "vhaddpd", v4f64, VR256, f256mem,
                           X86fhadd, WriteFHAddY, loadv4f64, 0>, VEX_4V, VEX_L, VEX_WIG;
    defm VHSUBPDY : S3_Int<0x7D, "vhsubpd", v4f64, VR256, f256mem,
                           X86fhsub, WriteFHAddY, loadv4f64, 0>, VEX_4V, VEX_L, VEX_WIG;
  }
}

let Constraints = "$src1 = $dst" in {
  let ExeDomain = SSEPackedSingle in {
    defm HADDPS : S3D_Int<0x7C, "haddps", v4f32, VR128, f128mem, X86fhadd,
                          WriteFHAdd, memopv4f32>;
    defm HSUBPS : S3D_Int<0x7D, "hsubps", v4f32, VR128, f128mem, X86fhsub,
                          WriteFHAdd, memopv4f32>;
  }
  let ExeDomain = SSEPackedDouble in {
    defm HADDPD : S3_Int<0x7C, "haddpd", v2f64, VR128, f128mem, X86fhadd,
                         WriteFHAdd, memopv2f64>;
    defm HSUBPD : S3_Int<0x7D, "hsubpd", v2f64, VR128, f128mem, X86fhsub,
                         WriteFHAdd, memopv2f64>;
  }
}
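
// For reference, the horizontal ops reduce adjacent pairs: haddps computes
//   dst = { a0+a1, a2+a3, b0+b1, b2+b3 }
// and hsubps the corresponding pairwise differences.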

//===---------------------------------------------------------------------===//
// SSSE3 - Packed Absolute Instructions
//===---------------------------------------------------------------------===//

/// SS3I_unop_rm - Simple SSSE3 unary op whose type can be v*{i8,i16,i32}.
multiclass SS3I_unop_rm<bits<8> opc, string OpcodeStr, ValueType vt,
                        SDNode OpNode, X86SchedWriteWidths sched, PatFrag ld_frag> {
  def rr : SS38I<opc, MRMSrcReg, (outs VR128:$dst),
                 (ins VR128:$src),
                 !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
                 [(set VR128:$dst, (vt (OpNode VR128:$src)))]>,
                 Sched<[sched.XMM]>;

  def rm : SS38I<opc, MRMSrcMem, (outs VR128:$dst),
                 (ins i128mem:$src),
                 !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
                 [(set VR128:$dst,
                   (vt (OpNode (bitconvert (ld_frag addr:$src)))))]>,
                 Sched<[sched.XMM.Folded]>;
}

/// SS3I_unop_rm_y - Simple SSSE3 unary op (256-bit) whose type can be v*{i8,i16,i32}.
multiclass SS3I_unop_rm_y<bits<8> opc, string OpcodeStr, ValueType vt,
                          SDNode OpNode, X86SchedWriteWidths sched> {
  def Yrr : SS38I<opc, MRMSrcReg, (outs VR256:$dst),
                  (ins VR256:$src),
                  !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
                  [(set VR256:$dst, (vt (OpNode VR256:$src)))]>,
                  Sched<[sched.YMM]>;

  def Yrm : SS38I<opc, MRMSrcMem, (outs VR256:$dst),
                  (ins i256mem:$src),
                  !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
                  [(set VR256:$dst,
                    (vt (OpNode (bitconvert (loadv4i64 addr:$src)))))]>,
                  Sched<[sched.YMM.Folded]>;
}

let Predicates = [HasAVX, NoVLX_Or_NoBWI] in {
  defm VPABSB  : SS3I_unop_rm<0x1C, "vpabsb", v16i8, abs, SchedWriteVecALU,
                              loadv2i64>, VEX, VEX_WIG;
  defm VPABSW  : SS3I_unop_rm<0x1D, "vpabsw", v8i16, abs, SchedWriteVecALU,
                              loadv2i64>, VEX, VEX_WIG;
}
let Predicates = [HasAVX, NoVLX] in {
  defm VPABSD  : SS3I_unop_rm<0x1E, "vpabsd", v4i32, abs, SchedWriteVecALU,
                              loadv2i64>, VEX, VEX_WIG;
}
let Predicates = [HasAVX2, NoVLX_Or_NoBWI] in {
  defm VPABSB  : SS3I_unop_rm_y<0x1C, "vpabsb", v32i8, abs, SchedWriteVecALU>,
                                VEX, VEX_L, VEX_WIG;
  defm VPABSW  : SS3I_unop_rm_y<0x1D, "vpabsw", v16i16, abs, SchedWriteVecALU>,
                                VEX, VEX_L, VEX_WIG;
}
let Predicates = [HasAVX2, NoVLX] in {
  defm VPABSD  : SS3I_unop_rm_y<0x1E, "vpabsd", v8i32, abs, SchedWriteVecALU>,
                                VEX, VEX_L, VEX_WIG;
}

defm PABSB : SS3I_unop_rm<0x1C, "pabsb", v16i8, abs, SchedWriteVecALU,
                          memopv2i64>;
defm PABSW : SS3I_unop_rm<0x1D, "pabsw", v8i16, abs, SchedWriteVecALU,
                          memopv2i64>;
defm PABSD : SS3I_unop_rm<0x1E, "pabsd", v4i32, abs, SchedWriteVecALU,
                          memopv2i64>;

//===---------------------------------------------------------------------===//
// SSSE3 - Packed Binary Operator Instructions
//===---------------------------------------------------------------------===//

/// SS3I_binop_rm - Simple SSSE3 bin op
multiclass SS3I_binop_rm<bits<8> opc, string OpcodeStr, SDNode OpNode,
                         ValueType DstVT, ValueType OpVT, RegisterClass RC,
                         PatFrag memop_frag, X86MemOperand x86memop,
                         X86FoldableSchedWrite sched, bit Is2Addr = 1> {
  let isCommutable = 1 in
  def rr : SS38I<opc, MRMSrcReg, (outs RC:$dst),
       (ins RC:$src1, RC:$src2),
       !if(Is2Addr,
         !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
         !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
       [(set RC:$dst, (DstVT (OpNode (OpVT RC:$src1), RC:$src2)))]>,
       Sched<[sched]>;
  def rm : SS38I<opc, MRMSrcMem, (outs RC:$dst),
       (ins RC:$src1, x86memop:$src2),
       !if(Is2Addr,
         !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
         !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
       [(set RC:$dst,
         (DstVT (OpNode (OpVT RC:$src1),
          (bitconvert (memop_frag addr:$src2)))))]>,
       Sched<[sched.Folded, ReadAfterLd]>;
}

/// SS3I_binop_rm_int - Simple SSSE3 bin op whose type can be v*{i8,i16,i32}.
multiclass SS3I_binop_rm_int<bits<8> opc, string OpcodeStr,
                             Intrinsic IntId128, X86FoldableSchedWrite sched,
                             PatFrag ld_frag, bit Is2Addr = 1> {
  let isCommutable = 1 in
  def rr : SS38I<opc, MRMSrcReg, (outs VR128:$dst),
       (ins VR128:$src1, VR128:$src2),
       !if(Is2Addr,
         !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
         !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
       [(set VR128:$dst, (IntId128 VR128:$src1, VR128:$src2))]>,
       Sched<[sched]>;
  def rm : SS38I<opc, MRMSrcMem, (outs VR128:$dst),
       (ins VR128:$src1, i128mem:$src2),
       !if(Is2Addr,
         !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
         !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
       [(set VR128:$dst,
         (IntId128 VR128:$src1,
          (bitconvert (ld_frag addr:$src2))))]>,
       Sched<[sched.Folded, ReadAfterLd]>;
}

multiclass SS3I_binop_rm_int_y<bits<8> opc, string OpcodeStr,
                               Intrinsic IntId256,
                               X86FoldableSchedWrite sched> {
  let isCommutable = 1 in
  def Yrr : SS38I<opc, MRMSrcReg, (outs VR256:$dst),
       (ins VR256:$src1, VR256:$src2),
       !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
       [(set VR256:$dst, (IntId256 VR256:$src1, VR256:$src2))]>,
       Sched<[sched]>;
  def Yrm : SS38I<opc, MRMSrcMem, (outs VR256:$dst),
       (ins VR256:$src1, i256mem:$src2),
       !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
       [(set VR256:$dst,
         (IntId256 VR256:$src1, (bitconvert (loadv4i64 addr:$src2))))]>,
       Sched<[sched.Folded, ReadAfterLd]>;
}

let ImmT = NoImm, Predicates = [HasAVX, NoVLX_Or_NoBWI] in {
let isCommutable = 0 in {
  defm VPSHUFB    : SS3I_binop_rm<0x00, "vpshufb", X86pshufb, v16i8, v16i8,
                                  VR128, loadv2i64, i128mem,
                                  SchedWriteVarShuffle.XMM, 0>, VEX_4V, VEX_WIG;
  defm VPMADDUBSW : SS3I_binop_rm<0x04, "vpmaddubsw", X86vpmaddubsw, v8i16,
                                  v16i8, VR128, loadv2i64, i128mem,
                                  SchedWriteVecIMul.XMM, 0>, VEX_4V, VEX_WIG;
}
defm VPMULHRSW    : SS3I_binop_rm<0x0B, "vpmulhrsw", X86mulhrs, v8i16, v8i16,
                                  VR128, loadv2i64, i128mem,
                                  SchedWriteVecIMul.XMM, 0>, VEX_4V, VEX_WIG;
}

let ImmT = NoImm, Predicates = [HasAVX] in {
let isCommutable = 0 in {
  defm VPHADDW    : SS3I_binop_rm<0x01, "vphaddw", X86hadd, v8i16, v8i16, VR128,
                                  loadv2i64, i128mem,
                                  SchedWritePHAdd.XMM, 0>, VEX_4V, VEX_WIG;
  defm VPHADDD    : SS3I_binop_rm<0x02, "vphaddd", X86hadd, v4i32, v4i32, VR128,
                                  loadv2i64, i128mem,
                                  SchedWritePHAdd.XMM, 0>, VEX_4V, VEX_WIG;
  defm VPHSUBW    : SS3I_binop_rm<0x05, "vphsubw", X86hsub, v8i16, v8i16, VR128,
                                  loadv2i64, i128mem,
                                  SchedWritePHAdd.XMM, 0>, VEX_4V, VEX_WIG;
  defm VPHSUBD    : SS3I_binop_rm<0x06, "vphsubd", X86hsub, v4i32, v4i32, VR128,
                                  loadv2i64, i128mem,
                                  SchedWritePHAdd.XMM, 0>, VEX_4V;
  defm VPSIGNB    : SS3I_binop_rm_int<0x08, "vpsignb",
                                      int_x86_ssse3_psign_b_128,
                                      SchedWriteVecALU.XMM, loadv2i64, 0>, VEX_4V, VEX_WIG;
  defm VPSIGNW    : SS3I_binop_rm_int<0x09, "vpsignw",
                                      int_x86_ssse3_psign_w_128,
                                      SchedWriteVecALU.XMM, loadv2i64, 0>, VEX_4V, VEX_WIG;
  defm VPSIGND    : SS3I_binop_rm_int<0x0A, "vpsignd",
                                      int_x86_ssse3_psign_d_128,
                                      SchedWriteVecALU.XMM, loadv2i64, 0>, VEX_4V, VEX_WIG;
  defm VPHADDSW   : SS3I_binop_rm_int<0x03, "vphaddsw",
                                      int_x86_ssse3_phadd_sw_128,
                                      SchedWritePHAdd.XMM, loadv2i64, 0>, VEX_4V, VEX_WIG;
  defm VPHSUBSW   : SS3I_binop_rm_int<0x07, "vphsubsw",
                                      int_x86_ssse3_phsub_sw_128,
                                      SchedWritePHAdd.XMM, loadv2i64, 0>, VEX_4V, VEX_WIG;
}
}

let ImmT = NoImm, Predicates = [HasAVX2, NoVLX_Or_NoBWI] in {
let isCommutable = 0 in {
  defm VPSHUFBY   : SS3I_binop_rm<0x00, "vpshufb", X86pshufb, v32i8, v32i8,
                                  VR256, loadv4i64, i256mem,
                                  SchedWriteVarShuffle.YMM, 0>, VEX_4V, VEX_L, VEX_WIG;
  defm VPMADDUBSWY : SS3I_binop_rm<0x04, "vpmaddubsw", X86vpmaddubsw, v16i16,
                                   v32i8, VR256, loadv4i64, i256mem,
                                   SchedWriteVecIMul.YMM, 0>, VEX_4V, VEX_L, VEX_WIG;
}
defm VPMULHRSWY   : SS3I_binop_rm<0x0B, "vpmulhrsw", X86mulhrs, v16i16, v16i16,
                                  VR256, loadv4i64, i256mem,
                                  SchedWriteVecIMul.YMM, 0>, VEX_4V, VEX_L, VEX_WIG;
}

let ImmT = NoImm, Predicates = [HasAVX2] in {
let isCommutable = 0 in {
  defm VPHADDWY   : SS3I_binop_rm<0x01, "vphaddw", X86hadd, v16i16, v16i16,
                                  VR256, loadv4i64, i256mem,
                                  SchedWritePHAdd.YMM, 0>, VEX_4V, VEX_L, VEX_WIG;
  defm VPHADDDY   : SS3I_binop_rm<0x02, "vphaddd", X86hadd, v8i32, v8i32, VR256,
                                  loadv4i64, i256mem,
                                  SchedWritePHAdd.YMM, 0>, VEX_4V, VEX_L, VEX_WIG;
  defm VPHSUBWY   : SS3I_binop_rm<0x05, "vphsubw", X86hsub, v16i16, v16i16,
                                  VR256, loadv4i64, i256mem,
                                  SchedWritePHAdd.YMM, 0>, VEX_4V, VEX_L, VEX_WIG;
  defm VPHSUBDY   : SS3I_binop_rm<0x06, "vphsubd", X86hsub, v8i32, v8i32, VR256,
                                  loadv4i64, i256mem,
                                  SchedWritePHAdd.YMM, 0>, VEX_4V, VEX_L;
  defm VPSIGNB   : SS3I_binop_rm_int_y<0x08, "vpsignb", int_x86_avx2_psign_b,
                                       SchedWriteVecALU.YMM>, VEX_4V, VEX_L, VEX_WIG;
  defm VPSIGNW   : SS3I_binop_rm_int_y<0x09, "vpsignw", int_x86_avx2_psign_w,
                                       SchedWriteVecALU.YMM>, VEX_4V, VEX_L, VEX_WIG;
  defm VPSIGND   : SS3I_binop_rm_int_y<0x0A, "vpsignd", int_x86_avx2_psign_d,
                                       SchedWriteVecALU.YMM>, VEX_4V, VEX_L, VEX_WIG;
  defm VPHADDSW  : SS3I_binop_rm_int_y<0x03, "vphaddsw",
                                       int_x86_avx2_phadd_sw,
                                       SchedWritePHAdd.YMM>, VEX_4V, VEX_L, VEX_WIG;
  defm VPHSUBSW  : SS3I_binop_rm_int_y<0x07, "vphsubsw",
                                       int_x86_avx2_phsub_sw,
                                       SchedWritePHAdd.YMM>, VEX_4V, VEX_L, VEX_WIG;
}
}

// None of these have i8 immediate fields.
let ImmT = NoImm, Constraints = "$src1 = $dst" in {
let isCommutable = 0 in {
  defm PHADDW    : SS3I_binop_rm<0x01, "phaddw", X86hadd, v8i16, v8i16, VR128,
                                 memopv2i64, i128mem, SchedWritePHAdd.XMM>;
  defm PHADDD    : SS3I_binop_rm<0x02, "phaddd", X86hadd, v4i32, v4i32, VR128,
                                 memopv2i64, i128mem, SchedWritePHAdd.XMM>;
  defm PHSUBW    : SS3I_binop_rm<0x05, "phsubw", X86hsub, v8i16, v8i16, VR128,
                                 memopv2i64, i128mem, SchedWritePHAdd.XMM>;
  defm PHSUBD    : SS3I_binop_rm<0x06, "phsubd", X86hsub, v4i32, v4i32, VR128,
                                 memopv2i64, i128mem, SchedWritePHAdd.XMM>;
  defm PSIGNB    : SS3I_binop_rm_int<0x08, "psignb", int_x86_ssse3_psign_b_128,
                                     SchedWriteVecALU.XMM, memopv2i64>;
  defm PSIGNW    : SS3I_binop_rm_int<0x09, "psignw", int_x86_ssse3_psign_w_128,
                                     SchedWriteVecALU.XMM, memopv2i64>;
  defm PSIGND    : SS3I_binop_rm_int<0x0A, "psignd", int_x86_ssse3_psign_d_128,
                                     SchedWriteVecALU.XMM, memopv2i64>;
  defm PSHUFB    : SS3I_binop_rm<0x00, "pshufb", X86pshufb, v16i8, v16i8, VR128,
                                 memopv2i64, i128mem, SchedWriteVarShuffle.XMM>;
  defm PHADDSW   : SS3I_binop_rm_int<0x03, "phaddsw",
                                     int_x86_ssse3_phadd_sw_128,
                                     SchedWritePHAdd.XMM, memopv2i64>;
  defm PHSUBSW   : SS3I_binop_rm_int<0x07, "phsubsw",
                                     int_x86_ssse3_phsub_sw_128,
                                     SchedWritePHAdd.XMM, memopv2i64>;
  defm PMADDUBSW : SS3I_binop_rm<0x04, "pmaddubsw", X86vpmaddubsw, v8i16,
                                 v16i8, VR128, memopv2i64, i128mem,
                                 SchedWriteVecIMul.XMM>;
}
defm PMULHRSW    : SS3I_binop_rm<0x0B, "pmulhrsw", X86mulhrs, v8i16, v8i16,
                                 VR128, memopv2i64, i128mem, SchedWriteVecIMul.XMM>;
}

//===---------------------------------------------------------------------===//
// SSSE3 - Packed Align Instruction Patterns
//===---------------------------------------------------------------------===//

multiclass ssse3_palignr<string asm, ValueType VT, RegisterClass RC,
                         PatFrag memop_frag, X86MemOperand x86memop,
                         X86FoldableSchedWrite sched, bit Is2Addr = 1> {
  let hasSideEffects = 0 in {
  def rri : SS3AI<0x0F, MRMSrcReg, (outs RC:$dst),
      (ins RC:$src1, RC:$src2, u8imm:$src3),
      !if(Is2Addr,
        !strconcat(asm, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
        !strconcat(asm,
                  "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
      [(set RC:$dst, (VT (X86PAlignr RC:$src1, RC:$src2, (i8 imm:$src3))))]>,
      Sched<[sched]>;
  let mayLoad = 1 in
  def rmi : SS3AI<0x0F, MRMSrcMem, (outs RC:$dst),
      (ins RC:$src1, x86memop:$src2, u8imm:$src3),
      !if(Is2Addr,
        !strconcat(asm, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
        !strconcat(asm,
                  "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
      [(set RC:$dst, (VT (X86PAlignr RC:$src1,
                                     (bitconvert (memop_frag addr:$src2)),
                                     (i8 imm:$src3))))]>,
      Sched<[sched.Folded, ReadAfterLd]>;
  }
}

let Predicates = [HasAVX, NoVLX_Or_NoBWI] in
  defm VPALIGNR : ssse3_palignr<"vpalignr", v16i8, VR128, loadv2i64, i128mem,
                                SchedWriteShuffle.XMM, 0>, VEX_4V, VEX_WIG;
let Predicates = [HasAVX2, NoVLX_Or_NoBWI] in
  defm VPALIGNRY : ssse3_palignr<"vpalignr", v32i8, VR256, loadv4i64, i256mem,
                                 SchedWriteShuffle.YMM, 0>, VEX_4V, VEX_L, VEX_WIG;
let Constraints = "$src1 = $dst", Predicates = [UseSSSE3] in
  defm PALIGNR : ssse3_palignr<"palignr", v16i8, VR128, memopv2i64, i128mem,
                               SchedWriteShuffle.XMM>;
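
// For reference, palignr concatenates $src1:$src2 (with $src2 supplying the
// low half) and extracts a byte-aligned 128-bit window: e.g.
// "palignr $4, %xmm1, %xmm0" yields bytes 4..19 of the 32-byte concatenation.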

//===---------------------------------------------------------------------===//
// SSE3 - Thread synchronization
//===---------------------------------------------------------------------===//

let SchedRW = [WriteSystem] in {
let usesCustomInserter = 1 in {
def MONITOR : PseudoI<(outs), (ins i32mem:$src1, GR32:$src2, GR32:$src3),
                [(int_x86_sse3_monitor addr:$src1, GR32:$src2, GR32:$src3)]>,
                Requires<[HasSSE3]>;
}

let Uses = [EAX, ECX, EDX] in
def MONITORrrr : I<0x01, MRM_C8, (outs), (ins), "monitor", []>,
                   TB, Requires<[HasSSE3]>;

let Uses = [ECX, EAX] in
def MWAITrr   : I<0x01, MRM_C9, (outs), (ins), "mwait",
                  [(int_x86_sse3_mwait ECX, EAX)]>, TB, Requires<[HasSSE3]>;
} // SchedRW

def : InstAlias<"mwait\t{%eax, %ecx|ecx, eax}", (MWAITrr)>, Requires<[Not64BitMode]>;
def : InstAlias<"mwait\t{%rax, %rcx|rcx, rax}", (MWAITrr)>, Requires<[In64BitMode]>;

def : InstAlias<"monitor\t{%eax, %ecx, %edx|edx, ecx, eax}", (MONITORrrr)>,
      Requires<[Not64BitMode]>;
def : InstAlias<"monitor\t{%rax, %rcx, %rdx|rdx, rcx, rax}", (MONITORrrr)>,
      Requires<[In64BitMode]>;
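// These aliases accept the explicit-operand spellings, e.g.
//   monitor %rax, %rcx, %rdx
// in 64-bit mode, which assemble to the same encoding as the bare mnemonic.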

//===----------------------------------------------------------------------===//
// SSE4.1 - Packed Move with Sign/Zero Extend
//===----------------------------------------------------------------------===//

multiclass SS41I_pmovx_rrrm<bits<8> opc, string OpcodeStr, X86MemOperand MemOp,
                            RegisterClass OutRC, RegisterClass InRC,
                            X86FoldableSchedWrite sched> {
  def rr : SS48I<opc, MRMSrcReg, (outs OutRC:$dst), (ins InRC:$src),
                 !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), []>,
                 Sched<[sched]>;

  def rm : SS48I<opc, MRMSrcMem, (outs OutRC:$dst), (ins MemOp:$src),
                 !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), []>,
                 Sched<[sched.Folded]>;
}

multiclass SS41I_pmovx_rm_all<bits<8> opc, string OpcodeStr,
                              X86MemOperand MemOp, X86MemOperand MemYOp,
                              Predicate prd> {
  defm NAME : SS41I_pmovx_rrrm<opc, OpcodeStr, MemOp, VR128, VR128,
                               SchedWriteShuffle.XMM>;
  let Predicates = [HasAVX, prd] in
    defm V#NAME   : SS41I_pmovx_rrrm<opc, !strconcat("v", OpcodeStr), MemOp,
                                     VR128, VR128, SchedWriteShuffle.XMM>,
                                     VEX, VEX_WIG;
  let Predicates = [HasAVX2, prd] in
    defm V#NAME#Y : SS41I_pmovx_rrrm<opc, !strconcat("v", OpcodeStr), MemYOp,
                                     VR256, VR128, WriteShuffle256>,
                                     VEX, VEX_L, VEX_WIG;
}

multiclass SS41I_pmovx_rm<bits<8> opc, string OpcodeStr, X86MemOperand MemOp,
                          X86MemOperand MemYOp, Predicate prd> {
  defm PMOVSX#NAME : SS41I_pmovx_rm_all<opc, !strconcat("pmovsx", OpcodeStr),
                                        MemOp, MemYOp, prd>;
  defm PMOVZX#NAME : SS41I_pmovx_rm_all<!add(opc, 0x10),
                                        !strconcat("pmovzx", OpcodeStr),
                                        MemOp, MemYOp, prd>;
}

defm BW : SS41I_pmovx_rm<0x20, "bw", i64mem, i128mem, NoVLX_Or_NoBWI>;
defm WD : SS41I_pmovx_rm<0x23, "wd", i64mem, i128mem, NoVLX>;
defm DQ : SS41I_pmovx_rm<0x25, "dq", i64mem, i128mem, NoVLX>;

defm BD : SS41I_pmovx_rm<0x21, "bd", i32mem, i64mem, NoVLX>;
defm WQ : SS41I_pmovx_rm<0x24, "wq", i32mem, i64mem, NoVLX>;

defm BQ : SS41I_pmovx_rm<0x22, "bq", i16mem, i32mem, NoVLX>;

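// E.g. "defm BW" above expands to PMOVSXBW (opcode 0x20) and PMOVZXBW
// (opcode 0x30 = 0x20+0x10), plus the VEX forms VPMOVSXBW/VPMOVZXBW and the
// 256-bit Y variants when the AVX/AVX2 predicates allow them.
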
// AVX2 Patterns
multiclass SS41I_pmovx_avx2_patterns<string OpcPrefix, string ExtTy, SDNode ExtOp> {
  // Register-Register patterns
  let Predicates = [HasAVX, NoVLX_Or_NoBWI] in {
  def : Pat<(v16i16 (ExtOp (v16i8 VR128:$src))),
            (!cast<I>(OpcPrefix#BWYrr) VR128:$src)>;
  }
  let Predicates = [HasAVX, NoVLX] in {
  def : Pat<(v8i32 (ExtOp (v16i8 VR128:$src))),
            (!cast<I>(OpcPrefix#BDYrr) VR128:$src)>;
  def : Pat<(v4i64 (ExtOp (v16i8 VR128:$src))),
            (!cast<I>(OpcPrefix#BQYrr) VR128:$src)>;

  def : Pat<(v8i32 (ExtOp (v8i16 VR128:$src))),
            (!cast<I>(OpcPrefix#WDYrr) VR128:$src)>;
  def : Pat<(v4i64 (ExtOp (v8i16 VR128:$src))),
            (!cast<I>(OpcPrefix#WQYrr) VR128:$src)>;

  def : Pat<(v4i64 (ExtOp (v4i32 VR128:$src))),
            (!cast<I>(OpcPrefix#DQYrr) VR128:$src)>;
  }

  // Simple Register-Memory patterns
  let Predicates = [HasAVX, NoVLX_Or_NoBWI] in {
  def : Pat<(v16i16 (!cast<PatFrag>(ExtTy#"extloadvi8") addr:$src)),
            (!cast<I>(OpcPrefix#BWYrm) addr:$src)>;
  }
  let Predicates = [HasAVX, NoVLX] in {
  def : Pat<(v8i32 (!cast<PatFrag>(ExtTy#"extloadvi8") addr:$src)),
            (!cast<I>(OpcPrefix#BDYrm) addr:$src)>;
  def : Pat<(v4i64 (!cast<PatFrag>(ExtTy#"extloadvi8") addr:$src)),
            (!cast<I>(OpcPrefix#BQYrm) addr:$src)>;

  def : Pat<(v8i32 (!cast<PatFrag>(ExtTy#"extloadvi16") addr:$src)),
            (!cast<I>(OpcPrefix#WDYrm) addr:$src)>;
  def : Pat<(v4i64 (!cast<PatFrag>(ExtTy#"extloadvi16") addr:$src)),
            (!cast<I>(OpcPrefix#WQYrm) addr:$src)>;

  def : Pat<(v4i64 (!cast<PatFrag>(ExtTy#"extloadvi32") addr:$src)),
            (!cast<I>(OpcPrefix#DQYrm) addr:$src)>;
  }

  // AVX2 Register-Memory patterns
  let Predicates = [HasAVX, NoVLX_Or_NoBWI] in {
  def : Pat<(v16i16 (ExtOp (bc_v16i8 (loadv2i64 addr:$src)))),
            (!cast<I>(OpcPrefix#BWYrm) addr:$src)>;
  def : Pat<(v16i16 (ExtOp (v16i8 (vzmovl_v2i64 addr:$src)))),
            (!cast<I>(OpcPrefix#BWYrm) addr:$src)>;
  def : Pat<(v16i16 (ExtOp (v16i8 (vzload_v2i64 addr:$src)))),
            (!cast<I>(OpcPrefix#BWYrm) addr:$src)>;
  }
  let Predicates = [HasAVX, NoVLX] in {
  def : Pat<(v8i32 (ExtOp (bc_v16i8 (v2i64 (scalar_to_vector (loadi64 addr:$src)))))),
            (!cast<I>(OpcPrefix#BDYrm) addr:$src)>;
  def : Pat<(v8i32 (ExtOp (v16i8 (vzmovl_v2i64 addr:$src)))),
            (!cast<I>(OpcPrefix#BDYrm) addr:$src)>;
  def : Pat<(v8i32 (ExtOp (v16i8 (vzload_v2i64 addr:$src)))),
            (!cast<I>(OpcPrefix#BDYrm) addr:$src)>;
  def : Pat<(v8i32 (ExtOp (bc_v16i8 (loadv2i64 addr:$src)))),
            (!cast<I>(OpcPrefix#BDYrm) addr:$src)>;

  def : Pat<(v4i64 (ExtOp (bc_v16i8 (v4i32 (scalar_to_vector (loadi32 addr:$src)))))),
            (!cast<I>(OpcPrefix#BQYrm) addr:$src)>;
  def : Pat<(v4i64 (ExtOp (v16i8 (vzmovl_v4i32 addr:$src)))),
            (!cast<I>(OpcPrefix#BQYrm) addr:$src)>;
  def : Pat<(v4i64 (ExtOp (v16i8 (vzload_v2i64 addr:$src)))),
            (!cast<I>(OpcPrefix#BQYrm) addr:$src)>;
  def : Pat<(v4i64 (ExtOp (bc_v16i8 (loadv2i64 addr:$src)))),
            (!cast<I>(OpcPrefix#BQYrm) addr:$src)>;

  def : Pat<(v8i32 (ExtOp (bc_v8i16 (loadv2i64 addr:$src)))),
            (!cast<I>(OpcPrefix#WDYrm) addr:$src)>;
  def : Pat<(v8i32 (ExtOp (v8i16 (vzmovl_v2i64 addr:$src)))),
            (!cast<I>(OpcPrefix#WDYrm) addr:$src)>;
  def : Pat<(v8i32 (ExtOp (v8i16 (vzload_v2i64 addr:$src)))),
            (!cast<I>(OpcPrefix#WDYrm) addr:$src)>;

  def : Pat<(v4i64 (ExtOp (bc_v8i16 (v2i64 (scalar_to_vector (loadi64 addr:$src)))))),
            (!cast<I>(OpcPrefix#WQYrm) addr:$src)>;
  def : Pat<(v4i64 (ExtOp (v8i16 (vzmovl_v2i64 addr:$src)))),
            (!cast<I>(OpcPrefix#WQYrm) addr:$src)>;
  def : Pat<(v4i64 (ExtOp (v8i16 (vzload_v2i64 addr:$src)))),
            (!cast<I>(OpcPrefix#WQYrm) addr:$src)>;
  def : Pat<(v4i64 (ExtOp (bc_v8i16 (loadv2i64 addr:$src)))),
            (!cast<I>(OpcPrefix#WQYrm) addr:$src)>;

  def : Pat<(v4i64 (ExtOp (bc_v4i32 (loadv2i64 addr:$src)))),
            (!cast<I>(OpcPrefix#DQYrm) addr:$src)>;
  def : Pat<(v4i64 (ExtOp (v4i32 (vzmovl_v2i64 addr:$src)))),
            (!cast<I>(OpcPrefix#DQYrm) addr:$src)>;
  def : Pat<(v4i64 (ExtOp (v4i32 (vzload_v2i64 addr:$src)))),
            (!cast<I>(OpcPrefix#DQYrm) addr:$src)>;
  }
}

defm : SS41I_pmovx_avx2_patterns<"VPMOVSX", "s", X86vsext>;
defm : SS41I_pmovx_avx2_patterns<"VPMOVZX", "z", X86vzext>;

// SSE4.1/AVX patterns.
multiclass SS41I_pmovx_patterns<string OpcPrefix, string ExtTy,
                                SDNode ExtOp> {
  let Predicates = [HasAVX, NoVLX_Or_NoBWI] in {
  def : Pat<(v8i16 (ExtOp (v16i8 VR128:$src))),
            (!cast<I>(OpcPrefix#BWrr) VR128:$src)>;
  }
  let Predicates = [HasAVX, NoVLX] in {
  def : Pat<(v4i32 (ExtOp (v16i8 VR128:$src))),
            (!cast<I>(OpcPrefix#BDrr) VR128:$src)>;
  def : Pat<(v2i64 (ExtOp (v16i8 VR128:$src))),
            (!cast<I>(OpcPrefix#BQrr) VR128:$src)>;

  def : Pat<(v4i32 (ExtOp (v8i16 VR128:$src))),
            (!cast<I>(OpcPrefix#WDrr) VR128:$src)>;
  def : Pat<(v2i64 (ExtOp (v8i16 VR128:$src))),
            (!cast<I>(OpcPrefix#WQrr) VR128:$src)>;

  def : Pat<(v2i64 (ExtOp (v4i32 VR128:$src))),
            (!cast<I>(OpcPrefix#DQrr) VR128:$src)>;
  }
  let Predicates = [HasAVX, NoVLX_Or_NoBWI] in {
  def : Pat<(v8i16 (!cast<PatFrag>(ExtTy#"extloadvi8") addr:$src)),
            (!cast<I>(OpcPrefix#BWrm) addr:$src)>;
  }
  let Predicates = [HasAVX, NoVLX] in {
  def : Pat<(v4i32 (!cast<PatFrag>(ExtTy#"extloadvi8") addr:$src)),
            (!cast<I>(OpcPrefix#BDrm) addr:$src)>;
  def : Pat<(v2i64 (!cast<PatFrag>(ExtTy#"extloadvi8") addr:$src)),
            (!cast<I>(OpcPrefix#BQrm) addr:$src)>;

  def : Pat<(v4i32 (!cast<PatFrag>(ExtTy#"extloadvi16") addr:$src)),
            (!cast<I>(OpcPrefix#WDrm) addr:$src)>;
  def : Pat<(v2i64 (!cast<PatFrag>(ExtTy#"extloadvi16") addr:$src)),
            (!cast<I>(OpcPrefix#WQrm) addr:$src)>;

  def : Pat<(v2i64 (!cast<PatFrag>(ExtTy#"extloadvi32") addr:$src)),
            (!cast<I>(OpcPrefix#DQrm) addr:$src)>;
  }
  let Predicates = [HasAVX, NoVLX_Or_NoBWI] in {
  def : Pat<(v8i16 (ExtOp (bc_v16i8 (v2i64 (scalar_to_vector (loadi64 addr:$src)))))),
            (!cast<I>(OpcPrefix#BWrm) addr:$src)>;
  def : Pat<(v8i16 (ExtOp (bc_v16i8 (v2f64 (scalar_to_vector (loadf64 addr:$src)))))),
            (!cast<I>(OpcPrefix#BWrm) addr:$src)>;
  def : Pat<(v8i16 (ExtOp (v16i8 (vzmovl_v2i64 addr:$src)))),
            (!cast<I>(OpcPrefix#BWrm) addr:$src)>;
  def : Pat<(v8i16 (ExtOp (v16i8 (vzload_v2i64 addr:$src)))),
            (!cast<I>(OpcPrefix#BWrm) addr:$src)>;
  def : Pat<(v8i16 (ExtOp (bc_v16i8 (loadv2i64 addr:$src)))),
            (!cast<I>(OpcPrefix#BWrm) addr:$src)>;
  }
  let Predicates = [HasAVX, NoVLX] in {
  def : Pat<(v4i32 (ExtOp (bc_v16i8 (v4i32 (scalar_to_vector (loadi32 addr:$src)))))),
            (!cast<I>(OpcPrefix#BDrm) addr:$src)>;
  def : Pat<(v4i32 (ExtOp (v16i8 (vzmovl_v4i32 addr:$src)))),
            (!cast<I>(OpcPrefix#BDrm) addr:$src)>;
  def : Pat<(v4i32 (ExtOp (v16i8 (vzload_v2i64 addr:$src)))),
            (!cast<I>(OpcPrefix#BDrm) addr:$src)>;
  def : Pat<(v4i32 (ExtOp (bc_v16i8 (loadv2i64 addr:$src)))),
            (!cast<I>(OpcPrefix#BDrm) addr:$src)>;

  def : Pat<(v2i64 (ExtOp (bc_v16i8 (v4i32 (scalar_to_vector (extloadi32i16 addr:$src)))))),
            (!cast<I>(OpcPrefix#BQrm) addr:$src)>;
  def : Pat<(v2i64 (ExtOp (v16i8 (vzmovl_v4i32 addr:$src)))),
            (!cast<I>(OpcPrefix#BQrm) addr:$src)>;
  def : Pat<(v2i64 (ExtOp (v16i8 (vzload_v2i64 addr:$src)))),
            (!cast<I>(OpcPrefix#BQrm) addr:$src)>;
  def : Pat<(v2i64 (ExtOp (bc_v16i8 (loadv2i64 addr:$src)))),
            (!cast<I>(OpcPrefix#BQrm) addr:$src)>;

  def : Pat<(v4i32 (ExtOp (bc_v8i16 (v2i64 (scalar_to_vector (loadi64 addr:$src)))))),
            (!cast<I>(OpcPrefix#WDrm) addr:$src)>;
  def : Pat<(v4i32 (ExtOp (bc_v8i16 (v2f64 (scalar_to_vector (loadf64 addr:$src)))))),
            (!cast<I>(OpcPrefix#WDrm) addr:$src)>;
  def : Pat<(v4i32 (ExtOp (v8i16 (vzmovl_v2i64 addr:$src)))),
            (!cast<I>(OpcPrefix#WDrm) addr:$src)>;
  def : Pat<(v4i32 (ExtOp (v8i16 (vzload_v2i64 addr:$src)))),
            (!cast<I>(OpcPrefix#WDrm) addr:$src)>;
  def : Pat<(v4i32 (ExtOp (bc_v8i16 (loadv2i64 addr:$src)))),
            (!cast<I>(OpcPrefix#WDrm) addr:$src)>;

  def : Pat<(v2i64 (ExtOp (bc_v8i16 (v4i32 (scalar_to_vector (loadi32 addr:$src)))))),
            (!cast<I>(OpcPrefix#WQrm) addr:$src)>;
  def : Pat<(v2i64 (ExtOp (v8i16 (vzmovl_v4i32 addr:$src)))),
            (!cast<I>(OpcPrefix#WQrm) addr:$src)>;
  def : Pat<(v2i64 (ExtOp (v8i16 (vzload_v2i64 addr:$src)))),
            (!cast<I>(OpcPrefix#WQrm) addr:$src)>;
  def : Pat<(v2i64 (ExtOp (bc_v8i16 (loadv2i64 addr:$src)))),
            (!cast<I>(OpcPrefix#WQrm) addr:$src)>;

  def : Pat<(v2i64 (ExtOp (bc_v4i32 (v2i64 (scalar_to_vector (loadi64 addr:$src)))))),
            (!cast<I>(OpcPrefix#DQrm) addr:$src)>;
  def : Pat<(v2i64 (ExtOp (bc_v4i32 (v2f64 (scalar_to_vector (loadf64 addr:$src)))))),
            (!cast<I>(OpcPrefix#DQrm) addr:$src)>;
  def : Pat<(v2i64 (ExtOp (v4i32 (vzmovl_v2i64 addr:$src)))),
            (!cast<I>(OpcPrefix#DQrm) addr:$src)>;
  def : Pat<(v2i64 (ExtOp (v4i32 (vzload_v2i64 addr:$src)))),
            (!cast<I>(OpcPrefix#DQrm) addr:$src)>;
  def : Pat<(v2i64 (ExtOp (bc_v4i32 (loadv2i64 addr:$src)))),
            (!cast<I>(OpcPrefix#DQrm) addr:$src)>;
  }
}

defm : SS41I_pmovx_patterns<"VPMOVSX", "s", sext_invec>;
defm : SS41I_pmovx_patterns<"VPMOVZX", "z", zext_invec>;

let Predicates = [UseSSE41] in {
  defm : SS41I_pmovx_patterns<"PMOVSX", "s", sext_invec>;
  defm : SS41I_pmovx_patterns<"PMOVZX", "z", zext_invec>;
}

//===----------------------------------------------------------------------===//
// SSE4.1 - Extract Instructions
//===----------------------------------------------------------------------===//

/// SS41I_extract8 - SSE 4.1 extract 8 bits to 32-bit reg or 8-bit mem
multiclass SS41I_extract8<bits<8> opc, string OpcodeStr> {
  def rr : SS4AIi8<opc, MRMDestReg, (outs GR32orGR64:$dst),
                 (ins VR128:$src1, u8imm:$src2),
                 !strconcat(OpcodeStr,
                            "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
                 [(set GR32orGR64:$dst, (X86pextrb (v16i8 VR128:$src1),
                                         imm:$src2))]>,
                  Sched<[WriteVecExtract]>;
  let hasSideEffects = 0, mayStore = 1 in
  def mr : SS4AIi8<opc, MRMDestMem, (outs),
                 (ins i8mem:$dst, VR128:$src1, u8imm:$src2),
                 !strconcat(OpcodeStr,
                            "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
                 [(store (i8 (trunc (X86pextrb (v16i8 VR128:$src1), imm:$src2))),
                          addr:$dst)]>, Sched<[WriteVecExtractSt]>;
}

let Predicates = [HasAVX, NoBWI] in
  defm VPEXTRB : SS41I_extract8<0x14, "vpextrb">, VEX;

defm PEXTRB      : SS41I_extract8<0x14, "pextrb">;


/// SS41I_extract16 - SSE 4.1 extract 16 bits to memory destination
multiclass SS41I_extract16<bits<8> opc, string OpcodeStr> {
  let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0 in
  def rr_REV : SS4AIi8<opc, MRMDestReg, (outs GR32orGR64:$dst),
                   (ins VR128:$src1, u8imm:$src2),
                   !strconcat(OpcodeStr,
                   "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), []>,
                   Sched<[WriteVecExtract]>, FoldGenData<NAME#rr>;

  let hasSideEffects = 0, mayStore = 1 in
  def mr : SS4AIi8<opc, MRMDestMem, (outs),
                 (ins i16mem:$dst, VR128:$src1, u8imm:$src2),
                 !strconcat(OpcodeStr,
                  "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
                 [(store (i16 (trunc (X86pextrw (v8i16 VR128:$src1), imm:$src2))),
                          addr:$dst)]>, Sched<[WriteVecExtractSt]>;
}

let Predicates = [HasAVX, NoBWI] in
  defm VPEXTRW : SS41I_extract16<0x15, "vpextrw">, VEX;

defm PEXTRW      : SS41I_extract16<0x15, "pextrw">;


/// SS41I_extract32 - SSE 4.1 extract 32 bits to int reg or memory destination
multiclass SS41I_extract32<bits<8> opc, string OpcodeStr> {
  def rr : SS4AIi8<opc, MRMDestReg, (outs GR32:$dst),
                 (ins VR128:$src1, u8imm:$src2),
                 !strconcat(OpcodeStr,
                  "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
                 [(set GR32:$dst,
                  (extractelt (v4i32 VR128:$src1), imm:$src2))]>,
                  Sched<[WriteVecExtract]>;
  def mr : SS4AIi8<opc, MRMDestMem, (outs),
                 (ins i32mem:$dst, VR128:$src1, u8imm:$src2),
                 !strconcat(OpcodeStr,
                  "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
                 [(store (extractelt (v4i32 VR128:$src1), imm:$src2),
                          addr:$dst)]>, Sched<[WriteVecExtractSt]>;
}

let Predicates = [HasAVX, NoDQI] in
  defm VPEXTRD : SS41I_extract32<0x16, "vpextrd">, VEX;

defm PEXTRD      : SS41I_extract32<0x16, "pextrd">;

/// SS41I_extract64 - SSE 4.1 extract 64 bits to int reg or memory destination
multiclass SS41I_extract64<bits<8> opc, string OpcodeStr> {
  def rr : SS4AIi8<opc, MRMDestReg, (outs GR64:$dst),
                 (ins VR128:$src1, u8imm:$src2),
                 !strconcat(OpcodeStr,
                  "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
                 [(set GR64:$dst,
                  (extractelt (v2i64 VR128:$src1), imm:$src2))]>,
                  Sched<[WriteVecExtract]>;
  def mr : SS4AIi8<opc, MRMDestMem, (outs),
                 (ins i64mem:$dst, VR128:$src1, u8imm:$src2),
                 !strconcat(OpcodeStr,
                  "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
                 [(store (extractelt (v2i64 VR128:$src1), imm:$src2),
                          addr:$dst)]>, Sched<[WriteVecExtractSt]>;
}

let Predicates = [HasAVX, NoDQI] in
  defm VPEXTRQ : SS41I_extract64<0x16, "vpextrq">, VEX, VEX_W;

defm PEXTRQ      : SS41I_extract64<0x16, "pextrq">, REX_W;

/// SS41I_extractf32 - SSE 4.1 extract 32 bits fp value to int reg or memory
/// destination
multiclass SS41I_extractf32<bits<8> opc, string OpcodeStr> {
  def rr : SS4AIi8<opc, MRMDestReg, (outs GR32orGR64:$dst),
                   (ins VR128:$src1, u8imm:$src2),
                   !strconcat(OpcodeStr,
                    "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
                   [(set GR32orGR64:$dst,
                      (extractelt (bc_v4i32 (v4f32 VR128:$src1)), imm:$src2))]>,
                   Sched<[WriteVecExtract]>;
  def mr : SS4AIi8<opc, MRMDestMem, (outs),
                   (ins f32mem:$dst, VR128:$src1, u8imm:$src2),
                   !strconcat(OpcodeStr,
                    "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
                   [(store (extractelt (bc_v4i32 (v4f32 VR128:$src1)), imm:$src2),
                            addr:$dst)]>, Sched<[WriteVecExtractSt]>;
}

let ExeDomain = SSEPackedSingle in {
  let Predicates = [UseAVX] in
    defm VEXTRACTPS : SS41I_extractf32<0x17, "vextractps">, VEX, VEX_WIG;
  defm EXTRACTPS   : SS41I_extractf32<0x17, "extractps">;
}

// Also match an EXTRACTPS store when the store is done as f32 instead of i32.
def : Pat<(store (f32 (bitconvert (extractelt (bc_v4i32 (v4f32 VR128:$src1)),
                                              imm:$src2))),
                 addr:$dst),
          (VEXTRACTPSmr addr:$dst, VR128:$src1, imm:$src2)>,
          Requires<[HasAVX]>;
def : Pat<(store (f32 (bitconvert (extractelt (bc_v4i32 (v4f32 VR128:$src1)),
                                              imm:$src2))),
                 addr:$dst),
          (EXTRACTPSmr addr:$dst, VR128:$src1, imm:$src2)>,
          Requires<[UseSSE41]>;

//===----------------------------------------------------------------------===//
// SSE4.1 - Insert Instructions
//===----------------------------------------------------------------------===//

multiclass SS41I_insert8<bits<8> opc, string asm, bit Is2Addr = 1> {
  def rr : SS4AIi8<opc, MRMSrcReg, (outs VR128:$dst),
      (ins VR128:$src1, GR32orGR64:$src2, u8imm:$src3),
      !if(Is2Addr,
        !strconcat(asm, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
        !strconcat(asm,
                   "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
      [(set VR128:$dst,
        (X86pinsrb VR128:$src1, GR32orGR64:$src2, imm:$src3))]>,
      Sched<[WriteVecInsert]>;
  def rm : SS4AIi8<opc, MRMSrcMem, (outs VR128:$dst),
      (ins VR128:$src1, i8mem:$src2, u8imm:$src3),
      !if(Is2Addr,
        !strconcat(asm, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
        !strconcat(asm,
                   "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
      [(set VR128:$dst,
        (X86pinsrb VR128:$src1, (extloadi8 addr:$src2),
                   imm:$src3))]>, Sched<[WriteVecInsertLd, ReadAfterLd]>;
}

let Predicates = [HasAVX, NoBWI] in
  defm VPINSRB : SS41I_insert8<0x20, "vpinsrb", 0>, VEX_4V;
let Constraints = "$src1 = $dst" in
  defm PINSRB  : SS41I_insert8<0x20, "pinsrb">;

multiclass SS41I_insert32<bits<8> opc, string asm, bit Is2Addr = 1> {
  def rr : SS4AIi8<opc, MRMSrcReg, (outs VR128:$dst),
      (ins VR128:$src1, GR32:$src2, u8imm:$src3),
      !if(Is2Addr,
        !strconcat(asm, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
        !strconcat(asm,
                   "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
      [(set VR128:$dst,
        (v4i32 (insertelt VR128:$src1, GR32:$src2, imm:$src3)))]>,
      Sched<[WriteVecInsert]>;
  def rm : SS4AIi8<opc, MRMSrcMem, (outs VR128:$dst),
      (ins VR128:$src1, i32mem:$src2, u8imm:$src3),
      !if(Is2Addr,
        !strconcat(asm, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
        !strconcat(asm,
                   "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
      [(set VR128:$dst,
        (v4i32 (insertelt VR128:$src1, (loadi32 addr:$src2),
                          imm:$src3)))]>, Sched<[WriteVecInsertLd, ReadAfterLd]>;
}

let Predicates = [HasAVX, NoDQI] in
  defm VPINSRD : SS41I_insert32<0x22, "vpinsrd", 0>, VEX_4V;
let Constraints = "$src1 = $dst" in
  defm PINSRD : SS41I_insert32<0x22, "pinsrd">;

multiclass SS41I_insert64<bits<8> opc, string asm, bit Is2Addr = 1> {
  def rr : SS4AIi8<opc, MRMSrcReg, (outs VR128:$dst),
      (ins VR128:$src1, GR64:$src2, u8imm:$src3),
      !if(Is2Addr,
        !strconcat(asm, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
        !strconcat(asm,
                   "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
      [(set VR128:$dst,
        (v2i64 (insertelt VR128:$src1, GR64:$src2, imm:$src3)))]>,
      Sched<[WriteVecInsert]>;
  def rm : SS4AIi8<opc, MRMSrcMem, (outs VR128:$dst),
      (ins VR128:$src1, i64mem:$src2, u8imm:$src3),
      !if(Is2Addr,
        !strconcat(asm, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
        !strconcat(asm,
                   "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
      [(set VR128:$dst,
        (v2i64 (insertelt VR128:$src1, (loadi64 addr:$src2),
                          imm:$src3)))]>, Sched<[WriteVecInsertLd, ReadAfterLd]>;
}

let Predicates = [HasAVX, NoDQI] in
  defm VPINSRQ : SS41I_insert64<0x22, "vpinsrq", 0>, VEX_4V, VEX_W;
let Constraints = "$src1 = $dst" in
  defm PINSRQ : SS41I_insert64<0x22, "pinsrq">, REX_W;

// insertps has a few different modes. The first two below are optimized
// inserts that won't zero arbitrary elements in the destination vector; the
// next one matches the intrinsic and may zero arbitrary elements in the
// target vector.
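// As a reminder of the immediate layout (per the Intel SDM): when the second
// source is a register, imm8[7:6] selects the source element, imm8[5:4]
// selects the destination element, and imm8[3:0] is a zero mask applied to
// the result. For example, imm8 = 0x10 copies element 0 of the source into
// element 1 of the destination and zeroes nothing.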
multiclass SS41I_insertf32<bits<8> opc, string asm, bit Is2Addr = 1> {
  def rr : SS4AIi8<opc, MRMSrcReg, (outs VR128:$dst),
      (ins VR128:$src1, VR128:$src2, u8imm:$src3),
      !if(Is2Addr,
        !strconcat(asm, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
        !strconcat(asm,
                   "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
      [(set VR128:$dst,
        (X86insertps VR128:$src1, VR128:$src2, imm:$src3))]>,
      Sched<[SchedWriteFShuffle.XMM]>;
  def rm : SS4AIi8<opc, MRMSrcMem, (outs VR128:$dst),
      (ins VR128:$src1, f32mem:$src2, u8imm:$src3),
      !if(Is2Addr,
        !strconcat(asm, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
        !strconcat(asm,
                   "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
      [(set VR128:$dst,
        (X86insertps VR128:$src1,
                   (v4f32 (scalar_to_vector (loadf32 addr:$src2))),
                    imm:$src3))]>,
      Sched<[SchedWriteFShuffle.XMM.Folded, ReadAfterLd]>;
}

let ExeDomain = SSEPackedSingle in {
  let Predicates = [UseAVX] in
    defm VINSERTPS : SS41I_insertf32<0x21, "vinsertps", 0>,
                     VEX_4V, VEX_WIG;
  let Constraints = "$src1 = $dst" in
    defm INSERTPS : SS41I_insertf32<0x21, "insertps", 1>;
}

let Predicates = [UseAVX] in {
  // If we're inserting an element from a vbroadcast of a load, fold the
  // load into the X86insertps instruction.
  def : Pat<(v4f32 (X86insertps (v4f32 VR128:$src1),
                (X86VBroadcast (loadf32 addr:$src2)), imm:$src3)),
            (VINSERTPSrm VR128:$src1, addr:$src2, imm:$src3)>;
  def : Pat<(v4f32 (X86insertps (v4f32 VR128:$src1),
                (X86VBroadcast (loadv4f32 addr:$src2)), imm:$src3)),
            (VINSERTPSrm VR128:$src1, addr:$src2, imm:$src3)>;
}

//===----------------------------------------------------------------------===//
// SSE4.1 - Round Instructions
//===----------------------------------------------------------------------===//

multiclass sse41_fp_unop_p<bits<8> opc, string OpcodeStr,
                           X86MemOperand x86memop, RegisterClass RC,
                           ValueType VT, PatFrag mem_frag, SDNode OpNode,
                           X86FoldableSchedWrite sched> {
  // Vector intrinsic operation, reg
  def r : SS4AIi8<opc, MRMSrcReg,
                  (outs RC:$dst), (ins RC:$src1, i32u8imm:$src2),
                  !strconcat(OpcodeStr,
                  "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
                  [(set RC:$dst, (VT (OpNode RC:$src1, imm:$src2)))]>,
                  Sched<[sched]>;

  // Vector intrinsic operation, mem
  def m : SS4AIi8<opc, MRMSrcMem,
                  (outs RC:$dst), (ins x86memop:$src1, i32u8imm:$src2),
                  !strconcat(OpcodeStr,
                  "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
                  [(set RC:$dst,
                        (VT (OpNode (mem_frag addr:$src1), imm:$src2)))]>,
                  Sched<[sched.Folded]>;
}

multiclass avx_fp_unop_rm<bits<8> opcss, bits<8> opcsd,
                          string OpcodeStr, X86FoldableSchedWrite sched> {
let ExeDomain = SSEPackedSingle, hasSideEffects = 0 in {
  def SSr : SS4AIi8<opcss, MRMSrcReg,
        (outs FR32:$dst), (ins FR32:$src1, FR32:$src2, i32u8imm:$src3),
        !strconcat(OpcodeStr,
            "ss\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
      []>, Sched<[sched]>;

  let mayLoad = 1 in
  def SSm : SS4AIi8<opcss, MRMSrcMem,
        (outs FR32:$dst), (ins FR32:$src1, f32mem:$src2, i32u8imm:$src3),
        !strconcat(OpcodeStr,
             "ss\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
        []>, Sched<[sched.Folded, ReadAfterLd]>;
} // ExeDomain = SSEPackedSingle, hasSideEffects = 0

let ExeDomain = SSEPackedDouble, hasSideEffects = 0 in {
  def SDr : SS4AIi8<opcsd, MRMSrcReg,
        (outs FR64:$dst), (ins FR64:$src1, FR64:$src2, i32u8imm:$src3),
        !strconcat(OpcodeStr,
              "sd\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
        []>, Sched<[sched]>;

  let mayLoad = 1 in
  def SDm : SS4AIi8<opcsd, MRMSrcMem,
        (outs FR64:$dst), (ins FR64:$src1, f64mem:$src2, i32u8imm:$src3),
        !strconcat(OpcodeStr,
             "sd\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
        []>, Sched<[sched.Folded, ReadAfterLd]>;
} // ExeDomain = SSEPackedDouble, hasSideEffects = 0
}

multiclass sse41_fp_unop_s<bits<8> opcss, bits<8> opcsd,
                           string OpcodeStr, X86FoldableSchedWrite sched> {
let ExeDomain = SSEPackedSingle, hasSideEffects = 0 in {
  def SSr : SS4AIi8<opcss, MRMSrcReg,
                    (outs FR32:$dst), (ins FR32:$src1, i32u8imm:$src2),
                    !strconcat(OpcodeStr,
                               "ss\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
                    []>, Sched<[sched]>;

  let mayLoad = 1 in
  def SSm : SS4AIi8<opcss, MRMSrcMem,
                    (outs FR32:$dst), (ins f32mem:$src1, i32u8imm:$src2),
                    !strconcat(OpcodeStr,
                               "ss\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
                    []>, Sched<[sched.Folded, ReadAfterLd]>;
} // ExeDomain = SSEPackedSingle, hasSideEffects = 0

let ExeDomain = SSEPackedDouble, hasSideEffects = 0 in {
  def SDr : SS4AIi8<opcsd, MRMSrcReg,
                    (outs FR64:$dst), (ins FR64:$src1, i32u8imm:$src2),
                    !strconcat(OpcodeStr,
                               "sd\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
                    []>, Sched<[sched]>;

  let mayLoad = 1 in
  def SDm : SS4AIi8<opcsd, MRMSrcMem,
                    (outs FR64:$dst), (ins f64mem:$src1, i32u8imm:$src2),
                    !strconcat(OpcodeStr,
                               "sd\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
                    []>, Sched<[sched.Folded, ReadAfterLd]>;
} // ExeDomain = SSEPackedDouble, hasSideEffects = 0
}

multiclass sse41_fp_binop_s<bits<8> opcss, bits<8> opcsd,
                            string OpcodeStr, X86FoldableSchedWrite sched,
                            ValueType VT32, ValueType VT64,
                            SDNode OpNode, bit Is2Addr = 1> {
let ExeDomain = SSEPackedSingle, isCodeGenOnly = 1 in {
  def SSr_Int : SS4AIi8<opcss, MRMSrcReg,
        (outs VR128:$dst), (ins VR128:$src1, VR128:$src2, i32u8imm:$src3),
        !if(Is2Addr,
            !strconcat(OpcodeStr,
                "ss\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
            !strconcat(OpcodeStr,
                "ss\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
        [(set VR128:$dst, (VT32 (OpNode VR128:$src1, VR128:$src2, imm:$src3)))]>,
        Sched<[sched]>;

  def SSm_Int : SS4AIi8<opcss, MRMSrcMem,
        (outs VR128:$dst), (ins VR128:$src1, ssmem:$src2, i32u8imm:$src3),
        !if(Is2Addr,
            !strconcat(OpcodeStr,
                "ss\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
            !strconcat(OpcodeStr,
                "ss\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
        [(set VR128:$dst,
             (OpNode VR128:$src1, sse_load_f32:$src2, imm:$src3))]>,
        Sched<[sched.Folded, ReadAfterLd]>;
} // ExeDomain = SSEPackedSingle, isCodeGenOnly = 1

let ExeDomain = SSEPackedDouble, isCodeGenOnly = 1 in {
  def SDr_Int : SS4AIi8<opcsd, MRMSrcReg,
        (outs VR128:$dst), (ins VR128:$src1, VR128:$src2, i32u8imm:$src3),
        !if(Is2Addr,
            !strconcat(OpcodeStr,
                "sd\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
            !strconcat(OpcodeStr,
                "sd\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
        [(set VR128:$dst, (VT64 (OpNode VR128:$src1, VR128:$src2, imm:$src3)))]>,
        Sched<[sched]>;

  def SDm_Int : SS4AIi8<opcsd, MRMSrcMem,
        (outs VR128:$dst), (ins VR128:$src1, sdmem:$src2, i32u8imm:$src3),
        !if(Is2Addr,
            !strconcat(OpcodeStr,
                "sd\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
            !strconcat(OpcodeStr,
                "sd\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
        [(set VR128:$dst,
              (OpNode VR128:$src1, sse_load_f64:$src2, imm:$src3))]>,
        Sched<[sched.Folded, ReadAfterLd]>;
} // ExeDomain = SSEPackedDouble, isCodeGenOnly = 1
}

// FP round - roundss, roundps, roundsd, roundpd
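// A note on the rounding-control immediates used below (per the Intel SDM):
// imm[1:0] picks the rounding mode (00 = nearest, 01 = floor, 10 = ceil,
// 11 = truncate), imm[2] = 1 means use MXCSR.RC instead, and imm[3] = 1
// suppresses the precision (inexact) exception. Hence 0x9 = floor,
// 0xA = ceil, 0xB = trunc, 0xC = nearbyint (current mode, inexact
// suppressed) and 0x4 = rint (current mode, inexact allowed).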
let Predicates = [HasAVX, NoVLX] in {
  let ExeDomain = SSEPackedSingle in {
    // Intrinsic form
    defm VROUNDPS  : sse41_fp_unop_p<0x08, "vroundps", f128mem, VR128, v4f32,
                                     loadv4f32, X86VRndScale, SchedWriteFRnd.XMM>,
                                   VEX, VEX_WIG;
    defm VROUNDPSY : sse41_fp_unop_p<0x08, "vroundps", f256mem, VR256, v8f32,
                                     loadv8f32, X86VRndScale, SchedWriteFRnd.YMM>,
                                   VEX, VEX_L, VEX_WIG;
  }

  let ExeDomain = SSEPackedDouble in {
    defm VROUNDPD  : sse41_fp_unop_p<0x09, "vroundpd", f128mem, VR128, v2f64,
                                     loadv2f64, X86VRndScale, SchedWriteFRnd.XMM>,
                                   VEX, VEX_WIG;
    defm VROUNDPDY : sse41_fp_unop_p<0x09, "vroundpd", f256mem, VR256, v4f64,
                                     loadv4f64, X86VRndScale, SchedWriteFRnd.YMM>,
                                   VEX, VEX_L, VEX_WIG;
  }
}
let Predicates = [HasAVX, NoAVX512] in {
  defm VROUND  : sse41_fp_binop_s<0x0A, 0x0B, "vround", SchedWriteFRnd.Scl,
                                  v4f32, v2f64, X86RndScales, 0>,
                                  VEX_4V, VEX_LIG, VEX_WIG;
  defm VROUND  : avx_fp_unop_rm<0x0A, 0x0B, "vround", SchedWriteFRnd.Scl>,
                                VEX_4V, VEX_LIG, VEX_WIG;
}

let Predicates = [UseAVX] in {
  def : Pat<(ffloor FR32:$src),
            (VROUNDSSr (f32 (IMPLICIT_DEF)), FR32:$src, (i32 0x9))>;
  def : Pat<(f32 (fnearbyint FR32:$src)),
            (VROUNDSSr (f32 (IMPLICIT_DEF)), FR32:$src, (i32 0xC))>;
  def : Pat<(f32 (fceil FR32:$src)),
            (VROUNDSSr (f32 (IMPLICIT_DEF)), FR32:$src, (i32 0xA))>;
  def : Pat<(f32 (frint FR32:$src)),
            (VROUNDSSr (f32 (IMPLICIT_DEF)), FR32:$src, (i32 0x4))>;
  def : Pat<(f32 (ftrunc FR32:$src)),
            (VROUNDSSr (f32 (IMPLICIT_DEF)), FR32:$src, (i32 0xB))>;

  def : Pat<(f64 (ffloor FR64:$src)),
            (VROUNDSDr (f64 (IMPLICIT_DEF)), FR64:$src, (i32 0x9))>;
  def : Pat<(f64 (fnearbyint FR64:$src)),
            (VROUNDSDr (f64 (IMPLICIT_DEF)), FR64:$src, (i32 0xC))>;
  def : Pat<(f64 (fceil FR64:$src)),
            (VROUNDSDr (f64 (IMPLICIT_DEF)), FR64:$src, (i32 0xA))>;
  def : Pat<(f64 (frint FR64:$src)),
            (VROUNDSDr (f64 (IMPLICIT_DEF)), FR64:$src, (i32 0x4))>;
  def : Pat<(f64 (ftrunc FR64:$src)),
            (VROUNDSDr (f64 (IMPLICIT_DEF)), FR64:$src, (i32 0xB))>;
}

let Predicates = [UseAVX, OptForSize] in {
  def : Pat<(ffloor (loadf32 addr:$src)),
            (VROUNDSSm (f32 (IMPLICIT_DEF)), addr:$src, (i32 0x9))>;
  def : Pat<(f32 (fnearbyint (loadf32 addr:$src))),
            (VROUNDSSm (f32 (IMPLICIT_DEF)), addr:$src, (i32 0xC))>;
  def : Pat<(f32 (fceil (loadf32 addr:$src))),
            (VROUNDSSm (f32 (IMPLICIT_DEF)), addr:$src, (i32 0xA))>;
  def : Pat<(f32 (frint (loadf32 addr:$src))),
            (VROUNDSSm (f32 (IMPLICIT_DEF)), addr:$src, (i32 0x4))>;
  def : Pat<(f32 (ftrunc (loadf32 addr:$src))),
            (VROUNDSSm (f32 (IMPLICIT_DEF)), addr:$src, (i32 0xB))>;

  def : Pat<(f64 (ffloor (loadf64 addr:$src))),
            (VROUNDSDm (f64 (IMPLICIT_DEF)), addr:$src, (i32 0x9))>;
  def : Pat<(f64 (fnearbyint (loadf64 addr:$src))),
            (VROUNDSDm (f64 (IMPLICIT_DEF)), addr:$src, (i32 0xC))>;
  def : Pat<(f64 (fceil (loadf64 addr:$src))),
            (VROUNDSDm (f64 (IMPLICIT_DEF)), addr:$src, (i32 0xA))>;
  def : Pat<(f64 (frint (loadf64 addr:$src))),
            (VROUNDSDm (f64 (IMPLICIT_DEF)), addr:$src, (i32 0x4))>;
  def : Pat<(f64 (ftrunc (loadf64 addr:$src))),
            (VROUNDSDm (f64 (IMPLICIT_DEF)), addr:$src, (i32 0xB))>;
}

let Predicates = [HasAVX, NoVLX] in {
  def : Pat<(v4f32 (ffloor VR128:$src)),
            (VROUNDPSr VR128:$src, (i32 0x9))>;
  def : Pat<(v4f32 (fnearbyint VR128:$src)),
            (VROUNDPSr VR128:$src, (i32 0xC))>;
  def : Pat<(v4f32 (fceil VR128:$src)),
            (VROUNDPSr VR128:$src, (i32 0xA))>;
  def : Pat<(v4f32 (frint VR128:$src)),
            (VROUNDPSr VR128:$src, (i32 0x4))>;
  def : Pat<(v4f32 (ftrunc VR128:$src)),
            (VROUNDPSr VR128:$src, (i32 0xB))>;

  def : Pat<(v4f32 (ffloor (loadv4f32 addr:$src))),
            (VROUNDPSm addr:$src, (i32 0x9))>;
  def : Pat<(v4f32 (fnearbyint (loadv4f32 addr:$src))),
            (VROUNDPSm addr:$src, (i32 0xC))>;
  def : Pat<(v4f32 (fceil (loadv4f32 addr:$src))),
            (VROUNDPSm addr:$src, (i32 0xA))>;
  def : Pat<(v4f32 (frint (loadv4f32 addr:$src))),
            (VROUNDPSm addr:$src, (i32 0x4))>;
  def : Pat<(v4f32 (ftrunc (loadv4f32 addr:$src))),
            (VROUNDPSm addr:$src, (i32 0xB))>;

  def : Pat<(v2f64 (ffloor VR128:$src)),
            (VROUNDPDr VR128:$src, (i32 0x9))>;
  def : Pat<(v2f64 (fnearbyint VR128:$src)),
            (VROUNDPDr VR128:$src, (i32 0xC))>;
  def : Pat<(v2f64 (fceil VR128:$src)),
            (VROUNDPDr VR128:$src, (i32 0xA))>;
  def : Pat<(v2f64 (frint VR128:$src)),
            (VROUNDPDr VR128:$src, (i32 0x4))>;
  def : Pat<(v2f64 (ftrunc VR128:$src)),
            (VROUNDPDr VR128:$src, (i32 0xB))>;

  def : Pat<(v2f64 (ffloor (loadv2f64 addr:$src))),
            (VROUNDPDm addr:$src, (i32 0x9))>;
  def : Pat<(v2f64 (fnearbyint (loadv2f64 addr:$src))),
            (VROUNDPDm addr:$src, (i32 0xC))>;
  def : Pat<(v2f64 (fceil (loadv2f64 addr:$src))),
            (VROUNDPDm addr:$src, (i32 0xA))>;
  def : Pat<(v2f64 (frint (loadv2f64 addr:$src))),
            (VROUNDPDm addr:$src, (i32 0x4))>;
  def : Pat<(v2f64 (ftrunc (loadv2f64 addr:$src))),
            (VROUNDPDm addr:$src, (i32 0xB))>;

  def : Pat<(v8f32 (ffloor VR256:$src)),
            (VROUNDPSYr VR256:$src, (i32 0x9))>;
  def : Pat<(v8f32 (fnearbyint VR256:$src)),
            (VROUNDPSYr VR256:$src, (i32 0xC))>;
  def : Pat<(v8f32 (fceil VR256:$src)),
            (VROUNDPSYr VR256:$src, (i32 0xA))>;
  def : Pat<(v8f32 (frint VR256:$src)),
            (VROUNDPSYr VR256:$src, (i32 0x4))>;
  def : Pat<(v8f32 (ftrunc VR256:$src)),
            (VROUNDPSYr VR256:$src, (i32 0xB))>;

  def : Pat<(v8f32 (ffloor (loadv8f32 addr:$src))),
            (VROUNDPSYm addr:$src, (i32 0x9))>;
  def : Pat<(v8f32 (fnearbyint (loadv8f32 addr:$src))),
            (VROUNDPSYm addr:$src, (i32 0xC))>;
  def : Pat<(v8f32 (fceil (loadv8f32 addr:$src))),
            (VROUNDPSYm addr:$src, (i32 0xA))>;
  def : Pat<(v8f32 (frint (loadv8f32 addr:$src))),
            (VROUNDPSYm addr:$src, (i32 0x4))>;
  def : Pat<(v8f32 (ftrunc (loadv8f32 addr:$src))),
            (VROUNDPSYm addr:$src, (i32 0xB))>;

  def : Pat<(v4f64 (ffloor VR256:$src)),
            (VROUNDPDYr VR256:$src, (i32 0x9))>;
  def : Pat<(v4f64 (fnearbyint VR256:$src)),
            (VROUNDPDYr VR256:$src, (i32 0xC))>;
  def : Pat<(v4f64 (fceil VR256:$src)),
            (VROUNDPDYr VR256:$src, (i32 0xA))>;
  def : Pat<(v4f64 (frint VR256:$src)),
            (VROUNDPDYr VR256:$src, (i32 0x4))>;
  def : Pat<(v4f64 (ftrunc VR256:$src)),
            (VROUNDPDYr VR256:$src, (i32 0xB))>;

  def : Pat<(v4f64 (ffloor (loadv4f64 addr:$src))),
            (VROUNDPDYm addr:$src, (i32 0x9))>;
  def : Pat<(v4f64 (fnearbyint (loadv4f64 addr:$src))),
            (VROUNDPDYm addr:$src, (i32 0xC))>;
  def : Pat<(v4f64 (fceil (loadv4f64 addr:$src))),
            (VROUNDPDYm addr:$src, (i32 0xA))>;
  def : Pat<(v4f64 (frint (loadv4f64 addr:$src))),
            (VROUNDPDYm addr:$src, (i32 0x4))>;
  def : Pat<(v4f64 (ftrunc (loadv4f64 addr:$src))),
            (VROUNDPDYm addr:$src, (i32 0xB))>;
}

let ExeDomain = SSEPackedSingle in
defm ROUNDPS  : sse41_fp_unop_p<0x08, "roundps", f128mem, VR128, v4f32,
                                memopv4f32, X86VRndScale, SchedWriteFRnd.XMM>;
let ExeDomain = SSEPackedDouble in
defm ROUNDPD  : sse41_fp_unop_p<0x09, "roundpd", f128mem, VR128, v2f64,
                                memopv2f64, X86VRndScale, SchedWriteFRnd.XMM>;

defm ROUND  : sse41_fp_unop_s<0x0A, 0x0B, "round", SchedWriteFRnd.Scl>;

let Constraints = "$src1 = $dst" in
defm ROUND  : sse41_fp_binop_s<0x0A, 0x0B, "round", SchedWriteFRnd.Scl,
                               v4f32, v2f64, X86RndScales>;

let Predicates = [UseSSE41] in {
  def : Pat<(ffloor FR32:$src),
            (ROUNDSSr FR32:$src, (i32 0x9))>;
  def : Pat<(f32 (fnearbyint FR32:$src)),
            (ROUNDSSr FR32:$src, (i32 0xC))>;
  def : Pat<(f32 (fceil FR32:$src)),
            (ROUNDSSr FR32:$src, (i32 0xA))>;
  def : Pat<(f32 (frint FR32:$src)),
            (ROUNDSSr FR32:$src, (i32 0x4))>;
  def : Pat<(f32 (ftrunc FR32:$src)),
            (ROUNDSSr FR32:$src, (i32 0xB))>;

  def : Pat<(f64 (ffloor FR64:$src)),
            (ROUNDSDr FR64:$src, (i32 0x9))>;
  def : Pat<(f64 (fnearbyint FR64:$src)),
            (ROUNDSDr FR64:$src, (i32 0xC))>;
  def : Pat<(f64 (fceil FR64:$src)),
            (ROUNDSDr FR64:$src, (i32 0xA))>;
  def : Pat<(f64 (frint FR64:$src)),
            (ROUNDSDr FR64:$src, (i32 0x4))>;
  def : Pat<(f64 (ftrunc FR64:$src)),
            (ROUNDSDr FR64:$src, (i32 0xB))>;
}

let Predicates = [UseSSE41, OptForSize] in {
  def : Pat<(ffloor (loadf32 addr:$src)),
            (ROUNDSSm addr:$src, (i32 0x9))>;
  def : Pat<(f32 (fnearbyint (loadf32 addr:$src))),
            (ROUNDSSm addr:$src, (i32 0xC))>;
  def : Pat<(f32 (fceil (loadf32 addr:$src))),
            (ROUNDSSm addr:$src, (i32 0xA))>;
  def : Pat<(f32 (frint (loadf32 addr:$src))),
            (ROUNDSSm addr:$src, (i32 0x4))>;
  def : Pat<(f32 (ftrunc (loadf32 addr:$src))),
            (ROUNDSSm addr:$src, (i32 0xB))>;

  def : Pat<(f64 (ffloor (loadf64 addr:$src))),
            (ROUNDSDm addr:$src, (i32 0x9))>;
  def : Pat<(f64 (fnearbyint (loadf64 addr:$src))),
            (ROUNDSDm addr:$src, (i32 0xC))>;
  def : Pat<(f64 (fceil (loadf64 addr:$src))),
            (ROUNDSDm addr:$src, (i32 0xA))>;
  def : Pat<(f64 (frint (loadf64 addr:$src))),
            (ROUNDSDm addr:$src, (i32 0x4))>;
  def : Pat<(f64 (ftrunc (loadf64 addr:$src))),
            (ROUNDSDm addr:$src, (i32 0xB))>;
}

let Predicates = [UseSSE41] in {
  def : Pat<(v4f32 (ffloor VR128:$src)),
            (ROUNDPSr VR128:$src, (i32 0x9))>;
  def : Pat<(v4f32 (fnearbyint VR128:$src)),
            (ROUNDPSr VR128:$src, (i32 0xC))>;
  def : Pat<(v4f32 (fceil VR128:$src)),
            (ROUNDPSr VR128:$src, (i32 0xA))>;
  def : Pat<(v4f32 (frint VR128:$src)),
            (ROUNDPSr VR128:$src, (i32 0x4))>;
  def : Pat<(v4f32 (ftrunc VR128:$src)),
            (ROUNDPSr VR128:$src, (i32 0xB))>;

  def : Pat<(v4f32 (ffloor (memopv4f32 addr:$src))),
            (ROUNDPSm addr:$src, (i32 0x9))>;
  def : Pat<(v4f32 (fnearbyint (memopv4f32 addr:$src))),
            (ROUNDPSm addr:$src, (i32 0xC))>;
  def : Pat<(v4f32 (fceil (memopv4f32 addr:$src))),
            (ROUNDPSm addr:$src, (i32 0xA))>;
  def : Pat<(v4f32 (frint (memopv4f32 addr:$src))),
            (ROUNDPSm addr:$src, (i32 0x4))>;
  def : Pat<(v4f32 (ftrunc (memopv4f32 addr:$src))),
            (ROUNDPSm addr:$src, (i32 0xB))>;

  def : Pat<(v2f64 (ffloor VR128:$src)),
            (ROUNDPDr VR128:$src, (i32 0x9))>;
  def : Pat<(v2f64 (fnearbyint VR128:$src)),
            (ROUNDPDr VR128:$src, (i32 0xC))>;
  def : Pat<(v2f64 (fceil VR128:$src)),
            (ROUNDPDr VR128:$src, (i32 0xA))>;
  def : Pat<(v2f64 (frint VR128:$src)),
            (ROUNDPDr VR128:$src, (i32 0x4))>;
  def : Pat<(v2f64 (ftrunc VR128:$src)),
            (ROUNDPDr VR128:$src, (i32 0xB))>;

  def : Pat<(v2f64 (ffloor (memopv2f64 addr:$src))),
            (ROUNDPDm addr:$src, (i32 0x9))>;
  def : Pat<(v2f64 (fnearbyint (memopv2f64 addr:$src))),
            (ROUNDPDm addr:$src, (i32 0xC))>;
  def : Pat<(v2f64 (fceil (memopv2f64 addr:$src))),
            (ROUNDPDm addr:$src, (i32 0xA))>;
  def : Pat<(v2f64 (frint (memopv2f64 addr:$src))),
            (ROUNDPDm addr:$src, (i32 0x4))>;
  def : Pat<(v2f64 (ftrunc (memopv2f64 addr:$src))),
            (ROUNDPDm addr:$src, (i32 0xB))>;
}

defm : scalar_unary_math_imm_patterns<ffloor, "ROUNDSS", X86Movss,
                                      v4f32, 0x01, UseSSE41>;
defm : scalar_unary_math_imm_patterns<fceil, "ROUNDSS", X86Movss,
                                      v4f32, 0x02, UseSSE41>;
defm : scalar_unary_math_imm_patterns<ffloor, "ROUNDSD", X86Movsd,
                                      v2f64, 0x01, UseSSE41>;
defm : scalar_unary_math_imm_patterns<fceil, "ROUNDSD", X86Movsd,
                                      v2f64, 0x02, UseSSE41>;

//===----------------------------------------------------------------------===//
// SSE4.1 - Packed Bit Test
//===----------------------------------------------------------------------===//

// We lower to PTEST in X86ISelLowering, primarily from the Intel intrinsic
// that corresponds to it.
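// As a sketch of that lowering: _mm_testz_si128(a, b) becomes a PTEST of the
// two operands followed by a SETE of ZF, since PTEST sets ZF when
// (a & b) == 0 and CF when (b & ~a) == 0.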
let Defs = [EFLAGS], Predicates = [HasAVX] in {
def VPTESTrr  : SS48I<0x17, MRMSrcReg, (outs), (ins VR128:$src1, VR128:$src2),
                "vptest\t{$src2, $src1|$src1, $src2}",
                [(set EFLAGS, (X86ptest VR128:$src1, (v2i64 VR128:$src2)))]>,
                Sched<[SchedWriteVecTest.XMM]>, VEX, VEX_WIG;
def VPTESTrm  : SS48I<0x17, MRMSrcMem, (outs), (ins VR128:$src1, f128mem:$src2),
                "vptest\t{$src2, $src1|$src1, $src2}",
                [(set EFLAGS,(X86ptest VR128:$src1, (loadv2i64 addr:$src2)))]>,
                Sched<[SchedWriteVecTest.XMM.Folded, ReadAfterLd]>,
                VEX, VEX_WIG;

def VPTESTYrr : SS48I<0x17, MRMSrcReg, (outs), (ins VR256:$src1, VR256:$src2),
                "vptest\t{$src2, $src1|$src1, $src2}",
                [(set EFLAGS, (X86ptest VR256:$src1, (v4i64 VR256:$src2)))]>,
                Sched<[SchedWriteVecTest.YMM]>, VEX, VEX_L, VEX_WIG;
def VPTESTYrm : SS48I<0x17, MRMSrcMem, (outs), (ins VR256:$src1, i256mem:$src2),
                "vptest\t{$src2, $src1|$src1, $src2}",
                [(set EFLAGS,(X86ptest VR256:$src1, (loadv4i64 addr:$src2)))]>,
                Sched<[SchedWriteVecTest.YMM.Folded, ReadAfterLd]>,
                VEX, VEX_L, VEX_WIG;
}

let Defs = [EFLAGS] in {
def PTESTrr : SS48I<0x17, MRMSrcReg, (outs), (ins VR128:$src1, VR128:$src2),
              "ptest\t{$src2, $src1|$src1, $src2}",
              [(set EFLAGS, (X86ptest VR128:$src1, (v2i64 VR128:$src2)))]>,
              Sched<[SchedWriteVecTest.XMM]>;
def PTESTrm : SS48I<0x17, MRMSrcMem, (outs), (ins VR128:$src1, f128mem:$src2),
              "ptest\t{$src2, $src1|$src1, $src2}",
              [(set EFLAGS, (X86ptest VR128:$src1, (memopv2i64 addr:$src2)))]>,
              Sched<[SchedWriteVecTest.XMM.Folded, ReadAfterLd]>;
}

// The bit test instructions below are AVX-only.
multiclass avx_bittest<bits<8> opc, string OpcodeStr, RegisterClass RC,
                       X86MemOperand x86memop, PatFrag mem_frag, ValueType vt,
                       X86FoldableSchedWrite sched> {
  def rr : SS48I<opc, MRMSrcReg, (outs), (ins RC:$src1, RC:$src2),
            !strconcat(OpcodeStr, "\t{$src2, $src1|$src1, $src2}"),
            [(set EFLAGS, (X86testp RC:$src1, (vt RC:$src2)))]>,
            Sched<[sched]>, VEX;
  def rm : SS48I<opc, MRMSrcMem, (outs), (ins RC:$src1, x86memop:$src2),
            !strconcat(OpcodeStr, "\t{$src2, $src1|$src1, $src2}"),
            [(set EFLAGS, (X86testp RC:$src1, (mem_frag addr:$src2)))]>,
            Sched<[sched.Folded, ReadAfterLd]>, VEX;
}

let Defs = [EFLAGS], Predicates = [HasAVX] in {
let ExeDomain = SSEPackedSingle in {
defm VTESTPS  : avx_bittest<0x0E, "vtestps", VR128, f128mem, loadv4f32, v4f32,
                            SchedWriteFTest.XMM>;
defm VTESTPSY : avx_bittest<0x0E, "vtestps", VR256, f256mem, loadv8f32, v8f32,
                            SchedWriteFTest.YMM>, VEX_L;
}
let ExeDomain = SSEPackedDouble in {
defm VTESTPD  : avx_bittest<0x0F, "vtestpd", VR128, f128mem, loadv2f64, v2f64,
                            SchedWriteFTest.XMM>;
defm VTESTPDY : avx_bittest<0x0F, "vtestpd", VR256, f256mem, loadv4f64, v4f64,
                            SchedWriteFTest.YMM>, VEX_L;
}
}

//===----------------------------------------------------------------------===//
// SSE4.1 - Misc Instructions
//===----------------------------------------------------------------------===//

let Defs = [EFLAGS], Predicates = [HasPOPCNT] in {
  def POPCNT16rr : I<0xB8, MRMSrcReg, (outs GR16:$dst), (ins GR16:$src),
                     "popcnt{w}\t{$src, $dst|$dst, $src}",
                     [(set GR16:$dst, (ctpop GR16:$src)), (implicit EFLAGS)]>,
                     Sched<[WritePOPCNT]>, OpSize16, XS;
  def POPCNT16rm : I<0xB8, MRMSrcMem, (outs GR16:$dst), (ins i16mem:$src),
                     "popcnt{w}\t{$src, $dst|$dst, $src}",
                     [(set GR16:$dst, (ctpop (loadi16 addr:$src))),
                      (implicit EFLAGS)]>,
                      Sched<[WritePOPCNT.Folded]>, OpSize16, XS;

  def POPCNT32rr : I<0xB8, MRMSrcReg, (outs GR32:$dst), (ins GR32:$src),
                     "popcnt{l}\t{$src, $dst|$dst, $src}",
                     [(set GR32:$dst, (ctpop GR32:$src)), (implicit EFLAGS)]>,
                     Sched<[WritePOPCNT]>, OpSize32, XS;

  def POPCNT32rm : I<0xB8, MRMSrcMem, (outs GR32:$dst), (ins i32mem:$src),
                     "popcnt{l}\t{$src, $dst|$dst, $src}",
                     [(set GR32:$dst, (ctpop (loadi32 addr:$src))),
                      (implicit EFLAGS)]>,
                      Sched<[WritePOPCNT.Folded]>, OpSize32, XS;

  def POPCNT64rr : RI<0xB8, MRMSrcReg, (outs GR64:$dst), (ins GR64:$src),
                      "popcnt{q}\t{$src, $dst|$dst, $src}",
                      [(set GR64:$dst, (ctpop GR64:$src)), (implicit EFLAGS)]>,
                      Sched<[WritePOPCNT]>, XS;
  def POPCNT64rm : RI<0xB8, MRMSrcMem, (outs GR64:$dst), (ins i64mem:$src),
                      "popcnt{q}\t{$src, $dst|$dst, $src}",
                      [(set GR64:$dst, (ctpop (loadi64 addr:$src))),
                       (implicit EFLAGS)]>,
                       Sched<[WritePOPCNT.Folded]>, XS;
}

// SS41I_unop_rm_int_v16 - SSE 4.1 unary operator on 16-bit elements (v8i16).
multiclass SS41I_unop_rm_int_v16<bits<8> opc, string OpcodeStr,
                                 SDNode OpNode, PatFrag ld_frag,
                                 X86FoldableSchedWrite Sched> {
  def rr : SS48I<opc, MRMSrcReg, (outs VR128:$dst),
                 (ins VR128:$src),
                 !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
                 [(set VR128:$dst, (v8i16 (OpNode (v8i16 VR128:$src))))]>,
                 Sched<[Sched]>;
  def rm : SS48I<opc, MRMSrcMem, (outs VR128:$dst),
                  (ins i128mem:$src),
                  !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
                  [(set VR128:$dst,
                    (v8i16 (OpNode (v8i16 (bitconvert (ld_frag addr:$src))))))]>,
                 Sched<[Sched.Folded]>;
}

// PHMIN has the same profile as PSAD, thus we use the same scheduling
// model, although the naming is misleading.
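// PHMINPOSUW itself reduces the eight unsigned words of the source: the
// minimum value is written to word 0 of the destination, its index to bits
// 18:16, and the remaining bits are zeroed.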
let Predicates = [HasAVX] in
defm VPHMINPOSUW : SS41I_unop_rm_int_v16<0x41, "vphminposuw",
                                         X86phminpos, loadv2i64,
                                         WritePHMINPOS>, VEX, VEX_WIG;
defm PHMINPOSUW : SS41I_unop_rm_int_v16<0x41, "phminposuw",
                                         X86phminpos, memopv2i64,
                                         WritePHMINPOS>;

/// SS48I_binop_rm - Simple SSE41 binary operator.
multiclass SS48I_binop_rm<bits<8> opc, string OpcodeStr, SDNode OpNode,
                          ValueType OpVT, RegisterClass RC, PatFrag memop_frag,
                          X86MemOperand x86memop, X86FoldableSchedWrite sched,
                          bit Is2Addr = 1> {
  let isCommutable = 1 in
  def rr : SS48I<opc, MRMSrcReg, (outs RC:$dst),
       (ins RC:$src1, RC:$src2),
       !if(Is2Addr,
           !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
           !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
       [(set RC:$dst, (OpVT (OpNode RC:$src1, RC:$src2)))]>,
       Sched<[sched]>;
  def rm : SS48I<opc, MRMSrcMem, (outs RC:$dst),
       (ins RC:$src1, x86memop:$src2),
       !if(Is2Addr,
           !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
           !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
       [(set RC:$dst,
         (OpVT (OpNode RC:$src1, (bitconvert (memop_frag addr:$src2)))))]>,
       Sched<[sched.Folded, ReadAfterLd]>;
}

let Predicates = [HasAVX, NoVLX] in {
  defm VPMINSD   : SS48I_binop_rm<0x39, "vpminsd", smin, v4i32, VR128,
                                  loadv2i64, i128mem, SchedWriteVecALU.XMM, 0>,
                                  VEX_4V, VEX_WIG;
  defm VPMINUD   : SS48I_binop_rm<0x3B, "vpminud", umin, v4i32, VR128,
                                  loadv2i64, i128mem, SchedWriteVecALU.XMM, 0>,
                                  VEX_4V, VEX_WIG;
  defm VPMAXSD   : SS48I_binop_rm<0x3D, "vpmaxsd", smax, v4i32, VR128,
                                  loadv2i64, i128mem, SchedWriteVecALU.XMM, 0>,
                                  VEX_4V, VEX_WIG;
  defm VPMAXUD   : SS48I_binop_rm<0x3F, "vpmaxud", umax, v4i32, VR128,
                                  loadv2i64, i128mem, SchedWriteVecALU.XMM, 0>,
                                  VEX_4V, VEX_WIG;
  defm VPMULDQ   : SS48I_binop_rm<0x28, "vpmuldq", X86pmuldq, v2i64, VR128,
                                  loadv2i64, i128mem, SchedWriteVecIMul.XMM, 0>,
                                  VEX_4V, VEX_WIG;
}
let Predicates = [HasAVX, NoVLX_Or_NoBWI] in {
  defm VPMINSB   : SS48I_binop_rm<0x38, "vpminsb", smin, v16i8, VR128,
                                  loadv2i64, i128mem, SchedWriteVecALU.XMM, 0>,
                                  VEX_4V, VEX_WIG;
  defm VPMINUW   : SS48I_binop_rm<0x3A, "vpminuw", umin, v8i16, VR128,
                                  loadv2i64, i128mem, SchedWriteVecALU.XMM, 0>,
                                  VEX_4V, VEX_WIG;
  defm VPMAXSB   : SS48I_binop_rm<0x3C, "vpmaxsb", smax, v16i8, VR128,
                                  loadv2i64, i128mem, SchedWriteVecALU.XMM, 0>,
                                  VEX_4V, VEX_WIG;
  defm VPMAXUW   : SS48I_binop_rm<0x3E, "vpmaxuw", umax, v8i16, VR128,
                                  loadv2i64, i128mem, SchedWriteVecALU.XMM, 0>,
                                  VEX_4V, VEX_WIG;
}

let Predicates = [HasAVX2, NoVLX] in {
  defm VPMINSDY  : SS48I_binop_rm<0x39, "vpminsd", smin, v8i32, VR256,
                                  loadv4i64, i256mem, SchedWriteVecALU.YMM, 0>,
                                  VEX_4V, VEX_L, VEX_WIG;
  defm VPMINUDY  : SS48I_binop_rm<0x3B, "vpminud", umin, v8i32, VR256,
                                  loadv4i64, i256mem, SchedWriteVecALU.YMM, 0>,
                                  VEX_4V, VEX_L, VEX_WIG;
  defm VPMAXSDY  : SS48I_binop_rm<0x3D, "vpmaxsd", smax, v8i32, VR256,
                                  loadv4i64, i256mem, SchedWriteVecALU.YMM, 0>,
                                  VEX_4V, VEX_L, VEX_WIG;
  defm VPMAXUDY  : SS48I_binop_rm<0x3F, "vpmaxud", umax, v8i32, VR256,
                                  loadv4i64, i256mem, SchedWriteVecALU.YMM, 0>,
                                  VEX_4V, VEX_L, VEX_WIG;
  defm VPMULDQY  : SS48I_binop_rm<0x28, "vpmuldq", X86pmuldq, v4i64, VR256,
                                  loadv4i64, i256mem, SchedWriteVecIMul.YMM, 0>,
                                  VEX_4V, VEX_L, VEX_WIG;
}
let Predicates = [HasAVX2, NoVLX_Or_NoBWI] in {
  defm VPMINSBY  : SS48I_binop_rm<0x38, "vpminsb", smin, v32i8, VR256,
                                  loadv4i64, i256mem, SchedWriteVecALU.YMM, 0>,
                                  VEX_4V, VEX_L, VEX_WIG;
  defm VPMINUWY  : SS48I_binop_rm<0x3A, "vpminuw", umin, v16i16, VR256,
                                  loadv4i64, i256mem, SchedWriteVecALU.YMM, 0>,
                                  VEX_4V, VEX_L, VEX_WIG;
  defm VPMAXSBY  : SS48I_binop_rm<0x3C, "vpmaxsb", smax, v32i8, VR256,
                                  loadv4i64, i256mem, SchedWriteVecALU.YMM, 0>,
                                  VEX_4V, VEX_L, VEX_WIG;
  defm VPMAXUWY  : SS48I_binop_rm<0x3E, "vpmaxuw", umax, v16i16, VR256,
                                  loadv4i64, i256mem, SchedWriteVecALU.YMM, 0>,
                                  VEX_4V, VEX_L, VEX_WIG;
}

let Constraints = "$src1 = $dst" in {
  defm PMINSB   : SS48I_binop_rm<0x38, "pminsb", smin, v16i8, VR128,
                                 memopv2i64, i128mem, SchedWriteVecALU.XMM, 1>;
  defm PMINSD   : SS48I_binop_rm<0x39, "pminsd", smin, v4i32, VR128,
                                 memopv2i64, i128mem, SchedWriteVecALU.XMM, 1>;
  defm PMINUD   : SS48I_binop_rm<0x3B, "pminud", umin, v4i32, VR128,
                                 memopv2i64, i128mem, SchedWriteVecALU.XMM, 1>;
  defm PMINUW   : SS48I_binop_rm<0x3A, "pminuw", umin, v8i16, VR128,
                                 memopv2i64, i128mem, SchedWriteVecALU.XMM, 1>;
  defm PMAXSB   : SS48I_binop_rm<0x3C, "pmaxsb", smax, v16i8, VR128,
                                 memopv2i64, i128mem, SchedWriteVecALU.XMM, 1>;
  defm PMAXSD   : SS48I_binop_rm<0x3D, "pmaxsd", smax, v4i32, VR128,
                                 memopv2i64, i128mem, SchedWriteVecALU.XMM, 1>;
  defm PMAXUD   : SS48I_binop_rm<0x3F, "pmaxud", umax, v4i32, VR128,
                                 memopv2i64, i128mem, SchedWriteVecALU.XMM, 1>;
  defm PMAXUW   : SS48I_binop_rm<0x3E, "pmaxuw", umax, v8i16, VR128,
                                 memopv2i64, i128mem, SchedWriteVecALU.XMM, 1>;
  defm PMULDQ   : SS48I_binop_rm<0x28, "pmuldq", X86pmuldq, v2i64, VR128,
                                 memopv2i64, i128mem, SchedWriteVecIMul.XMM, 1>;
}

let Predicates = [HasAVX, NoVLX] in
  defm VPMULLD  : SS48I_binop_rm<0x40, "vpmulld", mul, v4i32, VR128,
                                 loadv2i64, i128mem, SchedWritePMULLD.XMM, 0>,
                                 VEX_4V, VEX_WIG;
let Predicates = [HasAVX] in
  defm VPCMPEQQ : SS48I_binop_rm<0x29, "vpcmpeqq", X86pcmpeq, v2i64, VR128,
                                 loadv2i64, i128mem, SchedWriteVecALU.XMM, 0>,
                                 VEX_4V, VEX_WIG;

let Predicates = [HasAVX2, NoVLX] in
  defm VPMULLDY  : SS48I_binop_rm<0x40, "vpmulld", mul, v8i32, VR256,
                                  loadv4i64, i256mem, SchedWritePMULLD.YMM, 0>,
                                  VEX_4V, VEX_L, VEX_WIG;
let Predicates = [HasAVX2] in
  defm VPCMPEQQY : SS48I_binop_rm<0x29, "vpcmpeqq", X86pcmpeq, v4i64, VR256,
                                  loadv4i64, i256mem, SchedWriteVecALU.YMM, 0>,
                                  VEX_4V, VEX_L, VEX_WIG;

let Constraints = "$src1 = $dst" in {
  defm PMULLD  : SS48I_binop_rm<0x40, "pmulld", mul, v4i32, VR128,
                                memopv2i64, i128mem, SchedWritePMULLD.XMM, 1>;
  defm PCMPEQQ : SS48I_binop_rm<0x29, "pcmpeqq", X86pcmpeq, v2i64, VR128,
                                memopv2i64, i128mem, SchedWriteVecALU.XMM, 1>;
}

/// SS41I_binop_rmi_int - SSE 4.1 binary operator with 8-bit immediate
multiclass SS41I_binop_rmi_int<bits<8> opc, string OpcodeStr,
                 Intrinsic IntId, RegisterClass RC, PatFrag memop_frag,
                 X86MemOperand x86memop, bit Is2Addr,
                 X86FoldableSchedWrite sched> {
  let isCommutable = 1 in
  def rri : SS4AIi8<opc, MRMSrcReg, (outs RC:$dst),
        (ins RC:$src1, RC:$src2, u8imm:$src3),
        !if(Is2Addr,
            !strconcat(OpcodeStr,
                "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
            !strconcat(OpcodeStr,
                "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
        [(set RC:$dst, (IntId RC:$src1, RC:$src2, imm:$src3))]>,
        Sched<[sched]>;
  def rmi : SS4AIi8<opc, MRMSrcMem, (outs RC:$dst),
        (ins RC:$src1, x86memop:$src2, u8imm:$src3),
        !if(Is2Addr,
            !strconcat(OpcodeStr,
                "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
            !strconcat(OpcodeStr,
                "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
        [(set RC:$dst,
          (IntId RC:$src1,
           (bitconvert (memop_frag addr:$src2)), imm:$src3))]>,
        Sched<[sched.Folded, ReadAfterLd]>;
}

/// SS41I_binop_rmi - SSE 4.1 binary operator with 8-bit immediate
multiclass SS41I_binop_rmi<bits<8> opc, string OpcodeStr, SDNode OpNode,
                           ValueType OpVT, RegisterClass RC, PatFrag memop_frag,
                           X86MemOperand x86memop, bit Is2Addr,
                           X86FoldableSchedWrite sched> {
  let isCommutable = 1 in
  def rri : SS4AIi8<opc, MRMSrcReg, (outs RC:$dst),
        (ins RC:$src1, RC:$src2, u8imm:$src3),
        !if(Is2Addr,
            !strconcat(OpcodeStr,
                "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
            !strconcat(OpcodeStr,
                "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
        [(set RC:$dst, (OpVT (OpNode RC:$src1, RC:$src2, imm:$src3)))]>,
        Sched<[sched]>;
  def rmi : SS4AIi8<opc, MRMSrcMem, (outs RC:$dst),
        (ins RC:$src1, x86memop:$src2, u8imm:$src3),
        !if(Is2Addr,
            !strconcat(OpcodeStr,
                "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
            !strconcat(OpcodeStr,
                "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
        [(set RC:$dst,
          (OpVT (OpNode RC:$src1,
                 (bitconvert (memop_frag addr:$src2)), imm:$src3)))]>,
        Sched<[sched.Folded, ReadAfterLd]>;
}

def BlendCommuteImm2 : SDNodeXForm<imm, [{
  uint8_t Imm = N->getZExtValue() & 0x03;
  return getI8Imm(Imm ^ 0x03, SDLoc(N));
}]>;

def BlendCommuteImm4 : SDNodeXForm<imm, [{
  uint8_t Imm = N->getZExtValue() & 0x0f;
  return getI8Imm(Imm ^ 0x0f, SDLoc(N));
}]>;

def BlendCommuteImm8 : SDNodeXForm<imm, [{
  uint8_t Imm = N->getZExtValue() & 0xff;
  return getI8Imm(Imm ^ 0xff, SDLoc(N));
}]>;
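
// These transforms produce the immediate for a blend with swapped operands:
// each mask bit selects source 1 when clear and source 2 when set, so
// commuting the sources means inverting every element's bit. For example,
// commuting a v4f32 blend with imm = 0x5 (elements 0 and 2 from source 2)
// yields 0x5 ^ 0xf = 0xa.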

let Predicates = [HasAVX] in {
  let isCommutable = 0 in {
    defm VMPSADBW : SS41I_binop_rmi_int<0x42, "vmpsadbw", int_x86_sse41_mpsadbw,
                                        VR128, loadv2i64, i128mem, 0,
                                        SchedWriteMPSAD.XMM>, VEX_4V, VEX_WIG;
  }

  let ExeDomain = SSEPackedSingle in
  defm VDPPS : SS41I_binop_rmi_int<0x40, "vdpps", int_x86_sse41_dpps,
                                   VR128, loadv4f32, f128mem, 0,
                                   SchedWriteDPPS.XMM>, VEX_4V, VEX_WIG;
  let ExeDomain = SSEPackedDouble in
  defm VDPPD : SS41I_binop_rmi_int<0x41, "vdppd", int_x86_sse41_dppd,
                                   VR128, loadv2f64, f128mem, 0,
                                   SchedWriteDPPD.XMM>, VEX_4V, VEX_WIG;
  let ExeDomain = SSEPackedSingle in
  defm VDPPSY : SS41I_binop_rmi_int<0x40, "vdpps", int_x86_avx_dp_ps_256,
                                    VR256, loadv8f32, i256mem, 0,
                                    SchedWriteDPPS.YMM>, VEX_4V, VEX_L, VEX_WIG;
}

let Predicates = [HasAVX2] in {
  let isCommutable = 0 in {
  defm VMPSADBWY : SS41I_binop_rmi_int<0x42, "vmpsadbw", int_x86_avx2_mpsadbw,
                                  VR256, loadv4i64, i256mem, 0,
                                  SchedWriteMPSAD.YMM>, VEX_4V, VEX_L, VEX_WIG;
  }
}

let Constraints = "$src1 = $dst" in {
  let isCommutable = 0 in {
  defm MPSADBW : SS41I_binop_rmi_int<0x42, "mpsadbw", int_x86_sse41_mpsadbw,
                                     VR128, memopv2i64, i128mem, 1,
                                     SchedWriteMPSAD.XMM>;
  }

  let ExeDomain = SSEPackedSingle in
  defm DPPS : SS41I_binop_rmi_int<0x40, "dpps", int_x86_sse41_dpps,
                                  VR128, memopv4f32, f128mem, 1,
                                  SchedWriteDPPS.XMM>;
  let ExeDomain = SSEPackedDouble in
  defm DPPD : SS41I_binop_rmi_int<0x41, "dppd", int_x86_sse41_dppd,
                                  VR128, memopv2f64, f128mem, 1,
                                  SchedWriteDPPD.XMM>;
}

/// SS41I_blend_rmi - SSE 4.1 blend with 8-bit immediate
multiclass SS41I_blend_rmi<bits<8> opc, string OpcodeStr, SDNode OpNode,
                           ValueType OpVT, RegisterClass RC, PatFrag memop_frag,
                           X86MemOperand x86memop, bit Is2Addr, Domain d,
                           X86FoldableSchedWrite sched, SDNodeXForm commuteXForm> {
let ExeDomain = d, Constraints = !if(Is2Addr, "$src1 = $dst", "") in {
  let isCommutable = 1 in
  def rri : SS4AIi8<opc, MRMSrcReg, (outs RC:$dst),
        (ins RC:$src1, RC:$src2, u8imm:$src3),
        !if(Is2Addr,
            !strconcat(OpcodeStr,
                "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
            !strconcat(OpcodeStr,
                "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
        [(set RC:$dst, (OpVT (OpNode RC:$src1, RC:$src2, imm:$src3)))]>,
        Sched<[sched]>;
  def rmi : SS4AIi8<opc, MRMSrcMem, (outs RC:$dst),
        (ins RC:$src1, x86memop:$src2, u8imm:$src3),
        !if(Is2Addr,
            !strconcat(OpcodeStr,
                "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
            !strconcat(OpcodeStr,
                "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
        [(set RC:$dst,
          (OpVT (OpNode RC:$src1,
                 (bitconvert (memop_frag addr:$src2)), imm:$src3)))]>,
        Sched<[sched.Folded, ReadAfterLd]>;
}

  // Pattern to commute if the load is in the first source.
  def : Pat<(OpVT (OpNode (bitconvert (memop_frag addr:$src2)),
                          RC:$src1, imm:$src3)),
            (!cast<Instruction>(NAME#"rmi") RC:$src1, addr:$src2,
                                            (commuteXForm imm:$src3))>;
}

let Predicates = [HasAVX] in {
  defm VBLENDPS : SS41I_blend_rmi<0x0C, "vblendps", X86Blendi, v4f32,
                                  VR128, loadv4f32, f128mem, 0, SSEPackedSingle,
                                  SchedWriteFBlend.XMM, BlendCommuteImm4>,
                                  VEX_4V, VEX_WIG;
  defm VBLENDPSY : SS41I_blend_rmi<0x0C, "vblendps", X86Blendi, v8f32,
                                   VR256, loadv8f32, f256mem, 0, SSEPackedSingle,
                                   SchedWriteFBlend.YMM, BlendCommuteImm8>,
                                   VEX_4V, VEX_L, VEX_WIG;
  defm VBLENDPD : SS41I_blend_rmi<0x0D, "vblendpd", X86Blendi, v2f64,
                                  VR128, loadv2f64, f128mem, 0, SSEPackedDouble,
                                  SchedWriteFBlend.XMM, BlendCommuteImm2>,
                                  VEX_4V, VEX_WIG;
  defm VBLENDPDY : SS41I_blend_rmi<0x0D, "vblendpd", X86Blendi, v4f64,
                                   VR256, loadv4f64, f256mem, 0, SSEPackedDouble,
                                   SchedWriteFBlend.YMM, BlendCommuteImm4>,
                                   VEX_4V, VEX_L, VEX_WIG;
  defm VPBLENDW : SS41I_blend_rmi<0x0E, "vpblendw", X86Blendi, v8i16,
                                  VR128, loadv2i64, i128mem, 0, SSEPackedInt,
                                  SchedWriteBlend.XMM, BlendCommuteImm8>,
                                  VEX_4V, VEX_WIG;
}

let Predicates = [HasAVX2] in {
  defm VPBLENDWY : SS41I_blend_rmi<0x0E, "vpblendw", X86Blendi, v16i16,
                                   VR256, loadv4i64, i256mem, 0, SSEPackedInt,
                                   SchedWriteBlend.YMM, BlendCommuteImm8>,
                                   VEX_4V, VEX_L, VEX_WIG;
}

defm BLENDPS : SS41I_blend_rmi<0x0C, "blendps", X86Blendi, v4f32,
                               VR128, memopv4f32, f128mem, 1, SSEPackedSingle,
                               SchedWriteFBlend.XMM, BlendCommuteImm4>;
defm BLENDPD : SS41I_blend_rmi<0x0D, "blendpd", X86Blendi, v2f64,
                               VR128, memopv2f64, f128mem, 1, SSEPackedDouble,
                               SchedWriteFBlend.XMM, BlendCommuteImm2>;
defm PBLENDW : SS41I_blend_rmi<0x0E, "pblendw", X86Blendi, v8i16,
                               VR128, memopv2i64, i128mem, 1, SSEPackedInt,
                               SchedWriteBlend.XMM, BlendCommuteImm8>;

// For insertion into the zero index (low half) of a 256-bit vector, it is
// more efficient to generate a blend with immediate instead of an insert*128.
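// For example, inserting an xmm into the low half of a ymm becomes
// "vblendpd $0x3" (take the low two doubles from the widened xmm source),
// which is typically cheaper than the vinsertf128 shuffle.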
let Predicates = [HasAVX] in {
def : Pat<(insert_subvector (v4f64 VR256:$src1), (v2f64 VR128:$src2), (iPTR 0)),
          (VBLENDPDYrri VR256:$src1,
                        (INSERT_SUBREG (v4f64 (IMPLICIT_DEF)),
                                       VR128:$src2, sub_xmm), 0x3)>;
def : Pat<(insert_subvector (v8f32 VR256:$src1), (v4f32 VR128:$src2), (iPTR 0)),
          (VBLENDPSYrri VR256:$src1,
                        (INSERT_SUBREG (v8f32 (IMPLICIT_DEF)),
                                       VR128:$src2, sub_xmm), 0xf)>;
}

/// SS41I_quaternary_int_avx - AVX SSE 4.1 intrinsics with four operands
multiclass SS41I_quaternary_int_avx<bits<8> opc, string OpcodeStr,
                                    RegisterClass RC, X86MemOperand x86memop,
                                    PatFrag mem_frag, Intrinsic IntId,
                                    X86FoldableSchedWrite sched> {
  def rr : Ii8Reg<opc, MRMSrcReg, (outs RC:$dst),
                  (ins RC:$src1, RC:$src2, RC:$src3),
                  !strconcat(OpcodeStr,
                    "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
                  [(set RC:$dst, (IntId RC:$src1, RC:$src2, RC:$src3))],
                  SSEPackedInt>, TAPD, VEX_4V,
                Sched<[sched]>;

  def rm : Ii8Reg<opc, MRMSrcMem, (outs RC:$dst),
                  (ins RC:$src1, x86memop:$src2, RC:$src3),
                  !strconcat(OpcodeStr,
                    "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
                  [(set RC:$dst,
                        (IntId RC:$src1, (bitconvert (mem_frag addr:$src2)),
                               RC:$src3))], SSEPackedInt>, TAPD, VEX_4V,
                Sched<[sched.Folded, ReadAfterLd,
                       // x86memop:$src2
                       ReadDefault, ReadDefault, ReadDefault, ReadDefault,
                       ReadDefault,
                       // RC:$src3
6331                       ReadAfterLd]>;
6332}
6333
6334let Predicates = [HasAVX] in {
6335let ExeDomain = SSEPackedDouble in {
6336defm VBLENDVPD  : SS41I_quaternary_int_avx<0x4B, "vblendvpd", VR128, f128mem,
6337                                           loadv2f64, int_x86_sse41_blendvpd,
6338                                           SchedWriteFVarBlend.XMM>;
6339defm VBLENDVPDY : SS41I_quaternary_int_avx<0x4B, "vblendvpd", VR256, f256mem,
6340                                  loadv4f64, int_x86_avx_blendv_pd_256,
6341                                  SchedWriteFVarBlend.YMM>, VEX_L;
6342} // ExeDomain = SSEPackedDouble
6343let ExeDomain = SSEPackedSingle in {
6344defm VBLENDVPS  : SS41I_quaternary_int_avx<0x4A, "vblendvps", VR128, f128mem,
6345                                           loadv4f32, int_x86_sse41_blendvps,
6346                                           SchedWriteFVarBlend.XMM>;
6347defm VBLENDVPSY : SS41I_quaternary_int_avx<0x4A, "vblendvps", VR256, f256mem,
6348                                  loadv8f32, int_x86_avx_blendv_ps_256,
6349                                  SchedWriteFVarBlend.YMM>, VEX_L;
6350} // ExeDomain = SSEPackedSingle
6351defm VPBLENDVB  : SS41I_quaternary_int_avx<0x4C, "vpblendvb", VR128, i128mem,
6352                                           loadv2i64, int_x86_sse41_pblendvb,
6353                                           SchedWriteVarBlend.XMM>;
6354}
6355
6356let Predicates = [HasAVX2] in {
6357defm VPBLENDVBY : SS41I_quaternary_int_avx<0x4C, "vpblendvb", VR256, i256mem,
6358                                      loadv4i64, int_x86_avx2_pblendvb,
6359                                      SchedWriteVarBlend.YMM>, VEX_L;
6360}
6361
6362let Predicates = [HasAVX] in {
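  // vselect (mask, t, f) takes t where a mask bit is set, while blendv takes
  // its *second* source where the mask bit is set, so the operands are
  // emitted swapped: (blendv f, t, mask).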
6363  def : Pat<(v16i8 (vselect (v16i8 VR128:$mask), (v16i8 VR128:$src1),
6364                            (v16i8 VR128:$src2))),
6365            (VPBLENDVBrr VR128:$src2, VR128:$src1, VR128:$mask)>;
6366  def : Pat<(v4i32 (vselect (v4i32 VR128:$mask), (v4i32 VR128:$src1),
6367                            (v4i32 VR128:$src2))),
6368            (VBLENDVPSrr VR128:$src2, VR128:$src1, VR128:$mask)>;
6369  def : Pat<(v4f32 (vselect (v4i32 VR128:$mask), (v4f32 VR128:$src1),
6370                            (v4f32 VR128:$src2))),
6371            (VBLENDVPSrr VR128:$src2, VR128:$src1, VR128:$mask)>;
6372  def : Pat<(v2i64 (vselect (v2i64 VR128:$mask), (v2i64 VR128:$src1),
6373                            (v2i64 VR128:$src2))),
6374            (VBLENDVPDrr VR128:$src2, VR128:$src1, VR128:$mask)>;
6375  def : Pat<(v2f64 (vselect (v2i64 VR128:$mask), (v2f64 VR128:$src1),
6376                            (v2f64 VR128:$src2))),
6377            (VBLENDVPDrr VR128:$src2, VR128:$src1, VR128:$mask)>;
6378  def : Pat<(v8i32 (vselect (v8i32 VR256:$mask), (v8i32 VR256:$src1),
6379                            (v8i32 VR256:$src2))),
6380            (VBLENDVPSYrr VR256:$src2, VR256:$src1, VR256:$mask)>;
6381  def : Pat<(v8f32 (vselect (v8i32 VR256:$mask), (v8f32 VR256:$src1),
6382                            (v8f32 VR256:$src2))),
6383            (VBLENDVPSYrr VR256:$src2, VR256:$src1, VR256:$mask)>;
6384  def : Pat<(v4i64 (vselect (v4i64 VR256:$mask), (v4i64 VR256:$src1),
6385                            (v4i64 VR256:$src2))),
6386            (VBLENDVPDYrr VR256:$src2, VR256:$src1, VR256:$mask)>;
6387  def : Pat<(v4f64 (vselect (v4i64 VR256:$mask), (v4f64 VR256:$src1),
6388                            (v4f64 VR256:$src2))),
6389            (VBLENDVPDYrr VR256:$src2, VR256:$src1, VR256:$mask)>;
6390}
6391
6392let Predicates = [HasAVX2] in {
6393  def : Pat<(v32i8 (vselect (v32i8 VR256:$mask), (v32i8 VR256:$src1),
6394                            (v32i8 VR256:$src2))),
6395            (VPBLENDVBYrr VR256:$src2, VR256:$src1, VR256:$mask)>;
6396}
6397
// Prefer a blend over a movss/movsd when optimizing for speed: blends have
// better throughput on Sandy Bridge and Haswell. When optimizing for size the
// movs[s/d] forms still win, as they are 1-2 bytes shorter.
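// As a rough size comparison, "vmovss %xmm1, %xmm0, %xmm0" encodes in 4
// bytes while "vblendps $1, %xmm1, %xmm0, %xmm0" needs 6 (a three-byte VEX
// prefix plus an immediate byte).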
6401let Predicates = [HasAVX, OptForSpeed] in {
6402  def : Pat<(v4f32 (X86vzmovl (v4f32 VR128:$src))),
6403            (VBLENDPSrri (v4f32 (V_SET0)), VR128:$src, (i8 1))>;
6404  def : Pat<(v4i32 (X86vzmovl (v4i32 VR128:$src))),
6405            (VPBLENDWrri (v4i32 (V_SET0)), VR128:$src, (i8 3))>;
6406
6407  def : Pat<(v4f32 (X86Movss VR128:$src1, VR128:$src2)),
6408            (VBLENDPSrri VR128:$src1, VR128:$src2, (i8 1))>;
6409  def : Pat<(v4f32 (X86Movss VR128:$src1, (loadv4f32 addr:$src2))),
6410            (VBLENDPSrmi VR128:$src1, addr:$src2, (i8 1))>;
6411  def : Pat<(v4f32 (X86Movss (loadv4f32 addr:$src2), VR128:$src1)),
6412            (VBLENDPSrmi VR128:$src1, addr:$src2, (i8 0xe))>;
6413
6414  def : Pat<(v2f64 (X86Movsd VR128:$src1, VR128:$src2)),
6415            (VBLENDPDrri VR128:$src1, VR128:$src2, (i8 1))>;
6416  def : Pat<(v2f64 (X86Movsd VR128:$src1, (loadv2f64 addr:$src2))),
6417            (VBLENDPDrmi VR128:$src1, addr:$src2, (i8 1))>;
6418  def : Pat<(v2f64 (X86Movsd (loadv2f64 addr:$src2), VR128:$src1)),
6419            (VBLENDPDrmi VR128:$src1, addr:$src2, (i8 2))>;
6420
6421  // Move low f32 and clear high bits.
6422  def : Pat<(v8f32 (X86vzmovl (v8f32 VR256:$src))),
6423            (SUBREG_TO_REG (i32 0),
6424             (v4f32 (VBLENDPSrri (v4f32 (V_SET0)),
6425                          (v4f32 (EXTRACT_SUBREG (v8f32 VR256:$src), sub_xmm)),
6426                          (i8 1))), sub_xmm)>;
6427  def : Pat<(v8i32 (X86vzmovl (v8i32 VR256:$src))),
6428            (SUBREG_TO_REG (i32 0),
6429             (v4i32 (VPBLENDWrri (v4i32 (V_SET0)),
6430                          (v4i32 (EXTRACT_SUBREG (v8i32 VR256:$src), sub_xmm)),
6431                          (i8 3))), sub_xmm)>;
6432
6433  def : Pat<(v4f64 (X86vzmovl (v4f64 VR256:$src))),
6434            (SUBREG_TO_REG (i32 0),
6435             (v2f64 (VBLENDPDrri (v2f64 (V_SET0)),
6436                          (v2f64 (EXTRACT_SUBREG (v4f64 VR256:$src), sub_xmm)),
6437                          (i8 1))), sub_xmm)>;
6438  def : Pat<(v4i64 (X86vzmovl (v4i64 VR256:$src))),
6439            (SUBREG_TO_REG (i32 0),
6440             (v2i64 (VPBLENDWrri (v2i64 (V_SET0)),
6441                          (v2i64 (EXTRACT_SUBREG (v4i64 VR256:$src), sub_xmm)),
6442                          (i8 0xf))), sub_xmm)>;
6443}
6444
// As above, use blends under OptForSpeed for their better throughput on
// Sandy Bridge and Haswell; the 1-2 byte shorter movs[s/d] forms are still
// preferred when optimizing for size.
6448let Predicates = [UseSSE41, OptForSpeed] in {
6449  // With SSE41 we can use blends for these patterns.
6450  def : Pat<(v4f32 (X86vzmovl (v4f32 VR128:$src))),
6451            (BLENDPSrri (v4f32 (V_SET0)), VR128:$src, (i8 1))>;
6452  def : Pat<(v4i32 (X86vzmovl (v4i32 VR128:$src))),
6453            (PBLENDWrri (v4i32 (V_SET0)), VR128:$src, (i8 3))>;
6454
6455  def : Pat<(v4f32 (X86Movss VR128:$src1, VR128:$src2)),
6456            (BLENDPSrri VR128:$src1, VR128:$src2, (i8 1))>;
6457  def : Pat<(v4f32 (X86Movss VR128:$src1, (memopv4f32 addr:$src2))),
6458            (BLENDPSrmi VR128:$src1, addr:$src2, (i8 1))>;
6459  def : Pat<(v4f32 (X86Movss (memopv4f32 addr:$src2), VR128:$src1)),
6460            (BLENDPSrmi VR128:$src1, addr:$src2, (i8 0xe))>;
6461
6462  def : Pat<(v2f64 (X86Movsd VR128:$src1, VR128:$src2)),
6463            (BLENDPDrri VR128:$src1, VR128:$src2, (i8 1))>;
6464  def : Pat<(v2f64 (X86Movsd VR128:$src1, (memopv2f64 addr:$src2))),
6465            (BLENDPDrmi VR128:$src1, addr:$src2, (i8 1))>;
6466  def : Pat<(v2f64 (X86Movsd (memopv2f64 addr:$src2), VR128:$src1)),
6467            (BLENDPDrmi VR128:$src1, addr:$src2, (i8 2))>;
6468}
6469
6471/// SS41I_ternary_int - SSE 4.1 ternary operator
6472let Uses = [XMM0], Constraints = "$src1 = $dst" in {
6473  multiclass SS41I_ternary_int<bits<8> opc, string OpcodeStr, PatFrag mem_frag,
6474                               X86MemOperand x86memop, Intrinsic IntId,
6475                               X86FoldableSchedWrite sched> {
6476    def rr0 : SS48I<opc, MRMSrcReg, (outs VR128:$dst),
6477                    (ins VR128:$src1, VR128:$src2),
6478                    !strconcat(OpcodeStr,
6479                     "\t{%xmm0, $src2, $dst|$dst, $src2, xmm0}"),
6480                    [(set VR128:$dst, (IntId VR128:$src1, VR128:$src2, XMM0))]>,
6481                    Sched<[sched]>;
6482
6483    def rm0 : SS48I<opc, MRMSrcMem, (outs VR128:$dst),
6484                    (ins VR128:$src1, x86memop:$src2),
6485                    !strconcat(OpcodeStr,
6486                     "\t{%xmm0, $src2, $dst|$dst, $src2, xmm0}"),
6487                    [(set VR128:$dst,
6488                      (IntId VR128:$src1,
6489                       (bitconvert (mem_frag addr:$src2)), XMM0))]>,
6490                    Sched<[sched.Folded, ReadAfterLd]>;
6491  }
6492}
6493
6494let ExeDomain = SSEPackedDouble in
6495defm BLENDVPD : SS41I_ternary_int<0x15, "blendvpd", memopv2f64, f128mem,
6496                                  int_x86_sse41_blendvpd, SchedWriteFVarBlend.XMM>;
6497let ExeDomain = SSEPackedSingle in
6498defm BLENDVPS : SS41I_ternary_int<0x14, "blendvps", memopv4f32, f128mem,
6499                                  int_x86_sse41_blendvps, SchedWriteFVarBlend.XMM>;
6500defm PBLENDVB : SS41I_ternary_int<0x10, "pblendvb", memopv2i64, i128mem,
6501                                  int_x86_sse41_pblendvb, SchedWriteVarBlend.XMM>;
6502
6503// Aliases with the implicit xmm0 argument
6504def : InstAlias<"blendvpd\t{$src2, $dst|$dst, $src2}",
6505                (BLENDVPDrr0 VR128:$dst, VR128:$src2), 0>;
6506def : InstAlias<"blendvpd\t{$src2, $dst|$dst, $src2}",
6507                (BLENDVPDrm0 VR128:$dst, f128mem:$src2), 0>;
6508def : InstAlias<"blendvps\t{$src2, $dst|$dst, $src2}",
6509                (BLENDVPSrr0 VR128:$dst, VR128:$src2), 0>;
6510def : InstAlias<"blendvps\t{$src2, $dst|$dst, $src2}",
6511                (BLENDVPSrm0 VR128:$dst, f128mem:$src2), 0>;
6512def : InstAlias<"pblendvb\t{$src2, $dst|$dst, $src2}",
6513                (PBLENDVBrr0 VR128:$dst, VR128:$src2), 0>;
6514def : InstAlias<"pblendvb\t{$src2, $dst|$dst, $src2}",
6515                (PBLENDVBrm0 VR128:$dst, i128mem:$src2), 0>;
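// These let the assembler accept a two-operand spelling, e.g.
//   blendvpd %xmm1, %xmm2
// as shorthand for "blendvpd %xmm0, %xmm1, %xmm2" with the mask in xmm0.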
6516
6517let Predicates = [UseSSE41] in {
6518  def : Pat<(v16i8 (vselect (v16i8 XMM0), (v16i8 VR128:$src1),
6519                            (v16i8 VR128:$src2))),
6520            (PBLENDVBrr0 VR128:$src2, VR128:$src1)>;
6521  def : Pat<(v4i32 (vselect (v4i32 XMM0), (v4i32 VR128:$src1),
6522                            (v4i32 VR128:$src2))),
6523            (BLENDVPSrr0 VR128:$src2, VR128:$src1)>;
6524  def : Pat<(v4f32 (vselect (v4i32 XMM0), (v4f32 VR128:$src1),
6525                            (v4f32 VR128:$src2))),
6526            (BLENDVPSrr0 VR128:$src2, VR128:$src1)>;
6527  def : Pat<(v2i64 (vselect (v2i64 XMM0), (v2i64 VR128:$src1),
6528                            (v2i64 VR128:$src2))),
6529            (BLENDVPDrr0 VR128:$src2, VR128:$src1)>;
6530  def : Pat<(v2f64 (vselect (v2i64 XMM0), (v2f64 VR128:$src1),
6531                            (v2f64 VR128:$src2))),
6532            (BLENDVPDrr0 VR128:$src2, VR128:$src1)>;
6533}
6534
6535let AddedComplexity = 400 in { // Prefer non-temporal versions
6536
6537let Predicates = [HasAVX, NoVLX] in
6538def VMOVNTDQArm : SS48I<0x2A, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src),
6539                        "vmovntdqa\t{$src, $dst|$dst, $src}", []>,
6540                        Sched<[SchedWriteVecMoveLSNT.XMM.RM]>, VEX, VEX_WIG;
6541let Predicates = [HasAVX2, NoVLX] in
6542def VMOVNTDQAYrm : SS48I<0x2A, MRMSrcMem, (outs VR256:$dst), (ins i256mem:$src),
6543                         "vmovntdqa\t{$src, $dst|$dst, $src}", []>,
6544                         Sched<[SchedWriteVecMoveLSNT.YMM.RM]>, VEX, VEX_L, VEX_WIG;
6545def MOVNTDQArm : SS48I<0x2A, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src),
6546                       "movntdqa\t{$src, $dst|$dst, $src}", []>,
6547                       Sched<[SchedWriteVecMoveLSNT.XMM.RM]>;
6548
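// (v)movntdqa only has an aligned form, so the patterns below require
// alignednontemporalload (16-byte alignment for XMM, 32-byte for YMM)
// rather than a plain non-temporal load.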
6549let Predicates = [HasAVX2, NoVLX] in {
6550  def : Pat<(v8f32 (alignednontemporalload addr:$src)),
6551            (VMOVNTDQAYrm addr:$src)>;
6552  def : Pat<(v4f64 (alignednontemporalload addr:$src)),
6553            (VMOVNTDQAYrm addr:$src)>;
6554  def : Pat<(v4i64 (alignednontemporalload addr:$src)),
6555            (VMOVNTDQAYrm addr:$src)>;
6556}
6557
6558let Predicates = [HasAVX, NoVLX] in {
6559  def : Pat<(v4f32 (alignednontemporalload addr:$src)),
6560            (VMOVNTDQArm addr:$src)>;
6561  def : Pat<(v2f64 (alignednontemporalload addr:$src)),
6562            (VMOVNTDQArm addr:$src)>;
6563  def : Pat<(v2i64 (alignednontemporalload addr:$src)),
6564            (VMOVNTDQArm addr:$src)>;
6565}
6566
6567let Predicates = [UseSSE41] in {
6568  def : Pat<(v4f32 (alignednontemporalload addr:$src)),
6569            (MOVNTDQArm addr:$src)>;
6570  def : Pat<(v2f64 (alignednontemporalload addr:$src)),
6571            (MOVNTDQArm addr:$src)>;
6572  def : Pat<(v2i64 (alignednontemporalload addr:$src)),
6573            (MOVNTDQArm addr:$src)>;
6574}
6575
6576} // AddedComplexity
6577
6578//===----------------------------------------------------------------------===//
6579// SSE4.2 - Compare Instructions
6580//===----------------------------------------------------------------------===//
6581
6582/// SS42I_binop_rm - Simple SSE 4.2 binary operator
6583multiclass SS42I_binop_rm<bits<8> opc, string OpcodeStr, SDNode OpNode,
6584                          ValueType OpVT, RegisterClass RC, PatFrag memop_frag,
6585                          X86MemOperand x86memop, X86FoldableSchedWrite sched,
6586                          bit Is2Addr = 1> {
6587  def rr : SS428I<opc, MRMSrcReg, (outs RC:$dst),
6588       (ins RC:$src1, RC:$src2),
6589       !if(Is2Addr,
6590           !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
6591           !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
6592       [(set RC:$dst, (OpVT (OpNode RC:$src1, RC:$src2)))]>,
6593       Sched<[sched]>;
6594  def rm : SS428I<opc, MRMSrcMem, (outs RC:$dst),
6595       (ins RC:$src1, x86memop:$src2),
6596       !if(Is2Addr,
6597           !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
6598           !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
6599       [(set RC:$dst,
6600         (OpVT (OpNode RC:$src1, (memop_frag addr:$src2))))]>,
6601       Sched<[sched.Folded, ReadAfterLd]>;
6602}
6603
6604let Predicates = [HasAVX] in
6605  defm VPCMPGTQ : SS42I_binop_rm<0x37, "vpcmpgtq", X86pcmpgt, v2i64, VR128,
6606                                 loadv2i64, i128mem, SchedWriteVecALU.XMM, 0>,
6607                                 VEX_4V, VEX_WIG;
6608
6609let Predicates = [HasAVX2] in
6610  defm VPCMPGTQY : SS42I_binop_rm<0x37, "vpcmpgtq", X86pcmpgt, v4i64, VR256,
6611                                  loadv4i64, i256mem, SchedWriteVecALU.YMM, 0>,
6612                                  VEX_4V, VEX_L, VEX_WIG;
6613
6614let Constraints = "$src1 = $dst" in
6615  defm PCMPGTQ : SS42I_binop_rm<0x37, "pcmpgtq", X86pcmpgt, v2i64, VR128,
6616                                memopv2i64, i128mem, SchedWriteVecALU.XMM>;
6617
6618//===----------------------------------------------------------------------===//
6619// SSE4.2 - String/text Processing Instructions
6620//===----------------------------------------------------------------------===//
6621
multiclass SS42AI_pcmpistrm<string asm> {
6623  def rr : SS42AI<0x62, MRMSrcReg, (outs),
6624    (ins VR128:$src1, VR128:$src2, u8imm:$src3),
6625    !strconcat(asm, "\t{$src3, $src2, $src1|$src1, $src2, $src3}"),
6626    []>, Sched<[WritePCmpIStrM]>;
6627  let mayLoad = 1 in
  def rm : SS42AI<0x62, MRMSrcMem, (outs),
6629    (ins VR128:$src1, i128mem:$src2, u8imm:$src3),
6630    !strconcat(asm, "\t{$src3, $src2, $src1|$src1, $src2, $src3}"),
6631    []>, Sched<[WritePCmpIStrM.Folded, ReadAfterLd]>;
6632}
6633
6634let Defs = [XMM0, EFLAGS], hasSideEffects = 0 in {
6635  let Predicates = [HasAVX] in
  defm VPCMPISTRM : SS42AI_pcmpistrm<"vpcmpistrm">, VEX;
  defm PCMPISTRM  : SS42AI_pcmpistrm<"pcmpistrm">;
6638}
6639
6640multiclass SS42AI_pcmpestrm<string asm> {
6641  def rr : SS42AI<0x60, MRMSrcReg, (outs),
6642    (ins VR128:$src1, VR128:$src3, u8imm:$src5),
6643    !strconcat(asm, "\t{$src5, $src3, $src1|$src1, $src3, $src5}"),
6644    []>, Sched<[WritePCmpEStrM]>;
6645  let mayLoad = 1 in
6646  def rm : SS42AI<0x60, MRMSrcMem, (outs),
6647    (ins VR128:$src1, i128mem:$src3, u8imm:$src5),
6648    !strconcat(asm, "\t{$src5, $src3, $src1|$src1, $src3, $src5}"),
6649    []>, Sched<[WritePCmpEStrM.Folded, ReadAfterLd]>;
6650}
6651
6652let Defs = [XMM0, EFLAGS], Uses = [EAX, EDX], hasSideEffects = 0 in {
6653  let Predicates = [HasAVX] in
6654  defm VPCMPESTRM : SS42AI_pcmpestrm<"vpcmpestrm">, VEX;
  defm PCMPESTRM  : SS42AI_pcmpestrm<"pcmpestrm">;
6656}
6657
6658multiclass SS42AI_pcmpistri<string asm> {
6659  def rr : SS42AI<0x63, MRMSrcReg, (outs),
6660    (ins VR128:$src1, VR128:$src2, u8imm:$src3),
6661    !strconcat(asm, "\t{$src3, $src2, $src1|$src1, $src2, $src3}"),
6662    []>, Sched<[WritePCmpIStrI]>;
6663  let mayLoad = 1 in
6664  def rm : SS42AI<0x63, MRMSrcMem, (outs),
6665    (ins VR128:$src1, i128mem:$src2, u8imm:$src3),
6666    !strconcat(asm, "\t{$src3, $src2, $src1|$src1, $src2, $src3}"),
6667    []>, Sched<[WritePCmpIStrI.Folded, ReadAfterLd]>;
6668}
6669
6670let Defs = [ECX, EFLAGS], hasSideEffects = 0 in {
6671  let Predicates = [HasAVX] in
6672  defm VPCMPISTRI : SS42AI_pcmpistri<"vpcmpistri">, VEX;
6673  defm PCMPISTRI  : SS42AI_pcmpistri<"pcmpistri">;
6674}
6675
6676multiclass SS42AI_pcmpestri<string asm> {
6677  def rr : SS42AI<0x61, MRMSrcReg, (outs),
6678    (ins VR128:$src1, VR128:$src3, u8imm:$src5),
6679    !strconcat(asm, "\t{$src5, $src3, $src1|$src1, $src3, $src5}"),
6680    []>, Sched<[WritePCmpEStrI]>;
6681  let mayLoad = 1 in
6682  def rm : SS42AI<0x61, MRMSrcMem, (outs),
6683    (ins VR128:$src1, i128mem:$src3, u8imm:$src5),
6684    !strconcat(asm, "\t{$src5, $src3, $src1|$src1, $src3, $src5}"),
6685    []>, Sched<[WritePCmpEStrI.Folded, ReadAfterLd]>;
6686}
6687
6688let Defs = [ECX, EFLAGS], Uses = [EAX, EDX], hasSideEffects = 0 in {
6689  let Predicates = [HasAVX] in
6690  defm VPCMPESTRI : SS42AI_pcmpestri<"vpcmpestri">, VEX;
6691  defm PCMPESTRI  : SS42AI_pcmpestri<"pcmpestri">;
6692}
6693
6694//===----------------------------------------------------------------------===//
6695// SSE4.2 - CRC Instructions
6696//===----------------------------------------------------------------------===//
6697
6698// No CRC instructions have AVX equivalents
6699
// CRC intrinsic instruction.
// This set of instructions is rm-only; the only difference between the
// variants is the size of the r and m operands.
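// e.g. "crc32b %cl, %eax" folds one byte into the running checksum in %eax.
// Note the instruction uses the CRC-32C (Castagnoli) polynomial, not the
// polynomial used by zlib/PKZIP.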
6703class SS42I_crc32r<bits<8> opc, string asm, RegisterClass RCOut,
6704                   RegisterClass RCIn, SDPatternOperator Int> :
6705  SS42FI<opc, MRMSrcReg, (outs RCOut:$dst), (ins RCOut:$src1, RCIn:$src2),
6706         !strconcat(asm, "\t{$src2, $src1|$src1, $src2}"),
6707         [(set RCOut:$dst, (Int RCOut:$src1, RCIn:$src2))]>,
6708         Sched<[WriteCRC32]>;
6709
6710class SS42I_crc32m<bits<8> opc, string asm, RegisterClass RCOut,
6711                   X86MemOperand x86memop, SDPatternOperator Int> :
6712  SS42FI<opc, MRMSrcMem, (outs RCOut:$dst), (ins RCOut:$src1, x86memop:$src2),
6713         !strconcat(asm, "\t{$src2, $src1|$src1, $src2}"),
6714         [(set RCOut:$dst, (Int RCOut:$src1, (load addr:$src2)))]>,
6715         Sched<[WriteCRC32.Folded, ReadAfterLd]>;
6716
6717let Constraints = "$src1 = $dst" in {
6718  def CRC32r32m8  : SS42I_crc32m<0xF0, "crc32{b}", GR32, i8mem,
6719                                 int_x86_sse42_crc32_32_8>;
6720  def CRC32r32r8  : SS42I_crc32r<0xF0, "crc32{b}", GR32, GR8,
6721                                 int_x86_sse42_crc32_32_8>;
6722  def CRC32r32m16 : SS42I_crc32m<0xF1, "crc32{w}", GR32, i16mem,
6723                                 int_x86_sse42_crc32_32_16>, OpSize16;
6724  def CRC32r32r16 : SS42I_crc32r<0xF1, "crc32{w}", GR32, GR16,
6725                                 int_x86_sse42_crc32_32_16>, OpSize16;
6726  def CRC32r32m32 : SS42I_crc32m<0xF1, "crc32{l}", GR32, i32mem,
6727                                 int_x86_sse42_crc32_32_32>, OpSize32;
6728  def CRC32r32r32 : SS42I_crc32r<0xF1, "crc32{l}", GR32, GR32,
6729                                 int_x86_sse42_crc32_32_32>, OpSize32;
6730  def CRC32r64m64 : SS42I_crc32m<0xF1, "crc32{q}", GR64, i64mem,
6731                                 int_x86_sse42_crc32_64_64>, REX_W;
6732  def CRC32r64r64 : SS42I_crc32r<0xF1, "crc32{q}", GR64, GR64,
6733                                 int_x86_sse42_crc32_64_64>, REX_W;
6734  let hasSideEffects = 0 in {
6735    let mayLoad = 1 in
6736    def CRC32r64m8 : SS42I_crc32m<0xF0, "crc32{b}", GR64, i8mem,
6737                                   null_frag>, REX_W;
6738    def CRC32r64r8 : SS42I_crc32r<0xF0, "crc32{b}", GR64, GR8,
6739                                   null_frag>, REX_W;
6740  }
6741}
6742
6743//===----------------------------------------------------------------------===//
6744// SHA-NI Instructions
6745//===----------------------------------------------------------------------===//
6746
6747// FIXME: Is there a better scheduler class for SHA than WriteVecIMul?
6748multiclass SHAI_binop<bits<8> Opc, string OpcodeStr, Intrinsic IntId,
6749                      X86FoldableSchedWrite sched, bit UsesXMM0 = 0> {
6750  def rr : I<Opc, MRMSrcReg, (outs VR128:$dst),
6751             (ins VR128:$src1, VR128:$src2),
6752             !if(UsesXMM0,
6753                 !strconcat(OpcodeStr, "\t{%xmm0, $src2, $dst|$dst, $src2, xmm0}"),
6754                 !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}")),
6755             [!if(UsesXMM0,
6756                  (set VR128:$dst, (IntId VR128:$src1, VR128:$src2, XMM0)),
6757                  (set VR128:$dst, (IntId VR128:$src1, VR128:$src2)))]>,
6758             T8, Sched<[sched]>;
6759
6760  def rm : I<Opc, MRMSrcMem, (outs VR128:$dst),
6761             (ins VR128:$src1, i128mem:$src2),
6762             !if(UsesXMM0,
6763                 !strconcat(OpcodeStr, "\t{%xmm0, $src2, $dst|$dst, $src2, xmm0}"),
6764                 !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}")),
6765             [!if(UsesXMM0,
6766                  (set VR128:$dst, (IntId VR128:$src1,
6767                    (bc_v4i32 (memopv2i64 addr:$src2)), XMM0)),
6768                  (set VR128:$dst, (IntId VR128:$src1,
6769                    (bc_v4i32 (memopv2i64 addr:$src2)))))]>, T8,
6770             Sched<[sched.Folded, ReadAfterLd]>;
6771}
6772
6773let Constraints = "$src1 = $dst", Predicates = [HasSHA] in {
6774  def SHA1RNDS4rri : Ii8<0xCC, MRMSrcReg, (outs VR128:$dst),
6775                         (ins VR128:$src1, VR128:$src2, u8imm:$src3),
6776                         "sha1rnds4\t{$src3, $src2, $dst|$dst, $src2, $src3}",
6777                         [(set VR128:$dst,
6778                           (int_x86_sha1rnds4 VR128:$src1, VR128:$src2,
6779                            (i8 imm:$src3)))]>, TA,
6780                         Sched<[SchedWriteVecIMul.XMM]>;
6781  def SHA1RNDS4rmi : Ii8<0xCC, MRMSrcMem, (outs VR128:$dst),
6782                         (ins VR128:$src1, i128mem:$src2, u8imm:$src3),
6783                         "sha1rnds4\t{$src3, $src2, $dst|$dst, $src2, $src3}",
6784                         [(set VR128:$dst,
6785                           (int_x86_sha1rnds4 VR128:$src1,
6786                            (bc_v4i32 (memopv2i64 addr:$src2)),
6787                            (i8 imm:$src3)))]>, TA,
6788                         Sched<[SchedWriteVecIMul.XMM.Folded, ReadAfterLd]>;
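  // The immediate selects the SHA-1 round function and constant group (0-3),
  // e.g. "sha1rnds4 $0, %xmm1, %xmm0" performs four rounds with the group-0
  // logic.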
6789
6790  defm SHA1NEXTE : SHAI_binop<0xC8, "sha1nexte", int_x86_sha1nexte,
6791                              SchedWriteVecIMul.XMM>;
6792  defm SHA1MSG1  : SHAI_binop<0xC9, "sha1msg1", int_x86_sha1msg1,
6793                              SchedWriteVecIMul.XMM>;
6794  defm SHA1MSG2  : SHAI_binop<0xCA, "sha1msg2", int_x86_sha1msg2,
6795                              SchedWriteVecIMul.XMM>;
6796
  let Uses = [XMM0] in
6798  defm SHA256RNDS2 : SHAI_binop<0xCB, "sha256rnds2", int_x86_sha256rnds2,
6799                                SchedWriteVecIMul.XMM, 1>;
6800
6801  defm SHA256MSG1 : SHAI_binop<0xCC, "sha256msg1", int_x86_sha256msg1,
6802                               SchedWriteVecIMul.XMM>;
6803  defm SHA256MSG2 : SHAI_binop<0xCD, "sha256msg2", int_x86_sha256msg2,
6804                               SchedWriteVecIMul.XMM>;
6805}
6806
// Aliases that allow omitting the explicit XMM0 operand
6808def : InstAlias<"sha256rnds2\t{$src2, $dst|$dst, $src2}",
6809                (SHA256RNDS2rr VR128:$dst, VR128:$src2), 0>;
6810def : InstAlias<"sha256rnds2\t{$src2, $dst|$dst, $src2}",
6811                (SHA256RNDS2rm VR128:$dst, i128mem:$src2), 0>;
6812
6813//===----------------------------------------------------------------------===//
6814// AES-NI Instructions
6815//===----------------------------------------------------------------------===//
6816
6817multiclass AESI_binop_rm_int<bits<8> opc, string OpcodeStr,
6818                             Intrinsic IntId, PatFrag ld_frag,
6819                             bit Is2Addr = 0, RegisterClass RC = VR128,
6820                             X86MemOperand MemOp = i128mem> {
6821  let AsmString = OpcodeStr##
6822                  !if(Is2Addr, "\t{$src2, $dst|$dst, $src2}",
6823                               "\t{$src2, $src1, $dst|$dst, $src1, $src2}") in {
6824    def rr : AES8I<opc, MRMSrcReg, (outs RC:$dst),
6825                   (ins RC:$src1, RC:$src2), "",
6826                   [(set RC:$dst, (IntId RC:$src1, RC:$src2))]>,
6827                   Sched<[WriteAESDecEnc]>;
6828    def rm : AES8I<opc, MRMSrcMem, (outs RC:$dst),
6829                   (ins RC:$src1, MemOp:$src2), "",
6830                   [(set RC:$dst, (IntId RC:$src1, (ld_frag addr:$src2)))]>,
6831                   Sched<[WriteAESDecEnc.Folded, ReadAfterLd]>;
6832  }
6833}
6834
6835// Perform One Round of an AES Encryption/Decryption Flow
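// e.g. "aesenc %xmm1, %xmm0" applies ShiftRows, SubBytes and MixColumns to
// the state in $src1 and XORs in the round key from $src2.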
6836let Predicates = [HasAVX, NoVLX_Or_NoVAES, HasAES] in {
6837  defm VAESENC          : AESI_binop_rm_int<0xDC, "vaesenc",
6838                         int_x86_aesni_aesenc, loadv2i64>, VEX_4V, VEX_WIG;
6839  defm VAESENCLAST      : AESI_binop_rm_int<0xDD, "vaesenclast",
6840                         int_x86_aesni_aesenclast, loadv2i64>, VEX_4V, VEX_WIG;
6841  defm VAESDEC          : AESI_binop_rm_int<0xDE, "vaesdec",
6842                         int_x86_aesni_aesdec, loadv2i64>, VEX_4V, VEX_WIG;
6843  defm VAESDECLAST      : AESI_binop_rm_int<0xDF, "vaesdeclast",
6844                         int_x86_aesni_aesdeclast, loadv2i64>, VEX_4V, VEX_WIG;
6845}
6846
6847let Predicates = [NoVLX, HasVAES] in {
6848  defm VAESENCY         : AESI_binop_rm_int<0xDC, "vaesenc",
6849                         int_x86_aesni_aesenc_256, loadv4i64, 0, VR256,
6850                         i256mem>, VEX_4V, VEX_L, VEX_WIG;
6851  defm VAESENCLASTY     : AESI_binop_rm_int<0xDD, "vaesenclast",
6852                         int_x86_aesni_aesenclast_256, loadv4i64, 0, VR256,
6853                         i256mem>, VEX_4V, VEX_L, VEX_WIG;
6854  defm VAESDECY         : AESI_binop_rm_int<0xDE, "vaesdec",
6855                         int_x86_aesni_aesdec_256, loadv4i64, 0, VR256,
6856                         i256mem>, VEX_4V, VEX_L, VEX_WIG;
6857  defm VAESDECLASTY     : AESI_binop_rm_int<0xDF, "vaesdeclast",
6858                         int_x86_aesni_aesdeclast_256, loadv4i64, 0, VR256,
6859                         i256mem>, VEX_4V, VEX_L, VEX_WIG;
6860}
6861
6862let Constraints = "$src1 = $dst" in {
6863  defm AESENC          : AESI_binop_rm_int<0xDC, "aesenc",
6864                         int_x86_aesni_aesenc, memopv2i64, 1>;
6865  defm AESENCLAST      : AESI_binop_rm_int<0xDD, "aesenclast",
6866                         int_x86_aesni_aesenclast, memopv2i64, 1>;
6867  defm AESDEC          : AESI_binop_rm_int<0xDE, "aesdec",
6868                         int_x86_aesni_aesdec, memopv2i64, 1>;
6869  defm AESDECLAST      : AESI_binop_rm_int<0xDF, "aesdeclast",
6870                         int_x86_aesni_aesdeclast, memopv2i64, 1>;
6871}
6872
6873// Perform the AES InvMixColumn Transformation
6874let Predicates = [HasAVX, HasAES] in {
6875  def VAESIMCrr : AES8I<0xDB, MRMSrcReg, (outs VR128:$dst),
6876      (ins VR128:$src1),
6877      "vaesimc\t{$src1, $dst|$dst, $src1}",
6878      [(set VR128:$dst,
6879        (int_x86_aesni_aesimc VR128:$src1))]>, Sched<[WriteAESIMC]>,
6880      VEX, VEX_WIG;
6881  def VAESIMCrm : AES8I<0xDB, MRMSrcMem, (outs VR128:$dst),
6882      (ins i128mem:$src1),
6883      "vaesimc\t{$src1, $dst|$dst, $src1}",
6884      [(set VR128:$dst, (int_x86_aesni_aesimc (loadv2i64 addr:$src1)))]>,
6885      Sched<[WriteAESIMC.Folded]>, VEX, VEX_WIG;
6886}
6887def AESIMCrr : AES8I<0xDB, MRMSrcReg, (outs VR128:$dst),
6888  (ins VR128:$src1),
6889  "aesimc\t{$src1, $dst|$dst, $src1}",
6890  [(set VR128:$dst,
6891    (int_x86_aesni_aesimc VR128:$src1))]>, Sched<[WriteAESIMC]>;
6892def AESIMCrm : AES8I<0xDB, MRMSrcMem, (outs VR128:$dst),
6893  (ins i128mem:$src1),
6894  "aesimc\t{$src1, $dst|$dst, $src1}",
6895  [(set VR128:$dst, (int_x86_aesni_aesimc (memopv2i64 addr:$src1)))]>,
6896  Sched<[WriteAESIMC.Folded]>;
6897
6898// AES Round Key Generation Assist
6899let Predicates = [HasAVX, HasAES] in {
6900  def VAESKEYGENASSIST128rr : AESAI<0xDF, MRMSrcReg, (outs VR128:$dst),
6901      (ins VR128:$src1, u8imm:$src2),
6902      "vaeskeygenassist\t{$src2, $src1, $dst|$dst, $src1, $src2}",
6903      [(set VR128:$dst,
6904        (int_x86_aesni_aeskeygenassist VR128:$src1, imm:$src2))]>,
6905      Sched<[WriteAESKeyGen]>, VEX, VEX_WIG;
6906  def VAESKEYGENASSIST128rm : AESAI<0xDF, MRMSrcMem, (outs VR128:$dst),
6907      (ins i128mem:$src1, u8imm:$src2),
6908      "vaeskeygenassist\t{$src2, $src1, $dst|$dst, $src1, $src2}",
6909      [(set VR128:$dst,
6910        (int_x86_aesni_aeskeygenassist (loadv2i64 addr:$src1), imm:$src2))]>,
6911      Sched<[WriteAESKeyGen.Folded]>, VEX, VEX_WIG;
6912}
6913def AESKEYGENASSIST128rr : AESAI<0xDF, MRMSrcReg, (outs VR128:$dst),
6914  (ins VR128:$src1, u8imm:$src2),
6915  "aeskeygenassist\t{$src2, $src1, $dst|$dst, $src1, $src2}",
6916  [(set VR128:$dst,
6917    (int_x86_aesni_aeskeygenassist VR128:$src1, imm:$src2))]>,
6918  Sched<[WriteAESKeyGen]>;
6919def AESKEYGENASSIST128rm : AESAI<0xDF, MRMSrcMem, (outs VR128:$dst),
6920  (ins i128mem:$src1, u8imm:$src2),
6921  "aeskeygenassist\t{$src2, $src1, $dst|$dst, $src1, $src2}",
6922  [(set VR128:$dst,
6923    (int_x86_aesni_aeskeygenassist (memopv2i64 addr:$src1), imm:$src2))]>,
6924  Sched<[WriteAESKeyGen.Folded]>;
6925
6926//===----------------------------------------------------------------------===//
6927// PCLMUL Instructions
6928//===----------------------------------------------------------------------===//
6929
6930// Immediate transform to help with commuting.
6931def PCLMULCommuteImm : SDNodeXForm<imm, [{
6932  uint8_t Imm = N->getZExtValue();
6933  return getI8Imm((uint8_t)((Imm >> 4) | (Imm << 4)), SDLoc(N));
6934}]>;
6935
// SSE carry-less multiplication instructions
6937let Predicates = [NoAVX, HasPCLMUL] in {
6938  let Constraints = "$src1 = $dst" in {
6939    let isCommutable = 1 in
6940    def PCLMULQDQrr : PCLMULIi8<0x44, MRMSrcReg, (outs VR128:$dst),
6941              (ins VR128:$src1, VR128:$src2, u8imm:$src3),
6942              "pclmulqdq\t{$src3, $src2, $dst|$dst, $src2, $src3}",
6943              [(set VR128:$dst,
6944                (int_x86_pclmulqdq VR128:$src1, VR128:$src2, imm:$src3))]>,
6945                Sched<[WriteCLMul]>;
6946
6947    def PCLMULQDQrm : PCLMULIi8<0x44, MRMSrcMem, (outs VR128:$dst),
6948              (ins VR128:$src1, i128mem:$src2, u8imm:$src3),
6949              "pclmulqdq\t{$src3, $src2, $dst|$dst, $src2, $src3}",
6950              [(set VR128:$dst,
6951                 (int_x86_pclmulqdq VR128:$src1, (memopv2i64 addr:$src2),
6952                  imm:$src3))]>,
6953              Sched<[WriteCLMul.Folded, ReadAfterLd]>;
6954  } // Constraints = "$src1 = $dst"
6955
6956  def : Pat<(int_x86_pclmulqdq (memopv2i64 addr:$src2), VR128:$src1,
6957                                (i8 imm:$src3)),
6958            (PCLMULQDQrm VR128:$src1, addr:$src2,
6959                          (PCLMULCommuteImm imm:$src3))>;
6960} // Predicates = [NoAVX, HasPCLMUL]
6961
6962// SSE aliases
6963foreach HI = ["hq","lq"] in
6964foreach LO = ["hq","lq"] in {
6965  def : InstAlias<"pclmul" # HI # LO # "dq\t{$src, $dst|$dst, $src}",
6966                  (PCLMULQDQrr VR128:$dst, VR128:$src,
6967                   !add(!shl(!eq(LO,"hq"),4),!eq(HI,"hq"))), 0>;
6968  def : InstAlias<"pclmul" # HI # LO # "dq\t{$src, $dst|$dst, $src}",
6969                  (PCLMULQDQrm VR128:$dst, i128mem:$src,
6970                   !add(!shl(!eq(LO,"hq"),4),!eq(HI,"hq"))), 0>;
6971}
6972
// AVX carry-less multiplication instructions
6974multiclass vpclmulqdq<RegisterClass RC, X86MemOperand MemOp,
6975                      PatFrag LdFrag, Intrinsic IntId> {
6976  let isCommutable = 1 in
6977  def rr : PCLMULIi8<0x44, MRMSrcReg, (outs RC:$dst),
6978            (ins RC:$src1, RC:$src2, u8imm:$src3),
6979            "vpclmulqdq\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
6980            [(set RC:$dst,
6981              (IntId RC:$src1, RC:$src2, imm:$src3))]>,
6982            Sched<[WriteCLMul]>;
6983
6984  def rm : PCLMULIi8<0x44, MRMSrcMem, (outs RC:$dst),
6985            (ins RC:$src1, MemOp:$src2, u8imm:$src3),
6986            "vpclmulqdq\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
6987            [(set RC:$dst,
6988               (IntId RC:$src1, (LdFrag addr:$src2), imm:$src3))]>,
6989            Sched<[WriteCLMul.Folded, ReadAfterLd]>;
6990
6991  // We can commute a load in the first operand by swapping the sources and
6992  // rotating the immediate.
6993  def : Pat<(IntId (LdFrag addr:$src2), RC:$src1, (i8 imm:$src3)),
6994            (!cast<Instruction>(NAME#"rm") RC:$src1, addr:$src2,
6995                                           (PCLMULCommuteImm imm:$src3))>;
6996}
6997
6998let Predicates = [HasAVX, NoVLX_Or_NoVPCLMULQDQ, HasPCLMUL] in
6999defm VPCLMULQDQ : vpclmulqdq<VR128, i128mem, loadv2i64,
7000                             int_x86_pclmulqdq>, VEX_4V, VEX_WIG;
7001
7002let Predicates = [NoVLX, HasVPCLMULQDQ] in
7003defm VPCLMULQDQY : vpclmulqdq<VR256, i256mem, loadv4i64,
7004                              int_x86_pclmulqdq_256>, VEX_4V, VEX_L, VEX_WIG;
7005
7006multiclass vpclmulqdq_aliases_impl<string InstStr, RegisterClass RC,
7007                                   X86MemOperand MemOp, string Hi, string Lo> {
7008  def : InstAlias<"vpclmul"##Hi##Lo##"dq\t{$src2, $src1, $dst|$dst, $src1, $src2}",
7009                  (!cast<Instruction>(InstStr # "rr") RC:$dst, RC:$src1, RC:$src2,
7010                        !add(!shl(!eq(Lo,"hq"),4),!eq(Hi,"hq"))), 0>;
7011  def : InstAlias<"vpclmul"##Hi##Lo##"dq\t{$src2, $src1, $dst|$dst, $src1, $src2}",
7012                  (!cast<Instruction>(InstStr # "rm") RC:$dst, RC:$src1, MemOp:$src2,
7013                        !add(!shl(!eq(Lo,"hq"),4),!eq(Hi,"hq"))), 0>;
7014}
7015
7016multiclass vpclmulqdq_aliases<string InstStr, RegisterClass RC,
7017                              X86MemOperand MemOp> {
7018  defm : vpclmulqdq_aliases_impl<InstStr, RC, MemOp, "hq", "hq">;
7019  defm : vpclmulqdq_aliases_impl<InstStr, RC, MemOp, "hq", "lq">;
7020  defm : vpclmulqdq_aliases_impl<InstStr, RC, MemOp, "lq", "hq">;
7021  defm : vpclmulqdq_aliases_impl<InstStr, RC, MemOp, "lq", "lq">;
7022}
7023
7024// AVX aliases
7025defm : vpclmulqdq_aliases<"VPCLMULQDQ", VR128, i128mem>;
7026defm : vpclmulqdq_aliases<"VPCLMULQDQY", VR256, i256mem>;
7027
7028//===----------------------------------------------------------------------===//
7029// SSE4A Instructions
7030//===----------------------------------------------------------------------===//
7031
7032let Predicates = [HasSSE4A] in {
7033
7034let ExeDomain = SSEPackedInt in {
7035let Constraints = "$src = $dst" in {
7036def EXTRQI : Ii8<0x78, MRMXr, (outs VR128:$dst),
7037                 (ins VR128:$src, u8imm:$len, u8imm:$idx),
7038                 "extrq\t{$idx, $len, $src|$src, $len, $idx}",
7039                 [(set VR128:$dst, (X86extrqi VR128:$src, imm:$len,
7040                                    imm:$idx))]>,
7041                 PD, Sched<[SchedWriteVecALU.XMM]>;
7042def EXTRQ  : I<0x79, MRMSrcReg, (outs VR128:$dst),
7043              (ins VR128:$src, VR128:$mask),
7044              "extrq\t{$mask, $src|$src, $mask}",
7045              [(set VR128:$dst, (int_x86_sse4a_extrq VR128:$src,
7046                                 VR128:$mask))]>,
7047              PD, Sched<[SchedWriteVecALU.XMM]>;
7048
7049def INSERTQI : Ii8<0x78, MRMSrcReg, (outs VR128:$dst),
7050                   (ins VR128:$src, VR128:$src2, u8imm:$len, u8imm:$idx),
7051                   "insertq\t{$idx, $len, $src2, $src|$src, $src2, $len, $idx}",
7052                   [(set VR128:$dst, (X86insertqi VR128:$src, VR128:$src2,
7053                                      imm:$len, imm:$idx))]>,
7054                   XD, Sched<[SchedWriteVecALU.XMM]>;
7055def INSERTQ  : I<0x79, MRMSrcReg, (outs VR128:$dst),
7056                 (ins VR128:$src, VR128:$mask),
7057                 "insertq\t{$mask, $src|$src, $mask}",
7058                 [(set VR128:$dst, (int_x86_sse4a_insertq VR128:$src,
7059                                    VR128:$mask))]>,
7060                 XD, Sched<[SchedWriteVecALU.XMM]>;
7061}
7062} // ExeDomain = SSEPackedInt
7063
7064// Non-temporal (unaligned) scalar stores.
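// Unlike movntps/movntpd, movntss/movntsd have no alignment requirement,
// which is why they can implement a scalar nontemporalstore directly.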
7065let AddedComplexity = 400 in { // Prefer non-temporal versions
7066let hasSideEffects = 0, mayStore = 1, SchedRW = [SchedWriteFMoveLSNT.Scl.MR] in {
7067def MOVNTSS : I<0x2B, MRMDestMem, (outs), (ins f32mem:$dst, VR128:$src),
7068                "movntss\t{$src, $dst|$dst, $src}", []>, XS;
7069
7070def MOVNTSD : I<0x2B, MRMDestMem, (outs), (ins f64mem:$dst, VR128:$src),
7071                "movntsd\t{$src, $dst|$dst, $src}", []>, XD;
7072} // SchedRW
7073
7074def : Pat<(nontemporalstore FR32:$src, addr:$dst),
7075          (MOVNTSS addr:$dst, (v4f32 (COPY_TO_REGCLASS FR32:$src, VR128)))>;
7076
7077def : Pat<(nontemporalstore FR64:$src, addr:$dst),
7078          (MOVNTSD addr:$dst, (v2f64 (COPY_TO_REGCLASS FR64:$src, VR128)))>;
7079
7080} // AddedComplexity
7081} // HasSSE4A
7082
7083//===----------------------------------------------------------------------===//
7084// AVX Instructions
7085//===----------------------------------------------------------------------===//
7086
7087//===----------------------------------------------------------------------===//
7088// VBROADCAST - Load from memory and broadcast to all elements of the
7089//              destination operand
7090//
7091class avx_broadcast_rm<bits<8> opc, string OpcodeStr, RegisterClass RC,
7092                           X86MemOperand x86memop, ValueType VT,
7093                           PatFrag ld_frag, SchedWrite Sched> :
7094  AVX8I<opc, MRMSrcMem, (outs RC:$dst), (ins x86memop:$src),
7095        !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
7096        [(set RC:$dst, (VT (X86VBroadcast (ld_frag addr:$src))))]>,
7097        Sched<[Sched]>, VEX;
7098
7099// AVX2 adds register forms
7100class avx2_broadcast_rr<bits<8> opc, string OpcodeStr, RegisterClass RC,
7101                        ValueType ResVT, ValueType OpVT, SchedWrite Sched> :
7102  AVX28I<opc, MRMSrcReg, (outs RC:$dst), (ins VR128:$src),
7103         !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
7104         [(set RC:$dst, (ResVT (X86VBroadcast (OpVT VR128:$src))))]>,
7105         Sched<[Sched]>, VEX;
7106
7107let ExeDomain = SSEPackedSingle, Predicates = [HasAVX, NoVLX] in {
7108  def VBROADCASTSSrm  : avx_broadcast_rm<0x18, "vbroadcastss", VR128,
7109                                         f32mem, v4f32, loadf32,
7110                                         SchedWriteFShuffle.XMM.Folded>;
7111  def VBROADCASTSSYrm : avx_broadcast_rm<0x18, "vbroadcastss", VR256,
7112                                         f32mem, v8f32, loadf32,
7113                                         SchedWriteFShuffle.XMM.Folded>, VEX_L;
7114}
7115let ExeDomain = SSEPackedDouble, Predicates = [HasAVX, NoVLX] in
7116def VBROADCASTSDYrm  : avx_broadcast_rm<0x19, "vbroadcastsd", VR256, f64mem,
7117                                        v4f64, loadf64,
7118                                        SchedWriteFShuffle.XMM.Folded>, VEX_L;
7119
7120let ExeDomain = SSEPackedSingle, Predicates = [HasAVX2, NoVLX] in {
7121  def VBROADCASTSSrr  : avx2_broadcast_rr<0x18, "vbroadcastss", VR128,
7122                                          v4f32, v4f32, SchedWriteFShuffle.XMM>;
7123  def VBROADCASTSSYrr : avx2_broadcast_rr<0x18, "vbroadcastss", VR256,
7124                                          v8f32, v4f32, WriteFShuffle256>, VEX_L;
7125}
7126let ExeDomain = SSEPackedDouble, Predicates = [HasAVX2, NoVLX] in
7127def VBROADCASTSDYrr  : avx2_broadcast_rr<0x19, "vbroadcastsd", VR256,
7128                                         v4f64, v2f64, WriteFShuffle256>, VEX_L;
7129
7130let Predicates = [HasAVX, NoVLX] in {
7131  def : Pat<(v4f32 (X86VBroadcast (v4f32 (scalar_to_vector (loadf32 addr:$src))))),
7132            (VBROADCASTSSrm addr:$src)>;
7133  def : Pat<(v8f32 (X86VBroadcast (v4f32 (scalar_to_vector (loadf32 addr:$src))))),
7134            (VBROADCASTSSYrm addr:$src)>;
7135  def : Pat<(v4f64 (X86VBroadcast (v2f64 (scalar_to_vector (loadf64 addr:$src))))),
7136            (VBROADCASTSDYrm addr:$src)>;
7137}
7138
7139//===----------------------------------------------------------------------===//
7140// VBROADCAST*128 - Load from memory and broadcast 128-bit vector to both
7141//                  halves of a 256-bit vector.
7142//
7143let mayLoad = 1, hasSideEffects = 0, Predicates = [HasAVX2] in
7144def VBROADCASTI128 : AVX8I<0x5A, MRMSrcMem, (outs VR256:$dst),
7145                           (ins i128mem:$src),
7146                           "vbroadcasti128\t{$src, $dst|$dst, $src}", []>,
7147                           Sched<[WriteShuffleLd]>, VEX, VEX_L;
7148
7149let mayLoad = 1, hasSideEffects = 0, Predicates = [HasAVX],
7150    ExeDomain = SSEPackedSingle in
7151def VBROADCASTF128 : AVX8I<0x1A, MRMSrcMem, (outs VR256:$dst),
7152                           (ins f128mem:$src),
7153                           "vbroadcastf128\t{$src, $dst|$dst, $src}", []>,
7154                           Sched<[SchedWriteFShuffle.XMM.Folded]>, VEX, VEX_L;
7155
7156let Predicates = [HasAVX2, NoVLX] in {
7157def : Pat<(v4i64 (X86SubVBroadcast (loadv2i64 addr:$src))),
7158          (VBROADCASTI128 addr:$src)>;
7159def : Pat<(v8i32 (X86SubVBroadcast (bc_v4i32 (loadv2i64 addr:$src)))),
7160          (VBROADCASTI128 addr:$src)>;
7161def : Pat<(v16i16 (X86SubVBroadcast (bc_v8i16 (loadv2i64 addr:$src)))),
7162          (VBROADCASTI128 addr:$src)>;
7163def : Pat<(v32i8 (X86SubVBroadcast (bc_v16i8 (loadv2i64 addr:$src)))),
7164          (VBROADCASTI128 addr:$src)>;
7165}
7166
7167let Predicates = [HasAVX, NoVLX] in {
7168def : Pat<(v4f64 (X86SubVBroadcast (loadv2f64 addr:$src))),
7169          (VBROADCASTF128 addr:$src)>;
7170def : Pat<(v8f32 (X86SubVBroadcast (loadv4f32 addr:$src))),
7171          (VBROADCASTF128 addr:$src)>;
7172}
7173
7174let Predicates = [HasAVX1Only] in {
7175def : Pat<(v4i64 (X86SubVBroadcast (loadv2i64 addr:$src))),
7176          (VBROADCASTF128 addr:$src)>;
7177def : Pat<(v8i32 (X86SubVBroadcast (bc_v4i32 (loadv2i64 addr:$src)))),
7178          (VBROADCASTF128 addr:$src)>;
7179def : Pat<(v16i16 (X86SubVBroadcast (bc_v8i16 (loadv2i64 addr:$src)))),
7180          (VBROADCASTF128 addr:$src)>;
7181def : Pat<(v32i8 (X86SubVBroadcast (bc_v16i8 (loadv2i64 addr:$src)))),
7182          (VBROADCASTF128 addr:$src)>;
7183}
7184
7185//===----------------------------------------------------------------------===//
7186// VINSERTF128 - Insert packed floating-point values
7187//
7188let hasSideEffects = 0, ExeDomain = SSEPackedSingle in {
7189def VINSERTF128rr : AVXAIi8<0x18, MRMSrcReg, (outs VR256:$dst),
7190          (ins VR256:$src1, VR128:$src2, u8imm:$src3),
7191          "vinsertf128\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
7192          []>, Sched<[WriteFShuffle256]>, VEX_4V, VEX_L;
7193let mayLoad = 1 in
7194def VINSERTF128rm : AVXAIi8<0x18, MRMSrcMem, (outs VR256:$dst),
7195          (ins VR256:$src1, f128mem:$src2, u8imm:$src3),
7196          "vinsertf128\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
7197          []>, Sched<[WriteFShuffle256Ld, ReadAfterLd]>, VEX_4V, VEX_L;
7198}
7199
// To create a 256-bit all-ones value, produce VCMPTRUEPS with a zeroed YMM
// register as both inputs.
7202// FIXME: Avoid producing vxorps to clear the fake inputs.
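// Comparison predicate 0xf is TRUE_UQ, which returns all-ones in every lane
// regardless of the (zeroed) inputs.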
7203let Predicates = [HasAVX1Only] in {
7204def : Pat<(v8i32 immAllOnesV), (VCMPPSYrri (AVX_SET0), (AVX_SET0), 0xf)>;
7205}
7206
7207multiclass vinsert_lowering<string InstrStr, ValueType From, ValueType To,
7208                            PatFrag memop_frag> {
7209  def : Pat<(vinsert128_insert:$ins (To VR256:$src1), (From VR128:$src2),
7210                                   (iPTR imm)),
7211            (!cast<Instruction>(InstrStr#rr) VR256:$src1, VR128:$src2,
7212                                       (INSERT_get_vinsert128_imm VR256:$ins))>;
7213  def : Pat<(vinsert128_insert:$ins (To VR256:$src1),
7214                                    (From (bitconvert (memop_frag addr:$src2))),
7215                                    (iPTR imm)),
7216            (!cast<Instruction>(InstrStr#rm) VR256:$src1, addr:$src2,
7217                                       (INSERT_get_vinsert128_imm VR256:$ins))>;
7218}
7219
7220let Predicates = [HasAVX, NoVLX] in {
7221  defm : vinsert_lowering<"VINSERTF128", v4f32, v8f32, loadv4f32>;
7222  defm : vinsert_lowering<"VINSERTF128", v2f64, v4f64, loadv2f64>;
7223}
7224
7225let Predicates = [HasAVX1Only] in {
7226  defm : vinsert_lowering<"VINSERTF128", v2i64, v4i64,  loadv2i64>;
7227  defm : vinsert_lowering<"VINSERTF128", v4i32, v8i32,  loadv2i64>;
7228  defm : vinsert_lowering<"VINSERTF128", v8i16, v16i16, loadv2i64>;
7229  defm : vinsert_lowering<"VINSERTF128", v16i8, v32i8,  loadv2i64>;
7230}
7231
7232//===----------------------------------------------------------------------===//
7233// VEXTRACTF128 - Extract packed floating-point values
7234//
7235let hasSideEffects = 0, ExeDomain = SSEPackedSingle in {
7236def VEXTRACTF128rr : AVXAIi8<0x19, MRMDestReg, (outs VR128:$dst),
7237          (ins VR256:$src1, u8imm:$src2),
7238          "vextractf128\t{$src2, $src1, $dst|$dst, $src1, $src2}",
7239          []>, Sched<[WriteFShuffle256]>, VEX, VEX_L;
7240let mayStore = 1 in
7241def VEXTRACTF128mr : AVXAIi8<0x19, MRMDestMem, (outs),
7242          (ins f128mem:$dst, VR256:$src1, u8imm:$src2),
7243          "vextractf128\t{$src2, $src1, $dst|$dst, $src1, $src2}",
7244          []>, Sched<[WriteFStoreX]>, VEX, VEX_L;
7245}
7246
7247multiclass vextract_lowering<string InstrStr, ValueType From, ValueType To> {
7248  def : Pat<(vextract128_extract:$ext VR256:$src1, (iPTR imm)),
7249            (To (!cast<Instruction>(InstrStr#rr)
7250                                    (From VR256:$src1),
7251                                    (EXTRACT_get_vextract128_imm VR128:$ext)))>;
7252  def : Pat<(store (To (vextract128_extract:$ext (From VR256:$src1),
7253                                                 (iPTR imm))), addr:$dst),
7254            (!cast<Instruction>(InstrStr#mr) addr:$dst, VR256:$src1,
7255             (EXTRACT_get_vextract128_imm VR128:$ext))>;
7256}
7257
7258// AVX1 patterns
7259let Predicates = [HasAVX, NoVLX] in {
7260  defm : vextract_lowering<"VEXTRACTF128", v8f32, v4f32>;
7261  defm : vextract_lowering<"VEXTRACTF128", v4f64, v2f64>;
7262}
7263
7264let Predicates = [HasAVX1Only] in {
7265  defm : vextract_lowering<"VEXTRACTF128", v4i64,  v2i64>;
7266  defm : vextract_lowering<"VEXTRACTF128", v8i32,  v4i32>;
7267  defm : vextract_lowering<"VEXTRACTF128", v16i16, v8i16>;
7268  defm : vextract_lowering<"VEXTRACTF128", v32i8,  v16i8>;
7269}
7270
7271//===----------------------------------------------------------------------===//
7272// VMASKMOV - Conditional SIMD Packed Loads and Stores
7273//
7274multiclass avx_movmask_rm<bits<8> opc_rm, bits<8> opc_mr, string OpcodeStr,
7275                          Intrinsic IntLd, Intrinsic IntLd256,
7276                          Intrinsic IntSt, Intrinsic IntSt256> {
7277  def rm  : AVX8I<opc_rm, MRMSrcMem, (outs VR128:$dst),
7278             (ins VR128:$src1, f128mem:$src2),
7279             !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
7280             [(set VR128:$dst, (IntLd addr:$src2, VR128:$src1))]>,
7281             VEX_4V, Sched<[WriteFMaskedLoad]>;
7282  def Yrm : AVX8I<opc_rm, MRMSrcMem, (outs VR256:$dst),
7283             (ins VR256:$src1, f256mem:$src2),
7284             !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
7285             [(set VR256:$dst, (IntLd256 addr:$src2, VR256:$src1))]>,
7286             VEX_4V, VEX_L, Sched<[WriteFMaskedLoadY]>;
7287  def mr  : AVX8I<opc_mr, MRMDestMem, (outs),
7288             (ins f128mem:$dst, VR128:$src1, VR128:$src2),
7289             !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
7290             [(IntSt addr:$dst, VR128:$src1, VR128:$src2)]>,
7291             VEX_4V, Sched<[WriteFMaskedStore]>;
7292  def Ymr : AVX8I<opc_mr, MRMDestMem, (outs),
7293             (ins f256mem:$dst, VR256:$src1, VR256:$src2),
7294             !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
7295             [(IntSt256 addr:$dst, VR256:$src1, VR256:$src2)]>,
7296             VEX_4V, VEX_L, Sched<[WriteFMaskedStoreY]>;
7297}
7298
7299let ExeDomain = SSEPackedSingle in
7300defm VMASKMOVPS : avx_movmask_rm<0x2C, 0x2E, "vmaskmovps",
7301                                 int_x86_avx_maskload_ps,
7302                                 int_x86_avx_maskload_ps_256,
7303                                 int_x86_avx_maskstore_ps,
7304                                 int_x86_avx_maskstore_ps_256>;
7305let ExeDomain = SSEPackedDouble in
7306defm VMASKMOVPD : avx_movmask_rm<0x2D, 0x2F, "vmaskmovpd",
7307                                 int_x86_avx_maskload_pd,
7308                                 int_x86_avx_maskload_pd_256,
7309                                 int_x86_avx_maskstore_pd,
7310                                 int_x86_avx_maskstore_pd_256>;
7311
7312//===----------------------------------------------------------------------===//
7313// VPERMIL - Permute Single and Double Floating-Point Values
7314//
7315
7316multiclass avx_permil<bits<8> opc_rm, bits<8> opc_rmi, string OpcodeStr,
7317                      RegisterClass RC, X86MemOperand x86memop_f,
7318                      X86MemOperand x86memop_i, PatFrag i_frag,
7319                      ValueType f_vt, ValueType i_vt,
7320                      X86FoldableSchedWrite sched,
7321                      X86FoldableSchedWrite varsched> {
7322  let Predicates = [HasAVX, NoVLX] in {
7323    def rr  : AVX8I<opc_rm, MRMSrcReg, (outs RC:$dst),
7324               (ins RC:$src1, RC:$src2),
7325               !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
7326               [(set RC:$dst, (f_vt (X86VPermilpv RC:$src1, (i_vt RC:$src2))))]>, VEX_4V,
7327               Sched<[varsched]>;
7328    def rm  : AVX8I<opc_rm, MRMSrcMem, (outs RC:$dst),
7329               (ins RC:$src1, x86memop_i:$src2),
7330               !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
7331               [(set RC:$dst, (f_vt (X86VPermilpv RC:$src1,
7332                              (i_vt (bitconvert (i_frag addr:$src2))))))]>, VEX_4V,
7333               Sched<[varsched.Folded, ReadAfterLd]>;
7334
7335    def ri  : AVXAIi8<opc_rmi, MRMSrcReg, (outs RC:$dst),
7336             (ins RC:$src1, u8imm:$src2),
7337             !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
7338             [(set RC:$dst, (f_vt (X86VPermilpi RC:$src1, (i8 imm:$src2))))]>, VEX,
7339             Sched<[sched]>;
7340    def mi  : AVXAIi8<opc_rmi, MRMSrcMem, (outs RC:$dst),
7341             (ins x86memop_f:$src1, u8imm:$src2),
7342             !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
7343             [(set RC:$dst,
7344               (f_vt (X86VPermilpi (load addr:$src1), (i8 imm:$src2))))]>, VEX,
7345             Sched<[sched.Folded]>;
7346  }// Predicates = [HasAVX, NoVLX]
7347}
7348
7349let ExeDomain = SSEPackedSingle in {
7350  defm VPERMILPS  : avx_permil<0x0C, 0x04, "vpermilps", VR128, f128mem, i128mem,
7351                               loadv2i64, v4f32, v4i32, SchedWriteFShuffle.XMM,
7352                               SchedWriteFVarShuffle.XMM>;
7353  defm VPERMILPSY : avx_permil<0x0C, 0x04, "vpermilps", VR256, f256mem, i256mem,
7354                               loadv4i64, v8f32, v8i32, SchedWriteFShuffle.YMM,
7355                               SchedWriteFVarShuffle.YMM>, VEX_L;
7356}
7357let ExeDomain = SSEPackedDouble in {
7358  defm VPERMILPD  : avx_permil<0x0D, 0x05, "vpermilpd", VR128, f128mem, i128mem,
7359                               loadv2i64, v2f64, v2i64, SchedWriteFShuffle.XMM,
7360                               SchedWriteFVarShuffle.XMM>;
7361  defm VPERMILPDY : avx_permil<0x0D, 0x05, "vpermilpd", VR256, f256mem, i256mem,
7362                               loadv4i64, v4f64, v4i64, SchedWriteFShuffle.YMM,
7363                               SchedWriteFVarShuffle.YMM>, VEX_L;
7364}
7365
7366//===----------------------------------------------------------------------===//
7367// VPERM2F128 - Permute Floating-Point Values in 128-bit chunks
7368//
7369
7370let ExeDomain = SSEPackedSingle in {
7371let isCommutable = 1 in
7372def VPERM2F128rr : AVXAIi8<0x06, MRMSrcReg, (outs VR256:$dst),
7373          (ins VR256:$src1, VR256:$src2, u8imm:$src3),
7374          "vperm2f128\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
7375          [(set VR256:$dst, (v4f64 (X86VPerm2x128 VR256:$src1, VR256:$src2,
7376                              (i8 imm:$src3))))]>, VEX_4V, VEX_L,
7377          Sched<[WriteFShuffle256]>;
7378def VPERM2F128rm : AVXAIi8<0x06, MRMSrcMem, (outs VR256:$dst),
7379          (ins VR256:$src1, f256mem:$src2, u8imm:$src3),
7380          "vperm2f128\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
7381          [(set VR256:$dst, (X86VPerm2x128 VR256:$src1, (loadv4f64 addr:$src2),
7382                             (i8 imm:$src3)))]>, VEX_4V, VEX_L,
7383          Sched<[WriteFShuffle256Ld, ReadAfterLd]>;
7384}
7385
7386// Immediate transform to help with commuting.
7387def Perm2XCommuteImm : SDNodeXForm<imm, [{
7388  return getI8Imm(N->getZExtValue() ^ 0x22, SDLoc(N));
7389}]>;
7390
7391let Predicates = [HasAVX] in {
7392// Pattern with load in other operand.
7393def : Pat<(v4f64 (X86VPerm2x128 (loadv4f64 addr:$src2),
7394                                VR256:$src1, (i8 imm:$imm))),
7395          (VPERM2F128rm VR256:$src1, addr:$src2, (Perm2XCommuteImm imm:$imm))>;
7396}
7397
7398let Predicates = [HasAVX1Only] in {
7399def : Pat<(v4i64 (X86VPerm2x128 VR256:$src1, VR256:$src2, (i8 imm:$imm))),
7400          (VPERM2F128rr VR256:$src1, VR256:$src2, imm:$imm)>;
7401def : Pat<(v4i64 (X86VPerm2x128 VR256:$src1,
7402                  (loadv4i64 addr:$src2), (i8 imm:$imm))),
7403          (VPERM2F128rm VR256:$src1, addr:$src2, imm:$imm)>;
7404// Pattern with load in other operand.
7405def : Pat<(v4i64 (X86VPerm2x128 (loadv4i64 addr:$src2),
7406                                VR256:$src1, (i8 imm:$imm))),
7407          (VPERM2F128rm VR256:$src1, addr:$src2, (Perm2XCommuteImm imm:$imm))>;
7408}
7409
7410//===----------------------------------------------------------------------===//
7411// VZERO - Zero YMM registers
// Note: These instructions do not affect YMM16-YMM31.
7413//
7414
7415let SchedRW = [WriteSystem] in {
7416let Defs = [YMM0, YMM1, YMM2, YMM3, YMM4, YMM5, YMM6, YMM7,
7417            YMM8, YMM9, YMM10, YMM11, YMM12, YMM13, YMM14, YMM15] in {
  // Zero all YMM registers
7419  def VZEROALL : I<0x77, RawFrm, (outs), (ins), "vzeroall",
7420                  [(int_x86_avx_vzeroall)]>, PS, VEX, VEX_L,
7421                  Requires<[HasAVX]>, VEX_WIG;
7422
  // Zero the upper 128 bits of the YMM registers
7424  def VZEROUPPER : I<0x77, RawFrm, (outs), (ins), "vzeroupper",
7425                     [(int_x86_avx_vzeroupper)]>, PS, VEX,
7426                     Requires<[HasAVX]>, VEX_WIG;
7427} // Defs
7428} // SchedRW
7429
7430//===----------------------------------------------------------------------===//
7431// Half precision conversion instructions
7432//
7433
7434multiclass f16c_ph2ps<RegisterClass RC, X86MemOperand x86memop,
7435                      X86FoldableSchedWrite sched> {
7436  def rr : I<0x13, MRMSrcReg, (outs RC:$dst), (ins VR128:$src),
7437             "vcvtph2ps\t{$src, $dst|$dst, $src}",
7438             [(set RC:$dst, (X86cvtph2ps VR128:$src))]>,
7439             T8PD, VEX, Sched<[sched]>;
7440  let hasSideEffects = 0, mayLoad = 1 in
7441  def rm : I<0x13, MRMSrcMem, (outs RC:$dst), (ins x86memop:$src),
7442             "vcvtph2ps\t{$src, $dst|$dst, $src}",
7443             [(set RC:$dst, (X86cvtph2ps (bc_v8i16
7444                                          (loadv2i64 addr:$src))))]>,
7445             T8PD, VEX, Sched<[sched.Folded]>;
7446}
7447
7448multiclass f16c_ps2ph<RegisterClass RC, X86MemOperand x86memop,
7449                      SchedWrite RR, SchedWrite MR> {
7450  def rr : Ii8<0x1D, MRMDestReg, (outs VR128:$dst),
7451               (ins RC:$src1, i32u8imm:$src2),
7452               "vcvtps2ph\t{$src2, $src1, $dst|$dst, $src1, $src2}",
7453               [(set VR128:$dst, (X86cvtps2ph RC:$src1, imm:$src2))]>,
7454               TAPD, VEX, Sched<[RR]>;
7455  let hasSideEffects = 0, mayStore = 1 in
7456  def mr : Ii8<0x1D, MRMDestMem, (outs),
7457               (ins x86memop:$dst, RC:$src1, i32u8imm:$src2),
7458               "vcvtps2ph\t{$src2, $src1, $dst|$dst, $src1, $src2}", []>,
7459               TAPD, VEX, Sched<[MR]>;
7460}
7461
7462let Predicates = [HasF16C, NoVLX] in {
7463  defm VCVTPH2PS  : f16c_ph2ps<VR128, f64mem, WriteCvtPH2PS>;
7464  defm VCVTPH2PSY : f16c_ph2ps<VR256, f128mem, WriteCvtPH2PSY>, VEX_L;
7465  defm VCVTPS2PH  : f16c_ps2ph<VR128, f64mem, WriteCvtPS2PH,
7466                               WriteCvtPS2PHSt>;
7467  defm VCVTPS2PHY : f16c_ps2ph<VR256, f128mem, WriteCvtPS2PHY,
7468                               WriteCvtPS2PHYSt>, VEX_L;
7469
7470  // Pattern match vcvtph2ps of a scalar i64 load.
7471  def : Pat<(v4f32 (X86cvtph2ps (v8i16 (vzmovl_v2i64 addr:$src)))),
7472            (VCVTPH2PSrm addr:$src)>;
7473  def : Pat<(v4f32 (X86cvtph2ps (v8i16 (vzload_v2i64 addr:$src)))),
7474            (VCVTPH2PSrm addr:$src)>;
7475  def : Pat<(v4f32 (X86cvtph2ps (v8i16 (bitconvert
7476              (v2i64 (scalar_to_vector (loadi64 addr:$src))))))),
7477            (VCVTPH2PSrm addr:$src)>;
7478
7479  def : Pat<(store (f64 (extractelt
7480                         (bc_v2f64 (v8i16 (X86cvtps2ph VR128:$src1, i32:$src2))),
7481                         (iPTR 0))), addr:$dst),
7482            (VCVTPS2PHmr addr:$dst, VR128:$src1, imm:$src2)>;
7483  def : Pat<(store (i64 (extractelt
7484                         (bc_v2i64 (v8i16 (X86cvtps2ph VR128:$src1, i32:$src2))),
7485                         (iPTR 0))), addr:$dst),
7486            (VCVTPS2PHmr addr:$dst, VR128:$src1, imm:$src2)>;
7487  def : Pat<(store (v8i16 (X86cvtps2ph VR256:$src1, i32:$src2)), addr:$dst),
7488            (VCVTPS2PHYmr addr:$dst, VR256:$src1, imm:$src2)>;
7489}
7490
7491// Patterns for  matching conversions from float to half-float and vice versa.
let Predicates = [HasF16C, NoVLX] in {
  // Use MXCSR.RC for rounding instead of explicitly specifying the default
  // rounding mode (Nearest-Even, encoded as 0). Both are equivalent in the
  // configurations we support (the default). However, falling back to MXCSR
  // is more consistent with other instructions, which are always controlled
  // by it. Using MXCSR.RC is encoded as 0b100, the immediate 4 in the
  // patterns below.
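  //
  // For reference, the vcvtps2ph immediate is laid out as follows: if
  // imm8[2] is set, the rounding mode comes from MXCSR.RC and imm8[1:0] is
  // ignored; if imm8[2] is clear, imm8[1:0] selects a static rounding mode
  // (00 = Nearest-Even, 01 = Down, 10 = Up, 11 = Truncate).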
  def : Pat<(fp_to_f16 FR32:$src),
            (i16 (EXTRACT_SUBREG (VMOVPDI2DIrr (v8i16 (VCVTPS2PHrr
              (v4f32 (COPY_TO_REGCLASS FR32:$src, VR128)), 4))), sub_16bit))>;

  def : Pat<(f16_to_fp GR16:$src),
            (f32 (COPY_TO_REGCLASS (v4f32 (VCVTPH2PSrr
              (v4i32 (COPY_TO_REGCLASS (MOVSX32rr16 GR16:$src), VR128)))), FR32))>;

  def : Pat<(f16_to_fp (i16 (fp_to_f16 FR32:$src))),
            (f32 (COPY_TO_REGCLASS (v4f32 (VCVTPH2PSrr
             (v8i16 (VCVTPS2PHrr (v4f32 (COPY_TO_REGCLASS FR32:$src, VR128)), 4)))), FR32))>;
}

//===----------------------------------------------------------------------===//
// AVX2 Instructions
//===----------------------------------------------------------------------===//

/// AVX2_blend_rmi - AVX2 blend with 8-bit immediate
multiclass AVX2_blend_rmi<bits<8> opc, string OpcodeStr, SDNode OpNode,
                          ValueType OpVT, X86FoldableSchedWrite sched,
                          RegisterClass RC, PatFrag memop_frag,
                          X86MemOperand x86memop, SDNodeXForm commuteXForm> {
  let isCommutable = 1 in
  def rri : AVX2AIi8<opc, MRMSrcReg, (outs RC:$dst),
        (ins RC:$src1, RC:$src2, u8imm:$src3),
        !strconcat(OpcodeStr,
            "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
        [(set RC:$dst, (OpVT (OpNode RC:$src1, RC:$src2, imm:$src3)))]>,
        Sched<[sched]>, VEX_4V;
  def rmi : AVX2AIi8<opc, MRMSrcMem, (outs RC:$dst),
        (ins RC:$src1, x86memop:$src2, u8imm:$src3),
        !strconcat(OpcodeStr,
            "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
        [(set RC:$dst,
          (OpVT (OpNode RC:$src1,
           (bitconvert (memop_frag addr:$src2)), imm:$src3)))]>,
        Sched<[sched.Folded, ReadAfterLd]>, VEX_4V;

  // Pattern to commute if load is in first source.
  def : Pat<(OpVT (OpNode (bitconvert (memop_frag addr:$src2)),
                          RC:$src1, imm:$src3)),
            (!cast<Instruction>(NAME#"rmi") RC:$src1, addr:$src2,
                                            (commuteXForm imm:$src3))>;
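
  // Note: swapping the sources of a blend inverts every per-element select
  // bit, so commuteXForm (BlendCommuteImm4/BlendCommuteImm8 in the
  // instantiations below) is expected to XOR the immediate with an all-ones
  // mask over the element count.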
}

defm VPBLENDD : AVX2_blend_rmi<0x02, "vpblendd", X86Blendi, v4i32,
                               SchedWriteBlend.XMM, VR128, loadv2i64, i128mem,
                               BlendCommuteImm4>;
defm VPBLENDDY : AVX2_blend_rmi<0x02, "vpblendd", X86Blendi, v8i32,
                                SchedWriteBlend.YMM, VR256, loadv4i64, i256mem,
                                BlendCommuteImm8>, VEX_L;

// For insertion into the zero index (low half) of a 256-bit vector, it is
// more efficient to generate a blend with immediate instead of an insert*128.
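// In vpblendd, immediate bit i selects dword i from the second source when
// set, so 0xf takes the low four dwords from the inserted 128-bit value and
// the rest from $src1. vblendps in the AVX1-only block below works the same
// way per single-precision element.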
let Predicates = [HasAVX2] in {
def : Pat<(insert_subvector (v8i32 VR256:$src1), (v4i32 VR128:$src2), (iPTR 0)),
          (VPBLENDDYrri VR256:$src1,
                        (INSERT_SUBREG (v8i32 (IMPLICIT_DEF)),
                                       VR128:$src2, sub_xmm), 0xf)>;
def : Pat<(insert_subvector (v4i64 VR256:$src1), (v2i64 VR128:$src2), (iPTR 0)),
          (VPBLENDDYrri VR256:$src1,
                        (INSERT_SUBREG (v8i32 (IMPLICIT_DEF)),
                                       VR128:$src2, sub_xmm), 0xf)>;
def : Pat<(insert_subvector (v16i16 VR256:$src1), (v8i16 VR128:$src2), (iPTR 0)),
          (VPBLENDDYrri VR256:$src1,
                        (INSERT_SUBREG (v8i32 (IMPLICIT_DEF)),
                                       VR128:$src2, sub_xmm), 0xf)>;
def : Pat<(insert_subvector (v32i8 VR256:$src1), (v16i8 VR128:$src2), (iPTR 0)),
          (VPBLENDDYrri VR256:$src1,
                        (INSERT_SUBREG (v8i32 (IMPLICIT_DEF)),
                                       VR128:$src2, sub_xmm), 0xf)>;
}

let Predicates = [HasAVX1Only] in {
def : Pat<(insert_subvector (v8i32 VR256:$src1), (v4i32 VR128:$src2), (iPTR 0)),
          (VBLENDPSYrri VR256:$src1,
                        (INSERT_SUBREG (v8i32 (IMPLICIT_DEF)),
                                       VR128:$src2, sub_xmm), 0xf)>;
def : Pat<(insert_subvector (v4i64 VR256:$src1), (v2i64 VR128:$src2), (iPTR 0)),
          (VBLENDPSYrri VR256:$src1,
                        (INSERT_SUBREG (v8i32 (IMPLICIT_DEF)),
                                       VR128:$src2, sub_xmm), 0xf)>;
def : Pat<(insert_subvector (v16i16 VR256:$src1), (v8i16 VR128:$src2), (iPTR 0)),
          (VBLENDPSYrri VR256:$src1,
                        (INSERT_SUBREG (v8i32 (IMPLICIT_DEF)),
                                       VR128:$src2, sub_xmm), 0xf)>;
def : Pat<(insert_subvector (v32i8 VR256:$src1), (v16i8 VR128:$src2), (iPTR 0)),
          (VBLENDPSYrri VR256:$src1,
                        (INSERT_SUBREG (v8i32 (IMPLICIT_DEF)),
                                       VR128:$src2, sub_xmm), 0xf)>;
}

//===----------------------------------------------------------------------===//
// VPBROADCAST - Broadcast a scalar (from a register or memory) to all
//               elements of the destination operand
//
multiclass avx2_broadcast<bits<8> opc, string OpcodeStr,
                          X86MemOperand x86memop, PatFrag ld_frag,
                          ValueType OpVT128, ValueType OpVT256, Predicate prd> {
  let Predicates = [HasAVX2, prd] in {
    def rr : AVX28I<opc, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
                  !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
                  [(set VR128:$dst,
                   (OpVT128 (X86VBroadcast (OpVT128 VR128:$src))))]>,
                  Sched<[SchedWriteShuffle.XMM]>, VEX;
    def rm : AVX28I<opc, MRMSrcMem, (outs VR128:$dst), (ins x86memop:$src),
                  !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
                  [(set VR128:$dst,
                   (OpVT128 (X86VBroadcast (ld_frag addr:$src))))]>,
                  Sched<[SchedWriteShuffle.XMM.Folded]>, VEX;
    def Yrr : AVX28I<opc, MRMSrcReg, (outs VR256:$dst), (ins VR128:$src),
                   !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
                   [(set VR256:$dst,
                    (OpVT256 (X86VBroadcast (OpVT128 VR128:$src))))]>,
                   Sched<[WriteShuffle256]>, VEX, VEX_L;
    def Yrm : AVX28I<opc, MRMSrcMem, (outs VR256:$dst), (ins x86memop:$src),
                   !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
                   [(set VR256:$dst,
                    (OpVT256 (X86VBroadcast (ld_frag addr:$src))))]>,
                   Sched<[SchedWriteShuffle.XMM.Folded]>, VEX, VEX_L;

    // Provide aliases for broadcast from the same register class that
    // automatically do the extract.
    def : Pat<(OpVT256 (X86VBroadcast (OpVT256 VR256:$src))),
              (!cast<Instruction>(NAME#"Yrr")
                  (OpVT128 (EXTRACT_SUBREG (OpVT256 VR256:$src), sub_xmm)))>;
  }
}

defm VPBROADCASTB  : avx2_broadcast<0x78, "vpbroadcastb", i8mem, loadi8,
                                    v16i8, v32i8, NoVLX_Or_NoBWI>;
defm VPBROADCASTW  : avx2_broadcast<0x79, "vpbroadcastw", i16mem, loadi16,
                                    v8i16, v16i16, NoVLX_Or_NoBWI>;
defm VPBROADCASTD  : avx2_broadcast<0x58, "vpbroadcastd", i32mem, loadi32,
                                    v4i32, v8i32, NoVLX>;
defm VPBROADCASTQ  : avx2_broadcast<0x59, "vpbroadcastq", i64mem, loadi64,
                                    v2i64, v4i64, NoVLX>;

let Predicates = [HasAVX2, NoVLX] in {
  // 32-bit targets will fail to load an i64 directly but can use ZEXT_LOAD.
  def : Pat<(v2i64 (X86VBroadcast (v2i64 (X86vzload addr:$src)))),
            (VPBROADCASTQrm addr:$src)>;
  def : Pat<(v4i64 (X86VBroadcast (v4i64 (X86vzload addr:$src)))),
            (VPBROADCASTQYrm addr:$src)>;

  def : Pat<(v4i32 (X86VBroadcast (v4i32 (scalar_to_vector (loadi32 addr:$src))))),
            (VPBROADCASTDrm addr:$src)>;
  def : Pat<(v8i32 (X86VBroadcast (v4i32 (scalar_to_vector (loadi32 addr:$src))))),
            (VPBROADCASTDYrm addr:$src)>;
  def : Pat<(v2i64 (X86VBroadcast (v2i64 (scalar_to_vector (loadi64 addr:$src))))),
            (VPBROADCASTQrm addr:$src)>;
  def : Pat<(v4i64 (X86VBroadcast (v2i64 (scalar_to_vector (loadi64 addr:$src))))),
            (VPBROADCASTQYrm addr:$src)>;
}
let Predicates = [HasAVX2, NoVLX_Or_NoBWI] in {
  // loadi16 is tricky to fold because i16 is not a desirable operation type
  // (!isTypeDesirableForOp), justifiably so. This means we'll encounter
  // truncated i32 loads instead; match them here.
  def : Pat<(v8i16 (X86VBroadcast (i16 (trunc (i32 (load addr:$src)))))),
            (VPBROADCASTWrm addr:$src)>;
  def : Pat<(v16i16 (X86VBroadcast (i16 (trunc (i32 (load addr:$src)))))),
            (VPBROADCASTWYrm addr:$src)>;
  def : Pat<(v8i16 (X86VBroadcast
              (i16 (trunc (i32 (zextloadi16 addr:$src)))))),
            (VPBROADCASTWrm addr:$src)>;
  def : Pat<(v16i16 (X86VBroadcast
              (i16 (trunc (i32 (zextloadi16 addr:$src)))))),
            (VPBROADCASTWYrm addr:$src)>;
}

let Predicates = [HasAVX2, NoVLX] in {
  // Provide aliases for broadcast from the same register class that
  // automatically do the extract.
  def : Pat<(v8f32 (X86VBroadcast (v8f32 VR256:$src))),
            (VBROADCASTSSYrr (v4f32 (EXTRACT_SUBREG (v8f32 VR256:$src),
                                                    sub_xmm)))>;
  def : Pat<(v4f64 (X86VBroadcast (v4f64 VR256:$src))),
            (VBROADCASTSDYrr (v2f64 (EXTRACT_SUBREG (v4f64 VR256:$src),
                                                    sub_xmm)))>;
}

let Predicates = [HasAVX2, NoVLX] in {
  // Provide a fallback in case the load node used in the patterns above
  // has additional users, which prevents those patterns from being selected.
  def : Pat<(v4f32 (X86VBroadcast FR32:$src)),
            (VBROADCASTSSrr (v4f32 (COPY_TO_REGCLASS FR32:$src, VR128)))>;
  def : Pat<(v8f32 (X86VBroadcast FR32:$src)),
            (VBROADCASTSSYrr (v4f32 (COPY_TO_REGCLASS FR32:$src, VR128)))>;
  def : Pat<(v4f64 (X86VBroadcast FR64:$src)),
            (VBROADCASTSDYrr (v2f64 (COPY_TO_REGCLASS FR64:$src, VR128)))>;
}

let Predicates = [HasAVX2, NoVLX_Or_NoBWI] in {
  def : Pat<(v16i8 (X86VBroadcast GR8:$src)),
        (VPBROADCASTBrr (v16i8 (COPY_TO_REGCLASS
                         (i32 (INSERT_SUBREG (i32 (IMPLICIT_DEF)),
                                             GR8:$src, sub_8bit)),
                         VR128)))>;
  def : Pat<(v32i8 (X86VBroadcast GR8:$src)),
        (VPBROADCASTBYrr (v16i8 (COPY_TO_REGCLASS
                          (i32 (INSERT_SUBREG (i32 (IMPLICIT_DEF)),
                                              GR8:$src, sub_8bit)),
                          VR128)))>;

  def : Pat<(v8i16 (X86VBroadcast GR16:$src)),
        (VPBROADCASTWrr (v8i16 (COPY_TO_REGCLASS
                         (i32 (INSERT_SUBREG (i32 (IMPLICIT_DEF)),
                                             GR16:$src, sub_16bit)),
                         VR128)))>;
  def : Pat<(v16i16 (X86VBroadcast GR16:$src)),
        (VPBROADCASTWYrr (v8i16 (COPY_TO_REGCLASS
                          (i32 (INSERT_SUBREG (i32 (IMPLICIT_DEF)),
                                              GR16:$src, sub_16bit)),
                          VR128)))>;
}
let Predicates = [HasAVX2, NoVLX] in {
  def : Pat<(v4i32 (X86VBroadcast GR32:$src)),
            (VPBROADCASTDrr (v4i32 (COPY_TO_REGCLASS GR32:$src, VR128)))>;
  def : Pat<(v8i32 (X86VBroadcast GR32:$src)),
            (VPBROADCASTDYrr (v4i32 (COPY_TO_REGCLASS GR32:$src, VR128)))>;
  def : Pat<(v2i64 (X86VBroadcast GR64:$src)),
            (VPBROADCASTQrr (v2i64 (COPY_TO_REGCLASS GR64:$src, VR128)))>;
  def : Pat<(v4i64 (X86VBroadcast GR64:$src)),
            (VPBROADCASTQYrr (v2i64 (COPY_TO_REGCLASS GR64:$src, VR128)))>;
}

// AVX1 broadcast patterns
let Predicates = [HasAVX1Only] in {
def : Pat<(v8i32 (X86VBroadcast (loadi32 addr:$src))),
          (VBROADCASTSSYrm addr:$src)>;
def : Pat<(v4i64 (X86VBroadcast (loadi64 addr:$src))),
          (VBROADCASTSDYrm addr:$src)>;
def : Pat<(v4i32 (X86VBroadcast (loadi32 addr:$src))),
          (VBROADCASTSSrm addr:$src)>;
}

// Provide a fallback in case the load node used in the patterns above
// has additional users, which prevents those patterns from being selected.
let Predicates = [HasAVX, NoVLX] in {
  // 128-bit broadcasts:
  def : Pat<(v2f64 (X86VBroadcast f64:$src)),
            (VMOVDDUPrr (v2f64 (COPY_TO_REGCLASS FR64:$src, VR128)))>;
  def : Pat<(v2f64 (X86VBroadcast (loadf64 addr:$src))),
            (VMOVDDUPrm addr:$src)>;

  def : Pat<(v2f64 (X86VBroadcast v2f64:$src)),
            (VMOVDDUPrr VR128:$src)>;
  def : Pat<(v2f64 (X86VBroadcast (loadv2f64 addr:$src))),
            (VMOVDDUPrm addr:$src)>;
}

let Predicates = [HasAVX1Only] in {
  def : Pat<(v4f32 (X86VBroadcast FR32:$src)),
            (VPERMILPSri (v4f32 (COPY_TO_REGCLASS FR32:$src, VR128)), 0)>;
  def : Pat<(v8f32 (X86VBroadcast FR32:$src)),
            (VINSERTF128rr (INSERT_SUBREG (v8f32 (IMPLICIT_DEF)),
              (v4f32 (VPERMILPSri (v4f32 (COPY_TO_REGCLASS FR32:$src, VR128)), 0)), sub_xmm),
              (v4f32 (VPERMILPSri (v4f32 (COPY_TO_REGCLASS FR32:$src, VR128)), 0)), 1)>;
  def : Pat<(v4f64 (X86VBroadcast FR64:$src)),
            (VINSERTF128rr (INSERT_SUBREG (v4f64 (IMPLICIT_DEF)),
              (v2f64 (VMOVDDUPrr (v2f64 (COPY_TO_REGCLASS FR64:$src, VR128)))), sub_xmm),
              (v2f64 (VMOVDDUPrr (v2f64 (COPY_TO_REGCLASS FR64:$src, VR128)))), 1)>;

  def : Pat<(v4i32 (X86VBroadcast GR32:$src)),
            (VPSHUFDri (v4i32 (COPY_TO_REGCLASS GR32:$src, VR128)), 0)>;
  def : Pat<(v8i32 (X86VBroadcast GR32:$src)),
            (VINSERTF128rr (INSERT_SUBREG (v8i32 (IMPLICIT_DEF)),
              (v4i32 (VPSHUFDri (v4i32 (COPY_TO_REGCLASS GR32:$src, VR128)), 0)), sub_xmm),
              (v4i32 (VPSHUFDri (v4i32 (COPY_TO_REGCLASS GR32:$src, VR128)), 0)), 1)>;
  def : Pat<(v4i64 (X86VBroadcast GR64:$src)),
            (VINSERTF128rr (INSERT_SUBREG (v4i64 (IMPLICIT_DEF)),
              (v4i32 (VPSHUFDri (v4i32 (COPY_TO_REGCLASS GR64:$src, VR128)), 0x44)), sub_xmm),
              (v4i32 (VPSHUFDri (v4i32 (COPY_TO_REGCLASS GR64:$src, VR128)), 0x44)), 1)>;

  def : Pat<(v2i64 (X86VBroadcast i64:$src)),
            (VPSHUFDri (v4i32 (COPY_TO_REGCLASS GR64:$src, VR128)), 0x44)>;
  def : Pat<(v2i64 (X86VBroadcast (loadi64 addr:$src))),
            (VMOVDDUPrm addr:$src)>;
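
  // In the v4i64/v2i64 patterns above, the vpshufd immediate 0x44
  // (0b01000100) selects dwords {0, 1, 0, 1}, i.e. it splats the low 64
  // bits of the source register.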
}

//===----------------------------------------------------------------------===//
// VPERM - Permute instructions
//

multiclass avx2_perm<bits<8> opc, string OpcodeStr, PatFrag mem_frag,
                     ValueType OpVT, X86FoldableSchedWrite Sched,
                     X86MemOperand memOp> {
  let Predicates = [HasAVX2, NoVLX] in {
    def Yrr : AVX28I<opc, MRMSrcReg, (outs VR256:$dst),
                     (ins VR256:$src1, VR256:$src2),
                     !strconcat(OpcodeStr,
                         "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
                     [(set VR256:$dst,
                       (OpVT (X86VPermv VR256:$src1, VR256:$src2)))]>,
                     Sched<[Sched]>, VEX_4V, VEX_L;
    def Yrm : AVX28I<opc, MRMSrcMem, (outs VR256:$dst),
                     (ins VR256:$src1, memOp:$src2),
                     !strconcat(OpcodeStr,
                         "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
                     [(set VR256:$dst,
                       (OpVT (X86VPermv VR256:$src1,
                              (bitconvert (mem_frag addr:$src2)))))]>,
                     Sched<[Sched.Folded, ReadAfterLd]>, VEX_4V, VEX_L;
  }
}

defm VPERMD : avx2_perm<0x36, "vpermd", loadv4i64, v8i32, WriteVarShuffle256,
                        i256mem>;
let ExeDomain = SSEPackedSingle in
defm VPERMPS : avx2_perm<0x16, "vpermps", loadv8f32, v8f32, WriteFVarShuffle256,
                         f256mem>;

multiclass avx2_perm_imm<bits<8> opc, string OpcodeStr, PatFrag mem_frag,
                         ValueType OpVT, X86FoldableSchedWrite Sched,
                         X86MemOperand memOp> {
  let Predicates = [HasAVX2, NoVLX] in {
    def Yri : AVX2AIi8<opc, MRMSrcReg, (outs VR256:$dst),
                       (ins VR256:$src1, u8imm:$src2),
                       !strconcat(OpcodeStr,
                           "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
                       [(set VR256:$dst,
                         (OpVT (X86VPermi VR256:$src1, (i8 imm:$src2))))]>,
                       Sched<[Sched]>, VEX, VEX_L;
    def Ymi : AVX2AIi8<opc, MRMSrcMem, (outs VR256:$dst),
                       (ins memOp:$src1, u8imm:$src2),
                       !strconcat(OpcodeStr,
                           "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
                       [(set VR256:$dst,
                         (OpVT (X86VPermi (mem_frag addr:$src1),
                                (i8 imm:$src2))))]>,
                       Sched<[Sched.Folded, ReadAfterLd]>, VEX, VEX_L;
  }
}

defm VPERMQ : avx2_perm_imm<0x00, "vpermq", loadv4i64, v4i64,
                            WriteShuffle256, i256mem>, VEX_W;
let ExeDomain = SSEPackedDouble in
defm VPERMPD : avx2_perm_imm<0x01, "vpermpd", loadv4f64, v4f64,
                             WriteFShuffle256, f256mem>, VEX_W;

//===----------------------------------------------------------------------===//
// VPERM2I128 - Permute Integer Values in 128-bit chunks
//
let isCommutable = 1 in
def VPERM2I128rr : AVX2AIi8<0x46, MRMSrcReg, (outs VR256:$dst),
          (ins VR256:$src1, VR256:$src2, u8imm:$src3),
          "vperm2i128\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
          [(set VR256:$dst, (v4i64 (X86VPerm2x128 VR256:$src1, VR256:$src2,
                            (i8 imm:$src3))))]>, Sched<[WriteShuffle256]>,
          VEX_4V, VEX_L;
def VPERM2I128rm : AVX2AIi8<0x46, MRMSrcMem, (outs VR256:$dst),
          (ins VR256:$src1, f256mem:$src2, u8imm:$src3),
          "vperm2i128\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
          [(set VR256:$dst, (X86VPerm2x128 VR256:$src1, (loadv4i64 addr:$src2),
                             (i8 imm:$src3)))]>,
          Sched<[WriteShuffle256Ld, ReadAfterLd]>, VEX_4V, VEX_L;

let Predicates = [HasAVX2] in
def : Pat<(v4i64 (X86VPerm2x128 (loadv4i64 addr:$src2),
                                VR256:$src1, (i8 imm:$imm))),
          (VPERM2I128rm VR256:$src1, addr:$src2, (Perm2XCommuteImm imm:$imm))>;

//===----------------------------------------------------------------------===//
// VINSERTI128 - Insert packed integer values
//
let hasSideEffects = 0 in {
def VINSERTI128rr : AVX2AIi8<0x38, MRMSrcReg, (outs VR256:$dst),
          (ins VR256:$src1, VR128:$src2, u8imm:$src3),
          "vinserti128\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
          []>, Sched<[WriteShuffle256]>, VEX_4V, VEX_L;
let mayLoad = 1 in
def VINSERTI128rm : AVX2AIi8<0x38, MRMSrcMem, (outs VR256:$dst),
          (ins VR256:$src1, i128mem:$src2, u8imm:$src3),
          "vinserti128\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
          []>, Sched<[WriteShuffle256Ld, ReadAfterLd]>, VEX_4V, VEX_L;
}

let Predicates = [HasAVX2, NoVLX] in {
  defm : vinsert_lowering<"VINSERTI128", v2i64, v4i64,  loadv2i64>;
  defm : vinsert_lowering<"VINSERTI128", v4i32, v8i32,  loadv2i64>;
  defm : vinsert_lowering<"VINSERTI128", v8i16, v16i16, loadv2i64>;
  defm : vinsert_lowering<"VINSERTI128", v16i8, v32i8,  loadv2i64>;
}

//===----------------------------------------------------------------------===//
// VEXTRACTI128 - Extract packed integer values
//
def VEXTRACTI128rr : AVX2AIi8<0x39, MRMDestReg, (outs VR128:$dst),
          (ins VR256:$src1, u8imm:$src2),
          "vextracti128\t{$src2, $src1, $dst|$dst, $src1, $src2}", []>,
          Sched<[WriteShuffle256]>, VEX, VEX_L;
let hasSideEffects = 0, mayStore = 1 in
def VEXTRACTI128mr : AVX2AIi8<0x39, MRMDestMem, (outs),
          (ins i128mem:$dst, VR256:$src1, u8imm:$src2),
          "vextracti128\t{$src2, $src1, $dst|$dst, $src1, $src2}", []>,
          Sched<[SchedWriteVecMoveLS.XMM.MR]>, VEX, VEX_L;

let Predicates = [HasAVX2, NoVLX] in {
  defm : vextract_lowering<"VEXTRACTI128", v4i64,  v2i64>;
  defm : vextract_lowering<"VEXTRACTI128", v8i32,  v4i32>;
  defm : vextract_lowering<"VEXTRACTI128", v16i16, v8i16>;
  defm : vextract_lowering<"VEXTRACTI128", v32i8,  v16i8>;
}

//===----------------------------------------------------------------------===//
// VPMASKMOV - Conditional SIMD Integer Packed Loads and Stores
//
multiclass avx2_pmovmask<string OpcodeStr,
                         Intrinsic IntLd128, Intrinsic IntLd256,
                         Intrinsic IntSt128, Intrinsic IntSt256> {
  def rm  : AVX28I<0x8c, MRMSrcMem, (outs VR128:$dst),
             (ins VR128:$src1, i128mem:$src2),
             !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
             [(set VR128:$dst, (IntLd128 addr:$src2, VR128:$src1))]>,
             VEX_4V, Sched<[WriteVecMaskedLoad]>;
  def Yrm : AVX28I<0x8c, MRMSrcMem, (outs VR256:$dst),
             (ins VR256:$src1, i256mem:$src2),
             !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
             [(set VR256:$dst, (IntLd256 addr:$src2, VR256:$src1))]>,
             VEX_4V, VEX_L, Sched<[WriteVecMaskedLoadY]>;
  def mr  : AVX28I<0x8e, MRMDestMem, (outs),
             (ins i128mem:$dst, VR128:$src1, VR128:$src2),
             !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
             [(IntSt128 addr:$dst, VR128:$src1, VR128:$src2)]>,
             VEX_4V, Sched<[WriteVecMaskedStore]>;
  def Ymr : AVX28I<0x8e, MRMDestMem, (outs),
             (ins i256mem:$dst, VR256:$src1, VR256:$src2),
             !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
             [(IntSt256 addr:$dst, VR256:$src1, VR256:$src2)]>,
             VEX_4V, VEX_L, Sched<[WriteVecMaskedStoreY]>;
}

defm VPMASKMOVD : avx2_pmovmask<"vpmaskmovd",
                                int_x86_avx2_maskload_d,
                                int_x86_avx2_maskload_d_256,
                                int_x86_avx2_maskstore_d,
                                int_x86_avx2_maskstore_d_256>;
defm VPMASKMOVQ : avx2_pmovmask<"vpmaskmovq",
                                int_x86_avx2_maskload_q,
                                int_x86_avx2_maskload_q_256,
                                int_x86_avx2_maskstore_q,
                                int_x86_avx2_maskstore_q_256>, VEX_W;

multiclass maskmov_lowering<string InstrStr, RegisterClass RC, ValueType VT,
                          ValueType MaskVT, string BlendStr, ValueType ZeroVT> {
    // masked store
    def: Pat<(X86mstore addr:$ptr, (MaskVT RC:$mask), (VT RC:$src)),
             (!cast<Instruction>(InstrStr#"mr") addr:$ptr, RC:$mask, RC:$src)>;
    // masked load
    def: Pat<(VT (X86mload addr:$ptr, (MaskVT RC:$mask), undef)),
             (!cast<Instruction>(InstrStr#"rm") RC:$mask, addr:$ptr)>;
    def: Pat<(VT (X86mload addr:$ptr, (MaskVT RC:$mask),
                              (VT (bitconvert (ZeroVT immAllZerosV))))),
             (!cast<Instruction>(InstrStr#"rm") RC:$mask, addr:$ptr)>;
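    // masked load with a non-undef pass-through value: vmaskmov zeroes the
    // masked-off lanes, so blend $src0 back into those lanes with a variable
    // blend keyed on the same per-element mask sign bits.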
    def: Pat<(VT (X86mload addr:$ptr, (MaskVT RC:$mask), (VT RC:$src0))),
             (!cast<Instruction>(BlendStr#"rr")
                 RC:$src0,
                 (VT (!cast<Instruction>(InstrStr#"rm") RC:$mask, addr:$ptr)),
                 RC:$mask)>;
}
let Predicates = [HasAVX] in {
  defm : maskmov_lowering<"VMASKMOVPS", VR128, v4f32, v4i32, "VBLENDVPS", v4i32>;
  defm : maskmov_lowering<"VMASKMOVPD", VR128, v2f64, v2i64, "VBLENDVPD", v4i32>;
  defm : maskmov_lowering<"VMASKMOVPSY", VR256, v8f32, v8i32, "VBLENDVPSY", v8i32>;
  defm : maskmov_lowering<"VMASKMOVPDY", VR256, v4f64, v4i64, "VBLENDVPDY", v8i32>;
}
let Predicates = [HasAVX1Only] in {
  // Integer loads/stores are not supported directly on AVX1, so use the
  // ps/pd versions.
  defm : maskmov_lowering<"VMASKMOVPSY", VR256, v8i32, v8i32, "VBLENDVPSY", v8i32>;
  defm : maskmov_lowering<"VMASKMOVPDY", VR256, v4i64, v4i64, "VBLENDVPDY", v8i32>;
  defm : maskmov_lowering<"VMASKMOVPS", VR128, v4i32, v4i32, "VBLENDVPS", v4i32>;
  defm : maskmov_lowering<"VMASKMOVPD", VR128, v2i64, v2i64, "VBLENDVPD", v4i32>;
}
let Predicates = [HasAVX2] in {
  defm : maskmov_lowering<"VPMASKMOVDY", VR256, v8i32, v8i32, "VBLENDVPSY", v8i32>;
  defm : maskmov_lowering<"VPMASKMOVQY", VR256, v4i64, v4i64, "VBLENDVPDY", v8i32>;
  defm : maskmov_lowering<"VPMASKMOVD", VR128, v4i32, v4i32, "VBLENDVPS", v4i32>;
  defm : maskmov_lowering<"VPMASKMOVQ", VR128, v2i64, v2i64, "VBLENDVPD", v4i32>;
}

//===----------------------------------------------------------------------===//
// SubVector Broadcasts
// Provide a fallback in case the load node used in the patterns above
// has additional users, which prevents those patterns from being selected.
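// Each fallback below materializes the 256-bit result with a vinsert*128:
// the 128-bit source is placed into the low half via INSERT_SUBREG and then
// inserted again into the high half (index 1).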

let Predicates = [HasAVX2, NoVLX] in {
def : Pat<(v4i64 (X86SubVBroadcast (v2i64 VR128:$src))),
          (VINSERTI128rr (INSERT_SUBREG (v4i64 (IMPLICIT_DEF)), VR128:$src, sub_xmm),
                         (v2i64 VR128:$src), 1)>;
def : Pat<(v8i32 (X86SubVBroadcast (v4i32 VR128:$src))),
          (VINSERTI128rr (INSERT_SUBREG (v8i32 (IMPLICIT_DEF)), VR128:$src, sub_xmm),
                         (v4i32 VR128:$src), 1)>;
def : Pat<(v16i16 (X86SubVBroadcast (v8i16 VR128:$src))),
          (VINSERTI128rr (INSERT_SUBREG (v16i16 (IMPLICIT_DEF)), VR128:$src, sub_xmm),
                         (v8i16 VR128:$src), 1)>;
def : Pat<(v32i8 (X86SubVBroadcast (v16i8 VR128:$src))),
          (VINSERTI128rr (INSERT_SUBREG (v32i8 (IMPLICIT_DEF)), VR128:$src, sub_xmm),
                         (v16i8 VR128:$src), 1)>;
}

let Predicates = [HasAVX, NoVLX] in {
def : Pat<(v4f64 (X86SubVBroadcast (v2f64 VR128:$src))),
          (VINSERTF128rr (INSERT_SUBREG (v4f64 (IMPLICIT_DEF)), VR128:$src, sub_xmm),
                         (v2f64 VR128:$src), 1)>;
def : Pat<(v8f32 (X86SubVBroadcast (v4f32 VR128:$src))),
          (VINSERTF128rr (INSERT_SUBREG (v8f32 (IMPLICIT_DEF)), VR128:$src, sub_xmm),
                         (v4f32 VR128:$src), 1)>;
}

let Predicates = [HasAVX1Only] in {
def : Pat<(v4i64 (X86SubVBroadcast (v2i64 VR128:$src))),
          (VINSERTF128rr (INSERT_SUBREG (v4i64 (IMPLICIT_DEF)), VR128:$src, sub_xmm),
                         (v2i64 VR128:$src), 1)>;
def : Pat<(v8i32 (X86SubVBroadcast (v4i32 VR128:$src))),
          (VINSERTF128rr (INSERT_SUBREG (v8i32 (IMPLICIT_DEF)), VR128:$src, sub_xmm),
                         (v4i32 VR128:$src), 1)>;
def : Pat<(v16i16 (X86SubVBroadcast (v8i16 VR128:$src))),
          (VINSERTF128rr (INSERT_SUBREG (v16i16 (IMPLICIT_DEF)), VR128:$src, sub_xmm),
                         (v8i16 VR128:$src), 1)>;
def : Pat<(v32i8 (X86SubVBroadcast (v16i8 VR128:$src))),
          (VINSERTF128rr (INSERT_SUBREG (v32i8 (IMPLICIT_DEF)), VR128:$src, sub_xmm),
                         (v16i8 VR128:$src), 1)>;
}

//===----------------------------------------------------------------------===//
// Variable Bit Shifts
//
multiclass avx2_var_shift<bits<8> opc, string OpcodeStr, SDNode OpNode,
                          ValueType vt128, ValueType vt256> {
  def rr  : AVX28I<opc, MRMSrcReg, (outs VR128:$dst),
             (ins VR128:$src1, VR128:$src2),
             !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
             [(set VR128:$dst,
               (vt128 (OpNode VR128:$src1, (vt128 VR128:$src2))))]>,
             VEX_4V, Sched<[SchedWriteVarVecShift.XMM]>;
  def rm  : AVX28I<opc, MRMSrcMem, (outs VR128:$dst),
             (ins VR128:$src1, i128mem:$src2),
             !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
             [(set VR128:$dst,
               (vt128 (OpNode VR128:$src1,
                       (vt128 (bitconvert (loadv2i64 addr:$src2))))))]>,
             VEX_4V, Sched<[SchedWriteVarVecShift.XMM.Folded, ReadAfterLd]>;
  def Yrr : AVX28I<opc, MRMSrcReg, (outs VR256:$dst),
             (ins VR256:$src1, VR256:$src2),
             !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
             [(set VR256:$dst,
               (vt256 (OpNode VR256:$src1, (vt256 VR256:$src2))))]>,
             VEX_4V, VEX_L, Sched<[SchedWriteVarVecShift.YMM]>;
  def Yrm : AVX28I<opc, MRMSrcMem, (outs VR256:$dst),
             (ins VR256:$src1, i256mem:$src2),
             !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
             [(set VR256:$dst,
               (vt256 (OpNode VR256:$src1,
                       (vt256 (bitconvert (loadv4i64 addr:$src2))))))]>,
             VEX_4V, VEX_L, Sched<[SchedWriteVarVecShift.YMM.Folded, ReadAfterLd]>;
}

let Predicates = [HasAVX2, NoVLX] in {
  defm VPSLLVD : avx2_var_shift<0x47, "vpsllvd", shl, v4i32, v8i32>;
  defm VPSLLVQ : avx2_var_shift<0x47, "vpsllvq", shl, v2i64, v4i64>, VEX_W;
  defm VPSRLVD : avx2_var_shift<0x45, "vpsrlvd", srl, v4i32, v8i32>;
  defm VPSRLVQ : avx2_var_shift<0x45, "vpsrlvq", srl, v2i64, v4i64>, VEX_W;
  defm VPSRAVD : avx2_var_shift<0x46, "vpsravd", sra, v4i32, v8i32>;

  def : Pat<(v4i32 (X86vsrav VR128:$src1, VR128:$src2)),
            (VPSRAVDrr VR128:$src1, VR128:$src2)>;
  def : Pat<(v4i32 (X86vsrav VR128:$src1,
                    (bitconvert (loadv2i64 addr:$src2)))),
            (VPSRAVDrm VR128:$src1, addr:$src2)>;
  def : Pat<(v8i32 (X86vsrav VR256:$src1, VR256:$src2)),
            (VPSRAVDYrr VR256:$src1, VR256:$src2)>;
  def : Pat<(v8i32 (X86vsrav VR256:$src1,
                    (bitconvert (loadv4i64 addr:$src2)))),
            (VPSRAVDYrm VR256:$src1, addr:$src2)>;
}

//===----------------------------------------------------------------------===//
// VGATHER - GATHER Operations

// FIXME: Improve scheduling of gather instructions.
multiclass avx2_gather<bits<8> opc, string OpcodeStr, ValueType VTx,
                       ValueType VTy, PatFrag GatherNode128,
                       PatFrag GatherNode256, RegisterClass RC256,
                       X86MemOperand memop128, X86MemOperand memop256,
                       ValueType MTx = VTx, ValueType MTy = VTy> {
  def rm  : AVX28I<opc, MRMSrcMem4VOp3, (outs VR128:$dst, VR128:$mask_wb),
            (ins VR128:$src1, memop128:$src2, VR128:$mask),
            !strconcat(OpcodeStr,
              "\t{$mask, $src2, $dst|$dst, $src2, $mask}"),
            [(set (VTx VR128:$dst), (MTx VR128:$mask_wb),
                  (GatherNode128 VR128:$src1, VR128:$mask,
                                vectoraddr:$src2))]>,
            VEX, Sched<[WriteLoad]>;
  def Yrm : AVX28I<opc, MRMSrcMem4VOp3, (outs RC256:$dst, RC256:$mask_wb),
            (ins RC256:$src1, memop256:$src2, RC256:$mask),
            !strconcat(OpcodeStr,
              "\t{$mask, $src2, $dst|$dst, $src2, $mask}"),
            [(set (VTy RC256:$dst), (MTy RC256:$mask_wb),
                  (GatherNode256 RC256:$src1, RC256:$mask,
                                vectoraddr:$src2))]>,
            VEX, VEX_L, Sched<[WriteLoad]>;
}

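// Gathers update their mask register in place, clearing each lane once the
// corresponding element has been loaded; this is modeled as a second output
// ($mask_wb) tied to the input mask. Both outputs are marked @earlyclobber
// so that neither is assigned a register that is still live as part of the
// memory (index) operand.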
let Predicates = [UseAVX2] in {
  let mayLoad = 1, hasSideEffects = 0,
      Constraints = "@earlyclobber $dst, @earlyclobber $mask_wb, $src1 = $dst, $mask = $mask_wb" in {
    defm VPGATHERDQ : avx2_gather<0x90, "vpgatherdq", v2i64, v4i64, mgatherv4i32,
                        mgatherv4i32, VR256, vx128mem, vx256mem>, VEX_W;
    defm VPGATHERQQ : avx2_gather<0x91, "vpgatherqq", v2i64, v4i64, mgatherv2i64,
                        mgatherv4i64, VR256, vx128mem, vy256mem>, VEX_W;
    defm VPGATHERDD : avx2_gather<0x90, "vpgatherdd", v4i32, v8i32, mgatherv4i32,
                        mgatherv8i32, VR256, vx128mem, vy256mem>;
    defm VPGATHERQD : avx2_gather<0x91, "vpgatherqd", v4i32, v4i32, mgatherv2i64,
                        mgatherv4i64, VR128, vx64mem, vy128mem>;

    let ExeDomain = SSEPackedDouble in {
      defm VGATHERDPD : avx2_gather<0x92, "vgatherdpd", v2f64, v4f64, mgatherv4i32,
                          mgatherv4i32, VR256, vx128mem, vx256mem,
                          v2i64, v4i64>, VEX_W;
      defm VGATHERQPD : avx2_gather<0x93, "vgatherqpd", v2f64, v4f64, mgatherv2i64,
                          mgatherv4i64, VR256, vx128mem, vy256mem,
                          v2i64, v4i64>, VEX_W;
    }

    let ExeDomain = SSEPackedSingle in {
      defm VGATHERDPS : avx2_gather<0x92, "vgatherdps", v4f32, v8f32, mgatherv4i32,
                          mgatherv8i32, VR256, vx128mem, vy256mem,
                          v4i32, v8i32>;
      defm VGATHERQPS : avx2_gather<0x93, "vgatherqps", v4f32, v4f32, mgatherv2i64,
                          mgatherv4i64, VR128, vx64mem, vy128mem,
                          v4i32, v4i32>;
    }
  }
}

//===----------------------------------------------------------------------===//
// Extra selection patterns for f128, f128mem

// movaps is shorter than movdqa. movaps is in SSE and movdqa is in SSE2.
def : Pat<(alignedstore (f128 VR128:$src), addr:$dst),
          (MOVAPSmr addr:$dst, (COPY_TO_REGCLASS (f128 VR128:$src), VR128))>;
def : Pat<(store (f128 VR128:$src), addr:$dst),
          (MOVUPSmr addr:$dst, (COPY_TO_REGCLASS (f128 VR128:$src), VR128))>;

def : Pat<(alignedloadf128 addr:$src),
          (COPY_TO_REGCLASS (MOVAPSrm addr:$src), VR128)>;
def : Pat<(loadf128 addr:$src),
          (COPY_TO_REGCLASS (MOVUPSrm addr:$src), VR128)>;
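
// f128 operands already live in VR128, so the COPY_TO_REGCLASS nodes in the
// patterns here only retype the value for the ps instructions; they are
// expected to coalesce into no-op copies after register allocation.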

// andps is shorter than andpd or pand. andps is in SSE and andpd/pand are in SSE2.
def : Pat<(f128 (X86fand VR128:$src1, (memopf128 addr:$src2))),
          (COPY_TO_REGCLASS
           (ANDPSrm (COPY_TO_REGCLASS VR128:$src1, VR128), f128mem:$src2),
           VR128)>;

def : Pat<(f128 (X86fand VR128:$src1, VR128:$src2)),
          (COPY_TO_REGCLASS
           (ANDPSrr (COPY_TO_REGCLASS VR128:$src1, VR128),
                    (COPY_TO_REGCLASS VR128:$src2, VR128)), VR128)>;

def : Pat<(f128 (X86for VR128:$src1, (memopf128 addr:$src2))),
          (COPY_TO_REGCLASS
           (ORPSrm (COPY_TO_REGCLASS VR128:$src1, VR128), f128mem:$src2),
           VR128)>;

def : Pat<(f128 (X86for VR128:$src1, VR128:$src2)),
          (COPY_TO_REGCLASS
           (ORPSrr (COPY_TO_REGCLASS VR128:$src1, VR128),
                   (COPY_TO_REGCLASS VR128:$src2, VR128)), VR128)>;

def : Pat<(f128 (X86fxor VR128:$src1, (memopf128 addr:$src2))),
          (COPY_TO_REGCLASS
           (XORPSrm (COPY_TO_REGCLASS VR128:$src1, VR128), f128mem:$src2),
           VR128)>;

def : Pat<(f128 (X86fxor VR128:$src1, VR128:$src2)),
          (COPY_TO_REGCLASS
           (XORPSrr (COPY_TO_REGCLASS VR128:$src1, VR128),
                    (COPY_TO_REGCLASS VR128:$src2, VR128)), VR128)>;

//===----------------------------------------------------------------------===//
// GFNI instructions
//===----------------------------------------------------------------------===//

multiclass GF2P8MULB_rm<string OpcodeStr, ValueType OpVT,
                        RegisterClass RC, PatFrag MemOpFrag,
                        X86MemOperand X86MemOp, bit Is2Addr = 0> {
  let ExeDomain = SSEPackedInt,
      AsmString = !if(Is2Addr,
        OpcodeStr##"\t{$src2, $dst|$dst, $src2}",
        OpcodeStr##"\t{$src2, $src1, $dst|$dst, $src1, $src2}") in {
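    // Note: the defs below pass an empty asm string; the AsmString bound in
    // the enclosing 'let' supplies the 2-operand (SSE) or 3-operand (VEX)
    // syntax, depending on Is2Addr.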
    let isCommutable = 1 in
    def rr : PDI<0xCF, MRMSrcReg, (outs RC:$dst), (ins RC:$src1, RC:$src2), "",
                 [(set RC:$dst, (OpVT (X86GF2P8mulb RC:$src1, RC:$src2)))]>,
             Sched<[SchedWriteVecALU.XMM]>, T8PD;

    def rm : PDI<0xCF, MRMSrcMem, (outs RC:$dst), (ins RC:$src1, X86MemOp:$src2), "",
                 [(set RC:$dst, (OpVT (X86GF2P8mulb RC:$src1,
                                 (bitconvert (MemOpFrag addr:$src2)))))]>,
             Sched<[SchedWriteVecALU.XMM.Folded, ReadAfterLd]>, T8PD;
  }
}

multiclass GF2P8AFFINE_rmi<bits<8> Op, string OpStr, ValueType OpVT,
                           SDNode OpNode, RegisterClass RC, PatFrag MemOpFrag,
                           X86MemOperand X86MemOp, bit Is2Addr = 0> {
  let AsmString = !if(Is2Addr,
      OpStr##"\t{$src3, $src2, $dst|$dst, $src2, $src3}",
      OpStr##"\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}") in {
  def rri : Ii8<Op, MRMSrcReg, (outs RC:$dst),
              (ins RC:$src1, RC:$src2, u8imm:$src3), "",
              [(set RC:$dst, (OpVT (OpNode RC:$src1, RC:$src2, imm:$src3)))],
              SSEPackedInt>, Sched<[SchedWriteVecALU.XMM]>;
  def rmi : Ii8<Op, MRMSrcMem, (outs RC:$dst),
              (ins RC:$src1, X86MemOp:$src2, u8imm:$src3), "",
              [(set RC:$dst, (OpVT (OpNode RC:$src1,
                                    (bitconvert (MemOpFrag addr:$src2)),
                              imm:$src3)))], SSEPackedInt>,
              Sched<[SchedWriteVecALU.XMM.Folded, ReadAfterLd]>;
  }
}

multiclass GF2P8AFFINE_common<bits<8> Op, string OpStr, SDNode OpNode> {
  let Constraints = "$src1 = $dst",
      Predicates  = [HasGFNI, UseSSE2] in
  defm NAME         : GF2P8AFFINE_rmi<Op, OpStr, v16i8, OpNode,
                                      VR128, loadv2i64, i128mem, 1>;
  let Predicates  = [HasGFNI, HasAVX, NoVLX_Or_NoBWI] in {
    defm V##NAME    : GF2P8AFFINE_rmi<Op, "v"##OpStr, v16i8, OpNode, VR128,
                                      loadv2i64, i128mem>, VEX_4V, VEX_W;
    defm V##NAME##Y : GF2P8AFFINE_rmi<Op, "v"##OpStr, v32i8, OpNode, VR256,
                                      loadv4i64, i256mem>, VEX_4V, VEX_L, VEX_W;
  }
}

// GF2P8MULB
let Constraints = "$src1 = $dst",
    Predicates  = [HasGFNI, UseSSE2] in
defm GF2P8MULB      : GF2P8MULB_rm<"gf2p8mulb", v16i8, VR128, memopv2i64,
                                    i128mem, 1>;
let Predicates  = [HasGFNI, HasAVX, NoVLX_Or_NoBWI] in {
  defm VGF2P8MULB   : GF2P8MULB_rm<"vgf2p8mulb", v16i8, VR128, loadv2i64,
                                   i128mem>, VEX_4V;
  defm VGF2P8MULBY  : GF2P8MULB_rm<"vgf2p8mulb", v32i8, VR256, loadv4i64,
                                   i256mem>, VEX_4V, VEX_L;
}
// GF2P8AFFINEINVQB, GF2P8AFFINEQB
let isCommutable = 0 in {
  defm GF2P8AFFINEINVQB : GF2P8AFFINE_common<0xCF, "gf2p8affineinvqb",
                                             X86GF2P8affineinvqb>, TAPD;
  defm GF2P8AFFINEQB    : GF2P8AFFINE_common<0xCE, "gf2p8affineqb",
                                             X86GF2P8affineqb>, TAPD;
}