1//===-- X86InstrSSE.td - SSE Instruction Set ---------------*- tablegen -*-===//
2//
3//                     The LLVM Compiler Infrastructure
4//
5// This file is distributed under the University of Illinois Open Source
6// License. See LICENSE.TXT for details.
7//
8//===----------------------------------------------------------------------===//
9//
10// This file describes the X86 SSE instruction set, defining the instructions,
11// and properties of the instructions which are needed for code generation,
12// machine code emission, and analysis.
13//
14//===----------------------------------------------------------------------===//
15
// Bundles the itineraries for the two forms of an SSE/AVX instruction:
// rr (register-register) and rm (register-memory, i.e. the load-folded form),
// together with the new-style scheduler write class used by InstrSchedModel.
class OpndItins<InstrItinClass arg_rr, InstrItinClass arg_rm> {
  InstrItinClass rr = arg_rr;   // itinerary for the reg-reg encoding
  InstrItinClass rm = arg_rm;   // itinerary for the reg-mem encoding
  // InstrSchedModel info. Defaults to WriteFAdd; instantiations override it
  // via "let Sched = ..." around the def.
  X86FoldableSchedWrite Sched = WriteFAdd;
}
22
// Pairs the single-precision (s) and double-precision (d) itinerary bundles
// for an operation, so one record covers both FP sizes.
class SizeItins<OpndItins arg_s, OpndItins arg_d> {
  OpndItins s = arg_s;   // float (F32) itineraries
  OpndItins d = arg_d;   // double (F64) itineraries
}
27
28
// Like OpndItins but with a third itinerary (ri) for the
// register-immediate form used by the vector shift instructions.
class ShiftOpndItins<InstrItinClass arg_rr, InstrItinClass arg_rm,
  InstrItinClass arg_ri> {
  InstrItinClass rr = arg_rr;   // reg-reg form
  InstrItinClass rm = arg_rm;   // reg-mem form
  InstrItinClass ri = arg_ri;   // reg-imm form
}
35
36
// scalar
// Scalar FP add/sub itineraries, one bundle per FP size, scheduled as WriteFAdd.
let Sched = WriteFAdd in {
def SSE_ALU_F32S : OpndItins<
  IIC_SSE_ALU_F32S_RR, IIC_SSE_ALU_F32S_RM
>;

def SSE_ALU_F64S : OpndItins<
  IIC_SSE_ALU_F64S_RR, IIC_SSE_ALU_F64S_RM
>;
}

def SSE_ALU_ITINS_S : SizeItins<
  SSE_ALU_F32S, SSE_ALU_F64S
>;
51
// Scalar FP multiply itineraries, scheduled as WriteFMul.
let Sched = WriteFMul in {
def SSE_MUL_F32S : OpndItins<
  // Fixed copy-paste bug: the rm (load-folded) form previously used the
  // double-precision class IIC_SSE_MUL_F64S_RM for this single-precision
  // record. Pair F32S_RR with F32S_RM, matching every other itinerary def.
  IIC_SSE_MUL_F32S_RR, IIC_SSE_MUL_F32S_RM
>;

def SSE_MUL_F64S : OpndItins<
  IIC_SSE_MUL_F64S_RR, IIC_SSE_MUL_F64S_RM
>;
}

def SSE_MUL_ITINS_S : SizeItins<
  SSE_MUL_F32S, SSE_MUL_F64S
>;
65
// Scalar FP divide itineraries, scheduled as WriteFDiv.
let Sched = WriteFDiv in {
def SSE_DIV_F32S : OpndItins<
  // Fixed copy-paste bug: the rm form previously used IIC_SSE_DIV_F64S_RM
  // for this single-precision record; pair F32S_RR with F32S_RM.
  IIC_SSE_DIV_F32S_RR, IIC_SSE_DIV_F32S_RM
>;

def SSE_DIV_F64S : OpndItins<
  IIC_SSE_DIV_F64S_RR, IIC_SSE_DIV_F64S_RM
>;
}

def SSE_DIV_ITINS_S : SizeItins<
  SSE_DIV_F32S, SSE_DIV_F64S
>;
79
// parallel
// Packed FP add/sub itineraries, scheduled as WriteFAdd.
let Sched = WriteFAdd in {
def SSE_ALU_F32P : OpndItins<
  IIC_SSE_ALU_F32P_RR, IIC_SSE_ALU_F32P_RM
>;

def SSE_ALU_F64P : OpndItins<
  IIC_SSE_ALU_F64P_RR, IIC_SSE_ALU_F64P_RM
>;
}

def SSE_ALU_ITINS_P : SizeItins<
  SSE_ALU_F32P, SSE_ALU_F64P
>;
94
// Packed FP multiply itineraries, scheduled as WriteFMul.
let Sched = WriteFMul in {
def SSE_MUL_F32P : OpndItins<
  // Fixed copy-paste bug: the rm form previously used IIC_SSE_MUL_F64P_RM
  // for this single-precision record; pair F32P_RR with F32P_RM.
  IIC_SSE_MUL_F32P_RR, IIC_SSE_MUL_F32P_RM
>;

def SSE_MUL_F64P : OpndItins<
  IIC_SSE_MUL_F64P_RR, IIC_SSE_MUL_F64P_RM
>;
}

def SSE_MUL_ITINS_P : SizeItins<
  SSE_MUL_F32P, SSE_MUL_F64P
>;
108
// Packed FP divide itineraries, scheduled as WriteFDiv.
let Sched = WriteFDiv in {
def SSE_DIV_F32P : OpndItins<
  // Fixed copy-paste bug: the rm form previously used IIC_SSE_DIV_F64P_RM
  // for this single-precision record; pair F32P_RR with F32P_RM.
  IIC_SSE_DIV_F32P_RR, IIC_SSE_DIV_F32P_RM
>;

def SSE_DIV_F64P : OpndItins<
  IIC_SSE_DIV_F64P_RR, IIC_SSE_DIV_F64P_RM
>;
}

def SSE_DIV_ITINS_P : SizeItins<
  SSE_DIV_F32P, SSE_DIV_F64P
>;
122
// Packed bitwise-logic itineraries. Two records share the same itinerary
// classes but differ in scheduler write type: SSE_VEC_BIT_ITINS_P is
// WriteVecLogic, while SSE_BIT_ITINS_P keeps the OpndItins default
// (WriteFAdd) -- presumably for FP-domain logic ops; verify against users.
let Sched = WriteVecLogic in
def SSE_VEC_BIT_ITINS_P : OpndItins<
  IIC_SSE_BIT_P_RR, IIC_SSE_BIT_P_RM
>;

def SSE_BIT_ITINS_P : OpndItins<
  IIC_SSE_BIT_P_RR, IIC_SSE_BIT_P_RM
>;
131
// Packed integer ALU itineraries (separate class for the 64-bit-element
// ALU ops), scheduled as WriteVecALU.
let Sched = WriteVecALU in {
def SSE_INTALU_ITINS_P : OpndItins<
  IIC_SSE_INTALU_P_RR, IIC_SSE_INTALU_P_RM
>;

def SSE_INTALUQ_ITINS_P : OpndItins<
  IIC_SSE_INTALUQ_P_RR, IIC_SSE_INTALUQ_P_RM
>;
}

// Packed integer multiply, scheduled as WriteVecIMul.
let Sched = WriteVecIMul in
def SSE_INTMUL_ITINS_P : OpndItins<
  IIC_SSE_INTMUL_P_RR, IIC_SSE_INTMUL_P_RM
>;

// Packed integer shifts: rr, rm, plus the ri (shift-by-immediate) form.
def SSE_INTSHIFT_ITINS_P : ShiftOpndItins<
  IIC_SSE_INTSH_P_RR, IIC_SSE_INTSH_P_RM, IIC_SSE_INTSH_P_RI
>;
150
// Aligned (MOVA) and unaligned (MOVU) packed-move itineraries, and the
// double-precision dot-product (DPPD) itinerary.
def SSE_MOVA_ITINS : OpndItins<
  IIC_SSE_MOVA_P_RR, IIC_SSE_MOVA_P_RM
>;

def SSE_MOVU_ITINS : OpndItins<
  IIC_SSE_MOVU_P_RR, IIC_SSE_MOVU_P_RM
>;

def SSE_DPPD_ITINS : OpndItins<
  IIC_SSE_DPPD_RR, IIC_SSE_DPPD_RM
>;
162
// Single-precision dot-product (DPPS) itinerary.
def SSE_DPPS_ITINS : OpndItins<
  // Fixed copy-paste bug: the rm form previously reused the DPPD class
  // (IIC_SSE_DPPD_RM); pair DPPS_RR with DPPS_RM.
  IIC_SSE_DPPS_RR, IIC_SSE_DPPS_RM
>;
166
// Generic ALU itineraries for instructions with no dedicated classes,
// plus the EXTRACTPS/INSERTPS, MPSADBW and PMULLD itineraries.
def DEFAULT_ITINS : OpndItins<
  IIC_ALU_NONMEM, IIC_ALU_MEM
>;

def SSE_EXTRACT_ITINS : OpndItins<
  IIC_SSE_EXTRACTPS_RR, IIC_SSE_EXTRACTPS_RM
>;

def SSE_INSERT_ITINS : OpndItins<
  IIC_SSE_INSERTPS_RR, IIC_SSE_INSERTPS_RM
>;

let Sched = WriteMPSAD in
def SSE_MPSADBW_ITINS : OpndItins<
  IIC_SSE_MPSADBW_RR, IIC_SSE_MPSADBW_RM
>;

let Sched = WriteVecIMul in
def SSE_PMULLD_ITINS : OpndItins<
  IIC_SSE_PMULLD_RR, IIC_SSE_PMULLD_RM
>;
188
// Definitions for backward compatibility.
// The instructions mapped on these definitions use a different itinerary
// than the actual scheduling model: the old-style itinerary stays generic
// (IIC_ALU_*) or reuses the INTALU classes, while the new-style Sched write
// type carries the accurate scheduling information.
let Sched = WriteShuffle in
def DEFAULT_ITINS_SHUFFLESCHED :  OpndItins<
  IIC_ALU_NONMEM, IIC_ALU_MEM
>;

let Sched = WriteVecIMul in
def DEFAULT_ITINS_VECIMULSCHED :  OpndItins<
  IIC_ALU_NONMEM, IIC_ALU_MEM
>;

let Sched = WriteShuffle in
def SSE_INTALU_ITINS_SHUFF_P : OpndItins<
  IIC_SSE_INTALU_P_RR, IIC_SSE_INTALU_P_RM
>;

let Sched = WriteMPSAD in
def DEFAULT_ITINS_MPSADSCHED :  OpndItins<
  IIC_ALU_NONMEM, IIC_ALU_MEM
>;

let Sched = WriteFBlend in
def DEFAULT_ITINS_FBLENDSCHED :  OpndItins<
  IIC_ALU_NONMEM, IIC_ALU_MEM
>;

let Sched = WriteBlend in
def DEFAULT_ITINS_BLENDSCHED :  OpndItins<
  IIC_ALU_NONMEM, IIC_ALU_MEM
>;

let Sched = WriteVarBlend in
def DEFAULT_ITINS_VARBLENDSCHED :  OpndItins<
  IIC_ALU_NONMEM, IIC_ALU_MEM
>;

let Sched = WriteFBlend in
def SSE_INTALU_ITINS_FBLEND_P : OpndItins<
  IIC_SSE_INTALU_P_RR, IIC_SSE_INTALU_P_RM
>;

let Sched = WriteBlend in
def SSE_INTALU_ITINS_BLEND_P : OpndItins<
  IIC_SSE_INTALU_P_RR, IIC_SSE_INTALU_P_RM
>;
236
237//===----------------------------------------------------------------------===//
238// SSE 1 & 2 Instructions Classes
239//===----------------------------------------------------------------------===//
240
/// sse12_fp_scalar - SSE 1 & 2 scalar instructions class.
/// Instantiates the rr and rm forms of a binary scalar FP op. Is2Addr
/// selects the two-operand (SSE, $src1 tied to $dst) vs. three-operand
/// (AVX) assembly string. The rr form is marked commutable; the rm form
/// folds a scalar load into $src2.
multiclass sse12_fp_scalar<bits<8> opc, string OpcodeStr, SDNode OpNode,
                           RegisterClass RC, X86MemOperand x86memop,
                           Domain d, OpndItins itins, bit Is2Addr = 1> {
  let isCommutable = 1 in {
    def rr : SI<opc, MRMSrcReg, (outs RC:$dst), (ins RC:$src1, RC:$src2),
       !if(Is2Addr,
           !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
           !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
       [(set RC:$dst, (OpNode RC:$src1, RC:$src2))], itins.rr, d>,
       Sched<[itins.Sched]>;
  }
  // Load-folded form: second operand comes from memory.
  def rm : SI<opc, MRMSrcMem, (outs RC:$dst), (ins RC:$src1, x86memop:$src2),
       !if(Is2Addr,
           !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
           !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
       [(set RC:$dst, (OpNode RC:$src1, (load addr:$src2)))], itins.rm, d>,
       Sched<[itins.Sched.Folded, ReadAfterLd]>;
}
260
/// sse12_fp_scalar_int - SSE 1 & 2 scalar instructions intrinsics class.
/// Like sse12_fp_scalar, but the patterns match the corresponding intrinsic,
/// whose name is assembled at TableGen time from SSEVer/OpcodeStr/FPSizeStr
/// (e.g. int_x86_sse2_add_sd). These defs are isCodeGenOnly since they
/// duplicate the encodings of the non-intrinsic forms.
multiclass sse12_fp_scalar_int<bits<8> opc, string OpcodeStr, RegisterClass RC,
                             string asm, string SSEVer, string FPSizeStr,
                             Operand memopr, ComplexPattern mem_cpat,
                             Domain d, OpndItins itins, bit Is2Addr = 1> {
let isCodeGenOnly = 1 in {
  def rr_Int : SI_Int<opc, MRMSrcReg, (outs RC:$dst), (ins RC:$src1, RC:$src2),
       !if(Is2Addr,
           !strconcat(asm, "\t{$src2, $dst|$dst, $src2}"),
           !strconcat(asm, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
       [(set RC:$dst, (!cast<Intrinsic>(
                 !strconcat("int_x86_sse", SSEVer, "_", OpcodeStr, FPSizeStr))
             RC:$src1, RC:$src2))], itins.rr, d>,
       Sched<[itins.Sched]>;
  // Memory form: $src2 matched through the supplied ComplexPattern.
  def rm_Int : SI_Int<opc, MRMSrcMem, (outs RC:$dst), (ins RC:$src1, memopr:$src2),
       !if(Is2Addr,
           !strconcat(asm, "\t{$src2, $dst|$dst, $src2}"),
           !strconcat(asm, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
       [(set RC:$dst, (!cast<Intrinsic>(!strconcat("int_x86_sse",
                                          SSEVer, "_", OpcodeStr, FPSizeStr))
             RC:$src1, mem_cpat:$src2))], itins.rm, d>,
       Sched<[itins.Sched.Folded, ReadAfterLd]>;
}
}
285
/// sse12_fp_packed - SSE 1 & 2 packed instructions class.
/// Instantiates the rr and rm forms of a binary packed FP op over vector
/// type vt, with the memory operand matched through mem_frag.
multiclass sse12_fp_packed<bits<8> opc, string OpcodeStr, SDNode OpNode,
                           RegisterClass RC, ValueType vt,
                           X86MemOperand x86memop, PatFrag mem_frag,
                           Domain d, OpndItins itins, bit Is2Addr = 1> {
  let isCommutable = 1 in
    def rr : PI<opc, MRMSrcReg, (outs RC:$dst), (ins RC:$src1, RC:$src2),
       !if(Is2Addr,
           !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
           !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
       [(set RC:$dst, (vt (OpNode RC:$src1, RC:$src2)))], itins.rr, d>,
       Sched<[itins.Sched]>;
  // Load-folded form; marked mayLoad explicitly.
  let mayLoad = 1 in
    def rm : PI<opc, MRMSrcMem, (outs RC:$dst), (ins RC:$src1, x86memop:$src2),
       !if(Is2Addr,
           !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
           !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
       [(set RC:$dst, (OpNode RC:$src1, (mem_frag addr:$src2)))],
          itins.rm, d>,
       Sched<[itins.Sched.Folded, ReadAfterLd]>;
}
307
/// sse12_fp_packed_logical_rm - SSE 1 & 2 packed instructions class.
/// Variant for the packed logical ops: the caller supplies the selection
/// patterns (pat_rr/pat_rm) directly, no itinerary is attached
/// (NoItinerary), and scheduling uses the vector-logic write types.
multiclass sse12_fp_packed_logical_rm<bits<8> opc, RegisterClass RC, Domain d,
                                      string OpcodeStr, X86MemOperand x86memop,
                                      list<dag> pat_rr, list<dag> pat_rm,
                                      bit Is2Addr = 1> {
  // hasSideEffects = 0: pat_rr may be empty, so state this explicitly.
  let isCommutable = 1, hasSideEffects = 0 in
    def rr : PI<opc, MRMSrcReg, (outs RC:$dst), (ins RC:$src1, RC:$src2),
       !if(Is2Addr,
           !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
           !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
       pat_rr, NoItinerary, d>,
       Sched<[WriteVecLogic]>;
  def rm : PI<opc, MRMSrcMem, (outs RC:$dst), (ins RC:$src1, x86memop:$src2),
       !if(Is2Addr,
           !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
           !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
       pat_rm, NoItinerary, d>,
       Sched<[WriteVecLogicLd, ReadAfterLd]>;
}
327
328//===----------------------------------------------------------------------===//
329//  Non-instruction patterns
330//===----------------------------------------------------------------------===//
331
// A vector extract of the first f32/f64 position is a subregister copy;
// no instruction is emitted.
def : Pat<(f32 (extractelt (v4f32 VR128:$src), (iPTR 0))),
          (COPY_TO_REGCLASS (v4f32 VR128:$src), FR32)>;
def : Pat<(f64 (extractelt (v2f64 VR128:$src), (iPTR 0))),
          (COPY_TO_REGCLASS (v2f64 VR128:$src), FR64)>;

// A 128-bit subvector extract from the first 256-bit vector position
// is a subregister copy that needs no instruction.
def : Pat<(v4i32 (extract_subvector (v8i32 VR256:$src), (iPTR 0))),
          (v4i32 (EXTRACT_SUBREG (v8i32 VR256:$src), sub_xmm))>;
def : Pat<(v4f32 (extract_subvector (v8f32 VR256:$src), (iPTR 0))),
          (v4f32 (EXTRACT_SUBREG (v8f32 VR256:$src), sub_xmm))>;

def : Pat<(v2i64 (extract_subvector (v4i64 VR256:$src), (iPTR 0))),
          (v2i64 (EXTRACT_SUBREG (v4i64 VR256:$src), sub_xmm))>;
def : Pat<(v2f64 (extract_subvector (v4f64 VR256:$src), (iPTR 0))),
          (v2f64 (EXTRACT_SUBREG (v4f64 VR256:$src), sub_xmm))>;

def : Pat<(v8i16 (extract_subvector (v16i16 VR256:$src), (iPTR 0))),
          (v8i16 (EXTRACT_SUBREG (v16i16 VR256:$src), sub_xmm))>;
def : Pat<(v16i8 (extract_subvector (v32i8 VR256:$src), (iPTR 0))),
          (v16i8 (EXTRACT_SUBREG (v32i8 VR256:$src), sub_xmm))>;
354
// A 128-bit subvector insert to the first 256-bit vector position
// is a subregister copy that needs no instruction.
let AddedComplexity = 25 in { // to give priority over vinsertf128rm
def : Pat<(insert_subvector undef, (v2i64 VR128:$src), (iPTR 0)),
          (INSERT_SUBREG (v4i64 (IMPLICIT_DEF)), VR128:$src, sub_xmm)>;
def : Pat<(insert_subvector undef, (v2f64 VR128:$src), (iPTR 0)),
          (INSERT_SUBREG (v4f64 (IMPLICIT_DEF)), VR128:$src, sub_xmm)>;
def : Pat<(insert_subvector undef, (v4i32 VR128:$src), (iPTR 0)),
          (INSERT_SUBREG (v8i32 (IMPLICIT_DEF)), VR128:$src, sub_xmm)>;
def : Pat<(insert_subvector undef, (v4f32 VR128:$src), (iPTR 0)),
          (INSERT_SUBREG (v8f32 (IMPLICIT_DEF)), VR128:$src, sub_xmm)>;
def : Pat<(insert_subvector undef, (v8i16 VR128:$src), (iPTR 0)),
          (INSERT_SUBREG (v16i16 (IMPLICIT_DEF)), VR128:$src, sub_xmm)>;
def : Pat<(insert_subvector undef, (v16i8 VR128:$src), (iPTR 0)),
          (INSERT_SUBREG (v32i8 (IMPLICIT_DEF)), VR128:$src, sub_xmm)>;
}
371
// Implicitly promote a 32-bit scalar to a vector: a plain regclass copy,
// since FR32 registers alias the low lane of the XMM registers.
def : Pat<(v4f32 (scalar_to_vector FR32:$src)),
          (COPY_TO_REGCLASS FR32:$src, VR128)>;
def : Pat<(v8f32 (scalar_to_vector FR32:$src)),
          (COPY_TO_REGCLASS FR32:$src, VR128)>;
// Implicitly promote a 64-bit scalar to a vector.
def : Pat<(v2f64 (scalar_to_vector FR64:$src)),
          (COPY_TO_REGCLASS FR64:$src, VR128)>;
def : Pat<(v4f64 (scalar_to_vector FR64:$src)),
          (COPY_TO_REGCLASS FR64:$src, VR128)>;
382
// Bitcasts between 128-bit vector types. Return the original type since
// no instruction is needed for the conversion (all pairs of the five
// 128-bit vector types, plus f128<->i128).
let Predicates = [HasSSE2] in {
  def : Pat<(v2i64 (bitconvert (v4i32 VR128:$src))), (v2i64 VR128:$src)>;
  def : Pat<(v2i64 (bitconvert (v8i16 VR128:$src))), (v2i64 VR128:$src)>;
  def : Pat<(v2i64 (bitconvert (v16i8 VR128:$src))), (v2i64 VR128:$src)>;
  def : Pat<(v2i64 (bitconvert (v2f64 VR128:$src))), (v2i64 VR128:$src)>;
  def : Pat<(v2i64 (bitconvert (v4f32 VR128:$src))), (v2i64 VR128:$src)>;
  def : Pat<(v4i32 (bitconvert (v2i64 VR128:$src))), (v4i32 VR128:$src)>;
  def : Pat<(v4i32 (bitconvert (v8i16 VR128:$src))), (v4i32 VR128:$src)>;
  def : Pat<(v4i32 (bitconvert (v16i8 VR128:$src))), (v4i32 VR128:$src)>;
  def : Pat<(v4i32 (bitconvert (v2f64 VR128:$src))), (v4i32 VR128:$src)>;
  def : Pat<(v4i32 (bitconvert (v4f32 VR128:$src))), (v4i32 VR128:$src)>;
  def : Pat<(v8i16 (bitconvert (v2i64 VR128:$src))), (v8i16 VR128:$src)>;
  def : Pat<(v8i16 (bitconvert (v4i32 VR128:$src))), (v8i16 VR128:$src)>;
  def : Pat<(v8i16 (bitconvert (v16i8 VR128:$src))), (v8i16 VR128:$src)>;
  def : Pat<(v8i16 (bitconvert (v2f64 VR128:$src))), (v8i16 VR128:$src)>;
  def : Pat<(v8i16 (bitconvert (v4f32 VR128:$src))), (v8i16 VR128:$src)>;
  def : Pat<(v16i8 (bitconvert (v2i64 VR128:$src))), (v16i8 VR128:$src)>;
  def : Pat<(v16i8 (bitconvert (v4i32 VR128:$src))), (v16i8 VR128:$src)>;
  def : Pat<(v16i8 (bitconvert (v8i16 VR128:$src))), (v16i8 VR128:$src)>;
  def : Pat<(v16i8 (bitconvert (v2f64 VR128:$src))), (v16i8 VR128:$src)>;
  def : Pat<(v16i8 (bitconvert (v4f32 VR128:$src))), (v16i8 VR128:$src)>;
  def : Pat<(v4f32 (bitconvert (v2i64 VR128:$src))), (v4f32 VR128:$src)>;
  def : Pat<(v4f32 (bitconvert (v4i32 VR128:$src))), (v4f32 VR128:$src)>;
  def : Pat<(v4f32 (bitconvert (v8i16 VR128:$src))), (v4f32 VR128:$src)>;
  def : Pat<(v4f32 (bitconvert (v16i8 VR128:$src))), (v4f32 VR128:$src)>;
  def : Pat<(v4f32 (bitconvert (v2f64 VR128:$src))), (v4f32 VR128:$src)>;
  def : Pat<(v2f64 (bitconvert (v2i64 VR128:$src))), (v2f64 VR128:$src)>;
  def : Pat<(v2f64 (bitconvert (v4i32 VR128:$src))), (v2f64 VR128:$src)>;
  def : Pat<(v2f64 (bitconvert (v8i16 VR128:$src))), (v2f64 VR128:$src)>;
  def : Pat<(v2f64 (bitconvert (v16i8 VR128:$src))), (v2f64 VR128:$src)>;
  def : Pat<(v2f64 (bitconvert (v4f32 VR128:$src))), (v2f64 VR128:$src)>;
  def : Pat<(f128  (bitconvert (i128  FR128:$src))), (f128  FR128:$src)>;
  def : Pat<(i128  (bitconvert (f128  FR128:$src))), (i128  FR128:$src)>;
}
419
// Bitcasts between 256-bit vector types. Return the original type since
// no instruction is needed for the conversion (all pairs of the six
// 256-bit vector types).
let Predicates = [HasAVX] in {
  def : Pat<(v4f64  (bitconvert (v8f32 VR256:$src))),  (v4f64 VR256:$src)>;
  def : Pat<(v4f64  (bitconvert (v8i32 VR256:$src))),  (v4f64 VR256:$src)>;
  def : Pat<(v4f64  (bitconvert (v4i64 VR256:$src))),  (v4f64 VR256:$src)>;
  def : Pat<(v4f64  (bitconvert (v16i16 VR256:$src))), (v4f64 VR256:$src)>;
  def : Pat<(v4f64  (bitconvert (v32i8 VR256:$src))),  (v4f64 VR256:$src)>;
  def : Pat<(v8f32  (bitconvert (v8i32 VR256:$src))),  (v8f32 VR256:$src)>;
  def : Pat<(v8f32  (bitconvert (v4i64 VR256:$src))),  (v8f32 VR256:$src)>;
  def : Pat<(v8f32  (bitconvert (v4f64 VR256:$src))),  (v8f32 VR256:$src)>;
  def : Pat<(v8f32  (bitconvert (v32i8 VR256:$src))),  (v8f32 VR256:$src)>;
  def : Pat<(v8f32  (bitconvert (v16i16 VR256:$src))), (v8f32 VR256:$src)>;
  def : Pat<(v4i64  (bitconvert (v8f32 VR256:$src))),  (v4i64 VR256:$src)>;
  def : Pat<(v4i64  (bitconvert (v8i32 VR256:$src))),  (v4i64 VR256:$src)>;
  def : Pat<(v4i64  (bitconvert (v4f64 VR256:$src))),  (v4i64 VR256:$src)>;
  def : Pat<(v4i64  (bitconvert (v32i8 VR256:$src))),  (v4i64 VR256:$src)>;
  def : Pat<(v4i64  (bitconvert (v16i16 VR256:$src))), (v4i64 VR256:$src)>;
  def : Pat<(v32i8  (bitconvert (v4f64 VR256:$src))),  (v32i8 VR256:$src)>;
  def : Pat<(v32i8  (bitconvert (v4i64 VR256:$src))),  (v32i8 VR256:$src)>;
  def : Pat<(v32i8  (bitconvert (v8f32 VR256:$src))),  (v32i8 VR256:$src)>;
  def : Pat<(v32i8  (bitconvert (v8i32 VR256:$src))),  (v32i8 VR256:$src)>;
  def : Pat<(v32i8  (bitconvert (v16i16 VR256:$src))), (v32i8 VR256:$src)>;
  def : Pat<(v8i32  (bitconvert (v32i8 VR256:$src))),  (v8i32 VR256:$src)>;
  def : Pat<(v8i32  (bitconvert (v16i16 VR256:$src))), (v8i32 VR256:$src)>;
  def : Pat<(v8i32  (bitconvert (v8f32 VR256:$src))),  (v8i32 VR256:$src)>;
  def : Pat<(v8i32  (bitconvert (v4i64 VR256:$src))),  (v8i32 VR256:$src)>;
  def : Pat<(v8i32  (bitconvert (v4f64 VR256:$src))),  (v8i32 VR256:$src)>;
  def : Pat<(v16i16 (bitconvert (v8f32 VR256:$src))),  (v16i16 VR256:$src)>;
  def : Pat<(v16i16 (bitconvert (v8i32 VR256:$src))),  (v16i16 VR256:$src)>;
  def : Pat<(v16i16 (bitconvert (v4i64 VR256:$src))),  (v16i16 VR256:$src)>;
  def : Pat<(v16i16 (bitconvert (v4f64 VR256:$src))),  (v16i16 VR256:$src)>;
  def : Pat<(v16i16 (bitconvert (v32i8 VR256:$src))),  (v16i16 VR256:$src)>;
}
454
// Alias instructions that map fld0 to xorps for sse or vxorps for avx.
// This is expanded by ExpandPostRAPseudos. Rematerializable so the register
// allocator can recreate the zero instead of spilling it.
let isReMaterializable = 1, isAsCheapAsAMove = 1, canFoldAsLoad = 1,
    isPseudo = 1, SchedRW = [WriteZero] in {
  def FsFLD0SS : I<0, Pseudo, (outs FR32:$dst), (ins), "",
                   [(set FR32:$dst, fp32imm0)]>, Requires<[HasSSE1]>;
  def FsFLD0SD : I<0, Pseudo, (outs FR64:$dst), (ins), "",
                   [(set FR64:$dst, fpimm0)]>, Requires<[HasSSE2]>;
}
464
465//===----------------------------------------------------------------------===//
466// AVX & SSE - Zero/One Vectors
467//===----------------------------------------------------------------------===//
468
// Alias instruction that maps zero vector to pxor / xorp* for sse.
// This is expanded by ExpandPostRAPseudos to an xorps / vxorps, and then
// swizzled by ExecutionDepsFix to pxor.
// We set canFoldAsLoad because this can be converted to a constant-pool
// load of an all-zeros value if folding it would be beneficial.
let isReMaterializable = 1, isAsCheapAsAMove = 1, canFoldAsLoad = 1,
    isPseudo = 1, SchedRW = [WriteZero] in {
def V_SET0 : I<0, Pseudo, (outs VR128:$dst), (ins), "",
               [(set VR128:$dst, (v4f32 immAllZerosV))]>;
}

// V_SET0 zeroes the whole register, so the same pseudo serves every
// 128-bit all-zeros vector type.
def : Pat<(v2f64 immAllZerosV), (V_SET0)>;
def : Pat<(v4i32 immAllZerosV), (V_SET0)>;
def : Pat<(v2i64 immAllZerosV), (V_SET0)>;
def : Pat<(v8i16 immAllZerosV), (V_SET0)>;
def : Pat<(v16i8 immAllZerosV), (V_SET0)>;
485
486
// The same as done above but for AVX.  The 256-bit AVX1 ISA doesn't support PI,
// and doesn't need it because on sandy bridge the register is set to zero
// at the rename stage without using any execution unit, so SET0PSY
// and SET0PDY can be used for vector int instructions without penalty
let isReMaterializable = 1, isAsCheapAsAMove = 1, canFoldAsLoad = 1,
    isPseudo = 1, Predicates = [HasAVX], SchedRW = [WriteZero] in {
def AVX_SET0 : I<0, Pseudo, (outs VR256:$dst), (ins), "",
                 [(set VR256:$dst, (v8f32 immAllZerosV))]>;
}

let Predicates = [HasAVX] in
  def : Pat<(v4f64 immAllZerosV), (AVX_SET0)>;

// The 256-bit integer all-zeros types use AVX_SET0 only when AVX2 integer
// instructions are available; AVX1-only targets use the patterns below.
let Predicates = [HasAVX2] in {
  def : Pat<(v4i64 immAllZerosV), (AVX_SET0)>;
  def : Pat<(v8i32 immAllZerosV), (AVX_SET0)>;
  def : Pat<(v16i16 immAllZerosV), (AVX_SET0)>;
  def : Pat<(v32i8 immAllZerosV), (AVX_SET0)>;
}
506
// AVX1 has no support for 256-bit integer instructions, but since the 128-bit
// VPXOR instruction writes zero to its upper part, it's safe to build zeros
// from the 128-bit V_SET0 widened with SUBREG_TO_REG.
let Predicates = [HasAVX1Only] in {
def : Pat<(v32i8 immAllZerosV), (SUBREG_TO_REG (i8 0), (V_SET0), sub_xmm)>;
def : Pat<(bc_v32i8 (v8f32 immAllZerosV)),
          (SUBREG_TO_REG (i8 0), (V_SET0), sub_xmm)>;

def : Pat<(v16i16 immAllZerosV), (SUBREG_TO_REG (i16 0), (V_SET0), sub_xmm)>;
def : Pat<(bc_v16i16 (v8f32 immAllZerosV)),
          (SUBREG_TO_REG (i16 0), (V_SET0), sub_xmm)>;

def : Pat<(v8i32 immAllZerosV), (SUBREG_TO_REG (i32 0), (V_SET0), sub_xmm)>;
def : Pat<(bc_v8i32 (v8f32 immAllZerosV)),
          (SUBREG_TO_REG (i32 0), (V_SET0), sub_xmm)>;

def : Pat<(v4i64 immAllZerosV), (SUBREG_TO_REG (i64 0), (V_SET0), sub_xmm)>;
def : Pat<(bc_v4i64 (v8f32 immAllZerosV)),
          (SUBREG_TO_REG (i64 0), (V_SET0), sub_xmm)>;
}
526
// All-ones vector pseudos (expanded post-RA, like V_SET0 above).
// We set canFoldAsLoad because this can be converted to a constant-pool
// load of an all-ones value if folding it would be beneficial.
let isReMaterializable = 1, isAsCheapAsAMove = 1, canFoldAsLoad = 1,
    isPseudo = 1, SchedRW = [WriteZero] in {
  def V_SETALLONES : I<0, Pseudo, (outs VR128:$dst), (ins), "",
                       [(set VR128:$dst, (v4i32 immAllOnesV))]>;
  // 256-bit variant requires AVX2 integer instructions.
  let Predicates = [HasAVX2] in
  def AVX2_SETALLONES : I<0, Pseudo, (outs VR256:$dst), (ins), "",
                          [(set VR256:$dst, (v8i32 immAllOnesV))]>;
}
537
538
539//===----------------------------------------------------------------------===//
540// SSE 1 & 2 - Move FP Scalar Instructions
541//
542// Move Instructions. Register-to-register movss/movsd is not used for FR32/64
543// register copies because it's a partial register update; Register-to-register
544// movss/movsd is not modeled as an INSERT_SUBREG because INSERT_SUBREG requires
545// that the insert be implementable in terms of a copy, and just mentioned, we
546// don't use movss/movsd for copies.
547//===----------------------------------------------------------------------===//
548
/// sse12_move_rr - register-register MOVSS/MOVSD forms.
/// The rr def merges a scalar (RC:$src2) into the low element of
/// VR128:$src1 via OpNode; rr_REV is the store-form encoding (0x11),
/// emitted only for the disassembler.
multiclass sse12_move_rr<RegisterClass RC, SDNode OpNode, ValueType vt,
                         X86MemOperand x86memop, string base_opc,
                         string asm_opr, Domain d = GenericDomain> {
  def rr : SI<0x10, MRMSrcReg, (outs VR128:$dst),
              (ins VR128:$src1, RC:$src2),
              !strconcat(base_opc, asm_opr),
              [(set VR128:$dst, (vt (OpNode VR128:$src1,
                                 (scalar_to_vector RC:$src2))))],
              IIC_SSE_MOV_S_RR, d>, Sched<[WriteFShuffle]>;

  // For the disassembler
  let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0 in
  def rr_REV : SI<0x11, MRMDestReg, (outs VR128:$dst),
                  (ins VR128:$src1, RC:$src2),
                  !strconcat(base_opc, asm_opr),
                  [], IIC_SSE_MOV_S_RR>, Sched<[WriteFShuffle]>;
}
566
/// sse12_move - instantiates both the AVX (V-prefixed, three-operand) and
/// SSE (two-operand, $src1 tied to $dst) register forms via sse12_move_rr,
/// plus the register-to-memory store (mr) form for each.
multiclass sse12_move<RegisterClass RC, SDNode OpNode, ValueType vt,
                      X86MemOperand x86memop, string OpcodeStr,
                      Domain d = GenericDomain> {
  // AVX
  defm V#NAME : sse12_move_rr<RC, OpNode, vt, x86memop, OpcodeStr,
                              "\t{$src2, $src1, $dst|$dst, $src1, $src2}", d>,
                              VEX_4V, VEX_LIG;

  def V#NAME#mr : SI<0x11, MRMDestMem, (outs), (ins x86memop:$dst, RC:$src),
                     !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
                     [(store RC:$src, addr:$dst)], IIC_SSE_MOV_S_MR, d>,
                     VEX, VEX_LIG, Sched<[WriteStore]>;
  // SSE1 & 2
  let Constraints = "$src1 = $dst" in {
    defm NAME : sse12_move_rr<RC, OpNode, vt, x86memop, OpcodeStr,
                              "\t{$src2, $dst|$dst, $src2}", d>;
  }

  def NAME#mr   : SI<0x11, MRMDestMem, (outs), (ins x86memop:$dst, RC:$src),
                     !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
                     [(store RC:$src, addr:$dst)], IIC_SSE_MOV_S_MR, d>,
                  Sched<[WriteStore]>;
}
590
// Loading from memory automatically zeroing upper bits.
// Instantiates the AVX (V#NAME#rm) and SSE (NAME#rm) scalar-load forms.
multiclass sse12_move_rm<RegisterClass RC, X86MemOperand x86memop,
                         PatFrag mem_pat, string OpcodeStr,
                         Domain d = GenericDomain> {
  def V#NAME#rm : SI<0x10, MRMSrcMem, (outs RC:$dst), (ins x86memop:$src),
                     !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
                     [(set RC:$dst, (mem_pat addr:$src))],
                     IIC_SSE_MOV_S_RM, d>, VEX, VEX_LIG, Sched<[WriteLoad]>;
  def NAME#rm   : SI<0x10, MRMSrcMem, (outs RC:$dst), (ins x86memop:$src),
                     !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
                     [(set RC:$dst, (mem_pat addr:$src))],
                     IIC_SSE_MOV_S_RM, d>, Sched<[WriteLoad]>;
}
604
// Instantiate MOVSS/MOVSD (and their V-prefixed AVX forms) from the
// multiclasses above: register/store forms first, then the load forms.
defm MOVSS : sse12_move<FR32, X86Movss, v4f32, f32mem, "movss",
                        SSEPackedSingle>, XS;
defm MOVSD : sse12_move<FR64, X86Movsd, v2f64, f64mem, "movsd",
                        SSEPackedDouble>, XD;

let canFoldAsLoad = 1, isReMaterializable = 1 in {
  defm MOVSS : sse12_move_rm<FR32, f32mem, loadf32, "movss",
                             SSEPackedSingle>, XS;

  let AddedComplexity = 20 in
    defm MOVSD : sse12_move_rm<FR64, f64mem, loadf64, "movsd",
                               SSEPackedDouble>, XD;
}
618
// Patterns
let Predicates = [UseAVX] in {
  let AddedComplexity = 20 in {
  // MOVSSrm zeros the high parts of the register, so a zero-extending
  // scalar load is just the load plus a regclass copy. (The AVX versions
  // also write: DST[255:128] <- 0.)
  def : Pat<(v4f32 (X86vzmovl (v4f32 (scalar_to_vector (loadf32 addr:$src))))),
            (COPY_TO_REGCLASS (VMOVSSrm addr:$src), VR128)>;
  def : Pat<(v4f32 (scalar_to_vector (loadf32 addr:$src))),
            (COPY_TO_REGCLASS (VMOVSSrm addr:$src), VR128)>;
  def : Pat<(v4f32 (X86vzmovl (loadv4f32 addr:$src))),
            (COPY_TO_REGCLASS (VMOVSSrm addr:$src), VR128)>;

  // MOVSDrm zeros the high parts of the register; same treatment for the
  // double-precision forms, including the bitcast and vzload variants.
  def : Pat<(v2f64 (X86vzmovl (v2f64 (scalar_to_vector (loadf64 addr:$src))))),
            (COPY_TO_REGCLASS (VMOVSDrm addr:$src), VR128)>;
  def : Pat<(v2f64 (scalar_to_vector (loadf64 addr:$src))),
            (COPY_TO_REGCLASS (VMOVSDrm addr:$src), VR128)>;
  def : Pat<(v2f64 (X86vzmovl (loadv2f64 addr:$src))),
            (COPY_TO_REGCLASS (VMOVSDrm addr:$src), VR128)>;
  def : Pat<(v2f64 (X86vzmovl (bc_v2f64 (loadv4f32 addr:$src)))),
            (COPY_TO_REGCLASS (VMOVSDrm addr:$src), VR128)>;
  def : Pat<(v2f64 (X86vzload addr:$src)),
            (COPY_TO_REGCLASS (VMOVSDrm addr:$src), VR128)>;

  // Represent the same patterns above but in the form they appear for
  // 256-bit types: widen the 128-bit load result with SUBREG_TO_REG.
  def : Pat<(v8f32 (X86vzmovl (insert_subvector undef,
                   (v4f32 (scalar_to_vector (loadf32 addr:$src))), (iPTR 0)))),
            (SUBREG_TO_REG (i32 0), (VMOVSSrm addr:$src), sub_xmm)>;
  def : Pat<(v4f64 (X86vzmovl (insert_subvector undef,
                   (v2f64 (scalar_to_vector (loadf64 addr:$src))), (iPTR 0)))),
            (SUBREG_TO_REG (i32 0), (VMOVSDrm addr:$src), sub_xmm)>;
  }

  // Extract and store.
  def : Pat<(store (f32 (extractelt (v4f32 VR128:$src), (iPTR 0))),
                   addr:$dst),
            (VMOVSSmr addr:$dst, (COPY_TO_REGCLASS (v4f32 VR128:$src), FR32))>;
  def : Pat<(store (f64 (extractelt (v2f64 VR128:$src), (iPTR 0))),
                   addr:$dst),
            (VMOVSDmr addr:$dst, (COPY_TO_REGCLASS (v2f64 VR128:$src), FR64))>;

  // Shuffle with VMOVSS
  def : Pat<(v4i32 (X86Movss VR128:$src1, VR128:$src2)),
            (VMOVSSrr (v4i32 VR128:$src1),
                      (COPY_TO_REGCLASS (v4i32 VR128:$src2), FR32))>;
  def : Pat<(v4f32 (X86Movss VR128:$src1, VR128:$src2)),
            (VMOVSSrr (v4f32 VR128:$src1),
                      (COPY_TO_REGCLASS (v4f32 VR128:$src2), FR32))>;

  // 256-bit variants: operate on the low 128 bits, then re-widen.
  def : Pat<(v8i32 (X86Movss VR256:$src1, VR256:$src2)),
            (SUBREG_TO_REG (i32 0),
              (VMOVSSrr (EXTRACT_SUBREG (v8i32 VR256:$src1), sub_xmm),
                        (EXTRACT_SUBREG (v8i32 VR256:$src2), sub_xmm)),
              sub_xmm)>;
  def : Pat<(v8f32 (X86Movss VR256:$src1, VR256:$src2)),
            (SUBREG_TO_REG (i32 0),
              (VMOVSSrr (EXTRACT_SUBREG (v8f32 VR256:$src1), sub_xmm),
                        (EXTRACT_SUBREG (v8f32 VR256:$src2), sub_xmm)),
              sub_xmm)>;

  // Shuffle with VMOVSD
  def : Pat<(v2i64 (X86Movsd VR128:$src1, VR128:$src2)),
            (VMOVSDrr VR128:$src1, (COPY_TO_REGCLASS VR128:$src2, FR64))>;
  def : Pat<(v2f64 (X86Movsd VR128:$src1, VR128:$src2)),
            (VMOVSDrr VR128:$src1, (COPY_TO_REGCLASS VR128:$src2, FR64))>;
  def : Pat<(v4f32 (X86Movsd VR128:$src1, VR128:$src2)),
            (VMOVSDrr VR128:$src1, (COPY_TO_REGCLASS VR128:$src2, FR64))>;
  def : Pat<(v4i32 (X86Movsd VR128:$src1, VR128:$src2)),
            (VMOVSDrr VR128:$src1, (COPY_TO_REGCLASS VR128:$src2, FR64))>;

  // 256-bit variants
  def : Pat<(v4i64 (X86Movsd VR256:$src1, VR256:$src2)),
            (SUBREG_TO_REG (i32 0),
              (VMOVSDrr (EXTRACT_SUBREG (v4i64 VR256:$src1), sub_xmm),
                        (EXTRACT_SUBREG (v4i64 VR256:$src2), sub_xmm)),
              sub_xmm)>;
  def : Pat<(v4f64 (X86Movsd VR256:$src1, VR256:$src2)),
            (SUBREG_TO_REG (i32 0),
              (VMOVSDrr (EXTRACT_SUBREG (v4f64 VR256:$src1), sub_xmm),
                        (EXTRACT_SUBREG (v4f64 VR256:$src2), sub_xmm)),
              sub_xmm)>;

  // FIXME: Instead of a X86Movlps there should be a X86Movsd here, the problem
  // is during lowering, where it's not possible to recognize the fold because
  // it has two uses through a bitcast. One use disappears at isel time and the
  // fold opportunity reappears.
  def : Pat<(v2f64 (X86Movlpd VR128:$src1, VR128:$src2)),
            (VMOVSDrr VR128:$src1, (COPY_TO_REGCLASS VR128:$src2, FR64))>;
  def : Pat<(v2i64 (X86Movlpd VR128:$src1, VR128:$src2)),
            (VMOVSDrr VR128:$src1, (COPY_TO_REGCLASS VR128:$src2, FR64))>;
  def : Pat<(v4f32 (X86Movlps VR128:$src1, VR128:$src2)),
            (VMOVSDrr VR128:$src1, (COPY_TO_REGCLASS VR128:$src2, FR64))>;
  def : Pat<(v4i32 (X86Movlps VR128:$src1, VR128:$src2)),
            (VMOVSDrr VR128:$src1, (COPY_TO_REGCLASS VR128:$src2, FR64))>;
}
717
let Predicates = [UseSSE1] in {
  // NOTE: a nested `let Predicates` REPLACES the outer predicate list rather
  // than merging with it, so UseSSE1 must be restated here.  With only
  // [NoSSE41] these patterns would also fire on AVX targets, selecting the
  // non-VEX MOVSS.
  let Predicates = [UseSSE1, NoSSE41], AddedComplexity = 15 in {
  // Move scalar to XMM zero-extended, zeroing a VR128 then do a
  // MOVSS to the lower bits.
  def : Pat<(v4f32 (X86vzmovl (v4f32 (scalar_to_vector FR32:$src)))),
            (MOVSSrr (v4f32 (V_SET0)), FR32:$src)>;
  def : Pat<(v4f32 (X86vzmovl (v4f32 VR128:$src))),
            (MOVSSrr (v4f32 (V_SET0)), (COPY_TO_REGCLASS VR128:$src, FR32))>;
  def : Pat<(v4i32 (X86vzmovl (v4i32 VR128:$src))),
            (MOVSSrr (v4i32 (V_SET0)), (COPY_TO_REGCLASS VR128:$src, FR32))>;
  }

  let AddedComplexity = 20 in {
  // MOVSSrm already zeros the high parts of the register.
  def : Pat<(v4f32 (X86vzmovl (v4f32 (scalar_to_vector (loadf32 addr:$src))))),
            (COPY_TO_REGCLASS (MOVSSrm addr:$src), VR128)>;
  def : Pat<(v4f32 (scalar_to_vector (loadf32 addr:$src))),
            (COPY_TO_REGCLASS (MOVSSrm addr:$src), VR128)>;
  def : Pat<(v4f32 (X86vzmovl (loadv4f32 addr:$src))),
            (COPY_TO_REGCLASS (MOVSSrm addr:$src), VR128)>;
  }

  // Extract and store the low f32 element.
  def : Pat<(store (f32 (extractelt (v4f32 VR128:$src), (iPTR 0))),
                   addr:$dst),
            (MOVSSmr addr:$dst, (COPY_TO_REGCLASS VR128:$src, FR32))>;

  // Shuffle with MOVSS
  def : Pat<(v4i32 (X86Movss VR128:$src1, VR128:$src2)),
            (MOVSSrr VR128:$src1, (COPY_TO_REGCLASS VR128:$src2, FR32))>;
  def : Pat<(v4f32 (X86Movss VR128:$src1, VR128:$src2)),
            (MOVSSrr VR128:$src1, (COPY_TO_REGCLASS VR128:$src2, FR32))>;
}
751
let Predicates = [UseSSE2] in {
  // NOTE: a nested `let Predicates` REPLACES the outer predicate list rather
  // than merging with it, so UseSSE2 must be restated here.  With only
  // [NoSSE41] this pattern would also fire on AVX targets, selecting the
  // non-VEX MOVSD.
  let Predicates = [UseSSE2, NoSSE41], AddedComplexity = 15 in {
  // Move scalar to XMM zero-extended, zeroing a VR128 then do a
  // MOVSD to the lower bits.
  def : Pat<(v2f64 (X86vzmovl (v2f64 (scalar_to_vector FR64:$src)))),
            (MOVSDrr (v2f64 (V_SET0)), FR64:$src)>;
  }

  let AddedComplexity = 20 in {
  // MOVSDrm already zeros the high parts of the register.
  def : Pat<(v2f64 (X86vzmovl (v2f64 (scalar_to_vector (loadf64 addr:$src))))),
            (COPY_TO_REGCLASS (MOVSDrm addr:$src), VR128)>;
  def : Pat<(v2f64 (scalar_to_vector (loadf64 addr:$src))),
            (COPY_TO_REGCLASS (MOVSDrm addr:$src), VR128)>;
  def : Pat<(v2f64 (X86vzmovl (loadv2f64 addr:$src))),
            (COPY_TO_REGCLASS (MOVSDrm addr:$src), VR128)>;
  def : Pat<(v2f64 (X86vzmovl (bc_v2f64 (loadv4f32 addr:$src)))),
            (COPY_TO_REGCLASS (MOVSDrm addr:$src), VR128)>;
  def : Pat<(v2f64 (X86vzload addr:$src)),
            (COPY_TO_REGCLASS (MOVSDrm addr:$src), VR128)>;
  }

  // Extract and store the low f64 element.
  def : Pat<(store (f64 (extractelt (v2f64 VR128:$src), (iPTR 0))),
                   addr:$dst),
            (MOVSDmr addr:$dst, (COPY_TO_REGCLASS VR128:$src, FR64))>;

  // Shuffle with MOVSD
  def : Pat<(v2i64 (X86Movsd VR128:$src1, VR128:$src2)),
            (MOVSDrr VR128:$src1, (COPY_TO_REGCLASS VR128:$src2, FR64))>;
  def : Pat<(v2f64 (X86Movsd VR128:$src1, VR128:$src2)),
            (MOVSDrr VR128:$src1, (COPY_TO_REGCLASS VR128:$src2, FR64))>;
  def : Pat<(v4f32 (X86Movsd VR128:$src1, VR128:$src2)),
            (MOVSDrr VR128:$src1, (COPY_TO_REGCLASS VR128:$src2, FR64))>;
  def : Pat<(v4i32 (X86Movsd VR128:$src1, VR128:$src2)),
            (MOVSDrr VR128:$src1, (COPY_TO_REGCLASS VR128:$src2, FR64))>;

  // FIXME: Instead of a X86Movlps there should be a X86Movsd here, the problem
  // is during lowering, where it's not possible to recognize the fold because
  // it has two uses through a bitcast. One use disappears at isel time and the
  // fold opportunity reappears.
  def : Pat<(v2f64 (X86Movlpd VR128:$src1, VR128:$src2)),
            (MOVSDrr VR128:$src1, (COPY_TO_REGCLASS VR128:$src2, FR64))>;
  def : Pat<(v2i64 (X86Movlpd VR128:$src1, VR128:$src2)),
            (MOVSDrr VR128:$src1, (COPY_TO_REGCLASS VR128:$src2, FR64))>;
  def : Pat<(v4f32 (X86Movlps VR128:$src1, VR128:$src2)),
            (MOVSDrr VR128:$src1, (COPY_TO_REGCLASS VR128:$src2, FR64))>;
  def : Pat<(v4i32 (X86Movlps VR128:$src1, VR128:$src2)),
            (MOVSDrr VR128:$src1, (COPY_TO_REGCLASS VR128:$src2, FR64))>;
}
802
803//===----------------------------------------------------------------------===//
804// SSE 1 & 2 - Move Aligned/Unaligned FP Instructions
805//===----------------------------------------------------------------------===//
806
// Template for the packed FP full-register moves (movaps/movapd/movups/
// movupd).  Emits two forms:
//   rr - register-to-register copy with no pattern (hasSideEffects = 0);
//   rm - load form, foldable and (by default) rematerializable.  Pass
//        IsReMaterializable = 0 to suppress remat (used for movupd below).
multiclass sse12_mov_packed<bits<8> opc, RegisterClass RC,
                            X86MemOperand x86memop, PatFrag ld_frag,
                            string asm, Domain d,
                            OpndItins itins,
                            bit IsReMaterializable = 1> {
let hasSideEffects = 0 in
  def rr : PI<opc, MRMSrcReg, (outs RC:$dst), (ins RC:$src),
              !strconcat(asm, "\t{$src, $dst|$dst, $src}"), [], itins.rr, d>,
           Sched<[WriteFShuffle]>;
let canFoldAsLoad = 1, isReMaterializable = IsReMaterializable in
  def rm : PI<opc, MRMSrcMem, (outs RC:$dst), (ins x86memop:$src),
              !strconcat(asm, "\t{$src, $dst|$dst, $src}"),
                   [(set RC:$dst, (ld_frag addr:$src))], itins.rm, d>,
           Sched<[WriteLoad]>;
}
822
// VEX-encoded packed FP moves: 128-bit (xmm) and 256-bit (ymm, VEX_L)
// aligned/unaligned forms.  NoVLX keeps these out of the way of the EVEX
// encodings when AVX-512VL is available.
let Predicates = [HasAVX, NoVLX] in {
defm VMOVAPS : sse12_mov_packed<0x28, VR128, f128mem, alignedloadv4f32,
                              "movaps", SSEPackedSingle, SSE_MOVA_ITINS>,
                              PS, VEX;
defm VMOVAPD : sse12_mov_packed<0x28, VR128, f128mem, alignedloadv2f64,
                              "movapd", SSEPackedDouble, SSE_MOVA_ITINS>,
                              PD, VEX;
defm VMOVUPS : sse12_mov_packed<0x10, VR128, f128mem, loadv4f32,
                              "movups", SSEPackedSingle, SSE_MOVU_ITINS>,
                              PS, VEX;
// movupd's load form is marked non-rematerializable (trailing 0 argument).
defm VMOVUPD : sse12_mov_packed<0x10, VR128, f128mem, loadv2f64,
                              "movupd", SSEPackedDouble, SSE_MOVU_ITINS, 0>,
                              PD, VEX;

defm VMOVAPSY : sse12_mov_packed<0x28, VR256, f256mem, alignedloadv8f32,
                              "movaps", SSEPackedSingle, SSE_MOVA_ITINS>,
                              PS, VEX, VEX_L;
defm VMOVAPDY : sse12_mov_packed<0x28, VR256, f256mem, alignedloadv4f64,
                              "movapd", SSEPackedDouble, SSE_MOVA_ITINS>,
                              PD, VEX, VEX_L;
defm VMOVUPSY : sse12_mov_packed<0x10, VR256, f256mem, loadv8f32,
                              "movups", SSEPackedSingle, SSE_MOVU_ITINS>,
                              PS, VEX, VEX_L;
defm VMOVUPDY : sse12_mov_packed<0x10, VR256, f256mem, loadv4f64,
                              "movupd", SSEPackedDouble, SSE_MOVU_ITINS, 0>,
                              PD, VEX, VEX_L;
}
850
// Legacy (non-VEX) SSE1/SSE2 forms.  UseSSE1/UseSSE2 exclude AVX targets, so
// the VEX-encoded variants above are selected there instead.
let Predicates = [UseSSE1] in {
defm MOVAPS : sse12_mov_packed<0x28, VR128, f128mem, alignedloadv4f32,
                              "movaps", SSEPackedSingle, SSE_MOVA_ITINS>,
                              PS;
defm MOVUPS : sse12_mov_packed<0x10, VR128, f128mem, loadv4f32,
                              "movups", SSEPackedSingle, SSE_MOVU_ITINS>,
                              PS;
}
let Predicates = [UseSSE2] in {
defm MOVAPD : sse12_mov_packed<0x28, VR128, f128mem, alignedloadv2f64,
                              "movapd", SSEPackedDouble, SSE_MOVA_ITINS>,
                              PD;
// Like VMOVUPD, movupd's load form is non-rematerializable (trailing 0).
defm MOVUPD : sse12_mov_packed<0x10, VR128, f128mem, loadv2f64,
                              "movupd", SSEPackedDouble, SSE_MOVU_ITINS, 0>,
                              PD;
}
867
// VEX-encoded packed FP store forms (opcode 0x29 aligned, 0x11 unaligned),
// 128-bit and 256-bit.  256-bit aligned stores use the alignedstore256
// fragment (32-byte alignment).
let SchedRW = [WriteStore], Predicates = [HasAVX, NoVLX]  in {
def VMOVAPSmr : VPSI<0x29, MRMDestMem, (outs), (ins f128mem:$dst, VR128:$src),
                   "movaps\t{$src, $dst|$dst, $src}",
                   [(alignedstore (v4f32 VR128:$src), addr:$dst)],
                   IIC_SSE_MOVA_P_MR>, VEX;
def VMOVAPDmr : VPDI<0x29, MRMDestMem, (outs), (ins f128mem:$dst, VR128:$src),
                   "movapd\t{$src, $dst|$dst, $src}",
                   [(alignedstore (v2f64 VR128:$src), addr:$dst)],
                   IIC_SSE_MOVA_P_MR>, VEX;
def VMOVUPSmr : VPSI<0x11, MRMDestMem, (outs), (ins f128mem:$dst, VR128:$src),
                   "movups\t{$src, $dst|$dst, $src}",
                   [(store (v4f32 VR128:$src), addr:$dst)],
                   IIC_SSE_MOVU_P_MR>, VEX;
def VMOVUPDmr : VPDI<0x11, MRMDestMem, (outs), (ins f128mem:$dst, VR128:$src),
                   "movupd\t{$src, $dst|$dst, $src}",
                   [(store (v2f64 VR128:$src), addr:$dst)],
                   IIC_SSE_MOVU_P_MR>, VEX;
def VMOVAPSYmr : VPSI<0x29, MRMDestMem, (outs), (ins f256mem:$dst, VR256:$src),
                   "movaps\t{$src, $dst|$dst, $src}",
                   [(alignedstore256 (v8f32 VR256:$src), addr:$dst)],
                   IIC_SSE_MOVA_P_MR>, VEX, VEX_L;
def VMOVAPDYmr : VPDI<0x29, MRMDestMem, (outs), (ins f256mem:$dst, VR256:$src),
                   "movapd\t{$src, $dst|$dst, $src}",
                   [(alignedstore256 (v4f64 VR256:$src), addr:$dst)],
                   IIC_SSE_MOVA_P_MR>, VEX, VEX_L;
def VMOVUPSYmr : VPSI<0x11, MRMDestMem, (outs), (ins f256mem:$dst, VR256:$src),
                   "movups\t{$src, $dst|$dst, $src}",
                   [(store (v8f32 VR256:$src), addr:$dst)],
                   IIC_SSE_MOVU_P_MR>, VEX, VEX_L;
def VMOVUPDYmr : VPDI<0x11, MRMDestMem, (outs), (ins f256mem:$dst, VR256:$src),
                   "movupd\t{$src, $dst|$dst, $src}",
                   [(store (v4f64 VR256:$src), addr:$dst)],
                   IIC_SSE_MOVU_P_MR>, VEX, VEX_L;
} // SchedRW
902
// For disassembler: reversed-operand (MRMDestReg) register-to-register
// encodings of the moves above.  The same copy can be encoded with the
// store-form opcodes (0x29/0x11) and a register destination;
// isCodeGenOnly/ForceDisassemble keeps these out of instruction selection
// while still letting the disassembler decode them.
let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0,
    SchedRW = [WriteFShuffle] in {
  def VMOVAPSrr_REV : VPSI<0x29, MRMDestReg, (outs VR128:$dst),
                          (ins VR128:$src),
                          "movaps\t{$src, $dst|$dst, $src}", [],
                          IIC_SSE_MOVA_P_RR>, VEX;
  def VMOVAPDrr_REV : VPDI<0x29, MRMDestReg, (outs VR128:$dst),
                           (ins VR128:$src),
                           "movapd\t{$src, $dst|$dst, $src}", [],
                           IIC_SSE_MOVA_P_RR>, VEX;
  def VMOVUPSrr_REV : VPSI<0x11, MRMDestReg, (outs VR128:$dst),
                           (ins VR128:$src),
                           "movups\t{$src, $dst|$dst, $src}", [],
                           IIC_SSE_MOVU_P_RR>, VEX;
  def VMOVUPDrr_REV : VPDI<0x11, MRMDestReg, (outs VR128:$dst),
                           (ins VR128:$src),
                           "movupd\t{$src, $dst|$dst, $src}", [],
                           IIC_SSE_MOVU_P_RR>, VEX;
  def VMOVAPSYrr_REV : VPSI<0x29, MRMDestReg, (outs VR256:$dst),
                            (ins VR256:$src),
                            "movaps\t{$src, $dst|$dst, $src}", [],
                            IIC_SSE_MOVA_P_RR>, VEX, VEX_L;
  def VMOVAPDYrr_REV : VPDI<0x29, MRMDestReg, (outs VR256:$dst),
                            (ins VR256:$src),
                            "movapd\t{$src, $dst|$dst, $src}", [],
                            IIC_SSE_MOVA_P_RR>, VEX, VEX_L;
  def VMOVUPSYrr_REV : VPSI<0x11, MRMDestReg, (outs VR256:$dst),
                            (ins VR256:$src),
                            "movups\t{$src, $dst|$dst, $src}", [],
                            IIC_SSE_MOVU_P_RR>, VEX, VEX_L;
  def VMOVUPDYrr_REV : VPDI<0x11, MRMDestReg, (outs VR256:$dst),
                            (ins VR256:$src),
                            "movupd\t{$src, $dst|$dst, $src}", [],
                            IIC_SSE_MOVU_P_RR>, VEX, VEX_L;
}
939
// Map the 256-bit AVX unaligned-store intrinsics onto the unaligned store
// instructions defined above.
def : Pat<(int_x86_avx_storeu_ps_256 addr:$dst, VR256:$src),
          (VMOVUPSYmr addr:$dst, VR256:$src)>;
def : Pat<(int_x86_avx_storeu_pd_256 addr:$dst, VR256:$src),
          (VMOVUPDYmr addr:$dst, VR256:$src)>;
944
// Legacy (non-VEX) packed FP store forms (0x29 aligned, 0x11 unaligned).
let SchedRW = [WriteStore] in {
def MOVAPSmr : PSI<0x29, MRMDestMem, (outs), (ins f128mem:$dst, VR128:$src),
                   "movaps\t{$src, $dst|$dst, $src}",
                   [(alignedstore (v4f32 VR128:$src), addr:$dst)],
                   IIC_SSE_MOVA_P_MR>;
def MOVAPDmr : PDI<0x29, MRMDestMem, (outs), (ins f128mem:$dst, VR128:$src),
                   "movapd\t{$src, $dst|$dst, $src}",
                   [(alignedstore (v2f64 VR128:$src), addr:$dst)],
                   IIC_SSE_MOVA_P_MR>;
def MOVUPSmr : PSI<0x11, MRMDestMem, (outs), (ins f128mem:$dst, VR128:$src),
                   "movups\t{$src, $dst|$dst, $src}",
                   [(store (v4f32 VR128:$src), addr:$dst)],
                   IIC_SSE_MOVU_P_MR>;
def MOVUPDmr : PDI<0x11, MRMDestMem, (outs), (ins f128mem:$dst, VR128:$src),
                   "movupd\t{$src, $dst|$dst, $src}",
                   [(store (v2f64 VR128:$src), addr:$dst)],
                   IIC_SSE_MOVU_P_MR>;
} // SchedRW
963
// For disassembler: reversed-operand (MRMDestReg) register-to-register
// encodings of the legacy moves, decode-only (isCodeGenOnly/ForceDisassemble,
// no patterns).
let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0,
    SchedRW = [WriteFShuffle] in {
  def MOVAPSrr_REV : PSI<0x29, MRMDestReg, (outs VR128:$dst), (ins VR128:$src),
                         "movaps\t{$src, $dst|$dst, $src}", [],
                         IIC_SSE_MOVA_P_RR>;
  def MOVAPDrr_REV : PDI<0x29, MRMDestReg, (outs VR128:$dst), (ins VR128:$src),
                         "movapd\t{$src, $dst|$dst, $src}", [],
                         IIC_SSE_MOVA_P_RR>;
  def MOVUPSrr_REV : PSI<0x11, MRMDestReg, (outs VR128:$dst), (ins VR128:$src),
                         "movups\t{$src, $dst|$dst, $src}", [],
                         IIC_SSE_MOVU_P_RR>;
  def MOVUPDrr_REV : PDI<0x11, MRMDestReg, (outs VR128:$dst), (ins VR128:$src),
                         "movupd\t{$src, $dst|$dst, $src}", [],
                         IIC_SSE_MOVU_P_RR>;
}
980
// 128-bit unaligned-store intrinsics: VEX forms when AVX is available,
// legacy forms otherwise.
let Predicates = [HasAVX] in {
  def : Pat<(int_x86_sse_storeu_ps addr:$dst, VR128:$src),
            (VMOVUPSmr addr:$dst, VR128:$src)>;
  def : Pat<(int_x86_sse2_storeu_pd addr:$dst, VR128:$src),
            (VMOVUPDmr addr:$dst, VR128:$src)>;
}

let Predicates = [UseSSE1] in
  def : Pat<(int_x86_sse_storeu_ps addr:$dst, VR128:$src),
            (MOVUPSmr addr:$dst, VR128:$src)>;
let Predicates = [UseSSE2] in
  def : Pat<(int_x86_sse2_storeu_pd addr:$dst, VR128:$src),
            (MOVUPDmr addr:$dst, VR128:$src)>;
994
// Use vmovaps/vmovups for AVX integer load/store.
// All integer vector types are routed through the single-precision FP moves;
// the loads/stores are bit-preserving so the element type does not matter.
let Predicates = [HasAVX, NoVLX] in {
  // 128-bit load/store
  def : Pat<(alignedloadv2i64 addr:$src),
            (VMOVAPSrm addr:$src)>;
  def : Pat<(loadv2i64 addr:$src),
            (VMOVUPSrm addr:$src)>;

  def : Pat<(alignedstore (v2i64 VR128:$src), addr:$dst),
            (VMOVAPSmr addr:$dst, VR128:$src)>;
  def : Pat<(alignedstore (v4i32 VR128:$src), addr:$dst),
            (VMOVAPSmr addr:$dst, VR128:$src)>;
  def : Pat<(alignedstore (v8i16 VR128:$src), addr:$dst),
            (VMOVAPSmr addr:$dst, VR128:$src)>;
  def : Pat<(alignedstore (v16i8 VR128:$src), addr:$dst),
            (VMOVAPSmr addr:$dst, VR128:$src)>;
  def : Pat<(store (v2i64 VR128:$src), addr:$dst),
            (VMOVUPSmr addr:$dst, VR128:$src)>;
  def : Pat<(store (v4i32 VR128:$src), addr:$dst),
            (VMOVUPSmr addr:$dst, VR128:$src)>;
  def : Pat<(store (v8i16 VR128:$src), addr:$dst),
            (VMOVUPSmr addr:$dst, VR128:$src)>;
  def : Pat<(store (v16i8 VR128:$src), addr:$dst),
            (VMOVUPSmr addr:$dst, VR128:$src)>;

  // 256-bit load/store
  def : Pat<(alignedloadv4i64 addr:$src),
            (VMOVAPSYrm addr:$src)>;
  def : Pat<(loadv4i64 addr:$src),
            (VMOVUPSYrm addr:$src)>;
  def : Pat<(alignedstore256 (v4i64 VR256:$src), addr:$dst),
            (VMOVAPSYmr addr:$dst, VR256:$src)>;
  def : Pat<(alignedstore256 (v8i32 VR256:$src), addr:$dst),
            (VMOVAPSYmr addr:$dst, VR256:$src)>;
  def : Pat<(alignedstore256 (v16i16 VR256:$src), addr:$dst),
            (VMOVAPSYmr addr:$dst, VR256:$src)>;
  def : Pat<(alignedstore256 (v32i8 VR256:$src), addr:$dst),
            (VMOVAPSYmr addr:$dst, VR256:$src)>;
  def : Pat<(store (v4i64 VR256:$src), addr:$dst),
            (VMOVUPSYmr addr:$dst, VR256:$src)>;
  def : Pat<(store (v8i32 VR256:$src), addr:$dst),
            (VMOVUPSYmr addr:$dst, VR256:$src)>;
  def : Pat<(store (v16i16 VR256:$src), addr:$dst),
            (VMOVUPSYmr addr:$dst, VR256:$src)>;
  def : Pat<(store (v32i8 VR256:$src), addr:$dst),
            (VMOVUPSYmr addr:$dst, VR256:$src)>;

  // Special patterns for storing subvector extracts of lower 128-bits
  // It's cheaper to just use VMOVAPS/VMOVUPS instead of VEXTRACTF128mr
  // NOTE(review): the v2f64/v2i64 cases below use VMOVAPD/VMOVUPD (double
  // domain) while the full-width v2i64 stores above use VMOVAPS/VMOVUPS —
  // confirm the domain choice for the integer case is intentional.
  def : Pat<(alignedstore (v2f64 (extract_subvector
                                  (v4f64 VR256:$src), (iPTR 0))), addr:$dst),
            (VMOVAPDmr addr:$dst, (v2f64 (EXTRACT_SUBREG VR256:$src,sub_xmm)))>;
  def : Pat<(alignedstore (v4f32 (extract_subvector
                                  (v8f32 VR256:$src), (iPTR 0))), addr:$dst),
            (VMOVAPSmr addr:$dst, (v4f32 (EXTRACT_SUBREG VR256:$src,sub_xmm)))>;
  def : Pat<(alignedstore (v2i64 (extract_subvector
                                  (v4i64 VR256:$src), (iPTR 0))), addr:$dst),
            (VMOVAPDmr addr:$dst, (v2i64 (EXTRACT_SUBREG VR256:$src,sub_xmm)))>;
  def : Pat<(alignedstore (v4i32 (extract_subvector
                                  (v8i32 VR256:$src), (iPTR 0))), addr:$dst),
            (VMOVAPSmr addr:$dst, (v4i32 (EXTRACT_SUBREG VR256:$src,sub_xmm)))>;
  def : Pat<(alignedstore (v8i16 (extract_subvector
                                  (v16i16 VR256:$src), (iPTR 0))), addr:$dst),
            (VMOVAPSmr addr:$dst, (v8i16 (EXTRACT_SUBREG VR256:$src,sub_xmm)))>;
  def : Pat<(alignedstore (v16i8 (extract_subvector
                                  (v32i8 VR256:$src), (iPTR 0))), addr:$dst),
            (VMOVAPSmr addr:$dst, (v16i8 (EXTRACT_SUBREG VR256:$src,sub_xmm)))>;

  def : Pat<(store (v2f64 (extract_subvector
                           (v4f64 VR256:$src), (iPTR 0))), addr:$dst),
            (VMOVUPDmr addr:$dst, (v2f64 (EXTRACT_SUBREG VR256:$src,sub_xmm)))>;
  def : Pat<(store (v4f32 (extract_subvector
                           (v8f32 VR256:$src), (iPTR 0))), addr:$dst),
            (VMOVUPSmr addr:$dst, (v4f32 (EXTRACT_SUBREG VR256:$src,sub_xmm)))>;
  def : Pat<(store (v2i64 (extract_subvector
                           (v4i64 VR256:$src), (iPTR 0))), addr:$dst),
            (VMOVUPDmr addr:$dst, (v2i64 (EXTRACT_SUBREG VR256:$src,sub_xmm)))>;
  def : Pat<(store (v4i32 (extract_subvector
                           (v8i32 VR256:$src), (iPTR 0))), addr:$dst),
            (VMOVUPSmr addr:$dst, (v4i32 (EXTRACT_SUBREG VR256:$src,sub_xmm)))>;
  def : Pat<(store (v8i16 (extract_subvector
                           (v16i16 VR256:$src), (iPTR 0))), addr:$dst),
            (VMOVUPSmr addr:$dst, (v8i16 (EXTRACT_SUBREG VR256:$src,sub_xmm)))>;
  def : Pat<(store (v16i8 (extract_subvector
                           (v32i8 VR256:$src), (iPTR 0))), addr:$dst),
            (VMOVUPSmr addr:$dst, (v16i8 (EXTRACT_SUBREG VR256:$src,sub_xmm)))>;
}
1082
// Use movaps / movups for SSE integer load / store (one byte shorter).
// The instructions selected below are then converted to MOVDQA/MOVDQU
// during the SSE domain pass.
let Predicates = [UseSSE1] in {
  def : Pat<(alignedloadv2i64 addr:$src),
            (MOVAPSrm addr:$src)>;
  def : Pat<(loadv2i64 addr:$src),
            (MOVUPSrm addr:$src)>;

  // Stores of every 128-bit integer element type go through the same two
  // FP store instructions; the store is bit-preserving.
  def : Pat<(alignedstore (v2i64 VR128:$src), addr:$dst),
            (MOVAPSmr addr:$dst, VR128:$src)>;
  def : Pat<(alignedstore (v4i32 VR128:$src), addr:$dst),
            (MOVAPSmr addr:$dst, VR128:$src)>;
  def : Pat<(alignedstore (v8i16 VR128:$src), addr:$dst),
            (MOVAPSmr addr:$dst, VR128:$src)>;
  def : Pat<(alignedstore (v16i8 VR128:$src), addr:$dst),
            (MOVAPSmr addr:$dst, VR128:$src)>;
  def : Pat<(store (v2i64 VR128:$src), addr:$dst),
            (MOVUPSmr addr:$dst, VR128:$src)>;
  def : Pat<(store (v4i32 VR128:$src), addr:$dst),
            (MOVUPSmr addr:$dst, VR128:$src)>;
  def : Pat<(store (v8i16 VR128:$src), addr:$dst),
            (MOVUPSmr addr:$dst, VR128:$src)>;
  def : Pat<(store (v16i8 VR128:$src), addr:$dst),
            (MOVUPSmr addr:$dst, VR128:$src)>;
}
1109
// Alias instruction to load FR32 or FR64 from f128mem using movaps. Upper
// bits are disregarded. FIXME: Set encoding to pseudo!
// These are codegen-only (no assembly alias is exposed) and rematerializable.
let canFoldAsLoad = 1, isReMaterializable = 1, SchedRW = [WriteLoad] in {
let isCodeGenOnly = 1 in {
  def FsVMOVAPSrm : VPSI<0x28, MRMSrcMem, (outs FR32:$dst), (ins f128mem:$src),
                         "movaps\t{$src, $dst|$dst, $src}",
                         [(set FR32:$dst, (alignedloadfsf32 addr:$src))],
                         IIC_SSE_MOVA_P_RM>, VEX;
  def FsVMOVAPDrm : VPDI<0x28, MRMSrcMem, (outs FR64:$dst), (ins f128mem:$src),
                         "movapd\t{$src, $dst|$dst, $src}",
                         [(set FR64:$dst, (alignedloadfsf64 addr:$src))],
                         IIC_SSE_MOVA_P_RM>, VEX;
  def FsMOVAPSrm : PSI<0x28, MRMSrcMem, (outs FR32:$dst), (ins f128mem:$src),
                       "movaps\t{$src, $dst|$dst, $src}",
                       [(set FR32:$dst, (alignedloadfsf32 addr:$src))],
                       IIC_SSE_MOVA_P_RM>;
  def FsMOVAPDrm : PDI<0x28, MRMSrcMem, (outs FR64:$dst), (ins f128mem:$src),
                       "movapd\t{$src, $dst|$dst, $src}",
                       [(set FR64:$dst, (alignedloadfsf64 addr:$src))],
                       IIC_SSE_MOVA_P_RM>;
}
}
1132
1133//===----------------------------------------------------------------------===//
1134// SSE 1 & 2 - Move Low packed FP Instructions
1135//===----------------------------------------------------------------------===//
1136
// Template for movlps/movlpd/movhps/movhpd load forms: merge a 64-bit memory
// operand into one half of $src1 (psnode/pdnode select which half).  The PS
// form bitcasts the f64 load into v4f32 before applying the shuffle node; the
// PD form operates on v2f64 directly.
multiclass sse12_mov_hilo_packed_base<bits<8>opc, SDNode psnode, SDNode pdnode,
                                      string base_opc, string asm_opr,
                                      InstrItinClass itin> {
  def PSrm : PI<opc, MRMSrcMem,
         (outs VR128:$dst), (ins VR128:$src1, f64mem:$src2),
         !strconcat(base_opc, "s", asm_opr),
     [(set VR128:$dst,
       (psnode VR128:$src1,
              (bc_v4f32 (v2f64 (scalar_to_vector (loadf64 addr:$src2))))))],
              itin, SSEPackedSingle>, PS,
     Sched<[WriteFShuffleLd, ReadAfterLd]>;

  def PDrm : PI<opc, MRMSrcMem,
         (outs VR128:$dst), (ins VR128:$src1, f64mem:$src2),
         !strconcat(base_opc, "d", asm_opr),
     [(set VR128:$dst, (v2f64 (pdnode VR128:$src1,
                              (scalar_to_vector (loadf64 addr:$src2)))))],
              itin, SSEPackedDouble>, PD,
     Sched<[WriteFShuffleLd, ReadAfterLd]>;

}
1158
// Instantiates both encodings of the hi/lo packed moves:
//   V#NAME - VEX three-operand form (separate $src1), under UseAVX;
//   NAME   - legacy two-operand form with $src1 tied to $dst.
multiclass sse12_mov_hilo_packed<bits<8>opc, SDNode psnode, SDNode pdnode,
                                 string base_opc, InstrItinClass itin> {
  let Predicates = [UseAVX] in
    defm V#NAME : sse12_mov_hilo_packed_base<opc, psnode, pdnode, base_opc,
                                    "\t{$src2, $src1, $dst|$dst, $src1, $src2}",
                                    itin>, VEX_4V;

  let Constraints = "$src1 = $dst" in
    defm NAME : sse12_mov_hilo_packed_base<opc, psnode, pdnode, base_opc,
                                    "\t{$src2, $dst|$dst, $src2}",
                                    itin>;
}

// MOVLPS/MOVLPD (+ VEX forms): load 64 bits into the low half.
let AddedComplexity = 20 in {
  defm MOVL : sse12_mov_hilo_packed<0x12, X86Movlps, X86Movlpd, "movlp",
                                    IIC_SSE_MOV_LH>;
}
1176
// MOVLPS/MOVLPD store forms: store the low 64 bits of the source register.
// The patterns extract element 0 of the (bitcast) v2f64 value.
let SchedRW = [WriteStore] in {
let Predicates = [UseAVX] in {
def VMOVLPSmr : VPSI<0x13, MRMDestMem, (outs), (ins f64mem:$dst, VR128:$src),
                   "movlps\t{$src, $dst|$dst, $src}",
                   [(store (f64 (extractelt (bc_v2f64 (v4f32 VR128:$src)),
                                 (iPTR 0))), addr:$dst)],
                                 IIC_SSE_MOV_LH>, VEX;
def VMOVLPDmr : VPDI<0x13, MRMDestMem, (outs), (ins f64mem:$dst, VR128:$src),
                   "movlpd\t{$src, $dst|$dst, $src}",
                   [(store (f64 (extractelt (v2f64 VR128:$src),
                                 (iPTR 0))), addr:$dst)],
                                 IIC_SSE_MOV_LH>, VEX;
}// UseAVX
def MOVLPSmr : PSI<0x13, MRMDestMem, (outs), (ins f64mem:$dst, VR128:$src),
                   "movlps\t{$src, $dst|$dst, $src}",
                   [(store (f64 (extractelt (bc_v2f64 (v4f32 VR128:$src)),
                                 (iPTR 0))), addr:$dst)],
                                 IIC_SSE_MOV_LH>;
def MOVLPDmr : PDI<0x13, MRMDestMem, (outs), (ins f64mem:$dst, VR128:$src),
                   "movlpd\t{$src, $dst|$dst, $src}",
                   [(store (f64 (extractelt (v2f64 VR128:$src),
                                 (iPTR 0))), addr:$dst)],
                                 IIC_SSE_MOV_LH>;
} // SchedRW
1201
// Extra folding patterns for the VEX movlp forms: shuffles whose second
// operand is a load, and shuffle+store combinations.
let Predicates = [UseAVX] in {
  // Shuffle with VMOVLPS
  def : Pat<(v4f32 (X86Movlps VR128:$src1, (load addr:$src2))),
            (VMOVLPSrm VR128:$src1, addr:$src2)>;
  def : Pat<(v4i32 (X86Movlps VR128:$src1, (load addr:$src2))),
            (VMOVLPSrm VR128:$src1, addr:$src2)>;

  // Shuffle with VMOVLPD
  def : Pat<(v2f64 (X86Movlpd VR128:$src1, (load addr:$src2))),
            (VMOVLPDrm VR128:$src1, addr:$src2)>;
  def : Pat<(v2i64 (X86Movlpd VR128:$src1, (load addr:$src2))),
            (VMOVLPDrm VR128:$src1, addr:$src2)>;
  // A movsd whose second operand is a scalar f64 load is the same merge-low
  // operation, so it can use movlpd as well.
  def : Pat<(v2f64 (X86Movsd VR128:$src1,
                             (v2f64 (scalar_to_vector (loadf64 addr:$src2))))),
            (VMOVLPDrm VR128:$src1, addr:$src2)>;

  // Store patterns: load+shuffle+store to the same address collapses to a
  // single low-half store.
  def : Pat<(store (v4f32 (X86Movlps (load addr:$src1), VR128:$src2)),
                   addr:$src1),
            (VMOVLPSmr addr:$src1, VR128:$src2)>;
  def : Pat<(store (v4i32 (X86Movlps
                   (bc_v4i32 (loadv2i64 addr:$src1)), VR128:$src2)), addr:$src1),
            (VMOVLPSmr addr:$src1, VR128:$src2)>;
  def : Pat<(store (v2f64 (X86Movlpd (load addr:$src1), VR128:$src2)),
                   addr:$src1),
            (VMOVLPDmr addr:$src1, VR128:$src2)>;
  def : Pat<(store (v2i64 (X86Movlpd (load addr:$src1), VR128:$src2)),
                   addr:$src1),
            (VMOVLPDmr addr:$src1, VR128:$src2)>;
}
1232
// Legacy MOVLPS folding patterns (non-AVX targets).
let Predicates = [UseSSE1] in {
  // (store (vector_shuffle (load addr), v2, <4, 5, 2, 3>), addr) using MOVLPS
  def : Pat<(store (i64 (extractelt (bc_v2i64 (v4f32 VR128:$src2)),
                                 (iPTR 0))), addr:$src1),
            (MOVLPSmr addr:$src1, VR128:$src2)>;

  // Shuffle with MOVLPS
  def : Pat<(v4f32 (X86Movlps VR128:$src1, (load addr:$src2))),
            (MOVLPSrm VR128:$src1, addr:$src2)>;
  def : Pat<(v4i32 (X86Movlps VR128:$src1, (load addr:$src2))),
            (MOVLPSrm VR128:$src1, addr:$src2)>;
  // Also match an i64 scalar load bitcast into the v4f32 domain.
  def : Pat<(X86Movlps VR128:$src1,
                      (bc_v4f32 (v2i64 (scalar_to_vector (loadi64 addr:$src2))))),
            (MOVLPSrm VR128:$src1, addr:$src2)>;

  // Store patterns
  def : Pat<(store (v4f32 (X86Movlps (load addr:$src1), VR128:$src2)),
                                      addr:$src1),
            (MOVLPSmr addr:$src1, VR128:$src2)>;
  def : Pat<(store (v4i32 (X86Movlps
                   (bc_v4i32 (loadv2i64 addr:$src1)), VR128:$src2)),
                              addr:$src1),
            (MOVLPSmr addr:$src1, VR128:$src2)>;
}
1257
// Legacy MOVLPD folding patterns (non-AVX targets).
let Predicates = [UseSSE2] in {
  // Shuffle with MOVLPD
  def : Pat<(v2f64 (X86Movlpd VR128:$src1, (load addr:$src2))),
            (MOVLPDrm VR128:$src1, addr:$src2)>;
  def : Pat<(v2i64 (X86Movlpd VR128:$src1, (load addr:$src2))),
            (MOVLPDrm VR128:$src1, addr:$src2)>;
  // A movsd merging in a scalar f64 load is the same merge-low operation.
  def : Pat<(v2f64 (X86Movsd VR128:$src1,
                             (v2f64 (scalar_to_vector (loadf64 addr:$src2))))),
            (MOVLPDrm VR128:$src1, addr:$src2)>;

  // Store patterns
  def : Pat<(store (v2f64 (X86Movlpd (load addr:$src1), VR128:$src2)),
                           addr:$src1),
            (MOVLPDmr addr:$src1, VR128:$src2)>;
  def : Pat<(store (v2i64 (X86Movlpd (load addr:$src1), VR128:$src2)),
                           addr:$src1),
            (MOVLPDmr addr:$src1, VR128:$src2)>;
}
1276
1277//===----------------------------------------------------------------------===//
1278// SSE 1 & 2 - Move Hi packed FP Instructions
1279//===----------------------------------------------------------------------===//
1280
// MOVHPS/MOVHPD (+ VEX forms): load 64 bits into the high half.
let AddedComplexity = 20 in {
  defm MOVH : sse12_mov_hilo_packed<0x16, X86Movlhps, X86Movlhpd, "movhp",
                                    IIC_SSE_MOV_LH>;
}
1285
// MOVHPS/MOVHPD store forms: store the high 64 bits of the source.  The
// patterns express "high half" as unpack-high-to-low followed by an
// element-0 extract.
let SchedRW = [WriteStore] in {
// v2f64 extract element 1 is always custom lowered to unpack high to low
// and extract element 0 so the non-store version isn't too horrible.
let Predicates = [UseAVX] in {
def VMOVHPSmr : VPSI<0x17, MRMDestMem, (outs), (ins f64mem:$dst, VR128:$src),
                   "movhps\t{$src, $dst|$dst, $src}",
                   [(store (f64 (extractelt
                                 (X86Unpckh (bc_v2f64 (v4f32 VR128:$src)),
                                            (bc_v2f64 (v4f32 VR128:$src))),
                                 (iPTR 0))), addr:$dst)], IIC_SSE_MOV_LH>, VEX;
def VMOVHPDmr : VPDI<0x17, MRMDestMem, (outs), (ins f64mem:$dst, VR128:$src),
                   "movhpd\t{$src, $dst|$dst, $src}",
                   [(store (f64 (extractelt
                                 (v2f64 (X86Unpckh VR128:$src, VR128:$src)),
                                 (iPTR 0))), addr:$dst)], IIC_SSE_MOV_LH>, VEX;
} // UseAVX
def MOVHPSmr : PSI<0x17, MRMDestMem, (outs), (ins f64mem:$dst, VR128:$src),
                   "movhps\t{$src, $dst|$dst, $src}",
                   [(store (f64 (extractelt
                                 (X86Unpckh (bc_v2f64 (v4f32 VR128:$src)),
                                            (bc_v2f64 (v4f32 VR128:$src))),
                                 (iPTR 0))), addr:$dst)], IIC_SSE_MOV_LH>;
def MOVHPDmr : PDI<0x17, MRMDestMem, (outs), (ins f64mem:$dst, VR128:$src),
                   "movhpd\t{$src, $dst|$dst, $src}",
                   [(store (f64 (extractelt
                                 (v2f64 (X86Unpckh VR128:$src, VR128:$src)),
                                 (iPTR 0))), addr:$dst)], IIC_SSE_MOV_LH>;
} // SchedRW
1314
// Selection patterns folding loads/permutes into VMOVHPS/VMOVHPD under AVX.
let Predicates = [UseAVX] in {
  // VMOVHPS patterns
  def : Pat<(X86Movlhps VR128:$src1,
                 (bc_v4f32 (v2i64 (scalar_to_vector (loadi64 addr:$src2))))),
            (VMOVHPSrm VR128:$src1, addr:$src2)>;
  // FIX: was bc_v4i32 — the bitcast must produce v4f32 to match the f32
  // operand type of X86Movlhps here, consistent with the pattern above and
  // with the UseSSE1 equivalent of this pattern later in the file.
  def : Pat<(X86Movlhps VR128:$src1,
                 (bc_v4f32 (v2i64 (X86vzload addr:$src2)))),
            (VMOVHPSrm VR128:$src1, addr:$src2)>;

  // VMOVHPD patterns

  // FIXME: Instead of X86Unpckl, there should be a X86Movlhpd here, the problem
  // is during lowering, where it's not possible to recognize the load fold
  // cause it has two uses through a bitcast. One use disappears at isel time
  // and the fold opportunity reappears.
  def : Pat<(v2f64 (X86Unpckl VR128:$src1,
                      (scalar_to_vector (loadf64 addr:$src2)))),
            (VMOVHPDrm VR128:$src1, addr:$src2)>;
  // Also handle an i64 load because that may get selected as a faster way to
  // load the data.
  def : Pat<(v2f64 (X86Unpckl VR128:$src1,
                      (bc_v2f64 (v2i64 (scalar_to_vector (loadi64 addr:$src2)))))),
            (VMOVHPDrm VR128:$src1, addr:$src2)>;

  // Store the high half selected by a vpermilpd with immediate 1.
  def : Pat<(store (f64 (extractelt
                          (v2f64 (X86VPermilpi VR128:$src, (i8 1))),
                          (iPTR 0))), addr:$dst),
            (VMOVHPDmr addr:$dst, VR128:$src)>;
}
1344
// Non-AVX (SSE1) counterparts of the VMOVHPS load-fold patterns above.
let Predicates = [UseSSE1] in {
  // MOVHPS patterns
  def : Pat<(X86Movlhps VR128:$src1,
                 (bc_v4f32 (v2i64 (scalar_to_vector (loadi64 addr:$src2))))),
            (MOVHPSrm VR128:$src1, addr:$src2)>;
  def : Pat<(X86Movlhps VR128:$src1,
                 (bc_v4f32 (v2i64 (X86vzload addr:$src2)))),
            (MOVHPSrm VR128:$src1, addr:$src2)>;
}
1354
// Non-AVX (SSE2) counterparts of the VMOVHPD patterns. Note the store
// pattern matches X86Shufp rather than X86VPermilpi: vpermilpd only exists
// with AVX, so SSE2 lowering produces a shufpd for the same shuffle.
let Predicates = [UseSSE2] in {
  // MOVHPD patterns

  // FIXME: Instead of X86Unpckl, there should be a X86Movlhpd here, the problem
  // is during lowering, where it's not possible to recognize the load fold
  // cause it has two uses through a bitcast. One use disappears at isel time
  // and the fold opportunity reappears.
  def : Pat<(v2f64 (X86Unpckl VR128:$src1,
                      (scalar_to_vector (loadf64 addr:$src2)))),
            (MOVHPDrm VR128:$src1, addr:$src2)>;
  // Also handle an i64 load because that may get selected as a faster way to
  // load the data.
  def : Pat<(v2f64 (X86Unpckl VR128:$src1,
                      (bc_v2f64 (v2i64 (scalar_to_vector (loadi64 addr:$src2)))))),
            (MOVHPDrm VR128:$src1, addr:$src2)>;

  def : Pat<(store (f64 (extractelt
                          (v2f64 (X86Shufp VR128:$src, VR128:$src, (i8 1))),
                          (iPTR 0))), addr:$dst),
            (MOVHPDmr addr:$dst, VR128:$src)>;
}
1376
1377//===----------------------------------------------------------------------===//
1378// SSE 1 & 2 - Move Low to High and High to Low packed FP Instructions
1379//===----------------------------------------------------------------------===//
1380
// AVX three-operand register-register movlhps/movhlps (no memory forms);
// AddedComplexity prioritizes them over generic shuffle selection.
let AddedComplexity = 20, Predicates = [UseAVX] in {
  def VMOVLHPSrr : VPSI<0x16, MRMSrcReg, (outs VR128:$dst),
                                       (ins VR128:$src1, VR128:$src2),
                      "movlhps\t{$src2, $src1, $dst|$dst, $src1, $src2}",
                      [(set VR128:$dst,
                        (v4f32 (X86Movlhps VR128:$src1, VR128:$src2)))],
                        IIC_SSE_MOV_LH>,
                      VEX_4V, Sched<[WriteFShuffle]>;
  def VMOVHLPSrr : VPSI<0x12, MRMSrcReg, (outs VR128:$dst),
                                       (ins VR128:$src1, VR128:$src2),
                      "movhlps\t{$src2, $src1, $dst|$dst, $src1, $src2}",
                      [(set VR128:$dst,
                        (v4f32 (X86Movhlps VR128:$src1, VR128:$src2)))],
                        IIC_SSE_MOV_LH>,
                      VEX_4V, Sched<[WriteFShuffle]>;
}
// SSE two-address forms: $src1 is tied to $dst, so the asm string only
// shows $src2 and $dst.
let Constraints = "$src1 = $dst", AddedComplexity = 20 in {
  def MOVLHPSrr : PSI<0x16, MRMSrcReg, (outs VR128:$dst),
                                       (ins VR128:$src1, VR128:$src2),
                      "movlhps\t{$src2, $dst|$dst, $src2}",
                      [(set VR128:$dst,
                        (v4f32 (X86Movlhps VR128:$src1, VR128:$src2)))],
                        IIC_SSE_MOV_LH>, Sched<[WriteFShuffle]>;
  def MOVHLPSrr : PSI<0x12, MRMSrcReg, (outs VR128:$dst),
                                       (ins VR128:$src1, VR128:$src2),
                      "movhlps\t{$src2, $dst|$dst, $src2}",
                      [(set VR128:$dst,
                        (v4f32 (X86Movhlps VR128:$src1, VR128:$src2)))],
                        IIC_SSE_MOV_LH>, Sched<[WriteFShuffle]>;
}
1411
// Map integer-typed movlhps/movhlps shuffle nodes onto the f32-typed AVX
// instructions (the shuffle is bit-pattern only, so domain doesn't matter
// for correctness here).
let Predicates = [UseAVX] in {
  // MOVLHPS patterns
  def : Pat<(v4i32 (X86Movlhps VR128:$src1, VR128:$src2)),
            (VMOVLHPSrr VR128:$src1, VR128:$src2)>;
  def : Pat<(v2i64 (X86Movlhps VR128:$src1, VR128:$src2)),
            (VMOVLHPSrr (v2i64 VR128:$src1), VR128:$src2)>;

  // MOVHLPS patterns
  def : Pat<(v4i32 (X86Movhlps VR128:$src1, VR128:$src2)),
            (VMOVHLPSrr VR128:$src1, VR128:$src2)>;
}
1423
// SSE1 counterparts of the integer-typed movlhps/movhlps patterns above.
let Predicates = [UseSSE1] in {
  // MOVLHPS patterns
  def : Pat<(v4i32 (X86Movlhps VR128:$src1, VR128:$src2)),
            (MOVLHPSrr VR128:$src1, VR128:$src2)>;
  def : Pat<(v2i64 (X86Movlhps VR128:$src1, VR128:$src2)),
            (MOVLHPSrr (v2i64 VR128:$src1), VR128:$src2)>;

  // MOVHLPS patterns
  def : Pat<(v4i32 (X86Movhlps VR128:$src1, VR128:$src2)),
            (MOVHLPSrr VR128:$src1, VR128:$src2)>;
}
1435
1436//===----------------------------------------------------------------------===//
1437// SSE 1 & 2 - Conversion Instructions
1438//===----------------------------------------------------------------------===//
1439
// Itinerary bundles (rr/rm InstrItinClass pairs plus a SchedWrite) for the
// conversion instructions below. SSE_CVT_PD keeps the OpndItins default
// Sched (WriteFAdd per the class definition); the others override it.
def SSE_CVT_PD : OpndItins<
  IIC_SSE_CVT_PD_RR, IIC_SSE_CVT_PD_RM
>;

let Sched = WriteCvtI2F in
def SSE_CVT_PS : OpndItins<
  IIC_SSE_CVT_PS_RR, IIC_SSE_CVT_PS_RM
>;

let Sched = WriteCvtI2F in
def SSE_CVT_Scalar : OpndItins<
  IIC_SSE_CVT_Scalar_RR, IIC_SSE_CVT_Scalar_RM
>;

// Float-to-int conversions use the WriteCvtF2I scheduling class.
let Sched = WriteCvtF2I in
def SSE_CVT_SS2SI_32 : OpndItins<
  IIC_SSE_CVT_SS2SI32_RR, IIC_SSE_CVT_SS2SI32_RM
>;

let Sched = WriteCvtF2I in
def SSE_CVT_SS2SI_64 : OpndItins<
  IIC_SSE_CVT_SS2SI64_RR, IIC_SSE_CVT_SS2SI64_RM
>;

let Sched = WriteCvtF2I in
def SSE_CVT_SD2SI : OpndItins<
  IIC_SSE_CVT_SD2SI_RR, IIC_SSE_CVT_SD2SI_RM
>;
1468
// Scalar conversion multiclass: emits an rr form matching (OpNode reg) and
// an rm form matching (OpNode (ld_frag addr)), with itineraries and
// scheduling taken from the supplied OpndItins.
multiclass sse12_cvt_s<bits<8> opc, RegisterClass SrcRC, RegisterClass DstRC,
                     SDNode OpNode, X86MemOperand x86memop, PatFrag ld_frag,
                     string asm, OpndItins itins> {
  def rr : SI<opc, MRMSrcReg, (outs DstRC:$dst), (ins SrcRC:$src), asm,
                        [(set DstRC:$dst, (OpNode SrcRC:$src))],
                        itins.rr>, Sched<[itins.Sched]>;
  def rm : SI<opc, MRMSrcMem, (outs DstRC:$dst), (ins x86memop:$src), asm,
                        [(set DstRC:$dst, (OpNode (ld_frag addr:$src)))],
                        itins.rm>, Sched<[itins.Sched.Folded]>;
}
1479
// Packed conversion multiclass: patternless (selection handled elsewhere),
// so hasSideEffects is cleared explicitly and the memory form is marked
// mayLoad to keep machine-instruction analysis accurate.
multiclass sse12_cvt_p<bits<8> opc, RegisterClass SrcRC, RegisterClass DstRC,
                       X86MemOperand x86memop, string asm, Domain d,
                       OpndItins itins> {
let hasSideEffects = 0 in {
  def rr : I<opc, MRMSrcReg, (outs DstRC:$dst), (ins SrcRC:$src), asm,
             [], itins.rr, d>, Sched<[itins.Sched]>;
  let mayLoad = 1 in
  def rm : I<opc, MRMSrcMem, (outs DstRC:$dst), (ins x86memop:$src), asm,
             [], itins.rm, d>, Sched<[itins.Sched.Folded]>;
}
}
1491
// AVX three-operand scalar int-to-float conversion multiclass. $src1 is the
// pass-through register for the upper elements (AVX semantics); no
// selection patterns are attached here.
multiclass sse12_vcvt_avx<bits<8> opc, RegisterClass SrcRC, RegisterClass DstRC,
                          X86MemOperand x86memop, string asm> {
let hasSideEffects = 0, Predicates = [UseAVX] in {
  def rr : SI<opc, MRMSrcReg, (outs DstRC:$dst), (ins DstRC:$src1, SrcRC:$src),
              !strconcat(asm,"\t{$src, $src1, $dst|$dst, $src1, $src}"), []>,
           Sched<[WriteCvtI2F]>;
  let mayLoad = 1 in
  def rm : SI<opc, MRMSrcMem, (outs DstRC:$dst),
              (ins DstRC:$src1, x86memop:$src),
              !strconcat(asm,"\t{$src, $src1, $dst|$dst, $src1, $src}"), []>,
           Sched<[WriteCvtI2FLd, ReadAfterLd]>;
} // hasSideEffects = 0
}
1505
// AVX truncating float-to-int conversions (fp_to_sint), plus InstAliases
// accepting explicit {l}/{q} size suffixes. The trailing 0 on each alias
// suppresses it for asm printing (parse-only alias).
let Predicates = [UseAVX] in {
defm VCVTTSS2SI   : sse12_cvt_s<0x2C, FR32, GR32, fp_to_sint, f32mem, loadf32,
                                "cvttss2si\t{$src, $dst|$dst, $src}",
                                SSE_CVT_SS2SI_32>,
                                XS, VEX, VEX_LIG;
defm VCVTTSS2SI64 : sse12_cvt_s<0x2C, FR32, GR64, fp_to_sint, f32mem, loadf32,
                                "cvttss2si\t{$src, $dst|$dst, $src}",
                                SSE_CVT_SS2SI_64>,
                                XS, VEX, VEX_W, VEX_LIG;
defm VCVTTSD2SI   : sse12_cvt_s<0x2C, FR64, GR32, fp_to_sint, f64mem, loadf64,
                                "cvttsd2si\t{$src, $dst|$dst, $src}",
                                SSE_CVT_SD2SI>,
                                XD, VEX, VEX_LIG;
defm VCVTTSD2SI64 : sse12_cvt_s<0x2C, FR64, GR64, fp_to_sint, f64mem, loadf64,
                                "cvttsd2si\t{$src, $dst|$dst, $src}",
                                SSE_CVT_SD2SI>,
                                XD, VEX, VEX_W, VEX_LIG;

def : InstAlias<"vcvttss2si{l}\t{$src, $dst|$dst, $src}",
                (VCVTTSS2SIrr GR32:$dst, FR32:$src), 0>;
def : InstAlias<"vcvttss2si{l}\t{$src, $dst|$dst, $src}",
                (VCVTTSS2SIrm GR32:$dst, f32mem:$src), 0>;
def : InstAlias<"vcvttsd2si{l}\t{$src, $dst|$dst, $src}",
                (VCVTTSD2SIrr GR32:$dst, FR64:$src), 0>;
def : InstAlias<"vcvttsd2si{l}\t{$src, $dst|$dst, $src}",
                (VCVTTSD2SIrm GR32:$dst, f64mem:$src), 0>;
def : InstAlias<"vcvttss2si{q}\t{$src, $dst|$dst, $src}",
                (VCVTTSS2SI64rr GR64:$dst, FR32:$src), 0>;
def : InstAlias<"vcvttss2si{q}\t{$src, $dst|$dst, $src}",
                (VCVTTSS2SI64rm GR64:$dst, f32mem:$src), 0>;
def : InstAlias<"vcvttsd2si{q}\t{$src, $dst|$dst, $src}",
                (VCVTTSD2SI64rr GR64:$dst, FR64:$src), 0>;
def : InstAlias<"vcvttsd2si{q}\t{$src, $dst|$dst, $src}",
                (VCVTTSD2SI64rm GR64:$dst, f64mem:$src), 0>;
}
1541// The assembler can recognize rr 64-bit instructions by seeing a rxx
1542// register, but the same isn't true when only using memory operands,
1543// provide other assembly "l" and "q" forms to address this explicitly
1544// where appropriate to do so.
// AVX three-operand int-to-float conversions. The asm mnemonics carry
// explicit {l}/{q} suffixes since the memory forms alone don't disambiguate
// operand size (see comment above).
defm VCVTSI2SS   : sse12_vcvt_avx<0x2A, GR32, FR32, i32mem, "cvtsi2ss{l}">,
                                  XS, VEX_4V, VEX_LIG;
defm VCVTSI2SS64 : sse12_vcvt_avx<0x2A, GR64, FR32, i64mem, "cvtsi2ss{q}">,
                                  XS, VEX_4V, VEX_W, VEX_LIG;
defm VCVTSI2SD   : sse12_vcvt_avx<0x2A, GR32, FR64, i32mem, "cvtsi2sd{l}">,
                                  XD, VEX_4V, VEX_LIG;
defm VCVTSI2SD64 : sse12_vcvt_avx<0x2A, GR64, FR64, i64mem, "cvtsi2sd{q}">,
                                  XD, VEX_4V, VEX_W, VEX_LIG;
1553
// Suffix-less aliases for the AVX forms, plus sint_to_fp selection patterns.
// The patterns feed IMPLICIT_DEF as the pass-through $src1 since only the
// low element's value matters to the scalar result.
// NOTE(review): the aliases use FR64 operand classes even for the SS
// instruction — looks intentional for parsing, but confirm.
let Predicates = [UseAVX] in {
  def : InstAlias<"vcvtsi2ss\t{$src, $src1, $dst|$dst, $src1, $src}",
                (VCVTSI2SSrm FR64:$dst, FR64:$src1, i32mem:$src), 0>;
  def : InstAlias<"vcvtsi2sd\t{$src, $src1, $dst|$dst, $src1, $src}",
                (VCVTSI2SDrm FR64:$dst, FR64:$src1, i32mem:$src), 0>;

  def : Pat<(f32 (sint_to_fp (loadi32 addr:$src))),
            (VCVTSI2SSrm (f32 (IMPLICIT_DEF)), addr:$src)>;
  def : Pat<(f32 (sint_to_fp (loadi64 addr:$src))),
            (VCVTSI2SS64rm (f32 (IMPLICIT_DEF)), addr:$src)>;
  def : Pat<(f64 (sint_to_fp (loadi32 addr:$src))),
            (VCVTSI2SDrm (f64 (IMPLICIT_DEF)), addr:$src)>;
  def : Pat<(f64 (sint_to_fp (loadi64 addr:$src))),
            (VCVTSI2SD64rm (f64 (IMPLICIT_DEF)), addr:$src)>;

  def : Pat<(f32 (sint_to_fp GR32:$src)),
            (VCVTSI2SSrr (f32 (IMPLICIT_DEF)), GR32:$src)>;
  def : Pat<(f32 (sint_to_fp GR64:$src)),
            (VCVTSI2SS64rr (f32 (IMPLICIT_DEF)), GR64:$src)>;
  def : Pat<(f64 (sint_to_fp GR32:$src)),
            (VCVTSI2SDrr (f64 (IMPLICIT_DEF)), GR32:$src)>;
  def : Pat<(f64 (sint_to_fp GR64:$src)),
            (VCVTSI2SD64rr (f64 (IMPLICIT_DEF)), GR64:$src)>;
}
1578
// Non-VEX SSE encodings: truncating fp_to_sint (0x2C) and sint_to_fp (0x2A)
// scalar conversions; REX_W selects the 64-bit GPR variants.
defm CVTTSS2SI : sse12_cvt_s<0x2C, FR32, GR32, fp_to_sint, f32mem, loadf32,
                      "cvttss2si\t{$src, $dst|$dst, $src}",
                      SSE_CVT_SS2SI_32>, XS;
defm CVTTSS2SI64 : sse12_cvt_s<0x2C, FR32, GR64, fp_to_sint, f32mem, loadf32,
                      "cvttss2si\t{$src, $dst|$dst, $src}",
                      SSE_CVT_SS2SI_64>, XS, REX_W;
defm CVTTSD2SI : sse12_cvt_s<0x2C, FR64, GR32, fp_to_sint, f64mem, loadf64,
                      "cvttsd2si\t{$src, $dst|$dst, $src}",
                      SSE_CVT_SD2SI>, XD;
defm CVTTSD2SI64 : sse12_cvt_s<0x2C, FR64, GR64, fp_to_sint, f64mem, loadf64,
                      "cvttsd2si\t{$src, $dst|$dst, $src}",
                      SSE_CVT_SD2SI>, XD, REX_W;
defm CVTSI2SS  : sse12_cvt_s<0x2A, GR32, FR32, sint_to_fp, i32mem, loadi32,
                      "cvtsi2ss{l}\t{$src, $dst|$dst, $src}",
                      SSE_CVT_Scalar>, XS;
defm CVTSI2SS64 : sse12_cvt_s<0x2A, GR64, FR32, sint_to_fp, i64mem, loadi64,
                      "cvtsi2ss{q}\t{$src, $dst|$dst, $src}",
                      SSE_CVT_Scalar>, XS, REX_W;
defm CVTSI2SD  : sse12_cvt_s<0x2A, GR32, FR64, sint_to_fp, i32mem, loadi32,
                      "cvtsi2sd{l}\t{$src, $dst|$dst, $src}",
                      SSE_CVT_Scalar>, XD;
defm CVTSI2SD64 : sse12_cvt_s<0x2A, GR64, FR64, sint_to_fp, i64mem, loadi64,
                      "cvtsi2sd{q}\t{$src, $dst|$dst, $src}",
                      SSE_CVT_Scalar>, XD, REX_W;
1603
// Parse-only aliases (trailing 0 suppresses printing) accepting explicit
// {l}/{q} suffixes for the non-VEX truncating conversions, and suffix-less
// spellings of cvtsi2ss/cvtsi2sd with a 32-bit memory operand.
def : InstAlias<"cvttss2si{l}\t{$src, $dst|$dst, $src}",
                (CVTTSS2SIrr GR32:$dst, FR32:$src), 0>;
def : InstAlias<"cvttss2si{l}\t{$src, $dst|$dst, $src}",
                (CVTTSS2SIrm GR32:$dst, f32mem:$src), 0>;
def : InstAlias<"cvttsd2si{l}\t{$src, $dst|$dst, $src}",
                (CVTTSD2SIrr GR32:$dst, FR64:$src), 0>;
def : InstAlias<"cvttsd2si{l}\t{$src, $dst|$dst, $src}",
                (CVTTSD2SIrm GR32:$dst, f64mem:$src), 0>;
def : InstAlias<"cvttss2si{q}\t{$src, $dst|$dst, $src}",
                (CVTTSS2SI64rr GR64:$dst, FR32:$src), 0>;
def : InstAlias<"cvttss2si{q}\t{$src, $dst|$dst, $src}",
                (CVTTSS2SI64rm GR64:$dst, f32mem:$src), 0>;
def : InstAlias<"cvttsd2si{q}\t{$src, $dst|$dst, $src}",
                (CVTTSD2SI64rr GR64:$dst, FR64:$src), 0>;
def : InstAlias<"cvttsd2si{q}\t{$src, $dst|$dst, $src}",
                (CVTTSD2SI64rm GR64:$dst, f64mem:$src), 0>;

def : InstAlias<"cvtsi2ss\t{$src, $dst|$dst, $src}",
                (CVTSI2SSrm FR64:$dst, i32mem:$src), 0>;
def : InstAlias<"cvtsi2sd\t{$src, $dst|$dst, $src}",
                (CVTSI2SDrm FR64:$dst, i32mem:$src), 0>;
1625
1626// Conversion Instructions Intrinsics - Match intrinsics which expect MM
1627// and/or XMM operand(s).
1628
// Intrinsic-form scalar conversion multiclass: like sse12_cvt_s, but the
// pattern calls an Intrinsic and the memory form matches through a
// ComplexPattern (sse_load_f32/f64) instead of a simple PatFrag.
multiclass sse12_cvt_sint<bits<8> opc, RegisterClass SrcRC, RegisterClass DstRC,
                         Intrinsic Int, Operand memop, ComplexPattern mem_cpat,
                         string asm, OpndItins itins> {
  def rr : SI<opc, MRMSrcReg, (outs DstRC:$dst), (ins SrcRC:$src),
              !strconcat(asm, "\t{$src, $dst|$dst, $src}"),
              [(set DstRC:$dst, (Int SrcRC:$src))], itins.rr>,
           Sched<[itins.Sched]>;
  def rm : SI<opc, MRMSrcMem, (outs DstRC:$dst), (ins memop:$src),
              !strconcat(asm, "\t{$src, $dst|$dst, $src}"),
              [(set DstRC:$dst, (Int mem_cpat:$src))], itins.rm>,
           Sched<[itins.Sched.Folded]>;
}
1641
// Intrinsic-form conversion with a pass-through source operand. Is2Addr
// selects between the tied two-address SSE asm string and the
// three-operand AVX string.
multiclass sse12_cvt_sint_3addr<bits<8> opc, RegisterClass SrcRC,
                    RegisterClass DstRC, Intrinsic Int, X86MemOperand x86memop,
                    PatFrag ld_frag, string asm, OpndItins itins,
                    bit Is2Addr = 1> {
  def rr : SI<opc, MRMSrcReg, (outs DstRC:$dst), (ins DstRC:$src1, SrcRC:$src2),
              !if(Is2Addr,
                  !strconcat(asm, "\t{$src2, $dst|$dst, $src2}"),
                  !strconcat(asm, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
              [(set DstRC:$dst, (Int DstRC:$src1, SrcRC:$src2))],
              itins.rr>, Sched<[itins.Sched]>;
  def rm : SI<opc, MRMSrcMem, (outs DstRC:$dst),
              (ins DstRC:$src1, x86memop:$src2),
              !if(Is2Addr,
                  !strconcat(asm, "\t{$src2, $dst|$dst, $src2}"),
                  !strconcat(asm, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
              [(set DstRC:$dst, (Int DstRC:$src1, (ld_frag addr:$src2)))],
              itins.rm>, Sched<[itins.Sched.Folded, ReadAfterLd]>;
}
1660
// Rounding (non-truncating) cvtsd2si intrinsic forms, AVX and SSE encodings.
let Predicates = [UseAVX] in {
defm VCVTSD2SI : sse12_cvt_sint<0x2D, VR128, GR32,
                  int_x86_sse2_cvtsd2si, sdmem, sse_load_f64, "cvtsd2si",
                  SSE_CVT_SD2SI>, XD, VEX, VEX_LIG;
defm VCVTSD2SI64 : sse12_cvt_sint<0x2D, VR128, GR64,
                    int_x86_sse2_cvtsd2si64, sdmem, sse_load_f64, "cvtsd2si",
                    SSE_CVT_SD2SI>, XD, VEX, VEX_W, VEX_LIG;
}
defm CVTSD2SI : sse12_cvt_sint<0x2D, VR128, GR32, int_x86_sse2_cvtsd2si,
                 sdmem, sse_load_f64, "cvtsd2si", SSE_CVT_SD2SI>, XD;
defm CVTSD2SI64 : sse12_cvt_sint<0x2D, VR128, GR64, int_x86_sse2_cvtsd2si64,
                   sdmem, sse_load_f64, "cvtsd2si", SSE_CVT_SD2SI>, XD, REX_W;
1673
1674
// Intrinsic-only cvtsi2ss/cvtsi2sd forms (isCodeGenOnly: never parsed or
// printed, only selected for the int_x86_sse* intrinsics). AVX variants are
// three-operand (Is2Addr = 0); SSE variants tie $src1 to $dst.
let isCodeGenOnly = 1 in {
  let Predicates = [UseAVX] in {
  defm Int_VCVTSI2SS : sse12_cvt_sint_3addr<0x2A, GR32, VR128,
            int_x86_sse_cvtsi2ss, i32mem, loadi32, "cvtsi2ss{l}",
            SSE_CVT_Scalar, 0>, XS, VEX_4V;
  defm Int_VCVTSI2SS64 : sse12_cvt_sint_3addr<0x2A, GR64, VR128,
            int_x86_sse_cvtsi642ss, i64mem, loadi64, "cvtsi2ss{q}",
            SSE_CVT_Scalar, 0>, XS, VEX_4V,
            VEX_W;
  defm Int_VCVTSI2SD : sse12_cvt_sint_3addr<0x2A, GR32, VR128,
            int_x86_sse2_cvtsi2sd, i32mem, loadi32, "cvtsi2sd{l}",
            SSE_CVT_Scalar, 0>, XD, VEX_4V;
  defm Int_VCVTSI2SD64 : sse12_cvt_sint_3addr<0x2A, GR64, VR128,
            int_x86_sse2_cvtsi642sd, i64mem, loadi64, "cvtsi2sd{q}",
            SSE_CVT_Scalar, 0>, XD,
            VEX_4V, VEX_W;
  }
  let Constraints = "$src1 = $dst" in {
    defm Int_CVTSI2SS : sse12_cvt_sint_3addr<0x2A, GR32, VR128,
                          int_x86_sse_cvtsi2ss, i32mem, loadi32,
                          "cvtsi2ss{l}", SSE_CVT_Scalar>, XS;
    defm Int_CVTSI2SS64 : sse12_cvt_sint_3addr<0x2A, GR64, VR128,
                          int_x86_sse_cvtsi642ss, i64mem, loadi64,
                          "cvtsi2ss{q}", SSE_CVT_Scalar>, XS, REX_W;
    defm Int_CVTSI2SD : sse12_cvt_sint_3addr<0x2A, GR32, VR128,
                          int_x86_sse2_cvtsi2sd, i32mem, loadi32,
                          "cvtsi2sd{l}", SSE_CVT_Scalar>, XD;
    defm Int_CVTSI2SD64 : sse12_cvt_sint_3addr<0x2A, GR64, VR128,
                          int_x86_sse2_cvtsi642sd, i64mem, loadi64,
                          "cvtsi2sd{q}", SSE_CVT_Scalar>, XD, REX_W;
  }
} // isCodeGenOnly = 1
1707
1708/// SSE 1 Only
1709
1710// Aliases for intrinsics
// Intrinsic-only truncating conversions (selected for int_x86_*_cvttss2si /
// cvttsd2si); isCodeGenOnly keeps them out of the assembler tables since
// the FR32/FR64 forms above own the mnemonics.
let isCodeGenOnly = 1 in {
let Predicates = [UseAVX] in {
defm Int_VCVTTSS2SI : sse12_cvt_sint<0x2C, VR128, GR32, int_x86_sse_cvttss2si,
                                    ssmem, sse_load_f32, "cvttss2si",
                                    SSE_CVT_SS2SI_32>, XS, VEX;
defm Int_VCVTTSS2SI64 : sse12_cvt_sint<0x2C, VR128, GR64,
                                   int_x86_sse_cvttss2si64, ssmem, sse_load_f32,
                                   "cvttss2si", SSE_CVT_SS2SI_64>,
                                   XS, VEX, VEX_W;
defm Int_VCVTTSD2SI : sse12_cvt_sint<0x2C, VR128, GR32, int_x86_sse2_cvttsd2si,
                                    sdmem, sse_load_f64, "cvttsd2si",
                                    SSE_CVT_SD2SI>, XD, VEX;
defm Int_VCVTTSD2SI64 : sse12_cvt_sint<0x2C, VR128, GR64,
                                  int_x86_sse2_cvttsd2si64, sdmem, sse_load_f64,
                                  "cvttsd2si", SSE_CVT_SD2SI>,
                                  XD, VEX, VEX_W;
}
defm Int_CVTTSS2SI : sse12_cvt_sint<0x2C, VR128, GR32, int_x86_sse_cvttss2si,
                                    ssmem, sse_load_f32, "cvttss2si",
                                    SSE_CVT_SS2SI_32>, XS;
defm Int_CVTTSS2SI64 : sse12_cvt_sint<0x2C, VR128, GR64,
                                   int_x86_sse_cvttss2si64, ssmem, sse_load_f32,
                                   "cvttss2si", SSE_CVT_SS2SI_64>, XS, REX_W;
defm Int_CVTTSD2SI : sse12_cvt_sint<0x2C, VR128, GR32, int_x86_sse2_cvttsd2si,
                                    sdmem, sse_load_f64, "cvttsd2si",
                                    SSE_CVT_SD2SI>, XD;
defm Int_CVTTSD2SI64 : sse12_cvt_sint<0x2C, VR128, GR64,
                                  int_x86_sse2_cvttsd2si64, sdmem, sse_load_f64,
                                  "cvttsd2si", SSE_CVT_SD2SI>, XD, REX_W;
} // isCodeGenOnly = 1
1741
// Rounding cvtss2si intrinsic forms, AVX and SSE encodings.
let Predicates = [UseAVX] in {
defm VCVTSS2SI   : sse12_cvt_sint<0x2D, VR128, GR32, int_x86_sse_cvtss2si,
                                  ssmem, sse_load_f32, "cvtss2si",
                                  SSE_CVT_SS2SI_32>, XS, VEX, VEX_LIG;
defm VCVTSS2SI64 : sse12_cvt_sint<0x2D, VR128, GR64, int_x86_sse_cvtss2si64,
                                  ssmem, sse_load_f32, "cvtss2si",
                                  SSE_CVT_SS2SI_64>, XS, VEX, VEX_W, VEX_LIG;
}
defm CVTSS2SI : sse12_cvt_sint<0x2D, VR128, GR32, int_x86_sse_cvtss2si,
                               ssmem, sse_load_f32, "cvtss2si",
                               SSE_CVT_SS2SI_32>, XS;
defm CVTSS2SI64 : sse12_cvt_sint<0x2D, VR128, GR64, int_x86_sse_cvtss2si64,
                                 ssmem, sse_load_f32, "cvtss2si",
                                 SSE_CVT_SS2SI_64>, XS, REX_W;
1756
// Packed i32 -> f32 conversions. cvtdq2ps is an SSE2 instruction despite the
// PS prefix byte, hence Requires<[UseSSE2]> on the non-VEX form.
defm VCVTDQ2PS   : sse12_cvt_p<0x5B, VR128, VR128, i128mem,
                               "vcvtdq2ps\t{$src, $dst|$dst, $src}",
                               SSEPackedSingle, SSE_CVT_PS>,
                               PS, VEX, Requires<[HasAVX]>;
defm VCVTDQ2PSY  : sse12_cvt_p<0x5B, VR256, VR256, i256mem,
                               "vcvtdq2ps\t{$src, $dst|$dst, $src}",
                               SSEPackedSingle, SSE_CVT_PS>,
                               PS, VEX, VEX_L, Requires<[HasAVX]>;

defm CVTDQ2PS : sse12_cvt_p<0x5B, VR128, VR128, i128mem,
                            "cvtdq2ps\t{$src, $dst|$dst, $src}",
                            SSEPackedSingle, SSE_CVT_PS>,
                            PS, Requires<[UseSSE2]>;
1770
// Parse-only {l}/{q}-suffixed aliases for the AVX rounding conversions
// (trailing 0 suppresses them for asm printing).
let Predicates = [UseAVX] in {
def : InstAlias<"vcvtss2si{l}\t{$src, $dst|$dst, $src}",
                (VCVTSS2SIrr GR32:$dst, VR128:$src), 0>;
def : InstAlias<"vcvtss2si{l}\t{$src, $dst|$dst, $src}",
                (VCVTSS2SIrm GR32:$dst, ssmem:$src), 0>;
def : InstAlias<"vcvtsd2si{l}\t{$src, $dst|$dst, $src}",
                (VCVTSD2SIrr GR32:$dst, VR128:$src), 0>;
def : InstAlias<"vcvtsd2si{l}\t{$src, $dst|$dst, $src}",
                (VCVTSD2SIrm GR32:$dst, sdmem:$src), 0>;
def : InstAlias<"vcvtss2si{q}\t{$src, $dst|$dst, $src}",
                (VCVTSS2SI64rr GR64:$dst, VR128:$src), 0>;
def : InstAlias<"vcvtss2si{q}\t{$src, $dst|$dst, $src}",
                (VCVTSS2SI64rm GR64:$dst, ssmem:$src), 0>;
def : InstAlias<"vcvtsd2si{q}\t{$src, $dst|$dst, $src}",
                (VCVTSD2SI64rr GR64:$dst, VR128:$src), 0>;
def : InstAlias<"vcvtsd2si{q}\t{$src, $dst|$dst, $src}",
                (VCVTSD2SI64rm GR64:$dst, sdmem:$src), 0>;
}
1789
// Parse-only {l}/{q}-suffixed aliases for the non-VEX rounding conversions.
// Every alias passes EmitAlias = 0 so the printer keeps the canonical
// suffix-less mnemonic.
def : InstAlias<"cvtss2si{l}\t{$src, $dst|$dst, $src}",
                (CVTSS2SIrr GR32:$dst, VR128:$src), 0>;
def : InstAlias<"cvtss2si{l}\t{$src, $dst|$dst, $src}",
                (CVTSS2SIrm GR32:$dst, ssmem:$src), 0>;
def : InstAlias<"cvtsd2si{l}\t{$src, $dst|$dst, $src}",
                (CVTSD2SIrr GR32:$dst, VR128:$src), 0>;
def : InstAlias<"cvtsd2si{l}\t{$src, $dst|$dst, $src}",
                (CVTSD2SIrm GR32:$dst, sdmem:$src), 0>;
def : InstAlias<"cvtss2si{q}\t{$src, $dst|$dst, $src}",
                (CVTSS2SI64rr GR64:$dst, VR128:$src), 0>;
def : InstAlias<"cvtss2si{q}\t{$src, $dst|$dst, $src}",
                (CVTSS2SI64rm GR64:$dst, ssmem:$src), 0>;
def : InstAlias<"cvtsd2si{q}\t{$src, $dst|$dst, $src}",
                (CVTSD2SI64rr GR64:$dst, VR128:$src), 0>;
// FIX: this alias was missing the trailing ", 0" (EmitAlias suppression)
// that every sibling alias in this group passes; without it the asm printer
// would emit the suffixed alias form for CVTSD2SI64rm.
def : InstAlias<"cvtsd2si{q}\t{$src, $dst|$dst, $src}",
                (CVTSD2SI64rm GR64:$dst, sdmem:$src), 0>;
1806
1807/// SSE 2 Only
1808
1809// Convert scalar double to scalar single
// AVX scalar double -> single conversion, patternless register forms; the
// fround pattern below maps onto the rr form. The rm form is only selected
// when optimizing for size (OptForSize) — the load is not otherwise folded.
let hasSideEffects = 0, Predicates = [UseAVX] in {
def VCVTSD2SSrr  : VSDI<0x5A, MRMSrcReg, (outs FR32:$dst),
                       (ins FR64:$src1, FR64:$src2),
                      "cvtsd2ss\t{$src2, $src1, $dst|$dst, $src1, $src2}", [],
                      IIC_SSE_CVT_Scalar_RR>, VEX_4V, VEX_LIG,
                      Sched<[WriteCvtF2F]>;
let mayLoad = 1 in
def VCVTSD2SSrm  : I<0x5A, MRMSrcMem, (outs FR32:$dst),
                       (ins FR64:$src1, f64mem:$src2),
                      "vcvtsd2ss\t{$src2, $src1, $dst|$dst, $src1, $src2}",
                      [], IIC_SSE_CVT_Scalar_RM>,
                      XD, Requires<[HasAVX, OptForSize]>, VEX_4V, VEX_LIG,
                      Sched<[WriteCvtF2FLd, ReadAfterLd]>;
}

// Duplicate the source into both operands; only the low element matters.
def : Pat<(f32 (fround FR64:$src)), (VCVTSD2SSrr FR64:$src, FR64:$src)>,
          Requires<[UseAVX]>;
1827
// SSE2 scalar double -> single; the memory form folds the load only under
// OptForSize, matching the AVX policy above.
def CVTSD2SSrr  : SDI<0x5A, MRMSrcReg, (outs FR32:$dst), (ins FR64:$src),
                      "cvtsd2ss\t{$src, $dst|$dst, $src}",
                      [(set FR32:$dst, (fround FR64:$src))],
                      IIC_SSE_CVT_Scalar_RR>, Sched<[WriteCvtF2F]>;
def CVTSD2SSrm  : I<0x5A, MRMSrcMem, (outs FR32:$dst), (ins f64mem:$src),
                      "cvtsd2ss\t{$src, $dst|$dst, $src}",
                      [(set FR32:$dst, (fround (loadf64 addr:$src)))],
                      IIC_SSE_CVT_Scalar_RM>,
                      XD,
                  Requires<[UseSSE2, OptForSize]>, Sched<[WriteCvtF2FLd]>;
1838
// Intrinsic-only cvtsd2ss forms operating on full VR128 registers
// (isCodeGenOnly: selected for int_x86_sse2_cvtsd2ss, never assembled).
let isCodeGenOnly = 1 in {
def Int_VCVTSD2SSrr: I<0x5A, MRMSrcReg,
                       (outs VR128:$dst), (ins VR128:$src1, VR128:$src2),
                       "vcvtsd2ss\t{$src2, $src1, $dst|$dst, $src1, $src2}",
                       [(set VR128:$dst,
                         (int_x86_sse2_cvtsd2ss VR128:$src1, VR128:$src2))],
                       IIC_SSE_CVT_Scalar_RR>, XD, VEX_4V, Requires<[HasAVX]>,
                       Sched<[WriteCvtF2F]>;
// FIX: the rm form takes a memory operand (sdmem:$src2) so its format must
// be MRMSrcMem, not MRMSrcReg; with MRMSrcReg the ModRM byte would be
// encoded as a register operand (compare Int_VCVTSS2SDrm below).
def Int_VCVTSD2SSrm: I<0x5A, MRMSrcMem,
                       (outs VR128:$dst), (ins VR128:$src1, sdmem:$src2),
                       "vcvtsd2ss\t{$src2, $src1, $dst|$dst, $src1, $src2}",
                       [(set VR128:$dst, (int_x86_sse2_cvtsd2ss
                                          VR128:$src1, sse_load_f64:$src2))],
                       IIC_SSE_CVT_Scalar_RM>, XD, VEX_4V, Requires<[HasAVX]>,
                       Sched<[WriteCvtF2FLd, ReadAfterLd]>;

let Constraints = "$src1 = $dst" in {
def Int_CVTSD2SSrr: I<0x5A, MRMSrcReg,
                       (outs VR128:$dst), (ins VR128:$src1, VR128:$src2),
                       "cvtsd2ss\t{$src2, $dst|$dst, $src2}",
                       [(set VR128:$dst,
                         (int_x86_sse2_cvtsd2ss VR128:$src1, VR128:$src2))],
                       IIC_SSE_CVT_Scalar_RR>, XD, Requires<[UseSSE2]>,
                       Sched<[WriteCvtF2F]>;
// FIX: same memory-form format correction as Int_VCVTSD2SSrm above.
def Int_CVTSD2SSrm: I<0x5A, MRMSrcMem,
                       (outs VR128:$dst), (ins VR128:$src1, sdmem:$src2),
                       "cvtsd2ss\t{$src2, $dst|$dst, $src2}",
                       [(set VR128:$dst, (int_x86_sse2_cvtsd2ss
                                          VR128:$src1, sse_load_f64:$src2))],
                       IIC_SSE_CVT_Scalar_RM>, XD, Requires<[UseSSE2]>,
                       Sched<[WriteCvtF2FLd, ReadAfterLd]>;
}
} // isCodeGenOnly = 1
1872
1873// Convert scalar single to scalar double
1874// SSE2 instructions with XS prefix
// AVX scalar single -> double conversion, patternless forms; selection is
// handled by the fextend/extloadf32 patterns that follow. The rm form is
// restricted to OptForSize, mirroring the cvtsd2ss policy.
let hasSideEffects = 0, Predicates = [UseAVX] in {
def VCVTSS2SDrr : I<0x5A, MRMSrcReg, (outs FR64:$dst),
                    (ins FR32:$src1, FR32:$src2),
                    "vcvtss2sd\t{$src2, $src1, $dst|$dst, $src1, $src2}",
                    [], IIC_SSE_CVT_Scalar_RR>,
                    XS, Requires<[HasAVX]>, VEX_4V, VEX_LIG,
                    Sched<[WriteCvtF2F]>;
let mayLoad = 1 in
def VCVTSS2SDrm : I<0x5A, MRMSrcMem, (outs FR64:$dst),
                    (ins FR32:$src1, f32mem:$src2),
                    "vcvtss2sd\t{$src2, $src1, $dst|$dst, $src1, $src2}",
                    [], IIC_SSE_CVT_Scalar_RM>,
                    XS, VEX_4V, VEX_LIG, Requires<[HasAVX, OptForSize]>,
                    Sched<[WriteCvtF2FLd, ReadAfterLd]>;
}
1890
// AVX f32 -> f64 extension patterns. extloadf32 folds the load into the
// conversion only under OptForSize; under OptForSpeed the load is kept
// separate (VMOVSSrm) so it can be scheduled independently.
def : Pat<(f64 (fextend FR32:$src)),
    (VCVTSS2SDrr FR32:$src, FR32:$src)>, Requires<[UseAVX]>;
def : Pat<(fextend (loadf32 addr:$src)),
    (VCVTSS2SDrm (f32 (IMPLICIT_DEF)), addr:$src)>, Requires<[UseAVX]>;

def : Pat<(extloadf32 addr:$src),
    (VCVTSS2SDrm (f32 (IMPLICIT_DEF)), addr:$src)>,
    Requires<[UseAVX, OptForSize]>;
def : Pat<(extloadf32 addr:$src),
    (VCVTSS2SDrr (f32 (IMPLICIT_DEF)), (VMOVSSrm addr:$src))>,
    Requires<[UseAVX, OptForSpeed]>;
1902
// SSE2 scalar single -> double; memory form folds the extending load only
// under OptForSize.
def CVTSS2SDrr : I<0x5A, MRMSrcReg, (outs FR64:$dst), (ins FR32:$src),
                   "cvtss2sd\t{$src, $dst|$dst, $src}",
                   [(set FR64:$dst, (fextend FR32:$src))],
                   IIC_SSE_CVT_Scalar_RR>, XS,
                 Requires<[UseSSE2]>, Sched<[WriteCvtF2F]>;
def CVTSS2SDrm : I<0x5A, MRMSrcMem, (outs FR64:$dst), (ins f32mem:$src),
                   "cvtss2sd\t{$src, $dst|$dst, $src}",
                   [(set FR64:$dst, (extloadf32 addr:$src))],
                   IIC_SSE_CVT_Scalar_RM>, XS,
                 Requires<[UseSSE2, OptForSize]>, Sched<[WriteCvtF2FLd]>;
1913
// extload f32 -> f64.  This matches load+fextend because we have a hack in
// the isel (PreprocessForFPConvert) that can introduce loads after dag
// combine.
// Since these loads aren't folded into the fextend, we have to match it
// explicitly here.
def : Pat<(fextend (loadf32 addr:$src)),
          (CVTSS2SDrm addr:$src)>, Requires<[UseSSE2]>;
// Under OptForSpeed, keep the load separate (MOVSSrm) and convert in-reg.
def : Pat<(extloadf32 addr:$src),
          (CVTSS2SDrr (MOVSSrm addr:$src))>, Requires<[UseSSE2, OptForSpeed]>;
1923
// Intrinsic (XMM-operand) forms of cvtss2sd, selected only for the
// int_x86_sse2_cvtss2sd intrinsic.  isCodeGenOnly keeps them out of the
// assembler/disassembler tables since they duplicate the encodings above.
let isCodeGenOnly = 1 in {
def Int_VCVTSS2SDrr: I<0x5A, MRMSrcReg,
                      (outs VR128:$dst), (ins VR128:$src1, VR128:$src2),
                    "vcvtss2sd\t{$src2, $src1, $dst|$dst, $src1, $src2}",
                    [(set VR128:$dst,
                      (int_x86_sse2_cvtss2sd VR128:$src1, VR128:$src2))],
                    IIC_SSE_CVT_Scalar_RR>, XS, VEX_4V, Requires<[HasAVX]>,
                    Sched<[WriteCvtF2F]>;
def Int_VCVTSS2SDrm: I<0x5A, MRMSrcMem,
                      (outs VR128:$dst), (ins VR128:$src1, ssmem:$src2),
                    "vcvtss2sd\t{$src2, $src1, $dst|$dst, $src1, $src2}",
                    [(set VR128:$dst,
                      (int_x86_sse2_cvtss2sd VR128:$src1, sse_load_f32:$src2))],
                    IIC_SSE_CVT_Scalar_RM>, XS, VEX_4V, Requires<[HasAVX]>,
                    Sched<[WriteCvtF2FLd, ReadAfterLd]>;
let Constraints = "$src1 = $dst" in { // SSE2 instructions with XS prefix
def Int_CVTSS2SDrr: I<0x5A, MRMSrcReg,
                      (outs VR128:$dst), (ins VR128:$src1, VR128:$src2),
                    "cvtss2sd\t{$src2, $dst|$dst, $src2}",
                    [(set VR128:$dst,
                      (int_x86_sse2_cvtss2sd VR128:$src1, VR128:$src2))],
                    IIC_SSE_CVT_Scalar_RR>, XS, Requires<[UseSSE2]>,
                    Sched<[WriteCvtF2F]>;
def Int_CVTSS2SDrm: I<0x5A, MRMSrcMem,
                      (outs VR128:$dst), (ins VR128:$src1, ssmem:$src2),
                    "cvtss2sd\t{$src2, $dst|$dst, $src2}",
                    [(set VR128:$dst,
                      (int_x86_sse2_cvtss2sd VR128:$src1, sse_load_f32:$src2))],
                    IIC_SSE_CVT_Scalar_RM>, XS, Requires<[UseSSE2]>,
                    Sched<[WriteCvtF2FLd, ReadAfterLd]>;
}
} // isCodeGenOnly = 1
1956
// Convert packed single/double fp to doubleword
// AVX forms use the unaligned loadv* frags; the SSE forms below require
// aligned loads (memopv*) per the legacy-SSE memory-operand rules.
def VCVTPS2DQrr : VPDI<0x5B, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
                       "cvtps2dq\t{$src, $dst|$dst, $src}",
                       [(set VR128:$dst, (int_x86_sse2_cvtps2dq VR128:$src))],
                       IIC_SSE_CVT_PS_RR>, VEX, Sched<[WriteCvtF2I]>;
def VCVTPS2DQrm : VPDI<0x5B, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
                       "cvtps2dq\t{$src, $dst|$dst, $src}",
                       [(set VR128:$dst,
                         (int_x86_sse2_cvtps2dq (loadv4f32 addr:$src)))],
                       IIC_SSE_CVT_PS_RM>, VEX, Sched<[WriteCvtF2ILd]>;
def VCVTPS2DQYrr : VPDI<0x5B, MRMSrcReg, (outs VR256:$dst), (ins VR256:$src),
                        "cvtps2dq\t{$src, $dst|$dst, $src}",
                        [(set VR256:$dst,
                          (int_x86_avx_cvt_ps2dq_256 VR256:$src))],
                        IIC_SSE_CVT_PS_RR>, VEX, VEX_L, Sched<[WriteCvtF2I]>;
def VCVTPS2DQYrm : VPDI<0x5B, MRMSrcMem, (outs VR256:$dst), (ins f256mem:$src),
                        "cvtps2dq\t{$src, $dst|$dst, $src}",
                        [(set VR256:$dst,
                          (int_x86_avx_cvt_ps2dq_256 (loadv8f32 addr:$src)))],
                        IIC_SSE_CVT_PS_RM>, VEX, VEX_L, Sched<[WriteCvtF2ILd]>;
def CVTPS2DQrr : PDI<0x5B, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
                     "cvtps2dq\t{$src, $dst|$dst, $src}",
                     [(set VR128:$dst, (int_x86_sse2_cvtps2dq VR128:$src))],
                     IIC_SSE_CVT_PS_RR>, Sched<[WriteCvtF2I]>;
def CVTPS2DQrm : PDI<0x5B, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
                     "cvtps2dq\t{$src, $dst|$dst, $src}",
                     [(set VR128:$dst,
                       (int_x86_sse2_cvtps2dq (memopv4f32 addr:$src)))],
                     IIC_SSE_CVT_PS_RM>, Sched<[WriteCvtF2ILd]>;
1986
1987
// Convert Packed Double FP to Packed DW Integers
let Predicates = [HasAVX] in {
// The assembler can recognize rr 256-bit instructions by seeing a ymm
// register, but the same isn't true when using memory operands instead.
// Provide other assembly rr and rm forms to address this explicitly.
def VCVTPD2DQrr  : SDI<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
                       "vcvtpd2dq\t{$src, $dst|$dst, $src}",
                       [(set VR128:$dst, (int_x86_sse2_cvtpd2dq VR128:$src))]>,
                       VEX, Sched<[WriteCvtF2I]>;

// XMM only
// "x" suffix forms disambiguate the 128-bit memory operand for the
// assembler; the alias accepts the suffixed mnemonic for the rr form too.
def : InstAlias<"vcvtpd2dqx\t{$src, $dst|$dst, $src}",
                (VCVTPD2DQrr VR128:$dst, VR128:$src), 0>;
def VCVTPD2DQXrm : SDI<0xE6, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
                       "vcvtpd2dqx\t{$src, $dst|$dst, $src}",
                       [(set VR128:$dst,
                         (int_x86_sse2_cvtpd2dq (loadv2f64 addr:$src)))]>, VEX,
                       Sched<[WriteCvtF2ILd]>;

// YMM only
// 256-bit source narrows to a 128-bit integer result.
def VCVTPD2DQYrr : SDI<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR256:$src),
                       "vcvtpd2dq{y}\t{$src, $dst|$dst, $src}",
                       [(set VR128:$dst,
                         (int_x86_avx_cvt_pd2dq_256 VR256:$src))]>, VEX, VEX_L,
                       Sched<[WriteCvtF2I]>;
def VCVTPD2DQYrm : SDI<0xE6, MRMSrcMem, (outs VR128:$dst), (ins f256mem:$src),
                       "vcvtpd2dq{y}\t{$src, $dst|$dst, $src}",
                       [(set VR128:$dst,
                         (int_x86_avx_cvt_pd2dq_256 (loadv4f64 addr:$src)))]>,
                       VEX, VEX_L, Sched<[WriteCvtF2ILd]>;
def : InstAlias<"vcvtpd2dq\t{$src, $dst|$dst, $src}",
                (VCVTPD2DQYrr VR128:$dst, VR256:$src), 0>;
}

// Legacy SSE2 forms (memory operand must be aligned: memopv2f64).
def CVTPD2DQrm  : SDI<0xE6, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
                      "cvtpd2dq\t{$src, $dst|$dst, $src}",
                      [(set VR128:$dst,
                        (int_x86_sse2_cvtpd2dq (memopv2f64 addr:$src)))],
                      IIC_SSE_CVT_PD_RM>, Sched<[WriteCvtF2ILd]>;
def CVTPD2DQrr  : SDI<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
                      "cvtpd2dq\t{$src, $dst|$dst, $src}",
                      [(set VR128:$dst, (int_x86_sse2_cvtpd2dq VR128:$src))],
                      IIC_SSE_CVT_PD_RR>, Sched<[WriteCvtF2I]>;
2031
// Convert with truncation packed single/double fp to doubleword
// SSE2 packed instructions with XS prefix
def VCVTTPS2DQrr : VS2SI<0x5B, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
                         "cvttps2dq\t{$src, $dst|$dst, $src}",
                         [(set VR128:$dst,
                           (int_x86_sse2_cvttps2dq VR128:$src))],
                         IIC_SSE_CVT_PS_RR>, VEX, Sched<[WriteCvtF2I]>;
def VCVTTPS2DQrm : VS2SI<0x5B, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
                         "cvttps2dq\t{$src, $dst|$dst, $src}",
                         [(set VR128:$dst, (int_x86_sse2_cvttps2dq
                                            (loadv4f32 addr:$src)))],
                         IIC_SSE_CVT_PS_RM>, VEX, Sched<[WriteCvtF2ILd]>;
def VCVTTPS2DQYrr : VS2SI<0x5B, MRMSrcReg, (outs VR256:$dst), (ins VR256:$src),
                          "cvttps2dq\t{$src, $dst|$dst, $src}",
                          [(set VR256:$dst,
                            (int_x86_avx_cvtt_ps2dq_256 VR256:$src))],
                          IIC_SSE_CVT_PS_RR>, VEX, VEX_L, Sched<[WriteCvtF2I]>;
def VCVTTPS2DQYrm : VS2SI<0x5B, MRMSrcMem, (outs VR256:$dst), (ins f256mem:$src),
                          "cvttps2dq\t{$src, $dst|$dst, $src}",
                          [(set VR256:$dst, (int_x86_avx_cvtt_ps2dq_256
                                             (loadv8f32 addr:$src)))],
                          IIC_SSE_CVT_PS_RM>, VEX, VEX_L,
                          Sched<[WriteCvtF2ILd]>;

// Legacy SSE2 forms (aligned memory operand: memopv4f32).
def CVTTPS2DQrr : S2SI<0x5B, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
                       "cvttps2dq\t{$src, $dst|$dst, $src}",
                       [(set VR128:$dst, (int_x86_sse2_cvttps2dq VR128:$src))],
                       IIC_SSE_CVT_PS_RR>, Sched<[WriteCvtF2I]>;
def CVTTPS2DQrm : S2SI<0x5B, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
                       "cvttps2dq\t{$src, $dst|$dst, $src}",
                       [(set VR128:$dst,
                         (int_x86_sse2_cvttps2dq (memopv4f32 addr:$src)))],
                       IIC_SSE_CVT_PS_RM>, Sched<[WriteCvtF2ILd]>;
2065
// Map the cvtdq2ps intrinsic onto the AVX instructions.
let Predicates = [HasAVX] in {
  def : Pat<(int_x86_sse2_cvtdq2ps VR128:$src),
            (VCVTDQ2PSrr VR128:$src)>;
  def : Pat<(int_x86_sse2_cvtdq2ps (bc_v4i32 (loadv2i64 addr:$src))),
            (VCVTDQ2PSrm addr:$src)>;
}

// Generic sint_to_fp / fp_to_sint selection for AVX (guarded by NoVLX so
// the EVEX forms win when AVX-512VL is available).
let Predicates = [HasAVX, NoVLX] in {
  def : Pat<(v4f32 (sint_to_fp (v4i32 VR128:$src))),
            (VCVTDQ2PSrr VR128:$src)>;
  def : Pat<(v4f32 (sint_to_fp (bc_v4i32 (loadv2i64 addr:$src)))),
            (VCVTDQ2PSrm addr:$src)>;

  def : Pat<(v4i32 (fp_to_sint (v4f32 VR128:$src))),
            (VCVTTPS2DQrr VR128:$src)>;
  def : Pat<(v4i32 (fp_to_sint (loadv4f32 addr:$src))),
            (VCVTTPS2DQrm addr:$src)>;

  def : Pat<(v8f32 (sint_to_fp (v8i32 VR256:$src))),
            (VCVTDQ2PSYrr VR256:$src)>;
  def : Pat<(v8f32 (sint_to_fp (bc_v8i32 (loadv4i64 addr:$src)))),
            (VCVTDQ2PSYrm addr:$src)>;

  def : Pat<(v8i32 (fp_to_sint (v8f32 VR256:$src))),
            (VCVTTPS2DQYrr VR256:$src)>;
  def : Pat<(v8i32 (fp_to_sint (loadv8f32 addr:$src))),
            (VCVTTPS2DQYrm addr:$src)>;
}

// Same selections for the legacy SSE2 instructions.
let Predicates = [UseSSE2] in {
  def : Pat<(v4f32 (sint_to_fp (v4i32 VR128:$src))),
            (CVTDQ2PSrr VR128:$src)>;
  def : Pat<(v4f32 (sint_to_fp (bc_v4i32 (memopv2i64 addr:$src)))),
            (CVTDQ2PSrm addr:$src)>;

  def : Pat<(int_x86_sse2_cvtdq2ps VR128:$src),
            (CVTDQ2PSrr VR128:$src)>;
  def : Pat<(int_x86_sse2_cvtdq2ps (bc_v4i32 (memopv2i64 addr:$src))),
            (CVTDQ2PSrm addr:$src)>;

  def : Pat<(v4i32 (fp_to_sint (v4f32 VR128:$src))),
            (CVTTPS2DQrr VR128:$src)>;
  def : Pat<(v4i32 (fp_to_sint (memopv4f32 addr:$src))),
            (CVTTPS2DQrm addr:$src)>;
}
2111
// Truncating packed double->doubleword conversions.
def VCVTTPD2DQrr : VPDI<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
                        "cvttpd2dq\t{$src, $dst|$dst, $src}",
                        [(set VR128:$dst,
                              (int_x86_sse2_cvttpd2dq VR128:$src))],
                              IIC_SSE_CVT_PD_RR>, VEX, Sched<[WriteCvtF2I]>;

// The assembler can recognize rr 256-bit instructions by seeing a ymm
// register, but the same isn't true when using memory operands instead.
// Provide other assembly rr and rm forms to address this explicitly.

// XMM only
def : InstAlias<"vcvttpd2dqx\t{$src, $dst|$dst, $src}",
                (VCVTTPD2DQrr VR128:$dst, VR128:$src), 0>;
def VCVTTPD2DQXrm : VPDI<0xE6, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
                         "cvttpd2dqx\t{$src, $dst|$dst, $src}",
                         [(set VR128:$dst, (int_x86_sse2_cvttpd2dq
                                            (loadv2f64 addr:$src)))],
                         IIC_SSE_CVT_PD_RM>, VEX, Sched<[WriteCvtF2ILd]>;

// YMM only
def VCVTTPD2DQYrr : VPDI<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR256:$src),
                         "cvttpd2dq{y}\t{$src, $dst|$dst, $src}",
                         [(set VR128:$dst,
                           (int_x86_avx_cvtt_pd2dq_256 VR256:$src))],
                         IIC_SSE_CVT_PD_RR>, VEX, VEX_L, Sched<[WriteCvtF2I]>;
def VCVTTPD2DQYrm : VPDI<0xE6, MRMSrcMem, (outs VR128:$dst), (ins f256mem:$src),
                         "cvttpd2dq{y}\t{$src, $dst|$dst, $src}",
                         [(set VR128:$dst,
                          (int_x86_avx_cvtt_pd2dq_256 (loadv4f64 addr:$src)))],
                         IIC_SSE_CVT_PD_RM>, VEX, VEX_L, Sched<[WriteCvtF2ILd]>;
def : InstAlias<"vcvttpd2dq\t{$src, $dst|$dst, $src}",
                (VCVTTPD2DQYrr VR128:$dst, VR256:$src), 0>;

let Predicates = [HasAVX, NoVLX] in {
  def : Pat<(v4i32 (fp_to_sint (v4f64 VR256:$src))),
            (VCVTTPD2DQYrr VR256:$src)>;
  def : Pat<(v4i32 (fp_to_sint (loadv4f64 addr:$src))),
            (VCVTTPD2DQYrm addr:$src)>;
} // Predicates = [HasAVX, NoVLX]

// Legacy SSE2 forms.
def CVTTPD2DQrr : PDI<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
                      "cvttpd2dq\t{$src, $dst|$dst, $src}",
                      [(set VR128:$dst, (int_x86_sse2_cvttpd2dq VR128:$src))],
                      IIC_SSE_CVT_PD_RR>, Sched<[WriteCvtF2I]>;
def CVTTPD2DQrm : PDI<0xE6, MRMSrcMem, (outs VR128:$dst),(ins f128mem:$src),
                      "cvttpd2dq\t{$src, $dst|$dst, $src}",
                      [(set VR128:$dst, (int_x86_sse2_cvttpd2dq
                                        (memopv2f64 addr:$src)))],
                                        IIC_SSE_CVT_PD_RM>,
                      Sched<[WriteCvtF2ILd]>;
2162
// Convert packed single to packed double
// SSE2 instructions without OpSize prefix (PS map).
let Predicates = [HasAVX] in {
def VCVTPS2PDrr : I<0x5A, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
                     "vcvtps2pd\t{$src, $dst|$dst, $src}",
                     [(set VR128:$dst, (int_x86_sse2_cvtps2pd VR128:$src))],
                     IIC_SSE_CVT_PD_RR>, PS, VEX, Sched<[WriteCvtF2F]>;
// Memory form converts the low two floats (f64mem source).
def VCVTPS2PDrm : I<0x5A, MRMSrcMem, (outs VR128:$dst), (ins f64mem:$src),
                    "vcvtps2pd\t{$src, $dst|$dst, $src}",
                    [(set VR128:$dst, (v2f64 (extloadv2f32 addr:$src)))],
                    IIC_SSE_CVT_PD_RM>, PS, VEX, Sched<[WriteCvtF2FLd]>;
def VCVTPS2PDYrr : I<0x5A, MRMSrcReg, (outs VR256:$dst), (ins VR128:$src),
                     "vcvtps2pd\t{$src, $dst|$dst, $src}",
                     [(set VR256:$dst,
                       (int_x86_avx_cvt_ps2_pd_256 VR128:$src))],
                     IIC_SSE_CVT_PD_RR>, PS, VEX, VEX_L, Sched<[WriteCvtF2F]>;
def VCVTPS2PDYrm : I<0x5A, MRMSrcMem, (outs VR256:$dst), (ins f128mem:$src),
                     "vcvtps2pd\t{$src, $dst|$dst, $src}",
                     [(set VR256:$dst,
                       (int_x86_avx_cvt_ps2_pd_256 (loadv4f32 addr:$src)))],
                     IIC_SSE_CVT_PD_RM>, PS, VEX, VEX_L, Sched<[WriteCvtF2FLd]>;
}

let Predicates = [UseSSE2] in {
def CVTPS2PDrr : I<0x5A, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
                       "cvtps2pd\t{$src, $dst|$dst, $src}",
                       [(set VR128:$dst, (int_x86_sse2_cvtps2pd VR128:$src))],
                       IIC_SSE_CVT_PD_RR>, PS, Sched<[WriteCvtF2F]>;
def CVTPS2PDrm : I<0x5A, MRMSrcMem, (outs VR128:$dst), (ins f64mem:$src),
                   "cvtps2pd\t{$src, $dst|$dst, $src}",
                   [(set VR128:$dst, (v2f64 (extloadv2f32 addr:$src)))],
                   IIC_SSE_CVT_PD_RM>, PS, Sched<[WriteCvtF2FLd]>;
}
2196
// Convert Packed DW Integers to Packed Double FP
let Predicates = [HasAVX] in {
// The rm form has no pattern (selected via Pat<> records below); it only
// reads 64 bits of memory, hence i64mem and mayLoad without side effects.
let hasSideEffects = 0, mayLoad = 1 in
def VCVTDQ2PDrm  : S2SI<0xE6, MRMSrcMem, (outs VR128:$dst), (ins i64mem:$src),
                     "vcvtdq2pd\t{$src, $dst|$dst, $src}",
                     []>, VEX, Sched<[WriteCvtI2FLd]>;
def VCVTDQ2PDrr  : S2SI<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
                     "vcvtdq2pd\t{$src, $dst|$dst, $src}",
                     [(set VR128:$dst,
                       (int_x86_sse2_cvtdq2pd VR128:$src))]>, VEX,
                   Sched<[WriteCvtI2F]>;
def VCVTDQ2PDYrm  : S2SI<0xE6, MRMSrcMem, (outs VR256:$dst), (ins i128mem:$src),
                     "vcvtdq2pd\t{$src, $dst|$dst, $src}",
                     [(set VR256:$dst,
                       (int_x86_avx_cvtdq2_pd_256
                        (bitconvert (loadv2i64 addr:$src))))]>, VEX, VEX_L,
                    Sched<[WriteCvtI2FLd]>;
def VCVTDQ2PDYrr  : S2SI<0xE6, MRMSrcReg, (outs VR256:$dst), (ins VR128:$src),
                     "vcvtdq2pd\t{$src, $dst|$dst, $src}",
                     [(set VR256:$dst,
                       (int_x86_avx_cvtdq2_pd_256 VR128:$src))]>, VEX, VEX_L,
                    Sched<[WriteCvtI2F]>;
}
2220
// Legacy SSE2 cvtdq2pd.  The rm form has no pattern (selected via the
// Pat<> records below) and reads only 64 bits, hence i64mem.
// NOTE: the RR/RM itinerary classes were previously swapped here (the rm
// form carried IIC_SSE_CVT_PD_RR and vice versa); corrected below.
let hasSideEffects = 0, mayLoad = 1 in
def CVTDQ2PDrm  : S2SI<0xE6, MRMSrcMem, (outs VR128:$dst), (ins i64mem:$src),
                       "cvtdq2pd\t{$src, $dst|$dst, $src}", [],
                       IIC_SSE_CVT_PD_RM>, Sched<[WriteCvtI2FLd]>;
def CVTDQ2PDrr  : S2SI<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
                       "cvtdq2pd\t{$src, $dst|$dst, $src}",
                       [(set VR128:$dst, (int_x86_sse2_cvtdq2pd VR128:$src))],
                       IIC_SSE_CVT_PD_RR>, Sched<[WriteCvtI2F]>;
2229
// AVX register conversion intrinsics
let Predicates = [HasAVX] in {
  def : Pat<(v2f64 (X86cvtdq2pd (v4i32 VR128:$src))),
            (VCVTDQ2PDrr VR128:$src)>;
  def : Pat<(v2f64 (X86cvtdq2pd (bc_v4i32 (loadv2i64 addr:$src)))),
            (VCVTDQ2PDrm addr:$src)>;

  // 256-bit sint_to_fp from a 128-bit integer source.
  def : Pat<(v4f64 (sint_to_fp (v4i32 VR128:$src))),
            (VCVTDQ2PDYrr VR128:$src)>;
  def : Pat<(v4f64 (sint_to_fp (bc_v4i32 (loadv2i64 addr:$src)))),
            (VCVTDQ2PDYrm addr:$src)>;
} // Predicates = [HasAVX]

// SSE2 register conversion intrinsics
let Predicates = [HasSSE2] in {
  def : Pat<(v2f64 (X86cvtdq2pd (v4i32 VR128:$src))),
            (CVTDQ2PDrr VR128:$src)>;
  def : Pat<(v2f64 (X86cvtdq2pd (bc_v4i32 (loadv2i64 addr:$src)))),
            (CVTDQ2PDrm addr:$src)>;
} // Predicates = [HasSSE2]
2250
// Convert packed double to packed single
// The assembler can recognize rr 256-bit instructions by seeing a ymm
// register, but the same isn't true when using memory operands instead.
// Provide other assembly rr and rm forms to address this explicitly.
def VCVTPD2PSrr : VPDI<0x5A, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
                       "cvtpd2ps\t{$src, $dst|$dst, $src}",
                       [(set VR128:$dst, (int_x86_sse2_cvtpd2ps VR128:$src))],
                       IIC_SSE_CVT_PD_RR>, VEX, Sched<[WriteCvtF2F]>;

// XMM only
def : InstAlias<"vcvtpd2psx\t{$src, $dst|$dst, $src}",
                (VCVTPD2PSrr VR128:$dst, VR128:$src), 0>;
def VCVTPD2PSXrm : VPDI<0x5A, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
                        "cvtpd2psx\t{$src, $dst|$dst, $src}",
                        [(set VR128:$dst,
                          (int_x86_sse2_cvtpd2ps (loadv2f64 addr:$src)))],
                        IIC_SSE_CVT_PD_RM>, VEX, Sched<[WriteCvtF2FLd]>;

// YMM only
// 256-bit double source narrows into a 128-bit single-precision result.
def VCVTPD2PSYrr : VPDI<0x5A, MRMSrcReg, (outs VR128:$dst), (ins VR256:$src),
                        "cvtpd2ps{y}\t{$src, $dst|$dst, $src}",
                        [(set VR128:$dst,
                          (int_x86_avx_cvt_pd2_ps_256 VR256:$src))],
                        IIC_SSE_CVT_PD_RR>, VEX, VEX_L, Sched<[WriteCvtF2F]>;
def VCVTPD2PSYrm : VPDI<0x5A, MRMSrcMem, (outs VR128:$dst), (ins f256mem:$src),
                        "cvtpd2ps{y}\t{$src, $dst|$dst, $src}",
                        [(set VR128:$dst,
                          (int_x86_avx_cvt_pd2_ps_256 (loadv4f64 addr:$src)))],
                        IIC_SSE_CVT_PD_RM>, VEX, VEX_L, Sched<[WriteCvtF2FLd]>;
def : InstAlias<"vcvtpd2ps\t{$src, $dst|$dst, $src}",
                (VCVTPD2PSYrr VR128:$dst, VR256:$src), 0>;

// Legacy SSE2 forms.
def CVTPD2PSrr : PDI<0x5A, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
                     "cvtpd2ps\t{$src, $dst|$dst, $src}",
                     [(set VR128:$dst, (int_x86_sse2_cvtpd2ps VR128:$src))],
                     IIC_SSE_CVT_PD_RR>, Sched<[WriteCvtF2F]>;
def CVTPD2PSrm : PDI<0x5A, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
                     "cvtpd2ps\t{$src, $dst|$dst, $src}",
                     [(set VR128:$dst,
                       (int_x86_sse2_cvtpd2ps (memopv2f64 addr:$src)))],
                     IIC_SSE_CVT_PD_RM>, Sched<[WriteCvtF2FLd]>;
2292
2293
// AVX 256-bit register conversion intrinsics
// FIXME: Migrate SSE conversion intrinsics matching to use patterns as below
// whenever possible to avoid declaring two versions of each one.
let Predicates = [HasAVX] in {
  def : Pat<(int_x86_avx_cvtdq2_ps_256 VR256:$src),
            (VCVTDQ2PSYrr VR256:$src)>;
  def : Pat<(int_x86_avx_cvtdq2_ps_256 (bitconvert (loadv4i64 addr:$src))),
            (VCVTDQ2PSYrm addr:$src)>;
}

let Predicates = [HasAVX, NoVLX] in {
  // Match fround and fextend for 128/256-bit conversions
  def : Pat<(v4f32 (X86vfpround (v2f64 VR128:$src))),
            (VCVTPD2PSrr VR128:$src)>;
  def : Pat<(v4f32 (X86vfpround (loadv2f64 addr:$src))),
            (VCVTPD2PSXrm addr:$src)>;
  def : Pat<(v4f32 (fround (v4f64 VR256:$src))),
            (VCVTPD2PSYrr VR256:$src)>;
  def : Pat<(v4f32 (fround (loadv4f64 addr:$src))),
            (VCVTPD2PSYrm addr:$src)>;

  def : Pat<(v2f64 (X86vfpext (v4f32 VR128:$src))),
            (VCVTPS2PDrr VR128:$src)>;
  def : Pat<(v4f64 (fextend (v4f32 VR128:$src))),
            (VCVTPS2PDYrr VR128:$src)>;
  def : Pat<(v4f64 (extloadv4f32 addr:$src)),
            (VCVTPS2PDYrm addr:$src)>;
}

let Predicates = [UseSSE2] in {
  // Match fround and fextend for 128 conversions
  def : Pat<(v4f32 (X86vfpround (v2f64 VR128:$src))),
            (CVTPD2PSrr VR128:$src)>;
  def : Pat<(v4f32 (X86vfpround (memopv2f64 addr:$src))),
            (CVTPD2PSrm addr:$src)>;

  def : Pat<(v2f64 (X86vfpext (v4f32 VR128:$src))),
            (CVTPS2PDrr VR128:$src)>;
}
2333
2334//===----------------------------------------------------------------------===//
2335// SSE 1 & 2 - Compare Instructions
2336//===----------------------------------------------------------------------===//
2337
// sse12_cmp_scalar - sse 1 & 2 compare scalar instructions
// Instantiates rr/rm forms taking a condition-code operand, plus
// assembler-only *_alt forms that accept a raw immediate instead of a
// mnemonic condition code.
multiclass sse12_cmp_scalar<RegisterClass RC, X86MemOperand x86memop,
                            Operand CC, SDNode OpNode, ValueType VT,
                            PatFrag ld_frag, string asm, string asm_alt,
                            OpndItins itins, ImmLeaf immLeaf> {
  def rr : SIi8<0xC2, MRMSrcReg,
                (outs RC:$dst), (ins RC:$src1, RC:$src2, CC:$cc), asm,
                [(set RC:$dst, (OpNode (VT RC:$src1), RC:$src2, immLeaf:$cc))],
                itins.rr>, Sched<[itins.Sched]>;
  def rm : SIi8<0xC2, MRMSrcMem,
                (outs RC:$dst), (ins RC:$src1, x86memop:$src2, CC:$cc), asm,
                [(set RC:$dst, (OpNode (VT RC:$src1),
                                         (ld_frag addr:$src2), immLeaf:$cc))],
                                         itins.rm>,
           Sched<[itins.Sched.Folded, ReadAfterLd]>;

  // Accept explicit immediate argument form instead of comparison code.
  let isAsmParserOnly = 1, hasSideEffects = 0 in {
    def rr_alt : SIi8<0xC2, MRMSrcReg, (outs RC:$dst),
                      (ins RC:$src1, RC:$src2, u8imm:$cc), asm_alt, [],
                      IIC_SSE_ALU_F32S_RR>, Sched<[itins.Sched]>;
    let mayLoad = 1 in
    def rm_alt : SIi8<0xC2, MRMSrcMem, (outs RC:$dst),
                      (ins RC:$src1, x86memop:$src2, u8imm:$cc), asm_alt, [],
                      IIC_SSE_ALU_F32S_RM>,
                      Sched<[itins.Sched.Folded, ReadAfterLd]>;
  }
}
2366
// AVX forms allow 5-bit condition codes (i8immZExt5); the legacy SSE
// forms are two-operand (tied src1/dst) and only allow 3-bit codes.
defm VCMPSS : sse12_cmp_scalar<FR32, f32mem, AVXCC, X86cmps, f32, loadf32,
                 "cmp${cc}ss\t{$src2, $src1, $dst|$dst, $src1, $src2}",
                 "cmpss\t{$cc, $src2, $src1, $dst|$dst, $src1, $src2, $cc}",
                 SSE_ALU_F32S, i8immZExt5>, XS, VEX_4V, VEX_LIG;
defm VCMPSD : sse12_cmp_scalar<FR64, f64mem, AVXCC, X86cmps, f64, loadf64,
                 "cmp${cc}sd\t{$src2, $src1, $dst|$dst, $src1, $src2}",
                 "cmpsd\t{$cc, $src2, $src1, $dst|$dst, $src1, $src2, $cc}",
                 SSE_ALU_F32S, i8immZExt5>, // same latency as 32 bit compare
                 XD, VEX_4V, VEX_LIG;

let Constraints = "$src1 = $dst" in {
  defm CMPSS : sse12_cmp_scalar<FR32, f32mem, SSECC, X86cmps, f32, loadf32,
                  "cmp${cc}ss\t{$src2, $dst|$dst, $src2}",
                  "cmpss\t{$cc, $src2, $dst|$dst, $src2, $cc}", SSE_ALU_F32S,
                  i8immZExt3>, XS;
  defm CMPSD : sse12_cmp_scalar<FR64, f64mem, SSECC, X86cmps, f64, loadf64,
                  "cmp${cc}sd\t{$src2, $dst|$dst, $src2}",
                  "cmpsd\t{$cc, $src2, $dst|$dst, $src2, $cc}",
                  SSE_ALU_F64S, i8immZExt3>, XD;
}
2387
// sse12_cmp_scalar_int - XMM-operand (intrinsic) forms of the scalar
// compares, selected only for int_x86_sse_cmp_ss / int_x86_sse2_cmp_sd.
multiclass sse12_cmp_scalar_int<X86MemOperand x86memop, Operand CC,
                         Intrinsic Int, string asm, OpndItins itins,
                         ImmLeaf immLeaf> {
  def rr : SIi8<0xC2, MRMSrcReg, (outs VR128:$dst),
                      (ins VR128:$src1, VR128:$src, CC:$cc), asm,
                        [(set VR128:$dst, (Int VR128:$src1,
                                               VR128:$src, immLeaf:$cc))],
                                               itins.rr>,
           Sched<[itins.Sched]>;
  def rm : SIi8<0xC2, MRMSrcMem, (outs VR128:$dst),
                      (ins VR128:$src1, x86memop:$src, CC:$cc), asm,
                        [(set VR128:$dst, (Int VR128:$src1,
                                               (load addr:$src), immLeaf:$cc))],
                                               itins.rm>,
           Sched<[itins.Sched.Folded, ReadAfterLd]>;
}

let isCodeGenOnly = 1 in {
  // Aliases to match intrinsics which expect XMM operand(s).
  defm Int_VCMPSS  : sse12_cmp_scalar_int<f32mem, AVXCC, int_x86_sse_cmp_ss,
                       "cmp${cc}ss\t{$src, $src1, $dst|$dst, $src1, $src}",
                       SSE_ALU_F32S, i8immZExt5>,
                       XS, VEX_4V;
  defm Int_VCMPSD  : sse12_cmp_scalar_int<f64mem, AVXCC, int_x86_sse2_cmp_sd,
                       "cmp${cc}sd\t{$src, $src1, $dst|$dst, $src1, $src}",
                       SSE_ALU_F32S, i8immZExt5>, // same latency as f32
                       XD, VEX_4V;
  let Constraints = "$src1 = $dst" in {
    defm Int_CMPSS  : sse12_cmp_scalar_int<f32mem, SSECC, int_x86_sse_cmp_ss,
                         "cmp${cc}ss\t{$src, $dst|$dst, $src}",
                         SSE_ALU_F32S, i8immZExt3>, XS;
    defm Int_CMPSD  : sse12_cmp_scalar_int<f64mem, SSECC, int_x86_sse2_cmp_sd,
                         "cmp${cc}sd\t{$src, $dst|$dst, $src}",
                         SSE_ALU_F64S, i8immZExt3>,
                         XD;
}
}
2425
2426
// sse12_ord_cmp - Unordered/Ordered scalar fp compare and set EFLAGS
// Produces no register result; callers must wrap instantiations in
// "let Defs = [EFLAGS]".
multiclass sse12_ord_cmp<bits<8> opc, RegisterClass RC, SDNode OpNode,
                            ValueType vt, X86MemOperand x86memop,
                            PatFrag ld_frag, string OpcodeStr> {
  def rr: SI<opc, MRMSrcReg, (outs), (ins RC:$src1, RC:$src2),
                     !strconcat(OpcodeStr, "\t{$src2, $src1|$src1, $src2}"),
                     [(set EFLAGS, (OpNode (vt RC:$src1), RC:$src2))],
                     IIC_SSE_COMIS_RR>,
          Sched<[WriteFAdd]>;
  def rm: SI<opc, MRMSrcMem, (outs), (ins RC:$src1, x86memop:$src2),
                     !strconcat(OpcodeStr, "\t{$src2, $src1|$src1, $src2}"),
                     [(set EFLAGS, (OpNode (vt RC:$src1),
                                           (ld_frag addr:$src2)))],
                                           IIC_SSE_COMIS_RM>,
          Sched<[WriteFAddLd, ReadAfterLd]>;
}
2443
// (U)COMISS/(U)COMISD instantiations.  COMIS* variants have no ISel
// pattern (Pattern = []) since X86cmp maps onto the UCOMIS* forms; the
// Int_* variants match the XMM-operand X86ucomi/X86comi nodes.
let Defs = [EFLAGS] in {
  defm VUCOMISS : sse12_ord_cmp<0x2E, FR32, X86cmp, f32, f32mem, loadf32,
                                  "ucomiss">, PS, VEX, VEX_LIG;
  defm VUCOMISD : sse12_ord_cmp<0x2E, FR64, X86cmp, f64, f64mem, loadf64,
                                  "ucomisd">, PD, VEX, VEX_LIG;
  let Pattern = []<dag> in {
    defm VCOMISS  : sse12_ord_cmp<0x2F, FR32, undef, f32, f32mem, loadf32,
                                    "comiss">, PS, VEX, VEX_LIG;
    defm VCOMISD  : sse12_ord_cmp<0x2F, FR64, undef, f64, f64mem, loadf64,
                                    "comisd">, PD, VEX, VEX_LIG;
  }

  let isCodeGenOnly = 1 in {
    defm Int_VUCOMISS  : sse12_ord_cmp<0x2E, VR128, X86ucomi, v4f32, f128mem,
                              load, "ucomiss">, PS, VEX;
    defm Int_VUCOMISD  : sse12_ord_cmp<0x2E, VR128, X86ucomi, v2f64, f128mem,
                              load, "ucomisd">, PD, VEX;

    defm Int_VCOMISS  : sse12_ord_cmp<0x2F, VR128, X86comi, v4f32, f128mem,
                              load, "comiss">, PS, VEX;
    defm Int_VCOMISD  : sse12_ord_cmp<0x2F, VR128, X86comi, v2f64, f128mem,
                              load, "comisd">, PD, VEX;
  }
  defm UCOMISS  : sse12_ord_cmp<0x2E, FR32, X86cmp, f32, f32mem, loadf32,
                                  "ucomiss">, PS;
  defm UCOMISD  : sse12_ord_cmp<0x2E, FR64, X86cmp, f64, f64mem, loadf64,
                                  "ucomisd">, PD;

  let Pattern = []<dag> in {
    defm COMISS  : sse12_ord_cmp<0x2F, FR32, undef, f32, f32mem, loadf32,
                                    "comiss">, PS;
    defm COMISD  : sse12_ord_cmp<0x2F, FR64, undef, f64, f64mem, loadf64,
                                    "comisd">, PD;
  }

  let isCodeGenOnly = 1 in {
    defm Int_UCOMISS  : sse12_ord_cmp<0x2E, VR128, X86ucomi, v4f32, f128mem,
                                load, "ucomiss">, PS;
    defm Int_UCOMISD  : sse12_ord_cmp<0x2E, VR128, X86ucomi, v2f64, f128mem,
                                load, "ucomisd">, PD;

    defm Int_COMISS  : sse12_ord_cmp<0x2F, VR128, X86comi, v4f32, f128mem, load,
                                    "comiss">, PS;
    defm Int_COMISD  : sse12_ord_cmp<0x2F, VR128, X86comi, v2f64, f128mem, load,
                                    "comisd">, PD;
  }
} // Defs = [EFLAGS]
2491
// sse12_cmp_packed - sse 1 & 2 compare packed instructions
// Like sse12_cmp_scalar but for packed operands; the rri form is
// commutable (the intrinsic's CC can be swapped by the two-address pass).
multiclass sse12_cmp_packed<RegisterClass RC, X86MemOperand x86memop,
                            Operand CC, Intrinsic Int, string asm,
                            string asm_alt, Domain d, ImmLeaf immLeaf,
                            PatFrag ld_frag, OpndItins itins = SSE_ALU_F32P> {
  let isCommutable = 1 in
  def rri : PIi8<0xC2, MRMSrcReg,
             (outs RC:$dst), (ins RC:$src1, RC:$src2, CC:$cc), asm,
             [(set RC:$dst, (Int RC:$src1, RC:$src2, immLeaf:$cc))],
             itins.rr, d>,
            Sched<[WriteFAdd]>;
  def rmi : PIi8<0xC2, MRMSrcMem,
             (outs RC:$dst), (ins RC:$src1, x86memop:$src2, CC:$cc), asm,
             [(set RC:$dst, (Int RC:$src1, (ld_frag addr:$src2), immLeaf:$cc))],
             itins.rm, d>,
            Sched<[WriteFAddLd, ReadAfterLd]>;

  // Accept explicit immediate argument form instead of comparison code.
  let isAsmParserOnly = 1, hasSideEffects = 0 in {
    def rri_alt : PIi8<0xC2, MRMSrcReg,
               (outs RC:$dst), (ins RC:$src1, RC:$src2, u8imm:$cc),
               asm_alt, [], itins.rr, d>, Sched<[WriteFAdd]>;
    let mayLoad = 1 in
    def rmi_alt : PIi8<0xC2, MRMSrcMem,
               (outs RC:$dst), (ins RC:$src1, x86memop:$src2, u8imm:$cc),
               asm_alt, [], itins.rm, d>,
               Sched<[WriteFAddLd, ReadAfterLd]>;
  }
}
2521
// AVX packed compares: three-operand VEX forms; the Y variants are 256-bit.
// i8immZExt5 restricts the predicate immediate to the 5-bit range encodable
// in the cmp{ps,pd} immediate field.
defm VCMPPS : sse12_cmp_packed<VR128, f128mem, AVXCC, int_x86_sse_cmp_ps,
               "cmp${cc}ps\t{$src2, $src1, $dst|$dst, $src1, $src2}",
               "cmpps\t{$cc, $src2, $src1, $dst|$dst, $src1, $src2, $cc}",
               SSEPackedSingle, i8immZExt5, loadv4f32>, PS, VEX_4V;
defm VCMPPD : sse12_cmp_packed<VR128, f128mem, AVXCC, int_x86_sse2_cmp_pd,
               "cmp${cc}pd\t{$src2, $src1, $dst|$dst, $src1, $src2}",
               "cmppd\t{$cc, $src2, $src1, $dst|$dst, $src1, $src2, $cc}",
               SSEPackedDouble, i8immZExt5, loadv2f64>, PD, VEX_4V;
defm VCMPPSY : sse12_cmp_packed<VR256, f256mem, AVXCC, int_x86_avx_cmp_ps_256,
               "cmp${cc}ps\t{$src2, $src1, $dst|$dst, $src1, $src2}",
               "cmpps\t{$cc, $src2, $src1, $dst|$dst, $src1, $src2, $cc}",
               SSEPackedSingle, i8immZExt5, loadv8f32>, PS, VEX_4V, VEX_L;
defm VCMPPDY : sse12_cmp_packed<VR256, f256mem, AVXCC, int_x86_avx_cmp_pd_256,
               "cmp${cc}pd\t{$src2, $src1, $dst|$dst, $src1, $src2}",
               "cmppd\t{$cc, $src2, $src1, $dst|$dst, $src1, $src2, $cc}",
               SSEPackedDouble, i8immZExt5, loadv4f64>, PD, VEX_4V, VEX_L;
// Legacy SSE packed compares: two-operand, destructive ($src1 tied to $dst).
let Constraints = "$src1 = $dst" in {
  defm CMPPS : sse12_cmp_packed<VR128, f128mem, SSECC, int_x86_sse_cmp_ps,
                 "cmp${cc}ps\t{$src2, $dst|$dst, $src2}",
                 "cmpps\t{$cc, $src2, $dst|$dst, $src2, $cc}",
                 SSEPackedSingle, i8immZExt5, memopv4f32, SSE_ALU_F32P>, PS;
  defm CMPPD : sse12_cmp_packed<VR128, f128mem, SSECC, int_x86_sse2_cmp_pd,
                 "cmp${cc}pd\t{$src2, $dst|$dst, $src2}",
                 "cmppd\t{$cc, $src2, $dst|$dst, $src2, $cc}",
                 SSEPackedDouble, i8immZExt5, memopv2f64, SSE_ALU_F64P>, PD;
}
2548
// Select the generic X86cmpp DAG node (which produces an integer mask vector)
// to the CMPPS/CMPPD instructions defined above. AVX patterns fold loadv*
// loads; the SSE patterns below require the stricter memop* (aligned)
// fragments and their legacy predicates.
let Predicates = [HasAVX] in {
def : Pat<(v4i32 (X86cmpp (v4f32 VR128:$src1), VR128:$src2, imm:$cc)),
          (VCMPPSrri (v4f32 VR128:$src1), (v4f32 VR128:$src2), imm:$cc)>;
def : Pat<(v4i32 (X86cmpp (v4f32 VR128:$src1), (loadv4f32 addr:$src2), imm:$cc)),
          (VCMPPSrmi (v4f32 VR128:$src1), addr:$src2, imm:$cc)>;
def : Pat<(v2i64 (X86cmpp (v2f64 VR128:$src1), VR128:$src2, imm:$cc)),
          (VCMPPDrri VR128:$src1, VR128:$src2, imm:$cc)>;
def : Pat<(v2i64 (X86cmpp (v2f64 VR128:$src1), (loadv2f64 addr:$src2), imm:$cc)),
          (VCMPPDrmi VR128:$src1, addr:$src2, imm:$cc)>;

// 256-bit variants of the same mapping.
def : Pat<(v8i32 (X86cmpp (v8f32 VR256:$src1), VR256:$src2, imm:$cc)),
          (VCMPPSYrri (v8f32 VR256:$src1), (v8f32 VR256:$src2), imm:$cc)>;
def : Pat<(v8i32 (X86cmpp (v8f32 VR256:$src1), (loadv8f32 addr:$src2), imm:$cc)),
          (VCMPPSYrmi (v8f32 VR256:$src1), addr:$src2, imm:$cc)>;
def : Pat<(v4i64 (X86cmpp (v4f64 VR256:$src1), VR256:$src2, imm:$cc)),
          (VCMPPDYrri VR256:$src1, VR256:$src2, imm:$cc)>;
def : Pat<(v4i64 (X86cmpp (v4f64 VR256:$src1), (loadv4f64 addr:$src2), imm:$cc)),
          (VCMPPDYrmi VR256:$src1, addr:$src2, imm:$cc)>;
}

let Predicates = [UseSSE1] in {
def : Pat<(v4i32 (X86cmpp (v4f32 VR128:$src1), VR128:$src2, imm:$cc)),
          (CMPPSrri (v4f32 VR128:$src1), (v4f32 VR128:$src2), imm:$cc)>;
def : Pat<(v4i32 (X86cmpp (v4f32 VR128:$src1), (memopv4f32 addr:$src2), imm:$cc)),
          (CMPPSrmi (v4f32 VR128:$src1), addr:$src2, imm:$cc)>;
}

let Predicates = [UseSSE2] in {
def : Pat<(v2i64 (X86cmpp (v2f64 VR128:$src1), VR128:$src2, imm:$cc)),
          (CMPPDrri VR128:$src1, VR128:$src2, imm:$cc)>;
def : Pat<(v2i64 (X86cmpp (v2f64 VR128:$src1), (memopv2f64 addr:$src2), imm:$cc)),
          (CMPPDrmi VR128:$src1, addr:$src2, imm:$cc)>;
}
2582
2583//===----------------------------------------------------------------------===//
2584// SSE 1 & 2 - Shuffle Instructions
2585//===----------------------------------------------------------------------===//
2586
2587/// sse12_shuffle - sse 1 & 2 fp shuffle instructions
2588multiclass sse12_shuffle<RegisterClass RC, X86MemOperand x86memop,
2589                         ValueType vt, string asm, PatFrag mem_frag,
2590                         Domain d> {
2591  def rmi : PIi8<0xC6, MRMSrcMem, (outs RC:$dst),
2592                   (ins RC:$src1, x86memop:$src2, u8imm:$src3), asm,
2593                   [(set RC:$dst, (vt (X86Shufp RC:$src1, (mem_frag addr:$src2),
2594                                       (i8 imm:$src3))))], IIC_SSE_SHUFP, d>,
2595            Sched<[WriteFShuffleLd, ReadAfterLd]>;
2596  def rri : PIi8<0xC6, MRMSrcReg, (outs RC:$dst),
2597                 (ins RC:$src1, RC:$src2, u8imm:$src3), asm,
2598                 [(set RC:$dst, (vt (X86Shufp RC:$src1, RC:$src2,
2599                                     (i8 imm:$src3))))], IIC_SSE_SHUFP, d>,
2600            Sched<[WriteFShuffle]>;
2601}
2602
// AVX shuffles: three-operand VEX forms; guarded by NoVLX so the AVX-512VL
// versions take precedence when available.
let Predicates = [HasAVX, NoVLX] in {
  defm VSHUFPS  : sse12_shuffle<VR128, f128mem, v4f32,
           "shufps\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
           loadv4f32, SSEPackedSingle>, PS, VEX_4V;
  defm VSHUFPSY : sse12_shuffle<VR256, f256mem, v8f32,
           "shufps\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
           loadv8f32, SSEPackedSingle>, PS, VEX_4V, VEX_L;
  defm VSHUFPD  : sse12_shuffle<VR128, f128mem, v2f64,
           "shufpd\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
           loadv2f64, SSEPackedDouble>, PD, VEX_4V;
  defm VSHUFPDY : sse12_shuffle<VR256, f256mem, v4f64,
           "shufpd\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
           loadv4f64, SSEPackedDouble>, PD, VEX_4V, VEX_L;
}
// Legacy SSE shuffles: destructive two-operand forms.
let Constraints = "$src1 = $dst" in {
  defm SHUFPS : sse12_shuffle<VR128, f128mem, v4f32,
                    "shufps\t{$src3, $src2, $dst|$dst, $src2, $src3}",
                    memopv4f32, SSEPackedSingle>, PS;
  defm SHUFPD : sse12_shuffle<VR128, f128mem, v2f64,
                    "shufpd\t{$src3, $src2, $dst|$dst, $src2, $src3}",
                    memopv2f64, SSEPackedDouble>, PD;
}
2625
// Map X86Shufp on *integer* vector types onto the FP shuffle instructions;
// the bc_* bitconverts reinterpret loaded i64 vectors at the pattern's
// element width.
let Predicates = [HasAVX, NoVLX] in {
  def : Pat<(v4i32 (X86Shufp VR128:$src1,
                       (bc_v4i32 (loadv2i64 addr:$src2)), (i8 imm:$imm))),
            (VSHUFPSrmi VR128:$src1, addr:$src2, imm:$imm)>;
  def : Pat<(v4i32 (X86Shufp VR128:$src1, VR128:$src2, (i8 imm:$imm))),
            (VSHUFPSrri VR128:$src1, VR128:$src2, imm:$imm)>;

  def : Pat<(v2i64 (X86Shufp VR128:$src1,
                       (loadv2i64 addr:$src2), (i8 imm:$imm))),
            (VSHUFPDrmi VR128:$src1, addr:$src2, imm:$imm)>;
  def : Pat<(v2i64 (X86Shufp VR128:$src1, VR128:$src2, (i8 imm:$imm))),
            (VSHUFPDrri VR128:$src1, VR128:$src2, imm:$imm)>;

  // 256-bit patterns
  def : Pat<(v8i32 (X86Shufp VR256:$src1, VR256:$src2, (i8 imm:$imm))),
            (VSHUFPSYrri VR256:$src1, VR256:$src2, imm:$imm)>;
  def : Pat<(v8i32 (X86Shufp VR256:$src1,
                      (bc_v8i32 (loadv4i64 addr:$src2)), (i8 imm:$imm))),
            (VSHUFPSYrmi VR256:$src1, addr:$src2, imm:$imm)>;

  def : Pat<(v4i64 (X86Shufp VR256:$src1, VR256:$src2, (i8 imm:$imm))),
            (VSHUFPDYrri VR256:$src1, VR256:$src2, imm:$imm)>;
  def : Pat<(v4i64 (X86Shufp VR256:$src1,
                              (loadv4i64 addr:$src2), (i8 imm:$imm))),
            (VSHUFPDYrmi VR256:$src1, addr:$src2, imm:$imm)>;
}

// Same mapping for legacy SSE, with aligned memop fragments.
let Predicates = [UseSSE1] in {
  def : Pat<(v4i32 (X86Shufp VR128:$src1,
                       (bc_v4i32 (memopv2i64 addr:$src2)), (i8 imm:$imm))),
            (SHUFPSrmi VR128:$src1, addr:$src2, imm:$imm)>;
  def : Pat<(v4i32 (X86Shufp VR128:$src1, VR128:$src2, (i8 imm:$imm))),
            (SHUFPSrri VR128:$src1, VR128:$src2, imm:$imm)>;
}

let Predicates = [UseSSE2] in {
  // Generic SHUFPD patterns
  def : Pat<(v2i64 (X86Shufp VR128:$src1,
                       (memopv2i64 addr:$src2), (i8 imm:$imm))),
            (SHUFPDrmi VR128:$src1, addr:$src2, imm:$imm)>;
  def : Pat<(v2i64 (X86Shufp VR128:$src1, VR128:$src2, (i8 imm:$imm))),
            (SHUFPDrri VR128:$src1, VR128:$src2, imm:$imm)>;
}
2669
2670//===----------------------------------------------------------------------===//
2671// SSE 1 & 2 - Unpack FP Instructions
2672//===----------------------------------------------------------------------===//
2673
2674/// sse12_unpack_interleave - sse 1 & 2 fp unpack and interleave
2675multiclass sse12_unpack_interleave<bits<8> opc, SDNode OpNode, ValueType vt,
2676                                   PatFrag mem_frag, RegisterClass RC,
2677                                   X86MemOperand x86memop, string asm,
2678                                   Domain d> {
2679    def rr : PI<opc, MRMSrcReg,
2680                (outs RC:$dst), (ins RC:$src1, RC:$src2),
2681                asm, [(set RC:$dst,
2682                           (vt (OpNode RC:$src1, RC:$src2)))],
2683                           IIC_SSE_UNPCK, d>, Sched<[WriteFShuffle]>;
2684    def rm : PI<opc, MRMSrcMem,
2685                (outs RC:$dst), (ins RC:$src1, x86memop:$src2),
2686                asm, [(set RC:$dst,
2687                           (vt (OpNode RC:$src1,
2688                                       (mem_frag addr:$src2))))],
2689                                       IIC_SSE_UNPCK, d>,
2690             Sched<[WriteFShuffleLd, ReadAfterLd]>;
2691}
2692
// AVX unpack/interleave: 0x15 = unpckh*, 0x14 = unpckl*. Three-operand VEX
// forms; Y variants are 256-bit. Guarded by NoVLX in favor of AVX-512VL.
let Predicates = [HasAVX, NoVLX] in {
defm VUNPCKHPS: sse12_unpack_interleave<0x15, X86Unpckh, v4f32, loadv4f32,
      VR128, f128mem, "unpckhps\t{$src2, $src1, $dst|$dst, $src1, $src2}",
                     SSEPackedSingle>, PS, VEX_4V;
defm VUNPCKHPD: sse12_unpack_interleave<0x15, X86Unpckh, v2f64, loadv2f64,
      VR128, f128mem, "unpckhpd\t{$src2, $src1, $dst|$dst, $src1, $src2}",
                     SSEPackedDouble>, PD, VEX_4V;
defm VUNPCKLPS: sse12_unpack_interleave<0x14, X86Unpckl, v4f32, loadv4f32,
      VR128, f128mem, "unpcklps\t{$src2, $src1, $dst|$dst, $src1, $src2}",
                     SSEPackedSingle>, PS, VEX_4V;
defm VUNPCKLPD: sse12_unpack_interleave<0x14, X86Unpckl, v2f64, loadv2f64,
      VR128, f128mem, "unpcklpd\t{$src2, $src1, $dst|$dst, $src1, $src2}",
                     SSEPackedDouble>, PD, VEX_4V;

defm VUNPCKHPSY: sse12_unpack_interleave<0x15, X86Unpckh, v8f32, loadv8f32,
      VR256, f256mem, "unpckhps\t{$src2, $src1, $dst|$dst, $src1, $src2}",
                     SSEPackedSingle>, PS, VEX_4V, VEX_L;
defm VUNPCKHPDY: sse12_unpack_interleave<0x15, X86Unpckh, v4f64, loadv4f64,
      VR256, f256mem, "unpckhpd\t{$src2, $src1, $dst|$dst, $src1, $src2}",
                     SSEPackedDouble>, PD, VEX_4V, VEX_L;
defm VUNPCKLPSY: sse12_unpack_interleave<0x14, X86Unpckl, v8f32, loadv8f32,
      VR256, f256mem, "unpcklps\t{$src2, $src1, $dst|$dst, $src1, $src2}",
                     SSEPackedSingle>, PS, VEX_4V, VEX_L;
defm VUNPCKLPDY: sse12_unpack_interleave<0x14, X86Unpckl, v4f64, loadv4f64,
      VR256, f256mem, "unpcklpd\t{$src2, $src1, $dst|$dst, $src1, $src2}",
                     SSEPackedDouble>, PD, VEX_4V, VEX_L;
}// Predicates = [HasAVX, NoVLX]
// Legacy SSE unpacks: destructive two-operand forms with aligned memops.
let Constraints = "$src1 = $dst" in {
  defm UNPCKHPS: sse12_unpack_interleave<0x15, X86Unpckh, v4f32, memopv4f32,
        VR128, f128mem, "unpckhps\t{$src2, $dst|$dst, $src2}",
                       SSEPackedSingle>, PS;
  defm UNPCKHPD: sse12_unpack_interleave<0x15, X86Unpckh, v2f64, memopv2f64,
        VR128, f128mem, "unpckhpd\t{$src2, $dst|$dst, $src2}",
                       SSEPackedDouble>, PD;
  defm UNPCKLPS: sse12_unpack_interleave<0x14, X86Unpckl, v4f32, memopv4f32,
        VR128, f128mem, "unpcklps\t{$src2, $dst|$dst, $src2}",
                       SSEPackedSingle>, PS;
  defm UNPCKLPD: sse12_unpack_interleave<0x14, X86Unpckl, v2f64, memopv2f64,
        VR128, f128mem, "unpcklpd\t{$src2, $dst|$dst, $src2}",
                       SSEPackedDouble>, PD;
} // Constraints = "$src1 = $dst"
2734
// On AVX1 (no AVX2) there are no 256-bit integer unpack instructions, so
// select 256-bit integer unpacks onto the FP unpack forms instead.
let Predicates = [HasAVX1Only] in {
  def : Pat<(v8i32 (X86Unpckl VR256:$src1, (bc_v8i32 (loadv4i64 addr:$src2)))),
            (VUNPCKLPSYrm VR256:$src1, addr:$src2)>;
  def : Pat<(v8i32 (X86Unpckl VR256:$src1, VR256:$src2)),
            (VUNPCKLPSYrr VR256:$src1, VR256:$src2)>;
  def : Pat<(v8i32 (X86Unpckh VR256:$src1, (bc_v8i32 (loadv4i64 addr:$src2)))),
            (VUNPCKHPSYrm VR256:$src1, addr:$src2)>;
  def : Pat<(v8i32 (X86Unpckh VR256:$src1, VR256:$src2)),
            (VUNPCKHPSYrr VR256:$src1, VR256:$src2)>;

  def : Pat<(v4i64 (X86Unpckl VR256:$src1, (loadv4i64 addr:$src2))),
            (VUNPCKLPDYrm VR256:$src1, addr:$src2)>;
  def : Pat<(v4i64 (X86Unpckl VR256:$src1, VR256:$src2)),
            (VUNPCKLPDYrr VR256:$src1, VR256:$src2)>;
  def : Pat<(v4i64 (X86Unpckh VR256:$src1, (loadv4i64 addr:$src2))),
            (VUNPCKHPDYrm VR256:$src1, addr:$src2)>;
  def : Pat<(v4i64 (X86Unpckh VR256:$src1, VR256:$src2)),
            (VUNPCKHPDYrr VR256:$src1, VR256:$src2)>;
}
2754
2755//===----------------------------------------------------------------------===//
2756// SSE 1 & 2 - Extract Floating-Point Sign mask
2757//===----------------------------------------------------------------------===//
2758
2759/// sse12_extr_sign_mask - sse 1 & 2 unpack and interleave
2760multiclass sse12_extr_sign_mask<RegisterClass RC, Intrinsic Int, string asm,
2761                                Domain d> {
2762  def rr : PI<0x50, MRMSrcReg, (outs GR32orGR64:$dst), (ins RC:$src),
2763              !strconcat(asm, "\t{$src, $dst|$dst, $src}"),
2764              [(set GR32orGR64:$dst, (Int RC:$src))], IIC_SSE_MOVMSK, d>,
2765              Sched<[WriteVecLogic]>;
2766}
2767
// AVX movmsk forms plus patterns selecting X86fgetsign (extract sign bit of
// a scalar FP value) through movmskps/movmskpd. The 64-bit results are built
// by zero-extending the 32-bit movmsk output via SUBREG_TO_REG.
let Predicates = [HasAVX] in {
  defm VMOVMSKPS : sse12_extr_sign_mask<VR128, int_x86_sse_movmsk_ps,
                                        "movmskps", SSEPackedSingle>, PS, VEX;
  defm VMOVMSKPD : sse12_extr_sign_mask<VR128, int_x86_sse2_movmsk_pd,
                                        "movmskpd", SSEPackedDouble>, PD, VEX;
  defm VMOVMSKPSY : sse12_extr_sign_mask<VR256, int_x86_avx_movmsk_ps_256,
                                        "movmskps", SSEPackedSingle>, PS,
                                        VEX, VEX_L;
  defm VMOVMSKPDY : sse12_extr_sign_mask<VR256, int_x86_avx_movmsk_pd_256,
                                        "movmskpd", SSEPackedDouble>, PD,
                                        VEX, VEX_L;

  def : Pat<(i32 (X86fgetsign FR32:$src)),
            (VMOVMSKPSrr (COPY_TO_REGCLASS FR32:$src, VR128))>;
  def : Pat<(i64 (X86fgetsign FR32:$src)),
            (SUBREG_TO_REG (i64 0),
             (VMOVMSKPSrr (COPY_TO_REGCLASS FR32:$src, VR128)), sub_32bit)>;
  def : Pat<(i32 (X86fgetsign FR64:$src)),
            (VMOVMSKPDrr (COPY_TO_REGCLASS FR64:$src, VR128))>;
  def : Pat<(i64 (X86fgetsign FR64:$src)),
            (SUBREG_TO_REG (i64 0),
             (VMOVMSKPDrr (COPY_TO_REGCLASS FR64:$src, VR128)), sub_32bit)>;
}

// Legacy SSE movmsk forms and the corresponding X86fgetsign patterns.
defm MOVMSKPS : sse12_extr_sign_mask<VR128, int_x86_sse_movmsk_ps, "movmskps",
                                     SSEPackedSingle>, PS;
defm MOVMSKPD : sse12_extr_sign_mask<VR128, int_x86_sse2_movmsk_pd, "movmskpd",
                                     SSEPackedDouble>, PD;

def : Pat<(i32 (X86fgetsign FR32:$src)),
          (MOVMSKPSrr (COPY_TO_REGCLASS FR32:$src, VR128))>,
      Requires<[UseSSE1]>;
def : Pat<(i64 (X86fgetsign FR32:$src)),
          (SUBREG_TO_REG (i64 0),
           (MOVMSKPSrr (COPY_TO_REGCLASS FR32:$src, VR128)), sub_32bit)>,
      Requires<[UseSSE1]>;
def : Pat<(i32 (X86fgetsign FR64:$src)),
          (MOVMSKPDrr (COPY_TO_REGCLASS FR64:$src, VR128))>,
      Requires<[UseSSE2]>;
def : Pat<(i64 (X86fgetsign FR64:$src)),
          (SUBREG_TO_REG (i64 0),
           (MOVMSKPDrr (COPY_TO_REGCLASS FR64:$src, VR128)), sub_32bit)>,
      Requires<[UseSSE2]>;
2811
2812//===---------------------------------------------------------------------===//
2813// SSE2 - Packed Integer Logical Instructions
2814//===---------------------------------------------------------------------===//
2815
2816let ExeDomain = SSEPackedInt in { // SSE integer instructions
2817
2818/// PDI_binop_rm - Simple SSE2 binary operator.
2819multiclass PDI_binop_rm<bits<8> opc, string OpcodeStr, SDNode OpNode,
2820                        ValueType OpVT, RegisterClass RC, PatFrag memop_frag,
2821                        X86MemOperand x86memop, OpndItins itins,
2822                        bit IsCommutable, bit Is2Addr> {
2823  let isCommutable = IsCommutable in
2824  def rr : PDI<opc, MRMSrcReg, (outs RC:$dst),
2825       (ins RC:$src1, RC:$src2),
2826       !if(Is2Addr,
2827           !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
2828           !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
2829       [(set RC:$dst, (OpVT (OpNode RC:$src1, RC:$src2)))], itins.rr>,
2830       Sched<[itins.Sched]>;
2831  def rm : PDI<opc, MRMSrcMem, (outs RC:$dst),
2832       (ins RC:$src1, x86memop:$src2),
2833       !if(Is2Addr,
2834           !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
2835           !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
2836       [(set RC:$dst, (OpVT (OpNode RC:$src1,
2837                                     (bitconvert (memop_frag addr:$src2)))))],
2838                                     itins.rm>,
2839       Sched<[itins.Sched.Folded, ReadAfterLd]>;
2840}
2841} // ExeDomain = SSEPackedInt
2842
/// PDI_binop_all - Instantiates a packed-integer binop in all three encodings:
/// the VEX 128-bit form (V prefix), the legacy SSE 128-bit form, and the
/// VEX 256-bit form (V prefix + Y suffix, AVX2). `prd` further gates the VEX
/// forms (e.g. NoVLX so EVEX versions win when available).
multiclass PDI_binop_all<bits<8> opc, string OpcodeStr, SDNode Opcode,
                         ValueType OpVT128, ValueType OpVT256,
                         OpndItins itins, bit IsCommutable = 0, Predicate prd> {
let Predicates = [HasAVX, prd] in
  defm V#NAME : PDI_binop_rm<opc, !strconcat("v", OpcodeStr), Opcode, OpVT128,
                    VR128, loadv2i64, i128mem, itins, IsCommutable, 0>, VEX_4V;

let Constraints = "$src1 = $dst" in
  defm NAME : PDI_binop_rm<opc, OpcodeStr, Opcode, OpVT128, VR128,
                           memopv2i64, i128mem, itins, IsCommutable, 1>;

let Predicates = [HasAVX2, prd] in
  defm V#NAME#Y : PDI_binop_rm<opc, !strconcat("v", OpcodeStr), Opcode,
                               OpVT256, VR256, loadv4i64, i256mem, itins,
                               IsCommutable, 0>, VEX_4V, VEX_L;
}

// These are ordered here for pattern ordering requirements with the fp versions

// Packed integer logicals. PANDN is not commutable (a & ~b != b & ~a).
defm PAND  : PDI_binop_all<0xDB, "pand", and, v2i64, v4i64,
                           SSE_VEC_BIT_ITINS_P, 1, NoVLX>;
defm POR   : PDI_binop_all<0xEB, "por", or, v2i64, v4i64,
                           SSE_VEC_BIT_ITINS_P, 1, NoVLX>;
defm PXOR  : PDI_binop_all<0xEF, "pxor", xor, v2i64, v4i64,
                           SSE_VEC_BIT_ITINS_P, 1, NoVLX>;
defm PANDN : PDI_binop_all<0xDF, "pandn", X86andnp, v2i64, v4i64,
                           SSE_VEC_BIT_ITINS_P, 0, NoVLX>;
2870
2871//===----------------------------------------------------------------------===//
2872// SSE 1 & 2 - Logical Instructions
2873//===----------------------------------------------------------------------===//
2874
2875// Multiclass for scalars using the X86 logical operation aliases for FP.
2876multiclass sse12_fp_packed_scalar_logical_alias<
2877    bits<8> opc, string OpcodeStr, SDNode OpNode, OpndItins itins> {
2878  defm V#NAME#PS : sse12_fp_packed<opc, !strconcat(OpcodeStr, "ps"), OpNode,
2879                FR32, f32, f128mem, loadf32_128, SSEPackedSingle, itins, 0>,
2880                PS, VEX_4V;
2881
2882  defm V#NAME#PD : sse12_fp_packed<opc, !strconcat(OpcodeStr, "pd"), OpNode,
2883                FR64, f64, f128mem, loadf64_128, SSEPackedDouble, itins, 0>,
2884                PD, VEX_4V;
2885
2886  let Constraints = "$src1 = $dst" in {
2887    defm PS : sse12_fp_packed<opc, !strconcat(OpcodeStr, "ps"), OpNode, FR32,
2888                f32, f128mem, memopfsf32_128, SSEPackedSingle, itins>, PS;
2889
2890    defm PD : sse12_fp_packed<opc, !strconcat(OpcodeStr, "pd"), OpNode, FR64,
2891                f64, f128mem, memopfsf64_128, SSEPackedDouble, itins>, PD;
2892  }
2893}
2894
2895let isCodeGenOnly = 1 in {
2896  defm FsAND  : sse12_fp_packed_scalar_logical_alias<0x54, "and", X86fand,
2897                SSE_BIT_ITINS_P>;
2898  defm FsOR   : sse12_fp_packed_scalar_logical_alias<0x56, "or", X86for,
2899                SSE_BIT_ITINS_P>;
2900  defm FsXOR  : sse12_fp_packed_scalar_logical_alias<0x57, "xor", X86fxor,
2901                SSE_BIT_ITINS_P>;
2902
2903  let isCommutable = 0 in
2904    defm FsANDN : sse12_fp_packed_scalar_logical_alias<0x55, "andn", X86fandn,
2905                  SSE_BIT_ITINS_P>;
2906}
2907
// Multiclass for vectors using the X86 logical operation aliases for FP.
// Instantiates the FP-vector forms of X86fand/X86for/X86fxor/X86fandn for
// 128-bit (VR128) and, under AVX, 256-bit (VR256) register classes.
multiclass sse12_fp_packed_vector_logical_alias<
    bits<8> opc, string OpcodeStr, SDNode OpNode, OpndItins itins> {
  let Predicates = [HasAVX, NoVLX_Or_NoDQI] in {
  defm V#NAME#PS : sse12_fp_packed<opc, !strconcat(OpcodeStr, "ps"), OpNode,
              VR128, v4f32, f128mem, loadv4f32, SSEPackedSingle, itins, 0>,
              PS, VEX_4V;

  defm V#NAME#PD : sse12_fp_packed<opc, !strconcat(OpcodeStr, "pd"), OpNode,
        VR128, v2f64, f128mem, loadv2f64, SSEPackedDouble, itins, 0>,
        PD, VEX_4V;

  defm V#NAME#PSY : sse12_fp_packed<opc, !strconcat(OpcodeStr, "ps"), OpNode,
        VR256, v8f32, f256mem, loadv8f32, SSEPackedSingle, itins, 0>,
        PS, VEX_4V, VEX_L;

  defm V#NAME#PDY : sse12_fp_packed<opc, !strconcat(OpcodeStr, "pd"), OpNode,
        VR256, v4f64, f256mem, loadv4f64, SSEPackedDouble, itins, 0>,
        PD, VEX_4V, VEX_L;
  }

  // Legacy SSE forms: destructive two-operand, aligned memops.
  let Constraints = "$src1 = $dst" in {
    defm PS : sse12_fp_packed<opc, !strconcat(OpcodeStr, "ps"), OpNode, VR128,
                v4f32, f128mem, memopv4f32, SSEPackedSingle, itins>,
                PS;

    defm PD : sse12_fp_packed<opc, !strconcat(OpcodeStr, "pd"), OpNode, VR128,
                v2f64, f128mem, memopv2f64, SSEPackedDouble, itins>,
                PD;
  }
}

// Vector FP logical pseudo-aliases (codegen-only; never asm-parsed).
let isCodeGenOnly = 1 in {
  defm FvAND  : sse12_fp_packed_vector_logical_alias<0x54, "and", X86fand,
                SSE_BIT_ITINS_P>;
  defm FvOR   : sse12_fp_packed_vector_logical_alias<0x56, "or", X86for,
                SSE_BIT_ITINS_P>;
  defm FvXOR  : sse12_fp_packed_vector_logical_alias<0x57, "xor", X86fxor,
                SSE_BIT_ITINS_P>;

  // andn is not commutable.
  let isCommutable = 0 in
    defm FvANDN : sse12_fp_packed_vector_logical_alias<0x55, "andn", X86fandn,
                  SSE_BIT_ITINS_P>;
}
2952
/// sse12_fp_packed_logical - SSE 1 & 2 packed FP logical ops
///
/// Instantiates ANDPS/ANDPD-style logical ops over the integer-typed
/// patterns (operands bitconverted to v2i64/v4i64) for VEX 128/256-bit and
/// legacy SSE 128-bit forms. Each sse12_fp_packed_logical_rm call supplies a
/// register-register pattern list and a register-memory pattern list.
multiclass sse12_fp_packed_logical<bits<8> opc, string OpcodeStr,
                                   SDNode OpNode> {
  let Predicates = [HasAVX, NoVLX] in {
  defm V#NAME#PSY : sse12_fp_packed_logical_rm<opc, VR256, SSEPackedSingle,
        !strconcat(OpcodeStr, "ps"), f256mem,
        [(set VR256:$dst, (v4i64 (OpNode VR256:$src1, VR256:$src2)))],
        [(set VR256:$dst, (OpNode (bc_v4i64 (v8f32 VR256:$src1)),
                           (loadv4i64 addr:$src2)))], 0>, PS, VEX_4V, VEX_L;

  defm V#NAME#PDY : sse12_fp_packed_logical_rm<opc, VR256, SSEPackedDouble,
        !strconcat(OpcodeStr, "pd"), f256mem,
        [(set VR256:$dst, (OpNode (bc_v4i64 (v4f64 VR256:$src1)),
                                  (bc_v4i64 (v4f64 VR256:$src2))))],
        [(set VR256:$dst, (OpNode (bc_v4i64 (v4f64 VR256:$src1)),
                                  (loadv4i64 addr:$src2)))], 0>,
                                  PD, VEX_4V, VEX_L;

  // In AVX no need to add a pattern for 128-bit logical rr ps, because they
  // are all promoted to v2i64, and the patterns are covered by the int
  // version. This is needed in SSE only, because v2i64 isn't supported on
  // SSE1, but only on SSE2.
  defm V#NAME#PS : sse12_fp_packed_logical_rm<opc, VR128, SSEPackedSingle,
       !strconcat(OpcodeStr, "ps"), f128mem, [],
       [(set VR128:$dst, (OpNode (bc_v2i64 (v4f32 VR128:$src1)),
                                 (loadv2i64 addr:$src2)))], 0>, PS, VEX_4V;

  defm V#NAME#PD : sse12_fp_packed_logical_rm<opc, VR128, SSEPackedDouble,
       !strconcat(OpcodeStr, "pd"), f128mem,
       [(set VR128:$dst, (OpNode (bc_v2i64 (v2f64 VR128:$src1)),
                                 (bc_v2i64 (v2f64 VR128:$src2))))],
       [(set VR128:$dst, (OpNode (bc_v2i64 (v2f64 VR128:$src1)),
                                 (loadv2i64 addr:$src2)))], 0>,
                                                 PD, VEX_4V;
  }

  // Legacy SSE forms: destructive two-operand, aligned memops.
  let Constraints = "$src1 = $dst" in {
    defm PS : sse12_fp_packed_logical_rm<opc, VR128, SSEPackedSingle,
         !strconcat(OpcodeStr, "ps"), f128mem,
         [(set VR128:$dst, (v2i64 (OpNode VR128:$src1, VR128:$src2)))],
         [(set VR128:$dst, (OpNode (bc_v2i64 (v4f32 VR128:$src1)),
                                   (memopv2i64 addr:$src2)))]>, PS;

    defm PD : sse12_fp_packed_logical_rm<opc, VR128, SSEPackedDouble,
         !strconcat(OpcodeStr, "pd"), f128mem,
         [(set VR128:$dst, (OpNode (bc_v2i64 (v2f64 VR128:$src1)),
                                   (bc_v2i64 (v2f64 VR128:$src2))))],
         [(set VR128:$dst, (OpNode (bc_v2i64 (v2f64 VR128:$src1)),
                                   (memopv2i64 addr:$src2)))]>, PD;
  }
}

// Packed FP logical instructions; ANDN is not commutable.
defm AND  : sse12_fp_packed_logical<0x54, "and", and>;
defm OR   : sse12_fp_packed_logical<0x56, "or", or>;
defm XOR  : sse12_fp_packed_logical<0x57, "xor", xor>;
let isCommutable = 0 in
  defm ANDN : sse12_fp_packed_logical<0x55, "andn", X86andnp>;

// AVX1 requires type coercions in order to fold loads directly into logical
// operations.
let Predicates = [HasAVX1Only] in {
  def : Pat<(bc_v8f32 (and VR256:$src1, (loadv4i64 addr:$src2))),
            (VANDPSYrm VR256:$src1, addr:$src2)>;
  def : Pat<(bc_v8f32 (or VR256:$src1, (loadv4i64 addr:$src2))),
            (VORPSYrm VR256:$src1, addr:$src2)>;
  def : Pat<(bc_v8f32 (xor VR256:$src1, (loadv4i64 addr:$src2))),
            (VXORPSYrm VR256:$src1, addr:$src2)>;
  def : Pat<(bc_v8f32 (X86andnp VR256:$src1, (loadv4i64 addr:$src2))),
            (VANDNPSYrm VR256:$src1, addr:$src2)>;
}
3024
3025//===----------------------------------------------------------------------===//
3026// SSE 1 & 2 - Arithmetic Instructions
3027//===----------------------------------------------------------------------===//
3028
3029/// basic_sse12_fp_binop_xxx - SSE 1 & 2 binops come in both scalar and
3030/// vector forms.
3031///
3032/// In addition, we also have a special variant of the scalar form here to
3033/// represent the associated intrinsic operation.  This form is unlike the
3034/// plain scalar form, in that it takes an entire vector (instead of a scalar)
3035/// and leaves the top elements unmodified (therefore these cannot be commuted).
3036///
3037/// These three forms can each be reg+reg or reg+mem.
3038///
3039
3040/// FIXME: once all 256-bit intrinsics are matched, cleanup and refactor those
3041/// classes below
3042multiclass basic_sse12_fp_binop_p<bits<8> opc, string OpcodeStr,
3043                                  SDNode OpNode, SizeItins itins> {
3044  let Predicates = [HasAVX, NoVLX] in {
3045  defm V#NAME#PS : sse12_fp_packed<opc, !strconcat(OpcodeStr, "ps"), OpNode,
3046                               VR128, v4f32, f128mem, loadv4f32,
3047                               SSEPackedSingle, itins.s, 0>, PS, VEX_4V;
3048  defm V#NAME#PD : sse12_fp_packed<opc, !strconcat(OpcodeStr, "pd"), OpNode,
3049                               VR128, v2f64, f128mem, loadv2f64,
3050                               SSEPackedDouble, itins.d, 0>, PD, VEX_4V;
3051
3052  defm V#NAME#PSY : sse12_fp_packed<opc, !strconcat(OpcodeStr, "ps"),
3053                        OpNode, VR256, v8f32, f256mem, loadv8f32,
3054                        SSEPackedSingle, itins.s, 0>, PS, VEX_4V, VEX_L;
3055  defm V#NAME#PDY : sse12_fp_packed<opc, !strconcat(OpcodeStr, "pd"),
3056                        OpNode, VR256, v4f64, f256mem, loadv4f64,
3057                        SSEPackedDouble, itins.d, 0>, PD, VEX_4V, VEX_L;
3058  }
3059
3060  let Constraints = "$src1 = $dst" in {
3061    defm PS : sse12_fp_packed<opc, !strconcat(OpcodeStr, "ps"), OpNode, VR128,
3062                              v4f32, f128mem, memopv4f32, SSEPackedSingle,
3063                              itins.s>, PS;
3064    defm PD : sse12_fp_packed<opc, !strconcat(OpcodeStr, "pd"), OpNode, VR128,
3065                              v2f64, f128mem, memopv2f64, SSEPackedDouble,
3066                              itins.d>, PD;
3067  }
3068}
3069
/// basic_sse12_fp_binop_s - scalar forms (addss/addsd etc.): VEX
/// three-operand plus legacy SSE two-operand, operating on FR32/FR64.
multiclass basic_sse12_fp_binop_s<bits<8> opc, string OpcodeStr, SDNode OpNode,
                                  SizeItins itins> {
  defm V#NAME#SS : sse12_fp_scalar<opc, !strconcat(OpcodeStr, "ss"),
                         OpNode, FR32, f32mem, SSEPackedSingle, itins.s, 0>,
                         XS, VEX_4V, VEX_LIG;
  defm V#NAME#SD : sse12_fp_scalar<opc, !strconcat(OpcodeStr, "sd"),
                         OpNode, FR64, f64mem, SSEPackedDouble, itins.d, 0>,
                         XD, VEX_4V, VEX_LIG;

  // Legacy SSE forms: destructive two-operand.
  let Constraints = "$src1 = $dst" in {
    defm SS : sse12_fp_scalar<opc, !strconcat(OpcodeStr, "ss"),
                              OpNode, FR32, f32mem, SSEPackedSingle,
                              itins.s>, XS;
    defm SD : sse12_fp_scalar<opc, !strconcat(OpcodeStr, "sd"),
                              OpNode, FR64, f64mem, SSEPackedDouble,
                              itins.d>, XD;
  }
}
3088
/// basic_sse12_fp_binop_s_int - intrinsic scalar forms: operate on the whole
/// VR128 vector, leaving the upper elements untouched (so not commutable).
/// The "" / "2" strings select the sse vs. sse2 intrinsic namespace.
multiclass basic_sse12_fp_binop_s_int<bits<8> opc, string OpcodeStr,
                                      SizeItins itins> {
  defm V#NAME#SS : sse12_fp_scalar_int<opc, OpcodeStr, VR128,
                   !strconcat(OpcodeStr, "ss"), "", "_ss", ssmem, sse_load_f32,
                   SSEPackedSingle, itins.s, 0>, XS, VEX_4V, VEX_LIG;
  defm V#NAME#SD : sse12_fp_scalar_int<opc, OpcodeStr, VR128,
                   !strconcat(OpcodeStr, "sd"), "2", "_sd", sdmem, sse_load_f64,
                   SSEPackedDouble, itins.d, 0>, XD, VEX_4V, VEX_LIG;

  // Legacy SSE forms: destructive two-operand.
  let Constraints = "$src1 = $dst" in {
    defm SS : sse12_fp_scalar_int<opc, OpcodeStr, VR128,
                   !strconcat(OpcodeStr, "ss"), "", "_ss", ssmem, sse_load_f32,
                   SSEPackedSingle, itins.s>, XS;
    defm SD : sse12_fp_scalar_int<opc, OpcodeStr, VR128,
                   !strconcat(OpcodeStr, "sd"), "2", "_sd", sdmem, sse_load_f64,
                   SSEPackedDouble, itins.d>, XD;
  }
}
3107
// Binary Arithmetic instructions
// Each defm expands the packed 128/256-bit forms (PS/PD), the scalar forms
// (SS/SD), and the scalar-intrinsic forms for one opcode, in both legacy SSE
// and AVX (V-prefixed) encodings.
defm ADD : basic_sse12_fp_binop_p<0x58, "add", fadd, SSE_ALU_ITINS_P>,
           basic_sse12_fp_binop_s<0x58, "add", fadd, SSE_ALU_ITINS_S>,
           basic_sse12_fp_binop_s_int<0x58, "add", SSE_ALU_ITINS_S>;
defm MUL : basic_sse12_fp_binop_p<0x59, "mul", fmul, SSE_MUL_ITINS_P>,
           basic_sse12_fp_binop_s<0x59, "mul", fmul, SSE_MUL_ITINS_S>,
           basic_sse12_fp_binop_s_int<0x59, "mul", SSE_MUL_ITINS_S>;
// SUB/DIV are inherently non-commutable.  MAX/MIN are also kept
// non-commutable -- presumably because x86 max/min are order-sensitive for
// NaN and signed-zero operands; see the commutable X86fmaxc/X86fminc
// variants below for the relaxed case.
let isCommutable = 0 in {
  defm SUB : basic_sse12_fp_binop_p<0x5C, "sub", fsub, SSE_ALU_ITINS_P>,
             basic_sse12_fp_binop_s<0x5C, "sub", fsub, SSE_ALU_ITINS_S>,
             basic_sse12_fp_binop_s_int<0x5C, "sub", SSE_ALU_ITINS_S>;
  defm DIV : basic_sse12_fp_binop_p<0x5E, "div", fdiv, SSE_DIV_ITINS_P>,
             basic_sse12_fp_binop_s<0x5E, "div", fdiv, SSE_DIV_ITINS_S>,
             basic_sse12_fp_binop_s_int<0x5E, "div", SSE_DIV_ITINS_S>;
  defm MAX : basic_sse12_fp_binop_p<0x5F, "max", X86fmax, SSE_ALU_ITINS_P>,
             basic_sse12_fp_binop_s<0x5F, "max", X86fmax, SSE_ALU_ITINS_S>,
             basic_sse12_fp_binop_s_int<0x5F, "max", SSE_ALU_ITINS_S>;
  defm MIN : basic_sse12_fp_binop_p<0x5D, "min", X86fmin, SSE_ALU_ITINS_P>,
             basic_sse12_fp_binop_s<0x5D, "min", X86fmin, SSE_ALU_ITINS_S>,
             basic_sse12_fp_binop_s_int<0x5D, "min", SSE_ALU_ITINS_S>;
}

// "Commutable" max/min variants.  isCodeGenOnly: these reuse the MAX/MIN
// encodings and exist only for instruction selection, not for the assembler.
let isCodeGenOnly = 1 in {
  defm MAXC: basic_sse12_fp_binop_p<0x5F, "max", X86fmaxc, SSE_ALU_ITINS_P>,
             basic_sse12_fp_binop_s<0x5F, "max", X86fmaxc, SSE_ALU_ITINS_S>;
  defm MINC: basic_sse12_fp_binop_p<0x5D, "min", X86fminc, SSE_ALU_ITINS_P>,
             basic_sse12_fp_binop_s<0x5D, "min", X86fminc, SSE_ALU_ITINS_S>;
}
3136
3137// Patterns used to select SSE scalar fp arithmetic instructions from
3138// either:
3139//
3140// (1) a scalar fp operation followed by a blend
3141//
3142// The effect is that the backend no longer emits unnecessary vector
3143// insert instructions immediately after SSE scalar fp instructions
3144// like addss or mulss.
3145//
3146// For example, given the following code:
3147//   __m128 foo(__m128 A, __m128 B) {
3148//     A[0] += B[0];
3149//     return A;
3150//   }
3151//
3152// Previously we generated:
3153//   addss %xmm0, %xmm1
3154//   movss %xmm1, %xmm0
3155//
3156// We now generate:
3157//   addss %xmm1, %xmm0
3158//
3159// (2) a vector packed single/double fp operation followed by a vector insert
3160//
3161// The effect is that the backend converts the packed fp instruction
3162// followed by a vector insert into a single SSE scalar fp instruction.
3163//
3164// For example, given the following code:
3165//   __m128 foo(__m128 A, __m128 B) {
3166//     __m128 C = A + B;
//     return (__m128) {C[0], A[1], A[2], A[3]};
3168//   }
3169//
3170// Previously we generated:
3171//   addps %xmm0, %xmm1
3172//   movss %xmm1, %xmm0
3173//
3174// We now generate:
3175//   addss %xmm1, %xmm0
3176
3177// TODO: Some canonicalization in lowering would simplify the number of
3178// patterns we have to try to match.
// scalar_math_f32_patterns - Map (scalar f32 op reinserted into lane 0)
// onto the SSrr_Int instructions.  'Op' is the scalar FP node
// (fadd/fsub/fmul/fdiv); 'OpcPrefix' names the instruction family
// ("ADD", "SUB", ...).
multiclass scalar_math_f32_patterns<SDNode Op, string OpcPrefix> {
  let Predicates = [UseSSE1] in {
    // extracted scalar math op with insert via movss
    def : Pat<(v4f32 (X86Movss (v4f32 VR128:$dst), (v4f32 (scalar_to_vector
          (Op (f32 (extractelt (v4f32 VR128:$dst), (iPTR 0))),
          FR32:$src))))),
      (!cast<I>(OpcPrefix#SSrr_Int) v4f32:$dst,
          (COPY_TO_REGCLASS FR32:$src, VR128))>;

    // vector math op with insert via movss
    def : Pat<(v4f32 (X86Movss (v4f32 VR128:$dst),
          (Op (v4f32 VR128:$dst), (v4f32 VR128:$src)))),
      (!cast<I>(OpcPrefix#SSrr_Int) v4f32:$dst, v4f32:$src)>;
  }

  // With SSE 4.1, blendi is preferred to movss, so match that too.
  // (Comment previously said "movsd"; this is the f32/movss multiclass.)
  let Predicates = [UseSSE41] in {
    // extracted scalar math op with insert via blend
    def : Pat<(v4f32 (X86Blendi (v4f32 VR128:$dst), (v4f32 (scalar_to_vector
          (Op (f32 (extractelt (v4f32 VR128:$dst), (iPTR 0))),
          FR32:$src))), (i8 1))),
      (!cast<I>(OpcPrefix#SSrr_Int) v4f32:$dst,
          (COPY_TO_REGCLASS FR32:$src, VR128))>;

    // vector math op with insert via blend
    def : Pat<(v4f32 (X86Blendi (v4f32 VR128:$dst),
          (Op (v4f32 VR128:$dst), (v4f32 VR128:$src)), (i8 1))),
      (!cast<I>(OpcPrefix#SSrr_Int) v4f32:$dst, v4f32:$src)>;
  }

  // Repeat everything for AVX, except for the movss + scalar combo...
  // because that one shouldn't occur with AVX codegen?
  let Predicates = [HasAVX] in {
    // extracted scalar math op with insert via blend
    def : Pat<(v4f32 (X86Blendi (v4f32 VR128:$dst), (v4f32 (scalar_to_vector
          (Op (f32 (extractelt (v4f32 VR128:$dst), (iPTR 0))),
          FR32:$src))), (i8 1))),
      (!cast<I>("V"#OpcPrefix#SSrr_Int) v4f32:$dst,
          (COPY_TO_REGCLASS FR32:$src, VR128))>;

    // vector math op with insert via movss
    def : Pat<(v4f32 (X86Movss (v4f32 VR128:$dst),
          (Op (v4f32 VR128:$dst), (v4f32 VR128:$src)))),
      (!cast<I>("V"#OpcPrefix#SSrr_Int) v4f32:$dst, v4f32:$src)>;

    // vector math op with insert via blend
    def : Pat<(v4f32 (X86Blendi (v4f32 VR128:$dst),
          (Op (v4f32 VR128:$dst), (v4f32 VR128:$src)), (i8 1))),
      (!cast<I>("V"#OpcPrefix#SSrr_Int) v4f32:$dst, v4f32:$src)>;
  }
}
3231
// Instantiate the f32 scalar-math folding patterns for each binop family.
defm : scalar_math_f32_patterns<fadd, "ADD">;
defm : scalar_math_f32_patterns<fsub, "SUB">;
defm : scalar_math_f32_patterns<fmul, "MUL">;
defm : scalar_math_f32_patterns<fdiv, "DIV">;
3236
// scalar_math_f64_patterns - f64 counterpart of scalar_math_f32_patterns:
// map (scalar f64 op reinserted into lane 0 via movsd/blend) onto the
// SDrr_Int instructions.
multiclass scalar_math_f64_patterns<SDNode Op, string OpcPrefix> {
  let Predicates = [UseSSE2] in {
    // extracted scalar math op with insert via movsd
    def : Pat<(v2f64 (X86Movsd (v2f64 VR128:$dst), (v2f64 (scalar_to_vector
          (Op (f64 (extractelt (v2f64 VR128:$dst), (iPTR 0))),
          FR64:$src))))),
      (!cast<I>(OpcPrefix#SDrr_Int) v2f64:$dst,
          (COPY_TO_REGCLASS FR64:$src, VR128))>;

    // vector math op with insert via movsd
    def : Pat<(v2f64 (X86Movsd (v2f64 VR128:$dst),
          (Op (v2f64 VR128:$dst), (v2f64 VR128:$src)))),
      (!cast<I>(OpcPrefix#SDrr_Int) v2f64:$dst, v2f64:$src)>;
  }

  // With SSE 4.1, blendi is preferred to movsd, so match those too.
  let Predicates = [UseSSE41] in {
    // extracted scalar math op with insert via blend
    def : Pat<(v2f64 (X86Blendi (v2f64 VR128:$dst), (v2f64 (scalar_to_vector
          (Op (f64 (extractelt (v2f64 VR128:$dst), (iPTR 0))),
          FR64:$src))), (i8 1))),
      (!cast<I>(OpcPrefix#SDrr_Int) v2f64:$dst,
          (COPY_TO_REGCLASS FR64:$src, VR128))>;

    // vector math op with insert via blend
    def : Pat<(v2f64 (X86Blendi (v2f64 VR128:$dst),
          (Op (v2f64 VR128:$dst), (v2f64 VR128:$src)), (i8 1))),
      (!cast<I>(OpcPrefix#SDrr_Int) v2f64:$dst, v2f64:$src)>;
  }

  // Repeat everything for AVX.  Unlike the f32 version, the movsd + scalar
  // combination is matched here as well.
  let Predicates = [HasAVX] in {
    // extracted scalar math op with insert via movsd
    def : Pat<(v2f64 (X86Movsd (v2f64 VR128:$dst), (v2f64 (scalar_to_vector
          (Op (f64 (extractelt (v2f64 VR128:$dst), (iPTR 0))),
          FR64:$src))))),
      (!cast<I>("V"#OpcPrefix#SDrr_Int) v2f64:$dst,
          (COPY_TO_REGCLASS FR64:$src, VR128))>;

    // extracted scalar math op with insert via blend
    def : Pat<(v2f64 (X86Blendi (v2f64 VR128:$dst), (v2f64 (scalar_to_vector
          (Op (f64 (extractelt (v2f64 VR128:$dst), (iPTR 0))),
          FR64:$src))), (i8 1))),
      (!cast<I>("V"#OpcPrefix#SDrr_Int) v2f64:$dst,
          (COPY_TO_REGCLASS FR64:$src, VR128))>;

    // vector math op with insert via movsd
    def : Pat<(v2f64 (X86Movsd (v2f64 VR128:$dst),
          (Op (v2f64 VR128:$dst), (v2f64 VR128:$src)))),
      (!cast<I>("V"#OpcPrefix#SDrr_Int) v2f64:$dst, v2f64:$src)>;

    // vector math op with insert via blend
    def : Pat<(v2f64 (X86Blendi (v2f64 VR128:$dst),
          (Op (v2f64 VR128:$dst), (v2f64 VR128:$src)), (i8 1))),
      (!cast<I>("V"#OpcPrefix#SDrr_Int) v2f64:$dst, v2f64:$src)>;
  }
}
3294
// Instantiate the f64 scalar-math folding patterns for each binop family.
defm : scalar_math_f64_patterns<fadd, "ADD">;
defm : scalar_math_f64_patterns<fsub, "SUB">;
defm : scalar_math_f64_patterns<fmul, "MUL">;
defm : scalar_math_f64_patterns<fdiv, "DIV">;
3299
3300
3301/// Unop Arithmetic
3302/// In addition, we also have a special variant of the scalar form here to
3303/// represent the associated intrinsic operation.  This form is unlike the
3304/// plain scalar form, in that it takes an entire vector (instead of a
3305/// scalar) and leaves the top elements undefined.
3306///
3307/// And, we have a special variant form for a full-vector intrinsic form.
3308
// Itineraries for the FP unary ops (sqrt/rsqrt/rcp), packed and scalar,
// each paired with its InstrSchedModel write class via the Sched let.
let Sched = WriteFSqrt in {
def SSE_SQRTPS : OpndItins<
  IIC_SSE_SQRTPS_RR, IIC_SSE_SQRTPS_RM
>;

def SSE_SQRTSS : OpndItins<
  IIC_SSE_SQRTSS_RR, IIC_SSE_SQRTSS_RM
>;

def SSE_SQRTPD : OpndItins<
  IIC_SSE_SQRTPD_RR, IIC_SSE_SQRTPD_RM
>;

def SSE_SQRTSD : OpndItins<
  IIC_SSE_SQRTSD_RR, IIC_SSE_SQRTSD_RM
>;
}

let Sched = WriteFRsqrt in {
def SSE_RSQRTPS : OpndItins<
  IIC_SSE_RSQRTPS_RR, IIC_SSE_RSQRTPS_RM
>;

def SSE_RSQRTSS : OpndItins<
  IIC_SSE_RSQRTSS_RR, IIC_SSE_RSQRTSS_RM
>;
}

let Sched = WriteFRcp in {
def SSE_RCPP : OpndItins<
  IIC_SSE_RCPP_RR, IIC_SSE_RCPP_RM
>;

def SSE_RCPS : OpndItins<
  IIC_SSE_RCPS_RR, IIC_SSE_RCPS_RM
>;
}
3346
3347/// sse_fp_unop_s - SSE1 unops in scalar form
3348/// For the non-AVX defs, we need $src1 to be tied to $dst because
3349/// the HW instructions are 2 operand / destructive.
/// sse_fp_unop_s - SSE1 unops in scalar form
/// For the non-AVX defs, we need $src1 to be tied to $dst because
/// the HW instructions are 2 operand / destructive.
multiclass sse_fp_unop_s<bits<8> opc, string OpcodeStr, RegisterClass RC,
                          ValueType vt, ValueType ScalarVT,
                          X86MemOperand x86memop, Operand vec_memop,
                          ComplexPattern mem_cpat, Intrinsic Intr,
                          SDNode OpNode, Domain d, OpndItins itins,
                          Predicate target, string Suffix> {
  let hasSideEffects = 0 in {
  // Register form: plain scalar op on RC (FR32/FR64).
  def r : I<opc, MRMSrcReg, (outs RC:$dst), (ins RC:$src1),
              !strconcat(OpcodeStr, "\t{$src1, $dst|$dst, $src1}"),
            [(set RC:$dst, (OpNode RC:$src1))], itins.rr, d>, Sched<[itins.Sched]>,
            Requires<[target]>;
  // Memory form: only selected under OptForSize -- presumably to avoid
  // partial-register-update stalls from folding scalar loads; TODO confirm.
  let mayLoad = 1 in
  def m : I<opc, MRMSrcMem, (outs RC:$dst), (ins x86memop:$src1),
            !strconcat(OpcodeStr, "\t{$src1, $dst|$dst, $src1}"),
            [(set RC:$dst, (OpNode (load addr:$src1)))], itins.rm, d>,
            Sched<[itins.Sched.Folded, ReadAfterLd]>,
            Requires<[target, OptForSize]>;

  // Intrinsic (VR128) forms.  Pattern-less; they are matched through the
  // Pats below so the upper elements of $dst can be preserved.
  let isCodeGenOnly = 1, Constraints = "$src1 = $dst" in {
  // FIX: this is a register-register form, so it uses the plain sched class;
  // the original erroneously scheduled it as a folded load
  // (itins.Sched.Folded, ReadAfterLd).
  def r_Int : I<opc, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src1, VR128:$src2),
              !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
            []>, Sched<[itins.Sched]>;
  let mayLoad = 1 in
  def m_Int : I<opc, MRMSrcMem, (outs VR128:$dst), (ins VR128:$src1, vec_memop:$src2),
              !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
            []>, Sched<[itins.Sched.Folded, ReadAfterLd]>;
  }
  }

  let Predicates = [target] in {
  // A scalar load feeding the vt-level unop uses the m_Int form with an
  // IMPLICIT_DEF pass-through for the upper elements.
  def : Pat<(vt (OpNode mem_cpat:$src)),
            (vt (COPY_TO_REGCLASS (vt (!cast<Instruction>(NAME#Suffix##m_Int)
                 (vt (IMPLICIT_DEF)), mem_cpat:$src)), RC))>;
  // These are unary operations, but they are modeled as having 2 source operands
  // because the high elements of the destination are unchanged in SSE.
  def : Pat<(Intr VR128:$src),
            (!cast<Instruction>(NAME#Suffix##r_Int) VR128:$src, VR128:$src)>;
  def : Pat<(Intr (load addr:$src)),
            (vt (COPY_TO_REGCLASS(!cast<Instruction>(NAME#Suffix##m)
                                      addr:$src), VR128))>;
  def : Pat<(Intr mem_cpat:$src),
             (!cast<Instruction>(NAME#Suffix##m_Int)
                    (vt (IMPLICIT_DEF)), mem_cpat:$src)>;
  }
}
3395
// avx_fp_unop_s - AVX (VEX three-operand) scalar unops, plus the VR128
// intrinsic forms and the selection patterns that use them.
multiclass avx_fp_unop_s<bits<8> opc, string OpcodeStr, RegisterClass RC,
                          ValueType vt, ValueType ScalarVT,
                          X86MemOperand x86memop, Operand vec_memop,
                          ComplexPattern mem_cpat,
                          Intrinsic Intr, SDNode OpNode, Domain d,
                          OpndItins itins, string Suffix> {
  let hasSideEffects = 0 in {
  def r : I<opc, MRMSrcReg, (outs RC:$dst), (ins RC:$src1, RC:$src2),
            !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
            [], itins.rr, d>, Sched<[itins.Sched]>;
  let mayLoad = 1 in
  def m : I<opc, MRMSrcMem, (outs RC:$dst), (ins RC:$src1, x86memop:$src2),
             !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
            [], itins.rm, d>, Sched<[itins.Sched.Folded, ReadAfterLd]>;
  let isCodeGenOnly = 1 in {
  // FIX: r_Int is a register-register form, so it uses the plain sched
  // class; the original erroneously used itins.Sched.Folded here.
  def r_Int : I<opc, MRMSrcReg, (outs VR128:$dst),
                (ins VR128:$src1, VR128:$src2),
             !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
             []>, Sched<[itins.Sched]>;
  let mayLoad = 1 in
  def m_Int : I<opc, MRMSrcMem, (outs VR128:$dst),
                (ins VR128:$src1, vec_memop:$src2),
             !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
             []>, Sched<[itins.Sched.Folded, ReadAfterLd]>;
  }
  }

  let Predicates = [UseAVX] in {
   // The first source of the 3-operand form is a don't-care pass-through.
   def : Pat<(OpNode RC:$src),  (!cast<Instruction>("V"#NAME#Suffix##r)
                                (ScalarVT (IMPLICIT_DEF)), RC:$src)>;

   def : Pat<(vt (OpNode mem_cpat:$src)),
             (!cast<Instruction>("V"#NAME#Suffix##m_Int) (vt (IMPLICIT_DEF)),
                                  mem_cpat:$src)>;

  }
  // Intrinsic patterns are enabled for HasAVX (not just UseAVX).
  let Predicates = [HasAVX] in {
   def : Pat<(Intr VR128:$src),
             (!cast<Instruction>("V"#NAME#Suffix##r_Int) (vt (IMPLICIT_DEF)),
                                 VR128:$src)>;

   def : Pat<(Intr mem_cpat:$src),
             (!cast<Instruction>("V"#NAME#Suffix##m_Int)
                    (vt (IMPLICIT_DEF)), mem_cpat:$src)>;
  }
  // Folding the plain scalar load is restricted to OptForSize, matching the
  // legacy SSE memory form.
  let Predicates = [UseAVX, OptForSize] in
  def : Pat<(ScalarVT (OpNode (load addr:$src))),
            (!cast<Instruction>("V"#NAME#Suffix##m) (ScalarVT (IMPLICIT_DEF)),
             addr:$src)>;
}
3446
3447/// sse1_fp_unop_p - SSE1 unops in packed form.
// Expands the AVX 128/256-bit packed-single forms (unaligned loads OK via
// loadv*) and the legacy SSE 128-bit forms (aligned memopv4f32).
multiclass sse1_fp_unop_p<bits<8> opc, string OpcodeStr, SDNode OpNode,
                          OpndItins itins> {
let Predicates = [HasAVX] in {
  def V#NAME#PSr : PSI<opc, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
                       !strconcat("v", OpcodeStr,
                                  "ps\t{$src, $dst|$dst, $src}"),
                       [(set VR128:$dst, (v4f32 (OpNode VR128:$src)))],
                       itins.rr>, VEX, Sched<[itins.Sched]>;
  def V#NAME#PSm : PSI<opc, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
                       !strconcat("v", OpcodeStr,
                                  "ps\t{$src, $dst|$dst, $src}"),
                       [(set VR128:$dst, (OpNode (loadv4f32 addr:$src)))],
                       itins.rm>, VEX, Sched<[itins.Sched.Folded]>;
  def V#NAME#PSYr : PSI<opc, MRMSrcReg, (outs VR256:$dst), (ins VR256:$src),
                        !strconcat("v", OpcodeStr,
                                   "ps\t{$src, $dst|$dst, $src}"),
                        [(set VR256:$dst, (v8f32 (OpNode VR256:$src)))],
                        itins.rr>, VEX, VEX_L, Sched<[itins.Sched]>;
  def V#NAME#PSYm : PSI<opc, MRMSrcMem, (outs VR256:$dst), (ins f256mem:$src),
                        !strconcat("v", OpcodeStr,
                                   "ps\t{$src, $dst|$dst, $src}"),
                        [(set VR256:$dst, (OpNode (loadv8f32 addr:$src)))],
                        itins.rm>, VEX, VEX_L, Sched<[itins.Sched.Folded]>;
}

  def PSr : PSI<opc, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
                !strconcat(OpcodeStr, "ps\t{$src, $dst|$dst, $src}"),
                [(set VR128:$dst, (v4f32 (OpNode VR128:$src)))], itins.rr>,
            Sched<[itins.Sched]>;
  def PSm : PSI<opc, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
                !strconcat(OpcodeStr, "ps\t{$src, $dst|$dst, $src}"),
                [(set VR128:$dst, (OpNode (memopv4f32 addr:$src)))], itins.rm>,
            Sched<[itins.Sched.Folded]>;
}
3482
3483/// sse2_fp_unop_p - SSE2 unops in vector forms.
/// sse2_fp_unop_p - SSE2 unops in vector forms.
/// Mirrors sse1_fp_unop_p for packed-double: AVX 128/256-bit forms plus the
/// legacy SSE 128-bit forms (aligned memopv2f64).
multiclass sse2_fp_unop_p<bits<8> opc, string OpcodeStr,
                          SDNode OpNode, OpndItins itins> {
let Predicates = [HasAVX] in {
  def V#NAME#PDr : PDI<opc, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
                       !strconcat("v", OpcodeStr,
                                  "pd\t{$src, $dst|$dst, $src}"),
                       [(set VR128:$dst, (v2f64 (OpNode VR128:$src)))],
                       itins.rr>, VEX, Sched<[itins.Sched]>;
  def V#NAME#PDm : PDI<opc, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
                       !strconcat("v", OpcodeStr,
                                  "pd\t{$src, $dst|$dst, $src}"),
                       [(set VR128:$dst, (OpNode (loadv2f64 addr:$src)))],
                       itins.rm>, VEX, Sched<[itins.Sched.Folded]>;
  def V#NAME#PDYr : PDI<opc, MRMSrcReg, (outs VR256:$dst), (ins VR256:$src),
                        !strconcat("v", OpcodeStr,
                                   "pd\t{$src, $dst|$dst, $src}"),
                        [(set VR256:$dst, (v4f64 (OpNode VR256:$src)))],
                        itins.rr>, VEX, VEX_L, Sched<[itins.Sched]>;
  def V#NAME#PDYm : PDI<opc, MRMSrcMem, (outs VR256:$dst), (ins f256mem:$src),
                        !strconcat("v", OpcodeStr,
                                   "pd\t{$src, $dst|$dst, $src}"),
                        [(set VR256:$dst, (OpNode (loadv4f64 addr:$src)))],
                        itins.rm>, VEX, VEX_L, Sched<[itins.Sched.Folded]>;
}

  def PDr : PDI<opc, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
              !strconcat(OpcodeStr, "pd\t{$src, $dst|$dst, $src}"),
              [(set VR128:$dst, (v2f64 (OpNode VR128:$src)))], itins.rr>,
            Sched<[itins.Sched]>;
  def PDm : PDI<opc, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
                !strconcat(OpcodeStr, "pd\t{$src, $dst|$dst, $src}"),
                [(set VR128:$dst, (OpNode (memopv2f64 addr:$src)))], itins.rm>,
            Sched<[itins.Sched.Folded]>;
}
3518
// sse1_fp_unop_s - Instantiate the scalar f32 unop (e.g. sqrtss): legacy SSE
// form plus the AVX VEX form.  OpcodeStr##ss builds the mnemonic, and the
// int_x86_sse_*_ss intrinsic name is derived via !cast.
multiclass sse1_fp_unop_s<bits<8> opc, string OpcodeStr, SDNode OpNode,
                          OpndItins itins> {
  defm SS        :  sse_fp_unop_s<opc, OpcodeStr##ss, FR32, v4f32, f32, f32mem,
                      ssmem, sse_load_f32,
                      !cast<Intrinsic>("int_x86_sse_"##OpcodeStr##_ss), OpNode,
                      SSEPackedSingle, itins, UseSSE1, "SS">, XS;
  defm V#NAME#SS  : avx_fp_unop_s<opc, "v"#OpcodeStr##ss, FR32, v4f32, f32,
                      f32mem, ssmem, sse_load_f32,
                      !cast<Intrinsic>("int_x86_sse_"##OpcodeStr##_ss), OpNode,
                      SSEPackedSingle, itins, "SS">, XS, VEX_4V, VEX_LIG;
}
3530
// sse2_fp_unop_s - Scalar f64 counterpart of sse1_fp_unop_s (e.g. sqrtsd),
// using the int_x86_sse2_*_sd intrinsics.
multiclass sse2_fp_unop_s<bits<8> opc, string OpcodeStr, SDNode OpNode,
                          OpndItins itins> {
  defm SD         : sse_fp_unop_s<opc, OpcodeStr##sd, FR64, v2f64, f64, f64mem,
                         sdmem, sse_load_f64,
                         !cast<Intrinsic>("int_x86_sse2_"##OpcodeStr##_sd),
                         OpNode, SSEPackedDouble, itins, UseSSE2, "SD">, XD;
  defm V#NAME#SD  : avx_fp_unop_s<opc, "v"#OpcodeStr##sd, FR64, v2f64, f64,
                         f64mem, sdmem, sse_load_f64,
                         !cast<Intrinsic>("int_x86_sse2_"##OpcodeStr##_sd),
                         OpNode, SSEPackedDouble, itins, "SD">,
                         XD, VEX_4V, VEX_LIG;
}
3543
// Square root.
// Expands scalar (SS/SD) and packed (PS/PD) sqrt for SSE1/SSE2 and AVX.
defm SQRT  : sse1_fp_unop_s<0x51, "sqrt", fsqrt, SSE_SQRTSS>,
             sse1_fp_unop_p<0x51, "sqrt", fsqrt, SSE_SQRTPS>,
             sse2_fp_unop_s<0x51, "sqrt", fsqrt, SSE_SQRTSD>,
             sse2_fp_unop_p<0x51, "sqrt", fsqrt, SSE_SQRTPD>;

// Reciprocal approximations. Note that these typically require refinement
// in order to obtain suitable precision.
defm RSQRT : sse1_fp_unop_s<0x52, "rsqrt", X86frsqrt, SSE_RSQRTSS>,
             sse1_fp_unop_p<0x52, "rsqrt", X86frsqrt, SSE_RSQRTPS>;
defm RCP   : sse1_fp_unop_s<0x53, "rcp", X86frcp, SSE_RCPS>,
             sse1_fp_unop_p<0x53, "rcp", X86frcp, SSE_RCPP>;

// There is no f64 version of the reciprocal approximation instructions.
3558
3559// TODO: We should add *scalar* op patterns for these just like we have for
3560// the binops above. If the binop and unop patterns could all be unified
3561// that would be even better.
3562
// scalar_unary_math_patterns - Fold (insert of a scalar unary intrinsic
// result into lane 0, via movs* or SSE4.1 blend) into the corresponding
// *r_Int instruction.
multiclass scalar_unary_math_patterns<Intrinsic Intr, string OpcPrefix,
                                      SDNode Move, ValueType VT,
                                      Predicate BasePredicate> {
  let Predicates = [BasePredicate] in {
    def : Pat<(VT (Move VT:$dst, (Intr VT:$src))),
              (!cast<I>(OpcPrefix#r_Int) VT:$dst, VT:$src)>;
  }

  // With SSE 4.1, blendi is preferred to movs*, so match that too.
  let Predicates = [UseSSE41] in {
    def : Pat<(VT (X86Blendi VT:$dst, (Intr VT:$src), (i8 1))),
              (!cast<I>(OpcPrefix#r_Int) VT:$dst, VT:$src)>;
  }

  // Repeat for AVX versions of the instructions.
  let Predicates = [HasAVX] in {
    def : Pat<(VT (Move VT:$dst, (Intr VT:$src))),
              (!cast<I>("V"#OpcPrefix#r_Int) VT:$dst, VT:$src)>;

    def : Pat<(VT (X86Blendi VT:$dst, (Intr VT:$src), (i8 1))),
              (!cast<I>("V"#OpcPrefix#r_Int) VT:$dst, VT:$src)>;
  }
}
3586
// Instantiate the unary scalar-math folding patterns.  rcp/rsqrt exist only
// for f32; sqrt has both f32 and f64 forms.
defm : scalar_unary_math_patterns<int_x86_sse_rcp_ss, "RCPSS", X86Movss,
                                  v4f32, UseSSE1>;
defm : scalar_unary_math_patterns<int_x86_sse_rsqrt_ss, "RSQRTSS", X86Movss,
                                  v4f32, UseSSE1>;
defm : scalar_unary_math_patterns<int_x86_sse_sqrt_ss, "SQRTSS", X86Movss,
                                  v4f32, UseSSE1>;
defm : scalar_unary_math_patterns<int_x86_sse2_sqrt_sd, "SQRTSD", X86Movsd,
                                  v2f64, UseSSE2>;
3595
3596
3597//===----------------------------------------------------------------------===//
3598// SSE 1 & 2 - Non-temporal stores
3599//===----------------------------------------------------------------------===//
3600
let AddedComplexity = 400 in { // Prefer non-temporal versions
let SchedRW = [WriteStore] in {
let Predicates = [HasAVX, NoVLX] in {
// 128-bit AVX non-temporal stores.  All use alignednontemporalstore, i.e.
// the address must be suitably aligned.
def VMOVNTPSmr : VPSI<0x2B, MRMDestMem, (outs),
                     (ins f128mem:$dst, VR128:$src),
                     "movntps\t{$src, $dst|$dst, $src}",
                     [(alignednontemporalstore (v4f32 VR128:$src),
                                               addr:$dst)],
                                               IIC_SSE_MOVNT>, VEX;
def VMOVNTPDmr : VPDI<0x2B, MRMDestMem, (outs),
                     (ins f128mem:$dst, VR128:$src),
                     "movntpd\t{$src, $dst|$dst, $src}",
                     [(alignednontemporalstore (v2f64 VR128:$src),
                                               addr:$dst)],
                                               IIC_SSE_MOVNT>, VEX;

let ExeDomain = SSEPackedInt in
def VMOVNTDQmr    : VPDI<0xE7, MRMDestMem, (outs),
                         (ins f128mem:$dst, VR128:$src),
                         "movntdq\t{$src, $dst|$dst, $src}",
                         [(alignednontemporalstore (v2i64 VR128:$src),
                                                   addr:$dst)],
                                                   IIC_SSE_MOVNT>, VEX;

// 256-bit AVX non-temporal stores.
def VMOVNTPSYmr : VPSI<0x2B, MRMDestMem, (outs),
                     (ins f256mem:$dst, VR256:$src),
                     "movntps\t{$src, $dst|$dst, $src}",
                     [(alignednontemporalstore (v8f32 VR256:$src),
                                               addr:$dst)],
                                               IIC_SSE_MOVNT>, VEX, VEX_L;
def VMOVNTPDYmr : VPDI<0x2B, MRMDestMem, (outs),
                     (ins f256mem:$dst, VR256:$src),
                     "movntpd\t{$src, $dst|$dst, $src}",
                     [(alignednontemporalstore (v4f64 VR256:$src),
                                               addr:$dst)],
                                               IIC_SSE_MOVNT>, VEX, VEX_L;
let ExeDomain = SSEPackedInt in
def VMOVNTDQYmr : VPDI<0xE7, MRMDestMem, (outs),
                    (ins f256mem:$dst, VR256:$src),
                    "movntdq\t{$src, $dst|$dst, $src}",
                    [(alignednontemporalstore (v4i64 VR256:$src),
                                              addr:$dst)],
                                              IIC_SSE_MOVNT>, VEX, VEX_L;
}
3645
// Legacy SSE non-temporal packed stores.
def MOVNTPSmr : PSI<0x2B, MRMDestMem, (outs), (ins f128mem:$dst, VR128:$src),
                    "movntps\t{$src, $dst|$dst, $src}",
                    [(alignednontemporalstore (v4f32 VR128:$src), addr:$dst)],
                    IIC_SSE_MOVNT>;
def MOVNTPDmr : PDI<0x2B, MRMDestMem, (outs), (ins f128mem:$dst, VR128:$src),
                    "movntpd\t{$src, $dst|$dst, $src}",
                    [(alignednontemporalstore(v2f64 VR128:$src), addr:$dst)],
                    IIC_SSE_MOVNT>;

let ExeDomain = SSEPackedInt in
def MOVNTDQmr : PDI<0xE7, MRMDestMem, (outs), (ins f128mem:$dst, VR128:$src),
                    "movntdq\t{$src, $dst|$dst, $src}",
                    [(alignednontemporalstore (v2i64 VR128:$src), addr:$dst)],
                    IIC_SSE_MOVNT>;

// There is no AVX form for instructions below this point
// Scalar non-temporal integer stores (movnti); SSE2 feature, GR32/GR64.
def MOVNTImr : I<0xC3, MRMDestMem, (outs), (ins i32mem:$dst, GR32:$src),
                 "movnti{l}\t{$src, $dst|$dst, $src}",
                 [(nontemporalstore (i32 GR32:$src), addr:$dst)],
                 IIC_SSE_MOVNT>,
               PS, Requires<[HasSSE2]>;
def MOVNTI_64mr : RI<0xC3, MRMDestMem, (outs), (ins i64mem:$dst, GR64:$src),
                     "movnti{q}\t{$src, $dst|$dst, $src}",
                     [(nontemporalstore (i64 GR64:$src), addr:$dst)],
                     IIC_SSE_MOVNT>,
                  PS, Requires<[HasSSE2]>;
} // SchedRW = [WriteStore]
3673
// Route the remaining integer vector types through the MOVNTDQ forms.
let Predicates = [HasAVX2, NoVLX] in {
  def : Pat<(alignednontemporalstore (v8i32 VR256:$src), addr:$dst),
            (VMOVNTDQYmr addr:$dst, VR256:$src)>;
  def : Pat<(alignednontemporalstore (v16i16 VR256:$src), addr:$dst),
            (VMOVNTDQYmr addr:$dst, VR256:$src)>;
  def : Pat<(alignednontemporalstore (v32i8 VR256:$src), addr:$dst),
            (VMOVNTDQYmr addr:$dst, VR256:$src)>;
}

let Predicates = [HasAVX, NoVLX] in {
  def : Pat<(alignednontemporalstore (v4i32 VR128:$src), addr:$dst),
            (VMOVNTDQmr addr:$dst, VR128:$src)>;
  def : Pat<(alignednontemporalstore (v8i16 VR128:$src), addr:$dst),
            (VMOVNTDQmr addr:$dst, VR128:$src)>;
  def : Pat<(alignednontemporalstore (v16i8 VR128:$src), addr:$dst),
            (VMOVNTDQmr addr:$dst, VR128:$src)>;
}

// NOTE(review): these SSE fallback patterns carry no predicate; selection
// presumably relies on the predicated AVX patterns above taking precedence
// when AVX is available -- TODO confirm.
def : Pat<(alignednontemporalstore (v4i32 VR128:$src), addr:$dst),
          (MOVNTDQmr addr:$dst, VR128:$src)>;
def : Pat<(alignednontemporalstore (v8i16 VR128:$src), addr:$dst),
          (MOVNTDQmr addr:$dst, VR128:$src)>;
def : Pat<(alignednontemporalstore (v16i8 VR128:$src), addr:$dst),
          (MOVNTDQmr addr:$dst, VR128:$src)>;

} // AddedComplexity
3700
3701//===----------------------------------------------------------------------===//
3702// SSE 1 & 2 - Prefetch and memory fence
3703//===----------------------------------------------------------------------===//
3704
3705// Prefetch intrinsic.
// Prefetch intrinsic.
// The 'prefetch' node operands are (address, rw, locality, cache-type);
// the (i32 N) locality values map t0 -> 3 ... nta -> 0.
let Predicates = [HasSSE1], SchedRW = [WriteLoad] in {
def PREFETCHT0   : I<0x18, MRM1m, (outs), (ins i8mem:$src),
    "prefetcht0\t$src", [(prefetch addr:$src, imm, (i32 3), (i32 1))],
    IIC_SSE_PREFETCH>, TB;
def PREFETCHT1   : I<0x18, MRM2m, (outs), (ins i8mem:$src),
    "prefetcht1\t$src", [(prefetch addr:$src, imm, (i32 2), (i32 1))],
    IIC_SSE_PREFETCH>, TB;
def PREFETCHT2   : I<0x18, MRM3m, (outs), (ins i8mem:$src),
    "prefetcht2\t$src", [(prefetch addr:$src, imm, (i32 1), (i32 1))],
    IIC_SSE_PREFETCH>, TB;
def PREFETCHNTA  : I<0x18, MRM0m, (outs), (ins i8mem:$src),
    "prefetchnta\t$src", [(prefetch addr:$src, imm, (i32 0), (i32 1))],
    IIC_SSE_PREFETCH>, TB;
}
3720
3721// FIXME: How should flush instruction be modeled?
// FIXME: How should flush instruction be modeled?
let SchedRW = [WriteLoad] in {
// Flush cache
def CLFLUSH : I<0xAE, MRM7m, (outs), (ins i8mem:$src),
               "clflush\t$src", [(int_x86_sse2_clflush addr:$src)],
               IIC_SSE_PREFETCH>, PS, Requires<[HasSSE2]>;
}
3728
let SchedRW = [WriteNop] in {
// Pause. This "instruction" is encoded as "rep; nop", so even though it
// was introduced with SSE2, it's backward compatible.
def PAUSE : I<0x90, RawFrm, (outs), (ins),
              "pause", [(int_x86_sse2_pause)], IIC_SSE_PAUSE>,
              OBXS, Requires<[HasSSE2]>;
}
3736
3737let SchedRW = [WriteFence] in {
3738// Load, store, and memory fence
3739def SFENCE : I<0xAE, MRM_F8, (outs), (ins),
3740               "sfence", [(int_x86_sse_sfence)], IIC_SSE_SFENCE>,
3741               PS, Requires<[HasSSE1]>;
3742def LFENCE : I<0xAE, MRM_E8, (outs), (ins),
3743               "lfence", [(int_x86_sse2_lfence)], IIC_SSE_LFENCE>,
3744               TB, Requires<[HasSSE2]>;
3745def MFENCE : I<0xAE, MRM_F0, (outs), (ins),
3746               "mfence", [(int_x86_sse2_mfence)], IIC_SSE_MFENCE>,
3747               TB, Requires<[HasSSE2]>;
3748} // SchedRW
3749
// Also lower the X86-specific fence DAG nodes to the same instructions.
def : Pat<(X86SFence), (SFENCE)>;
def : Pat<(X86LFence), (LFENCE)>;
def : Pat<(X86MFence), (MFENCE)>;
3753
3754//===----------------------------------------------------------------------===//
3755// SSE 1 & 2 - Load/Store XCSR register
3756//===----------------------------------------------------------------------===//
3757
// VEX-encoded load/store of the MXCSR control/status register (0F AE /2
// and /3), selected from the ldmxcsr/stmxcsr intrinsics.
def VLDMXCSR : VPSI<0xAE, MRM2m, (outs), (ins i32mem:$src),
                  "ldmxcsr\t$src", [(int_x86_sse_ldmxcsr addr:$src)],
                  IIC_SSE_LDMXCSR>, VEX, Sched<[WriteLoad]>;
def VSTMXCSR : VPSI<0xAE, MRM3m, (outs), (ins i32mem:$dst),
                  "stmxcsr\t$dst", [(int_x86_sse_stmxcsr addr:$dst)],
                  IIC_SSE_STMXCSR>, VEX, Sched<[WriteStore]>;
3764
// Legacy (non-VEX) encodings, used when the AVX forms are unavailable
// (UseSSE1 predicate).
let Predicates = [UseSSE1] in {
def LDMXCSR : I<0xAE, MRM2m, (outs), (ins i32mem:$src),
                "ldmxcsr\t$src", [(int_x86_sse_ldmxcsr addr:$src)],
                IIC_SSE_LDMXCSR>, TB, Sched<[WriteLoad]>;
def STMXCSR : I<0xAE, MRM3m, (outs), (ins i32mem:$dst),
                "stmxcsr\t$dst", [(int_x86_sse_stmxcsr addr:$dst)],
                IIC_SSE_STMXCSR>, TB, Sched<[WriteStore]>;
}
3773
3774//===---------------------------------------------------------------------===//
3775// SSE2 - Move Aligned/Unaligned Packed Integer Instructions
3776//===---------------------------------------------------------------------===//
3777
let ExeDomain = SSEPackedInt in { // SSE integer instructions

// Register-to-register moves (VEX forms). No selection patterns: these are
// used for copies, not matched from the DAG.
let hasSideEffects = 0, SchedRW = [WriteMove] in {
def VMOVDQArr  : VPDI<0x6F, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
                    "movdqa\t{$src, $dst|$dst, $src}", [], IIC_SSE_MOVA_P_RR>,
                    VEX;
def VMOVDQAYrr : VPDI<0x6F, MRMSrcReg, (outs VR256:$dst), (ins VR256:$src),
                    "movdqa\t{$src, $dst|$dst, $src}", [], IIC_SSE_MOVA_P_RR>,
                    VEX, VEX_L;
def VMOVDQUrr  : VSSI<0x6F, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
                    "movdqu\t{$src, $dst|$dst, $src}", [], IIC_SSE_MOVU_P_RR>,
                    VEX;
def VMOVDQUYrr : VSSI<0x6F, MRMSrcReg, (outs VR256:$dst), (ins VR256:$src),
                    "movdqu\t{$src, $dst|$dst, $src}", [], IIC_SSE_MOVU_P_RR>,
                    VEX, VEX_L;
}

// For Disassembler
// _REV forms use the store-direction opcode (0x7F) so both reg-reg
// encodings can be disassembled; isCodeGenOnly keeps them out of codegen
// (they carry no patterns).
let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0,
    SchedRW = [WriteMove] in {
def VMOVDQArr_REV  : VPDI<0x7F, MRMDestReg, (outs VR128:$dst), (ins VR128:$src),
                        "movdqa\t{$src, $dst|$dst, $src}", [],
                        IIC_SSE_MOVA_P_RR>,
                        VEX;
def VMOVDQAYrr_REV : VPDI<0x7F, MRMDestReg, (outs VR256:$dst), (ins VR256:$src),
                        "movdqa\t{$src, $dst|$dst, $src}", [],
                        IIC_SSE_MOVA_P_RR>, VEX, VEX_L;
def VMOVDQUrr_REV  : VSSI<0x7F, MRMDestReg, (outs VR128:$dst), (ins VR128:$src),
                        "movdqu\t{$src, $dst|$dst, $src}", [],
                        IIC_SSE_MOVU_P_RR>,
                        VEX;
def VMOVDQUYrr_REV : VSSI<0x7F, MRMDestReg, (outs VR256:$dst), (ins VR256:$src),
                        "movdqu\t{$src, $dst|$dst, $src}", [],
                        IIC_SSE_MOVU_P_RR>, VEX, VEX_L;
}

// Loads (VEX forms). Load patterns live in separate Pat<> definitions;
// canFoldAsLoad/isReMaterializable let the register allocator fold or
// rematerialize these.
let canFoldAsLoad = 1, mayLoad = 1, isReMaterializable = 1,
    hasSideEffects = 0, SchedRW = [WriteLoad] in {
def VMOVDQArm  : VPDI<0x6F, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src),
                   "movdqa\t{$src, $dst|$dst, $src}", [], IIC_SSE_MOVA_P_RM>,
                   VEX;
def VMOVDQAYrm : VPDI<0x6F, MRMSrcMem, (outs VR256:$dst), (ins i256mem:$src),
                   "movdqa\t{$src, $dst|$dst, $src}", [], IIC_SSE_MOVA_P_RM>,
                   VEX, VEX_L;
let Predicates = [HasAVX] in {
  def VMOVDQUrm  : I<0x6F, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src),
                    "vmovdqu\t{$src, $dst|$dst, $src}",[], IIC_SSE_MOVU_P_RM>,
                    XS, VEX;
  def VMOVDQUYrm : I<0x6F, MRMSrcMem, (outs VR256:$dst), (ins i256mem:$src),
                    "vmovdqu\t{$src, $dst|$dst, $src}",[], IIC_SSE_MOVU_P_RM>,
                    XS, VEX, VEX_L;
}
}

// Stores (VEX forms).
let mayStore = 1, hasSideEffects = 0, SchedRW = [WriteStore] in {
def VMOVDQAmr  : VPDI<0x7F, MRMDestMem, (outs),
                     (ins i128mem:$dst, VR128:$src),
                     "movdqa\t{$src, $dst|$dst, $src}", [], IIC_SSE_MOVA_P_MR>,
                     VEX;
def VMOVDQAYmr : VPDI<0x7F, MRMDestMem, (outs),
                     (ins i256mem:$dst, VR256:$src),
                     "movdqa\t{$src, $dst|$dst, $src}", [], IIC_SSE_MOVA_P_MR>,
                     VEX, VEX_L;
let Predicates = [HasAVX] in {
def VMOVDQUmr  : I<0x7F, MRMDestMem, (outs), (ins i128mem:$dst, VR128:$src),
                  "vmovdqu\t{$src, $dst|$dst, $src}",[], IIC_SSE_MOVU_P_MR>,
                  XS, VEX;
def VMOVDQUYmr : I<0x7F, MRMDestMem, (outs), (ins i256mem:$dst, VR256:$src),
                  "vmovdqu\t{$src, $dst|$dst, $src}",[], IIC_SSE_MOVU_P_MR>,
                  XS, VEX, VEX_L;
}
}

// Legacy SSE2 encodings of the same moves.
let SchedRW = [WriteMove] in {
let hasSideEffects = 0 in
def MOVDQArr : PDI<0x6F, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
                   "movdqa\t{$src, $dst|$dst, $src}", [], IIC_SSE_MOVA_P_RR>;

def MOVDQUrr :   I<0x6F, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
                   "movdqu\t{$src, $dst|$dst, $src}",
                   [], IIC_SSE_MOVU_P_RR>, XS, Requires<[UseSSE2]>;

// For Disassembler
let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0 in {
def MOVDQArr_REV : PDI<0x7F, MRMDestReg, (outs VR128:$dst), (ins VR128:$src),
                       "movdqa\t{$src, $dst|$dst, $src}", [],
                       IIC_SSE_MOVA_P_RR>;

def MOVDQUrr_REV :   I<0x7F, MRMDestReg, (outs VR128:$dst), (ins VR128:$src),
                       "movdqu\t{$src, $dst|$dst, $src}",
                       [], IIC_SSE_MOVU_P_RR>, XS, Requires<[UseSSE2]>;
}
} // SchedRW

// Legacy SSE2 loads. The commented-out patterns are intentionally disabled
// here; selection presumably happens via separate Pat<> rules elsewhere.
let canFoldAsLoad = 1, mayLoad = 1, isReMaterializable = 1,
    hasSideEffects = 0, SchedRW = [WriteLoad] in {
def MOVDQArm : PDI<0x6F, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src),
                   "movdqa\t{$src, $dst|$dst, $src}",
                   [/*(set VR128:$dst, (alignedloadv2i64 addr:$src))*/],
                   IIC_SSE_MOVA_P_RM>;
def MOVDQUrm :   I<0x6F, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src),
                   "movdqu\t{$src, $dst|$dst, $src}",
                   [/*(set VR128:$dst, (loadv2i64 addr:$src))*/],
                   IIC_SSE_MOVU_P_RM>,
                 XS, Requires<[UseSSE2]>;
}

// Legacy SSE2 stores.
let mayStore = 1, hasSideEffects = 0, SchedRW = [WriteStore] in {
def MOVDQAmr : PDI<0x7F, MRMDestMem, (outs), (ins i128mem:$dst, VR128:$src),
                   "movdqa\t{$src, $dst|$dst, $src}",
                   [/*(alignedstore (v2i64 VR128:$src), addr:$dst)*/],
                   IIC_SSE_MOVA_P_MR>;
def MOVDQUmr :   I<0x7F, MRMDestMem, (outs), (ins i128mem:$dst, VR128:$src),
                   "movdqu\t{$src, $dst|$dst, $src}",
                   [/*(store (v2i64 VR128:$src), addr:$dst)*/],
                   IIC_SSE_MOVU_P_MR>,
                 XS, Requires<[UseSSE2]>;
}

} // ExeDomain = SSEPackedInt
3898
// Select the unaligned-store intrinsics to the corresponding movdqu store
// instructions (AVX forms when available, legacy form otherwise).
let Predicates = [HasAVX] in {
  def : Pat<(int_x86_sse2_storeu_dq addr:$dst, VR128:$src),
            (VMOVDQUmr addr:$dst, VR128:$src)>;
  def : Pat<(int_x86_avx_storeu_dq_256 addr:$dst, VR256:$src),
            (VMOVDQUYmr addr:$dst, VR256:$src)>;
}
let Predicates = [UseSSE2] in
def : Pat<(int_x86_sse2_storeu_dq addr:$dst, VR128:$src),
          (MOVDQUmr addr:$dst, VR128:$src)>;
3908
3909//===---------------------------------------------------------------------===//
3910// SSE2 - Packed Integer Arithmetic Instructions
3911//===---------------------------------------------------------------------===//
3912
// Itinerary for pmaddwd: same class for both register and folded-load
// forms, scheduled as a vector integer multiply.
let Sched = WriteVecIMul in
def SSE_PMADD : OpndItins<
  IIC_SSE_PMADD, IIC_SSE_PMADD
>;
3917
let ExeDomain = SSEPackedInt in { // SSE integer instructions

/// PDI_binop_rm_int - Intrinsic-based packed integer binop. Emits a
/// register-register form (rr) and a folded-load form (rm). Is2Addr selects
/// the two-operand (legacy SSE) vs. three-operand (VEX) assembly string.
multiclass PDI_binop_rm_int<bits<8> opc, string OpcodeStr, Intrinsic IntId,
                            RegisterClass RC, PatFrag memop_frag,
                            X86MemOperand x86memop,
                            OpndItins itins,
                            bit IsCommutable = 0,
                            bit Is2Addr = 1> {
  let isCommutable = IsCommutable in
  def rr : PDI<opc, MRMSrcReg, (outs RC:$dst),
       (ins RC:$src1, RC:$src2),
       !if(Is2Addr,
           !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
           !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
       [(set RC:$dst, (IntId RC:$src1, RC:$src2))], itins.rr>,
      Sched<[itins.Sched]>;
  def rm : PDI<opc, MRMSrcMem, (outs RC:$dst),
       (ins RC:$src1, x86memop:$src2),
       !if(Is2Addr,
           !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
           !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
       [(set RC:$dst, (IntId RC:$src1, (bitconvert (memop_frag addr:$src2))))],
       itins.rm>, Sched<[itins.Sched.Folded, ReadAfterLd]>;
}

/// PDI_binop_all_int - Instantiates three forms of an intrinsic binop via
/// PDI_binop_rm_int: the AVX 128-bit form (V-prefixed, VEX_4V), the legacy
/// SSE 128-bit two-address form, and the AVX2 256-bit form (VY, VEX_L).
multiclass PDI_binop_all_int<bits<8> opc, string OpcodeStr, Intrinsic IntId128,
                             Intrinsic IntId256, OpndItins itins,
                             bit IsCommutable = 0> {
let Predicates = [HasAVX] in
  defm V#NAME : PDI_binop_rm_int<opc, !strconcat("v", OpcodeStr), IntId128,
                                 VR128, loadv2i64, i128mem, itins,
                                 IsCommutable, 0>, VEX_4V;

let Constraints = "$src1 = $dst" in
  defm NAME : PDI_binop_rm_int<opc, OpcodeStr, IntId128, VR128, memopv2i64,
                               i128mem, itins, IsCommutable, 1>;

let Predicates = [HasAVX2] in
  defm V#NAME#Y : PDI_binop_rm_int<opc, !strconcat("v", OpcodeStr), IntId256,
                                   VR256, loadv4i64, i256mem, itins,
                                   IsCommutable, 0>, VEX_4V, VEX_L;
}

/// PDI_binop_rmi - Packed shift: shift-by-xmm forms (rr/rm) plus a
/// shift-by-immediate form (ri) using a second opcode and ModRM format.
multiclass PDI_binop_rmi<bits<8> opc, bits<8> opc2, Format ImmForm,
                         string OpcodeStr, SDNode OpNode,
                         SDNode OpNode2, RegisterClass RC,
                         ValueType DstVT, ValueType SrcVT, PatFrag bc_frag,
                         PatFrag ld_frag, ShiftOpndItins itins,
                         bit Is2Addr = 1> {
  // src2 is always 128-bit
  def rr : PDI<opc, MRMSrcReg, (outs RC:$dst),
       (ins RC:$src1, VR128:$src2),
       !if(Is2Addr,
           !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
           !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
       [(set RC:$dst, (DstVT (OpNode RC:$src1, (SrcVT VR128:$src2))))],
        itins.rr>, Sched<[WriteVecShift]>;
  def rm : PDI<opc, MRMSrcMem, (outs RC:$dst),
       (ins RC:$src1, i128mem:$src2),
       !if(Is2Addr,
           !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
           !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
       [(set RC:$dst, (DstVT (OpNode RC:$src1,
                       (bc_frag (ld_frag addr:$src2)))))], itins.rm>,
      Sched<[WriteVecShiftLd, ReadAfterLd]>;
  def ri : PDIi8<opc2, ImmForm, (outs RC:$dst),
       (ins RC:$src1, u8imm:$src2),
       !if(Is2Addr,
           !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
           !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
       [(set RC:$dst, (DstVT (OpNode2 RC:$src1, (i8 imm:$src2))))], itins.ri>,
       Sched<[WriteVecShift]>;
}

/// PDI_binop_rm2 - Simple SSE2 binary operator with different src and dst types
/// (e.g. psadbw/pmuludq, which widen their element results).
multiclass PDI_binop_rm2<bits<8> opc, string OpcodeStr, SDNode OpNode,
                         ValueType DstVT, ValueType SrcVT, RegisterClass RC,
                         PatFrag memop_frag, X86MemOperand x86memop,
                         OpndItins itins,
                         bit IsCommutable = 0, bit Is2Addr = 1> {
  let isCommutable = IsCommutable in
  def rr : PDI<opc, MRMSrcReg, (outs RC:$dst),
       (ins RC:$src1, RC:$src2),
       !if(Is2Addr,
           !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
           !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
       [(set RC:$dst, (DstVT (OpNode (SrcVT RC:$src1), RC:$src2)))]>,
       Sched<[itins.Sched]>;
  def rm : PDI<opc, MRMSrcMem, (outs RC:$dst),
       (ins RC:$src1, x86memop:$src2),
       !if(Is2Addr,
           !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
           !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
       [(set RC:$dst, (DstVT (OpNode (SrcVT RC:$src1),
                                     (bitconvert (memop_frag addr:$src2)))))]>,
       Sched<[itins.Sched.Folded, ReadAfterLd]>;
}
} // ExeDomain = SSEPackedInt
4016
// Packed integer arithmetic selected from generic DAG nodes. PDI_binop_all
// (defined earlier in this file, outside this excerpt) presumably emits the
// legacy SSE2, AVX-128 and AVX2-256 forms; the final predicate operand
// keeps the VEX forms out when the AVX-512 VLX/BWI versions apply.
defm PADDB   : PDI_binop_all<0xFC, "paddb", add, v16i8, v32i8,
                             SSE_INTALU_ITINS_P, 1, NoVLX_Or_NoBWI>;
defm PADDW   : PDI_binop_all<0xFD, "paddw", add, v8i16, v16i16,
                             SSE_INTALU_ITINS_P, 1, NoVLX_Or_NoBWI>;
defm PADDD   : PDI_binop_all<0xFE, "paddd", add, v4i32, v8i32,
                             SSE_INTALU_ITINS_P, 1, NoVLX>;
defm PADDQ   : PDI_binop_all<0xD4, "paddq", add, v2i64, v4i64,
                             SSE_INTALUQ_ITINS_P, 1, NoVLX>;
defm PMULLW  : PDI_binop_all<0xD5, "pmullw", mul, v8i16, v16i16,
                             SSE_INTMUL_ITINS_P, 1, NoVLX_Or_NoBWI>;
defm PMULHUW : PDI_binop_all<0xE4, "pmulhuw", mulhu, v8i16, v16i16,
                             SSE_INTMUL_ITINS_P, 1, NoVLX_Or_NoBWI>;
defm PMULHW  : PDI_binop_all<0xE5, "pmulhw", mulhs, v8i16, v16i16,
                             SSE_INTMUL_ITINS_P, 1, NoVLX_Or_NoBWI>;
defm PSUBB   : PDI_binop_all<0xF8, "psubb", sub, v16i8, v32i8,
                             SSE_INTALU_ITINS_P, 0, NoVLX_Or_NoBWI>;
defm PSUBW   : PDI_binop_all<0xF9, "psubw", sub, v8i16, v16i16,
                             SSE_INTALU_ITINS_P, 0, NoVLX_Or_NoBWI>;
defm PSUBD   : PDI_binop_all<0xFA, "psubd", sub, v4i32, v8i32,
                             SSE_INTALU_ITINS_P, 0, NoVLX>;
defm PSUBQ   : PDI_binop_all<0xFB, "psubq", sub, v2i64, v4i64,
                             SSE_INTALUQ_ITINS_P, 0, NoVLX>;
defm PSUBUSB : PDI_binop_all<0xD8, "psubusb", X86subus, v16i8, v32i8,
                             SSE_INTALU_ITINS_P, 0, NoVLX_Or_NoBWI>;
defm PSUBUSW : PDI_binop_all<0xD9, "psubusw", X86subus, v8i16, v16i16,
                             SSE_INTALU_ITINS_P, 0, NoVLX_Or_NoBWI>;
defm PMINUB  : PDI_binop_all<0xDA, "pminub", umin, v16i8, v32i8,
                             SSE_INTALU_ITINS_P, 1, NoVLX_Or_NoBWI>;
defm PMINSW  : PDI_binop_all<0xEA, "pminsw", smin, v8i16, v16i16,
                             SSE_INTALU_ITINS_P, 1, NoVLX_Or_NoBWI>;
defm PMAXUB  : PDI_binop_all<0xDE, "pmaxub", umax, v16i8, v32i8,
                             SSE_INTALU_ITINS_P, 1, NoVLX_Or_NoBWI>;
defm PMAXSW  : PDI_binop_all<0xEE, "pmaxsw", smax, v8i16, v16i16,
                             SSE_INTALU_ITINS_P, 1, NoVLX_Or_NoBWI>;
defm PAVGB   : PDI_binop_all<0xE0, "pavgb", X86avg, v16i8, v32i8,
                             SSE_INTALU_ITINS_P, 1, NoVLX_Or_NoBWI>;
defm PAVGW   : PDI_binop_all<0xE3, "pavgw", X86avg, v8i16, v16i16,
                             SSE_INTALU_ITINS_P, 1, NoVLX_Or_NoBWI>;
4055
// Intrinsic forms
// Saturating add/subtract and pmaddwd are only matched from their target
// intrinsics here (no generic ISD node form in this excerpt).
defm PSUBSB  : PDI_binop_all_int<0xE8, "psubsb", int_x86_sse2_psubs_b,
                                 int_x86_avx2_psubs_b, SSE_INTALU_ITINS_P, 0>;
defm PSUBSW  : PDI_binop_all_int<0xE9, "psubsw" , int_x86_sse2_psubs_w,
                                 int_x86_avx2_psubs_w, SSE_INTALU_ITINS_P, 0>;
defm PADDSB  : PDI_binop_all_int<0xEC, "paddsb" , int_x86_sse2_padds_b,
                                 int_x86_avx2_padds_b, SSE_INTALU_ITINS_P, 1>;
defm PADDSW  : PDI_binop_all_int<0xED, "paddsw" , int_x86_sse2_padds_w,
                                 int_x86_avx2_padds_w, SSE_INTALU_ITINS_P, 1>;
defm PADDUSB : PDI_binop_all_int<0xDC, "paddusb", int_x86_sse2_paddus_b,
                                 int_x86_avx2_paddus_b, SSE_INTALU_ITINS_P, 1>;
defm PADDUSW : PDI_binop_all_int<0xDD, "paddusw", int_x86_sse2_paddus_w,
                                 int_x86_avx2_paddus_w, SSE_INTALU_ITINS_P, 1>;
defm PMADDWD : PDI_binop_all_int<0xF5, "pmaddwd", int_x86_sse2_pmadd_wd,
                                 int_x86_avx2_pmadd_wd, SSE_PMADD, 1>;
4071
// psadbw produces v2i64/v4i64 sums-of-absolute-differences from byte
// inputs, so source and destination types differ (PDI_binop_rm2).
let Predicates = [HasAVX] in
defm VPSADBW : PDI_binop_rm2<0xF6, "vpsadbw", X86psadbw, v2i64, v16i8, VR128,
                             loadv2i64, i128mem, SSE_INTMUL_ITINS_P, 1, 0>,
                             VEX_4V;
let Predicates = [HasAVX2] in
defm VPSADBWY : PDI_binop_rm2<0xF6, "vpsadbw", X86psadbw, v4i64, v32i8, VR256,
                             loadv4i64, i256mem, SSE_INTMUL_ITINS_P, 1, 0>,
                             VEX_4V, VEX_L;
// Legacy SSE2 psadbw (two-address form). Use the multiply itinerary so the
// scheduling info matches the VEX-encoded VPSADBW/VPSADBWY variants above,
// which model the identical operation with SSE_INTMUL_ITINS_P (the previous
// SSE_INTALU_ITINS_P was inconsistent with them).
let Constraints = "$src1 = $dst" in
defm PSADBW : PDI_binop_rm2<0xF6, "psadbw", X86psadbw, v2i64, v16i8, VR128,
                            memopv2i64, i128mem, SSE_INTMUL_ITINS_P, 1>;
4083
// pmuludq multiplies the even 32-bit elements into full 64-bit products,
// hence the differing source/destination types (PDI_binop_rm2).
let Predicates = [HasAVX] in
defm VPMULUDQ : PDI_binop_rm2<0xF4, "vpmuludq", X86pmuludq, v2i64, v4i32, VR128,
                              loadv2i64, i128mem, SSE_INTMUL_ITINS_P, 1, 0>,
                              VEX_4V;
let Predicates = [HasAVX2] in
defm VPMULUDQY : PDI_binop_rm2<0xF4, "vpmuludq", X86pmuludq, v4i64, v8i32,
                               VR256, loadv4i64, i256mem,
                               SSE_INTMUL_ITINS_P, 1, 0>, VEX_4V, VEX_L;
let Constraints = "$src1 = $dst" in
defm PMULUDQ : PDI_binop_rm2<0xF4, "pmuludq", X86pmuludq, v2i64, v4i32, VR128,
                             memopv2i64, i128mem, SSE_INTMUL_ITINS_P, 1>;
4095
4096//===---------------------------------------------------------------------===//
4097// SSE2 - Packed Integer Logical Instructions
4098//===---------------------------------------------------------------------===//
4099
// VEX-encoded 128-bit shifts: by a count in an XMM register (rr/rm) or by
// an 8-bit immediate (ri), per PDI_binop_rmi. Arithmetic right shift of
// qwords (vpsraq) does not exist at this ISA level, so there is no VPSRAQ.
let Predicates = [HasAVX, NoVLX] in {
defm VPSLLW : PDI_binop_rmi<0xF1, 0x71, MRM6r, "vpsllw", X86vshl, X86vshli,
                            VR128, v8i16, v8i16, bc_v8i16, loadv2i64,
                            SSE_INTSHIFT_ITINS_P, 0>, VEX_4V;
defm VPSLLD : PDI_binop_rmi<0xF2, 0x72, MRM6r, "vpslld", X86vshl, X86vshli,
                            VR128, v4i32, v4i32, bc_v4i32, loadv2i64,
                            SSE_INTSHIFT_ITINS_P, 0>, VEX_4V;
defm VPSLLQ : PDI_binop_rmi<0xF3, 0x73, MRM6r, "vpsllq", X86vshl, X86vshli,
                            VR128, v2i64, v2i64, bc_v2i64, loadv2i64,
                            SSE_INTSHIFT_ITINS_P, 0>, VEX_4V;

defm VPSRLW : PDI_binop_rmi<0xD1, 0x71, MRM2r, "vpsrlw", X86vsrl, X86vsrli,
                            VR128, v8i16, v8i16, bc_v8i16, loadv2i64,
                            SSE_INTSHIFT_ITINS_P, 0>, VEX_4V;
defm VPSRLD : PDI_binop_rmi<0xD2, 0x72, MRM2r, "vpsrld", X86vsrl, X86vsrli,
                            VR128, v4i32, v4i32, bc_v4i32, loadv2i64,
                            SSE_INTSHIFT_ITINS_P, 0>, VEX_4V;
defm VPSRLQ : PDI_binop_rmi<0xD3, 0x73, MRM2r, "vpsrlq", X86vsrl, X86vsrli,
                            VR128, v2i64, v2i64, bc_v2i64, loadv2i64,
                            SSE_INTSHIFT_ITINS_P, 0>, VEX_4V;

defm VPSRAW : PDI_binop_rmi<0xE1, 0x71, MRM4r, "vpsraw", X86vsra, X86vsrai,
                            VR128, v8i16, v8i16, bc_v8i16, loadv2i64,
                            SSE_INTSHIFT_ITINS_P, 0>, VEX_4V;
defm VPSRAD : PDI_binop_rmi<0xE2, 0x72, MRM4r, "vpsrad", X86vsra, X86vsrai,
                            VR128, v4i32, v4i32, bc_v4i32, loadv2i64,
                            SSE_INTSHIFT_ITINS_P, 0>, VEX_4V;
} // Predicates = [HasAVX]
4128
// 128-bit byte-granularity logical shifts (vpslldq/vpsrldq), immediate-only
// forms selected from X86vshldq/X86vshrdq. Header reformatted to match the
// 256-bit block below (it previously read "[WriteVecShift] ," and
// "NoVLX_Or_NoBWI]in" with broken spacing).
let ExeDomain = SSEPackedInt, SchedRW = [WriteVecShift],
    Predicates = [HasAVX, NoVLX_Or_NoBWI] in {
  // 128-bit logical shifts.
  def VPSLLDQri : PDIi8<0x73, MRM7r,
                    (outs VR128:$dst), (ins VR128:$src1, u8imm:$src2),
                    "vpslldq\t{$src2, $src1, $dst|$dst, $src1, $src2}",
                    [(set VR128:$dst,
                      (v2i64 (X86vshldq VR128:$src1, (i8 imm:$src2))))]>,
                    VEX_4V;
  def VPSRLDQri : PDIi8<0x73, MRM3r,
                    (outs VR128:$dst), (ins VR128:$src1, u8imm:$src2),
                    "vpsrldq\t{$src2, $src1, $dst|$dst, $src1, $src2}",
                    [(set VR128:$dst,
                      (v2i64 (X86vshrdq VR128:$src1, (i8 imm:$src2))))]>,
                    VEX_4V;
  // PSRADQri doesn't exist in SSE[1-3].
} // Predicates = [HasAVX, NoVLX_Or_NoBWI]
4146
// AVX2 256-bit shift forms. Note the shift-count operand remains a 128-bit
// value (bc_v8i16/bc_v4i32/bc_v2i64 of loadv2i64) even though the data
// operand is VR256, per the "src2 is always 128-bit" rule in PDI_binop_rmi.
let Predicates = [HasAVX2, NoVLX] in {
defm VPSLLWY : PDI_binop_rmi<0xF1, 0x71, MRM6r, "vpsllw", X86vshl, X86vshli,
                             VR256, v16i16, v8i16, bc_v8i16, loadv2i64,
                             SSE_INTSHIFT_ITINS_P, 0>, VEX_4V, VEX_L;
defm VPSLLDY : PDI_binop_rmi<0xF2, 0x72, MRM6r, "vpslld", X86vshl, X86vshli,
                             VR256, v8i32, v4i32, bc_v4i32, loadv2i64,
                             SSE_INTSHIFT_ITINS_P, 0>, VEX_4V, VEX_L;
defm VPSLLQY : PDI_binop_rmi<0xF3, 0x73, MRM6r, "vpsllq", X86vshl, X86vshli,
                             VR256, v4i64, v2i64, bc_v2i64, loadv2i64,
                             SSE_INTSHIFT_ITINS_P, 0>, VEX_4V, VEX_L;

defm VPSRLWY : PDI_binop_rmi<0xD1, 0x71, MRM2r, "vpsrlw", X86vsrl, X86vsrli,
                             VR256, v16i16, v8i16, bc_v8i16, loadv2i64,
                             SSE_INTSHIFT_ITINS_P, 0>, VEX_4V, VEX_L;
defm VPSRLDY : PDI_binop_rmi<0xD2, 0x72, MRM2r, "vpsrld", X86vsrl, X86vsrli,
                             VR256, v8i32, v4i32, bc_v4i32, loadv2i64,
                             SSE_INTSHIFT_ITINS_P, 0>, VEX_4V, VEX_L;
defm VPSRLQY : PDI_binop_rmi<0xD3, 0x73, MRM2r, "vpsrlq", X86vsrl, X86vsrli,
                             VR256, v4i64, v2i64, bc_v2i64, loadv2i64,
                             SSE_INTSHIFT_ITINS_P, 0>, VEX_4V, VEX_L;

defm VPSRAWY : PDI_binop_rmi<0xE1, 0x71, MRM4r, "vpsraw", X86vsra, X86vsrai,
                             VR256, v16i16, v8i16, bc_v8i16, loadv2i64,
                             SSE_INTSHIFT_ITINS_P, 0>, VEX_4V, VEX_L;
defm VPSRADY : PDI_binop_rmi<0xE2, 0x72, MRM4r, "vpsrad", X86vsra, X86vsrai,
                             VR256, v8i32, v4i32, bc_v4i32, loadv2i64,
                             SSE_INTSHIFT_ITINS_P, 0>, VEX_4V, VEX_L;
}// Predicates = [HasAVX2]
4175
// 256-bit byte-granularity logical shifts (AVX2 vpslldq/vpsrldq ymm forms),
// immediate-only, selected from X86vshldq/X86vshrdq.
let ExeDomain = SSEPackedInt, SchedRW = [WriteVecShift], hasSideEffects = 0 ,
                                    Predicates = [HasAVX2, NoVLX_Or_NoBWI] in {
  // 256-bit logical shifts.
  def VPSLLDQYri : PDIi8<0x73, MRM7r,
                    (outs VR256:$dst), (ins VR256:$src1, u8imm:$src2),
                    "vpslldq\t{$src2, $src1, $dst|$dst, $src1, $src2}",
                    [(set VR256:$dst,
                      (v4i64 (X86vshldq VR256:$src1, (i8 imm:$src2))))]>,
                    VEX_4V, VEX_L;
  def VPSRLDQYri : PDIi8<0x73, MRM3r,
                    (outs VR256:$dst), (ins VR256:$src1, u8imm:$src2),
                    "vpsrldq\t{$src2, $src1, $dst|$dst, $src1, $src2}",
                    [(set VR256:$dst,
                      (v4i64 (X86vshrdq VR256:$src1, (i8 imm:$src2))))]>,
                    VEX_4V, VEX_L;
  // PSRADQYri doesn't exist in SSE[1-3].
} // Predicates = [HasAVX2, NoVLX_Or_NoBWI]
4193
// Legacy SSE2 shift forms: two-address ($src1 tied to $dst), using the
// default Is2Addr = 1 assembly strings from PDI_binop_rmi.
let Constraints = "$src1 = $dst" in {
defm PSLLW : PDI_binop_rmi<0xF1, 0x71, MRM6r, "psllw", X86vshl, X86vshli,
                           VR128, v8i16, v8i16, bc_v8i16, memopv2i64,
                           SSE_INTSHIFT_ITINS_P>;
defm PSLLD : PDI_binop_rmi<0xF2, 0x72, MRM6r, "pslld", X86vshl, X86vshli,
                           VR128, v4i32, v4i32, bc_v4i32, memopv2i64,
                           SSE_INTSHIFT_ITINS_P>;
defm PSLLQ : PDI_binop_rmi<0xF3, 0x73, MRM6r, "psllq", X86vshl, X86vshli,
                           VR128, v2i64, v2i64, bc_v2i64, memopv2i64,
                           SSE_INTSHIFT_ITINS_P>;

defm PSRLW : PDI_binop_rmi<0xD1, 0x71, MRM2r, "psrlw", X86vsrl, X86vsrli,
                           VR128, v8i16, v8i16, bc_v8i16, memopv2i64,
                           SSE_INTSHIFT_ITINS_P>;
defm PSRLD : PDI_binop_rmi<0xD2, 0x72, MRM2r, "psrld", X86vsrl, X86vsrli,
                           VR128, v4i32, v4i32, bc_v4i32, memopv2i64,
                           SSE_INTSHIFT_ITINS_P>;
defm PSRLQ : PDI_binop_rmi<0xD3, 0x73, MRM2r, "psrlq", X86vsrl, X86vsrli,
                           VR128, v2i64, v2i64, bc_v2i64, memopv2i64,
                           SSE_INTSHIFT_ITINS_P>;

defm PSRAW : PDI_binop_rmi<0xE1, 0x71, MRM4r, "psraw", X86vsra, X86vsrai,
                           VR128, v8i16, v8i16, bc_v8i16, memopv2i64,
                           SSE_INTSHIFT_ITINS_P>;
defm PSRAD : PDI_binop_rmi<0xE2, 0x72, MRM4r, "psrad", X86vsra, X86vsrai,
                           VR128, v4i32, v4i32, bc_v4i32, memopv2i64,
                           SSE_INTSHIFT_ITINS_P>;

// Byte-granularity logical shifts, immediate form only (legacy encoding
// uses the two-operand asm string).
let ExeDomain = SSEPackedInt, SchedRW = [WriteVecShift], hasSideEffects = 0 in {
  // 128-bit logical shifts.
  def PSLLDQri : PDIi8<0x73, MRM7r,
                       (outs VR128:$dst), (ins VR128:$src1, u8imm:$src2),
                       "pslldq\t{$src2, $dst|$dst, $src2}",
                       [(set VR128:$dst,
                         (v2i64 (X86vshldq VR128:$src1, (i8 imm:$src2))))],
                       IIC_SSE_INTSHDQ_P_RI>;
  def PSRLDQri : PDIi8<0x73, MRM3r,
                       (outs VR128:$dst), (ins VR128:$src1, u8imm:$src2),
                       "psrldq\t{$src2, $dst|$dst, $src2}",
                       [(set VR128:$dst,
                         (v2i64 (X86vshrdq VR128:$src1, (i8 imm:$src2))))],
                       IIC_SSE_INTSHDQ_P_RI>;
  // PSRADQri doesn't exist in SSE[1-3].
}
} // Constraints = "$src1 = $dst"
4239
4240//===---------------------------------------------------------------------===//
4241// SSE2 - Packed Integer Comparison Instructions
4242//===---------------------------------------------------------------------===//
4243
// Packed integer compares: pcmpeq* (equality — commutable) and pcmpgt*
// (signed greater-than — not commutable).
defm PCMPEQB : PDI_binop_all<0x74, "pcmpeqb", X86pcmpeq, v16i8, v32i8,
                             SSE_INTALU_ITINS_P, 1, NoVLX_Or_NoBWI>;
defm PCMPEQW : PDI_binop_all<0x75, "pcmpeqw", X86pcmpeq, v8i16, v16i16,
                             SSE_INTALU_ITINS_P, 1, NoVLX_Or_NoBWI>;
defm PCMPEQD : PDI_binop_all<0x76, "pcmpeqd", X86pcmpeq, v4i32, v8i32,
                             SSE_INTALU_ITINS_P, 1, NoVLX>;
defm PCMPGTB : PDI_binop_all<0x64, "pcmpgtb", X86pcmpgt, v16i8, v32i8,
                             SSE_INTALU_ITINS_P, 0, NoVLX_Or_NoBWI>;
defm PCMPGTW : PDI_binop_all<0x65, "pcmpgtw", X86pcmpgt, v8i16, v16i16,
                             SSE_INTALU_ITINS_P, 0, NoVLX_Or_NoBWI>;
defm PCMPGTD : PDI_binop_all<0x66, "pcmpgtd", X86pcmpgt, v4i32, v8i32,
                             SSE_INTALU_ITINS_P, 0, NoVLX>;
4256
4257//===---------------------------------------------------------------------===//
4258// SSE2 - Packed Integer Shuffle Instructions
4259//===---------------------------------------------------------------------===//
4260
let ExeDomain = SSEPackedInt in {
/// sse2_pshuffle - pshufd/pshufhw/pshuflw-style shuffles controlled by an
/// 8-bit immediate. Instantiates AVX xmm (Vri/Vmi), AVX2 ymm (VYri/VYmi)
/// and legacy SSE2 (ri/mi) forms, each with register and load variants.
multiclass sse2_pshuffle<string OpcodeStr, ValueType vt128, ValueType vt256,
                         SDNode OpNode> {
let Predicates = [HasAVX] in {
  def V#NAME#ri : Ii8<0x70, MRMSrcReg, (outs VR128:$dst),
                      (ins VR128:$src1, u8imm:$src2),
                      !strconcat("v", OpcodeStr,
                                 "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
                      [(set VR128:$dst,
                        (vt128 (OpNode VR128:$src1, (i8 imm:$src2))))],
                      IIC_SSE_PSHUF_RI>, VEX, Sched<[WriteShuffle]>;
  def V#NAME#mi : Ii8<0x70, MRMSrcMem, (outs VR128:$dst),
                      (ins i128mem:$src1, u8imm:$src2),
                      !strconcat("v", OpcodeStr,
                                 "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
                     [(set VR128:$dst,
                       (vt128 (OpNode (bitconvert (loadv2i64 addr:$src1)),
                        (i8 imm:$src2))))], IIC_SSE_PSHUF_MI>, VEX,
                  Sched<[WriteShuffleLd]>;
}

let Predicates = [HasAVX2] in {
  def V#NAME#Yri : Ii8<0x70, MRMSrcReg, (outs VR256:$dst),
                       (ins VR256:$src1, u8imm:$src2),
                       !strconcat("v", OpcodeStr,
                                  "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
                       [(set VR256:$dst,
                         (vt256 (OpNode VR256:$src1, (i8 imm:$src2))))],
                       IIC_SSE_PSHUF_RI>, VEX, VEX_L, Sched<[WriteShuffle]>;
  def V#NAME#Ymi : Ii8<0x70, MRMSrcMem, (outs VR256:$dst),
                       (ins i256mem:$src1, u8imm:$src2),
                       !strconcat("v", OpcodeStr,
                                  "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
                      [(set VR256:$dst,
                        (vt256 (OpNode (bitconvert (loadv4i64 addr:$src1)),
                         (i8 imm:$src2))))], IIC_SSE_PSHUF_MI>, VEX, VEX_L,
                   Sched<[WriteShuffleLd]>;
}

let Predicates = [UseSSE2] in {
  def ri : Ii8<0x70, MRMSrcReg,
               (outs VR128:$dst), (ins VR128:$src1, u8imm:$src2),
               !strconcat(OpcodeStr,
                          "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
                [(set VR128:$dst,
                  (vt128 (OpNode VR128:$src1, (i8 imm:$src2))))],
                IIC_SSE_PSHUF_RI>, Sched<[WriteShuffle]>;
  def mi : Ii8<0x70, MRMSrcMem,
               (outs VR128:$dst), (ins i128mem:$src1, u8imm:$src2),
               !strconcat(OpcodeStr,
                          "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
                [(set VR128:$dst,
                  (vt128 (OpNode (bitconvert (memopv2i64 addr:$src1)),
                          (i8 imm:$src2))))], IIC_SSE_PSHUF_MI>,
           Sched<[WriteShuffleLd, ReadAfterLd]>;
}
}
} // ExeDomain = SSEPackedInt
4319
// pshufd shuffles dwords (PD prefix); pshufhw/pshuflw shuffle the high/low
// four words (XS/XD prefixes), hence the v8i16/v16i16 element types.
defm PSHUFD  : sse2_pshuffle<"pshufd", v4i32, v8i32, X86PShufd>, PD;
defm PSHUFHW : sse2_pshuffle<"pshufhw", v8i16, v16i16, X86PShufhw>, XS;
defm PSHUFLW : sse2_pshuffle<"pshuflw", v8i16, v16i16, X86PShuflw>, XD;
4323
// Match X86PShufd on v4f32 with the integer PSHUFD instruction, so a float
// shuffle does not need an explicit bitcast pattern. AVX forms fold an
// unaligned load (loadv4f32); the SSE2 forms below use the memop fragment.
let Predicates = [HasAVX] in {
  def : Pat<(v4f32 (X86PShufd (loadv4f32 addr:$src1), (i8 imm:$imm))),
            (VPSHUFDmi addr:$src1, imm:$imm)>;
  def : Pat<(v4f32 (X86PShufd VR128:$src1, (i8 imm:$imm))),
            (VPSHUFDri VR128:$src1, imm:$imm)>;
}

// Non-VEX equivalents; memopv4f32 is the SSE load-folding fragment.
let Predicates = [UseSSE2] in {
  def : Pat<(v4f32 (X86PShufd (memopv4f32 addr:$src1), (i8 imm:$imm))),
            (PSHUFDmi addr:$src1, imm:$imm)>;
  def : Pat<(v4f32 (X86PShufd VR128:$src1, (i8 imm:$imm))),
            (PSHUFDri VR128:$src1, imm:$imm)>;
}
4337
4338//===---------------------------------------------------------------------===//
4339// Packed Integer Pack Instructions (SSE & AVX)
4340//===---------------------------------------------------------------------===//
4341
4342let ExeDomain = SSEPackedInt in {
// 128-bit integer pack (packsswb/packssdw/packuswb): OpNode (X86Packss or
// X86Packus at instantiation) narrows two ArgVT sources into one OutVT
// result. bc_frag bitconverts the loaded v2i64 to ArgVT; ld_frag is
// loadv2i64 for the AVX instantiations and memopv2i64 for SSE.
// Is2Addr selects the two-operand SSE asm string vs. three-operand AVX.
multiclass sse2_pack<bits<8> opc, string OpcodeStr, ValueType OutVT,
                     ValueType ArgVT, SDNode OpNode, PatFrag bc_frag,
                     PatFrag ld_frag, bit Is2Addr = 1> {
  // Register-register form.
  def rr : PDI<opc, MRMSrcReg,
               (outs VR128:$dst), (ins VR128:$src1, VR128:$src2),
               !if(Is2Addr,
                   !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
                   !strconcat(OpcodeStr,
                              "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
               [(set VR128:$dst,
                     (OutVT (OpNode (ArgVT VR128:$src1), VR128:$src2)))]>,
               Sched<[WriteShuffle]>;
  // Register-memory form: the second operand's load folds into the op.
  def rm : PDI<opc, MRMSrcMem,
               (outs VR128:$dst), (ins VR128:$src1, i128mem:$src2),
               !if(Is2Addr,
                   !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
                   !strconcat(OpcodeStr,
                              "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
               [(set VR128:$dst,
                     (OutVT (OpNode VR128:$src1,
                                    (bc_frag (ld_frag addr:$src2)))))]>,
               Sched<[WriteShuffleLd, ReadAfterLd]>;
}
4366
// 256-bit (AVX2) counterpart of sse2_pack: always three-operand VEX asm,
// and the memory form always loads via loadv4i64 before bc_frag converts
// to ArgVT.
multiclass sse2_pack_y<bits<8> opc, string OpcodeStr, ValueType OutVT,
                       ValueType ArgVT, SDNode OpNode, PatFrag bc_frag> {
  // Register-register form.
  def Yrr : PDI<opc, MRMSrcReg,
                (outs VR256:$dst), (ins VR256:$src1, VR256:$src2),
                !strconcat(OpcodeStr,
                           "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
                [(set VR256:$dst,
                      (OutVT (OpNode (ArgVT VR256:$src1), VR256:$src2)))]>,
                Sched<[WriteShuffle]>;
  // Register-memory form.
  def Yrm : PDI<opc, MRMSrcMem,
                (outs VR256:$dst), (ins VR256:$src1, i256mem:$src2),
                !strconcat(OpcodeStr,
                           "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
                [(set VR256:$dst,
                      (OutVT (OpNode VR256:$src1,
                                     (bc_frag (loadv4i64 addr:$src2)))))]>,
                Sched<[WriteShuffleLd, ReadAfterLd]>;
}
4385
// Same structure as sse2_pack but built on the SS48I (SSE4.1 encoding)
// instruction class — used for packusdw, which is an SSE4.1 instruction.
multiclass sse4_pack<bits<8> opc, string OpcodeStr, ValueType OutVT,
                     ValueType ArgVT, SDNode OpNode, PatFrag bc_frag,
                     PatFrag ld_frag, bit Is2Addr = 1> {
  // Register-register form.
  def rr : SS48I<opc, MRMSrcReg,
                 (outs VR128:$dst), (ins VR128:$src1, VR128:$src2),
                 !if(Is2Addr,
                     !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
                     !strconcat(OpcodeStr,
                                "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
                 [(set VR128:$dst,
                       (OutVT (OpNode (ArgVT VR128:$src1), VR128:$src2)))]>,
                 Sched<[WriteShuffle]>;
  // Register-memory form.
  def rm : SS48I<opc, MRMSrcMem,
                 (outs VR128:$dst), (ins VR128:$src1, i128mem:$src2),
                 !if(Is2Addr,
                     !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
                     !strconcat(OpcodeStr,
                                "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
                 [(set VR128:$dst,
                       (OutVT (OpNode VR128:$src1,
                                      (bc_frag (ld_frag addr:$src2)))))]>,
                 Sched<[WriteShuffleLd, ReadAfterLd]>;
}
4409
// 256-bit (AVX2) counterpart of sse4_pack — used for vpackusdw.
multiclass sse4_pack_y<bits<8> opc, string OpcodeStr, ValueType OutVT,
                     ValueType ArgVT, SDNode OpNode, PatFrag bc_frag> {
  // Register-register form.
  def Yrr : SS48I<opc, MRMSrcReg,
                  (outs VR256:$dst), (ins VR256:$src1, VR256:$src2),
                  !strconcat(OpcodeStr,
                             "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
                  [(set VR256:$dst,
                        (OutVT (OpNode (ArgVT VR256:$src1), VR256:$src2)))]>,
                  Sched<[WriteShuffle]>;
  // Register-memory form.
  def Yrm : SS48I<opc, MRMSrcMem,
                  (outs VR256:$dst), (ins VR256:$src1, i256mem:$src2),
                  !strconcat(OpcodeStr,
                             "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
                  [(set VR256:$dst,
                        (OutVT (OpNode VR256:$src1,
                                       (bc_frag (loadv4i64 addr:$src2)))))]>,
                  Sched<[WriteShuffleLd, ReadAfterLd]>;
}
4428
// VEX-encoded 128-bit forms (Is2Addr = 0 selects the three-operand asm).
let Predicates = [HasAVX] in {
  defm VPACKSSWB : sse2_pack<0x63, "vpacksswb", v16i8, v8i16, X86Packss,
                             bc_v8i16, loadv2i64, 0>, VEX_4V;
  defm VPACKSSDW : sse2_pack<0x6B, "vpackssdw", v8i16, v4i32, X86Packss,
                             bc_v4i32, loadv2i64, 0>, VEX_4V;

  defm VPACKUSWB : sse2_pack<0x67, "vpackuswb", v16i8, v8i16, X86Packus,
                             bc_v8i16, loadv2i64, 0>, VEX_4V;
  defm VPACKUSDW : sse4_pack<0x2B, "vpackusdw", v8i16, v4i32, X86Packus,
                             bc_v4i32, loadv2i64, 0>, VEX_4V;
}

// 256-bit AVX2 forms.
let Predicates = [HasAVX2] in {
  defm VPACKSSWB : sse2_pack_y<0x63, "vpacksswb", v32i8, v16i16, X86Packss,
                               bc_v16i16>, VEX_4V, VEX_L;
  defm VPACKSSDW : sse2_pack_y<0x6B, "vpackssdw", v16i16, v8i32, X86Packss,
                               bc_v8i32>, VEX_4V, VEX_L;

  defm VPACKUSWB : sse2_pack_y<0x67, "vpackuswb", v32i8, v16i16, X86Packus,
                               bc_v16i16>, VEX_4V, VEX_L;
  defm VPACKUSDW : sse4_pack_y<0x2B, "vpackusdw", v16i16, v8i32, X86Packus,
                               bc_v8i32>, VEX_4V, VEX_L;
}
4452
// Legacy (non-VEX) forms: SSE pack instructions update $src1 in place.
let Constraints = "$src1 = $dst" in {
  defm PACKSSWB : sse2_pack<0x63, "packsswb", v16i8, v8i16, X86Packss,
                            bc_v8i16, memopv2i64>;
  defm PACKSSDW : sse2_pack<0x6B, "packssdw", v8i16, v4i32, X86Packss,
                            bc_v4i32, memopv2i64>;

  defm PACKUSWB : sse2_pack<0x67, "packuswb", v16i8, v8i16, X86Packus,
                            bc_v8i16, memopv2i64>;

  // PACKUSDW is SSE4.1. Gate it on UseSSE41 (SSE4.1 without AVX), not
  // HasSSE41: HasSSE41 remains true on AVX targets, which would leave the
  // legacy-encoded patterns live alongside VPACKUSDW. This matches the
  // UseSSE2 predicate used for the other non-VEX instructions in this file.
  let Predicates = [UseSSE41] in
  defm PACKUSDW : sse4_pack<0x2B, "packusdw", v8i16, v4i32, X86Packus,
                            bc_v4i32, memopv2i64>;
}
4466} // ExeDomain = SSEPackedInt
4467
4468//===---------------------------------------------------------------------===//
4469// SSE2 - Packed Integer Unpack Instructions
4470//===---------------------------------------------------------------------===//
4471
4472let ExeDomain = SSEPackedInt in {
// 128-bit integer unpack/interleave (punpckl*/punpckh*): OpNode is
// X86Unpckl or X86Unpckh at instantiation; vt is the element-typed vector.
// bc_frag bitconverts the loaded v2i64; ld_frag is loadv2i64 (AVX) or
// memopv2i64 (SSE). Is2Addr picks the two- vs three-operand asm string.
multiclass sse2_unpack<bits<8> opc, string OpcodeStr, ValueType vt,
                       SDNode OpNode, PatFrag bc_frag, PatFrag ld_frag,
                       bit Is2Addr = 1> {
  // Register-register form.
  def rr : PDI<opc, MRMSrcReg,
      (outs VR128:$dst), (ins VR128:$src1, VR128:$src2),
      !if(Is2Addr,
          !strconcat(OpcodeStr,"\t{$src2, $dst|$dst, $src2}"),
          !strconcat(OpcodeStr,"\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
      [(set VR128:$dst, (vt (OpNode VR128:$src1, VR128:$src2)))],
      IIC_SSE_UNPCK>, Sched<[WriteShuffle]>;
  // Register-memory form: folds the second operand's load.
  def rm : PDI<opc, MRMSrcMem,
      (outs VR128:$dst), (ins VR128:$src1, i128mem:$src2),
      !if(Is2Addr,
          !strconcat(OpcodeStr,"\t{$src2, $dst|$dst, $src2}"),
          !strconcat(OpcodeStr,"\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
      [(set VR128:$dst, (OpNode VR128:$src1,
                                  (bc_frag (ld_frag addr:$src2))))],
                                               IIC_SSE_UNPCK>,
      Sched<[WriteShuffleLd, ReadAfterLd]>;
}
4493
// 256-bit (AVX2) counterpart of sse2_unpack: three-operand VEX asm only;
// the memory form loads via loadv4i64 before bc_frag converts to vt.
multiclass sse2_unpack_y<bits<8> opc, string OpcodeStr, ValueType vt,
                         SDNode OpNode, PatFrag bc_frag> {
  // Register-register form.
  def Yrr : PDI<opc, MRMSrcReg,
      (outs VR256:$dst), (ins VR256:$src1, VR256:$src2),
      !strconcat(OpcodeStr,"\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
      [(set VR256:$dst, (vt (OpNode VR256:$src1, VR256:$src2)))]>,
      Sched<[WriteShuffle]>;
  // Register-memory form.
  def Yrm : PDI<opc, MRMSrcMem,
      (outs VR256:$dst), (ins VR256:$src1, i256mem:$src2),
      !strconcat(OpcodeStr,"\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
      [(set VR256:$dst, (OpNode VR256:$src1,
                                  (bc_frag (loadv4i64 addr:$src2))))]>,
      Sched<[WriteShuffleLd, ReadAfterLd]>;
}
4508
4509
// VEX-encoded unpacks. Byte/word-element forms use NoVLX_Or_NoBWI while
// dword/qword forms use NoVLX — presumably so the corresponding EVEX
// (AVX-512 VL/BW) definitions take over when available; confirm against
// the predicate definitions in X86InstrInfo.td.
let Predicates = [HasAVX, NoVLX_Or_NoBWI] in {
  defm VPUNPCKLBW  : sse2_unpack<0x60, "vpunpcklbw", v16i8, X86Unpckl,
                                 bc_v16i8, loadv2i64, 0>, VEX_4V;
  defm VPUNPCKLWD  : sse2_unpack<0x61, "vpunpcklwd", v8i16, X86Unpckl,
                                 bc_v8i16, loadv2i64, 0>, VEX_4V;
  defm VPUNPCKHBW  : sse2_unpack<0x68, "vpunpckhbw", v16i8, X86Unpckh,
                                 bc_v16i8, loadv2i64, 0>, VEX_4V;
  defm VPUNPCKHWD  : sse2_unpack<0x69, "vpunpckhwd", v8i16, X86Unpckh,
                                 bc_v8i16, loadv2i64, 0>, VEX_4V;
}
let Predicates = [HasAVX, NoVLX] in {
  defm VPUNPCKLDQ  : sse2_unpack<0x62, "vpunpckldq", v4i32, X86Unpckl,
                                 bc_v4i32, loadv2i64, 0>, VEX_4V;
  defm VPUNPCKLQDQ : sse2_unpack<0x6C, "vpunpcklqdq", v2i64, X86Unpckl,
                                 bc_v2i64, loadv2i64, 0>, VEX_4V;
  defm VPUNPCKHDQ  : sse2_unpack<0x6A, "vpunpckhdq", v4i32, X86Unpckh,
                                 bc_v4i32, loadv2i64, 0>, VEX_4V;
  defm VPUNPCKHQDQ : sse2_unpack<0x6D, "vpunpckhqdq", v2i64, X86Unpckh,
                                 bc_v2i64, loadv2i64, 0>, VEX_4V;
}

// 256-bit AVX2 forms, gated the same way as the 128-bit VEX forms above.
let Predicates = [HasAVX2, NoVLX_Or_NoBWI] in {
  defm VPUNPCKLBW  : sse2_unpack_y<0x60, "vpunpcklbw", v32i8, X86Unpckl,
                                   bc_v32i8>, VEX_4V, VEX_L;
  defm VPUNPCKLWD  : sse2_unpack_y<0x61, "vpunpcklwd", v16i16, X86Unpckl,
                                   bc_v16i16>, VEX_4V, VEX_L;
  defm VPUNPCKHBW  : sse2_unpack_y<0x68, "vpunpckhbw", v32i8, X86Unpckh,
                                   bc_v32i8>, VEX_4V, VEX_L;
  defm VPUNPCKHWD  : sse2_unpack_y<0x69, "vpunpckhwd", v16i16, X86Unpckh,
                                   bc_v16i16>, VEX_4V, VEX_L;
}
let Predicates = [HasAVX2, NoVLX] in {
  defm VPUNPCKLDQ  : sse2_unpack_y<0x62, "vpunpckldq", v8i32, X86Unpckl,
                                   bc_v8i32>, VEX_4V, VEX_L;
  defm VPUNPCKLQDQ : sse2_unpack_y<0x6C, "vpunpcklqdq", v4i64, X86Unpckl,
                                   bc_v4i64>, VEX_4V, VEX_L;
  defm VPUNPCKHDQ  : sse2_unpack_y<0x6A, "vpunpckhdq", v8i32, X86Unpckh,
                                   bc_v8i32>, VEX_4V, VEX_L;
  defm VPUNPCKHQDQ : sse2_unpack_y<0x6D, "vpunpckhqdq", v4i64, X86Unpckh,
                                   bc_v4i64>, VEX_4V, VEX_L;
}
4551
// Legacy (non-VEX) unpacks: two-operand encoding, $src1 updated in place;
// memory operands use the memopv2i64 (SSE load-folding) fragment.
let Constraints = "$src1 = $dst" in {
  defm PUNPCKLBW  : sse2_unpack<0x60, "punpcklbw", v16i8, X86Unpckl,
                                bc_v16i8, memopv2i64>;
  defm PUNPCKLWD  : sse2_unpack<0x61, "punpcklwd", v8i16, X86Unpckl,
                                bc_v8i16, memopv2i64>;
  defm PUNPCKLDQ  : sse2_unpack<0x62, "punpckldq", v4i32, X86Unpckl,
                                bc_v4i32, memopv2i64>;
  defm PUNPCKLQDQ : sse2_unpack<0x6C, "punpcklqdq", v2i64, X86Unpckl,
                                bc_v2i64, memopv2i64>;

  defm PUNPCKHBW  : sse2_unpack<0x68, "punpckhbw", v16i8, X86Unpckh,
                                bc_v16i8, memopv2i64>;
  defm PUNPCKHWD  : sse2_unpack<0x69, "punpckhwd", v8i16, X86Unpckh,
                                bc_v8i16, memopv2i64>;
  defm PUNPCKHDQ  : sse2_unpack<0x6A, "punpckhdq", v4i32, X86Unpckh,
                                bc_v4i32, memopv2i64>;
  defm PUNPCKHQDQ : sse2_unpack<0x6D, "punpckhqdq", v2i64, X86Unpckh,
                                bc_v2i64, memopv2i64>;
}
4571} // ExeDomain = SSEPackedInt
4572
4573//===---------------------------------------------------------------------===//
4574// SSE2 - Packed Integer Extract and Insert
4575//===---------------------------------------------------------------------===//
4576
4577let ExeDomain = SSEPackedInt in {
// pinsrw/vpinsrw: insert a 16-bit value (from a GPR or an i16 load) into
// the word element of VR128 selected by imm $src3. Is2Addr selects the
// two-operand "pinsrw" vs three-operand "vpinsrw" asm string.
multiclass sse2_pinsrw<bit Is2Addr = 1> {
  // Insert from a general-purpose register.
  def rri : Ii8<0xC4, MRMSrcReg,
       (outs VR128:$dst), (ins VR128:$src1,
        GR32orGR64:$src2, u8imm:$src3),
       !if(Is2Addr,
           "pinsrw\t{$src3, $src2, $dst|$dst, $src2, $src3}",
           "vpinsrw\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
       [(set VR128:$dst,
         (X86pinsrw VR128:$src1, GR32orGR64:$src2, imm:$src3))],
       IIC_SSE_PINSRW>, Sched<[WriteShuffle]>;
  // Insert from a 16-bit memory location (anyext load).
  def rmi : Ii8<0xC4, MRMSrcMem,
                       (outs VR128:$dst), (ins VR128:$src1,
                        i16mem:$src2, u8imm:$src3),
       !if(Is2Addr,
           "pinsrw\t{$src3, $src2, $dst|$dst, $src2, $src3}",
           "vpinsrw\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
       [(set VR128:$dst,
         (X86pinsrw VR128:$src1, (extloadi16 addr:$src2),
                    imm:$src3))], IIC_SSE_PINSRW>,
       Sched<[WriteShuffleLd, ReadAfterLd]>;
}
4599
// Extract: pextrw reads the word element of VR128 selected by imm $src2
// into a GPR. The VEX form is gated on NoBWI (AVX-512 BW provides an
// EVEX-encoded replacement).
let Predicates = [HasAVX, NoBWI] in
def VPEXTRWri : Ii8<0xC5, MRMSrcReg,
                    (outs GR32orGR64:$dst), (ins VR128:$src1, u8imm:$src2),
                    "vpextrw\t{$src2, $src1, $dst|$dst, $src1, $src2}",
                    [(set GR32orGR64:$dst, (X86pextrw (v8i16 VR128:$src1),
                                            imm:$src2))]>, PD, VEX,
                Sched<[WriteShuffle]>;
def PEXTRWri : PDIi8<0xC5, MRMSrcReg,
                    (outs GR32orGR64:$dst), (ins VR128:$src1, u8imm:$src2),
                    "pextrw\t{$src2, $src1, $dst|$dst, $src1, $src2}",
                    [(set GR32orGR64:$dst, (X86pextrw (v8i16 VR128:$src1),
                                            imm:$src2))], IIC_SSE_PEXTRW>,
               Sched<[WriteShuffleLd, ReadAfterLd]>;

// Insert: instantiate sse2_pinsrw for VEX (three-operand, Is2Addr = 0)
// and legacy SSE2 (two-operand, tied $src1 = $dst) encodings.
let Predicates = [HasAVX, NoBWI] in
defm VPINSRW : sse2_pinsrw<0>, PD, VEX_4V;

let Predicates = [UseSSE2], Constraints = "$src1 = $dst" in
defm PINSRW : sse2_pinsrw, PD;
4621
4622} // ExeDomain = SSEPackedInt
4623
4624//===---------------------------------------------------------------------===//
4625// SSE2 - Packed Mask Creation
4626//===---------------------------------------------------------------------===//
4627
// pmovmskb: gather the sign bits of each byte element into a GPR mask.
// Matched via intrinsics (int_x86_sse2_pmovmskb_128 / int_x86_avx2_pmovmskb).
let ExeDomain = SSEPackedInt, SchedRW = [WriteVecLogic] in {

// VEX-encoded 128-bit form.
def VPMOVMSKBrr  : VPDI<0xD7, MRMSrcReg, (outs GR32orGR64:$dst),
           (ins VR128:$src),
           "pmovmskb\t{$src, $dst|$dst, $src}",
           [(set GR32orGR64:$dst, (int_x86_sse2_pmovmskb_128 VR128:$src))],
           IIC_SSE_MOVMSK>, VEX;

// 256-bit AVX2 form.
let Predicates = [HasAVX2] in {
def VPMOVMSKBYrr  : VPDI<0xD7, MRMSrcReg, (outs GR32orGR64:$dst),
           (ins VR256:$src),
           "pmovmskb\t{$src, $dst|$dst, $src}",
           [(set GR32orGR64:$dst, (int_x86_avx2_pmovmskb VR256:$src))]>,
           VEX, VEX_L;
}

// Legacy SSE2 form.
def PMOVMSKBrr : PDI<0xD7, MRMSrcReg, (outs GR32orGR64:$dst), (ins VR128:$src),
           "pmovmskb\t{$src, $dst|$dst, $src}",
           [(set GR32orGR64:$dst, (int_x86_sse2_pmovmskb_128 VR128:$src))],
           IIC_SSE_MOVMSK>;

} // ExeDomain = SSEPackedInt
4650
4651//===---------------------------------------------------------------------===//
4652// SSE2 - Conditional Store
4653//===---------------------------------------------------------------------===//
4654
// maskmovdqu: byte-masked store of $src to the address implicitly held in
// EDI (32-bit mode) / RDI (64-bit mode) — hence the Uses lists and the
// separate Not64BitMode/In64BitMode definitions for each encoding.
let ExeDomain = SSEPackedInt, SchedRW = [WriteStore] in {

let Uses = [EDI], Predicates = [HasAVX,Not64BitMode] in
def VMASKMOVDQU : VPDI<0xF7, MRMSrcReg, (outs),
           (ins VR128:$src, VR128:$mask),
           "maskmovdqu\t{$mask, $src|$src, $mask}",
           [(int_x86_sse2_maskmov_dqu VR128:$src, VR128:$mask, EDI)],
           IIC_SSE_MASKMOV>, VEX;
let Uses = [RDI], Predicates = [HasAVX,In64BitMode] in
def VMASKMOVDQU64 : VPDI<0xF7, MRMSrcReg, (outs),
           (ins VR128:$src, VR128:$mask),
           "maskmovdqu\t{$mask, $src|$src, $mask}",
           [(int_x86_sse2_maskmov_dqu VR128:$src, VR128:$mask, RDI)],
           IIC_SSE_MASKMOV>, VEX;

let Uses = [EDI], Predicates = [UseSSE2,Not64BitMode] in
def MASKMOVDQU : PDI<0xF7, MRMSrcReg, (outs), (ins VR128:$src, VR128:$mask),
           "maskmovdqu\t{$mask, $src|$src, $mask}",
           [(int_x86_sse2_maskmov_dqu VR128:$src, VR128:$mask, EDI)],
           IIC_SSE_MASKMOV>;
let Uses = [RDI], Predicates = [UseSSE2,In64BitMode] in
def MASKMOVDQU64 : PDI<0xF7, MRMSrcReg, (outs), (ins VR128:$src, VR128:$mask),
           "maskmovdqu\t{$mask, $src|$src, $mask}",
           [(int_x86_sse2_maskmov_dqu VR128:$src, VR128:$mask, RDI)],
           IIC_SSE_MASKMOV>;

} // ExeDomain = SSEPackedInt
4682
4683//===---------------------------------------------------------------------===//
4684// SSE2 - Move Doubleword/Quadword
4685//===---------------------------------------------------------------------===//
4686
4687//===---------------------------------------------------------------------===//
4688// Move Int Doubleword to Packed Double Int
4689//
// VEX-encoded GPR/memory -> XMM moves (opcode 0x6E). The 32-bit forms
// match scalar_to_vector of a GR32/loadi32; the REX.W forms (VRS2I) move
// a 64-bit value into the low qword.
def VMOVDI2PDIrr : VS2I<0x6E, MRMSrcReg, (outs VR128:$dst), (ins GR32:$src),
                      "movd\t{$src, $dst|$dst, $src}",
                      [(set VR128:$dst,
                        (v4i32 (scalar_to_vector GR32:$src)))], IIC_SSE_MOVDQ>,
                        VEX, Sched<[WriteMove]>;
def VMOVDI2PDIrm : VS2I<0x6E, MRMSrcMem, (outs VR128:$dst), (ins i32mem:$src),
                      "movd\t{$src, $dst|$dst, $src}",
                      [(set VR128:$dst,
                        (v4i32 (scalar_to_vector (loadi32 addr:$src))))],
                        IIC_SSE_MOVDQ>,
                      VEX, Sched<[WriteLoad]>;
def VMOV64toPQIrr : VRS2I<0x6E, MRMSrcReg, (outs VR128:$dst), (ins GR64:$src),
                        "movq\t{$src, $dst|$dst, $src}",
                        [(set VR128:$dst,
                          (v2i64 (scalar_to_vector GR64:$src)))],
                          IIC_SSE_MOVDQ>, VEX, Sched<[WriteMove]>;
// Disassembler-only memory form: no patterns, flagged mayLoad.
let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0, mayLoad = 1 in
def VMOV64toPQIrm : VRS2I<0x6E, MRMSrcMem, (outs VR128:$dst), (ins i64mem:$src),
                        "movq\t{$src, $dst|$dst, $src}",
                        [], IIC_SSE_MOVDQ>, VEX, Sched<[WriteLoad]>;
// GR64 -> FR64 bitcast move (codegen-only pseudo-variant).
let isCodeGenOnly = 1 in
def VMOV64toSDrr : VRS2I<0x6E, MRMSrcReg, (outs FR64:$dst), (ins GR64:$src),
                       "movq\t{$src, $dst|$dst, $src}",
                       [(set FR64:$dst, (bitconvert GR64:$src))],
                       IIC_SSE_MOVDQ>, VEX, Sched<[WriteMove]>;
4715
// Legacy (non-VEX) counterparts of the moves above. The 64-bit asm
// strings use "mov{d|q}": AT&T prints movd, Intel prints movq.
def MOVDI2PDIrr : S2I<0x6E, MRMSrcReg, (outs VR128:$dst), (ins GR32:$src),
                      "movd\t{$src, $dst|$dst, $src}",
                      [(set VR128:$dst,
                        (v4i32 (scalar_to_vector GR32:$src)))], IIC_SSE_MOVDQ>,
                  Sched<[WriteMove]>;
def MOVDI2PDIrm : S2I<0x6E, MRMSrcMem, (outs VR128:$dst), (ins i32mem:$src),
                      "movd\t{$src, $dst|$dst, $src}",
                      [(set VR128:$dst,
                        (v4i32 (scalar_to_vector (loadi32 addr:$src))))],
                        IIC_SSE_MOVDQ>, Sched<[WriteLoad]>;
def MOV64toPQIrr : RS2I<0x6E, MRMSrcReg, (outs VR128:$dst), (ins GR64:$src),
                        "mov{d|q}\t{$src, $dst|$dst, $src}",
                        [(set VR128:$dst,
                          (v2i64 (scalar_to_vector GR64:$src)))],
                          IIC_SSE_MOVDQ>, Sched<[WriteMove]>;
// Disassembler-only memory form: no patterns, flagged mayLoad.
let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0, mayLoad = 1 in
def MOV64toPQIrm : RS2I<0x6E, MRMSrcMem, (outs VR128:$dst), (ins i64mem:$src),
                        "mov{d|q}\t{$src, $dst|$dst, $src}",
                        [], IIC_SSE_MOVDQ>, Sched<[WriteLoad]>;
// GR64 -> FR64 bitcast move (codegen-only).
let isCodeGenOnly = 1 in
def MOV64toSDrr : RS2I<0x6E, MRMSrcReg, (outs FR64:$dst), (ins GR64:$src),
                       "mov{d|q}\t{$src, $dst|$dst, $src}",
                       [(set FR64:$dst, (bitconvert GR64:$src))],
                       IIC_SSE_MOVDQ>, Sched<[WriteMove]>;
4740
4741//===---------------------------------------------------------------------===//
4742// Move Int Doubleword to Single Scalar
4743//
// GR32/i32 -> FR32 bitcast moves. Codegen-only: these reuse the movd
// encoding but target the FR32 register class so the compiler can match
// (bitconvert GR32) directly.
let isCodeGenOnly = 1 in {
  def VMOVDI2SSrr  : VS2I<0x6E, MRMSrcReg, (outs FR32:$dst), (ins GR32:$src),
                        "movd\t{$src, $dst|$dst, $src}",
                        [(set FR32:$dst, (bitconvert GR32:$src))],
                        IIC_SSE_MOVDQ>, VEX, Sched<[WriteMove]>;

  def VMOVDI2SSrm  : VS2I<0x6E, MRMSrcMem, (outs FR32:$dst), (ins i32mem:$src),
                        "movd\t{$src, $dst|$dst, $src}",
                        [(set FR32:$dst, (bitconvert (loadi32 addr:$src)))],
                        IIC_SSE_MOVDQ>,
                        VEX, Sched<[WriteLoad]>;
  def MOVDI2SSrr  : S2I<0x6E, MRMSrcReg, (outs FR32:$dst), (ins GR32:$src),
                        "movd\t{$src, $dst|$dst, $src}",
                        [(set FR32:$dst, (bitconvert GR32:$src))],
                        IIC_SSE_MOVDQ>, Sched<[WriteMove]>;

  def MOVDI2SSrm  : S2I<0x6E, MRMSrcMem, (outs FR32:$dst), (ins i32mem:$src),
                        "movd\t{$src, $dst|$dst, $src}",
                        [(set FR32:$dst, (bitconvert (loadi32 addr:$src)))],
                        IIC_SSE_MOVDQ>, Sched<[WriteLoad]>;
}
4765
4766//===---------------------------------------------------------------------===//
4767// Move Packed Doubleword Int to Packed Double Int
4768//
// movd XMM -> GPR/memory (opcode 0x7E, MRMDest*): extract element 0 of a
// v4i32 into GR32 or store it to an i32 memory location.
def VMOVPDI2DIrr  : VS2I<0x7E, MRMDestReg, (outs GR32:$dst), (ins VR128:$src),
                       "movd\t{$src, $dst|$dst, $src}",
                       [(set GR32:$dst, (extractelt (v4i32 VR128:$src),
                                        (iPTR 0)))], IIC_SSE_MOVD_ToGP>, VEX,
                    Sched<[WriteMove]>;
def VMOVPDI2DImr  : VS2I<0x7E, MRMDestMem, (outs),
                       (ins i32mem:$dst, VR128:$src),
                       "movd\t{$src, $dst|$dst, $src}",
                       [(store (i32 (extractelt (v4i32 VR128:$src),
                                     (iPTR 0))), addr:$dst)], IIC_SSE_MOVDQ>,
                                     VEX, Sched<[WriteStore]>;
def MOVPDI2DIrr  : S2I<0x7E, MRMDestReg, (outs GR32:$dst), (ins VR128:$src),
                       "movd\t{$src, $dst|$dst, $src}",
                       [(set GR32:$dst, (extractelt (v4i32 VR128:$src),
                                        (iPTR 0)))], IIC_SSE_MOVD_ToGP>,
                   Sched<[WriteMove]>;
def MOVPDI2DImr  : S2I<0x7E, MRMDestMem, (outs), (ins i32mem:$dst, VR128:$src),
                       "movd\t{$src, $dst|$dst, $src}",
                       [(store (i32 (extractelt (v4i32 VR128:$src),
                                     (iPTR 0))), addr:$dst)],
                                     IIC_SSE_MOVDQ>, Sched<[WriteStore]>;
4790
// Insert a GPR into lane 0 of a 256-bit vector whose other lanes are zero
// or undef: the 128-bit VEX movd/movq already zeroes the upper bits, so a
// plain SUBREG_TO_REG of the 128-bit move suffices.
def : Pat<(v8i32 (X86Vinsert (v8i32 immAllZerosV), GR32:$src2, (iPTR 0))),
        (SUBREG_TO_REG (i32 0), (VMOVDI2PDIrr GR32:$src2), sub_xmm)>;

def : Pat<(v4i64 (X86Vinsert (bc_v4i64 (v8i32 immAllZerosV)), GR64:$src2, (iPTR 0))),
        (SUBREG_TO_REG (i32 0), (VMOV64toPQIrr GR64:$src2), sub_xmm)>;

def : Pat<(v8i32 (X86Vinsert undef, GR32:$src2, (iPTR 0))),
        (SUBREG_TO_REG (i32 0), (VMOVDI2PDIrr GR32:$src2), sub_xmm)>;

def : Pat<(v4i64 (X86Vinsert undef, GR64:$src2, (iPTR 0))),
        (SUBREG_TO_REG (i32 0), (VMOV64toPQIrr GR64:$src2), sub_xmm)>;
4802
4803//===---------------------------------------------------------------------===//
4804// Move Packed Doubleword Int first element to Doubleword Int
4805//
// movq XMM -> GR64 (REX.W 0x7E, MRMDestReg): extract qword element 0.
let SchedRW = [WriteMove] in {
def VMOVPQIto64rr : VRS2I<0x7E, MRMDestReg, (outs GR64:$dst), (ins VR128:$src),
                          "movq\t{$src, $dst|$dst, $src}",
                          [(set GR64:$dst, (extractelt (v2i64 VR128:$src),
                                                        (iPTR 0)))],
                                                           IIC_SSE_MOVD_ToGP>,
                      VEX;

def MOVPQIto64rr : RS2I<0x7E, MRMDestReg, (outs GR64:$dst), (ins VR128:$src),
                        "mov{d|q}\t{$src, $dst|$dst, $src}",
                        [(set GR64:$dst, (extractelt (v2i64 VR128:$src),
                                                         (iPTR 0)))],
                                                         IIC_SSE_MOVD_ToGP>;
} //SchedRW

// Disassembler-only store forms: no patterns, flagged mayStore.
let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0, mayStore = 1 in
def VMOVPQIto64rm : VRS2I<0x7E, MRMDestMem, (outs i64mem:$dst),
                          (ins VR128:$src), "movq\t{$src, $dst|$dst, $src}",
                          [], IIC_SSE_MOVDQ>, VEX, Sched<[WriteStore]>;
let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0, mayStore = 1 in
def MOVPQIto64rm : RS2I<0x7E, MRMDestMem, (outs i64mem:$dst), (ins VR128:$src),
                        "mov{d|q}\t{$src, $dst|$dst, $src}",
                        [], IIC_SSE_MOVDQ>, Sched<[WriteStore]>;
4829
4830//===---------------------------------------------------------------------===//
4831// Bitcast FR64 <-> GR64
4832//
// FR64 <-> GR64/i64mem bitcast moves, all codegen-only reuses of the
// movq encodings.
let isCodeGenOnly = 1 in {
  // Load i64 and reinterpret as FR64.
  let Predicates = [UseAVX] in
  def VMOV64toSDrm : VS2SI<0x7E, MRMSrcMem, (outs FR64:$dst), (ins i64mem:$src),
                          "movq\t{$src, $dst|$dst, $src}",
                          [(set FR64:$dst, (bitconvert (loadi64 addr:$src)))]>,
                          VEX, Sched<[WriteLoad]>;
  // FR64 -> GR64.
  def VMOVSDto64rr : VRS2I<0x7E, MRMDestReg, (outs GR64:$dst), (ins FR64:$src),
                           "movq\t{$src, $dst|$dst, $src}",
                           [(set GR64:$dst, (bitconvert FR64:$src))],
                           IIC_SSE_MOVDQ>, VEX, Sched<[WriteMove]>;
  // FR64 -> i64 store.
  def VMOVSDto64mr : VRS2I<0x7E, MRMDestMem, (outs), (ins i64mem:$dst, FR64:$src),
                           "movq\t{$src, $dst|$dst, $src}",
                           [(store (i64 (bitconvert FR64:$src)), addr:$dst)],
                           IIC_SSE_MOVDQ>, VEX, Sched<[WriteStore]>;

  // Legacy (non-VEX) counterparts.
  def MOV64toSDrm : S2SI<0x7E, MRMSrcMem, (outs FR64:$dst), (ins i64mem:$src),
                         "movq\t{$src, $dst|$dst, $src}",
                         [(set FR64:$dst, (bitconvert (loadi64 addr:$src)))],
                         IIC_SSE_MOVDQ>, Sched<[WriteLoad]>;
  def MOVSDto64rr : RS2I<0x7E, MRMDestReg, (outs GR64:$dst), (ins FR64:$src),
                         "mov{d|q}\t{$src, $dst|$dst, $src}",
                         [(set GR64:$dst, (bitconvert FR64:$src))],
                         IIC_SSE_MOVD_ToGP>, Sched<[WriteMove]>;
  def MOVSDto64mr : RS2I<0x7E, MRMDestMem, (outs), (ins i64mem:$dst, FR64:$src),
                         "movq\t{$src, $dst|$dst, $src}",
                         [(store (i64 (bitconvert FR64:$src)), addr:$dst)],
                         IIC_SSE_MOVDQ>, Sched<[WriteStore]>;
}
4861
4862//===---------------------------------------------------------------------===//
4863// Move Scalar Single to Double Int
4864//
// FR32 -> GR32/i32mem bitcast moves (movd encoding, codegen-only).
let isCodeGenOnly = 1 in {
  def VMOVSS2DIrr  : VS2I<0x7E, MRMDestReg, (outs GR32:$dst), (ins FR32:$src),
                        "movd\t{$src, $dst|$dst, $src}",
                        [(set GR32:$dst, (bitconvert FR32:$src))],
                        IIC_SSE_MOVD_ToGP>, VEX, Sched<[WriteMove]>;
  def VMOVSS2DImr  : VS2I<0x7E, MRMDestMem, (outs), (ins i32mem:$dst, FR32:$src),
                        "movd\t{$src, $dst|$dst, $src}",
                        [(store (i32 (bitconvert FR32:$src)), addr:$dst)],
                        IIC_SSE_MOVDQ>, VEX, Sched<[WriteStore]>;
  def MOVSS2DIrr  : S2I<0x7E, MRMDestReg, (outs GR32:$dst), (ins FR32:$src),
                        "movd\t{$src, $dst|$dst, $src}",
                        [(set GR32:$dst, (bitconvert FR32:$src))],
                        IIC_SSE_MOVD_ToGP>, Sched<[WriteMove]>;
  def MOVSS2DImr  : S2I<0x7E, MRMDestMem, (outs), (ins i32mem:$dst, FR32:$src),
                        "movd\t{$src, $dst|$dst, $src}",
                        [(store (i32 (bitconvert FR32:$src)), addr:$dst)],
                        IIC_SSE_MOVDQ>, Sched<[WriteStore]>;
}
4883
// Select the VEX movd/movq forms for zero-extending scalar-to-vector
// moves (X86vzmovl of scalar_to_vector).
let Predicates = [UseAVX] in {
  let AddedComplexity = 15 in {
    def : Pat<(v4i32 (X86vzmovl (v4i32 (scalar_to_vector GR32:$src)))),
              (VMOVDI2PDIrr GR32:$src)>;

    def : Pat<(v2i64 (X86vzmovl (v2i64 (scalar_to_vector GR64:$src)))),
              (VMOV64toPQIrr GR64:$src)>;

    def : Pat<(v4i64 (X86vzmovl (insert_subvector undef,
                (v2i64 (scalar_to_vector GR64:$src)),(iPTR 0)))),
              (SUBREG_TO_REG (i64 0), (VMOV64toPQIrr GR64:$src), sub_xmm)>;
  }
  // AVX 128-bit movd/movq instructions write zeros in the high 128-bit part.
  // These instructions also write zeros in the high part of a 256-bit register.
  let AddedComplexity = 20 in {
    def : Pat<(v4i32 (X86vzmovl (v4i32 (scalar_to_vector (loadi32 addr:$src))))),
              (VMOVDI2PDIrm addr:$src)>;
    def : Pat<(v4i32 (X86vzmovl (bc_v4i32 (loadv4f32 addr:$src)))),
              (VMOVDI2PDIrm addr:$src)>;
    def : Pat<(v4i32 (X86vzmovl (bc_v4i32 (loadv2i64 addr:$src)))),
              (VMOVDI2PDIrm addr:$src)>;
    def : Pat<(v8i32 (X86vzmovl (insert_subvector undef,
                (v4i32 (scalar_to_vector (loadi32 addr:$src))), (iPTR 0)))),
              (SUBREG_TO_REG (i32 0), (VMOVDI2PDIrm addr:$src), sub_xmm)>;
  }
  // Use regular 128-bit instructions to match 256-bit scalar_to_vec+zext.
  def : Pat<(v8i32 (X86vzmovl (insert_subvector undef,
                               (v4i32 (scalar_to_vector GR32:$src)),(iPTR 0)))),
            (SUBREG_TO_REG (i32 0), (VMOVDI2PDIrr GR32:$src), sub_xmm)>;
}
4914
4915let Predicates = [UseSSE2] in {
4916  let AddedComplexity = 15 in {
4917    def : Pat<(v4i32 (X86vzmovl (v4i32 (scalar_to_vector GR32:$src)))),
4918              (MOVDI2PDIrr GR32:$src)>;
4919
4920    def : Pat<(v2i64 (X86vzmovl (v2i64 (scalar_to_vector GR64:$src)))),
4921              (MOV64toPQIrr GR64:$src)>;
4922  }
4923  let AddedComplexity = 20 in {
4924    def : Pat<(v4i32 (X86vzmovl (v4i32 (scalar_to_vector (loadi32 addr:$src))))),
4925              (MOVDI2PDIrm addr:$src)>;
4926    def : Pat<(v4i32 (X86vzmovl (bc_v4i32 (loadv4f32 addr:$src)))),
4927              (MOVDI2PDIrm addr:$src)>;
4928    def : Pat<(v4i32 (X86vzmovl (bc_v4i32 (loadv2i64 addr:$src)))),
4929              (MOVDI2PDIrm addr:$src)>;
4930  }
4931}
4932
// These are the correct encodings of the instructions so that we know how to
// read correct assembly, even though we continue to emit the wrong ones for
// compatibility with Darwin's buggy assembler.
// (The trailing 0 suppresses printing of the alias; it is parse-only.)
def : InstAlias<"movq\t{$src, $dst|$dst, $src}",
                (MOV64toPQIrr VR128:$dst, GR64:$src), 0>;
def : InstAlias<"movq\t{$src, $dst|$dst, $src}",
                (MOVPQIto64rr GR64:$dst, VR128:$src), 0>;
// Allow "vmovd" but print "vmovq" since we don't need compatibility for AVX.
def : InstAlias<"vmovd\t{$src, $dst|$dst, $src}",
                (VMOV64toPQIrr VR128:$dst, GR64:$src), 0>;
def : InstAlias<"vmovd\t{$src, $dst|$dst, $src}",
                (VMOVPQIto64rr GR64:$dst, VR128:$src), 0>;
4945
4946//===---------------------------------------------------------------------===//
4947// SSE2 - Move Quadword
4948//===---------------------------------------------------------------------===//
4949
4950//===---------------------------------------------------------------------===//
4951// Move Quadword Int to Packed Quadword Int
4952//
4953
// movq: load a 64-bit integer from memory into the low qword of an XMM
// register (the upper qword is not modeled here by a pattern).
let ExeDomain = SSEPackedInt, SchedRW = [WriteLoad] in {
def VMOVQI2PQIrm : I<0x7E, MRMSrcMem, (outs VR128:$dst), (ins i64mem:$src),
                    "vmovq\t{$src, $dst|$dst, $src}",
                    [(set VR128:$dst,
                      (v2i64 (scalar_to_vector (loadi64 addr:$src))))]>, XS,
                    VEX, Requires<[UseAVX]>;
// NOTE(review): unlike the SSE2 form below, the VEX form above specifies no
// itinerary (IIC_SSE_MOVDQ) — confirm this asymmetry is intentional.
def MOVQI2PQIrm : I<0x7E, MRMSrcMem, (outs VR128:$dst), (ins i64mem:$src),
                    "movq\t{$src, $dst|$dst, $src}",
                    [(set VR128:$dst,
                      (v2i64 (scalar_to_vector (loadi64 addr:$src))))],
                      IIC_SSE_MOVDQ>, XS,
                    Requires<[UseSSE2]>; // SSE2 instruction with XS Prefix
} // ExeDomain, SchedRW
4967
4968//===---------------------------------------------------------------------===//
4969// Move Packed Quadword Int to Quadword Int
4970//
// movq (store form): write the low qword (element 0 of a v2i64) of an XMM
// register to memory.
let ExeDomain = SSEPackedInt, SchedRW = [WriteStore] in {
def VMOVPQI2QImr : VS2I<0xD6, MRMDestMem, (outs), (ins i64mem:$dst, VR128:$src),
                      "movq\t{$src, $dst|$dst, $src}",
                      [(store (i64 (extractelt (v2i64 VR128:$src),
                                    (iPTR 0))), addr:$dst)],
                                    IIC_SSE_MOVDQ>, VEX;
def MOVPQI2QImr : S2I<0xD6, MRMDestMem, (outs), (ins i64mem:$dst, VR128:$src),
                      "movq\t{$src, $dst|$dst, $src}",
                      [(store (i64 (extractelt (v2i64 VR128:$src),
                                    (iPTR 0))), addr:$dst)],
                                    IIC_SSE_MOVDQ>;
} // ExeDomain, SchedRW

// For disassembler only
// Register-register encodings of movq (0xD6 /r with a register destination);
// never selected by codegen (no patterns, hasSideEffects = 0).
let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0,
    SchedRW = [WriteVecLogic] in {
def VMOVPQI2QIrr : VS2I<0xD6, MRMDestReg, (outs VR128:$dst), (ins VR128:$src),
                     "movq\t{$src, $dst|$dst, $src}", [], IIC_SSE_MOVQ_RR>, VEX;
def MOVPQI2QIrr : S2I<0xD6, MRMDestReg, (outs VR128:$dst), (ins VR128:$src),
                      "movq\t{$src, $dst|$dst, $src}", [], IIC_SSE_MOVQ_RR>;
}
4992
4993//===---------------------------------------------------------------------===//
4994// Store / copy lower 64-bits of a XMM register.
4995//
// Map the movlps/storel_dq intrinsic onto the movq store forms above.
let Predicates = [HasAVX] in
def : Pat<(int_x86_sse2_storel_dq addr:$dst, VR128:$src),
          (VMOVPQI2QImr addr:$dst, VR128:$src)>;
let Predicates = [UseSSE2] in
def : Pat<(int_x86_sse2_storel_dq addr:$dst, VR128:$src),
          (MOVPQI2QImr addr:$dst, VR128:$src)>;
5002
// movq load with explicit zero-extension semantics: load 64 bits from memory
// into the low qword and zero the upper qword (X86vzmovl). isCodeGenOnly
// because it shares its encoding with (V)MOVQI2PQIrm above.
let ExeDomain = SSEPackedInt, isCodeGenOnly = 1, AddedComplexity = 20 in {
def VMOVZQI2PQIrm : I<0x7E, MRMSrcMem, (outs VR128:$dst), (ins i64mem:$src),
                     "vmovq\t{$src, $dst|$dst, $src}",
                     [(set VR128:$dst,
                       (v2i64 (X86vzmovl (v2i64 (scalar_to_vector
                                                 (loadi64 addr:$src))))))],
                                                 IIC_SSE_MOVDQ>,
                     XS, VEX, Requires<[UseAVX]>, Sched<[WriteLoad]>;

def MOVZQI2PQIrm : I<0x7E, MRMSrcMem, (outs VR128:$dst), (ins i64mem:$src),
                     "movq\t{$src, $dst|$dst, $src}",
                     [(set VR128:$dst,
                       (v2i64 (X86vzmovl (v2i64 (scalar_to_vector
                                                 (loadi64 addr:$src))))))],
                                                 IIC_SSE_MOVDQ>,
                     XS, Requires<[UseSSE2]>, Sched<[WriteLoad]>;
} // ExeDomain, isCodeGenOnly, AddedComplexity
5020
// Additional selection patterns for zero-extending 64-bit loads (X86vzload /
// vzmovl-of-load), mapped onto the movq load forms above.
let Predicates = [UseAVX], AddedComplexity = 20 in {
  def : Pat<(v2i64 (X86vzmovl (bc_v2i64 (loadv4f32 addr:$src)))),
            (VMOVZQI2PQIrm addr:$src)>;
  def : Pat<(v2i64 (X86vzload addr:$src)),
            (VMOVZQI2PQIrm addr:$src)>;
  // 256-bit: the 128-bit vmovq already zeroes bits 255:64 of the ymm register.
  def : Pat<(v4i64 (X86vzmovl (insert_subvector undef,
              (v2i64 (scalar_to_vector (loadi64 addr:$src))), (iPTR 0)))),
            (SUBREG_TO_REG (i64 0), (VMOVZQI2PQIrm addr:$src), sub_xmm)>;
}

let Predicates = [UseSSE2], AddedComplexity = 20 in {
  def : Pat<(v2i64 (X86vzmovl (bc_v2i64 (loadv4f32 addr:$src)))),
            (MOVZQI2PQIrm addr:$src)>;
  def : Pat<(v2i64 (X86vzload addr:$src)), (MOVZQI2PQIrm addr:$src)>;
}

// A full 256-bit zero-extending load is just a 128-bit load; the upper lane
// is implicitly zeroed by the VEX.128 move.
let Predicates = [HasAVX] in {
def : Pat<(v4i64 (alignedX86vzload addr:$src)),
          (SUBREG_TO_REG (i32 0), (VMOVAPSrm addr:$src), sub_xmm)>;
def : Pat<(v4i64 (X86vzload addr:$src)),
          (SUBREG_TO_REG (i32 0), (VMOVUPSrm addr:$src), sub_xmm)>;
}
5043
5044//===---------------------------------------------------------------------===//
5045// Moving from XMM to XMM and clear upper 64 bits. Note, there is a bug in
5046// IA32 document. movq xmm1, xmm2 does clear the high bits.
5047//
// movq xmm, xmm: copy the low qword between XMM registers, zeroing the
// destination's upper qword.
let ExeDomain = SSEPackedInt, SchedRW = [WriteVecLogic] in {
let AddedComplexity = 15 in
def VMOVZPQILo2PQIrr : I<0x7E, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
                        "vmovq\t{$src, $dst|$dst, $src}",
                    [(set VR128:$dst, (v2i64 (X86vzmovl (v2i64 VR128:$src))))],
                    IIC_SSE_MOVQ_RR>,
                      XS, VEX, Requires<[UseAVX]>;
let AddedComplexity = 15 in
def MOVZPQILo2PQIrr : I<0x7E, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
                        "movq\t{$src, $dst|$dst, $src}",
                    [(set VR128:$dst, (v2i64 (X86vzmovl (v2i64 VR128:$src))))],
                    IIC_SSE_MOVQ_RR>,
                      XS, Requires<[UseSSE2]>;
} // ExeDomain, SchedRW
5062
// Memory forms of the zero-extending low-qword move: load a v2i64 and keep
// only the low qword, zeroing the rest. isCodeGenOnly (shares encoding with
// the plain movq loads).
let ExeDomain = SSEPackedInt, isCodeGenOnly = 1, SchedRW = [WriteVecLogicLd] in {
let AddedComplexity = 20 in
def VMOVZPQILo2PQIrm : I<0x7E, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src),
                        "vmovq\t{$src, $dst|$dst, $src}",
                    [(set VR128:$dst, (v2i64 (X86vzmovl
                                             (loadv2i64 addr:$src))))],
                                             IIC_SSE_MOVDQ>,
                      XS, VEX, Requires<[UseAVX]>;
let AddedComplexity = 20 in {
def MOVZPQILo2PQIrm : I<0x7E, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src),
                        "movq\t{$src, $dst|$dst, $src}",
                    [(set VR128:$dst, (v2i64 (X86vzmovl
                                             (loadv2i64 addr:$src))))],
                                             IIC_SSE_MOVDQ>,
                      XS, Requires<[UseSSE2]>;
}
} // ExeDomain, isCodeGenOnly, SchedRW

// The f64 flavor of vzmovl can reuse the integer movq rr form: the bit
// pattern of the low element is preserved and the upper qword is zeroed.
let AddedComplexity = 20 in {
  let Predicates = [UseAVX] in {
    def : Pat<(v2f64 (X86vzmovl (v2f64 VR128:$src))),
              (VMOVZPQILo2PQIrr VR128:$src)>;
  }
  let Predicates = [UseSSE2] in {
    def : Pat<(v2f64 (X86vzmovl (v2f64 VR128:$src))),
              (MOVZPQILo2PQIrr VR128:$src)>;
  }
}
5091
5092//===---------------------------------------------------------------------===//
5093// SSE3 - Replicate Single FP - MOVSHDUP and MOVSLDUP
5094//===---------------------------------------------------------------------===//
// sse3_replicate_sfp - Emits the rr/rm pair for MOVSHDUP/MOVSLDUP style
// single-FP replicate instructions. OpNode is the target DAG node
// (X86Movshdup / X86Movsldup); mem_frag/x86memop select the load flavor.
multiclass sse3_replicate_sfp<bits<8> op, SDNode OpNode, string OpcodeStr,
                              ValueType vt, RegisterClass RC, PatFrag mem_frag,
                              X86MemOperand x86memop> {
def rr : S3SI<op, MRMSrcReg, (outs RC:$dst), (ins RC:$src),
                    !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
                      [(set RC:$dst, (vt (OpNode RC:$src)))],
                      IIC_SSE_MOV_LH>, Sched<[WriteFShuffle]>;
def rm : S3SI<op, MRMSrcMem, (outs RC:$dst), (ins x86memop:$src),
                    !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
                      [(set RC:$dst, (OpNode (mem_frag addr:$src)))],
                      IIC_SSE_MOV_LH>, Sched<[WriteLoad]>;
}
5107
// AVX forms use unaligned loads (loadv*); SSE forms require aligned memory
// operands (memopv4f32).
let Predicates = [HasAVX, NoVLX] in {
  defm VMOVSHDUP  : sse3_replicate_sfp<0x16, X86Movshdup, "vmovshdup",
                                       v4f32, VR128, loadv4f32, f128mem>, VEX;
  defm VMOVSLDUP  : sse3_replicate_sfp<0x12, X86Movsldup, "vmovsldup",
                                       v4f32, VR128, loadv4f32, f128mem>, VEX;
  defm VMOVSHDUPY : sse3_replicate_sfp<0x16, X86Movshdup, "vmovshdup",
                                 v8f32, VR256, loadv8f32, f256mem>, VEX, VEX_L;
  defm VMOVSLDUPY : sse3_replicate_sfp<0x12, X86Movsldup, "vmovsldup",
                                 v8f32, VR256, loadv8f32, f256mem>, VEX, VEX_L;
}
defm MOVSHDUP : sse3_replicate_sfp<0x16, X86Movshdup, "movshdup", v4f32, VR128,
                                   memopv4f32, f128mem>;
defm MOVSLDUP : sse3_replicate_sfp<0x12, X86Movsldup, "movsldup", v4f32, VR128,
                                   memopv4f32, f128mem>;
5122
// The multiclass only covers the FP vector types; these patterns additionally
// select movshdup/movsldup for the equivalent integer-typed shuffles.
let Predicates = [HasAVX, NoVLX] in {
  def : Pat<(v4i32 (X86Movshdup VR128:$src)),
            (VMOVSHDUPrr VR128:$src)>;
  def : Pat<(v4i32 (X86Movshdup (bc_v4i32 (loadv2i64 addr:$src)))),
            (VMOVSHDUPrm addr:$src)>;
  def : Pat<(v4i32 (X86Movsldup VR128:$src)),
            (VMOVSLDUPrr VR128:$src)>;
  def : Pat<(v4i32 (X86Movsldup (bc_v4i32 (loadv2i64 addr:$src)))),
            (VMOVSLDUPrm addr:$src)>;
  def : Pat<(v8i32 (X86Movshdup VR256:$src)),
            (VMOVSHDUPYrr VR256:$src)>;
  def : Pat<(v8i32 (X86Movshdup (bc_v8i32 (loadv4i64 addr:$src)))),
            (VMOVSHDUPYrm addr:$src)>;
  def : Pat<(v8i32 (X86Movsldup VR256:$src)),
            (VMOVSLDUPYrr VR256:$src)>;
  def : Pat<(v8i32 (X86Movsldup (bc_v8i32 (loadv4i64 addr:$src)))),
            (VMOVSLDUPYrm addr:$src)>;
}

let Predicates = [UseSSE3] in {
  def : Pat<(v4i32 (X86Movshdup VR128:$src)),
            (MOVSHDUPrr VR128:$src)>;
  def : Pat<(v4i32 (X86Movshdup (bc_v4i32 (memopv2i64 addr:$src)))),
            (MOVSHDUPrm addr:$src)>;
  def : Pat<(v4i32 (X86Movsldup VR128:$src)),
            (MOVSLDUPrr VR128:$src)>;
  def : Pat<(v4i32 (X86Movsldup (bc_v4i32 (memopv2i64 addr:$src)))),
            (MOVSLDUPrm addr:$src)>;
}
5152
5153//===---------------------------------------------------------------------===//
5154// SSE3 - Replicate Double FP - MOVDDUP
5155//===---------------------------------------------------------------------===//
5156
// sse3_replicate_dfp - Emits the 128-bit rr/rm pair for MOVDDUP (duplicate
// the low double into both lanes). The memory form loads only 64 bits
// (f64mem) and broadcasts the scalar.
multiclass sse3_replicate_dfp<string OpcodeStr> {
def rr  : S3DI<0x12, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
                    !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
                    [(set VR128:$dst, (v2f64 (X86Movddup VR128:$src)))],
                    IIC_SSE_MOV_LH>, Sched<[WriteFShuffle]>;
def rm  : S3DI<0x12, MRMSrcMem, (outs VR128:$dst), (ins f64mem:$src),
                    !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
                    [(set VR128:$dst,
                      (v2f64 (X86Movddup
                              (scalar_to_vector (loadf64 addr:$src)))))],
                              IIC_SSE_MOV_LH>, Sched<[WriteLoad]>;
}
5169
// FIXME: Merge with the above class when there are patterns for the ymm
// version.
// 256-bit MOVDDUP: duplicates the low double of each 128-bit lane. The
// memory form loads the full 256 bits (f256mem), unlike the 128-bit form.
multiclass sse3_replicate_dfp_y<string OpcodeStr> {
def rr  : S3DI<0x12, MRMSrcReg, (outs VR256:$dst), (ins VR256:$src),
                    !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
                    [(set VR256:$dst, (v4f64 (X86Movddup VR256:$src)))]>,
                    Sched<[WriteFShuffle]>;
def rm  : S3DI<0x12, MRMSrcMem, (outs VR256:$dst), (ins f256mem:$src),
                    !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
                    [(set VR256:$dst,
                      (v4f64 (X86Movddup (loadv4f64 addr:$src))))]>,
                    Sched<[WriteLoad]>;
}
5182
// Instantiate the MOVDDUP instructions (AVX 128/256-bit and SSE3).
let Predicates = [HasAVX, NoVLX] in {
  defm VMOVDDUP  : sse3_replicate_dfp<"vmovddup">, VEX;
  defm VMOVDDUPY : sse3_replicate_dfp_y<"vmovddup">, VEX, VEX_L;
}

defm MOVDDUP : sse3_replicate_dfp<"movddup">;
5189
5190
// Extra MOVDDUP selection patterns for integer/bitcast operand shapes.
// NOTE(review): the trailing Requires<[HasAVX]> on these Pats is overridden
// by the enclosing top-level 'let Predicates' and so appears to be dead —
// confirm before relying on it.
let Predicates = [HasAVX, NoVLX] in {
  def : Pat<(X86Movddup (loadv2f64 addr:$src)),
            (VMOVDDUPrm addr:$src)>, Requires<[HasAVX]>;

  // 256-bit version
  def : Pat<(X86Movddup (loadv4i64 addr:$src)),
            (VMOVDDUPYrm addr:$src)>;
  def : Pat<(X86Movddup (v4i64 VR256:$src)),
            (VMOVDDUPYrr VR256:$src)>;
}

let Predicates = [HasAVX] in {
  def : Pat<(X86Movddup (bc_v2f64 (loadv4f32 addr:$src))),
            (VMOVDDUPrm addr:$src)>, Requires<[HasAVX]>;
  def : Pat<(X86Movddup (bc_v2f64 (loadv2i64 addr:$src))),
            (VMOVDDUPrm addr:$src)>, Requires<[HasAVX]>;
  def : Pat<(X86Movddup (bc_v2f64
                             (v2i64 (scalar_to_vector (loadi64 addr:$src))))),
            (VMOVDDUPrm addr:$src)>, Requires<[HasAVX]>;
}
5211
// When optimizing for size, a 128-bit broadcast of a 64-bit scalar can be a
// single vmovddup load instead of load+shuffle.
let Predicates = [UseAVX, OptForSize] in {
  def : Pat<(v2f64 (X86VBroadcast (loadf64 addr:$src))),
            (VMOVDDUPrm addr:$src)>;
  def : Pat<(v2i64 (X86VBroadcast (loadi64 addr:$src))),
            (VMOVDDUPrm addr:$src)>;
}

// SSE3 (non-VEX) MOVDDUP patterns; memop* requires aligned memory.
let Predicates = [UseSSE3] in {
  def : Pat<(X86Movddup (memopv2f64 addr:$src)),
            (MOVDDUPrm addr:$src)>;
  def : Pat<(X86Movddup (bc_v2f64 (memopv4f32 addr:$src))),
            (MOVDDUPrm addr:$src)>;
  def : Pat<(X86Movddup (bc_v2f64 (memopv2i64 addr:$src))),
            (MOVDDUPrm addr:$src)>;
  def : Pat<(X86Movddup (bc_v2f64
                             (v2i64 (scalar_to_vector (loadi64 addr:$src))))),
            (MOVDDUPrm addr:$src)>;
}
5230
5231//===---------------------------------------------------------------------===//
5232// SSE3 - Move Unaligned Integer
5233//===---------------------------------------------------------------------===//
5234
// lddqu: unaligned 128/256-bit integer load, only reachable through its
// intrinsic (it may read beyond cache-line boundaries differently from movdqu
// on some implementations).
let SchedRW = [WriteLoad] in {
let Predicates = [HasAVX] in {
  def VLDDQUrm : S3DI<0xF0, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src),
                   "vlddqu\t{$src, $dst|$dst, $src}",
                   [(set VR128:$dst, (int_x86_sse3_ldu_dq addr:$src))]>, VEX;
  def VLDDQUYrm : S3DI<0xF0, MRMSrcMem, (outs VR256:$dst), (ins i256mem:$src),
                   "vlddqu\t{$src, $dst|$dst, $src}",
                   [(set VR256:$dst, (int_x86_avx_ldu_dq_256 addr:$src))]>,
                   VEX, VEX_L;
}
def LDDQUrm : S3DI<0xF0, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src),
                   "lddqu\t{$src, $dst|$dst, $src}",
                   [(set VR128:$dst, (int_x86_sse3_ldu_dq addr:$src))],
                   IIC_SSE_LDDQU>;
}
5250
5251//===---------------------------------------------------------------------===//
5252// SSE3 - Arithmetic
5253//===---------------------------------------------------------------------===//
5254
// sse3_addsub - Emits the rr/rm pair for ADDSUBPS/ADDSUBPD via their
// intrinsics. Is2Addr selects the two-operand (SSE, $src1 tied to $dst)
// versus three-operand (AVX) assembly string.
multiclass sse3_addsub<Intrinsic Int, string OpcodeStr, RegisterClass RC,
                       X86MemOperand x86memop, OpndItins itins,
                       PatFrag ld_frag, bit Is2Addr = 1> {
  // Register-register form.
  def rr : I<0xD0, MRMSrcReg,
       (outs RC:$dst), (ins RC:$src1, RC:$src2),
       !if(Is2Addr,
           !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
           !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
       [(set RC:$dst, (Int RC:$src1, RC:$src2))], itins.rr>,
       Sched<[itins.Sched]>;
  // Register-memory form. Fixed to use the memory-form itinerary itins.rm;
  // the original mistakenly reused itins.rr here, so folded loads were given
  // the register-form latency.
  def rm : I<0xD0, MRMSrcMem,
       (outs RC:$dst), (ins RC:$src1, x86memop:$src2),
       !if(Is2Addr,
           !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
           !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
       [(set RC:$dst, (Int RC:$src1, (ld_frag addr:$src2)))], itins.rm>,
       Sched<[itins.Sched.Folded, ReadAfterLd]>;
}
5273
// Instantiate ADDSUBPS/ADDSUBPD: AVX three-operand (Is2Addr = 0, unaligned
// loads) and SSE3 two-operand ($src1 tied to $dst, aligned memop loads).
let Predicates = [HasAVX] in {
  let ExeDomain = SSEPackedSingle in {
    defm VADDSUBPS : sse3_addsub<int_x86_sse3_addsub_ps, "vaddsubps", VR128,
                               f128mem, SSE_ALU_F32P, loadv4f32, 0>, XD, VEX_4V;
    defm VADDSUBPSY : sse3_addsub<int_x86_avx_addsub_ps_256, "vaddsubps", VR256,
                        f256mem, SSE_ALU_F32P, loadv8f32, 0>, XD, VEX_4V, VEX_L;
  }
  let ExeDomain = SSEPackedDouble in {
    defm VADDSUBPD : sse3_addsub<int_x86_sse3_addsub_pd, "vaddsubpd", VR128,
                               f128mem, SSE_ALU_F64P, loadv2f64, 0>, PD, VEX_4V;
    defm VADDSUBPDY : sse3_addsub<int_x86_avx_addsub_pd_256, "vaddsubpd", VR256,
                        f256mem, SSE_ALU_F64P, loadv4f64, 0>, PD, VEX_4V, VEX_L;
  }
}
let Constraints = "$src1 = $dst", Predicates = [UseSSE3] in {
  let ExeDomain = SSEPackedSingle in
  defm ADDSUBPS : sse3_addsub<int_x86_sse3_addsub_ps, "addsubps", VR128,
                              f128mem, SSE_ALU_F32P, memopv4f32>, XD;
  let ExeDomain = SSEPackedDouble in
  defm ADDSUBPD : sse3_addsub<int_x86_sse3_addsub_pd, "addsubpd", VR128,
                              f128mem, SSE_ALU_F64P, memopv2f64>, PD;
}
5296
// Patterns used to select 'addsub' instructions.
// These map the generic X86Addsub DAG node (produced by shuffle combining)
// onto the intrinsic-defined instructions above.
let Predicates = [HasAVX] in {
  def : Pat<(v4f32 (X86Addsub (v4f32 VR128:$lhs), (v4f32 VR128:$rhs))),
            (VADDSUBPSrr VR128:$lhs, VR128:$rhs)>;
  def : Pat<(v4f32 (X86Addsub (v4f32 VR128:$lhs), (loadv4f32 addr:$rhs))),
            (VADDSUBPSrm VR128:$lhs, f128mem:$rhs)>;
  def : Pat<(v2f64 (X86Addsub (v2f64 VR128:$lhs), (v2f64 VR128:$rhs))),
            (VADDSUBPDrr VR128:$lhs, VR128:$rhs)>;
  def : Pat<(v2f64 (X86Addsub (v2f64 VR128:$lhs), (loadv2f64 addr:$rhs))),
            (VADDSUBPDrm VR128:$lhs, f128mem:$rhs)>;

  def : Pat<(v8f32 (X86Addsub (v8f32 VR256:$lhs), (v8f32 VR256:$rhs))),
            (VADDSUBPSYrr VR256:$lhs, VR256:$rhs)>;
  def : Pat<(v8f32 (X86Addsub (v8f32 VR256:$lhs), (loadv8f32 addr:$rhs))),
            (VADDSUBPSYrm VR256:$lhs, f256mem:$rhs)>;
  def : Pat<(v4f64 (X86Addsub (v4f64 VR256:$lhs), (v4f64 VR256:$rhs))),
            (VADDSUBPDYrr VR256:$lhs, VR256:$rhs)>;
  def : Pat<(v4f64 (X86Addsub (v4f64 VR256:$lhs), (loadv4f64 addr:$rhs))),
            (VADDSUBPDYrm VR256:$lhs, f256mem:$rhs)>;
}

let Predicates = [UseSSE3] in {
  def : Pat<(v4f32 (X86Addsub (v4f32 VR128:$lhs), (v4f32 VR128:$rhs))),
            (ADDSUBPSrr VR128:$lhs, VR128:$rhs)>;
  def : Pat<(v4f32 (X86Addsub (v4f32 VR128:$lhs), (memopv4f32 addr:$rhs))),
            (ADDSUBPSrm VR128:$lhs, f128mem:$rhs)>;
  def : Pat<(v2f64 (X86Addsub (v2f64 VR128:$lhs), (v2f64 VR128:$rhs))),
            (ADDSUBPDrr VR128:$lhs, VR128:$rhs)>;
  def : Pat<(v2f64 (X86Addsub (v2f64 VR128:$lhs), (memopv2f64 addr:$rhs))),
            (ADDSUBPDrm VR128:$lhs, f128mem:$rhs)>;
}
5328
5329//===---------------------------------------------------------------------===//
5330// SSE3 Instructions
5331//===---------------------------------------------------------------------===//
5332
5333// Horizontal ops
// Horizontal ops
// S3D_Int - rr/rm pair for SSE3 horizontal FP ops in the F2/XD (S3DI)
// encoding space (haddps/hsubps). OpNode is X86fhadd/X86fhsub.
multiclass S3D_Int<bits<8> o, string OpcodeStr, ValueType vt, RegisterClass RC,
                   X86MemOperand x86memop, SDNode OpNode, PatFrag ld_frag,
                   bit Is2Addr = 1> {
  def rr : S3DI<o, MRMSrcReg, (outs RC:$dst), (ins RC:$src1, RC:$src2),
       !if(Is2Addr,
         !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
         !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
      [(set RC:$dst, (vt (OpNode RC:$src1, RC:$src2)))], IIC_SSE_HADDSUB_RR>,
      Sched<[WriteFAdd]>;

  def rm : S3DI<o, MRMSrcMem, (outs RC:$dst), (ins RC:$src1, x86memop:$src2),
       !if(Is2Addr,
         !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
         !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
      [(set RC:$dst, (vt (OpNode RC:$src1, (ld_frag addr:$src2))))],
        IIC_SSE_HADDSUB_RM>, Sched<[WriteFAddLd, ReadAfterLd]>;
}
// S3_Int - Same shape as S3D_Int but in the 66/PD (S3I) encoding space
// (haddpd/hsubpd).
multiclass S3_Int<bits<8> o, string OpcodeStr, ValueType vt, RegisterClass RC,
                  X86MemOperand x86memop, SDNode OpNode, PatFrag ld_frag,
                  bit Is2Addr = 1> {
  def rr : S3I<o, MRMSrcReg, (outs RC:$dst), (ins RC:$src1, RC:$src2),
       !if(Is2Addr,
         !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
         !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
      [(set RC:$dst, (vt (OpNode RC:$src1, RC:$src2)))], IIC_SSE_HADDSUB_RR>,
      Sched<[WriteFAdd]>;

  def rm : S3I<o, MRMSrcMem, (outs RC:$dst), (ins RC:$src1, x86memop:$src2),
       !if(Is2Addr,
         !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
         !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
      [(set RC:$dst, (vt (OpNode RC:$src1, (ld_frag addr:$src2))))],
        IIC_SSE_HADDSUB_RM>, Sched<[WriteFAddLd, ReadAfterLd]>;
}
5368
// AVX horizontal add/sub, three-operand forms (Is2Addr = 0), 128/256-bit.
let Predicates = [HasAVX] in {
  let ExeDomain = SSEPackedSingle in {
    defm VHADDPS  : S3D_Int<0x7C, "vhaddps", v4f32, VR128, f128mem,
                            X86fhadd, loadv4f32, 0>, VEX_4V;
    defm VHSUBPS  : S3D_Int<0x7D, "vhsubps", v4f32, VR128, f128mem,
                            X86fhsub, loadv4f32, 0>, VEX_4V;
    defm VHADDPSY : S3D_Int<0x7C, "vhaddps", v8f32, VR256, f256mem,
                            X86fhadd, loadv8f32, 0>, VEX_4V, VEX_L;
    defm VHSUBPSY : S3D_Int<0x7D, "vhsubps", v8f32, VR256, f256mem,
                            X86fhsub, loadv8f32, 0>, VEX_4V, VEX_L;
  }
  let ExeDomain = SSEPackedDouble in {
    defm VHADDPD  : S3_Int <0x7C, "vhaddpd", v2f64, VR128, f128mem,
                            X86fhadd, loadv2f64, 0>, VEX_4V;
    defm VHSUBPD  : S3_Int <0x7D, "vhsubpd", v2f64, VR128, f128mem,
                            X86fhsub, loadv2f64, 0>, VEX_4V;
    defm VHADDPDY : S3_Int <0x7C, "vhaddpd", v4f64, VR256, f256mem,
                            X86fhadd, loadv4f64, 0>, VEX_4V, VEX_L;
    defm VHSUBPDY : S3_Int <0x7D, "vhsubpd", v4f64, VR256, f256mem,
                            X86fhsub, loadv4f64, 0>, VEX_4V, VEX_L;
  }
}
5391
// SSE3 horizontal add/sub, two-operand forms ($src1 tied to $dst, aligned
// memop loads, default Is2Addr = 1).
let Constraints = "$src1 = $dst" in {
  let ExeDomain = SSEPackedSingle in {
    defm HADDPS : S3D_Int<0x7C, "haddps", v4f32, VR128, f128mem, X86fhadd,
                          memopv4f32>;
    defm HSUBPS : S3D_Int<0x7D, "hsubps", v4f32, VR128, f128mem, X86fhsub,
                          memopv4f32>;
  }
  let ExeDomain = SSEPackedDouble in {
    defm HADDPD : S3_Int<0x7C, "haddpd", v2f64, VR128, f128mem, X86fhadd,
                         memopv2f64>;
    defm HSUBPD : S3_Int<0x7D, "hsubpd", v2f64, VR128, f128mem, X86fhsub,
                         memopv2f64>;
  }
}
5406
5407//===---------------------------------------------------------------------===//
5408// SSSE3 - Packed Absolute Instructions
5409//===---------------------------------------------------------------------===//
5410
5411
/// SS3I_unop_rm_int - Simple SSSE3 unary op whose type can be v*{i8,i16,i32}.
/// Emits the 128-bit rr/rm pair from an intrinsic (used for PABSB/W/D).
/// ld_frag distinguishes AVX unaligned (loadv2i64) from SSE aligned
/// (memopv2i64) memory operands.
multiclass SS3I_unop_rm_int<bits<8> opc, string OpcodeStr, Intrinsic IntId128,
                            PatFrag ld_frag> {
  def rr128 : SS38I<opc, MRMSrcReg, (outs VR128:$dst),
                    (ins VR128:$src),
                    !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
                    [(set VR128:$dst, (IntId128 VR128:$src))], IIC_SSE_PABS_RR>,
                    Sched<[WriteVecALU]>;

  def rm128 : SS38I<opc, MRMSrcMem, (outs VR128:$dst),
                    (ins i128mem:$src),
                    !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
                    [(set VR128:$dst,
                      (IntId128
                       (bitconvert (ld_frag addr:$src))))], IIC_SSE_PABS_RM>,
                    Sched<[WriteVecALULd]>;
}
5429
/// SS3I_unop_rm_int_y - Simple SSSE3 unary op whose type can be v*{i8,i16,i32}.
/// 256-bit (AVX2) counterpart of SS3I_unop_rm_int; always uses the unaligned
/// loadv4i64 fragment.
multiclass SS3I_unop_rm_int_y<bits<8> opc, string OpcodeStr,
                              Intrinsic IntId256> {
  def rr256 : SS38I<opc, MRMSrcReg, (outs VR256:$dst),
                    (ins VR256:$src),
                    !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
                    [(set VR256:$dst, (IntId256 VR256:$src))]>,
                    Sched<[WriteVecALU]>;

  def rm256 : SS38I<opc, MRMSrcMem, (outs VR256:$dst),
                    (ins i256mem:$src),
                    !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
                    [(set VR256:$dst,
                      (IntId256
                       (bitconvert (loadv4i64 addr:$src))))]>,
                    Sched<[WriteVecALULd]>;
}
5447
// Helper fragments to match sext vXi1 to vXiY.
// Each fragment matches "broadcast the sign bit of every element": either an
// arithmetic shift right by (bitwidth-1), or pcmpgt against zero for i8
// elements (which have no vector shift).
def v16i1sextv16i8 : PatLeaf<(v16i8 (X86pcmpgt (bc_v16i8 (v4i32 immAllZerosV)),
                                               VR128:$src))>;
def v8i1sextv8i16  : PatLeaf<(v8i16 (X86vsrai VR128:$src, (i8 15)))>;
def v4i1sextv4i32  : PatLeaf<(v4i32 (X86vsrai VR128:$src, (i8 31)))>;
def v32i1sextv32i8 : PatLeaf<(v32i8 (X86pcmpgt (bc_v32i8 (v8i32 immAllZerosV)),
                                               VR256:$src))>;
def v16i1sextv16i16: PatLeaf<(v16i16 (X86vsrai VR256:$src, (i8 15)))>;
def v8i1sextv8i32  : PatLeaf<(v8i32 (X86vsrai VR256:$src, (i8 31)))>;
5457
// AVX 128-bit PABS instructions, plus patterns recognizing the open-coded
// abs idiom (x ^ sext(x<0)) + ... i.e. xor(mask, add(x, mask)) with
// mask = sign-extended sign bit.
let Predicates = [HasAVX] in {
  defm VPABSB  : SS3I_unop_rm_int<0x1C, "vpabsb", int_x86_ssse3_pabs_b_128,
                                  loadv2i64>, VEX;
  defm VPABSW  : SS3I_unop_rm_int<0x1D, "vpabsw", int_x86_ssse3_pabs_w_128,
                                  loadv2i64>, VEX;
  defm VPABSD  : SS3I_unop_rm_int<0x1E, "vpabsd", int_x86_ssse3_pabs_d_128,
                                  loadv2i64>, VEX;

  def : Pat<(xor
            (bc_v2i64 (v16i1sextv16i8)),
            (bc_v2i64 (add (v16i8 VR128:$src), (v16i1sextv16i8)))),
            (VPABSBrr128 VR128:$src)>;
  def : Pat<(xor
            (bc_v2i64 (v8i1sextv8i16)),
            (bc_v2i64 (add (v8i16 VR128:$src), (v8i1sextv8i16)))),
            (VPABSWrr128 VR128:$src)>;
  def : Pat<(xor
            (bc_v2i64 (v4i1sextv4i32)),
            (bc_v2i64 (add (v4i32 VR128:$src), (v4i1sextv4i32)))),
            (VPABSDrr128 VR128:$src)>;
}
5479
// AVX2 256-bit PABS instructions and the matching 256-bit abs-idiom patterns.
let Predicates = [HasAVX2] in {
  defm VPABSB  : SS3I_unop_rm_int_y<0x1C, "vpabsb",
                                    int_x86_avx2_pabs_b>, VEX, VEX_L;
  defm VPABSW  : SS3I_unop_rm_int_y<0x1D, "vpabsw",
                                    int_x86_avx2_pabs_w>, VEX, VEX_L;
  defm VPABSD  : SS3I_unop_rm_int_y<0x1E, "vpabsd",
                                    int_x86_avx2_pabs_d>, VEX, VEX_L;

  def : Pat<(xor
            (bc_v4i64 (v32i1sextv32i8)),
            (bc_v4i64 (add (v32i8 VR256:$src), (v32i1sextv32i8)))),
            (VPABSBrr256 VR256:$src)>;
  def : Pat<(xor
            (bc_v4i64 (v16i1sextv16i16)),
            (bc_v4i64 (add (v16i16 VR256:$src), (v16i1sextv16i16)))),
            (VPABSWrr256 VR256:$src)>;
  def : Pat<(xor
            (bc_v4i64 (v8i1sextv8i32)),
            (bc_v4i64 (add (v8i32 VR256:$src), (v8i1sextv8i32)))),
            (VPABSDrr256 VR256:$src)>;
}
5501
// SSSE3 (non-VEX) PABS instructions (aligned memop loads) and the same
// abs-idiom recognition patterns as the AVX versions above.
defm PABSB : SS3I_unop_rm_int<0x1C, "pabsb", int_x86_ssse3_pabs_b_128,
                              memopv2i64>;
defm PABSW : SS3I_unop_rm_int<0x1D, "pabsw", int_x86_ssse3_pabs_w_128,
                              memopv2i64>;
defm PABSD : SS3I_unop_rm_int<0x1E, "pabsd", int_x86_ssse3_pabs_d_128,
                              memopv2i64>;

let Predicates = [HasSSSE3] in {
  def : Pat<(xor
            (bc_v2i64 (v16i1sextv16i8)),
            (bc_v2i64 (add (v16i8 VR128:$src), (v16i1sextv16i8)))),
            (PABSBrr128 VR128:$src)>;
  def : Pat<(xor
            (bc_v2i64 (v8i1sextv8i16)),
            (bc_v2i64 (add (v8i16 VR128:$src), (v8i1sextv8i16)))),
            (PABSWrr128 VR128:$src)>;
  def : Pat<(xor
            (bc_v2i64 (v4i1sextv4i32)),
            (bc_v2i64 (add (v4i32 VR128:$src), (v4i1sextv4i32)))),
            (PABSDrr128 VR128:$src)>;
}
5523
5524//===---------------------------------------------------------------------===//
5525// SSSE3 - Packed Binary Operator Instructions
5526//===---------------------------------------------------------------------===//
5527
// Itinerary bundles (rr/rm instruction itinerary classes plus the scheduler
// write resource) for the SSSE3 binary operators defined below.
let Sched = WriteVecALU in {
def SSE_PHADDSUBD : OpndItins<
  IIC_SSE_PHADDSUBD_RR, IIC_SSE_PHADDSUBD_RM
>;
def SSE_PHADDSUBSW : OpndItins<
  IIC_SSE_PHADDSUBSW_RR, IIC_SSE_PHADDSUBSW_RM
>;
def SSE_PHADDSUBW : OpndItins<
  IIC_SSE_PHADDSUBW_RR, IIC_SSE_PHADDSUBW_RM
>;
}
let Sched = WriteShuffle in
def SSE_PSHUFB : OpndItins<
  IIC_SSE_PSHUFB_RR, IIC_SSE_PSHUFB_RM
>;
let Sched = WriteVecALU in
def SSE_PSIGN : OpndItins<
  IIC_SSE_PSIGN_RR, IIC_SSE_PSIGN_RM
>;
let Sched = WriteVecIMul in
// PMULHRSW uses a single itinerary class for both rr and rm forms.
def SSE_PMULHRSW : OpndItins<
  IIC_SSE_PMULHRSW, IIC_SSE_PMULHRSW
>;
5551
5552/// SS3I_binop_rm - Simple SSSE3 bin op
5553multiclass SS3I_binop_rm<bits<8> opc, string OpcodeStr, SDNode OpNode,
5554                         ValueType OpVT, RegisterClass RC, PatFrag memop_frag,
5555                         X86MemOperand x86memop, OpndItins itins,
5556                         bit Is2Addr = 1> {
5557  let isCommutable = 1 in
5558  def rr : SS38I<opc, MRMSrcReg, (outs RC:$dst),
5559       (ins RC:$src1, RC:$src2),
5560       !if(Is2Addr,
5561         !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
5562         !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
5563       [(set RC:$dst, (OpVT (OpNode RC:$src1, RC:$src2)))], itins.rr>,
5564       Sched<[itins.Sched]>;
5565  def rm : SS38I<opc, MRMSrcMem, (outs RC:$dst),
5566       (ins RC:$src1, x86memop:$src2),
5567       !if(Is2Addr,
5568         !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
5569         !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
5570       [(set RC:$dst,
5571         (OpVT (OpNode RC:$src1,
5572          (bitconvert (memop_frag addr:$src2)))))], itins.rm>,
5573       Sched<[itins.Sched.Folded, ReadAfterLd]>;
5574}
5575
/// SS3I_binop_rm_int - Simple SSSE3 bin op whose type can be v*{i8,i16,i32},
/// selected via a target intrinsic rather than an SDNode.
/// NOTE(review): unlike SS3I_binop_rm above, itins.rr/itins.rm are not
/// passed to the instruction here -- only itins.Sched is used.  Confirm
/// whether the itineraries were meant to be threaded through.
multiclass SS3I_binop_rm_int<bits<8> opc, string OpcodeStr,
                             Intrinsic IntId128, OpndItins itins,
                             PatFrag ld_frag, bit Is2Addr = 1> {
  let isCommutable = 1 in
  def rr128 : SS38I<opc, MRMSrcReg, (outs VR128:$dst),
       (ins VR128:$src1, VR128:$src2),
       !if(Is2Addr,
         !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
         !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
       [(set VR128:$dst, (IntId128 VR128:$src1, VR128:$src2))]>,
       Sched<[itins.Sched]>;
  def rm128 : SS38I<opc, MRMSrcMem, (outs VR128:$dst),
       (ins VR128:$src1, i128mem:$src2),
       !if(Is2Addr,
         !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
         !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
       [(set VR128:$dst,
         (IntId128 VR128:$src1,
          (bitconvert (ld_frag addr:$src2))))]>,
       Sched<[itins.Sched.Folded, ReadAfterLd]>;
}
5598
/// SS3I_binop_rm_int_y - 256-bit (AVX2) variant of SS3I_binop_rm_int.
/// Always three-operand VEX syntax; takes the scheduling class directly
/// instead of an OpndItins bundle.  Memory form folds a 256-bit load.
multiclass SS3I_binop_rm_int_y<bits<8> opc, string OpcodeStr,
                               Intrinsic IntId256,
                               X86FoldableSchedWrite Sched> {
  let isCommutable = 1 in
  def rr256 : SS38I<opc, MRMSrcReg, (outs VR256:$dst),
       (ins VR256:$src1, VR256:$src2),
       !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
       [(set VR256:$dst, (IntId256 VR256:$src1, VR256:$src2))]>,
       Sched<[Sched]>;
  def rm256 : SS38I<opc, MRMSrcMem, (outs VR256:$dst),
       (ins VR256:$src1, i256mem:$src2),
       !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
       [(set VR256:$dst,
         (IntId256 VR256:$src1, (bitconvert (loadv4i64 addr:$src2))))]>,
       Sched<[Sched.Folded, ReadAfterLd]>;
}
5615
// AVX (128-bit, VEX-encoded) SSSE3 binary ops.  Is2Addr = 0 selects the
// three-operand VEX assembly syntax; horizontal add/sub and sign ops are
// not commutable, hence the isCommutable = 0 region.
let ImmT = NoImm, Predicates = [HasAVX] in {
let isCommutable = 0 in {
  defm VPHADDW    : SS3I_binop_rm<0x01, "vphaddw", X86hadd, v8i16, VR128,
                                  loadv2i64, i128mem,
                                  SSE_PHADDSUBW, 0>, VEX_4V;
  defm VPHADDD    : SS3I_binop_rm<0x02, "vphaddd", X86hadd, v4i32, VR128,
                                  loadv2i64, i128mem,
                                  SSE_PHADDSUBD, 0>, VEX_4V;
  defm VPHSUBW    : SS3I_binop_rm<0x05, "vphsubw", X86hsub, v8i16, VR128,
                                  loadv2i64, i128mem,
                                  SSE_PHADDSUBW, 0>, VEX_4V;
  defm VPHSUBD    : SS3I_binop_rm<0x06, "vphsubd", X86hsub, v4i32, VR128,
                                  loadv2i64, i128mem,
                                  SSE_PHADDSUBD, 0>, VEX_4V;
  defm VPSIGNB    : SS3I_binop_rm<0x08, "vpsignb", X86psign, v16i8, VR128,
                                  loadv2i64, i128mem,
                                  SSE_PSIGN, 0>, VEX_4V;
  defm VPSIGNW    : SS3I_binop_rm<0x09, "vpsignw", X86psign, v8i16, VR128,
                                  loadv2i64, i128mem,
                                  SSE_PSIGN, 0>, VEX_4V;
  defm VPSIGND    : SS3I_binop_rm<0x0A, "vpsignd", X86psign, v4i32, VR128,
                                  loadv2i64, i128mem,
                                  SSE_PSIGN, 0>, VEX_4V;
  defm VPSHUFB    : SS3I_binop_rm<0x00, "vpshufb", X86pshufb, v16i8, VR128,
                                  loadv2i64, i128mem,
                                  SSE_PSHUFB, 0>, VEX_4V;
  defm VPHADDSW   : SS3I_binop_rm_int<0x03, "vphaddsw",
                                      int_x86_ssse3_phadd_sw_128,
                                      SSE_PHADDSUBSW, loadv2i64, 0>, VEX_4V;
  defm VPHSUBSW   : SS3I_binop_rm_int<0x07, "vphsubsw",
                                      int_x86_ssse3_phsub_sw_128,
                                      SSE_PHADDSUBSW, loadv2i64, 0>, VEX_4V;
  defm VPMADDUBSW : SS3I_binop_rm_int<0x04, "vpmaddubsw",
                                      int_x86_ssse3_pmadd_ub_sw_128,
                                      SSE_PMADD, loadv2i64, 0>, VEX_4V;
}
// VPMULHRSW is commutable, so it sits outside the isCommutable = 0 region.
defm VPMULHRSW    : SS3I_binop_rm_int<0x0B, "vpmulhrsw",
                                      int_x86_ssse3_pmul_hr_sw_128,
                                      SSE_PMULHRSW, loadv2i64, 0>, VEX_4V;
}
5656
// AVX2 (256-bit, VEX_L) SSSE3 binary ops, mirroring the 128-bit AVX
// definitions above.
// Itinerary fixes: VPHADDDY/VPHSUBDY previously reused SSE_PHADDSUBW and
// VPSIGN{B,W,D}Y reused SSE_PHADDSUBW -- copy-paste from the word variants.
// They now use SSE_PHADDSUBD and SSE_PSIGN respectively, matching both the
// 128-bit AVX definitions and the legacy SSE definitions below.
let ImmT = NoImm, Predicates = [HasAVX2] in {
let isCommutable = 0 in {
  defm VPHADDWY   : SS3I_binop_rm<0x01, "vphaddw", X86hadd, v16i16, VR256,
                                  loadv4i64, i256mem,
                                  SSE_PHADDSUBW, 0>, VEX_4V, VEX_L;
  defm VPHADDDY   : SS3I_binop_rm<0x02, "vphaddd", X86hadd, v8i32, VR256,
                                  loadv4i64, i256mem,
                                  SSE_PHADDSUBD, 0>, VEX_4V, VEX_L;
  defm VPHSUBWY   : SS3I_binop_rm<0x05, "vphsubw", X86hsub, v16i16, VR256,
                                  loadv4i64, i256mem,
                                  SSE_PHADDSUBW, 0>, VEX_4V, VEX_L;
  defm VPHSUBDY   : SS3I_binop_rm<0x06, "vphsubd", X86hsub, v8i32, VR256,
                                  loadv4i64, i256mem,
                                  SSE_PHADDSUBD, 0>, VEX_4V, VEX_L;
  defm VPSIGNBY   : SS3I_binop_rm<0x08, "vpsignb", X86psign, v32i8, VR256,
                                  loadv4i64, i256mem,
                                  SSE_PSIGN, 0>, VEX_4V, VEX_L;
  defm VPSIGNWY   : SS3I_binop_rm<0x09, "vpsignw", X86psign, v16i16, VR256,
                                  loadv4i64, i256mem,
                                  SSE_PSIGN, 0>, VEX_4V, VEX_L;
  defm VPSIGNDY   : SS3I_binop_rm<0x0A, "vpsignd", X86psign, v8i32, VR256,
                                  loadv4i64, i256mem,
                                  SSE_PSIGN, 0>, VEX_4V, VEX_L;
  defm VPSHUFBY   : SS3I_binop_rm<0x00, "vpshufb", X86pshufb, v32i8, VR256,
                                  loadv4i64, i256mem,
                                  SSE_PSHUFB, 0>, VEX_4V, VEX_L;
  defm VPHADDSW   : SS3I_binop_rm_int_y<0x03, "vphaddsw",
                                        int_x86_avx2_phadd_sw,
                                        WriteVecALU>, VEX_4V, VEX_L;
  defm VPHSUBSW   : SS3I_binop_rm_int_y<0x07, "vphsubsw",
                                        int_x86_avx2_phsub_sw,
                                        WriteVecALU>, VEX_4V, VEX_L;
  defm VPMADDUBSW : SS3I_binop_rm_int_y<0x04, "vpmaddubsw",
                                        int_x86_avx2_pmadd_ub_sw,
                                        WriteVecIMul>, VEX_4V, VEX_L;
}
// VPMULHRSWY is commutable, so it sits outside the isCommutable = 0 region.
defm VPMULHRSW    : SS3I_binop_rm_int_y<0x0B, "vpmulhrsw",
                                        int_x86_avx2_pmul_hr_sw,
                                        WriteVecIMul>, VEX_4V, VEX_L;
}
5697
// Legacy SSE encodings of the SSSE3 binary ops.  None of these have i8
// immediate fields; all use the two-address form (dst tied to src1).
let ImmT = NoImm, Constraints = "$src1 = $dst" in {
let isCommutable = 0 in {
  defm PHADDW    : SS3I_binop_rm<0x01, "phaddw", X86hadd, v8i16, VR128,
                                 memopv2i64, i128mem, SSE_PHADDSUBW>;
  defm PHADDD    : SS3I_binop_rm<0x02, "phaddd", X86hadd, v4i32, VR128,
                                 memopv2i64, i128mem, SSE_PHADDSUBD>;
  defm PHSUBW    : SS3I_binop_rm<0x05, "phsubw", X86hsub, v8i16, VR128,
                                 memopv2i64, i128mem, SSE_PHADDSUBW>;
  defm PHSUBD    : SS3I_binop_rm<0x06, "phsubd", X86hsub, v4i32, VR128,
                                 memopv2i64, i128mem, SSE_PHADDSUBD>;
  defm PSIGNB    : SS3I_binop_rm<0x08, "psignb", X86psign, v16i8, VR128,
                                 memopv2i64, i128mem, SSE_PSIGN>;
  defm PSIGNW    : SS3I_binop_rm<0x09, "psignw", X86psign, v8i16, VR128,
                                 memopv2i64, i128mem, SSE_PSIGN>;
  defm PSIGND    : SS3I_binop_rm<0x0A, "psignd", X86psign, v4i32, VR128,
                                 memopv2i64, i128mem, SSE_PSIGN>;
  defm PSHUFB    : SS3I_binop_rm<0x00, "pshufb", X86pshufb, v16i8, VR128,
                                 memopv2i64, i128mem, SSE_PSHUFB>;
  defm PHADDSW   : SS3I_binop_rm_int<0x03, "phaddsw",
                                     int_x86_ssse3_phadd_sw_128,
                                     SSE_PHADDSUBSW, memopv2i64>;
  defm PHSUBSW   : SS3I_binop_rm_int<0x07, "phsubsw",
                                     int_x86_ssse3_phsub_sw_128,
                                     SSE_PHADDSUBSW, memopv2i64>;
  defm PMADDUBSW : SS3I_binop_rm_int<0x04, "pmaddubsw",
                                     int_x86_ssse3_pmadd_ub_sw_128,
                                     SSE_PMADD, memopv2i64>;
}
// PMULHRSW is commutable, so it sits outside the isCommutable = 0 region.
defm PMULHRSW    : SS3I_binop_rm_int<0x0B, "pmulhrsw",
                                     int_x86_ssse3_pmul_hr_sw_128,
                                     SSE_PMULHRSW, memopv2i64>;
}
5731
5732//===---------------------------------------------------------------------===//
5733// SSSE3 - Packed Align Instruction Patterns
5734//===---------------------------------------------------------------------===//
5735
/// ssse3_palignr - 128-bit PALIGNR (byte-wise concatenate-and-shift by an
/// 8-bit immediate).  No selection patterns here ([]); codegen reaches
/// these through the explicit X86PAlignr Pats further down.
multiclass ssse3_palignr<string asm, bit Is2Addr = 1> {
  let hasSideEffects = 0 in {
  def R128rr : SS3AI<0x0F, MRMSrcReg, (outs VR128:$dst),
      (ins VR128:$src1, VR128:$src2, u8imm:$src3),
      !if(Is2Addr,
        !strconcat(asm, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
        !strconcat(asm,
                  "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
      [], IIC_SSE_PALIGNRR>, Sched<[WriteShuffle]>;
  let mayLoad = 1 in
  def R128rm : SS3AI<0x0F, MRMSrcMem, (outs VR128:$dst),
      (ins VR128:$src1, i128mem:$src2, u8imm:$src3),
      !if(Is2Addr,
        !strconcat(asm, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
        !strconcat(asm,
                  "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
      [], IIC_SSE_PALIGNRM>, Sched<[WriteShuffleLd, ReadAfterLd]>;
  }
}
5755
/// ssse3_palignr_y - 256-bit (AVX2) VPALIGNR.  Always three-operand VEX
/// syntax; like the 128-bit version, selection is via explicit Pats below.
multiclass ssse3_palignr_y<string asm, bit Is2Addr = 1> {
  let hasSideEffects = 0 in {
  def R256rr : SS3AI<0x0F, MRMSrcReg, (outs VR256:$dst),
      (ins VR256:$src1, VR256:$src2, u8imm:$src3),
      !strconcat(asm,
                 "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
      []>, Sched<[WriteShuffle]>;
  let mayLoad = 1 in
  def R256rm : SS3AI<0x0F, MRMSrcMem, (outs VR256:$dst),
      (ins VR256:$src1, i256mem:$src2, u8imm:$src3),
      !strconcat(asm,
                 "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
      []>, Sched<[WriteShuffleLd, ReadAfterLd]>;
  }
}
5771
// Instantiate PALIGNR: VEX 128-bit (AVX), VEX 256-bit (AVX2), and the
// legacy two-address SSE form.  Note the VEX forms pass Is2Addr = 0.
let Predicates = [HasAVX] in
  defm VPALIGN : ssse3_palignr<"vpalignr", 0>, VEX_4V;
let Predicates = [HasAVX2] in
  defm VPALIGN : ssse3_palignr_y<"vpalignr", 0>, VEX_4V, VEX_L;
let Constraints = "$src1 = $dst", Predicates = [UseSSSE3] in
  defm PALIGN : ssse3_palignr<"palignr">;
5778
// Selection patterns mapping the X86PAlignr DAG node onto the PALIGNR
// register-register instructions defined above, for every supported
// element type.  NoVLX_Or_NoBWI keeps these from colliding with the
// AVX-512 VL/BW encodings.
let Predicates = [HasAVX2, NoVLX_Or_NoBWI] in {
def : Pat<(v8i32 (X86PAlignr VR256:$src1, VR256:$src2, (i8 imm:$imm))),
          (VPALIGNR256rr VR256:$src1, VR256:$src2, imm:$imm)>;
def : Pat<(v8f32 (X86PAlignr VR256:$src1, VR256:$src2, (i8 imm:$imm))),
          (VPALIGNR256rr VR256:$src1, VR256:$src2, imm:$imm)>;
def : Pat<(v16i16 (X86PAlignr VR256:$src1, VR256:$src2, (i8 imm:$imm))),
          (VPALIGNR256rr VR256:$src1, VR256:$src2, imm:$imm)>;
def : Pat<(v32i8 (X86PAlignr VR256:$src1, VR256:$src2, (i8 imm:$imm))),
          (VPALIGNR256rr VR256:$src1, VR256:$src2, imm:$imm)>;
}

let Predicates = [HasAVX, NoVLX_Or_NoBWI] in {
def : Pat<(v4i32 (X86PAlignr VR128:$src1, VR128:$src2, (i8 imm:$imm))),
          (VPALIGNR128rr VR128:$src1, VR128:$src2, imm:$imm)>;
def : Pat<(v4f32 (X86PAlignr VR128:$src1, VR128:$src2, (i8 imm:$imm))),
          (VPALIGNR128rr VR128:$src1, VR128:$src2, imm:$imm)>;
def : Pat<(v8i16 (X86PAlignr VR128:$src1, VR128:$src2, (i8 imm:$imm))),
          (VPALIGNR128rr VR128:$src1, VR128:$src2, imm:$imm)>;
def : Pat<(v16i8 (X86PAlignr VR128:$src1, VR128:$src2, (i8 imm:$imm))),
          (VPALIGNR128rr VR128:$src1, VR128:$src2, imm:$imm)>;
}

let Predicates = [UseSSSE3] in {
def : Pat<(v4i32 (X86PAlignr VR128:$src1, VR128:$src2, (i8 imm:$imm))),
          (PALIGNR128rr VR128:$src1, VR128:$src2, imm:$imm)>;
def : Pat<(v4f32 (X86PAlignr VR128:$src1, VR128:$src2, (i8 imm:$imm))),
          (PALIGNR128rr VR128:$src1, VR128:$src2, imm:$imm)>;
def : Pat<(v8i16 (X86PAlignr VR128:$src1, VR128:$src2, (i8 imm:$imm))),
          (PALIGNR128rr VR128:$src1, VR128:$src2, imm:$imm)>;
def : Pat<(v16i8 (X86PAlignr VR128:$src1, VR128:$src2, (i8 imm:$imm))),
          (PALIGNR128rr VR128:$src1, VR128:$src2, imm:$imm)>;
}
5811
5812//===---------------------------------------------------------------------===//
// SSE3 - Thread synchronization
5814//===---------------------------------------------------------------------===//
5815
// MONITOR/MWAIT (SSE3).  The MONITOR pseudo is expanded by a custom
// inserter so the address operand ends up in the correct register;
// MONITORrrr/MWAITrr are the real encodings with their fixed implicit
// register uses.  The InstAliases below accept the explicit-operand
// spellings in both 32- and 64-bit modes.
let SchedRW = [WriteSystem] in {
let usesCustomInserter = 1 in {
def MONITOR : PseudoI<(outs), (ins i32mem:$src1, GR32:$src2, GR32:$src3),
                [(int_x86_sse3_monitor addr:$src1, GR32:$src2, GR32:$src3)]>,
                Requires<[HasSSE3]>;
}

let Uses = [EAX, ECX, EDX] in
def MONITORrrr : I<0x01, MRM_C8, (outs), (ins), "monitor", [], IIC_SSE_MONITOR>,
                 TB, Requires<[HasSSE3]>;
let Uses = [ECX, EAX] in
def MWAITrr   : I<0x01, MRM_C9, (outs), (ins), "mwait",
                [(int_x86_sse3_mwait ECX, EAX)], IIC_SSE_MWAIT>,
                TB, Requires<[HasSSE3]>;
} // SchedRW

def : InstAlias<"mwait\t{%eax, %ecx|ecx, eax}", (MWAITrr)>, Requires<[Not64BitMode]>;
def : InstAlias<"mwait\t{%rax, %rcx|rcx, rax}", (MWAITrr)>, Requires<[In64BitMode]>;

def : InstAlias<"monitor\t{%eax, %ecx, %edx|edx, ecx, eax}", (MONITORrrr)>,
      Requires<[Not64BitMode]>;
def : InstAlias<"monitor\t{%rax, %rcx, %rdx|rdx, rcx, rax}", (MONITORrrr)>,
      Requires<[In64BitMode]>;
5839
5840//===----------------------------------------------------------------------===//
5841// SSE4.1 - Packed Move with Sign/Zero Extend
5842//===----------------------------------------------------------------------===//
5843
/// SS41I_pmovx_rrrm - One PMOVSX/PMOVZX instruction pair (reg and mem
/// forms).  Both have empty pattern lists; selection is handled by the
/// SS41I_pmovx_*_patterns multiclasses later in this section.
multiclass SS41I_pmovx_rrrm<bits<8> opc, string OpcodeStr, X86MemOperand MemOp,
                          RegisterClass OutRC, RegisterClass InRC,
                          OpndItins itins> {
  def rr : SS48I<opc, MRMSrcReg, (outs OutRC:$dst), (ins InRC:$src),
                 !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
                 [], itins.rr>,
                 Sched<[itins.Sched]>;

  def rm : SS48I<opc, MRMSrcMem, (outs OutRC:$dst), (ins MemOp:$src),
                 !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
                 [],
                 itins.rm>, Sched<[itins.Sched.Folded]>;
}
5857
/// SS41I_pmovx_rm_all - Instantiate one pmovx opcode at all three encoding
/// levels: legacy SSE4.1 (NAME), VEX 128-bit (V#NAME), and VEX 256-bit
/// (V#NAME#Y).  '#' is TableGen string concatenation on the defm name.
multiclass SS41I_pmovx_rm_all<bits<8> opc, string OpcodeStr,
                          X86MemOperand MemOp, X86MemOperand MemYOp,
                          OpndItins SSEItins, OpndItins AVXItins,
                          OpndItins AVX2Itins> {
  defm NAME : SS41I_pmovx_rrrm<opc, OpcodeStr, MemOp, VR128, VR128, SSEItins>;
  let Predicates = [HasAVX, NoVLX] in
    defm V#NAME   : SS41I_pmovx_rrrm<opc, !strconcat("v", OpcodeStr), MemOp,
                                     VR128, VR128, AVXItins>, VEX;
  let Predicates = [HasAVX2, NoVLX] in
    defm V#NAME#Y : SS41I_pmovx_rrrm<opc, !strconcat("v", OpcodeStr), MemYOp,
                                     VR256, VR128, AVX2Itins>, VEX, VEX_L;
}
5870
/// SS41I_pmovx_rm - Instantiate both the sign-extend (PMOVSX, opcode opc)
/// and zero-extend (PMOVZX, opcode opc + 0x10) families for one element
/// widening (bw/bd/bq/wd/wq/dq).
multiclass SS41I_pmovx_rm<bits<8> opc, string OpcodeStr,
                                X86MemOperand MemOp, X86MemOperand MemYOp> {
  defm PMOVSX#NAME : SS41I_pmovx_rm_all<opc, !strconcat("pmovsx", OpcodeStr),
                                        MemOp, MemYOp,
                                        SSE_INTALU_ITINS_SHUFF_P,
                                        DEFAULT_ITINS_SHUFFLESCHED,
                                        DEFAULT_ITINS_SHUFFLESCHED>;
  defm PMOVZX#NAME : SS41I_pmovx_rm_all<!add(opc, 0x10),
                                        !strconcat("pmovzx", OpcodeStr),
                                        MemOp, MemYOp,
                                        SSE_INTALU_ITINS_SHUFF_P,
                                        DEFAULT_ITINS_SHUFFLESCHED,
                                        DEFAULT_ITINS_SHUFFLESCHED>;
}
5885
// All six pmovx widenings.  The 128-bit memory operand size shrinks with
// the widening factor: x2 reads 64 bits, x4 reads 32, x8 reads 16.
defm BW : SS41I_pmovx_rm<0x20, "bw", i64mem, i128mem>;
defm WD : SS41I_pmovx_rm<0x23, "wd", i64mem, i128mem>;
defm DQ : SS41I_pmovx_rm<0x25, "dq", i64mem, i128mem>;

defm BD : SS41I_pmovx_rm<0x21, "bd", i32mem, i64mem>;
defm WQ : SS41I_pmovx_rm<0x24, "wq", i32mem, i64mem>;

defm BQ : SS41I_pmovx_rm<0x22, "bq", i16mem, i32mem>;
5894
// AVX2 Patterns
// Maps the X86vsext/X86vzext nodes (and the named extending-load PatFrags,
// looked up as ExtTy # "extloadvi<N>") onto the 256-bit VPMOVSX*/VPMOVZX*
// instructions.  OpcPrefix is "VPMOVSX" or "VPMOVZX"; ExtTy is "s" or "z".
// Cleanup: the BW/WD/DQ memory groups each ended with an exact duplicate of
// their leading (bc_vXiY (loadv2i64 ...)) pattern; the redundant copies
// have been removed (they only produced duplicate matcher entries).
multiclass SS41I_pmovx_avx2_patterns<string OpcPrefix, string ExtTy, SDNode ExtOp> {
  // Register-Register patterns
  def : Pat<(v16i16 (ExtOp (v16i8 VR128:$src))),
            (!cast<I>(OpcPrefix#BWYrr) VR128:$src)>;
  def : Pat<(v8i32 (ExtOp (v16i8 VR128:$src))),
            (!cast<I>(OpcPrefix#BDYrr) VR128:$src)>;
  def : Pat<(v4i64 (ExtOp (v16i8 VR128:$src))),
            (!cast<I>(OpcPrefix#BQYrr) VR128:$src)>;

  def : Pat<(v8i32 (ExtOp (v8i16 VR128:$src))),
            (!cast<I>(OpcPrefix#WDYrr) VR128:$src)>;
  def : Pat<(v4i64 (ExtOp (v8i16 VR128:$src))),
            (!cast<I>(OpcPrefix#WQYrr) VR128:$src)>;

  def : Pat<(v4i64 (ExtOp (v4i32 VR128:$src))),
            (!cast<I>(OpcPrefix#DQYrr) VR128:$src)>;

  // On AVX2, we also support 256bit inputs.
  // The extend only consumes the low 128 bits of the source.
  def : Pat<(v16i16 (ExtOp (v32i8 VR256:$src))),
            (!cast<I>(OpcPrefix#BWYrr) (EXTRACT_SUBREG VR256:$src, sub_xmm))>;
  def : Pat<(v8i32 (ExtOp (v32i8 VR256:$src))),
            (!cast<I>(OpcPrefix#BDYrr) (EXTRACT_SUBREG VR256:$src, sub_xmm))>;
  def : Pat<(v4i64 (ExtOp (v32i8 VR256:$src))),
            (!cast<I>(OpcPrefix#BQYrr) (EXTRACT_SUBREG VR256:$src, sub_xmm))>;

  def : Pat<(v8i32 (ExtOp (v16i16 VR256:$src))),
            (!cast<I>(OpcPrefix#WDYrr) (EXTRACT_SUBREG VR256:$src, sub_xmm))>;
  def : Pat<(v4i64 (ExtOp (v16i16 VR256:$src))),
            (!cast<I>(OpcPrefix#WQYrr) (EXTRACT_SUBREG VR256:$src, sub_xmm))>;

  def : Pat<(v4i64 (ExtOp (v8i32 VR256:$src))),
            (!cast<I>(OpcPrefix#DQYrr) (EXTRACT_SUBREG VR256:$src, sub_xmm))>;

  // Simple Register-Memory patterns
  def : Pat<(v16i16 (!cast<PatFrag>(ExtTy#"extloadvi8") addr:$src)),
            (!cast<I>(OpcPrefix#BWYrm) addr:$src)>;
  def : Pat<(v8i32 (!cast<PatFrag>(ExtTy#"extloadvi8") addr:$src)),
            (!cast<I>(OpcPrefix#BDYrm) addr:$src)>;
  def : Pat<(v4i64 (!cast<PatFrag>(ExtTy#"extloadvi8") addr:$src)),
            (!cast<I>(OpcPrefix#BQYrm) addr:$src)>;

  def : Pat<(v8i32 (!cast<PatFrag>(ExtTy#"extloadvi16") addr:$src)),
            (!cast<I>(OpcPrefix#WDYrm) addr:$src)>;
  def : Pat<(v4i64 (!cast<PatFrag>(ExtTy#"extloadvi16") addr:$src)),
            (!cast<I>(OpcPrefix#WQYrm) addr:$src)>;

  def : Pat<(v4i64 (!cast<PatFrag>(ExtTy#"extloadvi32") addr:$src)),
            (!cast<I>(OpcPrefix#DQYrm) addr:$src)>;

  // AVX2 Register-Memory patterns
  def : Pat<(v16i16 (ExtOp (bc_v16i8 (loadv2i64 addr:$src)))),
            (!cast<I>(OpcPrefix#BWYrm) addr:$src)>;
  def : Pat<(v16i16 (ExtOp (v16i8 (vzmovl_v2i64 addr:$src)))),
            (!cast<I>(OpcPrefix#BWYrm) addr:$src)>;
  def : Pat<(v16i16 (ExtOp (v16i8 (vzload_v2i64 addr:$src)))),
            (!cast<I>(OpcPrefix#BWYrm) addr:$src)>;

  def : Pat<(v8i32 (ExtOp (bc_v16i8 (v2i64 (scalar_to_vector (loadi64 addr:$src)))))),
            (!cast<I>(OpcPrefix#BDYrm) addr:$src)>;
  def : Pat<(v8i32 (ExtOp (v16i8 (vzmovl_v2i64 addr:$src)))),
            (!cast<I>(OpcPrefix#BDYrm) addr:$src)>;
  def : Pat<(v8i32 (ExtOp (v16i8 (vzload_v2i64 addr:$src)))),
            (!cast<I>(OpcPrefix#BDYrm) addr:$src)>;
  def : Pat<(v8i32 (ExtOp (bc_v16i8 (loadv2i64 addr:$src)))),
            (!cast<I>(OpcPrefix#BDYrm) addr:$src)>;

  def : Pat<(v4i64 (ExtOp (bc_v16i8 (v4i32 (scalar_to_vector (loadi32 addr:$src)))))),
            (!cast<I>(OpcPrefix#BQYrm) addr:$src)>;
  def : Pat<(v4i64 (ExtOp (v16i8 (vzmovl_v4i32 addr:$src)))),
            (!cast<I>(OpcPrefix#BQYrm) addr:$src)>;
  def : Pat<(v4i64 (ExtOp (v16i8 (vzload_v2i64 addr:$src)))),
            (!cast<I>(OpcPrefix#BQYrm) addr:$src)>;
  def : Pat<(v4i64 (ExtOp (bc_v16i8 (loadv2i64 addr:$src)))),
            (!cast<I>(OpcPrefix#BQYrm) addr:$src)>;

  def : Pat<(v8i32 (ExtOp (bc_v8i16 (loadv2i64 addr:$src)))),
            (!cast<I>(OpcPrefix#WDYrm) addr:$src)>;
  def : Pat<(v8i32 (ExtOp (v8i16 (vzmovl_v2i64 addr:$src)))),
            (!cast<I>(OpcPrefix#WDYrm) addr:$src)>;
  def : Pat<(v8i32 (ExtOp (v8i16 (vzload_v2i64 addr:$src)))),
            (!cast<I>(OpcPrefix#WDYrm) addr:$src)>;

  def : Pat<(v4i64 (ExtOp (bc_v8i16 (v2i64 (scalar_to_vector (loadi64 addr:$src)))))),
            (!cast<I>(OpcPrefix#WQYrm) addr:$src)>;
  def : Pat<(v4i64 (ExtOp (v8i16 (vzmovl_v2i64 addr:$src)))),
            (!cast<I>(OpcPrefix#WQYrm) addr:$src)>;
  def : Pat<(v4i64 (ExtOp (v8i16 (vzload_v2i64 addr:$src)))),
            (!cast<I>(OpcPrefix#WQYrm) addr:$src)>;
  def : Pat<(v4i64 (ExtOp (bc_v8i16 (loadv2i64 addr:$src)))),
            (!cast<I>(OpcPrefix#WQYrm) addr:$src)>;

  def : Pat<(v4i64 (ExtOp (bc_v4i32 (loadv2i64 addr:$src)))),
            (!cast<I>(OpcPrefix#DQYrm) addr:$src)>;
  def : Pat<(v4i64 (ExtOp (v4i32 (vzmovl_v2i64 addr:$src)))),
            (!cast<I>(OpcPrefix#DQYrm) addr:$src)>;
  def : Pat<(v4i64 (ExtOp (v4i32 (vzload_v2i64 addr:$src)))),
            (!cast<I>(OpcPrefix#DQYrm) addr:$src)>;
}
6000
// Instantiate the AVX2 extend patterns for both sign and zero extension.
let Predicates = [HasAVX2, NoVLX] in {
  defm : SS41I_pmovx_avx2_patterns<"VPMOVSX", "s", X86vsext>;
  defm : SS41I_pmovx_avx2_patterns<"VPMOVZX", "z", X86vzext>;
}
6005
// SSE4.1/AVX patterns.
// 128-bit analogue of the AVX2 pattern multiclass above: maps
// X86vsext/X86vzext and the named extending-load PatFrags onto the
// PMOVSX*/PMOVZX* (or VEX V-prefixed) instructions.  ExtLoad16 is the
// 16-bit scalar-load fragment, which differs between sign and zero
// extension (see the instantiations below).
multiclass SS41I_pmovx_patterns<string OpcPrefix, string ExtTy,
                                SDNode ExtOp, PatFrag ExtLoad16> {
  // Register-register forms.
  def : Pat<(v8i16 (ExtOp (v16i8 VR128:$src))),
            (!cast<I>(OpcPrefix#BWrr) VR128:$src)>;
  def : Pat<(v4i32 (ExtOp (v16i8 VR128:$src))),
            (!cast<I>(OpcPrefix#BDrr) VR128:$src)>;
  def : Pat<(v2i64 (ExtOp (v16i8 VR128:$src))),
            (!cast<I>(OpcPrefix#BQrr) VR128:$src)>;

  def : Pat<(v4i32 (ExtOp (v8i16 VR128:$src))),
            (!cast<I>(OpcPrefix#WDrr) VR128:$src)>;
  def : Pat<(v2i64 (ExtOp (v8i16 VR128:$src))),
            (!cast<I>(OpcPrefix#WQrr) VR128:$src)>;

  def : Pat<(v2i64 (ExtOp (v4i32 VR128:$src))),
            (!cast<I>(OpcPrefix#DQrr) VR128:$src)>;

  // Simple register-memory forms via the extending-load fragments.
  def : Pat<(v8i16 (!cast<PatFrag>(ExtTy#"extloadvi8") addr:$src)),
            (!cast<I>(OpcPrefix#BWrm) addr:$src)>;
  def : Pat<(v4i32 (!cast<PatFrag>(ExtTy#"extloadvi8") addr:$src)),
            (!cast<I>(OpcPrefix#BDrm) addr:$src)>;
  def : Pat<(v2i64 (!cast<PatFrag>(ExtTy#"extloadvi8") addr:$src)),
            (!cast<I>(OpcPrefix#BQrm) addr:$src)>;

  def : Pat<(v4i32 (!cast<PatFrag>(ExtTy#"extloadvi16") addr:$src)),
            (!cast<I>(OpcPrefix#WDrm) addr:$src)>;
  def : Pat<(v2i64 (!cast<PatFrag>(ExtTy#"extloadvi16") addr:$src)),
            (!cast<I>(OpcPrefix#WQrm) addr:$src)>;

  def : Pat<(v2i64 (!cast<PatFrag>(ExtTy#"extloadvi32") addr:$src)),
            (!cast<I>(OpcPrefix#DQrm) addr:$src)>;

  // Fold the various narrow-load / zero-extended-load shapes that can feed
  // the extend: scalar_to_vector of a narrow load, vzmovl/vzload, and a
  // full 128-bit load (bitcast to the element type).
  def : Pat<(v8i16 (ExtOp (bc_v16i8 (v2i64 (scalar_to_vector (loadi64 addr:$src)))))),
            (!cast<I>(OpcPrefix#BWrm) addr:$src)>;
  def : Pat<(v8i16 (ExtOp (bc_v16i8 (v2f64 (scalar_to_vector (loadf64 addr:$src)))))),
            (!cast<I>(OpcPrefix#BWrm) addr:$src)>;
  def : Pat<(v8i16 (ExtOp (v16i8 (vzmovl_v2i64 addr:$src)))),
            (!cast<I>(OpcPrefix#BWrm) addr:$src)>;
  def : Pat<(v8i16 (ExtOp (v16i8 (vzload_v2i64 addr:$src)))),
            (!cast<I>(OpcPrefix#BWrm) addr:$src)>;
  def : Pat<(v8i16 (ExtOp (bc_v16i8 (loadv2i64 addr:$src)))),
            (!cast<I>(OpcPrefix#BWrm) addr:$src)>;

  def : Pat<(v4i32 (ExtOp (bc_v16i8 (v4i32 (scalar_to_vector (loadi32 addr:$src)))))),
            (!cast<I>(OpcPrefix#BDrm) addr:$src)>;
  def : Pat<(v4i32 (ExtOp (v16i8 (vzmovl_v4i32 addr:$src)))),
            (!cast<I>(OpcPrefix#BDrm) addr:$src)>;
  def : Pat<(v4i32 (ExtOp (v16i8 (vzload_v2i64 addr:$src)))),
            (!cast<I>(OpcPrefix#BDrm) addr:$src)>;
  def : Pat<(v4i32 (ExtOp (bc_v16i8 (loadv2i64 addr:$src)))),
            (!cast<I>(OpcPrefix#BDrm) addr:$src)>;

  def : Pat<(v2i64 (ExtOp (bc_v16i8 (v4i32 (scalar_to_vector (ExtLoad16 addr:$src)))))),
            (!cast<I>(OpcPrefix#BQrm) addr:$src)>;
  def : Pat<(v2i64 (ExtOp (v16i8 (vzmovl_v4i32 addr:$src)))),
            (!cast<I>(OpcPrefix#BQrm) addr:$src)>;
  def : Pat<(v2i64 (ExtOp (v16i8 (vzload_v2i64 addr:$src)))),
            (!cast<I>(OpcPrefix#BQrm) addr:$src)>;
  def : Pat<(v2i64 (ExtOp (bc_v16i8 (loadv2i64 addr:$src)))),
            (!cast<I>(OpcPrefix#BQrm) addr:$src)>;

  def : Pat<(v4i32 (ExtOp (bc_v8i16 (v2i64 (scalar_to_vector (loadi64 addr:$src)))))),
            (!cast<I>(OpcPrefix#WDrm) addr:$src)>;
  def : Pat<(v4i32 (ExtOp (bc_v8i16 (v2f64 (scalar_to_vector (loadf64 addr:$src)))))),
            (!cast<I>(OpcPrefix#WDrm) addr:$src)>;
  def : Pat<(v4i32 (ExtOp (v8i16 (vzmovl_v2i64 addr:$src)))),
            (!cast<I>(OpcPrefix#WDrm) addr:$src)>;
  def : Pat<(v4i32 (ExtOp (v8i16 (vzload_v2i64 addr:$src)))),
            (!cast<I>(OpcPrefix#WDrm) addr:$src)>;
  def : Pat<(v4i32 (ExtOp (bc_v8i16 (loadv2i64 addr:$src)))),
            (!cast<I>(OpcPrefix#WDrm) addr:$src)>;

  def : Pat<(v2i64 (ExtOp (bc_v8i16 (v4i32 (scalar_to_vector (loadi32 addr:$src)))))),
            (!cast<I>(OpcPrefix#WQrm) addr:$src)>;
  def : Pat<(v2i64 (ExtOp (v8i16 (vzmovl_v4i32 addr:$src)))),
            (!cast<I>(OpcPrefix#WQrm) addr:$src)>;
  def : Pat<(v2i64 (ExtOp (v8i16 (vzload_v2i64 addr:$src)))),
            (!cast<I>(OpcPrefix#WQrm) addr:$src)>;
  def : Pat<(v2i64 (ExtOp (bc_v8i16 (loadv2i64 addr:$src)))),
            (!cast<I>(OpcPrefix#WQrm) addr:$src)>;

  def : Pat<(v2i64 (ExtOp (bc_v4i32 (v2i64 (scalar_to_vector (loadi64 addr:$src)))))),
            (!cast<I>(OpcPrefix#DQrm) addr:$src)>;
  def : Pat<(v2i64 (ExtOp (bc_v4i32 (v2f64 (scalar_to_vector (loadf64 addr:$src)))))),
            (!cast<I>(OpcPrefix#DQrm) addr:$src)>;
  def : Pat<(v2i64 (ExtOp (v4i32 (vzmovl_v2i64 addr:$src)))),
            (!cast<I>(OpcPrefix#DQrm) addr:$src)>;
  def : Pat<(v2i64 (ExtOp (v4i32 (vzload_v2i64 addr:$src)))),
            (!cast<I>(OpcPrefix#DQrm) addr:$src)>;
  def : Pat<(v2i64 (ExtOp (bc_v4i32 (loadv2i64 addr:$src)))),
            (!cast<I>(OpcPrefix#DQrm) addr:$src)>;
}
6099
// Instantiate 128-bit extend patterns for VEX (AVX) and legacy SSE4.1.
// Sign-extend uses extloadi32i16 for the 16-bit BQ load; zero-extend uses
// loadi16_anyext.
let Predicates = [HasAVX, NoVLX] in {
  defm : SS41I_pmovx_patterns<"VPMOVSX", "s", X86vsext, extloadi32i16>;
  defm : SS41I_pmovx_patterns<"VPMOVZX", "z", X86vzext, loadi16_anyext>;
}

let Predicates = [UseSSE41] in {
  defm : SS41I_pmovx_patterns<"PMOVSX", "s", X86vsext, extloadi32i16>;
  defm : SS41I_pmovx_patterns<"PMOVZX", "z", X86vzext, loadi16_anyext>;
}
6109
6110//===----------------------------------------------------------------------===//
6111// SSE4.1 - Extract Instructions
6112//===----------------------------------------------------------------------===//
6113
/// SS41I_extract8 - SSE 4.1 extract 8 bits to 32 bit reg or 8 bit mem.
/// The rr form zero-extends the extracted byte into GR32/GR64; the mr form
/// truncates the (assertzext'd) value back to 8 bits for the store.
multiclass SS41I_extract8<bits<8> opc, string OpcodeStr> {
  def rr : SS4AIi8<opc, MRMDestReg, (outs GR32orGR64:$dst),
                 (ins VR128:$src1, u8imm:$src2),
                 !strconcat(OpcodeStr,
                            "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
                 [(set GR32orGR64:$dst, (X86pextrb (v16i8 VR128:$src1),
                                         imm:$src2))]>,
                  Sched<[WriteShuffle]>;
  let hasSideEffects = 0, mayStore = 1,
      SchedRW = [WriteShuffleLd, WriteRMW] in
  def mr : SS4AIi8<opc, MRMDestMem, (outs),
                 (ins i8mem:$dst, VR128:$src1, u8imm:$src2),
                 !strconcat(OpcodeStr,
                            "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
                 [(store (i8 (trunc (assertzext (X86pextrb (v16i8 VR128:$src1),
                                                 imm:$src2)))), addr:$dst)]>;
}

// NoBWI: AVX-512 BW provides its own EVEX encoding of vpextrb.
let Predicates = [HasAVX, NoBWI] in
  defm VPEXTRB : SS41I_extract8<0x14, "vpextrb">, VEX;

defm PEXTRB      : SS41I_extract8<0x14, "pextrb">;
6137
6138
/// SS41I_extract16 - SSE 4.1 extract 16 bits to memory destination.
/// The register form duplicates SSE2 PEXTRW, so it is isCodeGenOnly and
/// kept only so the disassembler can decode this encoding.
multiclass SS41I_extract16<bits<8> opc, string OpcodeStr> {
  let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0 in
  def rr_REV : SS4AIi8<opc, MRMDestReg, (outs GR32orGR64:$dst),
                   (ins VR128:$src1, u8imm:$src2),
                   !strconcat(OpcodeStr,
                   "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
                   []>, Sched<[WriteShuffle]>;

  let hasSideEffects = 0, mayStore = 1,
      SchedRW = [WriteShuffleLd, WriteRMW] in
  def mr : SS4AIi8<opc, MRMDestMem, (outs),
                 (ins i16mem:$dst, VR128:$src1, u8imm:$src2),
                 !strconcat(OpcodeStr,
                  "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
                 [(store (i16 (trunc (assertzext (X86pextrw (v8i16 VR128:$src1),
                                                  imm:$src2)))), addr:$dst)]>;
}

// NoBWI: AVX-512 BW provides its own EVEX encoding of vpextrw.
let Predicates = [HasAVX, NoBWI] in
  defm VPEXTRW : SS41I_extract16<0x15, "vpextrw">, VEX;

defm PEXTRW      : SS41I_extract16<0x15, "pextrw">;
6162
6163
6164/// SS41I_extract32 - SSE 4.1 extract 32 bits to int reg or memory destination
6165multiclass SS41I_extract32<bits<8> opc, string OpcodeStr> {
  // Register form: plain extractelt of a v4i32 lane — no zero-extension
  // wrapper needed since the element is already i32.
  def rr : SS4AIi8<opc, MRMDestReg, (outs GR32:$dst),
                 (ins VR128:$src1, u8imm:$src2),
                 !strconcat(OpcodeStr,
                  "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
                 [(set GR32:$dst,
                  (extractelt (v4i32 VR128:$src1), imm:$src2))]>,
                  Sched<[WriteShuffle]>;
  // Memory form: store the extracted dword.
  let SchedRW = [WriteShuffleLd, WriteRMW] in
  def mr : SS4AIi8<opc, MRMDestMem, (outs),
                 (ins i32mem:$dst, VR128:$src1, u8imm:$src2),
                 !strconcat(OpcodeStr,
                  "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
                 [(store (extractelt (v4i32 VR128:$src1), imm:$src2),
                          addr:$dst)]>;
}

// VEX form, gated off when AVX-512 DQ is available (NoDQI).
let Predicates = [HasAVX, NoDQI] in
  defm VPEXTRD : SS41I_extract32<0x16, "vpextrd">, VEX;

defm PEXTRD      : SS41I_extract32<0x16, "pextrd">;
6186
6187/// SS41I_extract64 - SSE 4.1 extract 64 bits to 64 bit reg or memory destination
6188multiclass SS41I_extract64<bits<8> opc, string OpcodeStr> {
  // Register form: extract a v2i64 lane into a 64-bit GPR. REX_W selects the
  // 64-bit operand-size encoding.
  def rr : SS4AIi8<opc, MRMDestReg, (outs GR64:$dst),
                 (ins VR128:$src1, u8imm:$src2),
                 !strconcat(OpcodeStr,
                  "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
                 [(set GR64:$dst,
                  (extractelt (v2i64 VR128:$src1), imm:$src2))]>,
                  Sched<[WriteShuffle]>, REX_W;
  // Memory form: store the extracted qword.
  let SchedRW = [WriteShuffleLd, WriteRMW] in
  def mr : SS4AIi8<opc, MRMDestMem, (outs),
                 (ins i64mem:$dst, VR128:$src1, u8imm:$src2),
                 !strconcat(OpcodeStr,
                  "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
                 [(store (extractelt (v2i64 VR128:$src1), imm:$src2),
                          addr:$dst)]>, REX_W;
}

// Same 0x16 opcode as PEXTRD; VEX_W/REX_W distinguishes the 64-bit form.
let Predicates = [HasAVX, NoDQI] in
  defm VPEXTRQ : SS41I_extract64<0x16, "vpextrq">, VEX, VEX_W;

defm PEXTRQ      : SS41I_extract64<0x16, "pextrq">;
6209
6210/// SS41I_extractf32 - SSE 4.1 extract 32 bits fp value to int reg or memory
6211/// destination
6212multiclass SS41I_extractf32<bits<8> opc, string OpcodeStr,
                            OpndItins itins = DEFAULT_ITINS> {
  // Register form: the f32 lane is extracted as its i32 bit pattern
  // (bc_v4i32 bitcast), so the destination is a GPR, not an FP register.
  def rr : SS4AIi8<opc, MRMDestReg, (outs GR32orGR64:$dst),
                 (ins VR128:$src1, u8imm:$src2),
                 !strconcat(OpcodeStr,
                  "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
                 [(set GR32orGR64:$dst,
                    (extractelt (bc_v4i32 (v4f32 VR128:$src1)), imm:$src2))],
                    itins.rr>, Sched<[WriteFBlend]>;
  // Memory form: store the extracted 32-bit value.
  let SchedRW = [WriteFBlendLd, WriteRMW] in
  def mr : SS4AIi8<opc, MRMDestMem, (outs),
                 (ins f32mem:$dst, VR128:$src1, u8imm:$src2),
                 !strconcat(OpcodeStr,
                  "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
                 [(store (extractelt (bc_v4i32 (v4f32 VR128:$src1)), imm:$src2),
                          addr:$dst)], itins.rm>;
}

let ExeDomain = SSEPackedSingle in {
  let Predicates = [UseAVX] in
    defm VEXTRACTPS : SS41I_extractf32<0x17, "vextractps">, VEX;
  defm EXTRACTPS   : SS41I_extractf32<0x17, "extractps", SSE_EXTRACT_ITINS>;
}
6235
6236// Also match an EXTRACTPS store when the store is done as f32 instead of i32.
// Two patterns are needed because the AVX- and SSE-encoded instructions are
// selected under mutually exclusive predicates (HasAVX vs UseSSE41).
def : Pat<(store (f32 (bitconvert (extractelt (bc_v4i32 (v4f32 VR128:$src1)),
                                              imm:$src2))),
                 addr:$dst),
          (VEXTRACTPSmr addr:$dst, VR128:$src1, imm:$src2)>,
          Requires<[HasAVX]>;
def : Pat<(store (f32 (bitconvert (extractelt (bc_v4i32 (v4f32 VR128:$src1)),
                                              imm:$src2))),
                 addr:$dst),
          (EXTRACTPSmr addr:$dst, VR128:$src1, imm:$src2)>,
          Requires<[UseSSE41]>;
6247
6248//===----------------------------------------------------------------------===//
6249// SSE4.1 - Insert Instructions
6250//===----------------------------------------------------------------------===//
6251
6252/// SS41I_insert8 - SSE 4.1 insert a byte from a GPR or memory into a vector.
/// Is2Addr selects the 2-operand SSE asm string vs the 3-operand VEX one.
multiclass SS41I_insert8<bits<8> opc, string asm, bit Is2Addr = 1> {
  // Register source: insert the low byte of a 32/64-bit GPR (X86pinsrb).
  def rr : SS4AIi8<opc, MRMSrcReg, (outs VR128:$dst),
      (ins VR128:$src1, GR32orGR64:$src2, u8imm:$src3),
      !if(Is2Addr,
        !strconcat(asm, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
        !strconcat(asm,
                   "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
      [(set VR128:$dst,
        (X86pinsrb VR128:$src1, GR32orGR64:$src2, imm:$src3))]>,
      Sched<[WriteShuffle]>;
  // Memory source: folds an any-extending i8 load into the insert.
  def rm : SS4AIi8<opc, MRMSrcMem, (outs VR128:$dst),
      (ins VR128:$src1, i8mem:$src2, u8imm:$src3),
      !if(Is2Addr,
        !strconcat(asm, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
        !strconcat(asm,
                   "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
      [(set VR128:$dst,
        (X86pinsrb VR128:$src1, (extloadi8 addr:$src2),
                   imm:$src3))]>, Sched<[WriteShuffleLd, ReadAfterLd]>;
}

let Predicates = [HasAVX, NoBWI] in
  defm VPINSRB : SS41I_insert8<0x20, "vpinsrb", 0>, VEX_4V;
// SSE form is destructive: source 1 is tied to the destination.
let Constraints = "$src1 = $dst" in
  defm PINSRB  : SS41I_insert8<0x20, "pinsrb">;
6277
6278/// SS41I_insert32 - SSE 4.1 insert a dword from a GPR or memory into a vector.
multiclass SS41I_insert32<bits<8> opc, string asm, bit Is2Addr = 1> {
  // Register source: plain v4i32 insertelt (no custom node needed for i32).
  def rr : SS4AIi8<opc, MRMSrcReg, (outs VR128:$dst),
      (ins VR128:$src1, GR32:$src2, u8imm:$src3),
      !if(Is2Addr,
        !strconcat(asm, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
        !strconcat(asm,
                   "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
      [(set VR128:$dst,
        (v4i32 (insertelt VR128:$src1, GR32:$src2, imm:$src3)))]>,
      Sched<[WriteShuffle]>;
  // Memory source: folds an i32 load into the insert.
  def rm : SS4AIi8<opc, MRMSrcMem, (outs VR128:$dst),
      (ins VR128:$src1, i32mem:$src2, u8imm:$src3),
      !if(Is2Addr,
        !strconcat(asm, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
        !strconcat(asm,
                   "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
      [(set VR128:$dst,
        (v4i32 (insertelt VR128:$src1, (loadi32 addr:$src2),
                          imm:$src3)))]>, Sched<[WriteShuffleLd, ReadAfterLd]>;
}

let Predicates = [HasAVX, NoDQI] in
  defm VPINSRD : SS41I_insert32<0x22, "vpinsrd", 0>, VEX_4V;
// SSE form is destructive: source 1 is tied to the destination.
let Constraints = "$src1 = $dst" in
  defm PINSRD : SS41I_insert32<0x22, "pinsrd">;
6303
6304/// SS41I_insert64 - SSE 4.1 insert a qword from a GPR or memory into a vector.
/// Shares opcode 0x22 with PINSRD; VEX_W/REX_W selects the 64-bit form.
multiclass SS41I_insert64<bits<8> opc, string asm, bit Is2Addr = 1> {
  // Register source: v2i64 insertelt from a 64-bit GPR.
  def rr : SS4AIi8<opc, MRMSrcReg, (outs VR128:$dst),
      (ins VR128:$src1, GR64:$src2, u8imm:$src3),
      !if(Is2Addr,
        !strconcat(asm, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
        !strconcat(asm,
                   "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
      [(set VR128:$dst,
        (v2i64 (insertelt VR128:$src1, GR64:$src2, imm:$src3)))]>,
      Sched<[WriteShuffle]>;
  // Memory source: folds an i64 load into the insert.
  def rm : SS4AIi8<opc, MRMSrcMem, (outs VR128:$dst),
      (ins VR128:$src1, i64mem:$src2, u8imm:$src3),
      !if(Is2Addr,
        !strconcat(asm, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
        !strconcat(asm,
                   "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
      [(set VR128:$dst,
        (v2i64 (insertelt VR128:$src1, (loadi64 addr:$src2),
                          imm:$src3)))]>, Sched<[WriteShuffleLd, ReadAfterLd]>;
}

let Predicates = [HasAVX, NoDQI] in
  defm VPINSRQ : SS41I_insert64<0x22, "vpinsrq", 0>, VEX_4V, VEX_W;
// SSE form is destructive: source 1 is tied to the destination.
let Constraints = "$src1 = $dst" in
  defm PINSRQ : SS41I_insert64<0x22, "pinsrq">, REX_W;
6329
6330// insertps has a few different modes, there's the first two here below which
6331// are optimized inserts that won't zero arbitrary elements in the destination
6332// vector. The next one matches the intrinsic and could zero arbitrary elements
6333// in the target vector.
6334multiclass SS41I_insertf32<bits<8> opc, string asm, bit Is2Addr = 1,
                           OpndItins itins = DEFAULT_ITINS> {
  // Register source: matched through the X86insertps node, whose imm
  // carries the full INSERTPS source/dest/zero-mask encoding.
  def rr : SS4AIi8<opc, MRMSrcReg, (outs VR128:$dst),
      (ins VR128:$src1, VR128:$src2, u8imm:$src3),
      !if(Is2Addr,
        !strconcat(asm, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
        !strconcat(asm,
                   "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
      [(set VR128:$dst,
        (X86insertps VR128:$src1, VR128:$src2, imm:$src3))], itins.rr>,
      Sched<[WriteFShuffle]>;
  // Memory source: a scalar f32 load is wrapped in scalar_to_vector so the
  // node's operand types still match the register form.
  def rm : SS4AIi8<opc, MRMSrcMem, (outs VR128:$dst),
      (ins VR128:$src1, f32mem:$src2, u8imm:$src3),
      !if(Is2Addr,
        !strconcat(asm, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
        !strconcat(asm,
                   "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
      [(set VR128:$dst,
        (X86insertps VR128:$src1,
                   (v4f32 (scalar_to_vector (loadf32 addr:$src2))),
                    imm:$src3))], itins.rm>,
      Sched<[WriteFShuffleLd, ReadAfterLd]>;
}

let ExeDomain = SSEPackedSingle in {
  let Predicates = [UseAVX] in
    defm VINSERTPS : SS41I_insertf32<0x21, "vinsertps", 0>, VEX_4V;
  let Constraints = "$src1 = $dst" in
    defm INSERTPS : SS41I_insertf32<0x21, "insertps", 1, SSE_INSERT_ITINS>;
}
6364
6365let Predicates = [UseSSE41] in {
  // If we're inserting an element from a load or a null pshuf of a load,
  // fold the load into the insertps instruction.
  // The (i8 0) shuffle immediate is an identity PSHUFD of lane 0, so the
  // loaded scalar is unchanged and the load can fold into INSERTPSrm.
  def : Pat<(v4f32 (X86insertps (v4f32 VR128:$src1), (X86PShufd (v4f32
                       (scalar_to_vector (loadf32 addr:$src2))), (i8 0)),
                   imm:$src3)),
            (INSERTPSrm VR128:$src1, addr:$src2, imm:$src3)>;
  def : Pat<(v4f32 (X86insertps (v4f32 VR128:$src1), (X86PShufd
                      (loadv4f32 addr:$src2), (i8 0)), imm:$src3)),
            (INSERTPSrm VR128:$src1, addr:$src2, imm:$src3)>;
}

let Predicates = [UseAVX] in {
  // If we're inserting an element from a vbroadcast of a load, fold the
  // load into the X86insertps instruction. All broadcast lanes are equal,
  // so reading the scalar directly from memory is equivalent.
  def : Pat<(v4f32 (X86insertps (v4f32 VR128:$src1),
                (X86VBroadcast (loadf32 addr:$src2)), imm:$src3)),
            (VINSERTPSrm VR128:$src1, addr:$src2, imm:$src3)>;
  def : Pat<(v4f32 (X86insertps (v4f32 VR128:$src1),
                (X86VBroadcast (loadv4f32 addr:$src2)), imm:$src3)),
            (VINSERTPSrm VR128:$src1, addr:$src2, imm:$src3)>;
}
6387
6388//===----------------------------------------------------------------------===//
6389// SSE4.1 - Round Instructions
6390//===----------------------------------------------------------------------===//
6391
6392/// sse41_fp_unop_rm - SSE 4.1 packed round (ROUNDPS/ROUNDPD and the VEX
/// forms): unary intrinsic with an i32 rounding-control immediate, in both
/// register and load-folding variants for the PS and PD domains.
multiclass sse41_fp_unop_rm<bits<8> opcps, bits<8> opcpd, string OpcodeStr,
                            X86MemOperand x86memop, RegisterClass RC,
                            PatFrag mem_frag32, PatFrag mem_frag64,
                            Intrinsic V4F32Int, Intrinsic V2F64Int> {
let ExeDomain = SSEPackedSingle in {
  // Vector intrinsic operation, reg
  def PSr : SS4AIi8<opcps, MRMSrcReg,
                    (outs RC:$dst), (ins RC:$src1, i32u8imm:$src2),
                    !strconcat(OpcodeStr,
                    "ps\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
                    [(set RC:$dst, (V4F32Int RC:$src1, imm:$src2))],
                    IIC_SSE_ROUNDPS_REG>, Sched<[WriteFAdd]>;

  // Vector intrinsic operation, mem
  def PSm : SS4AIi8<opcps, MRMSrcMem,
                    (outs RC:$dst), (ins x86memop:$src1, i32u8imm:$src2),
                    !strconcat(OpcodeStr,
                    "ps\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
                    [(set RC:$dst,
                          (V4F32Int (mem_frag32 addr:$src1),imm:$src2))],
                          IIC_SSE_ROUNDPS_MEM>, Sched<[WriteFAddLd]>;
} // ExeDomain = SSEPackedSingle

let ExeDomain = SSEPackedDouble in {
  // Vector intrinsic operation, reg
  // NOTE(review): the PD forms reuse the ROUNDPS itinerary classes; there is
  // no separate ROUNDPD itinerary in use here.
  def PDr : SS4AIi8<opcpd, MRMSrcReg,
                    (outs RC:$dst), (ins RC:$src1, i32u8imm:$src2),
                    !strconcat(OpcodeStr,
                    "pd\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
                    [(set RC:$dst, (V2F64Int RC:$src1, imm:$src2))],
                    IIC_SSE_ROUNDPS_REG>, Sched<[WriteFAdd]>;

  // Vector intrinsic operation, mem
  // Fixed: use the memory-form itinerary to match PSm (was the _REG class).
  def PDm : SS4AIi8<opcpd, MRMSrcMem,
                    (outs RC:$dst), (ins x86memop:$src1, i32u8imm:$src2),
                    !strconcat(OpcodeStr,
                    "pd\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
                    [(set RC:$dst,
                          (V2F64Int (mem_frag64 addr:$src1),imm:$src2))],
                          IIC_SSE_ROUNDPS_MEM>, Sched<[WriteFAddLd]>;
} // ExeDomain = SSEPackedDouble
}
6435
6436/// sse41_fp_binop_rm - SSE 4.1 scalar round (ROUNDSS/ROUNDSD): the upper
/// lanes pass through from src1, so these are two-source ops. Each width has
/// an FR32/FR64 form with no pattern, an intrinsic register form, and an
/// intrinsic load-folding form.
multiclass sse41_fp_binop_rm<bits<8> opcss, bits<8> opcsd,
                            string OpcodeStr,
                            Intrinsic F32Int,
                            Intrinsic F64Int, bit Is2Addr = 1> {
let ExeDomain = GenericDomain in {
  // Operation, reg.
  // No pattern: selected manually via the ffloor/fceil/etc. Pat<>s below.
  let hasSideEffects = 0 in
  def SSr : SS4AIi8<opcss, MRMSrcReg,
      (outs FR32:$dst), (ins FR32:$src1, FR32:$src2, i32u8imm:$src3),
      !if(Is2Addr,
          !strconcat(OpcodeStr,
              "ss\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
          !strconcat(OpcodeStr,
              "ss\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
      []>, Sched<[WriteFAdd]>;

  // Intrinsic operation, reg.
  let isCodeGenOnly = 1 in
  def SSr_Int : SS4AIi8<opcss, MRMSrcReg,
        (outs VR128:$dst), (ins VR128:$src1, VR128:$src2, i32u8imm:$src3),
        !if(Is2Addr,
            !strconcat(OpcodeStr,
                "ss\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
            !strconcat(OpcodeStr,
                "ss\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
        [(set VR128:$dst, (F32Int VR128:$src1, VR128:$src2, imm:$src3))]>,
        Sched<[WriteFAdd]>;

  // Intrinsic operation, mem.
  def SSm : SS4AIi8<opcss, MRMSrcMem,
        (outs VR128:$dst), (ins VR128:$src1, ssmem:$src2, i32u8imm:$src3),
        !if(Is2Addr,
            !strconcat(OpcodeStr,
                "ss\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
            !strconcat(OpcodeStr,
                "ss\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
        [(set VR128:$dst,
             (F32Int VR128:$src1, sse_load_f32:$src2, imm:$src3))]>,
        Sched<[WriteFAddLd, ReadAfterLd]>;

  // Operation, reg.
  // No pattern, same as SSr.
  let hasSideEffects = 0 in
  def SDr : SS4AIi8<opcsd, MRMSrcReg,
        (outs FR64:$dst), (ins FR64:$src1, FR64:$src2, i32u8imm:$src3),
        !if(Is2Addr,
            !strconcat(OpcodeStr,
                "sd\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
            !strconcat(OpcodeStr,
                "sd\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
        []>, Sched<[WriteFAdd]>;

  // Intrinsic operation, reg.
  let isCodeGenOnly = 1 in
  def SDr_Int : SS4AIi8<opcsd, MRMSrcReg,
        (outs VR128:$dst), (ins VR128:$src1, VR128:$src2, i32u8imm:$src3),
        !if(Is2Addr,
            !strconcat(OpcodeStr,
                "sd\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
            !strconcat(OpcodeStr,
                "sd\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
        [(set VR128:$dst, (F64Int VR128:$src1, VR128:$src2, imm:$src3))]>,
        Sched<[WriteFAdd]>;

  // Intrinsic operation, mem.
  def SDm : SS4AIi8<opcsd, MRMSrcMem,
        (outs VR128:$dst), (ins VR128:$src1, sdmem:$src2, i32u8imm:$src3),
        !if(Is2Addr,
            !strconcat(OpcodeStr,
                "sd\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
            !strconcat(OpcodeStr,
                "sd\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
        [(set VR128:$dst,
              (F64Int VR128:$src1, sse_load_f64:$src2, imm:$src3))]>,
        Sched<[WriteFAddLd, ReadAfterLd]>;
} // ExeDomain = GenericDomain
}
6512
6513// FP round - roundss, roundps, roundsd, roundpd
6514let Predicates = [HasAVX] in {
  // Intrinsic form
  // Packed 128-bit, packed 256-bit (VEX_L), and scalar VEX forms. Note the
  // two VROUND defms share a prefix; the record names do not collide because
  // the multiclass member suffixes differ (PSr/PDr/... vs SSr/SDr/...).
  defm VROUND  : sse41_fp_unop_rm<0x08, 0x09, "vround", f128mem, VR128,
                                  loadv4f32, loadv2f64,
                                  int_x86_sse41_round_ps,
                                  int_x86_sse41_round_pd>, VEX;
  defm VROUNDY : sse41_fp_unop_rm<0x08, 0x09, "vround", f256mem, VR256,
                                  loadv8f32, loadv4f64,
                                  int_x86_avx_round_ps_256,
                                  int_x86_avx_round_pd_256>, VEX, VEX_L;
  defm VROUND  : sse41_fp_binop_rm<0x0A, 0x0B, "vround",
                                  int_x86_sse41_round_ss,
                                  int_x86_sse41_round_sd, 0>, VEX_4V, VEX_LIG;
}
6528
6529// Lower scalar FP rounding nodes to VROUNDSS/VROUNDSD. Rounding-control
// immediates used here: 0x9 = ffloor, 0xA = fceil, 0xB = ftrunc,
// 0xC = fnearbyint, 0x4 = frint. src1 is left undef since only the low
// lane of the result is consumed.
let Predicates = [UseAVX] in {
  def : Pat<(ffloor FR32:$src),
            (VROUNDSSr (f32 (IMPLICIT_DEF)), FR32:$src, (i32 0x9))>;
  def : Pat<(f64 (ffloor FR64:$src)),
            (VROUNDSDr (f64 (IMPLICIT_DEF)), FR64:$src, (i32 0x9))>;
  def : Pat<(f32 (fnearbyint FR32:$src)),
            (VROUNDSSr (f32 (IMPLICIT_DEF)), FR32:$src, (i32 0xC))>;
  def : Pat<(f64 (fnearbyint FR64:$src)),
            (VROUNDSDr (f64 (IMPLICIT_DEF)), FR64:$src, (i32 0xC))>;
  def : Pat<(f32 (fceil FR32:$src)),
            (VROUNDSSr (f32 (IMPLICIT_DEF)), FR32:$src, (i32 0xA))>;
  def : Pat<(f64 (fceil FR64:$src)),
            (VROUNDSDr (f64 (IMPLICIT_DEF)), FR64:$src, (i32 0xA))>;
  def : Pat<(f32 (frint FR32:$src)),
            (VROUNDSSr (f32 (IMPLICIT_DEF)), FR32:$src, (i32 0x4))>;
  def : Pat<(f64 (frint FR64:$src)),
            (VROUNDSDr (f64 (IMPLICIT_DEF)), FR64:$src, (i32 0x4))>;
  def : Pat<(f32 (ftrunc FR32:$src)),
            (VROUNDSSr (f32 (IMPLICIT_DEF)), FR32:$src, (i32 0xB))>;
  def : Pat<(f64 (ftrunc FR64:$src)),
            (VROUNDSDr (f64 (IMPLICIT_DEF)), FR64:$src, (i32 0xB))>;
}
6551
6552// Lower packed FP rounding nodes to VROUNDPS/VROUNDPD (128- and 256-bit),
// using the same rounding-control immediates as the scalar patterns above.
let Predicates = [HasAVX] in {
  def : Pat<(v4f32 (ffloor VR128:$src)),
            (VROUNDPSr VR128:$src, (i32 0x9))>;
  def : Pat<(v4f32 (fnearbyint VR128:$src)),
            (VROUNDPSr VR128:$src, (i32 0xC))>;
  def : Pat<(v4f32 (fceil VR128:$src)),
            (VROUNDPSr VR128:$src, (i32 0xA))>;
  def : Pat<(v4f32 (frint VR128:$src)),
            (VROUNDPSr VR128:$src, (i32 0x4))>;
  def : Pat<(v4f32 (ftrunc VR128:$src)),
            (VROUNDPSr VR128:$src, (i32 0xB))>;

  def : Pat<(v2f64 (ffloor VR128:$src)),
            (VROUNDPDr VR128:$src, (i32 0x9))>;
  def : Pat<(v2f64 (fnearbyint VR128:$src)),
            (VROUNDPDr VR128:$src, (i32 0xC))>;
  def : Pat<(v2f64 (fceil VR128:$src)),
            (VROUNDPDr VR128:$src, (i32 0xA))>;
  def : Pat<(v2f64 (frint VR128:$src)),
            (VROUNDPDr VR128:$src, (i32 0x4))>;
  def : Pat<(v2f64 (ftrunc VR128:$src)),
            (VROUNDPDr VR128:$src, (i32 0xB))>;

  def : Pat<(v8f32 (ffloor VR256:$src)),
            (VROUNDYPSr VR256:$src, (i32 0x9))>;
  def : Pat<(v8f32 (fnearbyint VR256:$src)),
            (VROUNDYPSr VR256:$src, (i32 0xC))>;
  def : Pat<(v8f32 (fceil VR256:$src)),
            (VROUNDYPSr VR256:$src, (i32 0xA))>;
  def : Pat<(v8f32 (frint VR256:$src)),
            (VROUNDYPSr VR256:$src, (i32 0x4))>;
  def : Pat<(v8f32 (ftrunc VR256:$src)),
            (VROUNDYPSr VR256:$src, (i32 0xB))>;

  def : Pat<(v4f64 (ffloor VR256:$src)),
            (VROUNDYPDr VR256:$src, (i32 0x9))>;
  def : Pat<(v4f64 (fnearbyint VR256:$src)),
            (VROUNDYPDr VR256:$src, (i32 0xC))>;
  def : Pat<(v4f64 (fceil VR256:$src)),
            (VROUNDYPDr VR256:$src, (i32 0xA))>;
  def : Pat<(v4f64 (frint VR256:$src)),
            (VROUNDYPDr VR256:$src, (i32 0x4))>;
  def : Pat<(v4f64 (ftrunc VR256:$src)),
            (VROUNDYPDr VR256:$src, (i32 0xB))>;
}
6597
6598// Non-VEX SSE4.1 forms; memop (aligned-load) frags instead of loadv*.
defm ROUND  : sse41_fp_unop_rm<0x08, 0x09, "round", f128mem, VR128,
                               memopv4f32, memopv2f64,
                               int_x86_sse41_round_ps, int_x86_sse41_round_pd>;
// Scalar forms are destructive in SSE encoding: src1 is tied to dst.
let Constraints = "$src1 = $dst" in
defm ROUND  : sse41_fp_binop_rm<0x0A, 0x0B, "round",
                               int_x86_sse41_round_ss, int_x86_sse41_round_sd>;
6604
6605// SSE4.1 (non-VEX) counterparts of the rounding patterns above; same
// immediate encodings (0x9 floor, 0xA ceil, 0xB trunc, 0xC nearbyint,
// 0x4 rint).
let Predicates = [UseSSE41] in {
  def : Pat<(ffloor FR32:$src),
            (ROUNDSSr (f32 (IMPLICIT_DEF)), FR32:$src, (i32 0x9))>;
  def : Pat<(f64 (ffloor FR64:$src)),
            (ROUNDSDr (f64 (IMPLICIT_DEF)), FR64:$src, (i32 0x9))>;
  def : Pat<(f32 (fnearbyint FR32:$src)),
            (ROUNDSSr (f32 (IMPLICIT_DEF)), FR32:$src, (i32 0xC))>;
  def : Pat<(f64 (fnearbyint FR64:$src)),
            (ROUNDSDr (f64 (IMPLICIT_DEF)), FR64:$src, (i32 0xC))>;
  def : Pat<(f32 (fceil FR32:$src)),
            (ROUNDSSr (f32 (IMPLICIT_DEF)), FR32:$src, (i32 0xA))>;
  def : Pat<(f64 (fceil FR64:$src)),
            (ROUNDSDr (f64 (IMPLICIT_DEF)), FR64:$src, (i32 0xA))>;
  def : Pat<(f32 (frint FR32:$src)),
            (ROUNDSSr (f32 (IMPLICIT_DEF)), FR32:$src, (i32 0x4))>;
  def : Pat<(f64 (frint FR64:$src)),
            (ROUNDSDr (f64 (IMPLICIT_DEF)), FR64:$src, (i32 0x4))>;
  def : Pat<(f32 (ftrunc FR32:$src)),
            (ROUNDSSr (f32 (IMPLICIT_DEF)), FR32:$src, (i32 0xB))>;
  def : Pat<(f64 (ftrunc FR64:$src)),
            (ROUNDSDr (f64 (IMPLICIT_DEF)), FR64:$src, (i32 0xB))>;

  def : Pat<(v4f32 (ffloor VR128:$src)),
            (ROUNDPSr VR128:$src, (i32 0x9))>;
  def : Pat<(v4f32 (fnearbyint VR128:$src)),
            (ROUNDPSr VR128:$src, (i32 0xC))>;
  def : Pat<(v4f32 (fceil VR128:$src)),
            (ROUNDPSr VR128:$src, (i32 0xA))>;
  def : Pat<(v4f32 (frint VR128:$src)),
            (ROUNDPSr VR128:$src, (i32 0x4))>;
  def : Pat<(v4f32 (ftrunc VR128:$src)),
            (ROUNDPSr VR128:$src, (i32 0xB))>;

  def : Pat<(v2f64 (ffloor VR128:$src)),
            (ROUNDPDr VR128:$src, (i32 0x9))>;
  def : Pat<(v2f64 (fnearbyint VR128:$src)),
            (ROUNDPDr VR128:$src, (i32 0xC))>;
  def : Pat<(v2f64 (fceil VR128:$src)),
            (ROUNDPDr VR128:$src, (i32 0xA))>;
  def : Pat<(v2f64 (frint VR128:$src)),
            (ROUNDPDr VR128:$src, (i32 0x4))>;
  def : Pat<(v2f64 (ftrunc VR128:$src)),
            (ROUNDPDr VR128:$src, (i32 0xB))>;
}
6649
6650//===----------------------------------------------------------------------===//
6651// SSE4.1 - Packed Bit Test
6652//===----------------------------------------------------------------------===//
6653
6654// ptest instruction we'll lower to this in X86ISelLowering primarily from
6655// the intel intrinsic that corresponds to this.
6656// PTEST/VPTEST: sets EFLAGS from the X86ptest node; no register result.
let Defs = [EFLAGS], Predicates = [HasAVX] in {
def VPTESTrr  : SS48I<0x17, MRMSrcReg, (outs), (ins VR128:$src1, VR128:$src2),
                "vptest\t{$src2, $src1|$src1, $src2}",
                [(set EFLAGS, (X86ptest VR128:$src1, (v2i64 VR128:$src2)))]>,
                Sched<[WriteVecLogic]>, VEX;
def VPTESTrm  : SS48I<0x17, MRMSrcMem, (outs), (ins VR128:$src1, f128mem:$src2),
                "vptest\t{$src2, $src1|$src1, $src2}",
                [(set EFLAGS,(X86ptest VR128:$src1, (loadv2i64 addr:$src2)))]>,
                Sched<[WriteVecLogicLd, ReadAfterLd]>, VEX;

// 256-bit VEX_L forms.
def VPTESTYrr : SS48I<0x17, MRMSrcReg, (outs), (ins VR256:$src1, VR256:$src2),
                "vptest\t{$src2, $src1|$src1, $src2}",
                [(set EFLAGS, (X86ptest VR256:$src1, (v4i64 VR256:$src2)))]>,
                Sched<[WriteVecLogic]>, VEX, VEX_L;
def VPTESTYrm : SS48I<0x17, MRMSrcMem, (outs), (ins VR256:$src1, i256mem:$src2),
                "vptest\t{$src2, $src1|$src1, $src2}",
                [(set EFLAGS,(X86ptest VR256:$src1, (loadv4i64 addr:$src2)))]>,
                Sched<[WriteVecLogicLd, ReadAfterLd]>, VEX, VEX_L;
}

// Non-VEX SSE4.1 forms; memory operand uses memopv2i64 (alignment-checked
// load) rather than loadv2i64.
let Defs = [EFLAGS] in {
def PTESTrr : SS48I<0x17, MRMSrcReg, (outs), (ins VR128:$src1, VR128:$src2),
              "ptest\t{$src2, $src1|$src1, $src2}",
              [(set EFLAGS, (X86ptest VR128:$src1, (v2i64 VR128:$src2)))]>,
              Sched<[WriteVecLogic]>;
def PTESTrm : SS48I<0x17, MRMSrcMem, (outs), (ins VR128:$src1, f128mem:$src2),
              "ptest\t{$src2, $src1|$src1, $src2}",
              [(set EFLAGS, (X86ptest VR128:$src1, (memopv2i64 addr:$src2)))]>,
              Sched<[WriteVecLogicLd, ReadAfterLd]>;
}
6686
6687// The bit test instructions below are AVX only
6688/// avx_bittest - AVX-only VTESTPS/VTESTPD: FP variant of ptest, lowered
/// through the X86testp node into EFLAGS. Reg-reg and load-folding forms.
multiclass avx_bittest<bits<8> opc, string OpcodeStr, RegisterClass RC,
                       X86MemOperand x86memop, PatFrag mem_frag, ValueType vt> {
  def rr : SS48I<opc, MRMSrcReg, (outs), (ins RC:$src1, RC:$src2),
            !strconcat(OpcodeStr, "\t{$src2, $src1|$src1, $src2}"),
            [(set EFLAGS, (X86testp RC:$src1, (vt RC:$src2)))]>,
            Sched<[WriteVecLogic]>, VEX;
  def rm : SS48I<opc, MRMSrcMem, (outs), (ins RC:$src1, x86memop:$src2),
            !strconcat(OpcodeStr, "\t{$src2, $src1|$src1, $src2}"),
            [(set EFLAGS, (X86testp RC:$src1, (mem_frag addr:$src2)))]>,
            Sched<[WriteVecLogicLd, ReadAfterLd]>, VEX;
}
6699
6700// VTESTPS/VTESTPD in 128-bit and 256-bit (VEX_L) widths, each pinned to its
// FP execution domain.
let Defs = [EFLAGS], Predicates = [HasAVX] in {
let ExeDomain = SSEPackedSingle in {
defm VTESTPS  : avx_bittest<0x0E, "vtestps", VR128, f128mem, loadv4f32, v4f32>;
defm VTESTPSY : avx_bittest<0x0E, "vtestps", VR256, f256mem, loadv8f32, v8f32>,
                            VEX_L;
}
let ExeDomain = SSEPackedDouble in {
defm VTESTPD  : avx_bittest<0x0F, "vtestpd", VR128, f128mem, loadv2f64, v2f64>;
defm VTESTPDY : avx_bittest<0x0F, "vtestpd", VR256, f256mem, loadv4f64, v4f64>,
                            VEX_L;
}
}
6712
6713//===----------------------------------------------------------------------===//
6714// SSE4.1 - Misc Instructions
6715//===----------------------------------------------------------------------===//
6716
6717// POPCNT (ctpop) in 16/32/64-bit widths, reg and load-folding forms.
// Gated on HasPOPCNT (a separate CPUID bit from SSE4.1); each def also
// clobbers EFLAGS (implicit EFLAGS in the pattern, Defs above).
let Defs = [EFLAGS], Predicates = [HasPOPCNT] in {
  def POPCNT16rr : I<0xB8, MRMSrcReg, (outs GR16:$dst), (ins GR16:$src),
                     "popcnt{w}\t{$src, $dst|$dst, $src}",
                     [(set GR16:$dst, (ctpop GR16:$src)), (implicit EFLAGS)],
                     IIC_SSE_POPCNT_RR>, Sched<[WriteFAdd]>,
                     OpSize16, XS;
  def POPCNT16rm : I<0xB8, MRMSrcMem, (outs GR16:$dst), (ins i16mem:$src),
                     "popcnt{w}\t{$src, $dst|$dst, $src}",
                     [(set GR16:$dst, (ctpop (loadi16 addr:$src))),
                      (implicit EFLAGS)], IIC_SSE_POPCNT_RM>,
                      Sched<[WriteFAddLd]>, OpSize16, XS;

  def POPCNT32rr : I<0xB8, MRMSrcReg, (outs GR32:$dst), (ins GR32:$src),
                     "popcnt{l}\t{$src, $dst|$dst, $src}",
                     [(set GR32:$dst, (ctpop GR32:$src)), (implicit EFLAGS)],
                     IIC_SSE_POPCNT_RR>, Sched<[WriteFAdd]>,
                     OpSize32, XS;

  def POPCNT32rm : I<0xB8, MRMSrcMem, (outs GR32:$dst), (ins i32mem:$src),
                     "popcnt{l}\t{$src, $dst|$dst, $src}",
                     [(set GR32:$dst, (ctpop (loadi32 addr:$src))),
                      (implicit EFLAGS)], IIC_SSE_POPCNT_RM>,
                      Sched<[WriteFAddLd]>, OpSize32, XS;

  // 64-bit forms use the RI class (REX.W prefix).
  def POPCNT64rr : RI<0xB8, MRMSrcReg, (outs GR64:$dst), (ins GR64:$src),
                      "popcnt{q}\t{$src, $dst|$dst, $src}",
                      [(set GR64:$dst, (ctpop GR64:$src)), (implicit EFLAGS)],
                      IIC_SSE_POPCNT_RR>, Sched<[WriteFAdd]>, XS;
  def POPCNT64rm : RI<0xB8, MRMSrcMem, (outs GR64:$dst), (ins i64mem:$src),
                      "popcnt{q}\t{$src, $dst|$dst, $src}",
                      [(set GR64:$dst, (ctpop (loadi64 addr:$src))),
                       (implicit EFLAGS)], IIC_SSE_POPCNT_RM>,
                       Sched<[WriteFAddLd]>, XS;
}
6751
6752
6753
6754// SS41I_unop_rm_int_v16 - SSE 4.1 unary operator whose type is v8i16.
6755multiclass SS41I_unop_rm_int_v16<bits<8> opc, string OpcodeStr,
                                 Intrinsic IntId128, PatFrag ld_frag,
                                 X86FoldableSchedWrite Sched> {
  // Register form: unary 128-bit intrinsic, scheduled on the passed-in
  // X86FoldableSchedWrite.
  def rr128 : SS48I<opc, MRMSrcReg, (outs VR128:$dst),
                    (ins VR128:$src),
                    !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
                    [(set VR128:$dst, (IntId128 VR128:$src))]>,
                    Sched<[Sched]>;
  // Memory form: same op with the load folded; uses Sched.Folded.
  def rm128 : SS48I<opc, MRMSrcMem, (outs VR128:$dst),
                     (ins i128mem:$src),
                     !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
                     [(set VR128:$dst,
                       (IntId128 (bitconvert (ld_frag addr:$src))))]>,
                    Sched<[Sched.Folded]>;
}
6770
6771// PHMIN has the same profile as PSAD, thus we use the same scheduling
6772// model, although the naming is misleading.
6773// PHMINPOSUW, scheduled with the vector-imul model (see the comment above
// about naming: it matches PSAD's profile).
let Predicates = [HasAVX] in
defm VPHMINPOSUW : SS41I_unop_rm_int_v16 <0x41, "vphminposuw",
                                         int_x86_sse41_phminposuw, loadv2i64,
                                         WriteVecIMul>, VEX;
defm PHMINPOSUW : SS41I_unop_rm_int_v16 <0x41, "phminposuw",
                                         int_x86_sse41_phminposuw, memopv2i64,
                                         WriteVecIMul>;
6780
6781/// SS48I_binop_rm - Simple SSE41 binary operator.
6782multiclass SS48I_binop_rm<bits<8> opc, string OpcodeStr, SDNode OpNode,
                          ValueType OpVT, RegisterClass RC, PatFrag memop_frag,
                          X86MemOperand x86memop, bit Is2Addr = 1,
                          OpndItins itins = SSE_INTALU_ITINS_P> {
  // Register form; commutable so the register allocator may swap operands.
  let isCommutable = 1 in
  def rr : SS48I<opc, MRMSrcReg, (outs RC:$dst),
       (ins RC:$src1, RC:$src2),
       !if(Is2Addr,
           !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
           !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
       [(set RC:$dst, (OpVT (OpNode RC:$src1, RC:$src2)))]>,
       Sched<[itins.Sched]>;
  // Memory form: second operand loaded and bitcast to OpVT.
  def rm : SS48I<opc, MRMSrcMem, (outs RC:$dst),
       (ins RC:$src1, x86memop:$src2),
       !if(Is2Addr,
           !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
           !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
       [(set RC:$dst,
         (OpVT (OpNode RC:$src1, (bitconvert (memop_frag addr:$src2)))))]>,
       Sched<[itins.Sched.Folded, ReadAfterLd]>;
}
6803
/// SS48I_binop_rm2 - Simple SSE41 binary operator with different src and dst
/// types.
///
/// Like SS48I_binop_rm, but the first source is cast to SrcVT while the
/// result is produced as DstVT (e.g. pmuldq: v4i32 sources, v2i64 result).
multiclass SS48I_binop_rm2<bits<8> opc, string OpcodeStr, SDNode OpNode,
                         ValueType DstVT, ValueType SrcVT, RegisterClass RC,
                         PatFrag memop_frag, X86MemOperand x86memop,
                         OpndItins itins,
                         bit IsCommutable = 0, bit Is2Addr = 1> {
  let isCommutable = IsCommutable in
  def rr : SS48I<opc, MRMSrcReg, (outs RC:$dst),
       (ins RC:$src1, RC:$src2),
       !if(Is2Addr,
           !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
           !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
       [(set RC:$dst, (DstVT (OpNode (SrcVT RC:$src1), RC:$src2)))]>,
       Sched<[itins.Sched]>;
  def rm : SS48I<opc, MRMSrcMem, (outs RC:$dst),
       (ins RC:$src1, x86memop:$src2),
       !if(Is2Addr,
           !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
           !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
       [(set RC:$dst, (DstVT (OpNode (SrcVT RC:$src1),
                                     (bitconvert (memop_frag addr:$src2)))))]>,
       Sched<[itins.Sched.Folded, ReadAfterLd]>;
}
6828
// 128-bit AVX forms of the SSE4.1 packed min/max operations and PMULDQ.
// Three-operand VEX encodings (Is2Addr = 0); NoVLX keeps these patterns
// from competing with the AVX512VL forms.
let Predicates = [HasAVX, NoVLX] in {
  defm VPMINSB   : SS48I_binop_rm<0x38, "vpminsb", smin, v16i8, VR128,
                                  loadv2i64, i128mem, 0, SSE_INTALU_ITINS_P>,
                                  VEX_4V;
  defm VPMINSD   : SS48I_binop_rm<0x39, "vpminsd", smin, v4i32, VR128,
                                  loadv2i64, i128mem, 0, SSE_INTALU_ITINS_P>,
                                  VEX_4V;
  defm VPMINUD   : SS48I_binop_rm<0x3B, "vpminud", umin, v4i32, VR128,
                                  loadv2i64, i128mem, 0, SSE_INTALU_ITINS_P>,
                                  VEX_4V;
  defm VPMINUW   : SS48I_binop_rm<0x3A, "vpminuw", umin, v8i16, VR128,
                                  loadv2i64, i128mem, 0, SSE_INTALU_ITINS_P>,
                                  VEX_4V;
  defm VPMAXSB   : SS48I_binop_rm<0x3C, "vpmaxsb", smax, v16i8, VR128,
                                  loadv2i64, i128mem, 0, SSE_INTALU_ITINS_P>,
                                  VEX_4V;
  defm VPMAXSD   : SS48I_binop_rm<0x3D, "vpmaxsd", smax, v4i32, VR128,
                                  loadv2i64, i128mem, 0, SSE_INTALU_ITINS_P>,
                                  VEX_4V;
  defm VPMAXUD   : SS48I_binop_rm<0x3F, "vpmaxud", umax, v4i32, VR128,
                                  loadv2i64, i128mem, 0, SSE_INTALU_ITINS_P>,
                                  VEX_4V;
  defm VPMAXUW   : SS48I_binop_rm<0x3E, "vpmaxuw", umax, v8i16, VR128,
                                  loadv2i64, i128mem, 0, SSE_INTALU_ITINS_P>,
                                  VEX_4V;
  defm VPMULDQ   : SS48I_binop_rm2<0x28, "vpmuldq", X86pmuldq, v2i64, v4i32,
                                   VR128, loadv2i64, i128mem,
                                   SSE_INTMUL_ITINS_P, 1, 0>, VEX_4V;
}
6858
// 256-bit AVX2 forms of the same min/max operations and PMULDQ (VEX_L).
let Predicates = [HasAVX2, NoVLX] in {
  defm VPMINSBY  : SS48I_binop_rm<0x38, "vpminsb", smin, v32i8, VR256,
                                  loadv4i64, i256mem, 0, SSE_INTALU_ITINS_P>,
                                  VEX_4V, VEX_L;
  defm VPMINSDY  : SS48I_binop_rm<0x39, "vpminsd", smin, v8i32, VR256,
                                  loadv4i64, i256mem, 0, SSE_INTALU_ITINS_P>,
                                  VEX_4V, VEX_L;
  defm VPMINUDY  : SS48I_binop_rm<0x3B, "vpminud", umin, v8i32, VR256,
                                  loadv4i64, i256mem, 0, SSE_INTALU_ITINS_P>,
                                  VEX_4V, VEX_L;
  defm VPMINUWY  : SS48I_binop_rm<0x3A, "vpminuw", umin, v16i16, VR256,
                                  loadv4i64, i256mem, 0, SSE_INTALU_ITINS_P>,
                                  VEX_4V, VEX_L;
  defm VPMAXSBY  : SS48I_binop_rm<0x3C, "vpmaxsb", smax, v32i8, VR256,
                                  loadv4i64, i256mem, 0, SSE_INTALU_ITINS_P>,
                                  VEX_4V, VEX_L;
  defm VPMAXSDY  : SS48I_binop_rm<0x3D, "vpmaxsd", smax, v8i32, VR256,
                                  loadv4i64, i256mem, 0, SSE_INTALU_ITINS_P>,
                                  VEX_4V, VEX_L;
  defm VPMAXUDY  : SS48I_binop_rm<0x3F, "vpmaxud", umax, v8i32, VR256,
                                  loadv4i64, i256mem, 0, SSE_INTALU_ITINS_P>,
                                  VEX_4V, VEX_L;
  defm VPMAXUWY  : SS48I_binop_rm<0x3E, "vpmaxuw", umax, v16i16, VR256,
                                  loadv4i64, i256mem, 0, SSE_INTALU_ITINS_P>,
                                  VEX_4V, VEX_L;
  defm VPMULDQY : SS48I_binop_rm2<0x28, "vpmuldq", X86pmuldq, v4i64, v8i32,
                                  VR256, loadv4i64, i256mem,
                                  SSE_INTMUL_ITINS_P, 1, 0>, VEX_4V, VEX_L;
}
6888
// Legacy SSE4.1 encodings: two-operand form with $src1 tied to $dst.
let Constraints = "$src1 = $dst" in {
  defm PMINSB   : SS48I_binop_rm<0x38, "pminsb", smin, v16i8, VR128,
                                 memopv2i64, i128mem, 1, SSE_INTALU_ITINS_P>;
  defm PMINSD   : SS48I_binop_rm<0x39, "pminsd", smin, v4i32, VR128,
                                 memopv2i64, i128mem, 1, SSE_INTALU_ITINS_P>;
  defm PMINUD   : SS48I_binop_rm<0x3B, "pminud", umin, v4i32, VR128,
                                 memopv2i64, i128mem, 1, SSE_INTALU_ITINS_P>;
  defm PMINUW   : SS48I_binop_rm<0x3A, "pminuw", umin, v8i16, VR128,
                                 memopv2i64, i128mem, 1, SSE_INTALU_ITINS_P>;
  defm PMAXSB   : SS48I_binop_rm<0x3C, "pmaxsb", smax, v16i8, VR128,
                                 memopv2i64, i128mem, 1, SSE_INTALU_ITINS_P>;
  defm PMAXSD   : SS48I_binop_rm<0x3D, "pmaxsd", smax, v4i32, VR128,
                                 memopv2i64, i128mem, 1, SSE_INTALU_ITINS_P>;
  defm PMAXUD   : SS48I_binop_rm<0x3F, "pmaxud", umax, v4i32, VR128,
                                 memopv2i64, i128mem, 1, SSE_INTALU_ITINS_P>;
  defm PMAXUW   : SS48I_binop_rm<0x3E, "pmaxuw", umax, v8i16, VR128,
                                 memopv2i64, i128mem, 1, SSE_INTALU_ITINS_P>;
  defm PMULDQ   : SS48I_binop_rm2<0x28, "pmuldq", X86pmuldq, v2i64, v4i32,
                                  VR128, memopv2i64, i128mem,
                                  SSE_INTMUL_ITINS_P, 1>;
}
6910
// 128-bit AVX forms of PMULLD and PCMPEQQ.
// Use the unaligned load fragment (loadv2i64) for the folded-load patterns,
// matching the VPMIN*/VPMAX* definitions above: AVX memory operands carry no
// alignment requirement, so the alignment-checking memopv2i64 fragment was
// unnecessarily restrictive here (it blocked folding of unaligned loads).
let Predicates = [HasAVX, NoVLX] in {
  defm VPMULLD  : SS48I_binop_rm<0x40, "vpmulld", mul, v4i32, VR128,
                                 loadv2i64, i128mem, 0, SSE_PMULLD_ITINS>,
                                 VEX_4V;
  defm VPCMPEQQ : SS48I_binop_rm<0x29, "vpcmpeqq", X86pcmpeq, v2i64, VR128,
                                 loadv2i64, i128mem, 0, SSE_INTALU_ITINS_P>,
                                 VEX_4V;
}
// 256-bit AVX2 forms of PMULLD and PCMPEQQ (VEX_L).
let Predicates = [HasAVX2] in {
  defm VPMULLDY  : SS48I_binop_rm<0x40, "vpmulld", mul, v8i32, VR256,
                                  loadv4i64, i256mem, 0, SSE_PMULLD_ITINS>,
                                  VEX_4V, VEX_L;
  defm VPCMPEQQY : SS48I_binop_rm<0x29, "vpcmpeqq", X86pcmpeq, v4i64, VR256,
                                  loadv4i64, i256mem, 0, SSE_INTALU_ITINS_P>,
                                  VEX_4V, VEX_L;
}

// Legacy SSE4.1 encodings ($src1 tied to $dst).
let Constraints = "$src1 = $dst" in {
  defm PMULLD  : SS48I_binop_rm<0x40, "pmulld", mul, v4i32, VR128,
                                memopv2i64, i128mem, 1, SSE_PMULLD_ITINS>;
  defm PCMPEQQ : SS48I_binop_rm<0x29, "pcmpeqq", X86pcmpeq, v2i64, VR128,
                                memopv2i64, i128mem, 1, SSE_INTALUQ_ITINS_P>;
}
6934
/// SS41I_binop_rmi_int - SSE 4.1 binary operator with 8-bit immediate
///
/// Intrinsic-based variant: emits rri (reg, reg, imm8) and rmi
/// (reg, mem, imm8) forms.  Is2Addr selects the tied SSE syntax vs. the
/// three-operand VEX syntax.
multiclass SS41I_binop_rmi_int<bits<8> opc, string OpcodeStr,
                 Intrinsic IntId, RegisterClass RC, PatFrag memop_frag,
                 X86MemOperand x86memop, bit Is2Addr = 1,
                 OpndItins itins = DEFAULT_ITINS> {
  let isCommutable = 1 in
  def rri : SS4AIi8<opc, MRMSrcReg, (outs RC:$dst),
        (ins RC:$src1, RC:$src2, u8imm:$src3),
        !if(Is2Addr,
            !strconcat(OpcodeStr,
                "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
            !strconcat(OpcodeStr,
                "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
        [(set RC:$dst, (IntId RC:$src1, RC:$src2, imm:$src3))], itins.rr>,
        Sched<[itins.Sched]>;
  def rmi : SS4AIi8<opc, MRMSrcMem, (outs RC:$dst),
        (ins RC:$src1, x86memop:$src2, u8imm:$src3),
        !if(Is2Addr,
            !strconcat(OpcodeStr,
                "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
            !strconcat(OpcodeStr,
                "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
        [(set RC:$dst,
          (IntId RC:$src1,
           (bitconvert (memop_frag addr:$src2)), imm:$src3))], itins.rm>,
        Sched<[itins.Sched.Folded, ReadAfterLd]>;
}
6962
/// SS41I_binop_rmi - SSE 4.1 binary operator with 8-bit immediate
///
/// SDNode-based counterpart of SS41I_binop_rmi_int: the pattern matches
/// OpNode producing OpVT instead of an intrinsic call.
multiclass SS41I_binop_rmi<bits<8> opc, string OpcodeStr, SDNode OpNode,
                           ValueType OpVT, RegisterClass RC, PatFrag memop_frag,
                           X86MemOperand x86memop, bit Is2Addr = 1,
                           OpndItins itins = DEFAULT_ITINS> {
  let isCommutable = 1 in
  def rri : SS4AIi8<opc, MRMSrcReg, (outs RC:$dst),
        (ins RC:$src1, RC:$src2, u8imm:$src3),
        !if(Is2Addr,
            !strconcat(OpcodeStr,
                "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
            !strconcat(OpcodeStr,
                "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
        [(set RC:$dst, (OpVT (OpNode RC:$src1, RC:$src2, imm:$src3)))],
        itins.rr>, Sched<[itins.Sched]>;
  def rmi : SS4AIi8<opc, MRMSrcMem, (outs RC:$dst),
        (ins RC:$src1, x86memop:$src2, u8imm:$src3),
        !if(Is2Addr,
            !strconcat(OpcodeStr,
                "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
            !strconcat(OpcodeStr,
                "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
        [(set RC:$dst,
          (OpVT (OpNode RC:$src1,
                 (bitconvert (memop_frag addr:$src2)), imm:$src3)))], itins.rm>,
        Sched<[itins.Sched.Folded, ReadAfterLd]>;
}
6990
// AVX forms of MPSADBW, BLENDPS/PD, PBLENDW and DPPS/PD (three-operand VEX).
let Predicates = [HasAVX] in {
  let isCommutable = 0 in {
    defm VMPSADBW : SS41I_binop_rmi_int<0x42, "vmpsadbw", int_x86_sse41_mpsadbw,
                                        VR128, loadv2i64, i128mem, 0,
                                        DEFAULT_ITINS_MPSADSCHED>, VEX_4V;
  }

  let ExeDomain = SSEPackedSingle in {
  defm VBLENDPS : SS41I_binop_rmi<0x0C, "vblendps", X86Blendi, v4f32,
                                  VR128, loadv4f32, f128mem, 0,
                                  DEFAULT_ITINS_FBLENDSCHED>, VEX_4V;
  defm VBLENDPSY : SS41I_binop_rmi<0x0C, "vblendps", X86Blendi, v8f32,
                                   VR256, loadv8f32, f256mem, 0,
                                   DEFAULT_ITINS_FBLENDSCHED>, VEX_4V, VEX_L;
  }
  let ExeDomain = SSEPackedDouble in {
  defm VBLENDPD : SS41I_binop_rmi<0x0D, "vblendpd", X86Blendi, v2f64,
                                  VR128, loadv2f64, f128mem, 0,
                                  DEFAULT_ITINS_FBLENDSCHED>, VEX_4V;
  defm VBLENDPDY : SS41I_binop_rmi<0x0D, "vblendpd", X86Blendi, v4f64,
                                   VR256, loadv4f64, f256mem, 0,
                                   DEFAULT_ITINS_FBLENDSCHED>, VEX_4V, VEX_L;
  }
  defm VPBLENDW : SS41I_binop_rmi<0x0E, "vpblendw", X86Blendi, v8i16,
                                  VR128, loadv2i64, i128mem, 0,
                                  DEFAULT_ITINS_BLENDSCHED>, VEX_4V;

  let ExeDomain = SSEPackedSingle in
  defm VDPPS : SS41I_binop_rmi_int<0x40, "vdpps", int_x86_sse41_dpps,
                                   VR128, loadv4f32, f128mem, 0,
                                   SSE_DPPS_ITINS>, VEX_4V;
  let ExeDomain = SSEPackedDouble in
  defm VDPPD : SS41I_binop_rmi_int<0x41, "vdppd", int_x86_sse41_dppd,
                                   VR128, loadv2f64, f128mem, 0,
                                   SSE_DPPS_ITINS>, VEX_4V;
  // Use the FP memory operand (f256mem) for consistency with VDPPS's f128mem
  // and with the loadv8f32 fragment; i256mem was inconsistent with this
  // instruction's SSEPackedSingle domain (same 256-bit size either way).
  let ExeDomain = SSEPackedSingle in
  defm VDPPSY : SS41I_binop_rmi_int<0x40, "vdpps", int_x86_avx_dp_ps_256,
                                    VR256, loadv8f32, f256mem, 0,
                                    SSE_DPPS_ITINS>, VEX_4V, VEX_L;
}
7031
// 256-bit AVX2 forms of MPSADBW and PBLENDW (VEX_L).
let Predicates = [HasAVX2] in {
  let isCommutable = 0 in {
  defm VMPSADBWY : SS41I_binop_rmi_int<0x42, "vmpsadbw", int_x86_avx2_mpsadbw,
                                  VR256, loadv4i64, i256mem, 0,
                                  DEFAULT_ITINS_MPSADSCHED>, VEX_4V, VEX_L;
  }
  defm VPBLENDWY : SS41I_binop_rmi<0x0E, "vpblendw", X86Blendi, v16i16,
                                   VR256, loadv4i64, i256mem, 0,
                                   DEFAULT_ITINS_BLENDSCHED>, VEX_4V, VEX_L;
}
7042
// Legacy SSE4.1 immediate-operand encodings ($src1 tied to $dst).
let Constraints = "$src1 = $dst" in {
  let isCommutable = 0 in {
  defm MPSADBW : SS41I_binop_rmi_int<0x42, "mpsadbw", int_x86_sse41_mpsadbw,
                                     VR128, memopv2i64, i128mem,
                                     1, SSE_MPSADBW_ITINS>;
  }
  let ExeDomain = SSEPackedSingle in
  defm BLENDPS : SS41I_binop_rmi<0x0C, "blendps", X86Blendi, v4f32,
                                 VR128, memopv4f32, f128mem,
                                 1, SSE_INTALU_ITINS_FBLEND_P>;
  let ExeDomain = SSEPackedDouble in
  defm BLENDPD : SS41I_binop_rmi<0x0D, "blendpd", X86Blendi, v2f64,
                                 VR128, memopv2f64, f128mem,
                                 1, SSE_INTALU_ITINS_FBLEND_P>;
  defm PBLENDW : SS41I_binop_rmi<0x0E, "pblendw", X86Blendi, v8i16,
                                 VR128, memopv2i64, i128mem,
                                 1, SSE_INTALU_ITINS_BLEND_P>;
  let ExeDomain = SSEPackedSingle in
  defm DPPS : SS41I_binop_rmi_int<0x40, "dpps", int_x86_sse41_dpps,
                                  VR128, memopv4f32, f128mem, 1,
                                  SSE_DPPS_ITINS>;
  let ExeDomain = SSEPackedDouble in
  defm DPPD : SS41I_binop_rmi_int<0x41, "dppd", int_x86_sse41_dppd,
                                  VR128, memopv2f64, f128mem, 1,
                                  SSE_DPPD_ITINS>;
}
7069
/// SS41I_quaternary_int_avx - AVX SSE 4.1 with 4 operators
///
/// Four-operand AVX blendv forms: the variable mask is an explicit register
/// operand ($src3) encoded via VEX_I8IMM, rather than the implicit XMM0 of
/// the legacy SSE4.1 encodings (see SS41I_ternary_int below).
multiclass SS41I_quaternary_int_avx<bits<8> opc, string OpcodeStr,
                                    RegisterClass RC, X86MemOperand x86memop,
                                    PatFrag mem_frag, Intrinsic IntId,
                                    X86FoldableSchedWrite Sched> {
  def rr : Ii8<opc, MRMSrcReg, (outs RC:$dst),
                  (ins RC:$src1, RC:$src2, RC:$src3),
                  !strconcat(OpcodeStr,
                    "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
                  [(set RC:$dst, (IntId RC:$src1, RC:$src2, RC:$src3))],
                  NoItinerary, SSEPackedInt>, TAPD, VEX_4V, VEX_I8IMM,
                Sched<[Sched]>;

  def rm : Ii8<opc, MRMSrcMem, (outs RC:$dst),
                  (ins RC:$src1, x86memop:$src2, RC:$src3),
                  !strconcat(OpcodeStr,
                    "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
                  [(set RC:$dst,
                        (IntId RC:$src1, (bitconvert (mem_frag addr:$src2)),
                               RC:$src3))],
                  NoItinerary, SSEPackedInt>, TAPD, VEX_4V, VEX_I8IMM,
                Sched<[Sched.Folded, ReadAfterLd]>;
}
7093
// AVX four-operand blendv instructions (variable mask in $src3).
let Predicates = [HasAVX] in {
let ExeDomain = SSEPackedDouble in {
defm VBLENDVPD  : SS41I_quaternary_int_avx<0x4B, "vblendvpd", VR128, f128mem,
                                           loadv2f64, int_x86_sse41_blendvpd,
                                           WriteFVarBlend>;
defm VBLENDVPDY : SS41I_quaternary_int_avx<0x4B, "vblendvpd", VR256, f256mem,
                                  loadv4f64, int_x86_avx_blendv_pd_256,
                                  WriteFVarBlend>, VEX_L;
} // ExeDomain = SSEPackedDouble
let ExeDomain = SSEPackedSingle in {
defm VBLENDVPS  : SS41I_quaternary_int_avx<0x4A, "vblendvps", VR128, f128mem,
                                           loadv4f32, int_x86_sse41_blendvps,
                                           WriteFVarBlend>;
defm VBLENDVPSY : SS41I_quaternary_int_avx<0x4A, "vblendvps", VR256, f256mem,
                                  loadv8f32, int_x86_avx_blendv_ps_256,
                                  WriteFVarBlend>, VEX_L;
} // ExeDomain = SSEPackedSingle
defm VPBLENDVB  : SS41I_quaternary_int_avx<0x4C, "vpblendvb", VR128, i128mem,
                                           loadv2i64, int_x86_sse41_pblendvb,
                                           WriteVarBlend>;
}

// The 256-bit byte blend requires AVX2.
let Predicates = [HasAVX2] in {
defm VPBLENDVBY : SS41I_quaternary_int_avx<0x4C, "vpblendvb", VR256, i256mem,
                                      loadv4i64, int_x86_avx2_pblendvb,
                                      WriteVarBlend>, VEX_L;
}
7121
// Select vselect with a variable (non-constant) mask to the AVX blendv
// instructions.  Note the source order in the output pattern is the reverse
// of the vselect operand order.
let Predicates = [HasAVX] in {
  def : Pat<(v16i8 (vselect (v16i8 VR128:$mask), (v16i8 VR128:$src1),
                            (v16i8 VR128:$src2))),
            (VPBLENDVBrr VR128:$src2, VR128:$src1, VR128:$mask)>;
  def : Pat<(v4i32 (vselect (v4i32 VR128:$mask), (v4i32 VR128:$src1),
                            (v4i32 VR128:$src2))),
            (VBLENDVPSrr VR128:$src2, VR128:$src1, VR128:$mask)>;
  def : Pat<(v4f32 (vselect (v4i32 VR128:$mask), (v4f32 VR128:$src1),
                            (v4f32 VR128:$src2))),
            (VBLENDVPSrr VR128:$src2, VR128:$src1, VR128:$mask)>;
  def : Pat<(v2i64 (vselect (v2i64 VR128:$mask), (v2i64 VR128:$src1),
                            (v2i64 VR128:$src2))),
            (VBLENDVPDrr VR128:$src2, VR128:$src1, VR128:$mask)>;
  def : Pat<(v2f64 (vselect (v2i64 VR128:$mask), (v2f64 VR128:$src1),
                            (v2f64 VR128:$src2))),
            (VBLENDVPDrr VR128:$src2, VR128:$src1, VR128:$mask)>;
  def : Pat<(v8i32 (vselect (v8i32 VR256:$mask), (v8i32 VR256:$src1),
                            (v8i32 VR256:$src2))),
            (VBLENDVPSYrr VR256:$src2, VR256:$src1, VR256:$mask)>;
  def : Pat<(v8f32 (vselect (v8i32 VR256:$mask), (v8f32 VR256:$src1),
                            (v8f32 VR256:$src2))),
            (VBLENDVPSYrr VR256:$src2, VR256:$src1, VR256:$mask)>;
  def : Pat<(v4i64 (vselect (v4i64 VR256:$mask), (v4i64 VR256:$src1),
                            (v4i64 VR256:$src2))),
            (VBLENDVPDYrr VR256:$src2, VR256:$src1, VR256:$mask)>;
  def : Pat<(v4f64 (vselect (v4i64 VR256:$mask), (v4f64 VR256:$src1),
                            (v4f64 VR256:$src2))),
            (VBLENDVPDYrr VR256:$src2, VR256:$src1, VR256:$mask)>;
}

let Predicates = [HasAVX2] in {
  def : Pat<(v32i8 (vselect (v32i8 VR256:$mask), (v32i8 VR256:$src1),
                            (v32i8 VR256:$src2))),
            (VPBLENDVBYrr VR256:$src2, VR256:$src1, VR256:$mask)>;
}
7157
// Patterns
// FIXME: Prefer a movss or movsd over a blendps when optimizing for size or
// on targets where they have equal performance. These were changed to use
// blends because blends have better throughput on SandyBridge and Haswell, but
// movs[s/d] are 1-2 byte shorter instructions.
let Predicates = [UseAVX] in {
  let AddedComplexity = 15 in {
  // Move scalar to XMM zero-extended, zeroing a VR128 then do a
  // MOVS{S,D} to the lower bits.
  def : Pat<(v4f32 (X86vzmovl (v4f32 (scalar_to_vector FR32:$src)))),
            (VMOVSSrr (v4f32 (V_SET0)), FR32:$src)>;
  def : Pat<(v4f32 (X86vzmovl (v4f32 VR128:$src))),
            (VBLENDPSrri (v4f32 (V_SET0)), VR128:$src, (i8 1))>;
  def : Pat<(v4i32 (X86vzmovl (v4i32 VR128:$src))),
            (VPBLENDWrri (v4i32 (V_SET0)), VR128:$src, (i8 3))>;
  def : Pat<(v2f64 (X86vzmovl (v2f64 (scalar_to_vector FR64:$src)))),
            (VMOVSDrr (v2f64 (V_SET0)), FR64:$src)>;

  // Move low f32 and clear high bits.
  def : Pat<(v8f32 (X86vzmovl (v8f32 VR256:$src))),
            (VBLENDPSYrri (v8f32 (AVX_SET0)), VR256:$src, (i8 1))>;

  // Move low f64 and clear high bits.
  def : Pat<(v4f64 (X86vzmovl (v4f64 VR256:$src))),
            (VBLENDPDYrri (v4f64 (AVX_SET0)), VR256:$src, (i8 1))>;
  }

  // Zero-extend a scalar into the low element of a zeroed YMM register,
  // working on the XMM half and widening with SUBREG_TO_REG.
  def : Pat<(v8f32 (X86vzmovl (insert_subvector undef,
                   (v4f32 (scalar_to_vector FR32:$src)), (iPTR 0)))),
            (SUBREG_TO_REG (i32 0),
                           (v4f32 (VMOVSSrr (v4f32 (V_SET0)), FR32:$src)),
                           sub_xmm)>;
  def : Pat<(v4f64 (X86vzmovl (insert_subvector undef,
                   (v2f64 (scalar_to_vector FR64:$src)), (iPTR 0)))),
            (SUBREG_TO_REG (i64 0),
                           (v2f64 (VMOVSDrr (v2f64 (V_SET0)), FR64:$src)),
                           sub_xmm)>;

  // These will incur an FP/int domain crossing penalty, but it may be the only
  // way without AVX2. Do not add any complexity because we may be able to match
  // more optimal patterns defined earlier in this file.
  def : Pat<(v8i32 (X86vzmovl (v8i32 VR256:$src))),
            (VBLENDPSYrri (v8i32 (AVX_SET0)), VR256:$src, (i8 1))>;
  def : Pat<(v4i64 (X86vzmovl (v4i64 VR256:$src))),
            (VBLENDPDYrri (v4i64 (AVX_SET0)), VR256:$src, (i8 1))>;
}
7204
// FIXME: Prefer a movss or movsd over a blendps when optimizing for size or
// on targets where they have equal performance. These were changed to use
// blends because blends have better throughput on SandyBridge and Haswell, but
// movs[s/d] are 1-2 byte shorter instructions.
let Predicates = [UseSSE41] in {
  // With SSE41 we can use blends for these patterns.
  // Blend against a zeroed register to keep only the low element.
  def : Pat<(v4f32 (X86vzmovl (v4f32 VR128:$src))),
            (BLENDPSrri (v4f32 (V_SET0)), VR128:$src, (i8 1))>;
  def : Pat<(v4i32 (X86vzmovl (v4i32 VR128:$src))),
            (PBLENDWrri (v4i32 (V_SET0)), VR128:$src, (i8 3))>;
  def : Pat<(v2f64 (X86vzmovl (v2f64 VR128:$src))),
            (BLENDPDrri (v2f64 (V_SET0)), VR128:$src, (i8 1))>;
}
7218
7219
/// SS41I_ternary_int - SSE 4.1 ternary operator
///
/// Legacy SSE4.1 blendv encodings: the blend mask is the implicit XMM0
/// register (Uses = [XMM0]) and $src1 is tied to $dst.
let Uses = [XMM0], Constraints = "$src1 = $dst" in {
  multiclass SS41I_ternary_int<bits<8> opc, string OpcodeStr, PatFrag mem_frag,
                               X86MemOperand x86memop, Intrinsic IntId,
                               OpndItins itins = DEFAULT_ITINS> {
    def rr0 : SS48I<opc, MRMSrcReg, (outs VR128:$dst),
                    (ins VR128:$src1, VR128:$src2),
                    !strconcat(OpcodeStr,
                     "\t{$src2, $dst|$dst, $src2}"),
                    [(set VR128:$dst, (IntId VR128:$src1, VR128:$src2, XMM0))],
                    itins.rr>, Sched<[itins.Sched]>;

    def rm0 : SS48I<opc, MRMSrcMem, (outs VR128:$dst),
                    (ins VR128:$src1, x86memop:$src2),
                    !strconcat(OpcodeStr,
                     "\t{$src2, $dst|$dst, $src2}"),
                    [(set VR128:$dst,
                      (IntId VR128:$src1,
                       (bitconvert (mem_frag addr:$src2)), XMM0))],
                       itins.rm>, Sched<[itins.Sched.Folded, ReadAfterLd]>;
  }
}
7242
// Legacy SSE4.1 blendv instructions using the implicit XMM0 mask.
let ExeDomain = SSEPackedDouble in
defm BLENDVPD : SS41I_ternary_int<0x15, "blendvpd", memopv2f64, f128mem,
                                  int_x86_sse41_blendvpd,
                                  DEFAULT_ITINS_FBLENDSCHED>;
let ExeDomain = SSEPackedSingle in
defm BLENDVPS : SS41I_ternary_int<0x14, "blendvps", memopv4f32, f128mem,
                                  int_x86_sse41_blendvps,
                                  DEFAULT_ITINS_FBLENDSCHED>;
defm PBLENDVB : SS41I_ternary_int<0x10, "pblendvb", memopv2i64, i128mem,
                                  int_x86_sse41_pblendvb,
                                  DEFAULT_ITINS_VARBLENDSCHED>;

// Aliases with the implicit xmm0 argument
def : InstAlias<"blendvpd\t{%xmm0, $src2, $dst|$dst, $src2, xmm0}",
                (BLENDVPDrr0 VR128:$dst, VR128:$src2)>;
def : InstAlias<"blendvpd\t{%xmm0, $src2, $dst|$dst, $src2, xmm0}",
                (BLENDVPDrm0 VR128:$dst, f128mem:$src2)>;
def : InstAlias<"blendvps\t{%xmm0, $src2, $dst|$dst, $src2, xmm0}",
                (BLENDVPSrr0 VR128:$dst, VR128:$src2)>;
def : InstAlias<"blendvps\t{%xmm0, $src2, $dst|$dst, $src2, xmm0}",
                (BLENDVPSrm0 VR128:$dst, f128mem:$src2)>;
def : InstAlias<"pblendvb\t{%xmm0, $src2, $dst|$dst, $src2, xmm0}",
                (PBLENDVBrr0 VR128:$dst, VR128:$src2)>;
def : InstAlias<"pblendvb\t{%xmm0, $src2, $dst|$dst, $src2, xmm0}",
                (PBLENDVBrm0 VR128:$dst, i128mem:$src2)>;

// Select vselect with a variable mask (in XMM0) to the implicit-mask blendv
// forms; as with the AVX patterns above, the source order is reversed.
let Predicates = [UseSSE41] in {
  def : Pat<(v16i8 (vselect (v16i8 XMM0), (v16i8 VR128:$src1),
                            (v16i8 VR128:$src2))),
            (PBLENDVBrr0 VR128:$src2, VR128:$src1)>;
  def : Pat<(v4i32 (vselect (v4i32 XMM0), (v4i32 VR128:$src1),
                            (v4i32 VR128:$src2))),
            (BLENDVPSrr0 VR128:$src2, VR128:$src1)>;
  def : Pat<(v4f32 (vselect (v4i32 XMM0), (v4f32 VR128:$src1),
                            (v4f32 VR128:$src2))),
            (BLENDVPSrr0 VR128:$src2, VR128:$src1)>;
  def : Pat<(v2i64 (vselect (v2i64 XMM0), (v2i64 VR128:$src1),
                            (v2i64 VR128:$src2))),
            (BLENDVPDrr0 VR128:$src2, VR128:$src1)>;
  def : Pat<(v2f64 (vselect (v2i64 XMM0), (v2f64 VR128:$src1),
                            (v2f64 VR128:$src2))),
            (BLENDVPDrr0 VR128:$src2, VR128:$src1)>;
}
7286
// MOVNTDQA: non-temporal (streaming) 128/256-bit loads, selected only via
// the intrinsic so ordinary loads are unaffected.
let SchedRW = [WriteLoad] in {
let Predicates = [HasAVX] in
def VMOVNTDQArm : SS48I<0x2A, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src),
                       "vmovntdqa\t{$src, $dst|$dst, $src}",
                       [(set VR128:$dst, (int_x86_sse41_movntdqa addr:$src))]>,
                       VEX;
let Predicates = [HasAVX2] in
def VMOVNTDQAYrm : SS48I<0x2A, MRMSrcMem, (outs VR256:$dst), (ins i256mem:$src),
                         "vmovntdqa\t{$src, $dst|$dst, $src}",
                         [(set VR256:$dst, (int_x86_avx2_movntdqa addr:$src))]>,
                         VEX, VEX_L;
def MOVNTDQArm : SS48I<0x2A, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src),
                       "movntdqa\t{$src, $dst|$dst, $src}",
                       [(set VR128:$dst, (int_x86_sse41_movntdqa addr:$src))]>;
} // SchedRW
7302
7303//===----------------------------------------------------------------------===//
7304// SSE4.2 - Compare Instructions
7305//===----------------------------------------------------------------------===//
7306
7307/// SS42I_binop_rm - Simple SSE 4.2 binary operator
7308multiclass SS42I_binop_rm<bits<8> opc, string OpcodeStr, SDNode OpNode,
7309                          ValueType OpVT, RegisterClass RC, PatFrag memop_frag,
7310                          X86MemOperand x86memop, bit Is2Addr = 1> {
7311  def rr : SS428I<opc, MRMSrcReg, (outs RC:$dst),
7312       (ins RC:$src1, RC:$src2),
7313       !if(Is2Addr,
7314           !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
7315           !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
7316       [(set RC:$dst, (OpVT (OpNode RC:$src1, RC:$src2)))]>;
7317  def rm : SS428I<opc, MRMSrcMem, (outs RC:$dst),
7318       (ins RC:$src1, x86memop:$src2),
7319       !if(Is2Addr,
7320           !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
7321           !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
7322       [(set RC:$dst,
7323         (OpVT (OpNode RC:$src1, (memop_frag addr:$src2))))]>;
7324}
7325
7326let Predicates = [HasAVX] in
7327  defm VPCMPGTQ : SS42I_binop_rm<0x37, "vpcmpgtq", X86pcmpgt, v2i64, VR128,
7328                                 loadv2i64, i128mem, 0>, VEX_4V;
7329
7330let Predicates = [HasAVX2] in
7331  defm VPCMPGTQY : SS42I_binop_rm<0x37, "vpcmpgtq", X86pcmpgt, v4i64, VR256,
7332                                  loadv4i64, i256mem, 0>, VEX_4V, VEX_L;
7333
7334let Constraints = "$src1 = $dst" in
7335  defm PCMPGTQ : SS42I_binop_rm<0x37, "pcmpgtq", X86pcmpgt, v2i64, VR128,
7336                                memopv2i64, i128mem>;
7337
7338//===----------------------------------------------------------------------===//
7339// SSE4.2 - String/text Processing Instructions
7340//===----------------------------------------------------------------------===//
7341
7342// Packed Compare Implicit Length Strings, Return Mask
7343multiclass pseudo_pcmpistrm<string asm, PatFrag ld_frag> {
7344  def REG : PseudoI<(outs VR128:$dst),
7345                    (ins VR128:$src1, VR128:$src2, u8imm:$src3),
7346    [(set VR128:$dst, (int_x86_sse42_pcmpistrm128 VR128:$src1, VR128:$src2,
7347                                                  imm:$src3))]>;
7348  def MEM : PseudoI<(outs VR128:$dst),
7349                    (ins VR128:$src1, i128mem:$src2, u8imm:$src3),
7350    [(set VR128:$dst, (int_x86_sse42_pcmpistrm128 VR128:$src1,
7351                       (bc_v16i8 (ld_frag addr:$src2)), imm:$src3))]>;
7352}
7353
7354let Defs = [EFLAGS], usesCustomInserter = 1 in {
7355  defm VPCMPISTRM128 : pseudo_pcmpistrm<"#VPCMPISTRM128", loadv2i64>,
7356                         Requires<[HasAVX]>;
7357  defm PCMPISTRM128 : pseudo_pcmpistrm<"#PCMPISTRM128", memopv2i64>,
7358                         Requires<[UseSSE42]>;
7359}
7360
// Real pcmpistrm encodings.  No patterns here (selection goes through the
// pseudos above); the result mask is the implicit XMM0 output.
multiclass pcmpistrm_SS42AI<string asm> {
  def rr : SS42AI<0x62, MRMSrcReg, (outs),
    (ins VR128:$src1, VR128:$src2, u8imm:$src3),
    !strconcat(asm, "\t{$src3, $src2, $src1|$src1, $src2, $src3}"),
    []>, Sched<[WritePCmpIStrM]>;
  let mayLoad = 1 in
  def rm :SS42AI<0x62, MRMSrcMem, (outs),
    (ins VR128:$src1, i128mem:$src2, u8imm:$src3),
    !strconcat(asm, "\t{$src3, $src2, $src1|$src1, $src2, $src3}"),
    []>, Sched<[WritePCmpIStrMLd, ReadAfterLd]>;
}

let Defs = [XMM0, EFLAGS], hasSideEffects = 0 in {
  let Predicates = [HasAVX] in
  defm VPCMPISTRM128 : pcmpistrm_SS42AI<"vpcmpistrm">, VEX;
  defm PCMPISTRM128  : pcmpistrm_SS42AI<"pcmpistrm"> ;
}
7378
// Packed Compare Explicit Length Strings, Return Mask
//
// Pseudos for the pcmpestrm intrinsic.  Explicit-length string compares
// also read the string lengths from EAX and EDX (implicit uses at the
// instantiation below); the $src3/$src5 operand numbering leaves gaps
// for those implicit operands.
multiclass pseudo_pcmpestrm<string asm, PatFrag ld_frag> {
  def REG : PseudoI<(outs VR128:$dst),
                    (ins VR128:$src1, VR128:$src3, u8imm:$src5),
    [(set VR128:$dst, (int_x86_sse42_pcmpestrm128
                       VR128:$src1, EAX, VR128:$src3, EDX, imm:$src5))]>;
  // Memory form: load via ld_frag, bitcast to v16i8 for the intrinsic.
  def MEM : PseudoI<(outs VR128:$dst),
                    (ins VR128:$src1, i128mem:$src3, u8imm:$src5),
    [(set VR128:$dst, (int_x86_sse42_pcmpestrm128 VR128:$src1, EAX,
                       (bc_v16i8 (ld_frag addr:$src3)), EDX, imm:$src5))]>;
}

// Expanded by the custom inserter into the real instruction; clobbers
// EFLAGS and reads the lengths from EAX/EDX.
let Defs = [EFLAGS], Uses = [EAX, EDX], usesCustomInserter = 1 in {
  defm VPCMPESTRM128 : pseudo_pcmpestrm<"#VPCMPESTRM128", loadv2i64>,
                         Requires<[HasAVX]>;
  defm PCMPESTRM128 : pseudo_pcmpestrm<"#PCMPESTRM128", memopv2i64>,
                         Requires<[UseSSE42]>;
}
7397
// Encoding-only definitions for (v)pcmpestrm; no ISel patterns (the
// pseudos above handle selection).  XMM0/EFLAGS defs and EAX/EDX uses
// are attached at the instantiation site below.
multiclass SS42AI_pcmpestrm<string asm> {
  def rr : SS42AI<0x60, MRMSrcReg, (outs),
    (ins VR128:$src1, VR128:$src3, u8imm:$src5),
    !strconcat(asm, "\t{$src5, $src3, $src1|$src1, $src3, $src5}"),
    []>, Sched<[WritePCmpEStrM]>;
  let mayLoad = 1 in
  def rm : SS42AI<0x60, MRMSrcMem, (outs),
    (ins VR128:$src1, i128mem:$src3, u8imm:$src5),
    !strconcat(asm, "\t{$src5, $src3, $src1|$src1, $src3, $src5}"),
    []>, Sched<[WritePCmpEStrMLd, ReadAfterLd]>;
}
7409
// Real (encoded) pcmpestrm instructions: mask implicitly written to
// XMM0, EFLAGS set, string lengths read from EAX/EDX.
let Defs = [XMM0, EFLAGS], Uses = [EAX, EDX], hasSideEffects = 0 in {
  let Predicates = [HasAVX] in
  defm VPCMPESTRM128 : SS42AI_pcmpestrm<"vpcmpestrm">, VEX;
  defm PCMPESTRM128  : SS42AI_pcmpestrm<"pcmpestrm">;
}
7415
// Packed Compare Implicit Length Strings, Return Index
//
// Pseudos matching the X86pcmpistri node: result index lands in a GR32
// and EFLAGS is produced as a second result.  Expanded by the custom
// inserter into the real PCMPISTRI, which implicitly writes ECX.
multiclass pseudo_pcmpistri<string asm, PatFrag ld_frag> {
  def REG : PseudoI<(outs GR32:$dst),
                    (ins VR128:$src1, VR128:$src2, u8imm:$src3),
    [(set GR32:$dst, EFLAGS,
      (X86pcmpistri VR128:$src1, VR128:$src2, imm:$src3))]>;
  // Memory form: load via ld_frag and bitcast to v16i8.
  def MEM : PseudoI<(outs GR32:$dst),
                    (ins VR128:$src1, i128mem:$src2, u8imm:$src3),
    [(set GR32:$dst, EFLAGS, (X86pcmpistri VR128:$src1,
                              (bc_v16i8 (ld_frag addr:$src2)), imm:$src3))]>;
}

// AVX form uses loadv2i64; legacy SSE4.2 form uses memopv2i64.
let Defs = [EFLAGS], usesCustomInserter = 1 in {
  defm VPCMPISTRI : pseudo_pcmpistri<"#VPCMPISTRI", loadv2i64>,
                      Requires<[HasAVX]>;
  defm PCMPISTRI  : pseudo_pcmpistri<"#PCMPISTRI", memopv2i64>,
                      Requires<[UseSSE42]>;
}
7434
// Encoding-only definitions for (v)pcmpistri; no ISel patterns
// (selection is done via the pseudos above).
multiclass SS42AI_pcmpistri<string asm> {
  def rr : SS42AI<0x63, MRMSrcReg, (outs),
    (ins VR128:$src1, VR128:$src2, u8imm:$src3),
    !strconcat(asm, "\t{$src3, $src2, $src1|$src1, $src2, $src3}"),
    []>, Sched<[WritePCmpIStrI]>;
  let mayLoad = 1 in
  def rm : SS42AI<0x63, MRMSrcMem, (outs),
    (ins VR128:$src1, i128mem:$src2, u8imm:$src3),
    !strconcat(asm, "\t{$src3, $src2, $src1|$src1, $src2, $src3}"),
    []>, Sched<[WritePCmpIStrILd, ReadAfterLd]>;
}

// The real instructions implicitly define ECX (the index) and EFLAGS.
let Defs = [ECX, EFLAGS], hasSideEffects = 0 in {
  let Predicates = [HasAVX] in
  defm VPCMPISTRI : SS42AI_pcmpistri<"vpcmpistri">, VEX;
  defm PCMPISTRI  : SS42AI_pcmpistri<"pcmpistri">;
}
7452
// Packed Compare Explicit Length Strings, Return Index
//
// Pseudos matching X86pcmpestri; lengths come from EAX/EDX and the
// result index is a GR32 plus EFLAGS.  Expanded by the custom inserter
// into the real PCMPESTRI, which implicitly writes ECX.
multiclass pseudo_pcmpestri<string asm, PatFrag ld_frag> {
  def REG : PseudoI<(outs GR32:$dst),
                    (ins VR128:$src1, VR128:$src3, u8imm:$src5),
    [(set GR32:$dst, EFLAGS,
      (X86pcmpestri VR128:$src1, EAX, VR128:$src3, EDX, imm:$src5))]>;
  // Memory form: load via ld_frag, bitcast to v16i8.
  def MEM : PseudoI<(outs GR32:$dst),
                    (ins VR128:$src1, i128mem:$src3, u8imm:$src5),
    [(set GR32:$dst, EFLAGS,
      (X86pcmpestri VR128:$src1, EAX, (bc_v16i8 (ld_frag addr:$src3)), EDX,
       imm:$src5))]>;
}

// AVX form uses loadv2i64; legacy SSE4.2 form uses memopv2i64.
let Defs = [EFLAGS], Uses = [EAX, EDX], usesCustomInserter = 1 in {
  defm VPCMPESTRI : pseudo_pcmpestri<"#VPCMPESTRI", loadv2i64>,
                      Requires<[HasAVX]>;
  defm PCMPESTRI  : pseudo_pcmpestri<"#PCMPESTRI", memopv2i64>,
                      Requires<[UseSSE42]>;
}
7472
// Encoding-only definitions for (v)pcmpestri; no ISel patterns
// (selection is done via the pseudos above).
multiclass SS42AI_pcmpestri<string asm> {
  def rr : SS42AI<0x61, MRMSrcReg, (outs),
    (ins VR128:$src1, VR128:$src3, u8imm:$src5),
    !strconcat(asm, "\t{$src5, $src3, $src1|$src1, $src3, $src5}"),
    []>, Sched<[WritePCmpEStrI]>;
  let mayLoad = 1 in
  def rm : SS42AI<0x61, MRMSrcMem, (outs),
    (ins VR128:$src1, i128mem:$src3, u8imm:$src5),
    !strconcat(asm, "\t{$src5, $src3, $src1|$src1, $src3, $src5}"),
    []>, Sched<[WritePCmpEStrILd, ReadAfterLd]>;
}

// Implicitly defines ECX (index) and EFLAGS; reads lengths from EAX/EDX.
let Defs = [ECX, EFLAGS], Uses = [EAX, EDX], hasSideEffects = 0 in {
  let Predicates = [HasAVX] in
  defm VPCMPESTRI : SS42AI_pcmpestri<"vpcmpestri">, VEX;
  defm PCMPESTRI  : SS42AI_pcmpestri<"pcmpestri">;
}
7490
7491//===----------------------------------------------------------------------===//
7492// SSE4.2 - CRC Instructions
7493//===----------------------------------------------------------------------===//
7494
// No CRC instructions have AVX equivalents

// crc intrinsic instruction
// This set of instructions are only rm, the only difference is the size
// of r and m.

// Register-register CRC32 accumulate: $dst = Int($src1, $src2); $src1 is
// tied to $dst at the instantiation site ("$src1 = $dst").
// NOTE(review): scheduling reuses the WriteFAdd SchedWrite — there is no
// dedicated CRC32 scheduling class defined here; confirm this is intended.
class SS42I_crc32r<bits<8> opc, string asm, RegisterClass RCOut,
                   RegisterClass RCIn, SDPatternOperator Int> :
  SS42FI<opc, MRMSrcReg, (outs RCOut:$dst), (ins RCOut:$src1, RCIn:$src2),
         !strconcat(asm, "\t{$src2, $src1|$src1, $src2}"),
         [(set RCOut:$dst, (Int RCOut:$src1, RCIn:$src2))], IIC_CRC32_REG>,
         Sched<[WriteFAdd]>;

// Register-memory CRC32 accumulate: second operand loaded with 'load'.
class SS42I_crc32m<bits<8> opc, string asm, RegisterClass RCOut,
                   X86MemOperand x86memop, SDPatternOperator Int> :
  SS42FI<opc, MRMSrcMem, (outs RCOut:$dst), (ins RCOut:$src1, x86memop:$src2),
         !strconcat(asm, "\t{$src2, $src1|$src1, $src2}"),
         [(set RCOut:$dst, (Int RCOut:$src1, (load addr:$src2)))],
         IIC_CRC32_MEM>, Sched<[WriteFAddLd, ReadAfterLd]>;
7513
// CRC32 accumulates into its destination, so the first source is tied to
// the destination register.  Opcode 0xF0 is the 8-bit-source form, 0xF1
// the 16/32/64-bit forms (distinguished by operand-size/REX.W prefixes).
let Constraints = "$src1 = $dst" in {
  def CRC32r32m8  : SS42I_crc32m<0xF0, "crc32{b}", GR32, i8mem,
                                 int_x86_sse42_crc32_32_8>;
  def CRC32r32r8  : SS42I_crc32r<0xF0, "crc32{b}", GR32, GR8,
                                 int_x86_sse42_crc32_32_8>;
  def CRC32r32m16 : SS42I_crc32m<0xF1, "crc32{w}", GR32, i16mem,
                                 int_x86_sse42_crc32_32_16>, OpSize16;
  def CRC32r32r16 : SS42I_crc32r<0xF1, "crc32{w}", GR32, GR16,
                                 int_x86_sse42_crc32_32_16>, OpSize16;
  def CRC32r32m32 : SS42I_crc32m<0xF1, "crc32{l}", GR32, i32mem,
                                 int_x86_sse42_crc32_32_32>, OpSize32;
  def CRC32r32r32 : SS42I_crc32r<0xF1, "crc32{l}", GR32, GR32,
                                 int_x86_sse42_crc32_32_32>, OpSize32;
  def CRC32r64m64 : SS42I_crc32m<0xF1, "crc32{q}", GR64, i64mem,
                                 int_x86_sse42_crc32_64_64>, REX_W;
  def CRC32r64r64 : SS42I_crc32r<0xF1, "crc32{q}", GR64, GR64,
                                 int_x86_sse42_crc32_64_64>, REX_W;
  // 64-bit destination with an 8-bit source has no intrinsic; modeled
  // with null_frag (no pattern) and explicit hasSideEffects/mayLoad.
  let hasSideEffects = 0 in {
    let mayLoad = 1 in
    def CRC32r64m8 : SS42I_crc32m<0xF0, "crc32{b}", GR64, i8mem,
                                   null_frag>, REX_W;
    def CRC32r64r8 : SS42I_crc32r<0xF0, "crc32{b}", GR64, GR8,
                                   null_frag>, REX_W;
  }
}
7539
7540//===----------------------------------------------------------------------===//
7541// SHA-NI Instructions
7542//===----------------------------------------------------------------------===//
7543
// Two-operand SHA instruction helper producing rr and rm forms.  When
// UsesXMM0 is set, the intrinsic takes XMM0 as an extra operand (the
// sha256rnds2 case; the implicit XMM0 use is added at the call site).
// Memory operands are loaded as v2i64 and bitcast to v4i32 for the
// intrinsics.
multiclass SHAI_binop<bits<8> Opc, string OpcodeStr, Intrinsic IntId,
                      bit UsesXMM0 = 0> {
  def rr : I<Opc, MRMSrcReg, (outs VR128:$dst),
             (ins VR128:$src1, VR128:$src2),
             !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
             [!if(UsesXMM0,
                  (set VR128:$dst, (IntId VR128:$src1, VR128:$src2, XMM0)),
                  (set VR128:$dst, (IntId VR128:$src1, VR128:$src2)))]>, T8;

  def rm : I<Opc, MRMSrcMem, (outs VR128:$dst),
             (ins VR128:$src1, i128mem:$src2),
             !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
             [!if(UsesXMM0,
                  (set VR128:$dst, (IntId VR128:$src1,
                    (bc_v4i32 (memopv2i64 addr:$src2)), XMM0)),
                  (set VR128:$dst, (IntId VR128:$src1,
                    (bc_v4i32 (memopv2i64 addr:$src2)))))]>, T8;
}
7562
// SHA-NI instructions.  All are destructive (first source tied to the
// destination).  sha1rnds4 additionally takes an immediate round
// selector, so it is defined directly rather than via SHAI_binop.
let Constraints = "$src1 = $dst", Predicates = [HasSHA] in {
  def SHA1RNDS4rri : Ii8<0xCC, MRMSrcReg, (outs VR128:$dst),
                         (ins VR128:$src1, VR128:$src2, u8imm:$src3),
                         "sha1rnds4\t{$src3, $src2, $dst|$dst, $src2, $src3}",
                         [(set VR128:$dst,
                           (int_x86_sha1rnds4 VR128:$src1, VR128:$src2,
                            (i8 imm:$src3)))]>, TA;
  def SHA1RNDS4rmi : Ii8<0xCC, MRMSrcMem, (outs VR128:$dst),
                         (ins VR128:$src1, i128mem:$src2, u8imm:$src3),
                         "sha1rnds4\t{$src3, $src2, $dst|$dst, $src2, $src3}",
                         [(set VR128:$dst,
                           (int_x86_sha1rnds4 VR128:$src1,
                            (bc_v4i32 (memopv2i64 addr:$src2)),
                            (i8 imm:$src3)))]>, TA;

  defm SHA1NEXTE : SHAI_binop<0xC8, "sha1nexte", int_x86_sha1nexte>;
  defm SHA1MSG1  : SHAI_binop<0xC9, "sha1msg1", int_x86_sha1msg1>;
  defm SHA1MSG2  : SHAI_binop<0xCA, "sha1msg2", int_x86_sha1msg2>;

  // sha256rnds2 has an implicit third operand in XMM0.
  let Uses=[XMM0] in
  defm SHA256RNDS2 : SHAI_binop<0xCB, "sha256rnds2", int_x86_sha256rnds2, 1>;

  defm SHA256MSG1 : SHAI_binop<0xCC, "sha256msg1", int_x86_sha256msg1>;
  defm SHA256MSG2 : SHAI_binop<0xCD, "sha256msg2", int_x86_sha256msg2>;
}
7588
// Aliases with explicit %xmm0
// Let the assembler accept the spelling that names the implicit XMM0
// operand explicitly, mapping it onto the two-operand instructions above.
def : InstAlias<"sha256rnds2\t{%xmm0, $src2, $dst|$dst, $src2, xmm0}",
                (SHA256RNDS2rr VR128:$dst, VR128:$src2)>;
def : InstAlias<"sha256rnds2\t{%xmm0, $src2, $dst|$dst, $src2, xmm0}",
                (SHA256RNDS2rm VR128:$dst, i128mem:$src2)>;
7594
7595//===----------------------------------------------------------------------===//
7596// AES-NI Instructions
7597//===----------------------------------------------------------------------===//
7598
// AES binary op (rr and rm forms).  Is2Addr selects the two-operand
// legacy SSE assembly string (destructive, $src1 tied to $dst at the
// instantiation site) vs. the three-operand AVX string.
multiclass AESI_binop_rm_int<bits<8> opc, string OpcodeStr, Intrinsic IntId128,
                             PatFrag ld_frag, bit Is2Addr = 1> {
  def rr : AES8I<opc, MRMSrcReg, (outs VR128:$dst),
       (ins VR128:$src1, VR128:$src2),
       !if(Is2Addr,
           !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
           !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
       [(set VR128:$dst, (IntId128 VR128:$src1, VR128:$src2))]>,
       Sched<[WriteAESDecEnc]>;
  def rm : AES8I<opc, MRMSrcMem, (outs VR128:$dst),
       (ins VR128:$src1, i128mem:$src2),
       !if(Is2Addr,
           !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
           !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
       [(set VR128:$dst,
         (IntId128 VR128:$src1, (ld_frag addr:$src2)))]>,
       Sched<[WriteAESDecEncLd, ReadAfterLd]>;
}
7617
// Perform One Round of an AES Encryption/Decryption Flow

// VEX-encoded forms: three-operand (Is2Addr = 0), unaligned loads OK.
let Predicates = [HasAVX, HasAES] in {
  defm VAESENC          : AESI_binop_rm_int<0xDC, "vaesenc",
                         int_x86_aesni_aesenc, loadv2i64, 0>, VEX_4V;
  defm VAESENCLAST      : AESI_binop_rm_int<0xDD, "vaesenclast",
                         int_x86_aesni_aesenclast, loadv2i64, 0>, VEX_4V;
  defm VAESDEC          : AESI_binop_rm_int<0xDE, "vaesdec",
                         int_x86_aesni_aesdec, loadv2i64, 0>, VEX_4V;
  defm VAESDECLAST      : AESI_binop_rm_int<0xDF, "vaesdeclast",
                         int_x86_aesni_aesdeclast, loadv2i64, 0>, VEX_4V;
}

// Legacy SSE forms: destructive two-operand encoding.
let Constraints = "$src1 = $dst" in {
  defm AESENC          : AESI_binop_rm_int<0xDC, "aesenc",
                         int_x86_aesni_aesenc, memopv2i64>;
  defm AESENCLAST      : AESI_binop_rm_int<0xDD, "aesenclast",
                         int_x86_aesni_aesenclast, memopv2i64>;
  defm AESDEC          : AESI_binop_rm_int<0xDE, "aesdec",
                         int_x86_aesni_aesdec, memopv2i64>;
  defm AESDECLAST      : AESI_binop_rm_int<0xDF, "aesdeclast",
                         int_x86_aesni_aesdeclast, memopv2i64>;
}
7640
// Perform the AES InvMixColumn Transformation
// Unary op: one source (register or memory), no tied operands.
let Predicates = [HasAVX, HasAES] in {
  def VAESIMCrr : AES8I<0xDB, MRMSrcReg, (outs VR128:$dst),
      (ins VR128:$src1),
      "vaesimc\t{$src1, $dst|$dst, $src1}",
      [(set VR128:$dst,
        (int_x86_aesni_aesimc VR128:$src1))]>, Sched<[WriteAESIMC]>,
      VEX;
  def VAESIMCrm : AES8I<0xDB, MRMSrcMem, (outs VR128:$dst),
      (ins i128mem:$src1),
      "vaesimc\t{$src1, $dst|$dst, $src1}",
      [(set VR128:$dst, (int_x86_aesni_aesimc (loadv2i64 addr:$src1)))]>,
      Sched<[WriteAESIMCLd]>, VEX;
}
// Legacy SSE forms (memop = alignment-requiring load).
def AESIMCrr : AES8I<0xDB, MRMSrcReg, (outs VR128:$dst),
  (ins VR128:$src1),
  "aesimc\t{$src1, $dst|$dst, $src1}",
  [(set VR128:$dst,
    (int_x86_aesni_aesimc VR128:$src1))]>, Sched<[WriteAESIMC]>;
def AESIMCrm : AES8I<0xDB, MRMSrcMem, (outs VR128:$dst),
  (ins i128mem:$src1),
  "aesimc\t{$src1, $dst|$dst, $src1}",
  [(set VR128:$dst, (int_x86_aesni_aesimc (memopv2i64 addr:$src1)))]>,
  Sched<[WriteAESIMCLd]>;
7665
// AES Round Key Generation Assist
// Unary op with an 8-bit immediate (the round constant operand).
let Predicates = [HasAVX, HasAES] in {
  def VAESKEYGENASSIST128rr : AESAI<0xDF, MRMSrcReg, (outs VR128:$dst),
      (ins VR128:$src1, u8imm:$src2),
      "vaeskeygenassist\t{$src2, $src1, $dst|$dst, $src1, $src2}",
      [(set VR128:$dst,
        (int_x86_aesni_aeskeygenassist VR128:$src1, imm:$src2))]>,
      Sched<[WriteAESKeyGen]>, VEX;
  def VAESKEYGENASSIST128rm : AESAI<0xDF, MRMSrcMem, (outs VR128:$dst),
      (ins i128mem:$src1, u8imm:$src2),
      "vaeskeygenassist\t{$src2, $src1, $dst|$dst, $src1, $src2}",
      [(set VR128:$dst,
        (int_x86_aesni_aeskeygenassist (loadv2i64 addr:$src1), imm:$src2))]>,
      Sched<[WriteAESKeyGenLd]>, VEX;
}
// Legacy SSE forms.
def AESKEYGENASSIST128rr : AESAI<0xDF, MRMSrcReg, (outs VR128:$dst),
  (ins VR128:$src1, u8imm:$src2),
  "aeskeygenassist\t{$src2, $src1, $dst|$dst, $src1, $src2}",
  [(set VR128:$dst,
    (int_x86_aesni_aeskeygenassist VR128:$src1, imm:$src2))]>,
  Sched<[WriteAESKeyGen]>;
def AESKEYGENASSIST128rm : AESAI<0xDF, MRMSrcMem, (outs VR128:$dst),
  (ins i128mem:$src1, u8imm:$src2),
  "aeskeygenassist\t{$src2, $src1, $dst|$dst, $src1, $src2}",
  [(set VR128:$dst,
    (int_x86_aesni_aeskeygenassist (memopv2i64 addr:$src1), imm:$src2))]>,
  Sched<[WriteAESKeyGenLd]>;
7693
7694//===----------------------------------------------------------------------===//
7695// PCLMUL Instructions
7696//===----------------------------------------------------------------------===//
7697
// AVX carry-less Multiplication instructions
// Three-operand VEX forms; the immediate selects which 64-bit halves of
// the two sources are multiplied.
let isCommutable = 1 in
def VPCLMULQDQrr : AVXPCLMULIi8<0x44, MRMSrcReg, (outs VR128:$dst),
           (ins VR128:$src1, VR128:$src2, u8imm:$src3),
           "vpclmulqdq\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
           [(set VR128:$dst,
             (int_x86_pclmulqdq VR128:$src1, VR128:$src2, imm:$src3))]>,
           Sched<[WriteCLMul]>;

def VPCLMULQDQrm : AVXPCLMULIi8<0x44, MRMSrcMem, (outs VR128:$dst),
           (ins VR128:$src1, i128mem:$src2, u8imm:$src3),
           "vpclmulqdq\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
           [(set VR128:$dst, (int_x86_pclmulqdq VR128:$src1,
                              (loadv2i64 addr:$src2), imm:$src3))]>,
           Sched<[WriteCLMulLd, ReadAfterLd]>;
7713
// Carry-less Multiplication instructions
// Legacy SSE forms: destructive two-operand encoding ($src1 tied to
// $dst); memory operand must satisfy memopv2i64.
let Constraints = "$src1 = $dst" in {
let isCommutable = 1 in
def PCLMULQDQrr : PCLMULIi8<0x44, MRMSrcReg, (outs VR128:$dst),
           (ins VR128:$src1, VR128:$src2, u8imm:$src3),
           "pclmulqdq\t{$src3, $src2, $dst|$dst, $src2, $src3}",
           [(set VR128:$dst,
             (int_x86_pclmulqdq VR128:$src1, VR128:$src2, imm:$src3))],
             IIC_SSE_PCLMULQDQ_RR>, Sched<[WriteCLMul]>;

def PCLMULQDQrm : PCLMULIi8<0x44, MRMSrcMem, (outs VR128:$dst),
           (ins VR128:$src1, i128mem:$src2, u8imm:$src3),
           "pclmulqdq\t{$src3, $src2, $dst|$dst, $src2, $src3}",
           [(set VR128:$dst, (int_x86_pclmulqdq VR128:$src1,
                              (memopv2i64 addr:$src2), imm:$src3))],
                              IIC_SSE_PCLMULQDQ_RM>,
           Sched<[WriteCLMulLd, ReadAfterLd]>;
} // Constraints = "$src1 = $dst"
7732
7733
// Assembler aliases that spell the immediate half-selection in the
// mnemonic, e.g. "pclmulhqlqdq" = pclmulqdq with immop encoding which
// quadword of each source is used (instantiations below).
multiclass pclmul_alias<string asm, int immop> {
  def : InstAlias<!strconcat("pclmul", asm, "dq {$src, $dst|$dst, $src}"),
                  (PCLMULQDQrr VR128:$dst, VR128:$src, immop), 0>;

  def : InstAlias<!strconcat("pclmul", asm, "dq {$src, $dst|$dst, $src}"),
                  (PCLMULQDQrm VR128:$dst, i128mem:$src, immop), 0>;

  def : InstAlias<!strconcat("vpclmul", asm,
                             "dq {$src2, $src1, $dst|$dst, $src1, $src2}"),
                  (VPCLMULQDQrr VR128:$dst, VR128:$src1, VR128:$src2, immop),
                  0>;

  def : InstAlias<!strconcat("vpclmul", asm,
                             "dq {$src2, $src1, $dst|$dst, $src1, $src2}"),
                  (VPCLMULQDQrm VR128:$dst, VR128:$src1, i128mem:$src2, immop),
                  0>;
}
defm : pclmul_alias<"hqhq", 0x11>;
defm : pclmul_alias<"hqlq", 0x01>;
defm : pclmul_alias<"lqhq", 0x10>;
defm : pclmul_alias<"lqlq", 0x00>;
7755
7756//===----------------------------------------------------------------------===//
7757// SSE4A Instructions
7758//===----------------------------------------------------------------------===//
7759
// SSE4A (AMD) instructions: extrq/insertq bit-field extract/insert and
// the movntss/movntsd scalar non-temporal stores.
let Predicates = [HasSSE4A] in {

// extrq/insertq are destructive: the source register is also the result.
let Constraints = "$src = $dst" in {
def EXTRQI : Ii8<0x78, MRMXr, (outs VR128:$dst),
                 (ins VR128:$src, u8imm:$len, u8imm:$idx),
                 "extrq\t{$idx, $len, $src|$src, $len, $idx}",
                 [(set VR128:$dst, (X86extrqi VR128:$src, imm:$len,
                                    imm:$idx))]>, PD;
// Register form: length/index packed in the $mask register operand.
def EXTRQ  : I<0x79, MRMSrcReg, (outs VR128:$dst),
              (ins VR128:$src, VR128:$mask),
              "extrq\t{$mask, $src|$src, $mask}",
              [(set VR128:$dst, (int_x86_sse4a_extrq VR128:$src,
                                 VR128:$mask))]>, PD;

def INSERTQI : Ii8<0x78, MRMSrcReg, (outs VR128:$dst),
                   (ins VR128:$src, VR128:$src2, u8imm:$len, u8imm:$idx),
                   "insertq\t{$idx, $len, $src2, $src|$src, $src2, $len, $idx}",
                   [(set VR128:$dst, (X86insertqi VR128:$src, VR128:$src2,
                                      imm:$len, imm:$idx))]>, XD;
def INSERTQ  : I<0x79, MRMSrcReg, (outs VR128:$dst),
                 (ins VR128:$src, VR128:$mask),
                 "insertq\t{$mask, $src|$src, $mask}",
                 [(set VR128:$dst, (int_x86_sse4a_insertq VR128:$src,
                                    VR128:$mask))]>, XD;
}

// Scalar non-temporal stores of the low f32/f64 element.
def MOVNTSS : I<0x2B, MRMDestMem, (outs), (ins f32mem:$dst, VR128:$src),
                "movntss\t{$src, $dst|$dst, $src}",
                [(int_x86_sse4a_movnt_ss addr:$dst, VR128:$src)]>, XS;

def MOVNTSD : I<0x2B, MRMDestMem, (outs), (ins f64mem:$dst, VR128:$src),
                "movntsd\t{$src, $dst|$dst, $src}",
                [(int_x86_sse4a_movnt_sd addr:$dst, VR128:$src)]>, XD;
}
7794
7795//===----------------------------------------------------------------------===//
7796// AVX Instructions
7797//===----------------------------------------------------------------------===//
7798
7799//===----------------------------------------------------------------------===//
7800// VBROADCAST - Load from memory and broadcast to all elements of the
7801//              destination operand
7802//
// Memory-source broadcast: load one element with ld_frag and splat it to
// every element of the destination (X86VBroadcast).
class avx_broadcast_rm<bits<8> opc, string OpcodeStr, RegisterClass RC,
                           X86MemOperand x86memop, ValueType VT,
                           PatFrag ld_frag, SchedWrite Sched> :
  AVX8I<opc, MRMSrcMem, (outs RC:$dst), (ins x86memop:$src),
        !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
        [(set RC:$dst, (VT (X86VBroadcast (ld_frag addr:$src))))]>,
        Sched<[Sched]>, VEX {
    let mayLoad = 1;
}

// AVX2 adds register forms
// Register-source broadcast: splat the low element of an XMM register.
class avx2_broadcast_rr<bits<8> opc, string OpcodeStr, RegisterClass RC,
                        ValueType ResVT, ValueType OpVT, SchedWrite Sched> :
  AVX28I<opc, MRMSrcReg, (outs RC:$dst), (ins VR128:$src),
         !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
         [(set RC:$dst, (ResVT (X86VBroadcast (OpVT VR128:$src))))]>,
         Sched<[Sched]>, VEX;
7820
// vbroadcastss/vbroadcastsd, memory-source forms (AVX) ...
let ExeDomain = SSEPackedSingle in {
  def VBROADCASTSSrm  : avx_broadcast_rm<0x18, "vbroadcastss", VR128,
                                             f32mem, v4f32, loadf32, WriteLoad>;
  def VBROADCASTSSYrm : avx_broadcast_rm<0x18, "vbroadcastss", VR256,
                                             f32mem, v8f32, loadf32,
                                             WriteFShuffleLd>, VEX_L;
}
// vbroadcastsd only has a 256-bit destination form.
let ExeDomain = SSEPackedDouble in
def VBROADCASTSDYrm  : avx_broadcast_rm<0x19, "vbroadcastsd", VR256, f64mem,
                                    v4f64, loadf64, WriteFShuffleLd>, VEX_L;

// ... and register-source forms (AVX2, see avx2_broadcast_rr).
let ExeDomain = SSEPackedSingle in {
  def VBROADCASTSSrr  : avx2_broadcast_rr<0x18, "vbroadcastss", VR128,
                                          v4f32, v4f32, WriteFShuffle>;
  def VBROADCASTSSYrr : avx2_broadcast_rr<0x18, "vbroadcastss", VR256,
                                          v8f32, v4f32, WriteFShuffle256>, VEX_L;
}
let ExeDomain = SSEPackedDouble in
def VBROADCASTSDYrr  : avx2_broadcast_rr<0x19, "vbroadcastsd", VR256,
                                         v4f64, v2f64, WriteFShuffle256>, VEX_L;
7841
// 128-bit lane broadcasts into a 256-bit register.  vbroadcasti128
// (AVX2) has no pattern here; vbroadcastf128 is selected via its
// intrinsic (the _ps_256 flavor is mapped by the Pat below).
let mayLoad = 1, Predicates = [HasAVX2] in
def VBROADCASTI128 : AVX8I<0x5A, MRMSrcMem, (outs VR256:$dst),
                           (ins i128mem:$src),
                           "vbroadcasti128\t{$src, $dst|$dst, $src}", []>,
                           Sched<[WriteLoad]>, VEX, VEX_L;

def VBROADCASTF128 : AVX8I<0x1A, MRMSrcMem, (outs VR256:$dst),
                           (ins f128mem:$src),
                           "vbroadcastf128\t{$src, $dst|$dst, $src}",
                           [(set VR256:$dst,
                              (int_x86_avx_vbroadcastf128_pd_256 addr:$src))]>,
                           Sched<[WriteFShuffleLd]>, VEX, VEX_L;

// The single-precision intrinsic uses the same instruction.
let Predicates = [HasAVX] in
def : Pat<(int_x86_avx_vbroadcastf128_ps_256 addr:$src),
          (VBROADCASTF128 addr:$src)>;
7858
7859
7860//===----------------------------------------------------------------------===//
7861// VINSERTF128 - Insert packed floating-point values
7862//
// vinsertf128: insert a 128-bit operand into half of a 256-bit register,
// selected by the immediate.  No patterns on the instructions themselves;
// selection happens via the vinsert128_insert Pats below.
let hasSideEffects = 0, ExeDomain = SSEPackedSingle in {
def VINSERTF128rr : AVXAIi8<0x18, MRMSrcReg, (outs VR256:$dst),
          (ins VR256:$src1, VR128:$src2, u8imm:$src3),
          "vinsertf128\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
          []>, Sched<[WriteFShuffle]>, VEX_4V, VEX_L;
let mayLoad = 1 in
def VINSERTF128rm : AVXAIi8<0x18, MRMSrcMem, (outs VR256:$dst),
          (ins VR256:$src1, f128mem:$src2, u8imm:$src3),
          "vinsertf128\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
          []>, Sched<[WriteFShuffleLd, ReadAfterLd]>, VEX_4V, VEX_L;
}
7874
// Select vinsertf128 for floating-point subvector inserts (register and
// folded-load forms).  The helper INSERT_get_vinsert128_imm converts the
// insert index into the instruction immediate.
let Predicates = [HasAVX, NoVLX] in {
def : Pat<(vinsert128_insert:$ins (v8f32 VR256:$src1), (v4f32 VR128:$src2),
                                   (iPTR imm)),
          (VINSERTF128rr VR256:$src1, VR128:$src2,
                         (INSERT_get_vinsert128_imm VR256:$ins))>;
def : Pat<(vinsert128_insert:$ins (v4f64 VR256:$src1), (v2f64 VR128:$src2),
                                   (iPTR imm)),
          (VINSERTF128rr VR256:$src1, VR128:$src2,
                         (INSERT_get_vinsert128_imm VR256:$ins))>;

def : Pat<(vinsert128_insert:$ins (v8f32 VR256:$src1), (loadv4f32 addr:$src2),
                                   (iPTR imm)),
          (VINSERTF128rm VR256:$src1, addr:$src2,
                         (INSERT_get_vinsert128_imm VR256:$ins))>;
def : Pat<(vinsert128_insert:$ins (v4f64 VR256:$src1), (loadv2f64 addr:$src2),
                                   (iPTR imm)),
          (VINSERTF128rm VR256:$src1, addr:$src2,
                         (INSERT_get_vinsert128_imm VR256:$ins))>;
}
7894
// Without AVX2 there is no vinserti128, so integer subvector inserts are
// also lowered to vinsertf128 (HasAVX1Only).  Memory operands are loaded
// as v2i64 and bitcast to the element type where needed.
let Predicates = [HasAVX1Only] in {
def : Pat<(vinsert128_insert:$ins (v4i64 VR256:$src1), (v2i64 VR128:$src2),
                                   (iPTR imm)),
          (VINSERTF128rr VR256:$src1, VR128:$src2,
                         (INSERT_get_vinsert128_imm VR256:$ins))>;
def : Pat<(vinsert128_insert:$ins (v8i32 VR256:$src1), (v4i32 VR128:$src2),
                                   (iPTR imm)),
          (VINSERTF128rr VR256:$src1, VR128:$src2,
                         (INSERT_get_vinsert128_imm VR256:$ins))>;
def : Pat<(vinsert128_insert:$ins (v32i8 VR256:$src1), (v16i8 VR128:$src2),
                                   (iPTR imm)),
          (VINSERTF128rr VR256:$src1, VR128:$src2,
                         (INSERT_get_vinsert128_imm VR256:$ins))>;
def : Pat<(vinsert128_insert:$ins (v16i16 VR256:$src1), (v8i16 VR128:$src2),
                                   (iPTR imm)),
          (VINSERTF128rr VR256:$src1, VR128:$src2,
                         (INSERT_get_vinsert128_imm VR256:$ins))>;

def : Pat<(vinsert128_insert:$ins (v4i64 VR256:$src1), (loadv2i64 addr:$src2),
                                   (iPTR imm)),
          (VINSERTF128rm VR256:$src1, addr:$src2,
                         (INSERT_get_vinsert128_imm VR256:$ins))>;
def : Pat<(vinsert128_insert:$ins (v8i32 VR256:$src1),
                                   (bc_v4i32 (loadv2i64 addr:$src2)),
                                   (iPTR imm)),
          (VINSERTF128rm VR256:$src1, addr:$src2,
                         (INSERT_get_vinsert128_imm VR256:$ins))>;
def : Pat<(vinsert128_insert:$ins (v32i8 VR256:$src1),
                                   (bc_v16i8 (loadv2i64 addr:$src2)),
                                   (iPTR imm)),
          (VINSERTF128rm VR256:$src1, addr:$src2,
                         (INSERT_get_vinsert128_imm VR256:$ins))>;
def : Pat<(vinsert128_insert:$ins (v16i16 VR256:$src1),
                                   (bc_v8i16 (loadv2i64 addr:$src2)),
                                   (iPTR imm)),
          (VINSERTF128rm VR256:$src1, addr:$src2,
                         (INSERT_get_vinsert128_imm VR256:$ins))>;
}
7933
7934//===----------------------------------------------------------------------===//
7935// VEXTRACTF128 - Extract packed floating-point values
7936//
// vextractf128: extract a 128-bit half of a 256-bit register, selected
// by the immediate, to a register or directly to memory.  No patterns on
// the instructions; selection happens via the Pats below.
let hasSideEffects = 0, ExeDomain = SSEPackedSingle in {
def VEXTRACTF128rr : AVXAIi8<0x19, MRMDestReg, (outs VR128:$dst),
          (ins VR256:$src1, u8imm:$src2),
          "vextractf128\t{$src2, $src1, $dst|$dst, $src1, $src2}",
          []>, Sched<[WriteFShuffle]>, VEX, VEX_L;
let mayStore = 1 in
def VEXTRACTF128mr : AVXAIi8<0x19, MRMDestMem, (outs),
          (ins f128mem:$dst, VR256:$src1, u8imm:$src2),
          "vextractf128\t{$src2, $src1, $dst|$dst, $src1, $src2}",
          []>, Sched<[WriteStore]>, VEX, VEX_L;
}
7948
// AVX1 patterns
// Select vextractf128 for floating-point subvector extracts; the helper
// EXTRACT_get_vextract128_imm converts the extract index to the
// instruction immediate.  The mr forms fold the following store.
let Predicates = [HasAVX] in {
def : Pat<(vextract128_extract:$ext VR256:$src1, (iPTR imm)),
          (v4f32 (VEXTRACTF128rr
                    (v8f32 VR256:$src1),
                    (EXTRACT_get_vextract128_imm VR128:$ext)))>;
def : Pat<(vextract128_extract:$ext VR256:$src1, (iPTR imm)),
          (v2f64 (VEXTRACTF128rr
                    (v4f64 VR256:$src1),
                    (EXTRACT_get_vextract128_imm VR128:$ext)))>;

def : Pat<(store (v4f32 (vextract128_extract:$ext (v8f32 VR256:$src1),
                         (iPTR imm))), addr:$dst),
          (VEXTRACTF128mr addr:$dst, VR256:$src1,
           (EXTRACT_get_vextract128_imm VR128:$ext))>;
def : Pat<(store (v2f64 (vextract128_extract:$ext (v4f64 VR256:$src1),
                         (iPTR imm))), addr:$dst),
          (VEXTRACTF128mr addr:$dst, VR256:$src1,
           (EXTRACT_get_vextract128_imm VR128:$ext))>;
}
7969
// Without AVX2 there is no vextracti128, so integer subvector extracts
// also go through vextractf128.  Note the store-folding patterns here
// require an aligned store (alignedstore), unlike the FP ones above.
let Predicates = [HasAVX1Only] in {
def : Pat<(vextract128_extract:$ext VR256:$src1, (iPTR imm)),
          (v2i64 (VEXTRACTF128rr
                  (v4i64 VR256:$src1),
                  (EXTRACT_get_vextract128_imm VR128:$ext)))>;
def : Pat<(vextract128_extract:$ext VR256:$src1, (iPTR imm)),
          (v4i32 (VEXTRACTF128rr
                  (v8i32 VR256:$src1),
                  (EXTRACT_get_vextract128_imm VR128:$ext)))>;
def : Pat<(vextract128_extract:$ext VR256:$src1, (iPTR imm)),
          (v8i16 (VEXTRACTF128rr
                  (v16i16 VR256:$src1),
                  (EXTRACT_get_vextract128_imm VR128:$ext)))>;
def : Pat<(vextract128_extract:$ext VR256:$src1, (iPTR imm)),
          (v16i8 (VEXTRACTF128rr
                  (v32i8 VR256:$src1),
                  (EXTRACT_get_vextract128_imm VR128:$ext)))>;

def : Pat<(alignedstore (v2i64 (vextract128_extract:$ext (v4i64 VR256:$src1),
                                (iPTR imm))), addr:$dst),
          (VEXTRACTF128mr addr:$dst, VR256:$src1,
           (EXTRACT_get_vextract128_imm VR128:$ext))>;
def : Pat<(alignedstore (v4i32 (vextract128_extract:$ext (v8i32 VR256:$src1),
                                (iPTR imm))), addr:$dst),
          (VEXTRACTF128mr addr:$dst, VR256:$src1,
           (EXTRACT_get_vextract128_imm VR128:$ext))>;
def : Pat<(alignedstore (v8i16 (vextract128_extract:$ext (v16i16 VR256:$src1),
                                (iPTR imm))), addr:$dst),
          (VEXTRACTF128mr addr:$dst, VR256:$src1,
           (EXTRACT_get_vextract128_imm VR128:$ext))>;
def : Pat<(alignedstore (v16i8 (vextract128_extract:$ext (v32i8 VR256:$src1),
                                (iPTR imm))), addr:$dst),
          (VEXTRACTF128mr addr:$dst, VR256:$src1,
           (EXTRACT_get_vextract128_imm VR128:$ext))>;
}
8005
8006//===----------------------------------------------------------------------===//
8007// VMASKMOV - Conditional SIMD Packed Loads and Stores
8008//
// avx_movmask_rm - Builds the four forms of an AVX conditional (masked)
// packed load/store instruction:
//   rm  - 128-bit masked load  (VR128 <- mem), mask in $src1
//   Yrm - 256-bit masked load  (VR256 <- mem), mask in $src1
//   mr  - 128-bit masked store (mem <- $src2), mask in $src1
//   Ymr - 256-bit masked store (mem <- $src2), mask in $src1
// Selection goes exclusively through the supplied maskload/maskstore
// intrinsics. NOTE(review): no Sched/itinerary is attached to these forms —
// confirm whether that is intentional.
8009multiclass avx_movmask_rm<bits<8> opc_rm, bits<8> opc_mr, string OpcodeStr,
8010                          Intrinsic IntLd, Intrinsic IntLd256,
8011                          Intrinsic IntSt, Intrinsic IntSt256> {
8012  def rm  : AVX8I<opc_rm, MRMSrcMem, (outs VR128:$dst),
8013             (ins VR128:$src1, f128mem:$src2),
8014             !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
8015             [(set VR128:$dst, (IntLd addr:$src2, VR128:$src1))]>,
8016             VEX_4V;
8017  def Yrm : AVX8I<opc_rm, MRMSrcMem, (outs VR256:$dst),
8018             (ins VR256:$src1, f256mem:$src2),
8019             !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
8020             [(set VR256:$dst, (IntLd256 addr:$src2, VR256:$src1))]>,
8021             VEX_4V, VEX_L;
8022  def mr  : AVX8I<opc_mr, MRMDestMem, (outs),
8023             (ins f128mem:$dst, VR128:$src1, VR128:$src2),
8024             !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
8025             [(IntSt addr:$dst, VR128:$src1, VR128:$src2)]>, VEX_4V;
8026  def Ymr : AVX8I<opc_mr, MRMDestMem, (outs),
8027             (ins f256mem:$dst, VR256:$src1, VR256:$src2),
8028             !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
8029             [(IntSt256 addr:$dst, VR256:$src1, VR256:$src2)]>, VEX_4V, VEX_L;
8030}
8031
// Instantiate VMASKMOVPS/VMASKMOVPD (plus their 256-bit Y forms) in the
// single- and double-precision execution domains respectively.
8032let ExeDomain = SSEPackedSingle in
8033defm VMASKMOVPS : avx_movmask_rm<0x2C, 0x2E, "vmaskmovps",
8034                                 int_x86_avx_maskload_ps,
8035                                 int_x86_avx_maskload_ps_256,
8036                                 int_x86_avx_maskstore_ps,
8037                                 int_x86_avx_maskstore_ps_256>;
8038let ExeDomain = SSEPackedDouble in
8039defm VMASKMOVPD : avx_movmask_rm<0x2D, 0x2F, "vmaskmovpd",
8040                                 int_x86_avx_maskload_pd,
8041                                 int_x86_avx_maskload_pd_256,
8042                                 int_x86_avx_maskstore_pd,
8043                                 int_x86_avx_maskstore_pd_256>;
8044
8045//===----------------------------------------------------------------------===//
8046// VPERMIL - Permute Single and Double Floating-Point Values
8047//
// avx_permil - VPERMILPS/PD for one vector width (RC):
//   rr/rm - variable-control forms, selected via the IntVar intrinsic; the
//           memory form loads an integer control vector (i_frag + bitconvert).
//   ri/mi - immediate-control forms, selected via the X86VPermilpi node; the
//           memory form permutes a loaded FP vector with an 8-bit immediate.
// The immediate forms are guarded by [HasAVX, NoVLX], i.e. they are only
// used when AVX-512VL is not available.
8048multiclass avx_permil<bits<8> opc_rm, bits<8> opc_rmi, string OpcodeStr,
8049                      RegisterClass RC, X86MemOperand x86memop_f,
8050                      X86MemOperand x86memop_i, PatFrag i_frag,
8051                      Intrinsic IntVar, ValueType vt> {
8052  def rr  : AVX8I<opc_rm, MRMSrcReg, (outs RC:$dst),
8053             (ins RC:$src1, RC:$src2),
8054             !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
8055             [(set RC:$dst, (IntVar RC:$src1, RC:$src2))]>, VEX_4V,
8056             Sched<[WriteFShuffle]>;
8057  def rm  : AVX8I<opc_rm, MRMSrcMem, (outs RC:$dst),
8058             (ins RC:$src1, x86memop_i:$src2),
8059             !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
8060             [(set RC:$dst, (IntVar RC:$src1,
8061                             (bitconvert (i_frag addr:$src2))))]>, VEX_4V,
8062             Sched<[WriteFShuffleLd, ReadAfterLd]>;
8063
8064  let Predicates = [HasAVX, NoVLX] in {
8065    def ri  : AVXAIi8<opc_rmi, MRMSrcReg, (outs RC:$dst),
8066             (ins RC:$src1, u8imm:$src2),
8067             !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
8068             [(set RC:$dst, (vt (X86VPermilpi RC:$src1, (i8 imm:$src2))))]>, VEX,
8069             Sched<[WriteFShuffle]>;
8070    def mi  : AVXAIi8<opc_rmi, MRMSrcMem, (outs RC:$dst),
8071             (ins x86memop_f:$src1, u8imm:$src2),
8072             !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
8073             [(set RC:$dst,
8074               (vt (X86VPermilpi (load addr:$src1), (i8 imm:$src2))))]>, VEX,
8075             Sched<[WriteFShuffleLd]>;
8076  }// Predicates = [HasAVX, NoVLX]
8077}
8078
// Instantiate VPERMILPS/PD at 128-bit and 256-bit (Y) widths. Integer memory
// operands are matched as loadv2i64/loadv4i64 and bitconverted in the
// multiclass above.
8079let ExeDomain = SSEPackedSingle in {
8080  defm VPERMILPS  : avx_permil<0x0C, 0x04, "vpermilps", VR128, f128mem, i128mem,
8081                               loadv2i64, int_x86_avx_vpermilvar_ps, v4f32>;
8082  defm VPERMILPSY : avx_permil<0x0C, 0x04, "vpermilps", VR256, f256mem, i256mem,
8083                       loadv4i64, int_x86_avx_vpermilvar_ps_256, v8f32>, VEX_L;
8084}
8085let ExeDomain = SSEPackedDouble in {
8086  defm VPERMILPD  : avx_permil<0x0D, 0x05, "vpermilpd", VR128, f128mem, i128mem,
8087                               loadv2i64, int_x86_avx_vpermilvar_pd, v2f64>;
8088  defm VPERMILPDY : avx_permil<0x0D, 0x05, "vpermilpd", VR256, f256mem, i256mem,
8089                       loadv4i64, int_x86_avx_vpermilvar_pd_256, v4f64>, VEX_L;
8090}
8091
// Select the X86VPermilpv (variable control) and X86VPermilpi (immediate
// control) DAG nodes onto the VPERMILPS/PD forms defined above, including
// integer-typed results mapped onto the FP instructions. Guarded by NoVLX so
// these patterns yield to AVX-512VL variants when available.
8092let Predicates = [HasAVX, NoVLX] in {
8093def : Pat<(v8f32 (X86VPermilpv VR256:$src1, (v8i32 VR256:$src2))),
8094          (VPERMILPSYrr VR256:$src1, VR256:$src2)>;
8095def : Pat<(v8f32 (X86VPermilpv VR256:$src1, (bc_v8i32 (loadv4i64 addr:$src2)))),
8096          (VPERMILPSYrm VR256:$src1, addr:$src2)>;
8097def : Pat<(v4f64 (X86VPermilpv VR256:$src1, (v4i64 VR256:$src2))),
8098          (VPERMILPDYrr VR256:$src1, VR256:$src2)>;
8099def : Pat<(v4f64 (X86VPermilpv VR256:$src1, (loadv4i64 addr:$src2))),
8100          (VPERMILPDYrm VR256:$src1, addr:$src2)>;
8101
8102def : Pat<(v8i32 (X86VPermilpi VR256:$src1, (i8 imm:$imm))),
8103          (VPERMILPSYri VR256:$src1, imm:$imm)>;
8104def : Pat<(v4i64 (X86VPermilpi VR256:$src1, (i8 imm:$imm))),
8105          (VPERMILPDYri VR256:$src1, imm:$imm)>;
8106def : Pat<(v8i32 (X86VPermilpi (bc_v8i32 (loadv4i64 addr:$src1)),
8107                               (i8 imm:$imm))),
8108          (VPERMILPSYmi addr:$src1, imm:$imm)>;
8109def : Pat<(v4i64 (X86VPermilpi (loadv4i64 addr:$src1), (i8 imm:$imm))),
8110          (VPERMILPDYmi addr:$src1, imm:$imm)>;
8111
8112def : Pat<(v4f32 (X86VPermilpv VR128:$src1, (v4i32 VR128:$src2))),
8113          (VPERMILPSrr VR128:$src1, VR128:$src2)>;
8114def : Pat<(v4f32 (X86VPermilpv VR128:$src1, (bc_v4i32 (loadv2i64 addr:$src2)))),
8115          (VPERMILPSrm VR128:$src1, addr:$src2)>;
8116def : Pat<(v2f64 (X86VPermilpv VR128:$src1, (v2i64 VR128:$src2))),
8117          (VPERMILPDrr VR128:$src1, VR128:$src2)>;
8118def : Pat<(v2f64 (X86VPermilpv VR128:$src1, (loadv2i64 addr:$src2))),
8119          (VPERMILPDrm VR128:$src1, addr:$src2)>;
8120
8121def : Pat<(v2i64 (X86VPermilpi VR128:$src1, (i8 imm:$imm))),
8122          (VPERMILPDri VR128:$src1, imm:$imm)>;
8123def : Pat<(v2i64 (X86VPermilpi (loadv2i64 addr:$src1), (i8 imm:$imm))),
8124          (VPERMILPDmi addr:$src1, imm:$imm)>;
8125}
8126
8127//===----------------------------------------------------------------------===//
8128// VPERM2F128 - Permute Floating-Point Values in 128-bit chunks
8129//
// VPERM2F128 register and memory forms. Both select the X86VPerm2x128 node
// for v8f32 operands; other element types are handled by the explicit
// patterns below.
8130let ExeDomain = SSEPackedSingle in {
8131def VPERM2F128rr : AVXAIi8<0x06, MRMSrcReg, (outs VR256:$dst),
8132          (ins VR256:$src1, VR256:$src2, u8imm:$src3),
8133          "vperm2f128\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
8134          [(set VR256:$dst, (v8f32 (X86VPerm2x128 VR256:$src1, VR256:$src2,
8135                              (i8 imm:$src3))))]>, VEX_4V, VEX_L,
8136          Sched<[WriteFShuffle]>;
8137def VPERM2F128rm : AVXAIi8<0x06, MRMSrcMem, (outs VR256:$dst),
8138          (ins VR256:$src1, f256mem:$src2, u8imm:$src3),
8139          "vperm2f128\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
8140          [(set VR256:$dst, (X86VPerm2x128 VR256:$src1, (loadv8f32 addr:$src2),
8141                             (i8 imm:$src3)))]>, VEX_4V, VEX_L,
8142          Sched<[WriteFShuffleLd, ReadAfterLd]>;
8143}
8144
// v4f64 X86VPerm2x128 also maps to VPERM2F128 (the instruction defs above
// only cover v8f32 inline).
8145let Predicates = [HasAVX] in {
8146def : Pat<(v4f64 (X86VPerm2x128 VR256:$src1, VR256:$src2, (i8 imm:$imm))),
8147          (VPERM2F128rr VR256:$src1, VR256:$src2, imm:$imm)>;
8148def : Pat<(v4f64 (X86VPerm2x128 VR256:$src1,
8149                  (loadv4f64 addr:$src2), (i8 imm:$imm))),
8150          (VPERM2F128rm VR256:$src1, addr:$src2, imm:$imm)>;
8151}
8152
// On AVX1-only targets there is no VPERM2I128, so integer 2x128 permutes are
// lowered to the FP VPERM2F128 instead. Memory operands come in as v4i64
// loads and are bitconverted to the element type being matched.
8153let Predicates = [HasAVX1Only] in {
8154def : Pat<(v8i32 (X86VPerm2x128 VR256:$src1, VR256:$src2, (i8 imm:$imm))),
8155          (VPERM2F128rr VR256:$src1, VR256:$src2, imm:$imm)>;
8156def : Pat<(v4i64 (X86VPerm2x128 VR256:$src1, VR256:$src2, (i8 imm:$imm))),
8157          (VPERM2F128rr VR256:$src1, VR256:$src2, imm:$imm)>;
8158def : Pat<(v32i8 (X86VPerm2x128 VR256:$src1, VR256:$src2, (i8 imm:$imm))),
8159          (VPERM2F128rr VR256:$src1, VR256:$src2, imm:$imm)>;
8160def : Pat<(v16i16 (X86VPerm2x128 VR256:$src1, VR256:$src2, (i8 imm:$imm))),
8161          (VPERM2F128rr VR256:$src1, VR256:$src2, imm:$imm)>;
8162
8163def : Pat<(v8i32 (X86VPerm2x128 VR256:$src1,
8164                  (bc_v8i32 (loadv4i64 addr:$src2)), (i8 imm:$imm))),
8165          (VPERM2F128rm VR256:$src1, addr:$src2, imm:$imm)>;
8166def : Pat<(v4i64 (X86VPerm2x128 VR256:$src1,
8167                  (loadv4i64 addr:$src2), (i8 imm:$imm))),
8168          (VPERM2F128rm VR256:$src1, addr:$src2, imm:$imm)>;
8169def : Pat<(v32i8 (X86VPerm2x128 VR256:$src1,
8170                  (bc_v32i8 (loadv4i64 addr:$src2)), (i8 imm:$imm))),
8171          (VPERM2F128rm VR256:$src1, addr:$src2, imm:$imm)>;
8172def : Pat<(v16i16 (X86VPerm2x128 VR256:$src1,
8173                  (bc_v16i16 (loadv4i64 addr:$src2)), (i8 imm:$imm))),
8174          (VPERM2F128rm VR256:$src1, addr:$src2, imm:$imm)>;
8175}
8176
8177//===----------------------------------------------------------------------===//
8178// VZERO - Zero YMM registers
8179//
// VZEROALL / VZEROUPPER. Both are modeled as clobbering all 16 YMM registers
// (VZEROUPPER only changes the upper halves architecturally, but the full
// registers are listed as Defs here).
8180let Defs = [YMM0, YMM1, YMM2, YMM3, YMM4, YMM5, YMM6, YMM7,
8181            YMM8, YMM9, YMM10, YMM11, YMM12, YMM13, YMM14, YMM15] in {
8182  // Zero All YMM registers
8183  def VZEROALL : I<0x77, RawFrm, (outs), (ins), "vzeroall",
8184                  [(int_x86_avx_vzeroall)]>, PS, VEX, VEX_L, Requires<[HasAVX]>;
8185
8186  // Zero Upper bits of YMM registers
8187  def VZEROUPPER : I<0x77, RawFrm, (outs), (ins), "vzeroupper",
8188                     [(int_x86_avx_vzeroupper)]>, PS, VEX, Requires<[HasAVX]>;
8189}
8190
8191//===----------------------------------------------------------------------===//
8192// Half precision conversion instructions
8193//===----------------------------------------------------------------------===//
// f16c_ph2ps - VCVTPH2PS (half -> single) for one destination width.
// The register form selects via the given intrinsic; the memory form carries
// no pattern (hasSideEffects = 0, mayLoad = 1) and is instead reached through
// explicit Pats under [HasF16C] further below.
8194multiclass f16c_ph2ps<RegisterClass RC, X86MemOperand x86memop, Intrinsic Int> {
8195  def rr : I<0x13, MRMSrcReg, (outs RC:$dst), (ins VR128:$src),
8196             "vcvtph2ps\t{$src, $dst|$dst, $src}",
8197             [(set RC:$dst, (Int VR128:$src))]>,
8198             T8PD, VEX, Sched<[WriteCvtF2F]>;
8199  let hasSideEffects = 0, mayLoad = 1 in
8200  def rm : I<0x13, MRMSrcMem, (outs RC:$dst), (ins x86memop:$src),
8201             "vcvtph2ps\t{$src, $dst|$dst, $src}", []>, T8PD, VEX,
8202             Sched<[WriteCvtF2FLd]>;
8203}
8204
// f16c_ps2ph - VCVTPS2PH (single -> half) for one source width. $src2 is the
// 8-bit rounding-control immediate (i32u8imm). The store form carries no
// pattern (hasSideEffects = 0, mayStore = 1) and is reached through explicit
// Pats under [HasF16C] below.
// NOTE(review): the store form's SchedRW uses WriteCvtF2FLd (a load class)
// plus WriteRMW — confirm this is the intended scheduling model for a store.
8205multiclass f16c_ps2ph<RegisterClass RC, X86MemOperand x86memop, Intrinsic Int> {
8206  def rr : Ii8<0x1D, MRMDestReg, (outs VR128:$dst),
8207               (ins RC:$src1, i32u8imm:$src2),
8208               "vcvtps2ph\t{$src2, $src1, $dst|$dst, $src1, $src2}",
8209               [(set VR128:$dst, (Int RC:$src1, imm:$src2))]>,
8210               TAPD, VEX, Sched<[WriteCvtF2F]>;
8211  let hasSideEffects = 0, mayStore = 1,
8212      SchedRW = [WriteCvtF2FLd, WriteRMW] in
8213  def mr : Ii8<0x1D, MRMDestMem, (outs),
8214               (ins x86memop:$dst, RC:$src1, i32u8imm:$src2),
8215               "vcvtps2ph\t{$src2, $src1, $dst|$dst, $src1, $src2}", []>,
8216               TAPD, VEX;
8217}
8218
// Instantiate the F16C conversions and add load/store folding patterns for
// the pattern-less memory forms defined in the multiclasses above.
8219let Predicates = [HasF16C] in {
8220  defm VCVTPH2PS  : f16c_ph2ps<VR128, f64mem, int_x86_vcvtph2ps_128>;
8221  defm VCVTPH2PSY : f16c_ph2ps<VR256, f128mem, int_x86_vcvtph2ps_256>, VEX_L;
8222  defm VCVTPS2PH  : f16c_ps2ph<VR128, f64mem, int_x86_vcvtps2ph_128>;
8223  defm VCVTPS2PHY : f16c_ps2ph<VR256, f128mem, int_x86_vcvtps2ph_256>, VEX_L;
8224
8225  // Pattern match vcvtph2ps of a scalar i64 load.
8226  def : Pat<(int_x86_vcvtph2ps_128 (vzmovl_v2i64 addr:$src)),
8227            (VCVTPH2PSrm addr:$src)>;
8228  def : Pat<(int_x86_vcvtph2ps_128 (vzload_v2i64 addr:$src)),
8229            (VCVTPH2PSrm addr:$src)>;
8230
  // Fold "convert, extract low 64 bits, store" into the store form; both the
  // f64 and i64 extract flavors are covered.
8231  def : Pat<(store (f64 (extractelt (bc_v2f64 (v8i16
8232                  (int_x86_vcvtps2ph_128 VR128:$src1, i32:$src2))), (iPTR 0))),
8233                   addr:$dst),
8234                   (VCVTPS2PHmr addr:$dst, VR128:$src1, imm:$src2)>;
8235  def : Pat<(store (i64 (extractelt (bc_v2i64 (v8i16
8236                  (int_x86_vcvtps2ph_128 VR128:$src1, i32:$src2))), (iPTR 0))),
8237                   addr:$dst),
8238                   (VCVTPS2PHmr addr:$dst, VR128:$src1, imm:$src2)>;
8239  def : Pat<(store (v8i16 (int_x86_vcvtps2ph_256 VR256:$src1, i32:$src2)),
8240                   addr:$dst),
8241                   (VCVTPS2PHYmr addr:$dst, VR256:$src1, imm:$src2)>;
8242}
8243
// Patterns for matching conversions from float to half-float and vice versa.
// fp_to_f16 converts through XMM and extracts the low 16 bits; f16_to_fp
// sign-extends the GR16 input into a GR32 before moving it into an XMM
// register. The round-trip pattern collapses fp_to_f16 + f16_to_fp into a
// convert/convert pair that stays in vector registers.
8245let Predicates = [HasF16C] in {
8246  def : Pat<(fp_to_f16 FR32:$src),
8247            (i16 (EXTRACT_SUBREG (VMOVPDI2DIrr (VCVTPS2PHrr
8248              (COPY_TO_REGCLASS FR32:$src, VR128), 0)), sub_16bit))>;
8249
8250  def : Pat<(f16_to_fp GR16:$src),
8251            (f32 (COPY_TO_REGCLASS (VCVTPH2PSrr
8252              (COPY_TO_REGCLASS (MOVSX32rr16 GR16:$src), VR128)), FR32)) >;
8253
8254  def : Pat<(f16_to_fp (i16 (fp_to_f16 FR32:$src))),
8255            (f32 (COPY_TO_REGCLASS (VCVTPH2PSrr
8256              (VCVTPS2PHrr (COPY_TO_REGCLASS FR32:$src, VR128), 0)), FR32)) >;
8257}
8258
8259//===----------------------------------------------------------------------===//
8260// AVX2 Instructions
8261//===----------------------------------------------------------------------===//
8262
/// AVX2_binop_rmi - AVX2 binary operator with 8-bit immediate.
/// Builds the reg/reg/imm (rri, commutable) and reg/mem/imm (rmi) forms; the
/// memory operand is loaded via memop_frag and bitconverted to OpVT before
/// feeding OpNode.
8264multiclass AVX2_binop_rmi<bits<8> opc, string OpcodeStr, SDNode OpNode,
8265                          ValueType OpVT, RegisterClass RC, PatFrag memop_frag,
8266                          X86MemOperand x86memop> {
8267  let isCommutable = 1 in
8268  def rri : AVX2AIi8<opc, MRMSrcReg, (outs RC:$dst),
8269        (ins RC:$src1, RC:$src2, u8imm:$src3),
8270        !strconcat(OpcodeStr,
8271            "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
8272        [(set RC:$dst, (OpVT (OpNode RC:$src1, RC:$src2, imm:$src3)))]>,
8273        Sched<[WriteBlend]>, VEX_4V;
8274  def rmi : AVX2AIi8<opc, MRMSrcMem, (outs RC:$dst),
8275        (ins RC:$src1, x86memop:$src2, u8imm:$src3),
8276        !strconcat(OpcodeStr,
8277            "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
8278        [(set RC:$dst,
8279          (OpVT (OpNode RC:$src1,
8280           (bitconvert (memop_frag addr:$src2)), imm:$src3)))]>,
8281        Sched<[WriteBlendLd, ReadAfterLd]>, VEX_4V;
8282}
8283
// VPBLENDD: dword blend with immediate control, 128- and 256-bit forms.
8284defm VPBLENDD : AVX2_binop_rmi<0x02, "vpblendd", X86Blendi, v4i32,
8285                               VR128, loadv2i64, i128mem>;
8286defm VPBLENDDY : AVX2_binop_rmi<0x02, "vpblendd", X86Blendi, v8i32,
8287                                VR256, loadv4i64, i256mem>, VEX_L;
8288
8289//===----------------------------------------------------------------------===//
8290// VPBROADCAST - Load from memory and broadcast to all elements of the
8291//               destination operand
8292//
// avx2_broadcast - VPBROADCASTB/W/D/Q for one element type:
//   rr/Yrr - broadcast the low element of an XMM register to 128/256 bits.
//   rm/Yrm - broadcast a scalar loaded via ld_frag.
// All forms select the X86VBroadcast node; the trailing Pat lets a 256-bit
// source be broadcast by implicitly extracting its low 128 bits first.
// The whole multiclass is guarded by [HasAVX2, prd].
8293multiclass avx2_broadcast<bits<8> opc, string OpcodeStr,
8294                          X86MemOperand x86memop, PatFrag ld_frag,
8295                          ValueType OpVT128, ValueType OpVT256, Predicate prd> {
8296  let Predicates = [HasAVX2, prd] in {
8297    def rr : AVX28I<opc, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
8298                  !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
8299                  [(set VR128:$dst,
8300                   (OpVT128 (X86VBroadcast (OpVT128 VR128:$src))))]>,
8301                  Sched<[WriteShuffle]>, VEX;
8302    def rm : AVX28I<opc, MRMSrcMem, (outs VR128:$dst), (ins x86memop:$src),
8303                  !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
8304                  [(set VR128:$dst,
8305                   (OpVT128 (X86VBroadcast (ld_frag addr:$src))))]>,
8306                  Sched<[WriteLoad]>, VEX;
8307    def Yrr : AVX28I<opc, MRMSrcReg, (outs VR256:$dst), (ins VR128:$src),
8308                   !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
8309                   [(set VR256:$dst,
8310                    (OpVT256 (X86VBroadcast (OpVT128 VR128:$src))))]>,
8311                   Sched<[WriteShuffle256]>, VEX, VEX_L;
8312    def Yrm : AVX28I<opc, MRMSrcMem, (outs VR256:$dst), (ins x86memop:$src),
8313                   !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
8314                   [(set VR256:$dst,
8315                    (OpVT256 (X86VBroadcast (ld_frag addr:$src))))]>,
8316                   Sched<[WriteLoad]>, VEX, VEX_L;
8317
8318    // Provide aliases for broadcast from the same register class that
8319    // automatically does the extract.
8320    def : Pat<(OpVT256 (X86VBroadcast (OpVT256 VR256:$src))),
8321              (!cast<Instruction>(NAME#"Yrr")
8322                  (OpVT128 (EXTRACT_SUBREG (OpVT256 VR256:$src),sub_xmm)))>;
8323  }
8324}
8325
// Instantiate byte/word/dword/qword broadcasts. B/W forms additionally
// require that AVX-512BW+VL not be available (NoVLX_Or_NoBWI); D/Q only
// require NoVLX.
8326defm VPBROADCASTB  : avx2_broadcast<0x78, "vpbroadcastb", i8mem, loadi8,
8327                                    v16i8, v32i8, NoVLX_Or_NoBWI>;
8328defm VPBROADCASTW  : avx2_broadcast<0x79, "vpbroadcastw", i16mem, loadi16,
8329                                    v8i16, v16i16, NoVLX_Or_NoBWI>;
8330defm VPBROADCASTD  : avx2_broadcast<0x58, "vpbroadcastd", i32mem, loadi32,
8331                                    v4i32, v8i32, NoVLX>;
8332defm VPBROADCASTQ  : avx2_broadcast<0x59, "vpbroadcastq", i64mem, loadi64,
8333                                    v2i64, v4i64, NoVLX>;
8334
// Extra AVX2 broadcast patterns: truncated-i32-load word broadcasts,
// 256-bit same-class broadcast aliases, and register-source fallbacks for
// when the scalar value lives in a GPR/FPR rather than behind a foldable
// load.
8335let Predicates = [HasAVX2] in {
8336  // loadi16 is tricky to fold, because !isTypeDesirableForOp, justifiably.
8337  // This means we'll encounter truncated i32 loads; match that here.
8338  def : Pat<(v8i16 (X86VBroadcast (i16 (trunc (i32 (load addr:$src)))))),
8339            (VPBROADCASTWrm addr:$src)>;
8340  def : Pat<(v16i16 (X86VBroadcast (i16 (trunc (i32 (load addr:$src)))))),
8341            (VPBROADCASTWYrm addr:$src)>;
8342  def : Pat<(v8i16 (X86VBroadcast
8343              (i16 (trunc (i32 (zextloadi16 addr:$src)))))),
8344            (VPBROADCASTWrm addr:$src)>;
8345  def : Pat<(v16i16 (X86VBroadcast
8346              (i16 (trunc (i32 (zextloadi16 addr:$src)))))),
8347            (VPBROADCASTWYrm addr:$src)>;
8348
8349  // Provide aliases for broadcast from the same register class that
8350  // automatically does the extract.
8351  def : Pat<(v8f32 (X86VBroadcast (v8f32 VR256:$src))),
8352            (VBROADCASTSSYrr (v4f32 (EXTRACT_SUBREG (v8f32 VR256:$src),
8353                                                    sub_xmm)))>;
8354  def : Pat<(v4f64 (X86VBroadcast (v4f64 VR256:$src))),
8355            (VBROADCASTSDYrr (v2f64 (EXTRACT_SUBREG (v4f64 VR256:$src),
8356                                                    sub_xmm)))>;
8357
8358  // Provide fallback in case the load node that is used in the patterns above
8359  // is used by additional users, which prevents the pattern selection.
8360  let AddedComplexity = 20 in {
8361    def : Pat<(v4f32 (X86VBroadcast FR32:$src)),
8362              (VBROADCASTSSrr (COPY_TO_REGCLASS FR32:$src, VR128))>;
8363    def : Pat<(v8f32 (X86VBroadcast FR32:$src)),
8364              (VBROADCASTSSYrr (COPY_TO_REGCLASS FR32:$src, VR128))>;
8365    def : Pat<(v4f64 (X86VBroadcast FR64:$src)),
8366              (VBROADCASTSDYrr (COPY_TO_REGCLASS FR64:$src, VR128))>;
8367
8368    def : Pat<(v4i32 (X86VBroadcast GR32:$src)),
8369              (VBROADCASTSSrr (COPY_TO_REGCLASS GR32:$src, VR128))>;
8370    def : Pat<(v8i32 (X86VBroadcast GR32:$src)),
8371              (VBROADCASTSSYrr (COPY_TO_REGCLASS GR32:$src, VR128))>;
8372    def : Pat<(v4i64 (X86VBroadcast GR64:$src)),
8373              (VBROADCASTSDYrr (COPY_TO_REGCLASS GR64:$src, VR128))>;
8374
    // GR8/GR16 sources are first widened to i32 with SUBREG_TO_REG so they
    // can be moved into an XMM register.
8375    def : Pat<(v16i8 (X86VBroadcast GR8:$src)),
8376          (VPBROADCASTBrr (COPY_TO_REGCLASS
8377                           (i32 (SUBREG_TO_REG (i32 0), GR8:$src, sub_8bit)),
8378                           VR128))>;
8379    def : Pat<(v32i8 (X86VBroadcast GR8:$src)),
8380          (VPBROADCASTBYrr (COPY_TO_REGCLASS
8381                            (i32 (SUBREG_TO_REG (i32 0), GR8:$src, sub_8bit)),
8382                            VR128))>;
8383
8384    def : Pat<(v8i16 (X86VBroadcast GR16:$src)),
8385          (VPBROADCASTWrr (COPY_TO_REGCLASS
8386                           (i32 (SUBREG_TO_REG (i32 0), GR16:$src, sub_16bit)),
8387                           VR128))>;
8388    def : Pat<(v16i16 (X86VBroadcast GR16:$src)),
8389          (VPBROADCASTWYrr (COPY_TO_REGCLASS
8390                            (i32 (SUBREG_TO_REG (i32 0), GR16:$src, sub_16bit)),
8391                            VR128))>;
8392
8393    // The patterns for VPBROADCASTD are not needed because they would match
8394    // the exact same thing as VBROADCASTSS patterns.
8395
8396    def : Pat<(v2i64 (X86VBroadcast GR64:$src)),
8397          (VPBROADCASTQrr (COPY_TO_REGCLASS GR64:$src, VR128))>;
8398    // The v4i64 pattern is not needed because VBROADCASTSDYrr already match.
8399  }
8400}
8401
// AVX1 broadcast patterns: integer broadcasts from memory are lowered to the
// FP VBROADCASTSS/SD forms (AVX1 has no integer broadcast instructions).
8403let Predicates = [HasAVX1Only] in {
8404def : Pat<(v8i32 (X86VBroadcast (loadi32 addr:$src))),
8405          (VBROADCASTSSYrm addr:$src)>;
8406def : Pat<(v4i64 (X86VBroadcast (loadi64 addr:$src))),
8407          (VBROADCASTSDYrm addr:$src)>;
8408def : Pat<(v4i32 (X86VBroadcast (loadi32 addr:$src))),
8409          (VBROADCASTSSrm addr:$src)>;
8410}
8411
// AVX register-source broadcast fallbacks: emulate broadcast-from-register
// with VPSHUFD (imm 0 duplicates element 0 for 32-bit elements, 0x44
// duplicates the low 64 bits) and, for 256-bit results, a VINSERTF128 that
// duplicates the shuffled 128-bit value into both lanes.
8412let Predicates = [HasAVX] in {
8413  // Provide fallback in case the load node that is used in the patterns above
8414  // is used by additional users, which prevents the pattern selection.
8415  let AddedComplexity = 20 in {
8416  // 128bit broadcasts:
8417  def : Pat<(v4f32 (X86VBroadcast FR32:$src)),
8418            (VPSHUFDri (COPY_TO_REGCLASS FR32:$src, VR128), 0)>;
8419  def : Pat<(v8f32 (X86VBroadcast FR32:$src)),
8420            (VINSERTF128rr (INSERT_SUBREG (v8f32 (IMPLICIT_DEF)),
8421              (VPSHUFDri (COPY_TO_REGCLASS FR32:$src, VR128), 0), sub_xmm),
8422              (VPSHUFDri (COPY_TO_REGCLASS FR32:$src, VR128), 0), 1)>;
8423  def : Pat<(v4f64 (X86VBroadcast FR64:$src)),
8424            (VINSERTF128rr (INSERT_SUBREG (v4f64 (IMPLICIT_DEF)),
8425              (VPSHUFDri (COPY_TO_REGCLASS FR64:$src, VR128), 0x44), sub_xmm),
8426              (VPSHUFDri (COPY_TO_REGCLASS FR64:$src, VR128), 0x44), 1)>;
8427
8428  def : Pat<(v4i32 (X86VBroadcast GR32:$src)),
8429            (VPSHUFDri (COPY_TO_REGCLASS GR32:$src, VR128), 0)>;
8430  def : Pat<(v8i32 (X86VBroadcast GR32:$src)),
8431            (VINSERTF128rr (INSERT_SUBREG (v8i32 (IMPLICIT_DEF)),
8432              (VPSHUFDri (COPY_TO_REGCLASS GR32:$src, VR128), 0), sub_xmm),
8433              (VPSHUFDri (COPY_TO_REGCLASS GR32:$src, VR128), 0), 1)>;
8434  def : Pat<(v4i64 (X86VBroadcast GR64:$src)),
8435            (VINSERTF128rr (INSERT_SUBREG (v4i64 (IMPLICIT_DEF)),
8436              (VPSHUFDri (COPY_TO_REGCLASS GR64:$src, VR128), 0x44), sub_xmm),
8437              (VPSHUFDri (COPY_TO_REGCLASS GR64:$src, VR128), 0x44), 1)>;
8438  }
8439
  // 64-bit element broadcast within 128 bits uses VMOVDDUP.
8440  def : Pat<(v2f64 (X86VBroadcast f64:$src)),
8441            (VMOVDDUPrr (COPY_TO_REGCLASS FR64:$src, VR128))>;
8442  def : Pat<(v2i64 (X86VBroadcast i64:$src)),
8443            (VMOVDDUPrr (COPY_TO_REGCLASS GR64:$src, VR128))>;
8444}
8445
8446//===----------------------------------------------------------------------===//
8447// VPERM - Permute instructions
8448//
8449
// avx2_perm - 256-bit full cross-lane variable permute (VPERMD/VPERMPS):
// Yrr takes the control vector in $src2; Yrm loads it via mem_frag and
// bitconverts it to OpVT. Both select the X86VPermv node.
8450multiclass avx2_perm<bits<8> opc, string OpcodeStr, PatFrag mem_frag,
8451                     ValueType OpVT, X86FoldableSchedWrite Sched> {
8452  def Yrr : AVX28I<opc, MRMSrcReg, (outs VR256:$dst),
8453                   (ins VR256:$src1, VR256:$src2),
8454                   !strconcat(OpcodeStr,
8455                       "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
8456                   [(set VR256:$dst,
8457                     (OpVT (X86VPermv VR256:$src1, VR256:$src2)))]>,
8458                   Sched<[Sched]>, VEX_4V, VEX_L;
8459  def Yrm : AVX28I<opc, MRMSrcMem, (outs VR256:$dst),
8460                   (ins VR256:$src1, i256mem:$src2),
8461                   !strconcat(OpcodeStr,
8462                       "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
8463                   [(set VR256:$dst,
8464                     (OpVT (X86VPermv VR256:$src1,
8465                            (bitconvert (mem_frag addr:$src2)))))]>,
8466                   Sched<[Sched.Folded, ReadAfterLd]>, VEX_4V, VEX_L;
8467}
8468
// VPERMD (integer) and VPERMPS (FP, single domain) variable permutes.
8469defm VPERMD : avx2_perm<0x36, "vpermd", loadv4i64, v8i32, WriteShuffle256>;
8470let ExeDomain = SSEPackedSingle in
8471defm VPERMPS : avx2_perm<0x16, "vpermps", loadv8f32, v8f32, WriteFShuffle256>;
8472
// avx2_perm_imm - 256-bit cross-lane permute with an 8-bit immediate control
// (VPERMQ/VPERMPD): Yri operates on a register source, Ymi on a source
// loaded via mem_frag. Both select the X86VPermi node.
8473multiclass avx2_perm_imm<bits<8> opc, string OpcodeStr, PatFrag mem_frag,
8474                         ValueType OpVT, X86FoldableSchedWrite Sched> {
8475  def Yri : AVX2AIi8<opc, MRMSrcReg, (outs VR256:$dst),
8476                     (ins VR256:$src1, u8imm:$src2),
8477                     !strconcat(OpcodeStr,
8478                         "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
8479                     [(set VR256:$dst,
8480                       (OpVT (X86VPermi VR256:$src1, (i8 imm:$src2))))]>,
8481                     Sched<[Sched]>, VEX, VEX_L;
8482  def Ymi : AVX2AIi8<opc, MRMSrcMem, (outs VR256:$dst),
8483                     (ins i256mem:$src1, u8imm:$src2),
8484                     !strconcat(OpcodeStr,
8485                         "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
8486                     [(set VR256:$dst,
8487                       (OpVT (X86VPermi (mem_frag addr:$src1),
8488                              (i8 imm:$src2))))]>,
8489                     Sched<[Sched.Folded, ReadAfterLd]>, VEX, VEX_L;
8490}
8491
// VPERMQ (integer) and VPERMPD (FP, double domain) immediate permutes; both
// need VEX.W for 64-bit element selection.
8492defm VPERMQ : avx2_perm_imm<0x00, "vpermq", loadv4i64, v4i64,
8493                            WriteShuffle256>, VEX_W;
8494let ExeDomain = SSEPackedDouble in
8495defm VPERMPD : avx2_perm_imm<0x01, "vpermpd", loadv4f64, v4f64,
8496                             WriteFShuffle256>, VEX_W;
8497
8498//===----------------------------------------------------------------------===//
8499// VPERM2I128 - Permute Floating-Point Values in 128-bit chunks
8500//
// VPERM2I128 register and memory forms. The rr form selects X86VPerm2x128
// for v4i64 inline; other integer element types are handled by the explicit
// patterns below.
8501def VPERM2I128rr : AVX2AIi8<0x46, MRMSrcReg, (outs VR256:$dst),
8502          (ins VR256:$src1, VR256:$src2, u8imm:$src3),
8503          "vperm2i128\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
8504          [(set VR256:$dst, (v4i64 (X86VPerm2x128 VR256:$src1, VR256:$src2,
8505                            (i8 imm:$src3))))]>, Sched<[WriteShuffle256]>,
8506          VEX_4V, VEX_L;
// Memory operand fixed from f256mem to i256mem: this is an integer-domain
// instruction whose pattern loads loadv4i64, and every sibling AVX2 integer
// instruction here (VINSERTI128, VEXTRACTI128, VPERMD/Q, VPBLENDD) uses the
// integer memory operand. Encoding and printing are unchanged.
8507def VPERM2I128rm : AVX2AIi8<0x46, MRMSrcMem, (outs VR256:$dst),
8508          (ins VR256:$src1, i256mem:$src2, u8imm:$src3),
8509          "vperm2i128\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
8510          [(set VR256:$dst, (X86VPerm2x128 VR256:$src1, (loadv4i64 addr:$src2),
8511                             (i8 imm:$src3)))]>,
8512          Sched<[WriteShuffle256Ld, ReadAfterLd]>, VEX_4V, VEX_L;
8513
// Select VPERM2I128 for the remaining integer element types (v4i64 is
// covered inline by the instruction defs above). Memory operands are matched
// as v4i64 loads and bitconverted.
8514let Predicates = [HasAVX2] in {
8515def : Pat<(v8i32 (X86VPerm2x128 VR256:$src1, VR256:$src2, (i8 imm:$imm))),
8516          (VPERM2I128rr VR256:$src1, VR256:$src2, imm:$imm)>;
8517def : Pat<(v32i8 (X86VPerm2x128 VR256:$src1, VR256:$src2, (i8 imm:$imm))),
8518          (VPERM2I128rr VR256:$src1, VR256:$src2, imm:$imm)>;
8519def : Pat<(v16i16 (X86VPerm2x128 VR256:$src1, VR256:$src2, (i8 imm:$imm))),
8520          (VPERM2I128rr VR256:$src1, VR256:$src2, imm:$imm)>;
8521
8522def : Pat<(v32i8 (X86VPerm2x128 VR256:$src1, (bc_v32i8 (loadv4i64 addr:$src2)),
8523                  (i8 imm:$imm))),
8524          (VPERM2I128rm VR256:$src1, addr:$src2, imm:$imm)>;
8525def : Pat<(v16i16 (X86VPerm2x128 VR256:$src1,
8526                   (bc_v16i16 (loadv4i64 addr:$src2)), (i8 imm:$imm))),
8527          (VPERM2I128rm VR256:$src1, addr:$src2, imm:$imm)>;
8528def : Pat<(v8i32 (X86VPerm2x128 VR256:$src1, (bc_v8i32 (loadv4i64 addr:$src2)),
8529                  (i8 imm:$imm))),
8530          (VPERM2I128rm VR256:$src1, addr:$src2, imm:$imm)>;
8531}
8532
8533
8534//===----------------------------------------------------------------------===//
8535// VINSERTI128 - Insert packed integer values
8536//
// VINSERTI128 register and memory forms. Both are pattern-less
// (hasSideEffects = 0); selection happens through the explicit insert
// patterns below.
8537let hasSideEffects = 0 in {
8538def VINSERTI128rr : AVX2AIi8<0x38, MRMSrcReg, (outs VR256:$dst),
8539          (ins VR256:$src1, VR128:$src2, u8imm:$src3),
8540          "vinserti128\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
8541          []>, Sched<[WriteShuffle256]>, VEX_4V, VEX_L;
8542let mayLoad = 1 in
8543def VINSERTI128rm : AVX2AIi8<0x38, MRMSrcMem, (outs VR256:$dst),
8544          (ins VR256:$src1, i128mem:$src2, u8imm:$src3),
8545          "vinserti128\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
8546          []>, Sched<[WriteShuffle256Ld, ReadAfterLd]>, VEX_4V, VEX_L;
8547}
8548
// Map 128-bit integer subvector inserts onto VINSERTI128, for register and
// loaded subvectors of every integer element type. The subvector index is
// converted into the immediate by INSERT_get_vinsert128_imm. Guarded by
// NoVLX so AVX-512VL forms take precedence when available.
8549let Predicates = [HasAVX2, NoVLX] in {
8550def : Pat<(vinsert128_insert:$ins (v4i64 VR256:$src1), (v2i64 VR128:$src2),
8551                                   (iPTR imm)),
8552          (VINSERTI128rr VR256:$src1, VR128:$src2,
8553                         (INSERT_get_vinsert128_imm VR256:$ins))>;
8554def : Pat<(vinsert128_insert:$ins (v8i32 VR256:$src1), (v4i32 VR128:$src2),
8555                                   (iPTR imm)),
8556          (VINSERTI128rr VR256:$src1, VR128:$src2,
8557                         (INSERT_get_vinsert128_imm VR256:$ins))>;
8558def : Pat<(vinsert128_insert:$ins (v32i8 VR256:$src1), (v16i8 VR128:$src2),
8559                                   (iPTR imm)),
8560          (VINSERTI128rr VR256:$src1, VR128:$src2,
8561                         (INSERT_get_vinsert128_imm VR256:$ins))>;
8562def : Pat<(vinsert128_insert:$ins (v16i16 VR256:$src1), (v8i16 VR128:$src2),
8563                                   (iPTR imm)),
8564          (VINSERTI128rr VR256:$src1, VR128:$src2,
8565                         (INSERT_get_vinsert128_imm VR256:$ins))>;
8566
// Fold a loaded subvector into the memory form (VINSERTI128rm).
8567def : Pat<(vinsert128_insert:$ins (v4i64 VR256:$src1), (loadv2i64 addr:$src2),
8568                                   (iPTR imm)),
8569          (VINSERTI128rm VR256:$src1, addr:$src2,
8570                         (INSERT_get_vinsert128_imm VR256:$ins))>;
8571def : Pat<(vinsert128_insert:$ins (v8i32 VR256:$src1),
8572                                   (bc_v4i32 (loadv2i64 addr:$src2)),
8573                                   (iPTR imm)),
8574          (VINSERTI128rm VR256:$src1, addr:$src2,
8575                         (INSERT_get_vinsert128_imm VR256:$ins))>;
8576def : Pat<(vinsert128_insert:$ins (v32i8 VR256:$src1),
8577                                   (bc_v16i8 (loadv2i64 addr:$src2)),
8578                                   (iPTR imm)),
8579          (VINSERTI128rm VR256:$src1, addr:$src2,
8580                         (INSERT_get_vinsert128_imm VR256:$ins))>;
8581def : Pat<(vinsert128_insert:$ins (v16i16 VR256:$src1),
8582                                   (bc_v8i16 (loadv2i64 addr:$src2)),
8583                                   (iPTR imm)),
8584          (VINSERTI128rm VR256:$src1, addr:$src2,
8585                         (INSERT_get_vinsert128_imm VR256:$ins))>;
8586}
8587
8588//===----------------------------------------------------------------------===//
8589// VEXTRACTI128 - Extract packed integer values
8590//
// VEXTRACTI128 register and memory forms. Both are pattern-less; selection
// happens through the explicit extract patterns below.
8591def VEXTRACTI128rr : AVX2AIi8<0x39, MRMDestReg, (outs VR128:$dst),
8592          (ins VR256:$src1, u8imm:$src2),
8593          "vextracti128\t{$src2, $src1, $dst|$dst, $src1, $src2}", []>,
8594          Sched<[WriteShuffle256]>, VEX, VEX_L;
8595let hasSideEffects = 0, mayStore = 1 in
8596def VEXTRACTI128mr : AVX2AIi8<0x39, MRMDestMem, (outs),
8597          (ins i128mem:$dst, VR256:$src1, u8imm:$src2),
8598          "vextracti128\t{$src2, $src1, $dst|$dst, $src1, $src2}", []>,
8599          Sched<[WriteStore]>, VEX, VEX_L;
8600
// Selection patterns for VEXTRACTI128, guarded on AVX2.  One register
// pattern per 256-bit integer element type, plus store-folding patterns that
// use the memory form.  EXTRACT_get_vextract128_imm converts the extraction
// index captured in $ext into the instruction's 0/1 lane immediate.
let Predicates = [HasAVX2] in {
// v4i64 -> v2i64 lane extract.
def : Pat<(vextract128_extract:$ext VR256:$src1, (iPTR imm)),
          (v2i64 (VEXTRACTI128rr
                    (v4i64 VR256:$src1),
                    (EXTRACT_get_vextract128_imm VR128:$ext)))>;
// v8i32 -> v4i32 lane extract.
def : Pat<(vextract128_extract:$ext VR256:$src1, (iPTR imm)),
          (v4i32 (VEXTRACTI128rr
                    (v8i32 VR256:$src1),
                    (EXTRACT_get_vextract128_imm VR128:$ext)))>;
// v16i16 -> v8i16 lane extract.
def : Pat<(vextract128_extract:$ext VR256:$src1, (iPTR imm)),
          (v8i16 (VEXTRACTI128rr
                    (v16i16 VR256:$src1),
                    (EXTRACT_get_vextract128_imm VR128:$ext)))>;
// v32i8 -> v16i8 lane extract.
def : Pat<(vextract128_extract:$ext VR256:$src1, (iPTR imm)),
          (v16i8 (VEXTRACTI128rr
                    (v32i8 VR256:$src1),
                    (EXTRACT_get_vextract128_imm VR128:$ext)))>;

// Fold (store (extract ...)) into the memory form, one pattern per
// element type.
def : Pat<(store (v2i64 (vextract128_extract:$ext (v4i64 VR256:$src1),
                         (iPTR imm))), addr:$dst),
          (VEXTRACTI128mr addr:$dst, VR256:$src1,
           (EXTRACT_get_vextract128_imm VR128:$ext))>;
def : Pat<(store (v4i32 (vextract128_extract:$ext (v8i32 VR256:$src1),
                         (iPTR imm))), addr:$dst),
          (VEXTRACTI128mr addr:$dst, VR256:$src1,
           (EXTRACT_get_vextract128_imm VR128:$ext))>;
def : Pat<(store (v8i16 (vextract128_extract:$ext (v16i16 VR256:$src1),
                         (iPTR imm))), addr:$dst),
          (VEXTRACTI128mr addr:$dst, VR256:$src1,
           (EXTRACT_get_vextract128_imm VR128:$ext))>;
def : Pat<(store (v16i8 (vextract128_extract:$ext (v32i8 VR256:$src1),
                         (iPTR imm))), addr:$dst),
          (VEXTRACTI128mr addr:$dst, VR256:$src1,
           (EXTRACT_get_vextract128_imm VR128:$ext))>;
}
8636
8637//===----------------------------------------------------------------------===//
8638// VPMASKMOV - Conditional SIMD Integer Packed Loads and Stores
8639//
// Emits the four forms of a VPMASKMOV instruction: 128/256-bit masked load
// (opcode 0x8c, rm/Yrm) and 128/256-bit masked store (opcode 0x8e, mr/Ymr).
// Each form is matched directly via its target intrinsic; in the load forms
// $src1 is the mask, in the store forms $src1 is the mask and $src2 the data.
multiclass avx2_pmovmask<string OpcodeStr,
                         Intrinsic IntLd128, Intrinsic IntLd256,
                         Intrinsic IntSt128, Intrinsic IntSt256> {
  // 128-bit masked load.
  def rm  : AVX28I<0x8c, MRMSrcMem, (outs VR128:$dst),
             (ins VR128:$src1, i128mem:$src2),
             !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
             [(set VR128:$dst, (IntLd128 addr:$src2, VR128:$src1))]>, VEX_4V;
  // 256-bit masked load.
  def Yrm : AVX28I<0x8c, MRMSrcMem, (outs VR256:$dst),
             (ins VR256:$src1, i256mem:$src2),
             !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
             [(set VR256:$dst, (IntLd256 addr:$src2, VR256:$src1))]>,
             VEX_4V, VEX_L;
  // 128-bit masked store.
  def mr  : AVX28I<0x8e, MRMDestMem, (outs),
             (ins i128mem:$dst, VR128:$src1, VR128:$src2),
             !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
             [(IntSt128 addr:$dst, VR128:$src1, VR128:$src2)]>, VEX_4V;
  // 256-bit masked store.
  def Ymr : AVX28I<0x8e, MRMDestMem, (outs),
             (ins i256mem:$dst, VR256:$src1, VR256:$src2),
             !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
             [(IntSt256 addr:$dst, VR256:$src1, VR256:$src2)]>, VEX_4V, VEX_L;
}
8661
// Instantiate the masked move instructions for dword and qword elements.
// The qword variants carry VEX_W to select 64-bit element size.
defm VPMASKMOVD : avx2_pmovmask<"vpmaskmovd",
                                int_x86_avx2_maskload_d,
                                int_x86_avx2_maskload_d_256,
                                int_x86_avx2_maskstore_d,
                                int_x86_avx2_maskstore_d_256>;
defm VPMASKMOVQ : avx2_pmovmask<"vpmaskmovq",
                                int_x86_avx2_maskload_q,
                                int_x86_avx2_maskload_q_256,
                                int_x86_avx2_maskstore_q,
                                int_x86_avx2_maskstore_q_256>, VEX_W;
8672
// Lower generic masked store/load nodes with dword (i32/f32) elements onto
// the AVX/AVX2 mask-move instructions.  Float data uses VMASKMOVPS, integer
// data uses VPMASKMOVD.  Three masked-load cases are handled per type:
//  - undef passthru: the mask-move alone suffices;
//  - all-zeros passthru: also just the mask-move, since masked-off lanes are
//    written as zero;
//  - general passthru ($src0): blend the loaded value with $src0 under the
//    same mask via VBLENDVPS.
def: Pat<(X86mstore addr:$ptr, (v8i32 VR256:$mask), (v8f32 VR256:$src)),
         (VMASKMOVPSYmr addr:$ptr, VR256:$mask, VR256:$src)>;

def: Pat<(X86mstore addr:$ptr, (v8i32 VR256:$mask), (v8i32 VR256:$src)),
         (VPMASKMOVDYmr addr:$ptr, VR256:$mask, VR256:$src)>;

def: Pat<(X86mstore addr:$ptr, (v4i32 VR128:$mask), (v4f32 VR128:$src)),
         (VMASKMOVPSmr addr:$ptr, VR128:$mask, VR128:$src)>;

def: Pat<(X86mstore addr:$ptr, (v4i32 VR128:$mask), (v4i32 VR128:$src)),
         (VPMASKMOVDmr addr:$ptr, VR128:$mask, VR128:$src)>;

def: Pat<(v8f32 (masked_load addr:$ptr, (v8i32 VR256:$mask), undef)),
         (VMASKMOVPSYrm VR256:$mask, addr:$ptr)>;

def: Pat<(v8f32 (masked_load addr:$ptr, (v8i32 VR256:$mask),
                             (bc_v8f32 (v8i32 immAllZerosV)))),
         (VMASKMOVPSYrm VR256:$mask, addr:$ptr)>;

def: Pat<(v8f32 (masked_load addr:$ptr, (v8i32 VR256:$mask), (v8f32 VR256:$src0))),
         (VBLENDVPSYrr VR256:$src0, (VMASKMOVPSYrm VR256:$mask, addr:$ptr),
                       VR256:$mask)>;

def: Pat<(v8i32 (masked_load addr:$ptr, (v8i32 VR256:$mask), undef)),
         (VPMASKMOVDYrm VR256:$mask, addr:$ptr)>;

def: Pat<(v8i32 (masked_load addr:$ptr, (v8i32 VR256:$mask), (v8i32 immAllZerosV))),
         (VPMASKMOVDYrm VR256:$mask, addr:$ptr)>;

def: Pat<(v8i32 (masked_load addr:$ptr, (v8i32 VR256:$mask), (v8i32 VR256:$src0))),
         (VBLENDVPSYrr VR256:$src0, (VPMASKMOVDYrm VR256:$mask, addr:$ptr),
                       VR256:$mask)>;

def: Pat<(v4f32 (masked_load addr:$ptr, (v4i32 VR128:$mask), undef)),
         (VMASKMOVPSrm VR128:$mask, addr:$ptr)>;

def: Pat<(v4f32 (masked_load addr:$ptr, (v4i32 VR128:$mask),
                             (bc_v4f32 (v4i32 immAllZerosV)))),
         (VMASKMOVPSrm VR128:$mask, addr:$ptr)>;

def: Pat<(v4f32 (masked_load addr:$ptr, (v4i32 VR128:$mask), (v4f32 VR128:$src0))),
         (VBLENDVPSrr VR128:$src0, (VMASKMOVPSrm VR128:$mask, addr:$ptr),
                       VR128:$mask)>;

def: Pat<(v4i32 (masked_load addr:$ptr, (v4i32 VR128:$mask), undef)),
         (VPMASKMOVDrm VR128:$mask, addr:$ptr)>;

def: Pat<(v4i32 (masked_load addr:$ptr, (v4i32 VR128:$mask), (v4i32 immAllZerosV))),
         (VPMASKMOVDrm VR128:$mask, addr:$ptr)>;

def: Pat<(v4i32 (masked_load addr:$ptr, (v4i32 VR128:$mask), (v4i32 VR128:$src0))),
         (VBLENDVPSrr VR128:$src0, (VPMASKMOVDrm VR128:$mask, addr:$ptr),
                       VR128:$mask)>;
8726
// Same lowering as the dword patterns above, but for qword (i64/f64)
// elements: float data uses VMASKMOVPD, integer data uses VPMASKMOVQ, and
// the general-passthru case blends with VBLENDVPD.
def: Pat<(X86mstore addr:$ptr, (v4i64 VR256:$mask), (v4f64 VR256:$src)),
         (VMASKMOVPDYmr addr:$ptr, VR256:$mask, VR256:$src)>;

def: Pat<(X86mstore addr:$ptr, (v4i64 VR256:$mask), (v4i64 VR256:$src)),
         (VPMASKMOVQYmr addr:$ptr, VR256:$mask, VR256:$src)>;

def: Pat<(v4f64 (masked_load addr:$ptr, (v4i64 VR256:$mask), undef)),
         (VMASKMOVPDYrm VR256:$mask, addr:$ptr)>;

// Zero passthru: the mask-move already zeroes masked-off lanes.
def: Pat<(v4f64 (masked_load addr:$ptr, (v4i64 VR256:$mask),
                             (v4f64 immAllZerosV))),
         (VMASKMOVPDYrm VR256:$mask, addr:$ptr)>;

def: Pat<(v4f64 (masked_load addr:$ptr, (v4i64 VR256:$mask), (v4f64 VR256:$src0))),
         (VBLENDVPDYrr VR256:$src0, (VMASKMOVPDYrm VR256:$mask, addr:$ptr),
                       VR256:$mask)>;

def: Pat<(v4i64 (masked_load addr:$ptr, (v4i64 VR256:$mask), undef)),
         (VPMASKMOVQYrm VR256:$mask, addr:$ptr)>;

// Integer zero vectors are canonically v8i32, hence the bitconvert.
def: Pat<(v4i64 (masked_load addr:$ptr, (v4i64 VR256:$mask),
                             (bc_v4i64 (v8i32 immAllZerosV)))),
         (VPMASKMOVQYrm VR256:$mask, addr:$ptr)>;

def: Pat<(v4i64 (masked_load addr:$ptr, (v4i64 VR256:$mask), (v4i64 VR256:$src0))),
         (VBLENDVPDYrr VR256:$src0, (VPMASKMOVQYrm VR256:$mask, addr:$ptr),
                       VR256:$mask)>;

def: Pat<(X86mstore addr:$ptr, (v2i64 VR128:$mask), (v2f64 VR128:$src)),
         (VMASKMOVPDmr addr:$ptr, VR128:$mask, VR128:$src)>;

def: Pat<(X86mstore addr:$ptr, (v2i64 VR128:$mask), (v2i64 VR128:$src)),
         (VPMASKMOVQmr addr:$ptr, VR128:$mask, VR128:$src)>;

def: Pat<(v2f64 (masked_load addr:$ptr, (v2i64 VR128:$mask), undef)),
         (VMASKMOVPDrm VR128:$mask, addr:$ptr)>;

def: Pat<(v2f64 (masked_load addr:$ptr, (v2i64 VR128:$mask),
                             (v2f64 immAllZerosV))),
         (VMASKMOVPDrm VR128:$mask, addr:$ptr)>;

def: Pat<(v2f64 (masked_load addr:$ptr, (v2i64 VR128:$mask), (v2f64 VR128:$src0))),
         (VBLENDVPDrr VR128:$src0, (VMASKMOVPDrm VR128:$mask, addr:$ptr),
                       VR128:$mask)>;

def: Pat<(v2i64 (masked_load addr:$ptr, (v2i64 VR128:$mask), undef)),
         (VPMASKMOVQrm VR128:$mask, addr:$ptr)>;

def: Pat<(v2i64 (masked_load addr:$ptr, (v2i64 VR128:$mask),
                             (bc_v2i64 (v4i32 immAllZerosV)))),
         (VPMASKMOVQrm VR128:$mask, addr:$ptr)>;

def: Pat<(v2i64 (masked_load addr:$ptr, (v2i64 VR128:$mask), (v2i64 VR128:$src0))),
         (VBLENDVPDrr VR128:$src0, (VPMASKMOVQrm VR128:$mask, addr:$ptr),
                       VR128:$mask)>;
8782
8783//===----------------------------------------------------------------------===//
8784// Variable Bit Shifts
8785//
// Emits the four forms (rr/rm/Yrr/Yrm) of an AVX2 per-element variable shift
// instruction, matching the generic shift SDNode (shl/srl/sra) with a vector
// shift-amount operand.  Memory operands are loaded as v2i64/v4i64 and
// bitconverted to the element type, matching the integer-domain loads used
// elsewhere in this file.
multiclass avx2_var_shift<bits<8> opc, string OpcodeStr, SDNode OpNode,
                          ValueType vt128, ValueType vt256> {
  // 128-bit register-register form.
  def rr  : AVX28I<opc, MRMSrcReg, (outs VR128:$dst),
             (ins VR128:$src1, VR128:$src2),
             !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
             [(set VR128:$dst,
               (vt128 (OpNode VR128:$src1, (vt128 VR128:$src2))))]>,
             VEX_4V, Sched<[WriteVarVecShift]>;
  // 128-bit form with the shift amounts loaded from memory.
  def rm  : AVX28I<opc, MRMSrcMem, (outs VR128:$dst),
             (ins VR128:$src1, i128mem:$src2),
             !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
             [(set VR128:$dst,
               (vt128 (OpNode VR128:$src1,
                       (vt128 (bitconvert (loadv2i64 addr:$src2))))))]>,
             VEX_4V, Sched<[WriteVarVecShiftLd, ReadAfterLd]>;
  // 256-bit register-register form.
  def Yrr : AVX28I<opc, MRMSrcReg, (outs VR256:$dst),
             (ins VR256:$src1, VR256:$src2),
             !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
             [(set VR256:$dst,
               (vt256 (OpNode VR256:$src1, (vt256 VR256:$src2))))]>,
             VEX_4V, VEX_L, Sched<[WriteVarVecShift]>;
  // 256-bit form with the shift amounts loaded from memory.
  def Yrm : AVX28I<opc, MRMSrcMem, (outs VR256:$dst),
             (ins VR256:$src1, i256mem:$src2),
             !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
             [(set VR256:$dst,
               (vt256 (OpNode VR256:$src1,
                       (vt256 (bitconvert (loadv4i64 addr:$src2))))))]>,
             VEX_4V, VEX_L, Sched<[WriteVarVecShiftLd, ReadAfterLd]>;
}
8815
// Instantiate the variable shifts.  Qword variants carry VEX_W for the
// 64-bit element size.  Note there is no VPSRAVQ here: AVX2 provides no
// variable arithmetic right shift for qword elements.
defm VPSLLVD : avx2_var_shift<0x47, "vpsllvd", shl, v4i32, v8i32>;
defm VPSLLVQ : avx2_var_shift<0x47, "vpsllvq", shl, v2i64, v4i64>, VEX_W;
defm VPSRLVD : avx2_var_shift<0x45, "vpsrlvd", srl, v4i32, v8i32>;
defm VPSRLVQ : avx2_var_shift<0x45, "vpsrlvq", srl, v2i64, v4i64>, VEX_W;
defm VPSRAVD : avx2_var_shift<0x46, "vpsravd", sra, v4i32, v8i32>;
8821
8822//===----------------------------------------------------------------------===//
8823// VGATHER - GATHER Operations
// Emits the 128-bit (rm) and wider (Yrm) forms of an AVX2 gather.  Each form
// has two outputs: the gathered data ($dst) and the written-back mask
// ($mask_wb) — the hardware clears mask elements as it completes each
// element's load.  The tied-operand and earlyclobber constraints are applied
// at the instantiation site below.  RC256 parameterizes the wider form's
// register class because some gathers (qd/qps) still produce a 128-bit
// result there.  No patterns here; gathers are selected elsewhere.
multiclass avx2_gather<bits<8> opc, string OpcodeStr, RegisterClass RC256,
                       X86MemOperand memop128, X86MemOperand memop256> {
  def rm  : AVX28I<opc, MRMSrcMem, (outs VR128:$dst, VR128:$mask_wb),
            (ins VR128:$src1, memop128:$src2, VR128:$mask),
            !strconcat(OpcodeStr,
              "\t{$mask, $src2, $dst|$dst, $src2, $mask}"),
            []>, VEX_4VOp3;
  def Yrm : AVX28I<opc, MRMSrcMem, (outs RC256:$dst, RC256:$mask_wb),
            (ins RC256:$src1, memop256:$src2, RC256:$mask),
            !strconcat(OpcodeStr,
              "\t{$mask, $src2, $dst|$dst, $src2, $mask}"),
            []>, VEX_4VOp3, VEX_L;
}
8837
// Instantiate all AVX2 gathers.  $src1 (the passthru) is tied to $dst and
// $mask to $mask_wb, and both outputs are earlyclobber so the register
// allocator never assigns them over the index/mask inputs.  mayLoad is set
// explicitly because the instructions have no patterns.  VEX_W selects
// 64-bit data elements; the vx*/vy* memory operands encode the index vector
// width (xmm vs ymm indices).
let mayLoad = 1, Constraints
  = "@earlyclobber $dst,@earlyclobber $mask_wb, $src1 = $dst, $mask = $mask_wb"
  in {
  defm VPGATHERDQ : avx2_gather<0x90, "vpgatherdq", VR256, vx64mem, vx64mem>, VEX_W;
  defm VPGATHERQQ : avx2_gather<0x91, "vpgatherqq", VR256, vx64mem, vy64mem>, VEX_W;
  defm VPGATHERDD : avx2_gather<0x90, "vpgatherdd", VR256, vx32mem, vy32mem>;
  // qd: qword indices gathering dword data, so even the Y form yields VR128.
  defm VPGATHERQD : avx2_gather<0x91, "vpgatherqd", VR128, vx32mem, vy32mem>;

  let ExeDomain = SSEPackedDouble in {
    defm VGATHERDPD : avx2_gather<0x92, "vgatherdpd", VR256, vx64mem, vx64mem>, VEX_W;
    defm VGATHERQPD : avx2_gather<0x93, "vgatherqpd", VR256, vx64mem, vy64mem>, VEX_W;
  }

  let ExeDomain = SSEPackedSingle in {
    defm VGATHERDPS : avx2_gather<0x92, "vgatherdps", VR256, vx32mem, vy32mem>;
    // qps: qword indices gathering float data, Y form also yields VR128.
    defm VGATHERQPS : avx2_gather<0x93, "vgatherqps", VR128, vx32mem, vy32mem>;
  }
}
8856
8857//===----------------------------------------------------------------------===//
8858// Extra selection patterns for FR128, f128, f128mem
8859
8860// movaps is shorter than movdqa. movaps is in SSE and movdqa is in SSE2.
// f128 values live in FR128 but have no dedicated move instructions, so
// loads/stores are lowered through VR128 with MOVAPS.
// movaps is shorter than movdqa. movaps is in SSE and movdqa is in SSE2.
def : Pat<(store (f128 FR128:$src), addr:$dst),
          (MOVAPSmr addr:$dst, (COPY_TO_REGCLASS (f128 FR128:$src), VR128))>;

def : Pat<(loadf128 addr:$src),
          (COPY_TO_REGCLASS (MOVAPSrm addr:$src), FR128)>;
8866
8867// andps is shorter than andpd or pand. andps is SSE and andpd/pand are in SSE2
// Bitwise AND on f128: round-trip through VR128 and use ANDPS.  Both the
// FP node (X86fand) and the plain integer `and` are covered.
// andps is shorter than andpd or pand. andps is SSE and andpd/pand are in SSE2
def : Pat<(X86fand FR128:$src1, (loadf128 addr:$src2)),
          (COPY_TO_REGCLASS
           (ANDPSrm (COPY_TO_REGCLASS FR128:$src1, VR128), f128mem:$src2),
           FR128)>;

def : Pat<(X86fand FR128:$src1, FR128:$src2),
          (COPY_TO_REGCLASS
           (ANDPSrr (COPY_TO_REGCLASS FR128:$src1, VR128),
                    (COPY_TO_REGCLASS FR128:$src2, VR128)), FR128)>;

def : Pat<(and FR128:$src1, FR128:$src2),
          (COPY_TO_REGCLASS
           (ANDPSrr (COPY_TO_REGCLASS FR128:$src1, VR128),
                    (COPY_TO_REGCLASS FR128:$src2, VR128)), FR128)>;
8882
// Bitwise OR on f128 via ORPS, mirroring the AND patterns above: X86for
// (register and load-folded forms) plus plain integer `or`.
def : Pat<(X86for FR128:$src1, (loadf128 addr:$src2)),
          (COPY_TO_REGCLASS
           (ORPSrm (COPY_TO_REGCLASS FR128:$src1, VR128), f128mem:$src2),
           FR128)>;

def : Pat<(X86for FR128:$src1, FR128:$src2),
          (COPY_TO_REGCLASS
           (ORPSrr (COPY_TO_REGCLASS FR128:$src1, VR128),
                   (COPY_TO_REGCLASS FR128:$src2, VR128)), FR128)>;

def : Pat<(or FR128:$src1, FR128:$src2),
          (COPY_TO_REGCLASS
           (ORPSrr (COPY_TO_REGCLASS FR128:$src1, VR128),
                   (COPY_TO_REGCLASS FR128:$src2, VR128)), FR128)>;
8897
// Bitwise XOR on f128 via XORPS, mirroring the AND/OR patterns above:
// X86fxor (register and load-folded forms) plus plain integer `xor`.
def : Pat<(X86fxor FR128:$src1, (loadf128 addr:$src2)),
          (COPY_TO_REGCLASS
           (XORPSrm (COPY_TO_REGCLASS FR128:$src1, VR128), f128mem:$src2),
           FR128)>;

def : Pat<(X86fxor FR128:$src1, FR128:$src2),
          (COPY_TO_REGCLASS
           (XORPSrr (COPY_TO_REGCLASS FR128:$src1, VR128),
                    (COPY_TO_REGCLASS FR128:$src2, VR128)), FR128)>;

def : Pat<(xor FR128:$src1, FR128:$src2),
          (COPY_TO_REGCLASS
           (XORPSrr (COPY_TO_REGCLASS FR128:$src1, VR128),
                    (COPY_TO_REGCLASS FR128:$src2, VR128)), FR128)>;
8912