1//===-- X86InstrSSE.td - SSE Instruction Set ---------------*- tablegen -*-===//
2//
3//                     The LLVM Compiler Infrastructure
4//
5// This file is distributed under the University of Illinois Open Source
6// License. See LICENSE.TXT for details.
7//
8//===----------------------------------------------------------------------===//
9//
10// This file describes the X86 SSE instruction set, defining the instructions,
11// and properties of the instructions which are needed for code generation,
12// machine code emission, and analysis.
13//
14//===----------------------------------------------------------------------===//
15
// Pairs the reg-reg and reg-mem itinerary classes for one instruction form,
// together with the scheduling-model write resource used by the new-style
// machine model.
class OpndItins<InstrItinClass arg_rr, InstrItinClass arg_rm> {
  InstrItinClass rr = arg_rr;
  InstrItinClass rm = arg_rm;
  // InstrSchedModel info. Default resource; def sites override this with
  // 'let Sched = ...'.
  X86FoldableSchedWrite Sched = WriteFAdd;
}
22
// Bundles the single- and double-precision OpndItins for one operation.
class SizeItins<OpndItins arg_s, OpndItins arg_d> {
  OpndItins s = arg_s; // f32 form
  OpndItins d = arg_d; // f64 form
}
27
28
// Itinerary triple for shift instructions: reg-reg, reg-mem, and reg-imm
// forms.
class ShiftOpndItins<InstrItinClass arg_rr, InstrItinClass arg_rm,
                     InstrItinClass arg_ri> {
  InstrItinClass rr = arg_rr;
  InstrItinClass rm = arg_rm;
  InstrItinClass ri = arg_ri;
}
35
36
// Scalar FP add/sub itineraries (SS/SD forms).
let Sched = WriteFAdd in {
def SSE_ALU_F32S : OpndItins<IIC_SSE_ALU_F32S_RR, IIC_SSE_ALU_F32S_RM>;
def SSE_ALU_F64S : OpndItins<IIC_SSE_ALU_F64S_RR, IIC_SSE_ALU_F64S_RM>;
}

def SSE_ALU_ITINS_S : SizeItins<SSE_ALU_F32S, SSE_ALU_F64S>;
51
// Scalar FP multiply itineraries (SS/SD forms).
let Sched = WriteFMul in {
// FIXED: the rm itinerary of the f32 variant previously used
// IIC_SSE_MUL_F64S_RM (copy-paste typo from the f64 def below); it now uses
// the matching f32 class.
def SSE_MUL_F32S : OpndItins<IIC_SSE_MUL_F32S_RR, IIC_SSE_MUL_F32S_RM>;

def SSE_MUL_F64S : OpndItins<IIC_SSE_MUL_F64S_RR, IIC_SSE_MUL_F64S_RM>;
}

def SSE_MUL_ITINS_S : SizeItins<SSE_MUL_F32S, SSE_MUL_F64S>;
65
// Scalar FP divide itineraries (SS/SD forms).
let Sched = WriteFDiv in {
// FIXED: the rm itinerary of the f32 variant previously used
// IIC_SSE_DIV_F64S_RM (copy-paste typo); it now uses the matching f32 class.
def SSE_DIV_F32S : OpndItins<IIC_SSE_DIV_F32S_RR, IIC_SSE_DIV_F32S_RM>;

def SSE_DIV_F64S : OpndItins<IIC_SSE_DIV_F64S_RR, IIC_SSE_DIV_F64S_RM>;
}

def SSE_DIV_ITINS_S : SizeItins<SSE_DIV_F32S, SSE_DIV_F64S>;
79
// Packed ("parallel") FP add/sub itineraries (PS/PD forms).
let Sched = WriteFAdd in {
def SSE_ALU_F32P : OpndItins<IIC_SSE_ALU_F32P_RR, IIC_SSE_ALU_F32P_RM>;
def SSE_ALU_F64P : OpndItins<IIC_SSE_ALU_F64P_RR, IIC_SSE_ALU_F64P_RM>;
}

def SSE_ALU_ITINS_P : SizeItins<SSE_ALU_F32P, SSE_ALU_F64P>;
94
// Packed FP multiply itineraries (PS/PD forms).
let Sched = WriteFMul in {
// FIXED: the rm itinerary of the f32 variant previously used
// IIC_SSE_MUL_F64P_RM (copy-paste typo); it now uses the matching f32 class.
def SSE_MUL_F32P : OpndItins<IIC_SSE_MUL_F32P_RR, IIC_SSE_MUL_F32P_RM>;

def SSE_MUL_F64P : OpndItins<IIC_SSE_MUL_F64P_RR, IIC_SSE_MUL_F64P_RM>;
}

def SSE_MUL_ITINS_P : SizeItins<SSE_MUL_F32P, SSE_MUL_F64P>;
108
// Packed FP divide itineraries (PS/PD forms).
let Sched = WriteFDiv in {
// FIXED: the rm itinerary of the f32 variant previously used
// IIC_SSE_DIV_F64P_RM (copy-paste typo); it now uses the matching f32 class.
def SSE_DIV_F32P : OpndItins<IIC_SSE_DIV_F32P_RR, IIC_SSE_DIV_F32P_RM>;

def SSE_DIV_F64P : OpndItins<IIC_SSE_DIV_F64P_RR, IIC_SSE_DIV_F64P_RM>;
}

def SSE_DIV_ITINS_P : SizeItins<SSE_DIV_F32P, SSE_DIV_F64P>;
122
// Packed bitwise-op itineraries. The first variant is tied to the vector
// logic scheduling resource; the second keeps the OpndItins default Sched.
let Sched = WriteVecLogic in
def SSE_VEC_BIT_ITINS_P : OpndItins<IIC_SSE_BIT_P_RR, IIC_SSE_BIT_P_RM>;

def SSE_BIT_ITINS_P : OpndItins<IIC_SSE_BIT_P_RR, IIC_SSE_BIT_P_RM>;
131
// Packed integer ALU itineraries (word/dword vs. qword element forms).
let Sched = WriteVecALU in {
def SSE_INTALU_ITINS_P  : OpndItins<IIC_SSE_INTALU_P_RR, IIC_SSE_INTALU_P_RM>;
def SSE_INTALUQ_ITINS_P : OpndItins<IIC_SSE_INTALUQ_P_RR, IIC_SSE_INTALUQ_P_RM>;
}

// Packed integer multiply itineraries.
let Sched = WriteVecIMul in
def SSE_INTMUL_ITINS_P : OpndItins<IIC_SSE_INTMUL_P_RR, IIC_SSE_INTMUL_P_RM>;
146
// Packed integer shift itineraries: reg-reg, reg-mem, and reg-imm forms.
def SSE_INTSHIFT_ITINS_P : ShiftOpndItins<
  IIC_SSE_INTSH_P_RR, IIC_SSE_INTSH_P_RM, IIC_SSE_INTSH_P_RI
>;

// Aligned and unaligned packed move itineraries.
def SSE_MOVA_ITINS : OpndItins<IIC_SSE_MOVA_P_RR, IIC_SSE_MOVA_P_RM>;
def SSE_MOVU_ITINS : OpndItins<IIC_SSE_MOVU_P_RR, IIC_SSE_MOVU_P_RM>;
158
// Dot-product itineraries (DPPD / DPPS).
def SSE_DPPD_ITINS : OpndItins<IIC_SSE_DPPD_RR, IIC_SSE_DPPD_RM>;

// FIXED: the rm itinerary previously used IIC_SSE_DPPD_RM (copy-paste typo
// from the DPPD def above); it now uses the DPPS class.
def SSE_DPPS_ITINS : OpndItins<IIC_SSE_DPPS_RR, IIC_SSE_DPPS_RM>;
166
// Generic ALU itinerary used where no SSE-specific class applies.
def DEFAULT_ITINS : OpndItins<IIC_ALU_NONMEM, IIC_ALU_MEM>;

// EXTRACTPS / INSERTPS itineraries.
def SSE_EXTRACT_ITINS : OpndItins<IIC_SSE_EXTRACTPS_RR, IIC_SSE_EXTRACTPS_RM>;
def SSE_INSERT_ITINS  : OpndItins<IIC_SSE_INSERTPS_RR, IIC_SSE_INSERTPS_RM>;
178
// MPSADBW itinerary, bound to the MPSAD scheduling resource.
let Sched = WriteMPSAD in
def SSE_MPSADBW_ITINS : OpndItins<IIC_SSE_MPSADBW_RR, IIC_SSE_MPSADBW_RM>;

// PMULLD itinerary, bound to the vector integer multiply resource.
let Sched = WriteVecIMul in
def SSE_PMULLD_ITINS : OpndItins<IIC_SSE_PMULLD_RR, IIC_SSE_PMULLD_RM>;
188
// Definitions for backward compatibility.
// The instructions mapped on these definitions use a different itinerary
// than the actual scheduling model.
let Sched = WriteShuffle in
def DEFAULT_ITINS_SHUFFLESCHED : OpndItins<IIC_ALU_NONMEM, IIC_ALU_MEM>;

let Sched = WriteVecIMul in
def DEFAULT_ITINS_VECIMULSCHED : OpndItins<IIC_ALU_NONMEM, IIC_ALU_MEM>;

let Sched = WriteShuffle in
def SSE_INTALU_ITINS_SHUFF_P : OpndItins<
  IIC_SSE_INTALU_P_RR, IIC_SSE_INTALU_P_RM
>;

let Sched = WriteMPSAD in
def DEFAULT_ITINS_MPSADSCHED : OpndItins<IIC_ALU_NONMEM, IIC_ALU_MEM>;

let Sched = WriteFBlend in
def DEFAULT_ITINS_FBLENDSCHED : OpndItins<IIC_ALU_NONMEM, IIC_ALU_MEM>;

let Sched = WriteBlend in
def DEFAULT_ITINS_BLENDSCHED : OpndItins<IIC_ALU_NONMEM, IIC_ALU_MEM>;

let Sched = WriteVarBlend in
def DEFAULT_ITINS_VARBLENDSCHED : OpndItins<IIC_ALU_NONMEM, IIC_ALU_MEM>;

let Sched = WriteFBlend in
def SSE_INTALU_ITINS_FBLEND_P : OpndItins<
  IIC_SSE_INTALU_P_RR, IIC_SSE_INTALU_P_RM
>;

let Sched = WriteBlend in
def SSE_INTALU_ITINS_BLEND_P : OpndItins<
  IIC_SSE_INTALU_P_RR, IIC_SSE_INTALU_P_RM
>;
236
237//===----------------------------------------------------------------------===//
238// SSE 1 & 2 Instructions Classes
239//===----------------------------------------------------------------------===//
240
/// sse12_fp_scalar - SSE 1 & 2 scalar instructions class.
/// Instantiates the reg-reg (rr) and reg-mem (rm) forms of a binary scalar
/// FP operation. Is2Addr selects the two-operand SSE asm string versus the
/// three-operand AVX one.
multiclass sse12_fp_scalar<bits<8> opc, string OpcodeStr, SDNode OpNode,
                           RegisterClass RC, X86MemOperand x86memop,
                           OpndItins itins,
                           bit Is2Addr = 1> {
  // The reg-reg form is marked commutable for isel.
  let isCommutable = 1 in {
    def rr : SI<opc, MRMSrcReg, (outs RC:$dst), (ins RC:$src1, RC:$src2),
       !if(Is2Addr,
           !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
           !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
       [(set RC:$dst, (OpNode RC:$src1, RC:$src2))], itins.rr>,
       Sched<[itins.Sched]>;
  }
  // Reg-mem form: the second operand is folded from a load.
  def rm : SI<opc, MRMSrcMem, (outs RC:$dst), (ins RC:$src1, x86memop:$src2),
       !if(Is2Addr,
           !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
           !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
       [(set RC:$dst, (OpNode RC:$src1, (load addr:$src2)))], itins.rm>,
       Sched<[itins.Sched.Folded, ReadAfterLd]>;
}
261
/// sse12_fp_scalar_int - SSE 1 & 2 scalar instructions intrinsics class.
/// Like sse12_fp_scalar, but the patterns match the corresponding
/// int_x86_sse* intrinsic, whose name is assembled at TableGen time from
/// SSEVer/OpcodeStr/FPSizeStr via !strconcat + !cast.
multiclass sse12_fp_scalar_int<bits<8> opc, string OpcodeStr, RegisterClass RC,
                             string asm, string SSEVer, string FPSizeStr,
                             Operand memopr, ComplexPattern mem_cpat,
                             OpndItins itins,
                             bit Is2Addr = 1> {
// These defs exist only for pattern matching, not for assembly parsing.
let isCodeGenOnly = 1 in {
  def rr_Int : SI<opc, MRMSrcReg, (outs RC:$dst), (ins RC:$src1, RC:$src2),
       !if(Is2Addr,
           !strconcat(asm, "\t{$src2, $dst|$dst, $src2}"),
           !strconcat(asm, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
       [(set RC:$dst, (!cast<Intrinsic>(
                 !strconcat("int_x86_sse", SSEVer, "_", OpcodeStr, FPSizeStr))
             RC:$src1, RC:$src2))], itins.rr>,
       Sched<[itins.Sched]>;
  // Reg-mem form: the second operand comes from the mem_cpat complex pattern.
  def rm_Int : SI<opc, MRMSrcMem, (outs RC:$dst), (ins RC:$src1, memopr:$src2),
       !if(Is2Addr,
           !strconcat(asm, "\t{$src2, $dst|$dst, $src2}"),
           !strconcat(asm, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
       [(set RC:$dst, (!cast<Intrinsic>(!strconcat("int_x86_sse",
                                          SSEVer, "_", OpcodeStr, FPSizeStr))
             RC:$src1, mem_cpat:$src2))], itins.rm>,
       Sched<[itins.Sched.Folded, ReadAfterLd]>;
}
}
287
/// sse12_fp_packed - SSE 1 & 2 packed instructions class.
/// Instantiates the rr/rm forms of a binary packed FP operation in the
/// given execution domain 'd'.
multiclass sse12_fp_packed<bits<8> opc, string OpcodeStr, SDNode OpNode,
                           RegisterClass RC, ValueType vt,
                           X86MemOperand x86memop, PatFrag mem_frag,
                           Domain d, OpndItins itins, bit Is2Addr = 1> {
  let isCommutable = 1 in
    def rr : PI<opc, MRMSrcReg, (outs RC:$dst), (ins RC:$src1, RC:$src2),
       !if(Is2Addr,
           !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
           !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
       [(set RC:$dst, (vt (OpNode RC:$src1, RC:$src2)))], itins.rr, d>,
       Sched<[itins.Sched]>;
  // Reg-mem form: loads the second operand via the supplied mem_frag.
  let mayLoad = 1 in
    def rm : PI<opc, MRMSrcMem, (outs RC:$dst), (ins RC:$src1, x86memop:$src2),
       !if(Is2Addr,
           !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
           !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
       [(set RC:$dst, (OpNode RC:$src1, (mem_frag addr:$src2)))],
          itins.rm, d>,
       Sched<[itins.Sched.Folded, ReadAfterLd]>;
}
309
/// sse12_fp_packed_logical_rm - SSE 1 & 2 packed instructions class.
/// Like sse12_fp_packed but the caller supplies the full rr/rm pattern
/// lists; used for packed logical ops. Scheduling is fixed to the vector
/// logic resources rather than taken from an OpndItins.
multiclass sse12_fp_packed_logical_rm<bits<8> opc, RegisterClass RC, Domain d,
                                      string OpcodeStr, X86MemOperand x86memop,
                                      list<dag> pat_rr, list<dag> pat_rm,
                                      bit Is2Addr = 1> {
  // pat_rr may be empty, hence hasSideEffects = 0 to keep the def well-formed.
  let isCommutable = 1, hasSideEffects = 0 in
    def rr : PI<opc, MRMSrcReg, (outs RC:$dst), (ins RC:$src1, RC:$src2),
       !if(Is2Addr,
           !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
           !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
       pat_rr, NoItinerary, d>,
       Sched<[WriteVecLogic]>;
  def rm : PI<opc, MRMSrcMem, (outs RC:$dst), (ins RC:$src1, x86memop:$src2),
       !if(Is2Addr,
           !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
           !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
       pat_rm, NoItinerary, d>,
       Sched<[WriteVecLogicLd, ReadAfterLd]>;
}
329
330//===----------------------------------------------------------------------===//
331//  Non-instruction patterns
332//===----------------------------------------------------------------------===//
333
// A vector extract of the first f32/f64 position is a subregister copy;
// no instruction is emitted.
def : Pat<(f32 (vector_extract (v4f32 VR128:$src), (iPTR 0))),
          (COPY_TO_REGCLASS (v4f32 VR128:$src), FR32)>;
def : Pat<(f64 (vector_extract (v2f64 VR128:$src), (iPTR 0))),
          (COPY_TO_REGCLASS (v2f64 VR128:$src), FR64)>;

// A 128-bit subvector extract from the first 256-bit vector position
// is a subregister copy that needs no instruction.
// 32-bit element types:
def : Pat<(v4i32 (extract_subvector (v8i32 VR256:$src), (iPTR 0))),
          (v4i32 (EXTRACT_SUBREG (v8i32 VR256:$src), sub_xmm))>;
def : Pat<(v4f32 (extract_subvector (v8f32 VR256:$src), (iPTR 0))),
          (v4f32 (EXTRACT_SUBREG (v8f32 VR256:$src), sub_xmm))>;

// 64-bit element types:
def : Pat<(v2i64 (extract_subvector (v4i64 VR256:$src), (iPTR 0))),
          (v2i64 (EXTRACT_SUBREG (v4i64 VR256:$src), sub_xmm))>;
def : Pat<(v2f64 (extract_subvector (v4f64 VR256:$src), (iPTR 0))),
          (v2f64 (EXTRACT_SUBREG (v4f64 VR256:$src), sub_xmm))>;

// 8/16-bit element types:
def : Pat<(v8i16 (extract_subvector (v16i16 VR256:$src), (iPTR 0))),
          (v8i16 (EXTRACT_SUBREG (v16i16 VR256:$src), sub_xmm))>;
def : Pat<(v16i8 (extract_subvector (v32i8 VR256:$src), (iPTR 0))),
          (v16i8 (EXTRACT_SUBREG (v32i8 VR256:$src), sub_xmm))>;
356
// A 128-bit subvector insert to the first 256-bit vector position
// is a subregister copy that needs no instruction.
let AddedComplexity = 25 in { // to give priority over vinsertf128rm
def : Pat<(insert_subvector undef, (v2i64 VR128:$src), (iPTR 0)),
          (INSERT_SUBREG (v4i64 (IMPLICIT_DEF)), VR128:$src, sub_xmm)>;
def : Pat<(insert_subvector undef, (v2f64 VR128:$src), (iPTR 0)),
          (INSERT_SUBREG (v4f64 (IMPLICIT_DEF)), VR128:$src, sub_xmm)>;
def : Pat<(insert_subvector undef, (v4i32 VR128:$src), (iPTR 0)),
          (INSERT_SUBREG (v8i32 (IMPLICIT_DEF)), VR128:$src, sub_xmm)>;
def : Pat<(insert_subvector undef, (v4f32 VR128:$src), (iPTR 0)),
          (INSERT_SUBREG (v8f32 (IMPLICIT_DEF)), VR128:$src, sub_xmm)>;
def : Pat<(insert_subvector undef, (v8i16 VR128:$src), (iPTR 0)),
          (INSERT_SUBREG (v16i16 (IMPLICIT_DEF)), VR128:$src, sub_xmm)>;
def : Pat<(insert_subvector undef, (v16i8 VR128:$src), (iPTR 0)),
          (INSERT_SUBREG (v32i8 (IMPLICIT_DEF)), VR128:$src, sub_xmm)>;
}
373
// Implicitly promote a 32-bit scalar to a vector: just a register-class
// change, no instruction needed.
def : Pat<(v4f32 (scalar_to_vector FR32:$src)),
          (COPY_TO_REGCLASS FR32:$src, VR128)>;
def : Pat<(v8f32 (scalar_to_vector FR32:$src)),
          (COPY_TO_REGCLASS FR32:$src, VR128)>;
// Implicitly promote a 64-bit scalar to a vector.
def : Pat<(v2f64 (scalar_to_vector FR64:$src)),
          (COPY_TO_REGCLASS FR64:$src, VR128)>;
def : Pat<(v4f64 (scalar_to_vector FR64:$src)),
          (COPY_TO_REGCLASS FR64:$src, VR128)>;
384
// Bitcasts between 128-bit vector types. Return the original type since
// no instruction is needed for the conversion. One pattern per (dst, src)
// type pair.
let Predicates = [HasSSE2] in {
  def : Pat<(v2i64 (bitconvert (v4i32 VR128:$src))), (v2i64 VR128:$src)>;
  def : Pat<(v2i64 (bitconvert (v8i16 VR128:$src))), (v2i64 VR128:$src)>;
  def : Pat<(v2i64 (bitconvert (v16i8 VR128:$src))), (v2i64 VR128:$src)>;
  def : Pat<(v2i64 (bitconvert (v2f64 VR128:$src))), (v2i64 VR128:$src)>;
  def : Pat<(v2i64 (bitconvert (v4f32 VR128:$src))), (v2i64 VR128:$src)>;
  def : Pat<(v4i32 (bitconvert (v2i64 VR128:$src))), (v4i32 VR128:$src)>;
  def : Pat<(v4i32 (bitconvert (v8i16 VR128:$src))), (v4i32 VR128:$src)>;
  def : Pat<(v4i32 (bitconvert (v16i8 VR128:$src))), (v4i32 VR128:$src)>;
  def : Pat<(v4i32 (bitconvert (v2f64 VR128:$src))), (v4i32 VR128:$src)>;
  def : Pat<(v4i32 (bitconvert (v4f32 VR128:$src))), (v4i32 VR128:$src)>;
  def : Pat<(v8i16 (bitconvert (v2i64 VR128:$src))), (v8i16 VR128:$src)>;
  def : Pat<(v8i16 (bitconvert (v4i32 VR128:$src))), (v8i16 VR128:$src)>;
  def : Pat<(v8i16 (bitconvert (v16i8 VR128:$src))), (v8i16 VR128:$src)>;
  def : Pat<(v8i16 (bitconvert (v2f64 VR128:$src))), (v8i16 VR128:$src)>;
  def : Pat<(v8i16 (bitconvert (v4f32 VR128:$src))), (v8i16 VR128:$src)>;
  def : Pat<(v16i8 (bitconvert (v2i64 VR128:$src))), (v16i8 VR128:$src)>;
  def : Pat<(v16i8 (bitconvert (v4i32 VR128:$src))), (v16i8 VR128:$src)>;
  def : Pat<(v16i8 (bitconvert (v8i16 VR128:$src))), (v16i8 VR128:$src)>;
  def : Pat<(v16i8 (bitconvert (v2f64 VR128:$src))), (v16i8 VR128:$src)>;
  def : Pat<(v16i8 (bitconvert (v4f32 VR128:$src))), (v16i8 VR128:$src)>;
  def : Pat<(v4f32 (bitconvert (v2i64 VR128:$src))), (v4f32 VR128:$src)>;
  def : Pat<(v4f32 (bitconvert (v4i32 VR128:$src))), (v4f32 VR128:$src)>;
  def : Pat<(v4f32 (bitconvert (v8i16 VR128:$src))), (v4f32 VR128:$src)>;
  def : Pat<(v4f32 (bitconvert (v16i8 VR128:$src))), (v4f32 VR128:$src)>;
  def : Pat<(v4f32 (bitconvert (v2f64 VR128:$src))), (v4f32 VR128:$src)>;
  def : Pat<(v2f64 (bitconvert (v2i64 VR128:$src))), (v2f64 VR128:$src)>;
  def : Pat<(v2f64 (bitconvert (v4i32 VR128:$src))), (v2f64 VR128:$src)>;
  def : Pat<(v2f64 (bitconvert (v8i16 VR128:$src))), (v2f64 VR128:$src)>;
  def : Pat<(v2f64 (bitconvert (v16i8 VR128:$src))), (v2f64 VR128:$src)>;
  def : Pat<(v2f64 (bitconvert (v4f32 VR128:$src))), (v2f64 VR128:$src)>;
}
419
// Bitcasts between 256-bit vector types. Return the original type since
// no instruction is needed for the conversion. One pattern per (dst, src)
// type pair.
let Predicates = [HasAVX] in {
  def : Pat<(v4f64  (bitconvert (v8f32 VR256:$src))),  (v4f64 VR256:$src)>;
  def : Pat<(v4f64  (bitconvert (v8i32 VR256:$src))),  (v4f64 VR256:$src)>;
  def : Pat<(v4f64  (bitconvert (v4i64 VR256:$src))),  (v4f64 VR256:$src)>;
  def : Pat<(v4f64  (bitconvert (v16i16 VR256:$src))), (v4f64 VR256:$src)>;
  def : Pat<(v4f64  (bitconvert (v32i8 VR256:$src))),  (v4f64 VR256:$src)>;
  def : Pat<(v8f32  (bitconvert (v8i32 VR256:$src))),  (v8f32 VR256:$src)>;
  def : Pat<(v8f32  (bitconvert (v4i64 VR256:$src))),  (v8f32 VR256:$src)>;
  def : Pat<(v8f32  (bitconvert (v4f64 VR256:$src))),  (v8f32 VR256:$src)>;
  def : Pat<(v8f32  (bitconvert (v32i8 VR256:$src))),  (v8f32 VR256:$src)>;
  def : Pat<(v8f32  (bitconvert (v16i16 VR256:$src))), (v8f32 VR256:$src)>;
  def : Pat<(v4i64  (bitconvert (v8f32 VR256:$src))),  (v4i64 VR256:$src)>;
  def : Pat<(v4i64  (bitconvert (v8i32 VR256:$src))),  (v4i64 VR256:$src)>;
  def : Pat<(v4i64  (bitconvert (v4f64 VR256:$src))),  (v4i64 VR256:$src)>;
  def : Pat<(v4i64  (bitconvert (v32i8 VR256:$src))),  (v4i64 VR256:$src)>;
  def : Pat<(v4i64  (bitconvert (v16i16 VR256:$src))), (v4i64 VR256:$src)>;
  def : Pat<(v32i8  (bitconvert (v4f64 VR256:$src))),  (v32i8 VR256:$src)>;
  def : Pat<(v32i8  (bitconvert (v4i64 VR256:$src))),  (v32i8 VR256:$src)>;
  def : Pat<(v32i8  (bitconvert (v8f32 VR256:$src))),  (v32i8 VR256:$src)>;
  def : Pat<(v32i8  (bitconvert (v8i32 VR256:$src))),  (v32i8 VR256:$src)>;
  def : Pat<(v32i8  (bitconvert (v16i16 VR256:$src))), (v32i8 VR256:$src)>;
  def : Pat<(v8i32  (bitconvert (v32i8 VR256:$src))),  (v8i32 VR256:$src)>;
  def : Pat<(v8i32  (bitconvert (v16i16 VR256:$src))), (v8i32 VR256:$src)>;
  def : Pat<(v8i32  (bitconvert (v8f32 VR256:$src))),  (v8i32 VR256:$src)>;
  def : Pat<(v8i32  (bitconvert (v4i64 VR256:$src))),  (v8i32 VR256:$src)>;
  def : Pat<(v8i32  (bitconvert (v4f64 VR256:$src))),  (v8i32 VR256:$src)>;
  def : Pat<(v16i16 (bitconvert (v8f32 VR256:$src))),  (v16i16 VR256:$src)>;
  def : Pat<(v16i16 (bitconvert (v8i32 VR256:$src))),  (v16i16 VR256:$src)>;
  def : Pat<(v16i16 (bitconvert (v4i64 VR256:$src))),  (v16i16 VR256:$src)>;
  def : Pat<(v16i16 (bitconvert (v4f64 VR256:$src))),  (v16i16 VR256:$src)>;
  def : Pat<(v16i16 (bitconvert (v32i8 VR256:$src))),  (v16i16 VR256:$src)>;
}
454
// Alias instructions that map fld0 to xorps for sse or vxorps for avx.
// These pseudos are expanded by ExpandPostRAPseudos.
let isReMaterializable = 1, isAsCheapAsAMove = 1, canFoldAsLoad = 1,
    isPseudo = 1, SchedRW = [WriteZero] in {
  // Scalar single-precision zero.
  def FsFLD0SS : I<0, Pseudo, (outs FR32:$dst), (ins), "",
                   [(set FR32:$dst, fp32imm0)]>, Requires<[HasSSE1]>;
  // Scalar double-precision zero.
  def FsFLD0SD : I<0, Pseudo, (outs FR64:$dst), (ins), "",
                   [(set FR64:$dst, fpimm0)]>, Requires<[HasSSE2]>;
}
464
465//===----------------------------------------------------------------------===//
466// AVX & SSE - Zero/One Vectors
467//===----------------------------------------------------------------------===//
468
// Alias instruction that maps zero vector to pxor / xorp* for sse.
// This is expanded by ExpandPostRAPseudos to an xorps / vxorps, and then
// swizzled by ExecutionDepsFix to pxor.
// We set canFoldAsLoad because this can be converted to a constant-pool
// load of an all-zeros value if folding it would be beneficial.
let isReMaterializable = 1, isAsCheapAsAMove = 1, canFoldAsLoad = 1,
    isPseudo = 1, SchedRW = [WriteZero] in {
def V_SET0 : I<0, Pseudo, (outs VR128:$dst), (ins), "",
               [(set VR128:$dst, (v4f32 immAllZerosV))]>;
}

// Map every other 128-bit all-zeros type onto the same pseudo.
def : Pat<(v2f64 immAllZerosV), (V_SET0)>;
def : Pat<(v4i32 immAllZerosV), (V_SET0)>;
def : Pat<(v2i64 immAllZerosV), (V_SET0)>;
def : Pat<(v8i16 immAllZerosV), (V_SET0)>;
def : Pat<(v16i8 immAllZerosV), (V_SET0)>;
485
486
// The same as done above but for AVX. The 256-bit AVX1 ISA doesn't support PI,
// and doesn't need it because on sandy bridge the register is set to zero
// at the rename stage without using any execution unit, so SET0PSY
// and SET0PDY can be used for vector int instructions without penalty.
let isReMaterializable = 1, isAsCheapAsAMove = 1, canFoldAsLoad = 1,
    isPseudo = 1, Predicates = [HasAVX], SchedRW = [WriteZero] in {
def AVX_SET0 : I<0, Pseudo, (outs VR256:$dst), (ins), "",
                 [(set VR256:$dst, (v8f32 immAllZerosV))]>;
}

// FP double variant needs only AVX1.
let Predicates = [HasAVX] in
  def : Pat<(v4f64 immAllZerosV), (AVX_SET0)>;

// 256-bit integer zero vectors require AVX2.
let Predicates = [HasAVX2] in {
  def : Pat<(v4i64 immAllZerosV), (AVX_SET0)>;
  def : Pat<(v8i32 immAllZerosV), (AVX_SET0)>;
  def : Pat<(v16i16 immAllZerosV), (AVX_SET0)>;
  def : Pat<(v32i8 immAllZerosV), (AVX_SET0)>;
}
506
// AVX1 has no support for 256-bit integer instructions, but since the 128-bit
// VPXOR instruction writes zero to its upper part, it's safe to build zeros
// via a 128-bit V_SET0 inserted into the low lane with SUBREG_TO_REG.
let Predicates = [HasAVX1Only] in {
def : Pat<(v32i8 immAllZerosV), (SUBREG_TO_REG (i8 0), (V_SET0), sub_xmm)>;
def : Pat<(bc_v32i8 (v8f32 immAllZerosV)),
          (SUBREG_TO_REG (i8 0), (V_SET0), sub_xmm)>;

def : Pat<(v16i16 immAllZerosV), (SUBREG_TO_REG (i16 0), (V_SET0), sub_xmm)>;
def : Pat<(bc_v16i16 (v8f32 immAllZerosV)),
          (SUBREG_TO_REG (i16 0), (V_SET0), sub_xmm)>;

def : Pat<(v8i32 immAllZerosV), (SUBREG_TO_REG (i32 0), (V_SET0), sub_xmm)>;
def : Pat<(bc_v8i32 (v8f32 immAllZerosV)),
          (SUBREG_TO_REG (i32 0), (V_SET0), sub_xmm)>;

def : Pat<(v4i64 immAllZerosV), (SUBREG_TO_REG (i64 0), (V_SET0), sub_xmm)>;
def : Pat<(bc_v4i64 (v8f32 immAllZerosV)),
          (SUBREG_TO_REG (i64 0), (V_SET0), sub_xmm)>;
}
526
// We set canFoldAsLoad because this can be converted to a constant-pool
// load of an all-ones value if folding it would be beneficial.
let isReMaterializable = 1, isAsCheapAsAMove = 1, canFoldAsLoad = 1,
    isPseudo = 1, SchedRW = [WriteZero] in {
  // 128-bit all-ones pseudo.
  def V_SETALLONES : I<0, Pseudo, (outs VR128:$dst), (ins), "",
                       [(set VR128:$dst, (v4i32 immAllOnesV))]>;
  // 256-bit all-ones pseudo (needs AVX2 integer support).
  let Predicates = [HasAVX2] in
  def AVX2_SETALLONES : I<0, Pseudo, (outs VR256:$dst), (ins), "",
                          [(set VR256:$dst, (v8i32 immAllOnesV))]>;
}
537
538
539//===----------------------------------------------------------------------===//
540// SSE 1 & 2 - Move FP Scalar Instructions
541//
542// Move Instructions. Register-to-register movss/movsd is not used for FR32/64
543// register copies because it's a partial register update; Register-to-register
544// movss/movsd is not modeled as an INSERT_SUBREG because INSERT_SUBREG requires
545// that the insert be implementable in terms of a copy, and just mentioned, we
546// don't use movss/movsd for copies.
547//===----------------------------------------------------------------------===//
548
// sse12_move_rr - register-to-register movss/movsd forms.
// The rr def merges the scalar $src2 into the low element of $src1 via
// OpNode; rr_REV is the store-direction encoding, emitted only for the
// disassembler.
multiclass sse12_move_rr<RegisterClass RC, SDNode OpNode, ValueType vt,
                         X86MemOperand x86memop, string base_opc,
                         string asm_opr, Domain d = GenericDomain> {
  def rr : SI<0x10, MRMSrcReg, (outs VR128:$dst),
              (ins VR128:$src1, RC:$src2),
              !strconcat(base_opc, asm_opr),
              [(set VR128:$dst, (vt (OpNode VR128:$src1,
                                 (scalar_to_vector RC:$src2))))],
              IIC_SSE_MOV_S_RR, d>, Sched<[WriteFShuffle]>;

  // For the disassembler: reversed (MRMDestReg) encoding, no patterns.
  let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0 in
  def rr_REV : SI<0x11, MRMDestReg, (outs VR128:$dst),
                  (ins VR128:$src1, RC:$src2),
                  !strconcat(base_opc, asm_opr),
                  [], IIC_SSE_MOV_S_RR>, Sched<[WriteFShuffle]>;
}
566
// sse12_move - instantiates both the AVX (VEX-encoded, three-operand) and
// legacy SSE (two-operand, tied $src1 = $dst) register moves plus their
// register-to-memory store forms.
multiclass sse12_move<RegisterClass RC, SDNode OpNode, ValueType vt,
                      X86MemOperand x86memop, string OpcodeStr,
                      Domain d = GenericDomain> {
  // AVX: three-operand asm string, VEX encoding.
  defm V#NAME : sse12_move_rr<RC, OpNode, vt, x86memop, OpcodeStr,
                              "\t{$src2, $src1, $dst|$dst, $src1, $src2}", d>,
                              VEX_4V, VEX_LIG;

  // AVX store form.
  def V#NAME#mr : SI<0x11, MRMDestMem, (outs), (ins x86memop:$dst, RC:$src),
                     !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
                     [(store RC:$src, addr:$dst)], IIC_SSE_MOV_S_MR, d>,
                     VEX, VEX_LIG, Sched<[WriteStore]>;
  // SSE1 & 2: destructive two-operand form.
  let Constraints = "$src1 = $dst" in {
    defm NAME : sse12_move_rr<RC, OpNode, vt, x86memop, OpcodeStr,
                              "\t{$src2, $dst|$dst, $src2}", d>;
  }

  // SSE store form.
  def NAME#mr   : SI<0x11, MRMDestMem, (outs), (ins x86memop:$dst, RC:$src),
                     !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
                     [(store RC:$src, addr:$dst)], IIC_SSE_MOV_S_MR, d>,
                  Sched<[WriteStore]>;
}
590
// Loading from memory automatically zeroing upper bits.
// Instantiates the AVX (VEX) and legacy SSE load forms of movss/movsd.
multiclass sse12_move_rm<RegisterClass RC, X86MemOperand x86memop,
                         PatFrag mem_pat, string OpcodeStr,
                         Domain d = GenericDomain> {
  def V#NAME#rm : SI<0x10, MRMSrcMem, (outs RC:$dst), (ins x86memop:$src),
                     !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
                     [(set RC:$dst, (mem_pat addr:$src))],
                     IIC_SSE_MOV_S_RM, d>, VEX, VEX_LIG, Sched<[WriteLoad]>;
  def NAME#rm   : SI<0x10, MRMSrcMem, (outs RC:$dst), (ins x86memop:$src),
                     !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
                     [(set RC:$dst, (mem_pat addr:$src))],
                     IIC_SSE_MOV_S_RM, d>, Sched<[WriteLoad]>;
}
604
// Instantiate the scalar move instructions: MOVSS (single, XS prefix) and
// MOVSD (double, XD prefix).
defm MOVSS : sse12_move<FR32, X86Movss, v4f32, f32mem, "movss",
                        SSEPackedSingle>, XS;
defm MOVSD : sse12_move<FR64, X86Movsd, v2f64, f64mem, "movsd",
                        SSEPackedDouble>, XD;

// Load forms; rematerializable and foldable as loads.
let canFoldAsLoad = 1, isReMaterializable = 1 in {
  defm MOVSS : sse12_move_rm<FR32, f32mem, loadf32, "movss",
                             SSEPackedSingle>, XS;

  let AddedComplexity = 20 in
    defm MOVSD : sse12_move_rm<FR64, f64mem, loadf64, "movsd",
                               SSEPackedDouble>, XD;
}
618
// Patterns selecting the VEX-encoded scalar moves when AVX is in use.
let Predicates = [UseAVX] in {
  let AddedComplexity = 20 in {
  // MOVSSrm zeros the high parts of the register; represent this
  // with SUBREG_TO_REG. The AVX versions also write: DST[255:128] <- 0
  def : Pat<(v4f32 (X86vzmovl (v4f32 (scalar_to_vector (loadf32 addr:$src))))),
            (COPY_TO_REGCLASS (VMOVSSrm addr:$src), VR128)>;
  def : Pat<(v4f32 (scalar_to_vector (loadf32 addr:$src))),
            (COPY_TO_REGCLASS (VMOVSSrm addr:$src), VR128)>;
  def : Pat<(v4f32 (X86vzmovl (loadv4f32 addr:$src))),
            (COPY_TO_REGCLASS (VMOVSSrm addr:$src), VR128)>;

  // MOVSDrm zeros the high parts of the register; represent this
  // with SUBREG_TO_REG. The AVX versions also write: DST[255:128] <- 0
  def : Pat<(v2f64 (X86vzmovl (v2f64 (scalar_to_vector (loadf64 addr:$src))))),
            (COPY_TO_REGCLASS (VMOVSDrm addr:$src), VR128)>;
  def : Pat<(v2f64 (scalar_to_vector (loadf64 addr:$src))),
            (COPY_TO_REGCLASS (VMOVSDrm addr:$src), VR128)>;
  def : Pat<(v2f64 (X86vzmovl (loadv2f64 addr:$src))),
            (COPY_TO_REGCLASS (VMOVSDrm addr:$src), VR128)>;
  def : Pat<(v2f64 (X86vzmovl (bc_v2f64 (loadv4f32 addr:$src)))),
            (COPY_TO_REGCLASS (VMOVSDrm addr:$src), VR128)>;
  def : Pat<(v2f64 (X86vzload addr:$src)),
            (COPY_TO_REGCLASS (VMOVSDrm addr:$src), VR128)>;

  // Represent the same patterns above but in the form they appear for
  // 256-bit types
  def : Pat<(v8f32 (X86vzmovl (insert_subvector undef,
                   (v4f32 (scalar_to_vector (loadf32 addr:$src))), (iPTR 0)))),
            (SUBREG_TO_REG (i32 0), (VMOVSSrm addr:$src), sub_xmm)>;
  def : Pat<(v4f64 (X86vzmovl (insert_subvector undef,
                   (v2f64 (scalar_to_vector (loadf64 addr:$src))), (iPTR 0)))),
            (SUBREG_TO_REG (i32 0), (VMOVSDrm addr:$src), sub_xmm)>;
  }

  // Extract and store: a store of the low element becomes a scalar store.
  def : Pat<(store (f32 (vector_extract (v4f32 VR128:$src), (iPTR 0))),
                   addr:$dst),
            (VMOVSSmr addr:$dst, (COPY_TO_REGCLASS (v4f32 VR128:$src), FR32))>;
  def : Pat<(store (f64 (vector_extract (v2f64 VR128:$src), (iPTR 0))),
                   addr:$dst),
            (VMOVSDmr addr:$dst, (COPY_TO_REGCLASS (v2f64 VR128:$src), FR64))>;

  // Shuffle with VMOVSS
  def : Pat<(v4i32 (X86Movss VR128:$src1, VR128:$src2)),
            (VMOVSSrr (v4i32 VR128:$src1),
                      (COPY_TO_REGCLASS (v4i32 VR128:$src2), FR32))>;
  def : Pat<(v4f32 (X86Movss VR128:$src1, VR128:$src2)),
            (VMOVSSrr (v4f32 VR128:$src1),
                      (COPY_TO_REGCLASS (v4f32 VR128:$src2), FR32))>;

  // 256-bit variants: operate on the low xmm lanes and reinsert.
  def : Pat<(v8i32 (X86Movss VR256:$src1, VR256:$src2)),
            (SUBREG_TO_REG (i32 0),
              (VMOVSSrr (EXTRACT_SUBREG (v8i32 VR256:$src1), sub_xmm),
                        (EXTRACT_SUBREG (v8i32 VR256:$src2), sub_xmm)),
              sub_xmm)>;
  def : Pat<(v8f32 (X86Movss VR256:$src1, VR256:$src2)),
            (SUBREG_TO_REG (i32 0),
              (VMOVSSrr (EXTRACT_SUBREG (v8f32 VR256:$src1), sub_xmm),
                        (EXTRACT_SUBREG (v8f32 VR256:$src2), sub_xmm)),
              sub_xmm)>;

  // Shuffle with VMOVSD
  def : Pat<(v2i64 (X86Movsd VR128:$src1, VR128:$src2)),
            (VMOVSDrr VR128:$src1, (COPY_TO_REGCLASS VR128:$src2, FR64))>;
  def : Pat<(v2f64 (X86Movsd VR128:$src1, VR128:$src2)),
            (VMOVSDrr VR128:$src1, (COPY_TO_REGCLASS VR128:$src2, FR64))>;
  def : Pat<(v4f32 (X86Movsd VR128:$src1, VR128:$src2)),
            (VMOVSDrr VR128:$src1, (COPY_TO_REGCLASS VR128:$src2, FR64))>;
  def : Pat<(v4i32 (X86Movsd VR128:$src1, VR128:$src2)),
            (VMOVSDrr VR128:$src1, (COPY_TO_REGCLASS VR128:$src2, FR64))>;

  // 256-bit variants
  def : Pat<(v4i64 (X86Movsd VR256:$src1, VR256:$src2)),
            (SUBREG_TO_REG (i32 0),
              (VMOVSDrr (EXTRACT_SUBREG (v4i64 VR256:$src1), sub_xmm),
                        (EXTRACT_SUBREG (v4i64 VR256:$src2), sub_xmm)),
              sub_xmm)>;
  def : Pat<(v4f64 (X86Movsd VR256:$src1, VR256:$src2)),
            (SUBREG_TO_REG (i32 0),
              (VMOVSDrr (EXTRACT_SUBREG (v4f64 VR256:$src1), sub_xmm),
                        (EXTRACT_SUBREG (v4f64 VR256:$src2), sub_xmm)),
              sub_xmm)>;

  // FIXME: Instead of a X86Movlps there should be a X86Movsd here, the problem
  // is during lowering, where it's not possible to recognize the fold cause
  // it has two uses through a bitcast. One use disappears at isel time and the
  // fold opportunity reappears.
  def : Pat<(v2f64 (X86Movlpd VR128:$src1, VR128:$src2)),
            (VMOVSDrr VR128:$src1, (COPY_TO_REGCLASS VR128:$src2, FR64))>;
  def : Pat<(v2i64 (X86Movlpd VR128:$src1, VR128:$src2)),
            (VMOVSDrr VR128:$src1, (COPY_TO_REGCLASS VR128:$src2, FR64))>;
  def : Pat<(v4f32 (X86Movlps VR128:$src1, VR128:$src2)),
            (VMOVSDrr VR128:$src1, (COPY_TO_REGCLASS VR128:$src2, FR64))>;
  def : Pat<(v4i32 (X86Movlps VR128:$src1, VR128:$src2)),
            (VMOVSDrr VR128:$src1, (COPY_TO_REGCLASS VR128:$src2, FR64))>;
}
717
let Predicates = [UseSSE1] in {
  // NOTE: a nested 'let Predicates = [NoSSE41]' here used to *replace* the
  // outer [UseSSE1] guard — TableGen 'let' overrides the field, it does not
  // append to it — so these patterns lost their SSE1 requirement. List both
  // predicates explicitly: apply only with SSE1 in use and no SSE4.1
  // (SSE4.1 provides better insert/blend alternatives).
  let Predicates = [UseSSE1, NoSSE41], AddedComplexity = 15 in {
  // Move scalar to XMM zero-extended, zeroing a VR128 then do a
  // MOVSS to the lower bits.
  def : Pat<(v4f32 (X86vzmovl (v4f32 (scalar_to_vector FR32:$src)))),
            (MOVSSrr (v4f32 (V_SET0)), FR32:$src)>;
  def : Pat<(v4f32 (X86vzmovl (v4f32 VR128:$src))),
            (MOVSSrr (v4f32 (V_SET0)), (COPY_TO_REGCLASS VR128:$src, FR32))>;
  def : Pat<(v4i32 (X86vzmovl (v4i32 VR128:$src))),
            (MOVSSrr (v4i32 (V_SET0)), (COPY_TO_REGCLASS VR128:$src, FR32))>;
  }

  let AddedComplexity = 20 in {
  // MOVSSrm already zeros the high parts of the register.
  def : Pat<(v4f32 (X86vzmovl (v4f32 (scalar_to_vector (loadf32 addr:$src))))),
            (COPY_TO_REGCLASS (MOVSSrm addr:$src), VR128)>;
  def : Pat<(v4f32 (scalar_to_vector (loadf32 addr:$src))),
            (COPY_TO_REGCLASS (MOVSSrm addr:$src), VR128)>;
  def : Pat<(v4f32 (X86vzmovl (loadv4f32 addr:$src))),
            (COPY_TO_REGCLASS (MOVSSrm addr:$src), VR128)>;
  }

  // Extract and store the low f32 element.
  def : Pat<(store (f32 (vector_extract (v4f32 VR128:$src), (iPTR 0))),
                   addr:$dst),
            (MOVSSmr addr:$dst, (COPY_TO_REGCLASS VR128:$src, FR32))>;

  // Shuffle with MOVSS
  def : Pat<(v4i32 (X86Movss VR128:$src1, VR128:$src2)),
            (MOVSSrr VR128:$src1, (COPY_TO_REGCLASS VR128:$src2, FR32))>;
  def : Pat<(v4f32 (X86Movss VR128:$src1, VR128:$src2)),
            (MOVSSrr VR128:$src1, (COPY_TO_REGCLASS VR128:$src2, FR32))>;
}
751
let Predicates = [UseSSE2] in {
  // NOTE: a nested 'let Predicates = [NoSSE41]' here used to *replace* the
  // outer [UseSSE2] guard — TableGen 'let' overrides the field, it does not
  // append to it — so this pattern lost its SSE2 requirement. List both
  // predicates explicitly: apply only with SSE2 in use and no SSE4.1.
  let Predicates = [UseSSE2, NoSSE41], AddedComplexity = 15 in {
  // Move scalar to XMM zero-extended, zeroing a VR128 then do a
  // MOVSD to the lower bits.
  def : Pat<(v2f64 (X86vzmovl (v2f64 (scalar_to_vector FR64:$src)))),
            (MOVSDrr (v2f64 (V_SET0)), FR64:$src)>;
  }

  let AddedComplexity = 20 in {
  // MOVSDrm already zeros the high parts of the register.
  def : Pat<(v2f64 (X86vzmovl (v2f64 (scalar_to_vector (loadf64 addr:$src))))),
            (COPY_TO_REGCLASS (MOVSDrm addr:$src), VR128)>;
  def : Pat<(v2f64 (scalar_to_vector (loadf64 addr:$src))),
            (COPY_TO_REGCLASS (MOVSDrm addr:$src), VR128)>;
  def : Pat<(v2f64 (X86vzmovl (loadv2f64 addr:$src))),
            (COPY_TO_REGCLASS (MOVSDrm addr:$src), VR128)>;
  def : Pat<(v2f64 (X86vzmovl (bc_v2f64 (loadv4f32 addr:$src)))),
            (COPY_TO_REGCLASS (MOVSDrm addr:$src), VR128)>;
  def : Pat<(v2f64 (X86vzload addr:$src)),
            (COPY_TO_REGCLASS (MOVSDrm addr:$src), VR128)>;
  }

  // Extract and store the low f64 element.
  def : Pat<(store (f64 (vector_extract (v2f64 VR128:$src), (iPTR 0))),
                   addr:$dst),
            (MOVSDmr addr:$dst, (COPY_TO_REGCLASS VR128:$src, FR64))>;

  // Shuffle with MOVSD
  def : Pat<(v2i64 (X86Movsd VR128:$src1, VR128:$src2)),
            (MOVSDrr VR128:$src1, (COPY_TO_REGCLASS VR128:$src2, FR64))>;
  def : Pat<(v2f64 (X86Movsd VR128:$src1, VR128:$src2)),
            (MOVSDrr VR128:$src1, (COPY_TO_REGCLASS VR128:$src2, FR64))>;
  def : Pat<(v4f32 (X86Movsd VR128:$src1, VR128:$src2)),
            (MOVSDrr VR128:$src1, (COPY_TO_REGCLASS VR128:$src2, FR64))>;
  def : Pat<(v4i32 (X86Movsd VR128:$src1, VR128:$src2)),
            (MOVSDrr VR128:$src1, (COPY_TO_REGCLASS VR128:$src2, FR64))>;

  // FIXME: Instead of a X86Movlps there should be a X86Movsd here, the problem
  // is during lowering, where it's not possible to recognize the fold because
  // it has two uses through a bitcast. One use disappears at isel time and the
  // fold opportunity reappears.
  def : Pat<(v2f64 (X86Movlpd VR128:$src1, VR128:$src2)),
            (MOVSDrr VR128:$src1, (COPY_TO_REGCLASS VR128:$src2, FR64))>;
  def : Pat<(v2i64 (X86Movlpd VR128:$src1, VR128:$src2)),
            (MOVSDrr VR128:$src1, (COPY_TO_REGCLASS VR128:$src2, FR64))>;
  def : Pat<(v4f32 (X86Movlps VR128:$src1, VR128:$src2)),
            (MOVSDrr VR128:$src1, (COPY_TO_REGCLASS VR128:$src2, FR64))>;
  def : Pat<(v4i32 (X86Movlps VR128:$src1, VR128:$src2)),
            (MOVSDrr VR128:$src1, (COPY_TO_REGCLASS VR128:$src2, FR64))>;
}
802
803//===----------------------------------------------------------------------===//
804// SSE 1 & 2 - Move Aligned/Unaligned FP Instructions
805//===----------------------------------------------------------------------===//
806
// Full-width (128/256-bit) packed-FP move: one register-to-register form
// ("rr") and one load form ("rm"). The load form can be folded as a load and,
// unless IsReMaterializable is cleared, rematerialized by the register
// allocator.
multiclass sse12_mov_packed<bits<8> opc, RegisterClass RC,
                            X86MemOperand x86memop, PatFrag ld_frag,
                            string asm, Domain d,
                            OpndItins itins,
                            bit IsReMaterializable = 1> {
// Pure register copy; no pattern, so explicitly mark it side-effect free.
let hasSideEffects = 0 in
  def rr : PI<opc, MRMSrcReg, (outs RC:$dst), (ins RC:$src),
              !strconcat(asm, "\t{$src, $dst|$dst, $src}"), [], itins.rr, d>,
           Sched<[WriteFShuffle]>;
let canFoldAsLoad = 1, isReMaterializable = IsReMaterializable in
  def rm : PI<opc, MRMSrcMem, (outs RC:$dst), (ins x86memop:$src),
              !strconcat(asm, "\t{$src, $dst|$dst, $src}"),
                   [(set RC:$dst, (ld_frag addr:$src))], itins.rm, d>,
           Sched<[WriteLoad]>;
}
822
// AVX aligned (movaps/movapd) and unaligned (movups/movupd) packed moves,
// 128-bit (xmm) and 256-bit ("Y", VEX_L, ymm) variants. The trailing '0' on
// the movupd variants clears IsReMaterializable for them.
let Predicates = [HasAVX, NoVLX] in {
defm VMOVAPS : sse12_mov_packed<0x28, VR128, f128mem, alignedloadv4f32,
                              "movaps", SSEPackedSingle, SSE_MOVA_ITINS>,
                              PS, VEX;
defm VMOVAPD : sse12_mov_packed<0x28, VR128, f128mem, alignedloadv2f64,
                              "movapd", SSEPackedDouble, SSE_MOVA_ITINS>,
                              PD, VEX;
defm VMOVUPS : sse12_mov_packed<0x10, VR128, f128mem, loadv4f32,
                              "movups", SSEPackedSingle, SSE_MOVU_ITINS>,
                              PS, VEX;
defm VMOVUPD : sse12_mov_packed<0x10, VR128, f128mem, loadv2f64,
                              "movupd", SSEPackedDouble, SSE_MOVU_ITINS, 0>,
                              PD, VEX;

defm VMOVAPSY : sse12_mov_packed<0x28, VR256, f256mem, alignedloadv8f32,
                              "movaps", SSEPackedSingle, SSE_MOVA_ITINS>,
                              PS, VEX, VEX_L;
defm VMOVAPDY : sse12_mov_packed<0x28, VR256, f256mem, alignedloadv4f64,
                              "movapd", SSEPackedDouble, SSE_MOVA_ITINS>,
                              PD, VEX, VEX_L;
defm VMOVUPSY : sse12_mov_packed<0x10, VR256, f256mem, loadv8f32,
                              "movups", SSEPackedSingle, SSE_MOVU_ITINS>,
                              PS, VEX, VEX_L;
defm VMOVUPDY : sse12_mov_packed<0x10, VR256, f256mem, loadv4f64,
                              "movupd", SSEPackedDouble, SSE_MOVU_ITINS, 0>,
                              PD, VEX, VEX_L;
}
850
// Non-VEX SSE1 forms of the packed single-precision moves.
let Predicates = [UseSSE1] in {
defm MOVAPS : sse12_mov_packed<0x28, VR128, f128mem, alignedloadv4f32,
                              "movaps", SSEPackedSingle, SSE_MOVA_ITINS>,
                              PS;
defm MOVUPS : sse12_mov_packed<0x10, VR128, f128mem, loadv4f32,
                              "movups", SSEPackedSingle, SSE_MOVU_ITINS>,
                              PS;
}
// Non-VEX SSE2 forms of the packed double-precision moves. As with VMOVUPD,
// movupd's load form is marked non-rematerializable (trailing 0).
let Predicates = [UseSSE2] in {
defm MOVAPD : sse12_mov_packed<0x28, VR128, f128mem, alignedloadv2f64,
                              "movapd", SSEPackedDouble, SSE_MOVA_ITINS>,
                              PD;
defm MOVUPD : sse12_mov_packed<0x10, VR128, f128mem, loadv2f64,
                              "movupd", SSEPackedDouble, SSE_MOVU_ITINS, 0>,
                              PD;
}
867
// Store (register-to-memory) forms of the AVX packed moves; opcodes 0x29
// (aligned) and 0x11 (unaligned) are the MRMDestMem counterparts of the
// 0x28/0x10 loads above.
let SchedRW = [WriteStore], Predicates = [HasAVX, NoVLX]  in {
def VMOVAPSmr : VPSI<0x29, MRMDestMem, (outs), (ins f128mem:$dst, VR128:$src),
                   "movaps\t{$src, $dst|$dst, $src}",
                   [(alignedstore (v4f32 VR128:$src), addr:$dst)],
                   IIC_SSE_MOVA_P_MR>, VEX;
def VMOVAPDmr : VPDI<0x29, MRMDestMem, (outs), (ins f128mem:$dst, VR128:$src),
                   "movapd\t{$src, $dst|$dst, $src}",
                   [(alignedstore (v2f64 VR128:$src), addr:$dst)],
                   IIC_SSE_MOVA_P_MR>, VEX;
def VMOVUPSmr : VPSI<0x11, MRMDestMem, (outs), (ins f128mem:$dst, VR128:$src),
                   "movups\t{$src, $dst|$dst, $src}",
                   [(store (v4f32 VR128:$src), addr:$dst)],
                   IIC_SSE_MOVU_P_MR>, VEX;
def VMOVUPDmr : VPDI<0x11, MRMDestMem, (outs), (ins f128mem:$dst, VR128:$src),
                   "movupd\t{$src, $dst|$dst, $src}",
                   [(store (v2f64 VR128:$src), addr:$dst)],
                   IIC_SSE_MOVU_P_MR>, VEX;
def VMOVAPSYmr : VPSI<0x29, MRMDestMem, (outs), (ins f256mem:$dst, VR256:$src),
                   "movaps\t{$src, $dst|$dst, $src}",
                   [(alignedstore256 (v8f32 VR256:$src), addr:$dst)],
                   IIC_SSE_MOVA_P_MR>, VEX, VEX_L;
def VMOVAPDYmr : VPDI<0x29, MRMDestMem, (outs), (ins f256mem:$dst, VR256:$src),
                   "movapd\t{$src, $dst|$dst, $src}",
                   [(alignedstore256 (v4f64 VR256:$src), addr:$dst)],
                   IIC_SSE_MOVA_P_MR>, VEX, VEX_L;
def VMOVUPSYmr : VPSI<0x11, MRMDestMem, (outs), (ins f256mem:$dst, VR256:$src),
                   "movups\t{$src, $dst|$dst, $src}",
                   [(store (v8f32 VR256:$src), addr:$dst)],
                   IIC_SSE_MOVU_P_MR>, VEX, VEX_L;
def VMOVUPDYmr : VPDI<0x11, MRMDestMem, (outs), (ins f256mem:$dst, VR256:$src),
                   "movupd\t{$src, $dst|$dst, $src}",
                   [(store (v4f64 VR256:$src), addr:$dst)],
                   IIC_SSE_MOVU_P_MR>, VEX, VEX_L;
} // SchedRW
902
903// For disassembler
let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0,
    SchedRW = [WriteFShuffle] in {
  // Reversed-operand ("_REV") register forms of the store opcodes (0x29/0x11)
  // so the disassembler can decode them; never selected by codegen
  // (isCodeGenOnly, no patterns).
  def VMOVAPSrr_REV : VPSI<0x29, MRMDestReg, (outs VR128:$dst),
                          (ins VR128:$src),
                          "movaps\t{$src, $dst|$dst, $src}", [],
                          IIC_SSE_MOVA_P_RR>, VEX;
  def VMOVAPDrr_REV : VPDI<0x29, MRMDestReg, (outs VR128:$dst),
                           (ins VR128:$src),
                           "movapd\t{$src, $dst|$dst, $src}", [],
                           IIC_SSE_MOVA_P_RR>, VEX;
  def VMOVUPSrr_REV : VPSI<0x11, MRMDestReg, (outs VR128:$dst),
                           (ins VR128:$src),
                           "movups\t{$src, $dst|$dst, $src}", [],
                           IIC_SSE_MOVU_P_RR>, VEX;
  def VMOVUPDrr_REV : VPDI<0x11, MRMDestReg, (outs VR128:$dst),
                           (ins VR128:$src),
                           "movupd\t{$src, $dst|$dst, $src}", [],
                           IIC_SSE_MOVU_P_RR>, VEX;
  def VMOVAPSYrr_REV : VPSI<0x29, MRMDestReg, (outs VR256:$dst),
                            (ins VR256:$src),
                            "movaps\t{$src, $dst|$dst, $src}", [],
                            IIC_SSE_MOVA_P_RR>, VEX, VEX_L;
  def VMOVAPDYrr_REV : VPDI<0x29, MRMDestReg, (outs VR256:$dst),
                            (ins VR256:$src),
                            "movapd\t{$src, $dst|$dst, $src}", [],
                            IIC_SSE_MOVA_P_RR>, VEX, VEX_L;
  def VMOVUPSYrr_REV : VPSI<0x11, MRMDestReg, (outs VR256:$dst),
                            (ins VR256:$src),
                            "movups\t{$src, $dst|$dst, $src}", [],
                            IIC_SSE_MOVU_P_RR>, VEX, VEX_L;
  def VMOVUPDYrr_REV : VPDI<0x11, MRMDestReg, (outs VR256:$dst),
                            (ins VR256:$src),
                            "movupd\t{$src, $dst|$dst, $src}", [],
                            IIC_SSE_MOVU_P_RR>, VEX, VEX_L;
}
939
// Zero-extend a 128-bit vector into the low half of a 256-bit register:
// select a plain VMOVAPSrr (the VEX move implicitly zeros the upper ymm bits)
// wrapped in SUBREG_TO_REG. VMOVAPSrr is used for every element type here,
// presumably because the copy is domain-agnostic — the domain-fixup pass can
// adjust it (NOTE(review): confirm).
let Predicates = [HasAVX] in {
def : Pat<(v8i32 (X86vzmovl
                  (insert_subvector undef, (v4i32 VR128:$src), (iPTR 0)))),
          (SUBREG_TO_REG (i32 0), (VMOVAPSrr VR128:$src), sub_xmm)>;
def : Pat<(v4i64 (X86vzmovl
                  (insert_subvector undef, (v2i64 VR128:$src), (iPTR 0)))),
          (SUBREG_TO_REG (i32 0), (VMOVAPSrr VR128:$src), sub_xmm)>;
def : Pat<(v8f32 (X86vzmovl
                  (insert_subvector undef, (v4f32 VR128:$src), (iPTR 0)))),
          (SUBREG_TO_REG (i32 0), (VMOVAPSrr VR128:$src), sub_xmm)>;
def : Pat<(v4f64 (X86vzmovl
                  (insert_subvector undef, (v2f64 VR128:$src), (iPTR 0)))),
          (SUBREG_TO_REG (i32 0), (VMOVAPSrr VR128:$src), sub_xmm)>;
}
954
955
// Lower the 256-bit unaligned-store intrinsics directly to the VMOVUPS/VMOVUPD
// store instructions.
def : Pat<(int_x86_avx_storeu_ps_256 addr:$dst, VR256:$src),
          (VMOVUPSYmr addr:$dst, VR256:$src)>;
def : Pat<(int_x86_avx_storeu_pd_256 addr:$dst, VR256:$src),
          (VMOVUPDYmr addr:$dst, VR256:$src)>;
960
// Non-VEX store forms of the SSE1/SSE2 packed moves (aligned 0x29,
// unaligned 0x11).
let SchedRW = [WriteStore] in {
def MOVAPSmr : PSI<0x29, MRMDestMem, (outs), (ins f128mem:$dst, VR128:$src),
                   "movaps\t{$src, $dst|$dst, $src}",
                   [(alignedstore (v4f32 VR128:$src), addr:$dst)],
                   IIC_SSE_MOVA_P_MR>;
def MOVAPDmr : PDI<0x29, MRMDestMem, (outs), (ins f128mem:$dst, VR128:$src),
                   "movapd\t{$src, $dst|$dst, $src}",
                   [(alignedstore (v2f64 VR128:$src), addr:$dst)],
                   IIC_SSE_MOVA_P_MR>;
def MOVUPSmr : PSI<0x11, MRMDestMem, (outs), (ins f128mem:$dst, VR128:$src),
                   "movups\t{$src, $dst|$dst, $src}",
                   [(store (v4f32 VR128:$src), addr:$dst)],
                   IIC_SSE_MOVU_P_MR>;
def MOVUPDmr : PDI<0x11, MRMDestMem, (outs), (ins f128mem:$dst, VR128:$src),
                   "movupd\t{$src, $dst|$dst, $src}",
                   [(store (v2f64 VR128:$src), addr:$dst)],
                   IIC_SSE_MOVU_P_MR>;
} // SchedRW
979
980// For disassembler
let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0,
    SchedRW = [WriteFShuffle] in {
  // Reversed-operand register forms of the non-VEX store opcodes, for the
  // disassembler only (isCodeGenOnly, no patterns).
  def MOVAPSrr_REV : PSI<0x29, MRMDestReg, (outs VR128:$dst), (ins VR128:$src),
                         "movaps\t{$src, $dst|$dst, $src}", [],
                         IIC_SSE_MOVA_P_RR>;
  def MOVAPDrr_REV : PDI<0x29, MRMDestReg, (outs VR128:$dst), (ins VR128:$src),
                         "movapd\t{$src, $dst|$dst, $src}", [],
                         IIC_SSE_MOVA_P_RR>;
  def MOVUPSrr_REV : PSI<0x11, MRMDestReg, (outs VR128:$dst), (ins VR128:$src),
                         "movups\t{$src, $dst|$dst, $src}", [],
                         IIC_SSE_MOVU_P_RR>;
  def MOVUPDrr_REV : PDI<0x11, MRMDestReg, (outs VR128:$dst), (ins VR128:$src),
                         "movupd\t{$src, $dst|$dst, $src}", [],
                         IIC_SSE_MOVU_P_RR>;
}
996
// 128-bit unaligned-store intrinsics: use the VEX-encoded stores under AVX.
let Predicates = [HasAVX] in {
  def : Pat<(int_x86_sse_storeu_ps addr:$dst, VR128:$src),
            (VMOVUPSmr addr:$dst, VR128:$src)>;
  def : Pat<(int_x86_sse2_storeu_pd addr:$dst, VR128:$src),
            (VMOVUPDmr addr:$dst, VR128:$src)>;
}
1003
// 128-bit unaligned-store intrinsics: non-VEX stores when AVX is not used.
let Predicates = [UseSSE1] in
  def : Pat<(int_x86_sse_storeu_ps addr:$dst, VR128:$src),
            (MOVUPSmr addr:$dst, VR128:$src)>;
let Predicates = [UseSSE2] in
  def : Pat<(int_x86_sse2_storeu_pd addr:$dst, VR128:$src),
            (MOVUPDmr addr:$dst, VR128:$src)>;
1010
1011// Use vmovaps/vmovups for AVX integer load/store.
let Predicates = [HasAVX, NoVLX] in {
  // 128-bit load/store
  def : Pat<(alignedloadv2i64 addr:$src),
            (VMOVAPSrm addr:$src)>;
  def : Pat<(loadv2i64 addr:$src),
            (VMOVUPSrm addr:$src)>;

  // Integer stores of every 128-bit element type also go through
  // vmovaps/vmovups.
  def : Pat<(alignedstore (v2i64 VR128:$src), addr:$dst),
            (VMOVAPSmr addr:$dst, VR128:$src)>;
  def : Pat<(alignedstore (v4i32 VR128:$src), addr:$dst),
            (VMOVAPSmr addr:$dst, VR128:$src)>;
  def : Pat<(alignedstore (v8i16 VR128:$src), addr:$dst),
            (VMOVAPSmr addr:$dst, VR128:$src)>;
  def : Pat<(alignedstore (v16i8 VR128:$src), addr:$dst),
            (VMOVAPSmr addr:$dst, VR128:$src)>;
  def : Pat<(store (v2i64 VR128:$src), addr:$dst),
            (VMOVUPSmr addr:$dst, VR128:$src)>;
  def : Pat<(store (v4i32 VR128:$src), addr:$dst),
            (VMOVUPSmr addr:$dst, VR128:$src)>;
  def : Pat<(store (v8i16 VR128:$src), addr:$dst),
            (VMOVUPSmr addr:$dst, VR128:$src)>;
  def : Pat<(store (v16i8 VR128:$src), addr:$dst),
            (VMOVUPSmr addr:$dst, VR128:$src)>;

  // 256-bit load/store
  def : Pat<(alignedloadv4i64 addr:$src),
            (VMOVAPSYrm addr:$src)>;
  def : Pat<(loadv4i64 addr:$src),
            (VMOVUPSYrm addr:$src)>;
  def : Pat<(alignedstore256 (v4i64 VR256:$src), addr:$dst),
            (VMOVAPSYmr addr:$dst, VR256:$src)>;
  def : Pat<(alignedstore256 (v8i32 VR256:$src), addr:$dst),
            (VMOVAPSYmr addr:$dst, VR256:$src)>;
  def : Pat<(alignedstore256 (v16i16 VR256:$src), addr:$dst),
            (VMOVAPSYmr addr:$dst, VR256:$src)>;
  def : Pat<(alignedstore256 (v32i8 VR256:$src), addr:$dst),
            (VMOVAPSYmr addr:$dst, VR256:$src)>;
  def : Pat<(store (v4i64 VR256:$src), addr:$dst),
            (VMOVUPSYmr addr:$dst, VR256:$src)>;
  def : Pat<(store (v8i32 VR256:$src), addr:$dst),
            (VMOVUPSYmr addr:$dst, VR256:$src)>;
  def : Pat<(store (v16i16 VR256:$src), addr:$dst),
            (VMOVUPSYmr addr:$dst, VR256:$src)>;
  def : Pat<(store (v32i8 VR256:$src), addr:$dst),
            (VMOVUPSYmr addr:$dst, VR256:$src)>;

  // Special patterns for storing subvector extracts of lower 128-bits
  // It's cheaper to just use VMOVAPS/VMOVUPS instead of VEXTRACTF128mr:
  // extracting subvector 0 is just a subregister copy, so store the xmm
  // subreg directly.
  def : Pat<(alignedstore (v2f64 (extract_subvector
                                  (v4f64 VR256:$src), (iPTR 0))), addr:$dst),
            (VMOVAPDmr addr:$dst, (v2f64 (EXTRACT_SUBREG VR256:$src,sub_xmm)))>;
  def : Pat<(alignedstore (v4f32 (extract_subvector
                                  (v8f32 VR256:$src), (iPTR 0))), addr:$dst),
            (VMOVAPSmr addr:$dst, (v4f32 (EXTRACT_SUBREG VR256:$src,sub_xmm)))>;
  def : Pat<(alignedstore (v2i64 (extract_subvector
                                  (v4i64 VR256:$src), (iPTR 0))), addr:$dst),
            (VMOVAPDmr addr:$dst, (v2i64 (EXTRACT_SUBREG VR256:$src,sub_xmm)))>;
  def : Pat<(alignedstore (v4i32 (extract_subvector
                                  (v8i32 VR256:$src), (iPTR 0))), addr:$dst),
            (VMOVAPSmr addr:$dst, (v4i32 (EXTRACT_SUBREG VR256:$src,sub_xmm)))>;
  def : Pat<(alignedstore (v8i16 (extract_subvector
                                  (v16i16 VR256:$src), (iPTR 0))), addr:$dst),
            (VMOVAPSmr addr:$dst, (v8i16 (EXTRACT_SUBREG VR256:$src,sub_xmm)))>;
  def : Pat<(alignedstore (v16i8 (extract_subvector
                                  (v32i8 VR256:$src), (iPTR 0))), addr:$dst),
            (VMOVAPSmr addr:$dst, (v16i8 (EXTRACT_SUBREG VR256:$src,sub_xmm)))>;

  // Unaligned versions of the subvector-extract stores above.
  def : Pat<(store (v2f64 (extract_subvector
                           (v4f64 VR256:$src), (iPTR 0))), addr:$dst),
            (VMOVUPDmr addr:$dst, (v2f64 (EXTRACT_SUBREG VR256:$src,sub_xmm)))>;
  def : Pat<(store (v4f32 (extract_subvector
                           (v8f32 VR256:$src), (iPTR 0))), addr:$dst),
            (VMOVUPSmr addr:$dst, (v4f32 (EXTRACT_SUBREG VR256:$src,sub_xmm)))>;
  def : Pat<(store (v2i64 (extract_subvector
                           (v4i64 VR256:$src), (iPTR 0))), addr:$dst),
            (VMOVUPDmr addr:$dst, (v2i64 (EXTRACT_SUBREG VR256:$src,sub_xmm)))>;
  def : Pat<(store (v4i32 (extract_subvector
                           (v8i32 VR256:$src), (iPTR 0))), addr:$dst),
            (VMOVUPSmr addr:$dst, (v4i32 (EXTRACT_SUBREG VR256:$src,sub_xmm)))>;
  def : Pat<(store (v8i16 (extract_subvector
                           (v16i16 VR256:$src), (iPTR 0))), addr:$dst),
            (VMOVUPSmr addr:$dst, (v8i16 (EXTRACT_SUBREG VR256:$src,sub_xmm)))>;
  def : Pat<(store (v16i8 (extract_subvector
                           (v32i8 VR256:$src), (iPTR 0))), addr:$dst),
            (VMOVUPSmr addr:$dst, (v16i8 (EXTRACT_SUBREG VR256:$src,sub_xmm)))>;
}
1098
1099// Use movaps / movups for SSE integer load / store (one byte shorter).
1100// The instructions selected below are then converted to MOVDQA/MOVDQU
1101// during the SSE domain pass.
let Predicates = [UseSSE1] in {
  // Integer 128-bit loads/stores selected as movaps/movups; the domain-fixup
  // pass converts them to MOVDQA/MOVDQU afterwards (see comment above).
  def : Pat<(alignedloadv2i64 addr:$src),
            (MOVAPSrm addr:$src)>;
  def : Pat<(loadv2i64 addr:$src),
            (MOVUPSrm addr:$src)>;

  def : Pat<(alignedstore (v2i64 VR128:$src), addr:$dst),
            (MOVAPSmr addr:$dst, VR128:$src)>;
  def : Pat<(alignedstore (v4i32 VR128:$src), addr:$dst),
            (MOVAPSmr addr:$dst, VR128:$src)>;
  def : Pat<(alignedstore (v8i16 VR128:$src), addr:$dst),
            (MOVAPSmr addr:$dst, VR128:$src)>;
  def : Pat<(alignedstore (v16i8 VR128:$src), addr:$dst),
            (MOVAPSmr addr:$dst, VR128:$src)>;
  def : Pat<(store (v2i64 VR128:$src), addr:$dst),
            (MOVUPSmr addr:$dst, VR128:$src)>;
  def : Pat<(store (v4i32 VR128:$src), addr:$dst),
            (MOVUPSmr addr:$dst, VR128:$src)>;
  def : Pat<(store (v8i16 VR128:$src), addr:$dst),
            (MOVUPSmr addr:$dst, VR128:$src)>;
  def : Pat<(store (v16i8 VR128:$src), addr:$dst),
            (MOVUPSmr addr:$dst, VR128:$src)>;
}
1125
1126// Alias instruction to load FR32 or FR64 from f128mem using movaps. Upper
1127// bits are disregarded. FIXME: Set encoding to pseudo!
// Scalar-register (FR32/FR64) views of the aligned 128-bit loads; only the
// low element is meaningful, the upper bits are ignored by consumers.
let canFoldAsLoad = 1, isReMaterializable = 1, SchedRW = [WriteLoad] in {
let isCodeGenOnly = 1 in {
  def FsVMOVAPSrm : VPSI<0x28, MRMSrcMem, (outs FR32:$dst), (ins f128mem:$src),
                         "movaps\t{$src, $dst|$dst, $src}",
                         [(set FR32:$dst, (alignedloadfsf32 addr:$src))],
                         IIC_SSE_MOVA_P_RM>, VEX;
  def FsVMOVAPDrm : VPDI<0x28, MRMSrcMem, (outs FR64:$dst), (ins f128mem:$src),
                         "movapd\t{$src, $dst|$dst, $src}",
                         [(set FR64:$dst, (alignedloadfsf64 addr:$src))],
                         IIC_SSE_MOVA_P_RM>, VEX;
  def FsMOVAPSrm : PSI<0x28, MRMSrcMem, (outs FR32:$dst), (ins f128mem:$src),
                       "movaps\t{$src, $dst|$dst, $src}",
                       [(set FR32:$dst, (alignedloadfsf32 addr:$src))],
                       IIC_SSE_MOVA_P_RM>;
  def FsMOVAPDrm : PDI<0x28, MRMSrcMem, (outs FR64:$dst), (ins f128mem:$src),
                       "movapd\t{$src, $dst|$dst, $src}",
                       [(set FR64:$dst, (alignedloadfsf64 addr:$src))],
                       IIC_SSE_MOVA_P_RM>;
}
}
1148
1149//===----------------------------------------------------------------------===//
1150// SSE 1 & 2 - Move Low packed FP Instructions
1151//===----------------------------------------------------------------------===//
1152
// Memory-to-low/high-half packed moves (movlps/movlpd, movhps/movhpd).
// Instantiates a "PSrm" (single, selected via psnode on a bitcast f64 load)
// and a "PDrm" (double, selected via pdnode on a scalar f64 load) form;
// the "s"/"d" suffix is appended to base_opc to build the mnemonic.
multiclass sse12_mov_hilo_packed_base<bits<8>opc, SDNode psnode, SDNode pdnode,
                                      string base_opc, string asm_opr,
                                      InstrItinClass itin> {
  def PSrm : PI<opc, MRMSrcMem,
         (outs VR128:$dst), (ins VR128:$src1, f64mem:$src2),
         !strconcat(base_opc, "s", asm_opr),
     [(set VR128:$dst,
       (psnode VR128:$src1,
              (bc_v4f32 (v2f64 (scalar_to_vector (loadf64 addr:$src2))))))],
              itin, SSEPackedSingle>, PS,
     Sched<[WriteFShuffleLd, ReadAfterLd]>;

  def PDrm : PI<opc, MRMSrcMem,
         (outs VR128:$dst), (ins VR128:$src1, f64mem:$src2),
         !strconcat(base_opc, "d", asm_opr),
     [(set VR128:$dst, (v2f64 (pdnode VR128:$src1,
                              (scalar_to_vector (loadf64 addr:$src2)))))],
              itin, SSEPackedDouble>, PD,
     Sched<[WriteFShuffleLd, ReadAfterLd]>;

}
1174
// Wraps sse12_mov_hilo_packed_base to produce both the VEX three-operand
// form (V-prefixed) and the legacy two-operand form (tied $src1 = $dst).
multiclass sse12_mov_hilo_packed<bits<8>opc, SDNode psnode, SDNode pdnode,
                                 string base_opc, InstrItinClass itin> {
  defm V#NAME : sse12_mov_hilo_packed_base<opc, psnode, pdnode, base_opc,
                                    "\t{$src2, $src1, $dst|$dst, $src1, $src2}",
                                    itin>, VEX_4V;

let Constraints = "$src1 = $dst" in
  defm NAME : sse12_mov_hilo_packed_base<opc, psnode, pdnode, base_opc,
                                    "\t{$src2, $dst|$dst, $src2}",
                                    itin>;
}
1186
// movlps/movlpd (+ VEX forms): load 64 bits into the low half of an xmm.
let AddedComplexity = 20 in {
  defm MOVL : sse12_mov_hilo_packed<0x12, X86Movlps, X86Movlpd, "movlp",
                                    IIC_SSE_MOV_LH>;
}
1191
// Store the low 64 bits of an xmm register (movlps/movlpd, opcode 0x13),
// matched as an extract of element 0 viewed as f64.
let SchedRW = [WriteStore] in {
def VMOVLPSmr : VPSI<0x13, MRMDestMem, (outs), (ins f64mem:$dst, VR128:$src),
                   "movlps\t{$src, $dst|$dst, $src}",
                   [(store (f64 (vector_extract (bc_v2f64 (v4f32 VR128:$src)),
                                 (iPTR 0))), addr:$dst)],
                                 IIC_SSE_MOV_LH>, VEX;
def VMOVLPDmr : VPDI<0x13, MRMDestMem, (outs), (ins f64mem:$dst, VR128:$src),
                   "movlpd\t{$src, $dst|$dst, $src}",
                   [(store (f64 (vector_extract (v2f64 VR128:$src),
                                 (iPTR 0))), addr:$dst)],
                                 IIC_SSE_MOV_LH>, VEX;
def MOVLPSmr : PSI<0x13, MRMDestMem, (outs), (ins f64mem:$dst, VR128:$src),
                   "movlps\t{$src, $dst|$dst, $src}",
                   [(store (f64 (vector_extract (bc_v2f64 (v4f32 VR128:$src)),
                                 (iPTR 0))), addr:$dst)],
                                 IIC_SSE_MOV_LH>;
def MOVLPDmr : PDI<0x13, MRMDestMem, (outs), (ins f64mem:$dst, VR128:$src),
                   "movlpd\t{$src, $dst|$dst, $src}",
                   [(store (f64 (vector_extract (v2f64 VR128:$src),
                                 (iPTR 0))), addr:$dst)],
                                 IIC_SSE_MOV_LH>;
} // SchedRW
1214
let Predicates = [HasAVX] in {
  // Shuffle with VMOVLPS
  def : Pat<(v4f32 (X86Movlps VR128:$src1, (load addr:$src2))),
            (VMOVLPSrm VR128:$src1, addr:$src2)>;
  def : Pat<(v4i32 (X86Movlps VR128:$src1, (load addr:$src2))),
            (VMOVLPSrm VR128:$src1, addr:$src2)>;

  // Shuffle with VMOVLPD
  def : Pat<(v2f64 (X86Movlpd VR128:$src1, (load addr:$src2))),
            (VMOVLPDrm VR128:$src1, addr:$src2)>;
  def : Pat<(v2i64 (X86Movlpd VR128:$src1, (load addr:$src2))),
            (VMOVLPDrm VR128:$src1, addr:$src2)>;
  // Movsd with a scalar f64 load is the same operation as movlpd: replace
  // the low element from memory.
  def : Pat<(v2f64 (X86Movsd VR128:$src1,
                             (v2f64 (scalar_to_vector (loadf64 addr:$src2))))),
            (VMOVLPDrm VR128:$src1, addr:$src2)>;

  // Store patterns: a (store (movlp (load p), v), p) round-trip only changes
  // the low 64 bits at p, so a plain low-half store suffices.
  def : Pat<(store (v4f32 (X86Movlps (load addr:$src1), VR128:$src2)),
                   addr:$src1),
            (VMOVLPSmr addr:$src1, VR128:$src2)>;
  def : Pat<(store (v4i32 (X86Movlps
                   (bc_v4i32 (loadv2i64 addr:$src1)), VR128:$src2)), addr:$src1),
            (VMOVLPSmr addr:$src1, VR128:$src2)>;
  def : Pat<(store (v2f64 (X86Movlpd (load addr:$src1), VR128:$src2)),
                   addr:$src1),
            (VMOVLPDmr addr:$src1, VR128:$src2)>;
  def : Pat<(store (v2i64 (X86Movlpd (load addr:$src1), VR128:$src2)),
                   addr:$src1),
            (VMOVLPDmr addr:$src1, VR128:$src2)>;
}
1245
let Predicates = [UseSSE1] in {
  // (store (vector_shuffle (load addr), v2, <4, 5, 2, 3>), addr) using MOVLPS
  def : Pat<(store (i64 (vector_extract (bc_v2i64 (v4f32 VR128:$src2)),
                                 (iPTR 0))), addr:$src1),
            (MOVLPSmr addr:$src1, VR128:$src2)>;

  // Shuffle with MOVLPS
  def : Pat<(v4f32 (X86Movlps VR128:$src1, (load addr:$src2))),
            (MOVLPSrm VR128:$src1, addr:$src2)>;
  def : Pat<(v4i32 (X86Movlps VR128:$src1, (load addr:$src2))),
            (MOVLPSrm VR128:$src1, addr:$src2)>;
  // Also match the load arriving as a scalar i64 bitcast to v4f32.
  def : Pat<(X86Movlps VR128:$src1,
                      (bc_v4f32 (v2i64 (scalar_to_vector (loadi64 addr:$src2))))),
            (MOVLPSrm VR128:$src1, addr:$src2)>;

  // Store patterns
  def : Pat<(store (v4f32 (X86Movlps (load addr:$src1), VR128:$src2)),
                                      addr:$src1),
            (MOVLPSmr addr:$src1, VR128:$src2)>;
  def : Pat<(store (v4i32 (X86Movlps
                   (bc_v4i32 (loadv2i64 addr:$src1)), VR128:$src2)),
                              addr:$src1),
            (MOVLPSmr addr:$src1, VR128:$src2)>;
}
1270
let Predicates = [UseSSE2] in {
  // Shuffle with MOVLPD
  def : Pat<(v2f64 (X86Movlpd VR128:$src1, (load addr:$src2))),
            (MOVLPDrm VR128:$src1, addr:$src2)>;
  def : Pat<(v2i64 (X86Movlpd VR128:$src1, (load addr:$src2))),
            (MOVLPDrm VR128:$src1, addr:$src2)>;
  // Movsd with a scalar f64 load is equivalent to movlpd (replace low
  // element from memory).
  def : Pat<(v2f64 (X86Movsd VR128:$src1,
                             (v2f64 (scalar_to_vector (loadf64 addr:$src2))))),
            (MOVLPDrm VR128:$src1, addr:$src2)>;

  // Store patterns
  def : Pat<(store (v2f64 (X86Movlpd (load addr:$src1), VR128:$src2)),
                           addr:$src1),
            (MOVLPDmr addr:$src1, VR128:$src2)>;
  def : Pat<(store (v2i64 (X86Movlpd (load addr:$src1), VR128:$src2)),
                           addr:$src1),
            (MOVLPDmr addr:$src1, VR128:$src2)>;
}
1289
1290//===----------------------------------------------------------------------===//
1291// SSE 1 & 2 - Move Hi packed FP Instructions
1292//===----------------------------------------------------------------------===//
1293
// movhps/movhpd (+ VEX forms): load 64 bits into the high half of an xmm.
let AddedComplexity = 20 in {
  defm MOVH : sse12_mov_hilo_packed<0x16, X86Movlhps, X86Movlhpd, "movhp",
                                    IIC_SSE_MOV_LH>;
}
1298
// Store the high 64 bits of an xmm register (movhps/movhpd, opcode 0x17),
// matched as "unpack high onto low, then extract element 0".
let SchedRW = [WriteStore] in {
// v2f64 extract element 1 is always custom lowered to unpack high to low
// and extract element 0 so the non-store version isn't too horrible.
def VMOVHPSmr : VPSI<0x17, MRMDestMem, (outs), (ins f64mem:$dst, VR128:$src),
                   "movhps\t{$src, $dst|$dst, $src}",
                   [(store (f64 (vector_extract
                                 (X86Unpckh (bc_v2f64 (v4f32 VR128:$src)),
                                            (bc_v2f64 (v4f32 VR128:$src))),
                                 (iPTR 0))), addr:$dst)], IIC_SSE_MOV_LH>, VEX;
def VMOVHPDmr : VPDI<0x17, MRMDestMem, (outs), (ins f64mem:$dst, VR128:$src),
                   "movhpd\t{$src, $dst|$dst, $src}",
                   [(store (f64 (vector_extract
                                 (v2f64 (X86Unpckh VR128:$src, VR128:$src)),
                                 (iPTR 0))), addr:$dst)], IIC_SSE_MOV_LH>, VEX;
def MOVHPSmr : PSI<0x17, MRMDestMem, (outs), (ins f64mem:$dst, VR128:$src),
                   "movhps\t{$src, $dst|$dst, $src}",
                   [(store (f64 (vector_extract
                                 (X86Unpckh (bc_v2f64 (v4f32 VR128:$src)),
                                            (bc_v2f64 (v4f32 VR128:$src))),
                                 (iPTR 0))), addr:$dst)], IIC_SSE_MOV_LH>;
def MOVHPDmr : PDI<0x17, MRMDestMem, (outs), (ins f64mem:$dst, VR128:$src),
                   "movhpd\t{$src, $dst|$dst, $src}",
                   [(store (f64 (vector_extract
                                 (v2f64 (X86Unpckh VR128:$src, VR128:$src)),
                                 (iPTR 0))), addr:$dst)], IIC_SSE_MOV_LH>;
} // SchedRW
1325
// AVX load-folding patterns for the move-high instructions.
let Predicates = [HasAVX] in {
  // VMOVHPS patterns
  def : Pat<(X86Movlhps VR128:$src1,
                 (bc_v4f32 (v2i64 (scalar_to_vector (loadi64 addr:$src2))))),
            (VMOVHPSrm VR128:$src1, addr:$src2)>;
  // Fold a zero-extending vector load as well. This bitcast must be
  // bc_v4f32 (not bc_v4i32) so it matches the v4f32 X86Movlhps node,
  // mirroring the UseSSE1 pattern for MOVHPSrm below.
  def : Pat<(X86Movlhps VR128:$src1,
                 (bc_v4f32 (v2i64 (X86vzload addr:$src2)))),
            (VMOVHPSrm VR128:$src1, addr:$src2)>;

  // VMOVHPD patterns

  // FIXME: Instead of X86Unpckl, there should be a X86Movlhpd here, the problem
  // is during lowering, where it's not possible to recognize the load fold
  // cause it has two uses through a bitcast. One use disappears at isel time
  // and the fold opportunity reappears.
  def : Pat<(v2f64 (X86Unpckl VR128:$src1,
                      (scalar_to_vector (loadf64 addr:$src2)))),
            (VMOVHPDrm VR128:$src1, addr:$src2)>;
  // Also handle an i64 load because that may get selected as a faster way to
  // load the data.
  def : Pat<(v2f64 (X86Unpckl VR128:$src1,
                      (bc_v2f64 (v2i64 (scalar_to_vector (loadi64 addr:$src2)))))),
            (VMOVHPDrm VR128:$src1, addr:$src2)>;

  // Storing the high half after a vpermilpd that moves it to element 0 is
  // just a vmovhpd store.
  def : Pat<(store (f64 (vector_extract
                          (v2f64 (X86VPermilpi VR128:$src, (i8 1))),
                          (iPTR 0))), addr:$dst),
            (VMOVHPDmr addr:$dst, VR128:$src)>;
}
1355
// SSE1 load-folding patterns: fold a 64-bit load (plain or zero-extending)
// feeding the high half of a movlhps into the memory form MOVHPSrm.
let Predicates = [UseSSE1] in {
  // MOVHPS patterns
  def : Pat<(X86Movlhps VR128:$src1,
                 (bc_v4f32 (v2i64 (scalar_to_vector (loadi64 addr:$src2))))),
            (MOVHPSrm VR128:$src1, addr:$src2)>;
  def : Pat<(X86Movlhps VR128:$src1,
                 (bc_v4f32 (v2i64 (X86vzload addr:$src2)))),
            (MOVHPSrm VR128:$src1, addr:$src2)>;
}
1365
// SSE2 load/store-folding patterns for movhpd.
let Predicates = [UseSSE2] in {
  // MOVHPD patterns

  // FIXME: Instead of X86Unpckl, there should be a X86Movlhpd here, the problem
  // is during lowering, where it's not possible to recognize the load fold
  // cause it has two uses through a bitcast. One use disappears at isel time
  // and the fold opportunity reappears.
  def : Pat<(v2f64 (X86Unpckl VR128:$src1,
                      (scalar_to_vector (loadf64 addr:$src2)))),
            (MOVHPDrm VR128:$src1, addr:$src2)>;
  // Also handle an i64 load because that may get selected as a faster way to
  // load the data.
  def : Pat<(v2f64 (X86Unpckl VR128:$src1,
                      (bc_v2f64 (v2i64 (scalar_to_vector (loadi64 addr:$src2)))))),
            (MOVHPDrm VR128:$src1, addr:$src2)>;

  // Storing the high half after a shufpd that moves it to element 0 is just
  // a movhpd store (SSE has no vpermilpd, cf. the AVX pattern above).
  def : Pat<(store (f64 (vector_extract
                          (v2f64 (X86Shufp VR128:$src, VR128:$src, (i8 1))),
                          (iPTR 0))), addr:$dst),
            (MOVHPDmr addr:$dst, VR128:$src)>;
}
1387
1388//===----------------------------------------------------------------------===//
1389// SSE 1 & 2 - Move Low to High and High to Low packed FP Instructions
1390//===----------------------------------------------------------------------===//
1391
// VEX-encoded register-to-register movlhps/movhlps: combine low/high 64-bit
// halves of two XMM registers into $dst (three-operand, non-destructive).
let AddedComplexity = 20, Predicates = [UseAVX] in {
  def VMOVLHPSrr : VPSI<0x16, MRMSrcReg, (outs VR128:$dst),
                                       (ins VR128:$src1, VR128:$src2),
                      "movlhps\t{$src2, $src1, $dst|$dst, $src1, $src2}",
                      [(set VR128:$dst,
                        (v4f32 (X86Movlhps VR128:$src1, VR128:$src2)))],
                        IIC_SSE_MOV_LH>,
                      VEX_4V, Sched<[WriteFShuffle]>;
  def VMOVHLPSrr : VPSI<0x12, MRMSrcReg, (outs VR128:$dst),
                                       (ins VR128:$src1, VR128:$src2),
                      "movhlps\t{$src2, $src1, $dst|$dst, $src1, $src2}",
                      [(set VR128:$dst,
                        (v4f32 (X86Movhlps VR128:$src1, VR128:$src2)))],
                        IIC_SSE_MOV_LH>,
                      VEX_4V, Sched<[WriteFShuffle]>;
}
// Legacy SSE movlhps/movhlps: two-operand, destructive ($src1 is tied to
// $dst via the Constraints string).
let Constraints = "$src1 = $dst", AddedComplexity = 20 in {
  def MOVLHPSrr : PSI<0x16, MRMSrcReg, (outs VR128:$dst),
                                       (ins VR128:$src1, VR128:$src2),
                      "movlhps\t{$src2, $dst|$dst, $src2}",
                      [(set VR128:$dst,
                        (v4f32 (X86Movlhps VR128:$src1, VR128:$src2)))],
                        IIC_SSE_MOV_LH>, Sched<[WriteFShuffle]>;
  def MOVHLPSrr : PSI<0x12, MRMSrcReg, (outs VR128:$dst),
                                       (ins VR128:$src1, VR128:$src2),
                      "movhlps\t{$src2, $dst|$dst, $src2}",
                      [(set VR128:$dst,
                        (v4f32 (X86Movhlps VR128:$src1, VR128:$src2)))],
                        IIC_SSE_MOV_LH>, Sched<[WriteFShuffle]>;
}
1422
// Select the AVX register forms for the integer-typed variants of the
// movlhps/movhlps shuffle nodes as well.
let Predicates = [UseAVX] in {
  // MOVLHPS patterns
  def : Pat<(v4i32 (X86Movlhps VR128:$src1, VR128:$src2)),
            (VMOVLHPSrr VR128:$src1, VR128:$src2)>;
  def : Pat<(v2i64 (X86Movlhps VR128:$src1, VR128:$src2)),
            (VMOVLHPSrr (v2i64 VR128:$src1), VR128:$src2)>;

  // MOVHLPS patterns
  def : Pat<(v4i32 (X86Movhlps VR128:$src1, VR128:$src2)),
            (VMOVHLPSrr VR128:$src1, VR128:$src2)>;
}
1434
// Same integer-typed shuffle patterns for the legacy SSE encodings.
let Predicates = [UseSSE1] in {
  // MOVLHPS patterns
  def : Pat<(v4i32 (X86Movlhps VR128:$src1, VR128:$src2)),
            (MOVLHPSrr VR128:$src1, VR128:$src2)>;
  def : Pat<(v2i64 (X86Movlhps VR128:$src1, VR128:$src2)),
            (MOVLHPSrr (v2i64 VR128:$src1), VR128:$src2)>;

  // MOVHLPS patterns
  def : Pat<(v4i32 (X86Movhlps VR128:$src1, VR128:$src2)),
            (MOVHLPSrr VR128:$src1, VR128:$src2)>;
}
1446
1447//===----------------------------------------------------------------------===//
1448// SSE 1 & 2 - Conversion Instructions
1449//===----------------------------------------------------------------------===//
1450
// Itineraries for packed-double conversions.
// NOTE(review): no `let Sched =` override here, so this inherits OpndItins'
// default Sched (WriteFAdd) rather than a conversion write class — looks
// like legacy behavior; confirm against the scheduling models.
def SSE_CVT_PD : OpndItins<
  IIC_SSE_CVT_PD_RR, IIC_SSE_CVT_PD_RM
>;
1454
// Itinerary bundles (RR, RM) for the scalar/packed conversion families,
// each with an explicit scheduling write class.
let Sched = WriteCvtI2F in
def SSE_CVT_PS : OpndItins<
  IIC_SSE_CVT_PS_RR, IIC_SSE_CVT_PS_RM
>;

let Sched = WriteCvtI2F in
def SSE_CVT_Scalar : OpndItins<
  IIC_SSE_CVT_Scalar_RR, IIC_SSE_CVT_Scalar_RM
>;

// Float-to-integer scalar conversions (cvt[t]ss2si / cvt[t]sd2si).
let Sched = WriteCvtF2I in
def SSE_CVT_SS2SI_32 : OpndItins<
  IIC_SSE_CVT_SS2SI32_RR, IIC_SSE_CVT_SS2SI32_RM
>;

let Sched = WriteCvtF2I in
def SSE_CVT_SS2SI_64 : OpndItins<
  IIC_SSE_CVT_SS2SI64_RR, IIC_SSE_CVT_SS2SI64_RM
>;

let Sched = WriteCvtF2I in
def SSE_CVT_SD2SI : OpndItins<
  IIC_SSE_CVT_SD2SI_RR, IIC_SSE_CVT_SD2SI_RM
>;
1479
// Scalar conversion (non-intrinsic form): emits a reg-reg instruction
// selected from (OpNode reg) and a reg-mem instruction selected from
// (OpNode (load)), with itineraries/sched info taken from `itins`.
multiclass sse12_cvt_s<bits<8> opc, RegisterClass SrcRC, RegisterClass DstRC,
                     SDNode OpNode, X86MemOperand x86memop, PatFrag ld_frag,
                     string asm, OpndItins itins> {
  def rr : SI<opc, MRMSrcReg, (outs DstRC:$dst), (ins SrcRC:$src), asm,
                        [(set DstRC:$dst, (OpNode SrcRC:$src))],
                        itins.rr>, Sched<[itins.Sched]>;
  def rm : SI<opc, MRMSrcMem, (outs DstRC:$dst), (ins x86memop:$src), asm,
                        [(set DstRC:$dst, (OpNode (ld_frag addr:$src)))],
                        itins.rm>, Sched<[itins.Sched.Folded]>;
}
1490
// Packed conversion: reg-reg and reg-mem forms with no isel patterns
// (hasSideEffects = 0 keeps them schedulable; selection happens elsewhere).
multiclass sse12_cvt_p<bits<8> opc, RegisterClass SrcRC, RegisterClass DstRC,
                       X86MemOperand x86memop, string asm, Domain d,
                       OpndItins itins> {
let hasSideEffects = 0 in {
  def rr : I<opc, MRMSrcReg, (outs DstRC:$dst), (ins SrcRC:$src), asm,
             [], itins.rr, d>, Sched<[itins.Sched]>;
  let mayLoad = 1 in
  def rm : I<opc, MRMSrcMem, (outs DstRC:$dst), (ins x86memop:$src), asm,
             [], itins.rm, d>, Sched<[itins.Sched.Folded]>;
}
}
1502
// AVX three-operand scalar int-to-fp conversion: $src1 supplies the
// pass-through upper bits of $dst. Pattern-less (selected via the Pats
// below that feed an IMPLICIT_DEF as $src1).
multiclass sse12_vcvt_avx<bits<8> opc, RegisterClass SrcRC, RegisterClass DstRC,
                          X86MemOperand x86memop, string asm> {
let hasSideEffects = 0, Predicates = [UseAVX] in {
  def rr : SI<opc, MRMSrcReg, (outs DstRC:$dst), (ins DstRC:$src1, SrcRC:$src),
              !strconcat(asm,"\t{$src, $src1, $dst|$dst, $src1, $src}"), []>,
           Sched<[WriteCvtI2F]>;
  let mayLoad = 1 in
  def rm : SI<opc, MRMSrcMem, (outs DstRC:$dst),
              (ins DstRC:$src1, x86memop:$src),
              !strconcat(asm,"\t{$src, $src1, $dst|$dst, $src1, $src}"), []>,
           Sched<[WriteCvtI2FLd, ReadAfterLd]>;
} // hasSideEffects = 0
}
1516
// AVX truncating fp-to-int conversions (opcode 0x2C), selected from
// fp_to_sint, plus explicit {l}/{q}-suffixed assembler aliases (trailing 0
// marks each alias as parse-only, not used for printing).
let Predicates = [UseAVX] in {
defm VCVTTSS2SI   : sse12_cvt_s<0x2C, FR32, GR32, fp_to_sint, f32mem, loadf32,
                                "cvttss2si\t{$src, $dst|$dst, $src}",
                                SSE_CVT_SS2SI_32>,
                                XS, VEX, VEX_LIG;
defm VCVTTSS2SI64 : sse12_cvt_s<0x2C, FR32, GR64, fp_to_sint, f32mem, loadf32,
                                "cvttss2si\t{$src, $dst|$dst, $src}",
                                SSE_CVT_SS2SI_64>,
                                XS, VEX, VEX_W, VEX_LIG;
defm VCVTTSD2SI   : sse12_cvt_s<0x2C, FR64, GR32, fp_to_sint, f64mem, loadf64,
                                "cvttsd2si\t{$src, $dst|$dst, $src}",
                                SSE_CVT_SD2SI>,
                                XD, VEX, VEX_LIG;
defm VCVTTSD2SI64 : sse12_cvt_s<0x2C, FR64, GR64, fp_to_sint, f64mem, loadf64,
                                "cvttsd2si\t{$src, $dst|$dst, $src}",
                                SSE_CVT_SD2SI>,
                                XD, VEX, VEX_W, VEX_LIG;

def : InstAlias<"vcvttss2si{l}\t{$src, $dst|$dst, $src}",
                (VCVTTSS2SIrr GR32:$dst, FR32:$src), 0>;
def : InstAlias<"vcvttss2si{l}\t{$src, $dst|$dst, $src}",
                (VCVTTSS2SIrm GR32:$dst, f32mem:$src), 0>;
def : InstAlias<"vcvttsd2si{l}\t{$src, $dst|$dst, $src}",
                (VCVTTSD2SIrr GR32:$dst, FR64:$src), 0>;
def : InstAlias<"vcvttsd2si{l}\t{$src, $dst|$dst, $src}",
                (VCVTTSD2SIrm GR32:$dst, f64mem:$src), 0>;
def : InstAlias<"vcvttss2si{q}\t{$src, $dst|$dst, $src}",
                (VCVTTSS2SI64rr GR64:$dst, FR32:$src), 0>;
def : InstAlias<"vcvttss2si{q}\t{$src, $dst|$dst, $src}",
                (VCVTTSS2SI64rm GR64:$dst, f32mem:$src), 0>;
def : InstAlias<"vcvttsd2si{q}\t{$src, $dst|$dst, $src}",
                (VCVTTSD2SI64rr GR64:$dst, FR64:$src), 0>;
def : InstAlias<"vcvttsd2si{q}\t{$src, $dst|$dst, $src}",
                (VCVTTSD2SI64rm GR64:$dst, f64mem:$src), 0>;
}
// The assembler can recognize rr 64-bit instructions by seeing a rxx
// register, but the same isn't true when only using memory operands, so
// provide explicit assembly "l" and "q" forms to disambiguate where
// appropriate.
defm VCVTSI2SS   : sse12_vcvt_avx<0x2A, GR32, FR32, i32mem, "cvtsi2ss{l}">,
                                  XS, VEX_4V, VEX_LIG;
defm VCVTSI2SS64 : sse12_vcvt_avx<0x2A, GR64, FR32, i64mem, "cvtsi2ss{q}">,
                                  XS, VEX_4V, VEX_W, VEX_LIG;
defm VCVTSI2SD   : sse12_vcvt_avx<0x2A, GR32, FR64, i32mem, "cvtsi2sd{l}">,
                                  XD, VEX_4V, VEX_LIG;
defm VCVTSI2SD64 : sse12_vcvt_avx<0x2A, GR64, FR64, i64mem, "cvtsi2sd{q}">,
                                  XD, VEX_4V, VEX_W, VEX_LIG;
1564
// AVX sint_to_fp selection: the three-operand cvtsi2* needs a $src1 to merge
// into, so an IMPLICIT_DEF is supplied for the scalar-only case.
let Predicates = [UseAVX] in {
  def : InstAlias<"vcvtsi2ss\t{$src, $src1, $dst|$dst, $src1, $src}",
                (VCVTSI2SSrm FR64:$dst, FR64:$src1, i32mem:$src), 0>;
  def : InstAlias<"vcvtsi2sd\t{$src, $src1, $dst|$dst, $src1, $src}",
                (VCVTSI2SDrm FR64:$dst, FR64:$src1, i32mem:$src), 0>;

  // Load-folding forms.
  def : Pat<(f32 (sint_to_fp (loadi32 addr:$src))),
            (VCVTSI2SSrm (f32 (IMPLICIT_DEF)), addr:$src)>;
  def : Pat<(f32 (sint_to_fp (loadi64 addr:$src))),
            (VCVTSI2SS64rm (f32 (IMPLICIT_DEF)), addr:$src)>;
  def : Pat<(f64 (sint_to_fp (loadi32 addr:$src))),
            (VCVTSI2SDrm (f64 (IMPLICIT_DEF)), addr:$src)>;
  def : Pat<(f64 (sint_to_fp (loadi64 addr:$src))),
            (VCVTSI2SD64rm (f64 (IMPLICIT_DEF)), addr:$src)>;

  // Register forms.
  def : Pat<(f32 (sint_to_fp GR32:$src)),
            (VCVTSI2SSrr (f32 (IMPLICIT_DEF)), GR32:$src)>;
  def : Pat<(f32 (sint_to_fp GR64:$src)),
            (VCVTSI2SS64rr (f32 (IMPLICIT_DEF)), GR64:$src)>;
  def : Pat<(f64 (sint_to_fp GR32:$src)),
            (VCVTSI2SDrr (f64 (IMPLICIT_DEF)), GR32:$src)>;
  def : Pat<(f64 (sint_to_fp GR64:$src)),
            (VCVTSI2SD64rr (f64 (IMPLICIT_DEF)), GR64:$src)>;
}
1589
// Legacy SSE encodings of the truncating fp-to-int (0x2C) and int-to-fp
// (0x2A) scalar conversions; 64-bit variants carry REX.W.
defm CVTTSS2SI : sse12_cvt_s<0x2C, FR32, GR32, fp_to_sint, f32mem, loadf32,
                      "cvttss2si\t{$src, $dst|$dst, $src}",
                      SSE_CVT_SS2SI_32>, XS;
defm CVTTSS2SI64 : sse12_cvt_s<0x2C, FR32, GR64, fp_to_sint, f32mem, loadf32,
                      "cvttss2si\t{$src, $dst|$dst, $src}",
                      SSE_CVT_SS2SI_64>, XS, REX_W;
defm CVTTSD2SI : sse12_cvt_s<0x2C, FR64, GR32, fp_to_sint, f64mem, loadf64,
                      "cvttsd2si\t{$src, $dst|$dst, $src}",
                      SSE_CVT_SD2SI>, XD;
defm CVTTSD2SI64 : sse12_cvt_s<0x2C, FR64, GR64, fp_to_sint, f64mem, loadf64,
                      "cvttsd2si\t{$src, $dst|$dst, $src}",
                      SSE_CVT_SD2SI>, XD, REX_W;
defm CVTSI2SS  : sse12_cvt_s<0x2A, GR32, FR32, sint_to_fp, i32mem, loadi32,
                      "cvtsi2ss{l}\t{$src, $dst|$dst, $src}",
                      SSE_CVT_Scalar>, XS;
defm CVTSI2SS64 : sse12_cvt_s<0x2A, GR64, FR32, sint_to_fp, i64mem, loadi64,
                      "cvtsi2ss{q}\t{$src, $dst|$dst, $src}",
                      SSE_CVT_Scalar>, XS, REX_W;
defm CVTSI2SD  : sse12_cvt_s<0x2A, GR32, FR64, sint_to_fp, i32mem, loadi32,
                      "cvtsi2sd{l}\t{$src, $dst|$dst, $src}",
                      SSE_CVT_Scalar>, XD;
defm CVTSI2SD64 : sse12_cvt_s<0x2A, GR64, FR64, sint_to_fp, i64mem, loadi64,
                      "cvtsi2sd{q}\t{$src, $dst|$dst, $src}",
                      SSE_CVT_Scalar>, XD, REX_W;
1614
// Parse-only (trailing 0) {l}/{q}-suffixed assembler aliases for the legacy
// SSE conversion instructions above.
def : InstAlias<"cvttss2si{l}\t{$src, $dst|$dst, $src}",
                (CVTTSS2SIrr GR32:$dst, FR32:$src), 0>;
def : InstAlias<"cvttss2si{l}\t{$src, $dst|$dst, $src}",
                (CVTTSS2SIrm GR32:$dst, f32mem:$src), 0>;
def : InstAlias<"cvttsd2si{l}\t{$src, $dst|$dst, $src}",
                (CVTTSD2SIrr GR32:$dst, FR64:$src), 0>;
def : InstAlias<"cvttsd2si{l}\t{$src, $dst|$dst, $src}",
                (CVTTSD2SIrm GR32:$dst, f64mem:$src), 0>;
def : InstAlias<"cvttss2si{q}\t{$src, $dst|$dst, $src}",
                (CVTTSS2SI64rr GR64:$dst, FR32:$src), 0>;
def : InstAlias<"cvttss2si{q}\t{$src, $dst|$dst, $src}",
                (CVTTSS2SI64rm GR64:$dst, f32mem:$src), 0>;
def : InstAlias<"cvttsd2si{q}\t{$src, $dst|$dst, $src}",
                (CVTTSD2SI64rr GR64:$dst, FR64:$src), 0>;
def : InstAlias<"cvttsd2si{q}\t{$src, $dst|$dst, $src}",
                (CVTTSD2SI64rm GR64:$dst, f64mem:$src), 0>;

def : InstAlias<"cvtsi2ss\t{$src, $dst|$dst, $src}",
                (CVTSI2SSrm FR64:$dst, i32mem:$src), 0>;
def : InstAlias<"cvtsi2sd\t{$src, $dst|$dst, $src}",
                (CVTSI2SDrm FR64:$dst, i32mem:$src), 0>;
1636
1637// Conversion Instructions Intrinsics - Match intrinsics which expect MM
1638// and/or XMM operand(s).
1639
// Intrinsic-form scalar conversion: selects from the given Intrinsic rather
// than an ISD node; the memory form matches through a ComplexPattern
// (mem_cpat) so partial-register loads like sse_load_f64 can fold.
multiclass sse12_cvt_sint<bits<8> opc, RegisterClass SrcRC, RegisterClass DstRC,
                         Intrinsic Int, Operand memop, ComplexPattern mem_cpat,
                         string asm, OpndItins itins> {
  def rr : SI<opc, MRMSrcReg, (outs DstRC:$dst), (ins SrcRC:$src),
              !strconcat(asm, "\t{$src, $dst|$dst, $src}"),
              [(set DstRC:$dst, (Int SrcRC:$src))], itins.rr>,
           Sched<[itins.Sched]>;
  def rm : SI<opc, MRMSrcMem, (outs DstRC:$dst), (ins memop:$src),
              !strconcat(asm, "\t{$src, $dst|$dst, $src}"),
              [(set DstRC:$dst, (Int mem_cpat:$src))], itins.rm>,
           Sched<[itins.Sched.Folded]>;
}
1652
// Intrinsic-form conversion with a tied/first source operand: Is2Addr
// selects between the two-operand (legacy SSE) and three-operand (VEX)
// assembler syntax.
multiclass sse12_cvt_sint_3addr<bits<8> opc, RegisterClass SrcRC,
                    RegisterClass DstRC, Intrinsic Int, X86MemOperand x86memop,
                    PatFrag ld_frag, string asm, OpndItins itins,
                    bit Is2Addr = 1> {
  def rr : SI<opc, MRMSrcReg, (outs DstRC:$dst), (ins DstRC:$src1, SrcRC:$src2),
              !if(Is2Addr,
                  !strconcat(asm, "\t{$src2, $dst|$dst, $src2}"),
                  !strconcat(asm, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
              [(set DstRC:$dst, (Int DstRC:$src1, SrcRC:$src2))],
              itins.rr>, Sched<[itins.Sched]>;
  def rm : SI<opc, MRMSrcMem, (outs DstRC:$dst),
              (ins DstRC:$src1, x86memop:$src2),
              !if(Is2Addr,
                  !strconcat(asm, "\t{$src2, $dst|$dst, $src2}"),
                  !strconcat(asm, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
              [(set DstRC:$dst, (Int DstRC:$src1, (ld_frag addr:$src2)))],
              itins.rm>, Sched<[itins.Sched.Folded, ReadAfterLd]>;
}
1671
// Rounding (non-truncating) cvtsd2si, intrinsic form, AVX and legacy SSE.
let Predicates = [UseAVX] in {
defm VCVTSD2SI : sse12_cvt_sint<0x2D, VR128, GR32,
                  int_x86_sse2_cvtsd2si, sdmem, sse_load_f64, "cvtsd2si",
                  SSE_CVT_SD2SI>, XD, VEX, VEX_LIG;
defm VCVTSD2SI64 : sse12_cvt_sint<0x2D, VR128, GR64,
                    int_x86_sse2_cvtsd2si64, sdmem, sse_load_f64, "cvtsd2si",
                    SSE_CVT_SD2SI>, XD, VEX, VEX_W, VEX_LIG;
}
defm CVTSD2SI : sse12_cvt_sint<0x2D, VR128, GR32, int_x86_sse2_cvtsd2si,
                 sdmem, sse_load_f64, "cvtsd2si", SSE_CVT_SD2SI>, XD;
defm CVTSD2SI64 : sse12_cvt_sint<0x2D, VR128, GR64, int_x86_sse2_cvtsd2si64,
                   sdmem, sse_load_f64, "cvtsd2si", SSE_CVT_SD2SI>, XD, REX_W;
1684
1685
// Intrinsic-only cvtsi2* variants (isCodeGenOnly: never emitted by the
// assembler/disassembler tables, only selected from the intrinsics).
let isCodeGenOnly = 1 in {
  let Predicates = [UseAVX] in {
  defm Int_VCVTSI2SS : sse12_cvt_sint_3addr<0x2A, GR32, VR128,
            int_x86_sse_cvtsi2ss, i32mem, loadi32, "cvtsi2ss{l}",
            SSE_CVT_Scalar, 0>, XS, VEX_4V;
  defm Int_VCVTSI2SS64 : sse12_cvt_sint_3addr<0x2A, GR64, VR128,
            int_x86_sse_cvtsi642ss, i64mem, loadi64, "cvtsi2ss{q}",
            SSE_CVT_Scalar, 0>, XS, VEX_4V,
            VEX_W;
  defm Int_VCVTSI2SD : sse12_cvt_sint_3addr<0x2A, GR32, VR128,
            int_x86_sse2_cvtsi2sd, i32mem, loadi32, "cvtsi2sd{l}",
            SSE_CVT_Scalar, 0>, XD, VEX_4V;
  defm Int_VCVTSI2SD64 : sse12_cvt_sint_3addr<0x2A, GR64, VR128,
            int_x86_sse2_cvtsi642sd, i64mem, loadi64, "cvtsi2sd{q}",
            SSE_CVT_Scalar, 0>, XD,
            VEX_4V, VEX_W;
  }
  // Legacy two-address forms: the intrinsic's pass-through operand is tied
  // to the destination.
  let Constraints = "$src1 = $dst" in {
    defm Int_CVTSI2SS : sse12_cvt_sint_3addr<0x2A, GR32, VR128,
                          int_x86_sse_cvtsi2ss, i32mem, loadi32,
                          "cvtsi2ss{l}", SSE_CVT_Scalar>, XS;
    defm Int_CVTSI2SS64 : sse12_cvt_sint_3addr<0x2A, GR64, VR128,
                          int_x86_sse_cvtsi642ss, i64mem, loadi64,
                          "cvtsi2ss{q}", SSE_CVT_Scalar>, XS, REX_W;
    defm Int_CVTSI2SD : sse12_cvt_sint_3addr<0x2A, GR32, VR128,
                          int_x86_sse2_cvtsi2sd, i32mem, loadi32,
                          "cvtsi2sd{l}", SSE_CVT_Scalar>, XD;
    defm Int_CVTSI2SD64 : sse12_cvt_sint_3addr<0x2A, GR64, VR128,
                          int_x86_sse2_cvtsi642sd, i64mem, loadi64,
                          "cvtsi2sd{q}", SSE_CVT_Scalar>, XD, REX_W;
  }
} // isCodeGenOnly = 1
1718
1719/// SSE 1 Only
1720
1721// Aliases for intrinsics
// Intrinsic-only truncating conversions, matched from the cvtt*2si
// intrinsics with an XMM (VR128) source operand.
let isCodeGenOnly = 1 in {
let Predicates = [UseAVX] in {
defm Int_VCVTTSS2SI : sse12_cvt_sint<0x2C, VR128, GR32, int_x86_sse_cvttss2si,
                                    ssmem, sse_load_f32, "cvttss2si",
                                    SSE_CVT_SS2SI_32>, XS, VEX;
defm Int_VCVTTSS2SI64 : sse12_cvt_sint<0x2C, VR128, GR64,
                                   int_x86_sse_cvttss2si64, ssmem, sse_load_f32,
                                   "cvttss2si", SSE_CVT_SS2SI_64>,
                                   XS, VEX, VEX_W;
defm Int_VCVTTSD2SI : sse12_cvt_sint<0x2C, VR128, GR32, int_x86_sse2_cvttsd2si,
                                    sdmem, sse_load_f64, "cvttsd2si",
                                    SSE_CVT_SD2SI>, XD, VEX;
defm Int_VCVTTSD2SI64 : sse12_cvt_sint<0x2C, VR128, GR64,
                                  int_x86_sse2_cvttsd2si64, sdmem, sse_load_f64,
                                  "cvttsd2si", SSE_CVT_SD2SI>,
                                  XD, VEX, VEX_W;
}
defm Int_CVTTSS2SI : sse12_cvt_sint<0x2C, VR128, GR32, int_x86_sse_cvttss2si,
                                    ssmem, sse_load_f32, "cvttss2si",
                                    SSE_CVT_SS2SI_32>, XS;
defm Int_CVTTSS2SI64 : sse12_cvt_sint<0x2C, VR128, GR64,
                                   int_x86_sse_cvttss2si64, ssmem, sse_load_f32,
                                   "cvttss2si", SSE_CVT_SS2SI_64>, XS, REX_W;
defm Int_CVTTSD2SI : sse12_cvt_sint<0x2C, VR128, GR32, int_x86_sse2_cvttsd2si,
                                    sdmem, sse_load_f64, "cvttsd2si",
                                    SSE_CVT_SD2SI>, XD;
defm Int_CVTTSD2SI64 : sse12_cvt_sint<0x2C, VR128, GR64,
                                  int_x86_sse2_cvttsd2si64, sdmem, sse_load_f64,
                                  "cvttsd2si", SSE_CVT_SD2SI>, XD, REX_W;
} // isCodeGenOnly = 1
1752
// Rounding (non-truncating) cvtss2si, intrinsic form, AVX and legacy SSE.
let Predicates = [UseAVX] in {
defm VCVTSS2SI   : sse12_cvt_sint<0x2D, VR128, GR32, int_x86_sse_cvtss2si,
                                  ssmem, sse_load_f32, "cvtss2si",
                                  SSE_CVT_SS2SI_32>, XS, VEX, VEX_LIG;
defm VCVTSS2SI64 : sse12_cvt_sint<0x2D, VR128, GR64, int_x86_sse_cvtss2si64,
                                  ssmem, sse_load_f32, "cvtss2si",
                                  SSE_CVT_SS2SI_64>, XS, VEX, VEX_W, VEX_LIG;
}
defm CVTSS2SI : sse12_cvt_sint<0x2D, VR128, GR32, int_x86_sse_cvtss2si,
                               ssmem, sse_load_f32, "cvtss2si",
                               SSE_CVT_SS2SI_32>, XS;
defm CVTSS2SI64 : sse12_cvt_sint<0x2D, VR128, GR64, int_x86_sse_cvtss2si64,
                                 ssmem, sse_load_f32, "cvtss2si",
                                 SSE_CVT_SS2SI_64>, XS, REX_W;
1767
// Packed int-to-float (cvtdq2ps): pattern-less defs via sse12_cvt_p; the
// legacy form requires SSE2 even though the instruction is in the PS domain.
defm VCVTDQ2PS   : sse12_cvt_p<0x5B, VR128, VR128, i128mem,
                               "vcvtdq2ps\t{$src, $dst|$dst, $src}",
                               SSEPackedSingle, SSE_CVT_PS>,
                               PS, VEX, Requires<[HasAVX]>;
defm VCVTDQ2PSY  : sse12_cvt_p<0x5B, VR256, VR256, i256mem,
                               "vcvtdq2ps\t{$src, $dst|$dst, $src}",
                               SSEPackedSingle, SSE_CVT_PS>,
                               PS, VEX, VEX_L, Requires<[HasAVX]>;

defm CVTDQ2PS : sse12_cvt_p<0x5B, VR128, VR128, i128mem,
                            "cvtdq2ps\t{$src, $dst|$dst, $src}",
                            SSEPackedSingle, SSE_CVT_PS>,
                            PS, Requires<[UseSSE2]>;
1781
// Parse-only {l}/{q} assembler aliases for the AVX non-truncating
// cvt*2si intrinsic instructions.
let Predicates = [UseAVX] in {
def : InstAlias<"vcvtss2si{l}\t{$src, $dst|$dst, $src}",
                (VCVTSS2SIrr GR32:$dst, VR128:$src), 0>;
def : InstAlias<"vcvtss2si{l}\t{$src, $dst|$dst, $src}",
                (VCVTSS2SIrm GR32:$dst, ssmem:$src), 0>;
def : InstAlias<"vcvtsd2si{l}\t{$src, $dst|$dst, $src}",
                (VCVTSD2SIrr GR32:$dst, VR128:$src), 0>;
def : InstAlias<"vcvtsd2si{l}\t{$src, $dst|$dst, $src}",
                (VCVTSD2SIrm GR32:$dst, sdmem:$src), 0>;
def : InstAlias<"vcvtss2si{q}\t{$src, $dst|$dst, $src}",
                (VCVTSS2SI64rr GR64:$dst, VR128:$src), 0>;
def : InstAlias<"vcvtss2si{q}\t{$src, $dst|$dst, $src}",
                (VCVTSS2SI64rm GR64:$dst, ssmem:$src), 0>;
def : InstAlias<"vcvtsd2si{q}\t{$src, $dst|$dst, $src}",
                (VCVTSD2SI64rr GR64:$dst, VR128:$src), 0>;
def : InstAlias<"vcvtsd2si{q}\t{$src, $dst|$dst, $src}",
                (VCVTSD2SI64rm GR64:$dst, sdmem:$src), 0>;
}
1800
// Same parse-only {l}/{q} aliases for the legacy SSE encodings.
def : InstAlias<"cvtss2si{l}\t{$src, $dst|$dst, $src}",
                (CVTSS2SIrr GR32:$dst, VR128:$src), 0>;
def : InstAlias<"cvtss2si{l}\t{$src, $dst|$dst, $src}",
                (CVTSS2SIrm GR32:$dst, ssmem:$src), 0>;
def : InstAlias<"cvtsd2si{l}\t{$src, $dst|$dst, $src}",
                (CVTSD2SIrr GR32:$dst, VR128:$src), 0>;
def : InstAlias<"cvtsd2si{l}\t{$src, $dst|$dst, $src}",
                (CVTSD2SIrm GR32:$dst, sdmem:$src), 0>;
def : InstAlias<"cvtss2si{q}\t{$src, $dst|$dst, $src}",
                (CVTSS2SI64rr GR64:$dst, VR128:$src), 0>;
def : InstAlias<"cvtss2si{q}\t{$src, $dst|$dst, $src}",
                (CVTSS2SI64rm GR64:$dst, ssmem:$src), 0>;
def : InstAlias<"cvtsd2si{q}\t{$src, $dst|$dst, $src}",
                (CVTSD2SI64rr GR64:$dst, VR128:$src), 0>;
def : InstAlias<"cvtsd2si{q}\t{$src, $dst|$dst, $src}",
                (CVTSD2SI64rm GR64:$dst, sdmem:$src), 0>;
// Add the missing trailing 0 (suppress use of this alias when printing)
// that every other cvt*2si alias in this group has; without it the printer
// would emit the suffixed "cvtsd2si{q}" spelling for CVTSD2SI64rm.
def : InstAlias<"cvtsd2si{q}\t{$src, $dst|$dst, $src}",
                (CVTSD2SI64rm GR64:$dst, sdmem:$src), 0>;
1817
1818/// SSE 2 Only
1819
1820// Convert scalar double to scalar single
// AVX scalar double-to-single (vcvtsd2ss), pattern-less three-operand
// forms; the memory form is OptForSize-only (partial-register load fold).
let hasSideEffects = 0, Predicates = [UseAVX] in {
def VCVTSD2SSrr  : VSDI<0x5A, MRMSrcReg, (outs FR32:$dst),
                       (ins FR64:$src1, FR64:$src2),
                      "cvtsd2ss\t{$src2, $src1, $dst|$dst, $src1, $src2}", [],
                      IIC_SSE_CVT_Scalar_RR>, VEX_4V, VEX_LIG,
                      Sched<[WriteCvtF2F]>;
let mayLoad = 1 in
def VCVTSD2SSrm  : I<0x5A, MRMSrcMem, (outs FR32:$dst),
                       (ins FR64:$src1, f64mem:$src2),
                      "vcvtsd2ss\t{$src2, $src1, $dst|$dst, $src1, $src2}",
                      [], IIC_SSE_CVT_Scalar_RM>,
                      XD, Requires<[HasAVX, OptForSize]>, VEX_4V, VEX_LIG,
                      Sched<[WriteCvtF2FLd, ReadAfterLd]>;
}
1835
// fround selection under AVX: the source doubles as the pass-through
// operand, so $src is passed for both inputs.
def : Pat<(f32 (fround FR64:$src)), (VCVTSD2SSrr FR64:$src, FR64:$src)>,
          Requires<[UseAVX]>;
1838
// Legacy SSE cvtsd2ss, selected directly from fround; the load-folding
// form is restricted to OptForSize.
def CVTSD2SSrr  : SDI<0x5A, MRMSrcReg, (outs FR32:$dst), (ins FR64:$src),
                      "cvtsd2ss\t{$src, $dst|$dst, $src}",
                      [(set FR32:$dst, (fround FR64:$src))],
                      IIC_SSE_CVT_Scalar_RR>, Sched<[WriteCvtF2F]>;
def CVTSD2SSrm  : I<0x5A, MRMSrcMem, (outs FR32:$dst), (ins f64mem:$src),
                      "cvtsd2ss\t{$src, $dst|$dst, $src}",
                      [(set FR32:$dst, (fround (loadf64 addr:$src)))],
                      IIC_SSE_CVT_Scalar_RM>,
                      XD,
                  Requires<[UseSSE2, OptForSize]>, Sched<[WriteCvtF2FLd]>;
1849
// Intrinsic-only cvtsd2ss forms operating on whole XMM registers.
// Fix: the two memory forms below were tagged MRMSrcReg despite taking an
// sdmem operand; they must be MRMSrcMem (cf. Int_VCVTSS2SDrm later in this
// file, which correctly uses MRMSrcMem) so the ModRM byte encodes a memory
// reference.
let isCodeGenOnly = 1 in {
def Int_VCVTSD2SSrr: I<0x5A, MRMSrcReg,
                       (outs VR128:$dst), (ins VR128:$src1, VR128:$src2),
                       "vcvtsd2ss\t{$src2, $src1, $dst|$dst, $src1, $src2}",
                       [(set VR128:$dst,
                         (int_x86_sse2_cvtsd2ss VR128:$src1, VR128:$src2))],
                       IIC_SSE_CVT_Scalar_RR>, XD, VEX_4V, Requires<[UseAVX]>,
                       Sched<[WriteCvtF2F]>;
def Int_VCVTSD2SSrm: I<0x5A, MRMSrcMem,
                       (outs VR128:$dst), (ins VR128:$src1, sdmem:$src2),
                       "vcvtsd2ss\t{$src2, $src1, $dst|$dst, $src1, $src2}",
                       [(set VR128:$dst, (int_x86_sse2_cvtsd2ss
                                          VR128:$src1, sse_load_f64:$src2))],
                       IIC_SSE_CVT_Scalar_RM>, XD, VEX_4V, Requires<[UseAVX]>,
                       Sched<[WriteCvtF2FLd, ReadAfterLd]>;

// Legacy SSE forms: $src1 tied to $dst.
let Constraints = "$src1 = $dst" in {
def Int_CVTSD2SSrr: I<0x5A, MRMSrcReg,
                       (outs VR128:$dst), (ins VR128:$src1, VR128:$src2),
                       "cvtsd2ss\t{$src2, $dst|$dst, $src2}",
                       [(set VR128:$dst,
                         (int_x86_sse2_cvtsd2ss VR128:$src1, VR128:$src2))],
                       IIC_SSE_CVT_Scalar_RR>, XD, Requires<[UseSSE2]>,
                       Sched<[WriteCvtF2F]>;
def Int_CVTSD2SSrm: I<0x5A, MRMSrcMem,
                       (outs VR128:$dst), (ins VR128:$src1, sdmem:$src2),
                       "cvtsd2ss\t{$src2, $dst|$dst, $src2}",
                       [(set VR128:$dst, (int_x86_sse2_cvtsd2ss
                                          VR128:$src1, sse_load_f64:$src2))],
                       IIC_SSE_CVT_Scalar_RM>, XD, Requires<[UseSSE2]>,
                       Sched<[WriteCvtF2FLd, ReadAfterLd]>;
}
} // isCodeGenOnly = 1
1883
1884// Convert scalar single to scalar double
1885// SSE2 instructions with XS prefix
// AVX scalar single-to-double (vcvtss2sd), pattern-less three-operand
// forms; selected via the fextend/extloadf32 Pats below.
let hasSideEffects = 0, Predicates = [UseAVX] in {
def VCVTSS2SDrr : I<0x5A, MRMSrcReg, (outs FR64:$dst),
                    (ins FR32:$src1, FR32:$src2),
                    "vcvtss2sd\t{$src2, $src1, $dst|$dst, $src1, $src2}",
                    [], IIC_SSE_CVT_Scalar_RR>,
                    XS, Requires<[HasAVX]>, VEX_4V, VEX_LIG,
                    Sched<[WriteCvtF2F]>;
let mayLoad = 1 in
def VCVTSS2SDrm : I<0x5A, MRMSrcMem, (outs FR64:$dst),
                    (ins FR32:$src1, f32mem:$src2),
                    "vcvtss2sd\t{$src2, $src1, $dst|$dst, $src1, $src2}",
                    [], IIC_SSE_CVT_Scalar_RM>,
                    XS, VEX_4V, VEX_LIG, Requires<[HasAVX, OptForSize]>,
                    Sched<[WriteCvtF2FLd, ReadAfterLd]>;
}
1901
// fextend/extload selection under AVX. When optimizing for speed the load
// is done with vmovss first (avoids a false dependency from the folded
// load); when optimizing for size the load is folded into vcvtss2sd.
def : Pat<(f64 (fextend FR32:$src)),
    (VCVTSS2SDrr FR32:$src, FR32:$src)>, Requires<[UseAVX]>;
def : Pat<(fextend (loadf32 addr:$src)),
    (VCVTSS2SDrm (f32 (IMPLICIT_DEF)), addr:$src)>, Requires<[UseAVX]>;

def : Pat<(extloadf32 addr:$src),
    (VCVTSS2SDrm (f32 (IMPLICIT_DEF)), addr:$src)>,
    Requires<[UseAVX, OptForSize]>;
def : Pat<(extloadf32 addr:$src),
    (VCVTSS2SDrr (f32 (IMPLICIT_DEF)), (VMOVSSrm addr:$src))>,
    Requires<[UseAVX, OptForSpeed]>;
1913
// Legacy SSE cvtss2sd; the memory form folds the extending load and is
// restricted to OptForSize.
def CVTSS2SDrr : I<0x5A, MRMSrcReg, (outs FR64:$dst), (ins FR32:$src),
                   "cvtss2sd\t{$src, $dst|$dst, $src}",
                   [(set FR64:$dst, (fextend FR32:$src))],
                   IIC_SSE_CVT_Scalar_RR>, XS,
                 Requires<[UseSSE2]>, Sched<[WriteCvtF2F]>;
def CVTSS2SDrm : I<0x5A, MRMSrcMem, (outs FR64:$dst), (ins f32mem:$src),
                   "cvtss2sd\t{$src, $dst|$dst, $src}",
                   [(set FR64:$dst, (extloadf32 addr:$src))],
                   IIC_SSE_CVT_Scalar_RM>, XS,
                 Requires<[UseSSE2, OptForSize]>, Sched<[WriteCvtF2FLd]>;
1924
1925// extload f32 -> f64.  This matches load+fextend because we have a hack in
1926// the isel (PreprocessForFPConvert) that can introduce loads after dag
1927// combine.
1928// Since these loads aren't folded into the fextend, we have to match it
1929// explicitly here.
1930def : Pat<(fextend (loadf32 addr:$src)),
1931          (CVTSS2SDrm addr:$src)>, Requires<[UseSSE2]>;
1932def : Pat<(extloadf32 addr:$src),
1933          (CVTSS2SDrr (MOVSSrm addr:$src))>, Requires<[UseSSE2, OptForSpeed]>;
1934
// Intrinsic (XMM-operand) forms of cvtss2sd, matching int_x86_sse2_cvtss2sd.
// isCodeGenOnly: these duplicate the encodings above and exist only so the
// intrinsic can be selected; they are not separate assembler instructions.
let isCodeGenOnly = 1 in {
def Int_VCVTSS2SDrr: I<0x5A, MRMSrcReg,
                      (outs VR128:$dst), (ins VR128:$src1, VR128:$src2),
                    "vcvtss2sd\t{$src2, $src1, $dst|$dst, $src1, $src2}",
                    [(set VR128:$dst,
                      (int_x86_sse2_cvtss2sd VR128:$src1, VR128:$src2))],
                    IIC_SSE_CVT_Scalar_RR>, XS, VEX_4V, Requires<[UseAVX]>,
                    Sched<[WriteCvtF2F]>;
def Int_VCVTSS2SDrm: I<0x5A, MRMSrcMem,
                      (outs VR128:$dst), (ins VR128:$src1, ssmem:$src2),
                    "vcvtss2sd\t{$src2, $src1, $dst|$dst, $src1, $src2}",
                    [(set VR128:$dst,
                      (int_x86_sse2_cvtss2sd VR128:$src1, sse_load_f32:$src2))],
                    IIC_SSE_CVT_Scalar_RM>, XS, VEX_4V, Requires<[UseAVX]>,
                    Sched<[WriteCvtF2FLd, ReadAfterLd]>;
let Constraints = "$src1 = $dst" in { // SSE2 instructions with XS prefix
def Int_CVTSS2SDrr: I<0x5A, MRMSrcReg,
                      (outs VR128:$dst), (ins VR128:$src1, VR128:$src2),
                    "cvtss2sd\t{$src2, $dst|$dst, $src2}",
                    [(set VR128:$dst,
                      (int_x86_sse2_cvtss2sd VR128:$src1, VR128:$src2))],
                    IIC_SSE_CVT_Scalar_RR>, XS, Requires<[UseSSE2]>,
                    Sched<[WriteCvtF2F]>;
def Int_CVTSS2SDrm: I<0x5A, MRMSrcMem,
                      (outs VR128:$dst), (ins VR128:$src1, ssmem:$src2),
                    "cvtss2sd\t{$src2, $dst|$dst, $src2}",
                    [(set VR128:$dst,
                      (int_x86_sse2_cvtss2sd VR128:$src1, sse_load_f32:$src2))],
                    IIC_SSE_CVT_Scalar_RM>, XS, Requires<[UseSSE2]>,
                    Sched<[WriteCvtF2FLd, ReadAfterLd]>;
}
} // isCodeGenOnly = 1
1967
// Convert packed single/double fp to doubleword
// AVX forms use unaligned loadv*; legacy SSE forms use memopv* (alignment
// required). 256-bit forms select the AVX intrinsic.
def VCVTPS2DQrr : VPDI<0x5B, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
                       "cvtps2dq\t{$src, $dst|$dst, $src}",
                       [(set VR128:$dst, (int_x86_sse2_cvtps2dq VR128:$src))],
                       IIC_SSE_CVT_PS_RR>, VEX, Sched<[WriteCvtF2I]>;
def VCVTPS2DQrm : VPDI<0x5B, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
                       "cvtps2dq\t{$src, $dst|$dst, $src}",
                       [(set VR128:$dst,
                         (int_x86_sse2_cvtps2dq (loadv4f32 addr:$src)))],
                       IIC_SSE_CVT_PS_RM>, VEX, Sched<[WriteCvtF2ILd]>;
def VCVTPS2DQYrr : VPDI<0x5B, MRMSrcReg, (outs VR256:$dst), (ins VR256:$src),
                        "cvtps2dq\t{$src, $dst|$dst, $src}",
                        [(set VR256:$dst,
                          (int_x86_avx_cvt_ps2dq_256 VR256:$src))],
                        IIC_SSE_CVT_PS_RR>, VEX, VEX_L, Sched<[WriteCvtF2I]>;
def VCVTPS2DQYrm : VPDI<0x5B, MRMSrcMem, (outs VR256:$dst), (ins f256mem:$src),
                        "cvtps2dq\t{$src, $dst|$dst, $src}",
                        [(set VR256:$dst,
                          (int_x86_avx_cvt_ps2dq_256 (loadv8f32 addr:$src)))],
                        IIC_SSE_CVT_PS_RM>, VEX, VEX_L, Sched<[WriteCvtF2ILd]>;
def CVTPS2DQrr : PDI<0x5B, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
                     "cvtps2dq\t{$src, $dst|$dst, $src}",
                     [(set VR128:$dst, (int_x86_sse2_cvtps2dq VR128:$src))],
                     IIC_SSE_CVT_PS_RR>, Sched<[WriteCvtF2I]>;
def CVTPS2DQrm : PDI<0x5B, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
                     "cvtps2dq\t{$src, $dst|$dst, $src}",
                     [(set VR128:$dst,
                       (int_x86_sse2_cvtps2dq (memopv4f32 addr:$src)))],
                     IIC_SSE_CVT_PS_RM>, Sched<[WriteCvtF2ILd]>;
1997
1998
// Convert Packed Double FP to Packed DW Integers
let Predicates = [HasAVX] in {
// The assembler can recognize rr 256-bit instructions by seeing a ymm
// register, but the same isn't true when using memory operands instead.
// Provide other assembly rr and rm forms to address this explicitly.
def VCVTPD2DQrr  : SDI<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
                       "vcvtpd2dq\t{$src, $dst|$dst, $src}",
                       [(set VR128:$dst, (int_x86_sse2_cvtpd2dq VR128:$src))]>,
                       VEX, Sched<[WriteCvtF2I]>;

// XMM only
// The "x" suffix disambiguates the 128-bit memory form in assembly.
def : InstAlias<"vcvtpd2dqx\t{$src, $dst|$dst, $src}",
                (VCVTPD2DQrr VR128:$dst, VR128:$src), 0>;
def VCVTPD2DQXrm : SDI<0xE6, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
                       "vcvtpd2dqx\t{$src, $dst|$dst, $src}",
                       [(set VR128:$dst,
                         (int_x86_sse2_cvtpd2dq (loadv2f64 addr:$src)))], VEX,
                       Sched<[WriteCvtF2ILd]>;

// YMM only
// Note: 256-bit source narrows to a 128-bit integer result.
def VCVTPD2DQYrr : SDI<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR256:$src),
                       "vcvtpd2dq{y}\t{$src, $dst|$dst, $src}",
                       [(set VR128:$dst,
                         (int_x86_avx_cvt_pd2dq_256 VR256:$src))]>, VEX, VEX_L,
                       Sched<[WriteCvtF2I]>;
def VCVTPD2DQYrm : SDI<0xE6, MRMSrcMem, (outs VR128:$dst), (ins f256mem:$src),
                       "vcvtpd2dq{y}\t{$src, $dst|$dst, $src}",
                       [(set VR128:$dst,
                         (int_x86_avx_cvt_pd2dq_256 (loadv4f64 addr:$src)))]>,
                       VEX, VEX_L, Sched<[WriteCvtF2ILd]>;
def : InstAlias<"vcvtpd2dq\t{$src, $dst|$dst, $src}",
                (VCVTPD2DQYrr VR128:$dst, VR256:$src), 0>;
}

// Legacy SSE2 forms (aligned memop for the memory operand).
def CVTPD2DQrm  : SDI<0xE6, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
                      "cvtpd2dq\t{$src, $dst|$dst, $src}",
                      [(set VR128:$dst,
                        (int_x86_sse2_cvtpd2dq (memopv2f64 addr:$src)))],
                      IIC_SSE_CVT_PD_RM>, Sched<[WriteCvtF2ILd]>;
def CVTPD2DQrr  : SDI<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
                      "cvtpd2dq\t{$src, $dst|$dst, $src}",
                      [(set VR128:$dst, (int_x86_sse2_cvtpd2dq VR128:$src))],
                      IIC_SSE_CVT_PD_RR>, Sched<[WriteCvtF2I]>;
2042
// Convert with truncation packed single/double fp to doubleword
// SSE2 packed instructions with XS prefix
// These select the cvtt* intrinsics; fp_to_sint selection is done by the
// Pat<> definitions that follow.
def VCVTTPS2DQrr : VS2SI<0x5B, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
                         "cvttps2dq\t{$src, $dst|$dst, $src}",
                         [(set VR128:$dst,
                           (int_x86_sse2_cvttps2dq VR128:$src))],
                         IIC_SSE_CVT_PS_RR>, VEX, Sched<[WriteCvtF2I]>;
def VCVTTPS2DQrm : VS2SI<0x5B, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
                         "cvttps2dq\t{$src, $dst|$dst, $src}",
                         [(set VR128:$dst, (int_x86_sse2_cvttps2dq
                                            (loadv4f32 addr:$src)))],
                         IIC_SSE_CVT_PS_RM>, VEX, Sched<[WriteCvtF2ILd]>;
def VCVTTPS2DQYrr : VS2SI<0x5B, MRMSrcReg, (outs VR256:$dst), (ins VR256:$src),
                          "cvttps2dq\t{$src, $dst|$dst, $src}",
                          [(set VR256:$dst,
                            (int_x86_avx_cvtt_ps2dq_256 VR256:$src))],
                          IIC_SSE_CVT_PS_RR>, VEX, VEX_L, Sched<[WriteCvtF2I]>;
def VCVTTPS2DQYrm : VS2SI<0x5B, MRMSrcMem, (outs VR256:$dst), (ins f256mem:$src),
                          "cvttps2dq\t{$src, $dst|$dst, $src}",
                          [(set VR256:$dst, (int_x86_avx_cvtt_ps2dq_256
                                             (loadv8f32 addr:$src)))],
                          IIC_SSE_CVT_PS_RM>, VEX, VEX_L,
                          Sched<[WriteCvtF2ILd]>;

def CVTTPS2DQrr : S2SI<0x5B, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
                       "cvttps2dq\t{$src, $dst|$dst, $src}",
                       [(set VR128:$dst, (int_x86_sse2_cvttps2dq VR128:$src))],
                       IIC_SSE_CVT_PS_RR>, Sched<[WriteCvtF2I]>;
def CVTTPS2DQrm : S2SI<0x5B, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
                       "cvttps2dq\t{$src, $dst|$dst, $src}",
                       [(set VR128:$dst,
                         (int_x86_sse2_cvttps2dq (memopv4f32 addr:$src)))],
                       IIC_SSE_CVT_PS_RM>, Sched<[WriteCvtF2ILd]>;
2076
// Map generic sint_to_fp / fp_to_sint DAG nodes (and the dq2ps intrinsic)
// onto the AVX conversion instructions, 128- and 256-bit.
let Predicates = [HasAVX] in {
  def : Pat<(v4f32 (sint_to_fp (v4i32 VR128:$src))),
            (VCVTDQ2PSrr VR128:$src)>;
  def : Pat<(v4f32 (sint_to_fp (bc_v4i32 (loadv2i64 addr:$src)))),
            (VCVTDQ2PSrm addr:$src)>;

  def : Pat<(int_x86_sse2_cvtdq2ps VR128:$src),
            (VCVTDQ2PSrr VR128:$src)>;
  def : Pat<(int_x86_sse2_cvtdq2ps (bc_v4i32 (loadv2i64 addr:$src))),
            (VCVTDQ2PSrm addr:$src)>;

  def : Pat<(v4i32 (fp_to_sint (v4f32 VR128:$src))),
            (VCVTTPS2DQrr VR128:$src)>;
  def : Pat<(v4i32 (fp_to_sint (loadv4f32 addr:$src))),
            (VCVTTPS2DQrm addr:$src)>;

  def : Pat<(v8f32 (sint_to_fp (v8i32 VR256:$src))),
            (VCVTDQ2PSYrr VR256:$src)>;
  def : Pat<(v8f32 (sint_to_fp (bc_v8i32 (loadv4i64 addr:$src)))),
            (VCVTDQ2PSYrm addr:$src)>;

  def : Pat<(v8i32 (fp_to_sint (v8f32 VR256:$src))),
            (VCVTTPS2DQYrr VR256:$src)>;
  def : Pat<(v8i32 (fp_to_sint (loadv8f32 addr:$src))),
            (VCVTTPS2DQYrm addr:$src)>;
}

// Same mappings for legacy SSE2 (aligned memop loads only).
let Predicates = [UseSSE2] in {
  def : Pat<(v4f32 (sint_to_fp (v4i32 VR128:$src))),
            (CVTDQ2PSrr VR128:$src)>;
  def : Pat<(v4f32 (sint_to_fp (bc_v4i32 (memopv2i64 addr:$src)))),
            (CVTDQ2PSrm addr:$src)>;

  def : Pat<(int_x86_sse2_cvtdq2ps VR128:$src),
            (CVTDQ2PSrr VR128:$src)>;
  def : Pat<(int_x86_sse2_cvtdq2ps (bc_v4i32 (memopv2i64 addr:$src))),
            (CVTDQ2PSrm addr:$src)>;

  def : Pat<(v4i32 (fp_to_sint (v4f32 VR128:$src))),
            (CVTTPS2DQrr VR128:$src)>;
  def : Pat<(v4i32 (fp_to_sint (memopv4f32 addr:$src))),
            (CVTTPS2DQrm addr:$src)>;
}
2120
// Truncating packed double -> doubleword conversions (cvttpd2dq).
def VCVTTPD2DQrr : VPDI<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
                        "cvttpd2dq\t{$src, $dst|$dst, $src}",
                        [(set VR128:$dst,
                              (int_x86_sse2_cvttpd2dq VR128:$src))],
                              IIC_SSE_CVT_PD_RR>, VEX, Sched<[WriteCvtF2I]>;

// The assembler can recognize rr 256-bit instructions by seeing a ymm
// register, but the same isn't true when using memory operands instead.
// Provide other assembly rr and rm forms to address this explicitly.

// XMM only
def : InstAlias<"vcvttpd2dqx\t{$src, $dst|$dst, $src}",
                (VCVTTPD2DQrr VR128:$dst, VR128:$src), 0>;
def VCVTTPD2DQXrm : VPDI<0xE6, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
                         "cvttpd2dqx\t{$src, $dst|$dst, $src}",
                         [(set VR128:$dst, (int_x86_sse2_cvttpd2dq
                                            (loadv2f64 addr:$src)))],
                         IIC_SSE_CVT_PD_RM>, VEX, Sched<[WriteCvtF2ILd]>;

// YMM only
// 256-bit double source narrows to a 128-bit integer result.
def VCVTTPD2DQYrr : VPDI<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR256:$src),
                         "cvttpd2dq{y}\t{$src, $dst|$dst, $src}",
                         [(set VR128:$dst,
                           (int_x86_avx_cvtt_pd2dq_256 VR256:$src))],
                         IIC_SSE_CVT_PD_RR>, VEX, VEX_L, Sched<[WriteCvtF2I]>;
def VCVTTPD2DQYrm : VPDI<0xE6, MRMSrcMem, (outs VR128:$dst), (ins f256mem:$src),
                         "cvttpd2dq{y}\t{$src, $dst|$dst, $src}",
                         [(set VR128:$dst,
                          (int_x86_avx_cvtt_pd2dq_256 (loadv4f64 addr:$src)))],
                         IIC_SSE_CVT_PD_RM>, VEX, VEX_L, Sched<[WriteCvtF2ILd]>;
def : InstAlias<"vcvttpd2dq\t{$src, $dst|$dst, $src}",
                (VCVTTPD2DQYrr VR128:$dst, VR256:$src), 0>;

// Select 256-bit fp_to_sint through the YMM truncating forms.
let Predicates = [HasAVX] in {
  def : Pat<(v4i32 (fp_to_sint (v4f64 VR256:$src))),
            (VCVTTPD2DQYrr VR256:$src)>;
  def : Pat<(v4i32 (fp_to_sint (loadv4f64 addr:$src))),
            (VCVTTPD2DQYrm addr:$src)>;
} // Predicates = [HasAVX]

// Legacy SSE2 forms.
def CVTTPD2DQrr : PDI<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
                      "cvttpd2dq\t{$src, $dst|$dst, $src}",
                      [(set VR128:$dst, (int_x86_sse2_cvttpd2dq VR128:$src))],
                      IIC_SSE_CVT_PD_RR>, Sched<[WriteCvtF2I]>;
def CVTTPD2DQrm : PDI<0xE6, MRMSrcMem, (outs VR128:$dst),(ins f128mem:$src),
                      "cvttpd2dq\t{$src, $dst|$dst, $src}",
                      [(set VR128:$dst, (int_x86_sse2_cvttpd2dq
                                        (memopv2f64 addr:$src)))],
                                        IIC_SSE_CVT_PD_RM>,
                      Sched<[WriteCvtF2ILd]>;
2171
// Convert packed single to packed double
let Predicates = [HasAVX] in {
// SSE2 instructions without OpSize prefix
// The 128-bit memory form reads only 64 bits (two floats) and widens them;
// the YMM form widens four floats from a 128-bit source.
def VCVTPS2PDrr : I<0x5A, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
                     "vcvtps2pd\t{$src, $dst|$dst, $src}",
                     [(set VR128:$dst, (int_x86_sse2_cvtps2pd VR128:$src))],
                     IIC_SSE_CVT_PD_RR>, PS, VEX, Sched<[WriteCvtF2F]>;
def VCVTPS2PDrm : I<0x5A, MRMSrcMem, (outs VR128:$dst), (ins f64mem:$src),
                    "vcvtps2pd\t{$src, $dst|$dst, $src}",
                    [(set VR128:$dst, (v2f64 (extloadv2f32 addr:$src)))],
                    IIC_SSE_CVT_PD_RM>, PS, VEX, Sched<[WriteCvtF2FLd]>;
def VCVTPS2PDYrr : I<0x5A, MRMSrcReg, (outs VR256:$dst), (ins VR128:$src),
                     "vcvtps2pd\t{$src, $dst|$dst, $src}",
                     [(set VR256:$dst,
                       (int_x86_avx_cvt_ps2_pd_256 VR128:$src))],
                     IIC_SSE_CVT_PD_RR>, PS, VEX, VEX_L, Sched<[WriteCvtF2F]>;
def VCVTPS2PDYrm : I<0x5A, MRMSrcMem, (outs VR256:$dst), (ins f128mem:$src),
                     "vcvtps2pd\t{$src, $dst|$dst, $src}",
                     [(set VR256:$dst,
                       (int_x86_avx_cvt_ps2_pd_256 (loadv4f32 addr:$src)))],
                     IIC_SSE_CVT_PD_RM>, PS, VEX, VEX_L, Sched<[WriteCvtF2FLd]>;
}

// Legacy SSE2 forms.
let Predicates = [UseSSE2] in {
def CVTPS2PDrr : I<0x5A, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
                       "cvtps2pd\t{$src, $dst|$dst, $src}",
                       [(set VR128:$dst, (int_x86_sse2_cvtps2pd VR128:$src))],
                       IIC_SSE_CVT_PD_RR>, PS, Sched<[WriteCvtF2F]>;
def CVTPS2PDrm : I<0x5A, MRMSrcMem, (outs VR128:$dst), (ins f64mem:$src),
                   "cvtps2pd\t{$src, $dst|$dst, $src}",
                   [(set VR128:$dst, (v2f64 (extloadv2f32 addr:$src)))],
                   IIC_SSE_CVT_PD_RM>, PS, Sched<[WriteCvtF2FLd]>;
}
2205
// Convert Packed DW Integers to Packed Double FP
let Predicates = [HasAVX] in {
// The 128-bit memory form reads only 64 bits (two dwords); no pattern is
// attached, so mark it mayLoad with no side effects.
let hasSideEffects = 0, mayLoad = 1 in
def VCVTDQ2PDrm  : S2SI<0xE6, MRMSrcMem, (outs VR128:$dst), (ins i64mem:$src),
                     "vcvtdq2pd\t{$src, $dst|$dst, $src}",
                     []>, VEX, Sched<[WriteCvtI2FLd]>;
def VCVTDQ2PDrr  : S2SI<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
                     "vcvtdq2pd\t{$src, $dst|$dst, $src}",
                     [(set VR128:$dst,
                       (int_x86_sse2_cvtdq2pd VR128:$src))]>, VEX,
                   Sched<[WriteCvtI2F]>;
def VCVTDQ2PDYrm  : S2SI<0xE6, MRMSrcMem, (outs VR256:$dst), (ins i128mem:$src),
                     "vcvtdq2pd\t{$src, $dst|$dst, $src}",
                     [(set VR256:$dst,
                       (int_x86_avx_cvtdq2_pd_256
                        (bitconvert (loadv2i64 addr:$src))))]>, VEX, VEX_L,
                    Sched<[WriteCvtI2FLd]>;
def VCVTDQ2PDYrr  : S2SI<0xE6, MRMSrcReg, (outs VR256:$dst), (ins VR128:$src),
                     "vcvtdq2pd\t{$src, $dst|$dst, $src}",
                     [(set VR256:$dst,
                       (int_x86_avx_cvtdq2_pd_256 VR128:$src))]>, VEX, VEX_L,
                    Sched<[WriteCvtI2F]>;
}
2229
// Legacy SSE2 cvtdq2pd: convert two packed doubleword integers to two packed
// doubles. The memory form reads 64 bits and carries no pattern (selection
// comes from the intrinsic/pattern machinery above), so it is mayLoad with
// no side effects.
// FIX: the itinerary classes were swapped -- the memory form used
// IIC_SSE_CVT_PD_RR and the register form IIC_SSE_CVT_PD_RM. Every other
// rr/rm pair in this file pairs *_RR with rr and *_RM with rm; the schedule
// classes (WriteCvtI2FLd vs WriteCvtI2F) here were already paired correctly.
let hasSideEffects = 0, mayLoad = 1 in
def CVTDQ2PDrm  : S2SI<0xE6, MRMSrcMem, (outs VR128:$dst), (ins i64mem:$src),
                       "cvtdq2pd\t{$src, $dst|$dst, $src}", [],
                       IIC_SSE_CVT_PD_RM>, Sched<[WriteCvtI2FLd]>;
def CVTDQ2PDrr  : S2SI<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
                       "cvtdq2pd\t{$src, $dst|$dst, $src}",
                       [(set VR128:$dst, (int_x86_sse2_cvtdq2pd VR128:$src))],
                       IIC_SSE_CVT_PD_RR>, Sched<[WriteCvtI2F]>;
2238
// AVX 256-bit register conversion intrinsics
// Select v4i32 -> v4f64 sint_to_fp through the YMM cvtdq2pd forms.
let Predicates = [HasAVX] in {
  def : Pat<(v4f64 (sint_to_fp (v4i32 VR128:$src))),
            (VCVTDQ2PDYrr VR128:$src)>;
  def : Pat<(v4f64 (sint_to_fp (bc_v4i32 (loadv2i64 addr:$src)))),
            (VCVTDQ2PDYrm addr:$src)>;
} // Predicates = [HasAVX]
2246
// Convert packed double to packed single
// The assembler can recognize rr 256-bit instructions by seeing a ymm
// register, but the same isn't true when using memory operands instead.
// Provide other assembly rr and rm forms to address this explicitly.
def VCVTPD2PSrr : VPDI<0x5A, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
                       "cvtpd2ps\t{$src, $dst|$dst, $src}",
                       [(set VR128:$dst, (int_x86_sse2_cvtpd2ps VR128:$src))],
                       IIC_SSE_CVT_PD_RR>, VEX, Sched<[WriteCvtF2F]>;

// XMM only
def : InstAlias<"vcvtpd2psx\t{$src, $dst|$dst, $src}",
                (VCVTPD2PSrr VR128:$dst, VR128:$src), 0>;
def VCVTPD2PSXrm : VPDI<0x5A, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
                        "cvtpd2psx\t{$src, $dst|$dst, $src}",
                        [(set VR128:$dst,
                          (int_x86_sse2_cvtpd2ps (loadv2f64 addr:$src)))],
                        IIC_SSE_CVT_PD_RM>, VEX, Sched<[WriteCvtF2FLd]>;

// YMM only
// 256-bit double source narrows to a 128-bit single result.
def VCVTPD2PSYrr : VPDI<0x5A, MRMSrcReg, (outs VR128:$dst), (ins VR256:$src),
                        "cvtpd2ps{y}\t{$src, $dst|$dst, $src}",
                        [(set VR128:$dst,
                          (int_x86_avx_cvt_pd2_ps_256 VR256:$src))],
                        IIC_SSE_CVT_PD_RR>, VEX, VEX_L, Sched<[WriteCvtF2F]>;
def VCVTPD2PSYrm : VPDI<0x5A, MRMSrcMem, (outs VR128:$dst), (ins f256mem:$src),
                        "cvtpd2ps{y}\t{$src, $dst|$dst, $src}",
                        [(set VR128:$dst,
                          (int_x86_avx_cvt_pd2_ps_256 (loadv4f64 addr:$src)))],
                        IIC_SSE_CVT_PD_RM>, VEX, VEX_L, Sched<[WriteCvtF2FLd]>;
def : InstAlias<"vcvtpd2ps\t{$src, $dst|$dst, $src}",
                (VCVTPD2PSYrr VR128:$dst, VR256:$src), 0>;

// Legacy SSE2 forms.
def CVTPD2PSrr : PDI<0x5A, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
                     "cvtpd2ps\t{$src, $dst|$dst, $src}",
                     [(set VR128:$dst, (int_x86_sse2_cvtpd2ps VR128:$src))],
                     IIC_SSE_CVT_PD_RR>, Sched<[WriteCvtF2F]>;
def CVTPD2PSrm : PDI<0x5A, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
                     "cvtpd2ps\t{$src, $dst|$dst, $src}",
                     [(set VR128:$dst,
                       (int_x86_sse2_cvtpd2ps (memopv2f64 addr:$src)))],
                     IIC_SSE_CVT_PD_RM>, Sched<[WriteCvtF2FLd]>;
2288
2289
2290// AVX 256-bit register conversion intrinsics
2291// FIXME: Migrate SSE conversion intrinsics matching to use patterns as below
2292// whenever possible to avoid declaring two versions of each one.
2293let Predicates = [HasAVX] in {
2294  def : Pat<(int_x86_avx_cvtdq2_ps_256 VR256:$src),
2295            (VCVTDQ2PSYrr VR256:$src)>;
2296  def : Pat<(int_x86_avx_cvtdq2_ps_256 (bitconvert (loadv4i64 addr:$src))),
2297            (VCVTDQ2PSYrm addr:$src)>;
2298
2299  // Match fround and fextend for 128/256-bit conversions
2300  def : Pat<(v4f32 (X86vfpround (v2f64 VR128:$src))),
2301            (VCVTPD2PSrr VR128:$src)>;
2302  def : Pat<(v4f32 (X86vfpround (loadv2f64 addr:$src))),
2303            (VCVTPD2PSXrm addr:$src)>;
2304  def : Pat<(v4f32 (fround (v4f64 VR256:$src))),
2305            (VCVTPD2PSYrr VR256:$src)>;
2306  def : Pat<(v4f32 (fround (loadv4f64 addr:$src))),
2307            (VCVTPD2PSYrm addr:$src)>;
2308
2309  def : Pat<(v2f64 (X86vfpext (v4f32 VR128:$src))),
2310            (VCVTPS2PDrr VR128:$src)>;
2311  def : Pat<(v4f64 (fextend (v4f32 VR128:$src))),
2312            (VCVTPS2PDYrr VR128:$src)>;
2313  def : Pat<(v4f64 (extloadv4f32 addr:$src)),
2314            (VCVTPS2PDYrm addr:$src)>;
2315}
2316
2317let Predicates = [UseSSE2] in {
2318  // Match fround and fextend for 128 conversions
2319  def : Pat<(v4f32 (X86vfpround (v2f64 VR128:$src))),
2320            (CVTPD2PSrr VR128:$src)>;
2321  def : Pat<(v4f32 (X86vfpround (memopv2f64 addr:$src))),
2322            (CVTPD2PSrm addr:$src)>;
2323
2324  def : Pat<(v2f64 (X86vfpext (v4f32 VR128:$src))),
2325            (CVTPS2PDrr VR128:$src)>;
2326}
2327
2328//===----------------------------------------------------------------------===//
2329// SSE 1 & 2 - Compare Instructions
2330//===----------------------------------------------------------------------===//
2331
// sse12_cmp_scalar - sse 1 & 2 compare scalar instructions
// Instantiates the cmp{cc}ss/cmp{cc}sd rr and rm forms plus assembler-only
// "_alt" variants that take the condition code as an explicit immediate.
// immLeaf bounds the accepted immediate (5 bits for AVX, 3 for SSE).
multiclass sse12_cmp_scalar<RegisterClass RC, X86MemOperand x86memop,
                            Operand CC, SDNode OpNode, ValueType VT,
                            PatFrag ld_frag, string asm, string asm_alt,
                            OpndItins itins, ImmLeaf immLeaf> {
  def rr : SIi8<0xC2, MRMSrcReg,
                (outs RC:$dst), (ins RC:$src1, RC:$src2, CC:$cc), asm,
                [(set RC:$dst, (OpNode (VT RC:$src1), RC:$src2, immLeaf:$cc))],
                itins.rr>, Sched<[itins.Sched]>;
  def rm : SIi8<0xC2, MRMSrcMem,
                (outs RC:$dst), (ins RC:$src1, x86memop:$src2, CC:$cc), asm,
                [(set RC:$dst, (OpNode (VT RC:$src1),
                                         (ld_frag addr:$src2), immLeaf:$cc))],
                                         itins.rm>,
           Sched<[itins.Sched.Folded, ReadAfterLd]>;

  // Accept explicit immediate argument form instead of comparison code.
  let isAsmParserOnly = 1, hasSideEffects = 0 in {
    def rr_alt : SIi8<0xC2, MRMSrcReg, (outs RC:$dst),
                      (ins RC:$src1, RC:$src2, u8imm:$cc), asm_alt, [],
                      IIC_SSE_ALU_F32S_RR>, Sched<[itins.Sched]>;
    let mayLoad = 1 in
    def rm_alt : SIi8<0xC2, MRMSrcMem, (outs RC:$dst),
                      (ins RC:$src1, x86memop:$src2, u8imm:$cc), asm_alt, [],
                      IIC_SSE_ALU_F32S_RM>,
                      Sched<[itins.Sched.Folded, ReadAfterLd]>;
  }
}
2360
// AVX scalar compares: three-operand, non-destructive, 5-bit cc immediate.
defm VCMPSS : sse12_cmp_scalar<FR32, f32mem, AVXCC, X86cmps, f32, loadf32,
                 "cmp${cc}ss\t{$src2, $src1, $dst|$dst, $src1, $src2}",
                 "cmpss\t{$cc, $src2, $src1, $dst|$dst, $src1, $src2, $cc}",
                 SSE_ALU_F32S, i8immZExt5>, XS, VEX_4V, VEX_LIG;
defm VCMPSD : sse12_cmp_scalar<FR64, f64mem, AVXCC, X86cmps, f64, loadf64,
                 "cmp${cc}sd\t{$src2, $src1, $dst|$dst, $src1, $src2}",
                 "cmpsd\t{$cc, $src2, $src1, $dst|$dst, $src1, $src2, $cc}",
                 SSE_ALU_F32S, i8immZExt5>, // same latency as 32 bit compare
                 XD, VEX_4V, VEX_LIG;

// Legacy SSE compares: destructive ($src1 = $dst), 3-bit cc immediate.
let Constraints = "$src1 = $dst" in {
  defm CMPSS : sse12_cmp_scalar<FR32, f32mem, SSECC, X86cmps, f32, loadf32,
                  "cmp${cc}ss\t{$src2, $dst|$dst, $src2}",
                  "cmpss\t{$cc, $src2, $dst|$dst, $src2, $cc}", SSE_ALU_F32S,
                  i8immZExt3>, XS;
  defm CMPSD : sse12_cmp_scalar<FR64, f64mem, SSECC, X86cmps, f64, loadf64,
                  "cmp${cc}sd\t{$src2, $dst|$dst, $src2}",
                  "cmpsd\t{$cc, $src2, $dst|$dst, $src2, $cc}",
                  SSE_ALU_F64S, i8immZExt3>, XD;
}
2381
// sse12_cmp_scalar_int - scalar compare forms that operate on full XMM
// operands and select the cmp_ss/cmp_sd intrinsics directly.
multiclass sse12_cmp_scalar_int<X86MemOperand x86memop, Operand CC,
                         Intrinsic Int, string asm, OpndItins itins,
                         ImmLeaf immLeaf> {
  def rr : SIi8<0xC2, MRMSrcReg, (outs VR128:$dst),
                      (ins VR128:$src1, VR128:$src, CC:$cc), asm,
                        [(set VR128:$dst, (Int VR128:$src1,
                                               VR128:$src, immLeaf:$cc))],
                                               itins.rr>,
           Sched<[itins.Sched]>;
  def rm : SIi8<0xC2, MRMSrcMem, (outs VR128:$dst),
                      (ins VR128:$src1, x86memop:$src, CC:$cc), asm,
                        [(set VR128:$dst, (Int VR128:$src1,
                                               (load addr:$src), immLeaf:$cc))],
                                               itins.rm>,
           Sched<[itins.Sched.Folded, ReadAfterLd]>;
}

let isCodeGenOnly = 1 in {
  // Aliases to match intrinsics which expect XMM operand(s).
  defm Int_VCMPSS  : sse12_cmp_scalar_int<f32mem, AVXCC, int_x86_sse_cmp_ss,
                       "cmp${cc}ss\t{$src, $src1, $dst|$dst, $src1, $src}",
                       SSE_ALU_F32S, i8immZExt5>,
                       XS, VEX_4V;
  defm Int_VCMPSD  : sse12_cmp_scalar_int<f64mem, AVXCC, int_x86_sse2_cmp_sd,
                       "cmp${cc}sd\t{$src, $src1, $dst|$dst, $src1, $src}",
                       SSE_ALU_F32S, i8immZExt5>, // same latency as f32
                       XD, VEX_4V;
  let Constraints = "$src1 = $dst" in {
    defm Int_CMPSS  : sse12_cmp_scalar_int<f32mem, SSECC, int_x86_sse_cmp_ss,
                         "cmp${cc}ss\t{$src, $dst|$dst, $src}",
                         SSE_ALU_F32S, i8immZExt3>, XS;
    defm Int_CMPSD  : sse12_cmp_scalar_int<f64mem, SSECC, int_x86_sse2_cmp_sd,
                         "cmp${cc}sd\t{$src, $dst|$dst, $src}",
                         SSE_ALU_F64S, i8immZExt3>,
                         XD;
}
}
2419
2420
// sse12_ord_cmp - Unordered/Ordered scalar fp compare and set EFLAGS
// No register outputs: the compare writes EFLAGS only (Defs set at the
// instantiation sites below).
multiclass sse12_ord_cmp<bits<8> opc, RegisterClass RC, SDNode OpNode,
                            ValueType vt, X86MemOperand x86memop,
                            PatFrag ld_frag, string OpcodeStr> {
  def rr: SI<opc, MRMSrcReg, (outs), (ins RC:$src1, RC:$src2),
                     !strconcat(OpcodeStr, "\t{$src2, $src1|$src1, $src2}"),
                     [(set EFLAGS, (OpNode (vt RC:$src1), RC:$src2))],
                     IIC_SSE_COMIS_RR>,
          Sched<[WriteFAdd]>;
  def rm: SI<opc, MRMSrcMem, (outs), (ins RC:$src1, x86memop:$src2),
                     !strconcat(OpcodeStr, "\t{$src2, $src1|$src1, $src2}"),
                     [(set EFLAGS, (OpNode (vt RC:$src1),
                                           (ld_frag addr:$src2)))],
                                           IIC_SSE_COMIS_RM>,
          Sched<[WriteFAddLd, ReadAfterLd]>;
}
2437
// (u)comiss/(u)comisd instantiations. All write EFLAGS. The Pattern = []<dag>
// groups suppress the default patterns for comis*; the Int_ groups select the
// X86(u)comi nodes on full XMM operands.
let Defs = [EFLAGS] in {
  defm VUCOMISS : sse12_ord_cmp<0x2E, FR32, X86cmp, f32, f32mem, loadf32,
                                  "ucomiss">, PS, VEX, VEX_LIG;
  defm VUCOMISD : sse12_ord_cmp<0x2E, FR64, X86cmp, f64, f64mem, loadf64,
                                  "ucomisd">, PD, VEX, VEX_LIG;
  let Pattern = []<dag> in {
    defm VCOMISS  : sse12_ord_cmp<0x2F, VR128, undef, v4f32, f128mem, load,
                                    "comiss">, PS, VEX, VEX_LIG;
    defm VCOMISD  : sse12_ord_cmp<0x2F, VR128, undef, v2f64, f128mem, load,
                                    "comisd">, PD, VEX, VEX_LIG;
  }

  let isCodeGenOnly = 1 in {
    defm Int_VUCOMISS  : sse12_ord_cmp<0x2E, VR128, X86ucomi, v4f32, f128mem,
                              load, "ucomiss">, PS, VEX;
    defm Int_VUCOMISD  : sse12_ord_cmp<0x2E, VR128, X86ucomi, v2f64, f128mem,
                              load, "ucomisd">, PD, VEX;

    defm Int_VCOMISS  : sse12_ord_cmp<0x2F, VR128, X86comi, v4f32, f128mem,
                              load, "comiss">, PS, VEX;
    defm Int_VCOMISD  : sse12_ord_cmp<0x2F, VR128, X86comi, v2f64, f128mem,
                              load, "comisd">, PD, VEX;
  }
  defm UCOMISS  : sse12_ord_cmp<0x2E, FR32, X86cmp, f32, f32mem, loadf32,
                                  "ucomiss">, PS;
  defm UCOMISD  : sse12_ord_cmp<0x2E, FR64, X86cmp, f64, f64mem, loadf64,
                                  "ucomisd">, PD;

  let Pattern = []<dag> in {
    defm COMISS  : sse12_ord_cmp<0x2F, VR128, undef, v4f32, f128mem, load,
                                    "comiss">, PS;
    defm COMISD  : sse12_ord_cmp<0x2F, VR128, undef, v2f64, f128mem, load,
                                    "comisd">, PD;
  }

  let isCodeGenOnly = 1 in {
    defm Int_UCOMISS  : sse12_ord_cmp<0x2E, VR128, X86ucomi, v4f32, f128mem,
                                load, "ucomiss">, PS;
    defm Int_UCOMISD  : sse12_ord_cmp<0x2E, VR128, X86ucomi, v2f64, f128mem,
                                load, "ucomisd">, PD;

    defm Int_COMISS  : sse12_ord_cmp<0x2F, VR128, X86comi, v4f32, f128mem, load,
                                    "comiss">, PS;
    defm Int_COMISD  : sse12_ord_cmp<0x2F, VR128, X86comi, v2f64, f128mem, load,
                                    "comisd">, PD;
  }
} // Defs = [EFLAGS]
2485
// sse12_cmp_packed - sse 1 & 2 compare packed instructions
// Instantiates the cmp{cc}ps/cmp{cc}pd rri and rmi forms plus assembler-only
// "_alt" variants taking the condition code as an explicit immediate.
multiclass sse12_cmp_packed<RegisterClass RC, X86MemOperand x86memop,
                            Operand CC, Intrinsic Int, string asm,
                            string asm_alt, Domain d, ImmLeaf immLeaf,
                            PatFrag ld_frag, OpndItins itins = SSE_ALU_F32P> {
  let isCommutable = 1 in
  def rri : PIi8<0xC2, MRMSrcReg,
             (outs RC:$dst), (ins RC:$src1, RC:$src2, CC:$cc), asm,
             [(set RC:$dst, (Int RC:$src1, RC:$src2, immLeaf:$cc))],
             itins.rr, d>,
            Sched<[WriteFAdd]>;
  def rmi : PIi8<0xC2, MRMSrcMem,
             (outs RC:$dst), (ins RC:$src1, x86memop:$src2, CC:$cc), asm,
             [(set RC:$dst, (Int RC:$src1, (ld_frag addr:$src2), immLeaf:$cc))],
             itins.rm, d>,
            Sched<[WriteFAddLd, ReadAfterLd]>;

  // Accept explicit immediate argument form instead of comparison code.
  let isAsmParserOnly = 1, hasSideEffects = 0 in {
    def rri_alt : PIi8<0xC2, MRMSrcReg,
               (outs RC:$dst), (ins RC:$src1, RC:$src2, u8imm:$cc),
               asm_alt, [], itins.rr, d>, Sched<[WriteFAdd]>;
    let mayLoad = 1 in
    def rmi_alt : PIi8<0xC2, MRMSrcMem,
               (outs RC:$dst), (ins RC:$src1, x86memop:$src2, u8imm:$cc),
               asm_alt, [], itins.rm, d>,
               Sched<[WriteFAddLd, ReadAfterLd]>;
  }
}
2515
// AVX packed compares: non-destructive three-operand VEX forms for 128-bit
// (VR128) and 256-bit (VR256) vectors, selected via the SSE/AVX compare
// intrinsics.
defm VCMPPS : sse12_cmp_packed<VR128, f128mem, AVXCC, int_x86_sse_cmp_ps,
               "cmp${cc}ps\t{$src2, $src1, $dst|$dst, $src1, $src2}",
               "cmpps\t{$cc, $src2, $src1, $dst|$dst, $src1, $src2, $cc}",
               SSEPackedSingle, i8immZExt5, loadv4f32>, PS, VEX_4V;
defm VCMPPD : sse12_cmp_packed<VR128, f128mem, AVXCC, int_x86_sse2_cmp_pd,
               "cmp${cc}pd\t{$src2, $src1, $dst|$dst, $src1, $src2}",
               "cmppd\t{$cc, $src2, $src1, $dst|$dst, $src1, $src2, $cc}",
               SSEPackedDouble, i8immZExt5, loadv2f64>, PD, VEX_4V;
defm VCMPPSY : sse12_cmp_packed<VR256, f256mem, AVXCC, int_x86_avx_cmp_ps_256,
               "cmp${cc}ps\t{$src2, $src1, $dst|$dst, $src1, $src2}",
               "cmpps\t{$cc, $src2, $src1, $dst|$dst, $src1, $src2, $cc}",
               SSEPackedSingle, i8immZExt5, loadv8f32>, PS, VEX_4V, VEX_L;
defm VCMPPDY : sse12_cmp_packed<VR256, f256mem, AVXCC, int_x86_avx_cmp_pd_256,
               "cmp${cc}pd\t{$src2, $src1, $dst|$dst, $src1, $src2}",
               "cmppd\t{$cc, $src2, $src1, $dst|$dst, $src1, $src2, $cc}",
               SSEPackedDouble, i8immZExt5, loadv4f64>, PD, VEX_4V, VEX_L;
// SSE packed compares: destructive two-operand forms with $src1 tied to $dst.
let Constraints = "$src1 = $dst" in {
  defm CMPPS : sse12_cmp_packed<VR128, f128mem, SSECC, int_x86_sse_cmp_ps,
                 "cmp${cc}ps\t{$src2, $dst|$dst, $src2}",
                 "cmpps\t{$cc, $src2, $dst|$dst, $src2, $cc}",
                 SSEPackedSingle, i8immZExt5, memopv4f32, SSE_ALU_F32P>, PS;
  defm CMPPD : sse12_cmp_packed<VR128, f128mem, SSECC, int_x86_sse2_cmp_pd,
                 "cmp${cc}pd\t{$src2, $dst|$dst, $src2}",
                 "cmppd\t{$cc, $src2, $dst|$dst, $src2, $cc}",
                 SSEPackedDouble, i8immZExt5, memopv2f64, SSE_ALU_F64P>, PD;
}
2542
// Select integer-typed X86cmpp nodes to the AVX FP compare instructions; the
// compare mask result is used with the matching integer vector type.
let Predicates = [HasAVX] in {
def : Pat<(v4i32 (X86cmpp (v4f32 VR128:$src1), VR128:$src2, imm:$cc)),
          (VCMPPSrri (v4f32 VR128:$src1), (v4f32 VR128:$src2), imm:$cc)>;
def : Pat<(v4i32 (X86cmpp (v4f32 VR128:$src1), (loadv4f32 addr:$src2), imm:$cc)),
          (VCMPPSrmi (v4f32 VR128:$src1), addr:$src2, imm:$cc)>;
def : Pat<(v2i64 (X86cmpp (v2f64 VR128:$src1), VR128:$src2, imm:$cc)),
          (VCMPPDrri VR128:$src1, VR128:$src2, imm:$cc)>;
def : Pat<(v2i64 (X86cmpp (v2f64 VR128:$src1), (loadv2f64 addr:$src2), imm:$cc)),
          (VCMPPDrmi VR128:$src1, addr:$src2, imm:$cc)>;

// 256-bit variants of the same folds.
def : Pat<(v8i32 (X86cmpp (v8f32 VR256:$src1), VR256:$src2, imm:$cc)),
          (VCMPPSYrri (v8f32 VR256:$src1), (v8f32 VR256:$src2), imm:$cc)>;
def : Pat<(v8i32 (X86cmpp (v8f32 VR256:$src1), (loadv8f32 addr:$src2), imm:$cc)),
          (VCMPPSYrmi (v8f32 VR256:$src1), addr:$src2, imm:$cc)>;
def : Pat<(v4i64 (X86cmpp (v4f64 VR256:$src1), VR256:$src2, imm:$cc)),
          (VCMPPDYrri VR256:$src1, VR256:$src2, imm:$cc)>;
def : Pat<(v4i64 (X86cmpp (v4f64 VR256:$src1), (loadv4f64 addr:$src2), imm:$cc)),
          (VCMPPDYrmi VR256:$src1, addr:$src2, imm:$cc)>;
}
2562
// SSE1 counterpart of the integer-typed compare patterns above (CMPPS only;
// note the memop fragment instead of the AVX unaligned load).
let Predicates = [UseSSE1] in {
def : Pat<(v4i32 (X86cmpp (v4f32 VR128:$src1), VR128:$src2, imm:$cc)),
          (CMPPSrri (v4f32 VR128:$src1), (v4f32 VR128:$src2), imm:$cc)>;
def : Pat<(v4i32 (X86cmpp (v4f32 VR128:$src1), (memopv4f32 addr:$src2), imm:$cc)),
          (CMPPSrmi (v4f32 VR128:$src1), addr:$src2, imm:$cc)>;
}
2569
// SSE2 counterpart for the double-precision compare (CMPPD).
let Predicates = [UseSSE2] in {
def : Pat<(v2i64 (X86cmpp (v2f64 VR128:$src1), VR128:$src2, imm:$cc)),
          (CMPPDrri VR128:$src1, VR128:$src2, imm:$cc)>;
def : Pat<(v2i64 (X86cmpp (v2f64 VR128:$src1), (memopv2f64 addr:$src2), imm:$cc)),
          (CMPPDrmi VR128:$src1, addr:$src2, imm:$cc)>;
}
2576
2577//===----------------------------------------------------------------------===//
2578// SSE 1 & 2 - Shuffle Instructions
2579//===----------------------------------------------------------------------===//
2580
/// sse12_shuffle - sse 1 & 2 fp shuffle instructions
///
/// Emits the SHUFPS/SHUFPD family (opcode 0xC6): select elements from
/// $src1/$src2 according to the 8-bit immediate $src3, matched via the
/// X86Shufp DAG node with value type `vt`.
multiclass sse12_shuffle<RegisterClass RC, X86MemOperand x86memop,
                         ValueType vt, string asm, PatFrag mem_frag,
                         Domain d> {
  // Register-memory form: second source loaded via mem_frag.
  def rmi : PIi8<0xC6, MRMSrcMem, (outs RC:$dst),
                   (ins RC:$src1, x86memop:$src2, u8imm:$src3), asm,
                   [(set RC:$dst, (vt (X86Shufp RC:$src1, (mem_frag addr:$src2),
                                       (i8 imm:$src3))))], IIC_SSE_SHUFP, d>,
            Sched<[WriteFShuffleLd, ReadAfterLd]>;
  // Register-register form.
  def rri : PIi8<0xC6, MRMSrcReg, (outs RC:$dst),
                 (ins RC:$src1, RC:$src2, u8imm:$src3), asm,
                 [(set RC:$dst, (vt (X86Shufp RC:$src1, RC:$src2,
                                     (i8 imm:$src3))))], IIC_SSE_SHUFP, d>,
            Sched<[WriteFShuffle]>;
}
2596
// AVX shuffles: three-operand VEX forms for 128-bit and 256-bit vectors.
defm VSHUFPS  : sse12_shuffle<VR128, f128mem, v4f32,
           "shufps\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
           loadv4f32, SSEPackedSingle>, PS, VEX_4V;
defm VSHUFPSY : sse12_shuffle<VR256, f256mem, v8f32,
           "shufps\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
           loadv8f32, SSEPackedSingle>, PS, VEX_4V, VEX_L;
defm VSHUFPD  : sse12_shuffle<VR128, f128mem, v2f64,
           "shufpd\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
           loadv2f64, SSEPackedDouble>, PD, VEX_4V;
defm VSHUFPDY : sse12_shuffle<VR256, f256mem, v4f64,
           "shufpd\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
           loadv4f64, SSEPackedDouble>, PD, VEX_4V, VEX_L;

// SSE shuffles: destructive two-operand forms with $src1 tied to $dst.
let Constraints = "$src1 = $dst" in {
  defm SHUFPS : sse12_shuffle<VR128, f128mem, v4f32,
                    "shufps\t{$src3, $src2, $dst|$dst, $src2, $src3}",
                    memopv4f32, SSEPackedSingle>, PS;
  defm SHUFPD : sse12_shuffle<VR128, f128mem, v2f64,
                    "shufpd\t{$src3, $src2, $dst|$dst, $src2, $src3}",
                    memopv2f64, SSEPackedDouble>, PD;
}
2618
// Select integer-typed X86Shufp nodes to the AVX FP shuffle instructions.
// Memory operands of the integer DAG are i64-vector loads, bitcast where the
// element width differs.
let Predicates = [HasAVX] in {
  def : Pat<(v4i32 (X86Shufp VR128:$src1,
                       (bc_v4i32 (loadv2i64 addr:$src2)), (i8 imm:$imm))),
            (VSHUFPSrmi VR128:$src1, addr:$src2, imm:$imm)>;
  def : Pat<(v4i32 (X86Shufp VR128:$src1, VR128:$src2, (i8 imm:$imm))),
            (VSHUFPSrri VR128:$src1, VR128:$src2, imm:$imm)>;

  def : Pat<(v2i64 (X86Shufp VR128:$src1,
                       (loadv2i64 addr:$src2), (i8 imm:$imm))),
            (VSHUFPDrmi VR128:$src1, addr:$src2, imm:$imm)>;
  def : Pat<(v2i64 (X86Shufp VR128:$src1, VR128:$src2, (i8 imm:$imm))),
            (VSHUFPDrri VR128:$src1, VR128:$src2, imm:$imm)>;

  // 256-bit patterns
  def : Pat<(v8i32 (X86Shufp VR256:$src1, VR256:$src2, (i8 imm:$imm))),
            (VSHUFPSYrri VR256:$src1, VR256:$src2, imm:$imm)>;
  def : Pat<(v8i32 (X86Shufp VR256:$src1,
                      (bc_v8i32 (loadv4i64 addr:$src2)), (i8 imm:$imm))),
            (VSHUFPSYrmi VR256:$src1, addr:$src2, imm:$imm)>;

  def : Pat<(v4i64 (X86Shufp VR256:$src1, VR256:$src2, (i8 imm:$imm))),
            (VSHUFPDYrri VR256:$src1, VR256:$src2, imm:$imm)>;
  def : Pat<(v4i64 (X86Shufp VR256:$src1,
                              (loadv4i64 addr:$src2), (i8 imm:$imm))),
            (VSHUFPDYrmi VR256:$src1, addr:$src2, imm:$imm)>;
}
2645
// SSE1 counterpart of the integer SHUFPS patterns (memop load fragments).
let Predicates = [UseSSE1] in {
  def : Pat<(v4i32 (X86Shufp VR128:$src1,
                       (bc_v4i32 (memopv2i64 addr:$src2)), (i8 imm:$imm))),
            (SHUFPSrmi VR128:$src1, addr:$src2, imm:$imm)>;
  def : Pat<(v4i32 (X86Shufp VR128:$src1, VR128:$src2, (i8 imm:$imm))),
            (SHUFPSrri VR128:$src1, VR128:$src2, imm:$imm)>;
}
2653
// SSE2 counterpart of the integer SHUFPD patterns.
let Predicates = [UseSSE2] in {
  // Generic SHUFPD patterns
  def : Pat<(v2i64 (X86Shufp VR128:$src1,
                       (memopv2i64 addr:$src2), (i8 imm:$imm))),
            (SHUFPDrmi VR128:$src1, addr:$src2, imm:$imm)>;
  def : Pat<(v2i64 (X86Shufp VR128:$src1, VR128:$src2, (i8 imm:$imm))),
            (SHUFPDrri VR128:$src1, VR128:$src2, imm:$imm)>;
}
2662
2663//===----------------------------------------------------------------------===//
2664// SSE 1 & 2 - Unpack FP Instructions
2665//===----------------------------------------------------------------------===//
2666
/// sse12_unpack_interleave - sse 1 & 2 fp unpack and interleave
///
/// Emits UNPCKLP*/UNPCKHP*-style instructions. OpNode is the X86Unpckl or
/// X86Unpckh DAG node; `opc` selects the low (0x14) or high (0x15) opcode at
/// the instantiation sites below.
multiclass sse12_unpack_interleave<bits<8> opc, SDNode OpNode, ValueType vt,
                                   PatFrag mem_frag, RegisterClass RC,
                                   X86MemOperand x86memop, string asm,
                                   Domain d> {
    // Register-register form.
    def rr : PI<opc, MRMSrcReg,
                (outs RC:$dst), (ins RC:$src1, RC:$src2),
                asm, [(set RC:$dst,
                           (vt (OpNode RC:$src1, RC:$src2)))],
                           IIC_SSE_UNPCK, d>, Sched<[WriteFShuffle]>;
    // Register-memory form: second source loaded via mem_frag.
    def rm : PI<opc, MRMSrcMem,
                (outs RC:$dst), (ins RC:$src1, x86memop:$src2),
                asm, [(set RC:$dst,
                           (vt (OpNode RC:$src1,
                                       (mem_frag addr:$src2))))],
                                       IIC_SSE_UNPCK, d>,
             Sched<[WriteFShuffleLd, ReadAfterLd]>;
}
2685
// AVX 128-bit unpack: three-operand VEX forms. 0x15 = high, 0x14 = low.
defm VUNPCKHPS: sse12_unpack_interleave<0x15, X86Unpckh, v4f32, loadv4f32,
      VR128, f128mem, "unpckhps\t{$src2, $src1, $dst|$dst, $src1, $src2}",
                     SSEPackedSingle>, PS, VEX_4V;
defm VUNPCKHPD: sse12_unpack_interleave<0x15, X86Unpckh, v2f64, loadv2f64,
      VR128, f128mem, "unpckhpd\t{$src2, $src1, $dst|$dst, $src1, $src2}",
                     SSEPackedDouble>, PD, VEX_4V;
defm VUNPCKLPS: sse12_unpack_interleave<0x14, X86Unpckl, v4f32, loadv4f32,
      VR128, f128mem, "unpcklps\t{$src2, $src1, $dst|$dst, $src1, $src2}",
                     SSEPackedSingle>, PS, VEX_4V;
defm VUNPCKLPD: sse12_unpack_interleave<0x14, X86Unpckl, v2f64, loadv2f64,
      VR128, f128mem, "unpcklpd\t{$src2, $src1, $dst|$dst, $src1, $src2}",
                     SSEPackedDouble>, PD, VEX_4V;

// AVX 256-bit unpack (YMM, VEX.L set).
defm VUNPCKHPSY: sse12_unpack_interleave<0x15, X86Unpckh, v8f32, loadv8f32,
      VR256, f256mem, "unpckhps\t{$src2, $src1, $dst|$dst, $src1, $src2}",
                     SSEPackedSingle>, PS, VEX_4V, VEX_L;
defm VUNPCKHPDY: sse12_unpack_interleave<0x15, X86Unpckh, v4f64, loadv4f64,
      VR256, f256mem, "unpckhpd\t{$src2, $src1, $dst|$dst, $src1, $src2}",
                     SSEPackedDouble>, PD, VEX_4V, VEX_L;
defm VUNPCKLPSY: sse12_unpack_interleave<0x14, X86Unpckl, v8f32, loadv8f32,
      VR256, f256mem, "unpcklps\t{$src2, $src1, $dst|$dst, $src1, $src2}",
                     SSEPackedSingle>, PS, VEX_4V, VEX_L;
defm VUNPCKLPDY: sse12_unpack_interleave<0x14, X86Unpckl, v4f64, loadv4f64,
      VR256, f256mem, "unpcklpd\t{$src2, $src1, $dst|$dst, $src1, $src2}",
                     SSEPackedDouble>, PD, VEX_4V, VEX_L;

// SSE unpack: destructive two-operand forms with $src1 tied to $dst.
let Constraints = "$src1 = $dst" in {
  defm UNPCKHPS: sse12_unpack_interleave<0x15, X86Unpckh, v4f32, memopv4f32,
        VR128, f128mem, "unpckhps\t{$src2, $dst|$dst, $src2}",
                       SSEPackedSingle>, PS;
  defm UNPCKHPD: sse12_unpack_interleave<0x15, X86Unpckh, v2f64, memopv2f64,
        VR128, f128mem, "unpckhpd\t{$src2, $dst|$dst, $src2}",
                       SSEPackedDouble>, PD;
  defm UNPCKLPS: sse12_unpack_interleave<0x14, X86Unpckl, v4f32, memopv4f32,
        VR128, f128mem, "unpcklps\t{$src2, $dst|$dst, $src2}",
                       SSEPackedSingle>, PS;
  defm UNPCKLPD: sse12_unpack_interleave<0x14, X86Unpckl, v2f64, memopv2f64,
        VR128, f128mem, "unpcklpd\t{$src2, $dst|$dst, $src2}",
                       SSEPackedDouble>, PD;
} // Constraints = "$src1 = $dst"
2726
// On AVX1-only targets, select 256-bit integer unpack nodes to the FP unpack
// instructions (the AVX2 integer forms are unavailable under this predicate).
let Predicates = [HasAVX1Only] in {
  def : Pat<(v8i32 (X86Unpckl VR256:$src1, (bc_v8i32 (loadv4i64 addr:$src2)))),
            (VUNPCKLPSYrm VR256:$src1, addr:$src2)>;
  def : Pat<(v8i32 (X86Unpckl VR256:$src1, VR256:$src2)),
            (VUNPCKLPSYrr VR256:$src1, VR256:$src2)>;
  def : Pat<(v8i32 (X86Unpckh VR256:$src1, (bc_v8i32 (loadv4i64 addr:$src2)))),
            (VUNPCKHPSYrm VR256:$src1, addr:$src2)>;
  def : Pat<(v8i32 (X86Unpckh VR256:$src1, VR256:$src2)),
            (VUNPCKHPSYrr VR256:$src1, VR256:$src2)>;

  def : Pat<(v4i64 (X86Unpckl VR256:$src1, (loadv4i64 addr:$src2))),
            (VUNPCKLPDYrm VR256:$src1, addr:$src2)>;
  def : Pat<(v4i64 (X86Unpckl VR256:$src1, VR256:$src2)),
            (VUNPCKLPDYrr VR256:$src1, VR256:$src2)>;
  def : Pat<(v4i64 (X86Unpckh VR256:$src1, (loadv4i64 addr:$src2))),
            (VUNPCKHPDYrm VR256:$src1, addr:$src2)>;
  def : Pat<(v4i64 (X86Unpckh VR256:$src1, VR256:$src2)),
            (VUNPCKHPDYrr VR256:$src1, VR256:$src2)>;
}
2746
2747//===----------------------------------------------------------------------===//
2748// SSE 1 & 2 - Extract Floating-Point Sign mask
2749//===----------------------------------------------------------------------===//
2750
/// sse12_extr_sign_mask - extract the packed FP sign bits into a GPR
/// (MOVMSKPS/MOVMSKPD, opcode 0x50), matched via the movmsk intrinsic Int.
/// (The previous comment, "unpack and interleave", was a copy-paste error.)
multiclass sse12_extr_sign_mask<RegisterClass RC, Intrinsic Int, string asm,
                                Domain d> {
  def rr : PI<0x50, MRMSrcReg, (outs GR32orGR64:$dst), (ins RC:$src),
              !strconcat(asm, "\t{$src, $dst|$dst, $src}"),
              [(set GR32orGR64:$dst, (Int RC:$src))], IIC_SSE_MOVMSK, d>,
              Sched<[WriteVecLogic]>;
}
2759
// AVX sign-mask extraction, plus patterns that implement scalar X86fgetsign
// by running the 128-bit MOVMSK on the scalar's vector register; the i64
// results are built with SUBREG_TO_REG from the 32-bit instruction result.
let Predicates = [HasAVX] in {
  defm VMOVMSKPS : sse12_extr_sign_mask<VR128, int_x86_sse_movmsk_ps,
                                        "movmskps", SSEPackedSingle>, PS, VEX;
  defm VMOVMSKPD : sse12_extr_sign_mask<VR128, int_x86_sse2_movmsk_pd,
                                        "movmskpd", SSEPackedDouble>, PD, VEX;
  defm VMOVMSKPSY : sse12_extr_sign_mask<VR256, int_x86_avx_movmsk_ps_256,
                                        "movmskps", SSEPackedSingle>, PS,
                                        VEX, VEX_L;
  defm VMOVMSKPDY : sse12_extr_sign_mask<VR256, int_x86_avx_movmsk_pd_256,
                                        "movmskpd", SSEPackedDouble>, PD,
                                        VEX, VEX_L;

  def : Pat<(i32 (X86fgetsign FR32:$src)),
            (VMOVMSKPSrr (COPY_TO_REGCLASS FR32:$src, VR128))>;
  def : Pat<(i64 (X86fgetsign FR32:$src)),
            (SUBREG_TO_REG (i64 0),
             (VMOVMSKPSrr (COPY_TO_REGCLASS FR32:$src, VR128)), sub_32bit)>;
  def : Pat<(i32 (X86fgetsign FR64:$src)),
            (VMOVMSKPDrr (COPY_TO_REGCLASS FR64:$src, VR128))>;
  def : Pat<(i64 (X86fgetsign FR64:$src)),
            (SUBREG_TO_REG (i64 0),
             (VMOVMSKPDrr (COPY_TO_REGCLASS FR64:$src, VR128)), sub_32bit)>;
}
2783
// Non-VEX (SSE) sign-mask extraction and the corresponding X86fgetsign
// patterns, gated per scalar type on UseSSE1/UseSSE2.
defm MOVMSKPS : sse12_extr_sign_mask<VR128, int_x86_sse_movmsk_ps, "movmskps",
                                     SSEPackedSingle>, PS;
defm MOVMSKPD : sse12_extr_sign_mask<VR128, int_x86_sse2_movmsk_pd, "movmskpd",
                                     SSEPackedDouble>, PD;

def : Pat<(i32 (X86fgetsign FR32:$src)),
          (MOVMSKPSrr (COPY_TO_REGCLASS FR32:$src, VR128))>,
      Requires<[UseSSE1]>;
def : Pat<(i64 (X86fgetsign FR32:$src)),
          (SUBREG_TO_REG (i64 0),
           (MOVMSKPSrr (COPY_TO_REGCLASS FR32:$src, VR128)), sub_32bit)>,
      Requires<[UseSSE1]>;
def : Pat<(i32 (X86fgetsign FR64:$src)),
          (MOVMSKPDrr (COPY_TO_REGCLASS FR64:$src, VR128))>,
      Requires<[UseSSE2]>;
def : Pat<(i64 (X86fgetsign FR64:$src)),
          (SUBREG_TO_REG (i64 0),
           (MOVMSKPDrr (COPY_TO_REGCLASS FR64:$src, VR128)), sub_32bit)>,
      Requires<[UseSSE2]>;
2803
2804//===---------------------------------------------------------------------===//
2805// SSE2 - Packed Integer Logical Instructions
2806//===---------------------------------------------------------------------===//
2807
let ExeDomain = SSEPackedInt in { // SSE integer instructions

/// PDI_binop_rm - Simple SSE2 binary operator.
///
/// Is2Addr selects the destructive two-operand SSE asm string versus the
/// three-operand AVX string; IsCommutable is forwarded to the rr form. The
/// memory operand is always matched through a bitconvert of memop_frag so the
/// same i64-vector load fragments serve all element widths.
multiclass PDI_binop_rm<bits<8> opc, string OpcodeStr, SDNode OpNode,
                        ValueType OpVT, RegisterClass RC, PatFrag memop_frag,
                        X86MemOperand x86memop, OpndItins itins,
                        bit IsCommutable, bit Is2Addr> {
  let isCommutable = IsCommutable in
  def rr : PDI<opc, MRMSrcReg, (outs RC:$dst),
       (ins RC:$src1, RC:$src2),
       !if(Is2Addr,
           !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
           !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
       [(set RC:$dst, (OpVT (OpNode RC:$src1, RC:$src2)))], itins.rr>,
       Sched<[itins.Sched]>;
  def rm : PDI<opc, MRMSrcMem, (outs RC:$dst),
       (ins RC:$src1, x86memop:$src2),
       !if(Is2Addr,
           !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
           !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
       [(set RC:$dst, (OpVT (OpNode RC:$src1,
                                     (bitconvert (memop_frag addr:$src2)))))],
                                     itins.rm>,
       Sched<[itins.Sched.Folded, ReadAfterLd]>;
}
} // ExeDomain = SSEPackedInt
2834
/// PDI_binop_all - Instantiate a packed-integer binop in all three flavors:
/// the AVX 128-bit VEX form (V prefix), the SSE 128-bit tied form, and the
/// AVX2 256-bit form (V prefix + Y suffix), each under its own predicate.
multiclass PDI_binop_all<bits<8> opc, string OpcodeStr, SDNode Opcode,
                         ValueType OpVT128, ValueType OpVT256,
                         OpndItins itins, bit IsCommutable = 0> {
let Predicates = [HasAVX, NoVLX] in
  defm V#NAME : PDI_binop_rm<opc, !strconcat("v", OpcodeStr), Opcode, OpVT128,
                    VR128, loadv2i64, i128mem, itins, IsCommutable, 0>, VEX_4V;

let Constraints = "$src1 = $dst" in
  defm NAME : PDI_binop_rm<opc, OpcodeStr, Opcode, OpVT128, VR128,
                           memopv2i64, i128mem, itins, IsCommutable, 1>;

let Predicates = [HasAVX2, NoVLX] in
  defm V#NAME#Y : PDI_binop_rm<opc, !strconcat("v", OpcodeStr), Opcode,
                               OpVT256, VR256, loadv4i64, i256mem, itins,
                               IsCommutable, 0>, VEX_4V, VEX_L;
}
2851
// These are ordered here for pattern ordering requirements with the fp versions
// Packed integer bitwise logic; PANDN is not commutable (its first operand is
// complemented by X86andnp).

defm PAND  : PDI_binop_all<0xDB, "pand", and, v2i64, v4i64,
                           SSE_VEC_BIT_ITINS_P, 1>;
defm POR   : PDI_binop_all<0xEB, "por", or, v2i64, v4i64,
                           SSE_VEC_BIT_ITINS_P, 1>;
defm PXOR  : PDI_binop_all<0xEF, "pxor", xor, v2i64, v4i64,
                           SSE_VEC_BIT_ITINS_P, 1>;
defm PANDN : PDI_binop_all<0xDF, "pandn", X86andnp, v2i64, v4i64,
                           SSE_VEC_BIT_ITINS_P, 0>;
2862
2863//===----------------------------------------------------------------------===//
2864// SSE 1 & 2 - Logical Instructions
2865//===----------------------------------------------------------------------===//
2866
// Multiclass for scalars using the X86 logical operation aliases for FP.
// Instantiates AVX (VEX, non-destructive) and SSE (tied) variants operating
// on FR32/FR64 scalar registers through the packed logical opcodes; the
// *_128 load fragments suggest the memory operand is read at full 128-bit
// width — confirm against their definitions if changing the memory forms.
multiclass sse12_fp_packed_scalar_logical_alias<
    bits<8> opc, string OpcodeStr, SDNode OpNode, OpndItins itins> {
  defm V#NAME#PS : sse12_fp_packed<opc, !strconcat(OpcodeStr, "ps"), OpNode,
                FR32, f32, f128mem, loadf32_128, SSEPackedSingle, itins, 0>,
                PS, VEX_4V;

  defm V#NAME#PD : sse12_fp_packed<opc, !strconcat(OpcodeStr, "pd"), OpNode,
                FR64, f64, f128mem, loadf64_128, SSEPackedDouble, itins, 0>,
                PD, VEX_4V;

  let Constraints = "$src1 = $dst" in {
    defm PS : sse12_fp_packed<opc, !strconcat(OpcodeStr, "ps"), OpNode, FR32,
                f32, f128mem, memopfsf32_128, SSEPackedSingle, itins>, PS;

    defm PD : sse12_fp_packed<opc, !strconcat(OpcodeStr, "pd"), OpNode, FR64,
                f64, f128mem, memopfsf64_128, SSEPackedDouble, itins>, PD;
  }
}
2886
// Scalar FP logical aliases (Fs* = "scalar" forms of AND/OR/XOR/ANDN).
// isCodeGenOnly: these share encodings with the packed instructions and exist
// only for instruction selection, not for the assembler.
let isCodeGenOnly = 1 in {
  defm FsAND  : sse12_fp_packed_scalar_logical_alias<0x54, "and", X86fand,
                SSE_BIT_ITINS_P>;
  defm FsOR   : sse12_fp_packed_scalar_logical_alias<0x56, "or", X86for,
                SSE_BIT_ITINS_P>;
  defm FsXOR  : sse12_fp_packed_scalar_logical_alias<0x57, "xor", X86fxor,
                SSE_BIT_ITINS_P>;

  let isCommutable = 0 in
    defm FsANDN : sse12_fp_packed_scalar_logical_alias<0x55, "andn", X86fandn,
                  SSE_BIT_ITINS_P>;
}
2899
// Multiclass for vectors using the X86 logical operation aliases for FP.
// Instantiates AVX (VEX, non-destructive, HasAVX+NoVLX) and SSE (tied)
// 128-bit variants matched on the FP vector types v4f32/v2f64.
multiclass sse12_fp_packed_vector_logical_alias<
    bits<8> opc, string OpcodeStr, SDNode OpNode, OpndItins itins> {
  let Predicates = [HasAVX, NoVLX] in {
  defm V#NAME#PS : sse12_fp_packed<opc, !strconcat(OpcodeStr, "ps"), OpNode,
              VR128, v4f32, f128mem, loadv4f32, SSEPackedSingle, itins, 0>,
              PS, VEX_4V;

  defm V#NAME#PD : sse12_fp_packed<opc, !strconcat(OpcodeStr, "pd"), OpNode,
        VR128, v2f64, f128mem, loadv2f64, SSEPackedDouble, itins, 0>,
        PD, VEX_4V;
  }

  let Constraints = "$src1 = $dst" in {
    defm PS : sse12_fp_packed<opc, !strconcat(OpcodeStr, "ps"), OpNode, VR128,
                v4f32, f128mem, memopv4f32, SSEPackedSingle, itins>,
                PS;

    defm PD : sse12_fp_packed<opc, !strconcat(OpcodeStr, "pd"), OpNode, VR128,
                v2f64, f128mem, memopv2f64, SSEPackedDouble, itins>,
                PD;
  }
}
2923
// Vector FP logical aliases (Fv* = FP-typed vector forms of AND/OR/XOR/ANDN).
// isCodeGenOnly: selection-only duplicates of the packed logical encodings.
let isCodeGenOnly = 1 in {
  defm FvAND  : sse12_fp_packed_vector_logical_alias<0x54, "and", X86fand,
                SSE_BIT_ITINS_P>;
  defm FvOR   : sse12_fp_packed_vector_logical_alias<0x56, "or", X86for,
                SSE_BIT_ITINS_P>;
  defm FvXOR  : sse12_fp_packed_vector_logical_alias<0x57, "xor", X86fxor,
                SSE_BIT_ITINS_P>;

  let isCommutable = 0 in
    defm FvANDN : sse12_fp_packed_vector_logical_alias<0x55, "andn", X86fandn,
                  SSE_BIT_ITINS_P>;
}
2936
/// sse12_fp_packed_logical - SSE 1 & 2 packed FP logical ops
///
/// The patterns operate on the promoted integer types (v2i64/v4i64, bitcast
/// from the FP register contents); each instantiation passes a rr-pattern
/// list and an rm-pattern list to sse12_fp_packed_logical_rm.
multiclass sse12_fp_packed_logical<bits<8> opc, string OpcodeStr,
                                   SDNode OpNode> {
  let Predicates = [HasAVX, NoVLX] in {
  // 256-bit AVX forms.
  defm V#NAME#PSY : sse12_fp_packed_logical_rm<opc, VR256, SSEPackedSingle,
        !strconcat(OpcodeStr, "ps"), f256mem,
        [(set VR256:$dst, (v4i64 (OpNode VR256:$src1, VR256:$src2)))],
        [(set VR256:$dst, (OpNode (bc_v4i64 (v8f32 VR256:$src1)),
                           (loadv4i64 addr:$src2)))], 0>, PS, VEX_4V, VEX_L;

  defm V#NAME#PDY : sse12_fp_packed_logical_rm<opc, VR256, SSEPackedDouble,
        !strconcat(OpcodeStr, "pd"), f256mem,
        [(set VR256:$dst, (OpNode (bc_v4i64 (v4f64 VR256:$src1)),
                                  (bc_v4i64 (v4f64 VR256:$src2))))],
        [(set VR256:$dst, (OpNode (bc_v4i64 (v4f64 VR256:$src1)),
                                  (loadv4i64 addr:$src2)))], 0>,
                                  PD, VEX_4V, VEX_L;

  // In AVX no need to add a pattern for 128-bit logical rr ps, because they
  // are all promoted to v2i64, and the patterns are covered by the int
  // version. This is needed in SSE only, because v2i64 isn't supported on
  // SSE1, but only on SSE2.
  defm V#NAME#PS : sse12_fp_packed_logical_rm<opc, VR128, SSEPackedSingle,
       !strconcat(OpcodeStr, "ps"), f128mem, [],
       [(set VR128:$dst, (OpNode (bc_v2i64 (v4f32 VR128:$src1)),
                                 (loadv2i64 addr:$src2)))], 0>, PS, VEX_4V;

  defm V#NAME#PD : sse12_fp_packed_logical_rm<opc, VR128, SSEPackedDouble,
       !strconcat(OpcodeStr, "pd"), f128mem,
       [(set VR128:$dst, (OpNode (bc_v2i64 (v2f64 VR128:$src1)),
                                 (bc_v2i64 (v2f64 VR128:$src2))))],
       [(set VR128:$dst, (OpNode (bc_v2i64 (v2f64 VR128:$src1)),
                                 (loadv2i64 addr:$src2)))], 0>,
                                                 PD, VEX_4V;
  }

  // SSE forms: destructive, $src1 tied to $dst.
  let Constraints = "$src1 = $dst" in {
    defm PS : sse12_fp_packed_logical_rm<opc, VR128, SSEPackedSingle,
         !strconcat(OpcodeStr, "ps"), f128mem,
         [(set VR128:$dst, (v2i64 (OpNode VR128:$src1, VR128:$src2)))],
         [(set VR128:$dst, (OpNode (bc_v2i64 (v4f32 VR128:$src1)),
                                   (memopv2i64 addr:$src2)))]>, PS;

    defm PD : sse12_fp_packed_logical_rm<opc, VR128, SSEPackedDouble,
         !strconcat(OpcodeStr, "pd"), f128mem,
         [(set VR128:$dst, (OpNode (bc_v2i64 (v2f64 VR128:$src1)),
                                   (bc_v2i64 (v2f64 VR128:$src2))))],
         [(set VR128:$dst, (OpNode (bc_v2i64 (v2f64 VR128:$src1)),
                                   (memopv2i64 addr:$src2)))]>, PD;
  }
}
2989
// Packed FP logical instructions: ANDPS/ANDPD, ORPS/ORPD, XORPS/XORPD and the
// non-commutable ANDNPS/ANDNPD (X86andnp complements its first operand).
defm AND  : sse12_fp_packed_logical<0x54, "and", and>;
defm OR   : sse12_fp_packed_logical<0x56, "or", or>;
defm XOR  : sse12_fp_packed_logical<0x57, "xor", xor>;
let isCommutable = 0 in
  defm ANDN : sse12_fp_packed_logical<0x55, "andn", X86andnp>;
2995
// AVX1 requires type coercions in order to fold loads directly into logical
// operations. Match a bitcast-to-v8f32 of an integer logical op with a loaded
// operand onto the 256-bit FP logical memory forms.
let Predicates = [HasAVX1Only] in {
  def : Pat<(bc_v8f32 (and VR256:$src1, (loadv4i64 addr:$src2))),
            (VANDPSYrm VR256:$src1, addr:$src2)>;
  def : Pat<(bc_v8f32 (or VR256:$src1, (loadv4i64 addr:$src2))),
            (VORPSYrm VR256:$src1, addr:$src2)>;
  def : Pat<(bc_v8f32 (xor VR256:$src1, (loadv4i64 addr:$src2))),
            (VXORPSYrm VR256:$src1, addr:$src2)>;
  def : Pat<(bc_v8f32 (X86andnp VR256:$src1, (loadv4i64 addr:$src2))),
            (VANDNPSYrm VR256:$src1, addr:$src2)>;
}
3008
3009//===----------------------------------------------------------------------===//
3010// SSE 1 & 2 - Arithmetic Instructions
3011//===----------------------------------------------------------------------===//
3012
3013/// basic_sse12_fp_binop_xxx - SSE 1 & 2 binops come in both scalar and
3014/// vector forms.
3015///
3016/// In addition, we also have a special variant of the scalar form here to
3017/// represent the associated intrinsic operation.  This form is unlike the
3018/// plain scalar form, in that it takes an entire vector (instead of a scalar)
3019/// and leaves the top elements unmodified (therefore these cannot be commuted).
3020///
3021/// These three forms can each be reg+reg or reg+mem.
3022///
3023
3024/// FIXME: once all 256-bit intrinsics are matched, cleanup and refactor those
3025/// classes below
/// basic_sse12_fp_binop_p - Packed forms of a basic SSE 1 & 2 FP binop:
/// AVX 128-bit, AVX 256-bit (both VEX, non-destructive, HasAVX+NoVLX) and
/// the SSE 128-bit tied forms, in both single and double precision.
multiclass basic_sse12_fp_binop_p<bits<8> opc, string OpcodeStr,
                                  SDNode OpNode, SizeItins itins> {
  let Predicates = [HasAVX, NoVLX] in {
  defm V#NAME#PS : sse12_fp_packed<opc, !strconcat(OpcodeStr, "ps"), OpNode,
                               VR128, v4f32, f128mem, loadv4f32,
                               SSEPackedSingle, itins.s, 0>, PS, VEX_4V;
  defm V#NAME#PD : sse12_fp_packed<opc, !strconcat(OpcodeStr, "pd"), OpNode,
                               VR128, v2f64, f128mem, loadv2f64,
                               SSEPackedDouble, itins.d, 0>, PD, VEX_4V;

  defm V#NAME#PSY : sse12_fp_packed<opc, !strconcat(OpcodeStr, "ps"),
                        OpNode, VR256, v8f32, f256mem, loadv8f32,
                        SSEPackedSingle, itins.s, 0>, PS, VEX_4V, VEX_L;
  defm V#NAME#PDY : sse12_fp_packed<opc, !strconcat(OpcodeStr, "pd"),
                        OpNode, VR256, v4f64, f256mem, loadv4f64,
                        SSEPackedDouble, itins.d, 0>, PD, VEX_4V, VEX_L;
  }

  let Constraints = "$src1 = $dst" in {
    defm PS : sse12_fp_packed<opc, !strconcat(OpcodeStr, "ps"), OpNode, VR128,
                              v4f32, f128mem, memopv4f32, SSEPackedSingle,
                              itins.s>, PS;
    defm PD : sse12_fp_packed<opc, !strconcat(OpcodeStr, "pd"), OpNode, VR128,
                              v2f64, f128mem, memopv2f64, SSEPackedDouble,
                              itins.d>, PD;
  }
}
3053
/// basic_sse12_fp_binop_s - Scalar forms (SS/SD) of a basic SSE 1 & 2 FP
/// binop, operating on FR32/FR64: AVX VEX forms plus SSE tied forms.
multiclass basic_sse12_fp_binop_s<bits<8> opc, string OpcodeStr, SDNode OpNode,
                                  SizeItins itins> {
  defm V#NAME#SS : sse12_fp_scalar<opc, !strconcat(OpcodeStr, "ss"),
                         OpNode, FR32, f32mem, itins.s, 0>, XS, VEX_4V, VEX_LIG;
  defm V#NAME#SD : sse12_fp_scalar<opc, !strconcat(OpcodeStr, "sd"),
                         OpNode, FR64, f64mem, itins.d, 0>, XD, VEX_4V, VEX_LIG;

  let Constraints = "$src1 = $dst" in {
    defm SS : sse12_fp_scalar<opc, !strconcat(OpcodeStr, "ss"),
                              OpNode, FR32, f32mem, itins.s>, XS;
    defm SD : sse12_fp_scalar<opc, !strconcat(OpcodeStr, "sd"),
                              OpNode, FR64, f64mem, itins.d>, XD;
  }
}
3068
/// basic_sse12_fp_binop_s_int - Intrinsic (VR128-typed) scalar forms of a
/// basic SSE 1 & 2 FP binop. These take a full vector operand and leave the
/// upper elements to the instruction's own semantics, so they are matched
/// through the "_ss"/"_sd" intrinsic suffixes rather than a scalar SDNode.
multiclass basic_sse12_fp_binop_s_int<bits<8> opc, string OpcodeStr,
                                      SizeItins itins> {
  defm V#NAME#SS : sse12_fp_scalar_int<opc, OpcodeStr, VR128,
                   !strconcat(OpcodeStr, "ss"), "", "_ss", ssmem, sse_load_f32,
                   itins.s, 0>, XS, VEX_4V, VEX_LIG;
  defm V#NAME#SD : sse12_fp_scalar_int<opc, OpcodeStr, VR128,
                   !strconcat(OpcodeStr, "sd"), "2", "_sd", sdmem, sse_load_f64,
                   itins.d, 0>, XD, VEX_4V, VEX_LIG;

  let Constraints = "$src1 = $dst" in {
    defm SS : sse12_fp_scalar_int<opc, OpcodeStr, VR128,
                   !strconcat(OpcodeStr, "ss"), "", "_ss", ssmem, sse_load_f32,
                   itins.s>, XS;
    defm SD : sse12_fp_scalar_int<opc, OpcodeStr, VR128,
                   !strconcat(OpcodeStr, "sd"), "2", "_sd", sdmem, sse_load_f64,
                   itins.d>, XD;
  }
}
3087
// Binary Arithmetic instructions
// Each op is instantiated in packed, scalar, and scalar-intrinsic forms.
// ADD/MUL are commutable; SUB/DIV are not, and MAX/MIN are kept
// non-commutable here (see the MAXC/MINC commutable variants below).
defm ADD : basic_sse12_fp_binop_p<0x58, "add", fadd, SSE_ALU_ITINS_P>,
           basic_sse12_fp_binop_s<0x58, "add", fadd, SSE_ALU_ITINS_S>,
           basic_sse12_fp_binop_s_int<0x58, "add", SSE_ALU_ITINS_S>;
defm MUL : basic_sse12_fp_binop_p<0x59, "mul", fmul, SSE_MUL_ITINS_P>,
           basic_sse12_fp_binop_s<0x59, "mul", fmul, SSE_MUL_ITINS_S>,
           basic_sse12_fp_binop_s_int<0x59, "mul", SSE_MUL_ITINS_S>;
let isCommutable = 0 in {
  defm SUB : basic_sse12_fp_binop_p<0x5C, "sub", fsub, SSE_ALU_ITINS_P>,
             basic_sse12_fp_binop_s<0x5C, "sub", fsub, SSE_ALU_ITINS_S>,
             basic_sse12_fp_binop_s_int<0x5C, "sub", SSE_ALU_ITINS_S>;
  defm DIV : basic_sse12_fp_binop_p<0x5E, "div", fdiv, SSE_DIV_ITINS_P>,
             basic_sse12_fp_binop_s<0x5E, "div", fdiv, SSE_DIV_ITINS_S>,
             basic_sse12_fp_binop_s_int<0x5E, "div", SSE_DIV_ITINS_S>;
  defm MAX : basic_sse12_fp_binop_p<0x5F, "max", X86fmax, SSE_ALU_ITINS_P>,
             basic_sse12_fp_binop_s<0x5F, "max", X86fmax, SSE_ALU_ITINS_S>,
             basic_sse12_fp_binop_s_int<0x5F, "max", SSE_ALU_ITINS_S>;
  defm MIN : basic_sse12_fp_binop_p<0x5D, "min", X86fmin, SSE_ALU_ITINS_P>,
             basic_sse12_fp_binop_s<0x5D, "min", X86fmin, SSE_ALU_ITINS_S>,
             basic_sse12_fp_binop_s_int<0x5D, "min", SSE_ALU_ITINS_S>;
}
3109
// Commutable max/min variants (X86fmaxc/X86fminc), for use when the compiler
// has proven the operand order does not matter.  isCodeGenOnly keeps these
// out of the assembler/disassembler tables, since they share encodings with
// the MAX/MIN instructions above.
let isCodeGenOnly = 1 in {
  defm MAXC: basic_sse12_fp_binop_p<0x5F, "max", X86fmaxc, SSE_ALU_ITINS_P>,
             basic_sse12_fp_binop_s<0x5F, "max", X86fmaxc, SSE_ALU_ITINS_S>;
  defm MINC: basic_sse12_fp_binop_p<0x5D, "min", X86fminc, SSE_ALU_ITINS_P>,
             basic_sse12_fp_binop_s<0x5D, "min", X86fminc, SSE_ALU_ITINS_S>;
}
3116
3117// Patterns used to select SSE scalar fp arithmetic instructions from
3118// either:
3119//
3120// (1) a scalar fp operation followed by a blend
3121//
3122// The effect is that the backend no longer emits unnecessary vector
3123// insert instructions immediately after SSE scalar fp instructions
3124// like addss or mulss.
3125//
3126// For example, given the following code:
3127//   __m128 foo(__m128 A, __m128 B) {
3128//     A[0] += B[0];
3129//     return A;
3130//   }
3131//
3132// Previously we generated:
3133//   addss %xmm0, %xmm1
3134//   movss %xmm1, %xmm0
3135//
3136// We now generate:
3137//   addss %xmm1, %xmm0
3138//
3139// (2) a vector packed single/double fp operation followed by a vector insert
3140//
3141// The effect is that the backend converts the packed fp instruction
3142// followed by a vector insert into a single SSE scalar fp instruction.
3143//
3144// For example, given the following code:
3145//   __m128 foo(__m128 A, __m128 B) {
3146//     __m128 C = A + B;
//     return (__m128) {C[0], A[1], A[2], A[3]};
3148//   }
3149//
3150// Previously we generated:
3151//   addps %xmm0, %xmm1
3152//   movss %xmm1, %xmm0
3153//
3154// We now generate:
3155//   addss %xmm1, %xmm0
3156
3157// TODO: Some canonicalization in lowering would simplify the number of
3158// patterns we have to try to match.
// Select a scalar SS instruction (e.g. ADDSS) for the DAG shapes in which
// the low-lane result of 'Op' is re-inserted into $dst.  The insert may
// appear as movss, insertps, or blendi, and the operation itself may be a
// true scalar op on an extracted element or a packed op whose upper lanes
// are discarded by the insert.  OpcPrefix is the instruction-family prefix
// ("ADD", "SUB", "MUL", "DIV").
multiclass scalar_math_f32_patterns<SDNode Op, string OpcPrefix> {
  let Predicates = [UseSSE1] in {
    // extracted scalar math op with insert via movss
    def : Pat<(v4f32 (X86Movss (v4f32 VR128:$dst), (v4f32 (scalar_to_vector
          (Op (f32 (vector_extract (v4f32 VR128:$dst), (iPTR 0))),
          FR32:$src))))),
      (!cast<I>(OpcPrefix#SSrr_Int) v4f32:$dst,
          (COPY_TO_REGCLASS FR32:$src, VR128))>;

    // vector math op with insert via movss
    def : Pat<(v4f32 (X86Movss (v4f32 VR128:$dst),
          (Op (v4f32 VR128:$dst), (v4f32 VR128:$src)))),
      (!cast<I>(OpcPrefix#SSrr_Int) v4f32:$dst, v4f32:$src)>;
  }

  // With SSE 4.1, insertps/blendi are preferred to movss, so match those too.
  let Predicates = [UseSSE41] in {
    // extracted scalar math op with insert via insertps
    def : Pat<(v4f32 (X86insertps (v4f32 VR128:$dst), (v4f32 (scalar_to_vector
          (Op (f32 (vector_extract (v4f32 VR128:$dst), (iPTR 0))),
          FR32:$src))), (iPTR 0))),
      (!cast<I>(OpcPrefix#SSrr_Int) v4f32:$dst,
          (COPY_TO_REGCLASS FR32:$src, VR128))>;

    // extracted scalar math op with insert via blend
    def : Pat<(v4f32 (X86Blendi (v4f32 VR128:$dst), (v4f32 (scalar_to_vector
          (Op (f32 (vector_extract (v4f32 VR128:$dst), (iPTR 0))),
          FR32:$src))), (i8 1))),
      (!cast<I>(OpcPrefix#SSrr_Int) v4f32:$dst,
          (COPY_TO_REGCLASS FR32:$src, VR128))>;

    // vector math op with insert via blend
    def : Pat<(v4f32 (X86Blendi (v4f32 VR128:$dst),
          (Op (v4f32 VR128:$dst), (v4f32 VR128:$src)), (i8 1))),
      (!cast<I>(OpcPrefix#SSrr_Int) v4f32:$dst, v4f32:$src)>;
  }

  // Repeat everything for AVX, except for the movss + scalar combo...
  // because that one shouldn't occur with AVX codegen?
  let Predicates = [HasAVX] in {
    // extracted scalar math op with insert via insertps
    def : Pat<(v4f32 (X86insertps (v4f32 VR128:$dst), (v4f32 (scalar_to_vector
          (Op (f32 (vector_extract (v4f32 VR128:$dst), (iPTR 0))),
          FR32:$src))), (iPTR 0))),
      (!cast<I>("V"#OpcPrefix#SSrr_Int) v4f32:$dst,
          (COPY_TO_REGCLASS FR32:$src, VR128))>;

    // extracted scalar math op with insert via blend
    def : Pat<(v4f32 (X86Blendi (v4f32 VR128:$dst), (v4f32 (scalar_to_vector
          (Op (f32 (vector_extract (v4f32 VR128:$dst), (iPTR 0))),
          FR32:$src))), (i8 1))),
      (!cast<I>("V"#OpcPrefix#SSrr_Int) v4f32:$dst,
          (COPY_TO_REGCLASS FR32:$src, VR128))>;

    // vector math op with insert via movss
    def : Pat<(v4f32 (X86Movss (v4f32 VR128:$dst),
          (Op (v4f32 VR128:$dst), (v4f32 VR128:$src)))),
      (!cast<I>("V"#OpcPrefix#SSrr_Int) v4f32:$dst, v4f32:$src)>;

    // vector math op with insert via blend
    def : Pat<(v4f32 (X86Blendi (v4f32 VR128:$dst),
          (Op (v4f32 VR128:$dst), (v4f32 VR128:$src)), (i8 1))),
      (!cast<I>("V"#OpcPrefix#SSrr_Int) v4f32:$dst, v4f32:$src)>;
  }
}
3225
// Instantiate the f32 scalar-math patterns for the four basic arithmetic ops.
defm : scalar_math_f32_patterns<fadd, "ADD">;
defm : scalar_math_f32_patterns<fsub, "SUB">;
defm : scalar_math_f32_patterns<fmul, "MUL">;
defm : scalar_math_f32_patterns<fdiv, "DIV">;
3230
// f64 counterpart of scalar_math_f32_patterns: select a scalar SD instruction
// (e.g. ADDSD) when the low-lane result of 'Op' is re-inserted into $dst via
// movsd or (SSE4.1+) blendi.  There is no insertps form for f64.
multiclass scalar_math_f64_patterns<SDNode Op, string OpcPrefix> {
  let Predicates = [UseSSE2] in {
    // extracted scalar math op with insert via movsd
    def : Pat<(v2f64 (X86Movsd (v2f64 VR128:$dst), (v2f64 (scalar_to_vector
          (Op (f64 (vector_extract (v2f64 VR128:$dst), (iPTR 0))),
          FR64:$src))))),
      (!cast<I>(OpcPrefix#SDrr_Int) v2f64:$dst,
          (COPY_TO_REGCLASS FR64:$src, VR128))>;

    // vector math op with insert via movsd
    def : Pat<(v2f64 (X86Movsd (v2f64 VR128:$dst),
          (Op (v2f64 VR128:$dst), (v2f64 VR128:$src)))),
      (!cast<I>(OpcPrefix#SDrr_Int) v2f64:$dst, v2f64:$src)>;
  }

  // With SSE 4.1, blendi is preferred to movsd, so match those too.
  let Predicates = [UseSSE41] in {
    // extracted scalar math op with insert via blend
    def : Pat<(v2f64 (X86Blendi (v2f64 VR128:$dst), (v2f64 (scalar_to_vector
          (Op (f64 (vector_extract (v2f64 VR128:$dst), (iPTR 0))),
          FR64:$src))), (i8 1))),
      (!cast<I>(OpcPrefix#SDrr_Int) v2f64:$dst,
          (COPY_TO_REGCLASS FR64:$src, VR128))>;

    // vector math op with insert via blend
    def : Pat<(v2f64 (X86Blendi (v2f64 VR128:$dst),
          (Op (v2f64 VR128:$dst), (v2f64 VR128:$src)), (i8 1))),
      (!cast<I>(OpcPrefix#SDrr_Int) v2f64:$dst, v2f64:$src)>;
  }

  // Repeat everything for AVX.
  let Predicates = [HasAVX] in {
    // extracted scalar math op with insert via movsd
    def : Pat<(v2f64 (X86Movsd (v2f64 VR128:$dst), (v2f64 (scalar_to_vector
          (Op (f64 (vector_extract (v2f64 VR128:$dst), (iPTR 0))),
          FR64:$src))))),
      (!cast<I>("V"#OpcPrefix#SDrr_Int) v2f64:$dst,
          (COPY_TO_REGCLASS FR64:$src, VR128))>;

    // extracted scalar math op with insert via blend
    def : Pat<(v2f64 (X86Blendi (v2f64 VR128:$dst), (v2f64 (scalar_to_vector
          (Op (f64 (vector_extract (v2f64 VR128:$dst), (iPTR 0))),
          FR64:$src))), (i8 1))),
      (!cast<I>("V"#OpcPrefix#SDrr_Int) v2f64:$dst,
          (COPY_TO_REGCLASS FR64:$src, VR128))>;

    // vector math op with insert via movsd
    def : Pat<(v2f64 (X86Movsd (v2f64 VR128:$dst),
          (Op (v2f64 VR128:$dst), (v2f64 VR128:$src)))),
      (!cast<I>("V"#OpcPrefix#SDrr_Int) v2f64:$dst, v2f64:$src)>;

    // vector math op with insert via blend
    def : Pat<(v2f64 (X86Blendi (v2f64 VR128:$dst),
          (Op (v2f64 VR128:$dst), (v2f64 VR128:$src)), (i8 1))),
      (!cast<I>("V"#OpcPrefix#SDrr_Int) v2f64:$dst, v2f64:$src)>;
  }
}
3288
// Instantiate the f64 scalar-math patterns for the four basic arithmetic ops.
defm : scalar_math_f64_patterns<fadd, "ADD">;
defm : scalar_math_f64_patterns<fsub, "SUB">;
defm : scalar_math_f64_patterns<fmul, "MUL">;
defm : scalar_math_f64_patterns<fdiv, "DIV">;
3293
3294
3295/// Unop Arithmetic
/// Besides the plain scalar form, we also have a special variant here to
3297/// represent the associated intrinsic operation.  This form is unlike the
3298/// plain scalar form, in that it takes an entire vector (instead of a
3299/// scalar) and leaves the top elements undefined.
3300///
3301/// And, we have a special variant form for a full-vector intrinsic form.
3302
// Itineraries for the square-root instructions (packed/scalar, f32/f64);
// all use the WriteFSqrt scheduling class.
let Sched = WriteFSqrt in {
def SSE_SQRTPS : OpndItins<
  IIC_SSE_SQRTPS_RR, IIC_SSE_SQRTPS_RM
>;

def SSE_SQRTSS : OpndItins<
  IIC_SSE_SQRTSS_RR, IIC_SSE_SQRTSS_RM
>;

def SSE_SQRTPD : OpndItins<
  IIC_SSE_SQRTPD_RR, IIC_SSE_SQRTPD_RM
>;

def SSE_SQRTSD : OpndItins<
  IIC_SSE_SQRTSD_RR, IIC_SSE_SQRTSD_RM
>;
}
3320
// Itineraries for the reciprocal-square-root approximation instructions.
let Sched = WriteFRsqrt in {
def SSE_RSQRTPS : OpndItins<
  IIC_SSE_RSQRTPS_RR, IIC_SSE_RSQRTPS_RM
>;

def SSE_RSQRTSS : OpndItins<
  IIC_SSE_RSQRTSS_RR, IIC_SSE_RSQRTSS_RM
>;
}
3330
// Itineraries for the reciprocal approximation instructions.
let Sched = WriteFRcp in {
def SSE_RCPP : OpndItins<
  IIC_SSE_RCPP_RR, IIC_SSE_RCPP_RM
>;

def SSE_RCPS : OpndItins<
  IIC_SSE_RCPS_RR, IIC_SSE_RCPS_RM
>;
}
3340
/// sse_fp_unop_s - SSE1 unops in scalar form
/// For the non-AVX defs, we need $src1 to be tied to $dst because
/// the HW instructions are 2 operand / destructive.
///
/// Emits r/m (plain scalar) and r_Int/m_Int (intrinsic, VR128-operand)
/// forms, plus the patterns that select them for OpNode and for the
/// intrinsic Intr.  Suffix ("SS"/"SD") reconstructs the def names for the
/// !cast lookups in the patterns below.
multiclass sse_fp_unop_s<bits<8> opc, string OpcodeStr, RegisterClass RC,
                          ValueType vt, ValueType ScalarVT,
                          X86MemOperand x86memop, Operand vec_memop,
                          ComplexPattern mem_cpat, Intrinsic Intr,
                          SDNode OpNode, OpndItins itins, Predicate target,
                          string Suffix> {
  let hasSideEffects = 0 in {
  def r : I<opc, MRMSrcReg, (outs RC:$dst), (ins RC:$src1),
              !strconcat(OpcodeStr, "\t{$src1, $dst|$dst, $src1}"),
            [(set RC:$dst, (OpNode RC:$src1))], itins.rr>, Sched<[itins.Sched]>,
            Requires<[target]>;
  // Load-folding form; only selected under OptForSize (see Requires below).
  let mayLoad = 1 in
  def m : I<opc, MRMSrcMem, (outs RC:$dst), (ins x86memop:$src1),
            !strconcat(OpcodeStr, "\t{$src1, $dst|$dst, $src1}"),
            [(set RC:$dst, (OpNode (load addr:$src1)))], itins.rm>,
            Sched<[itins.Sched.Folded, ReadAfterLd]>,
            Requires<[target, OptForSize]>;

  // Intrinsic forms take a VR128 and preserve the upper elements of $dst,
  // hence the tied-operand constraint.
  // NOTE(review): r_Int is a register-register form but is scheduled with
  // the folded (memory) class, itins.Sched.Folded — verify this is intended.
  let isCodeGenOnly = 1, Constraints = "$src1 = $dst" in {
  def r_Int : I<opc, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src1, VR128:$src2),
              !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
            []>, Sched<[itins.Sched.Folded, ReadAfterLd]>;
  let mayLoad = 1 in
  def m_Int : I<opc, MRMSrcMem, (outs VR128:$dst), (ins VR128:$src1, vec_memop:$src2),
              !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
            []>, Sched<[itins.Sched.Folded, ReadAfterLd]>;
  }
  }

  let Predicates = [target] in {
  def : Pat<(vt (OpNode mem_cpat:$src)),
            (vt (COPY_TO_REGCLASS (vt (!cast<Instruction>(NAME#Suffix##m_Int)
                 (vt (IMPLICIT_DEF)), mem_cpat:$src)), RC))>;
  // These are unary operations, but they are modeled as having 2 source operands
  // because the high elements of the destination are unchanged in SSE.
  def : Pat<(Intr VR128:$src),
            (!cast<Instruction>(NAME#Suffix##r_Int) VR128:$src, VR128:$src)>;
  def : Pat<(Intr (load addr:$src)),
            (vt (COPY_TO_REGCLASS(!cast<Instruction>(NAME#Suffix##m)
                                      addr:$src), VR128))>;
   def : Pat<(Intr mem_cpat:$src),
             (!cast<Instruction>(NAME#Suffix##m_Int)
                    (vt (IMPLICIT_DEF)), mem_cpat:$src)>;
  }
}
3389
// avx_fp_unop_s - AVX (VEX, three-operand) scalar unops, paralleling
// sse_fp_unop_s above.  The instruction defs carry no patterns
// (hasSideEffects = 0, empty pattern lists); selection is done entirely by
// the explicit Pats below, which feed an IMPLICIT_DEF as the pass-through
// first source.
multiclass avx_fp_unop_s<bits<8> opc, string OpcodeStr, RegisterClass RC,
                          ValueType vt, ValueType ScalarVT,
                          X86MemOperand x86memop, Operand vec_memop,
                          ComplexPattern mem_cpat,
                          Intrinsic Intr, SDNode OpNode, OpndItins itins,
                          Predicate target, string Suffix> {
  let hasSideEffects = 0 in {
  def r : I<opc, MRMSrcReg, (outs RC:$dst), (ins RC:$src1, RC:$src2),
            !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
            [], itins.rr>, Sched<[itins.Sched]>;
  let mayLoad = 1 in
  def m : I<opc, MRMSrcMem, (outs RC:$dst), (ins RC:$src1, x86memop:$src2),
             !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
            [], itins.rm>, Sched<[itins.Sched.Folded, ReadAfterLd]>;
  let isCodeGenOnly = 1 in {
  // todo: uncomment when all r_Int forms will be added to X86InstrInfo.cpp
  //def r_Int : I<opc, MRMSrcReg, (outs VR128:$dst),
  //              (ins VR128:$src1, VR128:$src2),
  //           !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
  //          []>, Sched<[itins.Sched.Folded]>;
  let mayLoad = 1 in
  def m_Int : I<opc, MRMSrcMem, (outs VR128:$dst),
                (ins VR128:$src1, vec_memop:$src2),
             !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
            []>, Sched<[itins.Sched.Folded, ReadAfterLd]>;
  }
  }

  let Predicates = [target] in {
   def : Pat<(OpNode RC:$src),  (!cast<Instruction>("V"#NAME#Suffix##r)
                                (ScalarVT (IMPLICIT_DEF)), RC:$src)>;

   def : Pat<(vt (OpNode mem_cpat:$src)),
             (!cast<Instruction>("V"#NAME#Suffix##m_Int) (vt (IMPLICIT_DEF)),
                                  mem_cpat:$src)>;

   // todo: use r_Int form when it will be ready
   //def : Pat<(Intr VR128:$src), (!cast<Instruction>("V"#NAME#Suffix##r_Int)
   //                 (VT (IMPLICIT_DEF)), VR128:$src)>;
   // Until r_Int exists: bounce the VR128 operand through RC and use the
   // plain register form, then copy the result back to VR128.
   def : Pat<(Intr VR128:$src),
             (vt (COPY_TO_REGCLASS(
             !cast<Instruction>("V"#NAME#Suffix##r) (ScalarVT (IMPLICIT_DEF)),
                    (ScalarVT (COPY_TO_REGCLASS VR128:$src, RC))), VR128))>;
   def : Pat<(Intr mem_cpat:$src),
             (!cast<Instruction>("V"#NAME#Suffix##m_Int)
                    (vt (IMPLICIT_DEF)), mem_cpat:$src)>;
  }
  // Fold the scalar load only when optimizing for size.
  let Predicates = [target, OptForSize] in
  def : Pat<(ScalarVT (OpNode (load addr:$src))),
            (!cast<Instruction>("V"#NAME#Suffix##m) (ScalarVT (IMPLICIT_DEF)),
             addr:$src)>;
}
3442
/// sse1_fp_unop_p - SSE1 unops in packed form.
/// Emits VEX-encoded 128-bit (V#NAME#PSr/m) and 256-bit (V#NAME#PSYr/m)
/// forms under HasAVX, plus the legacy non-VEX 128-bit forms.  The legacy
/// memory form uses memopv4f32 while the AVX forms use loadv4f32.
multiclass sse1_fp_unop_p<bits<8> opc, string OpcodeStr, SDNode OpNode,
                          OpndItins itins> {
let Predicates = [HasAVX] in {
  def V#NAME#PSr : PSI<opc, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
                       !strconcat("v", OpcodeStr,
                                  "ps\t{$src, $dst|$dst, $src}"),
                       [(set VR128:$dst, (v4f32 (OpNode VR128:$src)))],
                       itins.rr>, VEX, Sched<[itins.Sched]>;
  def V#NAME#PSm : PSI<opc, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
                       !strconcat("v", OpcodeStr,
                                  "ps\t{$src, $dst|$dst, $src}"),
                       [(set VR128:$dst, (OpNode (loadv4f32 addr:$src)))],
                       itins.rm>, VEX, Sched<[itins.Sched.Folded]>;
  def V#NAME#PSYr : PSI<opc, MRMSrcReg, (outs VR256:$dst), (ins VR256:$src),
                        !strconcat("v", OpcodeStr,
                                   "ps\t{$src, $dst|$dst, $src}"),
                        [(set VR256:$dst, (v8f32 (OpNode VR256:$src)))],
                        itins.rr>, VEX, VEX_L, Sched<[itins.Sched]>;
  def V#NAME#PSYm : PSI<opc, MRMSrcMem, (outs VR256:$dst), (ins f256mem:$src),
                        !strconcat("v", OpcodeStr,
                                   "ps\t{$src, $dst|$dst, $src}"),
                        [(set VR256:$dst, (OpNode (loadv8f32 addr:$src)))],
                        itins.rm>, VEX, VEX_L, Sched<[itins.Sched.Folded]>;
}

  def PSr : PSI<opc, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
                !strconcat(OpcodeStr, "ps\t{$src, $dst|$dst, $src}"),
                [(set VR128:$dst, (v4f32 (OpNode VR128:$src)))], itins.rr>,
            Sched<[itins.Sched]>;
  def PSm : PSI<opc, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
                !strconcat(OpcodeStr, "ps\t{$src, $dst|$dst, $src}"),
                [(set VR128:$dst, (OpNode (memopv4f32 addr:$src)))], itins.rm>,
            Sched<[itins.Sched.Folded]>;
}
3478
/// sse1_fp_unop_p_int - SSE1 intrinsics unops in packed forms.
/// Same layout as sse1_fp_unop_p, but selected from the packed intrinsics
/// (V4F32Int for 128-bit, V8F32Int for 256-bit).  All defs are
/// isCodeGenOnly since they share encodings with the non-_Int forms.
multiclass sse1_fp_unop_p_int<bits<8> opc, string OpcodeStr,
                              Intrinsic V4F32Int, Intrinsic V8F32Int,
                              OpndItins itins> {
let isCodeGenOnly = 1 in {
let Predicates = [HasAVX] in {
  def V#NAME#PSr_Int : PSI<opc, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
                           !strconcat("v", OpcodeStr,
                                      "ps\t{$src, $dst|$dst, $src}"),
                           [(set VR128:$dst, (V4F32Int VR128:$src))],
                           itins.rr>, VEX, Sched<[itins.Sched]>;
  def V#NAME#PSm_Int : PSI<opc, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
                          !strconcat("v", OpcodeStr,
                          "ps\t{$src, $dst|$dst, $src}"),
                          [(set VR128:$dst, (V4F32Int (loadv4f32 addr:$src)))],
                          itins.rm>, VEX, Sched<[itins.Sched.Folded]>;
  def V#NAME#PSYr_Int : PSI<opc, MRMSrcReg, (outs VR256:$dst), (ins VR256:$src),
                            !strconcat("v", OpcodeStr,
                                       "ps\t{$src, $dst|$dst, $src}"),
                            [(set VR256:$dst, (V8F32Int VR256:$src))],
                            itins.rr>, VEX, VEX_L, Sched<[itins.Sched]>;
  def V#NAME#PSYm_Int : PSI<opc, MRMSrcMem, (outs VR256:$dst),
                          (ins f256mem:$src),
                          !strconcat("v", OpcodeStr,
                                    "ps\t{$src, $dst|$dst, $src}"),
                          [(set VR256:$dst, (V8F32Int (loadv8f32 addr:$src)))],
                          itins.rm>, VEX, VEX_L, Sched<[itins.Sched.Folded]>;
}

  def PSr_Int : PSI<opc, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
                    !strconcat(OpcodeStr, "ps\t{$src, $dst|$dst, $src}"),
                    [(set VR128:$dst, (V4F32Int VR128:$src))],
                    itins.rr>, Sched<[itins.Sched]>;
  def PSm_Int : PSI<opc, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
                    !strconcat(OpcodeStr, "ps\t{$src, $dst|$dst, $src}"),
                    [(set VR128:$dst, (V4F32Int (memopv4f32 addr:$src)))],
                    itins.rm>, Sched<[itins.Sched.Folded]>;
} // isCodeGenOnly = 1
}
3518
/// sse2_fp_unop_p - SSE2 unops in vector forms.
/// f64 counterpart of sse1_fp_unop_p: VEX 128/256-bit PD forms under
/// HasAVX plus the legacy non-VEX 128-bit PD forms.
multiclass sse2_fp_unop_p<bits<8> opc, string OpcodeStr,
                          SDNode OpNode, OpndItins itins> {
let Predicates = [HasAVX] in {
  def V#NAME#PDr : PDI<opc, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
                       !strconcat("v", OpcodeStr,
                                  "pd\t{$src, $dst|$dst, $src}"),
                       [(set VR128:$dst, (v2f64 (OpNode VR128:$src)))],
                       itins.rr>, VEX, Sched<[itins.Sched]>;
  def V#NAME#PDm : PDI<opc, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
                       !strconcat("v", OpcodeStr,
                                  "pd\t{$src, $dst|$dst, $src}"),
                       [(set VR128:$dst, (OpNode (loadv2f64 addr:$src)))],
                       itins.rm>, VEX, Sched<[itins.Sched.Folded]>;
  def V#NAME#PDYr : PDI<opc, MRMSrcReg, (outs VR256:$dst), (ins VR256:$src),
                        !strconcat("v", OpcodeStr,
                                   "pd\t{$src, $dst|$dst, $src}"),
                        [(set VR256:$dst, (v4f64 (OpNode VR256:$src)))],
                        itins.rr>, VEX, VEX_L, Sched<[itins.Sched]>;
  def V#NAME#PDYm : PDI<opc, MRMSrcMem, (outs VR256:$dst), (ins f256mem:$src),
                        !strconcat("v", OpcodeStr,
                                   "pd\t{$src, $dst|$dst, $src}"),
                        [(set VR256:$dst, (OpNode (loadv4f64 addr:$src)))],
                        itins.rm>, VEX, VEX_L, Sched<[itins.Sched.Folded]>;
}

  def PDr : PDI<opc, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
              !strconcat(OpcodeStr, "pd\t{$src, $dst|$dst, $src}"),
              [(set VR128:$dst, (v2f64 (OpNode VR128:$src)))], itins.rr>,
            Sched<[itins.Sched]>;
  def PDm : PDI<opc, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
                !strconcat(OpcodeStr, "pd\t{$src, $dst|$dst, $src}"),
                [(set VR128:$dst, (OpNode (memopv2f64 addr:$src)))], itins.rm>,
            Sched<[itins.Sched.Folded]>;
}
3554
// sse1_fp_unop_s - instantiate the legacy SSE and AVX scalar-SS unop
// multiclasses for one operation, wiring up the matching
// int_x86_sse_<op>_ss intrinsic by name.
multiclass sse1_fp_unop_s<bits<8> opc, string OpcodeStr, SDNode OpNode,
                          OpndItins itins> {
  defm SS        :  sse_fp_unop_s<opc, OpcodeStr##ss, FR32, v4f32, f32, f32mem,
                      ssmem, sse_load_f32,
                      !cast<Intrinsic>("int_x86_sse_"##OpcodeStr##_ss), OpNode,
                      itins, UseSSE1, "SS">, XS;
  defm V#NAME#SS  : avx_fp_unop_s<opc, "v"#OpcodeStr##ss, FR32, v4f32, f32,
                      f32mem, ssmem, sse_load_f32,
                      !cast<Intrinsic>("int_x86_sse_"##OpcodeStr##_ss), OpNode,
                      itins, UseAVX, "SS">, XS, VEX_4V, VEX_LIG;
}
3566
// sse2_fp_unop_s - f64 counterpart of sse1_fp_unop_s, using the
// int_x86_sse2_<op>_sd intrinsic.
multiclass sse2_fp_unop_s<bits<8> opc, string OpcodeStr, SDNode OpNode,
                          OpndItins itins> {
  defm SD         : sse_fp_unop_s<opc, OpcodeStr##sd, FR64, v2f64, f64, f64mem,
                         sdmem, sse_load_f64,
                         !cast<Intrinsic>("int_x86_sse2_"##OpcodeStr##_sd),
                         OpNode, itins, UseSSE2, "SD">, XD;
  defm V#NAME#SD  : avx_fp_unop_s<opc, "v"#OpcodeStr##sd, FR64, v2f64, f64,
                         f64mem, sdmem, sse_load_f64,
                         !cast<Intrinsic>("int_x86_sse2_"##OpcodeStr##_sd),
                         OpNode, itins, UseAVX, "SD">, XD, VEX_4V, VEX_LIG;
}
3578
// Square root.
// SQRT has both f32 and f64, scalar and packed forms.
defm SQRT  : sse1_fp_unop_s<0x51, "sqrt", fsqrt, SSE_SQRTSS>,
             sse1_fp_unop_p<0x51, "sqrt", fsqrt, SSE_SQRTPS>,
             sse2_fp_unop_s<0x51, "sqrt", fsqrt, SSE_SQRTSD>,
             sse2_fp_unop_p<0x51, "sqrt", fsqrt, SSE_SQRTPD>;

// Reciprocal approximations. Note that these typically require refinement
// in order to obtain suitable precision.
defm RSQRT : sse1_fp_unop_s<0x52, "rsqrt", X86frsqrt, SSE_RSQRTSS>,
             sse1_fp_unop_p<0x52, "rsqrt", X86frsqrt, SSE_RSQRTPS>,
             sse1_fp_unop_p_int<0x52, "rsqrt", int_x86_sse_rsqrt_ps,
                                int_x86_avx_rsqrt_ps_256, SSE_RSQRTPS>;
defm RCP   : sse1_fp_unop_s<0x53, "rcp", X86frcp, SSE_RCPS>,
             sse1_fp_unop_p<0x53, "rcp", X86frcp, SSE_RCPP>,
             sse1_fp_unop_p_int<0x53, "rcp", int_x86_sse_rcp_ps,
                                int_x86_avx_rcp_ps_256, SSE_RCPP>;
3595
3596// There is no f64 version of the reciprocal approximation instructions.
3597
3598//===----------------------------------------------------------------------===//
3599// SSE 1 & 2 - Non-temporal stores
3600//===----------------------------------------------------------------------===//
3601
// Non-temporal store instructions (movntps/movntpd/movntdq/movnti).
// All patterns here match alignednontemporalstore (or plain
// nontemporalstore for the GPR movnti forms), and the high
// AddedComplexity makes them win over the ordinary store patterns.
let AddedComplexity = 400 in { // Prefer non-temporal versions
let SchedRW = [WriteStore] in {
let Predicates = [HasAVX, NoVLX] in {
def VMOVNTPSmr : VPSI<0x2B, MRMDestMem, (outs),
                     (ins f128mem:$dst, VR128:$src),
                     "movntps\t{$src, $dst|$dst, $src}",
                     [(alignednontemporalstore (v4f32 VR128:$src),
                                               addr:$dst)],
                                               IIC_SSE_MOVNT>, VEX;
def VMOVNTPDmr : VPDI<0x2B, MRMDestMem, (outs),
                     (ins f128mem:$dst, VR128:$src),
                     "movntpd\t{$src, $dst|$dst, $src}",
                     [(alignednontemporalstore (v2f64 VR128:$src),
                                               addr:$dst)],
                                               IIC_SSE_MOVNT>, VEX;

let ExeDomain = SSEPackedInt in
def VMOVNTDQmr    : VPDI<0xE7, MRMDestMem, (outs),
                         (ins f128mem:$dst, VR128:$src),
                         "movntdq\t{$src, $dst|$dst, $src}",
                         [(alignednontemporalstore (v2i64 VR128:$src),
                                                   addr:$dst)],
                                                   IIC_SSE_MOVNT>, VEX;

// 256-bit (VEX_L) variants.
def VMOVNTPSYmr : VPSI<0x2B, MRMDestMem, (outs),
                     (ins f256mem:$dst, VR256:$src),
                     "movntps\t{$src, $dst|$dst, $src}",
                     [(alignednontemporalstore (v8f32 VR256:$src),
                                               addr:$dst)],
                                               IIC_SSE_MOVNT>, VEX, VEX_L;
def VMOVNTPDYmr : VPDI<0x2B, MRMDestMem, (outs),
                     (ins f256mem:$dst, VR256:$src),
                     "movntpd\t{$src, $dst|$dst, $src}",
                     [(alignednontemporalstore (v4f64 VR256:$src),
                                               addr:$dst)],
                                               IIC_SSE_MOVNT>, VEX, VEX_L;
let ExeDomain = SSEPackedInt in
def VMOVNTDQYmr : VPDI<0xE7, MRMDestMem, (outs),
                    (ins f256mem:$dst, VR256:$src),
                    "movntdq\t{$src, $dst|$dst, $src}",
                    [(alignednontemporalstore (v4i64 VR256:$src),
                                              addr:$dst)],
                                              IIC_SSE_MOVNT>, VEX, VEX_L;
}

// Legacy (non-VEX) 128-bit forms.
def MOVNTPSmr : PSI<0x2B, MRMDestMem, (outs), (ins f128mem:$dst, VR128:$src),
                    "movntps\t{$src, $dst|$dst, $src}",
                    [(alignednontemporalstore (v4f32 VR128:$src), addr:$dst)],
                    IIC_SSE_MOVNT>;
def MOVNTPDmr : PDI<0x2B, MRMDestMem, (outs), (ins f128mem:$dst, VR128:$src),
                    "movntpd\t{$src, $dst|$dst, $src}",
                    [(alignednontemporalstore(v2f64 VR128:$src), addr:$dst)],
                    IIC_SSE_MOVNT>;

let ExeDomain = SSEPackedInt in
def MOVNTDQmr : PDI<0xE7, MRMDestMem, (outs), (ins f128mem:$dst, VR128:$src),
                    "movntdq\t{$src, $dst|$dst, $src}",
                    [(alignednontemporalstore (v2i64 VR128:$src), addr:$dst)],
                    IIC_SSE_MOVNT>;

// There is no AVX form for instructions below this point
// GPR-based non-temporal stores; no alignment requirement in the pattern.
def MOVNTImr : I<0xC3, MRMDestMem, (outs), (ins i32mem:$dst, GR32:$src),
                 "movnti{l}\t{$src, $dst|$dst, $src}",
                 [(nontemporalstore (i32 GR32:$src), addr:$dst)],
                 IIC_SSE_MOVNT>,
               PS, Requires<[HasSSE2]>;
def MOVNTI_64mr : RI<0xC3, MRMDestMem, (outs), (ins i64mem:$dst, GR64:$src),
                     "movnti{q}\t{$src, $dst|$dst, $src}",
                     [(nontemporalstore (i64 GR64:$src), addr:$dst)],
                     IIC_SSE_MOVNT>,
                  PS, Requires<[HasSSE2]>;
} // SchedRW = [WriteStore]

// Reuse the DQ stores for the other integer vector types.
let Predicates = [HasAVX2, NoVLX] in {
  def : Pat<(alignednontemporalstore (v8i32 VR256:$src), addr:$dst),
            (VMOVNTDQYmr addr:$dst, VR256:$src)>;
  def : Pat<(alignednontemporalstore (v16i16 VR256:$src), addr:$dst),
            (VMOVNTDQYmr addr:$dst, VR256:$src)>;
  def : Pat<(alignednontemporalstore (v32i8 VR256:$src), addr:$dst),
            (VMOVNTDQYmr addr:$dst, VR256:$src)>;
}

let Predicates = [HasAVX, NoVLX] in {
  def : Pat<(alignednontemporalstore (v4i32 VR128:$src), addr:$dst),
            (VMOVNTDQmr addr:$dst, VR128:$src)>;
  def : Pat<(alignednontemporalstore (v8i16 VR128:$src), addr:$dst),
            (VMOVNTDQmr addr:$dst, VR128:$src)>;
  def : Pat<(alignednontemporalstore (v16i8 VR128:$src), addr:$dst),
            (VMOVNTDQmr addr:$dst, VR128:$src)>;
}

def : Pat<(alignednontemporalstore (v4i32 VR128:$src), addr:$dst),
          (MOVNTDQmr addr:$dst, VR128:$src)>;
def : Pat<(alignednontemporalstore (v8i16 VR128:$src), addr:$dst),
          (MOVNTDQmr addr:$dst, VR128:$src)>;
def : Pat<(alignednontemporalstore (v16i8 VR128:$src), addr:$dst),
          (MOVNTDQmr addr:$dst, VR128:$src)>;

} // AddedComplexity
3701
3702//===----------------------------------------------------------------------===//
3703// SSE 1 & 2 - Prefetch and memory fence
3704//===----------------------------------------------------------------------===//
3705
// Prefetch intrinsic.
// The 'prefetch' node's operands are (addr, rw, locality, cachetype); the
// bare 'imm' matches any rw value, locality selects the T0/T1/T2/NTA hint,
// and (i32 1) is the data-cache type.
let Predicates = [HasSSE1], SchedRW = [WriteLoad] in {
def PREFETCHT0   : I<0x18, MRM1m, (outs), (ins i8mem:$src),
    "prefetcht0\t$src", [(prefetch addr:$src, imm, (i32 3), (i32 1))],
    IIC_SSE_PREFETCH>, TB;
def PREFETCHT1   : I<0x18, MRM2m, (outs), (ins i8mem:$src),
    "prefetcht1\t$src", [(prefetch addr:$src, imm, (i32 2), (i32 1))],
    IIC_SSE_PREFETCH>, TB;
def PREFETCHT2   : I<0x18, MRM3m, (outs), (ins i8mem:$src),
    "prefetcht2\t$src", [(prefetch addr:$src, imm, (i32 1), (i32 1))],
    IIC_SSE_PREFETCH>, TB;
def PREFETCHNTA  : I<0x18, MRM0m, (outs), (ins i8mem:$src),
    "prefetchnta\t$src", [(prefetch addr:$src, imm, (i32 0), (i32 1))],
    IIC_SSE_PREFETCH>, TB;
}
3721
// FIXME: How should flush instruction be modeled?
let SchedRW = [WriteLoad] in {
// Flush cache
def CLFLUSH : I<0xAE, MRM7m, (outs), (ins i8mem:$src),
               "clflush\t$src", [(int_x86_sse2_clflush addr:$src)],
               IIC_SSE_PREFETCH>, PS, Requires<[HasSSE2]>;
}
3729
3730let SchedRW = [WriteNop] in {
3731// Pause. This "instruction" is encoded as "rep; nop" (F3 90), so even
3732// though it was introduced with SSE2, it's backward compatible: pre-SSE2
// CPUs simply execute it as a nop. OBXS supplies the F3 (XS) prefix.
3733def PAUSE : I<0x90, RawFrm, (outs), (ins),
3734              "pause", [(int_x86_sse2_pause)], IIC_SSE_PAUSE>,
3735              OBXS, Requires<[HasSSE2]>;
3736}
3737
3738let SchedRW = [WriteFence] in {
3739// Store, load, and full memory fences (0F AE with fixed ModRM bytes
// F8/E8/F0). SFENCE needs only SSE1; LFENCE/MFENCE require SSE2.
3740def SFENCE : I<0xAE, MRM_F8, (outs), (ins),
3741               "sfence", [(int_x86_sse_sfence)], IIC_SSE_SFENCE>,
3742               PS, Requires<[HasSSE1]>;
3743def LFENCE : I<0xAE, MRM_E8, (outs), (ins),
3744               "lfence", [(int_x86_sse2_lfence)], IIC_SSE_LFENCE>,
3745               TB, Requires<[HasSSE2]>;
3746def MFENCE : I<0xAE, MRM_F0, (outs), (ins),
3747               "mfence", [(int_x86_sse2_mfence)], IIC_SSE_MFENCE>,
3748               TB, Requires<[HasSSE2]>;
3749} // SchedRW
3750
// Select the target-specific fence nodes to the instructions above.
3751def : Pat<(X86SFence), (SFENCE)>;
3752def : Pat<(X86LFence), (LFENCE)>;
3753def : Pat<(X86MFence), (MFENCE)>;
3754
3755//===----------------------------------------------------------------------===//
3756// SSE 1 & 2 - Load/Store XCSR register
3757//===----------------------------------------------------------------------===//
3758
// Load/store of the MXCSR control/status register (0F AE /2 and /3),
// matched via the ldmxcsr/stmxcsr intrinsics. VEX-encoded forms first,
// then legacy forms restricted to UseSSE1 (i.e. when AVX is unavailable).
3759def VLDMXCSR : VPSI<0xAE, MRM2m, (outs), (ins i32mem:$src),
3760                  "ldmxcsr\t$src", [(int_x86_sse_ldmxcsr addr:$src)],
3761                  IIC_SSE_LDMXCSR>, VEX, Sched<[WriteLoad]>;
3762def VSTMXCSR : VPSI<0xAE, MRM3m, (outs), (ins i32mem:$dst),
3763                  "stmxcsr\t$dst", [(int_x86_sse_stmxcsr addr:$dst)],
3764                  IIC_SSE_STMXCSR>, VEX, Sched<[WriteStore]>;
3765
3766let Predicates = [UseSSE1] in {
3767def LDMXCSR : I<0xAE, MRM2m, (outs), (ins i32mem:$src),
3768                "ldmxcsr\t$src", [(int_x86_sse_ldmxcsr addr:$src)],
3769                IIC_SSE_LDMXCSR>, TB, Sched<[WriteLoad]>;
3770def STMXCSR : I<0xAE, MRM3m, (outs), (ins i32mem:$dst),
3771                "stmxcsr\t$dst", [(int_x86_sse_stmxcsr addr:$dst)],
3772                IIC_SSE_STMXCSR>, TB, Sched<[WriteStore]>;
3773}
3774
3775//===---------------------------------------------------------------------===//
3776// SSE2 - Move Aligned/Unaligned Packed Integer Instructions
3777//===---------------------------------------------------------------------===//
3778
3779let ExeDomain = SSEPackedInt in { // SSE integer instructions
3780
// VEX register-to-register integer moves (0F 6F). No patterns: these are
// emitted by the register allocator / copy lowering, hence hasSideEffects=0.
3781let hasSideEffects = 0, SchedRW = [WriteMove] in {
3782def VMOVDQArr  : VPDI<0x6F, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
3783                    "movdqa\t{$src, $dst|$dst, $src}", [], IIC_SSE_MOVA_P_RR>,
3784                    VEX;
3785def VMOVDQAYrr : VPDI<0x6F, MRMSrcReg, (outs VR256:$dst), (ins VR256:$src),
3786                    "movdqa\t{$src, $dst|$dst, $src}", [], IIC_SSE_MOVA_P_RR>,
3787                    VEX, VEX_L;
3788def VMOVDQUrr  : VSSI<0x6F, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
3789                    "movdqu\t{$src, $dst|$dst, $src}", [], IIC_SSE_MOVU_P_RR>,
3790                    VEX;
3791def VMOVDQUYrr : VSSI<0x6F, MRMSrcReg, (outs VR256:$dst), (ins VR256:$src),
3792                    "movdqu\t{$src, $dst|$dst, $src}", [], IIC_SSE_MOVU_P_RR>,
3793                    VEX, VEX_L;
3794}
3795
3796// For Disassembler
// Reversed-operand encodings (0F 7F, MRMDestReg) of the same moves. Marked
// isCodeGenOnly so instruction selection never produces them; they exist so
// the disassembler can decode both encodings of reg-reg movdqa/movdqu.
3797let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0,
3798    SchedRW = [WriteMove] in {
3799def VMOVDQArr_REV  : VPDI<0x7F, MRMDestReg, (outs VR128:$dst), (ins VR128:$src),
3800                        "movdqa\t{$src, $dst|$dst, $src}", [],
3801                        IIC_SSE_MOVA_P_RR>,
3802                        VEX;
3803def VMOVDQAYrr_REV : VPDI<0x7F, MRMDestReg, (outs VR256:$dst), (ins VR256:$src),
3804                        "movdqa\t{$src, $dst|$dst, $src}", [],
3805                        IIC_SSE_MOVA_P_RR>, VEX, VEX_L;
3806def VMOVDQUrr_REV  : VSSI<0x7F, MRMDestReg, (outs VR128:$dst), (ins VR128:$src),
3807                        "movdqu\t{$src, $dst|$dst, $src}", [],
3808                        IIC_SSE_MOVU_P_RR>,
3809                        VEX;
3810def VMOVDQUYrr_REV : VSSI<0x7F, MRMDestReg, (outs VR256:$dst), (ins VR256:$src),
3811                        "movdqu\t{$src, $dst|$dst, $src}", [],
3812                        IIC_SSE_MOVU_P_RR>, VEX, VEX_L;
3813}
3814
// VEX memory-to-register loads. canFoldAsLoad/isReMaterializable let the
// allocator fold or rematerialize them; patterns are empty (selection uses
// separate Pat<> rules elsewhere).
3815let canFoldAsLoad = 1, mayLoad = 1, isReMaterializable = 1,
3816    hasSideEffects = 0, SchedRW = [WriteLoad] in {
3817def VMOVDQArm  : VPDI<0x6F, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src),
3818                   "movdqa\t{$src, $dst|$dst, $src}", [], IIC_SSE_MOVA_P_RM>,
3819                   VEX;
3820def VMOVDQAYrm : VPDI<0x6F, MRMSrcMem, (outs VR256:$dst), (ins i256mem:$src),
3821                   "movdqa\t{$src, $dst|$dst, $src}", [], IIC_SSE_MOVA_P_RM>,
3822                   VEX, VEX_L;
// movdqu loads use the raw I<> class with an explicit "vmovdqu" mnemonic
// and XS prefix instead of VSSI.
3823let Predicates = [HasAVX] in {
3824  def VMOVDQUrm  : I<0x6F, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src),
3825                    "vmovdqu\t{$src, $dst|$dst, $src}",[], IIC_SSE_MOVU_P_RM>,
3826                    XS, VEX;
3827  def VMOVDQUYrm : I<0x6F, MRMSrcMem, (outs VR256:$dst), (ins i256mem:$src),
3828                    "vmovdqu\t{$src, $dst|$dst, $src}",[], IIC_SSE_MOVU_P_RM>,
3829                    XS, VEX, VEX_L;
3830}
3831}
3832
// VEX register-to-memory stores (0F 7F), mirroring the load block above.
3833let mayStore = 1, hasSideEffects = 0, SchedRW = [WriteStore] in {
3834def VMOVDQAmr  : VPDI<0x7F, MRMDestMem, (outs),
3835                     (ins i128mem:$dst, VR128:$src),
3836                     "movdqa\t{$src, $dst|$dst, $src}", [], IIC_SSE_MOVA_P_MR>,
3837                     VEX;
3838def VMOVDQAYmr : VPDI<0x7F, MRMDestMem, (outs),
3839                     (ins i256mem:$dst, VR256:$src),
3840                     "movdqa\t{$src, $dst|$dst, $src}", [], IIC_SSE_MOVA_P_MR>,
3841                     VEX, VEX_L;
3842let Predicates = [HasAVX] in {
3843def VMOVDQUmr  : I<0x7F, MRMDestMem, (outs), (ins i128mem:$dst, VR128:$src),
3844                  "vmovdqu\t{$src, $dst|$dst, $src}",[], IIC_SSE_MOVU_P_MR>,
3845                  XS, VEX;
3846def VMOVDQUYmr : I<0x7F, MRMDestMem, (outs), (ins i256mem:$dst, VR256:$src),
3847                  "vmovdqu\t{$src, $dst|$dst, $src}",[], IIC_SSE_MOVU_P_MR>,
3848                  XS, VEX, VEX_L;
3849}
3850}
3851
// Legacy (non-VEX) SSE2 reg-reg moves plus their reversed-encoding
// disassembler-only twins.
3852let SchedRW = [WriteMove] in {
3853let hasSideEffects = 0 in
3854def MOVDQArr : PDI<0x6F, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
3855                   "movdqa\t{$src, $dst|$dst, $src}", [], IIC_SSE_MOVA_P_RR>;
3856
3857def MOVDQUrr :   I<0x6F, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
3858                   "movdqu\t{$src, $dst|$dst, $src}",
3859                   [], IIC_SSE_MOVU_P_RR>, XS, Requires<[UseSSE2]>;
3860
3861// For Disassembler
3862let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0 in {
3863def MOVDQArr_REV : PDI<0x7F, MRMDestReg, (outs VR128:$dst), (ins VR128:$src),
3864                       "movdqa\t{$src, $dst|$dst, $src}", [],
3865                       IIC_SSE_MOVA_P_RR>;
3866
3867def MOVDQUrr_REV :   I<0x7F, MRMDestReg, (outs VR128:$dst), (ins VR128:$src),
3868                       "movdqu\t{$src, $dst|$dst, $src}",
3869                       [], IIC_SSE_MOVU_P_RR>, XS, Requires<[UseSSE2]>;
3870}
3871} // SchedRW
3872
// Legacy SSE2 loads. The selection patterns are intentionally commented
// out (selection happens via separate Pat<> rules); the commented forms
// document the intended semantics: aligned v2i64 load for movdqa,
// unaligned for movdqu.
3873let canFoldAsLoad = 1, mayLoad = 1, isReMaterializable = 1,
3874    hasSideEffects = 0, SchedRW = [WriteLoad] in {
3875def MOVDQArm : PDI<0x6F, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src),
3876                   "movdqa\t{$src, $dst|$dst, $src}",
3877                   [/*(set VR128:$dst, (alignedloadv2i64 addr:$src))*/],
3878                   IIC_SSE_MOVA_P_RM>;
3879def MOVDQUrm :   I<0x6F, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src),
3880                   "movdqu\t{$src, $dst|$dst, $src}",
3881                   [/*(set VR128:$dst, (loadv2i64 addr:$src))*/],
3882                   IIC_SSE_MOVU_P_RM>,
3883                 XS, Requires<[UseSSE2]>;
3884}
3885
// Legacy SSE2 stores; patterns commented out as in the load block above.
3886let mayStore = 1, hasSideEffects = 0, SchedRW = [WriteStore] in {
3887def MOVDQAmr : PDI<0x7F, MRMDestMem, (outs), (ins i128mem:$dst, VR128:$src),
3888                   "movdqa\t{$src, $dst|$dst, $src}",
3889                   [/*(alignedstore (v2i64 VR128:$src), addr:$dst)*/],
3890                   IIC_SSE_MOVA_P_MR>;
3891def MOVDQUmr :   I<0x7F, MRMDestMem, (outs), (ins i128mem:$dst, VR128:$src),
3892                   "movdqu\t{$src, $dst|$dst, $src}",
3893                   [/*(store (v2i64 VR128:$src), addr:$dst)*/],
3894                   IIC_SSE_MOVU_P_MR>,
3895                 XS, Requires<[UseSSE2]>;
3896}
3897
3898} // ExeDomain = SSEPackedInt
3899
// Lower the unaligned-store intrinsics onto the movdqu store instructions,
// preferring VEX encodings under AVX and legacy encodings under UseSSE2.
3900let Predicates = [HasAVX] in {
3901  def : Pat<(int_x86_sse2_storeu_dq addr:$dst, VR128:$src),
3902            (VMOVDQUmr addr:$dst, VR128:$src)>;
3903  def : Pat<(int_x86_avx_storeu_dq_256 addr:$dst, VR256:$src),
3904            (VMOVDQUYmr addr:$dst, VR256:$src)>;
3905}
3906let Predicates = [UseSSE2] in
3907def : Pat<(int_x86_sse2_storeu_dq addr:$dst, VR128:$src),
3908          (MOVDQUmr addr:$dst, VR128:$src)>;
3909
3910//===---------------------------------------------------------------------===//
3911// SSE2 - Packed Integer Arithmetic Instructions
3912//===---------------------------------------------------------------------===//
3913
// Itinerary bundle for pmaddwd-style ops: same itinerary class for both
// reg-reg and reg-mem forms, scheduled as a vector integer multiply.
3914let Sched = WriteVecIMul in
3915def SSE_PMADD : OpndItins<
3916  IIC_SSE_PMADD, IIC_SSE_PMADD
3917>;
3918
3919let ExeDomain = SSEPackedInt in { // SSE integer instructions
3920
// Intrinsic-based packed-integer binop: emits a reg-reg (rr) and a
// reg-mem (rm) form. Is2Addr selects the 2-operand SSE asm string
// ($src1 tied to $dst) vs the 3-operand AVX string. The rm form wraps the
// memory operand in bitconvert so any integer vector load type matches.
3921multiclass PDI_binop_rm_int<bits<8> opc, string OpcodeStr, Intrinsic IntId,
3922                            RegisterClass RC, PatFrag memop_frag,
3923                            X86MemOperand x86memop,
3924                            OpndItins itins,
3925                            bit IsCommutable = 0,
3926                            bit Is2Addr = 1> {
3927  let isCommutable = IsCommutable in
3928  def rr : PDI<opc, MRMSrcReg, (outs RC:$dst),
3929       (ins RC:$src1, RC:$src2),
3930       !if(Is2Addr,
3931           !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
3932           !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
3933       [(set RC:$dst, (IntId RC:$src1, RC:$src2))], itins.rr>,
3934      Sched<[itins.Sched]>;
3935  def rm : PDI<opc, MRMSrcMem, (outs RC:$dst),
3936       (ins RC:$src1, x86memop:$src2),
3937       !if(Is2Addr,
3938           !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
3939           !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
3940       [(set RC:$dst, (IntId RC:$src1, (bitconvert (memop_frag addr:$src2))))],
3941       itins.rm>, Sched<[itins.Sched.Folded, ReadAfterLd]>;
3942}
3943
// Instantiates PDI_binop_rm_int three ways from one 128-bit and one 256-bit
// intrinsic: VEX 128-bit (V#NAME, 3-operand), legacy SSE (NAME, 2-operand,
// $src1 tied to $dst), and AVX2 256-bit (V#NAME#Y). VEX forms use
// unaligned loadv*, the legacy form uses aligned memopv2i64.
3944multiclass PDI_binop_all_int<bits<8> opc, string OpcodeStr, Intrinsic IntId128,
3945                             Intrinsic IntId256, OpndItins itins,
3946                             bit IsCommutable = 0> {
3947let Predicates = [HasAVX] in
3948  defm V#NAME : PDI_binop_rm_int<opc, !strconcat("v", OpcodeStr), IntId128,
3949                                 VR128, loadv2i64, i128mem, itins,
3950                                 IsCommutable, 0>, VEX_4V;
3951
3952let Constraints = "$src1 = $dst" in
3953  defm NAME : PDI_binop_rm_int<opc, OpcodeStr, IntId128, VR128, memopv2i64,
3954                               i128mem, itins, IsCommutable, 1>;
3955
3956let Predicates = [HasAVX2] in
3957  defm V#NAME#Y : PDI_binop_rm_int<opc, !strconcat("v", OpcodeStr), IntId256,
3958                                   VR256, loadv4i64, i256mem, itins,
3959                                   IsCommutable, 0>, VEX_4V, VEX_L;
3960}
3961
// Packed shift multiclass: rr/rm take the shift amount in an XMM register
// (src2 is always 128-bit, even for 256-bit destinations, matching the
// hardware encoding), while ri takes an 8-bit immediate and uses the
// second opcode/ImmForm pair with a distinct SDNode (OpNode2).
3962multiclass PDI_binop_rmi<bits<8> opc, bits<8> opc2, Format ImmForm,
3963                         string OpcodeStr, SDNode OpNode,
3964                         SDNode OpNode2, RegisterClass RC,
3965                         ValueType DstVT, ValueType SrcVT, PatFrag bc_frag,
3966                         PatFrag ld_frag, ShiftOpndItins itins,
3967                         bit Is2Addr = 1> {
3968  // src2 is always 128-bit
3969  def rr : PDI<opc, MRMSrcReg, (outs RC:$dst),
3970       (ins RC:$src1, VR128:$src2),
3971       !if(Is2Addr,
3972           !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
3973           !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
3974       [(set RC:$dst, (DstVT (OpNode RC:$src1, (SrcVT VR128:$src2))))],
3975        itins.rr>, Sched<[WriteVecShift]>;
3976  def rm : PDI<opc, MRMSrcMem, (outs RC:$dst),
3977       (ins RC:$src1, i128mem:$src2),
3978       !if(Is2Addr,
3979           !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
3980           !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
3981       [(set RC:$dst, (DstVT (OpNode RC:$src1,
3982                       (bc_frag (ld_frag addr:$src2)))))], itins.rm>,
3983      Sched<[WriteVecShiftLd, ReadAfterLd]>;
3984  def ri : PDIi8<opc2, ImmForm, (outs RC:$dst),
3985       (ins RC:$src1, u8imm:$src2),
3986       !if(Is2Addr,
3987           !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
3988           !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
3989       [(set RC:$dst, (DstVT (OpNode2 RC:$src1, (i8 imm:$src2))))], itins.ri>,
3990       Sched<[WriteVecShift]>;
3991}
3992
3993/// PDI_binop_rm2 - Simple SSE2 binary operator with different src and dst
/// types (e.g. pmuludq: v4i32 sources, v2i64 result). Note the rr/rm forms
/// here pass no itinerary to PDI (only the Sched classes from itins are
/// used), unlike PDI_binop_rm_int above.
3994multiclass PDI_binop_rm2<bits<8> opc, string OpcodeStr, SDNode OpNode,
3995                         ValueType DstVT, ValueType SrcVT, RegisterClass RC,
3996                         PatFrag memop_frag, X86MemOperand x86memop,
3997                         OpndItins itins,
3998                         bit IsCommutable = 0, bit Is2Addr = 1> {
3999  let isCommutable = IsCommutable in
4000  def rr : PDI<opc, MRMSrcReg, (outs RC:$dst),
4001       (ins RC:$src1, RC:$src2),
4002       !if(Is2Addr,
4003           !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
4004           !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
4005       [(set RC:$dst, (DstVT (OpNode (SrcVT RC:$src1), RC:$src2)))]>,
4006       Sched<[itins.Sched]>;
4007  def rm : PDI<opc, MRMSrcMem, (outs RC:$dst),
4008       (ins RC:$src1, x86memop:$src2),
4009       !if(Is2Addr,
4010           !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
4011           !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
4012       [(set RC:$dst, (DstVT (OpNode (SrcVT RC:$src1),
4013                                     (bitconvert (memop_frag addr:$src2)))))]>,
4014       Sched<[itins.Sched.Folded, ReadAfterLd]>;
4015}
4016} // ExeDomain = SSEPackedInt
4017
// Node-based packed integer arithmetic. Final bit = IsCommutable:
// add/mul/min/max are commutable, sub/subus are not. Each defm expands to
// SSE, AVX-128 and AVX2-256 variants via PDI_binop_all (defined earlier in
// this file).
4018defm PADDB   : PDI_binop_all<0xFC, "paddb", add, v16i8, v32i8,
4019                             SSE_INTALU_ITINS_P, 1>;
4020defm PADDW   : PDI_binop_all<0xFD, "paddw", add, v8i16, v16i16,
4021                             SSE_INTALU_ITINS_P, 1>;
4022defm PADDD   : PDI_binop_all<0xFE, "paddd", add, v4i32, v8i32,
4023                             SSE_INTALU_ITINS_P, 1>;
4024defm PADDQ   : PDI_binop_all<0xD4, "paddq", add, v2i64, v4i64,
4025                             SSE_INTALUQ_ITINS_P, 1>;
4026defm PMULLW  : PDI_binop_all<0xD5, "pmullw", mul, v8i16, v16i16,
4027                             SSE_INTMUL_ITINS_P, 1>;
4028defm PMULHUW : PDI_binop_all<0xE4, "pmulhuw", mulhu, v8i16, v16i16,
4029                             SSE_INTMUL_ITINS_P, 1>;
4030defm PMULHW  : PDI_binop_all<0xE5, "pmulhw", mulhs, v8i16, v16i16,
4031                             SSE_INTMUL_ITINS_P, 1>;
4032defm PSUBB   : PDI_binop_all<0xF8, "psubb", sub, v16i8, v32i8,
4033                             SSE_INTALU_ITINS_P, 0>;
4034defm PSUBW   : PDI_binop_all<0xF9, "psubw", sub, v8i16, v16i16,
4035                             SSE_INTALU_ITINS_P, 0>;
4036defm PSUBD   : PDI_binop_all<0xFA, "psubd", sub, v4i32, v8i32,
4037                             SSE_INTALU_ITINS_P, 0>;
4038defm PSUBQ   : PDI_binop_all<0xFB, "psubq", sub, v2i64, v4i64,
4039                             SSE_INTALUQ_ITINS_P, 0>;
4040defm PSUBUSB : PDI_binop_all<0xD8, "psubusb", X86subus, v16i8, v32i8,
4041                             SSE_INTALU_ITINS_P, 0>;
4042defm PSUBUSW : PDI_binop_all<0xD9, "psubusw", X86subus, v8i16, v16i16,
4043                             SSE_INTALU_ITINS_P, 0>;
4044defm PMINUB  : PDI_binop_all<0xDA, "pminub", X86umin, v16i8, v32i8,
4045                             SSE_INTALU_ITINS_P, 1>;
4046defm PMINSW  : PDI_binop_all<0xEA, "pminsw", X86smin, v8i16, v16i16,
4047                             SSE_INTALU_ITINS_P, 1>;
4048defm PMAXUB  : PDI_binop_all<0xDE, "pmaxub", X86umax, v16i8, v32i8,
4049                             SSE_INTALU_ITINS_P, 1>;
4050defm PMAXSW  : PDI_binop_all<0xEE, "pmaxsw", X86smax, v8i16, v16i16,
4051                             SSE_INTALU_ITINS_P, 1>;
4052
4053// Intrinsic forms
// Saturating add/sub, pmaddwd, pavg and psadbw have no generic SDNode, so
// they are matched through their SSE2/AVX2 intrinsics. Saturating adds are
// commutable; saturating subs are not. pmaddwd/psadbw use the PMADD
// (vector-imul) scheduling bundle.
4054defm PSUBSB  : PDI_binop_all_int<0xE8, "psubsb", int_x86_sse2_psubs_b,
4055                                 int_x86_avx2_psubs_b, SSE_INTALU_ITINS_P, 0>;
4056defm PSUBSW  : PDI_binop_all_int<0xE9, "psubsw" , int_x86_sse2_psubs_w,
4057                                 int_x86_avx2_psubs_w, SSE_INTALU_ITINS_P, 0>;
4058defm PADDSB  : PDI_binop_all_int<0xEC, "paddsb" , int_x86_sse2_padds_b,
4059                                 int_x86_avx2_padds_b, SSE_INTALU_ITINS_P, 1>;
4060defm PADDSW  : PDI_binop_all_int<0xED, "paddsw" , int_x86_sse2_padds_w,
4061                                 int_x86_avx2_padds_w, SSE_INTALU_ITINS_P, 1>;
4062defm PADDUSB : PDI_binop_all_int<0xDC, "paddusb", int_x86_sse2_paddus_b,
4063                                 int_x86_avx2_paddus_b, SSE_INTALU_ITINS_P, 1>;
4064defm PADDUSW : PDI_binop_all_int<0xDD, "paddusw", int_x86_sse2_paddus_w,
4065                                 int_x86_avx2_paddus_w, SSE_INTALU_ITINS_P, 1>;
4066defm PMADDWD : PDI_binop_all_int<0xF5, "pmaddwd", int_x86_sse2_pmadd_wd,
4067                                 int_x86_avx2_pmadd_wd, SSE_PMADD, 1>;
4068defm PAVGB   : PDI_binop_all_int<0xE0, "pavgb", int_x86_sse2_pavg_b,
4069                                 int_x86_avx2_pavg_b, SSE_INTALU_ITINS_P, 1>;
4070defm PAVGW   : PDI_binop_all_int<0xE3, "pavgw", int_x86_sse2_pavg_w,
4071                                 int_x86_avx2_pavg_w, SSE_INTALU_ITINS_P, 1>;
4072defm PSADBW  : PDI_binop_all_int<0xF6, "psadbw", int_x86_sse2_psad_bw,
4073                                 int_x86_avx2_psad_bw, SSE_PMADD, 1>;
4074
// pmuludq widens: multiplies the even 32-bit elements producing 64-bit
// results, hence the mixed Dst/Src types (v2i64/v4i32, v4i64/v8i32)
// that PDI_binop_rm2 exists for.
4075let Predicates = [HasAVX] in
4076defm VPMULUDQ : PDI_binop_rm2<0xF4, "vpmuludq", X86pmuludq, v2i64, v4i32, VR128,
4077                              loadv2i64, i128mem, SSE_INTMUL_ITINS_P, 1, 0>,
4078                              VEX_4V;
4079let Predicates = [HasAVX2] in
4080defm VPMULUDQY : PDI_binop_rm2<0xF4, "vpmuludq", X86pmuludq, v4i64, v8i32,
4081                               VR256, loadv4i64, i256mem,
4082                               SSE_INTMUL_ITINS_P, 1, 0>, VEX_4V, VEX_L;
4083let Constraints = "$src1 = $dst" in
4084defm PMULUDQ : PDI_binop_rm2<0xF4, "pmuludq", X86pmuludq, v2i64, v4i32, VR128,
4085                             memopv2i64, i128mem, SSE_INTMUL_ITINS_P, 1>;
4086
4087//===---------------------------------------------------------------------===//
4088// SSE2 - Packed Integer Logical Instructions
4089//===---------------------------------------------------------------------===//
4090
// AVX 128-bit packed shifts (left/right logical, right arithmetic) plus
// the byte-granular whole-register shifts vpslldq/vpsrldq.
4091let Predicates = [HasAVX, NoVLX] in {
4092defm VPSLLW : PDI_binop_rmi<0xF1, 0x71, MRM6r, "vpsllw", X86vshl, X86vshli,
4093                            VR128, v8i16, v8i16, bc_v8i16, loadv2i64,
4094                            SSE_INTSHIFT_ITINS_P, 0>, VEX_4V;
4095defm VPSLLD : PDI_binop_rmi<0xF2, 0x72, MRM6r, "vpslld", X86vshl, X86vshli,
4096                            VR128, v4i32, v4i32, bc_v4i32, loadv2i64,
4097                            SSE_INTSHIFT_ITINS_P, 0>, VEX_4V;
4098defm VPSLLQ : PDI_binop_rmi<0xF3, 0x73, MRM6r, "vpsllq", X86vshl, X86vshli,
4099                            VR128, v2i64, v2i64, bc_v2i64, loadv2i64,
4100                            SSE_INTSHIFT_ITINS_P, 0>, VEX_4V;
4101
4102defm VPSRLW : PDI_binop_rmi<0xD1, 0x71, MRM2r, "vpsrlw", X86vsrl, X86vsrli,
4103                            VR128, v8i16, v8i16, bc_v8i16, loadv2i64,
4104                            SSE_INTSHIFT_ITINS_P, 0>, VEX_4V;
4105defm VPSRLD : PDI_binop_rmi<0xD2, 0x72, MRM2r, "vpsrld", X86vsrl, X86vsrli,
4106                            VR128, v4i32, v4i32, bc_v4i32, loadv2i64,
4107                            SSE_INTSHIFT_ITINS_P, 0>, VEX_4V;
4108defm VPSRLQ : PDI_binop_rmi<0xD3, 0x73, MRM2r, "vpsrlq", X86vsrl, X86vsrli,
4109                            VR128, v2i64, v2i64, bc_v2i64, loadv2i64,
4110                            SSE_INTSHIFT_ITINS_P, 0>, VEX_4V;
4111
4112defm VPSRAW : PDI_binop_rmi<0xE1, 0x71, MRM4r, "vpsraw", X86vsra, X86vsrai,
4113                            VR128, v8i16, v8i16, bc_v8i16, loadv2i64,
4114                            SSE_INTSHIFT_ITINS_P, 0>, VEX_4V;
4115defm VPSRAD : PDI_binop_rmi<0xE2, 0x72, MRM4r, "vpsrad", X86vsra, X86vsrai,
4116                            VR128, v4i32, v4i32, bc_v4i32, loadv2i64,
4117                            SSE_INTSHIFT_ITINS_P, 0>, VEX_4V;
4118
// NOTE(review): unlike the AVX2 (256-bit) and legacy SSE siblings below,
// this let does not set hasSideEffects = 0 — confirm whether that is
// intentional (the patterns here may make it inferable anyway).
4119let ExeDomain = SSEPackedInt, SchedRW = [WriteVecShift] in {
4120  // 128-bit logical shifts (byte-shift of the whole register).
4121  def VPSLLDQri : PDIi8<0x73, MRM7r,
4122                    (outs VR128:$dst), (ins VR128:$src1, u8imm:$src2),
4123                    "vpslldq\t{$src2, $src1, $dst|$dst, $src1, $src2}",
4124                    [(set VR128:$dst,
4125                      (v2i64 (X86vshldq VR128:$src1, (i8 imm:$src2))))]>,
4126                    VEX_4V;
4127  def VPSRLDQri : PDIi8<0x73, MRM3r,
4128                    (outs VR128:$dst), (ins VR128:$src1, u8imm:$src2),
4129                    "vpsrldq\t{$src2, $src1, $dst|$dst, $src1, $src2}",
4130                    [(set VR128:$dst,
4131                      (v2i64 (X86vshrdq VR128:$src1, (i8 imm:$src2))))]>,
4132                    VEX_4V;
4133  // PSRADQri doesn't exist in SSE[1-3].
4134}
4135} // Predicates = [HasAVX, NoVLX]
4136
// AVX2 256-bit packed shifts. Note SrcVT stays 128-bit (v8i16/v4i32/v2i64):
// the variable shift count always comes from an XMM register.
4137let Predicates = [HasAVX2, NoVLX] in {
4138defm VPSLLWY : PDI_binop_rmi<0xF1, 0x71, MRM6r, "vpsllw", X86vshl, X86vshli,
4139                             VR256, v16i16, v8i16, bc_v8i16, loadv2i64,
4140                             SSE_INTSHIFT_ITINS_P, 0>, VEX_4V, VEX_L;
4141defm VPSLLDY : PDI_binop_rmi<0xF2, 0x72, MRM6r, "vpslld", X86vshl, X86vshli,
4142                             VR256, v8i32, v4i32, bc_v4i32, loadv2i64,
4143                             SSE_INTSHIFT_ITINS_P, 0>, VEX_4V, VEX_L;
4144defm VPSLLQY : PDI_binop_rmi<0xF3, 0x73, MRM6r, "vpsllq", X86vshl, X86vshli,
4145                             VR256, v4i64, v2i64, bc_v2i64, loadv2i64,
4146                             SSE_INTSHIFT_ITINS_P, 0>, VEX_4V, VEX_L;
4147
4148defm VPSRLWY : PDI_binop_rmi<0xD1, 0x71, MRM2r, "vpsrlw", X86vsrl, X86vsrli,
4149                             VR256, v16i16, v8i16, bc_v8i16, loadv2i64,
4150                             SSE_INTSHIFT_ITINS_P, 0>, VEX_4V, VEX_L;
4151defm VPSRLDY : PDI_binop_rmi<0xD2, 0x72, MRM2r, "vpsrld", X86vsrl, X86vsrli,
4152                             VR256, v8i32, v4i32, bc_v4i32, loadv2i64,
4153                             SSE_INTSHIFT_ITINS_P, 0>, VEX_4V, VEX_L;
4154defm VPSRLQY : PDI_binop_rmi<0xD3, 0x73, MRM2r, "vpsrlq", X86vsrl, X86vsrli,
4155                             VR256, v4i64, v2i64, bc_v2i64, loadv2i64,
4156                             SSE_INTSHIFT_ITINS_P, 0>, VEX_4V, VEX_L;
4157
4158defm VPSRAWY : PDI_binop_rmi<0xE1, 0x71, MRM4r, "vpsraw", X86vsra, X86vsrai,
4159                             VR256, v16i16, v8i16, bc_v8i16, loadv2i64,
4160                             SSE_INTSHIFT_ITINS_P, 0>, VEX_4V, VEX_L;
4161defm VPSRADY : PDI_binop_rmi<0xE2, 0x72, MRM4r, "vpsrad", X86vsra, X86vsrai,
4162                             VR256, v8i32, v4i32, bc_v4i32, loadv2i64,
4163                             SSE_INTSHIFT_ITINS_P, 0>, VEX_4V, VEX_L;
4164
4165let ExeDomain = SSEPackedInt, SchedRW = [WriteVecShift], hasSideEffects = 0 in {
4166  // 256-bit logical shifts (byte-shift of the whole register).
4167  def VPSLLDQYri : PDIi8<0x73, MRM7r,
4168                    (outs VR256:$dst), (ins VR256:$src1, u8imm:$src2),
4169                    "vpslldq\t{$src2, $src1, $dst|$dst, $src1, $src2}",
4170                    [(set VR256:$dst,
4171                      (v4i64 (X86vshldq VR256:$src1, (i8 imm:$src2))))]>,
4172                    VEX_4V, VEX_L;
4173  def VPSRLDQYri : PDIi8<0x73, MRM3r,
4174                    (outs VR256:$dst), (ins VR256:$src1, u8imm:$src2),
4175                    "vpsrldq\t{$src2, $src1, $dst|$dst, $src1, $src2}",
4176                    [(set VR256:$dst,
4177                      (v4i64 (X86vshrdq VR256:$src1, (i8 imm:$src2))))]>,
4178                    VEX_4V, VEX_L;
4179  // PSRADQYri doesn't exist in SSE[1-3].
4180}
4181} // Predicates = [HasAVX2, NoVLX]
4182
// Legacy SSE2 packed shifts: 2-address forms ($src1 tied to $dst), memory
// operands via aligned memopv2i64, default Is2Addr = 1.
4183let Constraints = "$src1 = $dst" in {
4184defm PSLLW : PDI_binop_rmi<0xF1, 0x71, MRM6r, "psllw", X86vshl, X86vshli,
4185                           VR128, v8i16, v8i16, bc_v8i16, memopv2i64,
4186                           SSE_INTSHIFT_ITINS_P>;
4187defm PSLLD : PDI_binop_rmi<0xF2, 0x72, MRM6r, "pslld", X86vshl, X86vshli,
4188                           VR128, v4i32, v4i32, bc_v4i32, memopv2i64,
4189                           SSE_INTSHIFT_ITINS_P>;
4190defm PSLLQ : PDI_binop_rmi<0xF3, 0x73, MRM6r, "psllq", X86vshl, X86vshli,
4191                           VR128, v2i64, v2i64, bc_v2i64, memopv2i64,
4192                           SSE_INTSHIFT_ITINS_P>;
4193
4194defm PSRLW : PDI_binop_rmi<0xD1, 0x71, MRM2r, "psrlw", X86vsrl, X86vsrli,
4195                           VR128, v8i16, v8i16, bc_v8i16, memopv2i64,
4196                           SSE_INTSHIFT_ITINS_P>;
4197defm PSRLD : PDI_binop_rmi<0xD2, 0x72, MRM2r, "psrld", X86vsrl, X86vsrli,
4198                           VR128, v4i32, v4i32, bc_v4i32, memopv2i64,
4199                           SSE_INTSHIFT_ITINS_P>;
4200defm PSRLQ : PDI_binop_rmi<0xD3, 0x73, MRM2r, "psrlq", X86vsrl, X86vsrli,
4201                           VR128, v2i64, v2i64, bc_v2i64, memopv2i64,
4202                           SSE_INTSHIFT_ITINS_P>;
4203
4204defm PSRAW : PDI_binop_rmi<0xE1, 0x71, MRM4r, "psraw", X86vsra, X86vsrai,
4205                           VR128, v8i16, v8i16, bc_v8i16, memopv2i64,
4206                           SSE_INTSHIFT_ITINS_P>;
4207defm PSRAD : PDI_binop_rmi<0xE2, 0x72, MRM4r, "psrad", X86vsra, X86vsrai,
4208                           VR128, v4i32, v4i32, bc_v4i32, memopv2i64,
4209                           SSE_INTSHIFT_ITINS_P>;
4210
4211let ExeDomain = SSEPackedInt, SchedRW = [WriteVecShift], hasSideEffects = 0 in {
4212  // 128-bit logical shifts (byte-shift of the whole register).
4213  def PSLLDQri : PDIi8<0x73, MRM7r,
4214                       (outs VR128:$dst), (ins VR128:$src1, u8imm:$src2),
4215                       "pslldq\t{$src2, $dst|$dst, $src2}",
4216                       [(set VR128:$dst,
4217                         (v2i64 (X86vshldq VR128:$src1, (i8 imm:$src2))))],
4218                       IIC_SSE_INTSHDQ_P_RI>;
4219  def PSRLDQri : PDIi8<0x73, MRM3r,
4220                       (outs VR128:$dst), (ins VR128:$src1, u8imm:$src2),
4221                       "psrldq\t{$src2, $dst|$dst, $src2}",
4222                       [(set VR128:$dst,
4223                         (v2i64 (X86vshrdq VR128:$src1, (i8 imm:$src2))))],
4224                       IIC_SSE_INTSHDQ_P_RI>;
4225  // PSRADQri doesn't exist in SSE[1-3].
4226}
4227} // Constraints = "$src1 = $dst"
4228
// Lower the f64-domain byte-shift-right node onto psrldq; BYTE_imm
// converts the bit count in the node to the byte count the instruction
// expects.
4229let Predicates = [HasAVX] in {
4230  def : Pat<(v2f64 (X86fsrl VR128:$src1, i32immSExt8:$src2)),
4231            (VPSRLDQri VR128:$src1, (BYTE_imm imm:$src2))>;
4232}
4233
4234let Predicates = [UseSSE2] in {
4235  def : Pat<(v2f64 (X86fsrl VR128:$src1, i32immSExt8:$src2)),
4236            (PSRLDQri VR128:$src1, (BYTE_imm imm:$src2))>;
4237}
4238
4239//===---------------------------------------------------------------------===//
4240// SSE2 - Packed Integer Comparison Instructions
4241//===---------------------------------------------------------------------===//
4242
// Packed integer compares. Equality is commutable; signed greater-than is
// not (there is no pcmplt encoding — operand order matters).
4243defm PCMPEQB : PDI_binop_all<0x74, "pcmpeqb", X86pcmpeq, v16i8, v32i8,
4244                             SSE_INTALU_ITINS_P, 1>;
4245defm PCMPEQW : PDI_binop_all<0x75, "pcmpeqw", X86pcmpeq, v8i16, v16i16,
4246                             SSE_INTALU_ITINS_P, 1>;
4247defm PCMPEQD : PDI_binop_all<0x76, "pcmpeqd", X86pcmpeq, v4i32, v8i32,
4248                             SSE_INTALU_ITINS_P, 1>;
4249defm PCMPGTB : PDI_binop_all<0x64, "pcmpgtb", X86pcmpgt, v16i8, v32i8,
4250                             SSE_INTALU_ITINS_P, 0>;
4251defm PCMPGTW : PDI_binop_all<0x65, "pcmpgtw", X86pcmpgt, v8i16, v16i16,
4252                             SSE_INTALU_ITINS_P, 0>;
4253defm PCMPGTD : PDI_binop_all<0x66, "pcmpgtd", X86pcmpgt, v4i32, v8i32,
4254                             SSE_INTALU_ITINS_P, 0>;
4255
4256//===---------------------------------------------------------------------===//
4257// SSE2 - Packed Integer Shuffle Instructions
4258//===---------------------------------------------------------------------===//
4259
4260let ExeDomain = SSEPackedInt in {
// Immediate-controlled shuffle (opcode 0F 70): expands to AVX 128-bit
// (V#NAME ri/mi), AVX2 256-bit (V#NAME Y ri/mi) and legacy SSE (ri/mi)
// forms. The instantiation supplies the prefix (PD/XS/XD) that
// distinguishes pshufd/pshufhw/pshuflw.
4261multiclass sse2_pshuffle<string OpcodeStr, ValueType vt128, ValueType vt256,
4262                         SDNode OpNode> {
4263let Predicates = [HasAVX] in {
4264  def V#NAME#ri : Ii8<0x70, MRMSrcReg, (outs VR128:$dst),
4265                      (ins VR128:$src1, u8imm:$src2),
4266                      !strconcat("v", OpcodeStr,
4267                                 "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
4268                      [(set VR128:$dst,
4269                        (vt128 (OpNode VR128:$src1, (i8 imm:$src2))))],
4270                      IIC_SSE_PSHUF_RI>, VEX, Sched<[WriteShuffle]>;
4271  def V#NAME#mi : Ii8<0x70, MRMSrcMem, (outs VR128:$dst),
4272                      (ins i128mem:$src1, u8imm:$src2),
4273                      !strconcat("v", OpcodeStr,
4274                                 "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
4275                     [(set VR128:$dst,
4276                       (vt128 (OpNode (bitconvert (loadv2i64 addr:$src1)),
4277                        (i8 imm:$src2))))], IIC_SSE_PSHUF_MI>, VEX,
4278                  Sched<[WriteShuffleLd]>;
4279}
4280
4281let Predicates = [HasAVX2] in {
4282  def V#NAME#Yri : Ii8<0x70, MRMSrcReg, (outs VR256:$dst),
4283                       (ins VR256:$src1, u8imm:$src2),
4284                       !strconcat("v", OpcodeStr,
4285                                  "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
4286                       [(set VR256:$dst,
4287                         (vt256 (OpNode VR256:$src1, (i8 imm:$src2))))],
4288                       IIC_SSE_PSHUF_RI>, VEX, VEX_L, Sched<[WriteShuffle]>;
4289  def V#NAME#Ymi : Ii8<0x70, MRMSrcMem, (outs VR256:$dst),
4290                       (ins i256mem:$src1, u8imm:$src2),
4291                       !strconcat("v", OpcodeStr,
4292                                  "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
4293                      [(set VR256:$dst,
4294                        (vt256 (OpNode (bitconvert (loadv4i64 addr:$src1)),
4295                         (i8 imm:$src2))))], IIC_SSE_PSHUF_MI>, VEX, VEX_L,
4296                   Sched<[WriteShuffleLd]>;
4297}
4298
// Legacy SSE forms; memory form uses aligned memopv2i64 (VEX forms above
// use the unaligned loadv* fragments).
4299let Predicates = [UseSSE2] in {
4300  def ri : Ii8<0x70, MRMSrcReg,
4301               (outs VR128:$dst), (ins VR128:$src1, u8imm:$src2),
4302               !strconcat(OpcodeStr,
4303                          "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
4304                [(set VR128:$dst,
4305                  (vt128 (OpNode VR128:$src1, (i8 imm:$src2))))],
4306                IIC_SSE_PSHUF_RI>, Sched<[WriteShuffle]>;
4307  def mi : Ii8<0x70, MRMSrcMem,
4308               (outs VR128:$dst), (ins i128mem:$src1, u8imm:$src2),
4309               !strconcat(OpcodeStr,
4310                          "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
4311                [(set VR128:$dst,
4312                  (vt128 (OpNode (bitconvert (memopv2i64 addr:$src1)),
4313                          (i8 imm:$src2))))], IIC_SSE_PSHUF_MI>,
4314           Sched<[WriteShuffleLd, ReadAfterLd]>;
4315}
4316}
4317} // ExeDomain = SSEPackedInt
4318
// Instantiations: PSHUFD shuffles dwords (66 prefix), PSHUFHW shuffles the
// high four words (F3 prefix), PSHUFLW shuffles the low four words (F2).
defm PSHUFD  : sse2_pshuffle<"pshufd", v4i32, v8i32, X86PShufd>, PD;
defm PSHUFHW : sse2_pshuffle<"pshufhw", v8i16, v16i16, X86PShufhw>, XS;
defm PSHUFLW : sse2_pshuffle<"pshuflw", v8i16, v16i16, X86PShuflw>, XD;
4322
// Reuse the integer VPSHUFD for immediate shuffles of v4f32 values.
let Predicates = [HasAVX] in {
  def : Pat<(v4f32 (X86PShufd (loadv4f32 addr:$src1), (i8 imm:$imm))),
            (VPSHUFDmi addr:$src1, imm:$imm)>;
  def : Pat<(v4f32 (X86PShufd VR128:$src1, (i8 imm:$imm))),
            (VPSHUFDri VR128:$src1, imm:$imm)>;
}
4329
// Same v4f32 reuse for the legacy PSHUFD encoding.
let Predicates = [UseSSE2] in {
  def : Pat<(v4f32 (X86PShufd (memopv4f32 addr:$src1), (i8 imm:$imm))),
            (PSHUFDmi addr:$src1, imm:$imm)>;
  def : Pat<(v4f32 (X86PShufd VR128:$src1, (i8 imm:$imm))),
            (PSHUFDri VR128:$src1, imm:$imm)>;
}
4336
4337//===---------------------------------------------------------------------===//
4338// Packed Integer Pack Instructions (SSE & AVX)
4339//===---------------------------------------------------------------------===//
4340
4341let ExeDomain = SSEPackedInt in {
// Pack two ArgVT vectors into one OutVT vector with saturation
// (PACKSSWB/PACKSSDW/PACKUSWB).  Is2Addr = 1 emits the legacy two-operand
// asm string; 0 emits the three-operand VEX string.
multiclass sse2_pack<bits<8> opc, string OpcodeStr, ValueType OutVT,
                     ValueType ArgVT, SDNode OpNode, PatFrag bc_frag,
                     PatFrag ld_frag, bit Is2Addr = 1> {
  // Register-register form.
  def rr : PDI<opc, MRMSrcReg,
               (outs VR128:$dst), (ins VR128:$src1, VR128:$src2),
               !if(Is2Addr,
                   !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
                   !strconcat(OpcodeStr,
                              "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
               [(set VR128:$dst,
                     (OutVT (OpNode (ArgVT VR128:$src1), VR128:$src2)))]>,
               Sched<[WriteShuffle]>;
  // Register-memory form; $src2 is loaded via ld_frag and bitcast to ArgVT
  // through bc_frag.
  def rm : PDI<opc, MRMSrcMem,
               (outs VR128:$dst), (ins VR128:$src1, i128mem:$src2),
               !if(Is2Addr,
                   !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
                   !strconcat(OpcodeStr,
                              "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
               [(set VR128:$dst,
                     (OutVT (OpNode VR128:$src1,
                                    (bc_frag (ld_frag addr:$src2)))))]>,
               Sched<[WriteShuffleLd, ReadAfterLd]>;
}
4365
// 256-bit (AVX2) variant of sse2_pack; always three-operand VEX syntax.
multiclass sse2_pack_y<bits<8> opc, string OpcodeStr, ValueType OutVT,
                       ValueType ArgVT, SDNode OpNode, PatFrag bc_frag> {
  def Yrr : PDI<opc, MRMSrcReg,
                (outs VR256:$dst), (ins VR256:$src1, VR256:$src2),
                !strconcat(OpcodeStr,
                           "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
                [(set VR256:$dst,
                      (OutVT (OpNode (ArgVT VR256:$src1), VR256:$src2)))]>,
                Sched<[WriteShuffle]>;
  // Memory form: load is v4i64, bitcast to ArgVT via bc_frag.
  def Yrm : PDI<opc, MRMSrcMem,
                (outs VR256:$dst), (ins VR256:$src1, i256mem:$src2),
                !strconcat(OpcodeStr,
                           "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
                [(set VR256:$dst,
                      (OutVT (OpNode VR256:$src1,
                                     (bc_frag (loadv4i64 addr:$src2)))))]>,
                Sched<[WriteShuffleLd, ReadAfterLd]>;
}
4384
// Same as sse2_pack but using the SS48I (SSE4.1, 0F 38 map) base class --
// used for PACKUSDW.
multiclass sse4_pack<bits<8> opc, string OpcodeStr, ValueType OutVT,
                     ValueType ArgVT, SDNode OpNode, PatFrag bc_frag,
                     PatFrag ld_frag, bit Is2Addr = 1> {
  // Register-register form.
  def rr : SS48I<opc, MRMSrcReg,
                 (outs VR128:$dst), (ins VR128:$src1, VR128:$src2),
                 !if(Is2Addr,
                     !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
                     !strconcat(OpcodeStr,
                                "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
                 [(set VR128:$dst,
                       (OutVT (OpNode (ArgVT VR128:$src1), VR128:$src2)))]>,
                 Sched<[WriteShuffle]>;
  // Register-memory form.
  def rm : SS48I<opc, MRMSrcMem,
                 (outs VR128:$dst), (ins VR128:$src1, i128mem:$src2),
                 !if(Is2Addr,
                     !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
                     !strconcat(OpcodeStr,
                                "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
                 [(set VR128:$dst,
                       (OutVT (OpNode VR128:$src1,
                                      (bc_frag (ld_frag addr:$src2)))))]>,
                 Sched<[WriteShuffleLd, ReadAfterLd]>;
}
4408
// 256-bit (AVX2) variant of sse4_pack; always three-operand VEX syntax.
multiclass sse4_pack_y<bits<8> opc, string OpcodeStr, ValueType OutVT,
                     ValueType ArgVT, SDNode OpNode, PatFrag bc_frag> {
  def Yrr : SS48I<opc, MRMSrcReg,
                  (outs VR256:$dst), (ins VR256:$src1, VR256:$src2),
                  !strconcat(OpcodeStr,
                             "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
                  [(set VR256:$dst,
                        (OutVT (OpNode (ArgVT VR256:$src1), VR256:$src2)))]>,
                  Sched<[WriteShuffle]>;
  // Memory form: load is v4i64, bitcast to ArgVT via bc_frag.
  def Yrm : SS48I<opc, MRMSrcMem,
                  (outs VR256:$dst), (ins VR256:$src1, i256mem:$src2),
                  !strconcat(OpcodeStr,
                             "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
                  [(set VR256:$dst,
                        (OutVT (OpNode VR256:$src1,
                                       (bc_frag (loadv4i64 addr:$src2)))))]>,
                  Sched<[WriteShuffleLd, ReadAfterLd]>;
}
4427
// VEX-encoded 128-bit packs.  Is2Addr = 0 selects the three-operand asm
// string; loadv2i64 allows unaligned folded loads under AVX.
let Predicates = [HasAVX] in {
  defm VPACKSSWB : sse2_pack<0x63, "vpacksswb", v16i8, v8i16, X86Packss,
                             bc_v8i16, loadv2i64, 0>, VEX_4V;
  defm VPACKSSDW : sse2_pack<0x6B, "vpackssdw", v8i16, v4i32, X86Packss,
                             bc_v4i32, loadv2i64, 0>, VEX_4V;

  defm VPACKUSWB : sse2_pack<0x67, "vpackuswb", v16i8, v8i16, X86Packus,
                             bc_v8i16, loadv2i64, 0>, VEX_4V;
  defm VPACKUSDW : sse4_pack<0x2B, "vpackusdw", v8i16, v4i32, X86Packus,
                             bc_v4i32, loadv2i64, 0>, VEX_4V;
}
4439
// AVX2 256-bit packs.
let Predicates = [HasAVX2] in {
  defm VPACKSSWB : sse2_pack_y<0x63, "vpacksswb", v32i8, v16i16, X86Packss,
                               bc_v16i16>, VEX_4V, VEX_L;
  defm VPACKSSDW : sse2_pack_y<0x6B, "vpackssdw", v16i16, v8i32, X86Packss,
                               bc_v8i32>, VEX_4V, VEX_L;

  defm VPACKUSWB : sse2_pack_y<0x67, "vpackuswb", v32i8, v16i16, X86Packus,
                               bc_v16i16>, VEX_4V, VEX_L;
  defm VPACKUSDW : sse4_pack_y<0x2B, "vpackusdw", v16i16, v8i32, X86Packus,
                               bc_v8i32>, VEX_4V, VEX_L;
}
4451
// Legacy two-operand forms; memopv2i64 enforces SSE alignment on the folded
// load.
let Constraints = "$src1 = $dst" in {
  defm PACKSSWB : sse2_pack<0x63, "packsswb", v16i8, v8i16, X86Packss,
                            bc_v8i16, memopv2i64>;
  defm PACKSSDW : sse2_pack<0x6B, "packssdw", v8i16, v4i32, X86Packss,
                            bc_v4i32, memopv2i64>;

  defm PACKUSWB : sse2_pack<0x67, "packuswb", v16i8, v8i16, X86Packus,
                            bc_v8i16, memopv2i64>;

  // PACKUSDW is SSE4.1.  Use UseSSE41 (not HasSSE41) so this legacy
  // encoding is not selected when AVX is available -- VPACKUSDW above
  // handles that case -- matching the UseSSE2/UseSSE41 convention used for
  // every other legacy form in this file.
  let Predicates = [UseSSE41] in
  defm PACKUSDW : sse4_pack<0x2B, "packusdw", v8i16, v4i32, X86Packus,
                            bc_v4i32, memopv2i64>;
}
4465} // ExeDomain = SSEPackedInt
4466
4467//===---------------------------------------------------------------------===//
4468// SSE2 - Packed Integer Unpack Instructions
4469//===---------------------------------------------------------------------===//
4470
4471let ExeDomain = SSEPackedInt in {
// Interleave (unpack) the low or high halves of two vectors
// (PUNPCK{L,H}{BW,WD,DQ,QDQ}).  Is2Addr selects legacy vs. VEX asm strings.
multiclass sse2_unpack<bits<8> opc, string OpcodeStr, ValueType vt,
                       SDNode OpNode, PatFrag bc_frag, PatFrag ld_frag,
                       bit Is2Addr = 1> {
  // Register-register form.
  def rr : PDI<opc, MRMSrcReg,
      (outs VR128:$dst), (ins VR128:$src1, VR128:$src2),
      !if(Is2Addr,
          !strconcat(OpcodeStr,"\t{$src2, $dst|$dst, $src2}"),
          !strconcat(OpcodeStr,"\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
      [(set VR128:$dst, (vt (OpNode VR128:$src1, VR128:$src2)))],
      IIC_SSE_UNPCK>, Sched<[WriteShuffle]>;
  // Register-memory form; the load is bitcast to vt via bc_frag.
  def rm : PDI<opc, MRMSrcMem,
      (outs VR128:$dst), (ins VR128:$src1, i128mem:$src2),
      !if(Is2Addr,
          !strconcat(OpcodeStr,"\t{$src2, $dst|$dst, $src2}"),
          !strconcat(OpcodeStr,"\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
      [(set VR128:$dst, (OpNode VR128:$src1,
                                  (bc_frag (ld_frag addr:$src2))))],
                                               IIC_SSE_UNPCK>,
      Sched<[WriteShuffleLd, ReadAfterLd]>;
}
4492
// 256-bit (AVX2) variant of sse2_unpack; always three-operand VEX syntax.
multiclass sse2_unpack_y<bits<8> opc, string OpcodeStr, ValueType vt,
                         SDNode OpNode, PatFrag bc_frag> {
  def Yrr : PDI<opc, MRMSrcReg,
      (outs VR256:$dst), (ins VR256:$src1, VR256:$src2),
      !strconcat(OpcodeStr,"\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
      [(set VR256:$dst, (vt (OpNode VR256:$src1, VR256:$src2)))]>,
      Sched<[WriteShuffle]>;
  // Memory form: load is v4i64, bitcast to vt via bc_frag.
  def Yrm : PDI<opc, MRMSrcMem,
      (outs VR256:$dst), (ins VR256:$src1, i256mem:$src2),
      !strconcat(OpcodeStr,"\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
      [(set VR256:$dst, (OpNode VR256:$src1,
                                  (bc_frag (loadv4i64 addr:$src2))))]>,
      Sched<[WriteShuffleLd, ReadAfterLd]>;
}
4507
// VEX-encoded 128-bit unpacks (three-operand, unaligned loads allowed).
let Predicates = [HasAVX] in {
  defm VPUNPCKLBW  : sse2_unpack<0x60, "vpunpcklbw", v16i8, X86Unpckl,
                                 bc_v16i8, loadv2i64, 0>, VEX_4V;
  defm VPUNPCKLWD  : sse2_unpack<0x61, "vpunpcklwd", v8i16, X86Unpckl,
                                 bc_v8i16, loadv2i64, 0>, VEX_4V;
  defm VPUNPCKLDQ  : sse2_unpack<0x62, "vpunpckldq", v4i32, X86Unpckl,
                                 bc_v4i32, loadv2i64, 0>, VEX_4V;
  defm VPUNPCKLQDQ : sse2_unpack<0x6C, "vpunpcklqdq", v2i64, X86Unpckl,
                                 bc_v2i64, loadv2i64, 0>, VEX_4V;

  defm VPUNPCKHBW  : sse2_unpack<0x68, "vpunpckhbw", v16i8, X86Unpckh,
                                 bc_v16i8, loadv2i64, 0>, VEX_4V;
  defm VPUNPCKHWD  : sse2_unpack<0x69, "vpunpckhwd", v8i16, X86Unpckh,
                                 bc_v8i16, loadv2i64, 0>, VEX_4V;
  defm VPUNPCKHDQ  : sse2_unpack<0x6A, "vpunpckhdq", v4i32, X86Unpckh,
                                 bc_v4i32, loadv2i64, 0>, VEX_4V;
  defm VPUNPCKHQDQ : sse2_unpack<0x6D, "vpunpckhqdq", v2i64, X86Unpckh,
                                 bc_v2i64, loadv2i64, 0>, VEX_4V;
}
4527
// AVX2 256-bit unpacks.
let Predicates = [HasAVX2] in {
  defm VPUNPCKLBW  : sse2_unpack_y<0x60, "vpunpcklbw", v32i8, X86Unpckl,
                                   bc_v32i8>, VEX_4V, VEX_L;
  defm VPUNPCKLWD  : sse2_unpack_y<0x61, "vpunpcklwd", v16i16, X86Unpckl,
                                   bc_v16i16>, VEX_4V, VEX_L;
  defm VPUNPCKLDQ  : sse2_unpack_y<0x62, "vpunpckldq", v8i32, X86Unpckl,
                                   bc_v8i32>, VEX_4V, VEX_L;
  defm VPUNPCKLQDQ : sse2_unpack_y<0x6C, "vpunpcklqdq", v4i64, X86Unpckl,
                                   bc_v4i64>, VEX_4V, VEX_L;

  defm VPUNPCKHBW  : sse2_unpack_y<0x68, "vpunpckhbw", v32i8, X86Unpckh,
                                   bc_v32i8>, VEX_4V, VEX_L;
  defm VPUNPCKHWD  : sse2_unpack_y<0x69, "vpunpckhwd", v16i16, X86Unpckh,
                                   bc_v16i16>, VEX_4V, VEX_L;
  defm VPUNPCKHDQ  : sse2_unpack_y<0x6A, "vpunpckhdq", v8i32, X86Unpckh,
                                   bc_v8i32>, VEX_4V, VEX_L;
  defm VPUNPCKHQDQ : sse2_unpack_y<0x6D, "vpunpckhqdq", v4i64, X86Unpckh,
                                   bc_v4i64>, VEX_4V, VEX_L;
}
4547
// Legacy two-operand unpacks; memopv2i64 enforces SSE alignment.
let Constraints = "$src1 = $dst" in {
  defm PUNPCKLBW  : sse2_unpack<0x60, "punpcklbw", v16i8, X86Unpckl,
                                bc_v16i8, memopv2i64>;
  defm PUNPCKLWD  : sse2_unpack<0x61, "punpcklwd", v8i16, X86Unpckl,
                                bc_v8i16, memopv2i64>;
  defm PUNPCKLDQ  : sse2_unpack<0x62, "punpckldq", v4i32, X86Unpckl,
                                bc_v4i32, memopv2i64>;
  defm PUNPCKLQDQ : sse2_unpack<0x6C, "punpcklqdq", v2i64, X86Unpckl,
                                bc_v2i64, memopv2i64>;

  defm PUNPCKHBW  : sse2_unpack<0x68, "punpckhbw", v16i8, X86Unpckh,
                                bc_v16i8, memopv2i64>;
  defm PUNPCKHWD  : sse2_unpack<0x69, "punpckhwd", v8i16, X86Unpckh,
                                bc_v8i16, memopv2i64>;
  defm PUNPCKHDQ  : sse2_unpack<0x6A, "punpckhdq", v4i32, X86Unpckh,
                                bc_v4i32, memopv2i64>;
  defm PUNPCKHQDQ : sse2_unpack<0x6D, "punpckhqdq", v2i64, X86Unpckh,
                                bc_v2i64, memopv2i64>;
}
4567} // ExeDomain = SSEPackedInt
4568
4569//===---------------------------------------------------------------------===//
4570// SSE2 - Packed Integer Extract and Insert
4571//===---------------------------------------------------------------------===//
4572
4573let ExeDomain = SSEPackedInt in {
// PINSRW: insert a 16-bit value (from a GPR or a 16-bit load) into the word
// slot of an XMM register selected by the immediate.
multiclass sse2_pinsrw<bit Is2Addr = 1> {
  // GPR source form.
  def rri : Ii8<0xC4, MRMSrcReg,
       (outs VR128:$dst), (ins VR128:$src1,
        GR32orGR64:$src2, u8imm:$src3),
       !if(Is2Addr,
           "pinsrw\t{$src3, $src2, $dst|$dst, $src2, $src3}",
           "vpinsrw\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
       [(set VR128:$dst,
         (X86pinsrw VR128:$src1, GR32orGR64:$src2, imm:$src3))],
       IIC_SSE_PINSRW>, Sched<[WriteShuffle]>;
  // Memory source form (anyext 16-bit load).
  def rmi : Ii8<0xC4, MRMSrcMem,
                       (outs VR128:$dst), (ins VR128:$src1,
                        i16mem:$src2, u8imm:$src3),
       !if(Is2Addr,
           "pinsrw\t{$src3, $src2, $dst|$dst, $src2, $src3}",
           "vpinsrw\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
       [(set VR128:$dst,
         (X86pinsrw VR128:$src1, (extloadi16 addr:$src2),
                    imm:$src3))], IIC_SSE_PINSRW>,
       Sched<[WriteShuffleLd, ReadAfterLd]>;
}
4595
4596// Extract
// Extract the word selected by the immediate into a GPR (VEX encoding).
let Predicates = [HasAVX] in
def VPEXTRWri : Ii8<0xC5, MRMSrcReg,
                    (outs GR32orGR64:$dst), (ins VR128:$src1, u8imm:$src2),
                    "vpextrw\t{$src2, $src1, $dst|$dst, $src1, $src2}",
                    [(set GR32orGR64:$dst, (X86pextrw (v8i16 VR128:$src1),
                                            imm:$src2))]>, PD, VEX,
                Sched<[WriteShuffle]>;
// Legacy encoding of the word extract.  This is a register-only form
// (MRMSrcReg, no load), so it is scheduled as WriteShuffle like VPEXTRWri
// above; the previous WriteShuffleLd/ReadAfterLd pairing described a
// folded-load operation this instruction does not perform.
def PEXTRWri : PDIi8<0xC5, MRMSrcReg,
                    (outs GR32orGR64:$dst), (ins VR128:$src1, u8imm:$src2),
                    "pextrw\t{$src2, $src1, $dst|$dst, $src1, $src2}",
                    [(set GR32orGR64:$dst, (X86pextrw (v8i16 VR128:$src1),
                                            imm:$src2))], IIC_SSE_PEXTRW>,
               Sched<[WriteShuffle]>;
4610
// Insert
// VEX form is three-operand (Is2Addr = 0); legacy form ties $src1 to $dst.
let Predicates = [HasAVX] in
defm VPINSRW : sse2_pinsrw<0>, PD, VEX_4V;

let Predicates = [UseSSE2], Constraints = "$src1 = $dst" in
defm PINSRW : sse2_pinsrw, PD;
4617
4618} // ExeDomain = SSEPackedInt
4619
4620//===---------------------------------------------------------------------===//
4621// SSE2 - Packed Mask Creation
4622//===---------------------------------------------------------------------===//
4623
4624let ExeDomain = SSEPackedInt, SchedRW = [WriteVecLogic] in {
4625
// PMOVMSKB: gather the sign bit of each byte into a GPR bitmask.
def VPMOVMSKBrr  : VPDI<0xD7, MRMSrcReg, (outs GR32orGR64:$dst),
           (ins VR128:$src),
           "pmovmskb\t{$src, $dst|$dst, $src}",
           [(set GR32orGR64:$dst, (int_x86_sse2_pmovmskb_128 VR128:$src))],
           IIC_SSE_MOVMSK>, VEX;

// 256-bit AVX2 form (32-bit mask).
let Predicates = [HasAVX2] in {
def VPMOVMSKBYrr  : VPDI<0xD7, MRMSrcReg, (outs GR32orGR64:$dst),
           (ins VR256:$src),
           "pmovmskb\t{$src, $dst|$dst, $src}",
           [(set GR32orGR64:$dst, (int_x86_avx2_pmovmskb VR256:$src))]>,
           VEX, VEX_L;
}

// Legacy SSE2 encoding.
def PMOVMSKBrr : PDI<0xD7, MRMSrcReg, (outs GR32orGR64:$dst), (ins VR128:$src),
           "pmovmskb\t{$src, $dst|$dst, $src}",
           [(set GR32orGR64:$dst, (int_x86_sse2_pmovmskb_128 VR128:$src))],
           IIC_SSE_MOVMSK>;
4644
4645} // ExeDomain = SSEPackedInt
4646
4647//===---------------------------------------------------------------------===//
4648// SSE2 - Conditional Store
4649//===---------------------------------------------------------------------===//
4650
4651let ExeDomain = SSEPackedInt, SchedRW = [WriteStore] in {
4652
// MASKMOVDQU: byte-masked store to the address implicitly held in EDI
// (32-bit mode) or RDI (64-bit mode), hence the separate defs per mode.
let Uses = [EDI], Predicates = [HasAVX,Not64BitMode] in
def VMASKMOVDQU : VPDI<0xF7, MRMSrcReg, (outs),
           (ins VR128:$src, VR128:$mask),
           "maskmovdqu\t{$mask, $src|$src, $mask}",
           [(int_x86_sse2_maskmov_dqu VR128:$src, VR128:$mask, EDI)],
           IIC_SSE_MASKMOV>, VEX;
let Uses = [RDI], Predicates = [HasAVX,In64BitMode] in
def VMASKMOVDQU64 : VPDI<0xF7, MRMSrcReg, (outs),
           (ins VR128:$src, VR128:$mask),
           "maskmovdqu\t{$mask, $src|$src, $mask}",
           [(int_x86_sse2_maskmov_dqu VR128:$src, VR128:$mask, RDI)],
           IIC_SSE_MASKMOV>, VEX;

// Legacy SSE2 encodings.
let Uses = [EDI], Predicates = [UseSSE2,Not64BitMode] in
def MASKMOVDQU : PDI<0xF7, MRMSrcReg, (outs), (ins VR128:$src, VR128:$mask),
           "maskmovdqu\t{$mask, $src|$src, $mask}",
           [(int_x86_sse2_maskmov_dqu VR128:$src, VR128:$mask, EDI)],
           IIC_SSE_MASKMOV>;
let Uses = [RDI], Predicates = [UseSSE2,In64BitMode] in
def MASKMOVDQU64 : PDI<0xF7, MRMSrcReg, (outs), (ins VR128:$src, VR128:$mask),
           "maskmovdqu\t{$mask, $src|$src, $mask}",
           [(int_x86_sse2_maskmov_dqu VR128:$src, VR128:$mask, RDI)],
           IIC_SSE_MASKMOV>;
4676
4677} // ExeDomain = SSEPackedInt
4678
4679//===---------------------------------------------------------------------===//
4680// SSE2 - Move Doubleword
4681//===---------------------------------------------------------------------===//
4682
4683//===---------------------------------------------------------------------===//
4684// Move Int Doubleword to Packed Double Int
4685//
// VEX-encoded movd/movq from a GPR or memory into the low element of an XMM
// register (scalar_to_vector).
def VMOVDI2PDIrr : VS2I<0x6E, MRMSrcReg, (outs VR128:$dst), (ins GR32:$src),
                      "movd\t{$src, $dst|$dst, $src}",
                      [(set VR128:$dst,
                        (v4i32 (scalar_to_vector GR32:$src)))], IIC_SSE_MOVDQ>,
                        VEX, Sched<[WriteMove]>;
def VMOVDI2PDIrm : VS2I<0x6E, MRMSrcMem, (outs VR128:$dst), (ins i32mem:$src),
                      "movd\t{$src, $dst|$dst, $src}",
                      [(set VR128:$dst,
                        (v4i32 (scalar_to_vector (loadi32 addr:$src))))],
                        IIC_SSE_MOVDQ>,
                      VEX, Sched<[WriteLoad]>;
// 64-bit GPR -> XMM (REX.W / VEX.W form).
def VMOV64toPQIrr : VRS2I<0x6E, MRMSrcReg, (outs VR128:$dst), (ins GR64:$src),
                        "movq\t{$src, $dst|$dst, $src}",
                        [(set VR128:$dst,
                          (v2i64 (scalar_to_vector GR64:$src)))],
                          IIC_SSE_MOVDQ>, VEX, Sched<[WriteMove]>;
// Disassembly-only memory form (no isel pattern).
let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0, mayLoad = 1 in
def VMOV64toPQIrm : VRS2I<0x6E, MRMSrcMem, (outs VR128:$dst), (ins i64mem:$src),
                        "movq\t{$src, $dst|$dst, $src}",
                        [], IIC_SSE_MOVDQ>, VEX, Sched<[WriteLoad]>;
// GR64 -> FR64 bitcast via the same encoding.
let isCodeGenOnly = 1 in
def VMOV64toSDrr : VRS2I<0x6E, MRMSrcReg, (outs FR64:$dst), (ins GR64:$src),
                       "movq\t{$src, $dst|$dst, $src}",
                       [(set FR64:$dst, (bitconvert GR64:$src))],
                       IIC_SSE_MOVDQ>, VEX, Sched<[WriteMove]>;
4711
// Legacy SSE2 counterparts of the defs above.
def MOVDI2PDIrr : S2I<0x6E, MRMSrcReg, (outs VR128:$dst), (ins GR32:$src),
                      "movd\t{$src, $dst|$dst, $src}",
                      [(set VR128:$dst,
                        (v4i32 (scalar_to_vector GR32:$src)))], IIC_SSE_MOVDQ>,
                  Sched<[WriteMove]>;
def MOVDI2PDIrm : S2I<0x6E, MRMSrcMem, (outs VR128:$dst), (ins i32mem:$src),
                      "movd\t{$src, $dst|$dst, $src}",
                      [(set VR128:$dst,
                        (v4i32 (scalar_to_vector (loadi32 addr:$src))))],
                        IIC_SSE_MOVDQ>, Sched<[WriteLoad]>;
// 64-bit GPR -> XMM (REX.W form).
def MOV64toPQIrr : RS2I<0x6E, MRMSrcReg, (outs VR128:$dst), (ins GR64:$src),
                        "mov{d|q}\t{$src, $dst|$dst, $src}",
                        [(set VR128:$dst,
                          (v2i64 (scalar_to_vector GR64:$src)))],
                          IIC_SSE_MOVDQ>, Sched<[WriteMove]>;
// Disassembly-only memory form (no isel pattern).
let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0, mayLoad = 1 in
def MOV64toPQIrm : RS2I<0x6E, MRMSrcMem, (outs VR128:$dst), (ins i64mem:$src),
                        "mov{d|q}\t{$src, $dst|$dst, $src}",
                        [], IIC_SSE_MOVDQ>, Sched<[WriteLoad]>;
// GR64 -> FR64 bitcast.
let isCodeGenOnly = 1 in
def MOV64toSDrr : RS2I<0x6E, MRMSrcReg, (outs FR64:$dst), (ins GR64:$src),
                       "mov{d|q}\t{$src, $dst|$dst, $src}",
                       [(set FR64:$dst, (bitconvert GR64:$src))],
                       IIC_SSE_MOVDQ>, Sched<[WriteMove]>;
4736
4737//===---------------------------------------------------------------------===//
4738// Move Int Doubleword to Single Scalar
4739//
// GR32 / i32 load -> FR32 bitcasts using the movd encoding; codegen-only
// (the assembler/disassembler use the XMM forms above).
let isCodeGenOnly = 1 in {
  def VMOVDI2SSrr  : VS2I<0x6E, MRMSrcReg, (outs FR32:$dst), (ins GR32:$src),
                        "movd\t{$src, $dst|$dst, $src}",
                        [(set FR32:$dst, (bitconvert GR32:$src))],
                        IIC_SSE_MOVDQ>, VEX, Sched<[WriteMove]>;

  def VMOVDI2SSrm  : VS2I<0x6E, MRMSrcMem, (outs FR32:$dst), (ins i32mem:$src),
                        "movd\t{$src, $dst|$dst, $src}",
                        [(set FR32:$dst, (bitconvert (loadi32 addr:$src)))],
                        IIC_SSE_MOVDQ>,
                        VEX, Sched<[WriteLoad]>;
  def MOVDI2SSrr  : S2I<0x6E, MRMSrcReg, (outs FR32:$dst), (ins GR32:$src),
                        "movd\t{$src, $dst|$dst, $src}",
                        [(set FR32:$dst, (bitconvert GR32:$src))],
                        IIC_SSE_MOVDQ>, Sched<[WriteMove]>;

  def MOVDI2SSrm  : S2I<0x6E, MRMSrcMem, (outs FR32:$dst), (ins i32mem:$src),
                        "movd\t{$src, $dst|$dst, $src}",
                        [(set FR32:$dst, (bitconvert (loadi32 addr:$src)))],
                        IIC_SSE_MOVDQ>, Sched<[WriteLoad]>;
}
4761
4762//===---------------------------------------------------------------------===//
4763// Move Packed Doubleword Int to Packed Double Int
4764//
// Extract the low dword of an XMM register to a GPR or store it to memory.
def VMOVPDI2DIrr  : VS2I<0x7E, MRMDestReg, (outs GR32:$dst), (ins VR128:$src),
                       "movd\t{$src, $dst|$dst, $src}",
                       [(set GR32:$dst, (vector_extract (v4i32 VR128:$src),
                                        (iPTR 0)))], IIC_SSE_MOVD_ToGP>, VEX,
                    Sched<[WriteMove]>;
// Store form: the memory operand is an input of this MRMDestMem store.
def VMOVPDI2DImr  : VS2I<0x7E, MRMDestMem, (outs),
                       (ins i32mem:$dst, VR128:$src),
                       "movd\t{$src, $dst|$dst, $src}",
                       [(store (i32 (vector_extract (v4i32 VR128:$src),
                                     (iPTR 0))), addr:$dst)], IIC_SSE_MOVDQ>,
                                     VEX, Sched<[WriteStore]>;
def MOVPDI2DIrr  : S2I<0x7E, MRMDestReg, (outs GR32:$dst), (ins VR128:$src),
                       "movd\t{$src, $dst|$dst, $src}",
                       [(set GR32:$dst, (vector_extract (v4i32 VR128:$src),
                                        (iPTR 0)))], IIC_SSE_MOVD_ToGP>,
                   Sched<[WriteMove]>;
def MOVPDI2DImr  : S2I<0x7E, MRMDestMem, (outs), (ins i32mem:$dst, VR128:$src),
                       "movd\t{$src, $dst|$dst, $src}",
                       [(store (i32 (vector_extract (v4i32 VR128:$src),
                                     (iPTR 0))), addr:$dst)],
                                     IIC_SSE_MOVDQ>, Sched<[WriteStore]>;
4786
// Insert a GPR into element 0 of a zero or undef 256-bit vector using the
// 128-bit movd/movq (upper lanes are zeroed by the VEX encoding).
def : Pat<(v8i32 (X86Vinsert (v8i32 immAllZerosV), GR32:$src2, (iPTR 0))),
        (SUBREG_TO_REG (i32 0), (VMOVDI2PDIrr GR32:$src2), sub_xmm)>;

def : Pat<(v4i64 (X86Vinsert (bc_v4i64 (v8i32 immAllZerosV)), GR64:$src2, (iPTR 0))),
        (SUBREG_TO_REG (i32 0), (VMOV64toPQIrr GR64:$src2), sub_xmm)>;

def : Pat<(v8i32 (X86Vinsert undef, GR32:$src2, (iPTR 0))),
        (SUBREG_TO_REG (i32 0), (VMOVDI2PDIrr GR32:$src2), sub_xmm)>;

def : Pat<(v4i64 (X86Vinsert undef, GR64:$src2, (iPTR 0))),
        (SUBREG_TO_REG (i32 0), (VMOV64toPQIrr GR64:$src2), sub_xmm)>;
4798
4799//===---------------------------------------------------------------------===//
4800// Move Packed Doubleword Int first element to Doubleword Int
4801//
// Extract the low qword of an XMM register to a 64-bit GPR.
let SchedRW = [WriteMove] in {
def VMOVPQIto64rr : VRS2I<0x7E, MRMDestReg, (outs GR64:$dst), (ins VR128:$src),
                          "movq\t{$src, $dst|$dst, $src}",
                          [(set GR64:$dst, (vector_extract (v2i64 VR128:$src),
                                                           (iPTR 0)))],
                                                           IIC_SSE_MOVD_ToGP>,
                      VEX;

def MOVPQIto64rr : RS2I<0x7E, MRMDestReg, (outs GR64:$dst), (ins VR128:$src),
                        "mov{d|q}\t{$src, $dst|$dst, $src}",
                        [(set GR64:$dst, (vector_extract (v2i64 VR128:$src),
                                                         (iPTR 0)))],
                                                         IIC_SSE_MOVD_ToGP>;
} //SchedRW
4816
// Disassembly-only store of the low qword of an XMM register.  These are
// stores (MRMDestMem, mayStore): the memory operand's address registers are
// read, so it belongs in (ins), matching VMOVPDI2DImr/MOVPDI2DImr above.
// Previously i64mem:$dst was incorrectly declared in (outs).
let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0, mayStore = 1 in
def VMOVPQIto64rm : VRS2I<0x7E, MRMDestMem, (outs),
                          (ins i64mem:$dst, VR128:$src),
                          "movq\t{$src, $dst|$dst, $src}",
                          [], IIC_SSE_MOVDQ>, VEX, Sched<[WriteStore]>;
let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0, mayStore = 1 in
def MOVPQIto64rm : RS2I<0x7E, MRMDestMem, (outs), (ins i64mem:$dst, VR128:$src),
                        "mov{d|q}\t{$src, $dst|$dst, $src}",
                        [], IIC_SSE_MOVDQ>, Sched<[WriteStore]>;
4825
4826//===---------------------------------------------------------------------===//
4827// Bitcast FR64 <-> GR64
4828//
// FR64 <-> GR64/i64 bitcasts via the movq encodings; codegen-only.
let isCodeGenOnly = 1 in {
  let Predicates = [UseAVX] in
  def VMOV64toSDrm : VS2SI<0x7E, MRMSrcMem, (outs FR64:$dst), (ins i64mem:$src),
                          "movq\t{$src, $dst|$dst, $src}",
                          [(set FR64:$dst, (bitconvert (loadi64 addr:$src)))]>,
                          VEX, Sched<[WriteLoad]>;
  def VMOVSDto64rr : VRS2I<0x7E, MRMDestReg, (outs GR64:$dst), (ins FR64:$src),
                           "movq\t{$src, $dst|$dst, $src}",
                           [(set GR64:$dst, (bitconvert FR64:$src))],
                           IIC_SSE_MOVDQ>, VEX, Sched<[WriteMove]>;
  def VMOVSDto64mr : VRS2I<0x7E, MRMDestMem, (outs), (ins i64mem:$dst, FR64:$src),
                           "movq\t{$src, $dst|$dst, $src}",
                           [(store (i64 (bitconvert FR64:$src)), addr:$dst)],
                           IIC_SSE_MOVDQ>, VEX, Sched<[WriteStore]>;

  // Legacy SSE2 counterparts.
  def MOV64toSDrm : S2SI<0x7E, MRMSrcMem, (outs FR64:$dst), (ins i64mem:$src),
                         "movq\t{$src, $dst|$dst, $src}",
                         [(set FR64:$dst, (bitconvert (loadi64 addr:$src)))],
                         IIC_SSE_MOVDQ>, Sched<[WriteLoad]>;
  def MOVSDto64rr : RS2I<0x7E, MRMDestReg, (outs GR64:$dst), (ins FR64:$src),
                         "mov{d|q}\t{$src, $dst|$dst, $src}",
                         [(set GR64:$dst, (bitconvert FR64:$src))],
                         IIC_SSE_MOVD_ToGP>, Sched<[WriteMove]>;
  def MOVSDto64mr : RS2I<0x7E, MRMDestMem, (outs), (ins i64mem:$dst, FR64:$src),
                         "movq\t{$src, $dst|$dst, $src}",
                         [(store (i64 (bitconvert FR64:$src)), addr:$dst)],
                         IIC_SSE_MOVDQ>, Sched<[WriteStore]>;
}
4857
4858//===---------------------------------------------------------------------===//
4859// Move Scalar Single to Double Int
4860//
// FR32 -> GR32/i32 bitcasts via the movd encodings; codegen-only.
let isCodeGenOnly = 1 in {
  def VMOVSS2DIrr  : VS2I<0x7E, MRMDestReg, (outs GR32:$dst), (ins FR32:$src),
                        "movd\t{$src, $dst|$dst, $src}",
                        [(set GR32:$dst, (bitconvert FR32:$src))],
                        IIC_SSE_MOVD_ToGP>, VEX, Sched<[WriteMove]>;
  def VMOVSS2DImr  : VS2I<0x7E, MRMDestMem, (outs), (ins i32mem:$dst, FR32:$src),
                        "movd\t{$src, $dst|$dst, $src}",
                        [(store (i32 (bitconvert FR32:$src)), addr:$dst)],
                        IIC_SSE_MOVDQ>, VEX, Sched<[WriteStore]>;
  def MOVSS2DIrr  : S2I<0x7E, MRMDestReg, (outs GR32:$dst), (ins FR32:$src),
                        "movd\t{$src, $dst|$dst, $src}",
                        [(set GR32:$dst, (bitconvert FR32:$src))],
                        IIC_SSE_MOVD_ToGP>, Sched<[WriteMove]>;
  def MOVSS2DImr  : S2I<0x7E, MRMDestMem, (outs), (ins i32mem:$dst, FR32:$src),
                        "movd\t{$src, $dst|$dst, $src}",
                        [(store (i32 (bitconvert FR32:$src)), addr:$dst)],
                        IIC_SSE_MOVDQ>, Sched<[WriteStore]>;
}
4879
4880//===---------------------------------------------------------------------===//
4881// Patterns and instructions to describe movd/movq to XMM register zero-extends
4882//
// Move a 64-bit GPR into the low qword of an XMM register, zeroing the
// upper qword (X86vzmovl of scalar_to_vector); codegen-only.
let isCodeGenOnly = 1, SchedRW = [WriteMove] in {
let AddedComplexity = 15 in {
def VMOVZQI2PQIrr : VS2I<0x6E, MRMSrcReg, (outs VR128:$dst), (ins GR64:$src),
                       "movq\t{$src, $dst|$dst, $src}", // X86-64 only
                       [(set VR128:$dst, (v2i64 (X86vzmovl
                                      (v2i64 (scalar_to_vector GR64:$src)))))],
                                      IIC_SSE_MOVDQ>,
                                      VEX, VEX_W;
def MOVZQI2PQIrr : RS2I<0x6E, MRMSrcReg, (outs VR128:$dst), (ins GR64:$src),
                       "mov{d|q}\t{$src, $dst|$dst, $src}", // X86-64 only
                       [(set VR128:$dst, (v2i64 (X86vzmovl
                                      (v2i64 (scalar_to_vector GR64:$src)))))],
                                      IIC_SSE_MOVDQ>;
}
} // isCodeGenOnly, SchedRW
4898
// Zero-extending movd/movq patterns for the VEX-encoded forms.
let Predicates = [UseAVX] in {
  let AddedComplexity = 15 in
    def : Pat<(v4i32 (X86vzmovl (v4i32 (scalar_to_vector GR32:$src)))),
              (VMOVDI2PDIrr GR32:$src)>;

  // AVX 128-bit movd/movq instructions write zeros in the high 128-bit part.
  // These instructions also write zeros in the high part of a 256-bit register.
  let AddedComplexity = 20 in {
    def : Pat<(v4i32 (X86vzmovl (v4i32 (scalar_to_vector (loadi32 addr:$src))))),
              (VMOVDI2PDIrm addr:$src)>;
    def : Pat<(v4i32 (X86vzmovl (bc_v4i32 (loadv4f32 addr:$src)))),
              (VMOVDI2PDIrm addr:$src)>;
    def : Pat<(v4i32 (X86vzmovl (bc_v4i32 (loadv2i64 addr:$src)))),
              (VMOVDI2PDIrm addr:$src)>;
    def : Pat<(v8i32 (X86vzmovl (insert_subvector undef,
                (v4i32 (scalar_to_vector (loadi32 addr:$src))), (iPTR 0)))),
              (SUBREG_TO_REG (i32 0), (VMOVDI2PDIrm addr:$src), sub_xmm)>;
  }
  // Use regular 128-bit instructions to match 256-bit scalar_to_vec+zext.
  def : Pat<(v8i32 (X86vzmovl (insert_subvector undef,
                               (v4i32 (scalar_to_vector GR32:$src)),(iPTR 0)))),
            (SUBREG_TO_REG (i32 0), (VMOVDI2PDIrr GR32:$src), sub_xmm)>;
  def : Pat<(v4i64 (X86vzmovl (insert_subvector undef,
                               (v2i64 (scalar_to_vector GR64:$src)),(iPTR 0)))),
            (SUBREG_TO_REG (i64 0), (VMOVZQI2PQIrr GR64:$src), sub_xmm)>;
}
4925
// SSE2 counterparts of the zero-extending movd patterns above.
let Predicates = [UseSSE2] in {
  let AddedComplexity = 15 in
    def : Pat<(v4i32 (X86vzmovl (v4i32 (scalar_to_vector GR32:$src)))),
              (MOVDI2PDIrr GR32:$src)>;

  let AddedComplexity = 20 in {
    def : Pat<(v4i32 (X86vzmovl (v4i32 (scalar_to_vector (loadi32 addr:$src))))),
              (MOVDI2PDIrm addr:$src)>;
    def : Pat<(v4i32 (X86vzmovl (bc_v4i32 (loadv4f32 addr:$src)))),
              (MOVDI2PDIrm addr:$src)>;
    def : Pat<(v4i32 (X86vzmovl (bc_v4i32 (loadv2i64 addr:$src)))),
              (MOVDI2PDIrm addr:$src)>;
  }
}
4940
// These are the correct encodings of the instructions so that we know how to
// read correct assembly, even though we continue to emit the wrong ones for
// compatibility with Darwin's buggy assembler.
// The trailing 0 marks the alias as parse-only (not used for printing).
def : InstAlias<"movq\t{$src, $dst|$dst, $src}",
                (MOV64toPQIrr VR128:$dst, GR64:$src), 0>;
def : InstAlias<"movq\t{$src, $dst|$dst, $src}",
                (MOVPQIto64rr GR64:$dst, VR128:$src), 0>;
// Allow "vmovd" but print "vmovq" since we don't need compatibility for AVX.
def : InstAlias<"vmovd\t{$src, $dst|$dst, $src}",
                (VMOV64toPQIrr VR128:$dst, GR64:$src), 0>;
def : InstAlias<"vmovd\t{$src, $dst|$dst, $src}",
                (VMOVPQIto64rr GR64:$dst, VR128:$src), 0>;
4953
4954//===---------------------------------------------------------------------===//
4955// SSE2 - Move Quadword
4956//===---------------------------------------------------------------------===//
4957
4958//===---------------------------------------------------------------------===//
4959// Move Quadword Int to Packed Quadword Int
4960//
4961
// Load a 64-bit integer from memory into the low qword of an XMM register.
let ExeDomain = SSEPackedInt, SchedRW = [WriteLoad] in {
def VMOVQI2PQIrm : I<0x7E, MRMSrcMem, (outs VR128:$dst), (ins i64mem:$src),
                    "vmovq\t{$src, $dst|$dst, $src}",
                    [(set VR128:$dst,
                      (v2i64 (scalar_to_vector (loadi64 addr:$src))))]>, XS,
                    VEX, Requires<[UseAVX]>;
def MOVQI2PQIrm : I<0x7E, MRMSrcMem, (outs VR128:$dst), (ins i64mem:$src),
                    "movq\t{$src, $dst|$dst, $src}",
                    [(set VR128:$dst,
                      (v2i64 (scalar_to_vector (loadi64 addr:$src))))],
                      IIC_SSE_MOVDQ>, XS,
                    Requires<[UseSSE2]>; // SSE2 instruction with XS Prefix
} // ExeDomain, SchedRW
4975
4976//===---------------------------------------------------------------------===//
4977// Move Packed Quadword Int to Quadword Int
4978//
// Store the low qword (element 0) of an XMM register to memory (movq).
let ExeDomain = SSEPackedInt, SchedRW = [WriteStore] in {
def VMOVPQI2QImr : VS2I<0xD6, MRMDestMem, (outs), (ins i64mem:$dst, VR128:$src),
                      "movq\t{$src, $dst|$dst, $src}",
                      [(store (i64 (vector_extract (v2i64 VR128:$src),
                                    (iPTR 0))), addr:$dst)],
                                    IIC_SSE_MOVDQ>, VEX;
def MOVPQI2QImr : S2I<0xD6, MRMDestMem, (outs), (ins i64mem:$dst, VR128:$src),
                      "movq\t{$src, $dst|$dst, $src}",
                      [(store (i64 (vector_extract (v2i64 VR128:$src),
                                    (iPTR 0))), addr:$dst)],
                                    IIC_SSE_MOVDQ>;
} // ExeDomain, SchedRW
4991
// For disassembler only
// Register-to-register encodings of movq 0xD6; no selection patterns.
let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0,
    SchedRW = [WriteVecLogic] in {
def VMOVPQI2QIrr : VS2I<0xD6, MRMDestReg, (outs VR128:$dst), (ins VR128:$src),
                     "movq\t{$src, $dst|$dst, $src}", [], IIC_SSE_MOVQ_RR>, VEX;
def MOVPQI2QIrr : S2I<0xD6, MRMDestReg, (outs VR128:$dst), (ins VR128:$src),
                      "movq\t{$src, $dst|$dst, $src}", [], IIC_SSE_MOVQ_RR>;
}
5000
5001//===---------------------------------------------------------------------===//
5002// Store / copy lower 64-bits of a XMM register.
5003//
// Lower the storel_dq intrinsic onto the low-qword store instructions above.
let Predicates = [UseAVX] in
def : Pat<(int_x86_sse2_storel_dq addr:$dst, VR128:$src),
          (VMOVPQI2QImr addr:$dst, VR128:$src)>;
let Predicates = [UseSSE2] in
def : Pat<(int_x86_sse2_storel_dq addr:$dst, VR128:$src),
          (MOVPQI2QImr addr:$dst, VR128:$src)>;
5010
// Load a qword from memory into the low half of an XMM register and zero the
// upper half (X86vzmovl of a scalar_to_vector load).
let ExeDomain = SSEPackedInt, isCodeGenOnly = 1, AddedComplexity = 20 in {
def VMOVZQI2PQIrm : I<0x7E, MRMSrcMem, (outs VR128:$dst), (ins i64mem:$src),
                     "vmovq\t{$src, $dst|$dst, $src}",
                     [(set VR128:$dst,
                       (v2i64 (X86vzmovl (v2i64 (scalar_to_vector
                                                 (loadi64 addr:$src))))))],
                                                 IIC_SSE_MOVDQ>,
                     XS, VEX, Requires<[UseAVX]>, Sched<[WriteLoad]>;

def MOVZQI2PQIrm : I<0x7E, MRMSrcMem, (outs VR128:$dst), (ins i64mem:$src),
                     "movq\t{$src, $dst|$dst, $src}",
                     [(set VR128:$dst,
                       (v2i64 (X86vzmovl (v2i64 (scalar_to_vector
                                                 (loadi64 addr:$src))))))],
                                                 IIC_SSE_MOVDQ>,
                     XS, Requires<[UseSSE2]>, Sched<[WriteLoad]>;
} // ExeDomain, isCodeGenOnly, AddedComplexity
5028
// Zero-extending qword-load selection patterns for AVX and SSE2, plus
// 256-bit vzload lowered through 128-bit moves (upper lanes are zeroed).
let Predicates = [UseAVX], AddedComplexity = 20 in {
  def : Pat<(v2i64 (X86vzmovl (bc_v2i64 (loadv4f32 addr:$src)))),
            (VMOVZQI2PQIrm addr:$src)>;
  def : Pat<(v2i64 (X86vzload addr:$src)),
            (VMOVZQI2PQIrm addr:$src)>;
  def : Pat<(v4i64 (X86vzmovl (insert_subvector undef,
              (v2i64 (scalar_to_vector (loadi64 addr:$src))), (iPTR 0)))),
            (SUBREG_TO_REG (i64 0), (VMOVZQI2PQIrm addr:$src), sub_xmm)>;
}

let Predicates = [UseSSE2], AddedComplexity = 20 in {
  def : Pat<(v2i64 (X86vzmovl (bc_v2i64 (loadv4f32 addr:$src)))),
            (MOVZQI2PQIrm addr:$src)>;
  def : Pat<(v2i64 (X86vzload addr:$src)), (MOVZQI2PQIrm addr:$src)>;
}

let Predicates = [HasAVX] in {
def : Pat<(v4i64 (alignedX86vzload addr:$src)),
          (SUBREG_TO_REG (i32 0), (VMOVAPSrm addr:$src), sub_xmm)>;
def : Pat<(v4i64 (X86vzload addr:$src)),
          (SUBREG_TO_REG (i32 0), (VMOVUPSrm addr:$src), sub_xmm)>;
}
5051
5052//===---------------------------------------------------------------------===//
// Moving from XMM to XMM, clearing the upper 64 bits. Note that there is a
// bug in the IA32 documentation: movq xmm1, xmm2 does clear the high bits.
5055//
// Copy the low qword between XMM registers and zero the upper qword.
let ExeDomain = SSEPackedInt, SchedRW = [WriteVecLogic] in {
let AddedComplexity = 15 in
def VMOVZPQILo2PQIrr : I<0x7E, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
                        "vmovq\t{$src, $dst|$dst, $src}",
                    [(set VR128:$dst, (v2i64 (X86vzmovl (v2i64 VR128:$src))))],
                    IIC_SSE_MOVQ_RR>,
                      XS, VEX, Requires<[UseAVX]>;
let AddedComplexity = 15 in
def MOVZPQILo2PQIrr : I<0x7E, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
                        "movq\t{$src, $dst|$dst, $src}",
                    [(set VR128:$dst, (v2i64 (X86vzmovl (v2i64 VR128:$src))))],
                    IIC_SSE_MOVQ_RR>,
                      XS, Requires<[UseSSE2]>;
} // ExeDomain, SchedRW
5070
// Memory form: load a 128-bit vector and keep only the low qword, zeroing
// the upper qword (X86vzmovl of a full-vector load).
let ExeDomain = SSEPackedInt, isCodeGenOnly = 1, SchedRW = [WriteVecLogicLd] in {
let AddedComplexity = 20 in
def VMOVZPQILo2PQIrm : I<0x7E, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src),
                        "vmovq\t{$src, $dst|$dst, $src}",
                    [(set VR128:$dst, (v2i64 (X86vzmovl
                                             (loadv2i64 addr:$src))))],
                                             IIC_SSE_MOVDQ>,
                      XS, VEX, Requires<[UseAVX]>;
let AddedComplexity = 20 in {
def MOVZPQILo2PQIrm : I<0x7E, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src),
                        "movq\t{$src, $dst|$dst, $src}",
                    [(set VR128:$dst, (v2i64 (X86vzmovl
                                             (loadv2i64 addr:$src))))],
                                             IIC_SSE_MOVDQ>,
                      XS, Requires<[UseSSE2]>;
}
} // ExeDomain, isCodeGenOnly, SchedRW
5088
// v2f64 zero-extending moves reuse the integer movq zero-extend above.
let AddedComplexity = 20 in {
  let Predicates = [UseAVX] in {
    def : Pat<(v2f64 (X86vzmovl (v2f64 VR128:$src))),
              (VMOVZPQILo2PQIrr VR128:$src)>;
  }
  let Predicates = [UseSSE2] in {
    def : Pat<(v2f64 (X86vzmovl (v2f64 VR128:$src))),
              (MOVZPQILo2PQIrr VR128:$src)>;
  }
}
5099
5100//===---------------------------------------------------------------------===//
5101// SSE3 - Replicate Single FP - MOVSHDUP and MOVSLDUP
5102//===---------------------------------------------------------------------===//
// Register and memory forms for MOVSHDUP/MOVSLDUP-style single-FP replicate
// instructions; parameterized over vector type, register class and load frag.
multiclass sse3_replicate_sfp<bits<8> op, SDNode OpNode, string OpcodeStr,
                              ValueType vt, RegisterClass RC, PatFrag mem_frag,
                              X86MemOperand x86memop> {
def rr : S3SI<op, MRMSrcReg, (outs RC:$dst), (ins RC:$src),
                    !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
                      [(set RC:$dst, (vt (OpNode RC:$src)))],
                      IIC_SSE_MOV_LH>, Sched<[WriteFShuffle]>;
def rm : S3SI<op, MRMSrcMem, (outs RC:$dst), (ins x86memop:$src),
                    !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
                      [(set RC:$dst, (OpNode (mem_frag addr:$src)))],
                      IIC_SSE_MOV_LH>, Sched<[WriteLoad]>;
}
5115
// Instantiate MOVSHDUP/MOVSLDUP for AVX (128- and 256-bit) and SSE3.
let Predicates = [HasAVX] in {
  defm VMOVSHDUP  : sse3_replicate_sfp<0x16, X86Movshdup, "vmovshdup",
                                       v4f32, VR128, loadv4f32, f128mem>, VEX;
  defm VMOVSLDUP  : sse3_replicate_sfp<0x12, X86Movsldup, "vmovsldup",
                                       v4f32, VR128, loadv4f32, f128mem>, VEX;
  defm VMOVSHDUPY : sse3_replicate_sfp<0x16, X86Movshdup, "vmovshdup",
                                 v8f32, VR256, loadv8f32, f256mem>, VEX, VEX_L;
  defm VMOVSLDUPY : sse3_replicate_sfp<0x12, X86Movsldup, "vmovsldup",
                                 v8f32, VR256, loadv8f32, f256mem>, VEX, VEX_L;
}
defm MOVSHDUP : sse3_replicate_sfp<0x16, X86Movshdup, "movshdup", v4f32, VR128,
                                   memopv4f32, f128mem>;
defm MOVSLDUP : sse3_replicate_sfp<0x12, X86Movsldup, "movsldup", v4f32, VR128,
                                   memopv4f32, f128mem>;
5130
// Integer-typed movshdup/movsldup shuffles reuse the FP instructions.
let Predicates = [HasAVX] in {
  def : Pat<(v4i32 (X86Movshdup VR128:$src)),
            (VMOVSHDUPrr VR128:$src)>;
  def : Pat<(v4i32 (X86Movshdup (bc_v4i32 (loadv2i64 addr:$src)))),
            (VMOVSHDUPrm addr:$src)>;
  def : Pat<(v4i32 (X86Movsldup VR128:$src)),
            (VMOVSLDUPrr VR128:$src)>;
  def : Pat<(v4i32 (X86Movsldup (bc_v4i32 (loadv2i64 addr:$src)))),
            (VMOVSLDUPrm addr:$src)>;
  def : Pat<(v8i32 (X86Movshdup VR256:$src)),
            (VMOVSHDUPYrr VR256:$src)>;
  def : Pat<(v8i32 (X86Movshdup (bc_v8i32 (loadv4i64 addr:$src)))),
            (VMOVSHDUPYrm addr:$src)>;
  def : Pat<(v8i32 (X86Movsldup VR256:$src)),
            (VMOVSLDUPYrr VR256:$src)>;
  def : Pat<(v8i32 (X86Movsldup (bc_v8i32 (loadv4i64 addr:$src)))),
            (VMOVSLDUPYrm addr:$src)>;
}

let Predicates = [UseSSE3] in {
  def : Pat<(v4i32 (X86Movshdup VR128:$src)),
            (MOVSHDUPrr VR128:$src)>;
  def : Pat<(v4i32 (X86Movshdup (bc_v4i32 (memopv2i64 addr:$src)))),
            (MOVSHDUPrm addr:$src)>;
  def : Pat<(v4i32 (X86Movsldup VR128:$src)),
            (MOVSLDUPrr VR128:$src)>;
  def : Pat<(v4i32 (X86Movsldup (bc_v4i32 (memopv2i64 addr:$src)))),
            (MOVSLDUPrm addr:$src)>;
}
5160
5161//===---------------------------------------------------------------------===//
5162// SSE3 - Replicate Double FP - MOVDDUP
5163//===---------------------------------------------------------------------===//
5164
// 128-bit MOVDDUP: duplicate the low double; the memory form loads only a
// scalar f64 (f64mem) and broadcasts it.
multiclass sse3_replicate_dfp<string OpcodeStr> {
def rr  : S3DI<0x12, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
                    !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
                    [(set VR128:$dst, (v2f64 (X86Movddup VR128:$src)))],
                    IIC_SSE_MOV_LH>, Sched<[WriteFShuffle]>;
def rm  : S3DI<0x12, MRMSrcMem, (outs VR128:$dst), (ins f64mem:$src),
                    !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
                    [(set VR128:$dst,
                      (v2f64 (X86Movddup
                              (scalar_to_vector (loadf64 addr:$src)))))],
                              IIC_SSE_MOV_LH>, Sched<[WriteLoad]>;
}
5177
// FIXME: Merge with the above class when there are patterns for the ymm version
// 256-bit MOVDDUP variant (VR256 / f256mem operands).
multiclass sse3_replicate_dfp_y<string OpcodeStr> {
def rr  : S3DI<0x12, MRMSrcReg, (outs VR256:$dst), (ins VR256:$src),
                    !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
                    [(set VR256:$dst, (v4f64 (X86Movddup VR256:$src)))]>,
                    Sched<[WriteFShuffle]>;
def rm  : S3DI<0x12, MRMSrcMem, (outs VR256:$dst), (ins f256mem:$src),
                    !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
                    [(set VR256:$dst,
                      (v4f64 (X86Movddup
                              (scalar_to_vector (loadf64 addr:$src)))))]>,
                    Sched<[WriteLoad]>;
}
5191
// Instantiate MOVDDUP for AVX (128/256-bit) and SSE3.
let Predicates = [HasAVX] in {
  defm VMOVDDUP  : sse3_replicate_dfp<"vmovddup">, VEX;
  defm VMOVDDUPY : sse3_replicate_dfp_y<"vmovddup">, VEX, VEX_L;
}

defm MOVDDUP : sse3_replicate_dfp<"movddup">;
5198
// AVX MOVDDUP selection patterns (128- and 256-bit), including bitcast and
// scalar-load forms. The per-pattern Requires<[HasAVX]> annotations that the
// 128-bit patterns used to carry were redundant — the enclosing
// "let Predicates = [HasAVX]" already supplies the same predicate — so they
// have been dropped; instruction selection is unchanged.
let Predicates = [HasAVX] in {
  def : Pat<(X86Movddup (loadv2f64 addr:$src)),
            (VMOVDDUPrm addr:$src)>;
  def : Pat<(X86Movddup (bc_v2f64 (loadv4f32 addr:$src))),
            (VMOVDDUPrm addr:$src)>;
  def : Pat<(X86Movddup (bc_v2f64 (loadv2i64 addr:$src))),
            (VMOVDDUPrm addr:$src)>;
  def : Pat<(X86Movddup (bc_v2f64
                             (v2i64 (scalar_to_vector (loadi64 addr:$src))))),
            (VMOVDDUPrm addr:$src)>;

  // 256-bit version
  def : Pat<(X86Movddup (loadv4f64 addr:$src)),
            (VMOVDDUPYrm addr:$src)>;
  def : Pat<(X86Movddup (loadv4i64 addr:$src)),
            (VMOVDDUPYrm addr:$src)>;
  def : Pat<(X86Movddup (v4i64 (scalar_to_vector (loadi64 addr:$src)))),
            (VMOVDDUPYrm addr:$src)>;
  def : Pat<(X86Movddup (v4i64 VR256:$src)),
            (VMOVDDUPYrr VR256:$src)>;
}
5220
// When optimizing for size, implement 64-bit element broadcasts with the
// shorter VMOVDDUP load form.
let Predicates = [UseAVX, OptForSize] in {
  def : Pat<(v2f64 (X86VBroadcast (loadf64 addr:$src))),
  (VMOVDDUPrm addr:$src)>;
  def : Pat<(v2i64 (X86VBroadcast (loadi64 addr:$src))),
  (VMOVDDUPrm addr:$src)>;
}
5227
// SSE3 MOVDDUP selection patterns (memop requires alignment pre-AVX).
let Predicates = [UseSSE3] in {
  def : Pat<(X86Movddup (memopv2f64 addr:$src)),
            (MOVDDUPrm addr:$src)>;
  def : Pat<(X86Movddup (bc_v2f64 (memopv4f32 addr:$src))),
            (MOVDDUPrm addr:$src)>;
  def : Pat<(X86Movddup (bc_v2f64 (memopv2i64 addr:$src))),
            (MOVDDUPrm addr:$src)>;
  def : Pat<(X86Movddup (bc_v2f64
                             (v2i64 (scalar_to_vector (loadi64 addr:$src))))),
            (MOVDDUPrm addr:$src)>;
}
5239
5240//===---------------------------------------------------------------------===//
5241// SSE3 - Move Unaligned Integer
5242//===---------------------------------------------------------------------===//
5243
// LDDQU: unaligned integer loads, selected only through the intrinsic.
let SchedRW = [WriteLoad] in {
let Predicates = [HasAVX] in {
  def VLDDQUrm : S3DI<0xF0, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src),
                   "vlddqu\t{$src, $dst|$dst, $src}",
                   [(set VR128:$dst, (int_x86_sse3_ldu_dq addr:$src))]>, VEX;
  def VLDDQUYrm : S3DI<0xF0, MRMSrcMem, (outs VR256:$dst), (ins i256mem:$src),
                   "vlddqu\t{$src, $dst|$dst, $src}",
                   [(set VR256:$dst, (int_x86_avx_ldu_dq_256 addr:$src))]>,
                   VEX, VEX_L;
}
def LDDQUrm : S3DI<0xF0, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src),
                   "lddqu\t{$src, $dst|$dst, $src}",
                   [(set VR128:$dst, (int_x86_sse3_ldu_dq addr:$src))],
                   IIC_SSE_LDDQU>;
}
5259
5260//===---------------------------------------------------------------------===//
5261// SSE3 - Arithmetic
5262//===---------------------------------------------------------------------===//
5263
// ADDSUBPS/ADDSUBPD (intrinsic form). Is2Addr selects the two-operand SSE
// assembly syntax ($src1 tied to $dst) versus the three-operand AVX syntax.
multiclass sse3_addsub<Intrinsic Int, string OpcodeStr, RegisterClass RC,
                       X86MemOperand x86memop, OpndItins itins,
                       PatFrag ld_frag, bit Is2Addr = 1> {
  def rr : I<0xD0, MRMSrcReg,
       (outs RC:$dst), (ins RC:$src1, RC:$src2),
       !if(Is2Addr,
           !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
           !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
       [(set RC:$dst, (Int RC:$src1, RC:$src2))], itins.rr>,
       Sched<[itins.Sched]>;
  def rm : I<0xD0, MRMSrcMem,
       (outs RC:$dst), (ins RC:$src1, x86memop:$src2),
       !if(Is2Addr,
           !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
           !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
       // Folded-load form: use the memory itinerary (itins.rm). The original
       // mistakenly reused itins.rr here, unlike every other *_rm def in this
       // file.
       [(set RC:$dst, (Int RC:$src1, (ld_frag addr:$src2)))], itins.rm>,
       Sched<[itins.Sched.Folded, ReadAfterLd]>;
}
5282
// Instantiate ADDSUB for AVX (three-operand, 128/256-bit) and SSE3 (tied).
let Predicates = [HasAVX] in {
  let ExeDomain = SSEPackedSingle in {
    defm VADDSUBPS : sse3_addsub<int_x86_sse3_addsub_ps, "vaddsubps", VR128,
                               f128mem, SSE_ALU_F32P, loadv4f32, 0>, XD, VEX_4V;
    defm VADDSUBPSY : sse3_addsub<int_x86_avx_addsub_ps_256, "vaddsubps", VR256,
                        f256mem, SSE_ALU_F32P, loadv8f32, 0>, XD, VEX_4V, VEX_L;
  }
  let ExeDomain = SSEPackedDouble in {
    defm VADDSUBPD : sse3_addsub<int_x86_sse3_addsub_pd, "vaddsubpd", VR128,
                               f128mem, SSE_ALU_F64P, loadv2f64, 0>, PD, VEX_4V;
    defm VADDSUBPDY : sse3_addsub<int_x86_avx_addsub_pd_256, "vaddsubpd", VR256,
                        f256mem, SSE_ALU_F64P, loadv4f64, 0>, PD, VEX_4V, VEX_L;
  }
}
let Constraints = "$src1 = $dst", Predicates = [UseSSE3] in {
  let ExeDomain = SSEPackedSingle in
  defm ADDSUBPS : sse3_addsub<int_x86_sse3_addsub_ps, "addsubps", VR128,
                              f128mem, SSE_ALU_F32P, memopv4f32>, XD;
  let ExeDomain = SSEPackedDouble in
  defm ADDSUBPD : sse3_addsub<int_x86_sse3_addsub_pd, "addsubpd", VR128,
                              f128mem, SSE_ALU_F64P, memopv2f64>, PD;
}
5305
// Patterns used to select 'addsub' instructions.
// These map the X86Addsub DAG node (produced by shuffle/fadd-fsub combining)
// onto the intrinsic-defined instructions above.
let Predicates = [HasAVX] in {
  def : Pat<(v4f32 (X86Addsub (v4f32 VR128:$lhs), (v4f32 VR128:$rhs))),
            (VADDSUBPSrr VR128:$lhs, VR128:$rhs)>;
  def : Pat<(v4f32 (X86Addsub (v4f32 VR128:$lhs), (loadv4f32 addr:$rhs))),
            (VADDSUBPSrm VR128:$lhs, f128mem:$rhs)>;
  def : Pat<(v2f64 (X86Addsub (v2f64 VR128:$lhs), (v2f64 VR128:$rhs))),
            (VADDSUBPDrr VR128:$lhs, VR128:$rhs)>;
  def : Pat<(v2f64 (X86Addsub (v2f64 VR128:$lhs), (loadv2f64 addr:$rhs))),
            (VADDSUBPDrm VR128:$lhs, f128mem:$rhs)>;

  def : Pat<(v8f32 (X86Addsub (v8f32 VR256:$lhs), (v8f32 VR256:$rhs))),
            (VADDSUBPSYrr VR256:$lhs, VR256:$rhs)>;
  def : Pat<(v8f32 (X86Addsub (v8f32 VR256:$lhs), (loadv8f32 addr:$rhs))),
            (VADDSUBPSYrm VR256:$lhs, f256mem:$rhs)>;
  def : Pat<(v4f64 (X86Addsub (v4f64 VR256:$lhs), (v4f64 VR256:$rhs))),
            (VADDSUBPDYrr VR256:$lhs, VR256:$rhs)>;
  def : Pat<(v4f64 (X86Addsub (v4f64 VR256:$lhs), (loadv4f64 addr:$rhs))),
            (VADDSUBPDYrm VR256:$lhs, f256mem:$rhs)>;
}

let Predicates = [UseSSE3] in {
  def : Pat<(v4f32 (X86Addsub (v4f32 VR128:$lhs), (v4f32 VR128:$rhs))),
            (ADDSUBPSrr VR128:$lhs, VR128:$rhs)>;
  def : Pat<(v4f32 (X86Addsub (v4f32 VR128:$lhs), (memopv4f32 addr:$rhs))),
            (ADDSUBPSrm VR128:$lhs, f128mem:$rhs)>;
  def : Pat<(v2f64 (X86Addsub (v2f64 VR128:$lhs), (v2f64 VR128:$rhs))),
            (ADDSUBPDrr VR128:$lhs, VR128:$rhs)>;
  def : Pat<(v2f64 (X86Addsub (v2f64 VR128:$lhs), (memopv2f64 addr:$rhs))),
            (ADDSUBPDrm VR128:$lhs, f128mem:$rhs)>;
}
5337
5338//===---------------------------------------------------------------------===//
5339// SSE3 Instructions
5340//===---------------------------------------------------------------------===//
5341
5342// Horizontal ops
// Horizontal-op multiclass for single-precision (S3DI encoding); Is2Addr
// selects SSE (tied) versus AVX (three-operand) assembly syntax.
multiclass S3D_Int<bits<8> o, string OpcodeStr, ValueType vt, RegisterClass RC,
                   X86MemOperand x86memop, SDNode OpNode, PatFrag ld_frag,
                   bit Is2Addr = 1> {
  def rr : S3DI<o, MRMSrcReg, (outs RC:$dst), (ins RC:$src1, RC:$src2),
       !if(Is2Addr,
         !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
         !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
      [(set RC:$dst, (vt (OpNode RC:$src1, RC:$src2)))], IIC_SSE_HADDSUB_RR>,
      Sched<[WriteFAdd]>;

  def rm : S3DI<o, MRMSrcMem, (outs RC:$dst), (ins RC:$src1, x86memop:$src2),
       !if(Is2Addr,
         !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
         !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
      [(set RC:$dst, (vt (OpNode RC:$src1, (ld_frag addr:$src2))))],
        IIC_SSE_HADDSUB_RM>, Sched<[WriteFAddLd, ReadAfterLd]>;
}
// Same shape as S3D_Int but with the S3I (double-precision) encoding class.
multiclass S3_Int<bits<8> o, string OpcodeStr, ValueType vt, RegisterClass RC,
                  X86MemOperand x86memop, SDNode OpNode, PatFrag ld_frag,
                  bit Is2Addr = 1> {
  def rr : S3I<o, MRMSrcReg, (outs RC:$dst), (ins RC:$src1, RC:$src2),
       !if(Is2Addr,
         !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
         !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
      [(set RC:$dst, (vt (OpNode RC:$src1, RC:$src2)))], IIC_SSE_HADDSUB_RR>,
      Sched<[WriteFAdd]>;

  def rm : S3I<o, MRMSrcMem, (outs RC:$dst), (ins RC:$src1, x86memop:$src2),
       !if(Is2Addr,
         !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
         !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
      [(set RC:$dst, (vt (OpNode RC:$src1, (ld_frag addr:$src2))))],
        IIC_SSE_HADDSUB_RM>, Sched<[WriteFAddLd, ReadAfterLd]>;
}
5377
// Instantiate horizontal add/sub: AVX three-operand forms (128/256-bit),
// then the SSE3 tied-operand forms.
let Predicates = [HasAVX] in {
  let ExeDomain = SSEPackedSingle in {
    defm VHADDPS  : S3D_Int<0x7C, "vhaddps", v4f32, VR128, f128mem,
                            X86fhadd, loadv4f32, 0>, VEX_4V;
    defm VHSUBPS  : S3D_Int<0x7D, "vhsubps", v4f32, VR128, f128mem,
                            X86fhsub, loadv4f32, 0>, VEX_4V;
    defm VHADDPSY : S3D_Int<0x7C, "vhaddps", v8f32, VR256, f256mem,
                            X86fhadd, loadv8f32, 0>, VEX_4V, VEX_L;
    defm VHSUBPSY : S3D_Int<0x7D, "vhsubps", v8f32, VR256, f256mem,
                            X86fhsub, loadv8f32, 0>, VEX_4V, VEX_L;
  }
  let ExeDomain = SSEPackedDouble in {
    defm VHADDPD  : S3_Int <0x7C, "vhaddpd", v2f64, VR128, f128mem,
                            X86fhadd, loadv2f64, 0>, VEX_4V;
    defm VHSUBPD  : S3_Int <0x7D, "vhsubpd", v2f64, VR128, f128mem,
                            X86fhsub, loadv2f64, 0>, VEX_4V;
    defm VHADDPDY : S3_Int <0x7C, "vhaddpd", v4f64, VR256, f256mem,
                            X86fhadd, loadv4f64, 0>, VEX_4V, VEX_L;
    defm VHSUBPDY : S3_Int <0x7D, "vhsubpd", v4f64, VR256, f256mem,
                            X86fhsub, loadv4f64, 0>, VEX_4V, VEX_L;
  }
}

let Constraints = "$src1 = $dst" in {
  let ExeDomain = SSEPackedSingle in {
    defm HADDPS : S3D_Int<0x7C, "haddps", v4f32, VR128, f128mem, X86fhadd,
                          memopv4f32>;
    defm HSUBPS : S3D_Int<0x7D, "hsubps", v4f32, VR128, f128mem, X86fhsub,
                          memopv4f32>;
  }
  let ExeDomain = SSEPackedDouble in {
    defm HADDPD : S3_Int<0x7C, "haddpd", v2f64, VR128, f128mem, X86fhadd,
                         memopv2f64>;
    defm HSUBPD : S3_Int<0x7D, "hsubpd", v2f64, VR128, f128mem, X86fhsub,
                         memopv2f64>;
  }
}
5415
5416//===---------------------------------------------------------------------===//
5417// SSSE3 - Packed Absolute Instructions
5418//===---------------------------------------------------------------------===//
5419
5420
5421/// SS3I_unop_rm_int - Simple SSSE3 unary op whose type can be v*{i8,i16,i32}.
/// SS3I_unop_rm_int - Simple SSSE3 unary op whose type can be v*{i8,i16,i32}.
/// 128-bit register and folded-load forms, selected via the given intrinsic.
multiclass SS3I_unop_rm_int<bits<8> opc, string OpcodeStr, Intrinsic IntId128,
                            PatFrag ld_frag> {
  def rr128 : SS38I<opc, MRMSrcReg, (outs VR128:$dst),
                    (ins VR128:$src),
                    !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
                    [(set VR128:$dst, (IntId128 VR128:$src))], IIC_SSE_PABS_RR>,
                    Sched<[WriteVecALU]>;

  def rm128 : SS38I<opc, MRMSrcMem, (outs VR128:$dst),
                    (ins i128mem:$src),
                    !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
                    [(set VR128:$dst,
                      (IntId128
                       (bitconvert (ld_frag addr:$src))))], IIC_SSE_PABS_RM>,
                    Sched<[WriteVecALULd]>;
}
5438
5439/// SS3I_unop_rm_int_y - Simple SSSE3 unary op whose type can be v*{i8,i16,i32}.
/// SS3I_unop_rm_int_y - Simple SSSE3 unary op whose type can be v*{i8,i16,i32}.
/// 256-bit (AVX2) variant; the memory form always loads via loadv4i64.
multiclass SS3I_unop_rm_int_y<bits<8> opc, string OpcodeStr,
                              Intrinsic IntId256> {
  def rr256 : SS38I<opc, MRMSrcReg, (outs VR256:$dst),
                    (ins VR256:$src),
                    !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
                    [(set VR256:$dst, (IntId256 VR256:$src))]>,
                    Sched<[WriteVecALU]>;

  def rm256 : SS38I<opc, MRMSrcMem, (outs VR256:$dst),
                    (ins i256mem:$src),
                    !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
                    [(set VR256:$dst,
                      (IntId256
                       (bitconvert (loadv4i64 addr:$src))))]>,
                    Sched<[WriteVecALULd]>;
}
5456
5457// Helper fragments to match sext vXi1 to vXiY.
5458def v16i1sextv16i8 : PatLeaf<(v16i8 (X86pcmpgt (bc_v16i8 (v4i32 immAllZerosV)),
5459                                               VR128:$src))>;
5460def v8i1sextv8i16  : PatLeaf<(v8i16 (X86vsrai VR128:$src, (i8 15)))>;
5461def v4i1sextv4i32  : PatLeaf<(v4i32 (X86vsrai VR128:$src, (i8 31)))>;
5462def v32i1sextv32i8 : PatLeaf<(v32i8 (X86pcmpgt (bc_v32i8 (v8i32 immAllZerosV)),
5463                                               VR256:$src))>;
5464def v16i1sextv16i16: PatLeaf<(v16i16 (X86vsrai VR256:$src, (i8 15)))>;
5465def v8i1sextv8i32  : PatLeaf<(v8i32 (X86vsrai VR256:$src, (i8 31)))>;
5466
// PABS instantiation for AVX/AVX2/SSSE3, plus patterns matching the generic
// abs idiom (x ^ sext) ^ ... i.e. xor(sext(x), add(x, sext(x))) onto PABS*.
let Predicates = [HasAVX] in {
  defm VPABSB  : SS3I_unop_rm_int<0x1C, "vpabsb", int_x86_ssse3_pabs_b_128,
                                  loadv2i64>, VEX;
  defm VPABSW  : SS3I_unop_rm_int<0x1D, "vpabsw", int_x86_ssse3_pabs_w_128,
                                  loadv2i64>, VEX;
  defm VPABSD  : SS3I_unop_rm_int<0x1E, "vpabsd", int_x86_ssse3_pabs_d_128,
                                  loadv2i64>, VEX;

  def : Pat<(xor
            (bc_v2i64 (v16i1sextv16i8)),
            (bc_v2i64 (add (v16i8 VR128:$src), (v16i1sextv16i8)))),
            (VPABSBrr128 VR128:$src)>;
  def : Pat<(xor
            (bc_v2i64 (v8i1sextv8i16)),
            (bc_v2i64 (add (v8i16 VR128:$src), (v8i1sextv8i16)))),
            (VPABSWrr128 VR128:$src)>;
  def : Pat<(xor
            (bc_v2i64 (v4i1sextv4i32)),
            (bc_v2i64 (add (v4i32 VR128:$src), (v4i1sextv4i32)))),
            (VPABSDrr128 VR128:$src)>;
}

let Predicates = [HasAVX2] in {
  defm VPABSB  : SS3I_unop_rm_int_y<0x1C, "vpabsb",
                                    int_x86_avx2_pabs_b>, VEX, VEX_L;
  defm VPABSW  : SS3I_unop_rm_int_y<0x1D, "vpabsw",
                                    int_x86_avx2_pabs_w>, VEX, VEX_L;
  defm VPABSD  : SS3I_unop_rm_int_y<0x1E, "vpabsd",
                                    int_x86_avx2_pabs_d>, VEX, VEX_L;

  def : Pat<(xor
            (bc_v4i64 (v32i1sextv32i8)),
            (bc_v4i64 (add (v32i8 VR256:$src), (v32i1sextv32i8)))),
            (VPABSBrr256 VR256:$src)>;
  def : Pat<(xor
            (bc_v4i64 (v16i1sextv16i16)),
            (bc_v4i64 (add (v16i16 VR256:$src), (v16i1sextv16i16)))),
            (VPABSWrr256 VR256:$src)>;
  def : Pat<(xor
            (bc_v4i64 (v8i1sextv8i32)),
            (bc_v4i64 (add (v8i32 VR256:$src), (v8i1sextv8i32)))),
            (VPABSDrr256 VR256:$src)>;
}

defm PABSB : SS3I_unop_rm_int<0x1C, "pabsb", int_x86_ssse3_pabs_b_128,
                              memopv2i64>;
defm PABSW : SS3I_unop_rm_int<0x1D, "pabsw", int_x86_ssse3_pabs_w_128,
                              memopv2i64>;
defm PABSD : SS3I_unop_rm_int<0x1E, "pabsd", int_x86_ssse3_pabs_d_128,
                              memopv2i64>;

let Predicates = [HasSSSE3] in {
  def : Pat<(xor
            (bc_v2i64 (v16i1sextv16i8)),
            (bc_v2i64 (add (v16i8 VR128:$src), (v16i1sextv16i8)))),
            (PABSBrr128 VR128:$src)>;
  def : Pat<(xor
            (bc_v2i64 (v8i1sextv8i16)),
            (bc_v2i64 (add (v8i16 VR128:$src), (v8i1sextv8i16)))),
            (PABSWrr128 VR128:$src)>;
  def : Pat<(xor
            (bc_v2i64 (v4i1sextv4i32)),
            (bc_v2i64 (add (v4i32 VR128:$src), (v4i1sextv4i32)))),
            (PABSDrr128 VR128:$src)>;
}
5532
5533//===---------------------------------------------------------------------===//
5534// SSSE3 - Packed Binary Operator Instructions
5535//===---------------------------------------------------------------------===//
5536
// Itinerary bundles (rr/rm instruction itinerary class plus the scheduling
// model write resource) for the SSSE3 packed binary operators below.
let Sched = WriteVecALU in {
def SSE_PHADDSUBD : OpndItins<
  IIC_SSE_PHADDSUBD_RR, IIC_SSE_PHADDSUBD_RM
>;
def SSE_PHADDSUBSW : OpndItins<
  IIC_SSE_PHADDSUBSW_RR, IIC_SSE_PHADDSUBSW_RM
>;
def SSE_PHADDSUBW : OpndItins<
  IIC_SSE_PHADDSUBW_RR, IIC_SSE_PHADDSUBW_RM
>;
}
// pshufb is a shuffle, not an ALU op, for scheduling purposes.
let Sched = WriteShuffle in
def SSE_PSHUFB : OpndItins<
  IIC_SSE_PSHUFB_RR, IIC_SSE_PSHUFB_RM
>;
let Sched = WriteVecALU in
def SSE_PSIGN : OpndItins<
  IIC_SSE_PSIGN_RR, IIC_SSE_PSIGN_RM
>;
// pmulhrsw goes through the vector integer multiplier; the same itinerary
// class is used for both the rr and rm forms.
let Sched = WriteVecIMul in
def SSE_PMULHRSW : OpndItins<
  IIC_SSE_PMULHRSW, IIC_SSE_PMULHRSW
>;
5560
5561/// SS3I_binop_rm - Simple SSSE3 bin op
5562multiclass SS3I_binop_rm<bits<8> opc, string OpcodeStr, SDNode OpNode,
5563                         ValueType OpVT, RegisterClass RC, PatFrag memop_frag,
5564                         X86MemOperand x86memop, OpndItins itins,
5565                         bit Is2Addr = 1> {
5566  let isCommutable = 1 in
5567  def rr : SS38I<opc, MRMSrcReg, (outs RC:$dst),
5568       (ins RC:$src1, RC:$src2),
5569       !if(Is2Addr,
5570         !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
5571         !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
5572       [(set RC:$dst, (OpVT (OpNode RC:$src1, RC:$src2)))], itins.rr>,
5573       Sched<[itins.Sched]>;
5574  def rm : SS38I<opc, MRMSrcMem, (outs RC:$dst),
5575       (ins RC:$src1, x86memop:$src2),
5576       !if(Is2Addr,
5577         !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
5578         !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
5579       [(set RC:$dst,
5580         (OpVT (OpNode RC:$src1,
5581          (bitconvert (memop_frag addr:$src2)))))], itins.rm>,
5582       Sched<[itins.Sched.Folded, ReadAfterLd]>;
5583}
5584
5585/// SS3I_binop_rm_int - Simple SSSE3 bin op whose type can be v*{i8,i16,i32}.
5586multiclass SS3I_binop_rm_int<bits<8> opc, string OpcodeStr,
5587                             Intrinsic IntId128, OpndItins itins,
5588                             PatFrag ld_frag, bit Is2Addr = 1> {
5589  let isCommutable = 1 in
5590  def rr128 : SS38I<opc, MRMSrcReg, (outs VR128:$dst),
5591       (ins VR128:$src1, VR128:$src2),
5592       !if(Is2Addr,
5593         !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
5594         !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
5595       [(set VR128:$dst, (IntId128 VR128:$src1, VR128:$src2))]>,
5596       Sched<[itins.Sched]>;
5597  def rm128 : SS38I<opc, MRMSrcMem, (outs VR128:$dst),
5598       (ins VR128:$src1, i128mem:$src2),
5599       !if(Is2Addr,
5600         !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
5601         !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
5602       [(set VR128:$dst,
5603         (IntId128 VR128:$src1,
5604          (bitconvert (ld_frag addr:$src2))))]>,
5605       Sched<[itins.Sched.Folded, ReadAfterLd]>;
5606}
5607
/// SS3I_binop_rm_int_y - 256-bit (AVX2) variant of SS3I_binop_rm_int.
/// Always three-operand (VEX) syntax; memory form loads a full v4i64 and
/// bitcasts it for the intrinsic.
multiclass SS3I_binop_rm_int_y<bits<8> opc, string OpcodeStr,
                               Intrinsic IntId256,
                               X86FoldableSchedWrite Sched> {
  let isCommutable = 1 in
  def rr256 : SS38I<opc, MRMSrcReg, (outs VR256:$dst),
       (ins VR256:$src1, VR256:$src2),
       !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
       [(set VR256:$dst, (IntId256 VR256:$src1, VR256:$src2))]>,
       Sched<[Sched]>;
  def rm256 : SS38I<opc, MRMSrcMem, (outs VR256:$dst),
       (ins VR256:$src1, i256mem:$src2),
       !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
       [(set VR256:$dst,
         (IntId256 VR256:$src1, (bitconvert (loadv4i64 addr:$src2))))]>,
       Sched<[Sched.Folded, ReadAfterLd]>;
}
5624
// AVX (VEX-encoded, 128-bit) SSSE3 binary operators.  The horizontal
// add/sub and sign ops are not commutable; vpmulhrsw is.
let ImmT = NoImm, Predicates = [HasAVX] in {
let isCommutable = 0 in {
  defm VPHADDW    : SS3I_binop_rm<0x01, "vphaddw", X86hadd, v8i16, VR128,
                                  loadv2i64, i128mem,
                                  SSE_PHADDSUBW, 0>, VEX_4V;
  defm VPHADDD    : SS3I_binop_rm<0x02, "vphaddd", X86hadd, v4i32, VR128,
                                  loadv2i64, i128mem,
                                  SSE_PHADDSUBD, 0>, VEX_4V;
  defm VPHSUBW    : SS3I_binop_rm<0x05, "vphsubw", X86hsub, v8i16, VR128,
                                  loadv2i64, i128mem,
                                  SSE_PHADDSUBW, 0>, VEX_4V;
  defm VPHSUBD    : SS3I_binop_rm<0x06, "vphsubd", X86hsub, v4i32, VR128,
                                  loadv2i64, i128mem,
                                  SSE_PHADDSUBD, 0>, VEX_4V;
  defm VPSIGNB    : SS3I_binop_rm<0x08, "vpsignb", X86psign, v16i8, VR128,
                                  loadv2i64, i128mem,
                                  SSE_PSIGN, 0>, VEX_4V;
  defm VPSIGNW    : SS3I_binop_rm<0x09, "vpsignw", X86psign, v8i16, VR128,
                                  loadv2i64, i128mem,
                                  SSE_PSIGN, 0>, VEX_4V;
  defm VPSIGND    : SS3I_binop_rm<0x0A, "vpsignd", X86psign, v4i32, VR128,
                                  loadv2i64, i128mem,
                                  SSE_PSIGN, 0>, VEX_4V;
  defm VPSHUFB    : SS3I_binop_rm<0x00, "vpshufb", X86pshufb, v16i8, VR128,
                                  loadv2i64, i128mem,
                                  SSE_PSHUFB, 0>, VEX_4V;
  // Saturating horizontal ops and pmaddubsw have no generic SDNode; they
  // are selected through their intrinsics.
  defm VPHADDSW   : SS3I_binop_rm_int<0x03, "vphaddsw",
                                      int_x86_ssse3_phadd_sw_128,
                                      SSE_PHADDSUBSW, loadv2i64, 0>, VEX_4V;
  defm VPHSUBSW   : SS3I_binop_rm_int<0x07, "vphsubsw",
                                      int_x86_ssse3_phsub_sw_128,
                                      SSE_PHADDSUBSW, loadv2i64, 0>, VEX_4V;
  defm VPMADDUBSW : SS3I_binop_rm_int<0x04, "vpmaddubsw",
                                      int_x86_ssse3_pmadd_ub_sw_128,
                                      SSE_PMADD, loadv2i64, 0>, VEX_4V;
}
defm VPMULHRSW    : SS3I_binop_rm_int<0x0B, "vpmulhrsw",
                                      int_x86_ssse3_pmul_hr_sw_128,
                                      SSE_PMULHRSW, loadv2i64, 0>, VEX_4V;
}
5665
// AVX2 (VEX-encoded, 256-bit) SSSE3 binary operators.
// Itineraries are chosen to match the 128-bit AVX definitions above:
// dword horizontal ops use SSE_PHADDSUBD and the sign ops use SSE_PSIGN
// (previously these were all copy-pasted as SSE_PHADDSUBW).
let ImmT = NoImm, Predicates = [HasAVX2] in {
let isCommutable = 0 in {
  defm VPHADDWY   : SS3I_binop_rm<0x01, "vphaddw", X86hadd, v16i16, VR256,
                                  loadv4i64, i256mem,
                                  SSE_PHADDSUBW, 0>, VEX_4V, VEX_L;
  defm VPHADDDY   : SS3I_binop_rm<0x02, "vphaddd", X86hadd, v8i32, VR256,
                                  loadv4i64, i256mem,
                                  SSE_PHADDSUBD, 0>, VEX_4V, VEX_L;
  defm VPHSUBWY   : SS3I_binop_rm<0x05, "vphsubw", X86hsub, v16i16, VR256,
                                  loadv4i64, i256mem,
                                  SSE_PHADDSUBW, 0>, VEX_4V, VEX_L;
  defm VPHSUBDY   : SS3I_binop_rm<0x06, "vphsubd", X86hsub, v8i32, VR256,
                                  loadv4i64, i256mem,
                                  SSE_PHADDSUBD, 0>, VEX_4V, VEX_L;
  defm VPSIGNBY   : SS3I_binop_rm<0x08, "vpsignb", X86psign, v32i8, VR256,
                                  loadv4i64, i256mem,
                                  SSE_PSIGN, 0>, VEX_4V, VEX_L;
  defm VPSIGNWY   : SS3I_binop_rm<0x09, "vpsignw", X86psign, v16i16, VR256,
                                  loadv4i64, i256mem,
                                  SSE_PSIGN, 0>, VEX_4V, VEX_L;
  defm VPSIGNDY   : SS3I_binop_rm<0x0A, "vpsignd", X86psign, v8i32, VR256,
                                  loadv4i64, i256mem,
                                  SSE_PSIGN, 0>, VEX_4V, VEX_L;
  defm VPSHUFBY   : SS3I_binop_rm<0x00, "vpshufb", X86pshufb, v32i8, VR256,
                                  loadv4i64, i256mem,
                                  SSE_PSHUFB, 0>, VEX_4V, VEX_L;
  // Intrinsic-selected ops (no generic SDNode equivalents).
  defm VPHADDSW   : SS3I_binop_rm_int_y<0x03, "vphaddsw",
                                        int_x86_avx2_phadd_sw,
                                        WriteVecALU>, VEX_4V, VEX_L;
  defm VPHSUBSW   : SS3I_binop_rm_int_y<0x07, "vphsubsw",
                                        int_x86_avx2_phsub_sw,
                                        WriteVecALU>, VEX_4V, VEX_L;
  defm VPMADDUBSW : SS3I_binop_rm_int_y<0x04, "vpmaddubsw",
                                        int_x86_avx2_pmadd_ub_sw,
                                        WriteVecIMul>, VEX_4V, VEX_L;
}
defm VPMULHRSW    : SS3I_binop_rm_int_y<0x0B, "vpmulhrsw",
                                        int_x86_avx2_pmul_hr_sw,
                                        WriteVecIMul>, VEX_4V, VEX_L;
}
5706
5707// None of these have i8 immediate fields.
// Non-VEX SSSE3 binary operators (two-operand form: $src1 tied to $dst).
// None of these have i8 immediate fields.
let ImmT = NoImm, Constraints = "$src1 = $dst" in {
let isCommutable = 0 in {
  defm PHADDW    : SS3I_binop_rm<0x01, "phaddw", X86hadd, v8i16, VR128,
                                 memopv2i64, i128mem, SSE_PHADDSUBW>;
  defm PHADDD    : SS3I_binop_rm<0x02, "phaddd", X86hadd, v4i32, VR128,
                                 memopv2i64, i128mem, SSE_PHADDSUBD>;
  defm PHSUBW    : SS3I_binop_rm<0x05, "phsubw", X86hsub, v8i16, VR128,
                                 memopv2i64, i128mem, SSE_PHADDSUBW>;
  defm PHSUBD    : SS3I_binop_rm<0x06, "phsubd", X86hsub, v4i32, VR128,
                                 memopv2i64, i128mem, SSE_PHADDSUBD>;
  defm PSIGNB    : SS3I_binop_rm<0x08, "psignb", X86psign, v16i8, VR128,
                                 memopv2i64, i128mem, SSE_PSIGN>;
  defm PSIGNW    : SS3I_binop_rm<0x09, "psignw", X86psign, v8i16, VR128,
                                 memopv2i64, i128mem, SSE_PSIGN>;
  defm PSIGND    : SS3I_binop_rm<0x0A, "psignd", X86psign, v4i32, VR128,
                                 memopv2i64, i128mem, SSE_PSIGN>;
  defm PSHUFB    : SS3I_binop_rm<0x00, "pshufb", X86pshufb, v16i8, VR128,
                                 memopv2i64, i128mem, SSE_PSHUFB>;
  // Intrinsic-selected ops (no generic SDNode equivalents).
  defm PHADDSW   : SS3I_binop_rm_int<0x03, "phaddsw",
                                     int_x86_ssse3_phadd_sw_128,
                                     SSE_PHADDSUBSW, memopv2i64>;
  defm PHSUBSW   : SS3I_binop_rm_int<0x07, "phsubsw",
                                     int_x86_ssse3_phsub_sw_128,
                                     SSE_PHADDSUBSW, memopv2i64>;
  defm PMADDUBSW : SS3I_binop_rm_int<0x04, "pmaddubsw",
                                     int_x86_ssse3_pmadd_ub_sw_128,
                                     SSE_PMADD, memopv2i64>;
}
defm PMULHRSW    : SS3I_binop_rm_int<0x0B, "pmulhrsw",
                                     int_x86_ssse3_pmul_hr_sw_128,
                                     SSE_PMULHRSW, memopv2i64>;
}
5740
5741//===---------------------------------------------------------------------===//
5742// SSSE3 - Packed Align Instruction Patterns
5743//===---------------------------------------------------------------------===//
5744
/// ssse3_palignr - 128-bit palignr rr/rm forms.  No selection patterns here
/// (hasSideEffects = 0, empty pattern lists); matching is done via the
/// X86PAlignr Pats further below.
multiclass ssse3_palignr<string asm, bit Is2Addr = 1> {
  let hasSideEffects = 0 in {
  def R128rr : SS3AI<0x0F, MRMSrcReg, (outs VR128:$dst),
      (ins VR128:$src1, VR128:$src2, u8imm:$src3),
      !if(Is2Addr,
        !strconcat(asm, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
        !strconcat(asm,
                  "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
      [], IIC_SSE_PALIGNRR>, Sched<[WriteShuffle]>;
  let mayLoad = 1 in
  def R128rm : SS3AI<0x0F, MRMSrcMem, (outs VR128:$dst),
      (ins VR128:$src1, i128mem:$src2, u8imm:$src3),
      !if(Is2Addr,
        !strconcat(asm, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
        !strconcat(asm,
                  "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
      [], IIC_SSE_PALIGNRM>, Sched<[WriteShuffleLd, ReadAfterLd]>;
  }
}
5764
/// ssse3_palignr_y - 256-bit (AVX2) vpalignr rr/rm forms; always VEX
/// three-operand syntax, no selection patterns (matched via Pats below).
multiclass ssse3_palignr_y<string asm, bit Is2Addr = 1> {
  let hasSideEffects = 0 in {
  def R256rr : SS3AI<0x0F, MRMSrcReg, (outs VR256:$dst),
      (ins VR256:$src1, VR256:$src2, u8imm:$src3),
      !strconcat(asm,
                 "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
      []>, Sched<[WriteShuffle]>;
  let mayLoad = 1 in
  def R256rm : SS3AI<0x0F, MRMSrcMem, (outs VR256:$dst),
      (ins VR256:$src1, i256mem:$src2, u8imm:$src3),
      !strconcat(asm,
                 "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
      []>, Sched<[WriteShuffleLd, ReadAfterLd]>;
  }
}
5780
// Instantiate palignr for VEX (128/256-bit) and legacy SSSE3 encodings.
let Predicates = [HasAVX] in
  defm VPALIGN : ssse3_palignr<"vpalignr", 0>, VEX_4V;
let Predicates = [HasAVX2] in
  defm VPALIGN : ssse3_palignr_y<"vpalignr", 0>, VEX_4V, VEX_L;
let Constraints = "$src1 = $dst", Predicates = [UseSSSE3] in
  defm PALIGN : ssse3_palignr<"palignr">;
5787
// Select X86PAlignr onto VPALIGNR256rr.  Note the operands are swapped
// ($src2 first) relative to the DAG node in every pattern.
let Predicates = [HasAVX2] in {
def : Pat<(v8i32 (X86PAlignr VR256:$src1, VR256:$src2, (i8 imm:$imm))),
          (VPALIGNR256rr VR256:$src2, VR256:$src1, imm:$imm)>;
def : Pat<(v8f32 (X86PAlignr VR256:$src1, VR256:$src2, (i8 imm:$imm))),
          (VPALIGNR256rr VR256:$src2, VR256:$src1, imm:$imm)>;
def : Pat<(v16i16 (X86PAlignr VR256:$src1, VR256:$src2, (i8 imm:$imm))),
          (VPALIGNR256rr VR256:$src2, VR256:$src1, imm:$imm)>;
def : Pat<(v32i8 (X86PAlignr VR256:$src1, VR256:$src2, (i8 imm:$imm))),
          (VPALIGNR256rr VR256:$src2, VR256:$src1, imm:$imm)>;
}
5798
// Select X86PAlignr onto the VEX 128-bit form (operands swapped, as above).
let Predicates = [HasAVX] in {
def : Pat<(v4i32 (X86PAlignr VR128:$src1, VR128:$src2, (i8 imm:$imm))),
          (VPALIGNR128rr VR128:$src2, VR128:$src1, imm:$imm)>;
def : Pat<(v4f32 (X86PAlignr VR128:$src1, VR128:$src2, (i8 imm:$imm))),
          (VPALIGNR128rr VR128:$src2, VR128:$src1, imm:$imm)>;
def : Pat<(v8i16 (X86PAlignr VR128:$src1, VR128:$src2, (i8 imm:$imm))),
          (VPALIGNR128rr VR128:$src2, VR128:$src1, imm:$imm)>;
def : Pat<(v16i8 (X86PAlignr VR128:$src1, VR128:$src2, (i8 imm:$imm))),
          (VPALIGNR128rr VR128:$src2, VR128:$src1, imm:$imm)>;
}
5809
// Select X86PAlignr onto the legacy SSSE3 form (operands swapped, as above).
let Predicates = [UseSSSE3] in {
def : Pat<(v4i32 (X86PAlignr VR128:$src1, VR128:$src2, (i8 imm:$imm))),
          (PALIGNR128rr VR128:$src2, VR128:$src1, imm:$imm)>;
def : Pat<(v4f32 (X86PAlignr VR128:$src1, VR128:$src2, (i8 imm:$imm))),
          (PALIGNR128rr VR128:$src2, VR128:$src1, imm:$imm)>;
def : Pat<(v8i16 (X86PAlignr VR128:$src1, VR128:$src2, (i8 imm:$imm))),
          (PALIGNR128rr VR128:$src2, VR128:$src1, imm:$imm)>;
def : Pat<(v16i8 (X86PAlignr VR128:$src1, VR128:$src2, (i8 imm:$imm))),
          (PALIGNR128rr VR128:$src2, VR128:$src1, imm:$imm)>;
}
5820
//===---------------------------------------------------------------------===//
// SSE3 - Thread synchronization
//===---------------------------------------------------------------------===//
5824
// MONITOR/MWAIT (SSE3 thread-synchronization instructions).
let SchedRW = [WriteSystem] in {
// Pseudo taking explicit operands; lowered by a custom inserter into the
// real MONITOR instruction (which reads fixed registers).
let usesCustomInserter = 1 in {
def MONITOR : PseudoI<(outs), (ins i32mem:$src1, GR32:$src2, GR32:$src3),
                [(int_x86_sse3_monitor addr:$src1, GR32:$src2, GR32:$src3)]>,
                Requires<[HasSSE3]>;
}

// Real encodings: operands are implicit in fixed registers.
let Uses = [EAX, ECX, EDX] in
def MONITORrrr : I<0x01, MRM_C8, (outs), (ins), "monitor", [], IIC_SSE_MONITOR>,
                 TB, Requires<[HasSSE3]>;
let Uses = [ECX, EAX] in
def MWAITrr   : I<0x01, MRM_C9, (outs), (ins), "mwait",
                [(int_x86_sse3_mwait ECX, EAX)], IIC_SSE_MWAIT>,
                TB, Requires<[HasSSE3]>;
} // SchedRW
5840
// Assembler aliases accepting the explicit-register spellings of mwait and
// monitor; 32- vs. 64-bit modes name the registers differently.
def : InstAlias<"mwait\t{%eax, %ecx|ecx, eax}", (MWAITrr)>, Requires<[Not64BitMode]>;
def : InstAlias<"mwait\t{%rax, %rcx|rcx, rax}", (MWAITrr)>, Requires<[In64BitMode]>;

def : InstAlias<"monitor\t{%eax, %ecx, %edx|edx, ecx, eax}", (MONITORrrr)>,
      Requires<[Not64BitMode]>;
def : InstAlias<"monitor\t{%rax, %rcx, %rdx|rdx, rcx, rax}", (MONITORrrr)>,
      Requires<[In64BitMode]>;
5848
5849//===----------------------------------------------------------------------===//
5850// SSE4.1 - Packed Move with Sign/Zero Extend
5851//===----------------------------------------------------------------------===//
5852
/// SS41I_pmovx_rrrm - One pmovsx/pmovzx instruction pair (rr and rm).
/// Both forms have empty pattern lists; selection is done via the pattern
/// multiclasses below.  OutRC/InRC allow the 256-bit (VR256 <- VR128)
/// AVX2 variants to reuse this.
multiclass SS41I_pmovx_rrrm<bits<8> opc, string OpcodeStr, X86MemOperand MemOp,
                          RegisterClass OutRC, RegisterClass InRC,
                          OpndItins itins> {
  def rr : SS48I<opc, MRMSrcReg, (outs OutRC:$dst), (ins InRC:$src),
                 !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
                 [], itins.rr>,
                 Sched<[itins.Sched]>;

  def rm : SS48I<opc, MRMSrcMem, (outs OutRC:$dst), (ins MemOp:$src),
                 !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
                 [],
                 itins.rm>, Sched<[itins.Sched.Folded]>;
}
5866
/// SS41I_pmovx_rm_all - Instantiate one pmovx opcode for all three ISA
/// levels: legacy SSE4.1 (128-bit), AVX VEX (128-bit, "v" prefix), and
/// AVX2 VEX.L (256-bit destination, V...Y names).
multiclass SS41I_pmovx_rm_all<bits<8> opc, string OpcodeStr,
                          X86MemOperand MemOp, X86MemOperand MemYOp,
                          OpndItins SSEItins, OpndItins AVXItins,
                          OpndItins AVX2Itins> {
  defm NAME : SS41I_pmovx_rrrm<opc, OpcodeStr, MemOp, VR128, VR128, SSEItins>;
  let Predicates = [HasAVX] in
    defm V#NAME   : SS41I_pmovx_rrrm<opc, !strconcat("v", OpcodeStr), MemOp,
                                     VR128, VR128, AVXItins>, VEX;
  let Predicates = [HasAVX2] in
    defm V#NAME#Y : SS41I_pmovx_rrrm<opc, !strconcat("v", OpcodeStr), MemYOp,
                                     VR256, VR128, AVX2Itins>, VEX, VEX_L;
}
5879
/// SS41I_pmovx_rm - Instantiate both the sign-extending (pmovsx, opc) and
/// zero-extending (pmovzx, opc + 0x10) variants of one element-width pair.
multiclass SS41I_pmovx_rm<bits<8> opc, string OpcodeStr,
                                X86MemOperand MemOp, X86MemOperand MemYOp> {
  defm PMOVSX#NAME : SS41I_pmovx_rm_all<opc, !strconcat("pmovsx", OpcodeStr),
                                        MemOp, MemYOp,
                                        SSE_INTALU_ITINS_SHUFF_P,
                                        DEFAULT_ITINS_SHUFFLESCHED,
                                        DEFAULT_ITINS_SHUFFLESCHED>;
  // The zero-extend opcode is always the sign-extend opcode + 0x10.
  defm PMOVZX#NAME : SS41I_pmovx_rm_all<!add(opc, 0x10),
                                        !strconcat("pmovzx", OpcodeStr),
                                        MemOp, MemYOp,
                                        SSE_INTALU_ITINS_SHUFF_P,
                                        DEFAULT_ITINS_SHUFFLESCHED,
                                        DEFAULT_ITINS_SHUFFLESCHED>;
}
5894
// All pmovsx/pmovzx element-width combinations.  The memory operand width
// shrinks with the extension ratio (2x -> 64-bit, 4x -> 32-bit, 8x -> 16-bit
// for the 128-bit forms; MemYOp is the 256-bit destination's source width).
defm BW : SS41I_pmovx_rm<0x20, "bw", i64mem, i128mem>;
defm WD : SS41I_pmovx_rm<0x23, "wd", i64mem, i128mem>;
defm DQ : SS41I_pmovx_rm<0x25, "dq", i64mem, i128mem>;

defm BD : SS41I_pmovx_rm<0x21, "bd", i32mem, i64mem>;
defm WQ : SS41I_pmovx_rm<0x24, "wq", i32mem, i64mem>;

defm BQ : SS41I_pmovx_rm<0x22, "bq", i16mem, i32mem>;
5903
5904// AVX2 Patterns
5905multiclass SS41I_pmovx_avx2_patterns<string OpcPrefix, string ExtTy, SDNode ExtOp> {
5906  // Register-Register patterns
5907  def : Pat<(v16i16 (ExtOp (v16i8 VR128:$src))),
5908            (!cast<I>(OpcPrefix#BWYrr) VR128:$src)>;
5909  def : Pat<(v8i32 (ExtOp (v16i8 VR128:$src))),
5910            (!cast<I>(OpcPrefix#BDYrr) VR128:$src)>;
5911  def : Pat<(v4i64 (ExtOp (v16i8 VR128:$src))),
5912            (!cast<I>(OpcPrefix#BQYrr) VR128:$src)>;
5913
5914  def : Pat<(v8i32 (ExtOp (v8i16 VR128:$src))),
5915            (!cast<I>(OpcPrefix#WDYrr) VR128:$src)>;
5916  def : Pat<(v4i64 (ExtOp (v8i16 VR128:$src))),
5917            (!cast<I>(OpcPrefix#WQYrr) VR128:$src)>;
5918
5919  def : Pat<(v4i64 (ExtOp (v4i32 VR128:$src))),
5920            (!cast<I>(OpcPrefix#DQYrr) VR128:$src)>;
5921
5922  // On AVX2, we also support 256bit inputs.
5923  def : Pat<(v16i16 (ExtOp (v32i8 VR256:$src))),
5924            (!cast<I>(OpcPrefix#BWYrr) (EXTRACT_SUBREG VR256:$src, sub_xmm))>;
5925  def : Pat<(v8i32 (ExtOp (v32i8 VR256:$src))),
5926            (!cast<I>(OpcPrefix#BDYrr) (EXTRACT_SUBREG VR256:$src, sub_xmm))>;
5927  def : Pat<(v4i64 (ExtOp (v32i8 VR256:$src))),
5928            (!cast<I>(OpcPrefix#BQYrr) (EXTRACT_SUBREG VR256:$src, sub_xmm))>;
5929
5930  def : Pat<(v8i32 (ExtOp (v16i16 VR256:$src))),
5931            (!cast<I>(OpcPrefix#WDYrr) (EXTRACT_SUBREG VR256:$src, sub_xmm))>;
5932  def : Pat<(v4i64 (ExtOp (v16i16 VR256:$src))),
5933            (!cast<I>(OpcPrefix#WQYrr) (EXTRACT_SUBREG VR256:$src, sub_xmm))>;
5934
5935  def : Pat<(v4i64 (ExtOp (v8i32 VR256:$src))),
5936            (!cast<I>(OpcPrefix#DQYrr) (EXTRACT_SUBREG VR256:$src, sub_xmm))>;
5937
5938  // Simple Register-Memory patterns
5939  def : Pat<(v16i16 (!cast<PatFrag>(ExtTy#"extloadvi8") addr:$src)),
5940            (!cast<I>(OpcPrefix#BWYrm) addr:$src)>;
5941  def : Pat<(v8i32 (!cast<PatFrag>(ExtTy#"extloadvi8") addr:$src)),
5942            (!cast<I>(OpcPrefix#BDYrm) addr:$src)>;
5943  def : Pat<(v4i64 (!cast<PatFrag>(ExtTy#"extloadvi8") addr:$src)),
5944            (!cast<I>(OpcPrefix#BQYrm) addr:$src)>;
5945
5946  def : Pat<(v8i32 (!cast<PatFrag>(ExtTy#"extloadvi16") addr:$src)),
5947            (!cast<I>(OpcPrefix#WDYrm) addr:$src)>;
5948  def : Pat<(v4i64 (!cast<PatFrag>(ExtTy#"extloadvi16") addr:$src)),
5949            (!cast<I>(OpcPrefix#WQYrm) addr:$src)>;
5950
5951  def : Pat<(v4i64 (!cast<PatFrag>(ExtTy#"extloadvi32") addr:$src)),
5952            (!cast<I>(OpcPrefix#DQYrm) addr:$src)>;
5953
5954  // AVX2 Register-Memory patterns
5955  def : Pat<(v16i16 (ExtOp (bc_v16i8 (loadv2i64 addr:$src)))),
5956            (!cast<I>(OpcPrefix#BWYrm) addr:$src)>;
5957  def : Pat<(v16i16 (ExtOp (v16i8 (vzmovl_v2i64 addr:$src)))),
5958            (!cast<I>(OpcPrefix#BWYrm) addr:$src)>;
5959  def : Pat<(v16i16 (ExtOp (v16i8 (vzload_v2i64 addr:$src)))),
5960            (!cast<I>(OpcPrefix#BWYrm) addr:$src)>;
5961  def : Pat<(v16i16 (ExtOp (bc_v16i8 (loadv2i64 addr:$src)))),
5962            (!cast<I>(OpcPrefix#BWYrm) addr:$src)>;
5963
5964  def : Pat<(v8i32 (ExtOp (bc_v16i8 (v2i64 (scalar_to_vector (loadi64 addr:$src)))))),
5965            (!cast<I>(OpcPrefix#BDYrm) addr:$src)>;
5966  def : Pat<(v8i32 (ExtOp (v16i8 (vzmovl_v2i64 addr:$src)))),
5967            (!cast<I>(OpcPrefix#BDYrm) addr:$src)>;
5968  def : Pat<(v8i32 (ExtOp (v16i8 (vzload_v2i64 addr:$src)))),
5969            (!cast<I>(OpcPrefix#BDYrm) addr:$src)>;
5970  def : Pat<(v8i32 (ExtOp (bc_v16i8 (loadv2i64 addr:$src)))),
5971            (!cast<I>(OpcPrefix#BDYrm) addr:$src)>;
5972
5973  def : Pat<(v4i64 (ExtOp (bc_v16i8 (v4i32 (scalar_to_vector (loadi32 addr:$src)))))),
5974            (!cast<I>(OpcPrefix#BQYrm) addr:$src)>;
5975  def : Pat<(v4i64 (ExtOp (v16i8 (vzmovl_v4i32 addr:$src)))),
5976            (!cast<I>(OpcPrefix#BQYrm) addr:$src)>;
5977  def : Pat<(v4i64 (ExtOp (v16i8 (vzload_v2i64 addr:$src)))),
5978            (!cast<I>(OpcPrefix#BQYrm) addr:$src)>;
5979  def : Pat<(v4i64 (ExtOp (bc_v16i8 (loadv2i64 addr:$src)))),
5980            (!cast<I>(OpcPrefix#BQYrm) addr:$src)>;
5981
5982  def : Pat<(v8i32 (ExtOp (bc_v8i16 (loadv2i64 addr:$src)))),
5983            (!cast<I>(OpcPrefix#WDYrm) addr:$src)>;
5984  def : Pat<(v8i32 (ExtOp (v8i16 (vzmovl_v2i64 addr:$src)))),
5985            (!cast<I>(OpcPrefix#WDYrm) addr:$src)>;
5986  def : Pat<(v8i32 (ExtOp (v8i16 (vzload_v2i64 addr:$src)))),
5987            (!cast<I>(OpcPrefix#WDYrm) addr:$src)>;
5988  def : Pat<(v8i32 (ExtOp (bc_v8i16 (loadv2i64 addr:$src)))),
5989            (!cast<I>(OpcPrefix#WDYrm) addr:$src)>;
5990
5991  def : Pat<(v4i64 (ExtOp (bc_v8i16 (v2i64 (scalar_to_vector (loadi64 addr:$src)))))),
5992            (!cast<I>(OpcPrefix#WQYrm) addr:$src)>;
5993  def : Pat<(v4i64 (ExtOp (v8i16 (vzmovl_v2i64 addr:$src)))),
5994            (!cast<I>(OpcPrefix#WQYrm) addr:$src)>;
5995  def : Pat<(v4i64 (ExtOp (v8i16 (vzload_v2i64 addr:$src)))),
5996            (!cast<I>(OpcPrefix#WQYrm) addr:$src)>;
5997  def : Pat<(v4i64 (ExtOp (bc_v8i16 (loadv2i64 addr:$src)))),
5998            (!cast<I>(OpcPrefix#WQYrm) addr:$src)>;
5999
6000  def : Pat<(v4i64 (ExtOp (bc_v4i32 (loadv2i64 addr:$src)))),
6001            (!cast<I>(OpcPrefix#DQYrm) addr:$src)>;
6002  def : Pat<(v4i64 (ExtOp (v4i32 (vzmovl_v2i64 addr:$src)))),
6003            (!cast<I>(OpcPrefix#DQYrm) addr:$src)>;
6004  def : Pat<(v4i64 (ExtOp (v4i32 (vzload_v2i64 addr:$src)))),
6005            (!cast<I>(OpcPrefix#DQYrm) addr:$src)>;
6006  def : Pat<(v4i64 (ExtOp (bc_v4i32 (loadv2i64 addr:$src)))),
6007            (!cast<I>(OpcPrefix#DQYrm) addr:$src)>;
6008}
6009
// Instantiate the AVX2 extension patterns for both sign and zero extend.
let Predicates = [HasAVX2] in {
  defm : SS41I_pmovx_avx2_patterns<"VPMOVSX", "s", X86vsext>;
  defm : SS41I_pmovx_avx2_patterns<"VPMOVZX", "z", X86vzext>;
}
6014
6015// SSE4.1/AVX patterns.
// SSE4.1/AVX patterns.
// 128-bit counterpart of SS41I_pmovx_avx2_patterns: selects ExtOp
// (X86vsext/X86vzext) onto the 128-bit PMOVSX*/PMOVZX* instructions.
// ExtLoad16 is the 16-bit scalar load fragment used by the BQ (byte->qword,
// only 2 source bytes) memory pattern; it differs for sext vs. zext.
multiclass SS41I_pmovx_patterns<string OpcPrefix, string ExtTy,
                                SDNode ExtOp, PatFrag ExtLoad16> {
  // Register-Register patterns.
  def : Pat<(v8i16 (ExtOp (v16i8 VR128:$src))),
            (!cast<I>(OpcPrefix#BWrr) VR128:$src)>;
  def : Pat<(v4i32 (ExtOp (v16i8 VR128:$src))),
            (!cast<I>(OpcPrefix#BDrr) VR128:$src)>;
  def : Pat<(v2i64 (ExtOp (v16i8 VR128:$src))),
            (!cast<I>(OpcPrefix#BQrr) VR128:$src)>;

  def : Pat<(v4i32 (ExtOp (v8i16 VR128:$src))),
            (!cast<I>(OpcPrefix#WDrr) VR128:$src)>;
  def : Pat<(v2i64 (ExtOp (v8i16 VR128:$src))),
            (!cast<I>(OpcPrefix#WQrr) VR128:$src)>;

  def : Pat<(v2i64 (ExtOp (v4i32 VR128:$src))),
            (!cast<I>(OpcPrefix#DQrr) VR128:$src)>;

  // Simple Register-Memory patterns (via the s/z extload PatFrags).
  def : Pat<(v8i16 (!cast<PatFrag>(ExtTy#"extloadvi8") addr:$src)),
            (!cast<I>(OpcPrefix#BWrm) addr:$src)>;
  def : Pat<(v4i32 (!cast<PatFrag>(ExtTy#"extloadvi8") addr:$src)),
            (!cast<I>(OpcPrefix#BDrm) addr:$src)>;
  def : Pat<(v2i64 (!cast<PatFrag>(ExtTy#"extloadvi8") addr:$src)),
            (!cast<I>(OpcPrefix#BQrm) addr:$src)>;

  def : Pat<(v4i32 (!cast<PatFrag>(ExtTy#"extloadvi16") addr:$src)),
            (!cast<I>(OpcPrefix#WDrm) addr:$src)>;
  def : Pat<(v2i64 (!cast<PatFrag>(ExtTy#"extloadvi16") addr:$src)),
            (!cast<I>(OpcPrefix#WQrm) addr:$src)>;

  def : Pat<(v2i64 (!cast<PatFrag>(ExtTy#"extloadvi32") addr:$src)),
            (!cast<I>(OpcPrefix#DQrm) addr:$src)>;

  // Register-Memory patterns for the various load shapes (scalar loads
  // widened to vectors, zero-extending moves/loads, and full vector loads).
  def : Pat<(v8i16 (ExtOp (bc_v16i8 (v2i64 (scalar_to_vector (loadi64 addr:$src)))))),
            (!cast<I>(OpcPrefix#BWrm) addr:$src)>;
  def : Pat<(v8i16 (ExtOp (bc_v16i8 (v2f64 (scalar_to_vector (loadf64 addr:$src)))))),
            (!cast<I>(OpcPrefix#BWrm) addr:$src)>;
  def : Pat<(v8i16 (ExtOp (v16i8 (vzmovl_v2i64 addr:$src)))),
            (!cast<I>(OpcPrefix#BWrm) addr:$src)>;
  def : Pat<(v8i16 (ExtOp (v16i8 (vzload_v2i64 addr:$src)))),
            (!cast<I>(OpcPrefix#BWrm) addr:$src)>;
  def : Pat<(v8i16 (ExtOp (bc_v16i8 (loadv2i64 addr:$src)))),
            (!cast<I>(OpcPrefix#BWrm) addr:$src)>;

  def : Pat<(v4i32 (ExtOp (bc_v16i8 (v4i32 (scalar_to_vector (loadi32 addr:$src)))))),
            (!cast<I>(OpcPrefix#BDrm) addr:$src)>;
  def : Pat<(v4i32 (ExtOp (v16i8 (vzmovl_v4i32 addr:$src)))),
            (!cast<I>(OpcPrefix#BDrm) addr:$src)>;
  def : Pat<(v4i32 (ExtOp (v16i8 (vzload_v2i64 addr:$src)))),
            (!cast<I>(OpcPrefix#BDrm) addr:$src)>;
  def : Pat<(v4i32 (ExtOp (bc_v16i8 (loadv2i64 addr:$src)))),
            (!cast<I>(OpcPrefix#BDrm) addr:$src)>;

  def : Pat<(v2i64 (ExtOp (bc_v16i8 (v4i32 (scalar_to_vector (ExtLoad16 addr:$src)))))),
            (!cast<I>(OpcPrefix#BQrm) addr:$src)>;
  def : Pat<(v2i64 (ExtOp (v16i8 (vzmovl_v4i32 addr:$src)))),
            (!cast<I>(OpcPrefix#BQrm) addr:$src)>;
  def : Pat<(v2i64 (ExtOp (v16i8 (vzload_v2i64 addr:$src)))),
            (!cast<I>(OpcPrefix#BQrm) addr:$src)>;
  def : Pat<(v2i64 (ExtOp (bc_v16i8 (loadv2i64 addr:$src)))),
            (!cast<I>(OpcPrefix#BQrm) addr:$src)>;

  def : Pat<(v4i32 (ExtOp (bc_v8i16 (v2i64 (scalar_to_vector (loadi64 addr:$src)))))),
            (!cast<I>(OpcPrefix#WDrm) addr:$src)>;
  def : Pat<(v4i32 (ExtOp (bc_v8i16 (v2f64 (scalar_to_vector (loadf64 addr:$src)))))),
            (!cast<I>(OpcPrefix#WDrm) addr:$src)>;
  def : Pat<(v4i32 (ExtOp (v8i16 (vzmovl_v2i64 addr:$src)))),
            (!cast<I>(OpcPrefix#WDrm) addr:$src)>;
  def : Pat<(v4i32 (ExtOp (v8i16 (vzload_v2i64 addr:$src)))),
            (!cast<I>(OpcPrefix#WDrm) addr:$src)>;
  def : Pat<(v4i32 (ExtOp (bc_v8i16 (loadv2i64 addr:$src)))),
            (!cast<I>(OpcPrefix#WDrm) addr:$src)>;

  def : Pat<(v2i64 (ExtOp (bc_v8i16 (v4i32 (scalar_to_vector (loadi32 addr:$src)))))),
            (!cast<I>(OpcPrefix#WQrm) addr:$src)>;
  def : Pat<(v2i64 (ExtOp (v8i16 (vzmovl_v4i32 addr:$src)))),
            (!cast<I>(OpcPrefix#WQrm) addr:$src)>;
  def : Pat<(v2i64 (ExtOp (v8i16 (vzload_v2i64 addr:$src)))),
            (!cast<I>(OpcPrefix#WQrm) addr:$src)>;
  def : Pat<(v2i64 (ExtOp (bc_v8i16 (loadv2i64 addr:$src)))),
            (!cast<I>(OpcPrefix#WQrm) addr:$src)>;

  def : Pat<(v2i64 (ExtOp (bc_v4i32 (v2i64 (scalar_to_vector (loadi64 addr:$src)))))),
            (!cast<I>(OpcPrefix#DQrm) addr:$src)>;
  def : Pat<(v2i64 (ExtOp (bc_v4i32 (v2f64 (scalar_to_vector (loadf64 addr:$src)))))),
            (!cast<I>(OpcPrefix#DQrm) addr:$src)>;
  def : Pat<(v2i64 (ExtOp (v4i32 (vzmovl_v2i64 addr:$src)))),
            (!cast<I>(OpcPrefix#DQrm) addr:$src)>;
  def : Pat<(v2i64 (ExtOp (v4i32 (vzload_v2i64 addr:$src)))),
            (!cast<I>(OpcPrefix#DQrm) addr:$src)>;
  def : Pat<(v2i64 (ExtOp (bc_v4i32 (loadv2i64 addr:$src)))),
            (!cast<I>(OpcPrefix#DQrm) addr:$src)>;
}
6108
// Instantiate the 128-bit extension patterns for VEX (HasAVX) and legacy
// (UseSSE41) encodings; sext's BQ source load is extloadi32i16, zext's is
// loadi16_anyext.
let Predicates = [HasAVX] in {
  defm : SS41I_pmovx_patterns<"VPMOVSX", "s", X86vsext, extloadi32i16>;
  defm : SS41I_pmovx_patterns<"VPMOVZX", "z", X86vzext, loadi16_anyext>;
}

let Predicates = [UseSSE41] in {
  defm : SS41I_pmovx_patterns<"PMOVSX", "s", X86vsext, extloadi32i16>;
  defm : SS41I_pmovx_patterns<"PMOVZX", "z", X86vzext, loadi16_anyext>;
}
6118
6119//===----------------------------------------------------------------------===//
6120// SSE4.1 - Extract Instructions
6121//===----------------------------------------------------------------------===//
6122
6123/// SS41I_binop_ext8 - SSE 4.1 extract 8 bits to 32 bit reg or 8 bit mem
/// SS41I_binop_ext8 - SSE 4.1 extract 8 bits to 32 bit reg or 8 bit mem
/// rr form zero-extends the selected byte into a GR32/GR64; mr form
/// truncates the (known zero-extended) result back to 8 bits for the store.
multiclass SS41I_extract8<bits<8> opc, string OpcodeStr> {
  def rr : SS4AIi8<opc, MRMDestReg, (outs GR32orGR64:$dst),
                 (ins VR128:$src1, u8imm:$src2),
                 !strconcat(OpcodeStr,
                            "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
                 [(set GR32orGR64:$dst, (X86pextrb (v16i8 VR128:$src1),
                                         imm:$src2))]>,
                  Sched<[WriteShuffle]>;
  let hasSideEffects = 0, mayStore = 1,
      SchedRW = [WriteShuffleLd, WriteRMW] in
  def mr : SS4AIi8<opc, MRMDestMem, (outs),
                 (ins i8mem:$dst, VR128:$src1, u8imm:$src2),
                 !strconcat(OpcodeStr,
                            "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
                 [(store (i8 (trunc (assertzext (X86pextrb (v16i8 VR128:$src1),
                                                 imm:$src2)))), addr:$dst)]>;
}
6141
// VEX and legacy encodings of PEXTRB (opcode 0x14).
let Predicates = [HasAVX] in
  defm VPEXTRB : SS41I_extract8<0x14, "vpextrb">, VEX;

defm PEXTRB      : SS41I_extract8<0x14, "pextrb">;
6146
6147
/// SS41I_extract16 - SSE 4.1 extract 16 bits to memory destination
/// (PEXTRW).  The register form is assembler/disassembler only
/// (isCodeGenOnly + no pattern); the SSE2 PEXTRW handles reg extraction.
multiclass SS41I_extract16<bits<8> opc, string OpcodeStr> {
  let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0 in
  def rr_REV : SS4AIi8<opc, MRMDestReg, (outs GR32orGR64:$dst),
                   (ins VR128:$src1, u8imm:$src2),
                   !strconcat(OpcodeStr,
                   "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
                   []>, Sched<[WriteShuffle]>;

  // Store form: the extract is known zero-extended, so a truncating i16
  // store matches.
  let hasSideEffects = 0, mayStore = 1,
      SchedRW = [WriteShuffleLd, WriteRMW] in
  def mr : SS4AIi8<opc, MRMDestMem, (outs),
                 (ins i16mem:$dst, VR128:$src1, u8imm:$src2),
                 !strconcat(OpcodeStr,
                  "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
                 [(store (i16 (trunc (assertzext (X86pextrw (v8i16 VR128:$src1),
                                                  imm:$src2)))), addr:$dst)]>;
}
6166
// VEX and legacy encodings of the SSE4.1 PEXTRW (opcode 0x15).
let Predicates = [HasAVX] in
  defm VPEXTRW : SS41I_extract16<0x15, "vpextrw">, VEX;

defm PEXTRW      : SS41I_extract16<0x15, "pextrw">;
6171
6172
/// SS41I_extract32 - SSE 4.1 extract 32 bits to int reg or memory destination
/// (PEXTRD): extractelt of a v4i32 lane into a GR32 or straight to memory.
multiclass SS41I_extract32<bits<8> opc, string OpcodeStr> {
  def rr : SS4AIi8<opc, MRMDestReg, (outs GR32:$dst),
                 (ins VR128:$src1, u8imm:$src2),
                 !strconcat(OpcodeStr,
                  "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
                 [(set GR32:$dst,
                  (extractelt (v4i32 VR128:$src1), imm:$src2))]>,
                  Sched<[WriteShuffle]>;
  let SchedRW = [WriteShuffleLd, WriteRMW] in
  def mr : SS4AIi8<opc, MRMDestMem, (outs),
                 (ins i32mem:$dst, VR128:$src1, u8imm:$src2),
                 !strconcat(OpcodeStr,
                  "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
                 [(store (extractelt (v4i32 VR128:$src1), imm:$src2),
                          addr:$dst)]>;
}
6190
// VEX and legacy encodings of PEXTRD (opcode 0x16, no REX.W).
let Predicates = [HasAVX] in
  defm VPEXTRD : SS41I_extract32<0x16, "vpextrd">, VEX;

defm PEXTRD      : SS41I_extract32<0x16, "pextrd">;
6195
/// SS41I_extract64 - SSE 4.1 extract 64 bits to int reg or memory destination
/// (PEXTRQ): extractelt of a v2i64 lane into a GR64 or straight to memory.
/// Both forms carry REX.W to select the 64-bit operation.
multiclass SS41I_extract64<bits<8> opc, string OpcodeStr> {
  def rr : SS4AIi8<opc, MRMDestReg, (outs GR64:$dst),
                 (ins VR128:$src1, u8imm:$src2),
                 !strconcat(OpcodeStr,
                  "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
                 [(set GR64:$dst,
                  (extractelt (v2i64 VR128:$src1), imm:$src2))]>,
                  Sched<[WriteShuffle]>, REX_W;
  let SchedRW = [WriteShuffleLd, WriteRMW] in
  def mr : SS4AIi8<opc, MRMDestMem, (outs),
                 (ins i64mem:$dst, VR128:$src1, u8imm:$src2),
                 !strconcat(OpcodeStr,
                  "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
                 [(store (extractelt (v2i64 VR128:$src1), imm:$src2),
                          addr:$dst)]>, REX_W;
}
6213
// PEXTRQ shares opcode 0x16 with PEXTRD; REX.W/VEX.W distinguishes them.
let Predicates = [HasAVX] in
  defm VPEXTRQ : SS41I_extract64<0x16, "vpextrq">, VEX, VEX_W;

defm PEXTRQ      : SS41I_extract64<0x16, "pextrq">;
6218
/// SS41I_extractf32 - SSE 4.1 extract 32 bits fp value to int reg or memory
/// destination (EXTRACTPS).  The pattern matches an integer extractelt of
/// the float vector viewed as v4i32 (bc_v4i32).
multiclass SS41I_extractf32<bits<8> opc, string OpcodeStr,
                            OpndItins itins = DEFAULT_ITINS> {
  def rr : SS4AIi8<opc, MRMDestReg, (outs GR32orGR64:$dst),
                 (ins VR128:$src1, u8imm:$src2),
                 !strconcat(OpcodeStr,
                  "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
                 [(set GR32orGR64:$dst,
                    (extractelt (bc_v4i32 (v4f32 VR128:$src1)), imm:$src2))],
                    itins.rr>, Sched<[WriteFBlend]>;
  let SchedRW = [WriteFBlendLd, WriteRMW] in
  def mr : SS4AIi8<opc, MRMDestMem, (outs),
                 (ins f32mem:$dst, VR128:$src1, u8imm:$src2),
                 !strconcat(OpcodeStr,
                  "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
                 [(store (extractelt (bc_v4i32 (v4f32 VR128:$src1)), imm:$src2),
                          addr:$dst)], itins.rm>;
}
6238
// VEX and legacy encodings of EXTRACTPS (opcode 0x17), single-FP domain.
let ExeDomain = SSEPackedSingle in {
  let Predicates = [UseAVX] in
    defm VEXTRACTPS : SS41I_extractf32<0x17, "vextractps">, VEX;
  defm EXTRACTPS   : SS41I_extractf32<0x17, "extractps", SSE_EXTRACT_ITINS>;
}
6244
// Also match an EXTRACTPS store when the store is done as f32 instead of i32:
// the lane is extracted as i32, bitconverted back to f32, and stored — the
// bytes written are identical either way.
def : Pat<(store (f32 (bitconvert (extractelt (bc_v4i32 (v4f32 VR128:$src1)),
                                              imm:$src2))),
                 addr:$dst),
          (VEXTRACTPSmr addr:$dst, VR128:$src1, imm:$src2)>,
          Requires<[HasAVX]>;
def : Pat<(store (f32 (bitconvert (extractelt (bc_v4i32 (v4f32 VR128:$src1)),
                                              imm:$src2))),
                 addr:$dst),
          (EXTRACTPSmr addr:$dst, VR128:$src1, imm:$src2)>,
          Requires<[UseSSE41]>;
6256
6257//===----------------------------------------------------------------------===//
6258// SSE4.1 - Insert Instructions
6259//===----------------------------------------------------------------------===//
6260
/// SS41I_insert8 - SSE 4.1 insert a byte from a GPR or from memory into a
/// v16i8 lane (PINSRB).  Is2Addr selects the two-operand legacy asm string
/// versus the three-operand VEX one.
multiclass SS41I_insert8<bits<8> opc, string asm, bit Is2Addr = 1> {
  def rr : SS4AIi8<opc, MRMSrcReg, (outs VR128:$dst),
      (ins VR128:$src1, GR32orGR64:$src2, u8imm:$src3),
      !if(Is2Addr,
        !strconcat(asm, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
        !strconcat(asm,
                   "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
      [(set VR128:$dst,
        (X86pinsrb VR128:$src1, GR32orGR64:$src2, imm:$src3))]>,
      Sched<[WriteShuffle]>;
  // Memory form folds an any-extending i8 load as the inserted element.
  def rm : SS4AIi8<opc, MRMSrcMem, (outs VR128:$dst),
      (ins VR128:$src1, i8mem:$src2, u8imm:$src3),
      !if(Is2Addr,
        !strconcat(asm, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
        !strconcat(asm,
                   "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
      [(set VR128:$dst,
        (X86pinsrb VR128:$src1, (extloadi8 addr:$src2),
                   imm:$src3))]>, Sched<[WriteShuffleLd, ReadAfterLd]>;
}
6281
// VEX (non-destructive 3-operand) and legacy (tied $src1 = $dst) PINSRB.
let Predicates = [HasAVX] in
  defm VPINSRB : SS41I_insert8<0x20, "vpinsrb", 0>, VEX_4V;
let Constraints = "$src1 = $dst" in
  defm PINSRB  : SS41I_insert8<0x20, "pinsrb">;
6286
/// SS41I_insert32 - SSE 4.1 insert a 32-bit GPR or i32 load into a v4i32
/// lane (PINSRD), matched via generic insertelt.
multiclass SS41I_insert32<bits<8> opc, string asm, bit Is2Addr = 1> {
  def rr : SS4AIi8<opc, MRMSrcReg, (outs VR128:$dst),
      (ins VR128:$src1, GR32:$src2, u8imm:$src3),
      !if(Is2Addr,
        !strconcat(asm, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
        !strconcat(asm,
                   "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
      [(set VR128:$dst,
        (v4i32 (insertelt VR128:$src1, GR32:$src2, imm:$src3)))]>,
      Sched<[WriteShuffle]>;
  def rm : SS4AIi8<opc, MRMSrcMem, (outs VR128:$dst),
      (ins VR128:$src1, i32mem:$src2, u8imm:$src3),
      !if(Is2Addr,
        !strconcat(asm, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
        !strconcat(asm,
                   "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
      [(set VR128:$dst,
        (v4i32 (insertelt VR128:$src1, (loadi32 addr:$src2),
                          imm:$src3)))]>, Sched<[WriteShuffleLd, ReadAfterLd]>;
}
6307
// VEX (3-operand) and legacy (tied) PINSRD.
let Predicates = [HasAVX] in
  defm VPINSRD : SS41I_insert32<0x22, "vpinsrd", 0>, VEX_4V;
let Constraints = "$src1 = $dst" in
  defm PINSRD : SS41I_insert32<0x22, "pinsrd">;
6312
/// SS41I_insert64 - SSE 4.1 insert a 64-bit GPR or i64 load into a v2i64
/// lane (PINSRQ), matched via generic insertelt.  REX.W/VEX.W is added at
/// the defm sites.
multiclass SS41I_insert64<bits<8> opc, string asm, bit Is2Addr = 1> {
  def rr : SS4AIi8<opc, MRMSrcReg, (outs VR128:$dst),
      (ins VR128:$src1, GR64:$src2, u8imm:$src3),
      !if(Is2Addr,
        !strconcat(asm, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
        !strconcat(asm,
                   "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
      [(set VR128:$dst,
        (v2i64 (insertelt VR128:$src1, GR64:$src2, imm:$src3)))]>,
      Sched<[WriteShuffle]>;
  def rm : SS4AIi8<opc, MRMSrcMem, (outs VR128:$dst),
      (ins VR128:$src1, i64mem:$src2, u8imm:$src3),
      !if(Is2Addr,
        !strconcat(asm, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
        !strconcat(asm,
                   "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
      [(set VR128:$dst,
        (v2i64 (insertelt VR128:$src1, (loadi64 addr:$src2),
                          imm:$src3)))]>, Sched<[WriteShuffleLd, ReadAfterLd]>;
}
6333
// PINSRQ shares opcode 0x22 with PINSRD; REX.W/VEX.W selects the 64-bit form.
let Predicates = [HasAVX] in
  defm VPINSRQ : SS41I_insert64<0x22, "vpinsrq", 0>, VEX_4V, VEX_W;
let Constraints = "$src1 = $dst" in
  defm PINSRQ : SS41I_insert64<0x22, "pinsrq">, REX_W;
6338
// insertps has a few different modes, there's the first two here below which
// are optimized inserts that won't zero arbitrary elements in the destination
// vector. The next one matches the intrinsic and could zero arbitrary elements
// in the target vector.
/// SS41I_insertf32 - INSERTPS: insert a float element (from a register lane
/// or a scalar load) under control of the $src3 immediate, matched through
/// the X86insertps node.
multiclass SS41I_insertf32<bits<8> opc, string asm, bit Is2Addr = 1,
                           OpndItins itins = DEFAULT_ITINS> {
  def rr : SS4AIi8<opc, MRMSrcReg, (outs VR128:$dst),
      (ins VR128:$src1, VR128:$src2, u8imm:$src3),
      !if(Is2Addr,
        !strconcat(asm, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
        !strconcat(asm,
                   "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
      [(set VR128:$dst,
        (X86insertps VR128:$src1, VR128:$src2, imm:$src3))], itins.rr>,
      Sched<[WriteFShuffle]>;
  // Memory form folds a scalar f32 load as the source element.
  def rm : SS4AIi8<opc, MRMSrcMem, (outs VR128:$dst),
      (ins VR128:$src1, f32mem:$src2, u8imm:$src3),
      !if(Is2Addr,
        !strconcat(asm, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
        !strconcat(asm,
                   "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
      [(set VR128:$dst,
        (X86insertps VR128:$src1,
                   (v4f32 (scalar_to_vector (loadf32 addr:$src2))),
                    imm:$src3))], itins.rm>,
      Sched<[WriteFShuffleLd, ReadAfterLd]>;
}
6366
// VEX and legacy encodings of INSERTPS (opcode 0x21), single-FP domain.
let ExeDomain = SSEPackedSingle in {
  let Predicates = [UseAVX] in
    defm VINSERTPS : SS41I_insertf32<0x21, "vinsertps", 0>, VEX_4V;
  let Constraints = "$src1 = $dst" in
    defm INSERTPS : SS41I_insertf32<0x21, "insertps", 1, SSE_INSERT_ITINS>;
}
6373
let Predicates = [UseSSE41] in {
  // If we're inserting an element from a load or a null pshuf of a load,
  // fold the load into the insertps instruction.  A PSHUFD with immediate 0
  // replicates element 0, so insertps reading any lane of it still sees the
  // loaded scalar.
  def : Pat<(v4f32 (X86insertps (v4f32 VR128:$src1), (X86PShufd (v4f32
                       (scalar_to_vector (loadf32 addr:$src2))), (i8 0)),
                   imm:$src3)),
            (INSERTPSrm VR128:$src1, addr:$src2, imm:$src3)>;
  def : Pat<(v4f32 (X86insertps (v4f32 VR128:$src1), (X86PShufd
                      (loadv4f32 addr:$src2), (i8 0)), imm:$src3)),
            (INSERTPSrm VR128:$src1, addr:$src2, imm:$src3)>;
}
6385
let Predicates = [UseAVX] in {
  // If we're inserting an element from a vbroadcast of a load, fold the
  // load into the X86insertps instruction: every broadcast lane holds the
  // loaded scalar, so the memory form is equivalent.
  def : Pat<(v4f32 (X86insertps (v4f32 VR128:$src1),
                (X86VBroadcast (loadf32 addr:$src2)), imm:$src3)),
            (VINSERTPSrm VR128:$src1, addr:$src2, imm:$src3)>;
  def : Pat<(v4f32 (X86insertps (v4f32 VR128:$src1),
                (X86VBroadcast (loadv4f32 addr:$src2)), imm:$src3)),
            (VINSERTPSrm VR128:$src1, addr:$src2, imm:$src3)>;
}
6396
6397//===----------------------------------------------------------------------===//
6398// SSE4.1 - Round Instructions
6399//===----------------------------------------------------------------------===//
6400
/// sse41_fp_unop_rm - SSE 4.1 packed FP round (ROUNDPS/ROUNDPD and the AVX
/// VROUNDPS/VROUNDPD forms).  RC/x86memop pick 128- or 256-bit operands,
/// mem_frag32/mem_frag64 the matching load fragments, and
/// V4F32Int/V2F64Int the per-precision intrinsics.  $src2 is the
/// rounding-control immediate byte.
multiclass sse41_fp_unop_rm<bits<8> opcps, bits<8> opcpd, string OpcodeStr,
                            X86MemOperand x86memop, RegisterClass RC,
                            PatFrag mem_frag32, PatFrag mem_frag64,
                            Intrinsic V4F32Int, Intrinsic V2F64Int> {
let ExeDomain = SSEPackedSingle in {
  // Vector intrinsic operation, reg
  def PSr : SS4AIi8<opcps, MRMSrcReg,
                    (outs RC:$dst), (ins RC:$src1, i32u8imm:$src2),
                    !strconcat(OpcodeStr,
                    "ps\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
                    [(set RC:$dst, (V4F32Int RC:$src1, imm:$src2))],
                    IIC_SSE_ROUNDPS_REG>, Sched<[WriteFAdd]>;

  // Vector intrinsic operation, mem
  def PSm : SS4AIi8<opcps, MRMSrcMem,
                    (outs RC:$dst), (ins x86memop:$src1, i32u8imm:$src2),
                    !strconcat(OpcodeStr,
                    "ps\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
                    [(set RC:$dst,
                          (V4F32Int (mem_frag32 addr:$src1),imm:$src2))],
                          IIC_SSE_ROUNDPS_MEM>, Sched<[WriteFAddLd]>;
} // ExeDomain = SSEPackedSingle

let ExeDomain = SSEPackedDouble in {
  // Vector intrinsic operation, reg
  def PDr : SS4AIi8<opcpd, MRMSrcReg,
                    (outs RC:$dst), (ins RC:$src1, i32u8imm:$src2),
                    !strconcat(OpcodeStr,
                    "pd\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
                    [(set RC:$dst, (V2F64Int RC:$src1, imm:$src2))],
                    IIC_SSE_ROUNDPS_REG>, Sched<[WriteFAdd]>;

  // Vector intrinsic operation, mem.
  // Use the memory-form itinerary here; the previous IIC_SSE_ROUNDPS_REG
  // was inconsistent with the PSm form above.
  def PDm : SS4AIi8<opcpd, MRMSrcMem,
                    (outs RC:$dst), (ins x86memop:$src1, i32u8imm:$src2),
                    !strconcat(OpcodeStr,
                    "pd\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
                    [(set RC:$dst,
                          (V2F64Int (mem_frag64 addr:$src1),imm:$src2))],
                          IIC_SSE_ROUNDPS_MEM>, Sched<[WriteFAddLd]>;
} // ExeDomain = SSEPackedDouble
}
6444
/// sse41_fp_binop_rm - SSE 4.1 scalar FP round (ROUNDSS/ROUNDSD).  For each
/// precision it emits a pattern-less FR32/FR64 form (selected by the
/// ffloor/fceil/... Pats elsewhere in this file), an intrinsic register
/// form on VR128, and an intrinsic memory form.  $src3 is the
/// rounding-control immediate; Is2Addr selects legacy vs. VEX asm strings.
multiclass sse41_fp_binop_rm<bits<8> opcss, bits<8> opcsd,
                            string OpcodeStr,
                            Intrinsic F32Int,
                            Intrinsic F64Int, bit Is2Addr = 1> {
let ExeDomain = GenericDomain in {
  // Operation, reg.
  let hasSideEffects = 0 in
  def SSr : SS4AIi8<opcss, MRMSrcReg,
      (outs FR32:$dst), (ins FR32:$src1, FR32:$src2, i32u8imm:$src3),
      !if(Is2Addr,
          !strconcat(OpcodeStr,
              "ss\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
          !strconcat(OpcodeStr,
              "ss\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
      []>, Sched<[WriteFAdd]>;

  // Intrinsic operation, reg.
  let isCodeGenOnly = 1 in
  def SSr_Int : SS4AIi8<opcss, MRMSrcReg,
        (outs VR128:$dst), (ins VR128:$src1, VR128:$src2, i32u8imm:$src3),
        !if(Is2Addr,
            !strconcat(OpcodeStr,
                "ss\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
            !strconcat(OpcodeStr,
                "ss\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
        [(set VR128:$dst, (F32Int VR128:$src1, VR128:$src2, imm:$src3))]>,
        Sched<[WriteFAdd]>;

  // Intrinsic operation, mem.
  def SSm : SS4AIi8<opcss, MRMSrcMem,
        (outs VR128:$dst), (ins VR128:$src1, ssmem:$src2, i32u8imm:$src3),
        !if(Is2Addr,
            !strconcat(OpcodeStr,
                "ss\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
            !strconcat(OpcodeStr,
                "ss\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
        [(set VR128:$dst,
             (F32Int VR128:$src1, sse_load_f32:$src2, imm:$src3))]>,
        Sched<[WriteFAddLd, ReadAfterLd]>;

  // Operation, reg.
  let hasSideEffects = 0 in
  def SDr : SS4AIi8<opcsd, MRMSrcReg,
        (outs FR64:$dst), (ins FR64:$src1, FR64:$src2, i32u8imm:$src3),
        !if(Is2Addr,
            !strconcat(OpcodeStr,
                "sd\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
            !strconcat(OpcodeStr,
                "sd\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
        []>, Sched<[WriteFAdd]>;

  // Intrinsic operation, reg.
  let isCodeGenOnly = 1 in
  def SDr_Int : SS4AIi8<opcsd, MRMSrcReg,
        (outs VR128:$dst), (ins VR128:$src1, VR128:$src2, i32u8imm:$src3),
        !if(Is2Addr,
            !strconcat(OpcodeStr,
                "sd\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
            !strconcat(OpcodeStr,
                "sd\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
        [(set VR128:$dst, (F64Int VR128:$src1, VR128:$src2, imm:$src3))]>,
        Sched<[WriteFAdd]>;

  // Intrinsic operation, mem.
  def SDm : SS4AIi8<opcsd, MRMSrcMem,
        (outs VR128:$dst), (ins VR128:$src1, sdmem:$src2, i32u8imm:$src3),
        !if(Is2Addr,
            !strconcat(OpcodeStr,
                "sd\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
            !strconcat(OpcodeStr,
                "sd\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
        [(set VR128:$dst,
              (F64Int VR128:$src1, sse_load_f64:$src2, imm:$src3))]>,
        Sched<[WriteFAddLd, ReadAfterLd]>;
} // ExeDomain = GenericDomain
}
6521
// FP round - roundss, roundps, roundsd, roundpd
// Note: the VROUND defm name is instantiated from two different multiclasses;
// the generated defs differ in their suffixes (PSr/PDm vs. SSr/SDm), so the
// names do not collide.
let Predicates = [HasAVX] in {
  // Intrinsic form
  defm VROUND  : sse41_fp_unop_rm<0x08, 0x09, "vround", f128mem, VR128,
                                  loadv4f32, loadv2f64,
                                  int_x86_sse41_round_ps,
                                  int_x86_sse41_round_pd>, VEX;
  defm VROUNDY : sse41_fp_unop_rm<0x08, 0x09, "vround", f256mem, VR256,
                                  loadv8f32, loadv4f64,
                                  int_x86_avx_round_ps_256,
                                  int_x86_avx_round_pd_256>, VEX, VEX_L;
  defm VROUND  : sse41_fp_binop_rm<0x0A, 0x0B, "vround",
                                  int_x86_sse41_round_ss,
                                  int_x86_sse41_round_sd, 0>, VEX_4V, VEX_LIG;
}
6537
// Lower scalar rounding nodes to VROUNDSS/VROUNDSD.  Immediate encodings:
// 0x1 = round down (ffloor), 0x2 = round up (fceil), 0x3 = truncate
// (ftrunc), 0x4 = current MXCSR mode (frint), 0xC = current mode with
// precision exceptions suppressed (fnearbyint).  The first operand is
// undefined (IMPLICIT_DEF) since only the low element matters.
let Predicates = [UseAVX] in {
  def : Pat<(ffloor FR32:$src),
            (VROUNDSSr (f32 (IMPLICIT_DEF)), FR32:$src, (i32 0x1))>;
  def : Pat<(f64 (ffloor FR64:$src)),
            (VROUNDSDr (f64 (IMPLICIT_DEF)), FR64:$src, (i32 0x1))>;
  def : Pat<(f32 (fnearbyint FR32:$src)),
            (VROUNDSSr (f32 (IMPLICIT_DEF)), FR32:$src, (i32 0xC))>;
  def : Pat<(f64 (fnearbyint FR64:$src)),
            (VROUNDSDr (f64 (IMPLICIT_DEF)), FR64:$src, (i32 0xC))>;
  def : Pat<(f32 (fceil FR32:$src)),
            (VROUNDSSr (f32 (IMPLICIT_DEF)), FR32:$src, (i32 0x2))>;
  def : Pat<(f64 (fceil FR64:$src)),
            (VROUNDSDr (f64 (IMPLICIT_DEF)), FR64:$src, (i32 0x2))>;
  def : Pat<(f32 (frint FR32:$src)),
            (VROUNDSSr (f32 (IMPLICIT_DEF)), FR32:$src, (i32 0x4))>;
  def : Pat<(f64 (frint FR64:$src)),
            (VROUNDSDr (f64 (IMPLICIT_DEF)), FR64:$src, (i32 0x4))>;
  def : Pat<(f32 (ftrunc FR32:$src)),
            (VROUNDSSr (f32 (IMPLICIT_DEF)), FR32:$src, (i32 0x3))>;
  def : Pat<(f64 (ftrunc FR64:$src)),
            (VROUNDSDr (f64 (IMPLICIT_DEF)), FR64:$src, (i32 0x3))>;
}
6560
// Lower packed rounding nodes to VROUNDP[SD]/VROUNDYP[SD].  Same immediate
// encoding as the scalar patterns: 0x1 floor, 0x2 ceil, 0x3 trunc,
// 0x4 current mode (frint), 0xC current mode + suppressed precision
// exceptions (fnearbyint).
let Predicates = [HasAVX] in {
  def : Pat<(v4f32 (ffloor VR128:$src)),
            (VROUNDPSr VR128:$src, (i32 0x1))>;
  def : Pat<(v4f32 (fnearbyint VR128:$src)),
            (VROUNDPSr VR128:$src, (i32 0xC))>;
  def : Pat<(v4f32 (fceil VR128:$src)),
            (VROUNDPSr VR128:$src, (i32 0x2))>;
  def : Pat<(v4f32 (frint VR128:$src)),
            (VROUNDPSr VR128:$src, (i32 0x4))>;
  def : Pat<(v4f32 (ftrunc VR128:$src)),
            (VROUNDPSr VR128:$src, (i32 0x3))>;

  def : Pat<(v2f64 (ffloor VR128:$src)),
            (VROUNDPDr VR128:$src, (i32 0x1))>;
  def : Pat<(v2f64 (fnearbyint VR128:$src)),
            (VROUNDPDr VR128:$src, (i32 0xC))>;
  def : Pat<(v2f64 (fceil VR128:$src)),
            (VROUNDPDr VR128:$src, (i32 0x2))>;
  def : Pat<(v2f64 (frint VR128:$src)),
            (VROUNDPDr VR128:$src, (i32 0x4))>;
  def : Pat<(v2f64 (ftrunc VR128:$src)),
            (VROUNDPDr VR128:$src, (i32 0x3))>;

  def : Pat<(v8f32 (ffloor VR256:$src)),
            (VROUNDYPSr VR256:$src, (i32 0x1))>;
  def : Pat<(v8f32 (fnearbyint VR256:$src)),
            (VROUNDYPSr VR256:$src, (i32 0xC))>;
  def : Pat<(v8f32 (fceil VR256:$src)),
            (VROUNDYPSr VR256:$src, (i32 0x2))>;
  def : Pat<(v8f32 (frint VR256:$src)),
            (VROUNDYPSr VR256:$src, (i32 0x4))>;
  def : Pat<(v8f32 (ftrunc VR256:$src)),
            (VROUNDYPSr VR256:$src, (i32 0x3))>;

  def : Pat<(v4f64 (ffloor VR256:$src)),
            (VROUNDYPDr VR256:$src, (i32 0x1))>;
  def : Pat<(v4f64 (fnearbyint VR256:$src)),
            (VROUNDYPDr VR256:$src, (i32 0xC))>;
  def : Pat<(v4f64 (fceil VR256:$src)),
            (VROUNDYPDr VR256:$src, (i32 0x2))>;
  def : Pat<(v4f64 (frint VR256:$src)),
            (VROUNDYPDr VR256:$src, (i32 0x4))>;
  def : Pat<(v4f64 (ftrunc VR256:$src)),
            (VROUNDYPDr VR256:$src, (i32 0x3))>;
}
6606
// Legacy-encoded ROUNDPS/ROUNDPD and ROUNDSS/ROUNDSD (the scalar forms are
// two-address, hence the tied constraint).
defm ROUND  : sse41_fp_unop_rm<0x08, 0x09, "round", f128mem, VR128,
                               memopv4f32, memopv2f64,
                               int_x86_sse41_round_ps, int_x86_sse41_round_pd>;
let Constraints = "$src1 = $dst" in
defm ROUND  : sse41_fp_binop_rm<0x0A, 0x0B, "round",
                               int_x86_sse41_round_ss, int_x86_sse41_round_sd>;
6613
// SSE4.1 (non-VEX) lowering of scalar and packed rounding nodes.  Same
// immediate encoding as the AVX patterns: 0x1 floor, 0x2 ceil, 0x3 trunc,
// 0x4 current mode (frint), 0xC current mode + suppressed precision
// exceptions (fnearbyint).
let Predicates = [UseSSE41] in {
  def : Pat<(ffloor FR32:$src),
            (ROUNDSSr (f32 (IMPLICIT_DEF)), FR32:$src, (i32 0x1))>;
  def : Pat<(f64 (ffloor FR64:$src)),
            (ROUNDSDr (f64 (IMPLICIT_DEF)), FR64:$src, (i32 0x1))>;
  def : Pat<(f32 (fnearbyint FR32:$src)),
            (ROUNDSSr (f32 (IMPLICIT_DEF)), FR32:$src, (i32 0xC))>;
  def : Pat<(f64 (fnearbyint FR64:$src)),
            (ROUNDSDr (f64 (IMPLICIT_DEF)), FR64:$src, (i32 0xC))>;
  def : Pat<(f32 (fceil FR32:$src)),
            (ROUNDSSr (f32 (IMPLICIT_DEF)), FR32:$src, (i32 0x2))>;
  def : Pat<(f64 (fceil FR64:$src)),
            (ROUNDSDr (f64 (IMPLICIT_DEF)), FR64:$src, (i32 0x2))>;
  def : Pat<(f32 (frint FR32:$src)),
            (ROUNDSSr (f32 (IMPLICIT_DEF)), FR32:$src, (i32 0x4))>;
  def : Pat<(f64 (frint FR64:$src)),
            (ROUNDSDr (f64 (IMPLICIT_DEF)), FR64:$src, (i32 0x4))>;
  def : Pat<(f32 (ftrunc FR32:$src)),
            (ROUNDSSr (f32 (IMPLICIT_DEF)), FR32:$src, (i32 0x3))>;
  def : Pat<(f64 (ftrunc FR64:$src)),
            (ROUNDSDr (f64 (IMPLICIT_DEF)), FR64:$src, (i32 0x3))>;

  def : Pat<(v4f32 (ffloor VR128:$src)),
            (ROUNDPSr VR128:$src, (i32 0x1))>;
  def : Pat<(v4f32 (fnearbyint VR128:$src)),
            (ROUNDPSr VR128:$src, (i32 0xC))>;
  def : Pat<(v4f32 (fceil VR128:$src)),
            (ROUNDPSr VR128:$src, (i32 0x2))>;
  def : Pat<(v4f32 (frint VR128:$src)),
            (ROUNDPSr VR128:$src, (i32 0x4))>;
  def : Pat<(v4f32 (ftrunc VR128:$src)),
            (ROUNDPSr VR128:$src, (i32 0x3))>;

  def : Pat<(v2f64 (ffloor VR128:$src)),
            (ROUNDPDr VR128:$src, (i32 0x1))>;
  def : Pat<(v2f64 (fnearbyint VR128:$src)),
            (ROUNDPDr VR128:$src, (i32 0xC))>;
  def : Pat<(v2f64 (fceil VR128:$src)),
            (ROUNDPDr VR128:$src, (i32 0x2))>;
  def : Pat<(v2f64 (frint VR128:$src)),
            (ROUNDPDr VR128:$src, (i32 0x4))>;
  def : Pat<(v2f64 (ftrunc VR128:$src)),
            (ROUNDPDr VR128:$src, (i32 0x3))>;
}
6658
6659//===----------------------------------------------------------------------===//
6660// SSE4.1 - Packed Bit Test
6661//===----------------------------------------------------------------------===//
6662
// ptest instruction we'll lower to this in X86ISelLowering primarily from
// the intel intrinsic that corresponds to this.
// VEX-encoded PTEST: sets EFLAGS from (src1 AND src2) / (NOT src1 AND src2);
// 128-bit and 256-bit register and load-folded forms.
let Defs = [EFLAGS], Predicates = [HasAVX] in {
def VPTESTrr  : SS48I<0x17, MRMSrcReg, (outs), (ins VR128:$src1, VR128:$src2),
                "vptest\t{$src2, $src1|$src1, $src2}",
                [(set EFLAGS, (X86ptest VR128:$src1, (v2i64 VR128:$src2)))]>,
                Sched<[WriteVecLogic]>, VEX;
def VPTESTrm  : SS48I<0x17, MRMSrcMem, (outs), (ins VR128:$src1, f128mem:$src2),
                "vptest\t{$src2, $src1|$src1, $src2}",
                [(set EFLAGS,(X86ptest VR128:$src1, (loadv2i64 addr:$src2)))]>,
                Sched<[WriteVecLogicLd, ReadAfterLd]>, VEX;

def VPTESTYrr : SS48I<0x17, MRMSrcReg, (outs), (ins VR256:$src1, VR256:$src2),
                "vptest\t{$src2, $src1|$src1, $src2}",
                [(set EFLAGS, (X86ptest VR256:$src1, (v4i64 VR256:$src2)))]>,
                Sched<[WriteVecLogic]>, VEX, VEX_L;
def VPTESTYrm : SS48I<0x17, MRMSrcMem, (outs), (ins VR256:$src1, i256mem:$src2),
                "vptest\t{$src2, $src1|$src1, $src2}",
                [(set EFLAGS,(X86ptest VR256:$src1, (loadv4i64 addr:$src2)))]>,
                Sched<[WriteVecLogicLd, ReadAfterLd]>, VEX, VEX_L;
}
6684
// Legacy-encoded PTEST (128-bit only).
let Defs = [EFLAGS] in {
def PTESTrr : SS48I<0x17, MRMSrcReg, (outs), (ins VR128:$src1, VR128:$src2),
              "ptest\t{$src2, $src1|$src1, $src2}",
              [(set EFLAGS, (X86ptest VR128:$src1, (v2i64 VR128:$src2)))]>,
              Sched<[WriteVecLogic]>;
def PTESTrm : SS48I<0x17, MRMSrcMem, (outs), (ins VR128:$src1, f128mem:$src2),
              "ptest\t{$src2, $src1|$src1, $src2}",
              [(set EFLAGS, (X86ptest VR128:$src1, (memopv2i64 addr:$src2)))]>,
              Sched<[WriteVecLogicLd, ReadAfterLd]>;
}
6695
// The bit test instructions below are AVX only
/// avx_bittest - VTESTPS/VTESTPD: EFLAGS-setting FP sign-bit tests, with
/// register and load-folded forms, matched through the X86testp node.
multiclass avx_bittest<bits<8> opc, string OpcodeStr, RegisterClass RC,
                       X86MemOperand x86memop, PatFrag mem_frag, ValueType vt> {
  def rr : SS48I<opc, MRMSrcReg, (outs), (ins RC:$src1, RC:$src2),
            !strconcat(OpcodeStr, "\t{$src2, $src1|$src1, $src2}"),
            [(set EFLAGS, (X86testp RC:$src1, (vt RC:$src2)))]>,
            Sched<[WriteVecLogic]>, VEX;
  def rm : SS48I<opc, MRMSrcMem, (outs), (ins RC:$src1, x86memop:$src2),
            !strconcat(OpcodeStr, "\t{$src2, $src1|$src1, $src2}"),
            [(set EFLAGS, (X86testp RC:$src1, (mem_frag addr:$src2)))]>,
            Sched<[WriteVecLogicLd, ReadAfterLd]>, VEX;
}
6708
// VTESTPS (opcode 0x0E) and VTESTPD (0x0F), 128- and 256-bit forms, placed
// in their respective FP execution domains.
let Defs = [EFLAGS], Predicates = [HasAVX] in {
let ExeDomain = SSEPackedSingle in {
defm VTESTPS  : avx_bittest<0x0E, "vtestps", VR128, f128mem, loadv4f32, v4f32>;
defm VTESTPSY : avx_bittest<0x0E, "vtestps", VR256, f256mem, loadv8f32, v8f32>,
                            VEX_L;
}
let ExeDomain = SSEPackedDouble in {
defm VTESTPD  : avx_bittest<0x0F, "vtestpd", VR128, f128mem, loadv2f64, v2f64>;
defm VTESTPDY : avx_bittest<0x0F, "vtestpd", VR256, f256mem, loadv4f64, v4f64>,
                            VEX_L;
}
}
6721
6722//===----------------------------------------------------------------------===//
6723// SSE4.1 - Misc Instructions
6724//===----------------------------------------------------------------------===//
6725
// POPCNT (ctpop) in 16/32/64-bit register and load-folded forms.  All forms
// clobber EFLAGS (implicit EFLAGS in the patterns).
// NOTE(review): scheduled as WriteFAdd/WriteFAddLd — presumably a stand-in
// for lack of a dedicated POPCNT scheduling class; confirm against the
// scheduling model before relying on it.
let Defs = [EFLAGS], Predicates = [HasPOPCNT] in {
  def POPCNT16rr : I<0xB8, MRMSrcReg, (outs GR16:$dst), (ins GR16:$src),
                     "popcnt{w}\t{$src, $dst|$dst, $src}",
                     [(set GR16:$dst, (ctpop GR16:$src)), (implicit EFLAGS)],
                     IIC_SSE_POPCNT_RR>, Sched<[WriteFAdd]>,
                     OpSize16, XS;
  def POPCNT16rm : I<0xB8, MRMSrcMem, (outs GR16:$dst), (ins i16mem:$src),
                     "popcnt{w}\t{$src, $dst|$dst, $src}",
                     [(set GR16:$dst, (ctpop (loadi16 addr:$src))),
                      (implicit EFLAGS)], IIC_SSE_POPCNT_RM>,
                      Sched<[WriteFAddLd]>, OpSize16, XS;

  def POPCNT32rr : I<0xB8, MRMSrcReg, (outs GR32:$dst), (ins GR32:$src),
                     "popcnt{l}\t{$src, $dst|$dst, $src}",
                     [(set GR32:$dst, (ctpop GR32:$src)), (implicit EFLAGS)],
                     IIC_SSE_POPCNT_RR>, Sched<[WriteFAdd]>,
                     OpSize32, XS;

  def POPCNT32rm : I<0xB8, MRMSrcMem, (outs GR32:$dst), (ins i32mem:$src),
                     "popcnt{l}\t{$src, $dst|$dst, $src}",
                     [(set GR32:$dst, (ctpop (loadi32 addr:$src))),
                      (implicit EFLAGS)], IIC_SSE_POPCNT_RM>,
                      Sched<[WriteFAddLd]>, OpSize32, XS;

  def POPCNT64rr : RI<0xB8, MRMSrcReg, (outs GR64:$dst), (ins GR64:$src),
                      "popcnt{q}\t{$src, $dst|$dst, $src}",
                      [(set GR64:$dst, (ctpop GR64:$src)), (implicit EFLAGS)],
                      IIC_SSE_POPCNT_RR>, Sched<[WriteFAdd]>, XS;
  def POPCNT64rm : RI<0xB8, MRMSrcMem, (outs GR64:$dst), (ins i64mem:$src),
                      "popcnt{q}\t{$src, $dst|$dst, $src}",
                      [(set GR64:$dst, (ctpop (loadi64 addr:$src))),
                       (implicit EFLAGS)], IIC_SSE_POPCNT_RM>,
                       Sched<[WriteFAddLd]>, XS;
}
6760
6761
6762
// SS41I_unop_rm_int_v16 - SSE 4.1 unary operator whose type is v8i16:
// a VR128 -> VR128 intrinsic with a register and a load-folded form.
// The Sched parameter supplies both the register write and (via .Folded)
// the folded-load write.
multiclass SS41I_unop_rm_int_v16<bits<8> opc, string OpcodeStr,
                                 Intrinsic IntId128, PatFrag ld_frag,
                                 X86FoldableSchedWrite Sched> {
  def rr128 : SS48I<opc, MRMSrcReg, (outs VR128:$dst),
                    (ins VR128:$src),
                    !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
                    [(set VR128:$dst, (IntId128 VR128:$src))]>,
                    Sched<[Sched]>;
  def rm128 : SS48I<opc, MRMSrcMem, (outs VR128:$dst),
                     (ins i128mem:$src),
                     !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
                     [(set VR128:$dst,
                       (IntId128 (bitconvert (ld_frag addr:$src))))]>,
                    Sched<[Sched.Folded]>;
}
6779
// PHMIN has the same profile as PSAD, thus we use the same scheduling
// model, although the naming is misleading.
// VEX and legacy PHMINPOSUW (opcode 0x41).
let Predicates = [HasAVX] in
defm VPHMINPOSUW : SS41I_unop_rm_int_v16 <0x41, "vphminposuw",
                                         int_x86_sse41_phminposuw, loadv2i64,
                                         WriteVecIMul>, VEX;
defm PHMINPOSUW : SS41I_unop_rm_int_v16 <0x41, "phminposuw",
                                         int_x86_sse41_phminposuw, memopv2i64,
                                         WriteVecIMul>;
6789
/// SS48I_binop_rm - Simple SSE41 binary operator.
/// Is2Addr selects between the two-address legacy SSE asm string and the
/// three-operand VEX asm string; itins supplies itineraries and the
/// scheduling class.
multiclass SS48I_binop_rm<bits<8> opc, string OpcodeStr, SDNode OpNode,
                          ValueType OpVT, RegisterClass RC, PatFrag memop_frag,
                          X86MemOperand x86memop, bit Is2Addr = 1,
                          OpndItins itins = SSE_INTALU_ITINS_P> {
  // Register form; commutable so the allocator may swap the sources.
  let isCommutable = 1 in
  def rr : SS48I<opc, MRMSrcReg, (outs RC:$dst),
       (ins RC:$src1, RC:$src2),
       !if(Is2Addr,
           !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
           !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
       [(set RC:$dst, (OpVT (OpNode RC:$src1, RC:$src2)))]>,
       Sched<[itins.Sched]>;
  // Memory form: second source folded from memory via memop_frag.
  def rm : SS48I<opc, MRMSrcMem, (outs RC:$dst),
       (ins RC:$src1, x86memop:$src2),
       !if(Is2Addr,
           !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
           !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
       [(set RC:$dst,
         (OpVT (OpNode RC:$src1, (bitconvert (memop_frag addr:$src2)))))]>,
       Sched<[itins.Sched.Folded, ReadAfterLd]>;
}
6812
/// SS48I_binop_rm2 - Simple SSE41 binary operator with different src and dst
/// types.
/// Used by widening ops such as PMULDQ (v4i32 sources, v2i64 result).
multiclass SS48I_binop_rm2<bits<8> opc, string OpcodeStr, SDNode OpNode,
                         ValueType DstVT, ValueType SrcVT, RegisterClass RC,
                         PatFrag memop_frag, X86MemOperand x86memop,
                         OpndItins itins,
                         bit IsCommutable = 0, bit Is2Addr = 1> {
  // Register form; commutability is caller-controlled.
  let isCommutable = IsCommutable in
  def rr : SS48I<opc, MRMSrcReg, (outs RC:$dst),
       (ins RC:$src1, RC:$src2),
       !if(Is2Addr,
           !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
           !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
       [(set RC:$dst, (DstVT (OpNode (SrcVT RC:$src1), RC:$src2)))]>,
       Sched<[itins.Sched]>;
  // Memory form: second source folded from memory via memop_frag.
  def rm : SS48I<opc, MRMSrcMem, (outs RC:$dst),
       (ins RC:$src1, x86memop:$src2),
       !if(Is2Addr,
           !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
           !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
       [(set RC:$dst, (DstVT (OpNode (SrcVT RC:$src1),
                                     (bitconvert (memop_frag addr:$src2)))))]>,
       Sched<[itins.Sched.Folded, ReadAfterLd]>;
}
6837
// 128-bit AVX forms of the SSE4.1 integer min/max ops and PMULDQ.
// All use loadv2i64 since VEX-encoded loads have no alignment requirement.
let Predicates = [HasAVX, NoVLX] in {
  // NOTE(review): this 'let' covers only the first defm; min/max are
  // commutative at the IR level -- confirm the override is intended.
  let isCommutable = 0 in
  defm VPMINSB   : SS48I_binop_rm<0x38, "vpminsb", X86smin, v16i8, VR128,
                                  loadv2i64, i128mem, 0, SSE_INTALU_ITINS_P>,
                                  VEX_4V;
  defm VPMINSD   : SS48I_binop_rm<0x39, "vpminsd", X86smin, v4i32, VR128,
                                  loadv2i64, i128mem, 0, SSE_INTALU_ITINS_P>,
                                  VEX_4V;
  defm VPMINUD   : SS48I_binop_rm<0x3B, "vpminud", X86umin, v4i32, VR128,
                                  loadv2i64, i128mem, 0, SSE_INTALU_ITINS_P>,
                                  VEX_4V;
  defm VPMINUW   : SS48I_binop_rm<0x3A, "vpminuw", X86umin, v8i16, VR128,
                                  loadv2i64, i128mem, 0, SSE_INTALU_ITINS_P>,
                                  VEX_4V;
  defm VPMAXSB   : SS48I_binop_rm<0x3C, "vpmaxsb", X86smax, v16i8, VR128,
                                  loadv2i64, i128mem, 0, SSE_INTALU_ITINS_P>,
                                  VEX_4V;
  defm VPMAXSD   : SS48I_binop_rm<0x3D, "vpmaxsd", X86smax, v4i32, VR128,
                                  loadv2i64, i128mem, 0, SSE_INTALU_ITINS_P>,
                                  VEX_4V;
  defm VPMAXUD   : SS48I_binop_rm<0x3F, "vpmaxud", X86umax, v4i32, VR128,
                                  loadv2i64, i128mem, 0, SSE_INTALU_ITINS_P>,
                                  VEX_4V;
  defm VPMAXUW   : SS48I_binop_rm<0x3E, "vpmaxuw", X86umax, v8i16, VR128,
                                  loadv2i64, i128mem, 0, SSE_INTALU_ITINS_P>,
                                  VEX_4V;
  // PMULDQ widens: v4i32 sources, v2i64 result; commutable, 3-operand VEX.
  defm VPMULDQ   : SS48I_binop_rm2<0x28, "vpmuldq", X86pmuldq, v2i64, v4i32,
                                   VR128, loadv2i64, i128mem,
                                   SSE_INTMUL_ITINS_P, 1, 0>, VEX_4V;
}
6868
// 256-bit AVX2 forms of the integer min/max ops and PMULDQ ('Y' suffix).
// Same opcodes as the 128-bit versions; VEX_L selects 256-bit operation.
let Predicates = [HasAVX2, NoVLX] in {
  // NOTE(review): as with the 128-bit group, this 'let' covers only the
  // first defm -- confirm the override is intended.
  let isCommutable = 0 in
  defm VPMINSBY  : SS48I_binop_rm<0x38, "vpminsb", X86smin, v32i8, VR256,
                                  loadv4i64, i256mem, 0, SSE_INTALU_ITINS_P>,
                                  VEX_4V, VEX_L;
  defm VPMINSDY  : SS48I_binop_rm<0x39, "vpminsd", X86smin, v8i32, VR256,
                                  loadv4i64, i256mem, 0, SSE_INTALU_ITINS_P>,
                                  VEX_4V, VEX_L;
  defm VPMINUDY  : SS48I_binop_rm<0x3B, "vpminud", X86umin, v8i32, VR256,
                                  loadv4i64, i256mem, 0, SSE_INTALU_ITINS_P>,
                                  VEX_4V, VEX_L;
  defm VPMINUWY  : SS48I_binop_rm<0x3A, "vpminuw", X86umin, v16i16, VR256,
                                  loadv4i64, i256mem, 0, SSE_INTALU_ITINS_P>,
                                  VEX_4V, VEX_L;
  defm VPMAXSBY  : SS48I_binop_rm<0x3C, "vpmaxsb", X86smax, v32i8, VR256,
                                  loadv4i64, i256mem, 0, SSE_INTALU_ITINS_P>,
                                  VEX_4V, VEX_L;
  defm VPMAXSDY  : SS48I_binop_rm<0x3D, "vpmaxsd", X86smax, v8i32, VR256,
                                  loadv4i64, i256mem, 0, SSE_INTALU_ITINS_P>,
                                  VEX_4V, VEX_L;
  defm VPMAXUDY  : SS48I_binop_rm<0x3F, "vpmaxud", X86umax, v8i32, VR256,
                                  loadv4i64, i256mem, 0, SSE_INTALU_ITINS_P>,
                                  VEX_4V, VEX_L;
  defm VPMAXUWY  : SS48I_binop_rm<0x3E, "vpmaxuw", X86umax, v16i16, VR256,
                                  loadv4i64, i256mem, 0, SSE_INTALU_ITINS_P>,
                                  VEX_4V, VEX_L;
  // 256-bit widening multiply: v8i32 sources, v4i64 result.
  defm VPMULDQY : SS48I_binop_rm2<0x28, "vpmuldq", X86pmuldq, v4i64, v8i32,
                                  VR256, loadv4i64, i256mem,
                                  SSE_INTMUL_ITINS_P, 1, 0>, VEX_4V, VEX_L;
}
6899
// Legacy SSE4.1 forms: two-address (destination tied to $src1) and using
// memopv2i64, which requires 16-byte-aligned memory operands.
let Constraints = "$src1 = $dst" in {
  // NOTE(review): this 'let' covers only the first defm -- confirm intended.
  let isCommutable = 0 in
  defm PMINSB   : SS48I_binop_rm<0x38, "pminsb", X86smin, v16i8, VR128,
                                 memopv2i64, i128mem, 1, SSE_INTALU_ITINS_P>;
  defm PMINSD   : SS48I_binop_rm<0x39, "pminsd", X86smin, v4i32, VR128,
                                 memopv2i64, i128mem, 1, SSE_INTALU_ITINS_P>;
  defm PMINUD   : SS48I_binop_rm<0x3B, "pminud", X86umin, v4i32, VR128,
                                 memopv2i64, i128mem, 1, SSE_INTALU_ITINS_P>;
  defm PMINUW   : SS48I_binop_rm<0x3A, "pminuw", X86umin, v8i16, VR128,
                                 memopv2i64, i128mem, 1, SSE_INTALU_ITINS_P>;
  defm PMAXSB   : SS48I_binop_rm<0x3C, "pmaxsb", X86smax, v16i8, VR128,
                                 memopv2i64, i128mem, 1, SSE_INTALU_ITINS_P>;
  defm PMAXSD   : SS48I_binop_rm<0x3D, "pmaxsd", X86smax, v4i32, VR128,
                                 memopv2i64, i128mem, 1, SSE_INTALU_ITINS_P>;
  defm PMAXUD   : SS48I_binop_rm<0x3F, "pmaxud", X86umax, v4i32, VR128,
                                 memopv2i64, i128mem, 1, SSE_INTALU_ITINS_P>;
  defm PMAXUW   : SS48I_binop_rm<0x3E, "pmaxuw", X86umax, v8i16, VR128,
                                 memopv2i64, i128mem, 1, SSE_INTALU_ITINS_P>;
  defm PMULDQ   : SS48I_binop_rm2<0x28, "pmuldq", X86pmuldq, v2i64, v4i32,
                                  VR128, memopv2i64, i128mem,
                                  SSE_INTMUL_ITINS_P, 1>;
}
6922
// 128-bit AVX forms of PMULLD and PCMPEQQ.
// Use loadv2i64, not memopv2i64: VEX-encoded loads have no alignment
// requirement, and memopv2i64 (alignment-checked) would wrongly block
// folding of unaligned loads. This also matches every other AVX defm in
// this file (VPMINSB..VPMULDQ above use loadv2i64).
let Predicates = [HasAVX, NoVLX] in {
  defm VPMULLD  : SS48I_binop_rm<0x40, "vpmulld", mul, v4i32, VR128,
                                 loadv2i64, i128mem, 0, SSE_PMULLD_ITINS>,
                                 VEX_4V;
  defm VPCMPEQQ : SS48I_binop_rm<0x29, "vpcmpeqq", X86pcmpeq, v2i64, VR128,
                                 loadv2i64, i128mem, 0, SSE_INTALU_ITINS_P>,
                                 VEX_4V;
}
// 256-bit AVX2 forms of PMULLD and PCMPEQQ; loadv4i64 imposes no
// alignment requirement on the folded memory operand.
let Predicates = [HasAVX2] in {
  defm VPMULLDY  : SS48I_binop_rm<0x40, "vpmulld", mul, v8i32, VR256,
                                  loadv4i64, i256mem, 0, SSE_PMULLD_ITINS>,
                                  VEX_4V, VEX_L;
  defm VPCMPEQQY : SS48I_binop_rm<0x29, "vpcmpeqq", X86pcmpeq, v4i64, VR256,
                                  loadv4i64, i256mem, 0, SSE_INTALU_ITINS_P>,
                                  VEX_4V, VEX_L;
}
6939
// Legacy SSE4.1 PMULLD/PCMPEQQ: two-address, alignment-checked loads.
let Constraints = "$src1 = $dst" in {
  defm PMULLD  : SS48I_binop_rm<0x40, "pmulld", mul, v4i32, VR128,
                                memopv2i64, i128mem, 1, SSE_PMULLD_ITINS>;
  // NOTE(review): PCMPEQQ uses SSE_INTALUQ_ITINS_P while the AVX form above
  // uses SSE_INTALU_ITINS_P -- verify which itinerary is intended.
  defm PCMPEQQ : SS48I_binop_rm<0x29, "pcmpeqq", X86pcmpeq, v2i64, VR128,
                                memopv2i64, i128mem, 1, SSE_INTALUQ_ITINS_P>;
}
6946
/// SS41I_binop_rmi_int - SSE 4.1 binary operator with 8-bit immediate
/// Intrinsic-based variant: the operation is expressed via IntId rather
/// than an SDNode. Is2Addr selects the two-address vs. three-operand
/// VEX asm string.
multiclass SS41I_binop_rmi_int<bits<8> opc, string OpcodeStr,
                 Intrinsic IntId, RegisterClass RC, PatFrag memop_frag,
                 X86MemOperand x86memop, bit Is2Addr = 1,
                 OpndItins itins = DEFAULT_ITINS> {
  // Register form with immediate control byte $src3.
  let isCommutable = 1 in
  def rri : SS4AIi8<opc, MRMSrcReg, (outs RC:$dst),
        (ins RC:$src1, RC:$src2, u8imm:$src3),
        !if(Is2Addr,
            !strconcat(OpcodeStr,
                "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
            !strconcat(OpcodeStr,
                "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
        [(set RC:$dst, (IntId RC:$src1, RC:$src2, imm:$src3))], itins.rr>,
        Sched<[itins.Sched]>;
  // Memory form: second source folded from memory.
  def rmi : SS4AIi8<opc, MRMSrcMem, (outs RC:$dst),
        (ins RC:$src1, x86memop:$src2, u8imm:$src3),
        !if(Is2Addr,
            !strconcat(OpcodeStr,
                "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
            !strconcat(OpcodeStr,
                "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
        [(set RC:$dst,
          (IntId RC:$src1,
           (bitconvert (memop_frag addr:$src2)), imm:$src3))], itins.rm>,
        Sched<[itins.Sched.Folded, ReadAfterLd]>;
}
6974
/// SS41I_binop_rmi - SSE 4.1 binary operator with 8-bit immediate
/// SDNode-based variant of SS41I_binop_rmi_int above; the operation is
/// matched through OpNode with explicit result type OpVT.
multiclass SS41I_binop_rmi<bits<8> opc, string OpcodeStr, SDNode OpNode,
                           ValueType OpVT, RegisterClass RC, PatFrag memop_frag,
                           X86MemOperand x86memop, bit Is2Addr = 1,
                           OpndItins itins = DEFAULT_ITINS> {
  // Register form with immediate control byte $src3.
  let isCommutable = 1 in
  def rri : SS4AIi8<opc, MRMSrcReg, (outs RC:$dst),
        (ins RC:$src1, RC:$src2, u8imm:$src3),
        !if(Is2Addr,
            !strconcat(OpcodeStr,
                "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
            !strconcat(OpcodeStr,
                "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
        [(set RC:$dst, (OpVT (OpNode RC:$src1, RC:$src2, imm:$src3)))],
        itins.rr>, Sched<[itins.Sched]>;
  // Memory form: second source folded from memory.
  def rmi : SS4AIi8<opc, MRMSrcMem, (outs RC:$dst),
        (ins RC:$src1, x86memop:$src2, u8imm:$src3),
        !if(Is2Addr,
            !strconcat(OpcodeStr,
                "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
            !strconcat(OpcodeStr,
                "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
        [(set RC:$dst,
          (OpVT (OpNode RC:$src1,
                 (bitconvert (memop_frag addr:$src2)), imm:$src3)))], itins.rm>,
        Sched<[itins.Sched.Folded, ReadAfterLd]>;
}
7002
// AVX forms of MPSADBW, the immediate blends, and the dot-product ops.
// ExeDomain pins float blends/dpps to the correct FP execution domain.
let Predicates = [HasAVX] in {
  let isCommutable = 0 in {
    defm VMPSADBW : SS41I_binop_rmi_int<0x42, "vmpsadbw", int_x86_sse41_mpsadbw,
                                        VR128, loadv2i64, i128mem, 0,
                                        DEFAULT_ITINS_MPSADSCHED>, VEX_4V;
  }

  let ExeDomain = SSEPackedSingle in {
  defm VBLENDPS : SS41I_binop_rmi<0x0C, "vblendps", X86Blendi, v4f32,
                                  VR128, loadv4f32, f128mem, 0,
                                  DEFAULT_ITINS_FBLENDSCHED>, VEX_4V;
  defm VBLENDPSY : SS41I_binop_rmi<0x0C, "vblendps", X86Blendi, v8f32,
                                   VR256, loadv8f32, f256mem, 0,
                                   DEFAULT_ITINS_FBLENDSCHED>, VEX_4V, VEX_L;
  }
  let ExeDomain = SSEPackedDouble in {
  defm VBLENDPD : SS41I_binop_rmi<0x0D, "vblendpd", X86Blendi, v2f64,
                                  VR128, loadv2f64, f128mem, 0,
                                  DEFAULT_ITINS_FBLENDSCHED>, VEX_4V;
  defm VBLENDPDY : SS41I_binop_rmi<0x0D, "vblendpd", X86Blendi, v4f64,
                                   VR256, loadv4f64, f256mem, 0,
                                   DEFAULT_ITINS_FBLENDSCHED>, VEX_4V, VEX_L;
  }
  defm VPBLENDW : SS41I_binop_rmi<0x0E, "vpblendw", X86Blendi, v8i16,
                                  VR128, loadv2i64, i128mem, 0,
                                  DEFAULT_ITINS_BLENDSCHED>, VEX_4V;

  let ExeDomain = SSEPackedSingle in
  defm VDPPS : SS41I_binop_rmi_int<0x40, "vdpps", int_x86_sse41_dpps,
                                   VR128, loadv4f32, f128mem, 0,
                                   SSE_DPPS_ITINS>, VEX_4V;
  let ExeDomain = SSEPackedDouble in
  defm VDPPD : SS41I_binop_rmi_int<0x41, "vdppd", int_x86_sse41_dppd,
                                   VR128, loadv2f64, f128mem, 0,
                                   SSE_DPPS_ITINS>, VEX_4V;
  // NOTE(review): VDPPSY uses i256mem although it loads floats via loadv8f32;
  // the other float ops here use f-typed memory operands -- verify intended.
  let ExeDomain = SSEPackedSingle in
  defm VDPPSY : SS41I_binop_rmi_int<0x40, "vdpps", int_x86_avx_dp_ps_256,
                                    VR256, loadv8f32, i256mem, 0,
                                    SSE_DPPS_ITINS>, VEX_4V, VEX_L;
}
7043
// 256-bit AVX2 forms of MPSADBW and PBLENDW.
let Predicates = [HasAVX2] in {
  let isCommutable = 0 in {
  defm VMPSADBWY : SS41I_binop_rmi_int<0x42, "vmpsadbw", int_x86_avx2_mpsadbw,
                                  VR256, loadv4i64, i256mem, 0,
                                  DEFAULT_ITINS_MPSADSCHED>, VEX_4V, VEX_L;
  }
  defm VPBLENDWY : SS41I_binop_rmi<0x0E, "vpblendw", X86Blendi, v16i16,
                                   VR256, loadv4i64, i256mem, 0,
                                   DEFAULT_ITINS_BLENDSCHED>, VEX_4V, VEX_L;
}
7054
// Legacy SSE4.1 forms of MPSADBW, the immediate blends, and DPPS/DPPD;
// two-address (destination tied to $src1), alignment-checked loads.
let Constraints = "$src1 = $dst" in {
  let isCommutable = 0 in {
  defm MPSADBW : SS41I_binop_rmi_int<0x42, "mpsadbw", int_x86_sse41_mpsadbw,
                                     VR128, memopv2i64, i128mem,
                                     1, SSE_MPSADBW_ITINS>;
  }
  let ExeDomain = SSEPackedSingle in
  defm BLENDPS : SS41I_binop_rmi<0x0C, "blendps", X86Blendi, v4f32,
                                 VR128, memopv4f32, f128mem,
                                 1, SSE_INTALU_ITINS_FBLEND_P>;
  let ExeDomain = SSEPackedDouble in
  defm BLENDPD : SS41I_binop_rmi<0x0D, "blendpd", X86Blendi, v2f64,
                                 VR128, memopv2f64, f128mem,
                                 1, SSE_INTALU_ITINS_FBLEND_P>;
  defm PBLENDW : SS41I_binop_rmi<0x0E, "pblendw", X86Blendi, v8i16,
                                 VR128, memopv2i64, i128mem,
                                 1, SSE_INTALU_ITINS_BLEND_P>;
  let ExeDomain = SSEPackedSingle in
  defm DPPS : SS41I_binop_rmi_int<0x40, "dpps", int_x86_sse41_dpps,
                                  VR128, memopv4f32, f128mem, 1,
                                  SSE_DPPS_ITINS>;
  let ExeDomain = SSEPackedDouble in
  defm DPPD : SS41I_binop_rmi_int<0x41, "dppd", int_x86_sse41_dppd,
                                  VR128, memopv2f64, f128mem, 1,
                                  SSE_DPPD_ITINS>;
}
7081
/// SS41I_quaternary_int_avx - AVX SSE 4.1 with 4 operators
/// Variable blends (VBLENDVPS/PD, VPBLENDVB): the fourth operand (the
/// mask register $src3) is encoded in the immediate byte via VEX_I8IMM.
multiclass SS41I_quaternary_int_avx<bits<8> opc, string OpcodeStr,
                                    RegisterClass RC, X86MemOperand x86memop,
                                    PatFrag mem_frag, Intrinsic IntId,
                                    X86FoldableSchedWrite Sched> {
  // Register form: $dst = IntId($src1, $src2, mask $src3).
  def rr : Ii8<opc, MRMSrcReg, (outs RC:$dst),
                  (ins RC:$src1, RC:$src2, RC:$src3),
                  !strconcat(OpcodeStr,
                    "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
                  [(set RC:$dst, (IntId RC:$src1, RC:$src2, RC:$src3))],
                  NoItinerary, SSEPackedInt>, TAPD, VEX_4V, VEX_I8IMM,
                Sched<[Sched]>;

  // Memory form: second source folded from memory via mem_frag.
  def rm : Ii8<opc, MRMSrcMem, (outs RC:$dst),
                  (ins RC:$src1, x86memop:$src2, RC:$src3),
                  !strconcat(OpcodeStr,
                    "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
                  [(set RC:$dst,
                        (IntId RC:$src1, (bitconvert (mem_frag addr:$src2)),
                               RC:$src3))],
                  NoItinerary, SSEPackedInt>, TAPD, VEX_4V, VEX_I8IMM,
                Sched<[Sched.Folded, ReadAfterLd]>;
}
7105
// AVX variable blend instantiations (explicit mask operand, no implicit
// XMM0 dependence, unlike the legacy SSE4.1 forms below).
let Predicates = [HasAVX] in {
let ExeDomain = SSEPackedDouble in {
defm VBLENDVPD  : SS41I_quaternary_int_avx<0x4B, "vblendvpd", VR128, f128mem,
                                           loadv2f64, int_x86_sse41_blendvpd,
                                           WriteFVarBlend>;
defm VBLENDVPDY : SS41I_quaternary_int_avx<0x4B, "vblendvpd", VR256, f256mem,
                                  loadv4f64, int_x86_avx_blendv_pd_256,
                                  WriteFVarBlend>, VEX_L;
} // ExeDomain = SSEPackedDouble
let ExeDomain = SSEPackedSingle in {
defm VBLENDVPS  : SS41I_quaternary_int_avx<0x4A, "vblendvps", VR128, f128mem,
                                           loadv4f32, int_x86_sse41_blendvps,
                                           WriteFVarBlend>;
defm VBLENDVPSY : SS41I_quaternary_int_avx<0x4A, "vblendvps", VR256, f256mem,
                                  loadv8f32, int_x86_avx_blendv_ps_256,
                                  WriteFVarBlend>, VEX_L;
} // ExeDomain = SSEPackedSingle
defm VPBLENDVB  : SS41I_quaternary_int_avx<0x4C, "vpblendvb", VR128, i128mem,
                                           loadv2i64, int_x86_sse41_pblendvb,
                                           WriteVarBlend>;
}
7127
// 256-bit byte-granularity variable blend (AVX2 only).
let Predicates = [HasAVX2] in {
defm VPBLENDVBY : SS41I_quaternary_int_avx<0x4C, "vpblendvb", VR256, i256mem,
                                      loadv4i64, int_x86_avx2_pblendvb,
                                      WriteVarBlend>, VEX_L;
}
7133
// Lower IR vselect to AVX variable blends. BLENDV selects from its first
// source where the mask bit is clear and from the mask-selected source
// where it is set, so $src1/$src2 are deliberately swapped in the output
// instruction relative to the vselect operands.
let Predicates = [HasAVX] in {
  def : Pat<(v16i8 (vselect (v16i8 VR128:$mask), (v16i8 VR128:$src1),
                            (v16i8 VR128:$src2))),
            (VPBLENDVBrr VR128:$src2, VR128:$src1, VR128:$mask)>;
  def : Pat<(v4i32 (vselect (v4i32 VR128:$mask), (v4i32 VR128:$src1),
                            (v4i32 VR128:$src2))),
            (VBLENDVPSrr VR128:$src2, VR128:$src1, VR128:$mask)>;
  def : Pat<(v4f32 (vselect (v4i32 VR128:$mask), (v4f32 VR128:$src1),
                            (v4f32 VR128:$src2))),
            (VBLENDVPSrr VR128:$src2, VR128:$src1, VR128:$mask)>;
  def : Pat<(v2i64 (vselect (v2i64 VR128:$mask), (v2i64 VR128:$src1),
                            (v2i64 VR128:$src2))),
            (VBLENDVPDrr VR128:$src2, VR128:$src1, VR128:$mask)>;
  def : Pat<(v2f64 (vselect (v2i64 VR128:$mask), (v2f64 VR128:$src1),
                            (v2f64 VR128:$src2))),
            (VBLENDVPDrr VR128:$src2, VR128:$src1, VR128:$mask)>;
  def : Pat<(v8i32 (vselect (v8i32 VR256:$mask), (v8i32 VR256:$src1),
                            (v8i32 VR256:$src2))),
            (VBLENDVPSYrr VR256:$src2, VR256:$src1, VR256:$mask)>;
  def : Pat<(v8f32 (vselect (v8i32 VR256:$mask), (v8f32 VR256:$src1),
                            (v8f32 VR256:$src2))),
            (VBLENDVPSYrr VR256:$src2, VR256:$src1, VR256:$mask)>;
  def : Pat<(v4i64 (vselect (v4i64 VR256:$mask), (v4i64 VR256:$src1),
                            (v4i64 VR256:$src2))),
            (VBLENDVPDYrr VR256:$src2, VR256:$src1, VR256:$mask)>;
  def : Pat<(v4f64 (vselect (v4i64 VR256:$mask), (v4f64 VR256:$src1),
                            (v4f64 VR256:$src2))),
            (VBLENDVPDYrr VR256:$src2, VR256:$src1, VR256:$mask)>;
}
7163
// v32i8 vselect needs VPBLENDVBY, which requires AVX2 (operands swapped as
// in the 128-bit patterns above).
let Predicates = [HasAVX2] in {
  def : Pat<(v32i8 (vselect (v32i8 VR256:$mask), (v32i8 VR256:$src1),
                            (v32i8 VR256:$src2))),
            (VPBLENDVBYrr VR256:$src2, VR256:$src1, VR256:$mask)>;
}
7169
7170// Patterns
7171// FIXME: Prefer a movss or movsd over a blendps when optimizing for size or
7172// on targets where they have equal performance. These were changed to use
7173// blends because blends have better throughput on SandyBridge and Haswell, but
7174// movs[s/d] are 1-2 byte shorter instructions.
let Predicates = [UseAVX] in {
  let AddedComplexity = 15 in {
  // Move scalar to XMM zero-extended, zeroing a VR128 then do a
  // MOVS{S,D} to the lower bits.
  def : Pat<(v4f32 (X86vzmovl (v4f32 (scalar_to_vector FR32:$src)))),
            (VMOVSSrr (v4f32 (V_SET0)), FR32:$src)>;
  // Keep lane 0 of $src, zero the rest: blend with a zero vector, with an
  // immediate selecting only the low element (bit 0 for 32/64-bit lanes,
  // bits 0-1 of the word mask for the v4i32 PBLENDW case).
  def : Pat<(v4f32 (X86vzmovl (v4f32 VR128:$src))),
            (VBLENDPSrri (v4f32 (V_SET0)), VR128:$src, (i8 1))>;
  def : Pat<(v4i32 (X86vzmovl (v4i32 VR128:$src))),
            (VPBLENDWrri (v4i32 (V_SET0)), VR128:$src, (i8 3))>;
  def : Pat<(v2f64 (X86vzmovl (v2f64 (scalar_to_vector FR64:$src)))),
            (VMOVSDrr (v2f64 (V_SET0)), FR64:$src)>;

  // Move low f32 and clear high bits.
  def : Pat<(v8f32 (X86vzmovl (v8f32 VR256:$src))),
            (VBLENDPSYrri (v8f32 (AVX_SET0)), VR256:$src, (i8 1))>;

  // Move low f64 and clear high bits.
  def : Pat<(v4f64 (X86vzmovl (v4f64 VR256:$src))),
            (VBLENDPDYrri (v4f64 (AVX_SET0)), VR256:$src, (i8 1))>;
  }

  // 256-bit zero-extended scalar insert: do the 128-bit VMOVSS/VMOVSD and
  // let SUBREG_TO_REG assert the implicit zeroing of the upper 128 bits.
  def : Pat<(v8f32 (X86vzmovl (insert_subvector undef,
                   (v4f32 (scalar_to_vector FR32:$src)), (iPTR 0)))),
            (SUBREG_TO_REG (i32 0),
                           (v4f32 (VMOVSSrr (v4f32 (V_SET0)), FR32:$src)),
                           sub_xmm)>;
  def : Pat<(v4f64 (X86vzmovl (insert_subvector undef,
                   (v2f64 (scalar_to_vector FR64:$src)), (iPTR 0)))),
            (SUBREG_TO_REG (i64 0),
                           (v2f64 (VMOVSDrr (v2f64 (V_SET0)), FR64:$src)),
                           sub_xmm)>;

  // These will incur an FP/int domain crossing penalty, but it may be the only
  // way without AVX2. Do not add any complexity because we may be able to match
  // more optimal patterns defined earlier in this file.
  def : Pat<(v8i32 (X86vzmovl (v8i32 VR256:$src))),
            (VBLENDPSYrri (v8i32 (AVX_SET0)), VR256:$src, (i8 1))>;
  def : Pat<(v4i64 (X86vzmovl (v4i64 VR256:$src))),
            (VBLENDPDYrri (v4i64 (AVX_SET0)), VR256:$src, (i8 1))>;
}
7216
7217// FIXME: Prefer a movss or movsd over a blendps when optimizing for size or
7218// on targets where they have equal performance. These were changed to use
7219// blends because blends have better throughput on SandyBridge and Haswell, but
7220// movs[s/d] are 1-2 byte shorter instructions.
let Predicates = [UseSSE41] in {
  // With SSE41 we can use blends for these patterns.
  // Zero-extend lane 0 by blending with a zero register; the immediate
  // keeps only the low element (low two words for the PBLENDW case).
  def : Pat<(v4f32 (X86vzmovl (v4f32 VR128:$src))),
            (BLENDPSrri (v4f32 (V_SET0)), VR128:$src, (i8 1))>;
  def : Pat<(v4i32 (X86vzmovl (v4i32 VR128:$src))),
            (PBLENDWrri (v4i32 (V_SET0)), VR128:$src, (i8 3))>;
  def : Pat<(v2f64 (X86vzmovl (v2f64 VR128:$src))),
            (BLENDPDrri (v2f64 (V_SET0)), VR128:$src, (i8 1))>;
}
7230
7231
/// SS41I_ternary_int - SSE 4.1 ternary operator
/// Legacy variable blends: the mask is an implicit use of XMM0 (hence the
/// 'Uses = [XMM0]'); two-address form with $dst tied to $src1.
let Uses = [XMM0], Constraints = "$src1 = $dst" in {
  multiclass SS41I_ternary_int<bits<8> opc, string OpcodeStr, PatFrag mem_frag,
                               X86MemOperand x86memop, Intrinsic IntId,
                               OpndItins itins = DEFAULT_ITINS> {
    // Register form: $dst = IntId($src1, $src2, implicit XMM0 mask).
    def rr0 : SS48I<opc, MRMSrcReg, (outs VR128:$dst),
                    (ins VR128:$src1, VR128:$src2),
                    !strconcat(OpcodeStr,
                     "\t{$src2, $dst|$dst, $src2}"),
                    [(set VR128:$dst, (IntId VR128:$src1, VR128:$src2, XMM0))],
                    itins.rr>, Sched<[itins.Sched]>;

    // Memory form: second source folded from memory via mem_frag.
    def rm0 : SS48I<opc, MRMSrcMem, (outs VR128:$dst),
                    (ins VR128:$src1, x86memop:$src2),
                    !strconcat(OpcodeStr,
                     "\t{$src2, $dst|$dst, $src2}"),
                    [(set VR128:$dst,
                      (IntId VR128:$src1,
                       (bitconvert (mem_frag addr:$src2)), XMM0))],
                       itins.rm>, Sched<[itins.Sched.Folded, ReadAfterLd]>;
  }
}
7254
// Legacy SSE4.1 variable blends (mask implicitly in XMM0).
let ExeDomain = SSEPackedDouble in
defm BLENDVPD : SS41I_ternary_int<0x15, "blendvpd", memopv2f64, f128mem,
                                  int_x86_sse41_blendvpd,
                                  DEFAULT_ITINS_FBLENDSCHED>;
let ExeDomain = SSEPackedSingle in
defm BLENDVPS : SS41I_ternary_int<0x14, "blendvps", memopv4f32, f128mem,
                                  int_x86_sse41_blendvps,
                                  DEFAULT_ITINS_FBLENDSCHED>;
defm PBLENDVB : SS41I_ternary_int<0x10, "pblendvb", memopv2i64, i128mem,
                                  int_x86_sse41_pblendvb,
                                  DEFAULT_ITINS_VARBLENDSCHED>;
7266
// Aliases with the implicit xmm0 argument
// Allow the assembler to accept the three-operand spelling where %xmm0 is
// written explicitly, mapping it onto the two-operand instructions above.
def : InstAlias<"blendvpd\t{%xmm0, $src2, $dst|$dst, $src2, xmm0}",
                (BLENDVPDrr0 VR128:$dst, VR128:$src2)>;
def : InstAlias<"blendvpd\t{%xmm0, $src2, $dst|$dst, $src2, xmm0}",
                (BLENDVPDrm0 VR128:$dst, f128mem:$src2)>;
def : InstAlias<"blendvps\t{%xmm0, $src2, $dst|$dst, $src2, xmm0}",
                (BLENDVPSrr0 VR128:$dst, VR128:$src2)>;
def : InstAlias<"blendvps\t{%xmm0, $src2, $dst|$dst, $src2, xmm0}",
                (BLENDVPSrm0 VR128:$dst, f128mem:$src2)>;
def : InstAlias<"pblendvb\t{%xmm0, $src2, $dst|$dst, $src2, xmm0}",
                (PBLENDVBrr0 VR128:$dst, VR128:$src2)>;
def : InstAlias<"pblendvb\t{%xmm0, $src2, $dst|$dst, $src2, xmm0}",
                (PBLENDVBrm0 VR128:$dst, i128mem:$src2)>;
7280
// Lower vselect with the mask already in XMM0 to the legacy blends.
// As with the AVX patterns, BLENDV's source order requires swapping
// $src1/$src2 relative to the vselect operands.
let Predicates = [UseSSE41] in {
  def : Pat<(v16i8 (vselect (v16i8 XMM0), (v16i8 VR128:$src1),
                            (v16i8 VR128:$src2))),
            (PBLENDVBrr0 VR128:$src2, VR128:$src1)>;
  def : Pat<(v4i32 (vselect (v4i32 XMM0), (v4i32 VR128:$src1),
                            (v4i32 VR128:$src2))),
            (BLENDVPSrr0 VR128:$src2, VR128:$src1)>;
  def : Pat<(v4f32 (vselect (v4i32 XMM0), (v4f32 VR128:$src1),
                            (v4f32 VR128:$src2))),
            (BLENDVPSrr0 VR128:$src2, VR128:$src1)>;
  def : Pat<(v2i64 (vselect (v2i64 XMM0), (v2i64 VR128:$src1),
                            (v2i64 VR128:$src2))),
            (BLENDVPDrr0 VR128:$src2, VR128:$src1)>;
  def : Pat<(v2f64 (vselect (v2i64 XMM0), (v2f64 VR128:$src1),
                            (v2f64 VR128:$src2))),
            (BLENDVPDrr0 VR128:$src2, VR128:$src1)>;
}
7298
// MOVNTDQA: non-temporal aligned 128/256-bit load, matched only through
// the intrinsic (no generic load pattern).
let SchedRW = [WriteLoad] in {
let Predicates = [HasAVX] in
def VMOVNTDQArm : SS48I<0x2A, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src),
                       "vmovntdqa\t{$src, $dst|$dst, $src}",
                       [(set VR128:$dst, (int_x86_sse41_movntdqa addr:$src))]>,
                       VEX;
let Predicates = [HasAVX2] in
def VMOVNTDQAYrm : SS48I<0x2A, MRMSrcMem, (outs VR256:$dst), (ins i256mem:$src),
                         "vmovntdqa\t{$src, $dst|$dst, $src}",
                         [(set VR256:$dst, (int_x86_avx2_movntdqa addr:$src))]>,
                         VEX, VEX_L;
def MOVNTDQArm : SS48I<0x2A, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src),
                       "movntdqa\t{$src, $dst|$dst, $src}",
                       [(set VR128:$dst, (int_x86_sse41_movntdqa addr:$src))]>;
} // SchedRW
7314
7315//===----------------------------------------------------------------------===//
7316// SSE4.2 - Compare Instructions
7317//===----------------------------------------------------------------------===//
7318
/// SS42I_binop_rm - Simple SSE 4.2 binary operator
/// Like SS48I_binop_rm but in the SSE4.2 encoding class; note the memory
/// pattern has no bitconvert (memop_frag is expected to produce OpVT).
/// NOTE(review): unlike the SSE4.1 multiclasses, no Sched<> is attached
/// here -- verify whether scheduling info is intentionally omitted.
multiclass SS42I_binop_rm<bits<8> opc, string OpcodeStr, SDNode OpNode,
                          ValueType OpVT, RegisterClass RC, PatFrag memop_frag,
                          X86MemOperand x86memop, bit Is2Addr = 1> {
  // Register form.
  def rr : SS428I<opc, MRMSrcReg, (outs RC:$dst),
       (ins RC:$src1, RC:$src2),
       !if(Is2Addr,
           !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
           !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
       [(set RC:$dst, (OpVT (OpNode RC:$src1, RC:$src2)))]>;
  // Memory form: second source folded from memory.
  def rm : SS428I<opc, MRMSrcMem, (outs RC:$dst),
       (ins RC:$src1, x86memop:$src2),
       !if(Is2Addr,
           !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
           !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
       [(set RC:$dst,
         (OpVT (OpNode RC:$src1, (memop_frag addr:$src2))))]>;
}
7337
// PCMPGTQ: 64-bit signed greater-than compare (SSE4.2). AVX forms fold
// unaligned loads (loadv*); the legacy form requires alignment (memopv2i64).
let Predicates = [HasAVX] in
  defm VPCMPGTQ : SS42I_binop_rm<0x37, "vpcmpgtq", X86pcmpgt, v2i64, VR128,
                                 loadv2i64, i128mem, 0>, VEX_4V;

let Predicates = [HasAVX2] in
  defm VPCMPGTQY : SS42I_binop_rm<0x37, "vpcmpgtq", X86pcmpgt, v4i64, VR256,
                                  loadv4i64, i256mem, 0>, VEX_4V, VEX_L;

let Constraints = "$src1 = $dst" in
  defm PCMPGTQ : SS42I_binop_rm<0x37, "pcmpgtq", X86pcmpgt, v2i64, VR128,
                                memopv2i64, i128mem>;
7349
7350//===----------------------------------------------------------------------===//
7351// SSE4.2 - String/text Processing Instructions
7352//===----------------------------------------------------------------------===//
7353
// Packed Compare Implicit Length Strings, Return Mask
// Pseudo forms matched from the intrinsic; expanded later by a custom
// inserter (see the usesCustomInserter block below) because the real
// instruction writes its result implicitly to XMM0.
multiclass pseudo_pcmpistrm<string asm, PatFrag ld_frag> {
  def REG : PseudoI<(outs VR128:$dst),
                    (ins VR128:$src1, VR128:$src2, u8imm:$src3),
    [(set VR128:$dst, (int_x86_sse42_pcmpistrm128 VR128:$src1, VR128:$src2,
                                                  imm:$src3))]>;
  def MEM : PseudoI<(outs VR128:$dst),
                    (ins VR128:$src1, i128mem:$src2, u8imm:$src3),
    [(set VR128:$dst, (int_x86_sse42_pcmpistrm128 VR128:$src1,
                       (bc_v16i8 (ld_frag addr:$src2)), imm:$src3))]>;
}
7365
7366let Defs = [EFLAGS], usesCustomInserter = 1 in {
7367  defm VPCMPISTRM128 : pseudo_pcmpistrm<"#VPCMPISTRM128", loadv2i64>,
7368                         Requires<[HasAVX]>;
7369  defm PCMPISTRM128 : pseudo_pcmpistrm<"#PCMPISTRM128", memopv2i64>,
7370                         Requires<[UseSSE42]>;
7371}
7372
// Real (encodable) PCMPISTRM instructions, opcode 0x62. They carry no DAG
// patterns (selection goes through the pseudos above); the mask result is
// produced in the implicitly-defined XMM0.
multiclass pcmpistrm_SS42AI<string asm> {
  def rr : SS42AI<0x62, MRMSrcReg, (outs),
    (ins VR128:$src1, VR128:$src2, u8imm:$src3),
    !strconcat(asm, "\t{$src3, $src2, $src1|$src1, $src2, $src3}"),
    []>, Sched<[WritePCmpIStrM]>;
  let mayLoad = 1 in
  def rm :SS42AI<0x62, MRMSrcMem, (outs),
    (ins VR128:$src1, i128mem:$src2, u8imm:$src3),
    !strconcat(asm, "\t{$src3, $src2, $src1|$src1, $src2, $src3}"),
    []>, Sched<[WritePCmpIStrMLd, ReadAfterLd]>;
}

// Implicit outputs: XMM0 (mask) and EFLAGS.
let Defs = [XMM0, EFLAGS], hasSideEffects = 0 in {
  let Predicates = [HasAVX] in
  defm VPCMPISTRM128 : pcmpistrm_SS42AI<"vpcmpistrm">, VEX;
  defm PCMPISTRM128  : pcmpistrm_SS42AI<"pcmpistrm"> ;
}
7390
// Packed Compare Explicit Length Strings, Return Mask
//
// Like pseudo_pcmpistrm above, but the explicit-length variant additionally
// reads the string lengths from EAX and EDX (wired into the intrinsic
// pattern and listed in Uses below).
multiclass pseudo_pcmpestrm<string asm, PatFrag ld_frag> {
  def REG : PseudoI<(outs VR128:$dst),
                    (ins VR128:$src1, VR128:$src3, u8imm:$src5),
    [(set VR128:$dst, (int_x86_sse42_pcmpestrm128
                       VR128:$src1, EAX, VR128:$src3, EDX, imm:$src5))]>;
  def MEM : PseudoI<(outs VR128:$dst),
                    (ins VR128:$src1, i128mem:$src3, u8imm:$src5),
    [(set VR128:$dst, (int_x86_sse42_pcmpestrm128 VR128:$src1, EAX,
                       (bc_v16i8 (ld_frag addr:$src3)), EDX, imm:$src5))]>;
}

// Expanded by a custom inserter; clobbers EFLAGS, reads EAX/EDX.
let Defs = [EFLAGS], Uses = [EAX, EDX], usesCustomInserter = 1 in {
  defm VPCMPESTRM128 : pseudo_pcmpestrm<"#VPCMPESTRM128", loadv2i64>,
                         Requires<[HasAVX]>;
  defm PCMPESTRM128 : pseudo_pcmpestrm<"#PCMPESTRM128", memopv2i64>,
                         Requires<[UseSSE42]>;
}
7409
// Real PCMPESTRM instructions, opcode 0x60. No DAG patterns (selection goes
// through the pseudos above); the mask result lands in the implicit XMM0.
multiclass SS42AI_pcmpestrm<string asm> {
  def rr : SS42AI<0x60, MRMSrcReg, (outs),
    (ins VR128:$src1, VR128:$src3, u8imm:$src5),
    !strconcat(asm, "\t{$src5, $src3, $src1|$src1, $src3, $src5}"),
    []>, Sched<[WritePCmpEStrM]>;
  let mayLoad = 1 in
  def rm : SS42AI<0x60, MRMSrcMem, (outs),
    (ins VR128:$src1, i128mem:$src3, u8imm:$src5),
    !strconcat(asm, "\t{$src5, $src3, $src1|$src1, $src3, $src5}"),
    []>, Sched<[WritePCmpEStrMLd, ReadAfterLd]>;
}

// Implicit outputs: XMM0 (mask) and EFLAGS; implicit inputs: EAX, EDX.
let Defs = [XMM0, EFLAGS], Uses = [EAX, EDX], hasSideEffects = 0 in {
  let Predicates = [HasAVX] in
  defm VPCMPESTRM128 : SS42AI_pcmpestrm<"vpcmpestrm">, VEX;
  defm PCMPESTRM128 :  SS42AI_pcmpestrm<"pcmpestrm">;
}
7427
// Packed Compare Implicit Length Strings, Return Index
//
// Pseudos matching the X86pcmpistri node; unlike the mask variants these
// produce a GR32 index plus EFLAGS as pattern results.
multiclass pseudo_pcmpistri<string asm, PatFrag ld_frag> {
  def REG : PseudoI<(outs GR32:$dst),
                    (ins VR128:$src1, VR128:$src2, u8imm:$src3),
    [(set GR32:$dst, EFLAGS,
      (X86pcmpistri VR128:$src1, VR128:$src2, imm:$src3))]>;
  def MEM : PseudoI<(outs GR32:$dst),
                    (ins VR128:$src1, i128mem:$src2, u8imm:$src3),
    [(set GR32:$dst, EFLAGS, (X86pcmpistri VR128:$src1,
                              (bc_v16i8 (ld_frag addr:$src2)), imm:$src3))]>;
}

// Expanded by a custom inserter; AVX form uses an unaligned load pattern.
let Defs = [EFLAGS], usesCustomInserter = 1 in {
  defm VPCMPISTRI : pseudo_pcmpistri<"#VPCMPISTRI", loadv2i64>,
                      Requires<[HasAVX]>;
  defm PCMPISTRI  : pseudo_pcmpistri<"#PCMPISTRI", memopv2i64>,
                      Requires<[UseSSE42]>;
}
7446
// Real PCMPISTRI instructions, opcode 0x63. No DAG patterns; the index
// result is produced in the implicitly-defined ECX.
multiclass SS42AI_pcmpistri<string asm> {
  def rr : SS42AI<0x63, MRMSrcReg, (outs),
    (ins VR128:$src1, VR128:$src2, u8imm:$src3),
    !strconcat(asm, "\t{$src3, $src2, $src1|$src1, $src2, $src3}"),
    []>, Sched<[WritePCmpIStrI]>;
  let mayLoad = 1 in
  def rm : SS42AI<0x63, MRMSrcMem, (outs),
    (ins VR128:$src1, i128mem:$src2, u8imm:$src3),
    !strconcat(asm, "\t{$src3, $src2, $src1|$src1, $src2, $src3}"),
    []>, Sched<[WritePCmpIStrILd, ReadAfterLd]>;
}

// Implicit outputs: ECX (index) and EFLAGS.
let Defs = [ECX, EFLAGS], hasSideEffects = 0 in {
  let Predicates = [HasAVX] in
  defm VPCMPISTRI : SS42AI_pcmpistri<"vpcmpistri">, VEX;
  defm PCMPISTRI  : SS42AI_pcmpistri<"pcmpistri">;
}
7464
// Packed Compare Explicit Length Strings, Return Index
//
// Pseudos matching X86pcmpestri: explicit-length variant, so string lengths
// come from EAX/EDX; results are a GR32 index plus EFLAGS.
multiclass pseudo_pcmpestri<string asm, PatFrag ld_frag> {
  def REG : PseudoI<(outs GR32:$dst),
                    (ins VR128:$src1, VR128:$src3, u8imm:$src5),
    [(set GR32:$dst, EFLAGS,
      (X86pcmpestri VR128:$src1, EAX, VR128:$src3, EDX, imm:$src5))]>;
  def MEM : PseudoI<(outs GR32:$dst),
                    (ins VR128:$src1, i128mem:$src3, u8imm:$src5),
    [(set GR32:$dst, EFLAGS,
      (X86pcmpestri VR128:$src1, EAX, (bc_v16i8 (ld_frag addr:$src3)), EDX,
       imm:$src5))]>;
}

// Expanded by a custom inserter; clobbers EFLAGS, reads EAX/EDX.
let Defs = [EFLAGS], Uses = [EAX, EDX], usesCustomInserter = 1 in {
  defm VPCMPESTRI : pseudo_pcmpestri<"#VPCMPESTRI", loadv2i64>,
                      Requires<[HasAVX]>;
  defm PCMPESTRI  : pseudo_pcmpestri<"#PCMPESTRI", memopv2i64>,
                      Requires<[UseSSE42]>;
}
7484
// Real PCMPESTRI instructions, opcode 0x61. No DAG patterns; the index
// result is produced in the implicitly-defined ECX.
multiclass SS42AI_pcmpestri<string asm> {
  def rr : SS42AI<0x61, MRMSrcReg, (outs),
    (ins VR128:$src1, VR128:$src3, u8imm:$src5),
    !strconcat(asm, "\t{$src5, $src3, $src1|$src1, $src3, $src5}"),
    []>, Sched<[WritePCmpEStrI]>;
  let mayLoad = 1 in
  def rm : SS42AI<0x61, MRMSrcMem, (outs),
    (ins VR128:$src1, i128mem:$src3, u8imm:$src5),
    !strconcat(asm, "\t{$src5, $src3, $src1|$src1, $src3, $src5}"),
    []>, Sched<[WritePCmpEStrILd, ReadAfterLd]>;
}

// Implicit outputs: ECX (index) and EFLAGS; implicit inputs: EAX, EDX.
let Defs = [ECX, EFLAGS], Uses = [EAX, EDX], hasSideEffects = 0 in {
  let Predicates = [HasAVX] in
  defm VPCMPESTRI : SS42AI_pcmpestri<"vpcmpestri">, VEX;
  defm PCMPESTRI  : SS42AI_pcmpestri<"pcmpestri">;
}
7502
7503//===----------------------------------------------------------------------===//
7504// SSE4.2 - CRC Instructions
7505//===----------------------------------------------------------------------===//
7506
7507// No CRC instructions have AVX equivalents
7508
// crc intrinsic instruction
// This set of instructions are only rm, the only difference is the size
// of r and m.

// Register-register CRC32 form. RCOut is the accumulator (tied to dst by
// the Constraints on the defs below); RCIn is the data operand; Int is the
// matching crc32 intrinsic (or null_frag for the pattern-less 64/8 case).
class SS42I_crc32r<bits<8> opc, string asm, RegisterClass RCOut,
                   RegisterClass RCIn, SDPatternOperator Int> :
  SS42FI<opc, MRMSrcReg, (outs RCOut:$dst), (ins RCOut:$src1, RCIn:$src2),
         !strconcat(asm, "\t{$src2, $src1|$src1, $src2}"),
         [(set RCOut:$dst, (Int RCOut:$src1, RCIn:$src2))], IIC_CRC32_REG>,
         Sched<[WriteFAdd]>;

// Register-memory CRC32 form: the data operand is loaded from x86memop.
class SS42I_crc32m<bits<8> opc, string asm, RegisterClass RCOut,
                   X86MemOperand x86memop, SDPatternOperator Int> :
  SS42FI<opc, MRMSrcMem, (outs RCOut:$dst), (ins RCOut:$src1, x86memop:$src2),
         !strconcat(asm, "\t{$src2, $src1|$src1, $src2}"),
         [(set RCOut:$dst, (Int RCOut:$src1, (load addr:$src2)))],
         IIC_CRC32_MEM>, Sched<[WriteFAddLd, ReadAfterLd]>;
7525
// CRC32 is destructive: the accumulator ($src1) is tied to the result.
// Opcode 0xF0 is the byte-data form, 0xF1 the word/dword/qword forms,
// disambiguated by operand-size/REX.W prefixes.
let Constraints = "$src1 = $dst" in {
  def CRC32r32m8  : SS42I_crc32m<0xF0, "crc32{b}", GR32, i8mem,
                                 int_x86_sse42_crc32_32_8>;
  def CRC32r32r8  : SS42I_crc32r<0xF0, "crc32{b}", GR32, GR8,
                                 int_x86_sse42_crc32_32_8>;
  def CRC32r32m16 : SS42I_crc32m<0xF1, "crc32{w}", GR32, i16mem,
                                 int_x86_sse42_crc32_32_16>, OpSize16;
  def CRC32r32r16 : SS42I_crc32r<0xF1, "crc32{w}", GR32, GR16,
                                 int_x86_sse42_crc32_32_16>, OpSize16;
  def CRC32r32m32 : SS42I_crc32m<0xF1, "crc32{l}", GR32, i32mem,
                                 int_x86_sse42_crc32_32_32>, OpSize32;
  def CRC32r32r32 : SS42I_crc32r<0xF1, "crc32{l}", GR32, GR32,
                                 int_x86_sse42_crc32_32_32>, OpSize32;
  def CRC32r64m64 : SS42I_crc32m<0xF1, "crc32{q}", GR64, i64mem,
                                 int_x86_sse42_crc32_64_64>, REX_W;
  def CRC32r64r64 : SS42I_crc32r<0xF1, "crc32{q}", GR64, GR64,
                                 int_x86_sse42_crc32_64_64>, REX_W;
  // 64-bit destination with 8-bit data has no matching intrinsic, so these
  // are declared with null_frag (no pattern) for assembler/disassembler use.
  let hasSideEffects = 0 in {
    let mayLoad = 1 in
    def CRC32r64m8 : SS42I_crc32m<0xF0, "crc32{b}", GR64, i8mem,
                                   null_frag>, REX_W;
    def CRC32r64r8 : SS42I_crc32r<0xF0, "crc32{b}", GR64, GR8,
                                   null_frag>, REX_W;
  }
}
7551
7552//===----------------------------------------------------------------------===//
7553// SHA-NI Instructions
7554//===----------------------------------------------------------------------===//
7555
// SHA-NI two-operand instruction: rr and rm forms matching IntId. When
// UsesXMM0 is set, XMM0 is an extra implicit operand of the intrinsic
// (used by sha256rnds2 below).
multiclass SHAI_binop<bits<8> Opc, string OpcodeStr, Intrinsic IntId,
                      bit UsesXMM0 = 0> {
  def rr : I<Opc, MRMSrcReg, (outs VR128:$dst),
             (ins VR128:$src1, VR128:$src2),
             !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
             [!if(UsesXMM0,
                  (set VR128:$dst, (IntId VR128:$src1, VR128:$src2, XMM0)),
                  (set VR128:$dst, (IntId VR128:$src1, VR128:$src2)))]>, T8;

  // Memory form: second source loaded (aligned memopv2i64) and bitcast
  // to v4i32 for the intrinsic.
  def rm : I<Opc, MRMSrcMem, (outs VR128:$dst),
             (ins VR128:$src1, i128mem:$src2),
             !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
             [!if(UsesXMM0,
                  (set VR128:$dst, (IntId VR128:$src1,
                    (bc_v4i32 (memopv2i64 addr:$src2)), XMM0)),
                  (set VR128:$dst, (IntId VR128:$src1,
                    (bc_v4i32 (memopv2i64 addr:$src2)))))]>, T8;
}
7574
// SHA-NI instructions; all are destructive ($src1 tied to $dst) and gated
// on the HasSHA predicate.
let Constraints = "$src1 = $dst", Predicates = [HasSHA] in {
  // sha1rnds4 takes an extra 2-bit round-function selector immediate.
  def SHA1RNDS4rri : Ii8<0xCC, MRMSrcReg, (outs VR128:$dst),
                         (ins VR128:$src1, VR128:$src2, u8imm:$src3),
                         "sha1rnds4\t{$src3, $src2, $dst|$dst, $src2, $src3}",
                         [(set VR128:$dst,
                           (int_x86_sha1rnds4 VR128:$src1, VR128:$src2,
                            (i8 imm:$src3)))]>, TA;
  def SHA1RNDS4rmi : Ii8<0xCC, MRMSrcMem, (outs VR128:$dst),
                         (ins VR128:$src1, i128mem:$src2, u8imm:$src3),
                         "sha1rnds4\t{$src3, $src2, $dst|$dst, $src2, $src3}",
                         [(set VR128:$dst,
                           (int_x86_sha1rnds4 VR128:$src1,
                            (bc_v4i32 (memopv2i64 addr:$src2)),
                            (i8 imm:$src3)))]>, TA;

  defm SHA1NEXTE : SHAI_binop<0xC8, "sha1nexte", int_x86_sha1nexte>;
  defm SHA1MSG1  : SHAI_binop<0xC9, "sha1msg1", int_x86_sha1msg1>;
  defm SHA1MSG2  : SHAI_binop<0xCA, "sha1msg2", int_x86_sha1msg2>;

  // sha256rnds2 additionally reads XMM0 implicitly (UsesXMM0 = 1).
  let Uses=[XMM0] in
  defm SHA256RNDS2 : SHAI_binop<0xCB, "sha256rnds2", int_x86_sha256rnds2, 1>;

  defm SHA256MSG1 : SHAI_binop<0xCC, "sha256msg1", int_x86_sha256msg1>;
  defm SHA256MSG2 : SHAI_binop<0xCD, "sha256msg2", int_x86_sha256msg2>;
}
7600
// Aliases with explicit %xmm0
// Accept the three-operand spelling where the implicit XMM0 operand is
// written out explicitly, mapping it to the two-operand instructions.
def : InstAlias<"sha256rnds2\t{%xmm0, $src2, $dst|$dst, $src2, xmm0}",
                (SHA256RNDS2rr VR128:$dst, VR128:$src2)>;
def : InstAlias<"sha256rnds2\t{%xmm0, $src2, $dst|$dst, $src2, xmm0}",
                (SHA256RNDS2rm VR128:$dst, i128mem:$src2)>;
7606
7607//===----------------------------------------------------------------------===//
7608// AES-NI Instructions
7609//===----------------------------------------------------------------------===//
7610
// AES-NI two-operand intrinsic instruction: rr and rm forms. Is2Addr
// selects the destructive SSE assembly string vs. the three-operand
// AVX (VEX) string; ld_frag selects the load pattern for the rm form.
multiclass AESI_binop_rm_int<bits<8> opc, string OpcodeStr, Intrinsic IntId128,
                             PatFrag ld_frag, bit Is2Addr = 1> {
  def rr : AES8I<opc, MRMSrcReg, (outs VR128:$dst),
       (ins VR128:$src1, VR128:$src2),
       !if(Is2Addr,
           !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
           !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
       [(set VR128:$dst, (IntId128 VR128:$src1, VR128:$src2))]>,
       Sched<[WriteAESDecEnc]>;
  def rm : AES8I<opc, MRMSrcMem, (outs VR128:$dst),
       (ins VR128:$src1, i128mem:$src2),
       !if(Is2Addr,
           !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
           !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
       [(set VR128:$dst,
         (IntId128 VR128:$src1, (ld_frag addr:$src2)))]>,
       Sched<[WriteAESDecEncLd, ReadAfterLd]>;
}
7629
7630// Perform One Round of an AES Encryption/Decryption Flow
// AES round instructions. AVX forms are non-destructive (Is2Addr = 0) and
// use unaligned loads (loadv2i64).
let Predicates = [HasAVX, HasAES] in {
  defm VAESENC          : AESI_binop_rm_int<0xDC, "vaesenc",
                         int_x86_aesni_aesenc, loadv2i64, 0>, VEX_4V;
  defm VAESENCLAST      : AESI_binop_rm_int<0xDD, "vaesenclast",
                         int_x86_aesni_aesenclast, loadv2i64, 0>, VEX_4V;
  defm VAESDEC          : AESI_binop_rm_int<0xDE, "vaesdec",
                         int_x86_aesni_aesdec, loadv2i64, 0>, VEX_4V;
  defm VAESDECLAST      : AESI_binop_rm_int<0xDF, "vaesdeclast",
                         int_x86_aesni_aesdeclast, loadv2i64, 0>, VEX_4V;
}

// Legacy SSE forms: destructive ($src1 tied to $dst), aligned loads.
let Constraints = "$src1 = $dst" in {
  defm AESENC          : AESI_binop_rm_int<0xDC, "aesenc",
                         int_x86_aesni_aesenc, memopv2i64>;
  defm AESENCLAST      : AESI_binop_rm_int<0xDD, "aesenclast",
                         int_x86_aesni_aesenclast, memopv2i64>;
  defm AESDEC          : AESI_binop_rm_int<0xDE, "aesdec",
                         int_x86_aesni_aesdec, memopv2i64>;
  defm AESDECLAST      : AESI_binop_rm_int<0xDF, "aesdeclast",
                         int_x86_aesni_aesdeclast, memopv2i64>;
}
7652
7653// Perform the AES InvMixColumn Transformation
// AESIMC is a unary operation (single source), so the AVX form takes no
// VEX_4V operand; AVX uses unaligned loads, SSE aligned ones.
let Predicates = [HasAVX, HasAES] in {
  def VAESIMCrr : AES8I<0xDB, MRMSrcReg, (outs VR128:$dst),
      (ins VR128:$src1),
      "vaesimc\t{$src1, $dst|$dst, $src1}",
      [(set VR128:$dst,
        (int_x86_aesni_aesimc VR128:$src1))]>, Sched<[WriteAESIMC]>,
      VEX;
  def VAESIMCrm : AES8I<0xDB, MRMSrcMem, (outs VR128:$dst),
      (ins i128mem:$src1),
      "vaesimc\t{$src1, $dst|$dst, $src1}",
      [(set VR128:$dst, (int_x86_aesni_aesimc (loadv2i64 addr:$src1)))]>,
      Sched<[WriteAESIMCLd]>, VEX;
}
def AESIMCrr : AES8I<0xDB, MRMSrcReg, (outs VR128:$dst),
  (ins VR128:$src1),
  "aesimc\t{$src1, $dst|$dst, $src1}",
  [(set VR128:$dst,
    (int_x86_aesni_aesimc VR128:$src1))]>, Sched<[WriteAESIMC]>;
def AESIMCrm : AES8I<0xDB, MRMSrcMem, (outs VR128:$dst),
  (ins i128mem:$src1),
  "aesimc\t{$src1, $dst|$dst, $src1}",
  [(set VR128:$dst, (int_x86_aesni_aesimc (memopv2i64 addr:$src1)))]>,
  Sched<[WriteAESIMCLd]>;
7677
7678// AES Round Key Generation Assist
// AESKEYGENASSIST: unary plus an 8-bit round-constant immediate. AVX forms
// use unaligned loads; legacy SSE forms use aligned loads.
let Predicates = [HasAVX, HasAES] in {
  def VAESKEYGENASSIST128rr : AESAI<0xDF, MRMSrcReg, (outs VR128:$dst),
      (ins VR128:$src1, u8imm:$src2),
      "vaeskeygenassist\t{$src2, $src1, $dst|$dst, $src1, $src2}",
      [(set VR128:$dst,
        (int_x86_aesni_aeskeygenassist VR128:$src1, imm:$src2))]>,
      Sched<[WriteAESKeyGen]>, VEX;
  def VAESKEYGENASSIST128rm : AESAI<0xDF, MRMSrcMem, (outs VR128:$dst),
      (ins i128mem:$src1, u8imm:$src2),
      "vaeskeygenassist\t{$src2, $src1, $dst|$dst, $src1, $src2}",
      [(set VR128:$dst,
        (int_x86_aesni_aeskeygenassist (loadv2i64 addr:$src1), imm:$src2))]>,
      Sched<[WriteAESKeyGenLd]>, VEX;
}
def AESKEYGENASSIST128rr : AESAI<0xDF, MRMSrcReg, (outs VR128:$dst),
  (ins VR128:$src1, u8imm:$src2),
  "aeskeygenassist\t{$src2, $src1, $dst|$dst, $src1, $src2}",
  [(set VR128:$dst,
    (int_x86_aesni_aeskeygenassist VR128:$src1, imm:$src2))]>,
  Sched<[WriteAESKeyGen]>;
def AESKEYGENASSIST128rm : AESAI<0xDF, MRMSrcMem, (outs VR128:$dst),
  (ins i128mem:$src1, u8imm:$src2),
  "aeskeygenassist\t{$src2, $src1, $dst|$dst, $src1, $src2}",
  [(set VR128:$dst,
    (int_x86_aesni_aeskeygenassist (memopv2i64 addr:$src1), imm:$src2))]>,
  Sched<[WriteAESKeyGenLd]>;
7705
7706//===----------------------------------------------------------------------===//
7707// PCLMUL Instructions
7708//===----------------------------------------------------------------------===//
7709
7710// AVX carry-less Multiplication instructions
// VEX-encoded carry-less multiply: non-destructive three-operand form with
// an immediate selecting which 64-bit halves of each source are multiplied.
let isCommutable = 1 in
def VPCLMULQDQrr : AVXPCLMULIi8<0x44, MRMSrcReg, (outs VR128:$dst),
           (ins VR128:$src1, VR128:$src2, u8imm:$src3),
           "vpclmulqdq\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
           [(set VR128:$dst,
             (int_x86_pclmulqdq VR128:$src1, VR128:$src2, imm:$src3))]>,
           Sched<[WriteCLMul]>;

def VPCLMULQDQrm : AVXPCLMULIi8<0x44, MRMSrcMem, (outs VR128:$dst),
           (ins VR128:$src1, i128mem:$src2, u8imm:$src3),
           "vpclmulqdq\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
           [(set VR128:$dst, (int_x86_pclmulqdq VR128:$src1,
                              (loadv2i64 addr:$src2), imm:$src3))]>,
           Sched<[WriteCLMulLd, ReadAfterLd]>;
7725
7726// Carry-less Multiplication instructions
// Legacy (non-VEX) carry-less multiply: destructive two-operand form,
// aligned memory operand.
let Constraints = "$src1 = $dst" in {
let isCommutable = 1 in
def PCLMULQDQrr : PCLMULIi8<0x44, MRMSrcReg, (outs VR128:$dst),
           (ins VR128:$src1, VR128:$src2, u8imm:$src3),
           "pclmulqdq\t{$src3, $src2, $dst|$dst, $src2, $src3}",
           [(set VR128:$dst,
             (int_x86_pclmulqdq VR128:$src1, VR128:$src2, imm:$src3))],
             IIC_SSE_PCLMULQDQ_RR>, Sched<[WriteCLMul]>;

def PCLMULQDQrm : PCLMULIi8<0x44, MRMSrcMem, (outs VR128:$dst),
           (ins VR128:$src1, i128mem:$src2, u8imm:$src3),
           "pclmulqdq\t{$src3, $src2, $dst|$dst, $src2, $src3}",
           [(set VR128:$dst, (int_x86_pclmulqdq VR128:$src1,
                              (memopv2i64 addr:$src2), imm:$src3))],
                              IIC_SSE_PCLMULQDQ_RM>,
           Sched<[WriteCLMulLd, ReadAfterLd]>;
} // Constraints = "$src1 = $dst"
7744
7745
// Mnemonic aliases for pclmulqdq: each half-selector spelling (e.g.
// "pclmulhqlqdq") maps to the base instruction with the corresponding
// immediate (immop) hard-coded. The trailing 0 marks these as
// parse-only aliases (not used for printing).
multiclass pclmul_alias<string asm, int immop> {
  def : InstAlias<!strconcat("pclmul", asm, "dq {$src, $dst|$dst, $src}"),
                  (PCLMULQDQrr VR128:$dst, VR128:$src, immop), 0>;

  def : InstAlias<!strconcat("pclmul", asm, "dq {$src, $dst|$dst, $src}"),
                  (PCLMULQDQrm VR128:$dst, i128mem:$src, immop), 0>;

  def : InstAlias<!strconcat("vpclmul", asm,
                             "dq {$src2, $src1, $dst|$dst, $src1, $src2}"),
                  (VPCLMULQDQrr VR128:$dst, VR128:$src1, VR128:$src2, immop),
                  0>;

  def : InstAlias<!strconcat("vpclmul", asm,
                             "dq {$src2, $src1, $dst|$dst, $src1, $src2}"),
                  (VPCLMULQDQrm VR128:$dst, VR128:$src1, i128mem:$src2, immop),
                  0>;
}
// Immediate encoding: bit 0 selects the half of src1, bit 4 the half of src2.
defm : pclmul_alias<"hqhq", 0x11>;
defm : pclmul_alias<"hqlq", 0x01>;
defm : pclmul_alias<"lqhq", 0x10>;
defm : pclmul_alias<"lqlq", 0x00>;
7767
7768//===----------------------------------------------------------------------===//
7769// SSE4A Instructions
7770//===----------------------------------------------------------------------===//
7771
let Predicates = [HasSSE4A] in {

// EXTRQ/INSERTQ operate in place on $src, so it is tied to $dst.
let Constraints = "$src = $dst" in {
// Immediate form: bit length and starting index given as 8-bit immediates.
def EXTRQI : Ii8<0x78, MRMXr, (outs VR128:$dst),
                 (ins VR128:$src, u8imm:$len, u8imm:$idx),
                 "extrq\t{$idx, $len, $src|$src, $len, $idx}",
                 [(set VR128:$dst, (int_x86_sse4a_extrqi VR128:$src, imm:$len,
                                    imm:$idx))]>, PD;
// Register form: length/index supplied in the $mask register operand.
def EXTRQ  : I<0x79, MRMSrcReg, (outs VR128:$dst),
              (ins VR128:$src, VR128:$mask),
              "extrq\t{$mask, $src|$src, $mask}",
              [(set VR128:$dst, (int_x86_sse4a_extrq VR128:$src,
                                 VR128:$mask))]>, PD;

def INSERTQI : Ii8<0x78, MRMSrcReg, (outs VR128:$dst),
                   (ins VR128:$src, VR128:$src2, u8imm:$len, u8imm:$idx),
                   "insertq\t{$idx, $len, $src2, $src|$src, $src2, $len, $idx}",
                   [(set VR128:$dst, (int_x86_sse4a_insertqi VR128:$src,
                                      VR128:$src2, imm:$len, imm:$idx))]>, XD;
def INSERTQ  : I<0x79, MRMSrcReg, (outs VR128:$dst),
                 (ins VR128:$src, VR128:$mask),
                 "insertq\t{$mask, $src|$src, $mask}",
                 [(set VR128:$dst, (int_x86_sse4a_insertq VR128:$src,
                                    VR128:$mask))]>, XD;
}

// Scalar non-temporal stores of the low 32/64 bits of an XMM register.
def MOVNTSS : I<0x2B, MRMDestMem, (outs), (ins f32mem:$dst, VR128:$src),
                "movntss\t{$src, $dst|$dst, $src}",
                [(int_x86_sse4a_movnt_ss addr:$dst, VR128:$src)]>, XS;

def MOVNTSD : I<0x2B, MRMDestMem, (outs), (ins f64mem:$dst, VR128:$src),
                "movntsd\t{$src, $dst|$dst, $src}",
                [(int_x86_sse4a_movnt_sd addr:$dst, VR128:$src)]>, XD;
}
7806
7807//===----------------------------------------------------------------------===//
7808// AVX Instructions
7809//===----------------------------------------------------------------------===//
7810
7811//===----------------------------------------------------------------------===//
7812// VBROADCAST - Load from memory and broadcast to all elements of the
7813//              destination operand
7814//
// Broadcast-from-memory instruction matching an intrinsic pattern.
class avx_broadcast<bits<8> opc, string OpcodeStr, RegisterClass RC,
                    X86MemOperand x86memop, Intrinsic Int, SchedWrite Sched> :
  AVX8I<opc, MRMSrcMem, (outs RC:$dst), (ins x86memop:$src),
        !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
        [(set RC:$dst, (Int addr:$src))]>, Sched<[Sched]>, VEX;

// Broadcast-from-memory instruction matching the generic X86VBroadcast
// node instead of an intrinsic.
class avx_broadcast_no_int<bits<8> opc, string OpcodeStr, RegisterClass RC,
                           X86MemOperand x86memop, ValueType VT,
                           PatFrag ld_frag, SchedWrite Sched> :
  AVX8I<opc, MRMSrcMem, (outs RC:$dst), (ins x86memop:$src),
        !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
        [(set RC:$dst, (VT (X86VBroadcast (ld_frag addr:$src))))]>,
        Sched<[Sched]>, VEX {
    let mayLoad = 1;
}

// AVX2 adds register forms
// Broadcast of the low element of an XMM register, matching an intrinsic.
class avx2_broadcast_reg<bits<8> opc, string OpcodeStr, RegisterClass RC,
                         Intrinsic Int, SchedWrite Sched> :
  AVX28I<opc, MRMSrcReg, (outs RC:$dst), (ins VR128:$src),
         !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
         [(set RC:$dst, (Int VR128:$src))]>, Sched<[Sched]>, VEX;
7837
// Memory-source broadcast forms (AVX).
let ExeDomain = SSEPackedSingle in {
  def VBROADCASTSSrm  : avx_broadcast_no_int<0x18, "vbroadcastss", VR128,
                                             f32mem, v4f32, loadf32, WriteLoad>;
  def VBROADCASTSSYrm : avx_broadcast_no_int<0x18, "vbroadcastss", VR256,
                                             f32mem, v8f32, loadf32,
                                             WriteFShuffleLd>, VEX_L;
}
let ExeDomain = SSEPackedDouble in
def VBROADCASTSDYrm  : avx_broadcast_no_int<0x19, "vbroadcastsd", VR256, f64mem,
                                    v4f64, loadf64, WriteFShuffleLd>, VEX_L;
def VBROADCASTF128 : avx_broadcast<0x1A, "vbroadcastf128", VR256, f128mem,
                                   int_x86_avx_vbroadcastf128_pd_256,
                                   WriteFShuffleLd>, VEX_L;

// Register-source broadcast forms (AVX2).
let ExeDomain = SSEPackedSingle in {
  def VBROADCASTSSrr  : avx2_broadcast_reg<0x18, "vbroadcastss", VR128,
                                           int_x86_avx2_vbroadcast_ss_ps,
                                           WriteFShuffle>;
  def VBROADCASTSSYrr : avx2_broadcast_reg<0x18, "vbroadcastss", VR256,
                                      int_x86_avx2_vbroadcast_ss_ps_256,
                                      WriteFShuffle256>, VEX_L;
}
let ExeDomain = SSEPackedDouble in
def VBROADCASTSDYrr  : avx2_broadcast_reg<0x19, "vbroadcastsd", VR256,
                                      int_x86_avx2_vbroadcast_sd_pd_256,
                                      WriteFShuffle256>, VEX_L;

// 128-bit integer broadcast from memory (AVX2 only).
let Predicates = [HasAVX2] in
def VBROADCASTI128 : avx_broadcast_no_int<0x5A, "vbroadcasti128", VR256,
                                          i128mem, v4i64, loadv2i64,
                                          WriteLoad>, VEX_L;

// The ps_256 broadcast intrinsic reuses the pd_256-based instruction.
let Predicates = [HasAVX] in
def : Pat<(int_x86_avx_vbroadcastf128_ps_256 addr:$src),
          (VBROADCASTF128 addr:$src)>;
7873
7874
7875//===----------------------------------------------------------------------===//
7876// VINSERTF128 - Insert packed floating-point values
7877//
// Pattern-less instruction definitions; selection happens via the Pat<>
// entries below, which compute the immediate lane index with
// INSERT_get_vinsert128_imm.
let hasSideEffects = 0, ExeDomain = SSEPackedSingle in {
def VINSERTF128rr : AVXAIi8<0x18, MRMSrcReg, (outs VR256:$dst),
          (ins VR256:$src1, VR128:$src2, u8imm:$src3),
          "vinsertf128\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
          []>, Sched<[WriteFShuffle]>, VEX_4V, VEX_L;
let mayLoad = 1 in
def VINSERTF128rm : AVXAIi8<0x18, MRMSrcMem, (outs VR256:$dst),
          (ins VR256:$src1, f128mem:$src2, u8imm:$src3),
          "vinsertf128\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
          []>, Sched<[WriteFShuffleLd, ReadAfterLd]>, VEX_4V, VEX_L;
}

// Floating-point 128-bit subvector inserts, register and load forms.
let Predicates = [HasAVX] in {
def : Pat<(vinsert128_insert:$ins (v8f32 VR256:$src1), (v4f32 VR128:$src2),
                                   (iPTR imm)),
          (VINSERTF128rr VR256:$src1, VR128:$src2,
                         (INSERT_get_vinsert128_imm VR256:$ins))>;
def : Pat<(vinsert128_insert:$ins (v4f64 VR256:$src1), (v2f64 VR128:$src2),
                                   (iPTR imm)),
          (VINSERTF128rr VR256:$src1, VR128:$src2,
                         (INSERT_get_vinsert128_imm VR256:$ins))>;

def : Pat<(vinsert128_insert:$ins (v8f32 VR256:$src1), (loadv4f32 addr:$src2),
                                   (iPTR imm)),
          (VINSERTF128rm VR256:$src1, addr:$src2,
                         (INSERT_get_vinsert128_imm VR256:$ins))>;
def : Pat<(vinsert128_insert:$ins (v4f64 VR256:$src1), (loadv2f64 addr:$src2),
                                   (iPTR imm)),
          (VINSERTF128rm VR256:$src1, addr:$src2,
                         (INSERT_get_vinsert128_imm VR256:$ins))>;
}
7909
// Integer 128-bit subvector inserts via VINSERTF128 — only when AVX2's
// VINSERTI128 is unavailable (HasAVX1Only).
let Predicates = [HasAVX1Only] in {
def : Pat<(vinsert128_insert:$ins (v4i64 VR256:$src1), (v2i64 VR128:$src2),
                                   (iPTR imm)),
          (VINSERTF128rr VR256:$src1, VR128:$src2,
                         (INSERT_get_vinsert128_imm VR256:$ins))>;
def : Pat<(vinsert128_insert:$ins (v8i32 VR256:$src1), (v4i32 VR128:$src2),
                                   (iPTR imm)),
          (VINSERTF128rr VR256:$src1, VR128:$src2,
                         (INSERT_get_vinsert128_imm VR256:$ins))>;
def : Pat<(vinsert128_insert:$ins (v32i8 VR256:$src1), (v16i8 VR128:$src2),
                                   (iPTR imm)),
          (VINSERTF128rr VR256:$src1, VR128:$src2,
                         (INSERT_get_vinsert128_imm VR256:$ins))>;
def : Pat<(vinsert128_insert:$ins (v16i16 VR256:$src1), (v8i16 VR128:$src2),
                                   (iPTR imm)),
          (VINSERTF128rr VR256:$src1, VR128:$src2,
                         (INSERT_get_vinsert128_imm VR256:$ins))>;

// Load forms: narrower integer loads appear as bitcasts of v2i64 loads.
def : Pat<(vinsert128_insert:$ins (v4i64 VR256:$src1), (loadv2i64 addr:$src2),
                                   (iPTR imm)),
          (VINSERTF128rm VR256:$src1, addr:$src2,
                         (INSERT_get_vinsert128_imm VR256:$ins))>;
def : Pat<(vinsert128_insert:$ins (v8i32 VR256:$src1),
                                   (bc_v4i32 (loadv2i64 addr:$src2)),
                                   (iPTR imm)),
          (VINSERTF128rm VR256:$src1, addr:$src2,
                         (INSERT_get_vinsert128_imm VR256:$ins))>;
def : Pat<(vinsert128_insert:$ins (v32i8 VR256:$src1),
                                   (bc_v16i8 (loadv2i64 addr:$src2)),
                                   (iPTR imm)),
          (VINSERTF128rm VR256:$src1, addr:$src2,
                         (INSERT_get_vinsert128_imm VR256:$ins))>;
def : Pat<(vinsert128_insert:$ins (v16i16 VR256:$src1),
                                   (bc_v8i16 (loadv2i64 addr:$src2)),
                                   (iPTR imm)),
          (VINSERTF128rm VR256:$src1, addr:$src2,
                         (INSERT_get_vinsert128_imm VR256:$ins))>;
}
7948
7949//===----------------------------------------------------------------------===//
7950// VEXTRACTF128 - Extract packed floating-point values
7951//
// Pattern-less instruction definitions; the Pat<> entries below select them
// and compute the lane immediate with EXTRACT_get_vextract128_imm.
let hasSideEffects = 0, ExeDomain = SSEPackedSingle in {
def VEXTRACTF128rr : AVXAIi8<0x19, MRMDestReg, (outs VR128:$dst),
          (ins VR256:$src1, u8imm:$src2),
          "vextractf128\t{$src2, $src1, $dst|$dst, $src1, $src2}",
          []>, Sched<[WriteFShuffle]>, VEX, VEX_L;
let mayStore = 1 in
def VEXTRACTF128mr : AVXAIi8<0x19, MRMDestMem, (outs),
          (ins f128mem:$dst, VR256:$src1, u8imm:$src2),
          "vextractf128\t{$src2, $src1, $dst|$dst, $src1, $src2}",
          []>, Sched<[WriteStore]>, VEX, VEX_L;
}

// AVX1 patterns
// Floating-point 128-bit subvector extracts: to register, and
// extract-then-store folded into the mr form.
let Predicates = [HasAVX] in {
def : Pat<(vextract128_extract:$ext VR256:$src1, (iPTR imm)),
          (v4f32 (VEXTRACTF128rr
                    (v8f32 VR256:$src1),
                    (EXTRACT_get_vextract128_imm VR128:$ext)))>;
def : Pat<(vextract128_extract:$ext VR256:$src1, (iPTR imm)),
          (v2f64 (VEXTRACTF128rr
                    (v4f64 VR256:$src1),
                    (EXTRACT_get_vextract128_imm VR128:$ext)))>;

def : Pat<(store (v4f32 (vextract128_extract:$ext (v8f32 VR256:$src1),
                         (iPTR imm))), addr:$dst),
          (VEXTRACTF128mr addr:$dst, VR256:$src1,
           (EXTRACT_get_vextract128_imm VR128:$ext))>;
def : Pat<(store (v2f64 (vextract128_extract:$ext (v4f64 VR256:$src1),
                         (iPTR imm))), addr:$dst),
          (VEXTRACTF128mr addr:$dst, VR256:$src1,
           (EXTRACT_get_vextract128_imm VR128:$ext))>;
}
7984
// Integer 128-bit subvector extracts via VEXTRACTF128 — only when AVX2's
// VEXTRACTI128 is unavailable. Note the store patterns here require
// alignedstore, unlike the FP patterns above.
let Predicates = [HasAVX1Only] in {
def : Pat<(vextract128_extract:$ext VR256:$src1, (iPTR imm)),
          (v2i64 (VEXTRACTF128rr
                  (v4i64 VR256:$src1),
                  (EXTRACT_get_vextract128_imm VR128:$ext)))>;
def : Pat<(vextract128_extract:$ext VR256:$src1, (iPTR imm)),
          (v4i32 (VEXTRACTF128rr
                  (v8i32 VR256:$src1),
                  (EXTRACT_get_vextract128_imm VR128:$ext)))>;
def : Pat<(vextract128_extract:$ext VR256:$src1, (iPTR imm)),
          (v8i16 (VEXTRACTF128rr
                  (v16i16 VR256:$src1),
                  (EXTRACT_get_vextract128_imm VR128:$ext)))>;
def : Pat<(vextract128_extract:$ext VR256:$src1, (iPTR imm)),
          (v16i8 (VEXTRACTF128rr
                  (v32i8 VR256:$src1),
                  (EXTRACT_get_vextract128_imm VR128:$ext)))>;

def : Pat<(alignedstore (v2i64 (vextract128_extract:$ext (v4i64 VR256:$src1),
                                (iPTR imm))), addr:$dst),
          (VEXTRACTF128mr addr:$dst, VR256:$src1,
           (EXTRACT_get_vextract128_imm VR128:$ext))>;
def : Pat<(alignedstore (v4i32 (vextract128_extract:$ext (v8i32 VR256:$src1),
                                (iPTR imm))), addr:$dst),
          (VEXTRACTF128mr addr:$dst, VR256:$src1,
           (EXTRACT_get_vextract128_imm VR128:$ext))>;
def : Pat<(alignedstore (v8i16 (vextract128_extract:$ext (v16i16 VR256:$src1),
                                (iPTR imm))), addr:$dst),
          (VEXTRACTF128mr addr:$dst, VR256:$src1,
           (EXTRACT_get_vextract128_imm VR128:$ext))>;
def : Pat<(alignedstore (v16i8 (vextract128_extract:$ext (v32i8 VR256:$src1),
                                (iPTR imm))), addr:$dst),
          (VEXTRACTF128mr addr:$dst, VR256:$src1,
           (EXTRACT_get_vextract128_imm VR128:$ext))>;
}
8020
//===----------------------------------------------------------------------===//
// VMASKMOV - Conditional SIMD Packed Loads and Stores
//
// Emits the four forms of a masked move: 128/256-bit masked load (rm/Yrm)
// and 128/256-bit masked store (mr/Ymr), selected via intrinsics. $src1 is
// the mask operand for loads; for stores $src1 is the mask and $src2 the data.
multiclass avx_movmask_rm<bits<8> opc_rm, bits<8> opc_mr, string OpcodeStr,
                          Intrinsic IntLd, Intrinsic IntLd256,
                          Intrinsic IntSt, Intrinsic IntSt256> {
  def rm  : AVX8I<opc_rm, MRMSrcMem, (outs VR128:$dst),
             (ins VR128:$src1, f128mem:$src2),
             !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
             [(set VR128:$dst, (IntLd addr:$src2, VR128:$src1))]>,
             VEX_4V;
  def Yrm : AVX8I<opc_rm, MRMSrcMem, (outs VR256:$dst),
             (ins VR256:$src1, f256mem:$src2),
             !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
             [(set VR256:$dst, (IntLd256 addr:$src2, VR256:$src1))]>,
             VEX_4V, VEX_L;
  def mr  : AVX8I<opc_mr, MRMDestMem, (outs),
             (ins f128mem:$dst, VR128:$src1, VR128:$src2),
             !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
             [(IntSt addr:$dst, VR128:$src1, VR128:$src2)]>, VEX_4V;
  def Ymr : AVX8I<opc_mr, MRMDestMem, (outs),
             (ins f256mem:$dst, VR256:$src1, VR256:$src2),
             !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
             [(IntSt256 addr:$dst, VR256:$src1, VR256:$src2)]>, VEX_4V, VEX_L;
}
8046
// Instantiate the masked load/store forms for single and double precision.
let ExeDomain = SSEPackedSingle in
defm VMASKMOVPS : avx_movmask_rm<0x2C, 0x2E, "vmaskmovps",
                                 int_x86_avx_maskload_ps,
                                 int_x86_avx_maskload_ps_256,
                                 int_x86_avx_maskstore_ps,
                                 int_x86_avx_maskstore_ps_256>;
let ExeDomain = SSEPackedDouble in
defm VMASKMOVPD : avx_movmask_rm<0x2D, 0x2F, "vmaskmovpd",
                                 int_x86_avx_maskload_pd,
                                 int_x86_avx_maskload_pd_256,
                                 int_x86_avx_maskstore_pd,
                                 int_x86_avx_maskstore_pd_256>;
8059
//===----------------------------------------------------------------------===//
// VPERMIL - Permute Single and Double Floating-Point Values
//
// Four forms per instruction: variable-control rr/rm (control in a register
// or integer memory operand, selected via intrinsic IntVar) and
// immediate-control ri/mi (selected via the X86VPermilpi node).
multiclass avx_permil<bits<8> opc_rm, bits<8> opc_rmi, string OpcodeStr,
                      RegisterClass RC, X86MemOperand x86memop_f,
                      X86MemOperand x86memop_i, PatFrag i_frag,
                      Intrinsic IntVar, ValueType vt> {
  def rr  : AVX8I<opc_rm, MRMSrcReg, (outs RC:$dst),
             (ins RC:$src1, RC:$src2),
             !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
             [(set RC:$dst, (IntVar RC:$src1, RC:$src2))]>, VEX_4V,
             Sched<[WriteFShuffle]>;
  def rm  : AVX8I<opc_rm, MRMSrcMem, (outs RC:$dst),
             (ins RC:$src1, x86memop_i:$src2),
             !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
             [(set RC:$dst, (IntVar RC:$src1,
                             (bitconvert (i_frag addr:$src2))))]>, VEX_4V,
             Sched<[WriteFShuffleLd, ReadAfterLd]>;

  // Immediate forms take the FP memory operand type and the i8 control imm.
  def ri  : AVXAIi8<opc_rmi, MRMSrcReg, (outs RC:$dst),
             (ins RC:$src1, u8imm:$src2),
             !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
             [(set RC:$dst, (vt (X86VPermilpi RC:$src1, (i8 imm:$src2))))]>, VEX,
             Sched<[WriteFShuffle]>;
  def mi  : AVXAIi8<opc_rmi, MRMSrcMem, (outs RC:$dst),
             (ins x86memop_f:$src1, u8imm:$src2),
             !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
             [(set RC:$dst,
               (vt (X86VPermilpi (load addr:$src1), (i8 imm:$src2))))]>, VEX,
             Sched<[WriteFShuffleLd]>;
}
8091
// Instantiate VPERMILPS/VPERMILPD for 128-bit (VR128) and 256-bit (VR256,
// "Y"-suffixed, VEX_L) register classes.
let ExeDomain = SSEPackedSingle in {
  defm VPERMILPS  : avx_permil<0x0C, 0x04, "vpermilps", VR128, f128mem, i128mem,
                               loadv2i64, int_x86_avx_vpermilvar_ps, v4f32>;
  defm VPERMILPSY : avx_permil<0x0C, 0x04, "vpermilps", VR256, f256mem, i256mem,
                       loadv4i64, int_x86_avx_vpermilvar_ps_256, v8f32>, VEX_L;
}
let ExeDomain = SSEPackedDouble in {
  defm VPERMILPD  : avx_permil<0x0D, 0x05, "vpermilpd", VR128, f128mem, i128mem,
                               loadv2i64, int_x86_avx_vpermilvar_pd, v2f64>;
  defm VPERMILPDY : avx_permil<0x0D, 0x05, "vpermilpd", VR256, f256mem, i256mem,
                       loadv4i64, int_x86_avx_vpermilvar_pd_256, v4f64>, VEX_L;
}
8104
// Select X86VPermilpv (variable control) and X86VPermilpi (immediate control)
// DAG nodes onto the VPERMILPS/VPERMILPD forms defined above, including
// integer-typed results and folded loads of the control/source operands.
let Predicates = [HasAVX] in {
// 256-bit variable-control forms.
def : Pat<(v8f32 (X86VPermilpv VR256:$src1, (v8i32 VR256:$src2))),
          (VPERMILPSYrr VR256:$src1, VR256:$src2)>;
def : Pat<(v8f32 (X86VPermilpv VR256:$src1, (bc_v8i32 (loadv4i64 addr:$src2)))),
          (VPERMILPSYrm VR256:$src1, addr:$src2)>;
def : Pat<(v4f64 (X86VPermilpv VR256:$src1, (v4i64 VR256:$src2))),
          (VPERMILPDYrr VR256:$src1, VR256:$src2)>;
def : Pat<(v4f64 (X86VPermilpv VR256:$src1, (loadv4i64 addr:$src2))),
          (VPERMILPDYrm VR256:$src1, addr:$src2)>;

// 256-bit immediate-control forms with integer result types.
def : Pat<(v8i32 (X86VPermilpi VR256:$src1, (i8 imm:$imm))),
          (VPERMILPSYri VR256:$src1, imm:$imm)>;
def : Pat<(v4i64 (X86VPermilpi VR256:$src1, (i8 imm:$imm))),
          (VPERMILPDYri VR256:$src1, imm:$imm)>;
def : Pat<(v8i32 (X86VPermilpi (bc_v8i32 (loadv4i64 addr:$src1)),
                               (i8 imm:$imm))),
          (VPERMILPSYmi addr:$src1, imm:$imm)>;
def : Pat<(v4i64 (X86VPermilpi (loadv4i64 addr:$src1), (i8 imm:$imm))),
          (VPERMILPDYmi addr:$src1, imm:$imm)>;

// 128-bit variable-control forms.
def : Pat<(v4f32 (X86VPermilpv VR128:$src1, (v4i32 VR128:$src2))),
          (VPERMILPSrr VR128:$src1, VR128:$src2)>;
def : Pat<(v4f32 (X86VPermilpv VR128:$src1, (bc_v4i32 (loadv2i64 addr:$src2)))),
          (VPERMILPSrm VR128:$src1, addr:$src2)>;
def : Pat<(v2f64 (X86VPermilpv VR128:$src1, (v2i64 VR128:$src2))),
          (VPERMILPDrr VR128:$src1, VR128:$src2)>;
def : Pat<(v2f64 (X86VPermilpv VR128:$src1, (loadv2i64 addr:$src2))),
          (VPERMILPDrm VR128:$src1, addr:$src2)>;

// 128-bit immediate-control forms with integer result type.
def : Pat<(v2i64 (X86VPermilpi VR128:$src1, (i8 imm:$imm))),
          (VPERMILPDri VR128:$src1, imm:$imm)>;
def : Pat<(v2i64 (X86VPermilpi (loadv2i64 addr:$src1), (i8 imm:$imm))),
          (VPERMILPDmi addr:$src1, imm:$imm)>;
}
8139
//===----------------------------------------------------------------------===//
// VPERM2F128 - Permute Floating-Point Values in 128-bit chunks
//
// The immediate selects which 128-bit lane of $src1/$src2 lands in each half
// of the destination. Only the v8f32 pattern is attached here; other types
// are handled by the Pat<>s below.
let ExeDomain = SSEPackedSingle in {
def VPERM2F128rr : AVXAIi8<0x06, MRMSrcReg, (outs VR256:$dst),
          (ins VR256:$src1, VR256:$src2, u8imm:$src3),
          "vperm2f128\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
          [(set VR256:$dst, (v8f32 (X86VPerm2x128 VR256:$src1, VR256:$src2,
                              (i8 imm:$src3))))]>, VEX_4V, VEX_L,
          Sched<[WriteFShuffle]>;
def VPERM2F128rm : AVXAIi8<0x06, MRMSrcMem, (outs VR256:$dst),
          (ins VR256:$src1, f256mem:$src2, u8imm:$src3),
          "vperm2f128\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
          [(set VR256:$dst, (X86VPerm2x128 VR256:$src1, (loadv8f32 addr:$src2),
                             (i8 imm:$src3)))]>, VEX_4V, VEX_L,
          Sched<[WriteFShuffleLd, ReadAfterLd]>;
}
8157
// v4f64 X86VPerm2x128 nodes also select to VPERM2F128 (register and
// folded-load forms).
let Predicates = [HasAVX] in {
def : Pat<(v4f64 (X86VPerm2x128 VR256:$src1, VR256:$src2, (i8 imm:$imm))),
          (VPERM2F128rr VR256:$src1, VR256:$src2, imm:$imm)>;
def : Pat<(v4f64 (X86VPerm2x128 VR256:$src1,
                  (loadv4f64 addr:$src2), (i8 imm:$imm))),
          (VPERM2F128rm VR256:$src1, addr:$src2, imm:$imm)>;
}
8165
// Without AVX2 (no VPERM2I128), integer 128-bit lane permutes go through the
// floating-point VPERM2F128 instruction instead.
let Predicates = [HasAVX1Only] in {
def : Pat<(v8i32 (X86VPerm2x128 VR256:$src1, VR256:$src2, (i8 imm:$imm))),
          (VPERM2F128rr VR256:$src1, VR256:$src2, imm:$imm)>;
def : Pat<(v4i64 (X86VPerm2x128 VR256:$src1, VR256:$src2, (i8 imm:$imm))),
          (VPERM2F128rr VR256:$src1, VR256:$src2, imm:$imm)>;
def : Pat<(v32i8 (X86VPerm2x128 VR256:$src1, VR256:$src2, (i8 imm:$imm))),
          (VPERM2F128rr VR256:$src1, VR256:$src2, imm:$imm)>;
def : Pat<(v16i16 (X86VPerm2x128 VR256:$src1, VR256:$src2, (i8 imm:$imm))),
          (VPERM2F128rr VR256:$src1, VR256:$src2, imm:$imm)>;

// Folded-load variants; the loads are v4i64 with bitconverts to the other
// integer element widths.
def : Pat<(v8i32 (X86VPerm2x128 VR256:$src1,
                  (bc_v8i32 (loadv4i64 addr:$src2)), (i8 imm:$imm))),
          (VPERM2F128rm VR256:$src1, addr:$src2, imm:$imm)>;
def : Pat<(v4i64 (X86VPerm2x128 VR256:$src1,
                  (loadv4i64 addr:$src2), (i8 imm:$imm))),
          (VPERM2F128rm VR256:$src1, addr:$src2, imm:$imm)>;
def : Pat<(v32i8 (X86VPerm2x128 VR256:$src1,
                  (bc_v32i8 (loadv4i64 addr:$src2)), (i8 imm:$imm))),
          (VPERM2F128rm VR256:$src1, addr:$src2, imm:$imm)>;
def : Pat<(v16i16 (X86VPerm2x128 VR256:$src1,
                  (bc_v16i16 (loadv4i64 addr:$src2)), (i8 imm:$imm))),
          (VPERM2F128rm VR256:$src1, addr:$src2, imm:$imm)>;
}
8189
//===----------------------------------------------------------------------===//
// VZERO - Zero YMM registers
//
// Both instructions clobber (at least the upper halves of) all 16 YMM
// registers, so list every YMM register as a def.
let Defs = [YMM0, YMM1, YMM2, YMM3, YMM4, YMM5, YMM6, YMM7,
            YMM8, YMM9, YMM10, YMM11, YMM12, YMM13, YMM14, YMM15] in {
  // Zero All YMM registers
  def VZEROALL : I<0x77, RawFrm, (outs), (ins), "vzeroall",
                  [(int_x86_avx_vzeroall)]>, PS, VEX, VEX_L, Requires<[HasAVX]>;

  // Zero Upper bits of YMM registers
  def VZEROUPPER : I<0x77, RawFrm, (outs), (ins), "vzeroupper",
                     [(int_x86_avx_vzeroupper)]>, PS, VEX, Requires<[HasAVX]>;
}
8203
//===----------------------------------------------------------------------===//
// Half precision conversion instructions
//===----------------------------------------------------------------------===//
// VCVTPH2PS: half -> single conversion. The memory form carries no pattern
// (selection handled by explicit Pat<>s), hence hasSideEffects = 0 there.
multiclass f16c_ph2ps<RegisterClass RC, X86MemOperand x86memop, Intrinsic Int> {
  def rr : I<0x13, MRMSrcReg, (outs RC:$dst), (ins VR128:$src),
             "vcvtph2ps\t{$src, $dst|$dst, $src}",
             [(set RC:$dst, (Int VR128:$src))]>,
             T8PD, VEX, Sched<[WriteCvtF2F]>;
  let hasSideEffects = 0, mayLoad = 1 in
  def rm : I<0x13, MRMSrcMem, (outs RC:$dst), (ins x86memop:$src),
             "vcvtph2ps\t{$src, $dst|$dst, $src}", []>, T8PD, VEX,
             Sched<[WriteCvtF2FLd]>;
}
8217
// VCVTPS2PH: single -> half conversion; $src2 is the rounding-control imm.
// NOTE(review): the store form's SchedRW uses WriteCvtF2FLd, a load class,
// for a mayStore instruction — looks suspicious; confirm intended sched class.
multiclass f16c_ps2ph<RegisterClass RC, X86MemOperand x86memop, Intrinsic Int> {
  def rr : Ii8<0x1D, MRMDestReg, (outs VR128:$dst),
               (ins RC:$src1, i32u8imm:$src2),
               "vcvtps2ph\t{$src2, $src1, $dst|$dst, $src1, $src2}",
               [(set VR128:$dst, (Int RC:$src1, imm:$src2))]>,
               TAPD, VEX, Sched<[WriteCvtF2F]>;
  let hasSideEffects = 0, mayStore = 1,
      SchedRW = [WriteCvtF2FLd, WriteRMW] in
  def mr : Ii8<0x1D, MRMDestMem, (outs),
               (ins x86memop:$dst, RC:$src1, i32u8imm:$src2),
               "vcvtps2ph\t{$src2, $src1, $dst|$dst, $src1, $src2}", []>,
               TAPD, VEX;
}
8231
let Predicates = [HasF16C] in {
  // 128-bit forms read/write 64/128-bit memory; 256-bit ("Y") forms use VEX_L.
  defm VCVTPH2PS  : f16c_ph2ps<VR128, f64mem, int_x86_vcvtph2ps_128>;
  defm VCVTPH2PSY : f16c_ph2ps<VR256, f128mem, int_x86_vcvtph2ps_256>, VEX_L;
  defm VCVTPS2PH  : f16c_ps2ph<VR128, f64mem, int_x86_vcvtps2ph_128>;
  defm VCVTPS2PHY : f16c_ps2ph<VR256, f128mem, int_x86_vcvtps2ph_256>, VEX_L;

  // Pattern match vcvtph2ps of a scalar i64 load.
  def : Pat<(int_x86_vcvtph2ps_128 (vzmovl_v2i64 addr:$src)),
            (VCVTPH2PSrm addr:$src)>;
  def : Pat<(int_x86_vcvtph2ps_128 (vzload_v2i64 addr:$src)),
            (VCVTPH2PSrm addr:$src)>;
}
8244
// Patterns for  matching conversions from float to half-float and vice versa.
let Predicates = [HasF16C] in {
  // fp_to_f16: convert in the low lane of an XMM reg, move the packed result
  // to a GPR, and take the low 16 bits.
  def : Pat<(fp_to_f16 FR32:$src),
            (i16 (EXTRACT_SUBREG (VMOVPDI2DIrr (VCVTPS2PHrr
              (COPY_TO_REGCLASS FR32:$src, VR128), 0)), sub_16bit))>;

  // f16_to_fp: widen the GR16 input to 32 bits, move into an XMM reg,
  // convert, and copy the scalar result back out.
  def : Pat<(f16_to_fp GR16:$src),
            (f32 (COPY_TO_REGCLASS (VCVTPH2PSrr
              (COPY_TO_REGCLASS (MOVSX32rr16 GR16:$src), VR128)), FR32)) >;

  // Round-trip f32 -> f16 -> f32 stays entirely in vector registers.
  def : Pat<(f16_to_fp (i16 (fp_to_f16 FR32:$src))),
            (f32 (COPY_TO_REGCLASS (VCVTPH2PSrr
              (VCVTPS2PHrr (COPY_TO_REGCLASS FR32:$src, VR128), 0)), FR32)) >;
}
8259
//===----------------------------------------------------------------------===//
// AVX2 Instructions
//===----------------------------------------------------------------------===//

/// AVX2_binop_rmi - AVX2 binary operator with 8-bit immediate
// Register-register (rri, commutable) and register-memory (rmi) forms.
multiclass AVX2_binop_rmi<bits<8> opc, string OpcodeStr, SDNode OpNode,
                          ValueType OpVT, RegisterClass RC, PatFrag memop_frag,
                          X86MemOperand x86memop> {
  let isCommutable = 1 in
  def rri : AVX2AIi8<opc, MRMSrcReg, (outs RC:$dst),
        (ins RC:$src1, RC:$src2, u8imm:$src3),
        !strconcat(OpcodeStr,
            "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
        [(set RC:$dst, (OpVT (OpNode RC:$src1, RC:$src2, imm:$src3)))]>,
        Sched<[WriteBlend]>, VEX_4V;
  def rmi : AVX2AIi8<opc, MRMSrcMem, (outs RC:$dst),
        (ins RC:$src1, x86memop:$src2, u8imm:$src3),
        !strconcat(OpcodeStr,
            "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
        [(set RC:$dst,
          (OpVT (OpNode RC:$src1,
           (bitconvert (memop_frag addr:$src2)), imm:$src3)))]>,
        Sched<[WriteBlendLd, ReadAfterLd]>, VEX_4V;
}
8284
// VPBLENDD: dword blend under immediate control, 128- and 256-bit forms.
defm VPBLENDD : AVX2_binop_rmi<0x02, "vpblendd", X86Blendi, v4i32,
                               VR128, loadv2i64, i128mem>;
defm VPBLENDDY : AVX2_binop_rmi<0x02, "vpblendd", X86Blendi, v8i32,
                                VR256, loadv4i64, i256mem>, VEX_L;
8289
//===----------------------------------------------------------------------===//
// VPBROADCAST - Load from memory and broadcast to all elements of the
//               destination operand
//
// Four forms: rr/rm broadcast to a 128-bit destination, Yrr/Yrm to a 256-bit
// destination. The memory forms broadcast a single scalar element.
multiclass avx2_broadcast<bits<8> opc, string OpcodeStr,
                          X86MemOperand x86memop, PatFrag ld_frag,
                          Intrinsic Int128, Intrinsic Int256> {
  def rr : AVX28I<opc, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
                  !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
                  [(set VR128:$dst, (Int128 VR128:$src))]>,
                  Sched<[WriteShuffle]>, VEX;
  def rm : AVX28I<opc, MRMSrcMem, (outs VR128:$dst), (ins x86memop:$src),
                  !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
                  [(set VR128:$dst,
                    (Int128 (scalar_to_vector (ld_frag addr:$src))))]>,
                  Sched<[WriteLoad]>, VEX;
  def Yrr : AVX28I<opc, MRMSrcReg, (outs VR256:$dst), (ins VR128:$src),
                   !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
                   [(set VR256:$dst, (Int256 VR128:$src))]>,
                   Sched<[WriteShuffle256]>, VEX, VEX_L;
  def Yrm : AVX28I<opc, MRMSrcMem, (outs VR256:$dst), (ins x86memop:$src),
                   !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
                   [(set VR256:$dst,
                    (Int256 (scalar_to_vector (ld_frag addr:$src))))]>,
                   Sched<[WriteLoad]>, VEX, VEX_L;
}
8316
// Instantiate byte/word/dword/qword broadcasts with the matching scalar
// memory operand and load fragment.
defm VPBROADCASTB  : avx2_broadcast<0x78, "vpbroadcastb", i8mem, loadi8,
                                    int_x86_avx2_pbroadcastb_128,
                                    int_x86_avx2_pbroadcastb_256>;
defm VPBROADCASTW  : avx2_broadcast<0x79, "vpbroadcastw", i16mem, loadi16,
                                    int_x86_avx2_pbroadcastw_128,
                                    int_x86_avx2_pbroadcastw_256>;
defm VPBROADCASTD  : avx2_broadcast<0x58, "vpbroadcastd", i32mem, loadi32,
                                    int_x86_avx2_pbroadcastd_128,
                                    int_x86_avx2_pbroadcastd_256>;
defm VPBROADCASTQ  : avx2_broadcast<0x59, "vpbroadcastq", i64mem, loadi64,
                                    int_x86_avx2_pbroadcastq_128,
                                    int_x86_avx2_pbroadcastq_256>;
8329
let Predicates = [HasAVX2] in {
  // Broadcast of a scalar load: select directly to the memory forms.
  def : Pat<(v16i8 (X86VBroadcast (loadi8 addr:$src))),
          (VPBROADCASTBrm addr:$src)>;
  def : Pat<(v32i8 (X86VBroadcast (loadi8 addr:$src))),
          (VPBROADCASTBYrm addr:$src)>;
  def : Pat<(v8i16 (X86VBroadcast (loadi16 addr:$src))),
          (VPBROADCASTWrm addr:$src)>;
  def : Pat<(v16i16 (X86VBroadcast (loadi16 addr:$src))),
          (VPBROADCASTWYrm addr:$src)>;
  def : Pat<(v4i32 (X86VBroadcast (loadi32 addr:$src))),
          (VPBROADCASTDrm addr:$src)>;
  def : Pat<(v8i32 (X86VBroadcast (loadi32 addr:$src))),
          (VPBROADCASTDYrm addr:$src)>;
  def : Pat<(v2i64 (X86VBroadcast (loadi64 addr:$src))),
          (VPBROADCASTQrm addr:$src)>;
  def : Pat<(v4i64 (X86VBroadcast (loadi64 addr:$src))),
          (VPBROADCASTQYrm addr:$src)>;

  // Broadcast of element 0 of a 128-bit register: register forms. FP
  // broadcasts use VBROADCASTSS/SD where available; note v2f64 uses the
  // integer VPBROADCASTQrr (bit-identical for a 64-bit element).
  def : Pat<(v16i8 (X86VBroadcast (v16i8 VR128:$src))),
          (VPBROADCASTBrr VR128:$src)>;
  def : Pat<(v32i8 (X86VBroadcast (v16i8 VR128:$src))),
          (VPBROADCASTBYrr VR128:$src)>;
  def : Pat<(v8i16 (X86VBroadcast (v8i16 VR128:$src))),
          (VPBROADCASTWrr VR128:$src)>;
  def : Pat<(v16i16 (X86VBroadcast (v8i16 VR128:$src))),
          (VPBROADCASTWYrr VR128:$src)>;
  def : Pat<(v4i32 (X86VBroadcast (v4i32 VR128:$src))),
          (VPBROADCASTDrr VR128:$src)>;
  def : Pat<(v8i32 (X86VBroadcast (v4i32 VR128:$src))),
          (VPBROADCASTDYrr VR128:$src)>;
  def : Pat<(v2i64 (X86VBroadcast (v2i64 VR128:$src))),
          (VPBROADCASTQrr VR128:$src)>;
  def : Pat<(v4i64 (X86VBroadcast (v2i64 VR128:$src))),
          (VPBROADCASTQYrr VR128:$src)>;
  def : Pat<(v4f32 (X86VBroadcast (v4f32 VR128:$src))),
          (VBROADCASTSSrr VR128:$src)>;
  def : Pat<(v8f32 (X86VBroadcast (v4f32 VR128:$src))),
          (VBROADCASTSSYrr VR128:$src)>;
  def : Pat<(v2f64 (X86VBroadcast (v2f64 VR128:$src))),
          (VPBROADCASTQrr VR128:$src)>;
  def : Pat<(v4f64 (X86VBroadcast (v2f64 VR128:$src))),
          (VBROADCASTSDYrr VR128:$src)>;

  // Provide aliases for broadcast from the same register class that
  // automatically does the extract.
  def : Pat<(v32i8 (X86VBroadcast (v32i8 VR256:$src))),
            (VPBROADCASTBYrr (v16i8 (EXTRACT_SUBREG (v32i8 VR256:$src),
                                                    sub_xmm)))>;
  def : Pat<(v16i16 (X86VBroadcast (v16i16 VR256:$src))),
            (VPBROADCASTWYrr (v8i16 (EXTRACT_SUBREG (v16i16 VR256:$src),
                                                    sub_xmm)))>;
  def : Pat<(v8i32 (X86VBroadcast (v8i32 VR256:$src))),
            (VPBROADCASTDYrr (v4i32 (EXTRACT_SUBREG (v8i32 VR256:$src),
                                                    sub_xmm)))>;
  def : Pat<(v4i64 (X86VBroadcast (v4i64 VR256:$src))),
            (VPBROADCASTQYrr (v2i64 (EXTRACT_SUBREG (v4i64 VR256:$src),
                                                    sub_xmm)))>;
  def : Pat<(v8f32 (X86VBroadcast (v8f32 VR256:$src))),
            (VBROADCASTSSYrr (v4f32 (EXTRACT_SUBREG (v8f32 VR256:$src),
                                                    sub_xmm)))>;
  def : Pat<(v4f64 (X86VBroadcast (v4f64 VR256:$src))),
            (VBROADCASTSDYrr (v2f64 (EXTRACT_SUBREG (v4f64 VR256:$src),
                                                    sub_xmm)))>;

  // Provide fallback in case the load node that is used in the patterns above
  // is used by additional users, which prevents the pattern selection.
  let AddedComplexity = 20 in {
    def : Pat<(v4f32 (X86VBroadcast FR32:$src)),
              (VBROADCASTSSrr (COPY_TO_REGCLASS FR32:$src, VR128))>;
    def : Pat<(v8f32 (X86VBroadcast FR32:$src)),
              (VBROADCASTSSYrr (COPY_TO_REGCLASS FR32:$src, VR128))>;
    def : Pat<(v4f64 (X86VBroadcast FR64:$src)),
              (VBROADCASTSDYrr (COPY_TO_REGCLASS FR64:$src, VR128))>;

    def : Pat<(v4i32 (X86VBroadcast GR32:$src)),
              (VBROADCASTSSrr (COPY_TO_REGCLASS GR32:$src, VR128))>;
    def : Pat<(v8i32 (X86VBroadcast GR32:$src)),
              (VBROADCASTSSYrr (COPY_TO_REGCLASS GR32:$src, VR128))>;
    def : Pat<(v4i64 (X86VBroadcast GR64:$src)),
              (VBROADCASTSDYrr (COPY_TO_REGCLASS GR64:$src, VR128))>;

    // GR8/GR16 sources are first widened to 32 bits (upper bits are
    // irrelevant since only the low element is broadcast).
    def : Pat<(v16i8 (X86VBroadcast GR8:$src)),
          (VPBROADCASTBrr (COPY_TO_REGCLASS
                           (i32 (SUBREG_TO_REG (i32 0), GR8:$src, sub_8bit)),
                           VR128))>;
    def : Pat<(v32i8 (X86VBroadcast GR8:$src)),
          (VPBROADCASTBYrr (COPY_TO_REGCLASS
                            (i32 (SUBREG_TO_REG (i32 0), GR8:$src, sub_8bit)),
                            VR128))>;

    def : Pat<(v8i16 (X86VBroadcast GR16:$src)),
          (VPBROADCASTWrr (COPY_TO_REGCLASS
                           (i32 (SUBREG_TO_REG (i32 0), GR16:$src, sub_16bit)),
                           VR128))>;
    def : Pat<(v16i16 (X86VBroadcast GR16:$src)),
          (VPBROADCASTWYrr (COPY_TO_REGCLASS
                            (i32 (SUBREG_TO_REG (i32 0), GR16:$src, sub_16bit)),
                            VR128))>;

    // The patterns for VPBROADCASTD are not needed because they would match
    // the exact same thing as VBROADCASTSS patterns.

    def : Pat<(v2i64 (X86VBroadcast GR64:$src)),
          (VPBROADCASTQrr (COPY_TO_REGCLASS GR64:$src, VR128))>;
    // The v4i64 pattern is not needed because VBROADCASTSDYrr already match.
  }
}
8437
// AVX1 broadcast patterns
// Without AVX2 integer broadcasts, integer broadcast-loads reuse the FP
// VBROADCASTSS/SD memory forms (bit pattern is identical).
let Predicates = [HasAVX1Only] in {
def : Pat<(v8i32 (X86VBroadcast (loadi32 addr:$src))),
          (VBROADCASTSSYrm addr:$src)>;
def : Pat<(v4i64 (X86VBroadcast (loadi64 addr:$src))),
          (VBROADCASTSDYrm addr:$src)>;
def : Pat<(v4i32 (X86VBroadcast (loadi32 addr:$src))),
          (VBROADCASTSSrm addr:$src)>;
}
8447
let Predicates = [HasAVX] in {
  // Provide fallback in case the load node that is used in the patterns above
  // is used by additional users, which prevents the pattern selection.
  let AddedComplexity = 20 in {
  // 128bit broadcasts:
  // AVX1 has no register-source broadcast instruction, so synthesize the
  // splat with VPSHUFD (imm 0 splats element 0; 0x44 duplicates the low
  // 64-bit pair) and, for 256-bit results, VINSERTF128 of the same half.
  def : Pat<(v4f32 (X86VBroadcast FR32:$src)),
            (VPSHUFDri (COPY_TO_REGCLASS FR32:$src, VR128), 0)>;
  def : Pat<(v8f32 (X86VBroadcast FR32:$src)),
            (VINSERTF128rr (INSERT_SUBREG (v8f32 (IMPLICIT_DEF)),
              (VPSHUFDri (COPY_TO_REGCLASS FR32:$src, VR128), 0), sub_xmm),
              (VPSHUFDri (COPY_TO_REGCLASS FR32:$src, VR128), 0), 1)>;
  def : Pat<(v4f64 (X86VBroadcast FR64:$src)),
            (VINSERTF128rr (INSERT_SUBREG (v4f64 (IMPLICIT_DEF)),
              (VPSHUFDri (COPY_TO_REGCLASS FR64:$src, VR128), 0x44), sub_xmm),
              (VPSHUFDri (COPY_TO_REGCLASS FR64:$src, VR128), 0x44), 1)>;

  def : Pat<(v4i32 (X86VBroadcast GR32:$src)),
            (VPSHUFDri (COPY_TO_REGCLASS GR32:$src, VR128), 0)>;
  def : Pat<(v8i32 (X86VBroadcast GR32:$src)),
            (VINSERTF128rr (INSERT_SUBREG (v8i32 (IMPLICIT_DEF)),
              (VPSHUFDri (COPY_TO_REGCLASS GR32:$src, VR128), 0), sub_xmm),
              (VPSHUFDri (COPY_TO_REGCLASS GR32:$src, VR128), 0), 1)>;
  def : Pat<(v4i64 (X86VBroadcast GR64:$src)),
            (VINSERTF128rr (INSERT_SUBREG (v4i64 (IMPLICIT_DEF)),
              (VPSHUFDri (COPY_TO_REGCLASS GR64:$src, VR128), 0x44), sub_xmm),
              (VPSHUFDri (COPY_TO_REGCLASS GR64:$src, VR128), 0x44), 1)>;
  }

  // v2f64 splat: VMOVDDUP duplicates the low double.
  def : Pat<(v2f64 (X86VBroadcast f64:$src)),
            (VMOVDDUPrr (COPY_TO_REGCLASS FR64:$src, VR128))>;
}
8479
//===----------------------------------------------------------------------===//
// VPERM - Permute instructions
//

// Full cross-lane permute with a variable (register/memory) control vector;
// 256-bit only, selected via the X86VPermv node.
multiclass avx2_perm<bits<8> opc, string OpcodeStr, PatFrag mem_frag,
                     ValueType OpVT, X86FoldableSchedWrite Sched> {
  def Yrr : AVX28I<opc, MRMSrcReg, (outs VR256:$dst),
                   (ins VR256:$src1, VR256:$src2),
                   !strconcat(OpcodeStr,
                       "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
                   [(set VR256:$dst,
                     (OpVT (X86VPermv VR256:$src1, VR256:$src2)))]>,
                   Sched<[Sched]>, VEX_4V, VEX_L;
  def Yrm : AVX28I<opc, MRMSrcMem, (outs VR256:$dst),
                   (ins VR256:$src1, i256mem:$src2),
                   !strconcat(OpcodeStr,
                       "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
                   [(set VR256:$dst,
                     (OpVT (X86VPermv VR256:$src1,
                            (bitconvert (mem_frag addr:$src2)))))]>,
                   Sched<[Sched.Folded, ReadAfterLd]>, VEX_4V, VEX_L;
}
8502
// Dword permutes: integer (VPERMD) and single-precision (VPERMPS) variants.
defm VPERMD : avx2_perm<0x36, "vpermd", loadv4i64, v8i32, WriteShuffle256>;
let ExeDomain = SSEPackedSingle in
defm VPERMPS : avx2_perm<0x16, "vpermps", loadv8f32, v8f32, WriteFShuffle256>;
8506
// Qword-granularity cross-lane permute with an immediate control byte;
// 256-bit only, selected via the X86VPermi node.
multiclass avx2_perm_imm<bits<8> opc, string OpcodeStr, PatFrag mem_frag,
                         ValueType OpVT, X86FoldableSchedWrite Sched> {
  def Yri : AVX2AIi8<opc, MRMSrcReg, (outs VR256:$dst),
                     (ins VR256:$src1, u8imm:$src2),
                     !strconcat(OpcodeStr,
                         "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
                     [(set VR256:$dst,
                       (OpVT (X86VPermi VR256:$src1, (i8 imm:$src2))))]>,
                     Sched<[Sched]>, VEX, VEX_L;
  def Ymi : AVX2AIi8<opc, MRMSrcMem, (outs VR256:$dst),
                     (ins i256mem:$src1, u8imm:$src2),
                     !strconcat(OpcodeStr,
                         "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
                     [(set VR256:$dst,
                       (OpVT (X86VPermi (mem_frag addr:$src1),
                              (i8 imm:$src2))))]>,
                     Sched<[Sched.Folded, ReadAfterLd]>, VEX, VEX_L;
}
8525
// Qword permutes: integer (VPERMQ) and double-precision (VPERMPD) variants;
// both require VEX_W.
defm VPERMQ : avx2_perm_imm<0x00, "vpermq", loadv4i64, v4i64,
                            WriteShuffle256>, VEX_W;
let ExeDomain = SSEPackedDouble in
defm VPERMPD : avx2_perm_imm<0x01, "vpermpd", loadv4f64, v4f64,
                             WriteFShuffle256>, VEX_W;
8531
//===----------------------------------------------------------------------===//
// VPERM2I128 - Permute Integer Values in 128-bit chunks
//
// The immediate selects which 128-bit lane of $src1/$src2 lands in each half
// of the destination. Only the v4i64 pattern is attached here; the remaining
// integer types are handled by the Pat<>s below.
def VPERM2I128rr : AVX2AIi8<0x46, MRMSrcReg, (outs VR256:$dst),
          (ins VR256:$src1, VR256:$src2, u8imm:$src3),
          "vperm2i128\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
          [(set VR256:$dst, (v4i64 (X86VPerm2x128 VR256:$src1, VR256:$src2,
                            (i8 imm:$src3))))]>, Sched<[WriteShuffle256]>,
          VEX_4V, VEX_L;
// Memory form. Use i256mem (not f256mem) for this integer-domain instruction,
// consistent with the other AVX2 integer instructions (VINSERTI128rm uses
// i128mem, VPBLENDDY uses i256mem); encoding and asm string are unaffected.
def VPERM2I128rm : AVX2AIi8<0x46, MRMSrcMem, (outs VR256:$dst),
          (ins VR256:$src1, i256mem:$src2, u8imm:$src3),
          "vperm2i128\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
          [(set VR256:$dst, (X86VPerm2x128 VR256:$src1, (loadv4i64 addr:$src2),
                             (i8 imm:$src3)))]>,
          Sched<[WriteShuffle256Ld, ReadAfterLd]>, VEX_4V, VEX_L;
8547
// Select the remaining integer element widths (v8i32/v32i8/v16i16) of
// X86VPerm2x128 onto VPERM2I128, including folded v4i64 loads with
// bitconverts to the narrower element types.
let Predicates = [HasAVX2] in {
def : Pat<(v8i32 (X86VPerm2x128 VR256:$src1, VR256:$src2, (i8 imm:$imm))),
          (VPERM2I128rr VR256:$src1, VR256:$src2, imm:$imm)>;
def : Pat<(v32i8 (X86VPerm2x128 VR256:$src1, VR256:$src2, (i8 imm:$imm))),
          (VPERM2I128rr VR256:$src1, VR256:$src2, imm:$imm)>;
def : Pat<(v16i16 (X86VPerm2x128 VR256:$src1, VR256:$src2, (i8 imm:$imm))),
          (VPERM2I128rr VR256:$src1, VR256:$src2, imm:$imm)>;

def : Pat<(v32i8 (X86VPerm2x128 VR256:$src1, (bc_v32i8 (loadv4i64 addr:$src2)),
                  (i8 imm:$imm))),
          (VPERM2I128rm VR256:$src1, addr:$src2, imm:$imm)>;
def : Pat<(v16i16 (X86VPerm2x128 VR256:$src1,
                   (bc_v16i16 (loadv4i64 addr:$src2)), (i8 imm:$imm))),
          (VPERM2I128rm VR256:$src1, addr:$src2, imm:$imm)>;
def : Pat<(v8i32 (X86VPerm2x128 VR256:$src1, (bc_v8i32 (loadv4i64 addr:$src2)),
                  (i8 imm:$imm))),
          (VPERM2I128rm VR256:$src1, addr:$src2, imm:$imm)>;
}
8566
8567
8568//===----------------------------------------------------------------------===//
8569// VINSERTI128 - Insert packed integer values
8570//
8571let hasSideEffects = 0 in {
8572def VINSERTI128rr : AVX2AIi8<0x38, MRMSrcReg, (outs VR256:$dst),
8573          (ins VR256:$src1, VR128:$src2, u8imm:$src3),
8574          "vinserti128\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
8575          []>, Sched<[WriteShuffle256]>, VEX_4V, VEX_L;
8576let mayLoad = 1 in
8577def VINSERTI128rm : AVX2AIi8<0x38, MRMSrcMem, (outs VR256:$dst),
8578          (ins VR256:$src1, i128mem:$src2, u8imm:$src3),
8579          "vinserti128\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
8580          []>, Sched<[WriteShuffle256Ld, ReadAfterLd]>, VEX_4V, VEX_L;
8581}
8582
// Select VINSERTI128 for 128-bit subvector inserts on each 256-bit integer
// type.  INSERT_get_vinsert128_imm converts the matched insertion index
// into the instruction's lane immediate.
let Predicates = [HasAVX2] in {
def : Pat<(vinsert128_insert:$ins (v4i64 VR256:$src1), (v2i64 VR128:$src2),
                                   (iPTR imm)),
          (VINSERTI128rr VR256:$src1, VR128:$src2,
                         (INSERT_get_vinsert128_imm VR256:$ins))>;
def : Pat<(vinsert128_insert:$ins (v8i32 VR256:$src1), (v4i32 VR128:$src2),
                                   (iPTR imm)),
          (VINSERTI128rr VR256:$src1, VR128:$src2,
                         (INSERT_get_vinsert128_imm VR256:$ins))>;
def : Pat<(vinsert128_insert:$ins (v32i8 VR256:$src1), (v16i8 VR128:$src2),
                                   (iPTR imm)),
          (VINSERTI128rr VR256:$src1, VR128:$src2,
                         (INSERT_get_vinsert128_imm VR256:$ins))>;
def : Pat<(vinsert128_insert:$ins (v16i16 VR256:$src1), (v8i16 VR128:$src2),
                                   (iPTR imm)),
          (VINSERTI128rr VR256:$src1, VR128:$src2,
                         (INSERT_get_vinsert128_imm VR256:$ins))>;

// Fold a load of the inserted 128-bit value.  Loads are canonicalized as
// v2i64, so the narrower element types match through a bitcast.
def : Pat<(vinsert128_insert:$ins (v4i64 VR256:$src1), (loadv2i64 addr:$src2),
                                   (iPTR imm)),
          (VINSERTI128rm VR256:$src1, addr:$src2,
                         (INSERT_get_vinsert128_imm VR256:$ins))>;
def : Pat<(vinsert128_insert:$ins (v8i32 VR256:$src1),
                                   (bc_v4i32 (loadv2i64 addr:$src2)),
                                   (iPTR imm)),
          (VINSERTI128rm VR256:$src1, addr:$src2,
                         (INSERT_get_vinsert128_imm VR256:$ins))>;
def : Pat<(vinsert128_insert:$ins (v32i8 VR256:$src1),
                                   (bc_v16i8 (loadv2i64 addr:$src2)),
                                   (iPTR imm)),
          (VINSERTI128rm VR256:$src1, addr:$src2,
                         (INSERT_get_vinsert128_imm VR256:$ins))>;
def : Pat<(vinsert128_insert:$ins (v16i16 VR256:$src1),
                                   (bc_v8i16 (loadv2i64 addr:$src2)),
                                   (iPTR imm)),
          (VINSERTI128rm VR256:$src1, addr:$src2,
                         (INSERT_get_vinsert128_imm VR256:$ins))>;
}
8621
8622//===----------------------------------------------------------------------===//
8623// VEXTRACTI128 - Extract packed integer values
8624//
8625def VEXTRACTI128rr : AVX2AIi8<0x39, MRMDestReg, (outs VR128:$dst),
8626          (ins VR256:$src1, u8imm:$src2),
8627          "vextracti128\t{$src2, $src1, $dst|$dst, $src1, $src2}", []>,
8628          Sched<[WriteShuffle256]>, VEX, VEX_L;
8629let hasSideEffects = 0, mayStore = 1 in
8630def VEXTRACTI128mr : AVX2AIi8<0x39, MRMDestMem, (outs),
8631          (ins i128mem:$dst, VR256:$src1, u8imm:$src2),
8632          "vextracti128\t{$src2, $src1, $dst|$dst, $src1, $src2}", []>,
8633          Sched<[WriteStore]>, VEX, VEX_L;
8634
// Select VEXTRACTI128 for 128-bit subvector extracts on each 256-bit
// integer type.  EXTRACT_get_vextract128_imm converts the matched extract
// index into the instruction's lane immediate.  Register forms first.
let Predicates = [HasAVX2] in {
def : Pat<(vextract128_extract:$ext VR256:$src1, (iPTR imm)),
          (v2i64 (VEXTRACTI128rr
                    (v4i64 VR256:$src1),
                    (EXTRACT_get_vextract128_imm VR128:$ext)))>;
def : Pat<(vextract128_extract:$ext VR256:$src1, (iPTR imm)),
          (v4i32 (VEXTRACTI128rr
                    (v8i32 VR256:$src1),
                    (EXTRACT_get_vextract128_imm VR128:$ext)))>;
def : Pat<(vextract128_extract:$ext VR256:$src1, (iPTR imm)),
          (v8i16 (VEXTRACTI128rr
                    (v16i16 VR256:$src1),
                    (EXTRACT_get_vextract128_imm VR128:$ext)))>;
def : Pat<(vextract128_extract:$ext VR256:$src1, (iPTR imm)),
          (v16i8 (VEXTRACTI128rr
                    (v32i8 VR256:$src1),
                    (EXTRACT_get_vextract128_imm VR128:$ext)))>;

// Fold (store (extract_subvector ...)) into the direct memory form.
def : Pat<(store (v2i64 (vextract128_extract:$ext (v4i64 VR256:$src1),
                         (iPTR imm))), addr:$dst),
          (VEXTRACTI128mr addr:$dst, VR256:$src1,
           (EXTRACT_get_vextract128_imm VR128:$ext))>;
def : Pat<(store (v4i32 (vextract128_extract:$ext (v8i32 VR256:$src1),
                         (iPTR imm))), addr:$dst),
          (VEXTRACTI128mr addr:$dst, VR256:$src1,
           (EXTRACT_get_vextract128_imm VR128:$ext))>;
def : Pat<(store (v8i16 (vextract128_extract:$ext (v16i16 VR256:$src1),
                         (iPTR imm))), addr:$dst),
          (VEXTRACTI128mr addr:$dst, VR256:$src1,
           (EXTRACT_get_vextract128_imm VR128:$ext))>;
def : Pat<(store (v16i8 (vextract128_extract:$ext (v32i8 VR256:$src1),
                         (iPTR imm))), addr:$dst),
          (VEXTRACTI128mr addr:$dst, VR256:$src1,
           (EXTRACT_get_vextract128_imm VR128:$ext))>;
}
8670
8671//===----------------------------------------------------------------------===//
8672// VPMASKMOV - Conditional SIMD Integer Packed Loads and Stores
8673//
8674multiclass avx2_pmovmask<string OpcodeStr,
8675                         Intrinsic IntLd128, Intrinsic IntLd256,
8676                         Intrinsic IntSt128, Intrinsic IntSt256> {
8677  def rm  : AVX28I<0x8c, MRMSrcMem, (outs VR128:$dst),
8678             (ins VR128:$src1, i128mem:$src2),
8679             !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
8680             [(set VR128:$dst, (IntLd128 addr:$src2, VR128:$src1))]>, VEX_4V;
8681  def Yrm : AVX28I<0x8c, MRMSrcMem, (outs VR256:$dst),
8682             (ins VR256:$src1, i256mem:$src2),
8683             !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
8684             [(set VR256:$dst, (IntLd256 addr:$src2, VR256:$src1))]>,
8685             VEX_4V, VEX_L;
8686  def mr  : AVX28I<0x8e, MRMDestMem, (outs),
8687             (ins i128mem:$dst, VR128:$src1, VR128:$src2),
8688             !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
8689             [(IntSt128 addr:$dst, VR128:$src1, VR128:$src2)]>, VEX_4V;
8690  def Ymr : AVX28I<0x8e, MRMDestMem, (outs),
8691             (ins i256mem:$dst, VR256:$src1, VR256:$src2),
8692             !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
8693             [(IntSt256 addr:$dst, VR256:$src1, VR256:$src2)]>, VEX_4V, VEX_L;
8694}
8695
// Instantiate the masked-move multiclass for doubleword (vpmaskmovd) and
// quadword (vpmaskmovq) element sizes; the q forms carry VEX_W.
defm VPMASKMOVD : avx2_pmovmask<"vpmaskmovd",
                                int_x86_avx2_maskload_d,
                                int_x86_avx2_maskload_d_256,
                                int_x86_avx2_maskstore_d,
                                int_x86_avx2_maskstore_d_256>;
defm VPMASKMOVQ : avx2_pmovmask<"vpmaskmovq",
                                int_x86_avx2_maskload_q,
                                int_x86_avx2_maskload_q_256,
                                int_x86_avx2_maskstore_q,
                                int_x86_avx2_maskstore_q_256>, VEX_W;
8706
// Lower generic masked_load / masked_store DAG nodes onto the AVX/AVX2
// mask-move instructions.  FP types use VMASKMOVPS/PD; integer types use
// VPMASKMOVD/Q.  Three masked_load variants per type are handled:
//   - undef passthru:       plain mask-move load (masked-off lanes are
//                           zeroed by the instruction, which is fine for
//                           undef).
//   - zero passthru:        plain mask-move load (instruction already
//                           zeroes masked-off lanes).
//   - register passthru:    mask-move load blended with $src0 via
//                           VBLENDVPS/PD, using the same mask.
// NOTE(review): these patterns appear unguarded by a Predicates list here;
// presumably the instructions' own predicates gate selection — confirm
// against the enclosing file context.

// --- 32-bit elements, 256-bit vectors ---
def: Pat<(masked_store addr:$ptr, (v8i32 VR256:$mask), (v8f32 VR256:$src)),
         (VMASKMOVPSYmr addr:$ptr, VR256:$mask, VR256:$src)>;

def: Pat<(masked_store addr:$ptr, (v8i32 VR256:$mask), (v8i32 VR256:$src)),
         (VPMASKMOVDYmr addr:$ptr, VR256:$mask, VR256:$src)>;

def: Pat<(masked_store addr:$ptr, (v4i32 VR128:$mask), (v4f32 VR128:$src)),
         (VMASKMOVPSmr addr:$ptr, VR128:$mask, VR128:$src)>;

def: Pat<(masked_store addr:$ptr, (v4i32 VR128:$mask), (v4i32 VR128:$src)),
         (VPMASKMOVDmr addr:$ptr, VR128:$mask, VR128:$src)>;

def: Pat<(v8f32 (masked_load addr:$ptr, (v8i32 VR256:$mask), undef)),
         (VMASKMOVPSYrm VR256:$mask, addr:$ptr)>;

def: Pat<(v8f32 (masked_load addr:$ptr, (v8i32 VR256:$mask),
                             (bc_v8f32 (v8i32 immAllZerosV)))),
         (VMASKMOVPSYrm VR256:$mask, addr:$ptr)>;

def: Pat<(v8f32 (masked_load addr:$ptr, (v8i32 VR256:$mask), (v8f32 VR256:$src0))),
         (VBLENDVPSYrr VR256:$src0, (VMASKMOVPSYrm VR256:$mask, addr:$ptr),
                       VR256:$mask)>;

def: Pat<(v8i32 (masked_load addr:$ptr, (v8i32 VR256:$mask), undef)),
         (VPMASKMOVDYrm VR256:$mask, addr:$ptr)>;

def: Pat<(v8i32 (masked_load addr:$ptr, (v8i32 VR256:$mask), (v8i32 immAllZerosV))),
         (VPMASKMOVDYrm VR256:$mask, addr:$ptr)>;

def: Pat<(v8i32 (masked_load addr:$ptr, (v8i32 VR256:$mask), (v8i32 VR256:$src0))),
         (VBLENDVPSYrr VR256:$src0, (VPMASKMOVDYrm VR256:$mask, addr:$ptr),
                       VR256:$mask)>;

// --- 32-bit elements, 128-bit vectors ---
def: Pat<(v4f32 (masked_load addr:$ptr, (v4i32 VR128:$mask), undef)),
         (VMASKMOVPSrm VR128:$mask, addr:$ptr)>;

def: Pat<(v4f32 (masked_load addr:$ptr, (v4i32 VR128:$mask),
                             (bc_v4f32 (v4i32 immAllZerosV)))),
         (VMASKMOVPSrm VR128:$mask, addr:$ptr)>;

def: Pat<(v4f32 (masked_load addr:$ptr, (v4i32 VR128:$mask), (v4f32 VR128:$src0))),
         (VBLENDVPSrr VR128:$src0, (VMASKMOVPSrm VR128:$mask, addr:$ptr),
                       VR128:$mask)>;

def: Pat<(v4i32 (masked_load addr:$ptr, (v4i32 VR128:$mask), undef)),
         (VPMASKMOVDrm VR128:$mask, addr:$ptr)>;

def: Pat<(v4i32 (masked_load addr:$ptr, (v4i32 VR128:$mask), (v4i32 immAllZerosV))),
         (VPMASKMOVDrm VR128:$mask, addr:$ptr)>;

def: Pat<(v4i32 (masked_load addr:$ptr, (v4i32 VR128:$mask), (v4i32 VR128:$src0))),
         (VBLENDVPSrr VR128:$src0, (VPMASKMOVDrm VR128:$mask, addr:$ptr),
                       VR128:$mask)>;

// --- 64-bit elements, 256-bit vectors ---
def: Pat<(masked_store addr:$ptr, (v4i64 VR256:$mask), (v4f64 VR256:$src)),
         (VMASKMOVPDYmr addr:$ptr, VR256:$mask, VR256:$src)>;

def: Pat<(masked_store addr:$ptr, (v4i64 VR256:$mask), (v4i64 VR256:$src)),
         (VPMASKMOVQYmr addr:$ptr, VR256:$mask, VR256:$src)>;

def: Pat<(v4f64 (masked_load addr:$ptr, (v4i64 VR256:$mask), undef)),
         (VMASKMOVPDYrm VR256:$mask, addr:$ptr)>;

def: Pat<(v4f64 (masked_load addr:$ptr, (v4i64 VR256:$mask),
                             (v4f64 immAllZerosV))),
         (VMASKMOVPDYrm VR256:$mask, addr:$ptr)>;

def: Pat<(v4f64 (masked_load addr:$ptr, (v4i64 VR256:$mask), (v4f64 VR256:$src0))),
         (VBLENDVPDYrr VR256:$src0, (VMASKMOVPDYrm VR256:$mask, addr:$ptr),
                       VR256:$mask)>;

def: Pat<(v4i64 (masked_load addr:$ptr, (v4i64 VR256:$mask), undef)),
         (VPMASKMOVQYrm VR256:$mask, addr:$ptr)>;

def: Pat<(v4i64 (masked_load addr:$ptr, (v4i64 VR256:$mask),
                             (bc_v4i64 (v8i32 immAllZerosV)))),
         (VPMASKMOVQYrm VR256:$mask, addr:$ptr)>;

def: Pat<(v4i64 (masked_load addr:$ptr, (v4i64 VR256:$mask), (v4i64 VR256:$src0))),
         (VBLENDVPDYrr VR256:$src0, (VPMASKMOVQYrm VR256:$mask, addr:$ptr),
                       VR256:$mask)>;

// --- 64-bit elements, 128-bit vectors ---
def: Pat<(masked_store addr:$ptr, (v2i64 VR128:$mask), (v2f64 VR128:$src)),
         (VMASKMOVPDmr addr:$ptr, VR128:$mask, VR128:$src)>;

def: Pat<(masked_store addr:$ptr, (v2i64 VR128:$mask), (v2i64 VR128:$src)),
         (VPMASKMOVQmr addr:$ptr, VR128:$mask, VR128:$src)>;

def: Pat<(v2f64 (masked_load addr:$ptr, (v2i64 VR128:$mask), undef)),
         (VMASKMOVPDrm VR128:$mask, addr:$ptr)>;

def: Pat<(v2f64 (masked_load addr:$ptr, (v2i64 VR128:$mask),
                             (v2f64 immAllZerosV))),
         (VMASKMOVPDrm VR128:$mask, addr:$ptr)>;

def: Pat<(v2f64 (masked_load addr:$ptr, (v2i64 VR128:$mask), (v2f64 VR128:$src0))),
         (VBLENDVPDrr VR128:$src0, (VMASKMOVPDrm VR128:$mask, addr:$ptr),
                       VR128:$mask)>;

def: Pat<(v2i64 (masked_load addr:$ptr, (v2i64 VR128:$mask), undef)),
         (VPMASKMOVQrm VR128:$mask, addr:$ptr)>;

def: Pat<(v2i64 (masked_load addr:$ptr, (v2i64 VR128:$mask),
                             (bc_v2i64 (v4i32 immAllZerosV)))),
         (VPMASKMOVQrm VR128:$mask, addr:$ptr)>;

def: Pat<(v2i64 (masked_load addr:$ptr, (v2i64 VR128:$mask), (v2i64 VR128:$src0))),
         (VBLENDVPDrr VR128:$src0, (VPMASKMOVQrm VR128:$mask, addr:$ptr),
                       VR128:$mask)>;
8816
8817//===----------------------------------------------------------------------===//
8818// Variable Bit Shifts
8819//
8820multiclass avx2_var_shift<bits<8> opc, string OpcodeStr, SDNode OpNode,
8821                          ValueType vt128, ValueType vt256> {
8822  def rr  : AVX28I<opc, MRMSrcReg, (outs VR128:$dst),
8823             (ins VR128:$src1, VR128:$src2),
8824             !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
8825             [(set VR128:$dst,
8826               (vt128 (OpNode VR128:$src1, (vt128 VR128:$src2))))]>,
8827             VEX_4V, Sched<[WriteVarVecShift]>;
8828  def rm  : AVX28I<opc, MRMSrcMem, (outs VR128:$dst),
8829             (ins VR128:$src1, i128mem:$src2),
8830             !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
8831             [(set VR128:$dst,
8832               (vt128 (OpNode VR128:$src1,
8833                       (vt128 (bitconvert (loadv2i64 addr:$src2))))))]>,
8834             VEX_4V, Sched<[WriteVarVecShiftLd, ReadAfterLd]>;
8835  def Yrr : AVX28I<opc, MRMSrcReg, (outs VR256:$dst),
8836             (ins VR256:$src1, VR256:$src2),
8837             !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
8838             [(set VR256:$dst,
8839               (vt256 (OpNode VR256:$src1, (vt256 VR256:$src2))))]>,
8840             VEX_4V, VEX_L, Sched<[WriteVarVecShift]>;
8841  def Yrm : AVX28I<opc, MRMSrcMem, (outs VR256:$dst),
8842             (ins VR256:$src1, i256mem:$src2),
8843             !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
8844             [(set VR256:$dst,
8845               (vt256 (OpNode VR256:$src1,
8846                       (vt256 (bitconvert (loadv4i64 addr:$src2))))))]>,
8847             VEX_4V, VEX_L, Sched<[WriteVarVecShiftLd, ReadAfterLd]>;
8848}
8849
// Instantiate the variable shifts.  The q (64-bit element) forms carry
// VEX_W.  Note there is no vpsravq in AVX2, so sra only gets a d form.
defm VPSLLVD : avx2_var_shift<0x47, "vpsllvd", shl, v4i32, v8i32>;
defm VPSLLVQ : avx2_var_shift<0x47, "vpsllvq", shl, v2i64, v4i64>, VEX_W;
defm VPSRLVD : avx2_var_shift<0x45, "vpsrlvd", srl, v4i32, v8i32>;
defm VPSRLVQ : avx2_var_shift<0x45, "vpsrlvq", srl, v2i64, v4i64>, VEX_W;
defm VPSRAVD : avx2_var_shift<0x46, "vpsravd", sra, v4i32, v8i32>;
8855
8856//===----------------------------------------------------------------------===//
8857// VGATHER - GATHER Operations
8858multiclass avx2_gather<bits<8> opc, string OpcodeStr, RegisterClass RC256,
8859                       X86MemOperand memop128, X86MemOperand memop256> {
8860  def rm  : AVX28I<opc, MRMSrcMem, (outs VR128:$dst, VR128:$mask_wb),
8861            (ins VR128:$src1, memop128:$src2, VR128:$mask),
8862            !strconcat(OpcodeStr,
8863              "\t{$mask, $src2, $dst|$dst, $src2, $mask}"),
8864            []>, VEX_4VOp3;
8865  def Yrm : AVX28I<opc, MRMSrcMem, (outs RC256:$dst, RC256:$mask_wb),
8866            (ins RC256:$src1, memop256:$src2, RC256:$mask),
8867            !strconcat(OpcodeStr,
8868              "\t{$mask, $src2, $dst|$dst, $src2, $mask}"),
8869            []>, VEX_4VOp3, VEX_L;
8870}
8871
// Instantiate gathers.  All forms: may load; $dst/$mask_wb are earlyclobber
// (they must not alias the index/mask inputs) and are tied to $src1/$mask
// respectively (gather merges into the destination and clears the mask).
// RC256/memop256 pairs differ per element width: qword-result gathers with
// dword indices need only an XMM index register (vx64mem) even for the
// 256-bit form, while dword-result 256-bit gathers take a YMM index
// (vy32mem), and qd/qps forms produce only an XMM result.
let mayLoad = 1, Constraints
  = "@earlyclobber $dst,@earlyclobber $mask_wb, $src1 = $dst, $mask = $mask_wb"
  in {
  defm VPGATHERDQ : avx2_gather<0x90, "vpgatherdq", VR256, vx64mem, vx64mem>, VEX_W;
  defm VPGATHERQQ : avx2_gather<0x91, "vpgatherqq", VR256, vx64mem, vy64mem>, VEX_W;
  defm VPGATHERDD : avx2_gather<0x90, "vpgatherdd", VR256, vx32mem, vy32mem>;
  defm VPGATHERQD : avx2_gather<0x91, "vpgatherqd", VR128, vx32mem, vy32mem>;

  // FP gathers mirror the integer ones; ExeDomain selects the FP unit.
  let ExeDomain = SSEPackedDouble in {
    defm VGATHERDPD : avx2_gather<0x92, "vgatherdpd", VR256, vx64mem, vx64mem>, VEX_W;
    defm VGATHERQPD : avx2_gather<0x93, "vgatherqpd", VR256, vx64mem, vy64mem>, VEX_W;
  }

  let ExeDomain = SSEPackedSingle in {
    defm VGATHERDPS : avx2_gather<0x92, "vgatherdps", VR256, vx32mem, vy32mem>;
    defm VGATHERQPS : avx2_gather<0x93, "vgatherqps", VR128, vx32mem, vy32mem>;
  }
}
8890