//===-- X86InstrSSE.td - SSE Instruction Set ---------------*- tablegen -*-===//
//
//                     The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
// This file describes the X86 SSE instruction set, defining the instructions,
// and properties of the instructions which are needed for code generation,
// machine code emission, and analysis.
//
//===----------------------------------------------------------------------===//

class OpndItins<InstrItinClass arg_rr, InstrItinClass arg_rm> {
  InstrItinClass rr = arg_rr;
  InstrItinClass rm = arg_rm;
  // InstrSchedModel info.
  X86FoldableSchedWrite Sched = WriteFAdd;
}

class SizeItins<OpndItins arg_s, OpndItins arg_d> {
  OpndItins s = arg_s;
  OpndItins d = arg_d;
}


class ShiftOpndItins<InstrItinClass arg_rr, InstrItinClass arg_rm,
                     InstrItinClass arg_ri> {
  InstrItinClass rr = arg_rr;
  InstrItinClass rm = arg_rm;
  InstrItinClass ri = arg_ri;
}


// scalar
let Sched = WriteFAdd in {
def SSE_ALU_F32S : OpndItins<
  IIC_SSE_ALU_F32S_RR, IIC_SSE_ALU_F32S_RM
>;

def SSE_ALU_F64S : OpndItins<
  IIC_SSE_ALU_F64S_RR, IIC_SSE_ALU_F64S_RM
>;
}

def SSE_ALU_ITINS_S : SizeItins<
  SSE_ALU_F32S, SSE_ALU_F64S
>;

let Sched = WriteFMul in {
def SSE_MUL_F32S : OpndItins<
  IIC_SSE_MUL_F32S_RR, IIC_SSE_MUL_F32S_RM
>;

def SSE_MUL_F64S : OpndItins<
  IIC_SSE_MUL_F64S_RR, IIC_SSE_MUL_F64S_RM
>;
}

def SSE_MUL_ITINS_S : SizeItins<
  SSE_MUL_F32S, SSE_MUL_F64S
>;

let Sched = WriteFDiv in {
def SSE_DIV_F32S : OpndItins<
  IIC_SSE_DIV_F32S_RR, IIC_SSE_DIV_F32S_RM
>;

def SSE_DIV_F64S : OpndItins<
  IIC_SSE_DIV_F64S_RR, IIC_SSE_DIV_F64S_RM
>;
}

def SSE_DIV_ITINS_S : SizeItins<
  SSE_DIV_F32S, SSE_DIV_F64S
>;

// parallel
let Sched = WriteFAdd in {
def SSE_ALU_F32P : OpndItins<
  IIC_SSE_ALU_F32P_RR, IIC_SSE_ALU_F32P_RM
>;

def SSE_ALU_F64P : OpndItins<
  IIC_SSE_ALU_F64P_RR, IIC_SSE_ALU_F64P_RM
>;
}

def SSE_ALU_ITINS_P : SizeItins<
  SSE_ALU_F32P, SSE_ALU_F64P
>;

let Sched = WriteFMul in {
def SSE_MUL_F32P : OpndItins<
  IIC_SSE_MUL_F32P_RR, IIC_SSE_MUL_F32P_RM
>;

def SSE_MUL_F64P : OpndItins<
  IIC_SSE_MUL_F64P_RR, IIC_SSE_MUL_F64P_RM
>;
}

def SSE_MUL_ITINS_P : SizeItins<
  SSE_MUL_F32P, SSE_MUL_F64P
>;

let Sched = WriteFDiv in {
def SSE_DIV_F32P : OpndItins<
  IIC_SSE_DIV_F32P_RR, IIC_SSE_DIV_F32P_RM
>;

def SSE_DIV_F64P : OpndItins<
  IIC_SSE_DIV_F64P_RR, IIC_SSE_DIV_F64P_RM
>;
}

def SSE_DIV_ITINS_P : SizeItins<
  SSE_DIV_F32P, SSE_DIV_F64P
>;

let Sched = WriteVecLogic in
def SSE_VEC_BIT_ITINS_P : OpndItins<
  IIC_SSE_BIT_P_RR, IIC_SSE_BIT_P_RM
>;

def SSE_BIT_ITINS_P : OpndItins<
  IIC_SSE_BIT_P_RR, IIC_SSE_BIT_P_RM
>;

let Sched = WriteVecALU in {
def SSE_INTALU_ITINS_P : OpndItins<
  IIC_SSE_INTALU_P_RR, IIC_SSE_INTALU_P_RM
>;

def SSE_INTALUQ_ITINS_P : OpndItins<
  IIC_SSE_INTALUQ_P_RR, IIC_SSE_INTALUQ_P_RM
>;
}

let Sched = WriteVecIMul in
def SSE_INTMUL_ITINS_P : OpndItins<
  IIC_SSE_INTMUL_P_RR, IIC_SSE_INTMUL_P_RM
>;

def SSE_INTSHIFT_ITINS_P : ShiftOpndItins<
  IIC_SSE_INTSH_P_RR, IIC_SSE_INTSH_P_RM, IIC_SSE_INTSH_P_RI
>;
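// For illustration only (hypothetical record, not used below): an OpndItins
// pairs the register-register and folded-load itineraries for one operation,
// and an enclosing "let Sched = ..." overrides the default write class from
// the scheduling model, e.g.
//
//   let Sched = WriteVecALU in
//   def EXAMPLE_ABS_ITINS : OpndItins<
//     IIC_SSE_INTALU_P_RR, IIC_SSE_INTALU_P_RM
//   >;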
def SSE_MOVA_ITINS : OpndItins<
  IIC_SSE_MOVA_P_RR, IIC_SSE_MOVA_P_RM
>;

def SSE_MOVU_ITINS : OpndItins<
  IIC_SSE_MOVU_P_RR, IIC_SSE_MOVU_P_RM
>;

def SSE_DPPD_ITINS : OpndItins<
  IIC_SSE_DPPD_RR, IIC_SSE_DPPD_RM
>;

def SSE_DPPS_ITINS : OpndItins<
  IIC_SSE_DPPS_RR, IIC_SSE_DPPS_RM
>;

def DEFAULT_ITINS : OpndItins<
  IIC_ALU_NONMEM, IIC_ALU_MEM
>;

def SSE_EXTRACT_ITINS : OpndItins<
  IIC_SSE_EXTRACTPS_RR, IIC_SSE_EXTRACTPS_RM
>;

def SSE_INSERT_ITINS : OpndItins<
  IIC_SSE_INSERTPS_RR, IIC_SSE_INSERTPS_RM
>;

let Sched = WriteMPSAD in
def SSE_MPSADBW_ITINS : OpndItins<
  IIC_SSE_MPSADBW_RR, IIC_SSE_MPSADBW_RM
>;

let Sched = WriteVecIMul in
def SSE_PMULLD_ITINS : OpndItins<
  IIC_SSE_PMULLD_RR, IIC_SSE_PMULLD_RM
>;

// Definitions for backward compatibility.
// The instructions mapped onto these definitions use a different itinerary
// than the actual scheduling model.
let Sched = WriteShuffle in
def DEFAULT_ITINS_SHUFFLESCHED : OpndItins<
  IIC_ALU_NONMEM, IIC_ALU_MEM
>;

let Sched = WriteVecIMul in
def DEFAULT_ITINS_VECIMULSCHED : OpndItins<
  IIC_ALU_NONMEM, IIC_ALU_MEM
>;

let Sched = WriteShuffle in
def SSE_INTALU_ITINS_SHUFF_P : OpndItins<
  IIC_SSE_INTALU_P_RR, IIC_SSE_INTALU_P_RM
>;

let Sched = WriteMPSAD in
def DEFAULT_ITINS_MPSADSCHED : OpndItins<
  IIC_ALU_NONMEM, IIC_ALU_MEM
>;

let Sched = WriteFBlend in
def DEFAULT_ITINS_FBLENDSCHED : OpndItins<
  IIC_ALU_NONMEM, IIC_ALU_MEM
>;

let Sched = WriteBlend in
def DEFAULT_ITINS_BLENDSCHED : OpndItins<
  IIC_ALU_NONMEM, IIC_ALU_MEM
>;

let Sched = WriteVarBlend in
def DEFAULT_ITINS_VARBLENDSCHED : OpndItins<
  IIC_ALU_NONMEM, IIC_ALU_MEM
>;

let Sched = WriteFBlend in
def SSE_INTALU_ITINS_FBLEND_P : OpndItins<
  IIC_SSE_INTALU_P_RR, IIC_SSE_INTALU_P_RM
>;

let Sched = WriteBlend in
def SSE_INTALU_ITINS_BLEND_P : OpndItins<
  IIC_SSE_INTALU_P_RR, IIC_SSE_INTALU_P_RM
>;

//===----------------------------------------------------------------------===//
// SSE 1 & 2 Instructions Classes
//===----------------------------------------------------------------------===//

/// sse12_fp_scalar - SSE 1 & 2 scalar instructions class
multiclass sse12_fp_scalar<bits<8> opc, string OpcodeStr, SDNode OpNode,
                           RegisterClass RC, X86MemOperand x86memop,
                           Domain d, OpndItins itins, bit Is2Addr = 1> {
  let isCommutable = 1 in {
    def rr : SI<opc, MRMSrcReg, (outs RC:$dst), (ins RC:$src1, RC:$src2),
                !if(Is2Addr,
                    !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
                    !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
                [(set RC:$dst, (OpNode RC:$src1, RC:$src2))], itins.rr, d>,
                Sched<[itins.Sched]>;
  }
  def rm : SI<opc, MRMSrcMem, (outs RC:$dst), (ins RC:$src1, x86memop:$src2),
              !if(Is2Addr,
                  !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
                  !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
              [(set RC:$dst, (OpNode RC:$src1, (load addr:$src2)))], itins.rm, d>,
              Sched<[itins.Sched.Folded, ReadAfterLd]>;
}
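// For illustration only (a sketch; "opc", "OpcodeStr", "OpNode", and "itins"
// stand for whatever an enclosing multiclass passes in): sse12_fp_scalar is
// instantiated along the lines of
//
//   defm SS : sse12_fp_scalar<opc, !strconcat(OpcodeStr, "ss"), OpNode,
//                             FR32, f32mem, SSEPackedSingle, itins.s>, XS;
//
// which yields an "rr" register-register form plus an "rm" form whose folded
// load is scheduled with itins.Sched.Folded.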
/// sse12_fp_scalar_int - SSE 1 & 2 scalar instructions intrinsics class
multiclass sse12_fp_scalar_int<bits<8> opc, string OpcodeStr, RegisterClass RC,
                               string asm, string SSEVer, string FPSizeStr,
                               Operand memopr, ComplexPattern mem_cpat,
                               Domain d, OpndItins itins, bit Is2Addr = 1> {
let isCodeGenOnly = 1 in {
  def rr_Int : SI_Int<opc, MRMSrcReg, (outs RC:$dst), (ins RC:$src1, RC:$src2),
               !if(Is2Addr,
                   !strconcat(asm, "\t{$src2, $dst|$dst, $src2}"),
                   !strconcat(asm, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
               [(set RC:$dst, (!cast<Intrinsic>(
                  !strconcat("int_x86_sse", SSEVer, "_", OpcodeStr, FPSizeStr))
                  RC:$src1, RC:$src2))], itins.rr, d>,
               Sched<[itins.Sched]>;
  def rm_Int : SI_Int<opc, MRMSrcMem, (outs RC:$dst),
               (ins RC:$src1, memopr:$src2),
               !if(Is2Addr,
                   !strconcat(asm, "\t{$src2, $dst|$dst, $src2}"),
                   !strconcat(asm, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
               [(set RC:$dst, (!cast<Intrinsic>(!strconcat("int_x86_sse",
                  SSEVer, "_", OpcodeStr, FPSizeStr))
                  RC:$src1, mem_cpat:$src2))], itins.rm, d>,
               Sched<[itins.Sched.Folded, ReadAfterLd]>;
}
}

/// sse12_fp_packed - SSE 1 & 2 packed instructions class
multiclass sse12_fp_packed<bits<8> opc, string OpcodeStr, SDNode OpNode,
                           RegisterClass RC, ValueType vt,
                           X86MemOperand x86memop, PatFrag mem_frag,
                           Domain d, OpndItins itins, bit Is2Addr = 1> {
  let isCommutable = 1 in
    def rr : PI<opc, MRMSrcReg, (outs RC:$dst), (ins RC:$src1, RC:$src2),
                !if(Is2Addr,
                    !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
                    !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
                [(set RC:$dst, (vt (OpNode RC:$src1, RC:$src2)))], itins.rr, d>,
                Sched<[itins.Sched]>;
  let mayLoad = 1 in
    def rm : PI<opc, MRMSrcMem, (outs RC:$dst), (ins RC:$src1, x86memop:$src2),
                !if(Is2Addr,
                    !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
                    !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
                [(set RC:$dst, (OpNode RC:$src1, (mem_frag addr:$src2)))],
                itins.rm, d>,
                Sched<[itins.Sched.Folded, ReadAfterLd]>;
}

/// sse12_fp_packed_logical_rm - SSE 1 & 2 packed instructions class
multiclass sse12_fp_packed_logical_rm<bits<8> opc, RegisterClass RC, Domain d,
                                      string OpcodeStr, X86MemOperand x86memop,
                                      list<dag> pat_rr, list<dag> pat_rm,
                                      bit Is2Addr = 1> {
  let isCommutable = 1, hasSideEffects = 0 in
    def rr : PI<opc, MRMSrcReg, (outs RC:$dst), (ins RC:$src1, RC:$src2),
                !if(Is2Addr,
                    !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
                    !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
                pat_rr, NoItinerary, d>,
                Sched<[WriteVecLogic]>;
  def rm : PI<opc, MRMSrcMem, (outs RC:$dst), (ins RC:$src1, x86memop:$src2),
              !if(Is2Addr,
                  !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
                  !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
              pat_rm, NoItinerary, d>,
              Sched<[WriteVecLogicLd, ReadAfterLd]>;
}
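// For illustration only (a sketch in the same spirit as the scalar example
// above; the parameter names are placeholders): a packed-single use of
// sse12_fp_packed would look like
//
//   defm PS : sse12_fp_packed<opc, !strconcat(OpcodeStr, "ps"), OpNode,
//                             VR128, v4f32, f128mem, memopv4f32,
//                             SSEPackedSingle, itins.s>, PS;
//
// producing a commutable register form and a mayLoad memory form.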
//===----------------------------------------------------------------------===//
// Non-instruction patterns
//===----------------------------------------------------------------------===//

// A vector extract of the first f32/f64 position is a subregister copy.
def : Pat<(f32 (extractelt (v4f32 VR128:$src), (iPTR 0))),
          (COPY_TO_REGCLASS (v4f32 VR128:$src), FR32)>;
def : Pat<(f64 (extractelt (v2f64 VR128:$src), (iPTR 0))),
          (COPY_TO_REGCLASS (v2f64 VR128:$src), FR64)>;

// A 128-bit subvector extract from the first 256-bit vector position
// is a subregister copy that needs no instruction.
def : Pat<(v4i32 (extract_subvector (v8i32 VR256:$src), (iPTR 0))),
          (v4i32 (EXTRACT_SUBREG (v8i32 VR256:$src), sub_xmm))>;
def : Pat<(v4f32 (extract_subvector (v8f32 VR256:$src), (iPTR 0))),
          (v4f32 (EXTRACT_SUBREG (v8f32 VR256:$src), sub_xmm))>;

def : Pat<(v2i64 (extract_subvector (v4i64 VR256:$src), (iPTR 0))),
          (v2i64 (EXTRACT_SUBREG (v4i64 VR256:$src), sub_xmm))>;
def : Pat<(v2f64 (extract_subvector (v4f64 VR256:$src), (iPTR 0))),
          (v2f64 (EXTRACT_SUBREG (v4f64 VR256:$src), sub_xmm))>;

def : Pat<(v8i16 (extract_subvector (v16i16 VR256:$src), (iPTR 0))),
          (v8i16 (EXTRACT_SUBREG (v16i16 VR256:$src), sub_xmm))>;
def : Pat<(v16i8 (extract_subvector (v32i8 VR256:$src), (iPTR 0))),
          (v16i8 (EXTRACT_SUBREG (v32i8 VR256:$src), sub_xmm))>;

// A 128-bit subvector insert to the first 256-bit vector position
// is a subregister copy that needs no instruction.
let AddedComplexity = 25 in { // to give priority over vinsertf128rm
def : Pat<(insert_subvector undef, (v2i64 VR128:$src), (iPTR 0)),
          (INSERT_SUBREG (v4i64 (IMPLICIT_DEF)), VR128:$src, sub_xmm)>;
def : Pat<(insert_subvector undef, (v2f64 VR128:$src), (iPTR 0)),
          (INSERT_SUBREG (v4f64 (IMPLICIT_DEF)), VR128:$src, sub_xmm)>;
def : Pat<(insert_subvector undef, (v4i32 VR128:$src), (iPTR 0)),
          (INSERT_SUBREG (v8i32 (IMPLICIT_DEF)), VR128:$src, sub_xmm)>;
def : Pat<(insert_subvector undef, (v4f32 VR128:$src), (iPTR 0)),
          (INSERT_SUBREG (v8f32 (IMPLICIT_DEF)), VR128:$src, sub_xmm)>;
def : Pat<(insert_subvector undef, (v8i16 VR128:$src), (iPTR 0)),
          (INSERT_SUBREG (v16i16 (IMPLICIT_DEF)), VR128:$src, sub_xmm)>;
def : Pat<(insert_subvector undef, (v16i8 VR128:$src), (iPTR 0)),
          (INSERT_SUBREG (v32i8 (IMPLICIT_DEF)), VR128:$src, sub_xmm)>;
}

// Implicitly promote a 32-bit scalar to a vector.
def : Pat<(v4f32 (scalar_to_vector FR32:$src)),
          (COPY_TO_REGCLASS FR32:$src, VR128)>;
def : Pat<(v8f32 (scalar_to_vector FR32:$src)),
          (COPY_TO_REGCLASS FR32:$src, VR128)>;
// Implicitly promote a 64-bit scalar to a vector.
def : Pat<(v2f64 (scalar_to_vector FR64:$src)),
          (COPY_TO_REGCLASS FR64:$src, VR128)>;
def : Pat<(v4f64 (scalar_to_vector FR64:$src)),
          (COPY_TO_REGCLASS FR64:$src, VR128)>;

// Bitcasts between 128-bit vector types. Return the original type since
// no instruction is needed for the conversion.
let Predicates = [HasSSE2] in {
  def : Pat<(v2i64 (bitconvert (v4i32 VR128:$src))), (v2i64 VR128:$src)>;
  def : Pat<(v2i64 (bitconvert (v8i16 VR128:$src))), (v2i64 VR128:$src)>;
  def : Pat<(v2i64 (bitconvert (v16i8 VR128:$src))), (v2i64 VR128:$src)>;
  def : Pat<(v2i64 (bitconvert (v2f64 VR128:$src))), (v2i64 VR128:$src)>;
  def : Pat<(v2i64 (bitconvert (v4f32 VR128:$src))), (v2i64 VR128:$src)>;
  def : Pat<(v4i32 (bitconvert (v2i64 VR128:$src))), (v4i32 VR128:$src)>;
  def : Pat<(v4i32 (bitconvert (v8i16 VR128:$src))), (v4i32 VR128:$src)>;
  def : Pat<(v4i32 (bitconvert (v16i8 VR128:$src))), (v4i32 VR128:$src)>;
  def : Pat<(v4i32 (bitconvert (v2f64 VR128:$src))), (v4i32 VR128:$src)>;
  def : Pat<(v4i32 (bitconvert (v4f32 VR128:$src))), (v4i32 VR128:$src)>;
  def : Pat<(v8i16 (bitconvert (v2i64 VR128:$src))), (v8i16 VR128:$src)>;
  def : Pat<(v8i16 (bitconvert (v4i32 VR128:$src))), (v8i16 VR128:$src)>;
  def : Pat<(v8i16 (bitconvert (v16i8 VR128:$src))), (v8i16 VR128:$src)>;
  def : Pat<(v8i16 (bitconvert (v2f64 VR128:$src))), (v8i16 VR128:$src)>;
  def : Pat<(v8i16 (bitconvert (v4f32 VR128:$src))), (v8i16 VR128:$src)>;
  def : Pat<(v16i8 (bitconvert (v2i64 VR128:$src))), (v16i8 VR128:$src)>;
  def : Pat<(v16i8 (bitconvert (v4i32 VR128:$src))), (v16i8 VR128:$src)>;
  def : Pat<(v16i8 (bitconvert (v8i16 VR128:$src))), (v16i8 VR128:$src)>;
  def : Pat<(v16i8 (bitconvert (v2f64 VR128:$src))), (v16i8 VR128:$src)>;
  def : Pat<(v16i8 (bitconvert (v4f32 VR128:$src))), (v16i8 VR128:$src)>;
  def : Pat<(v4f32 (bitconvert (v2i64 VR128:$src))), (v4f32 VR128:$src)>;
  def : Pat<(v4f32 (bitconvert (v4i32 VR128:$src))), (v4f32 VR128:$src)>;
  def : Pat<(v4f32 (bitconvert (v8i16 VR128:$src))), (v4f32 VR128:$src)>;
  def : Pat<(v4f32 (bitconvert (v16i8 VR128:$src))), (v4f32 VR128:$src)>;
  def : Pat<(v4f32 (bitconvert (v2f64 VR128:$src))), (v4f32 VR128:$src)>;
  def : Pat<(v2f64 (bitconvert (v2i64 VR128:$src))), (v2f64 VR128:$src)>;
  def : Pat<(v2f64 (bitconvert (v4i32 VR128:$src))), (v2f64 VR128:$src)>;
  def : Pat<(v2f64 (bitconvert (v8i16 VR128:$src))), (v2f64 VR128:$src)>;
  def : Pat<(v2f64 (bitconvert (v16i8 VR128:$src))), (v2f64 VR128:$src)>;
  def : Pat<(v2f64 (bitconvert (v4f32 VR128:$src))), (v2f64 VR128:$src)>;
  def : Pat<(f128 (bitconvert (i128 FR128:$src))), (f128 FR128:$src)>;
  def : Pat<(i128 (bitconvert (f128 FR128:$src))), (i128 FR128:$src)>;
}

// Bitcasts between 256-bit vector types. Return the original type since
// no instruction is needed for the conversion.
let Predicates = [HasAVX] in {
  def : Pat<(v4f64  (bitconvert (v8f32 VR256:$src))),  (v4f64 VR256:$src)>;
  def : Pat<(v4f64  (bitconvert (v8i32 VR256:$src))),  (v4f64 VR256:$src)>;
  def : Pat<(v4f64  (bitconvert (v4i64 VR256:$src))),  (v4f64 VR256:$src)>;
  def : Pat<(v4f64  (bitconvert (v16i16 VR256:$src))), (v4f64 VR256:$src)>;
  def : Pat<(v4f64  (bitconvert (v32i8 VR256:$src))),  (v4f64 VR256:$src)>;
  def : Pat<(v8f32  (bitconvert (v8i32 VR256:$src))),  (v8f32 VR256:$src)>;
  def : Pat<(v8f32  (bitconvert (v4i64 VR256:$src))),  (v8f32 VR256:$src)>;
  def : Pat<(v8f32  (bitconvert (v4f64 VR256:$src))),  (v8f32 VR256:$src)>;
  def : Pat<(v8f32  (bitconvert (v32i8 VR256:$src))),  (v8f32 VR256:$src)>;
  def : Pat<(v8f32  (bitconvert (v16i16 VR256:$src))), (v8f32 VR256:$src)>;
  def : Pat<(v4i64  (bitconvert (v8f32 VR256:$src))),  (v4i64 VR256:$src)>;
  def : Pat<(v4i64  (bitconvert (v8i32 VR256:$src))),  (v4i64 VR256:$src)>;
  def : Pat<(v4i64  (bitconvert (v4f64 VR256:$src))),  (v4i64 VR256:$src)>;
  def : Pat<(v4i64  (bitconvert (v32i8 VR256:$src))),  (v4i64 VR256:$src)>;
  def : Pat<(v4i64  (bitconvert (v16i16 VR256:$src))), (v4i64 VR256:$src)>;
  def : Pat<(v32i8  (bitconvert (v4f64 VR256:$src))),  (v32i8 VR256:$src)>;
  def : Pat<(v32i8  (bitconvert (v4i64 VR256:$src))),  (v32i8 VR256:$src)>;
  def : Pat<(v32i8  (bitconvert (v8f32 VR256:$src))),  (v32i8 VR256:$src)>;
  def : Pat<(v32i8  (bitconvert (v8i32 VR256:$src))),  (v32i8 VR256:$src)>;
  def : Pat<(v32i8  (bitconvert (v16i16 VR256:$src))), (v32i8 VR256:$src)>;
  def : Pat<(v8i32  (bitconvert (v32i8 VR256:$src))),  (v8i32 VR256:$src)>;
  def : Pat<(v8i32  (bitconvert (v16i16 VR256:$src))), (v8i32 VR256:$src)>;
  def : Pat<(v8i32  (bitconvert (v8f32 VR256:$src))),  (v8i32 VR256:$src)>;
  def : Pat<(v8i32  (bitconvert (v4i64 VR256:$src))),  (v8i32 VR256:$src)>;
  def : Pat<(v8i32  (bitconvert (v4f64 VR256:$src))),  (v8i32 VR256:$src)>;
  def : Pat<(v16i16 (bitconvert (v8f32 VR256:$src))),  (v16i16 VR256:$src)>;
  def : Pat<(v16i16 (bitconvert (v8i32 VR256:$src))),  (v16i16 VR256:$src)>;
  def : Pat<(v16i16 (bitconvert (v4i64 VR256:$src))),  (v16i16 VR256:$src)>;
  def : Pat<(v16i16 (bitconvert (v4f64 VR256:$src))),  (v16i16 VR256:$src)>;
  def : Pat<(v16i16 (bitconvert (v32i8 VR256:$src))),  (v16i16 VR256:$src)>;
}

// Alias instructions that map fld0 to xorps for sse or vxorps for avx.
// This is expanded by ExpandPostRAPseudos.
let isReMaterializable = 1, isAsCheapAsAMove = 1, canFoldAsLoad = 1,
    isPseudo = 1, SchedRW = [WriteZero] in {
  def FsFLD0SS : I<0, Pseudo, (outs FR32:$dst), (ins), "",
                   [(set FR32:$dst, fp32imm0)]>, Requires<[HasSSE1]>;
  def FsFLD0SD : I<0, Pseudo, (outs FR64:$dst), (ins), "",
                   [(set FR64:$dst, fpimm0)]>, Requires<[HasSSE2]>;
}
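// For illustration (assembly sketch; restates the comment above rather than
// anything emitted from this file directly): after register allocation,
// ExpandPostRAPseudos rewrites the pseudo, so e.g.
//   %XMM0 = FsFLD0SS    becomes    xorps %xmm0, %xmm0
// (or vxorps %xmm0, %xmm0, %xmm0 when AVX is available).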
//===----------------------------------------------------------------------===//
// AVX & SSE - Zero/One Vectors
//===----------------------------------------------------------------------===//

// Alias instruction that maps zero vector to pxor / xorp* for sse.
// This is expanded by ExpandPostRAPseudos to an xorps / vxorps, and then
// swizzled by ExecutionDepsFix to pxor.
// We set canFoldAsLoad because this can be converted to a constant-pool
// load of an all-zeros value if folding it would be beneficial.
let isReMaterializable = 1, isAsCheapAsAMove = 1, canFoldAsLoad = 1,
    isPseudo = 1, SchedRW = [WriteZero] in {
def V_SET0 : I<0, Pseudo, (outs VR128:$dst), (ins), "",
               [(set VR128:$dst, (v4f32 immAllZerosV))]>;
}

def : Pat<(v2f64 immAllZerosV), (V_SET0)>;
def : Pat<(v4i32 immAllZerosV), (V_SET0)>;
def : Pat<(v2i64 immAllZerosV), (V_SET0)>;
def : Pat<(v8i16 immAllZerosV), (V_SET0)>;
def : Pat<(v16i8 immAllZerosV), (V_SET0)>;


// The same as done above but for AVX. The 256-bit AVX1 ISA doesn't support
// PI (packed integer) instructions, and doesn't need them here because on
// Sandy Bridge the register is set to zero at the rename stage without using
// any execution unit, so SET0PSY and SET0PDY can be used for vector int
// instructions without penalty.
let isReMaterializable = 1, isAsCheapAsAMove = 1, canFoldAsLoad = 1,
    isPseudo = 1, Predicates = [HasAVX], SchedRW = [WriteZero] in {
def AVX_SET0 : I<0, Pseudo, (outs VR256:$dst), (ins), "",
                 [(set VR256:$dst, (v8f32 immAllZerosV))]>;
}

let Predicates = [HasAVX] in
  def : Pat<(v4f64 immAllZerosV), (AVX_SET0)>;

let Predicates = [HasAVX2] in {
  def : Pat<(v4i64 immAllZerosV), (AVX_SET0)>;
  def : Pat<(v8i32 immAllZerosV), (AVX_SET0)>;
  def : Pat<(v16i16 immAllZerosV), (AVX_SET0)>;
  def : Pat<(v32i8 immAllZerosV), (AVX_SET0)>;
}

// AVX1 has no support for 256-bit integer instructions, but since the 128-bit
// VPXOR instruction writes zero to its upper part, it's safe to build zeros.
let Predicates = [HasAVX1Only] in {
def : Pat<(v32i8 immAllZerosV), (SUBREG_TO_REG (i8 0), (V_SET0), sub_xmm)>;
def : Pat<(bc_v32i8 (v8f32 immAllZerosV)),
          (SUBREG_TO_REG (i8 0), (V_SET0), sub_xmm)>;

def : Pat<(v16i16 immAllZerosV), (SUBREG_TO_REG (i16 0), (V_SET0), sub_xmm)>;
def : Pat<(bc_v16i16 (v8f32 immAllZerosV)),
          (SUBREG_TO_REG (i16 0), (V_SET0), sub_xmm)>;

def : Pat<(v8i32 immAllZerosV), (SUBREG_TO_REG (i32 0), (V_SET0), sub_xmm)>;
def : Pat<(bc_v8i32 (v8f32 immAllZerosV)),
          (SUBREG_TO_REG (i32 0), (V_SET0), sub_xmm)>;

def : Pat<(v4i64 immAllZerosV), (SUBREG_TO_REG (i64 0), (V_SET0), sub_xmm)>;
def : Pat<(bc_v4i64 (v8f32 immAllZerosV)),
          (SUBREG_TO_REG (i64 0), (V_SET0), sub_xmm)>;
}

// We set canFoldAsLoad because this can be converted to a constant-pool
// load of an all-ones value if folding it would be beneficial.
let isReMaterializable = 1, isAsCheapAsAMove = 1, canFoldAsLoad = 1,
    isPseudo = 1, SchedRW = [WriteZero] in {
  def V_SETALLONES : I<0, Pseudo, (outs VR128:$dst), (ins), "",
                       [(set VR128:$dst, (v4i32 immAllOnesV))]>;
  let Predicates = [HasAVX2] in
  def AVX2_SETALLONES : I<0, Pseudo, (outs VR256:$dst), (ins), "",
                          [(set VR256:$dst, (v8i32 immAllOnesV))]>;
}
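// Illustrative note (informal; nothing here is emitted from this file
// directly): selecting, say, (v8i32 immAllZerosV) under AVX1 goes through
// V_SET0, which expands to
//   vxorps %xmm0, %xmm0, %xmm0
// and the VEX encoding already zeroes bits [255:128] of the containing ymm
// register, so the SUBREG_TO_REG wrapper above suffices for a 256-bit zero.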
//===----------------------------------------------------------------------===//
// SSE 1 & 2 - Move FP Scalar Instructions
//
// Move Instructions. Register-to-register movss/movsd is not used for FR32/64
// register copies because it's a partial register update; register-to-register
// movss/movsd is not modeled as an INSERT_SUBREG because INSERT_SUBREG requires
// that the insert be implementable in terms of a copy, and as just mentioned,
// we don't use movss/movsd for copies.
//===----------------------------------------------------------------------===//

multiclass sse12_move_rr<RegisterClass RC, SDNode OpNode, ValueType vt,
                         X86MemOperand x86memop, string base_opc,
                         string asm_opr, Domain d = GenericDomain> {
  def rr : SI<0x10, MRMSrcReg, (outs VR128:$dst),
              (ins VR128:$src1, RC:$src2),
              !strconcat(base_opc, asm_opr),
              [(set VR128:$dst, (vt (OpNode VR128:$src1,
                                            (scalar_to_vector RC:$src2))))],
              IIC_SSE_MOV_S_RR, d>, Sched<[WriteFShuffle]>;

  // For the disassembler
  let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0 in
  def rr_REV : SI<0x11, MRMDestReg, (outs VR128:$dst),
                  (ins VR128:$src1, RC:$src2),
                  !strconcat(base_opc, asm_opr),
                  [], IIC_SSE_MOV_S_RR>, Sched<[WriteFShuffle]>;
}

multiclass sse12_move<RegisterClass RC, SDNode OpNode, ValueType vt,
                      X86MemOperand x86memop, string OpcodeStr,
                      Domain d = GenericDomain> {
  // AVX
  defm V#NAME : sse12_move_rr<RC, OpNode, vt, x86memop, OpcodeStr,
                              "\t{$src2, $src1, $dst|$dst, $src1, $src2}", d>,
                              VEX_4V, VEX_LIG;

  def V#NAME#mr : SI<0x11, MRMDestMem, (outs), (ins x86memop:$dst, RC:$src),
                     !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
                     [(store RC:$src, addr:$dst)], IIC_SSE_MOV_S_MR, d>,
                     VEX, VEX_LIG, Sched<[WriteStore]>;
  // SSE1 & 2
  let Constraints = "$src1 = $dst" in {
    defm NAME : sse12_move_rr<RC, OpNode, vt, x86memop, OpcodeStr,
                              "\t{$src2, $dst|$dst, $src2}", d>;
  }

  def NAME#mr : SI<0x11, MRMDestMem, (outs), (ins x86memop:$dst, RC:$src),
                   !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
                   [(store RC:$src, addr:$dst)], IIC_SSE_MOV_S_MR, d>,
                   Sched<[WriteStore]>;
}

// Loading from memory automatically zeroes the upper bits.
multiclass sse12_move_rm<RegisterClass RC, X86MemOperand x86memop,
                         PatFrag mem_pat, string OpcodeStr,
                         Domain d = GenericDomain> {
  def V#NAME#rm : SI<0x10, MRMSrcMem, (outs RC:$dst), (ins x86memop:$src),
                     !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
                     [(set RC:$dst, (mem_pat addr:$src))],
                     IIC_SSE_MOV_S_RM, d>, VEX, VEX_LIG, Sched<[WriteLoad]>;
  def NAME#rm : SI<0x10, MRMSrcMem, (outs RC:$dst), (ins x86memop:$src),
                   !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
                   [(set RC:$dst, (mem_pat addr:$src))],
                   IIC_SSE_MOV_S_RM, d>, Sched<[WriteLoad]>;
}

defm MOVSS : sse12_move<FR32, X86Movss, v4f32, f32mem, "movss",
                        SSEPackedSingle>, XS;
defm MOVSD : sse12_move<FR64, X86Movsd, v2f64, f64mem, "movsd",
                        SSEPackedDouble>, XD;

let canFoldAsLoad = 1, isReMaterializable = 1 in {
  defm MOVSS : sse12_move_rm<FR32, f32mem, loadf32, "movss",
                             SSEPackedSingle>, XS;

  let AddedComplexity = 20 in
    defm MOVSD : sse12_move_rm<FR64, f64mem, loadf64, "movsd",
                               SSEPackedDouble>, XD;
}
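// For reference (informal; the names follow mechanically from the
// V#NAME / NAME concatenations in the multiclasses above): the "movss"
// defms expand to
//   VMOVSSrr, VMOVSSrr_REV, VMOVSSmr, VMOVSSrm   (VEX-encoded AVX forms)
//   MOVSSrr,  MOVSSrr_REV,  MOVSSmr,  MOVSSrm    (legacy SSE forms)
// and likewise for "movsd".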
// Patterns
let Predicates = [UseAVX] in {
  let AddedComplexity = 20 in {
  // MOVSSrm zeros the high parts of the register; represent this
  // with SUBREG_TO_REG. The AVX versions also write: DST[255:128] <- 0
  def : Pat<(v4f32 (X86vzmovl (v4f32 (scalar_to_vector (loadf32 addr:$src))))),
            (COPY_TO_REGCLASS (VMOVSSrm addr:$src), VR128)>;
  def : Pat<(v4f32 (scalar_to_vector (loadf32 addr:$src))),
            (COPY_TO_REGCLASS (VMOVSSrm addr:$src), VR128)>;
  def : Pat<(v4f32 (X86vzmovl (loadv4f32 addr:$src))),
            (COPY_TO_REGCLASS (VMOVSSrm addr:$src), VR128)>;

  // MOVSDrm zeros the high parts of the register; represent this
  // with SUBREG_TO_REG. The AVX versions also write: DST[255:128] <- 0
  def : Pat<(v2f64 (X86vzmovl (v2f64 (scalar_to_vector (loadf64 addr:$src))))),
            (COPY_TO_REGCLASS (VMOVSDrm addr:$src), VR128)>;
  def : Pat<(v2f64 (scalar_to_vector (loadf64 addr:$src))),
            (COPY_TO_REGCLASS (VMOVSDrm addr:$src), VR128)>;
  def : Pat<(v2f64 (X86vzmovl (loadv2f64 addr:$src))),
            (COPY_TO_REGCLASS (VMOVSDrm addr:$src), VR128)>;
  def : Pat<(v2f64 (X86vzmovl (bc_v2f64 (loadv4f32 addr:$src)))),
            (COPY_TO_REGCLASS (VMOVSDrm addr:$src), VR128)>;
  def : Pat<(v2f64 (X86vzload addr:$src)),
            (COPY_TO_REGCLASS (VMOVSDrm addr:$src), VR128)>;

  // Represent the same patterns above but in the form they appear for
  // 256-bit types
  def : Pat<(v8f32 (X86vzmovl (insert_subvector undef,
                   (v4f32 (scalar_to_vector (loadf32 addr:$src))), (iPTR 0)))),
            (SUBREG_TO_REG (i32 0), (VMOVSSrm addr:$src), sub_xmm)>;
  def : Pat<(v4f64 (X86vzmovl (insert_subvector undef,
                   (v2f64 (scalar_to_vector (loadf64 addr:$src))), (iPTR 0)))),
            (SUBREG_TO_REG (i32 0), (VMOVSDrm addr:$src), sub_xmm)>;
  }

  // Extract and store.
  def : Pat<(store (f32 (extractelt (v4f32 VR128:$src), (iPTR 0))),
                   addr:$dst),
            (VMOVSSmr addr:$dst, (COPY_TO_REGCLASS (v4f32 VR128:$src), FR32))>;
  def : Pat<(store (f64 (extractelt (v2f64 VR128:$src), (iPTR 0))),
                   addr:$dst),
            (VMOVSDmr addr:$dst, (COPY_TO_REGCLASS (v2f64 VR128:$src), FR64))>;

  // Shuffle with VMOVSS
  def : Pat<(v4i32 (X86Movss VR128:$src1, VR128:$src2)),
            (VMOVSSrr (v4i32 VR128:$src1),
                      (COPY_TO_REGCLASS (v4i32 VR128:$src2), FR32))>;
  def : Pat<(v4f32 (X86Movss VR128:$src1, VR128:$src2)),
            (VMOVSSrr (v4f32 VR128:$src1),
                      (COPY_TO_REGCLASS (v4f32 VR128:$src2), FR32))>;

  // 256-bit variants
  def : Pat<(v8i32 (X86Movss VR256:$src1, VR256:$src2)),
            (SUBREG_TO_REG (i32 0),
              (VMOVSSrr (EXTRACT_SUBREG (v8i32 VR256:$src1), sub_xmm),
                        (EXTRACT_SUBREG (v8i32 VR256:$src2), sub_xmm)),
              sub_xmm)>;
  def : Pat<(v8f32 (X86Movss VR256:$src1, VR256:$src2)),
            (SUBREG_TO_REG (i32 0),
              (VMOVSSrr (EXTRACT_SUBREG (v8f32 VR256:$src1), sub_xmm),
                        (EXTRACT_SUBREG (v8f32 VR256:$src2), sub_xmm)),
              sub_xmm)>;

  // Shuffle with VMOVSD
  def : Pat<(v2i64 (X86Movsd VR128:$src1, VR128:$src2)),
            (VMOVSDrr VR128:$src1, (COPY_TO_REGCLASS VR128:$src2, FR64))>;
  def : Pat<(v2f64 (X86Movsd VR128:$src1, VR128:$src2)),
            (VMOVSDrr VR128:$src1, (COPY_TO_REGCLASS VR128:$src2, FR64))>;
  def : Pat<(v4f32 (X86Movsd VR128:$src1, VR128:$src2)),
            (VMOVSDrr VR128:$src1, (COPY_TO_REGCLASS VR128:$src2, FR64))>;
  def : Pat<(v4i32 (X86Movsd VR128:$src1, VR128:$src2)),
            (VMOVSDrr VR128:$src1, (COPY_TO_REGCLASS VR128:$src2, FR64))>;

  // 256-bit variants
  def : Pat<(v4i64 (X86Movsd VR256:$src1, VR256:$src2)),
            (SUBREG_TO_REG (i32 0),
              (VMOVSDrr (EXTRACT_SUBREG (v4i64 VR256:$src1), sub_xmm),
                        (EXTRACT_SUBREG (v4i64 VR256:$src2), sub_xmm)),
              sub_xmm)>;
  def : Pat<(v4f64 (X86Movsd VR256:$src1, VR256:$src2)),
            (SUBREG_TO_REG (i32 0),
              (VMOVSDrr (EXTRACT_SUBREG (v4f64 VR256:$src1), sub_xmm),
                        (EXTRACT_SUBREG (v4f64 VR256:$src2), sub_xmm)),
              sub_xmm)>;

  // FIXME: Instead of an X86Movlps there should be an X86Movsd here, the
  // problem is during lowering, where it's not possible to recognize the fold
  // because it has two uses through a bitcast. One use disappears at isel time
  // and the fold opportunity reappears.
  def : Pat<(v2f64 (X86Movlpd VR128:$src1, VR128:$src2)),
            (VMOVSDrr VR128:$src1, (COPY_TO_REGCLASS VR128:$src2, FR64))>;
  def : Pat<(v2i64 (X86Movlpd VR128:$src1, VR128:$src2)),
            (VMOVSDrr VR128:$src1, (COPY_TO_REGCLASS VR128:$src2, FR64))>;
  def : Pat<(v4f32 (X86Movlps VR128:$src1, VR128:$src2)),
            (VMOVSDrr VR128:$src1, (COPY_TO_REGCLASS VR128:$src2, FR64))>;
  def : Pat<(v4i32 (X86Movlps VR128:$src1, VR128:$src2)),
            (VMOVSDrr VR128:$src1, (COPY_TO_REGCLASS VR128:$src2, FR64))>;
}

let Predicates = [UseSSE1] in {
  let Predicates = [NoSSE41], AddedComplexity = 15 in {
  // Move scalar to XMM zero-extended, zeroing a VR128 then do a
  // MOVSS to the lower bits.
  def : Pat<(v4f32 (X86vzmovl (v4f32 (scalar_to_vector FR32:$src)))),
            (MOVSSrr (v4f32 (V_SET0)), FR32:$src)>;
  def : Pat<(v4f32 (X86vzmovl (v4f32 VR128:$src))),
            (MOVSSrr (v4f32 (V_SET0)), (COPY_TO_REGCLASS VR128:$src, FR32))>;
  def : Pat<(v4i32 (X86vzmovl (v4i32 VR128:$src))),
            (MOVSSrr (v4i32 (V_SET0)), (COPY_TO_REGCLASS VR128:$src, FR32))>;
  }

  let AddedComplexity = 20 in {
  // MOVSSrm already zeros the high parts of the register.
  def : Pat<(v4f32 (X86vzmovl (v4f32 (scalar_to_vector (loadf32 addr:$src))))),
            (COPY_TO_REGCLASS (MOVSSrm addr:$src), VR128)>;
  def : Pat<(v4f32 (scalar_to_vector (loadf32 addr:$src))),
            (COPY_TO_REGCLASS (MOVSSrm addr:$src), VR128)>;
  def : Pat<(v4f32 (X86vzmovl (loadv4f32 addr:$src))),
            (COPY_TO_REGCLASS (MOVSSrm addr:$src), VR128)>;
  }

  // Extract and store.
  def : Pat<(store (f32 (extractelt (v4f32 VR128:$src), (iPTR 0))),
                   addr:$dst),
            (MOVSSmr addr:$dst, (COPY_TO_REGCLASS VR128:$src, FR32))>;

  // Shuffle with MOVSS
  def : Pat<(v4i32 (X86Movss VR128:$src1, VR128:$src2)),
            (MOVSSrr VR128:$src1, (COPY_TO_REGCLASS VR128:$src2, FR32))>;
  def : Pat<(v4f32 (X86Movss VR128:$src1, VR128:$src2)),
            (MOVSSrr VR128:$src1, (COPY_TO_REGCLASS VR128:$src2, FR32))>;
}

let Predicates = [UseSSE2] in {
  let Predicates = [NoSSE41], AddedComplexity = 15 in {
  // Move scalar to XMM zero-extended, zeroing a VR128 then do a
  // MOVSD to the lower bits.
  def : Pat<(v2f64 (X86vzmovl (v2f64 (scalar_to_vector FR64:$src)))),
            (MOVSDrr (v2f64 (V_SET0)), FR64:$src)>;
  }

  let AddedComplexity = 20 in {
  // MOVSDrm already zeros the high parts of the register.
  def : Pat<(v2f64 (X86vzmovl (v2f64 (scalar_to_vector (loadf64 addr:$src))))),
            (COPY_TO_REGCLASS (MOVSDrm addr:$src), VR128)>;
  def : Pat<(v2f64 (scalar_to_vector (loadf64 addr:$src))),
            (COPY_TO_REGCLASS (MOVSDrm addr:$src), VR128)>;
  def : Pat<(v2f64 (X86vzmovl (loadv2f64 addr:$src))),
            (COPY_TO_REGCLASS (MOVSDrm addr:$src), VR128)>;
  def : Pat<(v2f64 (X86vzmovl (bc_v2f64 (loadv4f32 addr:$src)))),
            (COPY_TO_REGCLASS (MOVSDrm addr:$src), VR128)>;
  def : Pat<(v2f64 (X86vzload addr:$src)),
            (COPY_TO_REGCLASS (MOVSDrm addr:$src), VR128)>;
  }

  // Extract and store.
  def : Pat<(store (f64 (extractelt (v2f64 VR128:$src), (iPTR 0))),
                   addr:$dst),
            (MOVSDmr addr:$dst, (COPY_TO_REGCLASS VR128:$src, FR64))>;

  // Shuffle with MOVSD
  def : Pat<(v2i64 (X86Movsd VR128:$src1, VR128:$src2)),
            (MOVSDrr VR128:$src1, (COPY_TO_REGCLASS VR128:$src2, FR64))>;
  def : Pat<(v2f64 (X86Movsd VR128:$src1, VR128:$src2)),
            (MOVSDrr VR128:$src1, (COPY_TO_REGCLASS VR128:$src2, FR64))>;
  def : Pat<(v4f32 (X86Movsd VR128:$src1, VR128:$src2)),
            (MOVSDrr VR128:$src1, (COPY_TO_REGCLASS VR128:$src2, FR64))>;
  def : Pat<(v4i32 (X86Movsd VR128:$src1, VR128:$src2)),
            (MOVSDrr VR128:$src1, (COPY_TO_REGCLASS VR128:$src2, FR64))>;

  // FIXME: Instead of an X86Movlps there should be an X86Movsd here, the
  // problem is during lowering, where it's not possible to recognize the fold
  // because it has two uses through a bitcast. One use disappears at isel time
  // and the fold opportunity reappears.
  def : Pat<(v2f64 (X86Movlpd VR128:$src1, VR128:$src2)),
            (MOVSDrr VR128:$src1, (COPY_TO_REGCLASS VR128:$src2, FR64))>;
  def : Pat<(v2i64 (X86Movlpd VR128:$src1, VR128:$src2)),
            (MOVSDrr VR128:$src1, (COPY_TO_REGCLASS VR128:$src2, FR64))>;
  def : Pat<(v4f32 (X86Movlps VR128:$src1, VR128:$src2)),
            (MOVSDrr VR128:$src1, (COPY_TO_REGCLASS VR128:$src2, FR64))>;
  def : Pat<(v4i32 (X86Movlps VR128:$src1, VR128:$src2)),
            (MOVSDrr VR128:$src1, (COPY_TO_REGCLASS VR128:$src2, FR64))>;
}

//===----------------------------------------------------------------------===//
// SSE 1 & 2 - Move Aligned/Unaligned FP Instructions
//===----------------------------------------------------------------------===//

multiclass sse12_mov_packed<bits<8> opc, RegisterClass RC,
                            X86MemOperand x86memop, PatFrag ld_frag,
                            string asm, Domain d,
                            OpndItins itins,
                            bit IsReMaterializable = 1> {
let hasSideEffects = 0 in
  def rr : PI<opc, MRMSrcReg, (outs RC:$dst), (ins RC:$src),
              !strconcat(asm, "\t{$src, $dst|$dst, $src}"), [], itins.rr, d>,
              Sched<[WriteFShuffle]>;
let canFoldAsLoad = 1, isReMaterializable = IsReMaterializable in
  def rm : PI<opc, MRMSrcMem, (outs RC:$dst), (ins x86memop:$src),
              !strconcat(asm, "\t{$src, $dst|$dst, $src}"),
              [(set RC:$dst, (ld_frag addr:$src))], itins.rm, d>,
              Sched<[WriteLoad]>;
}

let Predicates = [HasAVX, NoVLX] in {
defm VMOVAPS : sse12_mov_packed<0x28, VR128, f128mem, alignedloadv4f32,
                                "movaps", SSEPackedSingle, SSE_MOVA_ITINS>,
                                PS, VEX;
defm VMOVAPD : sse12_mov_packed<0x28, VR128, f128mem, alignedloadv2f64,
                                "movapd", SSEPackedDouble, SSE_MOVA_ITINS>,
                                PD, VEX;
defm VMOVUPS : sse12_mov_packed<0x10, VR128, f128mem, loadv4f32,
                                "movups", SSEPackedSingle, SSE_MOVU_ITINS>,
                                PS, VEX;
defm VMOVUPD : sse12_mov_packed<0x10, VR128, f128mem, loadv2f64,
                                "movupd", SSEPackedDouble, SSE_MOVU_ITINS, 0>,
                                PD, VEX;

defm VMOVAPSY : sse12_mov_packed<0x28, VR256, f256mem, alignedloadv8f32,
                                 "movaps", SSEPackedSingle, SSE_MOVA_ITINS>,
                                 PS, VEX, VEX_L;
defm VMOVAPDY : sse12_mov_packed<0x28, VR256, f256mem, alignedloadv4f64,
                                 "movapd", SSEPackedDouble, SSE_MOVA_ITINS>,
                                 PD, VEX, VEX_L;
defm VMOVUPSY : sse12_mov_packed<0x10, VR256, f256mem, loadv8f32,
                                 "movups", SSEPackedSingle, SSE_MOVU_ITINS>,
                                 PS, VEX, VEX_L;
defm VMOVUPDY : sse12_mov_packed<0x10, VR256, f256mem, loadv4f64,
                                 "movupd", SSEPackedDouble, SSE_MOVU_ITINS, 0>,
                                 PD, VEX, VEX_L;
}
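// For reference (informal restatement of the defms above): each
// sse12_mov_packed instantiation produces an "rr" register form with no
// pattern and an "rm" load form. The aligned variants match the
// alignedload* fragments, so e.g. VMOVAPSrm is only selected when the load
// is known to be suitably aligned, while VMOVUPSrm matches plain loadv4f32.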
let Predicates = [UseSSE1] in {
defm MOVAPS : sse12_mov_packed<0x28, VR128, f128mem, alignedloadv4f32,
                               "movaps", SSEPackedSingle, SSE_MOVA_ITINS>,
                               PS;
defm MOVUPS : sse12_mov_packed<0x10, VR128, f128mem, loadv4f32,
                               "movups", SSEPackedSingle, SSE_MOVU_ITINS>,
                               PS;
}
let Predicates = [UseSSE2] in {
defm MOVAPD : sse12_mov_packed<0x28, VR128, f128mem, alignedloadv2f64,
                               "movapd", SSEPackedDouble, SSE_MOVA_ITINS>,
                               PD;
defm MOVUPD : sse12_mov_packed<0x10, VR128, f128mem, loadv2f64,
                               "movupd", SSEPackedDouble, SSE_MOVU_ITINS, 0>,
                               PD;
}

let SchedRW = [WriteStore], Predicates = [HasAVX, NoVLX] in {
def VMOVAPSmr : VPSI<0x29, MRMDestMem, (outs), (ins f128mem:$dst, VR128:$src),
                     "movaps\t{$src, $dst|$dst, $src}",
                     [(alignedstore (v4f32 VR128:$src), addr:$dst)],
                     IIC_SSE_MOVA_P_MR>, VEX;
def VMOVAPDmr : VPDI<0x29, MRMDestMem, (outs), (ins f128mem:$dst, VR128:$src),
                     "movapd\t{$src, $dst|$dst, $src}",
                     [(alignedstore (v2f64 VR128:$src), addr:$dst)],
                     IIC_SSE_MOVA_P_MR>, VEX;
def VMOVUPSmr : VPSI<0x11, MRMDestMem, (outs), (ins f128mem:$dst, VR128:$src),
                     "movups\t{$src, $dst|$dst, $src}",
                     [(store (v4f32 VR128:$src), addr:$dst)],
                     IIC_SSE_MOVU_P_MR>, VEX;
def VMOVUPDmr : VPDI<0x11, MRMDestMem, (outs), (ins f128mem:$dst, VR128:$src),
                     "movupd\t{$src, $dst|$dst, $src}",
                     [(store (v2f64 VR128:$src), addr:$dst)],
                     IIC_SSE_MOVU_P_MR>, VEX;
def VMOVAPSYmr : VPSI<0x29, MRMDestMem, (outs), (ins f256mem:$dst, VR256:$src),
                      "movaps\t{$src, $dst|$dst, $src}",
                      [(alignedstore256 (v8f32 VR256:$src), addr:$dst)],
                      IIC_SSE_MOVA_P_MR>, VEX, VEX_L;
def VMOVAPDYmr : VPDI<0x29, MRMDestMem, (outs), (ins f256mem:$dst, VR256:$src),
                      "movapd\t{$src, $dst|$dst, $src}",
                      [(alignedstore256 (v4f64 VR256:$src), addr:$dst)],
                      IIC_SSE_MOVA_P_MR>, VEX, VEX_L;
def VMOVUPSYmr : VPSI<0x11, MRMDestMem, (outs), (ins f256mem:$dst, VR256:$src),
                      "movups\t{$src, $dst|$dst, $src}",
                      [(store (v8f32 VR256:$src), addr:$dst)],
                      IIC_SSE_MOVU_P_MR>, VEX, VEX_L;
def VMOVUPDYmr : VPDI<0x11, MRMDestMem, (outs), (ins f256mem:$dst, VR256:$src),
                      "movupd\t{$src, $dst|$dst, $src}",
                      [(store (v4f64 VR256:$src), addr:$dst)],
                      IIC_SSE_MOVU_P_MR>, VEX, VEX_L;
} // SchedRW

// For disassembler
let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0,
    SchedRW = [WriteFShuffle] in {
  def VMOVAPSrr_REV : VPSI<0x29, MRMDestReg, (outs VR128:$dst),
                           (ins VR128:$src),
                           "movaps\t{$src, $dst|$dst, $src}", [],
                           IIC_SSE_MOVA_P_RR>, VEX;
  def VMOVAPDrr_REV : VPDI<0x29, MRMDestReg, (outs VR128:$dst),
                           (ins VR128:$src),
                           "movapd\t{$src, $dst|$dst, $src}", [],
                           IIC_SSE_MOVA_P_RR>, VEX;
  def VMOVUPSrr_REV : VPSI<0x11, MRMDestReg, (outs VR128:$dst),
                           (ins VR128:$src),
                           "movups\t{$src, $dst|$dst, $src}", [],
                           IIC_SSE_MOVU_P_RR>, VEX;
  def VMOVUPDrr_REV : VPDI<0x11, MRMDestReg, (outs VR128:$dst),
                           (ins VR128:$src),
                           "movupd\t{$src, $dst|$dst, $src}", [],
                           IIC_SSE_MOVU_P_RR>, VEX;
  def VMOVAPSYrr_REV : VPSI<0x29, MRMDestReg, (outs VR256:$dst),
                            (ins VR256:$src),
                            "movaps\t{$src, $dst|$dst, $src}", [],
                            IIC_SSE_MOVA_P_RR>, VEX, VEX_L;
  def VMOVAPDYrr_REV : VPDI<0x29, MRMDestReg, (outs VR256:$dst),
                            (ins VR256:$src),
                            "movapd\t{$src, $dst|$dst, $src}", [],
                            IIC_SSE_MOVA_P_RR>, VEX, VEX_L;
  def VMOVUPSYrr_REV : VPSI<0x11, MRMDestReg, (outs VR256:$dst),
                            (ins VR256:$src),
                            "movups\t{$src, $dst|$dst, $src}", [],
                            IIC_SSE_MOVU_P_RR>, VEX, VEX_L;
  def VMOVUPDYrr_REV : VPDI<0x11, MRMDestReg, (outs VR256:$dst),
                            (ins VR256:$src),
                            "movupd\t{$src, $dst|$dst, $src}", [],
                            IIC_SSE_MOVU_P_RR>, VEX, VEX_L;
}

def : Pat<(int_x86_avx_storeu_ps_256 addr:$dst, VR256:$src),
          (VMOVUPSYmr addr:$dst, VR256:$src)>;
def : Pat<(int_x86_avx_storeu_pd_256 addr:$dst, VR256:$src),
          (VMOVUPDYmr addr:$dst, VR256:$src)>;

let SchedRW = [WriteStore] in {
def MOVAPSmr : PSI<0x29, MRMDestMem, (outs), (ins f128mem:$dst, VR128:$src),
                   "movaps\t{$src, $dst|$dst, $src}",
                   [(alignedstore (v4f32 VR128:$src), addr:$dst)],
                   IIC_SSE_MOVA_P_MR>;
def MOVAPDmr : PDI<0x29, MRMDestMem, (outs), (ins f128mem:$dst, VR128:$src),
                   "movapd\t{$src, $dst|$dst, $src}",
                   [(alignedstore (v2f64 VR128:$src), addr:$dst)],
                   IIC_SSE_MOVA_P_MR>;
def MOVUPSmr : PSI<0x11, MRMDestMem, (outs), (ins f128mem:$dst, VR128:$src),
                   "movups\t{$src, $dst|$dst, $src}",
                   [(store (v4f32 VR128:$src), addr:$dst)],
                   IIC_SSE_MOVU_P_MR>;
def MOVUPDmr : PDI<0x11, MRMDestMem, (outs), (ins f128mem:$dst, VR128:$src),
                   "movupd\t{$src, $dst|$dst, $src}",
                   [(store (v2f64 VR128:$src), addr:$dst)],
                   IIC_SSE_MOVU_P_MR>;
} // SchedRW

// For disassembler
let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0,
    SchedRW = [WriteFShuffle] in {
  def MOVAPSrr_REV : PSI<0x29, MRMDestReg, (outs VR128:$dst), (ins VR128:$src),
                         "movaps\t{$src, $dst|$dst, $src}", [],
                         IIC_SSE_MOVA_P_RR>;
  def MOVAPDrr_REV : PDI<0x29, MRMDestReg, (outs VR128:$dst), (ins VR128:$src),
                         "movapd\t{$src, $dst|$dst, $src}", [],
                         IIC_SSE_MOVA_P_RR>;
  def MOVUPSrr_REV : PSI<0x11, MRMDestReg, (outs VR128:$dst), (ins VR128:$src),
                         "movups\t{$src, $dst|$dst, $src}", [],
                         IIC_SSE_MOVU_P_RR>;
  def MOVUPDrr_REV : PDI<0x11, MRMDestReg, (outs VR128:$dst), (ins VR128:$src),
                         "movupd\t{$src, $dst|$dst, $src}", [],
                         IIC_SSE_MOVU_P_RR>;
}

let Predicates = [HasAVX] in {
  def : Pat<(int_x86_sse_storeu_ps addr:$dst, VR128:$src),
            (VMOVUPSmr addr:$dst, VR128:$src)>;
  def : Pat<(int_x86_sse2_storeu_pd addr:$dst, VR128:$src),
            (VMOVUPDmr addr:$dst, VR128:$src)>;
}

let Predicates = [UseSSE1] in
  def : Pat<(int_x86_sse_storeu_ps addr:$dst, VR128:$src),
            (MOVUPSmr addr:$dst, VR128:$src)>;
let Predicates = [UseSSE2] in
  def : Pat<(int_x86_sse2_storeu_pd addr:$dst, VR128:$src),
            (MOVUPDmr addr:$dst, VR128:$src)>;
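// For illustration (assuming the usual builtin lowering, e.g. _mm_storeu_ps
// producing int_x86_sse_storeu_ps): the intrinsic patterns above map the
// unaligned-store intrinsics straight onto MOVUPSmr/MOVUPDmr (or their VEX
// forms), so no dedicated instruction definition is needed for them.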
// Use vmovaps/vmovups for AVX integer load/store.
let Predicates = [HasAVX, NoVLX] in {
  // 128-bit load/store
  def : Pat<(alignedloadv2i64 addr:$src),
            (VMOVAPSrm addr:$src)>;
  def : Pat<(loadv2i64 addr:$src),
            (VMOVUPSrm addr:$src)>;

  def : Pat<(alignedstore (v2i64 VR128:$src), addr:$dst),
            (VMOVAPSmr addr:$dst, VR128:$src)>;
  def : Pat<(alignedstore (v4i32 VR128:$src), addr:$dst),
            (VMOVAPSmr addr:$dst, VR128:$src)>;
  def : Pat<(alignedstore (v8i16 VR128:$src), addr:$dst),
            (VMOVAPSmr addr:$dst, VR128:$src)>;
  def : Pat<(alignedstore (v16i8 VR128:$src), addr:$dst),
            (VMOVAPSmr addr:$dst, VR128:$src)>;
  def : Pat<(store (v2i64 VR128:$src), addr:$dst),
            (VMOVUPSmr addr:$dst, VR128:$src)>;
  def : Pat<(store (v4i32 VR128:$src), addr:$dst),
            (VMOVUPSmr addr:$dst, VR128:$src)>;
  def : Pat<(store (v8i16 VR128:$src), addr:$dst),
            (VMOVUPSmr addr:$dst, VR128:$src)>;
  def : Pat<(store (v16i8 VR128:$src), addr:$dst),
            (VMOVUPSmr addr:$dst, VR128:$src)>;

  // 256-bit load/store
  def : Pat<(alignedloadv4i64 addr:$src),
            (VMOVAPSYrm addr:$src)>;
  def : Pat<(loadv4i64 addr:$src),
            (VMOVUPSYrm addr:$src)>;
  def : Pat<(alignedstore256 (v4i64 VR256:$src), addr:$dst),
            (VMOVAPSYmr addr:$dst, VR256:$src)>;
  def : Pat<(alignedstore256 (v8i32 VR256:$src), addr:$dst),
            (VMOVAPSYmr addr:$dst, VR256:$src)>;
  def : Pat<(alignedstore256 (v16i16 VR256:$src), addr:$dst),
            (VMOVAPSYmr addr:$dst, VR256:$src)>;
  def : Pat<(alignedstore256 (v32i8 VR256:$src), addr:$dst),
            (VMOVAPSYmr addr:$dst, VR256:$src)>;
  def : Pat<(store (v4i64 VR256:$src), addr:$dst),
            (VMOVUPSYmr addr:$dst, VR256:$src)>;
  def : Pat<(store (v8i32 VR256:$src), addr:$dst),
            (VMOVUPSYmr addr:$dst, VR256:$src)>;
  def : Pat<(store (v16i16 VR256:$src), addr:$dst),
            (VMOVUPSYmr addr:$dst, VR256:$src)>;
  def : Pat<(store (v32i8 VR256:$src), addr:$dst),
            (VMOVUPSYmr addr:$dst, VR256:$src)>;

  // Special patterns for storing subvector extracts of lower 128-bits.
  // It's cheaper to just use VMOVAPS/VMOVUPS instead of VEXTRACTF128mr.
  def : Pat<(alignedstore (v2f64 (extract_subvector
                                  (v4f64 VR256:$src), (iPTR 0))), addr:$dst),
            (VMOVAPDmr addr:$dst, (v2f64 (EXTRACT_SUBREG VR256:$src,sub_xmm)))>;
  def : Pat<(alignedstore (v4f32 (extract_subvector
                                  (v8f32 VR256:$src), (iPTR 0))), addr:$dst),
            (VMOVAPSmr addr:$dst, (v4f32 (EXTRACT_SUBREG VR256:$src,sub_xmm)))>;
  def : Pat<(alignedstore (v2i64 (extract_subvector
                                  (v4i64 VR256:$src), (iPTR 0))), addr:$dst),
            (VMOVAPDmr addr:$dst, (v2i64 (EXTRACT_SUBREG VR256:$src,sub_xmm)))>;
  def : Pat<(alignedstore (v4i32 (extract_subvector
                                  (v8i32 VR256:$src), (iPTR 0))), addr:$dst),
            (VMOVAPSmr addr:$dst, (v4i32 (EXTRACT_SUBREG VR256:$src,sub_xmm)))>;
  def : Pat<(alignedstore (v8i16 (extract_subvector
                                  (v16i16 VR256:$src), (iPTR 0))), addr:$dst),
            (VMOVAPSmr addr:$dst, (v8i16 (EXTRACT_SUBREG VR256:$src,sub_xmm)))>;
  def : Pat<(alignedstore (v16i8 (extract_subvector
                                  (v32i8 VR256:$src), (iPTR 0))), addr:$dst),
            (VMOVAPSmr addr:$dst, (v16i8 (EXTRACT_SUBREG VR256:$src,sub_xmm)))>;

  def : Pat<(store (v2f64 (extract_subvector
                           (v4f64 VR256:$src), (iPTR 0))), addr:$dst),
            (VMOVUPDmr addr:$dst, (v2f64 (EXTRACT_SUBREG VR256:$src,sub_xmm)))>;
  def : Pat<(store (v4f32 (extract_subvector
                           (v8f32 VR256:$src), (iPTR 0))), addr:$dst),
            (VMOVUPSmr addr:$dst, (v4f32 (EXTRACT_SUBREG VR256:$src,sub_xmm)))>;
  def : Pat<(store (v2i64 (extract_subvector
                           (v4i64 VR256:$src), (iPTR 0))), addr:$dst),
            (VMOVUPDmr addr:$dst, (v2i64 (EXTRACT_SUBREG VR256:$src,sub_xmm)))>;
  def : Pat<(store (v4i32 (extract_subvector
                           (v8i32 VR256:$src), (iPTR 0))), addr:$dst),
            (VMOVUPSmr addr:$dst, (v4i32 (EXTRACT_SUBREG VR256:$src,sub_xmm)))>;
  def : Pat<(store (v8i16 (extract_subvector
                           (v16i16 VR256:$src), (iPTR 0))), addr:$dst),
            (VMOVUPSmr addr:$dst, (v8i16 (EXTRACT_SUBREG VR256:$src,sub_xmm)))>;
  def : Pat<(store (v16i8 (extract_subvector
                           (v32i8 VR256:$src), (iPTR 0))), addr:$dst),
            (VMOVUPSmr addr:$dst, (v16i8 (EXTRACT_SUBREG VR256:$src,sub_xmm)))>;
}

// Use movaps / movups for SSE integer load / store (one byte shorter).
// The instructions selected below are then converted to MOVDQA/MOVDQU
// during the SSE domain pass.
let Predicates = [UseSSE1] in {
  def : Pat<(alignedloadv2i64 addr:$src),
            (MOVAPSrm addr:$src)>;
  def : Pat<(loadv2i64 addr:$src),
            (MOVUPSrm addr:$src)>;

  def : Pat<(alignedstore (v2i64 VR128:$src), addr:$dst),
            (MOVAPSmr addr:$dst, VR128:$src)>;
  def : Pat<(alignedstore (v4i32 VR128:$src), addr:$dst),
            (MOVAPSmr addr:$dst, VR128:$src)>;
  def : Pat<(alignedstore (v8i16 VR128:$src), addr:$dst),
            (MOVAPSmr addr:$dst, VR128:$src)>;
  def : Pat<(alignedstore (v16i8 VR128:$src), addr:$dst),
            (MOVAPSmr addr:$dst, VR128:$src)>;
  def : Pat<(store (v2i64 VR128:$src), addr:$dst),
            (MOVUPSmr addr:$dst, VR128:$src)>;
  def : Pat<(store (v4i32 VR128:$src), addr:$dst),
            (MOVUPSmr addr:$dst, VR128:$src)>;
  def : Pat<(store (v8i16 VR128:$src), addr:$dst),
            (MOVUPSmr addr:$dst, VR128:$src)>;
  def : Pat<(store (v16i8 VR128:$src), addr:$dst),
            (MOVUPSmr addr:$dst, VR128:$src)>;
}
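// For illustration (informal restatement of the comment above): under plain
// SSE a store such as
//   (alignedstore (v4i32 VR128:$src), addr:$dst)
// selects MOVAPSmr, one byte shorter than MOVDQAmr; the SSE execution-domain
// pass may later rewrite it to MOVDQAmr when the surrounding code stays in
// the integer domain.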
// Alias instruction to load FR32 or FR64 from f128mem using movaps. Upper
// bits are disregarded. FIXME: Set encoding to pseudo!
let canFoldAsLoad = 1, isReMaterializable = 1, SchedRW = [WriteLoad] in {
let isCodeGenOnly = 1 in {
  def FsVMOVAPSrm : VPSI<0x28, MRMSrcMem, (outs FR32:$dst), (ins f128mem:$src),
                         "movaps\t{$src, $dst|$dst, $src}",
                         [(set FR32:$dst, (alignedloadfsf32 addr:$src))],
                         IIC_SSE_MOVA_P_RM>, VEX;
  def FsVMOVAPDrm : VPDI<0x28, MRMSrcMem, (outs FR64:$dst), (ins f128mem:$src),
                         "movapd\t{$src, $dst|$dst, $src}",
                         [(set FR64:$dst, (alignedloadfsf64 addr:$src))],
                         IIC_SSE_MOVA_P_RM>, VEX;
  def FsMOVAPSrm : PSI<0x28, MRMSrcMem, (outs FR32:$dst), (ins f128mem:$src),
                       "movaps\t{$src, $dst|$dst, $src}",
                       [(set FR32:$dst, (alignedloadfsf32 addr:$src))],
                       IIC_SSE_MOVA_P_RM>;
  def FsMOVAPDrm : PDI<0x28, MRMSrcMem, (outs FR64:$dst), (ins f128mem:$src),
                       "movapd\t{$src, $dst|$dst, $src}",
                       [(set FR64:$dst, (alignedloadfsf64 addr:$src))],
                       IIC_SSE_MOVA_P_RM>;
}
}

//===----------------------------------------------------------------------===//
// SSE 1 & 2 - Move Low packed FP Instructions
//===----------------------------------------------------------------------===//

multiclass sse12_mov_hilo_packed_base<bits<8>opc, SDNode psnode, SDNode pdnode,
                                      string base_opc, string asm_opr,
                                      InstrItinClass itin> {
  def PSrm : PI<opc, MRMSrcMem,
                (outs VR128:$dst), (ins VR128:$src1, f64mem:$src2),
                !strconcat(base_opc, "s", asm_opr),
                [(set VR128:$dst,
                      (psnode VR128:$src1,
                              (bc_v4f32 (v2f64 (scalar_to_vector (loadf64 addr:$src2))))))],
                itin, SSEPackedSingle>, PS,
                Sched<[WriteFShuffleLd, ReadAfterLd]>;

  def PDrm : PI<opc, MRMSrcMem,
                (outs VR128:$dst), (ins VR128:$src1, f64mem:$src2),
                !strconcat(base_opc, "d", asm_opr),
                [(set VR128:$dst, (v2f64 (pdnode VR128:$src1,
                                          (scalar_to_vector (loadf64 addr:$src2)))))],
                itin, SSEPackedDouble>, PD,
                Sched<[WriteFShuffleLd, ReadAfterLd]>;

}

multiclass sse12_mov_hilo_packed<bits<8>opc, SDNode psnode, SDNode pdnode,
                                 string base_opc, InstrItinClass itin> {
  let Predicates = [UseAVX] in
    defm V#NAME : sse12_mov_hilo_packed_base<opc, psnode, pdnode, base_opc,
                                             "\t{$src2, $src1, $dst|$dst, $src1, $src2}",
                                             itin>, VEX_4V;

  let Constraints = "$src1 = $dst" in
    defm NAME : sse12_mov_hilo_packed_base<opc, psnode, pdnode, base_opc,
                                           "\t{$src2, $dst|$dst, $src2}",
                                           itin>;
}

let AddedComplexity = 20 in {
  defm MOVL : sse12_mov_hilo_packed<0x12, X86Movlps, X86Movlpd, "movlp",
                                    IIC_SSE_MOV_LH>;
}
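// For reference (informal): the MOVL defm above expands to MOVLPSrm/MOVLPDrm
// and their VEX forms VMOVLPSrm/VMOVLPDrm; each loads 64 bits from $src2 into
// the low half of the result while the high half is passed through from
// $src1.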
VR128:$src), 1198 (iPTR 0))), addr:$dst)], 1199 IIC_SSE_MOV_LH>; 1200} // SchedRW 1201 1202let Predicates = [UseAVX] in { 1203 // Shuffle with VMOVLPS 1204 def : Pat<(v4f32 (X86Movlps VR128:$src1, (load addr:$src2))), 1205 (VMOVLPSrm VR128:$src1, addr:$src2)>; 1206 def : Pat<(v4i32 (X86Movlps VR128:$src1, (load addr:$src2))), 1207 (VMOVLPSrm VR128:$src1, addr:$src2)>; 1208 1209 // Shuffle with VMOVLPD 1210 def : Pat<(v2f64 (X86Movlpd VR128:$src1, (load addr:$src2))), 1211 (VMOVLPDrm VR128:$src1, addr:$src2)>; 1212 def : Pat<(v2i64 (X86Movlpd VR128:$src1, (load addr:$src2))), 1213 (VMOVLPDrm VR128:$src1, addr:$src2)>; 1214 def : Pat<(v2f64 (X86Movsd VR128:$src1, 1215 (v2f64 (scalar_to_vector (loadf64 addr:$src2))))), 1216 (VMOVLPDrm VR128:$src1, addr:$src2)>; 1217 1218 // Store patterns 1219 def : Pat<(store (v4f32 (X86Movlps (load addr:$src1), VR128:$src2)), 1220 addr:$src1), 1221 (VMOVLPSmr addr:$src1, VR128:$src2)>; 1222 def : Pat<(store (v4i32 (X86Movlps 1223 (bc_v4i32 (loadv2i64 addr:$src1)), VR128:$src2)), addr:$src1), 1224 (VMOVLPSmr addr:$src1, VR128:$src2)>; 1225 def : Pat<(store (v2f64 (X86Movlpd (load addr:$src1), VR128:$src2)), 1226 addr:$src1), 1227 (VMOVLPDmr addr:$src1, VR128:$src2)>; 1228 def : Pat<(store (v2i64 (X86Movlpd (load addr:$src1), VR128:$src2)), 1229 addr:$src1), 1230 (VMOVLPDmr addr:$src1, VR128:$src2)>; 1231} 1232 1233let Predicates = [UseSSE1] in { 1234 // (store (vector_shuffle (load addr), v2, <4, 5, 2, 3>), addr) using MOVLPS 1235 def : Pat<(store (i64 (extractelt (bc_v2i64 (v4f32 VR128:$src2)), 1236 (iPTR 0))), addr:$src1), 1237 (MOVLPSmr addr:$src1, VR128:$src2)>; 1238 1239 // Shuffle with MOVLPS 1240 def : Pat<(v4f32 (X86Movlps VR128:$src1, (load addr:$src2))), 1241 (MOVLPSrm VR128:$src1, addr:$src2)>; 1242 def : Pat<(v4i32 (X86Movlps VR128:$src1, (load addr:$src2))), 1243 (MOVLPSrm VR128:$src1, addr:$src2)>; 1244 def : Pat<(X86Movlps VR128:$src1, 1245 (bc_v4f32 (v2i64 (scalar_to_vector (loadi64 addr:$src2))))), 1246 (MOVLPSrm VR128:$src1, addr:$src2)>; 1247 1248 // Store patterns 1249 def : Pat<(store (v4f32 (X86Movlps (load addr:$src1), VR128:$src2)), 1250 addr:$src1), 1251 (MOVLPSmr addr:$src1, VR128:$src2)>; 1252 def : Pat<(store (v4i32 (X86Movlps 1253 (bc_v4i32 (loadv2i64 addr:$src1)), VR128:$src2)), 1254 addr:$src1), 1255 (MOVLPSmr addr:$src1, VR128:$src2)>; 1256} 1257 1258let Predicates = [UseSSE2] in { 1259 // Shuffle with MOVLPD 1260 def : Pat<(v2f64 (X86Movlpd VR128:$src1, (load addr:$src2))), 1261 (MOVLPDrm VR128:$src1, addr:$src2)>; 1262 def : Pat<(v2i64 (X86Movlpd VR128:$src1, (load addr:$src2))), 1263 (MOVLPDrm VR128:$src1, addr:$src2)>; 1264 def : Pat<(v2f64 (X86Movsd VR128:$src1, 1265 (v2f64 (scalar_to_vector (loadf64 addr:$src2))))), 1266 (MOVLPDrm VR128:$src1, addr:$src2)>; 1267 1268 // Store patterns 1269 def : Pat<(store (v2f64 (X86Movlpd (load addr:$src1), VR128:$src2)), 1270 addr:$src1), 1271 (MOVLPDmr addr:$src1, VR128:$src2)>; 1272 def : Pat<(store (v2i64 (X86Movlpd (load addr:$src1), VR128:$src2)), 1273 addr:$src1), 1274 (MOVLPDmr addr:$src1, VR128:$src2)>; 1275} 1276 1277//===----------------------------------------------------------------------===// 1278// SSE 1 & 2 - Move Hi packed FP Instructions 1279//===----------------------------------------------------------------------===// 1280 1281let AddedComplexity = 20 in { 1282 defm MOVH : sse12_mov_hilo_packed<0x16, X86Movlhps, X86Movlhpd, "movhp", 1283 IIC_SSE_MOV_LH>; 1284} 1285 1286let SchedRW = [WriteStore] in { 1287// v2f64 extract element 1 is always custom lowered to unpack 
high to low 1288// and extract element 0 so the non-store version isn't too horrible. 1289let Predicates = [UseAVX] in { 1290def VMOVHPSmr : VPSI<0x17, MRMDestMem, (outs), (ins f64mem:$dst, VR128:$src), 1291 "movhps\t{$src, $dst|$dst, $src}", 1292 [(store (f64 (extractelt 1293 (X86Unpckh (bc_v2f64 (v4f32 VR128:$src)), 1294 (bc_v2f64 (v4f32 VR128:$src))), 1295 (iPTR 0))), addr:$dst)], IIC_SSE_MOV_LH>, VEX; 1296def VMOVHPDmr : VPDI<0x17, MRMDestMem, (outs), (ins f64mem:$dst, VR128:$src), 1297 "movhpd\t{$src, $dst|$dst, $src}", 1298 [(store (f64 (extractelt 1299 (v2f64 (X86Unpckh VR128:$src, VR128:$src)), 1300 (iPTR 0))), addr:$dst)], IIC_SSE_MOV_LH>, VEX; 1301} // UseAVX 1302def MOVHPSmr : PSI<0x17, MRMDestMem, (outs), (ins f64mem:$dst, VR128:$src), 1303 "movhps\t{$src, $dst|$dst, $src}", 1304 [(store (f64 (extractelt 1305 (X86Unpckh (bc_v2f64 (v4f32 VR128:$src)), 1306 (bc_v2f64 (v4f32 VR128:$src))), 1307 (iPTR 0))), addr:$dst)], IIC_SSE_MOV_LH>; 1308def MOVHPDmr : PDI<0x17, MRMDestMem, (outs), (ins f64mem:$dst, VR128:$src), 1309 "movhpd\t{$src, $dst|$dst, $src}", 1310 [(store (f64 (extractelt 1311 (v2f64 (X86Unpckh VR128:$src, VR128:$src)), 1312 (iPTR 0))), addr:$dst)], IIC_SSE_MOV_LH>; 1313} // SchedRW 1314 1315let Predicates = [UseAVX] in { 1316 // VMOVHPS patterns 1317 def : Pat<(X86Movlhps VR128:$src1, 1318 (bc_v4f32 (v2i64 (scalar_to_vector (loadi64 addr:$src2))))), 1319 (VMOVHPSrm VR128:$src1, addr:$src2)>; 1320 def : Pat<(X86Movlhps VR128:$src1, 1321 (bc_v4i32 (v2i64 (X86vzload addr:$src2)))), 1322 (VMOVHPSrm VR128:$src1, addr:$src2)>; 1323 1324 // VMOVHPD patterns 1325 1326 // FIXME: Instead of X86Unpckl, there should be a X86Movlhpd here, the problem 1327 // is during lowering, where it's not possible to recognize the load fold 1328 // cause it has two uses through a bitcast. One use disappears at isel time 1329 // and the fold opportunity reappears. 1330 def : Pat<(v2f64 (X86Unpckl VR128:$src1, 1331 (scalar_to_vector (loadf64 addr:$src2)))), 1332 (VMOVHPDrm VR128:$src1, addr:$src2)>; 1333 // Also handle an i64 load because that may get selected as a faster way to 1334 // load the data. 1335 def : Pat<(v2f64 (X86Unpckl VR128:$src1, 1336 (bc_v2f64 (v2i64 (scalar_to_vector (loadi64 addr:$src2)))))), 1337 (VMOVHPDrm VR128:$src1, addr:$src2)>; 1338 1339 def : Pat<(store (f64 (extractelt 1340 (v2f64 (X86VPermilpi VR128:$src, (i8 1))), 1341 (iPTR 0))), addr:$dst), 1342 (VMOVHPDmr addr:$dst, VR128:$src)>; 1343} 1344 1345let Predicates = [UseSSE1] in { 1346 // MOVHPS patterns 1347 def : Pat<(X86Movlhps VR128:$src1, 1348 (bc_v4f32 (v2i64 (scalar_to_vector (loadi64 addr:$src2))))), 1349 (MOVHPSrm VR128:$src1, addr:$src2)>; 1350 def : Pat<(X86Movlhps VR128:$src1, 1351 (bc_v4f32 (v2i64 (X86vzload addr:$src2)))), 1352 (MOVHPSrm VR128:$src1, addr:$src2)>; 1353} 1354 1355let Predicates = [UseSSE2] in { 1356 // MOVHPD patterns 1357 1358 // FIXME: Instead of X86Unpckl, there should be a X86Movlhpd here, the problem 1359 // is during lowering, where it's not possible to recognize the load fold 1360 // cause it has two uses through a bitcast. One use disappears at isel time 1361 // and the fold opportunity reappears. 1362 def : Pat<(v2f64 (X86Unpckl VR128:$src1, 1363 (scalar_to_vector (loadf64 addr:$src2)))), 1364 (MOVHPDrm VR128:$src1, addr:$src2)>; 1365 // Also handle an i64 load because that may get selected as a faster way to 1366 // load the data. 
let AddedComplexity = 20, Predicates = [UseAVX] in {
  def VMOVLHPSrr : VPSI<0x16, MRMSrcReg, (outs VR128:$dst),
                        (ins VR128:$src1, VR128:$src2),
                        "movlhps\t{$src2, $src1, $dst|$dst, $src1, $src2}",
                        [(set VR128:$dst,
                          (v4f32 (X86Movlhps VR128:$src1, VR128:$src2)))],
                        IIC_SSE_MOV_LH>,
                        VEX_4V, Sched<[WriteFShuffle]>;
  def VMOVHLPSrr : VPSI<0x12, MRMSrcReg, (outs VR128:$dst),
                        (ins VR128:$src1, VR128:$src2),
                        "movhlps\t{$src2, $src1, $dst|$dst, $src1, $src2}",
                        [(set VR128:$dst,
                          (v4f32 (X86Movhlps VR128:$src1, VR128:$src2)))],
                        IIC_SSE_MOV_LH>,
                        VEX_4V, Sched<[WriteFShuffle]>;
}
let Constraints = "$src1 = $dst", AddedComplexity = 20 in {
  def MOVLHPSrr : PSI<0x16, MRMSrcReg, (outs VR128:$dst),
                      (ins VR128:$src1, VR128:$src2),
                      "movlhps\t{$src2, $dst|$dst, $src2}",
                      [(set VR128:$dst,
                        (v4f32 (X86Movlhps VR128:$src1, VR128:$src2)))],
                      IIC_SSE_MOV_LH>, Sched<[WriteFShuffle]>;
  def MOVHLPSrr : PSI<0x12, MRMSrcReg, (outs VR128:$dst),
                      (ins VR128:$src1, VR128:$src2),
                      "movhlps\t{$src2, $dst|$dst, $src2}",
                      [(set VR128:$dst,
                        (v4f32 (X86Movhlps VR128:$src1, VR128:$src2)))],
                      IIC_SSE_MOV_LH>, Sched<[WriteFShuffle]>;
}

let Predicates = [UseAVX] in {
  // MOVLHPS patterns
  def : Pat<(v4i32 (X86Movlhps VR128:$src1, VR128:$src2)),
            (VMOVLHPSrr VR128:$src1, VR128:$src2)>;
  def : Pat<(v2i64 (X86Movlhps VR128:$src1, VR128:$src2)),
            (VMOVLHPSrr (v2i64 VR128:$src1), VR128:$src2)>;

  // MOVHLPS patterns
  def : Pat<(v4i32 (X86Movhlps VR128:$src1, VR128:$src2)),
            (VMOVHLPSrr VR128:$src1, VR128:$src2)>;
}

let Predicates = [UseSSE1] in {
  // MOVLHPS patterns
  def : Pat<(v4i32 (X86Movlhps VR128:$src1, VR128:$src2)),
            (MOVLHPSrr VR128:$src1, VR128:$src2)>;
  def : Pat<(v2i64 (X86Movlhps VR128:$src1, VR128:$src2)),
            (MOVLHPSrr (v2i64 VR128:$src1), VR128:$src2)>;

  // MOVHLPS patterns
  def : Pat<(v4i32 (X86Movhlps VR128:$src1, VR128:$src2)),
            (MOVHLPSrr VR128:$src1, VR128:$src2)>;
}

//===----------------------------------------------------------------------===//
// SSE 1 & 2 - Conversion Instructions
//===----------------------------------------------------------------------===//
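// Naming note for this section: the cvtt* mnemonics ("convert with
// truncation") always round toward zero, while the plain cvt* forms round
// according to the current rounding mode in MXCSR.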
def SSE_CVT_PD : OpndItins<
  IIC_SSE_CVT_PD_RR, IIC_SSE_CVT_PD_RM
>;

let Sched = WriteCvtI2F in
def SSE_CVT_PS : OpndItins<
  IIC_SSE_CVT_PS_RR, IIC_SSE_CVT_PS_RM
>;

let Sched = WriteCvtI2F in
def SSE_CVT_Scalar : OpndItins<
  IIC_SSE_CVT_Scalar_RR, IIC_SSE_CVT_Scalar_RM
>;

let Sched = WriteCvtF2I in
def SSE_CVT_SS2SI_32 : OpndItins<
  IIC_SSE_CVT_SS2SI32_RR, IIC_SSE_CVT_SS2SI32_RM
>;

let Sched = WriteCvtF2I in
def SSE_CVT_SS2SI_64 : OpndItins<
  IIC_SSE_CVT_SS2SI64_RR, IIC_SSE_CVT_SS2SI64_RM
>;

let Sched = WriteCvtF2I in
def SSE_CVT_SD2SI : OpndItins<
  IIC_SSE_CVT_SD2SI_RR, IIC_SSE_CVT_SD2SI_RM
>;

multiclass sse12_cvt_s<bits<8> opc, RegisterClass SrcRC, RegisterClass DstRC,
                       SDNode OpNode, X86MemOperand x86memop, PatFrag ld_frag,
                       string asm, OpndItins itins> {
  def rr : SI<opc, MRMSrcReg, (outs DstRC:$dst), (ins SrcRC:$src), asm,
              [(set DstRC:$dst, (OpNode SrcRC:$src))],
              itins.rr>, Sched<[itins.Sched]>;
  def rm : SI<opc, MRMSrcMem, (outs DstRC:$dst), (ins x86memop:$src), asm,
              [(set DstRC:$dst, (OpNode (ld_frag addr:$src)))],
              itins.rm>, Sched<[itins.Sched.Folded]>;
}

multiclass sse12_cvt_p<bits<8> opc, RegisterClass SrcRC, RegisterClass DstRC,
                       X86MemOperand x86memop, string asm, Domain d,
                       OpndItins itins> {
let hasSideEffects = 0 in {
  def rr : I<opc, MRMSrcReg, (outs DstRC:$dst), (ins SrcRC:$src), asm,
             [], itins.rr, d>, Sched<[itins.Sched]>;
  let mayLoad = 1 in
  def rm : I<opc, MRMSrcMem, (outs DstRC:$dst), (ins x86memop:$src), asm,
             [], itins.rm, d>, Sched<[itins.Sched.Folded]>;
}
}

multiclass sse12_vcvt_avx<bits<8> opc, RegisterClass SrcRC, RegisterClass DstRC,
                          X86MemOperand x86memop, string asm> {
let hasSideEffects = 0, Predicates = [UseAVX] in {
  def rr : SI<opc, MRMSrcReg, (outs DstRC:$dst), (ins DstRC:$src1, SrcRC:$src),
              !strconcat(asm,"\t{$src, $src1, $dst|$dst, $src1, $src}"), []>,
              Sched<[WriteCvtI2F]>;
  let mayLoad = 1 in
  def rm : SI<opc, MRMSrcMem, (outs DstRC:$dst),
              (ins DstRC:$src1, x86memop:$src),
              !strconcat(asm,"\t{$src, $src1, $dst|$dst, $src1, $src}"), []>,
              Sched<[WriteCvtI2FLd, ReadAfterLd]>;
} // hasSideEffects = 0
}

let Predicates = [UseAVX] in {
defm VCVTTSS2SI   : sse12_cvt_s<0x2C, FR32, GR32, fp_to_sint, f32mem, loadf32,
                                "cvttss2si\t{$src, $dst|$dst, $src}",
                                SSE_CVT_SS2SI_32>,
                                XS, VEX, VEX_LIG;
defm VCVTTSS2SI64 : sse12_cvt_s<0x2C, FR32, GR64, fp_to_sint, f32mem, loadf32,
                                "cvttss2si\t{$src, $dst|$dst, $src}",
                                SSE_CVT_SS2SI_64>,
                                XS, VEX, VEX_W, VEX_LIG;
defm VCVTTSD2SI   : sse12_cvt_s<0x2C, FR64, GR32, fp_to_sint, f64mem, loadf64,
                                "cvttsd2si\t{$src, $dst|$dst, $src}",
                                SSE_CVT_SD2SI>,
                                XD, VEX, VEX_LIG;
defm VCVTTSD2SI64 : sse12_cvt_s<0x2C, FR64, GR64, fp_to_sint, f64mem, loadf64,
                                "cvttsd2si\t{$src, $dst|$dst, $src}",
                                SSE_CVT_SD2SI>,
                                XD, VEX, VEX_W, VEX_LIG;

def : InstAlias<"vcvttss2si{l}\t{$src, $dst|$dst, $src}",
                (VCVTTSS2SIrr GR32:$dst, FR32:$src), 0>;
def : InstAlias<"vcvttss2si{l}\t{$src, $dst|$dst, $src}",
                (VCVTTSS2SIrm GR32:$dst, f32mem:$src), 0>;
def : InstAlias<"vcvttsd2si{l}\t{$src, $dst|$dst, $src}",
                (VCVTTSD2SIrr GR32:$dst, FR64:$src), 0>;
def : InstAlias<"vcvttsd2si{l}\t{$src, $dst|$dst, $src}",
                (VCVTTSD2SIrm GR32:$dst, f64mem:$src), 0>;
def : InstAlias<"vcvttss2si{q}\t{$src, $dst|$dst, $src}",
                (VCVTTSS2SI64rr GR64:$dst, FR32:$src), 0>;
def : InstAlias<"vcvttss2si{q}\t{$src, $dst|$dst, $src}",
                (VCVTTSS2SI64rm GR64:$dst, f32mem:$src), 0>;
def : InstAlias<"vcvttsd2si{q}\t{$src, $dst|$dst, $src}",
                (VCVTTSD2SI64rr GR64:$dst, FR64:$src), 0>;
def : InstAlias<"vcvttsd2si{q}\t{$src, $dst|$dst, $src}",
                (VCVTTSD2SI64rm GR64:$dst, f64mem:$src), 0>;
}
// The assembler can recognize rr 64-bit instructions by seeing an rxx
// register, but the same isn't true when only memory operands are used, so
// provide explicit "l" and "q" assembly forms where appropriate.
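// For example, "vcvtsi2ssl (%rax), %xmm1, %xmm0" converts a 32-bit integer
// load and "vcvtsi2ssq (%rax), %xmm1, %xmm0" a 64-bit one; with a bare memory
// operand and no suffix, the source width would be ambiguous.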
defm VCVTSI2SS   : sse12_vcvt_avx<0x2A, GR32, FR32, i32mem, "cvtsi2ss{l}">,
                                  XS, VEX_4V, VEX_LIG;
defm VCVTSI2SS64 : sse12_vcvt_avx<0x2A, GR64, FR32, i64mem, "cvtsi2ss{q}">,
                                  XS, VEX_4V, VEX_W, VEX_LIG;
defm VCVTSI2SD   : sse12_vcvt_avx<0x2A, GR32, FR64, i32mem, "cvtsi2sd{l}">,
                                  XD, VEX_4V, VEX_LIG;
defm VCVTSI2SD64 : sse12_vcvt_avx<0x2A, GR64, FR64, i64mem, "cvtsi2sd{q}">,
                                  XD, VEX_4V, VEX_W, VEX_LIG;

let Predicates = [UseAVX] in {
  def : InstAlias<"vcvtsi2ss\t{$src, $src1, $dst|$dst, $src1, $src}",
                  (VCVTSI2SSrm FR64:$dst, FR64:$src1, i32mem:$src), 0>;
  def : InstAlias<"vcvtsi2sd\t{$src, $src1, $dst|$dst, $src1, $src}",
                  (VCVTSI2SDrm FR64:$dst, FR64:$src1, i32mem:$src), 0>;

  def : Pat<(f32 (sint_to_fp (loadi32 addr:$src))),
            (VCVTSI2SSrm (f32 (IMPLICIT_DEF)), addr:$src)>;
  def : Pat<(f32 (sint_to_fp (loadi64 addr:$src))),
            (VCVTSI2SS64rm (f32 (IMPLICIT_DEF)), addr:$src)>;
  def : Pat<(f64 (sint_to_fp (loadi32 addr:$src))),
            (VCVTSI2SDrm (f64 (IMPLICIT_DEF)), addr:$src)>;
  def : Pat<(f64 (sint_to_fp (loadi64 addr:$src))),
            (VCVTSI2SD64rm (f64 (IMPLICIT_DEF)), addr:$src)>;

  def : Pat<(f32 (sint_to_fp GR32:$src)),
            (VCVTSI2SSrr (f32 (IMPLICIT_DEF)), GR32:$src)>;
  def : Pat<(f32 (sint_to_fp GR64:$src)),
            (VCVTSI2SS64rr (f32 (IMPLICIT_DEF)), GR64:$src)>;
  def : Pat<(f64 (sint_to_fp GR32:$src)),
            (VCVTSI2SDrr (f64 (IMPLICIT_DEF)), GR32:$src)>;
  def : Pat<(f64 (sint_to_fp GR64:$src)),
            (VCVTSI2SD64rr (f64 (IMPLICIT_DEF)), GR64:$src)>;
}
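// The VEX-encoded scalar converts take a $src1 operand whose upper vector
// elements are merged into the result unchanged. A plain sint_to_fp doesn't
// care about those upper bits, so the patterns above feed an IMPLICIT_DEF
// through as the merge source.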
InstAlias<"cvttsd2si{q}\t{$src, $dst|$dst, $src}", 1617 (CVTTSD2SI64rr GR64:$dst, FR64:$src), 0>; 1618def : InstAlias<"cvttsd2si{q}\t{$src, $dst|$dst, $src}", 1619 (CVTTSD2SI64rm GR64:$dst, f64mem:$src), 0>; 1620 1621def : InstAlias<"cvtsi2ss\t{$src, $dst|$dst, $src}", 1622 (CVTSI2SSrm FR64:$dst, i32mem:$src), 0>; 1623def : InstAlias<"cvtsi2sd\t{$src, $dst|$dst, $src}", 1624 (CVTSI2SDrm FR64:$dst, i32mem:$src), 0>; 1625 1626// Conversion Instructions Intrinsics - Match intrinsics which expect MM 1627// and/or XMM operand(s). 1628 1629multiclass sse12_cvt_sint<bits<8> opc, RegisterClass SrcRC, RegisterClass DstRC, 1630 Intrinsic Int, Operand memop, ComplexPattern mem_cpat, 1631 string asm, OpndItins itins> { 1632 def rr : SI<opc, MRMSrcReg, (outs DstRC:$dst), (ins SrcRC:$src), 1633 !strconcat(asm, "\t{$src, $dst|$dst, $src}"), 1634 [(set DstRC:$dst, (Int SrcRC:$src))], itins.rr>, 1635 Sched<[itins.Sched]>; 1636 def rm : SI<opc, MRMSrcMem, (outs DstRC:$dst), (ins memop:$src), 1637 !strconcat(asm, "\t{$src, $dst|$dst, $src}"), 1638 [(set DstRC:$dst, (Int mem_cpat:$src))], itins.rm>, 1639 Sched<[itins.Sched.Folded]>; 1640} 1641 1642multiclass sse12_cvt_sint_3addr<bits<8> opc, RegisterClass SrcRC, 1643 RegisterClass DstRC, Intrinsic Int, X86MemOperand x86memop, 1644 PatFrag ld_frag, string asm, OpndItins itins, 1645 bit Is2Addr = 1> { 1646 def rr : SI<opc, MRMSrcReg, (outs DstRC:$dst), (ins DstRC:$src1, SrcRC:$src2), 1647 !if(Is2Addr, 1648 !strconcat(asm, "\t{$src2, $dst|$dst, $src2}"), 1649 !strconcat(asm, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")), 1650 [(set DstRC:$dst, (Int DstRC:$src1, SrcRC:$src2))], 1651 itins.rr>, Sched<[itins.Sched]>; 1652 def rm : SI<opc, MRMSrcMem, (outs DstRC:$dst), 1653 (ins DstRC:$src1, x86memop:$src2), 1654 !if(Is2Addr, 1655 !strconcat(asm, "\t{$src2, $dst|$dst, $src2}"), 1656 !strconcat(asm, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")), 1657 [(set DstRC:$dst, (Int DstRC:$src1, (ld_frag addr:$src2)))], 1658 itins.rm>, Sched<[itins.Sched.Folded, ReadAfterLd]>; 1659} 1660 1661let Predicates = [UseAVX] in { 1662defm VCVTSD2SI : sse12_cvt_sint<0x2D, VR128, GR32, 1663 int_x86_sse2_cvtsd2si, sdmem, sse_load_f64, "cvtsd2si", 1664 SSE_CVT_SD2SI>, XD, VEX, VEX_LIG; 1665defm VCVTSD2SI64 : sse12_cvt_sint<0x2D, VR128, GR64, 1666 int_x86_sse2_cvtsd2si64, sdmem, sse_load_f64, "cvtsd2si", 1667 SSE_CVT_SD2SI>, XD, VEX, VEX_W, VEX_LIG; 1668} 1669defm CVTSD2SI : sse12_cvt_sint<0x2D, VR128, GR32, int_x86_sse2_cvtsd2si, 1670 sdmem, sse_load_f64, "cvtsd2si", SSE_CVT_SD2SI>, XD; 1671defm CVTSD2SI64 : sse12_cvt_sint<0x2D, VR128, GR64, int_x86_sse2_cvtsd2si64, 1672 sdmem, sse_load_f64, "cvtsd2si", SSE_CVT_SD2SI>, XD, REX_W; 1673 1674 1675let isCodeGenOnly = 1 in { 1676 let Predicates = [UseAVX] in { 1677 defm Int_VCVTSI2SS : sse12_cvt_sint_3addr<0x2A, GR32, VR128, 1678 int_x86_sse_cvtsi2ss, i32mem, loadi32, "cvtsi2ss{l}", 1679 SSE_CVT_Scalar, 0>, XS, VEX_4V; 1680 defm Int_VCVTSI2SS64 : sse12_cvt_sint_3addr<0x2A, GR64, VR128, 1681 int_x86_sse_cvtsi642ss, i64mem, loadi64, "cvtsi2ss{q}", 1682 SSE_CVT_Scalar, 0>, XS, VEX_4V, 1683 VEX_W; 1684 defm Int_VCVTSI2SD : sse12_cvt_sint_3addr<0x2A, GR32, VR128, 1685 int_x86_sse2_cvtsi2sd, i32mem, loadi32, "cvtsi2sd{l}", 1686 SSE_CVT_Scalar, 0>, XD, VEX_4V; 1687 defm Int_VCVTSI2SD64 : sse12_cvt_sint_3addr<0x2A, GR64, VR128, 1688 int_x86_sse2_cvtsi642sd, i64mem, loadi64, "cvtsi2sd{q}", 1689 SSE_CVT_Scalar, 0>, XD, 1690 VEX_4V, VEX_W; 1691 } 1692 let Constraints = "$src1 = $dst" in { 1693 defm Int_CVTSI2SS : sse12_cvt_sint_3addr<0x2A, GR32, VR128, 
// Conversion Instruction Intrinsics - Match intrinsics which expect MM
// and/or XMM operand(s).

multiclass sse12_cvt_sint<bits<8> opc, RegisterClass SrcRC, RegisterClass DstRC,
                          Intrinsic Int, Operand memop, ComplexPattern mem_cpat,
                          string asm, OpndItins itins> {
  def rr : SI<opc, MRMSrcReg, (outs DstRC:$dst), (ins SrcRC:$src),
              !strconcat(asm, "\t{$src, $dst|$dst, $src}"),
              [(set DstRC:$dst, (Int SrcRC:$src))], itins.rr>,
              Sched<[itins.Sched]>;
  def rm : SI<opc, MRMSrcMem, (outs DstRC:$dst), (ins memop:$src),
              !strconcat(asm, "\t{$src, $dst|$dst, $src}"),
              [(set DstRC:$dst, (Int mem_cpat:$src))], itins.rm>,
              Sched<[itins.Sched.Folded]>;
}

multiclass sse12_cvt_sint_3addr<bits<8> opc, RegisterClass SrcRC,
                    RegisterClass DstRC, Intrinsic Int, X86MemOperand x86memop,
                    PatFrag ld_frag, string asm, OpndItins itins,
                    bit Is2Addr = 1> {
  def rr : SI<opc, MRMSrcReg, (outs DstRC:$dst), (ins DstRC:$src1, SrcRC:$src2),
              !if(Is2Addr,
                  !strconcat(asm, "\t{$src2, $dst|$dst, $src2}"),
                  !strconcat(asm, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
              [(set DstRC:$dst, (Int DstRC:$src1, SrcRC:$src2))],
              itins.rr>, Sched<[itins.Sched]>;
  def rm : SI<opc, MRMSrcMem, (outs DstRC:$dst),
              (ins DstRC:$src1, x86memop:$src2),
              !if(Is2Addr,
                  !strconcat(asm, "\t{$src2, $dst|$dst, $src2}"),
                  !strconcat(asm, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
              [(set DstRC:$dst, (Int DstRC:$src1, (ld_frag addr:$src2)))],
              itins.rm>, Sched<[itins.Sched.Folded, ReadAfterLd]>;
}

let Predicates = [UseAVX] in {
defm VCVTSD2SI : sse12_cvt_sint<0x2D, VR128, GR32,
                   int_x86_sse2_cvtsd2si, sdmem, sse_load_f64, "cvtsd2si",
                   SSE_CVT_SD2SI>, XD, VEX, VEX_LIG;
defm VCVTSD2SI64 : sse12_cvt_sint<0x2D, VR128, GR64,
                   int_x86_sse2_cvtsd2si64, sdmem, sse_load_f64, "cvtsd2si",
                   SSE_CVT_SD2SI>, XD, VEX, VEX_W, VEX_LIG;
}
defm CVTSD2SI : sse12_cvt_sint<0x2D, VR128, GR32, int_x86_sse2_cvtsd2si,
                 sdmem, sse_load_f64, "cvtsd2si", SSE_CVT_SD2SI>, XD;
defm CVTSD2SI64 : sse12_cvt_sint<0x2D, VR128, GR64, int_x86_sse2_cvtsd2si64,
                   sdmem, sse_load_f64, "cvtsd2si", SSE_CVT_SD2SI>, XD, REX_W;


let isCodeGenOnly = 1 in {
  let Predicates = [UseAVX] in {
  defm Int_VCVTSI2SS : sse12_cvt_sint_3addr<0x2A, GR32, VR128,
            int_x86_sse_cvtsi2ss, i32mem, loadi32, "cvtsi2ss{l}",
            SSE_CVT_Scalar, 0>, XS, VEX_4V;
  defm Int_VCVTSI2SS64 : sse12_cvt_sint_3addr<0x2A, GR64, VR128,
            int_x86_sse_cvtsi642ss, i64mem, loadi64, "cvtsi2ss{q}",
            SSE_CVT_Scalar, 0>, XS, VEX_4V, VEX_W;
  defm Int_VCVTSI2SD : sse12_cvt_sint_3addr<0x2A, GR32, VR128,
            int_x86_sse2_cvtsi2sd, i32mem, loadi32, "cvtsi2sd{l}",
            SSE_CVT_Scalar, 0>, XD, VEX_4V;
  defm Int_VCVTSI2SD64 : sse12_cvt_sint_3addr<0x2A, GR64, VR128,
            int_x86_sse2_cvtsi642sd, i64mem, loadi64, "cvtsi2sd{q}",
            SSE_CVT_Scalar, 0>, XD, VEX_4V, VEX_W;
  }
  let Constraints = "$src1 = $dst" in {
    defm Int_CVTSI2SS : sse12_cvt_sint_3addr<0x2A, GR32, VR128,
                          int_x86_sse_cvtsi2ss, i32mem, loadi32,
                          "cvtsi2ss{l}", SSE_CVT_Scalar>, XS;
    defm Int_CVTSI2SS64 : sse12_cvt_sint_3addr<0x2A, GR64, VR128,
                            int_x86_sse_cvtsi642ss, i64mem, loadi64,
                            "cvtsi2ss{q}", SSE_CVT_Scalar>, XS, REX_W;
    defm Int_CVTSI2SD : sse12_cvt_sint_3addr<0x2A, GR32, VR128,
                          int_x86_sse2_cvtsi2sd, i32mem, loadi32,
                          "cvtsi2sd{l}", SSE_CVT_Scalar>, XD;
    defm Int_CVTSI2SD64 : sse12_cvt_sint_3addr<0x2A, GR64, VR128,
                            int_x86_sse2_cvtsi642sd, i64mem, loadi64,
                            "cvtsi2sd{q}", SSE_CVT_Scalar>, XD, REX_W;
  }
} // isCodeGenOnly = 1

/// SSE 1 Only

// Aliases for intrinsics
let isCodeGenOnly = 1 in {
let Predicates = [UseAVX] in {
defm Int_VCVTTSS2SI : sse12_cvt_sint<0x2C, VR128, GR32, int_x86_sse_cvttss2si,
                                     ssmem, sse_load_f32, "cvttss2si",
                                     SSE_CVT_SS2SI_32>, XS, VEX;
defm Int_VCVTTSS2SI64 : sse12_cvt_sint<0x2C, VR128, GR64,
                                       int_x86_sse_cvttss2si64, ssmem,
                                       sse_load_f32, "cvttss2si",
                                       SSE_CVT_SS2SI_64>, XS, VEX, VEX_W;
defm Int_VCVTTSD2SI : sse12_cvt_sint<0x2C, VR128, GR32, int_x86_sse2_cvttsd2si,
                                     sdmem, sse_load_f64, "cvttsd2si",
                                     SSE_CVT_SD2SI>, XD, VEX;
defm Int_VCVTTSD2SI64 : sse12_cvt_sint<0x2C, VR128, GR64,
                                       int_x86_sse2_cvttsd2si64, sdmem,
                                       sse_load_f64, "cvttsd2si",
                                       SSE_CVT_SD2SI>, XD, VEX, VEX_W;
}
defm Int_CVTTSS2SI : sse12_cvt_sint<0x2C, VR128, GR32, int_x86_sse_cvttss2si,
                                    ssmem, sse_load_f32, "cvttss2si",
                                    SSE_CVT_SS2SI_32>, XS;
defm Int_CVTTSS2SI64 : sse12_cvt_sint<0x2C, VR128, GR64,
                                      int_x86_sse_cvttss2si64, ssmem,
                                      sse_load_f32, "cvttss2si",
                                      SSE_CVT_SS2SI_64>, XS, REX_W;
defm Int_CVTTSD2SI : sse12_cvt_sint<0x2C, VR128, GR32, int_x86_sse2_cvttsd2si,
                                    sdmem, sse_load_f64, "cvttsd2si",
                                    SSE_CVT_SD2SI>, XD;
defm Int_CVTTSD2SI64 : sse12_cvt_sint<0x2C, VR128, GR64,
                                      int_x86_sse2_cvttsd2si64, sdmem,
                                      sse_load_f64, "cvttsd2si",
                                      SSE_CVT_SD2SI>, XD, REX_W;
} // isCodeGenOnly = 1
let Predicates = [UseAVX] in {
defm VCVTSS2SI : sse12_cvt_sint<0x2D, VR128, GR32, int_x86_sse_cvtss2si,
                                ssmem, sse_load_f32, "cvtss2si",
                                SSE_CVT_SS2SI_32>, XS, VEX, VEX_LIG;
defm VCVTSS2SI64 : sse12_cvt_sint<0x2D, VR128, GR64, int_x86_sse_cvtss2si64,
                                  ssmem, sse_load_f32, "cvtss2si",
                                  SSE_CVT_SS2SI_64>, XS, VEX, VEX_W, VEX_LIG;
}
defm CVTSS2SI : sse12_cvt_sint<0x2D, VR128, GR32, int_x86_sse_cvtss2si,
                               ssmem, sse_load_f32, "cvtss2si",
                               SSE_CVT_SS2SI_32>, XS;
defm CVTSS2SI64 : sse12_cvt_sint<0x2D, VR128, GR64, int_x86_sse_cvtss2si64,
                                 ssmem, sse_load_f32, "cvtss2si",
                                 SSE_CVT_SS2SI_64>, XS, REX_W;

defm VCVTDQ2PS : sse12_cvt_p<0x5B, VR128, VR128, i128mem,
                             "vcvtdq2ps\t{$src, $dst|$dst, $src}",
                             SSEPackedSingle, SSE_CVT_PS>,
                             PS, VEX, Requires<[HasAVX]>;
defm VCVTDQ2PSY : sse12_cvt_p<0x5B, VR256, VR256, i256mem,
                              "vcvtdq2ps\t{$src, $dst|$dst, $src}",
                              SSEPackedSingle, SSE_CVT_PS>,
                              PS, VEX, VEX_L, Requires<[HasAVX]>;

defm CVTDQ2PS : sse12_cvt_p<0x5B, VR128, VR128, i128mem,
                            "cvtdq2ps\t{$src, $dst|$dst, $src}",
                            SSEPackedSingle, SSE_CVT_PS>,
                            PS, Requires<[UseSSE2]>;

let Predicates = [UseAVX] in {
def : InstAlias<"vcvtss2si{l}\t{$src, $dst|$dst, $src}",
                (VCVTSS2SIrr GR32:$dst, VR128:$src), 0>;
def : InstAlias<"vcvtss2si{l}\t{$src, $dst|$dst, $src}",
                (VCVTSS2SIrm GR32:$dst, ssmem:$src), 0>;
def : InstAlias<"vcvtsd2si{l}\t{$src, $dst|$dst, $src}",
                (VCVTSD2SIrr GR32:$dst, VR128:$src), 0>;
def : InstAlias<"vcvtsd2si{l}\t{$src, $dst|$dst, $src}",
                (VCVTSD2SIrm GR32:$dst, sdmem:$src), 0>;
def : InstAlias<"vcvtss2si{q}\t{$src, $dst|$dst, $src}",
                (VCVTSS2SI64rr GR64:$dst, VR128:$src), 0>;
def : InstAlias<"vcvtss2si{q}\t{$src, $dst|$dst, $src}",
                (VCVTSS2SI64rm GR64:$dst, ssmem:$src), 0>;
def : InstAlias<"vcvtsd2si{q}\t{$src, $dst|$dst, $src}",
                (VCVTSD2SI64rr GR64:$dst, VR128:$src), 0>;
def : InstAlias<"vcvtsd2si{q}\t{$src, $dst|$dst, $src}",
                (VCVTSD2SI64rm GR64:$dst, sdmem:$src), 0>;
}

def : InstAlias<"cvtss2si{l}\t{$src, $dst|$dst, $src}",
                (CVTSS2SIrr GR32:$dst, VR128:$src), 0>;
def : InstAlias<"cvtss2si{l}\t{$src, $dst|$dst, $src}",
                (CVTSS2SIrm GR32:$dst, ssmem:$src), 0>;
def : InstAlias<"cvtsd2si{l}\t{$src, $dst|$dst, $src}",
                (CVTSD2SIrr GR32:$dst, VR128:$src), 0>;
def : InstAlias<"cvtsd2si{l}\t{$src, $dst|$dst, $src}",
                (CVTSD2SIrm GR32:$dst, sdmem:$src), 0>;
def : InstAlias<"cvtss2si{q}\t{$src, $dst|$dst, $src}",
                (CVTSS2SI64rr GR64:$dst, VR128:$src), 0>;
def : InstAlias<"cvtss2si{q}\t{$src, $dst|$dst, $src}",
                (CVTSS2SI64rm GR64:$dst, ssmem:$src), 0>;
def : InstAlias<"cvtsd2si{q}\t{$src, $dst|$dst, $src}",
                (CVTSD2SI64rr GR64:$dst, VR128:$src), 0>;
def : InstAlias<"cvtsd2si{q}\t{$src, $dst|$dst, $src}",
                (CVTSD2SI64rm GR64:$dst, sdmem:$src), 0>;
/// SSE 2 Only

// Convert scalar double to scalar single
let hasSideEffects = 0, Predicates = [UseAVX] in {
def VCVTSD2SSrr : VSDI<0x5A, MRMSrcReg, (outs FR32:$dst),
                       (ins FR64:$src1, FR64:$src2),
                       "cvtsd2ss\t{$src2, $src1, $dst|$dst, $src1, $src2}", [],
                       IIC_SSE_CVT_Scalar_RR>, VEX_4V, VEX_LIG,
                       Sched<[WriteCvtF2F]>;
let mayLoad = 1 in
def VCVTSD2SSrm : I<0x5A, MRMSrcMem, (outs FR32:$dst),
                    (ins FR64:$src1, f64mem:$src2),
                    "vcvtsd2ss\t{$src2, $src1, $dst|$dst, $src1, $src2}",
                    [], IIC_SSE_CVT_Scalar_RM>,
                    XD, Requires<[HasAVX, OptForSize]>, VEX_4V, VEX_LIG,
                    Sched<[WriteCvtF2FLd, ReadAfterLd]>;
}

def : Pat<(f32 (fround FR64:$src)), (VCVTSD2SSrr FR64:$src, FR64:$src)>,
          Requires<[UseAVX]>;

def CVTSD2SSrr : SDI<0x5A, MRMSrcReg, (outs FR32:$dst), (ins FR64:$src),
                     "cvtsd2ss\t{$src, $dst|$dst, $src}",
                     [(set FR32:$dst, (fround FR64:$src))],
                     IIC_SSE_CVT_Scalar_RR>, Sched<[WriteCvtF2F]>;
def CVTSD2SSrm : I<0x5A, MRMSrcMem, (outs FR32:$dst), (ins f64mem:$src),
                   "cvtsd2ss\t{$src, $dst|$dst, $src}",
                   [(set FR32:$dst, (fround (loadf64 addr:$src)))],
                   IIC_SSE_CVT_Scalar_RM>, XD,
                   Requires<[UseSSE2, OptForSize]>, Sched<[WriteCvtF2FLd]>;

let isCodeGenOnly = 1 in {
def Int_VCVTSD2SSrr: I<0x5A, MRMSrcReg,
                       (outs VR128:$dst), (ins VR128:$src1, VR128:$src2),
                       "vcvtsd2ss\t{$src2, $src1, $dst|$dst, $src1, $src2}",
                       [(set VR128:$dst,
                         (int_x86_sse2_cvtsd2ss VR128:$src1, VR128:$src2))],
                       IIC_SSE_CVT_Scalar_RR>, XD, VEX_4V, Requires<[HasAVX]>,
                       Sched<[WriteCvtF2F]>;
// Note: the rm forms below take a memory source in the ModRM position and
// therefore must use the MRMSrcMem format (the original said MRMSrcReg,
// which contradicts the sdmem operand and the MRMSrcMem used by the
// analogous Int_VCVTSS2SDrm below).
def Int_VCVTSD2SSrm: I<0x5A, MRMSrcMem,
                       (outs VR128:$dst), (ins VR128:$src1, sdmem:$src2),
                       "vcvtsd2ss\t{$src2, $src1, $dst|$dst, $src1, $src2}",
                       [(set VR128:$dst, (int_x86_sse2_cvtsd2ss
                                          VR128:$src1, sse_load_f64:$src2))],
                       IIC_SSE_CVT_Scalar_RM>, XD, VEX_4V, Requires<[HasAVX]>,
                       Sched<[WriteCvtF2FLd, ReadAfterLd]>;

let Constraints = "$src1 = $dst" in {
def Int_CVTSD2SSrr: I<0x5A, MRMSrcReg,
                      (outs VR128:$dst), (ins VR128:$src1, VR128:$src2),
                      "cvtsd2ss\t{$src2, $dst|$dst, $src2}",
                      [(set VR128:$dst,
                        (int_x86_sse2_cvtsd2ss VR128:$src1, VR128:$src2))],
                      IIC_SSE_CVT_Scalar_RR>, XD, Requires<[UseSSE2]>,
                      Sched<[WriteCvtF2F]>;
def Int_CVTSD2SSrm: I<0x5A, MRMSrcMem,
                      (outs VR128:$dst), (ins VR128:$src1, sdmem:$src2),
                      "cvtsd2ss\t{$src2, $dst|$dst, $src2}",
                      [(set VR128:$dst, (int_x86_sse2_cvtsd2ss
                                         VR128:$src1, sse_load_f64:$src2))],
                      IIC_SSE_CVT_Scalar_RM>, XD, Requires<[UseSSE2]>,
                      Sched<[WriteCvtF2FLd, ReadAfterLd]>;
}
} // isCodeGenOnly = 1

// Convert scalar single to scalar double
// SSE2 instructions with XS prefix
let hasSideEffects = 0, Predicates = [UseAVX] in {
def VCVTSS2SDrr : I<0x5A, MRMSrcReg, (outs FR64:$dst),
                    (ins FR32:$src1, FR32:$src2),
                    "vcvtss2sd\t{$src2, $src1, $dst|$dst, $src1, $src2}",
                    [], IIC_SSE_CVT_Scalar_RR>,
                    XS, Requires<[HasAVX]>, VEX_4V, VEX_LIG,
                    Sched<[WriteCvtF2F]>;
let mayLoad = 1 in
def VCVTSS2SDrm : I<0x5A, MRMSrcMem, (outs FR64:$dst),
                    (ins FR32:$src1, f32mem:$src2),
                    "vcvtss2sd\t{$src2, $src1, $dst|$dst, $src1, $src2}",
                    [], IIC_SSE_CVT_Scalar_RM>,
                    XS, VEX_4V, VEX_LIG, Requires<[HasAVX, OptForSize]>,
                    Sched<[WriteCvtF2FLd, ReadAfterLd]>;
}

def : Pat<(f64 (fextend FR32:$src)),
          (VCVTSS2SDrr FR32:$src, FR32:$src)>, Requires<[UseAVX]>;
def : Pat<(fextend (loadf32 addr:$src)),
          (VCVTSS2SDrm (f32 (IMPLICIT_DEF)), addr:$src)>, Requires<[UseAVX]>;

def : Pat<(extloadf32 addr:$src),
          (VCVTSS2SDrm (f32 (IMPLICIT_DEF)), addr:$src)>,
          Requires<[UseAVX, OptForSize]>;
def : Pat<(extloadf32 addr:$src),
          (VCVTSS2SDrr (f32 (IMPLICIT_DEF)), (VMOVSSrm addr:$src))>,
          Requires<[UseAVX, OptForSpeed]>;
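// Note on the two extloadf32 strategies above: under OptForSize the load is
// folded into the convert (VCVTSS2SDrm, one smaller instruction); under
// OptForSpeed the value is first loaded with MOVSS and converted
// register-to-register. The original states no rationale here; presumably the
// unfolded two-instruction sequence is faster on common cores. The SSE2
// patterns further down make the same trade-off with CVTSS2SDrm / MOVSSrm.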
def CVTSS2SDrr : I<0x5A, MRMSrcReg, (outs FR64:$dst), (ins FR32:$src),
                   "cvtss2sd\t{$src, $dst|$dst, $src}",
                   [(set FR64:$dst, (fextend FR32:$src))],
                   IIC_SSE_CVT_Scalar_RR>, XS,
                   Requires<[UseSSE2]>, Sched<[WriteCvtF2F]>;
def CVTSS2SDrm : I<0x5A, MRMSrcMem, (outs FR64:$dst), (ins f32mem:$src),
                   "cvtss2sd\t{$src, $dst|$dst, $src}",
                   [(set FR64:$dst, (extloadf32 addr:$src))],
                   IIC_SSE_CVT_Scalar_RM>, XS,
                   Requires<[UseSSE2, OptForSize]>, Sched<[WriteCvtF2FLd]>;

// extload f32 -> f64. This matches load+fextend because we have a hack in
// the isel (PreprocessForFPConvert) that can introduce loads after dag
// combine. Since these loads aren't folded into the fextend, we have to
// match them explicitly here.
def : Pat<(fextend (loadf32 addr:$src)),
          (CVTSS2SDrm addr:$src)>, Requires<[UseSSE2]>;
def : Pat<(extloadf32 addr:$src),
          (CVTSS2SDrr (MOVSSrm addr:$src))>, Requires<[UseSSE2, OptForSpeed]>;

let isCodeGenOnly = 1 in {
def Int_VCVTSS2SDrr: I<0x5A, MRMSrcReg,
                       (outs VR128:$dst), (ins VR128:$src1, VR128:$src2),
                       "vcvtss2sd\t{$src2, $src1, $dst|$dst, $src1, $src2}",
                       [(set VR128:$dst,
                         (int_x86_sse2_cvtss2sd VR128:$src1, VR128:$src2))],
                       IIC_SSE_CVT_Scalar_RR>, XS, VEX_4V, Requires<[HasAVX]>,
                       Sched<[WriteCvtF2F]>;
def Int_VCVTSS2SDrm: I<0x5A, MRMSrcMem,
                       (outs VR128:$dst), (ins VR128:$src1, ssmem:$src2),
                       "vcvtss2sd\t{$src2, $src1, $dst|$dst, $src1, $src2}",
                       [(set VR128:$dst,
                         (int_x86_sse2_cvtss2sd VR128:$src1, sse_load_f32:$src2))],
                       IIC_SSE_CVT_Scalar_RM>, XS, VEX_4V, Requires<[HasAVX]>,
                       Sched<[WriteCvtF2FLd, ReadAfterLd]>;
let Constraints = "$src1 = $dst" in { // SSE2 instructions with XS prefix
def Int_CVTSS2SDrr: I<0x5A, MRMSrcReg,
                      (outs VR128:$dst), (ins VR128:$src1, VR128:$src2),
                      "cvtss2sd\t{$src2, $dst|$dst, $src2}",
                      [(set VR128:$dst,
                        (int_x86_sse2_cvtss2sd VR128:$src1, VR128:$src2))],
                      IIC_SSE_CVT_Scalar_RR>, XS, Requires<[UseSSE2]>,
                      Sched<[WriteCvtF2F]>;
def Int_CVTSS2SDrm: I<0x5A, MRMSrcMem,
                      (outs VR128:$dst), (ins VR128:$src1, ssmem:$src2),
                      "cvtss2sd\t{$src2, $dst|$dst, $src2}",
                      [(set VR128:$dst,
                        (int_x86_sse2_cvtss2sd VR128:$src1, sse_load_f32:$src2))],
                      IIC_SSE_CVT_Scalar_RM>, XS, Requires<[UseSSE2]>,
                      Sched<[WriteCvtF2FLd, ReadAfterLd]>;
}
} // isCodeGenOnly = 1

// Convert packed single/double fp to doubleword
def VCVTPS2DQrr : VPDI<0x5B, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
                       "cvtps2dq\t{$src, $dst|$dst, $src}",
                       [(set VR128:$dst, (int_x86_sse2_cvtps2dq VR128:$src))],
                       IIC_SSE_CVT_PS_RR>, VEX, Sched<[WriteCvtF2I]>;
def VCVTPS2DQrm : VPDI<0x5B, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
                       "cvtps2dq\t{$src, $dst|$dst, $src}",
                       [(set VR128:$dst,
                         (int_x86_sse2_cvtps2dq (loadv4f32 addr:$src)))],
                       IIC_SSE_CVT_PS_RM>, VEX, Sched<[WriteCvtF2ILd]>;
def VCVTPS2DQYrr : VPDI<0x5B, MRMSrcReg, (outs VR256:$dst), (ins VR256:$src),
                        "cvtps2dq\t{$src, $dst|$dst, $src}",
                        [(set VR256:$dst,
                          (int_x86_avx_cvt_ps2dq_256 VR256:$src))],
                        IIC_SSE_CVT_PS_RR>, VEX, VEX_L, Sched<[WriteCvtF2I]>;
def VCVTPS2DQYrm : VPDI<0x5B, MRMSrcMem, (outs VR256:$dst), (ins f256mem:$src),
                        "cvtps2dq\t{$src, $dst|$dst, $src}",
                        [(set VR256:$dst,
                          (int_x86_avx_cvt_ps2dq_256 (loadv8f32 addr:$src)))],
                        IIC_SSE_CVT_PS_RM>, VEX, VEX_L, Sched<[WriteCvtF2ILd]>;
def CVTPS2DQrr : PDI<0x5B, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
                     "cvtps2dq\t{$src, $dst|$dst, $src}",
                     [(set VR128:$dst, (int_x86_sse2_cvtps2dq VR128:$src))],
                     IIC_SSE_CVT_PS_RR>, Sched<[WriteCvtF2I]>;
def CVTPS2DQrm : PDI<0x5B, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
                     "cvtps2dq\t{$src, $dst|$dst, $src}",
                     [(set VR128:$dst,
                       (int_x86_sse2_cvtps2dq (memopv4f32 addr:$src)))],
                     IIC_SSE_CVT_PS_RM>, Sched<[WriteCvtF2ILd]>;
// Convert Packed Double FP to Packed DW Integers
let Predicates = [HasAVX] in {
// The assembler can recognize rr 256-bit instructions by seeing a ymm
// register, but the same isn't true when using memory operands instead.
// Provide other assembly rr and rm forms to address this explicitly.
def VCVTPD2DQrr : SDI<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
                      "vcvtpd2dq\t{$src, $dst|$dst, $src}",
                      [(set VR128:$dst, (int_x86_sse2_cvtpd2dq VR128:$src))]>,
                      VEX, Sched<[WriteCvtF2I]>;

// XMM only
def : InstAlias<"vcvtpd2dqx\t{$src, $dst|$dst, $src}",
                (VCVTPD2DQrr VR128:$dst, VR128:$src), 0>;
def VCVTPD2DQXrm : SDI<0xE6, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
                       "vcvtpd2dqx\t{$src, $dst|$dst, $src}",
                       [(set VR128:$dst,
                         (int_x86_sse2_cvtpd2dq (loadv2f64 addr:$src)))]>, VEX,
                       Sched<[WriteCvtF2ILd]>;

// YMM only
def VCVTPD2DQYrr : SDI<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR256:$src),
                       "vcvtpd2dq{y}\t{$src, $dst|$dst, $src}",
                       [(set VR128:$dst,
                         (int_x86_avx_cvt_pd2dq_256 VR256:$src))]>, VEX, VEX_L,
                       Sched<[WriteCvtF2I]>;
def VCVTPD2DQYrm : SDI<0xE6, MRMSrcMem, (outs VR128:$dst), (ins f256mem:$src),
                       "vcvtpd2dq{y}\t{$src, $dst|$dst, $src}",
                       [(set VR128:$dst,
                         (int_x86_avx_cvt_pd2dq_256 (loadv4f64 addr:$src)))]>,
                       VEX, VEX_L, Sched<[WriteCvtF2ILd]>;
def : InstAlias<"vcvtpd2dq\t{$src, $dst|$dst, $src}",
                (VCVTPD2DQYrr VR128:$dst, VR256:$src), 0>;
}

def CVTPD2DQrm : SDI<0xE6, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
                     "cvtpd2dq\t{$src, $dst|$dst, $src}",
                     [(set VR128:$dst,
                       (int_x86_sse2_cvtpd2dq (memopv2f64 addr:$src)))],
                     IIC_SSE_CVT_PD_RM>, Sched<[WriteCvtF2ILd]>;
def CVTPD2DQrr : SDI<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
                     "cvtpd2dq\t{$src, $dst|$dst, $src}",
                     [(set VR128:$dst, (int_x86_sse2_cvtpd2dq VR128:$src))],
                     IIC_SSE_CVT_PD_RR>, Sched<[WriteCvtF2I]>;
// Convert packed single/double fp to doubleword with truncation
// SSE2 packed instructions with XS prefix
def VCVTTPS2DQrr : VS2SI<0x5B, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
                         "cvttps2dq\t{$src, $dst|$dst, $src}",
                         [(set VR128:$dst,
                           (int_x86_sse2_cvttps2dq VR128:$src))],
                         IIC_SSE_CVT_PS_RR>, VEX, Sched<[WriteCvtF2I]>;
def VCVTTPS2DQrm : VS2SI<0x5B, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
                         "cvttps2dq\t{$src, $dst|$dst, $src}",
                         [(set VR128:$dst, (int_x86_sse2_cvttps2dq
                                            (loadv4f32 addr:$src)))],
                         IIC_SSE_CVT_PS_RM>, VEX, Sched<[WriteCvtF2ILd]>;
def VCVTTPS2DQYrr : VS2SI<0x5B, MRMSrcReg, (outs VR256:$dst), (ins VR256:$src),
                          "cvttps2dq\t{$src, $dst|$dst, $src}",
                          [(set VR256:$dst,
                            (int_x86_avx_cvtt_ps2dq_256 VR256:$src))],
                          IIC_SSE_CVT_PS_RR>, VEX, VEX_L, Sched<[WriteCvtF2I]>;
def VCVTTPS2DQYrm : VS2SI<0x5B, MRMSrcMem, (outs VR256:$dst), (ins f256mem:$src),
                          "cvttps2dq\t{$src, $dst|$dst, $src}",
                          [(set VR256:$dst, (int_x86_avx_cvtt_ps2dq_256
                                             (loadv8f32 addr:$src)))],
                          IIC_SSE_CVT_PS_RM>, VEX, VEX_L,
                          Sched<[WriteCvtF2ILd]>;

def CVTTPS2DQrr : S2SI<0x5B, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
                       "cvttps2dq\t{$src, $dst|$dst, $src}",
                       [(set VR128:$dst, (int_x86_sse2_cvttps2dq VR128:$src))],
                       IIC_SSE_CVT_PS_RR>, Sched<[WriteCvtF2I]>;
def CVTTPS2DQrm : S2SI<0x5B, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
                       "cvttps2dq\t{$src, $dst|$dst, $src}",
                       [(set VR128:$dst,
                         (int_x86_sse2_cvttps2dq (memopv4f32 addr:$src)))],
                       IIC_SSE_CVT_PS_RM>, Sched<[WriteCvtF2ILd]>;

let Predicates = [HasAVX] in {
  def : Pat<(int_x86_sse2_cvtdq2ps VR128:$src),
            (VCVTDQ2PSrr VR128:$src)>;
  def : Pat<(int_x86_sse2_cvtdq2ps (bc_v4i32 (loadv2i64 addr:$src))),
            (VCVTDQ2PSrm addr:$src)>;
}

let Predicates = [HasAVX, NoVLX] in {
  def : Pat<(v4f32 (sint_to_fp (v4i32 VR128:$src))),
            (VCVTDQ2PSrr VR128:$src)>;
  def : Pat<(v4f32 (sint_to_fp (bc_v4i32 (loadv2i64 addr:$src)))),
            (VCVTDQ2PSrm addr:$src)>;

  def : Pat<(v4i32 (fp_to_sint (v4f32 VR128:$src))),
            (VCVTTPS2DQrr VR128:$src)>;
  def : Pat<(v4i32 (fp_to_sint (loadv4f32 addr:$src))),
            (VCVTTPS2DQrm addr:$src)>;

  def : Pat<(v8f32 (sint_to_fp (v8i32 VR256:$src))),
            (VCVTDQ2PSYrr VR256:$src)>;
  def : Pat<(v8f32 (sint_to_fp (bc_v8i32 (loadv4i64 addr:$src)))),
            (VCVTDQ2PSYrm addr:$src)>;

  def : Pat<(v8i32 (fp_to_sint (v8f32 VR256:$src))),
            (VCVTTPS2DQYrr VR256:$src)>;
  def : Pat<(v8i32 (fp_to_sint (loadv8f32 addr:$src))),
            (VCVTTPS2DQYrm addr:$src)>;
}

let Predicates = [UseSSE2] in {
  def : Pat<(v4f32 (sint_to_fp (v4i32 VR128:$src))),
            (CVTDQ2PSrr VR128:$src)>;
  def : Pat<(v4f32 (sint_to_fp (bc_v4i32 (memopv2i64 addr:$src)))),
            (CVTDQ2PSrm addr:$src)>;

  def : Pat<(int_x86_sse2_cvtdq2ps VR128:$src),
            (CVTDQ2PSrr VR128:$src)>;
  def : Pat<(int_x86_sse2_cvtdq2ps (bc_v4i32 (memopv2i64 addr:$src))),
            (CVTDQ2PSrm addr:$src)>;

  def : Pat<(v4i32 (fp_to_sint (v4f32 VR128:$src))),
            (CVTTPS2DQrr VR128:$src)>;
  def : Pat<(v4i32 (fp_to_sint (memopv4f32 addr:$src))),
            (CVTTPS2DQrm addr:$src)>;
}

def VCVTTPD2DQrr : VPDI<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
                        "cvttpd2dq\t{$src, $dst|$dst, $src}",
                        [(set VR128:$dst,
                          (int_x86_sse2_cvttpd2dq VR128:$src))],
                        IIC_SSE_CVT_PD_RR>, VEX, Sched<[WriteCvtF2I]>;

// The assembler can recognize rr 256-bit instructions by seeing a ymm
// register, but the same isn't true when using memory operands instead.
// Provide other assembly rr and rm forms to address this explicitly.
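// For example, "vcvttpd2dqx (%rax), %xmm0" reads 128 bits of memory while
// "vcvttpd2dqy (%rax), %xmm0" reads 256 bits; without the x/y suffix the two
// memory forms would be ambiguous to the assembler.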
// XMM only
def : InstAlias<"vcvttpd2dqx\t{$src, $dst|$dst, $src}",
                (VCVTTPD2DQrr VR128:$dst, VR128:$src), 0>;
def VCVTTPD2DQXrm : VPDI<0xE6, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
                         "cvttpd2dqx\t{$src, $dst|$dst, $src}",
                         [(set VR128:$dst, (int_x86_sse2_cvttpd2dq
                                            (loadv2f64 addr:$src)))],
                         IIC_SSE_CVT_PD_RM>, VEX, Sched<[WriteCvtF2ILd]>;

// YMM only
def VCVTTPD2DQYrr : VPDI<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR256:$src),
                         "cvttpd2dq{y}\t{$src, $dst|$dst, $src}",
                         [(set VR128:$dst,
                           (int_x86_avx_cvtt_pd2dq_256 VR256:$src))],
                         IIC_SSE_CVT_PD_RR>, VEX, VEX_L, Sched<[WriteCvtF2I]>;
def VCVTTPD2DQYrm : VPDI<0xE6, MRMSrcMem, (outs VR128:$dst), (ins f256mem:$src),
                         "cvttpd2dq{y}\t{$src, $dst|$dst, $src}",
                         [(set VR128:$dst,
                           (int_x86_avx_cvtt_pd2dq_256 (loadv4f64 addr:$src)))],
                         IIC_SSE_CVT_PD_RM>, VEX, VEX_L, Sched<[WriteCvtF2ILd]>;
def : InstAlias<"vcvttpd2dq\t{$src, $dst|$dst, $src}",
                (VCVTTPD2DQYrr VR128:$dst, VR256:$src), 0>;

let Predicates = [HasAVX, NoVLX] in {
  def : Pat<(v4i32 (fp_to_sint (v4f64 VR256:$src))),
            (VCVTTPD2DQYrr VR256:$src)>;
  def : Pat<(v4i32 (fp_to_sint (loadv4f64 addr:$src))),
            (VCVTTPD2DQYrm addr:$src)>;
} // Predicates = [HasAVX, NoVLX]

def CVTTPD2DQrr : PDI<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
                      "cvttpd2dq\t{$src, $dst|$dst, $src}",
                      [(set VR128:$dst, (int_x86_sse2_cvttpd2dq VR128:$src))],
                      IIC_SSE_CVT_PD_RR>, Sched<[WriteCvtF2I]>;
def CVTTPD2DQrm : PDI<0xE6, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
                      "cvttpd2dq\t{$src, $dst|$dst, $src}",
                      [(set VR128:$dst, (int_x86_sse2_cvttpd2dq
                                         (memopv2f64 addr:$src)))],
                      IIC_SSE_CVT_PD_RM>,
                      Sched<[WriteCvtF2ILd]>;

// Convert packed single to packed double
let Predicates = [HasAVX] in {
// SSE2 instructions without OpSize prefix
def VCVTPS2PDrr : I<0x5A, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
                    "vcvtps2pd\t{$src, $dst|$dst, $src}",
                    [(set VR128:$dst, (int_x86_sse2_cvtps2pd VR128:$src))],
                    IIC_SSE_CVT_PD_RR>, PS, VEX, Sched<[WriteCvtF2F]>;
def VCVTPS2PDrm : I<0x5A, MRMSrcMem, (outs VR128:$dst), (ins f64mem:$src),
                    "vcvtps2pd\t{$src, $dst|$dst, $src}",
                    [(set VR128:$dst, (v2f64 (extloadv2f32 addr:$src)))],
                    IIC_SSE_CVT_PD_RM>, PS, VEX, Sched<[WriteCvtF2FLd]>;
def VCVTPS2PDYrr : I<0x5A, MRMSrcReg, (outs VR256:$dst), (ins VR128:$src),
                     "vcvtps2pd\t{$src, $dst|$dst, $src}",
                     [(set VR256:$dst,
                       (int_x86_avx_cvt_ps2_pd_256 VR128:$src))],
                     IIC_SSE_CVT_PD_RR>, PS, VEX, VEX_L, Sched<[WriteCvtF2F]>;
def VCVTPS2PDYrm : I<0x5A, MRMSrcMem, (outs VR256:$dst), (ins f128mem:$src),
                     "vcvtps2pd\t{$src, $dst|$dst, $src}",
                     [(set VR256:$dst,
                       (int_x86_avx_cvt_ps2_pd_256 (loadv4f32 addr:$src)))],
                     IIC_SSE_CVT_PD_RM>, PS, VEX, VEX_L, Sched<[WriteCvtF2FLd]>;
}

let Predicates = [UseSSE2] in {
def CVTPS2PDrr : I<0x5A, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
                   "cvtps2pd\t{$src, $dst|$dst, $src}",
                   [(set VR128:$dst, (int_x86_sse2_cvtps2pd VR128:$src))],
                   IIC_SSE_CVT_PD_RR>, PS, Sched<[WriteCvtF2F]>;
def CVTPS2PDrm : I<0x5A, MRMSrcMem, (outs VR128:$dst), (ins f64mem:$src),
                   "cvtps2pd\t{$src, $dst|$dst, $src}",
                   [(set VR128:$dst, (v2f64 (extloadv2f32 addr:$src)))],
                   IIC_SSE_CVT_PD_RM>, PS, Sched<[WriteCvtF2FLd]>;
}

// Convert Packed DW Integers to Packed Double FP
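// cvtdq2pd only consumes the low two doublewords, so the 128-bit form takes
// an i64mem operand (a 64-bit load); the ymm form widens four doublewords
// and reads a full 128 bits from memory.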
let Predicates = [HasAVX] in {
let hasSideEffects = 0, mayLoad = 1 in
def VCVTDQ2PDrm : S2SI<0xE6, MRMSrcMem, (outs VR128:$dst), (ins i64mem:$src),
                       "vcvtdq2pd\t{$src, $dst|$dst, $src}",
                       []>, VEX, Sched<[WriteCvtI2FLd]>;
def VCVTDQ2PDrr : S2SI<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
                       "vcvtdq2pd\t{$src, $dst|$dst, $src}",
                       [(set VR128:$dst,
                         (int_x86_sse2_cvtdq2pd VR128:$src))]>, VEX,
                       Sched<[WriteCvtI2F]>;
def VCVTDQ2PDYrm : S2SI<0xE6, MRMSrcMem, (outs VR256:$dst), (ins i128mem:$src),
                        "vcvtdq2pd\t{$src, $dst|$dst, $src}",
                        [(set VR256:$dst,
                          (int_x86_avx_cvtdq2_pd_256
                           (bitconvert (loadv2i64 addr:$src))))]>, VEX, VEX_L,
                        Sched<[WriteCvtI2FLd]>;
def VCVTDQ2PDYrr : S2SI<0xE6, MRMSrcReg, (outs VR256:$dst), (ins VR128:$src),
                        "vcvtdq2pd\t{$src, $dst|$dst, $src}",
                        [(set VR256:$dst,
                          (int_x86_avx_cvtdq2_pd_256 VR128:$src))]>, VEX, VEX_L,
                        Sched<[WriteCvtI2F]>;
}

// Note: the original listed these two itineraries swapped (RR on the rm form
// and vice versa); they are paired with the matching forms here.
let hasSideEffects = 0, mayLoad = 1 in
def CVTDQ2PDrm : S2SI<0xE6, MRMSrcMem, (outs VR128:$dst), (ins i64mem:$src),
                      "cvtdq2pd\t{$src, $dst|$dst, $src}", [],
                      IIC_SSE_CVT_PD_RM>, Sched<[WriteCvtI2FLd]>;
def CVTDQ2PDrr : S2SI<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
                      "cvtdq2pd\t{$src, $dst|$dst, $src}",
                      [(set VR128:$dst, (int_x86_sse2_cvtdq2pd VR128:$src))],
                      IIC_SSE_CVT_PD_RR>, Sched<[WriteCvtI2F]>;

// AVX register conversion intrinsics
let Predicates = [HasAVX] in {
  def : Pat<(v2f64 (X86cvtdq2pd (v4i32 VR128:$src))),
            (VCVTDQ2PDrr VR128:$src)>;
  def : Pat<(v2f64 (X86cvtdq2pd (bc_v4i32 (loadv2i64 addr:$src)))),
            (VCVTDQ2PDrm addr:$src)>;

  def : Pat<(v4f64 (sint_to_fp (v4i32 VR128:$src))),
            (VCVTDQ2PDYrr VR128:$src)>;
  def : Pat<(v4f64 (sint_to_fp (bc_v4i32 (loadv2i64 addr:$src)))),
            (VCVTDQ2PDYrm addr:$src)>;
} // Predicates = [HasAVX]

// SSE2 register conversion intrinsics
let Predicates = [HasSSE2] in {
  def : Pat<(v2f64 (X86cvtdq2pd (v4i32 VR128:$src))),
            (CVTDQ2PDrr VR128:$src)>;
  def : Pat<(v2f64 (X86cvtdq2pd (bc_v4i32 (loadv2i64 addr:$src)))),
            (CVTDQ2PDrm addr:$src)>;
} // Predicates = [HasSSE2]

// Convert packed double to packed single
// The assembler can recognize rr 256-bit instructions by seeing a ymm
// register, but the same isn't true when using memory operands instead.
// Provide other assembly rr and rm forms to address this explicitly.
def VCVTPD2PSrr : VPDI<0x5A, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
                       "cvtpd2ps\t{$src, $dst|$dst, $src}",
                       [(set VR128:$dst, (int_x86_sse2_cvtpd2ps VR128:$src))],
                       IIC_SSE_CVT_PD_RR>, VEX, Sched<[WriteCvtF2F]>;

// XMM only
def : InstAlias<"vcvtpd2psx\t{$src, $dst|$dst, $src}",
                (VCVTPD2PSrr VR128:$dst, VR128:$src), 0>;
def VCVTPD2PSXrm : VPDI<0x5A, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
                        "cvtpd2psx\t{$src, $dst|$dst, $src}",
                        [(set VR128:$dst,
                          (int_x86_sse2_cvtpd2ps (loadv2f64 addr:$src)))],
                        IIC_SSE_CVT_PD_RM>, VEX, Sched<[WriteCvtF2FLd]>;

// YMM only
def VCVTPD2PSYrr : VPDI<0x5A, MRMSrcReg, (outs VR128:$dst), (ins VR256:$src),
                        "cvtpd2ps{y}\t{$src, $dst|$dst, $src}",
                        [(set VR128:$dst,
                          (int_x86_avx_cvt_pd2_ps_256 VR256:$src))],
                        IIC_SSE_CVT_PD_RR>, VEX, VEX_L, Sched<[WriteCvtF2F]>;
def VCVTPD2PSYrm : VPDI<0x5A, MRMSrcMem, (outs VR128:$dst), (ins f256mem:$src),
                        "cvtpd2ps{y}\t{$src, $dst|$dst, $src}",
                        [(set VR128:$dst,
                          (int_x86_avx_cvt_pd2_ps_256 (loadv4f64 addr:$src)))],
                        IIC_SSE_CVT_PD_RM>, VEX, VEX_L, Sched<[WriteCvtF2FLd]>;
def : InstAlias<"vcvtpd2ps\t{$src, $dst|$dst, $src}",
                (VCVTPD2PSYrr VR128:$dst, VR256:$src), 0>;

def CVTPD2PSrr : PDI<0x5A, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
                     "cvtpd2ps\t{$src, $dst|$dst, $src}",
                     [(set VR128:$dst, (int_x86_sse2_cvtpd2ps VR128:$src))],
                     IIC_SSE_CVT_PD_RR>, Sched<[WriteCvtF2F]>;
def CVTPD2PSrm : PDI<0x5A, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
                     "cvtpd2ps\t{$src, $dst|$dst, $src}",
                     [(set VR128:$dst,
                       (int_x86_sse2_cvtpd2ps (memopv2f64 addr:$src)))],
                     IIC_SSE_CVT_PD_RM>, Sched<[WriteCvtF2FLd]>;


// AVX 256-bit register conversion intrinsics
// FIXME: Migrate matching of the SSE conversion intrinsics to use patterns
// like those below whenever possible, to avoid declaring two versions of
// each one.
let Predicates = [HasAVX] in {
  def : Pat<(int_x86_avx_cvtdq2_ps_256 VR256:$src),
            (VCVTDQ2PSYrr VR256:$src)>;
  def : Pat<(int_x86_avx_cvtdq2_ps_256 (bitconvert (loadv4i64 addr:$src))),
            (VCVTDQ2PSYrm addr:$src)>;
}

let Predicates = [HasAVX, NoVLX] in {
  // Match fround and fextend for 128/256-bit conversions
  def : Pat<(v4f32 (X86vfpround (v2f64 VR128:$src))),
            (VCVTPD2PSrr VR128:$src)>;
  def : Pat<(v4f32 (X86vfpround (loadv2f64 addr:$src))),
            (VCVTPD2PSXrm addr:$src)>;
  def : Pat<(v4f32 (fround (v4f64 VR256:$src))),
            (VCVTPD2PSYrr VR256:$src)>;
  def : Pat<(v4f32 (fround (loadv4f64 addr:$src))),
            (VCVTPD2PSYrm addr:$src)>;

  def : Pat<(v2f64 (X86vfpext (v4f32 VR128:$src))),
            (VCVTPS2PDrr VR128:$src)>;
  def : Pat<(v4f64 (fextend (v4f32 VR128:$src))),
            (VCVTPS2PDYrr VR128:$src)>;
  def : Pat<(v4f64 (extloadv4f32 addr:$src)),
            (VCVTPS2PDYrm addr:$src)>;
}

let Predicates = [UseSSE2] in {
  // Match fround and fextend for 128 conversions
  def : Pat<(v4f32 (X86vfpround (v2f64 VR128:$src))),
            (CVTPD2PSrr VR128:$src)>;
  def : Pat<(v4f32 (X86vfpround (memopv2f64 addr:$src))),
            (CVTPD2PSrm addr:$src)>;

  def : Pat<(v2f64 (X86vfpext (v4f32 VR128:$src))),
            (CVTPS2PDrr VR128:$src)>;
}

//===----------------------------------------------------------------------===//
// SSE 1 & 2 - Compare Instructions
//===----------------------------------------------------------------------===//

// sse12_cmp_scalar - sse 1 & 2 compare scalar instructions
multiclass sse12_cmp_scalar<RegisterClass RC, X86MemOperand x86memop,
                            Operand CC, SDNode OpNode, ValueType VT,
                            PatFrag ld_frag, string asm, string asm_alt,
                            OpndItins itins, ImmLeaf immLeaf> {
  def rr : SIi8<0xC2, MRMSrcReg,
                (outs RC:$dst), (ins RC:$src1, RC:$src2, CC:$cc), asm,
                [(set RC:$dst, (OpNode (VT RC:$src1), RC:$src2, immLeaf:$cc))],
                itins.rr>, Sched<[itins.Sched]>;
  def rm : SIi8<0xC2, MRMSrcMem,
                (outs RC:$dst), (ins RC:$src1, x86memop:$src2, CC:$cc), asm,
                [(set RC:$dst, (OpNode (VT RC:$src1),
                                       (ld_frag addr:$src2), immLeaf:$cc))],
                itins.rm>,
                Sched<[itins.Sched.Folded, ReadAfterLd]>;

  // Accept explicit immediate argument form instead of comparison code.
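  // For example, "cmpss $0, %xmm1, %xmm0" assembles to the same encoding as
  // "cmpeqss %xmm1, %xmm0"; immediate 0 is the EQ predicate (1 = LT, 2 = LE,
  // 3 = UNORD, 4 = NEQ, 5 = NLT, 6 = NLE, 7 = ORD).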
  let isAsmParserOnly = 1, hasSideEffects = 0 in {
    def rr_alt : SIi8<0xC2, MRMSrcReg, (outs RC:$dst),
                      (ins RC:$src1, RC:$src2, u8imm:$cc), asm_alt, [],
                      IIC_SSE_ALU_F32S_RR>, Sched<[itins.Sched]>;
    let mayLoad = 1 in
    def rm_alt : SIi8<0xC2, MRMSrcMem, (outs RC:$dst),
                      (ins RC:$src1, x86memop:$src2, u8imm:$cc), asm_alt, [],
                      IIC_SSE_ALU_F32S_RM>,
                      Sched<[itins.Sched.Folded, ReadAfterLd]>;
  }
}

defm VCMPSS : sse12_cmp_scalar<FR32, f32mem, AVXCC, X86cmps, f32, loadf32,
                 "cmp${cc}ss\t{$src2, $src1, $dst|$dst, $src1, $src2}",
                 "cmpss\t{$cc, $src2, $src1, $dst|$dst, $src1, $src2, $cc}",
                 SSE_ALU_F32S, i8immZExt5>, XS, VEX_4V, VEX_LIG;
defm VCMPSD : sse12_cmp_scalar<FR64, f64mem, AVXCC, X86cmps, f64, loadf64,
                 "cmp${cc}sd\t{$src2, $src1, $dst|$dst, $src1, $src2}",
                 "cmpsd\t{$cc, $src2, $src1, $dst|$dst, $src1, $src2, $cc}",
                 SSE_ALU_F32S, i8immZExt5>, // same latency as 32 bit compare
                 XD, VEX_4V, VEX_LIG;

let Constraints = "$src1 = $dst" in {
  defm CMPSS : sse12_cmp_scalar<FR32, f32mem, SSECC, X86cmps, f32, loadf32,
                  "cmp${cc}ss\t{$src2, $dst|$dst, $src2}",
                  "cmpss\t{$cc, $src2, $dst|$dst, $src2, $cc}", SSE_ALU_F32S,
                  i8immZExt3>, XS;
  defm CMPSD : sse12_cmp_scalar<FR64, f64mem, SSECC, X86cmps, f64, loadf64,
                  "cmp${cc}sd\t{$src2, $dst|$dst, $src2}",
                  "cmpsd\t{$cc, $src2, $dst|$dst, $src2, $cc}",
                  SSE_ALU_F64S, i8immZExt3>, XD;
}

multiclass sse12_cmp_scalar_int<X86MemOperand x86memop, Operand CC,
                                Intrinsic Int, string asm, OpndItins itins,
                                ImmLeaf immLeaf> {
  def rr : SIi8<0xC2, MRMSrcReg, (outs VR128:$dst),
                (ins VR128:$src1, VR128:$src, CC:$cc), asm,
                [(set VR128:$dst, (Int VR128:$src1,
                                       VR128:$src, immLeaf:$cc))],
                itins.rr>,
                Sched<[itins.Sched]>;
  def rm : SIi8<0xC2, MRMSrcMem, (outs VR128:$dst),
                (ins VR128:$src1, x86memop:$src, CC:$cc), asm,
                [(set VR128:$dst, (Int VR128:$src1,
                                       (load addr:$src), immLeaf:$cc))],
                itins.rm>,
                Sched<[itins.Sched.Folded, ReadAfterLd]>;
}

let isCodeGenOnly = 1 in {
  // Aliases to match intrinsics which expect XMM operand(s).
  defm Int_VCMPSS : sse12_cmp_scalar_int<f32mem, AVXCC, int_x86_sse_cmp_ss,
                       "cmp${cc}ss\t{$src, $src1, $dst|$dst, $src1, $src}",
                       SSE_ALU_F32S, i8immZExt5>,
                       XS, VEX_4V;
  defm Int_VCMPSD : sse12_cmp_scalar_int<f64mem, AVXCC, int_x86_sse2_cmp_sd,
                       "cmp${cc}sd\t{$src, $src1, $dst|$dst, $src1, $src}",
                       SSE_ALU_F32S, i8immZExt5>, // same latency as f32
                       XD, VEX_4V;
  let Constraints = "$src1 = $dst" in {
    defm Int_CMPSS : sse12_cmp_scalar_int<f32mem, SSECC, int_x86_sse_cmp_ss,
                        "cmp${cc}ss\t{$src, $dst|$dst, $src}",
                        SSE_ALU_F32S, i8immZExt3>, XS;
    defm Int_CMPSD : sse12_cmp_scalar_int<f64mem, SSECC, int_x86_sse2_cmp_sd,
                        "cmp${cc}sd\t{$src, $dst|$dst, $src}",
                        SSE_ALU_F64S, i8immZExt3>,
                        XD;
  }
}


// sse12_ord_cmp - Unordered/Ordered scalar fp compare and set EFLAGS
multiclass sse12_ord_cmp<bits<8> opc, RegisterClass RC, SDNode OpNode,
                         ValueType vt, X86MemOperand x86memop,
                         PatFrag ld_frag, string OpcodeStr> {
  def rr: SI<opc, MRMSrcReg, (outs), (ins RC:$src1, RC:$src2),
             !strconcat(OpcodeStr, "\t{$src2, $src1|$src1, $src2}"),
             [(set EFLAGS, (OpNode (vt RC:$src1), RC:$src2))],
             IIC_SSE_COMIS_RR>,
             Sched<[WriteFAdd]>;
  def rm: SI<opc, MRMSrcMem, (outs), (ins RC:$src1, x86memop:$src2),
             !strconcat(OpcodeStr, "\t{$src2, $src1|$src1, $src2}"),
             [(set EFLAGS, (OpNode (vt RC:$src1),
                                   (ld_frag addr:$src2)))],
             IIC_SSE_COMIS_RM>,
             Sched<[WriteFAddLd, ReadAfterLd]>;
}
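// Per the Intel SDM, the "ordered" COMISS/COMISD forms signal an
// invalid-operation exception for any NaN operand, while the "unordered"
// UCOMISS/UCOMISD forms signal it only for SNaNs; both set ZF/PF/CF from the
// comparison and clear OF/SF/AF.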
let Defs = [EFLAGS] in {
  defm VUCOMISS : sse12_ord_cmp<0x2E, FR32, X86cmp, f32, f32mem, loadf32,
                                "ucomiss">, PS, VEX, VEX_LIG;
  defm VUCOMISD : sse12_ord_cmp<0x2E, FR64, X86cmp, f64, f64mem, loadf64,
                                "ucomisd">, PD, VEX, VEX_LIG;
  let Pattern = []<dag> in {
    defm VCOMISS : sse12_ord_cmp<0x2F, FR32, undef, f32, f32mem, loadf32,
                                 "comiss">, PS, VEX, VEX_LIG;
    defm VCOMISD : sse12_ord_cmp<0x2F, FR64, undef, f64, f64mem, loadf64,
                                 "comisd">, PD, VEX, VEX_LIG;
  }

  let isCodeGenOnly = 1 in {
    defm Int_VUCOMISS : sse12_ord_cmp<0x2E, VR128, X86ucomi, v4f32, f128mem,
                                      load, "ucomiss">, PS, VEX;
    defm Int_VUCOMISD : sse12_ord_cmp<0x2E, VR128, X86ucomi, v2f64, f128mem,
                                      load, "ucomisd">, PD, VEX;

    defm Int_VCOMISS : sse12_ord_cmp<0x2F, VR128, X86comi, v4f32, f128mem,
                                     load, "comiss">, PS, VEX;
    defm Int_VCOMISD : sse12_ord_cmp<0x2F, VR128, X86comi, v2f64, f128mem,
                                     load, "comisd">, PD, VEX;
  }
  defm UCOMISS : sse12_ord_cmp<0x2E, FR32, X86cmp, f32, f32mem, loadf32,
                               "ucomiss">, PS;
  defm UCOMISD : sse12_ord_cmp<0x2E, FR64, X86cmp, f64, f64mem, loadf64,
                               "ucomisd">, PD;

  let Pattern = []<dag> in {
    defm COMISS : sse12_ord_cmp<0x2F, FR32, undef, f32, f32mem, loadf32,
                                "comiss">, PS;
    defm COMISD : sse12_ord_cmp<0x2F, FR64, undef, f64, f64mem, loadf64,
                                "comisd">, PD;
  }

  let isCodeGenOnly = 1 in {
    defm Int_UCOMISS : sse12_ord_cmp<0x2E, VR128, X86ucomi, v4f32, f128mem,
                                     load, "ucomiss">, PS;
    defm Int_UCOMISD : sse12_ord_cmp<0x2E, VR128, X86ucomi, v2f64, f128mem,
                                     load, "ucomisd">, PD;

    defm Int_COMISS : sse12_ord_cmp<0x2F, VR128, X86comi, v4f32, f128mem, load,
                                    "comiss">, PS;
    defm Int_COMISD : sse12_ord_cmp<0x2F, VR128, X86comi, v2f64, f128mem, load,
                                    "comisd">, PD;
  }
} // Defs = [EFLAGS]

// sse12_cmp_packed - sse 1 & 2 compare packed instructions
multiclass sse12_cmp_packed<RegisterClass RC, X86MemOperand x86memop,
                            Operand CC, Intrinsic Int, string asm,
                            string asm_alt, Domain d, ImmLeaf immLeaf,
                            PatFrag ld_frag, OpndItins itins = SSE_ALU_F32P> {
  let isCommutable = 1 in
  def rri : PIi8<0xC2, MRMSrcReg,
             (outs RC:$dst), (ins RC:$src1, RC:$src2, CC:$cc), asm,
             [(set RC:$dst, (Int RC:$src1, RC:$src2, immLeaf:$cc))],
             itins.rr, d>,
             Sched<[WriteFAdd]>;
  def rmi : PIi8<0xC2, MRMSrcMem,
             (outs RC:$dst), (ins RC:$src1, x86memop:$src2, CC:$cc), asm,
             [(set RC:$dst, (Int RC:$src1, (ld_frag addr:$src2), immLeaf:$cc))],
             itins.rm, d>,
             Sched<[WriteFAddLd, ReadAfterLd]>;

  // Accept explicit immediate argument form instead of comparison code.
  let isAsmParserOnly = 1, hasSideEffects = 0 in {
    def rri_alt : PIi8<0xC2, MRMSrcReg,
               (outs RC:$dst), (ins RC:$src1, RC:$src2, u8imm:$cc),
               asm_alt, [], itins.rr, d>, Sched<[WriteFAdd]>;
    let mayLoad = 1 in
    def rmi_alt : PIi8<0xC2, MRMSrcMem,
               (outs RC:$dst), (ins RC:$src1, x86memop:$src2, u8imm:$cc),
               asm_alt, [], itins.rm, d>,
               Sched<[WriteFAddLd, ReadAfterLd]>;
  }
}

defm VCMPPS : sse12_cmp_packed<VR128, f128mem, AVXCC, int_x86_sse_cmp_ps,
               "cmp${cc}ps\t{$src2, $src1, $dst|$dst, $src1, $src2}",
               "cmpps\t{$cc, $src2, $src1, $dst|$dst, $src1, $src2, $cc}",
               SSEPackedSingle, i8immZExt5, loadv4f32>, PS, VEX_4V;
defm VCMPPD : sse12_cmp_packed<VR128, f128mem, AVXCC, int_x86_sse2_cmp_pd,
               "cmp${cc}pd\t{$src2, $src1, $dst|$dst, $src1, $src2}",
               "cmppd\t{$cc, $src2, $src1, $dst|$dst, $src1, $src2, $cc}",
               SSEPackedDouble, i8immZExt5, loadv2f64>, PD, VEX_4V;
defm VCMPPSY : sse12_cmp_packed<VR256, f256mem, AVXCC, int_x86_avx_cmp_ps_256,
               "cmp${cc}ps\t{$src2, $src1, $dst|$dst, $src1, $src2}",
               "cmpps\t{$cc, $src2, $src1, $dst|$dst, $src1, $src2, $cc}",
               SSEPackedSingle, i8immZExt5, loadv8f32>, PS, VEX_4V, VEX_L;
defm VCMPPDY : sse12_cmp_packed<VR256, f256mem, AVXCC, int_x86_avx_cmp_pd_256,
               "cmp${cc}pd\t{$src2, $src1, $dst|$dst, $src1, $src2}",
               "cmppd\t{$cc, $src2, $src1, $dst|$dst, $src1, $src2, $cc}",
               SSEPackedDouble, i8immZExt5, loadv4f64>, PD, VEX_4V, VEX_L;
let Constraints = "$src1 = $dst" in {
  defm CMPPS : sse12_cmp_packed<VR128, f128mem, SSECC, int_x86_sse_cmp_ps,
                 "cmp${cc}ps\t{$src2, $dst|$dst, $src2}",
                 "cmpps\t{$cc, $src2, $dst|$dst, $src2, $cc}",
                 SSEPackedSingle, i8immZExt5, memopv4f32, SSE_ALU_F32P>, PS;
  defm CMPPD : sse12_cmp_packed<VR128, f128mem, SSECC, int_x86_sse2_cmp_pd,
                 "cmp${cc}pd\t{$src2, $dst|$dst, $src2}",
                 "cmppd\t{$cc, $src2, $dst|$dst, $src2, $cc}",
                 SSEPackedDouble, i8immZExt5, memopv2f64, SSE_ALU_F64P>, PD;
}

let Predicates = [HasAVX] in {
def : Pat<(v4i32 (X86cmpp (v4f32 VR128:$src1), VR128:$src2, imm:$cc)),
          (VCMPPSrri (v4f32 VR128:$src1), (v4f32 VR128:$src2), imm:$cc)>;
def : Pat<(v4i32 (X86cmpp (v4f32 VR128:$src1), (loadv4f32 addr:$src2), imm:$cc)),
          (VCMPPSrmi (v4f32 VR128:$src1), addr:$src2, imm:$cc)>;
def : Pat<(v2i64 (X86cmpp (v2f64 VR128:$src1), VR128:$src2, imm:$cc)),
          (VCMPPDrri VR128:$src1, VR128:$src2, imm:$cc)>;
def : Pat<(v2i64 (X86cmpp (v2f64 VR128:$src1), (loadv2f64 addr:$src2), imm:$cc)),
          (VCMPPDrmi VR128:$src1, addr:$src2, imm:$cc)>;

def : Pat<(v8i32 (X86cmpp (v8f32 VR256:$src1), VR256:$src2, imm:$cc)),
          (VCMPPSYrri (v8f32 VR256:$src1), (v8f32 VR256:$src2), imm:$cc)>;
def : Pat<(v8i32 (X86cmpp (v8f32 VR256:$src1), (loadv8f32 addr:$src2), imm:$cc)),
          (VCMPPSYrmi (v8f32 VR256:$src1), addr:$src2, imm:$cc)>;
def : Pat<(v4i64 (X86cmpp (v4f64 VR256:$src1), VR256:$src2, imm:$cc)),
          (VCMPPDYrri VR256:$src1, VR256:$src2, imm:$cc)>;
def : Pat<(v4i64 (X86cmpp (v4f64 VR256:$src1), (loadv4f64 addr:$src2), imm:$cc)),
          (VCMPPDYrmi VR256:$src1, addr:$src2, imm:$cc)>;
}
let Predicates = [UseSSE1] in {
def : Pat<(v4i32 (X86cmpp (v4f32 VR128:$src1), VR128:$src2, imm:$cc)),
          (CMPPSrri (v4f32 VR128:$src1), (v4f32 VR128:$src2), imm:$cc)>;
def : Pat<(v4i32 (X86cmpp (v4f32 VR128:$src1), (memopv4f32 addr:$src2), imm:$cc)),
          (CMPPSrmi (v4f32 VR128:$src1), addr:$src2, imm:$cc)>;
}

let Predicates = [UseSSE2] in {
def : Pat<(v2i64 (X86cmpp (v2f64 VR128:$src1), VR128:$src2, imm:$cc)),
          (CMPPDrri VR128:$src1, VR128:$src2, imm:$cc)>;
def : Pat<(v2i64 (X86cmpp (v2f64 VR128:$src1), (memopv2f64 addr:$src2), imm:$cc)),
          (CMPPDrmi VR128:$src1, addr:$src2, imm:$cc)>;
}

//===----------------------------------------------------------------------===//
// SSE 1 & 2 - Shuffle Instructions
//===----------------------------------------------------------------------===//

/// sse12_shuffle - sse 1 & 2 fp shuffle instructions
multiclass sse12_shuffle<RegisterClass RC, X86MemOperand x86memop,
                         ValueType vt, string asm, PatFrag mem_frag,
                         Domain d> {
  def rmi : PIi8<0xC6, MRMSrcMem, (outs RC:$dst),
                 (ins RC:$src1, x86memop:$src2, u8imm:$src3), asm,
                 [(set RC:$dst, (vt (X86Shufp RC:$src1, (mem_frag addr:$src2),
                                              (i8 imm:$src3))))],
                 IIC_SSE_SHUFP, d>,
                 Sched<[WriteFShuffleLd, ReadAfterLd]>;
  def rri : PIi8<0xC6, MRMSrcReg, (outs RC:$dst),
                 (ins RC:$src1, RC:$src2, u8imm:$src3), asm,
                 [(set RC:$dst, (vt (X86Shufp RC:$src1, RC:$src2,
                                              (i8 imm:$src3))))],
                 IIC_SSE_SHUFP, d>,
                 Sched<[WriteFShuffle]>;
}
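// The shuffle immediate selects elements in two-bit fields: for SHUFPS,
// dst[0] = src1[imm[1:0]], dst[1] = src1[imm[3:2]], dst[2] = src2[imm[5:4]],
// dst[3] = src2[imm[7:6]]. For SHUFPD only imm[0] (low element, taken from
// src1) and imm[1] (high element, taken from src2) are used.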
let Predicates = [HasAVX, NoVLX] in {
  defm VSHUFPS  : sse12_shuffle<VR128, f128mem, v4f32,
           "shufps\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
           loadv4f32, SSEPackedSingle>, PS, VEX_4V;
  defm VSHUFPSY : sse12_shuffle<VR256, f256mem, v8f32,
           "shufps\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
           loadv8f32, SSEPackedSingle>, PS, VEX_4V, VEX_L;
  defm VSHUFPD  : sse12_shuffle<VR128, f128mem, v2f64,
           "shufpd\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
           loadv2f64, SSEPackedDouble>, PD, VEX_4V;
  defm VSHUFPDY : sse12_shuffle<VR256, f256mem, v4f64,
           "shufpd\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
           loadv4f64, SSEPackedDouble>, PD, VEX_4V, VEX_L;
}
let Constraints = "$src1 = $dst" in {
  defm SHUFPS : sse12_shuffle<VR128, f128mem, v4f32,
                  "shufps\t{$src3, $src2, $dst|$dst, $src2, $src3}",
                  memopv4f32, SSEPackedSingle>, PS;
  defm SHUFPD : sse12_shuffle<VR128, f128mem, v2f64,
                  "shufpd\t{$src3, $src2, $dst|$dst, $src2, $src3}",
                  memopv2f64, SSEPackedDouble>, PD;
}

let Predicates = [HasAVX, NoVLX] in {
  def : Pat<(v4i32 (X86Shufp VR128:$src1,
                    (bc_v4i32 (loadv2i64 addr:$src2)), (i8 imm:$imm))),
            (VSHUFPSrmi VR128:$src1, addr:$src2, imm:$imm)>;
  def : Pat<(v4i32 (X86Shufp VR128:$src1, VR128:$src2, (i8 imm:$imm))),
            (VSHUFPSrri VR128:$src1, VR128:$src2, imm:$imm)>;

  def : Pat<(v2i64 (X86Shufp VR128:$src1,
                    (loadv2i64 addr:$src2), (i8 imm:$imm))),
            (VSHUFPDrmi VR128:$src1, addr:$src2, imm:$imm)>;
  def : Pat<(v2i64 (X86Shufp VR128:$src1, VR128:$src2, (i8 imm:$imm))),
            (VSHUFPDrri VR128:$src1, VR128:$src2, imm:$imm)>;

  // 256-bit patterns
  def : Pat<(v8i32 (X86Shufp VR256:$src1, VR256:$src2, (i8 imm:$imm))),
            (VSHUFPSYrri VR256:$src1, VR256:$src2, imm:$imm)>;
  def : Pat<(v8i32 (X86Shufp VR256:$src1,
                    (bc_v8i32 (loadv4i64 addr:$src2)), (i8 imm:$imm))),
            (VSHUFPSYrmi VR256:$src1, addr:$src2, imm:$imm)>;

  def : Pat<(v4i64 (X86Shufp VR256:$src1, VR256:$src2, (i8 imm:$imm))),
            (VSHUFPDYrri VR256:$src1, VR256:$src2, imm:$imm)>;
  def : Pat<(v4i64 (X86Shufp VR256:$src1,
                    (loadv4i64 addr:$src2), (i8 imm:$imm))),
            (VSHUFPDYrmi VR256:$src1, addr:$src2, imm:$imm)>;
}

let Predicates = [UseSSE1] in {
  def : Pat<(v4i32 (X86Shufp VR128:$src1,
                    (bc_v4i32 (memopv2i64 addr:$src2)), (i8 imm:$imm))),
            (SHUFPSrmi VR128:$src1, addr:$src2, imm:$imm)>;
  def : Pat<(v4i32 (X86Shufp VR128:$src1, VR128:$src2, (i8 imm:$imm))),
            (SHUFPSrri VR128:$src1, VR128:$src2, imm:$imm)>;
}

let Predicates = [UseSSE2] in {
  // Generic SHUFPD patterns
  def : Pat<(v2i64 (X86Shufp VR128:$src1,
                    (memopv2i64 addr:$src2), (i8 imm:$imm))),
            (SHUFPDrmi VR128:$src1, addr:$src2, imm:$imm)>;
  def : Pat<(v2i64 (X86Shufp VR128:$src1, VR128:$src2, (i8 imm:$imm))),
            (SHUFPDrri VR128:$src1, VR128:$src2, imm:$imm)>;
}

//===----------------------------------------------------------------------===//
// SSE 1 & 2 - Unpack FP Instructions
//===----------------------------------------------------------------------===//

/// sse12_unpack_interleave - sse 1 & 2 fp unpack and interleave
multiclass sse12_unpack_interleave<bits<8> opc, SDNode OpNode, ValueType vt,
                                   PatFrag mem_frag, RegisterClass RC,
                                   X86MemOperand x86memop, string asm,
                                   Domain d> {
  def rr : PI<opc, MRMSrcReg,
              (outs RC:$dst), (ins RC:$src1, RC:$src2),
              asm, [(set RC:$dst,
                         (vt (OpNode RC:$src1, RC:$src2)))],
              IIC_SSE_UNPCK, d>, Sched<[WriteFShuffle]>;
  def rm : PI<opc, MRMSrcMem,
              (outs RC:$dst), (ins RC:$src1, x86memop:$src2),
              asm, [(set RC:$dst,
                         (vt (OpNode RC:$src1,
                                     (mem_frag addr:$src2))))],
              IIC_SSE_UNPCK, d>,
              Sched<[WriteFShuffleLd, ReadAfterLd]>;
}
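// UNPCKLPS interleaves the low halves of its sources
// (dst = { src1[0], src2[0], src1[1], src2[1] }) and UNPCKHPS the high halves
// (dst = { src1[2], src2[2], src1[3], src2[3] }); the 256-bit forms repeat
// this independently within each 128-bit lane.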
                                       X86Unpckl, v8f32, loadv8f32,
      VR256, f256mem, "unpcklps\t{$src2, $src1, $dst|$dst, $src1, $src2}",
                     SSEPackedSingle>, PS, VEX_4V, VEX_L;
defm VUNPCKLPDY: sse12_unpack_interleave<0x14, X86Unpckl, v4f64, loadv4f64,
      VR256, f256mem, "unpcklpd\t{$src2, $src1, $dst|$dst, $src1, $src2}",
                     SSEPackedDouble>, PD, VEX_4V, VEX_L;
}// Predicates = [HasAVX, NoVLX]
let Constraints = "$src1 = $dst" in {
  defm UNPCKHPS: sse12_unpack_interleave<0x15, X86Unpckh, v4f32, memopv4f32,
        VR128, f128mem, "unpckhps\t{$src2, $dst|$dst, $src2}",
                       SSEPackedSingle>, PS;
  defm UNPCKHPD: sse12_unpack_interleave<0x15, X86Unpckh, v2f64, memopv2f64,
        VR128, f128mem, "unpckhpd\t{$src2, $dst|$dst, $src2}",
                       SSEPackedDouble>, PD;
  defm UNPCKLPS: sse12_unpack_interleave<0x14, X86Unpckl, v4f32, memopv4f32,
        VR128, f128mem, "unpcklps\t{$src2, $dst|$dst, $src2}",
                       SSEPackedSingle>, PS;
  defm UNPCKLPD: sse12_unpack_interleave<0x14, X86Unpckl, v2f64, memopv2f64,
        VR128, f128mem, "unpcklpd\t{$src2, $dst|$dst, $src2}",
                       SSEPackedDouble>, PD;
} // Constraints = "$src1 = $dst"

let Predicates = [HasAVX1Only] in {
  def : Pat<(v8i32 (X86Unpckl VR256:$src1, (bc_v8i32 (loadv4i64 addr:$src2)))),
            (VUNPCKLPSYrm VR256:$src1, addr:$src2)>;
  def : Pat<(v8i32 (X86Unpckl VR256:$src1, VR256:$src2)),
            (VUNPCKLPSYrr VR256:$src1, VR256:$src2)>;
  def : Pat<(v8i32 (X86Unpckh VR256:$src1, (bc_v8i32 (loadv4i64 addr:$src2)))),
            (VUNPCKHPSYrm VR256:$src1, addr:$src2)>;
  def : Pat<(v8i32 (X86Unpckh VR256:$src1, VR256:$src2)),
            (VUNPCKHPSYrr VR256:$src1, VR256:$src2)>;

  def : Pat<(v4i64 (X86Unpckl VR256:$src1, (loadv4i64 addr:$src2))),
            (VUNPCKLPDYrm VR256:$src1, addr:$src2)>;
  def : Pat<(v4i64 (X86Unpckl VR256:$src1, VR256:$src2)),
            (VUNPCKLPDYrr VR256:$src1, VR256:$src2)>;
  def : Pat<(v4i64 (X86Unpckh VR256:$src1, (loadv4i64 addr:$src2))),
            (VUNPCKHPDYrm VR256:$src1, addr:$src2)>;
  def : Pat<(v4i64 (X86Unpckh VR256:$src1, VR256:$src2)),
            (VUNPCKHPDYrr VR256:$src1, VR256:$src2)>;
}

//===----------------------------------------------------------------------===//
// SSE 1 & 2 - Extract Floating-Point Sign Mask
//===----------------------------------------------------------------------===//

/// sse12_extr_sign_mask - sse 1 & 2 packed FP sign-mask extraction
multiclass sse12_extr_sign_mask<RegisterClass RC, Intrinsic Int, string asm,
                                Domain d> {
  def rr : PI<0x50, MRMSrcReg, (outs GR32orGR64:$dst), (ins RC:$src),
              !strconcat(asm, "\t{$src, $dst|$dst, $src}"),
              [(set GR32orGR64:$dst, (Int RC:$src))], IIC_SSE_MOVMSK, d>,
              Sched<[WriteVecLogic]>;
}

let Predicates = [HasAVX] in {
  defm VMOVMSKPS : sse12_extr_sign_mask<VR128, int_x86_sse_movmsk_ps,
                                        "movmskps", SSEPackedSingle>, PS, VEX;
  defm VMOVMSKPD : sse12_extr_sign_mask<VR128, int_x86_sse2_movmsk_pd,
                                        "movmskpd", SSEPackedDouble>, PD, VEX;
  defm VMOVMSKPSY : sse12_extr_sign_mask<VR256, int_x86_avx_movmsk_ps_256,
                                         "movmskps", SSEPackedSingle>, PS,
                                         VEX, VEX_L;
  defm VMOVMSKPDY : sse12_extr_sign_mask<VR256, int_x86_avx_movmsk_pd_256,
                                         "movmskpd", SSEPackedDouble>, PD,
                                         VEX, VEX_L;

  def : Pat<(i32 (X86fgetsign FR32:$src)),
            (VMOVMSKPSrr (COPY_TO_REGCLASS FR32:$src, VR128))>;
  def : Pat<(i64 (X86fgetsign FR32:$src)),
            (SUBREG_TO_REG (i64 0),
             (VMOVMSKPSrr (COPY_TO_REGCLASS FR32:$src, VR128)), sub_32bit)>;
  def : Pat<(i32 (X86fgetsign
FR64:$src)), 2786 (VMOVMSKPDrr (COPY_TO_REGCLASS FR64:$src, VR128))>; 2787 def : Pat<(i64 (X86fgetsign FR64:$src)), 2788 (SUBREG_TO_REG (i64 0), 2789 (VMOVMSKPDrr (COPY_TO_REGCLASS FR64:$src, VR128)), sub_32bit)>; 2790} 2791 2792defm MOVMSKPS : sse12_extr_sign_mask<VR128, int_x86_sse_movmsk_ps, "movmskps", 2793 SSEPackedSingle>, PS; 2794defm MOVMSKPD : sse12_extr_sign_mask<VR128, int_x86_sse2_movmsk_pd, "movmskpd", 2795 SSEPackedDouble>, PD; 2796 2797def : Pat<(i32 (X86fgetsign FR32:$src)), 2798 (MOVMSKPSrr (COPY_TO_REGCLASS FR32:$src, VR128))>, 2799 Requires<[UseSSE1]>; 2800def : Pat<(i64 (X86fgetsign FR32:$src)), 2801 (SUBREG_TO_REG (i64 0), 2802 (MOVMSKPSrr (COPY_TO_REGCLASS FR32:$src, VR128)), sub_32bit)>, 2803 Requires<[UseSSE1]>; 2804def : Pat<(i32 (X86fgetsign FR64:$src)), 2805 (MOVMSKPDrr (COPY_TO_REGCLASS FR64:$src, VR128))>, 2806 Requires<[UseSSE2]>; 2807def : Pat<(i64 (X86fgetsign FR64:$src)), 2808 (SUBREG_TO_REG (i64 0), 2809 (MOVMSKPDrr (COPY_TO_REGCLASS FR64:$src, VR128)), sub_32bit)>, 2810 Requires<[UseSSE2]>; 2811 2812//===---------------------------------------------------------------------===// 2813// SSE2 - Packed Integer Logical Instructions 2814//===---------------------------------------------------------------------===// 2815 2816let ExeDomain = SSEPackedInt in { // SSE integer instructions 2817 2818/// PDI_binop_rm - Simple SSE2 binary operator. 2819multiclass PDI_binop_rm<bits<8> opc, string OpcodeStr, SDNode OpNode, 2820 ValueType OpVT, RegisterClass RC, PatFrag memop_frag, 2821 X86MemOperand x86memop, OpndItins itins, 2822 bit IsCommutable, bit Is2Addr> { 2823 let isCommutable = IsCommutable in 2824 def rr : PDI<opc, MRMSrcReg, (outs RC:$dst), 2825 (ins RC:$src1, RC:$src2), 2826 !if(Is2Addr, 2827 !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"), 2828 !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")), 2829 [(set RC:$dst, (OpVT (OpNode RC:$src1, RC:$src2)))], itins.rr>, 2830 Sched<[itins.Sched]>; 2831 def rm : PDI<opc, MRMSrcMem, (outs RC:$dst), 2832 (ins RC:$src1, x86memop:$src2), 2833 !if(Is2Addr, 2834 !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"), 2835 !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")), 2836 [(set RC:$dst, (OpVT (OpNode RC:$src1, 2837 (bitconvert (memop_frag addr:$src2)))))], 2838 itins.rm>, 2839 Sched<[itins.Sched.Folded, ReadAfterLd]>; 2840} 2841} // ExeDomain = SSEPackedInt 2842 2843multiclass PDI_binop_all<bits<8> opc, string OpcodeStr, SDNode Opcode, 2844 ValueType OpVT128, ValueType OpVT256, 2845 OpndItins itins, bit IsCommutable = 0, Predicate prd> { 2846let Predicates = [HasAVX, prd] in 2847 defm V#NAME : PDI_binop_rm<opc, !strconcat("v", OpcodeStr), Opcode, OpVT128, 2848 VR128, loadv2i64, i128mem, itins, IsCommutable, 0>, VEX_4V; 2849 2850let Constraints = "$src1 = $dst" in 2851 defm NAME : PDI_binop_rm<opc, OpcodeStr, Opcode, OpVT128, VR128, 2852 memopv2i64, i128mem, itins, IsCommutable, 1>; 2853 2854let Predicates = [HasAVX2, prd] in 2855 defm V#NAME#Y : PDI_binop_rm<opc, !strconcat("v", OpcodeStr), Opcode, 2856 OpVT256, VR256, loadv4i64, i256mem, itins, 2857 IsCommutable, 0>, VEX_4V, VEX_L; 2858} 2859 2860// These are ordered here for pattern ordering requirements with the fp versions 2861 2862defm PAND : PDI_binop_all<0xDB, "pand", and, v2i64, v4i64, 2863 SSE_VEC_BIT_ITINS_P, 1, NoVLX>; 2864defm POR : PDI_binop_all<0xEB, "por", or, v2i64, v4i64, 2865 SSE_VEC_BIT_ITINS_P, 1, NoVLX>; 2866defm PXOR : PDI_binop_all<0xEF, "pxor", xor, v2i64, v4i64, 2867 SSE_VEC_BIT_ITINS_P, 1, NoVLX>; 
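// Note that PANDN computes NOT(src1) AND src2, not src1 AND NOT(src2),
// which is why it is the one op in this group defined with IsCommutable = 0.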
2868defm PANDN : PDI_binop_all<0xDF, "pandn", X86andnp, v2i64, v4i64, 2869 SSE_VEC_BIT_ITINS_P, 0, NoVLX>; 2870 2871//===----------------------------------------------------------------------===// 2872// SSE 1 & 2 - Logical Instructions 2873//===----------------------------------------------------------------------===// 2874 2875// Multiclass for scalars using the X86 logical operation aliases for FP. 2876multiclass sse12_fp_packed_scalar_logical_alias< 2877 bits<8> opc, string OpcodeStr, SDNode OpNode, OpndItins itins> { 2878 defm V#NAME#PS : sse12_fp_packed<opc, !strconcat(OpcodeStr, "ps"), OpNode, 2879 FR32, f32, f128mem, loadf32_128, SSEPackedSingle, itins, 0>, 2880 PS, VEX_4V; 2881 2882 defm V#NAME#PD : sse12_fp_packed<opc, !strconcat(OpcodeStr, "pd"), OpNode, 2883 FR64, f64, f128mem, loadf64_128, SSEPackedDouble, itins, 0>, 2884 PD, VEX_4V; 2885 2886 let Constraints = "$src1 = $dst" in { 2887 defm PS : sse12_fp_packed<opc, !strconcat(OpcodeStr, "ps"), OpNode, FR32, 2888 f32, f128mem, memopfsf32_128, SSEPackedSingle, itins>, PS; 2889 2890 defm PD : sse12_fp_packed<opc, !strconcat(OpcodeStr, "pd"), OpNode, FR64, 2891 f64, f128mem, memopfsf64_128, SSEPackedDouble, itins>, PD; 2892 } 2893} 2894 2895let isCodeGenOnly = 1 in { 2896 defm FsAND : sse12_fp_packed_scalar_logical_alias<0x54, "and", X86fand, 2897 SSE_BIT_ITINS_P>; 2898 defm FsOR : sse12_fp_packed_scalar_logical_alias<0x56, "or", X86for, 2899 SSE_BIT_ITINS_P>; 2900 defm FsXOR : sse12_fp_packed_scalar_logical_alias<0x57, "xor", X86fxor, 2901 SSE_BIT_ITINS_P>; 2902 2903 let isCommutable = 0 in 2904 defm FsANDN : sse12_fp_packed_scalar_logical_alias<0x55, "andn", X86fandn, 2905 SSE_BIT_ITINS_P>; 2906} 2907 2908// Multiclass for vectors using the X86 logical operation aliases for FP. 2909multiclass sse12_fp_packed_vector_logical_alias< 2910 bits<8> opc, string OpcodeStr, SDNode OpNode, OpndItins itins> { 2911 let Predicates = [HasAVX, NoVLX_Or_NoDQI] in { 2912 defm V#NAME#PS : sse12_fp_packed<opc, !strconcat(OpcodeStr, "ps"), OpNode, 2913 VR128, v4f32, f128mem, loadv4f32, SSEPackedSingle, itins, 0>, 2914 PS, VEX_4V; 2915 2916 defm V#NAME#PD : sse12_fp_packed<opc, !strconcat(OpcodeStr, "pd"), OpNode, 2917 VR128, v2f64, f128mem, loadv2f64, SSEPackedDouble, itins, 0>, 2918 PD, VEX_4V; 2919 2920 defm V#NAME#PSY : sse12_fp_packed<opc, !strconcat(OpcodeStr, "ps"), OpNode, 2921 VR256, v8f32, f256mem, loadv8f32, SSEPackedSingle, itins, 0>, 2922 PS, VEX_4V, VEX_L; 2923 2924 defm V#NAME#PDY : sse12_fp_packed<opc, !strconcat(OpcodeStr, "pd"), OpNode, 2925 VR256, v4f64, f256mem, loadv4f64, SSEPackedDouble, itins, 0>, 2926 PD, VEX_4V, VEX_L; 2927 } 2928 2929 let Constraints = "$src1 = $dst" in { 2930 defm PS : sse12_fp_packed<opc, !strconcat(OpcodeStr, "ps"), OpNode, VR128, 2931 v4f32, f128mem, memopv4f32, SSEPackedSingle, itins>, 2932 PS; 2933 2934 defm PD : sse12_fp_packed<opc, !strconcat(OpcodeStr, "pd"), OpNode, VR128, 2935 v2f64, f128mem, memopv2f64, SSEPackedDouble, itins>, 2936 PD; 2937 } 2938} 2939 2940let isCodeGenOnly = 1 in { 2941 defm FvAND : sse12_fp_packed_vector_logical_alias<0x54, "and", X86fand, 2942 SSE_BIT_ITINS_P>; 2943 defm FvOR : sse12_fp_packed_vector_logical_alias<0x56, "or", X86for, 2944 SSE_BIT_ITINS_P>; 2945 defm FvXOR : sse12_fp_packed_vector_logical_alias<0x57, "xor", X86fxor, 2946 SSE_BIT_ITINS_P>; 2947 2948 let isCommutable = 0 in 2949 defm FvANDN : sse12_fp_packed_vector_logical_alias<0x55, "andn", X86fandn, 2950 SSE_BIT_ITINS_P>; 2951} 2952 2953/// sse12_fp_packed_logical - SSE 1 & 2 packed FP logical ops 2954/// 
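/// A classic use of these packed FP logical ops is sign-bit manipulation.
/// As an illustrative sketch in C intrinsics (fabs4 is a hypothetical
/// helper, not something defined in this file):
///   __m128 fabs4(__m128 x) {
///     // ANDPS with 0x7fffffff in every lane clears the sign bits,
///     // computing fabs on all four floats at once.
///     return _mm_and_ps(x, _mm_castsi128_ps(_mm_set1_epi32(0x7fffffff)));
///   }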
2955multiclass sse12_fp_packed_logical<bits<8> opc, string OpcodeStr, 2956 SDNode OpNode> { 2957 let Predicates = [HasAVX, NoVLX] in { 2958 defm V#NAME#PSY : sse12_fp_packed_logical_rm<opc, VR256, SSEPackedSingle, 2959 !strconcat(OpcodeStr, "ps"), f256mem, 2960 [(set VR256:$dst, (v4i64 (OpNode VR256:$src1, VR256:$src2)))], 2961 [(set VR256:$dst, (OpNode (bc_v4i64 (v8f32 VR256:$src1)), 2962 (loadv4i64 addr:$src2)))], 0>, PS, VEX_4V, VEX_L; 2963 2964 defm V#NAME#PDY : sse12_fp_packed_logical_rm<opc, VR256, SSEPackedDouble, 2965 !strconcat(OpcodeStr, "pd"), f256mem, 2966 [(set VR256:$dst, (OpNode (bc_v4i64 (v4f64 VR256:$src1)), 2967 (bc_v4i64 (v4f64 VR256:$src2))))], 2968 [(set VR256:$dst, (OpNode (bc_v4i64 (v4f64 VR256:$src1)), 2969 (loadv4i64 addr:$src2)))], 0>, 2970 PD, VEX_4V, VEX_L; 2971 2972 // In AVX no need to add a pattern for 128-bit logical rr ps, because they 2973 // are all promoted to v2i64, and the patterns are covered by the int 2974 // version. This is needed in SSE only, because v2i64 isn't supported on 2975 // SSE1, but only on SSE2. 2976 defm V#NAME#PS : sse12_fp_packed_logical_rm<opc, VR128, SSEPackedSingle, 2977 !strconcat(OpcodeStr, "ps"), f128mem, [], 2978 [(set VR128:$dst, (OpNode (bc_v2i64 (v4f32 VR128:$src1)), 2979 (loadv2i64 addr:$src2)))], 0>, PS, VEX_4V; 2980 2981 defm V#NAME#PD : sse12_fp_packed_logical_rm<opc, VR128, SSEPackedDouble, 2982 !strconcat(OpcodeStr, "pd"), f128mem, 2983 [(set VR128:$dst, (OpNode (bc_v2i64 (v2f64 VR128:$src1)), 2984 (bc_v2i64 (v2f64 VR128:$src2))))], 2985 [(set VR128:$dst, (OpNode (bc_v2i64 (v2f64 VR128:$src1)), 2986 (loadv2i64 addr:$src2)))], 0>, 2987 PD, VEX_4V; 2988 } 2989 2990 let Constraints = "$src1 = $dst" in { 2991 defm PS : sse12_fp_packed_logical_rm<opc, VR128, SSEPackedSingle, 2992 !strconcat(OpcodeStr, "ps"), f128mem, 2993 [(set VR128:$dst, (v2i64 (OpNode VR128:$src1, VR128:$src2)))], 2994 [(set VR128:$dst, (OpNode (bc_v2i64 (v4f32 VR128:$src1)), 2995 (memopv2i64 addr:$src2)))]>, PS; 2996 2997 defm PD : sse12_fp_packed_logical_rm<opc, VR128, SSEPackedDouble, 2998 !strconcat(OpcodeStr, "pd"), f128mem, 2999 [(set VR128:$dst, (OpNode (bc_v2i64 (v2f64 VR128:$src1)), 3000 (bc_v2i64 (v2f64 VR128:$src2))))], 3001 [(set VR128:$dst, (OpNode (bc_v2i64 (v2f64 VR128:$src1)), 3002 (memopv2i64 addr:$src2)))]>, PD; 3003 } 3004} 3005 3006defm AND : sse12_fp_packed_logical<0x54, "and", and>; 3007defm OR : sse12_fp_packed_logical<0x56, "or", or>; 3008defm XOR : sse12_fp_packed_logical<0x57, "xor", xor>; 3009let isCommutable = 0 in 3010 defm ANDN : sse12_fp_packed_logical<0x55, "andn", X86andnp>; 3011 3012// AVX1 requires type coercions in order to fold loads directly into logical 3013// operations. 3014let Predicates = [HasAVX1Only] in { 3015 def : Pat<(bc_v8f32 (and VR256:$src1, (loadv4i64 addr:$src2))), 3016 (VANDPSYrm VR256:$src1, addr:$src2)>; 3017 def : Pat<(bc_v8f32 (or VR256:$src1, (loadv4i64 addr:$src2))), 3018 (VORPSYrm VR256:$src1, addr:$src2)>; 3019 def : Pat<(bc_v8f32 (xor VR256:$src1, (loadv4i64 addr:$src2))), 3020 (VXORPSYrm VR256:$src1, addr:$src2)>; 3021 def : Pat<(bc_v8f32 (X86andnp VR256:$src1, (loadv4i64 addr:$src2))), 3022 (VANDNPSYrm VR256:$src1, addr:$src2)>; 3023} 3024 3025//===----------------------------------------------------------------------===// 3026// SSE 1 & 2 - Arithmetic Instructions 3027//===----------------------------------------------------------------------===// 3028 3029/// basic_sse12_fp_binop_xxx - SSE 1 & 2 binops come in both scalar and 3030/// vector forms. 
3031/// 3032/// In addition, we also have a special variant of the scalar form here to 3033/// represent the associated intrinsic operation. This form is unlike the 3034/// plain scalar form, in that it takes an entire vector (instead of a scalar) 3035/// and leaves the top elements unmodified (therefore these cannot be commuted). 3036/// 3037/// These three forms can each be reg+reg or reg+mem. 3038/// 3039 3040/// FIXME: once all 256-bit intrinsics are matched, cleanup and refactor those 3041/// classes below 3042multiclass basic_sse12_fp_binop_p<bits<8> opc, string OpcodeStr, 3043 SDNode OpNode, SizeItins itins> { 3044 let Predicates = [HasAVX, NoVLX] in { 3045 defm V#NAME#PS : sse12_fp_packed<opc, !strconcat(OpcodeStr, "ps"), OpNode, 3046 VR128, v4f32, f128mem, loadv4f32, 3047 SSEPackedSingle, itins.s, 0>, PS, VEX_4V; 3048 defm V#NAME#PD : sse12_fp_packed<opc, !strconcat(OpcodeStr, "pd"), OpNode, 3049 VR128, v2f64, f128mem, loadv2f64, 3050 SSEPackedDouble, itins.d, 0>, PD, VEX_4V; 3051 3052 defm V#NAME#PSY : sse12_fp_packed<opc, !strconcat(OpcodeStr, "ps"), 3053 OpNode, VR256, v8f32, f256mem, loadv8f32, 3054 SSEPackedSingle, itins.s, 0>, PS, VEX_4V, VEX_L; 3055 defm V#NAME#PDY : sse12_fp_packed<opc, !strconcat(OpcodeStr, "pd"), 3056 OpNode, VR256, v4f64, f256mem, loadv4f64, 3057 SSEPackedDouble, itins.d, 0>, PD, VEX_4V, VEX_L; 3058 } 3059 3060 let Constraints = "$src1 = $dst" in { 3061 defm PS : sse12_fp_packed<opc, !strconcat(OpcodeStr, "ps"), OpNode, VR128, 3062 v4f32, f128mem, memopv4f32, SSEPackedSingle, 3063 itins.s>, PS; 3064 defm PD : sse12_fp_packed<opc, !strconcat(OpcodeStr, "pd"), OpNode, VR128, 3065 v2f64, f128mem, memopv2f64, SSEPackedDouble, 3066 itins.d>, PD; 3067 } 3068} 3069 3070multiclass basic_sse12_fp_binop_s<bits<8> opc, string OpcodeStr, SDNode OpNode, 3071 SizeItins itins> { 3072 defm V#NAME#SS : sse12_fp_scalar<opc, !strconcat(OpcodeStr, "ss"), 3073 OpNode, FR32, f32mem, SSEPackedSingle, itins.s, 0>, 3074 XS, VEX_4V, VEX_LIG; 3075 defm V#NAME#SD : sse12_fp_scalar<opc, !strconcat(OpcodeStr, "sd"), 3076 OpNode, FR64, f64mem, SSEPackedDouble, itins.d, 0>, 3077 XD, VEX_4V, VEX_LIG; 3078 3079 let Constraints = "$src1 = $dst" in { 3080 defm SS : sse12_fp_scalar<opc, !strconcat(OpcodeStr, "ss"), 3081 OpNode, FR32, f32mem, SSEPackedSingle, 3082 itins.s>, XS; 3083 defm SD : sse12_fp_scalar<opc, !strconcat(OpcodeStr, "sd"), 3084 OpNode, FR64, f64mem, SSEPackedDouble, 3085 itins.d>, XD; 3086 } 3087} 3088 3089multiclass basic_sse12_fp_binop_s_int<bits<8> opc, string OpcodeStr, 3090 SizeItins itins> { 3091 defm V#NAME#SS : sse12_fp_scalar_int<opc, OpcodeStr, VR128, 3092 !strconcat(OpcodeStr, "ss"), "", "_ss", ssmem, sse_load_f32, 3093 SSEPackedSingle, itins.s, 0>, XS, VEX_4V, VEX_LIG; 3094 defm V#NAME#SD : sse12_fp_scalar_int<opc, OpcodeStr, VR128, 3095 !strconcat(OpcodeStr, "sd"), "2", "_sd", sdmem, sse_load_f64, 3096 SSEPackedDouble, itins.d, 0>, XD, VEX_4V, VEX_LIG; 3097 3098 let Constraints = "$src1 = $dst" in { 3099 defm SS : sse12_fp_scalar_int<opc, OpcodeStr, VR128, 3100 !strconcat(OpcodeStr, "ss"), "", "_ss", ssmem, sse_load_f32, 3101 SSEPackedSingle, itins.s>, XS; 3102 defm SD : sse12_fp_scalar_int<opc, OpcodeStr, VR128, 3103 !strconcat(OpcodeStr, "sd"), "2", "_sd", sdmem, sse_load_f64, 3104 SSEPackedDouble, itins.d>, XD; 3105 } 3106} 3107 3108// Binary Arithmetic instructions 3109defm ADD : basic_sse12_fp_binop_p<0x58, "add", fadd, SSE_ALU_ITINS_P>, 3110 basic_sse12_fp_binop_s<0x58, "add", fadd, SSE_ALU_ITINS_S>, 3111 basic_sse12_fp_binop_s_int<0x58, "add", 
                                              SSE_ALU_ITINS_S>;
defm MUL : basic_sse12_fp_binop_p<0x59, "mul", fmul, SSE_MUL_ITINS_P>,
           basic_sse12_fp_binop_s<0x59, "mul", fmul, SSE_MUL_ITINS_S>,
           basic_sse12_fp_binop_s_int<0x59, "mul", SSE_MUL_ITINS_S>;
let isCommutable = 0 in {
  defm SUB : basic_sse12_fp_binop_p<0x5C, "sub", fsub, SSE_ALU_ITINS_P>,
             basic_sse12_fp_binop_s<0x5C, "sub", fsub, SSE_ALU_ITINS_S>,
             basic_sse12_fp_binop_s_int<0x5C, "sub", SSE_ALU_ITINS_S>;
  defm DIV : basic_sse12_fp_binop_p<0x5E, "div", fdiv, SSE_DIV_ITINS_P>,
             basic_sse12_fp_binop_s<0x5E, "div", fdiv, SSE_DIV_ITINS_S>,
             basic_sse12_fp_binop_s_int<0x5E, "div", SSE_DIV_ITINS_S>;
  defm MAX : basic_sse12_fp_binop_p<0x5F, "max", X86fmax, SSE_ALU_ITINS_P>,
             basic_sse12_fp_binop_s<0x5F, "max", X86fmax, SSE_ALU_ITINS_S>,
             basic_sse12_fp_binop_s_int<0x5F, "max", SSE_ALU_ITINS_S>;
  defm MIN : basic_sse12_fp_binop_p<0x5D, "min", X86fmin, SSE_ALU_ITINS_P>,
             basic_sse12_fp_binop_s<0x5D, "min", X86fmin, SSE_ALU_ITINS_S>,
             basic_sse12_fp_binop_s_int<0x5D, "min", SSE_ALU_ITINS_S>;
}

let isCodeGenOnly = 1 in {
  defm MAXC: basic_sse12_fp_binop_p<0x5F, "max", X86fmaxc, SSE_ALU_ITINS_P>,
             basic_sse12_fp_binop_s<0x5F, "max", X86fmaxc, SSE_ALU_ITINS_S>;
  defm MINC: basic_sse12_fp_binop_p<0x5D, "min", X86fminc, SSE_ALU_ITINS_P>,
             basic_sse12_fp_binop_s<0x5D, "min", X86fminc, SSE_ALU_ITINS_S>;
}

// Patterns used to select SSE scalar fp arithmetic instructions from
// either:
//
// (1) a scalar fp operation followed by a blend
//
// The effect is that the backend no longer emits unnecessary vector
// insert instructions immediately after SSE scalar fp instructions
// like addss or mulss.
//
// For example, given the following code:
//   __m128 foo(__m128 A, __m128 B) {
//     A[0] += B[0];
//     return A;
//   }
//
// Previously we generated:
//   addss %xmm0, %xmm1
//   movss %xmm1, %xmm0
//
// We now generate:
//   addss %xmm1, %xmm0
//
// (2) a vector packed single/double fp operation followed by a vector insert
//
// The effect is that the backend converts the packed fp instruction
// followed by a vector insert into a single SSE scalar fp instruction.
//
// For example, given the following code:
//   __m128 foo(__m128 A, __m128 B) {
//     __m128 C = A + B;
//     return (__m128) {C[0], A[1], A[2], A[3]};
//   }
//
// Previously we generated:
//   addps %xmm0, %xmm1
//   movss %xmm1, %xmm0
//
// We now generate:
//   addss %xmm1, %xmm0

// TODO: Some canonicalization in lowering would simplify the number of
// patterns we have to try to match.
multiclass scalar_math_f32_patterns<SDNode Op, string OpcPrefix> {
  let Predicates = [UseSSE1] in {
    // extracted scalar math op with insert via movss
    def : Pat<(v4f32 (X86Movss (v4f32 VR128:$dst), (v4f32 (scalar_to_vector
                      (Op (f32 (extractelt (v4f32 VR128:$dst), (iPTR 0))),
                      FR32:$src))))),
              (!cast<I>(OpcPrefix#SSrr_Int) v4f32:$dst,
               (COPY_TO_REGCLASS FR32:$src, VR128))>;

    // vector math op with insert via movss
    def : Pat<(v4f32 (X86Movss (v4f32 VR128:$dst),
                     (Op (v4f32 VR128:$dst), (v4f32 VR128:$src)))),
              (!cast<I>(OpcPrefix#SSrr_Int) v4f32:$dst, v4f32:$src)>;
  }

  // With SSE 4.1, blendi is preferred to movss, so match that too.
3195 let Predicates = [UseSSE41] in { 3196 // extracted scalar math op with insert via blend 3197 def : Pat<(v4f32 (X86Blendi (v4f32 VR128:$dst), (v4f32 (scalar_to_vector 3198 (Op (f32 (extractelt (v4f32 VR128:$dst), (iPTR 0))), 3199 FR32:$src))), (i8 1))), 3200 (!cast<I>(OpcPrefix#SSrr_Int) v4f32:$dst, 3201 (COPY_TO_REGCLASS FR32:$src, VR128))>; 3202 3203 // vector math op with insert via blend 3204 def : Pat<(v4f32 (X86Blendi (v4f32 VR128:$dst), 3205 (Op (v4f32 VR128:$dst), (v4f32 VR128:$src)), (i8 1))), 3206 (!cast<I>(OpcPrefix#SSrr_Int)v4f32:$dst, v4f32:$src)>; 3207 3208 } 3209 3210 // Repeat everything for AVX, except for the movss + scalar combo... 3211 // because that one shouldn't occur with AVX codegen? 3212 let Predicates = [HasAVX] in { 3213 // extracted scalar math op with insert via blend 3214 def : Pat<(v4f32 (X86Blendi (v4f32 VR128:$dst), (v4f32 (scalar_to_vector 3215 (Op (f32 (extractelt (v4f32 VR128:$dst), (iPTR 0))), 3216 FR32:$src))), (i8 1))), 3217 (!cast<I>("V"#OpcPrefix#SSrr_Int) v4f32:$dst, 3218 (COPY_TO_REGCLASS FR32:$src, VR128))>; 3219 3220 // vector math op with insert via movss 3221 def : Pat<(v4f32 (X86Movss (v4f32 VR128:$dst), 3222 (Op (v4f32 VR128:$dst), (v4f32 VR128:$src)))), 3223 (!cast<I>("V"#OpcPrefix#SSrr_Int) v4f32:$dst, v4f32:$src)>; 3224 3225 // vector math op with insert via blend 3226 def : Pat<(v4f32 (X86Blendi (v4f32 VR128:$dst), 3227 (Op (v4f32 VR128:$dst), (v4f32 VR128:$src)), (i8 1))), 3228 (!cast<I>("V"#OpcPrefix#SSrr_Int) v4f32:$dst, v4f32:$src)>; 3229 } 3230} 3231 3232defm : scalar_math_f32_patterns<fadd, "ADD">; 3233defm : scalar_math_f32_patterns<fsub, "SUB">; 3234defm : scalar_math_f32_patterns<fmul, "MUL">; 3235defm : scalar_math_f32_patterns<fdiv, "DIV">; 3236 3237multiclass scalar_math_f64_patterns<SDNode Op, string OpcPrefix> { 3238 let Predicates = [UseSSE2] in { 3239 // extracted scalar math op with insert via movsd 3240 def : Pat<(v2f64 (X86Movsd (v2f64 VR128:$dst), (v2f64 (scalar_to_vector 3241 (Op (f64 (extractelt (v2f64 VR128:$dst), (iPTR 0))), 3242 FR64:$src))))), 3243 (!cast<I>(OpcPrefix#SDrr_Int) v2f64:$dst, 3244 (COPY_TO_REGCLASS FR64:$src, VR128))>; 3245 3246 // vector math op with insert via movsd 3247 def : Pat<(v2f64 (X86Movsd (v2f64 VR128:$dst), 3248 (Op (v2f64 VR128:$dst), (v2f64 VR128:$src)))), 3249 (!cast<I>(OpcPrefix#SDrr_Int) v2f64:$dst, v2f64:$src)>; 3250 } 3251 3252 // With SSE 4.1, blendi is preferred to movsd, so match those too. 3253 let Predicates = [UseSSE41] in { 3254 // extracted scalar math op with insert via blend 3255 def : Pat<(v2f64 (X86Blendi (v2f64 VR128:$dst), (v2f64 (scalar_to_vector 3256 (Op (f64 (extractelt (v2f64 VR128:$dst), (iPTR 0))), 3257 FR64:$src))), (i8 1))), 3258 (!cast<I>(OpcPrefix#SDrr_Int) v2f64:$dst, 3259 (COPY_TO_REGCLASS FR64:$src, VR128))>; 3260 3261 // vector math op with insert via blend 3262 def : Pat<(v2f64 (X86Blendi (v2f64 VR128:$dst), 3263 (Op (v2f64 VR128:$dst), (v2f64 VR128:$src)), (i8 1))), 3264 (!cast<I>(OpcPrefix#SDrr_Int) v2f64:$dst, v2f64:$src)>; 3265 } 3266 3267 // Repeat everything for AVX. 
  let Predicates = [HasAVX] in {
    // extracted scalar math op with insert via movsd
    def : Pat<(v2f64 (X86Movsd (v2f64 VR128:$dst), (v2f64 (scalar_to_vector
                      (Op (f64 (extractelt (v2f64 VR128:$dst), (iPTR 0))),
                      FR64:$src))))),
              (!cast<I>("V"#OpcPrefix#SDrr_Int) v2f64:$dst,
               (COPY_TO_REGCLASS FR64:$src, VR128))>;

    // extracted scalar math op with insert via blend
    def : Pat<(v2f64 (X86Blendi (v2f64 VR128:$dst), (v2f64 (scalar_to_vector
                      (Op (f64 (extractelt (v2f64 VR128:$dst), (iPTR 0))),
                      FR64:$src))), (i8 1))),
              (!cast<I>("V"#OpcPrefix#SDrr_Int) v2f64:$dst,
               (COPY_TO_REGCLASS FR64:$src, VR128))>;

    // vector math op with insert via movsd
    def : Pat<(v2f64 (X86Movsd (v2f64 VR128:$dst),
                     (Op (v2f64 VR128:$dst), (v2f64 VR128:$src)))),
              (!cast<I>("V"#OpcPrefix#SDrr_Int) v2f64:$dst, v2f64:$src)>;

    // vector math op with insert via blend
    def : Pat<(v2f64 (X86Blendi (v2f64 VR128:$dst),
                     (Op (v2f64 VR128:$dst), (v2f64 VR128:$src)), (i8 1))),
              (!cast<I>("V"#OpcPrefix#SDrr_Int) v2f64:$dst, v2f64:$src)>;
  }
}

defm : scalar_math_f64_patterns<fadd, "ADD">;
defm : scalar_math_f64_patterns<fsub, "SUB">;
defm : scalar_math_f64_patterns<fmul, "MUL">;
defm : scalar_math_f64_patterns<fdiv, "DIV">;


/// Unop Arithmetic
/// In addition, we also have a special variant of the scalar form here to
/// represent the associated intrinsic operation. This form is unlike the
/// plain scalar form, in that it takes an entire vector (instead of a
/// scalar) and leaves the top elements undefined.
///
/// There is also a special variant for the full-vector intrinsic form.

let Sched = WriteFSqrt in {
def SSE_SQRTPS : OpndItins<
  IIC_SSE_SQRTPS_RR, IIC_SSE_SQRTPS_RM
>;

def SSE_SQRTSS : OpndItins<
  IIC_SSE_SQRTSS_RR, IIC_SSE_SQRTSS_RM
>;

def SSE_SQRTPD : OpndItins<
  IIC_SSE_SQRTPD_RR, IIC_SSE_SQRTPD_RM
>;

def SSE_SQRTSD : OpndItins<
  IIC_SSE_SQRTSD_RR, IIC_SSE_SQRTSD_RM
>;
}

let Sched = WriteFRsqrt in {
def SSE_RSQRTPS : OpndItins<
  IIC_SSE_RSQRTPS_RR, IIC_SSE_RSQRTPS_RM
>;

def SSE_RSQRTSS : OpndItins<
  IIC_SSE_RSQRTSS_RR, IIC_SSE_RSQRTSS_RM
>;
}

let Sched = WriteFRcp in {
def SSE_RCPP : OpndItins<
  IIC_SSE_RCPP_RR, IIC_SSE_RCPP_RM
>;

def SSE_RCPS : OpndItins<
  IIC_SSE_RCPS_RR, IIC_SSE_RCPS_RM
>;
}

/// sse_fp_unop_s - SSE1 unops in scalar form
/// For the non-AVX defs, we need $src1 to be tied to $dst because
/// the HW instructions are 2 operand / destructive.
3350multiclass sse_fp_unop_s<bits<8> opc, string OpcodeStr, RegisterClass RC, 3351 ValueType vt, ValueType ScalarVT, 3352 X86MemOperand x86memop, Operand vec_memop, 3353 ComplexPattern mem_cpat, Intrinsic Intr, 3354 SDNode OpNode, Domain d, OpndItins itins, 3355 Predicate target, string Suffix> { 3356 let hasSideEffects = 0 in { 3357 def r : I<opc, MRMSrcReg, (outs RC:$dst), (ins RC:$src1), 3358 !strconcat(OpcodeStr, "\t{$src1, $dst|$dst, $src1}"), 3359 [(set RC:$dst, (OpNode RC:$src1))], itins.rr, d>, Sched<[itins.Sched]>, 3360 Requires<[target]>; 3361 let mayLoad = 1 in 3362 def m : I<opc, MRMSrcMem, (outs RC:$dst), (ins x86memop:$src1), 3363 !strconcat(OpcodeStr, "\t{$src1, $dst|$dst, $src1}"), 3364 [(set RC:$dst, (OpNode (load addr:$src1)))], itins.rm, d>, 3365 Sched<[itins.Sched.Folded, ReadAfterLd]>, 3366 Requires<[target, OptForSize]>; 3367 3368 let isCodeGenOnly = 1, Constraints = "$src1 = $dst" in { 3369 def r_Int : I<opc, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src1, VR128:$src2), 3370 !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"), 3371 []>, Sched<[itins.Sched.Folded, ReadAfterLd]>; 3372 let mayLoad = 1 in 3373 def m_Int : I<opc, MRMSrcMem, (outs VR128:$dst), (ins VR128:$src1, vec_memop:$src2), 3374 !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"), 3375 []>, Sched<[itins.Sched.Folded, ReadAfterLd]>; 3376 } 3377 } 3378 3379 let Predicates = [target] in { 3380 def : Pat<(vt (OpNode mem_cpat:$src)), 3381 (vt (COPY_TO_REGCLASS (vt (!cast<Instruction>(NAME#Suffix##m_Int) 3382 (vt (IMPLICIT_DEF)), mem_cpat:$src)), RC))>; 3383 // These are unary operations, but they are modeled as having 2 source operands 3384 // because the high elements of the destination are unchanged in SSE. 3385 def : Pat<(Intr VR128:$src), 3386 (!cast<Instruction>(NAME#Suffix##r_Int) VR128:$src, VR128:$src)>; 3387 def : Pat<(Intr (load addr:$src)), 3388 (vt (COPY_TO_REGCLASS(!cast<Instruction>(NAME#Suffix##m) 3389 addr:$src), VR128))>; 3390 def : Pat<(Intr mem_cpat:$src), 3391 (!cast<Instruction>(NAME#Suffix##m_Int) 3392 (vt (IMPLICIT_DEF)), mem_cpat:$src)>; 3393 } 3394} 3395 3396multiclass avx_fp_unop_s<bits<8> opc, string OpcodeStr, RegisterClass RC, 3397 ValueType vt, ValueType ScalarVT, 3398 X86MemOperand x86memop, Operand vec_memop, 3399 ComplexPattern mem_cpat, 3400 Intrinsic Intr, SDNode OpNode, Domain d, 3401 OpndItins itins, string Suffix> { 3402 let hasSideEffects = 0 in { 3403 def r : I<opc, MRMSrcReg, (outs RC:$dst), (ins RC:$src1, RC:$src2), 3404 !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), 3405 [], itins.rr, d>, Sched<[itins.Sched]>; 3406 let mayLoad = 1 in 3407 def m : I<opc, MRMSrcMem, (outs RC:$dst), (ins RC:$src1, x86memop:$src2), 3408 !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), 3409 [], itins.rm, d>, Sched<[itins.Sched.Folded, ReadAfterLd]>; 3410 let isCodeGenOnly = 1 in { 3411 def r_Int : I<opc, MRMSrcReg, (outs VR128:$dst), 3412 (ins VR128:$src1, VR128:$src2), 3413 !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), 3414 []>, Sched<[itins.Sched.Folded]>; 3415 let mayLoad = 1 in 3416 def m_Int : I<opc, MRMSrcMem, (outs VR128:$dst), 3417 (ins VR128:$src1, vec_memop:$src2), 3418 !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), 3419 []>, Sched<[itins.Sched.Folded, ReadAfterLd]>; 3420 } 3421 } 3422 3423 let Predicates = [UseAVX] in { 3424 def : Pat<(OpNode RC:$src), (!cast<Instruction>("V"#NAME#Suffix##r) 3425 (ScalarVT (IMPLICIT_DEF)), RC:$src)>; 3426 3427 def : Pat<(vt (OpNode mem_cpat:$src)), 3428 
(!cast<Instruction>("V"#NAME#Suffix##m_Int) (vt (IMPLICIT_DEF)), 3429 mem_cpat:$src)>; 3430 3431 } 3432 let Predicates = [HasAVX] in { 3433 def : Pat<(Intr VR128:$src), 3434 (!cast<Instruction>("V"#NAME#Suffix##r_Int) (vt (IMPLICIT_DEF)), 3435 VR128:$src)>; 3436 3437 def : Pat<(Intr mem_cpat:$src), 3438 (!cast<Instruction>("V"#NAME#Suffix##m_Int) 3439 (vt (IMPLICIT_DEF)), mem_cpat:$src)>; 3440 } 3441 let Predicates = [UseAVX, OptForSize] in 3442 def : Pat<(ScalarVT (OpNode (load addr:$src))), 3443 (!cast<Instruction>("V"#NAME#Suffix##m) (ScalarVT (IMPLICIT_DEF)), 3444 addr:$src)>; 3445} 3446 3447/// sse1_fp_unop_p - SSE1 unops in packed form. 3448multiclass sse1_fp_unop_p<bits<8> opc, string OpcodeStr, SDNode OpNode, 3449 OpndItins itins> { 3450let Predicates = [HasAVX] in { 3451 def V#NAME#PSr : PSI<opc, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), 3452 !strconcat("v", OpcodeStr, 3453 "ps\t{$src, $dst|$dst, $src}"), 3454 [(set VR128:$dst, (v4f32 (OpNode VR128:$src)))], 3455 itins.rr>, VEX, Sched<[itins.Sched]>; 3456 def V#NAME#PSm : PSI<opc, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src), 3457 !strconcat("v", OpcodeStr, 3458 "ps\t{$src, $dst|$dst, $src}"), 3459 [(set VR128:$dst, (OpNode (loadv4f32 addr:$src)))], 3460 itins.rm>, VEX, Sched<[itins.Sched.Folded]>; 3461 def V#NAME#PSYr : PSI<opc, MRMSrcReg, (outs VR256:$dst), (ins VR256:$src), 3462 !strconcat("v", OpcodeStr, 3463 "ps\t{$src, $dst|$dst, $src}"), 3464 [(set VR256:$dst, (v8f32 (OpNode VR256:$src)))], 3465 itins.rr>, VEX, VEX_L, Sched<[itins.Sched]>; 3466 def V#NAME#PSYm : PSI<opc, MRMSrcMem, (outs VR256:$dst), (ins f256mem:$src), 3467 !strconcat("v", OpcodeStr, 3468 "ps\t{$src, $dst|$dst, $src}"), 3469 [(set VR256:$dst, (OpNode (loadv8f32 addr:$src)))], 3470 itins.rm>, VEX, VEX_L, Sched<[itins.Sched.Folded]>; 3471} 3472 3473 def PSr : PSI<opc, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), 3474 !strconcat(OpcodeStr, "ps\t{$src, $dst|$dst, $src}"), 3475 [(set VR128:$dst, (v4f32 (OpNode VR128:$src)))], itins.rr>, 3476 Sched<[itins.Sched]>; 3477 def PSm : PSI<opc, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src), 3478 !strconcat(OpcodeStr, "ps\t{$src, $dst|$dst, $src}"), 3479 [(set VR128:$dst, (OpNode (memopv4f32 addr:$src)))], itins.rm>, 3480 Sched<[itins.Sched.Folded]>; 3481} 3482 3483/// sse2_fp_unop_p - SSE2 unops in vector forms. 
3484multiclass sse2_fp_unop_p<bits<8> opc, string OpcodeStr, 3485 SDNode OpNode, OpndItins itins> { 3486let Predicates = [HasAVX] in { 3487 def V#NAME#PDr : PDI<opc, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), 3488 !strconcat("v", OpcodeStr, 3489 "pd\t{$src, $dst|$dst, $src}"), 3490 [(set VR128:$dst, (v2f64 (OpNode VR128:$src)))], 3491 itins.rr>, VEX, Sched<[itins.Sched]>; 3492 def V#NAME#PDm : PDI<opc, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src), 3493 !strconcat("v", OpcodeStr, 3494 "pd\t{$src, $dst|$dst, $src}"), 3495 [(set VR128:$dst, (OpNode (loadv2f64 addr:$src)))], 3496 itins.rm>, VEX, Sched<[itins.Sched.Folded]>; 3497 def V#NAME#PDYr : PDI<opc, MRMSrcReg, (outs VR256:$dst), (ins VR256:$src), 3498 !strconcat("v", OpcodeStr, 3499 "pd\t{$src, $dst|$dst, $src}"), 3500 [(set VR256:$dst, (v4f64 (OpNode VR256:$src)))], 3501 itins.rr>, VEX, VEX_L, Sched<[itins.Sched]>; 3502 def V#NAME#PDYm : PDI<opc, MRMSrcMem, (outs VR256:$dst), (ins f256mem:$src), 3503 !strconcat("v", OpcodeStr, 3504 "pd\t{$src, $dst|$dst, $src}"), 3505 [(set VR256:$dst, (OpNode (loadv4f64 addr:$src)))], 3506 itins.rm>, VEX, VEX_L, Sched<[itins.Sched.Folded]>; 3507} 3508 3509 def PDr : PDI<opc, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), 3510 !strconcat(OpcodeStr, "pd\t{$src, $dst|$dst, $src}"), 3511 [(set VR128:$dst, (v2f64 (OpNode VR128:$src)))], itins.rr>, 3512 Sched<[itins.Sched]>; 3513 def PDm : PDI<opc, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src), 3514 !strconcat(OpcodeStr, "pd\t{$src, $dst|$dst, $src}"), 3515 [(set VR128:$dst, (OpNode (memopv2f64 addr:$src)))], itins.rm>, 3516 Sched<[itins.Sched.Folded]>; 3517} 3518 3519multiclass sse1_fp_unop_s<bits<8> opc, string OpcodeStr, SDNode OpNode, 3520 OpndItins itins> { 3521 defm SS : sse_fp_unop_s<opc, OpcodeStr##ss, FR32, v4f32, f32, f32mem, 3522 ssmem, sse_load_f32, 3523 !cast<Intrinsic>("int_x86_sse_"##OpcodeStr##_ss), OpNode, 3524 SSEPackedSingle, itins, UseSSE1, "SS">, XS; 3525 defm V#NAME#SS : avx_fp_unop_s<opc, "v"#OpcodeStr##ss, FR32, v4f32, f32, 3526 f32mem, ssmem, sse_load_f32, 3527 !cast<Intrinsic>("int_x86_sse_"##OpcodeStr##_ss), OpNode, 3528 SSEPackedSingle, itins, "SS">, XS, VEX_4V, VEX_LIG; 3529} 3530 3531multiclass sse2_fp_unop_s<bits<8> opc, string OpcodeStr, SDNode OpNode, 3532 OpndItins itins> { 3533 defm SD : sse_fp_unop_s<opc, OpcodeStr##sd, FR64, v2f64, f64, f64mem, 3534 sdmem, sse_load_f64, 3535 !cast<Intrinsic>("int_x86_sse2_"##OpcodeStr##_sd), 3536 OpNode, SSEPackedDouble, itins, UseSSE2, "SD">, XD; 3537 defm V#NAME#SD : avx_fp_unop_s<opc, "v"#OpcodeStr##sd, FR64, v2f64, f64, 3538 f64mem, sdmem, sse_load_f64, 3539 !cast<Intrinsic>("int_x86_sse2_"##OpcodeStr##_sd), 3540 OpNode, SSEPackedDouble, itins, "SD">, 3541 XD, VEX_4V, VEX_LIG; 3542} 3543 3544// Square root. 3545defm SQRT : sse1_fp_unop_s<0x51, "sqrt", fsqrt, SSE_SQRTSS>, 3546 sse1_fp_unop_p<0x51, "sqrt", fsqrt, SSE_SQRTPS>, 3547 sse2_fp_unop_s<0x51, "sqrt", fsqrt, SSE_SQRTSD>, 3548 sse2_fp_unop_p<0x51, "sqrt", fsqrt, SSE_SQRTPD>; 3549 3550// Reciprocal approximations. Note that these typically require refinement 3551// in order to obtain suitable precision. 3552defm RSQRT : sse1_fp_unop_s<0x52, "rsqrt", X86frsqrt, SSE_RSQRTSS>, 3553 sse1_fp_unop_p<0x52, "rsqrt", X86frsqrt, SSE_RSQRTPS>; 3554defm RCP : sse1_fp_unop_s<0x53, "rcp", X86frcp, SSE_RCPS>, 3555 sse1_fp_unop_p<0x53, "rcp", X86frcp, SSE_RCPP>; 3556 3557// There is no f64 version of the reciprocal approximation instructions. 
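// As an illustration of the refinement mentioned above (a sketch in C
// intrinsics; refine_rcp is a hypothetical helper, not defined anywhere in
// this file), one Newton-Raphson step roughly doubles the ~12 bits of
// precision of the RCPPS hardware estimate:
//   __m128 refine_rcp(__m128 a) {
//     __m128 x = _mm_rcp_ps(a);   // ~12-bit estimate of 1.0/a per lane
//     // x' = x * (2 - a*x): one Newton-Raphson iteration for 1/a
//     return _mm_mul_ps(x, _mm_sub_ps(_mm_set1_ps(2.0f), _mm_mul_ps(a, x)));
//   }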
3558 3559// TODO: We should add *scalar* op patterns for these just like we have for 3560// the binops above. If the binop and unop patterns could all be unified 3561// that would be even better. 3562 3563multiclass scalar_unary_math_patterns<Intrinsic Intr, string OpcPrefix, 3564 SDNode Move, ValueType VT, 3565 Predicate BasePredicate> { 3566 let Predicates = [BasePredicate] in { 3567 def : Pat<(VT (Move VT:$dst, (Intr VT:$src))), 3568 (!cast<I>(OpcPrefix#r_Int) VT:$dst, VT:$src)>; 3569 } 3570 3571 // With SSE 4.1, blendi is preferred to movs*, so match that too. 3572 let Predicates = [UseSSE41] in { 3573 def : Pat<(VT (X86Blendi VT:$dst, (Intr VT:$src), (i8 1))), 3574 (!cast<I>(OpcPrefix#r_Int) VT:$dst, VT:$src)>; 3575 } 3576 3577 // Repeat for AVX versions of the instructions. 3578 let Predicates = [HasAVX] in { 3579 def : Pat<(VT (Move VT:$dst, (Intr VT:$src))), 3580 (!cast<I>("V"#OpcPrefix#r_Int) VT:$dst, VT:$src)>; 3581 3582 def : Pat<(VT (X86Blendi VT:$dst, (Intr VT:$src), (i8 1))), 3583 (!cast<I>("V"#OpcPrefix#r_Int) VT:$dst, VT:$src)>; 3584 } 3585} 3586 3587defm : scalar_unary_math_patterns<int_x86_sse_rcp_ss, "RCPSS", X86Movss, 3588 v4f32, UseSSE1>; 3589defm : scalar_unary_math_patterns<int_x86_sse_rsqrt_ss, "RSQRTSS", X86Movss, 3590 v4f32, UseSSE1>; 3591defm : scalar_unary_math_patterns<int_x86_sse_sqrt_ss, "SQRTSS", X86Movss, 3592 v4f32, UseSSE1>; 3593defm : scalar_unary_math_patterns<int_x86_sse2_sqrt_sd, "SQRTSD", X86Movsd, 3594 v2f64, UseSSE2>; 3595 3596 3597//===----------------------------------------------------------------------===// 3598// SSE 1 & 2 - Non-temporal stores 3599//===----------------------------------------------------------------------===// 3600 3601let AddedComplexity = 400 in { // Prefer non-temporal versions 3602let SchedRW = [WriteStore] in { 3603let Predicates = [HasAVX, NoVLX] in { 3604def VMOVNTPSmr : VPSI<0x2B, MRMDestMem, (outs), 3605 (ins f128mem:$dst, VR128:$src), 3606 "movntps\t{$src, $dst|$dst, $src}", 3607 [(alignednontemporalstore (v4f32 VR128:$src), 3608 addr:$dst)], 3609 IIC_SSE_MOVNT>, VEX; 3610def VMOVNTPDmr : VPDI<0x2B, MRMDestMem, (outs), 3611 (ins f128mem:$dst, VR128:$src), 3612 "movntpd\t{$src, $dst|$dst, $src}", 3613 [(alignednontemporalstore (v2f64 VR128:$src), 3614 addr:$dst)], 3615 IIC_SSE_MOVNT>, VEX; 3616 3617let ExeDomain = SSEPackedInt in 3618def VMOVNTDQmr : VPDI<0xE7, MRMDestMem, (outs), 3619 (ins f128mem:$dst, VR128:$src), 3620 "movntdq\t{$src, $dst|$dst, $src}", 3621 [(alignednontemporalstore (v2i64 VR128:$src), 3622 addr:$dst)], 3623 IIC_SSE_MOVNT>, VEX; 3624 3625def VMOVNTPSYmr : VPSI<0x2B, MRMDestMem, (outs), 3626 (ins f256mem:$dst, VR256:$src), 3627 "movntps\t{$src, $dst|$dst, $src}", 3628 [(alignednontemporalstore (v8f32 VR256:$src), 3629 addr:$dst)], 3630 IIC_SSE_MOVNT>, VEX, VEX_L; 3631def VMOVNTPDYmr : VPDI<0x2B, MRMDestMem, (outs), 3632 (ins f256mem:$dst, VR256:$src), 3633 "movntpd\t{$src, $dst|$dst, $src}", 3634 [(alignednontemporalstore (v4f64 VR256:$src), 3635 addr:$dst)], 3636 IIC_SSE_MOVNT>, VEX, VEX_L; 3637let ExeDomain = SSEPackedInt in 3638def VMOVNTDQYmr : VPDI<0xE7, MRMDestMem, (outs), 3639 (ins f256mem:$dst, VR256:$src), 3640 "movntdq\t{$src, $dst|$dst, $src}", 3641 [(alignednontemporalstore (v4i64 VR256:$src), 3642 addr:$dst)], 3643 IIC_SSE_MOVNT>, VEX, VEX_L; 3644} 3645 3646def MOVNTPSmr : PSI<0x2B, MRMDestMem, (outs), (ins f128mem:$dst, VR128:$src), 3647 "movntps\t{$src, $dst|$dst, $src}", 3648 [(alignednontemporalstore (v4f32 VR128:$src), addr:$dst)], 3649 IIC_SSE_MOVNT>; 3650def MOVNTPDmr : 
PDI<0x2B, MRMDestMem, (outs), (ins f128mem:$dst, VR128:$src), 3651 "movntpd\t{$src, $dst|$dst, $src}", 3652 [(alignednontemporalstore(v2f64 VR128:$src), addr:$dst)], 3653 IIC_SSE_MOVNT>; 3654 3655let ExeDomain = SSEPackedInt in 3656def MOVNTDQmr : PDI<0xE7, MRMDestMem, (outs), (ins f128mem:$dst, VR128:$src), 3657 "movntdq\t{$src, $dst|$dst, $src}", 3658 [(alignednontemporalstore (v2i64 VR128:$src), addr:$dst)], 3659 IIC_SSE_MOVNT>; 3660 3661// There is no AVX form for instructions below this point 3662def MOVNTImr : I<0xC3, MRMDestMem, (outs), (ins i32mem:$dst, GR32:$src), 3663 "movnti{l}\t{$src, $dst|$dst, $src}", 3664 [(nontemporalstore (i32 GR32:$src), addr:$dst)], 3665 IIC_SSE_MOVNT>, 3666 PS, Requires<[HasSSE2]>; 3667def MOVNTI_64mr : RI<0xC3, MRMDestMem, (outs), (ins i64mem:$dst, GR64:$src), 3668 "movnti{q}\t{$src, $dst|$dst, $src}", 3669 [(nontemporalstore (i64 GR64:$src), addr:$dst)], 3670 IIC_SSE_MOVNT>, 3671 PS, Requires<[HasSSE2]>; 3672} // SchedRW = [WriteStore] 3673 3674let Predicates = [HasAVX2, NoVLX] in { 3675 def : Pat<(alignednontemporalstore (v8i32 VR256:$src), addr:$dst), 3676 (VMOVNTDQYmr addr:$dst, VR256:$src)>; 3677 def : Pat<(alignednontemporalstore (v16i16 VR256:$src), addr:$dst), 3678 (VMOVNTDQYmr addr:$dst, VR256:$src)>; 3679 def : Pat<(alignednontemporalstore (v32i8 VR256:$src), addr:$dst), 3680 (VMOVNTDQYmr addr:$dst, VR256:$src)>; 3681} 3682 3683let Predicates = [HasAVX, NoVLX] in { 3684 def : Pat<(alignednontemporalstore (v4i32 VR128:$src), addr:$dst), 3685 (VMOVNTDQmr addr:$dst, VR128:$src)>; 3686 def : Pat<(alignednontemporalstore (v8i16 VR128:$src), addr:$dst), 3687 (VMOVNTDQmr addr:$dst, VR128:$src)>; 3688 def : Pat<(alignednontemporalstore (v16i8 VR128:$src), addr:$dst), 3689 (VMOVNTDQmr addr:$dst, VR128:$src)>; 3690} 3691 3692def : Pat<(alignednontemporalstore (v4i32 VR128:$src), addr:$dst), 3693 (MOVNTDQmr addr:$dst, VR128:$src)>; 3694def : Pat<(alignednontemporalstore (v8i16 VR128:$src), addr:$dst), 3695 (MOVNTDQmr addr:$dst, VR128:$src)>; 3696def : Pat<(alignednontemporalstore (v16i8 VR128:$src), addr:$dst), 3697 (MOVNTDQmr addr:$dst, VR128:$src)>; 3698 3699} // AddedComplexity 3700 3701//===----------------------------------------------------------------------===// 3702// SSE 1 & 2 - Prefetch and memory fence 3703//===----------------------------------------------------------------------===// 3704 3705// Prefetch intrinsic. 3706let Predicates = [HasSSE1], SchedRW = [WriteLoad] in { 3707def PREFETCHT0 : I<0x18, MRM1m, (outs), (ins i8mem:$src), 3708 "prefetcht0\t$src", [(prefetch addr:$src, imm, (i32 3), (i32 1))], 3709 IIC_SSE_PREFETCH>, TB; 3710def PREFETCHT1 : I<0x18, MRM2m, (outs), (ins i8mem:$src), 3711 "prefetcht1\t$src", [(prefetch addr:$src, imm, (i32 2), (i32 1))], 3712 IIC_SSE_PREFETCH>, TB; 3713def PREFETCHT2 : I<0x18, MRM3m, (outs), (ins i8mem:$src), 3714 "prefetcht2\t$src", [(prefetch addr:$src, imm, (i32 1), (i32 1))], 3715 IIC_SSE_PREFETCH>, TB; 3716def PREFETCHNTA : I<0x18, MRM0m, (outs), (ins i8mem:$src), 3717 "prefetchnta\t$src", [(prefetch addr:$src, imm, (i32 0), (i32 1))], 3718 IIC_SSE_PREFETCH>, TB; 3719} 3720 3721// FIXME: How should flush instruction be modeled? 3722let SchedRW = [WriteLoad] in { 3723// Flush cache 3724def CLFLUSH : I<0xAE, MRM7m, (outs), (ins i8mem:$src), 3725 "clflush\t$src", [(int_x86_sse2_clflush addr:$src)], 3726 IIC_SSE_PREFETCH>, PS, Requires<[HasSSE2]>; 3727} 3728 3729let SchedRW = [WriteNop] in { 3730// Pause. 
This "instruction" is encoded as "rep; nop", so even though it 3731// was introduced with SSE2, it's backward compatible. 3732def PAUSE : I<0x90, RawFrm, (outs), (ins), 3733 "pause", [(int_x86_sse2_pause)], IIC_SSE_PAUSE>, 3734 OBXS, Requires<[HasSSE2]>; 3735} 3736 3737let SchedRW = [WriteFence] in { 3738// Load, store, and memory fence 3739def SFENCE : I<0xAE, MRM_F8, (outs), (ins), 3740 "sfence", [(int_x86_sse_sfence)], IIC_SSE_SFENCE>, 3741 PS, Requires<[HasSSE1]>; 3742def LFENCE : I<0xAE, MRM_E8, (outs), (ins), 3743 "lfence", [(int_x86_sse2_lfence)], IIC_SSE_LFENCE>, 3744 TB, Requires<[HasSSE2]>; 3745def MFENCE : I<0xAE, MRM_F0, (outs), (ins), 3746 "mfence", [(int_x86_sse2_mfence)], IIC_SSE_MFENCE>, 3747 TB, Requires<[HasSSE2]>; 3748} // SchedRW 3749 3750def : Pat<(X86SFence), (SFENCE)>; 3751def : Pat<(X86LFence), (LFENCE)>; 3752def : Pat<(X86MFence), (MFENCE)>; 3753 3754//===----------------------------------------------------------------------===// 3755// SSE 1 & 2 - Load/Store XCSR register 3756//===----------------------------------------------------------------------===// 3757 3758def VLDMXCSR : VPSI<0xAE, MRM2m, (outs), (ins i32mem:$src), 3759 "ldmxcsr\t$src", [(int_x86_sse_ldmxcsr addr:$src)], 3760 IIC_SSE_LDMXCSR>, VEX, Sched<[WriteLoad]>; 3761def VSTMXCSR : VPSI<0xAE, MRM3m, (outs), (ins i32mem:$dst), 3762 "stmxcsr\t$dst", [(int_x86_sse_stmxcsr addr:$dst)], 3763 IIC_SSE_STMXCSR>, VEX, Sched<[WriteStore]>; 3764 3765let Predicates = [UseSSE1] in { 3766def LDMXCSR : I<0xAE, MRM2m, (outs), (ins i32mem:$src), 3767 "ldmxcsr\t$src", [(int_x86_sse_ldmxcsr addr:$src)], 3768 IIC_SSE_LDMXCSR>, TB, Sched<[WriteLoad]>; 3769def STMXCSR : I<0xAE, MRM3m, (outs), (ins i32mem:$dst), 3770 "stmxcsr\t$dst", [(int_x86_sse_stmxcsr addr:$dst)], 3771 IIC_SSE_STMXCSR>, TB, Sched<[WriteStore]>; 3772} 3773 3774//===---------------------------------------------------------------------===// 3775// SSE2 - Move Aligned/Unaligned Packed Integer Instructions 3776//===---------------------------------------------------------------------===// 3777 3778let ExeDomain = SSEPackedInt in { // SSE integer instructions 3779 3780let hasSideEffects = 0, SchedRW = [WriteMove] in { 3781def VMOVDQArr : VPDI<0x6F, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), 3782 "movdqa\t{$src, $dst|$dst, $src}", [], IIC_SSE_MOVA_P_RR>, 3783 VEX; 3784def VMOVDQAYrr : VPDI<0x6F, MRMSrcReg, (outs VR256:$dst), (ins VR256:$src), 3785 "movdqa\t{$src, $dst|$dst, $src}", [], IIC_SSE_MOVA_P_RR>, 3786 VEX, VEX_L; 3787def VMOVDQUrr : VSSI<0x6F, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), 3788 "movdqu\t{$src, $dst|$dst, $src}", [], IIC_SSE_MOVU_P_RR>, 3789 VEX; 3790def VMOVDQUYrr : VSSI<0x6F, MRMSrcReg, (outs VR256:$dst), (ins VR256:$src), 3791 "movdqu\t{$src, $dst|$dst, $src}", [], IIC_SSE_MOVU_P_RR>, 3792 VEX, VEX_L; 3793} 3794 3795// For Disassembler 3796let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0, 3797 SchedRW = [WriteMove] in { 3798def VMOVDQArr_REV : VPDI<0x7F, MRMDestReg, (outs VR128:$dst), (ins VR128:$src), 3799 "movdqa\t{$src, $dst|$dst, $src}", [], 3800 IIC_SSE_MOVA_P_RR>, 3801 VEX; 3802def VMOVDQAYrr_REV : VPDI<0x7F, MRMDestReg, (outs VR256:$dst), (ins VR256:$src), 3803 "movdqa\t{$src, $dst|$dst, $src}", [], 3804 IIC_SSE_MOVA_P_RR>, VEX, VEX_L; 3805def VMOVDQUrr_REV : VSSI<0x7F, MRMDestReg, (outs VR128:$dst), (ins VR128:$src), 3806 "movdqu\t{$src, $dst|$dst, $src}", [], 3807 IIC_SSE_MOVU_P_RR>, 3808 VEX; 3809def VMOVDQUYrr_REV : VSSI<0x7F, MRMDestReg, (outs VR256:$dst), (ins VR256:$src), 3810 "movdqu\t{$src, 
$dst|$dst, $src}", [], 3811 IIC_SSE_MOVU_P_RR>, VEX, VEX_L; 3812} 3813 3814let canFoldAsLoad = 1, mayLoad = 1, isReMaterializable = 1, 3815 hasSideEffects = 0, SchedRW = [WriteLoad] in { 3816def VMOVDQArm : VPDI<0x6F, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src), 3817 "movdqa\t{$src, $dst|$dst, $src}", [], IIC_SSE_MOVA_P_RM>, 3818 VEX; 3819def VMOVDQAYrm : VPDI<0x6F, MRMSrcMem, (outs VR256:$dst), (ins i256mem:$src), 3820 "movdqa\t{$src, $dst|$dst, $src}", [], IIC_SSE_MOVA_P_RM>, 3821 VEX, VEX_L; 3822let Predicates = [HasAVX] in { 3823 def VMOVDQUrm : I<0x6F, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src), 3824 "vmovdqu\t{$src, $dst|$dst, $src}",[], IIC_SSE_MOVU_P_RM>, 3825 XS, VEX; 3826 def VMOVDQUYrm : I<0x6F, MRMSrcMem, (outs VR256:$dst), (ins i256mem:$src), 3827 "vmovdqu\t{$src, $dst|$dst, $src}",[], IIC_SSE_MOVU_P_RM>, 3828 XS, VEX, VEX_L; 3829} 3830} 3831 3832let mayStore = 1, hasSideEffects = 0, SchedRW = [WriteStore] in { 3833def VMOVDQAmr : VPDI<0x7F, MRMDestMem, (outs), 3834 (ins i128mem:$dst, VR128:$src), 3835 "movdqa\t{$src, $dst|$dst, $src}", [], IIC_SSE_MOVA_P_MR>, 3836 VEX; 3837def VMOVDQAYmr : VPDI<0x7F, MRMDestMem, (outs), 3838 (ins i256mem:$dst, VR256:$src), 3839 "movdqa\t{$src, $dst|$dst, $src}", [], IIC_SSE_MOVA_P_MR>, 3840 VEX, VEX_L; 3841let Predicates = [HasAVX] in { 3842def VMOVDQUmr : I<0x7F, MRMDestMem, (outs), (ins i128mem:$dst, VR128:$src), 3843 "vmovdqu\t{$src, $dst|$dst, $src}",[], IIC_SSE_MOVU_P_MR>, 3844 XS, VEX; 3845def VMOVDQUYmr : I<0x7F, MRMDestMem, (outs), (ins i256mem:$dst, VR256:$src), 3846 "vmovdqu\t{$src, $dst|$dst, $src}",[], IIC_SSE_MOVU_P_MR>, 3847 XS, VEX, VEX_L; 3848} 3849} 3850 3851let SchedRW = [WriteMove] in { 3852let hasSideEffects = 0 in 3853def MOVDQArr : PDI<0x6F, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), 3854 "movdqa\t{$src, $dst|$dst, $src}", [], IIC_SSE_MOVA_P_RR>; 3855 3856def MOVDQUrr : I<0x6F, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), 3857 "movdqu\t{$src, $dst|$dst, $src}", 3858 [], IIC_SSE_MOVU_P_RR>, XS, Requires<[UseSSE2]>; 3859 3860// For Disassembler 3861let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0 in { 3862def MOVDQArr_REV : PDI<0x7F, MRMDestReg, (outs VR128:$dst), (ins VR128:$src), 3863 "movdqa\t{$src, $dst|$dst, $src}", [], 3864 IIC_SSE_MOVA_P_RR>; 3865 3866def MOVDQUrr_REV : I<0x7F, MRMDestReg, (outs VR128:$dst), (ins VR128:$src), 3867 "movdqu\t{$src, $dst|$dst, $src}", 3868 [], IIC_SSE_MOVU_P_RR>, XS, Requires<[UseSSE2]>; 3869} 3870} // SchedRW 3871 3872let canFoldAsLoad = 1, mayLoad = 1, isReMaterializable = 1, 3873 hasSideEffects = 0, SchedRW = [WriteLoad] in { 3874def MOVDQArm : PDI<0x6F, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src), 3875 "movdqa\t{$src, $dst|$dst, $src}", 3876 [/*(set VR128:$dst, (alignedloadv2i64 addr:$src))*/], 3877 IIC_SSE_MOVA_P_RM>; 3878def MOVDQUrm : I<0x6F, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src), 3879 "movdqu\t{$src, $dst|$dst, $src}", 3880 [/*(set VR128:$dst, (loadv2i64 addr:$src))*/], 3881 IIC_SSE_MOVU_P_RM>, 3882 XS, Requires<[UseSSE2]>; 3883} 3884 3885let mayStore = 1, hasSideEffects = 0, SchedRW = [WriteStore] in { 3886def MOVDQAmr : PDI<0x7F, MRMDestMem, (outs), (ins i128mem:$dst, VR128:$src), 3887 "movdqa\t{$src, $dst|$dst, $src}", 3888 [/*(alignedstore (v2i64 VR128:$src), addr:$dst)*/], 3889 IIC_SSE_MOVA_P_MR>; 3890def MOVDQUmr : I<0x7F, MRMDestMem, (outs), (ins i128mem:$dst, VR128:$src), 3891 "movdqu\t{$src, $dst|$dst, $src}", 3892 [/*(store (v2i64 VR128:$src), addr:$dst)*/], 3893 IIC_SSE_MOVU_P_MR>, 3894 XS, Requires<[UseSSE2]>; 3895} 
3896 3897} // ExeDomain = SSEPackedInt 3898 3899let Predicates = [HasAVX] in { 3900 def : Pat<(int_x86_sse2_storeu_dq addr:$dst, VR128:$src), 3901 (VMOVDQUmr addr:$dst, VR128:$src)>; 3902 def : Pat<(int_x86_avx_storeu_dq_256 addr:$dst, VR256:$src), 3903 (VMOVDQUYmr addr:$dst, VR256:$src)>; 3904} 3905let Predicates = [UseSSE2] in 3906def : Pat<(int_x86_sse2_storeu_dq addr:$dst, VR128:$src), 3907 (MOVDQUmr addr:$dst, VR128:$src)>; 3908 3909//===---------------------------------------------------------------------===// 3910// SSE2 - Packed Integer Arithmetic Instructions 3911//===---------------------------------------------------------------------===// 3912 3913let Sched = WriteVecIMul in 3914def SSE_PMADD : OpndItins< 3915 IIC_SSE_PMADD, IIC_SSE_PMADD 3916>; 3917 3918let ExeDomain = SSEPackedInt in { // SSE integer instructions 3919 3920multiclass PDI_binop_rm_int<bits<8> opc, string OpcodeStr, Intrinsic IntId, 3921 RegisterClass RC, PatFrag memop_frag, 3922 X86MemOperand x86memop, 3923 OpndItins itins, 3924 bit IsCommutable = 0, 3925 bit Is2Addr = 1> { 3926 let isCommutable = IsCommutable in 3927 def rr : PDI<opc, MRMSrcReg, (outs RC:$dst), 3928 (ins RC:$src1, RC:$src2), 3929 !if(Is2Addr, 3930 !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"), 3931 !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")), 3932 [(set RC:$dst, (IntId RC:$src1, RC:$src2))], itins.rr>, 3933 Sched<[itins.Sched]>; 3934 def rm : PDI<opc, MRMSrcMem, (outs RC:$dst), 3935 (ins RC:$src1, x86memop:$src2), 3936 !if(Is2Addr, 3937 !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"), 3938 !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")), 3939 [(set RC:$dst, (IntId RC:$src1, (bitconvert (memop_frag addr:$src2))))], 3940 itins.rm>, Sched<[itins.Sched.Folded, ReadAfterLd]>; 3941} 3942 3943multiclass PDI_binop_all_int<bits<8> opc, string OpcodeStr, Intrinsic IntId128, 3944 Intrinsic IntId256, OpndItins itins, 3945 bit IsCommutable = 0> { 3946let Predicates = [HasAVX] in 3947 defm V#NAME : PDI_binop_rm_int<opc, !strconcat("v", OpcodeStr), IntId128, 3948 VR128, loadv2i64, i128mem, itins, 3949 IsCommutable, 0>, VEX_4V; 3950 3951let Constraints = "$src1 = $dst" in 3952 defm NAME : PDI_binop_rm_int<opc, OpcodeStr, IntId128, VR128, memopv2i64, 3953 i128mem, itins, IsCommutable, 1>; 3954 3955let Predicates = [HasAVX2] in 3956 defm V#NAME#Y : PDI_binop_rm_int<opc, !strconcat("v", OpcodeStr), IntId256, 3957 VR256, loadv4i64, i256mem, itins, 3958 IsCommutable, 0>, VEX_4V, VEX_L; 3959} 3960 3961multiclass PDI_binop_rmi<bits<8> opc, bits<8> opc2, Format ImmForm, 3962 string OpcodeStr, SDNode OpNode, 3963 SDNode OpNode2, RegisterClass RC, 3964 ValueType DstVT, ValueType SrcVT, PatFrag bc_frag, 3965 PatFrag ld_frag, ShiftOpndItins itins, 3966 bit Is2Addr = 1> { 3967 // src2 is always 128-bit 3968 def rr : PDI<opc, MRMSrcReg, (outs RC:$dst), 3969 (ins RC:$src1, VR128:$src2), 3970 !if(Is2Addr, 3971 !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"), 3972 !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")), 3973 [(set RC:$dst, (DstVT (OpNode RC:$src1, (SrcVT VR128:$src2))))], 3974 itins.rr>, Sched<[WriteVecShift]>; 3975 def rm : PDI<opc, MRMSrcMem, (outs RC:$dst), 3976 (ins RC:$src1, i128mem:$src2), 3977 !if(Is2Addr, 3978 !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"), 3979 !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")), 3980 [(set RC:$dst, (DstVT (OpNode RC:$src1, 3981 (bc_frag (ld_frag addr:$src2)))))], itins.rm>, 3982 Sched<[WriteVecShiftLd, ReadAfterLd]>; 3983 
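  // The 'ri' form below shifts by an 8-bit immediate count (via opc2 and
  // ImmForm) instead of by the count in an XMM register, and selects
  // OpNode2, the immediate-shift DAG node (e.g. X86vshli/X86vsrli/X86vsrai).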
def ri : PDIi8<opc2, ImmForm, (outs RC:$dst), 3984 (ins RC:$src1, u8imm:$src2), 3985 !if(Is2Addr, 3986 !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"), 3987 !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")), 3988 [(set RC:$dst, (DstVT (OpNode2 RC:$src1, (i8 imm:$src2))))], itins.ri>, 3989 Sched<[WriteVecShift]>; 3990} 3991 3992/// PDI_binop_rm2 - Simple SSE2 binary operator with different src and dst types 3993multiclass PDI_binop_rm2<bits<8> opc, string OpcodeStr, SDNode OpNode, 3994 ValueType DstVT, ValueType SrcVT, RegisterClass RC, 3995 PatFrag memop_frag, X86MemOperand x86memop, 3996 OpndItins itins, 3997 bit IsCommutable = 0, bit Is2Addr = 1> { 3998 let isCommutable = IsCommutable in 3999 def rr : PDI<opc, MRMSrcReg, (outs RC:$dst), 4000 (ins RC:$src1, RC:$src2), 4001 !if(Is2Addr, 4002 !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"), 4003 !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")), 4004 [(set RC:$dst, (DstVT (OpNode (SrcVT RC:$src1), RC:$src2)))]>, 4005 Sched<[itins.Sched]>; 4006 def rm : PDI<opc, MRMSrcMem, (outs RC:$dst), 4007 (ins RC:$src1, x86memop:$src2), 4008 !if(Is2Addr, 4009 !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"), 4010 !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")), 4011 [(set RC:$dst, (DstVT (OpNode (SrcVT RC:$src1), 4012 (bitconvert (memop_frag addr:$src2)))))]>, 4013 Sched<[itins.Sched.Folded, ReadAfterLd]>; 4014} 4015} // ExeDomain = SSEPackedInt 4016 4017defm PADDB : PDI_binop_all<0xFC, "paddb", add, v16i8, v32i8, 4018 SSE_INTALU_ITINS_P, 1, NoVLX_Or_NoBWI>; 4019defm PADDW : PDI_binop_all<0xFD, "paddw", add, v8i16, v16i16, 4020 SSE_INTALU_ITINS_P, 1, NoVLX_Or_NoBWI>; 4021defm PADDD : PDI_binop_all<0xFE, "paddd", add, v4i32, v8i32, 4022 SSE_INTALU_ITINS_P, 1, NoVLX>; 4023defm PADDQ : PDI_binop_all<0xD4, "paddq", add, v2i64, v4i64, 4024 SSE_INTALUQ_ITINS_P, 1, NoVLX>; 4025defm PMULLW : PDI_binop_all<0xD5, "pmullw", mul, v8i16, v16i16, 4026 SSE_INTMUL_ITINS_P, 1, NoVLX_Or_NoBWI>; 4027defm PMULHUW : PDI_binop_all<0xE4, "pmulhuw", mulhu, v8i16, v16i16, 4028 SSE_INTMUL_ITINS_P, 1, NoVLX_Or_NoBWI>; 4029defm PMULHW : PDI_binop_all<0xE5, "pmulhw", mulhs, v8i16, v16i16, 4030 SSE_INTMUL_ITINS_P, 1, NoVLX_Or_NoBWI>; 4031defm PSUBB : PDI_binop_all<0xF8, "psubb", sub, v16i8, v32i8, 4032 SSE_INTALU_ITINS_P, 0, NoVLX_Or_NoBWI>; 4033defm PSUBW : PDI_binop_all<0xF9, "psubw", sub, v8i16, v16i16, 4034 SSE_INTALU_ITINS_P, 0, NoVLX_Or_NoBWI>; 4035defm PSUBD : PDI_binop_all<0xFA, "psubd", sub, v4i32, v8i32, 4036 SSE_INTALU_ITINS_P, 0, NoVLX>; 4037defm PSUBQ : PDI_binop_all<0xFB, "psubq", sub, v2i64, v4i64, 4038 SSE_INTALUQ_ITINS_P, 0, NoVLX>; 4039defm PSUBUSB : PDI_binop_all<0xD8, "psubusb", X86subus, v16i8, v32i8, 4040 SSE_INTALU_ITINS_P, 0, NoVLX_Or_NoBWI>; 4041defm PSUBUSW : PDI_binop_all<0xD9, "psubusw", X86subus, v8i16, v16i16, 4042 SSE_INTALU_ITINS_P, 0, NoVLX_Or_NoBWI>; 4043defm PMINUB : PDI_binop_all<0xDA, "pminub", umin, v16i8, v32i8, 4044 SSE_INTALU_ITINS_P, 1, NoVLX_Or_NoBWI>; 4045defm PMINSW : PDI_binop_all<0xEA, "pminsw", smin, v8i16, v16i16, 4046 SSE_INTALU_ITINS_P, 1, NoVLX_Or_NoBWI>; 4047defm PMAXUB : PDI_binop_all<0xDE, "pmaxub", umax, v16i8, v32i8, 4048 SSE_INTALU_ITINS_P, 1, NoVLX_Or_NoBWI>; 4049defm PMAXSW : PDI_binop_all<0xEE, "pmaxsw", smax, v8i16, v16i16, 4050 SSE_INTALU_ITINS_P, 1, NoVLX_Or_NoBWI>; 4051defm PAVGB : PDI_binop_all<0xE0, "pavgb", X86avg, v16i8, v32i8, 4052 SSE_INTALU_ITINS_P, 1, NoVLX_Or_NoBWI>; 4053defm PAVGW : PDI_binop_all<0xE3, "pavgw", X86avg, v8i16, v16i16, 4054 
                             SSE_INTALU_ITINS_P, 1, NoVLX_Or_NoBWI>;

// Intrinsic forms
defm PSUBSB  : PDI_binop_all_int<0xE8, "psubsb", int_x86_sse2_psubs_b,
                                 int_x86_avx2_psubs_b, SSE_INTALU_ITINS_P, 0>;
defm PSUBSW  : PDI_binop_all_int<0xE9, "psubsw", int_x86_sse2_psubs_w,
                                 int_x86_avx2_psubs_w, SSE_INTALU_ITINS_P, 0>;
defm PADDSB  : PDI_binop_all_int<0xEC, "paddsb", int_x86_sse2_padds_b,
                                 int_x86_avx2_padds_b, SSE_INTALU_ITINS_P, 1>;
defm PADDSW  : PDI_binop_all_int<0xED, "paddsw", int_x86_sse2_padds_w,
                                 int_x86_avx2_padds_w, SSE_INTALU_ITINS_P, 1>;
defm PADDUSB : PDI_binop_all_int<0xDC, "paddusb", int_x86_sse2_paddus_b,
                                 int_x86_avx2_paddus_b, SSE_INTALU_ITINS_P, 1>;
defm PADDUSW : PDI_binop_all_int<0xDD, "paddusw", int_x86_sse2_paddus_w,
                                 int_x86_avx2_paddus_w, SSE_INTALU_ITINS_P, 1>;
defm PMADDWD : PDI_binop_all_int<0xF5, "pmaddwd", int_x86_sse2_pmadd_wd,
                                 int_x86_avx2_pmadd_wd, SSE_PMADD, 1>;

let Predicates = [HasAVX] in
defm VPSADBW : PDI_binop_rm2<0xF6, "vpsadbw", X86psadbw, v2i64, v16i8, VR128,
                             loadv2i64, i128mem, SSE_INTMUL_ITINS_P, 1, 0>,
                             VEX_4V;
let Predicates = [HasAVX2] in
defm VPSADBWY : PDI_binop_rm2<0xF6, "vpsadbw", X86psadbw, v4i64, v32i8, VR256,
                              loadv4i64, i256mem, SSE_INTMUL_ITINS_P, 1, 0>,
                              VEX_4V, VEX_L;
let Constraints = "$src1 = $dst" in
defm PSADBW : PDI_binop_rm2<0xF6, "psadbw", X86psadbw, v2i64, v16i8, VR128,
                            memopv2i64, i128mem, SSE_INTALU_ITINS_P, 1>;

let Predicates = [HasAVX] in
defm VPMULUDQ : PDI_binop_rm2<0xF4, "vpmuludq", X86pmuludq, v2i64, v4i32, VR128,
                              loadv2i64, i128mem, SSE_INTMUL_ITINS_P, 1, 0>,
                              VEX_4V;
let Predicates = [HasAVX2] in
defm VPMULUDQY : PDI_binop_rm2<0xF4, "vpmuludq", X86pmuludq, v4i64, v8i32,
                               VR256, loadv4i64, i256mem,
                               SSE_INTMUL_ITINS_P, 1, 0>, VEX_4V, VEX_L;
let Constraints = "$src1 = $dst" in
defm PMULUDQ : PDI_binop_rm2<0xF4, "pmuludq", X86pmuludq, v2i64, v4i32, VR128,
                             memopv2i64, i128mem, SSE_INTMUL_ITINS_P, 1>;

//===---------------------------------------------------------------------===//
// SSE2 - Packed Integer Logical Instructions
//===---------------------------------------------------------------------===//

let Predicates = [HasAVX, NoVLX] in {
defm VPSLLW : PDI_binop_rmi<0xF1, 0x71, MRM6r, "vpsllw", X86vshl, X86vshli,
                            VR128, v8i16, v8i16, bc_v8i16, loadv2i64,
                            SSE_INTSHIFT_ITINS_P, 0>, VEX_4V;
defm VPSLLD : PDI_binop_rmi<0xF2, 0x72, MRM6r, "vpslld", X86vshl, X86vshli,
                            VR128, v4i32, v4i32, bc_v4i32, loadv2i64,
                            SSE_INTSHIFT_ITINS_P, 0>, VEX_4V;
defm VPSLLQ : PDI_binop_rmi<0xF3, 0x73, MRM6r, "vpsllq", X86vshl, X86vshli,
                            VR128, v2i64, v2i64, bc_v2i64, loadv2i64,
                            SSE_INTSHIFT_ITINS_P, 0>, VEX_4V;

defm VPSRLW : PDI_binop_rmi<0xD1, 0x71, MRM2r, "vpsrlw", X86vsrl, X86vsrli,
                            VR128, v8i16, v8i16, bc_v8i16, loadv2i64,
                            SSE_INTSHIFT_ITINS_P, 0>, VEX_4V;
defm VPSRLD : PDI_binop_rmi<0xD2, 0x72, MRM2r, "vpsrld", X86vsrl, X86vsrli,
                            VR128, v4i32, v4i32, bc_v4i32, loadv2i64,
                            SSE_INTSHIFT_ITINS_P, 0>, VEX_4V;
defm VPSRLQ : PDI_binop_rmi<0xD3, 0x73, MRM2r, "vpsrlq", X86vsrl, X86vsrli,
                            VR128, v2i64, v2i64, bc_v2i64, loadv2i64,
                            SSE_INTSHIFT_ITINS_P, 0>, VEX_4V;

defm VPSRAW : PDI_binop_rmi<0xE1, 0x71, MRM4r, "vpsraw", X86vsra, X86vsrai,
                            VR128, v8i16, v8i16, bc_v8i16, loadv2i64,
                            SSE_INTSHIFT_ITINS_P, 0>, VEX_4V;
defm VPSRAD : PDI_binop_rmi<0xE2, 0x72, MRM4r, "vpsrad", X86vsra, X86vsrai,
                            VR128, v4i32, v4i32, bc_v4i32, loadv2i64,
                            SSE_INTSHIFT_ITINS_P, 0>, VEX_4V;
} // Predicates = [HasAVX, NoVLX]

let ExeDomain = SSEPackedInt, SchedRW = [WriteVecShift],
    Predicates = [HasAVX, NoVLX_Or_NoBWI] in {
  // 128-bit logical shifts.
  def VPSLLDQri : PDIi8<0x73, MRM7r,
                    (outs VR128:$dst), (ins VR128:$src1, u8imm:$src2),
                    "vpslldq\t{$src2, $src1, $dst|$dst, $src1, $src2}",
                    [(set VR128:$dst,
                      (v2i64 (X86vshldq VR128:$src1, (i8 imm:$src2))))]>,
                    VEX_4V;
  def VPSRLDQri : PDIi8<0x73, MRM3r,
                    (outs VR128:$dst), (ins VR128:$src1, u8imm:$src2),
                    "vpsrldq\t{$src2, $src1, $dst|$dst, $src1, $src2}",
                    [(set VR128:$dst,
                      (v2i64 (X86vshrdq VR128:$src1, (i8 imm:$src2))))]>,
                    VEX_4V;
  // PSRADQri doesn't exist in SSE[1-3].
} // Predicates = [HasAVX, NoVLX_Or_NoBWI]

let Predicates = [HasAVX2, NoVLX] in {
defm VPSLLWY : PDI_binop_rmi<0xF1, 0x71, MRM6r, "vpsllw", X86vshl, X86vshli,
                             VR256, v16i16, v8i16, bc_v8i16, loadv2i64,
                             SSE_INTSHIFT_ITINS_P, 0>, VEX_4V, VEX_L;
defm VPSLLDY : PDI_binop_rmi<0xF2, 0x72, MRM6r, "vpslld", X86vshl, X86vshli,
                             VR256, v8i32, v4i32, bc_v4i32, loadv2i64,
                             SSE_INTSHIFT_ITINS_P, 0>, VEX_4V, VEX_L;
defm VPSLLQY : PDI_binop_rmi<0xF3, 0x73, MRM6r, "vpsllq", X86vshl, X86vshli,
                             VR256, v4i64, v2i64, bc_v2i64, loadv2i64,
                             SSE_INTSHIFT_ITINS_P, 0>, VEX_4V, VEX_L;

defm VPSRLWY : PDI_binop_rmi<0xD1, 0x71, MRM2r, "vpsrlw", X86vsrl, X86vsrli,
                             VR256, v16i16, v8i16, bc_v8i16, loadv2i64,
                             SSE_INTSHIFT_ITINS_P, 0>, VEX_4V, VEX_L;
defm VPSRLDY : PDI_binop_rmi<0xD2, 0x72, MRM2r, "vpsrld", X86vsrl, X86vsrli,
                             VR256, v8i32, v4i32, bc_v4i32, loadv2i64,
                             SSE_INTSHIFT_ITINS_P, 0>, VEX_4V, VEX_L;
defm VPSRLQY : PDI_binop_rmi<0xD3, 0x73, MRM2r, "vpsrlq", X86vsrl, X86vsrli,
                             VR256, v4i64, v2i64, bc_v2i64, loadv2i64,
                             SSE_INTSHIFT_ITINS_P, 0>, VEX_4V, VEX_L;

defm VPSRAWY : PDI_binop_rmi<0xE1, 0x71, MRM4r, "vpsraw", X86vsra, X86vsrai,
                             VR256, v16i16, v8i16, bc_v8i16, loadv2i64,
                             SSE_INTSHIFT_ITINS_P, 0>, VEX_4V, VEX_L;
defm VPSRADY : PDI_binop_rmi<0xE2, 0x72, MRM4r, "vpsrad", X86vsra, X86vsrai,
                             VR256, v8i32, v4i32, bc_v4i32, loadv2i64,
                             SSE_INTSHIFT_ITINS_P, 0>, VEX_4V, VEX_L;
} // Predicates = [HasAVX2, NoVLX]

let ExeDomain = SSEPackedInt, SchedRW = [WriteVecShift], hasSideEffects = 0,
    Predicates = [HasAVX2, NoVLX_Or_NoBWI] in {
  // 256-bit logical shifts.
  def VPSLLDQYri : PDIi8<0x73, MRM7r,
                      (outs VR256:$dst), (ins VR256:$src1, u8imm:$src2),
                      "vpslldq\t{$src2, $src1, $dst|$dst, $src1, $src2}",
                      [(set VR256:$dst,
                        (v4i64 (X86vshldq VR256:$src1, (i8 imm:$src2))))]>,
                      VEX_4V, VEX_L;
  def VPSRLDQYri : PDIi8<0x73, MRM3r,
                      (outs VR256:$dst), (ins VR256:$src1, u8imm:$src2),
                      "vpsrldq\t{$src2, $src1, $dst|$dst, $src1, $src2}",
                      [(set VR256:$dst,
                        (v4i64 (X86vshrdq VR256:$src1, (i8 imm:$src2))))]>,
                      VEX_4V, VEX_L;
  // PSRADQYri doesn't exist in SSE[1-3].
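  // Note: the 256-bit forms shift each 128-bit lane independently; e.g.
  // "vpslldq $4, %ymm1, %ymm0" shifts the low and high halves of %ymm1 left
  // by four bytes each, and bytes never cross the lane boundary.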
4192} // Predicates = [HasAVX2, NoVLX_Or_NoBWI] 4193 4194let Constraints = "$src1 = $dst" in { 4195defm PSLLW : PDI_binop_rmi<0xF1, 0x71, MRM6r, "psllw", X86vshl, X86vshli, 4196 VR128, v8i16, v8i16, bc_v8i16, memopv2i64, 4197 SSE_INTSHIFT_ITINS_P>; 4198defm PSLLD : PDI_binop_rmi<0xF2, 0x72, MRM6r, "pslld", X86vshl, X86vshli, 4199 VR128, v4i32, v4i32, bc_v4i32, memopv2i64, 4200 SSE_INTSHIFT_ITINS_P>; 4201defm PSLLQ : PDI_binop_rmi<0xF3, 0x73, MRM6r, "psllq", X86vshl, X86vshli, 4202 VR128, v2i64, v2i64, bc_v2i64, memopv2i64, 4203 SSE_INTSHIFT_ITINS_P>; 4204 4205defm PSRLW : PDI_binop_rmi<0xD1, 0x71, MRM2r, "psrlw", X86vsrl, X86vsrli, 4206 VR128, v8i16, v8i16, bc_v8i16, memopv2i64, 4207 SSE_INTSHIFT_ITINS_P>; 4208defm PSRLD : PDI_binop_rmi<0xD2, 0x72, MRM2r, "psrld", X86vsrl, X86vsrli, 4209 VR128, v4i32, v4i32, bc_v4i32, memopv2i64, 4210 SSE_INTSHIFT_ITINS_P>; 4211defm PSRLQ : PDI_binop_rmi<0xD3, 0x73, MRM2r, "psrlq", X86vsrl, X86vsrli, 4212 VR128, v2i64, v2i64, bc_v2i64, memopv2i64, 4213 SSE_INTSHIFT_ITINS_P>; 4214 4215defm PSRAW : PDI_binop_rmi<0xE1, 0x71, MRM4r, "psraw", X86vsra, X86vsrai, 4216 VR128, v8i16, v8i16, bc_v8i16, memopv2i64, 4217 SSE_INTSHIFT_ITINS_P>; 4218defm PSRAD : PDI_binop_rmi<0xE2, 0x72, MRM4r, "psrad", X86vsra, X86vsrai, 4219 VR128, v4i32, v4i32, bc_v4i32, memopv2i64, 4220 SSE_INTSHIFT_ITINS_P>; 4221 4222let ExeDomain = SSEPackedInt, SchedRW = [WriteVecShift], hasSideEffects = 0 in { 4223 // 128-bit logical shifts. 4224 def PSLLDQri : PDIi8<0x73, MRM7r, 4225 (outs VR128:$dst), (ins VR128:$src1, u8imm:$src2), 4226 "pslldq\t{$src2, $dst|$dst, $src2}", 4227 [(set VR128:$dst, 4228 (v2i64 (X86vshldq VR128:$src1, (i8 imm:$src2))))], 4229 IIC_SSE_INTSHDQ_P_RI>; 4230 def PSRLDQri : PDIi8<0x73, MRM3r, 4231 (outs VR128:$dst), (ins VR128:$src1, u8imm:$src2), 4232 "psrldq\t{$src2, $dst|$dst, $src2}", 4233 [(set VR128:$dst, 4234 (v2i64 (X86vshrdq VR128:$src1, (i8 imm:$src2))))], 4235 IIC_SSE_INTSHDQ_P_RI>; 4236 // PSRADQri doesn't exist in SSE[1-3]. 
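  // Note: unlike the element shifts above, these treat the register as one
  // vector of bytes; e.g. "psrldq $8, %xmm0" moves the upper quadword of
  // %xmm0 into the lower quadword and zeros the upper one.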
4237} 4238} // Constraints = "$src1 = $dst" 4239 4240//===---------------------------------------------------------------------===// 4241// SSE2 - Packed Integer Comparison Instructions 4242//===---------------------------------------------------------------------===// 4243 4244defm PCMPEQB : PDI_binop_all<0x74, "pcmpeqb", X86pcmpeq, v16i8, v32i8, 4245 SSE_INTALU_ITINS_P, 1, NoVLX_Or_NoBWI>; 4246defm PCMPEQW : PDI_binop_all<0x75, "pcmpeqw", X86pcmpeq, v8i16, v16i16, 4247 SSE_INTALU_ITINS_P, 1, NoVLX_Or_NoBWI>; 4248defm PCMPEQD : PDI_binop_all<0x76, "pcmpeqd", X86pcmpeq, v4i32, v8i32, 4249 SSE_INTALU_ITINS_P, 1, NoVLX>; 4250defm PCMPGTB : PDI_binop_all<0x64, "pcmpgtb", X86pcmpgt, v16i8, v32i8, 4251 SSE_INTALU_ITINS_P, 0, NoVLX_Or_NoBWI>; 4252defm PCMPGTW : PDI_binop_all<0x65, "pcmpgtw", X86pcmpgt, v8i16, v16i16, 4253 SSE_INTALU_ITINS_P, 0, NoVLX_Or_NoBWI>; 4254defm PCMPGTD : PDI_binop_all<0x66, "pcmpgtd", X86pcmpgt, v4i32, v8i32, 4255 SSE_INTALU_ITINS_P, 0, NoVLX>; 4256 4257//===---------------------------------------------------------------------===// 4258// SSE2 - Packed Integer Shuffle Instructions 4259//===---------------------------------------------------------------------===// 4260 4261let ExeDomain = SSEPackedInt in { 4262multiclass sse2_pshuffle<string OpcodeStr, ValueType vt128, ValueType vt256, 4263 SDNode OpNode> { 4264let Predicates = [HasAVX] in { 4265 def V#NAME#ri : Ii8<0x70, MRMSrcReg, (outs VR128:$dst), 4266 (ins VR128:$src1, u8imm:$src2), 4267 !strconcat("v", OpcodeStr, 4268 "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), 4269 [(set VR128:$dst, 4270 (vt128 (OpNode VR128:$src1, (i8 imm:$src2))))], 4271 IIC_SSE_PSHUF_RI>, VEX, Sched<[WriteShuffle]>; 4272 def V#NAME#mi : Ii8<0x70, MRMSrcMem, (outs VR128:$dst), 4273 (ins i128mem:$src1, u8imm:$src2), 4274 !strconcat("v", OpcodeStr, 4275 "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), 4276 [(set VR128:$dst, 4277 (vt128 (OpNode (bitconvert (loadv2i64 addr:$src1)), 4278 (i8 imm:$src2))))], IIC_SSE_PSHUF_MI>, VEX, 4279 Sched<[WriteShuffleLd]>; 4280} 4281 4282let Predicates = [HasAVX2] in { 4283 def V#NAME#Yri : Ii8<0x70, MRMSrcReg, (outs VR256:$dst), 4284 (ins VR256:$src1, u8imm:$src2), 4285 !strconcat("v", OpcodeStr, 4286 "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), 4287 [(set VR256:$dst, 4288 (vt256 (OpNode VR256:$src1, (i8 imm:$src2))))], 4289 IIC_SSE_PSHUF_RI>, VEX, VEX_L, Sched<[WriteShuffle]>; 4290 def V#NAME#Ymi : Ii8<0x70, MRMSrcMem, (outs VR256:$dst), 4291 (ins i256mem:$src1, u8imm:$src2), 4292 !strconcat("v", OpcodeStr, 4293 "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), 4294 [(set VR256:$dst, 4295 (vt256 (OpNode (bitconvert (loadv4i64 addr:$src1)), 4296 (i8 imm:$src2))))], IIC_SSE_PSHUF_MI>, VEX, VEX_L, 4297 Sched<[WriteShuffleLd]>; 4298} 4299 4300let Predicates = [UseSSE2] in { 4301 def ri : Ii8<0x70, MRMSrcReg, 4302 (outs VR128:$dst), (ins VR128:$src1, u8imm:$src2), 4303 !strconcat(OpcodeStr, 4304 "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), 4305 [(set VR128:$dst, 4306 (vt128 (OpNode VR128:$src1, (i8 imm:$src2))))], 4307 IIC_SSE_PSHUF_RI>, Sched<[WriteShuffle]>; 4308 def mi : Ii8<0x70, MRMSrcMem, 4309 (outs VR128:$dst), (ins i128mem:$src1, u8imm:$src2), 4310 !strconcat(OpcodeStr, 4311 "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), 4312 [(set VR128:$dst, 4313 (vt128 (OpNode (bitconvert (memopv2i64 addr:$src1)), 4314 (i8 imm:$src2))))], IIC_SSE_PSHUF_MI>, 4315 Sched<[WriteShuffleLd, ReadAfterLd]>; 4316} 4317} 4318} // ExeDomain = SSEPackedInt 4319 4320defm PSHUFD : sse2_pshuffle<"pshufd", v4i32, v8i32, X86PShufd>, PD; 
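// The shuffle immediate packs four 2-bit source indices, low element first;
// e.g. "pshufd $0x1b, %xmm1, %xmm0" (0x1b = 0b00011011) reverses the four
// dwords of %xmm1. PSHUFHW/PSHUFLW below use the same encoding but only
// permute the high/low four words, copying the other half unchanged.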
4321defm PSHUFHW : sse2_pshuffle<"pshufhw", v8i16, v16i16, X86PShufhw>, XS; 4322defm PSHUFLW : sse2_pshuffle<"pshuflw", v8i16, v16i16, X86PShuflw>, XD; 4323 4324let Predicates = [HasAVX] in { 4325 def : Pat<(v4f32 (X86PShufd (loadv4f32 addr:$src1), (i8 imm:$imm))), 4326 (VPSHUFDmi addr:$src1, imm:$imm)>; 4327 def : Pat<(v4f32 (X86PShufd VR128:$src1, (i8 imm:$imm))), 4328 (VPSHUFDri VR128:$src1, imm:$imm)>; 4329} 4330 4331let Predicates = [UseSSE2] in { 4332 def : Pat<(v4f32 (X86PShufd (memopv4f32 addr:$src1), (i8 imm:$imm))), 4333 (PSHUFDmi addr:$src1, imm:$imm)>; 4334 def : Pat<(v4f32 (X86PShufd VR128:$src1, (i8 imm:$imm))), 4335 (PSHUFDri VR128:$src1, imm:$imm)>; 4336} 4337 4338//===---------------------------------------------------------------------===// 4339// Packed Integer Pack Instructions (SSE & AVX) 4340//===---------------------------------------------------------------------===// 4341 4342let ExeDomain = SSEPackedInt in { 4343multiclass sse2_pack<bits<8> opc, string OpcodeStr, ValueType OutVT, 4344 ValueType ArgVT, SDNode OpNode, PatFrag bc_frag, 4345 PatFrag ld_frag, bit Is2Addr = 1> { 4346 def rr : PDI<opc, MRMSrcReg, 4347 (outs VR128:$dst), (ins VR128:$src1, VR128:$src2), 4348 !if(Is2Addr, 4349 !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"), 4350 !strconcat(OpcodeStr, 4351 "\t{$src2, $src1, $dst|$dst, $src1, $src2}")), 4352 [(set VR128:$dst, 4353 (OutVT (OpNode (ArgVT VR128:$src1), VR128:$src2)))]>, 4354 Sched<[WriteShuffle]>; 4355 def rm : PDI<opc, MRMSrcMem, 4356 (outs VR128:$dst), (ins VR128:$src1, i128mem:$src2), 4357 !if(Is2Addr, 4358 !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"), 4359 !strconcat(OpcodeStr, 4360 "\t{$src2, $src1, $dst|$dst, $src1, $src2}")), 4361 [(set VR128:$dst, 4362 (OutVT (OpNode VR128:$src1, 4363 (bc_frag (ld_frag addr:$src2)))))]>, 4364 Sched<[WriteShuffleLd, ReadAfterLd]>; 4365} 4366 4367multiclass sse2_pack_y<bits<8> opc, string OpcodeStr, ValueType OutVT, 4368 ValueType ArgVT, SDNode OpNode, PatFrag bc_frag> { 4369 def Yrr : PDI<opc, MRMSrcReg, 4370 (outs VR256:$dst), (ins VR256:$src1, VR256:$src2), 4371 !strconcat(OpcodeStr, 4372 "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), 4373 [(set VR256:$dst, 4374 (OutVT (OpNode (ArgVT VR256:$src1), VR256:$src2)))]>, 4375 Sched<[WriteShuffle]>; 4376 def Yrm : PDI<opc, MRMSrcMem, 4377 (outs VR256:$dst), (ins VR256:$src1, i256mem:$src2), 4378 !strconcat(OpcodeStr, 4379 "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), 4380 [(set VR256:$dst, 4381 (OutVT (OpNode VR256:$src1, 4382 (bc_frag (loadv4i64 addr:$src2)))))]>, 4383 Sched<[WriteShuffleLd, ReadAfterLd]>; 4384} 4385 4386multiclass sse4_pack<bits<8> opc, string OpcodeStr, ValueType OutVT, 4387 ValueType ArgVT, SDNode OpNode, PatFrag bc_frag, 4388 PatFrag ld_frag, bit Is2Addr = 1> { 4389 def rr : SS48I<opc, MRMSrcReg, 4390 (outs VR128:$dst), (ins VR128:$src1, VR128:$src2), 4391 !if(Is2Addr, 4392 !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"), 4393 !strconcat(OpcodeStr, 4394 "\t{$src2, $src1, $dst|$dst, $src1, $src2}")), 4395 [(set VR128:$dst, 4396 (OutVT (OpNode (ArgVT VR128:$src1), VR128:$src2)))]>, 4397 Sched<[WriteShuffle]>; 4398 def rm : SS48I<opc, MRMSrcMem, 4399 (outs VR128:$dst), (ins VR128:$src1, i128mem:$src2), 4400 !if(Is2Addr, 4401 !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"), 4402 !strconcat(OpcodeStr, 4403 "\t{$src2, $src1, $dst|$dst, $src1, $src2}")), 4404 [(set VR128:$dst, 4405 (OutVT (OpNode VR128:$src1, 4406 (bc_frag (ld_frag addr:$src2)))))]>, 4407 Sched<[WriteShuffleLd, ReadAfterLd]>; 4408} 4409 4410multiclass 
sse4_pack_y<bits<8> opc, string OpcodeStr, ValueType OutVT, 4411 ValueType ArgVT, SDNode OpNode, PatFrag bc_frag> { 4412 def Yrr : SS48I<opc, MRMSrcReg, 4413 (outs VR256:$dst), (ins VR256:$src1, VR256:$src2), 4414 !strconcat(OpcodeStr, 4415 "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), 4416 [(set VR256:$dst, 4417 (OutVT (OpNode (ArgVT VR256:$src1), VR256:$src2)))]>, 4418 Sched<[WriteShuffle]>; 4419 def Yrm : SS48I<opc, MRMSrcMem, 4420 (outs VR256:$dst), (ins VR256:$src1, i256mem:$src2), 4421 !strconcat(OpcodeStr, 4422 "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), 4423 [(set VR256:$dst, 4424 (OutVT (OpNode VR256:$src1, 4425 (bc_frag (loadv4i64 addr:$src2)))))]>, 4426 Sched<[WriteShuffleLd, ReadAfterLd]>; 4427} 4428 4429let Predicates = [HasAVX] in { 4430 defm VPACKSSWB : sse2_pack<0x63, "vpacksswb", v16i8, v8i16, X86Packss, 4431 bc_v8i16, loadv2i64, 0>, VEX_4V; 4432 defm VPACKSSDW : sse2_pack<0x6B, "vpackssdw", v8i16, v4i32, X86Packss, 4433 bc_v4i32, loadv2i64, 0>, VEX_4V; 4434 4435 defm VPACKUSWB : sse2_pack<0x67, "vpackuswb", v16i8, v8i16, X86Packus, 4436 bc_v8i16, loadv2i64, 0>, VEX_4V; 4437 defm VPACKUSDW : sse4_pack<0x2B, "vpackusdw", v8i16, v4i32, X86Packus, 4438 bc_v4i32, loadv2i64, 0>, VEX_4V; 4439} 4440 4441let Predicates = [HasAVX2] in { 4442 defm VPACKSSWB : sse2_pack_y<0x63, "vpacksswb", v32i8, v16i16, X86Packss, 4443 bc_v16i16>, VEX_4V, VEX_L; 4444 defm VPACKSSDW : sse2_pack_y<0x6B, "vpackssdw", v16i16, v8i32, X86Packss, 4445 bc_v8i32>, VEX_4V, VEX_L; 4446 4447 defm VPACKUSWB : sse2_pack_y<0x67, "vpackuswb", v32i8, v16i16, X86Packus, 4448 bc_v16i16>, VEX_4V, VEX_L; 4449 defm VPACKUSDW : sse4_pack_y<0x2B, "vpackusdw", v16i16, v8i32, X86Packus, 4450 bc_v8i32>, VEX_4V, VEX_L; 4451} 4452 4453let Constraints = "$src1 = $dst" in { 4454 defm PACKSSWB : sse2_pack<0x63, "packsswb", v16i8, v8i16, X86Packss, 4455 bc_v8i16, memopv2i64>; 4456 defm PACKSSDW : sse2_pack<0x6B, "packssdw", v8i16, v4i32, X86Packss, 4457 bc_v4i32, memopv2i64>; 4458 4459 defm PACKUSWB : sse2_pack<0x67, "packuswb", v16i8, v8i16, X86Packus, 4460 bc_v8i16, memopv2i64>; 4461 4462 let Predicates = [HasSSE41] in 4463 defm PACKUSDW : sse4_pack<0x2B, "packusdw", v8i16, v4i32, X86Packus, 4464 bc_v4i32, memopv2i64>; 4465} 4466} // ExeDomain = SSEPackedInt 4467 4468//===---------------------------------------------------------------------===// 4469// SSE2 - Packed Integer Unpack Instructions 4470//===---------------------------------------------------------------------===// 4471 4472let ExeDomain = SSEPackedInt in { 4473multiclass sse2_unpack<bits<8> opc, string OpcodeStr, ValueType vt, 4474 SDNode OpNode, PatFrag bc_frag, PatFrag ld_frag, 4475 bit Is2Addr = 1> { 4476 def rr : PDI<opc, MRMSrcReg, 4477 (outs VR128:$dst), (ins VR128:$src1, VR128:$src2), 4478 !if(Is2Addr, 4479 !strconcat(OpcodeStr,"\t{$src2, $dst|$dst, $src2}"), 4480 !strconcat(OpcodeStr,"\t{$src2, $src1, $dst|$dst, $src1, $src2}")), 4481 [(set VR128:$dst, (vt (OpNode VR128:$src1, VR128:$src2)))], 4482 IIC_SSE_UNPCK>, Sched<[WriteShuffle]>; 4483 def rm : PDI<opc, MRMSrcMem, 4484 (outs VR128:$dst), (ins VR128:$src1, i128mem:$src2), 4485 !if(Is2Addr, 4486 !strconcat(OpcodeStr,"\t{$src2, $dst|$dst, $src2}"), 4487 !strconcat(OpcodeStr,"\t{$src2, $src1, $dst|$dst, $src1, $src2}")), 4488 [(set VR128:$dst, (OpNode VR128:$src1, 4489 (bc_frag (ld_frag addr:$src2))))], 4490 IIC_SSE_UNPCK>, 4491 Sched<[WriteShuffleLd, ReadAfterLd]>; 4492} 4493 4494multiclass sse2_unpack_y<bits<8> opc, string OpcodeStr, ValueType vt, 4495 SDNode OpNode, PatFrag bc_frag> { 4496 def Yrr : 
PDI<opc, MRMSrcReg, 4497 (outs VR256:$dst), (ins VR256:$src1, VR256:$src2), 4498 !strconcat(OpcodeStr,"\t{$src2, $src1, $dst|$dst, $src1, $src2}"), 4499 [(set VR256:$dst, (vt (OpNode VR256:$src1, VR256:$src2)))]>, 4500 Sched<[WriteShuffle]>; 4501 def Yrm : PDI<opc, MRMSrcMem, 4502 (outs VR256:$dst), (ins VR256:$src1, i256mem:$src2), 4503 !strconcat(OpcodeStr,"\t{$src2, $src1, $dst|$dst, $src1, $src2}"), 4504 [(set VR256:$dst, (OpNode VR256:$src1, 4505 (bc_frag (loadv4i64 addr:$src2))))]>, 4506 Sched<[WriteShuffleLd, ReadAfterLd]>; 4507} 4508 4509 4510let Predicates = [HasAVX, NoVLX_Or_NoBWI] in { 4511 defm VPUNPCKLBW : sse2_unpack<0x60, "vpunpcklbw", v16i8, X86Unpckl, 4512 bc_v16i8, loadv2i64, 0>, VEX_4V; 4513 defm VPUNPCKLWD : sse2_unpack<0x61, "vpunpcklwd", v8i16, X86Unpckl, 4514 bc_v8i16, loadv2i64, 0>, VEX_4V; 4515 defm VPUNPCKHBW : sse2_unpack<0x68, "vpunpckhbw", v16i8, X86Unpckh, 4516 bc_v16i8, loadv2i64, 0>, VEX_4V; 4517 defm VPUNPCKHWD : sse2_unpack<0x69, "vpunpckhwd", v8i16, X86Unpckh, 4518 bc_v8i16, loadv2i64, 0>, VEX_4V; 4519} 4520let Predicates = [HasAVX, NoVLX] in { 4521 defm VPUNPCKLDQ : sse2_unpack<0x62, "vpunpckldq", v4i32, X86Unpckl, 4522 bc_v4i32, loadv2i64, 0>, VEX_4V; 4523 defm VPUNPCKLQDQ : sse2_unpack<0x6C, "vpunpcklqdq", v2i64, X86Unpckl, 4524 bc_v2i64, loadv2i64, 0>, VEX_4V; 4525 defm VPUNPCKHDQ : sse2_unpack<0x6A, "vpunpckhdq", v4i32, X86Unpckh, 4526 bc_v4i32, loadv2i64, 0>, VEX_4V; 4527 defm VPUNPCKHQDQ : sse2_unpack<0x6D, "vpunpckhqdq", v2i64, X86Unpckh, 4528 bc_v2i64, loadv2i64, 0>, VEX_4V; 4529} 4530 4531let Predicates = [HasAVX2, NoVLX_Or_NoBWI] in { 4532 defm VPUNPCKLBW : sse2_unpack_y<0x60, "vpunpcklbw", v32i8, X86Unpckl, 4533 bc_v32i8>, VEX_4V, VEX_L; 4534 defm VPUNPCKLWD : sse2_unpack_y<0x61, "vpunpcklwd", v16i16, X86Unpckl, 4535 bc_v16i16>, VEX_4V, VEX_L; 4536 defm VPUNPCKHBW : sse2_unpack_y<0x68, "vpunpckhbw", v32i8, X86Unpckh, 4537 bc_v32i8>, VEX_4V, VEX_L; 4538 defm VPUNPCKHWD : sse2_unpack_y<0x69, "vpunpckhwd", v16i16, X86Unpckh, 4539 bc_v16i16>, VEX_4V, VEX_L; 4540} 4541let Predicates = [HasAVX2, NoVLX] in { 4542 defm VPUNPCKLDQ : sse2_unpack_y<0x62, "vpunpckldq", v8i32, X86Unpckl, 4543 bc_v8i32>, VEX_4V, VEX_L; 4544 defm VPUNPCKLQDQ : sse2_unpack_y<0x6C, "vpunpcklqdq", v4i64, X86Unpckl, 4545 bc_v4i64>, VEX_4V, VEX_L; 4546 defm VPUNPCKHDQ : sse2_unpack_y<0x6A, "vpunpckhdq", v8i32, X86Unpckh, 4547 bc_v8i32>, VEX_4V, VEX_L; 4548 defm VPUNPCKHQDQ : sse2_unpack_y<0x6D, "vpunpckhqdq", v4i64, X86Unpckh, 4549 bc_v4i64>, VEX_4V, VEX_L; 4550} 4551 4552let Constraints = "$src1 = $dst" in { 4553 defm PUNPCKLBW : sse2_unpack<0x60, "punpcklbw", v16i8, X86Unpckl, 4554 bc_v16i8, memopv2i64>; 4555 defm PUNPCKLWD : sse2_unpack<0x61, "punpcklwd", v8i16, X86Unpckl, 4556 bc_v8i16, memopv2i64>; 4557 defm PUNPCKLDQ : sse2_unpack<0x62, "punpckldq", v4i32, X86Unpckl, 4558 bc_v4i32, memopv2i64>; 4559 defm PUNPCKLQDQ : sse2_unpack<0x6C, "punpcklqdq", v2i64, X86Unpckl, 4560 bc_v2i64, memopv2i64>; 4561 4562 defm PUNPCKHBW : sse2_unpack<0x68, "punpckhbw", v16i8, X86Unpckh, 4563 bc_v16i8, memopv2i64>; 4564 defm PUNPCKHWD : sse2_unpack<0x69, "punpckhwd", v8i16, X86Unpckh, 4565 bc_v8i16, memopv2i64>; 4566 defm PUNPCKHDQ : sse2_unpack<0x6A, "punpckhdq", v4i32, X86Unpckh, 4567 bc_v4i32, memopv2i64>; 4568 defm PUNPCKHQDQ : sse2_unpack<0x6D, "punpckhqdq", v2i64, X86Unpckh, 4569 bc_v2i64, memopv2i64>; 4570} 4571} // ExeDomain = SSEPackedInt 4572 4573//===---------------------------------------------------------------------===// 4574// SSE2 - Packed Integer Extract and Insert 
4575//===---------------------------------------------------------------------===// 4576 4577let ExeDomain = SSEPackedInt in { 4578multiclass sse2_pinsrw<bit Is2Addr = 1> { 4579 def rri : Ii8<0xC4, MRMSrcReg, 4580 (outs VR128:$dst), (ins VR128:$src1, 4581 GR32orGR64:$src2, u8imm:$src3), 4582 !if(Is2Addr, 4583 "pinsrw\t{$src3, $src2, $dst|$dst, $src2, $src3}", 4584 "vpinsrw\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"), 4585 [(set VR128:$dst, 4586 (X86pinsrw VR128:$src1, GR32orGR64:$src2, imm:$src3))], 4587 IIC_SSE_PINSRW>, Sched<[WriteShuffle]>; 4588 def rmi : Ii8<0xC4, MRMSrcMem, 4589 (outs VR128:$dst), (ins VR128:$src1, 4590 i16mem:$src2, u8imm:$src3), 4591 !if(Is2Addr, 4592 "pinsrw\t{$src3, $src2, $dst|$dst, $src2, $src3}", 4593 "vpinsrw\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"), 4594 [(set VR128:$dst, 4595 (X86pinsrw VR128:$src1, (extloadi16 addr:$src2), 4596 imm:$src3))], IIC_SSE_PINSRW>, 4597 Sched<[WriteShuffleLd, ReadAfterLd]>; 4598} 4599 4600// Extract 4601let Predicates = [HasAVX, NoBWI] in 4602def VPEXTRWri : Ii8<0xC5, MRMSrcReg, 4603 (outs GR32orGR64:$dst), (ins VR128:$src1, u8imm:$src2), 4604 "vpextrw\t{$src2, $src1, $dst|$dst, $src1, $src2}", 4605 [(set GR32orGR64:$dst, (X86pextrw (v8i16 VR128:$src1), 4606 imm:$src2))]>, PD, VEX, 4607 Sched<[WriteShuffle]>; 4608def PEXTRWri : PDIi8<0xC5, MRMSrcReg, 4609 (outs GR32orGR64:$dst), (ins VR128:$src1, u8imm:$src2), 4610 "pextrw\t{$src2, $src1, $dst|$dst, $src1, $src2}", 4611 [(set GR32orGR64:$dst, (X86pextrw (v8i16 VR128:$src1), 4612 imm:$src2))], IIC_SSE_PEXTRW>, 4613 Sched<[WriteShuffleLd, ReadAfterLd]>; 4614 4615// Insert 4616let Predicates = [HasAVX, NoBWI] in 4617defm VPINSRW : sse2_pinsrw<0>, PD, VEX_4V; 4618 4619let Predicates = [UseSSE2], Constraints = "$src1 = $dst" in 4620defm PINSRW : sse2_pinsrw, PD; 4621 4622} // ExeDomain = SSEPackedInt 4623 4624//===---------------------------------------------------------------------===// 4625// SSE2 - Packed Mask Creation 4626//===---------------------------------------------------------------------===// 4627 4628let ExeDomain = SSEPackedInt, SchedRW = [WriteVecLogic] in { 4629 4630def VPMOVMSKBrr : VPDI<0xD7, MRMSrcReg, (outs GR32orGR64:$dst), 4631 (ins VR128:$src), 4632 "pmovmskb\t{$src, $dst|$dst, $src}", 4633 [(set GR32orGR64:$dst, (int_x86_sse2_pmovmskb_128 VR128:$src))], 4634 IIC_SSE_MOVMSK>, VEX; 4635 4636let Predicates = [HasAVX2] in { 4637def VPMOVMSKBYrr : VPDI<0xD7, MRMSrcReg, (outs GR32orGR64:$dst), 4638 (ins VR256:$src), 4639 "pmovmskb\t{$src, $dst|$dst, $src}", 4640 [(set GR32orGR64:$dst, (int_x86_avx2_pmovmskb VR256:$src))]>, 4641 VEX, VEX_L; 4642} 4643 4644def PMOVMSKBrr : PDI<0xD7, MRMSrcReg, (outs GR32orGR64:$dst), (ins VR128:$src), 4645 "pmovmskb\t{$src, $dst|$dst, $src}", 4646 [(set GR32orGR64:$dst, (int_x86_sse2_pmovmskb_128 VR128:$src))], 4647 IIC_SSE_MOVMSK>; 4648 4649} // ExeDomain = SSEPackedInt 4650 4651//===---------------------------------------------------------------------===// 4652// SSE2 - Conditional Store 4653//===---------------------------------------------------------------------===// 4654 4655let ExeDomain = SSEPackedInt, SchedRW = [WriteStore] in { 4656 4657let Uses = [EDI], Predicates = [HasAVX,Not64BitMode] in 4658def VMASKMOVDQU : VPDI<0xF7, MRMSrcReg, (outs), 4659 (ins VR128:$src, VR128:$mask), 4660 "maskmovdqu\t{$mask, $src|$src, $mask}", 4661 [(int_x86_sse2_maskmov_dqu VR128:$src, VR128:$mask, EDI)], 4662 IIC_SSE_MASKMOV>, VEX; 4663let Uses = [RDI], Predicates = [HasAVX,In64BitMode] in 4664def VMASKMOVDQU64 : 
VPDI<0xF7, MRMSrcReg, (outs), 4665 (ins VR128:$src, VR128:$mask), 4666 "maskmovdqu\t{$mask, $src|$src, $mask}", 4667 [(int_x86_sse2_maskmov_dqu VR128:$src, VR128:$mask, RDI)], 4668 IIC_SSE_MASKMOV>, VEX; 4669 4670let Uses = [EDI], Predicates = [UseSSE2,Not64BitMode] in 4671def MASKMOVDQU : PDI<0xF7, MRMSrcReg, (outs), (ins VR128:$src, VR128:$mask), 4672 "maskmovdqu\t{$mask, $src|$src, $mask}", 4673 [(int_x86_sse2_maskmov_dqu VR128:$src, VR128:$mask, EDI)], 4674 IIC_SSE_MASKMOV>; 4675let Uses = [RDI], Predicates = [UseSSE2,In64BitMode] in 4676def MASKMOVDQU64 : PDI<0xF7, MRMSrcReg, (outs), (ins VR128:$src, VR128:$mask), 4677 "maskmovdqu\t{$mask, $src|$src, $mask}", 4678 [(int_x86_sse2_maskmov_dqu VR128:$src, VR128:$mask, RDI)], 4679 IIC_SSE_MASKMOV>; 4680 4681} // ExeDomain = SSEPackedInt 4682 4683//===---------------------------------------------------------------------===// 4684// SSE2 - Move Doubleword/Quadword 4685//===---------------------------------------------------------------------===// 4686 4687//===---------------------------------------------------------------------===// 4688// Move Int Doubleword to Packed Double Int 4689// 4690def VMOVDI2PDIrr : VS2I<0x6E, MRMSrcReg, (outs VR128:$dst), (ins GR32:$src), 4691 "movd\t{$src, $dst|$dst, $src}", 4692 [(set VR128:$dst, 4693 (v4i32 (scalar_to_vector GR32:$src)))], IIC_SSE_MOVDQ>, 4694 VEX, Sched<[WriteMove]>; 4695def VMOVDI2PDIrm : VS2I<0x6E, MRMSrcMem, (outs VR128:$dst), (ins i32mem:$src), 4696 "movd\t{$src, $dst|$dst, $src}", 4697 [(set VR128:$dst, 4698 (v4i32 (scalar_to_vector (loadi32 addr:$src))))], 4699 IIC_SSE_MOVDQ>, 4700 VEX, Sched<[WriteLoad]>; 4701def VMOV64toPQIrr : VRS2I<0x6E, MRMSrcReg, (outs VR128:$dst), (ins GR64:$src), 4702 "movq\t{$src, $dst|$dst, $src}", 4703 [(set VR128:$dst, 4704 (v2i64 (scalar_to_vector GR64:$src)))], 4705 IIC_SSE_MOVDQ>, VEX, Sched<[WriteMove]>; 4706let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0, mayLoad = 1 in 4707def VMOV64toPQIrm : VRS2I<0x6E, MRMSrcMem, (outs VR128:$dst), (ins i64mem:$src), 4708 "movq\t{$src, $dst|$dst, $src}", 4709 [], IIC_SSE_MOVDQ>, VEX, Sched<[WriteLoad]>; 4710let isCodeGenOnly = 1 in 4711def VMOV64toSDrr : VRS2I<0x6E, MRMSrcReg, (outs FR64:$dst), (ins GR64:$src), 4712 "movq\t{$src, $dst|$dst, $src}", 4713 [(set FR64:$dst, (bitconvert GR64:$src))], 4714 IIC_SSE_MOVDQ>, VEX, Sched<[WriteMove]>; 4715 4716def MOVDI2PDIrr : S2I<0x6E, MRMSrcReg, (outs VR128:$dst), (ins GR32:$src), 4717 "movd\t{$src, $dst|$dst, $src}", 4718 [(set VR128:$dst, 4719 (v4i32 (scalar_to_vector GR32:$src)))], IIC_SSE_MOVDQ>, 4720 Sched<[WriteMove]>; 4721def MOVDI2PDIrm : S2I<0x6E, MRMSrcMem, (outs VR128:$dst), (ins i32mem:$src), 4722 "movd\t{$src, $dst|$dst, $src}", 4723 [(set VR128:$dst, 4724 (v4i32 (scalar_to_vector (loadi32 addr:$src))))], 4725 IIC_SSE_MOVDQ>, Sched<[WriteLoad]>; 4726def MOV64toPQIrr : RS2I<0x6E, MRMSrcReg, (outs VR128:$dst), (ins GR64:$src), 4727 "mov{d|q}\t{$src, $dst|$dst, $src}", 4728 [(set VR128:$dst, 4729 (v2i64 (scalar_to_vector GR64:$src)))], 4730 IIC_SSE_MOVDQ>, Sched<[WriteMove]>; 4731let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0, mayLoad = 1 in 4732def MOV64toPQIrm : RS2I<0x6E, MRMSrcMem, (outs VR128:$dst), (ins i64mem:$src), 4733 "mov{d|q}\t{$src, $dst|$dst, $src}", 4734 [], IIC_SSE_MOVDQ>, Sched<[WriteLoad]>; 4735let isCodeGenOnly = 1 in 4736def MOV64toSDrr : RS2I<0x6E, MRMSrcReg, (outs FR64:$dst), (ins GR64:$src), 4737 "mov{d|q}\t{$src, $dst|$dst, $src}", 4738 [(set FR64:$dst, (bitconvert GR64:$src))], 4739 IIC_SSE_MOVDQ>, 
Sched<[WriteMove]>; 4740 4741//===---------------------------------------------------------------------===// 4742// Move Int Doubleword to Single Scalar 4743// 4744let isCodeGenOnly = 1 in { 4745 def VMOVDI2SSrr : VS2I<0x6E, MRMSrcReg, (outs FR32:$dst), (ins GR32:$src), 4746 "movd\t{$src, $dst|$dst, $src}", 4747 [(set FR32:$dst, (bitconvert GR32:$src))], 4748 IIC_SSE_MOVDQ>, VEX, Sched<[WriteMove]>; 4749 4750 def VMOVDI2SSrm : VS2I<0x6E, MRMSrcMem, (outs FR32:$dst), (ins i32mem:$src), 4751 "movd\t{$src, $dst|$dst, $src}", 4752 [(set FR32:$dst, (bitconvert (loadi32 addr:$src)))], 4753 IIC_SSE_MOVDQ>, 4754 VEX, Sched<[WriteLoad]>; 4755 def MOVDI2SSrr : S2I<0x6E, MRMSrcReg, (outs FR32:$dst), (ins GR32:$src), 4756 "movd\t{$src, $dst|$dst, $src}", 4757 [(set FR32:$dst, (bitconvert GR32:$src))], 4758 IIC_SSE_MOVDQ>, Sched<[WriteMove]>; 4759 4760 def MOVDI2SSrm : S2I<0x6E, MRMSrcMem, (outs FR32:$dst), (ins i32mem:$src), 4761 "movd\t{$src, $dst|$dst, $src}", 4762 [(set FR32:$dst, (bitconvert (loadi32 addr:$src)))], 4763 IIC_SSE_MOVDQ>, Sched<[WriteLoad]>; 4764} 4765 4766//===---------------------------------------------------------------------===// 4767// Move Packed Doubleword Int to Packed Double Int 4768// 4769def VMOVPDI2DIrr : VS2I<0x7E, MRMDestReg, (outs GR32:$dst), (ins VR128:$src), 4770 "movd\t{$src, $dst|$dst, $src}", 4771 [(set GR32:$dst, (extractelt (v4i32 VR128:$src), 4772 (iPTR 0)))], IIC_SSE_MOVD_ToGP>, VEX, 4773 Sched<[WriteMove]>; 4774def VMOVPDI2DImr : VS2I<0x7E, MRMDestMem, (outs), 4775 (ins i32mem:$dst, VR128:$src), 4776 "movd\t{$src, $dst|$dst, $src}", 4777 [(store (i32 (extractelt (v4i32 VR128:$src), 4778 (iPTR 0))), addr:$dst)], IIC_SSE_MOVDQ>, 4779 VEX, Sched<[WriteStore]>; 4780def MOVPDI2DIrr : S2I<0x7E, MRMDestReg, (outs GR32:$dst), (ins VR128:$src), 4781 "movd\t{$src, $dst|$dst, $src}", 4782 [(set GR32:$dst, (extractelt (v4i32 VR128:$src), 4783 (iPTR 0)))], IIC_SSE_MOVD_ToGP>, 4784 Sched<[WriteMove]>; 4785def MOVPDI2DImr : S2I<0x7E, MRMDestMem, (outs), (ins i32mem:$dst, VR128:$src), 4786 "movd\t{$src, $dst|$dst, $src}", 4787 [(store (i32 (extractelt (v4i32 VR128:$src), 4788 (iPTR 0))), addr:$dst)], 4789 IIC_SSE_MOVDQ>, Sched<[WriteStore]>; 4790 4791def : Pat<(v8i32 (X86Vinsert (v8i32 immAllZerosV), GR32:$src2, (iPTR 0))), 4792 (SUBREG_TO_REG (i32 0), (VMOVDI2PDIrr GR32:$src2), sub_xmm)>; 4793 4794def : Pat<(v4i64 (X86Vinsert (bc_v4i64 (v8i32 immAllZerosV)), GR64:$src2, (iPTR 0))), 4795 (SUBREG_TO_REG (i32 0), (VMOV64toPQIrr GR64:$src2), sub_xmm)>; 4796 4797def : Pat<(v8i32 (X86Vinsert undef, GR32:$src2, (iPTR 0))), 4798 (SUBREG_TO_REG (i32 0), (VMOVDI2PDIrr GR32:$src2), sub_xmm)>; 4799 4800def : Pat<(v4i64 (X86Vinsert undef, GR64:$src2, (iPTR 0))), 4801 (SUBREG_TO_REG (i32 0), (VMOV64toPQIrr GR64:$src2), sub_xmm)>; 4802 4803//===---------------------------------------------------------------------===// 4804// Move Packed Doubleword Int first element to Doubleword Int 4805// 4806let SchedRW = [WriteMove] in { 4807def VMOVPQIto64rr : VRS2I<0x7E, MRMDestReg, (outs GR64:$dst), (ins VR128:$src), 4808 "movq\t{$src, $dst|$dst, $src}", 4809 [(set GR64:$dst, (extractelt (v2i64 VR128:$src), 4810 (iPTR 0)))], 4811 IIC_SSE_MOVD_ToGP>, 4812 VEX; 4813 4814def MOVPQIto64rr : RS2I<0x7E, MRMDestReg, (outs GR64:$dst), (ins VR128:$src), 4815 "mov{d|q}\t{$src, $dst|$dst, $src}", 4816 [(set GR64:$dst, (extractelt (v2i64 VR128:$src), 4817 (iPTR 0)))], 4818 IIC_SSE_MOVD_ToGP>; 4819} //SchedRW 4820 4821let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0, mayStore = 1 in 4822def 
VMOVPQIto64rm : VRS2I<0x7E, MRMDestMem, (outs i64mem:$dst), 4823 (ins VR128:$src), "movq\t{$src, $dst|$dst, $src}", 4824 [], IIC_SSE_MOVDQ>, VEX, Sched<[WriteStore]>; 4825let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0, mayStore = 1 in 4826def MOVPQIto64rm : RS2I<0x7E, MRMDestMem, (outs i64mem:$dst), (ins VR128:$src), 4827 "mov{d|q}\t{$src, $dst|$dst, $src}", 4828 [], IIC_SSE_MOVDQ>, Sched<[WriteStore]>; 4829 4830//===---------------------------------------------------------------------===// 4831// Bitcast FR64 <-> GR64 4832// 4833let isCodeGenOnly = 1 in { 4834 let Predicates = [UseAVX] in 4835 def VMOV64toSDrm : VS2SI<0x7E, MRMSrcMem, (outs FR64:$dst), (ins i64mem:$src), 4836 "movq\t{$src, $dst|$dst, $src}", 4837 [(set FR64:$dst, (bitconvert (loadi64 addr:$src)))]>, 4838 VEX, Sched<[WriteLoad]>; 4839 def VMOVSDto64rr : VRS2I<0x7E, MRMDestReg, (outs GR64:$dst), (ins FR64:$src), 4840 "movq\t{$src, $dst|$dst, $src}", 4841 [(set GR64:$dst, (bitconvert FR64:$src))], 4842 IIC_SSE_MOVDQ>, VEX, Sched<[WriteMove]>; 4843 def VMOVSDto64mr : VRS2I<0x7E, MRMDestMem, (outs), (ins i64mem:$dst, FR64:$src), 4844 "movq\t{$src, $dst|$dst, $src}", 4845 [(store (i64 (bitconvert FR64:$src)), addr:$dst)], 4846 IIC_SSE_MOVDQ>, VEX, Sched<[WriteStore]>; 4847 4848 def MOV64toSDrm : S2SI<0x7E, MRMSrcMem, (outs FR64:$dst), (ins i64mem:$src), 4849 "movq\t{$src, $dst|$dst, $src}", 4850 [(set FR64:$dst, (bitconvert (loadi64 addr:$src)))], 4851 IIC_SSE_MOVDQ>, Sched<[WriteLoad]>; 4852 def MOVSDto64rr : RS2I<0x7E, MRMDestReg, (outs GR64:$dst), (ins FR64:$src), 4853 "mov{d|q}\t{$src, $dst|$dst, $src}", 4854 [(set GR64:$dst, (bitconvert FR64:$src))], 4855 IIC_SSE_MOVD_ToGP>, Sched<[WriteMove]>; 4856 def MOVSDto64mr : RS2I<0x7E, MRMDestMem, (outs), (ins i64mem:$dst, FR64:$src), 4857 "movq\t{$src, $dst|$dst, $src}", 4858 [(store (i64 (bitconvert FR64:$src)), addr:$dst)], 4859 IIC_SSE_MOVDQ>, Sched<[WriteStore]>; 4860} 4861 4862//===---------------------------------------------------------------------===// 4863// Move Scalar Single to Double Int 4864// 4865let isCodeGenOnly = 1 in { 4866 def VMOVSS2DIrr : VS2I<0x7E, MRMDestReg, (outs GR32:$dst), (ins FR32:$src), 4867 "movd\t{$src, $dst|$dst, $src}", 4868 [(set GR32:$dst, (bitconvert FR32:$src))], 4869 IIC_SSE_MOVD_ToGP>, VEX, Sched<[WriteMove]>; 4870 def VMOVSS2DImr : VS2I<0x7E, MRMDestMem, (outs), (ins i32mem:$dst, FR32:$src), 4871 "movd\t{$src, $dst|$dst, $src}", 4872 [(store (i32 (bitconvert FR32:$src)), addr:$dst)], 4873 IIC_SSE_MOVDQ>, VEX, Sched<[WriteStore]>; 4874 def MOVSS2DIrr : S2I<0x7E, MRMDestReg, (outs GR32:$dst), (ins FR32:$src), 4875 "movd\t{$src, $dst|$dst, $src}", 4876 [(set GR32:$dst, (bitconvert FR32:$src))], 4877 IIC_SSE_MOVD_ToGP>, Sched<[WriteMove]>; 4878 def MOVSS2DImr : S2I<0x7E, MRMDestMem, (outs), (ins i32mem:$dst, FR32:$src), 4879 "movd\t{$src, $dst|$dst, $src}", 4880 [(store (i32 (bitconvert FR32:$src)), addr:$dst)], 4881 IIC_SSE_MOVDQ>, Sched<[WriteStore]>; 4882} 4883 4884let Predicates = [UseAVX] in { 4885 let AddedComplexity = 15 in { 4886 def : Pat<(v4i32 (X86vzmovl (v4i32 (scalar_to_vector GR32:$src)))), 4887 (VMOVDI2PDIrr GR32:$src)>; 4888 4889 def : Pat<(v2i64 (X86vzmovl (v2i64 (scalar_to_vector GR64:$src)))), 4890 (VMOV64toPQIrr GR64:$src)>; 4891 4892 def : Pat<(v4i64 (X86vzmovl (insert_subvector undef, 4893 (v2i64 (scalar_to_vector GR64:$src)),(iPTR 0)))), 4894 (SUBREG_TO_REG (i64 0), (VMOV64toPQIrr GR64:$src), sub_xmm)>; 4895 } 4896 // AVX 128-bit movd/movq instructions write zeros in the high 128-bit part. 
4897 // These instructions also write zeros in the high part of a 256-bit register. 4898 let AddedComplexity = 20 in { 4899 def : Pat<(v4i32 (X86vzmovl (v4i32 (scalar_to_vector (loadi32 addr:$src))))), 4900 (VMOVDI2PDIrm addr:$src)>; 4901 def : Pat<(v4i32 (X86vzmovl (bc_v4i32 (loadv4f32 addr:$src)))), 4902 (VMOVDI2PDIrm addr:$src)>; 4903 def : Pat<(v4i32 (X86vzmovl (bc_v4i32 (loadv2i64 addr:$src)))), 4904 (VMOVDI2PDIrm addr:$src)>; 4905 def : Pat<(v8i32 (X86vzmovl (insert_subvector undef, 4906 (v4i32 (scalar_to_vector (loadi32 addr:$src))), (iPTR 0)))), 4907 (SUBREG_TO_REG (i32 0), (VMOVDI2PDIrm addr:$src), sub_xmm)>; 4908 } 4909 // Use regular 128-bit instructions to match 256-bit scalar_to_vec+zext. 4910 def : Pat<(v8i32 (X86vzmovl (insert_subvector undef, 4911 (v4i32 (scalar_to_vector GR32:$src)),(iPTR 0)))), 4912 (SUBREG_TO_REG (i32 0), (VMOVDI2PDIrr GR32:$src), sub_xmm)>; 4913} 4914 4915let Predicates = [UseSSE2] in { 4916 let AddedComplexity = 15 in { 4917 def : Pat<(v4i32 (X86vzmovl (v4i32 (scalar_to_vector GR32:$src)))), 4918 (MOVDI2PDIrr GR32:$src)>; 4919 4920 def : Pat<(v2i64 (X86vzmovl (v2i64 (scalar_to_vector GR64:$src)))), 4921 (MOV64toPQIrr GR64:$src)>; 4922 } 4923 let AddedComplexity = 20 in { 4924 def : Pat<(v4i32 (X86vzmovl (v4i32 (scalar_to_vector (loadi32 addr:$src))))), 4925 (MOVDI2PDIrm addr:$src)>; 4926 def : Pat<(v4i32 (X86vzmovl (bc_v4i32 (loadv4f32 addr:$src)))), 4927 (MOVDI2PDIrm addr:$src)>; 4928 def : Pat<(v4i32 (X86vzmovl (bc_v4i32 (loadv2i64 addr:$src)))), 4929 (MOVDI2PDIrm addr:$src)>; 4930 } 4931} 4932 4933// These are the correct encodings of the instructions so that we know how to 4934// read correct assembly, even though we continue to emit the wrong ones for 4935// compatibility with Darwin's buggy assembler. 4936def : InstAlias<"movq\t{$src, $dst|$dst, $src}", 4937 (MOV64toPQIrr VR128:$dst, GR64:$src), 0>; 4938def : InstAlias<"movq\t{$src, $dst|$dst, $src}", 4939 (MOVPQIto64rr GR64:$dst, VR128:$src), 0>; 4940// Allow "vmovd" but print "vmovq" since we don't need compatibility for AVX. 
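// (In these InstAliases the trailing 0 marks the alias as parse-only: the
// assembler accepts the spelling, but the printer keeps emitting the
// instruction's own asm string.)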
4941def : InstAlias<"vmovd\t{$src, $dst|$dst, $src}", 4942 (VMOV64toPQIrr VR128:$dst, GR64:$src), 0>; 4943def : InstAlias<"vmovd\t{$src, $dst|$dst, $src}", 4944 (VMOVPQIto64rr GR64:$dst, VR128:$src), 0>; 4945 4946//===---------------------------------------------------------------------===// 4947// SSE2 - Move Quadword 4948//===---------------------------------------------------------------------===// 4949 4950//===---------------------------------------------------------------------===// 4951// Move Quadword Int to Packed Quadword Int 4952// 4953 4954let ExeDomain = SSEPackedInt, SchedRW = [WriteLoad] in { 4955def VMOVQI2PQIrm : I<0x7E, MRMSrcMem, (outs VR128:$dst), (ins i64mem:$src), 4956 "vmovq\t{$src, $dst|$dst, $src}", 4957 [(set VR128:$dst, 4958 (v2i64 (scalar_to_vector (loadi64 addr:$src))))]>, XS, 4959 VEX, Requires<[UseAVX]>; 4960def MOVQI2PQIrm : I<0x7E, MRMSrcMem, (outs VR128:$dst), (ins i64mem:$src), 4961 "movq\t{$src, $dst|$dst, $src}", 4962 [(set VR128:$dst, 4963 (v2i64 (scalar_to_vector (loadi64 addr:$src))))], 4964 IIC_SSE_MOVDQ>, XS, 4965 Requires<[UseSSE2]>; // SSE2 instruction with XS Prefix 4966} // ExeDomain, SchedRW 4967 4968//===---------------------------------------------------------------------===// 4969// Move Packed Quadword Int to Quadword Int 4970// 4971let ExeDomain = SSEPackedInt, SchedRW = [WriteStore] in { 4972def VMOVPQI2QImr : VS2I<0xD6, MRMDestMem, (outs), (ins i64mem:$dst, VR128:$src), 4973 "movq\t{$src, $dst|$dst, $src}", 4974 [(store (i64 (extractelt (v2i64 VR128:$src), 4975 (iPTR 0))), addr:$dst)], 4976 IIC_SSE_MOVDQ>, VEX; 4977def MOVPQI2QImr : S2I<0xD6, MRMDestMem, (outs), (ins i64mem:$dst, VR128:$src), 4978 "movq\t{$src, $dst|$dst, $src}", 4979 [(store (i64 (extractelt (v2i64 VR128:$src), 4980 (iPTR 0))), addr:$dst)], 4981 IIC_SSE_MOVDQ>; 4982} // ExeDomain, SchedRW 4983 4984// For disassembler only 4985let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0, 4986 SchedRW = [WriteVecLogic] in { 4987def VMOVPQI2QIrr : VS2I<0xD6, MRMDestReg, (outs VR128:$dst), (ins VR128:$src), 4988 "movq\t{$src, $dst|$dst, $src}", [], IIC_SSE_MOVQ_RR>, VEX; 4989def MOVPQI2QIrr : S2I<0xD6, MRMDestReg, (outs VR128:$dst), (ins VR128:$src), 4990 "movq\t{$src, $dst|$dst, $src}", [], IIC_SSE_MOVQ_RR>; 4991} 4992 4993//===---------------------------------------------------------------------===// 4994// Store / copy lower 64-bits of a XMM register. 
//
let Predicates = [HasAVX] in
def : Pat<(int_x86_sse2_storel_dq addr:$dst, VR128:$src),
          (VMOVPQI2QImr addr:$dst, VR128:$src)>;
let Predicates = [UseSSE2] in
def : Pat<(int_x86_sse2_storel_dq addr:$dst, VR128:$src),
          (MOVPQI2QImr addr:$dst, VR128:$src)>;

let ExeDomain = SSEPackedInt, isCodeGenOnly = 1, AddedComplexity = 20 in {
def VMOVZQI2PQIrm : I<0x7E, MRMSrcMem, (outs VR128:$dst), (ins i64mem:$src),
                     "vmovq\t{$src, $dst|$dst, $src}",
                     [(set VR128:$dst,
                       (v2i64 (X86vzmovl (v2i64 (scalar_to_vector
                                                 (loadi64 addr:$src))))))],
                     IIC_SSE_MOVDQ>,
                     XS, VEX, Requires<[UseAVX]>, Sched<[WriteLoad]>;

def MOVZQI2PQIrm : I<0x7E, MRMSrcMem, (outs VR128:$dst), (ins i64mem:$src),
                    "movq\t{$src, $dst|$dst, $src}",
                    [(set VR128:$dst,
                      (v2i64 (X86vzmovl (v2i64 (scalar_to_vector
                                                (loadi64 addr:$src))))))],
                    IIC_SSE_MOVDQ>,
                    XS, Requires<[UseSSE2]>, Sched<[WriteLoad]>;
} // ExeDomain, isCodeGenOnly, AddedComplexity

let Predicates = [UseAVX], AddedComplexity = 20 in {
  def : Pat<(v2i64 (X86vzmovl (bc_v2i64 (loadv4f32 addr:$src)))),
            (VMOVZQI2PQIrm addr:$src)>;
  def : Pat<(v2i64 (X86vzload addr:$src)),
            (VMOVZQI2PQIrm addr:$src)>;
  def : Pat<(v4i64 (X86vzmovl (insert_subvector undef,
              (v2i64 (scalar_to_vector (loadi64 addr:$src))), (iPTR 0)))),
            (SUBREG_TO_REG (i64 0), (VMOVZQI2PQIrm addr:$src), sub_xmm)>;
}

let Predicates = [UseSSE2], AddedComplexity = 20 in {
  def : Pat<(v2i64 (X86vzmovl (bc_v2i64 (loadv4f32 addr:$src)))),
            (MOVZQI2PQIrm addr:$src)>;
  def : Pat<(v2i64 (X86vzload addr:$src)), (MOVZQI2PQIrm addr:$src)>;
}

let Predicates = [HasAVX] in {
def : Pat<(v4i64 (alignedX86vzload addr:$src)),
          (SUBREG_TO_REG (i32 0), (VMOVAPSrm addr:$src), sub_xmm)>;
def : Pat<(v4i64 (X86vzload addr:$src)),
          (SUBREG_TO_REG (i32 0), (VMOVUPSrm addr:$src), sub_xmm)>;
}

//===---------------------------------------------------------------------===//
// Move from XMM to XMM and clear the upper 64 bits. Note: there is a bug in
// the IA32 documentation; movq xmm1, xmm2 does clear the high bits.
5047// 5048let ExeDomain = SSEPackedInt, SchedRW = [WriteVecLogic] in { 5049let AddedComplexity = 15 in 5050def VMOVZPQILo2PQIrr : I<0x7E, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), 5051 "vmovq\t{$src, $dst|$dst, $src}", 5052 [(set VR128:$dst, (v2i64 (X86vzmovl (v2i64 VR128:$src))))], 5053 IIC_SSE_MOVQ_RR>, 5054 XS, VEX, Requires<[UseAVX]>; 5055let AddedComplexity = 15 in 5056def MOVZPQILo2PQIrr : I<0x7E, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), 5057 "movq\t{$src, $dst|$dst, $src}", 5058 [(set VR128:$dst, (v2i64 (X86vzmovl (v2i64 VR128:$src))))], 5059 IIC_SSE_MOVQ_RR>, 5060 XS, Requires<[UseSSE2]>; 5061} // ExeDomain, SchedRW 5062 5063let ExeDomain = SSEPackedInt, isCodeGenOnly = 1, SchedRW = [WriteVecLogicLd] in { 5064let AddedComplexity = 20 in 5065def VMOVZPQILo2PQIrm : I<0x7E, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src), 5066 "vmovq\t{$src, $dst|$dst, $src}", 5067 [(set VR128:$dst, (v2i64 (X86vzmovl 5068 (loadv2i64 addr:$src))))], 5069 IIC_SSE_MOVDQ>, 5070 XS, VEX, Requires<[UseAVX]>; 5071let AddedComplexity = 20 in { 5072def MOVZPQILo2PQIrm : I<0x7E, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src), 5073 "movq\t{$src, $dst|$dst, $src}", 5074 [(set VR128:$dst, (v2i64 (X86vzmovl 5075 (loadv2i64 addr:$src))))], 5076 IIC_SSE_MOVDQ>, 5077 XS, Requires<[UseSSE2]>; 5078} 5079} // ExeDomain, isCodeGenOnly, SchedRW 5080 5081let AddedComplexity = 20 in { 5082 let Predicates = [UseAVX] in { 5083 def : Pat<(v2f64 (X86vzmovl (v2f64 VR128:$src))), 5084 (VMOVZPQILo2PQIrr VR128:$src)>; 5085 } 5086 let Predicates = [UseSSE2] in { 5087 def : Pat<(v2f64 (X86vzmovl (v2f64 VR128:$src))), 5088 (MOVZPQILo2PQIrr VR128:$src)>; 5089 } 5090} 5091 5092//===---------------------------------------------------------------------===// 5093// SSE3 - Replicate Single FP - MOVSHDUP and MOVSLDUP 5094//===---------------------------------------------------------------------===// 5095multiclass sse3_replicate_sfp<bits<8> op, SDNode OpNode, string OpcodeStr, 5096 ValueType vt, RegisterClass RC, PatFrag mem_frag, 5097 X86MemOperand x86memop> { 5098def rr : S3SI<op, MRMSrcReg, (outs RC:$dst), (ins RC:$src), 5099 !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), 5100 [(set RC:$dst, (vt (OpNode RC:$src)))], 5101 IIC_SSE_MOV_LH>, Sched<[WriteFShuffle]>; 5102def rm : S3SI<op, MRMSrcMem, (outs RC:$dst), (ins x86memop:$src), 5103 !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), 5104 [(set RC:$dst, (OpNode (mem_frag addr:$src)))], 5105 IIC_SSE_MOV_LH>, Sched<[WriteLoad]>; 5106} 5107 5108let Predicates = [HasAVX, NoVLX] in { 5109 defm VMOVSHDUP : sse3_replicate_sfp<0x16, X86Movshdup, "vmovshdup", 5110 v4f32, VR128, loadv4f32, f128mem>, VEX; 5111 defm VMOVSLDUP : sse3_replicate_sfp<0x12, X86Movsldup, "vmovsldup", 5112 v4f32, VR128, loadv4f32, f128mem>, VEX; 5113 defm VMOVSHDUPY : sse3_replicate_sfp<0x16, X86Movshdup, "vmovshdup", 5114 v8f32, VR256, loadv8f32, f256mem>, VEX, VEX_L; 5115 defm VMOVSLDUPY : sse3_replicate_sfp<0x12, X86Movsldup, "vmovsldup", 5116 v8f32, VR256, loadv8f32, f256mem>, VEX, VEX_L; 5117} 5118defm MOVSHDUP : sse3_replicate_sfp<0x16, X86Movshdup, "movshdup", v4f32, VR128, 5119 memopv4f32, f128mem>; 5120defm MOVSLDUP : sse3_replicate_sfp<0x12, X86Movsldup, "movsldup", v4f32, VR128, 5121 memopv4f32, f128mem>; 5122 5123let Predicates = [HasAVX, NoVLX] in { 5124 def : Pat<(v4i32 (X86Movshdup VR128:$src)), 5125 (VMOVSHDUPrr VR128:$src)>; 5126 def : Pat<(v4i32 (X86Movshdup (bc_v4i32 (loadv2i64 addr:$src)))), 5127 (VMOVSHDUPrm addr:$src)>; 5128 def : Pat<(v4i32 (X86Movsldup VR128:$src)), 5129 
            (VMOVSLDUPrr VR128:$src)>;
  def : Pat<(v4i32 (X86Movsldup (bc_v4i32 (loadv2i64 addr:$src)))),
            (VMOVSLDUPrm addr:$src)>;
  def : Pat<(v8i32 (X86Movshdup VR256:$src)),
            (VMOVSHDUPYrr VR256:$src)>;
  def : Pat<(v8i32 (X86Movshdup (bc_v8i32 (loadv4i64 addr:$src)))),
            (VMOVSHDUPYrm addr:$src)>;
  def : Pat<(v8i32 (X86Movsldup VR256:$src)),
            (VMOVSLDUPYrr VR256:$src)>;
  def : Pat<(v8i32 (X86Movsldup (bc_v8i32 (loadv4i64 addr:$src)))),
            (VMOVSLDUPYrm addr:$src)>;
}

let Predicates = [UseSSE3] in {
  def : Pat<(v4i32 (X86Movshdup VR128:$src)),
            (MOVSHDUPrr VR128:$src)>;
  def : Pat<(v4i32 (X86Movshdup (bc_v4i32 (memopv2i64 addr:$src)))),
            (MOVSHDUPrm addr:$src)>;
  def : Pat<(v4i32 (X86Movsldup VR128:$src)),
            (MOVSLDUPrr VR128:$src)>;
  def : Pat<(v4i32 (X86Movsldup (bc_v4i32 (memopv2i64 addr:$src)))),
            (MOVSLDUPrm addr:$src)>;
}

//===---------------------------------------------------------------------===//
// SSE3 - Replicate Double FP - MOVDDUP
//===---------------------------------------------------------------------===//

multiclass sse3_replicate_dfp<string OpcodeStr> {
def rr : S3DI<0x12, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
              !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
              [(set VR128:$dst, (v2f64 (X86Movddup VR128:$src)))],
              IIC_SSE_MOV_LH>, Sched<[WriteFShuffle]>;
def rm : S3DI<0x12, MRMSrcMem, (outs VR128:$dst), (ins f64mem:$src),
              !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
              [(set VR128:$dst,
                (v2f64 (X86Movddup
                        (scalar_to_vector (loadf64 addr:$src)))))],
              IIC_SSE_MOV_LH>, Sched<[WriteLoad]>;
}

// FIXME: Merge with the class above when there are patterns for the ymm
// version.
multiclass sse3_replicate_dfp_y<string OpcodeStr> {
def rr : S3DI<0x12, MRMSrcReg, (outs VR256:$dst), (ins VR256:$src),
              !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
              [(set VR256:$dst, (v4f64 (X86Movddup VR256:$src)))]>,
              Sched<[WriteFShuffle]>;
def rm : S3DI<0x12, MRMSrcMem, (outs VR256:$dst), (ins f256mem:$src),
              !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
              [(set VR256:$dst,
                (v4f64 (X86Movddup (loadv4f64 addr:$src))))]>,
              Sched<[WriteLoad]>;
}

let Predicates = [HasAVX, NoVLX] in {
  defm VMOVDDUP  : sse3_replicate_dfp<"vmovddup">, VEX;
  defm VMOVDDUPY : sse3_replicate_dfp_y<"vmovddup">, VEX, VEX_L;
}

defm MOVDDUP : sse3_replicate_dfp<"movddup">;


let Predicates = [HasAVX, NoVLX] in {
  def : Pat<(X86Movddup (loadv2f64 addr:$src)),
            (VMOVDDUPrm addr:$src)>, Requires<[HasAVX]>;

  // 256-bit version
  def : Pat<(X86Movddup (loadv4i64 addr:$src)),
            (VMOVDDUPYrm addr:$src)>;
  def : Pat<(X86Movddup (v4i64 VR256:$src)),
            (VMOVDDUPYrr VR256:$src)>;
}

let Predicates = [HasAVX] in {
  def : Pat<(X86Movddup (bc_v2f64 (loadv4f32 addr:$src))),
            (VMOVDDUPrm addr:$src)>, Requires<[HasAVX]>;
  def : Pat<(X86Movddup (bc_v2f64 (loadv2i64 addr:$src))),
            (VMOVDDUPrm addr:$src)>, Requires<[HasAVX]>;
  def : Pat<(X86Movddup (bc_v2f64
                         (v2i64 (scalar_to_vector (loadi64 addr:$src))))),
            (VMOVDDUPrm addr:$src)>, Requires<[HasAVX]>;
}

let Predicates = [UseAVX, OptForSize] in {
  def : Pat<(v2f64 (X86VBroadcast (loadf64 addr:$src))),
            (VMOVDDUPrm addr:$src)>;
  def : Pat<(v2i64 (X86VBroadcast (loadi64 addr:$src))),
            (VMOVDDUPrm addr:$src)>;
}

let Predicates = [UseSSE3] in {
  def : Pat<(X86Movddup (memopv2f64 addr:$src)),
            (MOVDDUPrm addr:$src)>;
  def : Pat<(X86Movddup (bc_v2f64 (memopv4f32 addr:$src))),
            (MOVDDUPrm addr:$src)>;
  def : Pat<(X86Movddup (bc_v2f64 (memopv2i64 addr:$src))),
            (MOVDDUPrm addr:$src)>;
  def : Pat<(X86Movddup (bc_v2f64
                         (v2i64 (scalar_to_vector (loadi64 addr:$src))))),
            (MOVDDUPrm addr:$src)>;
}

//===---------------------------------------------------------------------===//
// SSE3 - Move Unaligned Integer
//===---------------------------------------------------------------------===//

let SchedRW = [WriteLoad] in {
let Predicates = [HasAVX] in {
  def VLDDQUrm : S3DI<0xF0, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src),
                      "vlddqu\t{$src, $dst|$dst, $src}",
                      [(set VR128:$dst, (int_x86_sse3_ldu_dq addr:$src))]>, VEX;
  def VLDDQUYrm : S3DI<0xF0, MRMSrcMem, (outs VR256:$dst), (ins i256mem:$src),
                       "vlddqu\t{$src, $dst|$dst, $src}",
                       [(set VR256:$dst, (int_x86_avx_ldu_dq_256 addr:$src))]>,
                       VEX, VEX_L;
}
def LDDQUrm : S3DI<0xF0, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src),
                   "lddqu\t{$src, $dst|$dst, $src}",
                   [(set VR128:$dst, (int_x86_sse3_ldu_dq addr:$src))],
                   IIC_SSE_LDDQU>;
}

//===---------------------------------------------------------------------===//
// SSE3 - Arithmetic
//===---------------------------------------------------------------------===//

multiclass sse3_addsub<Intrinsic Int, string OpcodeStr, RegisterClass RC,
                       X86MemOperand x86memop, OpndItins itins,
                       PatFrag ld_frag, bit Is2Addr = 1> {
  def rr : I<0xD0, MRMSrcReg,
             (outs RC:$dst), (ins RC:$src1, RC:$src2),
             !if(Is2Addr,
                 !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
                 !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
             [(set RC:$dst, (Int RC:$src1, RC:$src2))], itins.rr>,
             Sched<[itins.Sched]>;
  def rm : I<0xD0, MRMSrcMem,
             (outs RC:$dst), (ins RC:$src1, x86memop:$src2),
             !if(Is2Addr,
                 !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
                 !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
             [(set RC:$dst, (Int RC:$src1, (ld_frag addr:$src2)))], itins.rm>,
             Sched<[itins.Sched.Folded, ReadAfterLd]>;
}

let Predicates = [HasAVX] in {
  let ExeDomain = SSEPackedSingle in {
    defm VADDSUBPS : sse3_addsub<int_x86_sse3_addsub_ps, "vaddsubps", VR128,
                                 f128mem, SSE_ALU_F32P, loadv4f32, 0>, XD, VEX_4V;
    defm VADDSUBPSY : sse3_addsub<int_x86_avx_addsub_ps_256, "vaddsubps", VR256,
                                  f256mem, SSE_ALU_F32P, loadv8f32, 0>, XD, VEX_4V, VEX_L;
  }
  let ExeDomain = SSEPackedDouble in {
    defm VADDSUBPD : sse3_addsub<int_x86_sse3_addsub_pd, "vaddsubpd", VR128,
                                 f128mem, SSE_ALU_F64P, loadv2f64, 0>, PD, VEX_4V;
    defm VADDSUBPDY : sse3_addsub<int_x86_avx_addsub_pd_256, "vaddsubpd", VR256,
                                  f256mem, SSE_ALU_F64P, loadv4f64, 0>, PD, VEX_4V, VEX_L;
  }
}
let Constraints = "$src1 = $dst", Predicates = [UseSSE3] in {
  let ExeDomain = SSEPackedSingle in
  defm ADDSUBPS : sse3_addsub<int_x86_sse3_addsub_ps, "addsubps", VR128,
                              f128mem, SSE_ALU_F32P, memopv4f32>, XD;
  let ExeDomain = SSEPackedDouble in
  defm ADDSUBPD : sse3_addsub<int_x86_sse3_addsub_pd, "addsubpd", VR128,
                              f128mem, SSE_ALU_F64P, memopv2f64>, PD;
}

// Patterns used to select 'addsub' instructions.
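// For reference, addsub alternates across lanes, subtracting in the even
// elements and adding in the odd ones; e.g. for v4f32:
//   result = [ a[0]-b[0], a[1]+b[1], a[2]-b[2], a[3]+b[3] ]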
5298let Predicates = [HasAVX] in { 5299 def : Pat<(v4f32 (X86Addsub (v4f32 VR128:$lhs), (v4f32 VR128:$rhs))), 5300 (VADDSUBPSrr VR128:$lhs, VR128:$rhs)>; 5301 def : Pat<(v4f32 (X86Addsub (v4f32 VR128:$lhs), (loadv4f32 addr:$rhs))), 5302 (VADDSUBPSrm VR128:$lhs, f128mem:$rhs)>; 5303 def : Pat<(v2f64 (X86Addsub (v2f64 VR128:$lhs), (v2f64 VR128:$rhs))), 5304 (VADDSUBPDrr VR128:$lhs, VR128:$rhs)>; 5305 def : Pat<(v2f64 (X86Addsub (v2f64 VR128:$lhs), (loadv2f64 addr:$rhs))), 5306 (VADDSUBPDrm VR128:$lhs, f128mem:$rhs)>; 5307 5308 def : Pat<(v8f32 (X86Addsub (v8f32 VR256:$lhs), (v8f32 VR256:$rhs))), 5309 (VADDSUBPSYrr VR256:$lhs, VR256:$rhs)>; 5310 def : Pat<(v8f32 (X86Addsub (v8f32 VR256:$lhs), (loadv8f32 addr:$rhs))), 5311 (VADDSUBPSYrm VR256:$lhs, f256mem:$rhs)>; 5312 def : Pat<(v4f64 (X86Addsub (v4f64 VR256:$lhs), (v4f64 VR256:$rhs))), 5313 (VADDSUBPDYrr VR256:$lhs, VR256:$rhs)>; 5314 def : Pat<(v4f64 (X86Addsub (v4f64 VR256:$lhs), (loadv4f64 addr:$rhs))), 5315 (VADDSUBPDYrm VR256:$lhs, f256mem:$rhs)>; 5316} 5317 5318let Predicates = [UseSSE3] in { 5319 def : Pat<(v4f32 (X86Addsub (v4f32 VR128:$lhs), (v4f32 VR128:$rhs))), 5320 (ADDSUBPSrr VR128:$lhs, VR128:$rhs)>; 5321 def : Pat<(v4f32 (X86Addsub (v4f32 VR128:$lhs), (memopv4f32 addr:$rhs))), 5322 (ADDSUBPSrm VR128:$lhs, f128mem:$rhs)>; 5323 def : Pat<(v2f64 (X86Addsub (v2f64 VR128:$lhs), (v2f64 VR128:$rhs))), 5324 (ADDSUBPDrr VR128:$lhs, VR128:$rhs)>; 5325 def : Pat<(v2f64 (X86Addsub (v2f64 VR128:$lhs), (memopv2f64 addr:$rhs))), 5326 (ADDSUBPDrm VR128:$lhs, f128mem:$rhs)>; 5327} 5328 5329//===---------------------------------------------------------------------===// 5330// SSE3 Instructions 5331//===---------------------------------------------------------------------===// 5332 5333// Horizontal ops 5334multiclass S3D_Int<bits<8> o, string OpcodeStr, ValueType vt, RegisterClass RC, 5335 X86MemOperand x86memop, SDNode OpNode, PatFrag ld_frag, 5336 bit Is2Addr = 1> { 5337 def rr : S3DI<o, MRMSrcReg, (outs RC:$dst), (ins RC:$src1, RC:$src2), 5338 !if(Is2Addr, 5339 !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"), 5340 !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")), 5341 [(set RC:$dst, (vt (OpNode RC:$src1, RC:$src2)))], IIC_SSE_HADDSUB_RR>, 5342 Sched<[WriteFAdd]>; 5343 5344 def rm : S3DI<o, MRMSrcMem, (outs RC:$dst), (ins RC:$src1, x86memop:$src2), 5345 !if(Is2Addr, 5346 !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"), 5347 !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")), 5348 [(set RC:$dst, (vt (OpNode RC:$src1, (ld_frag addr:$src2))))], 5349 IIC_SSE_HADDSUB_RM>, Sched<[WriteFAddLd, ReadAfterLd]>; 5350} 5351multiclass S3_Int<bits<8> o, string OpcodeStr, ValueType vt, RegisterClass RC, 5352 X86MemOperand x86memop, SDNode OpNode, PatFrag ld_frag, 5353 bit Is2Addr = 1> { 5354 def rr : S3I<o, MRMSrcReg, (outs RC:$dst), (ins RC:$src1, RC:$src2), 5355 !if(Is2Addr, 5356 !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"), 5357 !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")), 5358 [(set RC:$dst, (vt (OpNode RC:$src1, RC:$src2)))], IIC_SSE_HADDSUB_RR>, 5359 Sched<[WriteFAdd]>; 5360 5361 def rm : S3I<o, MRMSrcMem, (outs RC:$dst), (ins RC:$src1, x86memop:$src2), 5362 !if(Is2Addr, 5363 !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"), 5364 !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")), 5365 [(set RC:$dst, (vt (OpNode RC:$src1, (ld_frag addr:$src2))))], 5366 IIC_SSE_HADDSUB_RM>, Sched<[WriteFAddLd, ReadAfterLd]>; 5367} 5368 5369let Predicates = [HasAVX] in { 
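  // Horizontal ops fold adjacent element pairs within each source operand;
  // e.g. for "vhaddps %xmm2, %xmm1, %xmm0" (AT&T order, dst = %xmm0):
  //   %xmm0 = [ x1[0]+x1[1], x1[2]+x1[3], x2[0]+x2[1], x2[2]+x2[3] ]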
let Predicates = [HasAVX] in {
  let ExeDomain = SSEPackedSingle in {
    defm VHADDPS  : S3D_Int<0x7C, "vhaddps", v4f32, VR128, f128mem,
                            X86fhadd, loadv4f32, 0>, VEX_4V;
    defm VHSUBPS  : S3D_Int<0x7D, "vhsubps", v4f32, VR128, f128mem,
                            X86fhsub, loadv4f32, 0>, VEX_4V;
    defm VHADDPSY : S3D_Int<0x7C, "vhaddps", v8f32, VR256, f256mem,
                            X86fhadd, loadv8f32, 0>, VEX_4V, VEX_L;
    defm VHSUBPSY : S3D_Int<0x7D, "vhsubps", v8f32, VR256, f256mem,
                            X86fhsub, loadv8f32, 0>, VEX_4V, VEX_L;
  }
  let ExeDomain = SSEPackedDouble in {
    defm VHADDPD  : S3_Int <0x7C, "vhaddpd", v2f64, VR128, f128mem,
                            X86fhadd, loadv2f64, 0>, VEX_4V;
    defm VHSUBPD  : S3_Int <0x7D, "vhsubpd", v2f64, VR128, f128mem,
                            X86fhsub, loadv2f64, 0>, VEX_4V;
    defm VHADDPDY : S3_Int <0x7C, "vhaddpd", v4f64, VR256, f256mem,
                            X86fhadd, loadv4f64, 0>, VEX_4V, VEX_L;
    defm VHSUBPDY : S3_Int <0x7D, "vhsubpd", v4f64, VR256, f256mem,
                            X86fhsub, loadv4f64, 0>, VEX_4V, VEX_L;
  }
}

let Constraints = "$src1 = $dst" in {
  let ExeDomain = SSEPackedSingle in {
    defm HADDPS : S3D_Int<0x7C, "haddps", v4f32, VR128, f128mem, X86fhadd,
                          memopv4f32>;
    defm HSUBPS : S3D_Int<0x7D, "hsubps", v4f32, VR128, f128mem, X86fhsub,
                          memopv4f32>;
  }
  let ExeDomain = SSEPackedDouble in {
    defm HADDPD : S3_Int<0x7C, "haddpd", v2f64, VR128, f128mem, X86fhadd,
                         memopv2f64>;
    defm HSUBPD : S3_Int<0x7D, "hsubpd", v2f64, VR128, f128mem, X86fhsub,
                         memopv2f64>;
  }
}

//===---------------------------------------------------------------------===//
// SSSE3 - Packed Absolute Instructions
//===---------------------------------------------------------------------===//


/// SS3I_unop_rm_int - Simple SSSE3 unary op whose type can be v*{i8,i16,i32}.
multiclass SS3I_unop_rm_int<bits<8> opc, string OpcodeStr, Intrinsic IntId128,
                            PatFrag ld_frag> {
  def rr128 : SS38I<opc, MRMSrcReg, (outs VR128:$dst),
                    (ins VR128:$src),
                    !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
                    [(set VR128:$dst, (IntId128 VR128:$src))], IIC_SSE_PABS_RR>,
                    Sched<[WriteVecALU]>;

  def rm128 : SS38I<opc, MRMSrcMem, (outs VR128:$dst),
                    (ins i128mem:$src),
                    !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
                    [(set VR128:$dst,
                      (IntId128
                       (bitconvert (ld_frag addr:$src))))], IIC_SSE_PABS_RM>,
                    Sched<[WriteVecALULd]>;
}

/// SS3I_unop_rm_int_y - Simple SSSE3 unary op whose type can be v*{i8,i16,i32}.
multiclass SS3I_unop_rm_int_y<bits<8> opc, string OpcodeStr,
                              Intrinsic IntId256> {
  def rr256 : SS38I<opc, MRMSrcReg, (outs VR256:$dst),
                    (ins VR256:$src),
                    !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
                    [(set VR256:$dst, (IntId256 VR256:$src))]>,
                    Sched<[WriteVecALU]>;

  def rm256 : SS38I<opc, MRMSrcMem, (outs VR256:$dst),
                    (ins i256mem:$src),
                    !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
                    [(set VR256:$dst,
                      (IntId256
                       (bitconvert (loadv4i64 addr:$src))))]>,
                    Sched<[WriteVecALULd]>;
}
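
// Illustrative sketch only (kept in a comment so it is not parsed as
// TableGen): the sext fragments below let us match the branchless absolute
// value idiom and select PABS for it. In scalar C, the same algebra the
// patterns recognize per lane is:
//
//   #include <stdint.h>
//   int32_t abs_example(int32_t x) {
//     int32_t m = x >> 31;   // sign-extended sign bit: 0 or -1
//     return (x + m) ^ m;    // matched as xor(add(x, m), m) -> PABSD
//   }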
// Helper fragments to match sext vXi1 to vXiY.
def v16i1sextv16i8 : PatLeaf<(v16i8 (X86pcmpgt (bc_v16i8 (v4i32 immAllZerosV)),
                                               VR128:$src))>;
def v8i1sextv8i16  : PatLeaf<(v8i16 (X86vsrai VR128:$src, (i8 15)))>;
def v4i1sextv4i32  : PatLeaf<(v4i32 (X86vsrai VR128:$src, (i8 31)))>;
def v32i1sextv32i8 : PatLeaf<(v32i8 (X86pcmpgt (bc_v32i8 (v8i32 immAllZerosV)),
                                               VR256:$src))>;
def v16i1sextv16i16: PatLeaf<(v16i16 (X86vsrai VR256:$src, (i8 15)))>;
def v8i1sextv8i32  : PatLeaf<(v8i32 (X86vsrai VR256:$src, (i8 31)))>;

let Predicates = [HasAVX] in {
  defm VPABSB : SS3I_unop_rm_int<0x1C, "vpabsb", int_x86_ssse3_pabs_b_128,
                                 loadv2i64>, VEX;
  defm VPABSW : SS3I_unop_rm_int<0x1D, "vpabsw", int_x86_ssse3_pabs_w_128,
                                 loadv2i64>, VEX;
  defm VPABSD : SS3I_unop_rm_int<0x1E, "vpabsd", int_x86_ssse3_pabs_d_128,
                                 loadv2i64>, VEX;

  def : Pat<(xor
            (bc_v2i64 (v16i1sextv16i8)),
            (bc_v2i64 (add (v16i8 VR128:$src), (v16i1sextv16i8)))),
            (VPABSBrr128 VR128:$src)>;
  def : Pat<(xor
            (bc_v2i64 (v8i1sextv8i16)),
            (bc_v2i64 (add (v8i16 VR128:$src), (v8i1sextv8i16)))),
            (VPABSWrr128 VR128:$src)>;
  def : Pat<(xor
            (bc_v2i64 (v4i1sextv4i32)),
            (bc_v2i64 (add (v4i32 VR128:$src), (v4i1sextv4i32)))),
            (VPABSDrr128 VR128:$src)>;
}

let Predicates = [HasAVX2] in {
  defm VPABSB : SS3I_unop_rm_int_y<0x1C, "vpabsb",
                                   int_x86_avx2_pabs_b>, VEX, VEX_L;
  defm VPABSW : SS3I_unop_rm_int_y<0x1D, "vpabsw",
                                   int_x86_avx2_pabs_w>, VEX, VEX_L;
  defm VPABSD : SS3I_unop_rm_int_y<0x1E, "vpabsd",
                                   int_x86_avx2_pabs_d>, VEX, VEX_L;

  def : Pat<(xor
            (bc_v4i64 (v32i1sextv32i8)),
            (bc_v4i64 (add (v32i8 VR256:$src), (v32i1sextv32i8)))),
            (VPABSBrr256 VR256:$src)>;
  def : Pat<(xor
            (bc_v4i64 (v16i1sextv16i16)),
            (bc_v4i64 (add (v16i16 VR256:$src), (v16i1sextv16i16)))),
            (VPABSWrr256 VR256:$src)>;
  def : Pat<(xor
            (bc_v4i64 (v8i1sextv8i32)),
            (bc_v4i64 (add (v8i32 VR256:$src), (v8i1sextv8i32)))),
            (VPABSDrr256 VR256:$src)>;
}

defm PABSB : SS3I_unop_rm_int<0x1C, "pabsb", int_x86_ssse3_pabs_b_128,
                              memopv2i64>;
defm PABSW : SS3I_unop_rm_int<0x1D, "pabsw", int_x86_ssse3_pabs_w_128,
                              memopv2i64>;
defm PABSD : SS3I_unop_rm_int<0x1E, "pabsd", int_x86_ssse3_pabs_d_128,
                              memopv2i64>;

let Predicates = [HasSSSE3] in {
  def : Pat<(xor
            (bc_v2i64 (v16i1sextv16i8)),
            (bc_v2i64 (add (v16i8 VR128:$src), (v16i1sextv16i8)))),
            (PABSBrr128 VR128:$src)>;
  def : Pat<(xor
            (bc_v2i64 (v8i1sextv8i16)),
            (bc_v2i64 (add (v8i16 VR128:$src), (v8i1sextv8i16)))),
            (PABSWrr128 VR128:$src)>;
  def : Pat<(xor
            (bc_v2i64 (v4i1sextv4i32)),
            (bc_v2i64 (add (v4i32 VR128:$src), (v4i1sextv4i32)))),
            (PABSDrr128 VR128:$src)>;
}

//===---------------------------------------------------------------------===//
// SSSE3 - Packed Binary Operator Instructions
//===---------------------------------------------------------------------===//

let Sched = WriteVecALU in {
def SSE_PHADDSUBD : OpndItins<
  IIC_SSE_PHADDSUBD_RR, IIC_SSE_PHADDSUBD_RM
>;
def SSE_PHADDSUBSW : OpndItins<
  IIC_SSE_PHADDSUBSW_RR, IIC_SSE_PHADDSUBSW_RM
>;
def SSE_PHADDSUBW : OpndItins<
  IIC_SSE_PHADDSUBW_RR, IIC_SSE_PHADDSUBW_RM
>;
}
let Sched = WriteShuffle in
def SSE_PSHUFB : OpndItins<
  IIC_SSE_PSHUFB_RR, IIC_SSE_PSHUFB_RM
>;
let Sched = WriteVecALU in
def SSE_PSIGN : OpndItins<
  IIC_SSE_PSIGN_RR, IIC_SSE_PSIGN_RM
>;
let Sched = WriteVecIMul in
def SSE_PMULHRSW : OpndItins<
  IIC_SSE_PMULHRSW, IIC_SSE_PMULHRSW
>;

/// SS3I_binop_rm - Simple SSSE3 bin op
multiclass SS3I_binop_rm<bits<8> opc, string OpcodeStr, SDNode OpNode,
                         ValueType OpVT, RegisterClass RC, PatFrag memop_frag,
                         X86MemOperand x86memop, OpndItins itins,
                         bit Is2Addr = 1> {
  let isCommutable = 1 in
  def rr : SS38I<opc, MRMSrcReg, (outs RC:$dst),
       (ins RC:$src1, RC:$src2),
       !if(Is2Addr,
         !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
         !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
       [(set RC:$dst, (OpVT (OpNode RC:$src1, RC:$src2)))], itins.rr>,
       Sched<[itins.Sched]>;
  def rm : SS38I<opc, MRMSrcMem, (outs RC:$dst),
       (ins RC:$src1, x86memop:$src2),
       !if(Is2Addr,
         !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
         !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
       [(set RC:$dst,
         (OpVT (OpNode RC:$src1,
                (bitconvert (memop_frag addr:$src2)))))], itins.rm>,
       Sched<[itins.Sched.Folded, ReadAfterLd]>;
}

/// SS3I_binop_rm_int - Simple SSSE3 bin op whose type can be v*{i8,i16,i32}.
multiclass SS3I_binop_rm_int<bits<8> opc, string OpcodeStr,
                             Intrinsic IntId128, OpndItins itins,
                             PatFrag ld_frag, bit Is2Addr = 1> {
  let isCommutable = 1 in
  def rr128 : SS38I<opc, MRMSrcReg, (outs VR128:$dst),
       (ins VR128:$src1, VR128:$src2),
       !if(Is2Addr,
         !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
         !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
       [(set VR128:$dst, (IntId128 VR128:$src1, VR128:$src2))]>,
       Sched<[itins.Sched]>;
  def rm128 : SS38I<opc, MRMSrcMem, (outs VR128:$dst),
       (ins VR128:$src1, i128mem:$src2),
       !if(Is2Addr,
         !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
         !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
       [(set VR128:$dst,
         (IntId128 VR128:$src1,
          (bitconvert (ld_frag addr:$src2))))]>,
       Sched<[itins.Sched.Folded, ReadAfterLd]>;
}

multiclass SS3I_binop_rm_int_y<bits<8> opc, string OpcodeStr,
                               Intrinsic IntId256,
                               X86FoldableSchedWrite Sched> {
  let isCommutable = 1 in
  def rr256 : SS38I<opc, MRMSrcReg, (outs VR256:$dst),
       (ins VR256:$src1, VR256:$src2),
       !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
       [(set VR256:$dst, (IntId256 VR256:$src1, VR256:$src2))]>,
       Sched<[Sched]>;
  def rm256 : SS38I<opc, MRMSrcMem, (outs VR256:$dst),
       (ins VR256:$src1, i256mem:$src2),
       !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
       [(set VR256:$dst,
         (IntId256 VR256:$src1, (bitconvert (loadv4i64 addr:$src2))))]>,
       Sched<[Sched.Folded, ReadAfterLd]>;
}

let ImmT = NoImm, Predicates = [HasAVX] in {
let isCommutable = 0 in {
  defm VPHADDW    : SS3I_binop_rm<0x01, "vphaddw", X86hadd, v8i16, VR128,
                                  loadv2i64, i128mem,
                                  SSE_PHADDSUBW, 0>, VEX_4V;
  defm VPHADDD    : SS3I_binop_rm<0x02, "vphaddd", X86hadd, v4i32, VR128,
                                  loadv2i64, i128mem,
                                  SSE_PHADDSUBD, 0>, VEX_4V;
  defm VPHSUBW    : SS3I_binop_rm<0x05, "vphsubw", X86hsub, v8i16, VR128,
                                  loadv2i64, i128mem,
                                  SSE_PHADDSUBW, 0>, VEX_4V;
  defm VPHSUBD    : SS3I_binop_rm<0x06, "vphsubd", X86hsub, v4i32, VR128,
                                  loadv2i64, i128mem,
                                  SSE_PHADDSUBD, 0>, VEX_4V;
  defm VPSIGNB    : SS3I_binop_rm<0x08, "vpsignb", X86psign,
                                  v16i8, VR128,
                                  loadv2i64, i128mem,
                                  SSE_PSIGN, 0>, VEX_4V;
  defm VPSIGNW    : SS3I_binop_rm<0x09, "vpsignw", X86psign, v8i16, VR128,
                                  loadv2i64, i128mem,
                                  SSE_PSIGN, 0>, VEX_4V;
  defm VPSIGND    : SS3I_binop_rm<0x0A, "vpsignd", X86psign, v4i32, VR128,
                                  loadv2i64, i128mem,
                                  SSE_PSIGN, 0>, VEX_4V;
  defm VPSHUFB    : SS3I_binop_rm<0x00, "vpshufb", X86pshufb, v16i8, VR128,
                                  loadv2i64, i128mem,
                                  SSE_PSHUFB, 0>, VEX_4V;
  defm VPHADDSW   : SS3I_binop_rm_int<0x03, "vphaddsw",
                                      int_x86_ssse3_phadd_sw_128,
                                      SSE_PHADDSUBSW, loadv2i64, 0>, VEX_4V;
  defm VPHSUBSW   : SS3I_binop_rm_int<0x07, "vphsubsw",
                                      int_x86_ssse3_phsub_sw_128,
                                      SSE_PHADDSUBSW, loadv2i64, 0>, VEX_4V;
  defm VPMADDUBSW : SS3I_binop_rm_int<0x04, "vpmaddubsw",
                                      int_x86_ssse3_pmadd_ub_sw_128,
                                      SSE_PMADD, loadv2i64, 0>, VEX_4V;
}
defm VPMULHRSW    : SS3I_binop_rm_int<0x0B, "vpmulhrsw",
                                      int_x86_ssse3_pmul_hr_sw_128,
                                      SSE_PMULHRSW, loadv2i64, 0>, VEX_4V;
}

let ImmT = NoImm, Predicates = [HasAVX2] in {
let isCommutable = 0 in {
  defm VPHADDWY   : SS3I_binop_rm<0x01, "vphaddw", X86hadd, v16i16, VR256,
                                  loadv4i64, i256mem,
                                  SSE_PHADDSUBW, 0>, VEX_4V, VEX_L;
  defm VPHADDDY   : SS3I_binop_rm<0x02, "vphaddd", X86hadd, v8i32, VR256,
                                  loadv4i64, i256mem,
                                  SSE_PHADDSUBW, 0>, VEX_4V, VEX_L;
  defm VPHSUBWY   : SS3I_binop_rm<0x05, "vphsubw", X86hsub, v16i16, VR256,
                                  loadv4i64, i256mem,
                                  SSE_PHADDSUBW, 0>, VEX_4V, VEX_L;
  defm VPHSUBDY   : SS3I_binop_rm<0x06, "vphsubd", X86hsub, v8i32, VR256,
                                  loadv4i64, i256mem,
                                  SSE_PHADDSUBW, 0>, VEX_4V, VEX_L;
  defm VPSIGNBY   : SS3I_binop_rm<0x08, "vpsignb", X86psign, v32i8, VR256,
                                  loadv4i64, i256mem,
                                  SSE_PHADDSUBW, 0>, VEX_4V, VEX_L;
  defm VPSIGNWY   : SS3I_binop_rm<0x09, "vpsignw", X86psign, v16i16, VR256,
                                  loadv4i64, i256mem,
                                  SSE_PHADDSUBW, 0>, VEX_4V, VEX_L;
  defm VPSIGNDY   : SS3I_binop_rm<0x0A, "vpsignd", X86psign, v8i32, VR256,
                                  loadv4i64, i256mem,
                                  SSE_PHADDSUBW, 0>, VEX_4V, VEX_L;
  defm VPSHUFBY   : SS3I_binop_rm<0x00, "vpshufb", X86pshufb, v32i8, VR256,
                                  loadv4i64, i256mem,
                                  SSE_PSHUFB, 0>, VEX_4V, VEX_L;
  defm VPHADDSW   : SS3I_binop_rm_int_y<0x03, "vphaddsw",
                                        int_x86_avx2_phadd_sw,
                                        WriteVecALU>, VEX_4V, VEX_L;
  defm VPHSUBSW   : SS3I_binop_rm_int_y<0x07, "vphsubsw",
                                        int_x86_avx2_phsub_sw,
                                        WriteVecALU>, VEX_4V, VEX_L;
  defm VPMADDUBSW : SS3I_binop_rm_int_y<0x04, "vpmaddubsw",
                                        int_x86_avx2_pmadd_ub_sw,
                                        WriteVecIMul>, VEX_4V, VEX_L;
}
defm VPMULHRSW    : SS3I_binop_rm_int_y<0x0B, "vpmulhrsw",
                                        int_x86_avx2_pmul_hr_sw,
                                        WriteVecIMul>, VEX_4V, VEX_L;
}
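
// Illustrative sketch only (kept in a comment so it is not parsed as
// TableGen): assuming C with <tmmintrin.h>, PMULHRSW computes a rounded,
// scaled high half of a 16-bit product, per element:
//
//   #include <tmmintrin.h>
//   // r[i] = (int16_t)(((((int32_t)a[i] * b[i]) >> 14) + 1) >> 1)
//   __m128i mulhrs_example(__m128i a, __m128i b) {
//     return _mm_mulhrs_epi16(a, b);   // int_x86_ssse3_pmul_hr_sw_128
//   }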
// None of these have i8 immediate fields.
let ImmT = NoImm, Constraints = "$src1 = $dst" in {
let isCommutable = 0 in {
  defm PHADDW    : SS3I_binop_rm<0x01, "phaddw", X86hadd, v8i16, VR128,
                                 memopv2i64, i128mem, SSE_PHADDSUBW>;
  defm PHADDD    : SS3I_binop_rm<0x02, "phaddd", X86hadd, v4i32, VR128,
                                 memopv2i64, i128mem, SSE_PHADDSUBD>;
  defm PHSUBW    : SS3I_binop_rm<0x05, "phsubw", X86hsub, v8i16, VR128,
                                 memopv2i64, i128mem, SSE_PHADDSUBW>;
  defm PHSUBD    : SS3I_binop_rm<0x06, "phsubd", X86hsub, v4i32, VR128,
                                 memopv2i64, i128mem, SSE_PHADDSUBD>;
  defm PSIGNB    : SS3I_binop_rm<0x08, "psignb", X86psign, v16i8, VR128,
                                 memopv2i64, i128mem, SSE_PSIGN>;
  defm PSIGNW    : SS3I_binop_rm<0x09, "psignw", X86psign, v8i16, VR128,
                                 memopv2i64, i128mem, SSE_PSIGN>;
  defm PSIGND    : SS3I_binop_rm<0x0A, "psignd", X86psign, v4i32, VR128,
                                 memopv2i64, i128mem, SSE_PSIGN>;
  defm PSHUFB    : SS3I_binop_rm<0x00, "pshufb", X86pshufb, v16i8, VR128,
                                 memopv2i64, i128mem, SSE_PSHUFB>;
  defm PHADDSW   : SS3I_binop_rm_int<0x03, "phaddsw",
                                     int_x86_ssse3_phadd_sw_128,
                                     SSE_PHADDSUBSW, memopv2i64>;
  defm PHSUBSW   : SS3I_binop_rm_int<0x07, "phsubsw",
                                     int_x86_ssse3_phsub_sw_128,
                                     SSE_PHADDSUBSW, memopv2i64>;
  defm PMADDUBSW : SS3I_binop_rm_int<0x04, "pmaddubsw",
                                     int_x86_ssse3_pmadd_ub_sw_128,
                                     SSE_PMADD, memopv2i64>;
}
defm PMULHRSW    : SS3I_binop_rm_int<0x0B, "pmulhrsw",
                                     int_x86_ssse3_pmul_hr_sw_128,
                                     SSE_PMULHRSW, memopv2i64>;
}

//===---------------------------------------------------------------------===//
// SSSE3 - Packed Align Instruction Patterns
//===---------------------------------------------------------------------===//

multiclass ssse3_palignr<string asm, bit Is2Addr = 1> {
  let hasSideEffects = 0 in {
  def R128rr : SS3AI<0x0F, MRMSrcReg, (outs VR128:$dst),
      (ins VR128:$src1, VR128:$src2, u8imm:$src3),
      !if(Is2Addr,
        !strconcat(asm, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
        !strconcat(asm,
                   "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
      [], IIC_SSE_PALIGNRR>, Sched<[WriteShuffle]>;
  let mayLoad = 1 in
  def R128rm : SS3AI<0x0F, MRMSrcMem, (outs VR128:$dst),
      (ins VR128:$src1, i128mem:$src2, u8imm:$src3),
      !if(Is2Addr,
        !strconcat(asm, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
        !strconcat(asm,
                   "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
      [], IIC_SSE_PALIGNRM>, Sched<[WriteShuffleLd, ReadAfterLd]>;
  }
}

multiclass ssse3_palignr_y<string asm, bit Is2Addr = 1> {
  let hasSideEffects = 0 in {
  def R256rr : SS3AI<0x0F, MRMSrcReg, (outs VR256:$dst),
      (ins VR256:$src1, VR256:$src2, u8imm:$src3),
      !strconcat(asm,
                 "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
      []>, Sched<[WriteShuffle]>;
  let mayLoad = 1 in
  def R256rm : SS3AI<0x0F, MRMSrcMem, (outs VR256:$dst),
      (ins VR256:$src1, i256mem:$src2, u8imm:$src3),
      !strconcat(asm,
                 "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
      []>, Sched<[WriteShuffleLd, ReadAfterLd]>;
  }
}

let Predicates = [HasAVX] in
  defm VPALIGN : ssse3_palignr<"vpalignr", 0>, VEX_4V;
let Predicates = [HasAVX2] in
  defm VPALIGN : ssse3_palignr_y<"vpalignr", 0>, VEX_4V, VEX_L;
let Constraints = "$src1 = $dst", Predicates = [UseSSSE3] in
  defm PALIGN : ssse3_palignr<"palignr">;
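
// Illustrative sketch only (kept in a comment so it is not parsed as
// TableGen): assuming C with <tmmintrin.h>, PALIGNR concatenates the two
// sources and extracts a byte-aligned window:
//
//   #include <tmmintrin.h>
//   // r = bytes 5..20 of the 32-byte concatenation hi:lo
//   __m128i palignr_example(__m128i hi, __m128i lo) {
//     return _mm_alignr_epi8(hi, lo, 5);   // matched via X86PAlignr below
//   }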

let Predicates = [HasAVX2, NoVLX_Or_NoBWI] in {
def : Pat<(v8i32 (X86PAlignr VR256:$src1, VR256:$src2, (i8 imm:$imm))),
          (VPALIGNR256rr VR256:$src1, VR256:$src2, imm:$imm)>;
def : Pat<(v8f32 (X86PAlignr VR256:$src1, VR256:$src2, (i8 imm:$imm))),
          (VPALIGNR256rr VR256:$src1, VR256:$src2, imm:$imm)>;
def : Pat<(v16i16 (X86PAlignr VR256:$src1, VR256:$src2, (i8 imm:$imm))),
          (VPALIGNR256rr VR256:$src1, VR256:$src2, imm:$imm)>;
def : Pat<(v32i8 (X86PAlignr VR256:$src1, VR256:$src2, (i8 imm:$imm))),
          (VPALIGNR256rr VR256:$src1, VR256:$src2, imm:$imm)>;
}

let Predicates = [HasAVX, NoVLX_Or_NoBWI] in {
def : Pat<(v4i32 (X86PAlignr VR128:$src1, VR128:$src2, (i8 imm:$imm))),
          (VPALIGNR128rr VR128:$src1, VR128:$src2, imm:$imm)>;
def : Pat<(v4f32 (X86PAlignr VR128:$src1, VR128:$src2, (i8 imm:$imm))),
          (VPALIGNR128rr VR128:$src1, VR128:$src2, imm:$imm)>;
def : Pat<(v8i16 (X86PAlignr VR128:$src1, VR128:$src2, (i8 imm:$imm))),
          (VPALIGNR128rr VR128:$src1, VR128:$src2, imm:$imm)>;
def : Pat<(v16i8 (X86PAlignr VR128:$src1, VR128:$src2, (i8 imm:$imm))),
          (VPALIGNR128rr VR128:$src1, VR128:$src2, imm:$imm)>;
}

let Predicates = [UseSSSE3] in {
def : Pat<(v4i32 (X86PAlignr VR128:$src1, VR128:$src2, (i8 imm:$imm))),
          (PALIGNR128rr VR128:$src1, VR128:$src2, imm:$imm)>;
def : Pat<(v4f32 (X86PAlignr VR128:$src1, VR128:$src2, (i8 imm:$imm))),
          (PALIGNR128rr VR128:$src1, VR128:$src2, imm:$imm)>;
def : Pat<(v8i16 (X86PAlignr VR128:$src1, VR128:$src2, (i8 imm:$imm))),
          (PALIGNR128rr VR128:$src1, VR128:$src2, imm:$imm)>;
def : Pat<(v16i8 (X86PAlignr VR128:$src1, VR128:$src2, (i8 imm:$imm))),
          (PALIGNR128rr VR128:$src1, VR128:$src2, imm:$imm)>;
}

//===---------------------------------------------------------------------===//
// SSE3 - Thread synchronization
//===---------------------------------------------------------------------===//

let SchedRW = [WriteSystem] in {
let usesCustomInserter = 1 in {
def MONITOR : PseudoI<(outs), (ins i32mem:$src1, GR32:$src2, GR32:$src3),
                [(int_x86_sse3_monitor addr:$src1, GR32:$src2, GR32:$src3)]>,
                Requires<[HasSSE3]>;
}

let Uses = [EAX, ECX, EDX] in
def MONITORrrr : I<0x01, MRM_C8, (outs), (ins), "monitor", [], IIC_SSE_MONITOR>,
                   TB, Requires<[HasSSE3]>;
let Uses = [ECX, EAX] in
def MWAITrr   : I<0x01, MRM_C9, (outs), (ins), "mwait",
                  [(int_x86_sse3_mwait ECX, EAX)], IIC_SSE_MWAIT>,
                  TB, Requires<[HasSSE3]>;
} // SchedRW

def : InstAlias<"mwait\t{%eax, %ecx|ecx, eax}", (MWAITrr)>, Requires<[Not64BitMode]>;
def : InstAlias<"mwait\t{%rax, %rcx|rcx, rax}", (MWAITrr)>, Requires<[In64BitMode]>;

def : InstAlias<"monitor\t{%eax, %ecx, %edx|edx, ecx, eax}", (MONITORrrr)>,
      Requires<[Not64BitMode]>;
def : InstAlias<"monitor\t{%rax, %rcx, %rdx|rdx, rcx, rax}", (MONITORrrr)>,
      Requires<[In64BitMode]>;
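
// Illustrative sketch only (kept in a comment so it is not parsed as
// TableGen): assuming C with <pmmintrin.h>, the MONITOR/MWAIT pair is
// typically used as:
//
//   #include <pmmintrin.h>
//   void wait_on(volatile int *flag) {
//     while (*flag == 0) {
//       _mm_monitor((const void *)flag, 0, 0);  // MONITOR: arm address range
//       if (*flag == 0)
//         _mm_mwait(0, 0);                      // MWAIT: sleep until a write
//     }
//   }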
//===----------------------------------------------------------------------===//
// SSE4.1 - Packed Move with Sign/Zero Extend
//===----------------------------------------------------------------------===//

multiclass SS41I_pmovx_rrrm<bits<8> opc, string OpcodeStr, X86MemOperand MemOp,
                            RegisterClass OutRC, RegisterClass InRC,
                            OpndItins itins> {
  def rr : SS48I<opc, MRMSrcReg, (outs OutRC:$dst), (ins InRC:$src),
                 !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
                 [], itins.rr>,
                 Sched<[itins.Sched]>;

  def rm : SS48I<opc, MRMSrcMem, (outs OutRC:$dst), (ins MemOp:$src),
                 !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
                 [],
                 itins.rm>, Sched<[itins.Sched.Folded]>;
}

multiclass SS41I_pmovx_rm_all<bits<8> opc, string OpcodeStr,
                              X86MemOperand MemOp, X86MemOperand MemYOp,
                              OpndItins SSEItins, OpndItins AVXItins,
                              OpndItins AVX2Itins> {
  defm NAME : SS41I_pmovx_rrrm<opc, OpcodeStr, MemOp, VR128, VR128, SSEItins>;
  let Predicates = [HasAVX, NoVLX] in
    defm V#NAME   : SS41I_pmovx_rrrm<opc, !strconcat("v", OpcodeStr), MemOp,
                                     VR128, VR128, AVXItins>, VEX;
  let Predicates = [HasAVX2, NoVLX] in
    defm V#NAME#Y : SS41I_pmovx_rrrm<opc, !strconcat("v", OpcodeStr), MemYOp,
                                     VR256, VR128, AVX2Itins>, VEX, VEX_L;
}

multiclass SS41I_pmovx_rm<bits<8> opc, string OpcodeStr,
                          X86MemOperand MemOp, X86MemOperand MemYOp> {
  defm PMOVSX#NAME : SS41I_pmovx_rm_all<opc, !strconcat("pmovsx", OpcodeStr),
                                        MemOp, MemYOp,
                                        SSE_INTALU_ITINS_SHUFF_P,
                                        DEFAULT_ITINS_SHUFFLESCHED,
                                        DEFAULT_ITINS_SHUFFLESCHED>;
  defm PMOVZX#NAME : SS41I_pmovx_rm_all<!add(opc, 0x10),
                                        !strconcat("pmovzx", OpcodeStr),
                                        MemOp, MemYOp,
                                        SSE_INTALU_ITINS_SHUFF_P,
                                        DEFAULT_ITINS_SHUFFLESCHED,
                                        DEFAULT_ITINS_SHUFFLESCHED>;
}

defm BW : SS41I_pmovx_rm<0x20, "bw", i64mem, i128mem>;
defm WD : SS41I_pmovx_rm<0x23, "wd", i64mem, i128mem>;
defm DQ : SS41I_pmovx_rm<0x25, "dq", i64mem, i128mem>;

defm BD : SS41I_pmovx_rm<0x21, "bd", i32mem, i64mem>;
defm WQ : SS41I_pmovx_rm<0x24, "wq", i32mem, i64mem>;

defm BQ : SS41I_pmovx_rm<0x22, "bq", i16mem, i32mem>;

// AVX2 Patterns
multiclass SS41I_pmovx_avx2_patterns<string OpcPrefix, string ExtTy, SDNode ExtOp> {
  // Register-Register patterns
  def : Pat<(v16i16 (ExtOp (v16i8 VR128:$src))),
            (!cast<I>(OpcPrefix#BWYrr) VR128:$src)>;
  def : Pat<(v8i32 (ExtOp (v16i8 VR128:$src))),
            (!cast<I>(OpcPrefix#BDYrr) VR128:$src)>;
  def : Pat<(v4i64 (ExtOp (v16i8 VR128:$src))),
            (!cast<I>(OpcPrefix#BQYrr) VR128:$src)>;

  def : Pat<(v8i32 (ExtOp (v8i16 VR128:$src))),
            (!cast<I>(OpcPrefix#WDYrr) VR128:$src)>;
  def : Pat<(v4i64 (ExtOp (v8i16 VR128:$src))),
            (!cast<I>(OpcPrefix#WQYrr) VR128:$src)>;

  def : Pat<(v4i64 (ExtOp (v4i32 VR128:$src))),
            (!cast<I>(OpcPrefix#DQYrr) VR128:$src)>;
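
  // Illustrative sketch only (kept in a comment so it is not parsed as
  // TableGen): assuming C with <immintrin.h>, the 256-bit forms widen a
  // full 128-bit input, e.g. VPMOVZXBW ymm, xmm:
  //
  //   #include <immintrin.h>
  //   // r[i] = (uint16_t)a[i] for i = 0..15
  //   __m256i zext_example(__m128i a) {
  //     return _mm256_cvtepu8_epi16(a);   // X86vzext in the patterns below
  //   }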
  // On AVX2, we also support 256-bit inputs.
  def : Pat<(v16i16 (ExtOp (v32i8 VR256:$src))),
            (!cast<I>(OpcPrefix#BWYrr) (EXTRACT_SUBREG VR256:$src, sub_xmm))>;
  def : Pat<(v8i32 (ExtOp (v32i8 VR256:$src))),
            (!cast<I>(OpcPrefix#BDYrr) (EXTRACT_SUBREG VR256:$src, sub_xmm))>;
  def : Pat<(v4i64 (ExtOp (v32i8 VR256:$src))),
            (!cast<I>(OpcPrefix#BQYrr) (EXTRACT_SUBREG VR256:$src, sub_xmm))>;

  def : Pat<(v8i32 (ExtOp (v16i16 VR256:$src))),
            (!cast<I>(OpcPrefix#WDYrr) (EXTRACT_SUBREG VR256:$src, sub_xmm))>;
  def : Pat<(v4i64 (ExtOp (v16i16 VR256:$src))),
            (!cast<I>(OpcPrefix#WQYrr) (EXTRACT_SUBREG VR256:$src, sub_xmm))>;

  def : Pat<(v4i64 (ExtOp (v8i32 VR256:$src))),
            (!cast<I>(OpcPrefix#DQYrr) (EXTRACT_SUBREG VR256:$src, sub_xmm))>;

  // Simple Register-Memory patterns
  def : Pat<(v16i16 (!cast<PatFrag>(ExtTy#"extloadvi8") addr:$src)),
            (!cast<I>(OpcPrefix#BWYrm) addr:$src)>;
  def : Pat<(v8i32 (!cast<PatFrag>(ExtTy#"extloadvi8") addr:$src)),
            (!cast<I>(OpcPrefix#BDYrm) addr:$src)>;
  def : Pat<(v4i64 (!cast<PatFrag>(ExtTy#"extloadvi8") addr:$src)),
            (!cast<I>(OpcPrefix#BQYrm) addr:$src)>;

  def : Pat<(v8i32 (!cast<PatFrag>(ExtTy#"extloadvi16") addr:$src)),
            (!cast<I>(OpcPrefix#WDYrm) addr:$src)>;
  def : Pat<(v4i64 (!cast<PatFrag>(ExtTy#"extloadvi16") addr:$src)),
            (!cast<I>(OpcPrefix#WQYrm) addr:$src)>;

  def : Pat<(v4i64 (!cast<PatFrag>(ExtTy#"extloadvi32") addr:$src)),
            (!cast<I>(OpcPrefix#DQYrm) addr:$src)>;

  // AVX2 Register-Memory patterns
  def : Pat<(v16i16 (ExtOp (bc_v16i8 (loadv2i64 addr:$src)))),
            (!cast<I>(OpcPrefix#BWYrm) addr:$src)>;
  def : Pat<(v16i16 (ExtOp (v16i8 (vzmovl_v2i64 addr:$src)))),
            (!cast<I>(OpcPrefix#BWYrm) addr:$src)>;
  def : Pat<(v16i16 (ExtOp (v16i8 (vzload_v2i64 addr:$src)))),
            (!cast<I>(OpcPrefix#BWYrm) addr:$src)>;
  def : Pat<(v16i16 (ExtOp (bc_v16i8 (loadv2i64 addr:$src)))),
            (!cast<I>(OpcPrefix#BWYrm) addr:$src)>;

  def : Pat<(v8i32 (ExtOp (bc_v16i8 (v2i64 (scalar_to_vector (loadi64 addr:$src)))))),
            (!cast<I>(OpcPrefix#BDYrm) addr:$src)>;
  def : Pat<(v8i32 (ExtOp (v16i8 (vzmovl_v2i64 addr:$src)))),
            (!cast<I>(OpcPrefix#BDYrm) addr:$src)>;
  def : Pat<(v8i32 (ExtOp (v16i8 (vzload_v2i64 addr:$src)))),
            (!cast<I>(OpcPrefix#BDYrm) addr:$src)>;
  def : Pat<(v8i32 (ExtOp (bc_v16i8 (loadv2i64 addr:$src)))),
            (!cast<I>(OpcPrefix#BDYrm) addr:$src)>;

  def : Pat<(v4i64 (ExtOp (bc_v16i8 (v4i32 (scalar_to_vector (loadi32 addr:$src)))))),
            (!cast<I>(OpcPrefix#BQYrm) addr:$src)>;
  def : Pat<(v4i64 (ExtOp (v16i8 (vzmovl_v4i32 addr:$src)))),
            (!cast<I>(OpcPrefix#BQYrm) addr:$src)>;
  def : Pat<(v4i64 (ExtOp (v16i8 (vzload_v2i64 addr:$src)))),
            (!cast<I>(OpcPrefix#BQYrm) addr:$src)>;
  def : Pat<(v4i64 (ExtOp (bc_v16i8 (loadv2i64 addr:$src)))),
            (!cast<I>(OpcPrefix#BQYrm) addr:$src)>;

  def : Pat<(v8i32 (ExtOp (bc_v8i16 (loadv2i64 addr:$src)))),
            (!cast<I>(OpcPrefix#WDYrm) addr:$src)>;
  def : Pat<(v8i32 (ExtOp (v8i16 (vzmovl_v2i64 addr:$src)))),
            (!cast<I>(OpcPrefix#WDYrm) addr:$src)>;
  def : Pat<(v8i32 (ExtOp (v8i16 (vzload_v2i64 addr:$src)))),
            (!cast<I>(OpcPrefix#WDYrm) addr:$src)>;
  def : Pat<(v8i32 (ExtOp (bc_v8i16 (loadv2i64 addr:$src)))),
            (!cast<I>(OpcPrefix#WDYrm) addr:$src)>;

  def : Pat<(v4i64 (ExtOp (bc_v8i16 (v2i64 (scalar_to_vector (loadi64 addr:$src)))))),
            (!cast<I>(OpcPrefix#WQYrm) addr:$src)>;
  def : Pat<(v4i64 (ExtOp (v8i16
                           (vzmovl_v2i64 addr:$src)))),
            (!cast<I>(OpcPrefix#WQYrm) addr:$src)>;
  def : Pat<(v4i64 (ExtOp (v8i16 (vzload_v2i64 addr:$src)))),
            (!cast<I>(OpcPrefix#WQYrm) addr:$src)>;
  def : Pat<(v4i64 (ExtOp (bc_v8i16 (loadv2i64 addr:$src)))),
            (!cast<I>(OpcPrefix#WQYrm) addr:$src)>;

  def : Pat<(v4i64 (ExtOp (bc_v4i32 (loadv2i64 addr:$src)))),
            (!cast<I>(OpcPrefix#DQYrm) addr:$src)>;
  def : Pat<(v4i64 (ExtOp (v4i32 (vzmovl_v2i64 addr:$src)))),
            (!cast<I>(OpcPrefix#DQYrm) addr:$src)>;
  def : Pat<(v4i64 (ExtOp (v4i32 (vzload_v2i64 addr:$src)))),
            (!cast<I>(OpcPrefix#DQYrm) addr:$src)>;
  def : Pat<(v4i64 (ExtOp (bc_v4i32 (loadv2i64 addr:$src)))),
            (!cast<I>(OpcPrefix#DQYrm) addr:$src)>;
}

let Predicates = [HasAVX2, NoVLX] in {
  defm : SS41I_pmovx_avx2_patterns<"VPMOVSX", "s", X86vsext>;
  defm : SS41I_pmovx_avx2_patterns<"VPMOVZX", "z", X86vzext>;
}

// SSE4.1/AVX patterns.
multiclass SS41I_pmovx_patterns<string OpcPrefix, string ExtTy,
                                SDNode ExtOp, PatFrag ExtLoad16> {
  def : Pat<(v8i16 (ExtOp (v16i8 VR128:$src))),
            (!cast<I>(OpcPrefix#BWrr) VR128:$src)>;
  def : Pat<(v4i32 (ExtOp (v16i8 VR128:$src))),
            (!cast<I>(OpcPrefix#BDrr) VR128:$src)>;
  def : Pat<(v2i64 (ExtOp (v16i8 VR128:$src))),
            (!cast<I>(OpcPrefix#BQrr) VR128:$src)>;

  def : Pat<(v4i32 (ExtOp (v8i16 VR128:$src))),
            (!cast<I>(OpcPrefix#WDrr) VR128:$src)>;
  def : Pat<(v2i64 (ExtOp (v8i16 VR128:$src))),
            (!cast<I>(OpcPrefix#WQrr) VR128:$src)>;

  def : Pat<(v2i64 (ExtOp (v4i32 VR128:$src))),
            (!cast<I>(OpcPrefix#DQrr) VR128:$src)>;

  def : Pat<(v8i16 (!cast<PatFrag>(ExtTy#"extloadvi8") addr:$src)),
            (!cast<I>(OpcPrefix#BWrm) addr:$src)>;
  def : Pat<(v4i32 (!cast<PatFrag>(ExtTy#"extloadvi8") addr:$src)),
            (!cast<I>(OpcPrefix#BDrm) addr:$src)>;
  def : Pat<(v2i64 (!cast<PatFrag>(ExtTy#"extloadvi8") addr:$src)),
            (!cast<I>(OpcPrefix#BQrm) addr:$src)>;

  def : Pat<(v4i32 (!cast<PatFrag>(ExtTy#"extloadvi16") addr:$src)),
            (!cast<I>(OpcPrefix#WDrm) addr:$src)>;
  def : Pat<(v2i64 (!cast<PatFrag>(ExtTy#"extloadvi16") addr:$src)),
            (!cast<I>(OpcPrefix#WQrm) addr:$src)>;

  def : Pat<(v2i64 (!cast<PatFrag>(ExtTy#"extloadvi32") addr:$src)),
            (!cast<I>(OpcPrefix#DQrm) addr:$src)>;

  def : Pat<(v8i16 (ExtOp (bc_v16i8 (v2i64 (scalar_to_vector (loadi64 addr:$src)))))),
            (!cast<I>(OpcPrefix#BWrm) addr:$src)>;
  def : Pat<(v8i16 (ExtOp (bc_v16i8 (v2f64 (scalar_to_vector (loadf64 addr:$src)))))),
            (!cast<I>(OpcPrefix#BWrm) addr:$src)>;
  def : Pat<(v8i16 (ExtOp (v16i8 (vzmovl_v2i64 addr:$src)))),
            (!cast<I>(OpcPrefix#BWrm) addr:$src)>;
  def : Pat<(v8i16 (ExtOp (v16i8 (vzload_v2i64 addr:$src)))),
            (!cast<I>(OpcPrefix#BWrm) addr:$src)>;
  def : Pat<(v8i16 (ExtOp (bc_v16i8 (loadv2i64 addr:$src)))),
            (!cast<I>(OpcPrefix#BWrm) addr:$src)>;

  def : Pat<(v4i32 (ExtOp (bc_v16i8 (v4i32 (scalar_to_vector (loadi32 addr:$src)))))),
            (!cast<I>(OpcPrefix#BDrm) addr:$src)>;
  def : Pat<(v4i32 (ExtOp (v16i8 (vzmovl_v4i32 addr:$src)))),
            (!cast<I>(OpcPrefix#BDrm) addr:$src)>;
  def : Pat<(v4i32 (ExtOp (v16i8 (vzload_v2i64 addr:$src)))),
            (!cast<I>(OpcPrefix#BDrm) addr:$src)>;
  def : Pat<(v4i32 (ExtOp (bc_v16i8 (loadv2i64 addr:$src)))),
            (!cast<I>(OpcPrefix#BDrm) addr:$src)>;

  def : Pat<(v2i64 (ExtOp (bc_v16i8 (v4i32 (scalar_to_vector (ExtLoad16 addr:$src)))))),
            (!cast<I>(OpcPrefix#BQrm) addr:$src)>;
  def : Pat<(v2i64 (ExtOp (v16i8 (vzmovl_v4i32 addr:$src)))),
            (!cast<I>(OpcPrefix#BQrm) addr:$src)>;
  def : Pat<(v2i64 (ExtOp (v16i8 (vzload_v2i64 addr:$src)))),
            (!cast<I>(OpcPrefix#BQrm) addr:$src)>;
  def : Pat<(v2i64 (ExtOp (bc_v16i8 (loadv2i64 addr:$src)))),
            (!cast<I>(OpcPrefix#BQrm) addr:$src)>;

  def : Pat<(v4i32 (ExtOp (bc_v8i16 (v2i64 (scalar_to_vector (loadi64 addr:$src)))))),
            (!cast<I>(OpcPrefix#WDrm) addr:$src)>;
  def : Pat<(v4i32 (ExtOp (bc_v8i16 (v2f64 (scalar_to_vector (loadf64 addr:$src)))))),
            (!cast<I>(OpcPrefix#WDrm) addr:$src)>;
  def : Pat<(v4i32 (ExtOp (v8i16 (vzmovl_v2i64 addr:$src)))),
            (!cast<I>(OpcPrefix#WDrm) addr:$src)>;
  def : Pat<(v4i32 (ExtOp (v8i16 (vzload_v2i64 addr:$src)))),
            (!cast<I>(OpcPrefix#WDrm) addr:$src)>;
  def : Pat<(v4i32 (ExtOp (bc_v8i16 (loadv2i64 addr:$src)))),
            (!cast<I>(OpcPrefix#WDrm) addr:$src)>;

  def : Pat<(v2i64 (ExtOp (bc_v8i16 (v4i32 (scalar_to_vector (loadi32 addr:$src)))))),
            (!cast<I>(OpcPrefix#WQrm) addr:$src)>;
  def : Pat<(v2i64 (ExtOp (v8i16 (vzmovl_v4i32 addr:$src)))),
            (!cast<I>(OpcPrefix#WQrm) addr:$src)>;
  def : Pat<(v2i64 (ExtOp (v8i16 (vzload_v2i64 addr:$src)))),
            (!cast<I>(OpcPrefix#WQrm) addr:$src)>;
  def : Pat<(v2i64 (ExtOp (bc_v8i16 (loadv2i64 addr:$src)))),
            (!cast<I>(OpcPrefix#WQrm) addr:$src)>;

  def : Pat<(v2i64 (ExtOp (bc_v4i32 (v2i64 (scalar_to_vector (loadi64 addr:$src)))))),
            (!cast<I>(OpcPrefix#DQrm) addr:$src)>;
  def : Pat<(v2i64 (ExtOp (bc_v4i32 (v2f64 (scalar_to_vector (loadf64 addr:$src)))))),
            (!cast<I>(OpcPrefix#DQrm) addr:$src)>;
  def : Pat<(v2i64 (ExtOp (v4i32 (vzmovl_v2i64 addr:$src)))),
            (!cast<I>(OpcPrefix#DQrm) addr:$src)>;
  def : Pat<(v2i64 (ExtOp (v4i32 (vzload_v2i64 addr:$src)))),
            (!cast<I>(OpcPrefix#DQrm) addr:$src)>;
  def : Pat<(v2i64 (ExtOp (bc_v4i32 (loadv2i64 addr:$src)))),
            (!cast<I>(OpcPrefix#DQrm) addr:$src)>;
}

let Predicates = [HasAVX, NoVLX] in {
  defm : SS41I_pmovx_patterns<"VPMOVSX", "s", X86vsext, extloadi32i16>;
  defm : SS41I_pmovx_patterns<"VPMOVZX", "z", X86vzext, loadi16_anyext>;
}

let Predicates = [UseSSE41] in {
  defm : SS41I_pmovx_patterns<"PMOVSX", "s", X86vsext, extloadi32i16>;
  defm : SS41I_pmovx_patterns<"PMOVZX", "z", X86vzext, loadi16_anyext>;
}

//===----------------------------------------------------------------------===//
// SSE4.1 - Extract Instructions
//===----------------------------------------------------------------------===//

/// SS41I_extract8 - SSE 4.1 extract 8 bits to 32 bit reg or 8 bit mem
multiclass SS41I_extract8<bits<8> opc, string OpcodeStr> {
  def rr : SS4AIi8<opc, MRMDestReg, (outs GR32orGR64:$dst),
                   (ins VR128:$src1, u8imm:$src2),
                   !strconcat(OpcodeStr,
                              "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
                   [(set GR32orGR64:$dst, (X86pextrb (v16i8 VR128:$src1),
                                                     imm:$src2))]>,
                   Sched<[WriteShuffle]>;
  let hasSideEffects = 0, mayStore = 1,
      SchedRW = [WriteShuffleLd, WriteRMW] in
  def mr : SS4AIi8<opc, MRMDestMem, (outs),
                   (ins i8mem:$dst, VR128:$src1, u8imm:$src2),
                   !strconcat(OpcodeStr,
                              "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
                   [(store (i8 (trunc (assertzext (X86pextrb (v16i8 VR128:$src1),
                                                             imm:$src2)))), addr:$dst)]>;
}
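
// Illustrative sketch only (kept in a comment so it is not parsed as
// TableGen): assuming C with <smmintrin.h>, PEXTRB moves one byte lane into
// a general-purpose register, zero-extended:
//
//   #include <smmintrin.h>
//   int extract_example(__m128i v) {
//     return _mm_extract_epi8(v, 3);   // byte lane 3 -> GR32, via X86pextrb
//   }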
let Predicates = [HasAVX, NoBWI] in
  defm VPEXTRB : SS41I_extract8<0x14, "vpextrb">, VEX;

defm PEXTRB : SS41I_extract8<0x14, "pextrb">;


/// SS41I_extract16 - SSE 4.1 extract 16 bits to memory destination
multiclass SS41I_extract16<bits<8> opc, string OpcodeStr> {
  let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0 in
  def rr_REV : SS4AIi8<opc, MRMDestReg, (outs GR32orGR64:$dst),
                       (ins VR128:$src1, u8imm:$src2),
                       !strconcat(OpcodeStr,
                                  "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
                       []>, Sched<[WriteShuffle]>;

  let hasSideEffects = 0, mayStore = 1,
      SchedRW = [WriteShuffleLd, WriteRMW] in
  def mr : SS4AIi8<opc, MRMDestMem, (outs),
                   (ins i16mem:$dst, VR128:$src1, u8imm:$src2),
                   !strconcat(OpcodeStr,
                              "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
                   [(store (i16 (trunc (assertzext (X86pextrw (v8i16 VR128:$src1),
                                                              imm:$src2)))), addr:$dst)]>;
}

let Predicates = [HasAVX, NoBWI] in
  defm VPEXTRW : SS41I_extract16<0x15, "vpextrw">, VEX;

defm PEXTRW : SS41I_extract16<0x15, "pextrw">;


/// SS41I_extract32 - SSE 4.1 extract 32 bits to int reg or memory destination
multiclass SS41I_extract32<bits<8> opc, string OpcodeStr> {
  def rr : SS4AIi8<opc, MRMDestReg, (outs GR32:$dst),
                   (ins VR128:$src1, u8imm:$src2),
                   !strconcat(OpcodeStr,
                              "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
                   [(set GR32:$dst,
                     (extractelt (v4i32 VR128:$src1), imm:$src2))]>,
                   Sched<[WriteShuffle]>;
  let SchedRW = [WriteShuffleLd, WriteRMW] in
  def mr : SS4AIi8<opc, MRMDestMem, (outs),
                   (ins i32mem:$dst, VR128:$src1, u8imm:$src2),
                   !strconcat(OpcodeStr,
                              "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
                   [(store (extractelt (v4i32 VR128:$src1), imm:$src2),
                           addr:$dst)]>;
}

let Predicates = [HasAVX, NoDQI] in
  defm VPEXTRD : SS41I_extract32<0x16, "vpextrd">, VEX;

defm PEXTRD : SS41I_extract32<0x16, "pextrd">;

/// SS41I_extract64 - SSE 4.1 extract 64 bits to int reg or memory destination
multiclass SS41I_extract64<bits<8> opc, string OpcodeStr> {
  def rr : SS4AIi8<opc, MRMDestReg, (outs GR64:$dst),
                   (ins VR128:$src1, u8imm:$src2),
                   !strconcat(OpcodeStr,
                              "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
                   [(set GR64:$dst,
                     (extractelt (v2i64 VR128:$src1), imm:$src2))]>,
                   Sched<[WriteShuffle]>, REX_W;
  let SchedRW = [WriteShuffleLd, WriteRMW] in
  def mr : SS4AIi8<opc, MRMDestMem, (outs),
                   (ins i64mem:$dst, VR128:$src1, u8imm:$src2),
                   !strconcat(OpcodeStr,
                              "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
                   [(store (extractelt (v2i64 VR128:$src1), imm:$src2),
                           addr:$dst)]>, REX_W;
}

let Predicates = [HasAVX, NoDQI] in
  defm VPEXTRQ : SS41I_extract64<0x16, "vpextrq">, VEX, VEX_W;

defm PEXTRQ : SS41I_extract64<0x16, "pextrq">;

/// SS41I_extractf32 - SSE 4.1 extract 32 bits fp value to int reg or memory
/// destination
multiclass SS41I_extractf32<bits<8> opc, string OpcodeStr,
                            OpndItins itins = DEFAULT_ITINS> {
  def rr : SS4AIi8<opc, MRMDestReg, (outs GR32orGR64:$dst),
                   (ins VR128:$src1, u8imm:$src2),
                   !strconcat(OpcodeStr,
                              "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
                   [(set GR32orGR64:$dst,
                     (extractelt (bc_v4i32 (v4f32 VR128:$src1)), imm:$src2))],
                   itins.rr>, Sched<[WriteFBlend]>;
  let SchedRW = [WriteFBlendLd, WriteRMW] in
  def mr : SS4AIi8<opc, MRMDestMem, (outs),
                   (ins f32mem:$dst, VR128:$src1, u8imm:$src2),
                   !strconcat(OpcodeStr,
                              "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
                   [(store (extractelt (bc_v4i32 (v4f32 VR128:$src1)), imm:$src2),
                           addr:$dst)], itins.rm>;
}

let ExeDomain = SSEPackedSingle in {
  let Predicates = [UseAVX] in
    defm VEXTRACTPS : SS41I_extractf32<0x17, "vextractps">, VEX;
  defm EXTRACTPS   : SS41I_extractf32<0x17, "extractps", SSE_EXTRACT_ITINS>;
}

// Also match an EXTRACTPS store when the store is done as f32 instead of i32.
def : Pat<(store (f32 (bitconvert (extractelt (bc_v4i32 (v4f32 VR128:$src1)),
                                              imm:$src2))),
                 addr:$dst),
          (VEXTRACTPSmr addr:$dst, VR128:$src1, imm:$src2)>,
          Requires<[HasAVX]>;
def : Pat<(store (f32 (bitconvert (extractelt (bc_v4i32 (v4f32 VR128:$src1)),
                                              imm:$src2))),
                 addr:$dst),
          (EXTRACTPSmr addr:$dst, VR128:$src1, imm:$src2)>,
          Requires<[UseSSE41]>;

//===----------------------------------------------------------------------===//
// SSE4.1 - Insert Instructions
//===----------------------------------------------------------------------===//

multiclass SS41I_insert8<bits<8> opc, string asm, bit Is2Addr = 1> {
  def rr : SS4AIi8<opc, MRMSrcReg, (outs VR128:$dst),
      (ins VR128:$src1, GR32orGR64:$src2, u8imm:$src3),
      !if(Is2Addr,
        !strconcat(asm, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
        !strconcat(asm,
                   "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
      [(set VR128:$dst,
        (X86pinsrb VR128:$src1, GR32orGR64:$src2, imm:$src3))]>,
      Sched<[WriteShuffle]>;
  def rm : SS4AIi8<opc, MRMSrcMem, (outs VR128:$dst),
      (ins VR128:$src1, i8mem:$src2, u8imm:$src3),
      !if(Is2Addr,
        !strconcat(asm, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
        !strconcat(asm,
                   "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
      [(set VR128:$dst,
        (X86pinsrb VR128:$src1, (extloadi8 addr:$src2),
                   imm:$src3))]>, Sched<[WriteShuffleLd, ReadAfterLd]>;
}

let Predicates = [HasAVX, NoBWI] in
  defm VPINSRB : SS41I_insert8<0x20, "vpinsrb", 0>, VEX_4V;
let Constraints = "$src1 = $dst" in
  defm PINSRB  : SS41I_insert8<0x20, "pinsrb">;

multiclass SS41I_insert32<bits<8> opc, string asm, bit Is2Addr = 1> {
  def rr : SS4AIi8<opc, MRMSrcReg, (outs VR128:$dst),
      (ins VR128:$src1, GR32:$src2, u8imm:$src3),
      !if(Is2Addr,
        !strconcat(asm, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
        !strconcat(asm,
                   "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
      [(set VR128:$dst,
        (v4i32 (insertelt VR128:$src1, GR32:$src2, imm:$src3)))]>,
      Sched<[WriteShuffle]>;
  def rm : SS4AIi8<opc, MRMSrcMem, (outs VR128:$dst),
      (ins VR128:$src1, i32mem:$src2, u8imm:$src3),
      !if(Is2Addr,
        !strconcat(asm, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
        !strconcat(asm,
                   "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
      [(set VR128:$dst,
        (v4i32 (insertelt VR128:$src1, (loadi32 addr:$src2),
                          imm:$src3)))]>, Sched<[WriteShuffleLd, ReadAfterLd]>;
}

let Predicates = [HasAVX, NoDQI] in
  defm VPINSRD : SS41I_insert32<0x22, "vpinsrd", 0>, VEX_4V;
let Constraints = "$src1 = $dst" in
  defm PINSRD : SS41I_insert32<0x22, "pinsrd">;

multiclass SS41I_insert64<bits<8> opc, string asm, bit Is2Addr = 1> {
  def rr : SS4AIi8<opc, MRMSrcReg, (outs VR128:$dst),
      (ins VR128:$src1, GR64:$src2, u8imm:$src3),
      !if(Is2Addr,
        !strconcat(asm, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
        !strconcat(asm,
"\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")), 6311 [(set VR128:$dst, 6312 (v2i64 (insertelt VR128:$src1, GR64:$src2, imm:$src3)))]>, 6313 Sched<[WriteShuffle]>; 6314 def rm : SS4AIi8<opc, MRMSrcMem, (outs VR128:$dst), 6315 (ins VR128:$src1, i64mem:$src2, u8imm:$src3), 6316 !if(Is2Addr, 6317 !strconcat(asm, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"), 6318 !strconcat(asm, 6319 "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")), 6320 [(set VR128:$dst, 6321 (v2i64 (insertelt VR128:$src1, (loadi64 addr:$src2), 6322 imm:$src3)))]>, Sched<[WriteShuffleLd, ReadAfterLd]>; 6323} 6324 6325let Predicates = [HasAVX, NoDQI] in 6326 defm VPINSRQ : SS41I_insert64<0x22, "vpinsrq", 0>, VEX_4V, VEX_W; 6327let Constraints = "$src1 = $dst" in 6328 defm PINSRQ : SS41I_insert64<0x22, "pinsrq">, REX_W; 6329 6330// insertps has a few different modes, there's the first two here below which 6331// are optimized inserts that won't zero arbitrary elements in the destination 6332// vector. The next one matches the intrinsic and could zero arbitrary elements 6333// in the target vector. 6334multiclass SS41I_insertf32<bits<8> opc, string asm, bit Is2Addr = 1, 6335 OpndItins itins = DEFAULT_ITINS> { 6336 def rr : SS4AIi8<opc, MRMSrcReg, (outs VR128:$dst), 6337 (ins VR128:$src1, VR128:$src2, u8imm:$src3), 6338 !if(Is2Addr, 6339 !strconcat(asm, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"), 6340 !strconcat(asm, 6341 "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")), 6342 [(set VR128:$dst, 6343 (X86insertps VR128:$src1, VR128:$src2, imm:$src3))], itins.rr>, 6344 Sched<[WriteFShuffle]>; 6345 def rm : SS4AIi8<opc, MRMSrcMem, (outs VR128:$dst), 6346 (ins VR128:$src1, f32mem:$src2, u8imm:$src3), 6347 !if(Is2Addr, 6348 !strconcat(asm, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"), 6349 !strconcat(asm, 6350 "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")), 6351 [(set VR128:$dst, 6352 (X86insertps VR128:$src1, 6353 (v4f32 (scalar_to_vector (loadf32 addr:$src2))), 6354 imm:$src3))], itins.rm>, 6355 Sched<[WriteFShuffleLd, ReadAfterLd]>; 6356} 6357 6358let ExeDomain = SSEPackedSingle in { 6359 let Predicates = [UseAVX] in 6360 defm VINSERTPS : SS41I_insertf32<0x21, "vinsertps", 0>, VEX_4V; 6361 let Constraints = "$src1 = $dst" in 6362 defm INSERTPS : SS41I_insertf32<0x21, "insertps", 1, SSE_INSERT_ITINS>; 6363} 6364 6365let Predicates = [UseSSE41] in { 6366 // If we're inserting an element from a load or a null pshuf of a load, 6367 // fold the load into the insertps instruction. 6368 def : Pat<(v4f32 (X86insertps (v4f32 VR128:$src1), (X86PShufd (v4f32 6369 (scalar_to_vector (loadf32 addr:$src2))), (i8 0)), 6370 imm:$src3)), 6371 (INSERTPSrm VR128:$src1, addr:$src2, imm:$src3)>; 6372 def : Pat<(v4f32 (X86insertps (v4f32 VR128:$src1), (X86PShufd 6373 (loadv4f32 addr:$src2), (i8 0)), imm:$src3)), 6374 (INSERTPSrm VR128:$src1, addr:$src2, imm:$src3)>; 6375} 6376 6377let Predicates = [UseAVX] in { 6378 // If we're inserting an element from a vbroadcast of a load, fold the 6379 // load into the X86insertps instruction. 
let Predicates = [UseAVX] in {
  // If we're inserting an element from a vbroadcast of a load, fold the
  // load into the X86insertps instruction.
  def : Pat<(v4f32 (X86insertps (v4f32 VR128:$src1),
                    (X86VBroadcast (loadf32 addr:$src2)), imm:$src3)),
            (VINSERTPSrm VR128:$src1, addr:$src2, imm:$src3)>;
  def : Pat<(v4f32 (X86insertps (v4f32 VR128:$src1),
                    (X86VBroadcast (loadv4f32 addr:$src2)), imm:$src3)),
            (VINSERTPSrm VR128:$src1, addr:$src2, imm:$src3)>;
}

//===----------------------------------------------------------------------===//
// SSE4.1 - Round Instructions
//===----------------------------------------------------------------------===//

multiclass sse41_fp_unop_rm<bits<8> opcps, bits<8> opcpd, string OpcodeStr,
                            X86MemOperand x86memop, RegisterClass RC,
                            PatFrag mem_frag32, PatFrag mem_frag64,
                            Intrinsic V4F32Int, Intrinsic V2F64Int> {
let ExeDomain = SSEPackedSingle in {
  // Vector intrinsic operation, reg
  def PSr : SS4AIi8<opcps, MRMSrcReg,
                    (outs RC:$dst), (ins RC:$src1, i32u8imm:$src2),
                    !strconcat(OpcodeStr,
                               "ps\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
                    [(set RC:$dst, (V4F32Int RC:$src1, imm:$src2))],
                    IIC_SSE_ROUNDPS_REG>, Sched<[WriteFAdd]>;

  // Vector intrinsic operation, mem
  def PSm : SS4AIi8<opcps, MRMSrcMem,
                    (outs RC:$dst), (ins x86memop:$src1, i32u8imm:$src2),
                    !strconcat(OpcodeStr,
                               "ps\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
                    [(set RC:$dst,
                      (V4F32Int (mem_frag32 addr:$src1),imm:$src2))],
                    IIC_SSE_ROUNDPS_MEM>, Sched<[WriteFAddLd]>;
} // ExeDomain = SSEPackedSingle

let ExeDomain = SSEPackedDouble in {
  // Vector intrinsic operation, reg
  def PDr : SS4AIi8<opcpd, MRMSrcReg,
                    (outs RC:$dst), (ins RC:$src1, i32u8imm:$src2),
                    !strconcat(OpcodeStr,
                               "pd\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
                    [(set RC:$dst, (V2F64Int RC:$src1, imm:$src2))],
                    IIC_SSE_ROUNDPS_REG>, Sched<[WriteFAdd]>;

  // Vector intrinsic operation, mem
  def PDm : SS4AIi8<opcpd, MRMSrcMem,
                    (outs RC:$dst), (ins x86memop:$src1, i32u8imm:$src2),
                    !strconcat(OpcodeStr,
                               "pd\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
                    [(set RC:$dst,
                      (V2F64Int (mem_frag64 addr:$src1),imm:$src2))],
                    IIC_SSE_ROUNDPS_REG>, Sched<[WriteFAddLd]>;
} // ExeDomain = SSEPackedDouble
}

multiclass sse41_fp_binop_rm<bits<8> opcss, bits<8> opcsd,
                             string OpcodeStr,
                             Intrinsic F32Int,
                             Intrinsic F64Int, bit Is2Addr = 1> {
let ExeDomain = GenericDomain in {
  // Operation, reg.
  let hasSideEffects = 0 in
  def SSr : SS4AIi8<opcss, MRMSrcReg,
      (outs FR32:$dst), (ins FR32:$src1, FR32:$src2, i32u8imm:$src3),
      !if(Is2Addr,
          !strconcat(OpcodeStr,
              "ss\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
          !strconcat(OpcodeStr,
              "ss\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
      []>, Sched<[WriteFAdd]>;

  // Intrinsic operation, reg.
  let isCodeGenOnly = 1 in
  def SSr_Int : SS4AIi8<opcss, MRMSrcReg,
      (outs VR128:$dst), (ins VR128:$src1, VR128:$src2, i32u8imm:$src3),
      !if(Is2Addr,
          !strconcat(OpcodeStr,
              "ss\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
          !strconcat(OpcodeStr,
              "ss\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
      [(set VR128:$dst, (F32Int VR128:$src1, VR128:$src2, imm:$src3))]>,
      Sched<[WriteFAdd]>;

  // Intrinsic operation, mem.
  def SSm : SS4AIi8<opcss, MRMSrcMem,
      (outs VR128:$dst), (ins VR128:$src1, ssmem:$src2, i32u8imm:$src3),
      !if(Is2Addr,
          !strconcat(OpcodeStr,
              "ss\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
          !strconcat(OpcodeStr,
              "ss\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
      [(set VR128:$dst,
            (F32Int VR128:$src1, sse_load_f32:$src2, imm:$src3))]>,
      Sched<[WriteFAddLd, ReadAfterLd]>;

  // Operation, reg.
  let hasSideEffects = 0 in
  def SDr : SS4AIi8<opcsd, MRMSrcReg,
      (outs FR64:$dst), (ins FR64:$src1, FR64:$src2, i32u8imm:$src3),
      !if(Is2Addr,
          !strconcat(OpcodeStr,
              "sd\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
          !strconcat(OpcodeStr,
              "sd\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
      []>, Sched<[WriteFAdd]>;

  // Intrinsic operation, reg.
  let isCodeGenOnly = 1 in
  def SDr_Int : SS4AIi8<opcsd, MRMSrcReg,
      (outs VR128:$dst), (ins VR128:$src1, VR128:$src2, i32u8imm:$src3),
      !if(Is2Addr,
          !strconcat(OpcodeStr,
              "sd\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
          !strconcat(OpcodeStr,
              "sd\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
      [(set VR128:$dst, (F64Int VR128:$src1, VR128:$src2, imm:$src3))]>,
      Sched<[WriteFAdd]>;

  // Intrinsic operation, mem.
  def SDm : SS4AIi8<opcsd, MRMSrcMem,
      (outs VR128:$dst), (ins VR128:$src1, sdmem:$src2, i32u8imm:$src3),
      !if(Is2Addr,
          !strconcat(OpcodeStr,
              "sd\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
          !strconcat(OpcodeStr,
              "sd\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
      [(set VR128:$dst,
            (F64Int VR128:$src1, sse_load_f64:$src2, imm:$src3))]>,
      Sched<[WriteFAddLd, ReadAfterLd]>;
} // ExeDomain = GenericDomain
}

// FP round - roundss, roundps, roundsd, roundpd
let Predicates = [HasAVX] in {
  // Intrinsic form
  defm VROUND  : sse41_fp_unop_rm<0x08, 0x09, "vround", f128mem, VR128,
                                  loadv4f32, loadv2f64,
                                  int_x86_sse41_round_ps,
                                  int_x86_sse41_round_pd>, VEX;
  defm VROUNDY : sse41_fp_unop_rm<0x08, 0x09, "vround", f256mem, VR256,
                                  loadv8f32, loadv4f64,
                                  int_x86_avx_round_ps_256,
                                  int_x86_avx_round_pd_256>, VEX, VEX_L;
  defm VROUND  : sse41_fp_binop_rm<0x0A, 0x0B, "vround",
                                   int_x86_sse41_round_ss,
                                   int_x86_sse41_round_sd, 0>, VEX_4V, VEX_LIG;
}

let Predicates = [UseAVX] in {
  def : Pat<(ffloor FR32:$src),
            (VROUNDSSr (f32 (IMPLICIT_DEF)), FR32:$src, (i32 0x9))>;
  def : Pat<(f64 (ffloor FR64:$src)),
            (VROUNDSDr (f64 (IMPLICIT_DEF)), FR64:$src, (i32 0x9))>;
  def : Pat<(f32 (fnearbyint FR32:$src)),
            (VROUNDSSr (f32 (IMPLICIT_DEF)), FR32:$src, (i32 0xC))>;
  def : Pat<(f64 (fnearbyint FR64:$src)),
            (VROUNDSDr (f64 (IMPLICIT_DEF)), FR64:$src, (i32 0xC))>;
  def : Pat<(f32 (fceil FR32:$src)),
            (VROUNDSSr (f32 (IMPLICIT_DEF)), FR32:$src, (i32 0xA))>;
  def : Pat<(f64 (fceil FR64:$src)),
            (VROUNDSDr (f64 (IMPLICIT_DEF)), FR64:$src, (i32 0xA))>;
  def : Pat<(f32 (frint FR32:$src)),
            (VROUNDSSr (f32 (IMPLICIT_DEF)), FR32:$src, (i32 0x4))>;
  def : Pat<(f64 (frint FR64:$src)),
            (VROUNDSDr (f64 (IMPLICIT_DEF)), FR64:$src, (i32 0x4))>;
  def : Pat<(f32 (ftrunc FR32:$src)),
            (VROUNDSSr (f32 (IMPLICIT_DEF)), FR32:$src, (i32 0xB))>;
  def : Pat<(f64 (ftrunc FR64:$src)),
            (VROUNDSDr (f64 (IMPLICIT_DEF)), FR64:$src, (i32 0xB))>;
}
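
// Illustrative sketch only (kept in a comment so it is not parsed as
// TableGen): the rounding immediates above encode the mode in bits 1:0 and
// "suppress precision exceptions" in bit 3, so 0x9 = toward -inf, no
// exception. Assuming C with <smmintrin.h>:
//
//   #include <smmintrin.h>
//   __m128 floor_example(__m128 x) {
//     // 0x01 | 0x08 == 0x9, the same immediate used by the ffloor patterns
//     return _mm_round_ps(x, _MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC);
//   }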
let Predicates = [HasAVX] in {
  def : Pat<(v4f32 (ffloor VR128:$src)),
            (VROUNDPSr VR128:$src, (i32 0x9))>;
  def : Pat<(v4f32 (fnearbyint VR128:$src)),
            (VROUNDPSr VR128:$src, (i32 0xC))>;
  def : Pat<(v4f32 (fceil VR128:$src)),
            (VROUNDPSr VR128:$src, (i32 0xA))>;
  def : Pat<(v4f32 (frint VR128:$src)),
            (VROUNDPSr VR128:$src, (i32 0x4))>;
  def : Pat<(v4f32 (ftrunc VR128:$src)),
            (VROUNDPSr VR128:$src, (i32 0xB))>;

  def : Pat<(v2f64 (ffloor VR128:$src)),
            (VROUNDPDr VR128:$src, (i32 0x9))>;
  def : Pat<(v2f64 (fnearbyint VR128:$src)),
            (VROUNDPDr VR128:$src, (i32 0xC))>;
  def : Pat<(v2f64 (fceil VR128:$src)),
            (VROUNDPDr VR128:$src, (i32 0xA))>;
  def : Pat<(v2f64 (frint VR128:$src)),
            (VROUNDPDr VR128:$src, (i32 0x4))>;
  def : Pat<(v2f64 (ftrunc VR128:$src)),
            (VROUNDPDr VR128:$src, (i32 0xB))>;

  def : Pat<(v8f32 (ffloor VR256:$src)),
            (VROUNDYPSr VR256:$src, (i32 0x9))>;
  def : Pat<(v8f32 (fnearbyint VR256:$src)),
            (VROUNDYPSr VR256:$src, (i32 0xC))>;
  def : Pat<(v8f32 (fceil VR256:$src)),
            (VROUNDYPSr VR256:$src, (i32 0xA))>;
  def : Pat<(v8f32 (frint VR256:$src)),
            (VROUNDYPSr VR256:$src, (i32 0x4))>;
  def : Pat<(v8f32 (ftrunc VR256:$src)),
            (VROUNDYPSr VR256:$src, (i32 0xB))>;

  def : Pat<(v4f64 (ffloor VR256:$src)),
            (VROUNDYPDr VR256:$src, (i32 0x9))>;
  def : Pat<(v4f64 (fnearbyint VR256:$src)),
            (VROUNDYPDr VR256:$src, (i32 0xC))>;
  def : Pat<(v4f64 (fceil VR256:$src)),
            (VROUNDYPDr VR256:$src, (i32 0xA))>;
  def : Pat<(v4f64 (frint VR256:$src)),
            (VROUNDYPDr VR256:$src, (i32 0x4))>;
  def : Pat<(v4f64 (ftrunc VR256:$src)),
            (VROUNDYPDr VR256:$src, (i32 0xB))>;
}

defm ROUND  : sse41_fp_unop_rm<0x08, 0x09, "round", f128mem, VR128,
                               memopv4f32, memopv2f64,
                               int_x86_sse41_round_ps, int_x86_sse41_round_pd>;
let Constraints = "$src1 = $dst" in
defm ROUND  : sse41_fp_binop_rm<0x0A, 0x0B, "round",
                                int_x86_sse41_round_ss, int_x86_sse41_round_sd>;

let Predicates = [UseSSE41] in {
  def : Pat<(ffloor FR32:$src),
            (ROUNDSSr (f32 (IMPLICIT_DEF)), FR32:$src, (i32 0x9))>;
  def : Pat<(f64 (ffloor FR64:$src)),
            (ROUNDSDr (f64 (IMPLICIT_DEF)), FR64:$src, (i32 0x9))>;
  def : Pat<(f32 (fnearbyint FR32:$src)),
            (ROUNDSSr (f32 (IMPLICIT_DEF)), FR32:$src, (i32 0xC))>;
  def : Pat<(f64 (fnearbyint FR64:$src)),
            (ROUNDSDr (f64 (IMPLICIT_DEF)), FR64:$src, (i32 0xC))>;
  def : Pat<(f32 (fceil FR32:$src)),
            (ROUNDSSr (f32 (IMPLICIT_DEF)), FR32:$src, (i32 0xA))>;
  def : Pat<(f64 (fceil FR64:$src)),
            (ROUNDSDr (f64 (IMPLICIT_DEF)), FR64:$src, (i32 0xA))>;
  def : Pat<(f32 (frint FR32:$src)),
            (ROUNDSSr (f32 (IMPLICIT_DEF)), FR32:$src, (i32 0x4))>;
  def : Pat<(f64 (frint FR64:$src)),
            (ROUNDSDr (f64 (IMPLICIT_DEF)), FR64:$src, (i32 0x4))>;
  def : Pat<(f32 (ftrunc FR32:$src)),
            (ROUNDSSr (f32 (IMPLICIT_DEF)), FR32:$src, (i32 0xB))>;
  def : Pat<(f64 (ftrunc FR64:$src)),
            (ROUNDSDr (f64 (IMPLICIT_DEF)), FR64:$src, (i32 0xB))>;

  def : Pat<(v4f32 (ffloor VR128:$src)),
            (ROUNDPSr VR128:$src, (i32 0x9))>;
  def : Pat<(v4f32 (fnearbyint VR128:$src)),
            (ROUNDPSr VR128:$src, (i32 0xC))>;
  def : Pat<(v4f32 (fceil VR128:$src)),
            (ROUNDPSr VR128:$src, (i32 0xA))>;
  def : Pat<(v4f32 (frint VR128:$src)),
            (ROUNDPSr VR128:$src, (i32 0x4))>;
  def : Pat<(v4f32 (ftrunc VR128:$src)),
            (ROUNDPSr VR128:$src, (i32 0xB))>;

  def : Pat<(v2f64 (ffloor VR128:$src)),
            (ROUNDPDr VR128:$src, (i32 0x9))>;
  def : Pat<(v2f64 (fnearbyint VR128:$src)),
            (ROUNDPDr VR128:$src, (i32 0xC))>;
  def : Pat<(v2f64 (fceil VR128:$src)),
            (ROUNDPDr VR128:$src, (i32 0xA))>;
  def : Pat<(v2f64 (frint VR128:$src)),
            (ROUNDPDr VR128:$src, (i32 0x4))>;
  def : Pat<(v2f64 (ftrunc VR128:$src)),
            (ROUNDPDr VR128:$src, (i32 0xB))>;
}

//===----------------------------------------------------------------------===//
// SSE4.1 - Packed Bit Test
//===----------------------------------------------------------------------===//

// ptest instruction: we'll lower to this in X86ISelLowering, primarily from
// the Intel intrinsic that corresponds to it.
let Defs = [EFLAGS], Predicates = [HasAVX] in {
def VPTESTrr  : SS48I<0x17, MRMSrcReg, (outs), (ins VR128:$src1, VR128:$src2),
                "vptest\t{$src2, $src1|$src1, $src2}",
                [(set EFLAGS, (X86ptest VR128:$src1, (v2i64 VR128:$src2)))]>,
                Sched<[WriteVecLogic]>, VEX;
def VPTESTrm  : SS48I<0x17, MRMSrcMem, (outs), (ins VR128:$src1, f128mem:$src2),
                "vptest\t{$src2, $src1|$src1, $src2}",
                [(set EFLAGS,(X86ptest VR128:$src1, (loadv2i64 addr:$src2)))]>,
                Sched<[WriteVecLogicLd, ReadAfterLd]>, VEX;

def VPTESTYrr : SS48I<0x17, MRMSrcReg, (outs), (ins VR256:$src1, VR256:$src2),
                "vptest\t{$src2, $src1|$src1, $src2}",
                [(set EFLAGS, (X86ptest VR256:$src1, (v4i64 VR256:$src2)))]>,
                Sched<[WriteVecLogic]>, VEX, VEX_L;
def VPTESTYrm : SS48I<0x17, MRMSrcMem, (outs), (ins VR256:$src1, i256mem:$src2),
                "vptest\t{$src2, $src1|$src1, $src2}",
                [(set EFLAGS,(X86ptest VR256:$src1, (loadv4i64 addr:$src2)))]>,
                Sched<[WriteVecLogicLd, ReadAfterLd]>, VEX, VEX_L;
}

let Defs = [EFLAGS] in {
def PTESTrr : SS48I<0x17, MRMSrcReg, (outs), (ins VR128:$src1, VR128:$src2),
              "ptest\t{$src2, $src1|$src1, $src2}",
              [(set EFLAGS, (X86ptest VR128:$src1, (v2i64 VR128:$src2)))]>,
              Sched<[WriteVecLogic]>;
def PTESTrm : SS48I<0x17, MRMSrcMem, (outs), (ins VR128:$src1, f128mem:$src2),
              "ptest\t{$src2, $src1|$src1, $src2}",
              [(set EFLAGS, (X86ptest VR128:$src1, (memopv2i64 addr:$src2)))]>,
              Sched<[WriteVecLogicLd, ReadAfterLd]>;
}

// The bit test instructions below are AVX only
multiclass avx_bittest<bits<8> opc, string OpcodeStr, RegisterClass RC,
                       X86MemOperand x86memop, PatFrag mem_frag, ValueType vt> {
  def rr : SS48I<opc, MRMSrcReg, (outs), (ins RC:$src1, RC:$src2),
            !strconcat(OpcodeStr, "\t{$src2, $src1|$src1, $src2}"),
            [(set EFLAGS, (X86testp RC:$src1, (vt RC:$src2)))]>,
            Sched<[WriteVecLogic]>, VEX;
  def rm : SS48I<opc, MRMSrcMem, (outs), (ins RC:$src1, x86memop:$src2),
            !strconcat(OpcodeStr, "\t{$src2, $src1|$src1, $src2}"),
            [(set EFLAGS, (X86testp RC:$src1, (mem_frag addr:$src2)))]>,
            Sched<[WriteVecLogicLd, ReadAfterLd]>, VEX;
}

let Defs = [EFLAGS], Predicates = [HasAVX] in {
let ExeDomain = SSEPackedSingle in {
defm VTESTPS  : avx_bittest<0x0E, "vtestps", VR128, f128mem, loadv4f32, v4f32>;
defm VTESTPSY : avx_bittest<0x0E, "vtestps", VR256, f256mem, loadv8f32, v8f32>,
                VEX_L;
}
let ExeDomain = SSEPackedDouble in {
defm VTESTPD  : avx_bittest<0x0F, "vtestpd", VR128, f128mem, loadv2f64, v2f64>;
defm VTESTPDY : avx_bittest<0x0F, "vtestpd", VR256, f256mem, loadv4f64, v4f64>,
                VEX_L;
}
}
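
// Illustrative sketch only (kept in a comment so it is not parsed as
// TableGen): assuming C with <smmintrin.h>, PTEST sets ZF from (a AND b)
// and the intrinsic reads the flag back:
//
//   #include <smmintrin.h>
//   int all_masked_bits_zero(__m128i a, __m128i b) {
//     return _mm_testz_si128(a, b);   // 1 iff (a & b) == 0, via X86ptest
//   }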

//===----------------------------------------------------------------------===//
// SSE4.1 - Misc Instructions
//===----------------------------------------------------------------------===//

let Defs = [EFLAGS], Predicates = [HasPOPCNT] in {
  def POPCNT16rr : I<0xB8, MRMSrcReg, (outs GR16:$dst), (ins GR16:$src),
                     "popcnt{w}\t{$src, $dst|$dst, $src}",
                     [(set GR16:$dst, (ctpop GR16:$src)), (implicit EFLAGS)],
                     IIC_SSE_POPCNT_RR>, Sched<[WriteFAdd]>,
                     OpSize16, XS;
  def POPCNT16rm : I<0xB8, MRMSrcMem, (outs GR16:$dst), (ins i16mem:$src),
                     "popcnt{w}\t{$src, $dst|$dst, $src}",
                     [(set GR16:$dst, (ctpop (loadi16 addr:$src))),
                      (implicit EFLAGS)], IIC_SSE_POPCNT_RM>,
                     Sched<[WriteFAddLd]>, OpSize16, XS;

  def POPCNT32rr : I<0xB8, MRMSrcReg, (outs GR32:$dst), (ins GR32:$src),
                     "popcnt{l}\t{$src, $dst|$dst, $src}",
                     [(set GR32:$dst, (ctpop GR32:$src)), (implicit EFLAGS)],
                     IIC_SSE_POPCNT_RR>, Sched<[WriteFAdd]>,
                     OpSize32, XS;

  def POPCNT32rm : I<0xB8, MRMSrcMem, (outs GR32:$dst), (ins i32mem:$src),
                     "popcnt{l}\t{$src, $dst|$dst, $src}",
                     [(set GR32:$dst, (ctpop (loadi32 addr:$src))),
                      (implicit EFLAGS)], IIC_SSE_POPCNT_RM>,
                     Sched<[WriteFAddLd]>, OpSize32, XS;

  def POPCNT64rr : RI<0xB8, MRMSrcReg, (outs GR64:$dst), (ins GR64:$src),
                      "popcnt{q}\t{$src, $dst|$dst, $src}",
                      [(set GR64:$dst, (ctpop GR64:$src)), (implicit EFLAGS)],
                      IIC_SSE_POPCNT_RR>, Sched<[WriteFAdd]>, XS;
  def POPCNT64rm : RI<0xB8, MRMSrcMem, (outs GR64:$dst), (ins i64mem:$src),
                      "popcnt{q}\t{$src, $dst|$dst, $src}",
                      [(set GR64:$dst, (ctpop (loadi64 addr:$src))),
                       (implicit EFLAGS)], IIC_SSE_POPCNT_RM>,
                      Sched<[WriteFAddLd]>, XS;
}

// SS41I_unop_rm_int_v16 - SSE 4.1 unary operator whose type is v8i16.
multiclass SS41I_unop_rm_int_v16<bits<8> opc, string OpcodeStr,
                                 Intrinsic IntId128, PatFrag ld_frag,
                                 X86FoldableSchedWrite Sched> {
  def rr128 : SS48I<opc, MRMSrcReg, (outs VR128:$dst),
                    (ins VR128:$src),
                    !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
                    [(set VR128:$dst, (IntId128 VR128:$src))]>,
                    Sched<[Sched]>;
  def rm128 : SS48I<opc, MRMSrcMem, (outs VR128:$dst),
                    (ins i128mem:$src),
                    !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
                    [(set VR128:$dst,
                      (IntId128 (bitconvert (ld_frag addr:$src))))]>,
                    Sched<[Sched.Folded]>;
}

// PHMIN has the same profile as PSAD, thus we use the same scheduling
// model, although the naming is misleading.
let Predicates = [HasAVX] in
defm VPHMINPOSUW : SS41I_unop_rm_int_v16<0x41, "vphminposuw",
                                         int_x86_sse41_phminposuw, loadv2i64,
                                         WriteVecIMul>, VEX;
defm PHMINPOSUW : SS41I_unop_rm_int_v16<0x41, "phminposuw",
                                        int_x86_sse41_phminposuw, memopv2i64,
                                        WriteVecIMul>;
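
// Note: phminposuw finds the minimum unsigned 16-bit element of the source;
// the minimum value is returned in word 0 of the destination, its index in
// bits 18:16, and the remaining destination bits are zeroed.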

/// SS48I_binop_rm - Simple SSE41 binary operator.
multiclass SS48I_binop_rm<bits<8> opc, string OpcodeStr, SDNode OpNode,
                          ValueType OpVT, RegisterClass RC, PatFrag memop_frag,
                          X86MemOperand x86memop, bit Is2Addr = 1,
                          OpndItins itins = SSE_INTALU_ITINS_P> {
  let isCommutable = 1 in
  def rr : SS48I<opc, MRMSrcReg, (outs RC:$dst),
       (ins RC:$src1, RC:$src2),
       !if(Is2Addr,
           !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
           !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
       [(set RC:$dst, (OpVT (OpNode RC:$src1, RC:$src2)))]>,
       Sched<[itins.Sched]>;
  def rm : SS48I<opc, MRMSrcMem, (outs RC:$dst),
       (ins RC:$src1, x86memop:$src2),
       !if(Is2Addr,
           !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
           !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
       [(set RC:$dst,
         (OpVT (OpNode RC:$src1, (bitconvert (memop_frag addr:$src2)))))]>,
       Sched<[itins.Sched.Folded, ReadAfterLd]>;
}

/// SS48I_binop_rm2 - Simple SSE41 binary operator with different src and dst
/// types.
multiclass SS48I_binop_rm2<bits<8> opc, string OpcodeStr, SDNode OpNode,
                           ValueType DstVT, ValueType SrcVT, RegisterClass RC,
                           PatFrag memop_frag, X86MemOperand x86memop,
                           OpndItins itins,
                           bit IsCommutable = 0, bit Is2Addr = 1> {
  let isCommutable = IsCommutable in
  def rr : SS48I<opc, MRMSrcReg, (outs RC:$dst),
       (ins RC:$src1, RC:$src2),
       !if(Is2Addr,
           !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
           !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
       [(set RC:$dst, (DstVT (OpNode (SrcVT RC:$src1), RC:$src2)))]>,
       Sched<[itins.Sched]>;
  def rm : SS48I<opc, MRMSrcMem, (outs RC:$dst),
       (ins RC:$src1, x86memop:$src2),
       !if(Is2Addr,
           !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
           !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
       [(set RC:$dst, (DstVT (OpNode (SrcVT RC:$src1),
                                     (bitconvert (memop_frag addr:$src2)))))]>,
       Sched<[itins.Sched.Folded, ReadAfterLd]>;
}

let Predicates = [HasAVX, NoVLX] in {
  defm VPMINSB : SS48I_binop_rm<0x38, "vpminsb", smin, v16i8, VR128,
                                loadv2i64, i128mem, 0, SSE_INTALU_ITINS_P>,
                                VEX_4V;
  defm VPMINSD : SS48I_binop_rm<0x39, "vpminsd", smin, v4i32, VR128,
                                loadv2i64, i128mem, 0, SSE_INTALU_ITINS_P>,
                                VEX_4V;
  defm VPMINUD : SS48I_binop_rm<0x3B, "vpminud", umin, v4i32, VR128,
                                loadv2i64, i128mem, 0, SSE_INTALU_ITINS_P>,
                                VEX_4V;
  defm VPMINUW : SS48I_binop_rm<0x3A, "vpminuw", umin, v8i16, VR128,
                                loadv2i64, i128mem, 0, SSE_INTALU_ITINS_P>,
                                VEX_4V;
  defm VPMAXSB : SS48I_binop_rm<0x3C, "vpmaxsb", smax, v16i8, VR128,
                                loadv2i64, i128mem, 0, SSE_INTALU_ITINS_P>,
                                VEX_4V;
  defm VPMAXSD : SS48I_binop_rm<0x3D, "vpmaxsd", smax, v4i32, VR128,
                                loadv2i64, i128mem, 0, SSE_INTALU_ITINS_P>,
                                VEX_4V;
  defm VPMAXUD : SS48I_binop_rm<0x3F, "vpmaxud", umax, v4i32, VR128,
                                loadv2i64, i128mem, 0, SSE_INTALU_ITINS_P>,
                                VEX_4V;
  defm VPMAXUW : SS48I_binop_rm<0x3E, "vpmaxuw", umax, v8i16, VR128,
                                loadv2i64, i128mem, 0, SSE_INTALU_ITINS_P>,
                                VEX_4V;
  defm VPMULDQ : SS48I_binop_rm2<0x28, "vpmuldq", X86pmuldq, v2i64, v4i32,
                                 VR128, loadv2i64, i128mem,
                                 SSE_INTMUL_ITINS_P, 1, 0>, VEX_4V;
}
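
// Note: pmuldq reads only the even (0 and 2) 32-bit elements of each source
// and produces full signed 64-bit products, which is why it needs the
// SS48I_binop_rm2 multiclass with distinct source and destination types
// (v4i32 -> v2i64).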

let Predicates = [HasAVX2, NoVLX] in {
  defm VPMINSBY : SS48I_binop_rm<0x38, "vpminsb", smin, v32i8, VR256,
                                 loadv4i64, i256mem, 0, SSE_INTALU_ITINS_P>,
                                 VEX_4V, VEX_L;
  defm VPMINSDY : SS48I_binop_rm<0x39, "vpminsd", smin, v8i32, VR256,
                                 loadv4i64, i256mem, 0, SSE_INTALU_ITINS_P>,
                                 VEX_4V, VEX_L;
  defm VPMINUDY : SS48I_binop_rm<0x3B, "vpminud", umin, v8i32, VR256,
                                 loadv4i64, i256mem, 0, SSE_INTALU_ITINS_P>,
                                 VEX_4V, VEX_L;
  defm VPMINUWY : SS48I_binop_rm<0x3A, "vpminuw", umin, v16i16, VR256,
                                 loadv4i64, i256mem, 0, SSE_INTALU_ITINS_P>,
                                 VEX_4V, VEX_L;
  defm VPMAXSBY : SS48I_binop_rm<0x3C, "vpmaxsb", smax, v32i8, VR256,
                                 loadv4i64, i256mem, 0, SSE_INTALU_ITINS_P>,
                                 VEX_4V, VEX_L;
  defm VPMAXSDY : SS48I_binop_rm<0x3D, "vpmaxsd", smax, v8i32, VR256,
                                 loadv4i64, i256mem, 0, SSE_INTALU_ITINS_P>,
                                 VEX_4V, VEX_L;
  defm VPMAXUDY : SS48I_binop_rm<0x3F, "vpmaxud", umax, v8i32, VR256,
                                 loadv4i64, i256mem, 0, SSE_INTALU_ITINS_P>,
                                 VEX_4V, VEX_L;
  defm VPMAXUWY : SS48I_binop_rm<0x3E, "vpmaxuw", umax, v16i16, VR256,
                                 loadv4i64, i256mem, 0, SSE_INTALU_ITINS_P>,
                                 VEX_4V, VEX_L;
  defm VPMULDQY : SS48I_binop_rm2<0x28, "vpmuldq", X86pmuldq, v4i64, v8i32,
                                  VR256, loadv4i64, i256mem,
                                  SSE_INTMUL_ITINS_P, 1, 0>, VEX_4V, VEX_L;
}

let Constraints = "$src1 = $dst" in {
  defm PMINSB : SS48I_binop_rm<0x38, "pminsb", smin, v16i8, VR128,
                               memopv2i64, i128mem, 1, SSE_INTALU_ITINS_P>;
  defm PMINSD : SS48I_binop_rm<0x39, "pminsd", smin, v4i32, VR128,
                               memopv2i64, i128mem, 1, SSE_INTALU_ITINS_P>;
  defm PMINUD : SS48I_binop_rm<0x3B, "pminud", umin, v4i32, VR128,
                               memopv2i64, i128mem, 1, SSE_INTALU_ITINS_P>;
  defm PMINUW : SS48I_binop_rm<0x3A, "pminuw", umin, v8i16, VR128,
                               memopv2i64, i128mem, 1, SSE_INTALU_ITINS_P>;
  defm PMAXSB : SS48I_binop_rm<0x3C, "pmaxsb", smax, v16i8, VR128,
                               memopv2i64, i128mem, 1, SSE_INTALU_ITINS_P>;
  defm PMAXSD : SS48I_binop_rm<0x3D, "pmaxsd", smax, v4i32, VR128,
                               memopv2i64, i128mem, 1, SSE_INTALU_ITINS_P>;
  defm PMAXUD : SS48I_binop_rm<0x3F, "pmaxud", umax, v4i32, VR128,
                               memopv2i64, i128mem, 1, SSE_INTALU_ITINS_P>;
  defm PMAXUW : SS48I_binop_rm<0x3E, "pmaxuw", umax, v8i16, VR128,
                               memopv2i64, i128mem, 1, SSE_INTALU_ITINS_P>;
  defm PMULDQ : SS48I_binop_rm2<0x28, "pmuldq", X86pmuldq, v2i64, v4i32,
                                VR128, memopv2i64, i128mem,
                                SSE_INTMUL_ITINS_P, 1>;
}

// Use the unaligned loadv2i64 fragments here: VEX-encoded memory operands do
// not require 16-byte alignment, matching the other HasAVX defs above.
let Predicates = [HasAVX, NoVLX] in {
  defm VPMULLD  : SS48I_binop_rm<0x40, "vpmulld", mul, v4i32, VR128,
                                 loadv2i64, i128mem, 0, SSE_PMULLD_ITINS>,
                                 VEX_4V;
  defm VPCMPEQQ : SS48I_binop_rm<0x29, "vpcmpeqq", X86pcmpeq, v2i64, VR128,
                                 loadv2i64, i128mem, 0, SSE_INTALU_ITINS_P>,
                                 VEX_4V;
}
let Predicates = [HasAVX2] in {
  defm VPMULLDY  : SS48I_binop_rm<0x40, "vpmulld", mul, v8i32, VR256,
                                  loadv4i64, i256mem, 0, SSE_PMULLD_ITINS>,
                                  VEX_4V, VEX_L;
  defm VPCMPEQQY : SS48I_binop_rm<0x29, "vpcmpeqq", X86pcmpeq, v4i64, VR256,
                                  loadv4i64, i256mem, 0, SSE_INTALU_ITINS_P>,
                                  VEX_4V, VEX_L;
}

let Constraints = "$src1 = $dst" in {
  defm PMULLD  : SS48I_binop_rm<0x40, "pmulld", mul, v4i32, VR128,
                                memopv2i64, i128mem, 1, SSE_PMULLD_ITINS>;
  defm PCMPEQQ : SS48I_binop_rm<0x29, "pcmpeqq", X86pcmpeq, v2i64, VR128,
                                memopv2i64, i128mem, 1, SSE_INTALUQ_ITINS_P>;
}
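
// Note: pmulld keeps only the low 32 bits of each 32x32 product (a plain
// 'mul'), unlike pmuldq above, which widens. It gets its own itinerary
// (SSE_PMULLD_ITINS) because it is notably slower than the other SSE4.1
// integer ALU operations on several microarchitectures.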

/// SS41I_binop_rmi_int - SSE 4.1 binary operator with 8-bit immediate
multiclass SS41I_binop_rmi_int<bits<8> opc, string OpcodeStr,
                 Intrinsic IntId, RegisterClass RC, PatFrag memop_frag,
                 X86MemOperand x86memop, bit Is2Addr = 1,
                 OpndItins itins = DEFAULT_ITINS> {
  let isCommutable = 1 in
  def rri : SS4AIi8<opc, MRMSrcReg, (outs RC:$dst),
        (ins RC:$src1, RC:$src2, u8imm:$src3),
        !if(Is2Addr,
            !strconcat(OpcodeStr,
                "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
            !strconcat(OpcodeStr,
                "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
        [(set RC:$dst, (IntId RC:$src1, RC:$src2, imm:$src3))], itins.rr>,
        Sched<[itins.Sched]>;
  def rmi : SS4AIi8<opc, MRMSrcMem, (outs RC:$dst),
        (ins RC:$src1, x86memop:$src2, u8imm:$src3),
        !if(Is2Addr,
            !strconcat(OpcodeStr,
                "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
            !strconcat(OpcodeStr,
                "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
        [(set RC:$dst,
          (IntId RC:$src1,
           (bitconvert (memop_frag addr:$src2)), imm:$src3))], itins.rm>,
        Sched<[itins.Sched.Folded, ReadAfterLd]>;
}

/// SS41I_binop_rmi - SSE 4.1 binary operator with 8-bit immediate
multiclass SS41I_binop_rmi<bits<8> opc, string OpcodeStr, SDNode OpNode,
                           ValueType OpVT, RegisterClass RC, PatFrag memop_frag,
                           X86MemOperand x86memop, bit Is2Addr = 1,
                           OpndItins itins = DEFAULT_ITINS> {
  let isCommutable = 1 in
  def rri : SS4AIi8<opc, MRMSrcReg, (outs RC:$dst),
        (ins RC:$src1, RC:$src2, u8imm:$src3),
        !if(Is2Addr,
            !strconcat(OpcodeStr,
                "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
            !strconcat(OpcodeStr,
                "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
        [(set RC:$dst, (OpVT (OpNode RC:$src1, RC:$src2, imm:$src3)))],
        itins.rr>, Sched<[itins.Sched]>;
  def rmi : SS4AIi8<opc, MRMSrcMem, (outs RC:$dst),
        (ins RC:$src1, x86memop:$src2, u8imm:$src3),
        !if(Is2Addr,
            !strconcat(OpcodeStr,
                "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
            !strconcat(OpcodeStr,
                "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
        [(set RC:$dst,
          (OpVT (OpNode RC:$src1,
                 (bitconvert (memop_frag addr:$src2)), imm:$src3)))], itins.rm>,
        Sched<[itins.Sched.Folded, ReadAfterLd]>;
}

let Predicates = [HasAVX] in {
  let isCommutable = 0 in {
    defm VMPSADBW : SS41I_binop_rmi_int<0x42, "vmpsadbw", int_x86_sse41_mpsadbw,
                                        VR128, loadv2i64, i128mem, 0,
                                        DEFAULT_ITINS_MPSADSCHED>, VEX_4V;
  }

  let ExeDomain = SSEPackedSingle in {
    defm VBLENDPS  : SS41I_binop_rmi<0x0C, "vblendps", X86Blendi, v4f32,
                                     VR128, loadv4f32, f128mem, 0,
                                     DEFAULT_ITINS_FBLENDSCHED>, VEX_4V;
    defm VBLENDPSY : SS41I_binop_rmi<0x0C, "vblendps", X86Blendi, v8f32,
                                     VR256, loadv8f32, f256mem, 0,
                                     DEFAULT_ITINS_FBLENDSCHED>, VEX_4V, VEX_L;
  }
  let ExeDomain = SSEPackedDouble in {
    defm VBLENDPD  : SS41I_binop_rmi<0x0D, "vblendpd", X86Blendi, v2f64,
                                     VR128, loadv2f64, f128mem, 0,
                                     DEFAULT_ITINS_FBLENDSCHED>, VEX_4V;
    defm VBLENDPDY : SS41I_binop_rmi<0x0D, "vblendpd", X86Blendi, v4f64,
                                     VR256, loadv4f64, f256mem, 0,
                                     DEFAULT_ITINS_FBLENDSCHED>, VEX_4V, VEX_L;
  }
  defm VPBLENDW : SS41I_binop_rmi<0x0E, "vpblendw", X86Blendi, v8i16,
                                  VR128, loadv2i64, i128mem, 0,
                                  DEFAULT_ITINS_BLENDSCHED>, VEX_4V;

  let ExeDomain = SSEPackedSingle in
  defm VDPPS : SS41I_binop_rmi_int<0x40, "vdpps", int_x86_sse41_dpps,
                                   VR128, loadv4f32, f128mem, 0,
                                   SSE_DPPS_ITINS>, VEX_4V;
  let ExeDomain = SSEPackedDouble in
  defm VDPPD : SS41I_binop_rmi_int<0x41, "vdppd", int_x86_sse41_dppd,
                                   VR128, loadv2f64, f128mem, 0,
                                   SSE_DPPS_ITINS>, VEX_4V;
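
  // Note on the immediates above: for blendps/blendpd/pblendw, bit i of the
  // immediate selects element i from the second source; otherwise the first
  // source is passed through. For dpps/dppd, the high nibble of the immediate
  // selects which input elements participate in the dot product and the low
  // nibble selects which destination elements receive the result (dppd uses
  // only the low two bits of each nibble).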

  let ExeDomain = SSEPackedSingle in
  defm VDPPSY : SS41I_binop_rmi_int<0x40, "vdpps", int_x86_avx_dp_ps_256,
                                    VR256, loadv8f32, i256mem, 0,
                                    SSE_DPPS_ITINS>, VEX_4V, VEX_L;
}

let Predicates = [HasAVX2] in {
  let isCommutable = 0 in {
    defm VMPSADBWY : SS41I_binop_rmi_int<0x42, "vmpsadbw", int_x86_avx2_mpsadbw,
                                         VR256, loadv4i64, i256mem, 0,
                                         DEFAULT_ITINS_MPSADSCHED>,
                                         VEX_4V, VEX_L;
  }
  defm VPBLENDWY : SS41I_binop_rmi<0x0E, "vpblendw", X86Blendi, v16i16,
                                   VR256, loadv4i64, i256mem, 0,
                                   DEFAULT_ITINS_BLENDSCHED>, VEX_4V, VEX_L;
}

let Constraints = "$src1 = $dst" in {
  let isCommutable = 0 in {
    defm MPSADBW : SS41I_binop_rmi_int<0x42, "mpsadbw", int_x86_sse41_mpsadbw,
                                       VR128, memopv2i64, i128mem,
                                       1, SSE_MPSADBW_ITINS>;
  }
  let ExeDomain = SSEPackedSingle in
  defm BLENDPS : SS41I_binop_rmi<0x0C, "blendps", X86Blendi, v4f32,
                                 VR128, memopv4f32, f128mem,
                                 1, SSE_INTALU_ITINS_FBLEND_P>;
  let ExeDomain = SSEPackedDouble in
  defm BLENDPD : SS41I_binop_rmi<0x0D, "blendpd", X86Blendi, v2f64,
                                 VR128, memopv2f64, f128mem,
                                 1, SSE_INTALU_ITINS_FBLEND_P>;
  defm PBLENDW : SS41I_binop_rmi<0x0E, "pblendw", X86Blendi, v8i16,
                                 VR128, memopv2i64, i128mem,
                                 1, SSE_INTALU_ITINS_BLEND_P>;
  let ExeDomain = SSEPackedSingle in
  defm DPPS : SS41I_binop_rmi_int<0x40, "dpps", int_x86_sse41_dpps,
                                  VR128, memopv4f32, f128mem, 1,
                                  SSE_DPPS_ITINS>;
  let ExeDomain = SSEPackedDouble in
  defm DPPD : SS41I_binop_rmi_int<0x41, "dppd", int_x86_sse41_dppd,
                                  VR128, memopv2f64, f128mem, 1,
                                  SSE_DPPD_ITINS>;
}

/// SS41I_quaternary_int_avx - AVX SSE 4.1 with 4 operands
multiclass SS41I_quaternary_int_avx<bits<8> opc, string OpcodeStr,
                                    RegisterClass RC, X86MemOperand x86memop,
                                    PatFrag mem_frag, Intrinsic IntId,
                                    X86FoldableSchedWrite Sched> {
  def rr : Ii8<opc, MRMSrcReg, (outs RC:$dst),
               (ins RC:$src1, RC:$src2, RC:$src3),
               !strconcat(OpcodeStr,
                "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
               [(set RC:$dst, (IntId RC:$src1, RC:$src2, RC:$src3))],
               NoItinerary, SSEPackedInt>, TAPD, VEX_4V, VEX_I8IMM,
               Sched<[Sched]>;

  def rm : Ii8<opc, MRMSrcMem, (outs RC:$dst),
               (ins RC:$src1, x86memop:$src2, RC:$src3),
               !strconcat(OpcodeStr,
                "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
               [(set RC:$dst,
                 (IntId RC:$src1, (bitconvert (mem_frag addr:$src2)),
                  RC:$src3))],
               NoItinerary, SSEPackedInt>, TAPD, VEX_4V, VEX_I8IMM,
               Sched<[Sched.Folded, ReadAfterLd]>;
}

let Predicates = [HasAVX] in {
let ExeDomain = SSEPackedDouble in {
defm VBLENDVPD  : SS41I_quaternary_int_avx<0x4B, "vblendvpd", VR128, f128mem,
                                           loadv2f64, int_x86_sse41_blendvpd,
                                           WriteFVarBlend>;
defm VBLENDVPDY : SS41I_quaternary_int_avx<0x4B, "vblendvpd", VR256, f256mem,
                                           loadv4f64, int_x86_avx_blendv_pd_256,
                                           WriteFVarBlend>, VEX_L;
} // ExeDomain = SSEPackedDouble
let ExeDomain = SSEPackedSingle in {
defm VBLENDVPS  : SS41I_quaternary_int_avx<0x4A, "vblendvps", VR128, f128mem,
                                           loadv4f32, int_x86_sse41_blendvps,
                                           WriteFVarBlend>;
defm VBLENDVPSY : SS41I_quaternary_int_avx<0x4A, "vblendvps", VR256, f256mem,
                                           loadv8f32, int_x86_avx_blendv_ps_256,
                                           WriteFVarBlend>, VEX_L;
} // ExeDomain = SSEPackedSingle
defm VPBLENDVB  : SS41I_quaternary_int_avx<0x4C, "vpblendvb", VR128, i128mem,
                                           loadv2i64, int_x86_sse41_pblendvb,
                                           WriteVarBlend>;
}
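
// Note: the variable blends choose per element on the sign bit of the mask
// operand: where it is set, the result element comes from the second source,
// otherwise from the first. This is why the vselect patterns below, which
// compute (mask ? src1 : src2), pass the operands as (src2, src1, mask).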

let Predicates = [HasAVX2] in {
defm VPBLENDVBY : SS41I_quaternary_int_avx<0x4C, "vpblendvb", VR256, i256mem,
                                           loadv4i64, int_x86_avx2_pblendvb,
                                           WriteVarBlend>, VEX_L;
}

let Predicates = [HasAVX] in {
  def : Pat<(v16i8 (vselect (v16i8 VR128:$mask), (v16i8 VR128:$src1),
                            (v16i8 VR128:$src2))),
            (VPBLENDVBrr VR128:$src2, VR128:$src1, VR128:$mask)>;
  def : Pat<(v4i32 (vselect (v4i32 VR128:$mask), (v4i32 VR128:$src1),
                            (v4i32 VR128:$src2))),
            (VBLENDVPSrr VR128:$src2, VR128:$src1, VR128:$mask)>;
  def : Pat<(v4f32 (vselect (v4i32 VR128:$mask), (v4f32 VR128:$src1),
                            (v4f32 VR128:$src2))),
            (VBLENDVPSrr VR128:$src2, VR128:$src1, VR128:$mask)>;
  def : Pat<(v2i64 (vselect (v2i64 VR128:$mask), (v2i64 VR128:$src1),
                            (v2i64 VR128:$src2))),
            (VBLENDVPDrr VR128:$src2, VR128:$src1, VR128:$mask)>;
  def : Pat<(v2f64 (vselect (v2i64 VR128:$mask), (v2f64 VR128:$src1),
                            (v2f64 VR128:$src2))),
            (VBLENDVPDrr VR128:$src2, VR128:$src1, VR128:$mask)>;
  def : Pat<(v8i32 (vselect (v8i32 VR256:$mask), (v8i32 VR256:$src1),
                            (v8i32 VR256:$src2))),
            (VBLENDVPSYrr VR256:$src2, VR256:$src1, VR256:$mask)>;
  def : Pat<(v8f32 (vselect (v8i32 VR256:$mask), (v8f32 VR256:$src1),
                            (v8f32 VR256:$src2))),
            (VBLENDVPSYrr VR256:$src2, VR256:$src1, VR256:$mask)>;
  def : Pat<(v4i64 (vselect (v4i64 VR256:$mask), (v4i64 VR256:$src1),
                            (v4i64 VR256:$src2))),
            (VBLENDVPDYrr VR256:$src2, VR256:$src1, VR256:$mask)>;
  def : Pat<(v4f64 (vselect (v4i64 VR256:$mask), (v4f64 VR256:$src1),
                            (v4f64 VR256:$src2))),
            (VBLENDVPDYrr VR256:$src2, VR256:$src1, VR256:$mask)>;
}

let Predicates = [HasAVX2] in {
  def : Pat<(v32i8 (vselect (v32i8 VR256:$mask), (v32i8 VR256:$src1),
                            (v32i8 VR256:$src2))),
            (VPBLENDVBYrr VR256:$src2, VR256:$src1, VR256:$mask)>;
}

// Patterns
// FIXME: Prefer a movss or movsd over a blendps when optimizing for size or
// on targets where they have equal performance. These were changed to use
// blends because blends have better throughput on SandyBridge and Haswell, but
// movs[s/d] are 1-2 byte shorter instructions.
let Predicates = [UseAVX] in {
  let AddedComplexity = 15 in {
  // Move a scalar to XMM zero-extended: zero a VR128, then do a MOVS{S,D}
  // to the lower bits.
  def : Pat<(v4f32 (X86vzmovl (v4f32 (scalar_to_vector FR32:$src)))),
            (VMOVSSrr (v4f32 (V_SET0)), FR32:$src)>;
  def : Pat<(v4f32 (X86vzmovl (v4f32 VR128:$src))),
            (VBLENDPSrri (v4f32 (V_SET0)), VR128:$src, (i8 1))>;
  def : Pat<(v4i32 (X86vzmovl (v4i32 VR128:$src))),
            (VPBLENDWrri (v4i32 (V_SET0)), VR128:$src, (i8 3))>;
  def : Pat<(v2f64 (X86vzmovl (v2f64 (scalar_to_vector FR64:$src)))),
            (VMOVSDrr (v2f64 (V_SET0)), FR64:$src)>;
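
  // Note: blending against an all-zeroes register with imm 1 (one f32/f64
  // element) or imm 3 (two i16 words, i.e. the low dword) keeps only the low
  // element of $src and zeroes the rest, which is exactly X86vzmovl.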

  // Move low f32 and clear high bits.
  def : Pat<(v8f32 (X86vzmovl (v8f32 VR256:$src))),
            (VBLENDPSYrri (v8f32 (AVX_SET0)), VR256:$src, (i8 1))>;

  // Move low f64 and clear high bits.
  def : Pat<(v4f64 (X86vzmovl (v4f64 VR256:$src))),
            (VBLENDPDYrri (v4f64 (AVX_SET0)), VR256:$src, (i8 1))>;
  }

  def : Pat<(v8f32 (X86vzmovl (insert_subvector undef,
                               (v4f32 (scalar_to_vector FR32:$src)), (iPTR 0)))),
            (SUBREG_TO_REG (i32 0),
                           (v4f32 (VMOVSSrr (v4f32 (V_SET0)), FR32:$src)),
                           sub_xmm)>;
  def : Pat<(v4f64 (X86vzmovl (insert_subvector undef,
                               (v2f64 (scalar_to_vector FR64:$src)), (iPTR 0)))),
            (SUBREG_TO_REG (i64 0),
                           (v2f64 (VMOVSDrr (v2f64 (V_SET0)), FR64:$src)),
                           sub_xmm)>;

  // These will incur an FP/int domain crossing penalty, but it may be the only
  // way without AVX2. Do not add any complexity because we may be able to match
  // more optimal patterns defined earlier in this file.
  def : Pat<(v8i32 (X86vzmovl (v8i32 VR256:$src))),
            (VBLENDPSYrri (v8i32 (AVX_SET0)), VR256:$src, (i8 1))>;
  def : Pat<(v4i64 (X86vzmovl (v4i64 VR256:$src))),
            (VBLENDPDYrri (v4i64 (AVX_SET0)), VR256:$src, (i8 1))>;
}

// FIXME: Prefer a movss or movsd over a blendps when optimizing for size or
// on targets where they have equal performance. These were changed to use
// blends because blends have better throughput on SandyBridge and Haswell, but
// movs[s/d] are 1-2 byte shorter instructions.
let Predicates = [UseSSE41] in {
  // With SSE41 we can use blends for these patterns.
  def : Pat<(v4f32 (X86vzmovl (v4f32 VR128:$src))),
            (BLENDPSrri (v4f32 (V_SET0)), VR128:$src, (i8 1))>;
  def : Pat<(v4i32 (X86vzmovl (v4i32 VR128:$src))),
            (PBLENDWrri (v4i32 (V_SET0)), VR128:$src, (i8 3))>;
  def : Pat<(v2f64 (X86vzmovl (v2f64 VR128:$src))),
            (BLENDPDrri (v2f64 (V_SET0)), VR128:$src, (i8 1))>;
}

/// SS41I_ternary_int - SSE 4.1 ternary operator
let Uses = [XMM0], Constraints = "$src1 = $dst" in {
  multiclass SS41I_ternary_int<bits<8> opc, string OpcodeStr, PatFrag mem_frag,
                               X86MemOperand x86memop, Intrinsic IntId,
                               OpndItins itins = DEFAULT_ITINS> {
    def rr0 : SS48I<opc, MRMSrcReg, (outs VR128:$dst),
                    (ins VR128:$src1, VR128:$src2),
                    !strconcat(OpcodeStr,
                     "\t{$src2, $dst|$dst, $src2}"),
                    [(set VR128:$dst, (IntId VR128:$src1, VR128:$src2, XMM0))],
                    itins.rr>, Sched<[itins.Sched]>;

    def rm0 : SS48I<opc, MRMSrcMem, (outs VR128:$dst),
                    (ins VR128:$src1, x86memop:$src2),
                    !strconcat(OpcodeStr,
                     "\t{$src2, $dst|$dst, $src2}"),
                    [(set VR128:$dst,
                      (IntId VR128:$src1,
                       (bitconvert (mem_frag addr:$src2)), XMM0))],
                    itins.rm>, Sched<[itins.Sched.Folded, ReadAfterLd]>;
  }
}

let ExeDomain = SSEPackedDouble in
defm BLENDVPD : SS41I_ternary_int<0x15, "blendvpd", memopv2f64, f128mem,
                                  int_x86_sse41_blendvpd,
                                  DEFAULT_ITINS_FBLENDSCHED>;
let ExeDomain = SSEPackedSingle in
defm BLENDVPS : SS41I_ternary_int<0x14, "blendvps", memopv4f32, f128mem,
                                  int_x86_sse41_blendvps,
                                  DEFAULT_ITINS_FBLENDSCHED>;
defm PBLENDVB : SS41I_ternary_int<0x10, "pblendvb", memopv2i64, i128mem,
                                  int_x86_sse41_pblendvb,
                                  DEFAULT_ITINS_VARBLENDSCHED>;
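
// The non-VEX blendv forms read their selector mask from xmm0 implicitly
// (modeled by Uses = [XMM0] above), so "blendvps %xmm2, %xmm1" and
// "blendvps %xmm0, %xmm2, %xmm1" assemble to the same instruction; the
// aliases below accept the explicit-%xmm0 spelling.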

// Aliases with the implicit xmm0 argument
def : InstAlias<"blendvpd\t{%xmm0, $src2, $dst|$dst, $src2, xmm0}",
                (BLENDVPDrr0 VR128:$dst, VR128:$src2)>;
def : InstAlias<"blendvpd\t{%xmm0, $src2, $dst|$dst, $src2, xmm0}",
                (BLENDVPDrm0 VR128:$dst, f128mem:$src2)>;
def : InstAlias<"blendvps\t{%xmm0, $src2, $dst|$dst, $src2, xmm0}",
                (BLENDVPSrr0 VR128:$dst, VR128:$src2)>;
def : InstAlias<"blendvps\t{%xmm0, $src2, $dst|$dst, $src2, xmm0}",
                (BLENDVPSrm0 VR128:$dst, f128mem:$src2)>;
def : InstAlias<"pblendvb\t{%xmm0, $src2, $dst|$dst, $src2, xmm0}",
                (PBLENDVBrr0 VR128:$dst, VR128:$src2)>;
def : InstAlias<"pblendvb\t{%xmm0, $src2, $dst|$dst, $src2, xmm0}",
                (PBLENDVBrm0 VR128:$dst, i128mem:$src2)>;

let Predicates = [UseSSE41] in {
  def : Pat<(v16i8 (vselect (v16i8 XMM0), (v16i8 VR128:$src1),
                            (v16i8 VR128:$src2))),
            (PBLENDVBrr0 VR128:$src2, VR128:$src1)>;
  def : Pat<(v4i32 (vselect (v4i32 XMM0), (v4i32 VR128:$src1),
                            (v4i32 VR128:$src2))),
            (BLENDVPSrr0 VR128:$src2, VR128:$src1)>;
  def : Pat<(v4f32 (vselect (v4i32 XMM0), (v4f32 VR128:$src1),
                            (v4f32 VR128:$src2))),
            (BLENDVPSrr0 VR128:$src2, VR128:$src1)>;
  def : Pat<(v2i64 (vselect (v2i64 XMM0), (v2i64 VR128:$src1),
                            (v2i64 VR128:$src2))),
            (BLENDVPDrr0 VR128:$src2, VR128:$src1)>;
  def : Pat<(v2f64 (vselect (v2i64 XMM0), (v2f64 VR128:$src1),
                            (v2f64 VR128:$src2))),
            (BLENDVPDrr0 VR128:$src2, VR128:$src1)>;
}

let SchedRW = [WriteLoad] in {
let Predicates = [HasAVX] in
def VMOVNTDQArm : SS48I<0x2A, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src),
                        "vmovntdqa\t{$src, $dst|$dst, $src}",
                        [(set VR128:$dst, (int_x86_sse41_movntdqa addr:$src))]>,
                        VEX;
let Predicates = [HasAVX2] in
def VMOVNTDQAYrm : SS48I<0x2A, MRMSrcMem, (outs VR256:$dst), (ins i256mem:$src),
                         "vmovntdqa\t{$src, $dst|$dst, $src}",
                         [(set VR256:$dst, (int_x86_avx2_movntdqa addr:$src))]>,
                         VEX, VEX_L;
def MOVNTDQArm : SS48I<0x2A, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src),
                       "movntdqa\t{$src, $dst|$dst, $src}",
                       [(set VR128:$dst, (int_x86_sse41_movntdqa addr:$src))]>;
} // SchedRW

//===----------------------------------------------------------------------===//
// SSE4.2 - Compare Instructions
//===----------------------------------------------------------------------===//

/// SS42I_binop_rm - Simple SSE 4.2 binary operator
multiclass SS42I_binop_rm<bits<8> opc, string OpcodeStr, SDNode OpNode,
                          ValueType OpVT, RegisterClass RC, PatFrag memop_frag,
                          X86MemOperand x86memop, bit Is2Addr = 1> {
  def rr : SS428I<opc, MRMSrcReg, (outs RC:$dst),
       (ins RC:$src1, RC:$src2),
       !if(Is2Addr,
           !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
           !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
       [(set RC:$dst, (OpVT (OpNode RC:$src1, RC:$src2)))]>;
  def rm : SS428I<opc, MRMSrcMem, (outs RC:$dst),
       (ins RC:$src1, x86memop:$src2),
       !if(Is2Addr,
           !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
           !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
       [(set RC:$dst,
         (OpVT (OpNode RC:$src1, (memop_frag addr:$src2))))]>;
}

let Predicates = [HasAVX] in
  defm VPCMPGTQ : SS42I_binop_rm<0x37, "vpcmpgtq", X86pcmpgt, v2i64, VR128,
                                 loadv2i64, i128mem, 0>, VEX_4V;

let Predicates = [HasAVX2] in
  defm VPCMPGTQY : SS42I_binop_rm<0x37, "vpcmpgtq", X86pcmpgt, v4i64, VR256,
                                  loadv4i64, i256mem, 0>, VEX_4V, VEX_L;

let Constraints = "$src1 = $dst" in
  defm PCMPGTQ : SS42I_binop_rm<0x37, "pcmpgtq", X86pcmpgt, v2i64, VR128,
                                memopv2i64, i128mem>;
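
// Note: unlike pcmpeqq (SSE4.1, defined above), the signed 64-bit
// greater-than compare pcmpgtq was only introduced with SSE4.2, hence the
// separate SS428I-based multiclass and UseSSE42-level predicates here.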

//===----------------------------------------------------------------------===//
// SSE4.2 - String/text Processing Instructions
//===----------------------------------------------------------------------===//

// Packed Compare Implicit Length Strings, Return Mask
multiclass pseudo_pcmpistrm<string asm, PatFrag ld_frag> {
  def REG : PseudoI<(outs VR128:$dst),
                    (ins VR128:$src1, VR128:$src2, u8imm:$src3),
    [(set VR128:$dst, (int_x86_sse42_pcmpistrm128 VR128:$src1, VR128:$src2,
                                                  imm:$src3))]>;
  def MEM : PseudoI<(outs VR128:$dst),
                    (ins VR128:$src1, i128mem:$src2, u8imm:$src3),
    [(set VR128:$dst, (int_x86_sse42_pcmpistrm128 VR128:$src1,
                       (bc_v16i8 (ld_frag addr:$src2)), imm:$src3))]>;
}

let Defs = [EFLAGS], usesCustomInserter = 1 in {
  defm VPCMPISTRM128 : pseudo_pcmpistrm<"#VPCMPISTRM128", loadv2i64>,
                       Requires<[HasAVX]>;
  defm PCMPISTRM128 : pseudo_pcmpistrm<"#PCMPISTRM128", memopv2i64>,
                      Requires<[UseSSE42]>;
}

multiclass pcmpistrm_SS42AI<string asm> {
  def rr : SS42AI<0x62, MRMSrcReg, (outs),
                  (ins VR128:$src1, VR128:$src2, u8imm:$src3),
                  !strconcat(asm, "\t{$src3, $src2, $src1|$src1, $src2, $src3}"),
                  []>, Sched<[WritePCmpIStrM]>;
  let mayLoad = 1 in
  def rm : SS42AI<0x62, MRMSrcMem, (outs),
                  (ins VR128:$src1, i128mem:$src2, u8imm:$src3),
                  !strconcat(asm, "\t{$src3, $src2, $src1|$src1, $src2, $src3}"),
                  []>, Sched<[WritePCmpIStrMLd, ReadAfterLd]>;
}

let Defs = [XMM0, EFLAGS], hasSideEffects = 0 in {
  let Predicates = [HasAVX] in
  defm VPCMPISTRM128 : pcmpistrm_SS42AI<"vpcmpistrm">, VEX;
  defm PCMPISTRM128 : pcmpistrm_SS42AI<"pcmpistrm">;
}

// Packed Compare Explicit Length Strings, Return Mask
multiclass pseudo_pcmpestrm<string asm, PatFrag ld_frag> {
  def REG : PseudoI<(outs VR128:$dst),
                    (ins VR128:$src1, VR128:$src3, u8imm:$src5),
    [(set VR128:$dst, (int_x86_sse42_pcmpestrm128
                       VR128:$src1, EAX, VR128:$src3, EDX, imm:$src5))]>;
  def MEM : PseudoI<(outs VR128:$dst),
                    (ins VR128:$src1, i128mem:$src3, u8imm:$src5),
    [(set VR128:$dst, (int_x86_sse42_pcmpestrm128 VR128:$src1, EAX,
                       (bc_v16i8 (ld_frag addr:$src3)), EDX, imm:$src5))]>;
}

let Defs = [EFLAGS], Uses = [EAX, EDX], usesCustomInserter = 1 in {
  defm VPCMPESTRM128 : pseudo_pcmpestrm<"#VPCMPESTRM128", loadv2i64>,
                       Requires<[HasAVX]>;
  defm PCMPESTRM128 : pseudo_pcmpestrm<"#PCMPESTRM128", memopv2i64>,
                      Requires<[UseSSE42]>;
}

multiclass SS42AI_pcmpestrm<string asm> {
  def rr : SS42AI<0x60, MRMSrcReg, (outs),
                  (ins VR128:$src1, VR128:$src3, u8imm:$src5),
                  !strconcat(asm, "\t{$src5, $src3, $src1|$src1, $src3, $src5}"),
                  []>, Sched<[WritePCmpEStrM]>;
  let mayLoad = 1 in
  def rm : SS42AI<0x60, MRMSrcMem, (outs),
                  (ins VR128:$src1, i128mem:$src3, u8imm:$src5),
                  !strconcat(asm, "\t{$src5, $src3, $src1|$src1, $src3, $src5}"),
                  []>, Sched<[WritePCmpEStrMLd, ReadAfterLd]>;
}

let Defs = [XMM0, EFLAGS], Uses = [EAX, EDX], hasSideEffects = 0 in {
  let Predicates = [HasAVX] in
  defm VPCMPESTRM128 : SS42AI_pcmpestrm<"vpcmpestrm">, VEX;
  defm PCMPESTRM128 : SS42AI_pcmpestrm<"pcmpestrm">;
}
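
// Note: the "implicit length" (pcmpistr*) forms stop each string at its NUL
// terminator, while the "explicit length" (pcmpestr*) forms take the lengths
// of the two strings implicitly in EAX and EDX. The *STRM variants return a
// mask in XMM0 and the *STRI variants below return an index in ECX; all of
// them also set EFLAGS.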

// Packed Compare Implicit Length Strings, Return Index
multiclass pseudo_pcmpistri<string asm, PatFrag ld_frag> {
  def REG : PseudoI<(outs GR32:$dst),
                    (ins VR128:$src1, VR128:$src2, u8imm:$src3),
    [(set GR32:$dst, EFLAGS,
      (X86pcmpistri VR128:$src1, VR128:$src2, imm:$src3))]>;
  def MEM : PseudoI<(outs GR32:$dst),
                    (ins VR128:$src1, i128mem:$src2, u8imm:$src3),
    [(set GR32:$dst, EFLAGS, (X86pcmpistri VR128:$src1,
                              (bc_v16i8 (ld_frag addr:$src2)), imm:$src3))]>;
}

let Defs = [EFLAGS], usesCustomInserter = 1 in {
  defm VPCMPISTRI : pseudo_pcmpistri<"#VPCMPISTRI", loadv2i64>,
                    Requires<[HasAVX]>;
  defm PCMPISTRI : pseudo_pcmpistri<"#PCMPISTRI", memopv2i64>,
                   Requires<[UseSSE42]>;
}

multiclass SS42AI_pcmpistri<string asm> {
  def rr : SS42AI<0x63, MRMSrcReg, (outs),
                  (ins VR128:$src1, VR128:$src2, u8imm:$src3),
                  !strconcat(asm, "\t{$src3, $src2, $src1|$src1, $src2, $src3}"),
                  []>, Sched<[WritePCmpIStrI]>;
  let mayLoad = 1 in
  def rm : SS42AI<0x63, MRMSrcMem, (outs),
                  (ins VR128:$src1, i128mem:$src2, u8imm:$src3),
                  !strconcat(asm, "\t{$src3, $src2, $src1|$src1, $src2, $src3}"),
                  []>, Sched<[WritePCmpIStrILd, ReadAfterLd]>;
}

let Defs = [ECX, EFLAGS], hasSideEffects = 0 in {
  let Predicates = [HasAVX] in
  defm VPCMPISTRI : SS42AI_pcmpistri<"vpcmpistri">, VEX;
  defm PCMPISTRI : SS42AI_pcmpistri<"pcmpistri">;
}

// Packed Compare Explicit Length Strings, Return Index
multiclass pseudo_pcmpestri<string asm, PatFrag ld_frag> {
  def REG : PseudoI<(outs GR32:$dst),
                    (ins VR128:$src1, VR128:$src3, u8imm:$src5),
    [(set GR32:$dst, EFLAGS,
      (X86pcmpestri VR128:$src1, EAX, VR128:$src3, EDX, imm:$src5))]>;
  def MEM : PseudoI<(outs GR32:$dst),
                    (ins VR128:$src1, i128mem:$src3, u8imm:$src5),
    [(set GR32:$dst, EFLAGS,
      (X86pcmpestri VR128:$src1, EAX, (bc_v16i8 (ld_frag addr:$src3)), EDX,
                    imm:$src5))]>;
}

let Defs = [EFLAGS], Uses = [EAX, EDX], usesCustomInserter = 1 in {
  defm VPCMPESTRI : pseudo_pcmpestri<"#VPCMPESTRI", loadv2i64>,
                    Requires<[HasAVX]>;
  defm PCMPESTRI : pseudo_pcmpestri<"#PCMPESTRI", memopv2i64>,
                   Requires<[UseSSE42]>;
}

multiclass SS42AI_pcmpestri<string asm> {
  def rr : SS42AI<0x61, MRMSrcReg, (outs),
                  (ins VR128:$src1, VR128:$src3, u8imm:$src5),
                  !strconcat(asm, "\t{$src5, $src3, $src1|$src1, $src3, $src5}"),
                  []>, Sched<[WritePCmpEStrI]>;
  let mayLoad = 1 in
  def rm : SS42AI<0x61, MRMSrcMem, (outs),
                  (ins VR128:$src1, i128mem:$src3, u8imm:$src5),
                  !strconcat(asm, "\t{$src5, $src3, $src1|$src1, $src3, $src5}"),
                  []>, Sched<[WritePCmpEStrILd, ReadAfterLd]>;
}

let Defs = [ECX, EFLAGS], Uses = [EAX, EDX], hasSideEffects = 0 in {
  let Predicates = [HasAVX] in
  defm VPCMPESTRI : SS42AI_pcmpestri<"vpcmpestri">, VEX;
  defm PCMPESTRI : SS42AI_pcmpestri<"pcmpestri">;
}

//===----------------------------------------------------------------------===//
// SSE4.2 - CRC Instructions
//===----------------------------------------------------------------------===//

// No CRC instructions have AVX equivalents

// CRC intrinsic instructions.
// These instructions come only in rr and rm forms; the variants differ only
// in the size of r and m.
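
// Note: these implement CRC-32C (the Castagnoli polynomial, 0x11EDC6F41),
// not the CRC-32 polynomial used by zip/zlib, so they are not a drop-in
// replacement for those checksums.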

class SS42I_crc32r<bits<8> opc, string asm, RegisterClass RCOut,
                   RegisterClass RCIn, SDPatternOperator Int> :
  SS42FI<opc, MRMSrcReg, (outs RCOut:$dst), (ins RCOut:$src1, RCIn:$src2),
         !strconcat(asm, "\t{$src2, $src1|$src1, $src2}"),
         [(set RCOut:$dst, (Int RCOut:$src1, RCIn:$src2))], IIC_CRC32_REG>,
         Sched<[WriteFAdd]>;

class SS42I_crc32m<bits<8> opc, string asm, RegisterClass RCOut,
                   X86MemOperand x86memop, SDPatternOperator Int> :
  SS42FI<opc, MRMSrcMem, (outs RCOut:$dst), (ins RCOut:$src1, x86memop:$src2),
         !strconcat(asm, "\t{$src2, $src1|$src1, $src2}"),
         [(set RCOut:$dst, (Int RCOut:$src1, (load addr:$src2)))],
         IIC_CRC32_MEM>, Sched<[WriteFAddLd, ReadAfterLd]>;

let Constraints = "$src1 = $dst" in {
  def CRC32r32m8  : SS42I_crc32m<0xF0, "crc32{b}", GR32, i8mem,
                                 int_x86_sse42_crc32_32_8>;
  def CRC32r32r8  : SS42I_crc32r<0xF0, "crc32{b}", GR32, GR8,
                                 int_x86_sse42_crc32_32_8>;
  def CRC32r32m16 : SS42I_crc32m<0xF1, "crc32{w}", GR32, i16mem,
                                 int_x86_sse42_crc32_32_16>, OpSize16;
  def CRC32r32r16 : SS42I_crc32r<0xF1, "crc32{w}", GR32, GR16,
                                 int_x86_sse42_crc32_32_16>, OpSize16;
  def CRC32r32m32 : SS42I_crc32m<0xF1, "crc32{l}", GR32, i32mem,
                                 int_x86_sse42_crc32_32_32>, OpSize32;
  def CRC32r32r32 : SS42I_crc32r<0xF1, "crc32{l}", GR32, GR32,
                                 int_x86_sse42_crc32_32_32>, OpSize32;
  def CRC32r64m64 : SS42I_crc32m<0xF1, "crc32{q}", GR64, i64mem,
                                 int_x86_sse42_crc32_64_64>, REX_W;
  def CRC32r64r64 : SS42I_crc32r<0xF1, "crc32{q}", GR64, GR64,
                                 int_x86_sse42_crc32_64_64>, REX_W;
  let hasSideEffects = 0 in {
    let mayLoad = 1 in
    def CRC32r64m8 : SS42I_crc32m<0xF0, "crc32{b}", GR64, i8mem,
                                  null_frag>, REX_W;
    def CRC32r64r8 : SS42I_crc32r<0xF0, "crc32{b}", GR64, GR8,
                                  null_frag>, REX_W;
  }
}

//===----------------------------------------------------------------------===//
// SHA-NI Instructions
//===----------------------------------------------------------------------===//

multiclass SHAI_binop<bits<8> Opc, string OpcodeStr, Intrinsic IntId,
                      bit UsesXMM0 = 0> {
  def rr : I<Opc, MRMSrcReg, (outs VR128:$dst),
             (ins VR128:$src1, VR128:$src2),
             !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
             [!if(UsesXMM0,
                  (set VR128:$dst, (IntId VR128:$src1, VR128:$src2, XMM0)),
                  (set VR128:$dst, (IntId VR128:$src1, VR128:$src2)))]>, T8;

  def rm : I<Opc, MRMSrcMem, (outs VR128:$dst),
             (ins VR128:$src1, i128mem:$src2),
             !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
             [!if(UsesXMM0,
                  (set VR128:$dst, (IntId VR128:$src1,
                                    (bc_v4i32 (memopv2i64 addr:$src2)), XMM0)),
                  (set VR128:$dst, (IntId VR128:$src1,
                                    (bc_v4i32 (memopv2i64 addr:$src2)))))]>, T8;
}

let Constraints = "$src1 = $dst", Predicates = [HasSHA] in {
  def SHA1RNDS4rri : Ii8<0xCC, MRMSrcReg, (outs VR128:$dst),
                         (ins VR128:$src1, VR128:$src2, u8imm:$src3),
                         "sha1rnds4\t{$src3, $src2, $dst|$dst, $src2, $src3}",
                         [(set VR128:$dst,
                           (int_x86_sha1rnds4 VR128:$src1, VR128:$src2,
                                              (i8 imm:$src3)))]>, TA;
  def SHA1RNDS4rmi : Ii8<0xCC, MRMSrcMem, (outs VR128:$dst),
                         (ins VR128:$src1, i128mem:$src2, u8imm:$src3),
                         "sha1rnds4\t{$src3, $src2, $dst|$dst, $src2, $src3}",
                         [(set VR128:$dst,
                           (int_x86_sha1rnds4 VR128:$src1,
                                              (bc_v4i32 (memopv2i64 addr:$src2)),
                                              (i8 imm:$src3)))]>, TA;
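
  // Note: the 2-bit sha1rnds4 immediate selects which of the four SHA-1
  // round functions (and the matching K constants) the four computed rounds
  // use.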

  defm SHA1NEXTE : SHAI_binop<0xC8, "sha1nexte", int_x86_sha1nexte>;
  defm SHA1MSG1  : SHAI_binop<0xC9, "sha1msg1", int_x86_sha1msg1>;
  defm SHA1MSG2  : SHAI_binop<0xCA, "sha1msg2", int_x86_sha1msg2>;

  let Uses=[XMM0] in
  defm SHA256RNDS2 : SHAI_binop<0xCB, "sha256rnds2", int_x86_sha256rnds2, 1>;

  defm SHA256MSG1 : SHAI_binop<0xCC, "sha256msg1", int_x86_sha256msg1>;
  defm SHA256MSG2 : SHAI_binop<0xCD, "sha256msg2", int_x86_sha256msg2>;
}

// Aliases with explicit %xmm0
def : InstAlias<"sha256rnds2\t{%xmm0, $src2, $dst|$dst, $src2, xmm0}",
                (SHA256RNDS2rr VR128:$dst, VR128:$src2)>;
def : InstAlias<"sha256rnds2\t{%xmm0, $src2, $dst|$dst, $src2, xmm0}",
                (SHA256RNDS2rm VR128:$dst, i128mem:$src2)>;

//===----------------------------------------------------------------------===//
// AES-NI Instructions
//===----------------------------------------------------------------------===//

multiclass AESI_binop_rm_int<bits<8> opc, string OpcodeStr, Intrinsic IntId128,
                             PatFrag ld_frag, bit Is2Addr = 1> {
  def rr : AES8I<opc, MRMSrcReg, (outs VR128:$dst),
       (ins VR128:$src1, VR128:$src2),
       !if(Is2Addr,
           !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
           !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
       [(set VR128:$dst, (IntId128 VR128:$src1, VR128:$src2))]>,
       Sched<[WriteAESDecEnc]>;
  def rm : AES8I<opc, MRMSrcMem, (outs VR128:$dst),
       (ins VR128:$src1, i128mem:$src2),
       !if(Is2Addr,
           !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
           !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
       [(set VR128:$dst,
         (IntId128 VR128:$src1, (ld_frag addr:$src2)))]>,
       Sched<[WriteAESDecEncLd, ReadAfterLd]>;
}

// Perform One Round of an AES Encryption/Decryption Flow
let Predicates = [HasAVX, HasAES] in {
  defm VAESENC     : AESI_binop_rm_int<0xDC, "vaesenc",
                                       int_x86_aesni_aesenc, loadv2i64, 0>,
                                       VEX_4V;
  defm VAESENCLAST : AESI_binop_rm_int<0xDD, "vaesenclast",
                                       int_x86_aesni_aesenclast, loadv2i64, 0>,
                                       VEX_4V;
  defm VAESDEC     : AESI_binop_rm_int<0xDE, "vaesdec",
                                       int_x86_aesni_aesdec, loadv2i64, 0>,
                                       VEX_4V;
  defm VAESDECLAST : AESI_binop_rm_int<0xDF, "vaesdeclast",
                                       int_x86_aesni_aesdeclast, loadv2i64, 0>,
                                       VEX_4V;
}

let Constraints = "$src1 = $dst" in {
  defm AESENC     : AESI_binop_rm_int<0xDC, "aesenc",
                                      int_x86_aesni_aesenc, memopv2i64>;
  defm AESENCLAST : AESI_binop_rm_int<0xDD, "aesenclast",
                                      int_x86_aesni_aesenclast, memopv2i64>;
  defm AESDEC     : AESI_binop_rm_int<0xDE, "aesdec",
                                      int_x86_aesni_aesdec, memopv2i64>;
  defm AESDECLAST : AESI_binop_rm_int<0xDF, "aesdeclast",
                                      int_x86_aesni_aesdeclast, memopv2i64>;
}
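
// Note: aesenc/aesdec each perform one full AES round (ShiftRows,
// SubBytes/InvSubBytes, MixColumns/InvMixColumns, then XOR with the round
// key); the *last variants omit the MixColumns step, matching the final
// round of the cipher.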

// Perform the AES InvMixColumn Transformation
let Predicates = [HasAVX, HasAES] in {
  def VAESIMCrr : AES8I<0xDB, MRMSrcReg, (outs VR128:$dst),
      (ins VR128:$src1),
      "vaesimc\t{$src1, $dst|$dst, $src1}",
      [(set VR128:$dst,
        (int_x86_aesni_aesimc VR128:$src1))]>, Sched<[WriteAESIMC]>,
      VEX;
  def VAESIMCrm : AES8I<0xDB, MRMSrcMem, (outs VR128:$dst),
      (ins i128mem:$src1),
      "vaesimc\t{$src1, $dst|$dst, $src1}",
      [(set VR128:$dst, (int_x86_aesni_aesimc (loadv2i64 addr:$src1)))]>,
      Sched<[WriteAESIMCLd]>, VEX;
}
def AESIMCrr : AES8I<0xDB, MRMSrcReg, (outs VR128:$dst),
    (ins VR128:$src1),
    "aesimc\t{$src1, $dst|$dst, $src1}",
    [(set VR128:$dst,
      (int_x86_aesni_aesimc VR128:$src1))]>, Sched<[WriteAESIMC]>;
def AESIMCrm : AES8I<0xDB, MRMSrcMem, (outs VR128:$dst),
    (ins i128mem:$src1),
    "aesimc\t{$src1, $dst|$dst, $src1}",
    [(set VR128:$dst, (int_x86_aesni_aesimc (memopv2i64 addr:$src1)))]>,
    Sched<[WriteAESIMCLd]>;

// AES Round Key Generation Assist
let Predicates = [HasAVX, HasAES] in {
  def VAESKEYGENASSIST128rr : AESAI<0xDF, MRMSrcReg, (outs VR128:$dst),
      (ins VR128:$src1, u8imm:$src2),
      "vaeskeygenassist\t{$src2, $src1, $dst|$dst, $src1, $src2}",
      [(set VR128:$dst,
        (int_x86_aesni_aeskeygenassist VR128:$src1, imm:$src2))]>,
      Sched<[WriteAESKeyGen]>, VEX;
  def VAESKEYGENASSIST128rm : AESAI<0xDF, MRMSrcMem, (outs VR128:$dst),
      (ins i128mem:$src1, u8imm:$src2),
      "vaeskeygenassist\t{$src2, $src1, $dst|$dst, $src1, $src2}",
      [(set VR128:$dst,
        (int_x86_aesni_aeskeygenassist (loadv2i64 addr:$src1), imm:$src2))]>,
      Sched<[WriteAESKeyGenLd]>, VEX;
}
def AESKEYGENASSIST128rr : AESAI<0xDF, MRMSrcReg, (outs VR128:$dst),
    (ins VR128:$src1, u8imm:$src2),
    "aeskeygenassist\t{$src2, $src1, $dst|$dst, $src1, $src2}",
    [(set VR128:$dst,
      (int_x86_aesni_aeskeygenassist VR128:$src1, imm:$src2))]>,
    Sched<[WriteAESKeyGen]>;
def AESKEYGENASSIST128rm : AESAI<0xDF, MRMSrcMem, (outs VR128:$dst),
    (ins i128mem:$src1, u8imm:$src2),
    "aeskeygenassist\t{$src2, $src1, $dst|$dst, $src1, $src2}",
    [(set VR128:$dst,
      (int_x86_aesni_aeskeygenassist (memopv2i64 addr:$src1), imm:$src2))]>,
    Sched<[WriteAESKeyGenLd]>;

//===----------------------------------------------------------------------===//
// PCLMUL Instructions
//===----------------------------------------------------------------------===//

// AVX carry-less Multiplication instructions
let isCommutable = 1 in
def VPCLMULQDQrr : AVXPCLMULIi8<0x44, MRMSrcReg, (outs VR128:$dst),
           (ins VR128:$src1, VR128:$src2, u8imm:$src3),
           "vpclmulqdq\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
           [(set VR128:$dst,
             (int_x86_pclmulqdq VR128:$src1, VR128:$src2, imm:$src3))]>,
           Sched<[WriteCLMul]>;

def VPCLMULQDQrm : AVXPCLMULIi8<0x44, MRMSrcMem, (outs VR128:$dst),
           (ins VR128:$src1, i128mem:$src2, u8imm:$src3),
           "vpclmulqdq\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
           [(set VR128:$dst, (int_x86_pclmulqdq VR128:$src1,
                              (loadv2i64 addr:$src2), imm:$src3))]>,
           Sched<[WriteCLMulLd, ReadAfterLd]>;

// Carry-less Multiplication instructions
let Constraints = "$src1 = $dst" in {
let isCommutable = 1 in
def PCLMULQDQrr : PCLMULIi8<0x44, MRMSrcReg, (outs VR128:$dst),
           (ins VR128:$src1, VR128:$src2, u8imm:$src3),
           "pclmulqdq\t{$src3, $src2, $dst|$dst, $src2, $src3}",
           [(set VR128:$dst,
             (int_x86_pclmulqdq VR128:$src1, VR128:$src2, imm:$src3))],
           IIC_SSE_PCLMULQDQ_RR>, Sched<[WriteCLMul]>;

def PCLMULQDQrm : PCLMULIi8<0x44, MRMSrcMem, (outs VR128:$dst),
           (ins VR128:$src1, i128mem:$src2, u8imm:$src3),
           "pclmulqdq\t{$src3, $src2, $dst|$dst, $src2, $src3}",
           [(set VR128:$dst, (int_x86_pclmulqdq VR128:$src1,
                              (memopv2i64 addr:$src2), imm:$src3))],
           IIC_SSE_PCLMULQDQ_RM>,
           Sched<[WriteCLMulLd, ReadAfterLd]>;
} // Constraints = "$src1 = $dst"
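
// Note: bit 0 of the pclmulqdq immediate selects the quadword of the first
// source and bit 4 selects the quadword of the second source to be
// carry-lessly multiplied; the aliases below name the four common
// combinations, e.g. "pclmulhqhqdq" is "pclmulqdq $0x11".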

multiclass pclmul_alias<string asm, int immop> {
  def : InstAlias<!strconcat("pclmul", asm, "dq {$src, $dst|$dst, $src}"),
                  (PCLMULQDQrr VR128:$dst, VR128:$src, immop), 0>;

  def : InstAlias<!strconcat("pclmul", asm, "dq {$src, $dst|$dst, $src}"),
                  (PCLMULQDQrm VR128:$dst, i128mem:$src, immop), 0>;

  def : InstAlias<!strconcat("vpclmul", asm,
                             "dq {$src2, $src1, $dst|$dst, $src1, $src2}"),
                  (VPCLMULQDQrr VR128:$dst, VR128:$src1, VR128:$src2, immop),
                  0>;

  def : InstAlias<!strconcat("vpclmul", asm,
                             "dq {$src2, $src1, $dst|$dst, $src1, $src2}"),
                  (VPCLMULQDQrm VR128:$dst, VR128:$src1, i128mem:$src2, immop),
                  0>;
}
defm : pclmul_alias<"hqhq", 0x11>;
defm : pclmul_alias<"hqlq", 0x01>;
defm : pclmul_alias<"lqhq", 0x10>;
defm : pclmul_alias<"lqlq", 0x00>;

//===----------------------------------------------------------------------===//
// SSE4A Instructions
//===----------------------------------------------------------------------===//

let Predicates = [HasSSE4A] in {

let Constraints = "$src = $dst" in {
def EXTRQI : Ii8<0x78, MRMXr, (outs VR128:$dst),
                 (ins VR128:$src, u8imm:$len, u8imm:$idx),
                 "extrq\t{$idx, $len, $src|$src, $len, $idx}",
                 [(set VR128:$dst, (X86extrqi VR128:$src, imm:$len,
                                              imm:$idx))]>, PD;
def EXTRQ  : I<0x79, MRMSrcReg, (outs VR128:$dst),
               (ins VR128:$src, VR128:$mask),
               "extrq\t{$mask, $src|$src, $mask}",
               [(set VR128:$dst, (int_x86_sse4a_extrq VR128:$src,
                                                      VR128:$mask))]>, PD;

def INSERTQI : Ii8<0x78, MRMSrcReg, (outs VR128:$dst),
                   (ins VR128:$src, VR128:$src2, u8imm:$len, u8imm:$idx),
                   "insertq\t{$idx, $len, $src2, $src|$src, $src2, $len, $idx}",
                   [(set VR128:$dst, (X86insertqi VR128:$src, VR128:$src2,
                                                  imm:$len, imm:$idx))]>, XD;
def INSERTQ  : I<0x79, MRMSrcReg, (outs VR128:$dst),
                 (ins VR128:$src, VR128:$mask),
                 "insertq\t{$mask, $src|$src, $mask}",
                 [(set VR128:$dst, (int_x86_sse4a_insertq VR128:$src,
                                                          VR128:$mask))]>, XD;
}

def MOVNTSS : I<0x2B, MRMDestMem, (outs), (ins f32mem:$dst, VR128:$src),
                "movntss\t{$src, $dst|$dst, $src}",
                [(int_x86_sse4a_movnt_ss addr:$dst, VR128:$src)]>, XS;

def MOVNTSD : I<0x2B, MRMDestMem, (outs), (ins f64mem:$dst, VR128:$src),
                "movntsd\t{$src, $dst|$dst, $src}",
                [(int_x86_sse4a_movnt_sd addr:$dst, VR128:$src)]>, XD;
}
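
// Note: extrq/insertq operate on a bit field of the low quadword: $len gives
// the field width in bits and $idx its starting bit position. movntss and
// movntsd are non-temporal stores of the low scalar element.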

//===----------------------------------------------------------------------===//
// AVX Instructions
//===----------------------------------------------------------------------===//

//===----------------------------------------------------------------------===//
// VBROADCAST - Load from memory and broadcast to all elements of the
//              destination operand
//
class avx_broadcast_rm<bits<8> opc, string OpcodeStr, RegisterClass RC,
                       X86MemOperand x86memop, ValueType VT,
                       PatFrag ld_frag, SchedWrite Sched> :
  AVX8I<opc, MRMSrcMem, (outs RC:$dst), (ins x86memop:$src),
        !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
        [(set RC:$dst, (VT (X86VBroadcast (ld_frag addr:$src))))]>,
        Sched<[Sched]>, VEX {
  let mayLoad = 1;
}

// AVX2 adds register forms
class avx2_broadcast_rr<bits<8> opc, string OpcodeStr, RegisterClass RC,
                        ValueType ResVT, ValueType OpVT, SchedWrite Sched> :
  AVX28I<opc, MRMSrcReg, (outs RC:$dst), (ins VR128:$src),
         !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
         [(set RC:$dst, (ResVT (X86VBroadcast (OpVT VR128:$src))))]>,
         Sched<[Sched]>, VEX;

let ExeDomain = SSEPackedSingle in {
  def VBROADCASTSSrm  : avx_broadcast_rm<0x18, "vbroadcastss", VR128,
                                         f32mem, v4f32, loadf32, WriteLoad>;
  def VBROADCASTSSYrm : avx_broadcast_rm<0x18, "vbroadcastss", VR256,
                                         f32mem, v8f32, loadf32,
                                         WriteFShuffleLd>, VEX_L;
}
let ExeDomain = SSEPackedDouble in
def VBROADCASTSDYrm : avx_broadcast_rm<0x19, "vbroadcastsd", VR256, f64mem,
                                       v4f64, loadf64, WriteFShuffleLd>, VEX_L;

let ExeDomain = SSEPackedSingle in {
  def VBROADCASTSSrr  : avx2_broadcast_rr<0x18, "vbroadcastss", VR128,
                                          v4f32, v4f32, WriteFShuffle>;
  def VBROADCASTSSYrr : avx2_broadcast_rr<0x18, "vbroadcastss", VR256,
                                          v8f32, v4f32, WriteFShuffle256>,
                                          VEX_L;
}
let ExeDomain = SSEPackedDouble in
def VBROADCASTSDYrr : avx2_broadcast_rr<0x19, "vbroadcastsd", VR256,
                                        v4f64, v2f64, WriteFShuffle256>, VEX_L;

let mayLoad = 1, Predicates = [HasAVX2] in
def VBROADCASTI128 : AVX8I<0x5A, MRMSrcMem, (outs VR256:$dst),
                           (ins i128mem:$src),
                           "vbroadcasti128\t{$src, $dst|$dst, $src}", []>,
                           Sched<[WriteLoad]>, VEX, VEX_L;

def VBROADCASTF128 : AVX8I<0x1A, MRMSrcMem, (outs VR256:$dst),
                           (ins f128mem:$src),
                           "vbroadcastf128\t{$src, $dst|$dst, $src}",
                           [(set VR256:$dst,
                             (int_x86_avx_vbroadcastf128_pd_256 addr:$src))]>,
                           Sched<[WriteFShuffleLd]>, VEX, VEX_L;

let Predicates = [HasAVX] in
def : Pat<(int_x86_avx_vbroadcastf128_ps_256 addr:$src),
          (VBROADCASTF128 addr:$src)>;

//===----------------------------------------------------------------------===//
// VINSERTF128 - Insert packed floating-point values
//
let hasSideEffects = 0, ExeDomain = SSEPackedSingle in {
def VINSERTF128rr : AVXAIi8<0x18, MRMSrcReg, (outs VR256:$dst),
          (ins VR256:$src1, VR128:$src2, u8imm:$src3),
          "vinsertf128\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
          []>, Sched<[WriteFShuffle]>, VEX_4V, VEX_L;
let mayLoad = 1 in
def VINSERTF128rm : AVXAIi8<0x18, MRMSrcMem, (outs VR256:$dst),
          (ins VR256:$src1, f128mem:$src2, u8imm:$src3),
          "vinsertf128\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
          []>, Sched<[WriteFShuffleLd, ReadAfterLd]>, VEX_4V, VEX_L;
}
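
// Note: the low bit of the vinsertf128/vextractf128 immediate selects which
// 128-bit half of the YMM register is replaced or extracted; the remaining
// immediate bits are ignored.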

let Predicates = [HasAVX, NoVLX] in {
def : Pat<(vinsert128_insert:$ins (v8f32 VR256:$src1), (v4f32 VR128:$src2),
                                  (iPTR imm)),
          (VINSERTF128rr VR256:$src1, VR128:$src2,
                         (INSERT_get_vinsert128_imm VR256:$ins))>;
def : Pat<(vinsert128_insert:$ins (v4f64 VR256:$src1), (v2f64 VR128:$src2),
                                  (iPTR imm)),
          (VINSERTF128rr VR256:$src1, VR128:$src2,
                         (INSERT_get_vinsert128_imm VR256:$ins))>;

def : Pat<(vinsert128_insert:$ins (v8f32 VR256:$src1), (loadv4f32 addr:$src2),
                                  (iPTR imm)),
          (VINSERTF128rm VR256:$src1, addr:$src2,
                         (INSERT_get_vinsert128_imm VR256:$ins))>;
def : Pat<(vinsert128_insert:$ins (v4f64 VR256:$src1), (loadv2f64 addr:$src2),
                                  (iPTR imm)),
          (VINSERTF128rm VR256:$src1, addr:$src2,
                         (INSERT_get_vinsert128_imm VR256:$ins))>;
}

let Predicates = [HasAVX1Only] in {
def : Pat<(vinsert128_insert:$ins (v4i64 VR256:$src1), (v2i64 VR128:$src2),
                                  (iPTR imm)),
          (VINSERTF128rr VR256:$src1, VR128:$src2,
                         (INSERT_get_vinsert128_imm VR256:$ins))>;
def : Pat<(vinsert128_insert:$ins (v8i32 VR256:$src1), (v4i32 VR128:$src2),
                                  (iPTR imm)),
          (VINSERTF128rr VR256:$src1, VR128:$src2,
                         (INSERT_get_vinsert128_imm VR256:$ins))>;
def : Pat<(vinsert128_insert:$ins (v32i8 VR256:$src1), (v16i8 VR128:$src2),
                                  (iPTR imm)),
          (VINSERTF128rr VR256:$src1, VR128:$src2,
                         (INSERT_get_vinsert128_imm VR256:$ins))>;
def : Pat<(vinsert128_insert:$ins (v16i16 VR256:$src1), (v8i16 VR128:$src2),
                                  (iPTR imm)),
          (VINSERTF128rr VR256:$src1, VR128:$src2,
                         (INSERT_get_vinsert128_imm VR256:$ins))>;

def : Pat<(vinsert128_insert:$ins (v4i64 VR256:$src1), (loadv2i64 addr:$src2),
                                  (iPTR imm)),
          (VINSERTF128rm VR256:$src1, addr:$src2,
                         (INSERT_get_vinsert128_imm VR256:$ins))>;
def : Pat<(vinsert128_insert:$ins (v8i32 VR256:$src1),
                                  (bc_v4i32 (loadv2i64 addr:$src2)),
                                  (iPTR imm)),
          (VINSERTF128rm VR256:$src1, addr:$src2,
                         (INSERT_get_vinsert128_imm VR256:$ins))>;
def : Pat<(vinsert128_insert:$ins (v32i8 VR256:$src1),
                                  (bc_v16i8 (loadv2i64 addr:$src2)),
                                  (iPTR imm)),
          (VINSERTF128rm VR256:$src1, addr:$src2,
                         (INSERT_get_vinsert128_imm VR256:$ins))>;
def : Pat<(vinsert128_insert:$ins (v16i16 VR256:$src1),
                                  (bc_v8i16 (loadv2i64 addr:$src2)),
                                  (iPTR imm)),
          (VINSERTF128rm VR256:$src1, addr:$src2,
                         (INSERT_get_vinsert128_imm VR256:$ins))>;
}

//===----------------------------------------------------------------------===//
// VEXTRACTF128 - Extract packed floating-point values
//
let hasSideEffects = 0, ExeDomain = SSEPackedSingle in {
def VEXTRACTF128rr : AVXAIi8<0x19, MRMDestReg, (outs VR128:$dst),
          (ins VR256:$src1, u8imm:$src2),
          "vextractf128\t{$src2, $src1, $dst|$dst, $src1, $src2}",
          []>, Sched<[WriteFShuffle]>, VEX, VEX_L;
let mayStore = 1 in
def VEXTRACTF128mr : AVXAIi8<0x19, MRMDestMem, (outs),
          (ins f128mem:$dst, VR256:$src1, u8imm:$src2),
          "vextractf128\t{$src2, $src1, $dst|$dst, $src1, $src2}",
          []>, Sched<[WriteStore]>, VEX, VEX_L;
}

// AVX1 patterns
let Predicates = [HasAVX] in {
def : Pat<(vextract128_extract:$ext VR256:$src1, (iPTR imm)),
          (v4f32 (VEXTRACTF128rr
                  (v8f32 VR256:$src1),
                  (EXTRACT_get_vextract128_imm VR128:$ext)))>;
def : Pat<(vextract128_extract:$ext VR256:$src1, (iPTR imm)),
          (v2f64 (VEXTRACTF128rr
                  (v4f64 VR256:$src1),
                  (EXTRACT_get_vextract128_imm VR128:$ext)))>;

def : Pat<(store (v4f32 (vextract128_extract:$ext (v8f32 VR256:$src1),
                         (iPTR imm))), addr:$dst),
          (VEXTRACTF128mr addr:$dst, VR256:$src1,
                          (EXTRACT_get_vextract128_imm VR128:$ext))>;
def : Pat<(store (v2f64 (vextract128_extract:$ext (v4f64 VR256:$src1),
                         (iPTR imm))), addr:$dst),
          (VEXTRACTF128mr addr:$dst, VR256:$src1,
                          (EXTRACT_get_vextract128_imm VR128:$ext))>;
}
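
// Note: under HasAVX1Only there are no 256-bit integer shuffle instructions,
// so the integer insert/extract patterns here and above reuse the
// floating-point VINSERTF128/VEXTRACTF128 forms even though that may cross
// execution domains.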

let Predicates = [HasAVX1Only] in {
def : Pat<(vextract128_extract:$ext VR256:$src1, (iPTR imm)),
          (v2i64 (VEXTRACTF128rr
                  (v4i64 VR256:$src1),
                  (EXTRACT_get_vextract128_imm VR128:$ext)))>;
def : Pat<(vextract128_extract:$ext VR256:$src1, (iPTR imm)),
          (v4i32 (VEXTRACTF128rr
                  (v8i32 VR256:$src1),
                  (EXTRACT_get_vextract128_imm VR128:$ext)))>;
def : Pat<(vextract128_extract:$ext VR256:$src1, (iPTR imm)),
          (v8i16 (VEXTRACTF128rr
                  (v16i16 VR256:$src1),
                  (EXTRACT_get_vextract128_imm VR128:$ext)))>;
def : Pat<(vextract128_extract:$ext VR256:$src1, (iPTR imm)),
          (v16i8 (VEXTRACTF128rr
                  (v32i8 VR256:$src1),
                  (EXTRACT_get_vextract128_imm VR128:$ext)))>;

def : Pat<(alignedstore (v2i64 (vextract128_extract:$ext (v4i64 VR256:$src1),
                                (iPTR imm))), addr:$dst),
          (VEXTRACTF128mr addr:$dst, VR256:$src1,
                          (EXTRACT_get_vextract128_imm VR128:$ext))>;
def : Pat<(alignedstore (v4i32 (vextract128_extract:$ext (v8i32 VR256:$src1),
                                (iPTR imm))), addr:$dst),
          (VEXTRACTF128mr addr:$dst, VR256:$src1,
                          (EXTRACT_get_vextract128_imm VR128:$ext))>;
def : Pat<(alignedstore (v8i16 (vextract128_extract:$ext (v16i16 VR256:$src1),
                                (iPTR imm))), addr:$dst),
          (VEXTRACTF128mr addr:$dst, VR256:$src1,
                          (EXTRACT_get_vextract128_imm VR128:$ext))>;
def : Pat<(alignedstore (v16i8 (vextract128_extract:$ext (v32i8 VR256:$src1),
                                (iPTR imm))), addr:$dst),
          (VEXTRACTF128mr addr:$dst, VR256:$src1,
                          (EXTRACT_get_vextract128_imm VR128:$ext))>;
}

//===----------------------------------------------------------------------===//
// VMASKMOV - Conditional SIMD Packed Loads and Stores
//
multiclass avx_movmask_rm<bits<8> opc_rm, bits<8> opc_mr, string OpcodeStr,
                          Intrinsic IntLd, Intrinsic IntLd256,
                          Intrinsic IntSt, Intrinsic IntSt256> {
  def rm  : AVX8I<opc_rm, MRMSrcMem, (outs VR128:$dst),
             (ins VR128:$src1, f128mem:$src2),
             !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
             [(set VR128:$dst, (IntLd addr:$src2, VR128:$src1))]>,
             VEX_4V;
  def Yrm : AVX8I<opc_rm, MRMSrcMem, (outs VR256:$dst),
             (ins VR256:$src1, f256mem:$src2),
             !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
             [(set VR256:$dst, (IntLd256 addr:$src2, VR256:$src1))]>,
             VEX_4V, VEX_L;
  def mr  : AVX8I<opc_mr, MRMDestMem, (outs),
             (ins f128mem:$dst, VR128:$src1, VR128:$src2),
             !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
             [(IntSt addr:$dst, VR128:$src1, VR128:$src2)]>, VEX_4V;
  def Ymr : AVX8I<opc_mr, MRMDestMem, (outs),
             (ins f256mem:$dst, VR256:$src1, VR256:$src2),
             !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
             [(IntSt256 addr:$dst, VR256:$src1, VR256:$src2)]>, VEX_4V, VEX_L;
}

let ExeDomain = SSEPackedSingle in
defm VMASKMOVPS : avx_movmask_rm<0x2C, 0x2E, "vmaskmovps",
                                 int_x86_avx_maskload_ps,
                                 int_x86_avx_maskload_ps_256,
                                 int_x86_avx_maskstore_ps,
                                 int_x86_avx_maskstore_ps_256>;
let ExeDomain = SSEPackedDouble in
defm VMASKMOVPD : avx_movmask_rm<0x2D, 0x2F, "vmaskmovpd",
                                 int_x86_avx_maskload_pd,
                                 int_x86_avx_maskload_pd_256,
                                 int_x86_avx_maskstore_pd,
                                 int_x86_avx_maskstore_pd_256>;
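
// Note: vmaskmovps/pd load or store only the elements whose mask sign bit is
// set; masked-out elements are zeroed on load, left untouched on store, and
// do not fault, which makes these usable for loop-tail accesses that would
// otherwise read or write out of bounds.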
//===----------------------------------------------------------------------===//
// VPERMIL - Permute Single and Double Floating-Point Values
//
multiclass avx_permil<bits<8> opc_rm, bits<8> opc_rmi, string OpcodeStr,
                      RegisterClass RC, X86MemOperand x86memop_f,
                      X86MemOperand x86memop_i, PatFrag i_frag,
                      Intrinsic IntVar, ValueType vt> {
  def rr : AVX8I<opc_rm, MRMSrcReg, (outs RC:$dst),
             (ins RC:$src1, RC:$src2),
             !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
             [(set RC:$dst, (IntVar RC:$src1, RC:$src2))]>, VEX_4V,
             Sched<[WriteFShuffle]>;
  def rm : AVX8I<opc_rm, MRMSrcMem, (outs RC:$dst),
             (ins RC:$src1, x86memop_i:$src2),
             !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
             [(set RC:$dst, (IntVar RC:$src1,
                             (bitconvert (i_frag addr:$src2))))]>, VEX_4V,
             Sched<[WriteFShuffleLd, ReadAfterLd]>;

  let Predicates = [HasAVX, NoVLX] in {
    def ri : AVXAIi8<opc_rmi, MRMSrcReg, (outs RC:$dst),
               (ins RC:$src1, u8imm:$src2),
               !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
               [(set RC:$dst, (vt (X86VPermilpi RC:$src1, (i8 imm:$src2))))]>, VEX,
               Sched<[WriteFShuffle]>;
    def mi : AVXAIi8<opc_rmi, MRMSrcMem, (outs RC:$dst),
               (ins x86memop_f:$src1, u8imm:$src2),
               !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
               [(set RC:$dst,
                 (vt (X86VPermilpi (load addr:$src1), (i8 imm:$src2))))]>, VEX,
               Sched<[WriteFShuffleLd]>;
  } // Predicates = [HasAVX, NoVLX]
}

let ExeDomain = SSEPackedSingle in {
  defm VPERMILPS  : avx_permil<0x0C, 0x04, "vpermilps", VR128, f128mem, i128mem,
                               loadv2i64, int_x86_avx_vpermilvar_ps, v4f32>;
  defm VPERMILPSY : avx_permil<0x0C, 0x04, "vpermilps", VR256, f256mem, i256mem,
                               loadv4i64, int_x86_avx_vpermilvar_ps_256, v8f32>, VEX_L;
}
let ExeDomain = SSEPackedDouble in {
  defm VPERMILPD  : avx_permil<0x0D, 0x05, "vpermilpd", VR128, f128mem, i128mem,
                               loadv2i64, int_x86_avx_vpermilvar_pd, v2f64>;
  defm VPERMILPDY : avx_permil<0x0D, 0x05, "vpermilpd", VR256, f256mem, i256mem,
                               loadv4i64, int_x86_avx_vpermilvar_pd_256, v4f64>, VEX_L;
}

let Predicates = [HasAVX, NoVLX] in {
def : Pat<(v8f32 (X86VPermilpv VR256:$src1, (v8i32 VR256:$src2))),
          (VPERMILPSYrr VR256:$src1, VR256:$src2)>;
def : Pat<(v8f32 (X86VPermilpv VR256:$src1, (bc_v8i32 (loadv4i64 addr:$src2)))),
          (VPERMILPSYrm VR256:$src1, addr:$src2)>;
def : Pat<(v4f64 (X86VPermilpv VR256:$src1, (v4i64 VR256:$src2))),
          (VPERMILPDYrr VR256:$src1, VR256:$src2)>;
def : Pat<(v4f64 (X86VPermilpv VR256:$src1, (loadv4i64 addr:$src2))),
          (VPERMILPDYrm VR256:$src1, addr:$src2)>;

def : Pat<(v8i32 (X86VPermilpi VR256:$src1, (i8 imm:$imm))),
          (VPERMILPSYri VR256:$src1, imm:$imm)>;
def : Pat<(v4i64 (X86VPermilpi VR256:$src1, (i8 imm:$imm))),
          (VPERMILPDYri VR256:$src1, imm:$imm)>;
def : Pat<(v8i32 (X86VPermilpi (bc_v8i32 (loadv4i64 addr:$src1)),
                               (i8 imm:$imm))),
          (VPERMILPSYmi addr:$src1, imm:$imm)>;
def : Pat<(v4i64 (X86VPermilpi (loadv4i64 addr:$src1), (i8 imm:$imm))),
          (VPERMILPDYmi addr:$src1, imm:$imm)>;

def : Pat<(v4f32 (X86VPermilpv VR128:$src1, (v4i32 VR128:$src2))),
          (VPERMILPSrr VR128:$src1, VR128:$src2)>;
def : Pat<(v4f32 (X86VPermilpv VR128:$src1, (bc_v4i32 (loadv2i64 addr:$src2)))),
          (VPERMILPSrm VR128:$src1, addr:$src2)>;
def : Pat<(v2f64 (X86VPermilpv VR128:$src1, (v2i64 VR128:$src2))),
          (VPERMILPDrr VR128:$src1, VR128:$src2)>;
def : Pat<(v2f64 (X86VPermilpv VR128:$src1, (loadv2i64 addr:$src2))),
          (VPERMILPDrm VR128:$src1, addr:$src2)>;

def : Pat<(v2i64 (X86VPermilpi VR128:$src1, (i8 imm:$imm))),
          (VPERMILPDri VR128:$src1, imm:$imm)>;
def : Pat<(v2i64 (X86VPermilpi (loadv2i64 addr:$src1), (i8 imm:$imm))),
          (VPERMILPDmi addr:$src1, imm:$imm)>;
}
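// Illustrative example: "vpermilps $0x1b, %xmm0, %xmm1" reverses the four
// floats of xmm0 (0x1b selects elements 3,2,1,0); the variable-mask forms
// above take the per-element selectors from a second vector operand instead.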
//===----------------------------------------------------------------------===//
// VPERM2F128 - Permute Floating-Point Values in 128-bit chunks
//
let ExeDomain = SSEPackedSingle in {
def VPERM2F128rr : AVXAIi8<0x06, MRMSrcReg, (outs VR256:$dst),
          (ins VR256:$src1, VR256:$src2, u8imm:$src3),
          "vperm2f128\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
          [(set VR256:$dst, (v8f32 (X86VPerm2x128 VR256:$src1, VR256:$src2,
                                    (i8 imm:$src3))))]>, VEX_4V, VEX_L,
          Sched<[WriteFShuffle]>;
def VPERM2F128rm : AVXAIi8<0x06, MRMSrcMem, (outs VR256:$dst),
          (ins VR256:$src1, f256mem:$src2, u8imm:$src3),
          "vperm2f128\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
          [(set VR256:$dst, (X86VPerm2x128 VR256:$src1, (loadv8f32 addr:$src2),
                                           (i8 imm:$src3)))]>, VEX_4V, VEX_L,
          Sched<[WriteFShuffleLd, ReadAfterLd]>;
}

let Predicates = [HasAVX] in {
def : Pat<(v4f64 (X86VPerm2x128 VR256:$src1, VR256:$src2, (i8 imm:$imm))),
          (VPERM2F128rr VR256:$src1, VR256:$src2, imm:$imm)>;
def : Pat<(v4f64 (X86VPerm2x128 VR256:$src1,
                                (loadv4f64 addr:$src2), (i8 imm:$imm))),
          (VPERM2F128rm VR256:$src1, addr:$src2, imm:$imm)>;
}

let Predicates = [HasAVX1Only] in {
def : Pat<(v8i32 (X86VPerm2x128 VR256:$src1, VR256:$src2, (i8 imm:$imm))),
          (VPERM2F128rr VR256:$src1, VR256:$src2, imm:$imm)>;
def : Pat<(v4i64 (X86VPerm2x128 VR256:$src1, VR256:$src2, (i8 imm:$imm))),
          (VPERM2F128rr VR256:$src1, VR256:$src2, imm:$imm)>;
def : Pat<(v32i8 (X86VPerm2x128 VR256:$src1, VR256:$src2, (i8 imm:$imm))),
          (VPERM2F128rr VR256:$src1, VR256:$src2, imm:$imm)>;
def : Pat<(v16i16 (X86VPerm2x128 VR256:$src1, VR256:$src2, (i8 imm:$imm))),
          (VPERM2F128rr VR256:$src1, VR256:$src2, imm:$imm)>;

def : Pat<(v8i32 (X86VPerm2x128 VR256:$src1,
                  (bc_v8i32 (loadv4i64 addr:$src2)), (i8 imm:$imm))),
          (VPERM2F128rm VR256:$src1, addr:$src2, imm:$imm)>;
def : Pat<(v4i64 (X86VPerm2x128 VR256:$src1,
                  (loadv4i64 addr:$src2), (i8 imm:$imm))),
          (VPERM2F128rm VR256:$src1, addr:$src2, imm:$imm)>;
def : Pat<(v32i8 (X86VPerm2x128 VR256:$src1,
                  (bc_v32i8 (loadv4i64 addr:$src2)), (i8 imm:$imm))),
          (VPERM2F128rm VR256:$src1, addr:$src2, imm:$imm)>;
def : Pat<(v16i16 (X86VPerm2x128 VR256:$src1,
                   (bc_v16i16 (loadv4i64 addr:$src2)), (i8 imm:$imm))),
          (VPERM2F128rm VR256:$src1, addr:$src2, imm:$imm)>;
}

//===----------------------------------------------------------------------===//
// VZERO - Zero YMM registers
//
let Defs = [YMM0, YMM1, YMM2, YMM3, YMM4, YMM5, YMM6, YMM7,
            YMM8, YMM9, YMM10, YMM11, YMM12, YMM13, YMM14, YMM15] in {
  // Zero All YMM registers
  def VZEROALL : I<0x77, RawFrm, (outs), (ins), "vzeroall",
                   [(int_x86_avx_vzeroall)]>, PS, VEX, VEX_L, Requires<[HasAVX]>;

  // Zero Upper bits of YMM registers
  def VZEROUPPER : I<0x77, RawFrm, (outs), (ins), "vzeroupper",
                     [(int_x86_avx_vzeroupper)]>, PS, VEX, Requires<[HasAVX]>;
}
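// Note: "vzeroupper" is typically emitted before calls into (or returns to)
// legacy SSE code, since a dirty upper YMM state can incur an AVX/SSE
// transition penalty on some microarchitectures.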
//===----------------------------------------------------------------------===//
// Half precision conversion instructions
//===----------------------------------------------------------------------===//
multiclass f16c_ph2ps<RegisterClass RC, X86MemOperand x86memop, Intrinsic Int> {
  def rr : I<0x13, MRMSrcReg, (outs RC:$dst), (ins VR128:$src),
             "vcvtph2ps\t{$src, $dst|$dst, $src}",
             [(set RC:$dst, (Int VR128:$src))]>,
             T8PD, VEX, Sched<[WriteCvtF2F]>;
  let hasSideEffects = 0, mayLoad = 1 in
  def rm : I<0x13, MRMSrcMem, (outs RC:$dst), (ins x86memop:$src),
             "vcvtph2ps\t{$src, $dst|$dst, $src}", []>, T8PD, VEX,
             Sched<[WriteCvtF2FLd]>;
}

multiclass f16c_ps2ph<RegisterClass RC, X86MemOperand x86memop, Intrinsic Int> {
  def rr : Ii8<0x1D, MRMDestReg, (outs VR128:$dst),
               (ins RC:$src1, i32u8imm:$src2),
               "vcvtps2ph\t{$src2, $src1, $dst|$dst, $src1, $src2}",
               [(set VR128:$dst, (Int RC:$src1, imm:$src2))]>,
               TAPD, VEX, Sched<[WriteCvtF2F]>;
  let hasSideEffects = 0, mayStore = 1,
      SchedRW = [WriteCvtF2FLd, WriteRMW] in
  def mr : Ii8<0x1D, MRMDestMem, (outs),
               (ins x86memop:$dst, RC:$src1, i32u8imm:$src2),
               "vcvtps2ph\t{$src2, $src1, $dst|$dst, $src1, $src2}", []>,
               TAPD, VEX;
}

let Predicates = [HasF16C] in {
  defm VCVTPH2PS  : f16c_ph2ps<VR128, f64mem, int_x86_vcvtph2ps_128>;
  defm VCVTPH2PSY : f16c_ph2ps<VR256, f128mem, int_x86_vcvtph2ps_256>, VEX_L;
  defm VCVTPS2PH  : f16c_ps2ph<VR128, f64mem, int_x86_vcvtps2ph_128>;
  defm VCVTPS2PHY : f16c_ps2ph<VR256, f128mem, int_x86_vcvtps2ph_256>, VEX_L;

  // Pattern match vcvtph2ps of a scalar i64 load.
  def : Pat<(int_x86_vcvtph2ps_128 (vzmovl_v2i64 addr:$src)),
            (VCVTPH2PSrm addr:$src)>;
  def : Pat<(int_x86_vcvtph2ps_128 (vzload_v2i64 addr:$src)),
            (VCVTPH2PSrm addr:$src)>;

  def : Pat<(store (f64 (extractelt (bc_v2f64 (v8i16
                    (int_x86_vcvtps2ph_128 VR128:$src1, i32:$src2))), (iPTR 0))),
                   addr:$dst),
            (VCVTPS2PHmr addr:$dst, VR128:$src1, imm:$src2)>;
  def : Pat<(store (i64 (extractelt (bc_v2i64 (v8i16
                    (int_x86_vcvtps2ph_128 VR128:$src1, i32:$src2))), (iPTR 0))),
                   addr:$dst),
            (VCVTPS2PHmr addr:$dst, VR128:$src1, imm:$src2)>;
  def : Pat<(store (v8i16 (int_x86_vcvtps2ph_256 VR256:$src1, i32:$src2)),
                   addr:$dst),
            (VCVTPS2PHYmr addr:$dst, VR256:$src1, imm:$src2)>;
}

// Patterns for matching conversions from float to half-float and vice versa.
let Predicates = [HasF16C] in {
  def : Pat<(fp_to_f16 FR32:$src),
            (i16 (EXTRACT_SUBREG (VMOVPDI2DIrr (VCVTPS2PHrr
              (COPY_TO_REGCLASS FR32:$src, VR128), 0)), sub_16bit))>;

  def : Pat<(f16_to_fp GR16:$src),
            (f32 (COPY_TO_REGCLASS (VCVTPH2PSrr
              (COPY_TO_REGCLASS (MOVSX32rr16 GR16:$src), VR128)), FR32))>;

  def : Pat<(f16_to_fp (i16 (fp_to_f16 FR32:$src))),
            (f32 (COPY_TO_REGCLASS (VCVTPH2PSrr
              (VCVTPS2PHrr (COPY_TO_REGCLASS FR32:$src, VR128), 0)), FR32))>;
}

//===----------------------------------------------------------------------===//
// AVX2 Instructions
//===----------------------------------------------------------------------===//

/// AVX2_binop_rmi - AVX2 binary operator with 8-bit immediate
multiclass AVX2_binop_rmi<bits<8> opc, string OpcodeStr, SDNode OpNode,
                          ValueType OpVT, RegisterClass RC, PatFrag memop_frag,
                          X86MemOperand x86memop> {
  let isCommutable = 1 in
  def rri : AVX2AIi8<opc, MRMSrcReg, (outs RC:$dst),
        (ins RC:$src1, RC:$src2, u8imm:$src3),
        !strconcat(OpcodeStr,
            "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
        [(set RC:$dst, (OpVT (OpNode RC:$src1, RC:$src2, imm:$src3)))]>,
        Sched<[WriteBlend]>, VEX_4V;
  def rmi : AVX2AIi8<opc, MRMSrcMem, (outs RC:$dst),
        (ins RC:$src1, x86memop:$src2, u8imm:$src3),
        !strconcat(OpcodeStr,
            "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
        [(set RC:$dst,
          (OpVT (OpNode RC:$src1,
                 (bitconvert (memop_frag addr:$src2)), imm:$src3)))]>,
        Sched<[WriteBlendLd, ReadAfterLd]>, VEX_4V;
}

defm VPBLENDD  : AVX2_binop_rmi<0x02, "vpblendd", X86Blendi, v4i32,
                                VR128, loadv2i64, i128mem>;
defm VPBLENDDY : AVX2_binop_rmi<0x02, "vpblendd", X86Blendi, v8i32,
                                VR256, loadv4i64, i256mem>, VEX_L;
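// Illustrative example: "vpblendd $0x0f, %ymm2, %ymm1, %ymm0" (AT&T) takes
// dwords 0-3 from ymm2 and dwords 4-7 from ymm1; each set immediate bit
// selects the corresponding dword from the second source operand.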
//===----------------------------------------------------------------------===//
// VPBROADCAST - Load from memory and broadcast to all elements of the
// destination operand
//
multiclass avx2_broadcast<bits<8> opc, string OpcodeStr,
                          X86MemOperand x86memop, PatFrag ld_frag,
                          ValueType OpVT128, ValueType OpVT256, Predicate prd> {
  let Predicates = [HasAVX2, prd] in {
    def rr : AVX28I<opc, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
                    !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
                    [(set VR128:$dst,
                      (OpVT128 (X86VBroadcast (OpVT128 VR128:$src))))]>,
                    Sched<[WriteShuffle]>, VEX;
    def rm : AVX28I<opc, MRMSrcMem, (outs VR128:$dst), (ins x86memop:$src),
                    !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
                    [(set VR128:$dst,
                      (OpVT128 (X86VBroadcast (ld_frag addr:$src))))]>,
                    Sched<[WriteLoad]>, VEX;
    def Yrr : AVX28I<opc, MRMSrcReg, (outs VR256:$dst), (ins VR128:$src),
                     !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
                     [(set VR256:$dst,
                       (OpVT256 (X86VBroadcast (OpVT128 VR128:$src))))]>,
                     Sched<[WriteShuffle256]>, VEX, VEX_L;
    def Yrm : AVX28I<opc, MRMSrcMem, (outs VR256:$dst), (ins x86memop:$src),
                     !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
                     [(set VR256:$dst,
                       (OpVT256 (X86VBroadcast (ld_frag addr:$src))))]>,
                     Sched<[WriteLoad]>, VEX, VEX_L;

    // Provide aliases for broadcast from the same register class that
    // automatically does the extract.
    def : Pat<(OpVT256 (X86VBroadcast (OpVT256 VR256:$src))),
              (!cast<Instruction>(NAME#"Yrr")
                (OpVT128 (EXTRACT_SUBREG (OpVT256 VR256:$src), sub_xmm)))>;
  }
}

defm VPBROADCASTB : avx2_broadcast<0x78, "vpbroadcastb", i8mem, loadi8,
                                   v16i8, v32i8, NoVLX_Or_NoBWI>;
defm VPBROADCASTW : avx2_broadcast<0x79, "vpbroadcastw", i16mem, loadi16,
                                   v8i16, v16i16, NoVLX_Or_NoBWI>;
defm VPBROADCASTD : avx2_broadcast<0x58, "vpbroadcastd", i32mem, loadi32,
                                   v4i32, v8i32, NoVLX>;
defm VPBROADCASTQ : avx2_broadcast<0x59, "vpbroadcastq", i64mem, loadi64,
                                   v2i64, v4i64, NoVLX>;
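// Illustrative example: "vpbroadcastd (%rdi), %ymm0" loads a single dword and
// replicates it into all eight dword elements of ymm0; the register forms
// replicate element 0 of an XMM source instead.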
let Predicates = [HasAVX2] in {
  // loadi16 is tricky to fold, because !isTypeDesirableForOp, justifiably.
  // This means we'll encounter truncated i32 loads; match that here.
  def : Pat<(v8i16 (X86VBroadcast (i16 (trunc (i32 (load addr:$src)))))),
            (VPBROADCASTWrm addr:$src)>;
  def : Pat<(v16i16 (X86VBroadcast (i16 (trunc (i32 (load addr:$src)))))),
            (VPBROADCASTWYrm addr:$src)>;
  def : Pat<(v8i16 (X86VBroadcast
                    (i16 (trunc (i32 (zextloadi16 addr:$src)))))),
            (VPBROADCASTWrm addr:$src)>;
  def : Pat<(v16i16 (X86VBroadcast
                     (i16 (trunc (i32 (zextloadi16 addr:$src)))))),
            (VPBROADCASTWYrm addr:$src)>;

  // Provide aliases for broadcast from the same register class that
  // automatically does the extract.
  def : Pat<(v8f32 (X86VBroadcast (v8f32 VR256:$src))),
            (VBROADCASTSSYrr (v4f32 (EXTRACT_SUBREG (v8f32 VR256:$src),
                                                    sub_xmm)))>;
  def : Pat<(v4f64 (X86VBroadcast (v4f64 VR256:$src))),
            (VBROADCASTSDYrr (v2f64 (EXTRACT_SUBREG (v4f64 VR256:$src),
                                                    sub_xmm)))>;

  // Provide fallback in case the load node that is used in the patterns above
  // is used by additional users, which prevents the pattern selection.
  let AddedComplexity = 20 in {
    def : Pat<(v4f32 (X86VBroadcast FR32:$src)),
              (VBROADCASTSSrr (COPY_TO_REGCLASS FR32:$src, VR128))>;
    def : Pat<(v8f32 (X86VBroadcast FR32:$src)),
              (VBROADCASTSSYrr (COPY_TO_REGCLASS FR32:$src, VR128))>;
    def : Pat<(v4f64 (X86VBroadcast FR64:$src)),
              (VBROADCASTSDYrr (COPY_TO_REGCLASS FR64:$src, VR128))>;

    def : Pat<(v4i32 (X86VBroadcast GR32:$src)),
              (VBROADCASTSSrr (COPY_TO_REGCLASS GR32:$src, VR128))>;
    def : Pat<(v8i32 (X86VBroadcast GR32:$src)),
              (VBROADCASTSSYrr (COPY_TO_REGCLASS GR32:$src, VR128))>;
    def : Pat<(v4i64 (X86VBroadcast GR64:$src)),
              (VBROADCASTSDYrr (COPY_TO_REGCLASS GR64:$src, VR128))>;

    def : Pat<(v16i8 (X86VBroadcast GR8:$src)),
              (VPBROADCASTBrr (COPY_TO_REGCLASS
                               (i32 (SUBREG_TO_REG (i32 0), GR8:$src, sub_8bit)),
                               VR128))>;
    def : Pat<(v32i8 (X86VBroadcast GR8:$src)),
              (VPBROADCASTBYrr (COPY_TO_REGCLASS
                                (i32 (SUBREG_TO_REG (i32 0), GR8:$src, sub_8bit)),
                                VR128))>;

    def : Pat<(v8i16 (X86VBroadcast GR16:$src)),
              (VPBROADCASTWrr (COPY_TO_REGCLASS
                               (i32 (SUBREG_TO_REG (i32 0), GR16:$src, sub_16bit)),
                               VR128))>;
    def : Pat<(v16i16 (X86VBroadcast GR16:$src)),
              (VPBROADCASTWYrr (COPY_TO_REGCLASS
                                (i32 (SUBREG_TO_REG (i32 0), GR16:$src, sub_16bit)),
                                VR128))>;

    // The patterns for VPBROADCASTD are not needed because they would match
    // the exact same thing as VBROADCASTSS patterns.

    def : Pat<(v2i64 (X86VBroadcast GR64:$src)),
              (VPBROADCASTQrr (COPY_TO_REGCLASS GR64:$src, VR128))>;
    // The v4i64 pattern is not needed because VBROADCASTSDYrr already matches.
  }
}

// AVX1 broadcast patterns
let Predicates = [HasAVX1Only] in {
def : Pat<(v8i32 (X86VBroadcast (loadi32 addr:$src))),
          (VBROADCASTSSYrm addr:$src)>;
def : Pat<(v4i64 (X86VBroadcast (loadi64 addr:$src))),
          (VBROADCASTSDYrm addr:$src)>;
def : Pat<(v4i32 (X86VBroadcast (loadi32 addr:$src))),
          (VBROADCASTSSrm addr:$src)>;
}
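// Note: AVX1 provides no integer broadcast instructions, so the patterns
// above reuse the floating-point broadcasts (vbroadcastss/vbroadcastsd) for
// broadcasted integer loads of the same element width.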
let Predicates = [HasAVX] in {
  // Provide fallback in case the load node that is used in the patterns above
  // is used by additional users, which prevents the pattern selection.
  let AddedComplexity = 20 in {
    // 128bit broadcasts:
    def : Pat<(v4f32 (X86VBroadcast FR32:$src)),
              (VPSHUFDri (COPY_TO_REGCLASS FR32:$src, VR128), 0)>;
    def : Pat<(v8f32 (X86VBroadcast FR32:$src)),
              (VINSERTF128rr (INSERT_SUBREG (v8f32 (IMPLICIT_DEF)),
                (VPSHUFDri (COPY_TO_REGCLASS FR32:$src, VR128), 0), sub_xmm),
                (VPSHUFDri (COPY_TO_REGCLASS FR32:$src, VR128), 0), 1)>;
    def : Pat<(v4f64 (X86VBroadcast FR64:$src)),
              (VINSERTF128rr (INSERT_SUBREG (v4f64 (IMPLICIT_DEF)),
                (VPSHUFDri (COPY_TO_REGCLASS FR64:$src, VR128), 0x44), sub_xmm),
                (VPSHUFDri (COPY_TO_REGCLASS FR64:$src, VR128), 0x44), 1)>;

    def : Pat<(v4i32 (X86VBroadcast GR32:$src)),
              (VPSHUFDri (COPY_TO_REGCLASS GR32:$src, VR128), 0)>;
    def : Pat<(v8i32 (X86VBroadcast GR32:$src)),
              (VINSERTF128rr (INSERT_SUBREG (v8i32 (IMPLICIT_DEF)),
                (VPSHUFDri (COPY_TO_REGCLASS GR32:$src, VR128), 0), sub_xmm),
                (VPSHUFDri (COPY_TO_REGCLASS GR32:$src, VR128), 0), 1)>;
    def : Pat<(v4i64 (X86VBroadcast GR64:$src)),
              (VINSERTF128rr (INSERT_SUBREG (v4i64 (IMPLICIT_DEF)),
                (VPSHUFDri (COPY_TO_REGCLASS GR64:$src, VR128), 0x44), sub_xmm),
                (VPSHUFDri (COPY_TO_REGCLASS GR64:$src, VR128), 0x44), 1)>;
  }

  def : Pat<(v2f64 (X86VBroadcast f64:$src)),
            (VMOVDDUPrr (COPY_TO_REGCLASS FR64:$src, VR128))>;
  def : Pat<(v2i64 (X86VBroadcast i64:$src)),
            (VMOVDDUPrr (COPY_TO_REGCLASS GR64:$src, VR128))>;
}

//===----------------------------------------------------------------------===//
// VPERM - Permute instructions
//

multiclass avx2_perm<bits<8> opc, string OpcodeStr, PatFrag mem_frag,
                     ValueType OpVT, X86FoldableSchedWrite Sched> {
  def Yrr : AVX28I<opc, MRMSrcReg, (outs VR256:$dst),
                   (ins VR256:$src1, VR256:$src2),
                   !strconcat(OpcodeStr,
                       "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
                   [(set VR256:$dst,
                     (OpVT (X86VPermv VR256:$src1, VR256:$src2)))]>,
                   Sched<[Sched]>, VEX_4V, VEX_L;
  def Yrm : AVX28I<opc, MRMSrcMem, (outs VR256:$dst),
                   (ins VR256:$src1, i256mem:$src2),
                   !strconcat(OpcodeStr,
                       "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
                   [(set VR256:$dst,
                     (OpVT (X86VPermv VR256:$src1,
                            (bitconvert (mem_frag addr:$src2)))))]>,
                   Sched<[Sched.Folded, ReadAfterLd]>, VEX_4V, VEX_L;
}

defm VPERMD : avx2_perm<0x36, "vpermd", loadv4i64, v8i32, WriteShuffle256>;
let ExeDomain = SSEPackedSingle in
defm VPERMPS : avx2_perm<0x16, "vpermps", loadv8f32, v8f32, WriteFShuffle256>;
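// Illustrative example: "vpermd %ymm1, %ymm2, %ymm0" selects each dword of
// ymm0 from ymm1 using the low three bits of the corresponding dword of
// ymm2, and may cross the 128-bit lane boundary (unlike vpermilps).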
multiclass avx2_perm_imm<bits<8> opc, string OpcodeStr, PatFrag mem_frag,
                         ValueType OpVT, X86FoldableSchedWrite Sched> {
  def Yri : AVX2AIi8<opc, MRMSrcReg, (outs VR256:$dst),
                     (ins VR256:$src1, u8imm:$src2),
                     !strconcat(OpcodeStr,
                         "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
                     [(set VR256:$dst,
                       (OpVT (X86VPermi VR256:$src1, (i8 imm:$src2))))]>,
                     Sched<[Sched]>, VEX, VEX_L;
  def Ymi : AVX2AIi8<opc, MRMSrcMem, (outs VR256:$dst),
                     (ins i256mem:$src1, u8imm:$src2),
                     !strconcat(OpcodeStr,
                         "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
                     [(set VR256:$dst,
                       (OpVT (X86VPermi (mem_frag addr:$src1),
                              (i8 imm:$src2))))]>,
                     Sched<[Sched.Folded, ReadAfterLd]>, VEX, VEX_L;
}

defm VPERMQ : avx2_perm_imm<0x00, "vpermq", loadv4i64, v4i64,
                            WriteShuffle256>, VEX_W;
let ExeDomain = SSEPackedDouble in
defm VPERMPD : avx2_perm_imm<0x01, "vpermpd", loadv4f64, v4f64,
                             WriteFShuffle256>, VEX_W;

//===----------------------------------------------------------------------===//
// VPERM2I128 - Permute Integer Values in 128-bit chunks
//
def VPERM2I128rr : AVX2AIi8<0x46, MRMSrcReg, (outs VR256:$dst),
          (ins VR256:$src1, VR256:$src2, u8imm:$src3),
          "vperm2i128\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
          [(set VR256:$dst, (v4i64 (X86VPerm2x128 VR256:$src1, VR256:$src2,
                                    (i8 imm:$src3))))]>, Sched<[WriteShuffle256]>,
          VEX_4V, VEX_L;
def VPERM2I128rm : AVX2AIi8<0x46, MRMSrcMem, (outs VR256:$dst),
          (ins VR256:$src1, f256mem:$src2, u8imm:$src3),
          "vperm2i128\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
          [(set VR256:$dst, (X86VPerm2x128 VR256:$src1, (loadv4i64 addr:$src2),
                                           (i8 imm:$src3)))]>,
          Sched<[WriteShuffle256Ld, ReadAfterLd]>, VEX_4V, VEX_L;

let Predicates = [HasAVX2] in {
def : Pat<(v8i32 (X86VPerm2x128 VR256:$src1, VR256:$src2, (i8 imm:$imm))),
          (VPERM2I128rr VR256:$src1, VR256:$src2, imm:$imm)>;
def : Pat<(v32i8 (X86VPerm2x128 VR256:$src1, VR256:$src2, (i8 imm:$imm))),
          (VPERM2I128rr VR256:$src1, VR256:$src2, imm:$imm)>;
def : Pat<(v16i16 (X86VPerm2x128 VR256:$src1, VR256:$src2, (i8 imm:$imm))),
          (VPERM2I128rr VR256:$src1, VR256:$src2, imm:$imm)>;

def : Pat<(v32i8 (X86VPerm2x128 VR256:$src1, (bc_v32i8 (loadv4i64 addr:$src2)),
                  (i8 imm:$imm))),
          (VPERM2I128rm VR256:$src1, addr:$src2, imm:$imm)>;
def : Pat<(v16i16 (X86VPerm2x128 VR256:$src1,
                   (bc_v16i16 (loadv4i64 addr:$src2)), (i8 imm:$imm))),
          (VPERM2I128rm VR256:$src1, addr:$src2, imm:$imm)>;
def : Pat<(v8i32 (X86VPerm2x128 VR256:$src1, (bc_v8i32 (loadv4i64 addr:$src2)),
                  (i8 imm:$imm))),
          (VPERM2I128rm VR256:$src1, addr:$src2, imm:$imm)>;
}
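// Illustrative example: "vperm2i128 $0x20, %ymm1, %ymm2, %ymm0" concatenates
// the low 128-bit halves of the two sources: imm bits [1:0] and [5:4] select
// a half for each destination lane, and bits 3 and 7 zero a lane instead.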
//===----------------------------------------------------------------------===//
// VINSERTI128 - Insert packed integer values
//
let hasSideEffects = 0 in {
def VINSERTI128rr : AVX2AIi8<0x38, MRMSrcReg, (outs VR256:$dst),
          (ins VR256:$src1, VR128:$src2, u8imm:$src3),
          "vinserti128\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
          []>, Sched<[WriteShuffle256]>, VEX_4V, VEX_L;
let mayLoad = 1 in
def VINSERTI128rm : AVX2AIi8<0x38, MRMSrcMem, (outs VR256:$dst),
          (ins VR256:$src1, i128mem:$src2, u8imm:$src3),
          "vinserti128\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
          []>, Sched<[WriteShuffle256Ld, ReadAfterLd]>, VEX_4V, VEX_L;
}

let Predicates = [HasAVX2, NoVLX] in {
def : Pat<(vinsert128_insert:$ins (v4i64 VR256:$src1), (v2i64 VR128:$src2),
                                  (iPTR imm)),
          (VINSERTI128rr VR256:$src1, VR128:$src2,
                         (INSERT_get_vinsert128_imm VR256:$ins))>;
def : Pat<(vinsert128_insert:$ins (v8i32 VR256:$src1), (v4i32 VR128:$src2),
                                  (iPTR imm)),
          (VINSERTI128rr VR256:$src1, VR128:$src2,
                         (INSERT_get_vinsert128_imm VR256:$ins))>;
def : Pat<(vinsert128_insert:$ins (v32i8 VR256:$src1), (v16i8 VR128:$src2),
                                  (iPTR imm)),
          (VINSERTI128rr VR256:$src1, VR128:$src2,
                         (INSERT_get_vinsert128_imm VR256:$ins))>;
def : Pat<(vinsert128_insert:$ins (v16i16 VR256:$src1), (v8i16 VR128:$src2),
                                  (iPTR imm)),
          (VINSERTI128rr VR256:$src1, VR128:$src2,
                         (INSERT_get_vinsert128_imm VR256:$ins))>;

def : Pat<(vinsert128_insert:$ins (v4i64 VR256:$src1), (loadv2i64 addr:$src2),
                                  (iPTR imm)),
          (VINSERTI128rm VR256:$src1, addr:$src2,
                         (INSERT_get_vinsert128_imm VR256:$ins))>;
def : Pat<(vinsert128_insert:$ins (v8i32 VR256:$src1),
                                  (bc_v4i32 (loadv2i64 addr:$src2)),
                                  (iPTR imm)),
          (VINSERTI128rm VR256:$src1, addr:$src2,
                         (INSERT_get_vinsert128_imm VR256:$ins))>;
def : Pat<(vinsert128_insert:$ins (v32i8 VR256:$src1),
                                  (bc_v16i8 (loadv2i64 addr:$src2)),
                                  (iPTR imm)),
          (VINSERTI128rm VR256:$src1, addr:$src2,
                         (INSERT_get_vinsert128_imm VR256:$ins))>;
def : Pat<(vinsert128_insert:$ins (v16i16 VR256:$src1),
                                  (bc_v8i16 (loadv2i64 addr:$src2)),
                                  (iPTR imm)),
          (VINSERTI128rm VR256:$src1, addr:$src2,
                         (INSERT_get_vinsert128_imm VR256:$ins))>;
}

//===----------------------------------------------------------------------===//
// VEXTRACTI128 - Extract packed integer values
//
def VEXTRACTI128rr : AVX2AIi8<0x39, MRMDestReg, (outs VR128:$dst),
          (ins VR256:$src1, u8imm:$src2),
          "vextracti128\t{$src2, $src1, $dst|$dst, $src1, $src2}", []>,
          Sched<[WriteShuffle256]>, VEX, VEX_L;
let hasSideEffects = 0, mayStore = 1 in
def VEXTRACTI128mr : AVX2AIi8<0x39, MRMDestMem, (outs),
          (ins i128mem:$dst, VR256:$src1, u8imm:$src2),
          "vextracti128\t{$src2, $src1, $dst|$dst, $src1, $src2}", []>,
          Sched<[WriteStore]>, VEX, VEX_L;

let Predicates = [HasAVX2] in {
def : Pat<(vextract128_extract:$ext VR256:$src1, (iPTR imm)),
          (v2i64 (VEXTRACTI128rr
                    (v4i64 VR256:$src1),
                    (EXTRACT_get_vextract128_imm VR128:$ext)))>;
def : Pat<(vextract128_extract:$ext VR256:$src1, (iPTR imm)),
          (v4i32 (VEXTRACTI128rr
                    (v8i32 VR256:$src1),
                    (EXTRACT_get_vextract128_imm VR128:$ext)))>;
def : Pat<(vextract128_extract:$ext VR256:$src1, (iPTR imm)),
          (v8i16 (VEXTRACTI128rr
                    (v16i16 VR256:$src1),
                    (EXTRACT_get_vextract128_imm VR128:$ext)))>;
def : Pat<(vextract128_extract:$ext VR256:$src1, (iPTR imm)),
          (v16i8 (VEXTRACTI128rr
                    (v32i8 VR256:$src1),
                    (EXTRACT_get_vextract128_imm VR128:$ext)))>;

def : Pat<(store (v2i64 (vextract128_extract:$ext (v4i64 VR256:$src1),
                                                  (iPTR imm))), addr:$dst),
          (VEXTRACTI128mr addr:$dst, VR256:$src1,
                          (EXTRACT_get_vextract128_imm VR128:$ext))>;
def : Pat<(store (v4i32 (vextract128_extract:$ext (v8i32 VR256:$src1),
                                                  (iPTR imm))), addr:$dst),
          (VEXTRACTI128mr addr:$dst, VR256:$src1,
                          (EXTRACT_get_vextract128_imm VR128:$ext))>;
def : Pat<(store (v8i16 (vextract128_extract:$ext (v16i16 VR256:$src1),
                                                  (iPTR imm))), addr:$dst),
          (VEXTRACTI128mr addr:$dst, VR256:$src1,
                          (EXTRACT_get_vextract128_imm VR128:$ext))>;
def : Pat<(store (v16i8 (vextract128_extract:$ext (v32i8 VR256:$src1),
                                                  (iPTR imm))), addr:$dst),
          (VEXTRACTI128mr addr:$dst, VR256:$src1,
                          (EXTRACT_get_vextract128_imm VR128:$ext))>;
}
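// Illustrative example: "vinserti128 $1, %xmm1, %ymm2, %ymm0" replaces the
// upper half of ymm2's value with xmm1, and "vextracti128 $1, %ymm0, %xmm1"
// is the inverse; the immediate selects the 128-bit lane in both cases.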
//===----------------------------------------------------------------------===//
// VPMASKMOV - Conditional SIMD Integer Packed Loads and Stores
//
multiclass avx2_pmovmask<string OpcodeStr,
                         Intrinsic IntLd128, Intrinsic IntLd256,
                         Intrinsic IntSt128, Intrinsic IntSt256> {
  def rm  : AVX28I<0x8c, MRMSrcMem, (outs VR128:$dst),
             (ins VR128:$src1, i128mem:$src2),
             !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
             [(set VR128:$dst, (IntLd128 addr:$src2, VR128:$src1))]>, VEX_4V;
  def Yrm : AVX28I<0x8c, MRMSrcMem, (outs VR256:$dst),
             (ins VR256:$src1, i256mem:$src2),
             !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
             [(set VR256:$dst, (IntLd256 addr:$src2, VR256:$src1))]>,
             VEX_4V, VEX_L;
  def mr  : AVX28I<0x8e, MRMDestMem, (outs),
             (ins i128mem:$dst, VR128:$src1, VR128:$src2),
             !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
             [(IntSt128 addr:$dst, VR128:$src1, VR128:$src2)]>, VEX_4V;
  def Ymr : AVX28I<0x8e, MRMDestMem, (outs),
             (ins i256mem:$dst, VR256:$src1, VR256:$src2),
             !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
             [(IntSt256 addr:$dst, VR256:$src1, VR256:$src2)]>, VEX_4V, VEX_L;
}

defm VPMASKMOVD : avx2_pmovmask<"vpmaskmovd",
                                int_x86_avx2_maskload_d,
                                int_x86_avx2_maskload_d_256,
                                int_x86_avx2_maskstore_d,
                                int_x86_avx2_maskstore_d_256>;
defm VPMASKMOVQ : avx2_pmovmask<"vpmaskmovq",
                                int_x86_avx2_maskload_q,
                                int_x86_avx2_maskload_q_256,
                                int_x86_avx2_maskstore_q,
                                int_x86_avx2_maskstore_q_256>, VEX_W;
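// The patterns below select generic masked_load/X86mstore nodes onto the
// AVX/AVX2 maskmov instructions. When a masked load has a non-zero
// pass-through value, a variable blend merges it in, e.g. (illustrative):
//   vmaskmovps (%rdi), %ymm1, %ymm2
//   vblendvps  %ymm1, %ymm2, %ymm0, %ymm0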
def: Pat<(X86mstore addr:$ptr, (v8i32 VR256:$mask), (v8f32 VR256:$src)),
         (VMASKMOVPSYmr addr:$ptr, VR256:$mask, VR256:$src)>;

def: Pat<(X86mstore addr:$ptr, (v8i32 VR256:$mask), (v8i32 VR256:$src)),
         (VPMASKMOVDYmr addr:$ptr, VR256:$mask, VR256:$src)>;

def: Pat<(X86mstore addr:$ptr, (v4i32 VR128:$mask), (v4f32 VR128:$src)),
         (VMASKMOVPSmr addr:$ptr, VR128:$mask, VR128:$src)>;

def: Pat<(X86mstore addr:$ptr, (v4i32 VR128:$mask), (v4i32 VR128:$src)),
         (VPMASKMOVDmr addr:$ptr, VR128:$mask, VR128:$src)>;

def: Pat<(v8f32 (masked_load addr:$ptr, (v8i32 VR256:$mask), undef)),
         (VMASKMOVPSYrm VR256:$mask, addr:$ptr)>;

def: Pat<(v8f32 (masked_load addr:$ptr, (v8i32 VR256:$mask),
                             (bc_v8f32 (v8i32 immAllZerosV)))),
         (VMASKMOVPSYrm VR256:$mask, addr:$ptr)>;

def: Pat<(v8f32 (masked_load addr:$ptr, (v8i32 VR256:$mask), (v8f32 VR256:$src0))),
         (VBLENDVPSYrr VR256:$src0, (VMASKMOVPSYrm VR256:$mask, addr:$ptr),
                       VR256:$mask)>;

def: Pat<(v8i32 (masked_load addr:$ptr, (v8i32 VR256:$mask), undef)),
         (VPMASKMOVDYrm VR256:$mask, addr:$ptr)>;

def: Pat<(v8i32 (masked_load addr:$ptr, (v8i32 VR256:$mask), (v8i32 immAllZerosV))),
         (VPMASKMOVDYrm VR256:$mask, addr:$ptr)>;

def: Pat<(v8i32 (masked_load addr:$ptr, (v8i32 VR256:$mask), (v8i32 VR256:$src0))),
         (VBLENDVPSYrr VR256:$src0, (VPMASKMOVDYrm VR256:$mask, addr:$ptr),
                       VR256:$mask)>;

def: Pat<(v4f32 (masked_load addr:$ptr, (v4i32 VR128:$mask), undef)),
         (VMASKMOVPSrm VR128:$mask, addr:$ptr)>;

def: Pat<(v4f32 (masked_load addr:$ptr, (v4i32 VR128:$mask),
                             (bc_v4f32 (v4i32 immAllZerosV)))),
         (VMASKMOVPSrm VR128:$mask, addr:$ptr)>;

def: Pat<(v4f32 (masked_load addr:$ptr, (v4i32 VR128:$mask), (v4f32 VR128:$src0))),
         (VBLENDVPSrr VR128:$src0, (VMASKMOVPSrm VR128:$mask, addr:$ptr),
                      VR128:$mask)>;

def: Pat<(v4i32 (masked_load addr:$ptr, (v4i32 VR128:$mask), undef)),
         (VPMASKMOVDrm VR128:$mask, addr:$ptr)>;

def: Pat<(v4i32 (masked_load addr:$ptr, (v4i32 VR128:$mask), (v4i32 immAllZerosV))),
         (VPMASKMOVDrm VR128:$mask, addr:$ptr)>;

def: Pat<(v4i32 (masked_load addr:$ptr, (v4i32 VR128:$mask), (v4i32 VR128:$src0))),
         (VBLENDVPSrr VR128:$src0, (VPMASKMOVDrm VR128:$mask, addr:$ptr),
                      VR128:$mask)>;

def: Pat<(X86mstore addr:$ptr, (v4i64 VR256:$mask), (v4f64 VR256:$src)),
         (VMASKMOVPDYmr addr:$ptr, VR256:$mask, VR256:$src)>;

def: Pat<(X86mstore addr:$ptr, (v4i64 VR256:$mask), (v4i64 VR256:$src)),
         (VPMASKMOVQYmr addr:$ptr, VR256:$mask, VR256:$src)>;

def: Pat<(v4f64 (masked_load addr:$ptr, (v4i64 VR256:$mask), undef)),
         (VMASKMOVPDYrm VR256:$mask, addr:$ptr)>;

def: Pat<(v4f64 (masked_load addr:$ptr, (v4i64 VR256:$mask),
                             (v4f64 immAllZerosV))),
         (VMASKMOVPDYrm VR256:$mask, addr:$ptr)>;

def: Pat<(v4f64 (masked_load addr:$ptr, (v4i64 VR256:$mask), (v4f64 VR256:$src0))),
         (VBLENDVPDYrr VR256:$src0, (VMASKMOVPDYrm VR256:$mask, addr:$ptr),
                       VR256:$mask)>;

def: Pat<(v4i64 (masked_load addr:$ptr, (v4i64 VR256:$mask), undef)),
         (VPMASKMOVQYrm VR256:$mask, addr:$ptr)>;

def: Pat<(v4i64 (masked_load addr:$ptr, (v4i64 VR256:$mask),
                             (bc_v4i64 (v8i32 immAllZerosV)))),
         (VPMASKMOVQYrm VR256:$mask, addr:$ptr)>;

def: Pat<(v4i64 (masked_load addr:$ptr, (v4i64 VR256:$mask), (v4i64 VR256:$src0))),
         (VBLENDVPDYrr VR256:$src0, (VPMASKMOVQYrm VR256:$mask, addr:$ptr),
                       VR256:$mask)>;

def: Pat<(X86mstore addr:$ptr, (v2i64 VR128:$mask), (v2f64 VR128:$src)),
         (VMASKMOVPDmr addr:$ptr, VR128:$mask, VR128:$src)>;

def: Pat<(X86mstore addr:$ptr, (v2i64 VR128:$mask), (v2i64 VR128:$src)),
         (VPMASKMOVQmr addr:$ptr, VR128:$mask, VR128:$src)>;

def: Pat<(v2f64 (masked_load addr:$ptr, (v2i64 VR128:$mask), undef)),
         (VMASKMOVPDrm VR128:$mask, addr:$ptr)>;

def: Pat<(v2f64 (masked_load addr:$ptr, (v2i64 VR128:$mask),
                             (v2f64 immAllZerosV))),
         (VMASKMOVPDrm VR128:$mask, addr:$ptr)>;

def: Pat<(v2f64 (masked_load addr:$ptr, (v2i64 VR128:$mask), (v2f64 VR128:$src0))),
         (VBLENDVPDrr VR128:$src0, (VMASKMOVPDrm VR128:$mask, addr:$ptr),
                      VR128:$mask)>;

def: Pat<(v2i64 (masked_load addr:$ptr, (v2i64 VR128:$mask), undef)),
         (VPMASKMOVQrm VR128:$mask, addr:$ptr)>;

def: Pat<(v2i64 (masked_load addr:$ptr, (v2i64 VR128:$mask),
                             (bc_v2i64 (v4i32 immAllZerosV)))),
         (VPMASKMOVQrm VR128:$mask, addr:$ptr)>;

def: Pat<(v2i64 (masked_load addr:$ptr, (v2i64 VR128:$mask), (v2i64 VR128:$src0))),
         (VBLENDVPDrr VR128:$src0, (VPMASKMOVQrm VR128:$mask, addr:$ptr),
                      VR128:$mask)>;

//===----------------------------------------------------------------------===//
// Variable Bit Shifts
//
multiclass avx2_var_shift<bits<8> opc, string OpcodeStr, SDNode OpNode,
                          ValueType vt128, ValueType vt256> {
  def rr  : AVX28I<opc, MRMSrcReg, (outs VR128:$dst),
             (ins VR128:$src1, VR128:$src2),
             !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
             [(set VR128:$dst,
               (vt128 (OpNode VR128:$src1, (vt128 VR128:$src2))))]>,
             VEX_4V, Sched<[WriteVarVecShift]>;
  def rm  : AVX28I<opc, MRMSrcMem, (outs VR128:$dst),
             (ins VR128:$src1, i128mem:$src2),
             !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
             [(set VR128:$dst,
               (vt128 (OpNode VR128:$src1,
                       (vt128 (bitconvert (loadv2i64 addr:$src2))))))]>,
             VEX_4V, Sched<[WriteVarVecShiftLd, ReadAfterLd]>;
  def Yrr : AVX28I<opc, MRMSrcReg, (outs VR256:$dst),
             (ins VR256:$src1, VR256:$src2),
             !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
             [(set VR256:$dst,
               (vt256 (OpNode VR256:$src1, (vt256 VR256:$src2))))]>,
             VEX_4V, VEX_L, Sched<[WriteVarVecShift]>;
  def Yrm : AVX28I<opc, MRMSrcMem, (outs VR256:$dst),
             (ins VR256:$src1, i256mem:$src2),
             !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
             [(set VR256:$dst,
               (vt256 (OpNode VR256:$src1,
                       (vt256 (bitconvert (loadv4i64 addr:$src2))))))]>,
             VEX_4V, VEX_L, Sched<[WriteVarVecShiftLd, ReadAfterLd]>;
}

defm VPSLLVD : avx2_var_shift<0x47, "vpsllvd", shl, v4i32, v8i32>;
defm VPSLLVQ : avx2_var_shift<0x47, "vpsllvq", shl, v2i64, v4i64>, VEX_W;
defm VPSRLVD : avx2_var_shift<0x45, "vpsrlvd", srl, v4i32, v8i32>;
defm VPSRLVQ : avx2_var_shift<0x45, "vpsrlvq", srl, v2i64, v4i64>, VEX_W;
defm VPSRAVD : avx2_var_shift<0x46, "vpsravd", sra, v4i32, v8i32>;
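// Illustrative example: "vpsllvd %ymm1, %ymm2, %ymm0" shifts each dword of
// ymm2 left by the count held in the corresponding dword of ymm1;
// out-of-range counts yield zero (vpsravd instead fills with the sign bit).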
//===----------------------------------------------------------------------===//
// VGATHER - GATHER Operations
multiclass avx2_gather<bits<8> opc, string OpcodeStr, RegisterClass RC256,
                       X86MemOperand memop128, X86MemOperand memop256> {
  def rm  : AVX28I<opc, MRMSrcMem, (outs VR128:$dst, VR128:$mask_wb),
            (ins VR128:$src1, memop128:$src2, VR128:$mask),
            !strconcat(OpcodeStr,
              "\t{$mask, $src2, $dst|$dst, $src2, $mask}"),
            []>, VEX_4VOp3;
  def Yrm : AVX28I<opc, MRMSrcMem, (outs RC256:$dst, RC256:$mask_wb),
            (ins RC256:$src1, memop256:$src2, RC256:$mask),
            !strconcat(OpcodeStr,
              "\t{$mask, $src2, $dst|$dst, $src2, $mask}"),
            []>, VEX_4VOp3, VEX_L;
}

let mayLoad = 1, Constraints
  = "@earlyclobber $dst,@earlyclobber $mask_wb, $src1 = $dst, $mask = $mask_wb"
  in {
  defm VPGATHERDQ : avx2_gather<0x90, "vpgatherdq", VR256, vx64mem, vx64mem>, VEX_W;
  defm VPGATHERQQ : avx2_gather<0x91, "vpgatherqq", VR256, vx64mem, vy64mem>, VEX_W;
  defm VPGATHERDD : avx2_gather<0x90, "vpgatherdd", VR256, vx32mem, vy32mem>;
  defm VPGATHERQD : avx2_gather<0x91, "vpgatherqd", VR128, vx32mem, vy32mem>;

  let ExeDomain = SSEPackedDouble in {
    defm VGATHERDPD : avx2_gather<0x92, "vgatherdpd", VR256, vx64mem, vx64mem>, VEX_W;
    defm VGATHERQPD : avx2_gather<0x93, "vgatherqpd", VR256, vx64mem, vy64mem>, VEX_W;
  }

  let ExeDomain = SSEPackedSingle in {
    defm VGATHERDPS : avx2_gather<0x92, "vgatherdps", VR256, vx32mem, vy32mem>;
    defm VGATHERQPS : avx2_gather<0x93, "vgatherqps", VR128, vx32mem, vy32mem>;
  }
}
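// Illustrative example: "vpgatherdd %ymm2, (%rdi,%ymm1,4), %ymm0" loads each
// dword element from rdi + 4*index under control of mask ymm2. The mask is
// cleared as elements complete, which is why it is modeled above as both an
// input and an output ($mask_wb) with @earlyclobber constraints.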
//===----------------------------------------------------------------------===//
// Extra selection patterns for FR128, f128, f128mem

// movaps is shorter than movdqa. movaps is in SSE and movdqa is in SSE2.
def : Pat<(store (f128 FR128:$src), addr:$dst),
          (MOVAPSmr addr:$dst, (COPY_TO_REGCLASS (f128 FR128:$src), VR128))>;

def : Pat<(loadf128 addr:$src),
          (COPY_TO_REGCLASS (MOVAPSrm addr:$src), FR128)>;

// andps is shorter than andpd or pand. andps is in SSE and andpd/pand are in SSE2.
def : Pat<(X86fand FR128:$src1, (loadf128 addr:$src2)),
          (COPY_TO_REGCLASS
            (ANDPSrm (COPY_TO_REGCLASS FR128:$src1, VR128), f128mem:$src2),
            FR128)>;

def : Pat<(X86fand FR128:$src1, FR128:$src2),
          (COPY_TO_REGCLASS
            (ANDPSrr (COPY_TO_REGCLASS FR128:$src1, VR128),
                     (COPY_TO_REGCLASS FR128:$src2, VR128)), FR128)>;

def : Pat<(and FR128:$src1, FR128:$src2),
          (COPY_TO_REGCLASS
            (ANDPSrr (COPY_TO_REGCLASS FR128:$src1, VR128),
                     (COPY_TO_REGCLASS FR128:$src2, VR128)), FR128)>;

def : Pat<(X86for FR128:$src1, (loadf128 addr:$src2)),
          (COPY_TO_REGCLASS
            (ORPSrm (COPY_TO_REGCLASS FR128:$src1, VR128), f128mem:$src2),
            FR128)>;

def : Pat<(X86for FR128:$src1, FR128:$src2),
          (COPY_TO_REGCLASS
            (ORPSrr (COPY_TO_REGCLASS FR128:$src1, VR128),
                    (COPY_TO_REGCLASS FR128:$src2, VR128)), FR128)>;

def : Pat<(or FR128:$src1, FR128:$src2),
          (COPY_TO_REGCLASS
            (ORPSrr (COPY_TO_REGCLASS FR128:$src1, VR128),
                    (COPY_TO_REGCLASS FR128:$src2, VR128)), FR128)>;

def : Pat<(X86fxor FR128:$src1, (loadf128 addr:$src2)),
          (COPY_TO_REGCLASS
            (XORPSrm (COPY_TO_REGCLASS FR128:$src1, VR128), f128mem:$src2),
            FR128)>;

def : Pat<(X86fxor FR128:$src1, FR128:$src2),
          (COPY_TO_REGCLASS
            (XORPSrr (COPY_TO_REGCLASS FR128:$src1, VR128),
                     (COPY_TO_REGCLASS FR128:$src2, VR128)), FR128)>;

def : Pat<(xor FR128:$src1, FR128:$src2),
          (COPY_TO_REGCLASS
            (XORPSrr (COPY_TO_REGCLASS FR128:$src1, VR128),
                     (COPY_TO_REGCLASS FR128:$src2, VR128)), FR128)>;
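// For example, (and FR128:$src1, FR128:$src2) selects to a single
// "andps %xmm1, %xmm0": the COPY_TO_REGCLASS nodes are normally coalesced
// away, since FR128 and VR128 occupy the same XMM register file.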