//===-- X86InstrSSE.td - SSE Instruction Set ---------------*- tablegen -*-===//
//
//                     The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
// This file describes the X86 SSE instruction set, defining the instructions,
// and properties of the instructions which are needed for code generation,
// machine code emission, and analysis.
//
//===----------------------------------------------------------------------===//

//===----------------------------------------------------------------------===//
// SSE 1 & 2 Instruction Classes
//===----------------------------------------------------------------------===//

/// sse12_fp_scalar - SSE 1 & 2 scalar instructions class
multiclass sse12_fp_scalar<bits<8> opc, string OpcodeStr, SDNode OpNode,
                           RegisterClass RC, X86MemOperand x86memop,
                           Domain d, X86FoldableSchedWrite sched,
                           bit Is2Addr = 1> {
  let isCommutable = 1 in {
  def rr : SI<opc, MRMSrcReg, (outs RC:$dst), (ins RC:$src1, RC:$src2),
              !if(Is2Addr,
                  !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
                  !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
              [(set RC:$dst, (OpNode RC:$src1, RC:$src2))], d>,
              Sched<[sched]>;
  }
  def rm : SI<opc, MRMSrcMem, (outs RC:$dst), (ins RC:$src1, x86memop:$src2),
              !if(Is2Addr,
                  !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
                  !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
              [(set RC:$dst, (OpNode RC:$src1, (load addr:$src2)))], d>,
              Sched<[sched.Folded, ReadAfterLd]>;
}

/// sse12_fp_scalar_int - SSE 1 & 2 scalar instructions intrinsics class
multiclass sse12_fp_scalar_int<bits<8> opc, string OpcodeStr,
                               SDPatternOperator OpNode, RegisterClass RC,
                               ValueType VT, string asm, Operand memopr,
                               ComplexPattern mem_cpat, Domain d,
                               X86FoldableSchedWrite sched, bit Is2Addr = 1> {
let isCodeGenOnly = 1, hasSideEffects = 0 in {
  def rr_Int : SI_Int<opc, MRMSrcReg, (outs RC:$dst), (ins RC:$src1, RC:$src2),
               !if(Is2Addr,
                   !strconcat(asm, "\t{$src2, $dst|$dst, $src2}"),
                   !strconcat(asm, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
               [(set RC:$dst, (VT (OpNode RC:$src1, RC:$src2)))], d>,
               Sched<[sched]>;
  let mayLoad = 1 in
  def rm_Int : SI_Int<opc, MRMSrcMem, (outs RC:$dst), (ins RC:$src1, memopr:$src2),
               !if(Is2Addr,
                   !strconcat(asm, "\t{$src2, $dst|$dst, $src2}"),
                   !strconcat(asm, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
               [(set RC:$dst, (VT (OpNode RC:$src1, mem_cpat:$src2)))], d>,
               Sched<[sched.Folded, ReadAfterLd]>;
}
}

/// sse12_fp_packed - SSE 1 & 2 packed instructions class
multiclass sse12_fp_packed<bits<8> opc, string OpcodeStr, SDNode OpNode,
                           RegisterClass RC, ValueType vt,
                           X86MemOperand x86memop, PatFrag mem_frag,
                           Domain d, X86FoldableSchedWrite sched,
                           bit Is2Addr = 1> {
  let isCommutable = 1 in
  def rr : PI<opc, MRMSrcReg, (outs RC:$dst), (ins RC:$src1, RC:$src2),
              !if(Is2Addr,
                  !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
                  !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
              [(set RC:$dst, (vt (OpNode RC:$src1, RC:$src2)))], d>,
              Sched<[sched]>;
  let mayLoad = 1 in
  def rm : PI<opc, MRMSrcMem, (outs RC:$dst), (ins RC:$src1, x86memop:$src2),
              !if(Is2Addr,
                  !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
                  !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
              [(set RC:$dst, (OpNode RC:$src1, (mem_frag addr:$src2)))],
              d>,
              Sched<[sched.Folded, ReadAfterLd]>;
}

/// sse12_fp_packed_logical_rm - SSE 1 & 2 packed instructions class
multiclass sse12_fp_packed_logical_rm<bits<8> opc, RegisterClass RC, Domain d,
                                      string OpcodeStr, X86MemOperand x86memop,
                                      X86FoldableSchedWrite sched,
                                      list<dag> pat_rr, list<dag> pat_rm,
                                      bit Is2Addr = 1> {
  let isCommutable = 1, hasSideEffects = 0 in
  def rr : PI<opc, MRMSrcReg, (outs RC:$dst), (ins RC:$src1, RC:$src2),
              !if(Is2Addr,
                  !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
                  !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
              pat_rr, d>,
              Sched<[sched]>;
  let hasSideEffects = 0, mayLoad = 1 in
  def rm : PI<opc, MRMSrcMem, (outs RC:$dst), (ins RC:$src1, x86memop:$src2),
              !if(Is2Addr,
                  !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
                  !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
              pat_rm, d>,
              Sched<[sched.Folded, ReadAfterLd]>;
}


// Alias instructions that map fld0 to xorps for SSE or vxorps for AVX.
// This is expanded by ExpandPostRAPseudos.
let isReMaterializable = 1, isAsCheapAsAMove = 1, canFoldAsLoad = 1,
    isPseudo = 1, SchedRW = [WriteZero] in {
  def FsFLD0SS : I<0, Pseudo, (outs FR32:$dst), (ins), "",
                   [(set FR32:$dst, fp32imm0)]>, Requires<[HasSSE1, NoAVX512]>;
  def FsFLD0SD : I<0, Pseudo, (outs FR64:$dst), (ins), "",
                   [(set FR64:$dst, fpimm0)]>, Requires<[HasSSE2, NoAVX512]>;
}

//===----------------------------------------------------------------------===//
// AVX & SSE - Zero/One Vectors
//===----------------------------------------------------------------------===//

// Alias instruction that maps zero vector to pxor / xorp* for SSE.
// This is expanded by ExpandPostRAPseudos to an xorps / vxorps, and then
// swizzled by ExecutionDomainFix to pxor.
// We set canFoldAsLoad because this can be converted to a constant-pool
// load of an all-zeros value if folding it would be beneficial.
let isReMaterializable = 1, isAsCheapAsAMove = 1, canFoldAsLoad = 1,
    isPseudo = 1, SchedRW = [WriteZero] in {
def V_SET0 : I<0, Pseudo, (outs VR128:$dst), (ins), "",
               [(set VR128:$dst, (v4f32 immAllZerosV))]>;
}

let Predicates = [NoAVX512] in
def : Pat<(v4i32 immAllZerosV), (V_SET0)>;


// The same as above, but for AVX. The 256-bit AVX1 ISA doesn't support PI,
// and doesn't need it because on Sandy Bridge the register is set to zero
// at the rename stage without using any execution unit, so SET0PSY
// and SET0PDY can be used for vector int instructions without penalty.
let isReMaterializable = 1, isAsCheapAsAMove = 1, canFoldAsLoad = 1,
    isPseudo = 1, Predicates = [NoAVX512], SchedRW = [WriteZero] in {
def AVX_SET0 : I<0, Pseudo, (outs VR256:$dst), (ins), "",
                 [(set VR256:$dst, (v8i32 immAllZerosV))]>;
}

// We set canFoldAsLoad because this can be converted to a constant-pool
// load of an all-ones value if folding it would be beneficial.
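// Like V_SET0, the all-ones pseudos below are expanded after register
// allocation by ExpandPostRAPseudos (typically to a pcmpeqd-style
// "compare a register with itself" idiom appropriate for the available ISA).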
let isReMaterializable = 1, isAsCheapAsAMove = 1, canFoldAsLoad = 1,
    isPseudo = 1, SchedRW = [WriteZero] in {
  def V_SETALLONES : I<0, Pseudo, (outs VR128:$dst), (ins), "",
                       [(set VR128:$dst, (v4i32 immAllOnesV))]>;
  let Predicates = [HasAVX1Only, OptForMinSize] in {
  def AVX1_SETALLONES: I<0, Pseudo, (outs VR256:$dst), (ins), "",
                         [(set VR256:$dst, (v8i32 immAllOnesV))]>;
  }
  let Predicates = [HasAVX2] in
  def AVX2_SETALLONES : I<0, Pseudo, (outs VR256:$dst), (ins), "",
                          [(set VR256:$dst, (v8i32 immAllOnesV))]>;
}

//===----------------------------------------------------------------------===//
// SSE 1 & 2 - Move FP Scalar Instructions
//
// Move Instructions. Register-to-register movss/movsd is not used for FR32/64
// register copies because it's a partial register update; register-to-register
// movss/movsd is not modeled as an INSERT_SUBREG because INSERT_SUBREG requires
// that the insert be implementable in terms of a copy, and, as just mentioned,
// we don't use movss/movsd for copies.
//===----------------------------------------------------------------------===//

multiclass sse12_move_rr<SDNode OpNode, ValueType vt,
                         X86MemOperand x86memop, string base_opc,
                         string asm_opr, Domain d, string Name> {
  let isCommutable = 1 in
  def rr : SI<0x10, MRMSrcReg, (outs VR128:$dst),
              (ins VR128:$src1, VR128:$src2),
              !strconcat(base_opc, asm_opr),
              [(set VR128:$dst, (vt (OpNode VR128:$src1, VR128:$src2)))], d>,
              Sched<[SchedWriteFShuffle.XMM]>;

  // For the disassembler
  let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0 in
  def rr_REV : SI<0x11, MRMDestReg, (outs VR128:$dst),
                  (ins VR128:$src1, VR128:$src2),
                  !strconcat(base_opc, asm_opr), []>,
                  Sched<[SchedWriteFShuffle.XMM]>, FoldGenData<Name#rr>;
}

multiclass sse12_move<RegisterClass RC, SDNode OpNode, ValueType vt,
                      X86MemOperand x86memop, string OpcodeStr,
                      Domain d, string Name, Predicate pred> {
  // AVX
  let Predicates = [UseAVX, OptForSize] in
  defm V#NAME : sse12_move_rr<OpNode, vt, x86memop, OpcodeStr,
                              "\t{$src2, $src1, $dst|$dst, $src1, $src2}", d,
                              "V"#Name>,
                              VEX_4V, VEX_LIG, VEX_WIG;

  def V#NAME#mr : SI<0x11, MRMDestMem, (outs), (ins x86memop:$dst, RC:$src),
                     !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
                     [(store RC:$src, addr:$dst)], d>,
                     VEX, VEX_LIG, Sched<[WriteFStore]>, VEX_WIG;
  // SSE1 & 2
  let Constraints = "$src1 = $dst" in {
  let Predicates = [pred, NoSSE41_Or_OptForSize] in
  defm NAME : sse12_move_rr<OpNode, vt, x86memop, OpcodeStr,
                            "\t{$src2, $dst|$dst, $src2}", d, Name>;
  }

  def NAME#mr : SI<0x11, MRMDestMem, (outs), (ins x86memop:$dst, RC:$src),
                   !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
                   [(store RC:$src, addr:$dst)], d>,
                   Sched<[WriteFStore]>;

  def : InstAlias<"v"#OpcodeStr#".s\t{$src2, $src1, $dst|$dst, $src1, $src2}",
                  (!cast<Instruction>("V"#NAME#"rr_REV")
                   VR128:$dst, VR128:$src1, VR128:$src2), 0>;
  def : InstAlias<OpcodeStr#".s\t{$src2, $dst|$dst, $src2}",
                  (!cast<Instruction>(NAME#"rr_REV")
                   VR128:$dst, VR128:$src2), 0>;
}

// Loading from memory automatically zeroes the upper bits.
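// A scalar (V)MOVSS/(V)MOVSD load writes element 0 and clears the remaining
// bits of the destination XMM register (the VEX forms also clear the upper
// YMM lanes), which is what the X86vzmovl/X86vzload patterns below rely on.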
multiclass sse12_move_rm<RegisterClass RC, X86MemOperand x86memop,
                         PatFrag mem_pat, string OpcodeStr, Domain d> {
  def V#NAME#rm : SI<0x10, MRMSrcMem, (outs RC:$dst), (ins x86memop:$src),
                     !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
                     [(set RC:$dst, (mem_pat addr:$src))], d>,
                     VEX, VEX_LIG, Sched<[WriteFLoad]>, VEX_WIG;
  def NAME#rm : SI<0x10, MRMSrcMem, (outs RC:$dst), (ins x86memop:$src),
                   !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
                   [(set RC:$dst, (mem_pat addr:$src))], d>,
                   Sched<[WriteFLoad]>;
}

defm MOVSS : sse12_move<FR32, X86Movss, v4f32, f32mem, "movss",
                        SSEPackedSingle, "MOVSS", UseSSE1>, XS;
defm MOVSD : sse12_move<FR64, X86Movsd, v2f64, f64mem, "movsd",
                        SSEPackedDouble, "MOVSD", UseSSE2>, XD;

let canFoldAsLoad = 1, isReMaterializable = 1 in {
  defm MOVSS : sse12_move_rm<FR32, f32mem, loadf32, "movss",
                             SSEPackedSingle>, XS;
  defm MOVSD : sse12_move_rm<FR64, f64mem, loadf64, "movsd",
                             SSEPackedDouble>, XD;
}

// Patterns
let Predicates = [UseAVX] in {
  // MOVSSrm zeros the high parts of the register; represent this
  // with SUBREG_TO_REG. The AVX versions also write: DST[255:128] <- 0
  def : Pat<(v4f32 (X86vzmovl (v4f32 (scalar_to_vector (loadf32 addr:$src))))),
            (COPY_TO_REGCLASS (VMOVSSrm addr:$src), VR128)>;
  def : Pat<(v4f32 (X86vzmovl (loadv4f32 addr:$src))),
            (COPY_TO_REGCLASS (VMOVSSrm addr:$src), VR128)>;
  def : Pat<(v4f32 (X86vzload addr:$src)),
            (COPY_TO_REGCLASS (VMOVSSrm addr:$src), VR128)>;

  // MOVSDrm zeros the high parts of the register; represent this
  // with SUBREG_TO_REG. The AVX versions also write: DST[255:128] <- 0
  def : Pat<(v2f64 (X86vzmovl (v2f64 (scalar_to_vector (loadf64 addr:$src))))),
            (COPY_TO_REGCLASS (VMOVSDrm addr:$src), VR128)>;
  def : Pat<(v2f64 (X86vzmovl (loadv2f64 addr:$src))),
            (COPY_TO_REGCLASS (VMOVSDrm addr:$src), VR128)>;
  def : Pat<(v2f64 (X86vzmovl (bc_v2f64 (loadv4f32 addr:$src)))),
            (COPY_TO_REGCLASS (VMOVSDrm addr:$src), VR128)>;
  def : Pat<(v2f64 (X86vzload addr:$src)),
            (COPY_TO_REGCLASS (VMOVSDrm addr:$src), VR128)>;

  // Represent the same patterns as above but in the form they appear for
  // 256-bit types
  def : Pat<(v8f32 (X86vzmovl (insert_subvector undef,
                   (v4f32 (scalar_to_vector (loadf32 addr:$src))), (iPTR 0)))),
            (SUBREG_TO_REG (i32 0), (VMOVSSrm addr:$src), sub_xmm)>;
  def : Pat<(v8f32 (X86vzload addr:$src)),
            (SUBREG_TO_REG (i32 0), (VMOVSSrm addr:$src), sub_xmm)>;
  def : Pat<(v4f64 (X86vzmovl (insert_subvector undef,
                   (v2f64 (scalar_to_vector (loadf64 addr:$src))), (iPTR 0)))),
            (SUBREG_TO_REG (i32 0), (VMOVSDrm addr:$src), sub_xmm)>;
  def : Pat<(v4f64 (X86vzload addr:$src)),
            (SUBREG_TO_REG (i32 0), (VMOVSDrm addr:$src), sub_xmm)>;

  // Extract and store.
  def : Pat<(store (f32 (extractelt (v4f32 VR128:$src), (iPTR 0))),
                   addr:$dst),
            (VMOVSSmr addr:$dst, (COPY_TO_REGCLASS (v4f32 VR128:$src), FR32))>;
}

let Predicates = [UseAVX, OptForSize] in {
  // Move scalar to XMM zero-extended, zeroing a VR128 then do a
  // MOVSS to the lower bits.
  def : Pat<(v4f32 (X86vzmovl (v4f32 VR128:$src))),
            (VMOVSSrr (v4f32 (V_SET0)), VR128:$src)>;
  def : Pat<(v4i32 (X86vzmovl (v4i32 VR128:$src))),
            (VMOVSSrr (v4i32 (V_SET0)), VR128:$src)>;

  // Move low f32 and clear high bits.
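  // The VEX-encoded 128-bit VMOVSSrr/VMOVSDrr also zero bits 255:128 of the
  // destination, so wrapping the result in SUBREG_TO_REG with sub_xmm is
  // enough to model the cleared upper lanes of the 256-bit value.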
  def : Pat<(v8f32 (X86vzmovl (v8f32 VR256:$src))),
            (SUBREG_TO_REG (i32 0),
             (v4f32 (VMOVSSrr (v4f32 (V_SET0)),
              (v4f32 (EXTRACT_SUBREG (v8f32 VR256:$src), sub_xmm)))), sub_xmm)>;
  def : Pat<(v8i32 (X86vzmovl (v8i32 VR256:$src))),
            (SUBREG_TO_REG (i32 0),
             (v4i32 (VMOVSSrr (v4i32 (V_SET0)),
              (v4i32 (EXTRACT_SUBREG (v8i32 VR256:$src), sub_xmm)))), sub_xmm)>;

  def : Pat<(v4f64 (X86vzmovl (v4f64 VR256:$src))),
            (SUBREG_TO_REG (i32 0),
             (v2f64 (VMOVSDrr (v2f64 (V_SET0)),
              (v2f64 (EXTRACT_SUBREG (v4f64 VR256:$src), sub_xmm)))),
             sub_xmm)>;
  def : Pat<(v4i64 (X86vzmovl (v4i64 VR256:$src))),
            (SUBREG_TO_REG (i32 0),
             (v2i64 (VMOVSDrr (v2i64 (V_SET0)),
              (v2i64 (EXTRACT_SUBREG (v4i64 VR256:$src), sub_xmm)))),
             sub_xmm)>;
}

let Predicates = [UseSSE1] in {
  let Predicates = [UseSSE1, NoSSE41_Or_OptForSize] in {
  // Move scalar to XMM zero-extended, zeroing a VR128 then do a
  // MOVSS to the lower bits.
  def : Pat<(v4f32 (X86vzmovl (v4f32 VR128:$src))),
            (MOVSSrr (v4f32 (V_SET0)), VR128:$src)>;
  def : Pat<(v4i32 (X86vzmovl (v4i32 VR128:$src))),
            (MOVSSrr (v4i32 (V_SET0)), VR128:$src)>;
  }

  // MOVSSrm already zeros the high parts of the register.
  def : Pat<(v4f32 (X86vzmovl (v4f32 (scalar_to_vector (loadf32 addr:$src))))),
            (COPY_TO_REGCLASS (MOVSSrm addr:$src), VR128)>;
  def : Pat<(v4f32 (X86vzmovl (loadv4f32 addr:$src))),
            (COPY_TO_REGCLASS (MOVSSrm addr:$src), VR128)>;
  def : Pat<(v4f32 (X86vzload addr:$src)),
            (COPY_TO_REGCLASS (MOVSSrm addr:$src), VR128)>;

  // Extract and store.
  def : Pat<(store (f32 (extractelt (v4f32 VR128:$src), (iPTR 0))),
                   addr:$dst),
            (MOVSSmr addr:$dst, (COPY_TO_REGCLASS VR128:$src, FR32))>;
}

let Predicates = [UseSSE2] in {
  // MOVSDrm already zeros the high parts of the register.
  def : Pat<(v2f64 (X86vzmovl (v2f64 (scalar_to_vector (loadf64 addr:$src))))),
            (COPY_TO_REGCLASS (MOVSDrm addr:$src), VR128)>;
  def : Pat<(v2f64 (X86vzmovl (loadv2f64 addr:$src))),
            (COPY_TO_REGCLASS (MOVSDrm addr:$src), VR128)>;
  def : Pat<(v2f64 (X86vzmovl (bc_v2f64 (loadv4f32 addr:$src)))),
            (COPY_TO_REGCLASS (MOVSDrm addr:$src), VR128)>;
  def : Pat<(v2f64 (X86vzload addr:$src)),
            (COPY_TO_REGCLASS (MOVSDrm addr:$src), VR128)>;
}

// Aliases to help the assembler pick two byte VEX encodings by swapping the
// operands relative to the normal instructions to use VEX.R instead of VEX.B.
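// The 2-byte VEX prefix can only express VEX.R, so these aliases keep the
// destination in ModRM.r/m (VR128L, xmm0-xmm7) and put the high register
// (VR128H, xmm8-xmm15) in ModRM.reg, where VEX.R can reach it.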
def : InstAlias<"vmovss\t{$src2, $src1, $dst|$dst, $src1, $src2}",
                (VMOVSSrr_REV VR128L:$dst, VR128:$src1, VR128H:$src2), 0>;
def : InstAlias<"vmovsd\t{$src2, $src1, $dst|$dst, $src1, $src2}",
                (VMOVSDrr_REV VR128L:$dst, VR128:$src1, VR128H:$src2), 0>;

//===----------------------------------------------------------------------===//
// SSE 1 & 2 - Move Aligned/Unaligned FP Instructions
//===----------------------------------------------------------------------===//

multiclass sse12_mov_packed<bits<8> opc, RegisterClass RC,
                            X86MemOperand x86memop, PatFrag ld_frag,
                            string asm, Domain d,
                            X86SchedWriteMoveLS sched> {
let hasSideEffects = 0, isMoveReg = 1 in
  def rr : PI<opc, MRMSrcReg, (outs RC:$dst), (ins RC:$src),
              !strconcat(asm, "\t{$src, $dst|$dst, $src}"), [], d>,
              Sched<[sched.RR]>;
let canFoldAsLoad = 1, isReMaterializable = 1 in
  def rm : PI<opc, MRMSrcMem, (outs RC:$dst), (ins x86memop:$src),
              !strconcat(asm, "\t{$src, $dst|$dst, $src}"),
              [(set RC:$dst, (ld_frag addr:$src))], d>,
              Sched<[sched.RM]>;
}

let Predicates = [HasAVX, NoVLX] in {
defm VMOVAPS : sse12_mov_packed<0x28, VR128, f128mem, alignedloadv4f32, "movaps",
                                SSEPackedSingle, SchedWriteFMoveLS.XMM>,
                                PS, VEX, VEX_WIG;
defm VMOVAPD : sse12_mov_packed<0x28, VR128, f128mem, alignedloadv2f64, "movapd",
                                SSEPackedDouble, SchedWriteFMoveLS.XMM>,
                                PD, VEX, VEX_WIG;
defm VMOVUPS : sse12_mov_packed<0x10, VR128, f128mem, loadv4f32, "movups",
                                SSEPackedSingle, SchedWriteFMoveLS.XMM>,
                                PS, VEX, VEX_WIG;
defm VMOVUPD : sse12_mov_packed<0x10, VR128, f128mem, loadv2f64, "movupd",
                                SSEPackedDouble, SchedWriteFMoveLS.XMM>,
                                PD, VEX, VEX_WIG;

defm VMOVAPSY : sse12_mov_packed<0x28, VR256, f256mem, alignedloadv8f32, "movaps",
                                 SSEPackedSingle, SchedWriteFMoveLS.YMM>,
                                 PS, VEX, VEX_L, VEX_WIG;
defm VMOVAPDY : sse12_mov_packed<0x28, VR256, f256mem, alignedloadv4f64, "movapd",
                                 SSEPackedDouble, SchedWriteFMoveLS.YMM>,
                                 PD, VEX, VEX_L, VEX_WIG;
defm VMOVUPSY : sse12_mov_packed<0x10, VR256, f256mem, loadv8f32, "movups",
                                 SSEPackedSingle, SchedWriteFMoveLS.YMM>,
                                 PS, VEX, VEX_L, VEX_WIG;
defm VMOVUPDY : sse12_mov_packed<0x10, VR256, f256mem, loadv4f64, "movupd",
                                 SSEPackedDouble, SchedWriteFMoveLS.YMM>,
                                 PD, VEX, VEX_L, VEX_WIG;
}

let Predicates = [UseSSE1] in {
defm MOVAPS : sse12_mov_packed<0x28, VR128, f128mem, alignedloadv4f32, "movaps",
                               SSEPackedSingle, SchedWriteFMoveLS.XMM>,
                               PS;
defm MOVUPS : sse12_mov_packed<0x10, VR128, f128mem, loadv4f32, "movups",
                               SSEPackedSingle, SchedWriteFMoveLS.XMM>,
                               PS;
}
let Predicates = [UseSSE2] in {
defm MOVAPD : sse12_mov_packed<0x28, VR128, f128mem, alignedloadv2f64, "movapd",
                               SSEPackedDouble, SchedWriteFMoveLS.XMM>,
                               PD;
defm MOVUPD : sse12_mov_packed<0x10, VR128, f128mem, loadv2f64, "movupd",
                               SSEPackedDouble, SchedWriteFMoveLS.XMM>,
                               PD;
}

let Predicates = [HasAVX, NoVLX] in {
let SchedRW = [SchedWriteFMoveLS.XMM.MR] in {
def VMOVAPSmr : VPSI<0x29, MRMDestMem, (outs), (ins f128mem:$dst, VR128:$src),
                     "movaps\t{$src, $dst|$dst, $src}",
                     [(alignedstore (v4f32 VR128:$src), addr:$dst)]>,
                     VEX, VEX_WIG;
def VMOVAPDmr : VPDI<0x29, MRMDestMem, (outs), (ins f128mem:$dst, VR128:$src),
                     "movapd\t{$src, $dst|$dst, $src}",
                     [(alignedstore (v2f64 VR128:$src), addr:$dst)]>,
                     VEX, VEX_WIG;
def VMOVUPSmr : VPSI<0x11, MRMDestMem, (outs), (ins f128mem:$dst, VR128:$src),
"movups\t{$src, $dst|$dst, $src}", 441 [(store (v4f32 VR128:$src), addr:$dst)]>, 442 VEX, VEX_WIG; 443def VMOVUPDmr : VPDI<0x11, MRMDestMem, (outs), (ins f128mem:$dst, VR128:$src), 444 "movupd\t{$src, $dst|$dst, $src}", 445 [(store (v2f64 VR128:$src), addr:$dst)]>, 446 VEX, VEX_WIG; 447} // SchedRW 448 449let SchedRW = [SchedWriteFMoveLS.YMM.MR] in { 450def VMOVAPSYmr : VPSI<0x29, MRMDestMem, (outs), (ins f256mem:$dst, VR256:$src), 451 "movaps\t{$src, $dst|$dst, $src}", 452 [(alignedstore (v8f32 VR256:$src), addr:$dst)]>, 453 VEX, VEX_L, VEX_WIG; 454def VMOVAPDYmr : VPDI<0x29, MRMDestMem, (outs), (ins f256mem:$dst, VR256:$src), 455 "movapd\t{$src, $dst|$dst, $src}", 456 [(alignedstore (v4f64 VR256:$src), addr:$dst)]>, 457 VEX, VEX_L, VEX_WIG; 458def VMOVUPSYmr : VPSI<0x11, MRMDestMem, (outs), (ins f256mem:$dst, VR256:$src), 459 "movups\t{$src, $dst|$dst, $src}", 460 [(store (v8f32 VR256:$src), addr:$dst)]>, 461 VEX, VEX_L, VEX_WIG; 462def VMOVUPDYmr : VPDI<0x11, MRMDestMem, (outs), (ins f256mem:$dst, VR256:$src), 463 "movupd\t{$src, $dst|$dst, $src}", 464 [(store (v4f64 VR256:$src), addr:$dst)]>, 465 VEX, VEX_L, VEX_WIG; 466} // SchedRW 467} // Predicate 468 469// For disassembler 470let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0, 471 isMoveReg = 1 in { 472let SchedRW = [SchedWriteFMoveLS.XMM.RR] in { 473 def VMOVAPSrr_REV : VPSI<0x29, MRMDestReg, (outs VR128:$dst), 474 (ins VR128:$src), 475 "movaps\t{$src, $dst|$dst, $src}", []>, 476 VEX, VEX_WIG, FoldGenData<"VMOVAPSrr">; 477 def VMOVAPDrr_REV : VPDI<0x29, MRMDestReg, (outs VR128:$dst), 478 (ins VR128:$src), 479 "movapd\t{$src, $dst|$dst, $src}", []>, 480 VEX, VEX_WIG, FoldGenData<"VMOVAPDrr">; 481 def VMOVUPSrr_REV : VPSI<0x11, MRMDestReg, (outs VR128:$dst), 482 (ins VR128:$src), 483 "movups\t{$src, $dst|$dst, $src}", []>, 484 VEX, VEX_WIG, FoldGenData<"VMOVUPSrr">; 485 def VMOVUPDrr_REV : VPDI<0x11, MRMDestReg, (outs VR128:$dst), 486 (ins VR128:$src), 487 "movupd\t{$src, $dst|$dst, $src}", []>, 488 VEX, VEX_WIG, FoldGenData<"VMOVUPDrr">; 489} // SchedRW 490 491let SchedRW = [SchedWriteFMoveLS.YMM.RR] in { 492 def VMOVAPSYrr_REV : VPSI<0x29, MRMDestReg, (outs VR256:$dst), 493 (ins VR256:$src), 494 "movaps\t{$src, $dst|$dst, $src}", []>, 495 VEX, VEX_L, VEX_WIG, FoldGenData<"VMOVAPSYrr">; 496 def VMOVAPDYrr_REV : VPDI<0x29, MRMDestReg, (outs VR256:$dst), 497 (ins VR256:$src), 498 "movapd\t{$src, $dst|$dst, $src}", []>, 499 VEX, VEX_L, VEX_WIG, FoldGenData<"VMOVAPDYrr">; 500 def VMOVUPSYrr_REV : VPSI<0x11, MRMDestReg, (outs VR256:$dst), 501 (ins VR256:$src), 502 "movups\t{$src, $dst|$dst, $src}", []>, 503 VEX, VEX_L, VEX_WIG, FoldGenData<"VMOVUPSYrr">; 504 def VMOVUPDYrr_REV : VPDI<0x11, MRMDestReg, (outs VR256:$dst), 505 (ins VR256:$src), 506 "movupd\t{$src, $dst|$dst, $src}", []>, 507 VEX, VEX_L, VEX_WIG, FoldGenData<"VMOVUPDYrr">; 508} // SchedRW 509} // Predicate 510 511// Aliases to help the assembler pick two byte VEX encodings by swapping the 512// operands relative to the normal instructions to use VEX.R instead of VEX.B. 
def : InstAlias<"vmovaps\t{$src, $dst|$dst, $src}",
                (VMOVAPSrr_REV VR128L:$dst, VR128H:$src), 0>;
def : InstAlias<"vmovapd\t{$src, $dst|$dst, $src}",
                (VMOVAPDrr_REV VR128L:$dst, VR128H:$src), 0>;
def : InstAlias<"vmovups\t{$src, $dst|$dst, $src}",
                (VMOVUPSrr_REV VR128L:$dst, VR128H:$src), 0>;
def : InstAlias<"vmovupd\t{$src, $dst|$dst, $src}",
                (VMOVUPDrr_REV VR128L:$dst, VR128H:$src), 0>;
def : InstAlias<"vmovaps\t{$src, $dst|$dst, $src}",
                (VMOVAPSYrr_REV VR256L:$dst, VR256H:$src), 0>;
def : InstAlias<"vmovapd\t{$src, $dst|$dst, $src}",
                (VMOVAPDYrr_REV VR256L:$dst, VR256H:$src), 0>;
def : InstAlias<"vmovups\t{$src, $dst|$dst, $src}",
                (VMOVUPSYrr_REV VR256L:$dst, VR256H:$src), 0>;
def : InstAlias<"vmovupd\t{$src, $dst|$dst, $src}",
                (VMOVUPDYrr_REV VR256L:$dst, VR256H:$src), 0>;

// Reversed version with ".s" suffix for GAS compatibility.
def : InstAlias<"vmovaps.s\t{$src, $dst|$dst, $src}",
                (VMOVAPSrr_REV VR128:$dst, VR128:$src), 0>;
def : InstAlias<"vmovapd.s\t{$src, $dst|$dst, $src}",
                (VMOVAPDrr_REV VR128:$dst, VR128:$src), 0>;
def : InstAlias<"vmovups.s\t{$src, $dst|$dst, $src}",
                (VMOVUPSrr_REV VR128:$dst, VR128:$src), 0>;
def : InstAlias<"vmovupd.s\t{$src, $dst|$dst, $src}",
                (VMOVUPDrr_REV VR128:$dst, VR128:$src), 0>;
def : InstAlias<"vmovaps.s\t{$src, $dst|$dst, $src}",
                (VMOVAPSYrr_REV VR256:$dst, VR256:$src), 0>;
def : InstAlias<"vmovapd.s\t{$src, $dst|$dst, $src}",
                (VMOVAPDYrr_REV VR256:$dst, VR256:$src), 0>;
def : InstAlias<"vmovups.s\t{$src, $dst|$dst, $src}",
                (VMOVUPSYrr_REV VR256:$dst, VR256:$src), 0>;
def : InstAlias<"vmovupd.s\t{$src, $dst|$dst, $src}",
                (VMOVUPDYrr_REV VR256:$dst, VR256:$src), 0>;

let SchedRW = [SchedWriteFMoveLS.XMM.MR] in {
def MOVAPSmr : PSI<0x29, MRMDestMem, (outs), (ins f128mem:$dst, VR128:$src),
                   "movaps\t{$src, $dst|$dst, $src}",
                   [(alignedstore (v4f32 VR128:$src), addr:$dst)]>;
def MOVAPDmr : PDI<0x29, MRMDestMem, (outs), (ins f128mem:$dst, VR128:$src),
                   "movapd\t{$src, $dst|$dst, $src}",
                   [(alignedstore (v2f64 VR128:$src), addr:$dst)]>;
def MOVUPSmr : PSI<0x11, MRMDestMem, (outs), (ins f128mem:$dst, VR128:$src),
                   "movups\t{$src, $dst|$dst, $src}",
                   [(store (v4f32 VR128:$src), addr:$dst)]>;
def MOVUPDmr : PDI<0x11, MRMDestMem, (outs), (ins f128mem:$dst, VR128:$src),
                   "movupd\t{$src, $dst|$dst, $src}",
                   [(store (v2f64 VR128:$src), addr:$dst)]>;
} // SchedRW

// For disassembler
let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0,
    isMoveReg = 1, SchedRW = [SchedWriteFMoveLS.XMM.RR] in {
  def MOVAPSrr_REV : PSI<0x29, MRMDestReg, (outs VR128:$dst), (ins VR128:$src),
                         "movaps\t{$src, $dst|$dst, $src}", []>,
                         FoldGenData<"MOVAPSrr">;
  def MOVAPDrr_REV : PDI<0x29, MRMDestReg, (outs VR128:$dst), (ins VR128:$src),
                         "movapd\t{$src, $dst|$dst, $src}", []>,
                         FoldGenData<"MOVAPDrr">;
  def MOVUPSrr_REV : PSI<0x11, MRMDestReg, (outs VR128:$dst), (ins VR128:$src),
                         "movups\t{$src, $dst|$dst, $src}", []>,
                         FoldGenData<"MOVUPSrr">;
  def MOVUPDrr_REV : PDI<0x11, MRMDestReg, (outs VR128:$dst), (ins VR128:$src),
                         "movupd\t{$src, $dst|$dst, $src}", []>,
                         FoldGenData<"MOVUPDrr">;
}

// Reversed version with ".s" suffix for GAS compatibility.
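// For example, "movaps.s %xmm1, %xmm2" requests the store-form (0x29)
// encoding of the register-to-register move instead of the default 0x28 form.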
def : InstAlias<"movaps.s\t{$src, $dst|$dst, $src}",
                (MOVAPSrr_REV VR128:$dst, VR128:$src), 0>;
def : InstAlias<"movapd.s\t{$src, $dst|$dst, $src}",
                (MOVAPDrr_REV VR128:$dst, VR128:$src), 0>;
def : InstAlias<"movups.s\t{$src, $dst|$dst, $src}",
                (MOVUPSrr_REV VR128:$dst, VR128:$src), 0>;
def : InstAlias<"movupd.s\t{$src, $dst|$dst, $src}",
                (MOVUPDrr_REV VR128:$dst, VR128:$src), 0>;

let Predicates = [HasAVX, NoVLX] in {
  // 256-bit loads/stores need to use floating-point load/store in case we
  // don't have AVX2. Execution domain fixing will convert them to integer if
  // AVX2 is available and changing the domain is beneficial.
  def : Pat<(alignedloadv4i64 addr:$src),
            (VMOVAPSYrm addr:$src)>;
  def : Pat<(loadv4i64 addr:$src),
            (VMOVUPSYrm addr:$src)>;
  def : Pat<(alignedstore (v4i64 VR256:$src), addr:$dst),
            (VMOVAPSYmr addr:$dst, VR256:$src)>;
  def : Pat<(alignedstore (v8i32 VR256:$src), addr:$dst),
            (VMOVAPSYmr addr:$dst, VR256:$src)>;
  def : Pat<(alignedstore (v16i16 VR256:$src), addr:$dst),
            (VMOVAPSYmr addr:$dst, VR256:$src)>;
  def : Pat<(alignedstore (v32i8 VR256:$src), addr:$dst),
            (VMOVAPSYmr addr:$dst, VR256:$src)>;
  def : Pat<(store (v4i64 VR256:$src), addr:$dst),
            (VMOVUPSYmr addr:$dst, VR256:$src)>;
  def : Pat<(store (v8i32 VR256:$src), addr:$dst),
            (VMOVUPSYmr addr:$dst, VR256:$src)>;
  def : Pat<(store (v16i16 VR256:$src), addr:$dst),
            (VMOVUPSYmr addr:$dst, VR256:$src)>;
  def : Pat<(store (v32i8 VR256:$src), addr:$dst),
            (VMOVUPSYmr addr:$dst, VR256:$src)>;
}

// Use movaps / movups for SSE integer load / store (one byte shorter).
// The instructions selected below are then converted to MOVDQA/MOVDQU
// during the SSE domain pass.
let Predicates = [UseSSE1] in {
  def : Pat<(alignedloadv2i64 addr:$src),
            (MOVAPSrm addr:$src)>;
  def : Pat<(loadv2i64 addr:$src),
            (MOVUPSrm addr:$src)>;

  def : Pat<(alignedstore (v2i64 VR128:$src), addr:$dst),
            (MOVAPSmr addr:$dst, VR128:$src)>;
  def : Pat<(alignedstore (v4i32 VR128:$src), addr:$dst),
            (MOVAPSmr addr:$dst, VR128:$src)>;
  def : Pat<(alignedstore (v8i16 VR128:$src), addr:$dst),
            (MOVAPSmr addr:$dst, VR128:$src)>;
  def : Pat<(alignedstore (v16i8 VR128:$src), addr:$dst),
            (MOVAPSmr addr:$dst, VR128:$src)>;
  def : Pat<(store (v2i64 VR128:$src), addr:$dst),
            (MOVUPSmr addr:$dst, VR128:$src)>;
  def : Pat<(store (v4i32 VR128:$src), addr:$dst),
            (MOVUPSmr addr:$dst, VR128:$src)>;
  def : Pat<(store (v8i16 VR128:$src), addr:$dst),
            (MOVUPSmr addr:$dst, VR128:$src)>;
  def : Pat<(store (v16i8 VR128:$src), addr:$dst),
            (MOVUPSmr addr:$dst, VR128:$src)>;
}

//===----------------------------------------------------------------------===//
// SSE 1 & 2 - Move Low packed FP Instructions
//===----------------------------------------------------------------------===//

multiclass sse12_mov_hilo_packed_base<bits<8> opc, SDNode pdnode,
                                      string base_opc, string asm_opr> {
  // No pattern as they need to be special-cased between high and low.
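  // In the multiclass below, the "s" variant becomes the packed-single
  // MOVLPS/MOVHPS form and the "d" variant the packed-double MOVLPD/MOVHPD
  // form, selected by the base_opc string ("movlp" or "movhp") passed in
  // from sse12_mov_hilo_packed.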
  let hasSideEffects = 0, mayLoad = 1 in
  def PSrm : PI<opc, MRMSrcMem,
                (outs VR128:$dst), (ins VR128:$src1, f64mem:$src2),
                !strconcat(base_opc, "s", asm_opr),
                [], SSEPackedSingle>, PS,
                Sched<[SchedWriteFShuffle.XMM.Folded, ReadAfterLd]>;

  def PDrm : PI<opc, MRMSrcMem,
                (outs VR128:$dst), (ins VR128:$src1, f64mem:$src2),
                !strconcat(base_opc, "d", asm_opr),
                [(set VR128:$dst, (v2f64 (pdnode VR128:$src1,
                                   (scalar_to_vector (loadf64 addr:$src2)))))],
                SSEPackedDouble>, PD,
                Sched<[SchedWriteFShuffle.XMM.Folded, ReadAfterLd]>;
}

multiclass sse12_mov_hilo_packed<bits<8> opc, SDPatternOperator pdnode,
                                 string base_opc> {
  let Predicates = [UseAVX] in
  defm V#NAME : sse12_mov_hilo_packed_base<opc, pdnode, base_opc,
                    "\t{$src2, $src1, $dst|$dst, $src1, $src2}">,
                    VEX_4V, VEX_WIG;

  let Constraints = "$src1 = $dst" in
  defm NAME : sse12_mov_hilo_packed_base<opc, pdnode, base_opc,
                    "\t{$src2, $dst|$dst, $src2}">;
}

defm MOVL : sse12_mov_hilo_packed<0x12, X86Movsd, "movlp">;

let SchedRW = [WriteFStore] in {
let Predicates = [UseAVX] in {
def VMOVLPSmr : VPSI<0x13, MRMDestMem, (outs), (ins f64mem:$dst, VR128:$src),
                     "movlps\t{$src, $dst|$dst, $src}",
                     [(store (f64 (extractelt (bc_v2f64 (v4f32 VR128:$src)),
                                   (iPTR 0))), addr:$dst)]>,
                     VEX, VEX_WIG;
def VMOVLPDmr : VPDI<0x13, MRMDestMem, (outs), (ins f64mem:$dst, VR128:$src),
                     "movlpd\t{$src, $dst|$dst, $src}",
                     [(store (f64 (extractelt (v2f64 VR128:$src),
                                   (iPTR 0))), addr:$dst)]>,
                     VEX, VEX_WIG;
}// UseAVX
def MOVLPSmr : PSI<0x13, MRMDestMem, (outs), (ins f64mem:$dst, VR128:$src),
                   "movlps\t{$src, $dst|$dst, $src}",
                   [(store (f64 (extractelt (bc_v2f64 (v4f32 VR128:$src)),
                                 (iPTR 0))), addr:$dst)]>;
def MOVLPDmr : PDI<0x13, MRMDestMem, (outs), (ins f64mem:$dst, VR128:$src),
                   "movlpd\t{$src, $dst|$dst, $src}",
                   [(store (f64 (extractelt (v2f64 VR128:$src),
                                 (iPTR 0))), addr:$dst)]>;
} // SchedRW

let Predicates = [UseSSE1] in {
  // (store (vector_shuffle (load addr), v2, <4, 5, 2, 3>), addr) using MOVLPS
  def : Pat<(store (i64 (extractelt (bc_v2i64 (v4f32 VR128:$src2)),
                         (iPTR 0))), addr:$src1),
            (MOVLPSmr addr:$src1, VR128:$src2)>;

  // This pattern helps select MOVLPS on SSE1-only targets. With SSE2 we'll
  // end up with a movsd or blend instead of shufp.
  // No need for an aligned load, we're only loading 64 bits.
  def : Pat<(X86Shufp (loadv4f32 addr:$src2), VR128:$src1, (i8 -28)),
            (MOVLPSrm VR128:$src1, addr:$src2)>;
}

//===----------------------------------------------------------------------===//
// SSE 1 & 2 - Move Hi packed FP Instructions
//===----------------------------------------------------------------------===//

defm MOVH : sse12_mov_hilo_packed<0x16, X86Unpckl, "movhp">;

let SchedRW = [WriteFStore] in {
// v2f64 extract element 1 is always custom lowered to unpack high to low
// and extract element 0, so the non-store version isn't too horrible.
let Predicates = [UseAVX] in {
def VMOVHPSmr : VPSI<0x17, MRMDestMem, (outs), (ins f64mem:$dst, VR128:$src),
                     "movhps\t{$src, $dst|$dst, $src}",
                     [(store (f64 (extractelt
                                   (X86Unpckh (bc_v2f64 (v4f32 VR128:$src)),
                                              (bc_v2f64 (v4f32 VR128:$src))),
                                   (iPTR 0))), addr:$dst)]>, VEX, VEX_WIG;
def VMOVHPDmr : VPDI<0x17, MRMDestMem, (outs), (ins f64mem:$dst, VR128:$src),
                     "movhpd\t{$src, $dst|$dst, $src}",
                     [(store (f64 (extractelt
                                   (v2f64 (X86Unpckh VR128:$src, VR128:$src)),
                                   (iPTR 0))), addr:$dst)]>, VEX, VEX_WIG;
} // UseAVX
def MOVHPSmr : PSI<0x17, MRMDestMem, (outs), (ins f64mem:$dst, VR128:$src),
                   "movhps\t{$src, $dst|$dst, $src}",
                   [(store (f64 (extractelt
                                 (X86Unpckh (bc_v2f64 (v4f32 VR128:$src)),
                                            (bc_v2f64 (v4f32 VR128:$src))),
                                 (iPTR 0))), addr:$dst)]>;
def MOVHPDmr : PDI<0x17, MRMDestMem, (outs), (ins f64mem:$dst, VR128:$src),
                   "movhpd\t{$src, $dst|$dst, $src}",
                   [(store (f64 (extractelt
                                 (v2f64 (X86Unpckh VR128:$src, VR128:$src)),
                                 (iPTR 0))), addr:$dst)]>;
} // SchedRW

let Predicates = [UseAVX] in {
  // Also handle an i64 load because that may get selected as a faster way to
  // load the data.
  def : Pat<(v2f64 (X86Unpckl VR128:$src1,
                    (bc_v2f64 (v2i64 (scalar_to_vector (loadi64 addr:$src2)))))),
            (VMOVHPDrm VR128:$src1, addr:$src2)>;

  def : Pat<(store (f64 (extractelt
                         (v2f64 (X86VPermilpi VR128:$src, (i8 1))),
                         (iPTR 0))), addr:$dst),
            (VMOVHPDmr addr:$dst, VR128:$src)>;
}

let Predicates = [UseSSE1] in {
  // This pattern helps select MOVHPS on SSE1-only targets. With SSE2 we'll
  // end up with a movsd or blend instead of shufp.
  // No need for an aligned load, we're only loading 64 bits.
  def : Pat<(X86Movlhps VR128:$src1, (loadv4f32 addr:$src2)),
            (MOVHPSrm VR128:$src1, addr:$src2)>;
}

let Predicates = [UseSSE2] in {
  // MOVHPD patterns

  // Also handle an i64 load because that may get selected as a faster way to
  // load the data.
  def : Pat<(v2f64 (X86Unpckl VR128:$src1,
                    (bc_v2f64 (v2i64 (scalar_to_vector (loadi64 addr:$src2)))))),
            (MOVHPDrm VR128:$src1, addr:$src2)>;

  def : Pat<(store (f64 (extractelt
                         (v2f64 (X86Shufp VR128:$src, VR128:$src, (i8 1))),
                         (iPTR 0))), addr:$dst),
            (MOVHPDmr addr:$dst, VR128:$src)>;
}

//===----------------------------------------------------------------------===//
// SSE 1 & 2 - Move Low to High and High to Low packed FP Instructions
//===----------------------------------------------------------------------===//

let Predicates = [UseAVX] in {
  def VMOVLHPSrr : VPSI<0x16, MRMSrcReg, (outs VR128:$dst),
                        (ins VR128:$src1, VR128:$src2),
                        "movlhps\t{$src2, $src1, $dst|$dst, $src1, $src2}",
                        [(set VR128:$dst,
                          (v4f32 (X86Movlhps VR128:$src1, VR128:$src2)))]>,
                        VEX_4V, Sched<[SchedWriteFShuffle.XMM]>, VEX_WIG;
  let isCommutable = 1 in
  def VMOVHLPSrr : VPSI<0x12, MRMSrcReg, (outs VR128:$dst),
                        (ins VR128:$src1, VR128:$src2),
                        "movhlps\t{$src2, $src1, $dst|$dst, $src1, $src2}",
                        [(set VR128:$dst,
                          (v4f32 (X86Movhlps VR128:$src1, VR128:$src2)))]>,
                        VEX_4V, Sched<[SchedWriteFShuffle.XMM]>, VEX_WIG,
                        NotMemoryFoldable;
}
let Constraints = "$src1 = $dst" in {
  def MOVLHPSrr : PSI<0x16, MRMSrcReg, (outs VR128:$dst),
                      (ins VR128:$src1, VR128:$src2),
                      "movlhps\t{$src2, $dst|$dst, $src2}",
                      [(set VR128:$dst,
                        (v4f32 (X86Movlhps VR128:$src1, VR128:$src2)))]>,
                      Sched<[SchedWriteFShuffle.XMM]>;
  let isCommutable = 1 in
  def MOVHLPSrr : PSI<0x12, MRMSrcReg, (outs VR128:$dst),
                      (ins VR128:$src1, VR128:$src2),
                      "movhlps\t{$src2, $dst|$dst, $src2}",
                      [(set VR128:$dst,
                        (v4f32 (X86Movhlps VR128:$src1, VR128:$src2)))]>,
                      Sched<[SchedWriteFShuffle.XMM]>, NotMemoryFoldable;
}

// TODO: This is largely to trick fastisel into ignoring the pattern.
def UnpckhUnary : PatFrag<(ops node:$src1, node:$src2),
                          (X86Unpckh node:$src1, node:$src2), [{
  return N->getOperand(0) == N->getOperand(1);
}]>;

let Predicates = [UseSSE2] in {
  // TODO: This is a hack pattern to allow lowering to emit unpckh instead of
  // movhlps for sse2 without changing a bunch of tests.
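  // UnpckhUnary only matches an X86Unpckh whose two operands are the same
  // node, i.e. exactly the self-unpack that movhlps performs.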
  def : Pat<(v2f64 (UnpckhUnary VR128:$src, VR128:$src)),
            (MOVHLPSrr VR128:$src, VR128:$src)>;
}

//===----------------------------------------------------------------------===//
// SSE 1 & 2 - Conversion Instructions
//===----------------------------------------------------------------------===//

multiclass sse12_cvt_s<bits<8> opc, RegisterClass SrcRC, RegisterClass DstRC,
                       SDNode OpNode, X86MemOperand x86memop, PatFrag ld_frag,
                       string asm, X86FoldableSchedWrite sched> {
  def rr : SI<opc, MRMSrcReg, (outs DstRC:$dst), (ins SrcRC:$src), asm,
              [(set DstRC:$dst, (OpNode SrcRC:$src))]>,
              Sched<[sched]>;
  def rm : SI<opc, MRMSrcMem, (outs DstRC:$dst), (ins x86memop:$src), asm,
              [(set DstRC:$dst, (OpNode (ld_frag addr:$src)))]>,
              Sched<[sched.Folded]>;
}

multiclass sse12_cvt_p<bits<8> opc, RegisterClass RC, X86MemOperand x86memop,
                       ValueType DstTy, ValueType SrcTy, PatFrag ld_frag,
                       string asm, Domain d, X86FoldableSchedWrite sched> {
let hasSideEffects = 0 in {
  def rr : I<opc, MRMSrcReg, (outs RC:$dst), (ins RC:$src), asm,
             [(set RC:$dst, (DstTy (sint_to_fp (SrcTy RC:$src))))], d>,
             Sched<[sched]>;
  let mayLoad = 1 in
  def rm : I<opc, MRMSrcMem, (outs RC:$dst), (ins x86memop:$src), asm,
             [(set RC:$dst, (DstTy (sint_to_fp
                             (SrcTy (bitconvert (ld_frag addr:$src))))))], d>,
             Sched<[sched.Folded]>;
}
}

multiclass sse12_vcvt_avx<bits<8> opc, RegisterClass SrcRC, RegisterClass DstRC,
                          X86MemOperand x86memop, string asm,
                          X86FoldableSchedWrite sched> {
let hasSideEffects = 0, Predicates = [UseAVX] in {
  def rr : SI<opc, MRMSrcReg, (outs DstRC:$dst), (ins DstRC:$src1, SrcRC:$src),
              !strconcat(asm,"\t{$src, $src1, $dst|$dst, $src1, $src}"), []>,
              Sched<[sched]>;
  let mayLoad = 1 in
  def rm : SI<opc, MRMSrcMem, (outs DstRC:$dst),
              (ins DstRC:$src1, x86memop:$src),
              !strconcat(asm,"\t{$src, $src1, $dst|$dst, $src1, $src}"), []>,
              Sched<[sched.Folded, ReadAfterLd]>;
} // hasSideEffects = 0
}

let Predicates = [UseAVX] in {
defm VCVTTSS2SI : sse12_cvt_s<0x2C, FR32, GR32, fp_to_sint, f32mem, loadf32,
                              "cvttss2si\t{$src, $dst|$dst, $src}",
                              WriteCvtSS2I>,
                              XS, VEX, VEX_LIG;
defm VCVTTSS2SI64 : sse12_cvt_s<0x2C, FR32, GR64, fp_to_sint, f32mem, loadf32,
                                "cvttss2si\t{$src, $dst|$dst, $src}",
                                WriteCvtSS2I>,
                                XS, VEX, VEX_W, VEX_LIG;
defm VCVTTSD2SI : sse12_cvt_s<0x2C, FR64, GR32, fp_to_sint, f64mem, loadf64,
                              "cvttsd2si\t{$src, $dst|$dst, $src}",
                              WriteCvtSD2I>,
                              XD, VEX, VEX_LIG;
defm VCVTTSD2SI64 : sse12_cvt_s<0x2C, FR64, GR64, fp_to_sint, f64mem, loadf64,
                                "cvttsd2si\t{$src, $dst|$dst, $src}",
                                WriteCvtSD2I>,
                                XD, VEX, VEX_W, VEX_LIG;

def : InstAlias<"vcvttss2si{l}\t{$src, $dst|$dst, $src}",
                (VCVTTSS2SIrr GR32:$dst, FR32:$src), 0, "att">;
def : InstAlias<"vcvttss2si{l}\t{$src, $dst|$dst, $src}",
                (VCVTTSS2SIrm GR32:$dst, f32mem:$src), 0, "att">;
def : InstAlias<"vcvttsd2si{l}\t{$src, $dst|$dst, $src}",
                (VCVTTSD2SIrr GR32:$dst, FR64:$src), 0, "att">;
def : InstAlias<"vcvttsd2si{l}\t{$src, $dst|$dst, $src}",
                (VCVTTSD2SIrm GR32:$dst, f64mem:$src), 0, "att">;
def : InstAlias<"vcvttss2si{q}\t{$src, $dst|$dst, $src}",
                (VCVTTSS2SI64rr GR64:$dst, FR32:$src), 0, "att">;
def : InstAlias<"vcvttss2si{q}\t{$src, $dst|$dst, $src}",
                (VCVTTSS2SI64rm GR64:$dst, f32mem:$src), 0, "att">;
def : InstAlias<"vcvttsd2si{q}\t{$src, $dst|$dst, $src}",
                (VCVTTSD2SI64rr GR64:$dst, FR64:$src), 0, "att">;
def : InstAlias<"vcvttsd2si{q}\t{$src, $dst|$dst, $src}",
                (VCVTTSD2SI64rm GR64:$dst, f64mem:$src), 0, "att">;
}
// The assembler can recognize rr 64-bit instructions by seeing an rxx
// register, but the same isn't true when only using memory operands, so we
// provide other assembly "l" and "q" forms to address this explicitly
// where appropriate to do so.
defm VCVTSI2SS : sse12_vcvt_avx<0x2A, GR32, FR32, i32mem, "cvtsi2ss{l}",
                                WriteCvtI2SS>, XS, VEX_4V, VEX_LIG;
defm VCVTSI642SS : sse12_vcvt_avx<0x2A, GR64, FR32, i64mem, "cvtsi2ss{q}",
                                  WriteCvtI2SS>, XS, VEX_4V, VEX_W, VEX_LIG;
defm VCVTSI2SD : sse12_vcvt_avx<0x2A, GR32, FR64, i32mem, "cvtsi2sd{l}",
                                WriteCvtI2SD>, XD, VEX_4V, VEX_LIG;
defm VCVTSI642SD : sse12_vcvt_avx<0x2A, GR64, FR64, i64mem, "cvtsi2sd{q}",
                                  WriteCvtI2SD>, XD, VEX_4V, VEX_W, VEX_LIG;

let Predicates = [UseAVX] in {
  def : InstAlias<"vcvtsi2ss\t{$src, $src1, $dst|$dst, $src1, $src}",
                  (VCVTSI2SSrm FR64:$dst, FR64:$src1, i32mem:$src), 0, "att">;
  def : InstAlias<"vcvtsi2sd\t{$src, $src1, $dst|$dst, $src1, $src}",
                  (VCVTSI2SDrm FR64:$dst, FR64:$src1, i32mem:$src), 0, "att">;

  def : Pat<(f32 (sint_to_fp (loadi32 addr:$src))),
            (VCVTSI2SSrm (f32 (IMPLICIT_DEF)), addr:$src)>;
  def : Pat<(f32 (sint_to_fp (loadi64 addr:$src))),
            (VCVTSI642SSrm (f32 (IMPLICIT_DEF)), addr:$src)>;
  def : Pat<(f64 (sint_to_fp (loadi32 addr:$src))),
            (VCVTSI2SDrm (f64 (IMPLICIT_DEF)), addr:$src)>;
  def : Pat<(f64 (sint_to_fp (loadi64 addr:$src))),
            (VCVTSI642SDrm (f64 (IMPLICIT_DEF)), addr:$src)>;

  def : Pat<(f32 (sint_to_fp GR32:$src)),
            (VCVTSI2SSrr (f32 (IMPLICIT_DEF)), GR32:$src)>;
  def : Pat<(f32 (sint_to_fp GR64:$src)),
            (VCVTSI642SSrr (f32 (IMPLICIT_DEF)), GR64:$src)>;
  def : Pat<(f64 (sint_to_fp GR32:$src)),
            (VCVTSI2SDrr (f64 (IMPLICIT_DEF)), GR32:$src)>;
  def : Pat<(f64 (sint_to_fp GR64:$src)),
            (VCVTSI642SDrr (f64 (IMPLICIT_DEF)), GR64:$src)>;
}

defm CVTTSS2SI : sse12_cvt_s<0x2C, FR32, GR32, fp_to_sint, f32mem, loadf32,
                             "cvttss2si\t{$src, $dst|$dst, $src}",
                             WriteCvtSS2I>, XS;
defm CVTTSS2SI64 : sse12_cvt_s<0x2C, FR32, GR64, fp_to_sint, f32mem, loadf32,
                               "cvttss2si\t{$src, $dst|$dst, $src}",
                               WriteCvtSS2I>, XS, REX_W;
defm CVTTSD2SI : sse12_cvt_s<0x2C, FR64, GR32, fp_to_sint, f64mem, loadf64,
                             "cvttsd2si\t{$src, $dst|$dst, $src}",
                             WriteCvtSD2I>, XD;
defm CVTTSD2SI64 : sse12_cvt_s<0x2C, FR64, GR64, fp_to_sint, f64mem, loadf64,
                               "cvttsd2si\t{$src, $dst|$dst, $src}",
                               WriteCvtSD2I>, XD, REX_W;
defm CVTSI2SS : sse12_cvt_s<0x2A, GR32, FR32, sint_to_fp, i32mem, loadi32,
                            "cvtsi2ss{l}\t{$src, $dst|$dst, $src}",
                            WriteCvtI2SS>, XS;
defm CVTSI642SS : sse12_cvt_s<0x2A, GR64, FR32, sint_to_fp, i64mem, loadi64,
                              "cvtsi2ss{q}\t{$src, $dst|$dst, $src}",
                              WriteCvtI2SS>, XS, REX_W;
defm CVTSI2SD : sse12_cvt_s<0x2A, GR32, FR64, sint_to_fp, i32mem, loadi32,
                            "cvtsi2sd{l}\t{$src, $dst|$dst, $src}",
                            WriteCvtI2SD>, XD;
defm CVTSI642SD : sse12_cvt_s<0x2A, GR64, FR64, sint_to_fp, i64mem, loadi64,
                              "cvtsi2sd{q}\t{$src, $dst|$dst, $src}",
                              WriteCvtI2SD>, XD, REX_W;

def : InstAlias<"cvttss2si{l}\t{$src, $dst|$dst, $src}",
                (CVTTSS2SIrr GR32:$dst, FR32:$src), 0, "att">;
def : InstAlias<"cvttss2si{l}\t{$src, $dst|$dst, $src}",
                (CVTTSS2SIrm GR32:$dst, f32mem:$src), 0, "att">;
def : InstAlias<"cvttsd2si{l}\t{$src, $dst|$dst, $src}",
                (CVTTSD2SIrr GR32:$dst, FR64:$src), 0, "att">;
"att">; 985def : InstAlias<"cvttsd2si{l}\t{$src, $dst|$dst, $src}", 986 (CVTTSD2SIrm GR32:$dst, f64mem:$src), 0, "att">; 987def : InstAlias<"cvttss2si{q}\t{$src, $dst|$dst, $src}", 988 (CVTTSS2SI64rr GR64:$dst, FR32:$src), 0, "att">; 989def : InstAlias<"cvttss2si{q}\t{$src, $dst|$dst, $src}", 990 (CVTTSS2SI64rm GR64:$dst, f32mem:$src), 0, "att">; 991def : InstAlias<"cvttsd2si{q}\t{$src, $dst|$dst, $src}", 992 (CVTTSD2SI64rr GR64:$dst, FR64:$src), 0, "att">; 993def : InstAlias<"cvttsd2si{q}\t{$src, $dst|$dst, $src}", 994 (CVTTSD2SI64rm GR64:$dst, f64mem:$src), 0, "att">; 995 996def : InstAlias<"cvtsi2ss\t{$src, $dst|$dst, $src}", 997 (CVTSI2SSrm FR64:$dst, i32mem:$src), 0, "att">; 998def : InstAlias<"cvtsi2sd\t{$src, $dst|$dst, $src}", 999 (CVTSI2SDrm FR64:$dst, i32mem:$src), 0, "att">; 1000 1001// Conversion Instructions Intrinsics - Match intrinsics which expect MM 1002// and/or XMM operand(s). 1003 1004// FIXME: We probably want to match the rm form only when optimizing for 1005// size, to avoid false depenendecies (see sse_fp_unop_s for details) 1006multiclass sse12_cvt_sint<bits<8> opc, RegisterClass SrcRC, RegisterClass DstRC, 1007 Intrinsic Int, Operand memop, ComplexPattern mem_cpat, 1008 string asm, X86FoldableSchedWrite sched> { 1009 def rr_Int : SI<opc, MRMSrcReg, (outs DstRC:$dst), (ins SrcRC:$src), 1010 !strconcat(asm, "\t{$src, $dst|$dst, $src}"), 1011 [(set DstRC:$dst, (Int SrcRC:$src))]>, 1012 Sched<[sched]>; 1013 def rm_Int : SI<opc, MRMSrcMem, (outs DstRC:$dst), (ins memop:$src), 1014 !strconcat(asm, "\t{$src, $dst|$dst, $src}"), 1015 [(set DstRC:$dst, (Int mem_cpat:$src))]>, 1016 Sched<[sched.Folded]>; 1017} 1018 1019multiclass sse12_cvt_sint_3addr<bits<8> opc, RegisterClass SrcRC, 1020 RegisterClass DstRC, X86MemOperand x86memop, 1021 string asm, X86FoldableSchedWrite sched, 1022 bit Is2Addr = 1> { 1023let hasSideEffects = 0 in { 1024 def rr_Int : SI<opc, MRMSrcReg, (outs DstRC:$dst), (ins DstRC:$src1, SrcRC:$src2), 1025 !if(Is2Addr, 1026 !strconcat(asm, "\t{$src2, $dst|$dst, $src2}"), 1027 !strconcat(asm, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")), 1028 []>, Sched<[sched]>; 1029 let mayLoad = 1 in 1030 def rm_Int : SI<opc, MRMSrcMem, (outs DstRC:$dst), 1031 (ins DstRC:$src1, x86memop:$src2), 1032 !if(Is2Addr, 1033 !strconcat(asm, "\t{$src2, $dst|$dst, $src2}"), 1034 !strconcat(asm, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")), 1035 []>, Sched<[sched.Folded, ReadAfterLd]>; 1036} 1037} 1038 1039let Predicates = [UseAVX] in { 1040defm VCVTSD2SI : sse12_cvt_sint<0x2D, VR128, GR32, 1041 int_x86_sse2_cvtsd2si, sdmem, sse_load_f64, "cvtsd2si", 1042 WriteCvtSD2I>, XD, VEX, VEX_LIG; 1043defm VCVTSD2SI64 : sse12_cvt_sint<0x2D, VR128, GR64, 1044 int_x86_sse2_cvtsd2si64, sdmem, sse_load_f64, "cvtsd2si", 1045 WriteCvtSD2I>, XD, VEX, VEX_W, VEX_LIG; 1046} 1047defm CVTSD2SI : sse12_cvt_sint<0x2D, VR128, GR32, int_x86_sse2_cvtsd2si, 1048 sdmem, sse_load_f64, "cvtsd2si", WriteCvtSD2I>, XD; 1049defm CVTSD2SI64 : sse12_cvt_sint<0x2D, VR128, GR64, int_x86_sse2_cvtsd2si64, 1050 sdmem, sse_load_f64, "cvtsd2si", WriteCvtSD2I>, XD, REX_W; 1051 1052 1053let isCodeGenOnly = 1 in { 1054 let Predicates = [UseAVX] in { 1055 defm VCVTSI2SS : sse12_cvt_sint_3addr<0x2A, GR32, VR128, 1056 i32mem, "cvtsi2ss{l}", WriteCvtI2SS, 0>, XS, VEX_4V; 1057 defm VCVTSI642SS : sse12_cvt_sint_3addr<0x2A, GR64, VR128, 1058 i64mem, "cvtsi2ss{q}", WriteCvtI2SS, 0>, XS, VEX_4V, VEX_W; 1059 defm VCVTSI2SD : sse12_cvt_sint_3addr<0x2A, GR32, VR128, 1060 i32mem, "cvtsi2sd{l}", WriteCvtI2SD, 0>, XD, VEX_4V; 1061 defm 
  defm VCVTSI642SD : sse12_cvt_sint_3addr<0x2A, GR64, VR128,
            i64mem, "cvtsi2sd{q}", WriteCvtI2SD, 0>, XD, VEX_4V, VEX_W;
  }
  let Constraints = "$src1 = $dst" in {
  defm CVTSI2SS : sse12_cvt_sint_3addr<0x2A, GR32, VR128,
            i32mem, "cvtsi2ss{l}", WriteCvtI2SS>, XS;
  defm CVTSI642SS : sse12_cvt_sint_3addr<0x2A, GR64, VR128,
            i64mem, "cvtsi2ss{q}", WriteCvtI2SS>, XS, REX_W;
  defm CVTSI2SD : sse12_cvt_sint_3addr<0x2A, GR32, VR128,
            i32mem, "cvtsi2sd{l}", WriteCvtI2SD>, XD;
  defm CVTSI642SD : sse12_cvt_sint_3addr<0x2A, GR64, VR128,
            i64mem, "cvtsi2sd{q}", WriteCvtI2SD>, XD, REX_W;
  }
} // isCodeGenOnly = 1

/// SSE 1 Only

// Aliases for intrinsics
let isCodeGenOnly = 1 in {
let Predicates = [UseAVX] in {
defm VCVTTSS2SI : sse12_cvt_sint<0x2C, VR128, GR32, int_x86_sse_cvttss2si,
                                 ssmem, sse_load_f32, "cvttss2si",
                                 WriteCvtSS2I>, XS, VEX;
defm VCVTTSS2SI64 : sse12_cvt_sint<0x2C, VR128, GR64,
                                   int_x86_sse_cvttss2si64, ssmem, sse_load_f32,
                                   "cvttss2si", WriteCvtSS2I>,
                                   XS, VEX, VEX_W;
defm VCVTTSD2SI : sse12_cvt_sint<0x2C, VR128, GR32, int_x86_sse2_cvttsd2si,
                                 sdmem, sse_load_f64, "cvttsd2si",
                                 WriteCvtSS2I>, XD, VEX;
defm VCVTTSD2SI64 : sse12_cvt_sint<0x2C, VR128, GR64,
                                   int_x86_sse2_cvttsd2si64, sdmem, sse_load_f64,
                                   "cvttsd2si", WriteCvtSS2I>,
                                   XD, VEX, VEX_W;
}
defm CVTTSS2SI : sse12_cvt_sint<0x2C, VR128, GR32, int_x86_sse_cvttss2si,
                                ssmem, sse_load_f32, "cvttss2si",
                                WriteCvtSS2I>, XS;
defm CVTTSS2SI64 : sse12_cvt_sint<0x2C, VR128, GR64,
                                  int_x86_sse_cvttss2si64, ssmem, sse_load_f32,
                                  "cvttss2si", WriteCvtSS2I>, XS, REX_W;
defm CVTTSD2SI : sse12_cvt_sint<0x2C, VR128, GR32, int_x86_sse2_cvttsd2si,
                                sdmem, sse_load_f64, "cvttsd2si",
                                WriteCvtSD2I>, XD;
defm CVTTSD2SI64 : sse12_cvt_sint<0x2C, VR128, GR64,
                                  int_x86_sse2_cvttsd2si64, sdmem, sse_load_f64,
                                  "cvttsd2si", WriteCvtSD2I>, XD, REX_W;
} // isCodeGenOnly = 1

let Predicates = [UseAVX] in {
defm VCVTSS2SI : sse12_cvt_sint<0x2D, VR128, GR32, int_x86_sse_cvtss2si,
                                ssmem, sse_load_f32, "cvtss2si",
                                WriteCvtSS2I>, XS, VEX, VEX_LIG;
defm VCVTSS2SI64 : sse12_cvt_sint<0x2D, VR128, GR64, int_x86_sse_cvtss2si64,
                                  ssmem, sse_load_f32, "cvtss2si",
                                  WriteCvtSS2I>, XS, VEX, VEX_W, VEX_LIG;
}
defm CVTSS2SI : sse12_cvt_sint<0x2D, VR128, GR32, int_x86_sse_cvtss2si,
                               ssmem, sse_load_f32, "cvtss2si",
                               WriteCvtSS2I>, XS;
defm CVTSS2SI64 : sse12_cvt_sint<0x2D, VR128, GR64, int_x86_sse_cvtss2si64,
                                 ssmem, sse_load_f32, "cvtss2si",
                                 WriteCvtSS2I>, XS, REX_W;

defm VCVTDQ2PS : sse12_cvt_p<0x5B, VR128, i128mem, v4f32, v4i32, loadv2i64,
                             "vcvtdq2ps\t{$src, $dst|$dst, $src}",
                             SSEPackedSingle, WriteCvtI2PS>,
                             PS, VEX, Requires<[HasAVX, NoVLX]>, VEX_WIG;
defm VCVTDQ2PSY : sse12_cvt_p<0x5B, VR256, i256mem, v8f32, v8i32, loadv4i64,
                              "vcvtdq2ps\t{$src, $dst|$dst, $src}",
                              SSEPackedSingle, WriteCvtI2PSY>,
                              PS, VEX, VEX_L, Requires<[HasAVX, NoVLX]>, VEX_WIG;

defm CVTDQ2PS : sse12_cvt_p<0x5B, VR128, i128mem, v4f32, v4i32, memopv2i64,
                            "cvtdq2ps\t{$src, $dst|$dst, $src}",
                            SSEPackedSingle, WriteCvtI2PS>,
                            PS, Requires<[UseSSE2]>;

let Predicates = [UseAVX] in {
def : InstAlias<"vcvtss2si{l}\t{$src, $dst|$dst, $src}",
                (VCVTSS2SIrr_Int GR32:$dst, VR128:$src), 0, "att">;
def : InstAlias<"vcvtss2si{l}\t{$src, $dst|$dst, $src}",
                (VCVTSS2SIrm_Int GR32:$dst, ssmem:$src), 0, "att">;
def : InstAlias<"vcvtsd2si{l}\t{$src, $dst|$dst, $src}",
                (VCVTSD2SIrr_Int GR32:$dst, VR128:$src), 0, "att">;
def : InstAlias<"vcvtsd2si{l}\t{$src, $dst|$dst, $src}",
                (VCVTSD2SIrm_Int GR32:$dst, sdmem:$src), 0, "att">;
def : InstAlias<"vcvtss2si{q}\t{$src, $dst|$dst, $src}",
                (VCVTSS2SI64rr_Int GR64:$dst, VR128:$src), 0, "att">;
def : InstAlias<"vcvtss2si{q}\t{$src, $dst|$dst, $src}",
                (VCVTSS2SI64rm_Int GR64:$dst, ssmem:$src), 0, "att">;
def : InstAlias<"vcvtsd2si{q}\t{$src, $dst|$dst, $src}",
                (VCVTSD2SI64rr_Int GR64:$dst, VR128:$src), 0, "att">;
def : InstAlias<"vcvtsd2si{q}\t{$src, $dst|$dst, $src}",
                (VCVTSD2SI64rm_Int GR64:$dst, sdmem:$src), 0, "att">;
}

def : InstAlias<"cvtss2si{l}\t{$src, $dst|$dst, $src}",
                (CVTSS2SIrr_Int GR32:$dst, VR128:$src), 0, "att">;
def : InstAlias<"cvtss2si{l}\t{$src, $dst|$dst, $src}",
                (CVTSS2SIrm_Int GR32:$dst, ssmem:$src), 0, "att">;
def : InstAlias<"cvtsd2si{l}\t{$src, $dst|$dst, $src}",
                (CVTSD2SIrr_Int GR32:$dst, VR128:$src), 0, "att">;
def : InstAlias<"cvtsd2si{l}\t{$src, $dst|$dst, $src}",
                (CVTSD2SIrm_Int GR32:$dst, sdmem:$src), 0, "att">;
def : InstAlias<"cvtss2si{q}\t{$src, $dst|$dst, $src}",
                (CVTSS2SI64rr_Int GR64:$dst, VR128:$src), 0, "att">;
def : InstAlias<"cvtss2si{q}\t{$src, $dst|$dst, $src}",
                (CVTSS2SI64rm_Int GR64:$dst, ssmem:$src), 0, "att">;
def : InstAlias<"cvtsd2si{q}\t{$src, $dst|$dst, $src}",
                (CVTSD2SI64rr_Int GR64:$dst, VR128:$src), 0, "att">;
def : InstAlias<"cvtsd2si{q}\t{$src, $dst|$dst, $src}",
                (CVTSD2SI64rm_Int GR64:$dst, sdmem:$src), 0, "att">;

/// SSE 2 Only

// Convert scalar double to scalar single
let hasSideEffects = 0, Predicates = [UseAVX] in {
def VCVTSD2SSrr : VSDI<0x5A, MRMSrcReg, (outs FR32:$dst),
                       (ins FR32:$src1, FR64:$src2),
                       "cvtsd2ss\t{$src2, $src1, $dst|$dst, $src1, $src2}", []>,
                       VEX_4V, VEX_LIG, VEX_WIG,
                       Sched<[WriteCvtSD2SS]>;
let mayLoad = 1 in
def VCVTSD2SSrm : I<0x5A, MRMSrcMem, (outs FR32:$dst),
                    (ins FR32:$src1, f64mem:$src2),
                    "vcvtsd2ss\t{$src2, $src1, $dst|$dst, $src1, $src2}", []>,
                    XD, VEX_4V, VEX_LIG, VEX_WIG,
                    Sched<[WriteCvtSD2SS.Folded, ReadAfterLd]>;
}

def : Pat<(f32 (fpround FR64:$src)),
          (VCVTSD2SSrr (f32 (IMPLICIT_DEF)), FR64:$src)>,
          Requires<[UseAVX]>;

def CVTSD2SSrr : SDI<0x5A, MRMSrcReg, (outs FR32:$dst), (ins FR64:$src),
                     "cvtsd2ss\t{$src, $dst|$dst, $src}",
                     [(set FR32:$dst, (fpround FR64:$src))]>,
                     Sched<[WriteCvtSD2SS]>;
def CVTSD2SSrm : I<0x5A, MRMSrcMem, (outs FR32:$dst), (ins f64mem:$src),
                   "cvtsd2ss\t{$src, $dst|$dst, $src}",
                   [(set FR32:$dst, (fpround (loadf64 addr:$src)))]>,
                   XD, Requires<[UseSSE2, OptForSize]>,
                   Sched<[WriteCvtSD2SS.Folded]>;

let isCodeGenOnly = 1 in {
def VCVTSD2SSrr_Int: I<0x5A, MRMSrcReg,
                       (outs VR128:$dst), (ins VR128:$src1, VR128:$src2),
                       "vcvtsd2ss\t{$src2, $src1, $dst|$dst, $src1, $src2}",
                       [(set VR128:$dst,
                         (int_x86_sse2_cvtsd2ss VR128:$src1, VR128:$src2))]>,
                       XD, VEX_4V, VEX_WIG, Requires<[HasAVX]>,
                       Sched<[WriteCvtSD2SS]>;
def VCVTSD2SSrm_Int: I<0x5A, MRMSrcMem,
                       (outs VR128:$dst), (ins VR128:$src1, sdmem:$src2),
                       "vcvtsd2ss\t{$src2, $src1, $dst|$dst, $src1, $src2}",
                       [(set VR128:$dst, (int_x86_sse2_cvtsd2ss
                                          VR128:$src1, sse_load_f64:$src2))]>,
                       XD, VEX_4V, VEX_WIG, Requires<[HasAVX]>,
                       Sched<[WriteCvtSD2SS.Folded, ReadAfterLd]>;
let Constraints = "$src1 = $dst" in {
def CVTSD2SSrr_Int: I<0x5A, MRMSrcReg,
                      (outs VR128:$dst), (ins VR128:$src1, VR128:$src2),
                      "cvtsd2ss\t{$src2, $dst|$dst, $src2}",
                      [(set VR128:$dst,
                        (int_x86_sse2_cvtsd2ss VR128:$src1, VR128:$src2))]>,
                      XD, Requires<[UseSSE2]>, Sched<[WriteCvtSD2SS]>;
def CVTSD2SSrm_Int: I<0x5A, MRMSrcMem,
                      (outs VR128:$dst), (ins VR128:$src1, sdmem:$src2),
                      "cvtsd2ss\t{$src2, $dst|$dst, $src2}",
                      [(set VR128:$dst, (int_x86_sse2_cvtsd2ss
                                         VR128:$src1, sse_load_f64:$src2))]>,
                      XD, Requires<[UseSSE2]>,
                      Sched<[WriteCvtSD2SS.Folded, ReadAfterLd]>;
}
} // isCodeGenOnly = 1

// Convert scalar single to scalar double
// SSE2 instructions with XS prefix
let hasSideEffects = 0 in {
def VCVTSS2SDrr : I<0x5A, MRMSrcReg, (outs FR64:$dst),
                    (ins FR64:$src1, FR32:$src2),
                    "vcvtss2sd\t{$src2, $src1, $dst|$dst, $src1, $src2}", []>,
                    XS, VEX_4V, VEX_LIG, VEX_WIG,
                    Sched<[WriteCvtSS2SD]>, Requires<[UseAVX]>;
let mayLoad = 1 in
def VCVTSS2SDrm : I<0x5A, MRMSrcMem, (outs FR64:$dst),
                    (ins FR64:$src1, f32mem:$src2),
                    "vcvtss2sd\t{$src2, $src1, $dst|$dst, $src1, $src2}", []>,
                    XS, VEX_4V, VEX_LIG, VEX_WIG,
                    Sched<[WriteCvtSS2SD.Folded, ReadAfterLd]>,
                    Requires<[UseAVX, OptForSize]>;
}

def : Pat<(f64 (fpextend FR32:$src)),
          (VCVTSS2SDrr (f64 (IMPLICIT_DEF)), FR32:$src)>, Requires<[UseAVX]>;
def : Pat<(fpextend (loadf32 addr:$src)),
          (VCVTSS2SDrm (f64 (IMPLICIT_DEF)), addr:$src)>, Requires<[UseAVX, OptForSize]>;

def : Pat<(extloadf32 addr:$src),
          (VCVTSS2SDrm (f64 (IMPLICIT_DEF)), addr:$src)>,
          Requires<[UseAVX, OptForSize]>;
def : Pat<(extloadf32 addr:$src),
          (VCVTSS2SDrr (f64 (IMPLICIT_DEF)), (VMOVSSrm addr:$src))>,
          Requires<[UseAVX, OptForSpeed]>;

def CVTSS2SDrr : I<0x5A, MRMSrcReg, (outs FR64:$dst), (ins FR32:$src),
                   "cvtss2sd\t{$src, $dst|$dst, $src}",
                   [(set FR64:$dst, (fpextend FR32:$src))]>,
                   XS, Requires<[UseSSE2]>, Sched<[WriteCvtSS2SD]>;
def CVTSS2SDrm : I<0x5A, MRMSrcMem, (outs FR64:$dst), (ins f32mem:$src),
                   "cvtss2sd\t{$src, $dst|$dst, $src}",
                   [(set FR64:$dst, (extloadf32 addr:$src))]>,
                   XS, Requires<[UseSSE2, OptForSize]>,
                   Sched<[WriteCvtSS2SD.Folded]>;

// extload f32 -> f64. This matches load+fpextend because we have a hack in
// the isel (PreprocessForFPConvert) that can introduce loads after dag
// combine.
// Since these loads aren't folded into the fpextend, we have to match it
// explicitly here.
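// Note that the OptForSpeed pattern below deliberately keeps the load in a
// separate MOVSSrm instead of folding it into cvtss2sd: the folded form only
// writes part of its destination and would (presumably) add a false
// dependency on the register's previous value, the same concern raised in
// the FIXME above sse12_cvt_sint.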
def : Pat<(fpextend (loadf32 addr:$src)),
          (CVTSS2SDrm addr:$src)>, Requires<[UseSSE2, OptForSize]>;
def : Pat<(extloadf32 addr:$src),
          (CVTSS2SDrr (MOVSSrm addr:$src))>, Requires<[UseSSE2, OptForSpeed]>;

let isCodeGenOnly = 1, hasSideEffects = 0 in {
def VCVTSS2SDrr_Int: I<0x5A, MRMSrcReg,
                       (outs VR128:$dst), (ins VR128:$src1, VR128:$src2),
                       "vcvtss2sd\t{$src2, $src1, $dst|$dst, $src1, $src2}",
                       []>, XS, VEX_4V, VEX_WIG,
                       Requires<[HasAVX]>, Sched<[WriteCvtSS2SD]>;
let mayLoad = 1 in
def VCVTSS2SDrm_Int: I<0x5A, MRMSrcMem,
                       (outs VR128:$dst), (ins VR128:$src1, ssmem:$src2),
                       "vcvtss2sd\t{$src2, $src1, $dst|$dst, $src1, $src2}",
                       []>, XS, VEX_4V, VEX_WIG, Requires<[HasAVX]>,
                       Sched<[WriteCvtSS2SD.Folded, ReadAfterLd]>;
let Constraints = "$src1 = $dst" in { // SSE2 instructions with XS prefix
def CVTSS2SDrr_Int: I<0x5A, MRMSrcReg,
                      (outs VR128:$dst), (ins VR128:$src1, VR128:$src2),
                      "cvtss2sd\t{$src2, $dst|$dst, $src2}",
                      []>, XS, Requires<[UseSSE2]>,
                      Sched<[WriteCvtSS2SD]>;
let mayLoad = 1 in
def CVTSS2SDrm_Int: I<0x5A, MRMSrcMem,
                      (outs VR128:$dst), (ins VR128:$src1, ssmem:$src2),
                      "cvtss2sd\t{$src2, $dst|$dst, $src2}",
                      []>, XS, Requires<[UseSSE2]>,
                      Sched<[WriteCvtSS2SD.Folded, ReadAfterLd]>;
}
} // isCodeGenOnly = 1

// Patterns used for matching (v)cvtsi2ss, (v)cvtsi2sd, (v)cvtsd2ss and
// (v)cvtss2sd intrinsic sequences from clang which produce unnecessary
// vmovs{s,d} instructions
let Predicates = [UseAVX] in {
def : Pat<(v4f32 (X86Movss
                  (v4f32 VR128:$dst),
                  (v4f32 (scalar_to_vector
                          (f32 (fpround (f64 (extractelt VR128:$src, (iPTR 0))))))))),
          (VCVTSD2SSrr_Int VR128:$dst, VR128:$src)>;

def : Pat<(v2f64 (X86Movsd
                  (v2f64 VR128:$dst),
                  (v2f64 (scalar_to_vector
                          (f64 (fpextend (f32 (extractelt VR128:$src, (iPTR 0))))))))),
          (VCVTSS2SDrr_Int VR128:$dst, VR128:$src)>;

def : Pat<(v4f32 (X86Movss
                  (v4f32 VR128:$dst),
                  (v4f32 (scalar_to_vector (f32 (sint_to_fp GR64:$src)))))),
          (VCVTSI642SSrr_Int VR128:$dst, GR64:$src)>;

def : Pat<(v4f32 (X86Movss
                  (v4f32 VR128:$dst),
                  (v4f32 (scalar_to_vector (f32 (sint_to_fp (loadi64 addr:$src))))))),
          (VCVTSI642SSrm_Int VR128:$dst, addr:$src)>;

def : Pat<(v4f32 (X86Movss
                  (v4f32 VR128:$dst),
                  (v4f32 (scalar_to_vector (f32 (sint_to_fp GR32:$src)))))),
          (VCVTSI2SSrr_Int VR128:$dst, GR32:$src)>;

def : Pat<(v4f32 (X86Movss
                  (v4f32 VR128:$dst),
                  (v4f32 (scalar_to_vector (f32 (sint_to_fp (loadi32 addr:$src))))))),
          (VCVTSI2SSrm_Int VR128:$dst, addr:$src)>;

def : Pat<(v2f64 (X86Movsd
                  (v2f64 VR128:$dst),
                  (v2f64 (scalar_to_vector (f64 (sint_to_fp GR64:$src)))))),
          (VCVTSI642SDrr_Int VR128:$dst, GR64:$src)>;

def : Pat<(v2f64 (X86Movsd
                  (v2f64 VR128:$dst),
                  (v2f64 (scalar_to_vector (f64 (sint_to_fp (loadi64 addr:$src))))))),
          (VCVTSI642SDrm_Int VR128:$dst, addr:$src)>;

def : Pat<(v2f64 (X86Movsd
                  (v2f64 VR128:$dst),
                  (v2f64 (scalar_to_vector (f64 (sint_to_fp GR32:$src)))))),
          (VCVTSI2SDrr_Int VR128:$dst, GR32:$src)>;

def : Pat<(v2f64 (X86Movsd
                  (v2f64 VR128:$dst),
                  (v2f64 (scalar_to_vector (f64 (sint_to_fp (loadi32 addr:$src))))))),
          (VCVTSI2SDrm_Int VR128:$dst, addr:$src)>;
} // Predicates = [UseAVX]

let Predicates = [UseSSE2] in {
def : Pat<(v4f32 (X86Movss
                  (v4f32 VR128:$dst),
(v4f32 (scalar_to_vector 1375 (f32 (fpround (f64 (extractelt VR128:$src, (iPTR 0))))))))), 1376 (CVTSD2SSrr_Int VR128:$dst, VR128:$src)>; 1377 1378def : Pat<(v2f64 (X86Movsd 1379 (v2f64 VR128:$dst), 1380 (v2f64 (scalar_to_vector 1381 (f64 (fpextend (f32 (extractelt VR128:$src, (iPTR 0))))))))), 1382 (CVTSS2SDrr_Int VR128:$dst, VR128:$src)>; 1383 1384def : Pat<(v2f64 (X86Movsd 1385 (v2f64 VR128:$dst), 1386 (v2f64 (scalar_to_vector (f64 (sint_to_fp GR64:$src)))))), 1387 (CVTSI642SDrr_Int VR128:$dst, GR64:$src)>; 1388 1389def : Pat<(v2f64 (X86Movsd 1390 (v2f64 VR128:$dst), 1391 (v2f64 (scalar_to_vector (f64 (sint_to_fp (loadi64 addr:$src))))))), 1392 (CVTSI642SDrm_Int VR128:$dst, addr:$src)>; 1393 1394def : Pat<(v2f64 (X86Movsd 1395 (v2f64 VR128:$dst), 1396 (v2f64 (scalar_to_vector (f64 (sint_to_fp GR32:$src)))))), 1397 (CVTSI2SDrr_Int VR128:$dst, GR32:$src)>; 1398 1399def : Pat<(v2f64 (X86Movsd 1400 (v2f64 VR128:$dst), 1401 (v2f64 (scalar_to_vector (f64 (sint_to_fp (loadi32 addr:$src))))))), 1402 (CVTSI2SDrm_Int VR128:$dst, addr:$src)>; 1403} // Predicates = [UseSSE2] 1404 1405let Predicates = [UseSSE1] in { 1406def : Pat<(v4f32 (X86Movss 1407 (v4f32 VR128:$dst), 1408 (v4f32 (scalar_to_vector (f32 (sint_to_fp GR64:$src)))))), 1409 (CVTSI642SSrr_Int VR128:$dst, GR64:$src)>; 1410 1411def : Pat<(v4f32 (X86Movss 1412 (v4f32 VR128:$dst), 1413 (v4f32 (scalar_to_vector (f32 (sint_to_fp (loadi64 addr:$src))))))), 1414 (CVTSI642SSrm_Int VR128:$dst, addr:$src)>; 1415 1416def : Pat<(v4f32 (X86Movss 1417 (v4f32 VR128:$dst), 1418 (v4f32 (scalar_to_vector (f32 (sint_to_fp GR32:$src)))))), 1419 (CVTSI2SSrr_Int VR128:$dst, GR32:$src)>; 1420 1421def : Pat<(v4f32 (X86Movss 1422 (v4f32 VR128:$dst), 1423 (v4f32 (scalar_to_vector (f32 (sint_to_fp (loadi32 addr:$src))))))), 1424 (CVTSI2SSrm_Int VR128:$dst, addr:$src)>; 1425} // Predicates = [UseSSE1] 1426 1427let Predicates = [HasAVX, NoVLX] in { 1428// Convert packed single/double fp to doubleword 1429def VCVTPS2DQrr : VPDI<0x5B, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), 1430 "cvtps2dq\t{$src, $dst|$dst, $src}", 1431 [(set VR128:$dst, (v4i32 (X86cvtp2Int (v4f32 VR128:$src))))]>, 1432 VEX, Sched<[WriteCvtPS2I]>, VEX_WIG; 1433def VCVTPS2DQrm : VPDI<0x5B, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src), 1434 "cvtps2dq\t{$src, $dst|$dst, $src}", 1435 [(set VR128:$dst, 1436 (v4i32 (X86cvtp2Int (loadv4f32 addr:$src))))]>, 1437 VEX, Sched<[WriteCvtPS2ILd]>, VEX_WIG; 1438def VCVTPS2DQYrr : VPDI<0x5B, MRMSrcReg, (outs VR256:$dst), (ins VR256:$src), 1439 "cvtps2dq\t{$src, $dst|$dst, $src}", 1440 [(set VR256:$dst, 1441 (v8i32 (X86cvtp2Int (v8f32 VR256:$src))))]>, 1442 VEX, VEX_L, Sched<[WriteCvtPS2IY]>, VEX_WIG; 1443def VCVTPS2DQYrm : VPDI<0x5B, MRMSrcMem, (outs VR256:$dst), (ins f256mem:$src), 1444 "cvtps2dq\t{$src, $dst|$dst, $src}", 1445 [(set VR256:$dst, 1446 (v8i32 (X86cvtp2Int (loadv8f32 addr:$src))))]>, 1447 VEX, VEX_L, Sched<[WriteCvtPS2IYLd]>, VEX_WIG; 1448} 1449def CVTPS2DQrr : PDI<0x5B, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), 1450 "cvtps2dq\t{$src, $dst|$dst, $src}", 1451 [(set VR128:$dst, (v4i32 (X86cvtp2Int (v4f32 VR128:$src))))]>, 1452 Sched<[WriteCvtPS2I]>; 1453def CVTPS2DQrm : PDI<0x5B, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src), 1454 "cvtps2dq\t{$src, $dst|$dst, $src}", 1455 [(set VR128:$dst, 1456 (v4i32 (X86cvtp2Int (memopv4f32 addr:$src))))]>, 1457 Sched<[WriteCvtPS2ILd]>; 1458 1459 1460// Convert Packed Double FP to Packed DW Integers 1461let Predicates = [HasAVX, NoVLX] in { 1462// The assembler can recognize rr 256-bit 
instructions by seeing a ymm 1463// register, but the same isn't true when using memory operands instead. 1464// Provide other assembly rr and rm forms to address this explicitly. 1465def VCVTPD2DQrr : SDI<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), 1466 "vcvtpd2dq\t{$src, $dst|$dst, $src}", 1467 [(set VR128:$dst, 1468 (v4i32 (X86cvtp2Int (v2f64 VR128:$src))))]>, 1469 VEX, Sched<[WriteCvtPD2I]>, VEX_WIG; 1470 1471// XMM only 1472def : InstAlias<"vcvtpd2dqx\t{$src, $dst|$dst, $src}", 1473 (VCVTPD2DQrr VR128:$dst, VR128:$src), 0>; 1474def VCVTPD2DQrm : SDI<0xE6, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src), 1475 "vcvtpd2dq{x}\t{$src, $dst|$dst, $src}", 1476 [(set VR128:$dst, 1477 (v4i32 (X86cvtp2Int (loadv2f64 addr:$src))))]>, VEX, 1478 Sched<[WriteCvtPD2ILd]>, VEX_WIG; 1479def : InstAlias<"vcvtpd2dqx\t{$src, $dst|$dst, $src}", 1480 (VCVTPD2DQrm VR128:$dst, f128mem:$src), 0, "intel">; 1481 1482// YMM only 1483def VCVTPD2DQYrr : SDI<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR256:$src), 1484 "vcvtpd2dq\t{$src, $dst|$dst, $src}", 1485 [(set VR128:$dst, 1486 (v4i32 (X86cvtp2Int (v4f64 VR256:$src))))]>, 1487 VEX, VEX_L, Sched<[WriteCvtPD2IY]>, VEX_WIG; 1488def VCVTPD2DQYrm : SDI<0xE6, MRMSrcMem, (outs VR128:$dst), (ins f256mem:$src), 1489 "vcvtpd2dq{y}\t{$src, $dst|$dst, $src}", 1490 [(set VR128:$dst, 1491 (v4i32 (X86cvtp2Int (loadv4f64 addr:$src))))]>, 1492 VEX, VEX_L, Sched<[WriteCvtPD2IYLd]>, VEX_WIG; 1493def : InstAlias<"vcvtpd2dqy\t{$src, $dst|$dst, $src}", 1494 (VCVTPD2DQYrr VR128:$dst, VR256:$src), 0>; 1495def : InstAlias<"vcvtpd2dqy\t{$src, $dst|$dst, $src}", 1496 (VCVTPD2DQYrm VR128:$dst, f256mem:$src), 0, "intel">; 1497} 1498 1499def CVTPD2DQrm : SDI<0xE6, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src), 1500 "cvtpd2dq\t{$src, $dst|$dst, $src}", 1501 [(set VR128:$dst, 1502 (v4i32 (X86cvtp2Int (memopv2f64 addr:$src))))]>, 1503 Sched<[WriteCvtPD2ILd]>; 1504def CVTPD2DQrr : SDI<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), 1505 "cvtpd2dq\t{$src, $dst|$dst, $src}", 1506 [(set VR128:$dst, 1507 (v4i32 (X86cvtp2Int (v2f64 VR128:$src))))]>, 1508 Sched<[WriteCvtPD2I]>; 1509 1510// Convert with truncation packed single/double fp to doubleword 1511// SSE2 packed instructions with XS prefix 1512let Predicates = [HasAVX, NoVLX] in { 1513def VCVTTPS2DQrr : VS2SI<0x5B, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), 1514 "cvttps2dq\t{$src, $dst|$dst, $src}", 1515 [(set VR128:$dst, 1516 (v4i32 (X86cvttp2si (v4f32 VR128:$src))))]>, 1517 VEX, Sched<[WriteCvtPS2I]>, VEX_WIG; 1518def VCVTTPS2DQrm : VS2SI<0x5B, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src), 1519 "cvttps2dq\t{$src, $dst|$dst, $src}", 1520 [(set VR128:$dst, 1521 (v4i32 (X86cvttp2si (loadv4f32 addr:$src))))]>, 1522 VEX, Sched<[WriteCvtPS2ILd]>, VEX_WIG; 1523def VCVTTPS2DQYrr : VS2SI<0x5B, MRMSrcReg, (outs VR256:$dst), (ins VR256:$src), 1524 "cvttps2dq\t{$src, $dst|$dst, $src}", 1525 [(set VR256:$dst, 1526 (v8i32 (X86cvttp2si (v8f32 VR256:$src))))]>, 1527 VEX, VEX_L, Sched<[WriteCvtPS2IY]>, VEX_WIG; 1528def VCVTTPS2DQYrm : VS2SI<0x5B, MRMSrcMem, (outs VR256:$dst), (ins f256mem:$src), 1529 "cvttps2dq\t{$src, $dst|$dst, $src}", 1530 [(set VR256:$dst, 1531 (v8i32 (X86cvttp2si (loadv8f32 addr:$src))))]>, 1532 VEX, VEX_L, 1533 Sched<[WriteCvtPS2IYLd]>, VEX_WIG; 1534} 1535 1536let Predicates = [HasAVX, NoVLX] in { 1537 def : Pat<(v4i32 (fp_to_sint (v4f32 VR128:$src))), 1538 (VCVTTPS2DQrr VR128:$src)>; 1539 def : Pat<(v4i32 (fp_to_sint (loadv4f32 addr:$src))), 1540 (VCVTTPS2DQrm addr:$src)>; 1541 def : Pat<(v8i32 (fp_to_sint 
(v8f32 VR256:$src))), 1542 (VCVTTPS2DQYrr VR256:$src)>; 1543 def : Pat<(v8i32 (fp_to_sint (loadv8f32 addr:$src))), 1544 (VCVTTPS2DQYrm addr:$src)>; 1545} 1546 1547def CVTTPS2DQrr : S2SI<0x5B, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), 1548 "cvttps2dq\t{$src, $dst|$dst, $src}", 1549 [(set VR128:$dst, 1550 (v4i32 (X86cvttp2si (v4f32 VR128:$src))))]>, 1551 Sched<[WriteCvtPS2I]>; 1552def CVTTPS2DQrm : S2SI<0x5B, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src), 1553 "cvttps2dq\t{$src, $dst|$dst, $src}", 1554 [(set VR128:$dst, 1555 (v4i32 (X86cvttp2si (memopv4f32 addr:$src))))]>, 1556 Sched<[WriteCvtPS2ILd]>; 1557 1558let Predicates = [UseSSE2] in { 1559 def : Pat<(v4i32 (fp_to_sint (v4f32 VR128:$src))), 1560 (CVTTPS2DQrr VR128:$src)>; 1561 def : Pat<(v4i32 (fp_to_sint (memopv4f32 addr:$src))), 1562 (CVTTPS2DQrm addr:$src)>; 1563} 1564 1565let Predicates = [HasAVX, NoVLX] in 1566def VCVTTPD2DQrr : VPDI<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), 1567 "cvttpd2dq\t{$src, $dst|$dst, $src}", 1568 [(set VR128:$dst, 1569 (v4i32 (X86cvttp2si (v2f64 VR128:$src))))]>, 1570 VEX, Sched<[WriteCvtPD2I]>, VEX_WIG; 1571 1572// The assembler can recognize rr 256-bit instructions by seeing a ymm 1573// register, but the same isn't true when using memory operands instead. 1574// Provide other assembly rr and rm forms to address this explicitly. 1575 1576// XMM only 1577def : InstAlias<"vcvttpd2dqx\t{$src, $dst|$dst, $src}", 1578 (VCVTTPD2DQrr VR128:$dst, VR128:$src), 0>; 1579 1580let Predicates = [HasAVX, NoVLX] in 1581def VCVTTPD2DQrm : VPDI<0xE6, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src), 1582 "cvttpd2dq{x}\t{$src, $dst|$dst, $src}", 1583 [(set VR128:$dst, 1584 (v4i32 (X86cvttp2si (loadv2f64 addr:$src))))]>, 1585 VEX, Sched<[WriteCvtPD2ILd]>, VEX_WIG; 1586def : InstAlias<"vcvttpd2dqx\t{$src, $dst|$dst, $src}", 1587 (VCVTTPD2DQrm VR128:$dst, f128mem:$src), 0, "intel">; 1588 1589// YMM only 1590let Predicates = [HasAVX, NoVLX] in { 1591def VCVTTPD2DQYrr : VPDI<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR256:$src), 1592 "cvttpd2dq\t{$src, $dst|$dst, $src}", 1593 [(set VR128:$dst, 1594 (v4i32 (X86cvttp2si (v4f64 VR256:$src))))]>, 1595 VEX, VEX_L, Sched<[WriteCvtPD2IY]>, VEX_WIG; 1596def VCVTTPD2DQYrm : VPDI<0xE6, MRMSrcMem, (outs VR128:$dst), (ins f256mem:$src), 1597 "cvttpd2dq{y}\t{$src, $dst|$dst, $src}", 1598 [(set VR128:$dst, 1599 (v4i32 (X86cvttp2si (loadv4f64 addr:$src))))]>, 1600 VEX, VEX_L, Sched<[WriteCvtPD2IYLd]>, VEX_WIG; 1601} 1602def : InstAlias<"vcvttpd2dqy\t{$src, $dst|$dst, $src}", 1603 (VCVTTPD2DQYrr VR128:$dst, VR256:$src), 0>; 1604def : InstAlias<"vcvttpd2dqy\t{$src, $dst|$dst, $src}", 1605 (VCVTTPD2DQYrm VR128:$dst, f256mem:$src), 0, "intel">; 1606 1607let Predicates = [HasAVX, NoVLX] in { 1608 def : Pat<(v4i32 (fp_to_sint (v4f64 VR256:$src))), 1609 (VCVTTPD2DQYrr VR256:$src)>; 1610 def : Pat<(v4i32 (fp_to_sint (loadv4f64 addr:$src))), 1611 (VCVTTPD2DQYrm addr:$src)>; 1612} 1613 1614let Predicates = [HasAVX, NoVLX] in { 1615 def : Pat<(X86vzmovl (v2i64 (bitconvert 1616 (v4i32 (X86cvtp2Int (v2f64 VR128:$src)))))), 1617 (VCVTPD2DQrr VR128:$src)>; 1618 def : Pat<(X86vzmovl (v2i64 (bitconvert 1619 (v4i32 (X86cvtp2Int (loadv2f64 addr:$src)))))), 1620 (VCVTPD2DQrm addr:$src)>; 1621 def : Pat<(X86vzmovl (v2i64 (bitconvert 1622 (v4i32 (X86cvttp2si (v2f64 VR128:$src)))))), 1623 (VCVTTPD2DQrr VR128:$src)>; 1624 def : Pat<(X86vzmovl (v2i64 (bitconvert 1625 (v4i32 (X86cvttp2si (loadv2f64 addr:$src)))))), 1626 (VCVTTPD2DQrm addr:$src)>; 1627} // Predicates = [HasAVX, NoVLX] 1628 1629def 
CVTTPD2DQrr : PDI<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), 1630 "cvttpd2dq\t{$src, $dst|$dst, $src}", 1631 [(set VR128:$dst, 1632 (v4i32 (X86cvttp2si (v2f64 VR128:$src))))]>, 1633 Sched<[WriteCvtPD2I]>; 1634def CVTTPD2DQrm : PDI<0xE6, MRMSrcMem, (outs VR128:$dst),(ins f128mem:$src), 1635 "cvttpd2dq\t{$src, $dst|$dst, $src}", 1636 [(set VR128:$dst, 1637 (v4i32 (X86cvttp2si (memopv2f64 addr:$src))))]>, 1638 Sched<[WriteCvtPD2ILd]>; 1639 1640let Predicates = [UseSSE2] in { 1641 def : Pat<(X86vzmovl (v2i64 (bitconvert 1642 (v4i32 (X86cvtp2Int (v2f64 VR128:$src)))))), 1643 (CVTPD2DQrr VR128:$src)>; 1644 def : Pat<(X86vzmovl (v2i64 (bitconvert 1645 (v4i32 (X86cvtp2Int (memopv2f64 addr:$src)))))), 1646 (CVTPD2DQrm addr:$src)>; 1647 def : Pat<(X86vzmovl (v2i64 (bitconvert 1648 (v4i32 (X86cvttp2si (v2f64 VR128:$src)))))), 1649 (CVTTPD2DQrr VR128:$src)>; 1650 def : Pat<(X86vzmovl (v2i64 (bitconvert 1651 (v4i32 (X86cvttp2si (memopv2f64 addr:$src)))))), 1652 (CVTTPD2DQrm addr:$src)>; 1653} // Predicates = [UseSSE2] 1654 1655// Convert packed single to packed double 1656let Predicates = [HasAVX, NoVLX] in { 1657 // SSE2 instructions without OpSize prefix 1658def VCVTPS2PDrr : I<0x5A, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), 1659 "vcvtps2pd\t{$src, $dst|$dst, $src}", 1660 [(set VR128:$dst, (v2f64 (X86vfpext (v4f32 VR128:$src))))]>, 1661 PS, VEX, Sched<[WriteCvtPS2PD]>, VEX_WIG; 1662def VCVTPS2PDrm : I<0x5A, MRMSrcMem, (outs VR128:$dst), (ins f64mem:$src), 1663 "vcvtps2pd\t{$src, $dst|$dst, $src}", 1664 [(set VR128:$dst, (v2f64 (extloadv2f32 addr:$src)))]>, 1665 PS, VEX, Sched<[WriteCvtPS2PD.Folded]>, VEX_WIG; 1666def VCVTPS2PDYrr : I<0x5A, MRMSrcReg, (outs VR256:$dst), (ins VR128:$src), 1667 "vcvtps2pd\t{$src, $dst|$dst, $src}", 1668 [(set VR256:$dst, (v4f64 (fpextend (v4f32 VR128:$src))))]>, 1669 PS, VEX, VEX_L, Sched<[WriteCvtPS2PDY]>, VEX_WIG; 1670def VCVTPS2PDYrm : I<0x5A, MRMSrcMem, (outs VR256:$dst), (ins f128mem:$src), 1671 "vcvtps2pd\t{$src, $dst|$dst, $src}", 1672 [(set VR256:$dst, (v4f64 (extloadv4f32 addr:$src)))]>, 1673 PS, VEX, VEX_L, Sched<[WriteCvtPS2PDY.Folded]>, VEX_WIG; 1674} 1675 1676let Predicates = [UseSSE2] in { 1677def CVTPS2PDrr : I<0x5A, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), 1678 "cvtps2pd\t{$src, $dst|$dst, $src}", 1679 [(set VR128:$dst, (v2f64 (X86vfpext (v4f32 VR128:$src))))]>, 1680 PS, Sched<[WriteCvtPS2PD]>; 1681def CVTPS2PDrm : I<0x5A, MRMSrcMem, (outs VR128:$dst), (ins f64mem:$src), 1682 "cvtps2pd\t{$src, $dst|$dst, $src}", 1683 [(set VR128:$dst, (v2f64 (extloadv2f32 addr:$src)))]>, 1684 PS, Sched<[WriteCvtPS2PD.Folded]>; 1685} 1686 1687// Convert Packed DW Integers to Packed Double FP 1688let Predicates = [HasAVX, NoVLX] in { 1689let hasSideEffects = 0, mayLoad = 1 in 1690def VCVTDQ2PDrm : S2SI<0xE6, MRMSrcMem, (outs VR128:$dst), (ins i64mem:$src), 1691 "vcvtdq2pd\t{$src, $dst|$dst, $src}", 1692 [(set VR128:$dst, 1693 (v2f64 (X86VSintToFP (bc_v4i32 (loadv2i64 addr:$src)))))]>, 1694 VEX, Sched<[WriteCvtI2PDLd]>, VEX_WIG; 1695def VCVTDQ2PDrr : S2SI<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), 1696 "vcvtdq2pd\t{$src, $dst|$dst, $src}", 1697 [(set VR128:$dst, 1698 (v2f64 (X86VSintToFP (v4i32 VR128:$src))))]>, 1699 VEX, Sched<[WriteCvtI2PD]>, VEX_WIG; 1700def VCVTDQ2PDYrm : S2SI<0xE6, MRMSrcMem, (outs VR256:$dst), (ins i128mem:$src), 1701 "vcvtdq2pd\t{$src, $dst|$dst, $src}", 1702 [(set VR256:$dst, 1703 (v4f64 (sint_to_fp (bc_v4i32 (loadv2i64 addr:$src)))))]>, 1704 VEX, VEX_L, Sched<[WriteCvtI2PDYLd]>, 1705 VEX_WIG; 1706def VCVTDQ2PDYrr 
: S2SI<0xE6, MRMSrcReg, (outs VR256:$dst), (ins VR128:$src), 1707 "vcvtdq2pd\t{$src, $dst|$dst, $src}", 1708 [(set VR256:$dst, 1709 (v4f64 (sint_to_fp (v4i32 VR128:$src))))]>, 1710 VEX, VEX_L, Sched<[WriteCvtI2PDY]>, VEX_WIG; 1711} 1712 1713let hasSideEffects = 0, mayLoad = 1 in 1714def CVTDQ2PDrm : S2SI<0xE6, MRMSrcMem, (outs VR128:$dst), (ins i64mem:$src), 1715 "cvtdq2pd\t{$src, $dst|$dst, $src}", 1716 [(set VR128:$dst, 1717 (v2f64 (X86VSintToFP (bc_v4i32 (loadv2i64 addr:$src)))))]>, 1718 Sched<[WriteCvtI2PDLd]>; 1719def CVTDQ2PDrr : S2SI<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), 1720 "cvtdq2pd\t{$src, $dst|$dst, $src}", 1721 [(set VR128:$dst, 1722 (v2f64 (X86VSintToFP (v4i32 VR128:$src))))]>, 1723 Sched<[WriteCvtI2PD]>; 1724 1725// AVX register conversion intrinsics 1726let Predicates = [HasAVX, NoVLX] in { 1727 def : Pat<(v2f64 (X86VSintToFP (bc_v4i32 (v2i64 (scalar_to_vector (loadi64 addr:$src)))))), 1728 (VCVTDQ2PDrm addr:$src)>; 1729 def : Pat<(v2f64 (X86VSintToFP (bc_v4i32 (v2i64 (X86vzload addr:$src))))), 1730 (VCVTDQ2PDrm addr:$src)>; 1731} // Predicates = [HasAVX, NoVLX] 1732 1733// SSE2 register conversion intrinsics 1734let Predicates = [UseSSE2] in { 1735 def : Pat<(v2f64 (X86VSintToFP (bc_v4i32 (v2i64 (scalar_to_vector (loadi64 addr:$src)))))), 1736 (CVTDQ2PDrm addr:$src)>; 1737 def : Pat<(v2f64 (X86VSintToFP (bc_v4i32 (v2i64 (X86vzload addr:$src))))), 1738 (CVTDQ2PDrm addr:$src)>; 1739} // Predicates = [UseSSE2] 1740 1741// Convert packed double to packed single 1742// The assembler can recognize rr 256-bit instructions by seeing a ymm 1743// register, but the same isn't true when using memory operands instead. 1744// Provide other assembly rr and rm forms to address this explicitly. 1745let Predicates = [HasAVX, NoVLX] in 1746def VCVTPD2PSrr : VPDI<0x5A, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), 1747 "cvtpd2ps\t{$src, $dst|$dst, $src}", 1748 [(set VR128:$dst, (X86vfpround (v2f64 VR128:$src)))]>, 1749 VEX, Sched<[WriteCvtPD2PS]>, VEX_WIG; 1750 1751// XMM only 1752def : InstAlias<"vcvtpd2psx\t{$src, $dst|$dst, $src}", 1753 (VCVTPD2PSrr VR128:$dst, VR128:$src), 0>; 1754let Predicates = [HasAVX, NoVLX] in 1755def VCVTPD2PSrm : VPDI<0x5A, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src), 1756 "cvtpd2ps{x}\t{$src, $dst|$dst, $src}", 1757 [(set VR128:$dst, (X86vfpround (loadv2f64 addr:$src)))]>, 1758 VEX, Sched<[WriteCvtPD2PS.Folded]>, VEX_WIG; 1759def : InstAlias<"vcvtpd2psx\t{$src, $dst|$dst, $src}", 1760 (VCVTPD2PSrm VR128:$dst, f128mem:$src), 0, "intel">; 1761 1762// YMM only 1763let Predicates = [HasAVX, NoVLX] in { 1764def VCVTPD2PSYrr : VPDI<0x5A, MRMSrcReg, (outs VR128:$dst), (ins VR256:$src), 1765 "cvtpd2ps\t{$src, $dst|$dst, $src}", 1766 [(set VR128:$dst, (fpround VR256:$src))]>, 1767 VEX, VEX_L, Sched<[WriteCvtPD2PSY]>, VEX_WIG; 1768def VCVTPD2PSYrm : VPDI<0x5A, MRMSrcMem, (outs VR128:$dst), (ins f256mem:$src), 1769 "cvtpd2ps{y}\t{$src, $dst|$dst, $src}", 1770 [(set VR128:$dst, (fpround (loadv4f64 addr:$src)))]>, 1771 VEX, VEX_L, Sched<[WriteCvtPD2PSY.Folded]>, VEX_WIG; 1772} 1773def : InstAlias<"vcvtpd2psy\t{$src, $dst|$dst, $src}", 1774 (VCVTPD2PSYrr VR128:$dst, VR256:$src), 0>; 1775def : InstAlias<"vcvtpd2psy\t{$src, $dst|$dst, $src}", 1776 (VCVTPD2PSYrm VR128:$dst, f256mem:$src), 0, "intel">; 1777 1778def CVTPD2PSrr : PDI<0x5A, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), 1779 "cvtpd2ps\t{$src, $dst|$dst, $src}", 1780 [(set VR128:$dst, (X86vfpround (v2f64 VR128:$src)))]>, 1781 Sched<[WriteCvtPD2PS]>; 1782def CVTPD2PSrm : PDI<0x5A, MRMSrcMem, 
(outs VR128:$dst), (ins f128mem:$src), 1783 "cvtpd2ps\t{$src, $dst|$dst, $src}", 1784 [(set VR128:$dst, (X86vfpround (memopv2f64 addr:$src)))]>, 1785 Sched<[WriteCvtPD2PS.Folded]>; 1786 1787// AVX 256-bit register conversion intrinsics 1788// FIXME: Migrate SSE conversion intrinsics matching to use patterns as below 1789// whenever possible to avoid declaring two versions of each one. 1790 1791let Predicates = [HasAVX, NoVLX] in { 1792 // Match fpround and fpextend for 128/256-bit conversions 1793 def : Pat<(X86vzmovl (v2f64 (bitconvert 1794 (v4f32 (X86vfpround (v2f64 VR128:$src)))))), 1795 (VCVTPD2PSrr VR128:$src)>; 1796 def : Pat<(X86vzmovl (v2f64 (bitconvert 1797 (v4f32 (X86vfpround (loadv2f64 addr:$src)))))), 1798 (VCVTPD2PSrm addr:$src)>; 1799} 1800 1801let Predicates = [UseSSE2] in { 1802 // Match fpround and fpextend for 128 conversions 1803 def : Pat<(X86vzmovl (v2f64 (bitconvert 1804 (v4f32 (X86vfpround (v2f64 VR128:$src)))))), 1805 (CVTPD2PSrr VR128:$src)>; 1806 def : Pat<(X86vzmovl (v2f64 (bitconvert 1807 (v4f32 (X86vfpround (memopv2f64 addr:$src)))))), 1808 (CVTPD2PSrm addr:$src)>; 1809} 1810 1811//===----------------------------------------------------------------------===// 1812// SSE 1 & 2 - Compare Instructions 1813//===----------------------------------------------------------------------===// 1814 1815// sse12_cmp_scalar - sse 1 & 2 compare scalar instructions 1816multiclass sse12_cmp_scalar<RegisterClass RC, X86MemOperand x86memop, 1817 Operand CC, SDNode OpNode, ValueType VT, 1818 PatFrag ld_frag, string asm, string asm_alt, 1819 X86FoldableSchedWrite sched> { 1820 let isCommutable = 1 in 1821 def rr : SIi8<0xC2, MRMSrcReg, 1822 (outs RC:$dst), (ins RC:$src1, RC:$src2, CC:$cc), asm, 1823 [(set RC:$dst, (OpNode (VT RC:$src1), RC:$src2, imm:$cc))]>, 1824 Sched<[sched]>; 1825 def rm : SIi8<0xC2, MRMSrcMem, 1826 (outs RC:$dst), (ins RC:$src1, x86memop:$src2, CC:$cc), asm, 1827 [(set RC:$dst, (OpNode (VT RC:$src1), 1828 (ld_frag addr:$src2), imm:$cc))]>, 1829 Sched<[sched.Folded, ReadAfterLd]>; 1830 1831 // Accept explicit immediate argument form instead of comparison code. 
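  // For example (illustrative), the assembler accepts both spellings of the
  // same encoding (imm8 == 1, i.e. LT):
  //   cmpltss %xmm1, %xmm0
  //   cmpss   $1, %xmm1, %xmm0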
1832 let isAsmParserOnly = 1, hasSideEffects = 0 in { 1833 def rr_alt : SIi8<0xC2, MRMSrcReg, (outs RC:$dst), 1834 (ins RC:$src1, RC:$src2, u8imm:$cc), asm_alt, []>, 1835 Sched<[sched]>, NotMemoryFoldable; 1836 let mayLoad = 1 in 1837 def rm_alt : SIi8<0xC2, MRMSrcMem, (outs RC:$dst), 1838 (ins RC:$src1, x86memop:$src2, u8imm:$cc), asm_alt, []>, 1839 Sched<[sched.Folded, ReadAfterLd]>, NotMemoryFoldable; 1840 } 1841} 1842 1843let ExeDomain = SSEPackedSingle in 1844defm VCMPSS : sse12_cmp_scalar<FR32, f32mem, AVXCC, X86cmps, f32, loadf32, 1845 "cmp${cc}ss\t{$src2, $src1, $dst|$dst, $src1, $src2}", 1846 "cmpss\t{$cc, $src2, $src1, $dst|$dst, $src1, $src2, $cc}", 1847 SchedWriteFCmpSizes.PS.Scl>, XS, VEX_4V, VEX_LIG, VEX_WIG; 1848let ExeDomain = SSEPackedDouble in 1849defm VCMPSD : sse12_cmp_scalar<FR64, f64mem, AVXCC, X86cmps, f64, loadf64, 1850 "cmp${cc}sd\t{$src2, $src1, $dst|$dst, $src1, $src2}", 1851 "cmpsd\t{$cc, $src2, $src1, $dst|$dst, $src1, $src2, $cc}", 1852 SchedWriteFCmpSizes.PD.Scl>, 1853 XD, VEX_4V, VEX_LIG, VEX_WIG; 1854 1855let Constraints = "$src1 = $dst" in { 1856 let ExeDomain = SSEPackedSingle in 1857 defm CMPSS : sse12_cmp_scalar<FR32, f32mem, SSECC, X86cmps, f32, loadf32, 1858 "cmp${cc}ss\t{$src2, $dst|$dst, $src2}", 1859 "cmpss\t{$cc, $src2, $dst|$dst, $src2, $cc}", 1860 SchedWriteFCmpSizes.PS.Scl>, XS; 1861 let ExeDomain = SSEPackedDouble in 1862 defm CMPSD : sse12_cmp_scalar<FR64, f64mem, SSECC, X86cmps, f64, loadf64, 1863 "cmp${cc}sd\t{$src2, $dst|$dst, $src2}", 1864 "cmpsd\t{$cc, $src2, $dst|$dst, $src2, $cc}", 1865 SchedWriteFCmpSizes.PD.Scl>, XD; 1866} 1867 1868multiclass sse12_cmp_scalar_int<Operand memop, Operand CC, 1869 Intrinsic Int, string asm, X86FoldableSchedWrite sched, 1870 ComplexPattern mem_cpat> { 1871 def rr_Int : SIi8<0xC2, MRMSrcReg, (outs VR128:$dst), 1872 (ins VR128:$src1, VR128:$src, CC:$cc), asm, 1873 [(set VR128:$dst, (Int VR128:$src1, 1874 VR128:$src, imm:$cc))]>, 1875 Sched<[sched]>; 1876let mayLoad = 1 in 1877 def rm_Int : SIi8<0xC2, MRMSrcMem, (outs VR128:$dst), 1878 (ins VR128:$src1, memop:$src, CC:$cc), asm, 1879 [(set VR128:$dst, (Int VR128:$src1, 1880 mem_cpat:$src, imm:$cc))]>, 1881 Sched<[sched.Folded, ReadAfterLd]>; 1882} 1883 1884let isCodeGenOnly = 1 in { 1885 // Aliases to match intrinsics which expect XMM operand(s). 
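  // For example (illustrative), clang lowers _mm_cmplt_ss(a, b) to the
  // int_x86_sse_cmp_ss intrinsic with immediate 1; both operands are whole
  // v4f32 vectors, so these _Int defs are matched instead of the FR32 forms
  // above.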
1886 let ExeDomain = SSEPackedSingle in 1887 defm VCMPSS : sse12_cmp_scalar_int<ssmem, AVXCC, int_x86_sse_cmp_ss, 1888 "cmp${cc}ss\t{$src, $src1, $dst|$dst, $src1, $src}", 1889 SchedWriteFCmpSizes.PS.Scl, sse_load_f32>, XS, VEX_4V; 1890 let ExeDomain = SSEPackedDouble in 1891 defm VCMPSD : sse12_cmp_scalar_int<sdmem, AVXCC, int_x86_sse2_cmp_sd, 1892 "cmp${cc}sd\t{$src, $src1, $dst|$dst, $src1, $src}", 1893 SchedWriteFCmpSizes.PD.Scl, sse_load_f64>, 1894 XD, VEX_4V; 1895 let Constraints = "$src1 = $dst" in { 1896 let ExeDomain = SSEPackedSingle in 1897 defm CMPSS : sse12_cmp_scalar_int<ssmem, SSECC, int_x86_sse_cmp_ss, 1898 "cmp${cc}ss\t{$src, $dst|$dst, $src}", 1899 SchedWriteFCmpSizes.PS.Scl, sse_load_f32>, XS; 1900 let ExeDomain = SSEPackedDouble in 1901 defm CMPSD : sse12_cmp_scalar_int<sdmem, SSECC, int_x86_sse2_cmp_sd, 1902 "cmp${cc}sd\t{$src, $dst|$dst, $src}", 1903 SchedWriteFCmpSizes.PD.Scl, sse_load_f64>, XD; 1904} 1905} 1906 1907 1908// sse12_ord_cmp - Unordered/Ordered scalar fp compare and set EFLAGS 1909multiclass sse12_ord_cmp<bits<8> opc, RegisterClass RC, SDNode OpNode, 1910 ValueType vt, X86MemOperand x86memop, 1911 PatFrag ld_frag, string OpcodeStr, 1912 X86FoldableSchedWrite sched> { 1913let hasSideEffects = 0 in { 1914 def rr: SI<opc, MRMSrcReg, (outs), (ins RC:$src1, RC:$src2), 1915 !strconcat(OpcodeStr, "\t{$src2, $src1|$src1, $src2}"), 1916 [(set EFLAGS, (OpNode (vt RC:$src1), RC:$src2))]>, 1917 Sched<[sched]>; 1918let mayLoad = 1 in 1919 def rm: SI<opc, MRMSrcMem, (outs), (ins RC:$src1, x86memop:$src2), 1920 !strconcat(OpcodeStr, "\t{$src2, $src1|$src1, $src2}"), 1921 [(set EFLAGS, (OpNode (vt RC:$src1), 1922 (ld_frag addr:$src2)))]>, 1923 Sched<[sched.Folded, ReadAfterLd]>; 1924} 1925} 1926 1927// sse12_ord_cmp_int - Intrinsic version of sse12_ord_cmp 1928multiclass sse12_ord_cmp_int<bits<8> opc, RegisterClass RC, SDNode OpNode, 1929 ValueType vt, Operand memop, 1930 ComplexPattern mem_cpat, string OpcodeStr, 1931 X86FoldableSchedWrite sched> { 1932 def rr_Int: SI<opc, MRMSrcReg, (outs), (ins RC:$src1, RC:$src2), 1933 !strconcat(OpcodeStr, "\t{$src2, $src1|$src1, $src2}"), 1934 [(set EFLAGS, (OpNode (vt RC:$src1), RC:$src2))]>, 1935 Sched<[sched]>; 1936let mayLoad = 1 in 1937 def rm_Int: SI<opc, MRMSrcMem, (outs), (ins RC:$src1, memop:$src2), 1938 !strconcat(OpcodeStr, "\t{$src2, $src1|$src1, $src2}"), 1939 [(set EFLAGS, (OpNode (vt RC:$src1), 1940 mem_cpat:$src2))]>, 1941 Sched<[sched.Folded, ReadAfterLd]>; 1942} 1943 1944let Defs = [EFLAGS] in { 1945 defm VUCOMISS : sse12_ord_cmp<0x2E, FR32, X86cmp, f32, f32mem, loadf32, 1946 "ucomiss", WriteFCom>, PS, VEX, VEX_LIG, VEX_WIG; 1947 defm VUCOMISD : sse12_ord_cmp<0x2E, FR64, X86cmp, f64, f64mem, loadf64, 1948 "ucomisd", WriteFCom>, PD, VEX, VEX_LIG, VEX_WIG; 1949 let Pattern = []<dag> in { 1950 defm VCOMISS : sse12_ord_cmp<0x2F, FR32, undef, f32, f32mem, loadf32, 1951 "comiss", WriteFCom>, PS, VEX, VEX_LIG, VEX_WIG; 1952 defm VCOMISD : sse12_ord_cmp<0x2F, FR64, undef, f64, f64mem, loadf64, 1953 "comisd", WriteFCom>, PD, VEX, VEX_LIG, VEX_WIG; 1954 } 1955 1956 let isCodeGenOnly = 1 in { 1957 defm VUCOMISS : sse12_ord_cmp_int<0x2E, VR128, X86ucomi, v4f32, ssmem, 1958 sse_load_f32, "ucomiss", WriteFCom>, PS, VEX, VEX_WIG; 1959 defm VUCOMISD : sse12_ord_cmp_int<0x2E, VR128, X86ucomi, v2f64, sdmem, 1960 sse_load_f64, "ucomisd", WriteFCom>, PD, VEX, VEX_WIG; 1961 1962 defm VCOMISS : sse12_ord_cmp_int<0x2F, VR128, X86comi, v4f32, ssmem, 1963 sse_load_f32, "comiss", WriteFCom>, PS, VEX, VEX_WIG; 1964 defm VCOMISD : 
sse12_ord_cmp_int<0x2F, VR128, X86comi, v2f64, sdmem, 1965 sse_load_f64, "comisd", WriteFCom>, PD, VEX, VEX_WIG; 1966 } 1967 defm UCOMISS : sse12_ord_cmp<0x2E, FR32, X86cmp, f32, f32mem, loadf32, 1968 "ucomiss", WriteFCom>, PS; 1969 defm UCOMISD : sse12_ord_cmp<0x2E, FR64, X86cmp, f64, f64mem, loadf64, 1970 "ucomisd", WriteFCom>, PD; 1971 1972 let Pattern = []<dag> in { 1973 defm COMISS : sse12_ord_cmp<0x2F, FR32, undef, f32, f32mem, loadf32, 1974 "comiss", WriteFCom>, PS; 1975 defm COMISD : sse12_ord_cmp<0x2F, FR64, undef, f64, f64mem, loadf64, 1976 "comisd", WriteFCom>, PD; 1977 } 1978 1979 let isCodeGenOnly = 1 in { 1980 defm UCOMISS : sse12_ord_cmp_int<0x2E, VR128, X86ucomi, v4f32, ssmem, 1981 sse_load_f32, "ucomiss", WriteFCom>, PS; 1982 defm UCOMISD : sse12_ord_cmp_int<0x2E, VR128, X86ucomi, v2f64, sdmem, 1983 sse_load_f64, "ucomisd", WriteFCom>, PD; 1984 1985 defm COMISS : sse12_ord_cmp_int<0x2F, VR128, X86comi, v4f32, ssmem, 1986 sse_load_f32, "comiss", WriteFCom>, PS; 1987 defm COMISD : sse12_ord_cmp_int<0x2F, VR128, X86comi, v2f64, sdmem, 1988 sse_load_f64, "comisd", WriteFCom>, PD; 1989 } 1990} // Defs = [EFLAGS] 1991 1992// sse12_cmp_packed - sse 1 & 2 compare packed instructions 1993multiclass sse12_cmp_packed<RegisterClass RC, X86MemOperand x86memop, 1994 Operand CC, ValueType VT, string asm, 1995 string asm_alt, X86FoldableSchedWrite sched, 1996 Domain d, PatFrag ld_frag> { 1997 let isCommutable = 1 in 1998 def rri : PIi8<0xC2, MRMSrcReg, 1999 (outs RC:$dst), (ins RC:$src1, RC:$src2, CC:$cc), asm, 2000 [(set RC:$dst, (VT (X86cmpp RC:$src1, RC:$src2, imm:$cc)))], d>, 2001 Sched<[sched]>; 2002 def rmi : PIi8<0xC2, MRMSrcMem, 2003 (outs RC:$dst), (ins RC:$src1, x86memop:$src2, CC:$cc), asm, 2004 [(set RC:$dst, 2005 (VT (X86cmpp RC:$src1, (ld_frag addr:$src2), imm:$cc)))], d>, 2006 Sched<[sched.Folded, ReadAfterLd]>; 2007 2008 // Accept explicit immediate argument form instead of comparison code. 
2009 let isAsmParserOnly = 1, hasSideEffects = 0 in { 2010 def rri_alt : PIi8<0xC2, MRMSrcReg, 2011 (outs RC:$dst), (ins RC:$src1, RC:$src2, u8imm:$cc), 2012 asm_alt, [], d>, Sched<[sched]>, NotMemoryFoldable; 2013 let mayLoad = 1 in 2014 def rmi_alt : PIi8<0xC2, MRMSrcMem, 2015 (outs RC:$dst), (ins RC:$src1, x86memop:$src2, u8imm:$cc), 2016 asm_alt, [], d>, Sched<[sched.Folded, ReadAfterLd]>, 2017 NotMemoryFoldable; 2018 } 2019} 2020 2021defm VCMPPS : sse12_cmp_packed<VR128, f128mem, AVXCC, v4f32, 2022 "cmp${cc}ps\t{$src2, $src1, $dst|$dst, $src1, $src2}", 2023 "cmpps\t{$cc, $src2, $src1, $dst|$dst, $src1, $src2, $cc}", 2024 SchedWriteFCmpSizes.PS.XMM, SSEPackedSingle, loadv4f32>, PS, VEX_4V, VEX_WIG; 2025defm VCMPPD : sse12_cmp_packed<VR128, f128mem, AVXCC, v2f64, 2026 "cmp${cc}pd\t{$src2, $src1, $dst|$dst, $src1, $src2}", 2027 "cmppd\t{$cc, $src2, $src1, $dst|$dst, $src1, $src2, $cc}", 2028 SchedWriteFCmpSizes.PD.XMM, SSEPackedDouble, loadv2f64>, PD, VEX_4V, VEX_WIG; 2029defm VCMPPSY : sse12_cmp_packed<VR256, f256mem, AVXCC, v8f32, 2030 "cmp${cc}ps\t{$src2, $src1, $dst|$dst, $src1, $src2}", 2031 "cmpps\t{$cc, $src2, $src1, $dst|$dst, $src1, $src2, $cc}", 2032 SchedWriteFCmpSizes.PS.YMM, SSEPackedSingle, loadv8f32>, PS, VEX_4V, VEX_L, VEX_WIG; 2033defm VCMPPDY : sse12_cmp_packed<VR256, f256mem, AVXCC, v4f64, 2034 "cmp${cc}pd\t{$src2, $src1, $dst|$dst, $src1, $src2}", 2035 "cmppd\t{$cc, $src2, $src1, $dst|$dst, $src1, $src2, $cc}", 2036 SchedWriteFCmpSizes.PD.YMM, SSEPackedDouble, loadv4f64>, PD, VEX_4V, VEX_L, VEX_WIG; 2037let Constraints = "$src1 = $dst" in { 2038 defm CMPPS : sse12_cmp_packed<VR128, f128mem, SSECC, v4f32, 2039 "cmp${cc}ps\t{$src2, $dst|$dst, $src2}", 2040 "cmpps\t{$cc, $src2, $dst|$dst, $src2, $cc}", 2041 SchedWriteFCmpSizes.PS.XMM, SSEPackedSingle, memopv4f32>, PS; 2042 defm CMPPD : sse12_cmp_packed<VR128, f128mem, SSECC, v2f64, 2043 "cmp${cc}pd\t{$src2, $dst|$dst, $src2}", 2044 "cmppd\t{$cc, $src2, $dst|$dst, $src2, $cc}", 2045 SchedWriteFCmpSizes.PD.XMM, SSEPackedDouble, memopv2f64>, PD; 2046} 2047 2048def CommutableCMPCC : PatLeaf<(imm), [{ 2049 uint64_t Imm = N->getZExtValue() & 0x7; 2050 return (Imm == 0x00 || Imm == 0x03 || Imm == 0x04 || Imm == 0x07); 2051}]>; 2052 2053// Patterns to select compares with loads in first operand. 
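// The immediates accepted by CommutableCMPCC (0x00/0x03/0x04/0x07) are
// EQ/UNORD/NEQ/ORD, the predicates that give the same result when their
// operands are swapped, which is what makes it legal to fold a load from the
// first operand into the memory forms, e.g. (illustrative):
//   (X86cmpp (loadv4f32 addr:$p), VR128:$x, 0 /*EQ*/)  -->  VCMPPSrmi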
2054let Predicates = [HasAVX] in { 2055 def : Pat<(v4f64 (X86cmpp (loadv4f64 addr:$src2), VR256:$src1, 2056 CommutableCMPCC:$cc)), 2057 (VCMPPDYrmi VR256:$src1, addr:$src2, imm:$cc)>; 2058 2059 def : Pat<(v8f32 (X86cmpp (loadv8f32 addr:$src2), VR256:$src1, 2060 CommutableCMPCC:$cc)), 2061 (VCMPPSYrmi VR256:$src1, addr:$src2, imm:$cc)>; 2062 2063 def : Pat<(v2f64 (X86cmpp (loadv2f64 addr:$src2), VR128:$src1, 2064 CommutableCMPCC:$cc)), 2065 (VCMPPDrmi VR128:$src1, addr:$src2, imm:$cc)>; 2066 2067 def : Pat<(v4f32 (X86cmpp (loadv4f32 addr:$src2), VR128:$src1, 2068 CommutableCMPCC:$cc)), 2069 (VCMPPSrmi VR128:$src1, addr:$src2, imm:$cc)>; 2070 2071 def : Pat<(f64 (X86cmps (loadf64 addr:$src2), FR64:$src1, 2072 CommutableCMPCC:$cc)), 2073 (VCMPSDrm FR64:$src1, addr:$src2, imm:$cc)>; 2074 2075 def : Pat<(f32 (X86cmps (loadf32 addr:$src2), FR32:$src1, 2076 CommutableCMPCC:$cc)), 2077 (VCMPSSrm FR32:$src1, addr:$src2, imm:$cc)>; 2078} 2079 2080let Predicates = [UseSSE2] in { 2081 def : Pat<(v2f64 (X86cmpp (memopv2f64 addr:$src2), VR128:$src1, 2082 CommutableCMPCC:$cc)), 2083 (CMPPDrmi VR128:$src1, addr:$src2, imm:$cc)>; 2084 2085 def : Pat<(f64 (X86cmps (loadf64 addr:$src2), FR64:$src1, 2086 CommutableCMPCC:$cc)), 2087 (CMPSDrm FR64:$src1, addr:$src2, imm:$cc)>; 2088} 2089 2090let Predicates = [UseSSE1] in { 2091 def : Pat<(v4f32 (X86cmpp (memopv4f32 addr:$src2), VR128:$src1, 2092 CommutableCMPCC:$cc)), 2093 (CMPPSrmi VR128:$src1, addr:$src2, imm:$cc)>; 2094 2095 def : Pat<(f32 (X86cmps (loadf32 addr:$src2), FR32:$src1, 2096 CommutableCMPCC:$cc)), 2097 (CMPSSrm FR32:$src1, addr:$src2, imm:$cc)>; 2098} 2099 2100//===----------------------------------------------------------------------===// 2101// SSE 1 & 2 - Shuffle Instructions 2102//===----------------------------------------------------------------------===// 2103 2104/// sse12_shuffle - sse 1 & 2 fp shuffle instructions 2105multiclass sse12_shuffle<RegisterClass RC, X86MemOperand x86memop, 2106 ValueType vt, string asm, PatFrag mem_frag, 2107 X86FoldableSchedWrite sched, Domain d> { 2108 def rmi : PIi8<0xC6, MRMSrcMem, (outs RC:$dst), 2109 (ins RC:$src1, x86memop:$src2, u8imm:$src3), asm, 2110 [(set RC:$dst, (vt (X86Shufp RC:$src1, (mem_frag addr:$src2), 2111 (i8 imm:$src3))))], d>, 2112 Sched<[sched.Folded, ReadAfterLd]>; 2113 def rri : PIi8<0xC6, MRMSrcReg, (outs RC:$dst), 2114 (ins RC:$src1, RC:$src2, u8imm:$src3), asm, 2115 [(set RC:$dst, (vt (X86Shufp RC:$src1, RC:$src2, 2116 (i8 imm:$src3))))], d>, 2117 Sched<[sched]>; 2118} 2119 2120let Predicates = [HasAVX, NoVLX] in { 2121 defm VSHUFPS : sse12_shuffle<VR128, f128mem, v4f32, 2122 "shufps\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}", 2123 loadv4f32, SchedWriteFShuffle.XMM, SSEPackedSingle>, 2124 PS, VEX_4V, VEX_WIG; 2125 defm VSHUFPSY : sse12_shuffle<VR256, f256mem, v8f32, 2126 "shufps\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}", 2127 loadv8f32, SchedWriteFShuffle.YMM, SSEPackedSingle>, 2128 PS, VEX_4V, VEX_L, VEX_WIG; 2129 defm VSHUFPD : sse12_shuffle<VR128, f128mem, v2f64, 2130 "shufpd\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}", 2131 loadv2f64, SchedWriteFShuffle.XMM, SSEPackedDouble>, 2132 PD, VEX_4V, VEX_WIG; 2133 defm VSHUFPDY : sse12_shuffle<VR256, f256mem, v4f64, 2134 "shufpd\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}", 2135 loadv4f64, SchedWriteFShuffle.YMM, SSEPackedDouble>, 2136 PD, VEX_4V, VEX_L, VEX_WIG; 2137} 2138let Constraints = "$src1 = $dst" in { 2139 defm SHUFPS : sse12_shuffle<VR128, f128mem, v4f32, 2140 
"shufps\t{$src3, $src2, $dst|$dst, $src2, $src3}", 2141 memopv4f32, SchedWriteFShuffle.XMM, SSEPackedSingle>, PS; 2142 defm SHUFPD : sse12_shuffle<VR128, f128mem, v2f64, 2143 "shufpd\t{$src3, $src2, $dst|$dst, $src2, $src3}", 2144 memopv2f64, SchedWriteFShuffle.XMM, SSEPackedDouble>, PD; 2145} 2146 2147//===----------------------------------------------------------------------===// 2148// SSE 1 & 2 - Unpack FP Instructions 2149//===----------------------------------------------------------------------===// 2150 2151/// sse12_unpack_interleave - sse 1 & 2 fp unpack and interleave 2152multiclass sse12_unpack_interleave<bits<8> opc, SDNode OpNode, ValueType vt, 2153 PatFrag mem_frag, RegisterClass RC, 2154 X86MemOperand x86memop, string asm, 2155 X86FoldableSchedWrite sched, Domain d, 2156 bit IsCommutable = 0> { 2157 let isCommutable = IsCommutable in 2158 def rr : PI<opc, MRMSrcReg, 2159 (outs RC:$dst), (ins RC:$src1, RC:$src2), 2160 asm, [(set RC:$dst, 2161 (vt (OpNode RC:$src1, RC:$src2)))], d>, 2162 Sched<[sched]>; 2163 def rm : PI<opc, MRMSrcMem, 2164 (outs RC:$dst), (ins RC:$src1, x86memop:$src2), 2165 asm, [(set RC:$dst, 2166 (vt (OpNode RC:$src1, 2167 (mem_frag addr:$src2))))], d>, 2168 Sched<[sched.Folded, ReadAfterLd]>; 2169} 2170 2171let Predicates = [HasAVX, NoVLX] in { 2172defm VUNPCKHPS: sse12_unpack_interleave<0x15, X86Unpckh, v4f32, loadv4f32, 2173 VR128, f128mem, "unpckhps\t{$src2, $src1, $dst|$dst, $src1, $src2}", 2174 SchedWriteFShuffle.XMM, SSEPackedSingle>, PS, VEX_4V, VEX_WIG; 2175defm VUNPCKHPD: sse12_unpack_interleave<0x15, X86Unpckh, v2f64, loadv2f64, 2176 VR128, f128mem, "unpckhpd\t{$src2, $src1, $dst|$dst, $src1, $src2}", 2177 SchedWriteFShuffle.XMM, SSEPackedDouble, 1>, PD, VEX_4V, VEX_WIG; 2178defm VUNPCKLPS: sse12_unpack_interleave<0x14, X86Unpckl, v4f32, loadv4f32, 2179 VR128, f128mem, "unpcklps\t{$src2, $src1, $dst|$dst, $src1, $src2}", 2180 SchedWriteFShuffle.XMM, SSEPackedSingle>, PS, VEX_4V, VEX_WIG; 2181defm VUNPCKLPD: sse12_unpack_interleave<0x14, X86Unpckl, v2f64, loadv2f64, 2182 VR128, f128mem, "unpcklpd\t{$src2, $src1, $dst|$dst, $src1, $src2}", 2183 SchedWriteFShuffle.XMM, SSEPackedDouble>, PD, VEX_4V, VEX_WIG; 2184 2185defm VUNPCKHPSY: sse12_unpack_interleave<0x15, X86Unpckh, v8f32, loadv8f32, 2186 VR256, f256mem, "unpckhps\t{$src2, $src1, $dst|$dst, $src1, $src2}", 2187 SchedWriteFShuffle.YMM, SSEPackedSingle>, PS, VEX_4V, VEX_L, VEX_WIG; 2188defm VUNPCKHPDY: sse12_unpack_interleave<0x15, X86Unpckh, v4f64, loadv4f64, 2189 VR256, f256mem, "unpckhpd\t{$src2, $src1, $dst|$dst, $src1, $src2}", 2190 SchedWriteFShuffle.YMM, SSEPackedDouble>, PD, VEX_4V, VEX_L, VEX_WIG; 2191defm VUNPCKLPSY: sse12_unpack_interleave<0x14, X86Unpckl, v8f32, loadv8f32, 2192 VR256, f256mem, "unpcklps\t{$src2, $src1, $dst|$dst, $src1, $src2}", 2193 SchedWriteFShuffle.YMM, SSEPackedSingle>, PS, VEX_4V, VEX_L, VEX_WIG; 2194defm VUNPCKLPDY: sse12_unpack_interleave<0x14, X86Unpckl, v4f64, loadv4f64, 2195 VR256, f256mem, "unpcklpd\t{$src2, $src1, $dst|$dst, $src1, $src2}", 2196 SchedWriteFShuffle.YMM, SSEPackedDouble>, PD, VEX_4V, VEX_L, VEX_WIG; 2197}// Predicates = [HasAVX, NoVLX] 2198 2199let Constraints = "$src1 = $dst" in { 2200 defm UNPCKHPS: sse12_unpack_interleave<0x15, X86Unpckh, v4f32, memopv4f32, 2201 VR128, f128mem, "unpckhps\t{$src2, $dst|$dst, $src2}", 2202 SchedWriteFShuffle.XMM, SSEPackedSingle>, PS; 2203 defm UNPCKHPD: sse12_unpack_interleave<0x15, X86Unpckh, v2f64, memopv2f64, 2204 VR128, f128mem, "unpckhpd\t{$src2, $dst|$dst, $src2}", 2205 SchedWriteFShuffle.XMM, 
                     SSEPackedDouble, 1>, PD;
  defm UNPCKLPS: sse12_unpack_interleave<0x14, X86Unpckl, v4f32, memopv4f32,
                     VR128, f128mem, "unpcklps\t{$src2, $dst|$dst, $src2}",
                     SchedWriteFShuffle.XMM, SSEPackedSingle>, PS;
  defm UNPCKLPD: sse12_unpack_interleave<0x14, X86Unpckl, v2f64, memopv2f64,
                     VR128, f128mem, "unpcklpd\t{$src2, $dst|$dst, $src2}",
                     SchedWriteFShuffle.XMM, SSEPackedDouble>, PD;
} // Constraints = "$src1 = $dst"

let Predicates = [HasAVX1Only] in {
  def : Pat<(v8i32 (X86Unpckl VR256:$src1, (bc_v8i32 (loadv4i64 addr:$src2)))),
            (VUNPCKLPSYrm VR256:$src1, addr:$src2)>;
  def : Pat<(v8i32 (X86Unpckl VR256:$src1, VR256:$src2)),
            (VUNPCKLPSYrr VR256:$src1, VR256:$src2)>;
  def : Pat<(v8i32 (X86Unpckh VR256:$src1, (bc_v8i32 (loadv4i64 addr:$src2)))),
            (VUNPCKHPSYrm VR256:$src1, addr:$src2)>;
  def : Pat<(v8i32 (X86Unpckh VR256:$src1, VR256:$src2)),
            (VUNPCKHPSYrr VR256:$src1, VR256:$src2)>;

  def : Pat<(v4i64 (X86Unpckl VR256:$src1, (loadv4i64 addr:$src2))),
            (VUNPCKLPDYrm VR256:$src1, addr:$src2)>;
  def : Pat<(v4i64 (X86Unpckl VR256:$src1, VR256:$src2)),
            (VUNPCKLPDYrr VR256:$src1, VR256:$src2)>;
  def : Pat<(v4i64 (X86Unpckh VR256:$src1, (loadv4i64 addr:$src2))),
            (VUNPCKHPDYrm VR256:$src1, addr:$src2)>;
  def : Pat<(v4i64 (X86Unpckh VR256:$src1, VR256:$src2)),
            (VUNPCKHPDYrr VR256:$src1, VR256:$src2)>;
}

//===----------------------------------------------------------------------===//
// SSE 1 & 2 - Extract Floating-Point Sign mask
//===----------------------------------------------------------------------===//

/// sse12_extr_sign_mask - sse 1 & 2 fp sign mask extraction
multiclass sse12_extr_sign_mask<RegisterClass RC, ValueType vt,
                                string asm, Domain d> {
  def rr : PI<0x50, MRMSrcReg, (outs GR32orGR64:$dst), (ins RC:$src),
              !strconcat(asm, "\t{$src, $dst|$dst, $src}"),
              [(set GR32orGR64:$dst, (X86movmsk (vt RC:$src)))], d>,
              Sched<[WriteFMOVMSK]>;
}

let Predicates = [HasAVX] in {
  defm VMOVMSKPS : sse12_extr_sign_mask<VR128, v4f32, "movmskps",
                                        SSEPackedSingle>, PS, VEX, VEX_WIG;
  defm VMOVMSKPD : sse12_extr_sign_mask<VR128, v2f64, "movmskpd",
                                        SSEPackedDouble>, PD, VEX, VEX_WIG;
  defm VMOVMSKPSY : sse12_extr_sign_mask<VR256, v8f32, "movmskps",
                                         SSEPackedSingle>, PS, VEX, VEX_L, VEX_WIG;
  defm VMOVMSKPDY : sse12_extr_sign_mask<VR256, v4f64, "movmskpd",
                                         SSEPackedDouble>, PD, VEX, VEX_L, VEX_WIG;
}

defm MOVMSKPS : sse12_extr_sign_mask<VR128, v4f32, "movmskps",
                                     SSEPackedSingle>, PS;
defm MOVMSKPD : sse12_extr_sign_mask<VR128, v2f64, "movmskpd",
                                     SSEPackedDouble>, PD;

//===---------------------------------------------------------------------===//
// SSE2 - Packed Integer Logical Instructions
//===---------------------------------------------------------------------===//

let ExeDomain = SSEPackedInt in { // SSE integer instructions

/// PDI_binop_rm - Simple SSE2 binary operator.
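///
/// Each instantiation provides an rr and an rm form; for instance
/// (illustrative) the PAND defm below expands via PDI_binop_all into
/// PANDrr/PANDrm plus the VEX-encoded VPANDrr/VPANDrm and VPANDYrr/VPANDYrm
/// variants.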
2270multiclass PDI_binop_rm<bits<8> opc, string OpcodeStr, SDNode OpNode, 2271 ValueType OpVT, RegisterClass RC, PatFrag memop_frag, 2272 X86MemOperand x86memop, X86FoldableSchedWrite sched, 2273 bit IsCommutable, bit Is2Addr> { 2274 let isCommutable = IsCommutable in 2275 def rr : PDI<opc, MRMSrcReg, (outs RC:$dst), 2276 (ins RC:$src1, RC:$src2), 2277 !if(Is2Addr, 2278 !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"), 2279 !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")), 2280 [(set RC:$dst, (OpVT (OpNode RC:$src1, RC:$src2)))]>, 2281 Sched<[sched]>; 2282 def rm : PDI<opc, MRMSrcMem, (outs RC:$dst), 2283 (ins RC:$src1, x86memop:$src2), 2284 !if(Is2Addr, 2285 !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"), 2286 !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")), 2287 [(set RC:$dst, (OpVT (OpNode RC:$src1, 2288 (bitconvert (memop_frag addr:$src2)))))]>, 2289 Sched<[sched.Folded, ReadAfterLd]>; 2290} 2291} // ExeDomain = SSEPackedInt 2292 2293multiclass PDI_binop_all<bits<8> opc, string OpcodeStr, SDNode Opcode, 2294 ValueType OpVT128, ValueType OpVT256, 2295 X86SchedWriteWidths sched, bit IsCommutable, 2296 Predicate prd> { 2297let Predicates = [HasAVX, prd] in 2298 defm V#NAME : PDI_binop_rm<opc, !strconcat("v", OpcodeStr), Opcode, OpVT128, 2299 VR128, loadv2i64, i128mem, sched.XMM, 2300 IsCommutable, 0>, VEX_4V, VEX_WIG; 2301 2302let Constraints = "$src1 = $dst" in 2303 defm NAME : PDI_binop_rm<opc, OpcodeStr, Opcode, OpVT128, VR128, 2304 memopv2i64, i128mem, sched.XMM, IsCommutable, 1>; 2305 2306let Predicates = [HasAVX2, prd] in 2307 defm V#NAME#Y : PDI_binop_rm<opc, !strconcat("v", OpcodeStr), Opcode, 2308 OpVT256, VR256, loadv4i64, i256mem, sched.YMM, 2309 IsCommutable, 0>, VEX_4V, VEX_L, VEX_WIG; 2310} 2311 2312// These are ordered here for pattern ordering requirements with the fp versions 2313 2314defm PAND : PDI_binop_all<0xDB, "pand", and, v2i64, v4i64, 2315 SchedWriteVecLogic, 1, NoVLX>; 2316defm POR : PDI_binop_all<0xEB, "por", or, v2i64, v4i64, 2317 SchedWriteVecLogic, 1, NoVLX>; 2318defm PXOR : PDI_binop_all<0xEF, "pxor", xor, v2i64, v4i64, 2319 SchedWriteVecLogic, 1, NoVLX>; 2320defm PANDN : PDI_binop_all<0xDF, "pandn", X86andnp, v2i64, v4i64, 2321 SchedWriteVecLogic, 0, NoVLX>; 2322 2323//===----------------------------------------------------------------------===// 2324// SSE 1 & 2 - Logical Instructions 2325//===----------------------------------------------------------------------===// 2326 2327/// sse12_fp_packed_logical - SSE 1 & 2 packed FP logical ops 2328/// 2329/// There are no patterns here because isel prefers integer versions for SSE2 2330/// and later. There are SSE1 v4f32 patterns later. 
2331multiclass sse12_fp_packed_logical<bits<8> opc, string OpcodeStr, 2332 SDNode OpNode, X86SchedWriteWidths sched> { 2333 let Predicates = [HasAVX, NoVLX] in { 2334 defm V#NAME#PSY : sse12_fp_packed_logical_rm<opc, VR256, SSEPackedSingle, 2335 !strconcat(OpcodeStr, "ps"), f256mem, sched.YMM, 2336 [], [], 0>, PS, VEX_4V, VEX_L, VEX_WIG; 2337 2338 defm V#NAME#PDY : sse12_fp_packed_logical_rm<opc, VR256, SSEPackedDouble, 2339 !strconcat(OpcodeStr, "pd"), f256mem, sched.YMM, 2340 [], [], 0>, PD, VEX_4V, VEX_L, VEX_WIG; 2341 2342 defm V#NAME#PS : sse12_fp_packed_logical_rm<opc, VR128, SSEPackedSingle, 2343 !strconcat(OpcodeStr, "ps"), f128mem, sched.XMM, 2344 [], [], 0>, PS, VEX_4V, VEX_WIG; 2345 2346 defm V#NAME#PD : sse12_fp_packed_logical_rm<opc, VR128, SSEPackedDouble, 2347 !strconcat(OpcodeStr, "pd"), f128mem, sched.XMM, 2348 [], [], 0>, PD, VEX_4V, VEX_WIG; 2349 } 2350 2351 let Constraints = "$src1 = $dst" in { 2352 defm PS : sse12_fp_packed_logical_rm<opc, VR128, SSEPackedSingle, 2353 !strconcat(OpcodeStr, "ps"), f128mem, sched.XMM, 2354 [], []>, PS; 2355 2356 defm PD : sse12_fp_packed_logical_rm<opc, VR128, SSEPackedDouble, 2357 !strconcat(OpcodeStr, "pd"), f128mem, sched.XMM, 2358 [], []>, PD; 2359 } 2360} 2361 2362defm AND : sse12_fp_packed_logical<0x54, "and", and, SchedWriteFLogic>; 2363defm OR : sse12_fp_packed_logical<0x56, "or", or, SchedWriteFLogic>; 2364defm XOR : sse12_fp_packed_logical<0x57, "xor", xor, SchedWriteFLogic>; 2365let isCommutable = 0 in 2366 defm ANDN : sse12_fp_packed_logical<0x55, "andn", X86andnp, SchedWriteFLogic>; 2367 2368// If only AVX1 is supported, we need to handle integer operations with 2369// floating point instructions since the integer versions aren't available. 2370let Predicates = [HasAVX1Only] in { 2371 def : Pat<(v4i64 (and VR256:$src1, VR256:$src2)), 2372 (VANDPSYrr VR256:$src1, VR256:$src2)>; 2373 def : Pat<(v4i64 (or VR256:$src1, VR256:$src2)), 2374 (VORPSYrr VR256:$src1, VR256:$src2)>; 2375 def : Pat<(v4i64 (xor VR256:$src1, VR256:$src2)), 2376 (VXORPSYrr VR256:$src1, VR256:$src2)>; 2377 def : Pat<(v4i64 (X86andnp VR256:$src1, VR256:$src2)), 2378 (VANDNPSYrr VR256:$src1, VR256:$src2)>; 2379 2380 def : Pat<(and VR256:$src1, (loadv4i64 addr:$src2)), 2381 (VANDPSYrm VR256:$src1, addr:$src2)>; 2382 def : Pat<(or VR256:$src1, (loadv4i64 addr:$src2)), 2383 (VORPSYrm VR256:$src1, addr:$src2)>; 2384 def : Pat<(xor VR256:$src1, (loadv4i64 addr:$src2)), 2385 (VXORPSYrm VR256:$src1, addr:$src2)>; 2386 def : Pat<(X86andnp VR256:$src1, (loadv4i64 addr:$src2)), 2387 (VANDNPSYrm VR256:$src1, addr:$src2)>; 2388} 2389 2390let Predicates = [HasAVX, NoVLX_Or_NoDQI] in { 2391 // Use packed logical operations for scalar ops. 
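  // These scalar X86fand/X86for/X86fxor/X86fandn nodes typically come from
  // fabs/fneg/copysign lowering, e.g. (illustrative)
  //   float negate(float x) { return -x; }   // xor with the sign-bit mask
  // There is no scalar form of these logic instructions, so the operands are
  // bounced through VR128 and the packed forms are used.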
2392 def : Pat<(f64 (X86fand FR64:$src1, FR64:$src2)), 2393 (COPY_TO_REGCLASS 2394 (v2f64 (VANDPDrr (v2f64 (COPY_TO_REGCLASS FR64:$src1, VR128)), 2395 (v2f64 (COPY_TO_REGCLASS FR64:$src2, VR128)))), 2396 FR64)>; 2397 def : Pat<(f64 (X86for FR64:$src1, FR64:$src2)), 2398 (COPY_TO_REGCLASS 2399 (v2f64 (VORPDrr (v2f64 (COPY_TO_REGCLASS FR64:$src1, VR128)), 2400 (v2f64 (COPY_TO_REGCLASS FR64:$src2, VR128)))), 2401 FR64)>; 2402 def : Pat<(f64 (X86fxor FR64:$src1, FR64:$src2)), 2403 (COPY_TO_REGCLASS 2404 (v2f64 (VXORPDrr (v2f64 (COPY_TO_REGCLASS FR64:$src1, VR128)), 2405 (v2f64 (COPY_TO_REGCLASS FR64:$src2, VR128)))), 2406 FR64)>; 2407 def : Pat<(f64 (X86fandn FR64:$src1, FR64:$src2)), 2408 (COPY_TO_REGCLASS 2409 (v2f64 (VANDNPDrr (v2f64 (COPY_TO_REGCLASS FR64:$src1, VR128)), 2410 (v2f64 (COPY_TO_REGCLASS FR64:$src2, VR128)))), 2411 FR64)>; 2412 2413 def : Pat<(f32 (X86fand FR32:$src1, FR32:$src2)), 2414 (COPY_TO_REGCLASS 2415 (v4f32 (VANDPSrr (v4f32 (COPY_TO_REGCLASS FR32:$src1, VR128)), 2416 (v4f32 (COPY_TO_REGCLASS FR32:$src2, VR128)))), 2417 FR32)>; 2418 def : Pat<(f32 (X86for FR32:$src1, FR32:$src2)), 2419 (COPY_TO_REGCLASS 2420 (v4f32 (VORPSrr (v4f32 (COPY_TO_REGCLASS FR32:$src1, VR128)), 2421 (v4f32 (COPY_TO_REGCLASS FR32:$src2, VR128)))), 2422 FR32)>; 2423 def : Pat<(f32 (X86fxor FR32:$src1, FR32:$src2)), 2424 (COPY_TO_REGCLASS 2425 (v4f32 (VXORPSrr (v4f32 (COPY_TO_REGCLASS FR32:$src1, VR128)), 2426 (v4f32 (COPY_TO_REGCLASS FR32:$src2, VR128)))), 2427 FR32)>; 2428 def : Pat<(f32 (X86fandn FR32:$src1, FR32:$src2)), 2429 (COPY_TO_REGCLASS 2430 (v4f32 (VANDNPSrr (v4f32 (COPY_TO_REGCLASS FR32:$src1, VR128)), 2431 (v4f32 (COPY_TO_REGCLASS FR32:$src2, VR128)))), 2432 FR32)>; 2433} 2434 2435let Predicates = [UseSSE1] in { 2436 // Use packed logical operations for scalar ops. 2437 def : Pat<(f32 (X86fand FR32:$src1, FR32:$src2)), 2438 (COPY_TO_REGCLASS 2439 (v4f32 (ANDPSrr (v4f32 (COPY_TO_REGCLASS FR32:$src1, VR128)), 2440 (v4f32 (COPY_TO_REGCLASS FR32:$src2, VR128)))), 2441 FR32)>; 2442 def : Pat<(f32 (X86for FR32:$src1, FR32:$src2)), 2443 (COPY_TO_REGCLASS 2444 (v4f32 (ORPSrr (v4f32 (COPY_TO_REGCLASS FR32:$src1, VR128)), 2445 (v4f32 (COPY_TO_REGCLASS FR32:$src2, VR128)))), 2446 FR32)>; 2447 def : Pat<(f32 (X86fxor FR32:$src1, FR32:$src2)), 2448 (COPY_TO_REGCLASS 2449 (v4f32 (XORPSrr (v4f32 (COPY_TO_REGCLASS FR32:$src1, VR128)), 2450 (v4f32 (COPY_TO_REGCLASS FR32:$src2, VR128)))), 2451 FR32)>; 2452 def : Pat<(f32 (X86fandn FR32:$src1, FR32:$src2)), 2453 (COPY_TO_REGCLASS 2454 (v4f32 (ANDNPSrr (v4f32 (COPY_TO_REGCLASS FR32:$src1, VR128)), 2455 (v4f32 (COPY_TO_REGCLASS FR32:$src2, VR128)))), 2456 FR32)>; 2457} 2458 2459let Predicates = [UseSSE2] in { 2460 // Use packed logical operations for scalar ops. 
2461 def : Pat<(f64 (X86fand FR64:$src1, FR64:$src2)), 2462 (COPY_TO_REGCLASS 2463 (v2f64 (ANDPDrr (v2f64 (COPY_TO_REGCLASS FR64:$src1, VR128)), 2464 (v2f64 (COPY_TO_REGCLASS FR64:$src2, VR128)))), 2465 FR64)>; 2466 def : Pat<(f64 (X86for FR64:$src1, FR64:$src2)), 2467 (COPY_TO_REGCLASS 2468 (v2f64 (ORPDrr (v2f64 (COPY_TO_REGCLASS FR64:$src1, VR128)), 2469 (v2f64 (COPY_TO_REGCLASS FR64:$src2, VR128)))), 2470 FR64)>; 2471 def : Pat<(f64 (X86fxor FR64:$src1, FR64:$src2)), 2472 (COPY_TO_REGCLASS 2473 (v2f64 (XORPDrr (v2f64 (COPY_TO_REGCLASS FR64:$src1, VR128)), 2474 (v2f64 (COPY_TO_REGCLASS FR64:$src2, VR128)))), 2475 FR64)>; 2476 def : Pat<(f64 (X86fandn FR64:$src1, FR64:$src2)), 2477 (COPY_TO_REGCLASS 2478 (v2f64 (ANDNPDrr (v2f64 (COPY_TO_REGCLASS FR64:$src1, VR128)), 2479 (v2f64 (COPY_TO_REGCLASS FR64:$src2, VR128)))), 2480 FR64)>; 2481} 2482 2483// Patterns for packed operations when we don't have integer type available. 2484def : Pat<(v4f32 (X86fand VR128:$src1, VR128:$src2)), 2485 (ANDPSrr VR128:$src1, VR128:$src2)>; 2486def : Pat<(v4f32 (X86for VR128:$src1, VR128:$src2)), 2487 (ORPSrr VR128:$src1, VR128:$src2)>; 2488def : Pat<(v4f32 (X86fxor VR128:$src1, VR128:$src2)), 2489 (XORPSrr VR128:$src1, VR128:$src2)>; 2490def : Pat<(v4f32 (X86fandn VR128:$src1, VR128:$src2)), 2491 (ANDNPSrr VR128:$src1, VR128:$src2)>; 2492 2493def : Pat<(X86fand VR128:$src1, (memopv4f32 addr:$src2)), 2494 (ANDPSrm VR128:$src1, addr:$src2)>; 2495def : Pat<(X86for VR128:$src1, (memopv4f32 addr:$src2)), 2496 (ORPSrm VR128:$src1, addr:$src2)>; 2497def : Pat<(X86fxor VR128:$src1, (memopv4f32 addr:$src2)), 2498 (XORPSrm VR128:$src1, addr:$src2)>; 2499def : Pat<(X86fandn VR128:$src1, (memopv4f32 addr:$src2)), 2500 (ANDNPSrm VR128:$src1, addr:$src2)>; 2501 2502//===----------------------------------------------------------------------===// 2503// SSE 1 & 2 - Arithmetic Instructions 2504//===----------------------------------------------------------------------===// 2505 2506/// basic_sse12_fp_binop_xxx - SSE 1 & 2 binops come in both scalar and 2507/// vector forms. 2508/// 2509/// In addition, we also have a special variant of the scalar form here to 2510/// represent the associated intrinsic operation. This form is unlike the 2511/// plain scalar form, in that it takes an entire vector (instead of a scalar) 2512/// and leaves the top elements unmodified (therefore these cannot be commuted). 2513/// 2514/// These three forms can each be reg+reg or reg+mem. 
2515/// 2516 2517/// FIXME: once all 256-bit intrinsics are matched, cleanup and refactor those 2518/// classes below 2519multiclass basic_sse12_fp_binop_p<bits<8> opc, string OpcodeStr, 2520 SDNode OpNode, X86SchedWriteSizes sched> { 2521 let Predicates = [HasAVX, NoVLX] in { 2522 defm V#NAME#PS : sse12_fp_packed<opc, !strconcat(OpcodeStr, "ps"), OpNode, 2523 VR128, v4f32, f128mem, loadv4f32, 2524 SSEPackedSingle, sched.PS.XMM, 0>, PS, VEX_4V, VEX_WIG; 2525 defm V#NAME#PD : sse12_fp_packed<opc, !strconcat(OpcodeStr, "pd"), OpNode, 2526 VR128, v2f64, f128mem, loadv2f64, 2527 SSEPackedDouble, sched.PD.XMM, 0>, PD, VEX_4V, VEX_WIG; 2528 2529 defm V#NAME#PSY : sse12_fp_packed<opc, !strconcat(OpcodeStr, "ps"), 2530 OpNode, VR256, v8f32, f256mem, loadv8f32, 2531 SSEPackedSingle, sched.PS.YMM, 0>, PS, VEX_4V, VEX_L, VEX_WIG; 2532 defm V#NAME#PDY : sse12_fp_packed<opc, !strconcat(OpcodeStr, "pd"), 2533 OpNode, VR256, v4f64, f256mem, loadv4f64, 2534 SSEPackedDouble, sched.PD.YMM, 0>, PD, VEX_4V, VEX_L, VEX_WIG; 2535 } 2536 2537 let Constraints = "$src1 = $dst" in { 2538 defm PS : sse12_fp_packed<opc, !strconcat(OpcodeStr, "ps"), OpNode, VR128, 2539 v4f32, f128mem, memopv4f32, SSEPackedSingle, 2540 sched.PS.XMM>, PS; 2541 defm PD : sse12_fp_packed<opc, !strconcat(OpcodeStr, "pd"), OpNode, VR128, 2542 v2f64, f128mem, memopv2f64, SSEPackedDouble, 2543 sched.PD.XMM>, PD; 2544 } 2545} 2546 2547multiclass basic_sse12_fp_binop_s<bits<8> opc, string OpcodeStr, SDNode OpNode, 2548 X86SchedWriteSizes sched> { 2549 defm V#NAME#SS : sse12_fp_scalar<opc, !strconcat(OpcodeStr, "ss"), 2550 OpNode, FR32, f32mem, SSEPackedSingle, sched.PS.Scl, 0>, 2551 XS, VEX_4V, VEX_LIG, VEX_WIG; 2552 defm V#NAME#SD : sse12_fp_scalar<opc, !strconcat(OpcodeStr, "sd"), 2553 OpNode, FR64, f64mem, SSEPackedDouble, sched.PD.Scl, 0>, 2554 XD, VEX_4V, VEX_LIG, VEX_WIG; 2555 2556 let Constraints = "$src1 = $dst" in { 2557 defm SS : sse12_fp_scalar<opc, !strconcat(OpcodeStr, "ss"), 2558 OpNode, FR32, f32mem, SSEPackedSingle, 2559 sched.PS.Scl>, XS; 2560 defm SD : sse12_fp_scalar<opc, !strconcat(OpcodeStr, "sd"), 2561 OpNode, FR64, f64mem, SSEPackedDouble, 2562 sched.PD.Scl>, XD; 2563 } 2564} 2565 2566multiclass basic_sse12_fp_binop_s_int<bits<8> opc, string OpcodeStr, 2567 SDPatternOperator OpNode, 2568 X86SchedWriteSizes sched> { 2569 defm V#NAME#SS : sse12_fp_scalar_int<opc, OpcodeStr, OpNode, VR128, v4f32, 2570 !strconcat(OpcodeStr, "ss"), ssmem, sse_load_f32, 2571 SSEPackedSingle, sched.PS.Scl, 0>, XS, VEX_4V, VEX_LIG, VEX_WIG; 2572 defm V#NAME#SD : sse12_fp_scalar_int<opc, OpcodeStr, OpNode, VR128, v2f64, 2573 !strconcat(OpcodeStr, "sd"), sdmem, sse_load_f64, 2574 SSEPackedDouble, sched.PD.Scl, 0>, XD, VEX_4V, VEX_LIG, VEX_WIG; 2575 2576 let Constraints = "$src1 = $dst" in { 2577 defm SS : sse12_fp_scalar_int<opc, OpcodeStr, OpNode, VR128, v4f32, 2578 !strconcat(OpcodeStr, "ss"), ssmem, sse_load_f32, 2579 SSEPackedSingle, sched.PS.Scl>, XS; 2580 defm SD : sse12_fp_scalar_int<opc, OpcodeStr, OpNode, VR128, v2f64, 2581 !strconcat(OpcodeStr, "sd"), sdmem, sse_load_f64, 2582 SSEPackedDouble, sched.PD.Scl>, XD; 2583 } 2584} 2585 2586// Binary Arithmetic instructions 2587defm ADD : basic_sse12_fp_binop_p<0x58, "add", fadd, SchedWriteFAddSizes>, 2588 basic_sse12_fp_binop_s<0x58, "add", fadd, SchedWriteFAddSizes>, 2589 basic_sse12_fp_binop_s_int<0x58, "add", null_frag, SchedWriteFAddSizes>; 2590defm MUL : basic_sse12_fp_binop_p<0x59, "mul", fmul, SchedWriteFMulSizes>, 2591 basic_sse12_fp_binop_s<0x59, "mul", fmul, SchedWriteFMulSizes>, 2592 
basic_sse12_fp_binop_s_int<0x59, "mul", null_frag, SchedWriteFMulSizes>; 2593let isCommutable = 0 in { 2594 defm SUB : basic_sse12_fp_binop_p<0x5C, "sub", fsub, SchedWriteFAddSizes>, 2595 basic_sse12_fp_binop_s<0x5C, "sub", fsub, SchedWriteFAddSizes>, 2596 basic_sse12_fp_binop_s_int<0x5C, "sub", null_frag, SchedWriteFAddSizes>; 2597 defm DIV : basic_sse12_fp_binop_p<0x5E, "div", fdiv, SchedWriteFDivSizes>, 2598 basic_sse12_fp_binop_s<0x5E, "div", fdiv, SchedWriteFDivSizes>, 2599 basic_sse12_fp_binop_s_int<0x5E, "div", null_frag, SchedWriteFDivSizes>; 2600 defm MAX : basic_sse12_fp_binop_p<0x5F, "max", X86fmax, SchedWriteFCmpSizes>, 2601 basic_sse12_fp_binop_s<0x5F, "max", X86fmax, SchedWriteFCmpSizes>, 2602 basic_sse12_fp_binop_s_int<0x5F, "max", X86fmaxs, SchedWriteFCmpSizes>; 2603 defm MIN : basic_sse12_fp_binop_p<0x5D, "min", X86fmin, SchedWriteFCmpSizes>, 2604 basic_sse12_fp_binop_s<0x5D, "min", X86fmin, SchedWriteFCmpSizes>, 2605 basic_sse12_fp_binop_s_int<0x5D, "min", X86fmins, SchedWriteFCmpSizes>; 2606} 2607 2608let isCodeGenOnly = 1 in { 2609 defm MAXC: basic_sse12_fp_binop_p<0x5F, "max", X86fmaxc, SchedWriteFCmpSizes>, 2610 basic_sse12_fp_binop_s<0x5F, "max", X86fmaxc, SchedWriteFCmpSizes>; 2611 defm MINC: basic_sse12_fp_binop_p<0x5D, "min", X86fminc, SchedWriteFCmpSizes>, 2612 basic_sse12_fp_binop_s<0x5D, "min", X86fminc, SchedWriteFCmpSizes>; 2613} 2614 2615// Patterns used to select SSE scalar fp arithmetic instructions from 2616// either: 2617// 2618// (1) a scalar fp operation followed by a blend 2619// 2620// The effect is that the backend no longer emits unnecessary vector 2621// insert instructions immediately after SSE scalar fp instructions 2622// like addss or mulss. 2623// 2624// For example, given the following code: 2625// __m128 foo(__m128 A, __m128 B) { 2626// A[0] += B[0]; 2627// return A; 2628// } 2629// 2630// Previously we generated: 2631// addss %xmm0, %xmm1 2632// movss %xmm1, %xmm0 2633// 2634// We now generate: 2635// addss %xmm1, %xmm0 2636// 2637// (2) a vector packed single/double fp operation followed by a vector insert 2638// 2639// The effect is that the backend converts the packed fp instruction 2640// followed by a vector insert into a single SSE scalar fp instruction. 2641// 2642// For example, given the following code: 2643// __m128 foo(__m128 A, __m128 B) { 2644// __m128 C = A + B; 2645// return (__m128) {c[0], a[1], a[2], a[3]}; 2646// } 2647// 2648// Previously we generated: 2649// addps %xmm0, %xmm1 2650// movss %xmm1, %xmm0 2651// 2652// We now generate: 2653// addss %xmm1, %xmm0 2654 2655// TODO: Some canonicalization in lowering would simplify the number of 2656// patterns we have to try to match. 2657multiclass scalar_math_patterns<SDNode Op, string OpcPrefix, SDNode Move, 2658 ValueType VT, ValueType EltTy, 2659 RegisterClass RC, Predicate BasePredicate> { 2660 let Predicates = [BasePredicate] in { 2661 // extracted scalar math op with insert via movss/movsd 2662 def : Pat<(VT (Move (VT VR128:$dst), 2663 (VT (scalar_to_vector 2664 (Op (EltTy (extractelt (VT VR128:$dst), (iPTR 0))), 2665 RC:$src))))), 2666 (!cast<Instruction>(OpcPrefix#rr_Int) VT:$dst, 2667 (VT (COPY_TO_REGCLASS RC:$src, VR128)))>; 2668 } 2669 2670 // Repeat for AVX versions of the instructions. 
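  // For example (illustrative), with the ADDSS instantiation below this turns
  //   (X86Movss VR128:$dst,
  //             (scalar_to_vector (fadd (extractelt VR128:$dst, 0), FR32:$x)))
  // into VADDSSrr_Int, leaving the upper elements of $dst untouched.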
2671 let Predicates = [UseAVX] in { 2672 // extracted scalar math op with insert via movss/movsd 2673 def : Pat<(VT (Move (VT VR128:$dst), 2674 (VT (scalar_to_vector 2675 (Op (EltTy (extractelt (VT VR128:$dst), (iPTR 0))), 2676 RC:$src))))), 2677 (!cast<Instruction>("V"#OpcPrefix#rr_Int) VT:$dst, 2678 (VT (COPY_TO_REGCLASS RC:$src, VR128)))>; 2679 } 2680} 2681 2682defm : scalar_math_patterns<fadd, "ADDSS", X86Movss, v4f32, f32, FR32, UseSSE1>; 2683defm : scalar_math_patterns<fsub, "SUBSS", X86Movss, v4f32, f32, FR32, UseSSE1>; 2684defm : scalar_math_patterns<fmul, "MULSS", X86Movss, v4f32, f32, FR32, UseSSE1>; 2685defm : scalar_math_patterns<fdiv, "DIVSS", X86Movss, v4f32, f32, FR32, UseSSE1>; 2686 2687defm : scalar_math_patterns<fadd, "ADDSD", X86Movsd, v2f64, f64, FR64, UseSSE2>; 2688defm : scalar_math_patterns<fsub, "SUBSD", X86Movsd, v2f64, f64, FR64, UseSSE2>; 2689defm : scalar_math_patterns<fmul, "MULSD", X86Movsd, v2f64, f64, FR64, UseSSE2>; 2690defm : scalar_math_patterns<fdiv, "DIVSD", X86Movsd, v2f64, f64, FR64, UseSSE2>; 2691 2692/// Unop Arithmetic 2693/// In addition, we also have a special variant of the scalar form here to 2694/// represent the associated intrinsic operation. This form is unlike the 2695/// plain scalar form, in that it takes an entire vector (instead of a 2696/// scalar) and leaves the top elements undefined. 2697/// 2698/// And, we have a special variant form for a full-vector intrinsic form. 2699 2700/// sse_fp_unop_s - SSE1 unops in scalar form 2701/// For the non-AVX defs, we need $src1 to be tied to $dst because 2702/// the HW instructions are 2 operand / destructive. 2703multiclass sse_fp_unop_s<bits<8> opc, string OpcodeStr, RegisterClass RC, 2704 ValueType ScalarVT, X86MemOperand x86memop, 2705 Operand intmemop, SDNode OpNode, Domain d, 2706 X86FoldableSchedWrite sched, Predicate target> { 2707 let hasSideEffects = 0 in { 2708 def r : I<opc, MRMSrcReg, (outs RC:$dst), (ins RC:$src1), 2709 !strconcat(OpcodeStr, "\t{$src1, $dst|$dst, $src1}"), 2710 [(set RC:$dst, (OpNode RC:$src1))], d>, Sched<[sched]>, 2711 Requires<[target]>; 2712 let mayLoad = 1 in 2713 def m : I<opc, MRMSrcMem, (outs RC:$dst), (ins x86memop:$src1), 2714 !strconcat(OpcodeStr, "\t{$src1, $dst|$dst, $src1}"), 2715 [(set RC:$dst, (OpNode (load addr:$src1)))], d>, 2716 Sched<[sched.Folded, ReadAfterLd]>, 2717 Requires<[target, OptForSize]>; 2718 2719 let isCodeGenOnly = 1, Constraints = "$src1 = $dst", ExeDomain = d in { 2720 def r_Int : I<opc, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src1, VR128:$src2), 2721 !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"), []>, 2722 Sched<[sched]>; 2723 let mayLoad = 1 in 2724 def m_Int : I<opc, MRMSrcMem, (outs VR128:$dst), (ins VR128:$src1, intmemop:$src2), 2725 !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"), []>, 2726 Sched<[sched.Folded, ReadAfterLd]>; 2727 } 2728 } 2729 2730} 2731 2732multiclass sse_fp_unop_s_intr<RegisterClass RC, ValueType vt, 2733 ComplexPattern int_cpat, Intrinsic Intr, 2734 Predicate target, string Suffix> { 2735 let Predicates = [target] in { 2736 // These are unary operations, but they are modeled as having 2 source operands 2737 // because the high elements of the destination are unchanged in SSE. 2738 def : Pat<(Intr VR128:$src), 2739 (!cast<Instruction>(NAME#r_Int) VR128:$src, VR128:$src)>; 2740 } 2741 // We don't want to fold scalar loads into these instructions unless 2742 // optimizing for size. 
This is because the folded instruction will have a 2743 // partial register update, while the unfolded sequence will not, e.g. 2744 // movss mem, %xmm0 2745 // rcpss %xmm0, %xmm0 2746 // which has a clobber before the rcp, vs. 2747 // rcpss mem, %xmm0 2748 let Predicates = [target, OptForSize] in { 2749 def : Pat<(Intr int_cpat:$src2), 2750 (!cast<Instruction>(NAME#m_Int) 2751 (vt (IMPLICIT_DEF)), addr:$src2)>; 2752 } 2753} 2754 2755multiclass avx_fp_unop_s_intr<RegisterClass RC, ValueType vt, ComplexPattern int_cpat, 2756 Intrinsic Intr, Predicate target> { 2757 let Predicates = [target] in { 2758 def : Pat<(Intr VR128:$src), 2759 (!cast<Instruction>(NAME#r_Int) VR128:$src, 2760 VR128:$src)>; 2761 } 2762 let Predicates = [target, OptForSize] in { 2763 def : Pat<(Intr int_cpat:$src2), 2764 (!cast<Instruction>(NAME#m_Int) 2765 (vt (IMPLICIT_DEF)), addr:$src2)>; 2766 } 2767} 2768 2769multiclass avx_fp_unop_s<bits<8> opc, string OpcodeStr, RegisterClass RC, 2770 ValueType ScalarVT, X86MemOperand x86memop, 2771 Operand intmemop, SDNode OpNode, Domain d, 2772 X86FoldableSchedWrite sched, Predicate target> { 2773 let hasSideEffects = 0 in { 2774 def r : I<opc, MRMSrcReg, (outs RC:$dst), (ins RC:$src1, RC:$src2), 2775 !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), 2776 [], d>, Sched<[sched]>; 2777 let mayLoad = 1 in 2778 def m : I<opc, MRMSrcMem, (outs RC:$dst), (ins RC:$src1, x86memop:$src2), 2779 !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), 2780 [], d>, Sched<[sched.Folded, ReadAfterLd]>; 2781 let isCodeGenOnly = 1, ExeDomain = d in { 2782 def r_Int : I<opc, MRMSrcReg, (outs VR128:$dst), 2783 (ins VR128:$src1, VR128:$src2), 2784 !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), 2785 []>, Sched<[sched]>; 2786 let mayLoad = 1 in 2787 def m_Int : I<opc, MRMSrcMem, (outs VR128:$dst), 2788 (ins VR128:$src1, intmemop:$src2), 2789 !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), 2790 []>, Sched<[sched.Folded, ReadAfterLd]>; 2791 } 2792 } 2793 2794 // We don't want to fold scalar loads into these instructions unless 2795 // optimizing for size. This is because the folded instruction will have a 2796 // partial register update, while the unfolded sequence will not, e.g. 2797 // vmovss mem, %xmm0 2798 // vrcpss %xmm0, %xmm0, %xmm0 2799 // which has a clobber before the rcp, vs. 2800 // vrcpss mem, %xmm0, %xmm0 2801 // TODO: In theory, we could fold the load, and avoid the stall caused by 2802 // the partial register store, either in BreakFalseDeps or with smarter RA. 2803 let Predicates = [target] in { 2804 def : Pat<(OpNode RC:$src), (!cast<Instruction>(NAME#r) 2805 (ScalarVT (IMPLICIT_DEF)), RC:$src)>; 2806 } 2807 let Predicates = [target, OptForSize] in { 2808 def : Pat<(ScalarVT (OpNode (load addr:$src))), 2809 (!cast<Instruction>(NAME#m) (ScalarVT (IMPLICIT_DEF)), 2810 addr:$src)>; 2811 } 2812} 2813 2814/// sse1_fp_unop_p - SSE1 unops in packed form. 
2815multiclass sse1_fp_unop_p<bits<8> opc, string OpcodeStr, SDNode OpNode, 2816 X86SchedWriteWidths sched, list<Predicate> prds> { 2817let Predicates = prds in { 2818 def V#NAME#PSr : PSI<opc, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), 2819 !strconcat("v", OpcodeStr, 2820 "ps\t{$src, $dst|$dst, $src}"), 2821 [(set VR128:$dst, (v4f32 (OpNode VR128:$src)))]>, 2822 VEX, Sched<[sched.XMM]>, VEX_WIG; 2823 def V#NAME#PSm : PSI<opc, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src), 2824 !strconcat("v", OpcodeStr, 2825 "ps\t{$src, $dst|$dst, $src}"), 2826 [(set VR128:$dst, (OpNode (loadv4f32 addr:$src)))]>, 2827 VEX, Sched<[sched.XMM.Folded]>, VEX_WIG; 2828 def V#NAME#PSYr : PSI<opc, MRMSrcReg, (outs VR256:$dst), (ins VR256:$src), 2829 !strconcat("v", OpcodeStr, 2830 "ps\t{$src, $dst|$dst, $src}"), 2831 [(set VR256:$dst, (v8f32 (OpNode VR256:$src)))]>, 2832 VEX, VEX_L, Sched<[sched.YMM]>, VEX_WIG; 2833 def V#NAME#PSYm : PSI<opc, MRMSrcMem, (outs VR256:$dst), (ins f256mem:$src), 2834 !strconcat("v", OpcodeStr, 2835 "ps\t{$src, $dst|$dst, $src}"), 2836 [(set VR256:$dst, (OpNode (loadv8f32 addr:$src)))]>, 2837 VEX, VEX_L, Sched<[sched.YMM.Folded]>, VEX_WIG; 2838} 2839 2840 def PSr : PSI<opc, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), 2841 !strconcat(OpcodeStr, "ps\t{$src, $dst|$dst, $src}"), 2842 [(set VR128:$dst, (v4f32 (OpNode VR128:$src)))]>, 2843 Sched<[sched.XMM]>; 2844 def PSm : PSI<opc, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src), 2845 !strconcat(OpcodeStr, "ps\t{$src, $dst|$dst, $src}"), 2846 [(set VR128:$dst, (OpNode (memopv4f32 addr:$src)))]>, 2847 Sched<[sched.XMM.Folded]>; 2848} 2849 2850/// sse2_fp_unop_p - SSE2 unops in vector forms. 2851multiclass sse2_fp_unop_p<bits<8> opc, string OpcodeStr, 2852 SDNode OpNode, X86SchedWriteWidths sched> { 2853let Predicates = [HasAVX, NoVLX] in { 2854 def V#NAME#PDr : PDI<opc, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), 2855 !strconcat("v", OpcodeStr, 2856 "pd\t{$src, $dst|$dst, $src}"), 2857 [(set VR128:$dst, (v2f64 (OpNode VR128:$src)))]>, 2858 VEX, Sched<[sched.XMM]>, VEX_WIG; 2859 def V#NAME#PDm : PDI<opc, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src), 2860 !strconcat("v", OpcodeStr, 2861 "pd\t{$src, $dst|$dst, $src}"), 2862 [(set VR128:$dst, (OpNode (loadv2f64 addr:$src)))]>, 2863 VEX, Sched<[sched.XMM.Folded]>, VEX_WIG; 2864 def V#NAME#PDYr : PDI<opc, MRMSrcReg, (outs VR256:$dst), (ins VR256:$src), 2865 !strconcat("v", OpcodeStr, 2866 "pd\t{$src, $dst|$dst, $src}"), 2867 [(set VR256:$dst, (v4f64 (OpNode VR256:$src)))]>, 2868 VEX, VEX_L, Sched<[sched.YMM]>, VEX_WIG; 2869 def V#NAME#PDYm : PDI<opc, MRMSrcMem, (outs VR256:$dst), (ins f256mem:$src), 2870 !strconcat("v", OpcodeStr, 2871 "pd\t{$src, $dst|$dst, $src}"), 2872 [(set VR256:$dst, (OpNode (loadv4f64 addr:$src)))]>, 2873 VEX, VEX_L, Sched<[sched.YMM.Folded]>, VEX_WIG; 2874} 2875 2876 def PDr : PDI<opc, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), 2877 !strconcat(OpcodeStr, "pd\t{$src, $dst|$dst, $src}"), 2878 [(set VR128:$dst, (v2f64 (OpNode VR128:$src)))]>, 2879 Sched<[sched.XMM]>; 2880 def PDm : PDI<opc, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src), 2881 !strconcat(OpcodeStr, "pd\t{$src, $dst|$dst, $src}"), 2882 [(set VR128:$dst, (OpNode (memopv2f64 addr:$src)))]>, 2883 Sched<[sched.XMM.Folded]>; 2884} 2885 2886multiclass sse1_fp_unop_s_intr<bits<8> opc, string OpcodeStr, SDNode OpNode, 2887 X86SchedWriteWidths sched, Predicate AVXTarget> { 2888 defm SS : sse_fp_unop_s_intr<FR32, v4f32, sse_load_f32, 2889 !cast<Intrinsic>("int_x86_sse_"##OpcodeStr##_ss), 2890 
UseSSE1, "SS">, XS; 2891 defm V#NAME#SS : avx_fp_unop_s_intr<FR32, v4f32, sse_load_f32, 2892 !cast<Intrinsic>("int_x86_sse_"##OpcodeStr##_ss), 2893 AVXTarget>, 2894 XS, VEX_4V, VEX_LIG, VEX_WIG, NotMemoryFoldable; 2895} 2896 2897multiclass sse1_fp_unop_s<bits<8> opc, string OpcodeStr, SDNode OpNode, 2898 X86SchedWriteWidths sched, Predicate AVXTarget> { 2899 defm SS : sse_fp_unop_s<opc, OpcodeStr##ss, FR32, f32, f32mem, 2900 ssmem, OpNode, SSEPackedSingle, sched.Scl, UseSSE1>, XS; 2901 defm V#NAME#SS : avx_fp_unop_s<opc, "v"#OpcodeStr##ss, FR32, f32, 2902 f32mem, ssmem, OpNode, SSEPackedSingle, sched.Scl, AVXTarget>, 2903 XS, VEX_4V, VEX_LIG, VEX_WIG; 2904} 2905 2906multiclass sse2_fp_unop_s<bits<8> opc, string OpcodeStr, SDNode OpNode, 2907 X86SchedWriteWidths sched, Predicate AVXTarget> { 2908 defm SD : sse_fp_unop_s<opc, OpcodeStr##sd, FR64, f64, f64mem, 2909 sdmem, OpNode, SSEPackedDouble, sched.Scl, UseSSE2>, XD; 2910 defm V#NAME#SD : avx_fp_unop_s<opc, "v"#OpcodeStr##sd, FR64, f64, 2911 f64mem, sdmem, OpNode, SSEPackedDouble, sched.Scl, AVXTarget>, 2912 XD, VEX_4V, VEX_LIG, VEX_WIG; 2913} 2914 2915// Square root. 2916defm SQRT : sse1_fp_unop_s<0x51, "sqrt", fsqrt, SchedWriteFSqrt, UseAVX>, 2917 sse1_fp_unop_p<0x51, "sqrt", fsqrt, SchedWriteFSqrt, [HasAVX, NoVLX]>, 2918 sse2_fp_unop_s<0x51, "sqrt", fsqrt, SchedWriteFSqrt64, UseAVX>, 2919 sse2_fp_unop_p<0x51, "sqrt", fsqrt, SchedWriteFSqrt64>; 2920 2921// Reciprocal approximations. Note that these typically require refinement 2922// in order to obtain suitable precision. 2923defm RSQRT : sse1_fp_unop_s<0x52, "rsqrt", X86frsqrt, SchedWriteFRsqrt, HasAVX>, 2924 sse1_fp_unop_s_intr<0x52, "rsqrt", X86frsqrt, SchedWriteFRsqrt, HasAVX>, 2925 sse1_fp_unop_p<0x52, "rsqrt", X86frsqrt, SchedWriteFRsqrt, [HasAVX]>; 2926defm RCP : sse1_fp_unop_s<0x53, "rcp", X86frcp, SchedWriteFRcp, HasAVX>, 2927 sse1_fp_unop_s_intr<0x53, "rcp", X86frcp, SchedWriteFRcp, HasAVX>, 2928 sse1_fp_unop_p<0x53, "rcp", X86frcp, SchedWriteFRcp, [HasAVX]>; 2929 2930// There is no f64 version of the reciprocal approximation instructions. 2931 2932multiclass scalar_unary_math_patterns<SDNode OpNode, string OpcPrefix, SDNode Move, 2933 ValueType VT, Predicate BasePredicate> { 2934 let Predicates = [BasePredicate] in { 2935 def : Pat<(VT (Move VT:$dst, (scalar_to_vector 2936 (OpNode (extractelt VT:$src, 0))))), 2937 (!cast<Instruction>(OpcPrefix#r_Int) VT:$dst, VT:$src)>; 2938 } 2939 2940 // Repeat for AVX versions of the instructions. 2941 let Predicates = [UseAVX] in { 2942 def : Pat<(VT (Move VT:$dst, (scalar_to_vector 2943 (OpNode (extractelt VT:$src, 0))))), 2944 (!cast<Instruction>("V"#OpcPrefix#r_Int) VT:$dst, VT:$src)>; 2945 } 2946} 2947 2948multiclass scalar_unary_math_imm_patterns<SDNode OpNode, string OpcPrefix, SDNode Move, 2949 ValueType VT, bits<8> ImmV, 2950 Predicate BasePredicate> { 2951 let Predicates = [BasePredicate] in { 2952 def : Pat<(VT (Move VT:$dst, (scalar_to_vector 2953 (OpNode (extractelt VT:$src, 0))))), 2954 (!cast<Instruction>(OpcPrefix#r_Int) VT:$dst, VT:$src, (i32 ImmV))>; 2955 } 2956 2957 // Repeat for AVX versions of the instructions. 
2958 let Predicates = [UseAVX] in { 2959 def : Pat<(VT (Move VT:$dst, (scalar_to_vector 2960 (OpNode (extractelt VT:$src, 0))))), 2961 (!cast<Instruction>("V"#OpcPrefix#r_Int) VT:$dst, VT:$src, (i32 ImmV))>; 2962 } 2963} 2964 2965defm : scalar_unary_math_patterns<fsqrt, "SQRTSS", X86Movss, v4f32, UseSSE1>; 2966defm : scalar_unary_math_patterns<fsqrt, "SQRTSD", X86Movsd, v2f64, UseSSE2>; 2967 2968multiclass scalar_unary_math_intr_patterns<Intrinsic Intr, string OpcPrefix, 2969 SDNode Move, ValueType VT, 2970 Predicate BasePredicate> { 2971 let Predicates = [BasePredicate] in { 2972 def : Pat<(VT (Move VT:$dst, (Intr VT:$src))), 2973 (!cast<Instruction>(OpcPrefix#r_Int) VT:$dst, VT:$src)>; 2974 } 2975 2976 // Repeat for AVX versions of the instructions. 2977 let Predicates = [HasAVX] in { 2978 def : Pat<(VT (Move VT:$dst, (Intr VT:$src))), 2979 (!cast<Instruction>("V"#OpcPrefix#r_Int) VT:$dst, VT:$src)>; 2980 } 2981} 2982 2983defm : scalar_unary_math_intr_patterns<int_x86_sse_rcp_ss, "RCPSS", X86Movss, 2984 v4f32, UseSSE1>; 2985defm : scalar_unary_math_intr_patterns<int_x86_sse_rsqrt_ss, "RSQRTSS", X86Movss, 2986 v4f32, UseSSE1>; 2987 2988 2989//===----------------------------------------------------------------------===// 2990// SSE 1 & 2 - Non-temporal stores 2991//===----------------------------------------------------------------------===// 2992 2993let AddedComplexity = 400 in { // Prefer non-temporal versions 2994let Predicates = [HasAVX, NoVLX] in { 2995let SchedRW = [SchedWriteFMoveLSNT.XMM.MR] in { 2996def VMOVNTPSmr : VPSI<0x2B, MRMDestMem, (outs), 2997 (ins f128mem:$dst, VR128:$src), 2998 "movntps\t{$src, $dst|$dst, $src}", 2999 [(alignednontemporalstore (v4f32 VR128:$src), 3000 addr:$dst)]>, VEX, VEX_WIG; 3001def VMOVNTPDmr : VPDI<0x2B, MRMDestMem, (outs), 3002 (ins f128mem:$dst, VR128:$src), 3003 "movntpd\t{$src, $dst|$dst, $src}", 3004 [(alignednontemporalstore (v2f64 VR128:$src), 3005 addr:$dst)]>, VEX, VEX_WIG; 3006} // SchedRW 3007 3008let SchedRW = [SchedWriteFMoveLSNT.YMM.MR] in { 3009def VMOVNTPSYmr : VPSI<0x2B, MRMDestMem, (outs), 3010 (ins f256mem:$dst, VR256:$src), 3011 "movntps\t{$src, $dst|$dst, $src}", 3012 [(alignednontemporalstore (v8f32 VR256:$src), 3013 addr:$dst)]>, VEX, VEX_L, VEX_WIG; 3014def VMOVNTPDYmr : VPDI<0x2B, MRMDestMem, (outs), 3015 (ins f256mem:$dst, VR256:$src), 3016 "movntpd\t{$src, $dst|$dst, $src}", 3017 [(alignednontemporalstore (v4f64 VR256:$src), 3018 addr:$dst)]>, VEX, VEX_L, VEX_WIG; 3019} // SchedRW 3020 3021let ExeDomain = SSEPackedInt in { 3022def VMOVNTDQmr : VPDI<0xE7, MRMDestMem, (outs), 3023 (ins i128mem:$dst, VR128:$src), 3024 "movntdq\t{$src, $dst|$dst, $src}", 3025 [(alignednontemporalstore (v2i64 VR128:$src), 3026 addr:$dst)]>, VEX, VEX_WIG, 3027 Sched<[SchedWriteVecMoveLSNT.XMM.MR]>; 3028def VMOVNTDQYmr : VPDI<0xE7, MRMDestMem, (outs), 3029 (ins i256mem:$dst, VR256:$src), 3030 "movntdq\t{$src, $dst|$dst, $src}", 3031 [(alignednontemporalstore (v4i64 VR256:$src), 3032 addr:$dst)]>, VEX, VEX_L, VEX_WIG, 3033 Sched<[SchedWriteVecMoveLSNT.YMM.MR]>; 3034} // ExeDomain 3035} // Predicates 3036 3037let SchedRW = [SchedWriteFMoveLSNT.XMM.MR] in { 3038def MOVNTPSmr : PSI<0x2B, MRMDestMem, (outs), (ins f128mem:$dst, VR128:$src), 3039 "movntps\t{$src, $dst|$dst, $src}", 3040 [(alignednontemporalstore (v4f32 VR128:$src), addr:$dst)]>; 3041def MOVNTPDmr : PDI<0x2B, MRMDestMem, (outs), (ins f128mem:$dst, VR128:$src), 3042 "movntpd\t{$src, $dst|$dst, $src}", 3043 [(alignednontemporalstore(v2f64 VR128:$src), addr:$dst)]>; 3044} // SchedRW 3045 
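// Illustrative sketch, not part of this file: frontends normally reach the
// non-temporal store patterns in this section by attaching a nontemporal
// hint to an aligned vector store, e.g. via the streaming-store intrinsic
// from <xmmintrin.h>. The function name below is invented and the asm
// assumes the SysV ABI:
//
//   #include <xmmintrin.h>
//   void store_nt(float *p, __m128 v) {
//     _mm_stream_ps(p, v);   // p must be 16-byte aligned
//   }
//
// which is expected to select a single non-temporal store such as:
//
//   movntps %xmm0, (%rdi)
//
// (The AddedComplexity above just makes these non-temporal forms win over
// the ordinary packed-store patterns when the hint is present.)
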
3046let ExeDomain = SSEPackedInt, SchedRW = [SchedWriteVecMoveLSNT.XMM.MR] in 3047def MOVNTDQmr : PDI<0xE7, MRMDestMem, (outs), (ins f128mem:$dst, VR128:$src), 3048 "movntdq\t{$src, $dst|$dst, $src}", 3049 [(alignednontemporalstore (v2i64 VR128:$src), addr:$dst)]>; 3050 3051let SchedRW = [WriteStoreNT] in { 3052// There is no AVX form for instructions below this point 3053def MOVNTImr : I<0xC3, MRMDestMem, (outs), (ins i32mem:$dst, GR32:$src), 3054 "movnti{l}\t{$src, $dst|$dst, $src}", 3055 [(nontemporalstore (i32 GR32:$src), addr:$dst)]>, 3056 PS, Requires<[HasSSE2]>; 3057def MOVNTI_64mr : RI<0xC3, MRMDestMem, (outs), (ins i64mem:$dst, GR64:$src), 3058 "movnti{q}\t{$src, $dst|$dst, $src}", 3059 [(nontemporalstore (i64 GR64:$src), addr:$dst)]>, 3060 PS, Requires<[HasSSE2]>; 3061} // SchedRW = [WriteStoreNT] 3062 3063let Predicates = [HasAVX, NoVLX] in { 3064 def : Pat<(alignednontemporalstore (v8i32 VR256:$src), addr:$dst), 3065 (VMOVNTDQYmr addr:$dst, VR256:$src)>; 3066 def : Pat<(alignednontemporalstore (v16i16 VR256:$src), addr:$dst), 3067 (VMOVNTDQYmr addr:$dst, VR256:$src)>; 3068 def : Pat<(alignednontemporalstore (v32i8 VR256:$src), addr:$dst), 3069 (VMOVNTDQYmr addr:$dst, VR256:$src)>; 3070 3071 def : Pat<(alignednontemporalstore (v4i32 VR128:$src), addr:$dst), 3072 (VMOVNTDQmr addr:$dst, VR128:$src)>; 3073 def : Pat<(alignednontemporalstore (v8i16 VR128:$src), addr:$dst), 3074 (VMOVNTDQmr addr:$dst, VR128:$src)>; 3075 def : Pat<(alignednontemporalstore (v16i8 VR128:$src), addr:$dst), 3076 (VMOVNTDQmr addr:$dst, VR128:$src)>; 3077} 3078 3079let Predicates = [UseSSE2] in { 3080 def : Pat<(alignednontemporalstore (v4i32 VR128:$src), addr:$dst), 3081 (MOVNTDQmr addr:$dst, VR128:$src)>; 3082 def : Pat<(alignednontemporalstore (v8i16 VR128:$src), addr:$dst), 3083 (MOVNTDQmr addr:$dst, VR128:$src)>; 3084 def : Pat<(alignednontemporalstore (v16i8 VR128:$src), addr:$dst), 3085 (MOVNTDQmr addr:$dst, VR128:$src)>; 3086} 3087 3088} // AddedComplexity 3089 3090//===----------------------------------------------------------------------===// 3091// SSE 1 & 2 - Prefetch and memory fence 3092//===----------------------------------------------------------------------===// 3093 3094// Prefetch intrinsic. 3095let Predicates = [HasSSEPrefetch], SchedRW = [WriteLoad] in { 3096def PREFETCHT0 : I<0x18, MRM1m, (outs), (ins i8mem:$src), 3097 "prefetcht0\t$src", [(prefetch addr:$src, imm, (i32 3), (i32 1))]>, TB; 3098def PREFETCHT1 : I<0x18, MRM2m, (outs), (ins i8mem:$src), 3099 "prefetcht1\t$src", [(prefetch addr:$src, imm, (i32 2), (i32 1))]>, TB; 3100def PREFETCHT2 : I<0x18, MRM3m, (outs), (ins i8mem:$src), 3101 "prefetcht2\t$src", [(prefetch addr:$src, imm, (i32 1), (i32 1))]>, TB; 3102def PREFETCHNTA : I<0x18, MRM0m, (outs), (ins i8mem:$src), 3103 "prefetchnta\t$src", [(prefetch addr:$src, imm, (i32 0), (i32 1))]>, TB; 3104} 3105 3106// FIXME: How should flush instruction be modeled? 3107let SchedRW = [WriteLoad] in { 3108// Flush cache 3109def CLFLUSH : I<0xAE, MRM7m, (outs), (ins i8mem:$src), 3110 "clflush\t$src", [(int_x86_sse2_clflush addr:$src)]>, 3111 PS, Requires<[HasSSE2]>; 3112} 3113 3114let SchedRW = [WriteNop] in { 3115// Pause. This "instruction" is encoded as "rep; nop", so even though it 3116// was introduced with SSE2, it's backward compatible. 
def PAUSE : I<0x90, RawFrm, (outs), (ins),
              "pause", [(int_x86_sse2_pause)]>, OBXS;
}

let SchedRW = [WriteFence] in {
// Load, store, and memory fence
// TODO: As with mfence, we may want to ease the availability of sfence/lfence
// to include any 64-bit target.
def SFENCE : I<0xAE, MRM_F8, (outs), (ins), "sfence", [(int_x86_sse_sfence)]>,
             PS, Requires<[HasSSE1]>;
def LFENCE : I<0xAE, MRM_E8, (outs), (ins), "lfence", [(int_x86_sse2_lfence)]>,
             PS, Requires<[HasSSE2]>;
def MFENCE : I<0xAE, MRM_F0, (outs), (ins), "mfence", [(int_x86_sse2_mfence)]>,
             PS, Requires<[HasMFence]>;
} // SchedRW

def : Pat<(X86MFence), (MFENCE)>;

//===----------------------------------------------------------------------===//
// SSE 1 & 2 - Load/Store MXCSR register
//===----------------------------------------------------------------------===//

def VLDMXCSR : VPSI<0xAE, MRM2m, (outs), (ins i32mem:$src),
               "ldmxcsr\t$src", [(int_x86_sse_ldmxcsr addr:$src)]>,
               VEX, Sched<[WriteLDMXCSR]>, VEX_WIG;
def VSTMXCSR : VPSI<0xAE, MRM3m, (outs), (ins i32mem:$dst),
               "stmxcsr\t$dst", [(int_x86_sse_stmxcsr addr:$dst)]>,
               VEX, Sched<[WriteSTMXCSR]>, VEX_WIG;

def LDMXCSR : I<0xAE, MRM2m, (outs), (ins i32mem:$src),
                "ldmxcsr\t$src", [(int_x86_sse_ldmxcsr addr:$src)]>,
                TB, Sched<[WriteLDMXCSR]>;
def STMXCSR : I<0xAE, MRM3m, (outs), (ins i32mem:$dst),
                "stmxcsr\t$dst", [(int_x86_sse_stmxcsr addr:$dst)]>,
                TB, Sched<[WriteSTMXCSR]>;

//===---------------------------------------------------------------------===//
// SSE2 - Move Aligned/Unaligned Packed Integer Instructions
//===---------------------------------------------------------------------===//

let ExeDomain = SSEPackedInt in { // SSE integer instructions

let hasSideEffects = 0 in {
def VMOVDQArr : VPDI<0x6F, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
                     "movdqa\t{$src, $dst|$dst, $src}", []>,
                     Sched<[SchedWriteVecMoveLS.XMM.RR]>, VEX, VEX_WIG;
def VMOVDQUrr : VSSI<0x6F, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
                     "movdqu\t{$src, $dst|$dst, $src}", []>,
                     Sched<[SchedWriteVecMoveLS.XMM.RR]>, VEX, VEX_WIG;
def VMOVDQAYrr : VPDI<0x6F, MRMSrcReg, (outs VR256:$dst), (ins VR256:$src),
                      "movdqa\t{$src, $dst|$dst, $src}", []>,
                      Sched<[SchedWriteVecMoveLS.YMM.RR]>, VEX, VEX_L, VEX_WIG;
def VMOVDQUYrr : VSSI<0x6F, MRMSrcReg, (outs VR256:$dst), (ins VR256:$src),
                      "movdqu\t{$src, $dst|$dst, $src}", []>,
                      Sched<[SchedWriteVecMoveLS.YMM.RR]>, VEX, VEX_L, VEX_WIG;
}

// For Disassembler
let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0 in {
def VMOVDQArr_REV : VPDI<0x7F, MRMDestReg, (outs VR128:$dst), (ins VR128:$src),
                         "movdqa\t{$src, $dst|$dst, $src}", []>,
                         Sched<[SchedWriteVecMoveLS.XMM.RR]>,
                         VEX, VEX_WIG, FoldGenData<"VMOVDQArr">;
def VMOVDQAYrr_REV : VPDI<0x7F, MRMDestReg, (outs VR256:$dst), (ins VR256:$src),
                          "movdqa\t{$src, $dst|$dst, $src}", []>,
                          Sched<[SchedWriteVecMoveLS.YMM.RR]>,
                          VEX, VEX_L, VEX_WIG, FoldGenData<"VMOVDQAYrr">;
def VMOVDQUrr_REV : VSSI<0x7F, MRMDestReg, (outs VR128:$dst), (ins VR128:$src),
                         "movdqu\t{$src, $dst|$dst, $src}", []>,
                         Sched<[SchedWriteVecMoveLS.XMM.RR]>,
                         VEX, VEX_WIG, FoldGenData<"VMOVDQUrr">;
def VMOVDQUYrr_REV : VSSI<0x7F, MRMDestReg, (outs VR256:$dst), (ins VR256:$src),
                          "movdqu\t{$src, $dst|$dst, $src}", []>,
Sched<[SchedWriteVecMoveLS.YMM.RR]>, 3191 VEX, VEX_L, VEX_WIG, FoldGenData<"VMOVDQUYrr">; 3192} 3193 3194let canFoldAsLoad = 1, mayLoad = 1, isReMaterializable = 1, 3195 hasSideEffects = 0, Predicates = [HasAVX,NoVLX] in { 3196def VMOVDQArm : VPDI<0x6F, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src), 3197 "movdqa\t{$src, $dst|$dst, $src}", 3198 [(set VR128:$dst, (alignedloadv2i64 addr:$src))]>, 3199 Sched<[SchedWriteVecMoveLS.XMM.RM]>, VEX, VEX_WIG; 3200def VMOVDQAYrm : VPDI<0x6F, MRMSrcMem, (outs VR256:$dst), (ins i256mem:$src), 3201 "movdqa\t{$src, $dst|$dst, $src}", []>, 3202 Sched<[SchedWriteVecMoveLS.YMM.RM]>, 3203 VEX, VEX_L, VEX_WIG; 3204def VMOVDQUrm : I<0x6F, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src), 3205 "vmovdqu\t{$src, $dst|$dst, $src}", 3206 [(set VR128:$dst, (loadv2i64 addr:$src))]>, 3207 Sched<[SchedWriteVecMoveLS.XMM.RM]>, 3208 XS, VEX, VEX_WIG; 3209def VMOVDQUYrm : I<0x6F, MRMSrcMem, (outs VR256:$dst), (ins i256mem:$src), 3210 "vmovdqu\t{$src, $dst|$dst, $src}", []>, 3211 Sched<[SchedWriteVecMoveLS.YMM.RM]>, 3212 XS, VEX, VEX_L, VEX_WIG; 3213} 3214 3215let mayStore = 1, hasSideEffects = 0, Predicates = [HasAVX,NoVLX] in { 3216def VMOVDQAmr : VPDI<0x7F, MRMDestMem, (outs), 3217 (ins i128mem:$dst, VR128:$src), 3218 "movdqa\t{$src, $dst|$dst, $src}", 3219 [(alignedstore (v2i64 VR128:$src), addr:$dst)]>, 3220 Sched<[SchedWriteVecMoveLS.XMM.MR]>, VEX, VEX_WIG; 3221def VMOVDQAYmr : VPDI<0x7F, MRMDestMem, (outs), 3222 (ins i256mem:$dst, VR256:$src), 3223 "movdqa\t{$src, $dst|$dst, $src}", []>, 3224 Sched<[SchedWriteVecMoveLS.YMM.MR]>, VEX, VEX_L, VEX_WIG; 3225def VMOVDQUmr : I<0x7F, MRMDestMem, (outs), (ins i128mem:$dst, VR128:$src), 3226 "vmovdqu\t{$src, $dst|$dst, $src}", 3227 [(store (v2i64 VR128:$src), addr:$dst)]>, 3228 Sched<[SchedWriteVecMoveLS.XMM.MR]>, XS, VEX, VEX_WIG; 3229def VMOVDQUYmr : I<0x7F, MRMDestMem, (outs), (ins i256mem:$dst, VR256:$src), 3230 "vmovdqu\t{$src, $dst|$dst, $src}",[]>, 3231 Sched<[SchedWriteVecMoveLS.YMM.MR]>, XS, VEX, VEX_L, VEX_WIG; 3232} 3233 3234let SchedRW = [SchedWriteVecMoveLS.XMM.RR] in { 3235let hasSideEffects = 0 in { 3236def MOVDQArr : PDI<0x6F, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), 3237 "movdqa\t{$src, $dst|$dst, $src}", []>; 3238 3239def MOVDQUrr : I<0x6F, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), 3240 "movdqu\t{$src, $dst|$dst, $src}", []>, 3241 XS, Requires<[UseSSE2]>; 3242} 3243 3244// For Disassembler 3245let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0 in { 3246def MOVDQArr_REV : PDI<0x7F, MRMDestReg, (outs VR128:$dst), (ins VR128:$src), 3247 "movdqa\t{$src, $dst|$dst, $src}", []>, 3248 FoldGenData<"MOVDQArr">; 3249 3250def MOVDQUrr_REV : I<0x7F, MRMDestReg, (outs VR128:$dst), (ins VR128:$src), 3251 "movdqu\t{$src, $dst|$dst, $src}", []>, 3252 XS, Requires<[UseSSE2]>, FoldGenData<"MOVDQUrr">; 3253} 3254} // SchedRW 3255 3256let canFoldAsLoad = 1, mayLoad = 1, isReMaterializable = 1, 3257 hasSideEffects = 0, SchedRW = [SchedWriteVecMoveLS.XMM.RM] in { 3258def MOVDQArm : PDI<0x6F, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src), 3259 "movdqa\t{$src, $dst|$dst, $src}", 3260 [/*(set VR128:$dst, (alignedloadv2i64 addr:$src))*/]>; 3261def MOVDQUrm : I<0x6F, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src), 3262 "movdqu\t{$src, $dst|$dst, $src}", 3263 [/*(set VR128:$dst, (loadv2i64 addr:$src))*/]>, 3264 XS, Requires<[UseSSE2]>; 3265} 3266 3267let mayStore = 1, hasSideEffects = 0, 3268 SchedRW = [SchedWriteVecMoveLS.XMM.MR] in { 3269def MOVDQAmr : PDI<0x7F, MRMDestMem, (outs), (ins i128mem:$dst, 
VR128:$src), 3270 "movdqa\t{$src, $dst|$dst, $src}", 3271 [/*(alignedstore (v2i64 VR128:$src), addr:$dst)*/]>; 3272def MOVDQUmr : I<0x7F, MRMDestMem, (outs), (ins i128mem:$dst, VR128:$src), 3273 "movdqu\t{$src, $dst|$dst, $src}", 3274 [/*(store (v2i64 VR128:$src), addr:$dst)*/]>, 3275 XS, Requires<[UseSSE2]>; 3276} 3277 3278} // ExeDomain = SSEPackedInt 3279 3280// Aliases to help the assembler pick two byte VEX encodings by swapping the 3281// operands relative to the normal instructions to use VEX.R instead of VEX.B. 3282def : InstAlias<"vmovdqa\t{$src, $dst|$dst, $src}", 3283 (VMOVDQArr_REV VR128L:$dst, VR128H:$src), 0>; 3284def : InstAlias<"vmovdqa\t{$src, $dst|$dst, $src}", 3285 (VMOVDQAYrr_REV VR256L:$dst, VR256H:$src), 0>; 3286def : InstAlias<"vmovdqu\t{$src, $dst|$dst, $src}", 3287 (VMOVDQUrr_REV VR128L:$dst, VR128H:$src), 0>; 3288def : InstAlias<"vmovdqu\t{$src, $dst|$dst, $src}", 3289 (VMOVDQUYrr_REV VR256L:$dst, VR256H:$src), 0>; 3290 3291// Reversed version with ".s" suffix for GAS compatibility. 3292def : InstAlias<"vmovdqa.s\t{$src, $dst|$dst, $src}", 3293 (VMOVDQArr_REV VR128:$dst, VR128:$src), 0>; 3294def : InstAlias<"vmovdqa.s\t{$src, $dst|$dst, $src}", 3295 (VMOVDQAYrr_REV VR256:$dst, VR256:$src), 0>; 3296def : InstAlias<"vmovdqu.s\t{$src, $dst|$dst, $src}", 3297 (VMOVDQUrr_REV VR128:$dst, VR128:$src), 0>; 3298def : InstAlias<"vmovdqu.s\t{$src, $dst|$dst, $src}", 3299 (VMOVDQUYrr_REV VR256:$dst, VR256:$src), 0>; 3300 3301// Reversed version with ".s" suffix for GAS compatibility. 3302def : InstAlias<"movdqa.s\t{$src, $dst|$dst, $src}", 3303 (MOVDQArr_REV VR128:$dst, VR128:$src), 0>; 3304def : InstAlias<"movdqu.s\t{$src, $dst|$dst, $src}", 3305 (MOVDQUrr_REV VR128:$dst, VR128:$src), 0>; 3306 3307let Predicates = [HasAVX, NoVLX] in { 3308 // Additional patterns for other integer sizes. 
3309 def : Pat<(alignedstore (v4i32 VR128:$src), addr:$dst), 3310 (VMOVDQAmr addr:$dst, VR128:$src)>; 3311 def : Pat<(alignedstore (v8i16 VR128:$src), addr:$dst), 3312 (VMOVDQAmr addr:$dst, VR128:$src)>; 3313 def : Pat<(alignedstore (v16i8 VR128:$src), addr:$dst), 3314 (VMOVDQAmr addr:$dst, VR128:$src)>; 3315 def : Pat<(store (v4i32 VR128:$src), addr:$dst), 3316 (VMOVDQUmr addr:$dst, VR128:$src)>; 3317 def : Pat<(store (v8i16 VR128:$src), addr:$dst), 3318 (VMOVDQUmr addr:$dst, VR128:$src)>; 3319 def : Pat<(store (v16i8 VR128:$src), addr:$dst), 3320 (VMOVDQUmr addr:$dst, VR128:$src)>; 3321} 3322 3323//===---------------------------------------------------------------------===// 3324// SSE2 - Packed Integer Arithmetic Instructions 3325//===---------------------------------------------------------------------===// 3326 3327let ExeDomain = SSEPackedInt in { // SSE integer instructions 3328 3329/// PDI_binop_rm2 - Simple SSE2 binary operator with different src and dst types 3330multiclass PDI_binop_rm2<bits<8> opc, string OpcodeStr, SDNode OpNode, 3331 ValueType DstVT, ValueType SrcVT, RegisterClass RC, 3332 PatFrag memop_frag, X86MemOperand x86memop, 3333 X86FoldableSchedWrite sched, bit Is2Addr = 1> { 3334 let isCommutable = 1 in 3335 def rr : PDI<opc, MRMSrcReg, (outs RC:$dst), 3336 (ins RC:$src1, RC:$src2), 3337 !if(Is2Addr, 3338 !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"), 3339 !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")), 3340 [(set RC:$dst, (DstVT (OpNode (SrcVT RC:$src1), RC:$src2)))]>, 3341 Sched<[sched]>; 3342 def rm : PDI<opc, MRMSrcMem, (outs RC:$dst), 3343 (ins RC:$src1, x86memop:$src2), 3344 !if(Is2Addr, 3345 !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"), 3346 !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")), 3347 [(set RC:$dst, (DstVT (OpNode (SrcVT RC:$src1), 3348 (bitconvert (memop_frag addr:$src2)))))]>, 3349 Sched<[sched.Folded, ReadAfterLd]>; 3350} 3351} // ExeDomain = SSEPackedInt 3352 3353defm PADDB : PDI_binop_all<0xFC, "paddb", add, v16i8, v32i8, 3354 SchedWriteVecALU, 1, NoVLX_Or_NoBWI>; 3355defm PADDW : PDI_binop_all<0xFD, "paddw", add, v8i16, v16i16, 3356 SchedWriteVecALU, 1, NoVLX_Or_NoBWI>; 3357defm PADDD : PDI_binop_all<0xFE, "paddd", add, v4i32, v8i32, 3358 SchedWriteVecALU, 1, NoVLX>; 3359defm PADDQ : PDI_binop_all<0xD4, "paddq", add, v2i64, v4i64, 3360 SchedWriteVecALU, 1, NoVLX>; 3361defm PADDSB : PDI_binop_all<0xEC, "paddsb", X86adds, v16i8, v32i8, 3362 SchedWriteVecALU, 1, NoVLX_Or_NoBWI>; 3363defm PADDSW : PDI_binop_all<0xED, "paddsw", X86adds, v8i16, v16i16, 3364 SchedWriteVecALU, 1, NoVLX_Or_NoBWI>; 3365defm PADDUSB : PDI_binop_all<0xDC, "paddusb", X86addus, v16i8, v32i8, 3366 SchedWriteVecALU, 1, NoVLX_Or_NoBWI>; 3367defm PADDUSW : PDI_binop_all<0xDD, "paddusw", X86addus, v8i16, v16i16, 3368 SchedWriteVecALU, 1, NoVLX_Or_NoBWI>; 3369defm PMULLW : PDI_binop_all<0xD5, "pmullw", mul, v8i16, v16i16, 3370 SchedWriteVecIMul, 1, NoVLX_Or_NoBWI>; 3371defm PMULHUW : PDI_binop_all<0xE4, "pmulhuw", mulhu, v8i16, v16i16, 3372 SchedWriteVecIMul, 1, NoVLX_Or_NoBWI>; 3373defm PMULHW : PDI_binop_all<0xE5, "pmulhw", mulhs, v8i16, v16i16, 3374 SchedWriteVecIMul, 1, NoVLX_Or_NoBWI>; 3375defm PSUBB : PDI_binop_all<0xF8, "psubb", sub, v16i8, v32i8, 3376 SchedWriteVecALU, 0, NoVLX_Or_NoBWI>; 3377defm PSUBW : PDI_binop_all<0xF9, "psubw", sub, v8i16, v16i16, 3378 SchedWriteVecALU, 0, NoVLX_Or_NoBWI>; 3379defm PSUBD : PDI_binop_all<0xFA, "psubd", sub, v4i32, v8i32, 3380 SchedWriteVecALU, 0, NoVLX>; 3381defm PSUBQ : 
PDI_binop_all<0xFB, "psubq", sub, v2i64, v4i64, 3382 SchedWriteVecALU, 0, NoVLX>; 3383defm PSUBSB : PDI_binop_all<0xE8, "psubsb", X86subs, v16i8, v32i8, 3384 SchedWriteVecALU, 0, NoVLX_Or_NoBWI>; 3385defm PSUBSW : PDI_binop_all<0xE9, "psubsw", X86subs, v8i16, v16i16, 3386 SchedWriteVecALU, 0, NoVLX_Or_NoBWI>; 3387defm PSUBUSB : PDI_binop_all<0xD8, "psubusb", X86subus, v16i8, v32i8, 3388 SchedWriteVecALU, 0, NoVLX_Or_NoBWI>; 3389defm PSUBUSW : PDI_binop_all<0xD9, "psubusw", X86subus, v8i16, v16i16, 3390 SchedWriteVecALU, 0, NoVLX_Or_NoBWI>; 3391defm PMINUB : PDI_binop_all<0xDA, "pminub", umin, v16i8, v32i8, 3392 SchedWriteVecALU, 1, NoVLX_Or_NoBWI>; 3393defm PMINSW : PDI_binop_all<0xEA, "pminsw", smin, v8i16, v16i16, 3394 SchedWriteVecALU, 1, NoVLX_Or_NoBWI>; 3395defm PMAXUB : PDI_binop_all<0xDE, "pmaxub", umax, v16i8, v32i8, 3396 SchedWriteVecALU, 1, NoVLX_Or_NoBWI>; 3397defm PMAXSW : PDI_binop_all<0xEE, "pmaxsw", smax, v8i16, v16i16, 3398 SchedWriteVecALU, 1, NoVLX_Or_NoBWI>; 3399defm PAVGB : PDI_binop_all<0xE0, "pavgb", X86avg, v16i8, v32i8, 3400 SchedWriteVecALU, 1, NoVLX_Or_NoBWI>; 3401defm PAVGW : PDI_binop_all<0xE3, "pavgw", X86avg, v8i16, v16i16, 3402 SchedWriteVecALU, 1, NoVLX_Or_NoBWI>; 3403defm PMULUDQ : PDI_binop_all<0xF4, "pmuludq", X86pmuludq, v2i64, v4i64, 3404 SchedWriteVecIMul, 1, NoVLX>; 3405 3406let Predicates = [HasAVX, NoVLX_Or_NoBWI] in 3407defm VPMADDWD : PDI_binop_rm2<0xF5, "vpmaddwd", X86vpmaddwd, v4i32, v8i16, VR128, 3408 loadv2i64, i128mem, SchedWriteVecIMul.XMM, 0>, 3409 VEX_4V, VEX_WIG; 3410 3411let Predicates = [HasAVX2, NoVLX_Or_NoBWI] in 3412defm VPMADDWDY : PDI_binop_rm2<0xF5, "vpmaddwd", X86vpmaddwd, v8i32, v16i16, 3413 VR256, loadv4i64, i256mem, SchedWriteVecIMul.YMM, 3414 0>, VEX_4V, VEX_L, VEX_WIG; 3415let Constraints = "$src1 = $dst" in 3416defm PMADDWD : PDI_binop_rm2<0xF5, "pmaddwd", X86vpmaddwd, v4i32, v8i16, VR128, 3417 memopv2i64, i128mem, SchedWriteVecIMul.XMM>; 3418 3419let Predicates = [HasAVX, NoVLX_Or_NoBWI] in 3420defm VPSADBW : PDI_binop_rm2<0xF6, "vpsadbw", X86psadbw, v2i64, v16i8, VR128, 3421 loadv2i64, i128mem, SchedWritePSADBW.XMM, 0>, 3422 VEX_4V, VEX_WIG; 3423let Predicates = [HasAVX2, NoVLX_Or_NoBWI] in 3424defm VPSADBWY : PDI_binop_rm2<0xF6, "vpsadbw", X86psadbw, v4i64, v32i8, VR256, 3425 loadv4i64, i256mem, SchedWritePSADBW.YMM, 0>, 3426 VEX_4V, VEX_L, VEX_WIG; 3427let Constraints = "$src1 = $dst" in 3428defm PSADBW : PDI_binop_rm2<0xF6, "psadbw", X86psadbw, v2i64, v16i8, VR128, 3429 memopv2i64, i128mem, SchedWritePSADBW.XMM>; 3430 3431//===---------------------------------------------------------------------===// 3432// SSE2 - Packed Integer Logical Instructions 3433//===---------------------------------------------------------------------===// 3434 3435multiclass PDI_binop_rmi<bits<8> opc, bits<8> opc2, Format ImmForm, 3436 string OpcodeStr, SDNode OpNode, 3437 SDNode OpNode2, RegisterClass RC, 3438 X86FoldableSchedWrite sched, 3439 X86FoldableSchedWrite schedImm, 3440 ValueType DstVT, ValueType SrcVT, 3441 PatFrag ld_frag, bit Is2Addr = 1> { 3442 // src2 is always 128-bit 3443 def rr : PDI<opc, MRMSrcReg, (outs RC:$dst), 3444 (ins RC:$src1, VR128:$src2), 3445 !if(Is2Addr, 3446 !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"), 3447 !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")), 3448 [(set RC:$dst, (DstVT (OpNode RC:$src1, (SrcVT VR128:$src2))))]>, 3449 Sched<[sched]>; 3450 def rm : PDI<opc, MRMSrcMem, (outs RC:$dst), 3451 (ins RC:$src1, i128mem:$src2), 3452 !if(Is2Addr, 3453 !strconcat(OpcodeStr, 
"\t{$src2, $dst|$dst, $src2}"), 3454 !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")), 3455 [(set RC:$dst, (DstVT (OpNode RC:$src1, 3456 (SrcVT (bitconvert (ld_frag addr:$src2))))))]>, 3457 Sched<[sched.Folded, ReadAfterLd]>; 3458 def ri : PDIi8<opc2, ImmForm, (outs RC:$dst), 3459 (ins RC:$src1, u8imm:$src2), 3460 !if(Is2Addr, 3461 !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"), 3462 !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")), 3463 [(set RC:$dst, (DstVT (OpNode2 RC:$src1, (i8 imm:$src2))))]>, 3464 Sched<[schedImm]>; 3465} 3466 3467multiclass PDI_binop_rmi_all<bits<8> opc, bits<8> opc2, Format ImmForm, 3468 string OpcodeStr, SDNode OpNode, 3469 SDNode OpNode2, ValueType DstVT128, 3470 ValueType DstVT256, ValueType SrcVT, 3471 X86SchedWriteWidths sched, 3472 X86SchedWriteWidths schedImm, Predicate prd> { 3473let Predicates = [HasAVX, prd] in 3474 defm V#NAME : PDI_binop_rmi<opc, opc2, ImmForm, !strconcat("v", OpcodeStr), 3475 OpNode, OpNode2, VR128, sched.XMM, schedImm.XMM, 3476 DstVT128, SrcVT, loadv2i64, 0>, VEX_4V, VEX_WIG; 3477let Predicates = [HasAVX2, prd] in 3478 defm V#NAME#Y : PDI_binop_rmi<opc, opc2, ImmForm, !strconcat("v", OpcodeStr), 3479 OpNode, OpNode2, VR256, sched.YMM, schedImm.YMM, 3480 DstVT256, SrcVT, loadv2i64, 0>, VEX_4V, VEX_L, 3481 VEX_WIG; 3482let Constraints = "$src1 = $dst" in 3483 defm NAME : PDI_binop_rmi<opc, opc2, ImmForm, OpcodeStr, OpNode, OpNode2, 3484 VR128, sched.XMM, schedImm.XMM, DstVT128, SrcVT, 3485 memopv2i64>; 3486} 3487 3488multiclass PDI_binop_ri<bits<8> opc, Format ImmForm, string OpcodeStr, 3489 SDNode OpNode, RegisterClass RC, ValueType VT, 3490 X86FoldableSchedWrite sched, bit Is2Addr = 1> { 3491 def ri : PDIi8<opc, ImmForm, (outs RC:$dst), (ins RC:$src1, u8imm:$src2), 3492 !if(Is2Addr, 3493 !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"), 3494 !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")), 3495 [(set RC:$dst, (VT (OpNode RC:$src1, (i8 imm:$src2))))]>, 3496 Sched<[sched]>; 3497} 3498 3499multiclass PDI_binop_ri_all<bits<8> opc, Format ImmForm, string OpcodeStr, 3500 SDNode OpNode, X86SchedWriteWidths sched> { 3501let Predicates = [HasAVX, NoVLX_Or_NoBWI] in 3502 defm V#NAME : PDI_binop_ri<opc, ImmForm, !strconcat("v", OpcodeStr), OpNode, 3503 VR128, v16i8, sched.XMM, 0>, VEX_4V, VEX_WIG; 3504let Predicates = [HasAVX2, NoVLX_Or_NoBWI] in 3505 defm V#NAME#Y : PDI_binop_ri<opc, ImmForm, !strconcat("v", OpcodeStr), OpNode, 3506 VR256, v32i8, sched.YMM, 0>, 3507 VEX_4V, VEX_L, VEX_WIG; 3508let Constraints = "$src1 = $dst" in 3509 defm NAME : PDI_binop_ri<opc, ImmForm, OpcodeStr, OpNode, VR128, v16i8, 3510 sched.XMM>; 3511} 3512 3513let ExeDomain = SSEPackedInt in { 3514 defm PSLLW : PDI_binop_rmi_all<0xF1, 0x71, MRM6r, "psllw", X86vshl, X86vshli, 3515 v8i16, v16i16, v8i16, SchedWriteVecShift, 3516 SchedWriteVecShiftImm, NoVLX_Or_NoBWI>; 3517 defm PSLLD : PDI_binop_rmi_all<0xF2, 0x72, MRM6r, "pslld", X86vshl, X86vshli, 3518 v4i32, v8i32, v4i32, SchedWriteVecShift, 3519 SchedWriteVecShiftImm, NoVLX>; 3520 defm PSLLQ : PDI_binop_rmi_all<0xF3, 0x73, MRM6r, "psllq", X86vshl, X86vshli, 3521 v2i64, v4i64, v2i64, SchedWriteVecShift, 3522 SchedWriteVecShiftImm, NoVLX>; 3523 3524 defm PSRLW : PDI_binop_rmi_all<0xD1, 0x71, MRM2r, "psrlw", X86vsrl, X86vsrli, 3525 v8i16, v16i16, v8i16, SchedWriteVecShift, 3526 SchedWriteVecShiftImm, NoVLX_Or_NoBWI>; 3527 defm PSRLD : PDI_binop_rmi_all<0xD2, 0x72, MRM2r, "psrld", X86vsrl, X86vsrli, 3528 v4i32, v8i32, v4i32, SchedWriteVecShift, 3529 
SchedWriteVecShiftImm, NoVLX>; 3530 defm PSRLQ : PDI_binop_rmi_all<0xD3, 0x73, MRM2r, "psrlq", X86vsrl, X86vsrli, 3531 v2i64, v4i64, v2i64, SchedWriteVecShift, 3532 SchedWriteVecShiftImm, NoVLX>; 3533 3534 defm PSRAW : PDI_binop_rmi_all<0xE1, 0x71, MRM4r, "psraw", X86vsra, X86vsrai, 3535 v8i16, v16i16, v8i16, SchedWriteVecShift, 3536 SchedWriteVecShiftImm, NoVLX_Or_NoBWI>; 3537 defm PSRAD : PDI_binop_rmi_all<0xE2, 0x72, MRM4r, "psrad", X86vsra, X86vsrai, 3538 v4i32, v8i32, v4i32, SchedWriteVecShift, 3539 SchedWriteVecShiftImm, NoVLX>; 3540 3541 defm PSLLDQ : PDI_binop_ri_all<0x73, MRM7r, "pslldq", X86vshldq, 3542 SchedWriteShuffle>; 3543 defm PSRLDQ : PDI_binop_ri_all<0x73, MRM3r, "psrldq", X86vshrdq, 3544 SchedWriteShuffle>; 3545} // ExeDomain = SSEPackedInt 3546 3547//===---------------------------------------------------------------------===// 3548// SSE2 - Packed Integer Comparison Instructions 3549//===---------------------------------------------------------------------===// 3550 3551defm PCMPEQB : PDI_binop_all<0x74, "pcmpeqb", X86pcmpeq, v16i8, v32i8, 3552 SchedWriteVecALU, 1, TruePredicate>; 3553defm PCMPEQW : PDI_binop_all<0x75, "pcmpeqw", X86pcmpeq, v8i16, v16i16, 3554 SchedWriteVecALU, 1, TruePredicate>; 3555defm PCMPEQD : PDI_binop_all<0x76, "pcmpeqd", X86pcmpeq, v4i32, v8i32, 3556 SchedWriteVecALU, 1, TruePredicate>; 3557defm PCMPGTB : PDI_binop_all<0x64, "pcmpgtb", X86pcmpgt, v16i8, v32i8, 3558 SchedWriteVecALU, 0, TruePredicate>; 3559defm PCMPGTW : PDI_binop_all<0x65, "pcmpgtw", X86pcmpgt, v8i16, v16i16, 3560 SchedWriteVecALU, 0, TruePredicate>; 3561defm PCMPGTD : PDI_binop_all<0x66, "pcmpgtd", X86pcmpgt, v4i32, v8i32, 3562 SchedWriteVecALU, 0, TruePredicate>; 3563 3564//===---------------------------------------------------------------------===// 3565// SSE2 - Packed Integer Shuffle Instructions 3566//===---------------------------------------------------------------------===// 3567 3568let ExeDomain = SSEPackedInt in { 3569multiclass sse2_pshuffle<string OpcodeStr, ValueType vt128, ValueType vt256, 3570 SDNode OpNode, X86SchedWriteWidths sched, 3571 Predicate prd> { 3572let Predicates = [HasAVX, prd] in { 3573 def V#NAME#ri : Ii8<0x70, MRMSrcReg, (outs VR128:$dst), 3574 (ins VR128:$src1, u8imm:$src2), 3575 !strconcat("v", OpcodeStr, 3576 "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), 3577 [(set VR128:$dst, 3578 (vt128 (OpNode VR128:$src1, (i8 imm:$src2))))]>, 3579 VEX, Sched<[sched.XMM]>, VEX_WIG; 3580 def V#NAME#mi : Ii8<0x70, MRMSrcMem, (outs VR128:$dst), 3581 (ins i128mem:$src1, u8imm:$src2), 3582 !strconcat("v", OpcodeStr, 3583 "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), 3584 [(set VR128:$dst, 3585 (vt128 (OpNode (bitconvert (loadv2i64 addr:$src1)), 3586 (i8 imm:$src2))))]>, VEX, 3587 Sched<[sched.XMM.Folded]>, VEX_WIG; 3588} 3589 3590let Predicates = [HasAVX2, prd] in { 3591 def V#NAME#Yri : Ii8<0x70, MRMSrcReg, (outs VR256:$dst), 3592 (ins VR256:$src1, u8imm:$src2), 3593 !strconcat("v", OpcodeStr, 3594 "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), 3595 [(set VR256:$dst, 3596 (vt256 (OpNode VR256:$src1, (i8 imm:$src2))))]>, 3597 VEX, VEX_L, Sched<[sched.YMM]>, VEX_WIG; 3598 def V#NAME#Ymi : Ii8<0x70, MRMSrcMem, (outs VR256:$dst), 3599 (ins i256mem:$src1, u8imm:$src2), 3600 !strconcat("v", OpcodeStr, 3601 "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), 3602 [(set VR256:$dst, 3603 (vt256 (OpNode (bitconvert (loadv4i64 addr:$src1)), 3604 (i8 imm:$src2))))]>, VEX, VEX_L, 3605 Sched<[sched.YMM.Folded]>, VEX_WIG; 3606} 3607 3608let Predicates = [UseSSE2] in { 3609 def ri : 
Ii8<0x70, MRMSrcReg, 3610 (outs VR128:$dst), (ins VR128:$src1, u8imm:$src2), 3611 !strconcat(OpcodeStr, 3612 "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), 3613 [(set VR128:$dst, 3614 (vt128 (OpNode VR128:$src1, (i8 imm:$src2))))]>, 3615 Sched<[sched.XMM]>; 3616 def mi : Ii8<0x70, MRMSrcMem, 3617 (outs VR128:$dst), (ins i128mem:$src1, u8imm:$src2), 3618 !strconcat(OpcodeStr, 3619 "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), 3620 [(set VR128:$dst, 3621 (vt128 (OpNode (bitconvert (memopv2i64 addr:$src1)), 3622 (i8 imm:$src2))))]>, 3623 Sched<[sched.XMM.Folded]>; 3624} 3625} 3626} // ExeDomain = SSEPackedInt 3627 3628defm PSHUFD : sse2_pshuffle<"pshufd", v4i32, v8i32, X86PShufd, 3629 SchedWriteShuffle, NoVLX>, PD; 3630defm PSHUFHW : sse2_pshuffle<"pshufhw", v8i16, v16i16, X86PShufhw, 3631 SchedWriteShuffle, NoVLX_Or_NoBWI>, XS; 3632defm PSHUFLW : sse2_pshuffle<"pshuflw", v8i16, v16i16, X86PShuflw, 3633 SchedWriteShuffle, NoVLX_Or_NoBWI>, XD; 3634 3635//===---------------------------------------------------------------------===// 3636// Packed Integer Pack Instructions (SSE & AVX) 3637//===---------------------------------------------------------------------===// 3638 3639let ExeDomain = SSEPackedInt in { 3640multiclass sse2_pack<bits<8> opc, string OpcodeStr, ValueType OutVT, 3641 ValueType ArgVT, SDNode OpNode, RegisterClass RC, 3642 X86MemOperand x86memop, X86FoldableSchedWrite sched, 3643 PatFrag ld_frag, bit Is2Addr = 1> { 3644 def rr : PDI<opc, MRMSrcReg, 3645 (outs RC:$dst), (ins RC:$src1, RC:$src2), 3646 !if(Is2Addr, 3647 !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"), 3648 !strconcat(OpcodeStr, 3649 "\t{$src2, $src1, $dst|$dst, $src1, $src2}")), 3650 [(set RC:$dst, 3651 (OutVT (OpNode (ArgVT RC:$src1), RC:$src2)))]>, 3652 Sched<[sched]>; 3653 def rm : PDI<opc, MRMSrcMem, 3654 (outs RC:$dst), (ins RC:$src1, x86memop:$src2), 3655 !if(Is2Addr, 3656 !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"), 3657 !strconcat(OpcodeStr, 3658 "\t{$src2, $src1, $dst|$dst, $src1, $src2}")), 3659 [(set RC:$dst, 3660 (OutVT (OpNode (ArgVT RC:$src1), 3661 (bitconvert (ld_frag addr:$src2)))))]>, 3662 Sched<[sched.Folded, ReadAfterLd]>; 3663} 3664 3665multiclass sse4_pack<bits<8> opc, string OpcodeStr, ValueType OutVT, 3666 ValueType ArgVT, SDNode OpNode, RegisterClass RC, 3667 X86MemOperand x86memop, X86FoldableSchedWrite sched, 3668 PatFrag ld_frag, bit Is2Addr = 1> { 3669 def rr : SS48I<opc, MRMSrcReg, 3670 (outs RC:$dst), (ins RC:$src1, RC:$src2), 3671 !if(Is2Addr, 3672 !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"), 3673 !strconcat(OpcodeStr, 3674 "\t{$src2, $src1, $dst|$dst, $src1, $src2}")), 3675 [(set RC:$dst, 3676 (OutVT (OpNode (ArgVT RC:$src1), RC:$src2)))]>, 3677 Sched<[sched]>; 3678 def rm : SS48I<opc, MRMSrcMem, 3679 (outs RC:$dst), (ins RC:$src1, x86memop:$src2), 3680 !if(Is2Addr, 3681 !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"), 3682 !strconcat(OpcodeStr, 3683 "\t{$src2, $src1, $dst|$dst, $src1, $src2}")), 3684 [(set RC:$dst, 3685 (OutVT (OpNode (ArgVT RC:$src1), 3686 (bitconvert (ld_frag addr:$src2)))))]>, 3687 Sched<[sched.Folded, ReadAfterLd]>; 3688} 3689 3690let Predicates = [HasAVX, NoVLX_Or_NoBWI] in { 3691 defm VPACKSSWB : sse2_pack<0x63, "vpacksswb", v16i8, v8i16, X86Packss, VR128, 3692 i128mem, SchedWriteShuffle.XMM, loadv2i64, 0>, 3693 VEX_4V, VEX_WIG; 3694 defm VPACKSSDW : sse2_pack<0x6B, "vpackssdw", v8i16, v4i32, X86Packss, VR128, 3695 i128mem, SchedWriteShuffle.XMM, loadv2i64, 0>, 3696 VEX_4V, VEX_WIG; 3697 3698 defm VPACKUSWB : sse2_pack<0x67, 
"vpackuswb", v16i8, v8i16, X86Packus, VR128, 3699 i128mem, SchedWriteShuffle.XMM, loadv2i64, 0>, 3700 VEX_4V, VEX_WIG; 3701 defm VPACKUSDW : sse4_pack<0x2B, "vpackusdw", v8i16, v4i32, X86Packus, VR128, 3702 i128mem, SchedWriteShuffle.XMM, loadv2i64, 0>, 3703 VEX_4V; 3704} 3705 3706let Predicates = [HasAVX2, NoVLX_Or_NoBWI] in { 3707 defm VPACKSSWBY : sse2_pack<0x63, "vpacksswb", v32i8, v16i16, X86Packss, VR256, 3708 i256mem, SchedWriteShuffle.YMM, loadv4i64, 0>, 3709 VEX_4V, VEX_L, VEX_WIG; 3710 defm VPACKSSDWY : sse2_pack<0x6B, "vpackssdw", v16i16, v8i32, X86Packss, VR256, 3711 i256mem, SchedWriteShuffle.YMM, loadv4i64, 0>, 3712 VEX_4V, VEX_L, VEX_WIG; 3713 3714 defm VPACKUSWBY : sse2_pack<0x67, "vpackuswb", v32i8, v16i16, X86Packus, VR256, 3715 i256mem, SchedWriteShuffle.YMM, loadv4i64, 0>, 3716 VEX_4V, VEX_L, VEX_WIG; 3717 defm VPACKUSDWY : sse4_pack<0x2B, "vpackusdw", v16i16, v8i32, X86Packus, VR256, 3718 i256mem, SchedWriteShuffle.YMM, loadv4i64, 0>, 3719 VEX_4V, VEX_L; 3720} 3721 3722let Constraints = "$src1 = $dst" in { 3723 defm PACKSSWB : sse2_pack<0x63, "packsswb", v16i8, v8i16, X86Packss, VR128, 3724 i128mem, SchedWriteShuffle.XMM, memopv2i64>; 3725 defm PACKSSDW : sse2_pack<0x6B, "packssdw", v8i16, v4i32, X86Packss, VR128, 3726 i128mem, SchedWriteShuffle.XMM, memopv2i64>; 3727 3728 defm PACKUSWB : sse2_pack<0x67, "packuswb", v16i8, v8i16, X86Packus, VR128, 3729 i128mem, SchedWriteShuffle.XMM, memopv2i64>; 3730 3731 defm PACKUSDW : sse4_pack<0x2B, "packusdw", v8i16, v4i32, X86Packus, VR128, 3732 i128mem, SchedWriteShuffle.XMM, memopv2i64>; 3733} 3734} // ExeDomain = SSEPackedInt 3735 3736//===---------------------------------------------------------------------===// 3737// SSE2 - Packed Integer Unpack Instructions 3738//===---------------------------------------------------------------------===// 3739 3740let ExeDomain = SSEPackedInt in { 3741multiclass sse2_unpack<bits<8> opc, string OpcodeStr, ValueType vt, 3742 SDNode OpNode, RegisterClass RC, X86MemOperand x86memop, 3743 X86FoldableSchedWrite sched, PatFrag ld_frag, 3744 bit Is2Addr = 1> { 3745 def rr : PDI<opc, MRMSrcReg, 3746 (outs RC:$dst), (ins RC:$src1, RC:$src2), 3747 !if(Is2Addr, 3748 !strconcat(OpcodeStr,"\t{$src2, $dst|$dst, $src2}"), 3749 !strconcat(OpcodeStr,"\t{$src2, $src1, $dst|$dst, $src1, $src2}")), 3750 [(set RC:$dst, (vt (OpNode RC:$src1, RC:$src2)))]>, 3751 Sched<[sched]>; 3752 def rm : PDI<opc, MRMSrcMem, 3753 (outs RC:$dst), (ins RC:$src1, x86memop:$src2), 3754 !if(Is2Addr, 3755 !strconcat(OpcodeStr,"\t{$src2, $dst|$dst, $src2}"), 3756 !strconcat(OpcodeStr,"\t{$src2, $src1, $dst|$dst, $src1, $src2}")), 3757 [(set RC:$dst, (vt (OpNode RC:$src1, 3758 (bitconvert (ld_frag addr:$src2)))))]>, 3759 Sched<[sched.Folded, ReadAfterLd]>; 3760} 3761 3762let Predicates = [HasAVX, NoVLX_Or_NoBWI] in { 3763 defm VPUNPCKLBW : sse2_unpack<0x60, "vpunpcklbw", v16i8, X86Unpckl, VR128, 3764 i128mem, SchedWriteShuffle.XMM, loadv2i64, 0>, 3765 VEX_4V, VEX_WIG; 3766 defm VPUNPCKLWD : sse2_unpack<0x61, "vpunpcklwd", v8i16, X86Unpckl, VR128, 3767 i128mem, SchedWriteShuffle.XMM, loadv2i64, 0>, 3768 VEX_4V, VEX_WIG; 3769 defm VPUNPCKHBW : sse2_unpack<0x68, "vpunpckhbw", v16i8, X86Unpckh, VR128, 3770 i128mem, SchedWriteShuffle.XMM, loadv2i64, 0>, 3771 VEX_4V, VEX_WIG; 3772 defm VPUNPCKHWD : sse2_unpack<0x69, "vpunpckhwd", v8i16, X86Unpckh, VR128, 3773 i128mem, SchedWriteShuffle.XMM, loadv2i64, 0>, 3774 VEX_4V, VEX_WIG; 3775} 3776 3777let Predicates = [HasAVX, NoVLX] in { 3778 defm VPUNPCKLDQ : sse2_unpack<0x62, "vpunpckldq", 
v4i32, X86Unpckl, VR128, 3779 i128mem, SchedWriteShuffle.XMM, loadv2i64, 0>, 3780 VEX_4V, VEX_WIG; 3781 defm VPUNPCKLQDQ : sse2_unpack<0x6C, "vpunpcklqdq", v2i64, X86Unpckl, VR128, 3782 i128mem, SchedWriteShuffle.XMM, loadv2i64, 0>, 3783 VEX_4V, VEX_WIG; 3784 defm VPUNPCKHDQ : sse2_unpack<0x6A, "vpunpckhdq", v4i32, X86Unpckh, VR128, 3785 i128mem, SchedWriteShuffle.XMM, loadv2i64, 0>, 3786 VEX_4V, VEX_WIG; 3787 defm VPUNPCKHQDQ : sse2_unpack<0x6D, "vpunpckhqdq", v2i64, X86Unpckh, VR128, 3788 i128mem, SchedWriteShuffle.XMM, loadv2i64, 0>, 3789 VEX_4V, VEX_WIG; 3790} 3791 3792let Predicates = [HasAVX2, NoVLX_Or_NoBWI] in { 3793 defm VPUNPCKLBWY : sse2_unpack<0x60, "vpunpcklbw", v32i8, X86Unpckl, VR256, 3794 i256mem, SchedWriteShuffle.YMM, loadv4i64, 0>, 3795 VEX_4V, VEX_L, VEX_WIG; 3796 defm VPUNPCKLWDY : sse2_unpack<0x61, "vpunpcklwd", v16i16, X86Unpckl, VR256, 3797 i256mem, SchedWriteShuffle.YMM, loadv4i64, 0>, 3798 VEX_4V, VEX_L, VEX_WIG; 3799 defm VPUNPCKHBWY : sse2_unpack<0x68, "vpunpckhbw", v32i8, X86Unpckh, VR256, 3800 i256mem, SchedWriteShuffle.YMM, loadv4i64, 0>, 3801 VEX_4V, VEX_L, VEX_WIG; 3802 defm VPUNPCKHWDY : sse2_unpack<0x69, "vpunpckhwd", v16i16, X86Unpckh, VR256, 3803 i256mem, SchedWriteShuffle.YMM, loadv4i64, 0>, 3804 VEX_4V, VEX_L, VEX_WIG; 3805} 3806 3807let Predicates = [HasAVX2, NoVLX] in { 3808 defm VPUNPCKLDQY : sse2_unpack<0x62, "vpunpckldq", v8i32, X86Unpckl, VR256, 3809 i256mem, SchedWriteShuffle.YMM, loadv4i64, 0>, 3810 VEX_4V, VEX_L, VEX_WIG; 3811 defm VPUNPCKLQDQY : sse2_unpack<0x6C, "vpunpcklqdq", v4i64, X86Unpckl, VR256, 3812 i256mem, SchedWriteShuffle.YMM, loadv4i64, 0>, 3813 VEX_4V, VEX_L, VEX_WIG; 3814 defm VPUNPCKHDQY : sse2_unpack<0x6A, "vpunpckhdq", v8i32, X86Unpckh, VR256, 3815 i256mem, SchedWriteShuffle.YMM, loadv4i64, 0>, 3816 VEX_4V, VEX_L, VEX_WIG; 3817 defm VPUNPCKHQDQY : sse2_unpack<0x6D, "vpunpckhqdq", v4i64, X86Unpckh, VR256, 3818 i256mem, SchedWriteShuffle.YMM, loadv4i64, 0>, 3819 VEX_4V, VEX_L, VEX_WIG; 3820} 3821 3822let Constraints = "$src1 = $dst" in { 3823 defm PUNPCKLBW : sse2_unpack<0x60, "punpcklbw", v16i8, X86Unpckl, VR128, 3824 i128mem, SchedWriteShuffle.XMM, memopv2i64>; 3825 defm PUNPCKLWD : sse2_unpack<0x61, "punpcklwd", v8i16, X86Unpckl, VR128, 3826 i128mem, SchedWriteShuffle.XMM, memopv2i64>; 3827 defm PUNPCKLDQ : sse2_unpack<0x62, "punpckldq", v4i32, X86Unpckl, VR128, 3828 i128mem, SchedWriteShuffle.XMM, memopv2i64>; 3829 defm PUNPCKLQDQ : sse2_unpack<0x6C, "punpcklqdq", v2i64, X86Unpckl, VR128, 3830 i128mem, SchedWriteShuffle.XMM, memopv2i64>; 3831 3832 defm PUNPCKHBW : sse2_unpack<0x68, "punpckhbw", v16i8, X86Unpckh, VR128, 3833 i128mem, SchedWriteShuffle.XMM, memopv2i64>; 3834 defm PUNPCKHWD : sse2_unpack<0x69, "punpckhwd", v8i16, X86Unpckh, VR128, 3835 i128mem, SchedWriteShuffle.XMM, memopv2i64>; 3836 defm PUNPCKHDQ : sse2_unpack<0x6A, "punpckhdq", v4i32, X86Unpckh, VR128, 3837 i128mem, SchedWriteShuffle.XMM, memopv2i64>; 3838 defm PUNPCKHQDQ : sse2_unpack<0x6D, "punpckhqdq", v2i64, X86Unpckh, VR128, 3839 i128mem, SchedWriteShuffle.XMM, memopv2i64>; 3840} 3841} // ExeDomain = SSEPackedInt 3842 3843//===---------------------------------------------------------------------===// 3844// SSE2 - Packed Integer Extract and Insert 3845//===---------------------------------------------------------------------===// 3846 3847let ExeDomain = SSEPackedInt in { 3848multiclass sse2_pinsrw<bit Is2Addr = 1> { 3849 def rr : Ii8<0xC4, MRMSrcReg, 3850 (outs VR128:$dst), (ins VR128:$src1, 3851 GR32orGR64:$src2, u8imm:$src3), 3852 !if(Is2Addr, 
3853 "pinsrw\t{$src3, $src2, $dst|$dst, $src2, $src3}", 3854 "vpinsrw\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"), 3855 [(set VR128:$dst, 3856 (X86pinsrw VR128:$src1, GR32orGR64:$src2, imm:$src3))]>, 3857 Sched<[WriteVecInsert]>; 3858 def rm : Ii8<0xC4, MRMSrcMem, 3859 (outs VR128:$dst), (ins VR128:$src1, 3860 i16mem:$src2, u8imm:$src3), 3861 !if(Is2Addr, 3862 "pinsrw\t{$src3, $src2, $dst|$dst, $src2, $src3}", 3863 "vpinsrw\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"), 3864 [(set VR128:$dst, 3865 (X86pinsrw VR128:$src1, (extloadi16 addr:$src2), 3866 imm:$src3))]>, 3867 Sched<[WriteVecInsertLd, ReadAfterLd]>; 3868} 3869 3870// Extract 3871let Predicates = [HasAVX, NoBWI] in 3872def VPEXTRWrr : Ii8<0xC5, MRMSrcReg, 3873 (outs GR32orGR64:$dst), (ins VR128:$src1, u8imm:$src2), 3874 "vpextrw\t{$src2, $src1, $dst|$dst, $src1, $src2}", 3875 [(set GR32orGR64:$dst, (X86pextrw (v8i16 VR128:$src1), 3876 imm:$src2))]>, 3877 PD, VEX, Sched<[WriteVecExtract]>; 3878def PEXTRWrr : PDIi8<0xC5, MRMSrcReg, 3879 (outs GR32orGR64:$dst), (ins VR128:$src1, u8imm:$src2), 3880 "pextrw\t{$src2, $src1, $dst|$dst, $src1, $src2}", 3881 [(set GR32orGR64:$dst, (X86pextrw (v8i16 VR128:$src1), 3882 imm:$src2))]>, 3883 Sched<[WriteVecExtract]>; 3884 3885// Insert 3886let Predicates = [HasAVX, NoBWI] in 3887defm VPINSRW : sse2_pinsrw<0>, PD, VEX_4V; 3888 3889let Predicates = [UseSSE2], Constraints = "$src1 = $dst" in 3890defm PINSRW : sse2_pinsrw, PD; 3891 3892} // ExeDomain = SSEPackedInt 3893 3894//===---------------------------------------------------------------------===// 3895// SSE2 - Packed Mask Creation 3896//===---------------------------------------------------------------------===// 3897 3898let ExeDomain = SSEPackedInt in { 3899 3900def VPMOVMSKBrr : VPDI<0xD7, MRMSrcReg, (outs GR32orGR64:$dst), 3901 (ins VR128:$src), 3902 "pmovmskb\t{$src, $dst|$dst, $src}", 3903 [(set GR32orGR64:$dst, (X86movmsk (v16i8 VR128:$src)))]>, 3904 Sched<[WriteVecMOVMSK]>, VEX, VEX_WIG; 3905 3906let Predicates = [HasAVX2] in { 3907def VPMOVMSKBYrr : VPDI<0xD7, MRMSrcReg, (outs GR32orGR64:$dst), 3908 (ins VR256:$src), 3909 "pmovmskb\t{$src, $dst|$dst, $src}", 3910 [(set GR32orGR64:$dst, (X86movmsk (v32i8 VR256:$src)))]>, 3911 Sched<[WriteVecMOVMSKY]>, VEX, VEX_L, VEX_WIG; 3912} 3913 3914def PMOVMSKBrr : PDI<0xD7, MRMSrcReg, (outs GR32orGR64:$dst), (ins VR128:$src), 3915 "pmovmskb\t{$src, $dst|$dst, $src}", 3916 [(set GR32orGR64:$dst, (X86movmsk (v16i8 VR128:$src)))]>, 3917 Sched<[WriteVecMOVMSK]>; 3918 3919} // ExeDomain = SSEPackedInt 3920 3921//===---------------------------------------------------------------------===// 3922// SSE2 - Conditional Store 3923//===---------------------------------------------------------------------===// 3924 3925let ExeDomain = SSEPackedInt, SchedRW = [SchedWriteVecMoveLS.XMM.MR] in { 3926let Uses = [EDI], Predicates = [HasAVX,Not64BitMode] in 3927def VMASKMOVDQU : VPDI<0xF7, MRMSrcReg, (outs), 3928 (ins VR128:$src, VR128:$mask), 3929 "maskmovdqu\t{$mask, $src|$src, $mask}", 3930 [(int_x86_sse2_maskmov_dqu VR128:$src, VR128:$mask, EDI)]>, 3931 VEX, VEX_WIG; 3932let Uses = [RDI], Predicates = [HasAVX,In64BitMode] in 3933def VMASKMOVDQU64 : VPDI<0xF7, MRMSrcReg, (outs), 3934 (ins VR128:$src, VR128:$mask), 3935 "maskmovdqu\t{$mask, $src|$src, $mask}", 3936 [(int_x86_sse2_maskmov_dqu VR128:$src, VR128:$mask, RDI)]>, 3937 VEX, VEX_WIG; 3938 3939let Uses = [EDI], Predicates = [UseSSE2,Not64BitMode] in 3940def MASKMOVDQU : PDI<0xF7, MRMSrcReg, (outs), (ins VR128:$src, VR128:$mask), 
3941 "maskmovdqu\t{$mask, $src|$src, $mask}", 3942 [(int_x86_sse2_maskmov_dqu VR128:$src, VR128:$mask, EDI)]>; 3943let Uses = [RDI], Predicates = [UseSSE2,In64BitMode] in 3944def MASKMOVDQU64 : PDI<0xF7, MRMSrcReg, (outs), (ins VR128:$src, VR128:$mask), 3945 "maskmovdqu\t{$mask, $src|$src, $mask}", 3946 [(int_x86_sse2_maskmov_dqu VR128:$src, VR128:$mask, RDI)]>; 3947 3948} // ExeDomain = SSEPackedInt 3949 3950//===---------------------------------------------------------------------===// 3951// SSE2 - Move Doubleword/Quadword 3952//===---------------------------------------------------------------------===// 3953 3954//===---------------------------------------------------------------------===// 3955// Move Int Doubleword to Packed Double Int 3956// 3957let ExeDomain = SSEPackedInt in { 3958def VMOVDI2PDIrr : VS2I<0x6E, MRMSrcReg, (outs VR128:$dst), (ins GR32:$src), 3959 "movd\t{$src, $dst|$dst, $src}", 3960 [(set VR128:$dst, 3961 (v4i32 (scalar_to_vector GR32:$src)))]>, 3962 VEX, Sched<[WriteVecMoveFromGpr]>; 3963def VMOVDI2PDIrm : VS2I<0x6E, MRMSrcMem, (outs VR128:$dst), (ins i32mem:$src), 3964 "movd\t{$src, $dst|$dst, $src}", 3965 [(set VR128:$dst, 3966 (v4i32 (scalar_to_vector (loadi32 addr:$src))))]>, 3967 VEX, Sched<[WriteVecLoad]>; 3968def VMOV64toPQIrr : VRS2I<0x6E, MRMSrcReg, (outs VR128:$dst), (ins GR64:$src), 3969 "movq\t{$src, $dst|$dst, $src}", 3970 [(set VR128:$dst, 3971 (v2i64 (scalar_to_vector GR64:$src)))]>, 3972 VEX, Sched<[WriteVecMoveFromGpr]>; 3973let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0, mayLoad = 1 in 3974def VMOV64toPQIrm : VRS2I<0x6E, MRMSrcMem, (outs VR128:$dst), (ins i64mem:$src), 3975 "movq\t{$src, $dst|$dst, $src}", []>, 3976 VEX, Sched<[WriteVecLoad]>; 3977let isCodeGenOnly = 1 in 3978def VMOV64toSDrr : VRS2I<0x6E, MRMSrcReg, (outs FR64:$dst), (ins GR64:$src), 3979 "movq\t{$src, $dst|$dst, $src}", 3980 [(set FR64:$dst, (bitconvert GR64:$src))]>, 3981 VEX, Sched<[WriteVecMoveFromGpr]>; 3982 3983def MOVDI2PDIrr : S2I<0x6E, MRMSrcReg, (outs VR128:$dst), (ins GR32:$src), 3984 "movd\t{$src, $dst|$dst, $src}", 3985 [(set VR128:$dst, 3986 (v4i32 (scalar_to_vector GR32:$src)))]>, 3987 Sched<[WriteVecMoveFromGpr]>; 3988def MOVDI2PDIrm : S2I<0x6E, MRMSrcMem, (outs VR128:$dst), (ins i32mem:$src), 3989 "movd\t{$src, $dst|$dst, $src}", 3990 [(set VR128:$dst, 3991 (v4i32 (scalar_to_vector (loadi32 addr:$src))))]>, 3992 Sched<[WriteVecLoad]>; 3993def MOV64toPQIrr : RS2I<0x6E, MRMSrcReg, (outs VR128:$dst), (ins GR64:$src), 3994 "movq\t{$src, $dst|$dst, $src}", 3995 [(set VR128:$dst, 3996 (v2i64 (scalar_to_vector GR64:$src)))]>, 3997 Sched<[WriteVecMoveFromGpr]>; 3998let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0, mayLoad = 1 in 3999def MOV64toPQIrm : RS2I<0x6E, MRMSrcMem, (outs VR128:$dst), (ins i64mem:$src), 4000 "movq\t{$src, $dst|$dst, $src}", []>, 4001 Sched<[WriteVecLoad]>; 4002let isCodeGenOnly = 1 in 4003def MOV64toSDrr : RS2I<0x6E, MRMSrcReg, (outs FR64:$dst), (ins GR64:$src), 4004 "movq\t{$src, $dst|$dst, $src}", 4005 [(set FR64:$dst, (bitconvert GR64:$src))]>, 4006 Sched<[WriteVecMoveFromGpr]>; 4007} // ExeDomain = SSEPackedInt 4008 4009//===---------------------------------------------------------------------===// 4010// Move Int Doubleword to Single Scalar 4011// 4012let ExeDomain = SSEPackedInt, isCodeGenOnly = 1 in { 4013 def VMOVDI2SSrr : VS2I<0x6E, MRMSrcReg, (outs FR32:$dst), (ins GR32:$src), 4014 "movd\t{$src, $dst|$dst, $src}", 4015 [(set FR32:$dst, (bitconvert GR32:$src))]>, 4016 VEX, Sched<[WriteVecMoveFromGpr]>; 
4017 4018 def VMOVDI2SSrm : VS2I<0x6E, MRMSrcMem, (outs FR32:$dst), (ins i32mem:$src), 4019 "movd\t{$src, $dst|$dst, $src}", 4020 [(set FR32:$dst, (bitconvert (loadi32 addr:$src)))]>, 4021 VEX, Sched<[WriteVecLoad]>; 4022 def MOVDI2SSrr : S2I<0x6E, MRMSrcReg, (outs FR32:$dst), (ins GR32:$src), 4023 "movd\t{$src, $dst|$dst, $src}", 4024 [(set FR32:$dst, (bitconvert GR32:$src))]>, 4025 Sched<[WriteVecMoveFromGpr]>; 4026 4027 def MOVDI2SSrm : S2I<0x6E, MRMSrcMem, (outs FR32:$dst), (ins i32mem:$src), 4028 "movd\t{$src, $dst|$dst, $src}", 4029 [(set FR32:$dst, (bitconvert (loadi32 addr:$src)))]>, 4030 Sched<[WriteVecLoad]>; 4031} // ExeDomain = SSEPackedInt, isCodeGenOnly = 1 4032 4033//===---------------------------------------------------------------------===// 4034// Move Packed Doubleword Int to Packed Double Int 4035// 4036let ExeDomain = SSEPackedInt in { 4037def VMOVPDI2DIrr : VS2I<0x7E, MRMDestReg, (outs GR32:$dst), (ins VR128:$src), 4038 "movd\t{$src, $dst|$dst, $src}", 4039 [(set GR32:$dst, (extractelt (v4i32 VR128:$src), 4040 (iPTR 0)))]>, VEX, 4041 Sched<[WriteVecMoveToGpr]>; 4042def VMOVPDI2DImr : VS2I<0x7E, MRMDestMem, (outs), 4043 (ins i32mem:$dst, VR128:$src), 4044 "movd\t{$src, $dst|$dst, $src}", 4045 [(store (i32 (extractelt (v4i32 VR128:$src), 4046 (iPTR 0))), addr:$dst)]>, 4047 VEX, Sched<[WriteVecStore]>; 4048def MOVPDI2DIrr : S2I<0x7E, MRMDestReg, (outs GR32:$dst), (ins VR128:$src), 4049 "movd\t{$src, $dst|$dst, $src}", 4050 [(set GR32:$dst, (extractelt (v4i32 VR128:$src), 4051 (iPTR 0)))]>, 4052 Sched<[WriteVecMoveToGpr]>; 4053def MOVPDI2DImr : S2I<0x7E, MRMDestMem, (outs), (ins i32mem:$dst, VR128:$src), 4054 "movd\t{$src, $dst|$dst, $src}", 4055 [(store (i32 (extractelt (v4i32 VR128:$src), 4056 (iPTR 0))), addr:$dst)]>, 4057 Sched<[WriteVecStore]>; 4058} // ExeDomain = SSEPackedInt 4059 4060//===---------------------------------------------------------------------===// 4061// Move Packed Doubleword Int first element to Doubleword Int 4062// 4063let ExeDomain = SSEPackedInt in { 4064let SchedRW = [WriteVecMoveToGpr] in { 4065def VMOVPQIto64rr : VRS2I<0x7E, MRMDestReg, (outs GR64:$dst), (ins VR128:$src), 4066 "movq\t{$src, $dst|$dst, $src}", 4067 [(set GR64:$dst, (extractelt (v2i64 VR128:$src), 4068 (iPTR 0)))]>, 4069 VEX; 4070 4071def MOVPQIto64rr : RS2I<0x7E, MRMDestReg, (outs GR64:$dst), (ins VR128:$src), 4072 "movq\t{$src, $dst|$dst, $src}", 4073 [(set GR64:$dst, (extractelt (v2i64 VR128:$src), 4074 (iPTR 0)))]>; 4075} //SchedRW 4076 4077let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0, mayStore = 1 in 4078def VMOVPQIto64mr : VRS2I<0x7E, MRMDestMem, (outs), 4079 (ins i64mem:$dst, VR128:$src), 4080 "movq\t{$src, $dst|$dst, $src}", []>, 4081 VEX, Sched<[WriteVecStore]>; 4082let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0, mayStore = 1 in 4083def MOVPQIto64mr : RS2I<0x7E, MRMDestMem, (outs), (ins i64mem:$dst, VR128:$src), 4084 "movq\t{$src, $dst|$dst, $src}", []>, 4085 Sched<[WriteVecStore]>; 4086} // ExeDomain = SSEPackedInt 4087 4088//===---------------------------------------------------------------------===// 4089// Bitcast FR64 <-> GR64 4090// 4091let ExeDomain = SSEPackedInt, isCodeGenOnly = 1 in { 4092 let Predicates = [UseAVX] in 4093 def VMOV64toSDrm : VS2SI<0x7E, MRMSrcMem, (outs FR64:$dst), (ins i64mem:$src), 4094 "movq\t{$src, $dst|$dst, $src}", 4095 [(set FR64:$dst, (bitconvert (loadi64 addr:$src)))]>, 4096 VEX, Sched<[WriteVecLoad]>; 4097 def VMOVSDto64rr : VRS2I<0x7E, MRMDestReg, (outs GR64:$dst), (ins FR64:$src), 4098 
"movq\t{$src, $dst|$dst, $src}", 4099 [(set GR64:$dst, (bitconvert FR64:$src))]>, 4100 VEX, Sched<[WriteVecMoveToGpr]>; 4101 def VMOVSDto64mr : VRS2I<0x7E, MRMDestMem, (outs), (ins i64mem:$dst, FR64:$src), 4102 "movq\t{$src, $dst|$dst, $src}", 4103 [(store (i64 (bitconvert FR64:$src)), addr:$dst)]>, 4104 VEX, Sched<[WriteVecStore]>; 4105 4106 def MOV64toSDrm : S2SI<0x7E, MRMSrcMem, (outs FR64:$dst), (ins i64mem:$src), 4107 "movq\t{$src, $dst|$dst, $src}", 4108 [(set FR64:$dst, (bitconvert (loadi64 addr:$src)))]>, 4109 Sched<[WriteVecLoad]>; 4110 def MOVSDto64rr : RS2I<0x7E, MRMDestReg, (outs GR64:$dst), (ins FR64:$src), 4111 "movq\t{$src, $dst|$dst, $src}", 4112 [(set GR64:$dst, (bitconvert FR64:$src))]>, 4113 Sched<[WriteVecMoveToGpr]>; 4114 def MOVSDto64mr : RS2I<0x7E, MRMDestMem, (outs), (ins i64mem:$dst, FR64:$src), 4115 "movq\t{$src, $dst|$dst, $src}", 4116 [(store (i64 (bitconvert FR64:$src)), addr:$dst)]>, 4117 Sched<[WriteVecStore]>; 4118} // ExeDomain = SSEPackedInt, isCodeGenOnly = 1 4119 4120//===---------------------------------------------------------------------===// 4121// Move Scalar Single to Double Int 4122// 4123let ExeDomain = SSEPackedInt, isCodeGenOnly = 1 in { 4124 def VMOVSS2DIrr : VS2I<0x7E, MRMDestReg, (outs GR32:$dst), (ins FR32:$src), 4125 "movd\t{$src, $dst|$dst, $src}", 4126 [(set GR32:$dst, (bitconvert FR32:$src))]>, 4127 VEX, Sched<[WriteVecMoveToGpr]>; 4128 def VMOVSS2DImr : VS2I<0x7E, MRMDestMem, (outs), (ins i32mem:$dst, FR32:$src), 4129 "movd\t{$src, $dst|$dst, $src}", 4130 [(store (i32 (bitconvert FR32:$src)), addr:$dst)]>, 4131 VEX, Sched<[WriteVecStore]>; 4132 def MOVSS2DIrr : S2I<0x7E, MRMDestReg, (outs GR32:$dst), (ins FR32:$src), 4133 "movd\t{$src, $dst|$dst, $src}", 4134 [(set GR32:$dst, (bitconvert FR32:$src))]>, 4135 Sched<[WriteVecMoveToGpr]>; 4136 def MOVSS2DImr : S2I<0x7E, MRMDestMem, (outs), (ins i32mem:$dst, FR32:$src), 4137 "movd\t{$src, $dst|$dst, $src}", 4138 [(store (i32 (bitconvert FR32:$src)), addr:$dst)]>, 4139 Sched<[WriteVecStore]>; 4140} // ExeDomain = SSEPackedInt, isCodeGenOnly = 1 4141 4142let Predicates = [UseAVX] in { 4143 def : Pat<(v4i32 (X86vzmovl (v4i32 (scalar_to_vector GR32:$src)))), 4144 (VMOVDI2PDIrr GR32:$src)>; 4145 4146 def : Pat<(v2i64 (X86vzmovl (v2i64 (scalar_to_vector GR64:$src)))), 4147 (VMOV64toPQIrr GR64:$src)>; 4148 4149 def : Pat<(v4i64 (X86vzmovl (insert_subvector undef, 4150 (v2i64 (scalar_to_vector GR64:$src)),(iPTR 0)))), 4151 (SUBREG_TO_REG (i64 0), (v2i64 (VMOV64toPQIrr GR64:$src)), sub_xmm)>; 4152 // AVX 128-bit movd/movq instructions write zeros in the high 128-bit part. 4153 // These instructions also write zeros in the high part of a 256-bit register. 4154 def : Pat<(v2i64 (X86vzmovl (v2i64 (scalar_to_vector (zextloadi64i32 addr:$src))))), 4155 (VMOVDI2PDIrm addr:$src)>; 4156 def : Pat<(v4i32 (X86vzmovl (v4i32 (scalar_to_vector (loadi32 addr:$src))))), 4157 (VMOVDI2PDIrm addr:$src)>; 4158 def : Pat<(v4i32 (X86vzmovl (bc_v4i32 (loadv2i64 addr:$src)))), 4159 (VMOVDI2PDIrm addr:$src)>; 4160 def : Pat<(v4i32 (X86vzload addr:$src)), 4161 (VMOVDI2PDIrm addr:$src)>; 4162 def : Pat<(v8i32 (X86vzmovl (insert_subvector undef, 4163 (v4i32 (scalar_to_vector (loadi32 addr:$src))), (iPTR 0)))), 4164 (SUBREG_TO_REG (i32 0), (v4i32 (VMOVDI2PDIrm addr:$src)), sub_xmm)>; 4165 def : Pat<(v8i32 (X86vzload addr:$src)), 4166 (SUBREG_TO_REG (i64 0), (v4i32 (VMOVDI2PDIrm addr:$src)), sub_xmm)>; 4167 // Use regular 128-bit instructions to match 256-bit scalar_to_vec+zext. 
4168 def : Pat<(v8i32 (X86vzmovl (insert_subvector undef, 4169 (v4i32 (scalar_to_vector GR32:$src)),(iPTR 0)))), 4170 (SUBREG_TO_REG (i32 0), (v4i32 (VMOVDI2PDIrr GR32:$src)), sub_xmm)>; 4171} 4172 4173let Predicates = [UseSSE2] in { 4174 def : Pat<(v4i32 (X86vzmovl (v4i32 (scalar_to_vector GR32:$src)))), 4175 (MOVDI2PDIrr GR32:$src)>; 4176 4177 def : Pat<(v2i64 (X86vzmovl (v2i64 (scalar_to_vector GR64:$src)))), 4178 (MOV64toPQIrr GR64:$src)>; 4179 def : Pat<(v2i64 (X86vzmovl (v2i64 (scalar_to_vector (zextloadi64i32 addr:$src))))), 4180 (MOVDI2PDIrm addr:$src)>; 4181 def : Pat<(v4i32 (X86vzmovl (v4i32 (scalar_to_vector (loadi32 addr:$src))))), 4182 (MOVDI2PDIrm addr:$src)>; 4183 def : Pat<(v4i32 (X86vzmovl (bc_v4i32 (loadv2i64 addr:$src)))), 4184 (MOVDI2PDIrm addr:$src)>; 4185 def : Pat<(v4i32 (X86vzload addr:$src)), 4186 (MOVDI2PDIrm addr:$src)>; 4187} 4188 4189// Before the MC layer of LLVM existed, clang emitted "movd" assembly instead of 4190// "movq" due to MacOS parsing limitation. In order to parse old assembly, we add 4191// these aliases. 4192def : InstAlias<"movd\t{$src, $dst|$dst, $src}", 4193 (MOV64toPQIrr VR128:$dst, GR64:$src), 0>; 4194def : InstAlias<"movd\t{$src, $dst|$dst, $src}", 4195 (MOVPQIto64rr GR64:$dst, VR128:$src), 0>; 4196// Allow "vmovd" but print "vmovq" since we don't need compatibility for AVX. 4197def : InstAlias<"vmovd\t{$src, $dst|$dst, $src}", 4198 (VMOV64toPQIrr VR128:$dst, GR64:$src), 0>; 4199def : InstAlias<"vmovd\t{$src, $dst|$dst, $src}", 4200 (VMOVPQIto64rr GR64:$dst, VR128:$src), 0>; 4201 4202//===---------------------------------------------------------------------===// 4203// SSE2 - Move Quadword 4204//===---------------------------------------------------------------------===// 4205 4206//===---------------------------------------------------------------------===// 4207// Move Quadword Int to Packed Quadword Int 4208// 4209 4210let ExeDomain = SSEPackedInt, SchedRW = [WriteVecLoad] in { 4211def VMOVQI2PQIrm : I<0x7E, MRMSrcMem, (outs VR128:$dst), (ins i64mem:$src), 4212 "vmovq\t{$src, $dst|$dst, $src}", 4213 [(set VR128:$dst, 4214 (v2i64 (scalar_to_vector (loadi64 addr:$src))))]>, XS, 4215 VEX, Requires<[UseAVX]>, VEX_WIG; 4216def MOVQI2PQIrm : I<0x7E, MRMSrcMem, (outs VR128:$dst), (ins i64mem:$src), 4217 "movq\t{$src, $dst|$dst, $src}", 4218 [(set VR128:$dst, 4219 (v2i64 (scalar_to_vector (loadi64 addr:$src))))]>, 4220 XS, Requires<[UseSSE2]>; // SSE2 instruction with XS Prefix 4221} // ExeDomain, SchedRW 4222 4223//===---------------------------------------------------------------------===// 4224// Move Packed Quadword Int to Quadword Int 4225// 4226let ExeDomain = SSEPackedInt, SchedRW = [WriteVecStore] in { 4227def VMOVPQI2QImr : VS2I<0xD6, MRMDestMem, (outs), (ins i64mem:$dst, VR128:$src), 4228 "movq\t{$src, $dst|$dst, $src}", 4229 [(store (i64 (extractelt (v2i64 VR128:$src), 4230 (iPTR 0))), addr:$dst)]>, 4231 VEX, VEX_WIG; 4232def MOVPQI2QImr : S2I<0xD6, MRMDestMem, (outs), (ins i64mem:$dst, VR128:$src), 4233 "movq\t{$src, $dst|$dst, $src}", 4234 [(store (i64 (extractelt (v2i64 VR128:$src), 4235 (iPTR 0))), addr:$dst)]>; 4236} // ExeDomain, SchedRW 4237 4238// For disassembler only 4239let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0, 4240 SchedRW = [SchedWriteVecLogic.XMM] in { 4241def VMOVPQI2QIrr : VS2I<0xD6, MRMDestReg, (outs VR128:$dst), (ins VR128:$src), 4242 "movq\t{$src, $dst|$dst, $src}", []>, VEX, VEX_WIG; 4243def MOVPQI2QIrr : S2I<0xD6, MRMDestReg, (outs VR128:$dst), (ins VR128:$src), 4244 "movq\t{$src, $dst|$dst, 
$src}", []>; 4245} 4246 4247// Aliases to help the assembler pick two byte VEX encodings by swapping the 4248// operands relative to the normal instructions to use VEX.R instead of VEX.B. 4249def : InstAlias<"vmovq\t{$src, $dst|$dst, $src}", 4250 (VMOVPQI2QIrr VR128L:$dst, VR128H:$src), 0>; 4251 4252def : InstAlias<"vmovq.s\t{$src, $dst|$dst, $src}", 4253 (VMOVPQI2QIrr VR128:$dst, VR128:$src), 0>; 4254def : InstAlias<"movq.s\t{$src, $dst|$dst, $src}", 4255 (MOVPQI2QIrr VR128:$dst, VR128:$src), 0>; 4256 4257let Predicates = [UseAVX] in { 4258 def : Pat<(v2i64 (X86vzmovl (loadv2i64 addr:$src))), 4259 (VMOVQI2PQIrm addr:$src)>; 4260 def : Pat<(v2i64 (X86vzload addr:$src)), 4261 (VMOVQI2PQIrm addr:$src)>; 4262 def : Pat<(v4i64 (X86vzmovl (insert_subvector undef, 4263 (v2i64 (scalar_to_vector (loadi64 addr:$src))), (iPTR 0)))), 4264 (SUBREG_TO_REG (i64 0), (v2i64 (VMOVQI2PQIrm addr:$src)), sub_xmm)>; 4265 def : Pat<(v4i64 (X86vzload addr:$src)), 4266 (SUBREG_TO_REG (i64 0), (v2i64 (VMOVQI2PQIrm addr:$src)), sub_xmm)>; 4267} 4268 4269let Predicates = [UseSSE2] in { 4270 def : Pat<(v2i64 (X86vzmovl (loadv2i64 addr:$src))), 4271 (MOVQI2PQIrm addr:$src)>; 4272 def : Pat<(v2i64 (X86vzload addr:$src)), (MOVQI2PQIrm addr:$src)>; 4273} 4274 4275//===---------------------------------------------------------------------===// 4276// Moving from XMM to XMM and clear upper 64 bits. Note, there is a bug in 4277// IA32 document. movq xmm1, xmm2 does clear the high bits. 4278// 4279let ExeDomain = SSEPackedInt, SchedRW = [SchedWriteVecLogic.XMM] in { 4280def VMOVZPQILo2PQIrr : I<0x7E, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), 4281 "vmovq\t{$src, $dst|$dst, $src}", 4282 [(set VR128:$dst, (v2i64 (X86vzmovl (v2i64 VR128:$src))))]>, 4283 XS, VEX, Requires<[UseAVX]>, VEX_WIG; 4284def MOVZPQILo2PQIrr : I<0x7E, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), 4285 "movq\t{$src, $dst|$dst, $src}", 4286 [(set VR128:$dst, (v2i64 (X86vzmovl (v2i64 VR128:$src))))]>, 4287 XS, Requires<[UseSSE2]>; 4288} // ExeDomain, SchedRW 4289 4290let Predicates = [UseAVX] in { 4291 def : Pat<(v2f64 (X86vzmovl (v2f64 VR128:$src))), 4292 (VMOVZPQILo2PQIrr VR128:$src)>; 4293} 4294let Predicates = [UseSSE2] in { 4295 def : Pat<(v2f64 (X86vzmovl (v2f64 VR128:$src))), 4296 (MOVZPQILo2PQIrr VR128:$src)>; 4297} 4298 4299//===---------------------------------------------------------------------===// 4300// SSE3 - Replicate Single FP - MOVSHDUP and MOVSLDUP 4301//===---------------------------------------------------------------------===// 4302 4303multiclass sse3_replicate_sfp<bits<8> op, SDNode OpNode, string OpcodeStr, 4304 ValueType vt, RegisterClass RC, PatFrag mem_frag, 4305 X86MemOperand x86memop, X86FoldableSchedWrite sched> { 4306def rr : S3SI<op, MRMSrcReg, (outs RC:$dst), (ins RC:$src), 4307 !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), 4308 [(set RC:$dst, (vt (OpNode RC:$src)))]>, 4309 Sched<[sched]>; 4310def rm : S3SI<op, MRMSrcMem, (outs RC:$dst), (ins x86memop:$src), 4311 !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), 4312 [(set RC:$dst, (OpNode (mem_frag addr:$src)))]>, 4313 Sched<[sched.Folded]>; 4314} 4315 4316let Predicates = [HasAVX, NoVLX] in { 4317 defm VMOVSHDUP : sse3_replicate_sfp<0x16, X86Movshdup, "vmovshdup", 4318 v4f32, VR128, loadv4f32, f128mem, 4319 SchedWriteFShuffle.XMM>, VEX, VEX_WIG; 4320 defm VMOVSLDUP : sse3_replicate_sfp<0x12, X86Movsldup, "vmovsldup", 4321 v4f32, VR128, loadv4f32, f128mem, 4322 SchedWriteFShuffle.XMM>, VEX, VEX_WIG; 4323 defm VMOVSHDUPY : sse3_replicate_sfp<0x16, X86Movshdup, 
"vmovshdup", 4324 v8f32, VR256, loadv8f32, f256mem, 4325 SchedWriteFShuffle.YMM>, VEX, VEX_L, VEX_WIG; 4326 defm VMOVSLDUPY : sse3_replicate_sfp<0x12, X86Movsldup, "vmovsldup", 4327 v8f32, VR256, loadv8f32, f256mem, 4328 SchedWriteFShuffle.YMM>, VEX, VEX_L, VEX_WIG; 4329} 4330defm MOVSHDUP : sse3_replicate_sfp<0x16, X86Movshdup, "movshdup", v4f32, VR128, 4331 memopv4f32, f128mem, SchedWriteFShuffle.XMM>; 4332defm MOVSLDUP : sse3_replicate_sfp<0x12, X86Movsldup, "movsldup", v4f32, VR128, 4333 memopv4f32, f128mem, SchedWriteFShuffle.XMM>; 4334 4335let Predicates = [HasAVX, NoVLX] in { 4336 def : Pat<(v4i32 (X86Movshdup VR128:$src)), 4337 (VMOVSHDUPrr VR128:$src)>; 4338 def : Pat<(v4i32 (X86Movshdup (bc_v4i32 (loadv2i64 addr:$src)))), 4339 (VMOVSHDUPrm addr:$src)>; 4340 def : Pat<(v4i32 (X86Movsldup VR128:$src)), 4341 (VMOVSLDUPrr VR128:$src)>; 4342 def : Pat<(v4i32 (X86Movsldup (bc_v4i32 (loadv2i64 addr:$src)))), 4343 (VMOVSLDUPrm addr:$src)>; 4344 def : Pat<(v8i32 (X86Movshdup VR256:$src)), 4345 (VMOVSHDUPYrr VR256:$src)>; 4346 def : Pat<(v8i32 (X86Movshdup (bc_v8i32 (loadv4i64 addr:$src)))), 4347 (VMOVSHDUPYrm addr:$src)>; 4348 def : Pat<(v8i32 (X86Movsldup VR256:$src)), 4349 (VMOVSLDUPYrr VR256:$src)>; 4350 def : Pat<(v8i32 (X86Movsldup (bc_v8i32 (loadv4i64 addr:$src)))), 4351 (VMOVSLDUPYrm addr:$src)>; 4352} 4353 4354let Predicates = [UseSSE3] in { 4355 def : Pat<(v4i32 (X86Movshdup VR128:$src)), 4356 (MOVSHDUPrr VR128:$src)>; 4357 def : Pat<(v4i32 (X86Movshdup (bc_v4i32 (memopv2i64 addr:$src)))), 4358 (MOVSHDUPrm addr:$src)>; 4359 def : Pat<(v4i32 (X86Movsldup VR128:$src)), 4360 (MOVSLDUPrr VR128:$src)>; 4361 def : Pat<(v4i32 (X86Movsldup (bc_v4i32 (memopv2i64 addr:$src)))), 4362 (MOVSLDUPrm addr:$src)>; 4363} 4364 4365//===---------------------------------------------------------------------===// 4366// SSE3 - Replicate Double FP - MOVDDUP 4367//===---------------------------------------------------------------------===// 4368 4369multiclass sse3_replicate_dfp<string OpcodeStr, X86SchedWriteWidths sched> { 4370def rr : S3DI<0x12, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), 4371 !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), 4372 [(set VR128:$dst, (v2f64 (X86Movddup VR128:$src)))]>, 4373 Sched<[sched.XMM]>; 4374def rm : S3DI<0x12, MRMSrcMem, (outs VR128:$dst), (ins f64mem:$src), 4375 !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), 4376 [(set VR128:$dst, 4377 (v2f64 (X86Movddup 4378 (scalar_to_vector (loadf64 addr:$src)))))]>, 4379 Sched<[sched.XMM.Folded]>; 4380} 4381 4382// FIXME: Merge with above classes when there are patterns for the ymm version 4383multiclass sse3_replicate_dfp_y<string OpcodeStr, X86SchedWriteWidths sched> { 4384def rr : S3DI<0x12, MRMSrcReg, (outs VR256:$dst), (ins VR256:$src), 4385 !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), 4386 [(set VR256:$dst, (v4f64 (X86Movddup VR256:$src)))]>, 4387 Sched<[sched.YMM]>; 4388def rm : S3DI<0x12, MRMSrcMem, (outs VR256:$dst), (ins f256mem:$src), 4389 !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), 4390 [(set VR256:$dst, 4391 (v4f64 (X86Movddup (loadv4f64 addr:$src))))]>, 4392 Sched<[sched.YMM.Folded]>; 4393} 4394 4395let Predicates = [HasAVX, NoVLX] in { 4396 defm VMOVDDUP : sse3_replicate_dfp<"vmovddup", SchedWriteFShuffle>, 4397 VEX, VEX_WIG; 4398 defm VMOVDDUPY : sse3_replicate_dfp_y<"vmovddup", SchedWriteFShuffle>, 4399 VEX, VEX_L, VEX_WIG; 4400} 4401 4402defm MOVDDUP : sse3_replicate_dfp<"movddup", SchedWriteFShuffle>; 4403 4404 4405let Predicates = [HasAVX, NoVLX] in { 4406 def : Pat<(X86Movddup 
(loadv2f64 addr:$src)), 4407 (VMOVDDUPrm addr:$src)>, Requires<[HasAVX]>; 4408} 4409 4410let Predicates = [UseSSE3] in { 4411 // No need for aligned memory as this only loads 64-bits. 4412 def : Pat<(X86Movddup (loadv2f64 addr:$src)), 4413 (MOVDDUPrm addr:$src)>; 4414} 4415 4416//===---------------------------------------------------------------------===// 4417// SSE3 - Move Unaligned Integer 4418//===---------------------------------------------------------------------===// 4419 4420let Predicates = [HasAVX] in { 4421 def VLDDQUrm : S3DI<0xF0, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src), 4422 "vlddqu\t{$src, $dst|$dst, $src}", 4423 [(set VR128:$dst, (int_x86_sse3_ldu_dq addr:$src))]>, 4424 Sched<[SchedWriteVecMoveLS.XMM.RM]>, VEX, VEX_WIG; 4425 def VLDDQUYrm : S3DI<0xF0, MRMSrcMem, (outs VR256:$dst), (ins i256mem:$src), 4426 "vlddqu\t{$src, $dst|$dst, $src}", 4427 [(set VR256:$dst, (int_x86_avx_ldu_dq_256 addr:$src))]>, 4428 Sched<[SchedWriteVecMoveLS.YMM.RM]>, VEX, VEX_L, VEX_WIG; 4429} // Predicates 4430 4431def LDDQUrm : S3DI<0xF0, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src), 4432 "lddqu\t{$src, $dst|$dst, $src}", 4433 [(set VR128:$dst, (int_x86_sse3_ldu_dq addr:$src))]>, 4434 Sched<[SchedWriteVecMoveLS.XMM.RM]>; 4435 4436//===---------------------------------------------------------------------===// 4437// SSE3 - Arithmetic 4438//===---------------------------------------------------------------------===// 4439 4440multiclass sse3_addsub<string OpcodeStr, ValueType vt, RegisterClass RC, 4441 X86MemOperand x86memop, X86FoldableSchedWrite sched, 4442 PatFrag ld_frag, bit Is2Addr = 1> { 4443 def rr : I<0xD0, MRMSrcReg, 4444 (outs RC:$dst), (ins RC:$src1, RC:$src2), 4445 !if(Is2Addr, 4446 !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"), 4447 !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")), 4448 [(set RC:$dst, (vt (X86Addsub RC:$src1, RC:$src2)))]>, 4449 Sched<[sched]>; 4450 def rm : I<0xD0, MRMSrcMem, 4451 (outs RC:$dst), (ins RC:$src1, x86memop:$src2), 4452 !if(Is2Addr, 4453 !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"), 4454 !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")), 4455 [(set RC:$dst, (vt (X86Addsub RC:$src1, (ld_frag addr:$src2))))]>, 4456 Sched<[sched.Folded, ReadAfterLd]>; 4457} 4458 4459let Predicates = [HasAVX] in { 4460 let ExeDomain = SSEPackedSingle in { 4461 defm VADDSUBPS : sse3_addsub<"vaddsubps", v4f32, VR128, f128mem, 4462 SchedWriteFAddSizes.PS.XMM, loadv4f32, 0>, 4463 XD, VEX_4V, VEX_WIG; 4464 defm VADDSUBPSY : sse3_addsub<"vaddsubps", v8f32, VR256, f256mem, 4465 SchedWriteFAddSizes.PS.YMM, loadv8f32, 0>, 4466 XD, VEX_4V, VEX_L, VEX_WIG; 4467 } 4468 let ExeDomain = SSEPackedDouble in { 4469 defm VADDSUBPD : sse3_addsub<"vaddsubpd", v2f64, VR128, f128mem, 4470 SchedWriteFAddSizes.PD.XMM, loadv2f64, 0>, 4471 PD, VEX_4V, VEX_WIG; 4472 defm VADDSUBPDY : sse3_addsub<"vaddsubpd", v4f64, VR256, f256mem, 4473 SchedWriteFAddSizes.PD.YMM, loadv4f64, 0>, 4474 PD, VEX_4V, VEX_L, VEX_WIG; 4475 } 4476} 4477let Constraints = "$src1 = $dst", Predicates = [UseSSE3] in { 4478 let ExeDomain = SSEPackedSingle in 4479 defm ADDSUBPS : sse3_addsub<"addsubps", v4f32, VR128, f128mem, 4480 SchedWriteFAddSizes.PS.XMM, memopv4f32>, XD; 4481 let ExeDomain = SSEPackedDouble in 4482 defm ADDSUBPD : sse3_addsub<"addsubpd", v2f64, VR128, f128mem, 4483 SchedWriteFAddSizes.PD.XMM, memopv2f64>, PD; 4484} 4485 4486//===---------------------------------------------------------------------===// 4487// SSE3 Instructions 
4488//===---------------------------------------------------------------------===// 4489 4490// Horizontal ops 4491multiclass S3D_Int<bits<8> o, string OpcodeStr, ValueType vt, RegisterClass RC, 4492 X86MemOperand x86memop, SDNode OpNode, 4493 X86FoldableSchedWrite sched, PatFrag ld_frag, 4494 bit Is2Addr = 1> { 4495 def rr : S3DI<o, MRMSrcReg, (outs RC:$dst), (ins RC:$src1, RC:$src2), 4496 !if(Is2Addr, 4497 !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"), 4498 !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")), 4499 [(set RC:$dst, (vt (OpNode RC:$src1, RC:$src2)))]>, 4500 Sched<[sched]>; 4501 4502 def rm : S3DI<o, MRMSrcMem, (outs RC:$dst), (ins RC:$src1, x86memop:$src2), 4503 !if(Is2Addr, 4504 !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"), 4505 !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")), 4506 [(set RC:$dst, (vt (OpNode RC:$src1, (ld_frag addr:$src2))))]>, 4507 Sched<[sched.Folded, ReadAfterLd]>; 4508} 4509multiclass S3_Int<bits<8> o, string OpcodeStr, ValueType vt, RegisterClass RC, 4510 X86MemOperand x86memop, SDNode OpNode, 4511 X86FoldableSchedWrite sched, PatFrag ld_frag, 4512 bit Is2Addr = 1> { 4513 def rr : S3I<o, MRMSrcReg, (outs RC:$dst), (ins RC:$src1, RC:$src2), 4514 !if(Is2Addr, 4515 !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"), 4516 !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")), 4517 [(set RC:$dst, (vt (OpNode RC:$src1, RC:$src2)))]>, 4518 Sched<[sched]>; 4519 4520 def rm : S3I<o, MRMSrcMem, (outs RC:$dst), (ins RC:$src1, x86memop:$src2), 4521 !if(Is2Addr, 4522 !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"), 4523 !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")), 4524 [(set RC:$dst, (vt (OpNode RC:$src1, (ld_frag addr:$src2))))]>, 4525 Sched<[sched.Folded, ReadAfterLd]>; 4526} 4527 4528let Predicates = [HasAVX] in { 4529 let ExeDomain = SSEPackedSingle in { 4530 defm VHADDPS : S3D_Int<0x7C, "vhaddps", v4f32, VR128, f128mem, 4531 X86fhadd, WriteFHAdd, loadv4f32, 0>, VEX_4V, VEX_WIG; 4532 defm VHSUBPS : S3D_Int<0x7D, "vhsubps", v4f32, VR128, f128mem, 4533 X86fhsub, WriteFHAdd, loadv4f32, 0>, VEX_4V, VEX_WIG; 4534 defm VHADDPSY : S3D_Int<0x7C, "vhaddps", v8f32, VR256, f256mem, 4535 X86fhadd, WriteFHAddY, loadv8f32, 0>, VEX_4V, VEX_L, VEX_WIG; 4536 defm VHSUBPSY : S3D_Int<0x7D, "vhsubps", v8f32, VR256, f256mem, 4537 X86fhsub, WriteFHAddY, loadv8f32, 0>, VEX_4V, VEX_L, VEX_WIG; 4538 } 4539 let ExeDomain = SSEPackedDouble in { 4540 defm VHADDPD : S3_Int<0x7C, "vhaddpd", v2f64, VR128, f128mem, 4541 X86fhadd, WriteFHAdd, loadv2f64, 0>, VEX_4V, VEX_WIG; 4542 defm VHSUBPD : S3_Int<0x7D, "vhsubpd", v2f64, VR128, f128mem, 4543 X86fhsub, WriteFHAdd, loadv2f64, 0>, VEX_4V, VEX_WIG; 4544 defm VHADDPDY : S3_Int<0x7C, "vhaddpd", v4f64, VR256, f256mem, 4545 X86fhadd, WriteFHAddY, loadv4f64, 0>, VEX_4V, VEX_L, VEX_WIG; 4546 defm VHSUBPDY : S3_Int<0x7D, "vhsubpd", v4f64, VR256, f256mem, 4547 X86fhsub, WriteFHAddY, loadv4f64, 0>, VEX_4V, VEX_L, VEX_WIG; 4548 } 4549} 4550 4551let Constraints = "$src1 = $dst" in { 4552 let ExeDomain = SSEPackedSingle in { 4553 defm HADDPS : S3D_Int<0x7C, "haddps", v4f32, VR128, f128mem, X86fhadd, 4554 WriteFHAdd, memopv4f32>; 4555 defm HSUBPS : S3D_Int<0x7D, "hsubps", v4f32, VR128, f128mem, X86fhsub, 4556 WriteFHAdd, memopv4f32>; 4557 } 4558 let ExeDomain = SSEPackedDouble in { 4559 defm HADDPD : S3_Int<0x7C, "haddpd", v2f64, VR128, f128mem, X86fhadd, 4560 WriteFHAdd, memopv2f64>; 4561 defm HSUBPD : S3_Int<0x7D, "hsubpd", v2f64, VR128, f128mem, X86fhsub, 4562 
                        WriteFHAdd, memopv2f64>;
  }
}

//===---------------------------------------------------------------------===//
// SSSE3 - Packed Absolute Instructions
//===---------------------------------------------------------------------===//

/// SS3I_unop_rm - Simple SSSE3 unary op whose type can be v*{i8,i16,i32}.
multiclass SS3I_unop_rm<bits<8> opc, string OpcodeStr, ValueType vt,
                        SDNode OpNode, X86SchedWriteWidths sched, PatFrag ld_frag> {
  def rr : SS38I<opc, MRMSrcReg, (outs VR128:$dst),
                 (ins VR128:$src),
                 !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
                 [(set VR128:$dst, (vt (OpNode VR128:$src)))]>,
                 Sched<[sched.XMM]>;

  def rm : SS38I<opc, MRMSrcMem, (outs VR128:$dst),
                 (ins i128mem:$src),
                 !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
                 [(set VR128:$dst,
                   (vt (OpNode (bitconvert (ld_frag addr:$src)))))]>,
                 Sched<[sched.XMM.Folded]>;
}

/// SS3I_unop_rm_y - Simple SSSE3 unary op whose type can be v*{i8,i16,i32}.
multiclass SS3I_unop_rm_y<bits<8> opc, string OpcodeStr, ValueType vt,
                          SDNode OpNode, X86SchedWriteWidths sched> {
  def Yrr : SS38I<opc, MRMSrcReg, (outs VR256:$dst),
                  (ins VR256:$src),
                  !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
                  [(set VR256:$dst, (vt (OpNode VR256:$src)))]>,
                  Sched<[sched.YMM]>;

  def Yrm : SS38I<opc, MRMSrcMem, (outs VR256:$dst),
                  (ins i256mem:$src),
                  !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
                  [(set VR256:$dst,
                    (vt (OpNode (bitconvert (loadv4i64 addr:$src)))))]>,
                  Sched<[sched.YMM.Folded]>;
}

let Predicates = [HasAVX, NoVLX_Or_NoBWI] in {
  defm VPABSB : SS3I_unop_rm<0x1C, "vpabsb", v16i8, abs, SchedWriteVecALU,
                             loadv2i64>, VEX, VEX_WIG;
  defm VPABSW : SS3I_unop_rm<0x1D, "vpabsw", v8i16, abs, SchedWriteVecALU,
                             loadv2i64>, VEX, VEX_WIG;
}
let Predicates = [HasAVX, NoVLX] in {
  defm VPABSD : SS3I_unop_rm<0x1E, "vpabsd", v4i32, abs, SchedWriteVecALU,
                             loadv2i64>, VEX, VEX_WIG;
}
let Predicates = [HasAVX2, NoVLX_Or_NoBWI] in {
  defm VPABSB : SS3I_unop_rm_y<0x1C, "vpabsb", v32i8, abs, SchedWriteVecALU>,
                VEX, VEX_L, VEX_WIG;
  defm VPABSW : SS3I_unop_rm_y<0x1D, "vpabsw", v16i16, abs, SchedWriteVecALU>,
                VEX, VEX_L, VEX_WIG;
}
let Predicates = [HasAVX2, NoVLX] in {
  defm VPABSD : SS3I_unop_rm_y<0x1E, "vpabsd", v8i32, abs, SchedWriteVecALU>,
                VEX, VEX_L, VEX_WIG;
}

defm PABSB : SS3I_unop_rm<0x1C, "pabsb", v16i8, abs, SchedWriteVecALU,
                          memopv2i64>;
defm PABSW : SS3I_unop_rm<0x1D, "pabsw", v8i16, abs, SchedWriteVecALU,
                          memopv2i64>;
defm PABSD : SS3I_unop_rm<0x1E, "pabsd", v4i32, abs, SchedWriteVecALU,
                          memopv2i64>;

//===---------------------------------------------------------------------===//
// SSSE3 - Packed Binary Operator Instructions
//===---------------------------------------------------------------------===//

/// SS3I_binop_rm - Simple SSSE3 bin op
multiclass SS3I_binop_rm<bits<8> opc, string OpcodeStr, SDNode OpNode,
                         ValueType DstVT, ValueType OpVT, RegisterClass RC,
                         PatFrag memop_frag, X86MemOperand x86memop,
                         X86FoldableSchedWrite sched, bit Is2Addr = 1> {
  let isCommutable = 1 in
  def rr : SS38I<opc, MRMSrcReg, (outs RC:$dst),
                 (ins RC:$src1, RC:$src2),
                 !if(Is2Addr,
                   !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
                   !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1,
$src2}")), 4647 [(set RC:$dst, (DstVT (OpNode (OpVT RC:$src1), RC:$src2)))]>, 4648 Sched<[sched]>; 4649 def rm : SS38I<opc, MRMSrcMem, (outs RC:$dst), 4650 (ins RC:$src1, x86memop:$src2), 4651 !if(Is2Addr, 4652 !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"), 4653 !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")), 4654 [(set RC:$dst, 4655 (DstVT (OpNode (OpVT RC:$src1), 4656 (bitconvert (memop_frag addr:$src2)))))]>, 4657 Sched<[sched.Folded, ReadAfterLd]>; 4658} 4659 4660/// SS3I_binop_rm_int - Simple SSSE3 bin op whose type can be v*{i8,i16,i32}. 4661multiclass SS3I_binop_rm_int<bits<8> opc, string OpcodeStr, 4662 Intrinsic IntId128, X86FoldableSchedWrite sched, 4663 PatFrag ld_frag, bit Is2Addr = 1> { 4664 let isCommutable = 1 in 4665 def rr : SS38I<opc, MRMSrcReg, (outs VR128:$dst), 4666 (ins VR128:$src1, VR128:$src2), 4667 !if(Is2Addr, 4668 !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"), 4669 !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")), 4670 [(set VR128:$dst, (IntId128 VR128:$src1, VR128:$src2))]>, 4671 Sched<[sched]>; 4672 def rm : SS38I<opc, MRMSrcMem, (outs VR128:$dst), 4673 (ins VR128:$src1, i128mem:$src2), 4674 !if(Is2Addr, 4675 !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"), 4676 !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")), 4677 [(set VR128:$dst, 4678 (IntId128 VR128:$src1, 4679 (bitconvert (ld_frag addr:$src2))))]>, 4680 Sched<[sched.Folded, ReadAfterLd]>; 4681} 4682 4683multiclass SS3I_binop_rm_int_y<bits<8> opc, string OpcodeStr, 4684 Intrinsic IntId256, 4685 X86FoldableSchedWrite sched> { 4686 let isCommutable = 1 in 4687 def Yrr : SS38I<opc, MRMSrcReg, (outs VR256:$dst), 4688 (ins VR256:$src1, VR256:$src2), 4689 !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), 4690 [(set VR256:$dst, (IntId256 VR256:$src1, VR256:$src2))]>, 4691 Sched<[sched]>; 4692 def Yrm : SS38I<opc, MRMSrcMem, (outs VR256:$dst), 4693 (ins VR256:$src1, i256mem:$src2), 4694 !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), 4695 [(set VR256:$dst, 4696 (IntId256 VR256:$src1, (bitconvert (loadv4i64 addr:$src2))))]>, 4697 Sched<[sched.Folded, ReadAfterLd]>; 4698} 4699 4700let ImmT = NoImm, Predicates = [HasAVX, NoVLX_Or_NoBWI] in { 4701let isCommutable = 0 in { 4702 defm VPSHUFB : SS3I_binop_rm<0x00, "vpshufb", X86pshufb, v16i8, v16i8, 4703 VR128, loadv2i64, i128mem, 4704 SchedWriteVarShuffle.XMM, 0>, VEX_4V, VEX_WIG; 4705 defm VPMADDUBSW : SS3I_binop_rm<0x04, "vpmaddubsw", X86vpmaddubsw, v8i16, 4706 v16i8, VR128, loadv2i64, i128mem, 4707 SchedWriteVecIMul.XMM, 0>, VEX_4V, VEX_WIG; 4708} 4709defm VPMULHRSW : SS3I_binop_rm<0x0B, "vpmulhrsw", X86mulhrs, v8i16, v8i16, 4710 VR128, loadv2i64, i128mem, 4711 SchedWriteVecIMul.XMM, 0>, VEX_4V, VEX_WIG; 4712} 4713 4714let ImmT = NoImm, Predicates = [HasAVX] in { 4715let isCommutable = 0 in { 4716 defm VPHADDW : SS3I_binop_rm<0x01, "vphaddw", X86hadd, v8i16, v8i16, VR128, 4717 loadv2i64, i128mem, 4718 SchedWritePHAdd.XMM, 0>, VEX_4V, VEX_WIG; 4719 defm VPHADDD : SS3I_binop_rm<0x02, "vphaddd", X86hadd, v4i32, v4i32, VR128, 4720 loadv2i64, i128mem, 4721 SchedWritePHAdd.XMM, 0>, VEX_4V, VEX_WIG; 4722 defm VPHSUBW : SS3I_binop_rm<0x05, "vphsubw", X86hsub, v8i16, v8i16, VR128, 4723 loadv2i64, i128mem, 4724 SchedWritePHAdd.XMM, 0>, VEX_4V, VEX_WIG; 4725 defm VPHSUBD : SS3I_binop_rm<0x06, "vphsubd", X86hsub, v4i32, v4i32, VR128, 4726 loadv2i64, i128mem, 4727 SchedWritePHAdd.XMM, 0>, VEX_4V; 4728 defm VPSIGNB : SS3I_binop_rm_int<0x08, "vpsignb", 4729 
int_x86_ssse3_psign_b_128, 4730 SchedWriteVecALU.XMM, loadv2i64, 0>, VEX_4V, VEX_WIG; 4731 defm VPSIGNW : SS3I_binop_rm_int<0x09, "vpsignw", 4732 int_x86_ssse3_psign_w_128, 4733 SchedWriteVecALU.XMM, loadv2i64, 0>, VEX_4V, VEX_WIG; 4734 defm VPSIGND : SS3I_binop_rm_int<0x0A, "vpsignd", 4735 int_x86_ssse3_psign_d_128, 4736 SchedWriteVecALU.XMM, loadv2i64, 0>, VEX_4V, VEX_WIG; 4737 defm VPHADDSW : SS3I_binop_rm_int<0x03, "vphaddsw", 4738 int_x86_ssse3_phadd_sw_128, 4739 SchedWritePHAdd.XMM, loadv2i64, 0>, VEX_4V, VEX_WIG; 4740 defm VPHSUBSW : SS3I_binop_rm_int<0x07, "vphsubsw", 4741 int_x86_ssse3_phsub_sw_128, 4742 SchedWritePHAdd.XMM, loadv2i64, 0>, VEX_4V, VEX_WIG; 4743} 4744} 4745 4746let ImmT = NoImm, Predicates = [HasAVX2, NoVLX_Or_NoBWI] in { 4747let isCommutable = 0 in { 4748 defm VPSHUFBY : SS3I_binop_rm<0x00, "vpshufb", X86pshufb, v32i8, v32i8, 4749 VR256, loadv4i64, i256mem, 4750 SchedWriteVarShuffle.YMM, 0>, VEX_4V, VEX_L, VEX_WIG; 4751 defm VPMADDUBSWY : SS3I_binop_rm<0x04, "vpmaddubsw", X86vpmaddubsw, v16i16, 4752 v32i8, VR256, loadv4i64, i256mem, 4753 SchedWriteVecIMul.YMM, 0>, VEX_4V, VEX_L, VEX_WIG; 4754} 4755defm VPMULHRSWY : SS3I_binop_rm<0x0B, "vpmulhrsw", X86mulhrs, v16i16, v16i16, 4756 VR256, loadv4i64, i256mem, 4757 SchedWriteVecIMul.YMM, 0>, VEX_4V, VEX_L, VEX_WIG; 4758} 4759 4760let ImmT = NoImm, Predicates = [HasAVX2] in { 4761let isCommutable = 0 in { 4762 defm VPHADDWY : SS3I_binop_rm<0x01, "vphaddw", X86hadd, v16i16, v16i16, 4763 VR256, loadv4i64, i256mem, 4764 SchedWritePHAdd.YMM, 0>, VEX_4V, VEX_L, VEX_WIG; 4765 defm VPHADDDY : SS3I_binop_rm<0x02, "vphaddd", X86hadd, v8i32, v8i32, VR256, 4766 loadv4i64, i256mem, 4767 SchedWritePHAdd.YMM, 0>, VEX_4V, VEX_L, VEX_WIG; 4768 defm VPHSUBWY : SS3I_binop_rm<0x05, "vphsubw", X86hsub, v16i16, v16i16, 4769 VR256, loadv4i64, i256mem, 4770 SchedWritePHAdd.YMM, 0>, VEX_4V, VEX_L, VEX_WIG; 4771 defm VPHSUBDY : SS3I_binop_rm<0x06, "vphsubd", X86hsub, v8i32, v8i32, VR256, 4772 loadv4i64, i256mem, 4773 SchedWritePHAdd.YMM, 0>, VEX_4V, VEX_L; 4774 defm VPSIGNB : SS3I_binop_rm_int_y<0x08, "vpsignb", int_x86_avx2_psign_b, 4775 SchedWriteVecALU.YMM>, VEX_4V, VEX_L, VEX_WIG; 4776 defm VPSIGNW : SS3I_binop_rm_int_y<0x09, "vpsignw", int_x86_avx2_psign_w, 4777 SchedWriteVecALU.YMM>, VEX_4V, VEX_L, VEX_WIG; 4778 defm VPSIGND : SS3I_binop_rm_int_y<0x0A, "vpsignd", int_x86_avx2_psign_d, 4779 SchedWriteVecALU.YMM>, VEX_4V, VEX_L, VEX_WIG; 4780 defm VPHADDSW : SS3I_binop_rm_int_y<0x03, "vphaddsw", 4781 int_x86_avx2_phadd_sw, 4782 SchedWritePHAdd.YMM>, VEX_4V, VEX_L, VEX_WIG; 4783 defm VPHSUBSW : SS3I_binop_rm_int_y<0x07, "vphsubsw", 4784 int_x86_avx2_phsub_sw, 4785 SchedWritePHAdd.YMM>, VEX_4V, VEX_L, VEX_WIG; 4786} 4787} 4788 4789// None of these have i8 immediate fields. 
let ImmT = NoImm, Constraints = "$src1 = $dst" in {
let isCommutable = 0 in {
  defm PHADDW : SS3I_binop_rm<0x01, "phaddw", X86hadd, v8i16, v8i16, VR128,
                              memopv2i64, i128mem, SchedWritePHAdd.XMM>;
  defm PHADDD : SS3I_binop_rm<0x02, "phaddd", X86hadd, v4i32, v4i32, VR128,
                              memopv2i64, i128mem, SchedWritePHAdd.XMM>;
  defm PHSUBW : SS3I_binop_rm<0x05, "phsubw", X86hsub, v8i16, v8i16, VR128,
                              memopv2i64, i128mem, SchedWritePHAdd.XMM>;
  defm PHSUBD : SS3I_binop_rm<0x06, "phsubd", X86hsub, v4i32, v4i32, VR128,
                              memopv2i64, i128mem, SchedWritePHAdd.XMM>;
  defm PSIGNB : SS3I_binop_rm_int<0x08, "psignb", int_x86_ssse3_psign_b_128,
                                  SchedWriteVecALU.XMM, memopv2i64>;
  defm PSIGNW : SS3I_binop_rm_int<0x09, "psignw", int_x86_ssse3_psign_w_128,
                                  SchedWriteVecALU.XMM, memopv2i64>;
  defm PSIGND : SS3I_binop_rm_int<0x0A, "psignd", int_x86_ssse3_psign_d_128,
                                  SchedWriteVecALU.XMM, memopv2i64>;
  defm PSHUFB : SS3I_binop_rm<0x00, "pshufb", X86pshufb, v16i8, v16i8, VR128,
                              memopv2i64, i128mem, SchedWriteVarShuffle.XMM>;
  defm PHADDSW : SS3I_binop_rm_int<0x03, "phaddsw",
                                   int_x86_ssse3_phadd_sw_128,
                                   SchedWritePHAdd.XMM, memopv2i64>;
  defm PHSUBSW : SS3I_binop_rm_int<0x07, "phsubsw",
                                   int_x86_ssse3_phsub_sw_128,
                                   SchedWritePHAdd.XMM, memopv2i64>;
  defm PMADDUBSW : SS3I_binop_rm<0x04, "pmaddubsw", X86vpmaddubsw, v8i16,
                                 v16i8, VR128, memopv2i64, i128mem,
                                 SchedWriteVecIMul.XMM>;
}
defm PMULHRSW : SS3I_binop_rm<0x0B, "pmulhrsw", X86mulhrs, v8i16, v8i16,
                              VR128, memopv2i64, i128mem, SchedWriteVecIMul.XMM>;
}

//===---------------------------------------------------------------------===//
// SSSE3 - Packed Align Instruction Patterns
//===---------------------------------------------------------------------===//

multiclass ssse3_palignr<string asm, ValueType VT, RegisterClass RC,
                         PatFrag memop_frag, X86MemOperand x86memop,
                         X86FoldableSchedWrite sched, bit Is2Addr = 1> {
  let hasSideEffects = 0 in {
  def rri : SS3AI<0x0F, MRMSrcReg, (outs RC:$dst),
                  (ins RC:$src1, RC:$src2, u8imm:$src3),
                  !if(Is2Addr,
                    !strconcat(asm, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
                    !strconcat(asm,
                               "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
                  [(set RC:$dst, (VT (X86PAlignr RC:$src1, RC:$src2, (i8 imm:$src3))))]>,
                  Sched<[sched]>;
  let mayLoad = 1 in
  def rmi : SS3AI<0x0F, MRMSrcMem, (outs RC:$dst),
                  (ins RC:$src1, x86memop:$src2, u8imm:$src3),
                  !if(Is2Addr,
                    !strconcat(asm, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
                    !strconcat(asm,
                               "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
                  [(set RC:$dst, (VT (X86PAlignr RC:$src1,
                                      (bitconvert (memop_frag addr:$src2)),
                                      (i8 imm:$src3))))]>,
                  Sched<[sched.Folded, ReadAfterLd]>;
  }
}

let Predicates = [HasAVX, NoVLX_Or_NoBWI] in
  defm VPALIGNR : ssse3_palignr<"vpalignr", v16i8, VR128, loadv2i64, i128mem,
                                SchedWriteShuffle.XMM, 0>, VEX_4V, VEX_WIG;
let Predicates = [HasAVX2, NoVLX_Or_NoBWI] in
  defm VPALIGNRY : ssse3_palignr<"vpalignr", v32i8, VR256, loadv4i64, i256mem,
                                 SchedWriteShuffle.YMM, 0>, VEX_4V, VEX_L, VEX_WIG;
let Constraints = "$src1 = $dst", Predicates = [UseSSSE3] in
  defm PALIGNR : ssse3_palignr<"palignr", v16i8, VR128, memopv2i64, i128mem,
                               SchedWriteShuffle.XMM>;

//===---------------------------------------------------------------------===//
// SSE3 - Thread synchronization
//===---------------------------------------------------------------------===//

let SchedRW = [WriteSystem] in {
let usesCustomInserter = 1 in {
def MONITOR : PseudoI<(outs), (ins i32mem:$src1, GR32:$src2, GR32:$src3),
                      [(int_x86_sse3_monitor addr:$src1, GR32:$src2, GR32:$src3)]>,
              Requires<[HasSSE3]>;
}

let Uses = [EAX, ECX, EDX] in
def MONITORrrr : I<0x01, MRM_C8, (outs), (ins), "monitor", []>,
                 TB, Requires<[HasSSE3]>;

let Uses = [ECX, EAX] in
def MWAITrr : I<0x01, MRM_C9, (outs), (ins), "mwait",
                [(int_x86_sse3_mwait ECX, EAX)]>, TB, Requires<[HasSSE3]>;
} // SchedRW

def : InstAlias<"mwait\t{%eax, %ecx|ecx, eax}", (MWAITrr)>, Requires<[Not64BitMode]>;
def : InstAlias<"mwait\t{%rax, %rcx|rcx, rax}", (MWAITrr)>, Requires<[In64BitMode]>;

def : InstAlias<"monitor\t{%eax, %ecx, %edx|edx, ecx, eax}", (MONITORrrr)>,
      Requires<[Not64BitMode]>;
def : InstAlias<"monitor\t{%rax, %rcx, %rdx|rdx, rcx, rax}", (MONITORrrr)>,
      Requires<[In64BitMode]>;

//===----------------------------------------------------------------------===//
// SSE4.1 - Packed Move with Sign/Zero Extend
//===----------------------------------------------------------------------===//

multiclass SS41I_pmovx_rrrm<bits<8> opc, string OpcodeStr, X86MemOperand MemOp,
                            RegisterClass OutRC, RegisterClass InRC,
                            X86FoldableSchedWrite sched> {
  def rr : SS48I<opc, MRMSrcReg, (outs OutRC:$dst), (ins InRC:$src),
                 !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), []>,
                 Sched<[sched]>;

  def rm : SS48I<opc, MRMSrcMem, (outs OutRC:$dst), (ins MemOp:$src),
                 !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), []>,
                 Sched<[sched.Folded]>;
}

multiclass SS41I_pmovx_rm_all<bits<8> opc, string OpcodeStr,
                              X86MemOperand MemOp, X86MemOperand MemYOp,
                              Predicate prd> {
  defm NAME : SS41I_pmovx_rrrm<opc, OpcodeStr, MemOp, VR128, VR128,
                               SchedWriteShuffle.XMM>;
  let Predicates = [HasAVX, prd] in
    defm V#NAME : SS41I_pmovx_rrrm<opc, !strconcat("v", OpcodeStr), MemOp,
                                   VR128, VR128, SchedWriteShuffle.XMM>,
                                   VEX, VEX_WIG;
  let Predicates = [HasAVX2, prd] in
    defm V#NAME#Y : SS41I_pmovx_rrrm<opc, !strconcat("v", OpcodeStr), MemYOp,
                                     VR256, VR128, WriteShuffle256>,
                                     VEX, VEX_L, VEX_WIG;
}

multiclass SS41I_pmovx_rm<bits<8> opc, string OpcodeStr, X86MemOperand MemOp,
                          X86MemOperand MemYOp, Predicate prd> {
  defm PMOVSX#NAME : SS41I_pmovx_rm_all<opc, !strconcat("pmovsx", OpcodeStr),
                                        MemOp, MemYOp, prd>;
  defm PMOVZX#NAME : SS41I_pmovx_rm_all<!add(opc, 0x10),
                                        !strconcat("pmovzx", OpcodeStr),
                                        MemOp, MemYOp, prd>;
}

defm BW : SS41I_pmovx_rm<0x20, "bw", i64mem, i128mem, NoVLX_Or_NoBWI>;
defm WD : SS41I_pmovx_rm<0x23, "wd", i64mem, i128mem, NoVLX>;
defm DQ : SS41I_pmovx_rm<0x25, "dq", i64mem, i128mem, NoVLX>;

defm BD : SS41I_pmovx_rm<0x21, "bd", i32mem, i64mem, NoVLX>;
defm WQ : SS41I_pmovx_rm<0x24, "wq", i32mem, i64mem, NoVLX>;

defm BQ : SS41I_pmovx_rm<0x22, "bq", i16mem, i32mem, NoVLX>;

// AVX2 Patterns
multiclass SS41I_pmovx_avx2_patterns<string OpcPrefix, string ExtTy, SDNode ExtOp> {
  // Register-Register patterns
  let Predicates = [HasAVX, NoVLX_Or_NoBWI] in {
    def : Pat<(v16i16 (ExtOp (v16i8 VR128:$src))),
              (!cast<I>(OpcPrefix#BWYrr) VR128:$src)>;
  }
  let Predicates = [HasAVX, NoVLX] in {
def : Pat<(v8i32 (ExtOp (v16i8 VR128:$src))), 4948 (!cast<I>(OpcPrefix#BDYrr) VR128:$src)>; 4949 def : Pat<(v4i64 (ExtOp (v16i8 VR128:$src))), 4950 (!cast<I>(OpcPrefix#BQYrr) VR128:$src)>; 4951 4952 def : Pat<(v8i32 (ExtOp (v8i16 VR128:$src))), 4953 (!cast<I>(OpcPrefix#WDYrr) VR128:$src)>; 4954 def : Pat<(v4i64 (ExtOp (v8i16 VR128:$src))), 4955 (!cast<I>(OpcPrefix#WQYrr) VR128:$src)>; 4956 4957 def : Pat<(v4i64 (ExtOp (v4i32 VR128:$src))), 4958 (!cast<I>(OpcPrefix#DQYrr) VR128:$src)>; 4959 } 4960 4961 // Simple Register-Memory patterns 4962 let Predicates = [HasAVX, NoVLX_Or_NoBWI] in { 4963 def : Pat<(v16i16 (!cast<PatFrag>(ExtTy#"extloadvi8") addr:$src)), 4964 (!cast<I>(OpcPrefix#BWYrm) addr:$src)>; 4965 } 4966 let Predicates = [HasAVX, NoVLX] in { 4967 def : Pat<(v8i32 (!cast<PatFrag>(ExtTy#"extloadvi8") addr:$src)), 4968 (!cast<I>(OpcPrefix#BDYrm) addr:$src)>; 4969 def : Pat<(v4i64 (!cast<PatFrag>(ExtTy#"extloadvi8") addr:$src)), 4970 (!cast<I>(OpcPrefix#BQYrm) addr:$src)>; 4971 4972 def : Pat<(v8i32 (!cast<PatFrag>(ExtTy#"extloadvi16") addr:$src)), 4973 (!cast<I>(OpcPrefix#WDYrm) addr:$src)>; 4974 def : Pat<(v4i64 (!cast<PatFrag>(ExtTy#"extloadvi16") addr:$src)), 4975 (!cast<I>(OpcPrefix#WQYrm) addr:$src)>; 4976 4977 def : Pat<(v4i64 (!cast<PatFrag>(ExtTy#"extloadvi32") addr:$src)), 4978 (!cast<I>(OpcPrefix#DQYrm) addr:$src)>; 4979 } 4980 4981 // AVX2 Register-Memory patterns 4982 let Predicates = [HasAVX, NoVLX_Or_NoBWI] in { 4983 def : Pat<(v16i16 (ExtOp (bc_v16i8 (loadv2i64 addr:$src)))), 4984 (!cast<I>(OpcPrefix#BWYrm) addr:$src)>; 4985 def : Pat<(v16i16 (ExtOp (v16i8 (vzmovl_v2i64 addr:$src)))), 4986 (!cast<I>(OpcPrefix#BWYrm) addr:$src)>; 4987 def : Pat<(v16i16 (ExtOp (v16i8 (vzload_v2i64 addr:$src)))), 4988 (!cast<I>(OpcPrefix#BWYrm) addr:$src)>; 4989 } 4990 let Predicates = [HasAVX, NoVLX] in { 4991 def : Pat<(v8i32 (ExtOp (bc_v16i8 (v2i64 (scalar_to_vector (loadi64 addr:$src)))))), 4992 (!cast<I>(OpcPrefix#BDYrm) addr:$src)>; 4993 def : Pat<(v8i32 (ExtOp (v16i8 (vzmovl_v2i64 addr:$src)))), 4994 (!cast<I>(OpcPrefix#BDYrm) addr:$src)>; 4995 def : Pat<(v8i32 (ExtOp (v16i8 (vzload_v2i64 addr:$src)))), 4996 (!cast<I>(OpcPrefix#BDYrm) addr:$src)>; 4997 def : Pat<(v8i32 (ExtOp (bc_v16i8 (loadv2i64 addr:$src)))), 4998 (!cast<I>(OpcPrefix#BDYrm) addr:$src)>; 4999 5000 def : Pat<(v4i64 (ExtOp (bc_v16i8 (v4i32 (scalar_to_vector (loadi32 addr:$src)))))), 5001 (!cast<I>(OpcPrefix#BQYrm) addr:$src)>; 5002 def : Pat<(v4i64 (ExtOp (v16i8 (vzmovl_v4i32 addr:$src)))), 5003 (!cast<I>(OpcPrefix#BQYrm) addr:$src)>; 5004 def : Pat<(v4i64 (ExtOp (v16i8 (vzload_v2i64 addr:$src)))), 5005 (!cast<I>(OpcPrefix#BQYrm) addr:$src)>; 5006 def : Pat<(v4i64 (ExtOp (bc_v16i8 (loadv2i64 addr:$src)))), 5007 (!cast<I>(OpcPrefix#BQYrm) addr:$src)>; 5008 5009 def : Pat<(v8i32 (ExtOp (bc_v8i16 (loadv2i64 addr:$src)))), 5010 (!cast<I>(OpcPrefix#WDYrm) addr:$src)>; 5011 def : Pat<(v8i32 (ExtOp (v8i16 (vzmovl_v2i64 addr:$src)))), 5012 (!cast<I>(OpcPrefix#WDYrm) addr:$src)>; 5013 def : Pat<(v8i32 (ExtOp (v8i16 (vzload_v2i64 addr:$src)))), 5014 (!cast<I>(OpcPrefix#WDYrm) addr:$src)>; 5015 5016 def : Pat<(v4i64 (ExtOp (bc_v8i16 (v2i64 (scalar_to_vector (loadi64 addr:$src)))))), 5017 (!cast<I>(OpcPrefix#WQYrm) addr:$src)>; 5018 def : Pat<(v4i64 (ExtOp (v8i16 (vzmovl_v2i64 addr:$src)))), 5019 (!cast<I>(OpcPrefix#WQYrm) addr:$src)>; 5020 def : Pat<(v4i64 (ExtOp (v8i16 (vzload_v2i64 addr:$src)))), 5021 (!cast<I>(OpcPrefix#WQYrm) addr:$src)>; 5022 def : Pat<(v4i64 (ExtOp (bc_v8i16 (loadv2i64 addr:$src)))), 5023 
(!cast<I>(OpcPrefix#WQYrm) addr:$src)>; 5024 5025 def : Pat<(v4i64 (ExtOp (bc_v4i32 (loadv2i64 addr:$src)))), 5026 (!cast<I>(OpcPrefix#DQYrm) addr:$src)>; 5027 def : Pat<(v4i64 (ExtOp (v4i32 (vzmovl_v2i64 addr:$src)))), 5028 (!cast<I>(OpcPrefix#DQYrm) addr:$src)>; 5029 def : Pat<(v4i64 (ExtOp (v4i32 (vzload_v2i64 addr:$src)))), 5030 (!cast<I>(OpcPrefix#DQYrm) addr:$src)>; 5031 } 5032} 5033 5034defm : SS41I_pmovx_avx2_patterns<"VPMOVSX", "s", X86vsext>; 5035defm : SS41I_pmovx_avx2_patterns<"VPMOVZX", "z", X86vzext>; 5036 5037// SSE4.1/AVX patterns. 5038multiclass SS41I_pmovx_patterns<string OpcPrefix, string ExtTy, 5039 SDNode ExtOp> { 5040 let Predicates = [HasAVX, NoVLX_Or_NoBWI] in { 5041 def : Pat<(v8i16 (ExtOp (v16i8 VR128:$src))), 5042 (!cast<I>(OpcPrefix#BWrr) VR128:$src)>; 5043 } 5044 let Predicates = [HasAVX, NoVLX] in { 5045 def : Pat<(v4i32 (ExtOp (v16i8 VR128:$src))), 5046 (!cast<I>(OpcPrefix#BDrr) VR128:$src)>; 5047 def : Pat<(v2i64 (ExtOp (v16i8 VR128:$src))), 5048 (!cast<I>(OpcPrefix#BQrr) VR128:$src)>; 5049 5050 def : Pat<(v4i32 (ExtOp (v8i16 VR128:$src))), 5051 (!cast<I>(OpcPrefix#WDrr) VR128:$src)>; 5052 def : Pat<(v2i64 (ExtOp (v8i16 VR128:$src))), 5053 (!cast<I>(OpcPrefix#WQrr) VR128:$src)>; 5054 5055 def : Pat<(v2i64 (ExtOp (v4i32 VR128:$src))), 5056 (!cast<I>(OpcPrefix#DQrr) VR128:$src)>; 5057 } 5058 let Predicates = [HasAVX, NoVLX_Or_NoBWI] in { 5059 def : Pat<(v8i16 (!cast<PatFrag>(ExtTy#"extloadvi8") addr:$src)), 5060 (!cast<I>(OpcPrefix#BWrm) addr:$src)>; 5061 } 5062 let Predicates = [HasAVX, NoVLX] in { 5063 def : Pat<(v4i32 (!cast<PatFrag>(ExtTy#"extloadvi8") addr:$src)), 5064 (!cast<I>(OpcPrefix#BDrm) addr:$src)>; 5065 def : Pat<(v2i64 (!cast<PatFrag>(ExtTy#"extloadvi8") addr:$src)), 5066 (!cast<I>(OpcPrefix#BQrm) addr:$src)>; 5067 5068 def : Pat<(v4i32 (!cast<PatFrag>(ExtTy#"extloadvi16") addr:$src)), 5069 (!cast<I>(OpcPrefix#WDrm) addr:$src)>; 5070 def : Pat<(v2i64 (!cast<PatFrag>(ExtTy#"extloadvi16") addr:$src)), 5071 (!cast<I>(OpcPrefix#WQrm) addr:$src)>; 5072 5073 def : Pat<(v2i64 (!cast<PatFrag>(ExtTy#"extloadvi32") addr:$src)), 5074 (!cast<I>(OpcPrefix#DQrm) addr:$src)>; 5075 } 5076 let Predicates = [HasAVX, NoVLX_Or_NoBWI] in { 5077 def : Pat<(v8i16 (ExtOp (bc_v16i8 (v2i64 (scalar_to_vector (loadi64 addr:$src)))))), 5078 (!cast<I>(OpcPrefix#BWrm) addr:$src)>; 5079 def : Pat<(v8i16 (ExtOp (bc_v16i8 (v2f64 (scalar_to_vector (loadf64 addr:$src)))))), 5080 (!cast<I>(OpcPrefix#BWrm) addr:$src)>; 5081 def : Pat<(v8i16 (ExtOp (v16i8 (vzmovl_v2i64 addr:$src)))), 5082 (!cast<I>(OpcPrefix#BWrm) addr:$src)>; 5083 def : Pat<(v8i16 (ExtOp (v16i8 (vzload_v2i64 addr:$src)))), 5084 (!cast<I>(OpcPrefix#BWrm) addr:$src)>; 5085 def : Pat<(v8i16 (ExtOp (bc_v16i8 (loadv2i64 addr:$src)))), 5086 (!cast<I>(OpcPrefix#BWrm) addr:$src)>; 5087 } 5088 let Predicates = [HasAVX, NoVLX] in { 5089 def : Pat<(v4i32 (ExtOp (bc_v16i8 (v4i32 (scalar_to_vector (loadi32 addr:$src)))))), 5090 (!cast<I>(OpcPrefix#BDrm) addr:$src)>; 5091 def : Pat<(v4i32 (ExtOp (v16i8 (vzmovl_v4i32 addr:$src)))), 5092 (!cast<I>(OpcPrefix#BDrm) addr:$src)>; 5093 def : Pat<(v4i32 (ExtOp (v16i8 (vzload_v2i64 addr:$src)))), 5094 (!cast<I>(OpcPrefix#BDrm) addr:$src)>; 5095 def : Pat<(v4i32 (ExtOp (bc_v16i8 (loadv2i64 addr:$src)))), 5096 (!cast<I>(OpcPrefix#BDrm) addr:$src)>; 5097 5098 def : Pat<(v2i64 (ExtOp (bc_v16i8 (v4i32 (scalar_to_vector (extloadi32i16 addr:$src)))))), 5099 (!cast<I>(OpcPrefix#BQrm) addr:$src)>; 5100 def : Pat<(v2i64 (ExtOp (v16i8 (vzmovl_v4i32 addr:$src)))), 5101 (!cast<I>(OpcPrefix#BQrm) 
                                        addr:$src)>;
    def : Pat<(v2i64 (ExtOp (v16i8 (vzload_v2i64 addr:$src)))),
              (!cast<I>(OpcPrefix#BQrm) addr:$src)>;
    def : Pat<(v2i64 (ExtOp (bc_v16i8 (loadv2i64 addr:$src)))),
              (!cast<I>(OpcPrefix#BQrm) addr:$src)>;

    def : Pat<(v4i32 (ExtOp (bc_v8i16 (v2i64 (scalar_to_vector (loadi64 addr:$src)))))),
              (!cast<I>(OpcPrefix#WDrm) addr:$src)>;
    def : Pat<(v4i32 (ExtOp (bc_v8i16 (v2f64 (scalar_to_vector (loadf64 addr:$src)))))),
              (!cast<I>(OpcPrefix#WDrm) addr:$src)>;
    def : Pat<(v4i32 (ExtOp (v8i16 (vzmovl_v2i64 addr:$src)))),
              (!cast<I>(OpcPrefix#WDrm) addr:$src)>;
    def : Pat<(v4i32 (ExtOp (v8i16 (vzload_v2i64 addr:$src)))),
              (!cast<I>(OpcPrefix#WDrm) addr:$src)>;
    def : Pat<(v4i32 (ExtOp (bc_v8i16 (loadv2i64 addr:$src)))),
              (!cast<I>(OpcPrefix#WDrm) addr:$src)>;

    def : Pat<(v2i64 (ExtOp (bc_v8i16 (v4i32 (scalar_to_vector (loadi32 addr:$src)))))),
              (!cast<I>(OpcPrefix#WQrm) addr:$src)>;
    def : Pat<(v2i64 (ExtOp (v8i16 (vzmovl_v4i32 addr:$src)))),
              (!cast<I>(OpcPrefix#WQrm) addr:$src)>;
    def : Pat<(v2i64 (ExtOp (v8i16 (vzload_v2i64 addr:$src)))),
              (!cast<I>(OpcPrefix#WQrm) addr:$src)>;
    def : Pat<(v2i64 (ExtOp (bc_v8i16 (loadv2i64 addr:$src)))),
              (!cast<I>(OpcPrefix#WQrm) addr:$src)>;

    def : Pat<(v2i64 (ExtOp (bc_v4i32 (v2i64 (scalar_to_vector (loadi64 addr:$src)))))),
              (!cast<I>(OpcPrefix#DQrm) addr:$src)>;
    def : Pat<(v2i64 (ExtOp (bc_v4i32 (v2f64 (scalar_to_vector (loadf64 addr:$src)))))),
              (!cast<I>(OpcPrefix#DQrm) addr:$src)>;
    def : Pat<(v2i64 (ExtOp (v4i32 (vzmovl_v2i64 addr:$src)))),
              (!cast<I>(OpcPrefix#DQrm) addr:$src)>;
    def : Pat<(v2i64 (ExtOp (v4i32 (vzload_v2i64 addr:$src)))),
              (!cast<I>(OpcPrefix#DQrm) addr:$src)>;
    def : Pat<(v2i64 (ExtOp (bc_v4i32 (loadv2i64 addr:$src)))),
              (!cast<I>(OpcPrefix#DQrm) addr:$src)>;
  }
}

defm : SS41I_pmovx_patterns<"VPMOVSX", "s", sext_invec>;
defm : SS41I_pmovx_patterns<"VPMOVZX", "z", zext_invec>;

let Predicates = [UseSSE41] in {
  defm : SS41I_pmovx_patterns<"PMOVSX", "s", sext_invec>;
  defm : SS41I_pmovx_patterns<"PMOVZX", "z", zext_invec>;
}

//===----------------------------------------------------------------------===//
// SSE4.1 - Extract Instructions
//===----------------------------------------------------------------------===//

/// SS41I_extract8 - SSE 4.1 extract 8 bits to 32 bit reg or 8 bit mem
multiclass SS41I_extract8<bits<8> opc, string OpcodeStr> {
  def rr : SS4AIi8<opc, MRMDestReg, (outs GR32orGR64:$dst),
                   (ins VR128:$src1, u8imm:$src2),
                   !strconcat(OpcodeStr,
                              "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
                   [(set GR32orGR64:$dst, (X86pextrb (v16i8 VR128:$src1),
                                                     imm:$src2))]>,
                   Sched<[WriteVecExtract]>;
  let hasSideEffects = 0, mayStore = 1 in
  def mr : SS4AIi8<opc, MRMDestMem, (outs),
                   (ins i8mem:$dst, VR128:$src1, u8imm:$src2),
                   !strconcat(OpcodeStr,
                              "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
                   [(store (i8 (trunc (X86pextrb (v16i8 VR128:$src1), imm:$src2))),
                           addr:$dst)]>, Sched<[WriteVecExtractSt]>;
}

let Predicates = [HasAVX, NoBWI] in
  defm VPEXTRB : SS41I_extract8<0x14, "vpextrb">, VEX;

defm PEXTRB : SS41I_extract8<0x14, "pextrb">;


/// SS41I_extract16 - SSE 4.1 extract 16 bits to memory destination
multiclass SS41I_extract16<bits<8> opc, string OpcodeStr> {
  let isCodeGenOnly = 1, ForceDisassemble = 1,
      hasSideEffects = 0 in
  def rr_REV : SS4AIi8<opc, MRMDestReg, (outs GR32orGR64:$dst),
                       (ins VR128:$src1, u8imm:$src2),
                       !strconcat(OpcodeStr,
                                  "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), []>,
                       Sched<[WriteVecExtract]>, FoldGenData<NAME#rr>;

  let hasSideEffects = 0, mayStore = 1 in
  def mr : SS4AIi8<opc, MRMDestMem, (outs),
                   (ins i16mem:$dst, VR128:$src1, u8imm:$src2),
                   !strconcat(OpcodeStr,
                              "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
                   [(store (i16 (trunc (X86pextrw (v8i16 VR128:$src1), imm:$src2))),
                           addr:$dst)]>, Sched<[WriteVecExtractSt]>;
}

let Predicates = [HasAVX, NoBWI] in
  defm VPEXTRW : SS41I_extract16<0x15, "vpextrw">, VEX;

defm PEXTRW : SS41I_extract16<0x15, "pextrw">;


/// SS41I_extract32 - SSE 4.1 extract 32 bits to int reg or memory destination
multiclass SS41I_extract32<bits<8> opc, string OpcodeStr> {
  def rr : SS4AIi8<opc, MRMDestReg, (outs GR32:$dst),
                   (ins VR128:$src1, u8imm:$src2),
                   !strconcat(OpcodeStr,
                              "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
                   [(set GR32:$dst,
                     (extractelt (v4i32 VR128:$src1), imm:$src2))]>,
                   Sched<[WriteVecExtract]>;
  def mr : SS4AIi8<opc, MRMDestMem, (outs),
                   (ins i32mem:$dst, VR128:$src1, u8imm:$src2),
                   !strconcat(OpcodeStr,
                              "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
                   [(store (extractelt (v4i32 VR128:$src1), imm:$src2),
                           addr:$dst)]>, Sched<[WriteVecExtractSt]>;
}

let Predicates = [HasAVX, NoDQI] in
  defm VPEXTRD : SS41I_extract32<0x16, "vpextrd">, VEX;

defm PEXTRD : SS41I_extract32<0x16, "pextrd">;

/// SS41I_extract64 - SSE 4.1 extract 64 bits to int reg or memory destination
multiclass SS41I_extract64<bits<8> opc, string OpcodeStr> {
  def rr : SS4AIi8<opc, MRMDestReg, (outs GR64:$dst),
                   (ins VR128:$src1, u8imm:$src2),
                   !strconcat(OpcodeStr,
                              "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
                   [(set GR64:$dst,
                     (extractelt (v2i64 VR128:$src1), imm:$src2))]>,
                   Sched<[WriteVecExtract]>;
  def mr : SS4AIi8<opc, MRMDestMem, (outs),
                   (ins i64mem:$dst, VR128:$src1, u8imm:$src2),
                   !strconcat(OpcodeStr,
                              "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
                   [(store (extractelt (v2i64 VR128:$src1), imm:$src2),
                           addr:$dst)]>, Sched<[WriteVecExtractSt]>;
}

let Predicates = [HasAVX, NoDQI] in
  defm VPEXTRQ : SS41I_extract64<0x16, "vpextrq">, VEX, VEX_W;

defm PEXTRQ : SS41I_extract64<0x16, "pextrq">, REX_W;

/// SS41I_extractf32 - SSE 4.1 extract 32 bits fp value to int reg or memory
/// destination
multiclass SS41I_extractf32<bits<8> opc, string OpcodeStr> {
  def rr : SS4AIi8<opc, MRMDestReg, (outs GR32orGR64:$dst),
                   (ins VR128:$src1, u8imm:$src2),
                   !strconcat(OpcodeStr,
                              "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
                   [(set GR32orGR64:$dst,
                     (extractelt (bc_v4i32 (v4f32 VR128:$src1)), imm:$src2))]>,
                   Sched<[WriteVecExtract]>;
  def mr : SS4AIi8<opc, MRMDestMem, (outs),
                   (ins f32mem:$dst, VR128:$src1, u8imm:$src2),
                   !strconcat(OpcodeStr,
                              "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
                   [(store (extractelt (bc_v4i32 (v4f32 VR128:$src1)), imm:$src2),
                           addr:$dst)]>, Sched<[WriteVecExtractSt]>;
}

let ExeDomain = SSEPackedSingle in {
  let Predicates = [UseAVX] in
    defm VEXTRACTPS : SS41I_extractf32<0x17, "vextractps">, VEX, VEX_WIG;
  defm EXTRACTPS : SS41I_extractf32<0x17, "extractps">;
}

// Also match
an EXTRACTPS store when the store is done as f32 instead of i32. 5269def : Pat<(store (f32 (bitconvert (extractelt (bc_v4i32 (v4f32 VR128:$src1)), 5270 imm:$src2))), 5271 addr:$dst), 5272 (VEXTRACTPSmr addr:$dst, VR128:$src1, imm:$src2)>, 5273 Requires<[HasAVX]>; 5274def : Pat<(store (f32 (bitconvert (extractelt (bc_v4i32 (v4f32 VR128:$src1)), 5275 imm:$src2))), 5276 addr:$dst), 5277 (EXTRACTPSmr addr:$dst, VR128:$src1, imm:$src2)>, 5278 Requires<[UseSSE41]>; 5279 5280//===----------------------------------------------------------------------===// 5281// SSE4.1 - Insert Instructions 5282//===----------------------------------------------------------------------===// 5283 5284multiclass SS41I_insert8<bits<8> opc, string asm, bit Is2Addr = 1> { 5285 def rr : SS4AIi8<opc, MRMSrcReg, (outs VR128:$dst), 5286 (ins VR128:$src1, GR32orGR64:$src2, u8imm:$src3), 5287 !if(Is2Addr, 5288 !strconcat(asm, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"), 5289 !strconcat(asm, 5290 "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")), 5291 [(set VR128:$dst, 5292 (X86pinsrb VR128:$src1, GR32orGR64:$src2, imm:$src3))]>, 5293 Sched<[WriteVecInsert]>; 5294 def rm : SS4AIi8<opc, MRMSrcMem, (outs VR128:$dst), 5295 (ins VR128:$src1, i8mem:$src2, u8imm:$src3), 5296 !if(Is2Addr, 5297 !strconcat(asm, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"), 5298 !strconcat(asm, 5299 "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")), 5300 [(set VR128:$dst, 5301 (X86pinsrb VR128:$src1, (extloadi8 addr:$src2), 5302 imm:$src3))]>, Sched<[WriteVecInsertLd, ReadAfterLd]>; 5303} 5304 5305let Predicates = [HasAVX, NoBWI] in 5306 defm VPINSRB : SS41I_insert8<0x20, "vpinsrb", 0>, VEX_4V; 5307let Constraints = "$src1 = $dst" in 5308 defm PINSRB : SS41I_insert8<0x20, "pinsrb">; 5309 5310multiclass SS41I_insert32<bits<8> opc, string asm, bit Is2Addr = 1> { 5311 def rr : SS4AIi8<opc, MRMSrcReg, (outs VR128:$dst), 5312 (ins VR128:$src1, GR32:$src2, u8imm:$src3), 5313 !if(Is2Addr, 5314 !strconcat(asm, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"), 5315 !strconcat(asm, 5316 "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")), 5317 [(set VR128:$dst, 5318 (v4i32 (insertelt VR128:$src1, GR32:$src2, imm:$src3)))]>, 5319 Sched<[WriteVecInsert]>; 5320 def rm : SS4AIi8<opc, MRMSrcMem, (outs VR128:$dst), 5321 (ins VR128:$src1, i32mem:$src2, u8imm:$src3), 5322 !if(Is2Addr, 5323 !strconcat(asm, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"), 5324 !strconcat(asm, 5325 "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")), 5326 [(set VR128:$dst, 5327 (v4i32 (insertelt VR128:$src1, (loadi32 addr:$src2), 5328 imm:$src3)))]>, Sched<[WriteVecInsertLd, ReadAfterLd]>; 5329} 5330 5331let Predicates = [HasAVX, NoDQI] in 5332 defm VPINSRD : SS41I_insert32<0x22, "vpinsrd", 0>, VEX_4V; 5333let Constraints = "$src1 = $dst" in 5334 defm PINSRD : SS41I_insert32<0x22, "pinsrd">; 5335 5336multiclass SS41I_insert64<bits<8> opc, string asm, bit Is2Addr = 1> { 5337 def rr : SS4AIi8<opc, MRMSrcReg, (outs VR128:$dst), 5338 (ins VR128:$src1, GR64:$src2, u8imm:$src3), 5339 !if(Is2Addr, 5340 !strconcat(asm, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"), 5341 !strconcat(asm, 5342 "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")), 5343 [(set VR128:$dst, 5344 (v2i64 (insertelt VR128:$src1, GR64:$src2, imm:$src3)))]>, 5345 Sched<[WriteVecInsert]>; 5346 def rm : SS4AIi8<opc, MRMSrcMem, (outs VR128:$dst), 5347 (ins VR128:$src1, i64mem:$src2, u8imm:$src3), 5348 !if(Is2Addr, 5349 !strconcat(asm, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"), 5350 
!strconcat(asm, 5351 "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")), 5352 [(set VR128:$dst, 5353 (v2i64 (insertelt VR128:$src1, (loadi64 addr:$src2), 5354 imm:$src3)))]>, Sched<[WriteVecInsertLd, ReadAfterLd]>; 5355} 5356 5357let Predicates = [HasAVX, NoDQI] in 5358 defm VPINSRQ : SS41I_insert64<0x22, "vpinsrq", 0>, VEX_4V, VEX_W; 5359let Constraints = "$src1 = $dst" in 5360 defm PINSRQ : SS41I_insert64<0x22, "pinsrq">, REX_W; 5361 5362// insertps has a few different modes, there's the first two here below which 5363// are optimized inserts that won't zero arbitrary elements in the destination 5364// vector. The next one matches the intrinsic and could zero arbitrary elements 5365// in the target vector. 5366multiclass SS41I_insertf32<bits<8> opc, string asm, bit Is2Addr = 1> { 5367 def rr : SS4AIi8<opc, MRMSrcReg, (outs VR128:$dst), 5368 (ins VR128:$src1, VR128:$src2, u8imm:$src3), 5369 !if(Is2Addr, 5370 !strconcat(asm, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"), 5371 !strconcat(asm, 5372 "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")), 5373 [(set VR128:$dst, 5374 (X86insertps VR128:$src1, VR128:$src2, imm:$src3))]>, 5375 Sched<[SchedWriteFShuffle.XMM]>; 5376 def rm : SS4AIi8<opc, MRMSrcMem, (outs VR128:$dst), 5377 (ins VR128:$src1, f32mem:$src2, u8imm:$src3), 5378 !if(Is2Addr, 5379 !strconcat(asm, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"), 5380 !strconcat(asm, 5381 "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")), 5382 [(set VR128:$dst, 5383 (X86insertps VR128:$src1, 5384 (v4f32 (scalar_to_vector (loadf32 addr:$src2))), 5385 imm:$src3))]>, 5386 Sched<[SchedWriteFShuffle.XMM.Folded, ReadAfterLd]>; 5387} 5388 5389let ExeDomain = SSEPackedSingle in { 5390 let Predicates = [UseAVX] in 5391 defm VINSERTPS : SS41I_insertf32<0x21, "vinsertps", 0>, 5392 VEX_4V, VEX_WIG; 5393 let Constraints = "$src1 = $dst" in 5394 defm INSERTPS : SS41I_insertf32<0x21, "insertps", 1>; 5395} 5396 5397let Predicates = [UseAVX] in { 5398 // If we're inserting an element from a vbroadcast of a load, fold the 5399 // load into the X86insertps instruction. 5400 def : Pat<(v4f32 (X86insertps (v4f32 VR128:$src1), 5401 (X86VBroadcast (loadf32 addr:$src2)), imm:$src3)), 5402 (VINSERTPSrm VR128:$src1, addr:$src2, imm:$src3)>; 5403 def : Pat<(v4f32 (X86insertps (v4f32 VR128:$src1), 5404 (X86VBroadcast (loadv4f32 addr:$src2)), imm:$src3)), 5405 (VINSERTPSrm VR128:$src1, addr:$src2, imm:$src3)>; 5406} 5407 5408//===----------------------------------------------------------------------===// 5409// SSE4.1 - Round Instructions 5410//===----------------------------------------------------------------------===// 5411 5412multiclass sse41_fp_unop_p<bits<8> opc, string OpcodeStr, 5413 X86MemOperand x86memop, RegisterClass RC, 5414 ValueType VT, PatFrag mem_frag, SDNode OpNode, 5415 X86FoldableSchedWrite sched> { 5416 // Intrinsic operation, reg. 
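  // Note: $src2 below is the SSE4.1 rounding-control immediate. Bits 1:0
  // select the rounding mode (00 = nearest, 01 = floor, 10 = ceil,
  // 11 = truncate), bit 2 selects the MXCSR rounding mode instead when set,
  // and bit 3 suppresses the precision exception. Hence the selection
  // patterns further down use 0x9 for ffloor, 0xA for fceil, 0xB for ftrunc,
  // 0xC for fnearbyint and 0x4 for frint.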
5417 // Vector intrinsic operation, reg 5418 def r : SS4AIi8<opc, MRMSrcReg, 5419 (outs RC:$dst), (ins RC:$src1, i32u8imm:$src2), 5420 !strconcat(OpcodeStr, 5421 "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), 5422 [(set RC:$dst, (VT (OpNode RC:$src1, imm:$src2)))]>, 5423 Sched<[sched]>; 5424 5425 // Vector intrinsic operation, mem 5426 def m : SS4AIi8<opc, MRMSrcMem, 5427 (outs RC:$dst), (ins x86memop:$src1, i32u8imm:$src2), 5428 !strconcat(OpcodeStr, 5429 "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), 5430 [(set RC:$dst, 5431 (VT (OpNode (mem_frag addr:$src1),imm:$src2)))]>, 5432 Sched<[sched.Folded]>; 5433} 5434 5435multiclass avx_fp_unop_rm<bits<8> opcss, bits<8> opcsd, 5436 string OpcodeStr, X86FoldableSchedWrite sched> { 5437let ExeDomain = SSEPackedSingle, hasSideEffects = 0 in { 5438 def SSr : SS4AIi8<opcss, MRMSrcReg, 5439 (outs FR32:$dst), (ins FR32:$src1, FR32:$src2, i32u8imm:$src3), 5440 !strconcat(OpcodeStr, 5441 "ss\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"), 5442 []>, Sched<[sched]>; 5443 5444 let mayLoad = 1 in 5445 def SSm : SS4AIi8<opcss, MRMSrcMem, 5446 (outs FR32:$dst), (ins FR32:$src1, f32mem:$src2, i32u8imm:$src3), 5447 !strconcat(OpcodeStr, 5448 "ss\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"), 5449 []>, Sched<[sched.Folded, ReadAfterLd]>; 5450} // ExeDomain = SSEPackedSingle, hasSideEffects = 0 5451 5452let ExeDomain = SSEPackedDouble, hasSideEffects = 0 in { 5453 def SDr : SS4AIi8<opcsd, MRMSrcReg, 5454 (outs FR64:$dst), (ins FR64:$src1, FR64:$src2, i32u8imm:$src3), 5455 !strconcat(OpcodeStr, 5456 "sd\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"), 5457 []>, Sched<[sched]>; 5458 5459 let mayLoad = 1 in 5460 def SDm : SS4AIi8<opcsd, MRMSrcMem, 5461 (outs FR64:$dst), (ins FR64:$src1, f64mem:$src2, i32u8imm:$src3), 5462 !strconcat(OpcodeStr, 5463 "sd\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"), 5464 []>, Sched<[sched.Folded, ReadAfterLd]>; 5465} // ExeDomain = SSEPackedDouble, hasSideEffects = 0 5466} 5467 5468multiclass sse41_fp_unop_s<bits<8> opcss, bits<8> opcsd, 5469 string OpcodeStr, X86FoldableSchedWrite sched> { 5470let ExeDomain = SSEPackedSingle, hasSideEffects = 0 in { 5471 def SSr : SS4AIi8<opcss, MRMSrcReg, 5472 (outs FR32:$dst), (ins FR32:$src1, i32u8imm:$src2), 5473 !strconcat(OpcodeStr, 5474 "ss\t{$src2, $src1, $dst|$dst, $src1, $src2}"), 5475 []>, Sched<[sched]>; 5476 5477 let mayLoad = 1 in 5478 def SSm : SS4AIi8<opcss, MRMSrcMem, 5479 (outs FR32:$dst), (ins f32mem:$src1, i32u8imm:$src2), 5480 !strconcat(OpcodeStr, 5481 "ss\t{$src2, $src1, $dst|$dst, $src1, $src2}"), 5482 []>, Sched<[sched.Folded, ReadAfterLd]>; 5483} // ExeDomain = SSEPackedSingle, hasSideEffects = 0 5484 5485let ExeDomain = SSEPackedDouble, hasSideEffects = 0 in { 5486 def SDr : SS4AIi8<opcsd, MRMSrcReg, 5487 (outs FR64:$dst), (ins FR64:$src1, i32u8imm:$src2), 5488 !strconcat(OpcodeStr, 5489 "sd\t{$src2, $src1, $dst|$dst, $src1, $src2}"), 5490 []>, Sched<[sched]>; 5491 5492 let mayLoad = 1 in 5493 def SDm : SS4AIi8<opcsd, MRMSrcMem, 5494 (outs FR64:$dst), (ins f64mem:$src1, i32u8imm:$src2), 5495 !strconcat(OpcodeStr, 5496 "sd\t{$src2, $src1, $dst|$dst, $src1, $src2}"), 5497 []>, Sched<[sched.Folded, ReadAfterLd]>; 5498} // ExeDomain = SSEPackedDouble, hasSideEffects = 0 5499} 5500 5501multiclass sse41_fp_binop_s<bits<8> opcss, bits<8> opcsd, 5502 string OpcodeStr, X86FoldableSchedWrite sched, 5503 ValueType VT32, ValueType VT64, 5504 SDNode OpNode, bit Is2Addr = 1> { 5505let ExeDomain = SSEPackedSingle, isCodeGenOnly = 1 in { 5506 def 
SSr_Int : SS4AIi8<opcss, MRMSrcReg, 5507 (outs VR128:$dst), (ins VR128:$src1, VR128:$src2, i32u8imm:$src3), 5508 !if(Is2Addr, 5509 !strconcat(OpcodeStr, 5510 "ss\t{$src3, $src2, $dst|$dst, $src2, $src3}"), 5511 !strconcat(OpcodeStr, 5512 "ss\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")), 5513 [(set VR128:$dst, (VT32 (OpNode VR128:$src1, VR128:$src2, imm:$src3)))]>, 5514 Sched<[sched]>; 5515 5516 def SSm_Int : SS4AIi8<opcss, MRMSrcMem, 5517 (outs VR128:$dst), (ins VR128:$src1, ssmem:$src2, i32u8imm:$src3), 5518 !if(Is2Addr, 5519 !strconcat(OpcodeStr, 5520 "ss\t{$src3, $src2, $dst|$dst, $src2, $src3}"), 5521 !strconcat(OpcodeStr, 5522 "ss\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")), 5523 [(set VR128:$dst, 5524 (OpNode VR128:$src1, sse_load_f32:$src2, imm:$src3))]>, 5525 Sched<[sched.Folded, ReadAfterLd]>; 5526} // ExeDomain = SSEPackedSingle, isCodeGenOnly = 1 5527 5528let ExeDomain = SSEPackedDouble, isCodeGenOnly = 1 in { 5529 def SDr_Int : SS4AIi8<opcsd, MRMSrcReg, 5530 (outs VR128:$dst), (ins VR128:$src1, VR128:$src2, i32u8imm:$src3), 5531 !if(Is2Addr, 5532 !strconcat(OpcodeStr, 5533 "sd\t{$src3, $src2, $dst|$dst, $src2, $src3}"), 5534 !strconcat(OpcodeStr, 5535 "sd\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")), 5536 [(set VR128:$dst, (VT64 (OpNode VR128:$src1, VR128:$src2, imm:$src3)))]>, 5537 Sched<[sched]>; 5538 5539 def SDm_Int : SS4AIi8<opcsd, MRMSrcMem, 5540 (outs VR128:$dst), (ins VR128:$src1, sdmem:$src2, i32u8imm:$src3), 5541 !if(Is2Addr, 5542 !strconcat(OpcodeStr, 5543 "sd\t{$src3, $src2, $dst|$dst, $src2, $src3}"), 5544 !strconcat(OpcodeStr, 5545 "sd\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")), 5546 [(set VR128:$dst, 5547 (OpNode VR128:$src1, sse_load_f64:$src2, imm:$src3))]>, 5548 Sched<[sched.Folded, ReadAfterLd]>; 5549} // ExeDomain = SSEPackedDouble, isCodeGenOnly = 1 5550} 5551 5552// FP round - roundss, roundps, roundsd, roundpd 5553let Predicates = [HasAVX, NoVLX] in { 5554 let ExeDomain = SSEPackedSingle in { 5555 // Intrinsic form 5556 defm VROUNDPS : sse41_fp_unop_p<0x08, "vroundps", f128mem, VR128, v4f32, 5557 loadv4f32, X86VRndScale, SchedWriteFRnd.XMM>, 5558 VEX, VEX_WIG; 5559 defm VROUNDPSY : sse41_fp_unop_p<0x08, "vroundps", f256mem, VR256, v8f32, 5560 loadv8f32, X86VRndScale, SchedWriteFRnd.YMM>, 5561 VEX, VEX_L, VEX_WIG; 5562 } 5563 5564 let ExeDomain = SSEPackedDouble in { 5565 defm VROUNDPD : sse41_fp_unop_p<0x09, "vroundpd", f128mem, VR128, v2f64, 5566 loadv2f64, X86VRndScale, SchedWriteFRnd.XMM>, 5567 VEX, VEX_WIG; 5568 defm VROUNDPDY : sse41_fp_unop_p<0x09, "vroundpd", f256mem, VR256, v4f64, 5569 loadv4f64, X86VRndScale, SchedWriteFRnd.YMM>, 5570 VEX, VEX_L, VEX_WIG; 5571 } 5572} 5573let Predicates = [HasAVX, NoAVX512] in { 5574 defm VROUND : sse41_fp_binop_s<0x0A, 0x0B, "vround", SchedWriteFRnd.Scl, 5575 v4f32, v2f64, X86RndScales, 0>, 5576 VEX_4V, VEX_LIG, VEX_WIG; 5577 defm VROUND : avx_fp_unop_rm<0x0A, 0x0B, "vround", SchedWriteFRnd.Scl>, 5578 VEX_4V, VEX_LIG, VEX_WIG; 5579} 5580 5581let Predicates = [UseAVX] in { 5582 def : Pat<(ffloor FR32:$src), 5583 (VROUNDSSr (f32 (IMPLICIT_DEF)), FR32:$src, (i32 0x9))>; 5584 def : Pat<(f32 (fnearbyint FR32:$src)), 5585 (VROUNDSSr (f32 (IMPLICIT_DEF)), FR32:$src, (i32 0xC))>; 5586 def : Pat<(f32 (fceil FR32:$src)), 5587 (VROUNDSSr (f32 (IMPLICIT_DEF)), FR32:$src, (i32 0xA))>; 5588 def : Pat<(f32 (frint FR32:$src)), 5589 (VROUNDSSr (f32 (IMPLICIT_DEF)), FR32:$src, (i32 0x4))>; 5590 def : Pat<(f32 (ftrunc FR32:$src)), 5591 (VROUNDSSr (f32 (IMPLICIT_DEF)), 
FR32:$src, (i32 0xB))>; 5592 5593 def : Pat<(f64 (ffloor FR64:$src)), 5594 (VROUNDSDr (f64 (IMPLICIT_DEF)), FR64:$src, (i32 0x9))>; 5595 def : Pat<(f64 (fnearbyint FR64:$src)), 5596 (VROUNDSDr (f64 (IMPLICIT_DEF)), FR64:$src, (i32 0xC))>; 5597 def : Pat<(f64 (fceil FR64:$src)), 5598 (VROUNDSDr (f64 (IMPLICIT_DEF)), FR64:$src, (i32 0xA))>; 5599 def : Pat<(f64 (frint FR64:$src)), 5600 (VROUNDSDr (f64 (IMPLICIT_DEF)), FR64:$src, (i32 0x4))>; 5601 def : Pat<(f64 (ftrunc FR64:$src)), 5602 (VROUNDSDr (f64 (IMPLICIT_DEF)), FR64:$src, (i32 0xB))>; 5603} 5604 5605let Predicates = [UseAVX, OptForSize] in { 5606 def : Pat<(ffloor (loadf32 addr:$src)), 5607 (VROUNDSSm (f32 (IMPLICIT_DEF)), addr:$src, (i32 0x9))>; 5608 def : Pat<(f32 (fnearbyint (loadf32 addr:$src))), 5609 (VROUNDSSm (f32 (IMPLICIT_DEF)), addr:$src, (i32 0xC))>; 5610 def : Pat<(f32 (fceil (loadf32 addr:$src))), 5611 (VROUNDSSm (f32 (IMPLICIT_DEF)), addr:$src, (i32 0xA))>; 5612 def : Pat<(f32 (frint (loadf32 addr:$src))), 5613 (VROUNDSSm (f32 (IMPLICIT_DEF)), addr:$src, (i32 0x4))>; 5614 def : Pat<(f32 (ftrunc (loadf32 addr:$src))), 5615 (VROUNDSSm (f32 (IMPLICIT_DEF)), addr:$src, (i32 0xB))>; 5616 5617 def : Pat<(f64 (ffloor (loadf64 addr:$src))), 5618 (VROUNDSDm (f64 (IMPLICIT_DEF)), addr:$src, (i32 0x9))>; 5619 def : Pat<(f64 (fnearbyint (loadf64 addr:$src))), 5620 (VROUNDSDm (f64 (IMPLICIT_DEF)), addr:$src, (i32 0xC))>; 5621 def : Pat<(f64 (fceil (loadf64 addr:$src))), 5622 (VROUNDSDm (f64 (IMPLICIT_DEF)), addr:$src, (i32 0xA))>; 5623 def : Pat<(f64 (frint (loadf64 addr:$src))), 5624 (VROUNDSDm (f64 (IMPLICIT_DEF)), addr:$src, (i32 0x4))>; 5625 def : Pat<(f64 (ftrunc (loadf64 addr:$src))), 5626 (VROUNDSDm (f64 (IMPLICIT_DEF)), addr:$src, (i32 0xB))>; 5627} 5628 5629let Predicates = [HasAVX, NoVLX] in { 5630 def : Pat<(v4f32 (ffloor VR128:$src)), 5631 (VROUNDPSr VR128:$src, (i32 0x9))>; 5632 def : Pat<(v4f32 (fnearbyint VR128:$src)), 5633 (VROUNDPSr VR128:$src, (i32 0xC))>; 5634 def : Pat<(v4f32 (fceil VR128:$src)), 5635 (VROUNDPSr VR128:$src, (i32 0xA))>; 5636 def : Pat<(v4f32 (frint VR128:$src)), 5637 (VROUNDPSr VR128:$src, (i32 0x4))>; 5638 def : Pat<(v4f32 (ftrunc VR128:$src)), 5639 (VROUNDPSr VR128:$src, (i32 0xB))>; 5640 5641 def : Pat<(v4f32 (ffloor (loadv4f32 addr:$src))), 5642 (VROUNDPSm addr:$src, (i32 0x9))>; 5643 def : Pat<(v4f32 (fnearbyint (loadv4f32 addr:$src))), 5644 (VROUNDPSm addr:$src, (i32 0xC))>; 5645 def : Pat<(v4f32 (fceil (loadv4f32 addr:$src))), 5646 (VROUNDPSm addr:$src, (i32 0xA))>; 5647 def : Pat<(v4f32 (frint (loadv4f32 addr:$src))), 5648 (VROUNDPSm addr:$src, (i32 0x4))>; 5649 def : Pat<(v4f32 (ftrunc (loadv4f32 addr:$src))), 5650 (VROUNDPSm addr:$src, (i32 0xB))>; 5651 5652 def : Pat<(v2f64 (ffloor VR128:$src)), 5653 (VROUNDPDr VR128:$src, (i32 0x9))>; 5654 def : Pat<(v2f64 (fnearbyint VR128:$src)), 5655 (VROUNDPDr VR128:$src, (i32 0xC))>; 5656 def : Pat<(v2f64 (fceil VR128:$src)), 5657 (VROUNDPDr VR128:$src, (i32 0xA))>; 5658 def : Pat<(v2f64 (frint VR128:$src)), 5659 (VROUNDPDr VR128:$src, (i32 0x4))>; 5660 def : Pat<(v2f64 (ftrunc VR128:$src)), 5661 (VROUNDPDr VR128:$src, (i32 0xB))>; 5662 5663 def : Pat<(v2f64 (ffloor (loadv2f64 addr:$src))), 5664 (VROUNDPDm addr:$src, (i32 0x9))>; 5665 def : Pat<(v2f64 (fnearbyint (loadv2f64 addr:$src))), 5666 (VROUNDPDm addr:$src, (i32 0xC))>; 5667 def : Pat<(v2f64 (fceil (loadv2f64 addr:$src))), 5668 (VROUNDPDm addr:$src, (i32 0xA))>; 5669 def : Pat<(v2f64 (frint (loadv2f64 addr:$src))), 5670 (VROUNDPDm addr:$src, (i32 0x4))>; 5671 def : Pat<(v2f64 (ftrunc 
(loadv2f64 addr:$src))), 5672 (VROUNDPDm addr:$src, (i32 0xB))>; 5673 5674 def : Pat<(v8f32 (ffloor VR256:$src)), 5675 (VROUNDPSYr VR256:$src, (i32 0x9))>; 5676 def : Pat<(v8f32 (fnearbyint VR256:$src)), 5677 (VROUNDPSYr VR256:$src, (i32 0xC))>; 5678 def : Pat<(v8f32 (fceil VR256:$src)), 5679 (VROUNDPSYr VR256:$src, (i32 0xA))>; 5680 def : Pat<(v8f32 (frint VR256:$src)), 5681 (VROUNDPSYr VR256:$src, (i32 0x4))>; 5682 def : Pat<(v8f32 (ftrunc VR256:$src)), 5683 (VROUNDPSYr VR256:$src, (i32 0xB))>; 5684 5685 def : Pat<(v8f32 (ffloor (loadv8f32 addr:$src))), 5686 (VROUNDPSYm addr:$src, (i32 0x9))>; 5687 def : Pat<(v8f32 (fnearbyint (loadv8f32 addr:$src))), 5688 (VROUNDPSYm addr:$src, (i32 0xC))>; 5689 def : Pat<(v8f32 (fceil (loadv8f32 addr:$src))), 5690 (VROUNDPSYm addr:$src, (i32 0xA))>; 5691 def : Pat<(v8f32 (frint (loadv8f32 addr:$src))), 5692 (VROUNDPSYm addr:$src, (i32 0x4))>; 5693 def : Pat<(v8f32 (ftrunc (loadv8f32 addr:$src))), 5694 (VROUNDPSYm addr:$src, (i32 0xB))>; 5695 5696 def : Pat<(v4f64 (ffloor VR256:$src)), 5697 (VROUNDPDYr VR256:$src, (i32 0x9))>; 5698 def : Pat<(v4f64 (fnearbyint VR256:$src)), 5699 (VROUNDPDYr VR256:$src, (i32 0xC))>; 5700 def : Pat<(v4f64 (fceil VR256:$src)), 5701 (VROUNDPDYr VR256:$src, (i32 0xA))>; 5702 def : Pat<(v4f64 (frint VR256:$src)), 5703 (VROUNDPDYr VR256:$src, (i32 0x4))>; 5704 def : Pat<(v4f64 (ftrunc VR256:$src)), 5705 (VROUNDPDYr VR256:$src, (i32 0xB))>; 5706 5707 def : Pat<(v4f64 (ffloor (loadv4f64 addr:$src))), 5708 (VROUNDPDYm addr:$src, (i32 0x9))>; 5709 def : Pat<(v4f64 (fnearbyint (loadv4f64 addr:$src))), 5710 (VROUNDPDYm addr:$src, (i32 0xC))>; 5711 def : Pat<(v4f64 (fceil (loadv4f64 addr:$src))), 5712 (VROUNDPDYm addr:$src, (i32 0xA))>; 5713 def : Pat<(v4f64 (frint (loadv4f64 addr:$src))), 5714 (VROUNDPDYm addr:$src, (i32 0x4))>; 5715 def : Pat<(v4f64 (ftrunc (loadv4f64 addr:$src))), 5716 (VROUNDPDYm addr:$src, (i32 0xB))>; 5717} 5718 5719let ExeDomain = SSEPackedSingle in 5720defm ROUNDPS : sse41_fp_unop_p<0x08, "roundps", f128mem, VR128, v4f32, 5721 memopv4f32, X86VRndScale, SchedWriteFRnd.XMM>; 5722let ExeDomain = SSEPackedDouble in 5723defm ROUNDPD : sse41_fp_unop_p<0x09, "roundpd", f128mem, VR128, v2f64, 5724 memopv2f64, X86VRndScale, SchedWriteFRnd.XMM>; 5725 5726defm ROUND : sse41_fp_unop_s<0x0A, 0x0B, "round", SchedWriteFRnd.Scl>; 5727 5728let Constraints = "$src1 = $dst" in 5729defm ROUND : sse41_fp_binop_s<0x0A, 0x0B, "round", SchedWriteFRnd.Scl, 5730 v4f32, v2f64, X86RndScales>; 5731 5732let Predicates = [UseSSE41] in { 5733 def : Pat<(ffloor FR32:$src), 5734 (ROUNDSSr FR32:$src, (i32 0x9))>; 5735 def : Pat<(f32 (fnearbyint FR32:$src)), 5736 (ROUNDSSr FR32:$src, (i32 0xC))>; 5737 def : Pat<(f32 (fceil FR32:$src)), 5738 (ROUNDSSr FR32:$src, (i32 0xA))>; 5739 def : Pat<(f32 (frint FR32:$src)), 5740 (ROUNDSSr FR32:$src, (i32 0x4))>; 5741 def : Pat<(f32 (ftrunc FR32:$src)), 5742 (ROUNDSSr FR32:$src, (i32 0xB))>; 5743 5744 def : Pat<(f64 (ffloor FR64:$src)), 5745 (ROUNDSDr FR64:$src, (i32 0x9))>; 5746 def : Pat<(f64 (fnearbyint FR64:$src)), 5747 (ROUNDSDr FR64:$src, (i32 0xC))>; 5748 def : Pat<(f64 (fceil FR64:$src)), 5749 (ROUNDSDr FR64:$src, (i32 0xA))>; 5750 def : Pat<(f64 (frint FR64:$src)), 5751 (ROUNDSDr FR64:$src, (i32 0x4))>; 5752 def : Pat<(f64 (ftrunc FR64:$src)), 5753 (ROUNDSDr FR64:$src, (i32 0xB))>; 5754} 5755 5756let Predicates = [UseSSE41, OptForSize] in { 5757 def : Pat<(ffloor (loadf32 addr:$src)), 5758 (ROUNDSSm addr:$src, (i32 0x9))>; 5759 def : Pat<(f32 (fnearbyint (loadf32 addr:$src))), 5760 (ROUNDSSm 
addr:$src, (i32 0xC))>; 5761 def : Pat<(f32 (fceil (loadf32 addr:$src))), 5762 (ROUNDSSm addr:$src, (i32 0xA))>; 5763 def : Pat<(f32 (frint (loadf32 addr:$src))), 5764 (ROUNDSSm addr:$src, (i32 0x4))>; 5765 def : Pat<(f32 (ftrunc (loadf32 addr:$src))), 5766 (ROUNDSSm addr:$src, (i32 0xB))>; 5767 5768 def : Pat<(f64 (ffloor (loadf64 addr:$src))), 5769 (ROUNDSDm addr:$src, (i32 0x9))>; 5770 def : Pat<(f64 (fnearbyint (loadf64 addr:$src))), 5771 (ROUNDSDm addr:$src, (i32 0xC))>; 5772 def : Pat<(f64 (fceil (loadf64 addr:$src))), 5773 (ROUNDSDm addr:$src, (i32 0xA))>; 5774 def : Pat<(f64 (frint (loadf64 addr:$src))), 5775 (ROUNDSDm addr:$src, (i32 0x4))>; 5776 def : Pat<(f64 (ftrunc (loadf64 addr:$src))), 5777 (ROUNDSDm addr:$src, (i32 0xB))>; 5778} 5779 5780let Predicates = [UseSSE41] in { 5781 def : Pat<(v4f32 (ffloor VR128:$src)), 5782 (ROUNDPSr VR128:$src, (i32 0x9))>; 5783 def : Pat<(v4f32 (fnearbyint VR128:$src)), 5784 (ROUNDPSr VR128:$src, (i32 0xC))>; 5785 def : Pat<(v4f32 (fceil VR128:$src)), 5786 (ROUNDPSr VR128:$src, (i32 0xA))>; 5787 def : Pat<(v4f32 (frint VR128:$src)), 5788 (ROUNDPSr VR128:$src, (i32 0x4))>; 5789 def : Pat<(v4f32 (ftrunc VR128:$src)), 5790 (ROUNDPSr VR128:$src, (i32 0xB))>; 5791 5792 def : Pat<(v4f32 (ffloor (memopv4f32 addr:$src))), 5793 (ROUNDPSm addr:$src, (i32 0x9))>; 5794 def : Pat<(v4f32 (fnearbyint (memopv4f32 addr:$src))), 5795 (ROUNDPSm addr:$src, (i32 0xC))>; 5796 def : Pat<(v4f32 (fceil (memopv4f32 addr:$src))), 5797 (ROUNDPSm addr:$src, (i32 0xA))>; 5798 def : Pat<(v4f32 (frint (memopv4f32 addr:$src))), 5799 (ROUNDPSm addr:$src, (i32 0x4))>; 5800 def : Pat<(v4f32 (ftrunc (memopv4f32 addr:$src))), 5801 (ROUNDPSm addr:$src, (i32 0xB))>; 5802 5803 def : Pat<(v2f64 (ffloor VR128:$src)), 5804 (ROUNDPDr VR128:$src, (i32 0x9))>; 5805 def : Pat<(v2f64 (fnearbyint VR128:$src)), 5806 (ROUNDPDr VR128:$src, (i32 0xC))>; 5807 def : Pat<(v2f64 (fceil VR128:$src)), 5808 (ROUNDPDr VR128:$src, (i32 0xA))>; 5809 def : Pat<(v2f64 (frint VR128:$src)), 5810 (ROUNDPDr VR128:$src, (i32 0x4))>; 5811 def : Pat<(v2f64 (ftrunc VR128:$src)), 5812 (ROUNDPDr VR128:$src, (i32 0xB))>; 5813 5814 def : Pat<(v2f64 (ffloor (memopv2f64 addr:$src))), 5815 (ROUNDPDm addr:$src, (i32 0x9))>; 5816 def : Pat<(v2f64 (fnearbyint (memopv2f64 addr:$src))), 5817 (ROUNDPDm addr:$src, (i32 0xC))>; 5818 def : Pat<(v2f64 (fceil (memopv2f64 addr:$src))), 5819 (ROUNDPDm addr:$src, (i32 0xA))>; 5820 def : Pat<(v2f64 (frint (memopv2f64 addr:$src))), 5821 (ROUNDPDm addr:$src, (i32 0x4))>; 5822 def : Pat<(v2f64 (ftrunc (memopv2f64 addr:$src))), 5823 (ROUNDPDm addr:$src, (i32 0xB))>; 5824} 5825 5826defm : scalar_unary_math_imm_patterns<ffloor, "ROUNDSS", X86Movss, 5827 v4f32, 0x01, UseSSE41>; 5828defm : scalar_unary_math_imm_patterns<fceil, "ROUNDSS", X86Movss, 5829 v4f32, 0x02, UseSSE41>; 5830defm : scalar_unary_math_imm_patterns<ffloor, "ROUNDSD", X86Movsd, 5831 v2f64, 0x01, UseSSE41>; 5832defm : scalar_unary_math_imm_patterns<fceil, "ROUNDSD", X86Movsd, 5833 v2f64, 0x02, UseSSE41>; 5834 5835//===----------------------------------------------------------------------===// 5836// SSE4.1 - Packed Bit Test 5837//===----------------------------------------------------------------------===// 5838 5839// ptest instruction we'll lower to this in X86ISelLowering primarily from 5840// the intel intrinsic that corresponds to this. 
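// PTEST only writes EFLAGS: ZF is set when the AND of the two sources is all
// zeros, and CF is set when the AND-NOT of the two sources is all zeros.
// That behavior is what the X86ptest node used below models.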
5841let Defs = [EFLAGS], Predicates = [HasAVX] in { 5842def VPTESTrr : SS48I<0x17, MRMSrcReg, (outs), (ins VR128:$src1, VR128:$src2), 5843 "vptest\t{$src2, $src1|$src1, $src2}", 5844 [(set EFLAGS, (X86ptest VR128:$src1, (v2i64 VR128:$src2)))]>, 5845 Sched<[SchedWriteVecTest.XMM]>, VEX, VEX_WIG; 5846def VPTESTrm : SS48I<0x17, MRMSrcMem, (outs), (ins VR128:$src1, f128mem:$src2), 5847 "vptest\t{$src2, $src1|$src1, $src2}", 5848 [(set EFLAGS,(X86ptest VR128:$src1, (loadv2i64 addr:$src2)))]>, 5849 Sched<[SchedWriteVecTest.XMM.Folded, ReadAfterLd]>, 5850 VEX, VEX_WIG; 5851 5852def VPTESTYrr : SS48I<0x17, MRMSrcReg, (outs), (ins VR256:$src1, VR256:$src2), 5853 "vptest\t{$src2, $src1|$src1, $src2}", 5854 [(set EFLAGS, (X86ptest VR256:$src1, (v4i64 VR256:$src2)))]>, 5855 Sched<[SchedWriteVecTest.YMM]>, VEX, VEX_L, VEX_WIG; 5856def VPTESTYrm : SS48I<0x17, MRMSrcMem, (outs), (ins VR256:$src1, i256mem:$src2), 5857 "vptest\t{$src2, $src1|$src1, $src2}", 5858 [(set EFLAGS,(X86ptest VR256:$src1, (loadv4i64 addr:$src2)))]>, 5859 Sched<[SchedWriteVecTest.YMM.Folded, ReadAfterLd]>, 5860 VEX, VEX_L, VEX_WIG; 5861} 5862 5863let Defs = [EFLAGS] in { 5864def PTESTrr : SS48I<0x17, MRMSrcReg, (outs), (ins VR128:$src1, VR128:$src2), 5865 "ptest\t{$src2, $src1|$src1, $src2}", 5866 [(set EFLAGS, (X86ptest VR128:$src1, (v2i64 VR128:$src2)))]>, 5867 Sched<[SchedWriteVecTest.XMM]>; 5868def PTESTrm : SS48I<0x17, MRMSrcMem, (outs), (ins VR128:$src1, f128mem:$src2), 5869 "ptest\t{$src2, $src1|$src1, $src2}", 5870 [(set EFLAGS, (X86ptest VR128:$src1, (memopv2i64 addr:$src2)))]>, 5871 Sched<[SchedWriteVecTest.XMM.Folded, ReadAfterLd]>; 5872} 5873 5874// The bit test instructions below are AVX only 5875multiclass avx_bittest<bits<8> opc, string OpcodeStr, RegisterClass RC, 5876 X86MemOperand x86memop, PatFrag mem_frag, ValueType vt, 5877 X86FoldableSchedWrite sched> { 5878 def rr : SS48I<opc, MRMSrcReg, (outs), (ins RC:$src1, RC:$src2), 5879 !strconcat(OpcodeStr, "\t{$src2, $src1|$src1, $src2}"), 5880 [(set EFLAGS, (X86testp RC:$src1, (vt RC:$src2)))]>, 5881 Sched<[sched]>, VEX; 5882 def rm : SS48I<opc, MRMSrcMem, (outs), (ins RC:$src1, x86memop:$src2), 5883 !strconcat(OpcodeStr, "\t{$src2, $src1|$src1, $src2}"), 5884 [(set EFLAGS, (X86testp RC:$src1, (mem_frag addr:$src2)))]>, 5885 Sched<[sched.Folded, ReadAfterLd]>, VEX; 5886} 5887 5888let Defs = [EFLAGS], Predicates = [HasAVX] in { 5889let ExeDomain = SSEPackedSingle in { 5890defm VTESTPS : avx_bittest<0x0E, "vtestps", VR128, f128mem, loadv4f32, v4f32, 5891 SchedWriteFTest.XMM>; 5892defm VTESTPSY : avx_bittest<0x0E, "vtestps", VR256, f256mem, loadv8f32, v8f32, 5893 SchedWriteFTest.YMM>, VEX_L; 5894} 5895let ExeDomain = SSEPackedDouble in { 5896defm VTESTPD : avx_bittest<0x0F, "vtestpd", VR128, f128mem, loadv2f64, v2f64, 5897 SchedWriteFTest.XMM>; 5898defm VTESTPDY : avx_bittest<0x0F, "vtestpd", VR256, f256mem, loadv4f64, v4f64, 5899 SchedWriteFTest.YMM>, VEX_L; 5900} 5901} 5902 5903//===----------------------------------------------------------------------===// 5904// SSE4.1 - Misc Instructions 5905//===----------------------------------------------------------------------===// 5906 5907let Defs = [EFLAGS], Predicates = [HasPOPCNT] in { 5908 def POPCNT16rr : I<0xB8, MRMSrcReg, (outs GR16:$dst), (ins GR16:$src), 5909 "popcnt{w}\t{$src, $dst|$dst, $src}", 5910 [(set GR16:$dst, (ctpop GR16:$src)), (implicit EFLAGS)]>, 5911 Sched<[WritePOPCNT]>, OpSize16, XS; 5912 def POPCNT16rm : I<0xB8, MRMSrcMem, (outs GR16:$dst), (ins i16mem:$src), 5913 "popcnt{w}\t{$src, $dst|$dst, 
$src}", 5914 [(set GR16:$dst, (ctpop (loadi16 addr:$src))), 5915 (implicit EFLAGS)]>, 5916 Sched<[WritePOPCNT.Folded]>, OpSize16, XS; 5917 5918 def POPCNT32rr : I<0xB8, MRMSrcReg, (outs GR32:$dst), (ins GR32:$src), 5919 "popcnt{l}\t{$src, $dst|$dst, $src}", 5920 [(set GR32:$dst, (ctpop GR32:$src)), (implicit EFLAGS)]>, 5921 Sched<[WritePOPCNT]>, OpSize32, XS; 5922 5923 def POPCNT32rm : I<0xB8, MRMSrcMem, (outs GR32:$dst), (ins i32mem:$src), 5924 "popcnt{l}\t{$src, $dst|$dst, $src}", 5925 [(set GR32:$dst, (ctpop (loadi32 addr:$src))), 5926 (implicit EFLAGS)]>, 5927 Sched<[WritePOPCNT.Folded]>, OpSize32, XS; 5928 5929 def POPCNT64rr : RI<0xB8, MRMSrcReg, (outs GR64:$dst), (ins GR64:$src), 5930 "popcnt{q}\t{$src, $dst|$dst, $src}", 5931 [(set GR64:$dst, (ctpop GR64:$src)), (implicit EFLAGS)]>, 5932 Sched<[WritePOPCNT]>, XS; 5933 def POPCNT64rm : RI<0xB8, MRMSrcMem, (outs GR64:$dst), (ins i64mem:$src), 5934 "popcnt{q}\t{$src, $dst|$dst, $src}", 5935 [(set GR64:$dst, (ctpop (loadi64 addr:$src))), 5936 (implicit EFLAGS)]>, 5937 Sched<[WritePOPCNT.Folded]>, XS; 5938} 5939 5940// SS41I_unop_rm_int_v16 - SSE 4.1 unary operator whose type is v8i16. 5941multiclass SS41I_unop_rm_int_v16<bits<8> opc, string OpcodeStr, 5942 SDNode OpNode, PatFrag ld_frag, 5943 X86FoldableSchedWrite Sched> { 5944 def rr : SS48I<opc, MRMSrcReg, (outs VR128:$dst), 5945 (ins VR128:$src), 5946 !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), 5947 [(set VR128:$dst, (v8i16 (OpNode (v8i16 VR128:$src))))]>, 5948 Sched<[Sched]>; 5949 def rm : SS48I<opc, MRMSrcMem, (outs VR128:$dst), 5950 (ins i128mem:$src), 5951 !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), 5952 [(set VR128:$dst, 5953 (v8i16 (OpNode (v8i16 (bitconvert (ld_frag addr:$src))))))]>, 5954 Sched<[Sched.Folded]>; 5955} 5956 5957// PHMIN has the same profile as PSAD, thus we use the same scheduling 5958// model, although the naming is misleading. 5959let Predicates = [HasAVX] in 5960defm VPHMINPOSUW : SS41I_unop_rm_int_v16<0x41, "vphminposuw", 5961 X86phminpos, loadv2i64, 5962 WritePHMINPOS>, VEX, VEX_WIG; 5963defm PHMINPOSUW : SS41I_unop_rm_int_v16<0x41, "phminposuw", 5964 X86phminpos, memopv2i64, 5965 WritePHMINPOS>; 5966 5967/// SS48I_binop_rm - Simple SSE41 binary operator. 
5968multiclass SS48I_binop_rm<bits<8> opc, string OpcodeStr, SDNode OpNode, 5969 ValueType OpVT, RegisterClass RC, PatFrag memop_frag, 5970 X86MemOperand x86memop, X86FoldableSchedWrite sched, 5971 bit Is2Addr = 1> { 5972 let isCommutable = 1 in 5973 def rr : SS48I<opc, MRMSrcReg, (outs RC:$dst), 5974 (ins RC:$src1, RC:$src2), 5975 !if(Is2Addr, 5976 !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"), 5977 !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")), 5978 [(set RC:$dst, (OpVT (OpNode RC:$src1, RC:$src2)))]>, 5979 Sched<[sched]>; 5980 def rm : SS48I<opc, MRMSrcMem, (outs RC:$dst), 5981 (ins RC:$src1, x86memop:$src2), 5982 !if(Is2Addr, 5983 !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"), 5984 !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")), 5985 [(set RC:$dst, 5986 (OpVT (OpNode RC:$src1, (bitconvert (memop_frag addr:$src2)))))]>, 5987 Sched<[sched.Folded, ReadAfterLd]>; 5988} 5989 5990let Predicates = [HasAVX, NoVLX] in { 5991 defm VPMINSD : SS48I_binop_rm<0x39, "vpminsd", smin, v4i32, VR128, 5992 loadv2i64, i128mem, SchedWriteVecALU.XMM, 0>, 5993 VEX_4V, VEX_WIG; 5994 defm VPMINUD : SS48I_binop_rm<0x3B, "vpminud", umin, v4i32, VR128, 5995 loadv2i64, i128mem, SchedWriteVecALU.XMM, 0>, 5996 VEX_4V, VEX_WIG; 5997 defm VPMAXSD : SS48I_binop_rm<0x3D, "vpmaxsd", smax, v4i32, VR128, 5998 loadv2i64, i128mem, SchedWriteVecALU.XMM, 0>, 5999 VEX_4V, VEX_WIG; 6000 defm VPMAXUD : SS48I_binop_rm<0x3F, "vpmaxud", umax, v4i32, VR128, 6001 loadv2i64, i128mem, SchedWriteVecALU.XMM, 0>, 6002 VEX_4V, VEX_WIG; 6003 defm VPMULDQ : SS48I_binop_rm<0x28, "vpmuldq", X86pmuldq, v2i64, VR128, 6004 loadv2i64, i128mem, SchedWriteVecIMul.XMM, 0>, 6005 VEX_4V, VEX_WIG; 6006} 6007let Predicates = [HasAVX, NoVLX_Or_NoBWI] in { 6008 defm VPMINSB : SS48I_binop_rm<0x38, "vpminsb", smin, v16i8, VR128, 6009 loadv2i64, i128mem, SchedWriteVecALU.XMM, 0>, 6010 VEX_4V, VEX_WIG; 6011 defm VPMINUW : SS48I_binop_rm<0x3A, "vpminuw", umin, v8i16, VR128, 6012 loadv2i64, i128mem, SchedWriteVecALU.XMM, 0>, 6013 VEX_4V, VEX_WIG; 6014 defm VPMAXSB : SS48I_binop_rm<0x3C, "vpmaxsb", smax, v16i8, VR128, 6015 loadv2i64, i128mem, SchedWriteVecALU.XMM, 0>, 6016 VEX_4V, VEX_WIG; 6017 defm VPMAXUW : SS48I_binop_rm<0x3E, "vpmaxuw", umax, v8i16, VR128, 6018 loadv2i64, i128mem, SchedWriteVecALU.XMM, 0>, 6019 VEX_4V, VEX_WIG; 6020} 6021 6022let Predicates = [HasAVX2, NoVLX] in { 6023 defm VPMINSDY : SS48I_binop_rm<0x39, "vpminsd", smin, v8i32, VR256, 6024 loadv4i64, i256mem, SchedWriteVecALU.YMM, 0>, 6025 VEX_4V, VEX_L, VEX_WIG; 6026 defm VPMINUDY : SS48I_binop_rm<0x3B, "vpminud", umin, v8i32, VR256, 6027 loadv4i64, i256mem, SchedWriteVecALU.YMM, 0>, 6028 VEX_4V, VEX_L, VEX_WIG; 6029 defm VPMAXSDY : SS48I_binop_rm<0x3D, "vpmaxsd", smax, v8i32, VR256, 6030 loadv4i64, i256mem, SchedWriteVecALU.YMM, 0>, 6031 VEX_4V, VEX_L, VEX_WIG; 6032 defm VPMAXUDY : SS48I_binop_rm<0x3F, "vpmaxud", umax, v8i32, VR256, 6033 loadv4i64, i256mem, SchedWriteVecALU.YMM, 0>, 6034 VEX_4V, VEX_L, VEX_WIG; 6035 defm VPMULDQY : SS48I_binop_rm<0x28, "vpmuldq", X86pmuldq, v4i64, VR256, 6036 loadv4i64, i256mem, SchedWriteVecIMul.YMM, 0>, 6037 VEX_4V, VEX_L, VEX_WIG; 6038} 6039let Predicates = [HasAVX2, NoVLX_Or_NoBWI] in { 6040 defm VPMINSBY : SS48I_binop_rm<0x38, "vpminsb", smin, v32i8, VR256, 6041 loadv4i64, i256mem, SchedWriteVecALU.YMM, 0>, 6042 VEX_4V, VEX_L, VEX_WIG; 6043 defm VPMINUWY : SS48I_binop_rm<0x3A, "vpminuw", umin, v16i16, VR256, 6044 loadv4i64, i256mem, SchedWriteVecALU.YMM, 0>, 6045 VEX_4V, VEX_L, VEX_WIG; 6046 defm 
VPMAXSBY : SS48I_binop_rm<0x3C, "vpmaxsb", smax, v32i8, VR256, 6047 loadv4i64, i256mem, SchedWriteVecALU.YMM, 0>, 6048 VEX_4V, VEX_L, VEX_WIG; 6049 defm VPMAXUWY : SS48I_binop_rm<0x3E, "vpmaxuw", umax, v16i16, VR256, 6050 loadv4i64, i256mem, SchedWriteVecALU.YMM, 0>, 6051 VEX_4V, VEX_L, VEX_WIG; 6052} 6053 6054let Constraints = "$src1 = $dst" in { 6055 defm PMINSB : SS48I_binop_rm<0x38, "pminsb", smin, v16i8, VR128, 6056 memopv2i64, i128mem, SchedWriteVecALU.XMM, 1>; 6057 defm PMINSD : SS48I_binop_rm<0x39, "pminsd", smin, v4i32, VR128, 6058 memopv2i64, i128mem, SchedWriteVecALU.XMM, 1>; 6059 defm PMINUD : SS48I_binop_rm<0x3B, "pminud", umin, v4i32, VR128, 6060 memopv2i64, i128mem, SchedWriteVecALU.XMM, 1>; 6061 defm PMINUW : SS48I_binop_rm<0x3A, "pminuw", umin, v8i16, VR128, 6062 memopv2i64, i128mem, SchedWriteVecALU.XMM, 1>; 6063 defm PMAXSB : SS48I_binop_rm<0x3C, "pmaxsb", smax, v16i8, VR128, 6064 memopv2i64, i128mem, SchedWriteVecALU.XMM, 1>; 6065 defm PMAXSD : SS48I_binop_rm<0x3D, "pmaxsd", smax, v4i32, VR128, 6066 memopv2i64, i128mem, SchedWriteVecALU.XMM, 1>; 6067 defm PMAXUD : SS48I_binop_rm<0x3F, "pmaxud", umax, v4i32, VR128, 6068 memopv2i64, i128mem, SchedWriteVecALU.XMM, 1>; 6069 defm PMAXUW : SS48I_binop_rm<0x3E, "pmaxuw", umax, v8i16, VR128, 6070 memopv2i64, i128mem, SchedWriteVecALU.XMM, 1>; 6071 defm PMULDQ : SS48I_binop_rm<0x28, "pmuldq", X86pmuldq, v2i64, VR128, 6072 memopv2i64, i128mem, SchedWriteVecIMul.XMM, 1>; 6073} 6074 6075let Predicates = [HasAVX, NoVLX] in 6076 defm VPMULLD : SS48I_binop_rm<0x40, "vpmulld", mul, v4i32, VR128, 6077 loadv2i64, i128mem, SchedWritePMULLD.XMM, 0>, 6078 VEX_4V, VEX_WIG; 6079let Predicates = [HasAVX] in 6080 defm VPCMPEQQ : SS48I_binop_rm<0x29, "vpcmpeqq", X86pcmpeq, v2i64, VR128, 6081 loadv2i64, i128mem, SchedWriteVecALU.XMM, 0>, 6082 VEX_4V, VEX_WIG; 6083 6084let Predicates = [HasAVX2, NoVLX] in 6085 defm VPMULLDY : SS48I_binop_rm<0x40, "vpmulld", mul, v8i32, VR256, 6086 loadv4i64, i256mem, SchedWritePMULLD.YMM, 0>, 6087 VEX_4V, VEX_L, VEX_WIG; 6088let Predicates = [HasAVX2] in 6089 defm VPCMPEQQY : SS48I_binop_rm<0x29, "vpcmpeqq", X86pcmpeq, v4i64, VR256, 6090 loadv4i64, i256mem, SchedWriteVecALU.YMM, 0>, 6091 VEX_4V, VEX_L, VEX_WIG; 6092 6093let Constraints = "$src1 = $dst" in { 6094 defm PMULLD : SS48I_binop_rm<0x40, "pmulld", mul, v4i32, VR128, 6095 memopv2i64, i128mem, SchedWritePMULLD.XMM, 1>; 6096 defm PCMPEQQ : SS48I_binop_rm<0x29, "pcmpeqq", X86pcmpeq, v2i64, VR128, 6097 memopv2i64, i128mem, SchedWriteVecALU.XMM, 1>; 6098} 6099 6100/// SS41I_binop_rmi_int - SSE 4.1 binary operator with 8-bit immediate 6101multiclass SS41I_binop_rmi_int<bits<8> opc, string OpcodeStr, 6102 Intrinsic IntId, RegisterClass RC, PatFrag memop_frag, 6103 X86MemOperand x86memop, bit Is2Addr, 6104 X86FoldableSchedWrite sched> { 6105 let isCommutable = 1 in 6106 def rri : SS4AIi8<opc, MRMSrcReg, (outs RC:$dst), 6107 (ins RC:$src1, RC:$src2, u8imm:$src3), 6108 !if(Is2Addr, 6109 !strconcat(OpcodeStr, 6110 "\t{$src3, $src2, $dst|$dst, $src2, $src3}"), 6111 !strconcat(OpcodeStr, 6112 "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")), 6113 [(set RC:$dst, (IntId RC:$src1, RC:$src2, imm:$src3))]>, 6114 Sched<[sched]>; 6115 def rmi : SS4AIi8<opc, MRMSrcMem, (outs RC:$dst), 6116 (ins RC:$src1, x86memop:$src2, u8imm:$src3), 6117 !if(Is2Addr, 6118 !strconcat(OpcodeStr, 6119 "\t{$src3, $src2, $dst|$dst, $src2, $src3}"), 6120 !strconcat(OpcodeStr, 6121 "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")), 6122 [(set RC:$dst, 6123 (IntId RC:$src1, 6124 
(bitconvert (memop_frag addr:$src2)), imm:$src3))]>, 6125 Sched<[sched.Folded, ReadAfterLd]>; 6126} 6127 6128/// SS41I_binop_rmi - SSE 4.1 binary operator with 8-bit immediate 6129multiclass SS41I_binop_rmi<bits<8> opc, string OpcodeStr, SDNode OpNode, 6130 ValueType OpVT, RegisterClass RC, PatFrag memop_frag, 6131 X86MemOperand x86memop, bit Is2Addr, 6132 X86FoldableSchedWrite sched> { 6133 let isCommutable = 1 in 6134 def rri : SS4AIi8<opc, MRMSrcReg, (outs RC:$dst), 6135 (ins RC:$src1, RC:$src2, u8imm:$src3), 6136 !if(Is2Addr, 6137 !strconcat(OpcodeStr, 6138 "\t{$src3, $src2, $dst|$dst, $src2, $src3}"), 6139 !strconcat(OpcodeStr, 6140 "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")), 6141 [(set RC:$dst, (OpVT (OpNode RC:$src1, RC:$src2, imm:$src3)))]>, 6142 Sched<[sched]>; 6143 def rmi : SS4AIi8<opc, MRMSrcMem, (outs RC:$dst), 6144 (ins RC:$src1, x86memop:$src2, u8imm:$src3), 6145 !if(Is2Addr, 6146 !strconcat(OpcodeStr, 6147 "\t{$src3, $src2, $dst|$dst, $src2, $src3}"), 6148 !strconcat(OpcodeStr, 6149 "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")), 6150 [(set RC:$dst, 6151 (OpVT (OpNode RC:$src1, 6152 (bitconvert (memop_frag addr:$src2)), imm:$src3)))]>, 6153 Sched<[sched.Folded, ReadAfterLd]>; 6154} 6155 6156def BlendCommuteImm2 : SDNodeXForm<imm, [{ 6157 uint8_t Imm = N->getZExtValue() & 0x03; 6158 return getI8Imm(Imm ^ 0x03, SDLoc(N)); 6159}]>; 6160 6161def BlendCommuteImm4 : SDNodeXForm<imm, [{ 6162 uint8_t Imm = N->getZExtValue() & 0x0f; 6163 return getI8Imm(Imm ^ 0x0f, SDLoc(N)); 6164}]>; 6165 6166def BlendCommuteImm8 : SDNodeXForm<imm, [{ 6167 uint8_t Imm = N->getZExtValue() & 0xff; 6168 return getI8Imm(Imm ^ 0xff, SDLoc(N)); 6169}]>; 6170 6171let Predicates = [HasAVX] in { 6172 let isCommutable = 0 in { 6173 defm VMPSADBW : SS41I_binop_rmi_int<0x42, "vmpsadbw", int_x86_sse41_mpsadbw, 6174 VR128, loadv2i64, i128mem, 0, 6175 SchedWriteMPSAD.XMM>, VEX_4V, VEX_WIG; 6176 } 6177 6178 let ExeDomain = SSEPackedSingle in 6179 defm VDPPS : SS41I_binop_rmi_int<0x40, "vdpps", int_x86_sse41_dpps, 6180 VR128, loadv4f32, f128mem, 0, 6181 SchedWriteDPPS.XMM>, VEX_4V, VEX_WIG; 6182 let ExeDomain = SSEPackedDouble in 6183 defm VDPPD : SS41I_binop_rmi_int<0x41, "vdppd", int_x86_sse41_dppd, 6184 VR128, loadv2f64, f128mem, 0, 6185 SchedWriteDPPD.XMM>, VEX_4V, VEX_WIG; 6186 let ExeDomain = SSEPackedSingle in 6187 defm VDPPSY : SS41I_binop_rmi_int<0x40, "vdpps", int_x86_avx_dp_ps_256, 6188 VR256, loadv8f32, i256mem, 0, 6189 SchedWriteDPPS.YMM>, VEX_4V, VEX_L, VEX_WIG; 6190} 6191 6192let Predicates = [HasAVX2] in { 6193 let isCommutable = 0 in { 6194 defm VMPSADBWY : SS41I_binop_rmi_int<0x42, "vmpsadbw", int_x86_avx2_mpsadbw, 6195 VR256, loadv4i64, i256mem, 0, 6196 SchedWriteMPSAD.YMM>, VEX_4V, VEX_L, VEX_WIG; 6197 } 6198} 6199 6200let Constraints = "$src1 = $dst" in { 6201 let isCommutable = 0 in { 6202 defm MPSADBW : SS41I_binop_rmi_int<0x42, "mpsadbw", int_x86_sse41_mpsadbw, 6203 VR128, memopv2i64, i128mem, 1, 6204 SchedWriteMPSAD.XMM>; 6205 } 6206 6207 let ExeDomain = SSEPackedSingle in 6208 defm DPPS : SS41I_binop_rmi_int<0x40, "dpps", int_x86_sse41_dpps, 6209 VR128, memopv4f32, f128mem, 1, 6210 SchedWriteDPPS.XMM>; 6211 let ExeDomain = SSEPackedDouble in 6212 defm DPPD : SS41I_binop_rmi_int<0x41, "dppd", int_x86_sse41_dppd, 6213 VR128, memopv2f64, f128mem, 1, 6214 SchedWriteDPPD.XMM>; 6215} 6216 6217/// SS41I_blend_rmi - SSE 4.1 blend with 8-bit immediate 6218multiclass SS41I_blend_rmi<bits<8> opc, string OpcodeStr, SDNode OpNode, 6219 ValueType OpVT, RegisterClass RC, 
PatFrag memop_frag, 6220 X86MemOperand x86memop, bit Is2Addr, Domain d, 6221 X86FoldableSchedWrite sched, SDNodeXForm commuteXForm> { 6222let ExeDomain = d, Constraints = !if(Is2Addr, "$src1 = $dst", "") in { 6223 let isCommutable = 1 in 6224 def rri : SS4AIi8<opc, MRMSrcReg, (outs RC:$dst), 6225 (ins RC:$src1, RC:$src2, u8imm:$src3), 6226 !if(Is2Addr, 6227 !strconcat(OpcodeStr, 6228 "\t{$src3, $src2, $dst|$dst, $src2, $src3}"), 6229 !strconcat(OpcodeStr, 6230 "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")), 6231 [(set RC:$dst, (OpVT (OpNode RC:$src1, RC:$src2, imm:$src3)))]>, 6232 Sched<[sched]>; 6233 def rmi : SS4AIi8<opc, MRMSrcMem, (outs RC:$dst), 6234 (ins RC:$src1, x86memop:$src2, u8imm:$src3), 6235 !if(Is2Addr, 6236 !strconcat(OpcodeStr, 6237 "\t{$src3, $src2, $dst|$dst, $src2, $src3}"), 6238 !strconcat(OpcodeStr, 6239 "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")), 6240 [(set RC:$dst, 6241 (OpVT (OpNode RC:$src1, 6242 (bitconvert (memop_frag addr:$src2)), imm:$src3)))]>, 6243 Sched<[sched.Folded, ReadAfterLd]>; 6244} 6245 6246 // Pattern to commute if load is in first source. 6247 def : Pat<(OpVT (OpNode (bitconvert (memop_frag addr:$src2)), 6248 RC:$src1, imm:$src3)), 6249 (!cast<Instruction>(NAME#"rmi") RC:$src1, addr:$src2, 6250 (commuteXForm imm:$src3))>; 6251} 6252 6253let Predicates = [HasAVX] in { 6254 defm VBLENDPS : SS41I_blend_rmi<0x0C, "vblendps", X86Blendi, v4f32, 6255 VR128, loadv4f32, f128mem, 0, SSEPackedSingle, 6256 SchedWriteFBlend.XMM, BlendCommuteImm4>, 6257 VEX_4V, VEX_WIG; 6258 defm VBLENDPSY : SS41I_blend_rmi<0x0C, "vblendps", X86Blendi, v8f32, 6259 VR256, loadv8f32, f256mem, 0, SSEPackedSingle, 6260 SchedWriteFBlend.YMM, BlendCommuteImm8>, 6261 VEX_4V, VEX_L, VEX_WIG; 6262 defm VBLENDPD : SS41I_blend_rmi<0x0D, "vblendpd", X86Blendi, v2f64, 6263 VR128, loadv2f64, f128mem, 0, SSEPackedDouble, 6264 SchedWriteFBlend.XMM, BlendCommuteImm2>, 6265 VEX_4V, VEX_WIG; 6266 defm VBLENDPDY : SS41I_blend_rmi<0x0D, "vblendpd", X86Blendi, v4f64, 6267 VR256, loadv4f64, f256mem, 0, SSEPackedDouble, 6268 SchedWriteFBlend.YMM, BlendCommuteImm4>, 6269 VEX_4V, VEX_L, VEX_WIG; 6270 defm VPBLENDW : SS41I_blend_rmi<0x0E, "vpblendw", X86Blendi, v8i16, 6271 VR128, loadv2i64, i128mem, 0, SSEPackedInt, 6272 SchedWriteBlend.XMM, BlendCommuteImm8>, 6273 VEX_4V, VEX_WIG; 6274} 6275 6276let Predicates = [HasAVX2] in { 6277 defm VPBLENDWY : SS41I_blend_rmi<0x0E, "vpblendw", X86Blendi, v16i16, 6278 VR256, loadv4i64, i256mem, 0, SSEPackedInt, 6279 SchedWriteBlend.YMM, BlendCommuteImm8>, 6280 VEX_4V, VEX_L, VEX_WIG; 6281} 6282 6283defm BLENDPS : SS41I_blend_rmi<0x0C, "blendps", X86Blendi, v4f32, 6284 VR128, memopv4f32, f128mem, 1, SSEPackedSingle, 6285 SchedWriteFBlend.XMM, BlendCommuteImm4>; 6286defm BLENDPD : SS41I_blend_rmi<0x0D, "blendpd", X86Blendi, v2f64, 6287 VR128, memopv2f64, f128mem, 1, SSEPackedDouble, 6288 SchedWriteFBlend.XMM, BlendCommuteImm2>; 6289defm PBLENDW : SS41I_blend_rmi<0x0E, "pblendw", X86Blendi, v8i16, 6290 VR128, memopv2i64, i128mem, 1, SSEPackedInt, 6291 SchedWriteBlend.XMM, BlendCommuteImm8>; 6292 6293// For insertion into the zero index (low half) of a 256-bit vector, it is 6294// more efficient to generate a blend with immediate instead of an insert*128. 
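// Each bit of the blend immediate selects the corresponding element from the
// second operand, so 0x3 (the low two f64 lanes) and 0xf (the low four f32
// lanes) take the entire low half from the widened $src2 and keep the upper
// half of $src1.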
6295let Predicates = [HasAVX] in { 6296def : Pat<(insert_subvector (v4f64 VR256:$src1), (v2f64 VR128:$src2), (iPTR 0)), 6297 (VBLENDPDYrri VR256:$src1, 6298 (INSERT_SUBREG (v4f64 (IMPLICIT_DEF)), 6299 VR128:$src2, sub_xmm), 0x3)>; 6300def : Pat<(insert_subvector (v8f32 VR256:$src1), (v4f32 VR128:$src2), (iPTR 0)), 6301 (VBLENDPSYrri VR256:$src1, 6302 (INSERT_SUBREG (v8f32 (IMPLICIT_DEF)), 6303 VR128:$src2, sub_xmm), 0xf)>; 6304} 6305 6306/// SS41I_quaternary_int_avx - AVX SSE 4.1 with 4 operators 6307multiclass SS41I_quaternary_int_avx<bits<8> opc, string OpcodeStr, 6308 RegisterClass RC, X86MemOperand x86memop, 6309 PatFrag mem_frag, Intrinsic IntId, 6310 X86FoldableSchedWrite sched> { 6311 def rr : Ii8Reg<opc, MRMSrcReg, (outs RC:$dst), 6312 (ins RC:$src1, RC:$src2, RC:$src3), 6313 !strconcat(OpcodeStr, 6314 "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"), 6315 [(set RC:$dst, (IntId RC:$src1, RC:$src2, RC:$src3))], 6316 SSEPackedInt>, TAPD, VEX_4V, 6317 Sched<[sched]>; 6318 6319 def rm : Ii8Reg<opc, MRMSrcMem, (outs RC:$dst), 6320 (ins RC:$src1, x86memop:$src2, RC:$src3), 6321 !strconcat(OpcodeStr, 6322 "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"), 6323 [(set RC:$dst, 6324 (IntId RC:$src1, (bitconvert (mem_frag addr:$src2)), 6325 RC:$src3))], SSEPackedInt>, TAPD, VEX_4V, 6326 Sched<[sched.Folded, ReadAfterLd, 6327 // x86memop:$src2 6328 ReadDefault, ReadDefault, ReadDefault, ReadDefault, 6329 ReadDefault, 6330 // RC::$src3 6331 ReadAfterLd]>; 6332} 6333 6334let Predicates = [HasAVX] in { 6335let ExeDomain = SSEPackedDouble in { 6336defm VBLENDVPD : SS41I_quaternary_int_avx<0x4B, "vblendvpd", VR128, f128mem, 6337 loadv2f64, int_x86_sse41_blendvpd, 6338 SchedWriteFVarBlend.XMM>; 6339defm VBLENDVPDY : SS41I_quaternary_int_avx<0x4B, "vblendvpd", VR256, f256mem, 6340 loadv4f64, int_x86_avx_blendv_pd_256, 6341 SchedWriteFVarBlend.YMM>, VEX_L; 6342} // ExeDomain = SSEPackedDouble 6343let ExeDomain = SSEPackedSingle in { 6344defm VBLENDVPS : SS41I_quaternary_int_avx<0x4A, "vblendvps", VR128, f128mem, 6345 loadv4f32, int_x86_sse41_blendvps, 6346 SchedWriteFVarBlend.XMM>; 6347defm VBLENDVPSY : SS41I_quaternary_int_avx<0x4A, "vblendvps", VR256, f256mem, 6348 loadv8f32, int_x86_avx_blendv_ps_256, 6349 SchedWriteFVarBlend.YMM>, VEX_L; 6350} // ExeDomain = SSEPackedSingle 6351defm VPBLENDVB : SS41I_quaternary_int_avx<0x4C, "vpblendvb", VR128, i128mem, 6352 loadv2i64, int_x86_sse41_pblendvb, 6353 SchedWriteVarBlend.XMM>; 6354} 6355 6356let Predicates = [HasAVX2] in { 6357defm VPBLENDVBY : SS41I_quaternary_int_avx<0x4C, "vpblendvb", VR256, i256mem, 6358 loadv4i64, int_x86_avx2_pblendvb, 6359 SchedWriteVarBlend.YMM>, VEX_L; 6360} 6361 6362let Predicates = [HasAVX] in { 6363 def : Pat<(v16i8 (vselect (v16i8 VR128:$mask), (v16i8 VR128:$src1), 6364 (v16i8 VR128:$src2))), 6365 (VPBLENDVBrr VR128:$src2, VR128:$src1, VR128:$mask)>; 6366 def : Pat<(v4i32 (vselect (v4i32 VR128:$mask), (v4i32 VR128:$src1), 6367 (v4i32 VR128:$src2))), 6368 (VBLENDVPSrr VR128:$src2, VR128:$src1, VR128:$mask)>; 6369 def : Pat<(v4f32 (vselect (v4i32 VR128:$mask), (v4f32 VR128:$src1), 6370 (v4f32 VR128:$src2))), 6371 (VBLENDVPSrr VR128:$src2, VR128:$src1, VR128:$mask)>; 6372 def : Pat<(v2i64 (vselect (v2i64 VR128:$mask), (v2i64 VR128:$src1), 6373 (v2i64 VR128:$src2))), 6374 (VBLENDVPDrr VR128:$src2, VR128:$src1, VR128:$mask)>; 6375 def : Pat<(v2f64 (vselect (v2i64 VR128:$mask), (v2f64 VR128:$src1), 6376 (v2f64 VR128:$src2))), 6377 (VBLENDVPDrr VR128:$src2, VR128:$src1, VR128:$mask)>; 6378 def : Pat<(v8i32 
(vselect (v8i32 VR256:$mask), (v8i32 VR256:$src1), 6379 (v8i32 VR256:$src2))), 6380 (VBLENDVPSYrr VR256:$src2, VR256:$src1, VR256:$mask)>; 6381 def : Pat<(v8f32 (vselect (v8i32 VR256:$mask), (v8f32 VR256:$src1), 6382 (v8f32 VR256:$src2))), 6383 (VBLENDVPSYrr VR256:$src2, VR256:$src1, VR256:$mask)>; 6384 def : Pat<(v4i64 (vselect (v4i64 VR256:$mask), (v4i64 VR256:$src1), 6385 (v4i64 VR256:$src2))), 6386 (VBLENDVPDYrr VR256:$src2, VR256:$src1, VR256:$mask)>; 6387 def : Pat<(v4f64 (vselect (v4i64 VR256:$mask), (v4f64 VR256:$src1), 6388 (v4f64 VR256:$src2))), 6389 (VBLENDVPDYrr VR256:$src2, VR256:$src1, VR256:$mask)>; 6390} 6391 6392let Predicates = [HasAVX2] in { 6393 def : Pat<(v32i8 (vselect (v32i8 VR256:$mask), (v32i8 VR256:$src1), 6394 (v32i8 VR256:$src2))), 6395 (VPBLENDVBYrr VR256:$src2, VR256:$src1, VR256:$mask)>; 6396} 6397 6398// Prefer a movss or movsd over a blendps when optimizing for size. these were 6399// changed to use blends because blends have better throughput on sandybridge 6400// and haswell, but movs[s/d] are 1-2 byte shorter instructions. 6401let Predicates = [HasAVX, OptForSpeed] in { 6402 def : Pat<(v4f32 (X86vzmovl (v4f32 VR128:$src))), 6403 (VBLENDPSrri (v4f32 (V_SET0)), VR128:$src, (i8 1))>; 6404 def : Pat<(v4i32 (X86vzmovl (v4i32 VR128:$src))), 6405 (VPBLENDWrri (v4i32 (V_SET0)), VR128:$src, (i8 3))>; 6406 6407 def : Pat<(v4f32 (X86Movss VR128:$src1, VR128:$src2)), 6408 (VBLENDPSrri VR128:$src1, VR128:$src2, (i8 1))>; 6409 def : Pat<(v4f32 (X86Movss VR128:$src1, (loadv4f32 addr:$src2))), 6410 (VBLENDPSrmi VR128:$src1, addr:$src2, (i8 1))>; 6411 def : Pat<(v4f32 (X86Movss (loadv4f32 addr:$src2), VR128:$src1)), 6412 (VBLENDPSrmi VR128:$src1, addr:$src2, (i8 0xe))>; 6413 6414 def : Pat<(v2f64 (X86Movsd VR128:$src1, VR128:$src2)), 6415 (VBLENDPDrri VR128:$src1, VR128:$src2, (i8 1))>; 6416 def : Pat<(v2f64 (X86Movsd VR128:$src1, (loadv2f64 addr:$src2))), 6417 (VBLENDPDrmi VR128:$src1, addr:$src2, (i8 1))>; 6418 def : Pat<(v2f64 (X86Movsd (loadv2f64 addr:$src2), VR128:$src1)), 6419 (VBLENDPDrmi VR128:$src1, addr:$src2, (i8 2))>; 6420 6421 // Move low f32 and clear high bits. 6422 def : Pat<(v8f32 (X86vzmovl (v8f32 VR256:$src))), 6423 (SUBREG_TO_REG (i32 0), 6424 (v4f32 (VBLENDPSrri (v4f32 (V_SET0)), 6425 (v4f32 (EXTRACT_SUBREG (v8f32 VR256:$src), sub_xmm)), 6426 (i8 1))), sub_xmm)>; 6427 def : Pat<(v8i32 (X86vzmovl (v8i32 VR256:$src))), 6428 (SUBREG_TO_REG (i32 0), 6429 (v4i32 (VPBLENDWrri (v4i32 (V_SET0)), 6430 (v4i32 (EXTRACT_SUBREG (v8i32 VR256:$src), sub_xmm)), 6431 (i8 3))), sub_xmm)>; 6432 6433 def : Pat<(v4f64 (X86vzmovl (v4f64 VR256:$src))), 6434 (SUBREG_TO_REG (i32 0), 6435 (v2f64 (VBLENDPDrri (v2f64 (V_SET0)), 6436 (v2f64 (EXTRACT_SUBREG (v4f64 VR256:$src), sub_xmm)), 6437 (i8 1))), sub_xmm)>; 6438 def : Pat<(v4i64 (X86vzmovl (v4i64 VR256:$src))), 6439 (SUBREG_TO_REG (i32 0), 6440 (v2i64 (VPBLENDWrri (v2i64 (V_SET0)), 6441 (v2i64 (EXTRACT_SUBREG (v4i64 VR256:$src), sub_xmm)), 6442 (i8 0xf))), sub_xmm)>; 6443} 6444 6445// Prefer a movss or movsd over a blendps when optimizing for size. these were 6446// changed to use blends because blends have better throughput on sandybridge 6447// and haswell, but movs[s/d] are 1-2 byte shorter instructions. 6448let Predicates = [UseSSE41, OptForSpeed] in { 6449 // With SSE41 we can use blends for these patterns. 
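  // A blend immediate of 1 takes only element 0 from the second operand,
  // which is exactly the movss/movsd semantics; pblendw needs an immediate
  // of 3 (two 16-bit lanes) to cover the same low 32 bits.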
6450 def : Pat<(v4f32 (X86vzmovl (v4f32 VR128:$src))), 6451 (BLENDPSrri (v4f32 (V_SET0)), VR128:$src, (i8 1))>; 6452 def : Pat<(v4i32 (X86vzmovl (v4i32 VR128:$src))), 6453 (PBLENDWrri (v4i32 (V_SET0)), VR128:$src, (i8 3))>; 6454 6455 def : Pat<(v4f32 (X86Movss VR128:$src1, VR128:$src2)), 6456 (BLENDPSrri VR128:$src1, VR128:$src2, (i8 1))>; 6457 def : Pat<(v4f32 (X86Movss VR128:$src1, (memopv4f32 addr:$src2))), 6458 (BLENDPSrmi VR128:$src1, addr:$src2, (i8 1))>; 6459 def : Pat<(v4f32 (X86Movss (memopv4f32 addr:$src2), VR128:$src1)), 6460 (BLENDPSrmi VR128:$src1, addr:$src2, (i8 0xe))>; 6461 6462 def : Pat<(v2f64 (X86Movsd VR128:$src1, VR128:$src2)), 6463 (BLENDPDrri VR128:$src1, VR128:$src2, (i8 1))>; 6464 def : Pat<(v2f64 (X86Movsd VR128:$src1, (memopv2f64 addr:$src2))), 6465 (BLENDPDrmi VR128:$src1, addr:$src2, (i8 1))>; 6466 def : Pat<(v2f64 (X86Movsd (memopv2f64 addr:$src2), VR128:$src1)), 6467 (BLENDPDrmi VR128:$src1, addr:$src2, (i8 2))>; 6468} 6469 6470 6471/// SS41I_ternary_int - SSE 4.1 ternary operator 6472let Uses = [XMM0], Constraints = "$src1 = $dst" in { 6473 multiclass SS41I_ternary_int<bits<8> opc, string OpcodeStr, PatFrag mem_frag, 6474 X86MemOperand x86memop, Intrinsic IntId, 6475 X86FoldableSchedWrite sched> { 6476 def rr0 : SS48I<opc, MRMSrcReg, (outs VR128:$dst), 6477 (ins VR128:$src1, VR128:$src2), 6478 !strconcat(OpcodeStr, 6479 "\t{%xmm0, $src2, $dst|$dst, $src2, xmm0}"), 6480 [(set VR128:$dst, (IntId VR128:$src1, VR128:$src2, XMM0))]>, 6481 Sched<[sched]>; 6482 6483 def rm0 : SS48I<opc, MRMSrcMem, (outs VR128:$dst), 6484 (ins VR128:$src1, x86memop:$src2), 6485 !strconcat(OpcodeStr, 6486 "\t{%xmm0, $src2, $dst|$dst, $src2, xmm0}"), 6487 [(set VR128:$dst, 6488 (IntId VR128:$src1, 6489 (bitconvert (mem_frag addr:$src2)), XMM0))]>, 6490 Sched<[sched.Folded, ReadAfterLd]>; 6491 } 6492} 6493 6494let ExeDomain = SSEPackedDouble in 6495defm BLENDVPD : SS41I_ternary_int<0x15, "blendvpd", memopv2f64, f128mem, 6496 int_x86_sse41_blendvpd, SchedWriteFVarBlend.XMM>; 6497let ExeDomain = SSEPackedSingle in 6498defm BLENDVPS : SS41I_ternary_int<0x14, "blendvps", memopv4f32, f128mem, 6499 int_x86_sse41_blendvps, SchedWriteFVarBlend.XMM>; 6500defm PBLENDVB : SS41I_ternary_int<0x10, "pblendvb", memopv2i64, i128mem, 6501 int_x86_sse41_pblendvb, SchedWriteVarBlend.XMM>; 6502 6503// Aliases with the implicit xmm0 argument 6504def : InstAlias<"blendvpd\t{$src2, $dst|$dst, $src2}", 6505 (BLENDVPDrr0 VR128:$dst, VR128:$src2), 0>; 6506def : InstAlias<"blendvpd\t{$src2, $dst|$dst, $src2}", 6507 (BLENDVPDrm0 VR128:$dst, f128mem:$src2), 0>; 6508def : InstAlias<"blendvps\t{$src2, $dst|$dst, $src2}", 6509 (BLENDVPSrr0 VR128:$dst, VR128:$src2), 0>; 6510def : InstAlias<"blendvps\t{$src2, $dst|$dst, $src2}", 6511 (BLENDVPSrm0 VR128:$dst, f128mem:$src2), 0>; 6512def : InstAlias<"pblendvb\t{$src2, $dst|$dst, $src2}", 6513 (PBLENDVBrr0 VR128:$dst, VR128:$src2), 0>; 6514def : InstAlias<"pblendvb\t{$src2, $dst|$dst, $src2}", 6515 (PBLENDVBrm0 VR128:$dst, i128mem:$src2), 0>; 6516 6517let Predicates = [UseSSE41] in { 6518 def : Pat<(v16i8 (vselect (v16i8 XMM0), (v16i8 VR128:$src1), 6519 (v16i8 VR128:$src2))), 6520 (PBLENDVBrr0 VR128:$src2, VR128:$src1)>; 6521 def : Pat<(v4i32 (vselect (v4i32 XMM0), (v4i32 VR128:$src1), 6522 (v4i32 VR128:$src2))), 6523 (BLENDVPSrr0 VR128:$src2, VR128:$src1)>; 6524 def : Pat<(v4f32 (vselect (v4i32 XMM0), (v4f32 VR128:$src1), 6525 (v4f32 VR128:$src2))), 6526 (BLENDVPSrr0 VR128:$src2, VR128:$src1)>; 6527 def : Pat<(v2i64 (vselect (v2i64 XMM0), (v2i64 VR128:$src1), 6528 (v2i64 
VR128:$src2))), 6529 (BLENDVPDrr0 VR128:$src2, VR128:$src1)>; 6530 def : Pat<(v2f64 (vselect (v2i64 XMM0), (v2f64 VR128:$src1), 6531 (v2f64 VR128:$src2))), 6532 (BLENDVPDrr0 VR128:$src2, VR128:$src1)>; 6533} 6534 6535let AddedComplexity = 400 in { // Prefer non-temporal versions 6536 6537let Predicates = [HasAVX, NoVLX] in 6538def VMOVNTDQArm : SS48I<0x2A, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src), 6539 "vmovntdqa\t{$src, $dst|$dst, $src}", []>, 6540 Sched<[SchedWriteVecMoveLSNT.XMM.RM]>, VEX, VEX_WIG; 6541let Predicates = [HasAVX2, NoVLX] in 6542def VMOVNTDQAYrm : SS48I<0x2A, MRMSrcMem, (outs VR256:$dst), (ins i256mem:$src), 6543 "vmovntdqa\t{$src, $dst|$dst, $src}", []>, 6544 Sched<[SchedWriteVecMoveLSNT.YMM.RM]>, VEX, VEX_L, VEX_WIG; 6545def MOVNTDQArm : SS48I<0x2A, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src), 6546 "movntdqa\t{$src, $dst|$dst, $src}", []>, 6547 Sched<[SchedWriteVecMoveLSNT.XMM.RM]>; 6548 6549let Predicates = [HasAVX2, NoVLX] in { 6550 def : Pat<(v8f32 (alignednontemporalload addr:$src)), 6551 (VMOVNTDQAYrm addr:$src)>; 6552 def : Pat<(v4f64 (alignednontemporalload addr:$src)), 6553 (VMOVNTDQAYrm addr:$src)>; 6554 def : Pat<(v4i64 (alignednontemporalload addr:$src)), 6555 (VMOVNTDQAYrm addr:$src)>; 6556} 6557 6558let Predicates = [HasAVX, NoVLX] in { 6559 def : Pat<(v4f32 (alignednontemporalload addr:$src)), 6560 (VMOVNTDQArm addr:$src)>; 6561 def : Pat<(v2f64 (alignednontemporalload addr:$src)), 6562 (VMOVNTDQArm addr:$src)>; 6563 def : Pat<(v2i64 (alignednontemporalload addr:$src)), 6564 (VMOVNTDQArm addr:$src)>; 6565} 6566 6567let Predicates = [UseSSE41] in { 6568 def : Pat<(v4f32 (alignednontemporalload addr:$src)), 6569 (MOVNTDQArm addr:$src)>; 6570 def : Pat<(v2f64 (alignednontemporalload addr:$src)), 6571 (MOVNTDQArm addr:$src)>; 6572 def : Pat<(v2i64 (alignednontemporalload addr:$src)), 6573 (MOVNTDQArm addr:$src)>; 6574} 6575 6576} // AddedComplexity 6577 6578//===----------------------------------------------------------------------===// 6579// SSE4.2 - Compare Instructions 6580//===----------------------------------------------------------------------===// 6581 6582/// SS42I_binop_rm - Simple SSE 4.2 binary operator 6583multiclass SS42I_binop_rm<bits<8> opc, string OpcodeStr, SDNode OpNode, 6584 ValueType OpVT, RegisterClass RC, PatFrag memop_frag, 6585 X86MemOperand x86memop, X86FoldableSchedWrite sched, 6586 bit Is2Addr = 1> { 6587 def rr : SS428I<opc, MRMSrcReg, (outs RC:$dst), 6588 (ins RC:$src1, RC:$src2), 6589 !if(Is2Addr, 6590 !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"), 6591 !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")), 6592 [(set RC:$dst, (OpVT (OpNode RC:$src1, RC:$src2)))]>, 6593 Sched<[sched]>; 6594 def rm : SS428I<opc, MRMSrcMem, (outs RC:$dst), 6595 (ins RC:$src1, x86memop:$src2), 6596 !if(Is2Addr, 6597 !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"), 6598 !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")), 6599 [(set RC:$dst, 6600 (OpVT (OpNode RC:$src1, (memop_frag addr:$src2))))]>, 6601 Sched<[sched.Folded, ReadAfterLd]>; 6602} 6603 6604let Predicates = [HasAVX] in 6605 defm VPCMPGTQ : SS42I_binop_rm<0x37, "vpcmpgtq", X86pcmpgt, v2i64, VR128, 6606 loadv2i64, i128mem, SchedWriteVecALU.XMM, 0>, 6607 VEX_4V, VEX_WIG; 6608 6609let Predicates = [HasAVX2] in 6610 defm VPCMPGTQY : SS42I_binop_rm<0x37, "vpcmpgtq", X86pcmpgt, v4i64, VR256, 6611 loadv4i64, i256mem, SchedWriteVecALU.YMM, 0>, 6612 VEX_4V, VEX_L, VEX_WIG; 6613 6614let Constraints = "$src1 = $dst" in 6615 defm PCMPGTQ : 
SS42I_binop_rm<0x37, "pcmpgtq", X86pcmpgt, v2i64, VR128, 6616 memopv2i64, i128mem, SchedWriteVecALU.XMM>; 6617 6618//===----------------------------------------------------------------------===// 6619// SSE4.2 - String/text Processing Instructions 6620//===----------------------------------------------------------------------===// 6621 6622multiclass pcmpistrm_SS42AI<string asm> { 6623 def rr : SS42AI<0x62, MRMSrcReg, (outs), 6624 (ins VR128:$src1, VR128:$src2, u8imm:$src3), 6625 !strconcat(asm, "\t{$src3, $src2, $src1|$src1, $src2, $src3}"), 6626 []>, Sched<[WritePCmpIStrM]>; 6627 let mayLoad = 1 in 6628 def rm :SS42AI<0x62, MRMSrcMem, (outs), 6629 (ins VR128:$src1, i128mem:$src2, u8imm:$src3), 6630 !strconcat(asm, "\t{$src3, $src2, $src1|$src1, $src2, $src3}"), 6631 []>, Sched<[WritePCmpIStrM.Folded, ReadAfterLd]>; 6632} 6633 6634let Defs = [XMM0, EFLAGS], hasSideEffects = 0 in { 6635 let Predicates = [HasAVX] in 6636 defm VPCMPISTRM : pcmpistrm_SS42AI<"vpcmpistrm">, VEX; 6637 defm PCMPISTRM : pcmpistrm_SS42AI<"pcmpistrm"> ; 6638} 6639 6640multiclass SS42AI_pcmpestrm<string asm> { 6641 def rr : SS42AI<0x60, MRMSrcReg, (outs), 6642 (ins VR128:$src1, VR128:$src3, u8imm:$src5), 6643 !strconcat(asm, "\t{$src5, $src3, $src1|$src1, $src3, $src5}"), 6644 []>, Sched<[WritePCmpEStrM]>; 6645 let mayLoad = 1 in 6646 def rm : SS42AI<0x60, MRMSrcMem, (outs), 6647 (ins VR128:$src1, i128mem:$src3, u8imm:$src5), 6648 !strconcat(asm, "\t{$src5, $src3, $src1|$src1, $src3, $src5}"), 6649 []>, Sched<[WritePCmpEStrM.Folded, ReadAfterLd]>; 6650} 6651 6652let Defs = [XMM0, EFLAGS], Uses = [EAX, EDX], hasSideEffects = 0 in { 6653 let Predicates = [HasAVX] in 6654 defm VPCMPESTRM : SS42AI_pcmpestrm<"vpcmpestrm">, VEX; 6655 defm PCMPESTRM : SS42AI_pcmpestrm<"pcmpestrm">; 6656} 6657 6658multiclass SS42AI_pcmpistri<string asm> { 6659 def rr : SS42AI<0x63, MRMSrcReg, (outs), 6660 (ins VR128:$src1, VR128:$src2, u8imm:$src3), 6661 !strconcat(asm, "\t{$src3, $src2, $src1|$src1, $src2, $src3}"), 6662 []>, Sched<[WritePCmpIStrI]>; 6663 let mayLoad = 1 in 6664 def rm : SS42AI<0x63, MRMSrcMem, (outs), 6665 (ins VR128:$src1, i128mem:$src2, u8imm:$src3), 6666 !strconcat(asm, "\t{$src3, $src2, $src1|$src1, $src2, $src3}"), 6667 []>, Sched<[WritePCmpIStrI.Folded, ReadAfterLd]>; 6668} 6669 6670let Defs = [ECX, EFLAGS], hasSideEffects = 0 in { 6671 let Predicates = [HasAVX] in 6672 defm VPCMPISTRI : SS42AI_pcmpistri<"vpcmpistri">, VEX; 6673 defm PCMPISTRI : SS42AI_pcmpistri<"pcmpistri">; 6674} 6675 6676multiclass SS42AI_pcmpestri<string asm> { 6677 def rr : SS42AI<0x61, MRMSrcReg, (outs), 6678 (ins VR128:$src1, VR128:$src3, u8imm:$src5), 6679 !strconcat(asm, "\t{$src5, $src3, $src1|$src1, $src3, $src5}"), 6680 []>, Sched<[WritePCmpEStrI]>; 6681 let mayLoad = 1 in 6682 def rm : SS42AI<0x61, MRMSrcMem, (outs), 6683 (ins VR128:$src1, i128mem:$src3, u8imm:$src5), 6684 !strconcat(asm, "\t{$src5, $src3, $src1|$src1, $src3, $src5}"), 6685 []>, Sched<[WritePCmpEStrI.Folded, ReadAfterLd]>; 6686} 6687 6688let Defs = [ECX, EFLAGS], Uses = [EAX, EDX], hasSideEffects = 0 in { 6689 let Predicates = [HasAVX] in 6690 defm VPCMPESTRI : SS42AI_pcmpestri<"vpcmpestri">, VEX; 6691 defm PCMPESTRI : SS42AI_pcmpestri<"pcmpestri">; 6692} 6693 6694//===----------------------------------------------------------------------===// 6695// SSE4.2 - CRC Instructions 6696//===----------------------------------------------------------------------===// 6697 6698// No CRC instructions have AVX equivalents 6699 6700// crc intrinsic instruction 6701// This set of 
// instructions comes in r and m forms only; the only difference between
// the variants is the size of r and m.
class SS42I_crc32r<bits<8> opc, string asm, RegisterClass RCOut,
                   RegisterClass RCIn, SDPatternOperator Int> :
  SS42FI<opc, MRMSrcReg, (outs RCOut:$dst), (ins RCOut:$src1, RCIn:$src2),
         !strconcat(asm, "\t{$src2, $src1|$src1, $src2}"),
         [(set RCOut:$dst, (Int RCOut:$src1, RCIn:$src2))]>,
         Sched<[WriteCRC32]>;

class SS42I_crc32m<bits<8> opc, string asm, RegisterClass RCOut,
                   X86MemOperand x86memop, SDPatternOperator Int> :
  SS42FI<opc, MRMSrcMem, (outs RCOut:$dst), (ins RCOut:$src1, x86memop:$src2),
         !strconcat(asm, "\t{$src2, $src1|$src1, $src2}"),
         [(set RCOut:$dst, (Int RCOut:$src1, (load addr:$src2)))]>,
         Sched<[WriteCRC32.Folded, ReadAfterLd]>;

let Constraints = "$src1 = $dst" in {
  def CRC32r32m8  : SS42I_crc32m<0xF0, "crc32{b}", GR32, i8mem,
                                 int_x86_sse42_crc32_32_8>;
  def CRC32r32r8  : SS42I_crc32r<0xF0, "crc32{b}", GR32, GR8,
                                 int_x86_sse42_crc32_32_8>;
  def CRC32r32m16 : SS42I_crc32m<0xF1, "crc32{w}", GR32, i16mem,
                                 int_x86_sse42_crc32_32_16>, OpSize16;
  def CRC32r32r16 : SS42I_crc32r<0xF1, "crc32{w}", GR32, GR16,
                                 int_x86_sse42_crc32_32_16>, OpSize16;
  def CRC32r32m32 : SS42I_crc32m<0xF1, "crc32{l}", GR32, i32mem,
                                 int_x86_sse42_crc32_32_32>, OpSize32;
  def CRC32r32r32 : SS42I_crc32r<0xF1, "crc32{l}", GR32, GR32,
                                 int_x86_sse42_crc32_32_32>, OpSize32;
  def CRC32r64m64 : SS42I_crc32m<0xF1, "crc32{q}", GR64, i64mem,
                                 int_x86_sse42_crc32_64_64>, REX_W;
  def CRC32r64r64 : SS42I_crc32r<0xF1, "crc32{q}", GR64, GR64,
                                 int_x86_sse42_crc32_64_64>, REX_W;
  let hasSideEffects = 0 in {
    let mayLoad = 1 in
    def CRC32r64m8 : SS42I_crc32m<0xF0, "crc32{b}", GR64, i8mem,
                                  null_frag>, REX_W;
    def CRC32r64r8 : SS42I_crc32r<0xF0, "crc32{b}", GR64, GR8,
                                  null_frag>, REX_W;
  }
}

//===----------------------------------------------------------------------===//
// SHA-NI Instructions
//===----------------------------------------------------------------------===//

// FIXME: Is there a better scheduler class for SHA than WriteVecIMul?
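// SHAI_binop builds the reg/reg and reg/mem forms of the two-operand SHA
// instructions; when UsesXMM0 is set (as for sha256rnds2), XMM0 is an
// implicit third source and is spelled out explicitly in the asm string.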
6748multiclass SHAI_binop<bits<8> Opc, string OpcodeStr, Intrinsic IntId, 6749 X86FoldableSchedWrite sched, bit UsesXMM0 = 0> { 6750 def rr : I<Opc, MRMSrcReg, (outs VR128:$dst), 6751 (ins VR128:$src1, VR128:$src2), 6752 !if(UsesXMM0, 6753 !strconcat(OpcodeStr, "\t{%xmm0, $src2, $dst|$dst, $src2, xmm0}"), 6754 !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}")), 6755 [!if(UsesXMM0, 6756 (set VR128:$dst, (IntId VR128:$src1, VR128:$src2, XMM0)), 6757 (set VR128:$dst, (IntId VR128:$src1, VR128:$src2)))]>, 6758 T8, Sched<[sched]>; 6759 6760 def rm : I<Opc, MRMSrcMem, (outs VR128:$dst), 6761 (ins VR128:$src1, i128mem:$src2), 6762 !if(UsesXMM0, 6763 !strconcat(OpcodeStr, "\t{%xmm0, $src2, $dst|$dst, $src2, xmm0}"), 6764 !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}")), 6765 [!if(UsesXMM0, 6766 (set VR128:$dst, (IntId VR128:$src1, 6767 (bc_v4i32 (memopv2i64 addr:$src2)), XMM0)), 6768 (set VR128:$dst, (IntId VR128:$src1, 6769 (bc_v4i32 (memopv2i64 addr:$src2)))))]>, T8, 6770 Sched<[sched.Folded, ReadAfterLd]>; 6771} 6772 6773let Constraints = "$src1 = $dst", Predicates = [HasSHA] in { 6774 def SHA1RNDS4rri : Ii8<0xCC, MRMSrcReg, (outs VR128:$dst), 6775 (ins VR128:$src1, VR128:$src2, u8imm:$src3), 6776 "sha1rnds4\t{$src3, $src2, $dst|$dst, $src2, $src3}", 6777 [(set VR128:$dst, 6778 (int_x86_sha1rnds4 VR128:$src1, VR128:$src2, 6779 (i8 imm:$src3)))]>, TA, 6780 Sched<[SchedWriteVecIMul.XMM]>; 6781 def SHA1RNDS4rmi : Ii8<0xCC, MRMSrcMem, (outs VR128:$dst), 6782 (ins VR128:$src1, i128mem:$src2, u8imm:$src3), 6783 "sha1rnds4\t{$src3, $src2, $dst|$dst, $src2, $src3}", 6784 [(set VR128:$dst, 6785 (int_x86_sha1rnds4 VR128:$src1, 6786 (bc_v4i32 (memopv2i64 addr:$src2)), 6787 (i8 imm:$src3)))]>, TA, 6788 Sched<[SchedWriteVecIMul.XMM.Folded, ReadAfterLd]>; 6789 6790 defm SHA1NEXTE : SHAI_binop<0xC8, "sha1nexte", int_x86_sha1nexte, 6791 SchedWriteVecIMul.XMM>; 6792 defm SHA1MSG1 : SHAI_binop<0xC9, "sha1msg1", int_x86_sha1msg1, 6793 SchedWriteVecIMul.XMM>; 6794 defm SHA1MSG2 : SHAI_binop<0xCA, "sha1msg2", int_x86_sha1msg2, 6795 SchedWriteVecIMul.XMM>; 6796 6797 let Uses=[XMM0] in 6798 defm SHA256RNDS2 : SHAI_binop<0xCB, "sha256rnds2", int_x86_sha256rnds2, 6799 SchedWriteVecIMul.XMM, 1>; 6800 6801 defm SHA256MSG1 : SHAI_binop<0xCC, "sha256msg1", int_x86_sha256msg1, 6802 SchedWriteVecIMul.XMM>; 6803 defm SHA256MSG2 : SHAI_binop<0xCD, "sha256msg2", int_x86_sha256msg2, 6804 SchedWriteVecIMul.XMM>; 6805} 6806 6807// Aliases with explicit %xmm0 6808def : InstAlias<"sha256rnds2\t{$src2, $dst|$dst, $src2}", 6809 (SHA256RNDS2rr VR128:$dst, VR128:$src2), 0>; 6810def : InstAlias<"sha256rnds2\t{$src2, $dst|$dst, $src2}", 6811 (SHA256RNDS2rm VR128:$dst, i128mem:$src2), 0>; 6812 6813//===----------------------------------------------------------------------===// 6814// AES-NI Instructions 6815//===----------------------------------------------------------------------===// 6816 6817multiclass AESI_binop_rm_int<bits<8> opc, string OpcodeStr, 6818 Intrinsic IntId, PatFrag ld_frag, 6819 bit Is2Addr = 0, RegisterClass RC = VR128, 6820 X86MemOperand MemOp = i128mem> { 6821 let AsmString = OpcodeStr## 6822 !if(Is2Addr, "\t{$src2, $dst|$dst, $src2}", 6823 "\t{$src2, $src1, $dst|$dst, $src1, $src2}") in { 6824 def rr : AES8I<opc, MRMSrcReg, (outs RC:$dst), 6825 (ins RC:$src1, RC:$src2), "", 6826 [(set RC:$dst, (IntId RC:$src1, RC:$src2))]>, 6827 Sched<[WriteAESDecEnc]>; 6828 def rm : AES8I<opc, MRMSrcMem, (outs RC:$dst), 6829 (ins RC:$src1, MemOp:$src2), "", 6830 [(set RC:$dst, (IntId RC:$src1, (ld_frag addr:$src2)))]>, 6831 
Sched<[WriteAESDecEnc.Folded, ReadAfterLd]>; 6832 } 6833} 6834 6835// Perform One Round of an AES Encryption/Decryption Flow 6836let Predicates = [HasAVX, NoVLX_Or_NoVAES, HasAES] in { 6837 defm VAESENC : AESI_binop_rm_int<0xDC, "vaesenc", 6838 int_x86_aesni_aesenc, loadv2i64>, VEX_4V, VEX_WIG; 6839 defm VAESENCLAST : AESI_binop_rm_int<0xDD, "vaesenclast", 6840 int_x86_aesni_aesenclast, loadv2i64>, VEX_4V, VEX_WIG; 6841 defm VAESDEC : AESI_binop_rm_int<0xDE, "vaesdec", 6842 int_x86_aesni_aesdec, loadv2i64>, VEX_4V, VEX_WIG; 6843 defm VAESDECLAST : AESI_binop_rm_int<0xDF, "vaesdeclast", 6844 int_x86_aesni_aesdeclast, loadv2i64>, VEX_4V, VEX_WIG; 6845} 6846 6847let Predicates = [NoVLX, HasVAES] in { 6848 defm VAESENCY : AESI_binop_rm_int<0xDC, "vaesenc", 6849 int_x86_aesni_aesenc_256, loadv4i64, 0, VR256, 6850 i256mem>, VEX_4V, VEX_L, VEX_WIG; 6851 defm VAESENCLASTY : AESI_binop_rm_int<0xDD, "vaesenclast", 6852 int_x86_aesni_aesenclast_256, loadv4i64, 0, VR256, 6853 i256mem>, VEX_4V, VEX_L, VEX_WIG; 6854 defm VAESDECY : AESI_binop_rm_int<0xDE, "vaesdec", 6855 int_x86_aesni_aesdec_256, loadv4i64, 0, VR256, 6856 i256mem>, VEX_4V, VEX_L, VEX_WIG; 6857 defm VAESDECLASTY : AESI_binop_rm_int<0xDF, "vaesdeclast", 6858 int_x86_aesni_aesdeclast_256, loadv4i64, 0, VR256, 6859 i256mem>, VEX_4V, VEX_L, VEX_WIG; 6860} 6861 6862let Constraints = "$src1 = $dst" in { 6863 defm AESENC : AESI_binop_rm_int<0xDC, "aesenc", 6864 int_x86_aesni_aesenc, memopv2i64, 1>; 6865 defm AESENCLAST : AESI_binop_rm_int<0xDD, "aesenclast", 6866 int_x86_aesni_aesenclast, memopv2i64, 1>; 6867 defm AESDEC : AESI_binop_rm_int<0xDE, "aesdec", 6868 int_x86_aesni_aesdec, memopv2i64, 1>; 6869 defm AESDECLAST : AESI_binop_rm_int<0xDF, "aesdeclast", 6870 int_x86_aesni_aesdeclast, memopv2i64, 1>; 6871} 6872 6873// Perform the AES InvMixColumn Transformation 6874let Predicates = [HasAVX, HasAES] in { 6875 def VAESIMCrr : AES8I<0xDB, MRMSrcReg, (outs VR128:$dst), 6876 (ins VR128:$src1), 6877 "vaesimc\t{$src1, $dst|$dst, $src1}", 6878 [(set VR128:$dst, 6879 (int_x86_aesni_aesimc VR128:$src1))]>, Sched<[WriteAESIMC]>, 6880 VEX, VEX_WIG; 6881 def VAESIMCrm : AES8I<0xDB, MRMSrcMem, (outs VR128:$dst), 6882 (ins i128mem:$src1), 6883 "vaesimc\t{$src1, $dst|$dst, $src1}", 6884 [(set VR128:$dst, (int_x86_aesni_aesimc (loadv2i64 addr:$src1)))]>, 6885 Sched<[WriteAESIMC.Folded]>, VEX, VEX_WIG; 6886} 6887def AESIMCrr : AES8I<0xDB, MRMSrcReg, (outs VR128:$dst), 6888 (ins VR128:$src1), 6889 "aesimc\t{$src1, $dst|$dst, $src1}", 6890 [(set VR128:$dst, 6891 (int_x86_aesni_aesimc VR128:$src1))]>, Sched<[WriteAESIMC]>; 6892def AESIMCrm : AES8I<0xDB, MRMSrcMem, (outs VR128:$dst), 6893 (ins i128mem:$src1), 6894 "aesimc\t{$src1, $dst|$dst, $src1}", 6895 [(set VR128:$dst, (int_x86_aesni_aesimc (memopv2i64 addr:$src1)))]>, 6896 Sched<[WriteAESIMC.Folded]>; 6897 6898// AES Round Key Generation Assist 6899let Predicates = [HasAVX, HasAES] in { 6900 def VAESKEYGENASSIST128rr : AESAI<0xDF, MRMSrcReg, (outs VR128:$dst), 6901 (ins VR128:$src1, u8imm:$src2), 6902 "vaeskeygenassist\t{$src2, $src1, $dst|$dst, $src1, $src2}", 6903 [(set VR128:$dst, 6904 (int_x86_aesni_aeskeygenassist VR128:$src1, imm:$src2))]>, 6905 Sched<[WriteAESKeyGen]>, VEX, VEX_WIG; 6906 def VAESKEYGENASSIST128rm : AESAI<0xDF, MRMSrcMem, (outs VR128:$dst), 6907 (ins i128mem:$src1, u8imm:$src2), 6908 "vaeskeygenassist\t{$src2, $src1, $dst|$dst, $src1, $src2}", 6909 [(set VR128:$dst, 6910 (int_x86_aesni_aeskeygenassist (loadv2i64 addr:$src1), imm:$src2))]>, 6911 Sched<[WriteAESKeyGen.Folded]>, VEX, 
VEX_WIG; 6912} 6913def AESKEYGENASSIST128rr : AESAI<0xDF, MRMSrcReg, (outs VR128:$dst), 6914 (ins VR128:$src1, u8imm:$src2), 6915 "aeskeygenassist\t{$src2, $src1, $dst|$dst, $src1, $src2}", 6916 [(set VR128:$dst, 6917 (int_x86_aesni_aeskeygenassist VR128:$src1, imm:$src2))]>, 6918 Sched<[WriteAESKeyGen]>; 6919def AESKEYGENASSIST128rm : AESAI<0xDF, MRMSrcMem, (outs VR128:$dst), 6920 (ins i128mem:$src1, u8imm:$src2), 6921 "aeskeygenassist\t{$src2, $src1, $dst|$dst, $src1, $src2}", 6922 [(set VR128:$dst, 6923 (int_x86_aesni_aeskeygenassist (memopv2i64 addr:$src1), imm:$src2))]>, 6924 Sched<[WriteAESKeyGen.Folded]>; 6925 6926//===----------------------------------------------------------------------===// 6927// PCLMUL Instructions 6928//===----------------------------------------------------------------------===// 6929 6930// Immediate transform to help with commuting. 6931def PCLMULCommuteImm : SDNodeXForm<imm, [{ 6932 uint8_t Imm = N->getZExtValue(); 6933 return getI8Imm((uint8_t)((Imm >> 4) | (Imm << 4)), SDLoc(N)); 6934}]>; 6935 6936// SSE carry-less Multiplication instructions 6937let Predicates = [NoAVX, HasPCLMUL] in { 6938 let Constraints = "$src1 = $dst" in { 6939 let isCommutable = 1 in 6940 def PCLMULQDQrr : PCLMULIi8<0x44, MRMSrcReg, (outs VR128:$dst), 6941 (ins VR128:$src1, VR128:$src2, u8imm:$src3), 6942 "pclmulqdq\t{$src3, $src2, $dst|$dst, $src2, $src3}", 6943 [(set VR128:$dst, 6944 (int_x86_pclmulqdq VR128:$src1, VR128:$src2, imm:$src3))]>, 6945 Sched<[WriteCLMul]>; 6946 6947 def PCLMULQDQrm : PCLMULIi8<0x44, MRMSrcMem, (outs VR128:$dst), 6948 (ins VR128:$src1, i128mem:$src2, u8imm:$src3), 6949 "pclmulqdq\t{$src3, $src2, $dst|$dst, $src2, $src3}", 6950 [(set VR128:$dst, 6951 (int_x86_pclmulqdq VR128:$src1, (memopv2i64 addr:$src2), 6952 imm:$src3))]>, 6953 Sched<[WriteCLMul.Folded, ReadAfterLd]>; 6954 } // Constraints = "$src1 = $dst" 6955 6956 def : Pat<(int_x86_pclmulqdq (memopv2i64 addr:$src2), VR128:$src1, 6957 (i8 imm:$src3)), 6958 (PCLMULQDQrm VR128:$src1, addr:$src2, 6959 (PCLMULCommuteImm imm:$src3))>; 6960} // Predicates = [NoAVX, HasPCLMUL] 6961 6962// SSE aliases 6963foreach HI = ["hq","lq"] in 6964foreach LO = ["hq","lq"] in { 6965 def : InstAlias<"pclmul" # HI # LO # "dq\t{$src, $dst|$dst, $src}", 6966 (PCLMULQDQrr VR128:$dst, VR128:$src, 6967 !add(!shl(!eq(LO,"hq"),4),!eq(HI,"hq"))), 0>; 6968 def : InstAlias<"pclmul" # HI # LO # "dq\t{$src, $dst|$dst, $src}", 6969 (PCLMULQDQrm VR128:$dst, i128mem:$src, 6970 !add(!shl(!eq(LO,"hq"),4),!eq(HI,"hq"))), 0>; 6971} 6972 6973// AVX carry-less Multiplication instructions 6974multiclass vpclmulqdq<RegisterClass RC, X86MemOperand MemOp, 6975 PatFrag LdFrag, Intrinsic IntId> { 6976 let isCommutable = 1 in 6977 def rr : PCLMULIi8<0x44, MRMSrcReg, (outs RC:$dst), 6978 (ins RC:$src1, RC:$src2, u8imm:$src3), 6979 "vpclmulqdq\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}", 6980 [(set RC:$dst, 6981 (IntId RC:$src1, RC:$src2, imm:$src3))]>, 6982 Sched<[WriteCLMul]>; 6983 6984 def rm : PCLMULIi8<0x44, MRMSrcMem, (outs RC:$dst), 6985 (ins RC:$src1, MemOp:$src2, u8imm:$src3), 6986 "vpclmulqdq\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}", 6987 [(set RC:$dst, 6988 (IntId RC:$src1, (LdFrag addr:$src2), imm:$src3))]>, 6989 Sched<[WriteCLMul.Folded, ReadAfterLd]>; 6990 6991 // We can commute a load in the first operand by swapping the sources and 6992 // rotating the immediate. 
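// (PCLMULCommuteImm swaps the two nibbles of the immediate: the low nibble
// selects the quadword of the first source and the high nibble that of the
// second, and the carry-less multiply itself is commutative, so e.g. an
// immediate of 0x01 becomes 0x10 once the operands are exchanged.)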
6993 def : Pat<(IntId (LdFrag addr:$src2), RC:$src1, (i8 imm:$src3)), 6994 (!cast<Instruction>(NAME#"rm") RC:$src1, addr:$src2, 6995 (PCLMULCommuteImm imm:$src3))>; 6996} 6997 6998let Predicates = [HasAVX, NoVLX_Or_NoVPCLMULQDQ, HasPCLMUL] in 6999defm VPCLMULQDQ : vpclmulqdq<VR128, i128mem, loadv2i64, 7000 int_x86_pclmulqdq>, VEX_4V, VEX_WIG; 7001 7002let Predicates = [NoVLX, HasVPCLMULQDQ] in 7003defm VPCLMULQDQY : vpclmulqdq<VR256, i256mem, loadv4i64, 7004 int_x86_pclmulqdq_256>, VEX_4V, VEX_L, VEX_WIG; 7005 7006multiclass vpclmulqdq_aliases_impl<string InstStr, RegisterClass RC, 7007 X86MemOperand MemOp, string Hi, string Lo> { 7008 def : InstAlias<"vpclmul"##Hi##Lo##"dq\t{$src2, $src1, $dst|$dst, $src1, $src2}", 7009 (!cast<Instruction>(InstStr # "rr") RC:$dst, RC:$src1, RC:$src2, 7010 !add(!shl(!eq(Lo,"hq"),4),!eq(Hi,"hq"))), 0>; 7011 def : InstAlias<"vpclmul"##Hi##Lo##"dq\t{$src2, $src1, $dst|$dst, $src1, $src2}", 7012 (!cast<Instruction>(InstStr # "rm") RC:$dst, RC:$src1, MemOp:$src2, 7013 !add(!shl(!eq(Lo,"hq"),4),!eq(Hi,"hq"))), 0>; 7014} 7015 7016multiclass vpclmulqdq_aliases<string InstStr, RegisterClass RC, 7017 X86MemOperand MemOp> { 7018 defm : vpclmulqdq_aliases_impl<InstStr, RC, MemOp, "hq", "hq">; 7019 defm : vpclmulqdq_aliases_impl<InstStr, RC, MemOp, "hq", "lq">; 7020 defm : vpclmulqdq_aliases_impl<InstStr, RC, MemOp, "lq", "hq">; 7021 defm : vpclmulqdq_aliases_impl<InstStr, RC, MemOp, "lq", "lq">; 7022} 7023 7024// AVX aliases 7025defm : vpclmulqdq_aliases<"VPCLMULQDQ", VR128, i128mem>; 7026defm : vpclmulqdq_aliases<"VPCLMULQDQY", VR256, i256mem>; 7027 7028//===----------------------------------------------------------------------===// 7029// SSE4A Instructions 7030//===----------------------------------------------------------------------===// 7031 7032let Predicates = [HasSSE4A] in { 7033 7034let ExeDomain = SSEPackedInt in { 7035let Constraints = "$src = $dst" in { 7036def EXTRQI : Ii8<0x78, MRMXr, (outs VR128:$dst), 7037 (ins VR128:$src, u8imm:$len, u8imm:$idx), 7038 "extrq\t{$idx, $len, $src|$src, $len, $idx}", 7039 [(set VR128:$dst, (X86extrqi VR128:$src, imm:$len, 7040 imm:$idx))]>, 7041 PD, Sched<[SchedWriteVecALU.XMM]>; 7042def EXTRQ : I<0x79, MRMSrcReg, (outs VR128:$dst), 7043 (ins VR128:$src, VR128:$mask), 7044 "extrq\t{$mask, $src|$src, $mask}", 7045 [(set VR128:$dst, (int_x86_sse4a_extrq VR128:$src, 7046 VR128:$mask))]>, 7047 PD, Sched<[SchedWriteVecALU.XMM]>; 7048 7049def INSERTQI : Ii8<0x78, MRMSrcReg, (outs VR128:$dst), 7050 (ins VR128:$src, VR128:$src2, u8imm:$len, u8imm:$idx), 7051 "insertq\t{$idx, $len, $src2, $src|$src, $src2, $len, $idx}", 7052 [(set VR128:$dst, (X86insertqi VR128:$src, VR128:$src2, 7053 imm:$len, imm:$idx))]>, 7054 XD, Sched<[SchedWriteVecALU.XMM]>; 7055def INSERTQ : I<0x79, MRMSrcReg, (outs VR128:$dst), 7056 (ins VR128:$src, VR128:$mask), 7057 "insertq\t{$mask, $src|$src, $mask}", 7058 [(set VR128:$dst, (int_x86_sse4a_insertq VR128:$src, 7059 VR128:$mask))]>, 7060 XD, Sched<[SchedWriteVecALU.XMM]>; 7061} 7062} // ExeDomain = SSEPackedInt 7063 7064// Non-temporal (unaligned) scalar stores. 
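// movntss/movntsd store the low single/double element of an XMM register
// with a non-temporal hint, so the FR32/FR64 patterns below first copy the
// scalar into VR128.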
7065let AddedComplexity = 400 in { // Prefer non-temporal versions 7066let hasSideEffects = 0, mayStore = 1, SchedRW = [SchedWriteFMoveLSNT.Scl.MR] in { 7067def MOVNTSS : I<0x2B, MRMDestMem, (outs), (ins f32mem:$dst, VR128:$src), 7068 "movntss\t{$src, $dst|$dst, $src}", []>, XS; 7069 7070def MOVNTSD : I<0x2B, MRMDestMem, (outs), (ins f64mem:$dst, VR128:$src), 7071 "movntsd\t{$src, $dst|$dst, $src}", []>, XD; 7072} // SchedRW 7073 7074def : Pat<(nontemporalstore FR32:$src, addr:$dst), 7075 (MOVNTSS addr:$dst, (v4f32 (COPY_TO_REGCLASS FR32:$src, VR128)))>; 7076 7077def : Pat<(nontemporalstore FR64:$src, addr:$dst), 7078 (MOVNTSD addr:$dst, (v2f64 (COPY_TO_REGCLASS FR64:$src, VR128)))>; 7079 7080} // AddedComplexity 7081} // HasSSE4A 7082 7083//===----------------------------------------------------------------------===// 7084// AVX Instructions 7085//===----------------------------------------------------------------------===// 7086 7087//===----------------------------------------------------------------------===// 7088// VBROADCAST - Load from memory and broadcast to all elements of the 7089// destination operand 7090// 7091class avx_broadcast_rm<bits<8> opc, string OpcodeStr, RegisterClass RC, 7092 X86MemOperand x86memop, ValueType VT, 7093 PatFrag ld_frag, SchedWrite Sched> : 7094 AVX8I<opc, MRMSrcMem, (outs RC:$dst), (ins x86memop:$src), 7095 !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), 7096 [(set RC:$dst, (VT (X86VBroadcast (ld_frag addr:$src))))]>, 7097 Sched<[Sched]>, VEX; 7098 7099// AVX2 adds register forms 7100class avx2_broadcast_rr<bits<8> opc, string OpcodeStr, RegisterClass RC, 7101 ValueType ResVT, ValueType OpVT, SchedWrite Sched> : 7102 AVX28I<opc, MRMSrcReg, (outs RC:$dst), (ins VR128:$src), 7103 !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), 7104 [(set RC:$dst, (ResVT (X86VBroadcast (OpVT VR128:$src))))]>, 7105 Sched<[Sched]>, VEX; 7106 7107let ExeDomain = SSEPackedSingle, Predicates = [HasAVX, NoVLX] in { 7108 def VBROADCASTSSrm : avx_broadcast_rm<0x18, "vbroadcastss", VR128, 7109 f32mem, v4f32, loadf32, 7110 SchedWriteFShuffle.XMM.Folded>; 7111 def VBROADCASTSSYrm : avx_broadcast_rm<0x18, "vbroadcastss", VR256, 7112 f32mem, v8f32, loadf32, 7113 SchedWriteFShuffle.XMM.Folded>, VEX_L; 7114} 7115let ExeDomain = SSEPackedDouble, Predicates = [HasAVX, NoVLX] in 7116def VBROADCASTSDYrm : avx_broadcast_rm<0x19, "vbroadcastsd", VR256, f64mem, 7117 v4f64, loadf64, 7118 SchedWriteFShuffle.XMM.Folded>, VEX_L; 7119 7120let ExeDomain = SSEPackedSingle, Predicates = [HasAVX2, NoVLX] in { 7121 def VBROADCASTSSrr : avx2_broadcast_rr<0x18, "vbroadcastss", VR128, 7122 v4f32, v4f32, SchedWriteFShuffle.XMM>; 7123 def VBROADCASTSSYrr : avx2_broadcast_rr<0x18, "vbroadcastss", VR256, 7124 v8f32, v4f32, WriteFShuffle256>, VEX_L; 7125} 7126let ExeDomain = SSEPackedDouble, Predicates = [HasAVX2, NoVLX] in 7127def VBROADCASTSDYrr : avx2_broadcast_rr<0x19, "vbroadcastsd", VR256, 7128 v4f64, v2f64, WriteFShuffle256>, VEX_L; 7129 7130let Predicates = [HasAVX, NoVLX] in { 7131 def : Pat<(v4f32 (X86VBroadcast (v4f32 (scalar_to_vector (loadf32 addr:$src))))), 7132 (VBROADCASTSSrm addr:$src)>; 7133 def : Pat<(v8f32 (X86VBroadcast (v4f32 (scalar_to_vector (loadf32 addr:$src))))), 7134 (VBROADCASTSSYrm addr:$src)>; 7135 def : Pat<(v4f64 (X86VBroadcast (v2f64 (scalar_to_vector (loadf64 addr:$src))))), 7136 (VBROADCASTSDYrm addr:$src)>; 7137} 7138 7139//===----------------------------------------------------------------------===// 7140// VBROADCAST*128 - Load from memory and broadcast 128-bit 
vector to both 7141// halves of a 256-bit vector. 7142// 7143let mayLoad = 1, hasSideEffects = 0, Predicates = [HasAVX2] in 7144def VBROADCASTI128 : AVX8I<0x5A, MRMSrcMem, (outs VR256:$dst), 7145 (ins i128mem:$src), 7146 "vbroadcasti128\t{$src, $dst|$dst, $src}", []>, 7147 Sched<[WriteShuffleLd]>, VEX, VEX_L; 7148 7149let mayLoad = 1, hasSideEffects = 0, Predicates = [HasAVX], 7150 ExeDomain = SSEPackedSingle in 7151def VBROADCASTF128 : AVX8I<0x1A, MRMSrcMem, (outs VR256:$dst), 7152 (ins f128mem:$src), 7153 "vbroadcastf128\t{$src, $dst|$dst, $src}", []>, 7154 Sched<[SchedWriteFShuffle.XMM.Folded]>, VEX, VEX_L; 7155 7156let Predicates = [HasAVX2, NoVLX] in { 7157def : Pat<(v4i64 (X86SubVBroadcast (loadv2i64 addr:$src))), 7158 (VBROADCASTI128 addr:$src)>; 7159def : Pat<(v8i32 (X86SubVBroadcast (bc_v4i32 (loadv2i64 addr:$src)))), 7160 (VBROADCASTI128 addr:$src)>; 7161def : Pat<(v16i16 (X86SubVBroadcast (bc_v8i16 (loadv2i64 addr:$src)))), 7162 (VBROADCASTI128 addr:$src)>; 7163def : Pat<(v32i8 (X86SubVBroadcast (bc_v16i8 (loadv2i64 addr:$src)))), 7164 (VBROADCASTI128 addr:$src)>; 7165} 7166 7167let Predicates = [HasAVX, NoVLX] in { 7168def : Pat<(v4f64 (X86SubVBroadcast (loadv2f64 addr:$src))), 7169 (VBROADCASTF128 addr:$src)>; 7170def : Pat<(v8f32 (X86SubVBroadcast (loadv4f32 addr:$src))), 7171 (VBROADCASTF128 addr:$src)>; 7172} 7173 7174let Predicates = [HasAVX1Only] in { 7175def : Pat<(v4i64 (X86SubVBroadcast (loadv2i64 addr:$src))), 7176 (VBROADCASTF128 addr:$src)>; 7177def : Pat<(v8i32 (X86SubVBroadcast (bc_v4i32 (loadv2i64 addr:$src)))), 7178 (VBROADCASTF128 addr:$src)>; 7179def : Pat<(v16i16 (X86SubVBroadcast (bc_v8i16 (loadv2i64 addr:$src)))), 7180 (VBROADCASTF128 addr:$src)>; 7181def : Pat<(v32i8 (X86SubVBroadcast (bc_v16i8 (loadv2i64 addr:$src)))), 7182 (VBROADCASTF128 addr:$src)>; 7183} 7184 7185//===----------------------------------------------------------------------===// 7186// VINSERTF128 - Insert packed floating-point values 7187// 7188let hasSideEffects = 0, ExeDomain = SSEPackedSingle in { 7189def VINSERTF128rr : AVXAIi8<0x18, MRMSrcReg, (outs VR256:$dst), 7190 (ins VR256:$src1, VR128:$src2, u8imm:$src3), 7191 "vinsertf128\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}", 7192 []>, Sched<[WriteFShuffle256]>, VEX_4V, VEX_L; 7193let mayLoad = 1 in 7194def VINSERTF128rm : AVXAIi8<0x18, MRMSrcMem, (outs VR256:$dst), 7195 (ins VR256:$src1, f128mem:$src2, u8imm:$src3), 7196 "vinsertf128\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}", 7197 []>, Sched<[WriteFShuffle256Ld, ReadAfterLd]>, VEX_4V, VEX_L; 7198} 7199 7200// To create a 256-bit all ones value, we should produce VCMPTRUEPS 7201// with YMM register containing zero. 7202// FIXME: Avoid producing vxorps to clear the fake inputs. 
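// Compare predicate 0xf (TRUE) sets every result bit regardless of the
// inputs, so comparing two zeroed registers yields the all-ones vector.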
7203let Predicates = [HasAVX1Only] in { 7204def : Pat<(v8i32 immAllOnesV), (VCMPPSYrri (AVX_SET0), (AVX_SET0), 0xf)>; 7205} 7206 7207multiclass vinsert_lowering<string InstrStr, ValueType From, ValueType To, 7208 PatFrag memop_frag> { 7209 def : Pat<(vinsert128_insert:$ins (To VR256:$src1), (From VR128:$src2), 7210 (iPTR imm)), 7211 (!cast<Instruction>(InstrStr#rr) VR256:$src1, VR128:$src2, 7212 (INSERT_get_vinsert128_imm VR256:$ins))>; 7213 def : Pat<(vinsert128_insert:$ins (To VR256:$src1), 7214 (From (bitconvert (memop_frag addr:$src2))), 7215 (iPTR imm)), 7216 (!cast<Instruction>(InstrStr#rm) VR256:$src1, addr:$src2, 7217 (INSERT_get_vinsert128_imm VR256:$ins))>; 7218} 7219 7220let Predicates = [HasAVX, NoVLX] in { 7221 defm : vinsert_lowering<"VINSERTF128", v4f32, v8f32, loadv4f32>; 7222 defm : vinsert_lowering<"VINSERTF128", v2f64, v4f64, loadv2f64>; 7223} 7224 7225let Predicates = [HasAVX1Only] in { 7226 defm : vinsert_lowering<"VINSERTF128", v2i64, v4i64, loadv2i64>; 7227 defm : vinsert_lowering<"VINSERTF128", v4i32, v8i32, loadv2i64>; 7228 defm : vinsert_lowering<"VINSERTF128", v8i16, v16i16, loadv2i64>; 7229 defm : vinsert_lowering<"VINSERTF128", v16i8, v32i8, loadv2i64>; 7230} 7231 7232//===----------------------------------------------------------------------===// 7233// VEXTRACTF128 - Extract packed floating-point values 7234// 7235let hasSideEffects = 0, ExeDomain = SSEPackedSingle in { 7236def VEXTRACTF128rr : AVXAIi8<0x19, MRMDestReg, (outs VR128:$dst), 7237 (ins VR256:$src1, u8imm:$src2), 7238 "vextractf128\t{$src2, $src1, $dst|$dst, $src1, $src2}", 7239 []>, Sched<[WriteFShuffle256]>, VEX, VEX_L; 7240let mayStore = 1 in 7241def VEXTRACTF128mr : AVXAIi8<0x19, MRMDestMem, (outs), 7242 (ins f128mem:$dst, VR256:$src1, u8imm:$src2), 7243 "vextractf128\t{$src2, $src1, $dst|$dst, $src1, $src2}", 7244 []>, Sched<[WriteFStoreX]>, VEX, VEX_L; 7245} 7246 7247multiclass vextract_lowering<string InstrStr, ValueType From, ValueType To> { 7248 def : Pat<(vextract128_extract:$ext VR256:$src1, (iPTR imm)), 7249 (To (!cast<Instruction>(InstrStr#rr) 7250 (From VR256:$src1), 7251 (EXTRACT_get_vextract128_imm VR128:$ext)))>; 7252 def : Pat<(store (To (vextract128_extract:$ext (From VR256:$src1), 7253 (iPTR imm))), addr:$dst), 7254 (!cast<Instruction>(InstrStr#mr) addr:$dst, VR256:$src1, 7255 (EXTRACT_get_vextract128_imm VR128:$ext))>; 7256} 7257 7258// AVX1 patterns 7259let Predicates = [HasAVX, NoVLX] in { 7260 defm : vextract_lowering<"VEXTRACTF128", v8f32, v4f32>; 7261 defm : vextract_lowering<"VEXTRACTF128", v4f64, v2f64>; 7262} 7263 7264let Predicates = [HasAVX1Only] in { 7265 defm : vextract_lowering<"VEXTRACTF128", v4i64, v2i64>; 7266 defm : vextract_lowering<"VEXTRACTF128", v8i32, v4i32>; 7267 defm : vextract_lowering<"VEXTRACTF128", v16i16, v8i16>; 7268 defm : vextract_lowering<"VEXTRACTF128", v32i8, v16i8>; 7269} 7270 7271//===----------------------------------------------------------------------===// 7272// VMASKMOV - Conditional SIMD Packed Loads and Stores 7273// 7274multiclass avx_movmask_rm<bits<8> opc_rm, bits<8> opc_mr, string OpcodeStr, 7275 Intrinsic IntLd, Intrinsic IntLd256, 7276 Intrinsic IntSt, Intrinsic IntSt256> { 7277 def rm : AVX8I<opc_rm, MRMSrcMem, (outs VR128:$dst), 7278 (ins VR128:$src1, f128mem:$src2), 7279 !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), 7280 [(set VR128:$dst, (IntLd addr:$src2, VR128:$src1))]>, 7281 VEX_4V, Sched<[WriteFMaskedLoad]>; 7282 def Yrm : AVX8I<opc_rm, MRMSrcMem, (outs VR256:$dst), 7283 (ins VR256:$src1, 
f256mem:$src2), 7284 !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), 7285 [(set VR256:$dst, (IntLd256 addr:$src2, VR256:$src1))]>, 7286 VEX_4V, VEX_L, Sched<[WriteFMaskedLoadY]>; 7287 def mr : AVX8I<opc_mr, MRMDestMem, (outs), 7288 (ins f128mem:$dst, VR128:$src1, VR128:$src2), 7289 !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), 7290 [(IntSt addr:$dst, VR128:$src1, VR128:$src2)]>, 7291 VEX_4V, Sched<[WriteFMaskedStore]>; 7292 def Ymr : AVX8I<opc_mr, MRMDestMem, (outs), 7293 (ins f256mem:$dst, VR256:$src1, VR256:$src2), 7294 !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), 7295 [(IntSt256 addr:$dst, VR256:$src1, VR256:$src2)]>, 7296 VEX_4V, VEX_L, Sched<[WriteFMaskedStoreY]>; 7297} 7298 7299let ExeDomain = SSEPackedSingle in 7300defm VMASKMOVPS : avx_movmask_rm<0x2C, 0x2E, "vmaskmovps", 7301 int_x86_avx_maskload_ps, 7302 int_x86_avx_maskload_ps_256, 7303 int_x86_avx_maskstore_ps, 7304 int_x86_avx_maskstore_ps_256>; 7305let ExeDomain = SSEPackedDouble in 7306defm VMASKMOVPD : avx_movmask_rm<0x2D, 0x2F, "vmaskmovpd", 7307 int_x86_avx_maskload_pd, 7308 int_x86_avx_maskload_pd_256, 7309 int_x86_avx_maskstore_pd, 7310 int_x86_avx_maskstore_pd_256>; 7311 7312//===----------------------------------------------------------------------===// 7313// VPERMIL - Permute Single and Double Floating-Point Values 7314// 7315 7316multiclass avx_permil<bits<8> opc_rm, bits<8> opc_rmi, string OpcodeStr, 7317 RegisterClass RC, X86MemOperand x86memop_f, 7318 X86MemOperand x86memop_i, PatFrag i_frag, 7319 ValueType f_vt, ValueType i_vt, 7320 X86FoldableSchedWrite sched, 7321 X86FoldableSchedWrite varsched> { 7322 let Predicates = [HasAVX, NoVLX] in { 7323 def rr : AVX8I<opc_rm, MRMSrcReg, (outs RC:$dst), 7324 (ins RC:$src1, RC:$src2), 7325 !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), 7326 [(set RC:$dst, (f_vt (X86VPermilpv RC:$src1, (i_vt RC:$src2))))]>, VEX_4V, 7327 Sched<[varsched]>; 7328 def rm : AVX8I<opc_rm, MRMSrcMem, (outs RC:$dst), 7329 (ins RC:$src1, x86memop_i:$src2), 7330 !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), 7331 [(set RC:$dst, (f_vt (X86VPermilpv RC:$src1, 7332 (i_vt (bitconvert (i_frag addr:$src2))))))]>, VEX_4V, 7333 Sched<[varsched.Folded, ReadAfterLd]>; 7334 7335 def ri : AVXAIi8<opc_rmi, MRMSrcReg, (outs RC:$dst), 7336 (ins RC:$src1, u8imm:$src2), 7337 !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), 7338 [(set RC:$dst, (f_vt (X86VPermilpi RC:$src1, (i8 imm:$src2))))]>, VEX, 7339 Sched<[sched]>; 7340 def mi : AVXAIi8<opc_rmi, MRMSrcMem, (outs RC:$dst), 7341 (ins x86memop_f:$src1, u8imm:$src2), 7342 !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), 7343 [(set RC:$dst, 7344 (f_vt (X86VPermilpi (load addr:$src1), (i8 imm:$src2))))]>, VEX, 7345 Sched<[sched.Folded]>; 7346 }// Predicates = [HasAVX, NoVLX] 7347} 7348 7349let ExeDomain = SSEPackedSingle in { 7350 defm VPERMILPS : avx_permil<0x0C, 0x04, "vpermilps", VR128, f128mem, i128mem, 7351 loadv2i64, v4f32, v4i32, SchedWriteFShuffle.XMM, 7352 SchedWriteFVarShuffle.XMM>; 7353 defm VPERMILPSY : avx_permil<0x0C, 0x04, "vpermilps", VR256, f256mem, i256mem, 7354 loadv4i64, v8f32, v8i32, SchedWriteFShuffle.YMM, 7355 SchedWriteFVarShuffle.YMM>, VEX_L; 7356} 7357let ExeDomain = SSEPackedDouble in { 7358 defm VPERMILPD : avx_permil<0x0D, 0x05, "vpermilpd", VR128, f128mem, i128mem, 7359 loadv2i64, v2f64, v2i64, SchedWriteFShuffle.XMM, 7360 SchedWriteFVarShuffle.XMM>; 7361 defm VPERMILPDY : avx_permil<0x0D, 0x05, 
"vpermilpd", VR256, f256mem, i256mem, 7362 loadv4i64, v4f64, v4i64, SchedWriteFShuffle.YMM, 7363 SchedWriteFVarShuffle.YMM>, VEX_L; 7364} 7365 7366//===----------------------------------------------------------------------===// 7367// VPERM2F128 - Permute Floating-Point Values in 128-bit chunks 7368// 7369 7370let ExeDomain = SSEPackedSingle in { 7371let isCommutable = 1 in 7372def VPERM2F128rr : AVXAIi8<0x06, MRMSrcReg, (outs VR256:$dst), 7373 (ins VR256:$src1, VR256:$src2, u8imm:$src3), 7374 "vperm2f128\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}", 7375 [(set VR256:$dst, (v4f64 (X86VPerm2x128 VR256:$src1, VR256:$src2, 7376 (i8 imm:$src3))))]>, VEX_4V, VEX_L, 7377 Sched<[WriteFShuffle256]>; 7378def VPERM2F128rm : AVXAIi8<0x06, MRMSrcMem, (outs VR256:$dst), 7379 (ins VR256:$src1, f256mem:$src2, u8imm:$src3), 7380 "vperm2f128\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}", 7381 [(set VR256:$dst, (X86VPerm2x128 VR256:$src1, (loadv4f64 addr:$src2), 7382 (i8 imm:$src3)))]>, VEX_4V, VEX_L, 7383 Sched<[WriteFShuffle256Ld, ReadAfterLd]>; 7384} 7385 7386// Immediate transform to help with commuting. 7387def Perm2XCommuteImm : SDNodeXForm<imm, [{ 7388 return getI8Imm(N->getZExtValue() ^ 0x22, SDLoc(N)); 7389}]>; 7390 7391let Predicates = [HasAVX] in { 7392// Pattern with load in other operand. 7393def : Pat<(v4f64 (X86VPerm2x128 (loadv4f64 addr:$src2), 7394 VR256:$src1, (i8 imm:$imm))), 7395 (VPERM2F128rm VR256:$src1, addr:$src2, (Perm2XCommuteImm imm:$imm))>; 7396} 7397 7398let Predicates = [HasAVX1Only] in { 7399def : Pat<(v4i64 (X86VPerm2x128 VR256:$src1, VR256:$src2, (i8 imm:$imm))), 7400 (VPERM2F128rr VR256:$src1, VR256:$src2, imm:$imm)>; 7401def : Pat<(v4i64 (X86VPerm2x128 VR256:$src1, 7402 (loadv4i64 addr:$src2), (i8 imm:$imm))), 7403 (VPERM2F128rm VR256:$src1, addr:$src2, imm:$imm)>; 7404// Pattern with load in other operand. 7405def : Pat<(v4i64 (X86VPerm2x128 (loadv4i64 addr:$src2), 7406 VR256:$src1, (i8 imm:$imm))), 7407 (VPERM2F128rm VR256:$src1, addr:$src2, (Perm2XCommuteImm imm:$imm))>; 7408} 7409 7410//===----------------------------------------------------------------------===// 7411// VZERO - Zero YMM registers 7412// Note: These instruction do not affect the YMM16-YMM31. 
7413// 7414 7415let SchedRW = [WriteSystem] in { 7416let Defs = [YMM0, YMM1, YMM2, YMM3, YMM4, YMM5, YMM6, YMM7, 7417 YMM8, YMM9, YMM10, YMM11, YMM12, YMM13, YMM14, YMM15] in { 7418 // Zero All YMM registers 7419 def VZEROALL : I<0x77, RawFrm, (outs), (ins), "vzeroall", 7420 [(int_x86_avx_vzeroall)]>, PS, VEX, VEX_L, 7421 Requires<[HasAVX]>, VEX_WIG; 7422 7423 // Zero Upper bits of YMM registers 7424 def VZEROUPPER : I<0x77, RawFrm, (outs), (ins), "vzeroupper", 7425 [(int_x86_avx_vzeroupper)]>, PS, VEX, 7426 Requires<[HasAVX]>, VEX_WIG; 7427} // Defs 7428} // SchedRW 7429 7430//===----------------------------------------------------------------------===// 7431// Half precision conversion instructions 7432// 7433 7434multiclass f16c_ph2ps<RegisterClass RC, X86MemOperand x86memop, 7435 X86FoldableSchedWrite sched> { 7436 def rr : I<0x13, MRMSrcReg, (outs RC:$dst), (ins VR128:$src), 7437 "vcvtph2ps\t{$src, $dst|$dst, $src}", 7438 [(set RC:$dst, (X86cvtph2ps VR128:$src))]>, 7439 T8PD, VEX, Sched<[sched]>; 7440 let hasSideEffects = 0, mayLoad = 1 in 7441 def rm : I<0x13, MRMSrcMem, (outs RC:$dst), (ins x86memop:$src), 7442 "vcvtph2ps\t{$src, $dst|$dst, $src}", 7443 [(set RC:$dst, (X86cvtph2ps (bc_v8i16 7444 (loadv2i64 addr:$src))))]>, 7445 T8PD, VEX, Sched<[sched.Folded]>; 7446} 7447 7448multiclass f16c_ps2ph<RegisterClass RC, X86MemOperand x86memop, 7449 SchedWrite RR, SchedWrite MR> { 7450 def rr : Ii8<0x1D, MRMDestReg, (outs VR128:$dst), 7451 (ins RC:$src1, i32u8imm:$src2), 7452 "vcvtps2ph\t{$src2, $src1, $dst|$dst, $src1, $src2}", 7453 [(set VR128:$dst, (X86cvtps2ph RC:$src1, imm:$src2))]>, 7454 TAPD, VEX, Sched<[RR]>; 7455 let hasSideEffects = 0, mayStore = 1 in 7456 def mr : Ii8<0x1D, MRMDestMem, (outs), 7457 (ins x86memop:$dst, RC:$src1, i32u8imm:$src2), 7458 "vcvtps2ph\t{$src2, $src1, $dst|$dst, $src1, $src2}", []>, 7459 TAPD, VEX, Sched<[MR]>; 7460} 7461 7462let Predicates = [HasF16C, NoVLX] in { 7463 defm VCVTPH2PS : f16c_ph2ps<VR128, f64mem, WriteCvtPH2PS>; 7464 defm VCVTPH2PSY : f16c_ph2ps<VR256, f128mem, WriteCvtPH2PSY>, VEX_L; 7465 defm VCVTPS2PH : f16c_ps2ph<VR128, f64mem, WriteCvtPS2PH, 7466 WriteCvtPS2PHSt>; 7467 defm VCVTPS2PHY : f16c_ps2ph<VR256, f128mem, WriteCvtPS2PHY, 7468 WriteCvtPS2PHYSt>, VEX_L; 7469 7470 // Pattern match vcvtph2ps of a scalar i64 load. 7471 def : Pat<(v4f32 (X86cvtph2ps (v8i16 (vzmovl_v2i64 addr:$src)))), 7472 (VCVTPH2PSrm addr:$src)>; 7473 def : Pat<(v4f32 (X86cvtph2ps (v8i16 (vzload_v2i64 addr:$src)))), 7474 (VCVTPH2PSrm addr:$src)>; 7475 def : Pat<(v4f32 (X86cvtph2ps (v8i16 (bitconvert 7476 (v2i64 (scalar_to_vector (loadi64 addr:$src))))))), 7477 (VCVTPH2PSrm addr:$src)>; 7478 7479 def : Pat<(store (f64 (extractelt 7480 (bc_v2f64 (v8i16 (X86cvtps2ph VR128:$src1, i32:$src2))), 7481 (iPTR 0))), addr:$dst), 7482 (VCVTPS2PHmr addr:$dst, VR128:$src1, imm:$src2)>; 7483 def : Pat<(store (i64 (extractelt 7484 (bc_v2i64 (v8i16 (X86cvtps2ph VR128:$src1, i32:$src2))), 7485 (iPTR 0))), addr:$dst), 7486 (VCVTPS2PHmr addr:$dst, VR128:$src1, imm:$src2)>; 7487 def : Pat<(store (v8i16 (X86cvtps2ph VR256:$src1, i32:$src2)), addr:$dst), 7488 (VCVTPS2PHYmr addr:$dst, VR256:$src1, imm:$src2)>; 7489} 7490 7491// Patterns for matching conversions from float to half-float and vice versa. 7492let Predicates = [HasF16C, NoVLX] in { 7493 // Use MXCSR.RC for rounding instead of explicitly specifying the default 7494 // rounding mode (Nearest-Even, encoded as 0). Both are equivalent in the 7495 // configurations we support (the default). 
However, falling back to MXCSR is 7496 // more consistent with other instructions, which are always controlled by it. 7497 // It's encoded as 0b100. 7498 def : Pat<(fp_to_f16 FR32:$src), 7499 (i16 (EXTRACT_SUBREG (VMOVPDI2DIrr (v8i16 (VCVTPS2PHrr 7500 (v4f32 (COPY_TO_REGCLASS FR32:$src, VR128)), 4))), sub_16bit))>; 7501 7502 def : Pat<(f16_to_fp GR16:$src), 7503 (f32 (COPY_TO_REGCLASS (v4f32 (VCVTPH2PSrr 7504 (v4i32 (COPY_TO_REGCLASS (MOVSX32rr16 GR16:$src), VR128)))), FR32)) >; 7505 7506 def : Pat<(f16_to_fp (i16 (fp_to_f16 FR32:$src))), 7507 (f32 (COPY_TO_REGCLASS (v4f32 (VCVTPH2PSrr 7508 (v8i16 (VCVTPS2PHrr (v4f32 (COPY_TO_REGCLASS FR32:$src, VR128)), 4)))), FR32)) >; 7509} 7510 7511//===----------------------------------------------------------------------===// 7512// AVX2 Instructions 7513//===----------------------------------------------------------------------===// 7514 7515/// AVX2_blend_rmi - AVX2 blend with 8-bit immediate 7516multiclass AVX2_blend_rmi<bits<8> opc, string OpcodeStr, SDNode OpNode, 7517 ValueType OpVT, X86FoldableSchedWrite sched, 7518 RegisterClass RC, PatFrag memop_frag, 7519 X86MemOperand x86memop, SDNodeXForm commuteXForm> { 7520 let isCommutable = 1 in 7521 def rri : AVX2AIi8<opc, MRMSrcReg, (outs RC:$dst), 7522 (ins RC:$src1, RC:$src2, u8imm:$src3), 7523 !strconcat(OpcodeStr, 7524 "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"), 7525 [(set RC:$dst, (OpVT (OpNode RC:$src1, RC:$src2, imm:$src3)))]>, 7526 Sched<[sched]>, VEX_4V; 7527 def rmi : AVX2AIi8<opc, MRMSrcMem, (outs RC:$dst), 7528 (ins RC:$src1, x86memop:$src2, u8imm:$src3), 7529 !strconcat(OpcodeStr, 7530 "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"), 7531 [(set RC:$dst, 7532 (OpVT (OpNode RC:$src1, 7533 (bitconvert (memop_frag addr:$src2)), imm:$src3)))]>, 7534 Sched<[sched.Folded, ReadAfterLd]>, VEX_4V; 7535 7536 // Pattern to commute if load is in first source. 7537 def : Pat<(OpVT (OpNode (bitconvert (memop_frag addr:$src2)), 7538 RC:$src1, imm:$src3)), 7539 (!cast<Instruction>(NAME#"rmi") RC:$src1, addr:$src2, 7540 (commuteXForm imm:$src3))>; 7541} 7542 7543defm VPBLENDD : AVX2_blend_rmi<0x02, "vpblendd", X86Blendi, v4i32, 7544 SchedWriteBlend.XMM, VR128, loadv2i64, i128mem, 7545 BlendCommuteImm4>; 7546defm VPBLENDDY : AVX2_blend_rmi<0x02, "vpblendd", X86Blendi, v8i32, 7547 SchedWriteBlend.YMM, VR256, loadv4i64, i256mem, 7548 BlendCommuteImm8>, VEX_L; 7549 7550// For insertion into the zero index (low half) of a 256-bit vector, it is 7551// more efficient to generate a blend with immediate instead of an insert*128. 
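// The 0xf immediate below takes the low four elements from the widened
// 128-bit value and the remaining elements from the original 256-bit source.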
7552let Predicates = [HasAVX2] in { 7553def : Pat<(insert_subvector (v8i32 VR256:$src1), (v4i32 VR128:$src2), (iPTR 0)), 7554 (VPBLENDDYrri VR256:$src1, 7555 (INSERT_SUBREG (v8i32 (IMPLICIT_DEF)), 7556 VR128:$src2, sub_xmm), 0xf)>; 7557def : Pat<(insert_subvector (v4i64 VR256:$src1), (v2i64 VR128:$src2), (iPTR 0)), 7558 (VPBLENDDYrri VR256:$src1, 7559 (INSERT_SUBREG (v8i32 (IMPLICIT_DEF)), 7560 VR128:$src2, sub_xmm), 0xf)>; 7561def : Pat<(insert_subvector (v16i16 VR256:$src1), (v8i16 VR128:$src2), (iPTR 0)), 7562 (VPBLENDDYrri VR256:$src1, 7563 (INSERT_SUBREG (v8i32 (IMPLICIT_DEF)), 7564 VR128:$src2, sub_xmm), 0xf)>; 7565def : Pat<(insert_subvector (v32i8 VR256:$src1), (v16i8 VR128:$src2), (iPTR 0)), 7566 (VPBLENDDYrri VR256:$src1, 7567 (INSERT_SUBREG (v8i32 (IMPLICIT_DEF)), 7568 VR128:$src2, sub_xmm), 0xf)>; 7569} 7570 7571let Predicates = [HasAVX1Only] in { 7572def : Pat<(insert_subvector (v8i32 VR256:$src1), (v4i32 VR128:$src2), (iPTR 0)), 7573 (VBLENDPSYrri VR256:$src1, 7574 (INSERT_SUBREG (v8i32 (IMPLICIT_DEF)), 7575 VR128:$src2, sub_xmm), 0xf)>; 7576def : Pat<(insert_subvector (v4i64 VR256:$src1), (v2i64 VR128:$src2), (iPTR 0)), 7577 (VBLENDPSYrri VR256:$src1, 7578 (INSERT_SUBREG (v8i32 (IMPLICIT_DEF)), 7579 VR128:$src2, sub_xmm), 0xf)>; 7580def : Pat<(insert_subvector (v16i16 VR256:$src1), (v8i16 VR128:$src2), (iPTR 0)), 7581 (VBLENDPSYrri VR256:$src1, 7582 (INSERT_SUBREG (v8i32 (IMPLICIT_DEF)), 7583 VR128:$src2, sub_xmm), 0xf)>; 7584def : Pat<(insert_subvector (v32i8 VR256:$src1), (v16i8 VR128:$src2), (iPTR 0)), 7585 (VBLENDPSYrri VR256:$src1, 7586 (INSERT_SUBREG (v8i32 (IMPLICIT_DEF)), 7587 VR128:$src2, sub_xmm), 0xf)>; 7588} 7589 7590//===----------------------------------------------------------------------===// 7591// VPBROADCAST - Load from memory and broadcast to all elements of the 7592// destination operand 7593// 7594multiclass avx2_broadcast<bits<8> opc, string OpcodeStr, 7595 X86MemOperand x86memop, PatFrag ld_frag, 7596 ValueType OpVT128, ValueType OpVT256, Predicate prd> { 7597 let Predicates = [HasAVX2, prd] in { 7598 def rr : AVX28I<opc, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), 7599 !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), 7600 [(set VR128:$dst, 7601 (OpVT128 (X86VBroadcast (OpVT128 VR128:$src))))]>, 7602 Sched<[SchedWriteShuffle.XMM]>, VEX; 7603 def rm : AVX28I<opc, MRMSrcMem, (outs VR128:$dst), (ins x86memop:$src), 7604 !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), 7605 [(set VR128:$dst, 7606 (OpVT128 (X86VBroadcast (ld_frag addr:$src))))]>, 7607 Sched<[SchedWriteShuffle.XMM.Folded]>, VEX; 7608 def Yrr : AVX28I<opc, MRMSrcReg, (outs VR256:$dst), (ins VR128:$src), 7609 !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), 7610 [(set VR256:$dst, 7611 (OpVT256 (X86VBroadcast (OpVT128 VR128:$src))))]>, 7612 Sched<[WriteShuffle256]>, VEX, VEX_L; 7613 def Yrm : AVX28I<opc, MRMSrcMem, (outs VR256:$dst), (ins x86memop:$src), 7614 !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), 7615 [(set VR256:$dst, 7616 (OpVT256 (X86VBroadcast (ld_frag addr:$src))))]>, 7617 Sched<[SchedWriteShuffle.XMM.Folded]>, VEX, VEX_L; 7618 7619 // Provide aliases for broadcast from the same register class that 7620 // automatically does the extract. 
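// (the Yrr form is simply applied to the low xmm subregister of the ymm
// source)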
7621 def : Pat<(OpVT256 (X86VBroadcast (OpVT256 VR256:$src))), 7622 (!cast<Instruction>(NAME#"Yrr") 7623 (OpVT128 (EXTRACT_SUBREG (OpVT256 VR256:$src),sub_xmm)))>; 7624 } 7625} 7626 7627defm VPBROADCASTB : avx2_broadcast<0x78, "vpbroadcastb", i8mem, loadi8, 7628 v16i8, v32i8, NoVLX_Or_NoBWI>; 7629defm VPBROADCASTW : avx2_broadcast<0x79, "vpbroadcastw", i16mem, loadi16, 7630 v8i16, v16i16, NoVLX_Or_NoBWI>; 7631defm VPBROADCASTD : avx2_broadcast<0x58, "vpbroadcastd", i32mem, loadi32, 7632 v4i32, v8i32, NoVLX>; 7633defm VPBROADCASTQ : avx2_broadcast<0x59, "vpbroadcastq", i64mem, loadi64, 7634 v2i64, v4i64, NoVLX>; 7635 7636let Predicates = [HasAVX2, NoVLX] in { 7637 // 32-bit targets will fail to load a i64 directly but can use ZEXT_LOAD. 7638 def : Pat<(v2i64 (X86VBroadcast (v2i64 (X86vzload addr:$src)))), 7639 (VPBROADCASTQrm addr:$src)>; 7640 def : Pat<(v4i64 (X86VBroadcast (v4i64 (X86vzload addr:$src)))), 7641 (VPBROADCASTQYrm addr:$src)>; 7642 7643 def : Pat<(v4i32 (X86VBroadcast (v4i32 (scalar_to_vector (loadi32 addr:$src))))), 7644 (VPBROADCASTDrm addr:$src)>; 7645 def : Pat<(v8i32 (X86VBroadcast (v4i32 (scalar_to_vector (loadi32 addr:$src))))), 7646 (VPBROADCASTDYrm addr:$src)>; 7647 def : Pat<(v2i64 (X86VBroadcast (v2i64 (scalar_to_vector (loadi64 addr:$src))))), 7648 (VPBROADCASTQrm addr:$src)>; 7649 def : Pat<(v4i64 (X86VBroadcast (v2i64 (scalar_to_vector (loadi64 addr:$src))))), 7650 (VPBROADCASTQYrm addr:$src)>; 7651} 7652let Predicates = [HasAVX2, NoVLX_Or_NoBWI] in { 7653 // loadi16 is tricky to fold, because !isTypeDesirableForOp, justifiably. 7654 // This means we'll encounter truncated i32 loads; match that here. 7655 def : Pat<(v8i16 (X86VBroadcast (i16 (trunc (i32 (load addr:$src)))))), 7656 (VPBROADCASTWrm addr:$src)>; 7657 def : Pat<(v16i16 (X86VBroadcast (i16 (trunc (i32 (load addr:$src)))))), 7658 (VPBROADCASTWYrm addr:$src)>; 7659 def : Pat<(v8i16 (X86VBroadcast 7660 (i16 (trunc (i32 (zextloadi16 addr:$src)))))), 7661 (VPBROADCASTWrm addr:$src)>; 7662 def : Pat<(v16i16 (X86VBroadcast 7663 (i16 (trunc (i32 (zextloadi16 addr:$src)))))), 7664 (VPBROADCASTWYrm addr:$src)>; 7665} 7666 7667let Predicates = [HasAVX2, NoVLX] in { 7668 // Provide aliases for broadcast from the same register class that 7669 // automatically does the extract. 7670 def : Pat<(v8f32 (X86VBroadcast (v8f32 VR256:$src))), 7671 (VBROADCASTSSYrr (v4f32 (EXTRACT_SUBREG (v8f32 VR256:$src), 7672 sub_xmm)))>; 7673 def : Pat<(v4f64 (X86VBroadcast (v4f64 VR256:$src))), 7674 (VBROADCASTSDYrr (v2f64 (EXTRACT_SUBREG (v4f64 VR256:$src), 7675 sub_xmm)))>; 7676} 7677 7678let Predicates = [HasAVX2, NoVLX] in { 7679 // Provide fallback in case the load node that is used in the patterns above 7680 // is used by additional users, which prevents the pattern selection. 
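// When the scalar is still in a register, copy it into VR128 and use the
// register form of the broadcast instead of the load-folding form.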
7681 def : Pat<(v4f32 (X86VBroadcast FR32:$src)), 7682 (VBROADCASTSSrr (v4f32 (COPY_TO_REGCLASS FR32:$src, VR128)))>; 7683 def : Pat<(v8f32 (X86VBroadcast FR32:$src)), 7684 (VBROADCASTSSYrr (v4f32 (COPY_TO_REGCLASS FR32:$src, VR128)))>; 7685 def : Pat<(v4f64 (X86VBroadcast FR64:$src)), 7686 (VBROADCASTSDYrr (v2f64 (COPY_TO_REGCLASS FR64:$src, VR128)))>; 7687} 7688 7689let Predicates = [HasAVX2, NoVLX_Or_NoBWI] in { 7690 def : Pat<(v16i8 (X86VBroadcast GR8:$src)), 7691 (VPBROADCASTBrr (v16i8 (COPY_TO_REGCLASS 7692 (i32 (INSERT_SUBREG (i32 (IMPLICIT_DEF)), 7693 GR8:$src, sub_8bit)), 7694 VR128)))>; 7695 def : Pat<(v32i8 (X86VBroadcast GR8:$src)), 7696 (VPBROADCASTBYrr (v16i8 (COPY_TO_REGCLASS 7697 (i32 (INSERT_SUBREG (i32 (IMPLICIT_DEF)), 7698 GR8:$src, sub_8bit)), 7699 VR128)))>; 7700 7701 def : Pat<(v8i16 (X86VBroadcast GR16:$src)), 7702 (VPBROADCASTWrr (v8i16 (COPY_TO_REGCLASS 7703 (i32 (INSERT_SUBREG (i32 (IMPLICIT_DEF)), 7704 GR16:$src, sub_16bit)), 7705 VR128)))>; 7706 def : Pat<(v16i16 (X86VBroadcast GR16:$src)), 7707 (VPBROADCASTWYrr (v8i16 (COPY_TO_REGCLASS 7708 (i32 (INSERT_SUBREG (i32 (IMPLICIT_DEF)), 7709 GR16:$src, sub_16bit)), 7710 VR128)))>; 7711} 7712let Predicates = [HasAVX2, NoVLX] in { 7713 def : Pat<(v4i32 (X86VBroadcast GR32:$src)), 7714 (VPBROADCASTDrr (v4i32 (COPY_TO_REGCLASS GR32:$src, VR128)))>; 7715 def : Pat<(v8i32 (X86VBroadcast GR32:$src)), 7716 (VPBROADCASTDYrr (v4i32 (COPY_TO_REGCLASS GR32:$src, VR128)))>; 7717 def : Pat<(v2i64 (X86VBroadcast GR64:$src)), 7718 (VPBROADCASTQrr (v2i64 (COPY_TO_REGCLASS GR64:$src, VR128)))>; 7719 def : Pat<(v4i64 (X86VBroadcast GR64:$src)), 7720 (VPBROADCASTQYrr (v2i64 (COPY_TO_REGCLASS GR64:$src, VR128)))>; 7721} 7722 7723// AVX1 broadcast patterns 7724let Predicates = [HasAVX1Only] in { 7725def : Pat<(v8i32 (X86VBroadcast (loadi32 addr:$src))), 7726 (VBROADCASTSSYrm addr:$src)>; 7727def : Pat<(v4i64 (X86VBroadcast (loadi64 addr:$src))), 7728 (VBROADCASTSDYrm addr:$src)>; 7729def : Pat<(v4i32 (X86VBroadcast (loadi32 addr:$src))), 7730 (VBROADCASTSSrm addr:$src)>; 7731} 7732 7733 // Provide fallback in case the load node that is used in the patterns above 7734 // is used by additional users, which prevents the pattern selection. 
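// A v2f64 splat needs no dedicated broadcast: vmovddup already duplicates
// the low double into both elements, and its memory form folds the load.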
7735let Predicates = [HasAVX, NoVLX] in { 7736 // 128bit broadcasts: 7737 def : Pat<(v2f64 (X86VBroadcast f64:$src)), 7738 (VMOVDDUPrr (v2f64 (COPY_TO_REGCLASS FR64:$src, VR128)))>; 7739 def : Pat<(v2f64 (X86VBroadcast (loadf64 addr:$src))), 7740 (VMOVDDUPrm addr:$src)>; 7741 7742 def : Pat<(v2f64 (X86VBroadcast v2f64:$src)), 7743 (VMOVDDUPrr VR128:$src)>; 7744 def : Pat<(v2f64 (X86VBroadcast (loadv2f64 addr:$src))), 7745 (VMOVDDUPrm addr:$src)>; 7746} 7747 7748let Predicates = [HasAVX1Only] in { 7749 def : Pat<(v4f32 (X86VBroadcast FR32:$src)), 7750 (VPERMILPSri (v4f32 (COPY_TO_REGCLASS FR32:$src, VR128)), 0)>; 7751 def : Pat<(v8f32 (X86VBroadcast FR32:$src)), 7752 (VINSERTF128rr (INSERT_SUBREG (v8f32 (IMPLICIT_DEF)), 7753 (v4f32 (VPERMILPSri (v4f32 (COPY_TO_REGCLASS FR32:$src, VR128)), 0)), sub_xmm), 7754 (v4f32 (VPERMILPSri (v4f32 (COPY_TO_REGCLASS FR32:$src, VR128)), 0)), 1)>; 7755 def : Pat<(v4f64 (X86VBroadcast FR64:$src)), 7756 (VINSERTF128rr (INSERT_SUBREG (v4f64 (IMPLICIT_DEF)), 7757 (v2f64 (VMOVDDUPrr (v2f64 (COPY_TO_REGCLASS FR64:$src, VR128)))), sub_xmm), 7758 (v2f64 (VMOVDDUPrr (v2f64 (COPY_TO_REGCLASS FR64:$src, VR128)))), 1)>; 7759 7760 def : Pat<(v4i32 (X86VBroadcast GR32:$src)), 7761 (VPSHUFDri (v4i32 (COPY_TO_REGCLASS GR32:$src, VR128)), 0)>; 7762 def : Pat<(v8i32 (X86VBroadcast GR32:$src)), 7763 (VINSERTF128rr (INSERT_SUBREG (v8i32 (IMPLICIT_DEF)), 7764 (v4i32 (VPSHUFDri (v4i32 (COPY_TO_REGCLASS GR32:$src, VR128)), 0)), sub_xmm), 7765 (v4i32 (VPSHUFDri (v4i32 (COPY_TO_REGCLASS GR32:$src, VR128)), 0)), 1)>; 7766 def : Pat<(v4i64 (X86VBroadcast GR64:$src)), 7767 (VINSERTF128rr (INSERT_SUBREG (v4i64 (IMPLICIT_DEF)), 7768 (v4i32 (VPSHUFDri (v4i32 (COPY_TO_REGCLASS GR64:$src, VR128)), 0x44)), sub_xmm), 7769 (v4i32 (VPSHUFDri (v4i32 (COPY_TO_REGCLASS GR64:$src, VR128)), 0x44)), 1)>; 7770 7771 def : Pat<(v2i64 (X86VBroadcast i64:$src)), 7772 (VPSHUFDri (v4i32 (COPY_TO_REGCLASS GR64:$src, VR128)), 0x44)>; 7773 def : Pat<(v2i64 (X86VBroadcast (loadi64 addr:$src))), 7774 (VMOVDDUPrm addr:$src)>; 7775} 7776 7777//===----------------------------------------------------------------------===// 7778// VPERM - Permute instructions 7779// 7780 7781multiclass avx2_perm<bits<8> opc, string OpcodeStr, PatFrag mem_frag, 7782 ValueType OpVT, X86FoldableSchedWrite Sched, 7783 X86MemOperand memOp> { 7784 let Predicates = [HasAVX2, NoVLX] in { 7785 def Yrr : AVX28I<opc, MRMSrcReg, (outs VR256:$dst), 7786 (ins VR256:$src1, VR256:$src2), 7787 !strconcat(OpcodeStr, 7788 "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), 7789 [(set VR256:$dst, 7790 (OpVT (X86VPermv VR256:$src1, VR256:$src2)))]>, 7791 Sched<[Sched]>, VEX_4V, VEX_L; 7792 def Yrm : AVX28I<opc, MRMSrcMem, (outs VR256:$dst), 7793 (ins VR256:$src1, memOp:$src2), 7794 !strconcat(OpcodeStr, 7795 "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), 7796 [(set VR256:$dst, 7797 (OpVT (X86VPermv VR256:$src1, 7798 (bitconvert (mem_frag addr:$src2)))))]>, 7799 Sched<[Sched.Folded, ReadAfterLd]>, VEX_4V, VEX_L; 7800 } 7801} 7802 7803defm VPERMD : avx2_perm<0x36, "vpermd", loadv4i64, v8i32, WriteVarShuffle256, 7804 i256mem>; 7805let ExeDomain = SSEPackedSingle in 7806defm VPERMPS : avx2_perm<0x16, "vpermps", loadv8f32, v8f32, WriteFVarShuffle256, 7807 f256mem>; 7808 7809multiclass avx2_perm_imm<bits<8> opc, string OpcodeStr, PatFrag mem_frag, 7810 ValueType OpVT, X86FoldableSchedWrite Sched, 7811 X86MemOperand memOp> { 7812 let Predicates = [HasAVX2, NoVLX] in { 7813 def Yri : AVX2AIi8<opc, MRMSrcReg, (outs VR256:$dst), 7814 (ins VR256:$src1, u8imm:$src2), 
                   !strconcat(OpcodeStr,
                       "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
                   [(set VR256:$dst,
                        (OpVT (X86VPermi VR256:$src1, (i8 imm:$src2))))]>,
                   Sched<[Sched]>, VEX, VEX_L;
  def Ymi : AVX2AIi8<opc, MRMSrcMem, (outs VR256:$dst),
                     (ins memOp:$src1, u8imm:$src2),
                     !strconcat(OpcodeStr,
                         "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
                     [(set VR256:$dst,
                          (OpVT (X86VPermi (mem_frag addr:$src1),
                                           (i8 imm:$src2))))]>,
                     Sched<[Sched.Folded, ReadAfterLd]>, VEX, VEX_L;
  }
}

defm VPERMQ : avx2_perm_imm<0x00, "vpermq", loadv4i64, v4i64,
                            WriteShuffle256, i256mem>, VEX_W;
let ExeDomain = SSEPackedDouble in
defm VPERMPD : avx2_perm_imm<0x01, "vpermpd", loadv4f64, v4f64,
                             WriteFShuffle256, f256mem>, VEX_W;

//===----------------------------------------------------------------------===//
// VPERM2I128 - Permute Integer Values in 128-bit chunks
//
let isCommutable = 1 in
def VPERM2I128rr : AVX2AIi8<0x46, MRMSrcReg, (outs VR256:$dst),
           (ins VR256:$src1, VR256:$src2, u8imm:$src3),
           "vperm2i128\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
           [(set VR256:$dst, (v4i64 (X86VPerm2x128 VR256:$src1, VR256:$src2,
                             (i8 imm:$src3))))]>, Sched<[WriteShuffle256]>,
           VEX_4V, VEX_L;
def VPERM2I128rm : AVX2AIi8<0x46, MRMSrcMem, (outs VR256:$dst),
           (ins VR256:$src1, f256mem:$src2, u8imm:$src3),
           "vperm2i128\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
           [(set VR256:$dst, (X86VPerm2x128 VR256:$src1, (loadv4i64 addr:$src2),
                             (i8 imm:$src3)))]>,
           Sched<[WriteShuffle256Ld, ReadAfterLd]>, VEX_4V, VEX_L;

let Predicates = [HasAVX2] in
def : Pat<(v4i64 (X86VPerm2x128 (loadv4i64 addr:$src2),
                                VR256:$src1, (i8 imm:$imm))),
          (VPERM2I128rm VR256:$src1, addr:$src2, (Perm2XCommuteImm imm:$imm))>;


//===----------------------------------------------------------------------===//
// VINSERTI128 - Insert packed integer values
//
let hasSideEffects = 0 in {
def VINSERTI128rr : AVX2AIi8<0x38, MRMSrcReg, (outs VR256:$dst),
          (ins VR256:$src1, VR128:$src2, u8imm:$src3),
          "vinserti128\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
          []>, Sched<[WriteShuffle256]>, VEX_4V, VEX_L;
let mayLoad = 1 in
def VINSERTI128rm : AVX2AIi8<0x38, MRMSrcMem, (outs VR256:$dst),
          (ins VR256:$src1, i128mem:$src2, u8imm:$src3),
          "vinserti128\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
          []>, Sched<[WriteShuffle256Ld, ReadAfterLd]>, VEX_4V, VEX_L;
}

let Predicates = [HasAVX2, NoVLX] in {
  defm : vinsert_lowering<"VINSERTI128", v2i64, v4i64,  loadv2i64>;
  defm : vinsert_lowering<"VINSERTI128", v4i32, v8i32,  loadv2i64>;
  defm : vinsert_lowering<"VINSERTI128", v8i16, v16i16, loadv2i64>;
  defm : vinsert_lowering<"VINSERTI128", v16i8, v32i8,  loadv2i64>;
}

//===----------------------------------------------------------------------===//
// VEXTRACTI128 - Extract packed integer values
//
def VEXTRACTI128rr : AVX2AIi8<0x39, MRMDestReg, (outs VR128:$dst),
          (ins VR256:$src1, u8imm:$src2),
          "vextracti128\t{$src2, $src1, $dst|$dst, $src1, $src2}", []>,
          Sched<[WriteShuffle256]>, VEX, VEX_L;
let hasSideEffects = 0, mayStore = 1 in
def VEXTRACTI128mr : AVX2AIi8<0x39, MRMDestMem, (outs),
          (ins i128mem:$dst, VR256:$src1, u8imm:$src2),
          "vextracti128\t{$src2, $src1, $dst|$dst, $src1, $src2}", []>,
Sched<[SchedWriteVecMoveLS.XMM.MR]>, VEX, VEX_L; 7894 7895let Predicates = [HasAVX2, NoVLX] in { 7896 defm : vextract_lowering<"VEXTRACTI128", v4i64, v2i64>; 7897 defm : vextract_lowering<"VEXTRACTI128", v8i32, v4i32>; 7898 defm : vextract_lowering<"VEXTRACTI128", v16i16, v8i16>; 7899 defm : vextract_lowering<"VEXTRACTI128", v32i8, v16i8>; 7900} 7901 7902//===----------------------------------------------------------------------===// 7903// VPMASKMOV - Conditional SIMD Integer Packed Loads and Stores 7904// 7905multiclass avx2_pmovmask<string OpcodeStr, 7906 Intrinsic IntLd128, Intrinsic IntLd256, 7907 Intrinsic IntSt128, Intrinsic IntSt256> { 7908 def rm : AVX28I<0x8c, MRMSrcMem, (outs VR128:$dst), 7909 (ins VR128:$src1, i128mem:$src2), 7910 !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), 7911 [(set VR128:$dst, (IntLd128 addr:$src2, VR128:$src1))]>, 7912 VEX_4V, Sched<[WriteVecMaskedLoad]>; 7913 def Yrm : AVX28I<0x8c, MRMSrcMem, (outs VR256:$dst), 7914 (ins VR256:$src1, i256mem:$src2), 7915 !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), 7916 [(set VR256:$dst, (IntLd256 addr:$src2, VR256:$src1))]>, 7917 VEX_4V, VEX_L, Sched<[WriteVecMaskedLoadY]>; 7918 def mr : AVX28I<0x8e, MRMDestMem, (outs), 7919 (ins i128mem:$dst, VR128:$src1, VR128:$src2), 7920 !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), 7921 [(IntSt128 addr:$dst, VR128:$src1, VR128:$src2)]>, 7922 VEX_4V, Sched<[WriteVecMaskedStore]>; 7923 def Ymr : AVX28I<0x8e, MRMDestMem, (outs), 7924 (ins i256mem:$dst, VR256:$src1, VR256:$src2), 7925 !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), 7926 [(IntSt256 addr:$dst, VR256:$src1, VR256:$src2)]>, 7927 VEX_4V, VEX_L, Sched<[WriteVecMaskedStoreY]>; 7928} 7929 7930defm VPMASKMOVD : avx2_pmovmask<"vpmaskmovd", 7931 int_x86_avx2_maskload_d, 7932 int_x86_avx2_maskload_d_256, 7933 int_x86_avx2_maskstore_d, 7934 int_x86_avx2_maskstore_d_256>; 7935defm VPMASKMOVQ : avx2_pmovmask<"vpmaskmovq", 7936 int_x86_avx2_maskload_q, 7937 int_x86_avx2_maskload_q_256, 7938 int_x86_avx2_maskstore_q, 7939 int_x86_avx2_maskstore_q_256>, VEX_W; 7940 7941multiclass maskmov_lowering<string InstrStr, RegisterClass RC, ValueType VT, 7942 ValueType MaskVT, string BlendStr, ValueType ZeroVT> { 7943 // masked store 7944 def: Pat<(X86mstore addr:$ptr, (MaskVT RC:$mask), (VT RC:$src)), 7945 (!cast<Instruction>(InstrStr#"mr") addr:$ptr, RC:$mask, RC:$src)>; 7946 // masked load 7947 def: Pat<(VT (X86mload addr:$ptr, (MaskVT RC:$mask), undef)), 7948 (!cast<Instruction>(InstrStr#"rm") RC:$mask, addr:$ptr)>; 7949 def: Pat<(VT (X86mload addr:$ptr, (MaskVT RC:$mask), 7950 (VT (bitconvert (ZeroVT immAllZerosV))))), 7951 (!cast<Instruction>(InstrStr#"rm") RC:$mask, addr:$ptr)>; 7952 def: Pat<(VT (X86mload addr:$ptr, (MaskVT RC:$mask), (VT RC:$src0))), 7953 (!cast<Instruction>(BlendStr#"rr") 7954 RC:$src0, 7955 (VT (!cast<Instruction>(InstrStr#"rm") RC:$mask, addr:$ptr)), 7956 RC:$mask)>; 7957} 7958let Predicates = [HasAVX] in { 7959 defm : maskmov_lowering<"VMASKMOVPS", VR128, v4f32, v4i32, "VBLENDVPS", v4i32>; 7960 defm : maskmov_lowering<"VMASKMOVPD", VR128, v2f64, v2i64, "VBLENDVPD", v4i32>; 7961 defm : maskmov_lowering<"VMASKMOVPSY", VR256, v8f32, v8i32, "VBLENDVPSY", v8i32>; 7962 defm : maskmov_lowering<"VMASKMOVPDY", VR256, v4f64, v4i64, "VBLENDVPDY", v8i32>; 7963} 7964let Predicates = [HasAVX1Only] in { 7965 // load/store i32/i64 not supported use ps/pd version 7966 defm : maskmov_lowering<"VMASKMOVPSY", VR256, v8i32, v8i32, "VBLENDVPSY", v8i32>; 
let Predicates = [HasAVX] in {
  defm : maskmov_lowering<"VMASKMOVPS", VR128, v4f32, v4i32, "VBLENDVPS", v4i32>;
  defm : maskmov_lowering<"VMASKMOVPD", VR128, v2f64, v2i64, "VBLENDVPD", v4i32>;
  defm : maskmov_lowering<"VMASKMOVPSY", VR256, v8f32, v8i32, "VBLENDVPSY", v8i32>;
  defm : maskmov_lowering<"VMASKMOVPDY", VR256, v4f64, v4i64, "VBLENDVPDY", v8i32>;
}
let Predicates = [HasAVX1Only] in {
  // The integer i32/i64 load/store forms are not available; use the ps/pd versions.
  defm : maskmov_lowering<"VMASKMOVPSY", VR256, v8i32, v8i32, "VBLENDVPSY", v8i32>;
  defm : maskmov_lowering<"VMASKMOVPDY", VR256, v4i64, v4i64, "VBLENDVPDY", v8i32>;
  defm : maskmov_lowering<"VMASKMOVPS", VR128, v4i32, v4i32, "VBLENDVPS", v4i32>;
  defm : maskmov_lowering<"VMASKMOVPD", VR128, v2i64, v2i64, "VBLENDVPD", v4i32>;
}
let Predicates = [HasAVX2] in {
  defm : maskmov_lowering<"VPMASKMOVDY", VR256, v8i32, v8i32, "VBLENDVPSY", v8i32>;
  defm : maskmov_lowering<"VPMASKMOVQY", VR256, v4i64, v4i64, "VBLENDVPDY", v8i32>;
  defm : maskmov_lowering<"VPMASKMOVD", VR128, v4i32, v4i32, "VBLENDVPS", v4i32>;
  defm : maskmov_lowering<"VPMASKMOVQ", VR128, v2i64, v2i64, "VBLENDVPD", v4i32>;
}

//===----------------------------------------------------------------------===//
// SubVector Broadcasts
// Provide a fallback in case the load node used in the patterns above has
// additional users, which prevents those patterns from being selected.

let Predicates = [HasAVX2, NoVLX] in {
def : Pat<(v4i64 (X86SubVBroadcast (v2i64 VR128:$src))),
          (VINSERTI128rr (INSERT_SUBREG (v4i64 (IMPLICIT_DEF)), VR128:$src, sub_xmm),
                         (v2i64 VR128:$src), 1)>;
def : Pat<(v8i32 (X86SubVBroadcast (v4i32 VR128:$src))),
          (VINSERTI128rr (INSERT_SUBREG (v8i32 (IMPLICIT_DEF)), VR128:$src, sub_xmm),
                         (v4i32 VR128:$src), 1)>;
def : Pat<(v16i16 (X86SubVBroadcast (v8i16 VR128:$src))),
          (VINSERTI128rr (INSERT_SUBREG (v16i16 (IMPLICIT_DEF)), VR128:$src, sub_xmm),
                         (v8i16 VR128:$src), 1)>;
def : Pat<(v32i8 (X86SubVBroadcast (v16i8 VR128:$src))),
          (VINSERTI128rr (INSERT_SUBREG (v32i8 (IMPLICIT_DEF)), VR128:$src, sub_xmm),
                         (v16i8 VR128:$src), 1)>;
}

let Predicates = [HasAVX, NoVLX] in {
def : Pat<(v4f64 (X86SubVBroadcast (v2f64 VR128:$src))),
          (VINSERTF128rr (INSERT_SUBREG (v4f64 (IMPLICIT_DEF)), VR128:$src, sub_xmm),
                         (v2f64 VR128:$src), 1)>;
def : Pat<(v8f32 (X86SubVBroadcast (v4f32 VR128:$src))),
          (VINSERTF128rr (INSERT_SUBREG (v8f32 (IMPLICIT_DEF)), VR128:$src, sub_xmm),
                         (v4f32 VR128:$src), 1)>;
}

let Predicates = [HasAVX1Only] in {
def : Pat<(v4i64 (X86SubVBroadcast (v2i64 VR128:$src))),
          (VINSERTF128rr (INSERT_SUBREG (v4i64 (IMPLICIT_DEF)), VR128:$src, sub_xmm),
                         (v2i64 VR128:$src), 1)>;
def : Pat<(v8i32 (X86SubVBroadcast (v4i32 VR128:$src))),
          (VINSERTF128rr (INSERT_SUBREG (v8i32 (IMPLICIT_DEF)), VR128:$src, sub_xmm),
                         (v4i32 VR128:$src), 1)>;
def : Pat<(v16i16 (X86SubVBroadcast (v8i16 VR128:$src))),
          (VINSERTF128rr (INSERT_SUBREG (v16i16 (IMPLICIT_DEF)), VR128:$src, sub_xmm),
                         (v8i16 VR128:$src), 1)>;
def : Pat<(v32i8 (X86SubVBroadcast (v16i8 VR128:$src))),
          (VINSERTF128rr (INSERT_SUBREG (v32i8 (IMPLICIT_DEF)), VR128:$src, sub_xmm),
                         (v16i8 VR128:$src), 1)>;
}
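// The register-broadcast patterns above are expected to lower to a single
// 128-bit lane insert: INSERT_SUBREG places the source in the low half of an
// otherwise undefined YMM register, and VINSERTF128/VINSERTI128 copies the
// same source into the upper half, roughly
// "vinserti128 $1, %xmm0, %ymm0, %ymm0" after register allocation.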
//===----------------------------------------------------------------------===//
// Variable Bit Shifts
//
multiclass avx2_var_shift<bits<8> opc, string OpcodeStr, SDNode OpNode,
                          ValueType vt128, ValueType vt256> {
  def rr  : AVX28I<opc, MRMSrcReg, (outs VR128:$dst),
             (ins VR128:$src1, VR128:$src2),
             !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
             [(set VR128:$dst,
               (vt128 (OpNode VR128:$src1, (vt128 VR128:$src2))))]>,
             VEX_4V, Sched<[SchedWriteVarVecShift.XMM]>;
  def rm  : AVX28I<opc, MRMSrcMem, (outs VR128:$dst),
             (ins VR128:$src1, i128mem:$src2),
             !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
             [(set VR128:$dst,
               (vt128 (OpNode VR128:$src1,
                       (vt128 (bitconvert (loadv2i64 addr:$src2))))))]>,
             VEX_4V, Sched<[SchedWriteVarVecShift.XMM.Folded, ReadAfterLd]>;
  def Yrr : AVX28I<opc, MRMSrcReg, (outs VR256:$dst),
             (ins VR256:$src1, VR256:$src2),
             !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
             [(set VR256:$dst,
               (vt256 (OpNode VR256:$src1, (vt256 VR256:$src2))))]>,
             VEX_4V, VEX_L, Sched<[SchedWriteVarVecShift.YMM]>;
  def Yrm : AVX28I<opc, MRMSrcMem, (outs VR256:$dst),
             (ins VR256:$src1, i256mem:$src2),
             !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
             [(set VR256:$dst,
               (vt256 (OpNode VR256:$src1,
                       (vt256 (bitconvert (loadv4i64 addr:$src2))))))]>,
             VEX_4V, VEX_L, Sched<[SchedWriteVarVecShift.YMM.Folded, ReadAfterLd]>;
}

let Predicates = [HasAVX2, NoVLX] in {
  defm VPSLLVD : avx2_var_shift<0x47, "vpsllvd", shl, v4i32, v8i32>;
  defm VPSLLVQ : avx2_var_shift<0x47, "vpsllvq", shl, v2i64, v4i64>, VEX_W;
  defm VPSRLVD : avx2_var_shift<0x45, "vpsrlvd", srl, v4i32, v8i32>;
  defm VPSRLVQ : avx2_var_shift<0x45, "vpsrlvq", srl, v2i64, v4i64>, VEX_W;
  defm VPSRAVD : avx2_var_shift<0x46, "vpsravd", sra, v4i32, v8i32>;

  def : Pat<(v4i32 (X86vsrav VR128:$src1, VR128:$src2)),
            (VPSRAVDrr VR128:$src1, VR128:$src2)>;
  def : Pat<(v4i32 (X86vsrav VR128:$src1,
                    (bitconvert (loadv2i64 addr:$src2)))),
            (VPSRAVDrm VR128:$src1, addr:$src2)>;
  def : Pat<(v8i32 (X86vsrav VR256:$src1, VR256:$src2)),
            (VPSRAVDYrr VR256:$src1, VR256:$src2)>;
  def : Pat<(v8i32 (X86vsrav VR256:$src1,
                    (bitconvert (loadv4i64 addr:$src2)))),
            (VPSRAVDYrm VR256:$src1, addr:$src2)>;
}
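// Each defm above is expected to expand into four definitions per the
// multiclass, e.g. VPSLLVDrr/VPSLLVDrm for the 128-bit form and
// VPSLLVDYrr/VPSLLVDYrm for the 256-bit form, all VEX-encoded three-operand
// instructions ("vpsllvd %xmm2, %xmm1, %xmm0" in AT&T syntax). Note that only
// the doubleword arithmetic shift (vpsravd) exists here; the quadword form
// (vpsravq) is left to AVX-512.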
//===----------------------------------------------------------------------===//
// VGATHER - GATHER Operations

// FIXME: Improve scheduling of gather instructions.
multiclass avx2_gather<bits<8> opc, string OpcodeStr, ValueType VTx,
                       ValueType VTy, PatFrag GatherNode128,
                       PatFrag GatherNode256, RegisterClass RC256,
                       X86MemOperand memop128, X86MemOperand memop256,
                       ValueType MTx = VTx, ValueType MTy = VTy> {
  def rm  : AVX28I<opc, MRMSrcMem4VOp3, (outs VR128:$dst, VR128:$mask_wb),
            (ins VR128:$src1, memop128:$src2, VR128:$mask),
            !strconcat(OpcodeStr,
              "\t{$mask, $src2, $dst|$dst, $src2, $mask}"),
            [(set (VTx VR128:$dst), (MTx VR128:$mask_wb),
                  (GatherNode128 VR128:$src1, VR128:$mask,
                                 vectoraddr:$src2))]>,
            VEX, Sched<[WriteLoad]>;
  def Yrm : AVX28I<opc, MRMSrcMem4VOp3, (outs RC256:$dst, RC256:$mask_wb),
            (ins RC256:$src1, memop256:$src2, RC256:$mask),
            !strconcat(OpcodeStr,
              "\t{$mask, $src2, $dst|$dst, $src2, $mask}"),
            [(set (VTy RC256:$dst), (MTy RC256:$mask_wb),
                  (GatherNode256 RC256:$src1, RC256:$mask,
                                 vectoraddr:$src2))]>,
            VEX, VEX_L, Sched<[WriteLoad]>;
}

let Predicates = [UseAVX2] in {
  let mayLoad = 1, hasSideEffects = 0, Constraints
    = "@earlyclobber $dst,@earlyclobber $mask_wb, $src1 = $dst, $mask = $mask_wb"
    in {
    defm VPGATHERDQ : avx2_gather<0x90, "vpgatherdq", v2i64, v4i64, mgatherv4i32,
                        mgatherv4i32, VR256, vx128mem, vx256mem>, VEX_W;
    defm VPGATHERQQ : avx2_gather<0x91, "vpgatherqq", v2i64, v4i64, mgatherv2i64,
                        mgatherv4i64, VR256, vx128mem, vy256mem>, VEX_W;
    defm VPGATHERDD : avx2_gather<0x90, "vpgatherdd", v4i32, v8i32, mgatherv4i32,
                        mgatherv8i32, VR256, vx128mem, vy256mem>;
    defm VPGATHERQD : avx2_gather<0x91, "vpgatherqd", v4i32, v4i32, mgatherv2i64,
                        mgatherv4i64, VR128, vx64mem, vy128mem>;

    let ExeDomain = SSEPackedDouble in {
      defm VGATHERDPD : avx2_gather<0x92, "vgatherdpd", v2f64, v4f64, mgatherv4i32,
                          mgatherv4i32, VR256, vx128mem, vx256mem,
                          v2i64, v4i64>, VEX_W;
      defm VGATHERQPD : avx2_gather<0x93, "vgatherqpd", v2f64, v4f64, mgatherv2i64,
                          mgatherv4i64, VR256, vx128mem, vy256mem,
                          v2i64, v4i64>, VEX_W;
    }

    let ExeDomain = SSEPackedSingle in {
      defm VGATHERDPS : avx2_gather<0x92, "vgatherdps", v4f32, v8f32, mgatherv4i32,
                          mgatherv8i32, VR256, vx128mem, vy256mem,
                          v4i32, v8i32>;
      defm VGATHERQPS : avx2_gather<0x93, "vgatherqps", v4f32, v4f32, mgatherv2i64,
                          mgatherv4i64, VR128, vx64mem, vy128mem,
                          v4i32, v4i32>;
    }
  }
}
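// Each gather produces two results: the gathered data ($dst) and the
// written-back mask ($mask_wb), since the hardware is expected to clear mask
// elements as the corresponding loads complete. The constraints above tie
// $src1/$dst and $mask/$mask_wb together and mark both outputs earlyclobber,
// so the destination is never allocated to a register that overlaps the index
// or mask operands.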
//===----------------------------------------------------------------------===//
// Extra selection patterns for f128, f128mem

// movaps is shorter than movdqa. movaps is in SSE and movdqa is in SSE2.
def : Pat<(alignedstore (f128 VR128:$src), addr:$dst),
          (MOVAPSmr addr:$dst, (COPY_TO_REGCLASS (f128 VR128:$src), VR128))>;
def : Pat<(store (f128 VR128:$src), addr:$dst),
          (MOVUPSmr addr:$dst, (COPY_TO_REGCLASS (f128 VR128:$src), VR128))>;

def : Pat<(alignedloadf128 addr:$src),
          (COPY_TO_REGCLASS (MOVAPSrm addr:$src), VR128)>;
def : Pat<(loadf128 addr:$src),
          (COPY_TO_REGCLASS (MOVUPSrm addr:$src), VR128)>;

// andps is shorter than andpd or pand. andps is in SSE and andpd/pand are in SSE2.
def : Pat<(f128 (X86fand VR128:$src1, (memopf128 addr:$src2))),
          (COPY_TO_REGCLASS
           (ANDPSrm (COPY_TO_REGCLASS VR128:$src1, VR128), f128mem:$src2),
           VR128)>;

def : Pat<(f128 (X86fand VR128:$src1, VR128:$src2)),
          (COPY_TO_REGCLASS
           (ANDPSrr (COPY_TO_REGCLASS VR128:$src1, VR128),
                    (COPY_TO_REGCLASS VR128:$src2, VR128)), VR128)>;

def : Pat<(f128 (X86for VR128:$src1, (memopf128 addr:$src2))),
          (COPY_TO_REGCLASS
           (ORPSrm (COPY_TO_REGCLASS VR128:$src1, VR128), f128mem:$src2),
           VR128)>;

def : Pat<(f128 (X86for VR128:$src1, VR128:$src2)),
          (COPY_TO_REGCLASS
           (ORPSrr (COPY_TO_REGCLASS VR128:$src1, VR128),
                   (COPY_TO_REGCLASS VR128:$src2, VR128)), VR128)>;

def : Pat<(f128 (X86fxor VR128:$src1, (memopf128 addr:$src2))),
          (COPY_TO_REGCLASS
           (XORPSrm (COPY_TO_REGCLASS VR128:$src1, VR128), f128mem:$src2),
           VR128)>;

def : Pat<(f128 (X86fxor VR128:$src1, VR128:$src2)),
          (COPY_TO_REGCLASS
           (XORPSrr (COPY_TO_REGCLASS VR128:$src1, VR128),
                    (COPY_TO_REGCLASS VR128:$src2, VR128)), VR128)>;

//===----------------------------------------------------------------------===//
// GFNI instructions
//===----------------------------------------------------------------------===//

multiclass GF2P8MULB_rm<string OpcodeStr, ValueType OpVT,
                        RegisterClass RC, PatFrag MemOpFrag,
                        X86MemOperand X86MemOp, bit Is2Addr = 0> {
  let ExeDomain = SSEPackedInt,
      AsmString = !if(Is2Addr,
        OpcodeStr##"\t{$src2, $dst|$dst, $src2}",
        OpcodeStr##"\t{$src2, $src1, $dst|$dst, $src1, $src2}") in {
    let isCommutable = 1 in
    def rr : PDI<0xCF, MRMSrcReg, (outs RC:$dst), (ins RC:$src1, RC:$src2), "",
                 [(set RC:$dst, (OpVT (X86GF2P8mulb RC:$src1, RC:$src2)))]>,
             Sched<[SchedWriteVecALU.XMM]>, T8PD;

    def rm : PDI<0xCF, MRMSrcMem, (outs RC:$dst), (ins RC:$src1, X86MemOp:$src2), "",
                 [(set RC:$dst, (OpVT (X86GF2P8mulb RC:$src1,
                                       (bitconvert (MemOpFrag addr:$src2)))))]>,
             Sched<[SchedWriteVecALU.XMM.Folded, ReadAfterLd]>, T8PD;
  }
}

multiclass GF2P8AFFINE_rmi<bits<8> Op, string OpStr, ValueType OpVT,
                           SDNode OpNode, RegisterClass RC, PatFrag MemOpFrag,
                           X86MemOperand X86MemOp, bit Is2Addr = 0> {
  let AsmString = !if(Is2Addr,
        OpStr##"\t{$src3, $src2, $dst|$dst, $src2, $src3}",
        OpStr##"\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}") in {
    def rri : Ii8<Op, MRMSrcReg, (outs RC:$dst),
                  (ins RC:$src1, RC:$src2, u8imm:$src3), "",
                  [(set RC:$dst, (OpVT (OpNode RC:$src1, RC:$src2, imm:$src3)))],
                  SSEPackedInt>, Sched<[SchedWriteVecALU.XMM]>;
    def rmi : Ii8<Op, MRMSrcMem, (outs RC:$dst),
                  (ins RC:$src1, X86MemOp:$src2, u8imm:$src3), "",
                  [(set RC:$dst, (OpVT (OpNode RC:$src1,
                                        (bitconvert (MemOpFrag addr:$src2)),
                                        imm:$src3)))], SSEPackedInt>,
                  Sched<[SchedWriteVecALU.XMM.Folded, ReadAfterLd]>;
  }
}
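// The affine instructions interpret each qword of the second source as an
// 8x8 bit matrix A and compute, for every byte x of the first source, the
// GF(2) affine map A*x + b, where b is the imm8 ($src3); the "inv" variant is
// understood to first replace x with its multiplicative inverse in GF(2^8).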
multiclass GF2P8AFFINE_common<bits<8> Op, string OpStr, SDNode OpNode> {
  let Constraints = "$src1 = $dst",
      Predicates  = [HasGFNI, UseSSE2] in
  defm NAME         : GF2P8AFFINE_rmi<Op, OpStr, v16i8, OpNode,
                                      VR128, loadv2i64, i128mem, 1>;
  let Predicates = [HasGFNI, HasAVX, NoVLX_Or_NoBWI] in {
    defm V##NAME    : GF2P8AFFINE_rmi<Op, "v"##OpStr, v16i8, OpNode, VR128,
                                      loadv2i64, i128mem>, VEX_4V, VEX_W;
    defm V##NAME##Y : GF2P8AFFINE_rmi<Op, "v"##OpStr, v32i8, OpNode, VR256,
                                      loadv4i64, i256mem>, VEX_4V, VEX_L, VEX_W;
  }
}

// GF2P8MULB
let Constraints = "$src1 = $dst",
    Predicates  = [HasGFNI, UseSSE2] in
defm GF2P8MULB      : GF2P8MULB_rm<"gf2p8mulb", v16i8, VR128, memopv2i64,
                                   i128mem, 1>;
let Predicates = [HasGFNI, HasAVX, NoVLX_Or_NoBWI] in {
  defm VGF2P8MULB   : GF2P8MULB_rm<"vgf2p8mulb", v16i8, VR128, loadv2i64,
                                   i128mem>, VEX_4V;
  defm VGF2P8MULBY  : GF2P8MULB_rm<"vgf2p8mulb", v32i8, VR256, loadv4i64,
                                   i256mem>, VEX_4V, VEX_L;
}
// GF2P8AFFINEINVQB, GF2P8AFFINEQB
let isCommutable = 0 in {
  defm GF2P8AFFINEINVQB : GF2P8AFFINE_common<0xCF, "gf2p8affineinvqb",
                                             X86GF2P8affineinvqb>, TAPD;
  defm GF2P8AFFINEQB    : GF2P8AFFINE_common<0xCE, "gf2p8affineqb",
                                             X86GF2P8affineqb>, TAPD;
}
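// For reference, the GF2P8AFFINE_common defms above are expected to produce
// both the legacy two-operand forms (e.g. GF2P8AFFINEQBrri, with $src1 tied
// to $dst) and the three-operand VEX forms (e.g. VGF2P8AFFINEQBrri and the
// 256-bit VGF2P8AFFINEQBYrmi), assembling to syntax such as
// "vgf2p8affineqb $0, %xmm2, %xmm1, %xmm0" in AT&T operand order.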