1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py 2; RUN: llc -O3 -disable-peephole -mtriple=x86_64-unknown-unknown -mattr=+avx,+aes,+pclmul < %s | FileCheck %s 3 4target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" 5target triple = "x86_64-unknown-unknown" 6 7; Stack reload folding tests. 8; 9; By including a nop call with sideeffects we can force a partial register spill of the 10; relevant registers and check that the reload is correctly folded into the instruction. 11 12define <2 x i64> @stack_fold_aesdec(<2 x i64> %a0, <2 x i64> %a1) { 13; CHECK-LABEL: stack_fold_aesdec: 14; CHECK: # %bb.0: 15; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 16; CHECK-NEXT: #APP 17; CHECK-NEXT: nop 18; CHECK-NEXT: #NO_APP 19; CHECK-NEXT: vaesdec {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload 20; CHECK-NEXT: retq 21 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"() 22 %2 = call <2 x i64> @llvm.x86.aesni.aesdec(<2 x i64> %a0, <2 x i64> %a1) 23 ret <2 x i64> %2 24} 25declare <2 x i64> @llvm.x86.aesni.aesdec(<2 x i64>, <2 x i64>) nounwind readnone 26 27define <2 x i64> @stack_fold_aesdeclast(<2 x i64> %a0, <2 x i64> %a1) { 28; CHECK-LABEL: stack_fold_aesdeclast: 29; CHECK: # %bb.0: 30; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 31; CHECK-NEXT: #APP 32; CHECK-NEXT: nop 33; CHECK-NEXT: #NO_APP 34; CHECK-NEXT: vaesdeclast {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload 35; CHECK-NEXT: retq 36 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"() 37 %2 = call <2 x i64> @llvm.x86.aesni.aesdeclast(<2 x i64> %a0, <2 x i64> %a1) 38 ret <2 x i64> %2 39} 40declare <2 x i64> @llvm.x86.aesni.aesdeclast(<2 x i64>, <2 x i64>) nounwind readnone 41 42define <2 x i64> @stack_fold_aesenc(<2 x i64> %a0, <2 x i64> %a1) { 43; CHECK-LABEL: stack_fold_aesenc: 44; CHECK: # %bb.0: 45; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 46; CHECK-NEXT: #APP 47; CHECK-NEXT: nop 48; CHECK-NEXT: #NO_APP 49; CHECK-NEXT: vaesenc {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload 50; CHECK-NEXT: retq 51 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"() 52 %2 = call <2 x i64> @llvm.x86.aesni.aesenc(<2 x i64> %a0, <2 x i64> %a1) 53 ret <2 x i64> %2 54} 55declare <2 x i64> @llvm.x86.aesni.aesenc(<2 x i64>, <2 x i64>) nounwind readnone 56 57define <2 x i64> @stack_fold_aesenclast(<2 x i64> %a0, <2 x i64> %a1) { 58; CHECK-LABEL: stack_fold_aesenclast: 59; CHECK: # %bb.0: 60; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 61; CHECK-NEXT: #APP 62; CHECK-NEXT: nop 63; CHECK-NEXT: #NO_APP 64; CHECK-NEXT: vaesenclast {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload 65; CHECK-NEXT: retq 66 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"() 67 %2 = call <2 x i64> @llvm.x86.aesni.aesenclast(<2 x i64> %a0, <2 x i64> %a1) 68 ret <2 x i64> %2 69} 70declare <2 x i64> @llvm.x86.aesni.aesenclast(<2 x i64>, <2 x i64>) nounwind readnone 71 72define <2 x i64> @stack_fold_aesimc(<2 x i64> %a0) { 73; CHECK-LABEL: stack_fold_aesimc: 74; CHECK: # %bb.0: 75; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 76; CHECK-NEXT: #APP 77; CHECK-NEXT: nop 78; CHECK-NEXT: #NO_APP 79; CHECK-NEXT: vaesimc {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload 80; CHECK-NEXT: retq 81 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"() 82 %2 = call <2 x i64> @llvm.x86.aesni.aesimc(<2 x i64> %a0) 83 ret <2 x i64> %2 84} 85declare <2 x i64> @llvm.x86.aesni.aesimc(<2 x i64>) nounwind readnone 86 87define <2 x i64> @stack_fold_aeskeygenassist(<2 x i64> %a0) { 88; CHECK-LABEL: stack_fold_aeskeygenassist: 89; CHECK: # %bb.0: 90; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 91; CHECK-NEXT: #APP 92; CHECK-NEXT: nop 93; CHECK-NEXT: #NO_APP 94; CHECK-NEXT: vaeskeygenassist $7, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload 95; CHECK-NEXT: retq 96 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"() 97 %2 = call <2 x i64> @llvm.x86.aesni.aeskeygenassist(<2 x i64> %a0, i8 7) 98 ret <2 x i64> %2 99} 100declare <2 x i64> @llvm.x86.aesni.aeskeygenassist(<2 x i64>, i8) nounwind readnone 101 102define <4 x i32> @stack_fold_movd_load(i32 %a0) { 103; CHECK-LABEL: stack_fold_movd_load: 104; CHECK: # %bb.0: 105; CHECK-NEXT: pushq %rbp 106; CHECK-NEXT: .cfi_def_cfa_offset 16 107; CHECK-NEXT: pushq %r15 108; CHECK-NEXT: .cfi_def_cfa_offset 24 109; CHECK-NEXT: pushq %r14 110; CHECK-NEXT: .cfi_def_cfa_offset 32 111; CHECK-NEXT: pushq %r13 112; CHECK-NEXT: .cfi_def_cfa_offset 40 113; CHECK-NEXT: pushq %r12 114; CHECK-NEXT: .cfi_def_cfa_offset 48 115; CHECK-NEXT: pushq %rbx 116; CHECK-NEXT: .cfi_def_cfa_offset 56 117; CHECK-NEXT: .cfi_offset %rbx, -56 118; CHECK-NEXT: .cfi_offset %r12, -48 119; CHECK-NEXT: .cfi_offset %r13, -40 120; CHECK-NEXT: .cfi_offset %r14, -32 121; CHECK-NEXT: .cfi_offset %r15, -24 122; CHECK-NEXT: .cfi_offset %rbp, -16 123; CHECK-NEXT: movl %edi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill 124; CHECK-NEXT: #APP 125; CHECK-NEXT: nop 126; CHECK-NEXT: #NO_APP 127; CHECK-NEXT: vmovd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Folded Reload 128; CHECK-NEXT: # xmm0 = mem[0],zero,zero,zero 129; CHECK-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 130; CHECK-NEXT: vpsubd %xmm1, %xmm0, %xmm0 131; CHECK-NEXT: popq %rbx 132; CHECK-NEXT: .cfi_def_cfa_offset 48 133; CHECK-NEXT: popq %r12 134; CHECK-NEXT: .cfi_def_cfa_offset 40 135; CHECK-NEXT: popq %r13 136; CHECK-NEXT: .cfi_def_cfa_offset 32 137; CHECK-NEXT: popq %r14 138; CHECK-NEXT: .cfi_def_cfa_offset 24 139; CHECK-NEXT: popq %r15 140; CHECK-NEXT: .cfi_def_cfa_offset 16 141; CHECK-NEXT: popq %rbp 142; CHECK-NEXT: .cfi_def_cfa_offset 8 143; CHECK-NEXT: retq 144 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{rax},~{rbx},~{rcx},~{rdx},~{rsi},~{rdi},~{rbp},~{r8},~{r9},~{r10},~{r11},~{r12},~{r13},~{r14},~{r15}"() 145 %2 = insertelement <4 x i32> zeroinitializer, i32 %a0, i32 0 146 ; add forces execution domain 147 %3 = add <4 x i32> %2, <i32 1, i32 1, i32 1, i32 1> 148 ret <4 x i32> %3 149} 150 151define i32 @stack_fold_movd_store(<4 x i32> %a0, <4 x i32> %a1) { 152; CHECK-LABEL: stack_fold_movd_store: 153; CHECK: # %bb.0: 154; CHECK-NEXT: pushq %rbp 155; CHECK-NEXT: .cfi_def_cfa_offset 16 156; CHECK-NEXT: pushq %r15 157; CHECK-NEXT: .cfi_def_cfa_offset 24 158; CHECK-NEXT: pushq %r14 159; CHECK-NEXT: .cfi_def_cfa_offset 32 160; CHECK-NEXT: pushq %r13 161; CHECK-NEXT: .cfi_def_cfa_offset 40 162; CHECK-NEXT: pushq %r12 163; CHECK-NEXT: .cfi_def_cfa_offset 48 164; CHECK-NEXT: pushq %rbx 165; CHECK-NEXT: .cfi_def_cfa_offset 56 166; CHECK-NEXT: .cfi_offset %rbx, -56 167; CHECK-NEXT: .cfi_offset %r12, -48 168; CHECK-NEXT: .cfi_offset %r13, -40 169; CHECK-NEXT: .cfi_offset %r14, -32 170; CHECK-NEXT: .cfi_offset %r15, -24 171; CHECK-NEXT: .cfi_offset %rbp, -16 172; CHECK-NEXT: vpaddd %xmm1, %xmm0, %xmm0 173; CHECK-NEXT: vmovd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Folded Spill 174; CHECK-NEXT: #APP 175; CHECK-NEXT: nop 176; CHECK-NEXT: #NO_APP 177; CHECK-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 4-byte Reload 178; CHECK-NEXT: popq %rbx 179; CHECK-NEXT: .cfi_def_cfa_offset 48 180; CHECK-NEXT: popq %r12 181; CHECK-NEXT: .cfi_def_cfa_offset 40 182; CHECK-NEXT: popq %r13 183; CHECK-NEXT: .cfi_def_cfa_offset 32 184; CHECK-NEXT: popq %r14 185; CHECK-NEXT: .cfi_def_cfa_offset 24 186; CHECK-NEXT: popq %r15 187; CHECK-NEXT: .cfi_def_cfa_offset 16 188; CHECK-NEXT: popq %rbp 189; CHECK-NEXT: .cfi_def_cfa_offset 8 190; CHECK-NEXT: retq 191 ; add forces execution domain 192 %1 = add <4 x i32> %a0, %a1 193 %2 = extractelement <4 x i32> %1, i32 0 194 %3 = tail call <2 x i64> asm sideeffect "nop", "=x,~{rax},~{rbx},~{rcx},~{rdx},~{rsi},~{rdi},~{rbp},~{r8},~{r9},~{r10},~{r11},~{r12},~{r13},~{r14},~{r15}"() 195 ret i32 %2 196} 197 198define <2 x i64> @stack_fold_movq_load(<2 x i64> %a0) { 199; CHECK-LABEL: stack_fold_movq_load: 200; CHECK: # %bb.0: 201; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 202; CHECK-NEXT: #APP 203; CHECK-NEXT: nop 204; CHECK-NEXT: #NO_APP 205; CHECK-NEXT: vmovq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload 206; CHECK-NEXT: # xmm0 = mem[0],zero 207; CHECK-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 208; CHECK-NEXT: vpsubq %xmm1, %xmm0, %xmm0 209; CHECK-NEXT: retq 210 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"() 211 %2 = shufflevector <2 x i64> %a0, <2 x i64> zeroinitializer, <2 x i32> <i32 0, i32 2> 212 ; add forces execution domain 213 %3 = add <2 x i64> %2, <i64 1, i64 1> 214 ret <2 x i64> %3 215} 216 217define i64 @stack_fold_movq_store(<2 x i64> %a0, <2 x i64> %a1) { 218; CHECK-LABEL: stack_fold_movq_store: 219; CHECK: # %bb.0: 220; CHECK-NEXT: pushq %rbp 221; CHECK-NEXT: .cfi_def_cfa_offset 16 222; CHECK-NEXT: pushq %r15 223; CHECK-NEXT: .cfi_def_cfa_offset 24 224; CHECK-NEXT: pushq %r14 225; CHECK-NEXT: .cfi_def_cfa_offset 32 226; CHECK-NEXT: pushq %r13 227; CHECK-NEXT: .cfi_def_cfa_offset 40 228; CHECK-NEXT: pushq %r12 229; CHECK-NEXT: .cfi_def_cfa_offset 48 230; CHECK-NEXT: pushq %rbx 231; CHECK-NEXT: .cfi_def_cfa_offset 56 232; CHECK-NEXT: .cfi_offset %rbx, -56 233; CHECK-NEXT: .cfi_offset %r12, -48 234; CHECK-NEXT: .cfi_offset %r13, -40 235; CHECK-NEXT: .cfi_offset %r14, -32 236; CHECK-NEXT: .cfi_offset %r15, -24 237; CHECK-NEXT: .cfi_offset %rbp, -16 238; CHECK-NEXT: vpaddq %xmm1, %xmm0, %xmm0 239; CHECK-NEXT: vmovq %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill 240; CHECK-NEXT: #APP 241; CHECK-NEXT: nop 242; CHECK-NEXT: #NO_APP 243; CHECK-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload 244; CHECK-NEXT: popq %rbx 245; CHECK-NEXT: .cfi_def_cfa_offset 48 246; CHECK-NEXT: popq %r12 247; CHECK-NEXT: .cfi_def_cfa_offset 40 248; CHECK-NEXT: popq %r13 249; CHECK-NEXT: .cfi_def_cfa_offset 32 250; CHECK-NEXT: popq %r14 251; CHECK-NEXT: .cfi_def_cfa_offset 24 252; CHECK-NEXT: popq %r15 253; CHECK-NEXT: .cfi_def_cfa_offset 16 254; CHECK-NEXT: popq %rbp 255; CHECK-NEXT: .cfi_def_cfa_offset 8 256; CHECK-NEXT: retq 257 ; add forces execution domain 258 %1 = add <2 x i64> %a0, %a1 259 %2 = extractelement <2 x i64> %1, i32 0 260 %3 = tail call <2 x i64> asm sideeffect "nop", "=x,~{rax},~{rbx},~{rcx},~{rdx},~{rsi},~{rdi},~{rbp},~{r8},~{r9},~{r10},~{r11},~{r12},~{r13},~{r14},~{r15}"() 261 ret i64 %2 262} 263 264define <8 x i16> @stack_fold_mpsadbw(<16 x i8> %a0, <16 x i8> %a1) { 265; CHECK-LABEL: stack_fold_mpsadbw: 266; CHECK: # %bb.0: 267; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 268; CHECK-NEXT: #APP 269; CHECK-NEXT: nop 270; CHECK-NEXT: #NO_APP 271; CHECK-NEXT: vmpsadbw $7, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload 272; CHECK-NEXT: retq 273 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"() 274 %2 = call <8 x i16> @llvm.x86.sse41.mpsadbw(<16 x i8> %a0, <16 x i8> %a1, i8 7) 275 ret <8 x i16> %2 276} 277declare <8 x i16> @llvm.x86.sse41.mpsadbw(<16 x i8>, <16 x i8>, i8) nounwind readnone 278 279define <16 x i8> @stack_fold_pabsb(<16 x i8> %a0) { 280; CHECK-LABEL: stack_fold_pabsb: 281; CHECK: # %bb.0: 282; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 283; CHECK-NEXT: #APP 284; CHECK-NEXT: nop 285; CHECK-NEXT: #NO_APP 286; CHECK-NEXT: vpabsb {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload 287; CHECK-NEXT: retq 288 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"() 289 %2 = icmp sgt <16 x i8> %a0, zeroinitializer 290 %3 = sub <16 x i8> zeroinitializer, %a0 291 %4 = select <16 x i1> %2, <16 x i8> %a0, <16 x i8> %3 292 ret <16 x i8> %4 293} 294 295define <4 x i32> @stack_fold_pabsd(<4 x i32> %a0) { 296; CHECK-LABEL: stack_fold_pabsd: 297; CHECK: # %bb.0: 298; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 299; CHECK-NEXT: #APP 300; CHECK-NEXT: nop 301; CHECK-NEXT: #NO_APP 302; CHECK-NEXT: vpabsd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload 303; CHECK-NEXT: retq 304 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"() 305 %2 = icmp sgt <4 x i32> %a0, zeroinitializer 306 %3 = sub <4 x i32> zeroinitializer, %a0 307 %4 = select <4 x i1> %2, <4 x i32> %a0, <4 x i32> %3 308 ret <4 x i32> %4 309} 310 311define <8 x i16> @stack_fold_pabsw(<8 x i16> %a0) { 312; CHECK-LABEL: stack_fold_pabsw: 313; CHECK: # %bb.0: 314; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 315; CHECK-NEXT: #APP 316; CHECK-NEXT: nop 317; CHECK-NEXT: #NO_APP 318; CHECK-NEXT: vpabsw {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload 319; CHECK-NEXT: retq 320 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"() 321 %2 = icmp sgt <8 x i16> %a0, zeroinitializer 322 %3 = sub <8 x i16> zeroinitializer, %a0 323 %4 = select <8 x i1> %2, <8 x i16> %a0, <8 x i16> %3 324 ret <8 x i16> %4 325} 326 327define <8 x i16> @stack_fold_packssdw(<4 x i32> %a0, <4 x i32> %a1) { 328; CHECK-LABEL: stack_fold_packssdw: 329; CHECK: # %bb.0: 330; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 331; CHECK-NEXT: #APP 332; CHECK-NEXT: nop 333; CHECK-NEXT: #NO_APP 334; CHECK-NEXT: vpackssdw {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload 335; CHECK-NEXT: retq 336 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"() 337 %2 = call <8 x i16> @llvm.x86.sse2.packssdw.128(<4 x i32> %a0, <4 x i32> %a1) 338 ret <8 x i16> %2 339} 340declare <8 x i16> @llvm.x86.sse2.packssdw.128(<4 x i32>, <4 x i32>) nounwind readnone 341 342define <16 x i8> @stack_fold_packsswb(<8 x i16> %a0, <8 x i16> %a1) { 343; CHECK-LABEL: stack_fold_packsswb: 344; CHECK: # %bb.0: 345; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 346; CHECK-NEXT: #APP 347; CHECK-NEXT: nop 348; CHECK-NEXT: #NO_APP 349; CHECK-NEXT: vpacksswb {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload 350; CHECK-NEXT: retq 351 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"() 352 %2 = call <16 x i8> @llvm.x86.sse2.packsswb.128(<8 x i16> %a0, <8 x i16> %a1) 353 ret <16 x i8> %2 354} 355declare <16 x i8> @llvm.x86.sse2.packsswb.128(<8 x i16>, <8 x i16>) nounwind readnone 356 357define <8 x i16> @stack_fold_packusdw(<4 x i32> %a0, <4 x i32> %a1) { 358; CHECK-LABEL: stack_fold_packusdw: 359; CHECK: # %bb.0: 360; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 361; CHECK-NEXT: #APP 362; CHECK-NEXT: nop 363; CHECK-NEXT: #NO_APP 364; CHECK-NEXT: vpackusdw {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload 365; CHECK-NEXT: retq 366 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"() 367 %2 = call <8 x i16> @llvm.x86.sse41.packusdw(<4 x i32> %a0, <4 x i32> %a1) 368 ret <8 x i16> %2 369} 370declare <8 x i16> @llvm.x86.sse41.packusdw(<4 x i32>, <4 x i32>) nounwind readnone 371 372define <16 x i8> @stack_fold_packuswb(<8 x i16> %a0, <8 x i16> %a1) { 373; CHECK-LABEL: stack_fold_packuswb: 374; CHECK: # %bb.0: 375; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 376; CHECK-NEXT: #APP 377; CHECK-NEXT: nop 378; CHECK-NEXT: #NO_APP 379; CHECK-NEXT: vpackuswb {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload 380; CHECK-NEXT: retq 381 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"() 382 %2 = call <16 x i8> @llvm.x86.sse2.packuswb.128(<8 x i16> %a0, <8 x i16> %a1) 383 ret <16 x i8> %2 384} 385declare <16 x i8> @llvm.x86.sse2.packuswb.128(<8 x i16>, <8 x i16>) nounwind readnone 386 387define <16 x i8> @stack_fold_paddb(<16 x i8> %a0, <16 x i8> %a1) { 388; CHECK-LABEL: stack_fold_paddb: 389; CHECK: # %bb.0: 390; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 391; CHECK-NEXT: #APP 392; CHECK-NEXT: nop 393; CHECK-NEXT: #NO_APP 394; CHECK-NEXT: vpaddb {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload 395; CHECK-NEXT: retq 396 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"() 397 %2 = add <16 x i8> %a0, %a1 398 ret <16 x i8> %2 399} 400 401define <4 x i32> @stack_fold_paddd(<4 x i32> %a0, <4 x i32> %a1) { 402; CHECK-LABEL: stack_fold_paddd: 403; CHECK: # %bb.0: 404; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 405; CHECK-NEXT: #APP 406; CHECK-NEXT: nop 407; CHECK-NEXT: #NO_APP 408; CHECK-NEXT: vpaddd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload 409; CHECK-NEXT: retq 410 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"() 411 %2 = add <4 x i32> %a0, %a1 412 ret <4 x i32> %2 413} 414 415define <2 x i64> @stack_fold_paddq(<2 x i64> %a0, <2 x i64> %a1) { 416; CHECK-LABEL: stack_fold_paddq: 417; CHECK: # %bb.0: 418; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 419; CHECK-NEXT: #APP 420; CHECK-NEXT: nop 421; CHECK-NEXT: #NO_APP 422; CHECK-NEXT: vpaddq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload 423; CHECK-NEXT: retq 424 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"() 425 %2 = add <2 x i64> %a0, %a1 426 ret <2 x i64> %2 427} 428 429define <16 x i8> @stack_fold_paddsb(<16 x i8> %a0, <16 x i8> %a1) { 430; CHECK-LABEL: stack_fold_paddsb: 431; CHECK: # %bb.0: 432; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 433; CHECK-NEXT: #APP 434; CHECK-NEXT: nop 435; CHECK-NEXT: #NO_APP 436; CHECK-NEXT: vpaddsb {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload 437; CHECK-NEXT: retq 438 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"() 439 %2 = call <16 x i8> @llvm.sadd.sat.v16i8(<16 x i8> %a0, <16 x i8> %a1) 440 ret <16 x i8> %2 441} 442declare <16 x i8> @llvm.sadd.sat.v16i8(<16 x i8>, <16 x i8>) nounwind readnone 443 444define <8 x i16> @stack_fold_paddsw(<8 x i16> %a0, <8 x i16> %a1) { 445; CHECK-LABEL: stack_fold_paddsw: 446; CHECK: # %bb.0: 447; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 448; CHECK-NEXT: #APP 449; CHECK-NEXT: nop 450; CHECK-NEXT: #NO_APP 451; CHECK-NEXT: vpaddsw {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload 452; CHECK-NEXT: retq 453 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"() 454 %2 = call <8 x i16> @llvm.sadd.sat.v8i16(<8 x i16> %a0, <8 x i16> %a1) 455 ret <8 x i16> %2 456} 457declare <8 x i16> @llvm.sadd.sat.v8i16(<8 x i16>, <8 x i16>) nounwind readnone 458 459define <16 x i8> @stack_fold_paddusb(<16 x i8> %a0, <16 x i8> %a1) { 460; CHECK-LABEL: stack_fold_paddusb: 461; CHECK: # %bb.0: 462; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 463; CHECK-NEXT: #APP 464; CHECK-NEXT: nop 465; CHECK-NEXT: #NO_APP 466; CHECK-NEXT: vpaddusb {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload 467; CHECK-NEXT: retq 468 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"() 469 %2 = call <16 x i8> @llvm.uadd.sat.v16i8(<16 x i8> %a0, <16 x i8> %a1) 470 ret <16 x i8> %2 471} 472declare <16 x i8> @llvm.uadd.sat.v16i8(<16 x i8>, <16 x i8>) nounwind readnone 473 474define <8 x i16> @stack_fold_paddusw(<8 x i16> %a0, <8 x i16> %a1) { 475; CHECK-LABEL: stack_fold_paddusw: 476; CHECK: # %bb.0: 477; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 478; CHECK-NEXT: #APP 479; CHECK-NEXT: nop 480; CHECK-NEXT: #NO_APP 481; CHECK-NEXT: vpaddusw {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload 482; CHECK-NEXT: retq 483 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"() 484 %2 = call <8 x i16> @llvm.uadd.sat.v8i16(<8 x i16> %a0, <8 x i16> %a1) 485 ret <8 x i16> %2 486} 487declare <8 x i16> @llvm.uadd.sat.v8i16(<8 x i16>, <8 x i16>) nounwind readnone 488 489define <8 x i16> @stack_fold_paddw(<8 x i16> %a0, <8 x i16> %a1) { 490; CHECK-LABEL: stack_fold_paddw: 491; CHECK: # %bb.0: 492; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 493; CHECK-NEXT: #APP 494; CHECK-NEXT: nop 495; CHECK-NEXT: #NO_APP 496; CHECK-NEXT: vpaddw {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload 497; CHECK-NEXT: retq 498 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"() 499 %2 = add <8 x i16> %a0, %a1 500 ret <8 x i16> %2 501} 502 503define <16 x i8> @stack_fold_palignr(<16 x i8> %a0, <16 x i8> %a1) { 504; CHECK-LABEL: stack_fold_palignr: 505; CHECK: # %bb.0: 506; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 507; CHECK-NEXT: #APP 508; CHECK-NEXT: nop 509; CHECK-NEXT: #NO_APP 510; CHECK-NEXT: vpalignr $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload 511; CHECK-NEXT: # xmm0 = mem[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm0[0] 512; CHECK-NEXT: retq 513 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"() 514 %2 = shufflevector <16 x i8> %a1, <16 x i8> %a0, <16 x i32> <i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16> 515 ret <16 x i8> %2 516} 517 518define <16 x i8> @stack_fold_pand(<16 x i8> %a0, <16 x i8> %a1) { 519; CHECK-LABEL: stack_fold_pand: 520; CHECK: # %bb.0: 521; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 522; CHECK-NEXT: #APP 523; CHECK-NEXT: nop 524; CHECK-NEXT: #NO_APP 525; CHECK-NEXT: vpand {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload 526; CHECK-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 527; CHECK-NEXT: vpsubb %xmm1, %xmm0, %xmm0 528; CHECK-NEXT: retq 529 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"() 530 %2 = and <16 x i8> %a0, %a1 531 ; add forces execution domain 532 %3 = add <16 x i8> %2, <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1> 533 ret <16 x i8> %3 534} 535 536define <16 x i8> @stack_fold_pandn(<16 x i8> %a0, <16 x i8> %a1) { 537; CHECK-LABEL: stack_fold_pandn: 538; CHECK: # %bb.0: 539; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 540; CHECK-NEXT: #APP 541; CHECK-NEXT: nop 542; CHECK-NEXT: #NO_APP 543; CHECK-NEXT: vpandn {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload 544; CHECK-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 545; CHECK-NEXT: vpsubb %xmm1, %xmm0, %xmm0 546; CHECK-NEXT: retq 547 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"() 548 %2 = xor <16 x i8> %a0, <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1> 549 %3 = and <16 x i8> %2, %a1 550 ; add forces execution domain 551 %4 = add <16 x i8> %3, <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1> 552 ret <16 x i8> %4 553} 554 555define <16 x i8> @stack_fold_pavgb(<16 x i8> %a0, <16 x i8> %a1) { 556; CHECK-LABEL: stack_fold_pavgb: 557; CHECK: # %bb.0: 558; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 559; CHECK-NEXT: #APP 560; CHECK-NEXT: nop 561; CHECK-NEXT: #NO_APP 562; CHECK-NEXT: vpavgb {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload 563; CHECK-NEXT: retq 564 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"() 565 %2 = zext <16 x i8> %a0 to <16 x i16> 566 %3 = zext <16 x i8> %a1 to <16 x i16> 567 %4 = add <16 x i16> %2, %3 568 %5 = add <16 x i16> %4, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1> 569 %6 = lshr <16 x i16> %5, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1> 570 %7 = trunc <16 x i16> %6 to <16 x i8> 571 ret <16 x i8> %7 572} 573 574define <8 x i16> @stack_fold_pavgw(<8 x i16> %a0, <8 x i16> %a1) { 575; CHECK-LABEL: stack_fold_pavgw: 576; CHECK: # %bb.0: 577; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 578; CHECK-NEXT: #APP 579; CHECK-NEXT: nop 580; CHECK-NEXT: #NO_APP 581; CHECK-NEXT: vpavgw {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload 582; CHECK-NEXT: retq 583 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"() 584 %2 = zext <8 x i16> %a0 to <8 x i32> 585 %3 = zext <8 x i16> %a1 to <8 x i32> 586 %4 = add <8 x i32> %2, %3 587 %5 = add <8 x i32> %4, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1> 588 %6 = lshr <8 x i32> %5, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1> 589 %7 = trunc <8 x i32> %6 to <8 x i16> 590 ret <8 x i16> %7 591} 592 593define <16 x i8> @stack_fold_pblendvb(<16 x i8> %a0, <16 x i8> %a1, <16 x i8> %c) { 594; CHECK-LABEL: stack_fold_pblendvb: 595; CHECK: # %bb.0: 596; CHECK-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 597; CHECK-NEXT: #APP 598; CHECK-NEXT: nop 599; CHECK-NEXT: #NO_APP 600; CHECK-NEXT: vpblendvb %xmm0, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 # 16-byte Folded Reload 601; CHECK-NEXT: retq 602 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"() 603 %2 = call <16 x i8> @llvm.x86.sse41.pblendvb(<16 x i8> %a1, <16 x i8> %c, <16 x i8> %a0) 604 ret <16 x i8> %2 605} 606declare <16 x i8> @llvm.x86.sse41.pblendvb(<16 x i8>, <16 x i8>, <16 x i8>) nounwind readnone 607 608define <8 x i16> @stack_fold_pblendw(<8 x i16> %a0, <8 x i16> %a1) { 609; CHECK-LABEL: stack_fold_pblendw: 610; CHECK: # %bb.0: 611; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 612; CHECK-NEXT: #APP 613; CHECK-NEXT: nop 614; CHECK-NEXT: #NO_APP 615; CHECK-NEXT: vpblendw $7, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload 616; CHECK-NEXT: # xmm0 = mem[0,1,2],xmm0[3,4,5,6,7] 617; CHECK-NEXT: retq 618 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"() 619 %2 = shufflevector <8 x i16> %a0, <8 x i16> %a1, <8 x i32> <i32 8, i32 9, i32 10, i32 3, i32 4, i32 5, i32 6, i32 7> 620 ret <8 x i16> %2 621} 622 623define <2 x i64> @stack_fold_pclmulqdq(<2 x i64> %a0, <2 x i64> %a1) { 624; CHECK-LABEL: stack_fold_pclmulqdq: 625; CHECK: # %bb.0: 626; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 627; CHECK-NEXT: #APP 628; CHECK-NEXT: nop 629; CHECK-NEXT: #NO_APP 630; CHECK-NEXT: vpclmulqdq $0, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload 631; CHECK-NEXT: retq 632 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"() 633 %2 = call <2 x i64> @llvm.x86.pclmulqdq(<2 x i64> %a0, <2 x i64> %a1, i8 0) 634 ret <2 x i64> %2 635} 636declare <2 x i64> @llvm.x86.pclmulqdq(<2 x i64>, <2 x i64>, i8) nounwind readnone 637 638define <16 x i8> @stack_fold_pcmpeqb(<16 x i8> %a0, <16 x i8> %a1) { 639; CHECK-LABEL: stack_fold_pcmpeqb: 640; CHECK: # %bb.0: 641; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 642; CHECK-NEXT: #APP 643; CHECK-NEXT: nop 644; CHECK-NEXT: #NO_APP 645; CHECK-NEXT: vpcmpeqb {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload 646; CHECK-NEXT: retq 647 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"() 648 %2 = icmp eq <16 x i8> %a0, %a1 649 %3 = sext <16 x i1> %2 to <16 x i8> 650 ret <16 x i8> %3 651} 652 653define <4 x i32> @stack_fold_pcmpeqd(<4 x i32> %a0, <4 x i32> %a1) { 654; CHECK-LABEL: stack_fold_pcmpeqd: 655; CHECK: # %bb.0: 656; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 657; CHECK-NEXT: #APP 658; CHECK-NEXT: nop 659; CHECK-NEXT: #NO_APP 660; CHECK-NEXT: vpcmpeqd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload 661; CHECK-NEXT: retq 662 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"() 663 %2 = icmp eq <4 x i32> %a0, %a1 664 %3 = sext <4 x i1> %2 to <4 x i32> 665 ret <4 x i32> %3 666} 667 668define <2 x i64> @stack_fold_pcmpeqq(<2 x i64> %a0, <2 x i64> %a1) { 669; CHECK-LABEL: stack_fold_pcmpeqq: 670; CHECK: # %bb.0: 671; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 672; CHECK-NEXT: #APP 673; CHECK-NEXT: nop 674; CHECK-NEXT: #NO_APP 675; CHECK-NEXT: vpcmpeqq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload 676; CHECK-NEXT: retq 677 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"() 678 %2 = icmp eq <2 x i64> %a0, %a1 679 %3 = sext <2 x i1> %2 to <2 x i64> 680 ret <2 x i64> %3 681} 682 683define <8 x i16> @stack_fold_pcmpeqw(<8 x i16> %a0, <8 x i16> %a1) { 684; CHECK-LABEL: stack_fold_pcmpeqw: 685; CHECK: # %bb.0: 686; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 687; CHECK-NEXT: #APP 688; CHECK-NEXT: nop 689; CHECK-NEXT: #NO_APP 690; CHECK-NEXT: vpcmpeqw {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload 691; CHECK-NEXT: retq 692 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"() 693 %2 = icmp eq <8 x i16> %a0, %a1 694 %3 = sext <8 x i1> %2 to <8 x i16> 695 ret <8 x i16> %3 696} 697 698define i32 @stack_fold_pcmpestri(<16 x i8> %a0, <16 x i8> %a1) { 699; CHECK-LABEL: stack_fold_pcmpestri: 700; CHECK: # %bb.0: 701; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 702; CHECK-NEXT: #APP 703; CHECK-NEXT: nop 704; CHECK-NEXT: #NO_APP 705; CHECK-NEXT: movl $7, %eax 706; CHECK-NEXT: movl $7, %edx 707; CHECK-NEXT: vpcmpestri $7, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload 708; CHECK-NEXT: movl %ecx, %eax 709; CHECK-NEXT: retq 710 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{rax},~{flags}"() 711 %2 = call i32 @llvm.x86.sse42.pcmpestri128(<16 x i8> %a0, i32 7, <16 x i8> %a1, i32 7, i8 7) 712 ret i32 %2 713} 714declare i32 @llvm.x86.sse42.pcmpestri128(<16 x i8>, i32, <16 x i8>, i32, i8) nounwind readnone 715 716define <16 x i8> @stack_fold_pcmpestrm(<16 x i8> %a0, <16 x i8> %a1) { 717; CHECK-LABEL: stack_fold_pcmpestrm: 718; CHECK: # %bb.0: 719; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 720; CHECK-NEXT: #APP 721; CHECK-NEXT: nop 722; CHECK-NEXT: #NO_APP 723; CHECK-NEXT: movl $7, %eax 724; CHECK-NEXT: movl $7, %edx 725; CHECK-NEXT: vpcmpestrm $7, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload 726; CHECK-NEXT: retq 727 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{rax},~{flags}"() 728 %2 = call <16 x i8> @llvm.x86.sse42.pcmpestrm128(<16 x i8> %a0, i32 7, <16 x i8> %a1, i32 7, i8 7) 729 ret <16 x i8> %2 730} 731declare <16 x i8> @llvm.x86.sse42.pcmpestrm128(<16 x i8>, i32, <16 x i8>, i32, i8) nounwind readnone 732 733define <16 x i8> @stack_fold_pcmpgtb(<16 x i8> %a0, <16 x i8> %a1) { 734; CHECK-LABEL: stack_fold_pcmpgtb: 735; CHECK: # %bb.0: 736; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 737; CHECK-NEXT: #APP 738; CHECK-NEXT: nop 739; CHECK-NEXT: #NO_APP 740; CHECK-NEXT: vpcmpgtb {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload 741; CHECK-NEXT: retq 742 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"() 743 %2 = icmp sgt <16 x i8> %a0, %a1 744 %3 = sext <16 x i1> %2 to <16 x i8> 745 ret <16 x i8> %3 746} 747 748define <4 x i32> @stack_fold_pcmpgtd(<4 x i32> %a0, <4 x i32> %a1) { 749; CHECK-LABEL: stack_fold_pcmpgtd: 750; CHECK: # %bb.0: 751; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 752; CHECK-NEXT: #APP 753; CHECK-NEXT: nop 754; CHECK-NEXT: #NO_APP 755; CHECK-NEXT: vpcmpgtd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload 756; CHECK-NEXT: retq 757 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"() 758 %2 = icmp sgt <4 x i32> %a0, %a1 759 %3 = sext <4 x i1> %2 to <4 x i32> 760 ret <4 x i32> %3 761} 762 763define <2 x i64> @stack_fold_pcmpgtq(<2 x i64> %a0, <2 x i64> %a1) { 764; CHECK-LABEL: stack_fold_pcmpgtq: 765; CHECK: # %bb.0: 766; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 767; CHECK-NEXT: #APP 768; CHECK-NEXT: nop 769; CHECK-NEXT: #NO_APP 770; CHECK-NEXT: vpcmpgtq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload 771; CHECK-NEXT: retq 772 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"() 773 %2 = icmp sgt <2 x i64> %a0, %a1 774 %3 = sext <2 x i1> %2 to <2 x i64> 775 ret <2 x i64> %3 776} 777 778define <8 x i16> @stack_fold_pcmpgtw(<8 x i16> %a0, <8 x i16> %a1) { 779; CHECK-LABEL: stack_fold_pcmpgtw: 780; CHECK: # %bb.0: 781; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 782; CHECK-NEXT: #APP 783; CHECK-NEXT: nop 784; CHECK-NEXT: #NO_APP 785; CHECK-NEXT: vpcmpgtw {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload 786; CHECK-NEXT: retq 787 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"() 788 %2 = icmp sgt <8 x i16> %a0, %a1 789 %3 = sext <8 x i1> %2 to <8 x i16> 790 ret <8 x i16> %3 791} 792 793define i32 @stack_fold_pcmpistri(<16 x i8> %a0, <16 x i8> %a1) { 794; CHECK-LABEL: stack_fold_pcmpistri: 795; CHECK: # %bb.0: 796; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 797; CHECK-NEXT: #APP 798; CHECK-NEXT: nop 799; CHECK-NEXT: #NO_APP 800; CHECK-NEXT: vpcmpistri $7, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload 801; CHECK-NEXT: movl %ecx, %eax 802; CHECK-NEXT: retq 803 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"() 804 %2 = call i32 @llvm.x86.sse42.pcmpistri128(<16 x i8> %a0, <16 x i8> %a1, i8 7) 805 ret i32 %2 806} 807declare i32 @llvm.x86.sse42.pcmpistri128(<16 x i8>, <16 x i8>, i8) nounwind readnone 808 809define <16 x i8> @stack_fold_pcmpistrm(<16 x i8> %a0, <16 x i8> %a1) { 810; CHECK-LABEL: stack_fold_pcmpistrm: 811; CHECK: # %bb.0: 812; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 813; CHECK-NEXT: #APP 814; CHECK-NEXT: nop 815; CHECK-NEXT: #NO_APP 816; CHECK-NEXT: vpcmpistrm $7, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload 817; CHECK-NEXT: retq 818 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"() 819 %2 = call <16 x i8> @llvm.x86.sse42.pcmpistrm128(<16 x i8> %a0, <16 x i8> %a1, i8 7) 820 ret <16 x i8> %2 821} 822declare <16 x i8> @llvm.x86.sse42.pcmpistrm128(<16 x i8>, <16 x i8>, i8) nounwind readnone 823 824; TODO stack_fold_pextrb 825 826define i32 @stack_fold_pextrd(<4 x i32> %a0, <4 x i32> %a1) { 827; CHECK-LABEL: stack_fold_pextrd: 828; CHECK: # %bb.0: 829; CHECK-NEXT: pushq %rbp 830; CHECK-NEXT: .cfi_def_cfa_offset 16 831; CHECK-NEXT: pushq %r15 832; CHECK-NEXT: .cfi_def_cfa_offset 24 833; CHECK-NEXT: pushq %r14 834; CHECK-NEXT: .cfi_def_cfa_offset 32 835; CHECK-NEXT: pushq %r13 836; CHECK-NEXT: .cfi_def_cfa_offset 40 837; CHECK-NEXT: pushq %r12 838; CHECK-NEXT: .cfi_def_cfa_offset 48 839; CHECK-NEXT: pushq %rbx 840; CHECK-NEXT: .cfi_def_cfa_offset 56 841; CHECK-NEXT: .cfi_offset %rbx, -56 842; CHECK-NEXT: .cfi_offset %r12, -48 843; CHECK-NEXT: .cfi_offset %r13, -40 844; CHECK-NEXT: .cfi_offset %r14, -32 845; CHECK-NEXT: .cfi_offset %r15, -24 846; CHECK-NEXT: .cfi_offset %rbp, -16 847; CHECK-NEXT: vpaddd %xmm1, %xmm0, %xmm0 848; CHECK-NEXT: vpextrd $1, %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Folded Spill 849; CHECK-NEXT: #APP 850; CHECK-NEXT: nop 851; CHECK-NEXT: #NO_APP 852; CHECK-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 4-byte Reload 853; CHECK-NEXT: popq %rbx 854; CHECK-NEXT: .cfi_def_cfa_offset 48 855; CHECK-NEXT: popq %r12 856; CHECK-NEXT: .cfi_def_cfa_offset 40 857; CHECK-NEXT: popq %r13 858; CHECK-NEXT: .cfi_def_cfa_offset 32 859; CHECK-NEXT: popq %r14 860; CHECK-NEXT: .cfi_def_cfa_offset 24 861; CHECK-NEXT: popq %r15 862; CHECK-NEXT: .cfi_def_cfa_offset 16 863; CHECK-NEXT: popq %rbp 864; CHECK-NEXT: .cfi_def_cfa_offset 8 865; CHECK-NEXT: retq 866 ; add forces execution domain 867 %1 = add <4 x i32> %a0, %a1 868 %2 = extractelement <4 x i32> %1, i32 1 869 %3 = tail call <2 x i64> asm sideeffect "nop", "=x,~{rax},~{rbx},~{rcx},~{rdx},~{rsi},~{rdi},~{rbp},~{r8},~{r9},~{r10},~{r11},~{r12},~{r13},~{r14},~{r15}"() 870 ret i32 %2 871} 872 873define i64 @stack_fold_pextrq(<2 x i64> %a0) { 874; CHECK-LABEL: stack_fold_pextrq: 875; CHECK: # %bb.0: 876; CHECK-NEXT: pushq %rbp 877; CHECK-NEXT: .cfi_def_cfa_offset 16 878; CHECK-NEXT: pushq %r15 879; CHECK-NEXT: .cfi_def_cfa_offset 24 880; CHECK-NEXT: pushq %r14 881; CHECK-NEXT: .cfi_def_cfa_offset 32 882; CHECK-NEXT: pushq %r13 883; CHECK-NEXT: .cfi_def_cfa_offset 40 884; CHECK-NEXT: pushq %r12 885; CHECK-NEXT: .cfi_def_cfa_offset 48 886; CHECK-NEXT: pushq %rbx 887; CHECK-NEXT: .cfi_def_cfa_offset 56 888; CHECK-NEXT: .cfi_offset %rbx, -56 889; CHECK-NEXT: .cfi_offset %r12, -48 890; CHECK-NEXT: .cfi_offset %r13, -40 891; CHECK-NEXT: .cfi_offset %r14, -32 892; CHECK-NEXT: .cfi_offset %r15, -24 893; CHECK-NEXT: .cfi_offset %rbp, -16 894; CHECK-NEXT: vpextrq $1, %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill 895; CHECK-NEXT: #APP 896; CHECK-NEXT: nop 897; CHECK-NEXT: #NO_APP 898; CHECK-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload 899; CHECK-NEXT: popq %rbx 900; CHECK-NEXT: .cfi_def_cfa_offset 48 901; CHECK-NEXT: popq %r12 902; CHECK-NEXT: .cfi_def_cfa_offset 40 903; CHECK-NEXT: popq %r13 904; CHECK-NEXT: .cfi_def_cfa_offset 32 905; CHECK-NEXT: popq %r14 906; CHECK-NEXT: .cfi_def_cfa_offset 24 907; CHECK-NEXT: popq %r15 908; CHECK-NEXT: .cfi_def_cfa_offset 16 909; CHECK-NEXT: popq %rbp 910; CHECK-NEXT: .cfi_def_cfa_offset 8 911; CHECK-NEXT: retq 912 %1 = extractelement <2 x i64> %a0, i32 1 913 %2 = tail call <2 x i64> asm sideeffect "nop", "=x,~{rax},~{rbx},~{rcx},~{rdx},~{rsi},~{rdi},~{rbp},~{r8},~{r9},~{r10},~{r11},~{r12},~{r13},~{r14},~{r15}"() 914 ret i64 %1 915} 916 917; TODO stack_fold_pextrw 918 919define <4 x i32> @stack_fold_phaddd(<4 x i32> %a0, <4 x i32> %a1) { 920; CHECK-LABEL: stack_fold_phaddd: 921; CHECK: # %bb.0: 922; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 923; CHECK-NEXT: #APP 924; CHECK-NEXT: nop 925; CHECK-NEXT: #NO_APP 926; CHECK-NEXT: vphaddd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload 927; CHECK-NEXT: retq 928 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"() 929 %2 = call <4 x i32> @llvm.x86.ssse3.phadd.d.128(<4 x i32> %a0, <4 x i32> %a1) 930 ret <4 x i32> %2 931} 932declare <4 x i32> @llvm.x86.ssse3.phadd.d.128(<4 x i32>, <4 x i32>) nounwind readnone 933 934define <8 x i16> @stack_fold_phaddsw(<8 x i16> %a0, <8 x i16> %a1) { 935; CHECK-LABEL: stack_fold_phaddsw: 936; CHECK: # %bb.0: 937; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 938; CHECK-NEXT: #APP 939; CHECK-NEXT: nop 940; CHECK-NEXT: #NO_APP 941; CHECK-NEXT: vphaddsw {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload 942; CHECK-NEXT: retq 943 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"() 944 %2 = call <8 x i16> @llvm.x86.ssse3.phadd.sw.128(<8 x i16> %a0, <8 x i16> %a1) 945 ret <8 x i16> %2 946} 947declare <8 x i16> @llvm.x86.ssse3.phadd.sw.128(<8 x i16>, <8 x i16>) nounwind readnone 948 949define <8 x i16> @stack_fold_phaddw(<8 x i16> %a0, <8 x i16> %a1) { 950; CHECK-LABEL: stack_fold_phaddw: 951; CHECK: # %bb.0: 952; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 953; CHECK-NEXT: #APP 954; CHECK-NEXT: nop 955; CHECK-NEXT: #NO_APP 956; CHECK-NEXT: vphaddw {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload 957; CHECK-NEXT: retq 958 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"() 959 %2 = call <8 x i16> @llvm.x86.ssse3.phadd.w.128(<8 x i16> %a0, <8 x i16> %a1) 960 ret <8 x i16> %2 961} 962declare <8 x i16> @llvm.x86.ssse3.phadd.w.128(<8 x i16>, <8 x i16>) nounwind readnone 963 964define <8 x i16> @stack_fold_phminposuw(<8 x i16> %a0) { 965; CHECK-LABEL: stack_fold_phminposuw: 966; CHECK: # %bb.0: 967; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 968; CHECK-NEXT: #APP 969; CHECK-NEXT: nop 970; CHECK-NEXT: #NO_APP 971; CHECK-NEXT: vphminposuw {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload 972; CHECK-NEXT: retq 973 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"() 974 %2 = call <8 x i16> @llvm.x86.sse41.phminposuw(<8 x i16> %a0) 975 ret <8 x i16> %2 976} 977declare <8 x i16> @llvm.x86.sse41.phminposuw(<8 x i16>) nounwind readnone 978 979define <4 x i32> @stack_fold_phsubd(<4 x i32> %a0, <4 x i32> %a1) { 980; CHECK-LABEL: stack_fold_phsubd: 981; CHECK: # %bb.0: 982; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 983; CHECK-NEXT: #APP 984; CHECK-NEXT: nop 985; CHECK-NEXT: #NO_APP 986; CHECK-NEXT: vphsubd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload 987; CHECK-NEXT: retq 988 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"() 989 %2 = call <4 x i32> @llvm.x86.ssse3.phsub.d.128(<4 x i32> %a0, <4 x i32> %a1) 990 ret <4 x i32> %2 991} 992declare <4 x i32> @llvm.x86.ssse3.phsub.d.128(<4 x i32>, <4 x i32>) nounwind readnone 993 994define <8 x i16> @stack_fold_phsubsw(<8 x i16> %a0, <8 x i16> %a1) { 995; CHECK-LABEL: stack_fold_phsubsw: 996; CHECK: # %bb.0: 997; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 998; CHECK-NEXT: #APP 999; CHECK-NEXT: nop 1000; CHECK-NEXT: #NO_APP 1001; CHECK-NEXT: vphsubsw {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload 1002; CHECK-NEXT: retq 1003 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"() 1004 %2 = call <8 x i16> @llvm.x86.ssse3.phsub.sw.128(<8 x i16> %a0, <8 x i16> %a1) 1005 ret <8 x i16> %2 1006} 1007declare <8 x i16> @llvm.x86.ssse3.phsub.sw.128(<8 x i16>, <8 x i16>) nounwind readnone 1008 1009define <8 x i16> @stack_fold_phsubw(<8 x i16> %a0, <8 x i16> %a1) { 1010; CHECK-LABEL: stack_fold_phsubw: 1011; CHECK: # %bb.0: 1012; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 1013; CHECK-NEXT: #APP 1014; CHECK-NEXT: nop 1015; CHECK-NEXT: #NO_APP 1016; CHECK-NEXT: vphsubw {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload 1017; CHECK-NEXT: retq 1018 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"() 1019 %2 = call <8 x i16> @llvm.x86.ssse3.phsub.w.128(<8 x i16> %a0, <8 x i16> %a1) 1020 ret <8 x i16> %2 1021} 1022declare <8 x i16> @llvm.x86.ssse3.phsub.w.128(<8 x i16>, <8 x i16>) nounwind readnone 1023 1024define <16 x i8> @stack_fold_pinsrb(<16 x i8> %a0, i8 %a1) { 1025; CHECK-LABEL: stack_fold_pinsrb: 1026; CHECK: # %bb.0: 1027; CHECK-NEXT: pushq %rbp 1028; CHECK-NEXT: .cfi_def_cfa_offset 16 1029; CHECK-NEXT: pushq %r15 1030; CHECK-NEXT: .cfi_def_cfa_offset 24 1031; CHECK-NEXT: pushq %r14 1032; CHECK-NEXT: .cfi_def_cfa_offset 32 1033; CHECK-NEXT: pushq %r13 1034; CHECK-NEXT: .cfi_def_cfa_offset 40 1035; CHECK-NEXT: pushq %r12 1036; CHECK-NEXT: .cfi_def_cfa_offset 48 1037; CHECK-NEXT: pushq %rbx 1038; CHECK-NEXT: .cfi_def_cfa_offset 56 1039; CHECK-NEXT: .cfi_offset %rbx, -56 1040; CHECK-NEXT: .cfi_offset %r12, -48 1041; CHECK-NEXT: .cfi_offset %r13, -40 1042; CHECK-NEXT: .cfi_offset %r14, -32 1043; CHECK-NEXT: .cfi_offset %r15, -24 1044; CHECK-NEXT: .cfi_offset %rbp, -16 1045; CHECK-NEXT: movl %edi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill 1046; CHECK-NEXT: #APP 1047; CHECK-NEXT: nop 1048; CHECK-NEXT: #NO_APP 1049; CHECK-NEXT: vpinsrb $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 4-byte Folded Reload 1050; CHECK-NEXT: popq %rbx 1051; CHECK-NEXT: .cfi_def_cfa_offset 48 1052; CHECK-NEXT: popq %r12 1053; CHECK-NEXT: .cfi_def_cfa_offset 40 1054; CHECK-NEXT: popq %r13 1055; CHECK-NEXT: .cfi_def_cfa_offset 32 1056; CHECK-NEXT: popq %r14 1057; CHECK-NEXT: .cfi_def_cfa_offset 24 1058; CHECK-NEXT: popq %r15 1059; CHECK-NEXT: .cfi_def_cfa_offset 16 1060; CHECK-NEXT: popq %rbp 1061; CHECK-NEXT: .cfi_def_cfa_offset 8 1062; CHECK-NEXT: retq 1063 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{rax},~{rbx},~{rcx},~{rdx},~{rsi},~{rdi},~{rbp},~{r8},~{r9},~{r10},~{r11},~{r12},~{r13},~{r14},~{r15}"() 1064 %2 = insertelement <16 x i8> %a0, i8 %a1, i32 1 1065 ret <16 x i8> %2 1066} 1067 1068define <4 x i32> @stack_fold_pinsrd(<4 x i32> %a0, i32 %a1) { 1069; CHECK-LABEL: stack_fold_pinsrd: 1070; CHECK: # %bb.0: 1071; CHECK-NEXT: pushq %rbp 1072; CHECK-NEXT: .cfi_def_cfa_offset 16 1073; CHECK-NEXT: pushq %r15 1074; CHECK-NEXT: .cfi_def_cfa_offset 24 1075; CHECK-NEXT: pushq %r14 1076; CHECK-NEXT: .cfi_def_cfa_offset 32 1077; CHECK-NEXT: pushq %r13 1078; CHECK-NEXT: .cfi_def_cfa_offset 40 1079; CHECK-NEXT: pushq %r12 1080; CHECK-NEXT: .cfi_def_cfa_offset 48 1081; CHECK-NEXT: pushq %rbx 1082; CHECK-NEXT: .cfi_def_cfa_offset 56 1083; CHECK-NEXT: .cfi_offset %rbx, -56 1084; CHECK-NEXT: .cfi_offset %r12, -48 1085; CHECK-NEXT: .cfi_offset %r13, -40 1086; CHECK-NEXT: .cfi_offset %r14, -32 1087; CHECK-NEXT: .cfi_offset %r15, -24 1088; CHECK-NEXT: .cfi_offset %rbp, -16 1089; CHECK-NEXT: movl %edi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill 1090; CHECK-NEXT: #APP 1091; CHECK-NEXT: nop 1092; CHECK-NEXT: #NO_APP 1093; CHECK-NEXT: vpinsrd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 4-byte Folded Reload 1094; CHECK-NEXT: popq %rbx 1095; CHECK-NEXT: .cfi_def_cfa_offset 48 1096; CHECK-NEXT: popq %r12 1097; CHECK-NEXT: .cfi_def_cfa_offset 40 1098; CHECK-NEXT: popq %r13 1099; CHECK-NEXT: .cfi_def_cfa_offset 32 1100; CHECK-NEXT: popq %r14 1101; CHECK-NEXT: .cfi_def_cfa_offset 24 1102; CHECK-NEXT: popq %r15 1103; CHECK-NEXT: .cfi_def_cfa_offset 16 1104; CHECK-NEXT: popq %rbp 1105; CHECK-NEXT: .cfi_def_cfa_offset 8 1106; CHECK-NEXT: retq 1107 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{rax},~{rbx},~{rcx},~{rdx},~{rsi},~{rdi},~{rbp},~{r8},~{r9},~{r10},~{r11},~{r12},~{r13},~{r14},~{r15}"() 1108 %2 = insertelement <4 x i32> %a0, i32 %a1, i32 1 1109 ret <4 x i32> %2 1110} 1111 1112define <2 x i64> @stack_fold_pinsrq(<2 x i64> %a0, i64 %a1) { 1113; CHECK-LABEL: stack_fold_pinsrq: 1114; CHECK: # %bb.0: 1115; CHECK-NEXT: pushq %rbp 1116; CHECK-NEXT: .cfi_def_cfa_offset 16 1117; CHECK-NEXT: pushq %r15 1118; CHECK-NEXT: .cfi_def_cfa_offset 24 1119; CHECK-NEXT: pushq %r14 1120; CHECK-NEXT: .cfi_def_cfa_offset 32 1121; CHECK-NEXT: pushq %r13 1122; CHECK-NEXT: .cfi_def_cfa_offset 40 1123; CHECK-NEXT: pushq %r12 1124; CHECK-NEXT: .cfi_def_cfa_offset 48 1125; CHECK-NEXT: pushq %rbx 1126; CHECK-NEXT: .cfi_def_cfa_offset 56 1127; CHECK-NEXT: .cfi_offset %rbx, -56 1128; CHECK-NEXT: .cfi_offset %r12, -48 1129; CHECK-NEXT: .cfi_offset %r13, -40 1130; CHECK-NEXT: .cfi_offset %r14, -32 1131; CHECK-NEXT: .cfi_offset %r15, -24 1132; CHECK-NEXT: .cfi_offset %rbp, -16 1133; CHECK-NEXT: movq %rdi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill 1134; CHECK-NEXT: #APP 1135; CHECK-NEXT: nop 1136; CHECK-NEXT: #NO_APP 1137; CHECK-NEXT: vpinsrq $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 8-byte Folded Reload 1138; CHECK-NEXT: popq %rbx 1139; CHECK-NEXT: .cfi_def_cfa_offset 48 1140; CHECK-NEXT: popq %r12 1141; CHECK-NEXT: .cfi_def_cfa_offset 40 1142; CHECK-NEXT: popq %r13 1143; CHECK-NEXT: .cfi_def_cfa_offset 32 1144; CHECK-NEXT: popq %r14 1145; CHECK-NEXT: .cfi_def_cfa_offset 24 1146; CHECK-NEXT: popq %r15 1147; CHECK-NEXT: .cfi_def_cfa_offset 16 1148; CHECK-NEXT: popq %rbp 1149; CHECK-NEXT: .cfi_def_cfa_offset 8 1150; CHECK-NEXT: retq 1151 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{rax},~{rbx},~{rcx},~{rdx},~{rsi},~{rdi},~{rbp},~{r8},~{r9},~{r10},~{r11},~{r12},~{r13},~{r14},~{r15}"() 1152 %2 = insertelement <2 x i64> %a0, i64 %a1, i32 1 1153 ret <2 x i64> %2 1154} 1155 1156define <8 x i16> @stack_fold_pinsrw(<8 x i16> %a0, i16 %a1) { 1157; CHECK-LABEL: stack_fold_pinsrw: 1158; CHECK: # %bb.0: 1159; CHECK-NEXT: pushq %rbp 1160; CHECK-NEXT: .cfi_def_cfa_offset 16 1161; CHECK-NEXT: pushq %r15 1162; CHECK-NEXT: .cfi_def_cfa_offset 24 1163; CHECK-NEXT: pushq %r14 1164; CHECK-NEXT: .cfi_def_cfa_offset 32 1165; CHECK-NEXT: pushq %r13 1166; CHECK-NEXT: .cfi_def_cfa_offset 40 1167; CHECK-NEXT: pushq %r12 1168; CHECK-NEXT: .cfi_def_cfa_offset 48 1169; CHECK-NEXT: pushq %rbx 1170; CHECK-NEXT: .cfi_def_cfa_offset 56 1171; CHECK-NEXT: .cfi_offset %rbx, -56 1172; CHECK-NEXT: .cfi_offset %r12, -48 1173; CHECK-NEXT: .cfi_offset %r13, -40 1174; CHECK-NEXT: .cfi_offset %r14, -32 1175; CHECK-NEXT: .cfi_offset %r15, -24 1176; CHECK-NEXT: .cfi_offset %rbp, -16 1177; CHECK-NEXT: movl %edi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill 1178; CHECK-NEXT: #APP 1179; CHECK-NEXT: nop 1180; CHECK-NEXT: #NO_APP 1181; CHECK-NEXT: vpinsrw $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 4-byte Folded Reload 1182; CHECK-NEXT: popq %rbx 1183; CHECK-NEXT: .cfi_def_cfa_offset 48 1184; CHECK-NEXT: popq %r12 1185; CHECK-NEXT: .cfi_def_cfa_offset 40 1186; CHECK-NEXT: popq %r13 1187; CHECK-NEXT: .cfi_def_cfa_offset 32 1188; CHECK-NEXT: popq %r14 1189; CHECK-NEXT: .cfi_def_cfa_offset 24 1190; CHECK-NEXT: popq %r15 1191; CHECK-NEXT: .cfi_def_cfa_offset 16 1192; CHECK-NEXT: popq %rbp 1193; CHECK-NEXT: .cfi_def_cfa_offset 8 1194; CHECK-NEXT: retq 1195 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{rax},~{rbx},~{rcx},~{rdx},~{rsi},~{rdi},~{rbp},~{r8},~{r9},~{r10},~{r11},~{r12},~{r13},~{r14},~{r15}"() 1196 %2 = insertelement <8 x i16> %a0, i16 %a1, i32 1 1197 ret <8 x i16> %2 1198} 1199 1200define <8 x i16> @stack_fold_pmaddubsw(<16 x i8> %a0, <16 x i8> %a1) { 1201; CHECK-LABEL: stack_fold_pmaddubsw: 1202; CHECK: # %bb.0: 1203; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 1204; CHECK-NEXT: #APP 1205; CHECK-NEXT: nop 1206; CHECK-NEXT: #NO_APP 1207; CHECK-NEXT: vpmaddubsw {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload 1208; CHECK-NEXT: retq 1209 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"() 1210 %2 = call <8 x i16> @llvm.x86.ssse3.pmadd.ub.sw.128(<16 x i8> %a0, <16 x i8> %a1) 1211 ret <8 x i16> %2 1212} 1213declare <8 x i16> @llvm.x86.ssse3.pmadd.ub.sw.128(<16 x i8>, <16 x i8>) nounwind readnone 1214 1215define <4 x i32> @stack_fold_pmaddwd(<8 x i16> %a0, <8 x i16> %a1) { 1216; CHECK-LABEL: stack_fold_pmaddwd: 1217; CHECK: # %bb.0: 1218; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 1219; CHECK-NEXT: #APP 1220; CHECK-NEXT: nop 1221; CHECK-NEXT: #NO_APP 1222; CHECK-NEXT: vpmaddwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload 1223; CHECK-NEXT: retq 1224 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"() 1225 %2 = call <4 x i32> @llvm.x86.sse2.pmadd.wd(<8 x i16> %a0, <8 x i16> %a1) 1226 ret <4 x i32> %2 1227} 1228declare <4 x i32> @llvm.x86.sse2.pmadd.wd(<8 x i16>, <8 x i16>) nounwind readnone 1229 1230define <16 x i8> @stack_fold_pmaxsb(<16 x i8> %a0, <16 x i8> %a1) { 1231; CHECK-LABEL: stack_fold_pmaxsb: 1232; CHECK: # %bb.0: 1233; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 1234; CHECK-NEXT: #APP 1235; CHECK-NEXT: nop 1236; CHECK-NEXT: #NO_APP 1237; CHECK-NEXT: vpmaxsb {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload 1238; CHECK-NEXT: retq 1239 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"() 1240 %2 = icmp sgt <16 x i8> %a0, %a1 1241 %3 = select <16 x i1> %2, <16 x i8> %a0, <16 x i8> %a1 1242 ret <16 x i8> %3 1243} 1244 1245define <4 x i32> @stack_fold_pmaxsd(<4 x i32> %a0, <4 x i32> %a1) { 1246; CHECK-LABEL: stack_fold_pmaxsd: 1247; CHECK: # %bb.0: 1248; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 1249; CHECK-NEXT: #APP 1250; CHECK-NEXT: nop 1251; CHECK-NEXT: #NO_APP 1252; CHECK-NEXT: vpmaxsd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload 1253; CHECK-NEXT: retq 1254 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"() 1255 %2 = icmp sgt <4 x i32> %a0, %a1 1256 %3 = select <4 x i1> %2, <4 x i32> %a0, <4 x i32> %a1 1257 ret <4 x i32> %3 1258} 1259 1260define <8 x i16> @stack_fold_pmaxsw(<8 x i16> %a0, <8 x i16> %a1) { 1261; CHECK-LABEL: stack_fold_pmaxsw: 1262; CHECK: # %bb.0: 1263; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 1264; CHECK-NEXT: #APP 1265; CHECK-NEXT: nop 1266; CHECK-NEXT: #NO_APP 1267; CHECK-NEXT: vpmaxsw {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload 1268; CHECK-NEXT: retq 1269 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"() 1270 %2 = icmp sgt <8 x i16> %a0, %a1 1271 %3 = select <8 x i1> %2, <8 x i16> %a0, <8 x i16> %a1 1272 ret <8 x i16> %3 1273} 1274 1275define <16 x i8> @stack_fold_pmaxub(<16 x i8> %a0, <16 x i8> %a1) { 1276; CHECK-LABEL: stack_fold_pmaxub: 1277; CHECK: # %bb.0: 1278; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 1279; CHECK-NEXT: #APP 1280; CHECK-NEXT: nop 1281; CHECK-NEXT: #NO_APP 1282; CHECK-NEXT: vpmaxub {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload 1283; CHECK-NEXT: retq 1284 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"() 1285 %2 = icmp ugt <16 x i8> %a0, %a1 1286 %3 = select <16 x i1> %2, <16 x i8> %a0, <16 x i8> %a1 1287 ret <16 x i8> %3 1288} 1289 1290define <4 x i32> @stack_fold_pmaxud(<4 x i32> %a0, <4 x i32> %a1) { 1291; CHECK-LABEL: stack_fold_pmaxud: 1292; CHECK: # %bb.0: 1293; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 1294; CHECK-NEXT: #APP 1295; CHECK-NEXT: nop 1296; CHECK-NEXT: #NO_APP 1297; CHECK-NEXT: vpmaxud {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload 1298; CHECK-NEXT: retq 1299 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"() 1300 %2 = icmp ugt <4 x i32> %a0, %a1 1301 %3 = select <4 x i1> %2, <4 x i32> %a0, <4 x i32> %a1 1302 ret <4 x i32> %3 1303} 1304 1305define <8 x i16> @stack_fold_pmaxuw(<8 x i16> %a0, <8 x i16> %a1) { 1306; CHECK-LABEL: stack_fold_pmaxuw: 1307; CHECK: # %bb.0: 1308; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 1309; CHECK-NEXT: #APP 1310; CHECK-NEXT: nop 1311; CHECK-NEXT: #NO_APP 1312; CHECK-NEXT: vpmaxuw {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload 1313; CHECK-NEXT: retq 1314 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"() 1315 %2 = icmp ugt <8 x i16> %a0, %a1 1316 %3 = select <8 x i1> %2, <8 x i16> %a0, <8 x i16> %a1 1317 ret <8 x i16> %3 1318} 1319 1320define <16 x i8> @stack_fold_pminsb(<16 x i8> %a0, <16 x i8> %a1) { 1321; CHECK-LABEL: stack_fold_pminsb: 1322; CHECK: # %bb.0: 1323; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 1324; CHECK-NEXT: #APP 1325; CHECK-NEXT: nop 1326; CHECK-NEXT: #NO_APP 1327; CHECK-NEXT: vpminsb {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload 1328; CHECK-NEXT: retq 1329 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"() 1330 %2 = icmp slt <16 x i8> %a0, %a1 1331 %3 = select <16 x i1> %2, <16 x i8> %a0, <16 x i8> %a1 1332 ret <16 x i8> %3 1333} 1334 1335define <4 x i32> @stack_fold_pminsd(<4 x i32> %a0, <4 x i32> %a1) { 1336; CHECK-LABEL: stack_fold_pminsd: 1337; CHECK: # %bb.0: 1338; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 1339; CHECK-NEXT: #APP 1340; CHECK-NEXT: nop 1341; CHECK-NEXT: #NO_APP 1342; CHECK-NEXT: vpminsd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload 1343; CHECK-NEXT: retq 1344 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"() 1345 %2 = icmp slt <4 x i32> %a0, %a1 1346 %3 = select <4 x i1> %2, <4 x i32> %a0, <4 x i32> %a1 1347 ret <4 x i32> %3 1348} 1349 1350define <8 x i16> @stack_fold_pminsw(<8 x i16> %a0, <8 x i16> %a1) { 1351; CHECK-LABEL: stack_fold_pminsw: 1352; CHECK: # %bb.0: 1353; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 1354; CHECK-NEXT: #APP 1355; CHECK-NEXT: nop 1356; CHECK-NEXT: #NO_APP 1357; CHECK-NEXT: vpminsw {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload 1358; CHECK-NEXT: retq 1359 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"() 1360 %2 = icmp slt <8 x i16> %a0, %a1 1361 %3 = select <8 x i1> %2, <8 x i16> %a0, <8 x i16> %a1 1362 ret <8 x i16> %3 1363} 1364 1365define <16 x i8> @stack_fold_pminub(<16 x i8> %a0, <16 x i8> %a1) { 1366; CHECK-LABEL: stack_fold_pminub: 1367; CHECK: # %bb.0: 1368; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 1369; CHECK-NEXT: #APP 1370; CHECK-NEXT: nop 1371; CHECK-NEXT: #NO_APP 1372; CHECK-NEXT: vpminub {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload 1373; CHECK-NEXT: retq 1374 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"() 1375 %2 = icmp ult <16 x i8> %a0, %a1 1376 %3 = select <16 x i1> %2, <16 x i8> %a0, <16 x i8> %a1 1377 ret <16 x i8> %3 1378} 1379 1380define <4 x i32> @stack_fold_pminud(<4 x i32> %a0, <4 x i32> %a1) { 1381; CHECK-LABEL: stack_fold_pminud: 1382; CHECK: # %bb.0: 1383; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 1384; CHECK-NEXT: #APP 1385; CHECK-NEXT: nop 1386; CHECK-NEXT: #NO_APP 1387; CHECK-NEXT: vpminud {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload 1388; CHECK-NEXT: retq 1389 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"() 1390 %2 = icmp ult <4 x i32> %a0, %a1 1391 %3 = select <4 x i1> %2, <4 x i32> %a0, <4 x i32> %a1 1392 ret <4 x i32> %3 1393} 1394 1395define <8 x i16> @stack_fold_pminuw(<8 x i16> %a0, <8 x i16> %a1) { 1396; CHECK-LABEL: stack_fold_pminuw: 1397; CHECK: # %bb.0: 1398; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 1399; CHECK-NEXT: #APP 1400; CHECK-NEXT: nop 1401; CHECK-NEXT: #NO_APP 1402; CHECK-NEXT: vpminuw {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload 1403; CHECK-NEXT: retq 1404 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"() 1405 %2 = icmp ult <8 x i16> %a0, %a1 1406 %3 = select <8 x i1> %2, <8 x i16> %a0, <8 x i16> %a1 1407 ret <8 x i16> %3 1408} 1409 1410define <2 x i64> @stack_fold_pmuldq(<4 x i32> %a0, <4 x i32> %a1) { 1411; CHECK-LABEL: stack_fold_pmuldq: 1412; CHECK: # %bb.0: 1413; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 1414; CHECK-NEXT: #APP 1415; CHECK-NEXT: nop 1416; CHECK-NEXT: #NO_APP 1417; CHECK-NEXT: vpmuldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload 1418; CHECK-NEXT: retq 1419 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"() 1420 %2 = bitcast <4 x i32> %a0 to <2 x i64> 1421 %3 = bitcast <4 x i32> %a1 to <2 x i64> 1422 %4 = shl <2 x i64> %2, <i64 32, i64 32> 1423 %5 = ashr <2 x i64> %4, <i64 32, i64 32> 1424 %6 = shl <2 x i64> %3, <i64 32, i64 32> 1425 %7 = ashr <2 x i64> %6, <i64 32, i64 32> 1426 %8 = mul <2 x i64> %5, %7 1427 ret <2 x i64> %8 1428} 1429 1430define <8 x i16> @stack_fold_pmulhrsw(<8 x i16> %a0, <8 x i16> %a1) { 1431; CHECK-LABEL: stack_fold_pmulhrsw: 1432; CHECK: # %bb.0: 1433; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 1434; CHECK-NEXT: #APP 1435; CHECK-NEXT: nop 1436; CHECK-NEXT: #NO_APP 1437; CHECK-NEXT: vpmulhrsw {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload 1438; CHECK-NEXT: retq 1439 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"() 1440 %2 = call <8 x i16> @llvm.x86.ssse3.pmul.hr.sw.128(<8 x i16> %a0, <8 x i16> %a1) 1441 ret <8 x i16> %2 1442} 1443declare <8 x i16> @llvm.x86.ssse3.pmul.hr.sw.128(<8 x i16>, <8 x i16>) nounwind readnone 1444 1445define <8 x i16> @stack_fold_pmulhuw(<8 x i16> %a0, <8 x i16> %a1) { 1446; CHECK-LABEL: stack_fold_pmulhuw: 1447; CHECK: # %bb.0: 1448; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 1449; CHECK-NEXT: #APP 1450; CHECK-NEXT: nop 1451; CHECK-NEXT: #NO_APP 1452; CHECK-NEXT: vpmulhuw {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload 1453; CHECK-NEXT: retq 1454 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"() 1455 %2 = call <8 x i16> @llvm.x86.sse2.pmulhu.w(<8 x i16> %a0, <8 x i16> %a1) 1456 ret <8 x i16> %2 1457} 1458declare <8 x i16> @llvm.x86.sse2.pmulhu.w(<8 x i16>, <8 x i16>) nounwind readnone 1459 1460define <8 x i16> @stack_fold_pmulhw(<8 x i16> %a0, <8 x i16> %a1) { 1461; CHECK-LABEL: stack_fold_pmulhw: 1462; CHECK: # %bb.0: 1463; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 1464; CHECK-NEXT: #APP 1465; CHECK-NEXT: nop 1466; CHECK-NEXT: #NO_APP 1467; CHECK-NEXT: vpmulhw {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload 1468; CHECK-NEXT: retq 1469 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"() 1470 %2 = call <8 x i16> @llvm.x86.sse2.pmulh.w(<8 x i16> %a0, <8 x i16> %a1) 1471 ret <8 x i16> %2 1472} 1473declare <8 x i16> @llvm.x86.sse2.pmulh.w(<8 x i16>, <8 x i16>) nounwind readnone 1474 1475define <4 x i32> @stack_fold_pmulld(<4 x i32> %a0, <4 x i32> %a1) { 1476; CHECK-LABEL: stack_fold_pmulld: 1477; CHECK: # %bb.0: 1478; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 1479; CHECK-NEXT: #APP 1480; CHECK-NEXT: nop 1481; CHECK-NEXT: #NO_APP 1482; CHECK-NEXT: vpmulld {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload 1483; CHECK-NEXT: retq 1484 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"() 1485 %2 = mul <4 x i32> %a0, %a1 1486 ret <4 x i32> %2 1487} 1488 1489define <8 x i16> @stack_fold_pmullw(<8 x i16> %a0, <8 x i16> %a1) { 1490; CHECK-LABEL: stack_fold_pmullw: 1491; CHECK: # %bb.0: 1492; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 1493; CHECK-NEXT: #APP 1494; CHECK-NEXT: nop 1495; CHECK-NEXT: #NO_APP 1496; CHECK-NEXT: vpmullw {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload 1497; CHECK-NEXT: retq 1498 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"() 1499 %2 = mul <8 x i16> %a0, %a1 1500 ret <8 x i16> %2 1501} 1502 1503define <2 x i64> @stack_fold_pmuludq(<4 x i32> %a0, <4 x i32> %a1) { 1504; CHECK-LABEL: stack_fold_pmuludq: 1505; CHECK: # %bb.0: 1506; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 1507; CHECK-NEXT: #APP 1508; CHECK-NEXT: nop 1509; CHECK-NEXT: #NO_APP 1510; CHECK-NEXT: vpmuludq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload 1511; CHECK-NEXT: retq 1512 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"() 1513 %2 = bitcast <4 x i32> %a0 to <2 x i64> 1514 %3 = bitcast <4 x i32> %a1 to <2 x i64> 1515 %4 = and <2 x i64> %2, <i64 4294967295, i64 4294967295> 1516 %5 = and <2 x i64> %3, <i64 4294967295, i64 4294967295> 1517 %6 = mul <2 x i64> %4, %5 1518 ret <2 x i64> %6 1519} 1520 1521define <16 x i8> @stack_fold_por(<16 x i8> %a0, <16 x i8> %a1) { 1522; CHECK-LABEL: stack_fold_por: 1523; CHECK: # %bb.0: 1524; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 1525; CHECK-NEXT: #APP 1526; CHECK-NEXT: nop 1527; CHECK-NEXT: #NO_APP 1528; CHECK-NEXT: vpor {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload 1529; CHECK-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 1530; CHECK-NEXT: vpsubb %xmm1, %xmm0, %xmm0 1531; CHECK-NEXT: retq 1532 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"() 1533 %2 = or <16 x i8> %a0, %a1 1534 ; add forces execution domain 1535 %3 = add <16 x i8> %2, <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1> 1536 ret <16 x i8> %3 1537} 1538 1539define <2 x i64> @stack_fold_psadbw(<16 x i8> %a0, <16 x i8> %a1) { 1540; CHECK-LABEL: stack_fold_psadbw: 1541; CHECK: # %bb.0: 1542; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 1543; CHECK-NEXT: #APP 1544; CHECK-NEXT: nop 1545; CHECK-NEXT: #NO_APP 1546; CHECK-NEXT: vpsadbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload 1547; CHECK-NEXT: retq 1548 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"() 1549 %2 = call <2 x i64> @llvm.x86.sse2.psad.bw(<16 x i8> %a0, <16 x i8> %a1) 1550 ret <2 x i64> %2 1551} 1552declare <2 x i64> @llvm.x86.sse2.psad.bw(<16 x i8>, <16 x i8>) nounwind readnone 1553 1554define <16 x i8> @stack_fold_pshufb(<16 x i8> %a0, <16 x i8> %a1) { 1555; CHECK-LABEL: stack_fold_pshufb: 1556; CHECK: # %bb.0: 1557; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 1558; CHECK-NEXT: #APP 1559; CHECK-NEXT: nop 1560; CHECK-NEXT: #NO_APP 1561; CHECK-NEXT: vpshufb {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload 1562; CHECK-NEXT: retq 1563 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"() 1564 %2 = call <16 x i8> @llvm.x86.ssse3.pshuf.b.128(<16 x i8> %a0, <16 x i8> %a1) 1565 ret <16 x i8> %2 1566} 1567declare <16 x i8> @llvm.x86.ssse3.pshuf.b.128(<16 x i8>, <16 x i8>) nounwind readnone 1568 1569define <4 x i32> @stack_fold_pshufd(<4 x i32> %a0) { 1570; CHECK-LABEL: stack_fold_pshufd: 1571; CHECK: # %bb.0: 1572; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 1573; CHECK-NEXT: #APP 1574; CHECK-NEXT: nop 1575; CHECK-NEXT: #NO_APP 1576; CHECK-NEXT: vpshufd $27, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload 1577; CHECK-NEXT: # xmm0 = mem[3,2,1,0] 1578; CHECK-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 1579; CHECK-NEXT: vpsubd %xmm1, %xmm0, %xmm0 1580; CHECK-NEXT: retq 1581 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"() 1582 %2 = shufflevector <4 x i32> %a0, <4 x i32> undef, <4 x i32> <i32 3, i32 2, i32 1, i32 0> 1583 ; add forces execution domain 1584 %3 = add <4 x i32> %2, <i32 1, i32 1, i32 1, i32 1> 1585 ret <4 x i32> %3 1586} 1587 1588define <8 x i16> @stack_fold_pshufhw(<8 x i16> %a0) { 1589; CHECK-LABEL: stack_fold_pshufhw: 1590; CHECK: # %bb.0: 1591; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 1592; CHECK-NEXT: #APP 1593; CHECK-NEXT: nop 1594; CHECK-NEXT: #NO_APP 1595; CHECK-NEXT: vpshufhw $11, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload 1596; CHECK-NEXT: # xmm0 = mem[0,1,2,3,7,6,4,4] 1597; CHECK-NEXT: retq 1598 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"() 1599 %2 = shufflevector <8 x i16> %a0, <8 x i16> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 7, i32 6, i32 4, i32 4> 1600 ret <8 x i16> %2 1601} 1602 1603define <8 x i16> @stack_fold_pshuflw(<8 x i16> %a0) { 1604; CHECK-LABEL: stack_fold_pshuflw: 1605; CHECK: # %bb.0: 1606; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 1607; CHECK-NEXT: #APP 1608; CHECK-NEXT: nop 1609; CHECK-NEXT: #NO_APP 1610; CHECK-NEXT: vpshuflw $27, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload 1611; CHECK-NEXT: # xmm0 = mem[3,2,1,0,4,5,6,7] 1612; CHECK-NEXT: retq 1613 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"() 1614 %2 = shufflevector <8 x i16> %a0, <8 x i16> undef, <8 x i32> <i32 3, i32 2, i32 1, i32 0, i32 4, i32 5, i32 6, i32 7> 1615 ret <8 x i16> %2 1616} 1617 1618define <16 x i8> @stack_fold_psignb(<16 x i8> %a0, <16 x i8> %a1) { 1619; CHECK-LABEL: stack_fold_psignb: 1620; CHECK: # %bb.0: 1621; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 1622; CHECK-NEXT: #APP 1623; CHECK-NEXT: nop 1624; CHECK-NEXT: #NO_APP 1625; CHECK-NEXT: vpsignb {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload 1626; CHECK-NEXT: retq 1627 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"() 1628 %2 = call <16 x i8> @llvm.x86.ssse3.psign.b.128(<16 x i8> %a0, <16 x i8> %a1) 1629 ret <16 x i8> %2 1630} 1631declare <16 x i8> @llvm.x86.ssse3.psign.b.128(<16 x i8>, <16 x i8>) nounwind readnone 1632 1633define <4 x i32> @stack_fold_psignd(<4 x i32> %a0, <4 x i32> %a1) { 1634; CHECK-LABEL: stack_fold_psignd: 1635; CHECK: # %bb.0: 1636; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 1637; CHECK-NEXT: #APP 1638; CHECK-NEXT: nop 1639; CHECK-NEXT: #NO_APP 1640; CHECK-NEXT: vpsignd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload 1641; CHECK-NEXT: retq 1642 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"() 1643 %2 = call <4 x i32> @llvm.x86.ssse3.psign.d.128(<4 x i32> %a0, <4 x i32> %a1) 1644 ret <4 x i32> %2 1645} 1646declare <4 x i32> @llvm.x86.ssse3.psign.d.128(<4 x i32>, <4 x i32>) nounwind readnone 1647 1648define <8 x i16> @stack_fold_psignw(<8 x i16> %a0, <8 x i16> %a1) { 1649; CHECK-LABEL: stack_fold_psignw: 1650; CHECK: # %bb.0: 1651; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 1652; CHECK-NEXT: #APP 1653; CHECK-NEXT: nop 1654; CHECK-NEXT: #NO_APP 1655; CHECK-NEXT: vpsignw {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload 1656; CHECK-NEXT: retq 1657 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"() 1658 %2 = call <8 x i16> @llvm.x86.ssse3.psign.w.128(<8 x i16> %a0, <8 x i16> %a1) 1659 ret <8 x i16> %2 1660} 1661declare <8 x i16> @llvm.x86.ssse3.psign.w.128(<8 x i16>, <8 x i16>) nounwind readnone 1662 1663define <4 x i32> @stack_fold_pslld(<4 x i32> %a0, <4 x i32> %a1) { 1664; CHECK-LABEL: stack_fold_pslld: 1665; CHECK: # %bb.0: 1666; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 1667; CHECK-NEXT: #APP 1668; CHECK-NEXT: nop 1669; CHECK-NEXT: #NO_APP 1670; CHECK-NEXT: vpslld {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload 1671; CHECK-NEXT: retq 1672 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"() 1673 %2 = call <4 x i32> @llvm.x86.sse2.psll.d(<4 x i32> %a0, <4 x i32> %a1) 1674 ret <4 x i32> %2 1675} 1676declare <4 x i32> @llvm.x86.sse2.psll.d(<4 x i32>, <4 x i32>) nounwind readnone 1677 1678define <2 x i64> @stack_fold_psllq(<2 x i64> %a0, <2 x i64> %a1) { 1679; CHECK-LABEL: stack_fold_psllq: 1680; CHECK: # %bb.0: 1681; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 1682; CHECK-NEXT: #APP 1683; CHECK-NEXT: nop 1684; CHECK-NEXT: #NO_APP 1685; CHECK-NEXT: vpsllq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload 1686; CHECK-NEXT: retq 1687 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"() 1688 %2 = call <2 x i64> @llvm.x86.sse2.psll.q(<2 x i64> %a0, <2 x i64> %a1) 1689 ret <2 x i64> %2 1690} 1691declare <2 x i64> @llvm.x86.sse2.psll.q(<2 x i64>, <2 x i64>) nounwind readnone 1692 1693define <8 x i16> @stack_fold_psllw(<8 x i16> %a0, <8 x i16> %a1) { 1694; CHECK-LABEL: stack_fold_psllw: 1695; CHECK: # %bb.0: 1696; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 1697; CHECK-NEXT: #APP 1698; CHECK-NEXT: nop 1699; CHECK-NEXT: #NO_APP 1700; CHECK-NEXT: vpsllw {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload 1701; CHECK-NEXT: retq 1702 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"() 1703 %2 = call <8 x i16> @llvm.x86.sse2.psll.w(<8 x i16> %a0, <8 x i16> %a1) 1704 ret <8 x i16> %2 1705} 1706declare <8 x i16> @llvm.x86.sse2.psll.w(<8 x i16>, <8 x i16>) nounwind readnone 1707 1708define <4 x i32> @stack_fold_psrad(<4 x i32> %a0, <4 x i32> %a1) { 1709; CHECK-LABEL: stack_fold_psrad: 1710; CHECK: # %bb.0: 1711; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 1712; CHECK-NEXT: #APP 1713; CHECK-NEXT: nop 1714; CHECK-NEXT: #NO_APP 1715; CHECK-NEXT: vpsrad {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload 1716; CHECK-NEXT: retq 1717 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"() 1718 %2 = call <4 x i32> @llvm.x86.sse2.psra.d(<4 x i32> %a0, <4 x i32> %a1) 1719 ret <4 x i32> %2 1720} 1721declare <4 x i32> @llvm.x86.sse2.psra.d(<4 x i32>, <4 x i32>) nounwind readnone 1722 1723define <8 x i16> @stack_fold_psraw(<8 x i16> %a0, <8 x i16> %a1) { 1724; CHECK-LABEL: stack_fold_psraw: 1725; CHECK: # %bb.0: 1726; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 1727; CHECK-NEXT: #APP 1728; CHECK-NEXT: nop 1729; CHECK-NEXT: #NO_APP 1730; CHECK-NEXT: vpsraw {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload 1731; CHECK-NEXT: retq 1732 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"() 1733 %2 = call <8 x i16> @llvm.x86.sse2.psra.w(<8 x i16> %a0, <8 x i16> %a1) 1734 ret <8 x i16> %2 1735} 1736declare <8 x i16> @llvm.x86.sse2.psra.w(<8 x i16>, <8 x i16>) nounwind readnone 1737 1738define <4 x i32> @stack_fold_psrld(<4 x i32> %a0, <4 x i32> %a1) { 1739; CHECK-LABEL: stack_fold_psrld: 1740; CHECK: # %bb.0: 1741; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 1742; CHECK-NEXT: #APP 1743; CHECK-NEXT: nop 1744; CHECK-NEXT: #NO_APP 1745; CHECK-NEXT: vpsrld {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload 1746; CHECK-NEXT: retq 1747 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"() 1748 %2 = call <4 x i32> @llvm.x86.sse2.psrl.d(<4 x i32> %a0, <4 x i32> %a1) 1749 ret <4 x i32> %2 1750} 1751declare <4 x i32> @llvm.x86.sse2.psrl.d(<4 x i32>, <4 x i32>) nounwind readnone 1752 1753define <2 x i64> @stack_fold_psrlq(<2 x i64> %a0, <2 x i64> %a1) { 1754; CHECK-LABEL: stack_fold_psrlq: 1755; CHECK: # %bb.0: 1756; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 1757; CHECK-NEXT: #APP 1758; CHECK-NEXT: nop 1759; CHECK-NEXT: #NO_APP 1760; CHECK-NEXT: vpsrlq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload 1761; CHECK-NEXT: retq 1762 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"() 1763 %2 = call <2 x i64> @llvm.x86.sse2.psrl.q(<2 x i64> %a0, <2 x i64> %a1) 1764 ret <2 x i64> %2 1765} 1766declare <2 x i64> @llvm.x86.sse2.psrl.q(<2 x i64>, <2 x i64>) nounwind readnone 1767 1768define <8 x i16> @stack_fold_psrlw(<8 x i16> %a0, <8 x i16> %a1) { 1769; CHECK-LABEL: stack_fold_psrlw: 1770; CHECK: # %bb.0: 1771; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 1772; CHECK-NEXT: #APP 1773; CHECK-NEXT: nop 1774; CHECK-NEXT: #NO_APP 1775; CHECK-NEXT: vpsrlw {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload 1776; CHECK-NEXT: retq 1777 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"() 1778 %2 = call <8 x i16> @llvm.x86.sse2.psrl.w(<8 x i16> %a0, <8 x i16> %a1) 1779 ret <8 x i16> %2 1780} 1781declare <8 x i16> @llvm.x86.sse2.psrl.w(<8 x i16>, <8 x i16>) nounwind readnone 1782 1783define <16 x i8> @stack_fold_psubb(<16 x i8> %a0, <16 x i8> %a1) { 1784; CHECK-LABEL: stack_fold_psubb: 1785; CHECK: # %bb.0: 1786; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 1787; CHECK-NEXT: #APP 1788; CHECK-NEXT: nop 1789; CHECK-NEXT: #NO_APP 1790; CHECK-NEXT: vpsubb {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload 1791; CHECK-NEXT: retq 1792 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"() 1793 %2 = sub <16 x i8> %a0, %a1 1794 ret <16 x i8> %2 1795} 1796 1797define <4 x i32> @stack_fold_psubd(<4 x i32> %a0, <4 x i32> %a1) { 1798; CHECK-LABEL: stack_fold_psubd: 1799; CHECK: # %bb.0: 1800; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 1801; CHECK-NEXT: #APP 1802; CHECK-NEXT: nop 1803; CHECK-NEXT: #NO_APP 1804; CHECK-NEXT: vpsubd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload 1805; CHECK-NEXT: retq 1806 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"() 1807 %2 = sub <4 x i32> %a0, %a1 1808 ret <4 x i32> %2 1809} 1810 1811define <2 x i64> @stack_fold_psubq(<2 x i64> %a0, <2 x i64> %a1) { 1812; CHECK-LABEL: stack_fold_psubq: 1813; CHECK: # %bb.0: 1814; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 1815; CHECK-NEXT: #APP 1816; CHECK-NEXT: nop 1817; CHECK-NEXT: #NO_APP 1818; CHECK-NEXT: vpsubq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload 1819; CHECK-NEXT: retq 1820 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"() 1821 %2 = sub <2 x i64> %a0, %a1 1822 ret <2 x i64> %2 1823} 1824 1825define <16 x i8> @stack_fold_psubsb(<16 x i8> %a0, <16 x i8> %a1) { 1826; CHECK-LABEL: stack_fold_psubsb: 1827; CHECK: # %bb.0: 1828; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 1829; CHECK-NEXT: #APP 1830; CHECK-NEXT: nop 1831; CHECK-NEXT: #NO_APP 1832; CHECK-NEXT: vpsubsb {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload 1833; CHECK-NEXT: retq 1834 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"() 1835 %2 = call <16 x i8> @llvm.ssub.sat.v16i8(<16 x i8> %a0, <16 x i8> %a1) 1836 ret <16 x i8> %2 1837} 1838declare <16 x i8> @llvm.ssub.sat.v16i8(<16 x i8>, <16 x i8>) nounwind readnone 1839 1840define <8 x i16> @stack_fold_psubsw(<8 x i16> %a0, <8 x i16> %a1) { 1841; CHECK-LABEL: stack_fold_psubsw: 1842; CHECK: # %bb.0: 1843; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 1844; CHECK-NEXT: #APP 1845; CHECK-NEXT: nop 1846; CHECK-NEXT: #NO_APP 1847; CHECK-NEXT: vpsubsw {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload 1848; CHECK-NEXT: retq 1849 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"() 1850 %2 = call <8 x i16> @llvm.ssub.sat.v8i16(<8 x i16> %a0, <8 x i16> %a1) 1851 ret <8 x i16> %2 1852} 1853declare <8 x i16> @llvm.ssub.sat.v8i16(<8 x i16>, <8 x i16>) nounwind readnone 1854 1855define <16 x i8> @stack_fold_psubusb(<16 x i8> %a0, <16 x i8> %a1) { 1856; CHECK-LABEL: stack_fold_psubusb: 1857; CHECK: # %bb.0: 1858; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 1859; CHECK-NEXT: #APP 1860; CHECK-NEXT: nop 1861; CHECK-NEXT: #NO_APP 1862; CHECK-NEXT: vpsubusb {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload 1863; CHECK-NEXT: retq 1864 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"() 1865 %2 = call <16 x i8> @llvm.usub.sat.v16i8(<16 x i8> %a0, <16 x i8> %a1) 1866 ret <16 x i8> %2 1867} 1868declare <16 x i8> @llvm.usub.sat.v16i8(<16 x i8>, <16 x i8>) nounwind readnone 1869 1870define <8 x i16> @stack_fold_psubusw(<8 x i16> %a0, <8 x i16> %a1) { 1871; CHECK-LABEL: stack_fold_psubusw: 1872; CHECK: # %bb.0: 1873; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 1874; CHECK-NEXT: #APP 1875; CHECK-NEXT: nop 1876; CHECK-NEXT: #NO_APP 1877; CHECK-NEXT: vpsubusw {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload 1878; CHECK-NEXT: retq 1879 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"() 1880 %2 = call <8 x i16> @llvm.usub.sat.v8i16(<8 x i16> %a0, <8 x i16> %a1) 1881 ret <8 x i16> %2 1882} 1883declare <8 x i16> @llvm.usub.sat.v8i16(<8 x i16>, <8 x i16>) nounwind readnone 1884 1885define <8 x i16> @stack_fold_psubw(<8 x i16> %a0, <8 x i16> %a1) { 1886; CHECK-LABEL: stack_fold_psubw: 1887; CHECK: # %bb.0: 1888; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 1889; CHECK-NEXT: #APP 1890; CHECK-NEXT: nop 1891; CHECK-NEXT: #NO_APP 1892; CHECK-NEXT: vpsubw {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload 1893; CHECK-NEXT: retq 1894 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"() 1895 %2 = sub <8 x i16> %a0, %a1 1896 ret <8 x i16> %2 1897} 1898 1899define i32 @stack_fold_ptest(<2 x i64> %a0, <2 x i64> %a1) { 1900; CHECK-LABEL: stack_fold_ptest: 1901; CHECK: # %bb.0: 1902; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 1903; CHECK-NEXT: #APP 1904; CHECK-NEXT: nop 1905; CHECK-NEXT: #NO_APP 1906; CHECK-NEXT: xorl %eax, %eax 1907; CHECK-NEXT: vptest {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload 1908; CHECK-NEXT: setb %al 1909; CHECK-NEXT: retq 1910 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"() 1911 %2 = call i32 @llvm.x86.sse41.ptestc(<2 x i64> %a0, <2 x i64> %a1) 1912 ret i32 %2 1913} 1914declare i32 @llvm.x86.sse41.ptestc(<2 x i64>, <2 x i64>) nounwind readnone 1915 1916define i32 @stack_fold_ptest_ymm(<4 x i64> %a0, <4 x i64> %a1) { 1917; CHECK-LABEL: stack_fold_ptest_ymm: 1918; CHECK: # %bb.0: 1919; CHECK-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 1920; CHECK-NEXT: #APP 1921; CHECK-NEXT: nop 1922; CHECK-NEXT: #NO_APP 1923; CHECK-NEXT: xorl %eax, %eax 1924; CHECK-NEXT: vptest {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Folded Reload 1925; CHECK-NEXT: setb %al 1926; CHECK-NEXT: vzeroupper 1927; CHECK-NEXT: retq 1928 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"() 1929 %2 = call i32 @llvm.x86.avx.ptestc.256(<4 x i64> %a0, <4 x i64> %a1) 1930 ret i32 %2 1931} 1932declare i32 @llvm.x86.avx.ptestc.256(<4 x i64>, <4 x i64>) nounwind readnone 1933 1934define <16 x i8> @stack_fold_punpckhbw(<16 x i8> %a0, <16 x i8> %a1) { 1935; CHECK-LABEL: stack_fold_punpckhbw: 1936; CHECK: # %bb.0: 1937; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 1938; CHECK-NEXT: #APP 1939; CHECK-NEXT: nop 1940; CHECK-NEXT: #NO_APP 1941; CHECK-NEXT: vpunpckhbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload 1942; CHECK-NEXT: # xmm0 = xmm0[8],mem[8],xmm0[9],mem[9],xmm0[10],mem[10],xmm0[11],mem[11],xmm0[12],mem[12],xmm0[13],mem[13],xmm0[14],mem[14],xmm0[15],mem[15] 1943; CHECK-NEXT: retq 1944 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"() 1945 %2 = shufflevector <16 x i8> %a0, <16 x i8> %a1, <16 x i32> <i32 8, i32 24, i32 9, i32 25, i32 10, i32 26, i32 11, i32 27, i32 12, i32 28, i32 13, i32 29, i32 14, i32 30, i32 15, i32 31> 1946 ret <16 x i8> %2 1947} 1948 1949define <4 x i32> @stack_fold_punpckhdq(<4 x i32> %a0, <4 x i32> %a1) { 1950; CHECK-LABEL: stack_fold_punpckhdq: 1951; CHECK: # %bb.0: 1952; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 1953; CHECK-NEXT: #APP 1954; CHECK-NEXT: nop 1955; CHECK-NEXT: #NO_APP 1956; CHECK-NEXT: vpunpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload 1957; CHECK-NEXT: # xmm0 = xmm0[2],mem[2],xmm0[3],mem[3] 1958; CHECK-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 1959; CHECK-NEXT: vpsubd %xmm1, %xmm0, %xmm0 1960; CHECK-NEXT: retq 1961 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"() 1962 %2 = shufflevector <4 x i32> %a0, <4 x i32> %a1, <4 x i32> <i32 2, i32 6, i32 3, i32 7> 1963 ; add forces execution domain 1964 %3 = add <4 x i32> %2, <i32 1, i32 1, i32 1, i32 1> 1965 ret <4 x i32> %3 1966} 1967 1968define <2 x i64> @stack_fold_punpckhqdq(<2 x i64> %a0, <2 x i64> %a1) { 1969; CHECK-LABEL: stack_fold_punpckhqdq: 1970; CHECK: # %bb.0: 1971; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 1972; CHECK-NEXT: #APP 1973; CHECK-NEXT: nop 1974; CHECK-NEXT: #NO_APP 1975; CHECK-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload 1976; CHECK-NEXT: # xmm0 = xmm0[1],mem[1] 1977; CHECK-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 1978; CHECK-NEXT: vpsubq %xmm1, %xmm0, %xmm0 1979; CHECK-NEXT: retq 1980 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"() 1981 %2 = shufflevector <2 x i64> %a0, <2 x i64> %a1, <2 x i32> <i32 1, i32 3> 1982 ; add forces execution domain 1983 %3 = add <2 x i64> %2, <i64 1, i64 1> 1984 ret <2 x i64> %3 1985} 1986 1987define <8 x i16> @stack_fold_punpckhwd(<8 x i16> %a0, <8 x i16> %a1) { 1988; CHECK-LABEL: stack_fold_punpckhwd: 1989; CHECK: # %bb.0: 1990; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 1991; CHECK-NEXT: #APP 1992; CHECK-NEXT: nop 1993; CHECK-NEXT: #NO_APP 1994; CHECK-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload 1995; CHECK-NEXT: # xmm0 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] 1996; CHECK-NEXT: retq 1997 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"() 1998 %2 = shufflevector <8 x i16> %a0, <8 x i16> %a1, <8 x i32> <i32 4, i32 12, i32 5, i32 13, i32 6, i32 14, i32 7, i32 15> 1999 ret <8 x i16> %2 2000} 2001 2002define <16 x i8> @stack_fold_punpcklbw(<16 x i8> %a0, <16 x i8> %a1) { 2003; CHECK-LABEL: stack_fold_punpcklbw: 2004; CHECK: # %bb.0: 2005; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 2006; CHECK-NEXT: #APP 2007; CHECK-NEXT: nop 2008; CHECK-NEXT: #NO_APP 2009; CHECK-NEXT: vpunpcklbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload 2010; CHECK-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3],xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] 2011; CHECK-NEXT: retq 2012 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"() 2013 %2 = shufflevector <16 x i8> %a0, <16 x i8> %a1, <16 x i32> <i32 0, i32 16, i32 1, i32 17, i32 2, i32 18, i32 3, i32 19, i32 4, i32 20, i32 5, i32 21, i32 6, i32 22, i32 7, i32 23> 2014 ret <16 x i8> %2 2015} 2016 2017define <4 x i32> @stack_fold_punpckldq(<4 x i32> %a0, <4 x i32> %a1) { 2018; CHECK-LABEL: stack_fold_punpckldq: 2019; CHECK: # %bb.0: 2020; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 2021; CHECK-NEXT: #APP 2022; CHECK-NEXT: nop 2023; CHECK-NEXT: #NO_APP 2024; CHECK-NEXT: vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload 2025; CHECK-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] 2026; CHECK-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 2027; CHECK-NEXT: vpsubd %xmm1, %xmm0, %xmm0 2028; CHECK-NEXT: retq 2029 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"() 2030 %2 = shufflevector <4 x i32> %a0, <4 x i32> %a1, <4 x i32> <i32 0, i32 4, i32 1, i32 5> 2031 ; add forces execution domain 2032 %3 = add <4 x i32> %2, <i32 1, i32 1, i32 1, i32 1> 2033 ret <4 x i32> %3 2034} 2035 2036define <2 x i64> @stack_fold_punpcklqdq(<2 x i64> %a0, <2 x i64> %a1) { 2037; CHECK-LABEL: stack_fold_punpcklqdq: 2038; CHECK: # %bb.0: 2039; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 2040; CHECK-NEXT: #APP 2041; CHECK-NEXT: nop 2042; CHECK-NEXT: #NO_APP 2043; CHECK-NEXT: vpunpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload 2044; CHECK-NEXT: # xmm0 = xmm0[0],mem[0] 2045; CHECK-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 2046; CHECK-NEXT: vpsubq %xmm1, %xmm0, %xmm0 2047; CHECK-NEXT: retq 2048 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"() 2049 %2 = shufflevector <2 x i64> %a0, <2 x i64> %a1, <2 x i32> <i32 0, i32 2> 2050 ; add forces execution domain 2051 %3 = add <2 x i64> %2, <i64 1, i64 1> 2052 ret <2 x i64> %3 2053} 2054 2055define <8 x i16> @stack_fold_punpcklwd(<8 x i16> %a0, <8 x i16> %a1) { 2056; CHECK-LABEL: stack_fold_punpcklwd: 2057; CHECK: # %bb.0: 2058; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 2059; CHECK-NEXT: #APP 2060; CHECK-NEXT: nop 2061; CHECK-NEXT: #NO_APP 2062; CHECK-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload 2063; CHECK-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] 2064; CHECK-NEXT: retq 2065 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"() 2066 %2 = shufflevector <8 x i16> %a0, <8 x i16> %a1, <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11> 2067 ret <8 x i16> %2 2068} 2069 2070define <16 x i8> @stack_fold_pxor(<16 x i8> %a0, <16 x i8> %a1) { 2071; CHECK-LABEL: stack_fold_pxor: 2072; CHECK: # %bb.0: 2073; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 2074; CHECK-NEXT: #APP 2075; CHECK-NEXT: nop 2076; CHECK-NEXT: #NO_APP 2077; CHECK-NEXT: vpxor {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload 2078; CHECK-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 2079; CHECK-NEXT: vpsubb %xmm1, %xmm0, %xmm0 2080; CHECK-NEXT: retq 2081 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"() 2082 %2 = xor <16 x i8> %a0, %a1 2083 ; add forces execution domain 2084 %3 = add <16 x i8> %2, <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1> 2085 ret <16 x i8> %3 2086} 2087