; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -O3 -disable-peephole -mtriple=x86_64-unknown-unknown -mattr=+avx,+aes,+pclmul < %s | FileCheck %s

target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
target triple = "x86_64-unknown-unknown"

; Stack reload folding tests.
;
; By including a nop call with side effects we can force a partial register spill of the
; relevant registers and check that the reload is correctly folded into the instruction.
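;
; For example, in stack_fold_aesdec below the inline asm clobbers xmm2-xmm15,
; forcing %a1 to be spilled to the stack; the CHECK lines then verify that the
; reload is folded into the memory operand of vaesdec rather than going through
; a separate vmovaps.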

define <2 x i64> @stack_fold_aesdec(<2 x i64> %a0, <2 x i64> %a1) {
; CHECK-LABEL: stack_fold_aesdec:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vaesdec {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = call <2 x i64> @llvm.x86.aesni.aesdec(<2 x i64> %a0, <2 x i64> %a1)
  ret <2 x i64> %2
}
declare <2 x i64> @llvm.x86.aesni.aesdec(<2 x i64>, <2 x i64>) nounwind readnone

define <2 x i64> @stack_fold_aesdeclast(<2 x i64> %a0, <2 x i64> %a1) {
; CHECK-LABEL: stack_fold_aesdeclast:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vaesdeclast {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = call <2 x i64> @llvm.x86.aesni.aesdeclast(<2 x i64> %a0, <2 x i64> %a1)
  ret <2 x i64> %2
}
declare <2 x i64> @llvm.x86.aesni.aesdeclast(<2 x i64>, <2 x i64>) nounwind readnone

define <2 x i64> @stack_fold_aesenc(<2 x i64> %a0, <2 x i64> %a1) {
; CHECK-LABEL: stack_fold_aesenc:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vaesenc {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = call <2 x i64> @llvm.x86.aesni.aesenc(<2 x i64> %a0, <2 x i64> %a1)
  ret <2 x i64> %2
}
declare <2 x i64> @llvm.x86.aesni.aesenc(<2 x i64>, <2 x i64>) nounwind readnone

define <2 x i64> @stack_fold_aesenclast(<2 x i64> %a0, <2 x i64> %a1) {
; CHECK-LABEL: stack_fold_aesenclast:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vaesenclast {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = call <2 x i64> @llvm.x86.aesni.aesenclast(<2 x i64> %a0, <2 x i64> %a1)
  ret <2 x i64> %2
}
declare <2 x i64> @llvm.x86.aesni.aesenclast(<2 x i64>, <2 x i64>) nounwind readnone

define <2 x i64> @stack_fold_aesimc(<2 x i64> %a0) {
; CHECK-LABEL: stack_fold_aesimc:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vaesimc {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = call <2 x i64> @llvm.x86.aesni.aesimc(<2 x i64> %a0)
  ret <2 x i64> %2
}
declare <2 x i64> @llvm.x86.aesni.aesimc(<2 x i64>) nounwind readnone

define <2 x i64> @stack_fold_aeskeygenassist(<2 x i64> %a0) {
; CHECK-LABEL: stack_fold_aeskeygenassist:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vaeskeygenassist $7, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = call <2 x i64> @llvm.x86.aesni.aeskeygenassist(<2 x i64> %a0, i8 7)
  ret <2 x i64> %2
}
declare <2 x i64> @llvm.x86.aesni.aeskeygenassist(<2 x i64>, i8) nounwind readnone

define <4 x i32> @stack_fold_movd_load(i32 %a0) {
; CHECK-LABEL: stack_fold_movd_load:
; CHECK:       # %bb.0:
; CHECK-NEXT:    pushq %rbp
; CHECK-NEXT:    .cfi_def_cfa_offset 16
; CHECK-NEXT:    pushq %r15
; CHECK-NEXT:    .cfi_def_cfa_offset 24
; CHECK-NEXT:    pushq %r14
; CHECK-NEXT:    .cfi_def_cfa_offset 32
; CHECK-NEXT:    pushq %r13
; CHECK-NEXT:    .cfi_def_cfa_offset 40
; CHECK-NEXT:    pushq %r12
; CHECK-NEXT:    .cfi_def_cfa_offset 48
; CHECK-NEXT:    pushq %rbx
; CHECK-NEXT:    .cfi_def_cfa_offset 56
; CHECK-NEXT:    .cfi_offset %rbx, -56
; CHECK-NEXT:    .cfi_offset %r12, -48
; CHECK-NEXT:    .cfi_offset %r13, -40
; CHECK-NEXT:    .cfi_offset %r14, -32
; CHECK-NEXT:    .cfi_offset %r15, -24
; CHECK-NEXT:    .cfi_offset %rbp, -16
; CHECK-NEXT:    movl %edi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vmovd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Folded Reload
; CHECK-NEXT:    # xmm0 = mem[0],zero,zero,zero
; CHECK-NEXT:    vpcmpeqd %xmm1, %xmm1, %xmm1
; CHECK-NEXT:    vpsubd %xmm1, %xmm0, %xmm0
; CHECK-NEXT:    popq %rbx
; CHECK-NEXT:    .cfi_def_cfa_offset 48
; CHECK-NEXT:    popq %r12
; CHECK-NEXT:    .cfi_def_cfa_offset 40
; CHECK-NEXT:    popq %r13
; CHECK-NEXT:    .cfi_def_cfa_offset 32
; CHECK-NEXT:    popq %r14
; CHECK-NEXT:    .cfi_def_cfa_offset 24
; CHECK-NEXT:    popq %r15
; CHECK-NEXT:    .cfi_def_cfa_offset 16
; CHECK-NEXT:    popq %rbp
; CHECK-NEXT:    .cfi_def_cfa_offset 8
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{rax},~{rbx},~{rcx},~{rdx},~{rsi},~{rdi},~{rbp},~{r8},~{r9},~{r10},~{r11},~{r12},~{r13},~{r14},~{r15}"()
  %2 = insertelement <4 x i32> zeroinitializer, i32 %a0, i32 0
  ; add forces execution domain
  %3 = add <4 x i32> %2, <i32 1, i32 1, i32 1, i32 1>
  ret <4 x i32> %3
}

define i32 @stack_fold_movd_store(<4 x i32> %a0, <4 x i32> %a1) {
; CHECK-LABEL: stack_fold_movd_store:
; CHECK:       # %bb.0:
; CHECK-NEXT:    pushq %rbp
; CHECK-NEXT:    .cfi_def_cfa_offset 16
; CHECK-NEXT:    pushq %r15
; CHECK-NEXT:    .cfi_def_cfa_offset 24
; CHECK-NEXT:    pushq %r14
; CHECK-NEXT:    .cfi_def_cfa_offset 32
; CHECK-NEXT:    pushq %r13
; CHECK-NEXT:    .cfi_def_cfa_offset 40
; CHECK-NEXT:    pushq %r12
; CHECK-NEXT:    .cfi_def_cfa_offset 48
; CHECK-NEXT:    pushq %rbx
; CHECK-NEXT:    .cfi_def_cfa_offset 56
; CHECK-NEXT:    .cfi_offset %rbx, -56
; CHECK-NEXT:    .cfi_offset %r12, -48
; CHECK-NEXT:    .cfi_offset %r13, -40
; CHECK-NEXT:    .cfi_offset %r14, -32
; CHECK-NEXT:    .cfi_offset %r15, -24
; CHECK-NEXT:    .cfi_offset %rbp, -16
; CHECK-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
; CHECK-NEXT:    vmovd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Folded Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    movl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 4-byte Reload
; CHECK-NEXT:    popq %rbx
; CHECK-NEXT:    .cfi_def_cfa_offset 48
; CHECK-NEXT:    popq %r12
; CHECK-NEXT:    .cfi_def_cfa_offset 40
; CHECK-NEXT:    popq %r13
; CHECK-NEXT:    .cfi_def_cfa_offset 32
; CHECK-NEXT:    popq %r14
; CHECK-NEXT:    .cfi_def_cfa_offset 24
; CHECK-NEXT:    popq %r15
; CHECK-NEXT:    .cfi_def_cfa_offset 16
; CHECK-NEXT:    popq %rbp
; CHECK-NEXT:    .cfi_def_cfa_offset 8
; CHECK-NEXT:    retq
  ; add forces execution domain
  %1 = add <4 x i32> %a0, %a1
  %2 = extractelement <4 x i32> %1, i32 0
  %3 = tail call <2 x i64> asm sideeffect "nop", "=x,~{rax},~{rbx},~{rcx},~{rdx},~{rsi},~{rdi},~{rbp},~{r8},~{r9},~{r10},~{r11},~{r12},~{r13},~{r14},~{r15}"()
  ret i32 %2
}

define <2 x i64> @stack_fold_movq_load(<2 x i64> %a0) {
; CHECK-LABEL: stack_fold_movq_load:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vmovq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; CHECK-NEXT:    # xmm0 = mem[0],zero
; CHECK-NEXT:    vpcmpeqd %xmm1, %xmm1, %xmm1
; CHECK-NEXT:    vpsubq %xmm1, %xmm0, %xmm0
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = shufflevector <2 x i64> %a0, <2 x i64> zeroinitializer, <2 x i32> <i32 0, i32 2>
  ; add forces execution domain
  %3 = add <2 x i64> %2, <i64 1, i64 1>
  ret <2 x i64> %3
}

define i64 @stack_fold_movq_store(<2 x i64> %a0, <2 x i64> %a1) {
; CHECK-LABEL: stack_fold_movq_store:
; CHECK:       # %bb.0:
; CHECK-NEXT:    pushq %rbp
; CHECK-NEXT:    .cfi_def_cfa_offset 16
; CHECK-NEXT:    pushq %r15
; CHECK-NEXT:    .cfi_def_cfa_offset 24
; CHECK-NEXT:    pushq %r14
; CHECK-NEXT:    .cfi_def_cfa_offset 32
; CHECK-NEXT:    pushq %r13
; CHECK-NEXT:    .cfi_def_cfa_offset 40
; CHECK-NEXT:    pushq %r12
; CHECK-NEXT:    .cfi_def_cfa_offset 48
; CHECK-NEXT:    pushq %rbx
; CHECK-NEXT:    .cfi_def_cfa_offset 56
; CHECK-NEXT:    .cfi_offset %rbx, -56
; CHECK-NEXT:    .cfi_offset %r12, -48
; CHECK-NEXT:    .cfi_offset %r13, -40
; CHECK-NEXT:    .cfi_offset %r14, -32
; CHECK-NEXT:    .cfi_offset %r15, -24
; CHECK-NEXT:    .cfi_offset %rbp, -16
; CHECK-NEXT:    vpaddq %xmm1, %xmm0, %xmm0
; CHECK-NEXT:    vmovq %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
; CHECK-NEXT:    popq %rbx
; CHECK-NEXT:    .cfi_def_cfa_offset 48
; CHECK-NEXT:    popq %r12
; CHECK-NEXT:    .cfi_def_cfa_offset 40
; CHECK-NEXT:    popq %r13
; CHECK-NEXT:    .cfi_def_cfa_offset 32
; CHECK-NEXT:    popq %r14
; CHECK-NEXT:    .cfi_def_cfa_offset 24
; CHECK-NEXT:    popq %r15
; CHECK-NEXT:    .cfi_def_cfa_offset 16
; CHECK-NEXT:    popq %rbp
; CHECK-NEXT:    .cfi_def_cfa_offset 8
; CHECK-NEXT:    retq
  ; add forces execution domain
  %1 = add <2 x i64> %a0, %a1
  %2 = extractelement <2 x i64> %1, i32 0
  %3 = tail call <2 x i64> asm sideeffect "nop", "=x,~{rax},~{rbx},~{rcx},~{rdx},~{rsi},~{rdi},~{rbp},~{r8},~{r9},~{r10},~{r11},~{r12},~{r13},~{r14},~{r15}"()
  ret i64 %2
}

define <8 x i16> @stack_fold_mpsadbw(<16 x i8> %a0, <16 x i8> %a1) {
; CHECK-LABEL: stack_fold_mpsadbw:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vmpsadbw $7, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = call <8 x i16> @llvm.x86.sse41.mpsadbw(<16 x i8> %a0, <16 x i8> %a1, i8 7)
  ret <8 x i16> %2
}
declare <8 x i16> @llvm.x86.sse41.mpsadbw(<16 x i8>, <16 x i8>, i8) nounwind readnone

define <16 x i8> @stack_fold_pabsb(<16 x i8> %a0) {
; CHECK-LABEL: stack_fold_pabsb:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vpabsb {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = icmp sgt <16 x i8> %a0, zeroinitializer
  %3 = sub <16 x i8> zeroinitializer, %a0
  %4 = select <16 x i1> %2, <16 x i8> %a0, <16 x i8> %3
  ret <16 x i8> %4
}

define <4 x i32> @stack_fold_pabsd(<4 x i32> %a0) {
; CHECK-LABEL: stack_fold_pabsd:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vpabsd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = icmp sgt <4 x i32> %a0, zeroinitializer
  %3 = sub <4 x i32> zeroinitializer, %a0
  %4 = select <4 x i1> %2, <4 x i32> %a0, <4 x i32> %3
  ret <4 x i32> %4
}

define <8 x i16> @stack_fold_pabsw(<8 x i16> %a0) {
; CHECK-LABEL: stack_fold_pabsw:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vpabsw {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = icmp sgt <8 x i16> %a0, zeroinitializer
  %3 = sub <8 x i16> zeroinitializer, %a0
  %4 = select <8 x i1> %2, <8 x i16> %a0, <8 x i16> %3
  ret <8 x i16> %4
}

define <8 x i16> @stack_fold_packssdw(<4 x i32> %a0, <4 x i32> %a1) {
; CHECK-LABEL: stack_fold_packssdw:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vpackssdw {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = call <8 x i16> @llvm.x86.sse2.packssdw.128(<4 x i32> %a0, <4 x i32> %a1)
  ret <8 x i16> %2
}
declare <8 x i16> @llvm.x86.sse2.packssdw.128(<4 x i32>, <4 x i32>) nounwind readnone

define <16 x i8> @stack_fold_packsswb(<8 x i16> %a0, <8 x i16> %a1) {
; CHECK-LABEL: stack_fold_packsswb:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vpacksswb {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = call <16 x i8> @llvm.x86.sse2.packsswb.128(<8 x i16> %a0, <8 x i16> %a1)
  ret <16 x i8> %2
}
declare <16 x i8> @llvm.x86.sse2.packsswb.128(<8 x i16>, <8 x i16>) nounwind readnone

define <8 x i16> @stack_fold_packusdw(<4 x i32> %a0, <4 x i32> %a1) {
; CHECK-LABEL: stack_fold_packusdw:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vpackusdw {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = call <8 x i16> @llvm.x86.sse41.packusdw(<4 x i32> %a0, <4 x i32> %a1)
  ret <8 x i16> %2
}
declare <8 x i16> @llvm.x86.sse41.packusdw(<4 x i32>, <4 x i32>) nounwind readnone

define <16 x i8> @stack_fold_packuswb(<8 x i16> %a0, <8 x i16> %a1) {
; CHECK-LABEL: stack_fold_packuswb:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vpackuswb {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = call <16 x i8> @llvm.x86.sse2.packuswb.128(<8 x i16> %a0, <8 x i16> %a1)
  ret <16 x i8> %2
}
declare <16 x i8> @llvm.x86.sse2.packuswb.128(<8 x i16>, <8 x i16>) nounwind readnone

define <16 x i8> @stack_fold_paddb(<16 x i8> %a0, <16 x i8> %a1) {
; CHECK-LABEL: stack_fold_paddb:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vpaddb {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = add <16 x i8> %a0, %a1
  ret <16 x i8> %2
}

define <4 x i32> @stack_fold_paddd(<4 x i32> %a0, <4 x i32> %a1) {
; CHECK-LABEL: stack_fold_paddd:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vpaddd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = add <4 x i32> %a0, %a1
  ret <4 x i32> %2
}

define <2 x i64> @stack_fold_paddq(<2 x i64> %a0, <2 x i64> %a1) {
; CHECK-LABEL: stack_fold_paddq:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vpaddq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = add <2 x i64> %a0, %a1
  ret <2 x i64> %2
}

define <16 x i8> @stack_fold_paddsb(<16 x i8> %a0, <16 x i8> %a1) {
; CHECK-LABEL: stack_fold_paddsb:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vpaddsb {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = call <16 x i8> @llvm.sadd.sat.v16i8(<16 x i8> %a0, <16 x i8> %a1)
  ret <16 x i8> %2
}
declare <16 x i8> @llvm.sadd.sat.v16i8(<16 x i8>, <16 x i8>) nounwind readnone

define <8 x i16> @stack_fold_paddsw(<8 x i16> %a0, <8 x i16> %a1) {
; CHECK-LABEL: stack_fold_paddsw:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vpaddsw {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = call <8 x i16> @llvm.sadd.sat.v8i16(<8 x i16> %a0, <8 x i16> %a1)
  ret <8 x i16> %2
}
declare <8 x i16> @llvm.sadd.sat.v8i16(<8 x i16>, <8 x i16>) nounwind readnone

define <16 x i8> @stack_fold_paddusb(<16 x i8> %a0, <16 x i8> %a1) {
; CHECK-LABEL: stack_fold_paddusb:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vpaddusb {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = call <16 x i8> @llvm.uadd.sat.v16i8(<16 x i8> %a0, <16 x i8> %a1)
  ret <16 x i8> %2
}
declare <16 x i8> @llvm.uadd.sat.v16i8(<16 x i8>, <16 x i8>) nounwind readnone

define <8 x i16> @stack_fold_paddusw(<8 x i16> %a0, <8 x i16> %a1) {
; CHECK-LABEL: stack_fold_paddusw:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vpaddusw {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = call <8 x i16> @llvm.uadd.sat.v8i16(<8 x i16> %a0, <8 x i16> %a1)
  ret <8 x i16> %2
}
declare <8 x i16> @llvm.uadd.sat.v8i16(<8 x i16>, <8 x i16>) nounwind readnone

define <8 x i16> @stack_fold_paddw(<8 x i16> %a0, <8 x i16> %a1) {
; CHECK-LABEL: stack_fold_paddw:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vpaddw {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = add <8 x i16> %a0, %a1
  ret <8 x i16> %2
}

define <16 x i8> @stack_fold_palignr(<16 x i8> %a0, <16 x i8> %a1) {
; CHECK-LABEL: stack_fold_palignr:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vpalignr $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
; CHECK-NEXT:    # xmm0 = mem[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm0[0]
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = shufflevector <16 x i8> %a1, <16 x i8> %a0, <16 x i32> <i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16>
  ret <16 x i8> %2
}

define <16 x i8> @stack_fold_pand(<16 x i8> %a0, <16 x i8> %a1) {
; CHECK-LABEL: stack_fold_pand:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vpand {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
; CHECK-NEXT:    vpcmpeqd %xmm1, %xmm1, %xmm1
; CHECK-NEXT:    vpsubb %xmm1, %xmm0, %xmm0
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = and <16 x i8> %a0, %a1
  ; add forces execution domain
  %3 = add <16 x i8> %2, <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
  ret <16 x i8> %3
}

define <16 x i8> @stack_fold_pandn(<16 x i8> %a0, <16 x i8> %a1) {
; CHECK-LABEL: stack_fold_pandn:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vpandn {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
; CHECK-NEXT:    vpcmpeqd %xmm1, %xmm1, %xmm1
; CHECK-NEXT:    vpsubb %xmm1, %xmm0, %xmm0
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = xor <16 x i8> %a0, <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>
  %3 = and <16 x i8> %2, %a1
  ; add forces execution domain
  %4 = add <16 x i8> %3, <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
  ret <16 x i8> %4
}

define <16 x i8> @stack_fold_pavgb(<16 x i8> %a0, <16 x i8> %a1) {
; CHECK-LABEL: stack_fold_pavgb:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vpavgb {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = zext <16 x i8> %a0 to <16 x i16>
  %3 = zext <16 x i8> %a1 to <16 x i16>
  %4 = add <16 x i16> %2, %3
  %5 = add <16 x i16> %4, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
  %6 = lshr <16 x i16> %5, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
  %7 = trunc <16 x i16> %6 to <16 x i8>
  ret <16 x i8> %7
}

define <8 x i16> @stack_fold_pavgw(<8 x i16> %a0, <8 x i16> %a1) {
; CHECK-LABEL: stack_fold_pavgw:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vpavgw {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = zext <8 x i16> %a0 to <8 x i32>
  %3 = zext <8 x i16> %a1 to <8 x i32>
  %4 = add <8 x i32> %2, %3
  %5 = add <8 x i32> %4, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
  %6 = lshr <8 x i32> %5, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
  %7 = trunc <8 x i32> %6 to <8 x i16>
  ret <8 x i16> %7
}

define <16 x i8> @stack_fold_pblendvb(<16 x i8> %a0, <16 x i8> %a1, <16 x i8> %c) {
; CHECK-LABEL: stack_fold_pblendvb:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vpblendvb %xmm0, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 # 16-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = call <16 x i8> @llvm.x86.sse41.pblendvb(<16 x i8> %a1, <16 x i8> %c, <16 x i8> %a0)
  ret <16 x i8> %2
}
declare <16 x i8> @llvm.x86.sse41.pblendvb(<16 x i8>, <16 x i8>, <16 x i8>) nounwind readnone

define <8 x i16> @stack_fold_pblendw(<8 x i16> %a0, <8 x i16> %a1) {
; CHECK-LABEL: stack_fold_pblendw:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vpblendw $7, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
; CHECK-NEXT:    # xmm0 = mem[0,1,2],xmm0[3,4,5,6,7]
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = shufflevector <8 x i16> %a0, <8 x i16> %a1, <8 x i32> <i32 8, i32 9, i32 10, i32 3, i32 4, i32 5, i32 6, i32 7>
  ret <8 x i16> %2
}

define <2 x i64> @stack_fold_pclmulqdq(<2 x i64> %a0, <2 x i64> %a1) {
; CHECK-LABEL: stack_fold_pclmulqdq:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vpclmulqdq $0, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = call <2 x i64> @llvm.x86.pclmulqdq(<2 x i64> %a0, <2 x i64> %a1, i8 0)
  ret <2 x i64> %2
}
declare <2 x i64> @llvm.x86.pclmulqdq(<2 x i64>, <2 x i64>, i8) nounwind readnone

define <16 x i8> @stack_fold_pcmpeqb(<16 x i8> %a0, <16 x i8> %a1) {
; CHECK-LABEL: stack_fold_pcmpeqb:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vpcmpeqb {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = icmp eq <16 x i8> %a0, %a1
  %3 = sext <16 x i1> %2 to <16 x i8>
  ret <16 x i8> %3
}

define <4 x i32> @stack_fold_pcmpeqd(<4 x i32> %a0, <4 x i32> %a1) {
; CHECK-LABEL: stack_fold_pcmpeqd:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vpcmpeqd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = icmp eq <4 x i32> %a0, %a1
  %3 = sext <4 x i1> %2 to <4 x i32>
  ret <4 x i32> %3
}

define <2 x i64> @stack_fold_pcmpeqq(<2 x i64> %a0, <2 x i64> %a1) {
; CHECK-LABEL: stack_fold_pcmpeqq:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vpcmpeqq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = icmp eq <2 x i64> %a0, %a1
  %3 = sext <2 x i1> %2 to <2 x i64>
  ret <2 x i64> %3
}

define <8 x i16> @stack_fold_pcmpeqw(<8 x i16> %a0, <8 x i16> %a1) {
; CHECK-LABEL: stack_fold_pcmpeqw:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vpcmpeqw {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = icmp eq <8 x i16> %a0, %a1
  %3 = sext <8 x i1> %2 to <8 x i16>
  ret <8 x i16> %3
}

define i32 @stack_fold_pcmpestri(<16 x i8> %a0, <16 x i8> %a1) {
; CHECK-LABEL: stack_fold_pcmpestri:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    movl $7, %eax
; CHECK-NEXT:    movl $7, %edx
; CHECK-NEXT:    vpcmpestri $7, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; CHECK-NEXT:    movl %ecx, %eax
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{rax},~{flags}"()
  %2 = call i32 @llvm.x86.sse42.pcmpestri128(<16 x i8> %a0, i32 7, <16 x i8> %a1, i32 7, i8 7)
  ret i32 %2
}
declare i32 @llvm.x86.sse42.pcmpestri128(<16 x i8>, i32, <16 x i8>, i32, i8) nounwind readnone

define <16 x i8> @stack_fold_pcmpestrm(<16 x i8> %a0, <16 x i8> %a1) {
; CHECK-LABEL: stack_fold_pcmpestrm:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    movl $7, %eax
; CHECK-NEXT:    movl $7, %edx
; CHECK-NEXT:    vpcmpestrm $7, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{rax},~{flags}"()
  %2 = call <16 x i8> @llvm.x86.sse42.pcmpestrm128(<16 x i8> %a0, i32 7, <16 x i8> %a1, i32 7, i8 7)
  ret <16 x i8> %2
}
declare <16 x i8> @llvm.x86.sse42.pcmpestrm128(<16 x i8>, i32, <16 x i8>, i32, i8) nounwind readnone

define <16 x i8> @stack_fold_pcmpgtb(<16 x i8> %a0, <16 x i8> %a1) {
; CHECK-LABEL: stack_fold_pcmpgtb:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vpcmpgtb {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = icmp sgt <16 x i8> %a0, %a1
  %3 = sext <16 x i1> %2 to <16 x i8>
  ret <16 x i8> %3
}

define <4 x i32> @stack_fold_pcmpgtd(<4 x i32> %a0, <4 x i32> %a1) {
; CHECK-LABEL: stack_fold_pcmpgtd:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vpcmpgtd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = icmp sgt <4 x i32> %a0, %a1
  %3 = sext <4 x i1> %2 to <4 x i32>
  ret <4 x i32> %3
}

define <2 x i64> @stack_fold_pcmpgtq(<2 x i64> %a0, <2 x i64> %a1) {
; CHECK-LABEL: stack_fold_pcmpgtq:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vpcmpgtq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = icmp sgt <2 x i64> %a0, %a1
  %3 = sext <2 x i1> %2 to <2 x i64>
  ret <2 x i64> %3
}

define <8 x i16> @stack_fold_pcmpgtw(<8 x i16> %a0, <8 x i16> %a1) {
; CHECK-LABEL: stack_fold_pcmpgtw:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vpcmpgtw {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = icmp sgt <8 x i16> %a0, %a1
  %3 = sext <8 x i1> %2 to <8 x i16>
  ret <8 x i16> %3
}

define i32 @stack_fold_pcmpistri(<16 x i8> %a0, <16 x i8> %a1) {
; CHECK-LABEL: stack_fold_pcmpistri:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vpcmpistri $7, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; CHECK-NEXT:    movl %ecx, %eax
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = call i32 @llvm.x86.sse42.pcmpistri128(<16 x i8> %a0, <16 x i8> %a1, i8 7)
  ret i32 %2
}
declare i32 @llvm.x86.sse42.pcmpistri128(<16 x i8>, <16 x i8>, i8) nounwind readnone

define <16 x i8> @stack_fold_pcmpistrm(<16 x i8> %a0, <16 x i8> %a1) {
; CHECK-LABEL: stack_fold_pcmpistrm:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vpcmpistrm $7, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = call <16 x i8> @llvm.x86.sse42.pcmpistrm128(<16 x i8> %a0, <16 x i8> %a1, i8 7)
  ret <16 x i8> %2
}
declare <16 x i8> @llvm.x86.sse42.pcmpistrm128(<16 x i8>, <16 x i8>, i8) nounwind readnone

; TODO stack_fold_pextrb

define i32 @stack_fold_pextrd(<4 x i32> %a0, <4 x i32> %a1) {
; CHECK-LABEL: stack_fold_pextrd:
; CHECK:       # %bb.0:
; CHECK-NEXT:    pushq %rbp
; CHECK-NEXT:    .cfi_def_cfa_offset 16
; CHECK-NEXT:    pushq %r15
; CHECK-NEXT:    .cfi_def_cfa_offset 24
; CHECK-NEXT:    pushq %r14
; CHECK-NEXT:    .cfi_def_cfa_offset 32
; CHECK-NEXT:    pushq %r13
; CHECK-NEXT:    .cfi_def_cfa_offset 40
; CHECK-NEXT:    pushq %r12
; CHECK-NEXT:    .cfi_def_cfa_offset 48
; CHECK-NEXT:    pushq %rbx
; CHECK-NEXT:    .cfi_def_cfa_offset 56
; CHECK-NEXT:    .cfi_offset %rbx, -56
; CHECK-NEXT:    .cfi_offset %r12, -48
; CHECK-NEXT:    .cfi_offset %r13, -40
; CHECK-NEXT:    .cfi_offset %r14, -32
; CHECK-NEXT:    .cfi_offset %r15, -24
; CHECK-NEXT:    .cfi_offset %rbp, -16
; CHECK-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
; CHECK-NEXT:    vpextrd $1, %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Folded Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    movl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 4-byte Reload
; CHECK-NEXT:    popq %rbx
; CHECK-NEXT:    .cfi_def_cfa_offset 48
; CHECK-NEXT:    popq %r12
; CHECK-NEXT:    .cfi_def_cfa_offset 40
; CHECK-NEXT:    popq %r13
; CHECK-NEXT:    .cfi_def_cfa_offset 32
; CHECK-NEXT:    popq %r14
; CHECK-NEXT:    .cfi_def_cfa_offset 24
; CHECK-NEXT:    popq %r15
; CHECK-NEXT:    .cfi_def_cfa_offset 16
; CHECK-NEXT:    popq %rbp
; CHECK-NEXT:    .cfi_def_cfa_offset 8
; CHECK-NEXT:    retq
  ; add forces execution domain
  %1 = add <4 x i32> %a0, %a1
  %2 = extractelement <4 x i32> %1, i32 1
  %3 = tail call <2 x i64> asm sideeffect "nop", "=x,~{rax},~{rbx},~{rcx},~{rdx},~{rsi},~{rdi},~{rbp},~{r8},~{r9},~{r10},~{r11},~{r12},~{r13},~{r14},~{r15}"()
  ret i32 %2
}

define i64 @stack_fold_pextrq(<2 x i64> %a0) {
; CHECK-LABEL: stack_fold_pextrq:
; CHECK:       # %bb.0:
; CHECK-NEXT:    pushq %rbp
; CHECK-NEXT:    .cfi_def_cfa_offset 16
; CHECK-NEXT:    pushq %r15
; CHECK-NEXT:    .cfi_def_cfa_offset 24
; CHECK-NEXT:    pushq %r14
; CHECK-NEXT:    .cfi_def_cfa_offset 32
; CHECK-NEXT:    pushq %r13
; CHECK-NEXT:    .cfi_def_cfa_offset 40
; CHECK-NEXT:    pushq %r12
; CHECK-NEXT:    .cfi_def_cfa_offset 48
; CHECK-NEXT:    pushq %rbx
; CHECK-NEXT:    .cfi_def_cfa_offset 56
; CHECK-NEXT:    .cfi_offset %rbx, -56
; CHECK-NEXT:    .cfi_offset %r12, -48
; CHECK-NEXT:    .cfi_offset %r13, -40
; CHECK-NEXT:    .cfi_offset %r14, -32
; CHECK-NEXT:    .cfi_offset %r15, -24
; CHECK-NEXT:    .cfi_offset %rbp, -16
; CHECK-NEXT:    vpextrq $1, %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
; CHECK-NEXT:    popq %rbx
; CHECK-NEXT:    .cfi_def_cfa_offset 48
; CHECK-NEXT:    popq %r12
; CHECK-NEXT:    .cfi_def_cfa_offset 40
; CHECK-NEXT:    popq %r13
; CHECK-NEXT:    .cfi_def_cfa_offset 32
; CHECK-NEXT:    popq %r14
; CHECK-NEXT:    .cfi_def_cfa_offset 24
; CHECK-NEXT:    popq %r15
; CHECK-NEXT:    .cfi_def_cfa_offset 16
; CHECK-NEXT:    popq %rbp
; CHECK-NEXT:    .cfi_def_cfa_offset 8
; CHECK-NEXT:    retq
  %1 = extractelement <2 x i64> %a0, i32 1
  %2 = tail call <2 x i64> asm sideeffect "nop", "=x,~{rax},~{rbx},~{rcx},~{rdx},~{rsi},~{rdi},~{rbp},~{r8},~{r9},~{r10},~{r11},~{r12},~{r13},~{r14},~{r15}"()
  ret i64 %1
}

; TODO stack_fold_pextrw

define <4 x i32> @stack_fold_phaddd(<4 x i32> %a0, <4 x i32> %a1) {
; CHECK-LABEL: stack_fold_phaddd:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vphaddd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = call <4 x i32> @llvm.x86.ssse3.phadd.d.128(<4 x i32> %a0, <4 x i32> %a1)
  ret <4 x i32> %2
}
declare <4 x i32> @llvm.x86.ssse3.phadd.d.128(<4 x i32>, <4 x i32>) nounwind readnone

define <8 x i16> @stack_fold_phaddsw(<8 x i16> %a0, <8 x i16> %a1) {
; CHECK-LABEL: stack_fold_phaddsw:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vphaddsw {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = call <8 x i16> @llvm.x86.ssse3.phadd.sw.128(<8 x i16> %a0, <8 x i16> %a1)
  ret <8 x i16> %2
}
declare <8 x i16> @llvm.x86.ssse3.phadd.sw.128(<8 x i16>, <8 x i16>) nounwind readnone

define <8 x i16> @stack_fold_phaddw(<8 x i16> %a0, <8 x i16> %a1) {
; CHECK-LABEL: stack_fold_phaddw:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vphaddw {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = call <8 x i16> @llvm.x86.ssse3.phadd.w.128(<8 x i16> %a0, <8 x i16> %a1)
  ret <8 x i16> %2
}
declare <8 x i16> @llvm.x86.ssse3.phadd.w.128(<8 x i16>, <8 x i16>) nounwind readnone

define <8 x i16> @stack_fold_phminposuw(<8 x i16> %a0) {
; CHECK-LABEL: stack_fold_phminposuw:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vphminposuw {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = call <8 x i16> @llvm.x86.sse41.phminposuw(<8 x i16> %a0)
  ret <8 x i16> %2
}
declare <8 x i16> @llvm.x86.sse41.phminposuw(<8 x i16>) nounwind readnone

define <4 x i32> @stack_fold_phsubd(<4 x i32> %a0, <4 x i32> %a1) {
; CHECK-LABEL: stack_fold_phsubd:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vphsubd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = call <4 x i32> @llvm.x86.ssse3.phsub.d.128(<4 x i32> %a0, <4 x i32> %a1)
  ret <4 x i32> %2
}
declare <4 x i32> @llvm.x86.ssse3.phsub.d.128(<4 x i32>, <4 x i32>) nounwind readnone

define <8 x i16> @stack_fold_phsubsw(<8 x i16> %a0, <8 x i16> %a1) {
; CHECK-LABEL: stack_fold_phsubsw:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vphsubsw {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = call <8 x i16> @llvm.x86.ssse3.phsub.sw.128(<8 x i16> %a0, <8 x i16> %a1)
  ret <8 x i16> %2
}
declare <8 x i16> @llvm.x86.ssse3.phsub.sw.128(<8 x i16>, <8 x i16>) nounwind readnone

define <8 x i16> @stack_fold_phsubw(<8 x i16> %a0, <8 x i16> %a1) {
; CHECK-LABEL: stack_fold_phsubw:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vphsubw {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = call <8 x i16> @llvm.x86.ssse3.phsub.w.128(<8 x i16> %a0, <8 x i16> %a1)
  ret <8 x i16> %2
}
declare <8 x i16> @llvm.x86.ssse3.phsub.w.128(<8 x i16>, <8 x i16>) nounwind readnone

define <16 x i8> @stack_fold_pinsrb(<16 x i8> %a0, i8 %a1) {
; CHECK-LABEL: stack_fold_pinsrb:
; CHECK:       # %bb.0:
; CHECK-NEXT:    pushq %rbp
; CHECK-NEXT:    .cfi_def_cfa_offset 16
; CHECK-NEXT:    pushq %r15
; CHECK-NEXT:    .cfi_def_cfa_offset 24
; CHECK-NEXT:    pushq %r14
; CHECK-NEXT:    .cfi_def_cfa_offset 32
; CHECK-NEXT:    pushq %r13
; CHECK-NEXT:    .cfi_def_cfa_offset 40
; CHECK-NEXT:    pushq %r12
; CHECK-NEXT:    .cfi_def_cfa_offset 48
; CHECK-NEXT:    pushq %rbx
; CHECK-NEXT:    .cfi_def_cfa_offset 56
; CHECK-NEXT:    .cfi_offset %rbx, -56
; CHECK-NEXT:    .cfi_offset %r12, -48
; CHECK-NEXT:    .cfi_offset %r13, -40
; CHECK-NEXT:    .cfi_offset %r14, -32
; CHECK-NEXT:    .cfi_offset %r15, -24
; CHECK-NEXT:    .cfi_offset %rbp, -16
; CHECK-NEXT:    movl %edi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vpinsrb $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 4-byte Folded Reload
; CHECK-NEXT:    popq %rbx
; CHECK-NEXT:    .cfi_def_cfa_offset 48
; CHECK-NEXT:    popq %r12
; CHECK-NEXT:    .cfi_def_cfa_offset 40
; CHECK-NEXT:    popq %r13
; CHECK-NEXT:    .cfi_def_cfa_offset 32
; CHECK-NEXT:    popq %r14
; CHECK-NEXT:    .cfi_def_cfa_offset 24
; CHECK-NEXT:    popq %r15
; CHECK-NEXT:    .cfi_def_cfa_offset 16
; CHECK-NEXT:    popq %rbp
; CHECK-NEXT:    .cfi_def_cfa_offset 8
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{rax},~{rbx},~{rcx},~{rdx},~{rsi},~{rdi},~{rbp},~{r8},~{r9},~{r10},~{r11},~{r12},~{r13},~{r14},~{r15}"()
  %2 = insertelement <16 x i8> %a0, i8 %a1, i32 1
  ret <16 x i8> %2
}

define <4 x i32> @stack_fold_pinsrd(<4 x i32> %a0, i32 %a1) {
; CHECK-LABEL: stack_fold_pinsrd:
; CHECK:       # %bb.0:
; CHECK-NEXT:    pushq %rbp
; CHECK-NEXT:    .cfi_def_cfa_offset 16
; CHECK-NEXT:    pushq %r15
; CHECK-NEXT:    .cfi_def_cfa_offset 24
; CHECK-NEXT:    pushq %r14
; CHECK-NEXT:    .cfi_def_cfa_offset 32
; CHECK-NEXT:    pushq %r13
; CHECK-NEXT:    .cfi_def_cfa_offset 40
; CHECK-NEXT:    pushq %r12
; CHECK-NEXT:    .cfi_def_cfa_offset 48
; CHECK-NEXT:    pushq %rbx
; CHECK-NEXT:    .cfi_def_cfa_offset 56
; CHECK-NEXT:    .cfi_offset %rbx, -56
; CHECK-NEXT:    .cfi_offset %r12, -48
; CHECK-NEXT:    .cfi_offset %r13, -40
; CHECK-NEXT:    .cfi_offset %r14, -32
; CHECK-NEXT:    .cfi_offset %r15, -24
; CHECK-NEXT:    .cfi_offset %rbp, -16
; CHECK-NEXT:    movl %edi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vpinsrd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 4-byte Folded Reload
; CHECK-NEXT:    popq %rbx
; CHECK-NEXT:    .cfi_def_cfa_offset 48
; CHECK-NEXT:    popq %r12
; CHECK-NEXT:    .cfi_def_cfa_offset 40
; CHECK-NEXT:    popq %r13
; CHECK-NEXT:    .cfi_def_cfa_offset 32
; CHECK-NEXT:    popq %r14
; CHECK-NEXT:    .cfi_def_cfa_offset 24
; CHECK-NEXT:    popq %r15
; CHECK-NEXT:    .cfi_def_cfa_offset 16
; CHECK-NEXT:    popq %rbp
; CHECK-NEXT:    .cfi_def_cfa_offset 8
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{rax},~{rbx},~{rcx},~{rdx},~{rsi},~{rdi},~{rbp},~{r8},~{r9},~{r10},~{r11},~{r12},~{r13},~{r14},~{r15}"()
  %2 = insertelement <4 x i32> %a0, i32 %a1, i32 1
  ret <4 x i32> %2
}

define <2 x i64> @stack_fold_pinsrq(<2 x i64> %a0, i64 %a1) {
; CHECK-LABEL: stack_fold_pinsrq:
; CHECK:       # %bb.0:
; CHECK-NEXT:    pushq %rbp
; CHECK-NEXT:    .cfi_def_cfa_offset 16
; CHECK-NEXT:    pushq %r15
; CHECK-NEXT:    .cfi_def_cfa_offset 24
; CHECK-NEXT:    pushq %r14
; CHECK-NEXT:    .cfi_def_cfa_offset 32
; CHECK-NEXT:    pushq %r13
; CHECK-NEXT:    .cfi_def_cfa_offset 40
; CHECK-NEXT:    pushq %r12
; CHECK-NEXT:    .cfi_def_cfa_offset 48
; CHECK-NEXT:    pushq %rbx
; CHECK-NEXT:    .cfi_def_cfa_offset 56
; CHECK-NEXT:    .cfi_offset %rbx, -56
; CHECK-NEXT:    .cfi_offset %r12, -48
; CHECK-NEXT:    .cfi_offset %r13, -40
; CHECK-NEXT:    .cfi_offset %r14, -32
; CHECK-NEXT:    .cfi_offset %r15, -24
; CHECK-NEXT:    .cfi_offset %rbp, -16
; CHECK-NEXT:    movq %rdi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vpinsrq $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 8-byte Folded Reload
; CHECK-NEXT:    popq %rbx
; CHECK-NEXT:    .cfi_def_cfa_offset 48
; CHECK-NEXT:    popq %r12
; CHECK-NEXT:    .cfi_def_cfa_offset 40
; CHECK-NEXT:    popq %r13
; CHECK-NEXT:    .cfi_def_cfa_offset 32
; CHECK-NEXT:    popq %r14
; CHECK-NEXT:    .cfi_def_cfa_offset 24
; CHECK-NEXT:    popq %r15
; CHECK-NEXT:    .cfi_def_cfa_offset 16
; CHECK-NEXT:    popq %rbp
; CHECK-NEXT:    .cfi_def_cfa_offset 8
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{rax},~{rbx},~{rcx},~{rdx},~{rsi},~{rdi},~{rbp},~{r8},~{r9},~{r10},~{r11},~{r12},~{r13},~{r14},~{r15}"()
  %2 = insertelement <2 x i64> %a0, i64 %a1, i32 1
  ret <2 x i64> %2
}

define <8 x i16> @stack_fold_pinsrw(<8 x i16> %a0, i16 %a1) {
; CHECK-LABEL: stack_fold_pinsrw:
; CHECK:       # %bb.0:
; CHECK-NEXT:    pushq %rbp
; CHECK-NEXT:    .cfi_def_cfa_offset 16
; CHECK-NEXT:    pushq %r15
; CHECK-NEXT:    .cfi_def_cfa_offset 24
; CHECK-NEXT:    pushq %r14
; CHECK-NEXT:    .cfi_def_cfa_offset 32
; CHECK-NEXT:    pushq %r13
; CHECK-NEXT:    .cfi_def_cfa_offset 40
; CHECK-NEXT:    pushq %r12
; CHECK-NEXT:    .cfi_def_cfa_offset 48
; CHECK-NEXT:    pushq %rbx
; CHECK-NEXT:    .cfi_def_cfa_offset 56
; CHECK-NEXT:    .cfi_offset %rbx, -56
; CHECK-NEXT:    .cfi_offset %r12, -48
; CHECK-NEXT:    .cfi_offset %r13, -40
; CHECK-NEXT:    .cfi_offset %r14, -32
; CHECK-NEXT:    .cfi_offset %r15, -24
; CHECK-NEXT:    .cfi_offset %rbp, -16
; CHECK-NEXT:    movl %edi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vpinsrw $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 4-byte Folded Reload
; CHECK-NEXT:    popq %rbx
; CHECK-NEXT:    .cfi_def_cfa_offset 48
; CHECK-NEXT:    popq %r12
; CHECK-NEXT:    .cfi_def_cfa_offset 40
; CHECK-NEXT:    popq %r13
; CHECK-NEXT:    .cfi_def_cfa_offset 32
; CHECK-NEXT:    popq %r14
; CHECK-NEXT:    .cfi_def_cfa_offset 24
; CHECK-NEXT:    popq %r15
; CHECK-NEXT:    .cfi_def_cfa_offset 16
; CHECK-NEXT:    popq %rbp
; CHECK-NEXT:    .cfi_def_cfa_offset 8
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{rax},~{rbx},~{rcx},~{rdx},~{rsi},~{rdi},~{rbp},~{r8},~{r9},~{r10},~{r11},~{r12},~{r13},~{r14},~{r15}"()
  %2 = insertelement <8 x i16> %a0, i16 %a1, i32 1
  ret <8 x i16> %2
}

define <8 x i16> @stack_fold_pmaddubsw(<16 x i8> %a0, <16 x i8> %a1) {
; CHECK-LABEL: stack_fold_pmaddubsw:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vpmaddubsw {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = call <8 x i16> @llvm.x86.ssse3.pmadd.ub.sw.128(<16 x i8> %a0, <16 x i8> %a1)
  ret <8 x i16> %2
}
declare <8 x i16> @llvm.x86.ssse3.pmadd.ub.sw.128(<16 x i8>, <16 x i8>) nounwind readnone

define <4 x i32> @stack_fold_pmaddwd(<8 x i16> %a0, <8 x i16> %a1) {
; CHECK-LABEL: stack_fold_pmaddwd:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vpmaddwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = call <4 x i32> @llvm.x86.sse2.pmadd.wd(<8 x i16> %a0, <8 x i16> %a1)
  ret <4 x i32> %2
}
declare <4 x i32> @llvm.x86.sse2.pmadd.wd(<8 x i16>, <8 x i16>) nounwind readnone

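; The pmax/pmin tests use the canonical icmp+select form of min/max, which the
; backend matches back to the corresponding vpmax*/vpmin* instruction.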
define <16 x i8> @stack_fold_pmaxsb(<16 x i8> %a0, <16 x i8> %a1) {
; CHECK-LABEL: stack_fold_pmaxsb:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vpmaxsb {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = icmp sgt <16 x i8> %a0, %a1
  %3 = select <16 x i1> %2, <16 x i8> %a0, <16 x i8> %a1
  ret <16 x i8> %3
}

define <4 x i32> @stack_fold_pmaxsd(<4 x i32> %a0, <4 x i32> %a1) {
; CHECK-LABEL: stack_fold_pmaxsd:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vpmaxsd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = icmp sgt <4 x i32> %a0, %a1
  %3 = select <4 x i1> %2, <4 x i32> %a0, <4 x i32> %a1
  ret <4 x i32> %3
}

define <8 x i16> @stack_fold_pmaxsw(<8 x i16> %a0, <8 x i16> %a1) {
; CHECK-LABEL: stack_fold_pmaxsw:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vpmaxsw {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = icmp sgt <8 x i16> %a0, %a1
  %3 = select <8 x i1> %2, <8 x i16> %a0, <8 x i16> %a1
  ret <8 x i16> %3
}

define <16 x i8> @stack_fold_pmaxub(<16 x i8> %a0, <16 x i8> %a1) {
; CHECK-LABEL: stack_fold_pmaxub:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vpmaxub {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = icmp ugt <16 x i8> %a0, %a1
  %3 = select <16 x i1> %2, <16 x i8> %a0, <16 x i8> %a1
  ret <16 x i8> %3
}

define <4 x i32> @stack_fold_pmaxud(<4 x i32> %a0, <4 x i32> %a1) {
; CHECK-LABEL: stack_fold_pmaxud:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vpmaxud {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = icmp ugt <4 x i32> %a0, %a1
  %3 = select <4 x i1> %2, <4 x i32> %a0, <4 x i32> %a1
  ret <4 x i32> %3
}

define <8 x i16> @stack_fold_pmaxuw(<8 x i16> %a0, <8 x i16> %a1) {
; CHECK-LABEL: stack_fold_pmaxuw:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vpmaxuw {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = icmp ugt <8 x i16> %a0, %a1
  %3 = select <8 x i1> %2, <8 x i16> %a0, <8 x i16> %a1
  ret <8 x i16> %3
}

define <16 x i8> @stack_fold_pminsb(<16 x i8> %a0, <16 x i8> %a1) {
; CHECK-LABEL: stack_fold_pminsb:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vpminsb {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = icmp slt <16 x i8> %a0, %a1
  %3 = select <16 x i1> %2, <16 x i8> %a0, <16 x i8> %a1
  ret <16 x i8> %3
}

define <4 x i32> @stack_fold_pminsd(<4 x i32> %a0, <4 x i32> %a1) {
; CHECK-LABEL: stack_fold_pminsd:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vpminsd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = icmp slt <4 x i32> %a0, %a1
  %3 = select <4 x i1> %2, <4 x i32> %a0, <4 x i32> %a1
  ret <4 x i32> %3
}

define <8 x i16> @stack_fold_pminsw(<8 x i16> %a0, <8 x i16> %a1) {
; CHECK-LABEL: stack_fold_pminsw:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vpminsw {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = icmp slt <8 x i16> %a0, %a1
  %3 = select <8 x i1> %2, <8 x i16> %a0, <8 x i16> %a1
  ret <8 x i16> %3
}

define <16 x i8> @stack_fold_pminub(<16 x i8> %a0, <16 x i8> %a1) {
; CHECK-LABEL: stack_fold_pminub:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vpminub {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = icmp ult <16 x i8> %a0, %a1
  %3 = select <16 x i1> %2, <16 x i8> %a0, <16 x i8> %a1
  ret <16 x i8> %3
}

define <4 x i32> @stack_fold_pminud(<4 x i32> %a0, <4 x i32> %a1) {
; CHECK-LABEL: stack_fold_pminud:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vpminud {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = icmp ult <4 x i32> %a0, %a1
  %3 = select <4 x i1> %2, <4 x i32> %a0, <4 x i32> %a1
  ret <4 x i32> %3
}

define <8 x i16> @stack_fold_pminuw(<8 x i16> %a0, <8 x i16> %a1) {
; CHECK-LABEL: stack_fold_pminuw:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vpminuw {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = icmp ult <8 x i16> %a0, %a1
  %3 = select <8 x i1> %2, <8 x i16> %a0, <8 x i16> %a1
  ret <8 x i16> %3
}

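; The shl/ashr-by-32 pair sign-extends the low 32 bits of each 64-bit lane;
; this is the pattern the backend recognizes as vpmuldq.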
define <2 x i64> @stack_fold_pmuldq(<4 x i32> %a0, <4 x i32> %a1) {
; CHECK-LABEL: stack_fold_pmuldq:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vpmuldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = bitcast <4 x i32> %a0 to <2 x i64>
  %3 = bitcast <4 x i32> %a1 to <2 x i64>
  %4 = shl <2 x i64> %2, <i64 32, i64 32>
  %5 = ashr <2 x i64> %4, <i64 32, i64 32>
  %6 = shl <2 x i64> %3, <i64 32, i64 32>
  %7 = ashr <2 x i64> %6, <i64 32, i64 32>
  %8 = mul <2 x i64> %5, %7
  ret <2 x i64> %8
}

define <8 x i16> @stack_fold_pmulhrsw(<8 x i16> %a0, <8 x i16> %a1) {
; CHECK-LABEL: stack_fold_pmulhrsw:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vpmulhrsw {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = call <8 x i16> @llvm.x86.ssse3.pmul.hr.sw.128(<8 x i16> %a0, <8 x i16> %a1)
  ret <8 x i16> %2
}
declare <8 x i16> @llvm.x86.ssse3.pmul.hr.sw.128(<8 x i16>, <8 x i16>) nounwind readnone

define <8 x i16> @stack_fold_pmulhuw(<8 x i16> %a0, <8 x i16> %a1) {
; CHECK-LABEL: stack_fold_pmulhuw:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vpmulhuw {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = call <8 x i16> @llvm.x86.sse2.pmulhu.w(<8 x i16> %a0, <8 x i16> %a1)
  ret <8 x i16> %2
}
declare <8 x i16> @llvm.x86.sse2.pmulhu.w(<8 x i16>, <8 x i16>) nounwind readnone

define <8 x i16> @stack_fold_pmulhw(<8 x i16> %a0, <8 x i16> %a1) {
; CHECK-LABEL: stack_fold_pmulhw:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vpmulhw {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = call <8 x i16> @llvm.x86.sse2.pmulh.w(<8 x i16> %a0, <8 x i16> %a1)
  ret <8 x i16> %2
}
declare <8 x i16> @llvm.x86.sse2.pmulh.w(<8 x i16>, <8 x i16>) nounwind readnone

define <4 x i32> @stack_fold_pmulld(<4 x i32> %a0, <4 x i32> %a1) {
; CHECK-LABEL: stack_fold_pmulld:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vpmulld {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = mul <4 x i32> %a0, %a1
  ret <4 x i32> %2
}

define <8 x i16> @stack_fold_pmullw(<8 x i16> %a0, <8 x i16> %a1) {
; CHECK-LABEL: stack_fold_pmullw:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vpmullw {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = mul <8 x i16> %a0, %a1
  ret <8 x i16> %2
}

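; Masking each 64-bit lane with 0xffffffff zero-extends the low 32 bits;
; this is the pattern the backend recognizes as vpmuludq.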
define <2 x i64> @stack_fold_pmuludq(<4 x i32> %a0, <4 x i32> %a1) {
; CHECK-LABEL: stack_fold_pmuludq:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vpmuludq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = bitcast <4 x i32> %a0 to <2 x i64>
  %3 = bitcast <4 x i32> %a1 to <2 x i64>
  %4 = and <2 x i64> %2, <i64 4294967295, i64 4294967295>
  %5 = and <2 x i64> %3, <i64 4294967295, i64 4294967295>
  %6 = mul <2 x i64> %4, %5
  ret <2 x i64> %6
}

define <16 x i8> @stack_fold_por(<16 x i8> %a0, <16 x i8> %a1) {
; CHECK-LABEL: stack_fold_por:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vpor {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
; CHECK-NEXT:    vpcmpeqd %xmm1, %xmm1, %xmm1
; CHECK-NEXT:    vpsubb %xmm1, %xmm0, %xmm0
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = or <16 x i8> %a0, %a1
  ; add forces execution domain
  %3 = add <16 x i8> %2, <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
  ret <16 x i8> %3
}

define <2 x i64> @stack_fold_psadbw(<16 x i8> %a0, <16 x i8> %a1) {
; CHECK-LABEL: stack_fold_psadbw:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vpsadbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = call <2 x i64> @llvm.x86.sse2.psad.bw(<16 x i8> %a0, <16 x i8> %a1)
  ret <2 x i64> %2
}
declare <2 x i64> @llvm.x86.sse2.psad.bw(<16 x i8>, <16 x i8>) nounwind readnone

define <16 x i8> @stack_fold_pshufb(<16 x i8> %a0, <16 x i8> %a1) {
; CHECK-LABEL: stack_fold_pshufb:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vpshufb {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = call <16 x i8> @llvm.x86.ssse3.pshuf.b.128(<16 x i8> %a0, <16 x i8> %a1)
  ret <16 x i8> %2
}
declare <16 x i8> @llvm.x86.ssse3.pshuf.b.128(<16 x i8>, <16 x i8>) nounwind readnone

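; The single-input shuffle tests also clobber xmm1, so the asm result must
; live in xmm0 and the source vector itself is spilled; the shuffle then
; folds the reload directly from the stack slot.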
define <4 x i32> @stack_fold_pshufd(<4 x i32> %a0) {
; CHECK-LABEL: stack_fold_pshufd:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vpshufd $27, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; CHECK-NEXT:    # xmm0 = mem[3,2,1,0]
; CHECK-NEXT:    vpcmpeqd %xmm1, %xmm1, %xmm1
; CHECK-NEXT:    vpsubd %xmm1, %xmm0, %xmm0
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = shufflevector <4 x i32> %a0, <4 x i32> undef, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
  ; add forces execution domain
  %3 = add <4 x i32> %2, <i32 1, i32 1, i32 1, i32 1>
  ret <4 x i32> %3
}

define <8 x i16> @stack_fold_pshufhw(<8 x i16> %a0) {
; CHECK-LABEL: stack_fold_pshufhw:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vpshufhw $11, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; CHECK-NEXT:    # xmm0 = mem[0,1,2,3,7,6,4,4]
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = shufflevector <8 x i16> %a0, <8 x i16> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 7, i32 6, i32 4, i32 4>
  ret <8 x i16> %2
}

define <8 x i16> @stack_fold_pshuflw(<8 x i16> %a0) {
; CHECK-LABEL: stack_fold_pshuflw:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vpshuflw $27, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; CHECK-NEXT:    # xmm0 = mem[3,2,1,0,4,5,6,7]
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = shufflevector <8 x i16> %a0, <8 x i16> undef, <8 x i32> <i32 3, i32 2, i32 1, i32 0, i32 4, i32 5, i32 6, i32 7>
  ret <8 x i16> %2
}

define <16 x i8> @stack_fold_psignb(<16 x i8> %a0, <16 x i8> %a1) {
; CHECK-LABEL: stack_fold_psignb:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vpsignb {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = call <16 x i8> @llvm.x86.ssse3.psign.b.128(<16 x i8> %a0, <16 x i8> %a1)
  ret <16 x i8> %2
}
declare <16 x i8> @llvm.x86.ssse3.psign.b.128(<16 x i8>, <16 x i8>) nounwind readnone

define <4 x i32> @stack_fold_psignd(<4 x i32> %a0, <4 x i32> %a1) {
; CHECK-LABEL: stack_fold_psignd:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vpsignd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = call <4 x i32> @llvm.x86.ssse3.psign.d.128(<4 x i32> %a0, <4 x i32> %a1)
  ret <4 x i32> %2
}
declare <4 x i32> @llvm.x86.ssse3.psign.d.128(<4 x i32>, <4 x i32>) nounwind readnone

define <8 x i16> @stack_fold_psignw(<8 x i16> %a0, <8 x i16> %a1) {
; CHECK-LABEL: stack_fold_psignw:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vpsignw {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = call <8 x i16> @llvm.x86.ssse3.psign.w.128(<8 x i16> %a0, <8 x i16> %a1)
  ret <8 x i16> %2
}
declare <8 x i16> @llvm.x86.ssse3.psign.w.128(<8 x i16>, <8 x i16>) nounwind readnone

define <4 x i32> @stack_fold_pslld(<4 x i32> %a0, <4 x i32> %a1) {
; CHECK-LABEL: stack_fold_pslld:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vpslld {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = call <4 x i32> @llvm.x86.sse2.psll.d(<4 x i32> %a0, <4 x i32> %a1)
  ret <4 x i32> %2
}
declare <4 x i32> @llvm.x86.sse2.psll.d(<4 x i32>, <4 x i32>) nounwind readnone

define <2 x i64> @stack_fold_psllq(<2 x i64> %a0, <2 x i64> %a1) {
; CHECK-LABEL: stack_fold_psllq:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vpsllq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = call <2 x i64> @llvm.x86.sse2.psll.q(<2 x i64> %a0, <2 x i64> %a1)
  ret <2 x i64> %2
}
declare <2 x i64> @llvm.x86.sse2.psll.q(<2 x i64>, <2 x i64>) nounwind readnone

define <8 x i16> @stack_fold_psllw(<8 x i16> %a0, <8 x i16> %a1) {
; CHECK-LABEL: stack_fold_psllw:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vpsllw {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = call <8 x i16> @llvm.x86.sse2.psll.w(<8 x i16> %a0, <8 x i16> %a1)
  ret <8 x i16> %2
}
declare <8 x i16> @llvm.x86.sse2.psll.w(<8 x i16>, <8 x i16>) nounwind readnone

define <4 x i32> @stack_fold_psrad(<4 x i32> %a0, <4 x i32> %a1) {
; CHECK-LABEL: stack_fold_psrad:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vpsrad {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = call <4 x i32> @llvm.x86.sse2.psra.d(<4 x i32> %a0, <4 x i32> %a1)
  ret <4 x i32> %2
}
declare <4 x i32> @llvm.x86.sse2.psra.d(<4 x i32>, <4 x i32>) nounwind readnone

define <8 x i16> @stack_fold_psraw(<8 x i16> %a0, <8 x i16> %a1) {
; CHECK-LABEL: stack_fold_psraw:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vpsraw {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = call <8 x i16> @llvm.x86.sse2.psra.w(<8 x i16> %a0, <8 x i16> %a1)
  ret <8 x i16> %2
}
declare <8 x i16> @llvm.x86.sse2.psra.w(<8 x i16>, <8 x i16>) nounwind readnone

define <4 x i32> @stack_fold_psrld(<4 x i32> %a0, <4 x i32> %a1) {
; CHECK-LABEL: stack_fold_psrld:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vpsrld {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = call <4 x i32> @llvm.x86.sse2.psrl.d(<4 x i32> %a0, <4 x i32> %a1)
  ret <4 x i32> %2
}
declare <4 x i32> @llvm.x86.sse2.psrl.d(<4 x i32>, <4 x i32>) nounwind readnone

define <2 x i64> @stack_fold_psrlq(<2 x i64> %a0, <2 x i64> %a1) {
; CHECK-LABEL: stack_fold_psrlq:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vpsrlq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = call <2 x i64> @llvm.x86.sse2.psrl.q(<2 x i64> %a0, <2 x i64> %a1)
  ret <2 x i64> %2
}
declare <2 x i64> @llvm.x86.sse2.psrl.q(<2 x i64>, <2 x i64>) nounwind readnone

define <8 x i16> @stack_fold_psrlw(<8 x i16> %a0, <8 x i16> %a1) {
; CHECK-LABEL: stack_fold_psrlw:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vpsrlw {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = call <8 x i16> @llvm.x86.sse2.psrl.w(<8 x i16> %a0, <8 x i16> %a1)
  ret <8 x i16> %2
}
declare <8 x i16> @llvm.x86.sse2.psrl.w(<8 x i16>, <8 x i16>) nounwind readnone

define <16 x i8> @stack_fold_psubb(<16 x i8> %a0, <16 x i8> %a1) {
; CHECK-LABEL: stack_fold_psubb:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vpsubb {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = sub <16 x i8> %a0, %a1
  ret <16 x i8> %2
}

define <4 x i32> @stack_fold_psubd(<4 x i32> %a0, <4 x i32> %a1) {
; CHECK-LABEL: stack_fold_psubd:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vpsubd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = sub <4 x i32> %a0, %a1
  ret <4 x i32> %2
}

define <2 x i64> @stack_fold_psubq(<2 x i64> %a0, <2 x i64> %a1) {
; CHECK-LABEL: stack_fold_psubq:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vpsubq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = sub <2 x i64> %a0, %a1
  ret <2 x i64> %2
}

define <16 x i8> @stack_fold_psubsb(<16 x i8> %a0, <16 x i8> %a1) {
; CHECK-LABEL: stack_fold_psubsb:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vpsubsb {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = call <16 x i8> @llvm.ssub.sat.v16i8(<16 x i8> %a0, <16 x i8> %a1)
  ret <16 x i8> %2
}
declare <16 x i8> @llvm.ssub.sat.v16i8(<16 x i8>, <16 x i8>) nounwind readnone

define <8 x i16> @stack_fold_psubsw(<8 x i16> %a0, <8 x i16> %a1) {
; CHECK-LABEL: stack_fold_psubsw:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vpsubsw {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = call <8 x i16> @llvm.ssub.sat.v8i16(<8 x i16> %a0, <8 x i16> %a1)
  ret <8 x i16> %2
}
declare <8 x i16> @llvm.ssub.sat.v8i16(<8 x i16>, <8 x i16>) nounwind readnone

define <16 x i8> @stack_fold_psubusb(<16 x i8> %a0, <16 x i8> %a1) {
; CHECK-LABEL: stack_fold_psubusb:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vpsubusb {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = call <16 x i8> @llvm.usub.sat.v16i8(<16 x i8> %a0, <16 x i8> %a1)
  ret <16 x i8> %2
}
declare <16 x i8> @llvm.usub.sat.v16i8(<16 x i8>, <16 x i8>) nounwind readnone

define <8 x i16> @stack_fold_psubusw(<8 x i16> %a0, <8 x i16> %a1) {
; CHECK-LABEL: stack_fold_psubusw:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vpsubusw {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = call <8 x i16> @llvm.usub.sat.v8i16(<8 x i16> %a0, <8 x i16> %a1)
  ret <8 x i16> %2
}
declare <8 x i16> @llvm.usub.sat.v8i16(<8 x i16>, <8 x i16>) nounwind readnone

define <8 x i16> @stack_fold_psubw(<8 x i16> %a0, <8 x i16> %a1) {
; CHECK-LABEL: stack_fold_psubw:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vpsubw {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = sub <8 x i16> %a0, %a1
  ret <8 x i16> %2
}

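; The ptestc intrinsics return the carry flag, so the i32 result is
; materialized with xorl+setb around the folded vptest.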
define i32 @stack_fold_ptest(<2 x i64> %a0, <2 x i64> %a1) {
; CHECK-LABEL: stack_fold_ptest:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    xorl %eax, %eax
; CHECK-NEXT:    vptest {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; CHECK-NEXT:    setb %al
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = call i32 @llvm.x86.sse41.ptestc(<2 x i64> %a0, <2 x i64> %a1)
  ret i32 %2
}
declare i32 @llvm.x86.sse41.ptestc(<2 x i64>, <2 x i64>) nounwind readnone

define i32 @stack_fold_ptest_ymm(<4 x i64> %a0, <4 x i64> %a1) {
; CHECK-LABEL: stack_fold_ptest_ymm:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    xorl %eax, %eax
; CHECK-NEXT:    vptest {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Folded Reload
; CHECK-NEXT:    setb %al
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = call i32 @llvm.x86.avx.ptestc.256(<4 x i64> %a0, <4 x i64> %a1)
  ret i32 %2
}
declare i32 @llvm.x86.avx.ptestc.256(<4 x i64>, <4 x i64>) nounwind readnone

define <16 x i8> @stack_fold_punpckhbw(<16 x i8> %a0, <16 x i8> %a1) {
; CHECK-LABEL: stack_fold_punpckhbw:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vpunpckhbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
; CHECK-NEXT:    # xmm0 = xmm0[8],mem[8],xmm0[9],mem[9],xmm0[10],mem[10],xmm0[11],mem[11],xmm0[12],mem[12],xmm0[13],mem[13],xmm0[14],mem[14],xmm0[15],mem[15]
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = shufflevector <16 x i8> %a0, <16 x i8> %a1, <16 x i32> <i32 8, i32 24, i32 9, i32 25, i32 10, i32 26, i32 11, i32 27, i32 12, i32 28, i32 13, i32 29, i32 14, i32 30, i32 15, i32 31>
  ret <16 x i8> %2
}

define <4 x i32> @stack_fold_punpckhdq(<4 x i32> %a0, <4 x i32> %a1) {
; CHECK-LABEL: stack_fold_punpckhdq:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vpunpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
; CHECK-NEXT:    # xmm0 = xmm0[2],mem[2],xmm0[3],mem[3]
; CHECK-NEXT:    vpcmpeqd %xmm1, %xmm1, %xmm1
; CHECK-NEXT:    vpsubd %xmm1, %xmm0, %xmm0
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = shufflevector <4 x i32> %a0, <4 x i32> %a1, <4 x i32> <i32 2, i32 6, i32 3, i32 7>
  ; add forces execution domain
  %3 = add <4 x i32> %2, <i32 1, i32 1, i32 1, i32 1>
  ret <4 x i32> %3
}

define <2 x i64> @stack_fold_punpckhqdq(<2 x i64> %a0, <2 x i64> %a1) {
; CHECK-LABEL: stack_fold_punpckhqdq:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
; CHECK-NEXT:    # xmm0 = xmm0[1],mem[1]
; CHECK-NEXT:    vpcmpeqd %xmm1, %xmm1, %xmm1
; CHECK-NEXT:    vpsubq %xmm1, %xmm0, %xmm0
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = shufflevector <2 x i64> %a0, <2 x i64> %a1, <2 x i32> <i32 1, i32 3>
  ; add forces execution domain
  %3 = add <2 x i64> %2, <i64 1, i64 1>
  ret <2 x i64> %3
}

define <8 x i16> @stack_fold_punpckhwd(<8 x i16> %a0, <8 x i16> %a1) {
; CHECK-LABEL: stack_fold_punpckhwd:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
; CHECK-NEXT:    # xmm0 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7]
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = shufflevector <8 x i16> %a0, <8 x i16> %a1, <8 x i32> <i32 4, i32 12, i32 5, i32 13, i32 6, i32 14, i32 7, i32 15>
  ret <8 x i16> %2
}

define <16 x i8> @stack_fold_punpcklbw(<16 x i8> %a0, <16 x i8> %a1) {
; CHECK-LABEL: stack_fold_punpcklbw:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vpunpcklbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
; CHECK-NEXT:    # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3],xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7]
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = shufflevector <16 x i8> %a0, <16 x i8> %a1, <16 x i32> <i32 0, i32 16, i32 1, i32 17, i32 2, i32 18, i32 3, i32 19, i32 4, i32 20, i32 5, i32 21, i32 6, i32 22, i32 7, i32 23>
  ret <16 x i8> %2
}

define <4 x i32> @stack_fold_punpckldq(<4 x i32> %a0, <4 x i32> %a1) {
; CHECK-LABEL: stack_fold_punpckldq:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
; CHECK-NEXT:    # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
; CHECK-NEXT:    vpcmpeqd %xmm1, %xmm1, %xmm1
; CHECK-NEXT:    vpsubd %xmm1, %xmm0, %xmm0
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = shufflevector <4 x i32> %a0, <4 x i32> %a1, <4 x i32> <i32 0, i32 4, i32 1, i32 5>
  ; add forces execution domain
  %3 = add <4 x i32> %2, <i32 1, i32 1, i32 1, i32 1>
  ret <4 x i32> %3
}

define <2 x i64> @stack_fold_punpcklqdq(<2 x i64> %a0, <2 x i64> %a1) {
; CHECK-LABEL: stack_fold_punpcklqdq:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vpunpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
; CHECK-NEXT:    # xmm0 = xmm0[0],mem[0]
; CHECK-NEXT:    vpcmpeqd %xmm1, %xmm1, %xmm1
; CHECK-NEXT:    vpsubq %xmm1, %xmm0, %xmm0
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = shufflevector <2 x i64> %a0, <2 x i64> %a1, <2 x i32> <i32 0, i32 2>
  ; add forces execution domain
  %3 = add <2 x i64> %2, <i64 1, i64 1>
  ret <2 x i64> %3
}

define <8 x i16> @stack_fold_punpcklwd(<8 x i16> %a0, <8 x i16> %a1) {
; CHECK-LABEL: stack_fold_punpcklwd:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
; CHECK-NEXT:    # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = shufflevector <8 x i16> %a0, <8 x i16> %a1, <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11>
  ret <8 x i16> %2
}

define <16 x i8> @stack_fold_pxor(<16 x i8> %a0, <16 x i8> %a1) {
; CHECK-LABEL: stack_fold_pxor:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vpxor {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
; CHECK-NEXT:    vpcmpeqd %xmm1, %xmm1, %xmm1
; CHECK-NEXT:    vpsubb %xmm1, %xmm0, %xmm0
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = xor <16 x i8> %a0, %a1
  ; add forces execution domain
  %3 = add <16 x i8> %2, <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
  ret <16 x i8> %3
}
