1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2; RUN: llc -O3 -disable-peephole -mtriple=x86_64-unknown-unknown -mattr=+sse4.2,+aes,+pclmul < %s | FileCheck %s
3
4target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
5target triple = "x86_64-unknown-unknown"
6
7; Stack reload folding tests.
8;
9; By including a nop call with sideeffects we can force a partial register spill of the
10; relevant registers and check that the reload is correctly folded into the instruction.
11
12define <2 x i64> @stack_fold_aesdec(<2 x i64> %a0, <2 x i64> %a1) {
13; CHECK-LABEL: stack_fold_aesdec:
14; CHECK:       # %bb.0:
15; CHECK-NEXT:    movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
16; CHECK-NEXT:    #APP
17; CHECK-NEXT:    nop
18; CHECK-NEXT:    #NO_APP
19; CHECK-NEXT:    aesdec {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
20; CHECK-NEXT:    retq
21  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
22  %2 = call <2 x i64> @llvm.x86.aesni.aesdec(<2 x i64> %a0, <2 x i64> %a1)
23  ret <2 x i64> %2
24}
25declare <2 x i64> @llvm.x86.aesni.aesdec(<2 x i64>, <2 x i64>) nounwind readnone
26
27define <2 x i64> @stack_fold_aesdeclast(<2 x i64> %a0, <2 x i64> %a1) {
28; CHECK-LABEL: stack_fold_aesdeclast:
29; CHECK:       # %bb.0:
30; CHECK-NEXT:    movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
31; CHECK-NEXT:    #APP
32; CHECK-NEXT:    nop
33; CHECK-NEXT:    #NO_APP
34; CHECK-NEXT:    aesdeclast {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
35; CHECK-NEXT:    retq
36  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
37  %2 = call <2 x i64> @llvm.x86.aesni.aesdeclast(<2 x i64> %a0, <2 x i64> %a1)
38  ret <2 x i64> %2
39}
40declare <2 x i64> @llvm.x86.aesni.aesdeclast(<2 x i64>, <2 x i64>) nounwind readnone
41
42define <2 x i64> @stack_fold_aesenc(<2 x i64> %a0, <2 x i64> %a1) {
43; CHECK-LABEL: stack_fold_aesenc:
44; CHECK:       # %bb.0:
45; CHECK-NEXT:    movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
46; CHECK-NEXT:    #APP
47; CHECK-NEXT:    nop
48; CHECK-NEXT:    #NO_APP
49; CHECK-NEXT:    aesenc {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
50; CHECK-NEXT:    retq
51  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
52  %2 = call <2 x i64> @llvm.x86.aesni.aesenc(<2 x i64> %a0, <2 x i64> %a1)
53  ret <2 x i64> %2
54}
55declare <2 x i64> @llvm.x86.aesni.aesenc(<2 x i64>, <2 x i64>) nounwind readnone
56
57define <2 x i64> @stack_fold_aesenclast(<2 x i64> %a0, <2 x i64> %a1) {
58; CHECK-LABEL: stack_fold_aesenclast:
59; CHECK:       # %bb.0:
60; CHECK-NEXT:    movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
61; CHECK-NEXT:    #APP
62; CHECK-NEXT:    nop
63; CHECK-NEXT:    #NO_APP
64; CHECK-NEXT:    aesenclast {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
65; CHECK-NEXT:    retq
66  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
67  %2 = call <2 x i64> @llvm.x86.aesni.aesenclast(<2 x i64> %a0, <2 x i64> %a1)
68  ret <2 x i64> %2
69}
70declare <2 x i64> @llvm.x86.aesni.aesenclast(<2 x i64>, <2 x i64>) nounwind readnone
71
72define <2 x i64> @stack_fold_aesimc(<2 x i64> %a0) {
73; CHECK-LABEL: stack_fold_aesimc:
74; CHECK:       # %bb.0:
75; CHECK-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
76; CHECK-NEXT:    #APP
77; CHECK-NEXT:    nop
78; CHECK-NEXT:    #NO_APP
79; CHECK-NEXT:    aesimc {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
80; CHECK-NEXT:    retq
81  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
82  %2 = call <2 x i64> @llvm.x86.aesni.aesimc(<2 x i64> %a0)
83  ret <2 x i64> %2
84}
85declare <2 x i64> @llvm.x86.aesni.aesimc(<2 x i64>) nounwind readnone
86
87define <2 x i64> @stack_fold_aeskeygenassist(<2 x i64> %a0) {
88; CHECK-LABEL: stack_fold_aeskeygenassist:
89; CHECK:       # %bb.0:
90; CHECK-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
91; CHECK-NEXT:    #APP
92; CHECK-NEXT:    nop
93; CHECK-NEXT:    #NO_APP
94; CHECK-NEXT:    aeskeygenassist $7, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
95; CHECK-NEXT:    retq
96  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
97  %2 = call <2 x i64> @llvm.x86.aesni.aeskeygenassist(<2 x i64> %a0, i8 7)
98  ret <2 x i64> %2
99}
100declare <2 x i64> @llvm.x86.aesni.aeskeygenassist(<2 x i64>, i8) nounwind readnone
101
102define i32 @stack_fold_crc32_32_8(i32 %a0, i8 %a1) {
103; CHECK-LABEL: stack_fold_crc32_32_8:
104; CHECK:       # %bb.0:
105; CHECK-NEXT:    pushq %rbp
106; CHECK-NEXT:    .cfi_def_cfa_offset 16
107; CHECK-NEXT:    pushq %r15
108; CHECK-NEXT:    .cfi_def_cfa_offset 24
109; CHECK-NEXT:    pushq %r14
110; CHECK-NEXT:    .cfi_def_cfa_offset 32
111; CHECK-NEXT:    pushq %r13
112; CHECK-NEXT:    .cfi_def_cfa_offset 40
113; CHECK-NEXT:    pushq %r12
114; CHECK-NEXT:    .cfi_def_cfa_offset 48
115; CHECK-NEXT:    pushq %rbx
116; CHECK-NEXT:    .cfi_def_cfa_offset 56
117; CHECK-NEXT:    .cfi_offset %rbx, -56
118; CHECK-NEXT:    .cfi_offset %r12, -48
119; CHECK-NEXT:    .cfi_offset %r13, -40
120; CHECK-NEXT:    .cfi_offset %r14, -32
121; CHECK-NEXT:    .cfi_offset %r15, -24
122; CHECK-NEXT:    .cfi_offset %rbp, -16
123; CHECK-NEXT:    movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
124; CHECK-NEXT:    movl %edi, %eax
125; CHECK-NEXT:    #APP
126; CHECK-NEXT:    nop
127; CHECK-NEXT:    #NO_APP
128; CHECK-NEXT:    crc32b {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
129; CHECK-NEXT:    popq %rbx
130; CHECK-NEXT:    .cfi_def_cfa_offset 48
131; CHECK-NEXT:    popq %r12
132; CHECK-NEXT:    .cfi_def_cfa_offset 40
133; CHECK-NEXT:    popq %r13
134; CHECK-NEXT:    .cfi_def_cfa_offset 32
135; CHECK-NEXT:    popq %r14
136; CHECK-NEXT:    .cfi_def_cfa_offset 24
137; CHECK-NEXT:    popq %r15
138; CHECK-NEXT:    .cfi_def_cfa_offset 16
139; CHECK-NEXT:    popq %rbp
140; CHECK-NEXT:    .cfi_def_cfa_offset 8
141; CHECK-NEXT:    retq
142  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{rbx},~{rcx},~{rdx},~{rsi},~{rdi},~{rbp},~{r8},~{r9},~{r10},~{r11},~{r12},~{r13},~{r14},~{r15}"()
143  %2 = call i32 @llvm.x86.sse42.crc32.32.8(i32 %a0, i8 %a1)
144  ret i32 %2
145}
146declare i32 @llvm.x86.sse42.crc32.32.8(i32, i8) nounwind
147
148define i32 @stack_fold_crc32_32_16(i32 %a0, i16 %a1) {
149; CHECK-LABEL: stack_fold_crc32_32_16:
150; CHECK:       # %bb.0:
151; CHECK-NEXT:    pushq %rbp
152; CHECK-NEXT:    .cfi_def_cfa_offset 16
153; CHECK-NEXT:    pushq %r15
154; CHECK-NEXT:    .cfi_def_cfa_offset 24
155; CHECK-NEXT:    pushq %r14
156; CHECK-NEXT:    .cfi_def_cfa_offset 32
157; CHECK-NEXT:    pushq %r13
158; CHECK-NEXT:    .cfi_def_cfa_offset 40
159; CHECK-NEXT:    pushq %r12
160; CHECK-NEXT:    .cfi_def_cfa_offset 48
161; CHECK-NEXT:    pushq %rbx
162; CHECK-NEXT:    .cfi_def_cfa_offset 56
163; CHECK-NEXT:    .cfi_offset %rbx, -56
164; CHECK-NEXT:    .cfi_offset %r12, -48
165; CHECK-NEXT:    .cfi_offset %r13, -40
166; CHECK-NEXT:    .cfi_offset %r14, -32
167; CHECK-NEXT:    .cfi_offset %r15, -24
168; CHECK-NEXT:    .cfi_offset %rbp, -16
169; CHECK-NEXT:    movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
170; CHECK-NEXT:    movl %edi, %eax
171; CHECK-NEXT:    #APP
172; CHECK-NEXT:    nop
173; CHECK-NEXT:    #NO_APP
174; CHECK-NEXT:    crc32w {{[-0-9]+}}(%r{{[sb]}}p), %eax # 2-byte Folded Reload
175; CHECK-NEXT:    popq %rbx
176; CHECK-NEXT:    .cfi_def_cfa_offset 48
177; CHECK-NEXT:    popq %r12
178; CHECK-NEXT:    .cfi_def_cfa_offset 40
179; CHECK-NEXT:    popq %r13
180; CHECK-NEXT:    .cfi_def_cfa_offset 32
181; CHECK-NEXT:    popq %r14
182; CHECK-NEXT:    .cfi_def_cfa_offset 24
183; CHECK-NEXT:    popq %r15
184; CHECK-NEXT:    .cfi_def_cfa_offset 16
185; CHECK-NEXT:    popq %rbp
186; CHECK-NEXT:    .cfi_def_cfa_offset 8
187; CHECK-NEXT:    retq
188  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{rbx},~{rcx},~{rdx},~{rsi},~{rdi},~{rbp},~{r8},~{r9},~{r10},~{r11},~{r12},~{r13},~{r14},~{r15}"()
189  %2 = call i32 @llvm.x86.sse42.crc32.32.16(i32 %a0, i16 %a1)
190  ret i32 %2
191}
192declare i32 @llvm.x86.sse42.crc32.32.16(i32, i16) nounwind
193
194define i32 @stack_fold_crc32_32_32(i32 %a0, i32 %a1) {
195; CHECK-LABEL: stack_fold_crc32_32_32:
196; CHECK:       # %bb.0:
197; CHECK-NEXT:    pushq %rbp
198; CHECK-NEXT:    .cfi_def_cfa_offset 16
199; CHECK-NEXT:    pushq %r15
200; CHECK-NEXT:    .cfi_def_cfa_offset 24
201; CHECK-NEXT:    pushq %r14
202; CHECK-NEXT:    .cfi_def_cfa_offset 32
203; CHECK-NEXT:    pushq %r13
204; CHECK-NEXT:    .cfi_def_cfa_offset 40
205; CHECK-NEXT:    pushq %r12
206; CHECK-NEXT:    .cfi_def_cfa_offset 48
207; CHECK-NEXT:    pushq %rbx
208; CHECK-NEXT:    .cfi_def_cfa_offset 56
209; CHECK-NEXT:    .cfi_offset %rbx, -56
210; CHECK-NEXT:    .cfi_offset %r12, -48
211; CHECK-NEXT:    .cfi_offset %r13, -40
212; CHECK-NEXT:    .cfi_offset %r14, -32
213; CHECK-NEXT:    .cfi_offset %r15, -24
214; CHECK-NEXT:    .cfi_offset %rbp, -16
215; CHECK-NEXT:    movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
216; CHECK-NEXT:    movl %edi, %eax
217; CHECK-NEXT:    #APP
218; CHECK-NEXT:    nop
219; CHECK-NEXT:    #NO_APP
220; CHECK-NEXT:    crc32l {{[-0-9]+}}(%r{{[sb]}}p), %eax # 4-byte Folded Reload
221; CHECK-NEXT:    popq %rbx
222; CHECK-NEXT:    .cfi_def_cfa_offset 48
223; CHECK-NEXT:    popq %r12
224; CHECK-NEXT:    .cfi_def_cfa_offset 40
225; CHECK-NEXT:    popq %r13
226; CHECK-NEXT:    .cfi_def_cfa_offset 32
227; CHECK-NEXT:    popq %r14
228; CHECK-NEXT:    .cfi_def_cfa_offset 24
229; CHECK-NEXT:    popq %r15
230; CHECK-NEXT:    .cfi_def_cfa_offset 16
231; CHECK-NEXT:    popq %rbp
232; CHECK-NEXT:    .cfi_def_cfa_offset 8
233; CHECK-NEXT:    retq
234  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{rbx},~{rcx},~{rdx},~{rsi},~{rdi},~{rbp},~{r8},~{r9},~{r10},~{r11},~{r12},~{r13},~{r14},~{r15}"()
235  %2 = call i32 @llvm.x86.sse42.crc32.32.32(i32 %a0, i32 %a1)
236  ret i32 %2
237}
238declare i32 @llvm.x86.sse42.crc32.32.32(i32, i32) nounwind
239
240define i64 @stack_fold_crc32_64_64(i64 %a0, i64 %a1) {
241; CHECK-LABEL: stack_fold_crc32_64_64:
242; CHECK:       # %bb.0:
243; CHECK-NEXT:    pushq %rbp
244; CHECK-NEXT:    .cfi_def_cfa_offset 16
245; CHECK-NEXT:    pushq %r15
246; CHECK-NEXT:    .cfi_def_cfa_offset 24
247; CHECK-NEXT:    pushq %r14
248; CHECK-NEXT:    .cfi_def_cfa_offset 32
249; CHECK-NEXT:    pushq %r13
250; CHECK-NEXT:    .cfi_def_cfa_offset 40
251; CHECK-NEXT:    pushq %r12
252; CHECK-NEXT:    .cfi_def_cfa_offset 48
253; CHECK-NEXT:    pushq %rbx
254; CHECK-NEXT:    .cfi_def_cfa_offset 56
255; CHECK-NEXT:    .cfi_offset %rbx, -56
256; CHECK-NEXT:    .cfi_offset %r12, -48
257; CHECK-NEXT:    .cfi_offset %r13, -40
258; CHECK-NEXT:    .cfi_offset %r14, -32
259; CHECK-NEXT:    .cfi_offset %r15, -24
260; CHECK-NEXT:    .cfi_offset %rbp, -16
261; CHECK-NEXT:    movq %rsi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
262; CHECK-NEXT:    movq %rdi, %rax
263; CHECK-NEXT:    #APP
264; CHECK-NEXT:    nop
265; CHECK-NEXT:    #NO_APP
266; CHECK-NEXT:    crc32q {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Folded Reload
267; CHECK-NEXT:    popq %rbx
268; CHECK-NEXT:    .cfi_def_cfa_offset 48
269; CHECK-NEXT:    popq %r12
270; CHECK-NEXT:    .cfi_def_cfa_offset 40
271; CHECK-NEXT:    popq %r13
272; CHECK-NEXT:    .cfi_def_cfa_offset 32
273; CHECK-NEXT:    popq %r14
274; CHECK-NEXT:    .cfi_def_cfa_offset 24
275; CHECK-NEXT:    popq %r15
276; CHECK-NEXT:    .cfi_def_cfa_offset 16
277; CHECK-NEXT:    popq %rbp
278; CHECK-NEXT:    .cfi_def_cfa_offset 8
279; CHECK-NEXT:    retq
280  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{rbx},~{rcx},~{rdx},~{rsi},~{rdi},~{rbp},~{r8},~{r9},~{r10},~{r11},~{r12},~{r13},~{r14},~{r15}"()
281  %2 = call i64 @llvm.x86.sse42.crc32.64.64(i64 %a0, i64 %a1)
282  ret i64 %2
283}
284declare i64 @llvm.x86.sse42.crc32.64.64(i64, i64) nounwind
285
286define <4 x i32> @stack_fold_movd_load(i32 %a0) {
287; CHECK-LABEL: stack_fold_movd_load:
288; CHECK:       # %bb.0:
289; CHECK-NEXT:    pushq %rbp
290; CHECK-NEXT:    .cfi_def_cfa_offset 16
291; CHECK-NEXT:    pushq %r15
292; CHECK-NEXT:    .cfi_def_cfa_offset 24
293; CHECK-NEXT:    pushq %r14
294; CHECK-NEXT:    .cfi_def_cfa_offset 32
295; CHECK-NEXT:    pushq %r13
296; CHECK-NEXT:    .cfi_def_cfa_offset 40
297; CHECK-NEXT:    pushq %r12
298; CHECK-NEXT:    .cfi_def_cfa_offset 48
299; CHECK-NEXT:    pushq %rbx
300; CHECK-NEXT:    .cfi_def_cfa_offset 56
301; CHECK-NEXT:    .cfi_offset %rbx, -56
302; CHECK-NEXT:    .cfi_offset %r12, -48
303; CHECK-NEXT:    .cfi_offset %r13, -40
304; CHECK-NEXT:    .cfi_offset %r14, -32
305; CHECK-NEXT:    .cfi_offset %r15, -24
306; CHECK-NEXT:    .cfi_offset %rbp, -16
307; CHECK-NEXT:    movl %edi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
308; CHECK-NEXT:    #APP
309; CHECK-NEXT:    nop
310; CHECK-NEXT:    #NO_APP
311; CHECK-NEXT:    movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Folded Reload
312; CHECK-NEXT:    # xmm0 = mem[0],zero,zero,zero
313; CHECK-NEXT:    pcmpeqd %xmm1, %xmm1
314; CHECK-NEXT:    psubd %xmm1, %xmm0
315; CHECK-NEXT:    popq %rbx
316; CHECK-NEXT:    .cfi_def_cfa_offset 48
317; CHECK-NEXT:    popq %r12
318; CHECK-NEXT:    .cfi_def_cfa_offset 40
319; CHECK-NEXT:    popq %r13
320; CHECK-NEXT:    .cfi_def_cfa_offset 32
321; CHECK-NEXT:    popq %r14
322; CHECK-NEXT:    .cfi_def_cfa_offset 24
323; CHECK-NEXT:    popq %r15
324; CHECK-NEXT:    .cfi_def_cfa_offset 16
325; CHECK-NEXT:    popq %rbp
326; CHECK-NEXT:    .cfi_def_cfa_offset 8
327; CHECK-NEXT:    retq
328  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{rax},~{rbx},~{rcx},~{rdx},~{rsi},~{rdi},~{rbp},~{r8},~{r9},~{r10},~{r11},~{r12},~{r13},~{r14},~{r15}"()
329  %2 = insertelement <4 x i32> zeroinitializer, i32 %a0, i32 0
330  ; add forces execution domain
331  %3 = add <4 x i32> %2, <i32 1, i32 1, i32 1, i32 1>
332  ret <4 x i32> %3
333}
334
335define i32 @stack_fold_movd_store(<4 x i32> %a0, <4 x i32> %a1) {
336; CHECK-LABEL: stack_fold_movd_store:
337; CHECK:       # %bb.0:
338; CHECK-NEXT:    pushq %rbp
339; CHECK-NEXT:    .cfi_def_cfa_offset 16
340; CHECK-NEXT:    pushq %r15
341; CHECK-NEXT:    .cfi_def_cfa_offset 24
342; CHECK-NEXT:    pushq %r14
343; CHECK-NEXT:    .cfi_def_cfa_offset 32
344; CHECK-NEXT:    pushq %r13
345; CHECK-NEXT:    .cfi_def_cfa_offset 40
346; CHECK-NEXT:    pushq %r12
347; CHECK-NEXT:    .cfi_def_cfa_offset 48
348; CHECK-NEXT:    pushq %rbx
349; CHECK-NEXT:    .cfi_def_cfa_offset 56
350; CHECK-NEXT:    .cfi_offset %rbx, -56
351; CHECK-NEXT:    .cfi_offset %r12, -48
352; CHECK-NEXT:    .cfi_offset %r13, -40
353; CHECK-NEXT:    .cfi_offset %r14, -32
354; CHECK-NEXT:    .cfi_offset %r15, -24
355; CHECK-NEXT:    .cfi_offset %rbp, -16
356; CHECK-NEXT:    paddd %xmm1, %xmm0
357; CHECK-NEXT:    movd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Folded Spill
358; CHECK-NEXT:    #APP
359; CHECK-NEXT:    nop
360; CHECK-NEXT:    #NO_APP
361; CHECK-NEXT:    movl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 4-byte Reload
362; CHECK-NEXT:    popq %rbx
363; CHECK-NEXT:    .cfi_def_cfa_offset 48
364; CHECK-NEXT:    popq %r12
365; CHECK-NEXT:    .cfi_def_cfa_offset 40
366; CHECK-NEXT:    popq %r13
367; CHECK-NEXT:    .cfi_def_cfa_offset 32
368; CHECK-NEXT:    popq %r14
369; CHECK-NEXT:    .cfi_def_cfa_offset 24
370; CHECK-NEXT:    popq %r15
371; CHECK-NEXT:    .cfi_def_cfa_offset 16
372; CHECK-NEXT:    popq %rbp
373; CHECK-NEXT:    .cfi_def_cfa_offset 8
374; CHECK-NEXT:    retq
375  ; add forces execution domain
376  %1 = add <4 x i32> %a0, %a1
377  %2 = extractelement <4 x i32> %1, i32 0
378  %3 = tail call <2 x i64> asm sideeffect "nop", "=x,~{rax},~{rbx},~{rcx},~{rdx},~{rsi},~{rdi},~{rbp},~{r8},~{r9},~{r10},~{r11},~{r12},~{r13},~{r14},~{r15}"()
379  ret i32 %2
380}
381
382define <2 x i64> @stack_fold_movq_load(<2 x i64> %a0) {
383; CHECK-LABEL: stack_fold_movq_load:
384; CHECK:       # %bb.0:
385; CHECK-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
386; CHECK-NEXT:    #APP
387; CHECK-NEXT:    nop
388; CHECK-NEXT:    #NO_APP
389; CHECK-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
390; CHECK-NEXT:    # xmm0 = mem[0],zero
391; CHECK-NEXT:    pcmpeqd %xmm1, %xmm1
392; CHECK-NEXT:    psubq %xmm1, %xmm0
393; CHECK-NEXT:    retq
394  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
395  %2 = shufflevector <2 x i64> %a0, <2 x i64> zeroinitializer, <2 x i32> <i32 0, i32 2>
396  ; add forces execution domain
397  %3 = add <2 x i64> %2, <i64 1, i64 1>
398  ret <2 x i64> %3
399}
400
401define i64 @stack_fold_movq_store(<2 x i64> %a0, <2 x i64> %a1) {
402; CHECK-LABEL: stack_fold_movq_store:
403; CHECK:       # %bb.0:
404; CHECK-NEXT:    pushq %rbp
405; CHECK-NEXT:    .cfi_def_cfa_offset 16
406; CHECK-NEXT:    pushq %r15
407; CHECK-NEXT:    .cfi_def_cfa_offset 24
408; CHECK-NEXT:    pushq %r14
409; CHECK-NEXT:    .cfi_def_cfa_offset 32
410; CHECK-NEXT:    pushq %r13
411; CHECK-NEXT:    .cfi_def_cfa_offset 40
412; CHECK-NEXT:    pushq %r12
413; CHECK-NEXT:    .cfi_def_cfa_offset 48
414; CHECK-NEXT:    pushq %rbx
415; CHECK-NEXT:    .cfi_def_cfa_offset 56
416; CHECK-NEXT:    .cfi_offset %rbx, -56
417; CHECK-NEXT:    .cfi_offset %r12, -48
418; CHECK-NEXT:    .cfi_offset %r13, -40
419; CHECK-NEXT:    .cfi_offset %r14, -32
420; CHECK-NEXT:    .cfi_offset %r15, -24
421; CHECK-NEXT:    .cfi_offset %rbp, -16
422; CHECK-NEXT:    paddq %xmm1, %xmm0
423; CHECK-NEXT:    movq %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill
424; CHECK-NEXT:    #APP
425; CHECK-NEXT:    nop
426; CHECK-NEXT:    #NO_APP
427; CHECK-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
428; CHECK-NEXT:    popq %rbx
429; CHECK-NEXT:    .cfi_def_cfa_offset 48
430; CHECK-NEXT:    popq %r12
431; CHECK-NEXT:    .cfi_def_cfa_offset 40
432; CHECK-NEXT:    popq %r13
433; CHECK-NEXT:    .cfi_def_cfa_offset 32
434; CHECK-NEXT:    popq %r14
435; CHECK-NEXT:    .cfi_def_cfa_offset 24
436; CHECK-NEXT:    popq %r15
437; CHECK-NEXT:    .cfi_def_cfa_offset 16
438; CHECK-NEXT:    popq %rbp
439; CHECK-NEXT:    .cfi_def_cfa_offset 8
440; CHECK-NEXT:    retq
441  ; add forces execution domain
442  %1 = add <2 x i64> %a0, %a1
443  %2 = extractelement <2 x i64> %1, i32 0
444  %3 = tail call <2 x i64> asm sideeffect "nop", "=x,~{rax},~{rbx},~{rcx},~{rdx},~{rsi},~{rdi},~{rbp},~{r8},~{r9},~{r10},~{r11},~{r12},~{r13},~{r14},~{r15}"()
445  ret i64 %2
446}
447
448define <8 x i16> @stack_fold_mpsadbw(<16 x i8> %a0, <16 x i8> %a1) {
449; CHECK-LABEL: stack_fold_mpsadbw:
450; CHECK:       # %bb.0:
451; CHECK-NEXT:    movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
452; CHECK-NEXT:    #APP
453; CHECK-NEXT:    nop
454; CHECK-NEXT:    #NO_APP
455; CHECK-NEXT:    mpsadbw $7, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
456; CHECK-NEXT:    retq
457  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
458  %2 = call <8 x i16> @llvm.x86.sse41.mpsadbw(<16 x i8> %a0, <16 x i8> %a1, i8 7)
459  ret <8 x i16> %2
460}
461declare <8 x i16> @llvm.x86.sse41.mpsadbw(<16 x i8>, <16 x i8>, i8) nounwind readnone
462
463define <16 x i8> @stack_fold_pabsb(<16 x i8> %a0) {
464; CHECK-LABEL: stack_fold_pabsb:
465; CHECK:       # %bb.0:
466; CHECK-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
467; CHECK-NEXT:    #APP
468; CHECK-NEXT:    nop
469; CHECK-NEXT:    #NO_APP
470; CHECK-NEXT:    pabsb {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
471; CHECK-NEXT:    retq
472  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
473  %2 = icmp sgt <16 x i8> %a0, zeroinitializer
474  %3 = sub <16 x i8> zeroinitializer, %a0
475  %4 = select <16 x i1> %2, <16 x i8> %a0, <16 x i8> %3
476  ret <16 x i8> %4
477}
478
479define <4 x i32> @stack_fold_pabsd(<4 x i32> %a0) {
480; CHECK-LABEL: stack_fold_pabsd:
481; CHECK:       # %bb.0:
482; CHECK-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
483; CHECK-NEXT:    #APP
484; CHECK-NEXT:    nop
485; CHECK-NEXT:    #NO_APP
486; CHECK-NEXT:    pabsd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
487; CHECK-NEXT:    retq
488  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
489  %2 = icmp sgt <4 x i32> %a0, zeroinitializer
490  %3 = sub <4 x i32> zeroinitializer, %a0
491  %4 = select <4 x i1> %2, <4 x i32> %a0, <4 x i32> %3
492  ret <4 x i32> %4
493}
494
495define <8 x i16> @stack_fold_pabsw(<8 x i16> %a0) {
496; CHECK-LABEL: stack_fold_pabsw:
497; CHECK:       # %bb.0:
498; CHECK-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
499; CHECK-NEXT:    #APP
500; CHECK-NEXT:    nop
501; CHECK-NEXT:    #NO_APP
502; CHECK-NEXT:    pabsw {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
503; CHECK-NEXT:    retq
504  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
505  %2 = icmp sgt <8 x i16> %a0, zeroinitializer
506  %3 = sub <8 x i16> zeroinitializer, %a0
507  %4 = select <8 x i1> %2, <8 x i16> %a0, <8 x i16> %3
508  ret <8 x i16> %4
509}
510
511define <8 x i16> @stack_fold_packssdw(<4 x i32> %a0, <4 x i32> %a1) {
512; CHECK-LABEL: stack_fold_packssdw:
513; CHECK:       # %bb.0:
514; CHECK-NEXT:    movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
515; CHECK-NEXT:    #APP
516; CHECK-NEXT:    nop
517; CHECK-NEXT:    #NO_APP
518; CHECK-NEXT:    packssdw {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
519; CHECK-NEXT:    retq
520  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
521  %2 = call <8 x i16> @llvm.x86.sse2.packssdw.128(<4 x i32> %a0, <4 x i32> %a1)
522  ret <8 x i16> %2
523}
524declare <8 x i16> @llvm.x86.sse2.packssdw.128(<4 x i32>, <4 x i32>) nounwind readnone
525
526define <16 x i8> @stack_fold_packsswb(<8 x i16> %a0, <8 x i16> %a1) {
527; CHECK-LABEL: stack_fold_packsswb:
528; CHECK:       # %bb.0:
529; CHECK-NEXT:    movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
530; CHECK-NEXT:    #APP
531; CHECK-NEXT:    nop
532; CHECK-NEXT:    #NO_APP
533; CHECK-NEXT:    packsswb {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
534; CHECK-NEXT:    retq
535  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
536  %2 = call <16 x i8> @llvm.x86.sse2.packsswb.128(<8 x i16> %a0, <8 x i16> %a1)
537  ret <16 x i8> %2
538}
539declare <16 x i8> @llvm.x86.sse2.packsswb.128(<8 x i16>, <8 x i16>) nounwind readnone
540
541define <8 x i16> @stack_fold_packusdw(<4 x i32> %a0, <4 x i32> %a1) {
542; CHECK-LABEL: stack_fold_packusdw:
543; CHECK:       # %bb.0:
544; CHECK-NEXT:    movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
545; CHECK-NEXT:    #APP
546; CHECK-NEXT:    nop
547; CHECK-NEXT:    #NO_APP
548; CHECK-NEXT:    packusdw {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
549; CHECK-NEXT:    retq
550  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
551  %2 = call <8 x i16> @llvm.x86.sse41.packusdw(<4 x i32> %a0, <4 x i32> %a1)
552  ret <8 x i16> %2
553}
554declare <8 x i16> @llvm.x86.sse41.packusdw(<4 x i32>, <4 x i32>) nounwind readnone
555
556define <16 x i8> @stack_fold_packuswb(<8 x i16> %a0, <8 x i16> %a1) {
557; CHECK-LABEL: stack_fold_packuswb:
558; CHECK:       # %bb.0:
559; CHECK-NEXT:    movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
560; CHECK-NEXT:    #APP
561; CHECK-NEXT:    nop
562; CHECK-NEXT:    #NO_APP
563; CHECK-NEXT:    packuswb {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
564; CHECK-NEXT:    retq
565  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
566  %2 = call <16 x i8> @llvm.x86.sse2.packuswb.128(<8 x i16> %a0, <8 x i16> %a1)
567  ret <16 x i8> %2
568}
569declare <16 x i8> @llvm.x86.sse2.packuswb.128(<8 x i16>, <8 x i16>) nounwind readnone
570
571define <16 x i8> @stack_fold_paddb(<16 x i8> %a0, <16 x i8> %a1) {
572; CHECK-LABEL: stack_fold_paddb:
573; CHECK:       # %bb.0:
574; CHECK-NEXT:    movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
575; CHECK-NEXT:    #APP
576; CHECK-NEXT:    nop
577; CHECK-NEXT:    #NO_APP
578; CHECK-NEXT:    paddb {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
579; CHECK-NEXT:    retq
580  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
581  %2 = add <16 x i8> %a0, %a1
582  ret <16 x i8> %2
583}
584
585define <4 x i32> @stack_fold_paddd(<4 x i32> %a0, <4 x i32> %a1) {
586; CHECK-LABEL: stack_fold_paddd:
587; CHECK:       # %bb.0:
588; CHECK-NEXT:    movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
589; CHECK-NEXT:    #APP
590; CHECK-NEXT:    nop
591; CHECK-NEXT:    #NO_APP
592; CHECK-NEXT:    paddd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
593; CHECK-NEXT:    retq
594  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
595  %2 = add <4 x i32> %a0, %a1
596  ret <4 x i32> %2
597}
598
599define <2 x i64> @stack_fold_paddq(<2 x i64> %a0, <2 x i64> %a1) {
600; CHECK-LABEL: stack_fold_paddq:
601; CHECK:       # %bb.0:
602; CHECK-NEXT:    movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
603; CHECK-NEXT:    #APP
604; CHECK-NEXT:    nop
605; CHECK-NEXT:    #NO_APP
606; CHECK-NEXT:    paddq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
607; CHECK-NEXT:    retq
608  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
609  %2 = add <2 x i64> %a0, %a1
610  ret <2 x i64> %2
611}
612
613define <16 x i8> @stack_fold_paddsb(<16 x i8> %a0, <16 x i8> %a1) {
614; CHECK-LABEL: stack_fold_paddsb:
615; CHECK:       # %bb.0:
616; CHECK-NEXT:    movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
617; CHECK-NEXT:    #APP
618; CHECK-NEXT:    nop
619; CHECK-NEXT:    #NO_APP
620; CHECK-NEXT:    paddsb {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
621; CHECK-NEXT:    retq
622  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
623  %2 = call <16 x i8> @llvm.sadd.sat.v16i8(<16 x i8> %a0, <16 x i8> %a1)
624  ret <16 x i8> %2
625}
626declare <16 x i8> @llvm.sadd.sat.v16i8(<16 x i8>, <16 x i8>) nounwind readnone
627
628define <8 x i16> @stack_fold_paddsw(<8 x i16> %a0, <8 x i16> %a1) {
629; CHECK-LABEL: stack_fold_paddsw:
630; CHECK:       # %bb.0:
631; CHECK-NEXT:    movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
632; CHECK-NEXT:    #APP
633; CHECK-NEXT:    nop
634; CHECK-NEXT:    #NO_APP
635; CHECK-NEXT:    paddsw {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
636; CHECK-NEXT:    retq
637  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
638  %2 = call <8 x i16> @llvm.sadd.sat.v8i16(<8 x i16> %a0, <8 x i16> %a1)
639  ret <8 x i16> %2
640}
641declare <8 x i16> @llvm.sadd.sat.v8i16(<8 x i16>, <8 x i16>) nounwind readnone
642
643define <16 x i8> @stack_fold_paddusb(<16 x i8> %a0, <16 x i8> %a1) {
644; CHECK-LABEL: stack_fold_paddusb:
645; CHECK:       # %bb.0:
646; CHECK-NEXT:    movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
647; CHECK-NEXT:    #APP
648; CHECK-NEXT:    nop
649; CHECK-NEXT:    #NO_APP
650; CHECK-NEXT:    paddusb {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
651; CHECK-NEXT:    retq
652  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
653  %2 = call <16 x i8> @llvm.uadd.sat.v16i8(<16 x i8> %a0, <16 x i8> %a1)
654  ret <16 x i8> %2
655}
656declare <16 x i8> @llvm.uadd.sat.v16i8(<16 x i8>, <16 x i8>) nounwind readnone
657
658define <8 x i16> @stack_fold_paddusw(<8 x i16> %a0, <8 x i16> %a1) {
659; CHECK-LABEL: stack_fold_paddusw:
660; CHECK:       # %bb.0:
661; CHECK-NEXT:    movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
662; CHECK-NEXT:    #APP
663; CHECK-NEXT:    nop
664; CHECK-NEXT:    #NO_APP
665; CHECK-NEXT:    paddusw {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
666; CHECK-NEXT:    retq
667  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
668  %2 = call <8 x i16> @llvm.uadd.sat.v8i16(<8 x i16> %a0, <8 x i16> %a1)
669  ret <8 x i16> %2
670}
671declare <8 x i16> @llvm.uadd.sat.v8i16(<8 x i16>, <8 x i16>) nounwind readnone
672
673define <8 x i16> @stack_fold_paddw(<8 x i16> %a0, <8 x i16> %a1) {
674; CHECK-LABEL: stack_fold_paddw:
675; CHECK:       # %bb.0:
676; CHECK-NEXT:    movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
677; CHECK-NEXT:    #APP
678; CHECK-NEXT:    nop
679; CHECK-NEXT:    #NO_APP
680; CHECK-NEXT:    paddw {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
681; CHECK-NEXT:    retq
682  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
683  %2 = add <8 x i16> %a0, %a1
684  ret <8 x i16> %2
685}
686
687define <16 x i8> @stack_fold_palignr(<16 x i8> %a0, <16 x i8> %a1) {
688; CHECK-LABEL: stack_fold_palignr:
689; CHECK:       # %bb.0:
690; CHECK-NEXT:    movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
691; CHECK-NEXT:    #APP
692; CHECK-NEXT:    nop
693; CHECK-NEXT:    #NO_APP
694; CHECK-NEXT:    palignr $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
695; CHECK-NEXT:    # xmm0 = mem[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm0[0]
696; CHECK-NEXT:    retq
697  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
698  %2 = shufflevector <16 x i8> %a1, <16 x i8> %a0, <16 x i32> <i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16>
699  ret <16 x i8> %2
700}
701
702define <16 x i8> @stack_fold_pand(<16 x i8> %a0, <16 x i8> %a1) {
703; CHECK-LABEL: stack_fold_pand:
704; CHECK:       # %bb.0:
705; CHECK-NEXT:    movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
706; CHECK-NEXT:    #APP
707; CHECK-NEXT:    nop
708; CHECK-NEXT:    #NO_APP
709; CHECK-NEXT:    pand {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
710; CHECK-NEXT:    pcmpeqd %xmm1, %xmm1
711; CHECK-NEXT:    psubb %xmm1, %xmm0
712; CHECK-NEXT:    retq
713  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
714  %2 = and <16 x i8> %a0, %a1
715  ; add forces execution domain
716  %3 = add <16 x i8> %2, <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
717  ret <16 x i8> %3
718}
719
720define <16 x i8> @stack_fold_pandn(<16 x i8> %a0, <16 x i8> %a1) {
721; CHECK-LABEL: stack_fold_pandn:
722; CHECK:       # %bb.0:
723; CHECK-NEXT:    movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
724; CHECK-NEXT:    #APP
725; CHECK-NEXT:    nop
726; CHECK-NEXT:    #NO_APP
727; CHECK-NEXT:    pandn {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
728; CHECK-NEXT:    pcmpeqd %xmm1, %xmm1
729; CHECK-NEXT:    psubb %xmm1, %xmm0
730; CHECK-NEXT:    retq
731  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
732  %2 = xor <16 x i8> %a0, <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>
733  %3 = and <16 x i8> %2, %a1
734  ; add forces execution domain
735  %4 = add <16 x i8> %3, <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
736  ret <16 x i8> %4
737}
738
739define <16 x i8> @stack_fold_pavgb(<16 x i8> %a0, <16 x i8> %a1) {
740; CHECK-LABEL: stack_fold_pavgb:
741; CHECK:       # %bb.0:
742; CHECK-NEXT:    movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
743; CHECK-NEXT:    #APP
744; CHECK-NEXT:    nop
745; CHECK-NEXT:    #NO_APP
746; CHECK-NEXT:    pavgb {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
747; CHECK-NEXT:    retq
748  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
749  %2 = zext <16 x i8> %a0 to <16 x i16>
750  %3 = zext <16 x i8> %a1 to <16 x i16>
751  %4 = add <16 x i16> %2, %3
752  %5 = add <16 x i16> %4, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
753  %6 = lshr <16 x i16> %5, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
754  %7 = trunc <16 x i16> %6 to <16 x i8>
755  ret <16 x i8> %7
756}
757
758define <8 x i16> @stack_fold_pavgw(<8 x i16> %a0, <8 x i16> %a1) {
759; CHECK-LABEL: stack_fold_pavgw:
760; CHECK:       # %bb.0:
761; CHECK-NEXT:    movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
762; CHECK-NEXT:    #APP
763; CHECK-NEXT:    nop
764; CHECK-NEXT:    #NO_APP
765; CHECK-NEXT:    pavgw {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
766; CHECK-NEXT:    retq
767  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
768  %2 = zext <8 x i16> %a0 to <8 x i32>
769  %3 = zext <8 x i16> %a1 to <8 x i32>
770  %4 = add <8 x i32> %2, %3
771  %5 = add <8 x i32> %4, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
772  %6 = lshr <8 x i32> %5, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
773  %7 = trunc <8 x i32> %6 to <8 x i16>
774  ret <8 x i16> %7
775}
776
777define <16 x i8> @stack_fold_pblendvb(<16 x i8> %a0, <16 x i8> %a1, <16 x i8> %c) {
778; CHECK-LABEL: stack_fold_pblendvb:
779; CHECK:       # %bb.0:
780; CHECK-NEXT:    movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
781; CHECK-NEXT:    movdqa %xmm1, %xmm2
782; CHECK-NEXT:    #APP
783; CHECK-NEXT:    nop
784; CHECK-NEXT:    #NO_APP
785; CHECK-NEXT:    pblendvb %xmm0, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload
786; CHECK-NEXT:    movdqa %xmm2, %xmm0
787; CHECK-NEXT:    retq
788  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
789  %2 = call <16 x i8> @llvm.x86.sse41.pblendvb(<16 x i8> %a1, <16 x i8> %c, <16 x i8> %a0)
790  ret <16 x i8> %2
791}
792declare <16 x i8> @llvm.x86.sse41.pblendvb(<16 x i8>, <16 x i8>, <16 x i8>) nounwind readnone
793
794define <8 x i16> @stack_fold_pblendw(<8 x i16> %a0, <8 x i16> %a1) {
795; CHECK-LABEL: stack_fold_pblendw:
796; CHECK:       # %bb.0:
797; CHECK-NEXT:    movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
798; CHECK-NEXT:    #APP
799; CHECK-NEXT:    nop
800; CHECK-NEXT:    #NO_APP
801; CHECK-NEXT:    pblendw $7, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
802; CHECK-NEXT:    # xmm0 = mem[0,1,2],xmm0[3,4,5,6,7]
803; CHECK-NEXT:    retq
804  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
805  %2 = shufflevector <8 x i16> %a0, <8 x i16> %a1, <8 x i32> <i32 8, i32 9, i32 10, i32 3, i32 4, i32 5, i32 6, i32 7>
806  ret <8 x i16> %2
807}
808
809define <2 x i64> @stack_fold_pclmulqdq(<2 x i64> %a0, <2 x i64> %a1) {
810; CHECK-LABEL: stack_fold_pclmulqdq:
811; CHECK:       # %bb.0:
812; CHECK-NEXT:    movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
813; CHECK-NEXT:    #APP
814; CHECK-NEXT:    nop
815; CHECK-NEXT:    #NO_APP
816; CHECK-NEXT:    pclmulqdq $0, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
817; CHECK-NEXT:    retq
818  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
819  %2 = call <2 x i64> @llvm.x86.pclmulqdq(<2 x i64> %a0, <2 x i64> %a1, i8 0)
820  ret <2 x i64> %2
821}
822declare <2 x i64> @llvm.x86.pclmulqdq(<2 x i64>, <2 x i64>, i8) nounwind readnone
823
824define <16 x i8> @stack_fold_pcmpeqb(<16 x i8> %a0, <16 x i8> %a1) {
825; CHECK-LABEL: stack_fold_pcmpeqb:
826; CHECK:       # %bb.0:
827; CHECK-NEXT:    movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
828; CHECK-NEXT:    #APP
829; CHECK-NEXT:    nop
830; CHECK-NEXT:    #NO_APP
831; CHECK-NEXT:    pcmpeqb {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
832; CHECK-NEXT:    retq
833  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
834  %2 = icmp eq <16 x i8> %a0, %a1
835  %3 = sext <16 x i1> %2 to <16 x i8>
836  ret <16 x i8> %3
837}
838
839define <4 x i32> @stack_fold_pcmpeqd(<4 x i32> %a0, <4 x i32> %a1) {
840; CHECK-LABEL: stack_fold_pcmpeqd:
841; CHECK:       # %bb.0:
842; CHECK-NEXT:    movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
843; CHECK-NEXT:    #APP
844; CHECK-NEXT:    nop
845; CHECK-NEXT:    #NO_APP
846; CHECK-NEXT:    pcmpeqd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
847; CHECK-NEXT:    retq
848  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
849  %2 = icmp eq <4 x i32> %a0, %a1
850  %3 = sext <4 x i1> %2 to <4 x i32>
851  ret <4 x i32> %3
852}
853
854define <2 x i64> @stack_fold_pcmpeqq(<2 x i64> %a0, <2 x i64> %a1) {
855; CHECK-LABEL: stack_fold_pcmpeqq:
856; CHECK:       # %bb.0:
857; CHECK-NEXT:    movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
858; CHECK-NEXT:    #APP
859; CHECK-NEXT:    nop
860; CHECK-NEXT:    #NO_APP
861; CHECK-NEXT:    pcmpeqq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
862; CHECK-NEXT:    retq
863  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
864  %2 = icmp eq <2 x i64> %a0, %a1
865  %3 = sext <2 x i1> %2 to <2 x i64>
866  ret <2 x i64> %3
867}
868
869define <8 x i16> @stack_fold_pcmpeqw(<8 x i16> %a0, <8 x i16> %a1) {
870; CHECK-LABEL: stack_fold_pcmpeqw:
871; CHECK:       # %bb.0:
872; CHECK-NEXT:    movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
873; CHECK-NEXT:    #APP
874; CHECK-NEXT:    nop
875; CHECK-NEXT:    #NO_APP
876; CHECK-NEXT:    pcmpeqw {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
877; CHECK-NEXT:    retq
878  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
879  %2 = icmp eq <8 x i16> %a0, %a1
880  %3 = sext <8 x i1> %2 to <8 x i16>
881  ret <8 x i16> %3
882}
883
884define i32 @stack_fold_pcmpestri(<16 x i8> %a0, <16 x i8> %a1) {
885; CHECK-LABEL: stack_fold_pcmpestri:
886; CHECK:       # %bb.0:
887; CHECK-NEXT:    movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
888; CHECK-NEXT:    #APP
889; CHECK-NEXT:    nop
890; CHECK-NEXT:    #NO_APP
891; CHECK-NEXT:    movl $7, %eax
892; CHECK-NEXT:    movl $7, %edx
893; CHECK-NEXT:    pcmpestri $7, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
894; CHECK-NEXT:    movl %ecx, %eax
895; CHECK-NEXT:    retq
896  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{rax},~{flags}"()
897  %2 = call i32 @llvm.x86.sse42.pcmpestri128(<16 x i8> %a0, i32 7, <16 x i8> %a1, i32 7, i8 7)
898  ret i32 %2
899}
900declare i32 @llvm.x86.sse42.pcmpestri128(<16 x i8>, i32, <16 x i8>, i32, i8) nounwind readnone
901
902define <16 x i8> @stack_fold_pcmpestrm(<16 x i8> %a0, <16 x i8> %a1) {
903; CHECK-LABEL: stack_fold_pcmpestrm:
904; CHECK:       # %bb.0:
905; CHECK-NEXT:    movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
906; CHECK-NEXT:    #APP
907; CHECK-NEXT:    nop
908; CHECK-NEXT:    #NO_APP
909; CHECK-NEXT:    movl $7, %eax
910; CHECK-NEXT:    movl $7, %edx
911; CHECK-NEXT:    pcmpestrm $7, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
912; CHECK-NEXT:    retq
913  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{rax},~{flags}"()
914  %2 = call <16 x i8> @llvm.x86.sse42.pcmpestrm128(<16 x i8> %a0, i32 7, <16 x i8> %a1, i32 7, i8 7)
915  ret <16 x i8> %2
916}
917declare <16 x i8> @llvm.x86.sse42.pcmpestrm128(<16 x i8>, i32, <16 x i8>, i32, i8) nounwind readnone
918
919define <16 x i8> @stack_fold_pcmpgtb(<16 x i8> %a0, <16 x i8> %a1) {
920; CHECK-LABEL: stack_fold_pcmpgtb:
921; CHECK:       # %bb.0:
922; CHECK-NEXT:    movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
923; CHECK-NEXT:    #APP
924; CHECK-NEXT:    nop
925; CHECK-NEXT:    #NO_APP
926; CHECK-NEXT:    pcmpgtb {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
927; CHECK-NEXT:    retq
928  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
929  %2 = icmp sgt <16 x i8> %a0, %a1
930  %3 = sext <16 x i1> %2 to <16 x i8>
931  ret <16 x i8> %3
932}
933
934define <4 x i32> @stack_fold_pcmpgtd(<4 x i32> %a0, <4 x i32> %a1) {
935; CHECK-LABEL: stack_fold_pcmpgtd:
936; CHECK:       # %bb.0:
937; CHECK-NEXT:    movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
938; CHECK-NEXT:    #APP
939; CHECK-NEXT:    nop
940; CHECK-NEXT:    #NO_APP
941; CHECK-NEXT:    pcmpgtd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
942; CHECK-NEXT:    retq
943  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
944  %2 = icmp sgt <4 x i32> %a0, %a1
945  %3 = sext <4 x i1> %2 to <4 x i32>
946  ret <4 x i32> %3
947}
948
949define <2 x i64> @stack_fold_pcmpgtq(<2 x i64> %a0, <2 x i64> %a1) {
950; CHECK-LABEL: stack_fold_pcmpgtq:
951; CHECK:       # %bb.0:
952; CHECK-NEXT:    movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
953; CHECK-NEXT:    #APP
954; CHECK-NEXT:    nop
955; CHECK-NEXT:    #NO_APP
956; CHECK-NEXT:    pcmpgtq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
957; CHECK-NEXT:    retq
958  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
959  %2 = icmp sgt <2 x i64> %a0, %a1
960  %3 = sext <2 x i1> %2 to <2 x i64>
961  ret <2 x i64> %3
962}
963
964define <8 x i16> @stack_fold_pcmpgtw(<8 x i16> %a0, <8 x i16> %a1) {
965; CHECK-LABEL: stack_fold_pcmpgtw:
966; CHECK:       # %bb.0:
967; CHECK-NEXT:    movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
968; CHECK-NEXT:    #APP
969; CHECK-NEXT:    nop
970; CHECK-NEXT:    #NO_APP
971; CHECK-NEXT:    pcmpgtw {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
972; CHECK-NEXT:    retq
973  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
974  %2 = icmp sgt <8 x i16> %a0, %a1
975  %3 = sext <8 x i1> %2 to <8 x i16>
976  ret <8 x i16> %3
977}
978
979define i32 @stack_fold_pcmpistri(<16 x i8> %a0, <16 x i8> %a1) {
980; CHECK-LABEL: stack_fold_pcmpistri:
981; CHECK:       # %bb.0:
982; CHECK-NEXT:    movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
983; CHECK-NEXT:    #APP
984; CHECK-NEXT:    nop
985; CHECK-NEXT:    #NO_APP
986; CHECK-NEXT:    pcmpistri $7, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
987; CHECK-NEXT:    movl %ecx, %eax
988; CHECK-NEXT:    retq
989  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
990  %2 = call i32 @llvm.x86.sse42.pcmpistri128(<16 x i8> %a0, <16 x i8> %a1, i8 7)
991  ret i32 %2
992}
993declare i32 @llvm.x86.sse42.pcmpistri128(<16 x i8>, <16 x i8>, i8) nounwind readnone
994
995define <16 x i8> @stack_fold_pcmpistrm(<16 x i8> %a0, <16 x i8> %a1) {
996; CHECK-LABEL: stack_fold_pcmpistrm:
997; CHECK:       # %bb.0:
998; CHECK-NEXT:    movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
999; CHECK-NEXT:    #APP
1000; CHECK-NEXT:    nop
1001; CHECK-NEXT:    #NO_APP
1002; CHECK-NEXT:    pcmpistrm $7, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
1003; CHECK-NEXT:    retq
1004  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
1005  %2 = call <16 x i8> @llvm.x86.sse42.pcmpistrm128(<16 x i8> %a0, <16 x i8> %a1, i8 7)
1006  ret <16 x i8> %2
1007}
1008declare <16 x i8> @llvm.x86.sse42.pcmpistrm128(<16 x i8>, <16 x i8>, i8) nounwind readnone
1009
1010; TODO stack_fold_pextrb
1011
1012; We can't naively fold pextrw as it only writes to a 16-bit memory location
1013; even though it can store to a 32-bit register.
1014define i16 @stack_fold_pextrw(<8 x i16> %a0) {
1015; CHECK-LABEL: stack_fold_pextrw:
1016; CHECK:       # %bb.0: # %entry
1017; CHECK-NEXT:    pushq %rbp
1018; CHECK-NEXT:    .cfi_def_cfa_offset 16
1019; CHECK-NEXT:    pushq %r15
1020; CHECK-NEXT:    .cfi_def_cfa_offset 24
1021; CHECK-NEXT:    pushq %r14
1022; CHECK-NEXT:    .cfi_def_cfa_offset 32
1023; CHECK-NEXT:    pushq %r13
1024; CHECK-NEXT:    .cfi_def_cfa_offset 40
1025; CHECK-NEXT:    pushq %r12
1026; CHECK-NEXT:    .cfi_def_cfa_offset 48
1027; CHECK-NEXT:    pushq %rbx
1028; CHECK-NEXT:    .cfi_def_cfa_offset 56
1029; CHECK-NEXT:    .cfi_offset %rbx, -56
1030; CHECK-NEXT:    .cfi_offset %r12, -48
1031; CHECK-NEXT:    .cfi_offset %r13, -40
1032; CHECK-NEXT:    .cfi_offset %r14, -32
1033; CHECK-NEXT:    .cfi_offset %r15, -24
1034; CHECK-NEXT:    .cfi_offset %rbp, -16
1035; CHECK-NEXT:    pextrw $1, %xmm0, %eax
1036; CHECK-NEXT:    addl $2, %eax
1037; CHECK-NEXT:    movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
1038; CHECK-NEXT:    #APP
1039; CHECK-NEXT:    nop
1040; CHECK-NEXT:    #NO_APP
1041; CHECK-NEXT:    movl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 4-byte Reload
1042; CHECK-NEXT:    # kill: def $ax killed $ax killed $eax
1043; CHECK-NEXT:    popq %rbx
1044; CHECK-NEXT:    .cfi_def_cfa_offset 48
1045; CHECK-NEXT:    popq %r12
1046; CHECK-NEXT:    .cfi_def_cfa_offset 40
1047; CHECK-NEXT:    popq %r13
1048; CHECK-NEXT:    .cfi_def_cfa_offset 32
1049; CHECK-NEXT:    popq %r14
1050; CHECK-NEXT:    .cfi_def_cfa_offset 24
1051; CHECK-NEXT:    popq %r15
1052; CHECK-NEXT:    .cfi_def_cfa_offset 16
1053; CHECK-NEXT:    popq %rbp
1054; CHECK-NEXT:    .cfi_def_cfa_offset 8
1055; CHECK-NEXT:    retq
1056entry:
1057; add forces execution domain
1058  %add = add <8 x i16> %a0, <i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 8>
1059  %extract = extractelement <8 x i16> %add, i32 1
1060  %asm = tail call <2 x i64> asm sideeffect "nop", "=x,~{rax},~{rbx},~{rcx},~{rdx},~{rsi},~{rdi},~{rbp},~{r8},~{r9},~{r10},~{r11},~{r12},~{r13},~{r14},~{r15}"()
1061  ret i16 %extract
1062}
1063
1064define i32 @stack_fold_pextrd(<4 x i32> %a0, <4 x i32> %a1) {
1065; CHECK-LABEL: stack_fold_pextrd:
1066; CHECK:       # %bb.0:
1067; CHECK-NEXT:    pushq %rbp
1068; CHECK-NEXT:    .cfi_def_cfa_offset 16
1069; CHECK-NEXT:    pushq %r15
1070; CHECK-NEXT:    .cfi_def_cfa_offset 24
1071; CHECK-NEXT:    pushq %r14
1072; CHECK-NEXT:    .cfi_def_cfa_offset 32
1073; CHECK-NEXT:    pushq %r13
1074; CHECK-NEXT:    .cfi_def_cfa_offset 40
1075; CHECK-NEXT:    pushq %r12
1076; CHECK-NEXT:    .cfi_def_cfa_offset 48
1077; CHECK-NEXT:    pushq %rbx
1078; CHECK-NEXT:    .cfi_def_cfa_offset 56
1079; CHECK-NEXT:    .cfi_offset %rbx, -56
1080; CHECK-NEXT:    .cfi_offset %r12, -48
1081; CHECK-NEXT:    .cfi_offset %r13, -40
1082; CHECK-NEXT:    .cfi_offset %r14, -32
1083; CHECK-NEXT:    .cfi_offset %r15, -24
1084; CHECK-NEXT:    .cfi_offset %rbp, -16
1085; CHECK-NEXT:    paddd %xmm1, %xmm0
1086; CHECK-NEXT:    pextrd $1, %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Folded Spill
1087; CHECK-NEXT:    #APP
1088; CHECK-NEXT:    nop
1089; CHECK-NEXT:    #NO_APP
1090; CHECK-NEXT:    movl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 4-byte Reload
1091; CHECK-NEXT:    popq %rbx
1092; CHECK-NEXT:    .cfi_def_cfa_offset 48
1093; CHECK-NEXT:    popq %r12
1094; CHECK-NEXT:    .cfi_def_cfa_offset 40
1095; CHECK-NEXT:    popq %r13
1096; CHECK-NEXT:    .cfi_def_cfa_offset 32
1097; CHECK-NEXT:    popq %r14
1098; CHECK-NEXT:    .cfi_def_cfa_offset 24
1099; CHECK-NEXT:    popq %r15
1100; CHECK-NEXT:    .cfi_def_cfa_offset 16
1101; CHECK-NEXT:    popq %rbp
1102; CHECK-NEXT:    .cfi_def_cfa_offset 8
1103; CHECK-NEXT:    retq
1104  ; add forces execution domain
1105  %1 = add <4 x i32> %a0, %a1
1106  %2 = extractelement <4 x i32> %1, i32 1
1107  %3 = tail call <2 x i64> asm sideeffect "nop", "=x,~{rax},~{rbx},~{rcx},~{rdx},~{rsi},~{rdi},~{rbp},~{r8},~{r9},~{r10},~{r11},~{r12},~{r13},~{r14},~{r15}"()
1108  ret i32 %2
1109}
1110
1111define i64 @stack_fold_pextrq(<2 x i64> %a0) {
1112; CHECK-LABEL: stack_fold_pextrq:
1113; CHECK:       # %bb.0:
1114; CHECK-NEXT:    pushq %rbp
1115; CHECK-NEXT:    .cfi_def_cfa_offset 16
1116; CHECK-NEXT:    pushq %r15
1117; CHECK-NEXT:    .cfi_def_cfa_offset 24
1118; CHECK-NEXT:    pushq %r14
1119; CHECK-NEXT:    .cfi_def_cfa_offset 32
1120; CHECK-NEXT:    pushq %r13
1121; CHECK-NEXT:    .cfi_def_cfa_offset 40
1122; CHECK-NEXT:    pushq %r12
1123; CHECK-NEXT:    .cfi_def_cfa_offset 48
1124; CHECK-NEXT:    pushq %rbx
1125; CHECK-NEXT:    .cfi_def_cfa_offset 56
1126; CHECK-NEXT:    .cfi_offset %rbx, -56
1127; CHECK-NEXT:    .cfi_offset %r12, -48
1128; CHECK-NEXT:    .cfi_offset %r13, -40
1129; CHECK-NEXT:    .cfi_offset %r14, -32
1130; CHECK-NEXT:    .cfi_offset %r15, -24
1131; CHECK-NEXT:    .cfi_offset %rbp, -16
1132; CHECK-NEXT:    pextrq $1, %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill
1133; CHECK-NEXT:    #APP
1134; CHECK-NEXT:    nop
1135; CHECK-NEXT:    #NO_APP
1136; CHECK-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
1137; CHECK-NEXT:    popq %rbx
1138; CHECK-NEXT:    .cfi_def_cfa_offset 48
1139; CHECK-NEXT:    popq %r12
1140; CHECK-NEXT:    .cfi_def_cfa_offset 40
1141; CHECK-NEXT:    popq %r13
1142; CHECK-NEXT:    .cfi_def_cfa_offset 32
1143; CHECK-NEXT:    popq %r14
1144; CHECK-NEXT:    .cfi_def_cfa_offset 24
1145; CHECK-NEXT:    popq %r15
1146; CHECK-NEXT:    .cfi_def_cfa_offset 16
1147; CHECK-NEXT:    popq %rbp
1148; CHECK-NEXT:    .cfi_def_cfa_offset 8
1149; CHECK-NEXT:    retq
1150  %1 = extractelement <2 x i64> %a0, i32 1
1151  %2 = tail call <2 x i64> asm sideeffect "nop", "=x,~{rax},~{rbx},~{rcx},~{rdx},~{rsi},~{rdi},~{rbp},~{r8},~{r9},~{r10},~{r11},~{r12},~{r13},~{r14},~{r15}"()
1152  ret i64 %1
1153}
1154
1155define <4 x i32> @stack_fold_phaddd(<4 x i32> %a0, <4 x i32> %a1) {
1156; CHECK-LABEL: stack_fold_phaddd:
1157; CHECK:       # %bb.0:
1158; CHECK-NEXT:    movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1159; CHECK-NEXT:    #APP
1160; CHECK-NEXT:    nop
1161; CHECK-NEXT:    #NO_APP
1162; CHECK-NEXT:    phaddd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
1163; CHECK-NEXT:    retq
1164  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
1165  %2 = call <4 x i32> @llvm.x86.ssse3.phadd.d.128(<4 x i32> %a0, <4 x i32> %a1)
1166  ret <4 x i32> %2
1167}
1168declare <4 x i32> @llvm.x86.ssse3.phadd.d.128(<4 x i32>, <4 x i32>) nounwind readnone
1169
1170define <8 x i16> @stack_fold_phaddsw(<8 x i16> %a0, <8 x i16> %a1) {
1171; CHECK-LABEL: stack_fold_phaddsw:
1172; CHECK:       # %bb.0:
1173; CHECK-NEXT:    movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1174; CHECK-NEXT:    #APP
1175; CHECK-NEXT:    nop
1176; CHECK-NEXT:    #NO_APP
1177; CHECK-NEXT:    phaddsw {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
1178; CHECK-NEXT:    retq
1179  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
1180  %2 = call <8 x i16> @llvm.x86.ssse3.phadd.sw.128(<8 x i16> %a0, <8 x i16> %a1)
1181  ret <8 x i16> %2
1182}
1183declare <8 x i16> @llvm.x86.ssse3.phadd.sw.128(<8 x i16>, <8 x i16>) nounwind readnone
1184
1185define <8 x i16> @stack_fold_phaddw(<8 x i16> %a0, <8 x i16> %a1) {
1186; CHECK-LABEL: stack_fold_phaddw:
1187; CHECK:       # %bb.0:
1188; CHECK-NEXT:    movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1189; CHECK-NEXT:    #APP
1190; CHECK-NEXT:    nop
1191; CHECK-NEXT:    #NO_APP
1192; CHECK-NEXT:    phaddw {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
1193; CHECK-NEXT:    retq
1194  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
1195  %2 = call <8 x i16> @llvm.x86.ssse3.phadd.w.128(<8 x i16> %a0, <8 x i16> %a1)
1196  ret <8 x i16> %2
1197}
1198declare <8 x i16> @llvm.x86.ssse3.phadd.w.128(<8 x i16>, <8 x i16>) nounwind readnone
1199
1200define <8 x i16> @stack_fold_phminposuw(<8 x i16> %a0) {
1201; CHECK-LABEL: stack_fold_phminposuw:
1202; CHECK:       # %bb.0:
1203; CHECK-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1204; CHECK-NEXT:    #APP
1205; CHECK-NEXT:    nop
1206; CHECK-NEXT:    #NO_APP
1207; CHECK-NEXT:    phminposuw {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
1208; CHECK-NEXT:    retq
1209  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
1210  %2 = call <8 x i16> @llvm.x86.sse41.phminposuw(<8 x i16> %a0)
1211  ret <8 x i16> %2
1212}
1213declare <8 x i16> @llvm.x86.sse41.phminposuw(<8 x i16>) nounwind readnone
1214
1215define <4 x i32> @stack_fold_phsubd(<4 x i32> %a0, <4 x i32> %a1) {
1216; CHECK-LABEL: stack_fold_phsubd:
1217; CHECK:       # %bb.0:
1218; CHECK-NEXT:    movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1219; CHECK-NEXT:    #APP
1220; CHECK-NEXT:    nop
1221; CHECK-NEXT:    #NO_APP
1222; CHECK-NEXT:    phsubd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
1223; CHECK-NEXT:    retq
1224  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
1225  %2 = call <4 x i32> @llvm.x86.ssse3.phsub.d.128(<4 x i32> %a0, <4 x i32> %a1)
1226  ret <4 x i32> %2
1227}
1228declare <4 x i32> @llvm.x86.ssse3.phsub.d.128(<4 x i32>, <4 x i32>) nounwind readnone
1229
1230define <8 x i16> @stack_fold_phsubsw(<8 x i16> %a0, <8 x i16> %a1) {
1231; CHECK-LABEL: stack_fold_phsubsw:
1232; CHECK:       # %bb.0:
1233; CHECK-NEXT:    movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1234; CHECK-NEXT:    #APP
1235; CHECK-NEXT:    nop
1236; CHECK-NEXT:    #NO_APP
1237; CHECK-NEXT:    phsubsw {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
1238; CHECK-NEXT:    retq
1239  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
1240  %2 = call <8 x i16> @llvm.x86.ssse3.phsub.sw.128(<8 x i16> %a0, <8 x i16> %a1)
1241  ret <8 x i16> %2
1242}
1243declare <8 x i16> @llvm.x86.ssse3.phsub.sw.128(<8 x i16>, <8 x i16>) nounwind readnone
1244
1245define <8 x i16> @stack_fold_phsubw(<8 x i16> %a0, <8 x i16> %a1) {
1246; CHECK-LABEL: stack_fold_phsubw:
1247; CHECK:       # %bb.0:
1248; CHECK-NEXT:    movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1249; CHECK-NEXT:    #APP
1250; CHECK-NEXT:    nop
1251; CHECK-NEXT:    #NO_APP
1252; CHECK-NEXT:    phsubw {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
1253; CHECK-NEXT:    retq
1254  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
1255  %2 = call <8 x i16> @llvm.x86.ssse3.phsub.w.128(<8 x i16> %a0, <8 x i16> %a1)
1256  ret <8 x i16> %2
1257}
1258declare <8 x i16> @llvm.x86.ssse3.phsub.w.128(<8 x i16>, <8 x i16>) nounwind readnone
1259
1260define <16 x i8> @stack_fold_pinsrb(<16 x i8> %a0, i8 %a1) {
1261; CHECK-LABEL: stack_fold_pinsrb:
1262; CHECK:       # %bb.0:
1263; CHECK-NEXT:    pushq %rbp
1264; CHECK-NEXT:    .cfi_def_cfa_offset 16
1265; CHECK-NEXT:    pushq %r15
1266; CHECK-NEXT:    .cfi_def_cfa_offset 24
1267; CHECK-NEXT:    pushq %r14
1268; CHECK-NEXT:    .cfi_def_cfa_offset 32
1269; CHECK-NEXT:    pushq %r13
1270; CHECK-NEXT:    .cfi_def_cfa_offset 40
1271; CHECK-NEXT:    pushq %r12
1272; CHECK-NEXT:    .cfi_def_cfa_offset 48
1273; CHECK-NEXT:    pushq %rbx
1274; CHECK-NEXT:    .cfi_def_cfa_offset 56
1275; CHECK-NEXT:    .cfi_offset %rbx, -56
1276; CHECK-NEXT:    .cfi_offset %r12, -48
1277; CHECK-NEXT:    .cfi_offset %r13, -40
1278; CHECK-NEXT:    .cfi_offset %r14, -32
1279; CHECK-NEXT:    .cfi_offset %r15, -24
1280; CHECK-NEXT:    .cfi_offset %rbp, -16
1281; CHECK-NEXT:    movl %edi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
1282; CHECK-NEXT:    #APP
1283; CHECK-NEXT:    nop
1284; CHECK-NEXT:    #NO_APP
1285; CHECK-NEXT:    pinsrb $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Folded Reload
1286; CHECK-NEXT:    popq %rbx
1287; CHECK-NEXT:    .cfi_def_cfa_offset 48
1288; CHECK-NEXT:    popq %r12
1289; CHECK-NEXT:    .cfi_def_cfa_offset 40
1290; CHECK-NEXT:    popq %r13
1291; CHECK-NEXT:    .cfi_def_cfa_offset 32
1292; CHECK-NEXT:    popq %r14
1293; CHECK-NEXT:    .cfi_def_cfa_offset 24
1294; CHECK-NEXT:    popq %r15
1295; CHECK-NEXT:    .cfi_def_cfa_offset 16
1296; CHECK-NEXT:    popq %rbp
1297; CHECK-NEXT:    .cfi_def_cfa_offset 8
1298; CHECK-NEXT:    retq
1299  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{rax},~{rbx},~{rcx},~{rdx},~{rsi},~{rdi},~{rbp},~{r8},~{r9},~{r10},~{r11},~{r12},~{r13},~{r14},~{r15}"()
1300  %2 = insertelement <16 x i8> %a0, i8 %a1, i32 1
1301  ret <16 x i8> %2
1302}
1303
1304define <4 x i32> @stack_fold_pinsrd(<4 x i32> %a0, i32 %a1) {
1305; CHECK-LABEL: stack_fold_pinsrd:
1306; CHECK:       # %bb.0:
1307; CHECK-NEXT:    pushq %rbp
1308; CHECK-NEXT:    .cfi_def_cfa_offset 16
1309; CHECK-NEXT:    pushq %r15
1310; CHECK-NEXT:    .cfi_def_cfa_offset 24
1311; CHECK-NEXT:    pushq %r14
1312; CHECK-NEXT:    .cfi_def_cfa_offset 32
1313; CHECK-NEXT:    pushq %r13
1314; CHECK-NEXT:    .cfi_def_cfa_offset 40
1315; CHECK-NEXT:    pushq %r12
1316; CHECK-NEXT:    .cfi_def_cfa_offset 48
1317; CHECK-NEXT:    pushq %rbx
1318; CHECK-NEXT:    .cfi_def_cfa_offset 56
1319; CHECK-NEXT:    .cfi_offset %rbx, -56
1320; CHECK-NEXT:    .cfi_offset %r12, -48
1321; CHECK-NEXT:    .cfi_offset %r13, -40
1322; CHECK-NEXT:    .cfi_offset %r14, -32
1323; CHECK-NEXT:    .cfi_offset %r15, -24
1324; CHECK-NEXT:    .cfi_offset %rbp, -16
1325; CHECK-NEXT:    movl %edi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
1326; CHECK-NEXT:    #APP
1327; CHECK-NEXT:    nop
1328; CHECK-NEXT:    #NO_APP
1329; CHECK-NEXT:    pinsrd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Folded Reload
1330; CHECK-NEXT:    popq %rbx
1331; CHECK-NEXT:    .cfi_def_cfa_offset 48
1332; CHECK-NEXT:    popq %r12
1333; CHECK-NEXT:    .cfi_def_cfa_offset 40
1334; CHECK-NEXT:    popq %r13
1335; CHECK-NEXT:    .cfi_def_cfa_offset 32
1336; CHECK-NEXT:    popq %r14
1337; CHECK-NEXT:    .cfi_def_cfa_offset 24
1338; CHECK-NEXT:    popq %r15
1339; CHECK-NEXT:    .cfi_def_cfa_offset 16
1340; CHECK-NEXT:    popq %rbp
1341; CHECK-NEXT:    .cfi_def_cfa_offset 8
1342; CHECK-NEXT:    retq
1343  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{rax},~{rbx},~{rcx},~{rdx},~{rsi},~{rdi},~{rbp},~{r8},~{r9},~{r10},~{r11},~{r12},~{r13},~{r14},~{r15}"()
1344  %2 = insertelement <4 x i32> %a0, i32 %a1, i32 1
1345  ret <4 x i32> %2
1346}
1347
1348define <2 x i64> @stack_fold_pinsrq(<2 x i64> %a0, i64 %a1) {
1349; CHECK-LABEL: stack_fold_pinsrq:
1350; CHECK:       # %bb.0:
1351; CHECK-NEXT:    pushq %rbp
1352; CHECK-NEXT:    .cfi_def_cfa_offset 16
1353; CHECK-NEXT:    pushq %r15
1354; CHECK-NEXT:    .cfi_def_cfa_offset 24
1355; CHECK-NEXT:    pushq %r14
1356; CHECK-NEXT:    .cfi_def_cfa_offset 32
1357; CHECK-NEXT:    pushq %r13
1358; CHECK-NEXT:    .cfi_def_cfa_offset 40
1359; CHECK-NEXT:    pushq %r12
1360; CHECK-NEXT:    .cfi_def_cfa_offset 48
1361; CHECK-NEXT:    pushq %rbx
1362; CHECK-NEXT:    .cfi_def_cfa_offset 56
1363; CHECK-NEXT:    .cfi_offset %rbx, -56
1364; CHECK-NEXT:    .cfi_offset %r12, -48
1365; CHECK-NEXT:    .cfi_offset %r13, -40
1366; CHECK-NEXT:    .cfi_offset %r14, -32
1367; CHECK-NEXT:    .cfi_offset %r15, -24
1368; CHECK-NEXT:    .cfi_offset %rbp, -16
1369; CHECK-NEXT:    movq %rdi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
1370; CHECK-NEXT:    #APP
1371; CHECK-NEXT:    nop
1372; CHECK-NEXT:    #NO_APP
1373; CHECK-NEXT:    pinsrq $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 8-byte Folded Reload
1374; CHECK-NEXT:    popq %rbx
1375; CHECK-NEXT:    .cfi_def_cfa_offset 48
1376; CHECK-NEXT:    popq %r12
1377; CHECK-NEXT:    .cfi_def_cfa_offset 40
1378; CHECK-NEXT:    popq %r13
1379; CHECK-NEXT:    .cfi_def_cfa_offset 32
1380; CHECK-NEXT:    popq %r14
1381; CHECK-NEXT:    .cfi_def_cfa_offset 24
1382; CHECK-NEXT:    popq %r15
1383; CHECK-NEXT:    .cfi_def_cfa_offset 16
1384; CHECK-NEXT:    popq %rbp
1385; CHECK-NEXT:    .cfi_def_cfa_offset 8
1386; CHECK-NEXT:    retq
1387  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{rax},~{rbx},~{rcx},~{rdx},~{rsi},~{rdi},~{rbp},~{r8},~{r9},~{r10},~{r11},~{r12},~{r13},~{r14},~{r15}"()
1388  %2 = insertelement <2 x i64> %a0, i64 %a1, i32 1
1389  ret <2 x i64> %2
1390}
1391
1392define <8 x i16> @stack_fold_pinsrw(<8 x i16> %a0, i16 %a1) {
1393; CHECK-LABEL: stack_fold_pinsrw:
1394; CHECK:       # %bb.0:
1395; CHECK-NEXT:    pushq %rbp
1396; CHECK-NEXT:    .cfi_def_cfa_offset 16
1397; CHECK-NEXT:    pushq %r15
1398; CHECK-NEXT:    .cfi_def_cfa_offset 24
1399; CHECK-NEXT:    pushq %r14
1400; CHECK-NEXT:    .cfi_def_cfa_offset 32
1401; CHECK-NEXT:    pushq %r13
1402; CHECK-NEXT:    .cfi_def_cfa_offset 40
1403; CHECK-NEXT:    pushq %r12
1404; CHECK-NEXT:    .cfi_def_cfa_offset 48
1405; CHECK-NEXT:    pushq %rbx
1406; CHECK-NEXT:    .cfi_def_cfa_offset 56
1407; CHECK-NEXT:    .cfi_offset %rbx, -56
1408; CHECK-NEXT:    .cfi_offset %r12, -48
1409; CHECK-NEXT:    .cfi_offset %r13, -40
1410; CHECK-NEXT:    .cfi_offset %r14, -32
1411; CHECK-NEXT:    .cfi_offset %r15, -24
1412; CHECK-NEXT:    .cfi_offset %rbp, -16
1413; CHECK-NEXT:    movl %edi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
1414; CHECK-NEXT:    #APP
1415; CHECK-NEXT:    nop
1416; CHECK-NEXT:    #NO_APP
1417; CHECK-NEXT:    pinsrw $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Folded Reload
1418; CHECK-NEXT:    popq %rbx
1419; CHECK-NEXT:    .cfi_def_cfa_offset 48
1420; CHECK-NEXT:    popq %r12
1421; CHECK-NEXT:    .cfi_def_cfa_offset 40
1422; CHECK-NEXT:    popq %r13
1423; CHECK-NEXT:    .cfi_def_cfa_offset 32
1424; CHECK-NEXT:    popq %r14
1425; CHECK-NEXT:    .cfi_def_cfa_offset 24
1426; CHECK-NEXT:    popq %r15
1427; CHECK-NEXT:    .cfi_def_cfa_offset 16
1428; CHECK-NEXT:    popq %rbp
1429; CHECK-NEXT:    .cfi_def_cfa_offset 8
1430; CHECK-NEXT:    retq
1431  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{rax},~{rbx},~{rcx},~{rdx},~{rsi},~{rdi},~{rbp},~{r8},~{r9},~{r10},~{r11},~{r12},~{r13},~{r14},~{r15}"()
1432  %2 = insertelement <8 x i16> %a0, i16 %a1, i32 1
1433  ret <8 x i16> %2
1434}
1435
1436define <8 x i16> @stack_fold_pmaddubsw(<16 x i8> %a0, <16 x i8> %a1) {
1437; CHECK-LABEL: stack_fold_pmaddubsw:
1438; CHECK:       # %bb.0:
1439; CHECK-NEXT:    movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1440; CHECK-NEXT:    #APP
1441; CHECK-NEXT:    nop
1442; CHECK-NEXT:    #NO_APP
1443; CHECK-NEXT:    pmaddubsw {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
1444; CHECK-NEXT:    retq
1445  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
1446  %2 = call <8 x i16> @llvm.x86.ssse3.pmadd.ub.sw.128(<16 x i8> %a0, <16 x i8> %a1)
1447  ret <8 x i16> %2
1448}
1449declare <8 x i16> @llvm.x86.ssse3.pmadd.ub.sw.128(<16 x i8>, <16 x i8>) nounwind readnone
1450
1451define <4 x i32> @stack_fold_pmaddwd(<8 x i16> %a0, <8 x i16> %a1) {
1452; CHECK-LABEL: stack_fold_pmaddwd:
1453; CHECK:       # %bb.0:
1454; CHECK-NEXT:    movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1455; CHECK-NEXT:    #APP
1456; CHECK-NEXT:    nop
1457; CHECK-NEXT:    #NO_APP
1458; CHECK-NEXT:    pmaddwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
1459; CHECK-NEXT:    retq
1460  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
1461  %2 = call <4 x i32> @llvm.x86.sse2.pmadd.wd(<8 x i16> %a0, <8 x i16> %a1)
1462  ret <4 x i32> %2
1463}
1464declare <4 x i32> @llvm.x86.sse2.pmadd.wd(<8 x i16>, <8 x i16>) nounwind readnone
1465
1466define <16 x i8> @stack_fold_pmaxsb(<16 x i8> %a0, <16 x i8> %a1) {
1467; CHECK-LABEL: stack_fold_pmaxsb:
1468; CHECK:       # %bb.0:
1469; CHECK-NEXT:    movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1470; CHECK-NEXT:    #APP
1471; CHECK-NEXT:    nop
1472; CHECK-NEXT:    #NO_APP
1473; CHECK-NEXT:    pmaxsb {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
1474; CHECK-NEXT:    retq
1475  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
1476  %2 = icmp sgt <16 x i8> %a0, %a1
1477  %3 = select <16 x i1> %2, <16 x i8> %a0, <16 x i8> %a1
1478  ret <16 x i8> %3
1479}
1480
1481define <4 x i32> @stack_fold_pmaxsd(<4 x i32> %a0, <4 x i32> %a1) {
1482; CHECK-LABEL: stack_fold_pmaxsd:
1483; CHECK:       # %bb.0:
1484; CHECK-NEXT:    movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1485; CHECK-NEXT:    #APP
1486; CHECK-NEXT:    nop
1487; CHECK-NEXT:    #NO_APP
1488; CHECK-NEXT:    pmaxsd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
1489; CHECK-NEXT:    retq
1490  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
1491  %2 = icmp sgt <4 x i32> %a0, %a1
1492  %3 = select <4 x i1> %2, <4 x i32> %a0, <4 x i32> %a1
1493  ret <4 x i32> %3
1494}
1495
1496define <8 x i16> @stack_fold_pmaxsw(<8 x i16> %a0, <8 x i16> %a1) {
1497; CHECK-LABEL: stack_fold_pmaxsw:
1498; CHECK:       # %bb.0:
1499; CHECK-NEXT:    movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1500; CHECK-NEXT:    #APP
1501; CHECK-NEXT:    nop
1502; CHECK-NEXT:    #NO_APP
1503; CHECK-NEXT:    pmaxsw {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
1504; CHECK-NEXT:    retq
1505  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
1506  %2 = icmp sgt <8 x i16> %a0, %a1
1507  %3 = select <8 x i1> %2, <8 x i16> %a0, <8 x i16> %a1
1508  ret <8 x i16> %3
1509}
1510
1511define <16 x i8> @stack_fold_pmaxub(<16 x i8> %a0, <16 x i8> %a1) {
1512; CHECK-LABEL: stack_fold_pmaxub:
1513; CHECK:       # %bb.0:
1514; CHECK-NEXT:    movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1515; CHECK-NEXT:    #APP
1516; CHECK-NEXT:    nop
1517; CHECK-NEXT:    #NO_APP
1518; CHECK-NEXT:    pmaxub {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
1519; CHECK-NEXT:    retq
1520  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
1521  %2 = icmp ugt <16 x i8> %a0, %a1
1522  %3 = select <16 x i1> %2, <16 x i8> %a0, <16 x i8> %a1
1523  ret <16 x i8> %3
1524}
1525
1526define <4 x i32> @stack_fold_pmaxud(<4 x i32> %a0, <4 x i32> %a1) {
1527; CHECK-LABEL: stack_fold_pmaxud:
1528; CHECK:       # %bb.0:
1529; CHECK-NEXT:    movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1530; CHECK-NEXT:    #APP
1531; CHECK-NEXT:    nop
1532; CHECK-NEXT:    #NO_APP
1533; CHECK-NEXT:    pmaxud {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
1534; CHECK-NEXT:    retq
1535  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
1536  %2 = icmp ugt <4 x i32> %a0, %a1
1537  %3 = select <4 x i1> %2, <4 x i32> %a0, <4 x i32> %a1
1538  ret <4 x i32> %3
1539}
1540
1541define <8 x i16> @stack_fold_pmaxuw(<8 x i16> %a0, <8 x i16> %a1) {
1542; CHECK-LABEL: stack_fold_pmaxuw:
1543; CHECK:       # %bb.0:
1544; CHECK-NEXT:    movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1545; CHECK-NEXT:    #APP
1546; CHECK-NEXT:    nop
1547; CHECK-NEXT:    #NO_APP
1548; CHECK-NEXT:    pmaxuw {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
1549; CHECK-NEXT:    retq
1550  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
1551  %2 = icmp ugt <8 x i16> %a0, %a1
1552  %3 = select <8 x i1> %2, <8 x i16> %a0, <8 x i16> %a1
1553  ret <8 x i16> %3
1554}
1555
1556define <16 x i8> @stack_fold_pminsb(<16 x i8> %a0, <16 x i8> %a1) {
1557; CHECK-LABEL: stack_fold_pminsb:
1558; CHECK:       # %bb.0:
1559; CHECK-NEXT:    movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1560; CHECK-NEXT:    #APP
1561; CHECK-NEXT:    nop
1562; CHECK-NEXT:    #NO_APP
1563; CHECK-NEXT:    pminsb {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
1564; CHECK-NEXT:    retq
1565  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
1566  %2 = icmp slt <16 x i8> %a0, %a1
1567  %3 = select <16 x i1> %2, <16 x i8> %a0, <16 x i8> %a1
1568  ret <16 x i8> %3
1569}
1570
1571define <4 x i32> @stack_fold_pminsd(<4 x i32> %a0, <4 x i32> %a1) {
1572; CHECK-LABEL: stack_fold_pminsd:
1573; CHECK:       # %bb.0:
1574; CHECK-NEXT:    movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1575; CHECK-NEXT:    #APP
1576; CHECK-NEXT:    nop
1577; CHECK-NEXT:    #NO_APP
1578; CHECK-NEXT:    pminsd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
1579; CHECK-NEXT:    retq
1580  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
1581  %2 = icmp slt <4 x i32> %a0, %a1
1582  %3 = select <4 x i1> %2, <4 x i32> %a0, <4 x i32> %a1
1583  ret <4 x i32> %3
1584}
1585
1586define <8 x i16> @stack_fold_pminsw(<8 x i16> %a0, <8 x i16> %a1) {
1587; CHECK-LABEL: stack_fold_pminsw:
1588; CHECK:       # %bb.0:
1589; CHECK-NEXT:    movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1590; CHECK-NEXT:    #APP
1591; CHECK-NEXT:    nop
1592; CHECK-NEXT:    #NO_APP
1593; CHECK-NEXT:    pminsw {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
1594; CHECK-NEXT:    retq
1595  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
1596  %2 = icmp slt <8 x i16> %a0, %a1
1597  %3 = select <8 x i1> %2, <8 x i16> %a0, <8 x i16> %a1
1598  ret <8 x i16> %3
1599}
1600
1601define <16 x i8> @stack_fold_pminub(<16 x i8> %a0, <16 x i8> %a1) {
1602; CHECK-LABEL: stack_fold_pminub:
1603; CHECK:       # %bb.0:
1604; CHECK-NEXT:    movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1605; CHECK-NEXT:    #APP
1606; CHECK-NEXT:    nop
1607; CHECK-NEXT:    #NO_APP
1608; CHECK-NEXT:    pminub {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
1609; CHECK-NEXT:    retq
1610  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
1611  %2 = icmp ult <16 x i8> %a0, %a1
1612  %3 = select <16 x i1> %2, <16 x i8> %a0, <16 x i8> %a1
1613  ret <16 x i8> %3
1614}
1615
1616define <4 x i32> @stack_fold_pminud(<4 x i32> %a0, <4 x i32> %a1) {
1617; CHECK-LABEL: stack_fold_pminud:
1618; CHECK:       # %bb.0:
1619; CHECK-NEXT:    movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1620; CHECK-NEXT:    #APP
1621; CHECK-NEXT:    nop
1622; CHECK-NEXT:    #NO_APP
1623; CHECK-NEXT:    pminud {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
1624; CHECK-NEXT:    retq
1625  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
1626  %2 = icmp ult <4 x i32> %a0, %a1
1627  %3 = select <4 x i1> %2, <4 x i32> %a0, <4 x i32> %a1
1628  ret <4 x i32> %3
1629}
1630
1631define <8 x i16> @stack_fold_pminuw(<8 x i16> %a0, <8 x i16> %a1) {
1632; CHECK-LABEL: stack_fold_pminuw:
1633; CHECK:       # %bb.0:
1634; CHECK-NEXT:    movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1635; CHECK-NEXT:    #APP
1636; CHECK-NEXT:    nop
1637; CHECK-NEXT:    #NO_APP
1638; CHECK-NEXT:    pminuw {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
1639; CHECK-NEXT:    retq
1640  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
1641  %2 = icmp ult <8 x i16> %a0, %a1
1642  %3 = select <8 x i1> %2, <8 x i16> %a0, <8 x i16> %a1
1643  ret <8 x i16> %3
1644}
1645
1646define <4 x i32> @stack_fold_pmovsxbd(<16 x i8> %a0) {
1647; CHECK-LABEL: stack_fold_pmovsxbd:
1648; CHECK:       # %bb.0:
1649; CHECK-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1650; CHECK-NEXT:    #APP
1651; CHECK-NEXT:    nop
1652; CHECK-NEXT:    #NO_APP
1653; CHECK-NEXT:    pmovsxbd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
1654; CHECK-NEXT:    retq
1655  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
1656  %2 = shufflevector <16 x i8> %a0, <16 x i8> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
1657  %3 = sext <4 x i8> %2 to <4 x i32>
1658  ret <4 x i32> %3
1659}
1660
1661define <2 x i64> @stack_fold_pmovsxbq(<16 x i8> %a0) {
1662; CHECK-LABEL: stack_fold_pmovsxbq:
1663; CHECK:       # %bb.0:
1664; CHECK-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1665; CHECK-NEXT:    #APP
1666; CHECK-NEXT:    nop
1667; CHECK-NEXT:    #NO_APP
1668; CHECK-NEXT:    pmovsxbq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
1669; CHECK-NEXT:    retq
1670  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
1671  %2 = shufflevector <16 x i8> %a0, <16 x i8> undef, <2 x i32> <i32 0, i32 1>
1672  %3 = sext <2 x i8> %2 to <2 x i64>
1673  ret <2 x i64> %3
1674}
1675
1676define <8 x i16> @stack_fold_pmovsxbw(<16 x i8> %a0) {
1677; CHECK-LABEL: stack_fold_pmovsxbw:
1678; CHECK:       # %bb.0:
1679; CHECK-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1680; CHECK-NEXT:    #APP
1681; CHECK-NEXT:    nop
1682; CHECK-NEXT:    #NO_APP
1683; CHECK-NEXT:    pmovsxbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
1684; CHECK-NEXT:    retq
1685  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
1686  %2 = shufflevector <16 x i8> %a0, <16 x i8> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
1687  %3 = sext <8 x i8> %2 to <8 x i16>
1688  ret <8 x i16> %3
1689}
1690
1691define <2 x i64> @stack_fold_pmovsxdq(<4 x i32> %a0) {
1692; CHECK-LABEL: stack_fold_pmovsxdq:
1693; CHECK:       # %bb.0:
1694; CHECK-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1695; CHECK-NEXT:    #APP
1696; CHECK-NEXT:    nop
1697; CHECK-NEXT:    #NO_APP
1698; CHECK-NEXT:    pmovsxdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
1699; CHECK-NEXT:    retq
1700  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
1701  %2 = shufflevector <4 x i32> %a0, <4 x i32> undef, <2 x i32> <i32 0, i32 1>
1702  %3 = sext <2 x i32> %2 to <2 x i64>
1703  ret <2 x i64> %3
1704}
1705
1706define <4 x i32> @stack_fold_pmovsxwd(<8 x i16> %a0) {
1707; CHECK-LABEL: stack_fold_pmovsxwd:
1708; CHECK:       # %bb.0:
1709; CHECK-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1710; CHECK-NEXT:    #APP
1711; CHECK-NEXT:    nop
1712; CHECK-NEXT:    #NO_APP
1713; CHECK-NEXT:    pmovsxwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
1714; CHECK-NEXT:    retq
1715  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
1716  %2 = shufflevector <8 x i16> %a0, <8 x i16> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
1717  %3 = sext <4 x i16> %2 to <4 x i32>
1718  ret <4 x i32> %3
1719}
1720
1721define <2 x i64> @stack_fold_pmovsxwq(<8 x i16> %a0) {
1722; CHECK-LABEL: stack_fold_pmovsxwq:
1723; CHECK:       # %bb.0:
1724; CHECK-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1725; CHECK-NEXT:    #APP
1726; CHECK-NEXT:    nop
1727; CHECK-NEXT:    #NO_APP
1728; CHECK-NEXT:    pmovsxwq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
1729; CHECK-NEXT:    retq
1730  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
1731  %2 = shufflevector <8 x i16> %a0, <8 x i16> undef, <2 x i32> <i32 0, i32 1>
1732  %3 = sext <2 x i16> %2 to <2 x i64>
1733  ret <2 x i64> %3
1734}
1735
1736define <4 x i32> @stack_fold_pmovzxbd(<16 x i8> %a0) {
1737; CHECK-LABEL: stack_fold_pmovzxbd:
1738; CHECK:       # %bb.0:
1739; CHECK-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1740; CHECK-NEXT:    #APP
1741; CHECK-NEXT:    nop
1742; CHECK-NEXT:    #NO_APP
1743; CHECK-NEXT:    pmovzxbd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
1744; CHECK-NEXT:    # xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
1745; CHECK-NEXT:    retq
1746  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
1747  %2 = shufflevector <16 x i8> %a0, <16 x i8> zeroinitializer, <16 x i32> <i32 0, i32 16, i32 17, i32 18, i32 1, i32 19, i32 20, i32 21, i32 2, i32 22, i32 23, i32 24, i32 3, i32 25, i32 26, i32 27>
1748  %3 = bitcast <16 x i8> %2 to <4 x i32>
1749  ret <4 x i32> %3
1750}
1751
1752define <2 x i64> @stack_fold_pmovzxbq(<16 x i8> %a0) {
1753; CHECK-LABEL: stack_fold_pmovzxbq:
1754; CHECK:       # %bb.0:
1755; CHECK-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1756; CHECK-NEXT:    #APP
1757; CHECK-NEXT:    nop
1758; CHECK-NEXT:    #NO_APP
1759; CHECK-NEXT:    pmovzxbq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
1760; CHECK-NEXT:    # xmm0 = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero
1761; CHECK-NEXT:    retq
1762  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
1763  %2 = shufflevector <16 x i8> %a0, <16 x i8> zeroinitializer, <16 x i32> <i32 0, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 1, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28>
1764  %3 = bitcast <16 x i8> %2 to <2 x i64>
1765  ret <2 x i64> %3
1766}
1767
1768define <8 x i16> @stack_fold_pmovzxbw(<16 x i8> %a0) {
1769; CHECK-LABEL: stack_fold_pmovzxbw:
1770; CHECK:       # %bb.0:
1771; CHECK-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1772; CHECK-NEXT:    #APP
1773; CHECK-NEXT:    nop
1774; CHECK-NEXT:    #NO_APP
1775; CHECK-NEXT:    pmovzxbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
1776; CHECK-NEXT:    # xmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
1777; CHECK-NEXT:    retq
1778  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
1779  %2 = shufflevector <16 x i8> %a0, <16 x i8> zeroinitializer, <16 x i32> <i32 0, i32 16, i32 1, i32 17, i32 2, i32 18, i32 3, i32 19, i32 4, i32 20, i32 5, i32 21, i32 6, i32 22, i32 7, i32 23>
1780  %3 = bitcast <16 x i8> %2 to <8 x i16>
1781  ret <8 x i16> %3
1782}
1783
1784define <2 x i64> @stack_fold_pmovzxdq(<4 x i32> %a0) {
1785; CHECK-LABEL: stack_fold_pmovzxdq:
1786; CHECK:       # %bb.0:
1787; CHECK-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1788; CHECK-NEXT:    #APP
1789; CHECK-NEXT:    nop
1790; CHECK-NEXT:    #NO_APP
1791; CHECK-NEXT:    pmovzxdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
1792; CHECK-NEXT:    # xmm0 = mem[0],zero,mem[1],zero
1793; CHECK-NEXT:    retq
1794  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
1795  %2 = shufflevector <4 x i32> %a0, <4 x i32> zeroinitializer, <4 x i32> <i32 0, i32 4, i32 1, i32 5>
1796  %3 = bitcast <4 x i32> %2 to <2 x i64>
1797  ret <2 x i64> %3
1798}
1799
1800define <4 x i32> @stack_fold_pmovzxwd(<8 x i16> %a0) {
1801; CHECK-LABEL: stack_fold_pmovzxwd:
1802; CHECK:       # %bb.0:
1803; CHECK-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1804; CHECK-NEXT:    #APP
1805; CHECK-NEXT:    nop
1806; CHECK-NEXT:    #NO_APP
1807; CHECK-NEXT:    pmovzxwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
1808; CHECK-NEXT:    # xmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
1809; CHECK-NEXT:    retq
1810  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
1811  %2 = shufflevector <8 x i16> %a0, <8 x i16> zeroinitializer, <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11>
1812  %3 = bitcast <8 x i16> %2 to <4 x i32>
1813  ret <4 x i32> %3
1814}
1815
1816define <2 x i64> @stack_fold_pmovzxwq(<8 x i16> %a0) {
1817; CHECK-LABEL: stack_fold_pmovzxwq:
1818; CHECK:       # %bb.0:
1819; CHECK-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1820; CHECK-NEXT:    #APP
1821; CHECK-NEXT:    nop
1822; CHECK-NEXT:    #NO_APP
1823; CHECK-NEXT:    pmovzxwq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
1824; CHECK-NEXT:    # xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero
1825; CHECK-NEXT:    retq
1826  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
1827  %2 = shufflevector <8 x i16> %a0, <8 x i16> zeroinitializer, <8 x i32> <i32 0, i32 8, i32 9, i32 10, i32 1, i32 11, i32 12, i32 13>
1828  %3 = bitcast <8 x i16> %2 to <2 x i64>
1829  ret <2 x i64> %3
1830}
1831
1832define <2 x i64> @stack_fold_pmuldq(<4 x i32> %a0, <4 x i32> %a1) {
1833; CHECK-LABEL: stack_fold_pmuldq:
1834; CHECK:       # %bb.0:
1835; CHECK-NEXT:    movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1836; CHECK-NEXT:    #APP
1837; CHECK-NEXT:    nop
1838; CHECK-NEXT:    #NO_APP
1839; CHECK-NEXT:    pmuldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
1840; CHECK-NEXT:    retq
1841  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
1842  %2 = bitcast <4 x i32> %a0 to <2 x i64>
1843  %3 = bitcast <4 x i32> %a1 to <2 x i64>
1844  %4 = shl <2 x i64> %2, <i64 32, i64 32>
1845  %5 = ashr <2 x i64> %4, <i64 32, i64 32>
1846  %6 = shl <2 x i64> %3, <i64 32, i64 32>
1847  %7 = ashr <2 x i64> %6, <i64 32, i64 32>
1848  %8 = mul <2 x i64> %5, %7
1849  ret <2 x i64> %8
1850}
1851
1852define <8 x i16> @stack_fold_pmulhrsw(<8 x i16> %a0, <8 x i16> %a1) {
1853; CHECK-LABEL: stack_fold_pmulhrsw:
1854; CHECK:       # %bb.0:
1855; CHECK-NEXT:    movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1856; CHECK-NEXT:    #APP
1857; CHECK-NEXT:    nop
1858; CHECK-NEXT:    #NO_APP
1859; CHECK-NEXT:    pmulhrsw {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
1860; CHECK-NEXT:    retq
1861  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
1862  %2 = call <8 x i16> @llvm.x86.ssse3.pmul.hr.sw.128(<8 x i16> %a0, <8 x i16> %a1)
1863  ret <8 x i16> %2
1864}
1865declare <8 x i16> @llvm.x86.ssse3.pmul.hr.sw.128(<8 x i16>, <8 x i16>) nounwind readnone
1866
1867define <8 x i16> @stack_fold_pmulhuw(<8 x i16> %a0, <8 x i16> %a1) {
1868; CHECK-LABEL: stack_fold_pmulhuw:
1869; CHECK:       # %bb.0:
1870; CHECK-NEXT:    movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1871; CHECK-NEXT:    #APP
1872; CHECK-NEXT:    nop
1873; CHECK-NEXT:    #NO_APP
1874; CHECK-NEXT:    pmulhuw {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
1875; CHECK-NEXT:    retq
1876  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
1877  %2 = call <8 x i16> @llvm.x86.sse2.pmulhu.w(<8 x i16> %a0, <8 x i16> %a1)
1878  ret <8 x i16> %2
1879}
1880declare <8 x i16> @llvm.x86.sse2.pmulhu.w(<8 x i16>, <8 x i16>) nounwind readnone
1881
1882define <8 x i16> @stack_fold_pmulhw(<8 x i16> %a0, <8 x i16> %a1) {
1883; CHECK-LABEL: stack_fold_pmulhw:
1884; CHECK:       # %bb.0:
1885; CHECK-NEXT:    movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1886; CHECK-NEXT:    #APP
1887; CHECK-NEXT:    nop
1888; CHECK-NEXT:    #NO_APP
1889; CHECK-NEXT:    pmulhw {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
1890; CHECK-NEXT:    retq
1891  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
1892  %2 = call <8 x i16> @llvm.x86.sse2.pmulh.w(<8 x i16> %a0, <8 x i16> %a1)
1893  ret <8 x i16> %2
1894}
1895declare <8 x i16> @llvm.x86.sse2.pmulh.w(<8 x i16>, <8 x i16>) nounwind readnone
1896
1897define <4 x i32> @stack_fold_pmulld(<4 x i32> %a0, <4 x i32> %a1) {
1898; CHECK-LABEL: stack_fold_pmulld:
1899; CHECK:       # %bb.0:
1900; CHECK-NEXT:    movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1901; CHECK-NEXT:    #APP
1902; CHECK-NEXT:    nop
1903; CHECK-NEXT:    #NO_APP
1904; CHECK-NEXT:    pmulld {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
1905; CHECK-NEXT:    retq
1906  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
1907  %2 = mul <4 x i32> %a0, %a1
1908  ret <4 x i32> %2
1909}
1910
1911define <8 x i16> @stack_fold_pmullw(<8 x i16> %a0, <8 x i16> %a1) {
1912; CHECK-LABEL: stack_fold_pmullw:
1913; CHECK:       # %bb.0:
1914; CHECK-NEXT:    movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1915; CHECK-NEXT:    #APP
1916; CHECK-NEXT:    nop
1917; CHECK-NEXT:    #NO_APP
1918; CHECK-NEXT:    pmullw {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
1919; CHECK-NEXT:    retq
1920  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
1921  %2 = mul <8 x i16> %a0, %a1
1922  ret <8 x i16> %2
1923}
1924
1925define <2 x i64> @stack_fold_pmuludq(<4 x i32> %a0, <4 x i32> %a1) {
1926; CHECK-LABEL: stack_fold_pmuludq:
1927; CHECK:       # %bb.0:
1928; CHECK-NEXT:    movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1929; CHECK-NEXT:    #APP
1930; CHECK-NEXT:    nop
1931; CHECK-NEXT:    #NO_APP
1932; CHECK-NEXT:    pmuludq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
1933; CHECK-NEXT:    retq
1934  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
1935  %2 = bitcast <4 x i32> %a0 to <2 x i64>
1936  %3 = bitcast <4 x i32> %a1 to <2 x i64>
1937  %4 = and <2 x i64> %2, <i64 4294967295, i64 4294967295>
1938  %5 = and <2 x i64> %3, <i64 4294967295, i64 4294967295>
1939  %6 = mul <2 x i64> %4, %5
1940  ret <2 x i64> %6
1941}
1942
1943define <16 x i8> @stack_fold_por(<16 x i8> %a0, <16 x i8> %a1) {
1944; CHECK-LABEL: stack_fold_por:
1945; CHECK:       # %bb.0:
1946; CHECK-NEXT:    movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1947; CHECK-NEXT:    #APP
1948; CHECK-NEXT:    nop
1949; CHECK-NEXT:    #NO_APP
1950; CHECK-NEXT:    por {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
1951; CHECK-NEXT:    pcmpeqd %xmm1, %xmm1
1952; CHECK-NEXT:    psubb %xmm1, %xmm0
1953; CHECK-NEXT:    retq
1954  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
1955  %2 = or <16 x i8> %a0, %a1
1956  ; add forces execution domain
1957  %3 = add <16 x i8> %2, <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
1958  ret <16 x i8> %3
1959}
1960
1961define <2 x i64> @stack_fold_psadbw(<16 x i8> %a0, <16 x i8> %a1) {
1962; CHECK-LABEL: stack_fold_psadbw:
1963; CHECK:       # %bb.0:
1964; CHECK-NEXT:    movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1965; CHECK-NEXT:    #APP
1966; CHECK-NEXT:    nop
1967; CHECK-NEXT:    #NO_APP
1968; CHECK-NEXT:    psadbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
1969; CHECK-NEXT:    retq
1970  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
1971  %2 = call <2 x i64> @llvm.x86.sse2.psad.bw(<16 x i8> %a0, <16 x i8> %a1)
1972  ret <2 x i64> %2
1973}
1974declare <2 x i64> @llvm.x86.sse2.psad.bw(<16 x i8>, <16 x i8>) nounwind readnone
1975
1976define <16 x i8> @stack_fold_pshufb(<16 x i8> %a0, <16 x i8> %a1) {
1977; CHECK-LABEL: stack_fold_pshufb:
1978; CHECK:       # %bb.0:
1979; CHECK-NEXT:    movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1980; CHECK-NEXT:    #APP
1981; CHECK-NEXT:    nop
1982; CHECK-NEXT:    #NO_APP
1983; CHECK-NEXT:    pshufb {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
1984; CHECK-NEXT:    retq
1985  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
1986  %2 = call <16 x i8> @llvm.x86.ssse3.pshuf.b.128(<16 x i8> %a0, <16 x i8> %a1)
1987  ret <16 x i8> %2
1988}
1989declare <16 x i8> @llvm.x86.ssse3.pshuf.b.128(<16 x i8>, <16 x i8>) nounwind readnone
1990
1991define <4 x i32> @stack_fold_pshufd(<4 x i32> %a0) {
1992; CHECK-LABEL: stack_fold_pshufd:
1993; CHECK:       # %bb.0:
1994; CHECK-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1995; CHECK-NEXT:    #APP
1996; CHECK-NEXT:    nop
1997; CHECK-NEXT:    #NO_APP
1998; CHECK-NEXT:    pshufd $27, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
1999; CHECK-NEXT:    # xmm0 = mem[3,2,1,0]
2000; CHECK-NEXT:    retq
2001  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
2002  %2 = shufflevector <4 x i32> %a0, <4 x i32> undef, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
2003  ret <4 x i32> %2
2004}
2005
2006define <8 x i16> @stack_fold_pshufhw(<8 x i16> %a0) {
2007; CHECK-LABEL: stack_fold_pshufhw:
2008; CHECK:       # %bb.0:
2009; CHECK-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2010; CHECK-NEXT:    #APP
2011; CHECK-NEXT:    nop
2012; CHECK-NEXT:    #NO_APP
2013; CHECK-NEXT:    pshufhw $11, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
2014; CHECK-NEXT:    # xmm0 = mem[0,1,2,3,7,6,4,4]
2015; CHECK-NEXT:    retq
2016  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
2017  %2 = shufflevector <8 x i16> %a0, <8 x i16> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 7, i32 6, i32 4, i32 4>
2018  ret <8 x i16> %2
2019}
2020
2021define <8 x i16> @stack_fold_pshuflw(<8 x i16> %a0) {
2022; CHECK-LABEL: stack_fold_pshuflw:
2023; CHECK:       # %bb.0:
2024; CHECK-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2025; CHECK-NEXT:    #APP
2026; CHECK-NEXT:    nop
2027; CHECK-NEXT:    #NO_APP
2028; CHECK-NEXT:    pshuflw $27, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
2029; CHECK-NEXT:    # xmm0 = mem[3,2,1,0,4,5,6,7]
2030; CHECK-NEXT:    retq
2031  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
2032  %2 = shufflevector <8 x i16> %a0, <8 x i16> undef, <8 x i32> <i32 3, i32 2, i32 1, i32 0, i32 4, i32 5, i32 6, i32 7>
2033  ret <8 x i16> %2
2034}
2035
2036define <16 x i8> @stack_fold_psignb(<16 x i8> %a0, <16 x i8> %a1) {
2037; CHECK-LABEL: stack_fold_psignb:
2038; CHECK:       # %bb.0:
2039; CHECK-NEXT:    movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2040; CHECK-NEXT:    #APP
2041; CHECK-NEXT:    nop
2042; CHECK-NEXT:    #NO_APP
2043; CHECK-NEXT:    psignb {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
2044; CHECK-NEXT:    retq
2045  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
2046  %2 = call <16 x i8> @llvm.x86.ssse3.psign.b.128(<16 x i8> %a0, <16 x i8> %a1)
2047  ret <16 x i8> %2
2048}
2049declare <16 x i8> @llvm.x86.ssse3.psign.b.128(<16 x i8>, <16 x i8>) nounwind readnone
2050
2051define <4 x i32> @stack_fold_psignd(<4 x i32> %a0, <4 x i32> %a1) {
2052; CHECK-LABEL: stack_fold_psignd:
2053; CHECK:       # %bb.0:
2054; CHECK-NEXT:    movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2055; CHECK-NEXT:    #APP
2056; CHECK-NEXT:    nop
2057; CHECK-NEXT:    #NO_APP
2058; CHECK-NEXT:    psignd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
2059; CHECK-NEXT:    retq
2060  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
2061  %2 = call <4 x i32> @llvm.x86.ssse3.psign.d.128(<4 x i32> %a0, <4 x i32> %a1)
2062  ret <4 x i32> %2
2063}
2064declare <4 x i32> @llvm.x86.ssse3.psign.d.128(<4 x i32>, <4 x i32>) nounwind readnone
2065
2066define <8 x i16> @stack_fold_psignw(<8 x i16> %a0, <8 x i16> %a1) {
2067; CHECK-LABEL: stack_fold_psignw:
2068; CHECK:       # %bb.0:
2069; CHECK-NEXT:    movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2070; CHECK-NEXT:    #APP
2071; CHECK-NEXT:    nop
2072; CHECK-NEXT:    #NO_APP
2073; CHECK-NEXT:    psignw {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
2074; CHECK-NEXT:    retq
2075  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
2076  %2 = call <8 x i16> @llvm.x86.ssse3.psign.w.128(<8 x i16> %a0, <8 x i16> %a1)
2077  ret <8 x i16> %2
2078}
2079declare <8 x i16> @llvm.x86.ssse3.psign.w.128(<8 x i16>, <8 x i16>) nounwind readnone
2080
2081define <4 x i32> @stack_fold_pslld(<4 x i32> %a0, <4 x i32> %a1) {
2082; CHECK-LABEL: stack_fold_pslld:
2083; CHECK:       # %bb.0:
2084; CHECK-NEXT:    movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2085; CHECK-NEXT:    #APP
2086; CHECK-NEXT:    nop
2087; CHECK-NEXT:    #NO_APP
2088; CHECK-NEXT:    pslld {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
2089; CHECK-NEXT:    retq
2090  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
2091  %2 = call <4 x i32> @llvm.x86.sse2.psll.d(<4 x i32> %a0, <4 x i32> %a1)
2092  ret <4 x i32> %2
2093}
2094declare <4 x i32> @llvm.x86.sse2.psll.d(<4 x i32>, <4 x i32>) nounwind readnone
2095
2096define <2 x i64> @stack_fold_psllq(<2 x i64> %a0, <2 x i64> %a1) {
2097; CHECK-LABEL: stack_fold_psllq:
2098; CHECK:       # %bb.0:
2099; CHECK-NEXT:    movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2100; CHECK-NEXT:    #APP
2101; CHECK-NEXT:    nop
2102; CHECK-NEXT:    #NO_APP
2103; CHECK-NEXT:    psllq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
2104; CHECK-NEXT:    retq
2105  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
2106  %2 = call <2 x i64> @llvm.x86.sse2.psll.q(<2 x i64> %a0, <2 x i64> %a1)
2107  ret <2 x i64> %2
2108}
2109declare <2 x i64> @llvm.x86.sse2.psll.q(<2 x i64>, <2 x i64>) nounwind readnone
2110
2111define <8 x i16> @stack_fold_psllw(<8 x i16> %a0, <8 x i16> %a1) {
2112; CHECK-LABEL: stack_fold_psllw:
2113; CHECK:       # %bb.0:
2114; CHECK-NEXT:    movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2115; CHECK-NEXT:    #APP
2116; CHECK-NEXT:    nop
2117; CHECK-NEXT:    #NO_APP
2118; CHECK-NEXT:    psllw {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
2119; CHECK-NEXT:    retq
2120  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
2121  %2 = call <8 x i16> @llvm.x86.sse2.psll.w(<8 x i16> %a0, <8 x i16> %a1)
2122  ret <8 x i16> %2
2123}
2124declare <8 x i16> @llvm.x86.sse2.psll.w(<8 x i16>, <8 x i16>) nounwind readnone
2125
2126define <4 x i32> @stack_fold_psrad(<4 x i32> %a0, <4 x i32> %a1) {
2127; CHECK-LABEL: stack_fold_psrad:
2128; CHECK:       # %bb.0:
2129; CHECK-NEXT:    movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2130; CHECK-NEXT:    #APP
2131; CHECK-NEXT:    nop
2132; CHECK-NEXT:    #NO_APP
2133; CHECK-NEXT:    psrad {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
2134; CHECK-NEXT:    retq
2135  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
2136  %2 = call <4 x i32> @llvm.x86.sse2.psra.d(<4 x i32> %a0, <4 x i32> %a1)
2137  ret <4 x i32> %2
2138}
2139declare <4 x i32> @llvm.x86.sse2.psra.d(<4 x i32>, <4 x i32>) nounwind readnone
2140
2141define <8 x i16> @stack_fold_psraw(<8 x i16> %a0, <8 x i16> %a1) {
2142; CHECK-LABEL: stack_fold_psraw:
2143; CHECK:       # %bb.0:
2144; CHECK-NEXT:    movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2145; CHECK-NEXT:    #APP
2146; CHECK-NEXT:    nop
2147; CHECK-NEXT:    #NO_APP
2148; CHECK-NEXT:    psraw {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
2149; CHECK-NEXT:    retq
2150  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
2151  %2 = call <8 x i16> @llvm.x86.sse2.psra.w(<8 x i16> %a0, <8 x i16> %a1)
2152  ret <8 x i16> %2
2153}
2154declare <8 x i16> @llvm.x86.sse2.psra.w(<8 x i16>, <8 x i16>) nounwind readnone
2155
2156define <4 x i32> @stack_fold_psrld(<4 x i32> %a0, <4 x i32> %a1) {
2157; CHECK-LABEL: stack_fold_psrld:
2158; CHECK:       # %bb.0:
2159; CHECK-NEXT:    movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2160; CHECK-NEXT:    #APP
2161; CHECK-NEXT:    nop
2162; CHECK-NEXT:    #NO_APP
2163; CHECK-NEXT:    psrld {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
2164; CHECK-NEXT:    retq
2165  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
2166  %2 = call <4 x i32> @llvm.x86.sse2.psrl.d(<4 x i32> %a0, <4 x i32> %a1)
2167  ret <4 x i32> %2
2168}
2169declare <4 x i32> @llvm.x86.sse2.psrl.d(<4 x i32>, <4 x i32>) nounwind readnone
2170
2171define <2 x i64> @stack_fold_psrlq(<2 x i64> %a0, <2 x i64> %a1) {
2172; CHECK-LABEL: stack_fold_psrlq:
2173; CHECK:       # %bb.0:
2174; CHECK-NEXT:    movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2175; CHECK-NEXT:    #APP
2176; CHECK-NEXT:    nop
2177; CHECK-NEXT:    #NO_APP
2178; CHECK-NEXT:    psrlq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
2179; CHECK-NEXT:    retq
2180  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
2181  %2 = call <2 x i64> @llvm.x86.sse2.psrl.q(<2 x i64> %a0, <2 x i64> %a1)
2182  ret <2 x i64> %2
2183}
2184declare <2 x i64> @llvm.x86.sse2.psrl.q(<2 x i64>, <2 x i64>) nounwind readnone
2185
2186define <8 x i16> @stack_fold_psrlw(<8 x i16> %a0, <8 x i16> %a1) {
2187; CHECK-LABEL: stack_fold_psrlw:
2188; CHECK:       # %bb.0:
2189; CHECK-NEXT:    movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2190; CHECK-NEXT:    #APP
2191; CHECK-NEXT:    nop
2192; CHECK-NEXT:    #NO_APP
2193; CHECK-NEXT:    psrlw {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
2194; CHECK-NEXT:    retq
2195  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
2196  %2 = call <8 x i16> @llvm.x86.sse2.psrl.w(<8 x i16> %a0, <8 x i16> %a1)
2197  ret <8 x i16> %2
2198}
2199declare <8 x i16> @llvm.x86.sse2.psrl.w(<8 x i16>, <8 x i16>) nounwind readnone
2200
2201define <16 x i8> @stack_fold_psubb(<16 x i8> %a0, <16 x i8> %a1) {
2202; CHECK-LABEL: stack_fold_psubb:
2203; CHECK:       # %bb.0:
2204; CHECK-NEXT:    movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2205; CHECK-NEXT:    #APP
2206; CHECK-NEXT:    nop
2207; CHECK-NEXT:    #NO_APP
2208; CHECK-NEXT:    psubb {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
2209; CHECK-NEXT:    retq
2210  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
2211  %2 = sub <16 x i8> %a0, %a1
2212  ret <16 x i8> %2
2213}
2214
2215define <4 x i32> @stack_fold_psubd(<4 x i32> %a0, <4 x i32> %a1) {
2216; CHECK-LABEL: stack_fold_psubd:
2217; CHECK:       # %bb.0:
2218; CHECK-NEXT:    movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2219; CHECK-NEXT:    #APP
2220; CHECK-NEXT:    nop
2221; CHECK-NEXT:    #NO_APP
2222; CHECK-NEXT:    psubd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
2223; CHECK-NEXT:    retq
2224  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
2225  %2 = sub <4 x i32> %a0, %a1
2226  ret <4 x i32> %2
2227}
2228
2229define <2 x i64> @stack_fold_psubq(<2 x i64> %a0, <2 x i64> %a1) {
2230; CHECK-LABEL: stack_fold_psubq:
2231; CHECK:       # %bb.0:
2232; CHECK-NEXT:    movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2233; CHECK-NEXT:    #APP
2234; CHECK-NEXT:    nop
2235; CHECK-NEXT:    #NO_APP
2236; CHECK-NEXT:    psubq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
2237; CHECK-NEXT:    retq
2238  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
2239  %2 = sub <2 x i64> %a0, %a1
2240  ret <2 x i64> %2
2241}
2242
2243define <16 x i8> @stack_fold_psubsb(<16 x i8> %a0, <16 x i8> %a1) {
2244; CHECK-LABEL: stack_fold_psubsb:
2245; CHECK:       # %bb.0:
2246; CHECK-NEXT:    movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2247; CHECK-NEXT:    #APP
2248; CHECK-NEXT:    nop
2249; CHECK-NEXT:    #NO_APP
2250; CHECK-NEXT:    psubsb {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
2251; CHECK-NEXT:    retq
2252  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
2253  %2 = call <16 x i8> @llvm.ssub.sat.v16i8(<16 x i8> %a0, <16 x i8> %a1)
2254  ret <16 x i8> %2
2255}
2256declare <16 x i8> @llvm.ssub.sat.v16i8(<16 x i8>, <16 x i8>) nounwind readnone
2257
2258define <8 x i16> @stack_fold_psubsw(<8 x i16> %a0, <8 x i16> %a1) {
2259; CHECK-LABEL: stack_fold_psubsw:
2260; CHECK:       # %bb.0:
2261; CHECK-NEXT:    movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2262; CHECK-NEXT:    #APP
2263; CHECK-NEXT:    nop
2264; CHECK-NEXT:    #NO_APP
2265; CHECK-NEXT:    psubsw {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
2266; CHECK-NEXT:    retq
2267  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
2268  %2 = call <8 x i16> @llvm.ssub.sat.v8i16(<8 x i16> %a0, <8 x i16> %a1)
2269  ret <8 x i16> %2
2270}
2271declare <8 x i16> @llvm.ssub.sat.v8i16(<8 x i16>, <8 x i16>) nounwind readnone
2272
2273define <16 x i8> @stack_fold_psubusb(<16 x i8> %a0, <16 x i8> %a1) {
2274; CHECK-LABEL: stack_fold_psubusb:
2275; CHECK:       # %bb.0:
2276; CHECK-NEXT:    movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2277; CHECK-NEXT:    #APP
2278; CHECK-NEXT:    nop
2279; CHECK-NEXT:    #NO_APP
2280; CHECK-NEXT:    psubusb {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
2281; CHECK-NEXT:    retq
2282  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
2283  %2 = call <16 x i8> @llvm.usub.sat.v16i8(<16 x i8> %a0, <16 x i8> %a1)
2284  ret <16 x i8> %2
2285}
2286declare <16 x i8> @llvm.usub.sat.v16i8(<16 x i8>, <16 x i8>) nounwind readnone
2287
2288define <8 x i16> @stack_fold_psubusw(<8 x i16> %a0, <8 x i16> %a1) {
2289; CHECK-LABEL: stack_fold_psubusw:
2290; CHECK:       # %bb.0:
2291; CHECK-NEXT:    movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2292; CHECK-NEXT:    #APP
2293; CHECK-NEXT:    nop
2294; CHECK-NEXT:    #NO_APP
2295; CHECK-NEXT:    psubusw {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
2296; CHECK-NEXT:    retq
2297  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
2298  %2 = call <8 x i16> @llvm.usub.sat.v8i16(<8 x i16> %a0, <8 x i16> %a1)
2299  ret <8 x i16> %2
2300}
2301declare <8 x i16> @llvm.usub.sat.v8i16(<8 x i16>, <8 x i16>) nounwind readnone
2302
2303define <8 x i16> @stack_fold_psubw(<8 x i16> %a0, <8 x i16> %a1) {
2304; CHECK-LABEL: stack_fold_psubw:
2305; CHECK:       # %bb.0:
2306; CHECK-NEXT:    movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2307; CHECK-NEXT:    #APP
2308; CHECK-NEXT:    nop
2309; CHECK-NEXT:    #NO_APP
2310; CHECK-NEXT:    psubw {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
2311; CHECK-NEXT:    retq
2312  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
2313  %2 = sub <8 x i16> %a0, %a1
2314  ret <8 x i16> %2
2315}
2316
2317define i32 @stack_fold_ptest(<2 x i64> %a0, <2 x i64> %a1) {
2318; CHECK-LABEL: stack_fold_ptest:
2319; CHECK:       # %bb.0:
2320; CHECK-NEXT:    movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2321; CHECK-NEXT:    #APP
2322; CHECK-NEXT:    nop
2323; CHECK-NEXT:    #NO_APP
2324; CHECK-NEXT:    xorl %eax, %eax
2325; CHECK-NEXT:    ptest {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
2326; CHECK-NEXT:    setb %al
2327; CHECK-NEXT:    retq
2328  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
2329  %2 = call i32 @llvm.x86.sse41.ptestc(<2 x i64> %a0, <2 x i64> %a1)
2330  ret i32 %2
2331}
2332declare i32 @llvm.x86.sse41.ptestc(<2 x i64>, <2 x i64>) nounwind readnone
2333
2334define <16 x i8> @stack_fold_punpckhbw(<16 x i8> %a0, <16 x i8> %a1) {
2335; CHECK-LABEL: stack_fold_punpckhbw:
2336; CHECK:       # %bb.0:
2337; CHECK-NEXT:    movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2338; CHECK-NEXT:    #APP
2339; CHECK-NEXT:    nop
2340; CHECK-NEXT:    #NO_APP
2341; CHECK-NEXT:    punpckhbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
2342; CHECK-NEXT:    # xmm0 = xmm0[8],mem[8],xmm0[9],mem[9],xmm0[10],mem[10],xmm0[11],mem[11],xmm0[12],mem[12],xmm0[13],mem[13],xmm0[14],mem[14],xmm0[15],mem[15]
2343; CHECK-NEXT:    retq
2344  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
2345  %2 = shufflevector <16 x i8> %a0, <16 x i8> %a1, <16 x i32> <i32 8, i32 24, i32 9, i32 25, i32 10, i32 26, i32 11, i32 27, i32 12, i32 28, i32 13, i32 29, i32 14, i32 30, i32 15, i32 31>
2346  ret <16 x i8> %2
2347}
2348
2349define <4 x i32> @stack_fold_punpckhdq(<4 x i32> %a0, <4 x i32> %a1) {
2350; CHECK-LABEL: stack_fold_punpckhdq:
2351; CHECK:       # %bb.0:
2352; CHECK-NEXT:    movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2353; CHECK-NEXT:    #APP
2354; CHECK-NEXT:    nop
2355; CHECK-NEXT:    #NO_APP
2356; CHECK-NEXT:    punpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
2357; CHECK-NEXT:    # xmm0 = xmm0[2],mem[2],xmm0[3],mem[3]
2358; CHECK-NEXT:    pcmpeqd %xmm1, %xmm1
2359; CHECK-NEXT:    psubd %xmm1, %xmm0
2360; CHECK-NEXT:    retq
2361  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
2362  %2 = shufflevector <4 x i32> %a0, <4 x i32> %a1, <4 x i32> <i32 2, i32 6, i32 3, i32 7>
2363  ; add forces execution domain
2364  %3 = add <4 x i32> %2, <i32 1, i32 1, i32 1, i32 1>
2365  ret <4 x i32> %3
2366}
2367
2368define <2 x i64> @stack_fold_punpckhqdq(<2 x i64> %a0, <2 x i64> %a1) {
2369; CHECK-LABEL: stack_fold_punpckhqdq:
2370; CHECK:       # %bb.0:
2371; CHECK-NEXT:    movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2372; CHECK-NEXT:    #APP
2373; CHECK-NEXT:    nop
2374; CHECK-NEXT:    #NO_APP
2375; CHECK-NEXT:    punpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
2376; CHECK-NEXT:    # xmm0 = xmm0[1],mem[1]
2377; CHECK-NEXT:    pcmpeqd %xmm1, %xmm1
2378; CHECK-NEXT:    psubq %xmm1, %xmm0
2379; CHECK-NEXT:    retq
2380  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
2381  %2 = shufflevector <2 x i64> %a0, <2 x i64> %a1, <2 x i32> <i32 1, i32 3>
2382  ; add forces execution domain
2383  %3 = add <2 x i64> %2, <i64 1, i64 1>
2384  ret <2 x i64> %3
2385}
2386
2387define <8 x i16> @stack_fold_punpckhwd(<8 x i16> %a0, <8 x i16> %a1) {
2388; CHECK-LABEL: stack_fold_punpckhwd:
2389; CHECK:       # %bb.0:
2390; CHECK-NEXT:    movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2391; CHECK-NEXT:    #APP
2392; CHECK-NEXT:    nop
2393; CHECK-NEXT:    #NO_APP
2394; CHECK-NEXT:    punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
2395; CHECK-NEXT:    # xmm0 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7]
2396; CHECK-NEXT:    retq
2397  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
2398  %2 = shufflevector <8 x i16> %a0, <8 x i16> %a1, <8 x i32> <i32 4, i32 12, i32 5, i32 13, i32 6, i32 14, i32 7, i32 15>
2399  ret <8 x i16> %2
2400}
2401
2402define <16 x i8> @stack_fold_punpcklbw(<16 x i8> %a0, <16 x i8> %a1) {
2403; CHECK-LABEL: stack_fold_punpcklbw:
2404; CHECK:       # %bb.0:
2405; CHECK-NEXT:    movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2406; CHECK-NEXT:    #APP
2407; CHECK-NEXT:    nop
2408; CHECK-NEXT:    #NO_APP
2409; CHECK-NEXT:    punpcklbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
2410; CHECK-NEXT:    # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3],xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7]
2411; CHECK-NEXT:    retq
2412  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
2413  %2 = shufflevector <16 x i8> %a0, <16 x i8> %a1, <16 x i32> <i32 0, i32 16, i32 1, i32 17, i32 2, i32 18, i32 3, i32 19, i32 4, i32 20, i32 5, i32 21, i32 6, i32 22, i32 7, i32 23>
2414  ret <16 x i8> %2
2415}
2416
2417define <4 x i32> @stack_fold_punpckldq(<4 x i32> %a0, <4 x i32> %a1) {
2418; CHECK-LABEL: stack_fold_punpckldq:
2419; CHECK:       # %bb.0:
2420; CHECK-NEXT:    movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2421; CHECK-NEXT:    #APP
2422; CHECK-NEXT:    nop
2423; CHECK-NEXT:    #NO_APP
2424; CHECK-NEXT:    punpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
2425; CHECK-NEXT:    # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
2426; CHECK-NEXT:    pcmpeqd %xmm1, %xmm1
2427; CHECK-NEXT:    psubd %xmm1, %xmm0
2428; CHECK-NEXT:    retq
2429  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
2430  %2 = shufflevector <4 x i32> %a0, <4 x i32> %a1, <4 x i32> <i32 0, i32 4, i32 1, i32 5>
2431  ; add forces execution domain
2432  %3 = add <4 x i32> %2, <i32 1, i32 1, i32 1, i32 1>
2433  ret <4 x i32> %3
2434}
2435
2436define <2 x i64> @stack_fold_punpcklqdq(<2 x i64> %a0, <2 x i64> %a1) {
2437; CHECK-LABEL: stack_fold_punpcklqdq:
2438; CHECK:       # %bb.0:
2439; CHECK-NEXT:    movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2440; CHECK-NEXT:    #APP
2441; CHECK-NEXT:    nop
2442; CHECK-NEXT:    #NO_APP
2443; CHECK-NEXT:    punpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
2444; CHECK-NEXT:    # xmm0 = xmm0[0],mem[0]
2445; CHECK-NEXT:    pcmpeqd %xmm1, %xmm1
2446; CHECK-NEXT:    psubq %xmm1, %xmm0
2447; CHECK-NEXT:    retq
2448  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
2449  %2 = shufflevector <2 x i64> %a0, <2 x i64> %a1, <2 x i32> <i32 0, i32 2>
2450  ; add forces execution domain
2451  %3 = add <2 x i64> %2, <i64 1, i64 1>
2452  ret <2 x i64> %3
2453}
2454
2455define <8 x i16> @stack_fold_punpcklwd(<8 x i16> %a0, <8 x i16> %a1) {
2456; CHECK-LABEL: stack_fold_punpcklwd:
2457; CHECK:       # %bb.0:
2458; CHECK-NEXT:    movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2459; CHECK-NEXT:    #APP
2460; CHECK-NEXT:    nop
2461; CHECK-NEXT:    #NO_APP
2462; CHECK-NEXT:    punpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
2463; CHECK-NEXT:    # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
2464; CHECK-NEXT:    retq
2465  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
2466  %2 = shufflevector <8 x i16> %a0, <8 x i16> %a1, <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11>
2467  ret <8 x i16> %2
2468}
2469
2470define <16 x i8> @stack_fold_pxor(<16 x i8> %a0, <16 x i8> %a1) {
2471; CHECK-LABEL: stack_fold_pxor:
2472; CHECK:       # %bb.0:
2473; CHECK-NEXT:    movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2474; CHECK-NEXT:    #APP
2475; CHECK-NEXT:    nop
2476; CHECK-NEXT:    #NO_APP
2477; CHECK-NEXT:    pxor {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
2478; CHECK-NEXT:    pcmpeqd %xmm1, %xmm1
2479; CHECK-NEXT:    psubb %xmm1, %xmm0
2480; CHECK-NEXT:    retq
2481  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
2482  %2 = xor <16 x i8> %a0, %a1
2483  ; add forces execution domain
2484  %3 = add <16 x i8> %2, <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
2485  ret <16 x i8> %3
2486}
2487