1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2; RUN: llc -O3 -disable-peephole -mtriple=x86_64-unknown-unknown -mattr=+avx,+xop < %s | FileCheck %s
3
4target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
5target triple = "x86_64-unknown-unknown"
6
7; Stack reload folding tests.
8;
9; By including a nop call with sideeffects we can force a partial register spill of the
10; relevant registers and check that the reload is correctly folded into the instruction.
11
12define <2 x double> @stack_fold_vfrczpd(<2 x double> %a0) {
13; CHECK-LABEL: stack_fold_vfrczpd:
14; CHECK:       # %bb.0:
15; CHECK-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
16; CHECK-NEXT:    #APP
17; CHECK-NEXT:    nop
18; CHECK-NEXT:    #NO_APP
19; CHECK-NEXT:    vfrczpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
20; CHECK-NEXT:    retq
21  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
22  %2 = call <2 x double> @llvm.x86.xop.vfrcz.pd(<2 x double> %a0)
23  ret <2 x double> %2
24}
25declare <2 x double> @llvm.x86.xop.vfrcz.pd(<2 x double>) nounwind readnone
26
27define <4 x double> @stack_fold_vfrczpd_ymm(<4 x double> %a0) {
28; CHECK-LABEL: stack_fold_vfrczpd_ymm:
29; CHECK:       # %bb.0:
30; CHECK-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
31; CHECK-NEXT:    #APP
32; CHECK-NEXT:    nop
33; CHECK-NEXT:    #NO_APP
34; CHECK-NEXT:    vfrczpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Folded Reload
35; CHECK-NEXT:    retq
36  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
37  %2 = call <4 x double> @llvm.x86.xop.vfrcz.pd.256(<4 x double> %a0)
38  ret <4 x double> %2
39}
40declare <4 x double> @llvm.x86.xop.vfrcz.pd.256(<4 x double>) nounwind readnone
41
42define <4 x float> @stack_fold_vfrczps(<4 x float> %a0) {
43; CHECK-LABEL: stack_fold_vfrczps:
44; CHECK:       # %bb.0:
45; CHECK-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
46; CHECK-NEXT:    #APP
47; CHECK-NEXT:    nop
48; CHECK-NEXT:    #NO_APP
49; CHECK-NEXT:    vfrczps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
50; CHECK-NEXT:    retq
51  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
52  %2 = call <4 x float> @llvm.x86.xop.vfrcz.ps(<4 x float> %a0)
53  ret <4 x float> %2
54}
55declare <4 x float> @llvm.x86.xop.vfrcz.ps(<4 x float>) nounwind readnone
56
57define <8 x float> @stack_fold_vfrczps_ymm(<8 x float> %a0) {
58; CHECK-LABEL: stack_fold_vfrczps_ymm:
59; CHECK:       # %bb.0:
60; CHECK-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
61; CHECK-NEXT:    #APP
62; CHECK-NEXT:    nop
63; CHECK-NEXT:    #NO_APP
64; CHECK-NEXT:    vfrczps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Folded Reload
65; CHECK-NEXT:    retq
66  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
67  %2 = call <8 x float> @llvm.x86.xop.vfrcz.ps.256(<8 x float> %a0)
68  ret <8 x float> %2
69}
70declare <8 x float> @llvm.x86.xop.vfrcz.ps.256(<8 x float>) nounwind readnone
71
72define <2 x double> @stack_fold_vfrczsd(<2 x double> %a0) {
73; CHECK-LABEL: stack_fold_vfrczsd:
74; CHECK:       # %bb.0:
75; CHECK-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
76; CHECK-NEXT:    #APP
77; CHECK-NEXT:    nop
78; CHECK-NEXT:    #NO_APP
79; CHECK-NEXT:    vfrczsd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
80; CHECK-NEXT:    retq
81  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
82  %2 = call <2 x double> @llvm.x86.xop.vfrcz.sd(<2 x double> %a0)
83  ret <2 x double> %2
84}
85declare <2 x double> @llvm.x86.xop.vfrcz.sd(<2 x double>) nounwind readnone
86
87define <4 x float> @stack_fold_vfrczss(<4 x float> %a0) {
88; CHECK-LABEL: stack_fold_vfrczss:
89; CHECK:       # %bb.0:
90; CHECK-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
91; CHECK-NEXT:    #APP
92; CHECK-NEXT:    nop
93; CHECK-NEXT:    #NO_APP
94; CHECK-NEXT:    vfrczss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
95; CHECK-NEXT:    retq
96  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
97  %2 = call <4 x float> @llvm.x86.xop.vfrcz.ss(<4 x float> %a0)
98  ret <4 x float> %2
99}
100declare <4 x float> @llvm.x86.xop.vfrcz.ss(<4 x float>) nounwind readnone
101
102define <2 x i64> @stack_fold_vpcmov_rm(<2 x i64> %a0, <2 x i64> %a1, <2 x i64> %a2) {
103; CHECK-LABEL: stack_fold_vpcmov_rm:
104; CHECK:       # %bb.0:
105; CHECK-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
106; CHECK-NEXT:    #APP
107; CHECK-NEXT:    nop
108; CHECK-NEXT:    #NO_APP
109; CHECK-NEXT:    vpcmov {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0, %xmm0 # 16-byte Folded Reload
110; CHECK-NEXT:    retq
111  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
112  %2 = call <2 x i64> @llvm.x86.xop.vpcmov(<2 x i64> %a0, <2 x i64> %a1, <2 x i64> %a2)
113  ret <2 x i64> %2
114}
115define <2 x i64> @stack_fold_vpcmov_mr(<2 x i64> %a0, <2 x i64> %a1, <2 x i64> %a2) {
116; CHECK-LABEL: stack_fold_vpcmov_mr:
117; CHECK:       # %bb.0:
118; CHECK-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
119; CHECK-NEXT:    #APP
120; CHECK-NEXT:    nop
121; CHECK-NEXT:    #NO_APP
122; CHECK-NEXT:    vpcmov %xmm1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
123; CHECK-NEXT:    retq
124  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
125  %2 = call <2 x i64> @llvm.x86.xop.vpcmov(<2 x i64> %a0, <2 x i64> %a2, <2 x i64> %a1)
126  ret <2 x i64> %2
127}
128declare <2 x i64> @llvm.x86.xop.vpcmov(<2 x i64>, <2 x i64>, <2 x i64>) nounwind readnone
129
130define <4 x i64> @stack_fold_vpcmov_rm_ymm(<4 x i64> %a0, <4 x i64> %a1, <4 x i64> %a2) {
131; CHECK-LABEL: stack_fold_vpcmov_rm_ymm:
132; CHECK:       # %bb.0:
133; CHECK-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
134; CHECK-NEXT:    #APP
135; CHECK-NEXT:    nop
136; CHECK-NEXT:    #NO_APP
137; CHECK-NEXT:    vpcmov {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm0, %ymm0 # 32-byte Folded Reload
138; CHECK-NEXT:    retq
139  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
140  %2 = call <4 x i64> @llvm.x86.xop.vpcmov.256(<4 x i64> %a0, <4 x i64> %a1, <4 x i64> %a2)
141  ret <4 x i64> %2
142}
143define <4 x i64> @stack_fold_vpcmov_mr_ymm(<4 x i64> %a0, <4 x i64> %a1, <4 x i64> %a2) {
144; CHECK-LABEL: stack_fold_vpcmov_mr_ymm:
145; CHECK:       # %bb.0:
146; CHECK-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
147; CHECK-NEXT:    #APP
148; CHECK-NEXT:    nop
149; CHECK-NEXT:    #NO_APP
150; CHECK-NEXT:    vpcmov %ymm1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
151; CHECK-NEXT:    retq
152  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
153  %2 = call <4 x i64> @llvm.x86.xop.vpcmov.256(<4 x i64> %a0, <4 x i64> %a2, <4 x i64> %a1)
154  ret <4 x i64> %2
155}
156declare <4 x i64> @llvm.x86.xop.vpcmov.256(<4 x i64>, <4 x i64>, <4 x i64>) nounwind readnone
157
158define <16 x i8> @stack_fold_vpcomb(<16 x i8> %a0, <16 x i8> %a1) {
159; CHECK-LABEL: stack_fold_vpcomb:
160; CHECK:       # %bb.0:
161; CHECK-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
162; CHECK-NEXT:    #APP
163; CHECK-NEXT:    nop
164; CHECK-NEXT:    #NO_APP
165; CHECK-NEXT:    vpcomltb {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
166; CHECK-NEXT:    retq
167  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
168  %2 = call <16 x i8> @llvm.x86.xop.vpcomb(<16 x i8> %a0, <16 x i8> %a1, i8 0)
169  ret <16 x i8> %2
170}
171declare <16 x i8> @llvm.x86.xop.vpcomb(<16 x i8>, <16 x i8>, i8) nounwind readnone
172
173define <4 x i32> @stack_fold_vpcomd(<4 x i32> %a0, <4 x i32> %a1) {
174; CHECK-LABEL: stack_fold_vpcomd:
175; CHECK:       # %bb.0:
176; CHECK-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
177; CHECK-NEXT:    #APP
178; CHECK-NEXT:    nop
179; CHECK-NEXT:    #NO_APP
180; CHECK-NEXT:    vpcomltd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
181; CHECK-NEXT:    retq
182  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
183  %2 = call <4 x i32> @llvm.x86.xop.vpcomd(<4 x i32> %a0, <4 x i32> %a1, i8 0)
184  ret <4 x i32> %2
185}
186declare <4 x i32> @llvm.x86.xop.vpcomd(<4 x i32>, <4 x i32>, i8) nounwind readnone
187
188define <2 x i64> @stack_fold_vpcomq(<2 x i64> %a0, <2 x i64> %a1) {
189; CHECK-LABEL: stack_fold_vpcomq:
190; CHECK:       # %bb.0:
191; CHECK-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
192; CHECK-NEXT:    #APP
193; CHECK-NEXT:    nop
194; CHECK-NEXT:    #NO_APP
195; CHECK-NEXT:    vpcomltq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
196; CHECK-NEXT:    retq
197  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
198  %2 = call <2 x i64> @llvm.x86.xop.vpcomq(<2 x i64> %a0, <2 x i64> %a1, i8 0)
199  ret <2 x i64> %2
200}
201declare <2 x i64> @llvm.x86.xop.vpcomq(<2 x i64>, <2 x i64>, i8) nounwind readnone
202
203define <16 x i8> @stack_fold_vpcomub(<16 x i8> %a0, <16 x i8> %a1) {
204; CHECK-LABEL: stack_fold_vpcomub:
205; CHECK:       # %bb.0:
206; CHECK-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
207; CHECK-NEXT:    #APP
208; CHECK-NEXT:    nop
209; CHECK-NEXT:    #NO_APP
210; CHECK-NEXT:    vpcomltub {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
211; CHECK-NEXT:    retq
212  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
213  %2 = call <16 x i8> @llvm.x86.xop.vpcomub(<16 x i8> %a0, <16 x i8> %a1, i8 0)
214  ret <16 x i8> %2
215}
216declare <16 x i8> @llvm.x86.xop.vpcomub(<16 x i8>, <16 x i8>, i8) nounwind readnone
217
218define <4 x i32> @stack_fold_vpcomud(<4 x i32> %a0, <4 x i32> %a1) {
219; CHECK-LABEL: stack_fold_vpcomud:
220; CHECK:       # %bb.0:
221; CHECK-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
222; CHECK-NEXT:    #APP
223; CHECK-NEXT:    nop
224; CHECK-NEXT:    #NO_APP
225; CHECK-NEXT:    vpcomltud {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
226; CHECK-NEXT:    retq
227  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
228  %2 = call <4 x i32> @llvm.x86.xop.vpcomud(<4 x i32> %a0, <4 x i32> %a1, i8 0)
229  ret <4 x i32> %2
230}
231declare <4 x i32> @llvm.x86.xop.vpcomud(<4 x i32>, <4 x i32>, i8) nounwind readnone
232
233define <2 x i64> @stack_fold_vpcomuq(<2 x i64> %a0, <2 x i64> %a1) {
234; CHECK-LABEL: stack_fold_vpcomuq:
235; CHECK:       # %bb.0:
236; CHECK-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
237; CHECK-NEXT:    #APP
238; CHECK-NEXT:    nop
239; CHECK-NEXT:    #NO_APP
240; CHECK-NEXT:    vpcomltuq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
241; CHECK-NEXT:    retq
242  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
243  %2 = call <2 x i64> @llvm.x86.xop.vpcomuq(<2 x i64> %a0, <2 x i64> %a1, i8 0)
244  ret <2 x i64> %2
245}
246declare <2 x i64> @llvm.x86.xop.vpcomuq(<2 x i64>, <2 x i64>, i8) nounwind readnone
247
248define <8 x i16> @stack_fold_vpcomuw(<8 x i16> %a0, <8 x i16> %a1) {
249; CHECK-LABEL: stack_fold_vpcomuw:
250; CHECK:       # %bb.0:
251; CHECK-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
252; CHECK-NEXT:    #APP
253; CHECK-NEXT:    nop
254; CHECK-NEXT:    #NO_APP
255; CHECK-NEXT:    vpcomltuw {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
256; CHECK-NEXT:    retq
257  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
258  %2 = call <8 x i16> @llvm.x86.xop.vpcomuw(<8 x i16> %a0, <8 x i16> %a1, i8 0)
259  ret <8 x i16> %2
260}
261declare <8 x i16> @llvm.x86.xop.vpcomuw(<8 x i16>, <8 x i16>, i8) nounwind readnone
262
263define <8 x i16> @stack_fold_vpcomw(<8 x i16> %a0, <8 x i16> %a1) {
264; CHECK-LABEL: stack_fold_vpcomw:
265; CHECK:       # %bb.0:
266; CHECK-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
267; CHECK-NEXT:    #APP
268; CHECK-NEXT:    nop
269; CHECK-NEXT:    #NO_APP
270; CHECK-NEXT:    vpcomltw {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
271; CHECK-NEXT:    retq
272  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
273  %2 = call <8 x i16> @llvm.x86.xop.vpcomw(<8 x i16> %a0, <8 x i16> %a1, i8 0)
274  ret <8 x i16> %2
275}
276declare <8 x i16> @llvm.x86.xop.vpcomw(<8 x i16>, <8 x i16>, i8) nounwind readnone
277
278define <2 x double> @stack_fold_vpermil2pd_rm(<2 x double> %a0, <2 x double> %a1, <2 x i64> %a2) {
279; CHECK-LABEL: stack_fold_vpermil2pd_rm:
280; CHECK:       # %bb.0:
281; CHECK-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
282; CHECK-NEXT:    #APP
283; CHECK-NEXT:    nop
284; CHECK-NEXT:    #NO_APP
285; CHECK-NEXT:    vpermil2pd $0, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0, %xmm0 # 16-byte Folded Reload
286; CHECK-NEXT:    retq
287  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
288  %2 = call <2 x double> @llvm.x86.xop.vpermil2pd(<2 x double> %a0, <2 x double> %a1, <2 x i64> %a2, i8 0)
289  ret <2 x double> %2
290}
291define <2 x double> @stack_fold_vpermil2pd_mr(<2 x double> %a0, <2 x i64> %a1, <2 x double> %a2) {
292; CHECK-LABEL: stack_fold_vpermil2pd_mr:
293; CHECK:       # %bb.0:
294; CHECK-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
295; CHECK-NEXT:    #APP
296; CHECK-NEXT:    nop
297; CHECK-NEXT:    #NO_APP
298; CHECK-NEXT:    vpermil2pd $0, %xmm1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
299; CHECK-NEXT:    retq
300  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
301  %2 = call <2 x double> @llvm.x86.xop.vpermil2pd(<2 x double> %a0, <2 x double> %a2, <2 x i64> %a1, i8 0)
302  ret <2 x double> %2
303}
304declare <2 x double> @llvm.x86.xop.vpermil2pd(<2 x double>, <2 x double>, <2 x i64>, i8) nounwind readnone
305
306define <4 x double> @stack_fold_vpermil2pd_rm_ymm(<4 x double> %a0, <4 x double> %a1, <4 x i64> %a2) {
307; CHECK-LABEL: stack_fold_vpermil2pd_rm_ymm:
308; CHECK:       # %bb.0:
309; CHECK-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
310; CHECK-NEXT:    #APP
311; CHECK-NEXT:    nop
312; CHECK-NEXT:    #NO_APP
313; CHECK-NEXT:    vpermil2pd $0, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm0, %ymm0 # 32-byte Folded Reload
314; CHECK-NEXT:    retq
315  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
316  %2 = call <4 x double> @llvm.x86.xop.vpermil2pd.256(<4 x double> %a0, <4 x double> %a1, <4 x i64> %a2, i8 0)
317  ret <4 x double> %2
318}
319define <4 x double> @stack_fold_vpermil2pd_mr_ymm(<4 x double> %a0, <4 x i64> %a1, <4 x double> %a2) {
320; CHECK-LABEL: stack_fold_vpermil2pd_mr_ymm:
321; CHECK:       # %bb.0:
322; CHECK-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
323; CHECK-NEXT:    #APP
324; CHECK-NEXT:    nop
325; CHECK-NEXT:    #NO_APP
326; CHECK-NEXT:    vpermil2pd $0, %ymm1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
327; CHECK-NEXT:    retq
328  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
329  %2 = call <4 x double> @llvm.x86.xop.vpermil2pd.256(<4 x double> %a0, <4 x double> %a2, <4 x i64> %a1, i8 0)
330  ret <4 x double> %2
331}
332declare <4 x double> @llvm.x86.xop.vpermil2pd.256(<4 x double>, <4 x double>, <4 x i64>, i8) nounwind readnone
333
334define <4 x float> @stack_fold_vpermil2ps_rm(<4 x float> %a0, <4 x float> %a1, <4 x i32> %a2) {
335; CHECK-LABEL: stack_fold_vpermil2ps_rm:
336; CHECK:       # %bb.0:
337; CHECK-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
338; CHECK-NEXT:    #APP
339; CHECK-NEXT:    nop
340; CHECK-NEXT:    #NO_APP
341; CHECK-NEXT:    vpermil2ps $0, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0, %xmm0 # 16-byte Folded Reload
342; CHECK-NEXT:    retq
343  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
344  %2 = call <4 x float> @llvm.x86.xop.vpermil2ps(<4 x float> %a0, <4 x float> %a1, <4 x i32> %a2, i8 0)
345  ret <4 x float> %2
346}
347define <4 x float> @stack_fold_vpermil2ps_mr(<4 x float> %a0, <4 x i32> %a1, <4 x float> %a2) {
348; CHECK-LABEL: stack_fold_vpermil2ps_mr:
349; CHECK:       # %bb.0:
350; CHECK-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
351; CHECK-NEXT:    #APP
352; CHECK-NEXT:    nop
353; CHECK-NEXT:    #NO_APP
354; CHECK-NEXT:    vpermil2ps $0, %xmm1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
355; CHECK-NEXT:    retq
356  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
357  %2 = call <4 x float> @llvm.x86.xop.vpermil2ps(<4 x float> %a0, <4 x float> %a2, <4 x i32> %a1, i8 0)
358  ret <4 x float> %2
359}
360declare <4 x float> @llvm.x86.xop.vpermil2ps(<4 x float>, <4 x float>, <4 x i32>, i8) nounwind readnone
361
362define <8 x float> @stack_fold_vpermil2ps_rm_ymm(<8 x float> %a0, <8 x float> %a1, <8 x i32> %a2) {
363; CHECK-LABEL: stack_fold_vpermil2ps_rm_ymm:
364; CHECK:       # %bb.0:
365; CHECK-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
366; CHECK-NEXT:    #APP
367; CHECK-NEXT:    nop
368; CHECK-NEXT:    #NO_APP
369; CHECK-NEXT:    vpermil2ps $0, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm0, %ymm0 # 32-byte Folded Reload
370; CHECK-NEXT:    retq
371  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
372  %2 = call <8 x float> @llvm.x86.xop.vpermil2ps.256(<8 x float> %a0, <8 x float> %a1, <8 x i32> %a2, i8 0)
373  ret <8 x float> %2
374}
375define <8 x float> @stack_fold_vpermil2ps_mr_ymm(<8 x float> %a0, <8 x i32> %a1, <8 x float> %a2) {
376; CHECK-LABEL: stack_fold_vpermil2ps_mr_ymm:
377; CHECK:       # %bb.0:
378; CHECK-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
379; CHECK-NEXT:    #APP
380; CHECK-NEXT:    nop
381; CHECK-NEXT:    #NO_APP
382; CHECK-NEXT:    vpermil2ps $0, %ymm1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
383; CHECK-NEXT:    retq
384  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
385  %2 = call <8 x float> @llvm.x86.xop.vpermil2ps.256(<8 x float> %a0, <8 x float> %a2, <8 x i32> %a1, i8 0)
386  ret <8 x float> %2
387}
388declare <8 x float> @llvm.x86.xop.vpermil2ps.256(<8 x float>, <8 x float>, <8 x i32>, i8) nounwind readnone
389
390define <4 x i32> @stack_fold_vphaddbd(<16 x i8> %a0) {
391; CHECK-LABEL: stack_fold_vphaddbd:
392; CHECK:       # %bb.0:
393; CHECK-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
394; CHECK-NEXT:    #APP
395; CHECK-NEXT:    nop
396; CHECK-NEXT:    #NO_APP
397; CHECK-NEXT:    vphaddbd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
398; CHECK-NEXT:    retq
399  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
400  %2 = call <4 x i32> @llvm.x86.xop.vphaddbd(<16 x i8> %a0)
401  ret <4 x i32> %2
402}
403declare <4 x i32> @llvm.x86.xop.vphaddbd(<16 x i8>) nounwind readnone
404
405define <2 x i64> @stack_fold_vphaddbq(<16 x i8> %a0) {
406; CHECK-LABEL: stack_fold_vphaddbq:
407; CHECK:       # %bb.0:
408; CHECK-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
409; CHECK-NEXT:    #APP
410; CHECK-NEXT:    nop
411; CHECK-NEXT:    #NO_APP
412; CHECK-NEXT:    vphaddbq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
413; CHECK-NEXT:    retq
414  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
415  %2 = call <2 x i64> @llvm.x86.xop.vphaddbq(<16 x i8> %a0)
416  ret <2 x i64> %2
417}
418declare <2 x i64> @llvm.x86.xop.vphaddbq(<16 x i8>) nounwind readnone
419
420define <8 x i16> @stack_fold_vphaddbw(<16 x i8> %a0) {
421; CHECK-LABEL: stack_fold_vphaddbw:
422; CHECK:       # %bb.0:
423; CHECK-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
424; CHECK-NEXT:    #APP
425; CHECK-NEXT:    nop
426; CHECK-NEXT:    #NO_APP
427; CHECK-NEXT:    vphaddbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
428; CHECK-NEXT:    retq
429  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
430  %2 = call <8 x i16> @llvm.x86.xop.vphaddbw(<16 x i8> %a0)
431  ret <8 x i16> %2
432}
433declare <8 x i16> @llvm.x86.xop.vphaddbw(<16 x i8>) nounwind readnone
434
435define <2 x i64> @stack_fold_vphadddq(<4 x i32> %a0) {
436; CHECK-LABEL: stack_fold_vphadddq:
437; CHECK:       # %bb.0:
438; CHECK-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
439; CHECK-NEXT:    #APP
440; CHECK-NEXT:    nop
441; CHECK-NEXT:    #NO_APP
442; CHECK-NEXT:    vphadddq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
443; CHECK-NEXT:    retq
444  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
445  %2 = call <2 x i64> @llvm.x86.xop.vphadddq(<4 x i32> %a0)
446  ret <2 x i64> %2
447}
448declare <2 x i64> @llvm.x86.xop.vphadddq(<4 x i32>) nounwind readnone
449
450define <4 x i32> @stack_fold_vphaddubd(<16 x i8> %a0) {
451; CHECK-LABEL: stack_fold_vphaddubd:
452; CHECK:       # %bb.0:
453; CHECK-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
454; CHECK-NEXT:    #APP
455; CHECK-NEXT:    nop
456; CHECK-NEXT:    #NO_APP
457; CHECK-NEXT:    vphaddubd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
458; CHECK-NEXT:    retq
459  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
460  %2 = call <4 x i32> @llvm.x86.xop.vphaddubd(<16 x i8> %a0)
461  ret <4 x i32> %2
462}
463declare <4 x i32> @llvm.x86.xop.vphaddubd(<16 x i8>) nounwind readnone
464
465define <2 x i64> @stack_fold_vphaddubq(<16 x i8> %a0) {
466; CHECK-LABEL: stack_fold_vphaddubq:
467; CHECK:       # %bb.0:
468; CHECK-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
469; CHECK-NEXT:    #APP
470; CHECK-NEXT:    nop
471; CHECK-NEXT:    #NO_APP
472; CHECK-NEXT:    vphaddubq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
473; CHECK-NEXT:    retq
474  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
475  %2 = call <2 x i64> @llvm.x86.xop.vphaddubq(<16 x i8> %a0)
476  ret <2 x i64> %2
477}
478declare <2 x i64> @llvm.x86.xop.vphaddubq(<16 x i8>) nounwind readnone
479
480define <8 x i16> @stack_fold_vphaddubw(<16 x i8> %a0) {
481; CHECK-LABEL: stack_fold_vphaddubw:
482; CHECK:       # %bb.0:
483; CHECK-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
484; CHECK-NEXT:    #APP
485; CHECK-NEXT:    nop
486; CHECK-NEXT:    #NO_APP
487; CHECK-NEXT:    vphaddubw {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
488; CHECK-NEXT:    retq
489  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
490  %2 = call <8 x i16> @llvm.x86.xop.vphaddubw(<16 x i8> %a0)
491  ret <8 x i16> %2
492}
493declare <8 x i16> @llvm.x86.xop.vphaddubw(<16 x i8>) nounwind readnone
494
495define <2 x i64> @stack_fold_vphaddudq(<4 x i32> %a0) {
496; CHECK-LABEL: stack_fold_vphaddudq:
497; CHECK:       # %bb.0:
498; CHECK-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
499; CHECK-NEXT:    #APP
500; CHECK-NEXT:    nop
501; CHECK-NEXT:    #NO_APP
502; CHECK-NEXT:    vphaddudq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
503; CHECK-NEXT:    retq
504  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
505  %2 = call <2 x i64> @llvm.x86.xop.vphaddudq(<4 x i32> %a0)
506  ret <2 x i64> %2
507}
508declare <2 x i64> @llvm.x86.xop.vphaddudq(<4 x i32>) nounwind readnone
509
510define <4 x i32> @stack_fold_vphadduwd(<8 x i16> %a0) {
511; CHECK-LABEL: stack_fold_vphadduwd:
512; CHECK:       # %bb.0:
513; CHECK-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
514; CHECK-NEXT:    #APP
515; CHECK-NEXT:    nop
516; CHECK-NEXT:    #NO_APP
517; CHECK-NEXT:    vphadduwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
518; CHECK-NEXT:    retq
519  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
520  %2 = call <4 x i32> @llvm.x86.xop.vphadduwd(<8 x i16> %a0)
521  ret <4 x i32> %2
522}
523declare <4 x i32> @llvm.x86.xop.vphadduwd(<8 x i16>) nounwind readnone
524
525define <2 x i64> @stack_fold_vphadduwq(<8 x i16> %a0) {
526; CHECK-LABEL: stack_fold_vphadduwq:
527; CHECK:       # %bb.0:
528; CHECK-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
529; CHECK-NEXT:    #APP
530; CHECK-NEXT:    nop
531; CHECK-NEXT:    #NO_APP
532; CHECK-NEXT:    vphadduwq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
533; CHECK-NEXT:    retq
534  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
535  %2 = call <2 x i64> @llvm.x86.xop.vphadduwq(<8 x i16> %a0)
536  ret <2 x i64> %2
537}
538declare <2 x i64> @llvm.x86.xop.vphadduwq(<8 x i16>) nounwind readnone
539
540define <4 x i32> @stack_fold_vphaddwd(<8 x i16> %a0) {
541; CHECK-LABEL: stack_fold_vphaddwd:
542; CHECK:       # %bb.0:
543; CHECK-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
544; CHECK-NEXT:    #APP
545; CHECK-NEXT:    nop
546; CHECK-NEXT:    #NO_APP
547; CHECK-NEXT:    vphaddwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
548; CHECK-NEXT:    retq
549  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
550  %2 = call <4 x i32> @llvm.x86.xop.vphaddwd(<8 x i16> %a0)
551  ret <4 x i32> %2
552}
553declare <4 x i32> @llvm.x86.xop.vphaddwd(<8 x i16>) nounwind readnone
554
555define <2 x i64> @stack_fold_vphaddwq(<8 x i16> %a0) {
556; CHECK-LABEL: stack_fold_vphaddwq:
557; CHECK:       # %bb.0:
558; CHECK-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
559; CHECK-NEXT:    #APP
560; CHECK-NEXT:    nop
561; CHECK-NEXT:    #NO_APP
562; CHECK-NEXT:    vphaddwq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
563; CHECK-NEXT:    retq
564  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
565  %2 = call <2 x i64> @llvm.x86.xop.vphaddwq(<8 x i16> %a0)
566  ret <2 x i64> %2
567}
568declare <2 x i64> @llvm.x86.xop.vphaddwq(<8 x i16>) nounwind readnone
569
570define <8 x i16> @stack_fold_vphsubbw(<16 x i8> %a0) {
571; CHECK-LABEL: stack_fold_vphsubbw:
572; CHECK:       # %bb.0:
573; CHECK-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
574; CHECK-NEXT:    #APP
575; CHECK-NEXT:    nop
576; CHECK-NEXT:    #NO_APP
577; CHECK-NEXT:    vphsubbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
578; CHECK-NEXT:    retq
579  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
580  %2 = call <8 x i16> @llvm.x86.xop.vphsubbw(<16 x i8> %a0)
581  ret <8 x i16> %2
582}
583declare <8 x i16> @llvm.x86.xop.vphsubbw(<16 x i8>) nounwind readnone
584
585define <2 x i64> @stack_fold_vphsubdq(<4 x i32> %a0) {
586; CHECK-LABEL: stack_fold_vphsubdq:
587; CHECK:       # %bb.0:
588; CHECK-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
589; CHECK-NEXT:    #APP
590; CHECK-NEXT:    nop
591; CHECK-NEXT:    #NO_APP
592; CHECK-NEXT:    vphsubdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
593; CHECK-NEXT:    retq
594  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
595  %2 = call <2 x i64> @llvm.x86.xop.vphsubdq(<4 x i32> %a0)
596  ret <2 x i64> %2
597}
598declare <2 x i64> @llvm.x86.xop.vphsubdq(<4 x i32>) nounwind readnone
599
600define <4 x i32> @stack_fold_vphsubwd(<8 x i16> %a0) {
601; CHECK-LABEL: stack_fold_vphsubwd:
602; CHECK:       # %bb.0:
603; CHECK-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
604; CHECK-NEXT:    #APP
605; CHECK-NEXT:    nop
606; CHECK-NEXT:    #NO_APP
607; CHECK-NEXT:    vphsubwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
608; CHECK-NEXT:    retq
609  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
610  %2 = call <4 x i32> @llvm.x86.xop.vphsubwd(<8 x i16> %a0)
611  ret <4 x i32> %2
612}
613declare <4 x i32> @llvm.x86.xop.vphsubwd(<8 x i16>) nounwind readnone
614
615define <4 x i32> @stack_fold_vpmacsdd(<4 x i32> %a0, <4 x i32> %a1, <4 x i32> %a2) {
616; CHECK-LABEL: stack_fold_vpmacsdd:
617; CHECK:       # %bb.0:
618; CHECK-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
619; CHECK-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
620; CHECK-NEXT:    #APP
621; CHECK-NEXT:    nop
622; CHECK-NEXT:    #NO_APP
623; CHECK-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
624; CHECK-NEXT:    vpmacsdd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
625; CHECK-NEXT:    retq
626  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
627  %2 = call <4 x i32> @llvm.x86.xop.vpmacsdd(<4 x i32> %a0, <4 x i32> %a1, <4 x i32> %a2)
628  ret <4 x i32> %2
629}
630declare <4 x i32> @llvm.x86.xop.vpmacsdd(<4 x i32>, <4 x i32>, <4 x i32>) nounwind readnone
631
632define <2 x i64> @stack_fold_vpmacsdqh(<4 x i32> %a0, <4 x i32> %a1, <2 x i64> %a2) {
633; CHECK-LABEL: stack_fold_vpmacsdqh:
634; CHECK:       # %bb.0:
635; CHECK-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
636; CHECK-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
637; CHECK-NEXT:    #APP
638; CHECK-NEXT:    nop
639; CHECK-NEXT:    #NO_APP
640; CHECK-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
641; CHECK-NEXT:    vpmacsdqh %xmm1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
642; CHECK-NEXT:    retq
643  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
644  %2 = call <2 x i64> @llvm.x86.xop.vpmacsdqh(<4 x i32> %a0, <4 x i32> %a1, <2 x i64> %a2)
645  ret <2 x i64> %2
646}
647declare <2 x i64> @llvm.x86.xop.vpmacsdqh(<4 x i32>, <4 x i32>, <2 x i64>) nounwind readnone
648
649define <2 x i64> @stack_fold_vpmacsdql(<4 x i32> %a0, <4 x i32> %a1, <2 x i64> %a2) {
650; CHECK-LABEL: stack_fold_vpmacsdql:
651; CHECK:       # %bb.0:
652; CHECK-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
653; CHECK-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
654; CHECK-NEXT:    #APP
655; CHECK-NEXT:    nop
656; CHECK-NEXT:    #NO_APP
657; CHECK-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
658; CHECK-NEXT:    vpmacsdql %xmm1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
659; CHECK-NEXT:    retq
660  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
661  %2 = call <2 x i64> @llvm.x86.xop.vpmacsdql(<4 x i32> %a0, <4 x i32> %a1, <2 x i64> %a2)
662  ret <2 x i64> %2
663}
664declare <2 x i64> @llvm.x86.xop.vpmacsdql(<4 x i32>, <4 x i32>, <2 x i64>) nounwind readnone
665
666define <4 x i32> @stack_fold_vpmacssdd(<4 x i32> %a0, <4 x i32> %a1, <4 x i32> %a2) {
667; CHECK-LABEL: stack_fold_vpmacssdd:
668; CHECK:       # %bb.0:
669; CHECK-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
670; CHECK-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
671; CHECK-NEXT:    #APP
672; CHECK-NEXT:    nop
673; CHECK-NEXT:    #NO_APP
674; CHECK-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
675; CHECK-NEXT:    vpmacssdd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
676; CHECK-NEXT:    retq
677  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
678  %2 = call <4 x i32> @llvm.x86.xop.vpmacssdd(<4 x i32> %a0, <4 x i32> %a1, <4 x i32> %a2)
679  ret <4 x i32> %2
680}
681declare <4 x i32> @llvm.x86.xop.vpmacssdd(<4 x i32>, <4 x i32>, <4 x i32>) nounwind readnone
682
683define <2 x i64> @stack_fold_vpmacssdqh(<4 x i32> %a0, <4 x i32> %a1, <2 x i64> %a2) {
684; CHECK-LABEL: stack_fold_vpmacssdqh:
685; CHECK:       # %bb.0:
686; CHECK-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
687; CHECK-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
688; CHECK-NEXT:    #APP
689; CHECK-NEXT:    nop
690; CHECK-NEXT:    #NO_APP
691; CHECK-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
692; CHECK-NEXT:    vpmacssdqh %xmm1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
693; CHECK-NEXT:    retq
694  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
695  %2 = call <2 x i64> @llvm.x86.xop.vpmacssdqh(<4 x i32> %a0, <4 x i32> %a1, <2 x i64> %a2)
696  ret <2 x i64> %2
697}
698declare <2 x i64> @llvm.x86.xop.vpmacssdqh(<4 x i32>, <4 x i32>, <2 x i64>) nounwind readnone
699
700define <2 x i64> @stack_fold_vpmacssdql(<4 x i32> %a0, <4 x i32> %a1, <2 x i64> %a2) {
701; CHECK-LABEL: stack_fold_vpmacssdql:
702; CHECK:       # %bb.0:
703; CHECK-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
704; CHECK-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
705; CHECK-NEXT:    #APP
706; CHECK-NEXT:    nop
707; CHECK-NEXT:    #NO_APP
708; CHECK-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
709; CHECK-NEXT:    vpmacssdql %xmm1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
710; CHECK-NEXT:    retq
711  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
712  %2 = call <2 x i64> @llvm.x86.xop.vpmacssdql(<4 x i32> %a0, <4 x i32> %a1, <2 x i64> %a2)
713  ret <2 x i64> %2
714}
715declare <2 x i64> @llvm.x86.xop.vpmacssdql(<4 x i32>, <4 x i32>, <2 x i64>) nounwind readnone
716
717define <4 x i32> @stack_fold_vpmacsswd(<8 x i16> %a0, <8 x i16> %a1, <4 x i32> %a2) {
718; CHECK-LABEL: stack_fold_vpmacsswd:
719; CHECK:       # %bb.0:
720; CHECK-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
721; CHECK-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
722; CHECK-NEXT:    #APP
723; CHECK-NEXT:    nop
724; CHECK-NEXT:    #NO_APP
725; CHECK-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
726; CHECK-NEXT:    vpmacsswd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
727; CHECK-NEXT:    retq
728  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
729  %2 = call <4 x i32> @llvm.x86.xop.vpmacsswd(<8 x i16> %a0, <8 x i16> %a1, <4 x i32> %a2)
730  ret <4 x i32> %2
731}
732declare <4 x i32> @llvm.x86.xop.vpmacsswd(<8 x i16>, <8 x i16>, <4 x i32>) nounwind readnone
733
734define <8 x i16> @stack_fold_vpmacssww(<8 x i16> %a0, <8 x i16> %a1, <8 x i16> %a2) {
735; CHECK-LABEL: stack_fold_vpmacssww:
736; CHECK:       # %bb.0:
737; CHECK-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
738; CHECK-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
739; CHECK-NEXT:    #APP
740; CHECK-NEXT:    nop
741; CHECK-NEXT:    #NO_APP
742; CHECK-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
743; CHECK-NEXT:    vpmacssww %xmm1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
744; CHECK-NEXT:    retq
745  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
746  %2 = call <8 x i16> @llvm.x86.xop.vpmacssww(<8 x i16> %a0, <8 x i16> %a1, <8 x i16> %a2)
747  ret <8 x i16> %2
748}
749declare <8 x i16> @llvm.x86.xop.vpmacssww(<8 x i16>, <8 x i16>, <8 x i16>) nounwind readnone
750
751define <4 x i32> @stack_fold_vpmacswd(<8 x i16> %a0, <8 x i16> %a1, <4 x i32> %a2) {
752; CHECK-LABEL: stack_fold_vpmacswd:
753; CHECK:       # %bb.0:
754; CHECK-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
755; CHECK-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
756; CHECK-NEXT:    #APP
757; CHECK-NEXT:    nop
758; CHECK-NEXT:    #NO_APP
759; CHECK-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
760; CHECK-NEXT:    vpmacswd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
761; CHECK-NEXT:    retq
762  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
763  %2 = call <4 x i32> @llvm.x86.xop.vpmacswd(<8 x i16> %a0, <8 x i16> %a1, <4 x i32> %a2)
764  ret <4 x i32> %2
765}
766declare <4 x i32> @llvm.x86.xop.vpmacswd(<8 x i16>, <8 x i16>, <4 x i32>) nounwind readnone
767
768define <8 x i16> @stack_fold_vpmacsww(<8 x i16> %a0, <8 x i16> %a1, <8 x i16> %a2) {
769; CHECK-LABEL: stack_fold_vpmacsww:
770; CHECK:       # %bb.0:
771; CHECK-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
772; CHECK-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
773; CHECK-NEXT:    #APP
774; CHECK-NEXT:    nop
775; CHECK-NEXT:    #NO_APP
776; CHECK-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
777; CHECK-NEXT:    vpmacsww %xmm1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
778; CHECK-NEXT:    retq
779  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
780  %2 = call <8 x i16> @llvm.x86.xop.vpmacsww(<8 x i16> %a0, <8 x i16> %a1, <8 x i16> %a2)
781  ret <8 x i16> %2
782}
783declare <8 x i16> @llvm.x86.xop.vpmacsww(<8 x i16>, <8 x i16>, <8 x i16>) nounwind readnone
784
785define <4 x i32> @stack_fold_vpmadcsswd(<8 x i16> %a0, <8 x i16> %a1, <4 x i32> %a2) {
786; CHECK-LABEL: stack_fold_vpmadcsswd:
787; CHECK:       # %bb.0:
788; CHECK-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
789; CHECK-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
790; CHECK-NEXT:    #APP
791; CHECK-NEXT:    nop
792; CHECK-NEXT:    #NO_APP
793; CHECK-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
794; CHECK-NEXT:    vpmadcsswd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
795; CHECK-NEXT:    retq
796  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
797  %2 = call <4 x i32> @llvm.x86.xop.vpmadcsswd(<8 x i16> %a0, <8 x i16> %a1, <4 x i32> %a2)
798  ret <4 x i32> %2
799}
800declare <4 x i32> @llvm.x86.xop.vpmadcsswd(<8 x i16>, <8 x i16>, <4 x i32>) nounwind readnone
801
802define <4 x i32> @stack_fold_vpmadcswd(<8 x i16> %a0, <8 x i16> %a1, <4 x i32> %a2) {
803; CHECK-LABEL: stack_fold_vpmadcswd:
804; CHECK:       # %bb.0:
805; CHECK-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
806; CHECK-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
807; CHECK-NEXT:    #APP
808; CHECK-NEXT:    nop
809; CHECK-NEXT:    #NO_APP
810; CHECK-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
811; CHECK-NEXT:    vpmadcswd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
812; CHECK-NEXT:    retq
813  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
814  %2 = call <4 x i32> @llvm.x86.xop.vpmadcswd(<8 x i16> %a0, <8 x i16> %a1, <4 x i32> %a2)
815  ret <4 x i32> %2
816}
817declare <4 x i32> @llvm.x86.xop.vpmadcswd(<8 x i16>, <8 x i16>, <4 x i32>) nounwind readnone
818
819define <16 x i8> @stack_fold_vpperm_rm(<16 x i8> %a0, <16 x i8> %a1, <16 x i8> %a2) {
820; CHECK-LABEL: stack_fold_vpperm_rm:
821; CHECK:       # %bb.0:
822; CHECK-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
823; CHECK-NEXT:    #APP
824; CHECK-NEXT:    nop
825; CHECK-NEXT:    #NO_APP
826; CHECK-NEXT:    vpperm {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0, %xmm0 # 16-byte Folded Reload
827; CHECK-NEXT:    retq
828  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
829  %2 = call <16 x i8> @llvm.x86.xop.vpperm(<16 x i8> %a0, <16 x i8> %a1, <16 x i8> %a2)
830  ret <16 x i8> %2
831}
832define <16 x i8> @stack_fold_vpperm_mr(<16 x i8> %a0, <16 x i8> %a1, <16 x i8> %a2) {
833; CHECK-LABEL: stack_fold_vpperm_mr:
834; CHECK:       # %bb.0:
835; CHECK-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
836; CHECK-NEXT:    #APP
837; CHECK-NEXT:    nop
838; CHECK-NEXT:    #NO_APP
839; CHECK-NEXT:    vpperm %xmm1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
840; CHECK-NEXT:    retq
841  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
842  %2 = call <16 x i8> @llvm.x86.xop.vpperm(<16 x i8> %a0, <16 x i8> %a2, <16 x i8> %a1)
843  ret <16 x i8> %2
844}
845declare <16 x i8> @llvm.x86.xop.vpperm(<16 x i8>, <16 x i8>, <16 x i8>) nounwind readnone
846
847define <16 x i8> @stack_fold_vprotb(<16 x i8> %a0) {
848; CHECK-LABEL: stack_fold_vprotb:
849; CHECK:       # %bb.0:
850; CHECK-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
851; CHECK-NEXT:    #APP
852; CHECK-NEXT:    nop
853; CHECK-NEXT:    #NO_APP
854; CHECK-NEXT:    vprotb $7, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
855; CHECK-NEXT:    retq
856  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
857  %2 = call <16 x i8> @llvm.x86.xop.vprotbi(<16 x i8> %a0, i8 7)
858  ret <16 x i8> %2
859}
860declare <16 x i8> @llvm.x86.xop.vprotbi(<16 x i8>, i8) nounwind readnone
861
862define <16 x i8> @stack_fold_vprotb_rm(<16 x i8> %a0, <16 x i8> %a1) {
863; CHECK-LABEL: stack_fold_vprotb_rm:
864; CHECK:       # %bb.0:
865; CHECK-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
866; CHECK-NEXT:    #APP
867; CHECK-NEXT:    nop
868; CHECK-NEXT:    #NO_APP
869; CHECK-NEXT:    vprotb {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
870; CHECK-NEXT:    retq
871  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
872  %2 = call <16 x i8> @llvm.x86.xop.vprotb(<16 x i8> %a0, <16 x i8> %a1)
873  ret <16 x i8> %2
874}
875define <16 x i8> @stack_fold_vprotb_mr(<16 x i8> %a0, <16 x i8> %a1) {
876; CHECK-LABEL: stack_fold_vprotb_mr:
877; CHECK:       # %bb.0:
878; CHECK-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
879; CHECK-NEXT:    #APP
880; CHECK-NEXT:    nop
881; CHECK-NEXT:    #NO_APP
882; CHECK-NEXT:    vprotb %xmm0, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
883; CHECK-NEXT:    retq
884  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
885  %2 = call <16 x i8> @llvm.x86.xop.vprotb(<16 x i8> %a1, <16 x i8> %a0)
886  ret <16 x i8> %2
887}
888declare <16 x i8> @llvm.x86.xop.vprotb(<16 x i8>, <16 x i8>) nounwind readnone
889
890define <4 x i32> @stack_fold_vprotd(<4 x i32> %a0) {
891; CHECK-LABEL: stack_fold_vprotd:
892; CHECK:       # %bb.0:
893; CHECK-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
894; CHECK-NEXT:    #APP
895; CHECK-NEXT:    nop
896; CHECK-NEXT:    #NO_APP
897; CHECK-NEXT:    vprotd $7, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
898; CHECK-NEXT:    retq
899  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
900  %2 = call <4 x i32> @llvm.x86.xop.vprotdi(<4 x i32> %a0, i8 7)
901  ret <4 x i32> %2
902}
903declare <4 x i32> @llvm.x86.xop.vprotdi(<4 x i32>, i8) nounwind readnone
904
905define <4 x i32> @stack_fold_vprotd_rm(<4 x i32> %a0, <4 x i32> %a1) {
906; CHECK-LABEL: stack_fold_vprotd_rm:
907; CHECK:       # %bb.0:
908; CHECK-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
909; CHECK-NEXT:    #APP
910; CHECK-NEXT:    nop
911; CHECK-NEXT:    #NO_APP
912; CHECK-NEXT:    vprotd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
913; CHECK-NEXT:    retq
914  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
915  %2 = call <4 x i32> @llvm.x86.xop.vprotd(<4 x i32> %a0, <4 x i32> %a1)
916  ret <4 x i32> %2
917}
918define <4 x i32> @stack_fold_vprotd_mr(<4 x i32> %a0, <4 x i32> %a1) {
919; CHECK-LABEL: stack_fold_vprotd_mr:
920; CHECK:       # %bb.0:
921; CHECK-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
922; CHECK-NEXT:    #APP
923; CHECK-NEXT:    nop
924; CHECK-NEXT:    #NO_APP
925; CHECK-NEXT:    vprotd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
926; CHECK-NEXT:    retq
927  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
928  %2 = call <4 x i32> @llvm.x86.xop.vprotd(<4 x i32> %a1, <4 x i32> %a0)
929  ret <4 x i32> %2
930}
931declare <4 x i32> @llvm.x86.xop.vprotd(<4 x i32>, <4 x i32>) nounwind readnone
932
933define <2 x i64> @stack_fold_vprotq(<2 x i64> %a0) {
934; CHECK-LABEL: stack_fold_vprotq:
935; CHECK:       # %bb.0:
936; CHECK-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
937; CHECK-NEXT:    #APP
938; CHECK-NEXT:    nop
939; CHECK-NEXT:    #NO_APP
940; CHECK-NEXT:    vprotq $7, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
941; CHECK-NEXT:    retq
942  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
943  %2 = call <2 x i64> @llvm.x86.xop.vprotqi(<2 x i64> %a0, i8 7)
944  ret <2 x i64> %2
945}
946declare <2 x i64> @llvm.x86.xop.vprotqi(<2 x i64>, i8) nounwind readnone
947
948define <2 x i64> @stack_fold_vprotq_rm(<2 x i64> %a0, <2 x i64> %a1) {
949; CHECK-LABEL: stack_fold_vprotq_rm:
950; CHECK:       # %bb.0:
951; CHECK-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
952; CHECK-NEXT:    #APP
953; CHECK-NEXT:    nop
954; CHECK-NEXT:    #NO_APP
955; CHECK-NEXT:    vprotq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
956; CHECK-NEXT:    retq
957  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
958  %2 = call <2 x i64> @llvm.x86.xop.vprotq(<2 x i64> %a0, <2 x i64> %a1)
959  ret <2 x i64> %2
960}
961define <2 x i64> @stack_fold_vprotq_mr(<2 x i64> %a0, <2 x i64> %a1) {
962; CHECK-LABEL: stack_fold_vprotq_mr:
963; CHECK:       # %bb.0:
964; CHECK-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
965; CHECK-NEXT:    #APP
966; CHECK-NEXT:    nop
967; CHECK-NEXT:    #NO_APP
968; CHECK-NEXT:    vprotq %xmm0, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
969; CHECK-NEXT:    retq
970  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
971  %2 = call <2 x i64> @llvm.x86.xop.vprotq(<2 x i64> %a1, <2 x i64> %a0)
972  ret <2 x i64> %2
973}
974declare <2 x i64> @llvm.x86.xop.vprotq(<2 x i64>, <2 x i64>) nounwind readnone
975
976define <8 x i16> @stack_fold_vprotw(<8 x i16> %a0) {
977; CHECK-LABEL: stack_fold_vprotw:
978; CHECK:       # %bb.0:
979; CHECK-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
980; CHECK-NEXT:    #APP
981; CHECK-NEXT:    nop
982; CHECK-NEXT:    #NO_APP
983; CHECK-NEXT:    vprotw $7, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
984; CHECK-NEXT:    retq
985  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
986  %2 = call <8 x i16> @llvm.x86.xop.vprotwi(<8 x i16> %a0, i8 7)
987  ret <8 x i16> %2
988}
989declare <8 x i16> @llvm.x86.xop.vprotwi(<8 x i16>, i8) nounwind readnone
990
991define <8 x i16> @stack_fold_vprotw_rm(<8 x i16> %a0, <8 x i16> %a1) {
992; CHECK-LABEL: stack_fold_vprotw_rm:
993; CHECK:       # %bb.0:
994; CHECK-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
995; CHECK-NEXT:    #APP
996; CHECK-NEXT:    nop
997; CHECK-NEXT:    #NO_APP
998; CHECK-NEXT:    vprotw {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
999; CHECK-NEXT:    retq
1000  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
1001  %2 = call <8 x i16> @llvm.x86.xop.vprotw(<8 x i16> %a0, <8 x i16> %a1)
1002  ret <8 x i16> %2
1003}
1004define <8 x i16> @stack_fold_vprotw_mr(<8 x i16> %a0, <8 x i16> %a1) {
1005; CHECK-LABEL: stack_fold_vprotw_mr:
1006; CHECK:       # %bb.0:
1007; CHECK-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1008; CHECK-NEXT:    #APP
1009; CHECK-NEXT:    nop
1010; CHECK-NEXT:    #NO_APP
1011; CHECK-NEXT:    vprotw %xmm0, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
1012; CHECK-NEXT:    retq
1013  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
1014  %2 = call <8 x i16> @llvm.x86.xop.vprotw(<8 x i16> %a1, <8 x i16> %a0)
1015  ret <8 x i16> %2
1016}
1017declare <8 x i16> @llvm.x86.xop.vprotw(<8 x i16>, <8 x i16>) nounwind readnone
1018
1019define <16 x i8> @stack_fold_vpshab_rm(<16 x i8> %a0, <16 x i8> %a1) {
1020; CHECK-LABEL: stack_fold_vpshab_rm:
1021; CHECK:       # %bb.0:
1022; CHECK-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1023; CHECK-NEXT:    #APP
1024; CHECK-NEXT:    nop
1025; CHECK-NEXT:    #NO_APP
1026; CHECK-NEXT:    vpshab {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
1027; CHECK-NEXT:    retq
1028  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
1029  %2 = call <16 x i8> @llvm.x86.xop.vpshab(<16 x i8> %a0, <16 x i8> %a1)
1030  ret <16 x i8> %2
1031}
1032define <16 x i8> @stack_fold_vpshab_mr(<16 x i8> %a0, <16 x i8> %a1) {
1033; CHECK-LABEL: stack_fold_vpshab_mr:
1034; CHECK:       # %bb.0:
1035; CHECK-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1036; CHECK-NEXT:    #APP
1037; CHECK-NEXT:    nop
1038; CHECK-NEXT:    #NO_APP
1039; CHECK-NEXT:    vpshab %xmm0, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
1040; CHECK-NEXT:    retq
1041  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
1042  %2 = call <16 x i8> @llvm.x86.xop.vpshab(<16 x i8> %a1, <16 x i8> %a0)
1043  ret <16 x i8> %2
1044}
1045declare <16 x i8> @llvm.x86.xop.vpshab(<16 x i8>, <16 x i8>) nounwind readnone
1046
1047define <4 x i32> @stack_fold_vpshad_rm(<4 x i32> %a0, <4 x i32> %a1) {
1048; CHECK-LABEL: stack_fold_vpshad_rm:
1049; CHECK:       # %bb.0:
1050; CHECK-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1051; CHECK-NEXT:    #APP
1052; CHECK-NEXT:    nop
1053; CHECK-NEXT:    #NO_APP
1054; CHECK-NEXT:    vpshad {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
1055; CHECK-NEXT:    retq
1056  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
1057  %2 = call <4 x i32> @llvm.x86.xop.vpshad(<4 x i32> %a0, <4 x i32> %a1)
1058  ret <4 x i32> %2
1059}
1060define <4 x i32> @stack_fold_vpshad_mr(<4 x i32> %a0, <4 x i32> %a1) {
1061; CHECK-LABEL: stack_fold_vpshad_mr:
1062; CHECK:       # %bb.0:
1063; CHECK-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1064; CHECK-NEXT:    #APP
1065; CHECK-NEXT:    nop
1066; CHECK-NEXT:    #NO_APP
1067; CHECK-NEXT:    vpshad %xmm0, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
1068; CHECK-NEXT:    retq
1069  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
1070  %2 = call <4 x i32> @llvm.x86.xop.vpshad(<4 x i32> %a1, <4 x i32> %a0)
1071  ret <4 x i32> %2
1072}
1073declare <4 x i32> @llvm.x86.xop.vpshad(<4 x i32>, <4 x i32>) nounwind readnone
1074
1075define <2 x i64> @stack_fold_vpshaq_rm(<2 x i64> %a0, <2 x i64> %a1) {
1076; CHECK-LABEL: stack_fold_vpshaq_rm:
1077; CHECK:       # %bb.0:
1078; CHECK-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1079; CHECK-NEXT:    #APP
1080; CHECK-NEXT:    nop
1081; CHECK-NEXT:    #NO_APP
1082; CHECK-NEXT:    vpshaq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
1083; CHECK-NEXT:    retq
1084  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
1085  %2 = call <2 x i64> @llvm.x86.xop.vpshaq(<2 x i64> %a0, <2 x i64> %a1)
1086  ret <2 x i64> %2
1087}
1088define <2 x i64> @stack_fold_vpshaq_mr(<2 x i64> %a0, <2 x i64> %a1) {
1089; CHECK-LABEL: stack_fold_vpshaq_mr:
1090; CHECK:       # %bb.0:
1091; CHECK-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1092; CHECK-NEXT:    #APP
1093; CHECK-NEXT:    nop
1094; CHECK-NEXT:    #NO_APP
1095; CHECK-NEXT:    vpshaq %xmm0, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
1096; CHECK-NEXT:    retq
1097  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
1098  %2 = call <2 x i64> @llvm.x86.xop.vpshaq(<2 x i64> %a1, <2 x i64> %a0)
1099  ret <2 x i64> %2
1100}
1101declare <2 x i64> @llvm.x86.xop.vpshaq(<2 x i64>, <2 x i64>) nounwind readnone
1102
1103define <8 x i16> @stack_fold_vpshaw_rm(<8 x i16> %a0, <8 x i16> %a1) {
1104; CHECK-LABEL: stack_fold_vpshaw_rm:
1105; CHECK:       # %bb.0:
1106; CHECK-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1107; CHECK-NEXT:    #APP
1108; CHECK-NEXT:    nop
1109; CHECK-NEXT:    #NO_APP
1110; CHECK-NEXT:    vpshaw {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
1111; CHECK-NEXT:    retq
1112  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
1113  %2 = call <8 x i16> @llvm.x86.xop.vpshaw(<8 x i16> %a0, <8 x i16> %a1)
1114  ret <8 x i16> %2
1115}
1116define <8 x i16> @stack_fold_vpshaw_mr(<8 x i16> %a0, <8 x i16> %a1) {
1117; CHECK-LABEL: stack_fold_vpshaw_mr:
1118; CHECK:       # %bb.0:
1119; CHECK-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1120; CHECK-NEXT:    #APP
1121; CHECK-NEXT:    nop
1122; CHECK-NEXT:    #NO_APP
1123; CHECK-NEXT:    vpshaw %xmm0, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
1124; CHECK-NEXT:    retq
1125  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
1126  %2 = call <8 x i16> @llvm.x86.xop.vpshaw(<8 x i16> %a1, <8 x i16> %a0)
1127  ret <8 x i16> %2
1128}
1129declare <8 x i16> @llvm.x86.xop.vpshaw(<8 x i16>, <8 x i16>) nounwind readnone
1130
1131define <16 x i8> @stack_fold_vpshlb_rm(<16 x i8> %a0, <16 x i8> %a1) {
1132; CHECK-LABEL: stack_fold_vpshlb_rm:
1133; CHECK:       # %bb.0:
1134; CHECK-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1135; CHECK-NEXT:    #APP
1136; CHECK-NEXT:    nop
1137; CHECK-NEXT:    #NO_APP
1138; CHECK-NEXT:    vpshlb {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
1139; CHECK-NEXT:    retq
1140  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
1141  %2 = call <16 x i8> @llvm.x86.xop.vpshlb(<16 x i8> %a0, <16 x i8> %a1)
1142  ret <16 x i8> %2
1143}
1144define <16 x i8> @stack_fold_vpshlb_mr(<16 x i8> %a0, <16 x i8> %a1) {
1145; CHECK-LABEL: stack_fold_vpshlb_mr:
1146; CHECK:       # %bb.0:
1147; CHECK-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1148; CHECK-NEXT:    #APP
1149; CHECK-NEXT:    nop
1150; CHECK-NEXT:    #NO_APP
1151; CHECK-NEXT:    vpshlb %xmm0, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
1152; CHECK-NEXT:    retq
1153  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
1154  %2 = call <16 x i8> @llvm.x86.xop.vpshlb(<16 x i8> %a1, <16 x i8> %a0)
1155  ret <16 x i8> %2
1156}
1157declare <16 x i8> @llvm.x86.xop.vpshlb(<16 x i8>, <16 x i8>) nounwind readnone
1158
1159define <4 x i32> @stack_fold_vpshld_rm(<4 x i32> %a0, <4 x i32> %a1) {
1160; CHECK-LABEL: stack_fold_vpshld_rm:
1161; CHECK:       # %bb.0:
1162; CHECK-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1163; CHECK-NEXT:    #APP
1164; CHECK-NEXT:    nop
1165; CHECK-NEXT:    #NO_APP
1166; CHECK-NEXT:    vpshld {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
1167; CHECK-NEXT:    retq
1168  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
1169  %2 = call <4 x i32> @llvm.x86.xop.vpshld(<4 x i32> %a0, <4 x i32> %a1)
1170  ret <4 x i32> %2
1171}
1172define <4 x i32> @stack_fold_vpshld_mr(<4 x i32> %a0, <4 x i32> %a1) {
1173; CHECK-LABEL: stack_fold_vpshld_mr:
1174; CHECK:       # %bb.0:
1175; CHECK-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1176; CHECK-NEXT:    #APP
1177; CHECK-NEXT:    nop
1178; CHECK-NEXT:    #NO_APP
1179; CHECK-NEXT:    vpshld %xmm0, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
1180; CHECK-NEXT:    retq
1181  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
1182  %2 = call <4 x i32> @llvm.x86.xop.vpshld(<4 x i32> %a1, <4 x i32> %a0)
1183  ret <4 x i32> %2
1184}
1185declare <4 x i32> @llvm.x86.xop.vpshld(<4 x i32>, <4 x i32>) nounwind readnone
1186
1187define <2 x i64> @stack_fold_vpshlq_rm(<2 x i64> %a0, <2 x i64> %a1) {
1188; CHECK-LABEL: stack_fold_vpshlq_rm:
1189; CHECK:       # %bb.0:
1190; CHECK-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1191; CHECK-NEXT:    #APP
1192; CHECK-NEXT:    nop
1193; CHECK-NEXT:    #NO_APP
1194; CHECK-NEXT:    vpshlq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
1195; CHECK-NEXT:    retq
1196  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
1197  %2 = call <2 x i64> @llvm.x86.xop.vpshlq(<2 x i64> %a0, <2 x i64> %a1)
1198  ret <2 x i64> %2
1199}
1200define <2 x i64> @stack_fold_vpshlq_mr(<2 x i64> %a0, <2 x i64> %a1) {
1201; CHECK-LABEL: stack_fold_vpshlq_mr:
1202; CHECK:       # %bb.0:
1203; CHECK-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1204; CHECK-NEXT:    #APP
1205; CHECK-NEXT:    nop
1206; CHECK-NEXT:    #NO_APP
1207; CHECK-NEXT:    vpshlq %xmm0, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
1208; CHECK-NEXT:    retq
1209  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
1210  %2 = call <2 x i64> @llvm.x86.xop.vpshlq(<2 x i64> %a1, <2 x i64> %a0)
1211  ret <2 x i64> %2
1212}
1213declare <2 x i64> @llvm.x86.xop.vpshlq(<2 x i64>, <2 x i64>) nounwind readnone
1214
1215define <8 x i16> @stack_fold_vpshlw_rm(<8 x i16> %a0, <8 x i16> %a1) {
1216; CHECK-LABEL: stack_fold_vpshlw_rm:
1217; CHECK:       # %bb.0:
1218; CHECK-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1219; CHECK-NEXT:    #APP
1220; CHECK-NEXT:    nop
1221; CHECK-NEXT:    #NO_APP
1222; CHECK-NEXT:    vpshlw {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
1223; CHECK-NEXT:    retq
1224  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
1225  %2 = call <8 x i16> @llvm.x86.xop.vpshlw(<8 x i16> %a0, <8 x i16> %a1)
1226  ret <8 x i16> %2
1227}
1228define <8 x i16> @stack_fold_vpshlw_mr(<8 x i16> %a0, <8 x i16> %a1) {
1229; CHECK-LABEL: stack_fold_vpshlw_mr:
1230; CHECK:       # %bb.0:
1231; CHECK-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1232; CHECK-NEXT:    #APP
1233; CHECK-NEXT:    nop
1234; CHECK-NEXT:    #NO_APP
1235; CHECK-NEXT:    vpshlw %xmm0, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
1236; CHECK-NEXT:    retq
1237  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
1238  %2 = call <8 x i16> @llvm.x86.xop.vpshlw(<8 x i16> %a1, <8 x i16> %a0)
1239  ret <8 x i16> %2
1240}
1241declare <8 x i16> @llvm.x86.xop.vpshlw(<8 x i16>, <8 x i16>) nounwind readnone
1242