1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2; RUN: llc -O3 -disable-peephole -mtriple=x86_64-unknown-unknown -mattr=+avx512vl,+avx512bw,+avx512dq,+avx512vbmi,+avx512cd,+avx512vpopcntdq < %s | FileCheck %s
3
4target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
5target triple = "x86_64-unknown-unknown"
6
7; Stack reload folding tests.
8;
9; By including a nop call with sideeffects we can force a partial register spill of the
10; relevant registers and check that the reload is correctly folded into the instruction.
11
; Unmasked 256-bit VALIGND.
; The shuffle picks elements 1-8 of the %a:%b concatenation, i.e. %a[1..7]
; followed by %b[0]. The expected assembly spills both arguments, reloads one
; into ymm0 with vmovdqu, and feeds the other to `valignd $1` directly from its
; stack slot (32-byte folded reload).
12define <8 x i32> @stack_fold_valignd_ymm(<8 x i32> %a, <8 x i32> %b) {
13; CHECK-LABEL: stack_fold_valignd_ymm:
14; CHECK:       # %bb.0:
15; CHECK-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
16; CHECK-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
17; CHECK-NEXT:    #APP
18; CHECK-NEXT:    nop
19; CHECK-NEXT:    #NO_APP
20; CHECK-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
21; CHECK-NEXT:    valignd $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
22; CHECK-NEXT:    # ymm0 = mem[1,2,3,4,5,6,7],ymm0[0]
23; CHECK-NEXT:    retq
  ; Side-effecting nop that defines an "=x" result and clobbers xmm1-xmm31 and
  ; flags; its result (%1) is never used.
24  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
25  %2 = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8>
26  ret <8 x i32> %2
27}
28
; Merge-masked 256-bit VALIGND.
; The %a[1..7],%b[0] shuffle result is blended, under the i8 %mask (bitcast to
; <8 x i1>), with a pass-through vector loaded from %passthru. The expected
; assembly moves the mask into k1, loads the pass-through into ymm1, and issues
; `valignd $1 ... {%k1}` into ymm1 with a 32-byte folded reload before copying
; the result to ymm0.
29define <8 x i32> @stack_fold_valignd_ymm_mask(<8 x i32> %a, <8 x i32> %b, <8 x i32>* %passthru, i8 %mask) {
30; CHECK-LABEL: stack_fold_valignd_ymm_mask:
31; CHECK:       # %bb.0:
32; CHECK-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
33; CHECK-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
34; CHECK-NEXT:    #APP
35; CHECK-NEXT:    nop
36; CHECK-NEXT:    #NO_APP
37; CHECK-NEXT:    kmovd %esi, %k1
38; CHECK-NEXT:    vmovdqa (%rdi), %ymm1
39; CHECK-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
40; CHECK-NEXT:    valignd $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 {%k1} # 32-byte Folded Reload
41; CHECK-NEXT:    # ymm1 {%k1} = mem[1,2,3,4,5,6,7],ymm0[0]
42; CHECK-NEXT:    vmovdqa %ymm1, %ymm0
43; CHECK-NEXT:    retq
  ; Side-effecting nop that defines an "=x" result and clobbers xmm1-xmm31 and
  ; flags; its result (%1) is never used.
44  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
45  %2 = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8>
  ; One mask bit per i32 lane: set bits take the shuffle result, clear bits
  ; keep the value loaded from %passthru.
46  %3 = bitcast i8 %mask to <8 x i1>
47  %4 = load <8 x i32>, <8 x i32>* %passthru
48  %5 = select <8 x i1> %3, <8 x i32> %2, <8 x i32> %4
49  ret <8 x i32> %5
50}
51
; Zero-masked 256-bit VALIGND.
; The %a[1..7],%b[0] shuffle result is selected against zeroinitializer under
; the i8 %mask (bitcast to <8 x i1>). The expected assembly moves the mask into
; k1 and issues `valignd $1 ... {%k1} {z}` with a 32-byte folded reload.
52define <8 x i32> @stack_fold_valignd_ymm_maskz(<8 x i32> %a, <8 x i32> %b, i8 %mask) {
53; CHECK-LABEL: stack_fold_valignd_ymm_maskz:
54; CHECK:       # %bb.0:
55; CHECK-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
56; CHECK-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
57; CHECK-NEXT:    #APP
58; CHECK-NEXT:    nop
59; CHECK-NEXT:    #NO_APP
60; CHECK-NEXT:    kmovd %edi, %k1
61; CHECK-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
62; CHECK-NEXT:    valignd $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 {%k1} {z} # 32-byte Folded Reload
63; CHECK-NEXT:    # ymm0 {%k1} {z} = mem[1,2,3,4,5,6,7],ymm0[0]
64; CHECK-NEXT:    retq
  ; Side-effecting nop that defines an "=x" result and clobbers xmm1-xmm31 and
  ; flags; its result (%1) is never used.
65  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
66  %2 = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8>
  ; Lanes whose mask bit is clear become zero.
67  %3 = bitcast i8 %mask to <8 x i1>
68  %4 = select <8 x i1> %3, <8 x i32> %2, <8 x i32> zeroinitializer
69  ret <8 x i32> %4
70}
71
; Unmasked 256-bit VALIGNQ.
; The shuffle picks elements 1-4 of the %a:%b concatenation, i.e. %a[1..3]
; followed by %b[0]. The expected assembly spills both arguments, reloads one
; into ymm0, and reads the other as the memory operand of `valignq $1`
; (32-byte folded reload).
72define <4 x i64> @stack_fold_valignq_ymm(<4 x i64> %a, <4 x i64> %b) {
73; CHECK-LABEL: stack_fold_valignq_ymm:
74; CHECK:       # %bb.0:
75; CHECK-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
76; CHECK-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
77; CHECK-NEXT:    #APP
78; CHECK-NEXT:    nop
79; CHECK-NEXT:    #NO_APP
80; CHECK-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
81; CHECK-NEXT:    valignq $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
82; CHECK-NEXT:    # ymm0 = mem[1,2,3],ymm0[0]
83; CHECK-NEXT:    retq
  ; Side-effecting nop that defines an "=x" result and clobbers xmm1-xmm31 and
  ; flags; its result (%1) is never used.
84  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
85  %2 = shufflevector <4 x i64> %a, <4 x i64> %b, <4 x i32> <i32 1, i32 2, i32 3, i32 4>
86  ret <4 x i64> %2
87}
88
; 128-bit VPAVGB built from generic IR.
; The body computes trunc((zext(%a0) + zext(%a1) + 1) >> 1) on i16 lanes. The
; expected assembly spills xmm1 before the asm and emits a single vpavgb whose
; memory operand is the 16-byte folded reload.
89define <16 x i8> @stack_fold_pavgb(<16 x i8> %a0, <16 x i8> %a1) {
90; CHECK-LABEL: stack_fold_pavgb:
91; CHECK:       # %bb.0:
92; CHECK-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
93; CHECK-NEXT:    #APP
94; CHECK-NEXT:    nop
95; CHECK-NEXT:    #NO_APP
96; CHECK-NEXT:    vpavgb {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
97; CHECK-NEXT:    retq
  ; Side-effecting nop that defines an "=x" result and clobbers xmm2-xmm31 and
  ; flags; its result (%1) is never used.
98  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  ; Widen to i16 so the sum and the +1 rounding term cannot wrap in i8.
99  %2 = zext <16 x i8> %a0 to <16 x i16>
100  %3 = zext <16 x i8> %a1 to <16 x i16>
101  %4 = add <16 x i16> %2, %3
102  %5 = add <16 x i16> %4, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
103  %6 = lshr <16 x i16> %5, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
104  %7 = trunc <16 x i16> %6 to <16 x i8>
105  ret <16 x i8> %7
106}
107
; 256-bit VPAVGB built from generic IR.
; The body computes trunc((zext(%a0) + zext(%a1) + 1) >> 1) on i16 lanes. The
; expected assembly spills ymm1 before the asm and emits a single vpavgb whose
; memory operand is the 32-byte folded reload.
108define <32 x i8> @stack_fold_pavgb_ymm(<32 x i8> %a0, <32 x i8> %a1) {
109; CHECK-LABEL: stack_fold_pavgb_ymm:
110; CHECK:       # %bb.0:
111; CHECK-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
112; CHECK-NEXT:    #APP
113; CHECK-NEXT:    nop
114; CHECK-NEXT:    #NO_APP
115; CHECK-NEXT:    vpavgb {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
116; CHECK-NEXT:    retq
  ; Side-effecting nop that defines an "=x" result and clobbers xmm2-xmm31 and
  ; flags; its result (%1) is never used.
117  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  ; Widen to i16 so the sum and the +1 rounding term cannot wrap in i8.
118  %2 = zext <32 x i8> %a0 to <32 x i16>
119  %3 = zext <32 x i8> %a1 to <32 x i16>
120  %4 = add <32 x i16> %2, %3
121  %5 = add <32 x i16> %4, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
122  %6 = lshr <32 x i16> %5, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
123  %7 = trunc <32 x i16> %6 to <32 x i8>
124  ret <32 x i8> %7
125}
126
; 128-bit VPAVGW built from generic IR.
; The body computes trunc((zext(%a0) + zext(%a1) + 1) >> 1) on i32 lanes. The
; expected assembly spills xmm1 before the asm and emits a single vpavgw whose
; memory operand is the 16-byte folded reload.
127define <8 x i16> @stack_fold_pavgw(<8 x i16> %a0, <8 x i16> %a1) {
128; CHECK-LABEL: stack_fold_pavgw:
129; CHECK:       # %bb.0:
130; CHECK-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
131; CHECK-NEXT:    #APP
132; CHECK-NEXT:    nop
133; CHECK-NEXT:    #NO_APP
134; CHECK-NEXT:    vpavgw {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
135; CHECK-NEXT:    retq
  ; Side-effecting nop that defines an "=x" result and clobbers xmm2-xmm31 and
  ; flags; its result (%1) is never used.
136  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  ; Widen to i32 so the sum and the +1 rounding term cannot wrap in i16.
137  %2 = zext <8 x i16> %a0 to <8 x i32>
138  %3 = zext <8 x i16> %a1 to <8 x i32>
139  %4 = add <8 x i32> %2, %3
140  %5 = add <8 x i32> %4, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
141  %6 = lshr <8 x i32> %5, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
142  %7 = trunc <8 x i32> %6 to <8 x i16>
143  ret <8 x i16> %7
144}
145
; 256-bit VPAVGW built from generic IR.
; The body computes trunc((zext(%a0) + zext(%a1) + 1) >> 1) on i32 lanes. The
; expected assembly spills ymm1 before the asm and emits a single vpavgw whose
; memory operand is the 32-byte folded reload.
146define <16 x i16> @stack_fold_pavgw_ymm(<16 x i16> %a0, <16 x i16> %a1) {
147; CHECK-LABEL: stack_fold_pavgw_ymm:
148; CHECK:       # %bb.0:
149; CHECK-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
150; CHECK-NEXT:    #APP
151; CHECK-NEXT:    nop
152; CHECK-NEXT:    #NO_APP
153; CHECK-NEXT:    vpavgw {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
154; CHECK-NEXT:    retq
  ; Side-effecting nop that defines an "=x" result and clobbers xmm2-xmm31 and
  ; flags; its result (%1) is never used.
155  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  ; Widen to i32 so the sum and the +1 rounding term cannot wrap in i16.
156  %2 = zext <16 x i16> %a0 to <16 x i32>
157  %3 = zext <16 x i16> %a1 to <16 x i32>
158  %4 = add <16 x i32> %2, %3
159  %5 = add <16 x i32> %4, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
160  %6 = lshr <16 x i32> %5, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
161  %7 = trunc <16 x i32> %6 to <16 x i16>
162  ret <16 x i16> %7
163}
164
; 128-bit VPCONFLICTD via the @llvm.x86.avx512.conflict.d.128 intrinsic.
; The expected assembly spills xmm0 before the asm and reads it back as the
; memory operand of vpconflictd (16-byte folded reload).
165define <4 x i32> @stack_fold_vpconflictd(<4 x i32> %a0) {
166; CHECK-LABEL: stack_fold_vpconflictd:
167; CHECK:       # %bb.0:
168; CHECK-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
169; CHECK-NEXT:    #APP
170; CHECK-NEXT:    nop
171; CHECK-NEXT:    #NO_APP
172; CHECK-NEXT:    vpconflictd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
173; CHECK-NEXT:    retq
  ; Side-effecting nop that defines an "=x" result and clobbers xmm1-xmm31 and
  ; flags; its result (%1) is never used.
174  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
175  %2 = call <4 x i32> @llvm.x86.avx512.conflict.d.128(<4 x i32> %a0)
176  ret <4 x i32> %2
177}
178
; 256-bit VPCONFLICTD via the @llvm.x86.avx512.conflict.d.256 intrinsic.
; The expected assembly spills ymm0 before the asm and reads it back as the
; memory operand of vpconflictd (32-byte folded reload).
179define <8 x i32> @stack_fold_vpconflictd_ymm(<8 x i32> %a0) {
180; CHECK-LABEL: stack_fold_vpconflictd_ymm:
181; CHECK:       # %bb.0:
182; CHECK-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
183; CHECK-NEXT:    #APP
184; CHECK-NEXT:    nop
185; CHECK-NEXT:    #NO_APP
186; CHECK-NEXT:    vpconflictd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Folded Reload
187; CHECK-NEXT:    retq
  ; Side-effecting nop that defines an "=x" result and clobbers xmm1-xmm31 and
  ; flags; its result (%1) is never used.
188  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
189  %2 = call <8 x i32> @llvm.x86.avx512.conflict.d.256(<8 x i32> %a0)
190  ret <8 x i32> %2
191}
192
; 128-bit VPCONFLICTQ via the @llvm.x86.avx512.conflict.q.128 intrinsic.
; The expected assembly spills xmm0 before the asm and reads it back as the
; memory operand of vpconflictq (16-byte folded reload).
193define <2 x i64> @stack_fold_vpconflictq(<2 x i64> %a0) {
194; CHECK-LABEL: stack_fold_vpconflictq:
195; CHECK:       # %bb.0:
196; CHECK-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
197; CHECK-NEXT:    #APP
198; CHECK-NEXT:    nop
199; CHECK-NEXT:    #NO_APP
200; CHECK-NEXT:    vpconflictq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
201; CHECK-NEXT:    retq
  ; Side-effecting nop that defines an "=x" result and clobbers xmm1-xmm31 and
  ; flags; its result (%1) is never used.
202  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
203  %2 = call <2 x i64> @llvm.x86.avx512.conflict.q.128(<2 x i64> %a0)
204  ret <2 x i64> %2
205}
206
; 256-bit VPCONFLICTQ via the @llvm.x86.avx512.conflict.q.256 intrinsic.
; The expected assembly spills ymm0 before the asm and reads it back as the
; memory operand of vpconflictq (32-byte folded reload).
207define <4 x i64> @stack_fold_vpconflictq_ymm(<4 x i64> %a0) {
208; CHECK-LABEL: stack_fold_vpconflictq_ymm:
209; CHECK:       # %bb.0:
210; CHECK-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
211; CHECK-NEXT:    #APP
212; CHECK-NEXT:    nop
213; CHECK-NEXT:    #NO_APP
214; CHECK-NEXT:    vpconflictq {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Folded Reload
215; CHECK-NEXT:    retq
  ; Side-effecting nop that defines an "=x" result and clobbers xmm1-xmm31 and
  ; flags; its result (%1) is never used.
216  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
217  %2 = call <4 x i64> @llvm.x86.avx512.conflict.q.256(<4 x i64> %a0)
218  ret <4 x i64> %2
219}
220
; Upper-half extract of an <8 x i32> with the *store* folded into the extract.
; Unlike the reload-folding tests, the value is produced before the asm: the
; expected assembly writes the upper 128 bits straight to the stack with
; `vextracti128 $1` (16-byte folded spill) and reloads them into xmm0 after the
; asm. %a1 only fills the second shuffle operand; no mask index selects it.
221define <4 x i32> @stack_fold_extracti32x4(<8 x i16> %a0, <8 x i32> %a1) {
222; CHECK-LABEL: stack_fold_extracti32x4:
223; CHECK:       # %bb.0:
224; CHECK-NEXT:    vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
225; CHECK-NEXT:    vextracti128 $1, %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Folded Spill
226; CHECK-NEXT:    #APP
227; CHECK-NEXT:    nop
228; CHECK-NEXT:    #NO_APP
229; CHECK-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
230; CHECK-NEXT:    vzeroupper
231; CHECK-NEXT:    retq
232  ; zext forces execution domain
233  %1 = zext <8 x i16> %a0 to <8 x i32>
  ; Indices 4-7 select the upper four elements of the zero-extended vector.
234  %2 = shufflevector <8 x i32> %1, <8 x i32> %a1, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
  ; Side-effecting nop placed after the extract; it defines an "=x" result and
  ; clobbers xmm1-xmm31 and flags. Its result (%3) is never used.
235  %3 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
236  ret <4 x i32> %2
237}
238
; Upper-half extract of a <4 x i64> with the *store* folded into the extract.
; Unlike the reload-folding tests, the value is produced before the asm: the
; expected assembly writes the upper 128 bits straight to the stack with
; `vextracti128 $1` (16-byte folded spill) and reloads them into xmm0 after the
; asm. %a1 only fills the second shuffle operand; no mask index selects it.
239define <2 x i64> @stack_fold_extracti64x2(<4 x i32> %a0, <4 x i64> %a1) {
240; CHECK-LABEL: stack_fold_extracti64x2:
241; CHECK:       # %bb.0:
242; CHECK-NEXT:    vpmovzxdq {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
243; CHECK-NEXT:    vextracti128 $1, %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Folded Spill
244; CHECK-NEXT:    #APP
245; CHECK-NEXT:    nop
246; CHECK-NEXT:    #NO_APP
247; CHECK-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
248; CHECK-NEXT:    vzeroupper
249; CHECK-NEXT:    retq
250  ; zext forces execution domain
251  %1 = zext <4 x i32> %a0 to <4 x i64>
  ; Indices 2-3 select the upper two elements of the zero-extended vector.
252  %2 = shufflevector <4 x i64> %1, <4 x i64> %a1, <2 x i32> <i32 2, i32 3>
  ; Side-effecting nop placed after the extract; it defines an "=x" result and
  ; clobbers xmm1-xmm31 and flags. Its result (%3) is never used.
253  %3 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
254  ret <2 x i64> %2
255}
256
; Concatenation of two <4 x i32> halves into an <8 x i32>.
; The expected assembly spills xmm1 before the asm and inserts it as the upper
; half with `vinserti128 $1` reading the stack slot (16-byte folded reload).
; The trailing +1 appears in the expected assembly as vpcmpeqd followed by
; vpsubd.
257define <8 x i32> @stack_fold_inserti32x4(<4 x i32> %a0, <4 x i32> %a1) {
258; CHECK-LABEL: stack_fold_inserti32x4:
259; CHECK:       # %bb.0:
260; CHECK-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
261; CHECK-NEXT:    # kill: def $xmm0 killed $xmm0 def $ymm0
262; CHECK-NEXT:    #APP
263; CHECK-NEXT:    nop
264; CHECK-NEXT:    #NO_APP
265; CHECK-NEXT:    vinserti128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 16-byte Folded Reload
266; CHECK-NEXT:    vpcmpeqd %ymm1, %ymm1, %ymm1
267; CHECK-NEXT:    vpsubd %ymm1, %ymm0, %ymm0
268; CHECK-NEXT:    retq
  ; Side-effecting nop that defines an "=x" result and clobbers xmm2-xmm31 and
  ; flags; its result (%1) is never used.
269  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  ; Indices 0-3 take %a0 as the low half, indices 4-7 take %a1 as the high half.
270  %2 = shufflevector <4 x i32> %a0, <4 x i32> %a1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
271  ; add forces execution domain
272  %3 = add <8 x i32> %2, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
273  ret <8 x i32> %3
274}
275
; Concatenation of two <2 x i64> halves into a <4 x i64>.
; The expected assembly spills xmm1 before the asm and inserts it as the upper
; half with `vinserti128 $1` reading the stack slot (16-byte folded reload).
; The trailing +1 appears in the expected assembly as vpcmpeqd followed by
; vpsubq.
276define <4 x i64> @stack_fold_inserti64x2(<2 x i64> %a0, <2 x i64> %a1) {
277; CHECK-LABEL: stack_fold_inserti64x2:
278; CHECK:       # %bb.0:
279; CHECK-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
280; CHECK-NEXT:    # kill: def $xmm0 killed $xmm0 def $ymm0
281; CHECK-NEXT:    #APP
282; CHECK-NEXT:    nop
283; CHECK-NEXT:    #NO_APP
284; CHECK-NEXT:    vinserti128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 16-byte Folded Reload
285; CHECK-NEXT:    vpcmpeqd %ymm1, %ymm1, %ymm1
286; CHECK-NEXT:    vpsubq %ymm1, %ymm0, %ymm0
287; CHECK-NEXT:    retq
  ; Side-effecting nop that defines an "=x" result and clobbers xmm2-xmm31 and
  ; flags; its result (%1) is never used.
288  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  ; Indices 0-1 take %a0 as the low half, indices 2-3 take %a1 as the high half.
289  %2 = shufflevector <2 x i64> %a0, <2 x i64> %a1, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
290  ; add forces execution domain
291  %3 = add <4 x i64> %2, <i64 1, i64 1, i64 1, i64 1>
292  ret <4 x i64> %3
293}
294
; 128-bit VPABSB built from generic IR (select between %a0 and 0 - %a0 on
; %a0 > 0). The expected assembly spills xmm0 before the asm and reads it back
; as the memory operand of vpabsb (16-byte folded reload).
295define <16 x i8> @stack_fold_pabsb(<16 x i8> %a0) {
296; CHECK-LABEL: stack_fold_pabsb:
297; CHECK:       # %bb.0:
298; CHECK-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
299; CHECK-NEXT:    #APP
300; CHECK-NEXT:    nop
301; CHECK-NEXT:    #NO_APP
302; CHECK-NEXT:    vpabsb {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
303; CHECK-NEXT:    retq
  ; Side-effecting nop that defines an "=x" result and clobbers xmm1-xmm31 and
  ; flags; its result (%1) is never used.
304  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  ; Absolute-value idiom: keep positive lanes, negate the rest.
305  %2 = icmp sgt <16 x i8> %a0, zeroinitializer
306  %3 = sub <16 x i8> zeroinitializer, %a0
307  %4 = select <16 x i1> %2, <16 x i8> %a0, <16 x i8> %3
308  ret <16 x i8> %4
309}
310
; 256-bit VPABSB built from generic IR (select between %a0 and 0 - %a0 on
; %a0 > 0). The expected assembly spills ymm0 before the asm and reads it back
; as the memory operand of vpabsb (32-byte folded reload).
311define <32 x i8> @stack_fold_pabsb_ymm(<32 x i8> %a0) {
312; CHECK-LABEL: stack_fold_pabsb_ymm:
313; CHECK:       # %bb.0:
314; CHECK-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
315; CHECK-NEXT:    #APP
316; CHECK-NEXT:    nop
317; CHECK-NEXT:    #NO_APP
318; CHECK-NEXT:    vpabsb {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Folded Reload
319; CHECK-NEXT:    retq
  ; Side-effecting nop that defines an "=x" result and clobbers xmm1-xmm31 and
  ; flags; its result (%1) is never used.
320  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  ; Absolute-value idiom: keep positive lanes, negate the rest.
321  %2 = icmp sgt <32 x i8> %a0, zeroinitializer
322  %3 = sub <32 x i8> zeroinitializer, %a0
323  %4 = select <32 x i1> %2, <32 x i8> %a0, <32 x i8> %3
324  ret <32 x i8> %4
325}
326
; 128-bit VPABSD built from generic IR (select between %a0 and 0 - %a0 on
; %a0 > 0). The expected assembly spills xmm0 before the asm and reads it back
; as the memory operand of vpabsd (16-byte folded reload).
327define <4 x i32> @stack_fold_pabsd(<4 x i32> %a0) {
328; CHECK-LABEL: stack_fold_pabsd:
329; CHECK:       # %bb.0:
330; CHECK-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
331; CHECK-NEXT:    #APP
332; CHECK-NEXT:    nop
333; CHECK-NEXT:    #NO_APP
334; CHECK-NEXT:    vpabsd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
335; CHECK-NEXT:    retq
  ; Side-effecting nop that defines an "=x" result and clobbers xmm1-xmm31 and
  ; flags; its result (%1) is never used.
336  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  ; Absolute-value idiom: keep positive lanes, negate the rest.
337  %2 = icmp sgt <4 x i32> %a0, zeroinitializer
338  %3 = sub <4 x i32> zeroinitializer, %a0
339  %4 = select <4 x i1> %2, <4 x i32> %a0, <4 x i32> %3
340  ret <4 x i32> %4
341}
342
; 256-bit VPABSD built from generic IR (select between %a0 and 0 - %a0 on
; %a0 > 0). The expected assembly spills ymm0 before the asm and reads it back
; as the memory operand of vpabsd (32-byte folded reload).
343define <8 x i32> @stack_fold_pabsd_ymm(<8 x i32> %a0) {
344; CHECK-LABEL: stack_fold_pabsd_ymm:
345; CHECK:       # %bb.0:
346; CHECK-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
347; CHECK-NEXT:    #APP
348; CHECK-NEXT:    nop
349; CHECK-NEXT:    #NO_APP
350; CHECK-NEXT:    vpabsd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Folded Reload
351; CHECK-NEXT:    retq
  ; Side-effecting nop that defines an "=x" result and clobbers xmm1-xmm31 and
  ; flags; its result (%1) is never used.
352  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  ; Absolute-value idiom: keep positive lanes, negate the rest.
353  %2 = icmp sgt <8 x i32> %a0, zeroinitializer
354  %3 = sub <8 x i32> zeroinitializer, %a0
355  %4 = select <8 x i1> %2, <8 x i32> %a0, <8 x i32> %3
356  ret <8 x i32> %4
357}
358
; 128-bit VPABSQ built from generic IR (select between %a0 and 0 - %a0 on
; %a0 > 0). The expected assembly spills xmm0 before the asm and reads it back
; as the memory operand of vpabsq (16-byte folded reload).
359define <2 x i64> @stack_fold_pabsq(<2 x i64> %a0) {
360; CHECK-LABEL: stack_fold_pabsq:
361; CHECK:       # %bb.0:
362; CHECK-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
363; CHECK-NEXT:    #APP
364; CHECK-NEXT:    nop
365; CHECK-NEXT:    #NO_APP
366; CHECK-NEXT:    vpabsq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
367; CHECK-NEXT:    retq
  ; Side-effecting nop that defines an "=x" result and clobbers xmm1-xmm31 and
  ; flags; its result (%1) is never used.
368  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  ; Absolute-value idiom: keep positive lanes, negate the rest.
369  %2 = icmp sgt <2 x i64> %a0, zeroinitializer
370  %3 = sub <2 x i64> zeroinitializer, %a0
371  %4 = select <2 x i1> %2, <2 x i64> %a0, <2 x i64> %3
372  ret <2 x i64> %4
373}
374
; 256-bit VPABSQ built from generic IR (select between %a0 and 0 - %a0 on
; %a0 > 0). The expected assembly spills ymm0 before the asm and reads it back
; as the memory operand of vpabsq (32-byte folded reload).
375define <4 x i64> @stack_fold_pabsq_ymm(<4 x i64> %a0) {
376; CHECK-LABEL: stack_fold_pabsq_ymm:
377; CHECK:       # %bb.0:
378; CHECK-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
379; CHECK-NEXT:    #APP
380; CHECK-NEXT:    nop
381; CHECK-NEXT:    #NO_APP
382; CHECK-NEXT:    vpabsq {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Folded Reload
383; CHECK-NEXT:    retq
  ; Side-effecting nop that defines an "=x" result and clobbers xmm1-xmm31 and
  ; flags; its result (%1) is never used.
384  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  ; Absolute-value idiom: keep positive lanes, negate the rest.
385  %2 = icmp sgt <4 x i64> %a0, zeroinitializer
386  %3 = sub <4 x i64> zeroinitializer, %a0
387  %4 = select <4 x i1> %2, <4 x i64> %a0, <4 x i64> %3
388  ret <4 x i64> %4
389}
390
; 128-bit VPABSW built from generic IR (select between %a0 and 0 - %a0 on
; %a0 > 0). The expected assembly spills xmm0 before the asm and reads it back
; as the memory operand of vpabsw (16-byte folded reload).
391define <8 x i16> @stack_fold_pabsw(<8 x i16> %a0) {
392; CHECK-LABEL: stack_fold_pabsw:
393; CHECK:       # %bb.0:
394; CHECK-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
395; CHECK-NEXT:    #APP
396; CHECK-NEXT:    nop
397; CHECK-NEXT:    #NO_APP
398; CHECK-NEXT:    vpabsw {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
399; CHECK-NEXT:    retq
  ; Side-effecting nop that defines an "=x" result and clobbers xmm1-xmm31 and
  ; flags; its result (%1) is never used.
400  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  ; Absolute-value idiom: keep positive lanes, negate the rest.
401  %2 = icmp sgt <8 x i16> %a0, zeroinitializer
402  %3 = sub <8 x i16> zeroinitializer, %a0
403  %4 = select <8 x i1> %2, <8 x i16> %a0, <8 x i16> %3
404  ret <8 x i16> %4
405}
406
; 256-bit VPABSW built from generic IR (select between %a0 and 0 - %a0 on
; %a0 > 0). The expected assembly spills ymm0 before the asm and reads it back
; as the memory operand of vpabsw (32-byte folded reload).
407define <16 x i16> @stack_fold_pabsw_ymm(<16 x i16> %a0) {
408; CHECK-LABEL: stack_fold_pabsw_ymm:
409; CHECK:       # %bb.0:
410; CHECK-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
411; CHECK-NEXT:    #APP
412; CHECK-NEXT:    nop
413; CHECK-NEXT:    #NO_APP
414; CHECK-NEXT:    vpabsw {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Folded Reload
415; CHECK-NEXT:    retq
  ; Side-effecting nop that defines an "=x" result and clobbers xmm1-xmm31 and
  ; flags; its result (%1) is never used.
416  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  ; Absolute-value idiom: keep positive lanes, negate the rest.
417  %2 = icmp sgt <16 x i16> %a0, zeroinitializer
418  %3 = sub <16 x i16> zeroinitializer, %a0
419  %4 = select <16 x i1> %2, <16 x i16> %a0, <16 x i16> %3
420  ret <16 x i16> %4
421}
422
; 128-bit VPACKSSDW via the @llvm.x86.sse2.packssdw.128 intrinsic.
; The expected assembly spills xmm1 before the asm and uses the stack slot as
; the memory operand of vpackssdw (16-byte folded reload), with xmm0 as the
; register operand.
423define <8 x i16> @stack_fold_packssdw(<4 x i32> %a0, <4 x i32> %a1) {
424; CHECK-LABEL: stack_fold_packssdw:
425; CHECK:       # %bb.0:
426; CHECK-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
427; CHECK-NEXT:    #APP
428; CHECK-NEXT:    nop
429; CHECK-NEXT:    #NO_APP
430; CHECK-NEXT:    vpackssdw {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
431; CHECK-NEXT:    retq
  ; Side-effecting nop that defines an "=x" result and clobbers xmm2-xmm31 and
  ; flags; its result (%1) is never used.
432  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
433  %2 = call <8 x i16> @llvm.x86.sse2.packssdw.128(<4 x i32> %a0, <4 x i32> %a1)
434  ret <8 x i16> %2
435}
436declare <8 x i16> @llvm.x86.sse2.packssdw.128(<4 x i32>, <4 x i32>) nounwind readnone
437
; 256-bit VPACKSSDW via the @llvm.x86.avx2.packssdw intrinsic.
; The expected assembly spills ymm1 before the asm and uses the stack slot as
; the memory operand of vpackssdw (32-byte folded reload), with ymm0 as the
; register operand.
438define <16 x i16> @stack_fold_packssdw_ymm(<8 x i32> %a0, <8 x i32> %a1) {
439; CHECK-LABEL: stack_fold_packssdw_ymm:
440; CHECK:       # %bb.0:
441; CHECK-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
442; CHECK-NEXT:    #APP
443; CHECK-NEXT:    nop
444; CHECK-NEXT:    #NO_APP
445; CHECK-NEXT:    vpackssdw {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
446; CHECK-NEXT:    retq
  ; Side-effecting nop that defines an "=x" result and clobbers xmm2-xmm31 and
  ; flags; its result (%1) is never used.
447  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
448  %2 = call <16 x i16> @llvm.x86.avx2.packssdw(<8 x i32> %a0, <8 x i32> %a1)
449  ret <16 x i16> %2
450}
451declare <16 x i16> @llvm.x86.avx2.packssdw(<8 x i32>, <8 x i32>) nounwind readnone
452
; 128-bit VPACKSSWB via the @llvm.x86.sse2.packsswb.128 intrinsic.
; The expected assembly spills xmm1 before the asm and uses the stack slot as
; the memory operand of vpacksswb (16-byte folded reload), with xmm0 as the
; register operand.
453define <16 x i8> @stack_fold_packsswb(<8 x i16> %a0, <8 x i16> %a1) {
454; CHECK-LABEL: stack_fold_packsswb:
455; CHECK:       # %bb.0:
456; CHECK-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
457; CHECK-NEXT:    #APP
458; CHECK-NEXT:    nop
459; CHECK-NEXT:    #NO_APP
460; CHECK-NEXT:    vpacksswb {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
461; CHECK-NEXT:    retq
  ; Side-effecting nop that defines an "=x" result and clobbers xmm2-xmm31 and
  ; flags; its result (%1) is never used.
462  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
463  %2 = call <16 x i8> @llvm.x86.sse2.packsswb.128(<8 x i16> %a0, <8 x i16> %a1)
464  ret <16 x i8> %2
465}
466declare <16 x i8> @llvm.x86.sse2.packsswb.128(<8 x i16>, <8 x i16>) nounwind readnone
467
; 256-bit VPACKSSWB via the @llvm.x86.avx2.packsswb intrinsic.
; The expected assembly spills ymm1 before the asm and uses the stack slot as
; the memory operand of vpacksswb (32-byte folded reload), with ymm0 as the
; register operand.
468define <32 x i8> @stack_fold_packsswb_ymm(<16 x i16> %a0, <16 x i16> %a1) {
469; CHECK-LABEL: stack_fold_packsswb_ymm:
470; CHECK:       # %bb.0:
471; CHECK-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
472; CHECK-NEXT:    #APP
473; CHECK-NEXT:    nop
474; CHECK-NEXT:    #NO_APP
475; CHECK-NEXT:    vpacksswb {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
476; CHECK-NEXT:    retq
  ; Side-effecting nop that defines an "=x" result and clobbers xmm2-xmm31 and
  ; flags; its result (%1) is never used.
477  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
478  %2 = call <32 x i8> @llvm.x86.avx2.packsswb(<16 x i16> %a0, <16 x i16> %a1)
479  ret <32 x i8> %2
480}
481declare <32 x i8> @llvm.x86.avx2.packsswb(<16 x i16>, <16 x i16>) nounwind readnone
482
; 128-bit VPACKUSDW via the @llvm.x86.sse41.packusdw intrinsic.
; The expected assembly spills xmm1 before the asm and uses the stack slot as
; the memory operand of vpackusdw (16-byte folded reload), with xmm0 as the
; register operand.
483define <8 x i16> @stack_fold_packusdw(<4 x i32> %a0, <4 x i32> %a1) {
484; CHECK-LABEL: stack_fold_packusdw:
485; CHECK:       # %bb.0:
486; CHECK-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
487; CHECK-NEXT:    #APP
488; CHECK-NEXT:    nop
489; CHECK-NEXT:    #NO_APP
490; CHECK-NEXT:    vpackusdw {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
491; CHECK-NEXT:    retq
  ; Side-effecting nop that defines an "=x" result and clobbers xmm2-xmm31 and
  ; flags; its result (%1) is never used.
492  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
493  %2 = call <8 x i16> @llvm.x86.sse41.packusdw(<4 x i32> %a0, <4 x i32> %a1)
494  ret <8 x i16> %2
495}
496declare <8 x i16> @llvm.x86.sse41.packusdw(<4 x i32>, <4 x i32>) nounwind readnone
497
; 256-bit VPACKUSDW via the @llvm.x86.avx2.packusdw intrinsic.
; The expected assembly spills ymm1 before the asm and uses the stack slot as
; the memory operand of vpackusdw (32-byte folded reload), with ymm0 as the
; register operand.
498define <16 x i16> @stack_fold_packusdw_ymm(<8 x i32> %a0, <8 x i32> %a1) {
499; CHECK-LABEL: stack_fold_packusdw_ymm:
500; CHECK:       # %bb.0:
501; CHECK-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
502; CHECK-NEXT:    #APP
503; CHECK-NEXT:    nop
504; CHECK-NEXT:    #NO_APP
505; CHECK-NEXT:    vpackusdw {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
506; CHECK-NEXT:    retq
  ; Side-effecting nop that defines an "=x" result and clobbers xmm2-xmm31 and
  ; flags; its result (%1) is never used.
507  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
508  %2 = call <16 x i16> @llvm.x86.avx2.packusdw(<8 x i32> %a0, <8 x i32> %a1)
509  ret <16 x i16> %2
510}
511declare <16 x i16> @llvm.x86.avx2.packusdw(<8 x i32>, <8 x i32>) nounwind readnone
512
; 128-bit VPACKUSWB via the @llvm.x86.sse2.packuswb.128 intrinsic.
; The expected assembly spills xmm1 before the asm and uses the stack slot as
; the memory operand of vpackuswb (16-byte folded reload), with xmm0 as the
; register operand.
513define <16 x i8> @stack_fold_packuswb(<8 x i16> %a0, <8 x i16> %a1) {
514; CHECK-LABEL: stack_fold_packuswb:
515; CHECK:       # %bb.0:
516; CHECK-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
517; CHECK-NEXT:    #APP
518; CHECK-NEXT:    nop
519; CHECK-NEXT:    #NO_APP
520; CHECK-NEXT:    vpackuswb {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
521; CHECK-NEXT:    retq
  ; Side-effecting nop that defines an "=x" result and clobbers xmm2-xmm31 and
  ; flags; its result (%1) is never used.
522  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
523  %2 = call <16 x i8> @llvm.x86.sse2.packuswb.128(<8 x i16> %a0, <8 x i16> %a1)
524  ret <16 x i8> %2
525}
526declare <16 x i8> @llvm.x86.sse2.packuswb.128(<8 x i16>, <8 x i16>) nounwind readnone
527
; Stack-fold test for the 256-bit @llvm.x86.avx2.packuswb intrinsic.
; The asm nop clobbers xmm2-xmm31 and flags; the assertions below expect ymm1
; to be spilled before it and the reload to be folded into vpackuswb.
528define <32 x i8> @stack_fold_packuswb_ymm(<16 x i16> %a0, <16 x i16> %a1) {
529; CHECK-LABEL: stack_fold_packuswb_ymm:
530; CHECK:       # %bb.0:
531; CHECK-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
532; CHECK-NEXT:    #APP
533; CHECK-NEXT:    nop
534; CHECK-NEXT:    #NO_APP
535; CHECK-NEXT:    vpackuswb {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
536; CHECK-NEXT:    retq
537  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
538  %2 = call <32 x i8> @llvm.x86.avx2.packuswb(<16 x i16> %a0, <16 x i16> %a1)
539  ret <32 x i8> %2
540}
541declare <32 x i8> @llvm.x86.avx2.packuswb(<16 x i16>, <16 x i16>) nounwind readnone
542
; Stack-fold test for a plain <16 x i8> add.
; The asm nop clobbers xmm2-xmm31 and flags; the assertions below expect xmm1
; to be spilled before it and the reload to be folded into vpaddb.
543define <16 x i8> @stack_fold_paddb(<16 x i8> %a0, <16 x i8> %a1) {
544; CHECK-LABEL: stack_fold_paddb:
545; CHECK:       # %bb.0:
546; CHECK-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
547; CHECK-NEXT:    #APP
548; CHECK-NEXT:    nop
549; CHECK-NEXT:    #NO_APP
550; CHECK-NEXT:    vpaddb {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
551; CHECK-NEXT:    retq
552  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
553  %2 = add <16 x i8> %a0, %a1
554  ret <16 x i8> %2
555}
556
; Merge-masked form of the <16 x i8> add: %mask is bitcast to <16 x i1> and
; selects per lane between the sum and the value loaded from %a2.
; The assertions below expect the spilled xmm1 to be folded into a vpaddb that
; writes the loaded passthru register under {%k1}.
557define <16 x i8> @stack_fold_paddb_mask(<16 x i8> %a0, <16 x i8> %a1, <16 x i8>* %a2, i16 %mask) {
558; CHECK-LABEL: stack_fold_paddb_mask:
559; CHECK:       # %bb.0:
560; CHECK-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
561; CHECK-NEXT:    #APP
562; CHECK-NEXT:    nop
563; CHECK-NEXT:    #NO_APP
564; CHECK-NEXT:    kmovd %esi, %k1
565; CHECK-NEXT:    vmovdqa (%rdi), %xmm2
566; CHECK-NEXT:    vpaddb {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm2 {%k1} # 16-byte Folded Reload
567; CHECK-NEXT:    vmovdqa %xmm2, %xmm0
568; CHECK-NEXT:    retq
569  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
570  %2 = add <16 x i8> %a0, %a1
571  %3 = bitcast i16 %mask to <16 x i1>
572  ; load needed to keep the operation from being scheduled above the asm block
573  %4 = load <16 x i8>, <16 x i8>* %a2
574  %5 = select <16 x i1> %3, <16 x i8> %2, <16 x i8> %4
575  ret <16 x i8> %5
576}
577
; Zero-masked form of the <16 x i8> add: %mask is bitcast to <16 x i1> and
; selects per lane between the sum and zeroinitializer.
; The assertions below expect the spilled xmm1 to be folded into a vpaddb
; carrying {%k1} {z}.
578define <16 x i8> @stack_fold_paddb_maskz(<16 x i8> %a0, <16 x i8> %a1, i16 %mask) {
579; CHECK-LABEL: stack_fold_paddb_maskz:
580; CHECK:       # %bb.0:
581; CHECK-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
582; CHECK-NEXT:    #APP
583; CHECK-NEXT:    nop
584; CHECK-NEXT:    #NO_APP
585; CHECK-NEXT:    kmovd %edi, %k1
586; CHECK-NEXT:    vpaddb {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 {%k1} {z} # 16-byte Folded Reload
587; CHECK-NEXT:    retq
588  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
589  %2 = add <16 x i8> %a0, %a1
590  %3 = bitcast i16 %mask to <16 x i1>
591  %4 = select <16 x i1> %3, <16 x i8> %2, <16 x i8> zeroinitializer
592  ret <16 x i8> %4
593}
594
; Stack-fold test for a plain <32 x i8> add.
; The asm nop clobbers xmm2-xmm31 and flags; the assertions below expect ymm1
; to be spilled before it and the reload to be folded into vpaddb.
595define <32 x i8> @stack_fold_paddb_ymm(<32 x i8> %a0, <32 x i8> %a1) {
596; CHECK-LABEL: stack_fold_paddb_ymm:
597; CHECK:       # %bb.0:
598; CHECK-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
599; CHECK-NEXT:    #APP
600; CHECK-NEXT:    nop
601; CHECK-NEXT:    #NO_APP
602; CHECK-NEXT:    vpaddb {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
603; CHECK-NEXT:    retq
604  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
605  %2 = add <32 x i8> %a0, %a1
606  ret <32 x i8> %2
607}
608
; Merge-masked form of the <32 x i8> add: %mask is bitcast to <32 x i1> and
; selects per lane between the sum and the value loaded from %a2.
; The assertions below expect the spilled ymm1 to be folded into a vpaddb that
; writes the loaded passthru register under {%k1}.
609define <32 x i8> @stack_fold_paddb_mask_ymm(<32 x i8> %a0, <32 x i8> %a1, <32 x i8>* %a2, i32 %mask) {
610; CHECK-LABEL: stack_fold_paddb_mask_ymm:
611; CHECK:       # %bb.0:
612; CHECK-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
613; CHECK-NEXT:    #APP
614; CHECK-NEXT:    nop
615; CHECK-NEXT:    #NO_APP
616; CHECK-NEXT:    kmovd %esi, %k1
617; CHECK-NEXT:    vmovdqa (%rdi), %ymm2
618; CHECK-NEXT:    vpaddb {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm2 {%k1} # 32-byte Folded Reload
619; CHECK-NEXT:    vmovdqa %ymm2, %ymm0
620; CHECK-NEXT:    retq
621  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
622  %2 = add <32 x i8> %a0, %a1
623  %3 = bitcast i32 %mask to <32 x i1>
624  ; load needed to keep the operation from being scheduled above the asm block
625  %4 = load <32 x i8>, <32 x i8>* %a2
626  %5 = select <32 x i1> %3, <32 x i8> %2, <32 x i8> %4
627  ret <32 x i8> %5
628}
629
; Zero-masked form of the <32 x i8> add: %mask is bitcast to <32 x i1> and
; selects per lane between the sum and zeroinitializer.
; The assertions below expect the spilled ymm1 to be folded into a vpaddb
; carrying {%k1} {z}.
630define <32 x i8> @stack_fold_paddb_maskz_ymm(<32 x i8> %a0, <32 x i8> %a1, i32 %mask) {
631; CHECK-LABEL: stack_fold_paddb_maskz_ymm:
632; CHECK:       # %bb.0:
633; CHECK-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
634; CHECK-NEXT:    #APP
635; CHECK-NEXT:    nop
636; CHECK-NEXT:    #NO_APP
637; CHECK-NEXT:    kmovd %edi, %k1
638; CHECK-NEXT:    vpaddb {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 {%k1} {z} # 32-byte Folded Reload
639; CHECK-NEXT:    retq
640  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
641  %2 = add <32 x i8> %a0, %a1
642  %3 = bitcast i32 %mask to <32 x i1>
643  %4 = select <32 x i1> %3, <32 x i8> %2, <32 x i8> zeroinitializer
644  ret <32 x i8> %4
645}
646
; Stack-fold test for a plain <4 x i32> add.
; The asm nop clobbers xmm2-xmm31 and flags; the assertions below expect xmm1
; to be spilled before it and the reload to be folded into vpaddd.
647define <4 x i32> @stack_fold_paddd(<4 x i32> %a0, <4 x i32> %a1) {
648; CHECK-LABEL: stack_fold_paddd:
649; CHECK:       # %bb.0:
650; CHECK-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
651; CHECK-NEXT:    #APP
652; CHECK-NEXT:    nop
653; CHECK-NEXT:    #NO_APP
654; CHECK-NEXT:    vpaddd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
655; CHECK-NEXT:    retq
656  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
657  %2 = add <4 x i32> %a0, %a1
658  ret <4 x i32> %2
659}
660
; Stack-fold test for a plain <8 x i32> add.
; The asm nop clobbers xmm2-xmm31 and flags; the assertions below expect ymm1
; to be spilled before it and the reload to be folded into vpaddd.
661define <8 x i32> @stack_fold_paddd_ymm(<8 x i32> %a0, <8 x i32> %a1) {
662; CHECK-LABEL: stack_fold_paddd_ymm:
663; CHECK:       # %bb.0:
664; CHECK-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
665; CHECK-NEXT:    #APP
666; CHECK-NEXT:    nop
667; CHECK-NEXT:    #NO_APP
668; CHECK-NEXT:    vpaddd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
669; CHECK-NEXT:    retq
670  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
671  %2 = add <8 x i32> %a0, %a1
672  ret <8 x i32> %2
673}
674
; Stack-fold test for a plain <2 x i64> add.
; The asm nop clobbers xmm2-xmm31 and flags; the assertions below expect xmm1
; to be spilled before it and the reload to be folded into vpaddq.
675define <2 x i64> @stack_fold_paddq(<2 x i64> %a0, <2 x i64> %a1) {
676; CHECK-LABEL: stack_fold_paddq:
677; CHECK:       # %bb.0:
678; CHECK-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
679; CHECK-NEXT:    #APP
680; CHECK-NEXT:    nop
681; CHECK-NEXT:    #NO_APP
682; CHECK-NEXT:    vpaddq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
683; CHECK-NEXT:    retq
684  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
685  %2 = add <2 x i64> %a0, %a1
686  ret <2 x i64> %2
687}
688
; Stack-fold test for a plain <4 x i64> add.
; The asm nop clobbers xmm2-xmm31 and flags; the assertions below expect ymm1
; to be spilled before it and the reload to be folded into vpaddq.
689define <4 x i64> @stack_fold_paddq_ymm(<4 x i64> %a0, <4 x i64> %a1) {
690; CHECK-LABEL: stack_fold_paddq_ymm:
691; CHECK:       # %bb.0:
692; CHECK-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
693; CHECK-NEXT:    #APP
694; CHECK-NEXT:    nop
695; CHECK-NEXT:    #NO_APP
696; CHECK-NEXT:    vpaddq {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
697; CHECK-NEXT:    retq
698  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
699  %2 = add <4 x i64> %a0, %a1
700  ret <4 x i64> %2
701}
702
; Stack-fold test for signed saturating byte add via @llvm.sadd.sat.v16i8.
; The asm nop clobbers xmm2-xmm31 and flags; the assertions below expect xmm1
; to be spilled before it and the reload to be folded into vpaddsb.
703define <16 x i8> @stack_fold_paddsb(<16 x i8> %a0, <16 x i8> %a1) {
704; CHECK-LABEL: stack_fold_paddsb:
705; CHECK:       # %bb.0:
706; CHECK-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
707; CHECK-NEXT:    #APP
708; CHECK-NEXT:    nop
709; CHECK-NEXT:    #NO_APP
710; CHECK-NEXT:    vpaddsb {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
711; CHECK-NEXT:    retq
712  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
713  %2 = call <16 x i8> @llvm.sadd.sat.v16i8(<16 x i8> %a0, <16 x i8> %a1)
714  ret <16 x i8> %2
715}
716declare <16 x i8> @llvm.sadd.sat.v16i8(<16 x i8>, <16 x i8>) nounwind readnone
717
; Stack-fold test for signed saturating byte add via @llvm.sadd.sat.v32i8.
; The asm nop clobbers xmm2-xmm31 and flags; the assertions below expect ymm1
; to be spilled before it and the reload to be folded into vpaddsb.
718define <32 x i8> @stack_fold_paddsb_ymm(<32 x i8> %a0, <32 x i8> %a1) {
719; CHECK-LABEL: stack_fold_paddsb_ymm:
720; CHECK:       # %bb.0:
721; CHECK-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
722; CHECK-NEXT:    #APP
723; CHECK-NEXT:    nop
724; CHECK-NEXT:    #NO_APP
725; CHECK-NEXT:    vpaddsb {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
726; CHECK-NEXT:    retq
727  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
728  %2 = call <32 x i8> @llvm.sadd.sat.v32i8(<32 x i8> %a0, <32 x i8> %a1)
729  ret <32 x i8> %2
730}
731declare <32 x i8> @llvm.sadd.sat.v32i8(<32 x i8>, <32 x i8>) nounwind readnone
732
; Stack-fold test for signed saturating word add via @llvm.sadd.sat.v8i16.
; The asm nop clobbers xmm2-xmm31 and flags; the assertions below expect xmm1
; to be spilled before it and the reload to be folded into vpaddsw.
733define <8 x i16> @stack_fold_paddsw(<8 x i16> %a0, <8 x i16> %a1) {
734; CHECK-LABEL: stack_fold_paddsw:
735; CHECK:       # %bb.0:
736; CHECK-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
737; CHECK-NEXT:    #APP
738; CHECK-NEXT:    nop
739; CHECK-NEXT:    #NO_APP
740; CHECK-NEXT:    vpaddsw {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
741; CHECK-NEXT:    retq
742  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
743  %2 = call <8 x i16> @llvm.sadd.sat.v8i16(<8 x i16> %a0, <8 x i16> %a1)
744  ret <8 x i16> %2
745}
746declare <8 x i16> @llvm.sadd.sat.v8i16(<8 x i16>, <8 x i16>) nounwind readnone
747
; Stack-fold test for signed saturating word add via @llvm.sadd.sat.v16i16.
; The asm nop clobbers xmm2-xmm31 and flags; the assertions below expect ymm1
; to be spilled before it and the reload to be folded into vpaddsw.
748define <16 x i16> @stack_fold_paddsw_ymm(<16 x i16> %a0, <16 x i16> %a1) {
749; CHECK-LABEL: stack_fold_paddsw_ymm:
750; CHECK:       # %bb.0:
751; CHECK-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
752; CHECK-NEXT:    #APP
753; CHECK-NEXT:    nop
754; CHECK-NEXT:    #NO_APP
755; CHECK-NEXT:    vpaddsw {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
756; CHECK-NEXT:    retq
757  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
758  %2 = call <16 x i16> @llvm.sadd.sat.v16i16(<16 x i16> %a0, <16 x i16> %a1)
759  ret <16 x i16> %2
760}
761declare <16 x i16> @llvm.sadd.sat.v16i16(<16 x i16>, <16 x i16>) nounwind readnone
762
; Stack-fold test for unsigned saturating byte add via @llvm.uadd.sat.v16i8.
; The asm nop clobbers xmm2-xmm31 and flags; the assertions below expect xmm1
; to be spilled before it and the reload to be folded into vpaddusb.
763define <16 x i8> @stack_fold_paddusb(<16 x i8> %a0, <16 x i8> %a1) {
764; CHECK-LABEL: stack_fold_paddusb:
765; CHECK:       # %bb.0:
766; CHECK-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
767; CHECK-NEXT:    #APP
768; CHECK-NEXT:    nop
769; CHECK-NEXT:    #NO_APP
770; CHECK-NEXT:    vpaddusb {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
771; CHECK-NEXT:    retq
772  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
773  %2 = call <16 x i8> @llvm.uadd.sat.v16i8(<16 x i8> %a0, <16 x i8> %a1)
774  ret <16 x i8> %2
775}
776declare <16 x i8> @llvm.uadd.sat.v16i8(<16 x i8>, <16 x i8>) nounwind readnone
777
; Stack-fold test for unsigned saturating byte add via @llvm.uadd.sat.v32i8.
; The asm nop clobbers xmm2-xmm31 and flags; the assertions below expect ymm1
; to be spilled before it and the reload to be folded into vpaddusb.
778define <32 x i8> @stack_fold_paddusb_ymm(<32 x i8> %a0, <32 x i8> %a1) {
779; CHECK-LABEL: stack_fold_paddusb_ymm:
780; CHECK:       # %bb.0:
781; CHECK-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
782; CHECK-NEXT:    #APP
783; CHECK-NEXT:    nop
784; CHECK-NEXT:    #NO_APP
785; CHECK-NEXT:    vpaddusb {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
786; CHECK-NEXT:    retq
787  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
788  %2 = call <32 x i8> @llvm.uadd.sat.v32i8(<32 x i8> %a0, <32 x i8> %a1)
789  ret <32 x i8> %2
790}
791declare <32 x i8> @llvm.uadd.sat.v32i8(<32 x i8>, <32 x i8>) nounwind readnone
792
; Stack-fold test for unsigned saturating word add via @llvm.uadd.sat.v8i16.
; The asm nop clobbers xmm2-xmm31 and flags; the assertions below expect xmm1
; to be spilled before it and the reload to be folded into vpaddusw.
793define <8 x i16> @stack_fold_paddusw(<8 x i16> %a0, <8 x i16> %a1) {
794; CHECK-LABEL: stack_fold_paddusw:
795; CHECK:       # %bb.0:
796; CHECK-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
797; CHECK-NEXT:    #APP
798; CHECK-NEXT:    nop
799; CHECK-NEXT:    #NO_APP
800; CHECK-NEXT:    vpaddusw {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
801; CHECK-NEXT:    retq
802  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
803  %2 = call <8 x i16> @llvm.uadd.sat.v8i16(<8 x i16> %a0, <8 x i16> %a1)
804  ret <8 x i16> %2
805}
806declare <8 x i16> @llvm.uadd.sat.v8i16(<8 x i16>, <8 x i16>) nounwind readnone
807
; Stack-fold test for unsigned saturating word add via @llvm.uadd.sat.v16i16.
; The asm nop clobbers xmm2-xmm31 and flags; the assertions below expect ymm1
; to be spilled before it and the reload to be folded into vpaddusw.
808define <16 x i16> @stack_fold_paddusw_ymm(<16 x i16> %a0, <16 x i16> %a1) {
809; CHECK-LABEL: stack_fold_paddusw_ymm:
810; CHECK:       # %bb.0:
811; CHECK-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
812; CHECK-NEXT:    #APP
813; CHECK-NEXT:    nop
814; CHECK-NEXT:    #NO_APP
815; CHECK-NEXT:    vpaddusw {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
816; CHECK-NEXT:    retq
817  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
818  %2 = call <16 x i16> @llvm.uadd.sat.v16i16(<16 x i16> %a0, <16 x i16> %a1)
819  ret <16 x i16> %2
820}
821declare <16 x i16> @llvm.uadd.sat.v16i16(<16 x i16>, <16 x i16>) nounwind readnone
822
; Stack-fold test for a plain <8 x i16> add.
; The asm nop clobbers xmm2-xmm31 and flags; the assertions below expect xmm1
; to be spilled before it and the reload to be folded into vpaddw.
823define <8 x i16> @stack_fold_paddw(<8 x i16> %a0, <8 x i16> %a1) {
824; CHECK-LABEL: stack_fold_paddw:
825; CHECK:       # %bb.0:
826; CHECK-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
827; CHECK-NEXT:    #APP
828; CHECK-NEXT:    nop
829; CHECK-NEXT:    #NO_APP
830; CHECK-NEXT:    vpaddw {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
831; CHECK-NEXT:    retq
832  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
833  %2 = add <8 x i16> %a0, %a1
834  ret <8 x i16> %2
835}
836
; Stack-fold test for a plain <16 x i16> add.
; The asm nop clobbers xmm2-xmm31 and flags; the assertions below expect ymm1
; to be spilled before it and the reload to be folded into vpaddw.
837define <16 x i16> @stack_fold_paddw_ymm(<16 x i16> %a0, <16 x i16> %a1) {
838; CHECK-LABEL: stack_fold_paddw_ymm:
839; CHECK:       # %bb.0:
840; CHECK-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
841; CHECK-NEXT:    #APP
842; CHECK-NEXT:    nop
843; CHECK-NEXT:    #NO_APP
844; CHECK-NEXT:    vpaddw {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
845; CHECK-NEXT:    retq
846  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
847  %2 = add <16 x i16> %a0, %a1
848  ret <16 x i16> %2
849}
850
; Stack-fold test for a 256-bit byte-align shuffle of (%a1, %a0).
; Here the asm nop clobbers xmm1-xmm31 and flags, and the assertions below
; expect both ymm0 and ymm1 to be spilled: one is reloaded with vmovdqu, the
; other is folded into vpalignr $1 as its memory operand.
851define <32 x i8> @stack_fold_palignr(<32 x i8> %a0, <32 x i8> %a1) {
852; CHECK-LABEL: stack_fold_palignr:
853; CHECK:       # %bb.0:
854; CHECK-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
855; CHECK-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
856; CHECK-NEXT:    #APP
857; CHECK-NEXT:    nop
858; CHECK-NEXT:    #NO_APP
859; CHECK-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
860; CHECK-NEXT:    vpalignr $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
861; CHECK-NEXT:    # ymm0 = mem[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],ymm0[0],mem[17,18,19,20,21,22,23,24,25,26,27,28,29,30,31],ymm0[16]
862; CHECK-NEXT:    retq
863  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
864  %2 = shufflevector <32 x i8> %a1, <32 x i8> %a0, <32 x i32> <i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 32, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 48>
865  ret <32 x i8> %2
866}
867
; Merge-masked form of the 256-bit byte-align shuffle: %mask is bitcast to
; <32 x i1> and selects per lane between the shuffle result and the value
; loaded from %passthru.
; The asm nop clobbers xmm1-xmm31 and flags; the assertions below expect both
; inputs to be spilled, one reloaded, and the other folded into a vpalignr $1
; that writes the loaded passthru register under {%k1}.
868define <32 x i8> @stack_fold_palignr_mask(<32 x i8> %a0, <32 x i8> %a1, <32 x i8>* %passthru, i32 %mask) {
869; CHECK-LABEL: stack_fold_palignr_mask:
870; CHECK:       # %bb.0:
871; CHECK-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
872; CHECK-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
873; CHECK-NEXT:    #APP
874; CHECK-NEXT:    nop
875; CHECK-NEXT:    #NO_APP
876; CHECK-NEXT:    kmovd %esi, %k1
877; CHECK-NEXT:    vmovdqa (%rdi), %ymm1
878; CHECK-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
879; CHECK-NEXT:    vpalignr $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 {%k1} # 32-byte Folded Reload
880; CHECK-NEXT:    # ymm1 {%k1} = mem[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],ymm0[0],mem[17,18,19,20,21,22,23,24,25,26,27,28,29,30,31],ymm0[16]
881; CHECK-NEXT:    vmovdqa %ymm1, %ymm0
882; CHECK-NEXT:    retq
883  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
884  %2 = shufflevector <32 x i8> %a1, <32 x i8> %a0, <32 x i32> <i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 32, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 48>
885  %3 = bitcast i32 %mask to <32 x i1>
886  %4 = load <32 x i8>, <32 x i8>* %passthru
887  %5 = select <32 x i1> %3, <32 x i8> %2, <32 x i8> %4
888  ret <32 x i8> %5
889}
890
; Zero-masked form of the 256-bit byte-align shuffle: %mask is bitcast to
; <32 x i1> and selects per lane between the shuffle result and
; zeroinitializer.
; The asm nop clobbers xmm1-xmm31 and flags; the assertions below expect both
; inputs to be spilled, one reloaded, and the other folded into a vpalignr $1
; carrying {%k1} {z}.
891define <32 x i8> @stack_fold_palignr_maskz(<32 x i8> %a0, <32 x i8> %a1, i32 %mask) {
892; CHECK-LABEL: stack_fold_palignr_maskz:
893; CHECK:       # %bb.0:
894; CHECK-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
895; CHECK-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
896; CHECK-NEXT:    #APP
897; CHECK-NEXT:    nop
898; CHECK-NEXT:    #NO_APP
899; CHECK-NEXT:    kmovd %edi, %k1
900; CHECK-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
901; CHECK-NEXT:    vpalignr $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 {%k1} {z} # 32-byte Folded Reload
902; CHECK-NEXT:    # ymm0 {%k1} {z} = mem[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],ymm0[0],mem[17,18,19,20,21,22,23,24,25,26,27,28,29,30,31],ymm0[16]
903; CHECK-NEXT:    retq
904  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
905  %2 = shufflevector <32 x i8> %a1, <32 x i8> %a0, <32 x i32> <i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 32, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 48>
906  %3 = bitcast i32 %mask to <32 x i1>
907  %4 = select <32 x i1> %3, <32 x i8> %2, <32 x i8> zeroinitializer
908  ret <32 x i8> %4
909}
910
; Stack-fold test for a <16 x i8> equality compare whose <16 x i1> result is
; bitcast to i16.
; The asm nop clobbers xmm2-xmm31 and flags; the assertions below expect the
; spilled xmm1 to be folded into a vpcmpeqb that writes mask register k0,
; which is then moved to eax.
911define i16 @stack_fold_pcmpeqb(<16 x i8> %a0, <16 x i8> %a1) {
912; CHECK-LABEL: stack_fold_pcmpeqb:
913; CHECK:       # %bb.0:
914; CHECK-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
915; CHECK-NEXT:    #APP
916; CHECK-NEXT:    nop
917; CHECK-NEXT:    #NO_APP
918; CHECK-NEXT:    vpcmpeqb {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %k0 # 16-byte Folded Reload
919; CHECK-NEXT:    kmovd %k0, %eax
920; CHECK-NEXT:    # kill: def $ax killed $ax killed $eax
921; CHECK-NEXT:    retq
922  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
923  %2 = icmp eq <16 x i8> %a0, %a1
924  %3 = bitcast <16 x i1> %2 to i16
925  ret i16 %3
926}
927
; Stack-fold test for a <4 x i32> equality compare returned as an i8 mask.
; The asm nop clobbers xmm2-xmm31 and flags; the assertions below expect the
; spilled xmm1 to be folded into a vpcmpeqd that writes mask register k0,
; which is then moved to eax.
928define i8 @stack_fold_pcmpeqd(<4 x i32> %a0, <4 x i32> %a1) {
929; CHECK-LABEL: stack_fold_pcmpeqd:
930; CHECK:       # %bb.0:
931; CHECK-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
932; CHECK-NEXT:    #APP
933; CHECK-NEXT:    nop
934; CHECK-NEXT:    #NO_APP
935; CHECK-NEXT:    vpcmpeqd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %k0 # 16-byte Folded Reload
936; CHECK-NEXT:    kmovd %k0, %eax
937; CHECK-NEXT:    # kill: def $al killed $al killed $eax
938; CHECK-NEXT:    retq
939  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
940  %2 = icmp eq <4 x i32> %a0, %a1
  ; widen <4 x i1> to <8 x i1> so it can be bitcast to i8; lanes 4-7 index into the undef operand
941  %3 = shufflevector <4 x i1> %2, <4 x i1> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
942  %4 = bitcast <8 x i1> %3 to i8
943  ret i8 %4
944}
945
; Stack-fold test for a <2 x i64> equality compare returned as an i8 mask.
; The asm nop clobbers xmm2-xmm31 and flags; the assertions below expect the
; spilled xmm1 to be folded into a vpcmpeqq that writes mask register k0,
; which is then moved to eax.
946define i8 @stack_fold_pcmpeqq(<2 x i64> %a0, <2 x i64> %a1) {
947; CHECK-LABEL: stack_fold_pcmpeqq:
948; CHECK:       # %bb.0:
949; CHECK-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
950; CHECK-NEXT:    #APP
951; CHECK-NEXT:    nop
952; CHECK-NEXT:    #NO_APP
953; CHECK-NEXT:    vpcmpeqq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %k0 # 16-byte Folded Reload
954; CHECK-NEXT:    kmovd %k0, %eax
955; CHECK-NEXT:    # kill: def $al killed $al killed $eax
956; CHECK-NEXT:    retq
957  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
958  %2 = icmp eq <2 x i64> %a0, %a1
  ; widen <2 x i1> to <8 x i1> so it can be bitcast to i8; indices 2 and 3 refer to the undef operand
959  %3 = shufflevector <2 x i1> %2, <2 x i1> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3>
960  %4 = bitcast <8 x i1> %3 to i8
961  ret i8 %4
962}
963
; Stack-fold test for an <8 x i16> equality compare whose <8 x i1> result is
; bitcast to i8.
; The asm nop clobbers xmm2-xmm31 and flags; the assertions below expect the
; spilled xmm1 to be folded into a vpcmpeqw that writes mask register k0,
; which is then moved to eax.
964define i8 @stack_fold_pcmpeqw(<8 x i16> %a0, <8 x i16> %a1) {
965; CHECK-LABEL: stack_fold_pcmpeqw:
966; CHECK:       # %bb.0:
967; CHECK-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
968; CHECK-NEXT:    #APP
969; CHECK-NEXT:    nop
970; CHECK-NEXT:    #NO_APP
971; CHECK-NEXT:    vpcmpeqw {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %k0 # 16-byte Folded Reload
972; CHECK-NEXT:    kmovd %k0, %eax
973; CHECK-NEXT:    # kill: def $al killed $al killed $eax
974; CHECK-NEXT:    retq
975  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
976  %2 = icmp eq <8 x i16> %a0, %a1
977  %3 = bitcast <8 x i1> %2 to i8
978  ret i8 %3
979}
980
; Stack-fold test for @llvm.x86.avx512.permvar.qi.256, called as (%a1, %a0).
; The asm nop clobbers xmm1-xmm31 and flags; the assertions below expect both
; ymm0 and ymm1 to be spilled, one reloaded with vmovdqu, and the other folded
; into vpermb. The trailing add of 1 shows up as vpcmpeqd + vpsubb.
981define <32 x i8> @stack_fold_permbvar(<32 x i8> %a0, <32 x i8> %a1) {
982; CHECK-LABEL: stack_fold_permbvar:
983; CHECK:       # %bb.0:
984; CHECK-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
985; CHECK-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
986; CHECK-NEXT:    #APP
987; CHECK-NEXT:    nop
988; CHECK-NEXT:    #NO_APP
989; CHECK-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
990; CHECK-NEXT:    vpermb {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
991; CHECK-NEXT:    vpcmpeqd %ymm1, %ymm1, %ymm1
992; CHECK-NEXT:    vpsubb %ymm1, %ymm0, %ymm0
993; CHECK-NEXT:    retq
994  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
995  %2 = call <32 x i8> @llvm.x86.avx512.permvar.qi.256(<32 x i8> %a1, <32 x i8> %a0)
996  ; add forces execution domain
997  %3 = add <32 x i8> %2, <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
998  ret <32 x i8> %3
999}
1000declare <32 x i8> @llvm.x86.avx512.permvar.qi.256(<32 x i8>, <32 x i8>) nounwind readonly
1001
; Stack-fold test for @llvm.x86.avx2.permd, called as (%a1, %a0).
; The asm nop clobbers xmm2-xmm31 and flags; the assertions below expect ymm1
; to be spilled before it and the reload to be folded into vpermd.
; The trailing add of 1 shows up as vpcmpeqd + vpsubd.
1002define <8 x i32> @stack_fold_permd(<8 x i32> %a0, <8 x i32> %a1) {
1003; CHECK-LABEL: stack_fold_permd:
1004; CHECK:       # %bb.0:
1005; CHECK-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
1006; CHECK-NEXT:    #APP
1007; CHECK-NEXT:    nop
1008; CHECK-NEXT:    #NO_APP
1009; CHECK-NEXT:    vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
1010; CHECK-NEXT:    vpcmpeqd %ymm1, %ymm1, %ymm1
1011; CHECK-NEXT:    vpsubd %ymm1, %ymm0, %ymm0
1012; CHECK-NEXT:    retq
1013  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
1014  %2 = call <8 x i32> @llvm.x86.avx2.permd(<8 x i32> %a1, <8 x i32> %a0)
1015  ; add forces execution domain
1016  %3 = add <8 x i32> %2, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
1017  ret <8 x i32> %3
1018}
1019declare <8 x i32> @llvm.x86.avx2.permd(<8 x i32>, <8 x i32>) nounwind readonly
1020
; Stack-fold test for @llvm.x86.avx512.vpermi2var.qi.128, called as
; (%x1, %x0, %x2). With three vector arguments the asm nop clobbers only
; xmm3-xmm31 and flags; the assertions below expect xmm2 to be spilled and the
; reload to be folded into vpermi2b.
1021define <16 x i8> @stack_fold_vpermi2b(<16 x i8> %x0, <16 x i8> %x1, <16 x i8> %x2) {
1022; CHECK-LABEL: stack_fold_vpermi2b:
1023; CHECK:       # %bb.0:
1024; CHECK-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1025; CHECK-NEXT:    #APP
1026; CHECK-NEXT:    nop
1027; CHECK-NEXT:    #NO_APP
1028; CHECK-NEXT:    vpermi2b {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 # 16-byte Folded Reload
1029; CHECK-NEXT:    retq
1030  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
1031  %2 = call <16 x i8> @llvm.x86.avx512.vpermi2var.qi.128(<16 x i8> %x1, <16 x i8> %x0, <16 x i8> %x2)
1032  ret <16 x i8> %2
1033}
1034
; Stack-fold test for @llvm.x86.avx512.vpermi2var.qi.256, called as
; (%x1, %x0, %x2). The asm nop (here with a <4 x i64> result) clobbers
; xmm3-xmm31 and flags; the assertions below expect ymm2 to be spilled and the
; reload to be folded into vpermi2b.
1035define <32 x i8> @stack_fold_vpermi2b_ymm(<32 x i8> %x0, <32 x i8> %x1, <32 x i8> %x2) {
1036; CHECK-LABEL: stack_fold_vpermi2b_ymm:
1037; CHECK:       # %bb.0:
1038; CHECK-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
1039; CHECK-NEXT:    #APP
1040; CHECK-NEXT:    nop
1041; CHECK-NEXT:    #NO_APP
1042; CHECK-NEXT:    vpermi2b {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm0 # 32-byte Folded Reload
1043; CHECK-NEXT:    retq
1044  %1 = tail call <4 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
1045  %2 = call <32 x i8> @llvm.x86.avx512.vpermi2var.qi.256(<32 x i8> %x1, <32 x i8> %x0, <32 x i8> %x2)
1046  ret <32 x i8> %2
1047}
1048
; Stack-fold test for vpermi2d (128-bit). The expected output spills xmm2
; (where the third argument %x2 arrives) before the nop and folds its reload
; into vpermi2d as the memory operand.
; The intrinsic is called with (%x1, %x0, %x2): the first two operands are
; swapped relative to stack_fold_vpermt2d, whose expected instruction is
; vpermt2d instead.
1049define <4 x i32> @stack_fold_vpermi2d(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2) {
1050; CHECK-LABEL: stack_fold_vpermi2d:
1051; CHECK:       # %bb.0:
1052; CHECK-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1053; CHECK-NEXT:    #APP
1054; CHECK-NEXT:    nop
1055; CHECK-NEXT:    #NO_APP
1056; CHECK-NEXT:    vpermi2d {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 # 16-byte Folded Reload
1057; CHECK-NEXT:    retq
  ; side-effecting nop that clobbers xmm3-xmm31 and flags to force the spill
1058  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
1059  %2 = call <4 x i32> @llvm.x86.avx512.vpermi2var.d.128(<4 x i32> %x1, <4 x i32> %x0, <4 x i32> %x2)
1060  ret <4 x i32> %2
1061}
1062
; Stack-fold test for vpermi2d (256-bit). The expected output spills ymm2
; (where the third argument %x2 arrives) before the nop and folds its reload
; into vpermi2d as the memory operand.
; The intrinsic is called with (%x1, %x0, %x2): the first two operands are
; swapped relative to stack_fold_vpermt2d_ymm, whose expected instruction is
; vpermt2d instead.
1063define <8 x i32> @stack_fold_vpermi2d_ymm(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2) {
1064; CHECK-LABEL: stack_fold_vpermi2d_ymm:
1065; CHECK:       # %bb.0:
1066; CHECK-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
1067; CHECK-NEXT:    #APP
1068; CHECK-NEXT:    nop
1069; CHECK-NEXT:    #NO_APP
1070; CHECK-NEXT:    vpermi2d {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm0 # 32-byte Folded Reload
1071; CHECK-NEXT:    retq
  ; side-effecting nop that clobbers xmm3-xmm31 and flags to force the spill
1072  %1 = tail call <4 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
1073  %2 = call <8 x i32> @llvm.x86.avx512.vpermi2var.d.256(<8 x i32> %x1, <8 x i32> %x0, <8 x i32> %x2)
1074  ret <8 x i32> %2
1075}
1076
; Stack-fold test for vpermi2q (128-bit). The expected output spills xmm2
; (where the third argument %x2 arrives) before the nop and folds its reload
; into vpermi2q as the memory operand.
; The intrinsic is called with (%x1, %x0, %x2): the first two operands are
; swapped relative to stack_fold_vpermt2q, whose expected instruction is
; vpermt2q instead.
1077define <2 x i64> @stack_fold_vpermi2q(<2 x i64> %x0, <2 x i64> %x1, <2 x i64> %x2) {
1078; CHECK-LABEL: stack_fold_vpermi2q:
1079; CHECK:       # %bb.0:
1080; CHECK-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1081; CHECK-NEXT:    #APP
1082; CHECK-NEXT:    nop
1083; CHECK-NEXT:    #NO_APP
1084; CHECK-NEXT:    vpermi2q {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 # 16-byte Folded Reload
1085; CHECK-NEXT:    retq
  ; side-effecting nop that clobbers xmm3-xmm31 and flags to force the spill
1086  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
1087  %2 = call <2 x i64> @llvm.x86.avx512.vpermi2var.q.128(<2 x i64> %x1, <2 x i64> %x0, <2 x i64> %x2)
1088  ret <2 x i64> %2
1089}
1090
; Stack-fold test for vpermi2q (256-bit). The expected output spills ymm2
; (where the third argument %x2 arrives) before the nop and folds its reload
; into vpermi2q as the memory operand.
; The intrinsic is called with (%x1, %x0, %x2): the first two operands are
; swapped relative to stack_fold_vpermt2q_ymm, whose expected instruction is
; vpermt2q instead.
1091define <4 x i64> @stack_fold_vpermi2q_ymm(<4 x i64> %x0, <4 x i64> %x1, <4 x i64> %x2) {
1092; CHECK-LABEL: stack_fold_vpermi2q_ymm:
1093; CHECK:       # %bb.0:
1094; CHECK-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
1095; CHECK-NEXT:    #APP
1096; CHECK-NEXT:    nop
1097; CHECK-NEXT:    #NO_APP
1098; CHECK-NEXT:    vpermi2q {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm0 # 32-byte Folded Reload
1099; CHECK-NEXT:    retq
  ; side-effecting nop that clobbers xmm3-xmm31 and flags to force the spill
1100  %1 = tail call <4 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
1101  %2 = call <4 x i64> @llvm.x86.avx512.vpermi2var.q.256(<4 x i64> %x1, <4 x i64> %x0, <4 x i64> %x2)
1102  ret <4 x i64> %2
1103}
1104
; Stack-fold test for vpermi2w (128-bit). The expected output spills xmm2
; (where the third argument %x2 arrives) before the nop and folds its reload
; into vpermi2w as the memory operand.
; The intrinsic is called with (%x1, %x0, %x2): the first two operands are
; swapped relative to stack_fold_vpermt2w, whose expected instruction is
; vpermt2w instead.
1105define <8 x i16> @stack_fold_vpermi2w(<8 x i16> %x0, <8 x i16> %x1, <8 x i16> %x2) {
1106; CHECK-LABEL: stack_fold_vpermi2w:
1107; CHECK:       # %bb.0:
1108; CHECK-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1109; CHECK-NEXT:    #APP
1110; CHECK-NEXT:    nop
1111; CHECK-NEXT:    #NO_APP
1112; CHECK-NEXT:    vpermi2w {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 # 16-byte Folded Reload
1113; CHECK-NEXT:    retq
  ; side-effecting nop that clobbers xmm3-xmm31 and flags to force the spill
1114  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
1115  %2 = call <8 x i16> @llvm.x86.avx512.vpermi2var.hi.128(<8 x i16> %x1, <8 x i16> %x0, <8 x i16> %x2)
1116  ret <8 x i16> %2
1117}
1118
; Stack-fold test for vpermi2w (256-bit). The expected output spills ymm2
; (where the third argument %x2 arrives) before the nop and folds its reload
; into vpermi2w as the memory operand.
; The intrinsic is called with (%x1, %x0, %x2): the first two operands are
; swapped relative to stack_fold_vpermt2w_ymm, whose expected instruction is
; vpermt2w instead.
1119define <16 x i16> @stack_fold_vpermi2w_ymm(<16 x i16> %x0, <16 x i16> %x1, <16 x i16> %x2) {
1120; CHECK-LABEL: stack_fold_vpermi2w_ymm:
1121; CHECK:       # %bb.0:
1122; CHECK-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
1123; CHECK-NEXT:    #APP
1124; CHECK-NEXT:    nop
1125; CHECK-NEXT:    #NO_APP
1126; CHECK-NEXT:    vpermi2w {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm0 # 32-byte Folded Reload
1127; CHECK-NEXT:    retq
  ; side-effecting nop that clobbers xmm3-xmm31 and flags to force the spill
1128  %1 = tail call <4 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
1129  %2 = call <16 x i16> @llvm.x86.avx512.vpermi2var.hi.256(<16 x i16> %x1, <16 x i16> %x0, <16 x i16> %x2)
1130  ret <16 x i16> %2
1131}
1132
; Stack-fold test for the immediate form of vpermq (256-bit). The expected
; output spills ymm0 before the nop and folds its reload into vpermq with
; immediate 235, i.e. lane order [3,2,2,3] taken from memory. The trailing
; vpcmpeqd/vpsubq pair is the expected lowering of the add of 1 (subtracting
; an all-ones vector).
1133define <4 x i64> @stack_fold_permq(<4 x i64> %a0) {
1134; CHECK-LABEL: stack_fold_permq:
1135; CHECK:       # %bb.0:
1136; CHECK-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
1137; CHECK-NEXT:    #APP
1138; CHECK-NEXT:    nop
1139; CHECK-NEXT:    #NO_APP
1140; CHECK-NEXT:    vpermq $235, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Folded Reload
1141; CHECK-NEXT:    # ymm0 = mem[3,2,2,3]
1142; CHECK-NEXT:    vpcmpeqd %ymm1, %ymm1, %ymm1
1143; CHECK-NEXT:    vpsubq %ymm1, %ymm0, %ymm0
1144; CHECK-NEXT:    retq
  ; side-effecting nop that clobbers xmm1-xmm31 and flags to force the spill
1145  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  ; single-source shuffle selecting elements 3,2,2,3 of %a0
1146  %2 = shufflevector <4 x i64> %a0, <4 x i64> undef, <4 x i32> <i32 3, i32 2, i32 2, i32 3>
1147  ; add forces execution domain
1148  %3 = add <4 x i64> %2, <i64 1, i64 1, i64 1, i64 1>
1149  ret <4 x i64> %3
1150}
1151
; Stack-fold test for the variable form of vpermq (256-bit). The nop clobbers
; xmm1-xmm31, and the expected output spills both ymm1 and ymm0; afterwards one
; value is reloaded into ymm0 and the other is folded into vpermq as its memory
; operand. The trailing vpcmpeqd/vpsubq pair is the expected lowering of the
; add of 1 (subtracting an all-ones vector).
; Note the intrinsic is called with (%a1, %a0).
1152define <4 x i64> @stack_fold_permqvar(<4 x i64> %a0, <4 x i64> %a1) {
1153; CHECK-LABEL: stack_fold_permqvar:
1154; CHECK:       # %bb.0:
1155; CHECK-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
1156; CHECK-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
1157; CHECK-NEXT:    #APP
1158; CHECK-NEXT:    nop
1159; CHECK-NEXT:    #NO_APP
1160; CHECK-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
1161; CHECK-NEXT:    vpermq {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
1162; CHECK-NEXT:    vpcmpeqd %ymm1, %ymm1, %ymm1
1163; CHECK-NEXT:    vpsubq %ymm1, %ymm0, %ymm0
1164; CHECK-NEXT:    retq
  ; side-effecting nop that clobbers xmm1-xmm31 and flags to force the spills
1165  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
1166  %2 = call <4 x i64> @llvm.x86.avx512.permvar.di.256(<4 x i64> %a1, <4 x i64> %a0)
1167  ; add forces execution domain
1168  %3 = add <4 x i64> %2, <i64 1, i64 1, i64 1, i64 1>
1169  ret <4 x i64> %3
1170}
; 256-bit qword permvar intrinsic used by stack_fold_permqvar.
1171declare <4 x i64> @llvm.x86.avx512.permvar.di.256(<4 x i64>, <4 x i64>) nounwind readonly
1172
; Stack-fold test for vpermt2b (128-bit). The expected output spills xmm2
; (where the third argument %x2 arrives) before the nop and folds its reload
; into vpermt2b as the memory operand.
; The intrinsic is called with the arguments in declaration order
; (%x0, %x1, %x2); stack_fold_vpermi2b swaps the first two and expects
; vpermi2b instead.
1173define <16 x i8> @stack_fold_vpermt2b(<16 x i8> %x0, <16 x i8> %x1, <16 x i8> %x2) {
1174; CHECK-LABEL: stack_fold_vpermt2b:
1175; CHECK:       # %bb.0:
1176; CHECK-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1177; CHECK-NEXT:    #APP
1178; CHECK-NEXT:    nop
1179; CHECK-NEXT:    #NO_APP
1180; CHECK-NEXT:    vpermt2b {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 # 16-byte Folded Reload
1181; CHECK-NEXT:    retq
  ; side-effecting nop that clobbers xmm3-xmm31 and flags to force the spill
1182  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
1183  %2 = call <16 x i8> @llvm.x86.avx512.vpermi2var.qi.128(<16 x i8> %x0, <16 x i8> %x1, <16 x i8> %x2)
1184  ret <16 x i8> %2
1185}
; 128-bit byte two-source permute intrinsic shared by the vpermi2b and vpermt2b tests.
1186declare <16 x i8> @llvm.x86.avx512.vpermi2var.qi.128(<16 x i8>, <16 x i8>, <16 x i8>)
1187
; Stack-fold test for vpermt2b (256-bit). The expected output spills ymm2
; (where the third argument %x2 arrives) before the nop and folds its reload
; into vpermt2b as the memory operand.
; The intrinsic is called with the arguments in declaration order
; (%x0, %x1, %x2); stack_fold_vpermi2b_ymm swaps the first two and expects
; vpermi2b instead.
1188define <32 x i8> @stack_fold_vpermt2b_ymm(<32 x i8> %x0, <32 x i8> %x1, <32 x i8> %x2) {
1189; CHECK-LABEL: stack_fold_vpermt2b_ymm:
1190; CHECK:       # %bb.0:
1191; CHECK-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
1192; CHECK-NEXT:    #APP
1193; CHECK-NEXT:    nop
1194; CHECK-NEXT:    #NO_APP
1195; CHECK-NEXT:    vpermt2b {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm0 # 32-byte Folded Reload
1196; CHECK-NEXT:    retq
  ; side-effecting nop that clobbers xmm3-xmm31 and flags to force the spill
1197  %1 = tail call <4 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
1198  %2 = call <32 x i8> @llvm.x86.avx512.vpermi2var.qi.256(<32 x i8> %x0, <32 x i8> %x1, <32 x i8> %x2)
1199  ret <32 x i8> %2
1200}
; 256-bit byte two-source permute intrinsic shared by the vpermi2b and vpermt2b ymm tests.
1201declare <32 x i8> @llvm.x86.avx512.vpermi2var.qi.256(<32 x i8>, <32 x i8>, <32 x i8>)
1202
; Stack-fold test for vpermt2d (128-bit). The expected output spills xmm2
; (where the third argument %x2 arrives) before the nop and folds its reload
; into vpermt2d as the memory operand.
; The intrinsic is called with the arguments in declaration order
; (%x0, %x1, %x2); stack_fold_vpermi2d swaps the first two and expects
; vpermi2d instead.
1203define <4 x i32> @stack_fold_vpermt2d(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2) {
1204; CHECK-LABEL: stack_fold_vpermt2d:
1205; CHECK:       # %bb.0:
1206; CHECK-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1207; CHECK-NEXT:    #APP
1208; CHECK-NEXT:    nop
1209; CHECK-NEXT:    #NO_APP
1210; CHECK-NEXT:    vpermt2d {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 # 16-byte Folded Reload
1211; CHECK-NEXT:    retq
  ; side-effecting nop that clobbers xmm3-xmm31 and flags to force the spill
1212  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
1213  %2 = call <4 x i32> @llvm.x86.avx512.vpermi2var.d.128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2)
1214  ret <4 x i32> %2
1215}
; 128-bit dword two-source permute intrinsic shared by the vpermi2d and vpermt2d tests.
1216declare <4 x i32> @llvm.x86.avx512.vpermi2var.d.128(<4 x i32>, <4 x i32>, <4 x i32>)
1217
; Stack-fold test for vpermt2d (256-bit). The expected output spills ymm2
; (where the third argument %x2 arrives) before the nop and folds its reload
; into vpermt2d as the memory operand.
; The intrinsic is called with the arguments in declaration order
; (%x0, %x1, %x2); stack_fold_vpermi2d_ymm swaps the first two and expects
; vpermi2d instead.
1218define <8 x i32> @stack_fold_vpermt2d_ymm(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2) {
1219; CHECK-LABEL: stack_fold_vpermt2d_ymm:
1220; CHECK:       # %bb.0:
1221; CHECK-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
1222; CHECK-NEXT:    #APP
1223; CHECK-NEXT:    nop
1224; CHECK-NEXT:    #NO_APP
1225; CHECK-NEXT:    vpermt2d {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm0 # 32-byte Folded Reload
1226; CHECK-NEXT:    retq
  ; side-effecting nop that clobbers xmm3-xmm31 and flags to force the spill
1227  %1 = tail call <4 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
1228  %2 = call <8 x i32> @llvm.x86.avx512.vpermi2var.d.256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2)
1229  ret <8 x i32> %2
1230}
; 256-bit dword two-source permute intrinsic shared by the vpermi2d and vpermt2d ymm tests.
1231declare <8 x i32> @llvm.x86.avx512.vpermi2var.d.256(<8 x i32>, <8 x i32>, <8 x i32>)
1232
; Stack-fold test for vpermt2q (128-bit). The expected output spills xmm2
; (where the third argument %x2 arrives) before the nop and folds its reload
; into vpermt2q as the memory operand.
; The intrinsic is called with the arguments in declaration order
; (%x0, %x1, %x2); stack_fold_vpermi2q swaps the first two and expects
; vpermi2q instead.
1233define <2 x i64> @stack_fold_vpermt2q(<2 x i64> %x0, <2 x i64> %x1, <2 x i64> %x2) {
1234; CHECK-LABEL: stack_fold_vpermt2q:
1235; CHECK:       # %bb.0:
1236; CHECK-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1237; CHECK-NEXT:    #APP
1238; CHECK-NEXT:    nop
1239; CHECK-NEXT:    #NO_APP
1240; CHECK-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 # 16-byte Folded Reload
1241; CHECK-NEXT:    retq
  ; side-effecting nop that clobbers xmm3-xmm31 and flags to force the spill
1242  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
1243  %2 = call <2 x i64> @llvm.x86.avx512.vpermi2var.q.128(<2 x i64> %x0, <2 x i64> %x1, <2 x i64> %x2)
1244  ret <2 x i64> %2
1245}
; 128-bit qword two-source permute intrinsic shared by the vpermi2q and vpermt2q tests.
1246declare <2 x i64> @llvm.x86.avx512.vpermi2var.q.128(<2 x i64>, <2 x i64>, <2 x i64>)
1247
; Stack-fold test for vpermt2q (256-bit). The expected output spills ymm2
; (where the third argument %x2 arrives) before the nop and folds its reload
; into vpermt2q as the memory operand.
; The intrinsic is called with the arguments in declaration order
; (%x0, %x1, %x2); stack_fold_vpermi2q_ymm swaps the first two and expects
; vpermi2q instead.
1248define <4 x i64> @stack_fold_vpermt2q_ymm(<4 x i64> %x0, <4 x i64> %x1, <4 x i64> %x2) {
1249; CHECK-LABEL: stack_fold_vpermt2q_ymm:
1250; CHECK:       # %bb.0:
1251; CHECK-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
1252; CHECK-NEXT:    #APP
1253; CHECK-NEXT:    nop
1254; CHECK-NEXT:    #NO_APP
1255; CHECK-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm0 # 32-byte Folded Reload
1256; CHECK-NEXT:    retq
  ; side-effecting nop that clobbers xmm3-xmm31 and flags to force the spill
1257  %1 = tail call <4 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
1258  %2 = call <4 x i64> @llvm.x86.avx512.vpermi2var.q.256(<4 x i64> %x0, <4 x i64> %x1, <4 x i64> %x2)
1259  ret <4 x i64> %2
1260}
; 256-bit qword two-source permute intrinsic shared by the vpermi2q and vpermt2q ymm tests.
1261declare <4 x i64> @llvm.x86.avx512.vpermi2var.q.256(<4 x i64>, <4 x i64>, <4 x i64>)
1262
; Stack-fold test for vpermt2w (128-bit). The expected output spills xmm2
; (where the third argument %x2 arrives) before the nop and folds its reload
; into vpermt2w as the memory operand.
; The intrinsic is called with the arguments in declaration order
; (%x0, %x1, %x2); stack_fold_vpermi2w swaps the first two and expects
; vpermi2w instead.
1263define <8 x i16> @stack_fold_vpermt2w(<8 x i16> %x0, <8 x i16> %x1, <8 x i16> %x2) {
1264; CHECK-LABEL: stack_fold_vpermt2w:
1265; CHECK:       # %bb.0:
1266; CHECK-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1267; CHECK-NEXT:    #APP
1268; CHECK-NEXT:    nop
1269; CHECK-NEXT:    #NO_APP
1270; CHECK-NEXT:    vpermt2w {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 # 16-byte Folded Reload
1271; CHECK-NEXT:    retq
  ; side-effecting nop that clobbers xmm3-xmm31 and flags to force the spill
1272  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
1273  %2 = call <8 x i16> @llvm.x86.avx512.vpermi2var.hi.128(<8 x i16> %x0, <8 x i16> %x1, <8 x i16> %x2)
1274  ret <8 x i16> %2
1275}
; 128-bit word two-source permute intrinsic shared by the vpermi2w and vpermt2w tests.
1276declare <8 x i16> @llvm.x86.avx512.vpermi2var.hi.128(<8 x i16>, <8 x i16>, <8 x i16>)
1277
; Stack-fold test for vpermt2w (256-bit). The expected output spills ymm2
; (where the third argument %x2 arrives) before the nop and folds its reload
; into vpermt2w as the memory operand.
; The intrinsic is called with the arguments in declaration order
; (%x0, %x1, %x2); stack_fold_vpermi2w_ymm swaps the first two and expects
; vpermi2w instead.
1278define <16 x i16> @stack_fold_vpermt2w_ymm(<16 x i16> %x0, <16 x i16> %x1, <16 x i16> %x2) {
1279; CHECK-LABEL: stack_fold_vpermt2w_ymm:
1280; CHECK:       # %bb.0:
1281; CHECK-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
1282; CHECK-NEXT:    #APP
1283; CHECK-NEXT:    nop
1284; CHECK-NEXT:    #NO_APP
1285; CHECK-NEXT:    vpermt2w {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm0 # 32-byte Folded Reload
1286; CHECK-NEXT:    retq
  ; side-effecting nop that clobbers xmm3-xmm31 and flags to force the spill
1287  %1 = tail call <4 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
1288  %2 = call <16 x i16> @llvm.x86.avx512.vpermi2var.hi.256(<16 x i16> %x0, <16 x i16> %x1, <16 x i16> %x2)
1289  ret <16 x i16> %2
1290}
; 256-bit word two-source permute intrinsic shared by the vpermi2w and vpermt2w ymm tests.
1291declare <16 x i16> @llvm.x86.avx512.vpermi2var.hi.256(<16 x i16>, <16 x i16>, <16 x i16>)
1292
; Stack-fold test for vpermw (256-bit). The nop clobbers xmm1-xmm31, and the
; expected output spills both ymm1 and ymm0; afterwards one value is reloaded
; into ymm0 and the other is folded into vpermw as its memory operand. The
; trailing vpcmpeqd/vpsubw pair is the expected lowering of the add of 1
; (subtracting an all-ones vector).
; Note the intrinsic is called with (%a1, %a0).
1293define <16 x i16> @stack_fold_permwvar(<16 x i16> %a0, <16 x i16> %a1) {
1294; CHECK-LABEL: stack_fold_permwvar:
1295; CHECK:       # %bb.0:
1296; CHECK-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
1297; CHECK-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
1298; CHECK-NEXT:    #APP
1299; CHECK-NEXT:    nop
1300; CHECK-NEXT:    #NO_APP
1301; CHECK-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
1302; CHECK-NEXT:    vpermw {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
1303; CHECK-NEXT:    vpcmpeqd %ymm1, %ymm1, %ymm1
1304; CHECK-NEXT:    vpsubw %ymm1, %ymm0, %ymm0
1305; CHECK-NEXT:    retq
  ; side-effecting nop that clobbers xmm1-xmm31 and flags to force the spills
1306  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
1307  %2 = call <16 x i16> @llvm.x86.avx512.permvar.hi.256(<16 x i16> %a1, <16 x i16> %a0)
1308  ; add forces execution domain
1309  %3 = add <16 x i16> %2, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
1310  ret <16 x i16> %3
1311}
; 256-bit word permvar intrinsic used by stack_fold_permwvar.
1312declare <16 x i16> @llvm.x86.avx512.permvar.hi.256(<16 x i16>, <16 x i16>) nounwind readonly
1313
; Stack-fold test for vplzcntd (128-bit). The expected output spills xmm0
; before the nop and folds its reload into vplzcntd as the memory operand.
; The count is requested through the generic llvm.ctlz intrinsic with its
; i1 flag set to false.
1314define <4 x i32> @stack_fold_vplzcntd(<4 x i32> %a0) {
1315; CHECK-LABEL: stack_fold_vplzcntd:
1316; CHECK:       # %bb.0:
1317; CHECK-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1318; CHECK-NEXT:    #APP
1319; CHECK-NEXT:    nop
1320; CHECK-NEXT:    #NO_APP
1321; CHECK-NEXT:    vplzcntd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
1322; CHECK-NEXT:    retq
  ; side-effecting nop that clobbers xmm1-xmm31 and flags to force the spill
1323  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
1324  %2 = call <4 x i32> @llvm.ctlz.v4i32(<4 x i32> %a0, i1 false)
1325  ret <4 x i32> %2
1326}
1327
; Stack-fold test for vplzcntd (256-bit). The expected output spills ymm0
; before the nop and folds its reload into vplzcntd as the memory operand.
; The count is requested through the generic llvm.ctlz intrinsic with its
; i1 flag set to false.
1328define <8 x i32> @stack_fold_vplzcntd_ymm(<8 x i32> %a0) {
1329; CHECK-LABEL: stack_fold_vplzcntd_ymm:
1330; CHECK:       # %bb.0:
1331; CHECK-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
1332; CHECK-NEXT:    #APP
1333; CHECK-NEXT:    nop
1334; CHECK-NEXT:    #NO_APP
1335; CHECK-NEXT:    vplzcntd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Folded Reload
1336; CHECK-NEXT:    retq
  ; side-effecting nop that clobbers xmm1-xmm31 and flags to force the spill
1337  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
1338  %2 = call <8 x i32> @llvm.ctlz.v8i32(<8 x i32> %a0, i1 false)
1339  ret <8 x i32> %2
1340}
1341
; Stack-fold test for vplzcntq (128-bit). The expected output spills xmm0
; before the nop and folds its reload into vplzcntq as the memory operand.
; The count is requested through the generic llvm.ctlz intrinsic with its
; i1 flag set to false.
1342define <2 x i64> @stack_fold_vplzcntq(<2 x i64> %a0) {
1343; CHECK-LABEL: stack_fold_vplzcntq:
1344; CHECK:       # %bb.0:
1345; CHECK-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1346; CHECK-NEXT:    #APP
1347; CHECK-NEXT:    nop
1348; CHECK-NEXT:    #NO_APP
1349; CHECK-NEXT:    vplzcntq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
1350; CHECK-NEXT:    retq
  ; side-effecting nop that clobbers xmm1-xmm31 and flags to force the spill
1351  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
1352  %2 = call <2 x i64> @llvm.ctlz.v2i64(<2 x i64> %a0, i1 false)
1353  ret <2 x i64> %2
1354}
1355
; Stack-fold test for vplzcntq (256-bit). The expected output spills ymm0
; before the nop and folds its reload into vplzcntq as the memory operand.
; The count is requested through the generic llvm.ctlz intrinsic with its
; i1 flag set to false.
1356define <4 x i64> @stack_fold_vplzcntq_ymm(<4 x i64> %a0) {
1357; CHECK-LABEL: stack_fold_vplzcntq_ymm:
1358; CHECK:       # %bb.0:
1359; CHECK-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
1360; CHECK-NEXT:    #APP
1361; CHECK-NEXT:    nop
1362; CHECK-NEXT:    #NO_APP
1363; CHECK-NEXT:    vplzcntq {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Folded Reload
1364; CHECK-NEXT:    retq
  ; side-effecting nop that clobbers xmm1-xmm31 and flags to force the spill
1365  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
1366  %2 = call <4 x i64> @llvm.ctlz.v4i64(<4 x i64> %a0, i1 false)
1367  ret <4 x i64> %2
1368}
1369
; Stack-fold test for unmasked vpmaddubsw (128-bit). The expected output
; spills xmm1 (where the second argument %a1 arrives) before the nop and folds
; its reload into vpmaddubsw as the memory operand.
1370define <8 x i16> @stack_fold_pmaddubsw(<16 x i8> %a0, <16 x i8> %a1) {
1371; CHECK-LABEL: stack_fold_pmaddubsw:
1372; CHECK:       # %bb.0:
1373; CHECK-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1374; CHECK-NEXT:    #APP
1375; CHECK-NEXT:    nop
1376; CHECK-NEXT:    #NO_APP
1377; CHECK-NEXT:    vpmaddubsw {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
1378; CHECK-NEXT:    retq
  ; side-effecting nop that clobbers xmm2-xmm31 and flags to force the spill
1379  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
1380  %2 = call <8 x i16> @llvm.x86.ssse3.pmadd.ub.sw.128(<16 x i8> %a0, <16 x i8> %a1)
1381  ret <8 x i16> %2
1382}
; 128-bit pmaddubsw intrinsic used by the stack_fold_pmaddubsw* tests.
1383declare <8 x i16> @llvm.x86.ssse3.pmadd.ub.sw.128(<16 x i8>, <16 x i8>) nounwind readnone
1384
; Stack-fold test for merge-masked vpmaddubsw (128-bit). The expected output
; spills xmm1 before the nop, moves the mask into %k1, loads the passthru
; vector from %passthru into xmm2, and folds the reload into a vpmaddubsw that
; writes xmm2 under %k1; the merged result is then copied to xmm0.
1385define <8 x i16> @stack_fold_pmaddubsw_mask(<8 x i16>* %passthru, <16 x i8> %a0, <16 x i8> %a1, i8 %mask) {
1386; CHECK-LABEL: stack_fold_pmaddubsw_mask:
1387; CHECK:       # %bb.0:
1388; CHECK-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1389; CHECK-NEXT:    #APP
1390; CHECK-NEXT:    nop
1391; CHECK-NEXT:    #NO_APP
1392; CHECK-NEXT:    kmovd %esi, %k1
1393; CHECK-NEXT:    vmovdqa (%rdi), %xmm2
1394; CHECK-NEXT:    vpmaddubsw {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm2 {%k1} # 16-byte Folded Reload
1395; CHECK-NEXT:    vmovdqa %xmm2, %xmm0
1396; CHECK-NEXT:    retq
  ; side-effecting nop that clobbers xmm2-xmm31 and flags to force the spill
1397  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
1398  %2 = call <8 x i16> @llvm.x86.ssse3.pmadd.ub.sw.128(<16 x i8> %a0, <16 x i8> %a1)
  ; view the i8 mask as one i1 per 16-bit result lane
1399  %3 = bitcast i8 %mask to <8 x i1>
1400  ; load needed to keep the operation from being scheduled above the asm block
1401  %4 = load <8 x i16>, <8 x i16>* %passthru
  ; lanes whose mask bit is clear take the passthru value
1402  %5 = select <8 x i1> %3, <8 x i16> %2, <8 x i16> %4
1403  ret <8 x i16> %5
1404}
1405
; Stack-fold test for zero-masked vpmaddubsw (128-bit). The expected output
; spills xmm1 before the nop, moves the mask into %k1, and folds the reload
; into a vpmaddubsw using {%k1} {z} zero-masking.
1406define <8 x i16> @stack_fold_pmaddubsw_maskz(<16 x i8> %a0, <16 x i8> %a1, i8 %mask) {
1407; CHECK-LABEL: stack_fold_pmaddubsw_maskz:
1408; CHECK:       # %bb.0:
1409; CHECK-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1410; CHECK-NEXT:    #APP
1411; CHECK-NEXT:    nop
1412; CHECK-NEXT:    #NO_APP
1413; CHECK-NEXT:    kmovd %edi, %k1
1414; CHECK-NEXT:    vpmaddubsw {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 {%k1} {z} # 16-byte Folded Reload
1415; CHECK-NEXT:    retq
  ; side-effecting nop that clobbers xmm2-xmm31 and flags to force the spill
1416  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
1417  %2 = call <8 x i16> @llvm.x86.ssse3.pmadd.ub.sw.128(<16 x i8> %a0, <16 x i8> %a1)
  ; view the i8 mask as one i1 per 16-bit result lane
1418  %3 = bitcast i8 %mask to <8 x i1>
  ; lanes whose mask bit is clear become zero
1419  %4 = select <8 x i1> %3, <8 x i16> %2, <8 x i16> zeroinitializer
1420  ret <8 x i16> %4
1421}
1422
; Stack-fold test for unmasked vpmaddubsw (256-bit). The expected output
; spills ymm1 (where the second argument %a1 arrives) before the nop and folds
; its reload into vpmaddubsw as the memory operand.
1423define <16 x i16> @stack_fold_pmaddubsw_ymm(<32 x i8> %a0, <32 x i8> %a1) {
1424; CHECK-LABEL: stack_fold_pmaddubsw_ymm:
1425; CHECK:       # %bb.0:
1426; CHECK-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
1427; CHECK-NEXT:    #APP
1428; CHECK-NEXT:    nop
1429; CHECK-NEXT:    #NO_APP
1430; CHECK-NEXT:    vpmaddubsw {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
1431; CHECK-NEXT:    retq
  ; side-effecting nop that clobbers xmm2-xmm31 and flags to force the spill
1432  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
1433  %2 = call <16 x i16> @llvm.x86.avx2.pmadd.ub.sw(<32 x i8> %a0, <32 x i8> %a1)
1434  ret <16 x i16> %2
1435}
; 256-bit pmaddubsw intrinsic used by the stack_fold_pmaddubsw_ymm* tests.
1436declare <16 x i16> @llvm.x86.avx2.pmadd.ub.sw(<32 x i8>, <32 x i8>) nounwind readnone
1437
; Stack-fold test for merge-masked vpmaddubsw (256-bit). The expected output
; spills ymm1 before the nop, moves the mask into %k1, loads the passthru
; vector from %passthru into ymm2, and folds the reload into a vpmaddubsw that
; writes ymm2 under %k1; the merged result is then copied to ymm0.
1438define <16 x i16> @stack_fold_pmaddubsw_ymm_mask(<16 x i16>* %passthru, <32 x i8> %a0, <32 x i8> %a1, i16 %mask) {
1439; CHECK-LABEL: stack_fold_pmaddubsw_ymm_mask:
1440; CHECK:       # %bb.0:
1441; CHECK-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
1442; CHECK-NEXT:    #APP
1443; CHECK-NEXT:    nop
1444; CHECK-NEXT:    #NO_APP
1445; CHECK-NEXT:    kmovd %esi, %k1
1446; CHECK-NEXT:    vmovdqa (%rdi), %ymm2
1447; CHECK-NEXT:    vpmaddubsw {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm2 {%k1} # 32-byte Folded Reload
1448; CHECK-NEXT:    vmovdqa %ymm2, %ymm0
1449; CHECK-NEXT:    retq
  ; side-effecting nop that clobbers xmm2-xmm31 and flags to force the spill
1450  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
1451  %2 = call <16 x i16> @llvm.x86.avx2.pmadd.ub.sw(<32 x i8> %a0, <32 x i8> %a1)
  ; view the i16 mask as one i1 per 16-bit result lane
1452  %3 = bitcast i16 %mask to <16 x i1>
1453  ; load needed to keep the operation from being scheduled above the asm block
1454  %4 = load <16 x i16>, <16 x i16>* %passthru
  ; lanes whose mask bit is clear take the passthru value
1455  %5 = select <16 x i1> %3, <16 x i16> %2, <16 x i16> %4
1456  ret <16 x i16> %5
1457}
1458
; Stack-fold test for zero-masked vpmaddubsw (256-bit). The expected output
; spills ymm1 before the nop, moves the mask into %k1, and folds the reload
; into a vpmaddubsw using {%k1} {z} zero-masking.
1459define <16 x i16> @stack_fold_pmaddubsw_ymm_maskz(<32 x i8> %a0, <32 x i8> %a1, i16 %mask) {
1460; CHECK-LABEL: stack_fold_pmaddubsw_ymm_maskz:
1461; CHECK:       # %bb.0:
1462; CHECK-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
1463; CHECK-NEXT:    #APP
1464; CHECK-NEXT:    nop
1465; CHECK-NEXT:    #NO_APP
1466; CHECK-NEXT:    kmovd %edi, %k1
1467; CHECK-NEXT:    vpmaddubsw {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 {%k1} {z} # 32-byte Folded Reload
1468; CHECK-NEXT:    retq
  ; side-effecting nop that clobbers xmm2-xmm31 and flags to force the spill
1469  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
1470  %2 = call <16 x i16> @llvm.x86.avx2.pmadd.ub.sw(<32 x i8> %a0, <32 x i8> %a1)
  ; view the i16 mask as one i1 per 16-bit result lane
1471  %3 = bitcast i16 %mask to <16 x i1>
  ; lanes whose mask bit is clear become zero
1472  %4 = select <16 x i1> %3, <16 x i16> %2, <16 x i16> zeroinitializer
1473  ret <16 x i16> %4
1474}
1475
; Stack-fold test for unmasked vpmaddwd (128-bit). The expected output spills
; xmm1 (where the second argument %a1 arrives) before the nop and folds its
; reload into vpmaddwd as the memory operand.
1476define <4 x i32> @stack_fold_pmaddwd(<8 x i16> %a0, <8 x i16> %a1) {
1477; CHECK-LABEL: stack_fold_pmaddwd:
1478; CHECK:       # %bb.0:
1479; CHECK-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1480; CHECK-NEXT:    #APP
1481; CHECK-NEXT:    nop
1482; CHECK-NEXT:    #NO_APP
1483; CHECK-NEXT:    vpmaddwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
1484; CHECK-NEXT:    retq
  ; side-effecting nop that clobbers xmm2-xmm31 and flags to force the spill
1485  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
1486  %2 = call <4 x i32> @llvm.x86.sse2.pmadd.wd(<8 x i16> %a0, <8 x i16> %a1)
1487  ret <4 x i32> %2
1488}
; 128-bit pmaddwd intrinsic used by the stack_fold_pmaddwd* tests.
1489declare <4 x i32> @llvm.x86.sse2.pmadd.wd(<8 x i16>, <8 x i16>) nounwind readnone
1490
; Stack-fold test for merge-masked vpmaddwd (128-bit). The expected output
; spills xmm1 before the nop, moves the mask into %k1, loads the passthru
; vector from %passthru into xmm2, and folds the reload into a vpmaddwd that
; writes xmm2 under %k1; the merged result is then copied to xmm0.
1491define <4 x i32> @stack_fold_pmaddwd_mask(<4 x i32>* %passthru, <8 x i16> %a0, <8 x i16> %a1, i8 %mask) {
1492; CHECK-LABEL: stack_fold_pmaddwd_mask:
1493; CHECK:       # %bb.0:
1494; CHECK-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1495; CHECK-NEXT:    #APP
1496; CHECK-NEXT:    nop
1497; CHECK-NEXT:    #NO_APP
1498; CHECK-NEXT:    kmovd %esi, %k1
1499; CHECK-NEXT:    vmovdqa (%rdi), %xmm2
1500; CHECK-NEXT:    vpmaddwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm2 {%k1} # 16-byte Folded Reload
1501; CHECK-NEXT:    vmovdqa %xmm2, %xmm0
1502; CHECK-NEXT:    retq
  ; side-effecting nop that clobbers xmm2-xmm31 and flags to force the spill
1503  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
1504  %2 = call <4 x i32> @llvm.x86.sse2.pmadd.wd(<8 x i16> %a0, <8 x i16> %a1)
1505  %3 = bitcast i8 %mask to <8 x i1>
  ; keep only the low four mask bits, one per 32-bit result lane
1506  %4 = shufflevector <8 x i1> %3, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
1507  ; load needed to keep the operation from being scheduled above the asm block
1508  %5 = load <4 x i32>, <4 x i32>* %passthru
  ; lanes whose mask bit is clear take the passthru value
1509  %6 = select <4 x i1> %4, <4 x i32> %2, <4 x i32> %5
1510  ret <4 x i32> %6
1511}
1512
; Stack-fold test for zero-masked vpmaddwd (128-bit). The expected output
; spills xmm1 before the nop, moves the mask into %k1, and folds the reload
; into a vpmaddwd using {%k1} {z} zero-masking.
1513define <4 x i32> @stack_fold_pmaddwd_maskz(<8 x i16> %a0, <8 x i16> %a1, i8 %mask) {
1514; CHECK-LABEL: stack_fold_pmaddwd_maskz:
1515; CHECK:       # %bb.0:
1516; CHECK-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1517; CHECK-NEXT:    #APP
1518; CHECK-NEXT:    nop
1519; CHECK-NEXT:    #NO_APP
1520; CHECK-NEXT:    kmovd %edi, %k1
1521; CHECK-NEXT:    vpmaddwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 {%k1} {z} # 16-byte Folded Reload
1522; CHECK-NEXT:    retq
  ; side-effecting nop that clobbers xmm2-xmm31 and flags to force the spill
1523  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
1524  %2 = call <4 x i32> @llvm.x86.sse2.pmadd.wd(<8 x i16> %a0, <8 x i16> %a1)
1525  %3 = bitcast i8 %mask to <8 x i1>
  ; keep only the low four mask bits, one per 32-bit result lane
1526  %4 = shufflevector <8 x i1> %3, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  ; lanes whose mask bit is clear become zero
1527  %5 = select <4 x i1> %4, <4 x i32> %2, <4 x i32> zeroinitializer
1528  ret <4 x i32> %5
1529}
1530
; Stack-fold test for unmasked vpmaddwd (256-bit). The expected output spills
; ymm1 (where the second argument %a1 arrives) before the nop and folds its
; reload into vpmaddwd as the memory operand.
1531define <8 x i32> @stack_fold_pmaddwd_ymm(<16 x i16> %a0, <16 x i16> %a1) {
1532; CHECK-LABEL: stack_fold_pmaddwd_ymm:
1533; CHECK:       # %bb.0:
1534; CHECK-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
1535; CHECK-NEXT:    #APP
1536; CHECK-NEXT:    nop
1537; CHECK-NEXT:    #NO_APP
1538; CHECK-NEXT:    vpmaddwd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
1539; CHECK-NEXT:    retq
  ; side-effecting nop that clobbers xmm2-xmm31 and flags to force the spill
1540  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
1541  %2 = call <8 x i32> @llvm.x86.avx2.pmadd.wd(<16 x i16> %a0, <16 x i16> %a1)
1542  ret <8 x i32> %2
1543}
; 256-bit pmaddwd intrinsic used by the stack_fold_pmaddwd_ymm* tests.
1544declare <8 x i32> @llvm.x86.avx2.pmadd.wd(<16 x i16>, <16 x i16>) nounwind readnone
1545
; Stack-fold test for merge-masked vpmaddwd (256-bit). The expected output
; spills ymm1 before the nop, moves the mask into %k1, loads the passthru
; vector from %passthru into ymm2, and folds the reload into a vpmaddwd that
; writes ymm2 under %k1; the merged result is then copied to ymm0.
1546define <8 x i32> @stack_fold_pmaddwd_ymm_mask(<8 x i32>* %passthru, <16 x i16> %a0, <16 x i16> %a1, i8 %mask) {
1547; CHECK-LABEL: stack_fold_pmaddwd_ymm_mask:
1548; CHECK:       # %bb.0:
1549; CHECK-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
1550; CHECK-NEXT:    #APP
1551; CHECK-NEXT:    nop
1552; CHECK-NEXT:    #NO_APP
1553; CHECK-NEXT:    kmovd %esi, %k1
1554; CHECK-NEXT:    vmovdqa (%rdi), %ymm2
1555; CHECK-NEXT:    vpmaddwd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm2 {%k1} # 32-byte Folded Reload
1556; CHECK-NEXT:    vmovdqa %ymm2, %ymm0
1557; CHECK-NEXT:    retq
  ; side-effecting nop that clobbers xmm2-xmm31 and flags to force the spill
1558  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
1559  %2 = call <8 x i32> @llvm.x86.avx2.pmadd.wd(<16 x i16> %a0, <16 x i16> %a1)
  ; view the i8 mask as one i1 per 32-bit result lane
1560  %3 = bitcast i8 %mask to <8 x i1>
1561  ; load needed to keep the operation from being scheduled above the asm block
1562  %4 = load <8 x i32>, <8 x i32>* %passthru
  ; lanes whose mask bit is clear take the passthru value
1563  %5 = select <8 x i1> %3, <8 x i32> %2, <8 x i32> %4
1564  ret <8 x i32> %5
1565}
1566
; Zero-masked 256-bit pmadd.wd. Lanes whose %mask bit is clear are taken from
; zeroinitializer; the expected assembly folds the spilled operand into
; vpmaddwd with {%k1} {z}.
1567define <8 x i32> @stack_fold_pmaddwd_ymm_maskz(<16 x i16> %a0, <16 x i16> %a1, i8 %mask) {
1568; CHECK-LABEL: stack_fold_pmaddwd_ymm_maskz:
1569; CHECK:       # %bb.0:
1570; CHECK-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
1571; CHECK-NEXT:    #APP
1572; CHECK-NEXT:    nop
1573; CHECK-NEXT:    #NO_APP
1574; CHECK-NEXT:    kmovd %edi, %k1
1575; CHECK-NEXT:    vpmaddwd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 {%k1} {z} # 32-byte Folded Reload
1576; CHECK-NEXT:    retq
  ; The nop's clobber list (xmm2-xmm31 and flags) creates the register pressure behind the spill above.
1577  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
1578  %2 = call <8 x i32> @llvm.x86.avx2.pmadd.wd(<16 x i16> %a0, <16 x i16> %a1)
  ; One mask bit per i32 result lane.
1579  %3 = bitcast i8 %mask to <8 x i1>
1580  %4 = select <8 x i1> %3, <8 x i32> %2, <8 x i32> zeroinitializer
1581  ret <8 x i32> %4
1582}
1583
; Signed max on <16 x i8>, written as icmp sgt + select. The expected assembly
; spills %xmm1 and uses the stack slot directly as the memory operand of vpmaxsb
; ("Folded Reload").
1584define <16 x i8> @stack_fold_pmaxsb(<16 x i8> %a0, <16 x i8> %a1) {
1585; CHECK-LABEL: stack_fold_pmaxsb:
1586; CHECK:       # %bb.0:
1587; CHECK-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1588; CHECK-NEXT:    #APP
1589; CHECK-NEXT:    nop
1590; CHECK-NEXT:    #NO_APP
1591; CHECK-NEXT:    vpmaxsb {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
1592; CHECK-NEXT:    retq
  ; The nop's clobber list (xmm2-xmm31 and flags) creates the register pressure behind the spill above.
1593  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
1594  %2 = icmp sgt <16 x i8> %a0, %a1
1595  %3 = select <16 x i1> %2, <16 x i8> %a0, <16 x i8> %a1
1596  ret <16 x i8> %3
1597}
1598
; Signed max on <32 x i8> (256-bit), written as icmp sgt + select. The expected
; assembly spills %ymm1 and folds the stack slot into vpmaxsb as its memory
; operand.
1599define <32 x i8> @stack_fold_pmaxsb_ymm(<32 x i8> %a0, <32 x i8> %a1) {
1600; CHECK-LABEL: stack_fold_pmaxsb_ymm:
1601; CHECK:       # %bb.0:
1602; CHECK-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
1603; CHECK-NEXT:    #APP
1604; CHECK-NEXT:    nop
1605; CHECK-NEXT:    #NO_APP
1606; CHECK-NEXT:    vpmaxsb {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
1607; CHECK-NEXT:    retq
  ; The nop's clobber list (xmm2-xmm31 and flags) creates the register pressure behind the spill above.
1608  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
1609  %2 = icmp sgt <32 x i8> %a0, %a1
1610  %3 = select <32 x i1> %2, <32 x i8> %a0, <32 x i8> %a1
1611  ret <32 x i8> %3
1612}
1613
; Signed max on <4 x i32>, written as icmp sgt + select. The expected assembly
; spills %xmm1 and folds the stack slot into vpmaxsd as its memory operand.
1614define <4 x i32> @stack_fold_pmaxsd(<4 x i32> %a0, <4 x i32> %a1) {
1615; CHECK-LABEL: stack_fold_pmaxsd:
1616; CHECK:       # %bb.0:
1617; CHECK-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1618; CHECK-NEXT:    #APP
1619; CHECK-NEXT:    nop
1620; CHECK-NEXT:    #NO_APP
1621; CHECK-NEXT:    vpmaxsd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
1622; CHECK-NEXT:    retq
  ; The nop's clobber list (xmm2-xmm31 and flags) creates the register pressure behind the spill above.
1623  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
1624  %2 = icmp sgt <4 x i32> %a0, %a1
1625  %3 = select <4 x i1> %2, <4 x i32> %a0, <4 x i32> %a1
1626  ret <4 x i32> %3
1627}
1628
; Signed max on <8 x i32> (256-bit), written as icmp sgt + select. The expected
; assembly spills %ymm1 and folds the stack slot into vpmaxsd as its memory
; operand.
1629define <8 x i32> @stack_fold_pmaxsd_ymm(<8 x i32> %a0, <8 x i32> %a1) {
1630; CHECK-LABEL: stack_fold_pmaxsd_ymm:
1631; CHECK:       # %bb.0:
1632; CHECK-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
1633; CHECK-NEXT:    #APP
1634; CHECK-NEXT:    nop
1635; CHECK-NEXT:    #NO_APP
1636; CHECK-NEXT:    vpmaxsd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
1637; CHECK-NEXT:    retq
  ; The nop's clobber list (xmm2-xmm31 and flags) creates the register pressure behind the spill above.
1638  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
1639  %2 = icmp sgt <8 x i32> %a0, %a1
1640  %3 = select <8 x i1> %2, <8 x i32> %a0, <8 x i32> %a1
1641  ret <8 x i32> %3
1642}
1643
; Signed max on <2 x i64>, written as icmp sgt + select. The expected assembly
; spills %xmm1 and folds the stack slot into vpmaxsq as its memory operand.
1644define <2 x i64> @stack_fold_pmaxsq(<2 x i64> %a0, <2 x i64> %a1) {
1645; CHECK-LABEL: stack_fold_pmaxsq:
1646; CHECK:       # %bb.0:
1647; CHECK-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1648; CHECK-NEXT:    #APP
1649; CHECK-NEXT:    nop
1650; CHECK-NEXT:    #NO_APP
1651; CHECK-NEXT:    vpmaxsq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
1652; CHECK-NEXT:    retq
  ; The nop's clobber list (xmm2-xmm31 and flags) creates the register pressure behind the spill above.
1653  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
1654  %2 = icmp sgt <2 x i64> %a0, %a1
1655  %3 = select <2 x i1> %2, <2 x i64> %a0, <2 x i64> %a1
1656  ret <2 x i64> %3
1657}
1658
; Signed max on <4 x i64> (256-bit), written as icmp sgt + select. The expected
; assembly spills %ymm1 and folds the stack slot into vpmaxsq as its memory
; operand.
1659define <4 x i64> @stack_fold_pmaxsq_ymm(<4 x i64> %a0, <4 x i64> %a1) {
1660; CHECK-LABEL: stack_fold_pmaxsq_ymm:
1661; CHECK:       # %bb.0:
1662; CHECK-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
1663; CHECK-NEXT:    #APP
1664; CHECK-NEXT:    nop
1665; CHECK-NEXT:    #NO_APP
1666; CHECK-NEXT:    vpmaxsq {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
1667; CHECK-NEXT:    retq
  ; The nop's clobber list (xmm2-xmm31 and flags) creates the register pressure behind the spill above.
1668  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
1669  %2 = icmp sgt <4 x i64> %a0, %a1
1670  %3 = select <4 x i1> %2, <4 x i64> %a0, <4 x i64> %a1
1671  ret <4 x i64> %3
1672}
1673
; Signed max on <8 x i16>, written as icmp sgt + select. The expected assembly
; spills %xmm1 and folds the stack slot into vpmaxsw as its memory operand.
1674define <8 x i16> @stack_fold_pmaxsw(<8 x i16> %a0, <8 x i16> %a1) {
1675; CHECK-LABEL: stack_fold_pmaxsw:
1676; CHECK:       # %bb.0:
1677; CHECK-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1678; CHECK-NEXT:    #APP
1679; CHECK-NEXT:    nop
1680; CHECK-NEXT:    #NO_APP
1681; CHECK-NEXT:    vpmaxsw {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
1682; CHECK-NEXT:    retq
  ; The nop's clobber list (xmm2-xmm31 and flags) creates the register pressure behind the spill above.
1683  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
1684  %2 = icmp sgt <8 x i16> %a0, %a1
1685  %3 = select <8 x i1> %2, <8 x i16> %a0, <8 x i16> %a1
1686  ret <8 x i16> %3
1687}
1688
; Signed max on <16 x i16> (256-bit), written as icmp sgt + select. The expected
; assembly spills %ymm1 and folds the stack slot into vpmaxsw as its memory
; operand.
1689define <16 x i16> @stack_fold_pmaxsw_ymm(<16 x i16> %a0, <16 x i16> %a1) {
1690; CHECK-LABEL: stack_fold_pmaxsw_ymm:
1691; CHECK:       # %bb.0:
1692; CHECK-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
1693; CHECK-NEXT:    #APP
1694; CHECK-NEXT:    nop
1695; CHECK-NEXT:    #NO_APP
1696; CHECK-NEXT:    vpmaxsw {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
1697; CHECK-NEXT:    retq
  ; The nop's clobber list (xmm2-xmm31 and flags) creates the register pressure behind the spill above.
1698  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
1699  %2 = icmp sgt <16 x i16> %a0, %a1
1700  %3 = select <16 x i1> %2, <16 x i16> %a0, <16 x i16> %a1
1701  ret <16 x i16> %3
1702}
1703
; Unsigned max on <16 x i8>, written as icmp ugt + select. The expected assembly
; spills %xmm1 and folds the stack slot into vpmaxub as its memory operand.
1704define <16 x i8> @stack_fold_pmaxub(<16 x i8> %a0, <16 x i8> %a1) {
1705; CHECK-LABEL: stack_fold_pmaxub:
1706; CHECK:       # %bb.0:
1707; CHECK-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1708; CHECK-NEXT:    #APP
1709; CHECK-NEXT:    nop
1710; CHECK-NEXT:    #NO_APP
1711; CHECK-NEXT:    vpmaxub {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
1712; CHECK-NEXT:    retq
  ; The nop's clobber list (xmm2-xmm31 and flags) creates the register pressure behind the spill above.
1713  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
1714  %2 = icmp ugt <16 x i8> %a0, %a1
1715  %3 = select <16 x i1> %2, <16 x i8> %a0, <16 x i8> %a1
1716  ret <16 x i8> %3
1717}
1718
; Unsigned max on <32 x i8> (256-bit), written as icmp ugt + select. The expected
; assembly spills %ymm1 and folds the stack slot into vpmaxub as its memory
; operand.
1719define <32 x i8> @stack_fold_pmaxub_ymm(<32 x i8> %a0, <32 x i8> %a1) {
1720; CHECK-LABEL: stack_fold_pmaxub_ymm:
1721; CHECK:       # %bb.0:
1722; CHECK-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
1723; CHECK-NEXT:    #APP
1724; CHECK-NEXT:    nop
1725; CHECK-NEXT:    #NO_APP
1726; CHECK-NEXT:    vpmaxub {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
1727; CHECK-NEXT:    retq
  ; The nop's clobber list (xmm2-xmm31 and flags) creates the register pressure behind the spill above.
1728  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
1729  %2 = icmp ugt <32 x i8> %a0, %a1
1730  %3 = select <32 x i1> %2, <32 x i8> %a0, <32 x i8> %a1
1731  ret <32 x i8> %3
1732}
1733
; Unsigned max on <4 x i32>, written as icmp ugt + select. The expected assembly
; spills %xmm1 and folds the stack slot into vpmaxud as its memory operand.
1734define <4 x i32> @stack_fold_pmaxud(<4 x i32> %a0, <4 x i32> %a1) {
1735; CHECK-LABEL: stack_fold_pmaxud:
1736; CHECK:       # %bb.0:
1737; CHECK-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1738; CHECK-NEXT:    #APP
1739; CHECK-NEXT:    nop
1740; CHECK-NEXT:    #NO_APP
1741; CHECK-NEXT:    vpmaxud {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
1742; CHECK-NEXT:    retq
  ; The nop's clobber list (xmm2-xmm31 and flags) creates the register pressure behind the spill above.
1743  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
1744  %2 = icmp ugt <4 x i32> %a0, %a1
1745  %3 = select <4 x i1> %2, <4 x i32> %a0, <4 x i32> %a1
1746  ret <4 x i32> %3
1747}
1748
; Unsigned max on <8 x i32> (256-bit), written as icmp ugt + select. The expected
; assembly spills %ymm1 and folds the stack slot into vpmaxud as its memory
; operand.
1749define <8 x i32> @stack_fold_pmaxud_ymm(<8 x i32> %a0, <8 x i32> %a1) {
1750; CHECK-LABEL: stack_fold_pmaxud_ymm:
1751; CHECK:       # %bb.0:
1752; CHECK-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
1753; CHECK-NEXT:    #APP
1754; CHECK-NEXT:    nop
1755; CHECK-NEXT:    #NO_APP
1756; CHECK-NEXT:    vpmaxud {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
1757; CHECK-NEXT:    retq
  ; The nop's clobber list (xmm2-xmm31 and flags) creates the register pressure behind the spill above.
1758  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
1759  %2 = icmp ugt <8 x i32> %a0, %a1
1760  %3 = select <8 x i1> %2, <8 x i32> %a0, <8 x i32> %a1
1761  ret <8 x i32> %3
1762}
1763
; Unsigned max on <2 x i64>, written as icmp ugt + select. The expected assembly
; spills %xmm1 and folds the stack slot into vpmaxuq as its memory operand.
1764define <2 x i64> @stack_fold_pmaxuq(<2 x i64> %a0, <2 x i64> %a1) {
1765; CHECK-LABEL: stack_fold_pmaxuq:
1766; CHECK:       # %bb.0:
1767; CHECK-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1768; CHECK-NEXT:    #APP
1769; CHECK-NEXT:    nop
1770; CHECK-NEXT:    #NO_APP
1771; CHECK-NEXT:    vpmaxuq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
1772; CHECK-NEXT:    retq
  ; The nop's clobber list (xmm2-xmm31 and flags) creates the register pressure behind the spill above.
1773  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
1774  %2 = icmp ugt <2 x i64> %a0, %a1
1775  %3 = select <2 x i1> %2, <2 x i64> %a0, <2 x i64> %a1
1776  ret <2 x i64> %3
1777}
1778
; Merge-masked unsigned max on <2 x i64>. The low two bits of %mask select per
; lane between the max result and a vector loaded from %passthru. The expected
; assembly folds the spilled operand into a {%k1}-masked vpmaxuq that merges
; into the register holding the loaded passthru value.
1779define <2 x i64> @stack_fold_pmaxuq_mask(<2 x i64>* %passthru, <2 x i64> %a0, <2 x i64> %a1, i8 %mask) {
1780; CHECK-LABEL: stack_fold_pmaxuq_mask:
1781; CHECK:       # %bb.0:
1782; CHECK-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1783; CHECK-NEXT:    #APP
1784; CHECK-NEXT:    nop
1785; CHECK-NEXT:    #NO_APP
1786; CHECK-NEXT:    vmovdqa (%rdi), %xmm2
1787; CHECK-NEXT:    kmovd %esi, %k1
1788; CHECK-NEXT:    vpmaxuq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm2 {%k1} # 16-byte Folded Reload
1789; CHECK-NEXT:    vmovdqa %xmm2, %xmm0
1790; CHECK-NEXT:    retq
  ; The nop's clobber list (xmm2-xmm31 and flags) creates the register pressure behind the spill above.
1791  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
1792  %2 = load <2 x i64>, <2 x i64>* %passthru
1793  %3 = icmp ugt <2 x i64> %a0, %a1
1794  %4 = select <2 x i1> %3, <2 x i64> %a0, <2 x i64> %a1
1795  %5 = bitcast i8 %mask to <8 x i1>
  ; Keep only mask bits 0 and 1, one per i64 lane.
1796  %extract = shufflevector <8 x i1> %5, <8 x i1> %5, <2 x i32> <i32 0, i32 1>
1797  %6 = select <2 x i1> %extract, <2 x i64> %4, <2 x i64> %2
1798  ret <2 x i64> %6
1799}
1800
; Zero-masked unsigned max on <2 x i64>. Lanes whose mask bit (bits 0 and 1 of
; %mask) is clear are taken from zeroinitializer; the expected assembly folds
; the spilled operand into vpmaxuq with {%k1} {z}.
1801define <2 x i64> @stack_fold_pmaxuq_maskz(<2 x i64> %a0, <2 x i64> %a1, i8 %mask) {
1802; CHECK-LABEL: stack_fold_pmaxuq_maskz:
1803; CHECK:       # %bb.0:
1804; CHECK-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1805; CHECK-NEXT:    #APP
1806; CHECK-NEXT:    nop
1807; CHECK-NEXT:    #NO_APP
1808; CHECK-NEXT:    kmovd %edi, %k1
1809; CHECK-NEXT:    vpmaxuq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 {%k1} {z} # 16-byte Folded Reload
1810; CHECK-NEXT:    retq
  ; The nop's clobber list (xmm2-xmm31 and flags) creates the register pressure behind the spill above.
1811  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
1812  %2 = icmp ugt <2 x i64> %a0, %a1
1813  %3 = select <2 x i1> %2, <2 x i64> %a0, <2 x i64> %a1
1814  %4 = bitcast i8 %mask to <8 x i1>
  ; Keep only mask bits 0 and 1, one per i64 lane.
1815  %extract = shufflevector <8 x i1> %4, <8 x i1> %4, <2 x i32> <i32 0, i32 1>
1816  %5 = select <2 x i1> %extract, <2 x i64> %3, <2 x i64> zeroinitializer
1817  ret <2 x i64> %5
1818}
1819
; Unsigned max on <4 x i64> (256-bit), written as icmp ugt + select. The expected
; assembly spills %ymm1 and folds the stack slot into vpmaxuq as its memory
; operand.
1820define <4 x i64> @stack_fold_pmaxuq_ymm(<4 x i64> %a0, <4 x i64> %a1) {
1821; CHECK-LABEL: stack_fold_pmaxuq_ymm:
1822; CHECK:       # %bb.0:
1823; CHECK-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
1824; CHECK-NEXT:    #APP
1825; CHECK-NEXT:    nop
1826; CHECK-NEXT:    #NO_APP
1827; CHECK-NEXT:    vpmaxuq {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
1828; CHECK-NEXT:    retq
  ; The nop's clobber list (xmm2-xmm31 and flags) creates the register pressure behind the spill above.
1829  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
1830  %2 = icmp ugt <4 x i64> %a0, %a1
1831  %3 = select <4 x i1> %2, <4 x i64> %a0, <4 x i64> %a1
1832  ret <4 x i64> %3
1833}
1834
; Merge-masked unsigned max on <4 x i64> (256-bit). The low four bits of %mask
; select per lane between the max result and a vector loaded from %passthru.
; The expected assembly folds the spilled operand into a {%k1}-masked vpmaxuq
; that merges into the register holding the loaded passthru value.
1835define <4 x i64> @stack_fold_pmaxuq_ymm_mask(<4 x i64>* %passthru, <4 x i64> %a0, <4 x i64> %a1, i8 %mask) {
1836; CHECK-LABEL: stack_fold_pmaxuq_ymm_mask:
1837; CHECK:       # %bb.0:
1838; CHECK-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
1839; CHECK-NEXT:    #APP
1840; CHECK-NEXT:    nop
1841; CHECK-NEXT:    #NO_APP
1842; CHECK-NEXT:    vmovdqa (%rdi), %ymm2
1843; CHECK-NEXT:    kmovd %esi, %k1
1844; CHECK-NEXT:    vpmaxuq {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm2 {%k1} # 32-byte Folded Reload
1845; CHECK-NEXT:    vmovdqa %ymm2, %ymm0
1846; CHECK-NEXT:    retq
  ; The nop's clobber list (xmm2-xmm31 and flags) creates the register pressure behind the spill above.
1847  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
1848  %2 = load <4 x i64>, <4 x i64>* %passthru
1849  %3 = icmp ugt <4 x i64> %a0, %a1
1850  %4 = select <4 x i1> %3, <4 x i64> %a0, <4 x i64> %a1
1851  %5 = bitcast i8 %mask to <8 x i1>
  ; Keep only mask bits 0-3, one per i64 lane.
1852  %extract = shufflevector <8 x i1> %5, <8 x i1> %5, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
1853  %6 = select <4 x i1> %extract, <4 x i64> %4, <4 x i64> %2
1854  ret <4 x i64> %6
1855}
1856
; Zero-masked unsigned max on <4 x i64> (256-bit). Lanes whose mask bit (bits
; 0-3 of %mask) is clear are taken from zeroinitializer; the expected assembly
; folds the spilled operand into vpmaxuq with {%k1} {z}.
1857define <4 x i64> @stack_fold_pmaxuq_ymm_maskz(<4 x i64> %a0, <4 x i64> %a1, i8 %mask) {
1858; CHECK-LABEL: stack_fold_pmaxuq_ymm_maskz:
1859; CHECK:       # %bb.0:
1860; CHECK-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
1861; CHECK-NEXT:    #APP
1862; CHECK-NEXT:    nop
1863; CHECK-NEXT:    #NO_APP
1864; CHECK-NEXT:    kmovd %edi, %k1
1865; CHECK-NEXT:    vpmaxuq {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 {%k1} {z} # 32-byte Folded Reload
1866; CHECK-NEXT:    retq
  ; The nop's clobber list (xmm2-xmm31 and flags) creates the register pressure behind the spill above.
1867  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
1868  %2 = icmp ugt <4 x i64> %a0, %a1
1869  %3 = select <4 x i1> %2, <4 x i64> %a0, <4 x i64> %a1
1870  %4 = bitcast i8 %mask to <8 x i1>
  ; Keep only mask bits 0-3, one per i64 lane.
1871  %extract = shufflevector <8 x i1> %4, <8 x i1> %4, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
1872  %5 = select <4 x i1> %extract, <4 x i64> %3, <4 x i64> zeroinitializer
1873  ret <4 x i64> %5
1874}
1875
; Unsigned max on <8 x i16>, written as icmp ugt + select. The expected assembly
; spills %xmm1 and folds the stack slot into vpmaxuw as its memory operand.
1876define <8 x i16> @stack_fold_pmaxuw(<8 x i16> %a0, <8 x i16> %a1) {
1877; CHECK-LABEL: stack_fold_pmaxuw:
1878; CHECK:       # %bb.0:
1879; CHECK-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1880; CHECK-NEXT:    #APP
1881; CHECK-NEXT:    nop
1882; CHECK-NEXT:    #NO_APP
1883; CHECK-NEXT:    vpmaxuw {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
1884; CHECK-NEXT:    retq
  ; The nop's clobber list (xmm2-xmm31 and flags) creates the register pressure behind the spill above.
1885  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
1886  %2 = icmp ugt <8 x i16> %a0, %a1
1887  %3 = select <8 x i1> %2, <8 x i16> %a0, <8 x i16> %a1
1888  ret <8 x i16> %3
1889}
1890
; Unsigned max on <16 x i16> (256-bit), written as icmp ugt + select. The
; expected assembly spills %ymm1 and folds the stack slot into vpmaxuw as its
; memory operand.
1891define <16 x i16> @stack_fold_pmaxuw_ymm(<16 x i16> %a0, <16 x i16> %a1) {
1892; CHECK-LABEL: stack_fold_pmaxuw_ymm:
1893; CHECK:       # %bb.0:
1894; CHECK-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
1895; CHECK-NEXT:    #APP
1896; CHECK-NEXT:    nop
1897; CHECK-NEXT:    #NO_APP
1898; CHECK-NEXT:    vpmaxuw {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
1899; CHECK-NEXT:    retq
  ; The nop's clobber list (xmm2-xmm31 and flags) creates the register pressure behind the spill above.
1900  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
1901  %2 = icmp ugt <16 x i16> %a0, %a1
1902  %3 = select <16 x i1> %2, <16 x i16> %a0, <16 x i16> %a1
1903  ret <16 x i16> %3
1904}
; NOTE(review): no call to this intrinsic appears in the neighbouring tests (the
; pmaxuw tests use icmp ugt + select) - confirm whether another part of the file
; still needs this declaration.
1905declare <16 x i16> @llvm.x86.avx2.pmaxu.w(<16 x i16>, <16 x i16>) nounwind readnone
1906
; Signed min on <16 x i8>, written as icmp slt + select. The expected assembly
; spills %xmm1 and folds the stack slot into vpminsb as its memory operand.
1907define <16 x i8> @stack_fold_pminsb(<16 x i8> %a0, <16 x i8> %a1) {
1908; CHECK-LABEL: stack_fold_pminsb:
1909; CHECK:       # %bb.0:
1910; CHECK-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1911; CHECK-NEXT:    #APP
1912; CHECK-NEXT:    nop
1913; CHECK-NEXT:    #NO_APP
1914; CHECK-NEXT:    vpminsb {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
1915; CHECK-NEXT:    retq
  ; The nop's clobber list (xmm2-xmm31 and flags) creates the register pressure behind the spill above.
1916  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
1917  %2 = icmp slt <16 x i8> %a0, %a1
1918  %3 = select <16 x i1> %2, <16 x i8> %a0, <16 x i8> %a1
1919  ret <16 x i8> %3
1920}
1921
; Signed min on <32 x i8> (256-bit), written as icmp slt + select. The expected
; assembly spills %ymm1 and folds the stack slot into vpminsb as its memory
; operand.
1922define <32 x i8> @stack_fold_pminsb_ymm(<32 x i8> %a0, <32 x i8> %a1) {
1923; CHECK-LABEL: stack_fold_pminsb_ymm:
1924; CHECK:       # %bb.0:
1925; CHECK-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
1926; CHECK-NEXT:    #APP
1927; CHECK-NEXT:    nop
1928; CHECK-NEXT:    #NO_APP
1929; CHECK-NEXT:    vpminsb {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
1930; CHECK-NEXT:    retq
  ; The nop's clobber list (xmm2-xmm31 and flags) creates the register pressure behind the spill above.
1931  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
1932  %2 = icmp slt <32 x i8> %a0, %a1
1933  %3 = select <32 x i1> %2, <32 x i8> %a0, <32 x i8> %a1
1934  ret <32 x i8> %3
1935}
1936
; Signed min on <4 x i32>, written as icmp slt + select. The expected assembly
; spills %xmm1 and folds the stack slot into vpminsd as its memory operand.
1937define <4 x i32> @stack_fold_pminsd(<4 x i32> %a0, <4 x i32> %a1) {
1938; CHECK-LABEL: stack_fold_pminsd:
1939; CHECK:       # %bb.0:
1940; CHECK-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1941; CHECK-NEXT:    #APP
1942; CHECK-NEXT:    nop
1943; CHECK-NEXT:    #NO_APP
1944; CHECK-NEXT:    vpminsd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
1945; CHECK-NEXT:    retq
  ; The nop's clobber list (xmm2-xmm31 and flags) creates the register pressure behind the spill above.
1946  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
1947  %2 = icmp slt <4 x i32> %a0, %a1
1948  %3 = select <4 x i1> %2, <4 x i32> %a0, <4 x i32> %a1
1949  ret <4 x i32> %3
1950}
1951
; Signed min on <8 x i32> (256-bit), written as icmp slt + select. The expected
; assembly spills %ymm1 and folds the stack slot into vpminsd as its memory
; operand.
1952define <8 x i32> @stack_fold_pminsd_ymm(<8 x i32> %a0, <8 x i32> %a1) {
1953; CHECK-LABEL: stack_fold_pminsd_ymm:
1954; CHECK:       # %bb.0:
1955; CHECK-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
1956; CHECK-NEXT:    #APP
1957; CHECK-NEXT:    nop
1958; CHECK-NEXT:    #NO_APP
1959; CHECK-NEXT:    vpminsd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
1960; CHECK-NEXT:    retq
  ; The nop's clobber list (xmm2-xmm31 and flags) creates the register pressure behind the spill above.
1961  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
1962  %2 = icmp slt <8 x i32> %a0, %a1
1963  %3 = select <8 x i1> %2, <8 x i32> %a0, <8 x i32> %a1
1964  ret <8 x i32> %3
1965}
1966
; Signed min on <2 x i64>, written as icmp slt + select. The expected assembly
; spills %xmm1 and folds the stack slot into vpminsq as its memory operand.
1967define <2 x i64> @stack_fold_pminsq(<2 x i64> %a0, <2 x i64> %a1) {
1968; CHECK-LABEL: stack_fold_pminsq:
1969; CHECK:       # %bb.0:
1970; CHECK-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1971; CHECK-NEXT:    #APP
1972; CHECK-NEXT:    nop
1973; CHECK-NEXT:    #NO_APP
1974; CHECK-NEXT:    vpminsq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
1975; CHECK-NEXT:    retq
  ; The nop's clobber list (xmm2-xmm31 and flags) creates the register pressure behind the spill above.
1976  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
1977  %2 = icmp slt <2 x i64> %a0, %a1
1978  %3 = select <2 x i1> %2, <2 x i64> %a0, <2 x i64> %a1
1979  ret <2 x i64> %3
1980}
1981
; Signed min on <4 x i64> (256-bit), written as icmp slt + select. The expected
; assembly spills %ymm1 and folds the stack slot into vpminsq as its memory
; operand.
1982define <4 x i64> @stack_fold_pminsq_ymm(<4 x i64> %a0, <4 x i64> %a1) {
1983; CHECK-LABEL: stack_fold_pminsq_ymm:
1984; CHECK:       # %bb.0:
1985; CHECK-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
1986; CHECK-NEXT:    #APP
1987; CHECK-NEXT:    nop
1988; CHECK-NEXT:    #NO_APP
1989; CHECK-NEXT:    vpminsq {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
1990; CHECK-NEXT:    retq
  ; The nop's clobber list (xmm2-xmm31 and flags) creates the register pressure behind the spill above.
1991  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
1992  %2 = icmp slt <4 x i64> %a0, %a1
1993  %3 = select <4 x i1> %2, <4 x i64> %a0, <4 x i64> %a1
1994  ret <4 x i64> %3
1995}
1996
; Signed min on <8 x i16>, written as icmp slt + select. The expected assembly
; spills %xmm1 and folds the stack slot into vpminsw as its memory operand.
1997define <8 x i16> @stack_fold_pminsw(<8 x i16> %a0, <8 x i16> %a1) {
1998; CHECK-LABEL: stack_fold_pminsw:
1999; CHECK:       # %bb.0:
2000; CHECK-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2001; CHECK-NEXT:    #APP
2002; CHECK-NEXT:    nop
2003; CHECK-NEXT:    #NO_APP
2004; CHECK-NEXT:    vpminsw {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
2005; CHECK-NEXT:    retq
  ; The nop's clobber list (xmm2-xmm31 and flags) creates the register pressure behind the spill above.
2006  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
2007  %2 = icmp slt <8 x i16> %a0, %a1
2008  %3 = select <8 x i1> %2, <8 x i16> %a0, <8 x i16> %a1
2009  ret <8 x i16> %3
2010}
2011
; Signed min on <16 x i16> (256-bit), written as icmp slt + select. The expected
; assembly spills %ymm1 and folds the stack slot into vpminsw as its memory
; operand.
2012define <16 x i16> @stack_fold_pminsw_ymm(<16 x i16> %a0, <16 x i16> %a1) {
2013; CHECK-LABEL: stack_fold_pminsw_ymm:
2014; CHECK:       # %bb.0:
2015; CHECK-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2016; CHECK-NEXT:    #APP
2017; CHECK-NEXT:    nop
2018; CHECK-NEXT:    #NO_APP
2019; CHECK-NEXT:    vpminsw {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
2020; CHECK-NEXT:    retq
  ; The nop's clobber list (xmm2-xmm31 and flags) creates the register pressure behind the spill above.
2021  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
2022  %2 = icmp slt <16 x i16> %a0, %a1
2023  %3 = select <16 x i1> %2, <16 x i16> %a0, <16 x i16> %a1
2024  ret <16 x i16> %3
2025}
2026
; Unsigned min on <16 x i8>, written as icmp ult + select. The expected assembly
; spills %xmm1 and folds the stack slot into vpminub as its memory operand.
2027define <16 x i8> @stack_fold_pminub(<16 x i8> %a0, <16 x i8> %a1) {
2028; CHECK-LABEL: stack_fold_pminub:
2029; CHECK:       # %bb.0:
2030; CHECK-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2031; CHECK-NEXT:    #APP
2032; CHECK-NEXT:    nop
2033; CHECK-NEXT:    #NO_APP
2034; CHECK-NEXT:    vpminub {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
2035; CHECK-NEXT:    retq
  ; The nop's clobber list (xmm2-xmm31 and flags) creates the register pressure behind the spill above.
2036  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
2037  %2 = icmp ult <16 x i8> %a0, %a1
2038  %3 = select <16 x i1> %2, <16 x i8> %a0, <16 x i8> %a1
2039  ret <16 x i8> %3
2040}
2041
; Unsigned min on <32 x i8> (256-bit), written as icmp ult + select. The expected
; assembly spills %ymm1 and folds the stack slot into vpminub as its memory
; operand.
2042define <32 x i8> @stack_fold_pminub_ymm(<32 x i8> %a0, <32 x i8> %a1) {
2043; CHECK-LABEL: stack_fold_pminub_ymm:
2044; CHECK:       # %bb.0:
2045; CHECK-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2046; CHECK-NEXT:    #APP
2047; CHECK-NEXT:    nop
2048; CHECK-NEXT:    #NO_APP
2049; CHECK-NEXT:    vpminub {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
2050; CHECK-NEXT:    retq
  ; The nop's clobber list (xmm2-xmm31 and flags) creates the register pressure behind the spill above.
2051  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
2052  %2 = icmp ult <32 x i8> %a0, %a1
2053  %3 = select <32 x i1> %2, <32 x i8> %a0, <32 x i8> %a1
2054  ret <32 x i8> %3
2055}
2056
; Unsigned min on <4 x i32>, written as icmp ult + select. The expected assembly
; spills %xmm1 and folds the stack slot into vpminud as its memory operand.
2057define <4 x i32> @stack_fold_pminud(<4 x i32> %a0, <4 x i32> %a1) {
2058; CHECK-LABEL: stack_fold_pminud:
2059; CHECK:       # %bb.0:
2060; CHECK-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2061; CHECK-NEXT:    #APP
2062; CHECK-NEXT:    nop
2063; CHECK-NEXT:    #NO_APP
2064; CHECK-NEXT:    vpminud {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
2065; CHECK-NEXT:    retq
  ; The nop's clobber list (xmm2-xmm31 and flags) creates the register pressure behind the spill above.
2066  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
2067  %2 = icmp ult <4 x i32> %a0, %a1
2068  %3 = select <4 x i1> %2, <4 x i32> %a0, <4 x i32> %a1
2069  ret <4 x i32> %3
2070}
2071
; Unsigned min on <8 x i32> (256-bit), written as icmp ult + select. The expected
; assembly spills %ymm1 and folds the stack slot into vpminud as its memory
; operand.
2072define <8 x i32> @stack_fold_pminud_ymm(<8 x i32> %a0, <8 x i32> %a1) {
2073; CHECK-LABEL: stack_fold_pminud_ymm:
2074; CHECK:       # %bb.0:
2075; CHECK-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2076; CHECK-NEXT:    #APP
2077; CHECK-NEXT:    nop
2078; CHECK-NEXT:    #NO_APP
2079; CHECK-NEXT:    vpminud {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
2080; CHECK-NEXT:    retq
  ; The nop's clobber list (xmm2-xmm31 and flags) creates the register pressure behind the spill above.
2081  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
2082  %2 = icmp ult <8 x i32> %a0, %a1
2083  %3 = select <8 x i1> %2, <8 x i32> %a0, <8 x i32> %a1
2084  ret <8 x i32> %3
2085}
2086
; Unsigned min on <2 x i64>, written as icmp ult + select. The expected assembly
; spills %xmm1 and folds the stack slot into vpminuq as its memory operand.
2087define <2 x i64> @stack_fold_pminuq(<2 x i64> %a0, <2 x i64> %a1) {
2088; CHECK-LABEL: stack_fold_pminuq:
2089; CHECK:       # %bb.0:
2090; CHECK-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2091; CHECK-NEXT:    #APP
2092; CHECK-NEXT:    nop
2093; CHECK-NEXT:    #NO_APP
2094; CHECK-NEXT:    vpminuq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
2095; CHECK-NEXT:    retq
  ; The nop's clobber list (xmm2-xmm31 and flags) creates the register pressure behind the spill above.
2096  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
2097  %2 = icmp ult <2 x i64> %a0, %a1
2098  %3 = select <2 x i1> %2, <2 x i64> %a0, <2 x i64> %a1
2099  ret <2 x i64> %3
2100}
2101
; Unsigned minimum of two <4 x i64> vectors, written as icmp ult + select.
; The inline-asm nop clobbers xmm2-xmm31 and flags; the assertions expect ymm1
; to be spilled before the nop and then consumed straight from the stack slot
; as the folded memory operand of vpminuq.
2102define <4 x i64> @stack_fold_pminuq_ymm(<4 x i64> %a0, <4 x i64> %a1) {
2103; CHECK-LABEL: stack_fold_pminuq_ymm:
2104; CHECK:       # %bb.0:
2105; CHECK-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2106; CHECK-NEXT:    #APP
2107; CHECK-NEXT:    nop
2108; CHECK-NEXT:    #NO_APP
2109; CHECK-NEXT:    vpminuq {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
2110; CHECK-NEXT:    retq
2111  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
2112  %2 = icmp ult <4 x i64> %a0, %a1
2113  %3 = select <4 x i1> %2, <4 x i64> %a0, <4 x i64> %a1
2114  ret <4 x i64> %3
2115}
2116
; Unsigned minimum of two <8 x i16> vectors, written as icmp ult + select.
; The inline-asm nop clobbers xmm2-xmm31 and flags; the assertions expect xmm1
; to be spilled before the nop and then consumed straight from the stack slot
; as the folded memory operand of vpminuw.
2117define <8 x i16> @stack_fold_pminuw(<8 x i16> %a0, <8 x i16> %a1) {
2118; CHECK-LABEL: stack_fold_pminuw:
2119; CHECK:       # %bb.0:
2120; CHECK-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2121; CHECK-NEXT:    #APP
2122; CHECK-NEXT:    nop
2123; CHECK-NEXT:    #NO_APP
2124; CHECK-NEXT:    vpminuw {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
2125; CHECK-NEXT:    retq
2126  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
2127  %2 = icmp ult <8 x i16> %a0, %a1
2128  %3 = select <8 x i1> %2, <8 x i16> %a0, <8 x i16> %a1
2129  ret <8 x i16> %3
2130}
2131
; Unsigned minimum of two <16 x i16> vectors, written as icmp ult + select.
; The inline-asm nop clobbers xmm2-xmm31 and flags; the assertions expect ymm1
; to be spilled before the nop and then consumed straight from the stack slot
; as the folded memory operand of vpminuw.
2132define <16 x i16> @stack_fold_pminuw_ymm(<16 x i16> %a0, <16 x i16> %a1) {
2133; CHECK-LABEL: stack_fold_pminuw_ymm:
2134; CHECK:       # %bb.0:
2135; CHECK-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2136; CHECK-NEXT:    #APP
2137; CHECK-NEXT:    nop
2138; CHECK-NEXT:    #NO_APP
2139; CHECK-NEXT:    vpminuw {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
2140; CHECK-NEXT:    retq
2141  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
2142  %2 = icmp ult <16 x i16> %a0, %a1
2143  %3 = select <16 x i1> %2, <16 x i16> %a0, <16 x i16> %a1
2144  ret <16 x i16> %3
2145}
2146
; Narrows <8 x i32> to <8 x i16> through the mask.pmov.dw.256 intrinsic, called
; with an undef passthru and an all-ones (i8 -1) mask. Here the operation comes
; BEFORE the inline-asm nop (which clobbers xmm1-xmm31 and flags), so the
; assertions expect vpmovdw to write its result directly into the stack slot
; (folded spill), followed by an ordinary vmovaps reload after the nop.
2147define <8 x i16> @stack_fold_vpmovdw(<8 x i32> %a0) {
2148; CHECK-LABEL: stack_fold_vpmovdw:
2149; CHECK:       # %bb.0:
2150; CHECK-NEXT:    vpmovdw %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Folded Spill
2151; CHECK-NEXT:    #APP
2152; CHECK-NEXT:    nop
2153; CHECK-NEXT:    #NO_APP
2154; CHECK-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
2155; CHECK-NEXT:    vzeroupper
2156; CHECK-NEXT:    retq
2157  %1 = call <8 x i16> @llvm.x86.avx512.mask.pmov.dw.256(<8 x i32> %a0, <8 x i16> undef, i8 -1)
2158  %2 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
2159  ret <8 x i16> %1
2160}
2161declare <8 x i16> @llvm.x86.avx512.mask.pmov.dw.256(<8 x i32>, <8 x i16>, i8)
2162
; Narrows <4 x i64> to <4 x i32> with a plain IR trunc; the mask.pmov.qd.256
; intrinsic declared right after this function is not called by it. The trunc
; comes BEFORE the inline-asm nop (which clobbers xmm1-xmm31 and flags), so the
; assertions expect vpmovqd to write its result directly into the stack slot
; (folded spill), followed by an ordinary vmovaps reload after the nop.
2163define <4 x i32> @stack_fold_vpmovqd(<4 x i64> %a0) {
2164; CHECK-LABEL: stack_fold_vpmovqd:
2165; CHECK:       # %bb.0:
2166; CHECK-NEXT:    vpmovqd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Folded Spill
2167; CHECK-NEXT:    #APP
2168; CHECK-NEXT:    nop
2169; CHECK-NEXT:    #NO_APP
2170; CHECK-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
2171; CHECK-NEXT:    vzeroupper
2172; CHECK-NEXT:    retq
2173  %1 = trunc <4 x i64> %a0 to <4 x i32>
2174  %2 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
2175  ret <4 x i32> %1
2176}
2177declare <4 x i32> @llvm.x86.avx512.mask.pmov.qd.256(<4 x i64>, <4 x i32>, i8)
2178
; Narrows <16 x i16> to <16 x i8> with a plain IR trunc; the mask.pmov.wb.256
; intrinsic declared right after this function is not called by it. The trunc
; comes BEFORE the inline-asm nop (which clobbers xmm1-xmm31 and flags), so the
; assertions expect vpmovwb to write its result directly into the stack slot
; (folded spill), followed by an ordinary vmovaps reload after the nop.
2179define <16 x i8> @stack_fold_vpmovwb(<16 x i16> %a0) {
2180; CHECK-LABEL: stack_fold_vpmovwb:
2181; CHECK:       # %bb.0:
2182; CHECK-NEXT:    vpmovwb %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Folded Spill
2183; CHECK-NEXT:    #APP
2184; CHECK-NEXT:    nop
2185; CHECK-NEXT:    #NO_APP
2186; CHECK-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
2187; CHECK-NEXT:    vzeroupper
2188; CHECK-NEXT:    retq
2189  %1 = trunc <16 x i16> %a0 to <16 x i8>
2190  %2 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
2191  ret <16 x i8> %1
2192}
2193declare <16 x i8> @llvm.x86.avx512.mask.pmov.wb.256(<16 x i16>, <16 x i8>, i16)
2194
; Narrows <8 x i32> to <8 x i16> through the mask.pmovs.dw.256 intrinsic, called
; with an undef passthru and an all-ones (i8 -1) mask. The call comes BEFORE the
; inline-asm nop (which clobbers xmm1-xmm31 and flags), so the assertions expect
; vpmovsdw to write its result directly into the stack slot (folded spill),
; followed by an ordinary vmovaps reload after the nop.
2195define <8 x i16> @stack_fold_vpmovsdw(<8 x i32> %a0) {
2196; CHECK-LABEL: stack_fold_vpmovsdw:
2197; CHECK:       # %bb.0:
2198; CHECK-NEXT:    vpmovsdw %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Folded Spill
2199; CHECK-NEXT:    #APP
2200; CHECK-NEXT:    nop
2201; CHECK-NEXT:    #NO_APP
2202; CHECK-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
2203; CHECK-NEXT:    vzeroupper
2204; CHECK-NEXT:    retq
2205  %1 = call <8 x i16> @llvm.x86.avx512.mask.pmovs.dw.256(<8 x i32> %a0, <8 x i16> undef, i8 -1)
2206  %2 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
2207  ret <8 x i16> %1
2208}
2209declare <8 x i16> @llvm.x86.avx512.mask.pmovs.dw.256(<8 x i32>, <8 x i16>, i8)
2210
; Narrows <4 x i64> to <4 x i32> through the mask.pmovs.qd.256 intrinsic, called
; with an undef passthru and an all-ones (i8 -1) mask. The call comes BEFORE the
; inline-asm nop (which clobbers xmm1-xmm31 and flags), so the assertions expect
; vpmovsqd to write its result directly into the stack slot (folded spill),
; followed by an ordinary vmovaps reload after the nop.
2211define <4 x i32> @stack_fold_vpmovsqd(<4 x i64> %a0) {
2212; CHECK-LABEL: stack_fold_vpmovsqd:
2213; CHECK:       # %bb.0:
2214; CHECK-NEXT:    vpmovsqd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Folded Spill
2215; CHECK-NEXT:    #APP
2216; CHECK-NEXT:    nop
2217; CHECK-NEXT:    #NO_APP
2218; CHECK-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
2219; CHECK-NEXT:    vzeroupper
2220; CHECK-NEXT:    retq
2221  %1 = call <4 x i32> @llvm.x86.avx512.mask.pmovs.qd.256(<4 x i64> %a0, <4 x i32> undef, i8 -1)
2222  %2 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
2223  ret <4 x i32> %1
2224}
2225declare <4 x i32> @llvm.x86.avx512.mask.pmovs.qd.256(<4 x i64>, <4 x i32>, i8)
2226
; Narrows <16 x i16> to <16 x i8> through the mask.pmovs.wb.256 intrinsic,
; called with an undef passthru and an all-ones (i16 -1) mask. The call comes
; BEFORE the inline-asm nop (which clobbers xmm1-xmm31 and flags), so the
; assertions expect vpmovswb to write its result directly into the stack slot
; (folded spill), followed by an ordinary vmovaps reload after the nop.
2227define <16 x i8> @stack_fold_vpmovswb(<16 x i16> %a0) {
2228; CHECK-LABEL: stack_fold_vpmovswb:
2229; CHECK:       # %bb.0:
2230; CHECK-NEXT:    vpmovswb %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Folded Spill
2231; CHECK-NEXT:    #APP
2232; CHECK-NEXT:    nop
2233; CHECK-NEXT:    #NO_APP
2234; CHECK-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
2235; CHECK-NEXT:    vzeroupper
2236; CHECK-NEXT:    retq
2237  %1 = call <16 x i8> @llvm.x86.avx512.mask.pmovs.wb.256(<16 x i16> %a0, <16 x i8> undef, i16 -1)
2238  %2 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
2239  ret <16 x i8> %1
2240}
2241declare <16 x i8> @llvm.x86.avx512.mask.pmovs.wb.256(<16 x i16>, <16 x i8>, i16)
2242
; Sign-extends elements 0-3 of %a0 (picked out by the shufflevector) from i8 to
; i32. The inline-asm nop clobbers xmm1-xmm31 and flags; the assertions expect
; xmm0 to be spilled before the nop and vpmovsxbd to read its source straight
; from the stack slot (folded reload).
2243define <4 x i32> @stack_fold_pmovsxbd(<16 x i8> %a0) {
2244; CHECK-LABEL: stack_fold_pmovsxbd:
2245; CHECK:       # %bb.0:
2246; CHECK-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2247; CHECK-NEXT:    #APP
2248; CHECK-NEXT:    nop
2249; CHECK-NEXT:    #NO_APP
2250; CHECK-NEXT:    vpmovsxbd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
2251; CHECK-NEXT:    retq
2252  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
2253  %2 = shufflevector <16 x i8> %a0, <16 x i8> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
2254  %3 = sext <4 x i8> %2 to <4 x i32>
2255  ret <4 x i32> %3
2256}
2257
; Sign-extends elements 0-7 of %a0 (picked out by the shufflevector) from i8 to
; i32, producing a 256-bit result. The inline-asm nop clobbers xmm1-xmm31 and
; flags; the assertions expect xmm0 to be spilled before the nop and vpmovsxbd
; to read its source straight from the stack slot (folded reload).
2258define <8 x i32> @stack_fold_pmovsxbd_ymm(<16 x i8> %a0) {
2259; CHECK-LABEL: stack_fold_pmovsxbd_ymm:
2260; CHECK:       # %bb.0:
2261; CHECK-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2262; CHECK-NEXT:    #APP
2263; CHECK-NEXT:    nop
2264; CHECK-NEXT:    #NO_APP
2265; CHECK-NEXT:    vpmovsxbd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 16-byte Folded Reload
2266; CHECK-NEXT:    retq
2267  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
2268  %2 = shufflevector <16 x i8> %a0, <16 x i8> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
2269  %3 = sext <8 x i8> %2 to <8 x i32>
2270  ret <8 x i32> %3
2271}
2272
; Sign-extends elements 0-1 of %a0 (picked out by the shufflevector) from i8 to
; i64. The inline-asm nop clobbers xmm1-xmm31 and flags; the assertions expect
; xmm0 to be spilled before the nop and vpmovsxbq to read its source straight
; from the stack slot (folded reload).
2273define <2 x i64> @stack_fold_pmovsxbq(<16 x i8> %a0) {
2274; CHECK-LABEL: stack_fold_pmovsxbq:
2275; CHECK:       # %bb.0:
2276; CHECK-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2277; CHECK-NEXT:    #APP
2278; CHECK-NEXT:    nop
2279; CHECK-NEXT:    #NO_APP
2280; CHECK-NEXT:    vpmovsxbq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
2281; CHECK-NEXT:    retq
2282  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
2283  %2 = shufflevector <16 x i8> %a0, <16 x i8> undef, <2 x i32> <i32 0, i32 1>
2284  %3 = sext <2 x i8> %2 to <2 x i64>
2285  ret <2 x i64> %3
2286}
2287
; Sign-extends elements 0-3 of %a0 (picked out by the shufflevector) from i8 to
; i64, producing a 256-bit result. The inline-asm nop clobbers xmm1-xmm31 and
; flags; the assertions expect xmm0 to be spilled before the nop and vpmovsxbq
; to read its source straight from the stack slot (folded reload).
2288define <4 x i64> @stack_fold_pmovsxbq_ymm(<16 x i8> %a0) {
2289; CHECK-LABEL: stack_fold_pmovsxbq_ymm:
2290; CHECK:       # %bb.0:
2291; CHECK-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2292; CHECK-NEXT:    #APP
2293; CHECK-NEXT:    nop
2294; CHECK-NEXT:    #NO_APP
2295; CHECK-NEXT:    vpmovsxbq {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 16-byte Folded Reload
2296; CHECK-NEXT:    retq
2297  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
2298  %2 = shufflevector <16 x i8> %a0, <16 x i8> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
2299  %3 = sext <4 x i8> %2 to <4 x i64>
2300  ret <4 x i64> %3
2301}
2302
; Sign-extends elements 0-7 of %a0 (picked out by the shufflevector) from i8 to
; i16. The inline-asm nop clobbers xmm1-xmm31 and flags; the assertions expect
; xmm0 to be spilled before the nop and vpmovsxbw to read its source straight
; from the stack slot (folded reload).
2303define <8 x i16> @stack_fold_pmovsxbw(<16 x i8> %a0) {
2304; CHECK-LABEL: stack_fold_pmovsxbw:
2305; CHECK:       # %bb.0:
2306; CHECK-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2307; CHECK-NEXT:    #APP
2308; CHECK-NEXT:    nop
2309; CHECK-NEXT:    #NO_APP
2310; CHECK-NEXT:    vpmovsxbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
2311; CHECK-NEXT:    retq
2312  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
2313  %2 = shufflevector <16 x i8> %a0, <16 x i8> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
2314  %3 = sext <8 x i8> %2 to <8 x i16>
2315  ret <8 x i16> %3
2316}
2317
; Sign-extends all sixteen i8 elements of %a0 to i16 with a single sext,
; producing a 256-bit result. The inline-asm nop clobbers xmm1-xmm31 and flags;
; the assertions expect xmm0 to be spilled before the nop and vpmovsxbw to read
; its source straight from the stack slot (folded reload).
2318define <16 x i16> @stack_fold_pmovsxbw_ymm(<16 x i8> %a0) {
2319; CHECK-LABEL: stack_fold_pmovsxbw_ymm:
2320; CHECK:       # %bb.0:
2321; CHECK-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2322; CHECK-NEXT:    #APP
2323; CHECK-NEXT:    nop
2324; CHECK-NEXT:    #NO_APP
2325; CHECK-NEXT:    vpmovsxbw {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 16-byte Folded Reload
2326; CHECK-NEXT:    retq
2327  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
2328  %2 = sext <16 x i8> %a0 to <16 x i16>
2329  ret <16 x i16> %2
2330}
2331
; Sign-extends elements 0-1 of %a0 (picked out by the shufflevector) from i32 to
; i64. The inline-asm nop clobbers xmm1-xmm31 and flags; the assertions expect
; xmm0 to be spilled before the nop and vpmovsxdq to read its source straight
; from the stack slot (folded reload).
2332define <2 x i64> @stack_fold_pmovsxdq(<4 x i32> %a0) {
2333; CHECK-LABEL: stack_fold_pmovsxdq:
2334; CHECK:       # %bb.0:
2335; CHECK-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2336; CHECK-NEXT:    #APP
2337; CHECK-NEXT:    nop
2338; CHECK-NEXT:    #NO_APP
2339; CHECK-NEXT:    vpmovsxdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
2340; CHECK-NEXT:    retq
2341  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
2342  %2 = shufflevector <4 x i32> %a0, <4 x i32> undef, <2 x i32> <i32 0, i32 1>
2343  %3 = sext <2 x i32> %2 to <2 x i64>
2344  ret <2 x i64> %3
2345}
2346
; Sign-extends all four i32 elements of %a0 to i64 with a single sext, producing
; a 256-bit result. The inline-asm nop clobbers xmm1-xmm31 and flags; the
; assertions expect xmm0 to be spilled before the nop and vpmovsxdq to read its
; source straight from the stack slot (folded reload).
2347define <4 x i64> @stack_fold_pmovsxdq_ymm(<4 x i32> %a0) {
2348; CHECK-LABEL: stack_fold_pmovsxdq_ymm:
2349; CHECK:       # %bb.0:
2350; CHECK-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2351; CHECK-NEXT:    #APP
2352; CHECK-NEXT:    nop
2353; CHECK-NEXT:    #NO_APP
2354; CHECK-NEXT:    vpmovsxdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 16-byte Folded Reload
2355; CHECK-NEXT:    retq
2356  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
2357  %2 = sext <4 x i32> %a0 to <4 x i64>
2358  ret <4 x i64> %2
2359}
2360
; Sign-extends elements 0-3 of %a0 (picked out by the shufflevector) from i16 to
; i32. The inline-asm nop clobbers xmm1-xmm31 and flags; the assertions expect
; xmm0 to be spilled before the nop and vpmovsxwd to read its source straight
; from the stack slot (folded reload).
2361define <4 x i32> @stack_fold_pmovsxwd(<8 x i16> %a0) {
2362; CHECK-LABEL: stack_fold_pmovsxwd:
2363; CHECK:       # %bb.0:
2364; CHECK-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2365; CHECK-NEXT:    #APP
2366; CHECK-NEXT:    nop
2367; CHECK-NEXT:    #NO_APP
2368; CHECK-NEXT:    vpmovsxwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
2369; CHECK-NEXT:    retq
2370  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
2371  %2 = shufflevector <8 x i16> %a0, <8 x i16> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
2372  %3 = sext <4 x i16> %2 to <4 x i32>
2373  ret <4 x i32> %3
2374}
2375
; Sign-extends all eight i16 elements of %a0 to i32 with a single sext,
; producing a 256-bit result. The inline-asm nop clobbers xmm1-xmm31 and flags;
; the assertions expect xmm0 to be spilled before the nop and vpmovsxwd to read
; its source straight from the stack slot (folded reload).
2376define <8 x i32> @stack_fold_pmovsxwd_ymm(<8 x i16> %a0) {
2377; CHECK-LABEL: stack_fold_pmovsxwd_ymm:
2378; CHECK:       # %bb.0:
2379; CHECK-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2380; CHECK-NEXT:    #APP
2381; CHECK-NEXT:    nop
2382; CHECK-NEXT:    #NO_APP
2383; CHECK-NEXT:    vpmovsxwd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 16-byte Folded Reload
2384; CHECK-NEXT:    retq
2385  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
2386  %2 = sext <8 x i16> %a0 to <8 x i32>
2387  ret <8 x i32> %2
2388}
2389
; Sign-extends elements 0-1 of %a0 (picked out by the shufflevector) from i16 to
; i64. The inline-asm nop clobbers xmm1-xmm31 and flags; the assertions expect
; xmm0 to be spilled before the nop and vpmovsxwq to read its source straight
; from the stack slot (folded reload).
2390define <2 x i64> @stack_fold_pmovsxwq(<8 x i16> %a0) {
2391; CHECK-LABEL: stack_fold_pmovsxwq:
2392; CHECK:       # %bb.0:
2393; CHECK-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2394; CHECK-NEXT:    #APP
2395; CHECK-NEXT:    nop
2396; CHECK-NEXT:    #NO_APP
2397; CHECK-NEXT:    vpmovsxwq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
2398; CHECK-NEXT:    retq
2399  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
2400  %2 = shufflevector <8 x i16> %a0, <8 x i16> undef, <2 x i32> <i32 0, i32 1>
2401  %3 = sext <2 x i16> %2 to <2 x i64>
2402  ret <2 x i64> %3
2403}
2404
; Sign-extends elements 0-3 of %a0 (picked out by the shufflevector) from i16 to
; i64, producing a 256-bit result. The inline-asm nop clobbers xmm1-xmm31 and
; flags; the assertions expect xmm0 to be spilled before the nop and vpmovsxwq
; to read its source straight from the stack slot (folded reload).
2405define <4 x i64> @stack_fold_pmovsxwq_ymm(<8 x i16> %a0) {
2406; CHECK-LABEL: stack_fold_pmovsxwq_ymm:
2407; CHECK:       # %bb.0:
2408; CHECK-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2409; CHECK-NEXT:    #APP
2410; CHECK-NEXT:    nop
2411; CHECK-NEXT:    #NO_APP
2412; CHECK-NEXT:    vpmovsxwq {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 16-byte Folded Reload
2413; CHECK-NEXT:    retq
2414  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
2415  %2 = shufflevector <8 x i16> %a0, <8 x i16> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
2416  %3 = sext <4 x i16> %2 to <4 x i64>
2417  ret <4 x i64> %3
2418}
2419
; Narrows <8 x i32> to <8 x i16> through the mask.pmovus.dw.256 intrinsic,
; called with an undef passthru and an all-ones (i8 -1) mask. The call comes
; BEFORE the inline-asm nop (which clobbers xmm1-xmm31 and flags), so the
; assertions expect vpmovusdw to write its result directly into the stack slot
; (folded spill), followed by an ordinary vmovaps reload after the nop.
2420define <8 x i16> @stack_fold_vpmovusdw(<8 x i32> %a0) {
2421; CHECK-LABEL: stack_fold_vpmovusdw:
2422; CHECK:       # %bb.0:
2423; CHECK-NEXT:    vpmovusdw %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Folded Spill
2424; CHECK-NEXT:    #APP
2425; CHECK-NEXT:    nop
2426; CHECK-NEXT:    #NO_APP
2427; CHECK-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
2428; CHECK-NEXT:    vzeroupper
2429; CHECK-NEXT:    retq
2430  %1 = call <8 x i16> @llvm.x86.avx512.mask.pmovus.dw.256(<8 x i32> %a0, <8 x i16> undef, i8 -1)
2431  %2 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
2432  ret <8 x i16> %1
2433}
2434declare <8 x i16> @llvm.x86.avx512.mask.pmovus.dw.256(<8 x i32>, <8 x i16>, i8)
2435
; Narrows <4 x i64> to <4 x i32> through the mask.pmovus.qd.256 intrinsic,
; called with an undef passthru and an all-ones (i8 -1) mask. The call comes
; BEFORE the inline-asm nop (which clobbers xmm1-xmm31 and flags), so the
; assertions expect vpmovusqd to write its result directly into the stack slot
; (folded spill), followed by an ordinary vmovaps reload after the nop.
2436define <4 x i32> @stack_fold_vpmovusqd(<4 x i64> %a0) {
2437; CHECK-LABEL: stack_fold_vpmovusqd:
2438; CHECK:       # %bb.0:
2439; CHECK-NEXT:    vpmovusqd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Folded Spill
2440; CHECK-NEXT:    #APP
2441; CHECK-NEXT:    nop
2442; CHECK-NEXT:    #NO_APP
2443; CHECK-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
2444; CHECK-NEXT:    vzeroupper
2445; CHECK-NEXT:    retq
2446  %1 = call <4 x i32> @llvm.x86.avx512.mask.pmovus.qd.256(<4 x i64> %a0, <4 x i32> undef, i8 -1)
2447  %2 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
2448  ret <4 x i32> %1
2449}
2450declare <4 x i32> @llvm.x86.avx512.mask.pmovus.qd.256(<4 x i64>, <4 x i32>, i8)
2451
; Narrows <16 x i16> to <16 x i8> through the mask.pmovus.wb.256 intrinsic,
; called with an undef passthru and an all-ones (i16 -1) mask. The call comes
; BEFORE the inline-asm nop (which clobbers xmm1-xmm31 and flags), so the
; assertions expect vpmovuswb to write its result directly into the stack slot
; (folded spill), followed by an ordinary vmovaps reload after the nop.
2452define <16 x i8> @stack_fold_vpmovuswb(<16 x i16> %a0) {
2453; CHECK-LABEL: stack_fold_vpmovuswb:
2454; CHECK:       # %bb.0:
2455; CHECK-NEXT:    vpmovuswb %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Folded Spill
2456; CHECK-NEXT:    #APP
2457; CHECK-NEXT:    nop
2458; CHECK-NEXT:    #NO_APP
2459; CHECK-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
2460; CHECK-NEXT:    vzeroupper
2461; CHECK-NEXT:    retq
2462  %1 = call <16 x i8> @llvm.x86.avx512.mask.pmovus.wb.256(<16 x i16> %a0, <16 x i8> undef, i16 -1)
2463  %2 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
2464  ret <16 x i8> %1
2465}
2466declare <16 x i8> @llvm.x86.avx512.mask.pmovus.wb.256(<16 x i16>, <16 x i8>, i16)
2467
; Zero-extends bytes 0-3 of %a0 to i32, expressed as a shuffle that follows each
; source byte with three bytes taken from a zeroinitializer vector, then a
; bitcast to <4 x i32>. The inline-asm nop clobbers xmm1-xmm31 and flags; the
; assertions expect xmm0 to be spilled before the nop and vpmovzxbd to read its
; source straight from the stack slot (folded reload).
2468define <4 x i32> @stack_fold_pmovzxbd(<16 x i8> %a0) {
2469; CHECK-LABEL: stack_fold_pmovzxbd:
2470; CHECK:       # %bb.0:
2471; CHECK-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2472; CHECK-NEXT:    #APP
2473; CHECK-NEXT:    nop
2474; CHECK-NEXT:    #NO_APP
2475; CHECK-NEXT:    vpmovzxbd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
2476; CHECK-NEXT:    # xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
2477; CHECK-NEXT:    retq
2478  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
2479  %2 = shufflevector <16 x i8> %a0, <16 x i8> zeroinitializer, <16 x i32> <i32 0, i32 16, i32 17, i32 18, i32 1, i32 19, i32 20, i32 21, i32 2, i32 22, i32 23, i32 24, i32 3, i32 25, i32 26, i32 27>
2480  %3 = bitcast <16 x i8> %2 to <4 x i32>
2481  ret <4 x i32> %3
2482}
2483
; Zero-extends elements 0-7 of %a0 (picked out by the shufflevector) from i8 to
; i32, producing a 256-bit result. The inline-asm nop clobbers xmm1-xmm31 and
; flags; the assertions expect xmm0 to be spilled before the nop and vpmovzxbd
; to read its source straight from the stack slot (folded reload).
2484define <8 x i32> @stack_fold_pmovzxbd_ymm(<16 x i8> %a0) {
2485; CHECK-LABEL: stack_fold_pmovzxbd_ymm:
2486; CHECK:       # %bb.0:
2487; CHECK-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2488; CHECK-NEXT:    #APP
2489; CHECK-NEXT:    nop
2490; CHECK-NEXT:    #NO_APP
2491; CHECK-NEXT:    vpmovzxbd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 16-byte Folded Reload
2492; CHECK-NEXT:    # ymm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero
2493; CHECK-NEXT:    retq
2494  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
2495  %2 = shufflevector <16 x i8> %a0, <16 x i8> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
2496  %3 = zext <8 x i8> %2 to <8 x i32>
2497  ret <8 x i32> %3
2498}
2499
; Zero-extends bytes 0-1 of %a0 to i64, expressed as a shuffle that follows each
; source byte with seven bytes taken from a zeroinitializer vector, then a
; bitcast to <2 x i64>. The inline-asm nop clobbers xmm1-xmm31 and flags; the
; assertions expect xmm0 to be spilled before the nop and vpmovzxbq to read its
; source straight from the stack slot (folded reload).
; NOTE(review): the shuffle mask lists index 22 twice; every index >= 16 selects
; a byte of the all-zero second operand, so the repeated index does not change
; the shuffled value.
2500define <2 x i64> @stack_fold_pmovzxbq(<16 x i8> %a0) {
2501; CHECK-LABEL: stack_fold_pmovzxbq:
2502; CHECK:       # %bb.0:
2503; CHECK-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2504; CHECK-NEXT:    #APP
2505; CHECK-NEXT:    nop
2506; CHECK-NEXT:    #NO_APP
2507; CHECK-NEXT:    vpmovzxbq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
2508; CHECK-NEXT:    # xmm0 = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero
2509; CHECK-NEXT:    retq
2510  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
2511  %2 = shufflevector <16 x i8> %a0, <16 x i8> zeroinitializer, <16 x i32> <i32 0, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 1, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28>
2512  %3 = bitcast <16 x i8> %2 to <2 x i64>
2513  ret <2 x i64> %3
2514}
2515
; Zero-extends elements 0-3 of %a0 (picked out by the shufflevector) from i8 to
; i64, producing a 256-bit result. The inline-asm nop clobbers xmm1-xmm31 and
; flags; the assertions expect xmm0 to be spilled before the nop and vpmovzxbq
; to read its source straight from the stack slot (folded reload).
2516define <4 x i64> @stack_fold_pmovzxbq_ymm(<16 x i8> %a0) {
2517; CHECK-LABEL: stack_fold_pmovzxbq_ymm:
2518; CHECK:       # %bb.0:
2519; CHECK-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2520; CHECK-NEXT:    #APP
2521; CHECK-NEXT:    nop
2522; CHECK-NEXT:    #NO_APP
2523; CHECK-NEXT:    vpmovzxbq {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 16-byte Folded Reload
2524; CHECK-NEXT:    # ymm0 = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero,mem[2],zero,zero,zero,zero,zero,zero,zero,mem[3],zero,zero,zero,zero,zero,zero,zero
2525; CHECK-NEXT:    retq
2526  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
2527  %2 = shufflevector <16 x i8> %a0, <16 x i8> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
2528  %3 = zext <4 x i8> %2 to <4 x i64>
2529  ret <4 x i64> %3
2530}
2531
; Zero-extends bytes 0-7 of %a0 to i16, expressed as a shuffle that interleaves
; each source byte with one byte taken from a zeroinitializer vector, then a
; bitcast to <8 x i16>. The inline-asm nop clobbers xmm1-xmm31 and flags; the
; assertions expect xmm0 to be spilled before the nop and vpmovzxbw to read its
; source straight from the stack slot (folded reload).
2532define <8 x i16> @stack_fold_pmovzxbw(<16 x i8> %a0) {
2533; CHECK-LABEL: stack_fold_pmovzxbw:
2534; CHECK:       # %bb.0:
2535; CHECK-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2536; CHECK-NEXT:    #APP
2537; CHECK-NEXT:    nop
2538; CHECK-NEXT:    #NO_APP
2539; CHECK-NEXT:    vpmovzxbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
2540; CHECK-NEXT:    # xmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
2541; CHECK-NEXT:    retq
2542  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
2543  %2 = shufflevector <16 x i8> %a0, <16 x i8> zeroinitializer, <16 x i32> <i32 0, i32 16, i32 1, i32 17, i32 2, i32 18, i32 3, i32 19, i32 4, i32 20, i32 5, i32 21, i32 6, i32 22, i32 7, i32 23>
2544  %3 = bitcast <16 x i8> %2 to <8 x i16>
2545  ret <8 x i16> %3
2546}
2547
; Zero-extends all sixteen i8 elements of %a0 to i16 with a single zext,
; producing a 256-bit result. The inline-asm nop clobbers xmm1-xmm31 and flags;
; the assertions expect xmm0 to be spilled before the nop and vpmovzxbw to read
; its source straight from the stack slot (folded reload).
2548define <16 x i16> @stack_fold_pmovzxbw_ymm(<16 x i8> %a0) {
2549; CHECK-LABEL: stack_fold_pmovzxbw_ymm:
2550; CHECK:       # %bb.0:
2551; CHECK-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2552; CHECK-NEXT:    #APP
2553; CHECK-NEXT:    nop
2554; CHECK-NEXT:    #NO_APP
2555; CHECK-NEXT:    vpmovzxbw {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 16-byte Folded Reload
2556; CHECK-NEXT:    # ymm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero
2557; CHECK-NEXT:    retq
2558  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
2559  %2 = zext <16 x i8> %a0 to <16 x i16>
2560  ret <16 x i16> %2
2561}
2562
; Zero-extends elements 0-1 of %a0 from i32 to i64, expressed as a shuffle that
; interleaves each source element with one element taken from a zeroinitializer
; vector, then a bitcast to <2 x i64>. The inline-asm nop clobbers xmm1-xmm31
; and flags; the assertions expect xmm0 to be spilled before the nop and
; vpmovzxdq to read its source straight from the stack slot (folded reload).
2563define <2 x i64> @stack_fold_pmovzxdq(<4 x i32> %a0) {
2564; CHECK-LABEL: stack_fold_pmovzxdq:
2565; CHECK:       # %bb.0:
2566; CHECK-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2567; CHECK-NEXT:    #APP
2568; CHECK-NEXT:    nop
2569; CHECK-NEXT:    #NO_APP
2570; CHECK-NEXT:    vpmovzxdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
2571; CHECK-NEXT:    # xmm0 = mem[0],zero,mem[1],zero
2572; CHECK-NEXT:    retq
2573  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
2574  %2 = shufflevector <4 x i32> %a0, <4 x i32> zeroinitializer, <4 x i32> <i32 0, i32 4, i32 1, i32 5>
2575  %3 = bitcast <4 x i32> %2 to <2 x i64>
2576  ret <2 x i64> %3
2577}
2578
; Zero-extends all four i32 elements of %a0 to i64 with a single zext, producing
; a 256-bit result. The inline-asm nop clobbers xmm1-xmm31 and flags; the
; assertions expect xmm0 to be spilled before the nop and vpmovzxdq to read its
; source straight from the stack slot (folded reload).
2579define <4 x i64> @stack_fold_pmovzxdq_ymm(<4 x i32> %a0) {
2580; CHECK-LABEL: stack_fold_pmovzxdq_ymm:
2581; CHECK:       # %bb.0:
2582; CHECK-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2583; CHECK-NEXT:    #APP
2584; CHECK-NEXT:    nop
2585; CHECK-NEXT:    #NO_APP
2586; CHECK-NEXT:    vpmovzxdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 16-byte Folded Reload
2587; CHECK-NEXT:    # ymm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
2588; CHECK-NEXT:    retq
2589  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
2590  %2 = zext <4 x i32> %a0 to <4 x i64>
2591  ret <4 x i64> %2
2592}
2593
; Zero-extends elements 0-3 of %a0 from i16 to i32, expressed as a shuffle that
; interleaves each source element with one element taken from a zeroinitializer
; vector, then a bitcast to <4 x i32>. The inline-asm nop clobbers xmm1-xmm31
; and flags; the assertions expect xmm0 to be spilled before the nop and
; vpmovzxwd to read its source straight from the stack slot (folded reload).
2594define <4 x i32> @stack_fold_pmovzxwd(<8 x i16> %a0) {
2595; CHECK-LABEL: stack_fold_pmovzxwd:
2596; CHECK:       # %bb.0:
2597; CHECK-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2598; CHECK-NEXT:    #APP
2599; CHECK-NEXT:    nop
2600; CHECK-NEXT:    #NO_APP
2601; CHECK-NEXT:    vpmovzxwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
2602; CHECK-NEXT:    # xmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
2603; CHECK-NEXT:    retq
2604  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
2605  %2 = shufflevector <8 x i16> %a0, <8 x i16> zeroinitializer, <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11>
2606  %3 = bitcast <8 x i16> %2 to <4 x i32>
2607  ret <4 x i32> %3
2608}
2609
; Stack-fold test for the 256-bit vpmovzxwd: zext of <8 x i16> to <8 x i32> is
; expected to take its spilled 16-byte source directly from the stack slot.
2610define <8 x i32> @stack_fold_pmovzxwd_ymm(<8 x i16> %a0) {
2611; CHECK-LABEL: stack_fold_pmovzxwd_ymm:
2612; CHECK:       # %bb.0:
2613; CHECK-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2614; CHECK-NEXT:    #APP
2615; CHECK-NEXT:    nop
2616; CHECK-NEXT:    #NO_APP
2617; CHECK-NEXT:    vpmovzxwd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 16-byte Folded Reload
2618; CHECK-NEXT:    # ymm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
2619; CHECK-NEXT:    retq
  ; Side-effecting nop that clobbers xmm1-xmm31 and flags; the expected output
  ; shows %a0 (xmm0) being spilled around it.
2620  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  ; Plain vector zext; expected to be selected as vpmovzxwd with a memory source.
2621  %2 = zext <8 x i16> %a0 to <8 x i32>
2622  ret <8 x i32> %2
2623}
2624
; Stack-fold test for the 128-bit vpmovzxwq: the low two i16 lanes are each
; followed by three zero lanes and reinterpreted as <2 x i64>; the spilled
; source is expected to be folded into the instruction as a memory operand.
2625define <2 x i64> @stack_fold_pmovzxwq(<8 x i16> %a0) {
2626; CHECK-LABEL: stack_fold_pmovzxwq:
2627; CHECK:       # %bb.0:
2628; CHECK-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2629; CHECK-NEXT:    #APP
2630; CHECK-NEXT:    nop
2631; CHECK-NEXT:    #NO_APP
2632; CHECK-NEXT:    vpmovzxwq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
2633; CHECK-NEXT:    # xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero
2634; CHECK-NEXT:    retq
  ; Side-effecting nop that clobbers xmm1-xmm31 and flags; the expected output
  ; shows %a0 (xmm0) being spilled around it.
2635  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  ; Indices 0 and 1 pick lanes of %a0; indices 8-13 all pick lanes of the zero
  ; vector, giving the pattern a0[0],0,0,0,a0[1],0,0,0.
2636  %2 = shufflevector <8 x i16> %a0, <8 x i16> zeroinitializer, <8 x i32> <i32 0, i32 8, i32 9, i32 10, i32 1, i32 11, i32 12, i32 13>
2637  %3 = bitcast <8 x i16> %2 to <2 x i64>
2638  ret <2 x i64> %3
2639}
2640
; Stack-fold test for the 256-bit vpmovzxwq: the low four i16 lanes are
; extracted and zero-extended to <4 x i64>; the spilled source is expected to
; be folded into the instruction as a memory operand.
2641define <4 x i64> @stack_fold_pmovzxwq_ymm(<8 x i16> %a0) {
2642; CHECK-LABEL: stack_fold_pmovzxwq_ymm:
2643; CHECK:       # %bb.0:
2644; CHECK-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2645; CHECK-NEXT:    #APP
2646; CHECK-NEXT:    nop
2647; CHECK-NEXT:    #NO_APP
2648; CHECK-NEXT:    vpmovzxwq {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 16-byte Folded Reload
2649; CHECK-NEXT:    # ymm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
2650; CHECK-NEXT:    retq
  ; Side-effecting nop that clobbers xmm1-xmm31 and flags; the expected output
  ; shows %a0 (xmm0) being spilled around it.
2651  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  ; Take the low four i16 lanes, then widen each to i64.
2652  %2 = shufflevector <8 x i16> %a0, <8 x i16> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
2653  %3 = zext <4 x i16> %2 to <4 x i64>
2654  ret <4 x i64> %3
2655}
2656
; Zero-masking variant of the 256-bit vpmovzxwq stack-fold test: lanes whose
; mask bit is clear are zeroed, and the spilled source is still expected to be
; folded into the {%k1} {z} form of the instruction.
2657define <4 x i64> @stack_fold_pmovzxwq_maskz_ymm(<8 x i16> %a0, i8 %mask) {
2658; CHECK-LABEL: stack_fold_pmovzxwq_maskz_ymm:
2659; CHECK:       # %bb.0:
2660; CHECK-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2661; CHECK-NEXT:    #APP
2662; CHECK-NEXT:    nop
2663; CHECK-NEXT:    #NO_APP
2664; CHECK-NEXT:    kmovd %edi, %k1
2665; CHECK-NEXT:    vpmovzxwq {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 {%k1} {z} # 16-byte Folded Reload
2666; CHECK-NEXT:    # ymm0 {%k1} {z} = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
2667; CHECK-NEXT:    retq
  ; Side-effecting nop that clobbers xmm1-xmm31 and flags; the expected output
  ; shows %a0 (xmm0) being spilled around it.
2668  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  ; Take the low four i16 lanes, then widen each to i64.
2669  %2 = shufflevector <8 x i16> %a0, <8 x i16> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
2670  %3 = zext <4 x i16> %2 to <4 x i64>
  ; Only the low four bits of the i8 mask are used as the per-lane predicate.
2671  %4 = bitcast i8 %mask to <8 x i1>
2672  %5 = shufflevector <8 x i1> %4, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  ; Masked-off lanes become zero.
2673  %6 = select <4 x i1> %5, <4 x i64> %3, <4 x i64> zeroinitializer
2674  ret <4 x i64> %6
2675}
2676
; Merge-masking variant of the 256-bit vpmovzxwq stack-fold test: lanes whose
; mask bit is clear keep the value of %passthru, and the spilled source is
; expected to be folded into the {%k1} form of the instruction.
2677define <4 x i64> @stack_fold_pmovzxwq_mask_ymm(<4 x i64> %passthru, <8 x i16> %a0, i8 %mask) {
2678; CHECK-LABEL: stack_fold_pmovzxwq_mask_ymm:
2679; CHECK:       # %bb.0:
2680; CHECK-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2681; CHECK-NEXT:    #APP
2682; CHECK-NEXT:    nop
2683; CHECK-NEXT:    #NO_APP
2684; CHECK-NEXT:    kmovd %edi, %k1
2685; CHECK-NEXT:    vpmovzxwq {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 {%k1} # 16-byte Folded Reload
2686; CHECK-NEXT:    # ymm0 {%k1} = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
2687; CHECK-NEXT:    retq
  ; Side-effecting nop that clobbers xmm2-xmm31 and flags (xmm1 is not in the
  ; list); the expected output shows only %a0 (xmm1) being spilled.
2688  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  ; Take the low four i16 lanes, then widen each to i64.
2689  %2 = shufflevector <8 x i16> %a0, <8 x i16> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
2690  %3 = zext <4 x i16> %2 to <4 x i64>
  ; Only the low four bits of the i8 mask are used as the per-lane predicate.
2691  %4 = bitcast i8 %mask to <8 x i1>
2692  %5 = shufflevector <8 x i1> %4, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  ; Masked-off lanes take the corresponding lane of %passthru.
2693  %6 = select <4 x i1> %5, <4 x i64> %3, <4 x i64> %passthru
2694  ret <4 x i64> %6
2695}
2696
; Stack-fold test for the 128-bit vpmuldq: a 64-bit multiply of the
; sign-extended low 32 bits of each lane is expected to be selected as vpmuldq
; with the spilled %a1 folded in as a memory operand.
2697define <2 x i64> @stack_fold_pmuldq(<4 x i32> %a0, <4 x i32> %a1) {
2698; CHECK-LABEL: stack_fold_pmuldq:
2699; CHECK:       # %bb.0:
2700; CHECK-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2701; CHECK-NEXT:    #APP
2702; CHECK-NEXT:    nop
2703; CHECK-NEXT:    #NO_APP
2704; CHECK-NEXT:    vpmuldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
2705; CHECK-NEXT:    retq
  ; Side-effecting nop that clobbers xmm2-xmm31 and flags; the expected output
  ; shows %a1 (xmm1) being spilled around it.
2706  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
2707  %2 = bitcast <4 x i32> %a0 to <2 x i64>
2708  %3 = bitcast <4 x i32> %a1 to <2 x i64>
  ; shl 32 followed by ashr 32 sign-extends the low 32 bits of each i64 lane.
2709  %4 = shl <2 x i64> %2, <i64 32, i64 32>
2710  %5 = ashr <2 x i64> %4, <i64 32, i64 32>
2711  %6 = shl <2 x i64> %3, <i64 32, i64 32>
2712  %7 = ashr <2 x i64> %6, <i64 32, i64 32>
2713  %8 = mul <2 x i64> %5, %7
2714  ret <2 x i64> %8
2715}
2716
; Stack-fold test for the 256-bit vpmuldq: a 64-bit multiply of the
; sign-extended low 32 bits of each lane is expected to be selected as vpmuldq
; with the spilled %a1 folded in as a memory operand.
2717define <4 x i64> @stack_fold_pmuldq_ymm(<8 x i32> %a0, <8 x i32> %a1) {
2718; CHECK-LABEL: stack_fold_pmuldq_ymm:
2719; CHECK:       # %bb.0:
2720; CHECK-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2721; CHECK-NEXT:    #APP
2722; CHECK-NEXT:    nop
2723; CHECK-NEXT:    #NO_APP
2724; CHECK-NEXT:    vpmuldq {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
2725; CHECK-NEXT:    retq
  ; Side-effecting nop that clobbers xmm2-xmm31 and flags; the expected output
  ; shows %a1 (ymm1) being spilled around it.
2726  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
2727  %2 = bitcast <8 x i32> %a0 to <4 x i64>
2728  %3 = bitcast <8 x i32> %a1 to <4 x i64>
  ; shl 32 followed by ashr 32 sign-extends the low 32 bits of each i64 lane.
2729  %4 = shl <4 x i64> %2, <i64 32, i64 32, i64 32, i64 32>
2730  %5 = ashr <4 x i64> %4, <i64 32, i64 32, i64 32, i64 32>
2731  %6 = shl <4 x i64> %3, <i64 32, i64 32, i64 32, i64 32>
2732  %7 = ashr <4 x i64> %6, <i64 32, i64 32, i64 32, i64 32>
2733  %8 = mul <4 x i64> %5, %7
2734  ret <4 x i64> %8
2735}
2736
; Stack-fold test for the 128-bit vpmuludq: a 64-bit multiply of the
; zero-extended low 32 bits of each lane is expected to be selected as vpmuludq
; with the spilled %a1 folded in as a memory operand.
2737define <2 x i64> @stack_fold_pmuludq(<4 x i32> %a0, <4 x i32> %a1) {
2738; CHECK-LABEL: stack_fold_pmuludq:
2739; CHECK:       # %bb.0:
2740; CHECK-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2741; CHECK-NEXT:    #APP
2742; CHECK-NEXT:    nop
2743; CHECK-NEXT:    #NO_APP
2744; CHECK-NEXT:    vpmuludq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
2745; CHECK-NEXT:    retq
  ; Side-effecting nop that clobbers xmm2-xmm31 and flags; the expected output
  ; shows %a1 (xmm1) being spilled around it.
2746  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
2747  %2 = bitcast <4 x i32> %a0 to <2 x i64>
2748  %3 = bitcast <4 x i32> %a1 to <2 x i64>
  ; 4294967295 is 0xFFFFFFFF: keep only the low 32 bits of each i64 lane.
2749  %4 = and <2 x i64> %2, <i64 4294967295, i64 4294967295>
2750  %5 = and <2 x i64> %3, <i64 4294967295, i64 4294967295>
2751  %6 = mul <2 x i64> %4, %5
2752  ret <2 x i64> %6
2753}
2754
; Stack-fold test for the 256-bit vpmuludq: a 64-bit multiply of the
; zero-extended low 32 bits of each lane is expected to be selected as vpmuludq
; with the spilled %a1 folded in as a memory operand.
2755define <4 x i64> @stack_fold_pmuludq_ymm(<8 x i32> %a0, <8 x i32> %a1) {
2756; CHECK-LABEL: stack_fold_pmuludq_ymm:
2757; CHECK:       # %bb.0:
2758; CHECK-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2759; CHECK-NEXT:    #APP
2760; CHECK-NEXT:    nop
2761; CHECK-NEXT:    #NO_APP
2762; CHECK-NEXT:    vpmuludq {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
2763; CHECK-NEXT:    retq
  ; Side-effecting nop that clobbers xmm2-xmm31 and flags; the expected output
  ; shows %a1 (ymm1) being spilled around it.
2764  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
2765  %2 = bitcast <8 x i32> %a0 to <4 x i64>
2766  %3 = bitcast <8 x i32> %a1 to <4 x i64>
  ; 4294967295 is 0xFFFFFFFF: keep only the low 32 bits of each i64 lane.
2767  %4 = and <4 x i64> %2, <i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295>
2768  %5 = and <4 x i64> %3, <i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295>
2769  %6 = mul <4 x i64> %4, %5
2770  ret <4 x i64> %6
2771}
2772
; Merge-masking variant of the 256-bit vpmuludq stack-fold test. The passthru
; value is loaded through a pointer; both vector arguments are expected to be
; spilled, with one reloaded into a register and the other folded into the
; masked vpmuludq as a memory operand.
2773define <4 x i64> @stack_fold_pmuludq_ymm_mask(<4 x i64>* %passthru, <8 x i32> %a0, <8 x i32> %a1, i8 %mask) {
2774; CHECK-LABEL: stack_fold_pmuludq_ymm_mask:
2775; CHECK:       # %bb.0:
2776; CHECK-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2777; CHECK-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2778; CHECK-NEXT:    #APP
2779; CHECK-NEXT:    nop
2780; CHECK-NEXT:    #NO_APP
2781; CHECK-NEXT:    kmovd %esi, %k1
2782; CHECK-NEXT:    vmovdqa (%rdi), %ymm1
2783; CHECK-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
2784; CHECK-NEXT:    vpmuludq {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 {%k1} # 32-byte Folded Reload
2785; CHECK-NEXT:    vmovdqa %ymm1, %ymm0
2786; CHECK-NEXT:    retq
  ; Side-effecting nop that clobbers xmm1-xmm31 and flags; the expected output
  ; shows both %a0 (ymm0) and %a1 (ymm1) being spilled around it.
2787  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
2788  %2 = bitcast <8 x i32> %a0 to <4 x i64>
2789  %3 = bitcast <8 x i32> %a1 to <4 x i64>
  ; 4294967295 is 0xFFFFFFFF: keep only the low 32 bits of each i64 lane.
2790  %4 = and <4 x i64> %2, <i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295>
2791  %5 = and <4 x i64> %3, <i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295>
2792  %6 = mul <4 x i64> %4, %5
  ; Only the low four bits of the i8 mask are used as the per-lane predicate.
2793  %7 = bitcast i8 %mask to <8 x i1>
2794  %8 = shufflevector <8 x i1> %7, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  ; Masked-off lanes take the corresponding lane of the loaded passthru vector.
2795  %9 = load <4 x i64>, <4 x i64>* %passthru
2796  %10 = select <4 x i1> %8, <4 x i64> %6, <4 x i64> %9
2797  ret <4 x i64> %10
2798}
2799
; Zero-masking variant of the 256-bit vpmuludq stack-fold test: both vector
; arguments are expected to be spilled, with one reloaded into a register and
; the other folded into the {%k1} {z} form of vpmuludq as a memory operand.
2800define <4 x i64> @stack_fold_pmuludq_ymm_maskz(<8 x i32> %a0, <8 x i32> %a1, i8 %mask) {
2801; CHECK-LABEL: stack_fold_pmuludq_ymm_maskz:
2802; CHECK:       # %bb.0:
2803; CHECK-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2804; CHECK-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2805; CHECK-NEXT:    #APP
2806; CHECK-NEXT:    nop
2807; CHECK-NEXT:    #NO_APP
2808; CHECK-NEXT:    kmovd %edi, %k1
2809; CHECK-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
2810; CHECK-NEXT:    vpmuludq {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 {%k1} {z} # 32-byte Folded Reload
2811; CHECK-NEXT:    retq
  ; Side-effecting nop that clobbers xmm1-xmm31 and flags; the expected output
  ; shows both %a0 (ymm0) and %a1 (ymm1) being spilled around it.
2812  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
2813  %2 = bitcast <8 x i32> %a0 to <4 x i64>
2814  %3 = bitcast <8 x i32> %a1 to <4 x i64>
  ; 4294967295 is 0xFFFFFFFF: keep only the low 32 bits of each i64 lane.
2815  %4 = and <4 x i64> %2, <i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295>
2816  %5 = and <4 x i64> %3, <i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295>
2817  %6 = mul <4 x i64> %4, %5
  ; Only the low four bits of the i8 mask are used as the per-lane predicate.
2818  %7 = bitcast i8 %mask to <8 x i1>
2819  %8 = shufflevector <8 x i1> %7, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  ; Masked-off lanes become zero.
2820  %9 = select <4 x i1> %8, <4 x i64> %6, <4 x i64> zeroinitializer
2821  ret <4 x i64> %9
2822}
2823
; Stack-fold test for the 128-bit vpopcntd: the per-lane population count of
; <4 x i32> is expected to read its spilled source straight from the stack
; slot (a "Folded Reload").
2824define <4 x i32> @stack_fold_vpopcntd(<4 x i32> %a0) {
2825; CHECK-LABEL: stack_fold_vpopcntd:
2826; CHECK:       # %bb.0:
2827; CHECK-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2828; CHECK-NEXT:    #APP
2829; CHECK-NEXT:    nop
2830; CHECK-NEXT:    #NO_APP
2831; CHECK-NEXT:    vpopcntd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
2832; CHECK-NEXT:    retq
  ; Side-effecting nop that clobbers xmm1-xmm31 and flags; the expected output
  ; shows %a0 (xmm0) being spilled around it.
2833  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
2834  %2 = call <4 x i32> @llvm.ctpop.v4i32(<4 x i32> %a0)
2835  ret <4 x i32> %2
2836}
; ctpop does not access memory, so it is declared readnone like the i64-lane
; ctpop declarations in this file.
2837declare <4 x i32> @llvm.ctpop.v4i32(<4 x i32>) nounwind readnone
2838
; Stack-fold test for the 256-bit vpopcntd: the per-lane population count of
; <8 x i32> is expected to read its spilled source straight from the stack
; slot (a "Folded Reload").
2839define <8 x i32> @stack_fold_vpopcntd_ymm(<8 x i32> %a0) {
2840; CHECK-LABEL: stack_fold_vpopcntd_ymm:
2841; CHECK:       # %bb.0:
2842; CHECK-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2843; CHECK-NEXT:    #APP
2844; CHECK-NEXT:    nop
2845; CHECK-NEXT:    #NO_APP
2846; CHECK-NEXT:    vpopcntd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Folded Reload
2847; CHECK-NEXT:    retq
  ; Side-effecting nop that clobbers xmm1-xmm31 and flags; the expected output
  ; shows %a0 (ymm0) being spilled around it.
2848  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
2849  %2 = call <8 x i32> @llvm.ctpop.v8i32(<8 x i32> %a0)
2850  ret <8 x i32> %2
2851}
; ctpop does not access memory, so it is declared readnone like the i64-lane
; ctpop declarations in this file.
2852declare <8 x i32> @llvm.ctpop.v8i32(<8 x i32>) nounwind readnone
2853
; Stack-fold test for the 128-bit vpopcntq: the per-lane population count of
; <2 x i64> is expected to read its spilled source straight from the stack
; slot (a "Folded Reload").
2854define <2 x i64> @stack_fold_vpopcntq(<2 x i64> %a0) {
2855; CHECK-LABEL: stack_fold_vpopcntq:
2856; CHECK:       # %bb.0:
2857; CHECK-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2858; CHECK-NEXT:    #APP
2859; CHECK-NEXT:    nop
2860; CHECK-NEXT:    #NO_APP
2861; CHECK-NEXT:    vpopcntq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
2862; CHECK-NEXT:    retq
  ; Side-effecting nop that clobbers xmm1-xmm31 and flags; the expected output
  ; shows %a0 (xmm0) being spilled around it.
2863  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
2864  %2 = call <2 x i64> @llvm.ctpop.v2i64(<2 x i64> %a0)
2865  ret <2 x i64> %2
2866}
; Generic population-count intrinsic on <2 x i64>, used by the test above.
2867declare <2 x i64> @llvm.ctpop.v2i64(<2 x i64>) nounwind readnone
2868
; Stack-fold test for the 256-bit vpopcntq: the per-lane population count of
; <4 x i64> is expected to read its spilled source straight from the stack
; slot (a "Folded Reload").
2869define <4 x i64> @stack_fold_vpopcntq_ymm(<4 x i64> %a0) {
2870; CHECK-LABEL: stack_fold_vpopcntq_ymm:
2871; CHECK:       # %bb.0:
2872; CHECK-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2873; CHECK-NEXT:    #APP
2874; CHECK-NEXT:    nop
2875; CHECK-NEXT:    #NO_APP
2876; CHECK-NEXT:    vpopcntq {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Folded Reload
2877; CHECK-NEXT:    retq
  ; Side-effecting nop that clobbers xmm1-xmm31 and flags; the expected output
  ; shows %a0 (ymm0) being spilled around it.
2878  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
2879  %2 = call <4 x i64> @llvm.ctpop.v4i64(<4 x i64> %a0)
2880  ret <4 x i64> %2
2881}
; Generic population-count intrinsic on <4 x i64>, used by the test above.
2882declare <4 x i64> @llvm.ctpop.v4i64(<4 x i64>) nounwind readnone
2883
; Stack-fold test for the 128-bit vpsadbw: the spilled second operand %a1 is
; expected to be folded into the instruction as a memory operand.
2884define <2 x i64> @stack_fold_psadbw(<16 x i8> %a0, <16 x i8> %a1) {
2885; CHECK-LABEL: stack_fold_psadbw:
2886; CHECK:       # %bb.0:
2887; CHECK-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2888; CHECK-NEXT:    #APP
2889; CHECK-NEXT:    nop
2890; CHECK-NEXT:    #NO_APP
2891; CHECK-NEXT:    vpsadbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
2892; CHECK-NEXT:    retq
  ; Side-effecting nop that clobbers xmm2-xmm31 and flags (each register named
  ; once); the expected output shows %a1 (xmm1) being spilled around it.
2893  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
2894  %2 = call <2 x i64> @llvm.x86.sse2.psad.bw(<16 x i8> %a0, <16 x i8> %a1)
2895  ret <2 x i64> %2
2896}
; 128-bit psadbw intrinsic used by the psadbw tests.
2897declare <2 x i64> @llvm.x86.sse2.psad.bw(<16 x i8>, <16 x i8>) nounwind readnone
2898
; Commuted variant of the 128-bit vpsadbw stack-fold test: the intrinsic is
; called with (%a1, %a0), so the spilled %a1 is the first IR operand, yet the
; expected assembly still folds it as the memory operand of vpsadbw.
2899define <2 x i64> @stack_fold_psadbw_commute(<16 x i8> %a0, <16 x i8> %a1) {
2900; CHECK-LABEL: stack_fold_psadbw_commute:
2901; CHECK:       # %bb.0:
2902; CHECK-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2903; CHECK-NEXT:    #APP
2904; CHECK-NEXT:    nop
2905; CHECK-NEXT:    #NO_APP
2906; CHECK-NEXT:    vpsadbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
2907; CHECK-NEXT:    retq
  ; Side-effecting nop that clobbers xmm2-xmm31 and flags (each register named
  ; once); the expected output shows %a1 (xmm1) being spilled around it.
2908  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  ; Operands deliberately swapped relative to the non-commuted test.
2909  %2 = call <2 x i64> @llvm.x86.sse2.psad.bw(<16 x i8> %a1, <16 x i8> %a0)
2910  ret <2 x i64> %2
2911}
2912
; Stack-fold test for the 256-bit vpsadbw: the spilled second operand %a1 is
; expected to be folded into the instruction as a memory operand.
2913define <4 x i64> @stack_fold_psadbw_ymm(<32 x i8> %a0, <32 x i8> %a1) {
2914; CHECK-LABEL: stack_fold_psadbw_ymm:
2915; CHECK:       # %bb.0:
2916; CHECK-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2917; CHECK-NEXT:    #APP
2918; CHECK-NEXT:    nop
2919; CHECK-NEXT:    #NO_APP
2920; CHECK-NEXT:    vpsadbw {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
2921; CHECK-NEXT:    retq
  ; Side-effecting nop that clobbers xmm2-xmm31 and flags (each register named
  ; once); the expected output shows %a1 (ymm1) being spilled around it.
2922  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
2923  %2 = call <4 x i64> @llvm.x86.avx2.psad.bw(<32 x i8> %a0, <32 x i8> %a1)
2924  ret <4 x i64> %2
2925}
; 256-bit psadbw intrinsic used by the ymm psadbw tests.
2926declare <4 x i64> @llvm.x86.avx2.psad.bw(<32 x i8>, <32 x i8>) nounwind readnone
2927
; Commuted variant of the 256-bit vpsadbw stack-fold test: the intrinsic is
; called with (%a1, %a0), so the spilled %a1 is the first IR operand, yet the
; expected assembly still folds it as the memory operand of vpsadbw.
2928define <4 x i64> @stack_fold_psadbw_ymm_commute(<32 x i8> %a0, <32 x i8> %a1) {
2929; CHECK-LABEL: stack_fold_psadbw_ymm_commute:
2930; CHECK:       # %bb.0:
2931; CHECK-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2932; CHECK-NEXT:    #APP
2933; CHECK-NEXT:    nop
2934; CHECK-NEXT:    #NO_APP
2935; CHECK-NEXT:    vpsadbw {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
2936; CHECK-NEXT:    retq
  ; Side-effecting nop that clobbers xmm2-xmm31 and flags (each register named
  ; once); the expected output shows %a1 (ymm1) being spilled around it.
2937  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  ; Operands deliberately swapped relative to the non-commuted test.
2938  %2 = call <4 x i64> @llvm.x86.avx2.psad.bw(<32 x i8> %a1, <32 x i8> %a0)
2939  ret <4 x i64> %2
2940}
2941
; Stack-fold test for the 128-bit vpshufb: the spilled second operand %a1 is
; expected to be folded into the instruction as a memory operand.
2942define <16 x i8> @stack_fold_pshufb(<16 x i8> %a0, <16 x i8> %a1) {
2943; CHECK-LABEL: stack_fold_pshufb:
2944; CHECK:       # %bb.0:
2945; CHECK-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2946; CHECK-NEXT:    #APP
2947; CHECK-NEXT:    nop
2948; CHECK-NEXT:    #NO_APP
2949; CHECK-NEXT:    vpshufb {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
2950; CHECK-NEXT:    retq
  ; Side-effecting nop that clobbers xmm2-xmm31 and flags; the expected output
  ; shows %a1 (xmm1) being spilled around it.
2951  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
2952  %2 = call <16 x i8> @llvm.x86.ssse3.pshuf.b.128(<16 x i8> %a0, <16 x i8> %a1)
2953  ret <16 x i8> %2
2954}
; 128-bit pshufb intrinsic used by the xmm pshufb tests.
2955declare <16 x i8> @llvm.x86.ssse3.pshuf.b.128(<16 x i8>, <16 x i8>) nounwind readnone
2956
; Merge-masking variant of the 128-bit vpshufb stack-fold test: the passthru
; vector is loaded through a pointer, and the spilled %a1 is expected to be
; folded into the {%k1} form of vpshufb as a memory operand.
2957define <16 x i8> @stack_fold_pshufb_mask(<16 x i8>* %passthru, <16 x i8> %a0, <16 x i8> %a1, i16 %mask) {
2958; CHECK-LABEL: stack_fold_pshufb_mask:
2959; CHECK:       # %bb.0:
2960; CHECK-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2961; CHECK-NEXT:    #APP
2962; CHECK-NEXT:    nop
2963; CHECK-NEXT:    #NO_APP
2964; CHECK-NEXT:    vmovdqa (%rdi), %xmm2
2965; CHECK-NEXT:    kmovd %esi, %k1
2966; CHECK-NEXT:    vpshufb {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm2 {%k1} # 16-byte Folded Reload
2967; CHECK-NEXT:    vmovdqa %xmm2, %xmm0
2968; CHECK-NEXT:    retq
  ; Side-effecting nop that clobbers xmm2-xmm31 and flags; the expected output
  ; shows %a1 (xmm1) being spilled around it.
2969  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
2970  %2 = load <16 x i8>, <16 x i8>* %passthru
2971  %3 = call <16 x i8> @llvm.x86.ssse3.pshuf.b.128(<16 x i8> %a0, <16 x i8> %a1)
  ; One mask bit per byte lane; masked-off lanes take the loaded passthru value.
2972  %4 = bitcast i16 %mask to <16 x i1>
2973  %5 = select <16 x i1> %4, <16 x i8> %3, <16 x i8> %2
2974  ret <16 x i8> %5
2975}
2976
; Zero-masking variant of the 128-bit vpshufb stack-fold test: the spilled %a1
; is expected to be folded into the {%k1} {z} form of vpshufb as a memory
; operand.
2977define <16 x i8> @stack_fold_pshufb_maskz(<16 x i8> %a0, <16 x i8> %a1, i16 %mask) {
2978; CHECK-LABEL: stack_fold_pshufb_maskz:
2979; CHECK:       # %bb.0:
2980; CHECK-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2981; CHECK-NEXT:    #APP
2982; CHECK-NEXT:    nop
2983; CHECK-NEXT:    #NO_APP
2984; CHECK-NEXT:    kmovd %edi, %k1
2985; CHECK-NEXT:    vpshufb {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 {%k1} {z} # 16-byte Folded Reload
2986; CHECK-NEXT:    retq
  ; Side-effecting nop that clobbers xmm2-xmm31 and flags; the expected output
  ; shows %a1 (xmm1) being spilled around it.
2987  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
2988  %2 = call <16 x i8> @llvm.x86.ssse3.pshuf.b.128(<16 x i8> %a0, <16 x i8> %a1)
  ; One mask bit per byte lane; masked-off lanes become zero.
2989  %3 = bitcast i16 %mask to <16 x i1>
2990  %4 = select <16 x i1> %3, <16 x i8> %2, <16 x i8> zeroinitializer
2991  ret <16 x i8> %4
2992}
2993
; Stack-fold test for the 256-bit vpshufb: the spilled second operand %a1 is
; expected to be folded into the instruction as a memory operand.
2994define <32 x i8> @stack_fold_pshufb_ymm(<32 x i8> %a0, <32 x i8> %a1) {
2995; CHECK-LABEL: stack_fold_pshufb_ymm:
2996; CHECK:       # %bb.0:
2997; CHECK-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2998; CHECK-NEXT:    #APP
2999; CHECK-NEXT:    nop
3000; CHECK-NEXT:    #NO_APP
3001; CHECK-NEXT:    vpshufb {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
3002; CHECK-NEXT:    retq
  ; Side-effecting nop that clobbers xmm2-xmm31 and flags; the expected output
  ; shows %a1 (ymm1) being spilled around it.
3003  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
3004  %2 = call <32 x i8> @llvm.x86.avx2.pshuf.b(<32 x i8> %a0, <32 x i8> %a1)
3005  ret <32 x i8> %2
3006}
; 256-bit pshufb intrinsic; attributes spelled out to match the 128-bit
; pshufb declaration in this file.
3007declare <32 x i8> @llvm.x86.avx2.pshuf.b(<32 x i8>, <32 x i8>) nounwind readnone
3008
; Merge-masking variant of the 256-bit vpshufb stack-fold test: the passthru
; vector is loaded through a pointer, and the spilled %a1 is expected to be
; folded into the {%k1} form of vpshufb as a memory operand.
3009define <32 x i8> @stack_fold_pshufb_ymm_mask(<32 x i8>* %passthru, <32 x i8> %a0, <32 x i8> %a1, i32 %mask) {
3010; CHECK-LABEL: stack_fold_pshufb_ymm_mask:
3011; CHECK:       # %bb.0:
3012; CHECK-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
3013; CHECK-NEXT:    #APP
3014; CHECK-NEXT:    nop
3015; CHECK-NEXT:    #NO_APP
3016; CHECK-NEXT:    vmovdqa (%rdi), %ymm2
3017; CHECK-NEXT:    kmovd %esi, %k1
3018; CHECK-NEXT:    vpshufb {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm2 {%k1} # 32-byte Folded Reload
3019; CHECK-NEXT:    vmovdqa %ymm2, %ymm0
3020; CHECK-NEXT:    retq
  ; Side-effecting nop that clobbers xmm2-xmm31 and flags; the expected output
  ; shows %a1 (ymm1) being spilled around it.
3021  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
3022  %2 = load <32 x i8>, <32 x i8>* %passthru
3023  %3 = call <32 x i8> @llvm.x86.avx2.pshuf.b(<32 x i8> %a0, <32 x i8> %a1)
  ; One mask bit per byte lane; masked-off lanes take the loaded passthru value.
3024  %4 = bitcast i32 %mask to <32 x i1>
3025  %5 = select <32 x i1> %4, <32 x i8> %3, <32 x i8> %2
3026  ret <32 x i8> %5
3027}
3028
; Zero-masking variant of the 256-bit vpshufb stack-fold test: the spilled %a1
; is expected to be folded into the {%k1} {z} form of vpshufb as a memory
; operand.
3029define <32 x i8> @stack_fold_pshufb_ymm_maskz(<32 x i8> %a0, <32 x i8> %a1, i32 %mask) {
3030; CHECK-LABEL: stack_fold_pshufb_ymm_maskz:
3031; CHECK:       # %bb.0:
3032; CHECK-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
3033; CHECK-NEXT:    #APP
3034; CHECK-NEXT:    nop
3035; CHECK-NEXT:    #NO_APP
3036; CHECK-NEXT:    kmovd %edi, %k1
3037; CHECK-NEXT:    vpshufb {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 {%k1} {z} # 32-byte Folded Reload
3038; CHECK-NEXT:    retq
  ; Side-effecting nop that clobbers xmm2-xmm31 and flags; the expected output
  ; shows %a1 (ymm1) being spilled around it.
3039  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
3040  %2 = call <32 x i8> @llvm.x86.avx2.pshuf.b(<32 x i8> %a0, <32 x i8> %a1)
  ; One mask bit per byte lane; masked-off lanes become zero.
3041  %3 = bitcast i32 %mask to <32 x i1>
3042  %4 = select <32 x i1> %3, <32 x i8> %2, <32 x i8> zeroinitializer
3043  ret <32 x i8> %4
3044}
3045
; Stack-fold test for the 128-bit vpshufd: a lane-reversing shuffle of
; <4 x i32> is expected to read its spilled source straight from the stack
; slot, followed by an integer add of 1 (emitted as vpcmpeqd + vpsubd).
3046define <4 x i32> @stack_fold_pshufd(<4 x i32> %a0) {
3047; CHECK-LABEL: stack_fold_pshufd:
3048; CHECK:       # %bb.0:
3049; CHECK-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3050; CHECK-NEXT:    #APP
3051; CHECK-NEXT:    nop
3052; CHECK-NEXT:    #NO_APP
3053; CHECK-NEXT:    vpshufd $27, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
3054; CHECK-NEXT:    # xmm0 = mem[3,2,1,0]
3055; CHECK-NEXT:    vpcmpeqd %xmm1, %xmm1, %xmm1
3056; CHECK-NEXT:    vpsubd %xmm1, %xmm0, %xmm0
3057; CHECK-NEXT:    retq
  ; Side-effecting nop that clobbers xmm1-xmm31 and flags; the expected output
  ; shows %a0 (xmm0) being spilled around it.
3058  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  ; Reverse the four lanes (immediate $27 in the expected vpshufd).
3059  %2 = shufflevector <4 x i32> %a0, <4 x i32> undef, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
  ; NOTE(review): the add presumably keeps the shuffle in the integer domain so
  ; that vpshufd rather than a floating-point shuffle is selected - confirm.
3060  %3 = add <4 x i32> %2, <i32 1, i32 1, i32 1, i32 1>
3061  ret <4 x i32> %3
3062}
3063
; Merge-masking variant of the 128-bit vpshufd stack-fold test: both vector
; arguments are expected to be spilled, with one reloaded into xmm0 and the
; other folded into the {%k1} form of vpshufd as a memory operand.
3064define <4 x i32> @stack_fold_pshufd_mask(<4 x i32> %passthru, <4 x i32> %a0, i8 %mask) {
3065; CHECK-LABEL: stack_fold_pshufd_mask:
3066; CHECK:       # %bb.0:
3067; CHECK-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3068; CHECK-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3069; CHECK-NEXT:    #APP
3070; CHECK-NEXT:    nop
3071; CHECK-NEXT:    #NO_APP
3072; CHECK-NEXT:    kmovd %edi, %k1
3073; CHECK-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
3074; CHECK-NEXT:    vpshufd $27, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 {%k1} # 16-byte Folded Reload
3075; CHECK-NEXT:    # xmm0 {%k1} = mem[3,2,1,0]
3076; CHECK-NEXT:    retq
  ; Side-effecting nop that clobbers xmm1-xmm31 and flags; the expected output
  ; shows both %passthru (xmm0) and %a0 (xmm1) being spilled around it.
3077  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  ; Reverse the four lanes (immediate $27 in the expected vpshufd).
3078  %2 = shufflevector <4 x i32> %a0, <4 x i32> undef, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
  ; Only the low four bits of the i8 mask are used as the per-lane predicate.
3079  %3 = bitcast i8 %mask to <8 x i1>
3080  %4 = shufflevector <8 x i1> %3, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  ; Masked-off lanes take the corresponding lane of %passthru.
3081  %5 = select <4 x i1> %4, <4 x i32> %2, <4 x i32> %passthru
3082  ret <4 x i32> %5
3083}
3084
; Zero-masking variant of the 128-bit vpshufd stack-fold test: the spilled
; source is expected to be folded into the {%k1} {z} form of vpshufd as a
; memory operand.
3085define <4 x i32> @stack_fold_pshufd_maskz(<4 x i32> %a0, i8 %mask) {
3086; CHECK-LABEL: stack_fold_pshufd_maskz:
3087; CHECK:       # %bb.0:
3088; CHECK-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3089; CHECK-NEXT:    #APP
3090; CHECK-NEXT:    nop
3091; CHECK-NEXT:    #NO_APP
3092; CHECK-NEXT:    kmovd %edi, %k1
3093; CHECK-NEXT:    vpshufd $27, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 {%k1} {z} # 16-byte Folded Reload
3094; CHECK-NEXT:    # xmm0 {%k1} {z} = mem[3,2,1,0]
3095; CHECK-NEXT:    retq
  ; Side-effecting nop that clobbers xmm1-xmm31 and flags; the expected output
  ; shows %a0 (xmm0) being spilled around it.
3096  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  ; Reverse the four lanes (immediate $27 in the expected vpshufd).
3097  %2 = shufflevector <4 x i32> %a0, <4 x i32> undef, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
  ; Only the low four bits of the i8 mask are used as the per-lane predicate.
3098  %3 = bitcast i8 %mask to <8 x i1>
3099  %4 = shufflevector <8 x i1> %3, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  ; Masked-off lanes become zero.
3100  %5 = select <4 x i1> %4, <4 x i32> %2, <4 x i32> zeroinitializer
3101  ret <4 x i32> %5
3102}
3103
; Stack-fold test for the 256-bit vpshufd: a shuffle that reverses the lanes
; within each 128-bit half of <8 x i32> is expected to read its spilled source
; straight from the stack slot, followed by an integer add of 1 (emitted as
; vpcmpeqd + vpsubd).
3104define <8 x i32> @stack_fold_pshufd_ymm(<8 x i32> %a0) {
3105; CHECK-LABEL: stack_fold_pshufd_ymm:
3106; CHECK:       # %bb.0:
3107; CHECK-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
3108; CHECK-NEXT:    #APP
3109; CHECK-NEXT:    nop
3110; CHECK-NEXT:    #NO_APP
3111; CHECK-NEXT:    vpshufd $27, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Folded Reload
3112; CHECK-NEXT:    # ymm0 = mem[3,2,1,0,7,6,5,4]
3113; CHECK-NEXT:    vpcmpeqd %ymm1, %ymm1, %ymm1
3114; CHECK-NEXT:    vpsubd %ymm1, %ymm0, %ymm0
3115; CHECK-NEXT:    retq
  ; Side-effecting nop that clobbers xmm1-xmm31 and flags; the expected output
  ; shows %a0 (ymm0) being spilled around it.
3116  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  ; Reverse the lanes within each 128-bit half (immediate $27 in the expected vpshufd).
3117  %2 = shufflevector <8 x i32> %a0, <8 x i32> undef, <8 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4>
  ; NOTE(review): the add presumably keeps the shuffle in the integer domain so
  ; that vpshufd rather than a floating-point shuffle is selected - confirm.
3118  %3 = add <8 x i32> %2, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
3119  ret <8 x i32> %3
3120}
3121
; Merge-masking variant of the 256-bit vpshufd stack-fold test: both vector
; arguments are expected to be spilled, with one reloaded into ymm0 and the
; other folded into the {%k1} form of vpshufd as a memory operand.
3122define <8 x i32> @stack_fold_pshufd_ymm_mask(<8 x i32> %passthru, <8 x i32> %a0, i8 %mask) {
3123; CHECK-LABEL: stack_fold_pshufd_ymm_mask:
3124; CHECK:       # %bb.0:
3125; CHECK-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
3126; CHECK-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
3127; CHECK-NEXT:    #APP
3128; CHECK-NEXT:    nop
3129; CHECK-NEXT:    #NO_APP
3130; CHECK-NEXT:    kmovd %edi, %k1
3131; CHECK-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
3132; CHECK-NEXT:    vpshufd $27, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 {%k1} # 32-byte Folded Reload
3133; CHECK-NEXT:    # ymm0 {%k1} = mem[3,2,1,0,7,6,5,4]
3134; CHECK-NEXT:    retq
  ; Side-effecting nop that clobbers xmm1-xmm31 and flags; the expected output
  ; shows both %passthru (ymm0) and %a0 (ymm1) being spilled around it.
3135  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  ; Reverse the lanes within each 128-bit half (immediate $27 in the expected vpshufd).
3136  %2 = shufflevector <8 x i32> %a0, <8 x i32> undef, <8 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4>
  ; All eight mask bits are used, one per i32 lane; masked-off lanes take the
  ; corresponding lane of %passthru.
3137  %3 = bitcast i8 %mask to <8 x i1>
3138  %4 = select <8 x i1> %3, <8 x i32> %2, <8 x i32> %passthru
3139  ret <8 x i32> %4
3140}
3141
; Zero-masked variant of the 256-bit vpshufd stack-fold test: lanes whose %mask bit is
; clear are selected from zeroinitializer. The CHECK lines expect the spilled ymm0 to be
; folded into vpshufd {%k1} {z}.
3142define <8 x i32> @stack_fold_pshufd_ymm_maskz(<8 x i32> %a0, i8 %mask) {
3143; CHECK-LABEL: stack_fold_pshufd_ymm_maskz:
3144; CHECK:       # %bb.0:
3145; CHECK-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
3146; CHECK-NEXT:    #APP
3147; CHECK-NEXT:    nop
3148; CHECK-NEXT:    #NO_APP
3149; CHECK-NEXT:    kmovd %edi, %k1
3150; CHECK-NEXT:    vpshufd $27, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 {%k1} {z} # 32-byte Folded Reload
3151; CHECK-NEXT:    # ymm0 {%k1} {z} = mem[3,2,1,0,7,6,5,4]
3152; CHECK-NEXT:    retq
3153  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
3154  %2 = shufflevector <8 x i32> %a0, <8 x i32> undef, <8 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4>
3155  %3 = bitcast i8 %mask to <8 x i1>
3156  %4 = select <8 x i1> %3, <8 x i32> %2, <8 x i32> zeroinitializer
3157  ret <8 x i32> %4
3158}
3159
; Stack-fold test for 128-bit vpshufhw: the shuffle leaves elements 0-3 in place and
; permutes the upper four as 7,6,4,4. The CHECK lines expect the spilled xmm0 to be
; folded into vpshufhw $11 as its memory operand.
3160define <8 x i16> @stack_fold_pshufhw(<8 x i16> %a0) {
3161; CHECK-LABEL: stack_fold_pshufhw:
3162; CHECK:       # %bb.0:
3163; CHECK-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3164; CHECK-NEXT:    #APP
3165; CHECK-NEXT:    nop
3166; CHECK-NEXT:    #NO_APP
3167; CHECK-NEXT:    vpshufhw $11, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
3168; CHECK-NEXT:    # xmm0 = mem[0,1,2,3,7,6,4,4]
3169; CHECK-NEXT:    retq
3170  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
3171  %2 = shufflevector <8 x i16> %a0, <8 x i16> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 7, i32 6, i32 4, i32 4>
3172  ret <8 x i16> %2
3173}
3174
; Merge-masked variant of the 128-bit vpshufhw stack-fold test: the shuffle result is
; selected against %passthru under the i8 %mask. The CHECK lines expect both vector
; arguments to be spilled, one reloaded into xmm0 and the other folded into vpshufhw {%k1}.
3175define <8 x i16> @stack_fold_pshufhw_mask(<8 x i16> %passthru, <8 x i16> %a0, i8 %mask) {
3176; CHECK-LABEL: stack_fold_pshufhw_mask:
3177; CHECK:       # %bb.0:
3178; CHECK-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3179; CHECK-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3180; CHECK-NEXT:    #APP
3181; CHECK-NEXT:    nop
3182; CHECK-NEXT:    #NO_APP
3183; CHECK-NEXT:    kmovd %edi, %k1
3184; CHECK-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
3185; CHECK-NEXT:    vpshufhw $11, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 {%k1} # 16-byte Folded Reload
3186; CHECK-NEXT:    # xmm0 {%k1} = mem[0,1,2,3,7,6,4,4]
3187; CHECK-NEXT:    retq
3188  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
3189  %2 = shufflevector <8 x i16> %a0, <8 x i16> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 7, i32 6, i32 4, i32 4>
3190  %3 = bitcast i8 %mask to <8 x i1>
3191  %4 = select <8 x i1> %3, <8 x i16> %2, <8 x i16> %passthru
3192  ret <8 x i16> %4
3193}
3194
; Zero-masked variant of the 128-bit vpshufhw stack-fold test: lanes whose %mask bit is
; clear are selected from zeroinitializer. The CHECK lines expect the spilled xmm0 to be
; folded into vpshufhw {%k1} {z}.
3195define <8 x i16> @stack_fold_pshufhw_maskz(<8 x i16> %a0, i8 %mask) {
3196; CHECK-LABEL: stack_fold_pshufhw_maskz:
3197; CHECK:       # %bb.0:
3198; CHECK-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3199; CHECK-NEXT:    #APP
3200; CHECK-NEXT:    nop
3201; CHECK-NEXT:    #NO_APP
3202; CHECK-NEXT:    kmovd %edi, %k1
3203; CHECK-NEXT:    vpshufhw $11, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 {%k1} {z} # 16-byte Folded Reload
3204; CHECK-NEXT:    # xmm0 {%k1} {z} = mem[0,1,2,3,7,6,4,4]
3205; CHECK-NEXT:    retq
3206  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
3207  %2 = shufflevector <8 x i16> %a0, <8 x i16> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 7, i32 6, i32 4, i32 4>
3208  %3 = bitcast i8 %mask to <8 x i1>
3209  %4 = select <8 x i1> %3, <8 x i16> %2, <8 x i16> zeroinitializer
3210  ret <8 x i16> %4
3211}
3212
; Stack-fold test for 256-bit vpshufhw: within each group of eight i16 elements the
; lower four stay in place and the upper four are reversed. The CHECK lines expect the
; spilled ymm0 to be folded into vpshufhw $27 as its memory operand.
3213define <16 x i16> @stack_fold_pshufhw_ymm(<16 x i16> %a0) {
3214; CHECK-LABEL: stack_fold_pshufhw_ymm:
3215; CHECK:       # %bb.0:
3216; CHECK-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
3217; CHECK-NEXT:    #APP
3218; CHECK-NEXT:    nop
3219; CHECK-NEXT:    #NO_APP
3220; CHECK-NEXT:    vpshufhw $27, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Folded Reload
3221; CHECK-NEXT:    # ymm0 = mem[0,1,2,3,7,6,5,4,8,9,10,11,15,14,13,12]
3222; CHECK-NEXT:    retq
3223  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
3224  %2 = shufflevector <16 x i16> %a0, <16 x i16> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 7, i32 6, i32 5, i32 4, i32 8, i32 9, i32 10, i32 11, i32 15, i32 14, i32 13, i32 12>
3225  ret <16 x i16> %2
3226}
3227
; Merge-masked variant of the 256-bit vpshufhw stack-fold test: the shuffle result is
; selected against %passthru under the i16 %mask. The CHECK lines expect both vector
; arguments to be spilled, one reloaded into ymm0 and the other folded into vpshufhw {%k1}.
3228define <16 x i16> @stack_fold_pshufhw_ymm_mask(<16 x i16> %passthru, <16 x i16> %a0, i16 %mask) {
3229; CHECK-LABEL: stack_fold_pshufhw_ymm_mask:
3230; CHECK:       # %bb.0:
3231; CHECK-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
3232; CHECK-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
3233; CHECK-NEXT:    #APP
3234; CHECK-NEXT:    nop
3235; CHECK-NEXT:    #NO_APP
3236; CHECK-NEXT:    kmovd %edi, %k1
3237; CHECK-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
3238; CHECK-NEXT:    vpshufhw $27, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 {%k1} # 32-byte Folded Reload
3239; CHECK-NEXT:    # ymm0 {%k1} = mem[0,1,2,3,7,6,5,4,8,9,10,11,15,14,13,12]
3240; CHECK-NEXT:    retq
3241  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
3242  %2 = shufflevector <16 x i16> %a0, <16 x i16> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 7, i32 6, i32 5, i32 4, i32 8, i32 9, i32 10, i32 11, i32 15, i32 14, i32 13, i32 12>
3243  %3 = bitcast i16 %mask to <16 x i1>
3244  %4 = select <16 x i1> %3, <16 x i16> %2, <16 x i16> %passthru
3245  ret <16 x i16> %4
3246}
3247
; Zero-masked variant of the 256-bit vpshufhw stack-fold test: lanes whose %mask bit is
; clear are selected from zeroinitializer. The CHECK lines expect the spilled ymm0 to be
; folded into vpshufhw {%k1} {z}.
3248define <16 x i16> @stack_fold_pshufhw_ymm_maskz(<16 x i16> %a0, i16 %mask) {
3249; CHECK-LABEL: stack_fold_pshufhw_ymm_maskz:
3250; CHECK:       # %bb.0:
3251; CHECK-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
3252; CHECK-NEXT:    #APP
3253; CHECK-NEXT:    nop
3254; CHECK-NEXT:    #NO_APP
3255; CHECK-NEXT:    kmovd %edi, %k1
3256; CHECK-NEXT:    vpshufhw $27, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 {%k1} {z} # 32-byte Folded Reload
3257; CHECK-NEXT:    # ymm0 {%k1} {z} = mem[0,1,2,3,7,6,5,4,8,9,10,11,15,14,13,12]
3258; CHECK-NEXT:    retq
3259  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
3260  %2 = shufflevector <16 x i16> %a0, <16 x i16> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 7, i32 6, i32 5, i32 4, i32 8, i32 9, i32 10, i32 11, i32 15, i32 14, i32 13, i32 12>
3261  %3 = bitcast i16 %mask to <16 x i1>
3262  %4 = select <16 x i1> %3, <16 x i16> %2, <16 x i16> zeroinitializer
3263  ret <16 x i16> %4
3264}
3265
; Stack-fold test for 128-bit vpshuflw: the shuffle reverses elements 0-3 and leaves
; elements 4-7 in place. The CHECK lines expect the spilled xmm0 to be folded into
; vpshuflw $27 as its memory operand.
3266define <8 x i16> @stack_fold_pshuflw(<8 x i16> %a0) {
3267; CHECK-LABEL: stack_fold_pshuflw:
3268; CHECK:       # %bb.0:
3269; CHECK-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3270; CHECK-NEXT:    #APP
3271; CHECK-NEXT:    nop
3272; CHECK-NEXT:    #NO_APP
3273; CHECK-NEXT:    vpshuflw $27, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
3274; CHECK-NEXT:    # xmm0 = mem[3,2,1,0,4,5,6,7]
3275; CHECK-NEXT:    retq
3276  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
3277  %2 = shufflevector <8 x i16> %a0, <8 x i16> undef, <8 x i32> <i32 3, i32 2, i32 1, i32 0, i32 4, i32 5, i32 6, i32 7>
3278  ret <8 x i16> %2
3279}
3280
; Merge-masked variant of the 128-bit vpshuflw stack-fold test: the shuffle result is
; selected against %passthru under the i8 %mask. The CHECK lines expect both vector
; arguments to be spilled, one reloaded into xmm0 and the other folded into vpshuflw {%k1}.
3281define <8 x i16> @stack_fold_pshuflw_mask(<8 x i16> %passthru, <8 x i16> %a0, i8 %mask) {
3282; CHECK-LABEL: stack_fold_pshuflw_mask:
3283; CHECK:       # %bb.0:
3284; CHECK-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3285; CHECK-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3286; CHECK-NEXT:    #APP
3287; CHECK-NEXT:    nop
3288; CHECK-NEXT:    #NO_APP
3289; CHECK-NEXT:    kmovd %edi, %k1
3290; CHECK-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
3291; CHECK-NEXT:    vpshuflw $27, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 {%k1} # 16-byte Folded Reload
3292; CHECK-NEXT:    # xmm0 {%k1} = mem[3,2,1,0,4,5,6,7]
3293; CHECK-NEXT:    retq
3294  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
3295  %2 = shufflevector <8 x i16> %a0, <8 x i16> undef, <8 x i32> <i32 3, i32 2, i32 1, i32 0, i32 4, i32 5, i32 6, i32 7>
3296  %3 = bitcast i8 %mask to <8 x i1>
3297  %4 = select <8 x i1> %3, <8 x i16> %2, <8 x i16> %passthru
3298  ret <8 x i16> %4
3299}
3300
; Zero-masked variant of the 128-bit vpshuflw stack-fold test: lanes whose %mask bit is
; clear are selected from zeroinitializer. The CHECK lines expect the spilled xmm0 to be
; folded into vpshuflw {%k1} {z}.
3301define <8 x i16> @stack_fold_pshuflw_maskz(<8 x i16> %a0, i8 %mask) {
3302; CHECK-LABEL: stack_fold_pshuflw_maskz:
3303; CHECK:       # %bb.0:
3304; CHECK-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3305; CHECK-NEXT:    #APP
3306; CHECK-NEXT:    nop
3307; CHECK-NEXT:    #NO_APP
3308; CHECK-NEXT:    kmovd %edi, %k1
3309; CHECK-NEXT:    vpshuflw $27, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 {%k1} {z} # 16-byte Folded Reload
3310; CHECK-NEXT:    # xmm0 {%k1} {z} = mem[3,2,1,0,4,5,6,7]
3311; CHECK-NEXT:    retq
3312  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
3313  %2 = shufflevector <8 x i16> %a0, <8 x i16> undef, <8 x i32> <i32 3, i32 2, i32 1, i32 0, i32 4, i32 5, i32 6, i32 7>
3314  %3 = bitcast i8 %mask to <8 x i1>
3315  %4 = select <8 x i1> %3, <8 x i16> %2, <8 x i16> zeroinitializer
3316  ret <8 x i16> %4
3317}
3318
; Stack-fold test for 256-bit vpshuflw: within each group of eight i16 elements the
; lower four are reversed and the upper four stay in place. The CHECK lines expect the
; spilled ymm0 to be folded into vpshuflw $27 as its memory operand.
3319define <16 x i16> @stack_fold_pshuflw_ymm(<16 x i16> %a0) {
3320; CHECK-LABEL: stack_fold_pshuflw_ymm:
3321; CHECK:       # %bb.0:
3322; CHECK-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
3323; CHECK-NEXT:    #APP
3324; CHECK-NEXT:    nop
3325; CHECK-NEXT:    #NO_APP
3326; CHECK-NEXT:    vpshuflw $27, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Folded Reload
3327; CHECK-NEXT:    # ymm0 = mem[3,2,1,0,4,5,6,7,11,10,9,8,12,13,14,15]
3328; CHECK-NEXT:    retq
3329  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
3330  %2 = shufflevector <16 x i16> %a0, <16 x i16> undef, <16 x i32> <i32 3, i32 2, i32 1, i32 0, i32 4, i32 5, i32 6, i32 7, i32 11, i32 10, i32 9, i32 8, i32 12, i32 13, i32 14, i32 15>
3331  ret <16 x i16> %2
3332}
3333
; Merge-masked variant of the 256-bit vpshuflw stack-fold test: the shuffle result is
; selected against %passthru under the i16 %mask. The CHECK lines expect both vector
; arguments to be spilled, one reloaded into ymm0 and the other folded into vpshuflw {%k1}.
3334define <16 x i16> @stack_fold_pshuflw_ymm_mask(<16 x i16> %passthru, <16 x i16> %a0, i16 %mask) {
3335; CHECK-LABEL: stack_fold_pshuflw_ymm_mask:
3336; CHECK:       # %bb.0:
3337; CHECK-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
3338; CHECK-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
3339; CHECK-NEXT:    #APP
3340; CHECK-NEXT:    nop
3341; CHECK-NEXT:    #NO_APP
3342; CHECK-NEXT:    kmovd %edi, %k1
3343; CHECK-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
3344; CHECK-NEXT:    vpshuflw $27, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 {%k1} # 32-byte Folded Reload
3345; CHECK-NEXT:    # ymm0 {%k1} = mem[3,2,1,0,4,5,6,7,11,10,9,8,12,13,14,15]
3346; CHECK-NEXT:    retq
3347  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
3348  %2 = shufflevector <16 x i16> %a0, <16 x i16> undef, <16 x i32> <i32 3, i32 2, i32 1, i32 0, i32 4, i32 5, i32 6, i32 7, i32 11, i32 10, i32 9, i32 8, i32 12, i32 13, i32 14, i32 15>
3349  %3 = bitcast i16 %mask to <16 x i1>
3350  %4 = select <16 x i1> %3, <16 x i16> %2, <16 x i16> %passthru
3351  ret <16 x i16> %4
3352}
3353
; Zero-masked variant of the 256-bit vpshuflw stack-fold test: lanes whose %mask bit is
; clear are selected from zeroinitializer. The CHECK lines expect the spilled ymm0 to be
; folded into vpshuflw {%k1} {z}.
3354define <16 x i16> @stack_fold_pshuflw_ymm_maskz(<16 x i16> %a0, i16 %mask) {
3355; CHECK-LABEL: stack_fold_pshuflw_ymm_maskz:
3356; CHECK:       # %bb.0:
3357; CHECK-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
3358; CHECK-NEXT:    #APP
3359; CHECK-NEXT:    nop
3360; CHECK-NEXT:    #NO_APP
3361; CHECK-NEXT:    kmovd %edi, %k1
3362; CHECK-NEXT:    vpshuflw $27, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 {%k1} {z} # 32-byte Folded Reload
3363; CHECK-NEXT:    # ymm0 {%k1} {z} = mem[3,2,1,0,4,5,6,7,11,10,9,8,12,13,14,15]
3364; CHECK-NEXT:    retq
3365  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
3366  %2 = shufflevector <16 x i16> %a0, <16 x i16> undef, <16 x i32> <i32 3, i32 2, i32 1, i32 0, i32 4, i32 5, i32 6, i32 7, i32 11, i32 10, i32 9, i32 8, i32 12, i32 13, i32 14, i32 15>
3367  %3 = bitcast i16 %mask to <16 x i1>
3368  %4 = select <16 x i1> %3, <16 x i16> %2, <16 x i16> zeroinitializer
3369  ret <16 x i16> %4
3370}
3371
; Stack-fold test for 128-bit vpslld via the llvm.x86.sse2.psll.d intrinsic. The nop asm
; clobbers xmm2-xmm31 and flags; the CHECK lines expect xmm1 to be spilled and its
; reload folded into vpslld as the memory operand.
3372define <4 x i32> @stack_fold_pslld(<4 x i32> %a0, <4 x i32> %a1) {
3373; CHECK-LABEL: stack_fold_pslld:
3374; CHECK:       # %bb.0:
3375; CHECK-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3376; CHECK-NEXT:    #APP
3377; CHECK-NEXT:    nop
3378; CHECK-NEXT:    #NO_APP
3379; CHECK-NEXT:    vpslld {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
3380; CHECK-NEXT:    retq
3381  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
3382  %2 = call <4 x i32> @llvm.x86.sse2.psll.d(<4 x i32> %a0, <4 x i32> %a1)
3383  ret <4 x i32> %2
3384}
3385declare <4 x i32> @llvm.x86.sse2.psll.d(<4 x i32>, <4 x i32>) nounwind readnone
3386
; Stack-fold test for 256-bit vpslld via the llvm.x86.avx2.psll.d intrinsic. The second
; operand is a 128-bit <4 x i32>, so the CHECK lines expect a 16-byte spill of xmm1 and
; a 16-byte folded reload into the ymm form of vpslld.
3387define <8 x i32> @stack_fold_pslld_ymm(<8 x i32> %a0, <4 x i32> %a1) {
3388; CHECK-LABEL: stack_fold_pslld_ymm:
3389; CHECK:       # %bb.0:
3390; CHECK-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3391; CHECK-NEXT:    #APP
3392; CHECK-NEXT:    nop
3393; CHECK-NEXT:    #NO_APP
3394; CHECK-NEXT:    vpslld {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 16-byte Folded Reload
3395; CHECK-NEXT:    retq
3396  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
3397  %2 = call <8 x i32> @llvm.x86.avx2.psll.d(<8 x i32> %a0, <4 x i32> %a1)
3398  ret <8 x i32> %2
3399}
3400declare <8 x i32> @llvm.x86.avx2.psll.d(<8 x i32>, <4 x i32>) nounwind readnone
3401
; Stack-fold test for 128-bit vpslldq. The shufflevector takes twelve lanes from a zero
; vector followed by the first four bytes of %a; the CHECK lines expect this to become
; vpslldq $12 with the spilled xmm0 folded in as the memory operand.
3402define <16 x i8> @stack_fold_pslldq(<16 x i8> %a) {
3403; CHECK-LABEL: stack_fold_pslldq:
3404; CHECK:       # %bb.0:
3405; CHECK-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3406; CHECK-NEXT:    #APP
3407; CHECK-NEXT:    nop
3408; CHECK-NEXT:    #NO_APP
3409; CHECK-NEXT:    vpslldq $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
3410; CHECK-NEXT:    # xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,mem[0,1,2,3]
3411; CHECK-NEXT:    retq
3412  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
3413  %2 = shufflevector <16 x i8> zeroinitializer, <16 x i8> %a, <16 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 16, i32 17, i32 18, i32 19>
3414  ret <16 x i8> %2
3415}
3416
; Stack-fold test for 256-bit vpslldq. In each 16-byte half the shufflevector takes
; fifteen lanes from a zero vector followed by one byte of %a (indices 32 and 48, i.e.
; %a[0] and %a[16]); the CHECK lines expect vpslldq $15 with the spilled ymm0 folded in.
3417define <32 x i8> @stack_fold_pslldq_ymm(<32 x i8> %a) {
3418; CHECK-LABEL: stack_fold_pslldq_ymm:
3419; CHECK:       # %bb.0:
3420; CHECK-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
3421; CHECK-NEXT:    #APP
3422; CHECK-NEXT:    nop
3423; CHECK-NEXT:    #NO_APP
3424; CHECK-NEXT:    vpslldq $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Folded Reload
3425; CHECK-NEXT:    # ymm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,mem[0],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,mem[16]
3426; CHECK-NEXT:    retq
3427  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
3428  %2 = shufflevector <32 x i8> zeroinitializer, <32 x i8> %a, <32 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 32, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 48>
3429  ret <32 x i8> %2
3430}
3431
; Stack-fold test for 128-bit vpsllq via the llvm.x86.sse2.psll.q intrinsic. The nop asm
; clobbers xmm2-xmm31 and flags; the CHECK lines expect xmm1 to be spilled and its
; reload folded into vpsllq as the memory operand.
3432define <2 x i64> @stack_fold_psllq(<2 x i64> %a0, <2 x i64> %a1) {
3433; CHECK-LABEL: stack_fold_psllq:
3434; CHECK:       # %bb.0:
3435; CHECK-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3436; CHECK-NEXT:    #APP
3437; CHECK-NEXT:    nop
3438; CHECK-NEXT:    #NO_APP
3439; CHECK-NEXT:    vpsllq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
3440; CHECK-NEXT:    retq
3441  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
3442  %2 = call <2 x i64> @llvm.x86.sse2.psll.q(<2 x i64> %a0, <2 x i64> %a1)
3443  ret <2 x i64> %2
3444}
3445declare <2 x i64> @llvm.x86.sse2.psll.q(<2 x i64>, <2 x i64>) nounwind readnone
3446
; Stack-fold test for 256-bit vpsllq via the llvm.x86.avx2.psll.q intrinsic. The second
; operand is a 128-bit <2 x i64>, so the CHECK lines expect a 16-byte spill of xmm1 and
; a 16-byte folded reload into the ymm form of vpsllq.
3447define <4 x i64> @stack_fold_psllq_ymm(<4 x i64> %a0, <2 x i64> %a1) {
3448; CHECK-LABEL: stack_fold_psllq_ymm:
3449; CHECK:       # %bb.0:
3450; CHECK-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3451; CHECK-NEXT:    #APP
3452; CHECK-NEXT:    nop
3453; CHECK-NEXT:    #NO_APP
3454; CHECK-NEXT:    vpsllq {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 16-byte Folded Reload
3455; CHECK-NEXT:    retq
3456  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
3457  %2 = call <4 x i64> @llvm.x86.avx2.psll.q(<4 x i64> %a0, <2 x i64> %a1)
3458  ret <4 x i64> %2
3459}
3460declare <4 x i64> @llvm.x86.avx2.psll.q(<4 x i64>, <2 x i64>) nounwind readnone
3461
; Stack-fold test for 128-bit vpsllvd via the llvm.x86.avx2.psllv.d intrinsic. The nop
; asm clobbers xmm2-xmm31 and flags; the CHECK lines expect xmm1 to be spilled and its
; reload folded into vpsllvd as the memory operand.
3462define <4 x i32> @stack_fold_psllvd(<4 x i32> %a0, <4 x i32> %a1) {
3463; CHECK-LABEL: stack_fold_psllvd:
3464; CHECK:       # %bb.0:
3465; CHECK-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3466; CHECK-NEXT:    #APP
3467; CHECK-NEXT:    nop
3468; CHECK-NEXT:    #NO_APP
3469; CHECK-NEXT:    vpsllvd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
3470; CHECK-NEXT:    retq
3471  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
3472  %2 = call <4 x i32> @llvm.x86.avx2.psllv.d(<4 x i32> %a0, <4 x i32> %a1)
3473  ret <4 x i32> %2
3474}
3475declare <4 x i32> @llvm.x86.avx2.psllv.d(<4 x i32>, <4 x i32>) nounwind readnone
3476
; Stack-fold test for 256-bit vpsllvd via the llvm.x86.avx2.psllv.d.256 intrinsic. Both
; operands are 256-bit, so the CHECK lines expect a 32-byte spill of ymm1 and a 32-byte
; folded reload into vpsllvd.
3477define <8 x i32> @stack_fold_psllvd_ymm(<8 x i32> %a0, <8 x i32> %a1) {
3478; CHECK-LABEL: stack_fold_psllvd_ymm:
3479; CHECK:       # %bb.0:
3480; CHECK-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
3481; CHECK-NEXT:    #APP
3482; CHECK-NEXT:    nop
3483; CHECK-NEXT:    #NO_APP
3484; CHECK-NEXT:    vpsllvd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
3485; CHECK-NEXT:    retq
3486  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
3487  %2 = call <8 x i32> @llvm.x86.avx2.psllv.d.256(<8 x i32> %a0, <8 x i32> %a1)
3488  ret <8 x i32> %2
3489}
3490declare <8 x i32> @llvm.x86.avx2.psllv.d.256(<8 x i32>, <8 x i32>) nounwind readnone
3491
; Stack-fold test for 128-bit vpsllvq via the llvm.x86.avx2.psllv.q intrinsic. The nop
; asm clobbers xmm2-xmm31 and flags; the CHECK lines expect xmm1 to be spilled and its
; reload folded into vpsllvq as the memory operand.
3492define <2 x i64> @stack_fold_psllvq(<2 x i64> %a0, <2 x i64> %a1) {
3493; CHECK-LABEL: stack_fold_psllvq:
3494; CHECK:       # %bb.0:
3495; CHECK-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3496; CHECK-NEXT:    #APP
3497; CHECK-NEXT:    nop
3498; CHECK-NEXT:    #NO_APP
3499; CHECK-NEXT:    vpsllvq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
3500; CHECK-NEXT:    retq
3501  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
3502  %2 = call <2 x i64> @llvm.x86.avx2.psllv.q(<2 x i64> %a0, <2 x i64> %a1)
3503  ret <2 x i64> %2
3504}
3505declare <2 x i64> @llvm.x86.avx2.psllv.q(<2 x i64>, <2 x i64>) nounwind readnone
3506
; Stack-fold test for 256-bit vpsllvq via the llvm.x86.avx2.psllv.q.256 intrinsic. Both
; operands are 256-bit, so the CHECK lines expect a 32-byte spill of ymm1 and a 32-byte
; folded reload into vpsllvq.
3507define <4 x i64> @stack_fold_psllvq_ymm(<4 x i64> %a0, <4 x i64> %a1) {
3508; CHECK-LABEL: stack_fold_psllvq_ymm:
3509; CHECK:       # %bb.0:
3510; CHECK-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
3511; CHECK-NEXT:    #APP
3512; CHECK-NEXT:    nop
3513; CHECK-NEXT:    #NO_APP
3514; CHECK-NEXT:    vpsllvq {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
3515; CHECK-NEXT:    retq
3516  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
3517  %2 = call <4 x i64> @llvm.x86.avx2.psllv.q.256(<4 x i64> %a0, <4 x i64> %a1)
3518  ret <4 x i64> %2
3519}
3520declare <4 x i64> @llvm.x86.avx2.psllv.q.256(<4 x i64>, <4 x i64>) nounwind readnone
3521
; Stack-fold test for 128-bit vpsllvw via the llvm.x86.avx512.psllv.w.128 intrinsic. The
; nop asm clobbers xmm2-xmm31 and flags; the CHECK lines expect xmm1 to be spilled and
; its reload folded into vpsllvw as the memory operand.
3522define <8 x i16> @stack_fold_psllvw(<8 x i16> %a0, <8 x i16> %a1) {
3523; CHECK-LABEL: stack_fold_psllvw:
3524; CHECK:       # %bb.0:
3525; CHECK-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3526; CHECK-NEXT:    #APP
3527; CHECK-NEXT:    nop
3528; CHECK-NEXT:    #NO_APP
3529; CHECK-NEXT:    vpsllvw {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
3530; CHECK-NEXT:    retq
3531  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
3532  %2 = call <8 x i16> @llvm.x86.avx512.psllv.w.128(<8 x i16> %a0, <8 x i16> %a1)
3533  ret <8 x i16> %2
3534}
3535declare <8 x i16> @llvm.x86.avx512.psllv.w.128(<8 x i16>, <8 x i16>) nounwind readnone
3536
; Stack-fold test for 256-bit vpsllvw via the llvm.x86.avx512.psllv.w.256 intrinsic. Both
; operands are 256-bit, so the CHECK lines expect a 32-byte spill of ymm1 and a 32-byte
; folded reload into vpsllvw.
3537define <16 x i16> @stack_fold_psllvw_ymm(<16 x i16> %a0, <16 x i16> %a1) {
3538; CHECK-LABEL: stack_fold_psllvw_ymm:
3539; CHECK:       # %bb.0:
3540; CHECK-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
3541; CHECK-NEXT:    #APP
3542; CHECK-NEXT:    nop
3543; CHECK-NEXT:    #NO_APP
3544; CHECK-NEXT:    vpsllvw {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
3545; CHECK-NEXT:    retq
3546  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
3547  %2 = call <16 x i16> @llvm.x86.avx512.psllv.w.256(<16 x i16> %a0, <16 x i16> %a1)
3548  ret <16 x i16> %2
3549}
3550declare <16 x i16> @llvm.x86.avx512.psllv.w.256(<16 x i16>, <16 x i16>) nounwind readnone
3551
; Stack-fold test for 128-bit vpsllw via the llvm.x86.sse2.psll.w intrinsic. The nop asm
; clobbers xmm2-xmm31 and flags; the CHECK lines expect xmm1 to be spilled and its
; reload folded into vpsllw as the memory operand.
3552define <8 x i16> @stack_fold_psllw(<8 x i16> %a0, <8 x i16> %a1) {
3553; CHECK-LABEL: stack_fold_psllw:
3554; CHECK:       # %bb.0:
3555; CHECK-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3556; CHECK-NEXT:    #APP
3557; CHECK-NEXT:    nop
3558; CHECK-NEXT:    #NO_APP
3559; CHECK-NEXT:    vpsllw {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
3560; CHECK-NEXT:    retq
3561  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
3562  %2 = call <8 x i16> @llvm.x86.sse2.psll.w(<8 x i16> %a0, <8 x i16> %a1)
3563  ret <8 x i16> %2
3564}
3565declare <8 x i16> @llvm.x86.sse2.psll.w(<8 x i16>, <8 x i16>) nounwind readnone
3566
; Stack-fold test for 256-bit vpsllw via the llvm.x86.avx2.psll.w intrinsic. The second
; operand is a 128-bit <8 x i16>, so the CHECK lines expect a 16-byte spill of xmm1 and
; a 16-byte folded reload into the ymm form of vpsllw.
3567define <16 x i16> @stack_fold_psllw_ymm(<16 x i16> %a0, <8 x i16> %a1) {
3568; CHECK-LABEL: stack_fold_psllw_ymm:
3569; CHECK:       # %bb.0:
3570; CHECK-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3571; CHECK-NEXT:    #APP
3572; CHECK-NEXT:    nop
3573; CHECK-NEXT:    #NO_APP
3574; CHECK-NEXT:    vpsllw {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 16-byte Folded Reload
3575; CHECK-NEXT:    retq
3576  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
3577  %2 = call <16 x i16> @llvm.x86.avx2.psll.w(<16 x i16> %a0, <8 x i16> %a1)
3578  ret <16 x i16> %2
3579}
3580declare <16 x i16> @llvm.x86.avx2.psll.w(<16 x i16>, <8 x i16>) nounwind readnone
3581
; Stack-fold test for 128-bit vpsrad via the llvm.x86.sse2.psra.d intrinsic. The nop asm
; clobbers xmm2-xmm31 and flags; the CHECK lines expect xmm1 to be spilled and its
; reload folded into vpsrad as the memory operand.
3582define <4 x i32> @stack_fold_psrad(<4 x i32> %a0, <4 x i32> %a1) {
3583; CHECK-LABEL: stack_fold_psrad:
3584; CHECK:       # %bb.0:
3585; CHECK-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3586; CHECK-NEXT:    #APP
3587; CHECK-NEXT:    nop
3588; CHECK-NEXT:    #NO_APP
3589; CHECK-NEXT:    vpsrad {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
3590; CHECK-NEXT:    retq
3591  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
3592  %2 = call <4 x i32> @llvm.x86.sse2.psra.d(<4 x i32> %a0, <4 x i32> %a1)
3593  ret <4 x i32> %2
3594}
3595declare <4 x i32> @llvm.x86.sse2.psra.d(<4 x i32>, <4 x i32>) nounwind readnone
3596
; Stack-fold test for 256-bit vpsrad via the llvm.x86.avx2.psra.d intrinsic. The second
; operand is a 128-bit <4 x i32>, so the CHECK lines expect a 16-byte spill of xmm1 and
; a 16-byte folded reload into the ymm form of vpsrad.
3597define <8 x i32> @stack_fold_psrad_ymm(<8 x i32> %a0, <4 x i32> %a1) {
3598; CHECK-LABEL: stack_fold_psrad_ymm:
3599; CHECK:       # %bb.0:
3600; CHECK-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3601; CHECK-NEXT:    #APP
3602; CHECK-NEXT:    nop
3603; CHECK-NEXT:    #NO_APP
3604; CHECK-NEXT:    vpsrad {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 16-byte Folded Reload
3605; CHECK-NEXT:    retq
3606  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
3607  %2 = call <8 x i32> @llvm.x86.avx2.psra.d(<8 x i32> %a0, <4 x i32> %a1)
3608  ret <8 x i32> %2
3609}
3610declare <8 x i32> @llvm.x86.avx2.psra.d(<8 x i32>, <4 x i32>) nounwind readnone
3611
; Stack-fold test for 128-bit vpsraq via the llvm.x86.avx512.psra.q.128 intrinsic. The
; nop asm clobbers xmm2-xmm31 and flags; the CHECK lines expect xmm1 to be spilled and
; its reload folded into vpsraq as the memory operand.
3612define <2 x i64> @stack_fold_psraq(<2 x i64> %a0, <2 x i64> %a1) {
3613; CHECK-LABEL: stack_fold_psraq:
3614; CHECK:       # %bb.0:
3615; CHECK-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3616; CHECK-NEXT:    #APP
3617; CHECK-NEXT:    nop
3618; CHECK-NEXT:    #NO_APP
3619; CHECK-NEXT:    vpsraq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
3620; CHECK-NEXT:    retq
3621  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
3622  %2 = call <2 x i64> @llvm.x86.avx512.psra.q.128(<2 x i64> %a0, <2 x i64> %a1)
3623  ret <2 x i64> %2
3624}
3625declare <2 x i64> @llvm.x86.avx512.psra.q.128(<2 x i64>, <2 x i64>) nounwind readnone
3626
; Stack reload folding test for 256-bit vpsraq via @llvm.x86.avx512.psra.q.256 (128-bit shift-count operand).
; The side-effecting "nop" asm clobbers xmm2-xmm31; the CHECK lines expect xmm1 to be spilled
; before it and the 16-byte spill slot to be used directly as vpsraq's memory operand.
; The asm result is unused; it is typed <2 x i64> to match the other tests in this file.
3627define <4 x i64> @stack_fold_psraq_ymm(<4 x i64> %a0, <2 x i64> %a1) {
3628; CHECK-LABEL: stack_fold_psraq_ymm:
3629; CHECK:       # %bb.0:
3630; CHECK-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3631; CHECK-NEXT:    #APP
3632; CHECK-NEXT:    nop
3633; CHECK-NEXT:    #NO_APP
3634; CHECK-NEXT:    vpsraq {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 16-byte Folded Reload
3635; CHECK-NEXT:    retq
3636  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
3637  %2 = call <4 x i64> @llvm.x86.avx512.psra.q.256(<4 x i64> %a0, <2 x i64> %a1)
3638  ret <4 x i64> %2
3639}
3640declare <4 x i64> @llvm.x86.avx512.psra.q.256(<4 x i64>, <2 x i64>) nounwind readnone
3641
; Stack reload folding test for 128-bit vpsravd (per-element shift counts) via @llvm.x86.avx2.psrav.d.
; The side-effecting "nop" asm clobbers xmm2-xmm31; the CHECK lines expect xmm1 to be spilled
; before it and the 16-byte spill slot to be used directly as vpsravd's memory operand.
3642define <4 x i32> @stack_fold_psravd(<4 x i32> %a0, <4 x i32> %a1) {
3643; CHECK-LABEL: stack_fold_psravd:
3644; CHECK:       # %bb.0:
3645; CHECK-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3646; CHECK-NEXT:    #APP
3647; CHECK-NEXT:    nop
3648; CHECK-NEXT:    #NO_APP
3649; CHECK-NEXT:    vpsravd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
3650; CHECK-NEXT:    retq
3651  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
3652  %2 = call <4 x i32> @llvm.x86.avx2.psrav.d(<4 x i32> %a0, <4 x i32> %a1)
3653  ret <4 x i32> %2
3654}
3655declare <4 x i32> @llvm.x86.avx2.psrav.d(<4 x i32>, <4 x i32>) nounwind readnone
3656
; Stack reload folding test for 256-bit vpsravd via @llvm.x86.avx2.psrav.d.256.
; The side-effecting "nop" asm clobbers xmm2-xmm31; the CHECK lines expect ymm1 to be spilled
; before it and the 32-byte spill slot to be used directly as vpsravd's memory operand.
3657define <8 x i32> @stack_fold_psravd_ymm(<8 x i32> %a0, <8 x i32> %a1) {
3658; CHECK-LABEL: stack_fold_psravd_ymm:
3659; CHECK:       # %bb.0:
3660; CHECK-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
3661; CHECK-NEXT:    #APP
3662; CHECK-NEXT:    nop
3663; CHECK-NEXT:    #NO_APP
3664; CHECK-NEXT:    vpsravd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
3665; CHECK-NEXT:    retq
3666  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
3667  %2 = call <8 x i32> @llvm.x86.avx2.psrav.d.256(<8 x i32> %a0, <8 x i32> %a1)
3668  ret <8 x i32> %2
3669}
3670declare <8 x i32> @llvm.x86.avx2.psrav.d.256(<8 x i32>, <8 x i32>) nounwind readnone
3671
; Stack reload folding test for 128-bit vpsravq via @llvm.x86.avx512.psrav.q.128.
; The side-effecting "nop" asm clobbers xmm2-xmm31; the CHECK lines expect xmm1 to be spilled
; before it and the 16-byte spill slot to be used directly as vpsravq's memory operand.
3672define <2 x i64> @stack_fold_psravq(<2 x i64> %a0, <2 x i64> %a1) {
3673; CHECK-LABEL: stack_fold_psravq:
3674; CHECK:       # %bb.0:
3675; CHECK-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3676; CHECK-NEXT:    #APP
3677; CHECK-NEXT:    nop
3678; CHECK-NEXT:    #NO_APP
3679; CHECK-NEXT:    vpsravq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
3680; CHECK-NEXT:    retq
3681  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
3682  %2 = call <2 x i64> @llvm.x86.avx512.psrav.q.128(<2 x i64> %a0, <2 x i64> %a1)
3683  ret <2 x i64> %2
3684}
3685declare <2 x i64> @llvm.x86.avx512.psrav.q.128(<2 x i64>, <2 x i64>) nounwind readnone
3686
; Stack reload folding test for 256-bit vpsravq via @llvm.x86.avx512.psrav.q.256.
; The side-effecting "nop" asm clobbers xmm2-xmm31; the CHECK lines expect ymm1 to be spilled
; before it and the 32-byte spill slot to be used directly as vpsravq's memory operand.
3687define <4 x i64> @stack_fold_psravq_ymm(<4 x i64> %a0, <4 x i64> %a1) {
3688; CHECK-LABEL: stack_fold_psravq_ymm:
3689; CHECK:       # %bb.0:
3690; CHECK-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
3691; CHECK-NEXT:    #APP
3692; CHECK-NEXT:    nop
3693; CHECK-NEXT:    #NO_APP
3694; CHECK-NEXT:    vpsravq {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
3695; CHECK-NEXT:    retq
3696  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
3697  %2 = call <4 x i64> @llvm.x86.avx512.psrav.q.256(<4 x i64> %a0, <4 x i64> %a1)
3698  ret <4 x i64> %2
3699}
3700declare <4 x i64> @llvm.x86.avx512.psrav.q.256(<4 x i64>, <4 x i64>) nounwind readnone
3701
; Stack reload folding test for 128-bit vpsravw via @llvm.x86.avx512.psrav.w.128.
; The side-effecting "nop" asm clobbers xmm2-xmm31; the CHECK lines expect xmm1 to be spilled
; before it and the 16-byte spill slot to be used directly as vpsravw's memory operand.
3702define <8 x i16> @stack_fold_psravw(<8 x i16> %a0, <8 x i16> %a1) {
3703; CHECK-LABEL: stack_fold_psravw:
3704; CHECK:       # %bb.0:
3705; CHECK-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3706; CHECK-NEXT:    #APP
3707; CHECK-NEXT:    nop
3708; CHECK-NEXT:    #NO_APP
3709; CHECK-NEXT:    vpsravw {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
3710; CHECK-NEXT:    retq
3711  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
3712  %2 = call <8 x i16> @llvm.x86.avx512.psrav.w.128(<8 x i16> %a0, <8 x i16> %a1)
3713  ret <8 x i16> %2
3714}
3715declare <8 x i16> @llvm.x86.avx512.psrav.w.128(<8 x i16>, <8 x i16>) nounwind readnone
3716
; Stack reload folding test for 256-bit vpsravw via @llvm.x86.avx512.psrav.w.256.
; The side-effecting "nop" asm clobbers xmm2-xmm31; the CHECK lines expect ymm1 to be spilled
; before it and the 32-byte spill slot to be used directly as vpsravw's memory operand.
3717define <16 x i16> @stack_fold_psravw_ymm(<16 x i16> %a0, <16 x i16> %a1) {
3718; CHECK-LABEL: stack_fold_psravw_ymm:
3719; CHECK:       # %bb.0:
3720; CHECK-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
3721; CHECK-NEXT:    #APP
3722; CHECK-NEXT:    nop
3723; CHECK-NEXT:    #NO_APP
3724; CHECK-NEXT:    vpsravw {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
3725; CHECK-NEXT:    retq
3726  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
3727  %2 = call <16 x i16> @llvm.x86.avx512.psrav.w.256(<16 x i16> %a0, <16 x i16> %a1)
3728  ret <16 x i16> %2
3729}
3730declare <16 x i16> @llvm.x86.avx512.psrav.w.256(<16 x i16>, <16 x i16>) nounwind readnone
3731
; Stack reload folding test for 128-bit vpsraw via @llvm.x86.sse2.psra.w.
; The side-effecting "nop" asm clobbers xmm2-xmm31; the CHECK lines expect xmm1 to be spilled
; before it and the 16-byte spill slot to be used directly as vpsraw's memory operand.
3732define <8 x i16> @stack_fold_psraw(<8 x i16> %a0, <8 x i16> %a1) {
3733; CHECK-LABEL: stack_fold_psraw:
3734; CHECK:       # %bb.0:
3735; CHECK-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3736; CHECK-NEXT:    #APP
3737; CHECK-NEXT:    nop
3738; CHECK-NEXT:    #NO_APP
3739; CHECK-NEXT:    vpsraw {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
3740; CHECK-NEXT:    retq
3741  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
3742  %2 = call <8 x i16> @llvm.x86.sse2.psra.w(<8 x i16> %a0, <8 x i16> %a1)
3743  ret <8 x i16> %2
3744}
3745declare <8 x i16> @llvm.x86.sse2.psra.w(<8 x i16>, <8 x i16>) nounwind readnone
3746
; Stack reload folding test for 256-bit vpsraw via @llvm.x86.avx2.psra.w (128-bit shift-count operand).
; The side-effecting "nop" asm clobbers xmm2-xmm31; the CHECK lines expect xmm1 to be spilled
; before it and the 16-byte spill slot to be used directly as vpsraw's memory operand.
3747define <16 x i16> @stack_fold_psraw_ymm(<16 x i16> %a0, <8 x i16> %a1) {
3748; CHECK-LABEL: stack_fold_psraw_ymm:
3749; CHECK:       # %bb.0:
3750; CHECK-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3751; CHECK-NEXT:    #APP
3752; CHECK-NEXT:    nop
3753; CHECK-NEXT:    #NO_APP
3754; CHECK-NEXT:    vpsraw {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 16-byte Folded Reload
3755; CHECK-NEXT:    retq
3756  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
3757  %2 = call <16 x i16> @llvm.x86.avx2.psra.w(<16 x i16> %a0, <8 x i16> %a1)
3758  ret <16 x i16> %2
3759}
3760declare <16 x i16> @llvm.x86.avx2.psra.w(<16 x i16>, <8 x i16>) nounwind readnone
3761
; Stack reload folding test for 128-bit vpsrld via @llvm.x86.sse2.psrl.d.
; The side-effecting "nop" asm clobbers xmm2-xmm31; the CHECK lines expect xmm1 to be spilled
; before it and the 16-byte spill slot to be used directly as vpsrld's memory operand.
3762define <4 x i32> @stack_fold_psrld(<4 x i32> %a0, <4 x i32> %a1) {
3763; CHECK-LABEL: stack_fold_psrld:
3764; CHECK:       # %bb.0:
3765; CHECK-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3766; CHECK-NEXT:    #APP
3767; CHECK-NEXT:    nop
3768; CHECK-NEXT:    #NO_APP
3769; CHECK-NEXT:    vpsrld {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
3770; CHECK-NEXT:    retq
3771  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
3772  %2 = call <4 x i32> @llvm.x86.sse2.psrl.d(<4 x i32> %a0, <4 x i32> %a1)
3773  ret <4 x i32> %2
3774}
3775declare <4 x i32> @llvm.x86.sse2.psrl.d(<4 x i32>, <4 x i32>) nounwind readnone
3776
; Stack reload folding test for 256-bit vpsrld via @llvm.x86.avx2.psrl.d (128-bit shift-count operand).
; Unlike most neighbouring tests, the "nop" asm here clobbers xmm1 as well (xmm1-xmm31). The CHECK
; lines therefore expect two spills (xmm1 and ymm0), a plain vmovdqu reload of ymm0 after the asm,
; and the 16-byte slot to be folded into vpsrld as its memory operand.
3777define <8 x i32> @stack_fold_psrld_ymm(<8 x i32> %a0, <4 x i32> %a1) {
3778; CHECK-LABEL: stack_fold_psrld_ymm:
3779; CHECK:       # %bb.0:
3780; CHECK-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3781; CHECK-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
3782; CHECK-NEXT:    #APP
3783; CHECK-NEXT:    nop
3784; CHECK-NEXT:    #NO_APP
3785; CHECK-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
3786; CHECK-NEXT:    vpsrld {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 16-byte Folded Reload
3787; CHECK-NEXT:    retq
3788  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
3789  %2 = call <8 x i32> @llvm.x86.avx2.psrl.d(<8 x i32> %a0, <4 x i32> %a1)
3790  ret <8 x i32> %2
3791}
3792declare <8 x i32> @llvm.x86.avx2.psrl.d(<8 x i32>, <4 x i32>) nounwind readnone
3793
; Stack reload folding test for 128-bit vpsrldq, expressed as a shufflevector.
; The first shuffle operand is zeroinitializer, so mask indices 28-31 pick bytes 12-15 of %a and
; every index below 16 yields a zero byte; index 0 is used uniformly for those zero lanes.
; The side-effecting "nop" asm clobbers xmm1-xmm31; the CHECK lines expect xmm0 to be spilled
; before it and the 16-byte spill slot to be used directly as vpsrldq's memory operand.
3794define <16 x i8> @stack_fold_psrldq(<16 x i8> %a) {
3795; CHECK-LABEL: stack_fold_psrldq:
3796; CHECK:       # %bb.0:
3797; CHECK-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3798; CHECK-NEXT:    #APP
3799; CHECK-NEXT:    nop
3800; CHECK-NEXT:    #NO_APP
3801; CHECK-NEXT:    vpsrldq $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
3802; CHECK-NEXT:    # xmm0 = mem[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
3803; CHECK-NEXT:    retq
3804  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
3805  %2 = shufflevector <16 x i8> zeroinitializer, <16 x i8> %a, <16 x i32> <i32 28, i32 29, i32 30, i32 31, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
3806  ret <16 x i8> %2
3807}
3808
; Stack reload folding test for 256-bit vpsrldq, expressed as a shufflevector.
; The first shuffle operand is zeroinitializer, so mask indices 47 and 63 pick bytes 15 and 31 of %a
; and every index 0 yields a zero byte (matching the mem[15]/mem[31] pattern in the CHECK comment).
; The side-effecting "nop" asm clobbers xmm1-xmm31; the CHECK lines expect ymm0 to be spilled
; before it and the 32-byte spill slot to be used directly as vpsrldq's memory operand.
3809define <32 x i8> @stack_fold_psrldq_ymm(<32 x i8> %a) {
3810; CHECK-LABEL: stack_fold_psrldq_ymm:
3811; CHECK:       # %bb.0:
3812; CHECK-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
3813; CHECK-NEXT:    #APP
3814; CHECK-NEXT:    nop
3815; CHECK-NEXT:    #NO_APP
3816; CHECK-NEXT:    vpsrldq $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Folded Reload
3817; CHECK-NEXT:    # ymm0 = mem[15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,mem[31],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
3818; CHECK-NEXT:    retq
3819  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
3820  %2 = shufflevector <32 x i8> zeroinitializer, <32 x i8> %a, <32 x i32> <i32 47, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 63, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
3821  ret <32 x i8> %2
3822}
3823
; Stack reload folding test for 128-bit vpsrlq via @llvm.x86.sse2.psrl.q.
; The side-effecting "nop" asm clobbers xmm2-xmm31; the CHECK lines expect xmm1 to be spilled
; before it and the 16-byte spill slot to be used directly as vpsrlq's memory operand.
3824define <2 x i64> @stack_fold_psrlq(<2 x i64> %a0, <2 x i64> %a1) {
3825; CHECK-LABEL: stack_fold_psrlq:
3826; CHECK:       # %bb.0:
3827; CHECK-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3828; CHECK-NEXT:    #APP
3829; CHECK-NEXT:    nop
3830; CHECK-NEXT:    #NO_APP
3831; CHECK-NEXT:    vpsrlq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
3832; CHECK-NEXT:    retq
3833  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
3834  %2 = call <2 x i64> @llvm.x86.sse2.psrl.q(<2 x i64> %a0, <2 x i64> %a1)
3835  ret <2 x i64> %2
3836}
3837declare <2 x i64> @llvm.x86.sse2.psrl.q(<2 x i64>, <2 x i64>) nounwind readnone
3838
; Stack reload folding test for 256-bit vpsrlq via @llvm.x86.avx2.psrl.q (128-bit shift-count operand).
; The side-effecting "nop" asm clobbers xmm2-xmm31; the CHECK lines expect xmm1 to be spilled
; before it and the 16-byte spill slot to be used directly as vpsrlq's memory operand.
3839define <4 x i64> @stack_fold_psrlq_ymm(<4 x i64> %a0, <2 x i64> %a1) {
3840; CHECK-LABEL: stack_fold_psrlq_ymm:
3841; CHECK:       # %bb.0:
3842; CHECK-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3843; CHECK-NEXT:    #APP
3844; CHECK-NEXT:    nop
3845; CHECK-NEXT:    #NO_APP
3846; CHECK-NEXT:    vpsrlq {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 16-byte Folded Reload
3847; CHECK-NEXT:    retq
3848  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
3849  %2 = call <4 x i64> @llvm.x86.avx2.psrl.q(<4 x i64> %a0, <2 x i64> %a1)
3850  ret <4 x i64> %2
3851}
3852declare <4 x i64> @llvm.x86.avx2.psrl.q(<4 x i64>, <2 x i64>) nounwind readnone
3853
; Stack reload folding test for 128-bit vpsrlvd via @llvm.x86.avx2.psrlv.d.
; The side-effecting "nop" asm clobbers xmm2-xmm31; the CHECK lines expect xmm1 to be spilled
; before it and the 16-byte spill slot to be used directly as vpsrlvd's memory operand.
3854define <4 x i32> @stack_fold_psrlvd(<4 x i32> %a0, <4 x i32> %a1) {
3855; CHECK-LABEL: stack_fold_psrlvd:
3856; CHECK:       # %bb.0:
3857; CHECK-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3858; CHECK-NEXT:    #APP
3859; CHECK-NEXT:    nop
3860; CHECK-NEXT:    #NO_APP
3861; CHECK-NEXT:    vpsrlvd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
3862; CHECK-NEXT:    retq
3863  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
3864  %2 = call <4 x i32> @llvm.x86.avx2.psrlv.d(<4 x i32> %a0, <4 x i32> %a1)
3865  ret <4 x i32> %2
3866}
3867declare <4 x i32> @llvm.x86.avx2.psrlv.d(<4 x i32>, <4 x i32>) nounwind readnone
3868
; Stack reload folding test for 256-bit vpsrlvd via @llvm.x86.avx2.psrlv.d.256.
; The side-effecting "nop" asm clobbers xmm2-xmm31; the CHECK lines expect ymm1 to be spilled
; before it and the 32-byte spill slot to be used directly as vpsrlvd's memory operand.
3869define <8 x i32> @stack_fold_psrlvd_ymm(<8 x i32> %a0, <8 x i32> %a1) {
3870; CHECK-LABEL: stack_fold_psrlvd_ymm:
3871; CHECK:       # %bb.0:
3872; CHECK-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
3873; CHECK-NEXT:    #APP
3874; CHECK-NEXT:    nop
3875; CHECK-NEXT:    #NO_APP
3876; CHECK-NEXT:    vpsrlvd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
3877; CHECK-NEXT:    retq
3878  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
3879  %2 = call <8 x i32> @llvm.x86.avx2.psrlv.d.256(<8 x i32> %a0, <8 x i32> %a1)
3880  ret <8 x i32> %2
3881}
3882declare <8 x i32> @llvm.x86.avx2.psrlv.d.256(<8 x i32>, <8 x i32>) nounwind readnone
3883
; Stack reload folding test for 128-bit vpsrlvq via @llvm.x86.avx2.psrlv.q.
; The side-effecting "nop" asm clobbers xmm2-xmm31; the CHECK lines expect xmm1 to be spilled
; before it and the 16-byte spill slot to be used directly as vpsrlvq's memory operand.
3884define <2 x i64> @stack_fold_psrlvq(<2 x i64> %a0, <2 x i64> %a1) {
3885; CHECK-LABEL: stack_fold_psrlvq:
3886; CHECK:       # %bb.0:
3887; CHECK-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3888; CHECK-NEXT:    #APP
3889; CHECK-NEXT:    nop
3890; CHECK-NEXT:    #NO_APP
3891; CHECK-NEXT:    vpsrlvq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
3892; CHECK-NEXT:    retq
3893  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
3894  %2 = call <2 x i64> @llvm.x86.avx2.psrlv.q(<2 x i64> %a0, <2 x i64> %a1)
3895  ret <2 x i64> %2
3896}
3897declare <2 x i64> @llvm.x86.avx2.psrlv.q(<2 x i64>, <2 x i64>) nounwind readnone
3898
; Stack reload folding test for 256-bit vpsrlvq via @llvm.x86.avx2.psrlv.q.256.
; The side-effecting "nop" asm clobbers xmm2-xmm31; the CHECK lines expect ymm1 to be spilled
; before it and the 32-byte spill slot to be used directly as vpsrlvq's memory operand.
3899define <4 x i64> @stack_fold_psrlvq_ymm(<4 x i64> %a0, <4 x i64> %a1) {
3900; CHECK-LABEL: stack_fold_psrlvq_ymm:
3901; CHECK:       # %bb.0:
3902; CHECK-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
3903; CHECK-NEXT:    #APP
3904; CHECK-NEXT:    nop
3905; CHECK-NEXT:    #NO_APP
3906; CHECK-NEXT:    vpsrlvq {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
3907; CHECK-NEXT:    retq
3908  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
3909  %2 = call <4 x i64> @llvm.x86.avx2.psrlv.q.256(<4 x i64> %a0, <4 x i64> %a1)
3910  ret <4 x i64> %2
3911}
3912declare <4 x i64> @llvm.x86.avx2.psrlv.q.256(<4 x i64>, <4 x i64>) nounwind readnone
3913
; Stack reload folding test for 128-bit vpsrlvw via @llvm.x86.avx512.psrlv.w.128.
; The side-effecting "nop" asm clobbers xmm2-xmm31; the CHECK lines expect xmm1 to be spilled
; before it and the 16-byte spill slot to be used directly as vpsrlvw's memory operand.
3914define <8 x i16> @stack_fold_psrlvw(<8 x i16> %a0, <8 x i16> %a1) {
3915; CHECK-LABEL: stack_fold_psrlvw:
3916; CHECK:       # %bb.0:
3917; CHECK-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3918; CHECK-NEXT:    #APP
3919; CHECK-NEXT:    nop
3920; CHECK-NEXT:    #NO_APP
3921; CHECK-NEXT:    vpsrlvw {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
3922; CHECK-NEXT:    retq
3923  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
3924  %2 = call <8 x i16> @llvm.x86.avx512.psrlv.w.128(<8 x i16> %a0, <8 x i16> %a1)
3925  ret <8 x i16> %2
3926}
3927declare <8 x i16> @llvm.x86.avx512.psrlv.w.128(<8 x i16>, <8 x i16>) nounwind readnone
3928
; Stack reload folding test for 256-bit vpsrlvw via @llvm.x86.avx512.psrlv.w.256.
; The side-effecting "nop" asm clobbers xmm2-xmm31; the CHECK lines expect ymm1 to be spilled
; before it and the 32-byte spill slot to be used directly as vpsrlvw's memory operand.
3929define <16 x i16> @stack_fold_psrlvw_ymm(<16 x i16> %a0, <16 x i16> %a1) {
3930; CHECK-LABEL: stack_fold_psrlvw_ymm:
3931; CHECK:       # %bb.0:
3932; CHECK-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
3933; CHECK-NEXT:    #APP
3934; CHECK-NEXT:    nop
3935; CHECK-NEXT:    #NO_APP
3936; CHECK-NEXT:    vpsrlvw {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
3937; CHECK-NEXT:    retq
3938  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
3939  %2 = call <16 x i16> @llvm.x86.avx512.psrlv.w.256(<16 x i16> %a0, <16 x i16> %a1)
3940  ret <16 x i16> %2
3941}
3942declare <16 x i16> @llvm.x86.avx512.psrlv.w.256(<16 x i16>, <16 x i16>) nounwind readnone
3943
; Stack reload folding test for 128-bit vpsrlw via @llvm.x86.sse2.psrl.w.
; The side-effecting "nop" asm clobbers xmm2-xmm31; the CHECK lines expect xmm1 to be spilled
; before it and the 16-byte spill slot to be used directly as vpsrlw's memory operand.
3944define <8 x i16> @stack_fold_psrlw(<8 x i16> %a0, <8 x i16> %a1) {
3945; CHECK-LABEL: stack_fold_psrlw:
3946; CHECK:       # %bb.0:
3947; CHECK-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3948; CHECK-NEXT:    #APP
3949; CHECK-NEXT:    nop
3950; CHECK-NEXT:    #NO_APP
3951; CHECK-NEXT:    vpsrlw {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
3952; CHECK-NEXT:    retq
3953  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
3954  %2 = call <8 x i16> @llvm.x86.sse2.psrl.w(<8 x i16> %a0, <8 x i16> %a1)
3955  ret <8 x i16> %2
3956}
3957declare <8 x i16> @llvm.x86.sse2.psrl.w(<8 x i16>, <8 x i16>) nounwind readnone
3958
; Stack reload folding test for 256-bit vpsrlw via @llvm.x86.avx2.psrl.w (128-bit shift-count operand).
; The side-effecting "nop" asm clobbers xmm2-xmm31; the CHECK lines expect xmm1 to be spilled
; before it and the 16-byte spill slot to be used directly as vpsrlw's memory operand.
3959define <16 x i16> @stack_fold_psrlw_ymm(<16 x i16> %a0, <8 x i16> %a1) {
3960; CHECK-LABEL: stack_fold_psrlw_ymm:
3961; CHECK:       # %bb.0:
3962; CHECK-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3963; CHECK-NEXT:    #APP
3964; CHECK-NEXT:    nop
3965; CHECK-NEXT:    #NO_APP
3966; CHECK-NEXT:    vpsrlw {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 16-byte Folded Reload
3967; CHECK-NEXT:    retq
3968  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
3969  %2 = call <16 x i16> @llvm.x86.avx2.psrl.w(<16 x i16> %a0, <8 x i16> %a1)
3970  ret <16 x i16> %2
3971}
3972declare <16 x i16> @llvm.x86.avx2.psrl.w(<16 x i16>, <8 x i16>) nounwind readnone
3973
; Stack reload folding test for 128-bit vpsubb, generated from a plain IR `sub <16 x i8>`.
; The side-effecting "nop" asm clobbers xmm2-xmm31; the CHECK lines expect xmm1 to be spilled
; before it and the 16-byte spill slot to be used directly as vpsubb's memory operand.
3974define <16 x i8> @stack_fold_psubb(<16 x i8> %a0, <16 x i8> %a1) {
3975; CHECK-LABEL: stack_fold_psubb:
3976; CHECK:       # %bb.0:
3977; CHECK-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3978; CHECK-NEXT:    #APP
3979; CHECK-NEXT:    nop
3980; CHECK-NEXT:    #NO_APP
3981; CHECK-NEXT:    vpsubb {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
3982; CHECK-NEXT:    retq
3983  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
3984  %2 = sub <16 x i8> %a0, %a1
3985  ret <16 x i8> %2
3986}
3987
; Stack reload folding test for 256-bit vpsubb, generated from a plain IR `sub <32 x i8>`.
; The side-effecting "nop" asm clobbers xmm2-xmm31; the CHECK lines expect ymm1 to be spilled
; before it and the 32-byte spill slot to be used directly as vpsubb's memory operand.
3988define <32 x i8> @stack_fold_psubb_ymm(<32 x i8> %a0, <32 x i8> %a1) {
3989; CHECK-LABEL: stack_fold_psubb_ymm:
3990; CHECK:       # %bb.0:
3991; CHECK-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
3992; CHECK-NEXT:    #APP
3993; CHECK-NEXT:    nop
3994; CHECK-NEXT:    #NO_APP
3995; CHECK-NEXT:    vpsubb {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
3996; CHECK-NEXT:    retq
3997  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
3998  %2 = sub <32 x i8> %a0, %a1
3999  ret <32 x i8> %2
4000}
4001
; Stack reload folding test for 128-bit vpsubd, generated from a plain IR `sub <4 x i32>`.
; The side-effecting "nop" asm clobbers xmm2-xmm31; the CHECK lines expect xmm1 to be spilled
; before it and the 16-byte spill slot to be used directly as vpsubd's memory operand.
4002define <4 x i32> @stack_fold_psubd(<4 x i32> %a0, <4 x i32> %a1) {
4003; CHECK-LABEL: stack_fold_psubd:
4004; CHECK:       # %bb.0:
4005; CHECK-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4006; CHECK-NEXT:    #APP
4007; CHECK-NEXT:    nop
4008; CHECK-NEXT:    #NO_APP
4009; CHECK-NEXT:    vpsubd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
4010; CHECK-NEXT:    retq
4011  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
4012  %2 = sub <4 x i32> %a0, %a1
4013  ret <4 x i32> %2
4014}
4015
; Stack reload folding test for 256-bit vpsubd, generated from a plain IR `sub <8 x i32>`.
; The side-effecting "nop" asm clobbers xmm2-xmm31; the CHECK lines expect ymm1 to be spilled
; before it and the 32-byte spill slot to be used directly as vpsubd's memory operand.
4016define <8 x i32> @stack_fold_psubd_ymm(<8 x i32> %a0, <8 x i32> %a1) {
4017; CHECK-LABEL: stack_fold_psubd_ymm:
4018; CHECK:       # %bb.0:
4019; CHECK-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
4020; CHECK-NEXT:    #APP
4021; CHECK-NEXT:    nop
4022; CHECK-NEXT:    #NO_APP
4023; CHECK-NEXT:    vpsubd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
4024; CHECK-NEXT:    retq
4025  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
4026  %2 = sub <8 x i32> %a0, %a1
4027  ret <8 x i32> %2
4028}
4029
; Stack reload folding test for 128-bit vpsubq, generated from a plain IR `sub <2 x i64>`.
; The side-effecting "nop" asm clobbers xmm2-xmm31; the CHECK lines expect xmm1 to be spilled
; before it and the 16-byte spill slot to be used directly as vpsubq's memory operand.
4030define <2 x i64> @stack_fold_psubq(<2 x i64> %a0, <2 x i64> %a1) {
4031; CHECK-LABEL: stack_fold_psubq:
4032; CHECK:       # %bb.0:
4033; CHECK-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4034; CHECK-NEXT:    #APP
4035; CHECK-NEXT:    nop
4036; CHECK-NEXT:    #NO_APP
4037; CHECK-NEXT:    vpsubq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
4038; CHECK-NEXT:    retq
4039  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
4040  %2 = sub <2 x i64> %a0, %a1
4041  ret <2 x i64> %2
4042}
4043
; Stack reload folding test for 256-bit vpsubq, generated from a plain IR `sub <4 x i64>`.
; The side-effecting "nop" asm clobbers xmm2-xmm31; the CHECK lines expect ymm1 to be spilled
; before it and the 32-byte spill slot to be used directly as vpsubq's memory operand.
4044define <4 x i64> @stack_fold_psubq_ymm(<4 x i64> %a0, <4 x i64> %a1) {
4045; CHECK-LABEL: stack_fold_psubq_ymm:
4046; CHECK:       # %bb.0:
4047; CHECK-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
4048; CHECK-NEXT:    #APP
4049; CHECK-NEXT:    nop
4050; CHECK-NEXT:    #NO_APP
4051; CHECK-NEXT:    vpsubq {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
4052; CHECK-NEXT:    retq
4053  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
4054  %2 = sub <4 x i64> %a0, %a1
4055  ret <4 x i64> %2
4056}
4057
; Stack reload folding test for 128-bit vpsubsb, generated from @llvm.ssub.sat.v16i8.
; The side-effecting "nop" asm clobbers xmm2-xmm31; the CHECK lines expect xmm1 to be spilled
; before it and the 16-byte spill slot to be used directly as vpsubsb's memory operand.
4058define <16 x i8> @stack_fold_psubsb(<16 x i8> %a0, <16 x i8> %a1) {
4059; CHECK-LABEL: stack_fold_psubsb:
4060; CHECK:       # %bb.0:
4061; CHECK-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4062; CHECK-NEXT:    #APP
4063; CHECK-NEXT:    nop
4064; CHECK-NEXT:    #NO_APP
4065; CHECK-NEXT:    vpsubsb {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
4066; CHECK-NEXT:    retq
4067  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
4068  %2 = call <16 x i8> @llvm.ssub.sat.v16i8(<16 x i8> %a0, <16 x i8> %a1)
4069  ret <16 x i8> %2
4070}
4071declare <16 x i8> @llvm.ssub.sat.v16i8(<16 x i8>, <16 x i8>) nounwind readnone
4072
; Stack reload folding test for 256-bit vpsubsb, generated from @llvm.ssub.sat.v32i8.
; The side-effecting "nop" asm clobbers xmm2-xmm31; the CHECK lines expect ymm1 to be spilled
; before it and the 32-byte spill slot to be used directly as vpsubsb's memory operand.
4073define <32 x i8> @stack_fold_psubsb_ymm(<32 x i8> %a0, <32 x i8> %a1) {
4074; CHECK-LABEL: stack_fold_psubsb_ymm:
4075; CHECK:       # %bb.0:
4076; CHECK-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
4077; CHECK-NEXT:    #APP
4078; CHECK-NEXT:    nop
4079; CHECK-NEXT:    #NO_APP
4080; CHECK-NEXT:    vpsubsb {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
4081; CHECK-NEXT:    retq
4082  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
4083  %2 = call <32 x i8> @llvm.ssub.sat.v32i8(<32 x i8> %a0, <32 x i8> %a1)
4084  ret <32 x i8> %2
4085}
4086declare <32 x i8> @llvm.ssub.sat.v32i8(<32 x i8>, <32 x i8>) nounwind readnone
4087
; Stack reload folding test for 128-bit vpsubsw, generated from @llvm.ssub.sat.v8i16.
; The side-effecting "nop" asm clobbers xmm2-xmm31; the CHECK lines expect xmm1 to be spilled
; before it and the 16-byte spill slot to be used directly as vpsubsw's memory operand.
4088define <8 x i16> @stack_fold_psubsw(<8 x i16> %a0, <8 x i16> %a1) {
4089; CHECK-LABEL: stack_fold_psubsw:
4090; CHECK:       # %bb.0:
4091; CHECK-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4092; CHECK-NEXT:    #APP
4093; CHECK-NEXT:    nop
4094; CHECK-NEXT:    #NO_APP
4095; CHECK-NEXT:    vpsubsw {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
4096; CHECK-NEXT:    retq
4097  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
4098  %2 = call <8 x i16> @llvm.ssub.sat.v8i16(<8 x i16> %a0, <8 x i16> %a1)
4099  ret <8 x i16> %2
4100}
4101declare <8 x i16> @llvm.ssub.sat.v8i16(<8 x i16>, <8 x i16>) nounwind readnone
4102
; Stack reload folding test for 256-bit vpsubsw, generated from @llvm.ssub.sat.v16i16.
; The side-effecting "nop" asm clobbers xmm2-xmm31; the CHECK lines expect ymm1 to be spilled
; before it and the 32-byte spill slot to be used directly as vpsubsw's memory operand.
4103define <16 x i16> @stack_fold_psubsw_ymm(<16 x i16> %a0, <16 x i16> %a1) {
4104; CHECK-LABEL: stack_fold_psubsw_ymm:
4105; CHECK:       # %bb.0:
4106; CHECK-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
4107; CHECK-NEXT:    #APP
4108; CHECK-NEXT:    nop
4109; CHECK-NEXT:    #NO_APP
4110; CHECK-NEXT:    vpsubsw {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
4111; CHECK-NEXT:    retq
4112  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
4113  %2 = call <16 x i16> @llvm.ssub.sat.v16i16(<16 x i16> %a0, <16 x i16> %a1)
4114  ret <16 x i16> %2
4115}
4116declare <16 x i16> @llvm.ssub.sat.v16i16(<16 x i16>, <16 x i16>) nounwind readnone
4117
; Stack reload folding test for 128-bit vpsubusb, generated from @llvm.usub.sat.v16i8.
; The side-effecting "nop" asm clobbers xmm2-xmm31; the CHECK lines expect xmm1 to be spilled
; before it and the 16-byte spill slot to be used directly as vpsubusb's memory operand.
4118define <16 x i8> @stack_fold_psubusb(<16 x i8> %a0, <16 x i8> %a1) {
4119; CHECK-LABEL: stack_fold_psubusb:
4120; CHECK:       # %bb.0:
4121; CHECK-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4122; CHECK-NEXT:    #APP
4123; CHECK-NEXT:    nop
4124; CHECK-NEXT:    #NO_APP
4125; CHECK-NEXT:    vpsubusb {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
4126; CHECK-NEXT:    retq
4127  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
4128  %2 = call <16 x i8> @llvm.usub.sat.v16i8(<16 x i8> %a0, <16 x i8> %a1)
4129  ret <16 x i8> %2
4130}
4131declare <16 x i8> @llvm.usub.sat.v16i8(<16 x i8>, <16 x i8>) nounwind readnone
4132
; 256-bit form of the unsigned saturating i8 subtract (llvm.usub.sat.v32i8).
; The expected output spills %ymm1 before the asm and folds the 32-byte reload
; into vpsubusb.
4133define <32 x i8> @stack_fold_psubusb_ymm(<32 x i8> %a0, <32 x i8> %a1) {
4134; CHECK-LABEL: stack_fold_psubusb_ymm:
4135; CHECK:       # %bb.0:
4136; CHECK-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
4137; CHECK-NEXT:    #APP
4138; CHECK-NEXT:    nop
4139; CHECK-NEXT:    #NO_APP
4140; CHECK-NEXT:    vpsubusb {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
4141; CHECK-NEXT:    retq
4142  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
4143  %2 = call <32 x i8> @llvm.usub.sat.v32i8(<32 x i8> %a0, <32 x i8> %a1)
4144  ret <32 x i8> %2
4145}
4146declare <32 x i8> @llvm.usub.sat.v32i8(<32 x i8>, <32 x i8>) nounwind readnone
4147
; Unsigned saturating subtract of <8 x i16> via llvm.usub.sat.v8i16.
; The expected output spills %xmm1 before the asm and folds its reload into
; vpsubusw as the memory operand.
4148define <8 x i16> @stack_fold_psubusw(<8 x i16> %a0, <8 x i16> %a1) {
4149; CHECK-LABEL: stack_fold_psubusw:
4150; CHECK:       # %bb.0:
4151; CHECK-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4152; CHECK-NEXT:    #APP
4153; CHECK-NEXT:    nop
4154; CHECK-NEXT:    #NO_APP
4155; CHECK-NEXT:    vpsubusw {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
4156; CHECK-NEXT:    retq
4157  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
4158  %2 = call <8 x i16> @llvm.usub.sat.v8i16(<8 x i16> %a0, <8 x i16> %a1)
4159  ret <8 x i16> %2
4160}
4161declare <8 x i16> @llvm.usub.sat.v8i16(<8 x i16>, <8 x i16>) nounwind readnone
4162
; 256-bit form of the unsigned saturating i16 subtract (llvm.usub.sat.v16i16).
; The expected output spills %ymm1 before the asm and folds the 32-byte reload
; into vpsubusw.
4163define <16 x i16> @stack_fold_psubusw_ymm(<16 x i16> %a0, <16 x i16> %a1) {
4164; CHECK-LABEL: stack_fold_psubusw_ymm:
4165; CHECK:       # %bb.0:
4166; CHECK-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
4167; CHECK-NEXT:    #APP
4168; CHECK-NEXT:    nop
4169; CHECK-NEXT:    #NO_APP
4170; CHECK-NEXT:    vpsubusw {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
4171; CHECK-NEXT:    retq
4172  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
4173  %2 = call <16 x i16> @llvm.usub.sat.v16i16(<16 x i16> %a0, <16 x i16> %a1)
4174  ret <16 x i16> %2
4175}
4176declare <16 x i16> @llvm.usub.sat.v16i16(<16 x i16>, <16 x i16>) nounwind readnone
4177
; Plain (wrapping) subtract of <8 x i16> using the IR `sub` instruction.
; The expected output spills %xmm1 before the asm and folds its reload into
; vpsubw as the memory operand.
4178define <8 x i16> @stack_fold_psubw(<8 x i16> %a0, <8 x i16> %a1) {
4179; CHECK-LABEL: stack_fold_psubw:
4180; CHECK:       # %bb.0:
4181; CHECK-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4182; CHECK-NEXT:    #APP
4183; CHECK-NEXT:    nop
4184; CHECK-NEXT:    #NO_APP
4185; CHECK-NEXT:    vpsubw {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
4186; CHECK-NEXT:    retq
4187  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
4188  %2 = sub <8 x i16> %a0, %a1
4189  ret <8 x i16> %2
4190}
4191
; 256-bit form of the plain i16 subtract (`sub <16 x i16>`).
; The expected output spills %ymm1 before the asm and folds the 32-byte reload
; into vpsubw.
4192define <16 x i16> @stack_fold_psubw_ymm(<16 x i16> %a0, <16 x i16> %a1) {
4193; CHECK-LABEL: stack_fold_psubw_ymm:
4194; CHECK:       # %bb.0:
4195; CHECK-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
4196; CHECK-NEXT:    #APP
4197; CHECK-NEXT:    nop
4198; CHECK-NEXT:    #NO_APP
4199; CHECK-NEXT:    vpsubw {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
4200; CHECK-NEXT:    retq
4201  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
4202  %2 = sub <16 x i16> %a0, %a1
4203  ret <16 x i16> %2
4204}
4205
; Interleave of the high eight bytes of %a0 and %a1 (shuffle indices
; 8,24,9,25,...,15,31). The expected output spills %xmm1 before the asm and
; folds its reload into vpunpckhbw as the memory operand.
4206define <16 x i8> @stack_fold_punpckhbw(<16 x i8> %a0, <16 x i8> %a1) {
4207; CHECK-LABEL: stack_fold_punpckhbw:
4208; CHECK:       # %bb.0:
4209; CHECK-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4210; CHECK-NEXT:    #APP
4211; CHECK-NEXT:    nop
4212; CHECK-NEXT:    #NO_APP
4213; CHECK-NEXT:    vpunpckhbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
4214; CHECK-NEXT:    # xmm0 = xmm0[8],mem[8],xmm0[9],mem[9],xmm0[10],mem[10],xmm0[11],mem[11],xmm0[12],mem[12],xmm0[13],mem[13],xmm0[14],mem[14],xmm0[15],mem[15]
4215; CHECK-NEXT:    retq
4216  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
4217  %2 = shufflevector <16 x i8> %a0, <16 x i8> %a1, <16 x i32> <i32 8, i32 24, i32 9, i32 25, i32 10, i32 26, i32 11, i32 27, i32 12, i32 28, i32 13, i32 29, i32 14, i32 30, i32 15, i32 31>
4218  ret <16 x i8> %2
4219}
4220
; Merge-masked variant of the high-byte interleave: %mask (bitcast to
; <16 x i1>) selects per byte between the shuffle result and a passthru vector
; loaded from %passthru. The expected output loads the passthru into %xmm2 and
; issues a {%k1}-masked vpunpckhbw with the spilled %xmm1 folded in as the
; memory operand.
4221define <16 x i8> @stack_fold_punpckhbw_mask(<16 x i8>* %passthru, <16 x i8> %a0, <16 x i8> %a1, i16 %mask) {
4222; CHECK-LABEL: stack_fold_punpckhbw_mask:
4223; CHECK:       # %bb.0:
4224; CHECK-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4225; CHECK-NEXT:    #APP
4226; CHECK-NEXT:    nop
4227; CHECK-NEXT:    #NO_APP
4228; CHECK-NEXT:    kmovd %esi, %k1
4229; CHECK-NEXT:    vmovdqa (%rdi), %xmm2
4230; CHECK-NEXT:    vpunpckhbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm2 {%k1} # 16-byte Folded Reload
4231; CHECK-NEXT:    # xmm2 {%k1} = xmm0[8],mem[8],xmm0[9],mem[9],xmm0[10],mem[10],xmm0[11],mem[11],xmm0[12],mem[12],xmm0[13],mem[13],xmm0[14],mem[14],xmm0[15],mem[15]
4232; CHECK-NEXT:    vmovdqa %xmm2, %xmm0
4233; CHECK-NEXT:    retq
4234  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
4235  %2 = shufflevector <16 x i8> %a0, <16 x i8> %a1, <16 x i32> <i32 8, i32 24, i32 9, i32 25, i32 10, i32 26, i32 11, i32 27, i32 12, i32 28, i32 13, i32 29, i32 14, i32 30, i32 15, i32 31>
4236  %3 = bitcast i16 %mask to <16 x i1>
4237  ; load needed to keep the operation from being scheduled above the asm block
4238  %4 = load <16 x i8>, <16 x i8>* %passthru
4239  %5 = select <16 x i1> %3, <16 x i8> %2, <16 x i8> %4
4240  ret <16 x i8> %5
4241}
4242
; Zero-masked variant of the high-byte interleave: bytes whose %mask bit is
; clear are taken from zeroinitializer. %passthru is not referenced in the
; body; per the expected output it occupies %xmm0, so %a0/%a1 arrive in
; %xmm1/%xmm2 and %xmm2 is the value spilled and folded into vpunpckhbw.
4243define <16 x i8> @stack_fold_punpckhbw_maskz(<16 x i8> %passthru, <16 x i8> %a0, <16 x i8> %a1, i16 %mask) {
4244; CHECK-LABEL: stack_fold_punpckhbw_maskz:
4245; CHECK:       # %bb.0:
4246; CHECK-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4247; CHECK-NEXT:    #APP
4248; CHECK-NEXT:    nop
4249; CHECK-NEXT:    #NO_APP
4250; CHECK-NEXT:    kmovd %edi, %k1
4251; CHECK-NEXT:    vpunpckhbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 {%k1} {z} # 16-byte Folded Reload
4252; CHECK-NEXT:    # xmm0 {%k1} {z} = xmm1[8],mem[8],xmm1[9],mem[9],xmm1[10],mem[10],xmm1[11],mem[11],xmm1[12],mem[12],xmm1[13],mem[13],xmm1[14],mem[14],xmm1[15],mem[15]
4253; CHECK-NEXT:    retq
4254  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
4255  %2 = shufflevector <16 x i8> %a0, <16 x i8> %a1, <16 x i32> <i32 8, i32 24, i32 9, i32 25, i32 10, i32 26, i32 11, i32 27, i32 12, i32 28, i32 13, i32 29, i32 14, i32 30, i32 15, i32 31>
4256  %3 = bitcast i16 %mask to <16 x i1>
4257  %4 = select <16 x i1> %3, <16 x i8> %2, <16 x i8> zeroinitializer
4258  ret <16 x i8> %4
4259}
4260
; 256-bit high-byte interleave: the shuffle pairs %a0 bytes 8-15 with %a1
; bytes 8-15 (indices 40-47) and %a0 bytes 24-31 with %a1 bytes 24-31
; (indices 56-63). The expected output spills %ymm1 before the asm and folds
; the 32-byte reload into vpunpckhbw.
4261define <32 x i8> @stack_fold_punpckhbw_ymm(<32 x i8> %a0, <32 x i8> %a1) {
4262; CHECK-LABEL: stack_fold_punpckhbw_ymm:
4263; CHECK:       # %bb.0:
4264; CHECK-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
4265; CHECK-NEXT:    #APP
4266; CHECK-NEXT:    nop
4267; CHECK-NEXT:    #NO_APP
4268; CHECK-NEXT:    vpunpckhbw {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
4269; CHECK-NEXT:    # ymm0 = ymm0[8],mem[8],ymm0[9],mem[9],ymm0[10],mem[10],ymm0[11],mem[11],ymm0[12],mem[12],ymm0[13],mem[13],ymm0[14],mem[14],ymm0[15],mem[15],ymm0[24],mem[24],ymm0[25],mem[25],ymm0[26],mem[26],ymm0[27],mem[27],ymm0[28],mem[28],ymm0[29],mem[29],ymm0[30],mem[30],ymm0[31],mem[31]
4270; CHECK-NEXT:    retq
4271  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
4272  %2 = shufflevector <32 x i8> %a0, <32 x i8> %a1, <32 x i32> <i32 8, i32 40, i32 9, i32 41, i32 10, i32 42, i32 11, i32 43, i32 12, i32 44, i32 13, i32 45, i32 14, i32 46, i32 15, i32 47, i32 24, i32 56, i32 25, i32 57, i32 26, i32 58, i32 27, i32 59, i32 28, i32 60, i32 29, i32 61, i32 30, i32 62, i32 31, i32 63>
4273  ret <32 x i8> %2
4274}
4275
; Merge-masked 256-bit high-byte interleave: %mask (bitcast to <32 x i1>)
; selects per byte between the shuffle result and a passthru vector loaded
; from %passthru. The expected output loads the passthru into %ymm2 and issues
; a {%k1}-masked vpunpckhbw with the spilled %ymm1 folded in as the memory
; operand.
4276define <32 x i8> @stack_fold_punpckhbw_mask_ymm(<32 x i8>* %passthru, <32 x i8> %a0, <32 x i8> %a1, i32 %mask) {
4277; CHECK-LABEL: stack_fold_punpckhbw_mask_ymm:
4278; CHECK:       # %bb.0:
4279; CHECK-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
4280; CHECK-NEXT:    #APP
4281; CHECK-NEXT:    nop
4282; CHECK-NEXT:    #NO_APP
4283; CHECK-NEXT:    kmovd %esi, %k1
4284; CHECK-NEXT:    vmovdqa (%rdi), %ymm2
4285; CHECK-NEXT:    vpunpckhbw {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm2 {%k1} # 32-byte Folded Reload
4286; CHECK-NEXT:    # ymm2 {%k1} = ymm0[8],mem[8],ymm0[9],mem[9],ymm0[10],mem[10],ymm0[11],mem[11],ymm0[12],mem[12],ymm0[13],mem[13],ymm0[14],mem[14],ymm0[15],mem[15],ymm0[24],mem[24],ymm0[25],mem[25],ymm0[26],mem[26],ymm0[27],mem[27],ymm0[28],mem[28],ymm0[29],mem[29],ymm0[30],mem[30],ymm0[31],mem[31]
4287; CHECK-NEXT:    vmovdqa %ymm2, %ymm0
4288; CHECK-NEXT:    retq
4289  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
4290  %2 = shufflevector <32 x i8> %a0, <32 x i8> %a1, <32 x i32> <i32 8, i32 40, i32 9, i32 41, i32 10, i32 42, i32 11, i32 43, i32 12, i32 44, i32 13, i32 45, i32 14, i32 46, i32 15, i32 47, i32 24, i32 56, i32 25, i32 57, i32 26, i32 58, i32 27, i32 59, i32 28, i32 60, i32 29, i32 61, i32 30, i32 62, i32 31, i32 63>
4291  %3 = bitcast i32 %mask to <32 x i1>
4292  ; load needed to keep the operation from being scheduled above the asm block
4293  %4 = load <32 x i8>, <32 x i8>* %passthru
4294  %5 = select <32 x i1> %3, <32 x i8> %2, <32 x i8> %4
4295  ret <32 x i8> %5
4296}
4297
; Zero-masked 256-bit high-byte interleave: bytes whose %mask bit is clear are
; taken from zeroinitializer. The expected output spills %ymm1 before the asm
; and folds the 32-byte reload into a {%k1} {z} vpunpckhbw.
4298define <32 x i8> @stack_fold_punpckhbw_maskz_ymm(<32 x i8> %a0, <32 x i8> %a1, i32 %mask) {
4299; CHECK-LABEL: stack_fold_punpckhbw_maskz_ymm:
4300; CHECK:       # %bb.0:
4301; CHECK-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
4302; CHECK-NEXT:    #APP
4303; CHECK-NEXT:    nop
4304; CHECK-NEXT:    #NO_APP
4305; CHECK-NEXT:    kmovd %edi, %k1
4306; CHECK-NEXT:    vpunpckhbw {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 {%k1} {z} # 32-byte Folded Reload
4307; CHECK-NEXT:    # ymm0 {%k1} {z} = ymm0[8],mem[8],ymm0[9],mem[9],ymm0[10],mem[10],ymm0[11],mem[11],ymm0[12],mem[12],ymm0[13],mem[13],ymm0[14],mem[14],ymm0[15],mem[15],ymm0[24],mem[24],ymm0[25],mem[25],ymm0[26],mem[26],ymm0[27],mem[27],ymm0[28],mem[28],ymm0[29],mem[29],ymm0[30],mem[30],ymm0[31],mem[31]
4308; CHECK-NEXT:    retq
4309  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
4310  %2 = shufflevector <32 x i8> %a0, <32 x i8> %a1, <32 x i32> <i32 8, i32 40, i32 9, i32 41, i32 10, i32 42, i32 11, i32 43, i32 12, i32 44, i32 13, i32 45, i32 14, i32 46, i32 15, i32 47, i32 24, i32 56, i32 25, i32 57, i32 26, i32 58, i32 27, i32 59, i32 28, i32 60, i32 29, i32 61, i32 30, i32 62, i32 31, i32 63>
4311  %3 = bitcast i32 %mask to <32 x i1>
4312  %4 = select <32 x i1> %3, <32 x i8> %2, <32 x i8> zeroinitializer
4313  ret <32 x i8> %4
4314}
4315
; Zero-masked shuffle taking elements 2,3 of %a followed by elements 0,1 of
; %b. The asm here also clobbers xmm1, and the expected output spills both
; inputs: one is reloaded into %ymm0 with vmovdqu, the other is folded into
; vshufi64x2 $1. Only the low four bits of the i8 %mask are used (extracted
; from the <8 x i1> bitcast by the second shufflevector).
4316define <4 x i64> @stack_fold_shufi64x2_maskz(<4 x i64> %a, <4 x i64> %b, i8 %mask) {
4317; CHECK-LABEL: stack_fold_shufi64x2_maskz:
4318; CHECK:       # %bb.0:
4319; CHECK-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
4320; CHECK-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
4321; CHECK-NEXT:    #APP
4322; CHECK-NEXT:    nop
4323; CHECK-NEXT:    #NO_APP
4324; CHECK-NEXT:    kmovd %edi, %k1
4325; CHECK-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
4326; CHECK-NEXT:    vshufi64x2 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 {%k1} {z} # 32-byte Folded Reload
4327; CHECK-NEXT:    # ymm0 {%k1} {z} = ymm0[2,3],mem[0,1]
4328; CHECK-NEXT:    retq
4329  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
4330  %2 = shufflevector <4 x i64> %a, <4 x i64> %b, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
4331  %3 = bitcast i8 %mask to <8 x i1>
4332  %4 = shufflevector <8 x i1> %3, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
4333  %5 = select <4 x i1> %4, <4 x i64> %2, <4 x i64> zeroinitializer
4334  ret <4 x i64> %5
4335}
4336
; Zero-masked shuffle taking elements 4-7 of %a followed by elements 0-3 of
; %b. The asm here also clobbers xmm1, and the expected output spills both
; inputs: one is reloaded into %ymm0 with vmovdqu, the other is folded into
; vshufi32x4 $1. All eight bits of the i8 %mask are used as the select mask.
4337define <8 x i32> @stack_fold_shufi32x4_maskz(<8 x i32> %a, <8 x i32> %b, i8 %mask) {
4338; CHECK-LABEL: stack_fold_shufi32x4_maskz:
4339; CHECK:       # %bb.0:
4340; CHECK-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
4341; CHECK-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
4342; CHECK-NEXT:    #APP
4343; CHECK-NEXT:    nop
4344; CHECK-NEXT:    #NO_APP
4345; CHECK-NEXT:    kmovd %edi, %k1
4346; CHECK-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
4347; CHECK-NEXT:    vshufi32x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 {%k1} {z} # 32-byte Folded Reload
4348; CHECK-NEXT:    # ymm0 {%k1} {z} = ymm0[4,5,6,7],mem[0,1,2,3]
4349; CHECK-NEXT:    retq
4350  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
4351  %2 = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
4352  %3 = bitcast i8 %mask to <8 x i1>
4353  %4 = select <8 x i1> %3, <8 x i32> %2, <8 x i32> zeroinitializer
4354  ret <8 x i32> %4
4355}
4356
; Declarations for the AVX-512 conflict-detection intrinsics and the generic
; count-leading-zeros intrinsics (128- and 256-bit, i32 and i64 elements).
; NOTE(review): none of these are called by the functions immediately above;
; presumably they serve tests earlier in the file -- confirm before removing.
4357declare <4 x i32> @llvm.x86.avx512.conflict.d.128(<4 x i32>)
4358declare <8 x i32> @llvm.x86.avx512.conflict.d.256(<8 x i32>)
4359declare <2 x i64> @llvm.x86.avx512.conflict.q.128(<2 x i64>)
4360declare <4 x i64> @llvm.x86.avx512.conflict.q.256(<4 x i64>)
4361declare <4 x i32> @llvm.ctlz.v4i32(<4 x i32>, i1)
4362declare <8 x i32> @llvm.ctlz.v8i32(<8 x i32>, i1)
4363declare <2 x i64> @llvm.ctlz.v2i64(<2 x i64>, i1)
4364declare <4 x i64> @llvm.ctlz.v4i64(<4 x i64>, i1)
4365