1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefixes=SSE,SSE2
3; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefixes=SSE,SSE41
4; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefixes=AVX,AVX1
5; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=AVX,AVX2
6; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+xop,+avx | FileCheck %s --check-prefixes=XOP,XOPAVX1
7; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+xop,+avx2 | FileCheck %s --check-prefixes=XOP,XOPAVX2
8; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512dq | FileCheck %s --check-prefixes=AVX512,AVX512DQ
9; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw | FileCheck %s --check-prefixes=AVX512,AVX512BW
10; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512dq,+avx512vl | FileCheck %s --check-prefixes=AVX512VL,AVX512DQVL
11; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw,+avx512vl | FileCheck %s --check-prefixes=AVX512VL,AVX512BWVL
12;
13; Just one 32-bit run to make sure we do reasonable things for i64 shifts.
14; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefix=X86-SSE
15
16;
17; Variable Shifts
18;
19
20define <2 x i32> @var_shift_v2i32(<2 x i32> %a, <2 x i32> %b) nounwind {
21; SSE2-LABEL: var_shift_v2i32:
22; SSE2:       # %bb.0:
23; SSE2-NEXT:    pshuflw {{.*#+}} xmm2 = xmm1[2,3,3,3,4,5,6,7]
24; SSE2-NEXT:    movdqa %xmm0, %xmm3
25; SSE2-NEXT:    psrad %xmm2, %xmm3
26; SSE2-NEXT:    pshuflw {{.*#+}} xmm4 = xmm1[0,1,1,1,4,5,6,7]
27; SSE2-NEXT:    movdqa %xmm0, %xmm2
28; SSE2-NEXT:    psrad %xmm4, %xmm2
29; SSE2-NEXT:    punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0]
30; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]
31; SSE2-NEXT:    pshuflw {{.*#+}} xmm3 = xmm1[2,3,3,3,4,5,6,7]
32; SSE2-NEXT:    movdqa %xmm0, %xmm4
33; SSE2-NEXT:    psrad %xmm3, %xmm4
34; SSE2-NEXT:    pshuflw {{.*#+}} xmm1 = xmm1[0,1,1,1,4,5,6,7]
35; SSE2-NEXT:    psrad %xmm1, %xmm0
36; SSE2-NEXT:    punpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm4[1]
37; SSE2-NEXT:    shufps {{.*#+}} xmm2 = xmm2[0,3],xmm0[0,3]
38; SSE2-NEXT:    movaps %xmm2, %xmm0
39; SSE2-NEXT:    retq
40;
41; SSE41-LABEL: var_shift_v2i32:
42; SSE41:       # %bb.0:
43; SSE41-NEXT:    pshuflw {{.*#+}} xmm2 = xmm1[2,3,3,3,4,5,6,7]
44; SSE41-NEXT:    movdqa %xmm0, %xmm3
45; SSE41-NEXT:    psrad %xmm2, %xmm3
46; SSE41-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[2,3,2,3]
47; SSE41-NEXT:    pshuflw {{.*#+}} xmm4 = xmm2[2,3,3,3,4,5,6,7]
48; SSE41-NEXT:    movdqa %xmm0, %xmm5
49; SSE41-NEXT:    psrad %xmm4, %xmm5
50; SSE41-NEXT:    pblendw {{.*#+}} xmm5 = xmm3[0,1,2,3],xmm5[4,5,6,7]
51; SSE41-NEXT:    pshuflw {{.*#+}} xmm1 = xmm1[0,1,1,1,4,5,6,7]
52; SSE41-NEXT:    movdqa %xmm0, %xmm3
53; SSE41-NEXT:    psrad %xmm1, %xmm3
54; SSE41-NEXT:    pshuflw {{.*#+}} xmm1 = xmm2[0,1,1,1,4,5,6,7]
55; SSE41-NEXT:    psrad %xmm1, %xmm0
56; SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm3[0,1,2,3],xmm0[4,5,6,7]
57; SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm5[2,3],xmm0[4,5],xmm5[6,7]
58; SSE41-NEXT:    retq
59;
60; AVX1-LABEL: var_shift_v2i32:
61; AVX1:       # %bb.0:
62; AVX1-NEXT:    vpsrldq {{.*#+}} xmm2 = xmm1[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
63; AVX1-NEXT:    vpsrad %xmm2, %xmm0, %xmm2
64; AVX1-NEXT:    vpsrlq $32, %xmm1, %xmm3
65; AVX1-NEXT:    vpsrad %xmm3, %xmm0, %xmm3
66; AVX1-NEXT:    vpblendw {{.*#+}} xmm2 = xmm3[0,1,2,3],xmm2[4,5,6,7]
67; AVX1-NEXT:    vpxor %xmm3, %xmm3, %xmm3
68; AVX1-NEXT:    vpunpckhdq {{.*#+}} xmm3 = xmm1[2],xmm3[2],xmm1[3],xmm3[3]
69; AVX1-NEXT:    vpsrad %xmm3, %xmm0, %xmm3
70; AVX1-NEXT:    vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero
71; AVX1-NEXT:    vpsrad %xmm1, %xmm0, %xmm0
72; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm3[4,5,6,7]
73; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7]
74; AVX1-NEXT:    retq
75;
76; AVX2-LABEL: var_shift_v2i32:
77; AVX2:       # %bb.0:
78; AVX2-NEXT:    vpsravd %xmm1, %xmm0, %xmm0
79; AVX2-NEXT:    retq
80;
81; XOPAVX1-LABEL: var_shift_v2i32:
82; XOPAVX1:       # %bb.0:
83; XOPAVX1-NEXT:    vpxor %xmm2, %xmm2, %xmm2
84; XOPAVX1-NEXT:    vpsubd %xmm1, %xmm2, %xmm1
85; XOPAVX1-NEXT:    vpshad %xmm1, %xmm0, %xmm0
86; XOPAVX1-NEXT:    retq
87;
88; XOPAVX2-LABEL: var_shift_v2i32:
89; XOPAVX2:       # %bb.0:
90; XOPAVX2-NEXT:    vpsravd %xmm1, %xmm0, %xmm0
91; XOPAVX2-NEXT:    retq
92;
93; AVX512-LABEL: var_shift_v2i32:
94; AVX512:       # %bb.0:
95; AVX512-NEXT:    vpsravd %xmm1, %xmm0, %xmm0
96; AVX512-NEXT:    retq
97;
98; AVX512VL-LABEL: var_shift_v2i32:
99; AVX512VL:       # %bb.0:
100; AVX512VL-NEXT:    vpsravd %xmm1, %xmm0, %xmm0
101; AVX512VL-NEXT:    retq
102;
103; X86-SSE-LABEL: var_shift_v2i32:
104; X86-SSE:       # %bb.0:
105; X86-SSE-NEXT:    pshuflw {{.*#+}} xmm2 = xmm1[2,3,3,3,4,5,6,7]
106; X86-SSE-NEXT:    movdqa %xmm0, %xmm3
107; X86-SSE-NEXT:    psrad %xmm2, %xmm3
108; X86-SSE-NEXT:    pshuflw {{.*#+}} xmm4 = xmm1[0,1,1,1,4,5,6,7]
109; X86-SSE-NEXT:    movdqa %xmm0, %xmm2
110; X86-SSE-NEXT:    psrad %xmm4, %xmm2
111; X86-SSE-NEXT:    punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0]
112; X86-SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]
113; X86-SSE-NEXT:    pshuflw {{.*#+}} xmm3 = xmm1[2,3,3,3,4,5,6,7]
114; X86-SSE-NEXT:    movdqa %xmm0, %xmm4
115; X86-SSE-NEXT:    psrad %xmm3, %xmm4
116; X86-SSE-NEXT:    pshuflw {{.*#+}} xmm1 = xmm1[0,1,1,1,4,5,6,7]
117; X86-SSE-NEXT:    psrad %xmm1, %xmm0
118; X86-SSE-NEXT:    punpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm4[1]
119; X86-SSE-NEXT:    shufps {{.*#+}} xmm2 = xmm2[0,3],xmm0[0,3]
120; X86-SSE-NEXT:    movaps %xmm2, %xmm0
121; X86-SSE-NEXT:    retl
122  %shift = ashr <2 x i32> %a, %b
123  ret <2 x i32> %shift
124}
125
126define <4 x i16> @var_shift_v4i16(<4 x i16> %a, <4 x i16> %b) nounwind {
127; SSE2-LABEL: var_shift_v4i16:
128; SSE2:       # %bb.0:
129; SSE2-NEXT:    psllw $12, %xmm1
130; SSE2-NEXT:    movdqa %xmm1, %xmm2
131; SSE2-NEXT:    psraw $15, %xmm2
132; SSE2-NEXT:    movdqa %xmm2, %xmm3
133; SSE2-NEXT:    pandn %xmm0, %xmm3
134; SSE2-NEXT:    psraw $8, %xmm0
135; SSE2-NEXT:    pand %xmm2, %xmm0
136; SSE2-NEXT:    por %xmm3, %xmm0
137; SSE2-NEXT:    paddw %xmm1, %xmm1
138; SSE2-NEXT:    movdqa %xmm1, %xmm2
139; SSE2-NEXT:    psraw $15, %xmm2
140; SSE2-NEXT:    movdqa %xmm2, %xmm3
141; SSE2-NEXT:    pandn %xmm0, %xmm3
142; SSE2-NEXT:    psraw $4, %xmm0
143; SSE2-NEXT:    pand %xmm2, %xmm0
144; SSE2-NEXT:    por %xmm3, %xmm0
145; SSE2-NEXT:    paddw %xmm1, %xmm1
146; SSE2-NEXT:    movdqa %xmm1, %xmm2
147; SSE2-NEXT:    psraw $15, %xmm2
148; SSE2-NEXT:    movdqa %xmm2, %xmm3
149; SSE2-NEXT:    pandn %xmm0, %xmm3
150; SSE2-NEXT:    psraw $2, %xmm0
151; SSE2-NEXT:    pand %xmm2, %xmm0
152; SSE2-NEXT:    por %xmm3, %xmm0
153; SSE2-NEXT:    paddw %xmm1, %xmm1
154; SSE2-NEXT:    psraw $15, %xmm1
155; SSE2-NEXT:    movdqa %xmm1, %xmm2
156; SSE2-NEXT:    pandn %xmm0, %xmm2
157; SSE2-NEXT:    psraw $1, %xmm0
158; SSE2-NEXT:    pand %xmm1, %xmm0
159; SSE2-NEXT:    por %xmm2, %xmm0
160; SSE2-NEXT:    retq
161;
162; SSE41-LABEL: var_shift_v4i16:
163; SSE41:       # %bb.0:
164; SSE41-NEXT:    movdqa %xmm1, %xmm2
165; SSE41-NEXT:    movdqa %xmm0, %xmm1
166; SSE41-NEXT:    movdqa %xmm2, %xmm0
167; SSE41-NEXT:    psllw $12, %xmm0
168; SSE41-NEXT:    psllw $4, %xmm2
169; SSE41-NEXT:    por %xmm0, %xmm2
170; SSE41-NEXT:    movdqa %xmm2, %xmm3
171; SSE41-NEXT:    paddw %xmm2, %xmm3
172; SSE41-NEXT:    movdqa %xmm1, %xmm4
173; SSE41-NEXT:    psraw $8, %xmm4
174; SSE41-NEXT:    movdqa %xmm2, %xmm0
175; SSE41-NEXT:    pblendvb %xmm0, %xmm4, %xmm1
176; SSE41-NEXT:    movdqa %xmm1, %xmm2
177; SSE41-NEXT:    psraw $4, %xmm2
178; SSE41-NEXT:    movdqa %xmm3, %xmm0
179; SSE41-NEXT:    pblendvb %xmm0, %xmm2, %xmm1
180; SSE41-NEXT:    movdqa %xmm1, %xmm2
181; SSE41-NEXT:    psraw $2, %xmm2
182; SSE41-NEXT:    paddw %xmm3, %xmm3
183; SSE41-NEXT:    movdqa %xmm3, %xmm0
184; SSE41-NEXT:    pblendvb %xmm0, %xmm2, %xmm1
185; SSE41-NEXT:    movdqa %xmm1, %xmm2
186; SSE41-NEXT:    psraw $1, %xmm2
187; SSE41-NEXT:    paddw %xmm3, %xmm3
188; SSE41-NEXT:    movdqa %xmm3, %xmm0
189; SSE41-NEXT:    pblendvb %xmm0, %xmm2, %xmm1
190; SSE41-NEXT:    movdqa %xmm1, %xmm0
191; SSE41-NEXT:    retq
192;
193; AVX1-LABEL: var_shift_v4i16:
194; AVX1:       # %bb.0:
195; AVX1-NEXT:    vpsllw $12, %xmm1, %xmm2
196; AVX1-NEXT:    vpsllw $4, %xmm1, %xmm1
197; AVX1-NEXT:    vpor %xmm2, %xmm1, %xmm1
198; AVX1-NEXT:    vpaddw %xmm1, %xmm1, %xmm2
199; AVX1-NEXT:    vpsraw $8, %xmm0, %xmm3
200; AVX1-NEXT:    vpblendvb %xmm1, %xmm3, %xmm0, %xmm0
201; AVX1-NEXT:    vpsraw $4, %xmm0, %xmm1
202; AVX1-NEXT:    vpblendvb %xmm2, %xmm1, %xmm0, %xmm0
203; AVX1-NEXT:    vpsraw $2, %xmm0, %xmm1
204; AVX1-NEXT:    vpaddw %xmm2, %xmm2, %xmm2
205; AVX1-NEXT:    vpblendvb %xmm2, %xmm1, %xmm0, %xmm0
206; AVX1-NEXT:    vpsraw $1, %xmm0, %xmm1
207; AVX1-NEXT:    vpaddw %xmm2, %xmm2, %xmm2
208; AVX1-NEXT:    vpblendvb %xmm2, %xmm1, %xmm0, %xmm0
209; AVX1-NEXT:    retq
210;
211; AVX2-LABEL: var_shift_v4i16:
212; AVX2:       # %bb.0:
213; AVX2-NEXT:    vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
214; AVX2-NEXT:    vpmovsxwd %xmm0, %ymm0
215; AVX2-NEXT:    vpsravd %ymm1, %ymm0, %ymm0
216; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
217; AVX2-NEXT:    vpackssdw %xmm1, %xmm0, %xmm0
218; AVX2-NEXT:    vzeroupper
219; AVX2-NEXT:    retq
220;
221; XOP-LABEL: var_shift_v4i16:
222; XOP:       # %bb.0:
223; XOP-NEXT:    vpxor %xmm2, %xmm2, %xmm2
224; XOP-NEXT:    vpsubw %xmm1, %xmm2, %xmm1
225; XOP-NEXT:    vpshaw %xmm1, %xmm0, %xmm0
226; XOP-NEXT:    retq
227;
228; AVX512DQ-LABEL: var_shift_v4i16:
229; AVX512DQ:       # %bb.0:
230; AVX512DQ-NEXT:    vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
231; AVX512DQ-NEXT:    vpmovsxwd %xmm0, %ymm0
232; AVX512DQ-NEXT:    vpsravd %ymm1, %ymm0, %ymm0
233; AVX512DQ-NEXT:    vpmovdw %zmm0, %ymm0
234; AVX512DQ-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
235; AVX512DQ-NEXT:    vzeroupper
236; AVX512DQ-NEXT:    retq
237;
238; AVX512BW-LABEL: var_shift_v4i16:
239; AVX512BW:       # %bb.0:
240; AVX512BW-NEXT:    # kill: def $xmm1 killed $xmm1 def $zmm1
241; AVX512BW-NEXT:    # kill: def $xmm0 killed $xmm0 def $zmm0
242; AVX512BW-NEXT:    vpsravw %zmm1, %zmm0, %zmm0
243; AVX512BW-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
244; AVX512BW-NEXT:    vzeroupper
245; AVX512BW-NEXT:    retq
246;
247; AVX512DQVL-LABEL: var_shift_v4i16:
248; AVX512DQVL:       # %bb.0:
249; AVX512DQVL-NEXT:    vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
250; AVX512DQVL-NEXT:    vpmovsxwd %xmm0, %ymm0
251; AVX512DQVL-NEXT:    vpsravd %ymm1, %ymm0, %ymm0
252; AVX512DQVL-NEXT:    vpmovdw %ymm0, %xmm0
253; AVX512DQVL-NEXT:    vzeroupper
254; AVX512DQVL-NEXT:    retq
255;
256; AVX512BWVL-LABEL: var_shift_v4i16:
257; AVX512BWVL:       # %bb.0:
258; AVX512BWVL-NEXT:    vpsravw %xmm1, %xmm0, %xmm0
259; AVX512BWVL-NEXT:    retq
260;
261; X86-SSE-LABEL: var_shift_v4i16:
262; X86-SSE:       # %bb.0:
263; X86-SSE-NEXT:    psllw $12, %xmm1
264; X86-SSE-NEXT:    movdqa %xmm1, %xmm2
265; X86-SSE-NEXT:    psraw $15, %xmm2
266; X86-SSE-NEXT:    movdqa %xmm2, %xmm3
267; X86-SSE-NEXT:    pandn %xmm0, %xmm3
268; X86-SSE-NEXT:    psraw $8, %xmm0
269; X86-SSE-NEXT:    pand %xmm2, %xmm0
270; X86-SSE-NEXT:    por %xmm3, %xmm0
271; X86-SSE-NEXT:    paddw %xmm1, %xmm1
272; X86-SSE-NEXT:    movdqa %xmm1, %xmm2
273; X86-SSE-NEXT:    psraw $15, %xmm2
274; X86-SSE-NEXT:    movdqa %xmm2, %xmm3
275; X86-SSE-NEXT:    pandn %xmm0, %xmm3
276; X86-SSE-NEXT:    psraw $4, %xmm0
277; X86-SSE-NEXT:    pand %xmm2, %xmm0
278; X86-SSE-NEXT:    por %xmm3, %xmm0
279; X86-SSE-NEXT:    paddw %xmm1, %xmm1
280; X86-SSE-NEXT:    movdqa %xmm1, %xmm2
281; X86-SSE-NEXT:    psraw $15, %xmm2
282; X86-SSE-NEXT:    movdqa %xmm2, %xmm3
283; X86-SSE-NEXT:    pandn %xmm0, %xmm3
284; X86-SSE-NEXT:    psraw $2, %xmm0
285; X86-SSE-NEXT:    pand %xmm2, %xmm0
286; X86-SSE-NEXT:    por %xmm3, %xmm0
287; X86-SSE-NEXT:    paddw %xmm1, %xmm1
288; X86-SSE-NEXT:    psraw $15, %xmm1
289; X86-SSE-NEXT:    movdqa %xmm1, %xmm2
290; X86-SSE-NEXT:    pandn %xmm0, %xmm2
291; X86-SSE-NEXT:    psraw $1, %xmm0
292; X86-SSE-NEXT:    pand %xmm1, %xmm0
293; X86-SSE-NEXT:    por %xmm2, %xmm0
294; X86-SSE-NEXT:    retl
295  %shift = ashr <4 x i16> %a, %b
296  ret <4 x i16> %shift
297}
298
299define <2 x i16> @var_shift_v2i16(<2 x i16> %a, <2 x i16> %b) nounwind {
300; SSE2-LABEL: var_shift_v2i16:
301; SSE2:       # %bb.0:
302; SSE2-NEXT:    psllw $12, %xmm1
303; SSE2-NEXT:    movdqa %xmm1, %xmm2
304; SSE2-NEXT:    psraw $15, %xmm2
305; SSE2-NEXT:    movdqa %xmm2, %xmm3
306; SSE2-NEXT:    pandn %xmm0, %xmm3
307; SSE2-NEXT:    psraw $8, %xmm0
308; SSE2-NEXT:    pand %xmm2, %xmm0
309; SSE2-NEXT:    por %xmm3, %xmm0
310; SSE2-NEXT:    paddw %xmm1, %xmm1
311; SSE2-NEXT:    movdqa %xmm1, %xmm2
312; SSE2-NEXT:    psraw $15, %xmm2
313; SSE2-NEXT:    movdqa %xmm2, %xmm3
314; SSE2-NEXT:    pandn %xmm0, %xmm3
315; SSE2-NEXT:    psraw $4, %xmm0
316; SSE2-NEXT:    pand %xmm2, %xmm0
317; SSE2-NEXT:    por %xmm3, %xmm0
318; SSE2-NEXT:    paddw %xmm1, %xmm1
319; SSE2-NEXT:    movdqa %xmm1, %xmm2
320; SSE2-NEXT:    psraw $15, %xmm2
321; SSE2-NEXT:    movdqa %xmm2, %xmm3
322; SSE2-NEXT:    pandn %xmm0, %xmm3
323; SSE2-NEXT:    psraw $2, %xmm0
324; SSE2-NEXT:    pand %xmm2, %xmm0
325; SSE2-NEXT:    por %xmm3, %xmm0
326; SSE2-NEXT:    paddw %xmm1, %xmm1
327; SSE2-NEXT:    psraw $15, %xmm1
328; SSE2-NEXT:    movdqa %xmm1, %xmm2
329; SSE2-NEXT:    pandn %xmm0, %xmm2
330; SSE2-NEXT:    psraw $1, %xmm0
331; SSE2-NEXT:    pand %xmm1, %xmm0
332; SSE2-NEXT:    por %xmm2, %xmm0
333; SSE2-NEXT:    retq
334;
335; SSE41-LABEL: var_shift_v2i16:
336; SSE41:       # %bb.0:
337; SSE41-NEXT:    movdqa %xmm1, %xmm2
338; SSE41-NEXT:    movdqa %xmm0, %xmm1
339; SSE41-NEXT:    movdqa %xmm2, %xmm0
340; SSE41-NEXT:    psllw $12, %xmm0
341; SSE41-NEXT:    psllw $4, %xmm2
342; SSE41-NEXT:    por %xmm0, %xmm2
343; SSE41-NEXT:    movdqa %xmm2, %xmm3
344; SSE41-NEXT:    paddw %xmm2, %xmm3
345; SSE41-NEXT:    movdqa %xmm1, %xmm4
346; SSE41-NEXT:    psraw $8, %xmm4
347; SSE41-NEXT:    movdqa %xmm2, %xmm0
348; SSE41-NEXT:    pblendvb %xmm0, %xmm4, %xmm1
349; SSE41-NEXT:    movdqa %xmm1, %xmm2
350; SSE41-NEXT:    psraw $4, %xmm2
351; SSE41-NEXT:    movdqa %xmm3, %xmm0
352; SSE41-NEXT:    pblendvb %xmm0, %xmm2, %xmm1
353; SSE41-NEXT:    movdqa %xmm1, %xmm2
354; SSE41-NEXT:    psraw $2, %xmm2
355; SSE41-NEXT:    paddw %xmm3, %xmm3
356; SSE41-NEXT:    movdqa %xmm3, %xmm0
357; SSE41-NEXT:    pblendvb %xmm0, %xmm2, %xmm1
358; SSE41-NEXT:    movdqa %xmm1, %xmm2
359; SSE41-NEXT:    psraw $1, %xmm2
360; SSE41-NEXT:    paddw %xmm3, %xmm3
361; SSE41-NEXT:    movdqa %xmm3, %xmm0
362; SSE41-NEXT:    pblendvb %xmm0, %xmm2, %xmm1
363; SSE41-NEXT:    movdqa %xmm1, %xmm0
364; SSE41-NEXT:    retq
365;
366; AVX1-LABEL: var_shift_v2i16:
367; AVX1:       # %bb.0:
368; AVX1-NEXT:    vpsllw $12, %xmm1, %xmm2
369; AVX1-NEXT:    vpsllw $4, %xmm1, %xmm1
370; AVX1-NEXT:    vpor %xmm2, %xmm1, %xmm1
371; AVX1-NEXT:    vpaddw %xmm1, %xmm1, %xmm2
372; AVX1-NEXT:    vpsraw $8, %xmm0, %xmm3
373; AVX1-NEXT:    vpblendvb %xmm1, %xmm3, %xmm0, %xmm0
374; AVX1-NEXT:    vpsraw $4, %xmm0, %xmm1
375; AVX1-NEXT:    vpblendvb %xmm2, %xmm1, %xmm0, %xmm0
376; AVX1-NEXT:    vpsraw $2, %xmm0, %xmm1
377; AVX1-NEXT:    vpaddw %xmm2, %xmm2, %xmm2
378; AVX1-NEXT:    vpblendvb %xmm2, %xmm1, %xmm0, %xmm0
379; AVX1-NEXT:    vpsraw $1, %xmm0, %xmm1
380; AVX1-NEXT:    vpaddw %xmm2, %xmm2, %xmm2
381; AVX1-NEXT:    vpblendvb %xmm2, %xmm1, %xmm0, %xmm0
382; AVX1-NEXT:    retq
383;
384; AVX2-LABEL: var_shift_v2i16:
385; AVX2:       # %bb.0:
386; AVX2-NEXT:    vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
387; AVX2-NEXT:    vpmovsxwd %xmm0, %ymm0
388; AVX2-NEXT:    vpsravd %ymm1, %ymm0, %ymm0
389; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
390; AVX2-NEXT:    vpackssdw %xmm1, %xmm0, %xmm0
391; AVX2-NEXT:    vzeroupper
392; AVX2-NEXT:    retq
393;
394; XOP-LABEL: var_shift_v2i16:
395; XOP:       # %bb.0:
396; XOP-NEXT:    vpxor %xmm2, %xmm2, %xmm2
397; XOP-NEXT:    vpsubw %xmm1, %xmm2, %xmm1
398; XOP-NEXT:    vpshaw %xmm1, %xmm0, %xmm0
399; XOP-NEXT:    retq
400;
401; AVX512DQ-LABEL: var_shift_v2i16:
402; AVX512DQ:       # %bb.0:
403; AVX512DQ-NEXT:    vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
404; AVX512DQ-NEXT:    vpmovsxwd %xmm0, %ymm0
405; AVX512DQ-NEXT:    vpsravd %ymm1, %ymm0, %ymm0
406; AVX512DQ-NEXT:    vpmovdw %zmm0, %ymm0
407; AVX512DQ-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
408; AVX512DQ-NEXT:    vzeroupper
409; AVX512DQ-NEXT:    retq
410;
411; AVX512BW-LABEL: var_shift_v2i16:
412; AVX512BW:       # %bb.0:
413; AVX512BW-NEXT:    # kill: def $xmm1 killed $xmm1 def $zmm1
414; AVX512BW-NEXT:    # kill: def $xmm0 killed $xmm0 def $zmm0
415; AVX512BW-NEXT:    vpsravw %zmm1, %zmm0, %zmm0
416; AVX512BW-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
417; AVX512BW-NEXT:    vzeroupper
418; AVX512BW-NEXT:    retq
419;
420; AVX512DQVL-LABEL: var_shift_v2i16:
421; AVX512DQVL:       # %bb.0:
422; AVX512DQVL-NEXT:    vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
423; AVX512DQVL-NEXT:    vpmovsxwd %xmm0, %ymm0
424; AVX512DQVL-NEXT:    vpsravd %ymm1, %ymm0, %ymm0
425; AVX512DQVL-NEXT:    vpmovdw %ymm0, %xmm0
426; AVX512DQVL-NEXT:    vzeroupper
427; AVX512DQVL-NEXT:    retq
428;
429; AVX512BWVL-LABEL: var_shift_v2i16:
430; AVX512BWVL:       # %bb.0:
431; AVX512BWVL-NEXT:    vpsravw %xmm1, %xmm0, %xmm0
432; AVX512BWVL-NEXT:    retq
433;
434; X86-SSE-LABEL: var_shift_v2i16:
435; X86-SSE:       # %bb.0:
436; X86-SSE-NEXT:    psllw $12, %xmm1
437; X86-SSE-NEXT:    movdqa %xmm1, %xmm2
438; X86-SSE-NEXT:    psraw $15, %xmm2
439; X86-SSE-NEXT:    movdqa %xmm2, %xmm3
440; X86-SSE-NEXT:    pandn %xmm0, %xmm3
441; X86-SSE-NEXT:    psraw $8, %xmm0
442; X86-SSE-NEXT:    pand %xmm2, %xmm0
443; X86-SSE-NEXT:    por %xmm3, %xmm0
444; X86-SSE-NEXT:    paddw %xmm1, %xmm1
445; X86-SSE-NEXT:    movdqa %xmm1, %xmm2
446; X86-SSE-NEXT:    psraw $15, %xmm2
447; X86-SSE-NEXT:    movdqa %xmm2, %xmm3
448; X86-SSE-NEXT:    pandn %xmm0, %xmm3
449; X86-SSE-NEXT:    psraw $4, %xmm0
450; X86-SSE-NEXT:    pand %xmm2, %xmm0
451; X86-SSE-NEXT:    por %xmm3, %xmm0
452; X86-SSE-NEXT:    paddw %xmm1, %xmm1
453; X86-SSE-NEXT:    movdqa %xmm1, %xmm2
454; X86-SSE-NEXT:    psraw $15, %xmm2
455; X86-SSE-NEXT:    movdqa %xmm2, %xmm3
456; X86-SSE-NEXT:    pandn %xmm0, %xmm3
457; X86-SSE-NEXT:    psraw $2, %xmm0
458; X86-SSE-NEXT:    pand %xmm2, %xmm0
459; X86-SSE-NEXT:    por %xmm3, %xmm0
460; X86-SSE-NEXT:    paddw %xmm1, %xmm1
461; X86-SSE-NEXT:    psraw $15, %xmm1
462; X86-SSE-NEXT:    movdqa %xmm1, %xmm2
463; X86-SSE-NEXT:    pandn %xmm0, %xmm2
464; X86-SSE-NEXT:    psraw $1, %xmm0
465; X86-SSE-NEXT:    pand %xmm1, %xmm0
466; X86-SSE-NEXT:    por %xmm2, %xmm0
467; X86-SSE-NEXT:    retl
468  %shift = ashr <2 x i16> %a, %b
469  ret <2 x i16> %shift
470}
471
472define <8 x i8> @var_shift_v8i8(<8 x i8> %a, <8 x i8> %b) nounwind {
473; SSE2-LABEL: var_shift_v8i8:
474; SSE2:       # %bb.0:
475; SSE2-NEXT:    punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm0[8],xmm2[9],xmm0[9],xmm2[10],xmm0[10],xmm2[11],xmm0[11],xmm2[12],xmm0[12],xmm2[13],xmm0[13],xmm2[14],xmm0[14],xmm2[15],xmm0[15]
476; SSE2-NEXT:    psllw $5, %xmm1
477; SSE2-NEXT:    punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm1[8],xmm4[9],xmm1[9],xmm4[10],xmm1[10],xmm4[11],xmm1[11],xmm4[12],xmm1[12],xmm4[13],xmm1[13],xmm4[14],xmm1[14],xmm4[15],xmm1[15]
478; SSE2-NEXT:    pxor %xmm3, %xmm3
479; SSE2-NEXT:    pxor %xmm5, %xmm5
480; SSE2-NEXT:    pcmpgtw %xmm4, %xmm5
481; SSE2-NEXT:    movdqa %xmm5, %xmm6
482; SSE2-NEXT:    pandn %xmm2, %xmm6
483; SSE2-NEXT:    psraw $4, %xmm2
484; SSE2-NEXT:    pand %xmm5, %xmm2
485; SSE2-NEXT:    por %xmm6, %xmm2
486; SSE2-NEXT:    paddw %xmm4, %xmm4
487; SSE2-NEXT:    pxor %xmm5, %xmm5
488; SSE2-NEXT:    pcmpgtw %xmm4, %xmm5
489; SSE2-NEXT:    movdqa %xmm5, %xmm6
490; SSE2-NEXT:    pandn %xmm2, %xmm6
491; SSE2-NEXT:    psraw $2, %xmm2
492; SSE2-NEXT:    pand %xmm5, %xmm2
493; SSE2-NEXT:    por %xmm6, %xmm2
494; SSE2-NEXT:    paddw %xmm4, %xmm4
495; SSE2-NEXT:    pxor %xmm5, %xmm5
496; SSE2-NEXT:    pcmpgtw %xmm4, %xmm5
497; SSE2-NEXT:    movdqa %xmm5, %xmm4
498; SSE2-NEXT:    pandn %xmm2, %xmm4
499; SSE2-NEXT:    psraw $1, %xmm2
500; SSE2-NEXT:    pand %xmm5, %xmm2
501; SSE2-NEXT:    por %xmm4, %xmm2
502; SSE2-NEXT:    psrlw $8, %xmm2
503; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
504; SSE2-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
505; SSE2-NEXT:    pxor %xmm4, %xmm4
506; SSE2-NEXT:    pcmpgtw %xmm1, %xmm4
507; SSE2-NEXT:    movdqa %xmm4, %xmm5
508; SSE2-NEXT:    pandn %xmm0, %xmm5
509; SSE2-NEXT:    psraw $4, %xmm0
510; SSE2-NEXT:    pand %xmm4, %xmm0
511; SSE2-NEXT:    por %xmm5, %xmm0
512; SSE2-NEXT:    paddw %xmm1, %xmm1
513; SSE2-NEXT:    pxor %xmm4, %xmm4
514; SSE2-NEXT:    pcmpgtw %xmm1, %xmm4
515; SSE2-NEXT:    movdqa %xmm4, %xmm5
516; SSE2-NEXT:    pandn %xmm0, %xmm5
517; SSE2-NEXT:    psraw $2, %xmm0
518; SSE2-NEXT:    pand %xmm4, %xmm0
519; SSE2-NEXT:    por %xmm5, %xmm0
520; SSE2-NEXT:    paddw %xmm1, %xmm1
521; SSE2-NEXT:    pcmpgtw %xmm1, %xmm3
522; SSE2-NEXT:    movdqa %xmm3, %xmm1
523; SSE2-NEXT:    pandn %xmm0, %xmm1
524; SSE2-NEXT:    psraw $1, %xmm0
525; SSE2-NEXT:    pand %xmm3, %xmm0
526; SSE2-NEXT:    por %xmm1, %xmm0
527; SSE2-NEXT:    psrlw $8, %xmm0
528; SSE2-NEXT:    packuswb %xmm2, %xmm0
529; SSE2-NEXT:    retq
530;
531; SSE41-LABEL: var_shift_v8i8:
532; SSE41:       # %bb.0:
533; SSE41-NEXT:    movdqa %xmm0, %xmm2
534; SSE41-NEXT:    psllw $5, %xmm1
535; SSE41-NEXT:    punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15]
536; SSE41-NEXT:    punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm2[8],xmm3[9],xmm2[9],xmm3[10],xmm2[10],xmm3[11],xmm2[11],xmm3[12],xmm2[12],xmm3[13],xmm2[13],xmm3[14],xmm2[14],xmm3[15],xmm2[15]
537; SSE41-NEXT:    movdqa %xmm3, %xmm4
538; SSE41-NEXT:    psraw $4, %xmm4
539; SSE41-NEXT:    pblendvb %xmm0, %xmm4, %xmm3
540; SSE41-NEXT:    movdqa %xmm3, %xmm4
541; SSE41-NEXT:    psraw $2, %xmm4
542; SSE41-NEXT:    paddw %xmm0, %xmm0
543; SSE41-NEXT:    pblendvb %xmm0, %xmm4, %xmm3
544; SSE41-NEXT:    movdqa %xmm3, %xmm4
545; SSE41-NEXT:    psraw $1, %xmm4
546; SSE41-NEXT:    paddw %xmm0, %xmm0
547; SSE41-NEXT:    pblendvb %xmm0, %xmm4, %xmm3
548; SSE41-NEXT:    psrlw $8, %xmm3
549; SSE41-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
550; SSE41-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
551; SSE41-NEXT:    movdqa %xmm1, %xmm2
552; SSE41-NEXT:    psraw $4, %xmm2
553; SSE41-NEXT:    pblendvb %xmm0, %xmm2, %xmm1
554; SSE41-NEXT:    movdqa %xmm1, %xmm2
555; SSE41-NEXT:    psraw $2, %xmm2
556; SSE41-NEXT:    paddw %xmm0, %xmm0
557; SSE41-NEXT:    pblendvb %xmm0, %xmm2, %xmm1
558; SSE41-NEXT:    movdqa %xmm1, %xmm2
559; SSE41-NEXT:    psraw $1, %xmm2
560; SSE41-NEXT:    paddw %xmm0, %xmm0
561; SSE41-NEXT:    pblendvb %xmm0, %xmm2, %xmm1
562; SSE41-NEXT:    psrlw $8, %xmm1
563; SSE41-NEXT:    packuswb %xmm3, %xmm1
564; SSE41-NEXT:    movdqa %xmm1, %xmm0
565; SSE41-NEXT:    retq
566;
567; AVX-LABEL: var_shift_v8i8:
568; AVX:       # %bb.0:
569; AVX-NEXT:    vpsllw $5, %xmm1, %xmm1
570; AVX-NEXT:    vpunpckhbw {{.*#+}} xmm2 = xmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
571; AVX-NEXT:    vpunpckhbw {{.*#+}} xmm3 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
572; AVX-NEXT:    vpsraw $4, %xmm3, %xmm4
573; AVX-NEXT:    vpblendvb %xmm2, %xmm4, %xmm3, %xmm3
574; AVX-NEXT:    vpsraw $2, %xmm3, %xmm4
575; AVX-NEXT:    vpaddw %xmm2, %xmm2, %xmm2
576; AVX-NEXT:    vpblendvb %xmm2, %xmm4, %xmm3, %xmm3
577; AVX-NEXT:    vpsraw $1, %xmm3, %xmm4
578; AVX-NEXT:    vpaddw %xmm2, %xmm2, %xmm2
579; AVX-NEXT:    vpblendvb %xmm2, %xmm4, %xmm3, %xmm2
580; AVX-NEXT:    vpsrlw $8, %xmm2, %xmm2
581; AVX-NEXT:    vpunpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
582; AVX-NEXT:    vpunpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
583; AVX-NEXT:    vpsraw $4, %xmm0, %xmm3
584; AVX-NEXT:    vpblendvb %xmm1, %xmm3, %xmm0, %xmm0
585; AVX-NEXT:    vpsraw $2, %xmm0, %xmm3
586; AVX-NEXT:    vpaddw %xmm1, %xmm1, %xmm1
587; AVX-NEXT:    vpblendvb %xmm1, %xmm3, %xmm0, %xmm0
588; AVX-NEXT:    vpsraw $1, %xmm0, %xmm3
589; AVX-NEXT:    vpaddw %xmm1, %xmm1, %xmm1
590; AVX-NEXT:    vpblendvb %xmm1, %xmm3, %xmm0, %xmm0
591; AVX-NEXT:    vpsrlw $8, %xmm0, %xmm0
592; AVX-NEXT:    vpackuswb %xmm2, %xmm0, %xmm0
593; AVX-NEXT:    retq
594;
595; XOP-LABEL: var_shift_v8i8:
596; XOP:       # %bb.0:
597; XOP-NEXT:    vpxor %xmm2, %xmm2, %xmm2
598; XOP-NEXT:    vpsubb %xmm1, %xmm2, %xmm1
599; XOP-NEXT:    vpshab %xmm1, %xmm0, %xmm0
600; XOP-NEXT:    retq
601;
602; AVX512DQ-LABEL: var_shift_v8i8:
603; AVX512DQ:       # %bb.0:
604; AVX512DQ-NEXT:    vpmovzxbd {{.*#+}} zmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero,xmm1[8],zero,zero,zero,xmm1[9],zero,zero,zero,xmm1[10],zero,zero,zero,xmm1[11],zero,zero,zero,xmm1[12],zero,zero,zero,xmm1[13],zero,zero,zero,xmm1[14],zero,zero,zero,xmm1[15],zero,zero,zero
605; AVX512DQ-NEXT:    vpmovsxbd %xmm0, %zmm0
606; AVX512DQ-NEXT:    vpsravd %zmm1, %zmm0, %zmm0
607; AVX512DQ-NEXT:    vpmovdb %zmm0, %xmm0
608; AVX512DQ-NEXT:    vzeroupper
609; AVX512DQ-NEXT:    retq
610;
611; AVX512BW-LABEL: var_shift_v8i8:
612; AVX512BW:       # %bb.0:
613; AVX512BW-NEXT:    vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
614; AVX512BW-NEXT:    vpmovsxbw %xmm0, %ymm0
615; AVX512BW-NEXT:    vpsravw %zmm1, %zmm0, %zmm0
616; AVX512BW-NEXT:    vpmovwb %zmm0, %ymm0
617; AVX512BW-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
618; AVX512BW-NEXT:    vzeroupper
619; AVX512BW-NEXT:    retq
620;
621; AVX512DQVL-LABEL: var_shift_v8i8:
622; AVX512DQVL:       # %bb.0:
623; AVX512DQVL-NEXT:    vpmovzxbd {{.*#+}} zmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero,xmm1[8],zero,zero,zero,xmm1[9],zero,zero,zero,xmm1[10],zero,zero,zero,xmm1[11],zero,zero,zero,xmm1[12],zero,zero,zero,xmm1[13],zero,zero,zero,xmm1[14],zero,zero,zero,xmm1[15],zero,zero,zero
624; AVX512DQVL-NEXT:    vpmovsxbd %xmm0, %zmm0
625; AVX512DQVL-NEXT:    vpsravd %zmm1, %zmm0, %zmm0
626; AVX512DQVL-NEXT:    vpmovdb %zmm0, %xmm0
627; AVX512DQVL-NEXT:    vzeroupper
628; AVX512DQVL-NEXT:    retq
629;
630; AVX512BWVL-LABEL: var_shift_v8i8:
631; AVX512BWVL:       # %bb.0:
632; AVX512BWVL-NEXT:    vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
633; AVX512BWVL-NEXT:    vpmovsxbw %xmm0, %ymm0
634; AVX512BWVL-NEXT:    vpsravw %ymm1, %ymm0, %ymm0
635; AVX512BWVL-NEXT:    vpmovwb %ymm0, %xmm0
636; AVX512BWVL-NEXT:    vzeroupper
637; AVX512BWVL-NEXT:    retq
638;
639; X86-SSE-LABEL: var_shift_v8i8:
640; X86-SSE:       # %bb.0:
641; X86-SSE-NEXT:    punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm0[8],xmm2[9],xmm0[9],xmm2[10],xmm0[10],xmm2[11],xmm0[11],xmm2[12],xmm0[12],xmm2[13],xmm0[13],xmm2[14],xmm0[14],xmm2[15],xmm0[15]
642; X86-SSE-NEXT:    psllw $5, %xmm1
643; X86-SSE-NEXT:    punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm1[8],xmm4[9],xmm1[9],xmm4[10],xmm1[10],xmm4[11],xmm1[11],xmm4[12],xmm1[12],xmm4[13],xmm1[13],xmm4[14],xmm1[14],xmm4[15],xmm1[15]
644; X86-SSE-NEXT:    pxor %xmm3, %xmm3
645; X86-SSE-NEXT:    pxor %xmm5, %xmm5
646; X86-SSE-NEXT:    pcmpgtw %xmm4, %xmm5
647; X86-SSE-NEXT:    movdqa %xmm5, %xmm6
648; X86-SSE-NEXT:    pandn %xmm2, %xmm6
649; X86-SSE-NEXT:    psraw $4, %xmm2
650; X86-SSE-NEXT:    pand %xmm5, %xmm2
651; X86-SSE-NEXT:    por %xmm6, %xmm2
652; X86-SSE-NEXT:    paddw %xmm4, %xmm4
653; X86-SSE-NEXT:    pxor %xmm5, %xmm5
654; X86-SSE-NEXT:    pcmpgtw %xmm4, %xmm5
655; X86-SSE-NEXT:    movdqa %xmm5, %xmm6
656; X86-SSE-NEXT:    pandn %xmm2, %xmm6
657; X86-SSE-NEXT:    psraw $2, %xmm2
658; X86-SSE-NEXT:    pand %xmm5, %xmm2
659; X86-SSE-NEXT:    por %xmm6, %xmm2
660; X86-SSE-NEXT:    paddw %xmm4, %xmm4
661; X86-SSE-NEXT:    pxor %xmm5, %xmm5
662; X86-SSE-NEXT:    pcmpgtw %xmm4, %xmm5
663; X86-SSE-NEXT:    movdqa %xmm5, %xmm4
664; X86-SSE-NEXT:    pandn %xmm2, %xmm4
665; X86-SSE-NEXT:    psraw $1, %xmm2
666; X86-SSE-NEXT:    pand %xmm5, %xmm2
667; X86-SSE-NEXT:    por %xmm4, %xmm2
668; X86-SSE-NEXT:    psrlw $8, %xmm2
669; X86-SSE-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
670; X86-SSE-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
671; X86-SSE-NEXT:    pxor %xmm4, %xmm4
672; X86-SSE-NEXT:    pcmpgtw %xmm1, %xmm4
673; X86-SSE-NEXT:    movdqa %xmm4, %xmm5
674; X86-SSE-NEXT:    pandn %xmm0, %xmm5
675; X86-SSE-NEXT:    psraw $4, %xmm0
676; X86-SSE-NEXT:    pand %xmm4, %xmm0
677; X86-SSE-NEXT:    por %xmm5, %xmm0
678; X86-SSE-NEXT:    paddw %xmm1, %xmm1
679; X86-SSE-NEXT:    pxor %xmm4, %xmm4
680; X86-SSE-NEXT:    pcmpgtw %xmm1, %xmm4
681; X86-SSE-NEXT:    movdqa %xmm4, %xmm5
682; X86-SSE-NEXT:    pandn %xmm0, %xmm5
683; X86-SSE-NEXT:    psraw $2, %xmm0
684; X86-SSE-NEXT:    pand %xmm4, %xmm0
685; X86-SSE-NEXT:    por %xmm5, %xmm0
686; X86-SSE-NEXT:    paddw %xmm1, %xmm1
687; X86-SSE-NEXT:    pcmpgtw %xmm1, %xmm3
688; X86-SSE-NEXT:    movdqa %xmm3, %xmm1
689; X86-SSE-NEXT:    pandn %xmm0, %xmm1
690; X86-SSE-NEXT:    psraw $1, %xmm0
691; X86-SSE-NEXT:    pand %xmm3, %xmm0
692; X86-SSE-NEXT:    por %xmm1, %xmm0
693; X86-SSE-NEXT:    psrlw $8, %xmm0
694; X86-SSE-NEXT:    packuswb %xmm2, %xmm0
695; X86-SSE-NEXT:    retl
696  %shift = ashr <8 x i8> %a, %b
697  ret <8 x i8> %shift
698}
699
700define <4 x i8> @var_shift_v4i8(<4 x i8> %a, <4 x i8> %b) nounwind {
701; SSE2-LABEL: var_shift_v4i8:
702; SSE2:       # %bb.0:
703; SSE2-NEXT:    punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm0[8],xmm2[9],xmm0[9],xmm2[10],xmm0[10],xmm2[11],xmm0[11],xmm2[12],xmm0[12],xmm2[13],xmm0[13],xmm2[14],xmm0[14],xmm2[15],xmm0[15]
704; SSE2-NEXT:    psllw $5, %xmm1
705; SSE2-NEXT:    punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm1[8],xmm4[9],xmm1[9],xmm4[10],xmm1[10],xmm4[11],xmm1[11],xmm4[12],xmm1[12],xmm4[13],xmm1[13],xmm4[14],xmm1[14],xmm4[15],xmm1[15]
706; SSE2-NEXT:    pxor %xmm3, %xmm3
707; SSE2-NEXT:    pxor %xmm5, %xmm5
708; SSE2-NEXT:    pcmpgtw %xmm4, %xmm5
709; SSE2-NEXT:    movdqa %xmm5, %xmm6
710; SSE2-NEXT:    pandn %xmm2, %xmm6
711; SSE2-NEXT:    psraw $4, %xmm2
712; SSE2-NEXT:    pand %xmm5, %xmm2
713; SSE2-NEXT:    por %xmm6, %xmm2
714; SSE2-NEXT:    paddw %xmm4, %xmm4
715; SSE2-NEXT:    pxor %xmm5, %xmm5
716; SSE2-NEXT:    pcmpgtw %xmm4, %xmm5
717; SSE2-NEXT:    movdqa %xmm5, %xmm6
718; SSE2-NEXT:    pandn %xmm2, %xmm6
719; SSE2-NEXT:    psraw $2, %xmm2
720; SSE2-NEXT:    pand %xmm5, %xmm2
721; SSE2-NEXT:    por %xmm6, %xmm2
722; SSE2-NEXT:    paddw %xmm4, %xmm4
723; SSE2-NEXT:    pxor %xmm5, %xmm5
724; SSE2-NEXT:    pcmpgtw %xmm4, %xmm5
725; SSE2-NEXT:    movdqa %xmm5, %xmm4
726; SSE2-NEXT:    pandn %xmm2, %xmm4
727; SSE2-NEXT:    psraw $1, %xmm2
728; SSE2-NEXT:    pand %xmm5, %xmm2
729; SSE2-NEXT:    por %xmm4, %xmm2
730; SSE2-NEXT:    psrlw $8, %xmm2
731; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
732; SSE2-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
733; SSE2-NEXT:    pxor %xmm4, %xmm4
734; SSE2-NEXT:    pcmpgtw %xmm1, %xmm4
735; SSE2-NEXT:    movdqa %xmm4, %xmm5
736; SSE2-NEXT:    pandn %xmm0, %xmm5
737; SSE2-NEXT:    psraw $4, %xmm0
738; SSE2-NEXT:    pand %xmm4, %xmm0
739; SSE2-NEXT:    por %xmm5, %xmm0
740; SSE2-NEXT:    paddw %xmm1, %xmm1
741; SSE2-NEXT:    pxor %xmm4, %xmm4
742; SSE2-NEXT:    pcmpgtw %xmm1, %xmm4
743; SSE2-NEXT:    movdqa %xmm4, %xmm5
744; SSE2-NEXT:    pandn %xmm0, %xmm5
745; SSE2-NEXT:    psraw $2, %xmm0
746; SSE2-NEXT:    pand %xmm4, %xmm0
747; SSE2-NEXT:    por %xmm5, %xmm0
748; SSE2-NEXT:    paddw %xmm1, %xmm1
749; SSE2-NEXT:    pcmpgtw %xmm1, %xmm3
750; SSE2-NEXT:    movdqa %xmm3, %xmm1
751; SSE2-NEXT:    pandn %xmm0, %xmm1
752; SSE2-NEXT:    psraw $1, %xmm0
753; SSE2-NEXT:    pand %xmm3, %xmm0
754; SSE2-NEXT:    por %xmm1, %xmm0
755; SSE2-NEXT:    psrlw $8, %xmm0
756; SSE2-NEXT:    packuswb %xmm2, %xmm0
757; SSE2-NEXT:    retq
758;
759; SSE41-LABEL: var_shift_v4i8:
760; SSE41:       # %bb.0:
761; SSE41-NEXT:    movdqa %xmm0, %xmm2
762; SSE41-NEXT:    psllw $5, %xmm1
763; SSE41-NEXT:    punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15]
764; SSE41-NEXT:    punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm2[8],xmm3[9],xmm2[9],xmm3[10],xmm2[10],xmm3[11],xmm2[11],xmm3[12],xmm2[12],xmm3[13],xmm2[13],xmm3[14],xmm2[14],xmm3[15],xmm2[15]
765; SSE41-NEXT:    movdqa %xmm3, %xmm4
766; SSE41-NEXT:    psraw $4, %xmm4
767; SSE41-NEXT:    pblendvb %xmm0, %xmm4, %xmm3
768; SSE41-NEXT:    movdqa %xmm3, %xmm4
769; SSE41-NEXT:    psraw $2, %xmm4
770; SSE41-NEXT:    paddw %xmm0, %xmm0
771; SSE41-NEXT:    pblendvb %xmm0, %xmm4, %xmm3
772; SSE41-NEXT:    movdqa %xmm3, %xmm4
773; SSE41-NEXT:    psraw $1, %xmm4
774; SSE41-NEXT:    paddw %xmm0, %xmm0
775; SSE41-NEXT:    pblendvb %xmm0, %xmm4, %xmm3
776; SSE41-NEXT:    psrlw $8, %xmm3
777; SSE41-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
778; SSE41-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
779; SSE41-NEXT:    movdqa %xmm1, %xmm2
780; SSE41-NEXT:    psraw $4, %xmm2
781; SSE41-NEXT:    pblendvb %xmm0, %xmm2, %xmm1
782; SSE41-NEXT:    movdqa %xmm1, %xmm2
783; SSE41-NEXT:    psraw $2, %xmm2
784; SSE41-NEXT:    paddw %xmm0, %xmm0
785; SSE41-NEXT:    pblendvb %xmm0, %xmm2, %xmm1
786; SSE41-NEXT:    movdqa %xmm1, %xmm2
787; SSE41-NEXT:    psraw $1, %xmm2
788; SSE41-NEXT:    paddw %xmm0, %xmm0
789; SSE41-NEXT:    pblendvb %xmm0, %xmm2, %xmm1
790; SSE41-NEXT:    psrlw $8, %xmm1
791; SSE41-NEXT:    packuswb %xmm3, %xmm1
792; SSE41-NEXT:    movdqa %xmm1, %xmm0
793; SSE41-NEXT:    retq
794;
795; AVX-LABEL: var_shift_v4i8:
796; AVX:       # %bb.0:
797; AVX-NEXT:    vpsllw $5, %xmm1, %xmm1
798; AVX-NEXT:    vpunpckhbw {{.*#+}} xmm2 = xmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
799; AVX-NEXT:    vpunpckhbw {{.*#+}} xmm3 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
800; AVX-NEXT:    vpsraw $4, %xmm3, %xmm4
801; AVX-NEXT:    vpblendvb %xmm2, %xmm4, %xmm3, %xmm3
802; AVX-NEXT:    vpsraw $2, %xmm3, %xmm4
803; AVX-NEXT:    vpaddw %xmm2, %xmm2, %xmm2
804; AVX-NEXT:    vpblendvb %xmm2, %xmm4, %xmm3, %xmm3
805; AVX-NEXT:    vpsraw $1, %xmm3, %xmm4
806; AVX-NEXT:    vpaddw %xmm2, %xmm2, %xmm2
807; AVX-NEXT:    vpblendvb %xmm2, %xmm4, %xmm3, %xmm2
808; AVX-NEXT:    vpsrlw $8, %xmm2, %xmm2
809; AVX-NEXT:    vpunpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
810; AVX-NEXT:    vpunpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
811; AVX-NEXT:    vpsraw $4, %xmm0, %xmm3
812; AVX-NEXT:    vpblendvb %xmm1, %xmm3, %xmm0, %xmm0
813; AVX-NEXT:    vpsraw $2, %xmm0, %xmm3
814; AVX-NEXT:    vpaddw %xmm1, %xmm1, %xmm1
815; AVX-NEXT:    vpblendvb %xmm1, %xmm3, %xmm0, %xmm0
816; AVX-NEXT:    vpsraw $1, %xmm0, %xmm3
817; AVX-NEXT:    vpaddw %xmm1, %xmm1, %xmm1
818; AVX-NEXT:    vpblendvb %xmm1, %xmm3, %xmm0, %xmm0
819; AVX-NEXT:    vpsrlw $8, %xmm0, %xmm0
820; AVX-NEXT:    vpackuswb %xmm2, %xmm0, %xmm0
821; AVX-NEXT:    retq
822;
823; XOP-LABEL: var_shift_v4i8:
824; XOP:       # %bb.0:
825; XOP-NEXT:    vpxor %xmm2, %xmm2, %xmm2
826; XOP-NEXT:    vpsubb %xmm1, %xmm2, %xmm1
827; XOP-NEXT:    vpshab %xmm1, %xmm0, %xmm0
828; XOP-NEXT:    retq
829;
830; AVX512DQ-LABEL: var_shift_v4i8:
831; AVX512DQ:       # %bb.0:
832; AVX512DQ-NEXT:    vpmovzxbd {{.*#+}} zmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero,xmm1[8],zero,zero,zero,xmm1[9],zero,zero,zero,xmm1[10],zero,zero,zero,xmm1[11],zero,zero,zero,xmm1[12],zero,zero,zero,xmm1[13],zero,zero,zero,xmm1[14],zero,zero,zero,xmm1[15],zero,zero,zero
833; AVX512DQ-NEXT:    vpmovsxbd %xmm0, %zmm0
834; AVX512DQ-NEXT:    vpsravd %zmm1, %zmm0, %zmm0
835; AVX512DQ-NEXT:    vpmovdb %zmm0, %xmm0
836; AVX512DQ-NEXT:    vzeroupper
837; AVX512DQ-NEXT:    retq
838;
839; AVX512BW-LABEL: var_shift_v4i8:
840; AVX512BW:       # %bb.0:
841; AVX512BW-NEXT:    vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
842; AVX512BW-NEXT:    vpmovsxbw %xmm0, %ymm0
843; AVX512BW-NEXT:    vpsravw %zmm1, %zmm0, %zmm0
844; AVX512BW-NEXT:    vpmovwb %zmm0, %ymm0
845; AVX512BW-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
846; AVX512BW-NEXT:    vzeroupper
847; AVX512BW-NEXT:    retq
848;
849; AVX512DQVL-LABEL: var_shift_v4i8:
850; AVX512DQVL:       # %bb.0:
851; AVX512DQVL-NEXT:    vpmovzxbd {{.*#+}} zmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero,xmm1[8],zero,zero,zero,xmm1[9],zero,zero,zero,xmm1[10],zero,zero,zero,xmm1[11],zero,zero,zero,xmm1[12],zero,zero,zero,xmm1[13],zero,zero,zero,xmm1[14],zero,zero,zero,xmm1[15],zero,zero,zero
852; AVX512DQVL-NEXT:    vpmovsxbd %xmm0, %zmm0
853; AVX512DQVL-NEXT:    vpsravd %zmm1, %zmm0, %zmm0
854; AVX512DQVL-NEXT:    vpmovdb %zmm0, %xmm0
855; AVX512DQVL-NEXT:    vzeroupper
856; AVX512DQVL-NEXT:    retq
857;
858; AVX512BWVL-LABEL: var_shift_v4i8:
859; AVX512BWVL:       # %bb.0:
860; AVX512BWVL-NEXT:    vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
861; AVX512BWVL-NEXT:    vpmovsxbw %xmm0, %ymm0
862; AVX512BWVL-NEXT:    vpsravw %ymm1, %ymm0, %ymm0
863; AVX512BWVL-NEXT:    vpmovwb %ymm0, %xmm0
864; AVX512BWVL-NEXT:    vzeroupper
865; AVX512BWVL-NEXT:    retq
866;
867; X86-SSE-LABEL: var_shift_v4i8:
868; X86-SSE:       # %bb.0:
869; X86-SSE-NEXT:    punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm0[8],xmm2[9],xmm0[9],xmm2[10],xmm0[10],xmm2[11],xmm0[11],xmm2[12],xmm0[12],xmm2[13],xmm0[13],xmm2[14],xmm0[14],xmm2[15],xmm0[15]
870; X86-SSE-NEXT:    psllw $5, %xmm1
871; X86-SSE-NEXT:    punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm1[8],xmm4[9],xmm1[9],xmm4[10],xmm1[10],xmm4[11],xmm1[11],xmm4[12],xmm1[12],xmm4[13],xmm1[13],xmm4[14],xmm1[14],xmm4[15],xmm1[15]
872; X86-SSE-NEXT:    pxor %xmm3, %xmm3
873; X86-SSE-NEXT:    pxor %xmm5, %xmm5
874; X86-SSE-NEXT:    pcmpgtw %xmm4, %xmm5
875; X86-SSE-NEXT:    movdqa %xmm5, %xmm6
876; X86-SSE-NEXT:    pandn %xmm2, %xmm6
877; X86-SSE-NEXT:    psraw $4, %xmm2
878; X86-SSE-NEXT:    pand %xmm5, %xmm2
879; X86-SSE-NEXT:    por %xmm6, %xmm2
880; X86-SSE-NEXT:    paddw %xmm4, %xmm4
881; X86-SSE-NEXT:    pxor %xmm5, %xmm5
882; X86-SSE-NEXT:    pcmpgtw %xmm4, %xmm5
883; X86-SSE-NEXT:    movdqa %xmm5, %xmm6
884; X86-SSE-NEXT:    pandn %xmm2, %xmm6
885; X86-SSE-NEXT:    psraw $2, %xmm2
886; X86-SSE-NEXT:    pand %xmm5, %xmm2
887; X86-SSE-NEXT:    por %xmm6, %xmm2
888; X86-SSE-NEXT:    paddw %xmm4, %xmm4
889; X86-SSE-NEXT:    pxor %xmm5, %xmm5
890; X86-SSE-NEXT:    pcmpgtw %xmm4, %xmm5
891; X86-SSE-NEXT:    movdqa %xmm5, %xmm4
892; X86-SSE-NEXT:    pandn %xmm2, %xmm4
893; X86-SSE-NEXT:    psraw $1, %xmm2
894; X86-SSE-NEXT:    pand %xmm5, %xmm2
895; X86-SSE-NEXT:    por %xmm4, %xmm2
896; X86-SSE-NEXT:    psrlw $8, %xmm2
897; X86-SSE-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
898; X86-SSE-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
899; X86-SSE-NEXT:    pxor %xmm4, %xmm4
900; X86-SSE-NEXT:    pcmpgtw %xmm1, %xmm4
901; X86-SSE-NEXT:    movdqa %xmm4, %xmm5
902; X86-SSE-NEXT:    pandn %xmm0, %xmm5
903; X86-SSE-NEXT:    psraw $4, %xmm0
904; X86-SSE-NEXT:    pand %xmm4, %xmm0
905; X86-SSE-NEXT:    por %xmm5, %xmm0
906; X86-SSE-NEXT:    paddw %xmm1, %xmm1
907; X86-SSE-NEXT:    pxor %xmm4, %xmm4
908; X86-SSE-NEXT:    pcmpgtw %xmm1, %xmm4
909; X86-SSE-NEXT:    movdqa %xmm4, %xmm5
910; X86-SSE-NEXT:    pandn %xmm0, %xmm5
911; X86-SSE-NEXT:    psraw $2, %xmm0
912; X86-SSE-NEXT:    pand %xmm4, %xmm0
913; X86-SSE-NEXT:    por %xmm5, %xmm0
914; X86-SSE-NEXT:    paddw %xmm1, %xmm1
915; X86-SSE-NEXT:    pcmpgtw %xmm1, %xmm3
916; X86-SSE-NEXT:    movdqa %xmm3, %xmm1
917; X86-SSE-NEXT:    pandn %xmm0, %xmm1
918; X86-SSE-NEXT:    psraw $1, %xmm0
919; X86-SSE-NEXT:    pand %xmm3, %xmm0
920; X86-SSE-NEXT:    por %xmm1, %xmm0
921; X86-SSE-NEXT:    psrlw $8, %xmm0
922; X86-SSE-NEXT:    packuswb %xmm2, %xmm0
923; X86-SSE-NEXT:    retl
924  %shift = ashr <4 x i8> %a, %b
925  ret <4 x i8> %shift
926}
927
928define <2 x i8> @var_shift_v2i8(<2 x i8> %a, <2 x i8> %b) nounwind {
929; SSE2-LABEL: var_shift_v2i8:
930; SSE2:       # %bb.0:
931; SSE2-NEXT:    punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm0[8],xmm2[9],xmm0[9],xmm2[10],xmm0[10],xmm2[11],xmm0[11],xmm2[12],xmm0[12],xmm2[13],xmm0[13],xmm2[14],xmm0[14],xmm2[15],xmm0[15]
932; SSE2-NEXT:    psllw $5, %xmm1
933; SSE2-NEXT:    punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm1[8],xmm4[9],xmm1[9],xmm4[10],xmm1[10],xmm4[11],xmm1[11],xmm4[12],xmm1[12],xmm4[13],xmm1[13],xmm4[14],xmm1[14],xmm4[15],xmm1[15]
934; SSE2-NEXT:    pxor %xmm3, %xmm3
935; SSE2-NEXT:    pxor %xmm5, %xmm5
936; SSE2-NEXT:    pcmpgtw %xmm4, %xmm5
937; SSE2-NEXT:    movdqa %xmm5, %xmm6
938; SSE2-NEXT:    pandn %xmm2, %xmm6
939; SSE2-NEXT:    psraw $4, %xmm2
940; SSE2-NEXT:    pand %xmm5, %xmm2
941; SSE2-NEXT:    por %xmm6, %xmm2
942; SSE2-NEXT:    paddw %xmm4, %xmm4
943; SSE2-NEXT:    pxor %xmm5, %xmm5
944; SSE2-NEXT:    pcmpgtw %xmm4, %xmm5
945; SSE2-NEXT:    movdqa %xmm5, %xmm6
946; SSE2-NEXT:    pandn %xmm2, %xmm6
947; SSE2-NEXT:    psraw $2, %xmm2
948; SSE2-NEXT:    pand %xmm5, %xmm2
949; SSE2-NEXT:    por %xmm6, %xmm2
950; SSE2-NEXT:    paddw %xmm4, %xmm4
951; SSE2-NEXT:    pxor %xmm5, %xmm5
952; SSE2-NEXT:    pcmpgtw %xmm4, %xmm5
953; SSE2-NEXT:    movdqa %xmm5, %xmm4
954; SSE2-NEXT:    pandn %xmm2, %xmm4
955; SSE2-NEXT:    psraw $1, %xmm2
956; SSE2-NEXT:    pand %xmm5, %xmm2
957; SSE2-NEXT:    por %xmm4, %xmm2
958; SSE2-NEXT:    psrlw $8, %xmm2
959; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
960; SSE2-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
961; SSE2-NEXT:    pxor %xmm4, %xmm4
962; SSE2-NEXT:    pcmpgtw %xmm1, %xmm4
963; SSE2-NEXT:    movdqa %xmm4, %xmm5
964; SSE2-NEXT:    pandn %xmm0, %xmm5
965; SSE2-NEXT:    psraw $4, %xmm0
966; SSE2-NEXT:    pand %xmm4, %xmm0
967; SSE2-NEXT:    por %xmm5, %xmm0
968; SSE2-NEXT:    paddw %xmm1, %xmm1
969; SSE2-NEXT:    pxor %xmm4, %xmm4
970; SSE2-NEXT:    pcmpgtw %xmm1, %xmm4
971; SSE2-NEXT:    movdqa %xmm4, %xmm5
972; SSE2-NEXT:    pandn %xmm0, %xmm5
973; SSE2-NEXT:    psraw $2, %xmm0
974; SSE2-NEXT:    pand %xmm4, %xmm0
975; SSE2-NEXT:    por %xmm5, %xmm0
976; SSE2-NEXT:    paddw %xmm1, %xmm1
977; SSE2-NEXT:    pcmpgtw %xmm1, %xmm3
978; SSE2-NEXT:    movdqa %xmm3, %xmm1
979; SSE2-NEXT:    pandn %xmm0, %xmm1
980; SSE2-NEXT:    psraw $1, %xmm0
981; SSE2-NEXT:    pand %xmm3, %xmm0
982; SSE2-NEXT:    por %xmm1, %xmm0
983; SSE2-NEXT:    psrlw $8, %xmm0
984; SSE2-NEXT:    packuswb %xmm2, %xmm0
985; SSE2-NEXT:    retq
986;
987; SSE41-LABEL: var_shift_v2i8:
988; SSE41:       # %bb.0:
989; SSE41-NEXT:    movdqa %xmm0, %xmm2
990; SSE41-NEXT:    psllw $5, %xmm1
991; SSE41-NEXT:    punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15]
992; SSE41-NEXT:    punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm2[8],xmm3[9],xmm2[9],xmm3[10],xmm2[10],xmm3[11],xmm2[11],xmm3[12],xmm2[12],xmm3[13],xmm2[13],xmm3[14],xmm2[14],xmm3[15],xmm2[15]
993; SSE41-NEXT:    movdqa %xmm3, %xmm4
994; SSE41-NEXT:    psraw $4, %xmm4
995; SSE41-NEXT:    pblendvb %xmm0, %xmm4, %xmm3
996; SSE41-NEXT:    movdqa %xmm3, %xmm4
997; SSE41-NEXT:    psraw $2, %xmm4
998; SSE41-NEXT:    paddw %xmm0, %xmm0
999; SSE41-NEXT:    pblendvb %xmm0, %xmm4, %xmm3
1000; SSE41-NEXT:    movdqa %xmm3, %xmm4
1001; SSE41-NEXT:    psraw $1, %xmm4
1002; SSE41-NEXT:    paddw %xmm0, %xmm0
1003; SSE41-NEXT:    pblendvb %xmm0, %xmm4, %xmm3
1004; SSE41-NEXT:    psrlw $8, %xmm3
1005; SSE41-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
1006; SSE41-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
1007; SSE41-NEXT:    movdqa %xmm1, %xmm2
1008; SSE41-NEXT:    psraw $4, %xmm2
1009; SSE41-NEXT:    pblendvb %xmm0, %xmm2, %xmm1
1010; SSE41-NEXT:    movdqa %xmm1, %xmm2
1011; SSE41-NEXT:    psraw $2, %xmm2
1012; SSE41-NEXT:    paddw %xmm0, %xmm0
1013; SSE41-NEXT:    pblendvb %xmm0, %xmm2, %xmm1
1014; SSE41-NEXT:    movdqa %xmm1, %xmm2
1015; SSE41-NEXT:    psraw $1, %xmm2
1016; SSE41-NEXT:    paddw %xmm0, %xmm0
1017; SSE41-NEXT:    pblendvb %xmm0, %xmm2, %xmm1
1018; SSE41-NEXT:    psrlw $8, %xmm1
1019; SSE41-NEXT:    packuswb %xmm3, %xmm1
1020; SSE41-NEXT:    movdqa %xmm1, %xmm0
1021; SSE41-NEXT:    retq
1022;
1023; AVX-LABEL: var_shift_v2i8:
1024; AVX:       # %bb.0:
1025; AVX-NEXT:    vpsllw $5, %xmm1, %xmm1
1026; AVX-NEXT:    vpunpckhbw {{.*#+}} xmm2 = xmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
1027; AVX-NEXT:    vpunpckhbw {{.*#+}} xmm3 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
1028; AVX-NEXT:    vpsraw $4, %xmm3, %xmm4
1029; AVX-NEXT:    vpblendvb %xmm2, %xmm4, %xmm3, %xmm3
1030; AVX-NEXT:    vpsraw $2, %xmm3, %xmm4
1031; AVX-NEXT:    vpaddw %xmm2, %xmm2, %xmm2
1032; AVX-NEXT:    vpblendvb %xmm2, %xmm4, %xmm3, %xmm3
1033; AVX-NEXT:    vpsraw $1, %xmm3, %xmm4
1034; AVX-NEXT:    vpaddw %xmm2, %xmm2, %xmm2
1035; AVX-NEXT:    vpblendvb %xmm2, %xmm4, %xmm3, %xmm2
1036; AVX-NEXT:    vpsrlw $8, %xmm2, %xmm2
1037; AVX-NEXT:    vpunpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
1038; AVX-NEXT:    vpunpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
1039; AVX-NEXT:    vpsraw $4, %xmm0, %xmm3
1040; AVX-NEXT:    vpblendvb %xmm1, %xmm3, %xmm0, %xmm0
1041; AVX-NEXT:    vpsraw $2, %xmm0, %xmm3
1042; AVX-NEXT:    vpaddw %xmm1, %xmm1, %xmm1
1043; AVX-NEXT:    vpblendvb %xmm1, %xmm3, %xmm0, %xmm0
1044; AVX-NEXT:    vpsraw $1, %xmm0, %xmm3
1045; AVX-NEXT:    vpaddw %xmm1, %xmm1, %xmm1
1046; AVX-NEXT:    vpblendvb %xmm1, %xmm3, %xmm0, %xmm0
1047; AVX-NEXT:    vpsrlw $8, %xmm0, %xmm0
1048; AVX-NEXT:    vpackuswb %xmm2, %xmm0, %xmm0
1049; AVX-NEXT:    retq
1050;
1051; XOP-LABEL: var_shift_v2i8:
1052; XOP:       # %bb.0:
1053; XOP-NEXT:    vpxor %xmm2, %xmm2, %xmm2
1054; XOP-NEXT:    vpsubb %xmm1, %xmm2, %xmm1
1055; XOP-NEXT:    vpshab %xmm1, %xmm0, %xmm0
1056; XOP-NEXT:    retq
1057;
1058; AVX512DQ-LABEL: var_shift_v2i8:
1059; AVX512DQ:       # %bb.0:
1060; AVX512DQ-NEXT:    vpmovzxbd {{.*#+}} zmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero,xmm1[8],zero,zero,zero,xmm1[9],zero,zero,zero,xmm1[10],zero,zero,zero,xmm1[11],zero,zero,zero,xmm1[12],zero,zero,zero,xmm1[13],zero,zero,zero,xmm1[14],zero,zero,zero,xmm1[15],zero,zero,zero
1061; AVX512DQ-NEXT:    vpmovsxbd %xmm0, %zmm0
1062; AVX512DQ-NEXT:    vpsravd %zmm1, %zmm0, %zmm0
1063; AVX512DQ-NEXT:    vpmovdb %zmm0, %xmm0
1064; AVX512DQ-NEXT:    vzeroupper
1065; AVX512DQ-NEXT:    retq
1066;
1067; AVX512BW-LABEL: var_shift_v2i8:
1068; AVX512BW:       # %bb.0:
1069; AVX512BW-NEXT:    vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
1070; AVX512BW-NEXT:    vpmovsxbw %xmm0, %ymm0
1071; AVX512BW-NEXT:    vpsravw %zmm1, %zmm0, %zmm0
1072; AVX512BW-NEXT:    vpmovwb %zmm0, %ymm0
1073; AVX512BW-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
1074; AVX512BW-NEXT:    vzeroupper
1075; AVX512BW-NEXT:    retq
1076;
1077; AVX512DQVL-LABEL: var_shift_v2i8:
1078; AVX512DQVL:       # %bb.0:
1079; AVX512DQVL-NEXT:    vpmovzxbd {{.*#+}} zmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero,xmm1[8],zero,zero,zero,xmm1[9],zero,zero,zero,xmm1[10],zero,zero,zero,xmm1[11],zero,zero,zero,xmm1[12],zero,zero,zero,xmm1[13],zero,zero,zero,xmm1[14],zero,zero,zero,xmm1[15],zero,zero,zero
1080; AVX512DQVL-NEXT:    vpmovsxbd %xmm0, %zmm0
1081; AVX512DQVL-NEXT:    vpsravd %zmm1, %zmm0, %zmm0
1082; AVX512DQVL-NEXT:    vpmovdb %zmm0, %xmm0
1083; AVX512DQVL-NEXT:    vzeroupper
1084; AVX512DQVL-NEXT:    retq
1085;
1086; AVX512BWVL-LABEL: var_shift_v2i8:
1087; AVX512BWVL:       # %bb.0:
1088; AVX512BWVL-NEXT:    vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
1089; AVX512BWVL-NEXT:    vpmovsxbw %xmm0, %ymm0
1090; AVX512BWVL-NEXT:    vpsravw %ymm1, %ymm0, %ymm0
1091; AVX512BWVL-NEXT:    vpmovwb %ymm0, %xmm0
1092; AVX512BWVL-NEXT:    vzeroupper
1093; AVX512BWVL-NEXT:    retq
1094;
1095; X86-SSE-LABEL: var_shift_v2i8:
1096; X86-SSE:       # %bb.0:
1097; X86-SSE-NEXT:    punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm0[8],xmm2[9],xmm0[9],xmm2[10],xmm0[10],xmm2[11],xmm0[11],xmm2[12],xmm0[12],xmm2[13],xmm0[13],xmm2[14],xmm0[14],xmm2[15],xmm0[15]
1098; X86-SSE-NEXT:    psllw $5, %xmm1
1099; X86-SSE-NEXT:    punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm1[8],xmm4[9],xmm1[9],xmm4[10],xmm1[10],xmm4[11],xmm1[11],xmm4[12],xmm1[12],xmm4[13],xmm1[13],xmm4[14],xmm1[14],xmm4[15],xmm1[15]
1100; X86-SSE-NEXT:    pxor %xmm3, %xmm3
1101; X86-SSE-NEXT:    pxor %xmm5, %xmm5
1102; X86-SSE-NEXT:    pcmpgtw %xmm4, %xmm5
1103; X86-SSE-NEXT:    movdqa %xmm5, %xmm6
1104; X86-SSE-NEXT:    pandn %xmm2, %xmm6
1105; X86-SSE-NEXT:    psraw $4, %xmm2
1106; X86-SSE-NEXT:    pand %xmm5, %xmm2
1107; X86-SSE-NEXT:    por %xmm6, %xmm2
1108; X86-SSE-NEXT:    paddw %xmm4, %xmm4
1109; X86-SSE-NEXT:    pxor %xmm5, %xmm5
1110; X86-SSE-NEXT:    pcmpgtw %xmm4, %xmm5
1111; X86-SSE-NEXT:    movdqa %xmm5, %xmm6
1112; X86-SSE-NEXT:    pandn %xmm2, %xmm6
1113; X86-SSE-NEXT:    psraw $2, %xmm2
1114; X86-SSE-NEXT:    pand %xmm5, %xmm2
1115; X86-SSE-NEXT:    por %xmm6, %xmm2
1116; X86-SSE-NEXT:    paddw %xmm4, %xmm4
1117; X86-SSE-NEXT:    pxor %xmm5, %xmm5
1118; X86-SSE-NEXT:    pcmpgtw %xmm4, %xmm5
1119; X86-SSE-NEXT:    movdqa %xmm5, %xmm4
1120; X86-SSE-NEXT:    pandn %xmm2, %xmm4
1121; X86-SSE-NEXT:    psraw $1, %xmm2
1122; X86-SSE-NEXT:    pand %xmm5, %xmm2
1123; X86-SSE-NEXT:    por %xmm4, %xmm2
1124; X86-SSE-NEXT:    psrlw $8, %xmm2
1125; X86-SSE-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
1126; X86-SSE-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
1127; X86-SSE-NEXT:    pxor %xmm4, %xmm4
1128; X86-SSE-NEXT:    pcmpgtw %xmm1, %xmm4
1129; X86-SSE-NEXT:    movdqa %xmm4, %xmm5
1130; X86-SSE-NEXT:    pandn %xmm0, %xmm5
1131; X86-SSE-NEXT:    psraw $4, %xmm0
1132; X86-SSE-NEXT:    pand %xmm4, %xmm0
1133; X86-SSE-NEXT:    por %xmm5, %xmm0
1134; X86-SSE-NEXT:    paddw %xmm1, %xmm1
1135; X86-SSE-NEXT:    pxor %xmm4, %xmm4
1136; X86-SSE-NEXT:    pcmpgtw %xmm1, %xmm4
1137; X86-SSE-NEXT:    movdqa %xmm4, %xmm5
1138; X86-SSE-NEXT:    pandn %xmm0, %xmm5
1139; X86-SSE-NEXT:    psraw $2, %xmm0
1140; X86-SSE-NEXT:    pand %xmm4, %xmm0
1141; X86-SSE-NEXT:    por %xmm5, %xmm0
1142; X86-SSE-NEXT:    paddw %xmm1, %xmm1
1143; X86-SSE-NEXT:    pcmpgtw %xmm1, %xmm3
1144; X86-SSE-NEXT:    movdqa %xmm3, %xmm1
1145; X86-SSE-NEXT:    pandn %xmm0, %xmm1
1146; X86-SSE-NEXT:    psraw $1, %xmm0
1147; X86-SSE-NEXT:    pand %xmm3, %xmm0
1148; X86-SSE-NEXT:    por %xmm1, %xmm0
1149; X86-SSE-NEXT:    psrlw $8, %xmm0
1150; X86-SSE-NEXT:    packuswb %xmm2, %xmm0
1151; X86-SSE-NEXT:    retl
1152  %shift = ashr <2 x i8> %a, %b
1153  ret <2 x i8> %shift
1154}
1155
1156;
1157; Uniform Variable Shifts
1158;
1159
1160define <2 x i32> @splatvar_shift_v2i32(<2 x i32> %a, <2 x i32> %b) nounwind {
1161; SSE2-LABEL: splatvar_shift_v2i32:
1162; SSE2:       # %bb.0:
1163; SSE2-NEXT:    xorps %xmm2, %xmm2
1164; SSE2-NEXT:    movss {{.*#+}} xmm2 = xmm1[0],xmm2[1,2,3]
1165; SSE2-NEXT:    psrad %xmm2, %xmm0
1166; SSE2-NEXT:    retq
1167;
1168; SSE41-LABEL: splatvar_shift_v2i32:
1169; SSE41:       # %bb.0:
1170; SSE41-NEXT:    pmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero
1171; SSE41-NEXT:    psrad %xmm1, %xmm0
1172; SSE41-NEXT:    retq
1173;
1174; AVX-LABEL: splatvar_shift_v2i32:
1175; AVX:       # %bb.0:
1176; AVX-NEXT:    vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero
1177; AVX-NEXT:    vpsrad %xmm1, %xmm0, %xmm0
1178; AVX-NEXT:    retq
1179;
1180; XOP-LABEL: splatvar_shift_v2i32:
1181; XOP:       # %bb.0:
1182; XOP-NEXT:    vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero
1183; XOP-NEXT:    vpsrad %xmm1, %xmm0, %xmm0
1184; XOP-NEXT:    retq
1185;
1186; AVX512-LABEL: splatvar_shift_v2i32:
1187; AVX512:       # %bb.0:
1188; AVX512-NEXT:    vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero
1189; AVX512-NEXT:    vpsrad %xmm1, %xmm0, %xmm0
1190; AVX512-NEXT:    retq
1191;
1192; AVX512VL-LABEL: splatvar_shift_v2i32:
1193; AVX512VL:       # %bb.0:
1194; AVX512VL-NEXT:    vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero
1195; AVX512VL-NEXT:    vpsrad %xmm1, %xmm0, %xmm0
1196; AVX512VL-NEXT:    retq
1197;
1198; X86-SSE-LABEL: splatvar_shift_v2i32:
1199; X86-SSE:       # %bb.0:
1200; X86-SSE-NEXT:    xorps %xmm2, %xmm2
1201; X86-SSE-NEXT:    movss {{.*#+}} xmm2 = xmm1[0],xmm2[1,2,3]
1202; X86-SSE-NEXT:    psrad %xmm2, %xmm0
1203; X86-SSE-NEXT:    retl
1204  %splat = shufflevector <2 x i32> %b, <2 x i32> undef, <2 x i32> zeroinitializer
1205  %shift = ashr <2 x i32> %a, %splat
1206  ret <2 x i32> %shift
1207}
1208
1209define <4 x i16> @splatvar_shift_v4i16(<4 x i16> %a, <4 x i16> %b) nounwind {
1210; SSE2-LABEL: splatvar_shift_v4i16:
1211; SSE2:       # %bb.0:
1212; SSE2-NEXT:    pslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0,1]
1213; SSE2-NEXT:    psrldq {{.*#+}} xmm1 = xmm1[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
1214; SSE2-NEXT:    psraw %xmm1, %xmm0
1215; SSE2-NEXT:    retq
1216;
1217; SSE41-LABEL: splatvar_shift_v4i16:
1218; SSE41:       # %bb.0:
1219; SSE41-NEXT:    pmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
1220; SSE41-NEXT:    psraw %xmm1, %xmm0
1221; SSE41-NEXT:    retq
1222;
1223; AVX-LABEL: splatvar_shift_v4i16:
1224; AVX:       # %bb.0:
1225; AVX-NEXT:    vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
1226; AVX-NEXT:    vpsraw %xmm1, %xmm0, %xmm0
1227; AVX-NEXT:    retq
1228;
1229; XOP-LABEL: splatvar_shift_v4i16:
1230; XOP:       # %bb.0:
1231; XOP-NEXT:    vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
1232; XOP-NEXT:    vpsraw %xmm1, %xmm0, %xmm0
1233; XOP-NEXT:    retq
1234;
1235; AVX512-LABEL: splatvar_shift_v4i16:
1236; AVX512:       # %bb.0:
1237; AVX512-NEXT:    vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
1238; AVX512-NEXT:    vpsraw %xmm1, %xmm0, %xmm0
1239; AVX512-NEXT:    retq
1240;
1241; AVX512VL-LABEL: splatvar_shift_v4i16:
1242; AVX512VL:       # %bb.0:
1243; AVX512VL-NEXT:    vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
1244; AVX512VL-NEXT:    vpsraw %xmm1, %xmm0, %xmm0
1245; AVX512VL-NEXT:    retq
1246;
1247; X86-SSE-LABEL: splatvar_shift_v4i16:
1248; X86-SSE:       # %bb.0:
1249; X86-SSE-NEXT:    pslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0,1]
1250; X86-SSE-NEXT:    psrldq {{.*#+}} xmm1 = xmm1[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
1251; X86-SSE-NEXT:    psraw %xmm1, %xmm0
1252; X86-SSE-NEXT:    retl
1253  %splat = shufflevector <4 x i16> %b, <4 x i16> undef, <4 x i32> zeroinitializer
1254  %shift = ashr <4 x i16> %a, %splat
1255  ret <4 x i16> %shift
1256}
1257
1258define <2 x i16> @splatvar_shift_v2i16(<2 x i16> %a, <2 x i16> %b) nounwind {
1259; SSE2-LABEL: splatvar_shift_v2i16:
1260; SSE2:       # %bb.0:
1261; SSE2-NEXT:    pslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0,1]
1262; SSE2-NEXT:    psrldq {{.*#+}} xmm1 = xmm1[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
1263; SSE2-NEXT:    psraw %xmm1, %xmm0
1264; SSE2-NEXT:    retq
1265;
1266; SSE41-LABEL: splatvar_shift_v2i16:
1267; SSE41:       # %bb.0:
1268; SSE41-NEXT:    pmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
1269; SSE41-NEXT:    psraw %xmm1, %xmm0
1270; SSE41-NEXT:    retq
1271;
1272; AVX-LABEL: splatvar_shift_v2i16:
1273; AVX:       # %bb.0:
1274; AVX-NEXT:    vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
1275; AVX-NEXT:    vpsraw %xmm1, %xmm0, %xmm0
1276; AVX-NEXT:    retq
1277;
1278; XOP-LABEL: splatvar_shift_v2i16:
1279; XOP:       # %bb.0:
1280; XOP-NEXT:    vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
1281; XOP-NEXT:    vpsraw %xmm1, %xmm0, %xmm0
1282; XOP-NEXT:    retq
1283;
1284; AVX512-LABEL: splatvar_shift_v2i16:
1285; AVX512:       # %bb.0:
1286; AVX512-NEXT:    vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
1287; AVX512-NEXT:    vpsraw %xmm1, %xmm0, %xmm0
1288; AVX512-NEXT:    retq
1289;
1290; AVX512VL-LABEL: splatvar_shift_v2i16:
1291; AVX512VL:       # %bb.0:
1292; AVX512VL-NEXT:    vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
1293; AVX512VL-NEXT:    vpsraw %xmm1, %xmm0, %xmm0
1294; AVX512VL-NEXT:    retq
1295;
1296; X86-SSE-LABEL: splatvar_shift_v2i16:
1297; X86-SSE:       # %bb.0:
1298; X86-SSE-NEXT:    pslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0,1]
1299; X86-SSE-NEXT:    psrldq {{.*#+}} xmm1 = xmm1[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
1300; X86-SSE-NEXT:    psraw %xmm1, %xmm0
1301; X86-SSE-NEXT:    retl
1302  %splat = shufflevector <2 x i16> %b, <2 x i16> undef, <2 x i32> zeroinitializer
1303  %shift = ashr <2 x i16> %a, %splat
1304  ret <2 x i16> %shift
1305}
1306
1307define <8 x i8> @splatvar_shift_v8i8(<8 x i8> %a, <8 x i8> %b) nounwind {
1308; SSE2-LABEL: splatvar_shift_v8i8:
1309; SSE2:       # %bb.0:
1310; SSE2-NEXT:    pslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0]
1311; SSE2-NEXT:    psrldq {{.*#+}} xmm1 = xmm1[15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
1312; SSE2-NEXT:    psrlw %xmm1, %xmm0
1313; SSE2-NEXT:    pcmpeqd %xmm2, %xmm2
1314; SSE2-NEXT:    psrlw %xmm1, %xmm2
1315; SSE2-NEXT:    psrlw $8, %xmm2
1316; SSE2-NEXT:    punpcklbw {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
1317; SSE2-NEXT:    pshuflw {{.*#+}} xmm2 = xmm2[0,0,0,0,4,5,6,7]
1318; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[0,0,0,0]
1319; SSE2-NEXT:    pand %xmm2, %xmm0
1320; SSE2-NEXT:    movdqa {{.*#+}} xmm2 = [32896,32896,32896,32896,32896,32896,32896,32896]
1321; SSE2-NEXT:    psrlw %xmm1, %xmm2
1322; SSE2-NEXT:    pxor %xmm2, %xmm0
1323; SSE2-NEXT:    psubb %xmm2, %xmm0
1324; SSE2-NEXT:    retq
1325;
1326; SSE41-LABEL: splatvar_shift_v8i8:
1327; SSE41:       # %bb.0:
1328; SSE41-NEXT:    pmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
1329; SSE41-NEXT:    psrlw %xmm1, %xmm0
1330; SSE41-NEXT:    pcmpeqd %xmm2, %xmm2
1331; SSE41-NEXT:    psrlw %xmm1, %xmm2
1332; SSE41-NEXT:    pshufb {{.*#+}} xmm2 = xmm2[1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
1333; SSE41-NEXT:    pand %xmm2, %xmm0
1334; SSE41-NEXT:    movdqa {{.*#+}} xmm2 = [32896,32896,32896,32896,32896,32896,32896,32896]
1335; SSE41-NEXT:    psrlw %xmm1, %xmm2
1336; SSE41-NEXT:    pxor %xmm2, %xmm0
1337; SSE41-NEXT:    psubb %xmm2, %xmm0
1338; SSE41-NEXT:    retq
1339;
1340; AVX1-LABEL: splatvar_shift_v8i8:
1341; AVX1:       # %bb.0:
1342; AVX1-NEXT:    vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
1343; AVX1-NEXT:    vpsrlw %xmm1, %xmm0, %xmm0
1344; AVX1-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2
1345; AVX1-NEXT:    vpsrlw %xmm1, %xmm2, %xmm2
1346; AVX1-NEXT:    vpshufb {{.*#+}} xmm2 = xmm2[1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
1347; AVX1-NEXT:    vpand %xmm2, %xmm0, %xmm0
1348; AVX1-NEXT:    vmovdqa {{.*#+}} xmm2 = [32896,32896,32896,32896,32896,32896,32896,32896]
1349; AVX1-NEXT:    vpsrlw %xmm1, %xmm2, %xmm1
1350; AVX1-NEXT:    vpxor %xmm1, %xmm0, %xmm0
1351; AVX1-NEXT:    vpsubb %xmm1, %xmm0, %xmm0
1352; AVX1-NEXT:    retq
1353;
1354; AVX2-LABEL: splatvar_shift_v8i8:
1355; AVX2:       # %bb.0:
1356; AVX2-NEXT:    vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
1357; AVX2-NEXT:    vpsrlw %xmm1, %xmm0, %xmm0
1358; AVX2-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2
1359; AVX2-NEXT:    vpsrlw %xmm1, %xmm2, %xmm2
1360; AVX2-NEXT:    vpsrlw $8, %xmm2, %xmm2
1361; AVX2-NEXT:    vpbroadcastb %xmm2, %xmm2
1362; AVX2-NEXT:    vpand %xmm2, %xmm0, %xmm0
1363; AVX2-NEXT:    vmovdqa {{.*#+}} xmm2 = [32896,32896,32896,32896,32896,32896,32896,32896]
1364; AVX2-NEXT:    vpsrlw %xmm1, %xmm2, %xmm1
1365; AVX2-NEXT:    vpxor %xmm1, %xmm0, %xmm0
1366; AVX2-NEXT:    vpsubb %xmm1, %xmm0, %xmm0
1367; AVX2-NEXT:    retq
1368;
1369; XOPAVX1-LABEL: splatvar_shift_v8i8:
1370; XOPAVX1:       # %bb.0:
1371; XOPAVX1-NEXT:    vpunpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
1372; XOPAVX1-NEXT:    vpshuflw {{.*#+}} xmm1 = xmm1[0,0,0,0,4,5,6,7]
1373; XOPAVX1-NEXT:    vpxor %xmm2, %xmm2, %xmm2
1374; XOPAVX1-NEXT:    vpsubb %xmm1, %xmm2, %xmm1
1375; XOPAVX1-NEXT:    vpshab %xmm1, %xmm0, %xmm0
1376; XOPAVX1-NEXT:    retq
1377;
1378; XOPAVX2-LABEL: splatvar_shift_v8i8:
1379; XOPAVX2:       # %bb.0:
1380; XOPAVX2-NEXT:    vpbroadcastb %xmm1, %xmm1
1381; XOPAVX2-NEXT:    vpxor %xmm2, %xmm2, %xmm2
1382; XOPAVX2-NEXT:    vpsubb %xmm1, %xmm2, %xmm1
1383; XOPAVX2-NEXT:    vpshab %xmm1, %xmm0, %xmm0
1384; XOPAVX2-NEXT:    retq
1385;
1386; AVX512DQ-LABEL: splatvar_shift_v8i8:
1387; AVX512DQ:       # %bb.0:
1388; AVX512DQ-NEXT:    vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
1389; AVX512DQ-NEXT:    vpmovsxbd %xmm0, %zmm0
1390; AVX512DQ-NEXT:    vpsrad %xmm1, %zmm0, %zmm0
1391; AVX512DQ-NEXT:    vpmovdb %zmm0, %xmm0
1392; AVX512DQ-NEXT:    vzeroupper
1393; AVX512DQ-NEXT:    retq
1394;
1395; AVX512BW-LABEL: splatvar_shift_v8i8:
1396; AVX512BW:       # %bb.0:
1397; AVX512BW-NEXT:    vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
1398; AVX512BW-NEXT:    vpmovsxbw %xmm0, %ymm0
1399; AVX512BW-NEXT:    vpsraw %xmm1, %ymm0, %ymm0
1400; AVX512BW-NEXT:    vpmovwb %zmm0, %ymm0
1401; AVX512BW-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
1402; AVX512BW-NEXT:    vzeroupper
1403; AVX512BW-NEXT:    retq
1404;
1405; AVX512DQVL-LABEL: splatvar_shift_v8i8:
1406; AVX512DQVL:       # %bb.0:
1407; AVX512DQVL-NEXT:    vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
1408; AVX512DQVL-NEXT:    vpmovsxbd %xmm0, %zmm0
1409; AVX512DQVL-NEXT:    vpsrad %xmm1, %zmm0, %zmm0
1410; AVX512DQVL-NEXT:    vpmovdb %zmm0, %xmm0
1411; AVX512DQVL-NEXT:    vzeroupper
1412; AVX512DQVL-NEXT:    retq
1413;
1414; AVX512BWVL-LABEL: splatvar_shift_v8i8:
1415; AVX512BWVL:       # %bb.0:
1416; AVX512BWVL-NEXT:    vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
1417; AVX512BWVL-NEXT:    vpmovsxbw %xmm0, %ymm0
1418; AVX512BWVL-NEXT:    vpsraw %xmm1, %ymm0, %ymm0
1419; AVX512BWVL-NEXT:    vpmovwb %ymm0, %xmm0
1420; AVX512BWVL-NEXT:    vzeroupper
1421; AVX512BWVL-NEXT:    retq
1422;
1423; X86-SSE-LABEL: splatvar_shift_v8i8:
1424; X86-SSE:       # %bb.0:
1425; X86-SSE-NEXT:    pslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0]
1426; X86-SSE-NEXT:    psrldq {{.*#+}} xmm1 = xmm1[15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
1427; X86-SSE-NEXT:    psrlw %xmm1, %xmm0
1428; X86-SSE-NEXT:    pcmpeqd %xmm2, %xmm2
1429; X86-SSE-NEXT:    psrlw %xmm1, %xmm2
1430; X86-SSE-NEXT:    psrlw $8, %xmm2
1431; X86-SSE-NEXT:    punpcklbw {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
1432; X86-SSE-NEXT:    pshuflw {{.*#+}} xmm2 = xmm2[0,0,0,0,4,5,6,7]
1433; X86-SSE-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[0,0,0,0]
1434; X86-SSE-NEXT:    pand %xmm2, %xmm0
1435; X86-SSE-NEXT:    movdqa {{.*#+}} xmm2 = [32896,32896,32896,32896,32896,32896,32896,32896]
1436; X86-SSE-NEXT:    psrlw %xmm1, %xmm2
1437; X86-SSE-NEXT:    pxor %xmm2, %xmm0
1438; X86-SSE-NEXT:    psubb %xmm2, %xmm0
1439; X86-SSE-NEXT:    retl
1440  %splat = shufflevector <8 x i8> %b, <8 x i8> undef, <8 x i32> zeroinitializer
1441  %shift = ashr <8 x i8> %a, %splat
1442  ret <8 x i8> %shift
1443}
1444
1445define <4 x i8> @splatvar_shift_v4i8(<4 x i8> %a, <4 x i8> %b) nounwind {
1446; SSE2-LABEL: splatvar_shift_v4i8:
1447; SSE2:       # %bb.0:
1448; SSE2-NEXT:    pslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0]
1449; SSE2-NEXT:    psrldq {{.*#+}} xmm1 = xmm1[15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
1450; SSE2-NEXT:    psrlw %xmm1, %xmm0
1451; SSE2-NEXT:    pcmpeqd %xmm2, %xmm2
1452; SSE2-NEXT:    psrlw %xmm1, %xmm2
1453; SSE2-NEXT:    psrlw $8, %xmm2
1454; SSE2-NEXT:    punpcklbw {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
1455; SSE2-NEXT:    pshuflw {{.*#+}} xmm2 = xmm2[0,0,0,0,4,5,6,7]
1456; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[0,0,0,0]
1457; SSE2-NEXT:    pand %xmm2, %xmm0
1458; SSE2-NEXT:    movdqa {{.*#+}} xmm2 = [32896,32896,32896,32896,32896,32896,32896,32896]
1459; SSE2-NEXT:    psrlw %xmm1, %xmm2
1460; SSE2-NEXT:    pxor %xmm2, %xmm0
1461; SSE2-NEXT:    psubb %xmm2, %xmm0
1462; SSE2-NEXT:    retq
1463;
1464; SSE41-LABEL: splatvar_shift_v4i8:
1465; SSE41:       # %bb.0:
1466; SSE41-NEXT:    pmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
1467; SSE41-NEXT:    psrlw %xmm1, %xmm0
1468; SSE41-NEXT:    pcmpeqd %xmm2, %xmm2
1469; SSE41-NEXT:    psrlw %xmm1, %xmm2
1470; SSE41-NEXT:    pshufb {{.*#+}} xmm2 = xmm2[1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
1471; SSE41-NEXT:    pand %xmm2, %xmm0
1472; SSE41-NEXT:    movdqa {{.*#+}} xmm2 = [32896,32896,32896,32896,32896,32896,32896,32896]
1473; SSE41-NEXT:    psrlw %xmm1, %xmm2
1474; SSE41-NEXT:    pxor %xmm2, %xmm0
1475; SSE41-NEXT:    psubb %xmm2, %xmm0
1476; SSE41-NEXT:    retq
1477;
1478; AVX1-LABEL: splatvar_shift_v4i8:
1479; AVX1:       # %bb.0:
1480; AVX1-NEXT:    vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
1481; AVX1-NEXT:    vpsrlw %xmm1, %xmm0, %xmm0
1482; AVX1-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2
1483; AVX1-NEXT:    vpsrlw %xmm1, %xmm2, %xmm2
1484; AVX1-NEXT:    vpshufb {{.*#+}} xmm2 = xmm2[1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
1485; AVX1-NEXT:    vpand %xmm2, %xmm0, %xmm0
1486; AVX1-NEXT:    vmovdqa {{.*#+}} xmm2 = [32896,32896,32896,32896,32896,32896,32896,32896]
1487; AVX1-NEXT:    vpsrlw %xmm1, %xmm2, %xmm1
1488; AVX1-NEXT:    vpxor %xmm1, %xmm0, %xmm0
1489; AVX1-NEXT:    vpsubb %xmm1, %xmm0, %xmm0
1490; AVX1-NEXT:    retq
1491;
1492; AVX2-LABEL: splatvar_shift_v4i8:
1493; AVX2:       # %bb.0:
1494; AVX2-NEXT:    vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
1495; AVX2-NEXT:    vpsrlw %xmm1, %xmm0, %xmm0
1496; AVX2-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2
1497; AVX2-NEXT:    vpsrlw %xmm1, %xmm2, %xmm2
1498; AVX2-NEXT:    vpsrlw $8, %xmm2, %xmm2
1499; AVX2-NEXT:    vpbroadcastb %xmm2, %xmm2
1500; AVX2-NEXT:    vpand %xmm2, %xmm0, %xmm0
1501; AVX2-NEXT:    vmovdqa {{.*#+}} xmm2 = [32896,32896,32896,32896,32896,32896,32896,32896]
1502; AVX2-NEXT:    vpsrlw %xmm1, %xmm2, %xmm1
1503; AVX2-NEXT:    vpxor %xmm1, %xmm0, %xmm0
1504; AVX2-NEXT:    vpsubb %xmm1, %xmm0, %xmm0
1505; AVX2-NEXT:    retq
1506;
1507; XOPAVX1-LABEL: splatvar_shift_v4i8:
1508; XOPAVX1:       # %bb.0:
1509; XOPAVX1-NEXT:    vpunpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
1510; XOPAVX1-NEXT:    vpshuflw {{.*#+}} xmm1 = xmm1[0,0,0,0,4,5,6,7]
1511; XOPAVX1-NEXT:    vpxor %xmm2, %xmm2, %xmm2
1512; XOPAVX1-NEXT:    vpsubb %xmm1, %xmm2, %xmm1
1513; XOPAVX1-NEXT:    vpshab %xmm1, %xmm0, %xmm0
1514; XOPAVX1-NEXT:    retq
1515;
1516; XOPAVX2-LABEL: splatvar_shift_v4i8:
1517; XOPAVX2:       # %bb.0:
1518; XOPAVX2-NEXT:    vpbroadcastb %xmm1, %xmm1
1519; XOPAVX2-NEXT:    vpxor %xmm2, %xmm2, %xmm2
1520; XOPAVX2-NEXT:    vpsubb %xmm1, %xmm2, %xmm1
1521; XOPAVX2-NEXT:    vpshab %xmm1, %xmm0, %xmm0
1522; XOPAVX2-NEXT:    retq
1523;
1524; AVX512DQ-LABEL: splatvar_shift_v4i8:
1525; AVX512DQ:       # %bb.0:
1526; AVX512DQ-NEXT:    vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
1527; AVX512DQ-NEXT:    vpmovsxbd %xmm0, %zmm0
1528; AVX512DQ-NEXT:    vpsrad %xmm1, %zmm0, %zmm0
1529; AVX512DQ-NEXT:    vpmovdb %zmm0, %xmm0
1530; AVX512DQ-NEXT:    vzeroupper
1531; AVX512DQ-NEXT:    retq
1532;
1533; AVX512BW-LABEL: splatvar_shift_v4i8:
1534; AVX512BW:       # %bb.0:
1535; AVX512BW-NEXT:    vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
1536; AVX512BW-NEXT:    vpmovsxbw %xmm0, %ymm0
1537; AVX512BW-NEXT:    vpsraw %xmm1, %ymm0, %ymm0
1538; AVX512BW-NEXT:    vpmovwb %zmm0, %ymm0
1539; AVX512BW-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
1540; AVX512BW-NEXT:    vzeroupper
1541; AVX512BW-NEXT:    retq
1542;
1543; AVX512DQVL-LABEL: splatvar_shift_v4i8:
1544; AVX512DQVL:       # %bb.0:
1545; AVX512DQVL-NEXT:    vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
1546; AVX512DQVL-NEXT:    vpmovsxbd %xmm0, %zmm0
1547; AVX512DQVL-NEXT:    vpsrad %xmm1, %zmm0, %zmm0
1548; AVX512DQVL-NEXT:    vpmovdb %zmm0, %xmm0
1549; AVX512DQVL-NEXT:    vzeroupper
1550; AVX512DQVL-NEXT:    retq
1551;
1552; AVX512BWVL-LABEL: splatvar_shift_v4i8:
1553; AVX512BWVL:       # %bb.0:
1554; AVX512BWVL-NEXT:    vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
1555; AVX512BWVL-NEXT:    vpmovsxbw %xmm0, %ymm0
1556; AVX512BWVL-NEXT:    vpsraw %xmm1, %ymm0, %ymm0
1557; AVX512BWVL-NEXT:    vpmovwb %ymm0, %xmm0
1558; AVX512BWVL-NEXT:    vzeroupper
1559; AVX512BWVL-NEXT:    retq
1560;
1561; X86-SSE-LABEL: splatvar_shift_v4i8:
1562; X86-SSE:       # %bb.0:
1563; X86-SSE-NEXT:    pslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0]
1564; X86-SSE-NEXT:    psrldq {{.*#+}} xmm1 = xmm1[15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
1565; X86-SSE-NEXT:    psrlw %xmm1, %xmm0
1566; X86-SSE-NEXT:    pcmpeqd %xmm2, %xmm2
1567; X86-SSE-NEXT:    psrlw %xmm1, %xmm2
1568; X86-SSE-NEXT:    psrlw $8, %xmm2
1569; X86-SSE-NEXT:    punpcklbw {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
1570; X86-SSE-NEXT:    pshuflw {{.*#+}} xmm2 = xmm2[0,0,0,0,4,5,6,7]
1571; X86-SSE-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[0,0,0,0]
1572; X86-SSE-NEXT:    pand %xmm2, %xmm0
1573; X86-SSE-NEXT:    movdqa {{.*#+}} xmm2 = [32896,32896,32896,32896,32896,32896,32896,32896]
1574; X86-SSE-NEXT:    psrlw %xmm1, %xmm2
1575; X86-SSE-NEXT:    pxor %xmm2, %xmm0
1576; X86-SSE-NEXT:    psubb %xmm2, %xmm0
1577; X86-SSE-NEXT:    retl
1578  %splat = shufflevector <4 x i8> %b, <4 x i8> undef, <4 x i32> zeroinitializer
1579  %shift = ashr <4 x i8> %a, %splat
1580  ret <4 x i8> %shift
1581}
1582
1583define <2 x i8> @splatvar_shift_v2i8(<2 x i8> %a, <2 x i8> %b) nounwind {
1584; SSE2-LABEL: splatvar_shift_v2i8:
1585; SSE2:       # %bb.0:
1586; SSE2-NEXT:    pslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0]
1587; SSE2-NEXT:    psrldq {{.*#+}} xmm1 = xmm1[15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
1588; SSE2-NEXT:    psrlw %xmm1, %xmm0
1589; SSE2-NEXT:    pcmpeqd %xmm2, %xmm2
1590; SSE2-NEXT:    psrlw %xmm1, %xmm2
1591; SSE2-NEXT:    psrlw $8, %xmm2
1592; SSE2-NEXT:    punpcklbw {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
1593; SSE2-NEXT:    pshuflw {{.*#+}} xmm2 = xmm2[0,0,0,0,4,5,6,7]
1594; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[0,0,0,0]
1595; SSE2-NEXT:    pand %xmm2, %xmm0
1596; SSE2-NEXT:    movdqa {{.*#+}} xmm2 = [32896,32896,32896,32896,32896,32896,32896,32896]
1597; SSE2-NEXT:    psrlw %xmm1, %xmm2
1598; SSE2-NEXT:    pxor %xmm2, %xmm0
1599; SSE2-NEXT:    psubb %xmm2, %xmm0
1600; SSE2-NEXT:    retq
1601;
1602; SSE41-LABEL: splatvar_shift_v2i8:
1603; SSE41:       # %bb.0:
1604; SSE41-NEXT:    pmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
1605; SSE41-NEXT:    psrlw %xmm1, %xmm0
1606; SSE41-NEXT:    pcmpeqd %xmm2, %xmm2
1607; SSE41-NEXT:    psrlw %xmm1, %xmm2
1608; SSE41-NEXT:    pshufb {{.*#+}} xmm2 = xmm2[1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
1609; SSE41-NEXT:    pand %xmm2, %xmm0
1610; SSE41-NEXT:    movdqa {{.*#+}} xmm2 = [32896,32896,32896,32896,32896,32896,32896,32896]
1611; SSE41-NEXT:    psrlw %xmm1, %xmm2
1612; SSE41-NEXT:    pxor %xmm2, %xmm0
1613; SSE41-NEXT:    psubb %xmm2, %xmm0
1614; SSE41-NEXT:    retq
1615;
1616; AVX1-LABEL: splatvar_shift_v2i8:
1617; AVX1:       # %bb.0:
1618; AVX1-NEXT:    vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
1619; AVX1-NEXT:    vpsrlw %xmm1, %xmm0, %xmm0
1620; AVX1-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2
1621; AVX1-NEXT:    vpsrlw %xmm1, %xmm2, %xmm2
1622; AVX1-NEXT:    vpshufb {{.*#+}} xmm2 = xmm2[1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
1623; AVX1-NEXT:    vpand %xmm2, %xmm0, %xmm0
1624; AVX1-NEXT:    vmovdqa {{.*#+}} xmm2 = [32896,32896,32896,32896,32896,32896,32896,32896]
1625; AVX1-NEXT:    vpsrlw %xmm1, %xmm2, %xmm1
1626; AVX1-NEXT:    vpxor %xmm1, %xmm0, %xmm0
1627; AVX1-NEXT:    vpsubb %xmm1, %xmm0, %xmm0
1628; AVX1-NEXT:    retq
1629;
1630; AVX2-LABEL: splatvar_shift_v2i8:
1631; AVX2:       # %bb.0:
1632; AVX2-NEXT:    vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
1633; AVX2-NEXT:    vpsrlw %xmm1, %xmm0, %xmm0
1634; AVX2-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2
1635; AVX2-NEXT:    vpsrlw %xmm1, %xmm2, %xmm2
1636; AVX2-NEXT:    vpsrlw $8, %xmm2, %xmm2
1637; AVX2-NEXT:    vpbroadcastb %xmm2, %xmm2
1638; AVX2-NEXT:    vpand %xmm2, %xmm0, %xmm0
1639; AVX2-NEXT:    vmovdqa {{.*#+}} xmm2 = [32896,32896,32896,32896,32896,32896,32896,32896]
1640; AVX2-NEXT:    vpsrlw %xmm1, %xmm2, %xmm1
1641; AVX2-NEXT:    vpxor %xmm1, %xmm0, %xmm0
1642; AVX2-NEXT:    vpsubb %xmm1, %xmm0, %xmm0
1643; AVX2-NEXT:    retq
1644;
1645; XOP-LABEL: splatvar_shift_v2i8:
1646; XOP:       # %bb.0:
1647; XOP-NEXT:    insertq {{.*#+}} xmm1 = xmm1[0,0,2,3,4,5,6,7,u,u,u,u,u,u,u,u]
1648; XOP-NEXT:    vpxor %xmm2, %xmm2, %xmm2
1649; XOP-NEXT:    vpsubb %xmm1, %xmm2, %xmm1
1650; XOP-NEXT:    vpshab %xmm1, %xmm0, %xmm0
1651; XOP-NEXT:    retq
1652;
1653; AVX512DQ-LABEL: splatvar_shift_v2i8:
1654; AVX512DQ:       # %bb.0:
1655; AVX512DQ-NEXT:    vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
1656; AVX512DQ-NEXT:    vpmovsxbd %xmm0, %zmm0
1657; AVX512DQ-NEXT:    vpsrad %xmm1, %zmm0, %zmm0
1658; AVX512DQ-NEXT:    vpmovdb %zmm0, %xmm0
1659; AVX512DQ-NEXT:    vzeroupper
1660; AVX512DQ-NEXT:    retq
1661;
1662; AVX512BW-LABEL: splatvar_shift_v2i8:
1663; AVX512BW:       # %bb.0:
1664; AVX512BW-NEXT:    vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
1665; AVX512BW-NEXT:    vpmovsxbw %xmm0, %ymm0
1666; AVX512BW-NEXT:    vpsraw %xmm1, %ymm0, %ymm0
1667; AVX512BW-NEXT:    vpmovwb %zmm0, %ymm0
1668; AVX512BW-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
1669; AVX512BW-NEXT:    vzeroupper
1670; AVX512BW-NEXT:    retq
1671;
1672; AVX512DQVL-LABEL: splatvar_shift_v2i8:
1673; AVX512DQVL:       # %bb.0:
1674; AVX512DQVL-NEXT:    vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
1675; AVX512DQVL-NEXT:    vpmovsxbd %xmm0, %zmm0
1676; AVX512DQVL-NEXT:    vpsrad %xmm1, %zmm0, %zmm0
1677; AVX512DQVL-NEXT:    vpmovdb %zmm0, %xmm0
1678; AVX512DQVL-NEXT:    vzeroupper
1679; AVX512DQVL-NEXT:    retq
1680;
1681; AVX512BWVL-LABEL: splatvar_shift_v2i8:
1682; AVX512BWVL:       # %bb.0:
1683; AVX512BWVL-NEXT:    vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
1684; AVX512BWVL-NEXT:    vpmovsxbw %xmm0, %ymm0
1685; AVX512BWVL-NEXT:    vpsraw %xmm1, %ymm0, %ymm0
1686; AVX512BWVL-NEXT:    vpmovwb %ymm0, %xmm0
1687; AVX512BWVL-NEXT:    vzeroupper
1688; AVX512BWVL-NEXT:    retq
1689;
1690; X86-SSE-LABEL: splatvar_shift_v2i8:
1691; X86-SSE:       # %bb.0:
1692; X86-SSE-NEXT:    pslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0]
1693; X86-SSE-NEXT:    psrldq {{.*#+}} xmm1 = xmm1[15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
1694; X86-SSE-NEXT:    psrlw %xmm1, %xmm0
1695; X86-SSE-NEXT:    pcmpeqd %xmm2, %xmm2
1696; X86-SSE-NEXT:    psrlw %xmm1, %xmm2
1697; X86-SSE-NEXT:    psrlw $8, %xmm2
1698; X86-SSE-NEXT:    punpcklbw {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
1699; X86-SSE-NEXT:    pshuflw {{.*#+}} xmm2 = xmm2[0,0,0,0,4,5,6,7]
1700; X86-SSE-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[0,0,0,0]
1701; X86-SSE-NEXT:    pand %xmm2, %xmm0
1702; X86-SSE-NEXT:    movdqa {{.*#+}} xmm2 = [32896,32896,32896,32896,32896,32896,32896,32896]
1703; X86-SSE-NEXT:    psrlw %xmm1, %xmm2
1704; X86-SSE-NEXT:    pxor %xmm2, %xmm0
1705; X86-SSE-NEXT:    psubb %xmm2, %xmm0
1706; X86-SSE-NEXT:    retl
1707  %splat = shufflevector <2 x i8> %b, <2 x i8> undef, <2 x i32> zeroinitializer
1708  %shift = ashr <2 x i8> %a, %splat
1709  ret <2 x i8> %shift
1710}
1711
1712;
1713; Constant Shifts
1714;
1715
1716define <2 x i32> @constant_shift_v2i32(<2 x i32> %a) nounwind {
1717; SSE2-LABEL: constant_shift_v2i32:
1718; SSE2:       # %bb.0:
1719; SSE2-NEXT:    movdqa %xmm0, %xmm1
1720; SSE2-NEXT:    psrad $4, %xmm1
1721; SSE2-NEXT:    psrad $5, %xmm0
1722; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,1,1]
1723; SSE2-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
1724; SSE2-NEXT:    movdqa %xmm1, %xmm0
1725; SSE2-NEXT:    retq
1726;
1727; SSE41-LABEL: constant_shift_v2i32:
1728; SSE41:       # %bb.0:
1729; SSE41-NEXT:    movdqa %xmm0, %xmm1
1730; SSE41-NEXT:    psrad $5, %xmm1
1731; SSE41-NEXT:    psrad $4, %xmm0
1732; SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5,6,7]
1733; SSE41-NEXT:    retq
1734;
1735; AVX1-LABEL: constant_shift_v2i32:
1736; AVX1:       # %bb.0:
1737; AVX1-NEXT:    vpsrad $5, %xmm0, %xmm1
1738; AVX1-NEXT:    vpsrad $4, %xmm0, %xmm0
1739; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5,6,7]
1740; AVX1-NEXT:    retq
1741;
1742; AVX2-LABEL: constant_shift_v2i32:
1743; AVX2:       # %bb.0:
1744; AVX2-NEXT:    vpsravd {{.*}}(%rip), %xmm0, %xmm0
1745; AVX2-NEXT:    retq
1746;
1747; XOPAVX1-LABEL: constant_shift_v2i32:
1748; XOPAVX1:       # %bb.0:
1749; XOPAVX1-NEXT:    vpshad {{.*}}(%rip), %xmm0, %xmm0
1750; XOPAVX1-NEXT:    retq
1751;
1752; XOPAVX2-LABEL: constant_shift_v2i32:
1753; XOPAVX2:       # %bb.0:
1754; XOPAVX2-NEXT:    vpsravd {{.*}}(%rip), %xmm0, %xmm0
1755; XOPAVX2-NEXT:    retq
1756;
1757; AVX512-LABEL: constant_shift_v2i32:
1758; AVX512:       # %bb.0:
1759; AVX512-NEXT:    vpsravd {{.*}}(%rip), %xmm0, %xmm0
1760; AVX512-NEXT:    retq
1761;
1762; AVX512VL-LABEL: constant_shift_v2i32:
1763; AVX512VL:       # %bb.0:
1764; AVX512VL-NEXT:    vpsravd {{.*}}(%rip), %xmm0, %xmm0
1765; AVX512VL-NEXT:    retq
1766;
1767; X86-SSE-LABEL: constant_shift_v2i32:
1768; X86-SSE:       # %bb.0:
1769; X86-SSE-NEXT:    movdqa %xmm0, %xmm1
1770; X86-SSE-NEXT:    psrad $4, %xmm1
1771; X86-SSE-NEXT:    psrad $5, %xmm0
1772; X86-SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,1,1]
1773; X86-SSE-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
1774; X86-SSE-NEXT:    movdqa %xmm1, %xmm0
1775; X86-SSE-NEXT:    retl
1776  %shift = ashr <2 x i32> %a, <i32 4, i32 5>
1777  ret <2 x i32> %shift
1778}
1779
1780define <4 x i16> @constant_shift_v4i16(<4 x i16> %a) nounwind {
1781; SSE2-LABEL: constant_shift_v4i16:
1782; SSE2:       # %bb.0:
1783; SSE2-NEXT:    movdqa %xmm0, %xmm1
1784; SSE2-NEXT:    psraw $2, %xmm1
1785; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[1,0],xmm0[0,0]
1786; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[2,0],xmm0[2,3]
1787; SSE2-NEXT:    movaps {{.*#+}} xmm2 = [65535,0,65535,0,65535,65535,65535,65535]
1788; SSE2-NEXT:    movaps %xmm1, %xmm0
1789; SSE2-NEXT:    andps %xmm2, %xmm0
1790; SSE2-NEXT:    psraw $1, %xmm1
1791; SSE2-NEXT:    andnps %xmm1, %xmm2
1792; SSE2-NEXT:    orps %xmm2, %xmm0
1793; SSE2-NEXT:    retq
1794;
1795; SSE41-LABEL: constant_shift_v4i16:
1796; SSE41:       # %bb.0:
1797; SSE41-NEXT:    movdqa {{.*#+}} xmm1 = <u,32768,16384,8192,u,u,u,u>
1798; SSE41-NEXT:    pmulhw %xmm0, %xmm1
1799; SSE41-NEXT:    pblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3,4,5,6,7]
1800; SSE41-NEXT:    psraw $1, %xmm0
1801; SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3,4,5,6,7]
1802; SSE41-NEXT:    retq
1803;
1804; AVX-LABEL: constant_shift_v4i16:
1805; AVX:       # %bb.0:
1806; AVX-NEXT:    vpmulhw {{.*}}(%rip), %xmm0, %xmm1
1807; AVX-NEXT:    vpblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3,4,5,6,7]
1808; AVX-NEXT:    vpsraw $1, %xmm0, %xmm0
1809; AVX-NEXT:    vpblendw {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3,4,5,6,7]
1810; AVX-NEXT:    retq
1811;
1812; XOP-LABEL: constant_shift_v4i16:
1813; XOP:       # %bb.0:
1814; XOP-NEXT:    vpshaw {{.*}}(%rip), %xmm0, %xmm0
1815; XOP-NEXT:    retq
1816;
1817; AVX512DQ-LABEL: constant_shift_v4i16:
1818; AVX512DQ:       # %bb.0:
1819; AVX512DQ-NEXT:    vpmovsxwd %xmm0, %ymm0
1820; AVX512DQ-NEXT:    vpsravd {{.*}}(%rip), %ymm0, %ymm0
1821; AVX512DQ-NEXT:    vpmovdw %zmm0, %ymm0
1822; AVX512DQ-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
1823; AVX512DQ-NEXT:    vzeroupper
1824; AVX512DQ-NEXT:    retq
1825;
1826; AVX512BW-LABEL: constant_shift_v4i16:
1827; AVX512BW:       # %bb.0:
1828; AVX512BW-NEXT:    # kill: def $xmm0 killed $xmm0 def $zmm0
1829; AVX512BW-NEXT:    vmovdqa {{.*#+}} xmm1 = <0,1,2,3,u,u,u,u>
1830; AVX512BW-NEXT:    vpsravw %zmm1, %zmm0, %zmm0
1831; AVX512BW-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
1832; AVX512BW-NEXT:    vzeroupper
1833; AVX512BW-NEXT:    retq
1834;
1835; AVX512DQVL-LABEL: constant_shift_v4i16:
1836; AVX512DQVL:       # %bb.0:
1837; AVX512DQVL-NEXT:    vpmovsxwd %xmm0, %ymm0
1838; AVX512DQVL-NEXT:    vpsravd {{.*}}(%rip), %ymm0, %ymm0
1839; AVX512DQVL-NEXT:    vpmovdw %ymm0, %xmm0
1840; AVX512DQVL-NEXT:    vzeroupper
1841; AVX512DQVL-NEXT:    retq
1842;
1843; AVX512BWVL-LABEL: constant_shift_v4i16:
1844; AVX512BWVL:       # %bb.0:
1845; AVX512BWVL-NEXT:    vpsravw {{.*}}(%rip), %xmm0, %xmm0
1846; AVX512BWVL-NEXT:    retq
1847;
1848; X86-SSE-LABEL: constant_shift_v4i16:
1849; X86-SSE:       # %bb.0:
1850; X86-SSE-NEXT:    movdqa %xmm0, %xmm1
1851; X86-SSE-NEXT:    psraw $2, %xmm1
1852; X86-SSE-NEXT:    shufps {{.*#+}} xmm1 = xmm1[1,0],xmm0[0,0]
1853; X86-SSE-NEXT:    shufps {{.*#+}} xmm1 = xmm1[2,0],xmm0[2,3]
1854; X86-SSE-NEXT:    movaps {{.*#+}} xmm2 = [65535,0,65535,0,65535,65535,65535,65535]
1855; X86-SSE-NEXT:    movaps %xmm1, %xmm0
1856; X86-SSE-NEXT:    andps %xmm2, %xmm0
1857; X86-SSE-NEXT:    psraw $1, %xmm1
1858; X86-SSE-NEXT:    andnps %xmm1, %xmm2
1859; X86-SSE-NEXT:    orps %xmm2, %xmm0
1860; X86-SSE-NEXT:    retl
1861  %shift = ashr <4 x i16> %a, <i16 0, i16 1, i16 2, i16 3>
1862  ret <4 x i16> %shift
1863}
1864
1865define <2 x i16> @constant_shift_v2i16(<2 x i16> %a) nounwind {
1866; SSE2-LABEL: constant_shift_v2i16:
1867; SSE2:       # %bb.0:
1868; SSE2-NEXT:    movdqa %xmm0, %xmm1
1869; SSE2-NEXT:    psraw $3, %xmm1
1870; SSE2-NEXT:    movdqa {{.*#+}} xmm2 = [65535,0,65535,65535,65535,65535,65535,65535]
1871; SSE2-NEXT:    psraw $2, %xmm0
1872; SSE2-NEXT:    pand %xmm2, %xmm0
1873; SSE2-NEXT:    pandn %xmm1, %xmm2
1874; SSE2-NEXT:    por %xmm2, %xmm0
1875; SSE2-NEXT:    retq
1876;
1877; SSE41-LABEL: constant_shift_v2i16:
1878; SSE41:       # %bb.0:
1879; SSE41-NEXT:    movdqa %xmm0, %xmm1
1880; SSE41-NEXT:    psraw $3, %xmm1
1881; SSE41-NEXT:    psraw $2, %xmm0
1882; SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3,4,5,6,7]
1883; SSE41-NEXT:    retq
1884;
1885; AVX-LABEL: constant_shift_v2i16:
1886; AVX:       # %bb.0:
1887; AVX-NEXT:    vpsraw $3, %xmm0, %xmm1
1888; AVX-NEXT:    vpsraw $2, %xmm0, %xmm0
1889; AVX-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3,4,5,6,7]
1890; AVX-NEXT:    retq
1891;
1892; XOP-LABEL: constant_shift_v2i16:
1893; XOP:       # %bb.0:
1894; XOP-NEXT:    vpshaw {{.*}}(%rip), %xmm0, %xmm0
1895; XOP-NEXT:    retq
1896;
1897; AVX512DQ-LABEL: constant_shift_v2i16:
1898; AVX512DQ:       # %bb.0:
1899; AVX512DQ-NEXT:    vpsraw $3, %xmm0, %xmm1
1900; AVX512DQ-NEXT:    vpsraw $2, %xmm0, %xmm0
1901; AVX512DQ-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3,4,5,6,7]
1902; AVX512DQ-NEXT:    retq
1903;
1904; AVX512BW-LABEL: constant_shift_v2i16:
1905; AVX512BW:       # %bb.0:
1906; AVX512BW-NEXT:    # kill: def $xmm0 killed $xmm0 def $zmm0
1907; AVX512BW-NEXT:    vmovdqa {{.*#+}} xmm1 = <2,3,u,u,u,u,u,u>
1908; AVX512BW-NEXT:    vpsravw %zmm1, %zmm0, %zmm0
1909; AVX512BW-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
1910; AVX512BW-NEXT:    vzeroupper
1911; AVX512BW-NEXT:    retq
1912;
1913; AVX512DQVL-LABEL: constant_shift_v2i16:
1914; AVX512DQVL:       # %bb.0:
1915; AVX512DQVL-NEXT:    vpsraw $3, %xmm0, %xmm1
1916; AVX512DQVL-NEXT:    vpsraw $2, %xmm0, %xmm0
1917; AVX512DQVL-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3,4,5,6,7]
1918; AVX512DQVL-NEXT:    retq
1919;
1920; AVX512BWVL-LABEL: constant_shift_v2i16:
1921; AVX512BWVL:       # %bb.0:
1922; AVX512BWVL-NEXT:    vpsravw {{.*}}(%rip), %xmm0, %xmm0
1923; AVX512BWVL-NEXT:    retq
1924;
1925; X86-SSE-LABEL: constant_shift_v2i16:
1926; X86-SSE:       # %bb.0:
1927; X86-SSE-NEXT:    movdqa %xmm0, %xmm1
1928; X86-SSE-NEXT:    psraw $3, %xmm1
1929; X86-SSE-NEXT:    movdqa {{.*#+}} xmm2 = [65535,0,65535,65535,65535,65535,65535,65535]
1930; X86-SSE-NEXT:    psraw $2, %xmm0
1931; X86-SSE-NEXT:    pand %xmm2, %xmm0
1932; X86-SSE-NEXT:    pandn %xmm1, %xmm2
1933; X86-SSE-NEXT:    por %xmm2, %xmm0
1934; X86-SSE-NEXT:    retl
1935  %shift = ashr <2 x i16> %a, <i16 2, i16 3>
1936  ret <2 x i16> %shift
1937}
1938
1939define <8 x i8> @constant_shift_v8i8(<8 x i8> %a) nounwind {
1940; SSE-LABEL: constant_shift_v8i8:
1941; SSE:       # %bb.0:
1942; SSE-NEXT:    pxor %xmm1, %xmm1
1943; SSE-NEXT:    movdqa %xmm0, %xmm2
1944; SSE-NEXT:    punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15]
1945; SSE-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
1946; SSE-NEXT:    psraw $8, %xmm0
1947; SSE-NEXT:    pmullw {{.*}}(%rip), %xmm0
1948; SSE-NEXT:    psrlw $8, %xmm0
1949; SSE-NEXT:    packuswb %xmm2, %xmm0
1950; SSE-NEXT:    retq
1951;
1952; AVX1-LABEL: constant_shift_v8i8:
1953; AVX1:       # %bb.0:
1954; AVX1-NEXT:    vpxor %xmm1, %xmm1, %xmm1
1955; AVX1-NEXT:    vpunpckhbw {{.*#+}} xmm1 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15]
1956; AVX1-NEXT:    vpunpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
1957; AVX1-NEXT:    vpsraw $8, %xmm0, %xmm0
1958; AVX1-NEXT:    vpmullw {{.*}}(%rip), %xmm0, %xmm0
1959; AVX1-NEXT:    vpsrlw $8, %xmm0, %xmm0
1960; AVX1-NEXT:    vpackuswb %xmm1, %xmm0, %xmm0
1961; AVX1-NEXT:    retq
1962;
1963; AVX2-LABEL: constant_shift_v8i8:
1964; AVX2:       # %bb.0:
1965; AVX2-NEXT:    vpmovsxbw %xmm0, %ymm0
1966; AVX2-NEXT:    vpmullw {{.*}}(%rip), %ymm0, %ymm0
1967; AVX2-NEXT:    vpsrlw $8, %ymm0, %ymm0
1968; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
1969; AVX2-NEXT:    vpackuswb %xmm1, %xmm0, %xmm0
1970; AVX2-NEXT:    vzeroupper
1971; AVX2-NEXT:    retq
1972;
1973; XOP-LABEL: constant_shift_v8i8:
1974; XOP:       # %bb.0:
1975; XOP-NEXT:    vpshab {{.*}}(%rip), %xmm0, %xmm0
1976; XOP-NEXT:    retq
1977;
1978; AVX512DQ-LABEL: constant_shift_v8i8:
1979; AVX512DQ:       # %bb.0:
1980; AVX512DQ-NEXT:    vpmovsxbd %xmm0, %zmm0
1981; AVX512DQ-NEXT:    vpsravd {{.*}}(%rip), %zmm0, %zmm0
1982; AVX512DQ-NEXT:    vpmovdb %zmm0, %xmm0
1983; AVX512DQ-NEXT:    vzeroupper
1984; AVX512DQ-NEXT:    retq
1985;
1986; AVX512BW-LABEL: constant_shift_v8i8:
1987; AVX512BW:       # %bb.0:
1988; AVX512BW-NEXT:    vmovdqa {{.*#+}} ymm1 = [0,1,2,3,4,5,6,7,0,0,0,0,0,0,0,0]
1989; AVX512BW-NEXT:    vpmovsxbw %xmm0, %ymm0
1990; AVX512BW-NEXT:    vpsravw %zmm1, %zmm0, %zmm0
1991; AVX512BW-NEXT:    vpmovwb %zmm0, %ymm0
1992; AVX512BW-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
1993; AVX512BW-NEXT:    vzeroupper
1994; AVX512BW-NEXT:    retq
1995;
1996; AVX512DQVL-LABEL: constant_shift_v8i8:
1997; AVX512DQVL:       # %bb.0:
1998; AVX512DQVL-NEXT:    vpmovsxbd %xmm0, %zmm0
1999; AVX512DQVL-NEXT:    vpsravd {{.*}}(%rip), %zmm0, %zmm0
2000; AVX512DQVL-NEXT:    vpmovdb %zmm0, %xmm0
2001; AVX512DQVL-NEXT:    vzeroupper
2002; AVX512DQVL-NEXT:    retq
2003;
2004; AVX512BWVL-LABEL: constant_shift_v8i8:
2005; AVX512BWVL:       # %bb.0:
2006; AVX512BWVL-NEXT:    vpmovsxbw %xmm0, %ymm0
2007; AVX512BWVL-NEXT:    vpsravw {{.*}}(%rip), %ymm0, %ymm0
2008; AVX512BWVL-NEXT:    vpmovwb %ymm0, %xmm0
2009; AVX512BWVL-NEXT:    vzeroupper
2010; AVX512BWVL-NEXT:    retq
2011;
2012; X86-SSE-LABEL: constant_shift_v8i8:
2013; X86-SSE:       # %bb.0:
2014; X86-SSE-NEXT:    pxor %xmm1, %xmm1
2015; X86-SSE-NEXT:    movdqa %xmm0, %xmm2
2016; X86-SSE-NEXT:    punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15]
2017; X86-SSE-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
2018; X86-SSE-NEXT:    psraw $8, %xmm0
2019; X86-SSE-NEXT:    pmullw {{\.LCPI.*}}, %xmm0
2020; X86-SSE-NEXT:    psrlw $8, %xmm0
2021; X86-SSE-NEXT:    packuswb %xmm2, %xmm0
2022; X86-SSE-NEXT:    retl
2023  %shift = ashr <8 x i8> %a, <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7>
2024  ret <8 x i8> %shift
2025}
2026
2027define <4 x i8> @constant_shift_v4i8(<4 x i8> %a) nounwind {
2028; SSE-LABEL: constant_shift_v4i8:
2029; SSE:       # %bb.0:
2030; SSE-NEXT:    pxor %xmm1, %xmm1
2031; SSE-NEXT:    movdqa %xmm0, %xmm2
2032; SSE-NEXT:    punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15]
2033; SSE-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
2034; SSE-NEXT:    psraw $8, %xmm0
2035; SSE-NEXT:    pmullw {{.*}}(%rip), %xmm0
2036; SSE-NEXT:    psrlw $8, %xmm0
2037; SSE-NEXT:    packuswb %xmm2, %xmm0
2038; SSE-NEXT:    retq
2039;
2040; AVX1-LABEL: constant_shift_v4i8:
2041; AVX1:       # %bb.0:
2042; AVX1-NEXT:    vpxor %xmm1, %xmm1, %xmm1
2043; AVX1-NEXT:    vpunpckhbw {{.*#+}} xmm1 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15]
2044; AVX1-NEXT:    vpunpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
2045; AVX1-NEXT:    vpsraw $8, %xmm0, %xmm0
2046; AVX1-NEXT:    vpmullw {{.*}}(%rip), %xmm0, %xmm0
2047; AVX1-NEXT:    vpsrlw $8, %xmm0, %xmm0
2048; AVX1-NEXT:    vpackuswb %xmm1, %xmm0, %xmm0
2049; AVX1-NEXT:    retq
2050;
2051; AVX2-LABEL: constant_shift_v4i8:
2052; AVX2:       # %bb.0:
2053; AVX2-NEXT:    vpmovsxbw %xmm0, %ymm0
2054; AVX2-NEXT:    vpmullw {{.*}}(%rip), %ymm0, %ymm0
2055; AVX2-NEXT:    vpsrlw $8, %ymm0, %ymm0
2056; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
2057; AVX2-NEXT:    vpackuswb %xmm1, %xmm0, %xmm0
2058; AVX2-NEXT:    vzeroupper
2059; AVX2-NEXT:    retq
2060;
2061; XOP-LABEL: constant_shift_v4i8:
2062; XOP:       # %bb.0:
2063; XOP-NEXT:    vpshab {{.*}}(%rip), %xmm0, %xmm0
2064; XOP-NEXT:    retq
2065;
2066; AVX512DQ-LABEL: constant_shift_v4i8:
2067; AVX512DQ:       # %bb.0:
2068; AVX512DQ-NEXT:    vpmovsxbd %xmm0, %zmm0
2069; AVX512DQ-NEXT:    vpsravd {{.*}}(%rip), %zmm0, %zmm0
2070; AVX512DQ-NEXT:    vpmovdb %zmm0, %xmm0
2071; AVX512DQ-NEXT:    vzeroupper
2072; AVX512DQ-NEXT:    retq
2073;
2074; AVX512BW-LABEL: constant_shift_v4i8:
2075; AVX512BW:       # %bb.0:
2076; AVX512BW-NEXT:    vmovdqa {{.*#+}} ymm1 = [0,1,2,3,0,0,0,0,0,0,0,0,0,0,0,0]
2077; AVX512BW-NEXT:    vpmovsxbw %xmm0, %ymm0
2078; AVX512BW-NEXT:    vpsravw %zmm1, %zmm0, %zmm0
2079; AVX512BW-NEXT:    vpmovwb %zmm0, %ymm0
2080; AVX512BW-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
2081; AVX512BW-NEXT:    vzeroupper
2082; AVX512BW-NEXT:    retq
2083;
2084; AVX512DQVL-LABEL: constant_shift_v4i8:
2085; AVX512DQVL:       # %bb.0:
2086; AVX512DQVL-NEXT:    vpmovsxbd %xmm0, %zmm0
2087; AVX512DQVL-NEXT:    vpsravd {{.*}}(%rip), %zmm0, %zmm0
2088; AVX512DQVL-NEXT:    vpmovdb %zmm0, %xmm0
2089; AVX512DQVL-NEXT:    vzeroupper
2090; AVX512DQVL-NEXT:    retq
2091;
2092; AVX512BWVL-LABEL: constant_shift_v4i8:
2093; AVX512BWVL:       # %bb.0:
2094; AVX512BWVL-NEXT:    vpmovsxbw %xmm0, %ymm0
2095; AVX512BWVL-NEXT:    vpsravw {{.*}}(%rip), %ymm0, %ymm0
2096; AVX512BWVL-NEXT:    vpmovwb %ymm0, %xmm0
2097; AVX512BWVL-NEXT:    vzeroupper
2098; AVX512BWVL-NEXT:    retq
2099;
2100; X86-SSE-LABEL: constant_shift_v4i8:
2101; X86-SSE:       # %bb.0:
2102; X86-SSE-NEXT:    pxor %xmm1, %xmm1
2103; X86-SSE-NEXT:    movdqa %xmm0, %xmm2
2104; X86-SSE-NEXT:    punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15]
2105; X86-SSE-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
2106; X86-SSE-NEXT:    psraw $8, %xmm0
2107; X86-SSE-NEXT:    pmullw {{\.LCPI.*}}, %xmm0
2108; X86-SSE-NEXT:    psrlw $8, %xmm0
2109; X86-SSE-NEXT:    packuswb %xmm2, %xmm0
2110; X86-SSE-NEXT:    retl
2111  %shift = ashr <4 x i8> %a, <i8 0, i8 1, i8 2, i8 3>
2112  ret <4 x i8> %shift
2113}
2114
2115define <2 x i8> @constant_shift_v2i8(<2 x i8> %a) nounwind {
2116; SSE-LABEL: constant_shift_v2i8:
2117; SSE:       # %bb.0:
2118; SSE-NEXT:    pxor %xmm1, %xmm1
2119; SSE-NEXT:    movdqa %xmm0, %xmm2
2120; SSE-NEXT:    punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15]
2121; SSE-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
2122; SSE-NEXT:    psraw $8, %xmm0
2123; SSE-NEXT:    pmullw {{.*}}(%rip), %xmm0
2124; SSE-NEXT:    psrlw $8, %xmm0
2125; SSE-NEXT:    packuswb %xmm2, %xmm0
2126; SSE-NEXT:    retq
2127;
2128; AVX1-LABEL: constant_shift_v2i8:
2129; AVX1:       # %bb.0:
2130; AVX1-NEXT:    vpxor %xmm1, %xmm1, %xmm1
2131; AVX1-NEXT:    vpunpckhbw {{.*#+}} xmm1 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15]
2132; AVX1-NEXT:    vpunpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
2133; AVX1-NEXT:    vpsraw $8, %xmm0, %xmm0
2134; AVX1-NEXT:    vpmullw {{.*}}(%rip), %xmm0, %xmm0
2135; AVX1-NEXT:    vpsrlw $8, %xmm0, %xmm0
2136; AVX1-NEXT:    vpackuswb %xmm1, %xmm0, %xmm0
2137; AVX1-NEXT:    retq
2138;
2139; AVX2-LABEL: constant_shift_v2i8:
2140; AVX2:       # %bb.0:
2141; AVX2-NEXT:    vpmovsxbw %xmm0, %ymm0
2142; AVX2-NEXT:    vpmullw {{.*}}(%rip), %ymm0, %ymm0
2143; AVX2-NEXT:    vpsrlw $8, %ymm0, %ymm0
2144; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
2145; AVX2-NEXT:    vpackuswb %xmm1, %xmm0, %xmm0
2146; AVX2-NEXT:    vzeroupper
2147; AVX2-NEXT:    retq
2148;
2149; XOP-LABEL: constant_shift_v2i8:
2150; XOP:       # %bb.0:
2151; XOP-NEXT:    vpshab {{.*}}(%rip), %xmm0, %xmm0
2152; XOP-NEXT:    retq
2153;
2154; AVX512DQ-LABEL: constant_shift_v2i8:
2155; AVX512DQ:       # %bb.0:
2156; AVX512DQ-NEXT:    vpmovsxbd %xmm0, %zmm0
2157; AVX512DQ-NEXT:    vpsravd {{.*}}(%rip), %zmm0, %zmm0
2158; AVX512DQ-NEXT:    vpmovdb %zmm0, %xmm0
2159; AVX512DQ-NEXT:    vzeroupper
2160; AVX512DQ-NEXT:    retq
2161;
2162; AVX512BW-LABEL: constant_shift_v2i8:
2163; AVX512BW:       # %bb.0:
2164; AVX512BW-NEXT:    vmovdqa {{.*#+}} ymm1 = [2,3,0,0,0,0,0,0,0,0,0,0,0,0,0,0]
2165; AVX512BW-NEXT:    vpmovsxbw %xmm0, %ymm0
2166; AVX512BW-NEXT:    vpsravw %zmm1, %zmm0, %zmm0
2167; AVX512BW-NEXT:    vpmovwb %zmm0, %ymm0
2168; AVX512BW-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
2169; AVX512BW-NEXT:    vzeroupper
2170; AVX512BW-NEXT:    retq
2171;
2172; AVX512DQVL-LABEL: constant_shift_v2i8:
2173; AVX512DQVL:       # %bb.0:
2174; AVX512DQVL-NEXT:    vpmovsxbd %xmm0, %zmm0
2175; AVX512DQVL-NEXT:    vpsravd {{.*}}(%rip), %zmm0, %zmm0
2176; AVX512DQVL-NEXT:    vpmovdb %zmm0, %xmm0
2177; AVX512DQVL-NEXT:    vzeroupper
2178; AVX512DQVL-NEXT:    retq
2179;
2180; AVX512BWVL-LABEL: constant_shift_v2i8:
2181; AVX512BWVL:       # %bb.0:
2182; AVX512BWVL-NEXT:    vpmovsxbw %xmm0, %ymm0
2183; AVX512BWVL-NEXT:    vpsravw {{.*}}(%rip), %ymm0, %ymm0
2184; AVX512BWVL-NEXT:    vpmovwb %ymm0, %xmm0
2185; AVX512BWVL-NEXT:    vzeroupper
2186; AVX512BWVL-NEXT:    retq
2187;
2188; X86-SSE-LABEL: constant_shift_v2i8:
2189; X86-SSE:       # %bb.0:
2190; X86-SSE-NEXT:    pxor %xmm1, %xmm1
2191; X86-SSE-NEXT:    movdqa %xmm0, %xmm2
2192; X86-SSE-NEXT:    punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15]
2193; X86-SSE-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
2194; X86-SSE-NEXT:    psraw $8, %xmm0
2195; X86-SSE-NEXT:    pmullw {{\.LCPI.*}}, %xmm0
2196; X86-SSE-NEXT:    psrlw $8, %xmm0
2197; X86-SSE-NEXT:    packuswb %xmm2, %xmm0
2198; X86-SSE-NEXT:    retl
2199  %shift = ashr <2 x i8> %a, <i8 2, i8 3>
2200  ret <2 x i8> %shift
2201}
2202
2203;
2204; Uniform Constant Shifts
2205;
2206
2207define <2 x i32> @splatconstant_shift_v2i32(<2 x i32> %a) nounwind {
2208; SSE-LABEL: splatconstant_shift_v2i32:
2209; SSE:       # %bb.0:
2210; SSE-NEXT:    psrad $5, %xmm0
2211; SSE-NEXT:    retq
2212;
2213; AVX-LABEL: splatconstant_shift_v2i32:
2214; AVX:       # %bb.0:
2215; AVX-NEXT:    vpsrad $5, %xmm0, %xmm0
2216; AVX-NEXT:    retq
2217;
2218; XOP-LABEL: splatconstant_shift_v2i32:
2219; XOP:       # %bb.0:
2220; XOP-NEXT:    vpsrad $5, %xmm0, %xmm0
2221; XOP-NEXT:    retq
2222;
2223; AVX512-LABEL: splatconstant_shift_v2i32:
2224; AVX512:       # %bb.0:
2225; AVX512-NEXT:    vpsrad $5, %xmm0, %xmm0
2226; AVX512-NEXT:    retq
2227;
2228; AVX512VL-LABEL: splatconstant_shift_v2i32:
2229; AVX512VL:       # %bb.0:
2230; AVX512VL-NEXT:    vpsrad $5, %xmm0, %xmm0
2231; AVX512VL-NEXT:    retq
2232;
2233; X86-SSE-LABEL: splatconstant_shift_v2i32:
2234; X86-SSE:       # %bb.0:
2235; X86-SSE-NEXT:    psrad $5, %xmm0
2236; X86-SSE-NEXT:    retl
2237  %shift = ashr <2 x i32> %a, <i32 5, i32 5>
2238  ret <2 x i32> %shift
2239}
2240
2241define <4 x i16> @splatconstant_shift_v4i16(<4 x i16> %a) nounwind {
2242; SSE-LABEL: splatconstant_shift_v4i16:
2243; SSE:       # %bb.0:
2244; SSE-NEXT:    psraw $3, %xmm0
2245; SSE-NEXT:    retq
2246;
2247; AVX-LABEL: splatconstant_shift_v4i16:
2248; AVX:       # %bb.0:
2249; AVX-NEXT:    vpsraw $3, %xmm0, %xmm0
2250; AVX-NEXT:    retq
2251;
2252; XOP-LABEL: splatconstant_shift_v4i16:
2253; XOP:       # %bb.0:
2254; XOP-NEXT:    vpsraw $3, %xmm0, %xmm0
2255; XOP-NEXT:    retq
2256;
2257; AVX512-LABEL: splatconstant_shift_v4i16:
2258; AVX512:       # %bb.0:
2259; AVX512-NEXT:    vpsraw $3, %xmm0, %xmm0
2260; AVX512-NEXT:    retq
2261;
2262; AVX512VL-LABEL: splatconstant_shift_v4i16:
2263; AVX512VL:       # %bb.0:
2264; AVX512VL-NEXT:    vpsraw $3, %xmm0, %xmm0
2265; AVX512VL-NEXT:    retq
2266;
2267; X86-SSE-LABEL: splatconstant_shift_v4i16:
2268; X86-SSE:       # %bb.0:
2269; X86-SSE-NEXT:    psraw $3, %xmm0
2270; X86-SSE-NEXT:    retl
2271  %shift = ashr <4 x i16> %a, <i16 3, i16 3, i16 3, i16 3>
2272  ret <4 x i16> %shift
2273}
2274
2275define <2 x i16> @splatconstant_shift_v2i16(<2 x i16> %a) nounwind {
2276; SSE-LABEL: splatconstant_shift_v2i16:
2277; SSE:       # %bb.0:
2278; SSE-NEXT:    psraw $3, %xmm0
2279; SSE-NEXT:    retq
2280;
2281; AVX-LABEL: splatconstant_shift_v2i16:
2282; AVX:       # %bb.0:
2283; AVX-NEXT:    vpsraw $3, %xmm0, %xmm0
2284; AVX-NEXT:    retq
2285;
2286; XOP-LABEL: splatconstant_shift_v2i16:
2287; XOP:       # %bb.0:
2288; XOP-NEXT:    vpsraw $3, %xmm0, %xmm0
2289; XOP-NEXT:    retq
2290;
2291; AVX512-LABEL: splatconstant_shift_v2i16:
2292; AVX512:       # %bb.0:
2293; AVX512-NEXT:    vpsraw $3, %xmm0, %xmm0
2294; AVX512-NEXT:    retq
2295;
2296; AVX512VL-LABEL: splatconstant_shift_v2i16:
2297; AVX512VL:       # %bb.0:
2298; AVX512VL-NEXT:    vpsraw $3, %xmm0, %xmm0
2299; AVX512VL-NEXT:    retq
2300;
2301; X86-SSE-LABEL: splatconstant_shift_v2i16:
2302; X86-SSE:       # %bb.0:
2303; X86-SSE-NEXT:    psraw $3, %xmm0
2304; X86-SSE-NEXT:    retl
2305  %shift = ashr <2 x i16> %a, <i16 3, i16 3>
2306  ret <2 x i16> %shift
2307}
2308
2309define <8 x i8> @splatconstant_shift_v8i8(<8 x i8> %a) nounwind {
2310; SSE-LABEL: splatconstant_shift_v8i8:
2311; SSE:       # %bb.0:
2312; SSE-NEXT:    psrlw $3, %xmm0
2313; SSE-NEXT:    pand {{.*}}(%rip), %xmm0
2314; SSE-NEXT:    movdqa {{.*#+}} xmm1 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
2315; SSE-NEXT:    pxor %xmm1, %xmm0
2316; SSE-NEXT:    psubb %xmm1, %xmm0
2317; SSE-NEXT:    retq
2318;
2319; AVX-LABEL: splatconstant_shift_v8i8:
2320; AVX:       # %bb.0:
2321; AVX-NEXT:    vpsrlw $3, %xmm0, %xmm0
2322; AVX-NEXT:    vpand {{.*}}(%rip), %xmm0, %xmm0
2323; AVX-NEXT:    vmovdqa {{.*#+}} xmm1 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
2324; AVX-NEXT:    vpxor %xmm1, %xmm0, %xmm0
2325; AVX-NEXT:    vpsubb %xmm1, %xmm0, %xmm0
2326; AVX-NEXT:    retq
2327;
2328; XOP-LABEL: splatconstant_shift_v8i8:
2329; XOP:       # %bb.0:
2330; XOP-NEXT:    vpshab {{.*}}(%rip), %xmm0, %xmm0
2331; XOP-NEXT:    retq
2332;
2333; AVX512-LABEL: splatconstant_shift_v8i8:
2334; AVX512:       # %bb.0:
2335; AVX512-NEXT:    vpsrlw $3, %xmm0, %xmm0
2336; AVX512-NEXT:    vpand {{.*}}(%rip), %xmm0, %xmm0
2337; AVX512-NEXT:    vmovdqa {{.*#+}} xmm1 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
2338; AVX512-NEXT:    vpxor %xmm1, %xmm0, %xmm0
2339; AVX512-NEXT:    vpsubb %xmm1, %xmm0, %xmm0
2340; AVX512-NEXT:    retq
2341;
2342; AVX512VL-LABEL: splatconstant_shift_v8i8:
2343; AVX512VL:       # %bb.0:
2344; AVX512VL-NEXT:    vpsrlw $3, %xmm0, %xmm0
2345; AVX512VL-NEXT:    vmovdqa {{.*#+}} xmm1 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
2346; AVX512VL-NEXT:    vpternlogq $108, {{.*}}(%rip), %xmm1, %xmm0
2347; AVX512VL-NEXT:    vpsubb %xmm1, %xmm0, %xmm0
2348; AVX512VL-NEXT:    retq
2349;
2350; X86-SSE-LABEL: splatconstant_shift_v8i8:
2351; X86-SSE:       # %bb.0:
2352; X86-SSE-NEXT:    psrlw $3, %xmm0
2353; X86-SSE-NEXT:    pand {{\.LCPI.*}}, %xmm0
2354; X86-SSE-NEXT:    movdqa {{.*#+}} xmm1 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
2355; X86-SSE-NEXT:    pxor %xmm1, %xmm0
2356; X86-SSE-NEXT:    psubb %xmm1, %xmm0
2357; X86-SSE-NEXT:    retl
2358  %shift = ashr <8 x i8> %a, <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3>
2359  ret <8 x i8> %shift
2360}
2361
2362define <4 x i8> @splatconstant_shift_v4i8(<4 x i8> %a) nounwind {
2363; SSE-LABEL: splatconstant_shift_v4i8:
2364; SSE:       # %bb.0:
2365; SSE-NEXT:    psrlw $3, %xmm0
2366; SSE-NEXT:    pand {{.*}}(%rip), %xmm0
2367; SSE-NEXT:    movdqa {{.*#+}} xmm1 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
2368; SSE-NEXT:    pxor %xmm1, %xmm0
2369; SSE-NEXT:    psubb %xmm1, %xmm0
2370; SSE-NEXT:    retq
2371;
2372; AVX-LABEL: splatconstant_shift_v4i8:
2373; AVX:       # %bb.0:
2374; AVX-NEXT:    vpsrlw $3, %xmm0, %xmm0
2375; AVX-NEXT:    vpand {{.*}}(%rip), %xmm0, %xmm0
2376; AVX-NEXT:    vmovdqa {{.*#+}} xmm1 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
2377; AVX-NEXT:    vpxor %xmm1, %xmm0, %xmm0
2378; AVX-NEXT:    vpsubb %xmm1, %xmm0, %xmm0
2379; AVX-NEXT:    retq
2380;
2381; XOP-LABEL: splatconstant_shift_v4i8:
2382; XOP:       # %bb.0:
2383; XOP-NEXT:    vpshab {{.*}}(%rip), %xmm0, %xmm0
2384; XOP-NEXT:    retq
2385;
2386; AVX512-LABEL: splatconstant_shift_v4i8:
2387; AVX512:       # %bb.0:
2388; AVX512-NEXT:    vpsrlw $3, %xmm0, %xmm0
2389; AVX512-NEXT:    vpand {{.*}}(%rip), %xmm0, %xmm0
2390; AVX512-NEXT:    vmovdqa {{.*#+}} xmm1 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
2391; AVX512-NEXT:    vpxor %xmm1, %xmm0, %xmm0
2392; AVX512-NEXT:    vpsubb %xmm1, %xmm0, %xmm0
2393; AVX512-NEXT:    retq
2394;
2395; AVX512VL-LABEL: splatconstant_shift_v4i8:
2396; AVX512VL:       # %bb.0:
2397; AVX512VL-NEXT:    vpsrlw $3, %xmm0, %xmm0
2398; AVX512VL-NEXT:    vmovdqa {{.*#+}} xmm1 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
2399; AVX512VL-NEXT:    vpternlogq $108, {{.*}}(%rip), %xmm1, %xmm0
2400; AVX512VL-NEXT:    vpsubb %xmm1, %xmm0, %xmm0
2401; AVX512VL-NEXT:    retq
2402;
2403; X86-SSE-LABEL: splatconstant_shift_v4i8:
2404; X86-SSE:       # %bb.0:
2405; X86-SSE-NEXT:    psrlw $3, %xmm0
2406; X86-SSE-NEXT:    pand {{\.LCPI.*}}, %xmm0
2407; X86-SSE-NEXT:    movdqa {{.*#+}} xmm1 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
2408; X86-SSE-NEXT:    pxor %xmm1, %xmm0
2409; X86-SSE-NEXT:    psubb %xmm1, %xmm0
2410; X86-SSE-NEXT:    retl
2411  %shift = ashr <4 x i8> %a, <i8 3, i8 3, i8 3, i8 3>
2412  ret <4 x i8> %shift
2413}
2414
2415define <2 x i8> @splatconstant_shift_v2i8(<2 x i8> %a) nounwind {
2416; SSE-LABEL: splatconstant_shift_v2i8:
2417; SSE:       # %bb.0:
2418; SSE-NEXT:    psrlw $3, %xmm0
2419; SSE-NEXT:    pand {{.*}}(%rip), %xmm0
2420; SSE-NEXT:    movdqa {{.*#+}} xmm1 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
2421; SSE-NEXT:    pxor %xmm1, %xmm0
2422; SSE-NEXT:    psubb %xmm1, %xmm0
2423; SSE-NEXT:    retq
2424;
2425; AVX-LABEL: splatconstant_shift_v2i8:
2426; AVX:       # %bb.0:
2427; AVX-NEXT:    vpsrlw $3, %xmm0, %xmm0
2428; AVX-NEXT:    vpand {{.*}}(%rip), %xmm0, %xmm0
2429; AVX-NEXT:    vmovdqa {{.*#+}} xmm1 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
2430; AVX-NEXT:    vpxor %xmm1, %xmm0, %xmm0
2431; AVX-NEXT:    vpsubb %xmm1, %xmm0, %xmm0
2432; AVX-NEXT:    retq
2433;
2434; XOP-LABEL: splatconstant_shift_v2i8:
2435; XOP:       # %bb.0:
2436; XOP-NEXT:    vpshab {{.*}}(%rip), %xmm0, %xmm0
2437; XOP-NEXT:    retq
2438;
2439; AVX512-LABEL: splatconstant_shift_v2i8:
2440; AVX512:       # %bb.0:
2441; AVX512-NEXT:    vpsrlw $3, %xmm0, %xmm0
2442; AVX512-NEXT:    vpand {{.*}}(%rip), %xmm0, %xmm0
2443; AVX512-NEXT:    vmovdqa {{.*#+}} xmm1 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
2444; AVX512-NEXT:    vpxor %xmm1, %xmm0, %xmm0
2445; AVX512-NEXT:    vpsubb %xmm1, %xmm0, %xmm0
2446; AVX512-NEXT:    retq
2447;
2448; AVX512VL-LABEL: splatconstant_shift_v2i8:
2449; AVX512VL:       # %bb.0:
2450; AVX512VL-NEXT:    vpsrlw $3, %xmm0, %xmm0
2451; AVX512VL-NEXT:    vmovdqa {{.*#+}} xmm1 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
2452; AVX512VL-NEXT:    vpternlogq $108, {{.*}}(%rip), %xmm1, %xmm0
2453; AVX512VL-NEXT:    vpsubb %xmm1, %xmm0, %xmm0
2454; AVX512VL-NEXT:    retq
2455;
2456; X86-SSE-LABEL: splatconstant_shift_v2i8:
2457; X86-SSE:       # %bb.0:
2458; X86-SSE-NEXT:    psrlw $3, %xmm0
2459; X86-SSE-NEXT:    pand {{\.LCPI.*}}, %xmm0
2460; X86-SSE-NEXT:    movdqa {{.*#+}} xmm1 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
2461; X86-SSE-NEXT:    pxor %xmm1, %xmm0
2462; X86-SSE-NEXT:    psubb %xmm1, %xmm0
2463; X86-SSE-NEXT:    retl
2464  %shift = ashr <2 x i8> %a, <i8 3, i8 3>
2465  ret <2 x i8> %shift
2466}
2467