; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefix=ALL --check-prefix=SSE --check-prefix=SSE2
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefix=ALL --check-prefix=SSE --check-prefix=SSE41
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX1
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX2
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+xop,+avx | FileCheck %s --check-prefix=ALL --check-prefix=XOP --check-prefix=XOPAVX1
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+xop,+avx2 | FileCheck %s --check-prefix=ALL --check-prefix=XOP --check-prefix=XOPAVX2
; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=knl -mattr=+avx512bw | FileCheck %s --check-prefix=ALL --check-prefix=AVX512 --check-prefix=AVX512BW

;
; Just one 32-bit run to make sure we do reasonable things for i64 shifts.
; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefix=ALL --check-prefix=X32-SSE --check-prefix=X32-SSE2

;
; Variable Shifts
;
17
; Per-lane variable shift: ashr of <2 x i64> %a by the matching lane of %b.
; The check groups below are autogenerated by utils/update_llc_test_checks.py;
; regenerate them with that script rather than editing them by hand.
define <2 x i64> @var_shift_v2i64(<2 x i64> %a, <2 x i64> %b) nounwind {
; SSE2-LABEL: var_shift_v2i64:
; SSE2:       # BB#0:
; SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm1[2,3,0,1]
; SSE2-NEXT:    movdqa {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808]
; SSE2-NEXT:    movdqa %xmm2, %xmm4
; SSE2-NEXT:    psrlq %xmm3, %xmm4
; SSE2-NEXT:    psrlq %xmm1, %xmm2
; SSE2-NEXT:    movsd {{.*#+}} xmm4 = xmm2[0],xmm4[1]
; SSE2-NEXT:    movdqa %xmm0, %xmm2
; SSE2-NEXT:    psrlq %xmm3, %xmm2
; SSE2-NEXT:    psrlq %xmm1, %xmm0
; SSE2-NEXT:    movsd {{.*#+}} xmm2 = xmm0[0],xmm2[1]
; SSE2-NEXT:    xorpd %xmm4, %xmm2
; SSE2-NEXT:    psubq %xmm4, %xmm2
; SSE2-NEXT:    movdqa %xmm2, %xmm0
; SSE2-NEXT:    retq
;
; SSE41-LABEL: var_shift_v2i64:
; SSE41:       # BB#0:
; SSE41-NEXT:    movdqa {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808]
; SSE41-NEXT:    movdqa %xmm2, %xmm3
; SSE41-NEXT:    psrlq %xmm1, %xmm3
; SSE41-NEXT:    pshufd {{.*#+}} xmm4 = xmm1[2,3,0,1]
; SSE41-NEXT:    psrlq %xmm4, %xmm2
; SSE41-NEXT:    pblendw {{.*#+}} xmm2 = xmm3[0,1,2,3],xmm2[4,5,6,7]
; SSE41-NEXT:    movdqa %xmm0, %xmm3
; SSE41-NEXT:    psrlq %xmm1, %xmm3
; SSE41-NEXT:    psrlq %xmm4, %xmm0
; SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm3[0,1,2,3],xmm0[4,5,6,7]
; SSE41-NEXT:    pxor %xmm2, %xmm0
; SSE41-NEXT:    psubq %xmm2, %xmm0
; SSE41-NEXT:    retq
;
; AVX1-LABEL: var_shift_v2i64:
; AVX1:       # BB#0:
; AVX1-NEXT:    vmovdqa {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808]
; AVX1-NEXT:    vpsrlq %xmm1, %xmm2, %xmm3
; AVX1-NEXT:    vpshufd {{.*#+}} xmm4 = xmm1[2,3,0,1]
; AVX1-NEXT:    vpsrlq %xmm4, %xmm2, %xmm2
; AVX1-NEXT:    vpblendw {{.*#+}} xmm2 = xmm3[0,1,2,3],xmm2[4,5,6,7]
; AVX1-NEXT:    vpsrlq %xmm1, %xmm0, %xmm1
; AVX1-NEXT:    vpsrlq %xmm4, %xmm0, %xmm0
; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4,5,6,7]
; AVX1-NEXT:    vpxor %xmm2, %xmm0, %xmm0
; AVX1-NEXT:    vpsubq %xmm2, %xmm0, %xmm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: var_shift_v2i64:
; AVX2:       # BB#0:
; AVX2-NEXT:    vmovdqa {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808]
; AVX2-NEXT:    vpsrlvq %xmm1, %xmm2, %xmm3
; AVX2-NEXT:    vpxor %xmm2, %xmm0, %xmm0
; AVX2-NEXT:    vpsrlvq %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    vpsubq %xmm3, %xmm0, %xmm0
; AVX2-NEXT:    retq
;
; XOP-LABEL: var_shift_v2i64:
; XOP:       # BB#0:
; XOP-NEXT:    vpxor %xmm2, %xmm2, %xmm2
; XOP-NEXT:    vpsubq %xmm1, %xmm2, %xmm1
; XOP-NEXT:    vpshaq %xmm1, %xmm0, %xmm0
; XOP-NEXT:    retq
;
; AVX512-LABEL: var_shift_v2i64:
; AVX512:       ## BB#0:
; AVX512-NEXT:    vmovdqa {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808]
; AVX512-NEXT:    vpsrlvq %xmm1, %xmm2, %xmm3
; AVX512-NEXT:    vpxor %xmm2, %xmm0, %xmm0
; AVX512-NEXT:    vpsrlvq %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    vpsubq %xmm3, %xmm0, %xmm0
; AVX512-NEXT:    retq
;
; X32-SSE-LABEL: var_shift_v2i64:
; X32-SSE:       # BB#0:
; X32-SSE-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[2,3,0,1]
; X32-SSE-NEXT:    movdqa {{.*#+}} xmm3 = [0,2147483648,0,2147483648]
; X32-SSE-NEXT:    movdqa %xmm3, %xmm4
; X32-SSE-NEXT:    psrlq %xmm2, %xmm4
; X32-SSE-NEXT:    movq {{.*#+}} xmm5 = xmm1[0],zero
; X32-SSE-NEXT:    psrlq %xmm5, %xmm3
; X32-SSE-NEXT:    movsd {{.*#+}} xmm4 = xmm3[0],xmm4[1]
; X32-SSE-NEXT:    movdqa %xmm0, %xmm1
; X32-SSE-NEXT:    psrlq %xmm2, %xmm1
; X32-SSE-NEXT:    psrlq %xmm5, %xmm0
; X32-SSE-NEXT:    movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1]
; X32-SSE-NEXT:    xorpd %xmm4, %xmm1
; X32-SSE-NEXT:    psubq %xmm4, %xmm1
; X32-SSE-NEXT:    movdqa %xmm1, %xmm0
; X32-SSE-NEXT:    retl
  %shift = ashr <2 x i64> %a, %b
  ret <2 x i64> %shift
}
111
; Per-lane variable shift: ashr of <4 x i32> %a by the matching lane of %b.
; The check groups below are autogenerated by utils/update_llc_test_checks.py;
; regenerate them with that script rather than editing them by hand.
define <4 x i32> @var_shift_v4i32(<4 x i32> %a, <4 x i32> %b) nounwind {
; SSE2-LABEL: var_shift_v4i32:
; SSE2:       # BB#0:
; SSE2-NEXT:    movdqa %xmm1, %xmm2
; SSE2-NEXT:    psrldq {{.*#+}} xmm2 = xmm2[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; SSE2-NEXT:    movdqa %xmm0, %xmm3
; SSE2-NEXT:    psrad %xmm2, %xmm3
; SSE2-NEXT:    movdqa %xmm1, %xmm2
; SSE2-NEXT:    psrlq $32, %xmm2
; SSE2-NEXT:    movdqa %xmm0, %xmm4
; SSE2-NEXT:    psrad %xmm2, %xmm4
; SSE2-NEXT:    movsd {{.*#+}} xmm3 = xmm4[0],xmm3[1]
; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm3[1,3,2,3]
; SSE2-NEXT:    pxor %xmm3, %xmm3
; SSE2-NEXT:    movdqa %xmm1, %xmm4
; SSE2-NEXT:    punpckhdq {{.*#+}} xmm4 = xmm4[2],xmm3[2],xmm4[3],xmm3[3]
; SSE2-NEXT:    movdqa %xmm0, %xmm5
; SSE2-NEXT:    psrad %xmm4, %xmm5
; SSE2-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1]
; SSE2-NEXT:    psrad %xmm1, %xmm0
; SSE2-NEXT:    movsd {{.*#+}} xmm5 = xmm0[0],xmm5[1]
; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm5[0,2,2,3]
; SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
; SSE2-NEXT:    retq
;
; SSE41-LABEL: var_shift_v4i32:
; SSE41:       # BB#0:
; SSE41-NEXT:    movdqa %xmm1, %xmm2
; SSE41-NEXT:    psrldq {{.*#+}} xmm2 = xmm2[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; SSE41-NEXT:    movdqa %xmm0, %xmm3
; SSE41-NEXT:    psrad %xmm2, %xmm3
; SSE41-NEXT:    movdqa %xmm1, %xmm2
; SSE41-NEXT:    psrlq $32, %xmm2
; SSE41-NEXT:    movdqa %xmm0, %xmm4
; SSE41-NEXT:    psrad %xmm2, %xmm4
; SSE41-NEXT:    pblendw {{.*#+}} xmm4 = xmm4[0,1,2,3],xmm3[4,5,6,7]
; SSE41-NEXT:    pxor %xmm2, %xmm2
; SSE41-NEXT:    pmovzxdq {{.*#+}} xmm3 = xmm1[0],zero,xmm1[1],zero
; SSE41-NEXT:    punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm2[2],xmm1[3],xmm2[3]
; SSE41-NEXT:    movdqa %xmm0, %xmm2
; SSE41-NEXT:    psrad %xmm1, %xmm2
; SSE41-NEXT:    psrad %xmm3, %xmm0
; SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm2[4,5,6,7]
; SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm4[2,3],xmm0[4,5],xmm4[6,7]
; SSE41-NEXT:    retq
;
; AVX1-LABEL: var_shift_v4i32:
; AVX1:       # BB#0:
; AVX1-NEXT:    vpsrldq {{.*#+}} xmm2 = xmm1[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; AVX1-NEXT:    vpsrad %xmm2, %xmm0, %xmm2
; AVX1-NEXT:    vpsrlq $32, %xmm1, %xmm3
; AVX1-NEXT:    vpsrad %xmm3, %xmm0, %xmm3
; AVX1-NEXT:    vpblendw {{.*#+}} xmm2 = xmm3[0,1,2,3],xmm2[4,5,6,7]
; AVX1-NEXT:    vpxor %xmm3, %xmm3, %xmm3
; AVX1-NEXT:    vpunpckhdq {{.*#+}} xmm3 = xmm1[2],xmm3[2],xmm1[3],xmm3[3]
; AVX1-NEXT:    vpsrad %xmm3, %xmm0, %xmm3
; AVX1-NEXT:    vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero
; AVX1-NEXT:    vpsrad %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm3[4,5,6,7]
; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7]
; AVX1-NEXT:    retq
;
; AVX2-LABEL: var_shift_v4i32:
; AVX2:       # BB#0:
; AVX2-NEXT:    vpsravd %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    retq
;
; XOPAVX1-LABEL: var_shift_v4i32:
; XOPAVX1:       # BB#0:
; XOPAVX1-NEXT:    vpxor %xmm2, %xmm2, %xmm2
; XOPAVX1-NEXT:    vpsubd %xmm1, %xmm2, %xmm1
; XOPAVX1-NEXT:    vpshad %xmm1, %xmm0, %xmm0
; XOPAVX1-NEXT:    retq
;
; XOPAVX2-LABEL: var_shift_v4i32:
; XOPAVX2:       # BB#0:
; XOPAVX2-NEXT:    vpsravd %xmm1, %xmm0, %xmm0
; XOPAVX2-NEXT:    retq
;
; AVX512-LABEL: var_shift_v4i32:
; AVX512:       ## BB#0:
; AVX512-NEXT:    vpsravd %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    retq
;
; X32-SSE-LABEL: var_shift_v4i32:
; X32-SSE:       # BB#0:
; X32-SSE-NEXT:    movdqa %xmm1, %xmm2
; X32-SSE-NEXT:    psrldq {{.*#+}} xmm2 = xmm2[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; X32-SSE-NEXT:    movdqa %xmm0, %xmm3
; X32-SSE-NEXT:    psrad %xmm2, %xmm3
; X32-SSE-NEXT:    movdqa %xmm1, %xmm2
; X32-SSE-NEXT:    psrlq $32, %xmm2
; X32-SSE-NEXT:    movdqa %xmm0, %xmm4
; X32-SSE-NEXT:    psrad %xmm2, %xmm4
; X32-SSE-NEXT:    movsd {{.*#+}} xmm3 = xmm4[0],xmm3[1]
; X32-SSE-NEXT:    pshufd {{.*#+}} xmm2 = xmm3[1,3,2,3]
; X32-SSE-NEXT:    pxor %xmm3, %xmm3
; X32-SSE-NEXT:    movdqa %xmm1, %xmm4
; X32-SSE-NEXT:    punpckhdq {{.*#+}} xmm4 = xmm4[2],xmm3[2],xmm4[3],xmm3[3]
; X32-SSE-NEXT:    movdqa %xmm0, %xmm5
; X32-SSE-NEXT:    psrad %xmm4, %xmm5
; X32-SSE-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1]
; X32-SSE-NEXT:    psrad %xmm1, %xmm0
; X32-SSE-NEXT:    movsd {{.*#+}} xmm5 = xmm0[0],xmm5[1]
; X32-SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm5[0,2,2,3]
; X32-SSE-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
; X32-SSE-NEXT:    retl
  %shift = ashr <4 x i32> %a, %b
  ret <4 x i32> %shift
}
222
; Per-lane variable shift: ashr of <8 x i16> %a by the matching lane of %b.
; The check groups below are autogenerated by utils/update_llc_test_checks.py;
; regenerate them with that script rather than editing them by hand.
define <8 x i16> @var_shift_v8i16(<8 x i16> %a, <8 x i16> %b) nounwind {
; SSE2-LABEL: var_shift_v8i16:
; SSE2:       # BB#0:
; SSE2-NEXT:    psllw $12, %xmm1
; SSE2-NEXT:    movdqa %xmm1, %xmm2
; SSE2-NEXT:    psraw $15, %xmm2
; SSE2-NEXT:    movdqa %xmm2, %xmm3
; SSE2-NEXT:    pandn %xmm0, %xmm3
; SSE2-NEXT:    psraw $8, %xmm0
; SSE2-NEXT:    pand %xmm2, %xmm0
; SSE2-NEXT:    por %xmm3, %xmm0
; SSE2-NEXT:    paddw %xmm1, %xmm1
; SSE2-NEXT:    movdqa %xmm1, %xmm2
; SSE2-NEXT:    psraw $15, %xmm2
; SSE2-NEXT:    movdqa %xmm2, %xmm3
; SSE2-NEXT:    pandn %xmm0, %xmm3
; SSE2-NEXT:    psraw $4, %xmm0
; SSE2-NEXT:    pand %xmm2, %xmm0
; SSE2-NEXT:    por %xmm3, %xmm0
; SSE2-NEXT:    paddw %xmm1, %xmm1
; SSE2-NEXT:    movdqa %xmm1, %xmm2
; SSE2-NEXT:    psraw $15, %xmm2
; SSE2-NEXT:    movdqa %xmm2, %xmm3
; SSE2-NEXT:    pandn %xmm0, %xmm3
; SSE2-NEXT:    psraw $2, %xmm0
; SSE2-NEXT:    pand %xmm2, %xmm0
; SSE2-NEXT:    por %xmm3, %xmm0
; SSE2-NEXT:    paddw %xmm1, %xmm1
; SSE2-NEXT:    psraw $15, %xmm1
; SSE2-NEXT:    movdqa %xmm1, %xmm2
; SSE2-NEXT:    pandn %xmm0, %xmm2
; SSE2-NEXT:    psraw $1, %xmm0
; SSE2-NEXT:    pand %xmm1, %xmm0
; SSE2-NEXT:    por %xmm2, %xmm0
; SSE2-NEXT:    retq
;
; SSE41-LABEL: var_shift_v8i16:
; SSE41:       # BB#0:
; SSE41-NEXT:    movdqa %xmm0, %xmm2
; SSE41-NEXT:    movdqa %xmm1, %xmm0
; SSE41-NEXT:    psllw $12, %xmm0
; SSE41-NEXT:    psllw $4, %xmm1
; SSE41-NEXT:    por %xmm0, %xmm1
; SSE41-NEXT:    movdqa %xmm1, %xmm3
; SSE41-NEXT:    paddw %xmm3, %xmm3
; SSE41-NEXT:    movdqa %xmm2, %xmm4
; SSE41-NEXT:    psraw $8, %xmm4
; SSE41-NEXT:    movdqa %xmm1, %xmm0
; SSE41-NEXT:    pblendvb %xmm4, %xmm2
; SSE41-NEXT:    movdqa %xmm2, %xmm1
; SSE41-NEXT:    psraw $4, %xmm1
; SSE41-NEXT:    movdqa %xmm3, %xmm0
; SSE41-NEXT:    pblendvb %xmm1, %xmm2
; SSE41-NEXT:    movdqa %xmm2, %xmm1
; SSE41-NEXT:    psraw $2, %xmm1
; SSE41-NEXT:    paddw %xmm3, %xmm3
; SSE41-NEXT:    movdqa %xmm3, %xmm0
; SSE41-NEXT:    pblendvb %xmm1, %xmm2
; SSE41-NEXT:    movdqa %xmm2, %xmm1
; SSE41-NEXT:    psraw $1, %xmm1
; SSE41-NEXT:    paddw %xmm3, %xmm3
; SSE41-NEXT:    movdqa %xmm3, %xmm0
; SSE41-NEXT:    pblendvb %xmm1, %xmm2
; SSE41-NEXT:    movdqa %xmm2, %xmm0
; SSE41-NEXT:    retq
;
; AVX1-LABEL: var_shift_v8i16:
; AVX1:       # BB#0:
; AVX1-NEXT:    vpsllw $12, %xmm1, %xmm2
; AVX1-NEXT:    vpsllw $4, %xmm1, %xmm1
; AVX1-NEXT:    vpor %xmm2, %xmm1, %xmm1
; AVX1-NEXT:    vpaddw %xmm1, %xmm1, %xmm2
; AVX1-NEXT:    vpsraw $8, %xmm0, %xmm3
; AVX1-NEXT:    vpblendvb %xmm1, %xmm3, %xmm0, %xmm0
; AVX1-NEXT:    vpsraw $4, %xmm0, %xmm1
; AVX1-NEXT:    vpblendvb %xmm2, %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vpsraw $2, %xmm0, %xmm1
; AVX1-NEXT:    vpaddw %xmm2, %xmm2, %xmm2
; AVX1-NEXT:    vpblendvb %xmm2, %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vpsraw $1, %xmm0, %xmm1
; AVX1-NEXT:    vpaddw %xmm2, %xmm2, %xmm2
; AVX1-NEXT:    vpblendvb %xmm2, %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: var_shift_v8i16:
; AVX2:       # BB#0:
; AVX2-NEXT:    vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
; AVX2-NEXT:    vpmovsxwd %xmm0, %ymm0
; AVX2-NEXT:    vpsravd %ymm1, %ymm0, %ymm0
; AVX2-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero,ymm0[16,17,20,21,24,25,28,29],zero,zero,zero,zero,zero,zero,zero,zero
; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
; AVX2-NEXT:    # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
; AVX2-NEXT:    vzeroupper
; AVX2-NEXT:    retq
;
; XOP-LABEL: var_shift_v8i16:
; XOP:       # BB#0:
; XOP-NEXT:    vpxor %xmm2, %xmm2, %xmm2
; XOP-NEXT:    vpsubw %xmm1, %xmm2, %xmm1
; XOP-NEXT:    vpshaw %xmm1, %xmm0, %xmm0
; XOP-NEXT:    retq
;
; AVX512-LABEL: var_shift_v8i16:
; AVX512:       ## BB#0:
; AVX512-NEXT:    ## kill: %XMM1<def> %XMM1<kill> %ZMM1<def>
; AVX512-NEXT:    ## kill: %XMM0<def> %XMM0<kill> %ZMM0<def>
; AVX512-NEXT:    vpsravw %zmm1, %zmm0, %zmm0
; AVX512-NEXT:    ## kill: %XMM0<def> %XMM0<kill> %ZMM0<kill>
; AVX512-NEXT:    retq
;
; X32-SSE-LABEL: var_shift_v8i16:
; X32-SSE:       # BB#0:
; X32-SSE-NEXT:    psllw $12, %xmm1
; X32-SSE-NEXT:    movdqa %xmm1, %xmm2
; X32-SSE-NEXT:    psraw $15, %xmm2
; X32-SSE-NEXT:    movdqa %xmm2, %xmm3
; X32-SSE-NEXT:    pandn %xmm0, %xmm3
; X32-SSE-NEXT:    psraw $8, %xmm0
; X32-SSE-NEXT:    pand %xmm2, %xmm0
; X32-SSE-NEXT:    por %xmm3, %xmm0
; X32-SSE-NEXT:    paddw %xmm1, %xmm1
; X32-SSE-NEXT:    movdqa %xmm1, %xmm2
; X32-SSE-NEXT:    psraw $15, %xmm2
; X32-SSE-NEXT:    movdqa %xmm2, %xmm3
; X32-SSE-NEXT:    pandn %xmm0, %xmm3
; X32-SSE-NEXT:    psraw $4, %xmm0
; X32-SSE-NEXT:    pand %xmm2, %xmm0
; X32-SSE-NEXT:    por %xmm3, %xmm0
; X32-SSE-NEXT:    paddw %xmm1, %xmm1
; X32-SSE-NEXT:    movdqa %xmm1, %xmm2
; X32-SSE-NEXT:    psraw $15, %xmm2
; X32-SSE-NEXT:    movdqa %xmm2, %xmm3
; X32-SSE-NEXT:    pandn %xmm0, %xmm3
; X32-SSE-NEXT:    psraw $2, %xmm0
; X32-SSE-NEXT:    pand %xmm2, %xmm0
; X32-SSE-NEXT:    por %xmm3, %xmm0
; X32-SSE-NEXT:    paddw %xmm1, %xmm1
; X32-SSE-NEXT:    psraw $15, %xmm1
; X32-SSE-NEXT:    movdqa %xmm1, %xmm2
; X32-SSE-NEXT:    pandn %xmm0, %xmm2
; X32-SSE-NEXT:    psraw $1, %xmm0
; X32-SSE-NEXT:    pand %xmm1, %xmm0
; X32-SSE-NEXT:    por %xmm2, %xmm0
; X32-SSE-NEXT:    retl
  %shift = ashr <8 x i16> %a, %b
  ret <8 x i16> %shift
}
370
; Per-lane variable shift: ashr of <16 x i8> %a by the matching lane of %b.
; The check groups below are autogenerated by utils/update_llc_test_checks.py;
; regenerate them with that script rather than editing them by hand.
define <16 x i8> @var_shift_v16i8(<16 x i8> %a, <16 x i8> %b) nounwind {
; SSE2-LABEL: var_shift_v16i8:
; SSE2:       # BB#0:
; SSE2-NEXT:    punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm0[8],xmm2[9],xmm0[9],xmm2[10],xmm0[10],xmm2[11],xmm0[11],xmm2[12],xmm0[12],xmm2[13],xmm0[13],xmm2[14],xmm0[14],xmm2[15],xmm0[15]
; SSE2-NEXT:    psllw $5, %xmm1
; SSE2-NEXT:    punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm1[8],xmm4[9],xmm1[9],xmm4[10],xmm1[10],xmm4[11],xmm1[11],xmm4[12],xmm1[12],xmm4[13],xmm1[13],xmm4[14],xmm1[14],xmm4[15],xmm1[15]
; SSE2-NEXT:    pxor %xmm3, %xmm3
; SSE2-NEXT:    pxor %xmm5, %xmm5
; SSE2-NEXT:    pcmpgtw %xmm4, %xmm5
; SSE2-NEXT:    movdqa %xmm5, %xmm6
; SSE2-NEXT:    pandn %xmm2, %xmm6
; SSE2-NEXT:    psraw $4, %xmm2
; SSE2-NEXT:    pand %xmm5, %xmm2
; SSE2-NEXT:    por %xmm6, %xmm2
; SSE2-NEXT:    paddw %xmm4, %xmm4
; SSE2-NEXT:    pxor %xmm5, %xmm5
; SSE2-NEXT:    pcmpgtw %xmm4, %xmm5
; SSE2-NEXT:    movdqa %xmm5, %xmm6
; SSE2-NEXT:    pandn %xmm2, %xmm6
; SSE2-NEXT:    psraw $2, %xmm2
; SSE2-NEXT:    pand %xmm5, %xmm2
; SSE2-NEXT:    por %xmm6, %xmm2
; SSE2-NEXT:    paddw %xmm4, %xmm4
; SSE2-NEXT:    pxor %xmm5, %xmm5
; SSE2-NEXT:    pcmpgtw %xmm4, %xmm5
; SSE2-NEXT:    movdqa %xmm5, %xmm4
; SSE2-NEXT:    pandn %xmm2, %xmm4
; SSE2-NEXT:    psraw $1, %xmm2
; SSE2-NEXT:    pand %xmm5, %xmm2
; SSE2-NEXT:    por %xmm4, %xmm2
; SSE2-NEXT:    psrlw $8, %xmm2
; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; SSE2-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; SSE2-NEXT:    pxor %xmm4, %xmm4
; SSE2-NEXT:    pcmpgtw %xmm1, %xmm4
; SSE2-NEXT:    movdqa %xmm4, %xmm5
; SSE2-NEXT:    pandn %xmm0, %xmm5
; SSE2-NEXT:    psraw $4, %xmm0
; SSE2-NEXT:    pand %xmm4, %xmm0
; SSE2-NEXT:    por %xmm5, %xmm0
; SSE2-NEXT:    paddw %xmm1, %xmm1
; SSE2-NEXT:    pxor %xmm4, %xmm4
; SSE2-NEXT:    pcmpgtw %xmm1, %xmm4
; SSE2-NEXT:    movdqa %xmm4, %xmm5
; SSE2-NEXT:    pandn %xmm0, %xmm5
; SSE2-NEXT:    psraw $2, %xmm0
; SSE2-NEXT:    pand %xmm4, %xmm0
; SSE2-NEXT:    por %xmm5, %xmm0
; SSE2-NEXT:    paddw %xmm1, %xmm1
; SSE2-NEXT:    pcmpgtw %xmm1, %xmm3
; SSE2-NEXT:    movdqa %xmm3, %xmm1
; SSE2-NEXT:    pandn %xmm0, %xmm1
; SSE2-NEXT:    psraw $1, %xmm0
; SSE2-NEXT:    pand %xmm3, %xmm0
; SSE2-NEXT:    por %xmm1, %xmm0
; SSE2-NEXT:    psrlw $8, %xmm0
; SSE2-NEXT:    packuswb %xmm2, %xmm0
; SSE2-NEXT:    retq
;
; SSE41-LABEL: var_shift_v16i8:
; SSE41:       # BB#0:
; SSE41-NEXT:    movdqa %xmm0, %xmm2
; SSE41-NEXT:    psllw $5, %xmm1
; SSE41-NEXT:    punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15]
; SSE41-NEXT:    punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm2[8],xmm3[9],xmm2[9],xmm3[10],xmm2[10],xmm3[11],xmm2[11],xmm3[12],xmm2[12],xmm3[13],xmm2[13],xmm3[14],xmm2[14],xmm3[15],xmm2[15]
; SSE41-NEXT:    movdqa %xmm3, %xmm4
; SSE41-NEXT:    psraw $4, %xmm4
; SSE41-NEXT:    pblendvb %xmm4, %xmm3
; SSE41-NEXT:    movdqa %xmm3, %xmm4
; SSE41-NEXT:    psraw $2, %xmm4
; SSE41-NEXT:    paddw %xmm0, %xmm0
; SSE41-NEXT:    pblendvb %xmm4, %xmm3
; SSE41-NEXT:    movdqa %xmm3, %xmm4
; SSE41-NEXT:    psraw $1, %xmm4
; SSE41-NEXT:    paddw %xmm0, %xmm0
; SSE41-NEXT:    pblendvb %xmm4, %xmm3
; SSE41-NEXT:    psrlw $8, %xmm3
; SSE41-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
; SSE41-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
; SSE41-NEXT:    movdqa %xmm1, %xmm2
; SSE41-NEXT:    psraw $4, %xmm2
; SSE41-NEXT:    pblendvb %xmm2, %xmm1
; SSE41-NEXT:    movdqa %xmm1, %xmm2
; SSE41-NEXT:    psraw $2, %xmm2
; SSE41-NEXT:    paddw %xmm0, %xmm0
; SSE41-NEXT:    pblendvb %xmm2, %xmm1
; SSE41-NEXT:    movdqa %xmm1, %xmm2
; SSE41-NEXT:    psraw $1, %xmm2
; SSE41-NEXT:    paddw %xmm0, %xmm0
; SSE41-NEXT:    pblendvb %xmm2, %xmm1
; SSE41-NEXT:    psrlw $8, %xmm1
; SSE41-NEXT:    packuswb %xmm3, %xmm1
; SSE41-NEXT:    movdqa %xmm1, %xmm0
; SSE41-NEXT:    retq
;
; AVX-LABEL: var_shift_v16i8:
; AVX:       # BB#0:
; AVX-NEXT:    vpsllw $5, %xmm1, %xmm1
; AVX-NEXT:    vpunpckhbw {{.*#+}} xmm2 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15]
; AVX-NEXT:    vpunpckhbw {{.*#+}} xmm3 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
; AVX-NEXT:    vpsraw $4, %xmm3, %xmm4
; AVX-NEXT:    vpblendvb %xmm2, %xmm4, %xmm3, %xmm3
; AVX-NEXT:    vpsraw $2, %xmm3, %xmm4
; AVX-NEXT:    vpaddw %xmm2, %xmm2, %xmm2
; AVX-NEXT:    vpblendvb %xmm2, %xmm4, %xmm3, %xmm3
; AVX-NEXT:    vpsraw $1, %xmm3, %xmm4
; AVX-NEXT:    vpaddw %xmm2, %xmm2, %xmm2
; AVX-NEXT:    vpblendvb %xmm2, %xmm4, %xmm3, %xmm2
; AVX-NEXT:    vpsrlw $8, %xmm2, %xmm2
; AVX-NEXT:    vpunpcklbw {{.*#+}} xmm1 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
; AVX-NEXT:    vpunpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; AVX-NEXT:    vpsraw $4, %xmm0, %xmm3
; AVX-NEXT:    vpblendvb %xmm1, %xmm3, %xmm0, %xmm0
; AVX-NEXT:    vpsraw $2, %xmm0, %xmm3
; AVX-NEXT:    vpaddw %xmm1, %xmm1, %xmm1
; AVX-NEXT:    vpblendvb %xmm1, %xmm3, %xmm0, %xmm0
; AVX-NEXT:    vpsraw $1, %xmm0, %xmm3
; AVX-NEXT:    vpaddw %xmm1, %xmm1, %xmm1
; AVX-NEXT:    vpblendvb %xmm1, %xmm3, %xmm0, %xmm0
; AVX-NEXT:    vpsrlw $8, %xmm0, %xmm0
; AVX-NEXT:    vpackuswb %xmm2, %xmm0, %xmm0
; AVX-NEXT:    retq
;
; XOP-LABEL: var_shift_v16i8:
; XOP:       # BB#0:
; XOP-NEXT:    vpxor %xmm2, %xmm2, %xmm2
; XOP-NEXT:    vpsubb %xmm1, %xmm2, %xmm1
; XOP-NEXT:    vpshab %xmm1, %xmm0, %xmm0
; XOP-NEXT:    retq
;
; AVX512-LABEL: var_shift_v16i8:
; AVX512:       ## BB#0:
; AVX512-NEXT:    vpsllw $5, %xmm1, %xmm1
; AVX512-NEXT:    vpunpckhbw {{.*#+}} xmm2 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15]
; AVX512-NEXT:    vpunpckhbw {{.*#+}} xmm3 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
; AVX512-NEXT:    vpsraw $4, %xmm3, %xmm4
; AVX512-NEXT:    vpblendvb %xmm2, %xmm4, %xmm3, %xmm3
; AVX512-NEXT:    vpsraw $2, %xmm3, %xmm4
; AVX512-NEXT:    vpaddw %xmm2, %xmm2, %xmm2
; AVX512-NEXT:    vpblendvb %xmm2, %xmm4, %xmm3, %xmm3
; AVX512-NEXT:    vpsraw $1, %xmm3, %xmm4
; AVX512-NEXT:    vpaddw %xmm2, %xmm2, %xmm2
; AVX512-NEXT:    vpblendvb %xmm2, %xmm4, %xmm3, %xmm2
; AVX512-NEXT:    vpsrlw $8, %xmm2, %xmm2
; AVX512-NEXT:    vpunpcklbw {{.*#+}} xmm1 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
; AVX512-NEXT:    vpunpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; AVX512-NEXT:    vpsraw $4, %xmm0, %xmm3
; AVX512-NEXT:    vpblendvb %xmm1, %xmm3, %xmm0, %xmm0
; AVX512-NEXT:    vpsraw $2, %xmm0, %xmm3
; AVX512-NEXT:    vpaddw %xmm1, %xmm1, %xmm1
; AVX512-NEXT:    vpblendvb %xmm1, %xmm3, %xmm0, %xmm0
; AVX512-NEXT:    vpsraw $1, %xmm0, %xmm3
; AVX512-NEXT:    vpaddw %xmm1, %xmm1, %xmm1
; AVX512-NEXT:    vpblendvb %xmm1, %xmm3, %xmm0, %xmm0
; AVX512-NEXT:    vpsrlw $8, %xmm0, %xmm0
; AVX512-NEXT:    vpackuswb %xmm2, %xmm0, %xmm0
; AVX512-NEXT:    retq
;
; X32-SSE-LABEL: var_shift_v16i8:
; X32-SSE:       # BB#0:
; X32-SSE-NEXT:    punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm0[8],xmm2[9],xmm0[9],xmm2[10],xmm0[10],xmm2[11],xmm0[11],xmm2[12],xmm0[12],xmm2[13],xmm0[13],xmm2[14],xmm0[14],xmm2[15],xmm0[15]
; X32-SSE-NEXT:    psllw $5, %xmm1
; X32-SSE-NEXT:    punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm1[8],xmm4[9],xmm1[9],xmm4[10],xmm1[10],xmm4[11],xmm1[11],xmm4[12],xmm1[12],xmm4[13],xmm1[13],xmm4[14],xmm1[14],xmm4[15],xmm1[15]
; X32-SSE-NEXT:    pxor %xmm3, %xmm3
; X32-SSE-NEXT:    pxor %xmm5, %xmm5
; X32-SSE-NEXT:    pcmpgtw %xmm4, %xmm5
; X32-SSE-NEXT:    movdqa %xmm5, %xmm6
; X32-SSE-NEXT:    pandn %xmm2, %xmm6
; X32-SSE-NEXT:    psraw $4, %xmm2
; X32-SSE-NEXT:    pand %xmm5, %xmm2
; X32-SSE-NEXT:    por %xmm6, %xmm2
; X32-SSE-NEXT:    paddw %xmm4, %xmm4
; X32-SSE-NEXT:    pxor %xmm5, %xmm5
; X32-SSE-NEXT:    pcmpgtw %xmm4, %xmm5
; X32-SSE-NEXT:    movdqa %xmm5, %xmm6
; X32-SSE-NEXT:    pandn %xmm2, %xmm6
; X32-SSE-NEXT:    psraw $2, %xmm2
; X32-SSE-NEXT:    pand %xmm5, %xmm2
; X32-SSE-NEXT:    por %xmm6, %xmm2
; X32-SSE-NEXT:    paddw %xmm4, %xmm4
; X32-SSE-NEXT:    pxor %xmm5, %xmm5
; X32-SSE-NEXT:    pcmpgtw %xmm4, %xmm5
; X32-SSE-NEXT:    movdqa %xmm5, %xmm4
; X32-SSE-NEXT:    pandn %xmm2, %xmm4
; X32-SSE-NEXT:    psraw $1, %xmm2
; X32-SSE-NEXT:    pand %xmm5, %xmm2
; X32-SSE-NEXT:    por %xmm4, %xmm2
; X32-SSE-NEXT:    psrlw $8, %xmm2
; X32-SSE-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; X32-SSE-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; X32-SSE-NEXT:    pxor %xmm4, %xmm4
; X32-SSE-NEXT:    pcmpgtw %xmm1, %xmm4
; X32-SSE-NEXT:    movdqa %xmm4, %xmm5
; X32-SSE-NEXT:    pandn %xmm0, %xmm5
; X32-SSE-NEXT:    psraw $4, %xmm0
; X32-SSE-NEXT:    pand %xmm4, %xmm0
; X32-SSE-NEXT:    por %xmm5, %xmm0
; X32-SSE-NEXT:    paddw %xmm1, %xmm1
; X32-SSE-NEXT:    pxor %xmm4, %xmm4
; X32-SSE-NEXT:    pcmpgtw %xmm1, %xmm4
; X32-SSE-NEXT:    movdqa %xmm4, %xmm5
; X32-SSE-NEXT:    pandn %xmm0, %xmm5
; X32-SSE-NEXT:    psraw $2, %xmm0
; X32-SSE-NEXT:    pand %xmm4, %xmm0
; X32-SSE-NEXT:    por %xmm5, %xmm0
; X32-SSE-NEXT:    paddw %xmm1, %xmm1
; X32-SSE-NEXT:    pcmpgtw %xmm1, %xmm3
; X32-SSE-NEXT:    movdqa %xmm3, %xmm1
; X32-SSE-NEXT:    pandn %xmm0, %xmm1
; X32-SSE-NEXT:    psraw $1, %xmm0
; X32-SSE-NEXT:    pand %xmm3, %xmm0
; X32-SSE-NEXT:    por %xmm1, %xmm0
; X32-SSE-NEXT:    psrlw $8, %xmm0
; X32-SSE-NEXT:    packuswb %xmm2, %xmm0
; X32-SSE-NEXT:    retl
  %shift = ashr <16 x i8> %a, %b
  ret <16 x i8> %shift
}
589
;
; Uniform Variable Shifts
;
593
; Uniform variable shift: element 0 of %b is broadcast to both lanes by the
; zero-mask shufflevector, and that splat is the ashr amount for <2 x i64> %a.
; The check groups below are autogenerated by utils/update_llc_test_checks.py;
; regenerate them with that script rather than editing them by hand.
define <2 x i64> @splatvar_shift_v2i64(<2 x i64> %a, <2 x i64> %b) nounwind {
; SSE-LABEL: splatvar_shift_v2i64:
; SSE:       # BB#0:
; SSE-NEXT:    movdqa {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808]
; SSE-NEXT:    psrlq %xmm1, %xmm2
; SSE-NEXT:    psrlq %xmm1, %xmm0
; SSE-NEXT:    pxor %xmm2, %xmm0
; SSE-NEXT:    psubq %xmm2, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: splatvar_shift_v2i64:
; AVX:       # BB#0:
; AVX-NEXT:    vmovdqa {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808]
; AVX-NEXT:    vpsrlq %xmm1, %xmm2, %xmm2
; AVX-NEXT:    vpsrlq %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vpxor %xmm2, %xmm0, %xmm0
; AVX-NEXT:    vpsubq %xmm2, %xmm0, %xmm0
; AVX-NEXT:    retq
;
; XOPAVX1-LABEL: splatvar_shift_v2i64:
; XOPAVX1:       # BB#0:
; XOPAVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[0,1,0,1]
; XOPAVX1-NEXT:    vpxor %xmm2, %xmm2, %xmm2
; XOPAVX1-NEXT:    vpsubq %xmm1, %xmm2, %xmm1
; XOPAVX1-NEXT:    vpshaq %xmm1, %xmm0, %xmm0
; XOPAVX1-NEXT:    retq
;
; XOPAVX2-LABEL: splatvar_shift_v2i64:
; XOPAVX2:       # BB#0:
; XOPAVX2-NEXT:    vpbroadcastq %xmm1, %xmm1
; XOPAVX2-NEXT:    vpxor %xmm2, %xmm2, %xmm2
; XOPAVX2-NEXT:    vpsubq %xmm1, %xmm2, %xmm1
; XOPAVX2-NEXT:    vpshaq %xmm1, %xmm0, %xmm0
; XOPAVX2-NEXT:    retq
;
; AVX512-LABEL: splatvar_shift_v2i64:
; AVX512:       ## BB#0:
; AVX512-NEXT:    vmovdqa {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808]
; AVX512-NEXT:    vpsrlq %xmm1, %xmm2, %xmm2
; AVX512-NEXT:    vpsrlq %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    vpxor %xmm2, %xmm0, %xmm0
; AVX512-NEXT:    vpsubq %xmm2, %xmm0, %xmm0
; AVX512-NEXT:    retq
;
; X32-SSE-LABEL: splatvar_shift_v2i64:
; X32-SSE:       # BB#0:
; X32-SSE-NEXT:    movq {{.*#+}} xmm1 = xmm1[0],zero
; X32-SSE-NEXT:    movdqa {{.*#+}} xmm2 = [0,2147483648,0,2147483648]
; X32-SSE-NEXT:    psrlq %xmm1, %xmm2
; X32-SSE-NEXT:    psrlq %xmm1, %xmm0
; X32-SSE-NEXT:    pxor %xmm2, %xmm0
; X32-SSE-NEXT:    psubq %xmm2, %xmm0
; X32-SSE-NEXT:    retl
  %splat = shufflevector <2 x i64> %b, <2 x i64> undef, <2 x i32> zeroinitializer
  %shift = ashr <2 x i64> %a, %splat
  ret <2 x i64> %shift
}
651
; Uniform variable shift: element 0 of %b is broadcast to all four lanes by the
; zero-mask shufflevector, and that splat is the ashr amount for <4 x i32> %a.
; The check groups below are autogenerated by utils/update_llc_test_checks.py;
; regenerate them with that script rather than editing them by hand.
define <4 x i32> @splatvar_shift_v4i32(<4 x i32> %a, <4 x i32> %b) nounwind {
; SSE2-LABEL: splatvar_shift_v4i32:
; SSE2:       # BB#0:
; SSE2-NEXT:    xorps %xmm2, %xmm2
; SSE2-NEXT:    movss {{.*#+}} xmm2 = xmm1[0],xmm2[1,2,3]
; SSE2-NEXT:    psrad %xmm2, %xmm0
; SSE2-NEXT:    retq
;
; SSE41-LABEL: splatvar_shift_v4i32:
; SSE41:       # BB#0:
; SSE41-NEXT:    pxor %xmm2, %xmm2
; SSE41-NEXT:    pblendw {{.*#+}} xmm2 = xmm1[0,1],xmm2[2,3,4,5,6,7]
; SSE41-NEXT:    psrad %xmm2, %xmm0
; SSE41-NEXT:    retq
;
; AVX-LABEL: splatvar_shift_v4i32:
; AVX:       # BB#0:
; AVX-NEXT:    vpxor %xmm2, %xmm2, %xmm2
; AVX-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3,4,5,6,7]
; AVX-NEXT:    vpsrad %xmm1, %xmm0, %xmm0
; AVX-NEXT:    retq
;
; XOP-LABEL: splatvar_shift_v4i32:
; XOP:       # BB#0:
; XOP-NEXT:    vpxor %xmm2, %xmm2, %xmm2
; XOP-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3,4,5,6,7]
; XOP-NEXT:    vpsrad %xmm1, %xmm0, %xmm0
; XOP-NEXT:    retq
;
; AVX512-LABEL: splatvar_shift_v4i32:
; AVX512:       ## BB#0:
; AVX512-NEXT:    vxorps %xmm2, %xmm2, %xmm2
; AVX512-NEXT:    vmovss {{.*#+}} xmm1 = xmm1[0],xmm2[1,2,3]
; AVX512-NEXT:    vpsrad %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    retq
;
; X32-SSE-LABEL: splatvar_shift_v4i32:
; X32-SSE:       # BB#0:
; X32-SSE-NEXT:    xorps %xmm2, %xmm2
; X32-SSE-NEXT:    movss {{.*#+}} xmm2 = xmm1[0],xmm2[1,2,3]
; X32-SSE-NEXT:    psrad %xmm2, %xmm0
; X32-SSE-NEXT:    retl
  %splat = shufflevector <4 x i32> %b, <4 x i32> undef, <4 x i32> zeroinitializer
  %shift = ashr <4 x i32> %a, %splat
  ret <4 x i32> %shift
}
698
; Uniform variable shift: element 0 of %b is broadcast to all eight lanes by the
; zero-mask shufflevector, and that splat is the ashr amount for <8 x i16> %a.
; The check groups below are autogenerated by utils/update_llc_test_checks.py;
; regenerate them with that script rather than editing them by hand.
define <8 x i16> @splatvar_shift_v8i16(<8 x i16> %a, <8 x i16> %b) nounwind {
; SSE2-LABEL: splatvar_shift_v8i16:
; SSE2:       # BB#0:
; SSE2-NEXT:    movd %xmm1, %eax
; SSE2-NEXT:    movzwl %ax, %eax
; SSE2-NEXT:    movd %eax, %xmm1
; SSE2-NEXT:    psraw %xmm1, %xmm0
; SSE2-NEXT:    retq
;
; SSE41-LABEL: splatvar_shift_v8i16:
; SSE41:       # BB#0:
; SSE41-NEXT:    pxor %xmm2, %xmm2
; SSE41-NEXT:    pblendw {{.*#+}} xmm2 = xmm1[0],xmm2[1,2,3,4,5,6,7]
; SSE41-NEXT:    psraw %xmm2, %xmm0
; SSE41-NEXT:    retq
;
; AVX-LABEL: splatvar_shift_v8i16:
; AVX:       # BB#0:
; AVX-NEXT:    vpxor %xmm2, %xmm2, %xmm2
; AVX-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1,2,3,4,5,6,7]
; AVX-NEXT:    vpsraw %xmm1, %xmm0, %xmm0
; AVX-NEXT:    retq
;
; XOP-LABEL: splatvar_shift_v8i16:
; XOP:       # BB#0:
; XOP-NEXT:    vpxor %xmm2, %xmm2, %xmm2
; XOP-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1,2,3,4,5,6,7]
; XOP-NEXT:    vpsraw %xmm1, %xmm0, %xmm0
; XOP-NEXT:    retq
;
; AVX512-LABEL: splatvar_shift_v8i16:
; AVX512:       ## BB#0:
; AVX512-NEXT:    vpxor %xmm2, %xmm2, %xmm2
; AVX512-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1,2,3,4,5,6,7]
; AVX512-NEXT:    vpsraw %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    retq
;
; X32-SSE-LABEL: splatvar_shift_v8i16:
; X32-SSE:       # BB#0:
; X32-SSE-NEXT:    movd %xmm1, %eax
; X32-SSE-NEXT:    movzwl %ax, %eax
; X32-SSE-NEXT:    movd %eax, %xmm1
; X32-SSE-NEXT:    psraw %xmm1, %xmm0
; X32-SSE-NEXT:    retl
  %splat = shufflevector <8 x i16> %b, <8 x i16> undef, <8 x i32> zeroinitializer
  %shift = ashr <8 x i16> %a, %splat
  ret <8 x i16> %shift
}
747
; Arithmetic shift right of every i8 lane of %a by one shared amount: lane 0
; of %b, broadcast to all sixteen lanes by an all-zero shuffle mask.
;
; The assertions below are autogenerated (see the note on the first line of
; this file); regenerate them with that script instead of editing by hand.
;
; Expected code, per run configuration:
;  - runs without xop: the amount is splatted (punpcklbw/pshuflw/pshufd,
;    pshufb with a zero mask, or vpbroadcastb), shifted left by 5, and the
;    input bytes are unpacked into words in two halves (high eight bytes,
;    then low eight bytes). Each half goes through three select stages that
;    choose between the current value and the value shifted by 4, 2 and 1
;    (psraw), with the selector doubled (paddw) between stages. The sse2-only
;    runs build the select from pcmpgtw against zero plus pand/pandn/por; the
;    others use pblendvb / vpblendvb. Each half is then shifted right
;    logically by 8 and the halves are recombined with packuswb.
;  - xop runs: splat the amount, negate it (0 - amount via vpsubb) and issue a
;    single vpshab.
748define <16 x i8> @splatvar_shift_v16i8(<16 x i8> %a, <16 x i8> %b) nounwind {
749; SSE2-LABEL: splatvar_shift_v16i8:
750; SSE2:       # BB#0:
751; SSE2-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
752; SSE2-NEXT:    pshuflw {{.*#+}} xmm1 = xmm1[0,0,0,0,4,5,6,7]
753; SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm1[0,0,1,1]
754; SSE2-NEXT:    punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15]
755; SSE2-NEXT:    psllw $5, %xmm3
756; SSE2-NEXT:    punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm3[8],xmm4[9],xmm3[9],xmm4[10],xmm3[10],xmm4[11],xmm3[11],xmm4[12],xmm3[12],xmm4[13],xmm3[13],xmm4[14],xmm3[14],xmm4[15],xmm3[15]
757; SSE2-NEXT:    pxor %xmm2, %xmm2
758; SSE2-NEXT:    pxor %xmm5, %xmm5
759; SSE2-NEXT:    pcmpgtw %xmm4, %xmm5
760; SSE2-NEXT:    movdqa %xmm5, %xmm6
761; SSE2-NEXT:    pandn %xmm1, %xmm6
762; SSE2-NEXT:    psraw $4, %xmm1
763; SSE2-NEXT:    pand %xmm5, %xmm1
764; SSE2-NEXT:    por %xmm6, %xmm1
765; SSE2-NEXT:    paddw %xmm4, %xmm4
766; SSE2-NEXT:    pxor %xmm5, %xmm5
767; SSE2-NEXT:    pcmpgtw %xmm4, %xmm5
768; SSE2-NEXT:    movdqa %xmm5, %xmm6
769; SSE2-NEXT:    pandn %xmm1, %xmm6
770; SSE2-NEXT:    psraw $2, %xmm1
771; SSE2-NEXT:    pand %xmm5, %xmm1
772; SSE2-NEXT:    por %xmm6, %xmm1
773; SSE2-NEXT:    paddw %xmm4, %xmm4
774; SSE2-NEXT:    pxor %xmm5, %xmm5
775; SSE2-NEXT:    pcmpgtw %xmm4, %xmm5
776; SSE2-NEXT:    movdqa %xmm5, %xmm4
777; SSE2-NEXT:    pandn %xmm1, %xmm4
778; SSE2-NEXT:    psraw $1, %xmm1
779; SSE2-NEXT:    pand %xmm5, %xmm1
780; SSE2-NEXT:    por %xmm4, %xmm1
781; SSE2-NEXT:    psrlw $8, %xmm1
782; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
783; SSE2-NEXT:    punpcklbw {{.*#+}} xmm3 = xmm3[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
784; SSE2-NEXT:    pxor %xmm4, %xmm4
785; SSE2-NEXT:    pcmpgtw %xmm3, %xmm4
786; SSE2-NEXT:    movdqa %xmm4, %xmm5
787; SSE2-NEXT:    pandn %xmm0, %xmm5
788; SSE2-NEXT:    psraw $4, %xmm0
789; SSE2-NEXT:    pand %xmm4, %xmm0
790; SSE2-NEXT:    por %xmm5, %xmm0
791; SSE2-NEXT:    paddw %xmm3, %xmm3
792; SSE2-NEXT:    pxor %xmm4, %xmm4
793; SSE2-NEXT:    pcmpgtw %xmm3, %xmm4
794; SSE2-NEXT:    movdqa %xmm4, %xmm5
795; SSE2-NEXT:    pandn %xmm0, %xmm5
796; SSE2-NEXT:    psraw $2, %xmm0
797; SSE2-NEXT:    pand %xmm4, %xmm0
798; SSE2-NEXT:    por %xmm5, %xmm0
799; SSE2-NEXT:    paddw %xmm3, %xmm3
800; SSE2-NEXT:    pcmpgtw %xmm3, %xmm2
801; SSE2-NEXT:    movdqa %xmm2, %xmm3
802; SSE2-NEXT:    pandn %xmm0, %xmm3
803; SSE2-NEXT:    psraw $1, %xmm0
804; SSE2-NEXT:    pand %xmm2, %xmm0
805; SSE2-NEXT:    por %xmm3, %xmm0
806; SSE2-NEXT:    psrlw $8, %xmm0
807; SSE2-NEXT:    packuswb %xmm1, %xmm0
808; SSE2-NEXT:    retq
809;
810; SSE41-LABEL: splatvar_shift_v16i8:
811; SSE41:       # BB#0:
812; SSE41-NEXT:    movdqa %xmm0, %xmm2
813; SSE41-NEXT:    pxor %xmm0, %xmm0
814; SSE41-NEXT:    pshufb %xmm0, %xmm1
815; SSE41-NEXT:    psllw $5, %xmm1
816; SSE41-NEXT:    punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15]
817; SSE41-NEXT:    punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm2[8],xmm3[9],xmm2[9],xmm3[10],xmm2[10],xmm3[11],xmm2[11],xmm3[12],xmm2[12],xmm3[13],xmm2[13],xmm3[14],xmm2[14],xmm3[15],xmm2[15]
818; SSE41-NEXT:    movdqa %xmm3, %xmm4
819; SSE41-NEXT:    psraw $4, %xmm4
820; SSE41-NEXT:    pblendvb %xmm4, %xmm3
821; SSE41-NEXT:    movdqa %xmm3, %xmm4
822; SSE41-NEXT:    psraw $2, %xmm4
823; SSE41-NEXT:    paddw %xmm0, %xmm0
824; SSE41-NEXT:    pblendvb %xmm4, %xmm3
825; SSE41-NEXT:    movdqa %xmm3, %xmm4
826; SSE41-NEXT:    psraw $1, %xmm4
827; SSE41-NEXT:    paddw %xmm0, %xmm0
828; SSE41-NEXT:    pblendvb %xmm4, %xmm3
829; SSE41-NEXT:    psrlw $8, %xmm3
830; SSE41-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
831; SSE41-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
832; SSE41-NEXT:    movdqa %xmm1, %xmm2
833; SSE41-NEXT:    psraw $4, %xmm2
834; SSE41-NEXT:    pblendvb %xmm2, %xmm1
835; SSE41-NEXT:    movdqa %xmm1, %xmm2
836; SSE41-NEXT:    psraw $2, %xmm2
837; SSE41-NEXT:    paddw %xmm0, %xmm0
838; SSE41-NEXT:    pblendvb %xmm2, %xmm1
839; SSE41-NEXT:    movdqa %xmm1, %xmm2
840; SSE41-NEXT:    psraw $1, %xmm2
841; SSE41-NEXT:    paddw %xmm0, %xmm0
842; SSE41-NEXT:    pblendvb %xmm2, %xmm1
843; SSE41-NEXT:    psrlw $8, %xmm1
844; SSE41-NEXT:    packuswb %xmm3, %xmm1
845; SSE41-NEXT:    movdqa %xmm1, %xmm0
846; SSE41-NEXT:    retq
847;
848; AVX1-LABEL: splatvar_shift_v16i8:
849; AVX1:       # BB#0:
850; AVX1-NEXT:    vpxor %xmm2, %xmm2, %xmm2
851; AVX1-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
852; AVX1-NEXT:    vpsllw $5, %xmm1, %xmm1
853; AVX1-NEXT:    vpunpckhbw {{.*#+}} xmm2 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15]
854; AVX1-NEXT:    vpunpckhbw {{.*#+}} xmm3 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
855; AVX1-NEXT:    vpsraw $4, %xmm3, %xmm4
856; AVX1-NEXT:    vpblendvb %xmm2, %xmm4, %xmm3, %xmm3
857; AVX1-NEXT:    vpsraw $2, %xmm3, %xmm4
858; AVX1-NEXT:    vpaddw %xmm2, %xmm2, %xmm2
859; AVX1-NEXT:    vpblendvb %xmm2, %xmm4, %xmm3, %xmm3
860; AVX1-NEXT:    vpsraw $1, %xmm3, %xmm4
861; AVX1-NEXT:    vpaddw %xmm2, %xmm2, %xmm2
862; AVX1-NEXT:    vpblendvb %xmm2, %xmm4, %xmm3, %xmm2
863; AVX1-NEXT:    vpsrlw $8, %xmm2, %xmm2
864; AVX1-NEXT:    vpunpcklbw {{.*#+}} xmm1 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
865; AVX1-NEXT:    vpunpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
866; AVX1-NEXT:    vpsraw $4, %xmm0, %xmm3
867; AVX1-NEXT:    vpblendvb %xmm1, %xmm3, %xmm0, %xmm0
868; AVX1-NEXT:    vpsraw $2, %xmm0, %xmm3
869; AVX1-NEXT:    vpaddw %xmm1, %xmm1, %xmm1
870; AVX1-NEXT:    vpblendvb %xmm1, %xmm3, %xmm0, %xmm0
871; AVX1-NEXT:    vpsraw $1, %xmm0, %xmm3
872; AVX1-NEXT:    vpaddw %xmm1, %xmm1, %xmm1
873; AVX1-NEXT:    vpblendvb %xmm1, %xmm3, %xmm0, %xmm0
874; AVX1-NEXT:    vpsrlw $8, %xmm0, %xmm0
875; AVX1-NEXT:    vpackuswb %xmm2, %xmm0, %xmm0
876; AVX1-NEXT:    retq
877;
878; AVX2-LABEL: splatvar_shift_v16i8:
879; AVX2:       # BB#0:
880; AVX2-NEXT:    vpbroadcastb %xmm1, %xmm1
881; AVX2-NEXT:    vpsllw $5, %xmm1, %xmm1
882; AVX2-NEXT:    vpunpckhbw {{.*#+}} xmm2 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15]
883; AVX2-NEXT:    vpunpckhbw {{.*#+}} xmm3 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
884; AVX2-NEXT:    vpsraw $4, %xmm3, %xmm4
885; AVX2-NEXT:    vpblendvb %xmm2, %xmm4, %xmm3, %xmm3
886; AVX2-NEXT:    vpsraw $2, %xmm3, %xmm4
887; AVX2-NEXT:    vpaddw %xmm2, %xmm2, %xmm2
888; AVX2-NEXT:    vpblendvb %xmm2, %xmm4, %xmm3, %xmm3
889; AVX2-NEXT:    vpsraw $1, %xmm3, %xmm4
890; AVX2-NEXT:    vpaddw %xmm2, %xmm2, %xmm2
891; AVX2-NEXT:    vpblendvb %xmm2, %xmm4, %xmm3, %xmm2
892; AVX2-NEXT:    vpsrlw $8, %xmm2, %xmm2
893; AVX2-NEXT:    vpunpcklbw {{.*#+}} xmm1 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
894; AVX2-NEXT:    vpunpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
895; AVX2-NEXT:    vpsraw $4, %xmm0, %xmm3
896; AVX2-NEXT:    vpblendvb %xmm1, %xmm3, %xmm0, %xmm0
897; AVX2-NEXT:    vpsraw $2, %xmm0, %xmm3
898; AVX2-NEXT:    vpaddw %xmm1, %xmm1, %xmm1
899; AVX2-NEXT:    vpblendvb %xmm1, %xmm3, %xmm0, %xmm0
900; AVX2-NEXT:    vpsraw $1, %xmm0, %xmm3
901; AVX2-NEXT:    vpaddw %xmm1, %xmm1, %xmm1
902; AVX2-NEXT:    vpblendvb %xmm1, %xmm3, %xmm0, %xmm0
903; AVX2-NEXT:    vpsrlw $8, %xmm0, %xmm0
904; AVX2-NEXT:    vpackuswb %xmm2, %xmm0, %xmm0
905; AVX2-NEXT:    retq
906;
907; XOPAVX1-LABEL: splatvar_shift_v16i8:
908; XOPAVX1:       # BB#0:
909; XOPAVX1-NEXT:    vpxor %xmm2, %xmm2, %xmm2
910; XOPAVX1-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
911; XOPAVX1-NEXT:    vpsubb %xmm1, %xmm2, %xmm1
912; XOPAVX1-NEXT:    vpshab %xmm1, %xmm0, %xmm0
913; XOPAVX1-NEXT:    retq
914;
915; XOPAVX2-LABEL: splatvar_shift_v16i8:
916; XOPAVX2:       # BB#0:
917; XOPAVX2-NEXT:    vpbroadcastb %xmm1, %xmm1
918; XOPAVX2-NEXT:    vpxor %xmm2, %xmm2, %xmm2
919; XOPAVX2-NEXT:    vpsubb %xmm1, %xmm2, %xmm1
920; XOPAVX2-NEXT:    vpshab %xmm1, %xmm0, %xmm0
921; XOPAVX2-NEXT:    retq
922;
923; AVX512-LABEL: splatvar_shift_v16i8:
924; AVX512:       ## BB#0:
925; AVX512-NEXT:    vpbroadcastb %xmm1, %xmm1
926; AVX512-NEXT:    vpsllw $5, %xmm1, %xmm1
927; AVX512-NEXT:    vpunpckhbw {{.*#+}} xmm2 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15]
928; AVX512-NEXT:    vpunpckhbw {{.*#+}} xmm3 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
929; AVX512-NEXT:    vpsraw $4, %xmm3, %xmm4
930; AVX512-NEXT:    vpblendvb %xmm2, %xmm4, %xmm3, %xmm3
931; AVX512-NEXT:    vpsraw $2, %xmm3, %xmm4
932; AVX512-NEXT:    vpaddw %xmm2, %xmm2, %xmm2
933; AVX512-NEXT:    vpblendvb %xmm2, %xmm4, %xmm3, %xmm3
934; AVX512-NEXT:    vpsraw $1, %xmm3, %xmm4
935; AVX512-NEXT:    vpaddw %xmm2, %xmm2, %xmm2
936; AVX512-NEXT:    vpblendvb %xmm2, %xmm4, %xmm3, %xmm2
937; AVX512-NEXT:    vpsrlw $8, %xmm2, %xmm2
938; AVX512-NEXT:    vpunpcklbw {{.*#+}} xmm1 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
939; AVX512-NEXT:    vpunpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
940; AVX512-NEXT:    vpsraw $4, %xmm0, %xmm3
941; AVX512-NEXT:    vpblendvb %xmm1, %xmm3, %xmm0, %xmm0
942; AVX512-NEXT:    vpsraw $2, %xmm0, %xmm3
943; AVX512-NEXT:    vpaddw %xmm1, %xmm1, %xmm1
944; AVX512-NEXT:    vpblendvb %xmm1, %xmm3, %xmm0, %xmm0
945; AVX512-NEXT:    vpsraw $1, %xmm0, %xmm3
946; AVX512-NEXT:    vpaddw %xmm1, %xmm1, %xmm1
947; AVX512-NEXT:    vpblendvb %xmm1, %xmm3, %xmm0, %xmm0
948; AVX512-NEXT:    vpsrlw $8, %xmm0, %xmm0
949; AVX512-NEXT:    vpackuswb %xmm2, %xmm0, %xmm0
950; AVX512-NEXT:    retq
951;
952; X32-SSE-LABEL: splatvar_shift_v16i8:
953; X32-SSE:       # BB#0:
954; X32-SSE-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
955; X32-SSE-NEXT:    pshuflw {{.*#+}} xmm1 = xmm1[0,0,0,0,4,5,6,7]
956; X32-SSE-NEXT:    pshufd {{.*#+}} xmm3 = xmm1[0,0,1,1]
957; X32-SSE-NEXT:    punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15]
958; X32-SSE-NEXT:    psllw $5, %xmm3
959; X32-SSE-NEXT:    punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm3[8],xmm4[9],xmm3[9],xmm4[10],xmm3[10],xmm4[11],xmm3[11],xmm4[12],xmm3[12],xmm4[13],xmm3[13],xmm4[14],xmm3[14],xmm4[15],xmm3[15]
960; X32-SSE-NEXT:    pxor %xmm2, %xmm2
961; X32-SSE-NEXT:    pxor %xmm5, %xmm5
962; X32-SSE-NEXT:    pcmpgtw %xmm4, %xmm5
963; X32-SSE-NEXT:    movdqa %xmm5, %xmm6
964; X32-SSE-NEXT:    pandn %xmm1, %xmm6
965; X32-SSE-NEXT:    psraw $4, %xmm1
966; X32-SSE-NEXT:    pand %xmm5, %xmm1
967; X32-SSE-NEXT:    por %xmm6, %xmm1
968; X32-SSE-NEXT:    paddw %xmm4, %xmm4
969; X32-SSE-NEXT:    pxor %xmm5, %xmm5
970; X32-SSE-NEXT:    pcmpgtw %xmm4, %xmm5
971; X32-SSE-NEXT:    movdqa %xmm5, %xmm6
972; X32-SSE-NEXT:    pandn %xmm1, %xmm6
973; X32-SSE-NEXT:    psraw $2, %xmm1
974; X32-SSE-NEXT:    pand %xmm5, %xmm1
975; X32-SSE-NEXT:    por %xmm6, %xmm1
976; X32-SSE-NEXT:    paddw %xmm4, %xmm4
977; X32-SSE-NEXT:    pxor %xmm5, %xmm5
978; X32-SSE-NEXT:    pcmpgtw %xmm4, %xmm5
979; X32-SSE-NEXT:    movdqa %xmm5, %xmm4
980; X32-SSE-NEXT:    pandn %xmm1, %xmm4
981; X32-SSE-NEXT:    psraw $1, %xmm1
982; X32-SSE-NEXT:    pand %xmm5, %xmm1
983; X32-SSE-NEXT:    por %xmm4, %xmm1
984; X32-SSE-NEXT:    psrlw $8, %xmm1
985; X32-SSE-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
986; X32-SSE-NEXT:    punpcklbw {{.*#+}} xmm3 = xmm3[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
987; X32-SSE-NEXT:    pxor %xmm4, %xmm4
988; X32-SSE-NEXT:    pcmpgtw %xmm3, %xmm4
989; X32-SSE-NEXT:    movdqa %xmm4, %xmm5
990; X32-SSE-NEXT:    pandn %xmm0, %xmm5
991; X32-SSE-NEXT:    psraw $4, %xmm0
992; X32-SSE-NEXT:    pand %xmm4, %xmm0
993; X32-SSE-NEXT:    por %xmm5, %xmm0
994; X32-SSE-NEXT:    paddw %xmm3, %xmm3
995; X32-SSE-NEXT:    pxor %xmm4, %xmm4
996; X32-SSE-NEXT:    pcmpgtw %xmm3, %xmm4
997; X32-SSE-NEXT:    movdqa %xmm4, %xmm5
998; X32-SSE-NEXT:    pandn %xmm0, %xmm5
999; X32-SSE-NEXT:    psraw $2, %xmm0
1000; X32-SSE-NEXT:    pand %xmm4, %xmm0
1001; X32-SSE-NEXT:    por %xmm5, %xmm0
1002; X32-SSE-NEXT:    paddw %xmm3, %xmm3
1003; X32-SSE-NEXT:    pcmpgtw %xmm3, %xmm2
1004; X32-SSE-NEXT:    movdqa %xmm2, %xmm3
1005; X32-SSE-NEXT:    pandn %xmm0, %xmm3
1006; X32-SSE-NEXT:    psraw $1, %xmm0
1007; X32-SSE-NEXT:    pand %xmm2, %xmm0
1008; X32-SSE-NEXT:    por %xmm3, %xmm0
1009; X32-SSE-NEXT:    psrlw $8, %xmm0
1010; X32-SSE-NEXT:    packuswb %xmm1, %xmm0
1011; X32-SSE-NEXT:    retl
; Broadcast element 0 of %b to all lanes, then shift each lane of %a by it.
1012  %splat = shufflevector <16 x i8> %b, <16 x i8> undef, <16 x i32> zeroinitializer
1013  %shift = ashr <16 x i8> %a, %splat
1014  ret <16 x i8> %shift
1015}
1016
1017;
1018; Constant Shifts
1019;
1020
; Arithmetic shift right of a <2 x i64> by the per-lane constants <1, 7>.
;
; The assertions below are autogenerated (see the note on the first line of
; this file); regenerate them with that script instead of editing by hand.
;
; Expected code, per run configuration:
;  - sse2, sse4.1, avx1, avx2 and avx512 runs: shift logically (psrlq $1 /
;    psrlq $7 merged with movsd or pblendw, or one vpsrlvq with a
;    constant-pool operand), then xor with and subtract the constant
;    [4611686018427387904,72057594037927936], i.e. [2^62, 2^56], which is the
;    i64 sign bit shifted right by 1 and by 7 respectively.
;  - xop runs: negate the constant shift amounts (0 - constant via vpsubq) and
;    issue a single vpshaq.
;  - 32-bit sse2 run: same xor/sub scheme, but the mask is produced at run
;    time by loading [0,2147483648,0,2147483648] and applying the same two
;    psrlq shifts to it.
1021define <2 x i64> @constant_shift_v2i64(<2 x i64> %a) nounwind {
1022; SSE2-LABEL: constant_shift_v2i64:
1023; SSE2:       # BB#0:
1024; SSE2-NEXT:    movdqa %xmm0, %xmm1
1025; SSE2-NEXT:    psrlq $7, %xmm1
1026; SSE2-NEXT:    psrlq $1, %xmm0
1027; SSE2-NEXT:    movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1]
1028; SSE2-NEXT:    movapd {{.*#+}} xmm0 = [4611686018427387904,72057594037927936]
1029; SSE2-NEXT:    xorpd %xmm0, %xmm1
1030; SSE2-NEXT:    psubq %xmm0, %xmm1
1031; SSE2-NEXT:    movdqa %xmm1, %xmm0
1032; SSE2-NEXT:    retq
1033;
1034; SSE41-LABEL: constant_shift_v2i64:
1035; SSE41:       # BB#0:
1036; SSE41-NEXT:    movdqa %xmm0, %xmm1
1037; SSE41-NEXT:    psrlq $7, %xmm1
1038; SSE41-NEXT:    psrlq $1, %xmm0
1039; SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5,6,7]
1040; SSE41-NEXT:    movdqa {{.*#+}} xmm1 = [4611686018427387904,72057594037927936]
1041; SSE41-NEXT:    pxor %xmm1, %xmm0
1042; SSE41-NEXT:    psubq %xmm1, %xmm0
1043; SSE41-NEXT:    retq
1044;
1045; AVX1-LABEL: constant_shift_v2i64:
1046; AVX1:       # BB#0:
1047; AVX1-NEXT:    vpsrlq $7, %xmm0, %xmm1
1048; AVX1-NEXT:    vpsrlq $1, %xmm0, %xmm0
1049; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5,6,7]
1050; AVX1-NEXT:    vmovdqa {{.*#+}} xmm1 = [4611686018427387904,72057594037927936]
1051; AVX1-NEXT:    vpxor %xmm1, %xmm0, %xmm0
1052; AVX1-NEXT:    vpsubq %xmm1, %xmm0, %xmm0
1053; AVX1-NEXT:    retq
1054;
1055; AVX2-LABEL: constant_shift_v2i64:
1056; AVX2:       # BB#0:
1057; AVX2-NEXT:    vpsrlvq {{.*}}(%rip), %xmm0, %xmm0
1058; AVX2-NEXT:    vmovdqa {{.*#+}} xmm1 = [4611686018427387904,72057594037927936]
1059; AVX2-NEXT:    vpxor %xmm1, %xmm0, %xmm0
1060; AVX2-NEXT:    vpsubq %xmm1, %xmm0, %xmm0
1061; AVX2-NEXT:    retq
1062;
1063; XOP-LABEL: constant_shift_v2i64:
1064; XOP:       # BB#0:
1065; XOP-NEXT:    vpxor %xmm1, %xmm1, %xmm1
1066; XOP-NEXT:    vpsubq {{.*}}(%rip), %xmm1, %xmm1
1067; XOP-NEXT:    vpshaq %xmm1, %xmm0, %xmm0
1068; XOP-NEXT:    retq
1069;
1070; AVX512-LABEL: constant_shift_v2i64:
1071; AVX512:       ## BB#0:
1072; AVX512-NEXT:    vpsrlvq {{.*}}(%rip), %xmm0, %xmm0
1073; AVX512-NEXT:    vmovdqa {{.*#+}} xmm1 = [4611686018427387904,72057594037927936]
1074; AVX512-NEXT:    vpxor %xmm1, %xmm0, %xmm0
1075; AVX512-NEXT:    vpsubq %xmm1, %xmm0, %xmm0
1076; AVX512-NEXT:    retq
1077;
1078; X32-SSE-LABEL: constant_shift_v2i64:
1079; X32-SSE:       # BB#0:
1080; X32-SSE-NEXT:    movdqa {{.*#+}} xmm1 = [0,2147483648,0,2147483648]
1081; X32-SSE-NEXT:    movdqa %xmm1, %xmm2
1082; X32-SSE-NEXT:    psrlq $7, %xmm2
1083; X32-SSE-NEXT:    psrlq $1, %xmm1
1084; X32-SSE-NEXT:    movsd {{.*#+}} xmm2 = xmm1[0],xmm2[1]
1085; X32-SSE-NEXT:    movdqa %xmm0, %xmm1
1086; X32-SSE-NEXT:    psrlq $7, %xmm1
1087; X32-SSE-NEXT:    psrlq $1, %xmm0
1088; X32-SSE-NEXT:    movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1]
1089; X32-SSE-NEXT:    xorpd %xmm2, %xmm1
1090; X32-SSE-NEXT:    psubq %xmm2, %xmm1
1091; X32-SSE-NEXT:    movdqa %xmm1, %xmm0
1092; X32-SSE-NEXT:    retl
; Lane 0 is shifted by 1, lane 1 by 7.
1093  %shift = ashr <2 x i64> %a, <i64 1, i64 7>
1094  ret <2 x i64> %shift
1095}
1096
; Arithmetic shift right of a <4 x i32> by the per-lane constants <4, 5, 6, 7>.
;
; The assertions below are autogenerated (see the note on the first line of
; this file); regenerate them with that script instead of editing by hand.
;
; Expected code, per run configuration:
;  - sse2 (64-bit and 32-bit), sse4.1 and avx1 runs: four whole-vector
;    immediate shifts (psrad $4/$5/$6/$7) whose results are merged lane by
;    lane with movsd/pshufd/punpckldq or with pblendw.
;  - avx2, xop+avx2 and avx512 runs: one vpsravd with a constant-pool operand.
;  - xop+avx1 run: one vpshad with a constant-pool operand.
1097define <4 x i32> @constant_shift_v4i32(<4 x i32> %a) nounwind {
1098; SSE2-LABEL: constant_shift_v4i32:
1099; SSE2:       # BB#0:
1100; SSE2-NEXT:    movdqa %xmm0, %xmm1
1101; SSE2-NEXT:    psrad $7, %xmm1
1102; SSE2-NEXT:    movdqa %xmm0, %xmm2
1103; SSE2-NEXT:    psrad $5, %xmm2
1104; SSE2-NEXT:    movsd {{.*#+}} xmm1 = xmm2[0],xmm1[1]
1105; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,3,2,3]
1106; SSE2-NEXT:    movdqa %xmm0, %xmm2
1107; SSE2-NEXT:    psrad $6, %xmm2
1108; SSE2-NEXT:    psrad $4, %xmm0
1109; SSE2-NEXT:    movsd {{.*#+}} xmm2 = xmm0[0],xmm2[1]
1110; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3]
1111; SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
1112; SSE2-NEXT:    retq
1113;
1114; SSE41-LABEL: constant_shift_v4i32:
1115; SSE41:       # BB#0:
1116; SSE41-NEXT:    movdqa %xmm0, %xmm1
1117; SSE41-NEXT:    psrad $7, %xmm1
1118; SSE41-NEXT:    movdqa %xmm0, %xmm2
1119; SSE41-NEXT:    psrad $5, %xmm2
1120; SSE41-NEXT:    pblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm1[4,5,6,7]
1121; SSE41-NEXT:    movdqa %xmm0, %xmm1
1122; SSE41-NEXT:    psrad $6, %xmm1
1123; SSE41-NEXT:    psrad $4, %xmm0
1124; SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5,6,7]
1125; SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7]
1126; SSE41-NEXT:    retq
1127;
1128; AVX1-LABEL: constant_shift_v4i32:
1129; AVX1:       # BB#0:
1130; AVX1-NEXT:    vpsrad $7, %xmm0, %xmm1
1131; AVX1-NEXT:    vpsrad $5, %xmm0, %xmm2
1132; AVX1-NEXT:    vpblendw {{.*#+}} xmm1 = xmm2[0,1,2,3],xmm1[4,5,6,7]
1133; AVX1-NEXT:    vpsrad $6, %xmm0, %xmm2
1134; AVX1-NEXT:    vpsrad $4, %xmm0, %xmm0
1135; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm2[4,5,6,7]
1136; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7]
1137; AVX1-NEXT:    retq
1138;
1139; AVX2-LABEL: constant_shift_v4i32:
1140; AVX2:       # BB#0:
1141; AVX2-NEXT:    vpsravd {{.*}}(%rip), %xmm0, %xmm0
1142; AVX2-NEXT:    retq
1143;
1144; XOPAVX1-LABEL: constant_shift_v4i32:
1145; XOPAVX1:       # BB#0:
1146; XOPAVX1-NEXT:    vpshad {{.*}}(%rip), %xmm0, %xmm0
1147; XOPAVX1-NEXT:    retq
1148;
1149; XOPAVX2-LABEL: constant_shift_v4i32:
1150; XOPAVX2:       # BB#0:
1151; XOPAVX2-NEXT:    vpsravd {{.*}}(%rip), %xmm0, %xmm0
1152; XOPAVX2-NEXT:    retq
1153;
1154; AVX512-LABEL: constant_shift_v4i32:
1155; AVX512:       ## BB#0:
1156; AVX512-NEXT:    vpsravd {{.*}}(%rip), %xmm0, %xmm0
1157; AVX512-NEXT:    retq
1158;
1159; X32-SSE-LABEL: constant_shift_v4i32:
1160; X32-SSE:       # BB#0:
1161; X32-SSE-NEXT:    movdqa %xmm0, %xmm1
1162; X32-SSE-NEXT:    psrad $7, %xmm1
1163; X32-SSE-NEXT:    movdqa %xmm0, %xmm2
1164; X32-SSE-NEXT:    psrad $5, %xmm2
1165; X32-SSE-NEXT:    movsd {{.*#+}} xmm1 = xmm2[0],xmm1[1]
1166; X32-SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,3,2,3]
1167; X32-SSE-NEXT:    movdqa %xmm0, %xmm2
1168; X32-SSE-NEXT:    psrad $6, %xmm2
1169; X32-SSE-NEXT:    psrad $4, %xmm0
1170; X32-SSE-NEXT:    movsd {{.*#+}} xmm2 = xmm0[0],xmm2[1]
1171; X32-SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3]
1172; X32-SSE-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
1173; X32-SSE-NEXT:    retl
; Lane i is shifted by 4 + i.
1174  %shift = ashr <4 x i32> %a, <i32 4, i32 5, i32 6, i32 7>
1175  ret <4 x i32> %shift
1176}
1177
; Arithmetic shift right of a <8 x i16> by the per-lane constants
; <0, 1, 2, 3, 4, 5, 6, 7>.
;
; The assertions below are autogenerated (see the note on the first line of
; this file); regenerate them with that script instead of editing by hand.
;
; Expected code, per run configuration:
;  - sse2 (64-bit and 32-bit), sse4.1 and avx1 runs: three stages that shift
;    the whole vector by 4, then 2, then 1 (psraw) and keep the shifted value
;    only in the lanes whose constant has that bit set: lanes 4-7 for the
;    shift by 4, lanes 2,3,6,7 for the shift by 2, odd lanes for the shift by
;    1. The merges use pblendw where available, and movsd/pshufd/punpckldq
;    plus a pand/pandn/por mask otherwise.
;  - avx2 run: sign-extend to eight i32 lanes (vpmovsxwd), vpsravd with a
;    constant-pool operand, then vpshufb + vpermq to narrow back to i16 lanes,
;    followed by vzeroupper.
;  - xop runs: negate the constant shift amounts (0 - constant via vpsubw) and
;    issue a single vpshaw.
;  - avx512 run: vpsravw on the zmm registers with the amounts
;    [0,1,2,3,4,5,6,7] loaded into xmm1.
1178define <8 x i16> @constant_shift_v8i16(<8 x i16> %a) nounwind {
1179; SSE2-LABEL: constant_shift_v8i16:
1180; SSE2:       # BB#0:
1181; SSE2-NEXT:    movdqa %xmm0, %xmm1
1182; SSE2-NEXT:    psraw $4, %xmm1
1183; SSE2-NEXT:    movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1]
1184; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[0,2,2,3]
1185; SSE2-NEXT:    psraw $2, %xmm1
1186; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,3,2,3]
1187; SSE2-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1]
1188; SSE2-NEXT:    movdqa {{.*#+}} xmm0 = [65535,0,65535,0,65535,0,65535,0]
1189; SSE2-NEXT:    movdqa %xmm2, %xmm1
1190; SSE2-NEXT:    pand %xmm0, %xmm1
1191; SSE2-NEXT:    psraw $1, %xmm2
1192; SSE2-NEXT:    pandn %xmm2, %xmm0
1193; SSE2-NEXT:    por %xmm1, %xmm0
1194; SSE2-NEXT:    retq
1195;
1196; SSE41-LABEL: constant_shift_v8i16:
1197; SSE41:       # BB#0:
1198; SSE41-NEXT:    movdqa %xmm0, %xmm1
1199; SSE41-NEXT:    psraw $4, %xmm1
1200; SSE41-NEXT:    pblendw {{.*#+}} xmm1 = xmm0[0,1,2,3],xmm1[4,5,6,7]
1201; SSE41-NEXT:    movdqa %xmm1, %xmm2
1202; SSE41-NEXT:    psraw $2, %xmm2
1203; SSE41-NEXT:    pblendw {{.*#+}} xmm2 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7]
1204; SSE41-NEXT:    movdqa %xmm2, %xmm0
1205; SSE41-NEXT:    psraw $1, %xmm0
1206; SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm2[0],xmm0[1],xmm2[2],xmm0[3],xmm2[4],xmm0[5],xmm2[6],xmm0[7]
1207; SSE41-NEXT:    retq
1208;
1209; AVX1-LABEL: constant_shift_v8i16:
1210; AVX1:       # BB#0:
1211; AVX1-NEXT:    vpsraw $4, %xmm0, %xmm1
1212; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5,6,7]
1213; AVX1-NEXT:    vpsraw $2, %xmm0, %xmm1
1214; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7]
1215; AVX1-NEXT:    vpsraw $1, %xmm0, %xmm1
1216; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3],xmm0[4],xmm1[5],xmm0[6],xmm1[7]
1217; AVX1-NEXT:    retq
1218;
1219; AVX2-LABEL: constant_shift_v8i16:
1220; AVX2:       # BB#0:
1221; AVX2-NEXT:    vpmovsxwd %xmm0, %ymm0
1222; AVX2-NEXT:    vpsravd {{.*}}(%rip), %ymm0, %ymm0
1223; AVX2-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero,ymm0[16,17,20,21,24,25,28,29],zero,zero,zero,zero,zero,zero,zero,zero
1224; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
1225; AVX2-NEXT:    # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
1226; AVX2-NEXT:    vzeroupper
1227; AVX2-NEXT:    retq
1228;
1229; XOP-LABEL: constant_shift_v8i16:
1230; XOP:       # BB#0:
1231; XOP-NEXT:    vpxor %xmm1, %xmm1, %xmm1
1232; XOP-NEXT:    vpsubw {{.*}}(%rip), %xmm1, %xmm1
1233; XOP-NEXT:    vpshaw %xmm1, %xmm0, %xmm0
1234; XOP-NEXT:    retq
1235;
1236; AVX512-LABEL: constant_shift_v8i16:
1237; AVX512:       ## BB#0:
1238; AVX512-NEXT:    ## kill: %XMM0<def> %XMM0<kill> %ZMM0<def>
1239; AVX512-NEXT:    vmovdqa {{.*#+}} xmm1 = [0,1,2,3,4,5,6,7]
1240; AVX512-NEXT:    vpsravw %zmm1, %zmm0, %zmm0
1241; AVX512-NEXT:    ## kill: %XMM0<def> %XMM0<kill> %ZMM0<kill>
1242; AVX512-NEXT:    retq
1243;
1244; X32-SSE-LABEL: constant_shift_v8i16:
1245; X32-SSE:       # BB#0:
1246; X32-SSE-NEXT:    movdqa %xmm0, %xmm1
1247; X32-SSE-NEXT:    psraw $4, %xmm1
1248; X32-SSE-NEXT:    movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1]
1249; X32-SSE-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[0,2,2,3]
1250; X32-SSE-NEXT:    psraw $2, %xmm1
1251; X32-SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,3,2,3]
1252; X32-SSE-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1]
1253; X32-SSE-NEXT:    movdqa {{.*#+}} xmm0 = [65535,0,65535,0,65535,0,65535,0]
1254; X32-SSE-NEXT:    movdqa %xmm2, %xmm1
1255; X32-SSE-NEXT:    pand %xmm0, %xmm1
1256; X32-SSE-NEXT:    psraw $1, %xmm2
1257; X32-SSE-NEXT:    pandn %xmm2, %xmm0
1258; X32-SSE-NEXT:    por %xmm1, %xmm0
1259; X32-SSE-NEXT:    retl
; Lane i is shifted by i, so lane 0 is left unchanged.
1260  %shift = ashr <8 x i16> %a, <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7>
1261  ret <8 x i16> %shift
1262}
1263
1264define <16 x i8> @constant_shift_v16i8(<16 x i8> %a) nounwind {
1265; SSE2-LABEL: constant_shift_v16i8:
1266; SSE2:       # BB#0:
1267; SSE2-NEXT:    punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15]
1268; SSE2-NEXT:    movdqa {{.*#+}} xmm3 = [0,1,2,3,4,5,6,7,7,6,5,4,3,2,1,0]
1269; SSE2-NEXT:    psllw $5, %xmm3
1270; SSE2-NEXT:    punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm3[8],xmm4[9],xmm3[9],xmm4[10],xmm3[10],xmm4[11],xmm3[11],xmm4[12],xmm3[12],xmm4[13],xmm3[13],xmm4[14],xmm3[14],xmm4[15],xmm3[15]
1271; SSE2-NEXT:    pxor %xmm2, %xmm2
1272; SSE2-NEXT:    pxor %xmm5, %xmm5
1273; SSE2-NEXT:    pcmpgtw %xmm4, %xmm5
1274; SSE2-NEXT:    movdqa %xmm5, %xmm6
1275; SSE2-NEXT:    pandn %xmm1, %xmm6
1276; SSE2-NEXT:    psraw $4, %xmm1
1277; SSE2-NEXT:    pand %xmm5, %xmm1
1278; SSE2-NEXT:    por %xmm6, %xmm1
1279; SSE2-NEXT:    paddw %xmm4, %xmm4
1280; SSE2-NEXT:    pxor %xmm5, %xmm5
1281; SSE2-NEXT:    pcmpgtw %xmm4, %xmm5
1282; SSE2-NEXT:    movdqa %xmm5, %xmm6
1283; SSE2-NEXT:    pandn %xmm1, %xmm6
1284; SSE2-NEXT:    psraw $2, %xmm1
1285; SSE2-NEXT:    pand %xmm5, %xmm1
1286; SSE2-NEXT:    por %xmm6, %xmm1
1287; SSE2-NEXT:    paddw %xmm4, %xmm4
1288; SSE2-NEXT:    pxor %xmm5, %xmm5
1289; SSE2-NEXT:    pcmpgtw %xmm4, %xmm5
1290; SSE2-NEXT:    movdqa %xmm5, %xmm4
1291; SSE2-NEXT:    pandn %xmm1, %xmm4
1292; SSE2-NEXT:    psraw $1, %xmm1
1293; SSE2-NEXT:    pand %xmm5, %xmm1
1294; SSE2-NEXT:    por %xmm4, %xmm1
1295; SSE2-NEXT:    psrlw $8, %xmm1
1296; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
1297; SSE2-NEXT:    punpcklbw {{.*#+}} xmm3 = xmm3[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
1298; SSE2-NEXT:    pxor %xmm4, %xmm4
1299; SSE2-NEXT:    pcmpgtw %xmm3, %xmm4
1300; SSE2-NEXT:    movdqa %xmm4, %xmm5
1301; SSE2-NEXT:    pandn %xmm0, %xmm5
1302; SSE2-NEXT:    psraw $4, %xmm0
1303; SSE2-NEXT:    pand %xmm4, %xmm0
1304; SSE2-NEXT:    por %xmm5, %xmm0
1305; SSE2-NEXT:    paddw %xmm3, %xmm3
1306; SSE2-NEXT:    pxor %xmm4, %xmm4
1307; SSE2-NEXT:    pcmpgtw %xmm3, %xmm4
1308; SSE2-NEXT:    movdqa %xmm4, %xmm5
1309; SSE2-NEXT:    pandn %xmm0, %xmm5
1310; SSE2-NEXT:    psraw $2, %xmm0
1311; SSE2-NEXT:    pand %xmm4, %xmm0
1312; SSE2-NEXT:    por %xmm5, %xmm0
1313; SSE2-NEXT:    paddw %xmm3, %xmm3
1314; SSE2-NEXT:    pcmpgtw %xmm3, %xmm2
1315; SSE2-NEXT:    movdqa %xmm2, %xmm3
1316; SSE2-NEXT:    pandn %xmm0, %xmm3
1317; SSE2-NEXT:    psraw $1, %xmm0
1318; SSE2-NEXT:    pand %xmm2, %xmm0
1319; SSE2-NEXT:    por %xmm3, %xmm0
1320; SSE2-NEXT:    psrlw $8, %xmm0
1321; SSE2-NEXT:    packuswb %xmm1, %xmm0
1322; SSE2-NEXT:    retq
1323;
1324; SSE41-LABEL: constant_shift_v16i8:
1325; SSE41:       # BB#0:
1326; SSE41-NEXT:    movdqa %xmm0, %xmm1
1327; SSE41-NEXT:    movdqa {{.*#+}} xmm3 = [0,1,2,3,4,5,6,7,7,6,5,4,3,2,1,0]
1328; SSE41-NEXT:    psllw $5, %xmm3
1329; SSE41-NEXT:    punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm3[8],xmm0[9],xmm3[9],xmm0[10],xmm3[10],xmm0[11],xmm3[11],xmm0[12],xmm3[12],xmm0[13],xmm3[13],xmm0[14],xmm3[14],xmm0[15],xmm3[15]
1330; SSE41-NEXT:    punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15]
1331; SSE41-NEXT:    movdqa %xmm2, %xmm4
1332; SSE41-NEXT:    psraw $4, %xmm4
1333; SSE41-NEXT:    pblendvb %xmm4, %xmm2
1334; SSE41-NEXT:    movdqa %xmm2, %xmm4
1335; SSE41-NEXT:    psraw $2, %xmm4
1336; SSE41-NEXT:    paddw %xmm0, %xmm0
1337; SSE41-NEXT:    pblendvb %xmm4, %xmm2
1338; SSE41-NEXT:    movdqa %xmm2, %xmm4
1339; SSE41-NEXT:    psraw $1, %xmm4
1340; SSE41-NEXT:    paddw %xmm0, %xmm0
1341; SSE41-NEXT:    pblendvb %xmm4, %xmm2
1342; SSE41-NEXT:    psrlw $8, %xmm2
1343; SSE41-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3],xmm0[4],xmm3[4],xmm0[5],xmm3[5],xmm0[6],xmm3[6],xmm0[7],xmm3[7]
1344; SSE41-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
1345; SSE41-NEXT:    movdqa %xmm1, %xmm3
1346; SSE41-NEXT:    psraw $4, %xmm3
1347; SSE41-NEXT:    pblendvb %xmm3, %xmm1
1348; SSE41-NEXT:    movdqa %xmm1, %xmm3
1349; SSE41-NEXT:    psraw $2, %xmm3
1350; SSE41-NEXT:    paddw %xmm0, %xmm0
1351; SSE41-NEXT:    pblendvb %xmm3, %xmm1
1352; SSE41-NEXT:    movdqa %xmm1, %xmm3
1353; SSE41-NEXT:    psraw $1, %xmm3
1354; SSE41-NEXT:    paddw %xmm0, %xmm0
1355; SSE41-NEXT:    pblendvb %xmm3, %xmm1
1356; SSE41-NEXT:    psrlw $8, %xmm1
1357; SSE41-NEXT:    packuswb %xmm2, %xmm1
1358; SSE41-NEXT:    movdqa %xmm1, %xmm0
1359; SSE41-NEXT:    retq
1360;
1361; AVX-LABEL: constant_shift_v16i8:
1362; AVX:       # BB#0:
1363; AVX-NEXT:    vmovdqa {{.*#+}} xmm1 = [0,1,2,3,4,5,6,7,7,6,5,4,3,2,1,0]
1364; AVX-NEXT:    vpsllw $5, %xmm1, %xmm1
1365; AVX-NEXT:    vpunpckhbw {{.*#+}} xmm2 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15]
1366; AVX-NEXT:    vpunpckhbw {{.*#+}} xmm3 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
1367; AVX-NEXT:    vpsraw $4, %xmm3, %xmm4
1368; AVX-NEXT:    vpblendvb %xmm2, %xmm4, %xmm3, %xmm3
1369; AVX-NEXT:    vpsraw $2, %xmm3, %xmm4
1370; AVX-NEXT:    vpaddw %xmm2, %xmm2, %xmm2
1371; AVX-NEXT:    vpblendvb %xmm2, %xmm4, %xmm3, %xmm3
1372; AVX-NEXT:    vpsraw $1, %xmm3, %xmm4
1373; AVX-NEXT:    vpaddw %xmm2, %xmm2, %xmm2
1374; AVX-NEXT:    vpblendvb %xmm2, %xmm4, %xmm3, %xmm2
1375; AVX-NEXT:    vpsrlw $8, %xmm2, %xmm2
1376; AVX-NEXT:    vpunpcklbw {{.*#+}} xmm1 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
1377; AVX-NEXT:    vpunpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
1378; AVX-NEXT:    vpsraw $4, %xmm0, %xmm3
1379; AVX-NEXT:    vpblendvb %xmm1, %xmm3, %xmm0, %xmm0
1380; AVX-NEXT:    vpsraw $2, %xmm0, %xmm3
1381; AVX-NEXT:    vpaddw %xmm1, %xmm1, %xmm1
1382; AVX-NEXT:    vpblendvb %xmm1, %xmm3, %xmm0, %xmm0
1383; AVX-NEXT:    vpsraw $1, %xmm0, %xmm3
1384; AVX-NEXT:    vpaddw %xmm1, %xmm1, %xmm1
1385; AVX-NEXT:    vpblendvb %xmm1, %xmm3, %xmm0, %xmm0
1386; AVX-NEXT:    vpsrlw $8, %xmm0, %xmm0
1387; AVX-NEXT:    vpackuswb %xmm2, %xmm0, %xmm0
1388; AVX-NEXT:    retq
1389;
1390; XOP-LABEL: constant_shift_v16i8:
1391; XOP:       # BB#0:
1392; XOP-NEXT:    vpxor %xmm1, %xmm1, %xmm1
1393; XOP-NEXT:    vpsubb {{.*}}(%rip), %xmm1, %xmm1
1394; XOP-NEXT:    vpshab %xmm1, %xmm0, %xmm0
1395; XOP-NEXT:    retq
1396;
1397; AVX512-LABEL: constant_shift_v16i8:
1398; AVX512:       ## BB#0:
1399; AVX512-NEXT:    vmovdqa {{.*#+}} xmm1 = [0,1,2,3,4,5,6,7,7,6,5,4,3,2,1,0]
1400; AVX512-NEXT:    vpsllw $5, %xmm1, %xmm1
1401; AVX512-NEXT:    vpunpckhbw {{.*#+}} xmm2 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15]
1402; AVX512-NEXT:    vpunpckhbw {{.*#+}} xmm3 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
1403; AVX512-NEXT:    vpsraw $4, %xmm3, %xmm4
1404; AVX512-NEXT:    vpblendvb %xmm2, %xmm4, %xmm3, %xmm3
1405; AVX512-NEXT:    vpsraw $2, %xmm3, %xmm4
1406; AVX512-NEXT:    vpaddw %xmm2, %xmm2, %xmm2
1407; AVX512-NEXT:    vpblendvb %xmm2, %xmm4, %xmm3, %xmm3
1408; AVX512-NEXT:    vpsraw $1, %xmm3, %xmm4
1409; AVX512-NEXT:    vpaddw %xmm2, %xmm2, %xmm2
1410; AVX512-NEXT:    vpblendvb %xmm2, %xmm4, %xmm3, %xmm2
1411; AVX512-NEXT:    vpsrlw $8, %xmm2, %xmm2
1412; AVX512-NEXT:    vpunpcklbw {{.*#+}} xmm1 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
1413; AVX512-NEXT:    vpunpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
1414; AVX512-NEXT:    vpsraw $4, %xmm0, %xmm3
1415; AVX512-NEXT:    vpblendvb %xmm1, %xmm3, %xmm0, %xmm0
1416; AVX512-NEXT:    vpsraw $2, %xmm0, %xmm3
1417; AVX512-NEXT:    vpaddw %xmm1, %xmm1, %xmm1
1418; AVX512-NEXT:    vpblendvb %xmm1, %xmm3, %xmm0, %xmm0
1419; AVX512-NEXT:    vpsraw $1, %xmm0, %xmm3
1420; AVX512-NEXT:    vpaddw %xmm1, %xmm1, %xmm1
1421; AVX512-NEXT:    vpblendvb %xmm1, %xmm3, %xmm0, %xmm0
1422; AVX512-NEXT:    vpsrlw $8, %xmm0, %xmm0
1423; AVX512-NEXT:    vpackuswb %xmm2, %xmm0, %xmm0
1424; AVX512-NEXT:    retq
1425;
1426; X32-SSE-LABEL: constant_shift_v16i8:
1427; X32-SSE:       # BB#0:
1428; X32-SSE-NEXT:    punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15]
1429; X32-SSE-NEXT:    movdqa {{.*#+}} xmm3 = [0,1,2,3,4,5,6,7,7,6,5,4,3,2,1,0]
1430; X32-SSE-NEXT:    psllw $5, %xmm3
1431; X32-SSE-NEXT:    punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm3[8],xmm4[9],xmm3[9],xmm4[10],xmm3[10],xmm4[11],xmm3[11],xmm4[12],xmm3[12],xmm4[13],xmm3[13],xmm4[14],xmm3[14],xmm4[15],xmm3[15]
1432; X32-SSE-NEXT:    pxor %xmm2, %xmm2
1433; X32-SSE-NEXT:    pxor %xmm5, %xmm5
1434; X32-SSE-NEXT:    pcmpgtw %xmm4, %xmm5
1435; X32-SSE-NEXT:    movdqa %xmm5, %xmm6
1436; X32-SSE-NEXT:    pandn %xmm1, %xmm6
1437; X32-SSE-NEXT:    psraw $4, %xmm1
1438; X32-SSE-NEXT:    pand %xmm5, %xmm1
1439; X32-SSE-NEXT:    por %xmm6, %xmm1
1440; X32-SSE-NEXT:    paddw %xmm4, %xmm4
1441; X32-SSE-NEXT:    pxor %xmm5, %xmm5
1442; X32-SSE-NEXT:    pcmpgtw %xmm4, %xmm5
1443; X32-SSE-NEXT:    movdqa %xmm5, %xmm6
1444; X32-SSE-NEXT:    pandn %xmm1, %xmm6
1445; X32-SSE-NEXT:    psraw $2, %xmm1
1446; X32-SSE-NEXT:    pand %xmm5, %xmm1
1447; X32-SSE-NEXT:    por %xmm6, %xmm1
1448; X32-SSE-NEXT:    paddw %xmm4, %xmm4
1449; X32-SSE-NEXT:    pxor %xmm5, %xmm5
1450; X32-SSE-NEXT:    pcmpgtw %xmm4, %xmm5
1451; X32-SSE-NEXT:    movdqa %xmm5, %xmm4
1452; X32-SSE-NEXT:    pandn %xmm1, %xmm4
1453; X32-SSE-NEXT:    psraw $1, %xmm1
1454; X32-SSE-NEXT:    pand %xmm5, %xmm1
1455; X32-SSE-NEXT:    por %xmm4, %xmm1
1456; X32-SSE-NEXT:    psrlw $8, %xmm1
1457; X32-SSE-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
1458; X32-SSE-NEXT:    punpcklbw {{.*#+}} xmm3 = xmm3[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
1459; X32-SSE-NEXT:    pxor %xmm4, %xmm4
1460; X32-SSE-NEXT:    pcmpgtw %xmm3, %xmm4
1461; X32-SSE-NEXT:    movdqa %xmm4, %xmm5
1462; X32-SSE-NEXT:    pandn %xmm0, %xmm5
1463; X32-SSE-NEXT:    psraw $4, %xmm0
1464; X32-SSE-NEXT:    pand %xmm4, %xmm0
1465; X32-SSE-NEXT:    por %xmm5, %xmm0
1466; X32-SSE-NEXT:    paddw %xmm3, %xmm3
1467; X32-SSE-NEXT:    pxor %xmm4, %xmm4
1468; X32-SSE-NEXT:    pcmpgtw %xmm3, %xmm4
1469; X32-SSE-NEXT:    movdqa %xmm4, %xmm5
1470; X32-SSE-NEXT:    pandn %xmm0, %xmm5
1471; X32-SSE-NEXT:    psraw $2, %xmm0
1472; X32-SSE-NEXT:    pand %xmm4, %xmm0
1473; X32-SSE-NEXT:    por %xmm5, %xmm0
1474; X32-SSE-NEXT:    paddw %xmm3, %xmm3
1475; X32-SSE-NEXT:    pcmpgtw %xmm3, %xmm2
1476; X32-SSE-NEXT:    movdqa %xmm2, %xmm3
1477; X32-SSE-NEXT:    pandn %xmm0, %xmm3
1478; X32-SSE-NEXT:    psraw $1, %xmm0
1479; X32-SSE-NEXT:    pand %xmm2, %xmm0
1480; X32-SSE-NEXT:    por %xmm3, %xmm0
1481; X32-SSE-NEXT:    psrlw $8, %xmm0
1482; X32-SSE-NEXT:    packuswb %xmm1, %xmm0
1483; X32-SSE-NEXT:    retl
1484  %shift = ashr <16 x i8> %a, <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0>
1485  ret <16 x i8> %shift
1486}
1487
1488;
1489; Uniform Constant Shifts
1490;
1491
; Splat-constant arithmetic shift right: every i64 lane of %a is shifted by 7.
; The SSE2, SSE4.1, AVX1, AVX2, AVX512 and 32-bit SSE2 runs expect a 32-bit
; arithmetic shift (psrad $7) plus a 64-bit logical shift (psrlq $7); the odd
; dwords of the psrad result and the even dwords of the psrlq result are then
; recombined with pshufd/punpckldq or with a blend.
; The XOP runs expect the shift amount to be subtracted from zero
; (vpxor + vpsubq on a constant-pool operand) and passed to vpshaq.
1492define <2 x i64> @splatconstant_shift_v2i64(<2 x i64> %a) nounwind {
1493; SSE2-LABEL: splatconstant_shift_v2i64:
1494; SSE2:       # BB#0:
1495; SSE2-NEXT:    movdqa %xmm0, %xmm1
1496; SSE2-NEXT:    psrad $7, %xmm1
1497; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,3,2,3]
1498; SSE2-NEXT:    psrlq $7, %xmm0
1499; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
1500; SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
1501; SSE2-NEXT:    retq
1502;
1503; SSE41-LABEL: splatconstant_shift_v2i64:
1504; SSE41:       # BB#0:
1505; SSE41-NEXT:    movdqa %xmm0, %xmm1
1506; SSE41-NEXT:    psrad $7, %xmm1
1507; SSE41-NEXT:    psrlq $7, %xmm0
1508; SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7]
1509; SSE41-NEXT:    retq
1510;
1511; AVX1-LABEL: splatconstant_shift_v2i64:
1512; AVX1:       # BB#0:
1513; AVX1-NEXT:    vpsrad $7, %xmm0, %xmm1
1514; AVX1-NEXT:    vpsrlq $7, %xmm0, %xmm0
1515; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7]
1516; AVX1-NEXT:    retq
1517;
1518; AVX2-LABEL: splatconstant_shift_v2i64:
1519; AVX2:       # BB#0:
1520; AVX2-NEXT:    vpsrad $7, %xmm0, %xmm1
1521; AVX2-NEXT:    vpsrlq $7, %xmm0, %xmm0
1522; AVX2-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3]
1523; AVX2-NEXT:    retq
1524;
1525; XOP-LABEL: splatconstant_shift_v2i64:
1526; XOP:       # BB#0:
1527; XOP-NEXT:    vpxor %xmm1, %xmm1, %xmm1
1528; XOP-NEXT:    vpsubq {{.*}}(%rip), %xmm1, %xmm1
1529; XOP-NEXT:    vpshaq %xmm1, %xmm0, %xmm0
1530; XOP-NEXT:    retq
1531;
1532; AVX512-LABEL: splatconstant_shift_v2i64:
1533; AVX512:       ## BB#0:
1534; AVX512-NEXT:    vpsrad $7, %xmm0, %xmm1
1535; AVX512-NEXT:    vpsrlq $7, %xmm0, %xmm0
1536; AVX512-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3]
1537; AVX512-NEXT:    retq
1538;
1539; X32-SSE-LABEL: splatconstant_shift_v2i64:
1540; X32-SSE:       # BB#0:
1541; X32-SSE-NEXT:    movdqa %xmm0, %xmm1
1542; X32-SSE-NEXT:    psrad $7, %xmm1
1543; X32-SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,3,2,3]
1544; X32-SSE-NEXT:    psrlq $7, %xmm0
1545; X32-SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
1546; X32-SSE-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
1547; X32-SSE-NEXT:    retl
1548  %shift = ashr <2 x i64> %a, <i64 7, i64 7>
1549  ret <2 x i64> %shift
1550}
1551
; Splat-constant arithmetic shift right: every i32 lane of %a is shifted by 5.
; Every run line expects this to become a single immediate-count shift
; (psrad $5, or vpsrad $5 for the VEX-encoded targets) followed by the return.
1552define <4 x i32> @splatconstant_shift_v4i32(<4 x i32> %a) nounwind {
1553; SSE-LABEL: splatconstant_shift_v4i32:
1554; SSE:       # BB#0:
1555; SSE-NEXT:    psrad $5, %xmm0
1556; SSE-NEXT:    retq
1557;
1558; AVX-LABEL: splatconstant_shift_v4i32:
1559; AVX:       # BB#0:
1560; AVX-NEXT:    vpsrad $5, %xmm0, %xmm0
1561; AVX-NEXT:    retq
1562;
1563; XOP-LABEL: splatconstant_shift_v4i32:
1564; XOP:       # BB#0:
1565; XOP-NEXT:    vpsrad $5, %xmm0, %xmm0
1566; XOP-NEXT:    retq
1567;
1568; AVX512-LABEL: splatconstant_shift_v4i32:
1569; AVX512:       ## BB#0:
1570; AVX512-NEXT:    vpsrad $5, %xmm0, %xmm0
1571; AVX512-NEXT:    retq
1572;
1573; X32-SSE-LABEL: splatconstant_shift_v4i32:
1574; X32-SSE:       # BB#0:
1575; X32-SSE-NEXT:    psrad $5, %xmm0
1576; X32-SSE-NEXT:    retl
1577  %shift = ashr <4 x i32> %a, <i32 5, i32 5, i32 5, i32 5>
1578  ret <4 x i32> %shift
1579}
1580
; Splat-constant arithmetic shift right: every i16 lane of %a is shifted by 3.
; Every run line expects this to become a single immediate-count shift
; (psraw $3, or vpsraw $3 for the VEX-encoded targets) followed by the return.
1581define <8 x i16> @splatconstant_shift_v8i16(<8 x i16> %a) nounwind {
1582; SSE-LABEL: splatconstant_shift_v8i16:
1583; SSE:       # BB#0:
1584; SSE-NEXT:    psraw $3, %xmm0
1585; SSE-NEXT:    retq
1586;
1587; AVX-LABEL: splatconstant_shift_v8i16:
1588; AVX:       # BB#0:
1589; AVX-NEXT:    vpsraw $3, %xmm0, %xmm0
1590; AVX-NEXT:    retq
1591;
1592; XOP-LABEL: splatconstant_shift_v8i16:
1593; XOP:       # BB#0:
1594; XOP-NEXT:    vpsraw $3, %xmm0, %xmm0
1595; XOP-NEXT:    retq
1596;
1597; AVX512-LABEL: splatconstant_shift_v8i16:
1598; AVX512:       ## BB#0:
1599; AVX512-NEXT:    vpsraw $3, %xmm0, %xmm0
1600; AVX512-NEXT:    retq
1601;
1602; X32-SSE-LABEL: splatconstant_shift_v8i16:
1603; X32-SSE:       # BB#0:
1604; X32-SSE-NEXT:    psraw $3, %xmm0
1605; X32-SSE-NEXT:    retl
1606  %shift = ashr <8 x i16> %a, <i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3>
1607  ret <8 x i16> %shift
1608}
1609
; Splat-constant arithmetic shift right: every i8 lane of %a is shifted by 3.
; The SSE, AVX, AVX512 and 32-bit SSE runs expect a 16-bit logical shift
; (psrlw $3), an AND with a constant-pool mask, and then (x ^ 16) - 16 per
; byte via pxor/psubb against a splat of 16. 16 is 0x80 >> 3, the position of
; the original sign bit after the shift, so the xor/subtract pair sign-extends
; each byte from that bit.
; NOTE(review): the mask operand is matched by a regex only; presumably it
; clears the bits shifted in from the neighbouring byte -- confirm against the
; generated constant pool.
; The XOP runs expect the shift amount to be subtracted from zero
; (vpxor + vpsubb on a constant-pool operand) and passed to vpshab.
1610define <16 x i8> @splatconstant_shift_v16i8(<16 x i8> %a) nounwind {
1611; SSE-LABEL: splatconstant_shift_v16i8:
1612; SSE:       # BB#0:
1613; SSE-NEXT:    psrlw $3, %xmm0
1614; SSE-NEXT:    pand {{.*}}(%rip), %xmm0
1615; SSE-NEXT:    movdqa {{.*#+}} xmm1 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
1616; SSE-NEXT:    pxor %xmm1, %xmm0
1617; SSE-NEXT:    psubb %xmm1, %xmm0
1618; SSE-NEXT:    retq
1619;
1620; AVX-LABEL: splatconstant_shift_v16i8:
1621; AVX:       # BB#0:
1622; AVX-NEXT:    vpsrlw $3, %xmm0, %xmm0
1623; AVX-NEXT:    vpand {{.*}}(%rip), %xmm0, %xmm0
1624; AVX-NEXT:    vmovdqa {{.*#+}} xmm1 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
1625; AVX-NEXT:    vpxor %xmm1, %xmm0, %xmm0
1626; AVX-NEXT:    vpsubb %xmm1, %xmm0, %xmm0
1627; AVX-NEXT:    retq
1628;
1629; XOP-LABEL: splatconstant_shift_v16i8:
1630; XOP:       # BB#0:
1631; XOP-NEXT:    vpxor %xmm1, %xmm1, %xmm1
1632; XOP-NEXT:    vpsubb {{.*}}(%rip), %xmm1, %xmm1
1633; XOP-NEXT:    vpshab %xmm1, %xmm0, %xmm0
1634; XOP-NEXT:    retq
1635;
1636; AVX512-LABEL: splatconstant_shift_v16i8:
1637; AVX512:       ## BB#0:
1638; AVX512-NEXT:    vpsrlw $3, %xmm0, %xmm0
1639; AVX512-NEXT:    vpand {{.*}}(%rip), %xmm0, %xmm0
1640; AVX512-NEXT:    vmovdqa {{.*#+}} xmm1 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
1641; AVX512-NEXT:    vpxor %xmm1, %xmm0, %xmm0
1642; AVX512-NEXT:    vpsubb %xmm1, %xmm0, %xmm0
1643; AVX512-NEXT:    retq
1644;
1645; X32-SSE-LABEL: splatconstant_shift_v16i8:
1646; X32-SSE:       # BB#0:
1647; X32-SSE-NEXT:    psrlw $3, %xmm0
1648; X32-SSE-NEXT:    pand {{\.LCPI.*}}, %xmm0
1649; X32-SSE-NEXT:    movdqa {{.*#+}} xmm1 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
1650; X32-SSE-NEXT:    pxor %xmm1, %xmm0
1651; X32-SSE-NEXT:    psubb %xmm1, %xmm0
1652; X32-SSE-NEXT:    retl
1653  %shift = ashr <16 x i8> %a, <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3>
1654  ret <16 x i8> %shift
1655}
1656