1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefix=ALL --check-prefix=SSE --check-prefix=SSE2
3; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefix=ALL --check-prefix=SSE --check-prefix=SSE41
4; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX1
5; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX2
6; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+xop,+avx | FileCheck %s --check-prefix=ALL --check-prefix=XOP --check-prefix=XOPAVX1
7; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+xop,+avx2 | FileCheck %s --check-prefix=ALL --check-prefix=XOP --check-prefix=XOPAVX2
8; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=knl -mattr=+avx512bw | FileCheck %s --check-prefix=ALL --check-prefix=AVX512 --check-prefix=AVX512BW
9;
10; Just one 32-bit run to make sure we do reasonable things for i64 shifts.
11; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefix=ALL --check-prefix=X32-SSE --check-prefix=X32-SSE2
12
13;
14; Variable Shifts
15;
16
; Per-element logical right shift: every i64 lane of %a is shifted by the
; matching lane of %b (lshr <2 x i64>).
; Lowering recorded by the checks below:
;  - SSE2, SSE4.1, AVX1 and the 32-bit SSE2 run use two psrlq shifts (one per
;    lane amount) and merge the results with movsd or pblendw.
;  - AVX2, XOP+AVX2 and AVX512BW use a single vpsrlvq.
;  - XOP+AVX1 subtracts the amounts from zero (vpsubq) and uses vpshlq.
define <2 x i64> @var_shift_v2i64(<2 x i64> %a, <2 x i64> %b) nounwind {
; SSE2-LABEL: var_shift_v2i64:
; SSE2:       # BB#0:
; SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm1[2,3,0,1]
; SSE2-NEXT:    movdqa %xmm0, %xmm2
; SSE2-NEXT:    psrlq %xmm3, %xmm2
; SSE2-NEXT:    psrlq %xmm1, %xmm0
; SSE2-NEXT:    movsd {{.*#+}} xmm2 = xmm0[0],xmm2[1]
; SSE2-NEXT:    movapd %xmm2, %xmm0
; SSE2-NEXT:    retq
;
; SSE41-LABEL: var_shift_v2i64:
; SSE41:       # BB#0:
; SSE41-NEXT:    movdqa %xmm0, %xmm2
; SSE41-NEXT:    psrlq %xmm1, %xmm2
; SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
; SSE41-NEXT:    psrlq %xmm1, %xmm0
; SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm2[0,1,2,3],xmm0[4,5,6,7]
; SSE41-NEXT:    retq
;
; AVX1-LABEL: var_shift_v2i64:
; AVX1:       # BB#0:
; AVX1-NEXT:    vpsrlq %xmm1, %xmm0, %xmm2
; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
; AVX1-NEXT:    vpsrlq %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm2[0,1,2,3],xmm0[4,5,6,7]
; AVX1-NEXT:    retq
;
; AVX2-LABEL: var_shift_v2i64:
; AVX2:       # BB#0:
; AVX2-NEXT:    vpsrlvq %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    retq
;
; XOPAVX1-LABEL: var_shift_v2i64:
; XOPAVX1:       # BB#0:
; XOPAVX1-NEXT:    vpxor %xmm2, %xmm2, %xmm2
; XOPAVX1-NEXT:    vpsubq %xmm1, %xmm2, %xmm1
; XOPAVX1-NEXT:    vpshlq %xmm1, %xmm0, %xmm0
; XOPAVX1-NEXT:    retq
;
; XOPAVX2-LABEL: var_shift_v2i64:
; XOPAVX2:       # BB#0:
; XOPAVX2-NEXT:    vpsrlvq %xmm1, %xmm0, %xmm0
; XOPAVX2-NEXT:    retq
;
; AVX512-LABEL: var_shift_v2i64:
; AVX512:       ## BB#0:
; AVX512-NEXT:    vpsrlvq %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    retq
;
; X32-SSE-LABEL: var_shift_v2i64:
; X32-SSE:       # BB#0:
; X32-SSE-NEXT:    pshufd {{.*#+}} xmm3 = xmm1[2,3,0,1]
; X32-SSE-NEXT:    movdqa %xmm0, %xmm2
; X32-SSE-NEXT:    psrlq %xmm3, %xmm2
; X32-SSE-NEXT:    movq {{.*#+}} xmm1 = xmm1[0],zero
; X32-SSE-NEXT:    psrlq %xmm1, %xmm0
; X32-SSE-NEXT:    movsd {{.*#+}} xmm2 = xmm0[0],xmm2[1]
; X32-SSE-NEXT:    movapd %xmm2, %xmm0
; X32-SSE-NEXT:    retl
  %shift = lshr <2 x i64> %a, %b
  ret <2 x i64> %shift
}

; Per-element logical right shift: every i32 lane of %a is shifted by the
; matching lane of %b (lshr <4 x i32>).
; Lowering recorded by the checks below:
;  - SSE2, SSE4.1, AVX1 and the 32-bit SSE2 run issue four psrld shifts, one
;    per lane amount, and recombine the results with shuffles or blends.
;  - AVX2, XOP+AVX2 and AVX512BW use a single vpsrlvd.
;  - XOP+AVX1 subtracts the amounts from zero (vpsubd) and uses vpshld.
define <4 x i32> @var_shift_v4i32(<4 x i32> %a, <4 x i32> %b) nounwind {
; SSE2-LABEL: var_shift_v4i32:
; SSE2:       # BB#0:
; SSE2-NEXT:    movdqa %xmm1, %xmm2
; SSE2-NEXT:    psrldq {{.*#+}} xmm2 = xmm2[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; SSE2-NEXT:    movdqa %xmm0, %xmm3
; SSE2-NEXT:    psrld %xmm2, %xmm3
; SSE2-NEXT:    movdqa %xmm1, %xmm2
; SSE2-NEXT:    psrlq $32, %xmm2
; SSE2-NEXT:    movdqa %xmm0, %xmm4
; SSE2-NEXT:    psrld %xmm2, %xmm4
; SSE2-NEXT:    movsd {{.*#+}} xmm3 = xmm4[0],xmm3[1]
; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm3[1,3,2,3]
; SSE2-NEXT:    pxor %xmm3, %xmm3
; SSE2-NEXT:    movdqa %xmm1, %xmm4
; SSE2-NEXT:    punpckhdq {{.*#+}} xmm4 = xmm4[2],xmm3[2],xmm4[3],xmm3[3]
; SSE2-NEXT:    movdqa %xmm0, %xmm5
; SSE2-NEXT:    psrld %xmm4, %xmm5
; SSE2-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1]
; SSE2-NEXT:    psrld %xmm1, %xmm0
; SSE2-NEXT:    movsd {{.*#+}} xmm5 = xmm0[0],xmm5[1]
; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm5[0,2,2,3]
; SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
; SSE2-NEXT:    retq
;
; SSE41-LABEL: var_shift_v4i32:
; SSE41:       # BB#0:
; SSE41-NEXT:    movdqa %xmm1, %xmm2
; SSE41-NEXT:    psrldq {{.*#+}} xmm2 = xmm2[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; SSE41-NEXT:    movdqa %xmm0, %xmm3
; SSE41-NEXT:    psrld %xmm2, %xmm3
; SSE41-NEXT:    movdqa %xmm1, %xmm2
; SSE41-NEXT:    psrlq $32, %xmm2
; SSE41-NEXT:    movdqa %xmm0, %xmm4
; SSE41-NEXT:    psrld %xmm2, %xmm4
; SSE41-NEXT:    pblendw {{.*#+}} xmm4 = xmm4[0,1,2,3],xmm3[4,5,6,7]
; SSE41-NEXT:    pxor %xmm2, %xmm2
; SSE41-NEXT:    pmovzxdq {{.*#+}} xmm3 = xmm1[0],zero,xmm1[1],zero
; SSE41-NEXT:    punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm2[2],xmm1[3],xmm2[3]
; SSE41-NEXT:    movdqa %xmm0, %xmm2
; SSE41-NEXT:    psrld %xmm1, %xmm2
; SSE41-NEXT:    psrld %xmm3, %xmm0
; SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm2[4,5,6,7]
; SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm4[2,3],xmm0[4,5],xmm4[6,7]
; SSE41-NEXT:    retq
;
; AVX1-LABEL: var_shift_v4i32:
; AVX1:       # BB#0:
; AVX1-NEXT:    vpsrldq {{.*#+}} xmm2 = xmm1[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; AVX1-NEXT:    vpsrld %xmm2, %xmm0, %xmm2
; AVX1-NEXT:    vpsrlq $32, %xmm1, %xmm3
; AVX1-NEXT:    vpsrld %xmm3, %xmm0, %xmm3
; AVX1-NEXT:    vpblendw {{.*#+}} xmm2 = xmm3[0,1,2,3],xmm2[4,5,6,7]
; AVX1-NEXT:    vpxor %xmm3, %xmm3, %xmm3
; AVX1-NEXT:    vpunpckhdq {{.*#+}} xmm3 = xmm1[2],xmm3[2],xmm1[3],xmm3[3]
; AVX1-NEXT:    vpsrld %xmm3, %xmm0, %xmm3
; AVX1-NEXT:    vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero
; AVX1-NEXT:    vpsrld %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm3[4,5,6,7]
; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7]
; AVX1-NEXT:    retq
;
; AVX2-LABEL: var_shift_v4i32:
; AVX2:       # BB#0:
; AVX2-NEXT:    vpsrlvd %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    retq
;
; XOPAVX1-LABEL: var_shift_v4i32:
; XOPAVX1:       # BB#0:
; XOPAVX1-NEXT:    vpxor %xmm2, %xmm2, %xmm2
; XOPAVX1-NEXT:    vpsubd %xmm1, %xmm2, %xmm1
; XOPAVX1-NEXT:    vpshld %xmm1, %xmm0, %xmm0
; XOPAVX1-NEXT:    retq
;
; XOPAVX2-LABEL: var_shift_v4i32:
; XOPAVX2:       # BB#0:
; XOPAVX2-NEXT:    vpsrlvd %xmm1, %xmm0, %xmm0
; XOPAVX2-NEXT:    retq
;
; AVX512-LABEL: var_shift_v4i32:
; AVX512:       ## BB#0:
; AVX512-NEXT:    vpsrlvd %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    retq
;
; X32-SSE-LABEL: var_shift_v4i32:
; X32-SSE:       # BB#0:
; X32-SSE-NEXT:    movdqa %xmm1, %xmm2
; X32-SSE-NEXT:    psrldq {{.*#+}} xmm2 = xmm2[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; X32-SSE-NEXT:    movdqa %xmm0, %xmm3
; X32-SSE-NEXT:    psrld %xmm2, %xmm3
; X32-SSE-NEXT:    movdqa %xmm1, %xmm2
; X32-SSE-NEXT:    psrlq $32, %xmm2
; X32-SSE-NEXT:    movdqa %xmm0, %xmm4
; X32-SSE-NEXT:    psrld %xmm2, %xmm4
; X32-SSE-NEXT:    movsd {{.*#+}} xmm3 = xmm4[0],xmm3[1]
; X32-SSE-NEXT:    pshufd {{.*#+}} xmm2 = xmm3[1,3,2,3]
; X32-SSE-NEXT:    pxor %xmm3, %xmm3
; X32-SSE-NEXT:    movdqa %xmm1, %xmm4
; X32-SSE-NEXT:    punpckhdq {{.*#+}} xmm4 = xmm4[2],xmm3[2],xmm4[3],xmm3[3]
; X32-SSE-NEXT:    movdqa %xmm0, %xmm5
; X32-SSE-NEXT:    psrld %xmm4, %xmm5
; X32-SSE-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1]
; X32-SSE-NEXT:    psrld %xmm1, %xmm0
; X32-SSE-NEXT:    movsd {{.*#+}} xmm5 = xmm0[0],xmm5[1]
; X32-SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm5[0,2,2,3]
; X32-SSE-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
; X32-SSE-NEXT:    retl
  %shift = lshr <4 x i32> %a, %b
  ret <4 x i32> %shift
}

; Per-element logical right shift: every i16 lane of %a is shifted by the
; matching lane of %b (lshr <8 x i16>).
; Lowering recorded by the checks below:
;  - SSE2 and the 32-bit SSE2 run shift the amounts left by 12, then run four
;    select steps (psrlw by 8, 4, 2, 1), each gated by a psraw $15 mask that is
;    applied with pand/pandn/por.
;  - SSE4.1 and AVX1 run the same 8/4/2/1 sequence but select with pblendvb.
;  - AVX2 widens both operands with vpmovzxwd, shifts with vpsrlvd and narrows
;    the result with vpshufb + vpermq.
;  - Both XOP runs subtract the amounts from zero (vpsubw) and use vpshlw.
;  - AVX512BW uses vpsrlvw on the zmm registers.
define <8 x i16> @var_shift_v8i16(<8 x i16> %a, <8 x i16> %b) nounwind {
; SSE2-LABEL: var_shift_v8i16:
; SSE2:       # BB#0:
; SSE2-NEXT:    psllw $12, %xmm1
; SSE2-NEXT:    movdqa %xmm1, %xmm2
; SSE2-NEXT:    psraw $15, %xmm2
; SSE2-NEXT:    movdqa %xmm2, %xmm3
; SSE2-NEXT:    pandn %xmm0, %xmm3
; SSE2-NEXT:    psrlw $8, %xmm0
; SSE2-NEXT:    pand %xmm2, %xmm0
; SSE2-NEXT:    por %xmm3, %xmm0
; SSE2-NEXT:    paddw %xmm1, %xmm1
; SSE2-NEXT:    movdqa %xmm1, %xmm2
; SSE2-NEXT:    psraw $15, %xmm2
; SSE2-NEXT:    movdqa %xmm2, %xmm3
; SSE2-NEXT:    pandn %xmm0, %xmm3
; SSE2-NEXT:    psrlw $4, %xmm0
; SSE2-NEXT:    pand %xmm2, %xmm0
; SSE2-NEXT:    por %xmm3, %xmm0
; SSE2-NEXT:    paddw %xmm1, %xmm1
; SSE2-NEXT:    movdqa %xmm1, %xmm2
; SSE2-NEXT:    psraw $15, %xmm2
; SSE2-NEXT:    movdqa %xmm2, %xmm3
; SSE2-NEXT:    pandn %xmm0, %xmm3
; SSE2-NEXT:    psrlw $2, %xmm0
; SSE2-NEXT:    pand %xmm2, %xmm0
; SSE2-NEXT:    por %xmm3, %xmm0
; SSE2-NEXT:    paddw %xmm1, %xmm1
; SSE2-NEXT:    psraw $15, %xmm1
; SSE2-NEXT:    movdqa %xmm1, %xmm2
; SSE2-NEXT:    pandn %xmm0, %xmm2
; SSE2-NEXT:    psrlw $1, %xmm0
; SSE2-NEXT:    pand %xmm1, %xmm0
; SSE2-NEXT:    por %xmm2, %xmm0
; SSE2-NEXT:    retq
;
; SSE41-LABEL: var_shift_v8i16:
; SSE41:       # BB#0:
; SSE41-NEXT:    movdqa %xmm0, %xmm2
; SSE41-NEXT:    movdqa %xmm1, %xmm0
; SSE41-NEXT:    psllw $12, %xmm0
; SSE41-NEXT:    psllw $4, %xmm1
; SSE41-NEXT:    por %xmm0, %xmm1
; SSE41-NEXT:    movdqa %xmm1, %xmm3
; SSE41-NEXT:    paddw %xmm3, %xmm3
; SSE41-NEXT:    movdqa %xmm2, %xmm4
; SSE41-NEXT:    psrlw $8, %xmm4
; SSE41-NEXT:    movdqa %xmm1, %xmm0
; SSE41-NEXT:    pblendvb %xmm4, %xmm2
; SSE41-NEXT:    movdqa %xmm2, %xmm1
; SSE41-NEXT:    psrlw $4, %xmm1
; SSE41-NEXT:    movdqa %xmm3, %xmm0
; SSE41-NEXT:    pblendvb %xmm1, %xmm2
; SSE41-NEXT:    movdqa %xmm2, %xmm1
; SSE41-NEXT:    psrlw $2, %xmm1
; SSE41-NEXT:    paddw %xmm3, %xmm3
; SSE41-NEXT:    movdqa %xmm3, %xmm0
; SSE41-NEXT:    pblendvb %xmm1, %xmm2
; SSE41-NEXT:    movdqa %xmm2, %xmm1
; SSE41-NEXT:    psrlw $1, %xmm1
; SSE41-NEXT:    paddw %xmm3, %xmm3
; SSE41-NEXT:    movdqa %xmm3, %xmm0
; SSE41-NEXT:    pblendvb %xmm1, %xmm2
; SSE41-NEXT:    movdqa %xmm2, %xmm0
; SSE41-NEXT:    retq
;
; AVX1-LABEL: var_shift_v8i16:
; AVX1:       # BB#0:
; AVX1-NEXT:    vpsllw $12, %xmm1, %xmm2
; AVX1-NEXT:    vpsllw $4, %xmm1, %xmm1
; AVX1-NEXT:    vpor %xmm2, %xmm1, %xmm1
; AVX1-NEXT:    vpaddw %xmm1, %xmm1, %xmm2
; AVX1-NEXT:    vpsrlw $8, %xmm0, %xmm3
; AVX1-NEXT:    vpblendvb %xmm1, %xmm3, %xmm0, %xmm0
; AVX1-NEXT:    vpsrlw $4, %xmm0, %xmm1
; AVX1-NEXT:    vpblendvb %xmm2, %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vpsrlw $2, %xmm0, %xmm1
; AVX1-NEXT:    vpaddw %xmm2, %xmm2, %xmm2
; AVX1-NEXT:    vpblendvb %xmm2, %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vpsrlw $1, %xmm0, %xmm1
; AVX1-NEXT:    vpaddw %xmm2, %xmm2, %xmm2
; AVX1-NEXT:    vpblendvb %xmm2, %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: var_shift_v8i16:
; AVX2:       # BB#0:
; AVX2-NEXT:    vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
; AVX2-NEXT:    vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; AVX2-NEXT:    vpsrlvd %ymm1, %ymm0, %ymm0
; AVX2-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero,ymm0[16,17,20,21,24,25,28,29],zero,zero,zero,zero,zero,zero,zero,zero
; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
; AVX2-NEXT:    # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
; AVX2-NEXT:    vzeroupper
; AVX2-NEXT:    retq
;
; XOP-LABEL: var_shift_v8i16:
; XOP:       # BB#0:
; XOP-NEXT:    vpxor %xmm2, %xmm2, %xmm2
; XOP-NEXT:    vpsubw %xmm1, %xmm2, %xmm1
; XOP-NEXT:    vpshlw %xmm1, %xmm0, %xmm0
; XOP-NEXT:    retq
;
; AVX512-LABEL: var_shift_v8i16:
; AVX512:       ## BB#0:
; AVX512-NEXT:    ## kill: %XMM1<def> %XMM1<kill> %ZMM1<def>
; AVX512-NEXT:    ## kill: %XMM0<def> %XMM0<kill> %ZMM0<def>
; AVX512-NEXT:    vpsrlvw %zmm1, %zmm0, %zmm0
; AVX512-NEXT:    ## kill: %XMM0<def> %XMM0<kill> %ZMM0<kill>
; AVX512-NEXT:    retq
;
; X32-SSE-LABEL: var_shift_v8i16:
; X32-SSE:       # BB#0:
; X32-SSE-NEXT:    psllw $12, %xmm1
; X32-SSE-NEXT:    movdqa %xmm1, %xmm2
; X32-SSE-NEXT:    psraw $15, %xmm2
; X32-SSE-NEXT:    movdqa %xmm2, %xmm3
; X32-SSE-NEXT:    pandn %xmm0, %xmm3
; X32-SSE-NEXT:    psrlw $8, %xmm0
; X32-SSE-NEXT:    pand %xmm2, %xmm0
; X32-SSE-NEXT:    por %xmm3, %xmm0
; X32-SSE-NEXT:    paddw %xmm1, %xmm1
; X32-SSE-NEXT:    movdqa %xmm1, %xmm2
; X32-SSE-NEXT:    psraw $15, %xmm2
; X32-SSE-NEXT:    movdqa %xmm2, %xmm3
; X32-SSE-NEXT:    pandn %xmm0, %xmm3
; X32-SSE-NEXT:    psrlw $4, %xmm0
; X32-SSE-NEXT:    pand %xmm2, %xmm0
; X32-SSE-NEXT:    por %xmm3, %xmm0
; X32-SSE-NEXT:    paddw %xmm1, %xmm1
; X32-SSE-NEXT:    movdqa %xmm1, %xmm2
; X32-SSE-NEXT:    psraw $15, %xmm2
; X32-SSE-NEXT:    movdqa %xmm2, %xmm3
; X32-SSE-NEXT:    pandn %xmm0, %xmm3
; X32-SSE-NEXT:    psrlw $2, %xmm0
; X32-SSE-NEXT:    pand %xmm2, %xmm0
; X32-SSE-NEXT:    por %xmm3, %xmm0
; X32-SSE-NEXT:    paddw %xmm1, %xmm1
; X32-SSE-NEXT:    psraw $15, %xmm1
; X32-SSE-NEXT:    movdqa %xmm1, %xmm2
; X32-SSE-NEXT:    pandn %xmm0, %xmm2
; X32-SSE-NEXT:    psrlw $1, %xmm0
; X32-SSE-NEXT:    pand %xmm1, %xmm0
; X32-SSE-NEXT:    por %xmm2, %xmm0
; X32-SSE-NEXT:    retl
  %shift = lshr <8 x i16> %a, %b
  ret <8 x i16> %shift
}

; Per-element logical right shift: every i8 lane of %a is shifted by the
; matching lane of %b (lshr <16 x i8>).
; Lowering recorded by the checks below:
;  - SSE2 and the 32-bit SSE2 run shift the amounts left by 5 (psllw $5), then
;    run three select steps (psrlw by 4, 2, 1, each followed by a pand with a
;    memory operand), gated by pcmpgtb masks applied with pand/pandn/por.
;  - SSE4.1, both AVX runs and AVX512BW run the same 4/2/1 sequence but select
;    with pblendvb.
;  - Both XOP runs subtract the amounts from zero (vpsubb) and use vpshlb.
define <16 x i8> @var_shift_v16i8(<16 x i8> %a, <16 x i8> %b) nounwind {
; SSE2-LABEL: var_shift_v16i8:
; SSE2:       # BB#0:
; SSE2-NEXT:    psllw $5, %xmm1
; SSE2-NEXT:    pxor %xmm2, %xmm2
; SSE2-NEXT:    pxor %xmm3, %xmm3
; SSE2-NEXT:    pcmpgtb %xmm1, %xmm3
; SSE2-NEXT:    movdqa %xmm3, %xmm4
; SSE2-NEXT:    pandn %xmm0, %xmm4
; SSE2-NEXT:    psrlw $4, %xmm0
; SSE2-NEXT:    pand {{.*}}(%rip), %xmm0
; SSE2-NEXT:    pand %xmm3, %xmm0
; SSE2-NEXT:    por %xmm4, %xmm0
; SSE2-NEXT:    paddb %xmm1, %xmm1
; SSE2-NEXT:    pxor %xmm3, %xmm3
; SSE2-NEXT:    pcmpgtb %xmm1, %xmm3
; SSE2-NEXT:    movdqa %xmm3, %xmm4
; SSE2-NEXT:    pandn %xmm0, %xmm4
; SSE2-NEXT:    psrlw $2, %xmm0
; SSE2-NEXT:    pand {{.*}}(%rip), %xmm0
; SSE2-NEXT:    pand %xmm3, %xmm0
; SSE2-NEXT:    por %xmm4, %xmm0
; SSE2-NEXT:    paddb %xmm1, %xmm1
; SSE2-NEXT:    pcmpgtb %xmm1, %xmm2
; SSE2-NEXT:    movdqa %xmm2, %xmm1
; SSE2-NEXT:    pandn %xmm0, %xmm1
; SSE2-NEXT:    psrlw $1, %xmm0
; SSE2-NEXT:    pand {{.*}}(%rip), %xmm0
; SSE2-NEXT:    pand %xmm2, %xmm0
; SSE2-NEXT:    por %xmm1, %xmm0
; SSE2-NEXT:    retq
;
; SSE41-LABEL: var_shift_v16i8:
; SSE41:       # BB#0:
; SSE41-NEXT:    movdqa %xmm0, %xmm2
; SSE41-NEXT:    psllw $5, %xmm1
; SSE41-NEXT:    movdqa %xmm2, %xmm3
; SSE41-NEXT:    psrlw $4, %xmm3
; SSE41-NEXT:    pand {{.*}}(%rip), %xmm3
; SSE41-NEXT:    movdqa %xmm1, %xmm0
; SSE41-NEXT:    pblendvb %xmm3, %xmm2
; SSE41-NEXT:    movdqa %xmm2, %xmm3
; SSE41-NEXT:    psrlw $2, %xmm3
; SSE41-NEXT:    pand {{.*}}(%rip), %xmm3
; SSE41-NEXT:    paddb %xmm1, %xmm1
; SSE41-NEXT:    movdqa %xmm1, %xmm0
; SSE41-NEXT:    pblendvb %xmm3, %xmm2
; SSE41-NEXT:    movdqa %xmm2, %xmm3
; SSE41-NEXT:    psrlw $1, %xmm3
; SSE41-NEXT:    pand {{.*}}(%rip), %xmm3
; SSE41-NEXT:    paddb %xmm1, %xmm1
; SSE41-NEXT:    movdqa %xmm1, %xmm0
; SSE41-NEXT:    pblendvb %xmm3, %xmm2
; SSE41-NEXT:    movdqa %xmm2, %xmm0
; SSE41-NEXT:    retq
;
; AVX-LABEL: var_shift_v16i8:
; AVX:       # BB#0:
; AVX-NEXT:    vpsllw $5, %xmm1, %xmm1
; AVX-NEXT:    vpsrlw $4, %xmm0, %xmm2
; AVX-NEXT:    vpand {{.*}}(%rip), %xmm2, %xmm2
; AVX-NEXT:    vpblendvb %xmm1, %xmm2, %xmm0, %xmm0
; AVX-NEXT:    vpsrlw $2, %xmm0, %xmm2
; AVX-NEXT:    vpand {{.*}}(%rip), %xmm2, %xmm2
; AVX-NEXT:    vpaddb %xmm1, %xmm1, %xmm1
; AVX-NEXT:    vpblendvb %xmm1, %xmm2, %xmm0, %xmm0
; AVX-NEXT:    vpsrlw $1, %xmm0, %xmm2
; AVX-NEXT:    vpand {{.*}}(%rip), %xmm2, %xmm2
; AVX-NEXT:    vpaddb %xmm1, %xmm1, %xmm1
; AVX-NEXT:    vpblendvb %xmm1, %xmm2, %xmm0, %xmm0
; AVX-NEXT:    retq
;
; XOP-LABEL: var_shift_v16i8:
; XOP:       # BB#0:
; XOP-NEXT:    vpxor %xmm2, %xmm2, %xmm2
; XOP-NEXT:    vpsubb %xmm1, %xmm2, %xmm1
; XOP-NEXT:    vpshlb %xmm1, %xmm0, %xmm0
; XOP-NEXT:    retq
;
; AVX512-LABEL: var_shift_v16i8:
; AVX512:       ## BB#0:
; AVX512-NEXT:    vpsllw $5, %xmm1, %xmm1
; AVX512-NEXT:    vpsrlw $4, %xmm0, %xmm2
; AVX512-NEXT:    vpand {{.*}}(%rip), %xmm2, %xmm2
; AVX512-NEXT:    vpblendvb %xmm1, %xmm2, %xmm0, %xmm0
; AVX512-NEXT:    vpsrlw $2, %xmm0, %xmm2
; AVX512-NEXT:    vpand {{.*}}(%rip), %xmm2, %xmm2
; AVX512-NEXT:    vpaddb %xmm1, %xmm1, %xmm1
; AVX512-NEXT:    vpblendvb %xmm1, %xmm2, %xmm0, %xmm0
; AVX512-NEXT:    vpsrlw $1, %xmm0, %xmm2
; AVX512-NEXT:    vpand {{.*}}(%rip), %xmm2, %xmm2
; AVX512-NEXT:    vpaddb %xmm1, %xmm1, %xmm1
; AVX512-NEXT:    vpblendvb %xmm1, %xmm2, %xmm0, %xmm0
; AVX512-NEXT:    retq
;
; X32-SSE-LABEL: var_shift_v16i8:
; X32-SSE:       # BB#0:
; X32-SSE-NEXT:    psllw $5, %xmm1
; X32-SSE-NEXT:    pxor %xmm2, %xmm2
; X32-SSE-NEXT:    pxor %xmm3, %xmm3
; X32-SSE-NEXT:    pcmpgtb %xmm1, %xmm3
; X32-SSE-NEXT:    movdqa %xmm3, %xmm4
; X32-SSE-NEXT:    pandn %xmm0, %xmm4
; X32-SSE-NEXT:    psrlw $4, %xmm0
; X32-SSE-NEXT:    pand {{\.LCPI.*}}, %xmm0
; X32-SSE-NEXT:    pand %xmm3, %xmm0
; X32-SSE-NEXT:    por %xmm4, %xmm0
; X32-SSE-NEXT:    paddb %xmm1, %xmm1
; X32-SSE-NEXT:    pxor %xmm3, %xmm3
; X32-SSE-NEXT:    pcmpgtb %xmm1, %xmm3
; X32-SSE-NEXT:    movdqa %xmm3, %xmm4
; X32-SSE-NEXT:    pandn %xmm0, %xmm4
; X32-SSE-NEXT:    psrlw $2, %xmm0
; X32-SSE-NEXT:    pand {{\.LCPI.*}}, %xmm0
; X32-SSE-NEXT:    pand %xmm3, %xmm0
; X32-SSE-NEXT:    por %xmm4, %xmm0
; X32-SSE-NEXT:    paddb %xmm1, %xmm1
; X32-SSE-NEXT:    pcmpgtb %xmm1, %xmm2
; X32-SSE-NEXT:    movdqa %xmm2, %xmm1
; X32-SSE-NEXT:    pandn %xmm0, %xmm1
; X32-SSE-NEXT:    psrlw $1, %xmm0
; X32-SSE-NEXT:    pand {{\.LCPI.*}}, %xmm0
; X32-SSE-NEXT:    pand %xmm2, %xmm0
; X32-SSE-NEXT:    por %xmm1, %xmm0
; X32-SSE-NEXT:    retl
  %shift = lshr <16 x i8> %a, %b
  ret <16 x i8> %shift
}

469;
470; Uniform Variable Shifts
471;
472
; Uniform logical right shift: element 0 of %b is splatted (shufflevector with
; an all-zero mask) and every i64 lane of %a is shifted by it.
; Lowering recorded by the checks below: every 64-bit run uses a single
; psrlq/vpsrlq with the amount register; the 32-bit SSE2 run first applies
; movq to xmm1 (keeping element 0, zeroing the rest) and then uses psrlq.
define <2 x i64> @splatvar_shift_v2i64(<2 x i64> %a, <2 x i64> %b) nounwind {
; SSE-LABEL: splatvar_shift_v2i64:
; SSE:       # BB#0:
; SSE-NEXT:    psrlq %xmm1, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: splatvar_shift_v2i64:
; AVX:       # BB#0:
; AVX-NEXT:    vpsrlq %xmm1, %xmm0, %xmm0
; AVX-NEXT:    retq
;
; XOP-LABEL: splatvar_shift_v2i64:
; XOP:       # BB#0:
; XOP-NEXT:    vpsrlq %xmm1, %xmm0, %xmm0
; XOP-NEXT:    retq
;
; AVX512-LABEL: splatvar_shift_v2i64:
; AVX512:       ## BB#0:
; AVX512-NEXT:    vpsrlq %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    retq
;
; X32-SSE-LABEL: splatvar_shift_v2i64:
; X32-SSE:       # BB#0:
; X32-SSE-NEXT:    movq {{.*#+}} xmm1 = xmm1[0],zero
; X32-SSE-NEXT:    psrlq %xmm1, %xmm0
; X32-SSE-NEXT:    retl
  %splat = shufflevector <2 x i64> %b, <2 x i64> undef, <2 x i32> zeroinitializer
  %shift = lshr <2 x i64> %a, %splat
  ret <2 x i64> %shift
}

; Uniform logical right shift: element 0 of %b is splatted (shufflevector with
; an all-zero mask) and every i32 lane of %a is shifted by it.
; Lowering recorded by the checks below: element 0 of the amount is merged
; with a zeroed register (movss on SSE2, AVX512BW and the 32-bit run; pblendw
; on SSE4.1, AVX and XOP), followed by a single psrld/vpsrld.
define <4 x i32> @splatvar_shift_v4i32(<4 x i32> %a, <4 x i32> %b) nounwind {
; SSE2-LABEL: splatvar_shift_v4i32:
; SSE2:       # BB#0:
; SSE2-NEXT:    xorps %xmm2, %xmm2
; SSE2-NEXT:    movss {{.*#+}} xmm2 = xmm1[0],xmm2[1,2,3]
; SSE2-NEXT:    psrld %xmm2, %xmm0
; SSE2-NEXT:    retq
;
; SSE41-LABEL: splatvar_shift_v4i32:
; SSE41:       # BB#0:
; SSE41-NEXT:    pxor %xmm2, %xmm2
; SSE41-NEXT:    pblendw {{.*#+}} xmm2 = xmm1[0,1],xmm2[2,3,4,5,6,7]
; SSE41-NEXT:    psrld %xmm2, %xmm0
; SSE41-NEXT:    retq
;
; AVX-LABEL: splatvar_shift_v4i32:
; AVX:       # BB#0:
; AVX-NEXT:    vpxor %xmm2, %xmm2, %xmm2
; AVX-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3,4,5,6,7]
; AVX-NEXT:    vpsrld %xmm1, %xmm0, %xmm0
; AVX-NEXT:    retq
;
; XOP-LABEL: splatvar_shift_v4i32:
; XOP:       # BB#0:
; XOP-NEXT:    vpxor %xmm2, %xmm2, %xmm2
; XOP-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3,4,5,6,7]
; XOP-NEXT:    vpsrld %xmm1, %xmm0, %xmm0
; XOP-NEXT:    retq
;
; AVX512-LABEL: splatvar_shift_v4i32:
; AVX512:       ## BB#0:
; AVX512-NEXT:    vxorps %xmm2, %xmm2, %xmm2
; AVX512-NEXT:    vmovss {{.*#+}} xmm1 = xmm1[0],xmm2[1,2,3]
; AVX512-NEXT:    vpsrld %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    retq
;
; X32-SSE-LABEL: splatvar_shift_v4i32:
; X32-SSE:       # BB#0:
; X32-SSE-NEXT:    xorps %xmm2, %xmm2
; X32-SSE-NEXT:    movss {{.*#+}} xmm2 = xmm1[0],xmm2[1,2,3]
; X32-SSE-NEXT:    psrld %xmm2, %xmm0
; X32-SSE-NEXT:    retl
  %splat = shufflevector <4 x i32> %b, <4 x i32> undef, <4 x i32> zeroinitializer
  %shift = lshr <4 x i32> %a, %splat
  ret <4 x i32> %shift
}

; Uniform logical right shift: element 0 of %b is splatted (shufflevector with
; an all-zero mask) and every i16 lane of %a is shifted by it.
; Lowering recorded by the checks below:
;  - SSE2 and the 32-bit SSE2 run move the amount through eax
;    (movd, movzwl, movd) and then use psrlw.
;  - SSE4.1, AVX, XOP and AVX512BW blend word 0 of the amount with a zeroed
;    register (pblendw) and then use psrlw/vpsrlw.
define <8 x i16> @splatvar_shift_v8i16(<8 x i16> %a, <8 x i16> %b) nounwind {
; SSE2-LABEL: splatvar_shift_v8i16:
; SSE2:       # BB#0:
; SSE2-NEXT:    movd %xmm1, %eax
; SSE2-NEXT:    movzwl %ax, %eax
; SSE2-NEXT:    movd %eax, %xmm1
; SSE2-NEXT:    psrlw %xmm1, %xmm0
; SSE2-NEXT:    retq
;
; SSE41-LABEL: splatvar_shift_v8i16:
; SSE41:       # BB#0:
; SSE41-NEXT:    pxor %xmm2, %xmm2
; SSE41-NEXT:    pblendw {{.*#+}} xmm2 = xmm1[0],xmm2[1,2,3,4,5,6,7]
; SSE41-NEXT:    psrlw %xmm2, %xmm0
; SSE41-NEXT:    retq
;
; AVX-LABEL: splatvar_shift_v8i16:
; AVX:       # BB#0:
; AVX-NEXT:    vpxor %xmm2, %xmm2, %xmm2
; AVX-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1,2,3,4,5,6,7]
; AVX-NEXT:    vpsrlw %xmm1, %xmm0, %xmm0
; AVX-NEXT:    retq
;
; XOP-LABEL: splatvar_shift_v8i16:
; XOP:       # BB#0:
; XOP-NEXT:    vpxor %xmm2, %xmm2, %xmm2
; XOP-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1,2,3,4,5,6,7]
; XOP-NEXT:    vpsrlw %xmm1, %xmm0, %xmm0
; XOP-NEXT:    retq
;
; AVX512-LABEL: splatvar_shift_v8i16:
; AVX512:       ## BB#0:
; AVX512-NEXT:    vpxor %xmm2, %xmm2, %xmm2
; AVX512-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1,2,3,4,5,6,7]
; AVX512-NEXT:    vpsrlw %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    retq
;
; X32-SSE-LABEL: splatvar_shift_v8i16:
; X32-SSE:       # BB#0:
; X32-SSE-NEXT:    movd %xmm1, %eax
; X32-SSE-NEXT:    movzwl %ax, %eax
; X32-SSE-NEXT:    movd %eax, %xmm1
; X32-SSE-NEXT:    psrlw %xmm1, %xmm0
; X32-SSE-NEXT:    retl
  %splat = shufflevector <8 x i16> %b, <8 x i16> undef, <8 x i32> zeroinitializer
  %shift = lshr <8 x i16> %a, %splat
  ret <8 x i16> %shift
}

; Uniform logical right shift: element 0 of %b is splatted (shufflevector with
; an all-zero mask) and every i8 lane of %a is shifted by it.
; Lowering recorded by the checks below:
;  - The byte splat is built with punpcklbw + pshuflw + pshufd on SSE2 and the
;    32-bit run, with pshufb against a zeroed register on SSE4.1, AVX1 and
;    XOP+AVX1, and with vpbroadcastb on AVX2, XOP+AVX2 and AVX512BW.
;  - The shift itself is then the psllw $5 plus 4/2/1 select sequence
;    (pcmpgtb masks on SSE2 and the 32-bit run, pblendvb on SSE4.1, AVX1, AVX2
;    and AVX512BW), or vpsubb from zero followed by vpshlb on both XOP runs.
define <16 x i8> @splatvar_shift_v16i8(<16 x i8> %a, <16 x i8> %b) nounwind {
; SSE2-LABEL: splatvar_shift_v16i8:
; SSE2:       # BB#0:
; SSE2-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; SSE2-NEXT:    pshuflw {{.*#+}} xmm1 = xmm1[0,0,0,0,4,5,6,7]
; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[0,0,1,1]
; SSE2-NEXT:    psllw $5, %xmm2
; SSE2-NEXT:    pxor %xmm1, %xmm1
; SSE2-NEXT:    pxor %xmm3, %xmm3
; SSE2-NEXT:    pcmpgtb %xmm2, %xmm3
; SSE2-NEXT:    movdqa %xmm3, %xmm4
; SSE2-NEXT:    pandn %xmm0, %xmm4
; SSE2-NEXT:    psrlw $4, %xmm0
; SSE2-NEXT:    pand {{.*}}(%rip), %xmm0
; SSE2-NEXT:    pand %xmm3, %xmm0
; SSE2-NEXT:    por %xmm4, %xmm0
; SSE2-NEXT:    paddb %xmm2, %xmm2
; SSE2-NEXT:    pxor %xmm3, %xmm3
; SSE2-NEXT:    pcmpgtb %xmm2, %xmm3
; SSE2-NEXT:    movdqa %xmm3, %xmm4
; SSE2-NEXT:    pandn %xmm0, %xmm4
; SSE2-NEXT:    psrlw $2, %xmm0
; SSE2-NEXT:    pand {{.*}}(%rip), %xmm0
; SSE2-NEXT:    pand %xmm3, %xmm0
; SSE2-NEXT:    por %xmm4, %xmm0
; SSE2-NEXT:    paddb %xmm2, %xmm2
; SSE2-NEXT:    pcmpgtb %xmm2, %xmm1
; SSE2-NEXT:    movdqa %xmm1, %xmm2
; SSE2-NEXT:    pandn %xmm0, %xmm2
; SSE2-NEXT:    psrlw $1, %xmm0
; SSE2-NEXT:    pand {{.*}}(%rip), %xmm0
; SSE2-NEXT:    pand %xmm1, %xmm0
; SSE2-NEXT:    por %xmm2, %xmm0
; SSE2-NEXT:    retq
;
; SSE41-LABEL: splatvar_shift_v16i8:
; SSE41:       # BB#0:
; SSE41-NEXT:    movdqa %xmm0, %xmm2
; SSE41-NEXT:    pxor %xmm0, %xmm0
; SSE41-NEXT:    pshufb %xmm0, %xmm1
; SSE41-NEXT:    psllw $5, %xmm1
; SSE41-NEXT:    movdqa %xmm1, %xmm3
; SSE41-NEXT:    paddb %xmm3, %xmm3
; SSE41-NEXT:    movdqa %xmm2, %xmm4
; SSE41-NEXT:    psrlw $4, %xmm4
; SSE41-NEXT:    pand {{.*}}(%rip), %xmm4
; SSE41-NEXT:    movdqa %xmm1, %xmm0
; SSE41-NEXT:    pblendvb %xmm4, %xmm2
; SSE41-NEXT:    movdqa %xmm2, %xmm1
; SSE41-NEXT:    psrlw $2, %xmm1
; SSE41-NEXT:    pand {{.*}}(%rip), %xmm1
; SSE41-NEXT:    movdqa %xmm3, %xmm0
; SSE41-NEXT:    pblendvb %xmm1, %xmm2
; SSE41-NEXT:    movdqa %xmm2, %xmm1
; SSE41-NEXT:    psrlw $1, %xmm1
; SSE41-NEXT:    pand {{.*}}(%rip), %xmm1
; SSE41-NEXT:    paddb %xmm3, %xmm3
; SSE41-NEXT:    movdqa %xmm3, %xmm0
; SSE41-NEXT:    pblendvb %xmm1, %xmm2
; SSE41-NEXT:    movdqa %xmm2, %xmm0
; SSE41-NEXT:    retq
;
; AVX1-LABEL: splatvar_shift_v16i8:
; AVX1:       # BB#0:
; AVX1-NEXT:    vpxor %xmm2, %xmm2, %xmm2
; AVX1-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
; AVX1-NEXT:    vpsllw $5, %xmm1, %xmm1
; AVX1-NEXT:    vpaddb %xmm1, %xmm1, %xmm2
; AVX1-NEXT:    vpsrlw $4, %xmm0, %xmm3
; AVX1-NEXT:    vpand {{.*}}(%rip), %xmm3, %xmm3
; AVX1-NEXT:    vpblendvb %xmm1, %xmm3, %xmm0, %xmm0
; AVX1-NEXT:    vpsrlw $2, %xmm0, %xmm1
; AVX1-NEXT:    vpand {{.*}}(%rip), %xmm1, %xmm1
; AVX1-NEXT:    vpblendvb %xmm2, %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vpsrlw $1, %xmm0, %xmm1
; AVX1-NEXT:    vpand {{.*}}(%rip), %xmm1, %xmm1
; AVX1-NEXT:    vpaddb %xmm2, %xmm2, %xmm2
; AVX1-NEXT:    vpblendvb %xmm2, %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: splatvar_shift_v16i8:
; AVX2:       # BB#0:
; AVX2-NEXT:    vpbroadcastb %xmm1, %xmm1
; AVX2-NEXT:    vpsllw $5, %xmm1, %xmm1
; AVX2-NEXT:    vpsrlw $4, %xmm0, %xmm2
; AVX2-NEXT:    vpand {{.*}}(%rip), %xmm2, %xmm2
; AVX2-NEXT:    vpblendvb %xmm1, %xmm2, %xmm0, %xmm0
; AVX2-NEXT:    vpsrlw $2, %xmm0, %xmm2
; AVX2-NEXT:    vpand {{.*}}(%rip), %xmm2, %xmm2
; AVX2-NEXT:    vpaddb %xmm1, %xmm1, %xmm1
; AVX2-NEXT:    vpblendvb %xmm1, %xmm2, %xmm0, %xmm0
; AVX2-NEXT:    vpsrlw $1, %xmm0, %xmm2
; AVX2-NEXT:    vpand {{.*}}(%rip), %xmm2, %xmm2
; AVX2-NEXT:    vpaddb %xmm1, %xmm1, %xmm1
; AVX2-NEXT:    vpblendvb %xmm1, %xmm2, %xmm0, %xmm0
; AVX2-NEXT:    retq
;
; XOPAVX1-LABEL: splatvar_shift_v16i8:
; XOPAVX1:       # BB#0:
; XOPAVX1-NEXT:    vpxor %xmm2, %xmm2, %xmm2
; XOPAVX1-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
; XOPAVX1-NEXT:    vpsubb %xmm1, %xmm2, %xmm1
; XOPAVX1-NEXT:    vpshlb %xmm1, %xmm0, %xmm0
; XOPAVX1-NEXT:    retq
;
; XOPAVX2-LABEL: splatvar_shift_v16i8:
; XOPAVX2:       # BB#0:
; XOPAVX2-NEXT:    vpbroadcastb %xmm1, %xmm1
; XOPAVX2-NEXT:    vpxor %xmm2, %xmm2, %xmm2
; XOPAVX2-NEXT:    vpsubb %xmm1, %xmm2, %xmm1
; XOPAVX2-NEXT:    vpshlb %xmm1, %xmm0, %xmm0
; XOPAVX2-NEXT:    retq
;
; AVX512-LABEL: splatvar_shift_v16i8:
; AVX512:       ## BB#0:
; AVX512-NEXT:    vpbroadcastb %xmm1, %xmm1
; AVX512-NEXT:    vpsllw $5, %xmm1, %xmm1
; AVX512-NEXT:    vpsrlw $4, %xmm0, %xmm2
; AVX512-NEXT:    vpand {{.*}}(%rip), %xmm2, %xmm2
; AVX512-NEXT:    vpblendvb %xmm1, %xmm2, %xmm0, %xmm0
; AVX512-NEXT:    vpsrlw $2, %xmm0, %xmm2
; AVX512-NEXT:    vpand {{.*}}(%rip), %xmm2, %xmm2
; AVX512-NEXT:    vpaddb %xmm1, %xmm1, %xmm1
; AVX512-NEXT:    vpblendvb %xmm1, %xmm2, %xmm0, %xmm0
; AVX512-NEXT:    vpsrlw $1, %xmm0, %xmm2
; AVX512-NEXT:    vpand {{.*}}(%rip), %xmm2, %xmm2
; AVX512-NEXT:    vpaddb %xmm1, %xmm1, %xmm1
; AVX512-NEXT:    vpblendvb %xmm1, %xmm2, %xmm0, %xmm0
; AVX512-NEXT:    retq
;
; X32-SSE-LABEL: splatvar_shift_v16i8:
; X32-SSE:       # BB#0:
; X32-SSE-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; X32-SSE-NEXT:    pshuflw {{.*#+}} xmm1 = xmm1[0,0,0,0,4,5,6,7]
; X32-SSE-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[0,0,1,1]
; X32-SSE-NEXT:    psllw $5, %xmm2
; X32-SSE-NEXT:    pxor %xmm1, %xmm1
; X32-SSE-NEXT:    pxor %xmm3, %xmm3
; X32-SSE-NEXT:    pcmpgtb %xmm2, %xmm3
; X32-SSE-NEXT:    movdqa %xmm3, %xmm4
; X32-SSE-NEXT:    pandn %xmm0, %xmm4
; X32-SSE-NEXT:    psrlw $4, %xmm0
; X32-SSE-NEXT:    pand {{\.LCPI.*}}, %xmm0
; X32-SSE-NEXT:    pand %xmm3, %xmm0
; X32-SSE-NEXT:    por %xmm4, %xmm0
; X32-SSE-NEXT:    paddb %xmm2, %xmm2
; X32-SSE-NEXT:    pxor %xmm3, %xmm3
; X32-SSE-NEXT:    pcmpgtb %xmm2, %xmm3
; X32-SSE-NEXT:    movdqa %xmm3, %xmm4
; X32-SSE-NEXT:    pandn %xmm0, %xmm4
; X32-SSE-NEXT:    psrlw $2, %xmm0
; X32-SSE-NEXT:    pand {{\.LCPI.*}}, %xmm0
; X32-SSE-NEXT:    pand %xmm3, %xmm0
; X32-SSE-NEXT:    por %xmm4, %xmm0
; X32-SSE-NEXT:    paddb %xmm2, %xmm2
; X32-SSE-NEXT:    pcmpgtb %xmm2, %xmm1
; X32-SSE-NEXT:    movdqa %xmm1, %xmm2
; X32-SSE-NEXT:    pandn %xmm0, %xmm2
; X32-SSE-NEXT:    psrlw $1, %xmm0
; X32-SSE-NEXT:    pand {{\.LCPI.*}}, %xmm0
; X32-SSE-NEXT:    pand %xmm1, %xmm0
; X32-SSE-NEXT:    por %xmm2, %xmm0
; X32-SSE-NEXT:    retl
  %splat = shufflevector <16 x i8> %b, <16 x i8> undef, <16 x i32> zeroinitializer
  %shift = lshr <16 x i8> %a, %splat
  ret <16 x i8> %shift
}

768;
769; Constant Shifts
770;
771
; Logical right shift of <2 x i64> %a by the constant amounts <1, 7>.
; Lowering recorded by the checks below:
;  - SSE2, SSE4.1, AVX1 and the 32-bit SSE2 run shift once by 7 and once by 1
;    (psrlq with immediates) and merge the two results with movsd or pblendw.
;  - AVX2, XOP+AVX2 and AVX512BW use vpsrlvq with a RIP-relative memory operand.
;  - XOP+AVX1 subtracts a RIP-relative memory operand from zero (vpsubq) and
;    uses vpshlq.
define <2 x i64> @constant_shift_v2i64(<2 x i64> %a) nounwind {
; SSE2-LABEL: constant_shift_v2i64:
; SSE2:       # BB#0:
; SSE2-NEXT:    movdqa %xmm0, %xmm1
; SSE2-NEXT:    psrlq $7, %xmm1
; SSE2-NEXT:    psrlq $1, %xmm0
; SSE2-NEXT:    movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1]
; SSE2-NEXT:    movapd %xmm1, %xmm0
; SSE2-NEXT:    retq
;
; SSE41-LABEL: constant_shift_v2i64:
; SSE41:       # BB#0:
; SSE41-NEXT:    movdqa %xmm0, %xmm1
; SSE41-NEXT:    psrlq $7, %xmm1
; SSE41-NEXT:    psrlq $1, %xmm0
; SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5,6,7]
; SSE41-NEXT:    retq
;
; AVX1-LABEL: constant_shift_v2i64:
; AVX1:       # BB#0:
; AVX1-NEXT:    vpsrlq $7, %xmm0, %xmm1
; AVX1-NEXT:    vpsrlq $1, %xmm0, %xmm0
; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5,6,7]
; AVX1-NEXT:    retq
;
; AVX2-LABEL: constant_shift_v2i64:
; AVX2:       # BB#0:
; AVX2-NEXT:    vpsrlvq {{.*}}(%rip), %xmm0, %xmm0
; AVX2-NEXT:    retq
;
; XOPAVX1-LABEL: constant_shift_v2i64:
; XOPAVX1:       # BB#0:
; XOPAVX1-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; XOPAVX1-NEXT:    vpsubq {{.*}}(%rip), %xmm1, %xmm1
; XOPAVX1-NEXT:    vpshlq %xmm1, %xmm0, %xmm0
; XOPAVX1-NEXT:    retq
;
; XOPAVX2-LABEL: constant_shift_v2i64:
; XOPAVX2:       # BB#0:
; XOPAVX2-NEXT:    vpsrlvq {{.*}}(%rip), %xmm0, %xmm0
; XOPAVX2-NEXT:    retq
;
; AVX512-LABEL: constant_shift_v2i64:
; AVX512:       ## BB#0:
; AVX512-NEXT:    vpsrlvq {{.*}}(%rip), %xmm0, %xmm0
; AVX512-NEXT:    retq
;
; X32-SSE-LABEL: constant_shift_v2i64:
; X32-SSE:       # BB#0:
; X32-SSE-NEXT:    movdqa %xmm0, %xmm1
; X32-SSE-NEXT:    psrlq $7, %xmm1
; X32-SSE-NEXT:    psrlq $1, %xmm0
; X32-SSE-NEXT:    movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1]
; X32-SSE-NEXT:    movapd %xmm1, %xmm0
; X32-SSE-NEXT:    retl
  %shift = lshr <2 x i64> %a, <i64 1, i64 7>
  ret <2 x i64> %shift
}

; Test case - logical right shift of <4 x i32> %a by the per-lane constants
; <4, 5, 6, 7>. The assertion lines are generated (see the note at the top of
; this file), so regenerate them instead of editing by hand.
; Lowering pinned by the assertions below -
;   * SSE2 and the i686 configuration issue four immediate psrld shifts
;     (7, 5, 6, 4) and recombine the lanes with movsd, pshufd and punpckldq.
;   * SSE41 and AVX1 issue the same four shifts and recombine them with three
;     (v)pblendw instructions.
;   * AVX2, XOPAVX2 and AVX512 use one vpsrlvd with a constant-pool operand.
;   * XOPAVX1 uses one vpshld with a constant-pool operand.
831define <4 x i32> @constant_shift_v4i32(<4 x i32> %a) nounwind {
832; SSE2-LABEL: constant_shift_v4i32:
833; SSE2:       # BB#0:
834; SSE2-NEXT:    movdqa %xmm0, %xmm1
835; SSE2-NEXT:    psrld $7, %xmm1
836; SSE2-NEXT:    movdqa %xmm0, %xmm2
837; SSE2-NEXT:    psrld $5, %xmm2
838; SSE2-NEXT:    movsd {{.*#+}} xmm1 = xmm2[0],xmm1[1]
839; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,3,2,3]
840; SSE2-NEXT:    movdqa %xmm0, %xmm2
841; SSE2-NEXT:    psrld $6, %xmm2
842; SSE2-NEXT:    psrld $4, %xmm0
843; SSE2-NEXT:    movsd {{.*#+}} xmm2 = xmm0[0],xmm2[1]
844; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3]
845; SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
846; SSE2-NEXT:    retq
847;
848; SSE41-LABEL: constant_shift_v4i32:
849; SSE41:       # BB#0:
850; SSE41-NEXT:    movdqa %xmm0, %xmm1
851; SSE41-NEXT:    psrld $7, %xmm1
852; SSE41-NEXT:    movdqa %xmm0, %xmm2
853; SSE41-NEXT:    psrld $5, %xmm2
854; SSE41-NEXT:    pblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm1[4,5,6,7]
855; SSE41-NEXT:    movdqa %xmm0, %xmm1
856; SSE41-NEXT:    psrld $6, %xmm1
857; SSE41-NEXT:    psrld $4, %xmm0
858; SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5,6,7]
859; SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7]
860; SSE41-NEXT:    retq
861;
862; AVX1-LABEL: constant_shift_v4i32:
863; AVX1:       # BB#0:
864; AVX1-NEXT:    vpsrld $7, %xmm0, %xmm1
865; AVX1-NEXT:    vpsrld $5, %xmm0, %xmm2
866; AVX1-NEXT:    vpblendw {{.*#+}} xmm1 = xmm2[0,1,2,3],xmm1[4,5,6,7]
867; AVX1-NEXT:    vpsrld $6, %xmm0, %xmm2
868; AVX1-NEXT:    vpsrld $4, %xmm0, %xmm0
869; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm2[4,5,6,7]
870; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7]
871; AVX1-NEXT:    retq
872;
873; AVX2-LABEL: constant_shift_v4i32:
874; AVX2:       # BB#0:
875; AVX2-NEXT:    vpsrlvd {{.*}}(%rip), %xmm0, %xmm0
876; AVX2-NEXT:    retq
877;
878; XOPAVX1-LABEL: constant_shift_v4i32:
879; XOPAVX1:       # BB#0:
880; XOPAVX1-NEXT:    vpshld {{.*}}(%rip), %xmm0, %xmm0
881; XOPAVX1-NEXT:    retq
882;
883; XOPAVX2-LABEL: constant_shift_v4i32:
884; XOPAVX2:       # BB#0:
885; XOPAVX2-NEXT:    vpsrlvd {{.*}}(%rip), %xmm0, %xmm0
886; XOPAVX2-NEXT:    retq
887;
888; AVX512-LABEL: constant_shift_v4i32:
889; AVX512:       ## BB#0:
890; AVX512-NEXT:    vpsrlvd {{.*}}(%rip), %xmm0, %xmm0
891; AVX512-NEXT:    retq
892;
893; X32-SSE-LABEL: constant_shift_v4i32:
894; X32-SSE:       # BB#0:
895; X32-SSE-NEXT:    movdqa %xmm0, %xmm1
896; X32-SSE-NEXT:    psrld $7, %xmm1
897; X32-SSE-NEXT:    movdqa %xmm0, %xmm2
898; X32-SSE-NEXT:    psrld $5, %xmm2
899; X32-SSE-NEXT:    movsd {{.*#+}} xmm1 = xmm2[0],xmm1[1]
900; X32-SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,3,2,3]
901; X32-SSE-NEXT:    movdqa %xmm0, %xmm2
902; X32-SSE-NEXT:    psrld $6, %xmm2
903; X32-SSE-NEXT:    psrld $4, %xmm0
904; X32-SSE-NEXT:    movsd {{.*#+}} xmm2 = xmm0[0],xmm2[1]
905; X32-SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3]
906; X32-SSE-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
907; X32-SSE-NEXT:    retl
; IR under test - lane i is shifted by 4 + i.
908  %shift = lshr <4 x i32> %a, <i32 4, i32 5, i32 6, i32 7>
909  ret <4 x i32> %shift
910}
911
; Test case - logical right shift of <8 x i16> %a by the per-lane constants
; <0, 1, 2, 3, 4, 5, 6, 7>. The assertion lines are generated (see the note at
; the top of this file), so regenerate them instead of editing by hand.
; Lowering pinned by the assertions below -
;   * SSE2 and the i686 configuration apply psrlw by 4, then 2, then 1, each
;     time keeping the shifted value only in some lanes (movsd, pshufd and
;     punpckldq for the first two steps, a pand/pandn/por select with a
;     [65535,0,...] mask for the last).
;   * SSE41 and AVX1 apply the same 4/2/1 steps, selecting lanes with
;     (v)pblendw after each shift.
;   * AVX2 widens to 32-bit lanes with vpmovzxwd, shifts with vpsrlvd on ymm,
;     then narrows again with vpshufb and vpermq before vzeroupper.
;   * The shared XOP checks subtract a constant-pool vector from zero
;     (vpxor + vpsubw) and pass the result to vpshlw.
;   * AVX512 materialises [0,1,2,3,4,5,6,7] and uses vpsrlvw on zmm registers.
912define <8 x i16> @constant_shift_v8i16(<8 x i16> %a) nounwind {
913; SSE2-LABEL: constant_shift_v8i16:
914; SSE2:       # BB#0:
915; SSE2-NEXT:    movdqa %xmm0, %xmm1
916; SSE2-NEXT:    psrlw $4, %xmm1
917; SSE2-NEXT:    movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1]
918; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[0,2,2,3]
919; SSE2-NEXT:    psrlw $2, %xmm1
920; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,3,2,3]
921; SSE2-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1]
922; SSE2-NEXT:    movdqa {{.*#+}} xmm0 = [65535,0,65535,0,65535,0,65535,0]
923; SSE2-NEXT:    movdqa %xmm2, %xmm1
924; SSE2-NEXT:    pand %xmm0, %xmm1
925; SSE2-NEXT:    psrlw $1, %xmm2
926; SSE2-NEXT:    pandn %xmm2, %xmm0
927; SSE2-NEXT:    por %xmm1, %xmm0
928; SSE2-NEXT:    retq
929;
930; SSE41-LABEL: constant_shift_v8i16:
931; SSE41:       # BB#0:
932; SSE41-NEXT:    movdqa %xmm0, %xmm1
933; SSE41-NEXT:    psrlw $4, %xmm1
934; SSE41-NEXT:    pblendw {{.*#+}} xmm1 = xmm0[0,1,2,3],xmm1[4,5,6,7]
935; SSE41-NEXT:    movdqa %xmm1, %xmm2
936; SSE41-NEXT:    psrlw $2, %xmm2
937; SSE41-NEXT:    pblendw {{.*#+}} xmm2 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7]
938; SSE41-NEXT:    movdqa %xmm2, %xmm0
939; SSE41-NEXT:    psrlw $1, %xmm0
940; SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm2[0],xmm0[1],xmm2[2],xmm0[3],xmm2[4],xmm0[5],xmm2[6],xmm0[7]
941; SSE41-NEXT:    retq
942;
943; AVX1-LABEL: constant_shift_v8i16:
944; AVX1:       # BB#0:
945; AVX1-NEXT:    vpsrlw $4, %xmm0, %xmm1
946; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5,6,7]
947; AVX1-NEXT:    vpsrlw $2, %xmm0, %xmm1
948; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7]
949; AVX1-NEXT:    vpsrlw $1, %xmm0, %xmm1
950; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3],xmm0[4],xmm1[5],xmm0[6],xmm1[7]
951; AVX1-NEXT:    retq
952;
953; AVX2-LABEL: constant_shift_v8i16:
954; AVX2:       # BB#0:
955; AVX2-NEXT:    vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
956; AVX2-NEXT:    vpsrlvd {{.*}}(%rip), %ymm0, %ymm0
957; AVX2-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero,ymm0[16,17,20,21,24,25,28,29],zero,zero,zero,zero,zero,zero,zero,zero
958; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
959; AVX2-NEXT:    # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
960; AVX2-NEXT:    vzeroupper
961; AVX2-NEXT:    retq
962;
963; XOP-LABEL: constant_shift_v8i16:
964; XOP:       # BB#0:
965; XOP-NEXT:    vpxor %xmm1, %xmm1, %xmm1
966; XOP-NEXT:    vpsubw {{.*}}(%rip), %xmm1, %xmm1
967; XOP-NEXT:    vpshlw %xmm1, %xmm0, %xmm0
968; XOP-NEXT:    retq
969;
970; AVX512-LABEL: constant_shift_v8i16:
971; AVX512:       ## BB#0:
972; AVX512-NEXT:    ## kill: %XMM0<def> %XMM0<kill> %ZMM0<def>
973; AVX512-NEXT:    vmovdqa {{.*#+}} xmm1 = [0,1,2,3,4,5,6,7]
974; AVX512-NEXT:    vpsrlvw %zmm1, %zmm0, %zmm0
975; AVX512-NEXT:    ## kill: %XMM0<def> %XMM0<kill> %ZMM0<kill>
976; AVX512-NEXT:    retq
977;
978; X32-SSE-LABEL: constant_shift_v8i16:
979; X32-SSE:       # BB#0:
980; X32-SSE-NEXT:    movdqa %xmm0, %xmm1
981; X32-SSE-NEXT:    psrlw $4, %xmm1
982; X32-SSE-NEXT:    movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1]
983; X32-SSE-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[0,2,2,3]
984; X32-SSE-NEXT:    psrlw $2, %xmm1
985; X32-SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,3,2,3]
986; X32-SSE-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1]
987; X32-SSE-NEXT:    movdqa {{.*#+}} xmm0 = [65535,0,65535,0,65535,0,65535,0]
988; X32-SSE-NEXT:    movdqa %xmm2, %xmm1
989; X32-SSE-NEXT:    pand %xmm0, %xmm1
990; X32-SSE-NEXT:    psrlw $1, %xmm2
991; X32-SSE-NEXT:    pandn %xmm2, %xmm0
992; X32-SSE-NEXT:    por %xmm1, %xmm0
993; X32-SSE-NEXT:    retl
; IR under test - lane i is shifted by i, so lane 0 is left unchanged.
994  %shift = lshr <8 x i16> %a, <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7>
995  ret <8 x i16> %shift
996}
997
; Test case - logical right shift of <16 x i8> %a by the per-lane constants
; <0..7, 7..0>. The assertion lines are generated (see the note at the top of
; this file), so regenerate them instead of editing by hand.
; Lowering pinned by the assertions below -
;   * SSE2 and the i686 configuration load the amount vector, shift it left by
;     5 (psllw), then run three rounds for shift steps 4, 2 and 1. Each round
;     builds a byte mask with pcmpgtb against zero, selects between the
;     unshifted value and the psrlw + pand result via pandn/pand/por, and
;     doubles the amount vector with paddb before the next round.
;   * SSE41, the shared AVX checks and AVX512 follow the same three-round
;     scheme but select with (v)pblendvb instead of the compare-and-mask
;     sequence.
;   * The shared XOP checks subtract a constant-pool vector from zero
;     (vpxor + vpsubb) and pass the result to vpshlb.
; NOTE(review): the pand after every psrlw presumably clears bits that the
; 16-bit shift moves in from the neighbouring byte - confirm against the
; lowering code.
998define <16 x i8> @constant_shift_v16i8(<16 x i8> %a) nounwind {
999; SSE2-LABEL: constant_shift_v16i8:
1000; SSE2:       # BB#0:
1001; SSE2-NEXT:    movdqa {{.*#+}} xmm2 = [0,1,2,3,4,5,6,7,7,6,5,4,3,2,1,0]
1002; SSE2-NEXT:    psllw $5, %xmm2
1003; SSE2-NEXT:    pxor %xmm1, %xmm1
1004; SSE2-NEXT:    pxor %xmm3, %xmm3
1005; SSE2-NEXT:    pcmpgtb %xmm2, %xmm3
1006; SSE2-NEXT:    movdqa %xmm3, %xmm4
1007; SSE2-NEXT:    pandn %xmm0, %xmm4
1008; SSE2-NEXT:    psrlw $4, %xmm0
1009; SSE2-NEXT:    pand {{.*}}(%rip), %xmm0
1010; SSE2-NEXT:    pand %xmm3, %xmm0
1011; SSE2-NEXT:    por %xmm4, %xmm0
1012; SSE2-NEXT:    paddb %xmm2, %xmm2
1013; SSE2-NEXT:    pxor %xmm3, %xmm3
1014; SSE2-NEXT:    pcmpgtb %xmm2, %xmm3
1015; SSE2-NEXT:    movdqa %xmm3, %xmm4
1016; SSE2-NEXT:    pandn %xmm0, %xmm4
1017; SSE2-NEXT:    psrlw $2, %xmm0
1018; SSE2-NEXT:    pand {{.*}}(%rip), %xmm0
1019; SSE2-NEXT:    pand %xmm3, %xmm0
1020; SSE2-NEXT:    por %xmm4, %xmm0
1021; SSE2-NEXT:    paddb %xmm2, %xmm2
1022; SSE2-NEXT:    pcmpgtb %xmm2, %xmm1
1023; SSE2-NEXT:    movdqa %xmm1, %xmm2
1024; SSE2-NEXT:    pandn %xmm0, %xmm2
1025; SSE2-NEXT:    psrlw $1, %xmm0
1026; SSE2-NEXT:    pand {{.*}}(%rip), %xmm0
1027; SSE2-NEXT:    pand %xmm1, %xmm0
1028; SSE2-NEXT:    por %xmm2, %xmm0
1029; SSE2-NEXT:    retq
1030;
1031; SSE41-LABEL: constant_shift_v16i8:
1032; SSE41:       # BB#0:
1033; SSE41-NEXT:    movdqa %xmm0, %xmm1
1034; SSE41-NEXT:    movdqa {{.*#+}} xmm0 = [0,1,2,3,4,5,6,7,7,6,5,4,3,2,1,0]
1035; SSE41-NEXT:    psllw $5, %xmm0
1036; SSE41-NEXT:    movdqa %xmm1, %xmm2
1037; SSE41-NEXT:    psrlw $4, %xmm2
1038; SSE41-NEXT:    pand {{.*}}(%rip), %xmm2
1039; SSE41-NEXT:    pblendvb %xmm2, %xmm1
1040; SSE41-NEXT:    movdqa %xmm1, %xmm2
1041; SSE41-NEXT:    psrlw $2, %xmm2
1042; SSE41-NEXT:    pand {{.*}}(%rip), %xmm2
1043; SSE41-NEXT:    paddb %xmm0, %xmm0
1044; SSE41-NEXT:    pblendvb %xmm2, %xmm1
1045; SSE41-NEXT:    movdqa %xmm1, %xmm2
1046; SSE41-NEXT:    psrlw $1, %xmm2
1047; SSE41-NEXT:    pand {{.*}}(%rip), %xmm2
1048; SSE41-NEXT:    paddb %xmm0, %xmm0
1049; SSE41-NEXT:    pblendvb %xmm2, %xmm1
1050; SSE41-NEXT:    movdqa %xmm1, %xmm0
1051; SSE41-NEXT:    retq
1052;
1053; AVX-LABEL: constant_shift_v16i8:
1054; AVX:       # BB#0:
1055; AVX-NEXT:    vmovdqa {{.*#+}} xmm1 = [0,1,2,3,4,5,6,7,7,6,5,4,3,2,1,0]
1056; AVX-NEXT:    vpsllw $5, %xmm1, %xmm1
1057; AVX-NEXT:    vpsrlw $4, %xmm0, %xmm2
1058; AVX-NEXT:    vpand {{.*}}(%rip), %xmm2, %xmm2
1059; AVX-NEXT:    vpblendvb %xmm1, %xmm2, %xmm0, %xmm0
1060; AVX-NEXT:    vpsrlw $2, %xmm0, %xmm2
1061; AVX-NEXT:    vpand {{.*}}(%rip), %xmm2, %xmm2
1062; AVX-NEXT:    vpaddb %xmm1, %xmm1, %xmm1
1063; AVX-NEXT:    vpblendvb %xmm1, %xmm2, %xmm0, %xmm0
1064; AVX-NEXT:    vpsrlw $1, %xmm0, %xmm2
1065; AVX-NEXT:    vpand {{.*}}(%rip), %xmm2, %xmm2
1066; AVX-NEXT:    vpaddb %xmm1, %xmm1, %xmm1
1067; AVX-NEXT:    vpblendvb %xmm1, %xmm2, %xmm0, %xmm0
1068; AVX-NEXT:    retq
1069;
1070; XOP-LABEL: constant_shift_v16i8:
1071; XOP:       # BB#0:
1072; XOP-NEXT:    vpxor %xmm1, %xmm1, %xmm1
1073; XOP-NEXT:    vpsubb {{.*}}(%rip), %xmm1, %xmm1
1074; XOP-NEXT:    vpshlb %xmm1, %xmm0, %xmm0
1075; XOP-NEXT:    retq
1076;
1077; AVX512-LABEL: constant_shift_v16i8:
1078; AVX512:       ## BB#0:
1079; AVX512-NEXT:    vmovdqa {{.*#+}} xmm1 = [0,1,2,3,4,5,6,7,7,6,5,4,3,2,1,0]
1080; AVX512-NEXT:    vpsllw $5, %xmm1, %xmm1
1081; AVX512-NEXT:    vpsrlw $4, %xmm0, %xmm2
1082; AVX512-NEXT:    vpand {{.*}}(%rip), %xmm2, %xmm2
1083; AVX512-NEXT:    vpblendvb %xmm1, %xmm2, %xmm0, %xmm0
1084; AVX512-NEXT:    vpsrlw $2, %xmm0, %xmm2
1085; AVX512-NEXT:    vpand {{.*}}(%rip), %xmm2, %xmm2
1086; AVX512-NEXT:    vpaddb %xmm1, %xmm1, %xmm1
1087; AVX512-NEXT:    vpblendvb %xmm1, %xmm2, %xmm0, %xmm0
1088; AVX512-NEXT:    vpsrlw $1, %xmm0, %xmm2
1089; AVX512-NEXT:    vpand {{.*}}(%rip), %xmm2, %xmm2
1090; AVX512-NEXT:    vpaddb %xmm1, %xmm1, %xmm1
1091; AVX512-NEXT:    vpblendvb %xmm1, %xmm2, %xmm0, %xmm0
1092; AVX512-NEXT:    retq
1093;
1094; X32-SSE-LABEL: constant_shift_v16i8:
1095; X32-SSE:       # BB#0:
1096; X32-SSE-NEXT:    movdqa {{.*#+}} xmm2 = [0,1,2,3,4,5,6,7,7,6,5,4,3,2,1,0]
1097; X32-SSE-NEXT:    psllw $5, %xmm2
1098; X32-SSE-NEXT:    pxor %xmm1, %xmm1
1099; X32-SSE-NEXT:    pxor %xmm3, %xmm3
1100; X32-SSE-NEXT:    pcmpgtb %xmm2, %xmm3
1101; X32-SSE-NEXT:    movdqa %xmm3, %xmm4
1102; X32-SSE-NEXT:    pandn %xmm0, %xmm4
1103; X32-SSE-NEXT:    psrlw $4, %xmm0
1104; X32-SSE-NEXT:    pand {{\.LCPI.*}}, %xmm0
1105; X32-SSE-NEXT:    pand %xmm3, %xmm0
1106; X32-SSE-NEXT:    por %xmm4, %xmm0
1107; X32-SSE-NEXT:    paddb %xmm2, %xmm2
1108; X32-SSE-NEXT:    pxor %xmm3, %xmm3
1109; X32-SSE-NEXT:    pcmpgtb %xmm2, %xmm3
1110; X32-SSE-NEXT:    movdqa %xmm3, %xmm4
1111; X32-SSE-NEXT:    pandn %xmm0, %xmm4
1112; X32-SSE-NEXT:    psrlw $2, %xmm0
1113; X32-SSE-NEXT:    pand {{\.LCPI.*}}, %xmm0
1114; X32-SSE-NEXT:    pand %xmm3, %xmm0
1115; X32-SSE-NEXT:    por %xmm4, %xmm0
1116; X32-SSE-NEXT:    paddb %xmm2, %xmm2
1117; X32-SSE-NEXT:    pcmpgtb %xmm2, %xmm1
1118; X32-SSE-NEXT:    movdqa %xmm1, %xmm2
1119; X32-SSE-NEXT:    pandn %xmm0, %xmm2
1120; X32-SSE-NEXT:    psrlw $1, %xmm0
1121; X32-SSE-NEXT:    pand {{\.LCPI.*}}, %xmm0
1122; X32-SSE-NEXT:    pand %xmm1, %xmm0
1123; X32-SSE-NEXT:    por %xmm2, %xmm0
1124; X32-SSE-NEXT:    retl
; IR under test - amounts ascend 0..7 over the low eight lanes and descend
; 7..0 over the high eight.
1125  %shift = lshr <16 x i8> %a, <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0>
1126  ret <16 x i8> %shift
1127}
1128
1129;
1130; Uniform Constant Shifts
1131;
1132
; Test case - logical right shift of <2 x i64> %a by the same constant, 7, in
; both lanes. The assertion lines are generated (see the note at the top of
; this file), so regenerate them instead of editing by hand.
; Every configuration is expected to emit a single immediate shift - psrlq $7
; for the SSE and i686 checks, vpsrlq $7 for the AVX, XOP and AVX512 checks.
1133define <2 x i64> @splatconstant_shift_v2i64(<2 x i64> %a) nounwind {
1134; SSE-LABEL: splatconstant_shift_v2i64:
1135; SSE:       # BB#0:
1136; SSE-NEXT:    psrlq $7, %xmm0
1137; SSE-NEXT:    retq
1138;
1139; AVX-LABEL: splatconstant_shift_v2i64:
1140; AVX:       # BB#0:
1141; AVX-NEXT:    vpsrlq $7, %xmm0, %xmm0
1142; AVX-NEXT:    retq
1143;
1144; XOP-LABEL: splatconstant_shift_v2i64:
1145; XOP:       # BB#0:
1146; XOP-NEXT:    vpsrlq $7, %xmm0, %xmm0
1147; XOP-NEXT:    retq
1148;
1149; AVX512-LABEL: splatconstant_shift_v2i64:
1150; AVX512:       ## BB#0:
1151; AVX512-NEXT:    vpsrlq $7, %xmm0, %xmm0
1152; AVX512-NEXT:    retq
1153;
1154; X32-SSE-LABEL: splatconstant_shift_v2i64:
1155; X32-SSE:       # BB#0:
1156; X32-SSE-NEXT:    psrlq $7, %xmm0
1157; X32-SSE-NEXT:    retl
; IR under test - uniform shift amount of 7.
1158  %shift = lshr <2 x i64> %a, <i64 7, i64 7>
1159  ret <2 x i64> %shift
1160}
1161
; Test case - logical right shift of <4 x i32> %a by the same constant, 5, in
; all four lanes. The assertion lines are generated (see the note at the top
; of this file), so regenerate them instead of editing by hand.
; Every configuration is expected to emit a single immediate shift - psrld $5
; for the SSE and i686 checks, vpsrld $5 for the AVX, XOP and AVX512 checks.
1162define <4 x i32> @splatconstant_shift_v4i32(<4 x i32> %a) nounwind {
1163; SSE-LABEL: splatconstant_shift_v4i32:
1164; SSE:       # BB#0:
1165; SSE-NEXT:    psrld $5, %xmm0
1166; SSE-NEXT:    retq
1167;
1168; AVX-LABEL: splatconstant_shift_v4i32:
1169; AVX:       # BB#0:
1170; AVX-NEXT:    vpsrld $5, %xmm0, %xmm0
1171; AVX-NEXT:    retq
1172;
1173; XOP-LABEL: splatconstant_shift_v4i32:
1174; XOP:       # BB#0:
1175; XOP-NEXT:    vpsrld $5, %xmm0, %xmm0
1176; XOP-NEXT:    retq
1177;
1178; AVX512-LABEL: splatconstant_shift_v4i32:
1179; AVX512:       ## BB#0:
1180; AVX512-NEXT:    vpsrld $5, %xmm0, %xmm0
1181; AVX512-NEXT:    retq
1182;
1183; X32-SSE-LABEL: splatconstant_shift_v4i32:
1184; X32-SSE:       # BB#0:
1185; X32-SSE-NEXT:    psrld $5, %xmm0
1186; X32-SSE-NEXT:    retl
; IR under test - uniform shift amount of 5.
1187  %shift = lshr <4 x i32> %a, <i32 5, i32 5, i32 5, i32 5>
1188  ret <4 x i32> %shift
1189}
1190
; Test case - logical right shift of <8 x i16> %a by the same constant, 3, in
; all eight lanes. The assertion lines are generated (see the note at the top
; of this file), so regenerate them instead of editing by hand.
; Every configuration is expected to emit a single immediate shift - psrlw $3
; for the SSE and i686 checks, vpsrlw $3 for the AVX, XOP and AVX512 checks.
1191define <8 x i16> @splatconstant_shift_v8i16(<8 x i16> %a) nounwind {
1192; SSE-LABEL: splatconstant_shift_v8i16:
1193; SSE:       # BB#0:
1194; SSE-NEXT:    psrlw $3, %xmm0
1195; SSE-NEXT:    retq
1196;
1197; AVX-LABEL: splatconstant_shift_v8i16:
1198; AVX:       # BB#0:
1199; AVX-NEXT:    vpsrlw $3, %xmm0, %xmm0
1200; AVX-NEXT:    retq
1201;
1202; XOP-LABEL: splatconstant_shift_v8i16:
1203; XOP:       # BB#0:
1204; XOP-NEXT:    vpsrlw $3, %xmm0, %xmm0
1205; XOP-NEXT:    retq
1206;
1207; AVX512-LABEL: splatconstant_shift_v8i16:
1208; AVX512:       ## BB#0:
1209; AVX512-NEXT:    vpsrlw $3, %xmm0, %xmm0
1210; AVX512-NEXT:    retq
1211;
1212; X32-SSE-LABEL: splatconstant_shift_v8i16:
1213; X32-SSE:       # BB#0:
1214; X32-SSE-NEXT:    psrlw $3, %xmm0
1215; X32-SSE-NEXT:    retl
; IR under test - uniform shift amount of 3.
1216  %shift = lshr <8 x i16> %a, <i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3>
1217  ret <8 x i16> %shift
1218}
1219
; Test case - logical right shift of <16 x i8> %a by the same constant, 3, in
; all sixteen lanes. The assertion lines are generated (see the note at the
; top of this file), so regenerate them instead of editing by hand.
; Lowering pinned by the assertions below -
;   * The SSE, AVX, AVX512 and i686 checks expect a 16-bit shift
;     ((v)psrlw $3) followed by a (v)pand with a constant-pool mask.
;   * The shared XOP checks subtract a constant-pool vector from zero
;     (vpxor + vpsubb) and pass the result to vpshlb.
; NOTE(review): the mask presumably clears the bits that the 16-bit shift
; moves in from the neighbouring byte - confirm against the lowering code.
1220define <16 x i8> @splatconstant_shift_v16i8(<16 x i8> %a) nounwind {
1221; SSE-LABEL: splatconstant_shift_v16i8:
1222; SSE:       # BB#0:
1223; SSE-NEXT:    psrlw $3, %xmm0
1224; SSE-NEXT:    pand {{.*}}(%rip), %xmm0
1225; SSE-NEXT:    retq
1226;
1227; AVX-LABEL: splatconstant_shift_v16i8:
1228; AVX:       # BB#0:
1229; AVX-NEXT:    vpsrlw $3, %xmm0, %xmm0
1230; AVX-NEXT:    vpand {{.*}}(%rip), %xmm0, %xmm0
1231; AVX-NEXT:    retq
1232;
1233; XOP-LABEL: splatconstant_shift_v16i8:
1234; XOP:       # BB#0:
1235; XOP-NEXT:    vpxor %xmm1, %xmm1, %xmm1
1236; XOP-NEXT:    vpsubb {{.*}}(%rip), %xmm1, %xmm1
1237; XOP-NEXT:    vpshlb %xmm1, %xmm0, %xmm0
1238; XOP-NEXT:    retq
1239;
1240; AVX512-LABEL: splatconstant_shift_v16i8:
1241; AVX512:       ## BB#0:
1242; AVX512-NEXT:    vpsrlw $3, %xmm0, %xmm0
1243; AVX512-NEXT:    vpand {{.*}}(%rip), %xmm0, %xmm0
1244; AVX512-NEXT:    retq
1245;
1246; X32-SSE-LABEL: splatconstant_shift_v16i8:
1247; X32-SSE:       # BB#0:
1248; X32-SSE-NEXT:    psrlw $3, %xmm0
1249; X32-SSE-NEXT:    pand {{\.LCPI.*}}, %xmm0
1250; X32-SSE-NEXT:    retl
; IR under test - uniform shift amount of 3.
1251  %shift = lshr <16 x i8> %a, <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3>
1252  ret <16 x i8> %shift
1253}
1254