1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2; RUN: llc < %s -mtriple=x86_64-unknown-unknown | FileCheck %s --check-prefixes=SSE,SSE2
3; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+ssse3 | FileCheck %s --check-prefixes=SSE,SSSE3
4; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefixes=SSE,SSE41
5; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefix=AVX
6; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=AVX
7
8;
9; Unary shuffle indices from registers
10;
11
; Builds a <2 x double> whose lanes 0 and 1 are the elements of %x selected by
; the run-time i64 indices %i0 and %i1 (a variable "shuffle" of one input).
; The expected code for both check prefixes masks each index register with 1,
; stores %x to a stack slot, and reloads the two chosen elements with a scalar
; load followed by a high-half load.
12define <2 x double> @var_shuffle_v2f64_v2f64_xx_i64(<2 x double> %x, i64 %i0, i64 %i1) nounwind {
13; SSE-LABEL: var_shuffle_v2f64_v2f64_xx_i64:
14; SSE:       # %bb.0:
15; SSE-NEXT:    andl $1, %esi
16; SSE-NEXT:    andl $1, %edi
17; SSE-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
18; SSE-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
19; SSE-NEXT:    movhps {{.*#+}} xmm0 = xmm0[0,1],mem[0,1]
20; SSE-NEXT:    retq
21;
22; AVX-LABEL: var_shuffle_v2f64_v2f64_xx_i64:
23; AVX:       # %bb.0:
24; AVX-NEXT:    andl $1, %esi
25; AVX-NEXT:    andl $1, %edi
26; AVX-NEXT:    vmovaps %xmm0, -{{[0-9]+}}(%rsp)
27; AVX-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
28; AVX-NEXT:    vmovhps {{.*#+}} xmm0 = xmm0[0,1],mem[0,1]
29; AVX-NEXT:    retq
; Read the two selected elements, then rebuild the result lane by lane
; starting from an undef vector.
30  %x0 = extractelement <2 x double> %x, i64 %i0
31  %x1 = extractelement <2 x double> %x, i64 %i1
32  %r0 = insertelement <2 x double> undef, double %x0, i32 0
33  %r1 = insertelement <2 x double>   %r0, double %x1, i32 1
34  ret <2 x double> %r1
35}
36
; Builds a <2 x i64> whose lanes 0 and 1 are the elements of %x selected by
; the run-time indices %i0 and %i1.
; NOTE(review): the function name ends in "_i64" but the index parameters are
; declared i32; the "kill" lines in the expected output reflect the 32-bit
; argument registers being widened to their 64-bit forms.
; The expected code masks each index with 1, stores %x to a stack slot, loads
; the two chosen elements as scalars and combines them with (v)movlhps. The two
; check prefixes differ only in which xmm register receives which load.
37define <2 x i64> @var_shuffle_v2i64_v2i64_xx_i64(<2 x i64> %x, i32 %i0, i32 %i1) nounwind {
38; SSE-LABEL: var_shuffle_v2i64_v2i64_xx_i64:
39; SSE:       # %bb.0:
40; SSE-NEXT:    # kill: def $esi killed $esi def $rsi
41; SSE-NEXT:    # kill: def $edi killed $edi def $rdi
42; SSE-NEXT:    andl $1, %edi
43; SSE-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
44; SSE-NEXT:    andl $1, %esi
45; SSE-NEXT:    movsd {{.*#+}} xmm1 = mem[0],zero
46; SSE-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
47; SSE-NEXT:    movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
48; SSE-NEXT:    retq
49;
50; AVX-LABEL: var_shuffle_v2i64_v2i64_xx_i64:
51; AVX:       # %bb.0:
52; AVX-NEXT:    # kill: def $esi killed $esi def $rsi
53; AVX-NEXT:    # kill: def $edi killed $edi def $rdi
54; AVX-NEXT:    andl $1, %edi
55; AVX-NEXT:    vmovaps %xmm0, -{{[0-9]+}}(%rsp)
56; AVX-NEXT:    andl $1, %esi
57; AVX-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
58; AVX-NEXT:    vmovsd {{.*#+}} xmm1 = mem[0],zero
59; AVX-NEXT:    vmovlhps {{.*#+}} xmm0 = xmm1[0],xmm0[0]
60; AVX-NEXT:    retq
; Read the two selected elements, then rebuild the result lane by lane
; starting from an undef vector.
61  %x0 = extractelement <2 x i64> %x, i32 %i0
62  %x1 = extractelement <2 x i64> %x, i32 %i1
63  %r0 = insertelement <2 x i64> undef, i64 %x0, i32 0
64  %r1 = insertelement <2 x i64>   %r0, i64 %x1, i32 1
65  ret <2 x i64> %r1
66}
67
; Builds a <4 x float> whose lanes 0..3 are the elements of %x selected by the
; run-time i32 indices %i0..%i3.
; In every configuration the expected code masks each index with 3 and stores
; %x to a stack slot before reloading the chosen elements. The reassembly
; differs by feature level:
;   - the SSE2 and SSSE3 checks load four scalars with movss and merge them
;     with unpcklps / movlhps;
;   - the SSE4.1 and AVX checks load lane 0 with (v)movss and fill lanes 1..3
;     with three (v)insertps from memory.
68define <4 x float> @var_shuffle_v4f32_v4f32_xxxx_i32(<4 x float> %x, i32 %i0, i32 %i1, i32 %i2, i32 %i3) nounwind {
69; SSE2-LABEL: var_shuffle_v4f32_v4f32_xxxx_i32:
70; SSE2:       # %bb.0:
71; SSE2-NEXT:    # kill: def $ecx killed $ecx def $rcx
72; SSE2-NEXT:    # kill: def $edx killed $edx def $rdx
73; SSE2-NEXT:    # kill: def $esi killed $esi def $rsi
74; SSE2-NEXT:    # kill: def $edi killed $edi def $rdi
75; SSE2-NEXT:    andl $3, %edi
76; SSE2-NEXT:    andl $3, %esi
77; SSE2-NEXT:    andl $3, %edx
78; SSE2-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
79; SSE2-NEXT:    andl $3, %ecx
80; SSE2-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
81; SSE2-NEXT:    movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
82; SSE2-NEXT:    unpcklps {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
83; SSE2-NEXT:    movss {{.*#+}} xmm2 = mem[0],zero,zero,zero
84; SSE2-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
85; SSE2-NEXT:    unpcklps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
86; SSE2-NEXT:    movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
87; SSE2-NEXT:    retq
88;
89; SSSE3-LABEL: var_shuffle_v4f32_v4f32_xxxx_i32:
90; SSSE3:       # %bb.0:
91; SSSE3-NEXT:    # kill: def $ecx killed $ecx def $rcx
92; SSSE3-NEXT:    # kill: def $edx killed $edx def $rdx
93; SSSE3-NEXT:    # kill: def $esi killed $esi def $rsi
94; SSSE3-NEXT:    # kill: def $edi killed $edi def $rdi
95; SSSE3-NEXT:    andl $3, %edi
96; SSSE3-NEXT:    andl $3, %esi
97; SSSE3-NEXT:    andl $3, %edx
98; SSSE3-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
99; SSSE3-NEXT:    andl $3, %ecx
100; SSSE3-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
101; SSSE3-NEXT:    movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
102; SSSE3-NEXT:    unpcklps {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
103; SSSE3-NEXT:    movss {{.*#+}} xmm2 = mem[0],zero,zero,zero
104; SSSE3-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
105; SSSE3-NEXT:    unpcklps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
106; SSSE3-NEXT:    movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
107; SSSE3-NEXT:    retq
108;
109; SSE41-LABEL: var_shuffle_v4f32_v4f32_xxxx_i32:
110; SSE41:       # %bb.0:
111; SSE41-NEXT:    # kill: def $ecx killed $ecx def $rcx
112; SSE41-NEXT:    # kill: def $edx killed $edx def $rdx
113; SSE41-NEXT:    # kill: def $esi killed $esi def $rsi
114; SSE41-NEXT:    # kill: def $edi killed $edi def $rdi
115; SSE41-NEXT:    andl $3, %edi
116; SSE41-NEXT:    andl $3, %esi
117; SSE41-NEXT:    andl $3, %edx
118; SSE41-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
119; SSE41-NEXT:    andl $3, %ecx
120; SSE41-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
121; SSE41-NEXT:    insertps {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[2,3]
122; SSE41-NEXT:    insertps {{.*#+}} xmm0 = xmm0[0,1],mem[0],xmm0[3]
123; SSE41-NEXT:    insertps {{.*#+}} xmm0 = xmm0[0,1,2],mem[0]
124; SSE41-NEXT:    retq
125;
126; AVX-LABEL: var_shuffle_v4f32_v4f32_xxxx_i32:
127; AVX:       # %bb.0:
128; AVX-NEXT:    # kill: def $ecx killed $ecx def $rcx
129; AVX-NEXT:    # kill: def $edx killed $edx def $rdx
130; AVX-NEXT:    # kill: def $esi killed $esi def $rsi
131; AVX-NEXT:    # kill: def $edi killed $edi def $rdi
132; AVX-NEXT:    andl $3, %edi
133; AVX-NEXT:    andl $3, %esi
134; AVX-NEXT:    andl $3, %edx
135; AVX-NEXT:    vmovaps %xmm0, -{{[0-9]+}}(%rsp)
136; AVX-NEXT:    andl $3, %ecx
137; AVX-NEXT:    vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
138; AVX-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[2,3]
139; AVX-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0,1],mem[0],xmm0[3]
140; AVX-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],mem[0]
141; AVX-NEXT:    retq
; Read the four selected elements, then rebuild the result lane by lane
; starting from an undef vector.
142  %x0 = extractelement <4 x float> %x, i32 %i0
143  %x1 = extractelement <4 x float> %x, i32 %i1
144  %x2 = extractelement <4 x float> %x, i32 %i2
145  %x3 = extractelement <4 x float> %x, i32 %i3
146  %r0 = insertelement <4 x float> undef, float %x0, i32 0
147  %r1 = insertelement <4 x float>   %r0, float %x1, i32 1
148  %r2 = insertelement <4 x float>   %r1, float %x2, i32 2
149  %r3 = insertelement <4 x float>   %r2, float %x3, i32 3
150  ret <4 x float> %r3
151}
152
; Integer counterpart of the <4 x float> case: builds a <4 x i32> whose lanes
; 0..3 are the elements of %x selected by the run-time i32 indices %i0..%i3.
; In every configuration the expected code masks each index with 3 and stores
; %x to a stack slot before reloading the chosen elements. The reassembly
; differs by feature level:
;   - the SSE2 and SSSE3 checks load four scalars with movss and merge them
;     with unpcklps / movlhps;
;   - the SSE4.1 and AVX checks load lane 0 with (v)movd and fill lanes 1..3
;     with (v)pinsrd, addressing the stack slot as -24(%rsp,<index>,4).
153define <4 x i32> @var_shuffle_v4i32_v4i32_xxxx_i32(<4 x i32> %x, i32 %i0, i32 %i1, i32 %i2, i32 %i3) nounwind {
154; SSE2-LABEL: var_shuffle_v4i32_v4i32_xxxx_i32:
155; SSE2:       # %bb.0:
156; SSE2-NEXT:    # kill: def $ecx killed $ecx def $rcx
157; SSE2-NEXT:    # kill: def $edx killed $edx def $rdx
158; SSE2-NEXT:    # kill: def $esi killed $esi def $rsi
159; SSE2-NEXT:    # kill: def $edi killed $edi def $rdi
160; SSE2-NEXT:    andl $3, %edi
161; SSE2-NEXT:    andl $3, %esi
162; SSE2-NEXT:    andl $3, %edx
163; SSE2-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
164; SSE2-NEXT:    andl $3, %ecx
165; SSE2-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
166; SSE2-NEXT:    movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
167; SSE2-NEXT:    unpcklps {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
168; SSE2-NEXT:    movss {{.*#+}} xmm2 = mem[0],zero,zero,zero
169; SSE2-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
170; SSE2-NEXT:    unpcklps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
171; SSE2-NEXT:    movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
172; SSE2-NEXT:    retq
173;
174; SSSE3-LABEL: var_shuffle_v4i32_v4i32_xxxx_i32:
175; SSSE3:       # %bb.0:
176; SSSE3-NEXT:    # kill: def $ecx killed $ecx def $rcx
177; SSSE3-NEXT:    # kill: def $edx killed $edx def $rdx
178; SSSE3-NEXT:    # kill: def $esi killed $esi def $rsi
179; SSSE3-NEXT:    # kill: def $edi killed $edi def $rdi
180; SSSE3-NEXT:    andl $3, %edi
181; SSSE3-NEXT:    andl $3, %esi
182; SSSE3-NEXT:    andl $3, %edx
183; SSSE3-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
184; SSSE3-NEXT:    andl $3, %ecx
185; SSSE3-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
186; SSSE3-NEXT:    movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
187; SSSE3-NEXT:    unpcklps {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
188; SSSE3-NEXT:    movss {{.*#+}} xmm2 = mem[0],zero,zero,zero
189; SSSE3-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
190; SSSE3-NEXT:    unpcklps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
191; SSSE3-NEXT:    movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
192; SSSE3-NEXT:    retq
193;
194; SSE41-LABEL: var_shuffle_v4i32_v4i32_xxxx_i32:
195; SSE41:       # %bb.0:
196; SSE41-NEXT:    # kill: def $ecx killed $ecx def $rcx
197; SSE41-NEXT:    # kill: def $edx killed $edx def $rdx
198; SSE41-NEXT:    # kill: def $esi killed $esi def $rsi
199; SSE41-NEXT:    # kill: def $edi killed $edi def $rdi
200; SSE41-NEXT:    andl $3, %edi
201; SSE41-NEXT:    andl $3, %esi
202; SSE41-NEXT:    andl $3, %edx
203; SSE41-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
204; SSE41-NEXT:    andl $3, %ecx
205; SSE41-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
206; SSE41-NEXT:    pinsrd $1, -24(%rsp,%rsi,4), %xmm0
207; SSE41-NEXT:    pinsrd $2, -24(%rsp,%rdx,4), %xmm0
208; SSE41-NEXT:    pinsrd $3, -24(%rsp,%rcx,4), %xmm0
209; SSE41-NEXT:    retq
210;
211; AVX-LABEL: var_shuffle_v4i32_v4i32_xxxx_i32:
212; AVX:       # %bb.0:
213; AVX-NEXT:    # kill: def $ecx killed $ecx def $rcx
214; AVX-NEXT:    # kill: def $edx killed $edx def $rdx
215; AVX-NEXT:    # kill: def $esi killed $esi def $rsi
216; AVX-NEXT:    # kill: def $edi killed $edi def $rdi
217; AVX-NEXT:    andl $3, %edi
218; AVX-NEXT:    andl $3, %esi
219; AVX-NEXT:    andl $3, %edx
220; AVX-NEXT:    vmovaps %xmm0, -{{[0-9]+}}(%rsp)
221; AVX-NEXT:    andl $3, %ecx
222; AVX-NEXT:    vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
223; AVX-NEXT:    vpinsrd $1, -24(%rsp,%rsi,4), %xmm0, %xmm0
224; AVX-NEXT:    vpinsrd $2, -24(%rsp,%rdx,4), %xmm0, %xmm0
225; AVX-NEXT:    vpinsrd $3, -24(%rsp,%rcx,4), %xmm0, %xmm0
226; AVX-NEXT:    retq
; Read the four selected elements, then rebuild the result lane by lane
; starting from an undef vector.
227  %x0 = extractelement <4 x i32> %x, i32 %i0
228  %x1 = extractelement <4 x i32> %x, i32 %i1
229  %x2 = extractelement <4 x i32> %x, i32 %i2
230  %x3 = extractelement <4 x i32> %x, i32 %i3
231  %r0 = insertelement <4 x i32> undef, i32 %x0, i32 0
232  %r1 = insertelement <4 x i32>   %r0, i32 %x1, i32 1
233  %r2 = insertelement <4 x i32>   %r1, i32 %x2, i32 2
234  %r3 = insertelement <4 x i32>   %r2, i32 %x3, i32 3
235  ret <4 x i32> %r3
236}
237
; Builds an <8 x i16> whose lanes 0..7 are the elements of %x selected by the
; run-time i16 indices %i0..%i7.
; In every configuration the expected code reads two of the eight indices from
; the stack with movzwl (the other six arrive in registers), masks every index
; with 7, and stores %x to a stack slot that is then addressed as
; -24(%rsp,<index>,2). The reassembly differs by feature level:
;   - the SSE2 and SSSE3 checks load each element with movzwl + movd and merge
;     the eight scalars with a punpcklwd / punpckldq / punpcklqdq tree;
;   - the SSE4.1 and AVX checks load lane 0 with movzwl + (v)movd and fill
;     lanes 1..7 with (v)pinsrw straight from memory.
238define <8 x i16> @var_shuffle_v8i16_v8i16_xxxxxxxx_i16(<8 x i16> %x, i16 %i0, i16 %i1, i16 %i2, i16 %i3, i16 %i4, i16 %i5, i16 %i6, i16 %i7) nounwind {
239; SSE2-LABEL: var_shuffle_v8i16_v8i16_xxxxxxxx_i16:
240; SSE2:       # %bb.0:
241; SSE2-NEXT:    # kill: def $r9d killed $r9d def $r9
242; SSE2-NEXT:    # kill: def $r8d killed $r8d def $r8
243; SSE2-NEXT:    # kill: def $ecx killed $ecx def $rcx
244; SSE2-NEXT:    # kill: def $edx killed $edx def $rdx
245; SSE2-NEXT:    # kill: def $esi killed $esi def $rsi
246; SSE2-NEXT:    # kill: def $edi killed $edi def $rdi
247; SSE2-NEXT:    movzwl {{[0-9]+}}(%rsp), %r10d
248; SSE2-NEXT:    andl $7, %r10d
249; SSE2-NEXT:    movzwl {{[0-9]+}}(%rsp), %eax
250; SSE2-NEXT:    andl $7, %eax
251; SSE2-NEXT:    andl $7, %edi
252; SSE2-NEXT:    andl $7, %esi
253; SSE2-NEXT:    andl $7, %edx
254; SSE2-NEXT:    andl $7, %ecx
255; SSE2-NEXT:    andl $7, %r8d
256; SSE2-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
257; SSE2-NEXT:    andl $7, %r9d
258; SSE2-NEXT:    movzwl -24(%rsp,%rcx,2), %ecx
259; SSE2-NEXT:    movd %ecx, %xmm0
260; SSE2-NEXT:    movzwl -24(%rsp,%rdx,2), %ecx
261; SSE2-NEXT:    movd %ecx, %xmm1
262; SSE2-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
263; SSE2-NEXT:    movzwl -24(%rsp,%rsi,2), %ecx
264; SSE2-NEXT:    movd %ecx, %xmm2
265; SSE2-NEXT:    movzwl -24(%rsp,%rdi,2), %ecx
266; SSE2-NEXT:    movd %ecx, %xmm0
267; SSE2-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
268; SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
269; SSE2-NEXT:    movzwl -24(%rsp,%r9,2), %ecx
270; SSE2-NEXT:    movd %ecx, %xmm1
271; SSE2-NEXT:    movzwl -24(%rsp,%r8,2), %ecx
272; SSE2-NEXT:    movd %ecx, %xmm2
273; SSE2-NEXT:    punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
274; SSE2-NEXT:    movzwl -24(%rsp,%rax,2), %eax
275; SSE2-NEXT:    movd %eax, %xmm1
276; SSE2-NEXT:    movzwl -24(%rsp,%r10,2), %eax
277; SSE2-NEXT:    movd %eax, %xmm3
278; SSE2-NEXT:    punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3]
279; SSE2-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
280; SSE2-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0]
281; SSE2-NEXT:    retq
282;
283; SSSE3-LABEL: var_shuffle_v8i16_v8i16_xxxxxxxx_i16:
284; SSSE3:       # %bb.0:
285; SSSE3-NEXT:    # kill: def $r9d killed $r9d def $r9
286; SSSE3-NEXT:    # kill: def $r8d killed $r8d def $r8
287; SSSE3-NEXT:    # kill: def $ecx killed $ecx def $rcx
288; SSSE3-NEXT:    # kill: def $edx killed $edx def $rdx
289; SSSE3-NEXT:    # kill: def $esi killed $esi def $rsi
290; SSSE3-NEXT:    # kill: def $edi killed $edi def $rdi
291; SSSE3-NEXT:    movzwl {{[0-9]+}}(%rsp), %r10d
292; SSSE3-NEXT:    andl $7, %r10d
293; SSSE3-NEXT:    movzwl {{[0-9]+}}(%rsp), %eax
294; SSSE3-NEXT:    andl $7, %eax
295; SSSE3-NEXT:    andl $7, %edi
296; SSSE3-NEXT:    andl $7, %esi
297; SSSE3-NEXT:    andl $7, %edx
298; SSSE3-NEXT:    andl $7, %ecx
299; SSSE3-NEXT:    andl $7, %r8d
300; SSSE3-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
301; SSSE3-NEXT:    andl $7, %r9d
302; SSSE3-NEXT:    movzwl -24(%rsp,%rcx,2), %ecx
303; SSSE3-NEXT:    movd %ecx, %xmm0
304; SSSE3-NEXT:    movzwl -24(%rsp,%rdx,2), %ecx
305; SSSE3-NEXT:    movd %ecx, %xmm1
306; SSSE3-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
307; SSSE3-NEXT:    movzwl -24(%rsp,%rsi,2), %ecx
308; SSSE3-NEXT:    movd %ecx, %xmm2
309; SSSE3-NEXT:    movzwl -24(%rsp,%rdi,2), %ecx
310; SSSE3-NEXT:    movd %ecx, %xmm0
311; SSSE3-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
312; SSSE3-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
313; SSSE3-NEXT:    movzwl -24(%rsp,%r9,2), %ecx
314; SSSE3-NEXT:    movd %ecx, %xmm1
315; SSSE3-NEXT:    movzwl -24(%rsp,%r8,2), %ecx
316; SSSE3-NEXT:    movd %ecx, %xmm2
317; SSSE3-NEXT:    punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
318; SSSE3-NEXT:    movzwl -24(%rsp,%rax,2), %eax
319; SSSE3-NEXT:    movd %eax, %xmm1
320; SSSE3-NEXT:    movzwl -24(%rsp,%r10,2), %eax
321; SSSE3-NEXT:    movd %eax, %xmm3
322; SSSE3-NEXT:    punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3]
323; SSSE3-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
324; SSSE3-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0]
325; SSSE3-NEXT:    retq
326;
327; SSE41-LABEL: var_shuffle_v8i16_v8i16_xxxxxxxx_i16:
328; SSE41:       # %bb.0:
329; SSE41-NEXT:    # kill: def $r9d killed $r9d def $r9
330; SSE41-NEXT:    # kill: def $r8d killed $r8d def $r8
331; SSE41-NEXT:    # kill: def $ecx killed $ecx def $rcx
332; SSE41-NEXT:    # kill: def $edx killed $edx def $rdx
333; SSE41-NEXT:    # kill: def $esi killed $esi def $rsi
334; SSE41-NEXT:    # kill: def $edi killed $edi def $rdi
335; SSE41-NEXT:    movzwl {{[0-9]+}}(%rsp), %r10d
336; SSE41-NEXT:    andl $7, %r10d
337; SSE41-NEXT:    movzwl {{[0-9]+}}(%rsp), %eax
338; SSE41-NEXT:    andl $7, %eax
339; SSE41-NEXT:    andl $7, %edi
340; SSE41-NEXT:    andl $7, %esi
341; SSE41-NEXT:    andl $7, %edx
342; SSE41-NEXT:    andl $7, %ecx
343; SSE41-NEXT:    andl $7, %r8d
344; SSE41-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
345; SSE41-NEXT:    andl $7, %r9d
346; SSE41-NEXT:    movzwl -24(%rsp,%rdi,2), %edi
347; SSE41-NEXT:    movd %edi, %xmm0
348; SSE41-NEXT:    pinsrw $1, -24(%rsp,%rsi,2), %xmm0
349; SSE41-NEXT:    pinsrw $2, -24(%rsp,%rdx,2), %xmm0
350; SSE41-NEXT:    pinsrw $3, -24(%rsp,%rcx,2), %xmm0
351; SSE41-NEXT:    pinsrw $4, -24(%rsp,%r8,2), %xmm0
352; SSE41-NEXT:    pinsrw $5, -24(%rsp,%r9,2), %xmm0
353; SSE41-NEXT:    pinsrw $6, -24(%rsp,%rax,2), %xmm0
354; SSE41-NEXT:    pinsrw $7, -24(%rsp,%r10,2), %xmm0
355; SSE41-NEXT:    retq
356;
357; AVX-LABEL: var_shuffle_v8i16_v8i16_xxxxxxxx_i16:
358; AVX:       # %bb.0:
359; AVX-NEXT:    # kill: def $r9d killed $r9d def $r9
360; AVX-NEXT:    # kill: def $r8d killed $r8d def $r8
361; AVX-NEXT:    # kill: def $ecx killed $ecx def $rcx
362; AVX-NEXT:    # kill: def $edx killed $edx def $rdx
363; AVX-NEXT:    # kill: def $esi killed $esi def $rsi
364; AVX-NEXT:    # kill: def $edi killed $edi def $rdi
365; AVX-NEXT:    movzwl {{[0-9]+}}(%rsp), %r10d
366; AVX-NEXT:    andl $7, %r10d
367; AVX-NEXT:    movzwl {{[0-9]+}}(%rsp), %eax
368; AVX-NEXT:    andl $7, %eax
369; AVX-NEXT:    andl $7, %edi
370; AVX-NEXT:    andl $7, %esi
371; AVX-NEXT:    andl $7, %edx
372; AVX-NEXT:    andl $7, %ecx
373; AVX-NEXT:    andl $7, %r8d
374; AVX-NEXT:    vmovaps %xmm0, -{{[0-9]+}}(%rsp)
375; AVX-NEXT:    andl $7, %r9d
376; AVX-NEXT:    movzwl -24(%rsp,%rdi,2), %edi
377; AVX-NEXT:    vmovd %edi, %xmm0
378; AVX-NEXT:    vpinsrw $1, -24(%rsp,%rsi,2), %xmm0, %xmm0
379; AVX-NEXT:    vpinsrw $2, -24(%rsp,%rdx,2), %xmm0, %xmm0
380; AVX-NEXT:    vpinsrw $3, -24(%rsp,%rcx,2), %xmm0, %xmm0
381; AVX-NEXT:    vpinsrw $4, -24(%rsp,%r8,2), %xmm0, %xmm0
382; AVX-NEXT:    vpinsrw $5, -24(%rsp,%r9,2), %xmm0, %xmm0
383; AVX-NEXT:    vpinsrw $6, -24(%rsp,%rax,2), %xmm0, %xmm0
384; AVX-NEXT:    vpinsrw $7, -24(%rsp,%r10,2), %xmm0, %xmm0
385; AVX-NEXT:    retq
; Read the eight selected elements, then rebuild the result lane by lane
; starting from an undef vector.
386  %x0 = extractelement <8 x i16> %x, i16 %i0
387  %x1 = extractelement <8 x i16> %x, i16 %i1
388  %x2 = extractelement <8 x i16> %x, i16 %i2
389  %x3 = extractelement <8 x i16> %x, i16 %i3
390  %x4 = extractelement <8 x i16> %x, i16 %i4
391  %x5 = extractelement <8 x i16> %x, i16 %i5
392  %x6 = extractelement <8 x i16> %x, i16 %i6
393  %x7 = extractelement <8 x i16> %x, i16 %i7
394  %r0 = insertelement <8 x i16> undef, i16 %x0, i32 0
395  %r1 = insertelement <8 x i16>   %r0, i16 %x1, i32 1
396  %r2 = insertelement <8 x i16>   %r1, i16 %x2, i32 2
397  %r3 = insertelement <8 x i16>   %r2, i16 %x3, i32 3
398  %r4 = insertelement <8 x i16>   %r3, i16 %x4, i32 4
399  %r5 = insertelement <8 x i16>   %r4, i16 %x5, i32 5
400  %r6 = insertelement <8 x i16>   %r5, i16 %x6, i32 6
401  %r7 = insertelement <8 x i16>   %r6, i16 %x7, i32 7
402  ret <8 x i16> %r7
403}
404
405define <16 x i8> @var_shuffle_v16i8_v16i8_xxxxxxxxxxxxxxxx_i8(<16 x i8> %x, i8 %i0, i8 %i1, i8 %i2, i8 %i3, i8 %i4, i8 %i5, i8 %i6, i8 %i7, i8 %i8, i8 %i9, i8 %i10, i8 %i11, i8 %i12, i8 %i13, i8 %i14, i8 %i15) nounwind {
406; SSE2-LABEL: var_shuffle_v16i8_v16i8_xxxxxxxxxxxxxxxx_i8:
407; SSE2:       # %bb.0:
408; SSE2-NEXT:    # kill: def $r9d killed $r9d def $r9
409; SSE2-NEXT:    # kill: def $r8d killed $r8d def $r8
410; SSE2-NEXT:    # kill: def $ecx killed $ecx def $rcx
411; SSE2-NEXT:    # kill: def $edx killed $edx def $rdx
412; SSE2-NEXT:    # kill: def $esi killed $esi def $rsi
413; SSE2-NEXT:    # kill: def $edi killed $edi def $rdi
414; SSE2-NEXT:    movzbl {{[0-9]+}}(%rsp), %eax
415; SSE2-NEXT:    andl $15, %eax
416; SSE2-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
417; SSE2-NEXT:    movzbl -24(%rsp,%rax), %eax
418; SSE2-NEXT:    movd %eax, %xmm8
419; SSE2-NEXT:    movzbl {{[0-9]+}}(%rsp), %eax
420; SSE2-NEXT:    andl $15, %eax
421; SSE2-NEXT:    movzbl -24(%rsp,%rax), %eax
422; SSE2-NEXT:    movd %eax, %xmm15
423; SSE2-NEXT:    movzbl {{[0-9]+}}(%rsp), %eax
424; SSE2-NEXT:    andl $15, %eax
425; SSE2-NEXT:    movzbl -24(%rsp,%rax), %eax
426; SSE2-NEXT:    movd %eax, %xmm9
427; SSE2-NEXT:    movzbl {{[0-9]+}}(%rsp), %eax
428; SSE2-NEXT:    andl $15, %eax
429; SSE2-NEXT:    movzbl -24(%rsp,%rax), %eax
430; SSE2-NEXT:    movd %eax, %xmm3
431; SSE2-NEXT:    movzbl {{[0-9]+}}(%rsp), %eax
432; SSE2-NEXT:    andl $15, %eax
433; SSE2-NEXT:    movzbl -24(%rsp,%rax), %eax
434; SSE2-NEXT:    movd %eax, %xmm10
435; SSE2-NEXT:    movzbl {{[0-9]+}}(%rsp), %eax
436; SSE2-NEXT:    andl $15, %eax
437; SSE2-NEXT:    movzbl -24(%rsp,%rax), %eax
438; SSE2-NEXT:    movd %eax, %xmm7
439; SSE2-NEXT:    movzbl {{[0-9]+}}(%rsp), %eax
440; SSE2-NEXT:    andl $15, %eax
441; SSE2-NEXT:    movzbl -24(%rsp,%rax), %eax
442; SSE2-NEXT:    movd %eax, %xmm11
443; SSE2-NEXT:    movzbl {{[0-9]+}}(%rsp), %eax
444; SSE2-NEXT:    andl $15, %eax
445; SSE2-NEXT:    movzbl -24(%rsp,%rax), %eax
446; SSE2-NEXT:    movd %eax, %xmm6
447; SSE2-NEXT:    andl $15, %ecx
448; SSE2-NEXT:    movzbl -24(%rsp,%rcx), %eax
449; SSE2-NEXT:    movd %eax, %xmm12
450; SSE2-NEXT:    andl $15, %edx
451; SSE2-NEXT:    movzbl -24(%rsp,%rdx), %eax
452; SSE2-NEXT:    movd %eax, %xmm5
453; SSE2-NEXT:    andl $15, %esi
454; SSE2-NEXT:    movzbl -24(%rsp,%rsi), %eax
455; SSE2-NEXT:    movd %eax, %xmm13
456; SSE2-NEXT:    andl $15, %edi
457; SSE2-NEXT:    movzbl -24(%rsp,%rdi), %eax
458; SSE2-NEXT:    movd %eax, %xmm0
459; SSE2-NEXT:    andl $15, %r9d
460; SSE2-NEXT:    movzbl -24(%rsp,%r9), %eax
461; SSE2-NEXT:    movd %eax, %xmm14
462; SSE2-NEXT:    andl $15, %r8d
463; SSE2-NEXT:    movzbl -24(%rsp,%r8), %eax
464; SSE2-NEXT:    movd %eax, %xmm1
465; SSE2-NEXT:    movzbl {{[0-9]+}}(%rsp), %eax
466; SSE2-NEXT:    andl $15, %eax
467; SSE2-NEXT:    movzbl -24(%rsp,%rax), %eax
468; SSE2-NEXT:    movd %eax, %xmm4
469; SSE2-NEXT:    movzbl {{[0-9]+}}(%rsp), %eax
470; SSE2-NEXT:    andl $15, %eax
471; SSE2-NEXT:    movzbl -24(%rsp,%rax), %eax
472; SSE2-NEXT:    movd %eax, %xmm2
473; SSE2-NEXT:    punpcklbw {{.*#+}} xmm15 = xmm15[0],xmm8[0],xmm15[1],xmm8[1],xmm15[2],xmm8[2],xmm15[3],xmm8[3],xmm15[4],xmm8[4],xmm15[5],xmm8[5],xmm15[6],xmm8[6],xmm15[7],xmm8[7]
474; SSE2-NEXT:    punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm9[0],xmm3[1],xmm9[1],xmm3[2],xmm9[2],xmm3[3],xmm9[3],xmm3[4],xmm9[4],xmm3[5],xmm9[5],xmm3[6],xmm9[6],xmm3[7],xmm9[7]
475; SSE2-NEXT:    punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm15[0],xmm3[1],xmm15[1],xmm3[2],xmm15[2],xmm3[3],xmm15[3]
476; SSE2-NEXT:    punpcklbw {{.*#+}} xmm7 = xmm7[0],xmm10[0],xmm7[1],xmm10[1],xmm7[2],xmm10[2],xmm7[3],xmm10[3],xmm7[4],xmm10[4],xmm7[5],xmm10[5],xmm7[6],xmm10[6],xmm7[7],xmm10[7]
477; SSE2-NEXT:    punpcklbw {{.*#+}} xmm6 = xmm6[0],xmm11[0],xmm6[1],xmm11[1],xmm6[2],xmm11[2],xmm6[3],xmm11[3],xmm6[4],xmm11[4],xmm6[5],xmm11[5],xmm6[6],xmm11[6],xmm6[7],xmm11[7]
478; SSE2-NEXT:    punpcklwd {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[1],xmm7[1],xmm6[2],xmm7[2],xmm6[3],xmm7[3]
479; SSE2-NEXT:    punpckldq {{.*#+}} xmm6 = xmm6[0],xmm3[0],xmm6[1],xmm3[1]
480; SSE2-NEXT:    punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm12[0],xmm5[1],xmm12[1],xmm5[2],xmm12[2],xmm5[3],xmm12[3],xmm5[4],xmm12[4],xmm5[5],xmm12[5],xmm5[6],xmm12[6],xmm5[7],xmm12[7]
481; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm13[0],xmm0[1],xmm13[1],xmm0[2],xmm13[2],xmm0[3],xmm13[3],xmm0[4],xmm13[4],xmm0[5],xmm13[5],xmm0[6],xmm13[6],xmm0[7],xmm13[7]
482; SSE2-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm5[0],xmm0[1],xmm5[1],xmm0[2],xmm5[2],xmm0[3],xmm5[3]
483; SSE2-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm14[0],xmm1[1],xmm14[1],xmm1[2],xmm14[2],xmm1[3],xmm14[3],xmm1[4],xmm14[4],xmm1[5],xmm14[5],xmm1[6],xmm14[6],xmm1[7],xmm14[7]
484; SSE2-NEXT:    punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3],xmm2[4],xmm4[4],xmm2[5],xmm4[5],xmm2[6],xmm4[6],xmm2[7],xmm4[7]
485; SSE2-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
486; SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
487; SSE2-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm6[0]
488; SSE2-NEXT:    retq
489;
490; SSSE3-LABEL: var_shuffle_v16i8_v16i8_xxxxxxxxxxxxxxxx_i8:
491; SSSE3:       # %bb.0:
492; SSSE3-NEXT:    # kill: def $r9d killed $r9d def $r9
493; SSSE3-NEXT:    # kill: def $r8d killed $r8d def $r8
494; SSSE3-NEXT:    # kill: def $ecx killed $ecx def $rcx
495; SSSE3-NEXT:    # kill: def $edx killed $edx def $rdx
496; SSSE3-NEXT:    # kill: def $esi killed $esi def $rsi
497; SSSE3-NEXT:    # kill: def $edi killed $edi def $rdi
498; SSSE3-NEXT:    movzbl {{[0-9]+}}(%rsp), %eax
499; SSSE3-NEXT:    andl $15, %eax
500; SSSE3-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
501; SSSE3-NEXT:    movzbl -24(%rsp,%rax), %eax
502; SSSE3-NEXT:    movd %eax, %xmm8
503; SSSE3-NEXT:    movzbl {{[0-9]+}}(%rsp), %eax
504; SSSE3-NEXT:    andl $15, %eax
505; SSSE3-NEXT:    movzbl -24(%rsp,%rax), %eax
506; SSSE3-NEXT:    movd %eax, %xmm15
507; SSSE3-NEXT:    movzbl {{[0-9]+}}(%rsp), %eax
508; SSSE3-NEXT:    andl $15, %eax
509; SSSE3-NEXT:    movzbl -24(%rsp,%rax), %eax
510; SSSE3-NEXT:    movd %eax, %xmm9
511; SSSE3-NEXT:    movzbl {{[0-9]+}}(%rsp), %eax
512; SSSE3-NEXT:    andl $15, %eax
513; SSSE3-NEXT:    movzbl -24(%rsp,%rax), %eax
514; SSSE3-NEXT:    movd %eax, %xmm3
515; SSSE3-NEXT:    movzbl {{[0-9]+}}(%rsp), %eax
516; SSSE3-NEXT:    andl $15, %eax
517; SSSE3-NEXT:    movzbl -24(%rsp,%rax), %eax
518; SSSE3-NEXT:    movd %eax, %xmm10
519; SSSE3-NEXT:    movzbl {{[0-9]+}}(%rsp), %eax
520; SSSE3-NEXT:    andl $15, %eax
521; SSSE3-NEXT:    movzbl -24(%rsp,%rax), %eax
522; SSSE3-NEXT:    movd %eax, %xmm7
523; SSSE3-NEXT:    movzbl {{[0-9]+}}(%rsp), %eax
524; SSSE3-NEXT:    andl $15, %eax
525; SSSE3-NEXT:    movzbl -24(%rsp,%rax), %eax
526; SSSE3-NEXT:    movd %eax, %xmm11
527; SSSE3-NEXT:    movzbl {{[0-9]+}}(%rsp), %eax
528; SSSE3-NEXT:    andl $15, %eax
529; SSSE3-NEXT:    movzbl -24(%rsp,%rax), %eax
530; SSSE3-NEXT:    movd %eax, %xmm6
531; SSSE3-NEXT:    andl $15, %ecx
532; SSSE3-NEXT:    movzbl -24(%rsp,%rcx), %eax
533; SSSE3-NEXT:    movd %eax, %xmm12
534; SSSE3-NEXT:    andl $15, %edx
535; SSSE3-NEXT:    movzbl -24(%rsp,%rdx), %eax
536; SSSE3-NEXT:    movd %eax, %xmm5
537; SSSE3-NEXT:    andl $15, %esi
538; SSSE3-NEXT:    movzbl -24(%rsp,%rsi), %eax
539; SSSE3-NEXT:    movd %eax, %xmm13
540; SSSE3-NEXT:    andl $15, %edi
541; SSSE3-NEXT:    movzbl -24(%rsp,%rdi), %eax
542; SSSE3-NEXT:    movd %eax, %xmm0
543; SSSE3-NEXT:    andl $15, %r9d
544; SSSE3-NEXT:    movzbl -24(%rsp,%r9), %eax
545; SSSE3-NEXT:    movd %eax, %xmm14
546; SSSE3-NEXT:    andl $15, %r8d
547; SSSE3-NEXT:    movzbl -24(%rsp,%r8), %eax
548; SSSE3-NEXT:    movd %eax, %xmm1
549; SSSE3-NEXT:    movzbl {{[0-9]+}}(%rsp), %eax
550; SSSE3-NEXT:    andl $15, %eax
551; SSSE3-NEXT:    movzbl -24(%rsp,%rax), %eax
552; SSSE3-NEXT:    movd %eax, %xmm4
553; SSSE3-NEXT:    movzbl {{[0-9]+}}(%rsp), %eax
554; SSSE3-NEXT:    andl $15, %eax
555; SSSE3-NEXT:    movzbl -24(%rsp,%rax), %eax
556; SSSE3-NEXT:    movd %eax, %xmm2
557; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm15 = xmm15[0],xmm8[0],xmm15[1],xmm8[1],xmm15[2],xmm8[2],xmm15[3],xmm8[3],xmm15[4],xmm8[4],xmm15[5],xmm8[5],xmm15[6],xmm8[6],xmm15[7],xmm8[7]
558; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm9[0],xmm3[1],xmm9[1],xmm3[2],xmm9[2],xmm3[3],xmm9[3],xmm3[4],xmm9[4],xmm3[5],xmm9[5],xmm3[6],xmm9[6],xmm3[7],xmm9[7]
559; SSSE3-NEXT:    punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm15[0],xmm3[1],xmm15[1],xmm3[2],xmm15[2],xmm3[3],xmm15[3]
560; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm7 = xmm7[0],xmm10[0],xmm7[1],xmm10[1],xmm7[2],xmm10[2],xmm7[3],xmm10[3],xmm7[4],xmm10[4],xmm7[5],xmm10[5],xmm7[6],xmm10[6],xmm7[7],xmm10[7]
561; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm6 = xmm6[0],xmm11[0],xmm6[1],xmm11[1],xmm6[2],xmm11[2],xmm6[3],xmm11[3],xmm6[4],xmm11[4],xmm6[5],xmm11[5],xmm6[6],xmm11[6],xmm6[7],xmm11[7]
562; SSSE3-NEXT:    punpcklwd {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[1],xmm7[1],xmm6[2],xmm7[2],xmm6[3],xmm7[3]
563; SSSE3-NEXT:    punpckldq {{.*#+}} xmm6 = xmm6[0],xmm3[0],xmm6[1],xmm3[1]
564; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm12[0],xmm5[1],xmm12[1],xmm5[2],xmm12[2],xmm5[3],xmm12[3],xmm5[4],xmm12[4],xmm5[5],xmm12[5],xmm5[6],xmm12[6],xmm5[7],xmm12[7]
565; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm13[0],xmm0[1],xmm13[1],xmm0[2],xmm13[2],xmm0[3],xmm13[3],xmm0[4],xmm13[4],xmm0[5],xmm13[5],xmm0[6],xmm13[6],xmm0[7],xmm13[7]
566; SSSE3-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm5[0],xmm0[1],xmm5[1],xmm0[2],xmm5[2],xmm0[3],xmm5[3]
567; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm14[0],xmm1[1],xmm14[1],xmm1[2],xmm14[2],xmm1[3],xmm14[3],xmm1[4],xmm14[4],xmm1[5],xmm14[5],xmm1[6],xmm14[6],xmm1[7],xmm14[7]
568; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3],xmm2[4],xmm4[4],xmm2[5],xmm4[5],xmm2[6],xmm4[6],xmm2[7],xmm4[7]
569; SSSE3-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
570; SSSE3-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
571; SSSE3-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm6[0]
572; SSSE3-NEXT:    retq
573;
574; SSE41-LABEL: var_shuffle_v16i8_v16i8_xxxxxxxxxxxxxxxx_i8:
575; SSE41:       # %bb.0:
576; SSE41-NEXT:    # kill: def $r9d killed $r9d def $r9
577; SSE41-NEXT:    # kill: def $r8d killed $r8d def $r8
578; SSE41-NEXT:    # kill: def $ecx killed $ecx def $rcx
579; SSE41-NEXT:    # kill: def $edx killed $edx def $rdx
580; SSE41-NEXT:    # kill: def $esi killed $esi def $rsi
581; SSE41-NEXT:    # kill: def $edi killed $edi def $rdi
582; SSE41-NEXT:    andl $15, %edi
583; SSE41-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
584; SSE41-NEXT:    movzbl -24(%rsp,%rdi), %eax
585; SSE41-NEXT:    movd %eax, %xmm0
586; SSE41-NEXT:    andl $15, %esi
587; SSE41-NEXT:    pinsrb $1, -24(%rsp,%rsi), %xmm0
588; SSE41-NEXT:    andl $15, %edx
589; SSE41-NEXT:    pinsrb $2, -24(%rsp,%rdx), %xmm0
590; SSE41-NEXT:    andl $15, %ecx
591; SSE41-NEXT:    pinsrb $3, -24(%rsp,%rcx), %xmm0
592; SSE41-NEXT:    andl $15, %r8d
593; SSE41-NEXT:    pinsrb $4, -24(%rsp,%r8), %xmm0
594; SSE41-NEXT:    andl $15, %r9d
595; SSE41-NEXT:    pinsrb $5, -24(%rsp,%r9), %xmm0
596; SSE41-NEXT:    movzbl {{[0-9]+}}(%rsp), %eax
597; SSE41-NEXT:    andl $15, %eax
598; SSE41-NEXT:    pinsrb $6, -24(%rsp,%rax), %xmm0
599; SSE41-NEXT:    movzbl {{[0-9]+}}(%rsp), %eax
600; SSE41-NEXT:    andl $15, %eax
601; SSE41-NEXT:    pinsrb $7, -24(%rsp,%rax), %xmm0
602; SSE41-NEXT:    movzbl {{[0-9]+}}(%rsp), %eax
603; SSE41-NEXT:    andl $15, %eax
604; SSE41-NEXT:    pinsrb $8, -24(%rsp,%rax), %xmm0
605; SSE41-NEXT:    movzbl {{[0-9]+}}(%rsp), %eax
606; SSE41-NEXT:    andl $15, %eax
607; SSE41-NEXT:    pinsrb $9, -24(%rsp,%rax), %xmm0
608; SSE41-NEXT:    movzbl {{[0-9]+}}(%rsp), %eax
609; SSE41-NEXT:    andl $15, %eax
610; SSE41-NEXT:    pinsrb $10, -24(%rsp,%rax), %xmm0
611; SSE41-NEXT:    movzbl {{[0-9]+}}(%rsp), %eax
612; SSE41-NEXT:    andl $15, %eax
613; SSE41-NEXT:    pinsrb $11, -24(%rsp,%rax), %xmm0
614; SSE41-NEXT:    movzbl {{[0-9]+}}(%rsp), %eax
615; SSE41-NEXT:    andl $15, %eax
616; SSE41-NEXT:    pinsrb $12, -24(%rsp,%rax), %xmm0
617; SSE41-NEXT:    movzbl {{[0-9]+}}(%rsp), %eax
618; SSE41-NEXT:    andl $15, %eax
619; SSE41-NEXT:    pinsrb $13, -24(%rsp,%rax), %xmm0
620; SSE41-NEXT:    movzbl {{[0-9]+}}(%rsp), %eax
621; SSE41-NEXT:    andl $15, %eax
622; SSE41-NEXT:    pinsrb $14, -24(%rsp,%rax), %xmm0
623; SSE41-NEXT:    movzbl {{[0-9]+}}(%rsp), %eax
624; SSE41-NEXT:    andl $15, %eax
625; SSE41-NEXT:    pinsrb $15, -24(%rsp,%rax), %xmm0
626; SSE41-NEXT:    retq
627;
628; AVX-LABEL: var_shuffle_v16i8_v16i8_xxxxxxxxxxxxxxxx_i8:
629; AVX:       # %bb.0:
630; AVX-NEXT:    # kill: def $r9d killed $r9d def $r9
631; AVX-NEXT:    # kill: def $r8d killed $r8d def $r8
632; AVX-NEXT:    # kill: def $ecx killed $ecx def $rcx
633; AVX-NEXT:    # kill: def $edx killed $edx def $rdx
634; AVX-NEXT:    # kill: def $esi killed $esi def $rsi
635; AVX-NEXT:    # kill: def $edi killed $edi def $rdi
636; AVX-NEXT:    andl $15, %edi
637; AVX-NEXT:    vmovaps %xmm0, -{{[0-9]+}}(%rsp)
638; AVX-NEXT:    movzbl -24(%rsp,%rdi), %eax
639; AVX-NEXT:    vmovd %eax, %xmm0
640; AVX-NEXT:    andl $15, %esi
641; AVX-NEXT:    vpinsrb $1, -24(%rsp,%rsi), %xmm0, %xmm0
642; AVX-NEXT:    andl $15, %edx
643; AVX-NEXT:    vpinsrb $2, -24(%rsp,%rdx), %xmm0, %xmm0
644; AVX-NEXT:    andl $15, %ecx
645; AVX-NEXT:    vpinsrb $3, -24(%rsp,%rcx), %xmm0, %xmm0
646; AVX-NEXT:    andl $15, %r8d
647; AVX-NEXT:    vpinsrb $4, -24(%rsp,%r8), %xmm0, %xmm0
648; AVX-NEXT:    andl $15, %r9d
649; AVX-NEXT:    vpinsrb $5, -24(%rsp,%r9), %xmm0, %xmm0
650; AVX-NEXT:    movzbl {{[0-9]+}}(%rsp), %eax
651; AVX-NEXT:    andl $15, %eax
652; AVX-NEXT:    vpinsrb $6, -24(%rsp,%rax), %xmm0, %xmm0
653; AVX-NEXT:    movzbl {{[0-9]+}}(%rsp), %eax
654; AVX-NEXT:    andl $15, %eax
655; AVX-NEXT:    vpinsrb $7, -24(%rsp,%rax), %xmm0, %xmm0
656; AVX-NEXT:    movzbl {{[0-9]+}}(%rsp), %eax
657; AVX-NEXT:    andl $15, %eax
658; AVX-NEXT:    vpinsrb $8, -24(%rsp,%rax), %xmm0, %xmm0
659; AVX-NEXT:    movzbl {{[0-9]+}}(%rsp), %eax
660; AVX-NEXT:    andl $15, %eax
661; AVX-NEXT:    vpinsrb $9, -24(%rsp,%rax), %xmm0, %xmm0
662; AVX-NEXT:    movzbl {{[0-9]+}}(%rsp), %eax
663; AVX-NEXT:    andl $15, %eax
664; AVX-NEXT:    vpinsrb $10, -24(%rsp,%rax), %xmm0, %xmm0
665; AVX-NEXT:    movzbl {{[0-9]+}}(%rsp), %eax
666; AVX-NEXT:    andl $15, %eax
667; AVX-NEXT:    vpinsrb $11, -24(%rsp,%rax), %xmm0, %xmm0
668; AVX-NEXT:    movzbl {{[0-9]+}}(%rsp), %eax
669; AVX-NEXT:    andl $15, %eax
670; AVX-NEXT:    vpinsrb $12, -24(%rsp,%rax), %xmm0, %xmm0
671; AVX-NEXT:    movzbl {{[0-9]+}}(%rsp), %eax
672; AVX-NEXT:    andl $15, %eax
673; AVX-NEXT:    vpinsrb $13, -24(%rsp,%rax), %xmm0, %xmm0
674; AVX-NEXT:    movzbl {{[0-9]+}}(%rsp), %eax
675; AVX-NEXT:    andl $15, %eax
676; AVX-NEXT:    vpinsrb $14, -24(%rsp,%rax), %xmm0, %xmm0
677; AVX-NEXT:    movzbl {{[0-9]+}}(%rsp), %eax
678; AVX-NEXT:    andl $15, %eax
679; AVX-NEXT:    vpinsrb $15, -24(%rsp,%rax), %xmm0, %xmm0
680; AVX-NEXT:    retq
681  %x0  = extractelement <16 x i8> %x, i8 %i0
682  %x1  = extractelement <16 x i8> %x, i8 %i1
683  %x2  = extractelement <16 x i8> %x, i8 %i2
684  %x3  = extractelement <16 x i8> %x, i8 %i3
685  %x4  = extractelement <16 x i8> %x, i8 %i4
686  %x5  = extractelement <16 x i8> %x, i8 %i5
687  %x6  = extractelement <16 x i8> %x, i8 %i6
688  %x7  = extractelement <16 x i8> %x, i8 %i7
689  %x8  = extractelement <16 x i8> %x, i8 %i8
690  %x9  = extractelement <16 x i8> %x, i8 %i9
691  %x10 = extractelement <16 x i8> %x, i8 %i10
692  %x11 = extractelement <16 x i8> %x, i8 %i11
693  %x12 = extractelement <16 x i8> %x, i8 %i12
694  %x13 = extractelement <16 x i8> %x, i8 %i13
695  %x14 = extractelement <16 x i8> %x, i8 %i14
696  %x15 = extractelement <16 x i8> %x, i8 %i15
697  %r0  = insertelement <16 x i8> undef, i8 %x0 , i32 0
698  %r1  = insertelement <16 x i8>  %r0 , i8 %x1 , i32 1
699  %r2  = insertelement <16 x i8>  %r1 , i8 %x2 , i32 2
700  %r3  = insertelement <16 x i8>  %r2 , i8 %x3 , i32 3
701  %r4  = insertelement <16 x i8>  %r3 , i8 %x4 , i32 4
702  %r5  = insertelement <16 x i8>  %r4 , i8 %x5 , i32 5
703  %r6  = insertelement <16 x i8>  %r5 , i8 %x6 , i32 6
704  %r7  = insertelement <16 x i8>  %r6 , i8 %x7 , i32 7
705  %r8  = insertelement <16 x i8>  %r7 , i8 %x8 , i32 8
706  %r9  = insertelement <16 x i8>  %r8 , i8 %x9 , i32 9
707  %r10 = insertelement <16 x i8>  %r9 , i8 %x10, i32 10
708  %r11 = insertelement <16 x i8>  %r10, i8 %x11, i32 11
709  %r12 = insertelement <16 x i8>  %r11, i8 %x12, i32 12
710  %r13 = insertelement <16 x i8>  %r12, i8 %x13, i32 13
711  %r14 = insertelement <16 x i8>  %r13, i8 %x14, i32 14
712  %r15 = insertelement <16 x i8>  %r14, i8 %x15, i32 15
713  ret <16 x i8> %r15
714}
715
716;
717; Unary shuffle indices from memory
718;
719
; Unary variable shuffle of <4 x i32> %x whose four lane indices are loaded
; from the consecutive i32 slots %i[0] .. %i[3]. Result lane N is %x[%i[N]].
; In the expected code every index is masked with `andl $3` and %x is stored
; to the stack so each lane can be reloaded by index. The SSE2/SSSE3 checks
; rebuild the vector with movss + unpcklps + movlhps, while the SSE41 and AVX
; checks use movd followed by three pinsrd loads.
720define <4 x i32> @mem_shuffle_v4i32_v4i32_xxxx_i32(<4 x i32> %x, i32* %i) nounwind {
721; SSE2-LABEL: mem_shuffle_v4i32_v4i32_xxxx_i32:
722; SSE2:       # %bb.0:
723; SSE2-NEXT:    movl (%rdi), %eax
724; SSE2-NEXT:    movl 4(%rdi), %ecx
725; SSE2-NEXT:    andl $3, %eax
726; SSE2-NEXT:    andl $3, %ecx
727; SSE2-NEXT:    movl 8(%rdi), %edx
728; SSE2-NEXT:    andl $3, %edx
729; SSE2-NEXT:    movl 12(%rdi), %esi
730; SSE2-NEXT:    andl $3, %esi
731; SSE2-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
732; SSE2-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
733; SSE2-NEXT:    movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
734; SSE2-NEXT:    unpcklps {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
735; SSE2-NEXT:    movss {{.*#+}} xmm2 = mem[0],zero,zero,zero
736; SSE2-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
737; SSE2-NEXT:    unpcklps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
738; SSE2-NEXT:    movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
739; SSE2-NEXT:    retq
740;
741; SSSE3-LABEL: mem_shuffle_v4i32_v4i32_xxxx_i32:
742; SSSE3:       # %bb.0:
743; SSSE3-NEXT:    movl (%rdi), %eax
744; SSSE3-NEXT:    movl 4(%rdi), %ecx
745; SSSE3-NEXT:    andl $3, %eax
746; SSSE3-NEXT:    andl $3, %ecx
747; SSSE3-NEXT:    movl 8(%rdi), %edx
748; SSSE3-NEXT:    andl $3, %edx
749; SSSE3-NEXT:    movl 12(%rdi), %esi
750; SSSE3-NEXT:    andl $3, %esi
751; SSSE3-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
752; SSSE3-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
753; SSSE3-NEXT:    movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
754; SSSE3-NEXT:    unpcklps {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
755; SSSE3-NEXT:    movss {{.*#+}} xmm2 = mem[0],zero,zero,zero
756; SSSE3-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
757; SSSE3-NEXT:    unpcklps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
758; SSSE3-NEXT:    movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
759; SSSE3-NEXT:    retq
760;
761; SSE41-LABEL: mem_shuffle_v4i32_v4i32_xxxx_i32:
762; SSE41:       # %bb.0:
763; SSE41-NEXT:    movl (%rdi), %eax
764; SSE41-NEXT:    movl 4(%rdi), %ecx
765; SSE41-NEXT:    andl $3, %eax
766; SSE41-NEXT:    andl $3, %ecx
767; SSE41-NEXT:    movl 8(%rdi), %edx
768; SSE41-NEXT:    andl $3, %edx
769; SSE41-NEXT:    movl 12(%rdi), %esi
770; SSE41-NEXT:    andl $3, %esi
771; SSE41-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
772; SSE41-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
773; SSE41-NEXT:    pinsrd $1, -24(%rsp,%rcx,4), %xmm0
774; SSE41-NEXT:    pinsrd $2, -24(%rsp,%rdx,4), %xmm0
775; SSE41-NEXT:    pinsrd $3, -24(%rsp,%rsi,4), %xmm0
776; SSE41-NEXT:    retq
777;
778; AVX-LABEL: mem_shuffle_v4i32_v4i32_xxxx_i32:
779; AVX:       # %bb.0:
780; AVX-NEXT:    movl (%rdi), %eax
781; AVX-NEXT:    movl 4(%rdi), %ecx
782; AVX-NEXT:    andl $3, %eax
783; AVX-NEXT:    andl $3, %ecx
784; AVX-NEXT:    movl 8(%rdi), %edx
785; AVX-NEXT:    andl $3, %edx
786; AVX-NEXT:    movl 12(%rdi), %esi
787; AVX-NEXT:    andl $3, %esi
788; AVX-NEXT:    vmovaps %xmm0, -{{[0-9]+}}(%rsp)
789; AVX-NEXT:    vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
790; AVX-NEXT:    vpinsrd $1, -24(%rsp,%rcx,4), %xmm0, %xmm0
791; AVX-NEXT:    vpinsrd $2, -24(%rsp,%rdx,4), %xmm0, %xmm0
792; AVX-NEXT:    vpinsrd $3, -24(%rsp,%rsi,4), %xmm0, %xmm0
793; AVX-NEXT:    retq
  ; Addresses of the four consecutive i32 index slots.
794  %p0  = getelementptr inbounds i32, i32* %i, i64 0
795  %p1  = getelementptr inbounds i32, i32* %i, i64 1
796  %p2  = getelementptr inbounds i32, i32* %i, i64 2
797  %p3  = getelementptr inbounds i32, i32* %i, i64 3
  ; Load the run-time lane indices.
798  %i0  = load i32, i32* %p0, align 4
799  %i1  = load i32, i32* %p1, align 4
800  %i2  = load i32, i32* %p2, align 4
801  %i3  = load i32, i32* %p3, align 4
  ; Variable-index extracts from the single source vector.
802  %x0 = extractelement <4 x i32> %x, i32 %i0
803  %x1 = extractelement <4 x i32> %x, i32 %i1
804  %x2 = extractelement <4 x i32> %x, i32 %i2
805  %x3 = extractelement <4 x i32> %x, i32 %i3
  ; Reassemble the extracted scalars into lanes 0..3 of the result.
806  %r0 = insertelement <4 x i32> undef, i32 %x0, i32 0
807  %r1 = insertelement <4 x i32>   %r0, i32 %x1, i32 1
808  %r2 = insertelement <4 x i32>   %r1, i32 %x2, i32 2
809  %r3 = insertelement <4 x i32>   %r2, i32 %x3, i32 3
810  ret <4 x i32> %r3
811}
812
; Unary variable shuffle of <16 x i8> %x whose sixteen lane indices are loaded
; from the consecutive bytes %i[0] .. %i[15]. Result lane N is %x[%i[N]].
; In the expected code each index is zero-extended with movzbl and masked with
; `andl $15`, and %x is stored to the stack so lanes can be reloaded by index.
; The expected code saves rbp, rbx and r12-r15 with pushq/popq and uses them,
; together with one 8-byte spill slot, to hold the indices. The SSE2/SSSE3
; checks merge sixteen movd results through a
; punpcklbw/punpcklwd/punpckldq/punpcklqdq tree; the SSE41 and AVX checks use
; movd followed by fifteen pinsrb loads.
813define <16 x i8> @mem_shuffle_v16i8_v16i8_xxxxxxxxxxxxxxxx_i8(<16 x i8> %x, i8* %i) nounwind {
814; SSE2-LABEL: mem_shuffle_v16i8_v16i8_xxxxxxxxxxxxxxxx_i8:
815; SSE2:       # %bb.0:
816; SSE2-NEXT:    pushq %rbp
817; SSE2-NEXT:    pushq %r15
818; SSE2-NEXT:    pushq %r14
819; SSE2-NEXT:    pushq %r13
820; SSE2-NEXT:    pushq %r12
821; SSE2-NEXT:    pushq %rbx
822; SSE2-NEXT:    movzbl (%rdi), %eax
823; SSE2-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
824; SSE2-NEXT:    movzbl 1(%rdi), %r9d
825; SSE2-NEXT:    movzbl 2(%rdi), %r10d
826; SSE2-NEXT:    movzbl 3(%rdi), %r11d
827; SSE2-NEXT:    movzbl 4(%rdi), %r14d
828; SSE2-NEXT:    movzbl 5(%rdi), %r15d
829; SSE2-NEXT:    movzbl 6(%rdi), %r12d
830; SSE2-NEXT:    movzbl 7(%rdi), %r13d
831; SSE2-NEXT:    movzbl 8(%rdi), %ebx
832; SSE2-NEXT:    movzbl 9(%rdi), %r8d
833; SSE2-NEXT:    movzbl 10(%rdi), %ecx
834; SSE2-NEXT:    movzbl 11(%rdi), %edx
835; SSE2-NEXT:    movzbl 12(%rdi), %esi
836; SSE2-NEXT:    movzbl 13(%rdi), %ebp
837; SSE2-NEXT:    movzbl 14(%rdi), %eax
838; SSE2-NEXT:    movzbl 15(%rdi), %edi
839; SSE2-NEXT:    andl $15, %edi
840; SSE2-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
841; SSE2-NEXT:    movzbl -24(%rsp,%rdi), %edi
842; SSE2-NEXT:    movd %edi, %xmm8
843; SSE2-NEXT:    andl $15, %eax
844; SSE2-NEXT:    movzbl -24(%rsp,%rax), %eax
845; SSE2-NEXT:    movd %eax, %xmm15
846; SSE2-NEXT:    andl $15, %ebp
847; SSE2-NEXT:    movzbl -24(%rsp,%rbp), %eax
848; SSE2-NEXT:    movd %eax, %xmm9
849; SSE2-NEXT:    andl $15, %esi
850; SSE2-NEXT:    movzbl -24(%rsp,%rsi), %eax
851; SSE2-NEXT:    movd %eax, %xmm3
852; SSE2-NEXT:    andl $15, %edx
853; SSE2-NEXT:    movzbl -24(%rsp,%rdx), %eax
854; SSE2-NEXT:    movd %eax, %xmm10
855; SSE2-NEXT:    andl $15, %ecx
856; SSE2-NEXT:    movzbl -24(%rsp,%rcx), %eax
857; SSE2-NEXT:    movd %eax, %xmm7
858; SSE2-NEXT:    andl $15, %r8d
859; SSE2-NEXT:    movzbl -24(%rsp,%r8), %eax
860; SSE2-NEXT:    movd %eax, %xmm11
861; SSE2-NEXT:    andl $15, %ebx
862; SSE2-NEXT:    movzbl -24(%rsp,%rbx), %eax
863; SSE2-NEXT:    movd %eax, %xmm6
864; SSE2-NEXT:    andl $15, %r13d
865; SSE2-NEXT:    movzbl -24(%rsp,%r13), %eax
866; SSE2-NEXT:    movd %eax, %xmm12
867; SSE2-NEXT:    andl $15, %r12d
868; SSE2-NEXT:    movzbl -24(%rsp,%r12), %eax
869; SSE2-NEXT:    movd %eax, %xmm5
870; SSE2-NEXT:    andl $15, %r15d
871; SSE2-NEXT:    movzbl -24(%rsp,%r15), %eax
872; SSE2-NEXT:    movd %eax, %xmm13
873; SSE2-NEXT:    andl $15, %r14d
874; SSE2-NEXT:    movzbl -24(%rsp,%r14), %eax
875; SSE2-NEXT:    movd %eax, %xmm4
876; SSE2-NEXT:    andl $15, %r11d
877; SSE2-NEXT:    movzbl -24(%rsp,%r11), %eax
878; SSE2-NEXT:    movd %eax, %xmm14
879; SSE2-NEXT:    andl $15, %r10d
880; SSE2-NEXT:    movzbl -24(%rsp,%r10), %eax
881; SSE2-NEXT:    movd %eax, %xmm1
882; SSE2-NEXT:    andl $15, %r9d
883; SSE2-NEXT:    movzbl -24(%rsp,%r9), %eax
884; SSE2-NEXT:    movd %eax, %xmm2
885; SSE2-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
886; SSE2-NEXT:    andl $15, %eax
887; SSE2-NEXT:    movzbl -24(%rsp,%rax), %eax
888; SSE2-NEXT:    movd %eax, %xmm0
889; SSE2-NEXT:    punpcklbw {{.*#+}} xmm15 = xmm15[0],xmm8[0],xmm15[1],xmm8[1],xmm15[2],xmm8[2],xmm15[3],xmm8[3],xmm15[4],xmm8[4],xmm15[5],xmm8[5],xmm15[6],xmm8[6],xmm15[7],xmm8[7]
890; SSE2-NEXT:    punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm9[0],xmm3[1],xmm9[1],xmm3[2],xmm9[2],xmm3[3],xmm9[3],xmm3[4],xmm9[4],xmm3[5],xmm9[5],xmm3[6],xmm9[6],xmm3[7],xmm9[7]
891; SSE2-NEXT:    punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm15[0],xmm3[1],xmm15[1],xmm3[2],xmm15[2],xmm3[3],xmm15[3]
892; SSE2-NEXT:    punpcklbw {{.*#+}} xmm7 = xmm7[0],xmm10[0],xmm7[1],xmm10[1],xmm7[2],xmm10[2],xmm7[3],xmm10[3],xmm7[4],xmm10[4],xmm7[5],xmm10[5],xmm7[6],xmm10[6],xmm7[7],xmm10[7]
893; SSE2-NEXT:    punpcklbw {{.*#+}} xmm6 = xmm6[0],xmm11[0],xmm6[1],xmm11[1],xmm6[2],xmm11[2],xmm6[3],xmm11[3],xmm6[4],xmm11[4],xmm6[5],xmm11[5],xmm6[6],xmm11[6],xmm6[7],xmm11[7]
894; SSE2-NEXT:    punpcklwd {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[1],xmm7[1],xmm6[2],xmm7[2],xmm6[3],xmm7[3]
895; SSE2-NEXT:    punpckldq {{.*#+}} xmm6 = xmm6[0],xmm3[0],xmm6[1],xmm3[1]
896; SSE2-NEXT:    punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm12[0],xmm5[1],xmm12[1],xmm5[2],xmm12[2],xmm5[3],xmm12[3],xmm5[4],xmm12[4],xmm5[5],xmm12[5],xmm5[6],xmm12[6],xmm5[7],xmm12[7]
897; SSE2-NEXT:    punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm13[0],xmm4[1],xmm13[1],xmm4[2],xmm13[2],xmm4[3],xmm13[3],xmm4[4],xmm13[4],xmm4[5],xmm13[5],xmm4[6],xmm13[6],xmm4[7],xmm13[7]
898; SSE2-NEXT:    punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1],xmm4[2],xmm5[2],xmm4[3],xmm5[3]
899; SSE2-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm14[0],xmm1[1],xmm14[1],xmm1[2],xmm14[2],xmm1[3],xmm14[3],xmm1[4],xmm14[4],xmm1[5],xmm14[5],xmm1[6],xmm14[6],xmm1[7],xmm14[7]
900; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
901; SSE2-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
902; SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1]
903; SSE2-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm6[0]
904; SSE2-NEXT:    popq %rbx
905; SSE2-NEXT:    popq %r12
906; SSE2-NEXT:    popq %r13
907; SSE2-NEXT:    popq %r14
908; SSE2-NEXT:    popq %r15
909; SSE2-NEXT:    popq %rbp
910; SSE2-NEXT:    retq
911;
912; SSSE3-LABEL: mem_shuffle_v16i8_v16i8_xxxxxxxxxxxxxxxx_i8:
913; SSSE3:       # %bb.0:
914; SSSE3-NEXT:    pushq %rbp
915; SSSE3-NEXT:    pushq %r15
916; SSSE3-NEXT:    pushq %r14
917; SSSE3-NEXT:    pushq %r13
918; SSSE3-NEXT:    pushq %r12
919; SSSE3-NEXT:    pushq %rbx
920; SSSE3-NEXT:    movzbl (%rdi), %eax
921; SSSE3-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
922; SSSE3-NEXT:    movzbl 1(%rdi), %r9d
923; SSSE3-NEXT:    movzbl 2(%rdi), %r10d
924; SSSE3-NEXT:    movzbl 3(%rdi), %r11d
925; SSSE3-NEXT:    movzbl 4(%rdi), %r14d
926; SSSE3-NEXT:    movzbl 5(%rdi), %r15d
927; SSSE3-NEXT:    movzbl 6(%rdi), %r12d
928; SSSE3-NEXT:    movzbl 7(%rdi), %r13d
929; SSSE3-NEXT:    movzbl 8(%rdi), %ebx
930; SSSE3-NEXT:    movzbl 9(%rdi), %r8d
931; SSSE3-NEXT:    movzbl 10(%rdi), %ecx
932; SSSE3-NEXT:    movzbl 11(%rdi), %edx
933; SSSE3-NEXT:    movzbl 12(%rdi), %esi
934; SSSE3-NEXT:    movzbl 13(%rdi), %ebp
935; SSSE3-NEXT:    movzbl 14(%rdi), %eax
936; SSSE3-NEXT:    movzbl 15(%rdi), %edi
937; SSSE3-NEXT:    andl $15, %edi
938; SSSE3-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
939; SSSE3-NEXT:    movzbl -24(%rsp,%rdi), %edi
940; SSSE3-NEXT:    movd %edi, %xmm8
941; SSSE3-NEXT:    andl $15, %eax
942; SSSE3-NEXT:    movzbl -24(%rsp,%rax), %eax
943; SSSE3-NEXT:    movd %eax, %xmm15
944; SSSE3-NEXT:    andl $15, %ebp
945; SSSE3-NEXT:    movzbl -24(%rsp,%rbp), %eax
946; SSSE3-NEXT:    movd %eax, %xmm9
947; SSSE3-NEXT:    andl $15, %esi
948; SSSE3-NEXT:    movzbl -24(%rsp,%rsi), %eax
949; SSSE3-NEXT:    movd %eax, %xmm3
950; SSSE3-NEXT:    andl $15, %edx
951; SSSE3-NEXT:    movzbl -24(%rsp,%rdx), %eax
952; SSSE3-NEXT:    movd %eax, %xmm10
953; SSSE3-NEXT:    andl $15, %ecx
954; SSSE3-NEXT:    movzbl -24(%rsp,%rcx), %eax
955; SSSE3-NEXT:    movd %eax, %xmm7
956; SSSE3-NEXT:    andl $15, %r8d
957; SSSE3-NEXT:    movzbl -24(%rsp,%r8), %eax
958; SSSE3-NEXT:    movd %eax, %xmm11
959; SSSE3-NEXT:    andl $15, %ebx
960; SSSE3-NEXT:    movzbl -24(%rsp,%rbx), %eax
961; SSSE3-NEXT:    movd %eax, %xmm6
962; SSSE3-NEXT:    andl $15, %r13d
963; SSSE3-NEXT:    movzbl -24(%rsp,%r13), %eax
964; SSSE3-NEXT:    movd %eax, %xmm12
965; SSSE3-NEXT:    andl $15, %r12d
966; SSSE3-NEXT:    movzbl -24(%rsp,%r12), %eax
967; SSSE3-NEXT:    movd %eax, %xmm5
968; SSSE3-NEXT:    andl $15, %r15d
969; SSSE3-NEXT:    movzbl -24(%rsp,%r15), %eax
970; SSSE3-NEXT:    movd %eax, %xmm13
971; SSSE3-NEXT:    andl $15, %r14d
972; SSSE3-NEXT:    movzbl -24(%rsp,%r14), %eax
973; SSSE3-NEXT:    movd %eax, %xmm4
974; SSSE3-NEXT:    andl $15, %r11d
975; SSSE3-NEXT:    movzbl -24(%rsp,%r11), %eax
976; SSSE3-NEXT:    movd %eax, %xmm14
977; SSSE3-NEXT:    andl $15, %r10d
978; SSSE3-NEXT:    movzbl -24(%rsp,%r10), %eax
979; SSSE3-NEXT:    movd %eax, %xmm1
980; SSSE3-NEXT:    andl $15, %r9d
981; SSSE3-NEXT:    movzbl -24(%rsp,%r9), %eax
982; SSSE3-NEXT:    movd %eax, %xmm2
983; SSSE3-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
984; SSSE3-NEXT:    andl $15, %eax
985; SSSE3-NEXT:    movzbl -24(%rsp,%rax), %eax
986; SSSE3-NEXT:    movd %eax, %xmm0
987; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm15 = xmm15[0],xmm8[0],xmm15[1],xmm8[1],xmm15[2],xmm8[2],xmm15[3],xmm8[3],xmm15[4],xmm8[4],xmm15[5],xmm8[5],xmm15[6],xmm8[6],xmm15[7],xmm8[7]
988; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm9[0],xmm3[1],xmm9[1],xmm3[2],xmm9[2],xmm3[3],xmm9[3],xmm3[4],xmm9[4],xmm3[5],xmm9[5],xmm3[6],xmm9[6],xmm3[7],xmm9[7]
989; SSSE3-NEXT:    punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm15[0],xmm3[1],xmm15[1],xmm3[2],xmm15[2],xmm3[3],xmm15[3]
990; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm7 = xmm7[0],xmm10[0],xmm7[1],xmm10[1],xmm7[2],xmm10[2],xmm7[3],xmm10[3],xmm7[4],xmm10[4],xmm7[5],xmm10[5],xmm7[6],xmm10[6],xmm7[7],xmm10[7]
991; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm6 = xmm6[0],xmm11[0],xmm6[1],xmm11[1],xmm6[2],xmm11[2],xmm6[3],xmm11[3],xmm6[4],xmm11[4],xmm6[5],xmm11[5],xmm6[6],xmm11[6],xmm6[7],xmm11[7]
992; SSSE3-NEXT:    punpcklwd {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[1],xmm7[1],xmm6[2],xmm7[2],xmm6[3],xmm7[3]
993; SSSE3-NEXT:    punpckldq {{.*#+}} xmm6 = xmm6[0],xmm3[0],xmm6[1],xmm3[1]
994; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm12[0],xmm5[1],xmm12[1],xmm5[2],xmm12[2],xmm5[3],xmm12[3],xmm5[4],xmm12[4],xmm5[5],xmm12[5],xmm5[6],xmm12[6],xmm5[7],xmm12[7]
995; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm13[0],xmm4[1],xmm13[1],xmm4[2],xmm13[2],xmm4[3],xmm13[3],xmm4[4],xmm13[4],xmm4[5],xmm13[5],xmm4[6],xmm13[6],xmm4[7],xmm13[7]
996; SSSE3-NEXT:    punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1],xmm4[2],xmm5[2],xmm4[3],xmm5[3]
997; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm14[0],xmm1[1],xmm14[1],xmm1[2],xmm14[2],xmm1[3],xmm14[3],xmm1[4],xmm14[4],xmm1[5],xmm14[5],xmm1[6],xmm14[6],xmm1[7],xmm14[7]
998; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
999; SSSE3-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
1000; SSSE3-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1]
1001; SSSE3-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm6[0]
1002; SSSE3-NEXT:    popq %rbx
1003; SSSE3-NEXT:    popq %r12
1004; SSSE3-NEXT:    popq %r13
1005; SSSE3-NEXT:    popq %r14
1006; SSSE3-NEXT:    popq %r15
1007; SSSE3-NEXT:    popq %rbp
1008; SSSE3-NEXT:    retq
1009;
1010; SSE41-LABEL: mem_shuffle_v16i8_v16i8_xxxxxxxxxxxxxxxx_i8:
1011; SSE41:       # %bb.0:
1012; SSE41-NEXT:    pushq %rbp
1013; SSE41-NEXT:    pushq %r15
1014; SSE41-NEXT:    pushq %r14
1015; SSE41-NEXT:    pushq %r13
1016; SSE41-NEXT:    pushq %r12
1017; SSE41-NEXT:    pushq %rbx
1018; SSE41-NEXT:    movzbl (%rdi), %r9d
1019; SSE41-NEXT:    andl $15, %r9d
1020; SSE41-NEXT:    movzbl 1(%rdi), %ebx
1021; SSE41-NEXT:    movzbl 2(%rdi), %eax
1022; SSE41-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
1023; SSE41-NEXT:    movzbl 3(%rdi), %r11d
1024; SSE41-NEXT:    movzbl 4(%rdi), %r14d
1025; SSE41-NEXT:    movzbl 5(%rdi), %r15d
1026; SSE41-NEXT:    movzbl 6(%rdi), %r12d
1027; SSE41-NEXT:    movzbl 7(%rdi), %r13d
1028; SSE41-NEXT:    movzbl 8(%rdi), %r10d
1029; SSE41-NEXT:    movzbl 9(%rdi), %r8d
1030; SSE41-NEXT:    movzbl 10(%rdi), %ecx
1031; SSE41-NEXT:    movzbl 11(%rdi), %edx
1032; SSE41-NEXT:    movzbl 12(%rdi), %esi
1033; SSE41-NEXT:    movzbl 13(%rdi), %ebp
1034; SSE41-NEXT:    movzbl 14(%rdi), %eax
1035; SSE41-NEXT:    movzbl 15(%rdi), %edi
1036; SSE41-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
1037; SSE41-NEXT:    movzbl -24(%rsp,%r9), %r9d
1038; SSE41-NEXT:    movd %r9d, %xmm0
1039; SSE41-NEXT:    andl $15, %ebx
1040; SSE41-NEXT:    pinsrb $1, -24(%rsp,%rbx), %xmm0
1041; SSE41-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rbx # 8-byte Reload
1042; SSE41-NEXT:    andl $15, %ebx
1043; SSE41-NEXT:    pinsrb $2, -24(%rsp,%rbx), %xmm0
1044; SSE41-NEXT:    andl $15, %r11d
1045; SSE41-NEXT:    pinsrb $3, -24(%rsp,%r11), %xmm0
1046; SSE41-NEXT:    andl $15, %r14d
1047; SSE41-NEXT:    pinsrb $4, -24(%rsp,%r14), %xmm0
1048; SSE41-NEXT:    andl $15, %r15d
1049; SSE41-NEXT:    pinsrb $5, -24(%rsp,%r15), %xmm0
1050; SSE41-NEXT:    andl $15, %r12d
1051; SSE41-NEXT:    pinsrb $6, -24(%rsp,%r12), %xmm0
1052; SSE41-NEXT:    andl $15, %r13d
1053; SSE41-NEXT:    pinsrb $7, -24(%rsp,%r13), %xmm0
1054; SSE41-NEXT:    andl $15, %r10d
1055; SSE41-NEXT:    pinsrb $8, -24(%rsp,%r10), %xmm0
1056; SSE41-NEXT:    andl $15, %r8d
1057; SSE41-NEXT:    pinsrb $9, -24(%rsp,%r8), %xmm0
1058; SSE41-NEXT:    andl $15, %ecx
1059; SSE41-NEXT:    pinsrb $10, -24(%rsp,%rcx), %xmm0
1060; SSE41-NEXT:    andl $15, %edx
1061; SSE41-NEXT:    pinsrb $11, -24(%rsp,%rdx), %xmm0
1062; SSE41-NEXT:    andl $15, %esi
1063; SSE41-NEXT:    pinsrb $12, -24(%rsp,%rsi), %xmm0
1064; SSE41-NEXT:    andl $15, %ebp
1065; SSE41-NEXT:    pinsrb $13, -24(%rsp,%rbp), %xmm0
1066; SSE41-NEXT:    andl $15, %eax
1067; SSE41-NEXT:    pinsrb $14, -24(%rsp,%rax), %xmm0
1068; SSE41-NEXT:    andl $15, %edi
1069; SSE41-NEXT:    pinsrb $15, -24(%rsp,%rdi), %xmm0
1070; SSE41-NEXT:    popq %rbx
1071; SSE41-NEXT:    popq %r12
1072; SSE41-NEXT:    popq %r13
1073; SSE41-NEXT:    popq %r14
1074; SSE41-NEXT:    popq %r15
1075; SSE41-NEXT:    popq %rbp
1076; SSE41-NEXT:    retq
1077;
1078; AVX-LABEL: mem_shuffle_v16i8_v16i8_xxxxxxxxxxxxxxxx_i8:
1079; AVX:       # %bb.0:
1080; AVX-NEXT:    pushq %rbp
1081; AVX-NEXT:    pushq %r15
1082; AVX-NEXT:    pushq %r14
1083; AVX-NEXT:    pushq %r13
1084; AVX-NEXT:    pushq %r12
1085; AVX-NEXT:    pushq %rbx
1086; AVX-NEXT:    movzbl (%rdi), %r9d
1087; AVX-NEXT:    andl $15, %r9d
1088; AVX-NEXT:    movzbl 1(%rdi), %ebx
1089; AVX-NEXT:    movzbl 2(%rdi), %eax
1090; AVX-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
1091; AVX-NEXT:    movzbl 3(%rdi), %r11d
1092; AVX-NEXT:    movzbl 4(%rdi), %r14d
1093; AVX-NEXT:    movzbl 5(%rdi), %r15d
1094; AVX-NEXT:    movzbl 6(%rdi), %r12d
1095; AVX-NEXT:    movzbl 7(%rdi), %r13d
1096; AVX-NEXT:    movzbl 8(%rdi), %r10d
1097; AVX-NEXT:    movzbl 9(%rdi), %r8d
1098; AVX-NEXT:    movzbl 10(%rdi), %ecx
1099; AVX-NEXT:    movzbl 11(%rdi), %edx
1100; AVX-NEXT:    movzbl 12(%rdi), %esi
1101; AVX-NEXT:    movzbl 13(%rdi), %ebp
1102; AVX-NEXT:    movzbl 14(%rdi), %eax
1103; AVX-NEXT:    movzbl 15(%rdi), %edi
1104; AVX-NEXT:    vmovaps %xmm0, -{{[0-9]+}}(%rsp)
1105; AVX-NEXT:    movzbl -24(%rsp,%r9), %r9d
1106; AVX-NEXT:    vmovd %r9d, %xmm0
1107; AVX-NEXT:    andl $15, %ebx
1108; AVX-NEXT:    vpinsrb $1, -24(%rsp,%rbx), %xmm0, %xmm0
1109; AVX-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rbx # 8-byte Reload
1110; AVX-NEXT:    andl $15, %ebx
1111; AVX-NEXT:    vpinsrb $2, -24(%rsp,%rbx), %xmm0, %xmm0
1112; AVX-NEXT:    andl $15, %r11d
1113; AVX-NEXT:    vpinsrb $3, -24(%rsp,%r11), %xmm0, %xmm0
1114; AVX-NEXT:    andl $15, %r14d
1115; AVX-NEXT:    vpinsrb $4, -24(%rsp,%r14), %xmm0, %xmm0
1116; AVX-NEXT:    andl $15, %r15d
1117; AVX-NEXT:    vpinsrb $5, -24(%rsp,%r15), %xmm0, %xmm0
1118; AVX-NEXT:    andl $15, %r12d
1119; AVX-NEXT:    vpinsrb $6, -24(%rsp,%r12), %xmm0, %xmm0
1120; AVX-NEXT:    andl $15, %r13d
1121; AVX-NEXT:    vpinsrb $7, -24(%rsp,%r13), %xmm0, %xmm0
1122; AVX-NEXT:    andl $15, %r10d
1123; AVX-NEXT:    vpinsrb $8, -24(%rsp,%r10), %xmm0, %xmm0
1124; AVX-NEXT:    andl $15, %r8d
1125; AVX-NEXT:    vpinsrb $9, -24(%rsp,%r8), %xmm0, %xmm0
1126; AVX-NEXT:    andl $15, %ecx
1127; AVX-NEXT:    vpinsrb $10, -24(%rsp,%rcx), %xmm0, %xmm0
1128; AVX-NEXT:    andl $15, %edx
1129; AVX-NEXT:    vpinsrb $11, -24(%rsp,%rdx), %xmm0, %xmm0
1130; AVX-NEXT:    andl $15, %esi
1131; AVX-NEXT:    vpinsrb $12, -24(%rsp,%rsi), %xmm0, %xmm0
1132; AVX-NEXT:    andl $15, %ebp
1133; AVX-NEXT:    vpinsrb $13, -24(%rsp,%rbp), %xmm0, %xmm0
1134; AVX-NEXT:    andl $15, %eax
1135; AVX-NEXT:    vpinsrb $14, -24(%rsp,%rax), %xmm0, %xmm0
1136; AVX-NEXT:    andl $15, %edi
1137; AVX-NEXT:    vpinsrb $15, -24(%rsp,%rdi), %xmm0, %xmm0
1138; AVX-NEXT:    popq %rbx
1139; AVX-NEXT:    popq %r12
1140; AVX-NEXT:    popq %r13
1141; AVX-NEXT:    popq %r14
1142; AVX-NEXT:    popq %r15
1143; AVX-NEXT:    popq %rbp
1144; AVX-NEXT:    retq
  ; Addresses of the sixteen consecutive index bytes.
1145  %p0  = getelementptr inbounds i8, i8* %i, i64 0
1146  %p1  = getelementptr inbounds i8, i8* %i, i64 1
1147  %p2  = getelementptr inbounds i8, i8* %i, i64 2
1148  %p3  = getelementptr inbounds i8, i8* %i, i64 3
1149  %p4  = getelementptr inbounds i8, i8* %i, i64 4
1150  %p5  = getelementptr inbounds i8, i8* %i, i64 5
1151  %p6  = getelementptr inbounds i8, i8* %i, i64 6
1152  %p7  = getelementptr inbounds i8, i8* %i, i64 7
1153  %p8  = getelementptr inbounds i8, i8* %i, i64 8
1154  %p9  = getelementptr inbounds i8, i8* %i, i64 9
1155  %p10 = getelementptr inbounds i8, i8* %i, i64 10
1156  %p11 = getelementptr inbounds i8, i8* %i, i64 11
1157  %p12 = getelementptr inbounds i8, i8* %i, i64 12
1158  %p13 = getelementptr inbounds i8, i8* %i, i64 13
1159  %p14 = getelementptr inbounds i8, i8* %i, i64 14
1160  %p15 = getelementptr inbounds i8, i8* %i, i64 15
  ; Load the run-time lane indices.
  ; NOTE(review): every byte load below is tagged `align 4`, yet %p0..%p15 are
  ; consecutive byte addresses and cannot all be 4-byte aligned. `align 1`
  ; looks like the intended value - confirm, and regenerate the assertions if
  ; this is changed.
1161  %i0  = load i8, i8* %p0 , align 4
1162  %i1  = load i8, i8* %p1 , align 4
1163  %i2  = load i8, i8* %p2 , align 4
1164  %i3  = load i8, i8* %p3 , align 4
1165  %i4  = load i8, i8* %p4 , align 4
1166  %i5  = load i8, i8* %p5 , align 4
1167  %i6  = load i8, i8* %p6 , align 4
1168  %i7  = load i8, i8* %p7 , align 4
1169  %i8  = load i8, i8* %p8 , align 4
1170  %i9  = load i8, i8* %p9 , align 4
1171  %i10 = load i8, i8* %p10, align 4
1172  %i11 = load i8, i8* %p11, align 4
1173  %i12 = load i8, i8* %p12, align 4
1174  %i13 = load i8, i8* %p13, align 4
1175  %i14 = load i8, i8* %p14, align 4
1176  %i15 = load i8, i8* %p15, align 4
  ; Variable-index extracts from the single source vector.
1177  %x0  = extractelement <16 x i8> %x, i8 %i0
1178  %x1  = extractelement <16 x i8> %x, i8 %i1
1179  %x2  = extractelement <16 x i8> %x, i8 %i2
1180  %x3  = extractelement <16 x i8> %x, i8 %i3
1181  %x4  = extractelement <16 x i8> %x, i8 %i4
1182  %x5  = extractelement <16 x i8> %x, i8 %i5
1183  %x6  = extractelement <16 x i8> %x, i8 %i6
1184  %x7  = extractelement <16 x i8> %x, i8 %i7
1185  %x8  = extractelement <16 x i8> %x, i8 %i8
1186  %x9  = extractelement <16 x i8> %x, i8 %i9
1187  %x10 = extractelement <16 x i8> %x, i8 %i10
1188  %x11 = extractelement <16 x i8> %x, i8 %i11
1189  %x12 = extractelement <16 x i8> %x, i8 %i12
1190  %x13 = extractelement <16 x i8> %x, i8 %i13
1191  %x14 = extractelement <16 x i8> %x, i8 %i14
1192  %x15 = extractelement <16 x i8> %x, i8 %i15
  ; Reassemble the extracted bytes into lanes 0..15 of the result.
1193  %r0  = insertelement <16 x i8> undef, i8 %x0 , i32 0
1194  %r1  = insertelement <16 x i8>  %r0 , i8 %x1 , i32 1
1195  %r2  = insertelement <16 x i8>  %r1 , i8 %x2 , i32 2
1196  %r3  = insertelement <16 x i8>  %r2 , i8 %x3 , i32 3
1197  %r4  = insertelement <16 x i8>  %r3 , i8 %x4 , i32 4
1198  %r5  = insertelement <16 x i8>  %r4 , i8 %x5 , i32 5
1199  %r6  = insertelement <16 x i8>  %r5 , i8 %x6 , i32 6
1200  %r7  = insertelement <16 x i8>  %r6 , i8 %x7 , i32 7
1201  %r8  = insertelement <16 x i8>  %r7 , i8 %x8 , i32 8
1202  %r9  = insertelement <16 x i8>  %r8 , i8 %x9 , i32 9
1203  %r10 = insertelement <16 x i8>  %r9 , i8 %x10, i32 10
1204  %r11 = insertelement <16 x i8>  %r10, i8 %x11, i32 11
1205  %r12 = insertelement <16 x i8>  %r11, i8 %x12, i32 12
1206  %r13 = insertelement <16 x i8>  %r12, i8 %x13, i32 13
1207  %r14 = insertelement <16 x i8>  %r13, i8 %x14, i32 14
1208  %r15 = insertelement <16 x i8>  %r14, i8 %x15, i32 15
1209  ret <16 x i8> %r15
1210}
1211
1212;
1213; Binary shuffle indices from registers
1214;
1215
; Binary variable shuffle of two <4 x float> sources with the lane indices
; passed as i32 arguments. The result is { %x[%i0], 0.0, %y[%i2], %x[%i3] }.
; %x1 (= %x[%i1]) is extracted but never inserted, and the expected code masks
; only %edi, %edx and %ecx with `andl $3`, leaving %esi untouched.
; Both sources are stored to the stack so lanes can be reloaded by index. The
; SSE2/SSSE3 checks combine the loaded lanes with unpcklps + movlhps; the
; SSE41 and AVX checks use insertps/vinsertps + movlhps/vmovlhps.
1216define <4 x float> @var_shuffle_v4f32_v4f32_x0yx_i32(<4 x float> %x, <4 x float> %y, i32 %i0, i32 %i1, i32 %i2, i32 %i3) nounwind {
1217; SSE2-LABEL: var_shuffle_v4f32_v4f32_x0yx_i32:
1218; SSE2:       # %bb.0:
1219; SSE2-NEXT:    # kill: def $ecx killed $ecx def $rcx
1220; SSE2-NEXT:    # kill: def $edx killed $edx def $rdx
1221; SSE2-NEXT:    # kill: def $edi killed $edi def $rdi
1222; SSE2-NEXT:    andl $3, %edi
1223; SSE2-NEXT:    movaps %xmm1, -{{[0-9]+}}(%rsp)
1224; SSE2-NEXT:    andl $3, %edx
1225; SSE2-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
1226; SSE2-NEXT:    andl $3, %ecx
1227; SSE2-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
1228; SSE2-NEXT:    movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
1229; SSE2-NEXT:    movss {{.*#+}} xmm2 = mem[0],zero,zero,zero
1230; SSE2-NEXT:    unpcklps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
1231; SSE2-NEXT:    movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
1232; SSE2-NEXT:    retq
1233;
1234; SSSE3-LABEL: var_shuffle_v4f32_v4f32_x0yx_i32:
1235; SSSE3:       # %bb.0:
1236; SSSE3-NEXT:    # kill: def $ecx killed $ecx def $rcx
1237; SSSE3-NEXT:    # kill: def $edx killed $edx def $rdx
1238; SSSE3-NEXT:    # kill: def $edi killed $edi def $rdi
1239; SSSE3-NEXT:    andl $3, %edi
1240; SSSE3-NEXT:    movaps %xmm1, -{{[0-9]+}}(%rsp)
1241; SSSE3-NEXT:    andl $3, %edx
1242; SSSE3-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
1243; SSSE3-NEXT:    andl $3, %ecx
1244; SSSE3-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
1245; SSSE3-NEXT:    movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
1246; SSSE3-NEXT:    movss {{.*#+}} xmm2 = mem[0],zero,zero,zero
1247; SSSE3-NEXT:    unpcklps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
1248; SSSE3-NEXT:    movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
1249; SSSE3-NEXT:    retq
1250;
1251; SSE41-LABEL: var_shuffle_v4f32_v4f32_x0yx_i32:
1252; SSE41:       # %bb.0:
1253; SSE41-NEXT:    # kill: def $ecx killed $ecx def $rcx
1254; SSE41-NEXT:    # kill: def $edx killed $edx def $rdx
1255; SSE41-NEXT:    # kill: def $edi killed $edi def $rdi
1256; SSE41-NEXT:    andl $3, %edi
1257; SSE41-NEXT:    movaps %xmm1, -{{[0-9]+}}(%rsp)
1258; SSE41-NEXT:    andl $3, %edx
1259; SSE41-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
1260; SSE41-NEXT:    andl $3, %ecx
1261; SSE41-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
1262; SSE41-NEXT:    movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
1263; SSE41-NEXT:    insertps {{.*#+}} xmm1 = xmm1[0],mem[0],zero,zero
1264; SSE41-NEXT:    movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
1265; SSE41-NEXT:    retq
1266;
1267; AVX-LABEL: var_shuffle_v4f32_v4f32_x0yx_i32:
1268; AVX:       # %bb.0:
1269; AVX-NEXT:    # kill: def $ecx killed $ecx def $rcx
1270; AVX-NEXT:    # kill: def $edx killed $edx def $rdx
1271; AVX-NEXT:    # kill: def $edi killed $edi def $rdi
1272; AVX-NEXT:    andl $3, %edi
1273; AVX-NEXT:    vmovaps %xmm1, -{{[0-9]+}}(%rsp)
1274; AVX-NEXT:    andl $3, %edx
1275; AVX-NEXT:    vmovaps %xmm0, -{{[0-9]+}}(%rsp)
1276; AVX-NEXT:    andl $3, %ecx
1277; AVX-NEXT:    vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
1278; AVX-NEXT:    vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
1279; AVX-NEXT:    vinsertps {{.*#+}} xmm1 = xmm1[0],mem[0],zero,zero
1280; AVX-NEXT:    vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
1281; AVX-NEXT:    retq
  ; Variable-index extracts: %x0, %x1 and %x3 read %x, %y2 reads %y.
  ; %x1 has no users below.
1282  %x0 = extractelement <4 x float> %x, i32 %i0
1283  %x1 = extractelement <4 x float> %x, i32 %i1
1284  %y2 = extractelement <4 x float> %y, i32 %i2
1285  %x3 = extractelement <4 x float> %x, i32 %i3
  ; Build the result; lane 1 is the constant 0.0 rather than an extracted value.
1286  %r0 = insertelement <4 x float> undef, float %x0, i32 0
1287  %r1 = insertelement <4 x float>   %r0, float 0.0, i32 1
1288  %r2 = insertelement <4 x float>   %r1, float %y2, i32 2
1289  %r3 = insertelement <4 x float>   %r2, float %x3, i32 3
1290  ret <4 x float> %r3
1291}
1292
; Variable-index shuffle of two <8 x i16> inputs into a single <8 x i16> result
; (the "xyxyxy00" pattern in the function name):
;   lanes 0, 2, 4 <- %x[%i0], %x[%i2], %x[%i4]
;   lanes 1, 3, 5 <- %y[%i1], %y[%i3], %y[%i5]
;   lanes 6, 7    <- constant 0
; %i6 and %i7 only feed the extracts %x6 and %x7, whose results are never inserted.
; In the expected output below, each of the six used indices is masked with
; `andl $7`, both input vectors are stored to the stack, and the selected
; elements are then loaded back one at a time from those stack slots.
1293define <8 x i16> @var_shuffle_v8i16_v8i16_xyxyxy00_i16(<8 x i16> %x, <8 x i16> %y, i16 %i0, i16 %i1, i16 %i2, i16 %i3, i16 %i4, i16 %i5, i16 %i6, i16 %i7) nounwind {
1294; SSE2-LABEL: var_shuffle_v8i16_v8i16_xyxyxy00_i16:
1295; SSE2:       # %bb.0:
1296; SSE2-NEXT:    # kill: def $r9d killed $r9d def $r9
1297; SSE2-NEXT:    # kill: def $r8d killed $r8d def $r8
1298; SSE2-NEXT:    # kill: def $ecx killed $ecx def $rcx
1299; SSE2-NEXT:    # kill: def $edx killed $edx def $rdx
1300; SSE2-NEXT:    # kill: def $esi killed $esi def $rsi
1301; SSE2-NEXT:    # kill: def $edi killed $edi def $rdi
1302; SSE2-NEXT:    andl $7, %edi
1303; SSE2-NEXT:    andl $7, %esi
1304; SSE2-NEXT:    andl $7, %edx
1305; SSE2-NEXT:    andl $7, %ecx
1306; SSE2-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
1307; SSE2-NEXT:    andl $7, %r8d
1308; SSE2-NEXT:    movaps %xmm1, -{{[0-9]+}}(%rsp)
1309; SSE2-NEXT:    andl $7, %r9d
1310; SSE2-NEXT:    movzwl -24(%rsp,%rcx,2), %eax
1311; SSE2-NEXT:    movd %eax, %xmm0
1312; SSE2-NEXT:    movzwl -40(%rsp,%rdx,2), %eax
1313; SSE2-NEXT:    movd %eax, %xmm1
1314; SSE2-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
1315; SSE2-NEXT:    movzwl -24(%rsp,%rsi,2), %eax
1316; SSE2-NEXT:    movd %eax, %xmm2
1317; SSE2-NEXT:    movzwl -40(%rsp,%rdi,2), %eax
1318; SSE2-NEXT:    movd %eax, %xmm0
1319; SSE2-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
1320; SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
1321; SSE2-NEXT:    movzwl -24(%rsp,%r9,2), %eax
1322; SSE2-NEXT:    movd %eax, %xmm1
1323; SSE2-NEXT:    movzwl -40(%rsp,%r8,2), %eax
1324; SSE2-NEXT:    movd %eax, %xmm2
1325; SSE2-NEXT:    punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
1326; SSE2-NEXT:    pxor %xmm1, %xmm1
1327; SSE2-NEXT:    movss {{.*#+}} xmm1 = xmm2[0],xmm1[1,2,3]
1328; SSE2-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
1329; SSE2-NEXT:    retq
1330;
1331; SSSE3-LABEL: var_shuffle_v8i16_v8i16_xyxyxy00_i16:
1332; SSSE3:       # %bb.0:
1333; SSSE3-NEXT:    # kill: def $r9d killed $r9d def $r9
1334; SSSE3-NEXT:    # kill: def $r8d killed $r8d def $r8
1335; SSSE3-NEXT:    # kill: def $ecx killed $ecx def $rcx
1336; SSSE3-NEXT:    # kill: def $edx killed $edx def $rdx
1337; SSSE3-NEXT:    # kill: def $esi killed $esi def $rsi
1338; SSSE3-NEXT:    # kill: def $edi killed $edi def $rdi
1339; SSSE3-NEXT:    andl $7, %edi
1340; SSSE3-NEXT:    andl $7, %esi
1341; SSSE3-NEXT:    andl $7, %edx
1342; SSSE3-NEXT:    andl $7, %ecx
1343; SSSE3-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
1344; SSSE3-NEXT:    andl $7, %r8d
1345; SSSE3-NEXT:    movaps %xmm1, -{{[0-9]+}}(%rsp)
1346; SSSE3-NEXT:    andl $7, %r9d
1347; SSSE3-NEXT:    movzwl -24(%rsp,%rcx,2), %eax
1348; SSSE3-NEXT:    movd %eax, %xmm0
1349; SSSE3-NEXT:    movzwl -40(%rsp,%rdx,2), %eax
1350; SSSE3-NEXT:    movd %eax, %xmm1
1351; SSSE3-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
1352; SSSE3-NEXT:    movzwl -24(%rsp,%rsi,2), %eax
1353; SSSE3-NEXT:    movd %eax, %xmm2
1354; SSSE3-NEXT:    movzwl -40(%rsp,%rdi,2), %eax
1355; SSSE3-NEXT:    movd %eax, %xmm0
1356; SSSE3-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
1357; SSSE3-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
1358; SSSE3-NEXT:    movzwl -24(%rsp,%r9,2), %eax
1359; SSSE3-NEXT:    movd %eax, %xmm1
1360; SSSE3-NEXT:    movzwl -40(%rsp,%r8,2), %eax
1361; SSSE3-NEXT:    movd %eax, %xmm2
1362; SSSE3-NEXT:    punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
1363; SSSE3-NEXT:    pxor %xmm1, %xmm1
1364; SSSE3-NEXT:    movss {{.*#+}} xmm1 = xmm2[0],xmm1[1,2,3]
1365; SSSE3-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
1366; SSSE3-NEXT:    retq
1367;
1368; SSE41-LABEL: var_shuffle_v8i16_v8i16_xyxyxy00_i16:
1369; SSE41:       # %bb.0:
1370; SSE41-NEXT:    # kill: def $r9d killed $r9d def $r9
1371; SSE41-NEXT:    # kill: def $r8d killed $r8d def $r8
1372; SSE41-NEXT:    # kill: def $ecx killed $ecx def $rcx
1373; SSE41-NEXT:    # kill: def $edx killed $edx def $rdx
1374; SSE41-NEXT:    # kill: def $esi killed $esi def $rsi
1375; SSE41-NEXT:    # kill: def $edi killed $edi def $rdi
1376; SSE41-NEXT:    andl $7, %edi
1377; SSE41-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
1378; SSE41-NEXT:    movzwl -40(%rsp,%rdi,2), %eax
1379; SSE41-NEXT:    andl $7, %esi
1380; SSE41-NEXT:    andl $7, %edx
1381; SSE41-NEXT:    andl $7, %ecx
1382; SSE41-NEXT:    andl $7, %r8d
1383; SSE41-NEXT:    movaps %xmm1, -{{[0-9]+}}(%rsp)
1384; SSE41-NEXT:    andl $7, %r9d
1385; SSE41-NEXT:    movd %eax, %xmm0
1386; SSE41-NEXT:    pinsrw $1, -24(%rsp,%rsi,2), %xmm0
1387; SSE41-NEXT:    pinsrw $2, -40(%rsp,%rdx,2), %xmm0
1388; SSE41-NEXT:    pinsrw $3, -24(%rsp,%rcx,2), %xmm0
1389; SSE41-NEXT:    pinsrw $4, -40(%rsp,%r8,2), %xmm0
1390; SSE41-NEXT:    pinsrw $5, -24(%rsp,%r9,2), %xmm0
1391; SSE41-NEXT:    retq
1392;
1393; AVX-LABEL: var_shuffle_v8i16_v8i16_xyxyxy00_i16:
1394; AVX:       # %bb.0:
1395; AVX-NEXT:    # kill: def $r9d killed $r9d def $r9
1396; AVX-NEXT:    # kill: def $r8d killed $r8d def $r8
1397; AVX-NEXT:    # kill: def $ecx killed $ecx def $rcx
1398; AVX-NEXT:    # kill: def $edx killed $edx def $rdx
1399; AVX-NEXT:    # kill: def $esi killed $esi def $rsi
1400; AVX-NEXT:    # kill: def $edi killed $edi def $rdi
1401; AVX-NEXT:    andl $7, %edi
1402; AVX-NEXT:    vmovaps %xmm0, -{{[0-9]+}}(%rsp)
1403; AVX-NEXT:    movzwl -40(%rsp,%rdi,2), %eax
1404; AVX-NEXT:    andl $7, %esi
1405; AVX-NEXT:    andl $7, %edx
1406; AVX-NEXT:    andl $7, %ecx
1407; AVX-NEXT:    andl $7, %r8d
1408; AVX-NEXT:    vmovaps %xmm1, -{{[0-9]+}}(%rsp)
1409; AVX-NEXT:    andl $7, %r9d
1410; AVX-NEXT:    vmovd %eax, %xmm0
1411; AVX-NEXT:    vpinsrw $1, -24(%rsp,%rsi,2), %xmm0, %xmm0
1412; AVX-NEXT:    vpinsrw $2, -40(%rsp,%rdx,2), %xmm0, %xmm0
1413; AVX-NEXT:    vpinsrw $3, -24(%rsp,%rcx,2), %xmm0, %xmm0
1414; AVX-NEXT:    vpinsrw $4, -40(%rsp,%r8,2), %xmm0, %xmm0
1415; AVX-NEXT:    vpinsrw $5, -24(%rsp,%r9,2), %xmm0, %xmm0
1416; AVX-NEXT:    retq
  ; Variable-index element reads, alternating between %x and %y for lanes 0-5.
  ; %x6 and %x7 are computed here but have no users in this function.
1417  %x0 = extractelement <8 x i16> %x, i16 %i0
1418  %y1 = extractelement <8 x i16> %y, i16 %i1
1419  %x2 = extractelement <8 x i16> %x, i16 %i2
1420  %y3 = extractelement <8 x i16> %y, i16 %i3
1421  %x4 = extractelement <8 x i16> %x, i16 %i4
1422  %y5 = extractelement <8 x i16> %y, i16 %i5
1423  %x6 = extractelement <8 x i16> %x, i16 %i6
1424  %x7 = extractelement <8 x i16> %x, i16 %i7
  ; Assemble the result one lane at a time, starting from undef; lanes 6 and 7
  ; receive the literal 0 rather than an extracted element.
1425  %r0 = insertelement <8 x i16> undef, i16 %x0, i32 0
1426  %r1 = insertelement <8 x i16>   %r0, i16 %y1, i32 1
1427  %r2 = insertelement <8 x i16>   %r1, i16 %x2, i32 2
1428  %r3 = insertelement <8 x i16>   %r2, i16 %y3, i32 3
1429  %r4 = insertelement <8 x i16>   %r3, i16 %x4, i32 4
1430  %r5 = insertelement <8 x i16>   %r4, i16 %y5, i32 5
1431  %r6 = insertelement <8 x i16>   %r5, i16   0, i32 6
1432  %r7 = insertelement <8 x i16>   %r6, i16   0, i32 7
1433  ret <8 x i16> %r7
1434}
1435