; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefixes=AVX,AVX1
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=AVX,AVX2-SLOW
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2,+fast-variable-shuffle | FileCheck %s --check-prefixes=AVX,AVX2-FAST
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f | FileCheck %s --check-prefixes=AVX512,AVX512F
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vl,+fast-variable-shuffle | FileCheck %s --check-prefixes=AVX512,AVX512VL
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw,+fast-variable-shuffle | FileCheck %s --check-prefixes=AVX512,AVX512BW
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw,+avx512vl,+fast-variable-shuffle | FileCheck %s --check-prefixes=AVX512,AVX512BWVL

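; Stride 2, offset 1: extract the odd bytes of a <32 x i8> load into <16 x i8>.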
define void @shuffle_v32i8_to_v16i8_1(<32 x i8>* %L, <16 x i8>* %S) nounwind {
; AVX-LABEL: shuffle_v32i8_to_v16i8_1:
; AVX:       # %bb.0:
; AVX-NEXT:    vmovdqa (%rdi), %xmm0
; AVX-NEXT:    vmovdqa 16(%rdi), %xmm1
; AVX-NEXT:    vmovdqa {{.*#+}} xmm2 = <1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u>
; AVX-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
; AVX-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
; AVX-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX-NEXT:    vmovdqa %xmm0, (%rsi)
; AVX-NEXT:    retq
;
; AVX512-LABEL: shuffle_v32i8_to_v16i8_1:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vmovdqa (%rdi), %xmm0
; AVX512-NEXT:    vmovdqa 16(%rdi), %xmm1
; AVX512-NEXT:    vmovdqa {{.*#+}} xmm2 = <1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u>
; AVX512-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
; AVX512-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
; AVX512-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX512-NEXT:    vmovdqa %xmm0, (%rsi)
; AVX512-NEXT:    retq
  %vec = load <32 x i8>, <32 x i8>* %L
  %strided.vec = shufflevector <32 x i8> %vec, <32 x i8> undef, <16 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15, i32 17, i32 19, i32 21, i32 23, i32 25, i32 27, i32 29, i32 31>
  store <16 x i8> %strided.vec, <16 x i8>* %S
  ret void
}

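; Stride 2, offset 1: extract the odd words of a <16 x i16> load into <8 x i16>.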
define void @shuffle_v16i16_to_v8i16_1(<16 x i16>* %L, <8 x i16>* %S) nounwind {
; AVX-LABEL: shuffle_v16i16_to_v8i16_1:
; AVX:       # %bb.0:
; AVX-NEXT:    vmovdqa (%rdi), %xmm0
; AVX-NEXT:    vmovdqa 16(%rdi), %xmm1
; AVX-NEXT:    vmovdqa {{.*#+}} xmm2 = [2,3,6,7,10,11,14,15,14,15,10,11,12,13,14,15]
; AVX-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
; AVX-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
; AVX-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX-NEXT:    vmovdqa %xmm0, (%rsi)
; AVX-NEXT:    retq
;
; AVX512F-LABEL: shuffle_v16i16_to_v8i16_1:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    vmovdqa (%rdi), %xmm0
; AVX512F-NEXT:    vmovdqa 16(%rdi), %xmm1
; AVX512F-NEXT:    vmovdqa {{.*#+}} xmm2 = [2,3,6,7,10,11,14,15,14,15,10,11,12,13,14,15]
; AVX512F-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
; AVX512F-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
; AVX512F-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX512F-NEXT:    vmovdqa %xmm0, (%rsi)
; AVX512F-NEXT:    retq
;
; AVX512VL-LABEL: shuffle_v16i16_to_v8i16_1:
; AVX512VL:       # %bb.0:
; AVX512VL-NEXT:    vmovdqa (%rdi), %xmm0
; AVX512VL-NEXT:    vmovdqa 16(%rdi), %xmm1
; AVX512VL-NEXT:    vmovdqa {{.*#+}} xmm2 = [2,3,6,7,10,11,14,15,14,15,10,11,12,13,14,15]
; AVX512VL-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
; AVX512VL-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
; AVX512VL-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX512VL-NEXT:    vmovdqa %xmm0, (%rsi)
; AVX512VL-NEXT:    retq
;
; AVX512BW-LABEL: shuffle_v16i16_to_v8i16_1:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    vmovdqa {{.*#+}} xmm0 = [1,3,5,7,33,35,37,39]
; AVX512BW-NEXT:    vmovdqa (%rdi), %xmm1
; AVX512BW-NEXT:    vmovdqa 16(%rdi), %xmm2
; AVX512BW-NEXT:    vpermt2w %zmm2, %zmm0, %zmm1
; AVX512BW-NEXT:    vmovdqa %xmm1, (%rsi)
; AVX512BW-NEXT:    vzeroupper
; AVX512BW-NEXT:    retq
;
; AVX512BWVL-LABEL: shuffle_v16i16_to_v8i16_1:
; AVX512BWVL:       # %bb.0:
; AVX512BWVL-NEXT:    vmovdqa (%rdi), %xmm0
; AVX512BWVL-NEXT:    vmovdqa {{.*#+}} xmm1 = [1,3,5,7,9,11,13,15]
; AVX512BWVL-NEXT:    vpermi2w 16(%rdi), %xmm0, %xmm1
; AVX512BWVL-NEXT:    vmovdqa %xmm1, (%rsi)
; AVX512BWVL-NEXT:    retq
  %vec = load <16 x i16>, <16 x i16>* %L
  %strided.vec = shufflevector <16 x i16> %vec, <16 x i16> undef, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
  store <8 x i16> %strided.vec, <8 x i16>* %S
  ret void
}

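; Stride 2, offset 1: extract the odd dwords of an <8 x i32> load into <4 x i32>.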
define void @shuffle_v8i32_to_v4i32_1(<8 x i32>* %L, <4 x i32>* %S) nounwind {
; AVX-LABEL: shuffle_v8i32_to_v4i32_1:
; AVX:       # %bb.0:
; AVX-NEXT:    vmovaps (%rdi), %xmm0
; AVX-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[1,3],mem[1,3]
; AVX-NEXT:    vmovaps %xmm0, (%rsi)
; AVX-NEXT:    retq
;
; AVX512-LABEL: shuffle_v8i32_to_v4i32_1:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vmovaps (%rdi), %xmm0
; AVX512-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[1,3],mem[1,3]
; AVX512-NEXT:    vmovaps %xmm0, (%rsi)
; AVX512-NEXT:    retq
  %vec = load <8 x i32>, <8 x i32>* %L
  %strided.vec = shufflevector <8 x i32> %vec, <8 x i32> undef, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
  store <4 x i32> %strided.vec, <4 x i32>* %S
  ret void
}

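; Stride 4, offsets 1-3: extract every fourth byte of a <32 x i8> load into <8 x i8>.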
define void @shuffle_v32i8_to_v8i8_1(<32 x i8>* %L, <8 x i8>* %S) nounwind {
; AVX-LABEL: shuffle_v32i8_to_v8i8_1:
; AVX:       # %bb.0:
; AVX-NEXT:    vmovdqa (%rdi), %xmm0
; AVX-NEXT:    vmovdqa 16(%rdi), %xmm1
; AVX-NEXT:    vmovdqa {{.*#+}} xmm2 = <1,5,9,13,u,u,u,u,u,u,u,u,u,u,u,u>
; AVX-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
; AVX-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
; AVX-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; AVX-NEXT:    vmovq %xmm0, (%rsi)
; AVX-NEXT:    retq
;
; AVX512-LABEL: shuffle_v32i8_to_v8i8_1:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vmovdqa (%rdi), %xmm0
; AVX512-NEXT:    vmovdqa 16(%rdi), %xmm1
; AVX512-NEXT:    vmovdqa {{.*#+}} xmm2 = <1,5,9,13,u,u,u,u,u,u,u,u,u,u,u,u>
; AVX512-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
; AVX512-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
; AVX512-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; AVX512-NEXT:    vmovq %xmm0, (%rsi)
; AVX512-NEXT:    retq
  %vec = load <32 x i8>, <32 x i8>* %L
  %strided.vec = shufflevector <32 x i8> %vec, <32 x i8> undef, <8 x i32> <i32 1, i32 5, i32 9, i32 13, i32 17, i32 21, i32 25, i32 29>
  store <8 x i8> %strided.vec, <8 x i8>* %S
  ret void
}

define void @shuffle_v32i8_to_v8i8_2(<32 x i8>* %L, <8 x i8>* %S) nounwind {
; AVX-LABEL: shuffle_v32i8_to_v8i8_2:
; AVX:       # %bb.0:
; AVX-NEXT:    vmovdqa (%rdi), %xmm0
; AVX-NEXT:    vmovdqa 16(%rdi), %xmm1
; AVX-NEXT:    vmovdqa {{.*#+}} xmm2 = <2,6,10,14,u,u,u,u,u,u,u,u,u,u,u,u>
; AVX-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
; AVX-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
; AVX-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; AVX-NEXT:    vmovq %xmm0, (%rsi)
; AVX-NEXT:    retq
;
; AVX512-LABEL: shuffle_v32i8_to_v8i8_2:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vmovdqa (%rdi), %xmm0
; AVX512-NEXT:    vmovdqa 16(%rdi), %xmm1
; AVX512-NEXT:    vmovdqa {{.*#+}} xmm2 = <2,6,10,14,u,u,u,u,u,u,u,u,u,u,u,u>
; AVX512-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
; AVX512-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
; AVX512-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; AVX512-NEXT:    vmovq %xmm0, (%rsi)
; AVX512-NEXT:    retq
  %vec = load <32 x i8>, <32 x i8>* %L
  %strided.vec = shufflevector <32 x i8> %vec, <32 x i8> undef, <8 x i32> <i32 2, i32 6, i32 10, i32 14, i32 18, i32 22, i32 26, i32 30>
  store <8 x i8> %strided.vec, <8 x i8>* %S
  ret void
}

define void @shuffle_v32i8_to_v8i8_3(<32 x i8>* %L, <8 x i8>* %S) nounwind {
; AVX-LABEL: shuffle_v32i8_to_v8i8_3:
; AVX:       # %bb.0:
; AVX-NEXT:    vmovdqa (%rdi), %xmm0
; AVX-NEXT:    vmovdqa 16(%rdi), %xmm1
; AVX-NEXT:    vmovdqa {{.*#+}} xmm2 = <3,7,11,15,u,u,u,u,u,u,u,u,u,u,u,u>
; AVX-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
; AVX-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
; AVX-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; AVX-NEXT:    vmovq %xmm0, (%rsi)
; AVX-NEXT:    retq
;
; AVX512-LABEL: shuffle_v32i8_to_v8i8_3:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vmovdqa (%rdi), %xmm0
; AVX512-NEXT:    vmovdqa 16(%rdi), %xmm1
; AVX512-NEXT:    vmovdqa {{.*#+}} xmm2 = <3,7,11,15,u,u,u,u,u,u,u,u,u,u,u,u>
; AVX512-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
; AVX512-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
; AVX512-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; AVX512-NEXT:    vmovq %xmm0, (%rsi)
; AVX512-NEXT:    retq
  %vec = load <32 x i8>, <32 x i8>* %L
  %strided.vec = shufflevector <32 x i8> %vec, <32 x i8> undef, <8 x i32> <i32 3, i32 7, i32 11, i32 15, i32 19, i32 23, i32 27, i32 31>
  store <8 x i8> %strided.vec, <8 x i8>* %S
  ret void
}

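; Stride 4, offsets 1-3: extract every fourth word of a <16 x i16> load into <4 x i16>.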
define void @shuffle_v16i16_to_v4i16_1(<16 x i16>* %L, <4 x i16>* %S) nounwind {
; AVX1-LABEL: shuffle_v16i16_to_v4i16_1:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = mem[0,2,2,3]
; AVX1-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[1,3,2,3,4,5,6,7]
; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = mem[0,2,2,3]
; AVX1-NEXT:    vpshuflw {{.*#+}} xmm1 = xmm1[1,3,2,3,4,5,6,7]
; AVX1-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
; AVX1-NEXT:    vmovq %xmm0, (%rsi)
; AVX1-NEXT:    retq
;
; AVX2-SLOW-LABEL: shuffle_v16i16_to_v4i16_1:
; AVX2-SLOW:       # %bb.0:
; AVX2-SLOW-NEXT:    vpshufd {{.*#+}} xmm0 = mem[0,2,2,3]
; AVX2-SLOW-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[1,3,2,3,4,5,6,7]
; AVX2-SLOW-NEXT:    vpshufd {{.*#+}} xmm1 = mem[0,2,2,3]
; AVX2-SLOW-NEXT:    vpshuflw {{.*#+}} xmm1 = xmm1[1,3,2,3,4,5,6,7]
; AVX2-SLOW-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
; AVX2-SLOW-NEXT:    vmovq %xmm0, (%rsi)
; AVX2-SLOW-NEXT:    retq
;
; AVX2-FAST-LABEL: shuffle_v16i16_to_v4i16_1:
; AVX2-FAST:       # %bb.0:
; AVX2-FAST-NEXT:    vmovdqa (%rdi), %xmm0
; AVX2-FAST-NEXT:    vmovdqa 16(%rdi), %xmm1
; AVX2-FAST-NEXT:    vmovdqa {{.*#+}} xmm2 = [2,3,10,11,8,9,10,11,8,9,10,11,12,13,14,15]
; AVX2-FAST-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
; AVX2-FAST-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
; AVX2-FAST-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; AVX2-FAST-NEXT:    vmovq %xmm0, (%rsi)
; AVX2-FAST-NEXT:    retq
;
; AVX512F-LABEL: shuffle_v16i16_to_v4i16_1:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    vpshufd {{.*#+}} xmm0 = mem[0,2,2,3]
; AVX512F-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[1,3,2,3,4,5,6,7]
; AVX512F-NEXT:    vpshufd {{.*#+}} xmm1 = mem[0,2,2,3]
; AVX512F-NEXT:    vpshuflw {{.*#+}} xmm1 = xmm1[1,3,2,3,4,5,6,7]
; AVX512F-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
; AVX512F-NEXT:    vmovq %xmm0, (%rsi)
; AVX512F-NEXT:    retq
;
; AVX512VL-LABEL: shuffle_v16i16_to_v4i16_1:
; AVX512VL:       # %bb.0:
; AVX512VL-NEXT:    vmovdqa (%rdi), %xmm0
; AVX512VL-NEXT:    vmovdqa 16(%rdi), %xmm1
; AVX512VL-NEXT:    vmovdqa {{.*#+}} xmm2 = [2,3,10,11,8,9,10,11,8,9,10,11,12,13,14,15]
; AVX512VL-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
; AVX512VL-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
; AVX512VL-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; AVX512VL-NEXT:    vmovq %xmm0, (%rsi)
; AVX512VL-NEXT:    retq
;
; AVX512BW-LABEL: shuffle_v16i16_to_v4i16_1:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    vmovdqa {{.*#+}} xmm0 = [1,5,33,37,4,5,36,37]
; AVX512BW-NEXT:    vmovdqa (%rdi), %xmm1
; AVX512BW-NEXT:    vmovdqa 16(%rdi), %xmm2
; AVX512BW-NEXT:    vpermt2w %zmm2, %zmm0, %zmm1
; AVX512BW-NEXT:    vmovq %xmm1, (%rsi)
; AVX512BW-NEXT:    vzeroupper
; AVX512BW-NEXT:    retq
;
; AVX512BWVL-LABEL: shuffle_v16i16_to_v4i16_1:
; AVX512BWVL:       # %bb.0:
; AVX512BWVL-NEXT:    vmovdqa (%rdi), %xmm0
; AVX512BWVL-NEXT:    vmovdqa {{.*#+}} xmm1 = <1,5,9,13,u,u,u,u>
; AVX512BWVL-NEXT:    vpermi2w 16(%rdi), %xmm0, %xmm1
; AVX512BWVL-NEXT:    vmovq %xmm1, (%rsi)
; AVX512BWVL-NEXT:    retq
  %vec = load <16 x i16>, <16 x i16>* %L
  %strided.vec = shufflevector <16 x i16> %vec, <16 x i16> undef, <4 x i32> <i32 1, i32 5, i32 9, i32 13>
  store <4 x i16> %strided.vec, <4 x i16>* %S
  ret void
}

define void @shuffle_v16i16_to_v4i16_2(<16 x i16>* %L, <4 x i16>* %S) nounwind {
; AVX1-LABEL: shuffle_v16i16_to_v4i16_2:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = mem[3,1,2,3]
; AVX1-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[2,0,2,3,4,5,6,7]
; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = mem[3,1,2,3]
; AVX1-NEXT:    vpshuflw {{.*#+}} xmm1 = xmm1[2,0,2,3,4,5,6,7]
; AVX1-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
; AVX1-NEXT:    vmovq %xmm0, (%rsi)
; AVX1-NEXT:    retq
;
; AVX2-SLOW-LABEL: shuffle_v16i16_to_v4i16_2:
; AVX2-SLOW:       # %bb.0:
; AVX2-SLOW-NEXT:    vpshufd {{.*#+}} xmm0 = mem[3,1,2,3]
; AVX2-SLOW-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[2,0,2,3,4,5,6,7]
; AVX2-SLOW-NEXT:    vpshufd {{.*#+}} xmm1 = mem[3,1,2,3]
; AVX2-SLOW-NEXT:    vpshuflw {{.*#+}} xmm1 = xmm1[2,0,2,3,4,5,6,7]
; AVX2-SLOW-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
; AVX2-SLOW-NEXT:    vmovq %xmm0, (%rsi)
; AVX2-SLOW-NEXT:    retq
;
; AVX2-FAST-LABEL: shuffle_v16i16_to_v4i16_2:
; AVX2-FAST:       # %bb.0:
; AVX2-FAST-NEXT:    vmovdqa (%rdi), %xmm0
; AVX2-FAST-NEXT:    vmovdqa 16(%rdi), %xmm1
; AVX2-FAST-NEXT:    vmovdqa {{.*#+}} xmm2 = [4,5,12,13,4,5,6,7,8,9,10,11,12,13,14,15]
; AVX2-FAST-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
; AVX2-FAST-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
; AVX2-FAST-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; AVX2-FAST-NEXT:    vmovq %xmm0, (%rsi)
; AVX2-FAST-NEXT:    retq
;
; AVX512F-LABEL: shuffle_v16i16_to_v4i16_2:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    vpshufd {{.*#+}} xmm0 = mem[3,1,2,3]
; AVX512F-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[2,0,2,3,4,5,6,7]
; AVX512F-NEXT:    vpshufd {{.*#+}} xmm1 = mem[3,1,2,3]
; AVX512F-NEXT:    vpshuflw {{.*#+}} xmm1 = xmm1[2,0,2,3,4,5,6,7]
; AVX512F-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
; AVX512F-NEXT:    vmovq %xmm0, (%rsi)
; AVX512F-NEXT:    retq
;
; AVX512VL-LABEL: shuffle_v16i16_to_v4i16_2:
; AVX512VL:       # %bb.0:
; AVX512VL-NEXT:    vmovdqa (%rdi), %xmm0
; AVX512VL-NEXT:    vmovdqa 16(%rdi), %xmm1
; AVX512VL-NEXT:    vmovdqa {{.*#+}} xmm2 = [4,5,12,13,4,5,6,7,8,9,10,11,12,13,14,15]
; AVX512VL-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
; AVX512VL-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
; AVX512VL-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; AVX512VL-NEXT:    vmovq %xmm0, (%rsi)
; AVX512VL-NEXT:    retq
;
; AVX512BW-LABEL: shuffle_v16i16_to_v4i16_2:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    vmovdqa {{.*#+}} xmm0 = [2,6,34,38,2,3,34,35]
; AVX512BW-NEXT:    vmovdqa (%rdi), %xmm1
; AVX512BW-NEXT:    vmovdqa 16(%rdi), %xmm2
; AVX512BW-NEXT:    vpermt2w %zmm2, %zmm0, %zmm1
; AVX512BW-NEXT:    vmovq %xmm1, (%rsi)
; AVX512BW-NEXT:    vzeroupper
; AVX512BW-NEXT:    retq
;
; AVX512BWVL-LABEL: shuffle_v16i16_to_v4i16_2:
; AVX512BWVL:       # %bb.0:
; AVX512BWVL-NEXT:    vmovdqa (%rdi), %xmm0
; AVX512BWVL-NEXT:    vmovdqa {{.*#+}} xmm1 = <2,6,10,14,u,u,u,u>
; AVX512BWVL-NEXT:    vpermi2w 16(%rdi), %xmm0, %xmm1
; AVX512BWVL-NEXT:    vmovq %xmm1, (%rsi)
; AVX512BWVL-NEXT:    retq
  %vec = load <16 x i16>, <16 x i16>* %L
  %strided.vec = shufflevector <16 x i16> %vec, <16 x i16> undef, <4 x i32> <i32 2, i32 6, i32 10, i32 14>
  store <4 x i16> %strided.vec, <4 x i16>* %S
  ret void
}

define void @shuffle_v16i16_to_v4i16_3(<16 x i16>* %L, <4 x i16>* %S) nounwind {
; AVX1-LABEL: shuffle_v16i16_to_v4i16_3:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = mem[3,1,2,3]
; AVX1-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[3,1,2,3,4,5,6,7]
; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = mem[3,1,2,3]
; AVX1-NEXT:    vpshuflw {{.*#+}} xmm1 = xmm1[3,1,2,3,4,5,6,7]
; AVX1-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
; AVX1-NEXT:    vmovq %xmm0, (%rsi)
; AVX1-NEXT:    retq
;
; AVX2-SLOW-LABEL: shuffle_v16i16_to_v4i16_3:
; AVX2-SLOW:       # %bb.0:
; AVX2-SLOW-NEXT:    vpshufd {{.*#+}} xmm0 = mem[3,1,2,3]
; AVX2-SLOW-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[3,1,2,3,4,5,6,7]
; AVX2-SLOW-NEXT:    vpshufd {{.*#+}} xmm1 = mem[3,1,2,3]
; AVX2-SLOW-NEXT:    vpshuflw {{.*#+}} xmm1 = xmm1[3,1,2,3,4,5,6,7]
; AVX2-SLOW-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
; AVX2-SLOW-NEXT:    vmovq %xmm0, (%rsi)
; AVX2-SLOW-NEXT:    retq
;
; AVX2-FAST-LABEL: shuffle_v16i16_to_v4i16_3:
; AVX2-FAST:       # %bb.0:
; AVX2-FAST-NEXT:    vmovdqa (%rdi), %xmm0
; AVX2-FAST-NEXT:    vmovdqa 16(%rdi), %xmm1
; AVX2-FAST-NEXT:    vmovdqa {{.*#+}} xmm2 = [6,7,14,15,4,5,6,7,8,9,10,11,12,13,14,15]
; AVX2-FAST-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
; AVX2-FAST-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
; AVX2-FAST-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; AVX2-FAST-NEXT:    vmovq %xmm0, (%rsi)
; AVX2-FAST-NEXT:    retq
;
; AVX512F-LABEL: shuffle_v16i16_to_v4i16_3:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    vpshufd {{.*#+}} xmm0 = mem[3,1,2,3]
; AVX512F-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[3,1,2,3,4,5,6,7]
; AVX512F-NEXT:    vpshufd {{.*#+}} xmm1 = mem[3,1,2,3]
; AVX512F-NEXT:    vpshuflw {{.*#+}} xmm1 = xmm1[3,1,2,3,4,5,6,7]
; AVX512F-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
; AVX512F-NEXT:    vmovq %xmm0, (%rsi)
; AVX512F-NEXT:    retq
;
; AVX512VL-LABEL: shuffle_v16i16_to_v4i16_3:
; AVX512VL:       # %bb.0:
; AVX512VL-NEXT:    vmovdqa (%rdi), %xmm0
; AVX512VL-NEXT:    vmovdqa 16(%rdi), %xmm1
; AVX512VL-NEXT:    vmovdqa {{.*#+}} xmm2 = [6,7,14,15,4,5,6,7,8,9,10,11,12,13,14,15]
; AVX512VL-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
; AVX512VL-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
; AVX512VL-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; AVX512VL-NEXT:    vmovq %xmm0, (%rsi)
; AVX512VL-NEXT:    retq
;
; AVX512BW-LABEL: shuffle_v16i16_to_v4i16_3:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    vmovdqa {{.*#+}} xmm0 = [3,7,35,39,2,3,34,35]
; AVX512BW-NEXT:    vmovdqa (%rdi), %xmm1
; AVX512BW-NEXT:    vmovdqa 16(%rdi), %xmm2
; AVX512BW-NEXT:    vpermt2w %zmm2, %zmm0, %zmm1
; AVX512BW-NEXT:    vmovq %xmm1, (%rsi)
; AVX512BW-NEXT:    vzeroupper
; AVX512BW-NEXT:    retq
;
; AVX512BWVL-LABEL: shuffle_v16i16_to_v4i16_3:
; AVX512BWVL:       # %bb.0:
; AVX512BWVL-NEXT:    vmovdqa (%rdi), %xmm0
; AVX512BWVL-NEXT:    vmovdqa {{.*#+}} xmm1 = <3,7,11,15,u,u,u,u>
; AVX512BWVL-NEXT:    vpermi2w 16(%rdi), %xmm0, %xmm1
; AVX512BWVL-NEXT:    vmovq %xmm1, (%rsi)
; AVX512BWVL-NEXT:    retq
  %vec = load <16 x i16>, <16 x i16>* %L
  %strided.vec = shufflevector <16 x i16> %vec, <16 x i16> undef, <4 x i32> <i32 3, i32 7, i32 11, i32 15>
  store <4 x i16> %strided.vec, <4 x i16>* %S
  ret void
}

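; Stride 8, offsets 1-7: extract every eighth byte of a <32 x i8> load into <4 x i8>.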
define void @shuffle_v32i8_to_v4i8_1(<32 x i8>* %L, <4 x i8>* %S) nounwind {
; AVX-LABEL: shuffle_v32i8_to_v4i8_1:
; AVX:       # %bb.0:
; AVX-NEXT:    vmovdqa (%rdi), %xmm0
; AVX-NEXT:    vmovdqa 16(%rdi), %xmm1
; AVX-NEXT:    vmovdqa {{.*#+}} xmm2 = <1,9,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
; AVX-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
; AVX-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
; AVX-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; AVX-NEXT:    vmovd %xmm0, (%rsi)
; AVX-NEXT:    retq
;
; AVX512-LABEL: shuffle_v32i8_to_v4i8_1:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vmovdqa (%rdi), %xmm0
; AVX512-NEXT:    vmovdqa 16(%rdi), %xmm1
; AVX512-NEXT:    vmovdqa {{.*#+}} xmm2 = <1,9,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
; AVX512-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
; AVX512-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
; AVX512-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; AVX512-NEXT:    vmovd %xmm0, (%rsi)
; AVX512-NEXT:    retq
  %vec = load <32 x i8>, <32 x i8>* %L
  %strided.vec = shufflevector <32 x i8> %vec, <32 x i8> undef, <4 x i32> <i32 1, i32 9, i32 17, i32 25>
  store <4 x i8> %strided.vec, <4 x i8>* %S
  ret void
}

define void @shuffle_v32i8_to_v4i8_2(<32 x i8>* %L, <4 x i8>* %S) nounwind {
; AVX-LABEL: shuffle_v32i8_to_v4i8_2:
; AVX:       # %bb.0:
; AVX-NEXT:    vmovdqa (%rdi), %xmm0
; AVX-NEXT:    vmovdqa 16(%rdi), %xmm1
; AVX-NEXT:    vmovdqa {{.*#+}} xmm2 = <2,10,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
; AVX-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
; AVX-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
; AVX-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; AVX-NEXT:    vmovd %xmm0, (%rsi)
; AVX-NEXT:    retq
;
; AVX512-LABEL: shuffle_v32i8_to_v4i8_2:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vmovdqa (%rdi), %xmm0
; AVX512-NEXT:    vmovdqa 16(%rdi), %xmm1
; AVX512-NEXT:    vmovdqa {{.*#+}} xmm2 = <2,10,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
; AVX512-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
; AVX512-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
; AVX512-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; AVX512-NEXT:    vmovd %xmm0, (%rsi)
; AVX512-NEXT:    retq
  %vec = load <32 x i8>, <32 x i8>* %L
  %strided.vec = shufflevector <32 x i8> %vec, <32 x i8> undef, <4 x i32> <i32 2, i32 10, i32 18, i32 26>
  store <4 x i8> %strided.vec, <4 x i8>* %S
  ret void
}

define void @shuffle_v32i8_to_v4i8_3(<32 x i8>* %L, <4 x i8>* %S) nounwind {
; AVX-LABEL: shuffle_v32i8_to_v4i8_3:
; AVX:       # %bb.0:
; AVX-NEXT:    vmovdqa (%rdi), %xmm0
; AVX-NEXT:    vmovdqa 16(%rdi), %xmm1
; AVX-NEXT:    vmovdqa {{.*#+}} xmm2 = <3,11,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
; AVX-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
; AVX-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
; AVX-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; AVX-NEXT:    vmovd %xmm0, (%rsi)
; AVX-NEXT:    retq
;
; AVX512-LABEL: shuffle_v32i8_to_v4i8_3:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vmovdqa (%rdi), %xmm0
; AVX512-NEXT:    vmovdqa 16(%rdi), %xmm1
; AVX512-NEXT:    vmovdqa {{.*#+}} xmm2 = <3,11,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
; AVX512-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
; AVX512-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
; AVX512-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; AVX512-NEXT:    vmovd %xmm0, (%rsi)
; AVX512-NEXT:    retq
  %vec = load <32 x i8>, <32 x i8>* %L
  %strided.vec = shufflevector <32 x i8> %vec, <32 x i8> undef, <4 x i32> <i32 3, i32 11, i32 19, i32 27>
  store <4 x i8> %strided.vec, <4 x i8>* %S
  ret void
}

define void @shuffle_v32i8_to_v4i8_4(<32 x i8>* %L, <4 x i8>* %S) nounwind {
; AVX-LABEL: shuffle_v32i8_to_v4i8_4:
; AVX:       # %bb.0:
; AVX-NEXT:    vmovdqa (%rdi), %xmm0
; AVX-NEXT:    vmovdqa 16(%rdi), %xmm1
; AVX-NEXT:    vmovdqa {{.*#+}} xmm2 = <4,12,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
; AVX-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
; AVX-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
; AVX-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; AVX-NEXT:    vmovd %xmm0, (%rsi)
; AVX-NEXT:    retq
;
; AVX512-LABEL: shuffle_v32i8_to_v4i8_4:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vmovdqa (%rdi), %xmm0
; AVX512-NEXT:    vmovdqa 16(%rdi), %xmm1
; AVX512-NEXT:    vmovdqa {{.*#+}} xmm2 = <4,12,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
; AVX512-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
; AVX512-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
; AVX512-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; AVX512-NEXT:    vmovd %xmm0, (%rsi)
; AVX512-NEXT:    retq
  %vec = load <32 x i8>, <32 x i8>* %L
  %strided.vec = shufflevector <32 x i8> %vec, <32 x i8> undef, <4 x i32> <i32 4, i32 12, i32 20, i32 28>
  store <4 x i8> %strided.vec, <4 x i8>* %S
  ret void
}

define void @shuffle_v32i8_to_v4i8_5(<32 x i8>* %L, <4 x i8>* %S) nounwind {
; AVX-LABEL: shuffle_v32i8_to_v4i8_5:
; AVX:       # %bb.0:
; AVX-NEXT:    vmovdqa (%rdi), %xmm0
; AVX-NEXT:    vmovdqa 16(%rdi), %xmm1
; AVX-NEXT:    vmovdqa {{.*#+}} xmm2 = <5,13,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
; AVX-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
; AVX-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
; AVX-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; AVX-NEXT:    vmovd %xmm0, (%rsi)
; AVX-NEXT:    retq
;
; AVX512-LABEL: shuffle_v32i8_to_v4i8_5:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vmovdqa (%rdi), %xmm0
; AVX512-NEXT:    vmovdqa 16(%rdi), %xmm1
; AVX512-NEXT:    vmovdqa {{.*#+}} xmm2 = <5,13,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
; AVX512-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
; AVX512-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
; AVX512-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; AVX512-NEXT:    vmovd %xmm0, (%rsi)
; AVX512-NEXT:    retq
  %vec = load <32 x i8>, <32 x i8>* %L
  %strided.vec = shufflevector <32 x i8> %vec, <32 x i8> undef, <4 x i32> <i32 5, i32 13, i32 21, i32 29>
  store <4 x i8> %strided.vec, <4 x i8>* %S
  ret void
}

define void @shuffle_v32i8_to_v4i8_6(<32 x i8>* %L, <4 x i8>* %S) nounwind {
; AVX-LABEL: shuffle_v32i8_to_v4i8_6:
; AVX:       # %bb.0:
; AVX-NEXT:    vmovdqa (%rdi), %xmm0
; AVX-NEXT:    vmovdqa 16(%rdi), %xmm1
; AVX-NEXT:    vmovdqa {{.*#+}} xmm2 = <6,14,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
; AVX-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
; AVX-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
; AVX-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; AVX-NEXT:    vmovd %xmm0, (%rsi)
; AVX-NEXT:    retq
;
; AVX512-LABEL: shuffle_v32i8_to_v4i8_6:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vmovdqa (%rdi), %xmm0
; AVX512-NEXT:    vmovdqa 16(%rdi), %xmm1
; AVX512-NEXT:    vmovdqa {{.*#+}} xmm2 = <6,14,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
; AVX512-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
; AVX512-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
; AVX512-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; AVX512-NEXT:    vmovd %xmm0, (%rsi)
; AVX512-NEXT:    retq
  %vec = load <32 x i8>, <32 x i8>* %L
  %strided.vec = shufflevector <32 x i8> %vec, <32 x i8> undef, <4 x i32> <i32 6, i32 14, i32 22, i32 30>
  store <4 x i8> %strided.vec, <4 x i8>* %S
  ret void
}

define void @shuffle_v32i8_to_v4i8_7(<32 x i8>* %L, <4 x i8>* %S) nounwind {
; AVX-LABEL: shuffle_v32i8_to_v4i8_7:
; AVX:       # %bb.0:
; AVX-NEXT:    vmovdqa (%rdi), %xmm0
; AVX-NEXT:    vmovdqa 16(%rdi), %xmm1
; AVX-NEXT:    vmovdqa {{.*#+}} xmm2 = <7,15,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
; AVX-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
; AVX-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
; AVX-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; AVX-NEXT:    vmovd %xmm0, (%rsi)
; AVX-NEXT:    retq
;
; AVX512-LABEL: shuffle_v32i8_to_v4i8_7:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vmovdqa (%rdi), %xmm0
; AVX512-NEXT:    vmovdqa 16(%rdi), %xmm1
; AVX512-NEXT:    vmovdqa {{.*#+}} xmm2 = <7,15,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
; AVX512-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
; AVX512-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
; AVX512-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; AVX512-NEXT:    vmovd %xmm0, (%rsi)
; AVX512-NEXT:    retq
  %vec = load <32 x i8>, <32 x i8>* %L
  %strided.vec = shufflevector <32 x i8> %vec, <32 x i8> undef, <4 x i32> <i32 7, i32 15, i32 23, i32 31>
  store <4 x i8> %strided.vec, <4 x i8>* %S
  ret void
}