; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefixes=SSE,SSE2
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.2 | FileCheck %s --check-prefixes=SSE,SSE42
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefixes=AVX,AVX1
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=AVX,AVX2-SLOW
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2,+fast-variable-shuffle | FileCheck %s --check-prefixes=AVX,AVX2-FAST
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f | FileCheck %s --check-prefixes=AVX512,AVX512F
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vl,+fast-variable-shuffle | FileCheck %s --check-prefixes=AVX512,AVX512VL
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw,+fast-variable-shuffle | FileCheck %s --check-prefixes=AVX512,AVX512BW
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw,+avx512vl,+fast-variable-shuffle | FileCheck %s --check-prefixes=AVX512,AVX512BWVL

define void @shuffle_v16i8_to_v8i8_1(<16 x i8>* %L, <8 x i8>* %S) nounwind {
; SSE2-LABEL: shuffle_v16i8_to_v8i8_1:
; SSE2:       # %bb.0:
; SSE2-NEXT:    movdqa (%rdi), %xmm0
; SSE2-NEXT:    psrlw $8, %xmm0
; SSE2-NEXT:    packuswb %xmm0, %xmm0
; SSE2-NEXT:    movq %xmm0, (%rsi)
; SSE2-NEXT:    retq
;
; SSE42-LABEL: shuffle_v16i8_to_v8i8_1:
; SSE42:       # %bb.0:
; SSE42-NEXT:    movdqa (%rdi), %xmm0
; SSE42-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u]
; SSE42-NEXT:    movq %xmm0, (%rsi)
; SSE42-NEXT:    retq
;
; AVX-LABEL: shuffle_v16i8_to_v8i8_1:
; AVX:       # %bb.0:
; AVX-NEXT:    vmovdqa (%rdi), %xmm0
; AVX-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u]
; AVX-NEXT:    vmovq %xmm0, (%rsi)
; AVX-NEXT:    retq
;
; AVX512-LABEL: shuffle_v16i8_to_v8i8_1:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vmovdqa (%rdi), %xmm0
; AVX512-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u]
; AVX512-NEXT:    vmovq %xmm0, (%rsi)
; AVX512-NEXT:    retq
  %vec = load <16 x i8>, <16 x i8>* %L
  %strided.vec = shufflevector <16 x i8> %vec, <16 x i8> undef, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
  store <8 x i8> %strided.vec, <8 x i8>* %S
  ret void
}

define void @shuffle_v8i16_to_v4i16_1(<8 x i16>* %L, <4 x i16>* %S) nounwind {
; SSE2-LABEL: shuffle_v8i16_to_v4i16_1:
; SSE2:       # %bb.0:
; SSE2-NEXT:    pshuflw {{.*#+}} xmm0 = mem[3,1,2,3,4,5,6,7]
; SSE2-NEXT:    pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,5,6,7]
; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; SSE2-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[1,0,3,2,4,5,6,7]
; SSE2-NEXT:    movq %xmm0, (%rsi)
; SSE2-NEXT:    retq
;
; SSE42-LABEL: shuffle_v8i16_to_v4i16_1:
; SSE42:       # %bb.0:
; SSE42-NEXT:    movdqa (%rdi), %xmm0
; SSE42-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[2,3,6,7,10,11,14,15,u,u,u,u,u,u,u,u]
; SSE42-NEXT:    movq %xmm0, (%rsi)
; SSE42-NEXT:    retq
;
; AVX-LABEL: shuffle_v8i16_to_v4i16_1:
; AVX:       # %bb.0:
; AVX-NEXT:    vmovdqa (%rdi), %xmm0
; AVX-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[2,3,6,7,10,11,14,15,u,u,u,u,u,u,u,u]
; AVX-NEXT:    vmovq %xmm0, (%rsi)
; AVX-NEXT:    retq
;
; AVX512-LABEL: shuffle_v8i16_to_v4i16_1:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vmovdqa (%rdi), %xmm0
; AVX512-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[2,3,6,7,10,11,14,15,u,u,u,u,u,u,u,u]
; AVX512-NEXT:    vmovq %xmm0, (%rsi)
; AVX512-NEXT:    retq
  %vec = load <8 x i16>, <8 x i16>* %L
  %strided.vec = shufflevector <8 x i16> %vec, <8 x i16> undef, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
  store <4 x i16> %strided.vec, <4 x i16>* %S
  ret void
}

define void @shuffle_v4i32_to_v2i32_1(<4 x i32>* %L, <2 x i32>* %S) nounwind {
; SSE-LABEL: shuffle_v4i32_to_v2i32_1:
; SSE:       # %bb.0:
; SSE-NEXT:    pshufd {{.*#+}} xmm0 = mem[1,3,2,3]
; SSE-NEXT:    movq %xmm0, (%rsi)
; SSE-NEXT:    retq
;
; AVX-LABEL: shuffle_v4i32_to_v2i32_1:
; AVX:       # %bb.0:
; AVX-NEXT:    vpermilps {{.*#+}} xmm0 = mem[1,3,2,3]
; AVX-NEXT:    vmovlps %xmm0, (%rsi)
; AVX-NEXT:    retq
;
; AVX512-LABEL: shuffle_v4i32_to_v2i32_1:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vpermilps {{.*#+}} xmm0 = mem[1,3,2,3]
; AVX512-NEXT:    vmovlps %xmm0, (%rsi)
; AVX512-NEXT:    retq
  %vec = load <4 x i32>, <4 x i32>* %L
  %strided.vec = shufflevector <4 x i32> %vec, <4 x i32> undef, <2 x i32> <i32 1, i32 3>
  store <2 x i32> %strided.vec, <2 x i32>* %S
  ret void
}

define void @shuffle_v16i8_to_v4i8_1(<16 x i8>* %L, <4 x i8>* %S) nounwind {
; SSE2-LABEL: shuffle_v16i8_to_v4i8_1:
; SSE2:       # %bb.0:
; SSE2-NEXT:    movdqa (%rdi), %xmm0
; SSE2-NEXT:    pxor %xmm1, %xmm1
; SSE2-NEXT:    movdqa %xmm0, %xmm2
; SSE2-NEXT:    punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15]
; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
; SSE2-NEXT:    pshuflw {{.*#+}} xmm2 = xmm2[1,3,2,3,4,5,6,7]
; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; SSE2-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[1,3,2,3,4,5,6,7]
; SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
; SSE2-NEXT:    packuswb %xmm0, %xmm0
; SSE2-NEXT:    movd %xmm0, (%rsi)
; SSE2-NEXT:    retq
;
; SSE42-LABEL: shuffle_v16i8_to_v4i8_1:
; SSE42:       # %bb.0:
; SSE42-NEXT:    movdqa (%rdi), %xmm0
; SSE42-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[1,5,9,13,u,u,u,u,u,u,u,u,u,u,u,u]
; SSE42-NEXT:    movd %xmm0, (%rsi)
; SSE42-NEXT:    retq
;
; AVX-LABEL: shuffle_v16i8_to_v4i8_1:
; AVX:       # %bb.0:
; AVX-NEXT:    vmovdqa (%rdi), %xmm0
; AVX-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[1,5,9,13,u,u,u,u,u,u,u,u,u,u,u,u]
; AVX-NEXT:    vmovd %xmm0, (%rsi)
; AVX-NEXT:    retq
;
; AVX512-LABEL: shuffle_v16i8_to_v4i8_1:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vmovdqa (%rdi), %xmm0
; AVX512-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[1,5,9,13,u,u,u,u,u,u,u,u,u,u,u,u]
; AVX512-NEXT:    vmovd %xmm0, (%rsi)
; AVX512-NEXT:    retq
  %vec = load <16 x i8>, <16 x i8>* %L
  %strided.vec = shufflevector <16 x i8> %vec, <16 x i8> undef, <4 x i32> <i32 1, i32 5, i32 9, i32 13>
  store <4 x i8> %strided.vec, <4 x i8>* %S
  ret void
}

define void @shuffle_v16i8_to_v4i8_2(<16 x i8>* %L, <4 x i8>* %S) nounwind {
; SSE2-LABEL: shuffle_v16i8_to_v4i8_2:
; SSE2:       # %bb.0:
; SSE2-NEXT:    movdqa (%rdi), %xmm0
; SSE2-NEXT:    pand {{.*}}(%rip), %xmm0
; SSE2-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[3,1,2,3,4,5,6,7]
; SSE2-NEXT:    pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,5,6,7]
; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; SSE2-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[1,0,3,2,4,5,6,7]
; SSE2-NEXT:    packuswb %xmm0, %xmm0
; SSE2-NEXT:    movd %xmm0, (%rsi)
; SSE2-NEXT:    retq
;
; SSE42-LABEL: shuffle_v16i8_to_v4i8_2:
; SSE42:       # %bb.0:
; SSE42-NEXT:    movdqa (%rdi), %xmm0
; SSE42-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[2,6,10,14,u,u,u,u,u,u,u,u,u,u,u,u]
; SSE42-NEXT:    movd %xmm0, (%rsi)
; SSE42-NEXT:    retq
;
; AVX-LABEL: shuffle_v16i8_to_v4i8_2:
; AVX:       # %bb.0:
; AVX-NEXT:    vmovdqa (%rdi), %xmm0
; AVX-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[2,6,10,14,u,u,u,u,u,u,u,u,u,u,u,u]
; AVX-NEXT:    vmovd %xmm0, (%rsi)
; AVX-NEXT:    retq
;
; AVX512-LABEL: shuffle_v16i8_to_v4i8_2:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vmovdqa (%rdi), %xmm0
; AVX512-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[2,6,10,14,u,u,u,u,u,u,u,u,u,u,u,u]
; AVX512-NEXT:    vmovd %xmm0, (%rsi)
; AVX512-NEXT:    retq
  %vec = load <16 x i8>, <16 x i8>* %L
  %strided.vec = shufflevector <16 x i8> %vec, <16 x i8> undef, <4 x i32> <i32 2, i32 6, i32 10, i32 14>
  store <4 x i8> %strided.vec, <4 x i8>* %S
  ret void
}

define void @shuffle_v16i8_to_v4i8_3(<16 x i8>* %L, <4 x i8>* %S) nounwind {
; SSE2-LABEL: shuffle_v16i8_to_v4i8_3:
; SSE2:       # %bb.0:
; SSE2-NEXT:    movdqa (%rdi), %xmm0
; SSE2-NEXT:    pxor %xmm1, %xmm1
; SSE2-NEXT:    movdqa %xmm0, %xmm2
; SSE2-NEXT:    punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15]
; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[3,1,2,3]
; SSE2-NEXT:    pshuflw {{.*#+}} xmm2 = xmm2[3,1,2,3,4,5,6,7]
; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[3,1,2,3]
; SSE2-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[3,1,2,3,4,5,6,7]
; SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
; SSE2-NEXT:    packuswb %xmm0, %xmm0
; SSE2-NEXT:    movd %xmm0, (%rsi)
; SSE2-NEXT:    retq
;
; SSE42-LABEL: shuffle_v16i8_to_v4i8_3:
; SSE42:       # %bb.0:
; SSE42-NEXT:    movdqa (%rdi), %xmm0
; SSE42-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[3,7,11,15,u,u,u,u,u,u,u,u,u,u,u,u]
; SSE42-NEXT:    movd %xmm0, (%rsi)
; SSE42-NEXT:    retq
;
; AVX-LABEL: shuffle_v16i8_to_v4i8_3:
; AVX:       # %bb.0:
; AVX-NEXT:    vmovdqa (%rdi), %xmm0
; AVX-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[3,7,11,15,u,u,u,u,u,u,u,u,u,u,u,u]
; AVX-NEXT:    vmovd %xmm0, (%rsi)
; AVX-NEXT:    retq
;
; AVX512-LABEL: shuffle_v16i8_to_v4i8_3:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vmovdqa (%rdi), %xmm0
; AVX512-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[3,7,11,15,u,u,u,u,u,u,u,u,u,u,u,u]
; AVX512-NEXT:    vmovd %xmm0, (%rsi)
; AVX512-NEXT:    retq
  %vec = load <16 x i8>, <16 x i8>* %L
  %strided.vec = shufflevector <16 x i8> %vec, <16 x i8> undef, <4 x i32> <i32 3, i32 7, i32 11, i32 15>
  store <4 x i8> %strided.vec, <4 x i8>* %S
  ret void
}

define void @shuffle_v8i16_to_v2i16_1(<8 x i16>* %L, <2 x i16>* %S) nounwind {
; SSE-LABEL: shuffle_v8i16_to_v2i16_1:
; SSE:       # %bb.0:
; SSE-NEXT:    pshufd {{.*#+}} xmm0 = mem[0,2,2,3]
; SSE-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[1,3,2,3,4,5,6,7]
; SSE-NEXT:    movd %xmm0, (%rsi)
; SSE-NEXT:    retq
;
; AVX1-LABEL: shuffle_v8i16_to_v2i16_1:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = mem[0,2,2,3]
; AVX1-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[1,3,2,3,4,5,6,7]
; AVX1-NEXT:    vmovd %xmm0, (%rsi)
; AVX1-NEXT:    retq
;
; AVX2-SLOW-LABEL: shuffle_v8i16_to_v2i16_1:
; AVX2-SLOW:       # %bb.0:
; AVX2-SLOW-NEXT:    vpshufd {{.*#+}} xmm0 = mem[0,2,2,3]
; AVX2-SLOW-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[1,3,2,3,4,5,6,7]
; AVX2-SLOW-NEXT:    vmovd %xmm0, (%rsi)
; AVX2-SLOW-NEXT:    retq
;
; AVX2-FAST-LABEL: shuffle_v8i16_to_v2i16_1:
; AVX2-FAST:       # %bb.0:
; AVX2-FAST-NEXT:    vmovdqa (%rdi), %xmm0
; AVX2-FAST-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[2,3,10,11,u,u,u,u,u,u,u,u,u,u,u,u]
; AVX2-FAST-NEXT:    vmovd %xmm0, (%rsi)
; AVX2-FAST-NEXT:    retq
;
; AVX512F-LABEL: shuffle_v8i16_to_v2i16_1:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    vpshufd {{.*#+}} xmm0 = mem[0,2,2,3]
; AVX512F-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[1,3,2,3,4,5,6,7]
; AVX512F-NEXT:    vmovd %xmm0, (%rsi)
; AVX512F-NEXT:    retq
;
; AVX512VL-LABEL: shuffle_v8i16_to_v2i16_1:
; AVX512VL:       # %bb.0:
; AVX512VL-NEXT:    vmovdqa (%rdi), %xmm0
; AVX512VL-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[2,3,10,11,u,u,u,u,u,u,u,u,u,u,u,u]
; AVX512VL-NEXT:    vmovd %xmm0, (%rsi)
; AVX512VL-NEXT:    retq
;
; AVX512BW-LABEL: shuffle_v8i16_to_v2i16_1:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    vmovdqa (%rdi), %xmm0
; AVX512BW-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[2,3,10,11,u,u,u,u,u,u,u,u,u,u,u,u]
; AVX512BW-NEXT:    vmovd %xmm0, (%rsi)
; AVX512BW-NEXT:    retq
;
; AVX512BWVL-LABEL: shuffle_v8i16_to_v2i16_1:
; AVX512BWVL:       # %bb.0:
; AVX512BWVL-NEXT:    vmovdqa (%rdi), %xmm0
; AVX512BWVL-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[2,3,10,11,u,u,u,u,u,u,u,u,u,u,u,u]
; AVX512BWVL-NEXT:    vmovd %xmm0, (%rsi)
; AVX512BWVL-NEXT:    retq
  %vec = load <8 x i16>, <8 x i16>* %L
  %strided.vec = shufflevector <8 x i16> %vec, <8 x i16> undef, <2 x i32> <i32 1, i32 5>
  store <2 x i16> %strided.vec, <2 x i16>* %S
  ret void
}

define void @shuffle_v8i16_to_v2i16_2(<8 x i16>* %L, <2 x i16>* %S) nounwind {
; SSE-LABEL: shuffle_v8i16_to_v2i16_2:
; SSE:       # %bb.0:
; SSE-NEXT:    pshufd {{.*#+}} xmm0 = mem[3,1,2,3]
; SSE-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[2,0,2,3,4,5,6,7]
; SSE-NEXT:    movd %xmm0, (%rsi)
; SSE-NEXT:    retq
;
; AVX1-LABEL: shuffle_v8i16_to_v2i16_2:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = mem[3,1,2,3]
; AVX1-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[2,0,2,3,4,5,6,7]
; AVX1-NEXT:    vmovd %xmm0, (%rsi)
; AVX1-NEXT:    retq
;
; AVX2-SLOW-LABEL: shuffle_v8i16_to_v2i16_2:
; AVX2-SLOW:       # %bb.0:
; AVX2-SLOW-NEXT:    vpshufd {{.*#+}} xmm0 = mem[3,1,2,3]
; AVX2-SLOW-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[2,0,2,3,4,5,6,7]
; AVX2-SLOW-NEXT:    vmovd %xmm0, (%rsi)
; AVX2-SLOW-NEXT:    retq
;
; AVX2-FAST-LABEL: shuffle_v8i16_to_v2i16_2:
; AVX2-FAST:       # %bb.0:
; AVX2-FAST-NEXT:    vmovdqa (%rdi), %xmm0
; AVX2-FAST-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[4,5,12,13,u,u,u,u,u,u,u,u,u,u,u,u]
; AVX2-FAST-NEXT:    vmovd %xmm0, (%rsi)
; AVX2-FAST-NEXT:    retq
;
; AVX512F-LABEL: shuffle_v8i16_to_v2i16_2:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    vpshufd {{.*#+}} xmm0 = mem[3,1,2,3]
; AVX512F-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[2,0,2,3,4,5,6,7]
; AVX512F-NEXT:    vmovd %xmm0, (%rsi)
; AVX512F-NEXT:    retq
;
; AVX512VL-LABEL: shuffle_v8i16_to_v2i16_2:
; AVX512VL:       # %bb.0:
; AVX512VL-NEXT:    vmovdqa (%rdi), %xmm0
; AVX512VL-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[4,5,12,13,u,u,u,u,u,u,u,u,u,u,u,u]
; AVX512VL-NEXT:    vmovd %xmm0, (%rsi)
; AVX512VL-NEXT:    retq
;
; AVX512BW-LABEL: shuffle_v8i16_to_v2i16_2:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    vmovdqa (%rdi), %xmm0
; AVX512BW-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[4,5,12,13,u,u,u,u,u,u,u,u,u,u,u,u]
; AVX512BW-NEXT:    vmovd %xmm0, (%rsi)
; AVX512BW-NEXT:    retq
;
; AVX512BWVL-LABEL: shuffle_v8i16_to_v2i16_2:
; AVX512BWVL:       # %bb.0:
; AVX512BWVL-NEXT:    vmovdqa (%rdi), %xmm0
; AVX512BWVL-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[4,5,12,13,u,u,u,u,u,u,u,u,u,u,u,u]
; AVX512BWVL-NEXT:    vmovd %xmm0, (%rsi)
; AVX512BWVL-NEXT:    retq
  %vec = load <8 x i16>, <8 x i16>* %L
  %strided.vec = shufflevector <8 x i16> %vec, <8 x i16> undef, <2 x i32> <i32 2, i32 6>
  store <2 x i16> %strided.vec, <2 x i16>* %S
  ret void
}

define void @shuffle_v8i16_to_v2i16_3(<8 x i16>* %L, <2 x i16>* %S) nounwind {
; SSE-LABEL: shuffle_v8i16_to_v2i16_3:
; SSE:       # %bb.0:
; SSE-NEXT:    pshufd {{.*#+}} xmm0 = mem[3,1,2,3]
; SSE-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[3,1,2,3,4,5,6,7]
; SSE-NEXT:    movd %xmm0, (%rsi)
; SSE-NEXT:    retq
;
; AVX1-LABEL: shuffle_v8i16_to_v2i16_3:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = mem[3,1,2,3]
; AVX1-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[3,1,2,3,4,5,6,7]
; AVX1-NEXT:    vmovd %xmm0, (%rsi)
; AVX1-NEXT:    retq
;
; AVX2-SLOW-LABEL: shuffle_v8i16_to_v2i16_3:
; AVX2-SLOW:       # %bb.0:
; AVX2-SLOW-NEXT:    vpshufd {{.*#+}} xmm0 = mem[3,1,2,3]
; AVX2-SLOW-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[3,1,2,3,4,5,6,7]
; AVX2-SLOW-NEXT:    vmovd %xmm0, (%rsi)
; AVX2-SLOW-NEXT:    retq
;
; AVX2-FAST-LABEL: shuffle_v8i16_to_v2i16_3:
; AVX2-FAST:       # %bb.0:
; AVX2-FAST-NEXT:    vmovdqa (%rdi), %xmm0
; AVX2-FAST-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[6,7,14,15,u,u,u,u,u,u,u,u,u,u,u,u]
; AVX2-FAST-NEXT:    vmovd %xmm0, (%rsi)
; AVX2-FAST-NEXT:    retq
;
; AVX512F-LABEL: shuffle_v8i16_to_v2i16_3:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    vpshufd {{.*#+}} xmm0 = mem[3,1,2,3]
; AVX512F-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[3,1,2,3,4,5,6,7]
; AVX512F-NEXT:    vmovd %xmm0, (%rsi)
; AVX512F-NEXT:    retq
;
; AVX512VL-LABEL: shuffle_v8i16_to_v2i16_3:
; AVX512VL:       # %bb.0:
; AVX512VL-NEXT:    vmovdqa (%rdi), %xmm0
; AVX512VL-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[6,7,14,15,u,u,u,u,u,u,u,u,u,u,u,u]
; AVX512VL-NEXT:    vmovd %xmm0, (%rsi)
; AVX512VL-NEXT:    retq
;
; AVX512BW-LABEL: shuffle_v8i16_to_v2i16_3:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    vmovdqa (%rdi), %xmm0
; AVX512BW-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[6,7,14,15,u,u,u,u,u,u,u,u,u,u,u,u]
; AVX512BW-NEXT:    vmovd %xmm0, (%rsi)
; AVX512BW-NEXT:    retq
;
; AVX512BWVL-LABEL: shuffle_v8i16_to_v2i16_3:
; AVX512BWVL:       # %bb.0:
; AVX512BWVL-NEXT:    vmovdqa (%rdi), %xmm0
; AVX512BWVL-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[6,7,14,15,u,u,u,u,u,u,u,u,u,u,u,u]
; AVX512BWVL-NEXT:    vmovd %xmm0, (%rsi)
; AVX512BWVL-NEXT:    retq
  %vec = load <8 x i16>, <8 x i16>* %L
  %strided.vec = shufflevector <8 x i16> %vec, <8 x i16> undef, <2 x i32> <i32 3, i32 7>
  store <2 x i16> %strided.vec, <2 x i16>* %S
  ret void
}

define void @shuffle_v16i8_to_v2i8_1(<16 x i8>* %L, <2 x i8>* %S) nounwind {
; SSE2-LABEL: shuffle_v16i8_to_v2i8_1:
; SSE2:       # %bb.0:
; SSE2-NEXT:    movdqa (%rdi), %xmm0
; SSE2-NEXT:    pxor %xmm1, %xmm1
; SSE2-NEXT:    movdqa %xmm0, %xmm2
; SSE2-NEXT:    punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15]
; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
; SSE2-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,1,1]
; SSE2-NEXT:    packuswb %xmm0, %xmm0
; SSE2-NEXT:    movd %xmm0, %eax
; SSE2-NEXT:    movw %ax, (%rsi)
; SSE2-NEXT:    retq
;
; SSE42-LABEL: shuffle_v16i8_to_v2i8_1:
; SSE42:       # %bb.0:
; SSE42-NEXT:    movdqa (%rdi), %xmm0
; SSE42-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[1,9,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
; SSE42-NEXT:    pextrw $0, %xmm0, (%rsi)
; SSE42-NEXT:    retq
;
; AVX-LABEL: shuffle_v16i8_to_v2i8_1:
; AVX:       # %bb.0:
; AVX-NEXT:    vmovdqa (%rdi), %xmm0
; AVX-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[1,9,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
; AVX-NEXT:    vpextrw $0, %xmm0, (%rsi)
; AVX-NEXT:    retq
;
; AVX512-LABEL: shuffle_v16i8_to_v2i8_1:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vmovdqa (%rdi), %xmm0
; AVX512-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[1,9,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
; AVX512-NEXT:    vpextrw $0, %xmm0, (%rsi)
; AVX512-NEXT:    retq
  %vec = load <16 x i8>, <16 x i8>* %L
  %strided.vec = shufflevector <16 x i8> %vec, <16 x i8> undef, <2 x i32> <i32 1, i32 9>
  store <2 x i8> %strided.vec, <2 x i8>* %S
  ret void
}

define void @shuffle_v16i8_to_v2i8_2(<16 x i8>* %L, <2 x i8>* %S) nounwind {
; SSE2-LABEL: shuffle_v16i8_to_v2i8_2:
; SSE2:       # %bb.0:
; SSE2-NEXT:    movdqa (%rdi), %xmm0
; SSE2-NEXT:    pand {{.*}}(%rip), %xmm0
; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; SSE2-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[1,3,2,3,4,5,6,7]
; SSE2-NEXT:    packuswb %xmm0, %xmm0
; SSE2-NEXT:    movd %xmm0, %eax
; SSE2-NEXT:    movw %ax, (%rsi)
; SSE2-NEXT:    retq
;
; SSE42-LABEL: shuffle_v16i8_to_v2i8_2:
; SSE42:       # %bb.0:
; SSE42-NEXT:    movdqa (%rdi), %xmm0
; SSE42-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[2,10,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
; SSE42-NEXT:    pextrw $0, %xmm0, (%rsi)
; SSE42-NEXT:    retq
;
; AVX-LABEL: shuffle_v16i8_to_v2i8_2:
; AVX:       # %bb.0:
; AVX-NEXT:    vmovdqa (%rdi), %xmm0
; AVX-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[2,10,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
; AVX-NEXT:    vpextrw $0, %xmm0, (%rsi)
; AVX-NEXT:    retq
;
; AVX512-LABEL: shuffle_v16i8_to_v2i8_2:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vmovdqa (%rdi), %xmm0
; AVX512-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[2,10,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
; AVX512-NEXT:    vpextrw $0, %xmm0, (%rsi)
; AVX512-NEXT:    retq
  %vec = load <16 x i8>, <16 x i8>* %L
  %strided.vec = shufflevector <16 x i8> %vec, <16 x i8> undef, <2 x i32> <i32 2, i32 10>
  store <2 x i8> %strided.vec, <2 x i8>* %S
  ret void
}

define void @shuffle_v16i8_to_v2i8_3(<16 x i8>* %L, <2 x i8>* %S) nounwind {
; SSE2-LABEL: shuffle_v16i8_to_v2i8_3:
; SSE2:       # %bb.0:
; SSE2-NEXT:    movdqa (%rdi), %xmm0
; SSE2-NEXT:    pxor %xmm1, %xmm1
; SSE2-NEXT:    movdqa %xmm0, %xmm2
; SSE2-NEXT:    punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15]
; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
; SSE2-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[3,3,3,3]
; SSE2-NEXT:    packuswb %xmm0, %xmm0
; SSE2-NEXT:    movd %xmm0, %eax
; SSE2-NEXT:    movw %ax, (%rsi)
; SSE2-NEXT:    retq
;
; SSE42-LABEL: shuffle_v16i8_to_v2i8_3:
; SSE42:       # %bb.0:
; SSE42-NEXT:    movdqa (%rdi), %xmm0
; SSE42-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[3,11,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
; SSE42-NEXT:    pextrw $0, %xmm0, (%rsi)
; SSE42-NEXT:    retq
;
; AVX-LABEL: shuffle_v16i8_to_v2i8_3:
; AVX:       # %bb.0:
; AVX-NEXT:    vmovdqa (%rdi), %xmm0
; AVX-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[3,11,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
; AVX-NEXT:    vpextrw $0, %xmm0, (%rsi)
; AVX-NEXT:    retq
;
; AVX512-LABEL: shuffle_v16i8_to_v2i8_3:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vmovdqa (%rdi), %xmm0
; AVX512-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[3,11,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
; AVX512-NEXT:    vpextrw $0, %xmm0, (%rsi)
; AVX512-NEXT:    retq
  %vec = load <16 x i8>, <16 x i8>* %L
  %strided.vec = shufflevector <16 x i8> %vec, <16 x i8> undef, <2 x i32> <i32 3, i32 11>
  store <2 x i8> %strided.vec, <2 x i8>* %S
  ret void
}

define void @shuffle_v16i8_to_v2i8_4(<16 x i8>* %L, <2 x i8>* %S) nounwind {
; SSE2-LABEL: shuffle_v16i8_to_v2i8_4:
; SSE2:       # %bb.0:
; SSE2-NEXT:    movdqa (%rdi), %xmm0
; SSE2-NEXT:    pand {{.*}}(%rip), %xmm0
; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[3,1,2,3]
; SSE2-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[2,0,2,3,4,5,6,7]
; SSE2-NEXT:    packuswb %xmm0, %xmm0
; SSE2-NEXT:    movd %xmm0, %eax
; SSE2-NEXT:    movw %ax, (%rsi)
; SSE2-NEXT:    retq
;
; SSE42-LABEL: shuffle_v16i8_to_v2i8_4:
; SSE42:       # %bb.0:
; SSE42-NEXT:    movdqa (%rdi), %xmm0
; SSE42-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[4,12,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
; SSE42-NEXT:    pextrw $0, %xmm0, (%rsi)
; SSE42-NEXT:    retq
;
; AVX-LABEL: shuffle_v16i8_to_v2i8_4:
; AVX:       # %bb.0:
; AVX-NEXT:    vmovdqa (%rdi), %xmm0
; AVX-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[4,12,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
; AVX-NEXT:    vpextrw $0, %xmm0, (%rsi)
; AVX-NEXT:    retq
;
; AVX512-LABEL: shuffle_v16i8_to_v2i8_4:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vmovdqa (%rdi), %xmm0
; AVX512-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[4,12,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
; AVX512-NEXT:    vpextrw $0, %xmm0, (%rsi)
; AVX512-NEXT:    retq
  %vec = load <16 x i8>, <16 x i8>* %L
  %strided.vec = shufflevector <16 x i8> %vec, <16 x i8> undef, <2 x i32> <i32 4, i32 12>
  store <2 x i8> %strided.vec, <2 x i8>* %S
  ret void
}

define void @shuffle_v16i8_to_v2i8_5(<16 x i8>* %L, <2 x i8>* %S) nounwind {
; SSE2-LABEL: shuffle_v16i8_to_v2i8_5:
; SSE2:       # %bb.0:
; SSE2-NEXT:    movdqa (%rdi), %xmm0
; SSE2-NEXT:    pxor %xmm1, %xmm1
; SSE2-NEXT:    movdqa %xmm0, %xmm2
; SSE2-NEXT:    punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15]
; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
; SSE2-NEXT:    punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,1,1]
; SSE2-NEXT:    packuswb %xmm0, %xmm0
; SSE2-NEXT:    movd %xmm0, %eax
; SSE2-NEXT:    movw %ax, (%rsi)
; SSE2-NEXT:    retq
;
; SSE42-LABEL: shuffle_v16i8_to_v2i8_5:
; SSE42:       # %bb.0:
; SSE42-NEXT:    movdqa (%rdi), %xmm0
; SSE42-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[5,13,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
; SSE42-NEXT:    pextrw $0, %xmm0, (%rsi)
; SSE42-NEXT:    retq
;
; AVX-LABEL: shuffle_v16i8_to_v2i8_5:
; AVX:       # %bb.0:
; AVX-NEXT:    vmovdqa (%rdi), %xmm0
; AVX-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[5,13,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
; AVX-NEXT:    vpextrw $0, %xmm0, (%rsi)
; AVX-NEXT:    retq
;
; AVX512-LABEL: shuffle_v16i8_to_v2i8_5:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vmovdqa (%rdi), %xmm0
; AVX512-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[5,13,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
; AVX512-NEXT:    vpextrw $0, %xmm0, (%rsi)
; AVX512-NEXT:    retq
  %vec = load <16 x i8>, <16 x i8>* %L
  %strided.vec = shufflevector <16 x i8> %vec, <16 x i8> undef, <2 x i32> <i32 5, i32 13>
  store <2 x i8> %strided.vec, <2 x i8>* %S
  ret void
}

define void @shuffle_v16i8_to_v2i8_6(<16 x i8>* %L, <2 x i8>* %S) nounwind {
; SSE2-LABEL: shuffle_v16i8_to_v2i8_6:
; SSE2:       # %bb.0:
; SSE2-NEXT:    movdqa (%rdi), %xmm0
; SSE2-NEXT:    pand {{.*}}(%rip), %xmm0
; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[3,1,2,3]
; SSE2-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[3,1,2,3,4,5,6,7]
; SSE2-NEXT:    packuswb %xmm0, %xmm0
; SSE2-NEXT:    movd %xmm0, %eax
; SSE2-NEXT:    movw %ax, (%rsi)
; SSE2-NEXT:    retq
;
; SSE42-LABEL: shuffle_v16i8_to_v2i8_6:
; SSE42:       # %bb.0:
; SSE42-NEXT:    movdqa (%rdi), %xmm0
; SSE42-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[6,14,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
; SSE42-NEXT:    pextrw $0, %xmm0, (%rsi)
; SSE42-NEXT:    retq
;
; AVX-LABEL: shuffle_v16i8_to_v2i8_6:
; AVX:       # %bb.0:
; AVX-NEXT:    vmovdqa (%rdi), %xmm0
; AVX-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[6,14,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
; AVX-NEXT:    vpextrw $0, %xmm0, (%rsi)
; AVX-NEXT:    retq
;
; AVX512-LABEL: shuffle_v16i8_to_v2i8_6:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vmovdqa (%rdi), %xmm0
; AVX512-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[6,14,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
; AVX512-NEXT:    vpextrw $0, %xmm0, (%rsi)
; AVX512-NEXT:    retq
  %vec = load <16 x i8>, <16 x i8>* %L
  %strided.vec = shufflevector <16 x i8> %vec, <16 x i8> undef, <2 x i32> <i32 6, i32 14>
  store <2 x i8> %strided.vec, <2 x i8>* %S
  ret void
}

define void @shuffle_v16i8_to_v2i8_7(<16 x i8>* %L, <2 x i8>* %S) nounwind {
; SSE2-LABEL: shuffle_v16i8_to_v2i8_7:
; SSE2:       # %bb.0:
; SSE2-NEXT:    movdqa (%rdi), %xmm0
; SSE2-NEXT:    pxor %xmm1, %xmm1
; SSE2-NEXT:    movdqa %xmm0, %xmm2
; SSE2-NEXT:    punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3],xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7]
; SSE2-NEXT:    psrlw $8, %xmm0
; SSE2-NEXT:    punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[3,3,3,3]
; SSE2-NEXT:    packuswb %xmm0, %xmm0
; SSE2-NEXT:    movd %xmm0, %eax
; SSE2-NEXT:    movw %ax, (%rsi)
; SSE2-NEXT:    retq
;
; SSE42-LABEL: shuffle_v16i8_to_v2i8_7:
; SSE42:       # %bb.0:
; SSE42-NEXT:    movdqa (%rdi), %xmm0
; SSE42-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[7,15,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
; SSE42-NEXT:    pextrw $0, %xmm0, (%rsi)
; SSE42-NEXT:    retq
;
; AVX-LABEL: shuffle_v16i8_to_v2i8_7:
; AVX:       # %bb.0:
; AVX-NEXT:    vmovdqa (%rdi), %xmm0
; AVX-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[7,15,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
; AVX-NEXT:    vpextrw $0, %xmm0, (%rsi)
; AVX-NEXT:    retq
;
; AVX512-LABEL: shuffle_v16i8_to_v2i8_7:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vmovdqa (%rdi), %xmm0
; AVX512-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[7,15,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
; AVX512-NEXT:    vpextrw $0, %xmm0, (%rsi)
; AVX512-NEXT:    retq
  %vec = load <16 x i8>, <16 x i8>* %L
  %strided.vec = shufflevector <16 x i8> %vec, <16 x i8> undef, <2 x i32> <i32 7, i32 15>
  store <2 x i8> %strided.vec, <2 x i8>* %S
  ret void
}