; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f | FileCheck %s --check-prefixes=AVX512,AVX512F
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vl,+fast-variable-shuffle | FileCheck %s --check-prefixes=AVX512,AVX512VL
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw,+fast-variable-shuffle | FileCheck %s --check-prefixes=AVX512,AVX512BW
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw,+avx512vl,+fast-variable-shuffle | FileCheck %s --check-prefixes=AVX512,AVX512BWVL
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vbmi,+fast-variable-shuffle | FileCheck %s --check-prefixes=AVX512,AVX512VBMI
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vbmi,+avx512vl,+fast-variable-shuffle | FileCheck %s --check-prefixes=AVX512,AVX512VBMIVL

; PR31551
; Pairs of shufflevector:trunc functions with functional equivalence.
; Ideally, the shuffles should be lowered to code with the same quality as the truncates.

define void @shuffle_v64i8_to_v32i8(<64 x i8>* %L, <32 x i8>* %S) nounwind {
; AVX512F-LABEL: shuffle_v64i8_to_v32i8:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    vmovdqa (%rdi), %ymm0
; AVX512F-NEXT:    vmovdqa 32(%rdi), %ymm1
; AVX512F-NEXT:    vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,u,0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u,16,18,20,22,24,26,28,30]
; AVX512F-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u,16,18,20,22,24,26,28,30,u,u,u,u,u,u,u,u]
; AVX512F-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7]
; AVX512F-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
; AVX512F-NEXT:    vmovdqa %ymm0, (%rsi)
; AVX512F-NEXT:    vzeroupper
; AVX512F-NEXT:    retq
;
; AVX512VL-LABEL: shuffle_v64i8_to_v32i8:
; AVX512VL:       # %bb.0:
; AVX512VL-NEXT:    vmovdqa (%rdi), %ymm0
; AVX512VL-NEXT:    vmovdqa 32(%rdi), %ymm1
; AVX512VL-NEXT:    vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,u,0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u,16,18,20,22,24,26,28,30]
; AVX512VL-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u,16,18,20,22,24,26,28,30,u,u,u,u,u,u,u,u]
; AVX512VL-NEXT:    vmovdqa {{.*#+}} ymm2 = [0,2,5,7]
; AVX512VL-NEXT:    vpermi2q %ymm1, %ymm0, %ymm2
; AVX512VL-NEXT:    vmovdqa %ymm2, (%rsi)
; AVX512VL-NEXT:    vzeroupper
; AVX512VL-NEXT:    retq
;
; AVX512BW-LABEL: shuffle_v64i8_to_v32i8:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    vmovdqa64 (%rdi), %zmm0
; AVX512BW-NEXT:    vpmovwb %zmm0, (%rsi)
; AVX512BW-NEXT:    vzeroupper
; AVX512BW-NEXT:    retq
;
; AVX512BWVL-LABEL: shuffle_v64i8_to_v32i8:
; AVX512BWVL:       # %bb.0:
; AVX512BWVL-NEXT:    vmovdqa64 (%rdi), %zmm0
; AVX512BWVL-NEXT:    vpmovwb %zmm0, (%rsi)
; AVX512BWVL-NEXT:    vzeroupper
; AVX512BWVL-NEXT:    retq
;
; AVX512VBMI-LABEL: shuffle_v64i8_to_v32i8:
; AVX512VBMI:       # %bb.0:
; AVX512VBMI-NEXT:    vmovdqa64 (%rdi), %zmm0
; AVX512VBMI-NEXT:    vpmovwb %zmm0, (%rsi)
; AVX512VBMI-NEXT:    vzeroupper
; AVX512VBMI-NEXT:    retq
;
; AVX512VBMIVL-LABEL: shuffle_v64i8_to_v32i8:
; AVX512VBMIVL:       # %bb.0:
; AVX512VBMIVL-NEXT:    vmovdqa64 (%rdi), %zmm0
; AVX512VBMIVL-NEXT:    vpmovwb %zmm0, (%rsi)
; AVX512VBMIVL-NEXT:    vzeroupper
; AVX512VBMIVL-NEXT:    retq
  %vec = load <64 x i8>, <64 x i8>* %L
  %strided.vec = shufflevector <64 x i8> %vec, <64 x i8> undef, <32 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14, i32 16, i32 18, i32 20, i32 22, i32 24, i32 26, i32 28, i32 30, i32 32, i32 34, i32 36, i32 38, i32 40, i32 42, i32 44, i32 46, i32 48, i32 50, i32 52, i32 54, i32 56, i32 58, i32 60, i32 62>
  store <32 x i8> %strided.vec, <32 x i8>* %S
  ret void
}

define void @trunc_v32i16_to_v32i8(<64 x i8>* %L, <32 x i8>* %S) nounwind {
; AVX512F-LABEL: trunc_v32i16_to_v32i8:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    vpmovzxwd {{.*#+}} zmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero
; AVX512F-NEXT:    vpmovzxwd {{.*#+}} zmm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero
; AVX512F-NEXT:    vpmovdb %zmm1, 16(%rsi)
; AVX512F-NEXT:    vpmovdb %zmm0, (%rsi)
; AVX512F-NEXT:    vzeroupper
; AVX512F-NEXT:    retq
;
; AVX512VL-LABEL: trunc_v32i16_to_v32i8:
; AVX512VL:       # %bb.0:
; AVX512VL-NEXT:    vpmovzxwd {{.*#+}} zmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero
; AVX512VL-NEXT:    vpmovzxwd {{.*#+}} zmm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero
; AVX512VL-NEXT:    vpmovdb %zmm1, 16(%rsi)
; AVX512VL-NEXT:    vpmovdb %zmm0, (%rsi)
; AVX512VL-NEXT:    vzeroupper
; AVX512VL-NEXT:    retq
;
; AVX512BW-LABEL: trunc_v32i16_to_v32i8:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    vmovdqa64 (%rdi), %zmm0
; AVX512BW-NEXT:    vpmovwb %zmm0, (%rsi)
; AVX512BW-NEXT:    vzeroupper
; AVX512BW-NEXT:    retq
;
; AVX512BWVL-LABEL: trunc_v32i16_to_v32i8:
; AVX512BWVL:       # %bb.0:
; AVX512BWVL-NEXT:    vmovdqa64 (%rdi), %zmm0
; AVX512BWVL-NEXT:    vpmovwb %zmm0, (%rsi)
; AVX512BWVL-NEXT:    vzeroupper
; AVX512BWVL-NEXT:    retq
;
; AVX512VBMI-LABEL: trunc_v32i16_to_v32i8:
; AVX512VBMI:       # %bb.0:
; AVX512VBMI-NEXT:    vmovdqa64 (%rdi), %zmm0
; AVX512VBMI-NEXT:    vpmovwb %zmm0, (%rsi)
; AVX512VBMI-NEXT:    vzeroupper
; AVX512VBMI-NEXT:    retq
;
; AVX512VBMIVL-LABEL: trunc_v32i16_to_v32i8:
; AVX512VBMIVL:       # %bb.0:
; AVX512VBMIVL-NEXT:    vmovdqa64 (%rdi), %zmm0
; AVX512VBMIVL-NEXT:    vpmovwb %zmm0, (%rsi)
; AVX512VBMIVL-NEXT:    vzeroupper
; AVX512VBMIVL-NEXT:    retq
  %vec = load <64 x i8>, <64 x i8>* %L
  %bc = bitcast <64 x i8> %vec to <32 x i16>
  %strided.vec = trunc <32 x i16> %bc to <32 x i8>
  store <32 x i8> %strided.vec, <32 x i8>* %S
  ret void
}

define void @shuffle_v32i16_to_v16i16(<32 x i16>* %L, <16 x i16>* %S) nounwind {
; AVX512-LABEL: shuffle_v32i16_to_v16i16:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vmovdqa64 (%rdi), %zmm0
; AVX512-NEXT:    vpmovdw %zmm0, (%rsi)
; AVX512-NEXT:    vzeroupper
; AVX512-NEXT:    retq
  %vec = load <32 x i16>, <32 x i16>* %L
  %strided.vec = shufflevector <32 x i16> %vec, <32 x i16> undef, <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14, i32 16, i32 18, i32 20, i32 22, i32 24, i32 26, i32 28, i32 30>
  store <16 x i16> %strided.vec, <16 x i16>* %S
  ret void
}

define void @trunc_v16i32_to_v16i16(<32 x i16>* %L, <16 x i16>* %S) nounwind {
; AVX512-LABEL: trunc_v16i32_to_v16i16:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vmovdqa64 (%rdi), %zmm0
; AVX512-NEXT:    vpmovdw %zmm0, (%rsi)
; AVX512-NEXT:    vzeroupper
; AVX512-NEXT:    retq
  %vec = load <32 x i16>, <32 x i16>* %L
  %bc = bitcast <32 x i16> %vec to <16 x i32>
  %strided.vec = trunc <16 x i32> %bc to <16 x i16>
  store <16 x i16> %strided.vec, <16 x i16>* %S
  ret void
}

define void @shuffle_v16i32_to_v8i32(<16 x i32>* %L, <8 x i32>* %S) nounwind {
; AVX512-LABEL: shuffle_v16i32_to_v8i32:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vmovdqa64 (%rdi), %zmm0
; AVX512-NEXT:    vpmovqd %zmm0, (%rsi)
; AVX512-NEXT:    vzeroupper
; AVX512-NEXT:    retq
  %vec = load <16 x i32>, <16 x i32>* %L
  %strided.vec = shufflevector <16 x i32> %vec, <16 x i32> undef, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
  store <8 x i32> %strided.vec, <8 x i32>* %S
  ret void
}

define void @trunc_v8i64_to_v8i32(<16 x i32>* %L, <8 x i32>* %S) nounwind {
; AVX512-LABEL: trunc_v8i64_to_v8i32:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vmovdqa64 (%rdi), %zmm0
; AVX512-NEXT:    vpmovqd %zmm0, (%rsi)
; AVX512-NEXT:    vzeroupper
; AVX512-NEXT:    retq
  %vec = load <16 x i32>, <16 x i32>* %L
  %bc = bitcast <16 x i32> %vec to <8 x i64>
  %strided.vec = trunc <8 x i64> %bc to <8 x i32>
  store <8 x i32> %strided.vec, <8 x i32>* %S
  ret void
}

define void @shuffle_v64i8_to_v16i8(<64 x i8>* %L, <16 x i8>* %S) nounwind {
; AVX512-LABEL: shuffle_v64i8_to_v16i8:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vmovdqa64 (%rdi), %zmm0
; AVX512-NEXT:    vpmovdb %zmm0, (%rsi)
; AVX512-NEXT:    vzeroupper
; AVX512-NEXT:    retq
  %vec = load <64 x i8>, <64 x i8>* %L
  %strided.vec = shufflevector <64 x i8> %vec, <64 x i8> undef, <16 x i32> <i32 0, i32 4, i32 8, i32 12, i32 16, i32 20, i32 24, i32 28, i32 32, i32 36, i32 40, i32 44, i32 48, i32 52, i32 56, i32 60>
  store <16 x i8> %strided.vec, <16 x i8>* %S
  ret void
}

define void @trunc_v16i32_to_v16i8(<64 x i8>* %L, <16 x i8>* %S) nounwind {
; AVX512-LABEL: trunc_v16i32_to_v16i8:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vmovdqa64 (%rdi), %zmm0
; AVX512-NEXT:    vpmovdb %zmm0, (%rsi)
; AVX512-NEXT:    vzeroupper
; AVX512-NEXT:    retq
  %vec = load <64 x i8>, <64 x i8>* %L
  %bc = bitcast <64 x i8> %vec to <16 x i32>
  %strided.vec = trunc <16 x i32> %bc to <16 x i8>
  store <16 x i8> %strided.vec, <16 x i8>* %S
  ret void
}

define void @shuffle_v32i16_to_v8i16(<32 x i16>* %L, <8 x i16>* %S) nounwind {
; AVX512-LABEL: shuffle_v32i16_to_v8i16:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vmovdqa64 (%rdi), %zmm0
; AVX512-NEXT:    vpmovqw %zmm0, (%rsi)
; AVX512-NEXT:    vzeroupper
; AVX512-NEXT:    retq
  %vec = load <32 x i16>, <32 x i16>* %L
  %strided.vec = shufflevector <32 x i16> %vec, <32 x i16> undef, <8 x i32> <i32 0, i32 4, i32 8, i32 12, i32 16, i32 20, i32 24, i32 28>
  store <8 x i16> %strided.vec, <8 x i16>* %S
  ret void
}

define void @trunc_v8i64_to_v8i16(<32 x i16>* %L, <8 x i16>* %S) nounwind {
; AVX512-LABEL: trunc_v8i64_to_v8i16:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vmovdqa64 (%rdi), %zmm0
; AVX512-NEXT:    vpmovqw %zmm0, (%rsi)
; AVX512-NEXT:    vzeroupper
; AVX512-NEXT:    retq
  %vec = load <32 x i16>, <32 x i16>* %L
  %bc = bitcast <32 x i16> %vec to <8 x i64>
  %strided.vec = trunc <8 x i64> %bc to <8 x i16>
  store <8 x i16> %strided.vec, <8 x i16>* %S
  ret void
}

define void @shuffle_v64i8_to_v8i8(<64 x i8>* %L, <8 x i8>* %S) nounwind {
; AVX512-LABEL: shuffle_v64i8_to_v8i8:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vmovdqa64 (%rdi), %zmm0
; AVX512-NEXT:    vpmovqb %zmm0, (%rsi)
; AVX512-NEXT:    vzeroupper
; AVX512-NEXT:    retq
  %vec = load <64 x i8>, <64 x i8>* %L
  %strided.vec = shufflevector <64 x i8> %vec, <64 x i8> undef, <8 x i32> <i32 0, i32 8, i32 16, i32 24, i32 32, i32 40, i32 48, i32 56>
  store <8 x i8> %strided.vec, <8 x i8>* %S
  ret void
}

define void @trunc_v8i64_to_v8i8(<64 x i8>* %L, <8 x i8>* %S) nounwind {
; AVX512-LABEL: trunc_v8i64_to_v8i8:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vmovdqa64 (%rdi), %zmm0
; AVX512-NEXT:    vpmovqb %zmm0, (%rsi)
; AVX512-NEXT:    vzeroupper
; AVX512-NEXT:    retq
  %vec = load <64 x i8>, <64 x i8>* %L
  %bc = bitcast <64 x i8> %vec to <8 x i64>
  %strided.vec = trunc <8 x i64> %bc to <8 x i8>
  store <8 x i8> %strided.vec, <8 x i8>* %S
  ret void
}

define <16 x i8> @trunc_shuffle_v64i8_01_05_09_13_17_21_25_29_33_37_41_45_49_53_57_61(<64 x i8> %x) {
; AVX512F-LABEL: trunc_shuffle_v64i8_01_05_09_13_17_21_25_29_33_37_41_45_49_53_57_61:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
; AVX512F-NEXT:    vextracti128 $1, %ymm1, %xmm2
; AVX512F-NEXT:    vmovdqa {{.*#+}} xmm3 = <u,u,u,u,1,5,9,13,u,u,u,u,u,u,u,u>
; AVX512F-NEXT:    vpshufb %xmm3, %xmm2, %xmm2
; AVX512F-NEXT:    vpshufb %xmm3, %xmm1, %xmm1
; AVX512F-NEXT:    vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
; AVX512F-NEXT:    vextracti128 $1, %ymm0, %xmm2
; AVX512F-NEXT:    vmovdqa {{.*#+}} xmm3 = <1,5,9,13,u,u,u,u,u,u,u,u,u,u,u,u>
; AVX512F-NEXT:    vpshufb %xmm3, %xmm2, %xmm2
; AVX512F-NEXT:    vpshufb %xmm3, %xmm0, %xmm0
; AVX512F-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
; AVX512F-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
; AVX512F-NEXT:    vzeroupper
; AVX512F-NEXT:    retq
;
; AVX512VL-LABEL: trunc_shuffle_v64i8_01_05_09_13_17_21_25_29_33_37_41_45_49_53_57_61:
; AVX512VL:       # %bb.0:
; AVX512VL-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
; AVX512VL-NEXT:    vextracti128 $1, %ymm1, %xmm2
; AVX512VL-NEXT:    vmovdqa {{.*#+}} xmm3 = <u,u,u,u,1,5,9,13,u,u,u,u,u,u,u,u>
; AVX512VL-NEXT:    vpshufb %xmm3, %xmm2, %xmm2
; AVX512VL-NEXT:    vpshufb %xmm3, %xmm1, %xmm1
; AVX512VL-NEXT:    vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
; AVX512VL-NEXT:    vextracti128 $1, %ymm0, %xmm2
; AVX512VL-NEXT:    vmovdqa {{.*#+}} xmm3 = <1,5,9,13,u,u,u,u,u,u,u,u,u,u,u,u>
; AVX512VL-NEXT:    vpshufb %xmm3, %xmm2, %xmm2
; AVX512VL-NEXT:    vpshufb %xmm3, %xmm0, %xmm0
; AVX512VL-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
; AVX512VL-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
; AVX512VL-NEXT:    vzeroupper
; AVX512VL-NEXT:    retq
;
; AVX512BW-LABEL: trunc_shuffle_v64i8_01_05_09_13_17_21_25_29_33_37_41_45_49_53_57_61:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
; AVX512BW-NEXT:    vextracti128 $1, %ymm1, %xmm2
; AVX512BW-NEXT:    vmovdqa {{.*#+}} xmm3 = <u,u,u,u,1,5,9,13,u,u,u,u,u,u,u,u>
; AVX512BW-NEXT:    vpshufb %xmm3, %xmm2, %xmm2
; AVX512BW-NEXT:    vpshufb %xmm3, %xmm1, %xmm1
; AVX512BW-NEXT:    vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
; AVX512BW-NEXT:    vextracti128 $1, %ymm0, %xmm2
; AVX512BW-NEXT:    vmovdqa {{.*#+}} xmm3 = <1,5,9,13,u,u,u,u,u,u,u,u,u,u,u,u>
; AVX512BW-NEXT:    vpshufb %xmm3, %xmm2, %xmm2
; AVX512BW-NEXT:    vpshufb %xmm3, %xmm0, %xmm0
; AVX512BW-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
; AVX512BW-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
; AVX512BW-NEXT:    vzeroupper
; AVX512BW-NEXT:    retq
;
; AVX512BWVL-LABEL: trunc_shuffle_v64i8_01_05_09_13_17_21_25_29_33_37_41_45_49_53_57_61:
; AVX512BWVL:       # %bb.0:
; AVX512BWVL-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
; AVX512BWVL-NEXT:    vextracti128 $1, %ymm1, %xmm2
; AVX512BWVL-NEXT:    vmovdqa {{.*#+}} xmm3 = <u,u,u,u,1,5,9,13,u,u,u,u,u,u,u,u>
; AVX512BWVL-NEXT:    vpshufb %xmm3, %xmm2, %xmm2
; AVX512BWVL-NEXT:    vpshufb %xmm3, %xmm1, %xmm1
; AVX512BWVL-NEXT:    vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
; AVX512BWVL-NEXT:    vextracti128 $1, %ymm0, %xmm2
; AVX512BWVL-NEXT:    vmovdqa {{.*#+}} xmm3 = <1,5,9,13,u,u,u,u,u,u,u,u,u,u,u,u>
; AVX512BWVL-NEXT:    vpshufb %xmm3, %xmm2, %xmm2
; AVX512BWVL-NEXT:    vpshufb %xmm3, %xmm0, %xmm0
; AVX512BWVL-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
; AVX512BWVL-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
; AVX512BWVL-NEXT:    vzeroupper
; AVX512BWVL-NEXT:    retq
;
; AVX512VBMI-LABEL: trunc_shuffle_v64i8_01_05_09_13_17_21_25_29_33_37_41_45_49_53_57_61:
; AVX512VBMI:       # %bb.0:
; AVX512VBMI-NEXT:    vmovdqa {{.*#+}} xmm1 = [1,5,9,13,17,21,25,29,33,37,41,45,49,53,57,61]
; AVX512VBMI-NEXT:    vpermb %zmm0, %zmm1, %zmm0
; AVX512VBMI-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
; AVX512VBMI-NEXT:    vzeroupper
; AVX512VBMI-NEXT:    retq
;
; AVX512VBMIVL-LABEL: trunc_shuffle_v64i8_01_05_09_13_17_21_25_29_33_37_41_45_49_53_57_61:
; AVX512VBMIVL:       # %bb.0:
; AVX512VBMIVL-NEXT:    vmovdqa {{.*#+}} xmm1 = [1,5,9,13,17,21,25,29,33,37,41,45,49,53,57,61]
; AVX512VBMIVL-NEXT:    vextracti64x4 $1, %zmm0, %ymm2
; AVX512VBMIVL-NEXT:    vpermt2b %ymm2, %ymm1, %ymm0
; AVX512VBMIVL-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
; AVX512VBMIVL-NEXT:    vzeroupper
; AVX512VBMIVL-NEXT:    retq
  %res = shufflevector <64 x i8> %x, <64 x i8> %x, <16 x i32> <i32 1, i32 5, i32 9, i32 13, i32 17, i32 21, i32 25, i32 29, i32 33, i32 37, i32 41, i32 45, i32 49, i32 53, i32 57, i32 61>
  ret <16 x i8> %res
}

define <16 x i8> @trunc_shuffle_v64i8_01_05_09_13_17_21_25_29_33_37_41_45_49_53_57_62(<64 x i8> %x) {
; AVX512F-LABEL: trunc_shuffle_v64i8_01_05_09_13_17_21_25_29_33_37_41_45_49_53_57_62:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    vextracti128 $1, %ymm0, %xmm1
; AVX512F-NEXT:    vmovdqa {{.*#+}} xmm2 = <1,5,9,13,u,u,u,u,u,u,u,u,u,u,u,u>
; AVX512F-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
; AVX512F-NEXT:    vpshufb %xmm2, %xmm0, %xmm2
; AVX512F-NEXT:    vpunpckldq {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
; AVX512F-NEXT:    vextracti64x4 $1, %zmm0, %ymm0
; AVX512F-NEXT:    vextracti128 $1, %ymm0, %xmm2
; AVX512F-NEXT:    vpshufb {{.*#+}} xmm2 = xmm2[u,u,u,u,1,5,9,14,u,u,u,u,u,u,u,u]
; AVX512F-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,u,1,5,9,13,u,u,u,u,u,u,u,u]
; AVX512F-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
; AVX512F-NEXT:    vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
; AVX512F-NEXT:    vzeroupper
; AVX512F-NEXT:    retq
;
; AVX512VL-LABEL: trunc_shuffle_v64i8_01_05_09_13_17_21_25_29_33_37_41_45_49_53_57_62:
; AVX512VL:       # %bb.0:
; AVX512VL-NEXT:    vextracti128 $1, %ymm0, %xmm1
; AVX512VL-NEXT:    vmovdqa {{.*#+}} xmm2 = <1,5,9,13,u,u,u,u,u,u,u,u,u,u,u,u>
; AVX512VL-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
; AVX512VL-NEXT:    vpshufb %xmm2, %xmm0, %xmm2
; AVX512VL-NEXT:    vpunpckldq {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
; AVX512VL-NEXT:    vextracti64x4 $1, %zmm0, %ymm0
; AVX512VL-NEXT:    vextracti128 $1, %ymm0, %xmm2
; AVX512VL-NEXT:    vpshufb {{.*#+}} xmm2 = xmm2[u,u,u,u,1,5,9,14,u,u,u,u,u,u,u,u]
; AVX512VL-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,u,1,5,9,13,u,u,u,u,u,u,u,u]
; AVX512VL-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
; AVX512VL-NEXT:    vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
; AVX512VL-NEXT:    vzeroupper
; AVX512VL-NEXT:    retq
;
; AVX512BW-LABEL: trunc_shuffle_v64i8_01_05_09_13_17_21_25_29_33_37_41_45_49_53_57_62:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    vextracti128 $1, %ymm0, %xmm1
; AVX512BW-NEXT:    vmovdqa {{.*#+}} xmm2 = <1,5,9,13,u,u,u,u,u,u,u,u,u,u,u,u>
; AVX512BW-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
; AVX512BW-NEXT:    vpshufb %xmm2, %xmm0, %xmm2
; AVX512BW-NEXT:    vpunpckldq {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
; AVX512BW-NEXT:    vextracti64x4 $1, %zmm0, %ymm0
; AVX512BW-NEXT:    vextracti128 $1, %ymm0, %xmm2
; AVX512BW-NEXT:    vpshufb {{.*#+}} xmm2 = xmm2[u,u,u,u,1,5,9,14,u,u,u,u,u,u,u,u]
; AVX512BW-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,u,1,5,9,13,u,u,u,u,u,u,u,u]
; AVX512BW-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
; AVX512BW-NEXT:    vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
; AVX512BW-NEXT:    vzeroupper
; AVX512BW-NEXT:    retq
;
; AVX512BWVL-LABEL: trunc_shuffle_v64i8_01_05_09_13_17_21_25_29_33_37_41_45_49_53_57_62:
; AVX512BWVL:       # %bb.0:
; AVX512BWVL-NEXT:    vextracti128 $1, %ymm0, %xmm1
; AVX512BWVL-NEXT:    vmovdqa {{.*#+}} xmm2 = <1,5,9,13,u,u,u,u,u,u,u,u,u,u,u,u>
; AVX512BWVL-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
; AVX512BWVL-NEXT:    vpshufb %xmm2, %xmm0, %xmm2
; AVX512BWVL-NEXT:    vpunpckldq {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
; AVX512BWVL-NEXT:    vextracti64x4 $1, %zmm0, %ymm0
; AVX512BWVL-NEXT:    vextracti128 $1, %ymm0, %xmm2
; AVX512BWVL-NEXT:    vpshufb {{.*#+}} xmm2 = xmm2[u,u,u,u,1,5,9,14,u,u,u,u,u,u,u,u]
; AVX512BWVL-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,u,1,5,9,13,u,u,u,u,u,u,u,u]
; AVX512BWVL-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
; AVX512BWVL-NEXT:    vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
; AVX512BWVL-NEXT:    vzeroupper
; AVX512BWVL-NEXT:    retq
;
; AVX512VBMI-LABEL: trunc_shuffle_v64i8_01_05_09_13_17_21_25_29_33_37_41_45_49_53_57_62:
; AVX512VBMI:       # %bb.0:
; AVX512VBMI-NEXT:    vmovdqa {{.*#+}} xmm1 = [1,5,9,13,17,21,25,29,33,37,41,45,49,53,57,62]
; AVX512VBMI-NEXT:    vpermb %zmm0, %zmm1, %zmm0
; AVX512VBMI-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
; AVX512VBMI-NEXT:    vzeroupper
; AVX512VBMI-NEXT:    retq
;
; AVX512VBMIVL-LABEL: trunc_shuffle_v64i8_01_05_09_13_17_21_25_29_33_37_41_45_49_53_57_62:
; AVX512VBMIVL:       # %bb.0:
; AVX512VBMIVL-NEXT:    vmovdqa {{.*#+}} xmm1 = [1,5,9,13,17,21,25,29,33,37,41,45,49,53,57,62]
; AVX512VBMIVL-NEXT:    vextracti64x4 $1, %zmm0, %ymm2
; AVX512VBMIVL-NEXT:    vpermt2b %ymm2, %ymm1, %ymm0
; AVX512VBMIVL-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
; AVX512VBMIVL-NEXT:    vzeroupper
; AVX512VBMIVL-NEXT:    retq
  %res = shufflevector <64 x i8> %x, <64 x i8> %x, <16 x i32> <i32 1, i32 5, i32 9, i32 13, i32 17, i32 21, i32 25, i32 29, i32 33, i32 37, i32 41, i32 45, i32 49, i32 53, i32 57, i32 62>
  ret <16 x i8> %res
}

define <4 x double> @PR34175(<32 x i16>* %p) {
; AVX512F-LABEL: PR34175:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    vmovdqu (%rdi), %xmm0
; AVX512F-NEXT:    vmovdqu 32(%rdi), %xmm1
; AVX512F-NEXT:    vpunpcklwd {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1],xmm1[2],mem[2],xmm1[3],mem[3]
; AVX512F-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
; AVX512F-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; AVX512F-NEXT:    vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
; AVX512F-NEXT:    vcvtdq2pd %xmm0, %ymm0
; AVX512F-NEXT:    retq
;
; AVX512VL-LABEL: PR34175:
; AVX512VL:       # %bb.0:
; AVX512VL-NEXT:    vmovdqu (%rdi), %xmm0
; AVX512VL-NEXT:    vmovdqu 32(%rdi), %xmm1
; AVX512VL-NEXT:    vpunpcklwd {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1],xmm1[2],mem[2],xmm1[3],mem[3]
; AVX512VL-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
; AVX512VL-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; AVX512VL-NEXT:    vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
; AVX512VL-NEXT:    vcvtdq2pd %xmm0, %ymm0
; AVX512VL-NEXT:    retq
;
; AVX512BW-LABEL: PR34175:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    vmovdqa {{.*#+}} ymm0 = <0,8,32,40,u,u,u,u,u,u,u,u,u,u,u,u>
; AVX512BW-NEXT:    vmovdqu (%rdi), %ymm1
; AVX512BW-NEXT:    vmovdqu 32(%rdi), %ymm2
; AVX512BW-NEXT:    vpermt2w %zmm2, %zmm0, %zmm1
; AVX512BW-NEXT:    vpmovzxwd {{.*#+}} xmm0 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
; AVX512BW-NEXT:    vcvtdq2pd %xmm0, %ymm0
; AVX512BW-NEXT:    retq
;
; AVX512BWVL-LABEL: PR34175:
; AVX512BWVL:       # %bb.0:
; AVX512BWVL-NEXT:    vmovdqu (%rdi), %ymm0
; AVX512BWVL-NEXT:    vpbroadcastq {{.*#+}} ymm1 = [6755468161056768,6755468161056768,6755468161056768,6755468161056768]
; AVX512BWVL-NEXT:    vpermi2w 32(%rdi), %ymm0, %ymm1
; AVX512BWVL-NEXT:    vpmovzxwd {{.*#+}} xmm0 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
; AVX512BWVL-NEXT:    vcvtdq2pd %xmm0, %ymm0
; AVX512BWVL-NEXT:    retq
;
; AVX512VBMI-LABEL: PR34175:
; AVX512VBMI:       # %bb.0:
; AVX512VBMI-NEXT:    vmovdqa {{.*#+}} ymm0 = <0,8,32,40,u,u,u,u,u,u,u,u,u,u,u,u>
; AVX512VBMI-NEXT:    vmovdqu (%rdi), %ymm1
; AVX512VBMI-NEXT:    vmovdqu 32(%rdi), %ymm2
; AVX512VBMI-NEXT:    vpermt2w %zmm2, %zmm0, %zmm1
; AVX512VBMI-NEXT:    vpmovzxwd {{.*#+}} xmm0 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
; AVX512VBMI-NEXT:    vcvtdq2pd %xmm0, %ymm0
; AVX512VBMI-NEXT:    retq
;
; AVX512VBMIVL-LABEL: PR34175:
; AVX512VBMIVL:       # %bb.0:
; AVX512VBMIVL-NEXT:    vmovdqu (%rdi), %ymm0
; AVX512VBMIVL-NEXT:    vpbroadcastq {{.*#+}} ymm1 = [6755468161056768,6755468161056768,6755468161056768,6755468161056768]
; AVX512VBMIVL-NEXT:    vpermi2w 32(%rdi), %ymm0, %ymm1
; AVX512VBMIVL-NEXT:    vpmovzxwd {{.*#+}} xmm0 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
; AVX512VBMIVL-NEXT:    vcvtdq2pd %xmm0, %ymm0
; AVX512VBMIVL-NEXT:    retq
  %v = load <32 x i16>, <32 x i16>* %p, align 2
  %shuf = shufflevector <32 x i16> %v, <32 x i16> undef, <4 x i32> <i32 0, i32 8, i32 16, i32 24>
  %tofp = uitofp <4 x i16> %shuf to <4 x double>
  ret <4 x double> %tofp
}

define <16 x i8> @trunc_v8i64_to_v8i8_return_v16i8(<8 x i64> %vec) nounwind {
; AVX512-LABEL: trunc_v8i64_to_v8i8_return_v16i8:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vpmovqb %zmm0, %xmm0
; AVX512-NEXT:    vzeroupper
; AVX512-NEXT:    retq
  %truncated = trunc <8 x i64> %vec to <8 x i8>
  %result = shufflevector <8 x i8> %truncated, <8 x i8> zeroinitializer, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
  ret <16 x i8> %result