; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx2 | FileCheck %s

; X86 shuffle intrinsics exercised by the functions in this file.
; Variable permutes of the eight 32-bit elements of a 256-bit vector.
declare <8 x i32> @llvm.x86.avx2.permd(<8 x i32>, <8 x i32>)
declare <8 x float> @llvm.x86.avx2.permps(<8 x float>, <8 x i32>)
; Variable byte shuffles: 128-bit (SSSE3) and 256-bit (AVX2) forms.
declare <16 x i8> @llvm.x86.ssse3.pshuf.b.128(<16 x i8>, <16 x i8>)
declare <32 x i8> @llvm.x86.avx2.pshuf.b(<32 x i8>, <32 x i8>)

; The pshufb mask zeroes bytes 0-7 of each 128-bit lane (index 128 has the
; zeroing bit set) and moves source bytes 0-7 up to bytes 8-15. The following
; shufflevector takes zeros for bytes 0-7 of each lane and the already-zeroed
; low 8 bytes of each lane of the pshufb result for bytes 8-15. Every result
; byte is therefore zero, so the expected assembly is a single vxorps.
define <32 x i8> @combine_pshufb_pslldq(<32 x i8> %a0) {
; CHECK-LABEL: combine_pshufb_pslldq:
; CHECK:       # BB#0:
; CHECK-NEXT:    vxorps %ymm0, %ymm0, %ymm0
; CHECK-NEXT:    retq
  %1 = tail call <32 x i8> @llvm.x86.avx2.pshuf.b(<32 x i8> %a0, <32 x i8> <i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7>)
  %2 = shufflevector <32 x i8> %1, <32 x i8> zeroinitializer, <32 x i32> <i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23>
  ret <32 x i8> %2
}

; Mirror image of the shift-left case: the pshufb mask moves bytes 8-15 of each
; 128-bit lane down to bytes 0-7 and zeroes bytes 8-15. The shufflevector then
; reads only bytes 8-15 and 24-31 of that result (both zeroed) plus elements of
; the zeroinitializer operand, so the whole vector is zero and the expected
; assembly is a single vxorps.
define <32 x i8> @combine_pshufb_psrldq(<32 x i8> %a0) {
; CHECK-LABEL: combine_pshufb_psrldq:
; CHECK:       # BB#0:
; CHECK-NEXT:    vxorps %ymm0, %ymm0, %ymm0
; CHECK-NEXT:    retq
  %1 = tail call <32 x i8> @llvm.x86.avx2.pshuf.b(<32 x i8> %a0, <32 x i8> <i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128>)
  %2 = shufflevector <32 x i8> %1, <32 x i8> zeroinitializer, <32 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32>
  ret <32 x i8> %2
}

; permd with indices <0,1,2,3,4,5,6,4> is the identity except that dword 7 is
; replaced by dword 4. The byte shufflevector is the identity except that the
; last byte repeats byte 30. Combined, result bytes 28-31 come from source
; bytes 16,17,18,18, which stays inside the upper 128-bit lane, so the expected
; assembly is a single vpshufb.
define <32 x i8> @combine_pshufb_vpermd(<8 x i32> %a) {
; CHECK-LABEL: combine_pshufb_vpermd:
; CHECK:       # BB#0:
; CHECK-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,16,17,18,18]
; CHECK-NEXT:    retq
  %tmp0 = call <8 x i32> @llvm.x86.avx2.permd(<8 x i32> %a, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 4>)
  %tmp1 = bitcast <8 x i32> %tmp0 to <32 x i8>
  %tmp2 = shufflevector <32 x i8> %tmp1, <32 x i8> undef, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 30>
  ret <32 x i8> %tmp2
}

; Floating-point twin of the permd + byte-shuffle case: permps with indices
; <0,1,2,3,4,5,6,4> followed by a byte shuffle that is the identity except for
; a repeated byte 30. Result bytes 28-31 come from source bytes 16,17,18,18,
; and the expected assembly is the same single vpshufb.
define <32 x i8> @combine_pshufb_vpermps(<8 x float> %a) {
; CHECK-LABEL: combine_pshufb_vpermps:
; CHECK:       # BB#0:
; CHECK-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,16,17,18,18]
; CHECK-NEXT:    retq
  %tmp0 = call <8 x float> @llvm.x86.avx2.permps(<8 x float> %a, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 4>)
  %tmp1 = bitcast <8 x float> %tmp0 to <32 x i8>
  %tmp2 = shufflevector <32 x i8> %tmp1, <32 x i8> undef, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 30>
  ret <32 x i8> %tmp2
}

; The shufflevector reverses the four qwords; the pshufb then swaps the two
; qwords of the low 128-bit lane and zeroes the whole high lane (index 255 has
; the zeroing bit set). Net effect: the low lane holds source qwords 2,3 and
; the high lane is zero, which is expected to fold into one vperm2i128. The
; trailing add with a non-splat constant is expected to become a vpaddq from
; the constant pool.
define <4 x i64> @combine_permq_pshufb_as_vperm2i128(<4 x i64> %a0) {
; CHECK-LABEL: combine_permq_pshufb_as_vperm2i128:
; CHECK:       # BB#0:
; CHECK-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],zero,zero
; CHECK-NEXT:    vpaddq {{.*}}(%rip), %ymm0, %ymm0
; CHECK-NEXT:    retq
  %1 = shufflevector <4 x i64> %a0, <4 x i64> undef, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
  %2 = bitcast <4 x i64> %1 to <32 x i8>
  %3 = call <32 x i8> @llvm.x86.avx2.pshuf.b(<32 x i8> %2, <32 x i8> <i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 255, i8 255, i8 255, i8 255, i8 255, i8 255, i8 255, i8 255, i8 255, i8 255, i8 255, i8 255, i8 255, i8 255, i8 255, i8 255>)
  %4 = bitcast <32 x i8> %3 to <4 x i64>
  %5 = add <4 x i64> %4, <i64 1, i64 1, i64 3, i64 3>
  ret <4 x i64> %5
}

; The shufflevector swaps the two qwords inside each 128-bit lane; the pshufb
; swaps them back in the low lane and zeroes the high lane. Net effect: the low
; lane is the unchanged source and the high lane is zero, which is expected to
; become a blend with a zeroed register (vpxor + vpblendd).
define <32 x i8> @combine_permq_pshufb_as_vpblendd(<4 x i64> %a0) {
; CHECK-LABEL: combine_permq_pshufb_as_vpblendd:
; CHECK:       # BB#0:
; CHECK-NEXT:    vpxor %ymm1, %ymm1, %ymm1
; CHECK-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
; CHECK-NEXT:    retq
  %1 = shufflevector <4 x i64> %a0, <4 x i64> undef, <4 x i32> <i32 1, i32 0, i32 3, i32 2>
  %2 = bitcast <4 x i64> %1 to <32 x i8>
  %3 = call <32 x i8> @llvm.x86.avx2.pshuf.b(<32 x i8> %2, <32 x i8> <i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 255, i8 255, i8 255, i8 255, i8 255, i8 255, i8 255, i8 255, i8 255, i8 255, i8 255, i8 255, i8 255, i8 255, i8 255, i8 255>)
  ret <32 x i8> %3
}

; An all-zero pshufb mask selects source byte 0 for every result byte, so the
; expected assembly is a 128-bit vpbroadcastb.
define <16 x i8> @combine_pshufb_as_vpbroadcastb128(<16 x i8> %a) {
; CHECK-LABEL: combine_pshufb_as_vpbroadcastb128:
; CHECK:       # BB#0:
; CHECK-NEXT:    vpbroadcastb %xmm0, %xmm0
; CHECK-NEXT:    retq
  %1 = call <16 x i8> @llvm.x86.ssse3.pshuf.b.128(<16 x i8> %a, <16 x i8> zeroinitializer)
  ret <16 x i8> %1
}

; The 128-bit argument is widened to 256 bits with only qword 0 defined. The
; all-zero pshufb mask splats byte 0 within each lane, and the all-zero permd
; then copies dword 0 to all eight dwords. The chain is expected to collapse to
; one xmm-to-ymm vpbroadcastb (after the register-widening kill annotation).
define <32 x i8> @combine_pshufb_as_vpbroadcastb256(<2 x i64> %a) {
; CHECK-LABEL: combine_pshufb_as_vpbroadcastb256:
; CHECK:       # BB#0:
; CHECK-NEXT:    # kill: %XMM0<def> %XMM0<kill> %YMM0<def>
; CHECK-NEXT:    vpbroadcastb %xmm0, %ymm0
; CHECK-NEXT:    retq
  %1 = shufflevector <2 x i64> %a, <2 x i64> undef, <4 x i32> <i32 0, i32 undef, i32 undef, i32 undef>
  %2 = bitcast <4 x i64> %1 to <32 x i8>
  %3 = call <32 x i8> @llvm.x86.avx2.pshuf.b(<32 x i8> %2, <32 x i8> zeroinitializer)
  %4 = bitcast <32 x i8> %3 to <8 x i32>
  %5 = call <8 x i32> @llvm.x86.avx2.permd(<8 x i32> %4, <8 x i32> zeroinitializer)
  %6 = bitcast <8 x i32> %5 to <32 x i8>
  ret <32 x i8> %6
}

; The pshufb mask repeats byte indices 0,1 eight times, i.e. it splats 16-bit
; word 0, so the expected assembly is a 128-bit vpbroadcastw.
define <16 x i8> @combine_pshufb_as_vpbroadcastw128(<16 x i8> %a) {
; CHECK-LABEL: combine_pshufb_as_vpbroadcastw128:
; CHECK:       # BB#0:
; CHECK-NEXT:    vpbroadcastw %xmm0, %xmm0
; CHECK-NEXT:    retq
  %1 = call <16 x i8> @llvm.x86.ssse3.pshuf.b.128(<16 x i8> %a, <16 x i8> <i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1>)
  ret <16 x i8> %1
}

; The 128-bit argument is widened to 256 bits with only qword 0 defined. The
; pshufb mask (0,1 repeated) splats word 0 within each lane, and the all-zero
; permd then copies dword 0 to all eight dwords. The chain is expected to
; collapse to one xmm-to-ymm vpbroadcastw.
define <32 x i8> @combine_pshufb_as_vpbroadcastw256(<2 x i64> %a) {
; CHECK-LABEL: combine_pshufb_as_vpbroadcastw256:
; CHECK:       # BB#0:
; CHECK-NEXT:    # kill: %XMM0<def> %XMM0<kill> %YMM0<def>
; CHECK-NEXT:    vpbroadcastw %xmm0, %ymm0
; CHECK-NEXT:    retq
  %1 = shufflevector <2 x i64> %a, <2 x i64> undef, <4 x i32> <i32 0, i32 undef, i32 undef, i32 undef>
  %2 = bitcast <4 x i64> %1 to <32 x i8>
  %3 = call <32 x i8> @llvm.x86.avx2.pshuf.b(<32 x i8> %2, <32 x i8> <i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1>)
  %4 = bitcast <32 x i8> %3 to <8 x i32>
  %5 = call <8 x i32> @llvm.x86.avx2.permd(<8 x i32> %4, <8 x i32> zeroinitializer)
  %6 = bitcast <8 x i32> %5 to <32 x i8>
  ret <32 x i8> %6
}

; The pshufb mask repeats byte indices 0-3 four times, i.e. it splats 32-bit
; dword 0. The result feeds a byte-wise add with a non-splat constant, so the
; expected assembly is vpbroadcastd followed by a vpaddb from the constant pool.
define <16 x i8> @combine_pshufb_as_vpbroadcastd128(<16 x i8> %a) {
; CHECK-LABEL: combine_pshufb_as_vpbroadcastd128:
; CHECK:       # BB#0:
; CHECK-NEXT:    vpbroadcastd %xmm0, %xmm0
; CHECK-NEXT:    vpaddb {{.*}}(%rip), %xmm0, %xmm0
; CHECK-NEXT:    retq
  %1 = call <16 x i8> @llvm.x86.ssse3.pshuf.b.128(<16 x i8> %a, <16 x i8> <i8 0, i8 1, i8 2, i8 3, i8 0, i8 1, i8 2, i8 3, i8 0, i8 1, i8 2, i8 3, i8 0, i8 1, i8 2, i8 3>)
  %2 = add <16 x i8> %1, <i8 0, i8 1, i8 2, i8 3, i8 0, i8 1, i8 2, i8 3, i8 0, i8 1, i8 2, i8 3, i8 0, i8 1, i8 2, i8 3>
  ret <16 x i8> %2
}

; The 128-bit argument is widened to eight dwords with only element 0 defined,
; then permd with all-zero indices copies dword 0 everywhere. Expected assembly
; is an xmm-to-ymm vpbroadcastd, followed by a vpaddd for the trailing add with
; a non-splat constant.
define <8 x i32> @combine_permd_as_vpbroadcastd256(<4 x i32> %a) {
; CHECK-LABEL: combine_permd_as_vpbroadcastd256:
; CHECK:       # BB#0:
; CHECK-NEXT:    # kill: %XMM0<def> %XMM0<kill> %YMM0<def>
; CHECK-NEXT:    vpbroadcastd %xmm0, %ymm0
; CHECK-NEXT:    vpaddd {{.*}}(%rip), %ymm0, %ymm0
; CHECK-NEXT:    retq
  %1 = shufflevector <4 x i32> %a, <4 x i32> undef, <8 x i32> <i32 0, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  %2 = call <8 x i32> @llvm.x86.avx2.permd(<8 x i32> %1, <8 x i32> zeroinitializer)
  %3 = add <8 x i32> %2, <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
  ret <8 x i32> %3
}

; The pshufb mask repeats byte indices 0-7 twice, i.e. it splats 64-bit qword
; 0, so the expected assembly is a 128-bit vpbroadcastq.
define <16 x i8> @combine_pshufb_as_vpbroadcastq128(<16 x i8> %a) {
; CHECK-LABEL: combine_pshufb_as_vpbroadcastq128:
; CHECK:       # BB#0:
; CHECK-NEXT:    vpbroadcastq %xmm0, %xmm0
; CHECK-NEXT:    retq
  %1 = call <16 x i8> @llvm.x86.ssse3.pshuf.b.128(<16 x i8> %a, <16 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7>)
  ret <16 x i8> %1
}

; permd with indices alternating 0,1 replicates the low dword pair (qword 0)
; into all four qword positions of the widened input. Expected assembly is an
; xmm-to-ymm vpbroadcastq, followed by a vpaddd for the trailing add with a
; non-splat constant.
define <8 x i32> @combine_permd_as_vpbroadcastq256(<4 x i32> %a) {
; CHECK-LABEL: combine_permd_as_vpbroadcastq256:
; CHECK:       # BB#0:
; CHECK-NEXT:    # kill: %XMM0<def> %XMM0<kill> %YMM0<def>
; CHECK-NEXT:    vpbroadcastq %xmm0, %ymm0
; CHECK-NEXT:    vpaddd {{.*}}(%rip), %ymm0, %ymm0
; CHECK-NEXT:    retq
  %1 = shufflevector <4 x i32> %a, <4 x i32> undef, <8 x i32> <i32 0, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  %2 = call <8 x i32> @llvm.x86.avx2.permd(<8 x i32> %1, <8 x i32> <i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1>)
  %3 = add <8 x i32> %2, <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
  ret <8 x i32> %3
}

; A float vector is bitcast to bytes and shuffled with a pshufb mask that
; repeats byte indices 0-3 (splat of 32-bit element 0), then bitcast back to
; float. Expected assembly is the floating-point broadcast vbroadcastss.
define <4 x float> @combine_pshufb_as_vpbroadcastss128(<4 x float> %a) {
; CHECK-LABEL: combine_pshufb_as_vpbroadcastss128:
; CHECK:       # BB#0:
; CHECK-NEXT:    vbroadcastss %xmm0, %xmm0
; CHECK-NEXT:    retq
  %1 = bitcast <4 x float> %a to <16 x i8>
  %2 = call <16 x i8> @llvm.x86.ssse3.pshuf.b.128(<16 x i8> %1, <16 x i8> <i8 0, i8 1, i8 2, i8 3, i8 0, i8 1, i8 2, i8 3, i8 0, i8 1, i8 2, i8 3, i8 0, i8 1, i8 2, i8 3>)
  %3 = bitcast <16 x i8> %2 to <4 x float>
  ret <4 x float> %3
}

; The 128-bit float argument is widened to eight elements with only element 0
; defined, then permps (note: the float permute, despite "permd" in the
; function name) with all-zero indices copies element 0 everywhere. Expected
; assembly is an xmm-to-ymm vbroadcastss.
define <8 x float> @combine_permd_as_vpbroadcastss256(<4 x float> %a) {
; CHECK-LABEL: combine_permd_as_vpbroadcastss256:
; CHECK:       # BB#0:
; CHECK-NEXT:    # kill: %XMM0<def> %XMM0<kill> %YMM0<def>
; CHECK-NEXT:    vbroadcastss %xmm0, %ymm0
; CHECK-NEXT:    retq
  %1 = shufflevector <4 x float> %a, <4 x float> undef, <8 x i32> <i32 0, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  %2 = call <8 x float> @llvm.x86.avx2.permps(<8 x float> %1, <8 x i32> zeroinitializer)
  ret <8 x float> %2
}

; The 128-bit double argument is widened to four doubles with only element 0
; defined, viewed as eight floats, and permuted by permps with indices
; alternating 0,1, which replicates the low 64 bits into every qword position.
; Expected assembly is an xmm-to-ymm vbroadcastsd.
define <4 x double> @combine_permd_as_vpbroadcastsd256(<2 x double> %a) {
; CHECK-LABEL: combine_permd_as_vpbroadcastsd256:
; CHECK:       # BB#0:
; CHECK-NEXT:    # kill: %XMM0<def> %XMM0<kill> %YMM0<def>
; CHECK-NEXT:    vbroadcastsd %xmm0, %ymm0
; CHECK-NEXT:    retq
  %1 = shufflevector <2 x double> %a, <2 x double> undef, <4 x i32> <i32 0, i32 undef, i32 undef, i32 undef>
  %2 = bitcast <4 x double> %1 to <8 x float>
  %3 = call <8 x float> @llvm.x86.avx2.permps(<8 x float> %2, <8 x i32> <i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1>)
  %4 = bitcast <8 x float> %3 to <4 x double>
  ret <4 x double> %4
}

; The permd indices move dwords in aligned adjacent pairs: (0,1),(4,5),(4,5),
; (2,3). That is the qword permutation [0,2,2,1], so the variable permute is
; expected to become an immediate vpermq.
define <8 x i32> @combine_permd_as_permq(<8 x i32> %a) {
; CHECK-LABEL: combine_permd_as_permq:
; CHECK:       # BB#0:
; CHECK-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,2,1]
; CHECK-NEXT:    retq
  %1 = call <8 x i32> @llvm.x86.avx2.permd(<8 x i32> %a, <8 x i32> <i32 0, i32 1, i32 4, i32 5, i32 4, i32 5, i32 2, i32 3>)
  ret <8 x i32> %1
}

; The permps indices move floats in aligned adjacent pairs: (6,7),(4,5),(0,1),
; (2,3). That is the 64-bit element permutation [3,2,0,1], so the variable
; permute is expected to become an immediate vpermpd.
define <8 x float> @combine_permps_as_permpd(<8 x float> %a) {
; CHECK-LABEL: combine_permps_as_permpd:
; CHECK:       # BB#0:
; CHECK-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[3,2,0,1]
; CHECK-NEXT:    retq
  %1 = call <8 x float> @llvm.x86.avx2.permps(<8 x float> %a, <8 x i32> <i32 6, i32 7, i32 4, i32 5, i32 0, i32 1, i32 2, i32 3>)
  ret <8 x float> %1
}

; The pshufb mask zeroes the low 10 bytes of each 128-bit lane and moves source
; bytes 0-5 up to bytes 10-15, i.e. a per-lane byte shift left by 10. The
; recorded expected assembly for this pattern is still a vpshufb with the same
; zero/select layout rather than a dedicated byte-shift instruction.
define <32 x i8> @combine_pshufb_as_pslldq(<32 x i8> %a0) {
; CHECK-LABEL: combine_pshufb_as_pslldq:
; CHECK:       # BB#0:
; CHECK-NEXT:    vpshufb {{.*#+}} ymm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm0[0,1,2,3,4,5],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm0[16,17,18,19,20,21]
; CHECK-NEXT:    retq
  %res0 = call <32 x i8> @llvm.x86.avx2.pshuf.b(<32 x i8> %a0, <32 x i8> <i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5>)
  ret <32 x i8> %res0
}

; The pshufb mask moves byte 15 of each 128-bit lane down to byte 0 and zeroes
; the other 15 bytes, i.e. a per-lane byte shift right by 15. The recorded
; expected assembly for this pattern is still a vpshufb with the same
; zero/select layout rather than a dedicated byte-shift instruction.
define <32 x i8> @combine_pshufb_as_psrldq(<32 x i8> %a0) {
; CHECK-LABEL: combine_pshufb_as_psrldq:
; CHECK:       # BB#0:
; CHECK-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm0[31],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; CHECK-NEXT:    retq
  %res0 = call <32 x i8> @llvm.x86.avx2.pshuf.b(<32 x i8> %a0, <32 x i8> <i8 15, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 15, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128>)
  ret <32 x i8> %res0
}

; The pshufb mask swaps adjacent 16-bit words within the low four words of each
; 128-bit lane (byte pairs 2,3 <-> 0,1 and 6,7 <-> 4,5) and leaves bytes 8-15
; in place. That is a low-word shuffle, so the expected assembly is vpshuflw.
define <32 x i8> @combine_pshufb_as_pshuflw(<32 x i8> %a0) {
; CHECK-LABEL: combine_pshufb_as_pshuflw:
; CHECK:       # BB#0:
; CHECK-NEXT:    vpshuflw {{.*#+}} ymm0 = ymm0[1,0,3,2,4,5,6,7,9,8,11,10,12,13,14,15]
; CHECK-NEXT:    retq
  %res0 = call <32 x i8> @llvm.x86.avx2.pshuf.b(<32 x i8> %a0, <32 x i8> <i8 2, i8 3, i8 0, i8 1, i8 6, i8 7, i8 4, i8 5, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 2, i8 3, i8 0, i8 1, i8 6, i8 7, i8 4, i8 5, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15>)
  ret <32 x i8> %res0
}

; The pshufb mask leaves bytes 0-7 of each 128-bit lane in place and swaps
; adjacent 16-bit words within the high four words (byte pairs 10,11 <-> 8,9
; and 14,15 <-> 12,13). That is a high-word shuffle, so the expected assembly
; is vpshufhw.
define <32 x i8> @combine_pshufb_as_pshufhw(<32 x i8> %a0) {
; CHECK-LABEL: combine_pshufb_as_pshufhw:
; CHECK:       # BB#0:
; CHECK-NEXT:    vpshufhw {{.*#+}} ymm0 = ymm0[0,1,2,3,5,4,7,6,8,9,10,11,13,12,15,14]
; CHECK-NEXT:    retq
  %res0 = call <32 x i8> @llvm.x86.avx2.pshuf.b(<32 x i8> %a0, <32 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 10, i8 11, i8 8, i8 9, i8 14, i8 15, i8 12, i8 13, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 10, i8 11, i8 8, i8 9, i8 14, i8 15, i8 12, i8 13>)
  ret <32 x i8> %res0
}

; Two chained pshufbs: the first swaps adjacent words in the low half of each
; 128-bit lane, the second swaps adjacent words in the high half. The combined
; shuffle changes both halves of every lane, so it matches neither a low-word
; nor a high-word shuffle alone; the expected assembly is one merged vpshufb.
define <32 x i8> @combine_pshufb_not_as_pshufw(<32 x i8> %a0) {
; CHECK-LABEL: combine_pshufb_not_as_pshufw:
; CHECK:       # BB#0:
; CHECK-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[2,3,0,1,6,7,4,5,10,11,8,9,14,15,12,13,18,19,16,17,22,23,20,21,26,27,24,25,30,31,28,29]
; CHECK-NEXT:    retq
  %res0 = call <32 x i8> @llvm.x86.avx2.pshuf.b(<32 x i8> %a0, <32 x i8> <i8 2, i8 3, i8 0, i8 1, i8 6, i8 7, i8 4, i8 5, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 2, i8 3, i8 0, i8 1, i8 6, i8 7, i8 4, i8 5, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15>)
  %res1 = call <32 x i8> @llvm.x86.avx2.pshuf.b(<32 x i8> %res0, <32 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 10, i8 11, i8 8, i8 9, i8 14, i8 15, i8 12, i8 13, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 10, i8 11, i8 8, i8 9, i8 14, i8 15, i8 12, i8 13>)
  ret <32 x i8> %res1
}
