1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64 | FileCheck %s --check-prefix=ALL --check-prefix=SSE --check-prefix=SSE2
3; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -mattr=+ssse3 | FileCheck %s --check-prefix=ALL --check-prefix=SSE --check-prefix=SSSE3
4; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -mattr=+sse4.1 | FileCheck %s --check-prefix=ALL --check-prefix=SSE --check-prefix=SSE41
5; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -mattr=+avx | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX1
6; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -mattr=+avx2 | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX2
7
8target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
9target triple = "x86_64-unknown-unknown"
10
11define <16 x i8> @shuffle_v16i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00(<16 x i8> %a, <16 x i8> %b) {
; Byte splat: the mask is all zeros, so every result lane is byte 0 of %a and
; %b is never selected. The expectations below differ per feature level:
; plain SSE2 builds the splat from an unpack plus word/dword shuffles, the
; SSSE3 / SSE4.1 / AVX1 runs use a byte shuffle whose control register has been
; zeroed, and the AVX2 run expects a single byte broadcast.
12; SSE2-LABEL: shuffle_v16i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00:
13; SSE2:       # BB#0:
14; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
15; SSE2-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
16; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,0,1,1]
17; SSE2-NEXT:    retq
18;
19; SSSE3-LABEL: shuffle_v16i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00:
20; SSSE3:       # BB#0:
21; SSSE3-NEXT:    pxor %xmm1, %xmm1
22; SSSE3-NEXT:    pshufb %xmm1, %xmm0
23; SSSE3-NEXT:    retq
24;
25; SSE41-LABEL: shuffle_v16i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00:
26; SSE41:       # BB#0:
27; SSE41-NEXT:    pxor %xmm1, %xmm1
28; SSE41-NEXT:    pshufb %xmm1, %xmm0
29; SSE41-NEXT:    retq
30;
31; AVX1-LABEL: shuffle_v16i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00:
32; AVX1:       # BB#0:
33; AVX1-NEXT:    vpxor %xmm1, %xmm1, %xmm1
34; AVX1-NEXT:    vpshufb %xmm1, %xmm0, %xmm0
35; AVX1-NEXT:    retq
36;
37; AVX2-LABEL: shuffle_v16i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00:
38; AVX2:       # BB#0:
39; AVX2-NEXT:    vpbroadcastb %xmm0, %xmm0
40; AVX2-NEXT:    retq
41  %shuffle = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
42  ret <16 x i8> %shuffle
43}
44
45define <16 x i8> @shuffle_v16i8_00_00_00_00_00_00_00_00_01_01_01_01_01_01_01_01(<16 x i8> %a, <16 x i8> %b) {
; Result lanes 0-7 are byte 0 of %a and lanes 8-15 are byte 1 of %a; %b is
; never selected. Plain SSE2 is expected to use an unpack followed by
; word/dword shuffles, while every run with a byte shuffle available expects a
; single (v)pshufb. AVX1 and AVX2 share one set of expectations here.
46; SSE2-LABEL: shuffle_v16i8_00_00_00_00_00_00_00_00_01_01_01_01_01_01_01_01:
47; SSE2:       # BB#0:
48; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
49; SSE2-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[0,0,1,1,4,5,6,7]
50; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,0,1,1]
51; SSE2-NEXT:    retq
52;
53; SSSE3-LABEL: shuffle_v16i8_00_00_00_00_00_00_00_00_01_01_01_01_01_01_01_01:
54; SSSE3:       # BB#0:
55; SSSE3-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,0,0,0,0,1,1,1,1,1,1,1,1]
56; SSSE3-NEXT:    retq
57;
58; SSE41-LABEL: shuffle_v16i8_00_00_00_00_00_00_00_00_01_01_01_01_01_01_01_01:
59; SSE41:       # BB#0:
60; SSE41-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,0,0,0,0,1,1,1,1,1,1,1,1]
61; SSE41-NEXT:    retq
62;
63; AVX-LABEL: shuffle_v16i8_00_00_00_00_00_00_00_00_01_01_01_01_01_01_01_01:
64; AVX:       # BB#0:
65; AVX-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,0,0,0,0,1,1,1,1,1,1,1,1]
66; AVX-NEXT:    retq
67  %shuffle = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
68  ret <16 x i8> %shuffle
69}
70
71define <16 x i8> @shuffle_v16i8_00_00_00_00_00_00_00_00_08_08_08_08_08_08_08_08(<16 x i8> %a, <16 x i8> %b) {
; Result lanes 0-7 are byte 0 of %a and lanes 8-15 are byte 8 of %a (the first
; byte of each 64-bit half, splatted across that half); %b is never selected.
; Plain SSE2 needs a five-instruction shuffle/unpack sequence, whereas the
; other runs expect a single (v)pshufb.
72; SSE2-LABEL: shuffle_v16i8_00_00_00_00_00_00_00_00_08_08_08_08_08_08_08_08:
73; SSE2:       # BB#0:
74; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
75; SSE2-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
76; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
77; SSE2-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[0,0,2,2,4,5,6,7]
78; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,0,1,1]
79; SSE2-NEXT:    retq
80;
81; SSSE3-LABEL: shuffle_v16i8_00_00_00_00_00_00_00_00_08_08_08_08_08_08_08_08:
82; SSSE3:       # BB#0:
83; SSSE3-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,0,0,0,0,8,8,8,8,8,8,8,8]
84; SSSE3-NEXT:    retq
85;
86; SSE41-LABEL: shuffle_v16i8_00_00_00_00_00_00_00_00_08_08_08_08_08_08_08_08:
87; SSE41:       # BB#0:
88; SSE41-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,0,0,0,0,8,8,8,8,8,8,8,8]
89; SSE41-NEXT:    retq
90;
91; AVX-LABEL: shuffle_v16i8_00_00_00_00_00_00_00_00_08_08_08_08_08_08_08_08:
92; AVX:       # BB#0:
93; AVX-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,0,0,0,0,8,8,8,8,8,8,8,8]
94; AVX-NEXT:    retq
95  %shuffle = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8>
96  ret <16 x i8> %shuffle
97}
98
99define <16 x i8> @shuffle_v16i8_00_00_00_00_01_01_01_01_02_02_02_02_03_03_03_03(<16 x i8> %a, <16 x i8> %b) {
; Each of bytes 0..3 of %a is repeated four times in order; %b is never
; selected. Every run expects the same two-step lowering: unpack-low on bytes
; (duplicating each byte) followed by unpack-low on words (duplicating each
; pair), with the AVX runs using the v-prefixed forms.
100; SSE-LABEL: shuffle_v16i8_00_00_00_00_01_01_01_01_02_02_02_02_03_03_03_03:
101; SSE:       # BB#0:
102; SSE-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
103; SSE-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
104; SSE-NEXT:    retq
105;
106; AVX-LABEL: shuffle_v16i8_00_00_00_00_01_01_01_01_02_02_02_02_03_03_03_03:
107; AVX:       # BB#0:
108; AVX-NEXT:    vpunpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
109; AVX-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
110; AVX-NEXT:    retq
111  %shuffle = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3>
112  ret <16 x i8> %shuffle
113}
114
115define <16 x i8> @shuffle_v16i8_04_04_04_04_05_05_05_05_06_06_06_06_07_07_07_07(<16 x i8> %a, <16 x i8> %b) {
; Each of bytes 4..7 of %a is repeated four times in order; %b is never
; selected. Every run expects unpack-low on bytes followed by unpack-high on
; words, which picks the duplicated bytes 4..7 out of the first unpack result.
116; SSE-LABEL: shuffle_v16i8_04_04_04_04_05_05_05_05_06_06_06_06_07_07_07_07:
117; SSE:       # BB#0:
118; SSE-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
119; SSE-NEXT:    punpckhwd {{.*#+}} xmm0 = xmm0[4,4,5,5,6,6,7,7]
120; SSE-NEXT:    retq
121;
122; AVX-LABEL: shuffle_v16i8_04_04_04_04_05_05_05_05_06_06_06_06_07_07_07_07:
123; AVX:       # BB#0:
124; AVX-NEXT:    vpunpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
125; AVX-NEXT:    vpunpckhwd {{.*#+}} xmm0 = xmm0[4,4,5,5,6,6,7,7]
126; AVX-NEXT:    retq
127  %shuffle = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7>
128  ret <16 x i8> %shuffle
129}
130
131define <16 x i8> @shuffle_v16i8_00_00_00_00_04_04_04_04_08_08_08_08_12_12_12_12(<16 x i8> %a, <16 x i8> %b) {
; Bytes 0, 4, 8 and 12 of %a (the first byte of each 4-byte group) are each
; repeated four times; %b is never selected. Plain SSE2 is expected to need a
; six-instruction word/dword shuffle plus unpack sequence, while the other
; runs expect a single (v)pshufb.
132; SSE2-LABEL: shuffle_v16i8_00_00_00_00_04_04_04_04_08_08_08_08_12_12_12_12:
133; SSE2:       # BB#0:
134; SSE2-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
135; SSE2-NEXT:    pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7]
136; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
137; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
138; SSE2-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[0,0,2,2,4,5,6,7]
139; SSE2-NEXT:    pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,6,6]
140; SSE2-NEXT:    retq
141;
142; SSSE3-LABEL: shuffle_v16i8_00_00_00_00_04_04_04_04_08_08_08_08_12_12_12_12:
143; SSSE3:       # BB#0:
144; SSSE3-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,4,4,4,4,8,8,8,8,12,12,12,12]
145; SSSE3-NEXT:    retq
146;
147; SSE41-LABEL: shuffle_v16i8_00_00_00_00_04_04_04_04_08_08_08_08_12_12_12_12:
148; SSE41:       # BB#0:
149; SSE41-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,4,4,4,4,8,8,8,8,12,12,12,12]
150; SSE41-NEXT:    retq
151;
152; AVX-LABEL: shuffle_v16i8_00_00_00_00_04_04_04_04_08_08_08_08_12_12_12_12:
153; AVX:       # BB#0:
154; AVX-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,4,4,4,4,8,8,8,8,12,12,12,12]
155; AVX-NEXT:    retq
156  %shuffle = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 0, i32 0, i32 0, i32 0, i32 4, i32 4, i32 4, i32 4, i32 8, i32 8, i32 8, i32 8, i32 12, i32 12, i32 12, i32 12>
157  ret <16 x i8> %shuffle
158}
159
160define <16 x i8> @shuffle_v16i8_00_00_01_01_02_02_03_03_04_04_05_05_06_06_07_07(<16 x i8> %a, <16 x i8> %b) {
; Each of bytes 0..7 of %a is duplicated into two adjacent lanes; %b is never
; selected. This is exactly an unpack-low of %a with itself, so every run
; expects a single (v)punpcklbw.
161; SSE-LABEL: shuffle_v16i8_00_00_01_01_02_02_03_03_04_04_05_05_06_06_07_07:
162; SSE:       # BB#0:
163; SSE-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
164; SSE-NEXT:    retq
165;
166; AVX-LABEL: shuffle_v16i8_00_00_01_01_02_02_03_03_04_04_05_05_06_06_07_07:
167; AVX:       # BB#0:
168; AVX-NEXT:    vpunpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
169; AVX-NEXT:    retq
170  %shuffle = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 0, i32 0, i32 1, i32 1, i32 2, i32 2, i32 3, i32 3, i32 4, i32 4, i32 5, i32 5, i32 6, i32 6, i32 7, i32 7>
171  ret <16 x i8> %shuffle
172}
173
174define <16 x i8> @shuffle_v16i8_0101010101010101(<16 x i8> %a, <16 x i8> %b) {
; The byte pair (0, 1) of %a is repeated eight times, i.e. the low 16-bit word
; of %a is splatted; %b is never selected. The SSE and AVX1 runs expect a word
; shuffle followed by a dword shuffle, and the AVX2 run expects a single word
; broadcast. Note the function name omits the underscore separators used by
; the neighbouring tests.
175; SSE-LABEL: shuffle_v16i8_0101010101010101:
176; SSE:       # BB#0:
177; SSE-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
178; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,0,1,1]
179; SSE-NEXT:    retq
180;
181; AVX1-LABEL: shuffle_v16i8_0101010101010101:
182; AVX1:       # BB#0:
183; AVX1-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
184; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,0,1,1]
185; AVX1-NEXT:    retq
186;
187; AVX2-LABEL: shuffle_v16i8_0101010101010101:
188; AVX2:       # BB#0:
189; AVX2-NEXT:    vpbroadcastw %xmm0, %xmm0
190; AVX2-NEXT:    retq
191  %shuffle = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1>
192  ret <16 x i8> %shuffle
193}
194
195define <16 x i8> @shuffle_v16i8_00_16_01_17_02_18_03_19_04_20_05_21_06_22_07_23(<16 x i8> %a, <16 x i8> %b) {
; Two-input interleave of the low halves: mask indices 0..7 pick bytes 0..7 of
; %a and indices 16..23 pick bytes 0..7 of %b, alternating a, b, a, b, ...
; Every run expects a single (v)punpcklbw of the two inputs.
196; SSE-LABEL: shuffle_v16i8_00_16_01_17_02_18_03_19_04_20_05_21_06_22_07_23:
197; SSE:       # BB#0:
198; SSE-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
199; SSE-NEXT:    retq
200;
201; AVX-LABEL: shuffle_v16i8_00_16_01_17_02_18_03_19_04_20_05_21_06_22_07_23:
202; AVX:       # BB#0:
203; AVX-NEXT:    vpunpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
204; AVX-NEXT:    retq
205  %shuffle = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 0, i32 16, i32 1, i32 17, i32 2, i32 18, i32 3, i32 19, i32 4, i32 20, i32 5, i32 21, i32 6, i32 22, i32 7, i32 23>
206  ret <16 x i8> %shuffle
207}
208
209define <16 x i8> @shuffle_v16i8_08_24_09_25_10_26_11_27_12_28_13_29_14_30_15_31(<16 x i8> %a, <16 x i8> %b) {
; Two-input interleave of the high halves: mask indices 8..15 pick bytes 8..15
; of %a and indices 24..31 pick bytes 8..15 of %b, alternating a, b, a, b, ...
; Every run expects a single (v)punpckhbw of the two inputs.
210; SSE-LABEL: shuffle_v16i8_08_24_09_25_10_26_11_27_12_28_13_29_14_30_15_31:
211; SSE:       # BB#0:
212; SSE-NEXT:    punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15]
213; SSE-NEXT:    retq
214;
215; AVX-LABEL: shuffle_v16i8_08_24_09_25_10_26_11_27_12_28_13_29_14_30_15_31:
216; AVX:       # BB#0:
217; AVX-NEXT:    vpunpckhbw {{.*#+}} xmm0 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15]
218; AVX-NEXT:    retq
219  %shuffle = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 8, i32 24, i32 9, i32 25, i32 10, i32 26, i32 11, i32 27, i32 12, i32 28, i32 13, i32 29, i32 14, i32 30, i32 15, i32 31>
220  ret <16 x i8> %shuffle
221}
222
223define <16 x i8> @shuffle_v16i8_16_00_16_01_16_02_16_03_16_04_16_05_16_06_16_07(<16 x i8> %a, <16 x i8> %b) {
; Even result lanes all take byte 0 of %b (mask index 16) and odd result lanes
; take bytes 0..7 of %a in order. Plain SSE2 is expected to splat and widen
; the inputs separately and then merge them with a constant and/andn/or mask.
; The SSSE3, SSE4.1 and AVX1 runs splat byte 0 of %b with an unpack plus word
; shuffle and then interleave it with %a. The AVX2 run replaces that splat
; with a byte broadcast before the interleave.
224; SSE2-LABEL: shuffle_v16i8_16_00_16_01_16_02_16_03_16_04_16_05_16_06_16_07:
225; SSE2:       # BB#0:
226; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
227; SSE2-NEXT:    movdqa {{.*#+}} xmm2 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0]
228; SSE2-NEXT:    pshuflw {{.*#+}} xmm1 = xmm1[0,0,0,0,4,5,6,7]
229; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,0,1,1]
230; SSE2-NEXT:    pand %xmm2, %xmm1
231; SSE2-NEXT:    pandn %xmm0, %xmm2
232; SSE2-NEXT:    por %xmm1, %xmm2
233; SSE2-NEXT:    movdqa %xmm2, %xmm0
234; SSE2-NEXT:    retq
235;
236; SSSE3-LABEL: shuffle_v16i8_16_00_16_01_16_02_16_03_16_04_16_05_16_06_16_07:
237; SSSE3:       # BB#0:
238; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
239; SSSE3-NEXT:    pshuflw {{.*#+}} xmm1 = xmm1[0,0,0,0,4,5,6,7]
240; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
241; SSSE3-NEXT:    movdqa %xmm1, %xmm0
242; SSSE3-NEXT:    retq
243;
244; SSE41-LABEL: shuffle_v16i8_16_00_16_01_16_02_16_03_16_04_16_05_16_06_16_07:
245; SSE41:       # BB#0:
246; SSE41-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
247; SSE41-NEXT:    pshuflw {{.*#+}} xmm1 = xmm1[0,0,0,0,4,5,6,7]
248; SSE41-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
249; SSE41-NEXT:    movdqa %xmm1, %xmm0
250; SSE41-NEXT:    retq
251;
252; AVX1-LABEL: shuffle_v16i8_16_00_16_01_16_02_16_03_16_04_16_05_16_06_16_07:
253; AVX1:       # BB#0:
254; AVX1-NEXT:    vpunpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
255; AVX1-NEXT:    vpshuflw {{.*#+}} xmm1 = xmm1[0,0,0,0,4,5,6,7]
256; AVX1-NEXT:    vpunpcklbw {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
257; AVX1-NEXT:    retq
258;
259; AVX2-LABEL: shuffle_v16i8_16_00_16_01_16_02_16_03_16_04_16_05_16_06_16_07:
260; AVX2:       # BB#0:
261; AVX2-NEXT:    vpbroadcastb %xmm1, %xmm1
262; AVX2-NEXT:    vpunpcklbw {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
263; AVX2-NEXT:    retq
264  %shuffle = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 16, i32 0, i32 16, i32 1, i32 16, i32 2, i32 16, i32 3, i32 16, i32 4, i32 16, i32 5, i32 16, i32 6, i32 16, i32 7>
265  ret <16 x i8> %shuffle
266}
267
268define <16 x i8> @shuffle_v16i8_03_02_01_00_07_06_05_04_11_10_09_08_15_14_13_12(<16 x i8> %a, <16 x i8> %b) {
; Reverses the byte order inside each 4-byte group of %a (a per-dword byte
; swap); %b is never selected. Plain SSE2 is expected to widen both halves to
; words against a zeroed register, reverse the words with pshuflw/pshufhw and
; pack the result back to bytes. The other runs expect a single (v)pshufb.
269; SSE2-LABEL: shuffle_v16i8_03_02_01_00_07_06_05_04_11_10_09_08_15_14_13_12:
270; SSE2:       # BB#0:
271; SSE2-NEXT:    pxor %xmm1, %xmm1
272; SSE2-NEXT:    movdqa %xmm0, %xmm2
273; SSE2-NEXT:    punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15]
274; SSE2-NEXT:    pshuflw {{.*#+}} xmm2 = xmm2[3,2,1,0,4,5,6,7]
275; SSE2-NEXT:    pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,7,6,5,4]
276; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
277; SSE2-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[3,2,1,0,4,5,6,7]
278; SSE2-NEXT:    pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,6,5,4]
279; SSE2-NEXT:    packuswb %xmm2, %xmm0
280; SSE2-NEXT:    retq
281;
282; SSSE3-LABEL: shuffle_v16i8_03_02_01_00_07_06_05_04_11_10_09_08_15_14_13_12:
283; SSSE3:       # BB#0:
284; SSSE3-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12]
285; SSSE3-NEXT:    retq
286;
287; SSE41-LABEL: shuffle_v16i8_03_02_01_00_07_06_05_04_11_10_09_08_15_14_13_12:
288; SSE41:       # BB#0:
289; SSE41-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12]
290; SSE41-NEXT:    retq
291;
292; AVX-LABEL: shuffle_v16i8_03_02_01_00_07_06_05_04_11_10_09_08_15_14_13_12:
293; AVX:       # BB#0:
294; AVX-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12]
295; AVX-NEXT:    retq
296  %shuffle = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4, i32 11, i32 10, i32 9, i32 8, i32 15, i32 14, i32 13, i32 12>
297  ret <16 x i8> %shuffle
298}
299
300define <16 x i8> @shuffle_v16i8_03_02_01_00_07_06_05_04_19_18_17_16_23_22_21_20(<16 x i8> %a, <16 x i8> %b) {
; Result lanes 0-7 are bytes 0..7 of %a with each 4-byte group reversed, and
; lanes 8-15 are bytes 0..7 of %b (mask indices 16..23) reversed the same way.
; Plain SSE2 is expected to widen each input's low half to words, reverse the
; words and pack. The other runs first interleave the two low halves with an
; unpack and then untangle and reverse them with one (v)pshufb.
301; SSE2-LABEL: shuffle_v16i8_03_02_01_00_07_06_05_04_19_18_17_16_23_22_21_20:
302; SSE2:       # BB#0:
303; SSE2-NEXT:    pxor %xmm2, %xmm2
304; SSE2-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
305; SSE2-NEXT:    pshuflw {{.*#+}} xmm1 = xmm1[3,2,1,0,4,5,6,7]
306; SSE2-NEXT:    pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,7,6,5,4]
307; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
308; SSE2-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[3,2,1,0,4,5,6,7]
309; SSE2-NEXT:    pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,6,5,4]
310; SSE2-NEXT:    packuswb %xmm1, %xmm0
311; SSE2-NEXT:    retq
312;
313; SSSE3-LABEL: shuffle_v16i8_03_02_01_00_07_06_05_04_19_18_17_16_23_22_21_20:
314; SSSE3:       # BB#0:
315; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
316; SSSE3-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[6,4,2,0,14,12,10,8,7,5,3,1,15,13,11,9]
317; SSSE3-NEXT:    retq
318;
319; SSE41-LABEL: shuffle_v16i8_03_02_01_00_07_06_05_04_19_18_17_16_23_22_21_20:
320; SSE41:       # BB#0:
321; SSE41-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
322; SSE41-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[6,4,2,0,14,12,10,8,7,5,3,1,15,13,11,9]
323; SSE41-NEXT:    retq
324;
325; AVX-LABEL: shuffle_v16i8_03_02_01_00_07_06_05_04_19_18_17_16_23_22_21_20:
326; AVX:       # BB#0:
327; AVX-NEXT:    vpunpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
328; AVX-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[6,4,2,0,14,12,10,8,7,5,3,1,15,13,11,9]
329; AVX-NEXT:    retq
330  %shuffle = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4, i32 19, i32 18, i32 17, i32 16, i32 23, i32 22, i32 21, i32 20>
331  ret <16 x i8> %shuffle
332}
333
334define <16 x i8> @shuffle_v16i8_03_02_01_00_31_30_29_28_11_10_09_08_23_22_21_20(<16 x i8> %a, <16 x i8> %b) {
; Four byte-reversed 4-byte groups drawn alternately from the two inputs, in
; result order: bytes 0..3 of %a, bytes 12..15 of %b (indices 28..31),
; bytes 8..11 of %a, bytes 4..7 of %b (indices 20..23). Plain SSE2 is expected
; to gather the dwords first and then perform the byte reversal through a
; widen / word-shuffle / pack sequence. The other runs reverse and gather each
; input with one (v)pshufb and then interleave the dwords with an unpack.
335; SSE2-LABEL: shuffle_v16i8_03_02_01_00_31_30_29_28_11_10_09_08_23_22_21_20:
336; SSE2:       # BB#0:
337; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,3,2,3]
338; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
339; SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
340; SSE2-NEXT:    pxor %xmm1, %xmm1
341; SSE2-NEXT:    movdqa %xmm0, %xmm2
342; SSE2-NEXT:    punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15]
343; SSE2-NEXT:    pshuflw {{.*#+}} xmm3 = xmm2[3,2,1,0,4,5,6,7]
344; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
345; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
346; SSE2-NEXT:    pshuflw {{.*#+}} xmm1 = xmm1[3,2,1,0,4,5,6,7]
347; SSE2-NEXT:    punpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm1[0]
348; SSE2-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[3,2,1,0,4,5,6,7]
349; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm2[2,3,2,3]
350; SSE2-NEXT:    pshuflw {{.*#+}} xmm1 = xmm1[3,2,1,0,4,5,6,7]
351; SSE2-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
352; SSE2-NEXT:    packuswb %xmm3, %xmm0
353; SSE2-NEXT:    retq
354;
355; SSSE3-LABEL: shuffle_v16i8_03_02_01_00_31_30_29_28_11_10_09_08_23_22_21_20:
356; SSSE3:       # BB#0:
357; SSSE3-NEXT:    pshufb {{.*#+}} xmm1 = xmm1[15,14,13,12,7,6,5,4,u,u,u,u,u,u,u,u]
358; SSSE3-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[3,2,1,0,11,10,9,8,u,u,u,u,u,u,u,u]
359; SSSE3-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
360; SSSE3-NEXT:    retq
361;
362; SSE41-LABEL: shuffle_v16i8_03_02_01_00_31_30_29_28_11_10_09_08_23_22_21_20:
363; SSE41:       # BB#0:
364; SSE41-NEXT:    pshufb {{.*#+}} xmm1 = xmm1[15,14,13,12,7,6,5,4,u,u,u,u,u,u,u,u]
365; SSE41-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[3,2,1,0,11,10,9,8,u,u,u,u,u,u,u,u]
366; SSE41-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
367; SSE41-NEXT:    retq
368;
369; AVX-LABEL: shuffle_v16i8_03_02_01_00_31_30_29_28_11_10_09_08_23_22_21_20:
370; AVX:       # BB#0:
371; AVX-NEXT:    vpshufb {{.*#+}} xmm1 = xmm1[15,14,13,12,7,6,5,4,u,u,u,u,u,u,u,u]
372; AVX-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[3,2,1,0,11,10,9,8,u,u,u,u,u,u,u,u]
373; AVX-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
374; AVX-NEXT:    retq
375  %shuffle = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 3, i32 2, i32 1, i32 0, i32 31, i32 30, i32 29, i32 28, i32 11, i32 10, i32 9, i32 8, i32 23, i32 22, i32 21, i32 20>
376  ret <16 x i8> %shuffle
377}
378
379define <16 x i8> @shuffle_v16i8_00_17_02_19_04_21_06_23_08_25_10_27_12_29_14_31(<16 x i8> %a, <16 x i8> %b) {
; In-place byte blend: even result lanes keep the byte at the same position in
; %a, odd result lanes take the byte at the same position in %b (mask index
; 16 + lane). Plain SSE2 is expected to use a constant mask with and/andn/or,
; the SSSE3 run compacts each input with pshufb and re-interleaves them, and
; the SSE4.1 and AVX runs use a variable byte blend driven by the same
; constant mask.
380; SSE2-LABEL: shuffle_v16i8_00_17_02_19_04_21_06_23_08_25_10_27_12_29_14_31:
381; SSE2:       # BB#0:
382; SSE2-NEXT:    movaps {{.*#+}} xmm2 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0]
383; SSE2-NEXT:    andps %xmm2, %xmm0
384; SSE2-NEXT:    andnps %xmm1, %xmm2
385; SSE2-NEXT:    orps %xmm2, %xmm0
386; SSE2-NEXT:    retq
387;
388; SSSE3-LABEL: shuffle_v16i8_00_17_02_19_04_21_06_23_08_25_10_27_12_29_14_31:
389; SSSE3:       # BB#0:
390; SSSE3-NEXT:    pshufb {{.*#+}} xmm1 = xmm1[1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u]
391; SSSE3-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u]
392; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
393; SSSE3-NEXT:    retq
394;
395; SSE41-LABEL: shuffle_v16i8_00_17_02_19_04_21_06_23_08_25_10_27_12_29_14_31:
396; SSE41:       # BB#0:
397; SSE41-NEXT:    movdqa %xmm0, %xmm2
398; SSE41-NEXT:    movaps {{.*#+}} xmm0 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0]
399; SSE41-NEXT:    pblendvb %xmm2, %xmm1
400; SSE41-NEXT:    movdqa %xmm1, %xmm0
401; SSE41-NEXT:    retq
402;
403; AVX-LABEL: shuffle_v16i8_00_17_02_19_04_21_06_23_08_25_10_27_12_29_14_31:
404; AVX:       # BB#0:
405; AVX-NEXT:    vmovdqa {{.*#+}} xmm2 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0]
406; AVX-NEXT:    vpblendvb %xmm2, %xmm0, %xmm1, %xmm0
407; AVX-NEXT:    retq
408  %shuffle = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 0, i32 17, i32 2, i32 19, i32 4, i32 21, i32 6, i32 23, i32 8, i32 25, i32 10, i32 27, i32 12, i32 29, i32 14, i32 31>
409  ret <16 x i8> %shuffle
410}
411
412define <16 x i8> @shuffle_v16i8_00_01_02_19_04_05_06_23_08_09_10_27_12_13_14_31(<16 x i8> %a, <16 x i8> %b) {
; In-place byte blend: result lanes 3, 7, 11 and 15 take the byte at the same
; position in %b (mask indices 19, 23, 27, 31); all other lanes keep the byte
; from %a. Plain SSE2 is expected to use a constant mask with and/andn/or, the
; SSSE3 run zeroes the unwanted bytes of each input with pshufb and ORs the
; halves together, and the SSE4.1 and AVX runs use a variable byte blend.
413; SSE2-LABEL: shuffle_v16i8_00_01_02_19_04_05_06_23_08_09_10_27_12_13_14_31:
414; SSE2:       # BB#0:
415; SSE2-NEXT:    movaps {{.*#+}} xmm2 = [255,255,255,0,255,255,255,0,255,255,255,0,255,255,255,0]
416; SSE2-NEXT:    andps %xmm2, %xmm0
417; SSE2-NEXT:    andnps %xmm1, %xmm2
418; SSE2-NEXT:    orps %xmm2, %xmm0
419; SSE2-NEXT:    retq
420;
421; SSSE3-LABEL: shuffle_v16i8_00_01_02_19_04_05_06_23_08_09_10_27_12_13_14_31:
422; SSSE3:       # BB#0:
423; SSSE3-NEXT:    pshufb {{.*#+}} xmm1 = zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[7],zero,zero,zero,xmm1[11],zero,zero,zero,xmm1[15]
424; SSSE3-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[0,1,2],zero,xmm0[4,5,6],zero,xmm0[8,9,10],zero,xmm0[12,13,14],zero
425; SSSE3-NEXT:    por %xmm1, %xmm0
426; SSSE3-NEXT:    retq
427;
428; SSE41-LABEL: shuffle_v16i8_00_01_02_19_04_05_06_23_08_09_10_27_12_13_14_31:
429; SSE41:       # BB#0:
430; SSE41-NEXT:    movdqa %xmm0, %xmm2
431; SSE41-NEXT:    movaps {{.*#+}} xmm0 = [255,255,255,0,255,255,255,0,255,255,255,0,255,255,255,0]
432; SSE41-NEXT:    pblendvb %xmm2, %xmm1
433; SSE41-NEXT:    movdqa %xmm1, %xmm0
434; SSE41-NEXT:    retq
435;
436; AVX-LABEL: shuffle_v16i8_00_01_02_19_04_05_06_23_08_09_10_27_12_13_14_31:
437; AVX:       # BB#0:
438; AVX-NEXT:    vmovdqa {{.*#+}} xmm2 = [255,255,255,0,255,255,255,0,255,255,255,0,255,255,255,0]
439; AVX-NEXT:    vpblendvb %xmm2, %xmm0, %xmm1, %xmm0
440; AVX-NEXT:    retq
441  %shuffle = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 0, i32 1, i32 2, i32 19, i32 4, i32 5, i32 6, i32 23, i32 8, i32 9, i32 10, i32 27, i32 12, i32 13, i32 14, i32 31>
442  ret <16 x i8> %shuffle
443}
444
445define <16 x i8> @shuffle_v16i8_00_01_02_zz_04_05_06_zz_08_09_10_zz_12_13_14_zz(<16 x i8> %a) {
; Blend against zero: the second shuffle operand is zeroinitializer, so the
; lanes whose mask index is >= 16 (lanes 3, 7, 11 and 15, the "zz" positions in
; the name) become zero while all other lanes keep the byte from %a. Every run
; expects this to fold into a single AND with a constant loaded RIP-relative.
446; SSE-LABEL: shuffle_v16i8_00_01_02_zz_04_05_06_zz_08_09_10_zz_12_13_14_zz:
447; SSE:       # BB#0:
448; SSE-NEXT:    andps {{.*}}(%rip), %xmm0
449; SSE-NEXT:    retq
450;
451; AVX-LABEL: shuffle_v16i8_00_01_02_zz_04_05_06_zz_08_09_10_zz_12_13_14_zz:
452; AVX:       # BB#0:
453; AVX-NEXT:    vandps {{.*}}(%rip), %xmm0, %xmm0
454; AVX-NEXT:    retq
455  %shuffle = shufflevector <16 x i8> %a, <16 x i8> zeroinitializer, <16 x i32> <i32 0, i32 1, i32 2, i32 19, i32 4, i32 5, i32 6, i32 23, i32 8, i32 9, i32 10, i32 27, i32 12, i32 13, i32 14, i32 31>
456  ret <16 x i8> %shuffle
457}
458
459define <16 x i8> @shuffle_v16i8_00_01_02_03_20_05_06_23_08_09_10_11_28_13_14_31(<16 x i8> %a, <16 x i8> %b) {
; In-place byte blend with an irregular pattern: result lanes 4, 7, 12 and 15
; take the byte at the same position in %b (mask indices 20, 23, 28, 31); all
; other lanes keep the byte from %a. Expected lowerings mirror the other blend
; tests: and/andn/or with a constant mask on plain SSE2, two zeroing pshufb
; plus por on SSSE3, and a variable byte blend on SSE4.1 and AVX.
460; SSE2-LABEL: shuffle_v16i8_00_01_02_03_20_05_06_23_08_09_10_11_28_13_14_31:
461; SSE2:       # BB#0:
462; SSE2-NEXT:    movaps {{.*#+}} xmm2 = [255,255,255,255,0,255,255,0,255,255,255,255,0,255,255,0]
463; SSE2-NEXT:    andps %xmm2, %xmm0
464; SSE2-NEXT:    andnps %xmm1, %xmm2
465; SSE2-NEXT:    orps %xmm2, %xmm0
466; SSE2-NEXT:    retq
467;
468; SSSE3-LABEL: shuffle_v16i8_00_01_02_03_20_05_06_23_08_09_10_11_28_13_14_31:
469; SSSE3:       # BB#0:
470; SSSE3-NEXT:    pshufb {{.*#+}} xmm1 = zero,zero,zero,zero,xmm1[4],zero,zero,xmm1[7],zero,zero,zero,zero,xmm1[12],zero,zero,xmm1[15]
471; SSSE3-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[0,1,2,3],zero,xmm0[5,6],zero,xmm0[8,9,10,11],zero,xmm0[13,14],zero
472; SSSE3-NEXT:    por %xmm1, %xmm0
473; SSSE3-NEXT:    retq
474;
475; SSE41-LABEL: shuffle_v16i8_00_01_02_03_20_05_06_23_08_09_10_11_28_13_14_31:
476; SSE41:       # BB#0:
477; SSE41-NEXT:    movdqa %xmm0, %xmm2
478; SSE41-NEXT:    movaps {{.*#+}} xmm0 = [255,255,255,255,0,255,255,0,255,255,255,255,0,255,255,0]
479; SSE41-NEXT:    pblendvb %xmm2, %xmm1
480; SSE41-NEXT:    movdqa %xmm1, %xmm0
481; SSE41-NEXT:    retq
482;
483; AVX-LABEL: shuffle_v16i8_00_01_02_03_20_05_06_23_08_09_10_11_28_13_14_31:
484; AVX:       # BB#0:
485; AVX-NEXT:    vmovdqa {{.*#+}} xmm2 = [255,255,255,255,0,255,255,0,255,255,255,255,0,255,255,0]
486; AVX-NEXT:    vpblendvb %xmm2, %xmm0, %xmm1, %xmm0
487; AVX-NEXT:    retq
488  %shuffle = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 20, i32 5, i32 6, i32 23, i32 8, i32 9, i32 10, i32 11, i32 28, i32 13, i32 14, i32 31>
489  ret <16 x i8> %shuffle
490}
491
492define <16 x i8> @shuffle_v16i8_16_17_18_19_04_05_06_07_24_25_10_11_28_13_30_15(<16 x i8> %a, <16 x i8> %b) {
; In-place byte blend with mixed granularity: result lanes 0-3, 8, 9, 12 and
; 14 take the byte at the same position in %b (mask indices 16..19, 24, 25,
; 28, 30); lanes 4-7, 10, 11, 13 and 15 keep the byte from %a. Compared with
; the neighbouring blend tests the roles of the inputs are swapped in the
; expected code: the constant mask's 255 entries mark the lanes coming from %b.
493; SSE2-LABEL: shuffle_v16i8_16_17_18_19_04_05_06_07_24_25_10_11_28_13_30_15:
494; SSE2:       # BB#0:
495; SSE2-NEXT:    movaps {{.*#+}} xmm2 = [255,255,255,255,0,0,0,0,255,255,0,0,255,0,255,0]
496; SSE2-NEXT:    andps %xmm2, %xmm1
497; SSE2-NEXT:    andnps %xmm0, %xmm2
498; SSE2-NEXT:    orps %xmm1, %xmm2
499; SSE2-NEXT:    movaps %xmm2, %xmm0
500; SSE2-NEXT:    retq
501;
502; SSSE3-LABEL: shuffle_v16i8_16_17_18_19_04_05_06_07_24_25_10_11_28_13_30_15:
503; SSSE3:       # BB#0:
504; SSSE3-NEXT:    pshufb {{.*#+}} xmm0 = zero,zero,zero,zero,xmm0[4,5,6,7],zero,zero,xmm0[10,11],zero,xmm0[13],zero,xmm0[15]
505; SSSE3-NEXT:    pshufb {{.*#+}} xmm1 = xmm1[0,1,2,3],zero,zero,zero,zero,xmm1[8,9],zero,zero,xmm1[12],zero,xmm1[14],zero
506; SSSE3-NEXT:    por %xmm1, %xmm0
507; SSSE3-NEXT:    retq
508;
509; SSE41-LABEL: shuffle_v16i8_16_17_18_19_04_05_06_07_24_25_10_11_28_13_30_15:
510; SSE41:       # BB#0:
511; SSE41-NEXT:    movdqa %xmm0, %xmm2
512; SSE41-NEXT:    movaps {{.*#+}} xmm0 = [255,255,255,255,0,0,0,0,255,255,0,0,255,0,255,0]
513; SSE41-NEXT:    pblendvb %xmm1, %xmm2
514; SSE41-NEXT:    movdqa %xmm2, %xmm0
515; SSE41-NEXT:    retq
516;
517; AVX-LABEL: shuffle_v16i8_16_17_18_19_04_05_06_07_24_25_10_11_28_13_30_15:
518; AVX:       # BB#0:
519; AVX-NEXT:    vmovdqa {{.*#+}} xmm2 = [255,255,255,255,0,0,0,0,255,255,0,0,255,0,255,0]
520; AVX-NEXT:    vpblendvb %xmm2, %xmm1, %xmm0, %xmm0
521; AVX-NEXT:    retq
522  %shuffle = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 16, i32 17, i32 18, i32 19, i32 4, i32 5, i32 6, i32 7, i32 24, i32 25, i32 10, i32 11, i32 28, i32 13, i32 30, i32 15>
523  ret <16 x i8> %shuffle
524}
525
526define <16 x i8> @trunc_v4i32_shuffle(<16 x i8> %a) {
; Truncation-style shuffle: result lanes 0-3 are bytes 0, 4, 8 and 12 of %a,
; i.e. the lowest-addressed byte of each 4-byte group (the low byte of each
; 32-bit element under this file's little-endian datalayout). The remaining
; twelve lanes are undef, as is the second shuffle operand. Plain SSE2 is
; expected to mask with a RIP-relative constant and pack twice, while the
; other runs expect a single (v)pshufb.
527; SSE2-LABEL: trunc_v4i32_shuffle:
528; SSE2:       # BB#0:
529; SSE2-NEXT:    pand {{.*}}(%rip), %xmm0
530; SSE2-NEXT:    packuswb %xmm0, %xmm0
531; SSE2-NEXT:    packuswb %xmm0, %xmm0
532; SSE2-NEXT:    retq
533;
534; SSSE3-LABEL: trunc_v4i32_shuffle:
535; SSSE3:       # BB#0:
536; SSSE3-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u]
537; SSSE3-NEXT:    retq
538;
539; SSE41-LABEL: trunc_v4i32_shuffle:
540; SSE41:       # BB#0:
541; SSE41-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u]
542; SSE41-NEXT:    retq
543;
544; AVX-LABEL: trunc_v4i32_shuffle:
545; AVX:       # BB#0:
546; AVX-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u]
547; AVX-NEXT:    retq
548  %shuffle = shufflevector <16 x i8> %a, <16 x i8> undef, <16 x i32> <i32 0, i32 4, i32 8, i32 12, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
549  ret <16 x i8> %shuffle
550}
551
552define <16 x i8> @stress_test0(<16 x i8> %s.0.1, <16 x i8> %s.0.2, <16 x i8> %s.0.3, <16 x i8> %s.0.4, <16 x i8> %s.0.5, <16 x i8> %s.0.6, <16 x i8> %s.0.7, <16 x i8> %s.0.8, <16 x i8> %s.0.9) {
; Crash/regression stress test: a dependent chain of ten shufflevector
; instructions with irregular masks (some lanes undef), feeding one result.
; Of the nine arguments only %s.0.3, %s.0.4, %s.0.5, %s.0.6, %s.0.8 and %s.0.9
; are referenced; %s.0.1, %s.0.2 and %s.0.7 are unused. Only the function label
; and the presence of a return are verified, under the prefix shared by every
; RUN line.
553; We don't have anything useful to check here. This generates 100s of
554; instructions. Instead, just make sure we survived codegen.
555; ALL-LABEL: stress_test0:
556; ALL:         retq
557entry:
558  %s.1.4 = shufflevector <16 x i8> %s.0.4, <16 x i8> %s.0.5, <16 x i32> <i32 1, i32 22, i32 21, i32 28, i32 3, i32 16, i32 6, i32 1, i32 19, i32 29, i32 12, i32 31, i32 2, i32 3, i32 3, i32 6>
559  %s.1.5 = shufflevector <16 x i8> %s.0.5, <16 x i8> %s.0.6, <16 x i32> <i32 31, i32 20, i32 12, i32 19, i32 2, i32 15, i32 12, i32 31, i32 2, i32 28, i32 2, i32 30, i32 7, i32 8, i32 17, i32 28>
560  %s.1.8 = shufflevector <16 x i8> %s.0.8, <16 x i8> %s.0.9, <16 x i32> <i32 14, i32 10, i32 17, i32 5, i32 17, i32 9, i32 17, i32 21, i32 31, i32 24, i32 16, i32 6, i32 20, i32 28, i32 23, i32 8>
561  %s.2.2 = shufflevector <16 x i8> %s.0.3, <16 x i8> %s.0.4, <16 x i32> <i32 20, i32 9, i32 21, i32 11, i32 11, i32 4, i32 3, i32 18, i32 3, i32 30, i32 4, i32 31, i32 11, i32 24, i32 13, i32 29>
562  %s.3.2 = shufflevector <16 x i8> %s.2.2, <16 x i8> %s.1.4, <16 x i32> <i32 15, i32 13, i32 5, i32 11, i32 7, i32 17, i32 14, i32 22, i32 22, i32 16, i32 7, i32 24, i32 16, i32 22, i32 7, i32 29>
563  %s.5.4 = shufflevector <16 x i8> %s.1.5, <16 x i8> %s.1.8, <16 x i32> <i32 3, i32 13, i32 19, i32 7, i32 23, i32 11, i32 1, i32 9, i32 16, i32 25, i32 2, i32 7, i32 0, i32 21, i32 23, i32 17>
564  %s.6.1 = shufflevector <16 x i8> %s.3.2, <16 x i8> %s.3.2, <16 x i32> <i32 11, i32 2, i32 28, i32 31, i32 27, i32 3, i32 9, i32 27, i32 25, i32 25, i32 14, i32 7, i32 12, i32 28, i32 12, i32 23>
565  %s.7.1 = shufflevector <16 x i8> %s.6.1, <16 x i8> %s.3.2, <16 x i32> <i32 15, i32 29, i32 14, i32 0, i32 29, i32 15, i32 26, i32 30, i32 6, i32 7, i32 2, i32 8, i32 12, i32 10, i32 29, i32 17>
566  %s.7.2 = shufflevector <16 x i8> %s.3.2, <16 x i8> %s.5.4, <16 x i32> <i32 3, i32 29, i32 3, i32 19, i32 undef, i32 20, i32 undef, i32 3, i32 27, i32 undef, i32 undef, i32 11, i32 undef, i32 undef, i32 undef, i32 undef>
567  %s.16.0 = shufflevector <16 x i8> %s.7.1, <16 x i8> %s.7.2, <16 x i32> <i32 13, i32 1, i32 16, i32 16, i32 6, i32 7, i32 29, i32 18, i32 19, i32 28, i32 undef, i32 undef, i32 31, i32 1, i32 undef, i32 10>
568  ret <16 x i8> %s.16.0
569}
570
; Chain of v16i8 shuffles in which many operands and mask lanes are undef.
; Every RUN configuration expects the body to consist of nothing but a retq
; directly after the entry block label (see the note inside the function).
571define <16 x i8> @undef_test1(<16 x i8> %s.0.5, <16 x i8> %s.0.8, <16 x i8> %s.0.9) noinline nounwind {
572; There is nothing interesting to check about these instructions other than
573; that they survive codegen. However, we actually do better and delete all of
574; them because the result is 'undef'.
575;
576; ALL-LABEL: undef_test1:
577; ALL:       # BB#0: # %entry
578; ALL-NEXT:    retq
579entry:
580  %s.1.8 = shufflevector <16 x i8> %s.0.8, <16 x i8> undef, <16 x i32> <i32 9, i32 9, i32 undef, i32 undef, i32 undef, i32 2, i32 undef, i32 6, i32 undef, i32 6, i32 undef, i32 14, i32 14, i32 undef, i32 undef, i32 0>
581  %s.2.4 = shufflevector <16 x i8> undef, <16 x i8> %s.0.5, <16 x i32> <i32 21, i32 undef, i32 undef, i32 19, i32 undef, i32 undef, i32 29, i32 24, i32 21, i32 23, i32 21, i32 17, i32 19, i32 undef, i32 20, i32 22>
582  %s.2.5 = shufflevector <16 x i8> %s.0.5, <16 x i8> undef, <16 x i32> <i32 3, i32 8, i32 undef, i32 7, i32 undef, i32 10, i32 8, i32 0, i32 15, i32 undef, i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 9>
583  %s.2.9 = shufflevector <16 x i8> %s.0.9, <16 x i8> undef, <16 x i32> <i32 7, i32 undef, i32 14, i32 7, i32 8, i32 undef, i32 7, i32 8, i32 5, i32 15, i32 undef, i32 1, i32 11, i32 undef, i32 undef, i32 11>
584  %s.3.4 = shufflevector <16 x i8> %s.2.4, <16 x i8> %s.0.5, <16 x i32> <i32 5, i32 0, i32 21, i32 6, i32 15, i32 27, i32 22, i32 21, i32 4, i32 22, i32 19, i32 26, i32 9, i32 26, i32 8, i32 29>
585  %s.3.9 = shufflevector <16 x i8> %s.2.9, <16 x i8> undef, <16 x i32> <i32 8, i32 6, i32 8, i32 1, i32 undef, i32 4, i32 undef, i32 2, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 6, i32 undef>
586  %s.4.7 = shufflevector <16 x i8> %s.1.8, <16 x i8> %s.2.9, <16 x i32> <i32 9, i32 0, i32 22, i32 20, i32 24, i32 7, i32 21, i32 17, i32 20, i32 12, i32 19, i32 23, i32 2, i32 9, i32 17, i32 10>
587  %s.4.8 = shufflevector <16 x i8> %s.2.9, <16 x i8> %s.3.9, <16 x i32> <i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 6, i32 10, i32 undef, i32 0, i32 5, i32 undef, i32 9, i32 undef>
588  %s.5.7 = shufflevector <16 x i8> %s.4.7, <16 x i8> %s.4.8, <16 x i32> <i32 16, i32 0, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
589  %s.8.4 = shufflevector <16 x i8> %s.3.4, <16 x i8> %s.5.7, <16 x i32> <i32 undef, i32 undef, i32 undef, i32 28, i32 undef, i32 0, i32 undef, i32 undef, i32 undef, i32 undef, i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
590  %s.9.4 = shufflevector <16 x i8> %s.8.4, <16 x i8> undef, <16 x i32> <i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 10, i32 5>
591  %s.10.4 = shufflevector <16 x i8> %s.9.4, <16 x i8> undef, <16 x i32> <i32 undef, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
592  %s.12.4 = shufflevector <16 x i8> %s.10.4, <16 x i8> undef, <16 x i32> <i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 13, i32 undef, i32 undef, i32 undef>
593
594  ret <16 x i8> %s.12.4
595}
596
; Regression test named PR20540 (presumably the LLVM bug number; confirm).
; Widens an <8 x i8> argument to <16 x i8>: result lanes 0-7 copy %a and
; lanes 8-15 all read element 0 of the zero vector.
; SSE2 expects pand + packuswb + movq; SSSE3, SSE4.1 and AVX expect a single
; (v)pshufb that picks the even bytes of xmm0 and zeroes the upper eight.
597define <16 x i8> @PR20540(<8 x i8> %a) {
598; SSE2-LABEL: PR20540:
599; SSE2:       # BB#0:
600; SSE2-NEXT:    pand {{.*}}(%rip), %xmm0
601; SSE2-NEXT:    packuswb %xmm0, %xmm0
602; SSE2-NEXT:    movq {{.*#+}} xmm0 = xmm0[0],zero
603; SSE2-NEXT:    retq
604;
605; SSSE3-LABEL: PR20540:
606; SSSE3:       # BB#0:
607; SSSE3-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14],zero,zero,zero,zero,zero,zero,zero,zero
608; SSSE3-NEXT:    retq
609;
610; SSE41-LABEL: PR20540:
611; SSE41:       # BB#0:
612; SSE41-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14],zero,zero,zero,zero,zero,zero,zero,zero
613; SSE41-NEXT:    retq
614;
615; AVX-LABEL: PR20540:
616; AVX:       # BB#0:
617; AVX-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14],zero,zero,zero,zero,zero,zero,zero,zero
618; AVX-NEXT:    retq
619  %shuffle = shufflevector <8 x i8> %a, <8 x i8> zeroinitializer, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8>
620  ret <16 x i8> %shuffle
621}
622
; Scalar %i is inserted into lane 0 of an undef vector and then blended with
; a zero vector: result lane 0 = %i, lanes 1-15 = zero.
; Expected on every configuration: movzbl of %dil into %eax followed by a
; (v)movd into xmm0.
623define <16 x i8> @shuffle_v16i8_16_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz(i8 %i) {
624; SSE-LABEL: shuffle_v16i8_16_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz:
625; SSE:       # BB#0:
626; SSE-NEXT:    movzbl %dil, %eax
627; SSE-NEXT:    movd %eax, %xmm0
628; SSE-NEXT:    retq
629;
630; AVX-LABEL: shuffle_v16i8_16_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz:
631; AVX:       # BB#0:
632; AVX-NEXT:    movzbl %dil, %eax
633; AVX-NEXT:    vmovd %eax, %xmm0
634; AVX-NEXT:    retq
635  %a = insertelement <16 x i8> undef, i8 %i, i32 0
636  %shuffle = shufflevector <16 x i8> zeroinitializer, <16 x i8> %a, <16 x i32> <i32 16, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
637  ret <16 x i8> %shuffle
638}
639
; Scalar %i ends up in result lane 5; every other lane reads element 0 of the
; zero vector.
; SSE2 and SSSE3 expect shll $8 + pxor + pinsrw $2 (byte 5 is the high byte
; of 16-bit word 2); SSE4.1 and AVX expect pxor + (v)pinsrb $5.
640define <16 x i8> @shuffle_v16i8_zz_zz_zz_zz_zz_16_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz(i8 %i) {
641; SSE2-LABEL: shuffle_v16i8_zz_zz_zz_zz_zz_16_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz:
642; SSE2:       # BB#0:
643; SSE2-NEXT:    shll $8, %edi
644; SSE2-NEXT:    pxor %xmm0, %xmm0
645; SSE2-NEXT:    pinsrw $2, %edi, %xmm0
646; SSE2-NEXT:    retq
647;
648; SSSE3-LABEL: shuffle_v16i8_zz_zz_zz_zz_zz_16_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz:
649; SSSE3:       # BB#0:
650; SSSE3-NEXT:    shll $8, %edi
651; SSSE3-NEXT:    pxor %xmm0, %xmm0
652; SSSE3-NEXT:    pinsrw $2, %edi, %xmm0
653; SSSE3-NEXT:    retq
654;
655; SSE41-LABEL: shuffle_v16i8_zz_zz_zz_zz_zz_16_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz:
656; SSE41:       # BB#0:
657; SSE41-NEXT:    pxor %xmm0, %xmm0
658; SSE41-NEXT:    pinsrb $5, %edi, %xmm0
659; SSE41-NEXT:    retq
660;
661; AVX-LABEL: shuffle_v16i8_zz_zz_zz_zz_zz_16_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz:
662; AVX:       # BB#0:
663; AVX-NEXT:    vpxor %xmm0, %xmm0, %xmm0
664; AVX-NEXT:    vpinsrb $5, %edi, %xmm0, %xmm0
665; AVX-NEXT:    retq
666  %a = insertelement <16 x i8> undef, i8 %i, i32 0
667  %shuffle = shufflevector <16 x i8> zeroinitializer, <16 x i8> %a, <16 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 16, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
668  ret <16 x i8> %shuffle
669}
670
; Scalar %i ends up in result lane 15; lanes 1, 2, 4 and 5 are undef and the
; remaining lanes come from the zero vector.
; SSE2 and SSSE3 expect shll $8 + pxor + pinsrw $7 (byte 15 is the high byte
; of 16-bit word 7); SSE4.1 and AVX expect pxor + (v)pinsrb $15.
671define <16 x i8> @shuffle_v16i8_zz_uu_uu_zz_uu_uu_zz_zz_zz_zz_zz_zz_zz_zz_zz_16(i8 %i) {
672; SSE2-LABEL: shuffle_v16i8_zz_uu_uu_zz_uu_uu_zz_zz_zz_zz_zz_zz_zz_zz_zz_16:
673; SSE2:       # BB#0:
674; SSE2-NEXT:    shll $8, %edi
675; SSE2-NEXT:    pxor %xmm0, %xmm0
676; SSE2-NEXT:    pinsrw $7, %edi, %xmm0
677; SSE2-NEXT:    retq
678;
679; SSSE3-LABEL: shuffle_v16i8_zz_uu_uu_zz_uu_uu_zz_zz_zz_zz_zz_zz_zz_zz_zz_16:
680; SSSE3:       # BB#0:
681; SSSE3-NEXT:    shll $8, %edi
682; SSSE3-NEXT:    pxor %xmm0, %xmm0
683; SSSE3-NEXT:    pinsrw $7, %edi, %xmm0
684; SSSE3-NEXT:    retq
685;
686; SSE41-LABEL: shuffle_v16i8_zz_uu_uu_zz_uu_uu_zz_zz_zz_zz_zz_zz_zz_zz_zz_16:
687; SSE41:       # BB#0:
688; SSE41-NEXT:    pxor %xmm0, %xmm0
689; SSE41-NEXT:    pinsrb $15, %edi, %xmm0
690; SSE41-NEXT:    retq
691;
692; AVX-LABEL: shuffle_v16i8_zz_uu_uu_zz_uu_uu_zz_zz_zz_zz_zz_zz_zz_zz_zz_16:
693; AVX:       # BB#0:
694; AVX-NEXT:    vpxor %xmm0, %xmm0, %xmm0
695; AVX-NEXT:    vpinsrb $15, %edi, %xmm0, %xmm0
696; AVX-NEXT:    retq
697  %a = insertelement <16 x i8> undef, i8 %i, i32 0
698  %shuffle = shufflevector <16 x i8> zeroinitializer, <16 x i8> %a, <16 x i32> <i32 0, i32 undef, i32 undef, i32 3, i32 undef, i32 undef, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 16>
699  ret <16 x i8> %shuffle
700}
701
; Scalar %i is inserted into lane 3 of an undef vector (mask index 19) and
; lands in result lane 2; every other lane comes from the zero vector.
; SSE2 and SSSE3 expect movzbl + pxor + pinsrw $1 (byte 2 is the low byte of
; 16-bit word 1); SSE4.1 and AVX expect pxor + (v)pinsrb $2.
702define <16 x i8> @shuffle_v16i8_zz_zz_19_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz(i8 %i) {
703; SSE2-LABEL: shuffle_v16i8_zz_zz_19_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz:
704; SSE2:       # BB#0:
705; SSE2-NEXT:    movzbl %dil, %eax
706; SSE2-NEXT:    pxor %xmm0, %xmm0
707; SSE2-NEXT:    pinsrw $1, %eax, %xmm0
708; SSE2-NEXT:    retq
709;
710; SSSE3-LABEL: shuffle_v16i8_zz_zz_19_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz:
711; SSSE3:       # BB#0:
712; SSSE3-NEXT:    movzbl %dil, %eax
713; SSSE3-NEXT:    pxor %xmm0, %xmm0
714; SSSE3-NEXT:    pinsrw $1, %eax, %xmm0
715; SSSE3-NEXT:    retq
716;
717; SSE41-LABEL: shuffle_v16i8_zz_zz_19_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz:
718; SSE41:       # BB#0:
719; SSE41-NEXT:    pxor %xmm0, %xmm0
720; SSE41-NEXT:    pinsrb $2, %edi, %xmm0
721; SSE41-NEXT:    retq
722;
723; AVX-LABEL: shuffle_v16i8_zz_zz_19_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz:
724; AVX:       # BB#0:
725; AVX-NEXT:    vpxor %xmm0, %xmm0, %xmm0
726; AVX-NEXT:    vpinsrb $2, %edi, %xmm0, %xmm0
727; AVX-NEXT:    retq
728  %a = insertelement <16 x i8> undef, i8 %i, i32 3
729  %shuffle = shufflevector <16 x i8> zeroinitializer, <16 x i8> %a, <16 x i32> <i32 0, i32 1, i32 19, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
730  ret <16 x i8> %shuffle
731}
732
; Result lanes 0-11 are zero, lane 12 = %a[0], lane 14 = %a[2], and lanes 13
; and 15 are undef. The expected lowering is a single whole-register byte
; shift left by 12 ((v)pslldq), which places %a[0..3] in lanes 12-15.
733define <16 x i8> @shuffle_v16i8_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_16_uu_18_uu(<16 x i8> %a) {
734; SSE-LABEL: shuffle_v16i8_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_16_uu_18_uu:
735; SSE:       # BB#0:
736; SSE-NEXT:    pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3]
737; SSE-NEXT:    retq
738;
739; AVX-LABEL: shuffle_v16i8_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_16_uu_18_uu:
740; AVX:       # BB#0:
741; AVX-NEXT:    vpslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3]
742; AVX-NEXT:    retq
743  %shuffle = shufflevector <16 x i8> zeroinitializer, <16 x i8> %a, <16 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 16, i32 undef, i32 18, i32 undef>
744  ret <16 x i8> %shuffle
745}
746
; Result lane 0 = %a[12], lane 1 is undef, lanes 2-3 = %a[14] and %a[15], and
; lanes 4-15 read element 0 of the zero vector. The expected lowering is a
; single whole-register byte shift right by 12 ((v)psrldq), which places
; %a[12..15] in lanes 0-3.
; The mask entry for lane 10 is written as `i32 0` like the other zero lanes;
; it was previously spelled `i32 09`, which also selected a zero element.
747define <16 x i8> @shuffle_v16i8_28_uu_30_31_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz(<16 x i8> %a) {
748; SSE-LABEL: shuffle_v16i8_28_uu_30_31_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz:
749; SSE:       # BB#0:
750; SSE-NEXT:    psrldq {{.*#+}} xmm0 = xmm0[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
751; SSE-NEXT:    retq
752;
753; AVX-LABEL: shuffle_v16i8_28_uu_30_31_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz:
754; AVX:       # BB#0:
755; AVX-NEXT:    vpsrldq {{.*#+}} xmm0 = xmm0[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
756; AVX-NEXT:    retq
757  %shuffle = shufflevector <16 x i8> zeroinitializer, <16 x i8> %a, <16 x i32> <i32 28, i32 undef, i32 30, i32 31, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
758  ret <16 x i8> %shuffle
759}
760
; Two-input byte rotate: result lane 0 = %b[15], lanes 1-15 = %a[0..14].
; SSE2 expects psrldq on xmm1 + pslldq on xmm0 + por; SSSE3, SSE4.1 and AVX
; expect a single (v)palignr.
761define <16 x i8> @shuffle_v16i8_31_00_01_02_03_04_05_06_07_08_09_10_11_12_13_14(<16 x i8> %a, <16 x i8> %b) {
762; SSE2-LABEL: shuffle_v16i8_31_00_01_02_03_04_05_06_07_08_09_10_11_12_13_14:
763; SSE2:       # BB#0:
764; SSE2-NEXT:    psrldq {{.*#+}} xmm1 = xmm1[15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
765; SSE2-NEXT:    pslldq {{.*#+}} xmm0 = zero,xmm0[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14]
766; SSE2-NEXT:    por %xmm1, %xmm0
767; SSE2-NEXT:    retq
768;
769; SSSE3-LABEL: shuffle_v16i8_31_00_01_02_03_04_05_06_07_08_09_10_11_12_13_14:
770; SSSE3:       # BB#0:
771; SSSE3-NEXT:    palignr {{.*#+}} xmm0 = xmm1[15],xmm0[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14]
772; SSSE3-NEXT:    retq
773;
774; SSE41-LABEL: shuffle_v16i8_31_00_01_02_03_04_05_06_07_08_09_10_11_12_13_14:
775; SSE41:       # BB#0:
776; SSE41-NEXT:    palignr {{.*#+}} xmm0 = xmm1[15],xmm0[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14]
777; SSE41-NEXT:    retq
778;
779; AVX-LABEL: shuffle_v16i8_31_00_01_02_03_04_05_06_07_08_09_10_11_12_13_14:
780; AVX:       # BB#0:
781; AVX-NEXT:    vpalignr {{.*#+}} xmm0 = xmm1[15],xmm0[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14]
782; AVX-NEXT:    retq
783  %shuffle = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 31, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14>
784  ret <16 x i8> %shuffle
785}
786
; Single-input byte rotate: result lane 0 = %a[15], lanes 1-15 = %a[0..14].
; No mask index refers to %b.
; SSE2 expects a movdqa copy + psrldq + pslldq + por; SSSE3, SSE4.1 and AVX
; expect a single (v)palignr of xmm0 with itself.
787define <16 x i8> @shuffle_v16i8_15_00_01_02_03_04_05_06_07_08_09_10_11_12_13_14(<16 x i8> %a, <16 x i8> %b) {
788; SSE2-LABEL: shuffle_v16i8_15_00_01_02_03_04_05_06_07_08_09_10_11_12_13_14:
789; SSE2:       # BB#0:
790; SSE2-NEXT:    movdqa %xmm0, %xmm1
791; SSE2-NEXT:    psrldq {{.*#+}} xmm1 = xmm1[15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
792; SSE2-NEXT:    pslldq {{.*#+}} xmm0 = zero,xmm0[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14]
793; SSE2-NEXT:    por %xmm1, %xmm0
794; SSE2-NEXT:    retq
795;
796; SSSE3-LABEL: shuffle_v16i8_15_00_01_02_03_04_05_06_07_08_09_10_11_12_13_14:
797; SSSE3:       # BB#0:
798; SSSE3-NEXT:    palignr {{.*#+}} xmm0 = xmm0[15,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14]
799; SSSE3-NEXT:    retq
800;
801; SSE41-LABEL: shuffle_v16i8_15_00_01_02_03_04_05_06_07_08_09_10_11_12_13_14:
802; SSE41:       # BB#0:
803; SSE41-NEXT:    palignr {{.*#+}} xmm0 = xmm0[15,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14]
804; SSE41-NEXT:    retq
805;
806; AVX-LABEL: shuffle_v16i8_15_00_01_02_03_04_05_06_07_08_09_10_11_12_13_14:
807; AVX:       # BB#0:
808; AVX-NEXT:    vpalignr {{.*#+}} xmm0 = xmm0[15,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14]
809; AVX-NEXT:    retq
810  %shuffle = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 15, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14>
811  ret <16 x i8> %shuffle
812}
813
; Two-input byte rotate: result lanes 0-14 = %b[1..15], lane 15 = %a[0].
; SSE2 expects psrldq on xmm1 + pslldq on xmm0 + por; SSSE3, SSE4.1 and AVX
; expect a single (v)palignr.
814define <16 x i8> @shuffle_v16i8_17_18_19_20_21_22_23_24_25_26_27_28_29_30_31_00(<16 x i8> %a, <16 x i8> %b) {
815; SSE2-LABEL: shuffle_v16i8_17_18_19_20_21_22_23_24_25_26_27_28_29_30_31_00:
816; SSE2:       # BB#0:
817; SSE2-NEXT:    psrldq {{.*#+}} xmm1 = xmm1[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero
818; SSE2-NEXT:    pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0]
819; SSE2-NEXT:    por %xmm1, %xmm0
820; SSE2-NEXT:    retq
821;
822; SSSE3-LABEL: shuffle_v16i8_17_18_19_20_21_22_23_24_25_26_27_28_29_30_31_00:
823; SSSE3:       # BB#0:
824; SSSE3-NEXT:    palignr {{.*#+}} xmm0 = xmm1[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm0[0]
825; SSSE3-NEXT:    retq
826;
827; SSE41-LABEL: shuffle_v16i8_17_18_19_20_21_22_23_24_25_26_27_28_29_30_31_00:
828; SSE41:       # BB#0:
829; SSE41-NEXT:    palignr {{.*#+}} xmm0 = xmm1[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm0[0]
830; SSE41-NEXT:    retq
831;
832; AVX-LABEL: shuffle_v16i8_17_18_19_20_21_22_23_24_25_26_27_28_29_30_31_00:
833; AVX:       # BB#0:
834; AVX-NEXT:    vpalignr {{.*#+}} xmm0 = xmm1[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm0[0]
835; AVX-NEXT:    retq
836  %shuffle = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 0>
837  ret <16 x i8> %shuffle
838}
839
; Two-input byte rotate: result lanes 0-14 = %a[1..15], lane 15 = %b[0].
; SSE2 expects psrldq on xmm0 + pslldq on xmm1 + por. SSSE3 and SSE4.1 expect
; palignr writing xmm1 followed by a movdqa back into xmm0; AVX expects a
; single vpalignr writing xmm0 directly.
840define <16 x i8> @shuffle_v16i8_01_02_03_04_05_06_07_08_09_10_11_12_13_14_15_16(<16 x i8> %a, <16 x i8> %b) {
841; SSE2-LABEL: shuffle_v16i8_01_02_03_04_05_06_07_08_09_10_11_12_13_14_15_16:
842; SSE2:       # BB#0:
843; SSE2-NEXT:    psrldq {{.*#+}} xmm0 = xmm0[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero
844; SSE2-NEXT:    pslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0]
845; SSE2-NEXT:    por %xmm1, %xmm0
846; SSE2-NEXT:    retq
847;
848; SSSE3-LABEL: shuffle_v16i8_01_02_03_04_05_06_07_08_09_10_11_12_13_14_15_16:
849; SSSE3:       # BB#0:
850; SSSE3-NEXT:    palignr {{.*#+}} xmm1 = xmm0[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm1[0]
851; SSSE3-NEXT:    movdqa %xmm1, %xmm0
852; SSSE3-NEXT:    retq
853;
854; SSE41-LABEL: shuffle_v16i8_01_02_03_04_05_06_07_08_09_10_11_12_13_14_15_16:
855; SSE41:       # BB#0:
856; SSE41-NEXT:    palignr {{.*#+}} xmm1 = xmm0[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm1[0]
857; SSE41-NEXT:    movdqa %xmm1, %xmm0
858; SSE41-NEXT:    retq
859;
860; AVX-LABEL: shuffle_v16i8_01_02_03_04_05_06_07_08_09_10_11_12_13_14_15_16:
861; AVX:       # BB#0:
862; AVX-NEXT:    vpalignr {{.*#+}} xmm0 = xmm0[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm1[0]
863; AVX-NEXT:    retq
864  %shuffle = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16>
865  ret <16 x i8> %shuffle
866}
867
; Single-input byte rotate: result lanes 0-14 = %a[1..15], lane 15 = %a[0].
; No mask index refers to %b.
; SSE2 expects a movdqa copy + psrldq + pslldq + por; SSSE3, SSE4.1 and AVX
; expect a single (v)palignr of xmm0 with itself.
868define <16 x i8> @shuffle_v16i8_01_02_03_04_05_06_07_08_09_10_11_12_13_14_15_00(<16 x i8> %a, <16 x i8> %b) {
869; SSE2-LABEL: shuffle_v16i8_01_02_03_04_05_06_07_08_09_10_11_12_13_14_15_00:
870; SSE2:       # BB#0:
871; SSE2-NEXT:    movdqa %xmm0, %xmm1
872; SSE2-NEXT:    psrldq {{.*#+}} xmm1 = xmm1[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero
873; SSE2-NEXT:    pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0]
874; SSE2-NEXT:    por %xmm1, %xmm0
875; SSE2-NEXT:    retq
876;
877; SSSE3-LABEL: shuffle_v16i8_01_02_03_04_05_06_07_08_09_10_11_12_13_14_15_00:
878; SSSE3:       # BB#0:
879; SSSE3-NEXT:    palignr {{.*#+}} xmm0 = xmm0[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,0]
880; SSSE3-NEXT:    retq
881;
882; SSE41-LABEL: shuffle_v16i8_01_02_03_04_05_06_07_08_09_10_11_12_13_14_15_00:
883; SSE41:       # BB#0:
884; SSE41-NEXT:    palignr {{.*#+}} xmm0 = xmm0[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,0]
885; SSE41-NEXT:    retq
886;
887; AVX-LABEL: shuffle_v16i8_01_02_03_04_05_06_07_08_09_10_11_12_13_14_15_00:
888; AVX:       # BB#0:
889; AVX-NEXT:    vpalignr {{.*#+}} xmm0 = xmm0[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,0]
890; AVX-NEXT:    retq
891  %shuffle = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 0>
892  ret <16 x i8> %shuffle
893}
894
; Two-input byte rotate: result lane 0 = %a[15], lanes 1-15 = %b[0..14].
; SSE2 expects psrldq on xmm0 + pslldq on xmm1 + por. SSSE3 and SSE4.1 expect
; palignr writing xmm1 followed by a movdqa back into xmm0; AVX expects a
; single vpalignr writing xmm0 directly.
895define <16 x i8> @shuffle_v16i8_15_16_17_18_19_20_21_22_23_24_25_26_27_28_29_30(<16 x i8> %a, <16 x i8> %b) {
896; SSE2-LABEL: shuffle_v16i8_15_16_17_18_19_20_21_22_23_24_25_26_27_28_29_30:
897; SSE2:       # BB#0:
898; SSE2-NEXT:    psrldq {{.*#+}} xmm0 = xmm0[15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
899; SSE2-NEXT:    pslldq {{.*#+}} xmm1 = zero,xmm1[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14]
900; SSE2-NEXT:    por %xmm1, %xmm0
901; SSE2-NEXT:    retq
902;
903; SSSE3-LABEL: shuffle_v16i8_15_16_17_18_19_20_21_22_23_24_25_26_27_28_29_30:
904; SSSE3:       # BB#0:
905; SSSE3-NEXT:    palignr {{.*#+}} xmm1 = xmm0[15],xmm1[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14]
906; SSSE3-NEXT:    movdqa %xmm1, %xmm0
907; SSSE3-NEXT:    retq
908;
909; SSE41-LABEL: shuffle_v16i8_15_16_17_18_19_20_21_22_23_24_25_26_27_28_29_30:
910; SSE41:       # BB#0:
911; SSE41-NEXT:    palignr {{.*#+}} xmm1 = xmm0[15],xmm1[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14]
912; SSE41-NEXT:    movdqa %xmm1, %xmm0
913; SSE41-NEXT:    retq
914;
915; AVX-LABEL: shuffle_v16i8_15_16_17_18_19_20_21_22_23_24_25_26_27_28_29_30:
916; AVX:       # BB#0:
917; AVX-NEXT:    vpalignr {{.*#+}} xmm0 = xmm0[15],xmm1[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14]
918; AVX-NEXT:    retq
919  %shuffle = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30>
920  ret <16 x i8> %shuffle
921}
922
; Result lane 0 = %a[0] and lane 8 = %a[1]; every other lane is undef.
; SSE2 expects punpcklbw + punpcklwd + pshufd on xmm0 alone; SSSE3 expects a
; single pshufb; SSE4.1 and AVX expect (v)pmovzxbq.
923define <16 x i8> @shuffle_v16i8_00_uu_uu_uu_uu_uu_uu_uu_01_uu_uu_uu_uu_uu_uu_uu(<16 x i8> %a) {
924; SSE2-LABEL: shuffle_v16i8_00_uu_uu_uu_uu_uu_uu_uu_01_uu_uu_uu_uu_uu_uu_uu:
925; SSE2:       # BB#0:
926; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
927; SSE2-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
928; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,1,1,3]
929; SSE2-NEXT:    retq
930;
931; SSSE3-LABEL: shuffle_v16i8_00_uu_uu_uu_uu_uu_uu_uu_01_uu_uu_uu_uu_uu_uu_uu:
932; SSSE3:       # BB#0:
933; SSSE3-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero
934; SSSE3-NEXT:    retq
935;
936; SSE41-LABEL: shuffle_v16i8_00_uu_uu_uu_uu_uu_uu_uu_01_uu_uu_uu_uu_uu_uu_uu:
937; SSE41:       # BB#0:
938; SSE41-NEXT:    pmovzxbq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero
939; SSE41-NEXT:    retq
940;
941; AVX-LABEL: shuffle_v16i8_00_uu_uu_uu_uu_uu_uu_uu_01_uu_uu_uu_uu_uu_uu_uu:
942; AVX:       # BB#0:
943; AVX-NEXT:    vpmovzxbq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero
944; AVX-NEXT:    retq
945  %shuffle = shufflevector <16 x i8> %a, <16 x i8> zeroinitializer, <16 x i32> <i32 0, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
946  ret <16 x i8> %shuffle
947}
948
; Result lane 0 = %a[0] and lane 8 = %a[1]; every other lane comes from the
; zero vector, i.e. the two low bytes are zero-extended to 64-bit elements.
; SSE2 expects pxor + punpcklbw + punpcklwd + punpckldq against the zeroed
; xmm1; SSSE3 expects a single pshufb; SSE4.1 and AVX expect (v)pmovzxbq.
949define <16 x i8> @shuffle_v16i8_00_zz_zz_zz_zz_zz_zz_zz_01_zz_zz_zz_zz_zz_zz_zz(<16 x i8> %a) {
950; SSE2-LABEL: shuffle_v16i8_00_zz_zz_zz_zz_zz_zz_zz_01_zz_zz_zz_zz_zz_zz_zz:
951; SSE2:       # BB#0:
952; SSE2-NEXT:    pxor %xmm1, %xmm1
953; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
954; SSE2-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
955; SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
956; SSE2-NEXT:    retq
957;
958; SSSE3-LABEL: shuffle_v16i8_00_zz_zz_zz_zz_zz_zz_zz_01_zz_zz_zz_zz_zz_zz_zz:
959; SSSE3:       # BB#0:
960; SSSE3-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero
961; SSSE3-NEXT:    retq
962;
963; SSE41-LABEL: shuffle_v16i8_00_zz_zz_zz_zz_zz_zz_zz_01_zz_zz_zz_zz_zz_zz_zz:
964; SSE41:       # BB#0:
965; SSE41-NEXT:    pmovzxbq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero
966; SSE41-NEXT:    retq
967;
968; AVX-LABEL: shuffle_v16i8_00_zz_zz_zz_zz_zz_zz_zz_01_zz_zz_zz_zz_zz_zz_zz:
969; AVX:       # BB#0:
970; AVX-NEXT:    vpmovzxbq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero
971; AVX-NEXT:    retq
972  %shuffle = shufflevector <16 x i8> %a, <16 x i8> zeroinitializer, <16 x i32> <i32 0, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 1, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
973  ret <16 x i8> %shuffle
974}
975
; Result lanes 0, 4, 8 and 12 = %a[0..3]; every other lane is undef.
; SSE2 and SSSE3 expect punpcklbw + punpcklwd of xmm0 with itself; SSE4.1 and
; AVX expect (v)pmovzxbd.
976define <16 x i8> @shuffle_v16i8_00_uu_uu_uu_01_uu_uu_uu_02_uu_uu_uu_03_uu_uu_uu(<16 x i8> %a) {
977; SSE2-LABEL: shuffle_v16i8_00_uu_uu_uu_01_uu_uu_uu_02_uu_uu_uu_03_uu_uu_uu:
978; SSE2:       # BB#0:
979; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
980; SSE2-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
981; SSE2-NEXT:    retq
982;
983; SSSE3-LABEL: shuffle_v16i8_00_uu_uu_uu_01_uu_uu_uu_02_uu_uu_uu_03_uu_uu_uu:
984; SSSE3:       # BB#0:
985; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
986; SSSE3-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
987; SSSE3-NEXT:    retq
988;
989; SSE41-LABEL: shuffle_v16i8_00_uu_uu_uu_01_uu_uu_uu_02_uu_uu_uu_03_uu_uu_uu:
990; SSE41:       # BB#0:
991; SSE41-NEXT:    pmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
992; SSE41-NEXT:    retq
993;
994; AVX-LABEL: shuffle_v16i8_00_uu_uu_uu_01_uu_uu_uu_02_uu_uu_uu_03_uu_uu_uu:
995; AVX:       # BB#0:
996; AVX-NEXT:    vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
997; AVX-NEXT:    retq
998  %shuffle = shufflevector <16 x i8> %a, <16 x i8> zeroinitializer, <16 x i32> <i32 0, i32 undef, i32 undef, i32 undef, i32 1, i32 undef, i32 undef, i32 undef, i32 2, i32 undef, i32 undef, i32 undef, i32 3, i32 undef, i32 undef, i32 undef>
999  ret <16 x i8> %shuffle
1000}
1001
; Result lanes 0, 4, 8 and 12 = %a[0..3]; every other lane comes from the
; zero vector, i.e. the four low bytes are zero-extended to 32-bit elements.
; SSE2 and SSSE3 expect pxor + punpcklbw + punpcklwd against the zeroed xmm1;
; SSE4.1 and AVX expect (v)pmovzxbd.
1002define <16 x i8> @shuffle_v16i8_00_zz_zz_zz_01_zz_zz_zz_02_zz_zz_zz_03_zz_zz_zz(<16 x i8> %a) {
1003; SSE2-LABEL: shuffle_v16i8_00_zz_zz_zz_01_zz_zz_zz_02_zz_zz_zz_03_zz_zz_zz:
1004; SSE2:       # BB#0:
1005; SSE2-NEXT:    pxor %xmm1, %xmm1
1006; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
1007; SSE2-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
1008; SSE2-NEXT:    retq
1009;
1010; SSSE3-LABEL: shuffle_v16i8_00_zz_zz_zz_01_zz_zz_zz_02_zz_zz_zz_03_zz_zz_zz:
1011; SSSE3:       # BB#0:
1012; SSSE3-NEXT:    pxor %xmm1, %xmm1
1013; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
1014; SSSE3-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
1015; SSSE3-NEXT:    retq
1016;
1017; SSE41-LABEL: shuffle_v16i8_00_zz_zz_zz_01_zz_zz_zz_02_zz_zz_zz_03_zz_zz_zz:
1018; SSE41:       # BB#0:
1019; SSE41-NEXT:    pmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
1020; SSE41-NEXT:    retq
1021;
1022; AVX-LABEL: shuffle_v16i8_00_zz_zz_zz_01_zz_zz_zz_02_zz_zz_zz_03_zz_zz_zz:
1023; AVX:       # BB#0:
1024; AVX-NEXT:    vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
1025; AVX-NEXT:    retq
1026  %shuffle = shufflevector <16 x i8> %a, <16 x i8> zeroinitializer, <16 x i32> <i32 0, i32 17, i32 18, i32 19, i32 1, i32 21, i32 22, i32 23, i32 2, i32 25, i32 26, i32 27, i32 3, i32 29, i32 30, i32 31>
1027  ret <16 x i8> %shuffle
1028}
1029
; Even result lanes 0, 2, ..., 14 = %a[0..7]; every odd lane is undef.
; SSE2 and SSSE3 expect a single punpcklbw of xmm0 with itself; SSE4.1 and
; AVX expect (v)pmovzxbw.
1030define <16 x i8> @shuffle_v16i8_00_uu_01_uu_02_uu_03_uu_04_uu_05_uu_06_uu_07_uu(<16 x i8> %a) {
1031; SSE2-LABEL: shuffle_v16i8_00_uu_01_uu_02_uu_03_uu_04_uu_05_uu_06_uu_07_uu:
1032; SSE2:       # BB#0:
1033; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
1034; SSE2-NEXT:    retq
1035;
1036; SSSE3-LABEL: shuffle_v16i8_00_uu_01_uu_02_uu_03_uu_04_uu_05_uu_06_uu_07_uu:
1037; SSSE3:       # BB#0:
1038; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
1039; SSSE3-NEXT:    retq
1040;
1041; SSE41-LABEL: shuffle_v16i8_00_uu_01_uu_02_uu_03_uu_04_uu_05_uu_06_uu_07_uu:
1042; SSE41:       # BB#0:
1043; SSE41-NEXT:    pmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
1044; SSE41-NEXT:    retq
1045;
1046; AVX-LABEL: shuffle_v16i8_00_uu_01_uu_02_uu_03_uu_04_uu_05_uu_06_uu_07_uu:
1047; AVX:       # BB#0:
1048; AVX-NEXT:    vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
1049; AVX-NEXT:    retq
1050  %shuffle = shufflevector <16 x i8> %a, <16 x i8> zeroinitializer, <16 x i32> <i32 0, i32 undef, i32 1, i32 undef, i32 2, i32 undef, i32 3, i32 undef, i32 4, i32 undef, i32 5, i32 undef, i32 6, i32 undef, i32 7, i32 undef>
1051  ret <16 x i8> %shuffle
1052}
1053
; Even result lanes 0, 2, ..., 14 = %a[0..7]; every odd lane comes from the
; zero vector, i.e. the eight low bytes are zero-extended to 16-bit elements.
; SSE2 and SSSE3 expect pxor + punpcklbw against the zeroed xmm1; SSE4.1 and
; AVX expect (v)pmovzxbw.
1054define <16 x i8> @shuffle_v16i8_00_zz_01_zz_02_zz_03_zz_04_zz_05_zz_06_zz_07_zz(<16 x i8> %a) {
1055; SSE2-LABEL: shuffle_v16i8_00_zz_01_zz_02_zz_03_zz_04_zz_05_zz_06_zz_07_zz:
1056; SSE2:       # BB#0:
1057; SSE2-NEXT:    pxor %xmm1, %xmm1
1058; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
1059; SSE2-NEXT:    retq
1060;
1061; SSSE3-LABEL: shuffle_v16i8_00_zz_01_zz_02_zz_03_zz_04_zz_05_zz_06_zz_07_zz:
1062; SSSE3:       # BB#0:
1063; SSSE3-NEXT:    pxor %xmm1, %xmm1
1064; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
1065; SSSE3-NEXT:    retq
1066;
1067; SSE41-LABEL: shuffle_v16i8_00_zz_01_zz_02_zz_03_zz_04_zz_05_zz_06_zz_07_zz:
1068; SSE41:       # BB#0:
1069; SSE41-NEXT:    pmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
1070; SSE41-NEXT:    retq
1071;
1072; AVX-LABEL: shuffle_v16i8_00_zz_01_zz_02_zz_03_zz_04_zz_05_zz_06_zz_07_zz:
1073; AVX:       # BB#0:
1074; AVX-NEXT:    vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
1075; AVX-NEXT:    retq
1076  %shuffle = shufflevector <16 x i8> %a, <16 x i8> zeroinitializer, <16 x i32> <i32 0, i32 17, i32 1, i32 19, i32 2, i32 21, i32 3, i32 23, i32 4, i32 25, i32 5, i32 27, i32 6, i32 29, i32 7, i32 31>
1077  ret <16 x i8> %shuffle
1078}
1079
; Irregular two-input shuffle: result lane 0 is undef; lanes 4, 8 and 12 come
; from %b (elements 6, 2 and 2); all remaining lanes come from %a.
; SSE2 expects a long unpack / word-shuffle / mask-and-merge sequence ending
; in por; SSSE3, SSE4.1 and AVX expect one (v)pshufb per input followed by a
; (v)por.
1080define <16 x i8> @shuffle_v16i8_uu_10_02_07_22_14_07_02_18_03_01_14_18_09_11_00(<16 x i8> %a, <16 x i8> %b) {
1081; SSE2-LABEL: shuffle_v16i8_uu_10_02_07_22_14_07_02_18_03_01_14_18_09_11_00:
1082; SSE2:       # BB#0: # %entry
1083; SSE2-NEXT:    pxor %xmm2, %xmm2
1084; SSE2-NEXT:    movdqa %xmm0, %xmm3
1085; SSE2-NEXT:    punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm2[8],xmm3[9],xmm2[9],xmm3[10],xmm2[10],xmm3[11],xmm2[11],xmm3[12],xmm2[12],xmm3[13],xmm2[13],xmm3[14],xmm2[14],xmm3[15],xmm2[15]
1086; SSE2-NEXT:    pshufd {{.*#+}} xmm4 = xmm3[0,3,0,1]
1087; SSE2-NEXT:    pshuflw {{.*#+}} xmm4 = xmm4[0,1,2,2,4,5,6,7]
1088; SSE2-NEXT:    pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,5,7,7]
1089; SSE2-NEXT:    movdqa {{.*#+}} xmm5 = [65535,65535,65535,0,65535,0,0,65535]
1090; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
1091; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[0,1,0,3]
1092; SSE2-NEXT:    pshuflw {{.*#+}} xmm2 = xmm2[0,3,1,3,4,5,6,7]
1093; SSE2-NEXT:    pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,6,4]
1094; SSE2-NEXT:    pand %xmm5, %xmm2
1095; SSE2-NEXT:    pandn %xmm4, %xmm5
1096; SSE2-NEXT:    por %xmm2, %xmm5
1097; SSE2-NEXT:    psrlq $16, %xmm3
1098; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm3[0,2,2,3]
1099; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[3,1,1,3]
1100; SSE2-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[0,1,2,1,4,5,6,7]
1101; SSE2-NEXT:    pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,7,4]
1102; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,3,2,3]
1103; SSE2-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1]
1104; SSE2-NEXT:    packuswb %xmm5, %xmm2
1105; SSE2-NEXT:    movdqa {{.*#+}} xmm0 = [255,255,255,255,0,255,255,255,0,255,255,255,0,255,255,255]
1106; SSE2-NEXT:    pand %xmm0, %xmm2
1107; SSE2-NEXT:    pshuflw {{.*#+}} xmm1 = xmm1[3,3,1,1,4,5,6,7]
1108; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,0,1,1]
1109; SSE2-NEXT:    pandn %xmm1, %xmm0
1110; SSE2-NEXT:    por %xmm2, %xmm0
1111; SSE2-NEXT:    retq
1112;
1113; SSSE3-LABEL: shuffle_v16i8_uu_10_02_07_22_14_07_02_18_03_01_14_18_09_11_00:
1114; SSSE3:       # BB#0: # %entry
1115; SSSE3-NEXT:    pshufb {{.*#+}} xmm1 = xmm1[u],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[2],zero,zero,zero
1116; SSSE3-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[u,10,2,7],zero,xmm0[14,7,2],zero,xmm0[3,1,14],zero,xmm0[9,11,0]
1117; SSSE3-NEXT:    por %xmm1, %xmm0
1118; SSSE3-NEXT:    retq
1119;
1120; SSE41-LABEL: shuffle_v16i8_uu_10_02_07_22_14_07_02_18_03_01_14_18_09_11_00:
1121; SSE41:       # BB#0: # %entry
1122; SSE41-NEXT:    pshufb {{.*#+}} xmm1 = xmm1[u],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[2],zero,zero,zero
1123; SSE41-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[u,10,2,7],zero,xmm0[14,7,2],zero,xmm0[3,1,14],zero,xmm0[9,11,0]
1124; SSE41-NEXT:    por %xmm1, %xmm0
1125; SSE41-NEXT:    retq
1126;
1127; AVX-LABEL: shuffle_v16i8_uu_10_02_07_22_14_07_02_18_03_01_14_18_09_11_00:
1128; AVX:       # BB#0: # %entry
1129; AVX-NEXT:    vpshufb {{.*#+}} xmm1 = xmm1[u],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[2],zero,zero,zero
1130; AVX-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[u,10,2,7],zero,xmm0[14,7,2],zero,xmm0[3,1,14],zero,xmm0[9,11,0]
1131; AVX-NEXT:    vpor %xmm1, %xmm0, %xmm0
1132; AVX-NEXT:    retq
1133entry:
1134  %shuffle = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 undef, i32 10, i32 2, i32 7, i32 22, i32 14, i32 7, i32 2, i32 18, i32 3, i32 1, i32 14, i32 18, i32 9, i32 11, i32 0>
1135
1136  ret <16 x i8> %shuffle
1137}
1138
; Crash test: chains three two-input <16 x i8> shuffles with irregular masks
; (the last mask has one undef lane). Only the label and the final retq are
; checked, under the prefix shared by every RUN line.
1139define <16 x i8> @stress_test2(<16 x i8> %s.0.0, <16 x i8> %s.0.1, <16 x i8> %s.0.2) {
1140; Nothing interesting to test here. Just make sure we didn't crash.
1141; ALL-LABEL: stress_test2:
1142; ALL:         retq
1143entry:
1144  %s.1.0 = shufflevector <16 x i8> %s.0.0, <16 x i8> %s.0.1, <16 x i32> <i32 29, i32 30, i32 2, i32 16, i32 26, i32 21, i32 11, i32 26, i32 26, i32 3, i32 4, i32 5, i32 30, i32 28, i32 15, i32 5>
1145  %s.1.1 = shufflevector <16 x i8> %s.0.1, <16 x i8> %s.0.2, <16 x i32> <i32 31, i32 1, i32 24, i32 12, i32 28, i32 5, i32 2, i32 9, i32 29, i32 1, i32 31, i32 5, i32 6, i32 17, i32 15, i32 22>
1146  %s.2.0 = shufflevector <16 x i8> %s.1.0, <16 x i8> %s.1.1, <16 x i32> <i32 22, i32 1, i32 12, i32 3, i32 30, i32 4, i32 30, i32 undef, i32 1, i32 10, i32 14, i32 18, i32 27, i32 13, i32 16, i32 19>
1147
1148  ret <16 x i8> %s.2.0
1149}
1150
; Shuffles a constant whose lanes 12-15 are zero (lanes 0-11 undef) with a
; bitcast zero vector. Mask indices 12-15 pick the defined zero lanes and
; indices 16-27 pick the zero operand, so every result byte is zero. The
; result is stored to %ptr1 and a plain zeroinitializer to %ptr2; the checks
; expect one zeroed register stored to both (%rdi) and (%rsi).
1151define void @constant_gets_selected(<4 x i32>* %ptr1, <4 x i32>* %ptr2) {
1152; SSE-LABEL: constant_gets_selected:
1153; SSE:       # BB#0: # %entry
1154; SSE-NEXT:    xorps %xmm0, %xmm0
1155; SSE-NEXT:    movaps %xmm0, (%rdi)
1156; SSE-NEXT:    movaps %xmm0, (%rsi)
1157; SSE-NEXT:    retq
1158;
1159; AVX-LABEL: constant_gets_selected:
1160; AVX:       # BB#0: # %entry
1161; AVX-NEXT:    vxorps %xmm0, %xmm0, %xmm0
1162; AVX-NEXT:    vmovaps %xmm0, (%rdi)
1163; AVX-NEXT:    vmovaps %xmm0, (%rsi)
1164; AVX-NEXT:    retq
1165entry:
1166  %weird_zero = bitcast <4 x i32> zeroinitializer to <16 x i8>
1167  %shuffle.i = shufflevector <16 x i8> <i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 0, i8 0, i8 0, i8 0>, <16 x i8> %weird_zero, <16 x i32> <i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27>
1168  %weirder_zero = bitcast <16 x i8> %shuffle.i to <4 x i32>
1169  store <4 x i32> %weirder_zero, <4 x i32>* %ptr1, align 16
1170  store <4 x i32> zeroinitializer, <4 x i32>* %ptr2, align 16
1171  ret void
1172}
1173
1174;
1175; Shuffle to logical bit shifts
1176;
1177
; Mask index 16 selects from the zeroinitializer operand. Every even result
; byte is zero and every odd result byte i takes %a[i-1], which is a left
; shift by 8 bits within each 16-bit lane. The checks expect a single
; psllw / vpsllw by 8. %b is unused.
1178define <16 x i8> @shuffle_v16i8_zz_00_zz_02_zz_04_zz_06_zz_08_zz_10_zz_12_zz_14(<16 x i8> %a, <16 x i8> %b) {
1179; SSE-LABEL: shuffle_v16i8_zz_00_zz_02_zz_04_zz_06_zz_08_zz_10_zz_12_zz_14:
1180; SSE:       # BB#0:
1181; SSE-NEXT:    psllw $8, %xmm0
1182; SSE-NEXT:    retq
1183;
1184; AVX-LABEL: shuffle_v16i8_zz_00_zz_02_zz_04_zz_06_zz_08_zz_10_zz_12_zz_14:
1185; AVX:       # BB#0:
1186; AVX-NEXT:    vpsllw $8, %xmm0, %xmm0
1187; AVX-NEXT:    retq
1188  %shuffle = shufflevector <16 x i8> %a, <16 x i8> zeroinitializer, <16 x i32><i32 16, i32 0, i32 16, i32 2, i32 16, i32 4, i32 16, i32 6, i32 16, i32 8, i32 16, i32 10, i32 16, i32 12, i32 16, i32 14>
1189  ret <16 x i8> %shuffle
1190}
1191
; Result bytes 3, 7, 11 and 15 take %a[0], %a[4], %a[8] and %a[12]; all other
; bytes come from the zero operand (mask index 16). That is a left shift by
; 24 bits within each 32-bit lane, so the checks expect a single
; pslld / vpslld by 24. %b is unused.
1192define <16 x i8> @shuffle_v16i8_zz_zz_zz_00_zz_zz_zz_04_zz_zz_zz_08_zz_zz_zz_12(<16 x i8> %a, <16 x i8> %b) {
1193; SSE-LABEL: shuffle_v16i8_zz_zz_zz_00_zz_zz_zz_04_zz_zz_zz_08_zz_zz_zz_12:
1194; SSE:       # BB#0:
1195; SSE-NEXT:    pslld $24, %xmm0
1196; SSE-NEXT:    retq
1197;
1198; AVX-LABEL: shuffle_v16i8_zz_zz_zz_00_zz_zz_zz_04_zz_zz_zz_08_zz_zz_zz_12:
1199; AVX:       # BB#0:
1200; AVX-NEXT:    vpslld $24, %xmm0, %xmm0
1201; AVX-NEXT:    retq
1202  %shuffle = shufflevector <16 x i8> %a, <16 x i8> zeroinitializer, <16 x i32><i32 16, i32 16, i32 16, i32 0, i32 16, i32 16, i32 16, i32 4, i32 16, i32 16, i32 16, i32 8, i32 16, i32 16, i32 16, i32 12>
1203  ret <16 x i8> %shuffle
1204}
1205
; Result bytes 7 and 15 take %a[0] and %a[8]; all other bytes come from the
; zero operand (mask index 16). That is a left shift by 56 bits within each
; 64-bit lane, so the checks expect a single psllq / vpsllq by 56.
; %b is unused.
1206define <16 x i8> @shuffle_v16i8_zz_zz_zz_zz_zz_zz_zz_00_zz_zz_zz_zz_zz_zz_zz_08(<16 x i8> %a, <16 x i8> %b) {
1207; SSE-LABEL: shuffle_v16i8_zz_zz_zz_zz_zz_zz_zz_00_zz_zz_zz_zz_zz_zz_zz_08:
1208; SSE:       # BB#0:
1209; SSE-NEXT:    psllq $56, %xmm0
1210; SSE-NEXT:    retq
1211;
1212; AVX-LABEL: shuffle_v16i8_zz_zz_zz_zz_zz_zz_zz_00_zz_zz_zz_zz_zz_zz_zz_08:
1213; AVX:       # BB#0:
1214; AVX-NEXT:    vpsllq $56, %xmm0, %xmm0
1215; AVX-NEXT:    retq
1216  %shuffle = shufflevector <16 x i8> %a, <16 x i8> zeroinitializer, <16 x i32><i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 0, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 8>
1217  ret <16 x i8> %shuffle
1218}
1219
; Result bytes 0 and 8 are zero (mask index 16), bytes 2, 5 and 11 are undef,
; and every remaining byte i takes %a[i-1]. With the undef lanes free to take
; any value, the mask matches a left shift by 8 bits within each 64-bit lane;
; the checks expect a single psllq / vpsllq by 8. %b is unused.
1220define <16 x i8> @shuffle_v16i8_zz_00_uu_02_03_uu_05_06_zz_08_09_uu_11_12_13_14(<16 x i8> %a, <16 x i8> %b) {
1221; SSE-LABEL: shuffle_v16i8_zz_00_uu_02_03_uu_05_06_zz_08_09_uu_11_12_13_14:
1222; SSE:       # BB#0:
1223; SSE-NEXT:    psllq $8, %xmm0
1224; SSE-NEXT:    retq
1225;
1226; AVX-LABEL: shuffle_v16i8_zz_00_uu_02_03_uu_05_06_zz_08_09_uu_11_12_13_14:
1227; AVX:       # BB#0:
1228; AVX-NEXT:    vpsllq $8, %xmm0, %xmm0
1229; AVX-NEXT:    retq
1230  %shuffle = shufflevector <16 x i8> %a, <16 x i8> zeroinitializer, <16 x i32><i32 16, i32 0, i32 undef, i32 2, i32 3, i32 undef, i32 5, i32 6, i32 16, i32 8, i32 9, i32 undef, i32 11, i32 12, i32 13, i32 14>
1231  ret <16 x i8> %shuffle
1232}
1233
; Result byte 0 takes %a[1] and bytes 10, 12 and 14 take %a[11], %a[13] and
; %a[15]; bytes 5, 7, 9, 11, 13 and 15 are zero (mask index 16); the rest are
; undef. The defined lanes match a right shift by 8 bits within each 16-bit
; lane, so the checks expect a single psrlw / vpsrlw by 8. %b is unused.
1234define <16 x i8> @shuffle_v16i8_01_uu_uu_uu_uu_zz_uu_zz_uu_zz_11_zz_13_zz_15_zz(<16 x i8> %a, <16 x i8> %b) {
1235; SSE-LABEL: shuffle_v16i8_01_uu_uu_uu_uu_zz_uu_zz_uu_zz_11_zz_13_zz_15_zz:
1236; SSE:       # BB#0:
1237; SSE-NEXT:    psrlw $8, %xmm0
1238; SSE-NEXT:    retq
1239;
1240; AVX-LABEL: shuffle_v16i8_01_uu_uu_uu_uu_zz_uu_zz_uu_zz_11_zz_13_zz_15_zz:
1241; AVX:       # BB#0:
1242; AVX-NEXT:    vpsrlw $8, %xmm0, %xmm0
1243; AVX-NEXT:    retq
1244  %shuffle = shufflevector <16 x i8> %a, <16 x i8> zeroinitializer, <16 x i32><i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 16, i32 undef, i32 16, i32 undef, i32 16, i32 11, i32 16, i32 13, i32 16, i32 15, i32 16>
1245  ret <16 x i8> %shuffle
1246}
1247
; Result bytes 0-1, 4-5 and 12-13 take %a[2..3], %a[6..7] and %a[14..15];
; bytes 2-3 and 14-15 are zero (mask index 16); bytes 6-11 are undef. The
; defined lanes match a right shift by 16 bits within each 32-bit lane, so
; the checks expect a single psrld / vpsrld by 16. %b is unused.
1248define <16 x i8> @shuffle_v16i8_02_03_zz_zz_06_07_uu_uu_uu_uu_uu_uu_14_15_zz_zz(<16 x i8> %a, <16 x i8> %b) {
1249; SSE-LABEL: shuffle_v16i8_02_03_zz_zz_06_07_uu_uu_uu_uu_uu_uu_14_15_zz_zz:
1250; SSE:       # BB#0:
1251; SSE-NEXT:    psrld $16, %xmm0
1252; SSE-NEXT:    retq
1253;
1254; AVX-LABEL: shuffle_v16i8_02_03_zz_zz_06_07_uu_uu_uu_uu_uu_uu_14_15_zz_zz:
1255; AVX:       # BB#0:
1256; AVX-NEXT:    vpsrld $16, %xmm0, %xmm0
1257; AVX-NEXT:    retq
1258  %shuffle = shufflevector <16 x i8> %a, <16 x i8> zeroinitializer, <16 x i32><i32 2, i32 3, i32 16, i32 16, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 14, i32 15, i32 16, i32 16>
1259  ret <16 x i8> %shuffle
1260}
1261
; Result byte 0 takes %a[7] and byte 8 takes %a[15]; bytes 1-5 and 14-15 are
; zero (mask index 16); bytes 6-7 and 9-13 are undef. The defined lanes match
; a right shift by 56 bits within each 64-bit lane, so the checks expect a
; single psrlq / vpsrlq by 56. %b is unused.
1262define <16 x i8> @shuffle_v16i8_07_zz_zz_zz_zz_zz_uu_uu_15_uu_uu_uu_uu_uu_zz_zz(<16 x i8> %a, <16 x i8> %b) {
1263; SSE-LABEL: shuffle_v16i8_07_zz_zz_zz_zz_zz_uu_uu_15_uu_uu_uu_uu_uu_zz_zz:
1264; SSE:       # BB#0:
1265; SSE-NEXT:    psrlq $56, %xmm0
1266; SSE-NEXT:    retq
1267;
1268; AVX-LABEL: shuffle_v16i8_07_zz_zz_zz_zz_zz_uu_uu_15_uu_uu_uu_uu_uu_zz_zz:
1269; AVX:       # BB#0:
1270; AVX-NEXT:    vpsrlq $56, %xmm0, %xmm0
1271; AVX-NEXT:    retq
1272  %shuffle = shufflevector <16 x i8> %a, <16 x i8> zeroinitializer, <16 x i32><i32 7, i32 16, i32 16, i32 16, i32 16, i32 16, i32 undef, i32 undef, i32 15, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 16, i32 16>
1273  ret <16 x i8> %shuffle
1274}
1275
; Selects the even-indexed bytes of %inval1 (mask 0,2,..,14) followed by the
; even-indexed bytes of %inval2 (mask 16,18,..,30). The SSE2 checks expect
; both inputs masked with 255 per 16-bit lane and then combined by packuswb;
; the SSSE3, SSE4.1 and AVX checks expect a pshufb of each input with one
; shared mask, joined by punpcklqdq.
; Presumably a regression test for bug report PR12412; confirm in the tracker.
1276define <16 x i8> @PR12412(<16 x i8> %inval1, <16 x i8> %inval2) {
1277; SSE2-LABEL: PR12412:
1278; SSE2:       # BB#0: # %entry
1279; SSE2-NEXT:    movdqa {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255]
1280; SSE2-NEXT:    pand %xmm2, %xmm1
1281; SSE2-NEXT:    pand %xmm2, %xmm0
1282; SSE2-NEXT:    packuswb %xmm1, %xmm0
1283; SSE2-NEXT:    retq
1284;
1285; SSSE3-LABEL: PR12412:
1286; SSSE3:       # BB#0: # %entry
1287; SSSE3-NEXT:    movdqa {{.*#+}} xmm2 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
1288; SSSE3-NEXT:    pshufb %xmm2, %xmm1
1289; SSSE3-NEXT:    pshufb %xmm2, %xmm0
1290; SSSE3-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
1291; SSSE3-NEXT:    retq
1292;
1293; SSE41-LABEL: PR12412:
1294; SSE41:       # BB#0: # %entry
1295; SSE41-NEXT:    movdqa {{.*#+}} xmm2 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
1296; SSE41-NEXT:    pshufb %xmm2, %xmm1
1297; SSE41-NEXT:    pshufb %xmm2, %xmm0
1298; SSE41-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
1299; SSE41-NEXT:    retq
1300;
1301; AVX-LABEL: PR12412:
1302; AVX:       # BB#0: # %entry
1303; AVX-NEXT:    vmovdqa {{.*#+}} xmm2 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
1304; AVX-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
1305; AVX-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
1306; AVX-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
1307; AVX-NEXT:    retq
1308entry:
1309  %0 = shufflevector <16 x i8> %inval1, <16 x i8> %inval2, <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14, i32 16, i32 18, i32 20, i32 22, i32 24, i32 26, i32 28, i32 30>
1310  ret <16 x i8> %0
1311}
1312
; Within each group of four bytes, result byte 0 is undef, bytes 1-2 take
; source bytes 2-3 of the same group of %a, and byte 3 is zero (mask index 16
; selects from the zeroinitializer operand). The defined lanes match a right
; shift by 8 bits within each 32-bit lane, so the checks expect a single
; psrld / vpsrld by 8.
1313define <16 x i8> @shuffle_v16i8_uu_02_03_zz_uu_06_07_zz_uu_10_11_zz_uu_14_15_zz(<16 x i8> %a) {
1314; SSE-LABEL: shuffle_v16i8_uu_02_03_zz_uu_06_07_zz_uu_10_11_zz_uu_14_15_zz:
1315; SSE:       # BB#0:
1316; SSE-NEXT:    psrld $8, %xmm0
1317; SSE-NEXT:    retq
1318;
1319; AVX-LABEL: shuffle_v16i8_uu_02_03_zz_uu_06_07_zz_uu_10_11_zz_uu_14_15_zz:
1320; AVX:       # BB#0:
1321; AVX-NEXT:    vpsrld $8, %xmm0, %xmm0
1322; AVX-NEXT:    retq
1323  %shuffle = shufflevector <16 x i8> %a, <16 x i8> zeroinitializer, <16 x i32> <i32 undef, i32 2, i32 3, i32 16, i32 undef, i32 6, i32 7, i32 16, i32 undef, i32 10, i32 11, i32 16, i32 undef, i32 14, i32 15, i32 16>
1324  ret <16 x i8> %shuffle
1325}
1326
; Three shuffles separated by bitcasts. The byte shuffle interleaves the low
; eight bytes of %a and %b with the pairs in reverse order (a7,b7,a6,b6,..).
; Reversing the four 32-bit elements and then swapping adjacent 16-bit
; elements restores ascending order, so the combined result is
; a0,b0,a1,b1,..,a7,b7 and the checks expect a single punpcklbw / vpunpcklbw.
1327define <16 x i8> @shuffle_v16i8_bitcast_unpack(<16 x i8> %a, <16 x i8> %b) {
1328; SSE-LABEL: shuffle_v16i8_bitcast_unpack:
1329; SSE:       # BB#0:
1330; SSE-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
1331; SSE-NEXT:    retq
1332;
1333; AVX-LABEL: shuffle_v16i8_bitcast_unpack:
1334; AVX:       # BB#0:
1335; AVX-NEXT:    vpunpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
1336; AVX-NEXT:    retq
1337  %shuffle8  = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 7, i32 23, i32 6, i32 22, i32 5, i32 21, i32 4, i32 20, i32 3, i32 19, i32 2, i32 18, i32 1, i32 17, i32 0, i32 16>
1338  %bitcast32 = bitcast <16 x i8> %shuffle8 to <4 x float>
1339  %shuffle32 = shufflevector <4 x float> %bitcast32, <4 x float> undef, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
1340  %bitcast16 = bitcast <4 x float> %shuffle32 to <8 x i16>
1341  %shuffle16 = shufflevector <8 x i16> %bitcast16, <8 x i16> undef, <8 x i32> <i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6>
1342  %bitcast8  = bitcast <8 x i16> %shuffle16 to <16 x i8>
1343  ret <16 x i8> %bitcast8
1344}
1345
; Loads an i32 from %ptr, inserts it into element 0 of a zero <4 x i32>,
; bitcasts to <16 x i8> and splats byte 0 across all sixteen lanes. With AVX2
; the checks expect the load folded into vpbroadcastb from (%rdi); the other
; configurations expect a movd load followed by a shuffle sequence.
1346define <16 x i8> @insert_dup_mem_v16i8_i32(i32* %ptr) {
1347; SSE2-LABEL: insert_dup_mem_v16i8_i32:
1348; SSE2:       # BB#0:
1349; SSE2-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
1350; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
1351; SSE2-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
1352; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,0,1,1]
1353; SSE2-NEXT:    retq
1354;
1355; SSSE3-LABEL: insert_dup_mem_v16i8_i32:
1356; SSSE3:       # BB#0:
1357; SSSE3-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
1358; SSSE3-NEXT:    pxor %xmm1, %xmm1
1359; SSSE3-NEXT:    pshufb %xmm1, %xmm0
1360; SSSE3-NEXT:    retq
1361;
1362; SSE41-LABEL: insert_dup_mem_v16i8_i32:
1363; SSE41:       # BB#0:
1364; SSE41-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
1365; SSE41-NEXT:    pxor %xmm1, %xmm1
1366; SSE41-NEXT:    pshufb %xmm1, %xmm0
1367; SSE41-NEXT:    retq
1368;
1369; AVX1-LABEL: insert_dup_mem_v16i8_i32:
1370; AVX1:       # BB#0:
1371; AVX1-NEXT:    vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
1372; AVX1-NEXT:    vpxor %xmm1, %xmm1, %xmm1
1373; AVX1-NEXT:    vpshufb %xmm1, %xmm0, %xmm0
1374; AVX1-NEXT:    retq
1375;
1376; AVX2-LABEL: insert_dup_mem_v16i8_i32:
1377; AVX2:       # BB#0:
1378; AVX2-NEXT:    vpbroadcastb (%rdi), %xmm0
1379; AVX2-NEXT:    retq
1380  %tmp = load i32, i32* %ptr, align 4
1381  %tmp1 = insertelement <4 x i32> zeroinitializer, i32 %tmp, i32 0
1382  %tmp2 = bitcast <4 x i32> %tmp1 to <16 x i8>
1383  %tmp3 = shufflevector <16 x i8> %tmp2, <16 x i8> undef, <16 x i32> zeroinitializer
1384  ret <16 x i8> %tmp3
1385}
1386
; Loads an i8 from %ptr, sign-extends it to i32, inserts it into element 0 of
; a zero <4 x i32>, bitcasts to <16 x i8> and splats byte 0. With AVX2 the
; checks expect vpbroadcastb directly from (%rdi); the other configurations
; expect a movsbl load, a movd into the vector register and a shuffle
; sequence.
1387define <16 x i8> @insert_dup_mem_v16i8_sext_i8(i8* %ptr) {
1388; SSE2-LABEL: insert_dup_mem_v16i8_sext_i8:
1389; SSE2:       # BB#0:
1390; SSE2-NEXT:    movsbl (%rdi), %eax
1391; SSE2-NEXT:    movd %eax, %xmm0
1392; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
1393; SSE2-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
1394; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,0,1,1]
1395; SSE2-NEXT:    retq
1396;
1397; SSSE3-LABEL: insert_dup_mem_v16i8_sext_i8:
1398; SSSE3:       # BB#0:
1399; SSSE3-NEXT:    movsbl (%rdi), %eax
1400; SSSE3-NEXT:    movd %eax, %xmm0
1401; SSSE3-NEXT:    pxor %xmm1, %xmm1
1402; SSSE3-NEXT:    pshufb %xmm1, %xmm0
1403; SSSE3-NEXT:    retq
1404;
1405; SSE41-LABEL: insert_dup_mem_v16i8_sext_i8:
1406; SSE41:       # BB#0:
1407; SSE41-NEXT:    movsbl (%rdi), %eax
1408; SSE41-NEXT:    movd %eax, %xmm0
1409; SSE41-NEXT:    pxor %xmm1, %xmm1
1410; SSE41-NEXT:    pshufb %xmm1, %xmm0
1411; SSE41-NEXT:    retq
1412;
1413; AVX1-LABEL: insert_dup_mem_v16i8_sext_i8:
1414; AVX1:       # BB#0:
1415; AVX1-NEXT:    movsbl (%rdi), %eax
1416; AVX1-NEXT:    vmovd %eax, %xmm0
1417; AVX1-NEXT:    vpxor %xmm1, %xmm1, %xmm1
1418; AVX1-NEXT:    vpshufb %xmm1, %xmm0, %xmm0
1419; AVX1-NEXT:    retq
1420;
1421; AVX2-LABEL: insert_dup_mem_v16i8_sext_i8:
1422; AVX2:       # BB#0:
1423; AVX2-NEXT:    vpbroadcastb (%rdi), %xmm0
1424; AVX2-NEXT:    retq
1425  %tmp = load i8, i8* %ptr, align 1
1426  %tmp1 = sext i8 %tmp to i32
1427  %tmp2 = insertelement <4 x i32> zeroinitializer, i32 %tmp1, i32 0
1428  %tmp3 = bitcast <4 x i32> %tmp2 to <16 x i8>
1429  %tmp4 = shufflevector <16 x i8> %tmp3, <16 x i8> undef, <16 x i32> zeroinitializer
1430  ret <16 x i8> %tmp4
1431}
1432
; Same shape as the byte-0 splat of a loaded i32, but the mask splats byte 1
; of the inserted value. With AVX2 the checks expect vpbroadcastb from
; 1(%rdi), i.e. the byte offset folded into the memory operand; the other
; configurations expect a movd load followed by a shuffle selecting byte 1.
1433define <16 x i8> @insert_dup_elt1_mem_v16i8_i32(i32* %ptr) {
1434; SSE2-LABEL: insert_dup_elt1_mem_v16i8_i32:
1435; SSE2:       # BB#0:
1436; SSE2-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
1437; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
1438; SSE2-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[1,1,1,1,4,5,6,7]
1439; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,0,1,1]
1440; SSE2-NEXT:    retq
1441;
1442; SSSE3-LABEL: insert_dup_elt1_mem_v16i8_i32:
1443; SSSE3:       # BB#0:
1444; SSSE3-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
1445; SSSE3-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
1446; SSSE3-NEXT:    retq
1447;
1448; SSE41-LABEL: insert_dup_elt1_mem_v16i8_i32:
1449; SSE41:       # BB#0:
1450; SSE41-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
1451; SSE41-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
1452; SSE41-NEXT:    retq
1453;
1454; AVX1-LABEL: insert_dup_elt1_mem_v16i8_i32:
1455; AVX1:       # BB#0:
1456; AVX1-NEXT:    vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
1457; AVX1-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
1458; AVX1-NEXT:    retq
1459;
1460; AVX2-LABEL: insert_dup_elt1_mem_v16i8_i32:
1461; AVX2:       # BB#0:
1462; AVX2-NEXT:    vpbroadcastb 1(%rdi), %xmm0
1463; AVX2-NEXT:    retq
1464  %tmp = load i32, i32* %ptr, align 4
1465  %tmp1 = insertelement <4 x i32> zeroinitializer, i32 %tmp, i32 0
1466  %tmp2 = bitcast <4 x i32> %tmp1 to <16 x i8>
1467  %tmp3 = shufflevector <16 x i8> %tmp2, <16 x i8> undef, <16 x i32> <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
1468  ret <16 x i8> %tmp3
1469}
1470
; Loads an i32 from %ptr, inserts it into element 0 of a zero <4 x i32>,
; bitcasts to <16 x i8> and splats byte 2. With AVX2 the checks expect
; vpbroadcastb from 2(%rdi), i.e. the byte offset folded into the memory
; operand; the other configurations expect a movd load followed by a shuffle
; selecting byte 2.
1471define <16 x i8> @insert_dup_elt2_mem_v16i8_i32(i32* %ptr) {
1472; SSE2-LABEL: insert_dup_elt2_mem_v16i8_i32:
1473; SSE2:       # BB#0:
1474; SSE2-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
1475; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
1476; SSE2-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[2,2,2,2,4,5,6,7]
1477; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,0,1,1]
1478; SSE2-NEXT:    retq
1479;
1480; SSSE3-LABEL: insert_dup_elt2_mem_v16i8_i32:
1481; SSSE3:       # BB#0:
1482; SSSE3-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
1483; SSSE3-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2]
1484; SSSE3-NEXT:    retq
1485;
1486; SSE41-LABEL: insert_dup_elt2_mem_v16i8_i32:
1487; SSE41:       # BB#0:
1488; SSE41-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
1489; SSE41-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2]
1490; SSE41-NEXT:    retq
1491;
1492; AVX1-LABEL: insert_dup_elt2_mem_v16i8_i32:
1493; AVX1:       # BB#0:
1494; AVX1-NEXT:    vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
1495; AVX1-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2]
1496; AVX1-NEXT:    retq
1497;
1498; AVX2-LABEL: insert_dup_elt2_mem_v16i8_i32:
1499; AVX2:       # BB#0:
1500; AVX2-NEXT:    vpbroadcastb 2(%rdi), %xmm0
1501; AVX2-NEXT:    retq
1502  %tmp = load i32, i32* %ptr, align 4
1503  %tmp1 = insertelement <4 x i32> zeroinitializer, i32 %tmp, i32 0
1504  %tmp2 = bitcast <4 x i32> %tmp1 to <16 x i8>
1505  %tmp3 = shufflevector <16 x i8> %tmp2, <16 x i8> undef, <16 x i32> <i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2>
1506  ret <16 x i8> %tmp3
1507}
1508
; Loads an i8 from %ptr, sign-extends it to i32, inserts it into element 0 of
; a zero <4 x i32>, bitcasts to <16 x i8> and splats byte 1. Only one byte is
; loaded, so byte 1 of the value comes from the sign extension rather than
; from memory. With AVX2 the checks expect movsbl, a shrl by 8, vmovd and a
; register-form vpbroadcastb; the other configurations expect movsbl + movd
; followed by a shuffle selecting byte 1.
1509define <16 x i8> @insert_dup_elt1_mem_v16i8_sext_i8(i8* %ptr) {
1510; SSE2-LABEL: insert_dup_elt1_mem_v16i8_sext_i8:
1511; SSE2:       # BB#0:
1512; SSE2-NEXT:    movsbl (%rdi), %eax
1513; SSE2-NEXT:    movd %eax, %xmm0
1514; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
1515; SSE2-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[1,1,1,1,4,5,6,7]
1516; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,0,1,1]
1517; SSE2-NEXT:    retq
1518;
1519; SSSE3-LABEL: insert_dup_elt1_mem_v16i8_sext_i8:
1520; SSSE3:       # BB#0:
1521; SSSE3-NEXT:    movsbl (%rdi), %eax
1522; SSSE3-NEXT:    movd %eax, %xmm0
1523; SSSE3-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
1524; SSSE3-NEXT:    retq
1525;
1526; SSE41-LABEL: insert_dup_elt1_mem_v16i8_sext_i8:
1527; SSE41:       # BB#0:
1528; SSE41-NEXT:    movsbl (%rdi), %eax
1529; SSE41-NEXT:    movd %eax, %xmm0
1530; SSE41-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
1531; SSE41-NEXT:    retq
1532;
1533; AVX1-LABEL: insert_dup_elt1_mem_v16i8_sext_i8:
1534; AVX1:       # BB#0:
1535; AVX1-NEXT:    movsbl (%rdi), %eax
1536; AVX1-NEXT:    vmovd %eax, %xmm0
1537; AVX1-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
1538; AVX1-NEXT:    retq
1539;
1540; AVX2-LABEL: insert_dup_elt1_mem_v16i8_sext_i8:
1541; AVX2:       # BB#0:
1542; AVX2-NEXT:    movsbl (%rdi), %eax
1543; AVX2-NEXT:    shrl $8, %eax
1544; AVX2-NEXT:    vmovd %eax, %xmm0
1545; AVX2-NEXT:    vpbroadcastb %xmm0, %xmm0
1546; AVX2-NEXT:    retq
1547  %tmp = load i8, i8* %ptr, align 1
1548  %tmp1 = sext i8 %tmp to i32
1549  %tmp2 = insertelement <4 x i32> zeroinitializer, i32 %tmp1, i32 0
1550  %tmp3 = bitcast <4 x i32> %tmp2 to <16 x i8>
1551  %tmp4 = shufflevector <16 x i8> %tmp3, <16 x i8> undef, <16 x i32> <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
1552  ret <16 x i8> %tmp4
1553}
1554
; Loads an i8 from %ptr, sign-extends it to i32, inserts it into element 0 of
; a zero <4 x i32>, bitcasts to <16 x i8> and splats byte 2. Only one byte is
; loaded, so byte 2 of the value comes from the sign extension rather than
; from memory. With AVX2 the checks expect movsbl, a shrl by 16, vmovd and a
; register-form vpbroadcastb; the other configurations expect movsbl + movd
; followed by a shuffle selecting byte 2.
1555define <16 x i8> @insert_dup_elt2_mem_v16i8_sext_i8(i8* %ptr) {
1556; SSE2-LABEL: insert_dup_elt2_mem_v16i8_sext_i8:
1557; SSE2:       # BB#0:
1558; SSE2-NEXT:    movsbl (%rdi), %eax
1559; SSE2-NEXT:    movd %eax, %xmm0
1560; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
1561; SSE2-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[2,2,2,2,4,5,6,7]
1562; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,0,1,1]
1563; SSE2-NEXT:    retq
1564;
1565; SSSE3-LABEL: insert_dup_elt2_mem_v16i8_sext_i8:
1566; SSSE3:       # BB#0:
1567; SSSE3-NEXT:    movsbl (%rdi), %eax
1568; SSSE3-NEXT:    movd %eax, %xmm0
1569; SSSE3-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2]
1570; SSSE3-NEXT:    retq
1571;
1572; SSE41-LABEL: insert_dup_elt2_mem_v16i8_sext_i8:
1573; SSE41:       # BB#0:
1574; SSE41-NEXT:    movsbl (%rdi), %eax
1575; SSE41-NEXT:    movd %eax, %xmm0
1576; SSE41-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2]
1577; SSE41-NEXT:    retq
1578;
1579; AVX1-LABEL: insert_dup_elt2_mem_v16i8_sext_i8:
1580; AVX1:       # BB#0:
1581; AVX1-NEXT:    movsbl (%rdi), %eax
1582; AVX1-NEXT:    vmovd %eax, %xmm0
1583; AVX1-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2]
1584; AVX1-NEXT:    retq
1585;
1586; AVX2-LABEL: insert_dup_elt2_mem_v16i8_sext_i8:
1587; AVX2:       # BB#0:
1588; AVX2-NEXT:    movsbl (%rdi), %eax
1589; AVX2-NEXT:    shrl $16, %eax
1590; AVX2-NEXT:    vmovd %eax, %xmm0
1591; AVX2-NEXT:    vpbroadcastb %xmm0, %xmm0
1592; AVX2-NEXT:    retq
1593  %tmp = load i8, i8* %ptr, align 1
1594  %tmp1 = sext i8 %tmp to i32
1595  %tmp2 = insertelement <4 x i32> zeroinitializer, i32 %tmp1, i32 0
1596  %tmp3 = bitcast <4 x i32> %tmp2 to <16 x i8>
1597  %tmp4 = shufflevector <16 x i8> %tmp3, <16 x i8> undef, <16 x i32> <i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2>
1598  ret <16 x i8> %tmp4
1599}
1600