1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefix=SSE --check-prefix=SSE2
3; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+ssse3 | FileCheck %s --check-prefix=SSE --check-prefix=SSSE3
4; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefix=SSE --check-prefix=SSE41
5; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefix=AVX --check-prefix=AVX1
6; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=AVX --check-prefix=AVX2
7;
8; Just one 32-bit run to make sure we do reasonable things there.
9; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefix=X32-SSE41
10
; Sign-extends the low eight i8 lanes of %A to i16. The shufflevector keeps
; lanes 0-7 and the sext widens them to <8 x i16>. The assertions below expect
; a single pmovsxbw/vpmovsxbw where SSE4.1 or AVX is enabled, and a punpcklbw
; followed by psraw $8 on the SSE2 and SSSE3 runs.
11define <8 x i16> @sext_16i8_to_8i16(<16 x i8> %A) nounwind uwtable readnone ssp {
12; SSE2-LABEL: sext_16i8_to_8i16:
13; SSE2:       # BB#0: # %entry
14; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
15; SSE2-NEXT:    psraw $8, %xmm0
16; SSE2-NEXT:    retq
17;
18; SSSE3-LABEL: sext_16i8_to_8i16:
19; SSSE3:       # BB#0: # %entry
20; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
21; SSSE3-NEXT:    psraw $8, %xmm0
22; SSSE3-NEXT:    retq
23;
24; SSE41-LABEL: sext_16i8_to_8i16:
25; SSE41:       # BB#0: # %entry
26; SSE41-NEXT:    pmovsxbw %xmm0, %xmm0
27; SSE41-NEXT:    retq
28;
29; AVX-LABEL: sext_16i8_to_8i16:
30; AVX:       # BB#0: # %entry
31; AVX-NEXT:    vpmovsxbw %xmm0, %xmm0
32; AVX-NEXT:    retq
33;
34; X32-SSE41-LABEL: sext_16i8_to_8i16:
35; X32-SSE41:       # BB#0: # %entry
36; X32-SSE41-NEXT:    pmovsxbw %xmm0, %xmm0
37; X32-SSE41-NEXT:    retl
38entry:
  ; Keep lanes 0-7 of the 16-lane input.
39  %B = shufflevector <16 x i8> %A, <16 x i8> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  ; Widen each kept i8 lane to i16 with sign extension.
40  %C = sext <8 x i8> %B to <8 x i16>
41  ret <8 x i16> %C
42}
43
; Sign-extends all sixteen i8 lanes of %A to i16, producing a 256-bit result.
; The AVX2 run expects one vpmovsxbw into ymm0. The AVX1 run expects two
; 128-bit vpmovsxbw results joined with vinsertf128. The SSE runs build the
; result as two 128-bit halves, ending with the low half in xmm0 (copied from
; xmm2) and the high half in xmm1.
44define <16 x i16> @sext_16i8_to_16i16(<16 x i8> %A) nounwind uwtable readnone ssp {
45; SSE2-LABEL: sext_16i8_to_16i16:
46; SSE2:       # BB#0: # %entry
47; SSE2-NEXT:    punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
48; SSE2-NEXT:    psraw $8, %xmm2
49; SSE2-NEXT:    punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15]
50; SSE2-NEXT:    psraw $8, %xmm1
51; SSE2-NEXT:    movdqa %xmm2, %xmm0
52; SSE2-NEXT:    retq
53;
54; SSSE3-LABEL: sext_16i8_to_16i16:
55; SSSE3:       # BB#0: # %entry
56; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
57; SSSE3-NEXT:    psraw $8, %xmm2
58; SSSE3-NEXT:    punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15]
59; SSSE3-NEXT:    psraw $8, %xmm1
60; SSSE3-NEXT:    movdqa %xmm2, %xmm0
61; SSSE3-NEXT:    retq
62;
63; SSE41-LABEL: sext_16i8_to_16i16:
64; SSE41:       # BB#0: # %entry
65; SSE41-NEXT:    pmovsxbw %xmm0, %xmm2
66; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
67; SSE41-NEXT:    pmovsxbw %xmm0, %xmm1
68; SSE41-NEXT:    movdqa %xmm2, %xmm0
69; SSE41-NEXT:    retq
70;
71; AVX1-LABEL: sext_16i8_to_16i16:
72; AVX1:       # BB#0: # %entry
73; AVX1-NEXT:    vpmovsxbw %xmm0, %xmm1
74; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
75; AVX1-NEXT:    vpmovsxbw %xmm0, %xmm0
76; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
77; AVX1-NEXT:    retq
78;
79; AVX2-LABEL: sext_16i8_to_16i16:
80; AVX2:       # BB#0: # %entry
81; AVX2-NEXT:    vpmovsxbw %xmm0, %ymm0
82; AVX2-NEXT:    retq
83;
84; X32-SSE41-LABEL: sext_16i8_to_16i16:
85; X32-SSE41:       # BB#0: # %entry
86; X32-SSE41-NEXT:    pmovsxbw %xmm0, %xmm2
87; X32-SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
88; X32-SSE41-NEXT:    pmovsxbw %xmm0, %xmm1
89; X32-SSE41-NEXT:    movdqa %xmm2, %xmm0
90; X32-SSE41-NEXT:    retl
91entry:
  ; Full-width extension, no lane selection is needed here.
92  %B = sext <16 x i8> %A to <16 x i16>
93  ret <16 x i16> %B
94}
95
; Sign-extends the low four i8 lanes of %A to i32. The assertions expect a
; single pmovsxbd/vpmovsxbd where SSE4.1 or AVX is enabled. The SSE2 and SSSE3
; runs expect two unpacks (bytes, then words) that place each source byte in
; the top byte of a dword, followed by psrad $24.
96define <4 x i32> @sext_16i8_to_4i32(<16 x i8> %A) nounwind uwtable readnone ssp {
97; SSE2-LABEL: sext_16i8_to_4i32:
98; SSE2:       # BB#0: # %entry
99; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
100; SSE2-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
101; SSE2-NEXT:    psrad $24, %xmm0
102; SSE2-NEXT:    retq
103;
104; SSSE3-LABEL: sext_16i8_to_4i32:
105; SSSE3:       # BB#0: # %entry
106; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
107; SSSE3-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
108; SSSE3-NEXT:    psrad $24, %xmm0
109; SSSE3-NEXT:    retq
110;
111; SSE41-LABEL: sext_16i8_to_4i32:
112; SSE41:       # BB#0: # %entry
113; SSE41-NEXT:    pmovsxbd %xmm0, %xmm0
114; SSE41-NEXT:    retq
115;
116; AVX-LABEL: sext_16i8_to_4i32:
117; AVX:       # BB#0: # %entry
118; AVX-NEXT:    vpmovsxbd %xmm0, %xmm0
119; AVX-NEXT:    retq
120;
121; X32-SSE41-LABEL: sext_16i8_to_4i32:
122; X32-SSE41:       # BB#0: # %entry
123; X32-SSE41-NEXT:    pmovsxbd %xmm0, %xmm0
124; X32-SSE41-NEXT:    retl
125entry:
  ; Keep lanes 0-3 of the 16-lane input.
126  %B = shufflevector <16 x i8> %A, <16 x i8> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  ; Widen each kept i8 lane to i32 with sign extension.
127  %C = sext <4 x i8> %B to <4 x i32>
128  ret <4 x i32> %C
129}
130
; Sign-extends the low eight i8 lanes of %A to i32, producing a 256-bit result.
; The SSE4.1 and AVX1 runs expect two pmovsxbd operations, with a pshufd that
; brings source bytes 4-7 into the low dword for the second one. The SSE runs
; end with the low half of the result in xmm0 and the high half in xmm1.
; NOTE(review): the AVX2 assertions record vpmovzxbd followed by a
; vpslld/vpsrad $24 pair rather than one sign-extending move; presumably this
; is llc output captured as-is by the autogeneration script - confirm.
131define <8 x i32> @sext_16i8_to_8i32(<16 x i8> %A) nounwind uwtable readnone ssp {
132; SSE2-LABEL: sext_16i8_to_8i32:
133; SSE2:       # BB#0: # %entry
134; SSE2-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
135; SSE2-NEXT:    punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
136; SSE2-NEXT:    psrad $24, %xmm2
137; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,2,3]
138; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
139; SSE2-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
140; SSE2-NEXT:    psrad $24, %xmm1
141; SSE2-NEXT:    movdqa %xmm2, %xmm0
142; SSE2-NEXT:    retq
143;
144; SSSE3-LABEL: sext_16i8_to_8i32:
145; SSSE3:       # BB#0: # %entry
146; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
147; SSSE3-NEXT:    punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
148; SSSE3-NEXT:    psrad $24, %xmm2
149; SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,2,3]
150; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
151; SSSE3-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
152; SSSE3-NEXT:    psrad $24, %xmm1
153; SSSE3-NEXT:    movdqa %xmm2, %xmm0
154; SSSE3-NEXT:    retq
155;
156; SSE41-LABEL: sext_16i8_to_8i32:
157; SSE41:       # BB#0: # %entry
158; SSE41-NEXT:    pmovsxbd %xmm0, %xmm2
159; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,2,3]
160; SSE41-NEXT:    pmovsxbd %xmm0, %xmm1
161; SSE41-NEXT:    movdqa %xmm2, %xmm0
162; SSE41-NEXT:    retq
163;
164; AVX1-LABEL: sext_16i8_to_8i32:
165; AVX1:       # BB#0: # %entry
166; AVX1-NEXT:    vpmovsxbd %xmm0, %xmm1
167; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[1,1,2,3]
168; AVX1-NEXT:    vpmovsxbd %xmm0, %xmm0
169; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
170; AVX1-NEXT:    retq
171;
172; AVX2-LABEL: sext_16i8_to_8i32:
173; AVX2:       # BB#0: # %entry
174; AVX2-NEXT:    vpmovzxbd {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero
175; AVX2-NEXT:    vpslld $24, %ymm0, %ymm0
176; AVX2-NEXT:    vpsrad $24, %ymm0, %ymm0
177; AVX2-NEXT:    retq
178;
179; X32-SSE41-LABEL: sext_16i8_to_8i32:
180; X32-SSE41:       # BB#0: # %entry
181; X32-SSE41-NEXT:    pmovsxbd %xmm0, %xmm2
182; X32-SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,2,3]
183; X32-SSE41-NEXT:    pmovsxbd %xmm0, %xmm1
184; X32-SSE41-NEXT:    movdqa %xmm2, %xmm0
185; X32-SSE41-NEXT:    retl
186entry:
  ; Keep lanes 0-7 of the 16-lane input.
187  %B = shufflevector <16 x i8> %A, <16 x i8> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  ; Widen each kept i8 lane to i32 with sign extension.
188  %C = sext <8 x i8> %B to <8 x i32>
189  ret <8 x i32> %C
190}
191
; Sign-extends the low two i8 lanes of %A to i64. The assertions expect a
; single pmovsxbq/vpmovsxbq where SSE4.1 or AVX is enabled. The SSE2 and SSSE3
; runs expect the bytes to be unpacked into dwords, a psrad $31 copy that
; supplies the sign dwords, a psrad $24 that supplies the value dwords, and a
; punpckldq that interleaves the two into 64-bit lanes.
192define <2 x i64> @sext_16i8_to_2i64(<16 x i8> %A) nounwind uwtable readnone ssp {
193; SSE2-LABEL: sext_16i8_to_2i64:
194; SSE2:       # BB#0: # %entry
195; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
196; SSE2-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
197; SSE2-NEXT:    movdqa %xmm0, %xmm1
198; SSE2-NEXT:    psrad $31, %xmm1
199; SSE2-NEXT:    psrad $24, %xmm0
200; SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
201; SSE2-NEXT:    retq
202;
203; SSSE3-LABEL: sext_16i8_to_2i64:
204; SSSE3:       # BB#0: # %entry
205; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
206; SSSE3-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
207; SSSE3-NEXT:    movdqa %xmm0, %xmm1
208; SSSE3-NEXT:    psrad $31, %xmm1
209; SSSE3-NEXT:    psrad $24, %xmm0
210; SSSE3-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
211; SSSE3-NEXT:    retq
212;
213; SSE41-LABEL: sext_16i8_to_2i64:
214; SSE41:       # BB#0: # %entry
215; SSE41-NEXT:    pmovsxbq %xmm0, %xmm0
216; SSE41-NEXT:    retq
217;
218; AVX-LABEL: sext_16i8_to_2i64:
219; AVX:       # BB#0: # %entry
220; AVX-NEXT:    vpmovsxbq %xmm0, %xmm0
221; AVX-NEXT:    retq
222;
223; X32-SSE41-LABEL: sext_16i8_to_2i64:
224; X32-SSE41:       # BB#0: # %entry
225; X32-SSE41-NEXT:    pmovsxbq %xmm0, %xmm0
226; X32-SSE41-NEXT:    retl
227entry:
  ; Keep lanes 0-1 of the 16-lane input.
228  %B = shufflevector <16 x i8> %A, <16 x i8> undef, <2 x i32> <i32 0, i32 1>
  ; Widen each kept i8 lane to i64 with sign extension.
229  %C = sext <2 x i8> %B to <2 x i64>
230  ret <2 x i64> %C
231}
232
; Sign-extends the low four i8 lanes of %A to i64, producing a 256-bit result.
; The SSE4.1 and AVX1 runs expect two pmovsxbq operations, with a psrld $16
; that moves source bytes 2-3 into the low bytes for the second one. The SSE
; runs end with the low half of the result in xmm0 and the high half in xmm1.
; NOTE(review): the AVX2 assertions record vpmovzxbd, a vpslld/vpsrad $24
; pair and then vpmovsxdq rather than one byte-to-qword sign-extending move;
; presumably llc output captured as-is by the autogeneration script - confirm.
233define <4 x i64> @sext_16i8_to_4i64(<16 x i8> %A) nounwind uwtable readnone ssp {
234; SSE2-LABEL: sext_16i8_to_4i64:
235; SSE2:       # BB#0: # %entry
236; SSE2-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
237; SSE2-NEXT:    punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
238; SSE2-NEXT:    movdqa %xmm2, %xmm1
239; SSE2-NEXT:    psrad $31, %xmm1
240; SSE2-NEXT:    psrad $24, %xmm2
241; SSE2-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
242; SSE2-NEXT:    psrld $16, %xmm0
243; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
244; SSE2-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
245; SSE2-NEXT:    movdqa %xmm1, %xmm0
246; SSE2-NEXT:    psrad $31, %xmm0
247; SSE2-NEXT:    psrad $24, %xmm1
248; SSE2-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
249; SSE2-NEXT:    movdqa %xmm2, %xmm0
250; SSE2-NEXT:    retq
251;
252; SSSE3-LABEL: sext_16i8_to_4i64:
253; SSSE3:       # BB#0: # %entry
254; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
255; SSSE3-NEXT:    punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
256; SSSE3-NEXT:    movdqa %xmm2, %xmm1
257; SSSE3-NEXT:    psrad $31, %xmm1
258; SSSE3-NEXT:    psrad $24, %xmm2
259; SSSE3-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
260; SSSE3-NEXT:    psrld $16, %xmm0
261; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
262; SSSE3-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
263; SSSE3-NEXT:    movdqa %xmm1, %xmm0
264; SSSE3-NEXT:    psrad $31, %xmm0
265; SSSE3-NEXT:    psrad $24, %xmm1
266; SSSE3-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
267; SSSE3-NEXT:    movdqa %xmm2, %xmm0
268; SSSE3-NEXT:    retq
269;
270; SSE41-LABEL: sext_16i8_to_4i64:
271; SSE41:       # BB#0: # %entry
272; SSE41-NEXT:    pmovsxbq %xmm0, %xmm2
273; SSE41-NEXT:    psrld $16, %xmm0
274; SSE41-NEXT:    pmovsxbq %xmm0, %xmm1
275; SSE41-NEXT:    movdqa %xmm2, %xmm0
276; SSE41-NEXT:    retq
277;
278; AVX1-LABEL: sext_16i8_to_4i64:
279; AVX1:       # BB#0: # %entry
280; AVX1-NEXT:    vpmovsxbq %xmm0, %xmm1
281; AVX1-NEXT:    vpsrld $16, %xmm0, %xmm0
282; AVX1-NEXT:    vpmovsxbq %xmm0, %xmm0
283; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
284; AVX1-NEXT:    retq
285;
286; AVX2-LABEL: sext_16i8_to_4i64:
287; AVX2:       # BB#0: # %entry
288; AVX2-NEXT:    vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
289; AVX2-NEXT:    vpslld $24, %xmm0, %xmm0
290; AVX2-NEXT:    vpsrad $24, %xmm0, %xmm0
291; AVX2-NEXT:    vpmovsxdq %xmm0, %ymm0
292; AVX2-NEXT:    retq
293;
294; X32-SSE41-LABEL: sext_16i8_to_4i64:
295; X32-SSE41:       # BB#0: # %entry
296; X32-SSE41-NEXT:    pmovsxbq %xmm0, %xmm2
297; X32-SSE41-NEXT:    psrld $16, %xmm0
298; X32-SSE41-NEXT:    pmovsxbq %xmm0, %xmm1
299; X32-SSE41-NEXT:    movdqa %xmm2, %xmm0
300; X32-SSE41-NEXT:    retl
301entry:
  ; Keep lanes 0-3 of the 16-lane input.
302  %B = shufflevector <16 x i8> %A, <16 x i8> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  ; Widen each kept i8 lane to i64 with sign extension.
303  %C = sext <4 x i8> %B to <4 x i64>
304  ret <4 x i64> %C
305}
306
; Sign-extends the low four i16 lanes of %A to i32. The assertions expect a
; single pmovsxwd/vpmovsxwd where SSE4.1 or AVX is enabled, and a punpcklwd
; followed by psrad $16 on the SSE2 and SSSE3 runs.
307define <4 x i32> @sext_8i16_to_4i32(<8 x i16> %A) nounwind uwtable readnone ssp {
308; SSE2-LABEL: sext_8i16_to_4i32:
309; SSE2:       # BB#0: # %entry
310; SSE2-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
311; SSE2-NEXT:    psrad $16, %xmm0
312; SSE2-NEXT:    retq
313;
314; SSSE3-LABEL: sext_8i16_to_4i32:
315; SSSE3:       # BB#0: # %entry
316; SSSE3-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
317; SSSE3-NEXT:    psrad $16, %xmm0
318; SSSE3-NEXT:    retq
319;
320; SSE41-LABEL: sext_8i16_to_4i32:
321; SSE41:       # BB#0: # %entry
322; SSE41-NEXT:    pmovsxwd %xmm0, %xmm0
323; SSE41-NEXT:    retq
324;
325; AVX-LABEL: sext_8i16_to_4i32:
326; AVX:       # BB#0: # %entry
327; AVX-NEXT:    vpmovsxwd %xmm0, %xmm0
328; AVX-NEXT:    retq
329;
330; X32-SSE41-LABEL: sext_8i16_to_4i32:
331; X32-SSE41:       # BB#0: # %entry
332; X32-SSE41-NEXT:    pmovsxwd %xmm0, %xmm0
333; X32-SSE41-NEXT:    retl
334entry:
  ; Keep lanes 0-3 of the 8-lane input.
335  %B = shufflevector <8 x i16> %A, <8 x i16> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  ; Widen each kept i16 lane to i32 with sign extension.
336  %C = sext <4 x i16> %B to <4 x i32>
337  ret <4 x i32> %C
338}
339
; Sign-extends all eight i16 lanes of %A to i32, producing a 256-bit result.
; The AVX2 run expects one vpmovsxwd into ymm0. The AVX1 run expects two
; 128-bit vpmovsxwd results joined with vinsertf128. The SSE runs build the
; result as two 128-bit halves, ending with the low half in xmm0 (copied from
; xmm2) and the high half in xmm1.
340define <8 x i32> @sext_8i16_to_8i32(<8 x i16> %A) nounwind uwtable readnone ssp {
341; SSE2-LABEL: sext_8i16_to_8i32:
342; SSE2:       # BB#0: # %entry
343; SSE2-NEXT:    punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3]
344; SSE2-NEXT:    psrad $16, %xmm2
345; SSE2-NEXT:    punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
346; SSE2-NEXT:    psrad $16, %xmm1
347; SSE2-NEXT:    movdqa %xmm2, %xmm0
348; SSE2-NEXT:    retq
349;
350; SSSE3-LABEL: sext_8i16_to_8i32:
351; SSSE3:       # BB#0: # %entry
352; SSSE3-NEXT:    punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3]
353; SSSE3-NEXT:    psrad $16, %xmm2
354; SSSE3-NEXT:    punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
355; SSSE3-NEXT:    psrad $16, %xmm1
356; SSSE3-NEXT:    movdqa %xmm2, %xmm0
357; SSSE3-NEXT:    retq
358;
359; SSE41-LABEL: sext_8i16_to_8i32:
360; SSE41:       # BB#0: # %entry
361; SSE41-NEXT:    pmovsxwd %xmm0, %xmm2
362; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
363; SSE41-NEXT:    pmovsxwd %xmm0, %xmm1
364; SSE41-NEXT:    movdqa %xmm2, %xmm0
365; SSE41-NEXT:    retq
366;
367; AVX1-LABEL: sext_8i16_to_8i32:
368; AVX1:       # BB#0: # %entry
369; AVX1-NEXT:    vpmovsxwd %xmm0, %xmm1
370; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
371; AVX1-NEXT:    vpmovsxwd %xmm0, %xmm0
372; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
373; AVX1-NEXT:    retq
374;
375; AVX2-LABEL: sext_8i16_to_8i32:
376; AVX2:       # BB#0: # %entry
377; AVX2-NEXT:    vpmovsxwd %xmm0, %ymm0
378; AVX2-NEXT:    retq
379;
380; X32-SSE41-LABEL: sext_8i16_to_8i32:
381; X32-SSE41:       # BB#0: # %entry
382; X32-SSE41-NEXT:    pmovsxwd %xmm0, %xmm2
383; X32-SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
384; X32-SSE41-NEXT:    pmovsxwd %xmm0, %xmm1
385; X32-SSE41-NEXT:    movdqa %xmm2, %xmm0
386; X32-SSE41-NEXT:    retl
387entry:
  ; Full-width extension, no lane selection is needed here.
388  %B = sext <8 x i16> %A to <8 x i32>
389  ret <8 x i32> %B
390}
391
; Sign-extends the low two i16 lanes of %A to i64. The assertions expect a
; single pmovsxwq/vpmovsxwq where SSE4.1 or AVX is enabled. The SSE2 and SSSE3
; runs expect a punpcklwd, a psrad $31 copy for the sign dwords, a psrad $16
; for the value dwords, and a punpckldq that interleaves them.
392define <2 x i64> @sext_8i16_to_2i64(<8 x i16> %A) nounwind uwtable readnone ssp {
393; SSE2-LABEL: sext_8i16_to_2i64:
394; SSE2:       # BB#0: # %entry
395; SSE2-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
396; SSE2-NEXT:    movdqa %xmm0, %xmm1
397; SSE2-NEXT:    psrad $31, %xmm1
398; SSE2-NEXT:    psrad $16, %xmm0
399; SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
400; SSE2-NEXT:    retq
401;
402; SSSE3-LABEL: sext_8i16_to_2i64:
403; SSSE3:       # BB#0: # %entry
404; SSSE3-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
405; SSSE3-NEXT:    movdqa %xmm0, %xmm1
406; SSSE3-NEXT:    psrad $31, %xmm1
407; SSSE3-NEXT:    psrad $16, %xmm0
408; SSSE3-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
409; SSSE3-NEXT:    retq
410;
411; SSE41-LABEL: sext_8i16_to_2i64:
412; SSE41:       # BB#0: # %entry
413; SSE41-NEXT:    pmovsxwq %xmm0, %xmm0
414; SSE41-NEXT:    retq
415;
416; AVX-LABEL: sext_8i16_to_2i64:
417; AVX:       # BB#0: # %entry
418; AVX-NEXT:    vpmovsxwq %xmm0, %xmm0
419; AVX-NEXT:    retq
420;
421; X32-SSE41-LABEL: sext_8i16_to_2i64:
422; X32-SSE41:       # BB#0: # %entry
423; X32-SSE41-NEXT:    pmovsxwq %xmm0, %xmm0
424; X32-SSE41-NEXT:    retl
425entry:
  ; Keep lanes 0-1 of the 8-lane input.
426  %B = shufflevector <8 x i16> %A, <8 x i16> undef, <2 x i32> <i32 0, i32 1>
  ; Widen each kept i16 lane to i64 with sign extension.
427  %C = sext <2 x i16> %B to <2 x i64>
428  ret <2 x i64> %C
429}
430
; Sign-extends the low four i16 lanes of %A to i64, producing a 256-bit result.
; The SSE4.1 and AVX1 runs expect two pmovsxwq operations, with a pshufd that
; brings source words 2-3 into the low dword for the second one. The SSE runs
; end with the low half of the result in xmm0 and the high half in xmm1.
; NOTE(review): the AVX2 assertions record vpmovzxwd, a vpslld/vpsrad $16
; pair and then vpmovsxdq rather than one word-to-qword sign-extending move;
; presumably llc output captured as-is by the autogeneration script - confirm.
431define <4 x i64> @sext_8i16_to_4i64(<8 x i16> %A) nounwind uwtable readnone ssp {
432; SSE2-LABEL: sext_8i16_to_4i64:
433; SSE2:       # BB#0: # %entry
434; SSE2-NEXT:    punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3]
435; SSE2-NEXT:    movdqa %xmm2, %xmm1
436; SSE2-NEXT:    psrad $31, %xmm1
437; SSE2-NEXT:    psrad $16, %xmm2
438; SSE2-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
439; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,2,3]
440; SSE2-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
441; SSE2-NEXT:    movdqa %xmm1, %xmm0
442; SSE2-NEXT:    psrad $31, %xmm0
443; SSE2-NEXT:    psrad $16, %xmm1
444; SSE2-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
445; SSE2-NEXT:    movdqa %xmm2, %xmm0
446; SSE2-NEXT:    retq
447;
448; SSSE3-LABEL: sext_8i16_to_4i64:
449; SSSE3:       # BB#0: # %entry
450; SSSE3-NEXT:    punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3]
451; SSSE3-NEXT:    movdqa %xmm2, %xmm1
452; SSSE3-NEXT:    psrad $31, %xmm1
453; SSSE3-NEXT:    psrad $16, %xmm2
454; SSSE3-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
455; SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,2,3]
456; SSSE3-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
457; SSSE3-NEXT:    movdqa %xmm1, %xmm0
458; SSSE3-NEXT:    psrad $31, %xmm0
459; SSSE3-NEXT:    psrad $16, %xmm1
460; SSSE3-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
461; SSSE3-NEXT:    movdqa %xmm2, %xmm0
462; SSSE3-NEXT:    retq
463;
464; SSE41-LABEL: sext_8i16_to_4i64:
465; SSE41:       # BB#0: # %entry
466; SSE41-NEXT:    pmovsxwq %xmm0, %xmm2
467; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,2,3]
468; SSE41-NEXT:    pmovsxwq %xmm0, %xmm1
469; SSE41-NEXT:    movdqa %xmm2, %xmm0
470; SSE41-NEXT:    retq
471;
472; AVX1-LABEL: sext_8i16_to_4i64:
473; AVX1:       # BB#0: # %entry
474; AVX1-NEXT:    vpmovsxwq %xmm0, %xmm1
475; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[1,1,2,3]
476; AVX1-NEXT:    vpmovsxwq %xmm0, %xmm0
477; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
478; AVX1-NEXT:    retq
479;
480; AVX2-LABEL: sext_8i16_to_4i64:
481; AVX2:       # BB#0: # %entry
482; AVX2-NEXT:    vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
483; AVX2-NEXT:    vpslld $16, %xmm0, %xmm0
484; AVX2-NEXT:    vpsrad $16, %xmm0, %xmm0
485; AVX2-NEXT:    vpmovsxdq %xmm0, %ymm0
486; AVX2-NEXT:    retq
487;
488; X32-SSE41-LABEL: sext_8i16_to_4i64:
489; X32-SSE41:       # BB#0: # %entry
490; X32-SSE41-NEXT:    pmovsxwq %xmm0, %xmm2
491; X32-SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,2,3]
492; X32-SSE41-NEXT:    pmovsxwq %xmm0, %xmm1
493; X32-SSE41-NEXT:    movdqa %xmm2, %xmm0
494; X32-SSE41-NEXT:    retl
495entry:
  ; Keep lanes 0-3 of the 8-lane input.
496  %B = shufflevector <8 x i16> %A, <8 x i16> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  ; Widen each kept i16 lane to i64 with sign extension.
497  %C = sext <4 x i16> %B to <4 x i64>
498  ret <4 x i64> %C
499}
500
; Sign-extends the low two i32 lanes of %A to i64. The assertions expect a
; single pmovsxdq/vpmovsxdq where SSE4.1 or AVX is enabled. The SSE2 and SSSE3
; runs expect a psrad $31 copy that supplies the sign dwords, interleaved with
; the original dwords by punpckldq.
501define <2 x i64> @sext_4i32_to_2i64(<4 x i32> %A) nounwind uwtable readnone ssp {
502; SSE2-LABEL: sext_4i32_to_2i64:
503; SSE2:       # BB#0: # %entry
504; SSE2-NEXT:    movdqa %xmm0, %xmm1
505; SSE2-NEXT:    psrad $31, %xmm1
506; SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
507; SSE2-NEXT:    retq
508;
509; SSSE3-LABEL: sext_4i32_to_2i64:
510; SSSE3:       # BB#0: # %entry
511; SSSE3-NEXT:    movdqa %xmm0, %xmm1
512; SSSE3-NEXT:    psrad $31, %xmm1
513; SSSE3-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
514; SSSE3-NEXT:    retq
515;
516; SSE41-LABEL: sext_4i32_to_2i64:
517; SSE41:       # BB#0: # %entry
518; SSE41-NEXT:    pmovsxdq %xmm0, %xmm0
519; SSE41-NEXT:    retq
520;
521; AVX-LABEL: sext_4i32_to_2i64:
522; AVX:       # BB#0: # %entry
523; AVX-NEXT:    vpmovsxdq %xmm0, %xmm0
524; AVX-NEXT:    retq
525;
526; X32-SSE41-LABEL: sext_4i32_to_2i64:
527; X32-SSE41:       # BB#0: # %entry
528; X32-SSE41-NEXT:    pmovsxdq %xmm0, %xmm0
529; X32-SSE41-NEXT:    retl
530entry:
  ; Keep lanes 0-1 of the 4-lane input.
531  %B = shufflevector <4 x i32> %A, <4 x i32> undef, <2 x i32> <i32 0, i32 1>
  ; Widen each kept i32 lane to i64 with sign extension.
532  %C = sext <2 x i32> %B to <2 x i64>
533  ret <2 x i64> %C
534}
535
; Sign-extends all four i32 lanes of %A to i64, producing a 256-bit result.
; The AVX2 run expects one vpmovsxdq into ymm0. The AVX1 run expects two
; 128-bit vpmovsxdq results joined with vinsertf128. The SSE runs build the
; result as two 128-bit halves in xmm0 (low) and xmm1 (high); the SSE2 and
; SSSE3 runs derive the sign dwords with psrad $31 and interleave them with
; punpckldq.
536define <4 x i64> @sext_4i32_to_4i64(<4 x i32> %A) nounwind uwtable readnone ssp {
537; SSE2-LABEL: sext_4i32_to_4i64:
538; SSE2:       # BB#0: # %entry
539; SSE2-NEXT:    movdqa %xmm0, %xmm2
540; SSE2-NEXT:    psrad $31, %xmm2
541; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
542; SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
543; SSE2-NEXT:    movdqa %xmm1, %xmm2
544; SSE2-NEXT:    psrad $31, %xmm2
545; SSE2-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
546; SSE2-NEXT:    retq
547;
548; SSSE3-LABEL: sext_4i32_to_4i64:
549; SSSE3:       # BB#0: # %entry
550; SSSE3-NEXT:    movdqa %xmm0, %xmm2
551; SSSE3-NEXT:    psrad $31, %xmm2
552; SSSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
553; SSSE3-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
554; SSSE3-NEXT:    movdqa %xmm1, %xmm2
555; SSSE3-NEXT:    psrad $31, %xmm2
556; SSSE3-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
557; SSSE3-NEXT:    retq
558;
559; SSE41-LABEL: sext_4i32_to_4i64:
560; SSE41:       # BB#0: # %entry
561; SSE41-NEXT:    pmovsxdq %xmm0, %xmm2
562; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
563; SSE41-NEXT:    pmovsxdq %xmm0, %xmm1
564; SSE41-NEXT:    movdqa %xmm2, %xmm0
565; SSE41-NEXT:    retq
566;
567; AVX1-LABEL: sext_4i32_to_4i64:
568; AVX1:       # BB#0: # %entry
569; AVX1-NEXT:    vpmovsxdq %xmm0, %xmm1
570; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
571; AVX1-NEXT:    vpmovsxdq %xmm0, %xmm0
572; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
573; AVX1-NEXT:    retq
574;
575; AVX2-LABEL: sext_4i32_to_4i64:
576; AVX2:       # BB#0: # %entry
577; AVX2-NEXT:    vpmovsxdq %xmm0, %ymm0
578; AVX2-NEXT:    retq
579;
580; X32-SSE41-LABEL: sext_4i32_to_4i64:
581; X32-SSE41:       # BB#0: # %entry
582; X32-SSE41-NEXT:    pmovsxdq %xmm0, %xmm2
583; X32-SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
584; X32-SSE41-NEXT:    pmovsxdq %xmm0, %xmm1
585; X32-SSE41-NEXT:    movdqa %xmm2, %xmm0
586; X32-SSE41-NEXT:    retl
587entry:
  ; Full-width extension, no lane selection is needed here.
588  %B = sext <4 x i32> %A to <4 x i64>
589  ret <4 x i64> %B
590}
591
; Loads a <2 x i1> from %ptr and sign-extends each lane to i64. The assertions
; expect the value to be read with movzbl and each bit to be turned into an
; all-zeros or all-ones mask by a left shift into the sign position followed
; by an arithmetic right shift (shlq/sarq $63 on the 64-bit runs, shll/sarl
; $31 on the 32-bit run). The 32-bit run inserts each 32-bit mask twice
; (movd/pinsrd) to fill both halves of its 64-bit lane.
592define <2 x i64> @load_sext_2i1_to_2i64(<2 x i1> *%ptr) {
593; SSE-LABEL: load_sext_2i1_to_2i64:
594; SSE:       # BB#0: # %entry
595; SSE-NEXT:    movzbl (%rdi), %eax
596; SSE-NEXT:    movq %rax, %rcx
597; SSE-NEXT:    shlq $62, %rcx
598; SSE-NEXT:    sarq $63, %rcx
599; SSE-NEXT:    movd %rcx, %xmm1
600; SSE-NEXT:    shlq $63, %rax
601; SSE-NEXT:    sarq $63, %rax
602; SSE-NEXT:    movd %rax, %xmm0
603; SSE-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
604; SSE-NEXT:    retq
605;
606; AVX-LABEL: load_sext_2i1_to_2i64:
607; AVX:       # BB#0: # %entry
608; AVX-NEXT:    movzbl (%rdi), %eax
609; AVX-NEXT:    movq %rax, %rcx
610; AVX-NEXT:    shlq $62, %rcx
611; AVX-NEXT:    sarq $63, %rcx
612; AVX-NEXT:    vmovq %rcx, %xmm0
613; AVX-NEXT:    shlq $63, %rax
614; AVX-NEXT:    sarq $63, %rax
615; AVX-NEXT:    vmovq %rax, %xmm1
616; AVX-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
617; AVX-NEXT:    retq
618;
619; X32-SSE41-LABEL: load_sext_2i1_to_2i64:
620; X32-SSE41:       # BB#0: # %entry
621; X32-SSE41-NEXT:    movl {{[0-9]+}}(%esp), %eax
622; X32-SSE41-NEXT:    movzbl (%eax), %eax
623; X32-SSE41-NEXT:    movl %eax, %ecx
624; X32-SSE41-NEXT:    shll $31, %ecx
625; X32-SSE41-NEXT:    sarl $31, %ecx
626; X32-SSE41-NEXT:    movd %ecx, %xmm0
627; X32-SSE41-NEXT:    pinsrd $1, %ecx, %xmm0
628; X32-SSE41-NEXT:    shll $30, %eax
629; X32-SSE41-NEXT:    sarl $31, %eax
630; X32-SSE41-NEXT:    pinsrd $2, %eax, %xmm0
631; X32-SSE41-NEXT:    pinsrd $3, %eax, %xmm0
632; X32-SSE41-NEXT:    retl
633entry:
 ; Load the two-lane boolean vector from memory.
634 %X = load <2 x i1>, <2 x i1>* %ptr
 ; Widen each i1 lane to i64; a set bit becomes all ones.
635 %Y = sext <2 x i1> %X to <2 x i64>
636 ret <2 x i64> %Y
637}
638
; Loads a <2 x i8> from %ptr and sign-extends each lane to i64. Where SSE4.1
; or AVX is enabled the assertions expect the load to be folded into
; pmovsxbq/vpmovsxbq as a memory operand. The SSE2 and SSSE3 runs expect a
; 16-bit movzwl load, a movd into xmm0, and then the unpack, psrad and
; punpckldq sequence.
639define <2 x i64> @load_sext_2i8_to_2i64(<2 x i8> *%ptr) {
640; SSE2-LABEL: load_sext_2i8_to_2i64:
641; SSE2:       # BB#0: # %entry
642; SSE2-NEXT:    movzwl (%rdi), %eax
643; SSE2-NEXT:    movd %eax, %xmm0
644; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
645; SSE2-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
646; SSE2-NEXT:    movdqa %xmm0, %xmm1
647; SSE2-NEXT:    psrad $31, %xmm1
648; SSE2-NEXT:    psrad $24, %xmm0
649; SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
650; SSE2-NEXT:    retq
651;
652; SSSE3-LABEL: load_sext_2i8_to_2i64:
653; SSSE3:       # BB#0: # %entry
654; SSSE3-NEXT:    movzwl (%rdi), %eax
655; SSSE3-NEXT:    movd %eax, %xmm0
656; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
657; SSSE3-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
658; SSSE3-NEXT:    movdqa %xmm0, %xmm1
659; SSSE3-NEXT:    psrad $31, %xmm1
660; SSSE3-NEXT:    psrad $24, %xmm0
661; SSSE3-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
662; SSSE3-NEXT:    retq
663;
664; SSE41-LABEL: load_sext_2i8_to_2i64:
665; SSE41:       # BB#0: # %entry
666; SSE41-NEXT:    pmovsxbq (%rdi), %xmm0
667; SSE41-NEXT:    retq
668;
669; AVX-LABEL: load_sext_2i8_to_2i64:
670; AVX:       # BB#0: # %entry
671; AVX-NEXT:    vpmovsxbq (%rdi), %xmm0
672; AVX-NEXT:    retq
673;
674; X32-SSE41-LABEL: load_sext_2i8_to_2i64:
675; X32-SSE41:       # BB#0: # %entry
676; X32-SSE41-NEXT:    movl {{[0-9]+}}(%esp), %eax
677; X32-SSE41-NEXT:    pmovsxbq (%eax), %xmm0
678; X32-SSE41-NEXT:    retl
679entry:
 ; Load the two-lane byte vector from memory.
680 %X = load <2 x i8>, <2 x i8>* %ptr
 ; Widen each i8 lane to i64 with sign extension.
681 %Y = sext <2 x i8> %X to <2 x i64>
682 ret <2 x i64> %Y
683}
684
; Loads a <4 x i1> from %ptr and sign-extends each lane to i32. The assertions
; expect each of the four low bits to be turned into an all-zeros or all-ones
; mask by a left shift into the sign position followed by an arithmetic right
; shift (shlq $60..$63 with sarq $63 on the 64-bit runs, shll $28..$31 with
; sarl $31 on the 32-bit run). The SSE2 and SSSE3 runs assemble the lanes with
; movd and punpckldq; the SSE4.1 and AVX runs use movd plus pinsrd.
; NOTE(review): the 32-bit run's assertions record a 32-bit movl load from
; the pointer, while the 64-bit runs record a one-byte movzbl load; presumably
; llc output captured as-is by the autogeneration script - confirm.
685define <4 x i32> @load_sext_4i1_to_4i32(<4 x i1> *%ptr) {
686; SSE2-LABEL: load_sext_4i1_to_4i32:
687; SSE2:       # BB#0: # %entry
688; SSE2-NEXT:    movzbl (%rdi), %eax
689; SSE2-NEXT:    movq %rax, %rcx
690; SSE2-NEXT:    shlq $60, %rcx
691; SSE2-NEXT:    sarq $63, %rcx
692; SSE2-NEXT:    movd %ecx, %xmm0
693; SSE2-NEXT:    movq %rax, %rcx
694; SSE2-NEXT:    shlq $62, %rcx
695; SSE2-NEXT:    sarq $63, %rcx
696; SSE2-NEXT:    movd %ecx, %xmm1
697; SSE2-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
698; SSE2-NEXT:    movq %rax, %rcx
699; SSE2-NEXT:    shlq $61, %rcx
700; SSE2-NEXT:    sarq $63, %rcx
701; SSE2-NEXT:    movd %ecx, %xmm2
702; SSE2-NEXT:    shlq $63, %rax
703; SSE2-NEXT:    sarq $63, %rax
704; SSE2-NEXT:    movd %eax, %xmm0
705; SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
706; SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
707; SSE2-NEXT:    retq
708;
709; SSSE3-LABEL: load_sext_4i1_to_4i32:
710; SSSE3:       # BB#0: # %entry
711; SSSE3-NEXT:    movzbl (%rdi), %eax
712; SSSE3-NEXT:    movq %rax, %rcx
713; SSSE3-NEXT:    shlq $60, %rcx
714; SSSE3-NEXT:    sarq $63, %rcx
715; SSSE3-NEXT:    movd %ecx, %xmm0
716; SSSE3-NEXT:    movq %rax, %rcx
717; SSSE3-NEXT:    shlq $62, %rcx
718; SSSE3-NEXT:    sarq $63, %rcx
719; SSSE3-NEXT:    movd %ecx, %xmm1
720; SSSE3-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
721; SSSE3-NEXT:    movq %rax, %rcx
722; SSSE3-NEXT:    shlq $61, %rcx
723; SSSE3-NEXT:    sarq $63, %rcx
724; SSSE3-NEXT:    movd %ecx, %xmm2
725; SSSE3-NEXT:    shlq $63, %rax
726; SSSE3-NEXT:    sarq $63, %rax
727; SSSE3-NEXT:    movd %eax, %xmm0
728; SSSE3-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
729; SSSE3-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
730; SSSE3-NEXT:    retq
731;
732; SSE41-LABEL: load_sext_4i1_to_4i32:
733; SSE41:       # BB#0: # %entry
734; SSE41-NEXT:    movzbl (%rdi), %eax
735; SSE41-NEXT:    movq %rax, %rcx
736; SSE41-NEXT:    shlq $62, %rcx
737; SSE41-NEXT:    sarq $63, %rcx
738; SSE41-NEXT:    movq %rax, %rdx
739; SSE41-NEXT:    shlq $63, %rdx
740; SSE41-NEXT:    sarq $63, %rdx
741; SSE41-NEXT:    movd %edx, %xmm0
742; SSE41-NEXT:    pinsrd $1, %ecx, %xmm0
743; SSE41-NEXT:    movq %rax, %rcx
744; SSE41-NEXT:    shlq $61, %rcx
745; SSE41-NEXT:    sarq $63, %rcx
746; SSE41-NEXT:    pinsrd $2, %ecx, %xmm0
747; SSE41-NEXT:    shlq $60, %rax
748; SSE41-NEXT:    sarq $63, %rax
749; SSE41-NEXT:    pinsrd $3, %eax, %xmm0
750; SSE41-NEXT:    retq
751;
752; AVX-LABEL: load_sext_4i1_to_4i32:
753; AVX:       # BB#0: # %entry
754; AVX-NEXT:    movzbl (%rdi), %eax
755; AVX-NEXT:    movq %rax, %rcx
756; AVX-NEXT:    shlq $62, %rcx
757; AVX-NEXT:    sarq $63, %rcx
758; AVX-NEXT:    movq %rax, %rdx
759; AVX-NEXT:    shlq $63, %rdx
760; AVX-NEXT:    sarq $63, %rdx
761; AVX-NEXT:    vmovd %edx, %xmm0
762; AVX-NEXT:    vpinsrd $1, %ecx, %xmm0, %xmm0
763; AVX-NEXT:    movq %rax, %rcx
764; AVX-NEXT:    shlq $61, %rcx
765; AVX-NEXT:    sarq $63, %rcx
766; AVX-NEXT:    vpinsrd $2, %ecx, %xmm0, %xmm0
767; AVX-NEXT:    shlq $60, %rax
768; AVX-NEXT:    sarq $63, %rax
769; AVX-NEXT:    vpinsrd $3, %eax, %xmm0, %xmm0
770; AVX-NEXT:    retq
771;
772; X32-SSE41-LABEL: load_sext_4i1_to_4i32:
773; X32-SSE41:       # BB#0: # %entry
774; X32-SSE41-NEXT:    movl {{[0-9]+}}(%esp), %eax
775; X32-SSE41-NEXT:    movl (%eax), %eax
776; X32-SSE41-NEXT:    movl %eax, %ecx
777; X32-SSE41-NEXT:    shll $30, %ecx
778; X32-SSE41-NEXT:    sarl $31, %ecx
779; X32-SSE41-NEXT:    movl %eax, %edx
780; X32-SSE41-NEXT:    shll $31, %edx
781; X32-SSE41-NEXT:    sarl $31, %edx
782; X32-SSE41-NEXT:    movd %edx, %xmm0
783; X32-SSE41-NEXT:    pinsrd $1, %ecx, %xmm0
784; X32-SSE41-NEXT:    movl %eax, %ecx
785; X32-SSE41-NEXT:    shll $29, %ecx
786; X32-SSE41-NEXT:    sarl $31, %ecx
787; X32-SSE41-NEXT:    pinsrd $2, %ecx, %xmm0
788; X32-SSE41-NEXT:    shll $28, %eax
789; X32-SSE41-NEXT:    sarl $31, %eax
790; X32-SSE41-NEXT:    pinsrd $3, %eax, %xmm0
791; X32-SSE41-NEXT:    retl
792entry:
 ; Load the four-lane boolean vector from memory.
793 %X = load <4 x i1>, <4 x i1>* %ptr
 ; Widen each i1 lane to i32; a set bit becomes all ones.
794 %Y = sext <4 x i1> %X to <4 x i32>
795 ret <4 x i32> %Y
796}
797
; Loads a <4 x i8> vector from %ptr and sign-extends every element to i32.
; Expected lowering, as pinned by the checks below: the SSE2 and SSSE3 runs
; load 32 bits with movd, widen with punpcklbw + punpcklwd (which leaves each
; source byte in the top byte of its dword) and finish with psrad $24; the
; SSE4.1 and AVX runs fold the load into a single (v)pmovsxbd.
798define <4 x i32> @load_sext_4i8_to_4i32(<4 x i8> *%ptr) {
799; SSE2-LABEL: load_sext_4i8_to_4i32:
800; SSE2:       # BB#0: # %entry
801; SSE2-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
802; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
803; SSE2-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
804; SSE2-NEXT:    psrad $24, %xmm0
805; SSE2-NEXT:    retq
806;
807; SSSE3-LABEL: load_sext_4i8_to_4i32:
808; SSSE3:       # BB#0: # %entry
809; SSSE3-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
810; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
811; SSSE3-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
812; SSSE3-NEXT:    psrad $24, %xmm0
813; SSSE3-NEXT:    retq
814;
815; SSE41-LABEL: load_sext_4i8_to_4i32:
816; SSE41:       # BB#0: # %entry
817; SSE41-NEXT:    pmovsxbd (%rdi), %xmm0
818; SSE41-NEXT:    retq
819;
820; AVX-LABEL: load_sext_4i8_to_4i32:
821; AVX:       # BB#0: # %entry
822; AVX-NEXT:    vpmovsxbd (%rdi), %xmm0
823; AVX-NEXT:    retq
824;
825; X32-SSE41-LABEL: load_sext_4i8_to_4i32:
826; X32-SSE41:       # BB#0: # %entry
827; X32-SSE41-NEXT:    movl {{[0-9]+}}(%esp), %eax
828; X32-SSE41-NEXT:    pmovsxbd (%eax), %xmm0
829; X32-SSE41-NEXT:    retl
830entry:
831 %X = load <4 x i8>, <4 x i8>* %ptr
832 %Y = sext <4 x i8> %X to <4 x i32>
833 ret <4 x i32> %Y
834}
835
; Loads a <4 x i1> mask from %ptr and sign-extends every bit to an i64 lane
; (0 or -1), returning <4 x i64>.
; The checks pin scalar bit extraction in every configuration: the byte that
; holds the mask is loaded with movzbl, then
;   - the SSE2, SSSE3, SSE4.1 and 32-bit runs isolate each bit with shr/and $1,
;     assemble a 4 x i32 vector, and sign-fill each 64-bit lane using
;     psllq $63 + psrad $31 + pshufd (result in xmm0 and xmm1);
;   - the AVX1 and AVX2 runs turn each bit into 0/-1 on the scalar side with
;     shlq/sarq $63 before inserting it, and return the result in ymm0.
836define <4 x i64> @load_sext_4i1_to_4i64(<4 x i1> *%ptr) {
837; SSE2-LABEL: load_sext_4i1_to_4i64:
838; SSE2:       # BB#0: # %entry
839; SSE2-NEXT:    movzbl (%rdi), %eax
840; SSE2-NEXT:    movl %eax, %ecx
841; SSE2-NEXT:    shrl $3, %ecx
842; SSE2-NEXT:    andl $1, %ecx
843; SSE2-NEXT:    movd %ecx, %xmm0
844; SSE2-NEXT:    movl %eax, %ecx
845; SSE2-NEXT:    shrl %ecx
846; SSE2-NEXT:    andl $1, %ecx
847; SSE2-NEXT:    movd %ecx, %xmm1
848; SSE2-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
849; SSE2-NEXT:    movl %eax, %ecx
850; SSE2-NEXT:    andl $1, %ecx
851; SSE2-NEXT:    movd %ecx, %xmm2
852; SSE2-NEXT:    shrl $2, %eax
853; SSE2-NEXT:    andl $1, %eax
854; SSE2-NEXT:    movd %eax, %xmm0
855; SSE2-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1]
856; SSE2-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
857; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[0,1,1,3]
858; SSE2-NEXT:    psllq $63, %xmm0
859; SSE2-NEXT:    psrad $31, %xmm0
860; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
861; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm2[2,1,3,3]
862; SSE2-NEXT:    psllq $63, %xmm1
863; SSE2-NEXT:    psrad $31, %xmm1
864; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
865; SSE2-NEXT:    retq
866;
867; SSSE3-LABEL: load_sext_4i1_to_4i64:
868; SSSE3:       # BB#0: # %entry
869; SSSE3-NEXT:    movzbl (%rdi), %eax
870; SSSE3-NEXT:    movl %eax, %ecx
871; SSSE3-NEXT:    shrl $3, %ecx
872; SSSE3-NEXT:    andl $1, %ecx
873; SSSE3-NEXT:    movd %ecx, %xmm0
874; SSSE3-NEXT:    movl %eax, %ecx
875; SSSE3-NEXT:    shrl %ecx
876; SSSE3-NEXT:    andl $1, %ecx
877; SSSE3-NEXT:    movd %ecx, %xmm1
878; SSSE3-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
879; SSSE3-NEXT:    movl %eax, %ecx
880; SSSE3-NEXT:    andl $1, %ecx
881; SSSE3-NEXT:    movd %ecx, %xmm2
882; SSSE3-NEXT:    shrl $2, %eax
883; SSSE3-NEXT:    andl $1, %eax
884; SSSE3-NEXT:    movd %eax, %xmm0
885; SSSE3-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1]
886; SSSE3-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
887; SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[0,1,1,3]
888; SSSE3-NEXT:    psllq $63, %xmm0
889; SSSE3-NEXT:    psrad $31, %xmm0
890; SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
891; SSSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm2[2,1,3,3]
892; SSSE3-NEXT:    psllq $63, %xmm1
893; SSSE3-NEXT:    psrad $31, %xmm1
894; SSSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
895; SSSE3-NEXT:    retq
896;
897; SSE41-LABEL: load_sext_4i1_to_4i64:
898; SSE41:       # BB#0: # %entry
899; SSE41-NEXT:    movzbl (%rdi), %eax
900; SSE41-NEXT:    movl %eax, %ecx
901; SSE41-NEXT:    shrl %ecx
902; SSE41-NEXT:    andl $1, %ecx
903; SSE41-NEXT:    movl %eax, %edx
904; SSE41-NEXT:    andl $1, %edx
905; SSE41-NEXT:    movd %edx, %xmm1
906; SSE41-NEXT:    pinsrd $1, %ecx, %xmm1
907; SSE41-NEXT:    movl %eax, %ecx
908; SSE41-NEXT:    shrl $2, %ecx
909; SSE41-NEXT:    andl $1, %ecx
910; SSE41-NEXT:    pinsrd $2, %ecx, %xmm1
911; SSE41-NEXT:    shrl $3, %eax
912; SSE41-NEXT:    andl $1, %eax
913; SSE41-NEXT:    pinsrd $3, %eax, %xmm1
914; SSE41-NEXT:    pmovzxdq {{.*#+}} xmm0 = xmm1[0],zero,xmm1[1],zero
915; SSE41-NEXT:    psllq $63, %xmm0
916; SSE41-NEXT:    psrad $31, %xmm0
917; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
918; SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[2,2,3,3]
919; SSE41-NEXT:    psllq $63, %xmm1
920; SSE41-NEXT:    psrad $31, %xmm1
921; SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
922; SSE41-NEXT:    retq
923;
924; AVX1-LABEL: load_sext_4i1_to_4i64:
925; AVX1:       # BB#0: # %entry
926; AVX1-NEXT:    movzbl (%rdi), %eax
927; AVX1-NEXT:    movq %rax, %rcx
928; AVX1-NEXT:    shlq $62, %rcx
929; AVX1-NEXT:    sarq $63, %rcx
930; AVX1-NEXT:    movq %rax, %rdx
931; AVX1-NEXT:    shlq $63, %rdx
932; AVX1-NEXT:    sarq $63, %rdx
933; AVX1-NEXT:    vmovd %edx, %xmm0
934; AVX1-NEXT:    vpinsrd $1, %ecx, %xmm0, %xmm0
935; AVX1-NEXT:    movq %rax, %rcx
936; AVX1-NEXT:    shlq $61, %rcx
937; AVX1-NEXT:    sarq $63, %rcx
938; AVX1-NEXT:    vpinsrd $2, %ecx, %xmm0, %xmm0
939; AVX1-NEXT:    shlq $60, %rax
940; AVX1-NEXT:    sarq $63, %rax
941; AVX1-NEXT:    vpinsrd $3, %eax, %xmm0, %xmm0
942; AVX1-NEXT:    vpmovsxdq %xmm0, %xmm1
943; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
944; AVX1-NEXT:    vpmovsxdq %xmm0, %xmm0
945; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
946; AVX1-NEXT:    retq
947;
948; AVX2-LABEL: load_sext_4i1_to_4i64:
949; AVX2:       # BB#0: # %entry
950; AVX2-NEXT:    movzbl (%rdi), %eax
951; AVX2-NEXT:    movq %rax, %rcx
952; AVX2-NEXT:    shlq $60, %rcx
953; AVX2-NEXT:    sarq $63, %rcx
954; AVX2-NEXT:    vmovq %rcx, %xmm0
955; AVX2-NEXT:    movq %rax, %rcx
956; AVX2-NEXT:    shlq $61, %rcx
957; AVX2-NEXT:    sarq $63, %rcx
958; AVX2-NEXT:    vmovq %rcx, %xmm1
959; AVX2-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
960; AVX2-NEXT:    movq %rax, %rcx
961; AVX2-NEXT:    shlq $62, %rcx
962; AVX2-NEXT:    sarq $63, %rcx
963; AVX2-NEXT:    vmovq %rcx, %xmm1
964; AVX2-NEXT:    shlq $63, %rax
965; AVX2-NEXT:    sarq $63, %rax
966; AVX2-NEXT:    vmovq %rax, %xmm2
967; AVX2-NEXT:    vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0]
968; AVX2-NEXT:    vinserti128 $1, %xmm0, %ymm1, %ymm0
969; AVX2-NEXT:    retq
970;
971; X32-SSE41-LABEL: load_sext_4i1_to_4i64:
972; X32-SSE41:       # BB#0: # %entry
973; X32-SSE41-NEXT:    movl {{[0-9]+}}(%esp), %eax
974; X32-SSE41-NEXT:    movzbl (%eax), %eax
975; X32-SSE41-NEXT:    movl %eax, %ecx
976; X32-SSE41-NEXT:    shrl %ecx
977; X32-SSE41-NEXT:    andl $1, %ecx
978; X32-SSE41-NEXT:    movl %eax, %edx
979; X32-SSE41-NEXT:    andl $1, %edx
980; X32-SSE41-NEXT:    movd %edx, %xmm1
981; X32-SSE41-NEXT:    pinsrd $1, %ecx, %xmm1
982; X32-SSE41-NEXT:    movl %eax, %ecx
983; X32-SSE41-NEXT:    shrl $2, %ecx
984; X32-SSE41-NEXT:    andl $1, %ecx
985; X32-SSE41-NEXT:    pinsrd $2, %ecx, %xmm1
986; X32-SSE41-NEXT:    shrl $3, %eax
987; X32-SSE41-NEXT:    andl $1, %eax
988; X32-SSE41-NEXT:    pinsrd $3, %eax, %xmm1
989; X32-SSE41-NEXT:    pmovzxdq {{.*#+}} xmm0 = xmm1[0],zero,xmm1[1],zero
990; X32-SSE41-NEXT:    psllq $63, %xmm0
991; X32-SSE41-NEXT:    psrad $31, %xmm0
992; X32-SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
993; X32-SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[2,2,3,3]
994; X32-SSE41-NEXT:    psllq $63, %xmm1
995; X32-SSE41-NEXT:    psrad $31, %xmm1
996; X32-SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
997; X32-SSE41-NEXT:    retl
998entry:
999 %X = load <4 x i1>, <4 x i1>* %ptr
1000 %Y = sext <4 x i1> %X to <4 x i64>
1001 ret <4 x i64> %Y
1002}
1003
; Loads a <4 x i8> vector from %ptr and sign-extends every element to i64.
; Expected lowering, as pinned by the checks below:
;   - the SSE2 and SSSE3 runs sign-extend each byte on its own with movsbq and
;     pair the results with punpcklqdq;
;   - the SSE4.1 runs (64-bit and 32-bit) use two pmovsxbq loads, at byte
;     offsets 0 and 2;
;   - the AVX1 run uses vpmovsxbd, then extends each half with vpmovsxdq and
;     joins the halves with vinsertf128;
;   - the AVX2 run needs only one vpmovsxbq into ymm0.
1004define <4 x i64> @load_sext_4i8_to_4i64(<4 x i8> *%ptr) {
1005; SSE2-LABEL: load_sext_4i8_to_4i64:
1006; SSE2:       # BB#0: # %entry
1007; SSE2-NEXT:    movsbq 1(%rdi), %rax
1008; SSE2-NEXT:    movd %rax, %xmm1
1009; SSE2-NEXT:    movsbq (%rdi), %rax
1010; SSE2-NEXT:    movd %rax, %xmm0
1011; SSE2-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
1012; SSE2-NEXT:    movsbq 3(%rdi), %rax
1013; SSE2-NEXT:    movd %rax, %xmm2
1014; SSE2-NEXT:    movsbq 2(%rdi), %rax
1015; SSE2-NEXT:    movd %rax, %xmm1
1016; SSE2-NEXT:    punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
1017; SSE2-NEXT:    retq
1018;
1019; SSSE3-LABEL: load_sext_4i8_to_4i64:
1020; SSSE3:       # BB#0: # %entry
1021; SSSE3-NEXT:    movsbq 1(%rdi), %rax
1022; SSSE3-NEXT:    movd %rax, %xmm1
1023; SSSE3-NEXT:    movsbq (%rdi), %rax
1024; SSSE3-NEXT:    movd %rax, %xmm0
1025; SSSE3-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
1026; SSSE3-NEXT:    movsbq 3(%rdi), %rax
1027; SSSE3-NEXT:    movd %rax, %xmm2
1028; SSSE3-NEXT:    movsbq 2(%rdi), %rax
1029; SSSE3-NEXT:    movd %rax, %xmm1
1030; SSSE3-NEXT:    punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
1031; SSSE3-NEXT:    retq
1032;
1033; SSE41-LABEL: load_sext_4i8_to_4i64:
1034; SSE41:       # BB#0: # %entry
1035; SSE41-NEXT:    pmovsxbq (%rdi), %xmm0
1036; SSE41-NEXT:    pmovsxbq 2(%rdi), %xmm1
1037; SSE41-NEXT:    retq
1038;
1039; AVX1-LABEL: load_sext_4i8_to_4i64:
1040; AVX1:       # BB#0: # %entry
1041; AVX1-NEXT:    vpmovsxbd (%rdi), %xmm0
1042; AVX1-NEXT:    vpmovsxdq %xmm0, %xmm1
1043; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
1044; AVX1-NEXT:    vpmovsxdq %xmm0, %xmm0
1045; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
1046; AVX1-NEXT:    retq
1047;
1048; AVX2-LABEL: load_sext_4i8_to_4i64:
1049; AVX2:       # BB#0: # %entry
1050; AVX2-NEXT:    vpmovsxbq (%rdi), %ymm0
1051; AVX2-NEXT:    retq
1052;
1053; X32-SSE41-LABEL: load_sext_4i8_to_4i64:
1054; X32-SSE41:       # BB#0: # %entry
1055; X32-SSE41-NEXT:    movl {{[0-9]+}}(%esp), %eax
1056; X32-SSE41-NEXT:    pmovsxbq (%eax), %xmm0
1057; X32-SSE41-NEXT:    pmovsxbq 2(%eax), %xmm1
1058; X32-SSE41-NEXT:    retl
1059entry:
1060 %X = load <4 x i8>, <4 x i8>* %ptr
1061 %Y = sext <4 x i8> %X to <4 x i64>
1062 ret <4 x i64> %Y
1063}
1064
; Loads an <8 x i1> mask from %ptr and sign-extends every bit to an i16 lane
; (0 or -1), returning <8 x i16> in xmm0.
; In every run the mask byte is first sign-extended into a scalar register
; (movsbq, or movsbl on the 32-bit run). Bits 0-6 are each converted to 0/-1
; by shifting the bit to the top of the register and arithmetic-shifting it
; back down; bit 7 is taken with a logical right shift by 7 of the already
; sign-extended value. The lanes are then assembled with movd + punpcklwd on
; the SSE2 and SSSE3 runs, or with (v)pinsrw on the SSE4.1 and AVX runs.
1065define <8 x i16> @load_sext_8i1_to_8i16(<8 x i1> *%ptr) {
1066; SSE2-LABEL: load_sext_8i1_to_8i16:
1067; SSE2:       # BB#0: # %entry
1068; SSE2-NEXT:    movsbq (%rdi), %rax
1069; SSE2-NEXT:    movq %rax, %rcx
1070; SSE2-NEXT:    shrq $7, %rcx
1071; SSE2-NEXT:    movd %ecx, %xmm0
1072; SSE2-NEXT:    movq %rax, %rcx
1073; SSE2-NEXT:    shlq $60, %rcx
1074; SSE2-NEXT:    sarq $63, %rcx
1075; SSE2-NEXT:    movd %ecx, %xmm2
1076; SSE2-NEXT:    punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3]
1077; SSE2-NEXT:    movq %rax, %rcx
1078; SSE2-NEXT:    shlq $58, %rcx
1079; SSE2-NEXT:    sarq $63, %rcx
1080; SSE2-NEXT:    movd %ecx, %xmm0
1081; SSE2-NEXT:    movq %rax, %rcx
1082; SSE2-NEXT:    shlq $62, %rcx
1083; SSE2-NEXT:    sarq $63, %rcx
1084; SSE2-NEXT:    movd %ecx, %xmm1
1085; SSE2-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
1086; SSE2-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
1087; SSE2-NEXT:    movq %rax, %rcx
1088; SSE2-NEXT:    shlq $57, %rcx
1089; SSE2-NEXT:    sarq $63, %rcx
1090; SSE2-NEXT:    movd %ecx, %xmm0
1091; SSE2-NEXT:    movq %rax, %rcx
1092; SSE2-NEXT:    shlq $61, %rcx
1093; SSE2-NEXT:    sarq $63, %rcx
1094; SSE2-NEXT:    movd %ecx, %xmm2
1095; SSE2-NEXT:    punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3]
1096; SSE2-NEXT:    movq %rax, %rcx
1097; SSE2-NEXT:    shlq $59, %rcx
1098; SSE2-NEXT:    sarq $63, %rcx
1099; SSE2-NEXT:    movd %ecx, %xmm3
1100; SSE2-NEXT:    shlq $63, %rax
1101; SSE2-NEXT:    sarq $63, %rax
1102; SSE2-NEXT:    movd %eax, %xmm0
1103; SSE2-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3]
1104; SSE2-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
1105; SSE2-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
1106; SSE2-NEXT:    retq
1107;
1108; SSSE3-LABEL: load_sext_8i1_to_8i16:
1109; SSSE3:       # BB#0: # %entry
1110; SSSE3-NEXT:    movsbq (%rdi), %rax
1111; SSSE3-NEXT:    movq %rax, %rcx
1112; SSSE3-NEXT:    shrq $7, %rcx
1113; SSSE3-NEXT:    movd %ecx, %xmm0
1114; SSSE3-NEXT:    movq %rax, %rcx
1115; SSSE3-NEXT:    shlq $60, %rcx
1116; SSSE3-NEXT:    sarq $63, %rcx
1117; SSSE3-NEXT:    movd %ecx, %xmm2
1118; SSSE3-NEXT:    punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3]
1119; SSSE3-NEXT:    movq %rax, %rcx
1120; SSSE3-NEXT:    shlq $58, %rcx
1121; SSSE3-NEXT:    sarq $63, %rcx
1122; SSSE3-NEXT:    movd %ecx, %xmm0
1123; SSSE3-NEXT:    movq %rax, %rcx
1124; SSSE3-NEXT:    shlq $62, %rcx
1125; SSSE3-NEXT:    sarq $63, %rcx
1126; SSSE3-NEXT:    movd %ecx, %xmm1
1127; SSSE3-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
1128; SSSE3-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
1129; SSSE3-NEXT:    movq %rax, %rcx
1130; SSSE3-NEXT:    shlq $57, %rcx
1131; SSSE3-NEXT:    sarq $63, %rcx
1132; SSSE3-NEXT:    movd %ecx, %xmm0
1133; SSSE3-NEXT:    movq %rax, %rcx
1134; SSSE3-NEXT:    shlq $61, %rcx
1135; SSSE3-NEXT:    sarq $63, %rcx
1136; SSSE3-NEXT:    movd %ecx, %xmm2
1137; SSSE3-NEXT:    punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3]
1138; SSSE3-NEXT:    movq %rax, %rcx
1139; SSSE3-NEXT:    shlq $59, %rcx
1140; SSSE3-NEXT:    sarq $63, %rcx
1141; SSSE3-NEXT:    movd %ecx, %xmm3
1142; SSSE3-NEXT:    shlq $63, %rax
1143; SSSE3-NEXT:    sarq $63, %rax
1144; SSSE3-NEXT:    movd %eax, %xmm0
1145; SSSE3-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3]
1146; SSSE3-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
1147; SSSE3-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
1148; SSSE3-NEXT:    retq
1149;
1150; SSE41-LABEL: load_sext_8i1_to_8i16:
1151; SSE41:       # BB#0: # %entry
1152; SSE41-NEXT:    movsbq (%rdi), %rax
1153; SSE41-NEXT:    movq %rax, %rcx
1154; SSE41-NEXT:    shlq $62, %rcx
1155; SSE41-NEXT:    sarq $63, %rcx
1156; SSE41-NEXT:    movq %rax, %rdx
1157; SSE41-NEXT:    shlq $63, %rdx
1158; SSE41-NEXT:    sarq $63, %rdx
1159; SSE41-NEXT:    movd %edx, %xmm0
1160; SSE41-NEXT:    pinsrw $1, %ecx, %xmm0
1161; SSE41-NEXT:    movq %rax, %rcx
1162; SSE41-NEXT:    shlq $61, %rcx
1163; SSE41-NEXT:    sarq $63, %rcx
1164; SSE41-NEXT:    pinsrw $2, %ecx, %xmm0
1165; SSE41-NEXT:    movq %rax, %rcx
1166; SSE41-NEXT:    shlq $60, %rcx
1167; SSE41-NEXT:    sarq $63, %rcx
1168; SSE41-NEXT:    pinsrw $3, %ecx, %xmm0
1169; SSE41-NEXT:    movq %rax, %rcx
1170; SSE41-NEXT:    shlq $59, %rcx
1171; SSE41-NEXT:    sarq $63, %rcx
1172; SSE41-NEXT:    pinsrw $4, %ecx, %xmm0
1173; SSE41-NEXT:    movq %rax, %rcx
1174; SSE41-NEXT:    shlq $58, %rcx
1175; SSE41-NEXT:    sarq $63, %rcx
1176; SSE41-NEXT:    pinsrw $5, %ecx, %xmm0
1177; SSE41-NEXT:    movq %rax, %rcx
1178; SSE41-NEXT:    shlq $57, %rcx
1179; SSE41-NEXT:    sarq $63, %rcx
1180; SSE41-NEXT:    pinsrw $6, %ecx, %xmm0
1181; SSE41-NEXT:    shrq $7, %rax
1182; SSE41-NEXT:    pinsrw $7, %eax, %xmm0
1183; SSE41-NEXT:    retq
1184;
1185; AVX-LABEL: load_sext_8i1_to_8i16:
1186; AVX:       # BB#0: # %entry
1187; AVX-NEXT:    movsbq (%rdi), %rax
1188; AVX-NEXT:    movq %rax, %rcx
1189; AVX-NEXT:    shlq $62, %rcx
1190; AVX-NEXT:    sarq $63, %rcx
1191; AVX-NEXT:    movq %rax, %rdx
1192; AVX-NEXT:    shlq $63, %rdx
1193; AVX-NEXT:    sarq $63, %rdx
1194; AVX-NEXT:    vmovd %edx, %xmm0
1195; AVX-NEXT:    vpinsrw $1, %ecx, %xmm0, %xmm0
1196; AVX-NEXT:    movq %rax, %rcx
1197; AVX-NEXT:    shlq $61, %rcx
1198; AVX-NEXT:    sarq $63, %rcx
1199; AVX-NEXT:    vpinsrw $2, %ecx, %xmm0, %xmm0
1200; AVX-NEXT:    movq %rax, %rcx
1201; AVX-NEXT:    shlq $60, %rcx
1202; AVX-NEXT:    sarq $63, %rcx
1203; AVX-NEXT:    vpinsrw $3, %ecx, %xmm0, %xmm0
1204; AVX-NEXT:    movq %rax, %rcx
1205; AVX-NEXT:    shlq $59, %rcx
1206; AVX-NEXT:    sarq $63, %rcx
1207; AVX-NEXT:    vpinsrw $4, %ecx, %xmm0, %xmm0
1208; AVX-NEXT:    movq %rax, %rcx
1209; AVX-NEXT:    shlq $58, %rcx
1210; AVX-NEXT:    sarq $63, %rcx
1211; AVX-NEXT:    vpinsrw $5, %ecx, %xmm0, %xmm0
1212; AVX-NEXT:    movq %rax, %rcx
1213; AVX-NEXT:    shlq $57, %rcx
1214; AVX-NEXT:    sarq $63, %rcx
1215; AVX-NEXT:    vpinsrw $6, %ecx, %xmm0, %xmm0
1216; AVX-NEXT:    shrq $7, %rax
1217; AVX-NEXT:    vpinsrw $7, %eax, %xmm0, %xmm0
1218; AVX-NEXT:    retq
1219;
1220; X32-SSE41-LABEL: load_sext_8i1_to_8i16:
1221; X32-SSE41:       # BB#0: # %entry
1222; X32-SSE41-NEXT:    movl {{[0-9]+}}(%esp), %eax
1223; X32-SSE41-NEXT:    movsbl (%eax), %eax
1224; X32-SSE41-NEXT:    movl %eax, %ecx
1225; X32-SSE41-NEXT:    shll $30, %ecx
1226; X32-SSE41-NEXT:    sarl $31, %ecx
1227; X32-SSE41-NEXT:    movl %eax, %edx
1228; X32-SSE41-NEXT:    shll $31, %edx
1229; X32-SSE41-NEXT:    sarl $31, %edx
1230; X32-SSE41-NEXT:    movd %edx, %xmm0
1231; X32-SSE41-NEXT:    pinsrw $1, %ecx, %xmm0
1232; X32-SSE41-NEXT:    movl %eax, %ecx
1233; X32-SSE41-NEXT:    shll $29, %ecx
1234; X32-SSE41-NEXT:    sarl $31, %ecx
1235; X32-SSE41-NEXT:    pinsrw $2, %ecx, %xmm0
1236; X32-SSE41-NEXT:    movl %eax, %ecx
1237; X32-SSE41-NEXT:    shll $28, %ecx
1238; X32-SSE41-NEXT:    sarl $31, %ecx
1239; X32-SSE41-NEXT:    pinsrw $3, %ecx, %xmm0
1240; X32-SSE41-NEXT:    movl %eax, %ecx
1241; X32-SSE41-NEXT:    shll $27, %ecx
1242; X32-SSE41-NEXT:    sarl $31, %ecx
1243; X32-SSE41-NEXT:    pinsrw $4, %ecx, %xmm0
1244; X32-SSE41-NEXT:    movl %eax, %ecx
1245; X32-SSE41-NEXT:    shll $26, %ecx
1246; X32-SSE41-NEXT:    sarl $31, %ecx
1247; X32-SSE41-NEXT:    pinsrw $5, %ecx, %xmm0
1248; X32-SSE41-NEXT:    movl %eax, %ecx
1249; X32-SSE41-NEXT:    shll $25, %ecx
1250; X32-SSE41-NEXT:    sarl $31, %ecx
1251; X32-SSE41-NEXT:    pinsrw $6, %ecx, %xmm0
1252; X32-SSE41-NEXT:    shrl $7, %eax
1253; X32-SSE41-NEXT:    pinsrw $7, %eax, %xmm0
1254; X32-SSE41-NEXT:    retl
1255entry:
1256 %X = load <8 x i1>, <8 x i1>* %ptr
1257 %Y = sext <8 x i1> %X to <8 x i16>
1258 ret <8 x i16> %Y
1259}
1260
; Loads an <8 x i8> vector from %ptr and sign-extends every element to i16.
; Expected lowering, as pinned by the checks below: the SSE2 and SSSE3 runs
; load 64 bits with movq, duplicate each byte into a word with punpcklbw and
; arithmetic-shift right by 8 (psraw); the SSE4.1 and AVX runs fold the load
; into a single (v)pmovsxbw.
1261define <8 x i16> @load_sext_8i8_to_8i16(<8 x i8> *%ptr) {
1262; SSE2-LABEL: load_sext_8i8_to_8i16:
1263; SSE2:       # BB#0: # %entry
1264; SSE2-NEXT:    movq {{.*#+}} xmm0 = mem[0],zero
1265; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
1266; SSE2-NEXT:    psraw $8, %xmm0
1267; SSE2-NEXT:    retq
1268;
1269; SSSE3-LABEL: load_sext_8i8_to_8i16:
1270; SSSE3:       # BB#0: # %entry
1271; SSSE3-NEXT:    movq {{.*#+}} xmm0 = mem[0],zero
1272; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
1273; SSSE3-NEXT:    psraw $8, %xmm0
1274; SSSE3-NEXT:    retq
1275;
1276; SSE41-LABEL: load_sext_8i8_to_8i16:
1277; SSE41:       # BB#0: # %entry
1278; SSE41-NEXT:    pmovsxbw (%rdi), %xmm0
1279; SSE41-NEXT:    retq
1280;
1281; AVX-LABEL: load_sext_8i8_to_8i16:
1282; AVX:       # BB#0: # %entry
1283; AVX-NEXT:    vpmovsxbw (%rdi), %xmm0
1284; AVX-NEXT:    retq
1285;
1286; X32-SSE41-LABEL: load_sext_8i8_to_8i16:
1287; X32-SSE41:       # BB#0: # %entry
1288; X32-SSE41-NEXT:    movl {{[0-9]+}}(%esp), %eax
1289; X32-SSE41-NEXT:    pmovsxbw (%eax), %xmm0
1290; X32-SSE41-NEXT:    retl
1291entry:
1292 %X = load <8 x i8>, <8 x i8>* %ptr
1293 %Y = sext <8 x i8> %X to <8 x i16>
1294 ret <8 x i16> %Y
1295}
1296
; Loads an <8 x i1> mask from %ptr and sign-extends every bit to an i32 lane
; (0 or -1), returning <8 x i32> (xmm0 and xmm1 on the SSE runs, ymm0 on the
; AVX runs).
; Expected lowering, as pinned by the checks below:
;   - the SSE2, SSSE3, SSE4.1 and 32-bit runs load the mask with movzbl,
;     isolate each bit with shr/and $1 (bit 7 needs only the shift), build an
;     8 x i16 vector, widen each half to dwords and sign-fill every lane with
;     pslld $31 + psrad $31;
;   - the AVX1 and AVX2 runs load the mask with movsbq, turn each bit into
;     0/-1 with shlq/sarq $63 (shrq $7 for bit 7), insert the values with
;     vpinsrd into two xmm halves and join them with vinsertf128 or
;     vinserti128 respectively.
1297define <8 x i32> @load_sext_8i1_to_8i32(<8 x i1> *%ptr) {
1298; SSE2-LABEL: load_sext_8i1_to_8i32:
1299; SSE2:       # BB#0: # %entry
1300; SSE2-NEXT:    movzbl (%rdi), %eax
1301; SSE2-NEXT:    movl %eax, %ecx
1302; SSE2-NEXT:    shrl $6, %ecx
1303; SSE2-NEXT:    andl $1, %ecx
1304; SSE2-NEXT:    movd %ecx, %xmm0
1305; SSE2-NEXT:    movl %eax, %ecx
1306; SSE2-NEXT:    shrl $2, %ecx
1307; SSE2-NEXT:    andl $1, %ecx
1308; SSE2-NEXT:    movd %ecx, %xmm2
1309; SSE2-NEXT:    punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3]
1310; SSE2-NEXT:    movl %eax, %ecx
1311; SSE2-NEXT:    andl $1, %ecx
1312; SSE2-NEXT:    movd %ecx, %xmm1
1313; SSE2-NEXT:    movl %eax, %ecx
1314; SSE2-NEXT:    shrl $4, %ecx
1315; SSE2-NEXT:    andl $1, %ecx
1316; SSE2-NEXT:    movd %ecx, %xmm0
1317; SSE2-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
1318; SSE2-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
1319; SSE2-NEXT:    movl %eax, %ecx
1320; SSE2-NEXT:    shrl $5, %ecx
1321; SSE2-NEXT:    andl $1, %ecx
1322; SSE2-NEXT:    movd %ecx, %xmm0
1323; SSE2-NEXT:    movl %eax, %ecx
1324; SSE2-NEXT:    shrl %ecx
1325; SSE2-NEXT:    andl $1, %ecx
1326; SSE2-NEXT:    movd %ecx, %xmm2
1327; SSE2-NEXT:    punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3]
1328; SSE2-NEXT:    movl %eax, %ecx
1329; SSE2-NEXT:    shrl $3, %ecx
1330; SSE2-NEXT:    andl $1, %ecx
1331; SSE2-NEXT:    movd %ecx, %xmm0
1332; SSE2-NEXT:    shrl $7, %eax
1333; SSE2-NEXT:    movzwl %ax, %eax
1334; SSE2-NEXT:    movd %eax, %xmm3
1335; SSE2-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3]
1336; SSE2-NEXT:    punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3]
1337; SSE2-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
1338; SSE2-NEXT:    movdqa %xmm1, %xmm0
1339; SSE2-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
1340; SSE2-NEXT:    pslld $31, %xmm0
1341; SSE2-NEXT:    psrad $31, %xmm0
1342; SSE2-NEXT:    punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
1343; SSE2-NEXT:    pslld $31, %xmm1
1344; SSE2-NEXT:    psrad $31, %xmm1
1345; SSE2-NEXT:    retq
1346;
1347; SSSE3-LABEL: load_sext_8i1_to_8i32:
1348; SSSE3:       # BB#0: # %entry
1349; SSSE3-NEXT:    movzbl (%rdi), %eax
1350; SSSE3-NEXT:    movl %eax, %ecx
1351; SSSE3-NEXT:    shrl $6, %ecx
1352; SSSE3-NEXT:    andl $1, %ecx
1353; SSSE3-NEXT:    movd %ecx, %xmm0
1354; SSSE3-NEXT:    movl %eax, %ecx
1355; SSSE3-NEXT:    shrl $2, %ecx
1356; SSSE3-NEXT:    andl $1, %ecx
1357; SSSE3-NEXT:    movd %ecx, %xmm2
1358; SSSE3-NEXT:    punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3]
1359; SSSE3-NEXT:    movl %eax, %ecx
1360; SSSE3-NEXT:    andl $1, %ecx
1361; SSSE3-NEXT:    movd %ecx, %xmm1
1362; SSSE3-NEXT:    movl %eax, %ecx
1363; SSSE3-NEXT:    shrl $4, %ecx
1364; SSSE3-NEXT:    andl $1, %ecx
1365; SSSE3-NEXT:    movd %ecx, %xmm0
1366; SSSE3-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
1367; SSSE3-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
1368; SSSE3-NEXT:    movl %eax, %ecx
1369; SSSE3-NEXT:    shrl $5, %ecx
1370; SSSE3-NEXT:    andl $1, %ecx
1371; SSSE3-NEXT:    movd %ecx, %xmm0
1372; SSSE3-NEXT:    movl %eax, %ecx
1373; SSSE3-NEXT:    shrl %ecx
1374; SSSE3-NEXT:    andl $1, %ecx
1375; SSSE3-NEXT:    movd %ecx, %xmm2
1376; SSSE3-NEXT:    punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3]
1377; SSSE3-NEXT:    movl %eax, %ecx
1378; SSSE3-NEXT:    shrl $3, %ecx
1379; SSSE3-NEXT:    andl $1, %ecx
1380; SSSE3-NEXT:    movd %ecx, %xmm0
1381; SSSE3-NEXT:    shrl $7, %eax
1382; SSSE3-NEXT:    movzwl %ax, %eax
1383; SSSE3-NEXT:    movd %eax, %xmm3
1384; SSSE3-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3]
1385; SSSE3-NEXT:    punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3]
1386; SSSE3-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
1387; SSSE3-NEXT:    movdqa %xmm1, %xmm0
1388; SSSE3-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
1389; SSSE3-NEXT:    pslld $31, %xmm0
1390; SSSE3-NEXT:    psrad $31, %xmm0
1391; SSSE3-NEXT:    punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
1392; SSSE3-NEXT:    pslld $31, %xmm1
1393; SSSE3-NEXT:    psrad $31, %xmm1
1394; SSSE3-NEXT:    retq
1395;
1396; SSE41-LABEL: load_sext_8i1_to_8i32:
1397; SSE41:       # BB#0: # %entry
1398; SSE41-NEXT:    movzbl (%rdi), %eax
1399; SSE41-NEXT:    movl %eax, %ecx
1400; SSE41-NEXT:    shrl %ecx
1401; SSE41-NEXT:    andl $1, %ecx
1402; SSE41-NEXT:    movl %eax, %edx
1403; SSE41-NEXT:    andl $1, %edx
1404; SSE41-NEXT:    movd %edx, %xmm1
1405; SSE41-NEXT:    pinsrw $1, %ecx, %xmm1
1406; SSE41-NEXT:    movl %eax, %ecx
1407; SSE41-NEXT:    shrl $2, %ecx
1408; SSE41-NEXT:    andl $1, %ecx
1409; SSE41-NEXT:    pinsrw $2, %ecx, %xmm1
1410; SSE41-NEXT:    movl %eax, %ecx
1411; SSE41-NEXT:    shrl $3, %ecx
1412; SSE41-NEXT:    andl $1, %ecx
1413; SSE41-NEXT:    pinsrw $3, %ecx, %xmm1
1414; SSE41-NEXT:    movl %eax, %ecx
1415; SSE41-NEXT:    shrl $4, %ecx
1416; SSE41-NEXT:    andl $1, %ecx
1417; SSE41-NEXT:    pinsrw $4, %ecx, %xmm1
1418; SSE41-NEXT:    movl %eax, %ecx
1419; SSE41-NEXT:    shrl $5, %ecx
1420; SSE41-NEXT:    andl $1, %ecx
1421; SSE41-NEXT:    pinsrw $5, %ecx, %xmm1
1422; SSE41-NEXT:    movl %eax, %ecx
1423; SSE41-NEXT:    shrl $6, %ecx
1424; SSE41-NEXT:    andl $1, %ecx
1425; SSE41-NEXT:    pinsrw $6, %ecx, %xmm1
1426; SSE41-NEXT:    shrl $7, %eax
1427; SSE41-NEXT:    movzwl %ax, %eax
1428; SSE41-NEXT:    pinsrw $7, %eax, %xmm1
1429; SSE41-NEXT:    pmovzxwd {{.*#+}} xmm0 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
1430; SSE41-NEXT:    pslld $31, %xmm0
1431; SSE41-NEXT:    psrad $31, %xmm0
1432; SSE41-NEXT:    punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
1433; SSE41-NEXT:    pslld $31, %xmm1
1434; SSE41-NEXT:    psrad $31, %xmm1
1435; SSE41-NEXT:    retq
1436;
1437; AVX1-LABEL: load_sext_8i1_to_8i32:
1438; AVX1:       # BB#0: # %entry
1439; AVX1-NEXT:    movsbq (%rdi), %rax
1440; AVX1-NEXT:    movq %rax, %rcx
1441; AVX1-NEXT:    shlq $58, %rcx
1442; AVX1-NEXT:    sarq $63, %rcx
1443; AVX1-NEXT:    movq %rax, %rdx
1444; AVX1-NEXT:    shlq $59, %rdx
1445; AVX1-NEXT:    sarq $63, %rdx
1446; AVX1-NEXT:    vmovd %edx, %xmm0
1447; AVX1-NEXT:    vpinsrd $1, %ecx, %xmm0, %xmm0
1448; AVX1-NEXT:    movq %rax, %rcx
1449; AVX1-NEXT:    shlq $57, %rcx
1450; AVX1-NEXT:    sarq $63, %rcx
1451; AVX1-NEXT:    vpinsrd $2, %ecx, %xmm0, %xmm0
1452; AVX1-NEXT:    movq %rax, %rcx
1453; AVX1-NEXT:    shrq $7, %rcx
1454; AVX1-NEXT:    vpinsrd $3, %ecx, %xmm0, %xmm0
1455; AVX1-NEXT:    movq %rax, %rcx
1456; AVX1-NEXT:    shlq $62, %rcx
1457; AVX1-NEXT:    sarq $63, %rcx
1458; AVX1-NEXT:    movq %rax, %rdx
1459; AVX1-NEXT:    shlq $63, %rdx
1460; AVX1-NEXT:    sarq $63, %rdx
1461; AVX1-NEXT:    vmovd %edx, %xmm1
1462; AVX1-NEXT:    vpinsrd $1, %ecx, %xmm1, %xmm1
1463; AVX1-NEXT:    movq %rax, %rcx
1464; AVX1-NEXT:    shlq $61, %rcx
1465; AVX1-NEXT:    sarq $63, %rcx
1466; AVX1-NEXT:    vpinsrd $2, %ecx, %xmm1, %xmm1
1467; AVX1-NEXT:    shlq $60, %rax
1468; AVX1-NEXT:    sarq $63, %rax
1469; AVX1-NEXT:    vpinsrd $3, %eax, %xmm1, %xmm1
1470; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
1471; AVX1-NEXT:    retq
1472;
1473; AVX2-LABEL: load_sext_8i1_to_8i32:
1474; AVX2:       # BB#0: # %entry
1475; AVX2-NEXT:    movsbq (%rdi), %rax
1476; AVX2-NEXT:    movq %rax, %rcx
1477; AVX2-NEXT:    shlq $58, %rcx
1478; AVX2-NEXT:    sarq $63, %rcx
1479; AVX2-NEXT:    movq %rax, %rdx
1480; AVX2-NEXT:    shlq $59, %rdx
1481; AVX2-NEXT:    sarq $63, %rdx
1482; AVX2-NEXT:    vmovd %edx, %xmm0
1483; AVX2-NEXT:    vpinsrd $1, %ecx, %xmm0, %xmm0
1484; AVX2-NEXT:    movq %rax, %rcx
1485; AVX2-NEXT:    shlq $57, %rcx
1486; AVX2-NEXT:    sarq $63, %rcx
1487; AVX2-NEXT:    vpinsrd $2, %ecx, %xmm0, %xmm0
1488; AVX2-NEXT:    movq %rax, %rcx
1489; AVX2-NEXT:    shrq $7, %rcx
1490; AVX2-NEXT:    vpinsrd $3, %ecx, %xmm0, %xmm0
1491; AVX2-NEXT:    movq %rax, %rcx
1492; AVX2-NEXT:    shlq $62, %rcx
1493; AVX2-NEXT:    sarq $63, %rcx
1494; AVX2-NEXT:    movq %rax, %rdx
1495; AVX2-NEXT:    shlq $63, %rdx
1496; AVX2-NEXT:    sarq $63, %rdx
1497; AVX2-NEXT:    vmovd %edx, %xmm1
1498; AVX2-NEXT:    vpinsrd $1, %ecx, %xmm1, %xmm1
1499; AVX2-NEXT:    movq %rax, %rcx
1500; AVX2-NEXT:    shlq $61, %rcx
1501; AVX2-NEXT:    sarq $63, %rcx
1502; AVX2-NEXT:    vpinsrd $2, %ecx, %xmm1, %xmm1
1503; AVX2-NEXT:    shlq $60, %rax
1504; AVX2-NEXT:    sarq $63, %rax
1505; AVX2-NEXT:    vpinsrd $3, %eax, %xmm1, %xmm1
1506; AVX2-NEXT:    vinserti128 $1, %xmm0, %ymm1, %ymm0
1507; AVX2-NEXT:    retq
1508;
1509; X32-SSE41-LABEL: load_sext_8i1_to_8i32:
1510; X32-SSE41:       # BB#0: # %entry
1511; X32-SSE41-NEXT:    movl {{[0-9]+}}(%esp), %eax
1512; X32-SSE41-NEXT:    movzbl (%eax), %eax
1513; X32-SSE41-NEXT:    movl %eax, %ecx
1514; X32-SSE41-NEXT:    shrl %ecx
1515; X32-SSE41-NEXT:    andl $1, %ecx
1516; X32-SSE41-NEXT:    movl %eax, %edx
1517; X32-SSE41-NEXT:    andl $1, %edx
1518; X32-SSE41-NEXT:    movd %edx, %xmm1
1519; X32-SSE41-NEXT:    pinsrw $1, %ecx, %xmm1
1520; X32-SSE41-NEXT:    movl %eax, %ecx
1521; X32-SSE41-NEXT:    shrl $2, %ecx
1522; X32-SSE41-NEXT:    andl $1, %ecx
1523; X32-SSE41-NEXT:    pinsrw $2, %ecx, %xmm1
1524; X32-SSE41-NEXT:    movl %eax, %ecx
1525; X32-SSE41-NEXT:    shrl $3, %ecx
1526; X32-SSE41-NEXT:    andl $1, %ecx
1527; X32-SSE41-NEXT:    pinsrw $3, %ecx, %xmm1
1528; X32-SSE41-NEXT:    movl %eax, %ecx
1529; X32-SSE41-NEXT:    shrl $4, %ecx
1530; X32-SSE41-NEXT:    andl $1, %ecx
1531; X32-SSE41-NEXT:    pinsrw $4, %ecx, %xmm1
1532; X32-SSE41-NEXT:    movl %eax, %ecx
1533; X32-SSE41-NEXT:    shrl $5, %ecx
1534; X32-SSE41-NEXT:    andl $1, %ecx
1535; X32-SSE41-NEXT:    pinsrw $5, %ecx, %xmm1
1536; X32-SSE41-NEXT:    movl %eax, %ecx
1537; X32-SSE41-NEXT:    shrl $6, %ecx
1538; X32-SSE41-NEXT:    andl $1, %ecx
1539; X32-SSE41-NEXT:    pinsrw $6, %ecx, %xmm1
1540; X32-SSE41-NEXT:    shrl $7, %eax
1541; X32-SSE41-NEXT:    pinsrw $7, %eax, %xmm1
1542; X32-SSE41-NEXT:    pmovzxwd {{.*#+}} xmm0 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
1543; X32-SSE41-NEXT:    pslld $31, %xmm0
1544; X32-SSE41-NEXT:    psrad $31, %xmm0
1545; X32-SSE41-NEXT:    punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
1546; X32-SSE41-NEXT:    pslld $31, %xmm1
1547; X32-SSE41-NEXT:    psrad $31, %xmm1
1548; X32-SSE41-NEXT:    retl
1549entry:
1550 %X = load <8 x i1>, <8 x i1>* %ptr
1551 %Y = sext <8 x i1> %X to <8 x i32>
1552 ret <8 x i32> %Y
1553}
1554
; Loads an <8 x i8> vector from %ptr and sign-extends every element to i32.
; Expected lowering, as pinned by the checks below:
;   - the SSE2 and SSSE3 runs process each group of four bytes with
;     movd + punpcklbw + punpcklwd + psrad $24 (result in xmm0 and xmm1);
;   - the SSE4.1 runs (64-bit and 32-bit) use two pmovsxbd loads, at byte
;     offsets 0 and 4;
;   - the AVX1 run uses vpmovsxbw, then extends each half with vpmovsxwd and
;     joins the halves with vinsertf128;
;   - the AVX2 run needs only one vpmovsxbd into ymm0.
1555define <8 x i32> @load_sext_8i8_to_8i32(<8 x i8> *%ptr) {
1556; SSE2-LABEL: load_sext_8i8_to_8i32:
1557; SSE2:       # BB#0: # %entry
1558; SSE2-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
1559; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
1560; SSE2-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
1561; SSE2-NEXT:    psrad $24, %xmm0
1562; SSE2-NEXT:    movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
1563; SSE2-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
1564; SSE2-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3]
1565; SSE2-NEXT:    psrad $24, %xmm1
1566; SSE2-NEXT:    retq
1567;
1568; SSSE3-LABEL: load_sext_8i8_to_8i32:
1569; SSSE3:       # BB#0: # %entry
1570; SSSE3-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
1571; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
1572; SSSE3-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
1573; SSSE3-NEXT:    psrad $24, %xmm0
1574; SSSE3-NEXT:    movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
1575; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
1576; SSSE3-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3]
1577; SSSE3-NEXT:    psrad $24, %xmm1
1578; SSSE3-NEXT:    retq
1579;
1580; SSE41-LABEL: load_sext_8i8_to_8i32:
1581; SSE41:       # BB#0: # %entry
1582; SSE41-NEXT:    pmovsxbd (%rdi), %xmm0
1583; SSE41-NEXT:    pmovsxbd 4(%rdi), %xmm1
1584; SSE41-NEXT:    retq
1585;
1586; AVX1-LABEL: load_sext_8i8_to_8i32:
1587; AVX1:       # BB#0: # %entry
1588; AVX1-NEXT:    vpmovsxbw (%rdi), %xmm0
1589; AVX1-NEXT:    vpmovsxwd %xmm0, %xmm1
1590; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
1591; AVX1-NEXT:    vpmovsxwd %xmm0, %xmm0
1592; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
1593; AVX1-NEXT:    retq
1594;
1595; AVX2-LABEL: load_sext_8i8_to_8i32:
1596; AVX2:       # BB#0: # %entry
1597; AVX2-NEXT:    vpmovsxbd (%rdi), %ymm0
1598; AVX2-NEXT:    retq
1599;
1600; X32-SSE41-LABEL: load_sext_8i8_to_8i32:
1601; X32-SSE41:       # BB#0: # %entry
1602; X32-SSE41-NEXT:    movl {{[0-9]+}}(%esp), %eax
1603; X32-SSE41-NEXT:    pmovsxbd (%eax), %xmm0
1604; X32-SSE41-NEXT:    pmovsxbd 4(%eax), %xmm1
1605; X32-SSE41-NEXT:    retl
1606entry:
1607 %X = load <8 x i8>, <8 x i8>* %ptr
1608 %Y = sext <8 x i8> %X to <8 x i32>
1609 ret <8 x i32> %Y
1610}
1611
; Sign-extending mask load test. The function reads a <16 x i1> value through
; %ptr and widens every bit to an i8 lane, returning a <16 x i8> result.
;
; The function is declared readonly, not readnone, because its body performs a
; load through %ptr. Per the LLVM Language Reference, a readnone function that
; reads program-visible memory has undefined behavior. The nounwind attribute
; is left exactly as it was.
;
; The assertion lines inside the function are generated output (see the NOTE at
; the top of this file) and must not be edited by hand.
; NOTE(review): the attribute change is not expected to alter llc output, so the
; assertions were kept byte-for-byte; confirm by rerunning
; utils/update_llc_test_checks.py on this file.
1612define <16 x i8> @load_sext_16i1_to_16i8(<16 x i1> *%ptr) nounwind readonly {
1613; SSE2-LABEL: load_sext_16i1_to_16i8:
1614; SSE2:       # BB#0: # %entry
1615; SSE2-NEXT:    pushq %rbp
1616; SSE2-NEXT:    pushq %r15
1617; SSE2-NEXT:    pushq %r14
1618; SSE2-NEXT:    pushq %r13
1619; SSE2-NEXT:    pushq %r12
1620; SSE2-NEXT:    pushq %rbx
1621; SSE2-NEXT:    movswq (%rdi), %rax
1622; SSE2-NEXT:    movq %rax, %r8
1623; SSE2-NEXT:    movq %rax, %r9
1624; SSE2-NEXT:    movq %rax, %r10
1625; SSE2-NEXT:    movq %rax, %r11
1626; SSE2-NEXT:    movq %rax, %r14
1627; SSE2-NEXT:    movq %rax, %r15
1628; SSE2-NEXT:    movq %rax, %r12
1629; SSE2-NEXT:    movq %rax, %r13
1630; SSE2-NEXT:    movq %rax, %rbx
1631; SSE2-NEXT:    movq %rax, %rcx
1632; SSE2-NEXT:    movq %rax, %rdx
1633; SSE2-NEXT:    movq %rax, %rsi
1634; SSE2-NEXT:    movq %rax, %rdi
1635; SSE2-NEXT:    movq %rax, %rbp
1636; SSE2-NEXT:    shlq $49, %rbp
1637; SSE2-NEXT:    sarq $63, %rbp
1638; SSE2-NEXT:    movd %ebp, %xmm0
1639; SSE2-NEXT:    movq %rax, %rbp
1640; SSE2-NEXT:    movsbq %al, %rax
1641; SSE2-NEXT:    shlq $57, %r8
1642; SSE2-NEXT:    sarq $63, %r8
1643; SSE2-NEXT:    movd %r8d, %xmm1
1644; SSE2-NEXT:    shlq $53, %r9
1645; SSE2-NEXT:    sarq $63, %r9
1646; SSE2-NEXT:    movd %r9d, %xmm2
1647; SSE2-NEXT:    shlq $61, %r10
1648; SSE2-NEXT:    sarq $63, %r10
1649; SSE2-NEXT:    movd %r10d, %xmm3
1650; SSE2-NEXT:    shlq $51, %r11
1651; SSE2-NEXT:    sarq $63, %r11
1652; SSE2-NEXT:    movd %r11d, %xmm4
1653; SSE2-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
1654; SSE2-NEXT:    shlq $59, %r14
1655; SSE2-NEXT:    sarq $63, %r14
1656; SSE2-NEXT:    movd %r14d, %xmm5
1657; SSE2-NEXT:    punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3],xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7]
1658; SSE2-NEXT:    shlq $55, %r15
1659; SSE2-NEXT:    sarq $63, %r15
1660; SSE2-NEXT:    movd %r15d, %xmm2
1661; SSE2-NEXT:    punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3],xmm3[4],xmm1[4],xmm3[5],xmm1[5],xmm3[6],xmm1[6],xmm3[7],xmm1[7]
1662; SSE2-NEXT:    shlq $63, %r12
1663; SSE2-NEXT:    sarq $63, %r12
1664; SSE2-NEXT:    movd %r12d, %xmm0
1665; SSE2-NEXT:    punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3],xmm5[4],xmm4[4],xmm5[5],xmm4[5],xmm5[6],xmm4[6],xmm5[7],xmm4[7]
1666; SSE2-NEXT:    shlq $50, %r13
1667; SSE2-NEXT:    sarq $63, %r13
1668; SSE2-NEXT:    movd %r13d, %xmm1
1669; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
1670; SSE2-NEXT:    shlq $58, %rbx
1671; SSE2-NEXT:    sarq $63, %rbx
1672; SSE2-NEXT:    movd %ebx, %xmm2
1673; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm5[0],xmm0[1],xmm5[1],xmm0[2],xmm5[2],xmm0[3],xmm5[3],xmm0[4],xmm5[4],xmm0[5],xmm5[5],xmm0[6],xmm5[6],xmm0[7],xmm5[7]
1674; SSE2-NEXT:    shlq $54, %rcx
1675; SSE2-NEXT:    sarq $63, %rcx
1676; SSE2-NEXT:    movd %ecx, %xmm4
1677; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3],xmm0[4],xmm3[4],xmm0[5],xmm3[5],xmm0[6],xmm3[6],xmm0[7],xmm3[7]
1678; SSE2-NEXT:    shlq $62, %rdx
1679; SSE2-NEXT:    sarq $63, %rdx
1680; SSE2-NEXT:    movd %edx, %xmm3
1681; SSE2-NEXT:    punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3],xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7]
1682; SSE2-NEXT:    shlq $52, %rsi
1683; SSE2-NEXT:    sarq $63, %rsi
1684; SSE2-NEXT:    movd %esi, %xmm1
1685; SSE2-NEXT:    punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3],xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7]
1686; SSE2-NEXT:    shlq $60, %rdi
1687; SSE2-NEXT:    sarq $63, %rdi
1688; SSE2-NEXT:    movd %edi, %xmm4
1689; SSE2-NEXT:    punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3],xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7]
1690; SSE2-NEXT:    punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1],xmm4[2],xmm1[2],xmm4[3],xmm1[3],xmm4[4],xmm1[4],xmm4[5],xmm1[5],xmm4[6],xmm1[6],xmm4[7],xmm1[7]
1691; SSE2-NEXT:    shrq $15, %rbp
1692; SSE2-NEXT:    movd %ebp, %xmm1
1693; SSE2-NEXT:    shrq $7, %rax
1694; SSE2-NEXT:    movd %eax, %xmm2
1695; SSE2-NEXT:    punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3],xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7]
1696; SSE2-NEXT:    punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1],xmm4[2],xmm2[2],xmm4[3],xmm2[3],xmm4[4],xmm2[4],xmm4[5],xmm2[5],xmm4[6],xmm2[6],xmm4[7],xmm2[7]
1697; SSE2-NEXT:    punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3],xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7]
1698; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3],xmm0[4],xmm3[4],xmm0[5],xmm3[5],xmm0[6],xmm3[6],xmm0[7],xmm3[7]
1699; SSE2-NEXT:    popq %rbx
1700; SSE2-NEXT:    popq %r12
1701; SSE2-NEXT:    popq %r13
1702; SSE2-NEXT:    popq %r14
1703; SSE2-NEXT:    popq %r15
1704; SSE2-NEXT:    popq %rbp
1705; SSE2-NEXT:    retq
1706;
1707; SSSE3-LABEL: load_sext_16i1_to_16i8:
1708; SSSE3:       # BB#0: # %entry
1709; SSSE3-NEXT:    pushq %rbp
1710; SSSE3-NEXT:    pushq %r15
1711; SSSE3-NEXT:    pushq %r14
1712; SSSE3-NEXT:    pushq %r13
1713; SSSE3-NEXT:    pushq %r12
1714; SSSE3-NEXT:    pushq %rbx
1715; SSSE3-NEXT:    movswq (%rdi), %rax
1716; SSSE3-NEXT:    movq %rax, %r8
1717; SSSE3-NEXT:    movq %rax, %r9
1718; SSSE3-NEXT:    movq %rax, %r10
1719; SSSE3-NEXT:    movq %rax, %r11
1720; SSSE3-NEXT:    movq %rax, %r14
1721; SSSE3-NEXT:    movq %rax, %r15
1722; SSSE3-NEXT:    movq %rax, %r12
1723; SSSE3-NEXT:    movq %rax, %r13
1724; SSSE3-NEXT:    movq %rax, %rbx
1725; SSSE3-NEXT:    movq %rax, %rcx
1726; SSSE3-NEXT:    movq %rax, %rdx
1727; SSSE3-NEXT:    movq %rax, %rsi
1728; SSSE3-NEXT:    movq %rax, %rdi
1729; SSSE3-NEXT:    movq %rax, %rbp
1730; SSSE3-NEXT:    shlq $49, %rbp
1731; SSSE3-NEXT:    sarq $63, %rbp
1732; SSSE3-NEXT:    movd %ebp, %xmm0
1733; SSSE3-NEXT:    movq %rax, %rbp
1734; SSSE3-NEXT:    movsbq %al, %rax
1735; SSSE3-NEXT:    shlq $57, %r8
1736; SSSE3-NEXT:    sarq $63, %r8
1737; SSSE3-NEXT:    movd %r8d, %xmm1
1738; SSSE3-NEXT:    shlq $53, %r9
1739; SSSE3-NEXT:    sarq $63, %r9
1740; SSSE3-NEXT:    movd %r9d, %xmm2
1741; SSSE3-NEXT:    shlq $61, %r10
1742; SSSE3-NEXT:    sarq $63, %r10
1743; SSSE3-NEXT:    movd %r10d, %xmm3
1744; SSSE3-NEXT:    shlq $51, %r11
1745; SSSE3-NEXT:    sarq $63, %r11
1746; SSSE3-NEXT:    movd %r11d, %xmm4
1747; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
1748; SSSE3-NEXT:    shlq $59, %r14
1749; SSSE3-NEXT:    sarq $63, %r14
1750; SSSE3-NEXT:    movd %r14d, %xmm5
1751; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3],xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7]
1752; SSSE3-NEXT:    shlq $55, %r15
1753; SSSE3-NEXT:    sarq $63, %r15
1754; SSSE3-NEXT:    movd %r15d, %xmm2
1755; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3],xmm3[4],xmm1[4],xmm3[5],xmm1[5],xmm3[6],xmm1[6],xmm3[7],xmm1[7]
1756; SSSE3-NEXT:    shlq $63, %r12
1757; SSSE3-NEXT:    sarq $63, %r12
1758; SSSE3-NEXT:    movd %r12d, %xmm0
1759; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3],xmm5[4],xmm4[4],xmm5[5],xmm4[5],xmm5[6],xmm4[6],xmm5[7],xmm4[7]
1760; SSSE3-NEXT:    shlq $50, %r13
1761; SSSE3-NEXT:    sarq $63, %r13
1762; SSSE3-NEXT:    movd %r13d, %xmm1
1763; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
1764; SSSE3-NEXT:    shlq $58, %rbx
1765; SSSE3-NEXT:    sarq $63, %rbx
1766; SSSE3-NEXT:    movd %ebx, %xmm2
1767; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm5[0],xmm0[1],xmm5[1],xmm0[2],xmm5[2],xmm0[3],xmm5[3],xmm0[4],xmm5[4],xmm0[5],xmm5[5],xmm0[6],xmm5[6],xmm0[7],xmm5[7]
1768; SSSE3-NEXT:    shlq $54, %rcx
1769; SSSE3-NEXT:    sarq $63, %rcx
1770; SSSE3-NEXT:    movd %ecx, %xmm4
1771; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3],xmm0[4],xmm3[4],xmm0[5],xmm3[5],xmm0[6],xmm3[6],xmm0[7],xmm3[7]
1772; SSSE3-NEXT:    shlq $62, %rdx
1773; SSSE3-NEXT:    sarq $63, %rdx
1774; SSSE3-NEXT:    movd %edx, %xmm3
1775; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3],xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7]
1776; SSSE3-NEXT:    shlq $52, %rsi
1777; SSSE3-NEXT:    sarq $63, %rsi
1778; SSSE3-NEXT:    movd %esi, %xmm1
1779; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3],xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7]
1780; SSSE3-NEXT:    shlq $60, %rdi
1781; SSSE3-NEXT:    sarq $63, %rdi
1782; SSSE3-NEXT:    movd %edi, %xmm4
1783; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3],xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7]
1784; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1],xmm4[2],xmm1[2],xmm4[3],xmm1[3],xmm4[4],xmm1[4],xmm4[5],xmm1[5],xmm4[6],xmm1[6],xmm4[7],xmm1[7]
1785; SSSE3-NEXT:    shrq $15, %rbp
1786; SSSE3-NEXT:    movd %ebp, %xmm1
1787; SSSE3-NEXT:    shrq $7, %rax
1788; SSSE3-NEXT:    movd %eax, %xmm2
1789; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3],xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7]
1790; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1],xmm4[2],xmm2[2],xmm4[3],xmm2[3],xmm4[4],xmm2[4],xmm4[5],xmm2[5],xmm4[6],xmm2[6],xmm4[7],xmm2[7]
1791; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3],xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7]
1792; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3],xmm0[4],xmm3[4],xmm0[5],xmm3[5],xmm0[6],xmm3[6],xmm0[7],xmm3[7]
1793; SSSE3-NEXT:    popq %rbx
1794; SSSE3-NEXT:    popq %r12
1795; SSSE3-NEXT:    popq %r13
1796; SSSE3-NEXT:    popq %r14
1797; SSSE3-NEXT:    popq %r15
1798; SSSE3-NEXT:    popq %rbp
1799; SSSE3-NEXT:    retq
1800;
1801; SSE41-LABEL: load_sext_16i1_to_16i8:
1802; SSE41:       # BB#0: # %entry
1803; SSE41-NEXT:    movswq (%rdi), %rax
1804; SSE41-NEXT:    movq %rax, %rcx
1805; SSE41-NEXT:    shlq $62, %rcx
1806; SSE41-NEXT:    sarq $63, %rcx
1807; SSE41-NEXT:    movq %rax, %rdx
1808; SSE41-NEXT:    shlq $63, %rdx
1809; SSE41-NEXT:    sarq $63, %rdx
1810; SSE41-NEXT:    movd %edx, %xmm0
1811; SSE41-NEXT:    pinsrb $1, %ecx, %xmm0
1812; SSE41-NEXT:    movq %rax, %rcx
1813; SSE41-NEXT:    shlq $61, %rcx
1814; SSE41-NEXT:    sarq $63, %rcx
1815; SSE41-NEXT:    pinsrb $2, %ecx, %xmm0
1816; SSE41-NEXT:    movq %rax, %rcx
1817; SSE41-NEXT:    shlq $60, %rcx
1818; SSE41-NEXT:    sarq $63, %rcx
1819; SSE41-NEXT:    pinsrb $3, %ecx, %xmm0
1820; SSE41-NEXT:    movq %rax, %rcx
1821; SSE41-NEXT:    shlq $59, %rcx
1822; SSE41-NEXT:    sarq $63, %rcx
1823; SSE41-NEXT:    pinsrb $4, %ecx, %xmm0
1824; SSE41-NEXT:    movq %rax, %rcx
1825; SSE41-NEXT:    shlq $58, %rcx
1826; SSE41-NEXT:    sarq $63, %rcx
1827; SSE41-NEXT:    pinsrb $5, %ecx, %xmm0
1828; SSE41-NEXT:    movq %rax, %rcx
1829; SSE41-NEXT:    shlq $57, %rcx
1830; SSE41-NEXT:    sarq $63, %rcx
1831; SSE41-NEXT:    pinsrb $6, %ecx, %xmm0
1832; SSE41-NEXT:    movsbq %al, %rcx
1833; SSE41-NEXT:    shrq $7, %rcx
1834; SSE41-NEXT:    pinsrb $7, %ecx, %xmm0
1835; SSE41-NEXT:    movq %rax, %rcx
1836; SSE41-NEXT:    shlq $55, %rcx
1837; SSE41-NEXT:    sarq $63, %rcx
1838; SSE41-NEXT:    pinsrb $8, %ecx, %xmm0
1839; SSE41-NEXT:    movq %rax, %rcx
1840; SSE41-NEXT:    shlq $54, %rcx
1841; SSE41-NEXT:    sarq $63, %rcx
1842; SSE41-NEXT:    pinsrb $9, %ecx, %xmm0
1843; SSE41-NEXT:    movq %rax, %rcx
1844; SSE41-NEXT:    shlq $53, %rcx
1845; SSE41-NEXT:    sarq $63, %rcx
1846; SSE41-NEXT:    pinsrb $10, %ecx, %xmm0
1847; SSE41-NEXT:    movq %rax, %rcx
1848; SSE41-NEXT:    shlq $52, %rcx
1849; SSE41-NEXT:    sarq $63, %rcx
1850; SSE41-NEXT:    pinsrb $11, %ecx, %xmm0
1851; SSE41-NEXT:    movq %rax, %rcx
1852; SSE41-NEXT:    shlq $51, %rcx
1853; SSE41-NEXT:    sarq $63, %rcx
1854; SSE41-NEXT:    pinsrb $12, %ecx, %xmm0
1855; SSE41-NEXT:    movq %rax, %rcx
1856; SSE41-NEXT:    shlq $50, %rcx
1857; SSE41-NEXT:    sarq $63, %rcx
1858; SSE41-NEXT:    pinsrb $13, %ecx, %xmm0
1859; SSE41-NEXT:    movq %rax, %rcx
1860; SSE41-NEXT:    shlq $49, %rcx
1861; SSE41-NEXT:    sarq $63, %rcx
1862; SSE41-NEXT:    pinsrb $14, %ecx, %xmm0
1863; SSE41-NEXT:    shrq $15, %rax
1864; SSE41-NEXT:    pinsrb $15, %eax, %xmm0
1865; SSE41-NEXT:    retq
1866;
1867; AVX-LABEL: load_sext_16i1_to_16i8:
1868; AVX:       # BB#0: # %entry
1869; AVX-NEXT:    movswq (%rdi), %rax
1870; AVX-NEXT:    movq %rax, %rcx
1871; AVX-NEXT:    shlq $62, %rcx
1872; AVX-NEXT:    sarq $63, %rcx
1873; AVX-NEXT:    movq %rax, %rdx
1874; AVX-NEXT:    shlq $63, %rdx
1875; AVX-NEXT:    sarq $63, %rdx
1876; AVX-NEXT:    vmovd %edx, %xmm0
1877; AVX-NEXT:    vpinsrb $1, %ecx, %xmm0, %xmm0
1878; AVX-NEXT:    movq %rax, %rcx
1879; AVX-NEXT:    shlq $61, %rcx
1880; AVX-NEXT:    sarq $63, %rcx
1881; AVX-NEXT:    vpinsrb $2, %ecx, %xmm0, %xmm0
1882; AVX-NEXT:    movq %rax, %rcx
1883; AVX-NEXT:    shlq $60, %rcx
1884; AVX-NEXT:    sarq $63, %rcx
1885; AVX-NEXT:    vpinsrb $3, %ecx, %xmm0, %xmm0
1886; AVX-NEXT:    movq %rax, %rcx
1887; AVX-NEXT:    shlq $59, %rcx
1888; AVX-NEXT:    sarq $63, %rcx
1889; AVX-NEXT:    vpinsrb $4, %ecx, %xmm0, %xmm0
1890; AVX-NEXT:    movq %rax, %rcx
1891; AVX-NEXT:    shlq $58, %rcx
1892; AVX-NEXT:    sarq $63, %rcx
1893; AVX-NEXT:    vpinsrb $5, %ecx, %xmm0, %xmm0
1894; AVX-NEXT:    movq %rax, %rcx
1895; AVX-NEXT:    shlq $57, %rcx
1896; AVX-NEXT:    sarq $63, %rcx
1897; AVX-NEXT:    vpinsrb $6, %ecx, %xmm0, %xmm0
1898; AVX-NEXT:    movsbq %al, %rcx
1899; AVX-NEXT:    shrq $7, %rcx
1900; AVX-NEXT:    vpinsrb $7, %ecx, %xmm0, %xmm0
1901; AVX-NEXT:    movq %rax, %rcx
1902; AVX-NEXT:    shlq $55, %rcx
1903; AVX-NEXT:    sarq $63, %rcx
1904; AVX-NEXT:    vpinsrb $8, %ecx, %xmm0, %xmm0
1905; AVX-NEXT:    movq %rax, %rcx
1906; AVX-NEXT:    shlq $54, %rcx
1907; AVX-NEXT:    sarq $63, %rcx
1908; AVX-NEXT:    vpinsrb $9, %ecx, %xmm0, %xmm0
1909; AVX-NEXT:    movq %rax, %rcx
1910; AVX-NEXT:    shlq $53, %rcx
1911; AVX-NEXT:    sarq $63, %rcx
1912; AVX-NEXT:    vpinsrb $10, %ecx, %xmm0, %xmm0
1913; AVX-NEXT:    movq %rax, %rcx
1914; AVX-NEXT:    shlq $52, %rcx
1915; AVX-NEXT:    sarq $63, %rcx
1916; AVX-NEXT:    vpinsrb $11, %ecx, %xmm0, %xmm0
1917; AVX-NEXT:    movq %rax, %rcx
1918; AVX-NEXT:    shlq $51, %rcx
1919; AVX-NEXT:    sarq $63, %rcx
1920; AVX-NEXT:    vpinsrb $12, %ecx, %xmm0, %xmm0
1921; AVX-NEXT:    movq %rax, %rcx
1922; AVX-NEXT:    shlq $50, %rcx
1923; AVX-NEXT:    sarq $63, %rcx
1924; AVX-NEXT:    vpinsrb $13, %ecx, %xmm0, %xmm0
1925; AVX-NEXT:    movq %rax, %rcx
1926; AVX-NEXT:    shlq $49, %rcx
1927; AVX-NEXT:    sarq $63, %rcx
1928; AVX-NEXT:    vpinsrb $14, %ecx, %xmm0, %xmm0
1929; AVX-NEXT:    shrq $15, %rax
1930; AVX-NEXT:    vpinsrb $15, %eax, %xmm0, %xmm0
1931; AVX-NEXT:    retq
1932;
1933; X32-SSE41-LABEL: load_sext_16i1_to_16i8:
1934; X32-SSE41:       # BB#0: # %entry
1935; X32-SSE41-NEXT:    movl {{[0-9]+}}(%esp), %eax
1936; X32-SSE41-NEXT:    movswl (%eax), %eax
1937; X32-SSE41-NEXT:    movl %eax, %ecx
1938; X32-SSE41-NEXT:    shll $30, %ecx
1939; X32-SSE41-NEXT:    sarl $31, %ecx
1940; X32-SSE41-NEXT:    movl %eax, %edx
1941; X32-SSE41-NEXT:    shll $31, %edx
1942; X32-SSE41-NEXT:    sarl $31, %edx
1943; X32-SSE41-NEXT:    movd %edx, %xmm0
1944; X32-SSE41-NEXT:    pinsrb $1, %ecx, %xmm0
1945; X32-SSE41-NEXT:    movl %eax, %ecx
1946; X32-SSE41-NEXT:    shll $29, %ecx
1947; X32-SSE41-NEXT:    sarl $31, %ecx
1948; X32-SSE41-NEXT:    pinsrb $2, %ecx, %xmm0
1949; X32-SSE41-NEXT:    movl %eax, %ecx
1950; X32-SSE41-NEXT:    shll $28, %ecx
1951; X32-SSE41-NEXT:    sarl $31, %ecx
1952; X32-SSE41-NEXT:    pinsrb $3, %ecx, %xmm0
1953; X32-SSE41-NEXT:    movl %eax, %ecx
1954; X32-SSE41-NEXT:    shll $27, %ecx
1955; X32-SSE41-NEXT:    sarl $31, %ecx
1956; X32-SSE41-NEXT:    pinsrb $4, %ecx, %xmm0
1957; X32-SSE41-NEXT:    movl %eax, %ecx
1958; X32-SSE41-NEXT:    shll $26, %ecx
1959; X32-SSE41-NEXT:    sarl $31, %ecx
1960; X32-SSE41-NEXT:    pinsrb $5, %ecx, %xmm0
1961; X32-SSE41-NEXT:    movl %eax, %ecx
1962; X32-SSE41-NEXT:    shll $25, %ecx
1963; X32-SSE41-NEXT:    sarl $31, %ecx
1964; X32-SSE41-NEXT:    pinsrb $6, %ecx, %xmm0
1965; X32-SSE41-NEXT:    movsbl %al, %ecx
1966; X32-SSE41-NEXT:    shrl $7, %ecx
1967; X32-SSE41-NEXT:    pinsrb $7, %ecx, %xmm0
1968; X32-SSE41-NEXT:    movl %eax, %ecx
1969; X32-SSE41-NEXT:    shll $23, %ecx
1970; X32-SSE41-NEXT:    sarl $31, %ecx
1971; X32-SSE41-NEXT:    pinsrb $8, %ecx, %xmm0
1972; X32-SSE41-NEXT:    movl %eax, %ecx
1973; X32-SSE41-NEXT:    shll $22, %ecx
1974; X32-SSE41-NEXT:    sarl $31, %ecx
1975; X32-SSE41-NEXT:    pinsrb $9, %ecx, %xmm0
1976; X32-SSE41-NEXT:    movl %eax, %ecx
1977; X32-SSE41-NEXT:    shll $21, %ecx
1978; X32-SSE41-NEXT:    sarl $31, %ecx
1979; X32-SSE41-NEXT:    pinsrb $10, %ecx, %xmm0
1980; X32-SSE41-NEXT:    movl %eax, %ecx
1981; X32-SSE41-NEXT:    shll $20, %ecx
1982; X32-SSE41-NEXT:    sarl $31, %ecx
1983; X32-SSE41-NEXT:    pinsrb $11, %ecx, %xmm0
1984; X32-SSE41-NEXT:    movl %eax, %ecx
1985; X32-SSE41-NEXT:    shll $19, %ecx
1986; X32-SSE41-NEXT:    sarl $31, %ecx
1987; X32-SSE41-NEXT:    pinsrb $12, %ecx, %xmm0
1988; X32-SSE41-NEXT:    movl %eax, %ecx
1989; X32-SSE41-NEXT:    shll $18, %ecx
1990; X32-SSE41-NEXT:    sarl $31, %ecx
1991; X32-SSE41-NEXT:    pinsrb $13, %ecx, %xmm0
1992; X32-SSE41-NEXT:    movl %eax, %ecx
1993; X32-SSE41-NEXT:    shll $17, %ecx
1994; X32-SSE41-NEXT:    sarl $31, %ecx
1995; X32-SSE41-NEXT:    pinsrb $14, %ecx, %xmm0
1996; X32-SSE41-NEXT:    shrl $15, %eax
1997; X32-SSE41-NEXT:    pinsrb $15, %eax, %xmm0
1998; X32-SSE41-NEXT:    retl
1999entry:
; Fetch the sixteen i1 lanes from memory, then sign-extend each lane to i8.
2000 %X = load <16 x i1>, <16 x i1>* %ptr
2001 %Y = sext <16 x i1> %X to <16 x i8>
2002 ret <16 x i8> %Y
2003}
2004
2005define <16 x i16> @load_sext_16i1_to_16i16(<16 x i1> *%ptr) {
2006; SSE2-LABEL: load_sext_16i1_to_16i16:
2007; SSE2:       # BB#0: # %entry
2008; SSE2-NEXT:    movzwl (%rdi), %eax
2009; SSE2-NEXT:    movl %eax, %ecx
2010; SSE2-NEXT:    shrl $14, %ecx
2011; SSE2-NEXT:    andl $1, %ecx
2012; SSE2-NEXT:    movd %ecx, %xmm0
2013; SSE2-NEXT:    movl %eax, %ecx
2014; SSE2-NEXT:    shrl $6, %ecx
2015; SSE2-NEXT:    andl $1, %ecx
2016; SSE2-NEXT:    movd %ecx, %xmm1
2017; SSE2-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
2018; SSE2-NEXT:    movl %eax, %ecx
2019; SSE2-NEXT:    shrl $10, %ecx
2020; SSE2-NEXT:    andl $1, %ecx
2021; SSE2-NEXT:    movd %ecx, %xmm0
2022; SSE2-NEXT:    movl %eax, %ecx
2023; SSE2-NEXT:    shrl $2, %ecx
2024; SSE2-NEXT:    andl $1, %ecx
2025; SSE2-NEXT:    movd %ecx, %xmm2
2026; SSE2-NEXT:    punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
2027; SSE2-NEXT:    punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3],xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7]
2028; SSE2-NEXT:    movl %eax, %ecx
2029; SSE2-NEXT:    shrl $12, %ecx
2030; SSE2-NEXT:    andl $1, %ecx
2031; SSE2-NEXT:    movd %ecx, %xmm0
2032; SSE2-NEXT:    movl %eax, %ecx
2033; SSE2-NEXT:    shrl $4, %ecx
2034; SSE2-NEXT:    andl $1, %ecx
2035; SSE2-NEXT:    movd %ecx, %xmm3
2036; SSE2-NEXT:    punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3],xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7]
2037; SSE2-NEXT:    movl %eax, %ecx
2038; SSE2-NEXT:    andl $1, %ecx
2039; SSE2-NEXT:    movd %ecx, %xmm1
2040; SSE2-NEXT:    movl %eax, %ecx
2041; SSE2-NEXT:    shrl $8, %ecx
2042; SSE2-NEXT:    andl $1, %ecx
2043; SSE2-NEXT:    movd %ecx, %xmm0
2044; SSE2-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
2045; SSE2-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3],xmm1[4],xmm3[4],xmm1[5],xmm3[5],xmm1[6],xmm3[6],xmm1[7],xmm3[7]
2046; SSE2-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
2047; SSE2-NEXT:    movl %eax, %ecx
2048; SSE2-NEXT:    shrl $13, %ecx
2049; SSE2-NEXT:    andl $1, %ecx
2050; SSE2-NEXT:    movd %ecx, %xmm0
2051; SSE2-NEXT:    movl %eax, %ecx
2052; SSE2-NEXT:    shrl $5, %ecx
2053; SSE2-NEXT:    andl $1, %ecx
2054; SSE2-NEXT:    movd %ecx, %xmm2
2055; SSE2-NEXT:    punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
2056; SSE2-NEXT:    movl %eax, %ecx
2057; SSE2-NEXT:    shrl $9, %ecx
2058; SSE2-NEXT:    andl $1, %ecx
2059; SSE2-NEXT:    movd %ecx, %xmm3
2060; SSE2-NEXT:    movl %eax, %ecx
2061; SSE2-NEXT:    shrl %ecx
2062; SSE2-NEXT:    andl $1, %ecx
2063; SSE2-NEXT:    movd %ecx, %xmm0
2064; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3],xmm0[4],xmm3[4],xmm0[5],xmm3[5],xmm0[6],xmm3[6],xmm0[7],xmm3[7]
2065; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
2066; SSE2-NEXT:    movl %eax, %ecx
2067; SSE2-NEXT:    shrl $11, %ecx
2068; SSE2-NEXT:    andl $1, %ecx
2069; SSE2-NEXT:    movd %ecx, %xmm2
2070; SSE2-NEXT:    movl %eax, %ecx
2071; SSE2-NEXT:    shrl $3, %ecx
2072; SSE2-NEXT:    andl $1, %ecx
2073; SSE2-NEXT:    movd %ecx, %xmm3
2074; SSE2-NEXT:    punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3],xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7]
2075; SSE2-NEXT:    movl %eax, %ecx
2076; SSE2-NEXT:    shrl $7, %ecx
2077; SSE2-NEXT:    andl $1, %ecx
2078; SSE2-NEXT:    movd %ecx, %xmm2
2079; SSE2-NEXT:    shrl $15, %eax
2080; SSE2-NEXT:    movzwl %ax, %eax
2081; SSE2-NEXT:    movd %eax, %xmm4
2082; SSE2-NEXT:    punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3],xmm2[4],xmm4[4],xmm2[5],xmm4[5],xmm2[6],xmm4[6],xmm2[7],xmm4[7]
2083; SSE2-NEXT:    punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3],xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7]
2084; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3],xmm0[4],xmm3[4],xmm0[5],xmm3[5],xmm0[6],xmm3[6],xmm0[7],xmm3[7]
2085; SSE2-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
2086; SSE2-NEXT:    movdqa %xmm1, %xmm0
2087; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
2088; SSE2-NEXT:    psllw $15, %xmm0
2089; SSE2-NEXT:    psraw $15, %xmm0
2090; SSE2-NEXT:    punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15]
2091; SSE2-NEXT:    psllw $15, %xmm1
2092; SSE2-NEXT:    psraw $15, %xmm1
2093; SSE2-NEXT:    retq
2094;
2095; SSSE3-LABEL: load_sext_16i1_to_16i16:
2096; SSSE3:       # BB#0: # %entry
2097; SSSE3-NEXT:    movzwl (%rdi), %eax
2098; SSSE3-NEXT:    movl %eax, %ecx
2099; SSSE3-NEXT:    shrl $14, %ecx
2100; SSSE3-NEXT:    andl $1, %ecx
2101; SSSE3-NEXT:    movd %ecx, %xmm0
2102; SSSE3-NEXT:    movl %eax, %ecx
2103; SSSE3-NEXT:    shrl $6, %ecx
2104; SSSE3-NEXT:    andl $1, %ecx
2105; SSSE3-NEXT:    movd %ecx, %xmm1
2106; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
2107; SSSE3-NEXT:    movl %eax, %ecx
2108; SSSE3-NEXT:    shrl $10, %ecx
2109; SSSE3-NEXT:    andl $1, %ecx
2110; SSSE3-NEXT:    movd %ecx, %xmm0
2111; SSSE3-NEXT:    movl %eax, %ecx
2112; SSSE3-NEXT:    shrl $2, %ecx
2113; SSSE3-NEXT:    andl $1, %ecx
2114; SSSE3-NEXT:    movd %ecx, %xmm2
2115; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
2116; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3],xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7]
2117; SSSE3-NEXT:    movl %eax, %ecx
2118; SSSE3-NEXT:    shrl $12, %ecx
2119; SSSE3-NEXT:    andl $1, %ecx
2120; SSSE3-NEXT:    movd %ecx, %xmm0
2121; SSSE3-NEXT:    movl %eax, %ecx
2122; SSSE3-NEXT:    shrl $4, %ecx
2123; SSSE3-NEXT:    andl $1, %ecx
2124; SSSE3-NEXT:    movd %ecx, %xmm3
2125; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3],xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7]
2126; SSSE3-NEXT:    movl %eax, %ecx
2127; SSSE3-NEXT:    andl $1, %ecx
2128; SSSE3-NEXT:    movd %ecx, %xmm1
2129; SSSE3-NEXT:    movl %eax, %ecx
2130; SSSE3-NEXT:    shrl $8, %ecx
2131; SSSE3-NEXT:    andl $1, %ecx
2132; SSSE3-NEXT:    movd %ecx, %xmm0
2133; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
2134; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3],xmm1[4],xmm3[4],xmm1[5],xmm3[5],xmm1[6],xmm3[6],xmm1[7],xmm3[7]
2135; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
2136; SSSE3-NEXT:    movl %eax, %ecx
2137; SSSE3-NEXT:    shrl $13, %ecx
2138; SSSE3-NEXT:    andl $1, %ecx
2139; SSSE3-NEXT:    movd %ecx, %xmm0
2140; SSSE3-NEXT:    movl %eax, %ecx
2141; SSSE3-NEXT:    shrl $5, %ecx
2142; SSSE3-NEXT:    andl $1, %ecx
2143; SSSE3-NEXT:    movd %ecx, %xmm2
2144; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
2145; SSSE3-NEXT:    movl %eax, %ecx
2146; SSSE3-NEXT:    shrl $9, %ecx
2147; SSSE3-NEXT:    andl $1, %ecx
2148; SSSE3-NEXT:    movd %ecx, %xmm3
2149; SSSE3-NEXT:    movl %eax, %ecx
2150; SSSE3-NEXT:    shrl %ecx
2151; SSSE3-NEXT:    andl $1, %ecx
2152; SSSE3-NEXT:    movd %ecx, %xmm0
2153; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3],xmm0[4],xmm3[4],xmm0[5],xmm3[5],xmm0[6],xmm3[6],xmm0[7],xmm3[7]
2154; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
2155; SSSE3-NEXT:    movl %eax, %ecx
2156; SSSE3-NEXT:    shrl $11, %ecx
2157; SSSE3-NEXT:    andl $1, %ecx
2158; SSSE3-NEXT:    movd %ecx, %xmm2
2159; SSSE3-NEXT:    movl %eax, %ecx
2160; SSSE3-NEXT:    shrl $3, %ecx
2161; SSSE3-NEXT:    andl $1, %ecx
2162; SSSE3-NEXT:    movd %ecx, %xmm3
2163; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3],xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7]
2164; SSSE3-NEXT:    movl %eax, %ecx
2165; SSSE3-NEXT:    shrl $7, %ecx
2166; SSSE3-NEXT:    andl $1, %ecx
2167; SSSE3-NEXT:    movd %ecx, %xmm2
2168; SSSE3-NEXT:    shrl $15, %eax
2169; SSSE3-NEXT:    movzwl %ax, %eax
2170; SSSE3-NEXT:    movd %eax, %xmm4
2171; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3],xmm2[4],xmm4[4],xmm2[5],xmm4[5],xmm2[6],xmm4[6],xmm2[7],xmm4[7]
2172; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3],xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7]
2173; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3],xmm0[4],xmm3[4],xmm0[5],xmm3[5],xmm0[6],xmm3[6],xmm0[7],xmm3[7]
2174; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
2175; SSSE3-NEXT:    movdqa %xmm1, %xmm0
2176; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
2177; SSSE3-NEXT:    psllw $15, %xmm0
2178; SSSE3-NEXT:    psraw $15, %xmm0
2179; SSSE3-NEXT:    punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15]
2180; SSSE3-NEXT:    psllw $15, %xmm1
2181; SSSE3-NEXT:    psraw $15, %xmm1
2182; SSSE3-NEXT:    retq
2183;
2184; SSE41-LABEL: load_sext_16i1_to_16i16:
2185; SSE41:       # BB#0: # %entry
2186; SSE41-NEXT:    movzwl (%rdi), %eax
2187; SSE41-NEXT:    movl %eax, %ecx
2188; SSE41-NEXT:    shrl %ecx
2189; SSE41-NEXT:    andl $1, %ecx
2190; SSE41-NEXT:    movl %eax, %edx
2191; SSE41-NEXT:    andl $1, %edx
2192; SSE41-NEXT:    movd %edx, %xmm1
2193; SSE41-NEXT:    pinsrb $1, %ecx, %xmm1
2194; SSE41-NEXT:    movl %eax, %ecx
2195; SSE41-NEXT:    shrl $2, %ecx
2196; SSE41-NEXT:    andl $1, %ecx
2197; SSE41-NEXT:    pinsrb $2, %ecx, %xmm1
2198; SSE41-NEXT:    movl %eax, %ecx
2199; SSE41-NEXT:    shrl $3, %ecx
2200; SSE41-NEXT:    andl $1, %ecx
2201; SSE41-NEXT:    pinsrb $3, %ecx, %xmm1
2202; SSE41-NEXT:    movl %eax, %ecx
2203; SSE41-NEXT:    shrl $4, %ecx
2204; SSE41-NEXT:    andl $1, %ecx
2205; SSE41-NEXT:    pinsrb $4, %ecx, %xmm1
2206; SSE41-NEXT:    movl %eax, %ecx
2207; SSE41-NEXT:    shrl $5, %ecx
2208; SSE41-NEXT:    andl $1, %ecx
2209; SSE41-NEXT:    pinsrb $5, %ecx, %xmm1
2210; SSE41-NEXT:    movl %eax, %ecx
2211; SSE41-NEXT:    shrl $6, %ecx
2212; SSE41-NEXT:    andl $1, %ecx
2213; SSE41-NEXT:    pinsrb $6, %ecx, %xmm1
2214; SSE41-NEXT:    movl %eax, %ecx
2215; SSE41-NEXT:    shrl $7, %ecx
2216; SSE41-NEXT:    andl $1, %ecx
2217; SSE41-NEXT:    pinsrb $7, %ecx, %xmm1
2218; SSE41-NEXT:    movl %eax, %ecx
2219; SSE41-NEXT:    shrl $8, %ecx
2220; SSE41-NEXT:    andl $1, %ecx
2221; SSE41-NEXT:    pinsrb $8, %ecx, %xmm1
2222; SSE41-NEXT:    movl %eax, %ecx
2223; SSE41-NEXT:    shrl $9, %ecx
2224; SSE41-NEXT:    andl $1, %ecx
2225; SSE41-NEXT:    pinsrb $9, %ecx, %xmm1
2226; SSE41-NEXT:    movl %eax, %ecx
2227; SSE41-NEXT:    shrl $10, %ecx
2228; SSE41-NEXT:    andl $1, %ecx
2229; SSE41-NEXT:    pinsrb $10, %ecx, %xmm1
2230; SSE41-NEXT:    movl %eax, %ecx
2231; SSE41-NEXT:    shrl $11, %ecx
2232; SSE41-NEXT:    andl $1, %ecx
2233; SSE41-NEXT:    pinsrb $11, %ecx, %xmm1
2234; SSE41-NEXT:    movl %eax, %ecx
2235; SSE41-NEXT:    shrl $12, %ecx
2236; SSE41-NEXT:    andl $1, %ecx
2237; SSE41-NEXT:    pinsrb $12, %ecx, %xmm1
2238; SSE41-NEXT:    movl %eax, %ecx
2239; SSE41-NEXT:    shrl $13, %ecx
2240; SSE41-NEXT:    andl $1, %ecx
2241; SSE41-NEXT:    pinsrb $13, %ecx, %xmm1
2242; SSE41-NEXT:    movl %eax, %ecx
2243; SSE41-NEXT:    shrl $14, %ecx
2244; SSE41-NEXT:    andl $1, %ecx
2245; SSE41-NEXT:    pinsrb $14, %ecx, %xmm1
2246; SSE41-NEXT:    shrl $15, %eax
2247; SSE41-NEXT:    movzwl %ax, %eax
2248; SSE41-NEXT:    pinsrb $15, %eax, %xmm1
2249; SSE41-NEXT:    pmovzxbw {{.*#+}} xmm0 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
2250; SSE41-NEXT:    psllw $15, %xmm0
2251; SSE41-NEXT:    psraw $15, %xmm0
2252; SSE41-NEXT:    punpckhbw {{.*#+}} xmm1 = xmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
2253; SSE41-NEXT:    psllw $15, %xmm1
2254; SSE41-NEXT:    psraw $15, %xmm1
2255; SSE41-NEXT:    retq
2256;
2257; AVX1-LABEL: load_sext_16i1_to_16i16:
2258; AVX1:       # BB#0: # %entry
2259; AVX1-NEXT:    pushq %rbp
2260; AVX1-NEXT:  .Ltmp0:
2261; AVX1-NEXT:    .cfi_def_cfa_offset 16
2262; AVX1-NEXT:    pushq %r15
2263; AVX1-NEXT:  .Ltmp1:
2264; AVX1-NEXT:    .cfi_def_cfa_offset 24
2265; AVX1-NEXT:    pushq %r14
2266; AVX1-NEXT:  .Ltmp2:
2267; AVX1-NEXT:    .cfi_def_cfa_offset 32
2268; AVX1-NEXT:    pushq %r13
2269; AVX1-NEXT:  .Ltmp3:
2270; AVX1-NEXT:    .cfi_def_cfa_offset 40
2271; AVX1-NEXT:    pushq %r12
2272; AVX1-NEXT:  .Ltmp4:
2273; AVX1-NEXT:    .cfi_def_cfa_offset 48
2274; AVX1-NEXT:    pushq %rbx
2275; AVX1-NEXT:  .Ltmp5:
2276; AVX1-NEXT:    .cfi_def_cfa_offset 56
2277; AVX1-NEXT:  .Ltmp6:
2278; AVX1-NEXT:    .cfi_offset %rbx, -56
2279; AVX1-NEXT:  .Ltmp7:
2280; AVX1-NEXT:    .cfi_offset %r12, -48
2281; AVX1-NEXT:  .Ltmp8:
2282; AVX1-NEXT:    .cfi_offset %r13, -40
2283; AVX1-NEXT:  .Ltmp9:
2284; AVX1-NEXT:    .cfi_offset %r14, -32
2285; AVX1-NEXT:  .Ltmp10:
2286; AVX1-NEXT:    .cfi_offset %r15, -24
2287; AVX1-NEXT:  .Ltmp11:
2288; AVX1-NEXT:    .cfi_offset %rbp, -16
2289; AVX1-NEXT:    movswq (%rdi), %rax
2290; AVX1-NEXT:    movq %rax, %rcx
2291; AVX1-NEXT:    shlq $55, %rcx
2292; AVX1-NEXT:    sarq $63, %rcx
2293; AVX1-NEXT:    vmovd %ecx, %xmm0
2294; AVX1-NEXT:    movq %rax, %r8
2295; AVX1-NEXT:    movq %rax, %r10
2296; AVX1-NEXT:    movq %rax, %r11
2297; AVX1-NEXT:    movq %rax, %r14
2298; AVX1-NEXT:    movq %rax, %r15
2299; AVX1-NEXT:    movq %rax, %r9
2300; AVX1-NEXT:    movq %rax, %r12
2301; AVX1-NEXT:    movq %rax, %r13
2302; AVX1-NEXT:    movq %rax, %rbx
2303; AVX1-NEXT:    movq %rax, %rdi
2304; AVX1-NEXT:    movq %rax, %rcx
2305; AVX1-NEXT:    movq %rax, %rdx
2306; AVX1-NEXT:    movq %rax, %rsi
2307; AVX1-NEXT:    movsbq %al, %rbp
2308; AVX1-NEXT:    shlq $54, %rax
2309; AVX1-NEXT:    sarq $63, %rax
2310; AVX1-NEXT:    vpinsrw $1, %eax, %xmm0, %xmm0
2311; AVX1-NEXT:    shlq $53, %r8
2312; AVX1-NEXT:    sarq $63, %r8
2313; AVX1-NEXT:    vpinsrw $2, %r8d, %xmm0, %xmm0
2314; AVX1-NEXT:    shlq $52, %r10
2315; AVX1-NEXT:    sarq $63, %r10
2316; AVX1-NEXT:    vpinsrw $3, %r10d, %xmm0, %xmm0
2317; AVX1-NEXT:    shlq $51, %r11
2318; AVX1-NEXT:    sarq $63, %r11
2319; AVX1-NEXT:    vpinsrw $4, %r11d, %xmm0, %xmm0
2320; AVX1-NEXT:    shlq $50, %r14
2321; AVX1-NEXT:    sarq $63, %r14
2322; AVX1-NEXT:    vpinsrw $5, %r14d, %xmm0, %xmm0
2323; AVX1-NEXT:    shlq $49, %r15
2324; AVX1-NEXT:    sarq $63, %r15
2325; AVX1-NEXT:    vpinsrw $6, %r15d, %xmm0, %xmm0
2326; AVX1-NEXT:    shrq $15, %r9
2327; AVX1-NEXT:    vpinsrw $7, %r9d, %xmm0, %xmm0
2328; AVX1-NEXT:    shlq $63, %r13
2329; AVX1-NEXT:    sarq $63, %r13
2330; AVX1-NEXT:    vmovd %r13d, %xmm1
2331; AVX1-NEXT:    shlq $62, %r12
2332; AVX1-NEXT:    sarq $63, %r12
2333; AVX1-NEXT:    vpinsrw $1, %r12d, %xmm1, %xmm1
2334; AVX1-NEXT:    shlq $61, %rbx
2335; AVX1-NEXT:    sarq $63, %rbx
2336; AVX1-NEXT:    vpinsrw $2, %ebx, %xmm1, %xmm1
2337; AVX1-NEXT:    shlq $60, %rdi
2338; AVX1-NEXT:    sarq $63, %rdi
2339; AVX1-NEXT:    vpinsrw $3, %edi, %xmm1, %xmm1
2340; AVX1-NEXT:    shlq $59, %rcx
2341; AVX1-NEXT:    sarq $63, %rcx
2342; AVX1-NEXT:    vpinsrw $4, %ecx, %xmm1, %xmm1
2343; AVX1-NEXT:    shlq $58, %rdx
2344; AVX1-NEXT:    sarq $63, %rdx
2345; AVX1-NEXT:    vpinsrw $5, %edx, %xmm1, %xmm1
2346; AVX1-NEXT:    shlq $57, %rsi
2347; AVX1-NEXT:    sarq $63, %rsi
2348; AVX1-NEXT:    vpinsrw $6, %esi, %xmm1, %xmm1
2349; AVX1-NEXT:    shrq $7, %rbp
2350; AVX1-NEXT:    vpinsrw $7, %ebp, %xmm1, %xmm1
2351; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
2352; AVX1-NEXT:    popq %rbx
2353; AVX1-NEXT:    popq %r12
2354; AVX1-NEXT:    popq %r13
2355; AVX1-NEXT:    popq %r14
2356; AVX1-NEXT:    popq %r15
2357; AVX1-NEXT:    popq %rbp
2358; AVX1-NEXT:    retq
2359;
2360; AVX2-LABEL: load_sext_16i1_to_16i16:
2361; AVX2:       # BB#0: # %entry
2362; AVX2-NEXT:    pushq %rbp
2363; AVX2-NEXT:  .Ltmp0:
2364; AVX2-NEXT:    .cfi_def_cfa_offset 16
2365; AVX2-NEXT:    pushq %r15
2366; AVX2-NEXT:  .Ltmp1:
2367; AVX2-NEXT:    .cfi_def_cfa_offset 24
2368; AVX2-NEXT:    pushq %r14
2369; AVX2-NEXT:  .Ltmp2:
2370; AVX2-NEXT:    .cfi_def_cfa_offset 32
2371; AVX2-NEXT:    pushq %r13
2372; AVX2-NEXT:  .Ltmp3:
2373; AVX2-NEXT:    .cfi_def_cfa_offset 40
2374; AVX2-NEXT:    pushq %r12
2375; AVX2-NEXT:  .Ltmp4:
2376; AVX2-NEXT:    .cfi_def_cfa_offset 48
2377; AVX2-NEXT:    pushq %rbx
2378; AVX2-NEXT:  .Ltmp5:
2379; AVX2-NEXT:    .cfi_def_cfa_offset 56
2380; AVX2-NEXT:  .Ltmp6:
2381; AVX2-NEXT:    .cfi_offset %rbx, -56
2382; AVX2-NEXT:  .Ltmp7:
2383; AVX2-NEXT:    .cfi_offset %r12, -48
2384; AVX2-NEXT:  .Ltmp8:
2385; AVX2-NEXT:    .cfi_offset %r13, -40
2386; AVX2-NEXT:  .Ltmp9:
2387; AVX2-NEXT:    .cfi_offset %r14, -32
2388; AVX2-NEXT:  .Ltmp10:
2389; AVX2-NEXT:    .cfi_offset %r15, -24
2390; AVX2-NEXT:  .Ltmp11:
2391; AVX2-NEXT:    .cfi_offset %rbp, -16
2392; AVX2-NEXT:    movswq (%rdi), %rax
2393; AVX2-NEXT:    movq %rax, %rcx
2394; AVX2-NEXT:    shlq $55, %rcx
2395; AVX2-NEXT:    sarq $63, %rcx
2396; AVX2-NEXT:    vmovd %ecx, %xmm0
2397; AVX2-NEXT:    movq %rax, %r8
2398; AVX2-NEXT:    movq %rax, %r10
2399; AVX2-NEXT:    movq %rax, %r11
2400; AVX2-NEXT:    movq %rax, %r14
2401; AVX2-NEXT:    movq %rax, %r15
2402; AVX2-NEXT:    movq %rax, %r9
2403; AVX2-NEXT:    movq %rax, %r12
2404; AVX2-NEXT:    movq %rax, %r13
2405; AVX2-NEXT:    movq %rax, %rbx
2406; AVX2-NEXT:    movq %rax, %rdi
2407; AVX2-NEXT:    movq %rax, %rcx
2408; AVX2-NEXT:    movq %rax, %rdx
2409; AVX2-NEXT:    movq %rax, %rsi
2410; AVX2-NEXT:    movsbq %al, %rbp
2411; AVX2-NEXT:    shlq $54, %rax
2412; AVX2-NEXT:    sarq $63, %rax
2413; AVX2-NEXT:    vpinsrw $1, %eax, %xmm0, %xmm0
2414; AVX2-NEXT:    shlq $53, %r8
2415; AVX2-NEXT:    sarq $63, %r8
2416; AVX2-NEXT:    vpinsrw $2, %r8d, %xmm0, %xmm0
2417; AVX2-NEXT:    shlq $52, %r10
2418; AVX2-NEXT:    sarq $63, %r10
2419; AVX2-NEXT:    vpinsrw $3, %r10d, %xmm0, %xmm0
2420; AVX2-NEXT:    shlq $51, %r11
2421; AVX2-NEXT:    sarq $63, %r11
2422; AVX2-NEXT:    vpinsrw $4, %r11d, %xmm0, %xmm0
2423; AVX2-NEXT:    shlq $50, %r14
2424; AVX2-NEXT:    sarq $63, %r14
2425; AVX2-NEXT:    vpinsrw $5, %r14d, %xmm0, %xmm0
2426; AVX2-NEXT:    shlq $49, %r15
2427; AVX2-NEXT:    sarq $63, %r15
2428; AVX2-NEXT:    vpinsrw $6, %r15d, %xmm0, %xmm0
2429; AVX2-NEXT:    shrq $15, %r9
2430; AVX2-NEXT:    vpinsrw $7, %r9d, %xmm0, %xmm0
2431; AVX2-NEXT:    shlq $63, %r13
2432; AVX2-NEXT:    sarq $63, %r13
2433; AVX2-NEXT:    vmovd %r13d, %xmm1
2434; AVX2-NEXT:    shlq $62, %r12
2435; AVX2-NEXT:    sarq $63, %r12
2436; AVX2-NEXT:    vpinsrw $1, %r12d, %xmm1, %xmm1
2437; AVX2-NEXT:    shlq $61, %rbx
2438; AVX2-NEXT:    sarq $63, %rbx
2439; AVX2-NEXT:    vpinsrw $2, %ebx, %xmm1, %xmm1
2440; AVX2-NEXT:    shlq $60, %rdi
2441; AVX2-NEXT:    sarq $63, %rdi
2442; AVX2-NEXT:    vpinsrw $3, %edi, %xmm1, %xmm1
2443; AVX2-NEXT:    shlq $59, %rcx
2444; AVX2-NEXT:    sarq $63, %rcx
2445; AVX2-NEXT:    vpinsrw $4, %ecx, %xmm1, %xmm1
2446; AVX2-NEXT:    shlq $58, %rdx
2447; AVX2-NEXT:    sarq $63, %rdx
2448; AVX2-NEXT:    vpinsrw $5, %edx, %xmm1, %xmm1
2449; AVX2-NEXT:    shlq $57, %rsi
2450; AVX2-NEXT:    sarq $63, %rsi
2451; AVX2-NEXT:    vpinsrw $6, %esi, %xmm1, %xmm1
2452; AVX2-NEXT:    shrq $7, %rbp
2453; AVX2-NEXT:    vpinsrw $7, %ebp, %xmm1, %xmm1
2454; AVX2-NEXT:    vinserti128 $1, %xmm0, %ymm1, %ymm0
2455; AVX2-NEXT:    popq %rbx
2456; AVX2-NEXT:    popq %r12
2457; AVX2-NEXT:    popq %r13
2458; AVX2-NEXT:    popq %r14
2459; AVX2-NEXT:    popq %r15
2460; AVX2-NEXT:    popq %rbp
2461; AVX2-NEXT:    retq
2462;
2463; X32-SSE41-LABEL: load_sext_16i1_to_16i16:
2464; X32-SSE41:       # BB#0: # %entry
2465; X32-SSE41-NEXT:    movl {{[0-9]+}}(%esp), %eax
2466; X32-SSE41-NEXT:    movzwl (%eax), %eax
2467; X32-SSE41-NEXT:    movl %eax, %ecx
2468; X32-SSE41-NEXT:    shrl %ecx
2469; X32-SSE41-NEXT:    andl $1, %ecx
2470; X32-SSE41-NEXT:    movl %eax, %edx
2471; X32-SSE41-NEXT:    andl $1, %edx
2472; X32-SSE41-NEXT:    movd %edx, %xmm1
2473; X32-SSE41-NEXT:    pinsrb $1, %ecx, %xmm1
2474; X32-SSE41-NEXT:    movl %eax, %ecx
2475; X32-SSE41-NEXT:    shrl $2, %ecx
2476; X32-SSE41-NEXT:    andl $1, %ecx
2477; X32-SSE41-NEXT:    pinsrb $2, %ecx, %xmm1
2478; X32-SSE41-NEXT:    movl %eax, %ecx
2479; X32-SSE41-NEXT:    shrl $3, %ecx
2480; X32-SSE41-NEXT:    andl $1, %ecx
2481; X32-SSE41-NEXT:    pinsrb $3, %ecx, %xmm1
2482; X32-SSE41-NEXT:    movl %eax, %ecx
2483; X32-SSE41-NEXT:    shrl $4, %ecx
2484; X32-SSE41-NEXT:    andl $1, %ecx
2485; X32-SSE41-NEXT:    pinsrb $4, %ecx, %xmm1
2486; X32-SSE41-NEXT:    movl %eax, %ecx
2487; X32-SSE41-NEXT:    shrl $5, %ecx
2488; X32-SSE41-NEXT:    andl $1, %ecx
2489; X32-SSE41-NEXT:    pinsrb $5, %ecx, %xmm1
2490; X32-SSE41-NEXT:    movl %eax, %ecx
2491; X32-SSE41-NEXT:    shrl $6, %ecx
2492; X32-SSE41-NEXT:    andl $1, %ecx
2493; X32-SSE41-NEXT:    pinsrb $6, %ecx, %xmm1
2494; X32-SSE41-NEXT:    movl %eax, %ecx
2495; X32-SSE41-NEXT:    shrl $7, %ecx
2496; X32-SSE41-NEXT:    andl $1, %ecx
2497; X32-SSE41-NEXT:    pinsrb $7, %ecx, %xmm1
2498; X32-SSE41-NEXT:    movl %eax, %ecx
2499; X32-SSE41-NEXT:    shrl $8, %ecx
2500; X32-SSE41-NEXT:    andl $1, %ecx
2501; X32-SSE41-NEXT:    pinsrb $8, %ecx, %xmm1
2502; X32-SSE41-NEXT:    movl %eax, %ecx
2503; X32-SSE41-NEXT:    shrl $9, %ecx
2504; X32-SSE41-NEXT:    andl $1, %ecx
2505; X32-SSE41-NEXT:    pinsrb $9, %ecx, %xmm1
2506; X32-SSE41-NEXT:    movl %eax, %ecx
2507; X32-SSE41-NEXT:    shrl $10, %ecx
2508; X32-SSE41-NEXT:    andl $1, %ecx
2509; X32-SSE41-NEXT:    pinsrb $10, %ecx, %xmm1
2510; X32-SSE41-NEXT:    movl %eax, %ecx
2511; X32-SSE41-NEXT:    shrl $11, %ecx
2512; X32-SSE41-NEXT:    andl $1, %ecx
2513; X32-SSE41-NEXT:    pinsrb $11, %ecx, %xmm1
2514; X32-SSE41-NEXT:    movl %eax, %ecx
2515; X32-SSE41-NEXT:    shrl $12, %ecx
2516; X32-SSE41-NEXT:    andl $1, %ecx
2517; X32-SSE41-NEXT:    pinsrb $12, %ecx, %xmm1
2518; X32-SSE41-NEXT:    movl %eax, %ecx
2519; X32-SSE41-NEXT:    shrl $13, %ecx
2520; X32-SSE41-NEXT:    andl $1, %ecx
2521; X32-SSE41-NEXT:    pinsrb $13, %ecx, %xmm1
2522; X32-SSE41-NEXT:    movl %eax, %ecx
2523; X32-SSE41-NEXT:    shrl $14, %ecx
2524; X32-SSE41-NEXT:    andl $1, %ecx
2525; X32-SSE41-NEXT:    pinsrb $14, %ecx, %xmm1
2526; X32-SSE41-NEXT:    shrl $15, %eax
2527; X32-SSE41-NEXT:    pinsrb $15, %eax, %xmm1
2528; X32-SSE41-NEXT:    pmovzxbw {{.*#+}} xmm0 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
2529; X32-SSE41-NEXT:    psllw $15, %xmm0
2530; X32-SSE41-NEXT:    psraw $15, %xmm0
2531; X32-SSE41-NEXT:    punpckhbw {{.*#+}} xmm1 = xmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
2532; X32-SSE41-NEXT:    psllw $15, %xmm1
2533; X32-SSE41-NEXT:    psraw $15, %xmm1
2534; X32-SSE41-NEXT:    retl
2535entry:
2536 %X = load <16 x i1>, <16 x i1>* %ptr
2537 %Y = sext <16 x i1> %X to <16 x i16>
2538 ret <16 x i16> %Y
2539}
2540
2541define <32 x i8> @load_sext_32i1_to_32i8(<32 x i1> *%ptr) nounwind readnone {
2542; SSE2-LABEL: load_sext_32i1_to_32i8:
2543; SSE2:       # BB#0: # %entry
2544; SSE2-NEXT:    pushq %rbp
2545; SSE2-NEXT:    pushq %r15
2546; SSE2-NEXT:    pushq %r14
2547; SSE2-NEXT:    pushq %r13
2548; SSE2-NEXT:    pushq %r12
2549; SSE2-NEXT:    pushq %rbx
2550; SSE2-NEXT:    movswq (%rdi), %rbx
2551; SSE2-NEXT:    movq %rbx, %r10
2552; SSE2-NEXT:    movq %rbx, %r8
2553; SSE2-NEXT:    movq %rbx, %r9
2554; SSE2-NEXT:    movq %rbx, %r11
2555; SSE2-NEXT:    movq %rbx, %r14
2556; SSE2-NEXT:    movq %rbx, %r15
2557; SSE2-NEXT:    movq %rbx, %r12
2558; SSE2-NEXT:    movq %rbx, %r13
2559; SSE2-NEXT:    movq %rbx, %rdx
2560; SSE2-NEXT:    movq %rbx, %rsi
2561; SSE2-NEXT:    movq %rbx, %rcx
2562; SSE2-NEXT:    movq %rbx, %rbp
2563; SSE2-NEXT:    movq %rbx, %rax
2564; SSE2-NEXT:    shlq $49, %rax
2565; SSE2-NEXT:    sarq $63, %rax
2566; SSE2-NEXT:    movd %eax, %xmm0
2567; SSE2-NEXT:    movq %rbx, %rax
2568; SSE2-NEXT:    shlq $57, %r10
2569; SSE2-NEXT:    sarq $63, %r10
2570; SSE2-NEXT:    movd %r10d, %xmm15
2571; SSE2-NEXT:    movq %rbx, %r10
2572; SSE2-NEXT:    movsbq %bl, %rbx
2573; SSE2-NEXT:    punpcklbw {{.*#+}} xmm15 = xmm15[0],xmm0[0],xmm15[1],xmm0[1],xmm15[2],xmm0[2],xmm15[3],xmm0[3],xmm15[4],xmm0[4],xmm15[5],xmm0[5],xmm15[6],xmm0[6],xmm15[7],xmm0[7]
2574; SSE2-NEXT:    shlq $53, %r8
2575; SSE2-NEXT:    sarq $63, %r8
2576; SSE2-NEXT:    movd %r8d, %xmm8
2577; SSE2-NEXT:    shlq $61, %r9
2578; SSE2-NEXT:    sarq $63, %r9
2579; SSE2-NEXT:    movd %r9d, %xmm2
2580; SSE2-NEXT:    shlq $51, %r11
2581; SSE2-NEXT:    sarq $63, %r11
2582; SSE2-NEXT:    movd %r11d, %xmm9
2583; SSE2-NEXT:    shlq $59, %r14
2584; SSE2-NEXT:    sarq $63, %r14
2585; SSE2-NEXT:    movd %r14d, %xmm5
2586; SSE2-NEXT:    shlq $55, %r15
2587; SSE2-NEXT:    sarq $63, %r15
2588; SSE2-NEXT:    movd %r15d, %xmm10
2589; SSE2-NEXT:    shlq $63, %r12
2590; SSE2-NEXT:    sarq $63, %r12
2591; SSE2-NEXT:    movd %r12d, %xmm0
2592; SSE2-NEXT:    shlq $50, %r13
2593; SSE2-NEXT:    sarq $63, %r13
2594; SSE2-NEXT:    movd %r13d, %xmm11
2595; SSE2-NEXT:    shlq $58, %rdx
2596; SSE2-NEXT:    sarq $63, %rdx
2597; SSE2-NEXT:    movd %edx, %xmm4
2598; SSE2-NEXT:    shlq $54, %rsi
2599; SSE2-NEXT:    sarq $63, %rsi
2600; SSE2-NEXT:    movd %esi, %xmm12
2601; SSE2-NEXT:    shlq $62, %rcx
2602; SSE2-NEXT:    sarq $63, %rcx
2603; SSE2-NEXT:    movd %ecx, %xmm6
2604; SSE2-NEXT:    shlq $52, %rbp
2605; SSE2-NEXT:    sarq $63, %rbp
2606; SSE2-NEXT:    movd %ebp, %xmm13
2607; SSE2-NEXT:    shlq $60, %rax
2608; SSE2-NEXT:    sarq $63, %rax
2609; SSE2-NEXT:    movd %eax, %xmm7
2610; SSE2-NEXT:    shrq $15, %r10
2611; SSE2-NEXT:    movd %r10d, %xmm14
2612; SSE2-NEXT:    shrq $7, %rbx
2613; SSE2-NEXT:    movd %ebx, %xmm3
2614; SSE2-NEXT:    movswq 2(%rdi), %rdx
2615; SSE2-NEXT:    movq %rdx, %r8
2616; SSE2-NEXT:    movq %rdx, %r9
2617; SSE2-NEXT:    movq %rdx, %r10
2618; SSE2-NEXT:    movq %rdx, %r11
2619; SSE2-NEXT:    movq %rdx, %r14
2620; SSE2-NEXT:    movq %rdx, %r15
2621; SSE2-NEXT:    movq %rdx, %r12
2622; SSE2-NEXT:    movq %rdx, %r13
2623; SSE2-NEXT:    movq %rdx, %rbx
2624; SSE2-NEXT:    movq %rdx, %rax
2625; SSE2-NEXT:    movq %rdx, %rcx
2626; SSE2-NEXT:    movq %rdx, %rsi
2627; SSE2-NEXT:    movq %rdx, %rdi
2628; SSE2-NEXT:    movq %rdx, %rbp
2629; SSE2-NEXT:    shlq $49, %rbp
2630; SSE2-NEXT:    sarq $63, %rbp
2631; SSE2-NEXT:    movd %ebp, %xmm1
2632; SSE2-NEXT:    movq %rdx, %rbp
2633; SSE2-NEXT:    movsbq %dl, %rdx
2634; SSE2-NEXT:    punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm8[0],xmm2[1],xmm8[1],xmm2[2],xmm8[2],xmm2[3],xmm8[3],xmm2[4],xmm8[4],xmm2[5],xmm8[5],xmm2[6],xmm8[6],xmm2[7],xmm8[7]
2635; SSE2-NEXT:    punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm15[0],xmm2[1],xmm15[1],xmm2[2],xmm15[2],xmm2[3],xmm15[3],xmm2[4],xmm15[4],xmm2[5],xmm15[5],xmm2[6],xmm15[6],xmm2[7],xmm15[7]
2636; SSE2-NEXT:    punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm9[0],xmm5[1],xmm9[1],xmm5[2],xmm9[2],xmm5[3],xmm9[3],xmm5[4],xmm9[4],xmm5[5],xmm9[5],xmm5[6],xmm9[6],xmm5[7],xmm9[7]
2637; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm10[0],xmm0[1],xmm10[1],xmm0[2],xmm10[2],xmm0[3],xmm10[3],xmm0[4],xmm10[4],xmm0[5],xmm10[5],xmm0[6],xmm10[6],xmm0[7],xmm10[7]
2638; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm5[0],xmm0[1],xmm5[1],xmm0[2],xmm5[2],xmm0[3],xmm5[3],xmm0[4],xmm5[4],xmm0[5],xmm5[5],xmm0[6],xmm5[6],xmm0[7],xmm5[7]
2639; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
2640; SSE2-NEXT:    punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm11[0],xmm4[1],xmm11[1],xmm4[2],xmm11[2],xmm4[3],xmm11[3],xmm4[4],xmm11[4],xmm4[5],xmm11[5],xmm4[6],xmm11[6],xmm4[7],xmm11[7]
2641; SSE2-NEXT:    punpcklbw {{.*#+}} xmm6 = xmm6[0],xmm12[0],xmm6[1],xmm12[1],xmm6[2],xmm12[2],xmm6[3],xmm12[3],xmm6[4],xmm12[4],xmm6[5],xmm12[5],xmm6[6],xmm12[6],xmm6[7],xmm12[7]
2642; SSE2-NEXT:    punpcklbw {{.*#+}} xmm6 = xmm6[0],xmm4[0],xmm6[1],xmm4[1],xmm6[2],xmm4[2],xmm6[3],xmm4[3],xmm6[4],xmm4[4],xmm6[5],xmm4[5],xmm6[6],xmm4[6],xmm6[7],xmm4[7]
2643; SSE2-NEXT:    punpcklbw {{.*#+}} xmm7 = xmm7[0],xmm13[0],xmm7[1],xmm13[1],xmm7[2],xmm13[2],xmm7[3],xmm13[3],xmm7[4],xmm13[4],xmm7[5],xmm13[5],xmm7[6],xmm13[6],xmm7[7],xmm13[7]
2644; SSE2-NEXT:    punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm14[0],xmm3[1],xmm14[1],xmm3[2],xmm14[2],xmm3[3],xmm14[3],xmm3[4],xmm14[4],xmm3[5],xmm14[5],xmm3[6],xmm14[6],xmm3[7],xmm14[7]
2645; SSE2-NEXT:    shlq $57, %r8
2646; SSE2-NEXT:    sarq $63, %r8
2647; SSE2-NEXT:    movd %r8d, %xmm2
2648; SSE2-NEXT:    punpcklbw {{.*#+}} xmm7 = xmm7[0],xmm3[0],xmm7[1],xmm3[1],xmm7[2],xmm3[2],xmm7[3],xmm3[3],xmm7[4],xmm3[4],xmm7[5],xmm3[5],xmm7[6],xmm3[6],xmm7[7],xmm3[7]
2649; SSE2-NEXT:    shlq $53, %r9
2650; SSE2-NEXT:    sarq $63, %r9
2651; SSE2-NEXT:    movd %r9d, %xmm3
2652; SSE2-NEXT:    punpcklbw {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[1],xmm7[1],xmm6[2],xmm7[2],xmm6[3],xmm7[3],xmm6[4],xmm7[4],xmm6[5],xmm7[5],xmm6[6],xmm7[6],xmm6[7],xmm7[7]
2653; SSE2-NEXT:    shlq $61, %r10
2654; SSE2-NEXT:    sarq $63, %r10
2655; SSE2-NEXT:    movd %r10d, %xmm4
2656; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm6[0],xmm0[1],xmm6[1],xmm0[2],xmm6[2],xmm0[3],xmm6[3],xmm0[4],xmm6[4],xmm0[5],xmm6[5],xmm0[6],xmm6[6],xmm0[7],xmm6[7]
2657; SSE2-NEXT:    shlq $51, %r11
2658; SSE2-NEXT:    sarq $63, %r11
2659; SSE2-NEXT:    movd %r11d, %xmm5
2660; SSE2-NEXT:    punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3],xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7]
2661; SSE2-NEXT:    shlq $59, %r14
2662; SSE2-NEXT:    sarq $63, %r14
2663; SSE2-NEXT:    movd %r14d, %xmm6
2664; SSE2-NEXT:    punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3],xmm4[4],xmm3[4],xmm4[5],xmm3[5],xmm4[6],xmm3[6],xmm4[7],xmm3[7]
2665; SSE2-NEXT:    shlq $55, %r15
2666; SSE2-NEXT:    sarq $63, %r15
2667; SSE2-NEXT:    movd %r15d, %xmm3
2668; SSE2-NEXT:    punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1],xmm4[2],xmm2[2],xmm4[3],xmm2[3],xmm4[4],xmm2[4],xmm4[5],xmm2[5],xmm4[6],xmm2[6],xmm4[7],xmm2[7]
2669; SSE2-NEXT:    shlq $63, %r12
2670; SSE2-NEXT:    sarq $63, %r12
2671; SSE2-NEXT:    movd %r12d, %xmm1
2672; SSE2-NEXT:    punpcklbw {{.*#+}} xmm6 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3],xmm6[4],xmm5[4],xmm6[5],xmm5[5],xmm6[6],xmm5[6],xmm6[7],xmm5[7]
2673; SSE2-NEXT:    shlq $50, %r13
2674; SSE2-NEXT:    sarq $63, %r13
2675; SSE2-NEXT:    movd %r13d, %xmm2
2676; SSE2-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3],xmm1[4],xmm3[4],xmm1[5],xmm3[5],xmm1[6],xmm3[6],xmm1[7],xmm3[7]
2677; SSE2-NEXT:    shlq $58, %rbx
2678; SSE2-NEXT:    sarq $63, %rbx
2679; SSE2-NEXT:    movd %ebx, %xmm3
2680; SSE2-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm6[0],xmm1[1],xmm6[1],xmm1[2],xmm6[2],xmm1[3],xmm6[3],xmm1[4],xmm6[4],xmm1[5],xmm6[5],xmm1[6],xmm6[6],xmm1[7],xmm6[7]
2681; SSE2-NEXT:    shlq $54, %rax
2682; SSE2-NEXT:    sarq $63, %rax
2683; SSE2-NEXT:    movd %eax, %xmm5
2684; SSE2-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3],xmm1[4],xmm4[4],xmm1[5],xmm4[5],xmm1[6],xmm4[6],xmm1[7],xmm4[7]
2685; SSE2-NEXT:    shlq $62, %rcx
2686; SSE2-NEXT:    sarq $63, %rcx
2687; SSE2-NEXT:    movd %ecx, %xmm4
2688; SSE2-NEXT:    punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3],xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7]
2689; SSE2-NEXT:    shlq $52, %rsi
2690; SSE2-NEXT:    sarq $63, %rsi
2691; SSE2-NEXT:    movd %esi, %xmm2
2692; SSE2-NEXT:    punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1],xmm4[2],xmm5[2],xmm4[3],xmm5[3],xmm4[4],xmm5[4],xmm4[5],xmm5[5],xmm4[6],xmm5[6],xmm4[7],xmm5[7]
2693; SSE2-NEXT:    punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3],xmm4[4],xmm3[4],xmm4[5],xmm3[5],xmm4[6],xmm3[6],xmm4[7],xmm3[7]
2694; SSE2-NEXT:    shlq $60, %rdi
2695; SSE2-NEXT:    sarq $63, %rdi
2696; SSE2-NEXT:    movd %edi, %xmm3
2697; SSE2-NEXT:    punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3],xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7]
2698; SSE2-NEXT:    shrq $15, %rbp
2699; SSE2-NEXT:    movd %ebp, %xmm2
2700; SSE2-NEXT:    shrq $7, %rdx
2701; SSE2-NEXT:    movd %edx, %xmm5
2702; SSE2-NEXT:    punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm2[0],xmm5[1],xmm2[1],xmm5[2],xmm2[2],xmm5[3],xmm2[3],xmm5[4],xmm2[4],xmm5[5],xmm2[5],xmm5[6],xmm2[6],xmm5[7],xmm2[7]
2703; SSE2-NEXT:    punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm5[0],xmm3[1],xmm5[1],xmm3[2],xmm5[2],xmm3[3],xmm5[3],xmm3[4],xmm5[4],xmm3[5],xmm5[5],xmm3[6],xmm5[6],xmm3[7],xmm5[7]
2704; SSE2-NEXT:    punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3],xmm4[4],xmm3[4],xmm4[5],xmm3[5],xmm4[6],xmm3[6],xmm4[7],xmm3[7]
2705; SSE2-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3],xmm1[4],xmm4[4],xmm1[5],xmm4[5],xmm1[6],xmm4[6],xmm1[7],xmm4[7]
2706; SSE2-NEXT:    popq %rbx
2707; SSE2-NEXT:    popq %r12
2708; SSE2-NEXT:    popq %r13
2709; SSE2-NEXT:    popq %r14
2710; SSE2-NEXT:    popq %r15
2711; SSE2-NEXT:    popq %rbp
2712; SSE2-NEXT:    retq
2713;
2714; SSSE3-LABEL: load_sext_32i1_to_32i8:
2715; SSSE3:       # BB#0: # %entry
2716; SSSE3-NEXT:    pushq %rbp
2717; SSSE3-NEXT:    pushq %r15
2718; SSSE3-NEXT:    pushq %r14
2719; SSSE3-NEXT:    pushq %r13
2720; SSSE3-NEXT:    pushq %r12
2721; SSSE3-NEXT:    pushq %rbx
2722; SSSE3-NEXT:    movswq (%rdi), %rbx
2723; SSSE3-NEXT:    movq %rbx, %r10
2724; SSSE3-NEXT:    movq %rbx, %r8
2725; SSSE3-NEXT:    movq %rbx, %r9
2726; SSSE3-NEXT:    movq %rbx, %r11
2727; SSSE3-NEXT:    movq %rbx, %r14
2728; SSSE3-NEXT:    movq %rbx, %r15
2729; SSSE3-NEXT:    movq %rbx, %r12
2730; SSSE3-NEXT:    movq %rbx, %r13
2731; SSSE3-NEXT:    movq %rbx, %rdx
2732; SSSE3-NEXT:    movq %rbx, %rsi
2733; SSSE3-NEXT:    movq %rbx, %rcx
2734; SSSE3-NEXT:    movq %rbx, %rbp
2735; SSSE3-NEXT:    movq %rbx, %rax
2736; SSSE3-NEXT:    shlq $49, %rax
2737; SSSE3-NEXT:    sarq $63, %rax
2738; SSSE3-NEXT:    movd %eax, %xmm0
2739; SSSE3-NEXT:    movq %rbx, %rax
2740; SSSE3-NEXT:    shlq $57, %r10
2741; SSSE3-NEXT:    sarq $63, %r10
2742; SSSE3-NEXT:    movd %r10d, %xmm15
2743; SSSE3-NEXT:    movq %rbx, %r10
2744; SSSE3-NEXT:    movsbq %bl, %rbx
2745; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm15 = xmm15[0],xmm0[0],xmm15[1],xmm0[1],xmm15[2],xmm0[2],xmm15[3],xmm0[3],xmm15[4],xmm0[4],xmm15[5],xmm0[5],xmm15[6],xmm0[6],xmm15[7],xmm0[7]
2746; SSSE3-NEXT:    shlq $53, %r8
2747; SSSE3-NEXT:    sarq $63, %r8
2748; SSSE3-NEXT:    movd %r8d, %xmm8
2749; SSSE3-NEXT:    shlq $61, %r9
2750; SSSE3-NEXT:    sarq $63, %r9
2751; SSSE3-NEXT:    movd %r9d, %xmm2
2752; SSSE3-NEXT:    shlq $51, %r11
2753; SSSE3-NEXT:    sarq $63, %r11
2754; SSSE3-NEXT:    movd %r11d, %xmm9
2755; SSSE3-NEXT:    shlq $59, %r14
2756; SSSE3-NEXT:    sarq $63, %r14
2757; SSSE3-NEXT:    movd %r14d, %xmm5
2758; SSSE3-NEXT:    shlq $55, %r15
2759; SSSE3-NEXT:    sarq $63, %r15
2760; SSSE3-NEXT:    movd %r15d, %xmm10
2761; SSSE3-NEXT:    shlq $63, %r12
2762; SSSE3-NEXT:    sarq $63, %r12
2763; SSSE3-NEXT:    movd %r12d, %xmm0
2764; SSSE3-NEXT:    shlq $50, %r13
2765; SSSE3-NEXT:    sarq $63, %r13
2766; SSSE3-NEXT:    movd %r13d, %xmm11
2767; SSSE3-NEXT:    shlq $58, %rdx
2768; SSSE3-NEXT:    sarq $63, %rdx
2769; SSSE3-NEXT:    movd %edx, %xmm4
2770; SSSE3-NEXT:    shlq $54, %rsi
2771; SSSE3-NEXT:    sarq $63, %rsi
2772; SSSE3-NEXT:    movd %esi, %xmm12
2773; SSSE3-NEXT:    shlq $62, %rcx
2774; SSSE3-NEXT:    sarq $63, %rcx
2775; SSSE3-NEXT:    movd %ecx, %xmm6
2776; SSSE3-NEXT:    shlq $52, %rbp
2777; SSSE3-NEXT:    sarq $63, %rbp
2778; SSSE3-NEXT:    movd %ebp, %xmm13
2779; SSSE3-NEXT:    shlq $60, %rax
2780; SSSE3-NEXT:    sarq $63, %rax
2781; SSSE3-NEXT:    movd %eax, %xmm7
2782; SSSE3-NEXT:    shrq $15, %r10
2783; SSSE3-NEXT:    movd %r10d, %xmm14
2784; SSSE3-NEXT:    shrq $7, %rbx
2785; SSSE3-NEXT:    movd %ebx, %xmm3
2786; SSSE3-NEXT:    movswq 2(%rdi), %rdx
2787; SSSE3-NEXT:    movq %rdx, %r8
2788; SSSE3-NEXT:    movq %rdx, %r9
2789; SSSE3-NEXT:    movq %rdx, %r10
2790; SSSE3-NEXT:    movq %rdx, %r11
2791; SSSE3-NEXT:    movq %rdx, %r14
2792; SSSE3-NEXT:    movq %rdx, %r15
2793; SSSE3-NEXT:    movq %rdx, %r12
2794; SSSE3-NEXT:    movq %rdx, %r13
2795; SSSE3-NEXT:    movq %rdx, %rbx
2796; SSSE3-NEXT:    movq %rdx, %rax
2797; SSSE3-NEXT:    movq %rdx, %rcx
2798; SSSE3-NEXT:    movq %rdx, %rsi
2799; SSSE3-NEXT:    movq %rdx, %rdi
2800; SSSE3-NEXT:    movq %rdx, %rbp
2801; SSSE3-NEXT:    shlq $49, %rbp
2802; SSSE3-NEXT:    sarq $63, %rbp
2803; SSSE3-NEXT:    movd %ebp, %xmm1
2804; SSSE3-NEXT:    movq %rdx, %rbp
2805; SSSE3-NEXT:    movsbq %dl, %rdx
2806; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm8[0],xmm2[1],xmm8[1],xmm2[2],xmm8[2],xmm2[3],xmm8[3],xmm2[4],xmm8[4],xmm2[5],xmm8[5],xmm2[6],xmm8[6],xmm2[7],xmm8[7]
2807; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm15[0],xmm2[1],xmm15[1],xmm2[2],xmm15[2],xmm2[3],xmm15[3],xmm2[4],xmm15[4],xmm2[5],xmm15[5],xmm2[6],xmm15[6],xmm2[7],xmm15[7]
2808; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm9[0],xmm5[1],xmm9[1],xmm5[2],xmm9[2],xmm5[3],xmm9[3],xmm5[4],xmm9[4],xmm5[5],xmm9[5],xmm5[6],xmm9[6],xmm5[7],xmm9[7]
2809; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm10[0],xmm0[1],xmm10[1],xmm0[2],xmm10[2],xmm0[3],xmm10[3],xmm0[4],xmm10[4],xmm0[5],xmm10[5],xmm0[6],xmm10[6],xmm0[7],xmm10[7]
2810; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm5[0],xmm0[1],xmm5[1],xmm0[2],xmm5[2],xmm0[3],xmm5[3],xmm0[4],xmm5[4],xmm0[5],xmm5[5],xmm0[6],xmm5[6],xmm0[7],xmm5[7]
2811; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
2812; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm11[0],xmm4[1],xmm11[1],xmm4[2],xmm11[2],xmm4[3],xmm11[3],xmm4[4],xmm11[4],xmm4[5],xmm11[5],xmm4[6],xmm11[6],xmm4[7],xmm11[7]
2813; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm6 = xmm6[0],xmm12[0],xmm6[1],xmm12[1],xmm6[2],xmm12[2],xmm6[3],xmm12[3],xmm6[4],xmm12[4],xmm6[5],xmm12[5],xmm6[6],xmm12[6],xmm6[7],xmm12[7]
2814; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm6 = xmm6[0],xmm4[0],xmm6[1],xmm4[1],xmm6[2],xmm4[2],xmm6[3],xmm4[3],xmm6[4],xmm4[4],xmm6[5],xmm4[5],xmm6[6],xmm4[6],xmm6[7],xmm4[7]
2815; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm7 = xmm7[0],xmm13[0],xmm7[1],xmm13[1],xmm7[2],xmm13[2],xmm7[3],xmm13[3],xmm7[4],xmm13[4],xmm7[5],xmm13[5],xmm7[6],xmm13[6],xmm7[7],xmm13[7]
2816; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm14[0],xmm3[1],xmm14[1],xmm3[2],xmm14[2],xmm3[3],xmm14[3],xmm3[4],xmm14[4],xmm3[5],xmm14[5],xmm3[6],xmm14[6],xmm3[7],xmm14[7]
2817; SSSE3-NEXT:    shlq $57, %r8
2818; SSSE3-NEXT:    sarq $63, %r8
2819; SSSE3-NEXT:    movd %r8d, %xmm2
2820; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm7 = xmm7[0],xmm3[0],xmm7[1],xmm3[1],xmm7[2],xmm3[2],xmm7[3],xmm3[3],xmm7[4],xmm3[4],xmm7[5],xmm3[5],xmm7[6],xmm3[6],xmm7[7],xmm3[7]
2821; SSSE3-NEXT:    shlq $53, %r9
2822; SSSE3-NEXT:    sarq $63, %r9
2823; SSSE3-NEXT:    movd %r9d, %xmm3
2824; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[1],xmm7[1],xmm6[2],xmm7[2],xmm6[3],xmm7[3],xmm6[4],xmm7[4],xmm6[5],xmm7[5],xmm6[6],xmm7[6],xmm6[7],xmm7[7]
2825; SSSE3-NEXT:    shlq $61, %r10
2826; SSSE3-NEXT:    sarq $63, %r10
2827; SSSE3-NEXT:    movd %r10d, %xmm4
2828; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm6[0],xmm0[1],xmm6[1],xmm0[2],xmm6[2],xmm0[3],xmm6[3],xmm0[4],xmm6[4],xmm0[5],xmm6[5],xmm0[6],xmm6[6],xmm0[7],xmm6[7]
2829; SSSE3-NEXT:    shlq $51, %r11
2830; SSSE3-NEXT:    sarq $63, %r11
2831; SSSE3-NEXT:    movd %r11d, %xmm5
2832; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3],xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7]
2833; SSSE3-NEXT:    shlq $59, %r14
2834; SSSE3-NEXT:    sarq $63, %r14
2835; SSSE3-NEXT:    movd %r14d, %xmm6
2836; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3],xmm4[4],xmm3[4],xmm4[5],xmm3[5],xmm4[6],xmm3[6],xmm4[7],xmm3[7]
2837; SSSE3-NEXT:    shlq $55, %r15
2838; SSSE3-NEXT:    sarq $63, %r15
2839; SSSE3-NEXT:    movd %r15d, %xmm3
2840; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1],xmm4[2],xmm2[2],xmm4[3],xmm2[3],xmm4[4],xmm2[4],xmm4[5],xmm2[5],xmm4[6],xmm2[6],xmm4[7],xmm2[7]
2841; SSSE3-NEXT:    shlq $63, %r12
2842; SSSE3-NEXT:    sarq $63, %r12
2843; SSSE3-NEXT:    movd %r12d, %xmm1
2844; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm6 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3],xmm6[4],xmm5[4],xmm6[5],xmm5[5],xmm6[6],xmm5[6],xmm6[7],xmm5[7]
2845; SSSE3-NEXT:    shlq $50, %r13
2846; SSSE3-NEXT:    sarq $63, %r13
2847; SSSE3-NEXT:    movd %r13d, %xmm2
2848; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3],xmm1[4],xmm3[4],xmm1[5],xmm3[5],xmm1[6],xmm3[6],xmm1[7],xmm3[7]
2849; SSSE3-NEXT:    shlq $58, %rbx
2850; SSSE3-NEXT:    sarq $63, %rbx
2851; SSSE3-NEXT:    movd %ebx, %xmm3
2852; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm6[0],xmm1[1],xmm6[1],xmm1[2],xmm6[2],xmm1[3],xmm6[3],xmm1[4],xmm6[4],xmm1[5],xmm6[5],xmm1[6],xmm6[6],xmm1[7],xmm6[7]
2853; SSSE3-NEXT:    shlq $54, %rax
2854; SSSE3-NEXT:    sarq $63, %rax
2855; SSSE3-NEXT:    movd %eax, %xmm5
2856; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3],xmm1[4],xmm4[4],xmm1[5],xmm4[5],xmm1[6],xmm4[6],xmm1[7],xmm4[7]
2857; SSSE3-NEXT:    shlq $62, %rcx
2858; SSSE3-NEXT:    sarq $63, %rcx
2859; SSSE3-NEXT:    movd %ecx, %xmm4
2860; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3],xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7]
2861; SSSE3-NEXT:    shlq $52, %rsi
2862; SSSE3-NEXT:    sarq $63, %rsi
2863; SSSE3-NEXT:    movd %esi, %xmm2
2864; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1],xmm4[2],xmm5[2],xmm4[3],xmm5[3],xmm4[4],xmm5[4],xmm4[5],xmm5[5],xmm4[6],xmm5[6],xmm4[7],xmm5[7]
2865; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3],xmm4[4],xmm3[4],xmm4[5],xmm3[5],xmm4[6],xmm3[6],xmm4[7],xmm3[7]
2866; SSSE3-NEXT:    shlq $60, %rdi
2867; SSSE3-NEXT:    sarq $63, %rdi
2868; SSSE3-NEXT:    movd %edi, %xmm3
2869; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3],xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7]
2870; SSSE3-NEXT:    shrq $15, %rbp
2871; SSSE3-NEXT:    movd %ebp, %xmm2
2872; SSSE3-NEXT:    shrq $7, %rdx
2873; SSSE3-NEXT:    movd %edx, %xmm5
2874; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm2[0],xmm5[1],xmm2[1],xmm5[2],xmm2[2],xmm5[3],xmm2[3],xmm5[4],xmm2[4],xmm5[5],xmm2[5],xmm5[6],xmm2[6],xmm5[7],xmm2[7]
2875; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm5[0],xmm3[1],xmm5[1],xmm3[2],xmm5[2],xmm3[3],xmm5[3],xmm3[4],xmm5[4],xmm3[5],xmm5[5],xmm3[6],xmm5[6],xmm3[7],xmm5[7]
2876; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3],xmm4[4],xmm3[4],xmm4[5],xmm3[5],xmm4[6],xmm3[6],xmm4[7],xmm3[7]
2877; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3],xmm1[4],xmm4[4],xmm1[5],xmm4[5],xmm1[6],xmm4[6],xmm1[7],xmm4[7]
2878; SSSE3-NEXT:    popq %rbx
2879; SSSE3-NEXT:    popq %r12
2880; SSSE3-NEXT:    popq %r13
2881; SSSE3-NEXT:    popq %r14
2882; SSSE3-NEXT:    popq %r15
2883; SSSE3-NEXT:    popq %rbp
2884; SSSE3-NEXT:    retq
2885;
2886; SSE41-LABEL: load_sext_32i1_to_32i8:
2887; SSE41:       # BB#0: # %entry
2888; SSE41-NEXT:    movswq (%rdi), %rax
2889; SSE41-NEXT:    movq %rax, %rcx
2890; SSE41-NEXT:    shlq $62, %rcx
2891; SSE41-NEXT:    sarq $63, %rcx
2892; SSE41-NEXT:    movq %rax, %rdx
2893; SSE41-NEXT:    shlq $63, %rdx
2894; SSE41-NEXT:    sarq $63, %rdx
2895; SSE41-NEXT:    movd %edx, %xmm0
2896; SSE41-NEXT:    pinsrb $1, %ecx, %xmm0
2897; SSE41-NEXT:    movq %rax, %rcx
2898; SSE41-NEXT:    shlq $61, %rcx
2899; SSE41-NEXT:    sarq $63, %rcx
2900; SSE41-NEXT:    pinsrb $2, %ecx, %xmm0
2901; SSE41-NEXT:    movq %rax, %rcx
2902; SSE41-NEXT:    shlq $60, %rcx
2903; SSE41-NEXT:    sarq $63, %rcx
2904; SSE41-NEXT:    pinsrb $3, %ecx, %xmm0
2905; SSE41-NEXT:    movq %rax, %rcx
2906; SSE41-NEXT:    shlq $59, %rcx
2907; SSE41-NEXT:    sarq $63, %rcx
2908; SSE41-NEXT:    pinsrb $4, %ecx, %xmm0
2909; SSE41-NEXT:    movq %rax, %rcx
2910; SSE41-NEXT:    shlq $58, %rcx
2911; SSE41-NEXT:    sarq $63, %rcx
2912; SSE41-NEXT:    pinsrb $5, %ecx, %xmm0
2913; SSE41-NEXT:    movq %rax, %rcx
2914; SSE41-NEXT:    shlq $57, %rcx
2915; SSE41-NEXT:    sarq $63, %rcx
2916; SSE41-NEXT:    pinsrb $6, %ecx, %xmm0
2917; SSE41-NEXT:    movsbq %al, %rcx
2918; SSE41-NEXT:    shrq $7, %rcx
2919; SSE41-NEXT:    pinsrb $7, %ecx, %xmm0
2920; SSE41-NEXT:    movq %rax, %rcx
2921; SSE41-NEXT:    shlq $55, %rcx
2922; SSE41-NEXT:    sarq $63, %rcx
2923; SSE41-NEXT:    pinsrb $8, %ecx, %xmm0
2924; SSE41-NEXT:    movq %rax, %rcx
2925; SSE41-NEXT:    shlq $54, %rcx
2926; SSE41-NEXT:    sarq $63, %rcx
2927; SSE41-NEXT:    pinsrb $9, %ecx, %xmm0
2928; SSE41-NEXT:    movq %rax, %rcx
2929; SSE41-NEXT:    shlq $53, %rcx
2930; SSE41-NEXT:    sarq $63, %rcx
2931; SSE41-NEXT:    pinsrb $10, %ecx, %xmm0
2932; SSE41-NEXT:    movq %rax, %rcx
2933; SSE41-NEXT:    shlq $52, %rcx
2934; SSE41-NEXT:    sarq $63, %rcx
2935; SSE41-NEXT:    pinsrb $11, %ecx, %xmm0
2936; SSE41-NEXT:    movq %rax, %rcx
2937; SSE41-NEXT:    shlq $51, %rcx
2938; SSE41-NEXT:    sarq $63, %rcx
2939; SSE41-NEXT:    pinsrb $12, %ecx, %xmm0
2940; SSE41-NEXT:    movq %rax, %rcx
2941; SSE41-NEXT:    shlq $50, %rcx
2942; SSE41-NEXT:    sarq $63, %rcx
2943; SSE41-NEXT:    pinsrb $13, %ecx, %xmm0
2944; SSE41-NEXT:    movq %rax, %rcx
2945; SSE41-NEXT:    shlq $49, %rcx
2946; SSE41-NEXT:    sarq $63, %rcx
2947; SSE41-NEXT:    pinsrb $14, %ecx, %xmm0
2948; SSE41-NEXT:    shrq $15, %rax
2949; SSE41-NEXT:    pinsrb $15, %eax, %xmm0
2950; SSE41-NEXT:    movswq 2(%rdi), %rax
2951; SSE41-NEXT:    movq %rax, %rcx
2952; SSE41-NEXT:    shlq $62, %rcx
2953; SSE41-NEXT:    sarq $63, %rcx
2954; SSE41-NEXT:    movq %rax, %rdx
2955; SSE41-NEXT:    shlq $63, %rdx
2956; SSE41-NEXT:    sarq $63, %rdx
2957; SSE41-NEXT:    movd %edx, %xmm1
2958; SSE41-NEXT:    pinsrb $1, %ecx, %xmm1
2959; SSE41-NEXT:    movq %rax, %rcx
2960; SSE41-NEXT:    shlq $61, %rcx
2961; SSE41-NEXT:    sarq $63, %rcx
2962; SSE41-NEXT:    pinsrb $2, %ecx, %xmm1
2963; SSE41-NEXT:    movq %rax, %rcx
2964; SSE41-NEXT:    shlq $60, %rcx
2965; SSE41-NEXT:    sarq $63, %rcx
2966; SSE41-NEXT:    pinsrb $3, %ecx, %xmm1
2967; SSE41-NEXT:    movq %rax, %rcx
2968; SSE41-NEXT:    shlq $59, %rcx
2969; SSE41-NEXT:    sarq $63, %rcx
2970; SSE41-NEXT:    pinsrb $4, %ecx, %xmm1
2971; SSE41-NEXT:    movq %rax, %rcx
2972; SSE41-NEXT:    shlq $58, %rcx
2973; SSE41-NEXT:    sarq $63, %rcx
2974; SSE41-NEXT:    pinsrb $5, %ecx, %xmm1
2975; SSE41-NEXT:    movq %rax, %rcx
2976; SSE41-NEXT:    shlq $57, %rcx
2977; SSE41-NEXT:    sarq $63, %rcx
2978; SSE41-NEXT:    pinsrb $6, %ecx, %xmm1
2979; SSE41-NEXT:    movsbq %al, %rcx
2980; SSE41-NEXT:    shrq $7, %rcx
2981; SSE41-NEXT:    pinsrb $7, %ecx, %xmm1
2982; SSE41-NEXT:    movq %rax, %rcx
2983; SSE41-NEXT:    shlq $55, %rcx
2984; SSE41-NEXT:    sarq $63, %rcx
2985; SSE41-NEXT:    pinsrb $8, %ecx, %xmm1
2986; SSE41-NEXT:    movq %rax, %rcx
2987; SSE41-NEXT:    shlq $54, %rcx
2988; SSE41-NEXT:    sarq $63, %rcx
2989; SSE41-NEXT:    pinsrb $9, %ecx, %xmm1
2990; SSE41-NEXT:    movq %rax, %rcx
2991; SSE41-NEXT:    shlq $53, %rcx
2992; SSE41-NEXT:    sarq $63, %rcx
2993; SSE41-NEXT:    pinsrb $10, %ecx, %xmm1
2994; SSE41-NEXT:    movq %rax, %rcx
2995; SSE41-NEXT:    shlq $52, %rcx
2996; SSE41-NEXT:    sarq $63, %rcx
2997; SSE41-NEXT:    pinsrb $11, %ecx, %xmm1
2998; SSE41-NEXT:    movq %rax, %rcx
2999; SSE41-NEXT:    shlq $51, %rcx
3000; SSE41-NEXT:    sarq $63, %rcx
3001; SSE41-NEXT:    pinsrb $12, %ecx, %xmm1
3002; SSE41-NEXT:    movq %rax, %rcx
3003; SSE41-NEXT:    shlq $50, %rcx
3004; SSE41-NEXT:    sarq $63, %rcx
3005; SSE41-NEXT:    pinsrb $13, %ecx, %xmm1
3006; SSE41-NEXT:    movq %rax, %rcx
3007; SSE41-NEXT:    shlq $49, %rcx
3008; SSE41-NEXT:    sarq $63, %rcx
3009; SSE41-NEXT:    pinsrb $14, %ecx, %xmm1
3010; SSE41-NEXT:    shrq $15, %rax
3011; SSE41-NEXT:    pinsrb $15, %eax, %xmm1
3012; SSE41-NEXT:    retq
3013;
3014; AVX1-LABEL: load_sext_32i1_to_32i8:
3015; AVX1:       # BB#0: # %entry
3016; AVX1-NEXT:    pushq %rbp
3017; AVX1-NEXT:    pushq %r15
3018; AVX1-NEXT:    pushq %r14
3019; AVX1-NEXT:    pushq %r13
3020; AVX1-NEXT:    pushq %r12
3021; AVX1-NEXT:    pushq %rbx
3022; AVX1-NEXT:    movslq (%rdi), %rax
3023; AVX1-NEXT:    movq %rax, %rcx
3024; AVX1-NEXT:    shlq $47, %rcx
3025; AVX1-NEXT:    sarq $63, %rcx
3026; AVX1-NEXT:    vmovd %ecx, %xmm0
3027; AVX1-NEXT:    movq %rax, %r8
3028; AVX1-NEXT:    movq %rax, %rdx
3029; AVX1-NEXT:    movq %rax, %rcx
3030; AVX1-NEXT:    movq %rax, %rdi
3031; AVX1-NEXT:    movq %rax, %r13
3032; AVX1-NEXT:    movq %rax, %rsi
3033; AVX1-NEXT:    movq %rax, %r10
3034; AVX1-NEXT:    movq %rax, %r11
3035; AVX1-NEXT:    movq %rax, %r9
3036; AVX1-NEXT:    movq %rax, %rbx
3037; AVX1-NEXT:    movq %rax, %r14
3038; AVX1-NEXT:    movq %rax, %r15
3039; AVX1-NEXT:    movq %rax, %r12
3040; AVX1-NEXT:    movq %rax, %rbp
3041; AVX1-NEXT:    shlq $46, %rbp
3042; AVX1-NEXT:    sarq $63, %rbp
3043; AVX1-NEXT:    vpinsrb $1, %ebp, %xmm0, %xmm0
3044; AVX1-NEXT:    movq %rax, %rbp
3045; AVX1-NEXT:    shlq $45, %r8
3046; AVX1-NEXT:    sarq $63, %r8
3047; AVX1-NEXT:    vpinsrb $2, %r8d, %xmm0, %xmm0
3048; AVX1-NEXT:    movq %rax, %r8
3049; AVX1-NEXT:    shlq $44, %rdx
3050; AVX1-NEXT:    sarq $63, %rdx
3051; AVX1-NEXT:    vpinsrb $3, %edx, %xmm0, %xmm0
3052; AVX1-NEXT:    movq %rax, %rdx
3053; AVX1-NEXT:    shlq $43, %rcx
3054; AVX1-NEXT:    sarq $63, %rcx
3055; AVX1-NEXT:    vpinsrb $4, %ecx, %xmm0, %xmm0
3056; AVX1-NEXT:    movq %rax, %rcx
3057; AVX1-NEXT:    shlq $42, %rdi
3058; AVX1-NEXT:    sarq $63, %rdi
3059; AVX1-NEXT:    vpinsrb $5, %edi, %xmm0, %xmm0
3060; AVX1-NEXT:    movq %rax, %rdi
3061; AVX1-NEXT:    shlq $41, %r13
3062; AVX1-NEXT:    sarq $63, %r13
3063; AVX1-NEXT:    vpinsrb $6, %r13d, %xmm0, %xmm0
3064; AVX1-NEXT:    movq %rax, %r13
3065; AVX1-NEXT:    shlq $40, %rsi
3066; AVX1-NEXT:    sarq $63, %rsi
3067; AVX1-NEXT:    vpinsrb $7, %esi, %xmm0, %xmm0
3068; AVX1-NEXT:    movq %rax, %rsi
3069; AVX1-NEXT:    shlq $39, %r10
3070; AVX1-NEXT:    sarq $63, %r10
3071; AVX1-NEXT:    vpinsrb $8, %r10d, %xmm0, %xmm0
3072; AVX1-NEXT:    movq %rax, %r10
3073; AVX1-NEXT:    shlq $38, %r11
3074; AVX1-NEXT:    sarq $63, %r11
3075; AVX1-NEXT:    vpinsrb $9, %r11d, %xmm0, %xmm0
3076; AVX1-NEXT:    movsbq %al, %r11
3077; AVX1-NEXT:    shlq $37, %r9
3078; AVX1-NEXT:    sarq $63, %r9
3079; AVX1-NEXT:    vpinsrb $10, %r9d, %xmm0, %xmm0
3080; AVX1-NEXT:    movq %rax, %r9
3081; AVX1-NEXT:    shlq $36, %rbx
3082; AVX1-NEXT:    sarq $63, %rbx
3083; AVX1-NEXT:    vpinsrb $11, %ebx, %xmm0, %xmm0
3084; AVX1-NEXT:    movq %rax, %rbx
3085; AVX1-NEXT:    shlq $35, %r14
3086; AVX1-NEXT:    sarq $63, %r14
3087; AVX1-NEXT:    vpinsrb $12, %r14d, %xmm0, %xmm0
3088; AVX1-NEXT:    movq %rax, %r14
3089; AVX1-NEXT:    shlq $34, %r15
3090; AVX1-NEXT:    sarq $63, %r15
3091; AVX1-NEXT:    vpinsrb $13, %r15d, %xmm0, %xmm0
3092; AVX1-NEXT:    movq %rax, %r15
3093; AVX1-NEXT:    shlq $33, %r12
3094; AVX1-NEXT:    sarq $63, %r12
3095; AVX1-NEXT:    vpinsrb $14, %r12d, %xmm0, %xmm0
3096; AVX1-NEXT:    movq %rax, %r12
3097; AVX1-NEXT:    shrq $31, %rbp
3098; AVX1-NEXT:    vpinsrb $15, %ebp, %xmm0, %xmm0
3099; AVX1-NEXT:    movq %rax, %rbp
3100; AVX1-NEXT:    shlq $63, %rdx
3101; AVX1-NEXT:    sarq $63, %rdx
3102; AVX1-NEXT:    vmovd %edx, %xmm1
3103; AVX1-NEXT:    movq %rax, %rdx
3104; AVX1-NEXT:    movswq %ax, %rax
3105; AVX1-NEXT:    shlq $62, %r8
3106; AVX1-NEXT:    sarq $63, %r8
3107; AVX1-NEXT:    vpinsrb $1, %r8d, %xmm1, %xmm1
3108; AVX1-NEXT:    shlq $61, %rcx
3109; AVX1-NEXT:    sarq $63, %rcx
3110; AVX1-NEXT:    vpinsrb $2, %ecx, %xmm1, %xmm1
3111; AVX1-NEXT:    shlq $60, %rdi
3112; AVX1-NEXT:    sarq $63, %rdi
3113; AVX1-NEXT:    vpinsrb $3, %edi, %xmm1, %xmm1
3114; AVX1-NEXT:    shlq $59, %r13
3115; AVX1-NEXT:    sarq $63, %r13
3116; AVX1-NEXT:    vpinsrb $4, %r13d, %xmm1, %xmm1
3117; AVX1-NEXT:    shlq $58, %rsi
3118; AVX1-NEXT:    sarq $63, %rsi
3119; AVX1-NEXT:    vpinsrb $5, %esi, %xmm1, %xmm1
3120; AVX1-NEXT:    shlq $57, %r10
3121; AVX1-NEXT:    sarq $63, %r10
3122; AVX1-NEXT:    vpinsrb $6, %r10d, %xmm1, %xmm1
3123; AVX1-NEXT:    shrq $7, %r11
3124; AVX1-NEXT:    vpinsrb $7, %r11d, %xmm1, %xmm1
3125; AVX1-NEXT:    shlq $55, %r9
3126; AVX1-NEXT:    sarq $63, %r9
3127; AVX1-NEXT:    vpinsrb $8, %r9d, %xmm1, %xmm1
3128; AVX1-NEXT:    shlq $54, %rbx
3129; AVX1-NEXT:    sarq $63, %rbx
3130; AVX1-NEXT:    vpinsrb $9, %ebx, %xmm1, %xmm1
3131; AVX1-NEXT:    shlq $53, %r14
3132; AVX1-NEXT:    sarq $63, %r14
3133; AVX1-NEXT:    vpinsrb $10, %r14d, %xmm1, %xmm1
3134; AVX1-NEXT:    shlq $52, %r15
3135; AVX1-NEXT:    sarq $63, %r15
3136; AVX1-NEXT:    vpinsrb $11, %r15d, %xmm1, %xmm1
3137; AVX1-NEXT:    shlq $51, %r12
3138; AVX1-NEXT:    sarq $63, %r12
3139; AVX1-NEXT:    vpinsrb $12, %r12d, %xmm1, %xmm1
3140; AVX1-NEXT:    shlq $50, %rbp
3141; AVX1-NEXT:    sarq $63, %rbp
3142; AVX1-NEXT:    vpinsrb $13, %ebp, %xmm1, %xmm1
3143; AVX1-NEXT:    shlq $49, %rdx
3144; AVX1-NEXT:    sarq $63, %rdx
3145; AVX1-NEXT:    vpinsrb $14, %edx, %xmm1, %xmm1
3146; AVX1-NEXT:    shrq $15, %rax
3147; AVX1-NEXT:    vpinsrb $15, %eax, %xmm1, %xmm1
3148; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
3149; AVX1-NEXT:    popq %rbx
3150; AVX1-NEXT:    popq %r12
3151; AVX1-NEXT:    popq %r13
3152; AVX1-NEXT:    popq %r14
3153; AVX1-NEXT:    popq %r15
3154; AVX1-NEXT:    popq %rbp
3155; AVX1-NEXT:    retq
3156;
3157; AVX2-LABEL: load_sext_32i1_to_32i8:
3158; AVX2:       # BB#0: # %entry
3159; AVX2-NEXT:    pushq %rbp
3160; AVX2-NEXT:    pushq %r15
3161; AVX2-NEXT:    pushq %r14
3162; AVX2-NEXT:    pushq %r13
3163; AVX2-NEXT:    pushq %r12
3164; AVX2-NEXT:    pushq %rbx
3165; AVX2-NEXT:    movslq (%rdi), %rax
3166; AVX2-NEXT:    movq %rax, %rcx
3167; AVX2-NEXT:    shlq $47, %rcx
3168; AVX2-NEXT:    sarq $63, %rcx
3169; AVX2-NEXT:    vmovd %ecx, %xmm0
3170; AVX2-NEXT:    movq %rax, %r8
3171; AVX2-NEXT:    movq %rax, %rdx
3172; AVX2-NEXT:    movq %rax, %rcx
3173; AVX2-NEXT:    movq %rax, %rdi
3174; AVX2-NEXT:    movq %rax, %r13
3175; AVX2-NEXT:    movq %rax, %rsi
3176; AVX2-NEXT:    movq %rax, %r10
3177; AVX2-NEXT:    movq %rax, %r11
3178; AVX2-NEXT:    movq %rax, %r9
3179; AVX2-NEXT:    movq %rax, %rbx
3180; AVX2-NEXT:    movq %rax, %r14
3181; AVX2-NEXT:    movq %rax, %r15
3182; AVX2-NEXT:    movq %rax, %r12
3183; AVX2-NEXT:    movq %rax, %rbp
3184; AVX2-NEXT:    shlq $46, %rbp
3185; AVX2-NEXT:    sarq $63, %rbp
3186; AVX2-NEXT:    vpinsrb $1, %ebp, %xmm0, %xmm0
3187; AVX2-NEXT:    movq %rax, %rbp
3188; AVX2-NEXT:    shlq $45, %r8
3189; AVX2-NEXT:    sarq $63, %r8
3190; AVX2-NEXT:    vpinsrb $2, %r8d, %xmm0, %xmm0
3191; AVX2-NEXT:    movq %rax, %r8
3192; AVX2-NEXT:    shlq $44, %rdx
3193; AVX2-NEXT:    sarq $63, %rdx
3194; AVX2-NEXT:    vpinsrb $3, %edx, %xmm0, %xmm0
3195; AVX2-NEXT:    movq %rax, %rdx
3196; AVX2-NEXT:    shlq $43, %rcx
3197; AVX2-NEXT:    sarq $63, %rcx
3198; AVX2-NEXT:    vpinsrb $4, %ecx, %xmm0, %xmm0
3199; AVX2-NEXT:    movq %rax, %rcx
3200; AVX2-NEXT:    shlq $42, %rdi
3201; AVX2-NEXT:    sarq $63, %rdi
3202; AVX2-NEXT:    vpinsrb $5, %edi, %xmm0, %xmm0
3203; AVX2-NEXT:    movq %rax, %rdi
3204; AVX2-NEXT:    shlq $41, %r13
3205; AVX2-NEXT:    sarq $63, %r13
3206; AVX2-NEXT:    vpinsrb $6, %r13d, %xmm0, %xmm0
3207; AVX2-NEXT:    movq %rax, %r13
3208; AVX2-NEXT:    shlq $40, %rsi
3209; AVX2-NEXT:    sarq $63, %rsi
3210; AVX2-NEXT:    vpinsrb $7, %esi, %xmm0, %xmm0
3211; AVX2-NEXT:    movq %rax, %rsi
3212; AVX2-NEXT:    shlq $39, %r10
3213; AVX2-NEXT:    sarq $63, %r10
3214; AVX2-NEXT:    vpinsrb $8, %r10d, %xmm0, %xmm0
3215; AVX2-NEXT:    movq %rax, %r10
3216; AVX2-NEXT:    shlq $38, %r11
3217; AVX2-NEXT:    sarq $63, %r11
3218; AVX2-NEXT:    vpinsrb $9, %r11d, %xmm0, %xmm0
3219; AVX2-NEXT:    movsbq %al, %r11
3220; AVX2-NEXT:    shlq $37, %r9
3221; AVX2-NEXT:    sarq $63, %r9
3222; AVX2-NEXT:    vpinsrb $10, %r9d, %xmm0, %xmm0
3223; AVX2-NEXT:    movq %rax, %r9
3224; AVX2-NEXT:    shlq $36, %rbx
3225; AVX2-NEXT:    sarq $63, %rbx
3226; AVX2-NEXT:    vpinsrb $11, %ebx, %xmm0, %xmm0
3227; AVX2-NEXT:    movq %rax, %rbx
3228; AVX2-NEXT:    shlq $35, %r14
3229; AVX2-NEXT:    sarq $63, %r14
3230; AVX2-NEXT:    vpinsrb $12, %r14d, %xmm0, %xmm0
3231; AVX2-NEXT:    movq %rax, %r14
3232; AVX2-NEXT:    shlq $34, %r15
3233; AVX2-NEXT:    sarq $63, %r15
3234; AVX2-NEXT:    vpinsrb $13, %r15d, %xmm0, %xmm0
3235; AVX2-NEXT:    movq %rax, %r15
3236; AVX2-NEXT:    shlq $33, %r12
3237; AVX2-NEXT:    sarq $63, %r12
3238; AVX2-NEXT:    vpinsrb $14, %r12d, %xmm0, %xmm0
3239; AVX2-NEXT:    movq %rax, %r12
3240; AVX2-NEXT:    shrq $31, %rbp
3241; AVX2-NEXT:    vpinsrb $15, %ebp, %xmm0, %xmm0
3242; AVX2-NEXT:    movq %rax, %rbp
3243; AVX2-NEXT:    shlq $63, %rdx
3244; AVX2-NEXT:    sarq $63, %rdx
3245; AVX2-NEXT:    vmovd %edx, %xmm1
3246; AVX2-NEXT:    movq %rax, %rdx
3247; AVX2-NEXT:    movswq %ax, %rax
3248; AVX2-NEXT:    shlq $62, %r8
3249; AVX2-NEXT:    sarq $63, %r8
3250; AVX2-NEXT:    vpinsrb $1, %r8d, %xmm1, %xmm1
3251; AVX2-NEXT:    shlq $61, %rcx
3252; AVX2-NEXT:    sarq $63, %rcx
3253; AVX2-NEXT:    vpinsrb $2, %ecx, %xmm1, %xmm1
3254; AVX2-NEXT:    shlq $60, %rdi
3255; AVX2-NEXT:    sarq $63, %rdi
3256; AVX2-NEXT:    vpinsrb $3, %edi, %xmm1, %xmm1
3257; AVX2-NEXT:    shlq $59, %r13
3258; AVX2-NEXT:    sarq $63, %r13
3259; AVX2-NEXT:    vpinsrb $4, %r13d, %xmm1, %xmm1
3260; AVX2-NEXT:    shlq $58, %rsi
3261; AVX2-NEXT:    sarq $63, %rsi
3262; AVX2-NEXT:    vpinsrb $5, %esi, %xmm1, %xmm1
3263; AVX2-NEXT:    shlq $57, %r10
3264; AVX2-NEXT:    sarq $63, %r10
3265; AVX2-NEXT:    vpinsrb $6, %r10d, %xmm1, %xmm1
3266; AVX2-NEXT:    shrq $7, %r11
3267; AVX2-NEXT:    vpinsrb $7, %r11d, %xmm1, %xmm1
3268; AVX2-NEXT:    shlq $55, %r9
3269; AVX2-NEXT:    sarq $63, %r9
3270; AVX2-NEXT:    vpinsrb $8, %r9d, %xmm1, %xmm1
3271; AVX2-NEXT:    shlq $54, %rbx
3272; AVX2-NEXT:    sarq $63, %rbx
3273; AVX2-NEXT:    vpinsrb $9, %ebx, %xmm1, %xmm1
3274; AVX2-NEXT:    shlq $53, %r14
3275; AVX2-NEXT:    sarq $63, %r14
3276; AVX2-NEXT:    vpinsrb $10, %r14d, %xmm1, %xmm1
3277; AVX2-NEXT:    shlq $52, %r15
3278; AVX2-NEXT:    sarq $63, %r15
3279; AVX2-NEXT:    vpinsrb $11, %r15d, %xmm1, %xmm1
3280; AVX2-NEXT:    shlq $51, %r12
3281; AVX2-NEXT:    sarq $63, %r12
3282; AVX2-NEXT:    vpinsrb $12, %r12d, %xmm1, %xmm1
3283; AVX2-NEXT:    shlq $50, %rbp
3284; AVX2-NEXT:    sarq $63, %rbp
3285; AVX2-NEXT:    vpinsrb $13, %ebp, %xmm1, %xmm1
3286; AVX2-NEXT:    shlq $49, %rdx
3287; AVX2-NEXT:    sarq $63, %rdx
3288; AVX2-NEXT:    vpinsrb $14, %edx, %xmm1, %xmm1
3289; AVX2-NEXT:    shrq $15, %rax
3290; AVX2-NEXT:    vpinsrb $15, %eax, %xmm1, %xmm1
3291; AVX2-NEXT:    vinserti128 $1, %xmm0, %ymm1, %ymm0
3292; AVX2-NEXT:    popq %rbx
3293; AVX2-NEXT:    popq %r12
3294; AVX2-NEXT:    popq %r13
3295; AVX2-NEXT:    popq %r14
3296; AVX2-NEXT:    popq %r15
3297; AVX2-NEXT:    popq %rbp
3298; AVX2-NEXT:    retq
3299;
3300; X32-SSE41-LABEL: load_sext_32i1_to_32i8:
3301; X32-SSE41:       # BB#0: # %entry
3302; X32-SSE41-NEXT:    pushl %esi
3303; X32-SSE41-NEXT:    movl {{[0-9]+}}(%esp), %eax
3304; X32-SSE41-NEXT:    movswl (%eax), %ecx
3305; X32-SSE41-NEXT:    movl %ecx, %edx
3306; X32-SSE41-NEXT:    shll $30, %edx
3307; X32-SSE41-NEXT:    sarl $31, %edx
3308; X32-SSE41-NEXT:    movl %ecx, %esi
3309; X32-SSE41-NEXT:    shll $31, %esi
3310; X32-SSE41-NEXT:    sarl $31, %esi
3311; X32-SSE41-NEXT:    movd %esi, %xmm0
3312; X32-SSE41-NEXT:    pinsrb $1, %edx, %xmm0
3313; X32-SSE41-NEXT:    movl %ecx, %edx
3314; X32-SSE41-NEXT:    shll $29, %edx
3315; X32-SSE41-NEXT:    sarl $31, %edx
3316; X32-SSE41-NEXT:    pinsrb $2, %edx, %xmm0
3317; X32-SSE41-NEXT:    movl %ecx, %edx
3318; X32-SSE41-NEXT:    shll $28, %edx
3319; X32-SSE41-NEXT:    sarl $31, %edx
3320; X32-SSE41-NEXT:    pinsrb $3, %edx, %xmm0
3321; X32-SSE41-NEXT:    movl %ecx, %edx
3322; X32-SSE41-NEXT:    shll $27, %edx
3323; X32-SSE41-NEXT:    sarl $31, %edx
3324; X32-SSE41-NEXT:    pinsrb $4, %edx, %xmm0
3325; X32-SSE41-NEXT:    movl %ecx, %edx
3326; X32-SSE41-NEXT:    shll $26, %edx
3327; X32-SSE41-NEXT:    sarl $31, %edx
3328; X32-SSE41-NEXT:    pinsrb $5, %edx, %xmm0
3329; X32-SSE41-NEXT:    movl %ecx, %edx
3330; X32-SSE41-NEXT:    shll $25, %edx
3331; X32-SSE41-NEXT:    sarl $31, %edx
3332; X32-SSE41-NEXT:    pinsrb $6, %edx, %xmm0
3333; X32-SSE41-NEXT:    movsbl %cl, %edx
3334; X32-SSE41-NEXT:    shrl $7, %edx
3335; X32-SSE41-NEXT:    pinsrb $7, %edx, %xmm0
3336; X32-SSE41-NEXT:    movl %ecx, %edx
3337; X32-SSE41-NEXT:    shll $23, %edx
3338; X32-SSE41-NEXT:    sarl $31, %edx
3339; X32-SSE41-NEXT:    pinsrb $8, %edx, %xmm0
3340; X32-SSE41-NEXT:    movl %ecx, %edx
3341; X32-SSE41-NEXT:    shll $22, %edx
3342; X32-SSE41-NEXT:    sarl $31, %edx
3343; X32-SSE41-NEXT:    pinsrb $9, %edx, %xmm0
3344; X32-SSE41-NEXT:    movl %ecx, %edx
3345; X32-SSE41-NEXT:    shll $21, %edx
3346; X32-SSE41-NEXT:    sarl $31, %edx
3347; X32-SSE41-NEXT:    pinsrb $10, %edx, %xmm0
3348; X32-SSE41-NEXT:    movl %ecx, %edx
3349; X32-SSE41-NEXT:    shll $20, %edx
3350; X32-SSE41-NEXT:    sarl $31, %edx
3351; X32-SSE41-NEXT:    pinsrb $11, %edx, %xmm0
3352; X32-SSE41-NEXT:    movl %ecx, %edx
3353; X32-SSE41-NEXT:    shll $19, %edx
3354; X32-SSE41-NEXT:    sarl $31, %edx
3355; X32-SSE41-NEXT:    pinsrb $12, %edx, %xmm0
3356; X32-SSE41-NEXT:    movl %ecx, %edx
3357; X32-SSE41-NEXT:    shll $18, %edx
3358; X32-SSE41-NEXT:    sarl $31, %edx
3359; X32-SSE41-NEXT:    pinsrb $13, %edx, %xmm0
3360; X32-SSE41-NEXT:    movl %ecx, %edx
3361; X32-SSE41-NEXT:    shll $17, %edx
3362; X32-SSE41-NEXT:    sarl $31, %edx
3363; X32-SSE41-NEXT:    pinsrb $14, %edx, %xmm0
3364; X32-SSE41-NEXT:    shrl $15, %ecx
3365; X32-SSE41-NEXT:    pinsrb $15, %ecx, %xmm0
3366; X32-SSE41-NEXT:    movswl 2(%eax), %eax
3367; X32-SSE41-NEXT:    movl %eax, %ecx
3368; X32-SSE41-NEXT:    shll $30, %ecx
3369; X32-SSE41-NEXT:    sarl $31, %ecx
3370; X32-SSE41-NEXT:    movl %eax, %edx
3371; X32-SSE41-NEXT:    shll $31, %edx
3372; X32-SSE41-NEXT:    sarl $31, %edx
3373; X32-SSE41-NEXT:    movd %edx, %xmm1
3374; X32-SSE41-NEXT:    pinsrb $1, %ecx, %xmm1
3375; X32-SSE41-NEXT:    movl %eax, %ecx
3376; X32-SSE41-NEXT:    shll $29, %ecx
3377; X32-SSE41-NEXT:    sarl $31, %ecx
3378; X32-SSE41-NEXT:    pinsrb $2, %ecx, %xmm1
3379; X32-SSE41-NEXT:    movl %eax, %ecx
3380; X32-SSE41-NEXT:    shll $28, %ecx
3381; X32-SSE41-NEXT:    sarl $31, %ecx
3382; X32-SSE41-NEXT:    pinsrb $3, %ecx, %xmm1
3383; X32-SSE41-NEXT:    movl %eax, %ecx
3384; X32-SSE41-NEXT:    shll $27, %ecx
3385; X32-SSE41-NEXT:    sarl $31, %ecx
3386; X32-SSE41-NEXT:    pinsrb $4, %ecx, %xmm1
3387; X32-SSE41-NEXT:    movl %eax, %ecx
3388; X32-SSE41-NEXT:    shll $26, %ecx
3389; X32-SSE41-NEXT:    sarl $31, %ecx
3390; X32-SSE41-NEXT:    pinsrb $5, %ecx, %xmm1
3391; X32-SSE41-NEXT:    movl %eax, %ecx
3392; X32-SSE41-NEXT:    shll $25, %ecx
3393; X32-SSE41-NEXT:    sarl $31, %ecx
3394; X32-SSE41-NEXT:    pinsrb $6, %ecx, %xmm1
3395; X32-SSE41-NEXT:    movsbl %al, %ecx
3396; X32-SSE41-NEXT:    shrl $7, %ecx
3397; X32-SSE41-NEXT:    pinsrb $7, %ecx, %xmm1
3398; X32-SSE41-NEXT:    movl %eax, %ecx
3399; X32-SSE41-NEXT:    shll $23, %ecx
3400; X32-SSE41-NEXT:    sarl $31, %ecx
3401; X32-SSE41-NEXT:    pinsrb $8, %ecx, %xmm1
3402; X32-SSE41-NEXT:    movl %eax, %ecx
3403; X32-SSE41-NEXT:    shll $22, %ecx
3404; X32-SSE41-NEXT:    sarl $31, %ecx
3405; X32-SSE41-NEXT:    pinsrb $9, %ecx, %xmm1
3406; X32-SSE41-NEXT:    movl %eax, %ecx
3407; X32-SSE41-NEXT:    shll $21, %ecx
3408; X32-SSE41-NEXT:    sarl $31, %ecx
3409; X32-SSE41-NEXT:    pinsrb $10, %ecx, %xmm1
3410; X32-SSE41-NEXT:    movl %eax, %ecx
3411; X32-SSE41-NEXT:    shll $20, %ecx
3412; X32-SSE41-NEXT:    sarl $31, %ecx
3413; X32-SSE41-NEXT:    pinsrb $11, %ecx, %xmm1
3414; X32-SSE41-NEXT:    movl %eax, %ecx
3415; X32-SSE41-NEXT:    shll $19, %ecx
3416; X32-SSE41-NEXT:    sarl $31, %ecx
3417; X32-SSE41-NEXT:    pinsrb $12, %ecx, %xmm1
3418; X32-SSE41-NEXT:    movl %eax, %ecx
3419; X32-SSE41-NEXT:    shll $18, %ecx
3420; X32-SSE41-NEXT:    sarl $31, %ecx
3421; X32-SSE41-NEXT:    pinsrb $13, %ecx, %xmm1
3422; X32-SSE41-NEXT:    movl %eax, %ecx
3423; X32-SSE41-NEXT:    shll $17, %ecx
3424; X32-SSE41-NEXT:    sarl $31, %ecx
3425; X32-SSE41-NEXT:    pinsrb $14, %ecx, %xmm1
3426; X32-SSE41-NEXT:    shrl $15, %eax
3427; X32-SSE41-NEXT:    pinsrb $15, %eax, %xmm1
3428; X32-SSE41-NEXT:    popl %esi
3429; X32-SSE41-NEXT:    retl
3430entry:
3431 %X = load <32 x i1>, <32 x i1>* %ptr
3432 %Y = sext <32 x i1> %X to <32 x i8>
3433 ret <32 x i8> %Y
3434}
3435
; Test: load a <16 x i8> vector from %ptr and sign-extend every lane to i16,
; returning <16 x i16>.
; Expected codegen, as pinned by the autogenerated checks below:
;  - SSE2 and SSSE3 runs: for each 8-byte half, a movq load, a punpcklbw that
;    duplicates each byte into a 16-bit lane, then psraw by 8; the two halves
;    end up in xmm0 and xmm1.
;  - SSE41 run: two pmovsxbw loads, from (%rdi) and 8(%rdi).
;  - AVX1 run: two 128-bit vpmovsxbw loads joined with vinsertf128.
;  - AVX2 run: one vpmovsxbw load straight into ymm0.
;  - X32 SSE41 run: the pointer is first read from the stack, then the same
;    two pmovsxbw loads are used.
3436define <16 x i16> @load_sext_16i8_to_16i16(<16 x i8> *%ptr) {
3437; SSE2-LABEL: load_sext_16i8_to_16i16:
3438; SSE2:       # BB#0: # %entry
3439; SSE2-NEXT:    movq {{.*#+}} xmm0 = mem[0],zero
3440; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
3441; SSE2-NEXT:    psraw $8, %xmm0
3442; SSE2-NEXT:    movq {{.*#+}} xmm1 = mem[0],zero
3443; SSE2-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
3444; SSE2-NEXT:    psraw $8, %xmm1
3445; SSE2-NEXT:    retq
3446;
3447; SSSE3-LABEL: load_sext_16i8_to_16i16:
3448; SSSE3:       # BB#0: # %entry
3449; SSSE3-NEXT:    movq {{.*#+}} xmm0 = mem[0],zero
3450; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
3451; SSSE3-NEXT:    psraw $8, %xmm0
3452; SSSE3-NEXT:    movq {{.*#+}} xmm1 = mem[0],zero
3453; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
3454; SSSE3-NEXT:    psraw $8, %xmm1
3455; SSSE3-NEXT:    retq
3456;
3457; SSE41-LABEL: load_sext_16i8_to_16i16:
3458; SSE41:       # BB#0: # %entry
3459; SSE41-NEXT:    pmovsxbw (%rdi), %xmm0
3460; SSE41-NEXT:    pmovsxbw 8(%rdi), %xmm1
3461; SSE41-NEXT:    retq
3462;
3463; AVX1-LABEL: load_sext_16i8_to_16i16:
3464; AVX1:       # BB#0: # %entry
3465; AVX1-NEXT:    vpmovsxbw (%rdi), %xmm0
3466; AVX1-NEXT:    vpmovsxbw 8(%rdi), %xmm1
3467; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
3468; AVX1-NEXT:    retq
3469;
3470; AVX2-LABEL: load_sext_16i8_to_16i16:
3471; AVX2:       # BB#0: # %entry
3472; AVX2-NEXT:    vpmovsxbw (%rdi), %ymm0
3473; AVX2-NEXT:    retq
3474;
3475; X32-SSE41-LABEL: load_sext_16i8_to_16i16:
3476; X32-SSE41:       # BB#0: # %entry
3477; X32-SSE41-NEXT:    movl {{[0-9]+}}(%esp), %eax
3478; X32-SSE41-NEXT:    pmovsxbw (%eax), %xmm0
3479; X32-SSE41-NEXT:    pmovsxbw 8(%eax), %xmm1
3480; X32-SSE41-NEXT:    retl
3481entry:
3482 %X = load <16 x i8>, <16 x i8>* %ptr
3483 %Y = sext <16 x i8> %X to <16 x i16>
3484 ret <16 x i16> %Y
3485}
3486
; Test: load a <2 x i16> vector from %ptr and sign-extend every lane to i64,
; returning <2 x i64>.
; Expected codegen, as pinned by the autogenerated checks below:
;  - SSE2 and SSSE3 runs: a movd load, punpcklwd to duplicate each word into a
;    32-bit lane, a copy shifted with psrad by 31, the original shifted with
;    psrad by 16, and a punpckldq that interleaves the two registers.
;  - SSE41 run: a single pmovsxwq load from (%rdi).
;  - AVX1 and AVX2 runs (shared AVX prefix): a single vpmovsxwq load.
;  - X32 SSE41 run: the pointer is read from the stack, then pmovsxwq is used.
3487define <2 x i64> @load_sext_2i16_to_2i64(<2 x i16> *%ptr) {
3488; SSE2-LABEL: load_sext_2i16_to_2i64:
3489; SSE2:       # BB#0: # %entry
3490; SSE2-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
3491; SSE2-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
3492; SSE2-NEXT:    movdqa %xmm0, %xmm1
3493; SSE2-NEXT:    psrad $31, %xmm1
3494; SSE2-NEXT:    psrad $16, %xmm0
3495; SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
3496; SSE2-NEXT:    retq
3497;
3498; SSSE3-LABEL: load_sext_2i16_to_2i64:
3499; SSSE3:       # BB#0: # %entry
3500; SSSE3-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
3501; SSSE3-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
3502; SSSE3-NEXT:    movdqa %xmm0, %xmm1
3503; SSSE3-NEXT:    psrad $31, %xmm1
3504; SSSE3-NEXT:    psrad $16, %xmm0
3505; SSSE3-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
3506; SSSE3-NEXT:    retq
3507;
3508; SSE41-LABEL: load_sext_2i16_to_2i64:
3509; SSE41:       # BB#0: # %entry
3510; SSE41-NEXT:    pmovsxwq (%rdi), %xmm0
3511; SSE41-NEXT:    retq
3512;
3513; AVX-LABEL: load_sext_2i16_to_2i64:
3514; AVX:       # BB#0: # %entry
3515; AVX-NEXT:    vpmovsxwq (%rdi), %xmm0
3516; AVX-NEXT:    retq
3517;
3518; X32-SSE41-LABEL: load_sext_2i16_to_2i64:
3519; X32-SSE41:       # BB#0: # %entry
3520; X32-SSE41-NEXT:    movl {{[0-9]+}}(%esp), %eax
3521; X32-SSE41-NEXT:    pmovsxwq (%eax), %xmm0
3522; X32-SSE41-NEXT:    retl
3523entry:
3524 %X = load <2 x i16>, <2 x i16>* %ptr
3525 %Y = sext <2 x i16> %X to <2 x i64>
3526 ret <2 x i64> %Y
3527}
3528
; Test: load a <4 x i16> vector from %ptr and sign-extend every lane to i32,
; returning <4 x i32>.
; Expected codegen, as pinned by the autogenerated checks below:
;  - SSE2 and SSSE3 runs: a movq load, punpcklwd to duplicate each word into a
;    32-bit lane, then psrad by 16.
;  - SSE41 run: a single pmovsxwd load from (%rdi).
;  - AVX1 and AVX2 runs (shared AVX prefix): a single vpmovsxwd load.
;  - X32 SSE41 run: the pointer is read from the stack, then pmovsxwd is used.
3529define <4 x i32> @load_sext_4i16_to_4i32(<4 x i16> *%ptr) {
3530; SSE2-LABEL: load_sext_4i16_to_4i32:
3531; SSE2:       # BB#0: # %entry
3532; SSE2-NEXT:    movq {{.*#+}} xmm0 = mem[0],zero
3533; SSE2-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
3534; SSE2-NEXT:    psrad $16, %xmm0
3535; SSE2-NEXT:    retq
3536;
3537; SSSE3-LABEL: load_sext_4i16_to_4i32:
3538; SSSE3:       # BB#0: # %entry
3539; SSSE3-NEXT:    movq {{.*#+}} xmm0 = mem[0],zero
3540; SSSE3-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
3541; SSSE3-NEXT:    psrad $16, %xmm0
3542; SSSE3-NEXT:    retq
3543;
3544; SSE41-LABEL: load_sext_4i16_to_4i32:
3545; SSE41:       # BB#0: # %entry
3546; SSE41-NEXT:    pmovsxwd (%rdi), %xmm0
3547; SSE41-NEXT:    retq
3548;
3549; AVX-LABEL: load_sext_4i16_to_4i32:
3550; AVX:       # BB#0: # %entry
3551; AVX-NEXT:    vpmovsxwd (%rdi), %xmm0
3552; AVX-NEXT:    retq
3553;
3554; X32-SSE41-LABEL: load_sext_4i16_to_4i32:
3555; X32-SSE41:       # BB#0: # %entry
3556; X32-SSE41-NEXT:    movl {{[0-9]+}}(%esp), %eax
3557; X32-SSE41-NEXT:    pmovsxwd (%eax), %xmm0
3558; X32-SSE41-NEXT:    retl
3559entry:
3560 %X = load <4 x i16>, <4 x i16>* %ptr
3561 %Y = sext <4 x i16> %X to <4 x i64>
3562 ret <4 x i32> %Y
3563}
3564
; Test: load a <4 x i16> vector from %ptr and sign-extend every lane to i64,
; returning <4 x i64>.
; Expected codegen, as pinned by the autogenerated checks below:
;  - SSE2 and SSSE3 runs: four scalar movswq loads (offsets 2, 0, 6, 4), each
;    moved into an xmm register and paired with punpcklqdq; the two halves end
;    up in xmm0 and xmm1.
;  - SSE41 run: two pmovsxwq loads, from (%rdi) and 4(%rdi).
;  - AVX1 run: one vpmovsxwd load, then vpmovsxdq on the low half and on the
;    vpshufd-swapped high half, joined with vinsertf128.
;  - AVX2 run: one vpmovsxwq load straight into ymm0.
;  - X32 SSE41 run: the pointer is read from the stack, then the same two
;    pmovsxwq loads are used.
3565define <4 x i64> @load_sext_4i16_to_4i64(<4 x i16> *%ptr) {
3566; SSE2-LABEL: load_sext_4i16_to_4i64:
3567; SSE2:       # BB#0: # %entry
3568; SSE2-NEXT:    movswq 2(%rdi), %rax
3569; SSE2-NEXT:    movd %rax, %xmm1
3570; SSE2-NEXT:    movswq (%rdi), %rax
3571; SSE2-NEXT:    movd %rax, %xmm0
3572; SSE2-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
3573; SSE2-NEXT:    movswq 6(%rdi), %rax
3574; SSE2-NEXT:    movd %rax, %xmm2
3575; SSE2-NEXT:    movswq 4(%rdi), %rax
3576; SSE2-NEXT:    movd %rax, %xmm1
3577; SSE2-NEXT:    punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
3578; SSE2-NEXT:    retq
3579;
3580; SSSE3-LABEL: load_sext_4i16_to_4i64:
3581; SSSE3:       # BB#0: # %entry
3582; SSSE3-NEXT:    movswq 2(%rdi), %rax
3583; SSSE3-NEXT:    movd %rax, %xmm1
3584; SSSE3-NEXT:    movswq (%rdi), %rax
3585; SSSE3-NEXT:    movd %rax, %xmm0
3586; SSSE3-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
3587; SSSE3-NEXT:    movswq 6(%rdi), %rax
3588; SSSE3-NEXT:    movd %rax, %xmm2
3589; SSSE3-NEXT:    movswq 4(%rdi), %rax
3590; SSSE3-NEXT:    movd %rax, %xmm1
3591; SSSE3-NEXT:    punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
3592; SSSE3-NEXT:    retq
3593;
3594; SSE41-LABEL: load_sext_4i16_to_4i64:
3595; SSE41:       # BB#0: # %entry
3596; SSE41-NEXT:    pmovsxwq (%rdi), %xmm0
3597; SSE41-NEXT:    pmovsxwq 4(%rdi), %xmm1
3598; SSE41-NEXT:    retq
3599;
3600; AVX1-LABEL: load_sext_4i16_to_4i64:
3601; AVX1:       # BB#0: # %entry
3602; AVX1-NEXT:    vpmovsxwd (%rdi), %xmm0
3603; AVX1-NEXT:    vpmovsxdq %xmm0, %xmm1
3604; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
3605; AVX1-NEXT:    vpmovsxdq %xmm0, %xmm0
3606; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
3607; AVX1-NEXT:    retq
3608;
3609; AVX2-LABEL: load_sext_4i16_to_4i64:
3610; AVX2:       # BB#0: # %entry
3611; AVX2-NEXT:    vpmovsxwq (%rdi), %ymm0
3612; AVX2-NEXT:    retq
3613;
3614; X32-SSE41-LABEL: load_sext_4i16_to_4i64:
3615; X32-SSE41:       # BB#0: # %entry
3616; X32-SSE41-NEXT:    movl {{[0-9]+}}(%esp), %eax
3617; X32-SSE41-NEXT:    pmovsxwq (%eax), %xmm0
3618; X32-SSE41-NEXT:    pmovsxwq 4(%eax), %xmm1
3619; X32-SSE41-NEXT:    retl
3620entry:
3621 %X = load <4 x i16>, <4 x i16>* %ptr
3622 %Y = sext <4 x i16> %X to <4 x i64>
3623 ret <4 x i64> %Y
3624}
3625
; Test: load an <8 x i16> vector from %ptr and sign-extend every lane to i32,
; returning <8 x i32>.
; Expected codegen, as pinned by the autogenerated checks below:
;  - SSE2 and SSSE3 runs: for each 8-byte half, a movq load, a punpcklwd that
;    duplicates each word into a 32-bit lane, then psrad by 16; the two halves
;    end up in xmm0 and xmm1.
;  - SSE41 run: two pmovsxwd loads, from (%rdi) and 8(%rdi).
;  - AVX1 run: two 128-bit vpmovsxwd loads joined with vinsertf128.
;  - AVX2 run: one vpmovsxwd load straight into ymm0.
;  - X32 SSE41 run: the pointer is read from the stack, then the same two
;    pmovsxwd loads are used.
3626define <8 x i32> @load_sext_8i16_to_8i32(<8 x i16> *%ptr) {
3627; SSE2-LABEL: load_sext_8i16_to_8i32:
3628; SSE2:       # BB#0: # %entry
3629; SSE2-NEXT:    movq {{.*#+}} xmm0 = mem[0],zero
3630; SSE2-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
3631; SSE2-NEXT:    psrad $16, %xmm0
3632; SSE2-NEXT:    movq {{.*#+}} xmm1 = mem[0],zero
3633; SSE2-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3]
3634; SSE2-NEXT:    psrad $16, %xmm1
3635; SSE2-NEXT:    retq
3636;
3637; SSSE3-LABEL: load_sext_8i16_to_8i32:
3638; SSSE3:       # BB#0: # %entry
3639; SSSE3-NEXT:    movq {{.*#+}} xmm0 = mem[0],zero
3640; SSSE3-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
3641; SSSE3-NEXT:    psrad $16, %xmm0
3642; SSSE3-NEXT:    movq {{.*#+}} xmm1 = mem[0],zero
3643; SSSE3-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3]
3644; SSSE3-NEXT:    psrad $16, %xmm1
3645; SSSE3-NEXT:    retq
3646;
3647; SSE41-LABEL: load_sext_8i16_to_8i32:
3648; SSE41:       # BB#0: # %entry
3649; SSE41-NEXT:    pmovsxwd (%rdi), %xmm0
3650; SSE41-NEXT:    pmovsxwd 8(%rdi), %xmm1
3651; SSE41-NEXT:    retq
3652;
3653; AVX1-LABEL: load_sext_8i16_to_8i32:
3654; AVX1:       # BB#0: # %entry
3655; AVX1-NEXT:    vpmovsxwd (%rdi), %xmm0
3656; AVX1-NEXT:    vpmovsxwd 8(%rdi), %xmm1
3657; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
3658; AVX1-NEXT:    retq
3659;
3660; AVX2-LABEL: load_sext_8i16_to_8i32:
3661; AVX2:       # BB#0: # %entry
3662; AVX2-NEXT:    vpmovsxwd (%rdi), %ymm0
3663; AVX2-NEXT:    retq
3664;
3665; X32-SSE41-LABEL: load_sext_8i16_to_8i32:
3666; X32-SSE41:       # BB#0: # %entry
3667; X32-SSE41-NEXT:    movl {{[0-9]+}}(%esp), %eax
3668; X32-SSE41-NEXT:    pmovsxwd (%eax), %xmm0
3669; X32-SSE41-NEXT:    pmovsxwd 8(%eax), %xmm1
3670; X32-SSE41-NEXT:    retl
3671entry:
3672 %X = load <8 x i16>, <8 x i16>* %ptr
3673 %Y = sext <8 x i16> %X to <8 x i32>
3674 ret <8 x i32> %Y
3675}
3676
; Load a <2 x i32> vector through %ptr and sign-extend both lanes to i64,
; giving a 128-bit <2 x i64> result.
;
; Lowering pinned by the autogenerated assertions below:
;  * SSE2 / SSSE3 runs -- movq loads the 64 bits, a copy shifted right
;    arithmetically by 31 supplies the sign dwords, and punpckldq
;    interleaves value and sign dwords into two 64-bit lanes.
;  * SSE4.1 and AVX runs -- a single pmovsxdq / vpmovsxdq with a memory
;    operand.
;  * i686 SSE4.1 run -- the pointer is first read from the stack into eax,
;    then the same pmovsxdq load is used.
define <2 x i64> @load_sext_2i32_to_2i64(<2 x i32> *%ptr) {
; SSE2-LABEL: load_sext_2i32_to_2i64:
; SSE2:       # BB#0: # %entry
; SSE2-NEXT:    movq {{.*#+}} xmm0 = mem[0],zero
; SSE2-NEXT:    movdqa %xmm0, %xmm1
; SSE2-NEXT:    psrad $31, %xmm1
; SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSE2-NEXT:    retq
;
; SSSE3-LABEL: load_sext_2i32_to_2i64:
; SSSE3:       # BB#0: # %entry
; SSSE3-NEXT:    movq {{.*#+}} xmm0 = mem[0],zero
; SSSE3-NEXT:    movdqa %xmm0, %xmm1
; SSSE3-NEXT:    psrad $31, %xmm1
; SSSE3-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSSE3-NEXT:    retq
;
; SSE41-LABEL: load_sext_2i32_to_2i64:
; SSE41:       # BB#0: # %entry
; SSE41-NEXT:    pmovsxdq (%rdi), %xmm0
; SSE41-NEXT:    retq
;
; AVX-LABEL: load_sext_2i32_to_2i64:
; AVX:       # BB#0: # %entry
; AVX-NEXT:    vpmovsxdq (%rdi), %xmm0
; AVX-NEXT:    retq
;
; X32-SSE41-LABEL: load_sext_2i32_to_2i64:
; X32-SSE41:       # BB#0: # %entry
; X32-SSE41-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X32-SSE41-NEXT:    pmovsxdq (%eax), %xmm0
; X32-SSE41-NEXT:    retl
entry:
 %X = load <2 x i32>, <2 x i32>* %ptr
 %Y = sext <2 x i32> %X to <2 x i64>
 ret <2 x i64> %Y
}
3714
; Load a whole <4 x i32> vector through %ptr and sign-extend every lane to
; i64, giving a 256-bit <4 x i64> result.
;
; Lowering pinned by the autogenerated assertions below:
;  * SSE2 / SSSE3 runs -- one movdqa load of all 128 bits; pshufd [2,3,0,1]
;    brings the upper two dwords down into xmm1; for each half a copy
;    shifted right arithmetically by 31 supplies the sign dwords, which
;    punpckldq interleaves with the values. Low half ends in xmm0, high half
;    in xmm1.
;  * SSE4.1 run -- two pmovsxdq with memory operands at offsets 0 and 8.
;  * AVX1 run -- two 128-bit vpmovsxdq loads joined by vinsertf128.
;  * AVX2 run -- a single vpmovsxdq from memory into ymm0.
;  * i686 SSE4.1 run -- the pointer is first read from the stack into eax,
;    then the same two pmovsxdq loads are used.
define <4 x i64> @load_sext_4i32_to_4i64(<4 x i32> *%ptr) {
; SSE2-LABEL: load_sext_4i32_to_4i64:
; SSE2:       # BB#0: # %entry
; SSE2-NEXT:    movdqa (%rdi), %xmm0
; SSE2-NEXT:    movdqa %xmm0, %xmm2
; SSE2-NEXT:    psrad $31, %xmm2
; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
; SSE2-NEXT:    movdqa %xmm1, %xmm2
; SSE2-NEXT:    psrad $31, %xmm2
; SSE2-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
; SSE2-NEXT:    retq
;
; SSSE3-LABEL: load_sext_4i32_to_4i64:
; SSSE3:       # BB#0: # %entry
; SSSE3-NEXT:    movdqa (%rdi), %xmm0
; SSSE3-NEXT:    movdqa %xmm0, %xmm2
; SSSE3-NEXT:    psrad $31, %xmm2
; SSSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; SSSE3-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
; SSSE3-NEXT:    movdqa %xmm1, %xmm2
; SSSE3-NEXT:    psrad $31, %xmm2
; SSSE3-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
; SSSE3-NEXT:    retq
;
; SSE41-LABEL: load_sext_4i32_to_4i64:
; SSE41:       # BB#0: # %entry
; SSE41-NEXT:    pmovsxdq (%rdi), %xmm0
; SSE41-NEXT:    pmovsxdq 8(%rdi), %xmm1
; SSE41-NEXT:    retq
;
; AVX1-LABEL: load_sext_4i32_to_4i64:
; AVX1:       # BB#0: # %entry
; AVX1-NEXT:    vpmovsxdq (%rdi), %xmm0
; AVX1-NEXT:    vpmovsxdq 8(%rdi), %xmm1
; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: load_sext_4i32_to_4i64:
; AVX2:       # BB#0: # %entry
; AVX2-NEXT:    vpmovsxdq (%rdi), %ymm0
; AVX2-NEXT:    retq
;
; X32-SSE41-LABEL: load_sext_4i32_to_4i64:
; X32-SSE41:       # BB#0: # %entry
; X32-SSE41-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X32-SSE41-NEXT:    pmovsxdq (%eax), %xmm0
; X32-SSE41-NEXT:    pmovsxdq 8(%eax), %xmm1
; X32-SSE41-NEXT:    retl
entry:
 %X = load <4 x i32>, <4 x i32>* %ptr
 %Y = sext <4 x i32> %X to <4 x i64>
 ret <4 x i64> %Y
}
3769
; Take the two lowest bytes of the <16 x i8> argument, sign-extend them to
; <2 x i16>, and return the pair reinterpreted (bitcast) as one i32.
;
; Lowering pinned by the autogenerated assertions below:
;  * SSE2 / SSSE3 runs -- punpcklbw duplicates each low byte into both halves
;    of a word, psraw by 8 leaves the sign-extended byte, and movd copies
;    the low 32 bits (the two words of interest) into eax.
;  * SSE4.1 and AVX runs -- pmovsxbw / vpmovsxbw followed by movd / vmovd.
;  * i686 SSE4.1 run -- same pmovsxbw + movd, wrapped in a pushl %eax /
;    popl %ecx pair with a CFA-offset directive. These assertions pin the
;    literal label .Ltmp0, so they depend on this being the first temporary
;    label llc emits for the file in that run.
define i32 @sext_2i8_to_i32(<16 x i8> %A) nounwind uwtable readnone ssp {
; SSE2-LABEL: sext_2i8_to_i32:
; SSE2:       # BB#0: # %entry
; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; SSE2-NEXT:    psraw $8, %xmm0
; SSE2-NEXT:    movd %xmm0, %eax
; SSE2-NEXT:    retq
;
; SSSE3-LABEL: sext_2i8_to_i32:
; SSSE3:       # BB#0: # %entry
; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; SSSE3-NEXT:    psraw $8, %xmm0
; SSSE3-NEXT:    movd %xmm0, %eax
; SSSE3-NEXT:    retq
;
; SSE41-LABEL: sext_2i8_to_i32:
; SSE41:       # BB#0: # %entry
; SSE41-NEXT:    pmovsxbw %xmm0, %xmm0
; SSE41-NEXT:    movd %xmm0, %eax
; SSE41-NEXT:    retq
;
; AVX-LABEL: sext_2i8_to_i32:
; AVX:       # BB#0: # %entry
; AVX-NEXT:    vpmovsxbw %xmm0, %xmm0
; AVX-NEXT:    vmovd %xmm0, %eax
; AVX-NEXT:    retq
;
; X32-SSE41-LABEL: sext_2i8_to_i32:
; X32-SSE41:       # BB#0: # %entry
; X32-SSE41-NEXT:    pushl %eax
; X32-SSE41-NEXT:  .Ltmp0:
; X32-SSE41-NEXT:    .cfi_def_cfa_offset 8
; X32-SSE41-NEXT:    pmovsxbw %xmm0, %xmm0
; X32-SSE41-NEXT:    movd %xmm0, %eax
; X32-SSE41-NEXT:    popl %ecx
; X32-SSE41-NEXT:    retl
entry:
  %Shuf = shufflevector <16 x i8> %A, <16 x i8> undef, <2 x i32> <i32 0, i32 1>
  %Ex = sext <2 x i8> %Shuf to <2 x i16>
  %Bc = bitcast <2 x i16> %Ex to i32
  ret i32 %Bc
}
3812
; Sign-extend a <4 x i1> mask argument to <4 x i64>, so each result lane is
; all-zeros or all-ones.
;
; Lowering pinned by the autogenerated assertions below. Every run starts
; with a shift-left / arithmetic-shift-right pair by 31 on the dword lanes of
; xmm0, i.e. bit 0 of each 32-bit lane is first sign-extended in place. The
; widening to 64 bits then differs per run:
;  * SSE2 / SSSE3 runs -- psrad by 31 builds the sign dwords and punpckldq
;    interleaves them with the values, once for the low half and once for
;    the upper half (brought down with pshufd [2,3,0,1]).
;  * SSE4.1 runs (64-bit and i686) -- pmovsxdq on the low half and on the
;    pshufd-swapped upper half; a final movdqa puts the low result in xmm0.
;  * AVX1 run -- two vpmovsxdq joined by vinsertf128.
;  * AVX2 run -- a single vpmovsxdq from xmm0 into ymm0.
define <4 x i64> @sext_4i1_to_4i64(<4 x i1> %mask) {
; SSE2-LABEL: sext_4i1_to_4i64:
; SSE2:       # BB#0:
; SSE2-NEXT:    pslld $31, %xmm0
; SSE2-NEXT:    psrad $31, %xmm0
; SSE2-NEXT:    movdqa %xmm0, %xmm2
; SSE2-NEXT:    psrad $31, %xmm2
; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
; SSE2-NEXT:    movdqa %xmm1, %xmm2
; SSE2-NEXT:    psrad $31, %xmm2
; SSE2-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
; SSE2-NEXT:    retq
;
; SSSE3-LABEL: sext_4i1_to_4i64:
; SSSE3:       # BB#0:
; SSSE3-NEXT:    pslld $31, %xmm0
; SSSE3-NEXT:    psrad $31, %xmm0
; SSSE3-NEXT:    movdqa %xmm0, %xmm2
; SSSE3-NEXT:    psrad $31, %xmm2
; SSSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; SSSE3-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
; SSSE3-NEXT:    movdqa %xmm1, %xmm2
; SSSE3-NEXT:    psrad $31, %xmm2
; SSSE3-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
; SSSE3-NEXT:    retq
;
; SSE41-LABEL: sext_4i1_to_4i64:
; SSE41:       # BB#0:
; SSE41-NEXT:    pslld $31, %xmm0
; SSE41-NEXT:    psrad $31, %xmm0
; SSE41-NEXT:    pmovsxdq %xmm0, %xmm2
; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
; SSE41-NEXT:    pmovsxdq %xmm0, %xmm1
; SSE41-NEXT:    movdqa %xmm2, %xmm0
; SSE41-NEXT:    retq
;
; AVX1-LABEL: sext_4i1_to_4i64:
; AVX1:       # BB#0:
; AVX1-NEXT:    vpslld $31, %xmm0, %xmm0
; AVX1-NEXT:    vpsrad $31, %xmm0, %xmm0
; AVX1-NEXT:    vpmovsxdq %xmm0, %xmm1
; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
; AVX1-NEXT:    vpmovsxdq %xmm0, %xmm0
; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: sext_4i1_to_4i64:
; AVX2:       # BB#0:
; AVX2-NEXT:    vpslld $31, %xmm0, %xmm0
; AVX2-NEXT:    vpsrad $31, %xmm0, %xmm0
; AVX2-NEXT:    vpmovsxdq %xmm0, %ymm0
; AVX2-NEXT:    retq
;
; X32-SSE41-LABEL: sext_4i1_to_4i64:
; X32-SSE41:       # BB#0:
; X32-SSE41-NEXT:    pslld $31, %xmm0
; X32-SSE41-NEXT:    psrad $31, %xmm0
; X32-SSE41-NEXT:    pmovsxdq %xmm0, %xmm2
; X32-SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
; X32-SSE41-NEXT:    pmovsxdq %xmm0, %xmm1
; X32-SSE41-NEXT:    movdqa %xmm2, %xmm0
; X32-SSE41-NEXT:    retl
  %extmask = sext <4 x i1> %mask to <4 x i64>
  ret <4 x i64> %extmask
}
3879
; Sign-extend a <4 x i8> argument to <4 x i64>.
;
; Lowering pinned by the autogenerated assertions below. Every run starts
; with a shift-left / arithmetic-shift-right pair by 24 on the dword lanes of
; xmm0, i.e. the low byte of each 32-bit lane is first sign-extended in
; place. The widening to 64 bits then differs per run:
;  * SSE2 / SSSE3 runs -- psrad by 31 builds the sign dwords and punpckldq
;    interleaves them with the values, once for the low half and once for
;    the upper half (brought down with pshufd [2,3,0,1]).
;  * SSE4.1 runs (64-bit and i686) -- pmovsxdq on the low half and on the
;    pshufd-swapped upper half; a final movdqa puts the low result in xmm0.
;  * AVX1 run -- two vpmovsxdq joined by vinsertf128.
;  * AVX2 run -- a single vpmovsxdq from xmm0 into ymm0.
define <4 x i64> @sext_4i8_to_4i64(<4 x i8> %mask) {
; SSE2-LABEL: sext_4i8_to_4i64:
; SSE2:       # BB#0:
; SSE2-NEXT:    pslld $24, %xmm0
; SSE2-NEXT:    psrad $24, %xmm0
; SSE2-NEXT:    movdqa %xmm0, %xmm2
; SSE2-NEXT:    psrad $31, %xmm2
; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
; SSE2-NEXT:    movdqa %xmm1, %xmm2
; SSE2-NEXT:    psrad $31, %xmm2
; SSE2-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
; SSE2-NEXT:    retq
;
; SSSE3-LABEL: sext_4i8_to_4i64:
; SSSE3:       # BB#0:
; SSSE3-NEXT:    pslld $24, %xmm0
; SSSE3-NEXT:    psrad $24, %xmm0
; SSSE3-NEXT:    movdqa %xmm0, %xmm2
; SSSE3-NEXT:    psrad $31, %xmm2
; SSSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; SSSE3-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
; SSSE3-NEXT:    movdqa %xmm1, %xmm2
; SSSE3-NEXT:    psrad $31, %xmm2
; SSSE3-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
; SSSE3-NEXT:    retq
;
; SSE41-LABEL: sext_4i8_to_4i64:
; SSE41:       # BB#0:
; SSE41-NEXT:    pslld $24, %xmm0
; SSE41-NEXT:    psrad $24, %xmm0
; SSE41-NEXT:    pmovsxdq %xmm0, %xmm2
; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
; SSE41-NEXT:    pmovsxdq %xmm0, %xmm1
; SSE41-NEXT:    movdqa %xmm2, %xmm0
; SSE41-NEXT:    retq
;
; AVX1-LABEL: sext_4i8_to_4i64:
; AVX1:       # BB#0:
; AVX1-NEXT:    vpslld $24, %xmm0, %xmm0
; AVX1-NEXT:    vpsrad $24, %xmm0, %xmm0
; AVX1-NEXT:    vpmovsxdq %xmm0, %xmm1
; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
; AVX1-NEXT:    vpmovsxdq %xmm0, %xmm0
; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: sext_4i8_to_4i64:
; AVX2:       # BB#0:
; AVX2-NEXT:    vpslld $24, %xmm0, %xmm0
; AVX2-NEXT:    vpsrad $24, %xmm0, %xmm0
; AVX2-NEXT:    vpmovsxdq %xmm0, %ymm0
; AVX2-NEXT:    retq
;
; X32-SSE41-LABEL: sext_4i8_to_4i64:
; X32-SSE41:       # BB#0:
; X32-SSE41-NEXT:    pslld $24, %xmm0
; X32-SSE41-NEXT:    psrad $24, %xmm0
; X32-SSE41-NEXT:    pmovsxdq %xmm0, %xmm2
; X32-SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
; X32-SSE41-NEXT:    pmovsxdq %xmm0, %xmm1
; X32-SSE41-NEXT:    movdqa %xmm2, %xmm0
; X32-SSE41-NEXT:    retl
  %extmask = sext <4 x i8> %mask to <4 x i64>
  ret <4 x i64> %extmask
}
3946