1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefixes=SSE,SSE2
3; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+ssse3 | FileCheck %s --check-prefixes=SSE,SSSE3
4; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefixes=SSE,SSE41
5; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefixes=AVX,AVX1
6; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=AVX,AVX2
7; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f | FileCheck %s --check-prefixes=AVX,AVX512,AVX512F
8; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512bw | FileCheck %s --check-prefixes=AVX,AVX512,AVX512BW
9;
10; Just two 32-bit runs to make sure we do reasonable things there.
11; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefixes=X86-SSE,X86-SSE2
12; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefixes=X86-SSE,X86-SSE41
13
; Sign-extends the low 8 bytes of a <16 x i8> argument (picked out by the
; shufflevector) to <8 x i16>.
; Expected lowering, as pinned by the autogenerated checks below:
;   - SSE2 and SSSE3 runs (64-bit and 32-bit) duplicate each byte into a word
;     with punpcklbw and then arithmetic-shift right by 8 (psraw).
;   - SSE4.1 and all AVX runs use a single pmovsxbw / vpmovsxbw.
; Do not hand-edit the check lines; regenerate them with
; utils/update_llc_test_checks.py.
14define <8 x i16> @sext_16i8_to_8i16(<16 x i8> %A) nounwind uwtable readnone ssp {
15; SSE2-LABEL: sext_16i8_to_8i16:
16; SSE2:       # %bb.0: # %entry
17; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
18; SSE2-NEXT:    psraw $8, %xmm0
19; SSE2-NEXT:    retq
20;
21; SSSE3-LABEL: sext_16i8_to_8i16:
22; SSSE3:       # %bb.0: # %entry
23; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
24; SSSE3-NEXT:    psraw $8, %xmm0
25; SSSE3-NEXT:    retq
26;
27; SSE41-LABEL: sext_16i8_to_8i16:
28; SSE41:       # %bb.0: # %entry
29; SSE41-NEXT:    pmovsxbw %xmm0, %xmm0
30; SSE41-NEXT:    retq
31;
32; AVX-LABEL: sext_16i8_to_8i16:
33; AVX:       # %bb.0: # %entry
34; AVX-NEXT:    vpmovsxbw %xmm0, %xmm0
35; AVX-NEXT:    retq
36;
37; X86-SSE2-LABEL: sext_16i8_to_8i16:
38; X86-SSE2:       # %bb.0: # %entry
39; X86-SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
40; X86-SSE2-NEXT:    psraw $8, %xmm0
41; X86-SSE2-NEXT:    retl
42;
43; X86-SSE41-LABEL: sext_16i8_to_8i16:
44; X86-SSE41:       # %bb.0: # %entry
45; X86-SSE41-NEXT:    pmovsxbw %xmm0, %xmm0
46; X86-SSE41-NEXT:    retl
47entry:
48  %B = shufflevector <16 x i8> %A, <16 x i8> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
49  %C = sext <8 x i8> %B to <8 x i16>
50  ret <8 x i16> %C
51}
52
; Sign-extends all 16 bytes of a <16 x i8> argument to <16 x i16>
; (a 256-bit result).
; Expected lowering, as pinned by the autogenerated checks below:
;   - SSE2 and SSSE3 runs unpack the low and high halves separately
;     (punpcklbw / punpckhbw), shift each by 8 with psraw, and hand the halves
;     back in xmm0 and xmm1.
;   - SSE4.1 runs issue two pmovsxbw, using pshufd [2,3,2,3] to bring the
;     upper 8 bytes down for the second one.
;   - The AVX1 run does the same with 128-bit vpmovsxbw and joins the halves
;     with vinsertf128.
;   - AVX2 and AVX-512 runs use a single vpmovsxbw from xmm to ymm.
53define <16 x i16> @sext_16i8_to_16i16(<16 x i8> %A) nounwind uwtable readnone ssp {
54; SSE2-LABEL: sext_16i8_to_16i16:
55; SSE2:       # %bb.0: # %entry
56; SSE2-NEXT:    punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
57; SSE2-NEXT:    psraw $8, %xmm2
58; SSE2-NEXT:    punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15]
59; SSE2-NEXT:    psraw $8, %xmm1
60; SSE2-NEXT:    movdqa %xmm2, %xmm0
61; SSE2-NEXT:    retq
62;
63; SSSE3-LABEL: sext_16i8_to_16i16:
64; SSSE3:       # %bb.0: # %entry
65; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
66; SSSE3-NEXT:    psraw $8, %xmm2
67; SSSE3-NEXT:    punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15]
68; SSSE3-NEXT:    psraw $8, %xmm1
69; SSSE3-NEXT:    movdqa %xmm2, %xmm0
70; SSSE3-NEXT:    retq
71;
72; SSE41-LABEL: sext_16i8_to_16i16:
73; SSE41:       # %bb.0: # %entry
74; SSE41-NEXT:    pmovsxbw %xmm0, %xmm2
75; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
76; SSE41-NEXT:    pmovsxbw %xmm0, %xmm1
77; SSE41-NEXT:    movdqa %xmm2, %xmm0
78; SSE41-NEXT:    retq
79;
80; AVX1-LABEL: sext_16i8_to_16i16:
81; AVX1:       # %bb.0: # %entry
82; AVX1-NEXT:    vpmovsxbw %xmm0, %xmm1
83; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
84; AVX1-NEXT:    vpmovsxbw %xmm0, %xmm0
85; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
86; AVX1-NEXT:    retq
87;
88; AVX2-LABEL: sext_16i8_to_16i16:
89; AVX2:       # %bb.0: # %entry
90; AVX2-NEXT:    vpmovsxbw %xmm0, %ymm0
91; AVX2-NEXT:    retq
92;
93; AVX512-LABEL: sext_16i8_to_16i16:
94; AVX512:       # %bb.0: # %entry
95; AVX512-NEXT:    vpmovsxbw %xmm0, %ymm0
96; AVX512-NEXT:    retq
97;
98; X86-SSE2-LABEL: sext_16i8_to_16i16:
99; X86-SSE2:       # %bb.0: # %entry
100; X86-SSE2-NEXT:    punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
101; X86-SSE2-NEXT:    psraw $8, %xmm2
102; X86-SSE2-NEXT:    punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15]
103; X86-SSE2-NEXT:    psraw $8, %xmm1
104; X86-SSE2-NEXT:    movdqa %xmm2, %xmm0
105; X86-SSE2-NEXT:    retl
106;
107; X86-SSE41-LABEL: sext_16i8_to_16i16:
108; X86-SSE41:       # %bb.0: # %entry
109; X86-SSE41-NEXT:    pmovsxbw %xmm0, %xmm2
110; X86-SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
111; X86-SSE41-NEXT:    pmovsxbw %xmm0, %xmm1
112; X86-SSE41-NEXT:    movdqa %xmm2, %xmm0
113; X86-SSE41-NEXT:    retl
114entry:
115  %B = sext <16 x i8> %A to <16 x i16>
116  ret <16 x i16> %B
117}
118
; Sign-extends a full <32 x i8> argument to <32 x i16> (a 512-bit result).
; This is the only test in this group where the two AVX-512 runs differ, so
; they are checked under separate AVX512F / AVX512BW prefixes.
; Expected lowering, as pinned by the autogenerated checks below:
;   - SSE2 and SSSE3 runs produce four 128-bit pieces via punpcklbw/punpckhbw
;     followed by psraw by 8, returned in xmm0..xmm3.
;   - SSE4.1 runs use four pmovsxbw, with pshufd [2,3,2,3] supplying the upper
;     8 bytes of each input register.
;   - The AVX1 run builds two ymm results from four 128-bit vpmovsxbw
;     (vextractf128 for the upper input half, vinsertf128 to join).
;   - The AVX2 run uses two xmm-to-ymm vpmovsxbw with a vextracti128.
;   - The avx512f-only run widens each 128-bit half to ymm and joins them with
;     vinserti64x4.
;   - The avx512bw run uses a single ymm-to-zmm vpmovsxbw.
119define <32 x i16> @sext_32i8_to_32i16(<32 x i8> %A) nounwind uwtable readnone ssp {
120; SSE2-LABEL: sext_32i8_to_32i16:
121; SSE2:       # %bb.0: # %entry
122; SSE2-NEXT:    punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1],xmm4[2],xmm0[2],xmm4[3],xmm0[3],xmm4[4],xmm0[4],xmm4[5],xmm0[5],xmm4[6],xmm0[6],xmm4[7],xmm0[7]
123; SSE2-NEXT:    psraw $8, %xmm4
124; SSE2-NEXT:    punpckhbw {{.*#+}} xmm5 = xmm5[8],xmm0[8],xmm5[9],xmm0[9],xmm5[10],xmm0[10],xmm5[11],xmm0[11],xmm5[12],xmm0[12],xmm5[13],xmm0[13],xmm5[14],xmm0[14],xmm5[15],xmm0[15]
125; SSE2-NEXT:    psraw $8, %xmm5
126; SSE2-NEXT:    punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3],xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7]
127; SSE2-NEXT:    psraw $8, %xmm2
128; SSE2-NEXT:    punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm1[8],xmm3[9],xmm1[9],xmm3[10],xmm1[10],xmm3[11],xmm1[11],xmm3[12],xmm1[12],xmm3[13],xmm1[13],xmm3[14],xmm1[14],xmm3[15],xmm1[15]
129; SSE2-NEXT:    psraw $8, %xmm3
130; SSE2-NEXT:    movdqa %xmm4, %xmm0
131; SSE2-NEXT:    movdqa %xmm5, %xmm1
132; SSE2-NEXT:    retq
133;
134; SSSE3-LABEL: sext_32i8_to_32i16:
135; SSSE3:       # %bb.0: # %entry
136; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1],xmm4[2],xmm0[2],xmm4[3],xmm0[3],xmm4[4],xmm0[4],xmm4[5],xmm0[5],xmm4[6],xmm0[6],xmm4[7],xmm0[7]
137; SSSE3-NEXT:    psraw $8, %xmm4
138; SSSE3-NEXT:    punpckhbw {{.*#+}} xmm5 = xmm5[8],xmm0[8],xmm5[9],xmm0[9],xmm5[10],xmm0[10],xmm5[11],xmm0[11],xmm5[12],xmm0[12],xmm5[13],xmm0[13],xmm5[14],xmm0[14],xmm5[15],xmm0[15]
139; SSSE3-NEXT:    psraw $8, %xmm5
140; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3],xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7]
141; SSSE3-NEXT:    psraw $8, %xmm2
142; SSSE3-NEXT:    punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm1[8],xmm3[9],xmm1[9],xmm3[10],xmm1[10],xmm3[11],xmm1[11],xmm3[12],xmm1[12],xmm3[13],xmm1[13],xmm3[14],xmm1[14],xmm3[15],xmm1[15]
143; SSSE3-NEXT:    psraw $8, %xmm3
144; SSSE3-NEXT:    movdqa %xmm4, %xmm0
145; SSSE3-NEXT:    movdqa %xmm5, %xmm1
146; SSSE3-NEXT:    retq
147;
148; SSE41-LABEL: sext_32i8_to_32i16:
149; SSE41:       # %bb.0: # %entry
150; SSE41-NEXT:    pmovsxbw %xmm0, %xmm5
151; SSE41-NEXT:    pmovsxbw %xmm1, %xmm2
152; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
153; SSE41-NEXT:    pmovsxbw %xmm0, %xmm4
154; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3]
155; SSE41-NEXT:    pmovsxbw %xmm0, %xmm3
156; SSE41-NEXT:    movdqa %xmm5, %xmm0
157; SSE41-NEXT:    movdqa %xmm4, %xmm1
158; SSE41-NEXT:    retq
159;
160; AVX1-LABEL: sext_32i8_to_32i16:
161; AVX1:       # %bb.0: # %entry
162; AVX1-NEXT:    vpmovsxbw %xmm0, %xmm1
163; AVX1-NEXT:    vpshufd {{.*#+}} xmm2 = xmm0[2,3,2,3]
164; AVX1-NEXT:    vpmovsxbw %xmm2, %xmm2
165; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm1, %ymm2
166; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
167; AVX1-NEXT:    vpmovsxbw %xmm0, %xmm1
168; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
169; AVX1-NEXT:    vpmovsxbw %xmm0, %xmm0
170; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm1
171; AVX1-NEXT:    vmovaps %ymm2, %ymm0
172; AVX1-NEXT:    retq
173;
174; AVX2-LABEL: sext_32i8_to_32i16:
175; AVX2:       # %bb.0: # %entry
176; AVX2-NEXT:    vpmovsxbw %xmm0, %ymm2
177; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm0
178; AVX2-NEXT:    vpmovsxbw %xmm0, %ymm1
179; AVX2-NEXT:    vmovdqa %ymm2, %ymm0
180; AVX2-NEXT:    retq
181;
182; AVX512F-LABEL: sext_32i8_to_32i16:
183; AVX512F:       # %bb.0: # %entry
184; AVX512F-NEXT:    vpmovsxbw %xmm0, %ymm1
185; AVX512F-NEXT:    vextracti128 $1, %ymm0, %xmm0
186; AVX512F-NEXT:    vpmovsxbw %xmm0, %ymm0
187; AVX512F-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0
188; AVX512F-NEXT:    retq
189;
190; AVX512BW-LABEL: sext_32i8_to_32i16:
191; AVX512BW:       # %bb.0: # %entry
192; AVX512BW-NEXT:    vpmovsxbw %ymm0, %zmm0
193; AVX512BW-NEXT:    retq
194;
195; X86-SSE2-LABEL: sext_32i8_to_32i16:
196; X86-SSE2:       # %bb.0: # %entry
197; X86-SSE2-NEXT:    punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1],xmm4[2],xmm0[2],xmm4[3],xmm0[3],xmm4[4],xmm0[4],xmm4[5],xmm0[5],xmm4[6],xmm0[6],xmm4[7],xmm0[7]
198; X86-SSE2-NEXT:    psraw $8, %xmm4
199; X86-SSE2-NEXT:    punpckhbw {{.*#+}} xmm5 = xmm5[8],xmm0[8],xmm5[9],xmm0[9],xmm5[10],xmm0[10],xmm5[11],xmm0[11],xmm5[12],xmm0[12],xmm5[13],xmm0[13],xmm5[14],xmm0[14],xmm5[15],xmm0[15]
200; X86-SSE2-NEXT:    psraw $8, %xmm5
201; X86-SSE2-NEXT:    punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3],xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7]
202; X86-SSE2-NEXT:    psraw $8, %xmm2
203; X86-SSE2-NEXT:    punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm1[8],xmm3[9],xmm1[9],xmm3[10],xmm1[10],xmm3[11],xmm1[11],xmm3[12],xmm1[12],xmm3[13],xmm1[13],xmm3[14],xmm1[14],xmm3[15],xmm1[15]
204; X86-SSE2-NEXT:    psraw $8, %xmm3
205; X86-SSE2-NEXT:    movdqa %xmm4, %xmm0
206; X86-SSE2-NEXT:    movdqa %xmm5, %xmm1
207; X86-SSE2-NEXT:    retl
208;
209; X86-SSE41-LABEL: sext_32i8_to_32i16:
210; X86-SSE41:       # %bb.0: # %entry
211; X86-SSE41-NEXT:    pmovsxbw %xmm0, %xmm5
212; X86-SSE41-NEXT:    pmovsxbw %xmm1, %xmm2
213; X86-SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
214; X86-SSE41-NEXT:    pmovsxbw %xmm0, %xmm4
215; X86-SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3]
216; X86-SSE41-NEXT:    pmovsxbw %xmm0, %xmm3
217; X86-SSE41-NEXT:    movdqa %xmm5, %xmm0
218; X86-SSE41-NEXT:    movdqa %xmm4, %xmm1
219; X86-SSE41-NEXT:    retl
220entry:
221  %B = sext <32 x i8> %A to <32 x i16>
222  ret <32 x i16> %B
223}
224
; Sign-extends the low 4 bytes of a <16 x i8> argument (picked out by the
; shufflevector) to <4 x i32>.
; Expected lowering, as pinned by the autogenerated checks below:
;   - SSE2 and SSSE3 runs widen in two unpack steps (punpcklbw, then
;     punpcklwd) so each source byte ends up in the top byte of a dword, then
;     arithmetic-shift right by 24 (psrad).
;   - SSE4.1 and all AVX runs use a single pmovsxbd / vpmovsxbd.
225define <4 x i32> @sext_16i8_to_4i32(<16 x i8> %A) nounwind uwtable readnone ssp {
226; SSE2-LABEL: sext_16i8_to_4i32:
227; SSE2:       # %bb.0: # %entry
228; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
229; SSE2-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
230; SSE2-NEXT:    psrad $24, %xmm0
231; SSE2-NEXT:    retq
232;
233; SSSE3-LABEL: sext_16i8_to_4i32:
234; SSSE3:       # %bb.0: # %entry
235; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
236; SSSE3-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
237; SSSE3-NEXT:    psrad $24, %xmm0
238; SSSE3-NEXT:    retq
239;
240; SSE41-LABEL: sext_16i8_to_4i32:
241; SSE41:       # %bb.0: # %entry
242; SSE41-NEXT:    pmovsxbd %xmm0, %xmm0
243; SSE41-NEXT:    retq
244;
245; AVX-LABEL: sext_16i8_to_4i32:
246; AVX:       # %bb.0: # %entry
247; AVX-NEXT:    vpmovsxbd %xmm0, %xmm0
248; AVX-NEXT:    retq
249;
250; X86-SSE2-LABEL: sext_16i8_to_4i32:
251; X86-SSE2:       # %bb.0: # %entry
252; X86-SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
253; X86-SSE2-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
254; X86-SSE2-NEXT:    psrad $24, %xmm0
255; X86-SSE2-NEXT:    retl
256;
257; X86-SSE41-LABEL: sext_16i8_to_4i32:
258; X86-SSE41:       # %bb.0: # %entry
259; X86-SSE41-NEXT:    pmovsxbd %xmm0, %xmm0
260; X86-SSE41-NEXT:    retl
261entry:
262  %B = shufflevector <16 x i8> %A, <16 x i8> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
263  %C = sext <4 x i8> %B to <4 x i32>
264  ret <4 x i32> %C
265}
266
; Sign-extends the low 8 bytes of a <16 x i8> argument (picked out by the
; shufflevector) to <8 x i32> (a 256-bit result).
; Expected lowering, as pinned by the autogenerated checks below:
;   - SSE2 and SSSE3 runs unpack bytes to words once (punpcklbw), then split
;     into low/high dword groups with punpcklwd / punpckhwd, each followed by
;     an arithmetic shift right by 24 (psrad); results are in xmm0 and xmm1.
;   - SSE4.1 runs use two pmovsxbd, with pshufd [1,1,1,1] bringing bytes 4..7
;     down for the second one.
;   - The AVX1 run does the same with vpmovsxbd and joins with vinsertf128.
;   - AVX2 and AVX-512 runs use a single xmm-to-ymm vpmovsxbd.
267define <8 x i32> @sext_16i8_to_8i32(<16 x i8> %A) nounwind uwtable readnone ssp {
268; SSE2-LABEL: sext_16i8_to_8i32:
269; SSE2:       # %bb.0: # %entry
270; SSE2-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
271; SSE2-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
272; SSE2-NEXT:    psrad $24, %xmm0
273; SSE2-NEXT:    punpckhwd {{.*#+}} xmm1 = xmm1[4,4,5,5,6,6,7,7]
274; SSE2-NEXT:    psrad $24, %xmm1
275; SSE2-NEXT:    retq
276;
277; SSSE3-LABEL: sext_16i8_to_8i32:
278; SSSE3:       # %bb.0: # %entry
279; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
280; SSSE3-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
281; SSSE3-NEXT:    psrad $24, %xmm0
282; SSSE3-NEXT:    punpckhwd {{.*#+}} xmm1 = xmm1[4,4,5,5,6,6,7,7]
283; SSSE3-NEXT:    psrad $24, %xmm1
284; SSSE3-NEXT:    retq
285;
286; SSE41-LABEL: sext_16i8_to_8i32:
287; SSE41:       # %bb.0: # %entry
288; SSE41-NEXT:    pmovsxbd %xmm0, %xmm2
289; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,1,1]
290; SSE41-NEXT:    pmovsxbd %xmm0, %xmm1
291; SSE41-NEXT:    movdqa %xmm2, %xmm0
292; SSE41-NEXT:    retq
293;
294; AVX1-LABEL: sext_16i8_to_8i32:
295; AVX1:       # %bb.0: # %entry
296; AVX1-NEXT:    vpmovsxbd %xmm0, %xmm1
297; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[1,1,1,1]
298; AVX1-NEXT:    vpmovsxbd %xmm0, %xmm0
299; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
300; AVX1-NEXT:    retq
301;
302; AVX2-LABEL: sext_16i8_to_8i32:
303; AVX2:       # %bb.0: # %entry
304; AVX2-NEXT:    vpmovsxbd %xmm0, %ymm0
305; AVX2-NEXT:    retq
306;
307; AVX512-LABEL: sext_16i8_to_8i32:
308; AVX512:       # %bb.0: # %entry
309; AVX512-NEXT:    vpmovsxbd %xmm0, %ymm0
310; AVX512-NEXT:    retq
311;
312; X86-SSE2-LABEL: sext_16i8_to_8i32:
313; X86-SSE2:       # %bb.0: # %entry
314; X86-SSE2-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
315; X86-SSE2-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
316; X86-SSE2-NEXT:    psrad $24, %xmm0
317; X86-SSE2-NEXT:    punpckhwd {{.*#+}} xmm1 = xmm1[4,4,5,5,6,6,7,7]
318; X86-SSE2-NEXT:    psrad $24, %xmm1
319; X86-SSE2-NEXT:    retl
320;
321; X86-SSE41-LABEL: sext_16i8_to_8i32:
322; X86-SSE41:       # %bb.0: # %entry
323; X86-SSE41-NEXT:    pmovsxbd %xmm0, %xmm2
324; X86-SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,1,1]
325; X86-SSE41-NEXT:    pmovsxbd %xmm0, %xmm1
326; X86-SSE41-NEXT:    movdqa %xmm2, %xmm0
327; X86-SSE41-NEXT:    retl
328entry:
329  %B = shufflevector <16 x i8> %A, <16 x i8> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
330  %C = sext <8 x i8> %B to <8 x i32>
331  ret <8 x i32> %C
332}
333
; Sign-extends all 16 bytes of a <16 x i8> argument to <16 x i32>
; (a 512-bit result).
; Expected lowering, as pinned by the autogenerated checks below:
;   - SSE2 and SSSE3 runs unpack bytes to words (punpcklbw / punpckhbw), then
;     words to dwords (punpcklwd / punpckhwd), and arithmetic-shift each of
;     the four pieces right by 24 (psrad); results are in xmm0..xmm3.
;   - SSE4.1 runs use four pmovsxbd, with pshufd [1,1,1,1], [2,3,2,3] and
;     [3,3,3,3] selecting the successive 4-byte groups.
;   - The AVX1 run builds two ymm results from four vpmovsxbd + vinsertf128.
;   - The AVX2 run uses two xmm-to-ymm vpmovsxbd, with vpshufd [2,3,2,3] for
;     the upper 8 bytes.
;   - AVX-512 runs use a single xmm-to-zmm vpmovsxbd.
334define <16 x i32> @sext_16i8_to_16i32(<16 x i8> %A) nounwind uwtable readnone ssp {
335; SSE2-LABEL: sext_16i8_to_16i32:
336; SSE2:       # %bb.0: # %entry
337; SSE2-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
338; SSE2-NEXT:    punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1],xmm4[2],xmm1[2],xmm4[3],xmm1[3]
339; SSE2-NEXT:    psrad $24, %xmm4
340; SSE2-NEXT:    punpckhwd {{.*#+}} xmm1 = xmm1[4,4,5,5,6,6,7,7]
341; SSE2-NEXT:    psrad $24, %xmm1
342; SSE2-NEXT:    punpckhbw {{.*#+}} xmm0 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
343; SSE2-NEXT:    punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3]
344; SSE2-NEXT:    psrad $24, %xmm2
345; SSE2-NEXT:    punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7]
346; SSE2-NEXT:    psrad $24, %xmm3
347; SSE2-NEXT:    movdqa %xmm4, %xmm0
348; SSE2-NEXT:    retq
349;
350; SSSE3-LABEL: sext_16i8_to_16i32:
351; SSSE3:       # %bb.0: # %entry
352; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
353; SSSE3-NEXT:    punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1],xmm4[2],xmm1[2],xmm4[3],xmm1[3]
354; SSSE3-NEXT:    psrad $24, %xmm4
355; SSSE3-NEXT:    punpckhwd {{.*#+}} xmm1 = xmm1[4,4,5,5,6,6,7,7]
356; SSSE3-NEXT:    psrad $24, %xmm1
357; SSSE3-NEXT:    punpckhbw {{.*#+}} xmm0 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
358; SSSE3-NEXT:    punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3]
359; SSSE3-NEXT:    psrad $24, %xmm2
360; SSSE3-NEXT:    punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7]
361; SSSE3-NEXT:    psrad $24, %xmm3
362; SSSE3-NEXT:    movdqa %xmm4, %xmm0
363; SSSE3-NEXT:    retq
364;
365; SSE41-LABEL: sext_16i8_to_16i32:
366; SSE41:       # %bb.0: # %entry
367; SSE41-NEXT:    pmovsxbd %xmm0, %xmm4
368; SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
369; SSE41-NEXT:    pmovsxbd %xmm1, %xmm1
370; SSE41-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[2,3,2,3]
371; SSE41-NEXT:    pmovsxbd %xmm2, %xmm2
372; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[3,3,3,3]
373; SSE41-NEXT:    pmovsxbd %xmm0, %xmm3
374; SSE41-NEXT:    movdqa %xmm4, %xmm0
375; SSE41-NEXT:    retq
376;
377; AVX1-LABEL: sext_16i8_to_16i32:
378; AVX1:       # %bb.0: # %entry
379; AVX1-NEXT:    vpmovsxbd %xmm0, %xmm1
380; AVX1-NEXT:    vpshufd {{.*#+}} xmm2 = xmm0[1,1,1,1]
381; AVX1-NEXT:    vpmovsxbd %xmm2, %xmm2
382; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm1, %ymm2
383; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
384; AVX1-NEXT:    vpmovsxbd %xmm1, %xmm1
385; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[3,3,3,3]
386; AVX1-NEXT:    vpmovsxbd %xmm0, %xmm0
387; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm1
388; AVX1-NEXT:    vmovaps %ymm2, %ymm0
389; AVX1-NEXT:    retq
390;
391; AVX2-LABEL: sext_16i8_to_16i32:
392; AVX2:       # %bb.0: # %entry
393; AVX2-NEXT:    vpmovsxbd %xmm0, %ymm2
394; AVX2-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
395; AVX2-NEXT:    vpmovsxbd %xmm0, %ymm1
396; AVX2-NEXT:    vmovdqa %ymm2, %ymm0
397; AVX2-NEXT:    retq
398;
399; AVX512-LABEL: sext_16i8_to_16i32:
400; AVX512:       # %bb.0: # %entry
401; AVX512-NEXT:    vpmovsxbd %xmm0, %zmm0
402; AVX512-NEXT:    retq
403;
404; X86-SSE2-LABEL: sext_16i8_to_16i32:
405; X86-SSE2:       # %bb.0: # %entry
406; X86-SSE2-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
407; X86-SSE2-NEXT:    punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1],xmm4[2],xmm1[2],xmm4[3],xmm1[3]
408; X86-SSE2-NEXT:    psrad $24, %xmm4
409; X86-SSE2-NEXT:    punpckhwd {{.*#+}} xmm1 = xmm1[4,4,5,5,6,6,7,7]
410; X86-SSE2-NEXT:    psrad $24, %xmm1
411; X86-SSE2-NEXT:    punpckhbw {{.*#+}} xmm0 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
412; X86-SSE2-NEXT:    punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3]
413; X86-SSE2-NEXT:    psrad $24, %xmm2
414; X86-SSE2-NEXT:    punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7]
415; X86-SSE2-NEXT:    psrad $24, %xmm3
416; X86-SSE2-NEXT:    movdqa %xmm4, %xmm0
417; X86-SSE2-NEXT:    retl
418;
419; X86-SSE41-LABEL: sext_16i8_to_16i32:
420; X86-SSE41:       # %bb.0: # %entry
421; X86-SSE41-NEXT:    pmovsxbd %xmm0, %xmm4
422; X86-SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
423; X86-SSE41-NEXT:    pmovsxbd %xmm1, %xmm1
424; X86-SSE41-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[2,3,2,3]
425; X86-SSE41-NEXT:    pmovsxbd %xmm2, %xmm2
426; X86-SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[3,3,3,3]
427; X86-SSE41-NEXT:    pmovsxbd %xmm0, %xmm3
428; X86-SSE41-NEXT:    movdqa %xmm4, %xmm0
429; X86-SSE41-NEXT:    retl
430entry:
431  %B = sext <16 x i8> %A to <16 x i32>
432  ret <16 x i32> %B
433}
434
; Sign-extends the low 2 bytes of a <16 x i8> argument (picked out by the
; shufflevector) to <2 x i64>.
; Expected lowering, as pinned by the autogenerated checks below:
;   - SSE2 and SSSE3 runs have no 64-bit arithmetic shift to lean on, so the
;     checks expect the bytes to be widened to dwords (punpcklbw, punpcklwd,
;     psrad by 24) while a separate sign mask is produced by comparing zero
;     against the pre-shift value (pxor + pcmpgtd); punpckldq then interleaves
;     value and mask dwords to form the two qwords.
;   - SSE4.1 and all AVX runs use a single pmovsxbq / vpmovsxbq.
435define <2 x i64> @sext_16i8_to_2i64(<16 x i8> %A) nounwind uwtable readnone ssp {
436; SSE2-LABEL: sext_16i8_to_2i64:
437; SSE2:       # %bb.0: # %entry
438; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
439; SSE2-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
440; SSE2-NEXT:    pxor %xmm1, %xmm1
441; SSE2-NEXT:    pcmpgtd %xmm0, %xmm1
442; SSE2-NEXT:    psrad $24, %xmm0
443; SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
444; SSE2-NEXT:    retq
445;
446; SSSE3-LABEL: sext_16i8_to_2i64:
447; SSSE3:       # %bb.0: # %entry
448; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
449; SSSE3-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
450; SSSE3-NEXT:    pxor %xmm1, %xmm1
451; SSSE3-NEXT:    pcmpgtd %xmm0, %xmm1
452; SSSE3-NEXT:    psrad $24, %xmm0
453; SSSE3-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
454; SSSE3-NEXT:    retq
455;
456; SSE41-LABEL: sext_16i8_to_2i64:
457; SSE41:       # %bb.0: # %entry
458; SSE41-NEXT:    pmovsxbq %xmm0, %xmm0
459; SSE41-NEXT:    retq
460;
461; AVX-LABEL: sext_16i8_to_2i64:
462; AVX:       # %bb.0: # %entry
463; AVX-NEXT:    vpmovsxbq %xmm0, %xmm0
464; AVX-NEXT:    retq
465;
466; X86-SSE2-LABEL: sext_16i8_to_2i64:
467; X86-SSE2:       # %bb.0: # %entry
468; X86-SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
469; X86-SSE2-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
470; X86-SSE2-NEXT:    pxor %xmm1, %xmm1
471; X86-SSE2-NEXT:    pcmpgtd %xmm0, %xmm1
472; X86-SSE2-NEXT:    psrad $24, %xmm0
473; X86-SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
474; X86-SSE2-NEXT:    retl
475;
476; X86-SSE41-LABEL: sext_16i8_to_2i64:
477; X86-SSE41:       # %bb.0: # %entry
478; X86-SSE41-NEXT:    pmovsxbq %xmm0, %xmm0
479; X86-SSE41-NEXT:    retl
480entry:
481  %B = shufflevector <16 x i8> %A, <16 x i8> undef, <2 x i32> <i32 0, i32 1>
482  %C = sext <2 x i8> %B to <2 x i64>
483  ret <2 x i64> %C
484}
485
; Sign-extends the low 4 bytes of a <16 x i8> argument (picked out by the
; shufflevector) to <4 x i64> (a 256-bit result).
; Expected lowering, as pinned by the autogenerated checks below:
;   - SSE2 and SSSE3 runs first sign-extend to dwords (punpcklbw, punpcklwd,
;     psrad by 24), build a sign mask with pxor + pcmpgtd, and interleave
;     value/mask dwords with punpckldq (low pair) and punpckhdq (high pair);
;     results are in xmm0 and xmm1.
;   - SSE4.1 runs use two pmovsxbq, shifting the source right by 16 bits
;     (psrld) so bytes 2..3 feed the second one.
;   - The AVX1 run does the same with vpmovsxbq and joins with vinsertf128.
;   - AVX2 and AVX-512 runs use a single xmm-to-ymm vpmovsxbq.
486define <4 x i64> @sext_16i8_to_4i64(<16 x i8> %A) nounwind uwtable readnone ssp {
487; SSE2-LABEL: sext_16i8_to_4i64:
488; SSE2:       # %bb.0: # %entry
489; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
490; SSE2-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
491; SSE2-NEXT:    psrad $24, %xmm1
492; SSE2-NEXT:    pxor %xmm2, %xmm2
493; SSE2-NEXT:    pcmpgtd %xmm1, %xmm2
494; SSE2-NEXT:    movdqa %xmm1, %xmm0
495; SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
496; SSE2-NEXT:    punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm2[2],xmm1[3],xmm2[3]
497; SSE2-NEXT:    retq
498;
499; SSSE3-LABEL: sext_16i8_to_4i64:
500; SSSE3:       # %bb.0: # %entry
501; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
502; SSSE3-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
503; SSSE3-NEXT:    psrad $24, %xmm1
504; SSSE3-NEXT:    pxor %xmm2, %xmm2
505; SSSE3-NEXT:    pcmpgtd %xmm1, %xmm2
506; SSSE3-NEXT:    movdqa %xmm1, %xmm0
507; SSSE3-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
508; SSSE3-NEXT:    punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm2[2],xmm1[3],xmm2[3]
509; SSSE3-NEXT:    retq
510;
511; SSE41-LABEL: sext_16i8_to_4i64:
512; SSE41:       # %bb.0: # %entry
513; SSE41-NEXT:    pmovsxbq %xmm0, %xmm2
514; SSE41-NEXT:    psrld $16, %xmm0
515; SSE41-NEXT:    pmovsxbq %xmm0, %xmm1
516; SSE41-NEXT:    movdqa %xmm2, %xmm0
517; SSE41-NEXT:    retq
518;
519; AVX1-LABEL: sext_16i8_to_4i64:
520; AVX1:       # %bb.0: # %entry
521; AVX1-NEXT:    vpmovsxbq %xmm0, %xmm1
522; AVX1-NEXT:    vpsrld $16, %xmm0, %xmm0
523; AVX1-NEXT:    vpmovsxbq %xmm0, %xmm0
524; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
525; AVX1-NEXT:    retq
526;
527; AVX2-LABEL: sext_16i8_to_4i64:
528; AVX2:       # %bb.0: # %entry
529; AVX2-NEXT:    vpmovsxbq %xmm0, %ymm0
530; AVX2-NEXT:    retq
531;
532; AVX512-LABEL: sext_16i8_to_4i64:
533; AVX512:       # %bb.0: # %entry
534; AVX512-NEXT:    vpmovsxbq %xmm0, %ymm0
535; AVX512-NEXT:    retq
536;
537; X86-SSE2-LABEL: sext_16i8_to_4i64:
538; X86-SSE2:       # %bb.0: # %entry
539; X86-SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
540; X86-SSE2-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
541; X86-SSE2-NEXT:    psrad $24, %xmm1
542; X86-SSE2-NEXT:    pxor %xmm2, %xmm2
543; X86-SSE2-NEXT:    pcmpgtd %xmm1, %xmm2
544; X86-SSE2-NEXT:    movdqa %xmm1, %xmm0
545; X86-SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
546; X86-SSE2-NEXT:    punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm2[2],xmm1[3],xmm2[3]
547; X86-SSE2-NEXT:    retl
548;
549; X86-SSE41-LABEL: sext_16i8_to_4i64:
550; X86-SSE41:       # %bb.0: # %entry
551; X86-SSE41-NEXT:    pmovsxbq %xmm0, %xmm2
552; X86-SSE41-NEXT:    psrld $16, %xmm0
553; X86-SSE41-NEXT:    pmovsxbq %xmm0, %xmm1
554; X86-SSE41-NEXT:    movdqa %xmm2, %xmm0
555; X86-SSE41-NEXT:    retl
556entry:
557  %B = shufflevector <16 x i8> %A, <16 x i8> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
558  %C = sext <4 x i8> %B to <4 x i64>
559  ret <4 x i64> %C
560}
561
; Sign-extends the low 8 bytes of a <16 x i8> argument (picked out by the
; shufflevector) to <8 x i64> (a 512-bit result).
; Expected lowering, as pinned by the autogenerated checks below:
;   - SSE2 and SSSE3 runs sign-extend to two dword vectors (punpcklbw, then
;     punpcklwd / punpckhwd, each with psrad by 24), build a sign mask for
;     each with pcmpgtd against a zeroed register, and interleave value/mask
;     dwords with punpckldq / punpckhdq; results are in xmm0..xmm3.
;   - SSE4.1 runs use four pmovsxbq; psrld by 16, pshufd [1,1,1,1] and psrlq
;     by 48 position the successive byte pairs.
;   - The AVX1 run builds two ymm results from four vpmovsxbq + vinsertf128.
;   - The AVX2 run uses two xmm-to-ymm vpmovsxbq, with vpshufd [1,1,1,1] for
;     bytes 4..7.
;   - AVX-512 runs use a single xmm-to-zmm vpmovsxbq.
562define <8 x i64> @sext_16i8_to_8i64(<16 x i8> %A) nounwind uwtable readnone ssp {
563; SSE2-LABEL: sext_16i8_to_8i64:
564; SSE2:       # %bb.0: # %entry
565; SSE2-NEXT:    punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
566; SSE2-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
567; SSE2-NEXT:    psrad $24, %xmm1
568; SSE2-NEXT:    pxor %xmm4, %xmm4
569; SSE2-NEXT:    pxor %xmm3, %xmm3
570; SSE2-NEXT:    pcmpgtd %xmm1, %xmm3
571; SSE2-NEXT:    movdqa %xmm1, %xmm0
572; SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1]
573; SSE2-NEXT:    punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm3[2],xmm1[3],xmm3[3]
574; SSE2-NEXT:    punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7]
575; SSE2-NEXT:    psrad $24, %xmm3
576; SSE2-NEXT:    pcmpgtd %xmm3, %xmm4
577; SSE2-NEXT:    movdqa %xmm3, %xmm2
578; SSE2-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1]
579; SSE2-NEXT:    punpckhdq {{.*#+}} xmm3 = xmm3[2],xmm4[2],xmm3[3],xmm4[3]
580; SSE2-NEXT:    retq
581;
582; SSSE3-LABEL: sext_16i8_to_8i64:
583; SSSE3:       # %bb.0: # %entry
584; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
585; SSSE3-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
586; SSSE3-NEXT:    psrad $24, %xmm1
587; SSSE3-NEXT:    pxor %xmm4, %xmm4
588; SSSE3-NEXT:    pxor %xmm3, %xmm3
589; SSSE3-NEXT:    pcmpgtd %xmm1, %xmm3
590; SSSE3-NEXT:    movdqa %xmm1, %xmm0
591; SSSE3-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1]
592; SSSE3-NEXT:    punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm3[2],xmm1[3],xmm3[3]
593; SSSE3-NEXT:    punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7]
594; SSSE3-NEXT:    psrad $24, %xmm3
595; SSSE3-NEXT:    pcmpgtd %xmm3, %xmm4
596; SSSE3-NEXT:    movdqa %xmm3, %xmm2
597; SSSE3-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1]
598; SSSE3-NEXT:    punpckhdq {{.*#+}} xmm3 = xmm3[2],xmm4[2],xmm3[3],xmm4[3]
599; SSSE3-NEXT:    retq
600;
601; SSE41-LABEL: sext_16i8_to_8i64:
602; SSE41:       # %bb.0: # %entry
603; SSE41-NEXT:    pmovsxbq %xmm0, %xmm4
604; SSE41-NEXT:    movdqa %xmm0, %xmm1
605; SSE41-NEXT:    psrld $16, %xmm1
606; SSE41-NEXT:    pmovsxbq %xmm1, %xmm1
607; SSE41-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[1,1,1,1]
608; SSE41-NEXT:    pmovsxbq %xmm2, %xmm2
609; SSE41-NEXT:    psrlq $48, %xmm0
610; SSE41-NEXT:    pmovsxbq %xmm0, %xmm3
611; SSE41-NEXT:    movdqa %xmm4, %xmm0
612; SSE41-NEXT:    retq
613;
614; AVX1-LABEL: sext_16i8_to_8i64:
615; AVX1:       # %bb.0: # %entry
616; AVX1-NEXT:    vpmovsxbq %xmm0, %xmm1
617; AVX1-NEXT:    vpsrld $16, %xmm0, %xmm2
618; AVX1-NEXT:    vpmovsxbq %xmm2, %xmm2
619; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm1, %ymm2
620; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[1,1,1,1]
621; AVX1-NEXT:    vpmovsxbq %xmm0, %xmm1
622; AVX1-NEXT:    vpsrld $16, %xmm0, %xmm0
623; AVX1-NEXT:    vpmovsxbq %xmm0, %xmm0
624; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm1
625; AVX1-NEXT:    vmovaps %ymm2, %ymm0
626; AVX1-NEXT:    retq
627;
628; AVX2-LABEL: sext_16i8_to_8i64:
629; AVX2:       # %bb.0: # %entry
630; AVX2-NEXT:    vpmovsxbq %xmm0, %ymm2
631; AVX2-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[1,1,1,1]
632; AVX2-NEXT:    vpmovsxbq %xmm0, %ymm1
633; AVX2-NEXT:    vmovdqa %ymm2, %ymm0
634; AVX2-NEXT:    retq
635;
636; AVX512-LABEL: sext_16i8_to_8i64:
637; AVX512:       # %bb.0: # %entry
638; AVX512-NEXT:    vpmovsxbq %xmm0, %zmm0
639; AVX512-NEXT:    retq
640;
641; X86-SSE2-LABEL: sext_16i8_to_8i64:
642; X86-SSE2:       # %bb.0: # %entry
643; X86-SSE2-NEXT:    punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
644; X86-SSE2-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
645; X86-SSE2-NEXT:    psrad $24, %xmm1
646; X86-SSE2-NEXT:    pxor %xmm4, %xmm4
647; X86-SSE2-NEXT:    pxor %xmm3, %xmm3
648; X86-SSE2-NEXT:    pcmpgtd %xmm1, %xmm3
649; X86-SSE2-NEXT:    movdqa %xmm1, %xmm0
650; X86-SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1]
651; X86-SSE2-NEXT:    punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm3[2],xmm1[3],xmm3[3]
652; X86-SSE2-NEXT:    punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7]
653; X86-SSE2-NEXT:    psrad $24, %xmm3
654; X86-SSE2-NEXT:    pcmpgtd %xmm3, %xmm4
655; X86-SSE2-NEXT:    movdqa %xmm3, %xmm2
656; X86-SSE2-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1]
657; X86-SSE2-NEXT:    punpckhdq {{.*#+}} xmm3 = xmm3[2],xmm4[2],xmm3[3],xmm4[3]
658; X86-SSE2-NEXT:    retl
659;
660; X86-SSE41-LABEL: sext_16i8_to_8i64:
661; X86-SSE41:       # %bb.0: # %entry
662; X86-SSE41-NEXT:    pmovsxbq %xmm0, %xmm4
663; X86-SSE41-NEXT:    movdqa %xmm0, %xmm1
664; X86-SSE41-NEXT:    psrld $16, %xmm1
665; X86-SSE41-NEXT:    pmovsxbq %xmm1, %xmm1
666; X86-SSE41-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[1,1,1,1]
667; X86-SSE41-NEXT:    pmovsxbq %xmm2, %xmm2
668; X86-SSE41-NEXT:    psrlq $48, %xmm0
669; X86-SSE41-NEXT:    pmovsxbq %xmm0, %xmm3
670; X86-SSE41-NEXT:    movdqa %xmm4, %xmm0
671; X86-SSE41-NEXT:    retl
672entry:
673  %B = shufflevector <16 x i8> %A, <16 x i8> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
674  %C = sext <8 x i8> %B to <8 x i64>
675  ret <8 x i64> %C
676}
677
; Sign-extends the low 4 words of an <8 x i16> argument (picked out by the
; shufflevector) to <4 x i32>.
; Expected lowering, as pinned by the autogenerated checks below:
;   - SSE2 and SSSE3 runs (64-bit and 32-bit) duplicate each word into a dword
;     with punpcklwd and then arithmetic-shift right by 16 (psrad).
;   - SSE4.1 and all AVX runs use a single pmovsxwd / vpmovsxwd.
678define <4 x i32> @sext_8i16_to_4i32(<8 x i16> %A) nounwind uwtable readnone ssp {
679; SSE2-LABEL: sext_8i16_to_4i32:
680; SSE2:       # %bb.0: # %entry
681; SSE2-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
682; SSE2-NEXT:    psrad $16, %xmm0
683; SSE2-NEXT:    retq
684;
685; SSSE3-LABEL: sext_8i16_to_4i32:
686; SSSE3:       # %bb.0: # %entry
687; SSSE3-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
688; SSSE3-NEXT:    psrad $16, %xmm0
689; SSSE3-NEXT:    retq
690;
691; SSE41-LABEL: sext_8i16_to_4i32:
692; SSE41:       # %bb.0: # %entry
693; SSE41-NEXT:    pmovsxwd %xmm0, %xmm0
694; SSE41-NEXT:    retq
695;
696; AVX-LABEL: sext_8i16_to_4i32:
697; AVX:       # %bb.0: # %entry
698; AVX-NEXT:    vpmovsxwd %xmm0, %xmm0
699; AVX-NEXT:    retq
700;
701; X86-SSE2-LABEL: sext_8i16_to_4i32:
702; X86-SSE2:       # %bb.0: # %entry
703; X86-SSE2-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
704; X86-SSE2-NEXT:    psrad $16, %xmm0
705; X86-SSE2-NEXT:    retl
706;
707; X86-SSE41-LABEL: sext_8i16_to_4i32:
708; X86-SSE41:       # %bb.0: # %entry
709; X86-SSE41-NEXT:    pmovsxwd %xmm0, %xmm0
710; X86-SSE41-NEXT:    retl
711entry:
712  %B = shufflevector <8 x i16> %A, <8 x i16> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
713  %C = sext <4 x i16> %B to <4 x i32>
714  ret <4 x i32> %C
715}
716
; Sign-extends all eight i16 lanes of %A to <8 x i32> (a 256-bit result).
; Expected lowering per the checks below
;   - SSE2/SSSE3 unpack the low and high halves (punpcklwd/punpckhwd) and shift
;     each with psrad $16, leaving the two halves in xmm0 and xmm1.
;   - SSE4.1 uses pmovsxwd twice, with pshufd [2,3,2,3] moving the high half down.
;   - AVX1 does the same with v-prefixed forms and joins the halves via vinsertf128.
;   - AVX2 and AVX512 use a single vpmovsxwd from xmm0 into ymm0.
717define <8 x i32> @sext_8i16_to_8i32(<8 x i16> %A) nounwind uwtable readnone ssp {
718; SSE2-LABEL: sext_8i16_to_8i32:
719; SSE2:       # %bb.0: # %entry
720; SSE2-NEXT:    punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3]
721; SSE2-NEXT:    psrad $16, %xmm2
722; SSE2-NEXT:    punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
723; SSE2-NEXT:    psrad $16, %xmm1
724; SSE2-NEXT:    movdqa %xmm2, %xmm0
725; SSE2-NEXT:    retq
726;
727; SSSE3-LABEL: sext_8i16_to_8i32:
728; SSSE3:       # %bb.0: # %entry
729; SSSE3-NEXT:    punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3]
730; SSSE3-NEXT:    psrad $16, %xmm2
731; SSSE3-NEXT:    punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
732; SSSE3-NEXT:    psrad $16, %xmm1
733; SSSE3-NEXT:    movdqa %xmm2, %xmm0
734; SSSE3-NEXT:    retq
735;
736; SSE41-LABEL: sext_8i16_to_8i32:
737; SSE41:       # %bb.0: # %entry
738; SSE41-NEXT:    pmovsxwd %xmm0, %xmm2
739; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
740; SSE41-NEXT:    pmovsxwd %xmm0, %xmm1
741; SSE41-NEXT:    movdqa %xmm2, %xmm0
742; SSE41-NEXT:    retq
743;
744; AVX1-LABEL: sext_8i16_to_8i32:
745; AVX1:       # %bb.0: # %entry
746; AVX1-NEXT:    vpmovsxwd %xmm0, %xmm1
747; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
748; AVX1-NEXT:    vpmovsxwd %xmm0, %xmm0
749; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
750; AVX1-NEXT:    retq
751;
752; AVX2-LABEL: sext_8i16_to_8i32:
753; AVX2:       # %bb.0: # %entry
754; AVX2-NEXT:    vpmovsxwd %xmm0, %ymm0
755; AVX2-NEXT:    retq
756;
757; AVX512-LABEL: sext_8i16_to_8i32:
758; AVX512:       # %bb.0: # %entry
759; AVX512-NEXT:    vpmovsxwd %xmm0, %ymm0
760; AVX512-NEXT:    retq
761;
762; X86-SSE2-LABEL: sext_8i16_to_8i32:
763; X86-SSE2:       # %bb.0: # %entry
764; X86-SSE2-NEXT:    punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3]
765; X86-SSE2-NEXT:    psrad $16, %xmm2
766; X86-SSE2-NEXT:    punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
767; X86-SSE2-NEXT:    psrad $16, %xmm1
768; X86-SSE2-NEXT:    movdqa %xmm2, %xmm0
769; X86-SSE2-NEXT:    retl
770;
771; X86-SSE41-LABEL: sext_8i16_to_8i32:
772; X86-SSE41:       # %bb.0: # %entry
773; X86-SSE41-NEXT:    pmovsxwd %xmm0, %xmm2
774; X86-SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
775; X86-SSE41-NEXT:    pmovsxwd %xmm0, %xmm1
776; X86-SSE41-NEXT:    movdqa %xmm2, %xmm0
777; X86-SSE41-NEXT:    retl
778entry:
779  %B = sext <8 x i16> %A to <8 x i32>
780  ret <8 x i32> %B
781}
782
; Sign-extends all sixteen i16 lanes of %A to <16 x i32> (a 512-bit result).
; Expected lowering per the checks below
;   - SSE2/SSSE3 unpack and shift (psrad $16) each half of xmm0 and xmm1,
;     producing four 128-bit pieces in xmm0..xmm3.
;   - SSE4.1 issues four pmovsxwd, using pshufd [2,3,2,3] to reach the high halves.
;   - AVX1 builds two ymm results from vpmovsxwd pairs joined with vinsertf128,
;     using vextractf128 to reach the upper 128 bits of the input.
;   - AVX2 issues two vpmovsxwd from xmm into ymm (upper input half via vextracti128).
;   - AVX512 uses a single vpmovsxwd from ymm0 into zmm0.
783define <16 x i32> @sext_16i16_to_16i32(<16 x i16> %A) nounwind uwtable readnone ssp {
784; SSE2-LABEL: sext_16i16_to_16i32:
785; SSE2:       # %bb.0: # %entry
786; SSE2-NEXT:    punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1],xmm4[2],xmm0[2],xmm4[3],xmm0[3]
787; SSE2-NEXT:    psrad $16, %xmm4
788; SSE2-NEXT:    punpckhwd {{.*#+}} xmm5 = xmm5[4],xmm0[4],xmm5[5],xmm0[5],xmm5[6],xmm0[6],xmm5[7],xmm0[7]
789; SSE2-NEXT:    psrad $16, %xmm5
790; SSE2-NEXT:    punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
791; SSE2-NEXT:    psrad $16, %xmm2
792; SSE2-NEXT:    punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm1[4],xmm3[5],xmm1[5],xmm3[6],xmm1[6],xmm3[7],xmm1[7]
793; SSE2-NEXT:    psrad $16, %xmm3
794; SSE2-NEXT:    movdqa %xmm4, %xmm0
795; SSE2-NEXT:    movdqa %xmm5, %xmm1
796; SSE2-NEXT:    retq
797;
798; SSSE3-LABEL: sext_16i16_to_16i32:
799; SSSE3:       # %bb.0: # %entry
800; SSSE3-NEXT:    punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1],xmm4[2],xmm0[2],xmm4[3],xmm0[3]
801; SSSE3-NEXT:    psrad $16, %xmm4
802; SSSE3-NEXT:    punpckhwd {{.*#+}} xmm5 = xmm5[4],xmm0[4],xmm5[5],xmm0[5],xmm5[6],xmm0[6],xmm5[7],xmm0[7]
803; SSSE3-NEXT:    psrad $16, %xmm5
804; SSSE3-NEXT:    punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
805; SSSE3-NEXT:    psrad $16, %xmm2
806; SSSE3-NEXT:    punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm1[4],xmm3[5],xmm1[5],xmm3[6],xmm1[6],xmm3[7],xmm1[7]
807; SSSE3-NEXT:    psrad $16, %xmm3
808; SSSE3-NEXT:    movdqa %xmm4, %xmm0
809; SSSE3-NEXT:    movdqa %xmm5, %xmm1
810; SSSE3-NEXT:    retq
811;
812; SSE41-LABEL: sext_16i16_to_16i32:
813; SSE41:       # %bb.0: # %entry
814; SSE41-NEXT:    pmovsxwd %xmm0, %xmm5
815; SSE41-NEXT:    pmovsxwd %xmm1, %xmm2
816; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
817; SSE41-NEXT:    pmovsxwd %xmm0, %xmm4
818; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3]
819; SSE41-NEXT:    pmovsxwd %xmm0, %xmm3
820; SSE41-NEXT:    movdqa %xmm5, %xmm0
821; SSE41-NEXT:    movdqa %xmm4, %xmm1
822; SSE41-NEXT:    retq
823;
824; AVX1-LABEL: sext_16i16_to_16i32:
825; AVX1:       # %bb.0: # %entry
826; AVX1-NEXT:    vpmovsxwd %xmm0, %xmm1
827; AVX1-NEXT:    vpshufd {{.*#+}} xmm2 = xmm0[2,3,2,3]
828; AVX1-NEXT:    vpmovsxwd %xmm2, %xmm2
829; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm1, %ymm2
830; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
831; AVX1-NEXT:    vpmovsxwd %xmm0, %xmm1
832; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
833; AVX1-NEXT:    vpmovsxwd %xmm0, %xmm0
834; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm1
835; AVX1-NEXT:    vmovaps %ymm2, %ymm0
836; AVX1-NEXT:    retq
837;
838; AVX2-LABEL: sext_16i16_to_16i32:
839; AVX2:       # %bb.0: # %entry
840; AVX2-NEXT:    vpmovsxwd %xmm0, %ymm2
841; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm0
842; AVX2-NEXT:    vpmovsxwd %xmm0, %ymm1
843; AVX2-NEXT:    vmovdqa %ymm2, %ymm0
844; AVX2-NEXT:    retq
845;
846; AVX512-LABEL: sext_16i16_to_16i32:
847; AVX512:       # %bb.0: # %entry
848; AVX512-NEXT:    vpmovsxwd %ymm0, %zmm0
849; AVX512-NEXT:    retq
850;
851; X86-SSE2-LABEL: sext_16i16_to_16i32:
852; X86-SSE2:       # %bb.0: # %entry
853; X86-SSE2-NEXT:    punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1],xmm4[2],xmm0[2],xmm4[3],xmm0[3]
854; X86-SSE2-NEXT:    psrad $16, %xmm4
855; X86-SSE2-NEXT:    punpckhwd {{.*#+}} xmm5 = xmm5[4],xmm0[4],xmm5[5],xmm0[5],xmm5[6],xmm0[6],xmm5[7],xmm0[7]
856; X86-SSE2-NEXT:    psrad $16, %xmm5
857; X86-SSE2-NEXT:    punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
858; X86-SSE2-NEXT:    psrad $16, %xmm2
859; X86-SSE2-NEXT:    punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm1[4],xmm3[5],xmm1[5],xmm3[6],xmm1[6],xmm3[7],xmm1[7]
860; X86-SSE2-NEXT:    psrad $16, %xmm3
861; X86-SSE2-NEXT:    movdqa %xmm4, %xmm0
862; X86-SSE2-NEXT:    movdqa %xmm5, %xmm1
863; X86-SSE2-NEXT:    retl
864;
865; X86-SSE41-LABEL: sext_16i16_to_16i32:
866; X86-SSE41:       # %bb.0: # %entry
867; X86-SSE41-NEXT:    pmovsxwd %xmm0, %xmm5
868; X86-SSE41-NEXT:    pmovsxwd %xmm1, %xmm2
869; X86-SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
870; X86-SSE41-NEXT:    pmovsxwd %xmm0, %xmm4
871; X86-SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3]
872; X86-SSE41-NEXT:    pmovsxwd %xmm0, %xmm3
873; X86-SSE41-NEXT:    movdqa %xmm5, %xmm0
874; X86-SSE41-NEXT:    movdqa %xmm4, %xmm1
875; X86-SSE41-NEXT:    retl
876entry:
877  %B = sext <16 x i16> %A to <16 x i32>
878  ret <16 x i32> %B
879}
880
; Sign-extends the low two i16 lanes of %A (shuffle indices 0-1) to <2 x i64>.
; With SSE4.1 or AVX the checks expect a single (v)pmovsxwq. On SSE2/SSSE3 there
; is no such instruction, so the checks expect punpcklwd + psrad $16 to widen to
; 32 bits, with pxor/pcmpgtd building a per-lane sign mask that punpckldq
; interleaves in as the upper 32 bits of each 64-bit result.
881define <2 x i64> @sext_8i16_to_2i64(<8 x i16> %A) nounwind uwtable readnone ssp {
882; SSE2-LABEL: sext_8i16_to_2i64:
883; SSE2:       # %bb.0: # %entry
884; SSE2-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
885; SSE2-NEXT:    pxor %xmm1, %xmm1
886; SSE2-NEXT:    pcmpgtd %xmm0, %xmm1
887; SSE2-NEXT:    psrad $16, %xmm0
888; SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
889; SSE2-NEXT:    retq
890;
891; SSSE3-LABEL: sext_8i16_to_2i64:
892; SSSE3:       # %bb.0: # %entry
893; SSSE3-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
894; SSSE3-NEXT:    pxor %xmm1, %xmm1
895; SSSE3-NEXT:    pcmpgtd %xmm0, %xmm1
896; SSSE3-NEXT:    psrad $16, %xmm0
897; SSSE3-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
898; SSSE3-NEXT:    retq
899;
900; SSE41-LABEL: sext_8i16_to_2i64:
901; SSE41:       # %bb.0: # %entry
902; SSE41-NEXT:    pmovsxwq %xmm0, %xmm0
903; SSE41-NEXT:    retq
904;
905; AVX-LABEL: sext_8i16_to_2i64:
906; AVX:       # %bb.0: # %entry
907; AVX-NEXT:    vpmovsxwq %xmm0, %xmm0
908; AVX-NEXT:    retq
909;
910; X86-SSE2-LABEL: sext_8i16_to_2i64:
911; X86-SSE2:       # %bb.0: # %entry
912; X86-SSE2-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
913; X86-SSE2-NEXT:    pxor %xmm1, %xmm1
914; X86-SSE2-NEXT:    pcmpgtd %xmm0, %xmm1
915; X86-SSE2-NEXT:    psrad $16, %xmm0
916; X86-SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
917; X86-SSE2-NEXT:    retl
918;
919; X86-SSE41-LABEL: sext_8i16_to_2i64:
920; X86-SSE41:       # %bb.0: # %entry
921; X86-SSE41-NEXT:    pmovsxwq %xmm0, %xmm0
922; X86-SSE41-NEXT:    retl
923entry:
924  %B = shufflevector <8 x i16> %A, <8 x i16> undef, <2 x i32> <i32 0, i32 1>
925  %C = sext <2 x i16> %B to <2 x i64>
926  ret <2 x i64> %C
927}
928
; Sign-extends the low four i16 lanes of %A (shuffle indices 0-3) to <4 x i64>
; (a 256-bit result).
; Expected lowering per the checks below
;   - SSE2/SSSE3 widen to 32 bits (punpcklwd + psrad $16), then interleave with a
;     pcmpgtd sign mask via punpckldq/punpckhdq to form the two 128-bit halves.
;   - SSE4.1 uses pmovsxwq twice, with pshufd [1,1,1,1] selecting lanes 2-3.
;   - AVX1 does the same with v-prefixed forms and joins the halves via vinsertf128.
;   - AVX2 and AVX512 use a single vpmovsxwq from xmm0 into ymm0.
929define <4 x i64> @sext_8i16_to_4i64(<8 x i16> %A) nounwind uwtable readnone ssp {
930; SSE2-LABEL: sext_8i16_to_4i64:
931; SSE2:       # %bb.0: # %entry
932; SSE2-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
933; SSE2-NEXT:    psrad $16, %xmm1
934; SSE2-NEXT:    pxor %xmm2, %xmm2
935; SSE2-NEXT:    pcmpgtd %xmm1, %xmm2
936; SSE2-NEXT:    movdqa %xmm1, %xmm0
937; SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
938; SSE2-NEXT:    punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm2[2],xmm1[3],xmm2[3]
939; SSE2-NEXT:    retq
940;
941; SSSE3-LABEL: sext_8i16_to_4i64:
942; SSSE3:       # %bb.0: # %entry
943; SSSE3-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
944; SSSE3-NEXT:    psrad $16, %xmm1
945; SSSE3-NEXT:    pxor %xmm2, %xmm2
946; SSSE3-NEXT:    pcmpgtd %xmm1, %xmm2
947; SSSE3-NEXT:    movdqa %xmm1, %xmm0
948; SSSE3-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
949; SSSE3-NEXT:    punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm2[2],xmm1[3],xmm2[3]
950; SSSE3-NEXT:    retq
951;
952; SSE41-LABEL: sext_8i16_to_4i64:
953; SSE41:       # %bb.0: # %entry
954; SSE41-NEXT:    pmovsxwq %xmm0, %xmm2
955; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,1,1]
956; SSE41-NEXT:    pmovsxwq %xmm0, %xmm1
957; SSE41-NEXT:    movdqa %xmm2, %xmm0
958; SSE41-NEXT:    retq
959;
960; AVX1-LABEL: sext_8i16_to_4i64:
961; AVX1:       # %bb.0: # %entry
962; AVX1-NEXT:    vpmovsxwq %xmm0, %xmm1
963; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[1,1,1,1]
964; AVX1-NEXT:    vpmovsxwq %xmm0, %xmm0
965; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
966; AVX1-NEXT:    retq
967;
968; AVX2-LABEL: sext_8i16_to_4i64:
969; AVX2:       # %bb.0: # %entry
970; AVX2-NEXT:    vpmovsxwq %xmm0, %ymm0
971; AVX2-NEXT:    retq
972;
973; AVX512-LABEL: sext_8i16_to_4i64:
974; AVX512:       # %bb.0: # %entry
975; AVX512-NEXT:    vpmovsxwq %xmm0, %ymm0
976; AVX512-NEXT:    retq
977;
978; X86-SSE2-LABEL: sext_8i16_to_4i64:
979; X86-SSE2:       # %bb.0: # %entry
980; X86-SSE2-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
981; X86-SSE2-NEXT:    psrad $16, %xmm1
982; X86-SSE2-NEXT:    pxor %xmm2, %xmm2
983; X86-SSE2-NEXT:    pcmpgtd %xmm1, %xmm2
984; X86-SSE2-NEXT:    movdqa %xmm1, %xmm0
985; X86-SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
986; X86-SSE2-NEXT:    punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm2[2],xmm1[3],xmm2[3]
987; X86-SSE2-NEXT:    retl
988;
989; X86-SSE41-LABEL: sext_8i16_to_4i64:
990; X86-SSE41:       # %bb.0: # %entry
991; X86-SSE41-NEXT:    pmovsxwq %xmm0, %xmm2
992; X86-SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,1,1]
993; X86-SSE41-NEXT:    pmovsxwq %xmm0, %xmm1
994; X86-SSE41-NEXT:    movdqa %xmm2, %xmm0
995; X86-SSE41-NEXT:    retl
996entry:
997  %B = shufflevector <8 x i16> %A, <8 x i16> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
998  %C = sext <4 x i16> %B to <4 x i64>
999  ret <4 x i64> %C
1000}
1001
; Sign-extends all eight i16 lanes of %A to <8 x i64> (a 512-bit result).
; Expected lowering per the checks below
;   - SSE2/SSSE3 widen both halves to 32 bits (punpcklwd/punpckhwd + psrad $16),
;     then interleave each with a pcmpgtd sign mask (punpckldq/punpckhdq),
;     producing four 128-bit pieces in xmm0..xmm3.
;   - SSE4.1 issues four pmovsxwq, using pshufd to select each pair of lanes.
;   - AVX1 does the same with v-prefixed forms and joins pairs via vinsertf128.
;   - AVX2 issues two vpmovsxwq from xmm into ymm.
;   - AVX512 uses a single vpmovsxwq from xmm0 into zmm0.
1002define <8 x i64> @sext_8i16_to_8i64(<8 x i16> %A) nounwind uwtable readnone ssp {
1003; SSE2-LABEL: sext_8i16_to_8i64:
1004; SSE2:       # %bb.0: # %entry
1005; SSE2-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
1006; SSE2-NEXT:    psrad $16, %xmm1
1007; SSE2-NEXT:    pxor %xmm5, %xmm5
1008; SSE2-NEXT:    pxor %xmm2, %xmm2
1009; SSE2-NEXT:    pcmpgtd %xmm1, %xmm2
1010; SSE2-NEXT:    movdqa %xmm1, %xmm4
1011; SSE2-NEXT:    punpckldq {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1]
1012; SSE2-NEXT:    punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm2[2],xmm1[3],xmm2[3]
1013; SSE2-NEXT:    punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7]
1014; SSE2-NEXT:    psrad $16, %xmm3
1015; SSE2-NEXT:    pcmpgtd %xmm3, %xmm5
1016; SSE2-NEXT:    movdqa %xmm3, %xmm2
1017; SSE2-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1]
1018; SSE2-NEXT:    punpckhdq {{.*#+}} xmm3 = xmm3[2],xmm5[2],xmm3[3],xmm5[3]
1019; SSE2-NEXT:    movdqa %xmm4, %xmm0
1020; SSE2-NEXT:    retq
1021;
1022; SSSE3-LABEL: sext_8i16_to_8i64:
1023; SSSE3:       # %bb.0: # %entry
1024; SSSE3-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
1025; SSSE3-NEXT:    psrad $16, %xmm1
1026; SSSE3-NEXT:    pxor %xmm5, %xmm5
1027; SSSE3-NEXT:    pxor %xmm2, %xmm2
1028; SSSE3-NEXT:    pcmpgtd %xmm1, %xmm2
1029; SSSE3-NEXT:    movdqa %xmm1, %xmm4
1030; SSSE3-NEXT:    punpckldq {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1]
1031; SSSE3-NEXT:    punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm2[2],xmm1[3],xmm2[3]
1032; SSSE3-NEXT:    punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7]
1033; SSSE3-NEXT:    psrad $16, %xmm3
1034; SSSE3-NEXT:    pcmpgtd %xmm3, %xmm5
1035; SSSE3-NEXT:    movdqa %xmm3, %xmm2
1036; SSSE3-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1]
1037; SSSE3-NEXT:    punpckhdq {{.*#+}} xmm3 = xmm3[2],xmm5[2],xmm3[3],xmm5[3]
1038; SSSE3-NEXT:    movdqa %xmm4, %xmm0
1039; SSSE3-NEXT:    retq
1040;
1041; SSE41-LABEL: sext_8i16_to_8i64:
1042; SSE41:       # %bb.0: # %entry
1043; SSE41-NEXT:    pmovsxwq %xmm0, %xmm4
1044; SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
1045; SSE41-NEXT:    pmovsxwq %xmm1, %xmm1
1046; SSE41-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[2,3,2,3]
1047; SSE41-NEXT:    pmovsxwq %xmm2, %xmm2
1048; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[3,3,3,3]
1049; SSE41-NEXT:    pmovsxwq %xmm0, %xmm3
1050; SSE41-NEXT:    movdqa %xmm4, %xmm0
1051; SSE41-NEXT:    retq
1052;
1053; AVX1-LABEL: sext_8i16_to_8i64:
1054; AVX1:       # %bb.0: # %entry
1055; AVX1-NEXT:    vpmovsxwq %xmm0, %xmm1
1056; AVX1-NEXT:    vpshufd {{.*#+}} xmm2 = xmm0[1,1,1,1]
1057; AVX1-NEXT:    vpmovsxwq %xmm2, %xmm2
1058; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm1, %ymm2
1059; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
1060; AVX1-NEXT:    vpmovsxwq %xmm1, %xmm1
1061; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[3,3,3,3]
1062; AVX1-NEXT:    vpmovsxwq %xmm0, %xmm0
1063; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm1
1064; AVX1-NEXT:    vmovaps %ymm2, %ymm0
1065; AVX1-NEXT:    retq
1066;
1067; AVX2-LABEL: sext_8i16_to_8i64:
1068; AVX2:       # %bb.0: # %entry
1069; AVX2-NEXT:    vpmovsxwq %xmm0, %ymm2
1070; AVX2-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
1071; AVX2-NEXT:    vpmovsxwq %xmm0, %ymm1
1072; AVX2-NEXT:    vmovdqa %ymm2, %ymm0
1073; AVX2-NEXT:    retq
1074;
1075; AVX512-LABEL: sext_8i16_to_8i64:
1076; AVX512:       # %bb.0: # %entry
1077; AVX512-NEXT:    vpmovsxwq %xmm0, %zmm0
1078; AVX512-NEXT:    retq
1079;
1080; X86-SSE2-LABEL: sext_8i16_to_8i64:
1081; X86-SSE2:       # %bb.0: # %entry
1082; X86-SSE2-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
1083; X86-SSE2-NEXT:    psrad $16, %xmm1
1084; X86-SSE2-NEXT:    pxor %xmm5, %xmm5
1085; X86-SSE2-NEXT:    pxor %xmm2, %xmm2
1086; X86-SSE2-NEXT:    pcmpgtd %xmm1, %xmm2
1087; X86-SSE2-NEXT:    movdqa %xmm1, %xmm4
1088; X86-SSE2-NEXT:    punpckldq {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1]
1089; X86-SSE2-NEXT:    punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm2[2],xmm1[3],xmm2[3]
1090; X86-SSE2-NEXT:    punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7]
1091; X86-SSE2-NEXT:    psrad $16, %xmm3
1092; X86-SSE2-NEXT:    pcmpgtd %xmm3, %xmm5
1093; X86-SSE2-NEXT:    movdqa %xmm3, %xmm2
1094; X86-SSE2-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1]
1095; X86-SSE2-NEXT:    punpckhdq {{.*#+}} xmm3 = xmm3[2],xmm5[2],xmm3[3],xmm5[3]
1096; X86-SSE2-NEXT:    movdqa %xmm4, %xmm0
1097; X86-SSE2-NEXT:    retl
1098;
1099; X86-SSE41-LABEL: sext_8i16_to_8i64:
1100; X86-SSE41:       # %bb.0: # %entry
1101; X86-SSE41-NEXT:    pmovsxwq %xmm0, %xmm4
1102; X86-SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
1103; X86-SSE41-NEXT:    pmovsxwq %xmm1, %xmm1
1104; X86-SSE41-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[2,3,2,3]
1105; X86-SSE41-NEXT:    pmovsxwq %xmm2, %xmm2
1106; X86-SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[3,3,3,3]
1107; X86-SSE41-NEXT:    pmovsxwq %xmm0, %xmm3
1108; X86-SSE41-NEXT:    movdqa %xmm4, %xmm0
1109; X86-SSE41-NEXT:    retl
1110entry:
1111  %B = sext <8 x i16> %A to <8 x i64>
1112  ret <8 x i64> %B
1113}
1114
; Sign-extends the low two i32 lanes of %A (shuffle indices 0-1) to <2 x i64>.
; With SSE4.1 or AVX the checks expect a single (v)pmovsxdq. On SSE2/SSSE3 the
; checks expect pxor/pcmpgtd to build a per-lane sign mask, which punpckldq then
; interleaves with the source dwords to form the 64-bit results.
1115define <2 x i64> @sext_4i32_to_2i64(<4 x i32> %A) nounwind uwtable readnone ssp {
1116; SSE2-LABEL: sext_4i32_to_2i64:
1117; SSE2:       # %bb.0: # %entry
1118; SSE2-NEXT:    pxor %xmm1, %xmm1
1119; SSE2-NEXT:    pcmpgtd %xmm0, %xmm1
1120; SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
1121; SSE2-NEXT:    retq
1122;
1123; SSSE3-LABEL: sext_4i32_to_2i64:
1124; SSSE3:       # %bb.0: # %entry
1125; SSSE3-NEXT:    pxor %xmm1, %xmm1
1126; SSSE3-NEXT:    pcmpgtd %xmm0, %xmm1
1127; SSSE3-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
1128; SSSE3-NEXT:    retq
1129;
1130; SSE41-LABEL: sext_4i32_to_2i64:
1131; SSE41:       # %bb.0: # %entry
1132; SSE41-NEXT:    pmovsxdq %xmm0, %xmm0
1133; SSE41-NEXT:    retq
1134;
1135; AVX-LABEL: sext_4i32_to_2i64:
1136; AVX:       # %bb.0: # %entry
1137; AVX-NEXT:    vpmovsxdq %xmm0, %xmm0
1138; AVX-NEXT:    retq
1139;
1140; X86-SSE2-LABEL: sext_4i32_to_2i64:
1141; X86-SSE2:       # %bb.0: # %entry
1142; X86-SSE2-NEXT:    pxor %xmm1, %xmm1
1143; X86-SSE2-NEXT:    pcmpgtd %xmm0, %xmm1
1144; X86-SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
1145; X86-SSE2-NEXT:    retl
1146;
1147; X86-SSE41-LABEL: sext_4i32_to_2i64:
1148; X86-SSE41:       # %bb.0: # %entry
1149; X86-SSE41-NEXT:    pmovsxdq %xmm0, %xmm0
1150; X86-SSE41-NEXT:    retl
1151entry:
1152  %B = shufflevector <4 x i32> %A, <4 x i32> undef, <2 x i32> <i32 0, i32 1>
1153  %C = sext <2 x i32> %B to <2 x i64>
1154  ret <2 x i64> %C
1155}
1156
; Sign-extends all four i32 lanes of %A to <4 x i64> (a 256-bit result).
; Expected lowering per the checks below
;   - SSE2/SSSE3 build a pcmpgtd sign mask for each half (the high half is first
;     moved down with pshufd [2,3,2,3]) and interleave it in with punpckldq,
;     leaving the two halves in xmm0 and xmm1.
;   - SSE4.1 uses pmovsxdq twice, with pshufd [2,3,2,3] reaching the high half.
;   - AVX1 does the same with v-prefixed forms and joins the halves via vinsertf128.
;   - AVX2 and AVX512 use a single vpmovsxdq from xmm0 into ymm0.
1157define <4 x i64> @sext_4i32_to_4i64(<4 x i32> %A) nounwind uwtable readnone ssp {
1158; SSE2-LABEL: sext_4i32_to_4i64:
1159; SSE2:       # %bb.0: # %entry
1160; SSE2-NEXT:    pxor %xmm2, %xmm2
1161; SSE2-NEXT:    pxor %xmm3, %xmm3
1162; SSE2-NEXT:    pcmpgtd %xmm0, %xmm3
1163; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
1164; SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1]
1165; SSE2-NEXT:    pcmpgtd %xmm1, %xmm2
1166; SSE2-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
1167; SSE2-NEXT:    retq
1168;
1169; SSSE3-LABEL: sext_4i32_to_4i64:
1170; SSSE3:       # %bb.0: # %entry
1171; SSSE3-NEXT:    pxor %xmm2, %xmm2
1172; SSSE3-NEXT:    pxor %xmm3, %xmm3
1173; SSSE3-NEXT:    pcmpgtd %xmm0, %xmm3
1174; SSSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
1175; SSSE3-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1]
1176; SSSE3-NEXT:    pcmpgtd %xmm1, %xmm2
1177; SSSE3-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
1178; SSSE3-NEXT:    retq
1179;
1180; SSE41-LABEL: sext_4i32_to_4i64:
1181; SSE41:       # %bb.0: # %entry
1182; SSE41-NEXT:    pmovsxdq %xmm0, %xmm2
1183; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
1184; SSE41-NEXT:    pmovsxdq %xmm0, %xmm1
1185; SSE41-NEXT:    movdqa %xmm2, %xmm0
1186; SSE41-NEXT:    retq
1187;
1188; AVX1-LABEL: sext_4i32_to_4i64:
1189; AVX1:       # %bb.0: # %entry
1190; AVX1-NEXT:    vpmovsxdq %xmm0, %xmm1
1191; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
1192; AVX1-NEXT:    vpmovsxdq %xmm0, %xmm0
1193; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
1194; AVX1-NEXT:    retq
1195;
1196; AVX2-LABEL: sext_4i32_to_4i64:
1197; AVX2:       # %bb.0: # %entry
1198; AVX2-NEXT:    vpmovsxdq %xmm0, %ymm0
1199; AVX2-NEXT:    retq
1200;
1201; AVX512-LABEL: sext_4i32_to_4i64:
1202; AVX512:       # %bb.0: # %entry
1203; AVX512-NEXT:    vpmovsxdq %xmm0, %ymm0
1204; AVX512-NEXT:    retq
1205;
1206; X86-SSE2-LABEL: sext_4i32_to_4i64:
1207; X86-SSE2:       # %bb.0: # %entry
1208; X86-SSE2-NEXT:    pxor %xmm2, %xmm2
1209; X86-SSE2-NEXT:    pxor %xmm3, %xmm3
1210; X86-SSE2-NEXT:    pcmpgtd %xmm0, %xmm3
1211; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
1212; X86-SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1]
1213; X86-SSE2-NEXT:    pcmpgtd %xmm1, %xmm2
1214; X86-SSE2-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
1215; X86-SSE2-NEXT:    retl
1216;
1217; X86-SSE41-LABEL: sext_4i32_to_4i64:
1218; X86-SSE41:       # %bb.0: # %entry
1219; X86-SSE41-NEXT:    pmovsxdq %xmm0, %xmm2
1220; X86-SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
1221; X86-SSE41-NEXT:    pmovsxdq %xmm0, %xmm1
1222; X86-SSE41-NEXT:    movdqa %xmm2, %xmm0
1223; X86-SSE41-NEXT:    retl
1224entry:
1225  %B = sext <4 x i32> %A to <4 x i64>
1226  ret <4 x i64> %B
1227}
1228
; Sign-extends all eight i32 lanes of %A to <8 x i64> (a 512-bit result).
; Expected lowering per the checks below
;   - SSE2/SSSE3 build pcmpgtd sign masks for each 64-bit-pair of input lanes and
;     interleave them in with punpckldq, producing four pieces in xmm0..xmm3.
;   - SSE4.1 issues four pmovsxdq, using pshufd [2,3,2,3] to reach the high halves.
;   - AVX1 builds two ymm results from vpmovsxdq pairs joined with vinsertf128,
;     using vextractf128 to reach the upper 128 bits of the input.
;   - AVX2 issues two vpmovsxdq from xmm into ymm (upper input half via vextracti128).
;   - AVX512 uses a single vpmovsxdq from ymm0 into zmm0.
1229define <8 x i64> @sext_8i32_to_8i64(<8 x i32> %A) nounwind uwtable readnone ssp {
1230; SSE2-LABEL: sext_8i32_to_8i64:
1231; SSE2:       # %bb.0: # %entry
1232; SSE2-NEXT:    movdqa %xmm1, %xmm2
1233; SSE2-NEXT:    pxor %xmm4, %xmm4
1234; SSE2-NEXT:    pxor %xmm3, %xmm3
1235; SSE2-NEXT:    pcmpgtd %xmm0, %xmm3
1236; SSE2-NEXT:    pxor %xmm5, %xmm5
1237; SSE2-NEXT:    pcmpgtd %xmm1, %xmm5
1238; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
1239; SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1]
1240; SSE2-NEXT:    pxor %xmm3, %xmm3
1241; SSE2-NEXT:    pcmpgtd %xmm1, %xmm3
1242; SSE2-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1]
1243; SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm2[2,3,2,3]
1244; SSE2-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1]
1245; SSE2-NEXT:    pcmpgtd %xmm3, %xmm4
1246; SSE2-NEXT:    punpckldq {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1]
1247; SSE2-NEXT:    retq
1248;
1249; SSSE3-LABEL: sext_8i32_to_8i64:
1250; SSSE3:       # %bb.0: # %entry
1251; SSSE3-NEXT:    movdqa %xmm1, %xmm2
1252; SSSE3-NEXT:    pxor %xmm4, %xmm4
1253; SSSE3-NEXT:    pxor %xmm3, %xmm3
1254; SSSE3-NEXT:    pcmpgtd %xmm0, %xmm3
1255; SSSE3-NEXT:    pxor %xmm5, %xmm5
1256; SSSE3-NEXT:    pcmpgtd %xmm1, %xmm5
1257; SSSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
1258; SSSE3-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1]
1259; SSSE3-NEXT:    pxor %xmm3, %xmm3
1260; SSSE3-NEXT:    pcmpgtd %xmm1, %xmm3
1261; SSSE3-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1]
1262; SSSE3-NEXT:    pshufd {{.*#+}} xmm3 = xmm2[2,3,2,3]
1263; SSSE3-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1]
1264; SSSE3-NEXT:    pcmpgtd %xmm3, %xmm4
1265; SSSE3-NEXT:    punpckldq {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1]
1266; SSSE3-NEXT:    retq
1267;
1268; SSE41-LABEL: sext_8i32_to_8i64:
1269; SSE41:       # %bb.0: # %entry
1270; SSE41-NEXT:    pmovsxdq %xmm0, %xmm5
1271; SSE41-NEXT:    pmovsxdq %xmm1, %xmm2
1272; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
1273; SSE41-NEXT:    pmovsxdq %xmm0, %xmm4
1274; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3]
1275; SSE41-NEXT:    pmovsxdq %xmm0, %xmm3
1276; SSE41-NEXT:    movdqa %xmm5, %xmm0
1277; SSE41-NEXT:    movdqa %xmm4, %xmm1
1278; SSE41-NEXT:    retq
1279;
1280; AVX1-LABEL: sext_8i32_to_8i64:
1281; AVX1:       # %bb.0: # %entry
1282; AVX1-NEXT:    vpmovsxdq %xmm0, %xmm1
1283; AVX1-NEXT:    vpshufd {{.*#+}} xmm2 = xmm0[2,3,2,3]
1284; AVX1-NEXT:    vpmovsxdq %xmm2, %xmm2
1285; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm1, %ymm2
1286; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
1287; AVX1-NEXT:    vpmovsxdq %xmm0, %xmm1
1288; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
1289; AVX1-NEXT:    vpmovsxdq %xmm0, %xmm0
1290; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm1
1291; AVX1-NEXT:    vmovaps %ymm2, %ymm0
1292; AVX1-NEXT:    retq
1293;
1294; AVX2-LABEL: sext_8i32_to_8i64:
1295; AVX2:       # %bb.0: # %entry
1296; AVX2-NEXT:    vpmovsxdq %xmm0, %ymm2
1297; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm0
1298; AVX2-NEXT:    vpmovsxdq %xmm0, %ymm1
1299; AVX2-NEXT:    vmovdqa %ymm2, %ymm0
1300; AVX2-NEXT:    retq
1301;
1302; AVX512-LABEL: sext_8i32_to_8i64:
1303; AVX512:       # %bb.0: # %entry
1304; AVX512-NEXT:    vpmovsxdq %ymm0, %zmm0
1305; AVX512-NEXT:    retq
1306;
1307; X86-SSE2-LABEL: sext_8i32_to_8i64:
1308; X86-SSE2:       # %bb.0: # %entry
1309; X86-SSE2-NEXT:    movdqa %xmm1, %xmm2
1310; X86-SSE2-NEXT:    pxor %xmm4, %xmm4
1311; X86-SSE2-NEXT:    pxor %xmm3, %xmm3
1312; X86-SSE2-NEXT:    pcmpgtd %xmm0, %xmm3
1313; X86-SSE2-NEXT:    pxor %xmm5, %xmm5
1314; X86-SSE2-NEXT:    pcmpgtd %xmm1, %xmm5
1315; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
1316; X86-SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1]
1317; X86-SSE2-NEXT:    pxor %xmm3, %xmm3
1318; X86-SSE2-NEXT:    pcmpgtd %xmm1, %xmm3
1319; X86-SSE2-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1]
1320; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm2[2,3,2,3]
1321; X86-SSE2-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1]
1322; X86-SSE2-NEXT:    pcmpgtd %xmm3, %xmm4
1323; X86-SSE2-NEXT:    punpckldq {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1]
1324; X86-SSE2-NEXT:    retl
1325;
1326; X86-SSE41-LABEL: sext_8i32_to_8i64:
1327; X86-SSE41:       # %bb.0: # %entry
1328; X86-SSE41-NEXT:    pmovsxdq %xmm0, %xmm5
1329; X86-SSE41-NEXT:    pmovsxdq %xmm1, %xmm2
1330; X86-SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
1331; X86-SSE41-NEXT:    pmovsxdq %xmm0, %xmm4
1332; X86-SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3]
1333; X86-SSE41-NEXT:    pmovsxdq %xmm0, %xmm3
1334; X86-SSE41-NEXT:    movdqa %xmm5, %xmm0
1335; X86-SSE41-NEXT:    movdqa %xmm4, %xmm1
1336; X86-SSE41-NEXT:    retl
1337entry:
1338  %B = sext <8 x i32> %A to <8 x i64>
1339  ret <8 x i64> %B
1340}
1341
; Loads a <2 x i1> mask from %ptr and sign-extends it to <2 x i64>, so each
; result lane is all-ones or all-zeros.
; Expected lowering per the checks below
;   - SSE and AVX1/AVX2 load one byte, isolate bit 0 (andl $1) and bit 1 (shrb),
;     negate each to get 0 or -1, and combine the two scalars with (v)punpcklqdq.
;   - AVX512 loads the mask straight into k1 (kmovw) and materialises the lanes
;     with a zero-masked vpternlogq $255, followed by vzeroupper.
;   - The 32-bit runs negate 32-bit scalars (negl) and replicate each into both
;     dwords of its 64-bit lane (pshufd [0,0,1,1] on SSE2, pinsrd on SSE4.1).
1342define <2 x i64> @load_sext_2i1_to_2i64(<2 x i1> *%ptr) {
1343; SSE-LABEL: load_sext_2i1_to_2i64:
1344; SSE:       # %bb.0: # %entry
1345; SSE-NEXT:    movb (%rdi), %al
1346; SSE-NEXT:    movzbl %al, %ecx
1347; SSE-NEXT:    shrb %al
1348; SSE-NEXT:    movzbl %al, %eax
1349; SSE-NEXT:    negq %rax
1350; SSE-NEXT:    movq %rax, %xmm1
1351; SSE-NEXT:    andl $1, %ecx
1352; SSE-NEXT:    negq %rcx
1353; SSE-NEXT:    movq %rcx, %xmm0
1354; SSE-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
1355; SSE-NEXT:    retq
1356;
1357; AVX1-LABEL: load_sext_2i1_to_2i64:
1358; AVX1:       # %bb.0: # %entry
1359; AVX1-NEXT:    movb (%rdi), %al
1360; AVX1-NEXT:    movzbl %al, %ecx
1361; AVX1-NEXT:    shrb %al
1362; AVX1-NEXT:    movzbl %al, %eax
1363; AVX1-NEXT:    negq %rax
1364; AVX1-NEXT:    vmovq %rax, %xmm0
1365; AVX1-NEXT:    andl $1, %ecx
1366; AVX1-NEXT:    negq %rcx
1367; AVX1-NEXT:    vmovq %rcx, %xmm1
1368; AVX1-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
1369; AVX1-NEXT:    retq
1370;
1371; AVX2-LABEL: load_sext_2i1_to_2i64:
1372; AVX2:       # %bb.0: # %entry
1373; AVX2-NEXT:    movb (%rdi), %al
1374; AVX2-NEXT:    movzbl %al, %ecx
1375; AVX2-NEXT:    shrb %al
1376; AVX2-NEXT:    movzbl %al, %eax
1377; AVX2-NEXT:    negq %rax
1378; AVX2-NEXT:    vmovq %rax, %xmm0
1379; AVX2-NEXT:    andl $1, %ecx
1380; AVX2-NEXT:    negq %rcx
1381; AVX2-NEXT:    vmovq %rcx, %xmm1
1382; AVX2-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
1383; AVX2-NEXT:    retq
1384;
1385; AVX512-LABEL: load_sext_2i1_to_2i64:
1386; AVX512:       # %bb.0: # %entry
1387; AVX512-NEXT:    kmovw (%rdi), %k1
1388; AVX512-NEXT:    vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
1389; AVX512-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
1390; AVX512-NEXT:    vzeroupper
1391; AVX512-NEXT:    retq
1392;
1393; X86-SSE2-LABEL: load_sext_2i1_to_2i64:
1394; X86-SSE2:       # %bb.0: # %entry
1395; X86-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %eax
1396; X86-SSE2-NEXT:    movb (%eax), %al
1397; X86-SSE2-NEXT:    movzbl %al, %ecx
1398; X86-SSE2-NEXT:    shrb %al
1399; X86-SSE2-NEXT:    movzbl %al, %eax
1400; X86-SSE2-NEXT:    negl %eax
1401; X86-SSE2-NEXT:    movd %eax, %xmm0
1402; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[0,0,1,1]
1403; X86-SSE2-NEXT:    andl $1, %ecx
1404; X86-SSE2-NEXT:    negl %ecx
1405; X86-SSE2-NEXT:    movd %ecx, %xmm0
1406; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,0,1,1]
1407; X86-SSE2-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
1408; X86-SSE2-NEXT:    retl
1409;
1410; X86-SSE41-LABEL: load_sext_2i1_to_2i64:
1411; X86-SSE41:       # %bb.0: # %entry
1412; X86-SSE41-NEXT:    movl {{[0-9]+}}(%esp), %eax
1413; X86-SSE41-NEXT:    movb (%eax), %al
1414; X86-SSE41-NEXT:    movzbl %al, %ecx
1415; X86-SSE41-NEXT:    andl $1, %ecx
1416; X86-SSE41-NEXT:    negl %ecx
1417; X86-SSE41-NEXT:    movd %ecx, %xmm0
1418; X86-SSE41-NEXT:    pinsrd $1, %ecx, %xmm0
1419; X86-SSE41-NEXT:    shrb %al
1420; X86-SSE41-NEXT:    movzbl %al, %eax
1421; X86-SSE41-NEXT:    negl %eax
1422; X86-SSE41-NEXT:    pinsrd $2, %eax, %xmm0
1423; X86-SSE41-NEXT:    pinsrd $3, %eax, %xmm0
1424; X86-SSE41-NEXT:    retl
1425entry:
1426 %X = load <2 x i1>, <2 x i1>* %ptr
1427 %Y = sext <2 x i1> %X to <2 x i64>
1428 ret <2 x i64> %Y
1429}
1430
; Loads a <2 x i8> from %ptr and sign-extends it to <2 x i64>.
; With SSE4.1 or AVX the checks expect the load to be folded into a single
; (v)pmovsxbq with a memory operand. On SSE2/SSSE3 the checks expect a 16-bit
; scalar load (movzwl + movd), byte and word unpacks, psrad $24 to sign-extend to
; 32 bits, and a pcmpgtd sign mask interleaved in via punpckldq.
; The 32-bit runs first fetch %ptr from the stack.
1431define <2 x i64> @load_sext_2i8_to_2i64(<2 x i8> *%ptr) {
1432; SSE2-LABEL: load_sext_2i8_to_2i64:
1433; SSE2:       # %bb.0: # %entry
1434; SSE2-NEXT:    movzwl (%rdi), %eax
1435; SSE2-NEXT:    movd %eax, %xmm0
1436; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
1437; SSE2-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
1438; SSE2-NEXT:    pxor %xmm1, %xmm1
1439; SSE2-NEXT:    pcmpgtd %xmm0, %xmm1
1440; SSE2-NEXT:    psrad $24, %xmm0
1441; SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
1442; SSE2-NEXT:    retq
1443;
1444; SSSE3-LABEL: load_sext_2i8_to_2i64:
1445; SSSE3:       # %bb.0: # %entry
1446; SSSE3-NEXT:    movzwl (%rdi), %eax
1447; SSSE3-NEXT:    movd %eax, %xmm0
1448; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
1449; SSSE3-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
1450; SSSE3-NEXT:    pxor %xmm1, %xmm1
1451; SSSE3-NEXT:    pcmpgtd %xmm0, %xmm1
1452; SSSE3-NEXT:    psrad $24, %xmm0
1453; SSSE3-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
1454; SSSE3-NEXT:    retq
1455;
1456; SSE41-LABEL: load_sext_2i8_to_2i64:
1457; SSE41:       # %bb.0: # %entry
1458; SSE41-NEXT:    pmovsxbq (%rdi), %xmm0
1459; SSE41-NEXT:    retq
1460;
1461; AVX-LABEL: load_sext_2i8_to_2i64:
1462; AVX:       # %bb.0: # %entry
1463; AVX-NEXT:    vpmovsxbq (%rdi), %xmm0
1464; AVX-NEXT:    retq
1465;
1466; X86-SSE2-LABEL: load_sext_2i8_to_2i64:
1467; X86-SSE2:       # %bb.0: # %entry
1468; X86-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %eax
1469; X86-SSE2-NEXT:    movzwl (%eax), %eax
1470; X86-SSE2-NEXT:    movd %eax, %xmm0
1471; X86-SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
1472; X86-SSE2-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
1473; X86-SSE2-NEXT:    pxor %xmm1, %xmm1
1474; X86-SSE2-NEXT:    pcmpgtd %xmm0, %xmm1
1475; X86-SSE2-NEXT:    psrad $24, %xmm0
1476; X86-SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
1477; X86-SSE2-NEXT:    retl
1478;
1479; X86-SSE41-LABEL: load_sext_2i8_to_2i64:
1480; X86-SSE41:       # %bb.0: # %entry
1481; X86-SSE41-NEXT:    movl {{[0-9]+}}(%esp), %eax
1482; X86-SSE41-NEXT:    pmovsxbq (%eax), %xmm0
1483; X86-SSE41-NEXT:    retl
1484entry:
1485 %X = load <2 x i8>, <2 x i8>* %ptr
1486 %Y = sext <2 x i8> %X to <2 x i64>
1487 ret <2 x i64> %Y
1488}
1489
; Sign-extending load: <4 x i1> read from %ptr, each element widened to i32.
; Without AVX-512 the expected code loads a single byte and isolates each bit
; in scalar registers (shift, and $1, neg), then assembles the lanes with
; movd + punpckldq/punpcklqdq (SSE2, SSSE3) or pinsrd / vpinsrd (SSE4.1, AVX1,
; AVX2). The AVX-512 runs expect a kmovw mask load feeding a zero-masked
; vpternlogd $255.
; NOTE(review): in the scalar sequences lane 3 is negated right after
; `shrb $3` with no `and $1`, so the expected code appears to rely on bits 4-7
; of the loaded byte being zero -- confirm this is the intended contract.
1490define <4 x i32> @load_sext_4i1_to_4i32(<4 x i1> *%ptr) {
1491; SSE2-LABEL: load_sext_4i1_to_4i32:
1492; SSE2:       # %bb.0: # %entry
1493; SSE2-NEXT:    movb (%rdi), %al
1494; SSE2-NEXT:    movl %eax, %ecx
1495; SSE2-NEXT:    shrb $3, %cl
1496; SSE2-NEXT:    movzbl %cl, %ecx
1497; SSE2-NEXT:    negl %ecx
1498; SSE2-NEXT:    movd %ecx, %xmm0
1499; SSE2-NEXT:    movzbl %al, %ecx
1500; SSE2-NEXT:    shrb $2, %al
1501; SSE2-NEXT:    movzbl %al, %eax
1502; SSE2-NEXT:    andl $1, %eax
1503; SSE2-NEXT:    negl %eax
1504; SSE2-NEXT:    movd %eax, %xmm1
1505; SSE2-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
1506; SSE2-NEXT:    movl %ecx, %eax
1507; SSE2-NEXT:    andl $1, %eax
1508; SSE2-NEXT:    negl %eax
1509; SSE2-NEXT:    movd %eax, %xmm0
1510; SSE2-NEXT:    shrb %cl
1511; SSE2-NEXT:    movzbl %cl, %eax
1512; SSE2-NEXT:    andl $1, %eax
1513; SSE2-NEXT:    negl %eax
1514; SSE2-NEXT:    movd %eax, %xmm2
1515; SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
1516; SSE2-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
1517; SSE2-NEXT:    retq
1518;
1519; SSSE3-LABEL: load_sext_4i1_to_4i32:
1520; SSSE3:       # %bb.0: # %entry
1521; SSSE3-NEXT:    movb (%rdi), %al
1522; SSSE3-NEXT:    movl %eax, %ecx
1523; SSSE3-NEXT:    shrb $3, %cl
1524; SSSE3-NEXT:    movzbl %cl, %ecx
1525; SSSE3-NEXT:    negl %ecx
1526; SSSE3-NEXT:    movd %ecx, %xmm0
1527; SSSE3-NEXT:    movzbl %al, %ecx
1528; SSSE3-NEXT:    shrb $2, %al
1529; SSSE3-NEXT:    movzbl %al, %eax
1530; SSSE3-NEXT:    andl $1, %eax
1531; SSSE3-NEXT:    negl %eax
1532; SSSE3-NEXT:    movd %eax, %xmm1
1533; SSSE3-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
1534; SSSE3-NEXT:    movl %ecx, %eax
1535; SSSE3-NEXT:    andl $1, %eax
1536; SSSE3-NEXT:    negl %eax
1537; SSSE3-NEXT:    movd %eax, %xmm0
1538; SSSE3-NEXT:    shrb %cl
1539; SSSE3-NEXT:    movzbl %cl, %eax
1540; SSSE3-NEXT:    andl $1, %eax
1541; SSSE3-NEXT:    negl %eax
1542; SSSE3-NEXT:    movd %eax, %xmm2
1543; SSSE3-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
1544; SSSE3-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
1545; SSSE3-NEXT:    retq
1546;
1547; SSE41-LABEL: load_sext_4i1_to_4i32:
1548; SSE41:       # %bb.0: # %entry
1549; SSE41-NEXT:    movb (%rdi), %al
1550; SSE41-NEXT:    movzbl %al, %ecx
1551; SSE41-NEXT:    shrb %al
1552; SSE41-NEXT:    movzbl %al, %eax
1553; SSE41-NEXT:    andl $1, %eax
1554; SSE41-NEXT:    negl %eax
1555; SSE41-NEXT:    movl %ecx, %edx
1556; SSE41-NEXT:    andl $1, %edx
1557; SSE41-NEXT:    negl %edx
1558; SSE41-NEXT:    movd %edx, %xmm0
1559; SSE41-NEXT:    pinsrd $1, %eax, %xmm0
1560; SSE41-NEXT:    movl %ecx, %eax
1561; SSE41-NEXT:    shrb $2, %al
1562; SSE41-NEXT:    movzbl %al, %eax
1563; SSE41-NEXT:    andl $1, %eax
1564; SSE41-NEXT:    negl %eax
1565; SSE41-NEXT:    pinsrd $2, %eax, %xmm0
1566; SSE41-NEXT:    shrb $3, %cl
1567; SSE41-NEXT:    movzbl %cl, %eax
1568; SSE41-NEXT:    negl %eax
1569; SSE41-NEXT:    pinsrd $3, %eax, %xmm0
1570; SSE41-NEXT:    retq
1571;
1572; AVX1-LABEL: load_sext_4i1_to_4i32:
1573; AVX1:       # %bb.0: # %entry
1574; AVX1-NEXT:    movb (%rdi), %al
1575; AVX1-NEXT:    movzbl %al, %ecx
1576; AVX1-NEXT:    shrb %al
1577; AVX1-NEXT:    movzbl %al, %eax
1578; AVX1-NEXT:    andl $1, %eax
1579; AVX1-NEXT:    negl %eax
1580; AVX1-NEXT:    movl %ecx, %edx
1581; AVX1-NEXT:    andl $1, %edx
1582; AVX1-NEXT:    negl %edx
1583; AVX1-NEXT:    vmovd %edx, %xmm0
1584; AVX1-NEXT:    vpinsrd $1, %eax, %xmm0, %xmm0
1585; AVX1-NEXT:    movl %ecx, %eax
1586; AVX1-NEXT:    shrb $2, %al
1587; AVX1-NEXT:    movzbl %al, %eax
1588; AVX1-NEXT:    andl $1, %eax
1589; AVX1-NEXT:    negl %eax
1590; AVX1-NEXT:    vpinsrd $2, %eax, %xmm0, %xmm0
1591; AVX1-NEXT:    shrb $3, %cl
1592; AVX1-NEXT:    movzbl %cl, %eax
1593; AVX1-NEXT:    negl %eax
1594; AVX1-NEXT:    vpinsrd $3, %eax, %xmm0, %xmm0
1595; AVX1-NEXT:    retq
1596;
1597; AVX2-LABEL: load_sext_4i1_to_4i32:
1598; AVX2:       # %bb.0: # %entry
1599; AVX2-NEXT:    movb (%rdi), %al
1600; AVX2-NEXT:    movzbl %al, %ecx
1601; AVX2-NEXT:    shrb %al
1602; AVX2-NEXT:    movzbl %al, %eax
1603; AVX2-NEXT:    andl $1, %eax
1604; AVX2-NEXT:    negl %eax
1605; AVX2-NEXT:    movl %ecx, %edx
1606; AVX2-NEXT:    andl $1, %edx
1607; AVX2-NEXT:    negl %edx
1608; AVX2-NEXT:    vmovd %edx, %xmm0
1609; AVX2-NEXT:    vpinsrd $1, %eax, %xmm0, %xmm0
1610; AVX2-NEXT:    movl %ecx, %eax
1611; AVX2-NEXT:    shrb $2, %al
1612; AVX2-NEXT:    movzbl %al, %eax
1613; AVX2-NEXT:    andl $1, %eax
1614; AVX2-NEXT:    negl %eax
1615; AVX2-NEXT:    vpinsrd $2, %eax, %xmm0, %xmm0
1616; AVX2-NEXT:    shrb $3, %cl
1617; AVX2-NEXT:    movzbl %cl, %eax
1618; AVX2-NEXT:    negl %eax
1619; AVX2-NEXT:    vpinsrd $3, %eax, %xmm0, %xmm0
1620; AVX2-NEXT:    retq
1621;
1622; AVX512-LABEL: load_sext_4i1_to_4i32:
1623; AVX512:       # %bb.0: # %entry
1624; AVX512-NEXT:    kmovw (%rdi), %k1
1625; AVX512-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
1626; AVX512-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
1627; AVX512-NEXT:    vzeroupper
1628; AVX512-NEXT:    retq
1629;
1630; X86-SSE2-LABEL: load_sext_4i1_to_4i32:
1631; X86-SSE2:       # %bb.0: # %entry
1632; X86-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %eax
1633; X86-SSE2-NEXT:    movb (%eax), %al
1634; X86-SSE2-NEXT:    movl %eax, %ecx
1635; X86-SSE2-NEXT:    shrb $3, %cl
1636; X86-SSE2-NEXT:    movzbl %cl, %ecx
1637; X86-SSE2-NEXT:    negl %ecx
1638; X86-SSE2-NEXT:    movd %ecx, %xmm0
1639; X86-SSE2-NEXT:    movl %eax, %ecx
1640; X86-SSE2-NEXT:    shrb $2, %cl
1641; X86-SSE2-NEXT:    movzbl %cl, %ecx
1642; X86-SSE2-NEXT:    andl $1, %ecx
1643; X86-SSE2-NEXT:    negl %ecx
1644; X86-SSE2-NEXT:    movd %ecx, %xmm1
1645; X86-SSE2-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
1646; X86-SSE2-NEXT:    movzbl %al, %ecx
1647; X86-SSE2-NEXT:    andl $1, %ecx
1648; X86-SSE2-NEXT:    negl %ecx
1649; X86-SSE2-NEXT:    movd %ecx, %xmm0
1650; X86-SSE2-NEXT:    shrb %al
1651; X86-SSE2-NEXT:    movzbl %al, %eax
1652; X86-SSE2-NEXT:    andl $1, %eax
1653; X86-SSE2-NEXT:    negl %eax
1654; X86-SSE2-NEXT:    movd %eax, %xmm2
1655; X86-SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
1656; X86-SSE2-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
1657; X86-SSE2-NEXT:    retl
1658;
1659; X86-SSE41-LABEL: load_sext_4i1_to_4i32:
1660; X86-SSE41:       # %bb.0: # %entry
1661; X86-SSE41-NEXT:    movl {{[0-9]+}}(%esp), %eax
1662; X86-SSE41-NEXT:    movb (%eax), %al
1663; X86-SSE41-NEXT:    movl %eax, %ecx
1664; X86-SSE41-NEXT:    shrb %cl
1665; X86-SSE41-NEXT:    movzbl %cl, %ecx
1666; X86-SSE41-NEXT:    andl $1, %ecx
1667; X86-SSE41-NEXT:    negl %ecx
1668; X86-SSE41-NEXT:    movzbl %al, %edx
1669; X86-SSE41-NEXT:    andl $1, %edx
1670; X86-SSE41-NEXT:    negl %edx
1671; X86-SSE41-NEXT:    movd %edx, %xmm0
1672; X86-SSE41-NEXT:    pinsrd $1, %ecx, %xmm0
1673; X86-SSE41-NEXT:    movl %eax, %ecx
1674; X86-SSE41-NEXT:    shrb $2, %cl
1675; X86-SSE41-NEXT:    movzbl %cl, %ecx
1676; X86-SSE41-NEXT:    andl $1, %ecx
1677; X86-SSE41-NEXT:    negl %ecx
1678; X86-SSE41-NEXT:    pinsrd $2, %ecx, %xmm0
1679; X86-SSE41-NEXT:    shrb $3, %al
1680; X86-SSE41-NEXT:    movzbl %al, %eax
1681; X86-SSE41-NEXT:    negl %eax
1682; X86-SSE41-NEXT:    pinsrd $3, %eax, %xmm0
1683; X86-SSE41-NEXT:    retl
1684entry:
1685 %X = load <4 x i1>, <4 x i1>* %ptr
1686 %Y = sext <4 x i1> %X to <4 x i32>
1687 ret <4 x i32> %Y
1688}
1689
; Sign-extending load: <4 x i8> read from %ptr, each element widened to i32.
; The SSE4.1 and AVX runs expect a single pmovsxbd / vpmovsxbd with a memory
; operand. The SSE2 and SSSE3 runs expect a 32-bit movd load, punpcklbw +
; punpcklwd to replicate every byte across its 32-bit lane, and psrad $24 to
; produce the sign-extended value.
1690define <4 x i32> @load_sext_4i8_to_4i32(<4 x i8> *%ptr) {
1691; SSE2-LABEL: load_sext_4i8_to_4i32:
1692; SSE2:       # %bb.0: # %entry
1693; SSE2-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
1694; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
1695; SSE2-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
1696; SSE2-NEXT:    psrad $24, %xmm0
1697; SSE2-NEXT:    retq
1698;
1699; SSSE3-LABEL: load_sext_4i8_to_4i32:
1700; SSSE3:       # %bb.0: # %entry
1701; SSSE3-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
1702; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
1703; SSSE3-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
1704; SSSE3-NEXT:    psrad $24, %xmm0
1705; SSSE3-NEXT:    retq
1706;
1707; SSE41-LABEL: load_sext_4i8_to_4i32:
1708; SSE41:       # %bb.0: # %entry
1709; SSE41-NEXT:    pmovsxbd (%rdi), %xmm0
1710; SSE41-NEXT:    retq
1711;
1712; AVX-LABEL: load_sext_4i8_to_4i32:
1713; AVX:       # %bb.0: # %entry
1714; AVX-NEXT:    vpmovsxbd (%rdi), %xmm0
1715; AVX-NEXT:    retq
1716;
1717; X86-SSE2-LABEL: load_sext_4i8_to_4i32:
1718; X86-SSE2:       # %bb.0: # %entry
1719; X86-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %eax
1720; X86-SSE2-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
1721; X86-SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
1722; X86-SSE2-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
1723; X86-SSE2-NEXT:    psrad $24, %xmm0
1724; X86-SSE2-NEXT:    retl
1725;
1726; X86-SSE41-LABEL: load_sext_4i8_to_4i32:
1727; X86-SSE41:       # %bb.0: # %entry
1728; X86-SSE41-NEXT:    movl {{[0-9]+}}(%esp), %eax
1729; X86-SSE41-NEXT:    pmovsxbd (%eax), %xmm0
1730; X86-SSE41-NEXT:    retl
1731entry:
1732 %X = load <4 x i8>, <4 x i8>* %ptr
1733 %Y = sext <4 x i8> %X to <4 x i32>
1734 ret <4 x i32> %Y
1735}
1736
; Sign-extending load: <4 x i1> read from %ptr, each element widened to i64
; (a 256-bit result).
; The SSE runs return the result split across xmm0 and xmm1: the individual
; bits are inserted with pinsrw (SSE2, SSSE3) or pinsrb (SSE4.1), and each
; half is then widened with psllq $63 + psrad $31 + pshufd.
; The AVX1 run builds a <4 x i32> of 0/-1 with vpinsrd and widens it with
; vpmovsxdq + vinsertf128. The AVX2 run builds the 64-bit lanes directly with
; negq/vmovq and joins them with vpunpcklqdq + vinserti128. The AVX-512 runs
; expect a kmovw mask load feeding a zero-masked vpternlogq $255.
1737define <4 x i64> @load_sext_4i1_to_4i64(<4 x i1> *%ptr) {
1738; SSE2-LABEL: load_sext_4i1_to_4i64:
1739; SSE2:       # %bb.0: # %entry
1740; SSE2-NEXT:    movb (%rdi), %al
1741; SSE2-NEXT:    movl %eax, %ecx
1742; SSE2-NEXT:    shrb %cl
1743; SSE2-NEXT:    andb $1, %cl
1744; SSE2-NEXT:    movzbl %cl, %ecx
1745; SSE2-NEXT:    movl %eax, %edx
1746; SSE2-NEXT:    andb $1, %dl
1747; SSE2-NEXT:    movzbl %dl, %edx
1748; SSE2-NEXT:    movd %edx, %xmm1
1749; SSE2-NEXT:    pinsrw $2, %ecx, %xmm1
1750; SSE2-NEXT:    movl %eax, %ecx
1751; SSE2-NEXT:    shrb $2, %cl
1752; SSE2-NEXT:    andb $1, %cl
1753; SSE2-NEXT:    movzbl %cl, %ecx
1754; SSE2-NEXT:    pinsrw $4, %ecx, %xmm1
1755; SSE2-NEXT:    shrb $3, %al
1756; SSE2-NEXT:    movzbl %al, %eax
1757; SSE2-NEXT:    pinsrw $6, %eax, %xmm1
1758; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[0,1,1,3]
1759; SSE2-NEXT:    psllq $63, %xmm0
1760; SSE2-NEXT:    psrad $31, %xmm0
1761; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
1762; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[2,1,3,3]
1763; SSE2-NEXT:    psllq $63, %xmm1
1764; SSE2-NEXT:    psrad $31, %xmm1
1765; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
1766; SSE2-NEXT:    retq
1767;
1768; SSSE3-LABEL: load_sext_4i1_to_4i64:
1769; SSSE3:       # %bb.0: # %entry
1770; SSSE3-NEXT:    movb (%rdi), %al
1771; SSSE3-NEXT:    movl %eax, %ecx
1772; SSSE3-NEXT:    shrb %cl
1773; SSSE3-NEXT:    andb $1, %cl
1774; SSSE3-NEXT:    movzbl %cl, %ecx
1775; SSSE3-NEXT:    movl %eax, %edx
1776; SSSE3-NEXT:    andb $1, %dl
1777; SSSE3-NEXT:    movzbl %dl, %edx
1778; SSSE3-NEXT:    movd %edx, %xmm1
1779; SSSE3-NEXT:    pinsrw $2, %ecx, %xmm1
1780; SSSE3-NEXT:    movl %eax, %ecx
1781; SSSE3-NEXT:    shrb $2, %cl
1782; SSSE3-NEXT:    andb $1, %cl
1783; SSSE3-NEXT:    movzbl %cl, %ecx
1784; SSSE3-NEXT:    pinsrw $4, %ecx, %xmm1
1785; SSSE3-NEXT:    shrb $3, %al
1786; SSSE3-NEXT:    movzbl %al, %eax
1787; SSSE3-NEXT:    pinsrw $6, %eax, %xmm1
1788; SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[0,1,1,3]
1789; SSSE3-NEXT:    psllq $63, %xmm0
1790; SSSE3-NEXT:    psrad $31, %xmm0
1791; SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
1792; SSSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[2,1,3,3]
1793; SSSE3-NEXT:    psllq $63, %xmm1
1794; SSSE3-NEXT:    psrad $31, %xmm1
1795; SSSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
1796; SSSE3-NEXT:    retq
1797;
1798; SSE41-LABEL: load_sext_4i1_to_4i64:
1799; SSE41:       # %bb.0: # %entry
1800; SSE41-NEXT:    movb (%rdi), %al
1801; SSE41-NEXT:    movl %eax, %ecx
1802; SSE41-NEXT:    shrb %cl
1803; SSE41-NEXT:    andb $1, %cl
1804; SSE41-NEXT:    movzbl %cl, %ecx
1805; SSE41-NEXT:    movl %eax, %edx
1806; SSE41-NEXT:    andb $1, %dl
1807; SSE41-NEXT:    movzbl %dl, %edx
1808; SSE41-NEXT:    movd %edx, %xmm1
1809; SSE41-NEXT:    pinsrb $4, %ecx, %xmm1
1810; SSE41-NEXT:    movl %eax, %ecx
1811; SSE41-NEXT:    shrb $2, %cl
1812; SSE41-NEXT:    andb $1, %cl
1813; SSE41-NEXT:    movzbl %cl, %ecx
1814; SSE41-NEXT:    pmovzxdq {{.*#+}} xmm0 = xmm1[0],zero,xmm1[1],zero
1815; SSE41-NEXT:    pinsrb $8, %ecx, %xmm1
1816; SSE41-NEXT:    shrb $3, %al
1817; SSE41-NEXT:    movzbl %al, %eax
1818; SSE41-NEXT:    pinsrb $12, %eax, %xmm1
1819; SSE41-NEXT:    psllq $63, %xmm0
1820; SSE41-NEXT:    psrad $31, %xmm0
1821; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
1822; SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[2,2,3,3]
1823; SSE41-NEXT:    psllq $63, %xmm1
1824; SSE41-NEXT:    psrad $31, %xmm1
1825; SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
1826; SSE41-NEXT:    retq
1827;
1828; AVX1-LABEL: load_sext_4i1_to_4i64:
1829; AVX1:       # %bb.0: # %entry
1830; AVX1-NEXT:    movb (%rdi), %al
1831; AVX1-NEXT:    movzbl %al, %ecx
1832; AVX1-NEXT:    shrb %al
1833; AVX1-NEXT:    movzbl %al, %eax
1834; AVX1-NEXT:    andl $1, %eax
1835; AVX1-NEXT:    negl %eax
1836; AVX1-NEXT:    movl %ecx, %edx
1837; AVX1-NEXT:    andl $1, %edx
1838; AVX1-NEXT:    negl %edx
1839; AVX1-NEXT:    vmovd %edx, %xmm0
1840; AVX1-NEXT:    vpinsrd $1, %eax, %xmm0, %xmm0
1841; AVX1-NEXT:    movl %ecx, %eax
1842; AVX1-NEXT:    shrb $2, %al
1843; AVX1-NEXT:    movzbl %al, %eax
1844; AVX1-NEXT:    andl $1, %eax
1845; AVX1-NEXT:    negl %eax
1846; AVX1-NEXT:    vpinsrd $2, %eax, %xmm0, %xmm0
1847; AVX1-NEXT:    shrb $3, %cl
1848; AVX1-NEXT:    movzbl %cl, %eax
1849; AVX1-NEXT:    negl %eax
1850; AVX1-NEXT:    vpinsrd $3, %eax, %xmm0, %xmm0
1851; AVX1-NEXT:    vpmovsxdq %xmm0, %xmm1
1852; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
1853; AVX1-NEXT:    vpmovsxdq %xmm0, %xmm0
1854; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
1855; AVX1-NEXT:    retq
1856;
1857; AVX2-LABEL: load_sext_4i1_to_4i64:
1858; AVX2:       # %bb.0: # %entry
1859; AVX2-NEXT:    movb (%rdi), %al
1860; AVX2-NEXT:    movl %eax, %ecx
1861; AVX2-NEXT:    shrb $3, %cl
1862; AVX2-NEXT:    movzbl %cl, %ecx
1863; AVX2-NEXT:    negq %rcx
1864; AVX2-NEXT:    vmovq %rcx, %xmm0
1865; AVX2-NEXT:    movzbl %al, %ecx
1866; AVX2-NEXT:    shrb $2, %al
1867; AVX2-NEXT:    movzbl %al, %eax
1868; AVX2-NEXT:    andl $1, %eax
1869; AVX2-NEXT:    negq %rax
1870; AVX2-NEXT:    vmovq %rax, %xmm1
1871; AVX2-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
1872; AVX2-NEXT:    movl %ecx, %eax
1873; AVX2-NEXT:    andl $1, %eax
1874; AVX2-NEXT:    negq %rax
1875; AVX2-NEXT:    vmovq %rax, %xmm1
1876; AVX2-NEXT:    shrb %cl
1877; AVX2-NEXT:    movzbl %cl, %eax
1878; AVX2-NEXT:    andl $1, %eax
1879; AVX2-NEXT:    negq %rax
1880; AVX2-NEXT:    vmovq %rax, %xmm2
1881; AVX2-NEXT:    vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
1882; AVX2-NEXT:    vinserti128 $1, %xmm0, %ymm1, %ymm0
1883; AVX2-NEXT:    retq
1884;
1885; AVX512-LABEL: load_sext_4i1_to_4i64:
1886; AVX512:       # %bb.0: # %entry
1887; AVX512-NEXT:    kmovw (%rdi), %k1
1888; AVX512-NEXT:    vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
1889; AVX512-NEXT:    # kill: def $ymm0 killed $ymm0 killed $zmm0
1890; AVX512-NEXT:    retq
1891;
1892; X86-SSE2-LABEL: load_sext_4i1_to_4i64:
1893; X86-SSE2:       # %bb.0: # %entry
1894; X86-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %eax
1895; X86-SSE2-NEXT:    movb (%eax), %al
1896; X86-SSE2-NEXT:    movl %eax, %ecx
1897; X86-SSE2-NEXT:    shrb %cl
1898; X86-SSE2-NEXT:    andb $1, %cl
1899; X86-SSE2-NEXT:    movzbl %cl, %ecx
1900; X86-SSE2-NEXT:    movl %eax, %edx
1901; X86-SSE2-NEXT:    andb $1, %dl
1902; X86-SSE2-NEXT:    movzbl %dl, %edx
1903; X86-SSE2-NEXT:    movd %edx, %xmm1
1904; X86-SSE2-NEXT:    pinsrw $2, %ecx, %xmm1
1905; X86-SSE2-NEXT:    movl %eax, %ecx
1906; X86-SSE2-NEXT:    shrb $2, %cl
1907; X86-SSE2-NEXT:    andb $1, %cl
1908; X86-SSE2-NEXT:    movzbl %cl, %ecx
1909; X86-SSE2-NEXT:    pinsrw $4, %ecx, %xmm1
1910; X86-SSE2-NEXT:    shrb $3, %al
1911; X86-SSE2-NEXT:    movzbl %al, %eax
1912; X86-SSE2-NEXT:    pinsrw $6, %eax, %xmm1
1913; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[0,1,1,3]
1914; X86-SSE2-NEXT:    psllq $63, %xmm0
1915; X86-SSE2-NEXT:    psrad $31, %xmm0
1916; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
1917; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[2,1,3,3]
1918; X86-SSE2-NEXT:    psllq $63, %xmm1
1919; X86-SSE2-NEXT:    psrad $31, %xmm1
1920; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
1921; X86-SSE2-NEXT:    retl
1922;
1923; X86-SSE41-LABEL: load_sext_4i1_to_4i64:
1924; X86-SSE41:       # %bb.0: # %entry
1925; X86-SSE41-NEXT:    movl {{[0-9]+}}(%esp), %eax
1926; X86-SSE41-NEXT:    movb (%eax), %al
1927; X86-SSE41-NEXT:    movl %eax, %ecx
1928; X86-SSE41-NEXT:    shrb %cl
1929; X86-SSE41-NEXT:    andb $1, %cl
1930; X86-SSE41-NEXT:    movzbl %cl, %ecx
1931; X86-SSE41-NEXT:    movl %eax, %edx
1932; X86-SSE41-NEXT:    andb $1, %dl
1933; X86-SSE41-NEXT:    movzbl %dl, %edx
1934; X86-SSE41-NEXT:    movd %edx, %xmm1
1935; X86-SSE41-NEXT:    pinsrb $4, %ecx, %xmm1
1936; X86-SSE41-NEXT:    movl %eax, %ecx
1937; X86-SSE41-NEXT:    shrb $2, %cl
1938; X86-SSE41-NEXT:    andb $1, %cl
1939; X86-SSE41-NEXT:    movzbl %cl, %ecx
1940; X86-SSE41-NEXT:    pmovzxdq {{.*#+}} xmm0 = xmm1[0],zero,xmm1[1],zero
1941; X86-SSE41-NEXT:    pinsrb $8, %ecx, %xmm1
1942; X86-SSE41-NEXT:    shrb $3, %al
1943; X86-SSE41-NEXT:    movzbl %al, %eax
1944; X86-SSE41-NEXT:    pinsrb $12, %eax, %xmm1
1945; X86-SSE41-NEXT:    psllq $63, %xmm0
1946; X86-SSE41-NEXT:    psrad $31, %xmm0
1947; X86-SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
1948; X86-SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[2,2,3,3]
1949; X86-SSE41-NEXT:    psllq $63, %xmm1
1950; X86-SSE41-NEXT:    psrad $31, %xmm1
1951; X86-SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
1952; X86-SSE41-NEXT:    retl
1953entry:
1954 %X = load <4 x i1>, <4 x i1>* %ptr
1955 %Y = sext <4 x i1> %X to <4 x i64>
1956 ret <4 x i64> %Y
1957}
1958
; Sign-extending load: <4 x i8> read from %ptr, each element widened to i64
; (a 256-bit result).
; The SSE4.1 runs expect two pmovsxbq loads at byte offsets 0 and 2 into
; xmm0/xmm1. The AVX1 run expects the same two loads joined with vinsertf128.
; The AVX2 and AVX-512 runs expect a single vpmovsxbq into ymm0.
; The SSE2 and SSSE3 runs expect unpacks + psrad $24 to get sign-extended
; dwords, then a pcmpgtd sign mask interleaved with punpckldq / punpckhdq to
; form the low and high result halves.
1959define <4 x i64> @load_sext_4i8_to_4i64(<4 x i8> *%ptr) {
1960; SSE2-LABEL: load_sext_4i8_to_4i64:
1961; SSE2:       # %bb.0: # %entry
1962; SSE2-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
1963; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
1964; SSE2-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
1965; SSE2-NEXT:    psrad $24, %xmm1
1966; SSE2-NEXT:    pxor %xmm2, %xmm2
1967; SSE2-NEXT:    pcmpgtd %xmm1, %xmm2
1968; SSE2-NEXT:    movdqa %xmm1, %xmm0
1969; SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
1970; SSE2-NEXT:    punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm2[2],xmm1[3],xmm2[3]
1971; SSE2-NEXT:    retq
1972;
1973; SSSE3-LABEL: load_sext_4i8_to_4i64:
1974; SSSE3:       # %bb.0: # %entry
1975; SSSE3-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
1976; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
1977; SSSE3-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
1978; SSSE3-NEXT:    psrad $24, %xmm1
1979; SSSE3-NEXT:    pxor %xmm2, %xmm2
1980; SSSE3-NEXT:    pcmpgtd %xmm1, %xmm2
1981; SSSE3-NEXT:    movdqa %xmm1, %xmm0
1982; SSSE3-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
1983; SSSE3-NEXT:    punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm2[2],xmm1[3],xmm2[3]
1984; SSSE3-NEXT:    retq
1985;
1986; SSE41-LABEL: load_sext_4i8_to_4i64:
1987; SSE41:       # %bb.0: # %entry
1988; SSE41-NEXT:    pmovsxbq (%rdi), %xmm0
1989; SSE41-NEXT:    pmovsxbq 2(%rdi), %xmm1
1990; SSE41-NEXT:    retq
1991;
1992; AVX1-LABEL: load_sext_4i8_to_4i64:
1993; AVX1:       # %bb.0: # %entry
1994; AVX1-NEXT:    vpmovsxbq 2(%rdi), %xmm0
1995; AVX1-NEXT:    vpmovsxbq (%rdi), %xmm1
1996; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
1997; AVX1-NEXT:    retq
1998;
1999; AVX2-LABEL: load_sext_4i8_to_4i64:
2000; AVX2:       # %bb.0: # %entry
2001; AVX2-NEXT:    vpmovsxbq (%rdi), %ymm0
2002; AVX2-NEXT:    retq
2003;
2004; AVX512-LABEL: load_sext_4i8_to_4i64:
2005; AVX512:       # %bb.0: # %entry
2006; AVX512-NEXT:    vpmovsxbq (%rdi), %ymm0
2007; AVX512-NEXT:    retq
2008;
2009; X86-SSE2-LABEL: load_sext_4i8_to_4i64:
2010; X86-SSE2:       # %bb.0: # %entry
2011; X86-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %eax
2012; X86-SSE2-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
2013; X86-SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
2014; X86-SSE2-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
2015; X86-SSE2-NEXT:    psrad $24, %xmm1
2016; X86-SSE2-NEXT:    pxor %xmm2, %xmm2
2017; X86-SSE2-NEXT:    pcmpgtd %xmm1, %xmm2
2018; X86-SSE2-NEXT:    movdqa %xmm1, %xmm0
2019; X86-SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
2020; X86-SSE2-NEXT:    punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm2[2],xmm1[3],xmm2[3]
2021; X86-SSE2-NEXT:    retl
2022;
2023; X86-SSE41-LABEL: load_sext_4i8_to_4i64:
2024; X86-SSE41:       # %bb.0: # %entry
2025; X86-SSE41-NEXT:    movl {{[0-9]+}}(%esp), %eax
2026; X86-SSE41-NEXT:    pmovsxbq (%eax), %xmm0
2027; X86-SSE41-NEXT:    pmovsxbq 2(%eax), %xmm1
2028; X86-SSE41-NEXT:    retl
2029entry:
2030 %X = load <4 x i8>, <4 x i8>* %ptr
2031 %Y = sext <4 x i8> %X to <4 x i64>
2032 ret <4 x i64> %Y
2033}
2034
; Loads <4 x i8> from %ptr, sign-extends to <4 x i64>, and returns only
; elements 2 and 3 (selected by the shufflevector mask <2, 3>).
; The SSE4.1 and AVX1 runs expect the whole sequence narrowed to one
; pmovsxbq / vpmovsxbq at byte offset 2. The SSE2 and SSSE3 runs expect the
; unpack + psrad $24 + pcmpgtd sequence followed by punpckhdq to keep the
; upper two lanes.
; NOTE(review): the AVX2 and AVX-512 checks extend the full vector into ymm0
; and then vextracti128 $1 (plus vzeroupper) instead of the narrowed load the
; AVX1 run gets -- looks like a missed narrowing opportunity; confirm.
2035define <2 x i64> @load_sext_4i8_to_4i64_extract(<4 x i8> *%ptr) {
2036; SSE2-LABEL: load_sext_4i8_to_4i64_extract:
2037; SSE2:       # %bb.0:
2038; SSE2-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
2039; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
2040; SSE2-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
2041; SSE2-NEXT:    psrad $24, %xmm0
2042; SSE2-NEXT:    pxor %xmm1, %xmm1
2043; SSE2-NEXT:    pcmpgtd %xmm0, %xmm1
2044; SSE2-NEXT:    punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
2045; SSE2-NEXT:    retq
2046;
2047; SSSE3-LABEL: load_sext_4i8_to_4i64_extract:
2048; SSSE3:       # %bb.0:
2049; SSSE3-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
2050; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
2051; SSSE3-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
2052; SSSE3-NEXT:    psrad $24, %xmm0
2053; SSSE3-NEXT:    pxor %xmm1, %xmm1
2054; SSSE3-NEXT:    pcmpgtd %xmm0, %xmm1
2055; SSSE3-NEXT:    punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
2056; SSSE3-NEXT:    retq
2057;
2058; SSE41-LABEL: load_sext_4i8_to_4i64_extract:
2059; SSE41:       # %bb.0:
2060; SSE41-NEXT:    pmovsxbq 2(%rdi), %xmm0
2061; SSE41-NEXT:    retq
2062;
2063; AVX1-LABEL: load_sext_4i8_to_4i64_extract:
2064; AVX1:       # %bb.0:
2065; AVX1-NEXT:    vpmovsxbq 2(%rdi), %xmm0
2066; AVX1-NEXT:    retq
2067;
2068; AVX2-LABEL: load_sext_4i8_to_4i64_extract:
2069; AVX2:       # %bb.0:
2070; AVX2-NEXT:    vpmovsxbq (%rdi), %ymm0
2071; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm0
2072; AVX2-NEXT:    vzeroupper
2073; AVX2-NEXT:    retq
2074;
2075; AVX512-LABEL: load_sext_4i8_to_4i64_extract:
2076; AVX512:       # %bb.0:
2077; AVX512-NEXT:    vpmovsxbq (%rdi), %ymm0
2078; AVX512-NEXT:    vextracti128 $1, %ymm0, %xmm0
2079; AVX512-NEXT:    vzeroupper
2080; AVX512-NEXT:    retq
2081;
2082; X86-SSE2-LABEL: load_sext_4i8_to_4i64_extract:
2083; X86-SSE2:       # %bb.0:
2084; X86-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %eax
2085; X86-SSE2-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
2086; X86-SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
2087; X86-SSE2-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
2088; X86-SSE2-NEXT:    psrad $24, %xmm0
2089; X86-SSE2-NEXT:    pxor %xmm1, %xmm1
2090; X86-SSE2-NEXT:    pcmpgtd %xmm0, %xmm1
2091; X86-SSE2-NEXT:    punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
2092; X86-SSE2-NEXT:    retl
2093;
2094; X86-SSE41-LABEL: load_sext_4i8_to_4i64_extract:
2095; X86-SSE41:       # %bb.0:
2096; X86-SSE41-NEXT:    movl {{[0-9]+}}(%esp), %eax
2097; X86-SSE41-NEXT:    pmovsxbq 2(%eax), %xmm0
2098; X86-SSE41-NEXT:    retl
2099 %ld = load <4 x i8>, <4 x i8>* %ptr
2100 %sext = sext <4 x i8> %ld to <4 x i64>
2101 %extract = shufflevector <4 x i64> %sext, <4 x i64> undef, <2 x i32> <i32 2, i32 3>
2102 ret <2 x i64> %extract
2103}
2104
; Sign-extending load: <8 x i1> read from %ptr, each element widened to i16.
; The SSE and AVX1 runs expect the loaded value broadcast to every 16-bit lane
; (movd + pshuflw + pshufd), ANDed with the per-lane bit constant
; [1,2,4,8,16,32,64,128], and compared for equality against that same constant
; so each lane becomes all-ones or zero. The AVX2 run replaces the broadcast
; with vpbroadcastb from memory.
; The AVX512F run expects kmovw + zero-masked vpternlogd $255 narrowed by
; vpmovdw; the AVX512BW run expects kmovw + vpmovm2w.
2105define <8 x i16> @load_sext_8i1_to_8i16(<8 x i1> *%ptr) {
2106; SSE-LABEL: load_sext_8i1_to_8i16:
2107; SSE:       # %bb.0: # %entry
2108; SSE-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
2109; SSE-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
2110; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
2111; SSE-NEXT:    movdqa {{.*#+}} xmm1 = [1,2,4,8,16,32,64,128]
2112; SSE-NEXT:    pand %xmm1, %xmm0
2113; SSE-NEXT:    pcmpeqw %xmm1, %xmm0
2114; SSE-NEXT:    retq
2115;
2116; AVX1-LABEL: load_sext_8i1_to_8i16:
2117; AVX1:       # %bb.0: # %entry
2118; AVX1-NEXT:    vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
2119; AVX1-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
2120; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
2121; AVX1-NEXT:    vmovdqa {{.*#+}} xmm1 = [1,2,4,8,16,32,64,128]
2122; AVX1-NEXT:    vpand %xmm1, %xmm0, %xmm0
2123; AVX1-NEXT:    vpcmpeqw %xmm1, %xmm0, %xmm0
2124; AVX1-NEXT:    retq
2125;
2126; AVX2-LABEL: load_sext_8i1_to_8i16:
2127; AVX2:       # %bb.0: # %entry
2128; AVX2-NEXT:    vpbroadcastb (%rdi), %xmm0
2129; AVX2-NEXT:    vmovdqa {{.*#+}} xmm1 = [1,2,4,8,16,32,64,128]
2130; AVX2-NEXT:    vpand %xmm1, %xmm0, %xmm0
2131; AVX2-NEXT:    vpcmpeqw %xmm1, %xmm0, %xmm0
2132; AVX2-NEXT:    retq
2133;
2134; AVX512F-LABEL: load_sext_8i1_to_8i16:
2135; AVX512F:       # %bb.0: # %entry
2136; AVX512F-NEXT:    kmovw (%rdi), %k1
2137; AVX512F-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
2138; AVX512F-NEXT:    vpmovdw %zmm0, %ymm0
2139; AVX512F-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
2140; AVX512F-NEXT:    vzeroupper
2141; AVX512F-NEXT:    retq
2142;
2143; AVX512BW-LABEL: load_sext_8i1_to_8i16:
2144; AVX512BW:       # %bb.0: # %entry
2145; AVX512BW-NEXT:    kmovw (%rdi), %k0
2146; AVX512BW-NEXT:    vpmovm2w %k0, %zmm0
2147; AVX512BW-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
2148; AVX512BW-NEXT:    vzeroupper
2149; AVX512BW-NEXT:    retq
2150;
2151; X86-SSE-LABEL: load_sext_8i1_to_8i16:
2152; X86-SSE:       # %bb.0: # %entry
2153; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %eax
2154; X86-SSE-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
2155; X86-SSE-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
2156; X86-SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
2157; X86-SSE-NEXT:    movdqa {{.*#+}} xmm1 = [1,2,4,8,16,32,64,128]
2158; X86-SSE-NEXT:    pand %xmm1, %xmm0
2159; X86-SSE-NEXT:    pcmpeqw %xmm1, %xmm0
2160; X86-SSE-NEXT:    retl
2161entry:
2162 %X = load <8 x i1>, <8 x i1>* %ptr
2163 %Y = sext <8 x i1> %X to <8 x i16>
2164 ret <8 x i16> %Y
2165}
2166
; Sign-extending load: <8 x i8> read from %ptr, each element widened to i16.
; The SSE4.1 and AVX runs expect a single pmovsxbw / vpmovsxbw with a memory
; operand. The SSE2 and SSSE3 runs expect a 64-bit movq load, punpcklbw to
; duplicate every byte into both halves of its 16-bit lane, and psraw $8 to
; produce the sign-extended value.
2167define <8 x i16> @load_sext_8i8_to_8i16(<8 x i8> *%ptr) {
2168; SSE2-LABEL: load_sext_8i8_to_8i16:
2169; SSE2:       # %bb.0: # %entry
2170; SSE2-NEXT:    movq {{.*#+}} xmm0 = mem[0],zero
2171; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
2172; SSE2-NEXT:    psraw $8, %xmm0
2173; SSE2-NEXT:    retq
2174;
2175; SSSE3-LABEL: load_sext_8i8_to_8i16:
2176; SSSE3:       # %bb.0: # %entry
2177; SSSE3-NEXT:    movq {{.*#+}} xmm0 = mem[0],zero
2178; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
2179; SSSE3-NEXT:    psraw $8, %xmm0
2180; SSSE3-NEXT:    retq
2181;
2182; SSE41-LABEL: load_sext_8i8_to_8i16:
2183; SSE41:       # %bb.0: # %entry
2184; SSE41-NEXT:    pmovsxbw (%rdi), %xmm0
2185; SSE41-NEXT:    retq
2186;
2187; AVX-LABEL: load_sext_8i8_to_8i16:
2188; AVX:       # %bb.0: # %entry
2189; AVX-NEXT:    vpmovsxbw (%rdi), %xmm0
2190; AVX-NEXT:    retq
2191;
2192; X86-SSE2-LABEL: load_sext_8i8_to_8i16:
2193; X86-SSE2:       # %bb.0: # %entry
2194; X86-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %eax
2195; X86-SSE2-NEXT:    movq {{.*#+}} xmm0 = mem[0],zero
2196; X86-SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
2197; X86-SSE2-NEXT:    psraw $8, %xmm0
2198; X86-SSE2-NEXT:    retl
2199;
2200; X86-SSE41-LABEL: load_sext_8i8_to_8i16:
2201; X86-SSE41:       # %bb.0: # %entry
2202; X86-SSE41-NEXT:    movl {{[0-9]+}}(%esp), %eax
2203; X86-SSE41-NEXT:    pmovsxbw (%eax), %xmm0
2204; X86-SSE41-NEXT:    retl
2205entry:
2206 %X = load <8 x i8>, <8 x i8>* %ptr
2207 %Y = sext <8 x i8> %X to <8 x i16>
2208 ret <8 x i16> %Y
2209}
2210
; Sign-extend a loaded <8 x i8> to <8 x i64>.
; Expected lowering per the assertions below: one zmm vpmovsxbq with AVX512,
; two ymm vpmovsxbq with AVX2, four xmm (v)pmovsxbq at byte offsets 0/2/4/6
; with SSE4.1 and AVX1, and an unpack + psrad + pcmpgtd sequence on SSE2 and
; SSSE3 (64-bit and 32-bit runs alike).
2211define <8 x i64> @load_sext_8i8_to_8i64(<8 x i8> *%ptr) {
2212; SSE2-LABEL: load_sext_8i8_to_8i64:
2213; SSE2:       # %bb.0: # %entry
2214; SSE2-NEXT:    movq {{.*#+}} xmm0 = mem[0],zero
2215; SSE2-NEXT:    punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
2216; SSE2-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
2217; SSE2-NEXT:    psrad $24, %xmm1
2218; SSE2-NEXT:    pxor %xmm4, %xmm4
2219; SSE2-NEXT:    pxor %xmm3, %xmm3
2220; SSE2-NEXT:    pcmpgtd %xmm1, %xmm3
2221; SSE2-NEXT:    movdqa %xmm1, %xmm0
2222; SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1]
2223; SSE2-NEXT:    punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm3[2],xmm1[3],xmm3[3]
2224; SSE2-NEXT:    punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7]
2225; SSE2-NEXT:    psrad $24, %xmm3
2226; SSE2-NEXT:    pcmpgtd %xmm3, %xmm4
2227; SSE2-NEXT:    movdqa %xmm3, %xmm2
2228; SSE2-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1]
2229; SSE2-NEXT:    punpckhdq {{.*#+}} xmm3 = xmm3[2],xmm4[2],xmm3[3],xmm4[3]
2230; SSE2-NEXT:    retq
2231;
2232; SSSE3-LABEL: load_sext_8i8_to_8i64:
2233; SSSE3:       # %bb.0: # %entry
2234; SSSE3-NEXT:    movq {{.*#+}} xmm0 = mem[0],zero
2235; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
2236; SSSE3-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
2237; SSSE3-NEXT:    psrad $24, %xmm1
2238; SSSE3-NEXT:    pxor %xmm4, %xmm4
2239; SSSE3-NEXT:    pxor %xmm3, %xmm3
2240; SSSE3-NEXT:    pcmpgtd %xmm1, %xmm3
2241; SSSE3-NEXT:    movdqa %xmm1, %xmm0
2242; SSSE3-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1]
2243; SSSE3-NEXT:    punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm3[2],xmm1[3],xmm3[3]
2244; SSSE3-NEXT:    punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7]
2245; SSSE3-NEXT:    psrad $24, %xmm3
2246; SSSE3-NEXT:    pcmpgtd %xmm3, %xmm4
2247; SSSE3-NEXT:    movdqa %xmm3, %xmm2
2248; SSSE3-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1]
2249; SSSE3-NEXT:    punpckhdq {{.*#+}} xmm3 = xmm3[2],xmm4[2],xmm3[3],xmm4[3]
2250; SSSE3-NEXT:    retq
2251;
2252; SSE41-LABEL: load_sext_8i8_to_8i64:
2253; SSE41:       # %bb.0: # %entry
2254; SSE41-NEXT:    pmovsxbq (%rdi), %xmm0
2255; SSE41-NEXT:    pmovsxbq 2(%rdi), %xmm1
2256; SSE41-NEXT:    pmovsxbq 4(%rdi), %xmm2
2257; SSE41-NEXT:    pmovsxbq 6(%rdi), %xmm3
2258; SSE41-NEXT:    retq
2259;
2260; AVX1-LABEL: load_sext_8i8_to_8i64:
2261; AVX1:       # %bb.0: # %entry
2262; AVX1-NEXT:    vpmovsxbq 6(%rdi), %xmm1
2263; AVX1-NEXT:    vpmovsxbq 4(%rdi), %xmm2
2264; AVX1-NEXT:    vpmovsxbq 2(%rdi), %xmm0
2265; AVX1-NEXT:    vpmovsxbq (%rdi), %xmm3
2266; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm3, %ymm0
2267; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm2, %ymm1
2268; AVX1-NEXT:    retq
2269;
2270; AVX2-LABEL: load_sext_8i8_to_8i64:
2271; AVX2:       # %bb.0: # %entry
2272; AVX2-NEXT:    vpmovsxbq (%rdi), %ymm0
2273; AVX2-NEXT:    vpmovsxbq 4(%rdi), %ymm1
2274; AVX2-NEXT:    retq
2275;
2276; AVX512-LABEL: load_sext_8i8_to_8i64:
2277; AVX512:       # %bb.0: # %entry
2278; AVX512-NEXT:    vpmovsxbq (%rdi), %zmm0
2279; AVX512-NEXT:    retq
2280;
2281; X86-SSE2-LABEL: load_sext_8i8_to_8i64:
2282; X86-SSE2:       # %bb.0: # %entry
2283; X86-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %eax
2284; X86-SSE2-NEXT:    movq {{.*#+}} xmm0 = mem[0],zero
2285; X86-SSE2-NEXT:    punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
2286; X86-SSE2-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
2287; X86-SSE2-NEXT:    psrad $24, %xmm1
2288; X86-SSE2-NEXT:    pxor %xmm4, %xmm4
2289; X86-SSE2-NEXT:    pxor %xmm3, %xmm3
2290; X86-SSE2-NEXT:    pcmpgtd %xmm1, %xmm3
2291; X86-SSE2-NEXT:    movdqa %xmm1, %xmm0
2292; X86-SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1]
2293; X86-SSE2-NEXT:    punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm3[2],xmm1[3],xmm3[3]
2294; X86-SSE2-NEXT:    punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7]
2295; X86-SSE2-NEXT:    psrad $24, %xmm3
2296; X86-SSE2-NEXT:    pcmpgtd %xmm3, %xmm4
2297; X86-SSE2-NEXT:    movdqa %xmm3, %xmm2
2298; X86-SSE2-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1]
2299; X86-SSE2-NEXT:    punpckhdq {{.*#+}} xmm3 = xmm3[2],xmm4[2],xmm3[3],xmm4[3]
2300; X86-SSE2-NEXT:    retl
2301;
2302; X86-SSE41-LABEL: load_sext_8i8_to_8i64:
2303; X86-SSE41:       # %bb.0: # %entry
2304; X86-SSE41-NEXT:    movl {{[0-9]+}}(%esp), %eax
2305; X86-SSE41-NEXT:    pmovsxbq (%eax), %xmm0
2306; X86-SSE41-NEXT:    pmovsxbq 2(%eax), %xmm1
2307; X86-SSE41-NEXT:    pmovsxbq 4(%eax), %xmm2
2308; X86-SSE41-NEXT:    pmovsxbq 6(%eax), %xmm3
2309; X86-SSE41-NEXT:    retl
2310entry:
2311 %X = load <8 x i8>, <8 x i8>* %ptr
2312 %Y = sext <8 x i8> %X to <8 x i64>
2313 ret <8 x i64> %Y
2314}
2315
; Sign-extend a loaded <8 x i1> mask to <8 x i32>; sext of i1 yields 0 or -1
; per lane.
; Without AVX512 the assertions expect the loaded bits to be broadcast and
; tested against per-lane bit constants ([1,2,4,8] and [16,32,64,128]); with
; AVX512 the mask is loaded into a k-register with kmovw and expanded by a
; zero-masked vpternlogd.
2316define <8 x i32> @load_sext_8i1_to_8i32(<8 x i1> *%ptr) {
2317; SSE-LABEL: load_sext_8i1_to_8i32:
2318; SSE:       # %bb.0: # %entry
2319; SSE-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
2320; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[0,0,0,0]
2321; SSE-NEXT:    movdqa {{.*#+}} xmm2 = [1,2,4,8]
2322; SSE-NEXT:    movdqa %xmm1, %xmm0
2323; SSE-NEXT:    pand %xmm2, %xmm0
2324; SSE-NEXT:    pcmpeqd %xmm2, %xmm0
2325; SSE-NEXT:    movdqa {{.*#+}} xmm2 = [16,32,64,128]
2326; SSE-NEXT:    pand %xmm2, %xmm1
2327; SSE-NEXT:    pcmpeqd %xmm2, %xmm1
2328; SSE-NEXT:    retq
2329;
2330; AVX1-LABEL: load_sext_8i1_to_8i32:
2331; AVX1:       # %bb.0: # %entry
2332; AVX1-NEXT:    vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
2333; AVX1-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[0,0,0,0]
2334; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
2335; AVX1-NEXT:    vandps {{.*}}(%rip), %ymm0, %ymm0
2336; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
2337; AVX1-NEXT:    vpxor %xmm2, %xmm2, %xmm2
2338; AVX1-NEXT:    vpcmpeqd %xmm2, %xmm1, %xmm1
2339; AVX1-NEXT:    vpcmpeqd %xmm3, %xmm3, %xmm3
2340; AVX1-NEXT:    vpxor %xmm3, %xmm1, %xmm1
2341; AVX1-NEXT:    vpcmpeqd %xmm2, %xmm0, %xmm0
2342; AVX1-NEXT:    vpxor %xmm3, %xmm0, %xmm0
2343; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
2344; AVX1-NEXT:    retq
2345;
2346; AVX2-LABEL: load_sext_8i1_to_8i32:
2347; AVX2:       # %bb.0: # %entry
2348; AVX2-NEXT:    vpbroadcastb (%rdi), %ymm0
2349; AVX2-NEXT:    vmovdqa {{.*#+}} ymm1 = [1,2,4,8,16,32,64,128]
2350; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm0
2351; AVX2-NEXT:    vpcmpeqd %ymm1, %ymm0, %ymm0
2352; AVX2-NEXT:    retq
2353;
2354; AVX512-LABEL: load_sext_8i1_to_8i32:
2355; AVX512:       # %bb.0: # %entry
2356; AVX512-NEXT:    kmovw (%rdi), %k1
2357; AVX512-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
2358; AVX512-NEXT:    # kill: def $ymm0 killed $ymm0 killed $zmm0
2359; AVX512-NEXT:    retq
2360;
2361; X86-SSE-LABEL: load_sext_8i1_to_8i32:
2362; X86-SSE:       # %bb.0: # %entry
2363; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %eax
2364; X86-SSE-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
2365; X86-SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[0,0,0,0]
2366; X86-SSE-NEXT:    movdqa {{.*#+}} xmm2 = [1,2,4,8]
2367; X86-SSE-NEXT:    movdqa %xmm1, %xmm0
2368; X86-SSE-NEXT:    pand %xmm2, %xmm0
2369; X86-SSE-NEXT:    pcmpeqd %xmm2, %xmm0
2370; X86-SSE-NEXT:    movdqa {{.*#+}} xmm2 = [16,32,64,128]
2371; X86-SSE-NEXT:    pand %xmm2, %xmm1
2372; X86-SSE-NEXT:    pcmpeqd %xmm2, %xmm1
2373; X86-SSE-NEXT:    retl
2374entry:
2375 %X = load <8 x i1>, <8 x i1>* %ptr
2376 %Y = sext <8 x i1> %X to <8 x i32>
2377 ret <8 x i32> %Y
2378}
2379
; Sign-extend a loaded <8 x i8> to <8 x i32>.
; The assertions expect (v)pmovsxbd with a folded memory operand from SSE4.1
; upward (two xmm halves on SSE4.1 and AVX1, a single ymm form on AVX2 and
; AVX512) and an unpack + "psrad $24" sequence on SSE2 and SSSE3.
2380define <8 x i32> @load_sext_8i8_to_8i32(<8 x i8> *%ptr) {
2381; SSE2-LABEL: load_sext_8i8_to_8i32:
2382; SSE2:       # %bb.0: # %entry
2383; SSE2-NEXT:    movq {{.*#+}} xmm0 = mem[0],zero
2384; SSE2-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
2385; SSE2-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
2386; SSE2-NEXT:    psrad $24, %xmm0
2387; SSE2-NEXT:    punpckhwd {{.*#+}} xmm1 = xmm1[4,4,5,5,6,6,7,7]
2388; SSE2-NEXT:    psrad $24, %xmm1
2389; SSE2-NEXT:    retq
2390;
2391; SSSE3-LABEL: load_sext_8i8_to_8i32:
2392; SSSE3:       # %bb.0: # %entry
2393; SSSE3-NEXT:    movq {{.*#+}} xmm0 = mem[0],zero
2394; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
2395; SSSE3-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
2396; SSSE3-NEXT:    psrad $24, %xmm0
2397; SSSE3-NEXT:    punpckhwd {{.*#+}} xmm1 = xmm1[4,4,5,5,6,6,7,7]
2398; SSSE3-NEXT:    psrad $24, %xmm1
2399; SSSE3-NEXT:    retq
2400;
2401; SSE41-LABEL: load_sext_8i8_to_8i32:
2402; SSE41:       # %bb.0: # %entry
2403; SSE41-NEXT:    pmovsxbd (%rdi), %xmm0
2404; SSE41-NEXT:    pmovsxbd 4(%rdi), %xmm1
2405; SSE41-NEXT:    retq
2406;
2407; AVX1-LABEL: load_sext_8i8_to_8i32:
2408; AVX1:       # %bb.0: # %entry
2409; AVX1-NEXT:    vpmovsxbd 4(%rdi), %xmm0
2410; AVX1-NEXT:    vpmovsxbd (%rdi), %xmm1
2411; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
2412; AVX1-NEXT:    retq
2413;
2414; AVX2-LABEL: load_sext_8i8_to_8i32:
2415; AVX2:       # %bb.0: # %entry
2416; AVX2-NEXT:    vpmovsxbd (%rdi), %ymm0
2417; AVX2-NEXT:    retq
2418;
2419; AVX512-LABEL: load_sext_8i8_to_8i32:
2420; AVX512:       # %bb.0: # %entry
2421; AVX512-NEXT:    vpmovsxbd (%rdi), %ymm0
2422; AVX512-NEXT:    retq
2423;
2424; X86-SSE2-LABEL: load_sext_8i8_to_8i32:
2425; X86-SSE2:       # %bb.0: # %entry
2426; X86-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %eax
2427; X86-SSE2-NEXT:    movq {{.*#+}} xmm0 = mem[0],zero
2428; X86-SSE2-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
2429; X86-SSE2-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
2430; X86-SSE2-NEXT:    psrad $24, %xmm0
2431; X86-SSE2-NEXT:    punpckhwd {{.*#+}} xmm1 = xmm1[4,4,5,5,6,6,7,7]
2432; X86-SSE2-NEXT:    psrad $24, %xmm1
2433; X86-SSE2-NEXT:    retl
2434;
2435; X86-SSE41-LABEL: load_sext_8i8_to_8i32:
2436; X86-SSE41:       # %bb.0: # %entry
2437; X86-SSE41-NEXT:    movl {{[0-9]+}}(%esp), %eax
2438; X86-SSE41-NEXT:    pmovsxbd (%eax), %xmm0
2439; X86-SSE41-NEXT:    pmovsxbd 4(%eax), %xmm1
2440; X86-SSE41-NEXT:    retl
2441entry:
2442 %X = load <8 x i8>, <8 x i8>* %ptr
2443 %Y = sext <8 x i8> %X to <8 x i32>
2444 ret <8 x i32> %Y
2445}
2446
; Sign-extend a loaded <16 x i1> mask to <16 x i8>; sext of i1 yields 0 or -1
; per lane.
; The body loads through %ptr, so the function is declared readonly rather
; than readnone: a readnone function that reads caller-visible memory has
; undefined behaviour.
; Expected lowering: without AVX512 the two mask bytes are splatted across the
; vector and tested with pand/pcmpeqb against a [1,2,4,...,128] bit table;
; with AVX512F the mask goes through kmovw + vpternlogd + vpmovdb, and with
; AVX512BW through kmovw + vpmovm2b.
2447define <16 x i8> @load_sext_16i1_to_16i8(<16 x i1> *%ptr) nounwind readonly {
2448; SSE2-LABEL: load_sext_16i1_to_16i8:
2449; SSE2:       # %bb.0: # %entry
2450; SSE2-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
2451; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
2452; SSE2-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[0,0,1,1,4,5,6,7]
2453; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,0,1,1]
2454; SSE2-NEXT:    movdqa {{.*#+}} xmm1 = [1,2,4,8,16,32,64,128,1,2,4,8,16,32,64,128]
2455; SSE2-NEXT:    pand %xmm1, %xmm0
2456; SSE2-NEXT:    pcmpeqb %xmm1, %xmm0
2457; SSE2-NEXT:    retq
2458;
2459; SSSE3-LABEL: load_sext_16i1_to_16i8:
2460; SSSE3:       # %bb.0: # %entry
2461; SSSE3-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
2462; SSSE3-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,0,0,0,0,1,1,1,1,1,1,1,1]
2463; SSSE3-NEXT:    movdqa {{.*#+}} xmm1 = [1,2,4,8,16,32,64,128,1,2,4,8,16,32,64,128]
2464; SSSE3-NEXT:    pand %xmm1, %xmm0
2465; SSSE3-NEXT:    pcmpeqb %xmm1, %xmm0
2466; SSSE3-NEXT:    retq
2467;
2468; SSE41-LABEL: load_sext_16i1_to_16i8:
2469; SSE41:       # %bb.0: # %entry
2470; SSE41-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
2471; SSE41-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,0,0,0,0,1,1,1,1,1,1,1,1]
2472; SSE41-NEXT:    movdqa {{.*#+}} xmm1 = [1,2,4,8,16,32,64,128,1,2,4,8,16,32,64,128]
2473; SSE41-NEXT:    pand %xmm1, %xmm0
2474; SSE41-NEXT:    pcmpeqb %xmm1, %xmm0
2475; SSE41-NEXT:    retq
2476;
2477; AVX1-LABEL: load_sext_16i1_to_16i8:
2478; AVX1:       # %bb.0: # %entry
2479; AVX1-NEXT:    vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
2480; AVX1-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,0,0,0,0,1,1,1,1,1,1,1,1]
2481; AVX1-NEXT:    vmovddup {{.*#+}} xmm1 = [9241421688590303745,9241421688590303745]
2482; AVX1-NEXT:    # xmm1 = mem[0,0]
2483; AVX1-NEXT:    vpand %xmm1, %xmm0, %xmm0
2484; AVX1-NEXT:    vpcmpeqb %xmm1, %xmm0, %xmm0
2485; AVX1-NEXT:    retq
2486;
2487; AVX2-LABEL: load_sext_16i1_to_16i8:
2488; AVX2:       # %bb.0: # %entry
2489; AVX2-NEXT:    vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
2490; AVX2-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,0,0,0,0,1,1,1,1,1,1,1,1]
2491; AVX2-NEXT:    vpbroadcastq {{.*#+}} xmm1 = [9241421688590303745,9241421688590303745]
2492; AVX2-NEXT:    vpand %xmm1, %xmm0, %xmm0
2493; AVX2-NEXT:    vpcmpeqb %xmm1, %xmm0, %xmm0
2494; AVX2-NEXT:    retq
2495;
2496; AVX512F-LABEL: load_sext_16i1_to_16i8:
2497; AVX512F:       # %bb.0: # %entry
2498; AVX512F-NEXT:    kmovw (%rdi), %k1
2499; AVX512F-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
2500; AVX512F-NEXT:    vpmovdb %zmm0, %xmm0
2501; AVX512F-NEXT:    vzeroupper
2502; AVX512F-NEXT:    retq
2503;
2504; AVX512BW-LABEL: load_sext_16i1_to_16i8:
2505; AVX512BW:       # %bb.0: # %entry
2506; AVX512BW-NEXT:    kmovw (%rdi), %k0
2507; AVX512BW-NEXT:    vpmovm2b %k0, %zmm0
2508; AVX512BW-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
2509; AVX512BW-NEXT:    vzeroupper
2510; AVX512BW-NEXT:    retq
2511;
2512; X86-SSE2-LABEL: load_sext_16i1_to_16i8:
2513; X86-SSE2:       # %bb.0: # %entry
2514; X86-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %eax
2515; X86-SSE2-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
2516; X86-SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
2517; X86-SSE2-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[0,0,1,1,4,5,6,7]
2518; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,0,1,1]
2519; X86-SSE2-NEXT:    movdqa {{.*#+}} xmm1 = [1,2,4,8,16,32,64,128,1,2,4,8,16,32,64,128]
2520; X86-SSE2-NEXT:    pand %xmm1, %xmm0
2521; X86-SSE2-NEXT:    pcmpeqb %xmm1, %xmm0
2522; X86-SSE2-NEXT:    retl
2523;
2524; X86-SSE41-LABEL: load_sext_16i1_to_16i8:
2525; X86-SSE41:       # %bb.0: # %entry
2526; X86-SSE41-NEXT:    movl {{[0-9]+}}(%esp), %eax
2527; X86-SSE41-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
2528; X86-SSE41-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,0,0,0,0,1,1,1,1,1,1,1,1]
2529; X86-SSE41-NEXT:    movdqa {{.*#+}} xmm1 = [1,2,4,8,16,32,64,128,1,2,4,8,16,32,64,128]
2530; X86-SSE41-NEXT:    pand %xmm1, %xmm0
2531; X86-SSE41-NEXT:    pcmpeqb %xmm1, %xmm0
2532; X86-SSE41-NEXT:    retl
2533entry:
2534 %X = load <16 x i1>, <16 x i1>* %ptr
2535 %Y = sext <16 x i1> %X to <16 x i8>
2536 ret <16 x i8> %Y
2537}
2538
; Sign-extend a loaded <16 x i1> mask to <16 x i16>; sext of i1 yields 0 or -1
; per lane.
; Without AVX512 the assertions expect the 16 mask bits to be broadcast to
; every word and tested against the per-lane bit constants [1,2,...,128] and
; [256,512,...,32768]; with AVX512F the mask goes through kmovw + vpternlogd +
; vpmovdw, and with AVX512BW through kmovw + vpmovm2w.
2539define <16 x i16> @load_sext_16i1_to_16i16(<16 x i1> *%ptr) {
2540; SSE-LABEL: load_sext_16i1_to_16i16:
2541; SSE:       # %bb.0: # %entry
2542; SSE-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
2543; SSE-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
2544; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[0,0,0,0]
2545; SSE-NEXT:    movdqa {{.*#+}} xmm2 = [1,2,4,8,16,32,64,128]
2546; SSE-NEXT:    movdqa %xmm1, %xmm0
2547; SSE-NEXT:    pand %xmm2, %xmm0
2548; SSE-NEXT:    pcmpeqw %xmm2, %xmm0
2549; SSE-NEXT:    movdqa {{.*#+}} xmm2 = [256,512,1024,2048,4096,8192,16384,32768]
2550; SSE-NEXT:    pand %xmm2, %xmm1
2551; SSE-NEXT:    pcmpeqw %xmm2, %xmm1
2552; SSE-NEXT:    retq
2553;
2554; AVX1-LABEL: load_sext_16i1_to_16i16:
2555; AVX1:       # %bb.0: # %entry
2556; AVX1-NEXT:    vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
2557; AVX1-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
2558; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
2559; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
2560; AVX1-NEXT:    vandps {{.*}}(%rip), %ymm0, %ymm0
2561; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
2562; AVX1-NEXT:    vpxor %xmm2, %xmm2, %xmm2
2563; AVX1-NEXT:    vpcmpeqw %xmm2, %xmm1, %xmm1
2564; AVX1-NEXT:    vpcmpeqd %xmm3, %xmm3, %xmm3
2565; AVX1-NEXT:    vpxor %xmm3, %xmm1, %xmm1
2566; AVX1-NEXT:    vpcmpeqw %xmm2, %xmm0, %xmm0
2567; AVX1-NEXT:    vpxor %xmm3, %xmm0, %xmm0
2568; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
2569; AVX1-NEXT:    retq
2570;
2571; AVX2-LABEL: load_sext_16i1_to_16i16:
2572; AVX2:       # %bb.0: # %entry
2573; AVX2-NEXT:    vpbroadcastw (%rdi), %ymm0
2574; AVX2-NEXT:    vmovdqa {{.*#+}} ymm1 = [1,2,4,8,16,32,64,128,256,512,1024,2048,4096,8192,16384,32768]
2575; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm0
2576; AVX2-NEXT:    vpcmpeqw %ymm1, %ymm0, %ymm0
2577; AVX2-NEXT:    retq
2578;
2579; AVX512F-LABEL: load_sext_16i1_to_16i16:
2580; AVX512F:       # %bb.0: # %entry
2581; AVX512F-NEXT:    kmovw (%rdi), %k1
2582; AVX512F-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
2583; AVX512F-NEXT:    vpmovdw %zmm0, %ymm0
2584; AVX512F-NEXT:    retq
2585;
2586; AVX512BW-LABEL: load_sext_16i1_to_16i16:
2587; AVX512BW:       # %bb.0: # %entry
2588; AVX512BW-NEXT:    kmovw (%rdi), %k0
2589; AVX512BW-NEXT:    vpmovm2w %k0, %zmm0
2590; AVX512BW-NEXT:    # kill: def $ymm0 killed $ymm0 killed $zmm0
2591; AVX512BW-NEXT:    retq
2592;
2593; X86-SSE-LABEL: load_sext_16i1_to_16i16:
2594; X86-SSE:       # %bb.0: # %entry
2595; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %eax
2596; X86-SSE-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
2597; X86-SSE-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
2598; X86-SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[0,0,0,0]
2599; X86-SSE-NEXT:    movdqa {{.*#+}} xmm2 = [1,2,4,8,16,32,64,128]
2600; X86-SSE-NEXT:    movdqa %xmm1, %xmm0
2601; X86-SSE-NEXT:    pand %xmm2, %xmm0
2602; X86-SSE-NEXT:    pcmpeqw %xmm2, %xmm0
2603; X86-SSE-NEXT:    movdqa {{.*#+}} xmm2 = [256,512,1024,2048,4096,8192,16384,32768]
2604; X86-SSE-NEXT:    pand %xmm2, %xmm1
2605; X86-SSE-NEXT:    pcmpeqw %xmm2, %xmm1
2606; X86-SSE-NEXT:    retl
2607entry:
2608 %X = load <16 x i1>, <16 x i1>* %ptr
2609 %Y = sext <16 x i1> %X to <16 x i16>
2610 ret <16 x i16> %Y
2611}
2612
; Sign-extend a loaded <32 x i1> mask to <32 x i8>; sext of i1 yields 0 or -1
; per lane.
; The body loads through %ptr, so the function is declared readonly rather
; than readnone: a readnone function that reads caller-visible memory has
; undefined behaviour.
; Expected lowering: without AVX512 the four mask bytes are splatted across the
; result and tested against a [1,2,4,...,128] bit pattern; AVX512F handles the
; mask as two 16-bit halves (kmovw at offsets 0 and 2, each expanded with
; vpternlogd + vpmovdb, then joined by vinserti128); AVX512BW uses a single
; kmovd + vpmovm2b.
2613define <32 x i8> @load_sext_32i1_to_32i8(<32 x i1> *%ptr) nounwind readonly {
2614; SSE-LABEL: load_sext_32i1_to_32i8:
2615; SSE:       # %bb.0: # %entry
2616; SSE-NEXT:    movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
2617; SSE-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
2618; SSE-NEXT:    pshuflw {{.*#+}} xmm0 = xmm1[0,0,1,1,4,5,6,7]
2619; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,0,1,1]
2620; SSE-NEXT:    movdqa {{.*#+}} xmm2 = [1,2,4,8,16,32,64,128,1,2,4,8,16,32,64,128]
2621; SSE-NEXT:    pand %xmm2, %xmm0
2622; SSE-NEXT:    pcmpeqb %xmm2, %xmm0
2623; SSE-NEXT:    pshuflw {{.*#+}} xmm1 = xmm1[2,2,3,3,4,5,6,7]
2624; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,0,1,1]
2625; SSE-NEXT:    pand %xmm2, %xmm1
2626; SSE-NEXT:    pcmpeqb %xmm2, %xmm1
2627; SSE-NEXT:    retq
2628;
2629; AVX1-LABEL: load_sext_32i1_to_32i8:
2630; AVX1:       # %bb.0: # %entry
2631; AVX1-NEXT:    vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
2632; AVX1-NEXT:    vpunpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
2633; AVX1-NEXT:    vpshuflw {{.*#+}} xmm1 = xmm0[0,0,1,1,4,5,6,7]
2634; AVX1-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[2,2,3,3,4,5,6,7]
2635; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
2636; AVX1-NEXT:    vpermilps {{.*#+}} ymm0 = ymm0[0,0,1,1,4,4,5,5]
2637; AVX1-NEXT:    vandps {{.*}}(%rip), %ymm0, %ymm0
2638; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
2639; AVX1-NEXT:    vpxor %xmm2, %xmm2, %xmm2
2640; AVX1-NEXT:    vpcmpeqb %xmm2, %xmm1, %xmm1
2641; AVX1-NEXT:    vpcmpeqd %xmm3, %xmm3, %xmm3
2642; AVX1-NEXT:    vpxor %xmm3, %xmm1, %xmm1
2643; AVX1-NEXT:    vpcmpeqb %xmm2, %xmm0, %xmm0
2644; AVX1-NEXT:    vpxor %xmm3, %xmm0, %xmm0
2645; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
2646; AVX1-NEXT:    retq
2647;
2648; AVX2-LABEL: load_sext_32i1_to_32i8:
2649; AVX2:       # %bb.0: # %entry
2650; AVX2-NEXT:    vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
2651; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1]
2652; AVX2-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,0,0,0,0,0,0,0,1,1,1,1,1,1,1,1,18,18,18,18,18,18,18,18,19,19,19,19,19,19,19,19]
2653; AVX2-NEXT:    vpbroadcastq {{.*#+}} ymm1 = [9241421688590303745,9241421688590303745,9241421688590303745,9241421688590303745]
2654; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm0
2655; AVX2-NEXT:    vpcmpeqb %ymm1, %ymm0, %ymm0
2656; AVX2-NEXT:    retq
2657;
2658; AVX512F-LABEL: load_sext_32i1_to_32i8:
2659; AVX512F:       # %bb.0: # %entry
2660; AVX512F-NEXT:    kmovw (%rdi), %k1
2661; AVX512F-NEXT:    kmovw 2(%rdi), %k2
2662; AVX512F-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
2663; AVX512F-NEXT:    vpmovdb %zmm0, %xmm0
2664; AVX512F-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k2} {z}
2665; AVX512F-NEXT:    vpmovdb %zmm1, %xmm1
2666; AVX512F-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
2667; AVX512F-NEXT:    retq
2668;
2669; AVX512BW-LABEL: load_sext_32i1_to_32i8:
2670; AVX512BW:       # %bb.0: # %entry
2671; AVX512BW-NEXT:    kmovd (%rdi), %k0
2672; AVX512BW-NEXT:    vpmovm2b %k0, %zmm0
2673; AVX512BW-NEXT:    # kill: def $ymm0 killed $ymm0 killed $zmm0
2674; AVX512BW-NEXT:    retq
2675;
2676; X86-SSE-LABEL: load_sext_32i1_to_32i8:
2677; X86-SSE:       # %bb.0: # %entry
2678; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %eax
2679; X86-SSE-NEXT:    movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
2680; X86-SSE-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
2681; X86-SSE-NEXT:    pshuflw {{.*#+}} xmm0 = xmm1[0,0,1,1,4,5,6,7]
2682; X86-SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,0,1,1]
2683; X86-SSE-NEXT:    movdqa {{.*#+}} xmm2 = [1,2,4,8,16,32,64,128,1,2,4,8,16,32,64,128]
2684; X86-SSE-NEXT:    pand %xmm2, %xmm0
2685; X86-SSE-NEXT:    pcmpeqb %xmm2, %xmm0
2686; X86-SSE-NEXT:    pshuflw {{.*#+}} xmm1 = xmm1[2,2,3,3,4,5,6,7]
2687; X86-SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,0,1,1]
2688; X86-SSE-NEXT:    pand %xmm2, %xmm1
2689; X86-SSE-NEXT:    pcmpeqb %xmm2, %xmm1
2690; X86-SSE-NEXT:    retl
2691entry:
2692 %X = load <32 x i1>, <32 x i1>* %ptr
2693 %Y = sext <32 x i1> %X to <32 x i8>
2694 ret <32 x i8> %Y
2695}
2696
; Sign-extend a loaded <16 x i8> to <16 x i16>.
; The assertions expect (v)pmovsxbw with a folded memory operand from SSE4.1
; upward (two xmm halves at offsets 0 and 8 on SSE4.1 and AVX1, one ymm form on
; AVX2 and AVX512) and a movdqa load followed by unpack + "psraw $8" on SSE2
; and SSSE3.
2697define <16 x i16> @load_sext_16i8_to_16i16(<16 x i8> *%ptr) {
2698; SSE2-LABEL: load_sext_16i8_to_16i16:
2699; SSE2:       # %bb.0: # %entry
2700; SSE2-NEXT:    movdqa (%rdi), %xmm1
2701; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
2702; SSE2-NEXT:    psraw $8, %xmm0
2703; SSE2-NEXT:    punpckhbw {{.*#+}} xmm1 = xmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
2704; SSE2-NEXT:    psraw $8, %xmm1
2705; SSE2-NEXT:    retq
2706;
2707; SSSE3-LABEL: load_sext_16i8_to_16i16:
2708; SSSE3:       # %bb.0: # %entry
2709; SSSE3-NEXT:    movdqa (%rdi), %xmm1
2710; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
2711; SSSE3-NEXT:    psraw $8, %xmm0
2712; SSSE3-NEXT:    punpckhbw {{.*#+}} xmm1 = xmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
2713; SSSE3-NEXT:    psraw $8, %xmm1
2714; SSSE3-NEXT:    retq
2715;
2716; SSE41-LABEL: load_sext_16i8_to_16i16:
2717; SSE41:       # %bb.0: # %entry
2718; SSE41-NEXT:    pmovsxbw (%rdi), %xmm0
2719; SSE41-NEXT:    pmovsxbw 8(%rdi), %xmm1
2720; SSE41-NEXT:    retq
2721;
2722; AVX1-LABEL: load_sext_16i8_to_16i16:
2723; AVX1:       # %bb.0: # %entry
2724; AVX1-NEXT:    vpmovsxbw 8(%rdi), %xmm0
2725; AVX1-NEXT:    vpmovsxbw (%rdi), %xmm1
2726; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
2727; AVX1-NEXT:    retq
2728;
2729; AVX2-LABEL: load_sext_16i8_to_16i16:
2730; AVX2:       # %bb.0: # %entry
2731; AVX2-NEXT:    vpmovsxbw (%rdi), %ymm0
2732; AVX2-NEXT:    retq
2733;
2734; AVX512-LABEL: load_sext_16i8_to_16i16:
2735; AVX512:       # %bb.0: # %entry
2736; AVX512-NEXT:    vpmovsxbw (%rdi), %ymm0
2737; AVX512-NEXT:    retq
2738;
2739; X86-SSE2-LABEL: load_sext_16i8_to_16i16:
2740; X86-SSE2:       # %bb.0: # %entry
2741; X86-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %eax
2742; X86-SSE2-NEXT:    movdqa (%eax), %xmm1
2743; X86-SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
2744; X86-SSE2-NEXT:    psraw $8, %xmm0
2745; X86-SSE2-NEXT:    punpckhbw {{.*#+}} xmm1 = xmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
2746; X86-SSE2-NEXT:    psraw $8, %xmm1
2747; X86-SSE2-NEXT:    retl
2748;
2749; X86-SSE41-LABEL: load_sext_16i8_to_16i16:
2750; X86-SSE41:       # %bb.0: # %entry
2751; X86-SSE41-NEXT:    movl {{[0-9]+}}(%esp), %eax
2752; X86-SSE41-NEXT:    pmovsxbw (%eax), %xmm0
2753; X86-SSE41-NEXT:    pmovsxbw 8(%eax), %xmm1
2754; X86-SSE41-NEXT:    retl
2755entry:
2756 %X = load <16 x i8>, <16 x i8>* %ptr
2757 %Y = sext <16 x i8> %X to <16 x i16>
2758 ret <16 x i16> %Y
2759}
2760
; Sign-extend a loaded <2 x i16> to <2 x i64>.
; The assertions expect a single (v)pmovsxwq with a folded memory operand from
; SSE4.1 upward. On SSE2 and SSSE3 the expected sequence is a 32-bit movd load,
; a pshuflw, a pcmpgtd against zero to produce the sign words, "psrad $16",
; and a punpckldq that interleaves values with their sign masks.
2761define <2 x i64> @load_sext_2i16_to_2i64(<2 x i16> *%ptr) {
2762; SSE2-LABEL: load_sext_2i16_to_2i64:
2763; SSE2:       # %bb.0: # %entry
2764; SSE2-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
2765; SSE2-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[0,0,2,1,4,5,6,7]
2766; SSE2-NEXT:    pxor %xmm1, %xmm1
2767; SSE2-NEXT:    pcmpgtd %xmm0, %xmm1
2768; SSE2-NEXT:    psrad $16, %xmm0
2769; SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
2770; SSE2-NEXT:    retq
2771;
2772; SSSE3-LABEL: load_sext_2i16_to_2i64:
2773; SSSE3:       # %bb.0: # %entry
2774; SSSE3-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
2775; SSSE3-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[0,0,2,1,4,5,6,7]
2776; SSSE3-NEXT:    pxor %xmm1, %xmm1
2777; SSSE3-NEXT:    pcmpgtd %xmm0, %xmm1
2778; SSSE3-NEXT:    psrad $16, %xmm0
2779; SSSE3-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
2780; SSSE3-NEXT:    retq
2781;
2782; SSE41-LABEL: load_sext_2i16_to_2i64:
2783; SSE41:       # %bb.0: # %entry
2784; SSE41-NEXT:    pmovsxwq (%rdi), %xmm0
2785; SSE41-NEXT:    retq
2786;
2787; AVX-LABEL: load_sext_2i16_to_2i64:
2788; AVX:       # %bb.0: # %entry
2789; AVX-NEXT:    vpmovsxwq (%rdi), %xmm0
2790; AVX-NEXT:    retq
2791;
2792; X86-SSE2-LABEL: load_sext_2i16_to_2i64:
2793; X86-SSE2:       # %bb.0: # %entry
2794; X86-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %eax
2795; X86-SSE2-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
2796; X86-SSE2-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[0,0,2,1,4,5,6,7]
2797; X86-SSE2-NEXT:    pxor %xmm1, %xmm1
2798; X86-SSE2-NEXT:    pcmpgtd %xmm0, %xmm1
2799; X86-SSE2-NEXT:    psrad $16, %xmm0
2800; X86-SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
2801; X86-SSE2-NEXT:    retl
2802;
2803; X86-SSE41-LABEL: load_sext_2i16_to_2i64:
2804; X86-SSE41:       # %bb.0: # %entry
2805; X86-SSE41-NEXT:    movl {{[0-9]+}}(%esp), %eax
2806; X86-SSE41-NEXT:    pmovsxwq (%eax), %xmm0
2807; X86-SSE41-NEXT:    retl
2808entry:
2809 %X = load <2 x i16>, <2 x i16>* %ptr
2810 %Y = sext <2 x i16> %X to <2 x i64>
2811 ret <2 x i64> %Y
2812}
2813
; Sign-extend a loaded <4 x i16> to <4 x i32>.
; The assertions expect a single (v)pmovsxwd with a folded memory operand from
; SSE4.1 upward, and a 64-bit movq load + punpcklwd + "psrad $16" on SSE2 and
; SSSE3.
2814define <4 x i32> @load_sext_4i16_to_4i32(<4 x i16> *%ptr) {
2815; SSE2-LABEL: load_sext_4i16_to_4i32:
2816; SSE2:       # %bb.0: # %entry
2817; SSE2-NEXT:    movq {{.*#+}} xmm0 = mem[0],zero
2818; SSE2-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
2819; SSE2-NEXT:    psrad $16, %xmm0
2820; SSE2-NEXT:    retq
2821;
2822; SSSE3-LABEL: load_sext_4i16_to_4i32:
2823; SSSE3:       # %bb.0: # %entry
2824; SSSE3-NEXT:    movq {{.*#+}} xmm0 = mem[0],zero
2825; SSSE3-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
2826; SSSE3-NEXT:    psrad $16, %xmm0
2827; SSSE3-NEXT:    retq
2828;
2829; SSE41-LABEL: load_sext_4i16_to_4i32:
2830; SSE41:       # %bb.0: # %entry
2831; SSE41-NEXT:    pmovsxwd (%rdi), %xmm0
2832; SSE41-NEXT:    retq
2833;
2834; AVX-LABEL: load_sext_4i16_to_4i32:
2835; AVX:       # %bb.0: # %entry
2836; AVX-NEXT:    vpmovsxwd (%rdi), %xmm0
2837; AVX-NEXT:    retq
2838;
2839; X86-SSE2-LABEL: load_sext_4i16_to_4i32:
2840; X86-SSE2:       # %bb.0: # %entry
2841; X86-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %eax
2842; X86-SSE2-NEXT:    movq {{.*#+}} xmm0 = mem[0],zero
2843; X86-SSE2-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
2844; X86-SSE2-NEXT:    psrad $16, %xmm0
2845; X86-SSE2-NEXT:    retl
2846;
2847; X86-SSE41-LABEL: load_sext_4i16_to_4i32:
2848; X86-SSE41:       # %bb.0: # %entry
2849; X86-SSE41-NEXT:    movl {{[0-9]+}}(%esp), %eax
2850; X86-SSE41-NEXT:    pmovsxwd (%eax), %xmm0
2851; X86-SSE41-NEXT:    retl
2852entry:
2853 %X = load <4 x i16>, <4 x i16>* %ptr
2854 %Y = sext <4 x i16> %X to <4 x i32>
2855 ret <4 x i32> %Y
2856}
2857
; Sign-extend a loaded <4 x i16> to <4 x i64>.
; The assertions expect (v)pmovsxwq with a folded memory operand from SSE4.1
; upward (two xmm halves at offsets 0 and 4 on SSE4.1 and AVX1, one ymm form on
; AVX2 and AVX512). On SSE2 and SSSE3 the words are first widened to i32 with
; punpcklwd + "psrad $16", then paired with a pcmpgtd sign mask via
; punpckldq/punpckhdq.
2858define <4 x i64> @load_sext_4i16_to_4i64(<4 x i16> *%ptr) {
2859; SSE2-LABEL: load_sext_4i16_to_4i64:
2860; SSE2:       # %bb.0: # %entry
2861; SSE2-NEXT:    movq {{.*#+}} xmm0 = mem[0],zero
2862; SSE2-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
2863; SSE2-NEXT:    psrad $16, %xmm1
2864; SSE2-NEXT:    pxor %xmm2, %xmm2
2865; SSE2-NEXT:    pcmpgtd %xmm1, %xmm2
2866; SSE2-NEXT:    movdqa %xmm1, %xmm0
2867; SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
2868; SSE2-NEXT:    punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm2[2],xmm1[3],xmm2[3]
2869; SSE2-NEXT:    retq
2870;
2871; SSSE3-LABEL: load_sext_4i16_to_4i64:
2872; SSSE3:       # %bb.0: # %entry
2873; SSSE3-NEXT:    movq {{.*#+}} xmm0 = mem[0],zero
2874; SSSE3-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
2875; SSSE3-NEXT:    psrad $16, %xmm1
2876; SSSE3-NEXT:    pxor %xmm2, %xmm2
2877; SSSE3-NEXT:    pcmpgtd %xmm1, %xmm2
2878; SSSE3-NEXT:    movdqa %xmm1, %xmm0
2879; SSSE3-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
2880; SSSE3-NEXT:    punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm2[2],xmm1[3],xmm2[3]
2881; SSSE3-NEXT:    retq
2882;
2883; SSE41-LABEL: load_sext_4i16_to_4i64:
2884; SSE41:       # %bb.0: # %entry
2885; SSE41-NEXT:    pmovsxwq (%rdi), %xmm0
2886; SSE41-NEXT:    pmovsxwq 4(%rdi), %xmm1
2887; SSE41-NEXT:    retq
2888;
2889; AVX1-LABEL: load_sext_4i16_to_4i64:
2890; AVX1:       # %bb.0: # %entry
2891; AVX1-NEXT:    vpmovsxwq 4(%rdi), %xmm0
2892; AVX1-NEXT:    vpmovsxwq (%rdi), %xmm1
2893; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
2894; AVX1-NEXT:    retq
2895;
2896; AVX2-LABEL: load_sext_4i16_to_4i64:
2897; AVX2:       # %bb.0: # %entry
2898; AVX2-NEXT:    vpmovsxwq (%rdi), %ymm0
2899; AVX2-NEXT:    retq
2900;
2901; AVX512-LABEL: load_sext_4i16_to_4i64:
2902; AVX512:       # %bb.0: # %entry
2903; AVX512-NEXT:    vpmovsxwq (%rdi), %ymm0
2904; AVX512-NEXT:    retq
2905;
2906; X86-SSE2-LABEL: load_sext_4i16_to_4i64:
2907; X86-SSE2:       # %bb.0: # %entry
2908; X86-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %eax
2909; X86-SSE2-NEXT:    movq {{.*#+}} xmm0 = mem[0],zero
2910; X86-SSE2-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
2911; X86-SSE2-NEXT:    psrad $16, %xmm1
2912; X86-SSE2-NEXT:    pxor %xmm2, %xmm2
2913; X86-SSE2-NEXT:    pcmpgtd %xmm1, %xmm2
2914; X86-SSE2-NEXT:    movdqa %xmm1, %xmm0
2915; X86-SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
2916; X86-SSE2-NEXT:    punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm2[2],xmm1[3],xmm2[3]
2917; X86-SSE2-NEXT:    retl
2918;
2919; X86-SSE41-LABEL: load_sext_4i16_to_4i64:
2920; X86-SSE41:       # %bb.0: # %entry
2921; X86-SSE41-NEXT:    movl {{[0-9]+}}(%esp), %eax
2922; X86-SSE41-NEXT:    pmovsxwq (%eax), %xmm0
2923; X86-SSE41-NEXT:    pmovsxwq 4(%eax), %xmm1
2924; X86-SSE41-NEXT:    retl
2925entry:
2926 %X = load <4 x i16>, <4 x i16>* %ptr
2927 %Y = sext <4 x i16> %X to <4 x i64>
2928 ret <4 x i64> %Y
2929}
2930
; Sign-extend a loaded <8 x i16> to <8 x i32>.
; The assertions expect (v)pmovsxwd with a folded memory operand from SSE4.1
; upward (two xmm halves at offsets 0 and 8 on SSE4.1 and AVX1, one ymm form on
; AVX2 and AVX512) and a movdqa load followed by punpcklwd/punpckhwd +
; "psrad $16" on SSE2 and SSSE3.
2931define <8 x i32> @load_sext_8i16_to_8i32(<8 x i16> *%ptr) {
2932; SSE2-LABEL: load_sext_8i16_to_8i32:
2933; SSE2:       # %bb.0: # %entry
2934; SSE2-NEXT:    movdqa (%rdi), %xmm1
2935; SSE2-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
2936; SSE2-NEXT:    psrad $16, %xmm0
2937; SSE2-NEXT:    punpckhwd {{.*#+}} xmm1 = xmm1[4,4,5,5,6,6,7,7]
2938; SSE2-NEXT:    psrad $16, %xmm1
2939; SSE2-NEXT:    retq
2940;
2941; SSSE3-LABEL: load_sext_8i16_to_8i32:
2942; SSSE3:       # %bb.0: # %entry
2943; SSSE3-NEXT:    movdqa (%rdi), %xmm1
2944; SSSE3-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
2945; SSSE3-NEXT:    psrad $16, %xmm0
2946; SSSE3-NEXT:    punpckhwd {{.*#+}} xmm1 = xmm1[4,4,5,5,6,6,7,7]
2947; SSSE3-NEXT:    psrad $16, %xmm1
2948; SSSE3-NEXT:    retq
2949;
2950; SSE41-LABEL: load_sext_8i16_to_8i32:
2951; SSE41:       # %bb.0: # %entry
2952; SSE41-NEXT:    pmovsxwd (%rdi), %xmm0
2953; SSE41-NEXT:    pmovsxwd 8(%rdi), %xmm1
2954; SSE41-NEXT:    retq
2955;
2956; AVX1-LABEL: load_sext_8i16_to_8i32:
2957; AVX1:       # %bb.0: # %entry
2958; AVX1-NEXT:    vpmovsxwd 8(%rdi), %xmm0
2959; AVX1-NEXT:    vpmovsxwd (%rdi), %xmm1
2960; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
2961; AVX1-NEXT:    retq
2962;
2963; AVX2-LABEL: load_sext_8i16_to_8i32:
2964; AVX2:       # %bb.0: # %entry
2965; AVX2-NEXT:    vpmovsxwd (%rdi), %ymm0
2966; AVX2-NEXT:    retq
2967;
2968; AVX512-LABEL: load_sext_8i16_to_8i32:
2969; AVX512:       # %bb.0: # %entry
2970; AVX512-NEXT:    vpmovsxwd (%rdi), %ymm0
2971; AVX512-NEXT:    retq
2972;
2973; X86-SSE2-LABEL: load_sext_8i16_to_8i32:
2974; X86-SSE2:       # %bb.0: # %entry
2975; X86-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %eax
2976; X86-SSE2-NEXT:    movdqa (%eax), %xmm1
2977; X86-SSE2-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
2978; X86-SSE2-NEXT:    psrad $16, %xmm0
2979; X86-SSE2-NEXT:    punpckhwd {{.*#+}} xmm1 = xmm1[4,4,5,5,6,6,7,7]
2980; X86-SSE2-NEXT:    psrad $16, %xmm1
2981; X86-SSE2-NEXT:    retl
2982;
2983; X86-SSE41-LABEL: load_sext_8i16_to_8i32:
2984; X86-SSE41:       # %bb.0: # %entry
2985; X86-SSE41-NEXT:    movl {{[0-9]+}}(%esp), %eax
2986; X86-SSE41-NEXT:    pmovsxwd (%eax), %xmm0
2987; X86-SSE41-NEXT:    pmovsxwd 8(%eax), %xmm1
2988; X86-SSE41-NEXT:    retl
2989entry:
2990 %X = load <8 x i16>, <8 x i16>* %ptr
2991 %Y = sext <8 x i16> %X to <8 x i32>
2992 ret <8 x i32> %Y
2993}
2994
; Load a <2 x i32> vector (64 bits) from memory and sign-extend it to
; <2 x i64>. Without SSE4.1 the expected code derives the upper halves with
; pcmpgtd against zero and interleaves them with punpckldq. With SSE4.1 or
; AVX a single (v)pmovsxdq with a memory operand is expected.
2995define <2 x i64> @load_sext_2i32_to_2i64(<2 x i32> *%ptr) {
2996; SSE2-LABEL: load_sext_2i32_to_2i64:
2997; SSE2:       # %bb.0: # %entry
2998; SSE2-NEXT:    movq {{.*#+}} xmm0 = mem[0],zero
2999; SSE2-NEXT:    pxor %xmm1, %xmm1
3000; SSE2-NEXT:    pcmpgtd %xmm0, %xmm1
3001; SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
3002; SSE2-NEXT:    retq
3003;
3004; SSSE3-LABEL: load_sext_2i32_to_2i64:
3005; SSSE3:       # %bb.0: # %entry
3006; SSSE3-NEXT:    movq {{.*#+}} xmm0 = mem[0],zero
3007; SSSE3-NEXT:    pxor %xmm1, %xmm1
3008; SSSE3-NEXT:    pcmpgtd %xmm0, %xmm1
3009; SSSE3-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
3010; SSSE3-NEXT:    retq
3011;
3012; SSE41-LABEL: load_sext_2i32_to_2i64:
3013; SSE41:       # %bb.0: # %entry
3014; SSE41-NEXT:    pmovsxdq (%rdi), %xmm0
3015; SSE41-NEXT:    retq
3016;
3017; AVX-LABEL: load_sext_2i32_to_2i64:
3018; AVX:       # %bb.0: # %entry
3019; AVX-NEXT:    vpmovsxdq (%rdi), %xmm0
3020; AVX-NEXT:    retq
3021;
3022; X86-SSE2-LABEL: load_sext_2i32_to_2i64:
3023; X86-SSE2:       # %bb.0: # %entry
3024; X86-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %eax
3025; X86-SSE2-NEXT:    movq {{.*#+}} xmm0 = mem[0],zero
3026; X86-SSE2-NEXT:    pxor %xmm1, %xmm1
3027; X86-SSE2-NEXT:    pcmpgtd %xmm0, %xmm1
3028; X86-SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
3029; X86-SSE2-NEXT:    retl
3030;
3031; X86-SSE41-LABEL: load_sext_2i32_to_2i64:
3032; X86-SSE41:       # %bb.0: # %entry
3033; X86-SSE41-NEXT:    movl {{[0-9]+}}(%esp), %eax
3034; X86-SSE41-NEXT:    pmovsxdq (%eax), %xmm0
3035; X86-SSE41-NEXT:    retl
3036entry:
3037 %X = load <2 x i32>, <2 x i32>* %ptr
3038 %Y = sext <2 x i32> %X to <2 x i64>
3039 ret <2 x i64> %Y
3040}
3041
; Load a <4 x i32> vector from memory and sign-extend it to <4 x i64>.
; The 256-bit result is returned in xmm0 and xmm1 on the SSE runs and in ymm0
; on the AVX runs. AVX1 is expected to extend each half separately and join
; them with vinsertf128.
3042define <4 x i64> @load_sext_4i32_to_4i64(<4 x i32> *%ptr) {
3043; SSE2-LABEL: load_sext_4i32_to_4i64:
3044; SSE2:       # %bb.0: # %entry
3045; SSE2-NEXT:    movdqa (%rdi), %xmm0
3046; SSE2-NEXT:    pxor %xmm2, %xmm2
3047; SSE2-NEXT:    pxor %xmm3, %xmm3
3048; SSE2-NEXT:    pcmpgtd %xmm0, %xmm3
3049; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
3050; SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1]
3051; SSE2-NEXT:    pcmpgtd %xmm1, %xmm2
3052; SSE2-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
3053; SSE2-NEXT:    retq
3054;
3055; SSSE3-LABEL: load_sext_4i32_to_4i64:
3056; SSSE3:       # %bb.0: # %entry
3057; SSSE3-NEXT:    movdqa (%rdi), %xmm0
3058; SSSE3-NEXT:    pxor %xmm2, %xmm2
3059; SSSE3-NEXT:    pxor %xmm3, %xmm3
3060; SSSE3-NEXT:    pcmpgtd %xmm0, %xmm3
3061; SSSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
3062; SSSE3-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1]
3063; SSSE3-NEXT:    pcmpgtd %xmm1, %xmm2
3064; SSSE3-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
3065; SSSE3-NEXT:    retq
3066;
3067; SSE41-LABEL: load_sext_4i32_to_4i64:
3068; SSE41:       # %bb.0: # %entry
3069; SSE41-NEXT:    pmovsxdq (%rdi), %xmm0
3070; SSE41-NEXT:    pmovsxdq 8(%rdi), %xmm1
3071; SSE41-NEXT:    retq
3072;
3073; AVX1-LABEL: load_sext_4i32_to_4i64:
3074; AVX1:       # %bb.0: # %entry
3075; AVX1-NEXT:    vpmovsxdq 8(%rdi), %xmm0
3076; AVX1-NEXT:    vpmovsxdq (%rdi), %xmm1
3077; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
3078; AVX1-NEXT:    retq
3079;
3080; AVX2-LABEL: load_sext_4i32_to_4i64:
3081; AVX2:       # %bb.0: # %entry
3082; AVX2-NEXT:    vpmovsxdq (%rdi), %ymm0
3083; AVX2-NEXT:    retq
3084;
3085; AVX512-LABEL: load_sext_4i32_to_4i64:
3086; AVX512:       # %bb.0: # %entry
3087; AVX512-NEXT:    vpmovsxdq (%rdi), %ymm0
3088; AVX512-NEXT:    retq
3089;
3090; X86-SSE2-LABEL: load_sext_4i32_to_4i64:
3091; X86-SSE2:       # %bb.0: # %entry
3092; X86-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %eax
3093; X86-SSE2-NEXT:    movdqa (%eax), %xmm0
3094; X86-SSE2-NEXT:    pxor %xmm2, %xmm2
3095; X86-SSE2-NEXT:    pxor %xmm3, %xmm3
3096; X86-SSE2-NEXT:    pcmpgtd %xmm0, %xmm3
3097; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
3098; X86-SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1]
3099; X86-SSE2-NEXT:    pcmpgtd %xmm1, %xmm2
3100; X86-SSE2-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
3101; X86-SSE2-NEXT:    retl
3102;
3103; X86-SSE41-LABEL: load_sext_4i32_to_4i64:
3104; X86-SSE41:       # %bb.0: # %entry
3105; X86-SSE41-NEXT:    movl {{[0-9]+}}(%esp), %eax
3106; X86-SSE41-NEXT:    pmovsxdq (%eax), %xmm0
3107; X86-SSE41-NEXT:    pmovsxdq 8(%eax), %xmm1
3108; X86-SSE41-NEXT:    retl
3109entry:
3110 %X = load <4 x i32>, <4 x i32>* %ptr
3111 %Y = sext <4 x i32> %X to <4 x i64>
3112 ret <4 x i64> %Y
3113}
3114
; Take the two lowest bytes of %A, sign-extend them to <2 x i16> and return
; the pair reinterpreted as a single i32. The expected code sign-extends in
; a vector register and then moves the low 32 bits to eax.
3115define i32 @sext_2i8_to_i32(<16 x i8> %A) nounwind uwtable readnone ssp {
3116; SSE2-LABEL: sext_2i8_to_i32:
3117; SSE2:       # %bb.0: # %entry
3118; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
3119; SSE2-NEXT:    psraw $8, %xmm0
3120; SSE2-NEXT:    movd %xmm0, %eax
3121; SSE2-NEXT:    retq
3122;
3123; SSSE3-LABEL: sext_2i8_to_i32:
3124; SSSE3:       # %bb.0: # %entry
3125; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
3126; SSSE3-NEXT:    psraw $8, %xmm0
3127; SSSE3-NEXT:    movd %xmm0, %eax
3128; SSSE3-NEXT:    retq
3129;
3130; SSE41-LABEL: sext_2i8_to_i32:
3131; SSE41:       # %bb.0: # %entry
3132; SSE41-NEXT:    pmovsxbw %xmm0, %xmm0
3133; SSE41-NEXT:    movd %xmm0, %eax
3134; SSE41-NEXT:    retq
3135;
3136; AVX-LABEL: sext_2i8_to_i32:
3137; AVX:       # %bb.0: # %entry
3138; AVX-NEXT:    vpmovsxbw %xmm0, %xmm0
3139; AVX-NEXT:    vmovd %xmm0, %eax
3140; AVX-NEXT:    retq
3141;
3142; X86-SSE2-LABEL: sext_2i8_to_i32:
3143; X86-SSE2:       # %bb.0: # %entry
3144; X86-SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
3145; X86-SSE2-NEXT:    psraw $8, %xmm0
3146; X86-SSE2-NEXT:    movd %xmm0, %eax
3147; X86-SSE2-NEXT:    retl
3148;
3149; X86-SSE41-LABEL: sext_2i8_to_i32:
3150; X86-SSE41:       # %bb.0: # %entry
3151; X86-SSE41-NEXT:    pmovsxbw %xmm0, %xmm0
3152; X86-SSE41-NEXT:    movd %xmm0, %eax
3153; X86-SSE41-NEXT:    retl
3154entry:
3155  %Shuf = shufflevector <16 x i8> %A, <16 x i8> undef, <2 x i32> <i32 0, i32 1>
3156  %Ex = sext <2 x i8> %Shuf to <2 x i16>
3157  %Bc = bitcast <2 x i16> %Ex to i32
3158  ret i32 %Bc
3159}
3160
; Sign-extend a <4 x i1> mask argument to <4 x i64>. Every run is expected
; to first replicate bit 0 of each 32-bit lane with pslld $31 and psrad $31,
; and then widen the resulting i32 lanes to i64.
3161define <4 x i64> @sext_4i1_to_4i64(<4 x i1> %mask) {
3162; SSE2-LABEL: sext_4i1_to_4i64:
3163; SSE2:       # %bb.0:
3164; SSE2-NEXT:    pslld $31, %xmm0
3165; SSE2-NEXT:    psrad $31, %xmm0
3166; SSE2-NEXT:    pxor %xmm2, %xmm2
3167; SSE2-NEXT:    pxor %xmm3, %xmm3
3168; SSE2-NEXT:    pcmpgtd %xmm0, %xmm3
3169; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
3170; SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1]
3171; SSE2-NEXT:    pcmpgtd %xmm1, %xmm2
3172; SSE2-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
3173; SSE2-NEXT:    retq
3174;
3175; SSSE3-LABEL: sext_4i1_to_4i64:
3176; SSSE3:       # %bb.0:
3177; SSSE3-NEXT:    pslld $31, %xmm0
3178; SSSE3-NEXT:    psrad $31, %xmm0
3179; SSSE3-NEXT:    pxor %xmm2, %xmm2
3180; SSSE3-NEXT:    pxor %xmm3, %xmm3
3181; SSSE3-NEXT:    pcmpgtd %xmm0, %xmm3
3182; SSSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
3183; SSSE3-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1]
3184; SSSE3-NEXT:    pcmpgtd %xmm1, %xmm2
3185; SSSE3-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
3186; SSSE3-NEXT:    retq
3187;
3188; SSE41-LABEL: sext_4i1_to_4i64:
3189; SSE41:       # %bb.0:
3190; SSE41-NEXT:    pslld $31, %xmm0
3191; SSE41-NEXT:    psrad $31, %xmm0
3192; SSE41-NEXT:    pmovsxdq %xmm0, %xmm2
3193; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
3194; SSE41-NEXT:    pmovsxdq %xmm0, %xmm1
3195; SSE41-NEXT:    movdqa %xmm2, %xmm0
3196; SSE41-NEXT:    retq
3197;
3198; AVX1-LABEL: sext_4i1_to_4i64:
3199; AVX1:       # %bb.0:
3200; AVX1-NEXT:    vpslld $31, %xmm0, %xmm0
3201; AVX1-NEXT:    vpsrad $31, %xmm0, %xmm0
3202; AVX1-NEXT:    vpmovsxdq %xmm0, %xmm1
3203; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
3204; AVX1-NEXT:    vpmovsxdq %xmm0, %xmm0
3205; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
3206; AVX1-NEXT:    retq
3207;
3208; AVX2-LABEL: sext_4i1_to_4i64:
3209; AVX2:       # %bb.0:
3210; AVX2-NEXT:    vpslld $31, %xmm0, %xmm0
3211; AVX2-NEXT:    vpsrad $31, %xmm0, %xmm0
3212; AVX2-NEXT:    vpmovsxdq %xmm0, %ymm0
3213; AVX2-NEXT:    retq
3214;
3215; AVX512-LABEL: sext_4i1_to_4i64:
3216; AVX512:       # %bb.0:
3217; AVX512-NEXT:    vpslld $31, %xmm0, %xmm0
3218; AVX512-NEXT:    vpsrad $31, %xmm0, %xmm0
3219; AVX512-NEXT:    vpmovsxdq %xmm0, %ymm0
3220; AVX512-NEXT:    retq
3221;
3222; X86-SSE2-LABEL: sext_4i1_to_4i64:
3223; X86-SSE2:       # %bb.0:
3224; X86-SSE2-NEXT:    pslld $31, %xmm0
3225; X86-SSE2-NEXT:    psrad $31, %xmm0
3226; X86-SSE2-NEXT:    pxor %xmm2, %xmm2
3227; X86-SSE2-NEXT:    pxor %xmm3, %xmm3
3228; X86-SSE2-NEXT:    pcmpgtd %xmm0, %xmm3
3229; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
3230; X86-SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1]
3231; X86-SSE2-NEXT:    pcmpgtd %xmm1, %xmm2
3232; X86-SSE2-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
3233; X86-SSE2-NEXT:    retl
3234;
3235; X86-SSE41-LABEL: sext_4i1_to_4i64:
3236; X86-SSE41:       # %bb.0:
3237; X86-SSE41-NEXT:    pslld $31, %xmm0
3238; X86-SSE41-NEXT:    psrad $31, %xmm0
3239; X86-SSE41-NEXT:    pmovsxdq %xmm0, %xmm2
3240; X86-SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
3241; X86-SSE41-NEXT:    pmovsxdq %xmm0, %xmm1
3242; X86-SSE41-NEXT:    movdqa %xmm2, %xmm0
3243; X86-SSE41-NEXT:    retl
3244  %extmask = sext <4 x i1> %mask to <4 x i64>
3245  ret <4 x i64> %extmask
3246}
3247
; Sign-extend a <4 x i8> argument to <4 x i64>. AVX2 and AVX512 runs expect a
; single vpmovsxbq into ymm0. The SSE4.1 and AVX1 runs expect two pmovsxbq
; conversions, the second one on the input shifted right by 16 bits.
3248define <4 x i64> @sext_4i8_to_4i64(<4 x i8> %mask) {
3249; SSE2-LABEL: sext_4i8_to_4i64:
3250; SSE2:       # %bb.0:
3251; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
3252; SSE2-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
3253; SSE2-NEXT:    psrad $24, %xmm1
3254; SSE2-NEXT:    pxor %xmm2, %xmm2
3255; SSE2-NEXT:    pcmpgtd %xmm1, %xmm2
3256; SSE2-NEXT:    movdqa %xmm1, %xmm0
3257; SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
3258; SSE2-NEXT:    punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm2[2],xmm1[3],xmm2[3]
3259; SSE2-NEXT:    retq
3260;
3261; SSSE3-LABEL: sext_4i8_to_4i64:
3262; SSSE3:       # %bb.0:
3263; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
3264; SSSE3-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
3265; SSSE3-NEXT:    psrad $24, %xmm1
3266; SSSE3-NEXT:    pxor %xmm2, %xmm2
3267; SSSE3-NEXT:    pcmpgtd %xmm1, %xmm2
3268; SSSE3-NEXT:    movdqa %xmm1, %xmm0
3269; SSSE3-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
3270; SSSE3-NEXT:    punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm2[2],xmm1[3],xmm2[3]
3271; SSSE3-NEXT:    retq
3272;
3273; SSE41-LABEL: sext_4i8_to_4i64:
3274; SSE41:       # %bb.0:
3275; SSE41-NEXT:    pmovsxbq %xmm0, %xmm2
3276; SSE41-NEXT:    psrld $16, %xmm0
3277; SSE41-NEXT:    pmovsxbq %xmm0, %xmm1
3278; SSE41-NEXT:    movdqa %xmm2, %xmm0
3279; SSE41-NEXT:    retq
3280;
3281; AVX1-LABEL: sext_4i8_to_4i64:
3282; AVX1:       # %bb.0:
3283; AVX1-NEXT:    vpmovsxbq %xmm0, %xmm1
3284; AVX1-NEXT:    vpsrld $16, %xmm0, %xmm0
3285; AVX1-NEXT:    vpmovsxbq %xmm0, %xmm0
3286; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
3287; AVX1-NEXT:    retq
3288;
3289; AVX2-LABEL: sext_4i8_to_4i64:
3290; AVX2:       # %bb.0:
3291; AVX2-NEXT:    vpmovsxbq %xmm0, %ymm0
3292; AVX2-NEXT:    retq
3293;
3294; AVX512-LABEL: sext_4i8_to_4i64:
3295; AVX512:       # %bb.0:
3296; AVX512-NEXT:    vpmovsxbq %xmm0, %ymm0
3297; AVX512-NEXT:    retq
3298;
3299; X86-SSE2-LABEL: sext_4i8_to_4i64:
3300; X86-SSE2:       # %bb.0:
3301; X86-SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
3302; X86-SSE2-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
3303; X86-SSE2-NEXT:    psrad $24, %xmm1
3304; X86-SSE2-NEXT:    pxor %xmm2, %xmm2
3305; X86-SSE2-NEXT:    pcmpgtd %xmm1, %xmm2
3306; X86-SSE2-NEXT:    movdqa %xmm1, %xmm0
3307; X86-SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
3308; X86-SSE2-NEXT:    punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm2[2],xmm1[3],xmm2[3]
3309; X86-SSE2-NEXT:    retl
3310;
3311; X86-SSE41-LABEL: sext_4i8_to_4i64:
3312; X86-SSE41:       # %bb.0:
3313; X86-SSE41-NEXT:    pmovsxbq %xmm0, %xmm2
3314; X86-SSE41-NEXT:    psrld $16, %xmm0
3315; X86-SSE41-NEXT:    pmovsxbq %xmm0, %xmm1
3316; X86-SSE41-NEXT:    movdqa %xmm2, %xmm0
3317; X86-SSE41-NEXT:    retl
3318  %extmask = sext <4 x i8> %mask to <4 x i64>
3319  ret <4 x i64> %extmask
3320}
3321
; Compare two <32 x i16> vectors for equality and sign-extend the <32 x i1>
; result to <32 x i8>. Most runs expect word compares narrowed to bytes with
; a pack or truncate. The AVX512BW run expects a compare into mask register
; k0 followed by vpmovm2b. The 32-bit runs expect part of the operands to be
; read from the stack through ebp after realigning esp to 16 bytes.
3322define <32 x i8> @sext_32xi1_to_32xi8(<32 x i16> %c1, <32 x i16> %c2)nounwind {
3323; SSE-LABEL: sext_32xi1_to_32xi8:
3324; SSE:       # %bb.0:
3325; SSE-NEXT:    pcmpeqw %xmm5, %xmm1
3326; SSE-NEXT:    pcmpeqw %xmm4, %xmm0
3327; SSE-NEXT:    packsswb %xmm1, %xmm0
3328; SSE-NEXT:    pcmpeqw %xmm7, %xmm3
3329; SSE-NEXT:    pcmpeqw %xmm6, %xmm2
3330; SSE-NEXT:    packsswb %xmm3, %xmm2
3331; SSE-NEXT:    movdqa %xmm2, %xmm1
3332; SSE-NEXT:    retq
3333;
3334; AVX1-LABEL: sext_32xi1_to_32xi8:
3335; AVX1:       # %bb.0:
3336; AVX1-NEXT:    vextractf128 $1, %ymm3, %xmm4
3337; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm5
3338; AVX1-NEXT:    vpcmpeqw %xmm4, %xmm5, %xmm4
3339; AVX1-NEXT:    vpcmpeqw %xmm3, %xmm1, %xmm1
3340; AVX1-NEXT:    vpacksswb %xmm4, %xmm1, %xmm1
3341; AVX1-NEXT:    vextractf128 $1, %ymm2, %xmm3
3342; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm4
3343; AVX1-NEXT:    vpcmpeqw %xmm3, %xmm4, %xmm3
3344; AVX1-NEXT:    vpcmpeqw %xmm2, %xmm0, %xmm0
3345; AVX1-NEXT:    vpacksswb %xmm3, %xmm0, %xmm0
3346; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
3347; AVX1-NEXT:    retq
3348;
3349; AVX2-LABEL: sext_32xi1_to_32xi8:
3350; AVX2:       # %bb.0:
3351; AVX2-NEXT:    vpcmpeqw %ymm3, %ymm1, %ymm1
3352; AVX2-NEXT:    vpcmpeqw %ymm2, %ymm0, %ymm0
3353; AVX2-NEXT:    vpacksswb %ymm1, %ymm0, %ymm0
3354; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
3355; AVX2-NEXT:    retq
3356;
3357; AVX512F-LABEL: sext_32xi1_to_32xi8:
3358; AVX512F:       # %bb.0:
3359; AVX512F-NEXT:    vextracti64x4 $1, %zmm1, %ymm2
3360; AVX512F-NEXT:    vextracti64x4 $1, %zmm0, %ymm3
3361; AVX512F-NEXT:    vpcmpeqw %ymm2, %ymm3, %ymm2
3362; AVX512F-NEXT:    vpcmpeqw %ymm1, %ymm0, %ymm0
3363; AVX512F-NEXT:    vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
3364; AVX512F-NEXT:    vpmovdb %zmm0, %xmm0
3365; AVX512F-NEXT:    vpmovzxwd {{.*#+}} zmm1 = ymm2[0],zero,ymm2[1],zero,ymm2[2],zero,ymm2[3],zero,ymm2[4],zero,ymm2[5],zero,ymm2[6],zero,ymm2[7],zero,ymm2[8],zero,ymm2[9],zero,ymm2[10],zero,ymm2[11],zero,ymm2[12],zero,ymm2[13],zero,ymm2[14],zero,ymm2[15],zero
3366; AVX512F-NEXT:    vpmovdb %zmm1, %xmm1
3367; AVX512F-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
3368; AVX512F-NEXT:    retq
3369;
3370; AVX512BW-LABEL: sext_32xi1_to_32xi8:
3371; AVX512BW:       # %bb.0:
3372; AVX512BW-NEXT:    vpcmpeqw %zmm1, %zmm0, %k0
3373; AVX512BW-NEXT:    vpmovm2b %k0, %zmm0
3374; AVX512BW-NEXT:    # kill: def $ymm0 killed $ymm0 killed $zmm0
3375; AVX512BW-NEXT:    retq
3376;
3377; X86-SSE-LABEL: sext_32xi1_to_32xi8:
3378; X86-SSE:       # %bb.0:
3379; X86-SSE-NEXT:    pushl %ebp
3380; X86-SSE-NEXT:    movl %esp, %ebp
3381; X86-SSE-NEXT:    andl $-16, %esp
3382; X86-SSE-NEXT:    subl $16, %esp
3383; X86-SSE-NEXT:    movdqa 8(%ebp), %xmm3
3384; X86-SSE-NEXT:    pcmpeqw 40(%ebp), %xmm1
3385; X86-SSE-NEXT:    pcmpeqw 24(%ebp), %xmm0
3386; X86-SSE-NEXT:    packsswb %xmm1, %xmm0
3387; X86-SSE-NEXT:    pcmpeqw 72(%ebp), %xmm3
3388; X86-SSE-NEXT:    pcmpeqw 56(%ebp), %xmm2
3389; X86-SSE-NEXT:    packsswb %xmm3, %xmm2
3390; X86-SSE-NEXT:    movdqa %xmm2, %xmm1
3391; X86-SSE-NEXT:    movl %ebp, %esp
3392; X86-SSE-NEXT:    popl %ebp
3393; X86-SSE-NEXT:    retl
3394  %a = icmp eq <32 x i16> %c1, %c2
3395  %b = sext <32 x i1> %a to <32 x i8>
3396  ret <32 x i8> %b
3397}
3398
; Load a byte-aligned <2 x i8> vector, sign-extend it to <2 x i32> and add
; the result to itself. Every run expects the two bytes to be fetched with
; one scalar movzwl and moved into a vector register before the extension.
3399define <2 x i32> @sext_2i8_to_2i32(<2 x i8>* %addr) {
3400; SSE2-LABEL: sext_2i8_to_2i32:
3401; SSE2:       # %bb.0:
3402; SSE2-NEXT:    movzwl (%rdi), %eax
3403; SSE2-NEXT:    movd %eax, %xmm0
3404; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
3405; SSE2-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
3406; SSE2-NEXT:    psrad $24, %xmm0
3407; SSE2-NEXT:    paddd %xmm0, %xmm0
3408; SSE2-NEXT:    retq
3409;
3410; SSSE3-LABEL: sext_2i8_to_2i32:
3411; SSSE3:       # %bb.0:
3412; SSSE3-NEXT:    movzwl (%rdi), %eax
3413; SSSE3-NEXT:    movd %eax, %xmm0
3414; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
3415; SSSE3-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
3416; SSSE3-NEXT:    psrad $24, %xmm0
3417; SSSE3-NEXT:    paddd %xmm0, %xmm0
3418; SSSE3-NEXT:    retq
3419;
3420; SSE41-LABEL: sext_2i8_to_2i32:
3421; SSE41:       # %bb.0:
3422; SSE41-NEXT:    movzwl (%rdi), %eax
3423; SSE41-NEXT:    movd %eax, %xmm0
3424; SSE41-NEXT:    pmovsxbd %xmm0, %xmm0
3425; SSE41-NEXT:    paddd %xmm0, %xmm0
3426; SSE41-NEXT:    retq
3427;
3428; AVX-LABEL: sext_2i8_to_2i32:
3429; AVX:       # %bb.0:
3430; AVX-NEXT:    movzwl (%rdi), %eax
3431; AVX-NEXT:    vmovd %eax, %xmm0
3432; AVX-NEXT:    vpmovsxbd %xmm0, %xmm0
3433; AVX-NEXT:    vpaddd %xmm0, %xmm0, %xmm0
3434; AVX-NEXT:    retq
3435;
3436; X86-SSE2-LABEL: sext_2i8_to_2i32:
3437; X86-SSE2:       # %bb.0:
3438; X86-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %eax
3439; X86-SSE2-NEXT:    movzwl (%eax), %eax
3440; X86-SSE2-NEXT:    movd %eax, %xmm0
3441; X86-SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
3442; X86-SSE2-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
3443; X86-SSE2-NEXT:    psrad $24, %xmm0
3444; X86-SSE2-NEXT:    paddd %xmm0, %xmm0
3445; X86-SSE2-NEXT:    retl
3446;
3447; X86-SSE41-LABEL: sext_2i8_to_2i32:
3448; X86-SSE41:       # %bb.0:
3449; X86-SSE41-NEXT:    movl {{[0-9]+}}(%esp), %eax
3450; X86-SSE41-NEXT:    movzwl (%eax), %eax
3451; X86-SSE41-NEXT:    movd %eax, %xmm0
3452; X86-SSE41-NEXT:    pmovsxbd %xmm0, %xmm0
3453; X86-SSE41-NEXT:    paddd %xmm0, %xmm0
3454; X86-SSE41-NEXT:    retl
3455  %x = load <2 x i8>, <2 x i8>* %addr, align 1
3456  %y = sext <2 x i8> %x to <2 x i32>
3457  %z = add <2 x i32>%y, %y
3458  ret <2 x i32>%z
3459}
3460
; Load a <4 x i17> vector and sign-extend it to <4 x i32>. The expected code
; extracts each 17-bit field with scalar shifts (a shll/sarl pair performs
; the sign extension) and then assembles the four results into xmm0.
3461define <4 x i32> @sext_4i17_to_4i32(<4 x i17>* %ptr) {
3462; SSE2-LABEL: sext_4i17_to_4i32:
3463; SSE2:       # %bb.0:
3464; SSE2-NEXT:    movq (%rdi), %rax
3465; SSE2-NEXT:    movl %eax, %ecx
3466; SSE2-NEXT:    shll $15, %ecx
3467; SSE2-NEXT:    sarl $15, %ecx
3468; SSE2-NEXT:    movd %ecx, %xmm0
3469; SSE2-NEXT:    movq %rax, %rcx
3470; SSE2-NEXT:    shrq $17, %rcx
3471; SSE2-NEXT:    shll $15, %ecx
3472; SSE2-NEXT:    sarl $15, %ecx
3473; SSE2-NEXT:    movd %ecx, %xmm1
3474; SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
3475; SSE2-NEXT:    movl 8(%rdi), %ecx
3476; SSE2-NEXT:    shll $13, %ecx
3477; SSE2-NEXT:    movq %rax, %rdx
3478; SSE2-NEXT:    shrq $51, %rdx
3479; SSE2-NEXT:    orl %ecx, %edx
3480; SSE2-NEXT:    shll $15, %edx
3481; SSE2-NEXT:    sarl $15, %edx
3482; SSE2-NEXT:    movd %edx, %xmm1
3483; SSE2-NEXT:    shrq $34, %rax
3484; SSE2-NEXT:    shll $15, %eax
3485; SSE2-NEXT:    sarl $15, %eax
3486; SSE2-NEXT:    movd %eax, %xmm2
3487; SSE2-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
3488; SSE2-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0]
3489; SSE2-NEXT:    retq
3490;
3491; SSSE3-LABEL: sext_4i17_to_4i32:
3492; SSSE3:       # %bb.0:
3493; SSSE3-NEXT:    movq (%rdi), %rax
3494; SSSE3-NEXT:    movl %eax, %ecx
3495; SSSE3-NEXT:    shll $15, %ecx
3496; SSSE3-NEXT:    sarl $15, %ecx
3497; SSSE3-NEXT:    movd %ecx, %xmm0
3498; SSSE3-NEXT:    movq %rax, %rcx
3499; SSSE3-NEXT:    shrq $17, %rcx
3500; SSSE3-NEXT:    shll $15, %ecx
3501; SSSE3-NEXT:    sarl $15, %ecx
3502; SSSE3-NEXT:    movd %ecx, %xmm1
3503; SSSE3-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
3504; SSSE3-NEXT:    movl 8(%rdi), %ecx
3505; SSSE3-NEXT:    shll $13, %ecx
3506; SSSE3-NEXT:    movq %rax, %rdx
3507; SSSE3-NEXT:    shrq $51, %rdx
3508; SSSE3-NEXT:    orl %ecx, %edx
3509; SSSE3-NEXT:    shll $15, %edx
3510; SSSE3-NEXT:    sarl $15, %edx
3511; SSSE3-NEXT:    movd %edx, %xmm1
3512; SSSE3-NEXT:    shrq $34, %rax
3513; SSSE3-NEXT:    shll $15, %eax
3514; SSSE3-NEXT:    sarl $15, %eax
3515; SSSE3-NEXT:    movd %eax, %xmm2
3516; SSSE3-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
3517; SSSE3-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0]
3518; SSSE3-NEXT:    retq
3519;
3520; SSE41-LABEL: sext_4i17_to_4i32:
3521; SSE41:       # %bb.0:
3522; SSE41-NEXT:    movq (%rdi), %rax
3523; SSE41-NEXT:    movq %rax, %rcx
3524; SSE41-NEXT:    shrq $17, %rcx
3525; SSE41-NEXT:    shll $15, %ecx
3526; SSE41-NEXT:    sarl $15, %ecx
3527; SSE41-NEXT:    movl %eax, %edx
3528; SSE41-NEXT:    shll $15, %edx
3529; SSE41-NEXT:    sarl $15, %edx
3530; SSE41-NEXT:    movd %edx, %xmm0
3531; SSE41-NEXT:    pinsrd $1, %ecx, %xmm0
3532; SSE41-NEXT:    movq %rax, %rcx
3533; SSE41-NEXT:    shrq $34, %rcx
3534; SSE41-NEXT:    shll $15, %ecx
3535; SSE41-NEXT:    sarl $15, %ecx
3536; SSE41-NEXT:    pinsrd $2, %ecx, %xmm0
3537; SSE41-NEXT:    movl 8(%rdi), %ecx
3538; SSE41-NEXT:    shll $13, %ecx
3539; SSE41-NEXT:    shrq $51, %rax
3540; SSE41-NEXT:    orl %ecx, %eax
3541; SSE41-NEXT:    shll $15, %eax
3542; SSE41-NEXT:    sarl $15, %eax
3543; SSE41-NEXT:    pinsrd $3, %eax, %xmm0
3544; SSE41-NEXT:    retq
3545;
3546; AVX-LABEL: sext_4i17_to_4i32:
3547; AVX:       # %bb.0:
3548; AVX-NEXT:    movq (%rdi), %rax
3549; AVX-NEXT:    movq %rax, %rcx
3550; AVX-NEXT:    shrq $17, %rcx
3551; AVX-NEXT:    shll $15, %ecx
3552; AVX-NEXT:    sarl $15, %ecx
3553; AVX-NEXT:    movl %eax, %edx
3554; AVX-NEXT:    shll $15, %edx
3555; AVX-NEXT:    sarl $15, %edx
3556; AVX-NEXT:    vmovd %edx, %xmm0
3557; AVX-NEXT:    vpinsrd $1, %ecx, %xmm0, %xmm0
3558; AVX-NEXT:    movq %rax, %rcx
3559; AVX-NEXT:    shrq $34, %rcx
3560; AVX-NEXT:    shll $15, %ecx
3561; AVX-NEXT:    sarl $15, %ecx
3562; AVX-NEXT:    vpinsrd $2, %ecx, %xmm0, %xmm0
3563; AVX-NEXT:    movl 8(%rdi), %ecx
3564; AVX-NEXT:    shll $13, %ecx
3565; AVX-NEXT:    shrq $51, %rax
3566; AVX-NEXT:    orl %ecx, %eax
3567; AVX-NEXT:    shll $15, %eax
3568; AVX-NEXT:    sarl $15, %eax
3569; AVX-NEXT:    vpinsrd $3, %eax, %xmm0, %xmm0
3570; AVX-NEXT:    retq
3571;
3572; X86-SSE2-LABEL: sext_4i17_to_4i32:
3573; X86-SSE2:       # %bb.0:
3574; X86-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %eax
3575; X86-SSE2-NEXT:    movl (%eax), %ecx
3576; X86-SSE2-NEXT:    movl 4(%eax), %edx
3577; X86-SSE2-NEXT:    movl 8(%eax), %eax
3578; X86-SSE2-NEXT:    shldl $13, %edx, %eax
3579; X86-SSE2-NEXT:    shll $15, %eax
3580; X86-SSE2-NEXT:    sarl $15, %eax
3581; X86-SSE2-NEXT:    movd %eax, %xmm0
3582; X86-SSE2-NEXT:    movl %edx, %eax
3583; X86-SSE2-NEXT:    shll $13, %eax
3584; X86-SSE2-NEXT:    sarl $15, %eax
3585; X86-SSE2-NEXT:    movd %eax, %xmm1
3586; X86-SSE2-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
3587; X86-SSE2-NEXT:    shldl $15, %ecx, %edx
3588; X86-SSE2-NEXT:    shll $15, %ecx
3589; X86-SSE2-NEXT:    sarl $15, %ecx
3590; X86-SSE2-NEXT:    movd %ecx, %xmm0
3591; X86-SSE2-NEXT:    shll $15, %edx
3592; X86-SSE2-NEXT:    sarl $15, %edx
3593; X86-SSE2-NEXT:    movd %edx, %xmm2
3594; X86-SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
3595; X86-SSE2-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
3596; X86-SSE2-NEXT:    retl
3597;
3598; X86-SSE41-LABEL: sext_4i17_to_4i32:
3599; X86-SSE41:       # %bb.0:
3600; X86-SSE41-NEXT:    pushl %esi
3601; X86-SSE41-NEXT:    .cfi_def_cfa_offset 8
3602; X86-SSE41-NEXT:    .cfi_offset %esi, -8
3603; X86-SSE41-NEXT:    movl {{[0-9]+}}(%esp), %eax
3604; X86-SSE41-NEXT:    movl (%eax), %ecx
3605; X86-SSE41-NEXT:    movl 4(%eax), %edx
3606; X86-SSE41-NEXT:    movl %edx, %esi
3607; X86-SSE41-NEXT:    movl 8(%eax), %eax
3608; X86-SSE41-NEXT:    shldl $13, %edx, %eax
3609; X86-SSE41-NEXT:    shldl $15, %ecx, %edx
3610; X86-SSE41-NEXT:    shll $15, %edx
3611; X86-SSE41-NEXT:    sarl $15, %edx
3612; X86-SSE41-NEXT:    shll $15, %ecx
3613; X86-SSE41-NEXT:    sarl $15, %ecx
3614; X86-SSE41-NEXT:    movd %ecx, %xmm0
3615; X86-SSE41-NEXT:    pinsrd $1, %edx, %xmm0
3616; X86-SSE41-NEXT:    shll $13, %esi
3617; X86-SSE41-NEXT:    sarl $15, %esi
3618; X86-SSE41-NEXT:    pinsrd $2, %esi, %xmm0
3619; X86-SSE41-NEXT:    shll $15, %eax
3620; X86-SSE41-NEXT:    sarl $15, %eax
3621; X86-SSE41-NEXT:    pinsrd $3, %eax, %xmm0
3622; X86-SSE41-NEXT:    popl %esi
3623; X86-SSE41-NEXT:    .cfi_def_cfa_offset 4
3624; X86-SSE41-NEXT:    retl
3625  %a = load <4 x i17>, <4 x i17>* %ptr
3626  %b = sext <4 x i17> %a to <4 x i32>
3627  ret <4 x i32> %b
3628}
3629
3630define <8 x i64> @sext_8i6_to_8i64(i32 %x) nounwind uwtable readnone ssp {
3631; SSE2-LABEL: sext_8i6_to_8i64:
3632; SSE2:       # %bb.0: # %entry
3633; SSE2-NEXT:    movd %edi, %xmm0
3634; SSE2-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
3635; SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm0[0,0,0,0]
3636; SSE2-NEXT:    paddw {{.*}}(%rip), %xmm3
3637; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm3[0,0,0,0]
3638; SSE2-NEXT:    pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,5,5,5]
3639; SSE2-NEXT:    psllq $58, %xmm0
3640; SSE2-NEXT:    movdqa %xmm0, %xmm1
3641; SSE2-NEXT:    psrad $31, %xmm1
3642; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,3,2,3]
3643; SSE2-NEXT:    psrad $26, %xmm0
3644; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,3,2,3]
3645; SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
3646; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm3[1,1,1,1]
3647; SSE2-NEXT:    pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,5,5,5,5]
3648; SSE2-NEXT:    psllq $58, %xmm1
3649; SSE2-NEXT:    movdqa %xmm1, %xmm2
3650; SSE2-NEXT:    psrad $31, %xmm2
3651; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[1,3,2,3]
3652; SSE2-NEXT:    psrad $26, %xmm1
3653; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,3,2,3]
3654; SSE2-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
3655; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm3[2,2,2,2]
3656; SSE2-NEXT:    pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,5,5,5,5]
3657; SSE2-NEXT:    psllq $58, %xmm2
3658; SSE2-NEXT:    movdqa %xmm2, %xmm4
3659; SSE2-NEXT:    psrad $31, %xmm4
3660; SSE2-NEXT:    pshufd {{.*#+}} xmm4 = xmm4[1,3,2,3]
3661; SSE2-NEXT:    psrad $26, %xmm2
3662; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[1,3,2,3]
3663; SSE2-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1]
3664; SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm3[3,3,3,3]
3665; SSE2-NEXT:    pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,5,5,5,5]
3666; SSE2-NEXT:    psllq $58, %xmm3
3667; SSE2-NEXT:    movdqa %xmm3, %xmm4
3668; SSE2-NEXT:    psrad $31, %xmm4
3669; SSE2-NEXT:    pshufd {{.*#+}} xmm4 = xmm4[1,3,2,3]
3670; SSE2-NEXT:    psrad $26, %xmm3
3671; SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm3[1,3,2,3]
3672; SSE2-NEXT:    punpckldq {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1]
3673; SSE2-NEXT:    retq
3674;
3675; SSSE3-LABEL: sext_8i6_to_8i64:
3676; SSSE3:       # %bb.0: # %entry
3677; SSSE3-NEXT:    movd %edi, %xmm0
3678; SSSE3-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
3679; SSSE3-NEXT:    pshufd {{.*#+}} xmm3 = xmm0[0,0,0,0]
3680; SSSE3-NEXT:    paddw {{.*}}(%rip), %xmm3
3681; SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm3[0,0,0,0]
3682; SSSE3-NEXT:    pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,5,5,5]
3683; SSSE3-NEXT:    psllq $58, %xmm0
3684; SSSE3-NEXT:    movdqa %xmm0, %xmm1
3685; SSSE3-NEXT:    psrad $31, %xmm1
3686; SSSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,3,2,3]
3687; SSSE3-NEXT:    psrad $26, %xmm0
3688; SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,3,2,3]
3689; SSSE3-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
3690; SSSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm3[1,1,1,1]
3691; SSSE3-NEXT:    pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,5,5,5,5]
3692; SSSE3-NEXT:    psllq $58, %xmm1
3693; SSSE3-NEXT:    movdqa %xmm1, %xmm2
3694; SSSE3-NEXT:    psrad $31, %xmm2
3695; SSSE3-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[1,3,2,3]
3696; SSSE3-NEXT:    psrad $26, %xmm1
3697; SSSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,3,2,3]
3698; SSSE3-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
3699; SSSE3-NEXT:    pshufd {{.*#+}} xmm2 = xmm3[2,2,2,2]
3700; SSSE3-NEXT:    pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,5,5,5,5]
3701; SSSE3-NEXT:    psllq $58, %xmm2
3702; SSSE3-NEXT:    movdqa %xmm2, %xmm4
3703; SSSE3-NEXT:    psrad $31, %xmm4
3704; SSSE3-NEXT:    pshufd {{.*#+}} xmm4 = xmm4[1,3,2,3]
3705; SSSE3-NEXT:    psrad $26, %xmm2
3706; SSSE3-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[1,3,2,3]
3707; SSSE3-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1]
3708; SSSE3-NEXT:    pshufd {{.*#+}} xmm3 = xmm3[3,3,3,3]
3709; SSSE3-NEXT:    pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,5,5,5,5]
3710; SSSE3-NEXT:    psllq $58, %xmm3
3711; SSSE3-NEXT:    movdqa %xmm3, %xmm4
3712; SSSE3-NEXT:    psrad $31, %xmm4
3713; SSSE3-NEXT:    pshufd {{.*#+}} xmm4 = xmm4[1,3,2,3]
3714; SSSE3-NEXT:    psrad $26, %xmm3
3715; SSSE3-NEXT:    pshufd {{.*#+}} xmm3 = xmm3[1,3,2,3]
3716; SSSE3-NEXT:    punpckldq {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1]
3717; SSSE3-NEXT:    retq
3718;
3719; SSE41-LABEL: sext_8i6_to_8i64:
3720; SSE41:       # %bb.0: # %entry
3721; SSE41-NEXT:    movd %edi, %xmm0
3722; SSE41-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
3723; SSE41-NEXT:    pshufd {{.*#+}} xmm3 = xmm0[0,0,0,0]
3724; SSE41-NEXT:    paddw {{.*}}(%rip), %xmm3
3725; SSE41-NEXT:    pmovzxwq {{.*#+}} xmm0 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero
3726; SSE41-NEXT:    psllq $58, %xmm0
3727; SSE41-NEXT:    movdqa %xmm0, %xmm1
3728; SSE41-NEXT:    psrad $31, %xmm1
3729; SSE41-NEXT:    psrad $26, %xmm0
3730; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
3731; SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7]
3732; SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm3[1,1,1,1]
3733; SSE41-NEXT:    pmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
3734; SSE41-NEXT:    psllq $58, %xmm1
3735; SSE41-NEXT:    movdqa %xmm1, %xmm2
3736; SSE41-NEXT:    psrad $31, %xmm2
3737; SSE41-NEXT:    psrad $26, %xmm1
3738; SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
3739; SSE41-NEXT:    pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7]
3740; SSE41-NEXT:    pshufd {{.*#+}} xmm2 = xmm3[2,3,2,3]
3741; SSE41-NEXT:    pmovzxwq {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero
3742; SSE41-NEXT:    psllq $58, %xmm2
3743; SSE41-NEXT:    movdqa %xmm2, %xmm4
3744; SSE41-NEXT:    psrad $31, %xmm4
3745; SSE41-NEXT:    psrad $26, %xmm2
3746; SSE41-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
3747; SSE41-NEXT:    pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm4[2,3],xmm2[4,5],xmm4[6,7]
3748; SSE41-NEXT:    pshufd {{.*#+}} xmm3 = xmm3[3,3,3,3]
3749; SSE41-NEXT:    pmovzxwq {{.*#+}} xmm3 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero
3750; SSE41-NEXT:    psllq $58, %xmm3
3751; SSE41-NEXT:    movdqa %xmm3, %xmm4
3752; SSE41-NEXT:    psrad $31, %xmm4
3753; SSE41-NEXT:    psrad $26, %xmm3
3754; SSE41-NEXT:    pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3]
3755; SSE41-NEXT:    pblendw {{.*#+}} xmm3 = xmm3[0,1],xmm4[2,3],xmm3[4,5],xmm4[6,7]
3756; SSE41-NEXT:    retq
3757;
3758; AVX1-LABEL: sext_8i6_to_8i64:
3759; AVX1:       # %bb.0: # %entry
3760; AVX1-NEXT:    vmovd %edi, %xmm0
3761; AVX1-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
3762; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
3763; AVX1-NEXT:    vpaddw {{.*}}(%rip), %xmm0, %xmm0
3764; AVX1-NEXT:    vpsllw $10, %xmm0, %xmm0
3765; AVX1-NEXT:    vpsraw $10, %xmm0, %xmm1
3766; AVX1-NEXT:    vpmovsxwq %xmm1, %xmm0
3767; AVX1-NEXT:    vpshufd {{.*#+}} xmm2 = xmm1[1,1,1,1]
3768; AVX1-NEXT:    vpmovsxwq %xmm2, %xmm2
3769; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
3770; AVX1-NEXT:    vpshufd {{.*#+}} xmm2 = xmm1[2,3,2,3]
3771; AVX1-NEXT:    vpmovsxwq %xmm2, %xmm2
3772; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[3,3,3,3]
3773; AVX1-NEXT:    vpmovsxwq %xmm1, %xmm1
3774; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm2, %ymm1
3775; AVX1-NEXT:    retq
3776;
3777; AVX2-LABEL: sext_8i6_to_8i64:
3778; AVX2:       # %bb.0: # %entry
3779; AVX2-NEXT:    vmovd %edi, %xmm0
3780; AVX2-NEXT:    vpbroadcastw %xmm0, %xmm0
3781; AVX2-NEXT:    vpaddw {{.*}}(%rip), %xmm0, %xmm0
3782; AVX2-NEXT:    vpsllw $10, %xmm0, %xmm0
3783; AVX2-NEXT:    vpsraw $10, %xmm0, %xmm1
3784; AVX2-NEXT:    vpmovsxwq %xmm1, %ymm0
3785; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]
3786; AVX2-NEXT:    vpmovsxwq %xmm1, %ymm1
3787; AVX2-NEXT:    retq
3788;
3789; AVX512-LABEL: sext_8i6_to_8i64:
3790; AVX512:       # %bb.0: # %entry
3791; AVX512-NEXT:    vmovd %edi, %xmm0
3792; AVX512-NEXT:    vpbroadcastw %xmm0, %xmm0
3793; AVX512-NEXT:    vpaddw {{.*}}(%rip), %xmm0, %xmm0
3794; AVX512-NEXT:    vpmovzxwq {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero
3795; AVX512-NEXT:    vpsllq $58, %zmm0, %zmm0
3796; AVX512-NEXT:    vpsraq $58, %zmm0, %zmm0
3797; AVX512-NEXT:    retq
3798;
3799; X86-SSE2-LABEL: sext_8i6_to_8i64:
3800; X86-SSE2:       # %bb.0: # %entry
3801; X86-SSE2-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
3802; X86-SSE2-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
3803; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm0[0,0,0,0]
3804; X86-SSE2-NEXT:    paddw {{\.LCPI.*}}, %xmm3
3805; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm3[0,0,0,0]
3806; X86-SSE2-NEXT:    pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,5,5,5]
3807; X86-SSE2-NEXT:    psllq $58, %xmm0
3808; X86-SSE2-NEXT:    movdqa %xmm0, %xmm1
3809; X86-SSE2-NEXT:    psrad $31, %xmm1
3810; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,3,2,3]
3811; X86-SSE2-NEXT:    psrad $26, %xmm0
3812; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,3,2,3]
3813; X86-SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
3814; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm3[1,1,1,1]
3815; X86-SSE2-NEXT:    pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,5,5,5,5]
3816; X86-SSE2-NEXT:    psllq $58, %xmm1
3817; X86-SSE2-NEXT:    movdqa %xmm1, %xmm2
3818; X86-SSE2-NEXT:    psrad $31, %xmm2
3819; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[1,3,2,3]
3820; X86-SSE2-NEXT:    psrad $26, %xmm1
3821; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,3,2,3]
3822; X86-SSE2-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
3823; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm3[2,2,2,2]
3824; X86-SSE2-NEXT:    pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,5,5,5,5]
3825; X86-SSE2-NEXT:    psllq $58, %xmm2
3826; X86-SSE2-NEXT:    movdqa %xmm2, %xmm4
3827; X86-SSE2-NEXT:    psrad $31, %xmm4
3828; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm4 = xmm4[1,3,2,3]
3829; X86-SSE2-NEXT:    psrad $26, %xmm2
3830; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[1,3,2,3]
3831; X86-SSE2-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1]
3832; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm3[3,3,3,3]
3833; X86-SSE2-NEXT:    pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,5,5,5,5]
3834; X86-SSE2-NEXT:    psllq $58, %xmm3
3835; X86-SSE2-NEXT:    movdqa %xmm3, %xmm4
3836; X86-SSE2-NEXT:    psrad $31, %xmm4
3837; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm4 = xmm4[1,3,2,3]
3838; X86-SSE2-NEXT:    psrad $26, %xmm3
3839; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm3[1,3,2,3]
3840; X86-SSE2-NEXT:    punpckldq {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1]
3841; X86-SSE2-NEXT:    retl
3842;
3843; X86-SSE41-LABEL: sext_8i6_to_8i64:
3844; X86-SSE41:       # %bb.0: # %entry
3845; X86-SSE41-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
3846; X86-SSE41-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
3847; X86-SSE41-NEXT:    pshufd {{.*#+}} xmm3 = xmm0[0,0,0,0]
3848; X86-SSE41-NEXT:    paddw {{\.LCPI.*}}, %xmm3
3849; X86-SSE41-NEXT:    pmovzxwq {{.*#+}} xmm0 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero
3850; X86-SSE41-NEXT:    psllq $58, %xmm0
3851; X86-SSE41-NEXT:    movdqa %xmm0, %xmm1
3852; X86-SSE41-NEXT:    psrad $31, %xmm1
3853; X86-SSE41-NEXT:    psrad $26, %xmm0
3854; X86-SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
3855; X86-SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7]
3856; X86-SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm3[1,1,1,1]
3857; X86-SSE41-NEXT:    pmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
3858; X86-SSE41-NEXT:    psllq $58, %xmm1
3859; X86-SSE41-NEXT:    movdqa %xmm1, %xmm2
3860; X86-SSE41-NEXT:    psrad $31, %xmm2
3861; X86-SSE41-NEXT:    psrad $26, %xmm1
3862; X86-SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
3863; X86-SSE41-NEXT:    pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7]
3864; X86-SSE41-NEXT:    pshufd {{.*#+}} xmm2 = xmm3[2,3,2,3]
3865; X86-SSE41-NEXT:    pmovzxwq {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero
3866; X86-SSE41-NEXT:    psllq $58, %xmm2
3867; X86-SSE41-NEXT:    movdqa %xmm2, %xmm4
3868; X86-SSE41-NEXT:    psrad $31, %xmm4
3869; X86-SSE41-NEXT:    psrad $26, %xmm2
3870; X86-SSE41-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
3871; X86-SSE41-NEXT:    pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm4[2,3],xmm2[4,5],xmm4[6,7]
3872; X86-SSE41-NEXT:    pshufd {{.*#+}} xmm3 = xmm3[3,3,3,3]
3873; X86-SSE41-NEXT:    pmovzxwq {{.*#+}} xmm3 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero
3874; X86-SSE41-NEXT:    psllq $58, %xmm3
3875; X86-SSE41-NEXT:    movdqa %xmm3, %xmm4
3876; X86-SSE41-NEXT:    psrad $31, %xmm4
3877; X86-SSE41-NEXT:    psrad $26, %xmm3
3878; X86-SSE41-NEXT:    pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3]
3879; X86-SSE41-NEXT:    pblendw {{.*#+}} xmm3 = xmm3[0,1],xmm4[2,3],xmm3[4,5],xmm4[6,7]
3880; X86-SSE41-NEXT:    retl
3881entry:
3882  %a = trunc i32 %x to i6
3883  %b = insertelement <8 x i6> undef, i6 %a, i32 0
3884  %c = shufflevector <8 x i6> %b, <8 x i6> undef, <8 x i32> zeroinitializer
3885  %d = add <8 x i6> %c, <i6 0, i6 1, i6 2, i6 3, i6 4, i6 5, i6 6, i6 7>
3886  %e = sext <8 x i6> %d to <8 x i64>
3887  ret <8 x i64> %e
3888}
3889
; Negate-in-the-middle extension chain: zero-extend <8 x i8> to <8 x i16>,
; negate it (0 - z, flagged nsw), then sign-extend the result to <8 x i32>.
; The zero-extended input lies in [0, 255], so the negation lies in [-255, 0]
; and always fits in i16.
; In the expected output below, the SSE-level runs (64-bit and 32-bit) do the
; subtract at 16 bits (psubw) and then sign-extend, while the AVX-family runs
; zero-extend straight to 32 bits (vpmovzxbd) and subtract there (vpsubd).
3890define <8 x i32> @zext_negate_sext(<8 x i8> %x) {
3891; SSE2-LABEL: zext_negate_sext:
3892; SSE2:       # %bb.0:
3893; SSE2-NEXT:    pxor %xmm1, %xmm1
3894; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
3895; SSE2-NEXT:    psubw %xmm0, %xmm1
3896; SSE2-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
3897; SSE2-NEXT:    psrad $16, %xmm0
3898; SSE2-NEXT:    punpckhwd {{.*#+}} xmm1 = xmm1[4,4,5,5,6,6,7,7]
3899; SSE2-NEXT:    psrad $16, %xmm1
3900; SSE2-NEXT:    retq
3901;
3902; SSSE3-LABEL: zext_negate_sext:
3903; SSSE3:       # %bb.0:
3904; SSSE3-NEXT:    pxor %xmm1, %xmm1
3905; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
3906; SSSE3-NEXT:    psubw %xmm0, %xmm1
3907; SSSE3-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
3908; SSSE3-NEXT:    psrad $16, %xmm0
3909; SSSE3-NEXT:    punpckhwd {{.*#+}} xmm1 = xmm1[4,4,5,5,6,6,7,7]
3910; SSSE3-NEXT:    psrad $16, %xmm1
3911; SSSE3-NEXT:    retq
3912;
3913; SSE41-LABEL: zext_negate_sext:
3914; SSE41:       # %bb.0:
3915; SSE41-NEXT:    pmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
3916; SSE41-NEXT:    pxor %xmm1, %xmm1
3917; SSE41-NEXT:    psubw %xmm0, %xmm1
3918; SSE41-NEXT:    pmovsxwd %xmm1, %xmm0
3919; SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]
3920; SSE41-NEXT:    pmovsxwd %xmm1, %xmm1
3921; SSE41-NEXT:    retq
3922;
3923; AVX1-LABEL: zext_negate_sext:
3924; AVX1:       # %bb.0:
3925; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
3926; AVX1-NEXT:    vpmovzxbd {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero
3927; AVX1-NEXT:    vpxor %xmm2, %xmm2, %xmm2
3928; AVX1-NEXT:    vpsubd %xmm1, %xmm2, %xmm1
3929; AVX1-NEXT:    vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
3930; AVX1-NEXT:    vpsubd %xmm0, %xmm2, %xmm0
3931; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
3932; AVX1-NEXT:    retq
3933;
3934; AVX2-LABEL: zext_negate_sext:
3935; AVX2:       # %bb.0:
3936; AVX2-NEXT:    vpmovzxbd {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero
3937; AVX2-NEXT:    vpxor %xmm1, %xmm1, %xmm1
3938; AVX2-NEXT:    vpsubd %ymm0, %ymm1, %ymm0
3939; AVX2-NEXT:    retq
3940;
3941; AVX512-LABEL: zext_negate_sext:
3942; AVX512:       # %bb.0:
3943; AVX512-NEXT:    vpmovzxbd {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero
3944; AVX512-NEXT:    vpxor %xmm1, %xmm1, %xmm1
3945; AVX512-NEXT:    vpsubd %ymm0, %ymm1, %ymm0
3946; AVX512-NEXT:    retq
3947;
3948; X86-SSE2-LABEL: zext_negate_sext:
3949; X86-SSE2:       # %bb.0:
3950; X86-SSE2-NEXT:    pxor %xmm1, %xmm1
3951; X86-SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
3952; X86-SSE2-NEXT:    psubw %xmm0, %xmm1
3953; X86-SSE2-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
3954; X86-SSE2-NEXT:    psrad $16, %xmm0
3955; X86-SSE2-NEXT:    punpckhwd {{.*#+}} xmm1 = xmm1[4,4,5,5,6,6,7,7]
3956; X86-SSE2-NEXT:    psrad $16, %xmm1
3957; X86-SSE2-NEXT:    retl
3958;
3959; X86-SSE41-LABEL: zext_negate_sext:
3960; X86-SSE41:       # %bb.0:
3961; X86-SSE41-NEXT:    pmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
3962; X86-SSE41-NEXT:    pxor %xmm1, %xmm1
3963; X86-SSE41-NEXT:    psubw %xmm0, %xmm1
3964; X86-SSE41-NEXT:    pmovsxwd %xmm1, %xmm0
3965; X86-SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]
3966; X86-SSE41-NEXT:    pmovsxwd %xmm1, %xmm1
3967; X86-SSE41-NEXT:    retl
; IR under test: zext i8->i16, negate at i16, sext i16->i32.
3968  %z = zext <8 x i8> %x to <8 x i16>
3969  %neg = sub nsw <8 x i16> zeroinitializer, %z
3970  %r = sext <8 x i16> %neg to <8 x i32>
3971  ret <8 x i32> %r
3972}
3973
; Decrement-in-the-middle extension chain: zero-extend <8 x i8> to <8 x i16>,
; add a splat of -1, then sign-extend the result to <8 x i32>.
; The zero-extended input lies in [0, 255], so the decremented value lies in
; [-1, 254] and always fits in i16.
; In the expected output below, the SSE-level runs (64-bit and 32-bit) build
; the all-ones vector with pcmpeqd, add at 16 bits (paddw) and then
; sign-extend, while the AVX-family runs zero-extend straight to 32 bits
; (vpmovzxbd) and add the all-ones vector there (vpaddd).
3974define <8 x i32> @zext_decremenet_sext(<8 x i8> %x) {
3975; SSE2-LABEL: zext_decremenet_sext:
3976; SSE2:       # %bb.0:
3977; SSE2-NEXT:    pxor %xmm1, %xmm1
3978; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
3979; SSE2-NEXT:    pcmpeqd %xmm1, %xmm1
3980; SSE2-NEXT:    paddw %xmm0, %xmm1
3981; SSE2-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
3982; SSE2-NEXT:    psrad $16, %xmm0
3983; SSE2-NEXT:    punpckhwd {{.*#+}} xmm1 = xmm1[4,4,5,5,6,6,7,7]
3984; SSE2-NEXT:    psrad $16, %xmm1
3985; SSE2-NEXT:    retq
3986;
3987; SSSE3-LABEL: zext_decremenet_sext:
3988; SSSE3:       # %bb.0:
3989; SSSE3-NEXT:    pxor %xmm1, %xmm1
3990; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
3991; SSSE3-NEXT:    pcmpeqd %xmm1, %xmm1
3992; SSSE3-NEXT:    paddw %xmm0, %xmm1
3993; SSSE3-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
3994; SSSE3-NEXT:    psrad $16, %xmm0
3995; SSSE3-NEXT:    punpckhwd {{.*#+}} xmm1 = xmm1[4,4,5,5,6,6,7,7]
3996; SSSE3-NEXT:    psrad $16, %xmm1
3997; SSSE3-NEXT:    retq
3998;
3999; SSE41-LABEL: zext_decremenet_sext:
4000; SSE41:       # %bb.0:
4001; SSE41-NEXT:    pmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
4002; SSE41-NEXT:    pcmpeqd %xmm1, %xmm1
4003; SSE41-NEXT:    paddw %xmm0, %xmm1
4004; SSE41-NEXT:    pmovsxwd %xmm1, %xmm0
4005; SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]
4006; SSE41-NEXT:    pmovsxwd %xmm1, %xmm1
4007; SSE41-NEXT:    retq
4008;
4009; AVX1-LABEL: zext_decremenet_sext:
4010; AVX1:       # %bb.0:
4011; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
4012; AVX1-NEXT:    vpmovzxbd {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero
4013; AVX1-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2
4014; AVX1-NEXT:    vpaddd %xmm2, %xmm1, %xmm1
4015; AVX1-NEXT:    vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
4016; AVX1-NEXT:    vpaddd %xmm2, %xmm0, %xmm0
4017; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
4018; AVX1-NEXT:    retq
4019;
4020; AVX2-LABEL: zext_decremenet_sext:
4021; AVX2:       # %bb.0:
4022; AVX2-NEXT:    vpmovzxbd {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero
4023; AVX2-NEXT:    vpcmpeqd %ymm1, %ymm1, %ymm1
4024; AVX2-NEXT:    vpaddd %ymm1, %ymm0, %ymm0
4025; AVX2-NEXT:    retq
4026;
4027; AVX512-LABEL: zext_decremenet_sext:
4028; AVX512:       # %bb.0:
4029; AVX512-NEXT:    vpmovzxbd {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero
4030; AVX512-NEXT:    vpcmpeqd %ymm1, %ymm1, %ymm1
4031; AVX512-NEXT:    vpaddd %ymm1, %ymm0, %ymm0
4032; AVX512-NEXT:    retq
4033;
4034; X86-SSE2-LABEL: zext_decremenet_sext:
4035; X86-SSE2:       # %bb.0:
4036; X86-SSE2-NEXT:    pxor %xmm1, %xmm1
4037; X86-SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
4038; X86-SSE2-NEXT:    pcmpeqd %xmm1, %xmm1
4039; X86-SSE2-NEXT:    paddw %xmm0, %xmm1
4040; X86-SSE2-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
4041; X86-SSE2-NEXT:    psrad $16, %xmm0
4042; X86-SSE2-NEXT:    punpckhwd {{.*#+}} xmm1 = xmm1[4,4,5,5,6,6,7,7]
4043; X86-SSE2-NEXT:    psrad $16, %xmm1
4044; X86-SSE2-NEXT:    retl
4045;
4046; X86-SSE41-LABEL: zext_decremenet_sext:
4047; X86-SSE41:       # %bb.0:
4048; X86-SSE41-NEXT:    pmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
4049; X86-SSE41-NEXT:    pcmpeqd %xmm1, %xmm1
4050; X86-SSE41-NEXT:    paddw %xmm0, %xmm1
4051; X86-SSE41-NEXT:    pmovsxwd %xmm1, %xmm0
4052; X86-SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]
4053; X86-SSE41-NEXT:    pmovsxwd %xmm1, %xmm1
4054; X86-SSE41-NEXT:    retl
; IR under test: zext i8->i16, add splat(-1) at i16, sext i16->i32.
4055  %z = zext <8 x i8> %x to <8 x i16>
4056  %dec = add <8 x i16> %z, <i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1>
4057  %r = sext <8 x i16> %dec to <8 x i32>
4058  ret <8 x i32> %r
4059}
4060