1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefix=SSE --check-prefix=SSE2
3; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+ssse3 | FileCheck %s --check-prefix=SSE --check-prefix=SSSE3
4; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefix=SSE --check-prefix=SSE41
5; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefix=AVX --check-prefix=AVX1
6; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=AVX --check-prefix=AVX2
7
8define <8 x i16> @zext_16i8_to_8i16(<16 x i8> %A) nounwind uwtable readnone ssp {
9; SSE2-LABEL: zext_16i8_to_8i16:
10; SSE2:       # BB#0: # %entry
11; SSE2-NEXT:    pxor %xmm1, %xmm1
12; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
13; SSE2-NEXT:    retq
14;
15; SSSE3-LABEL: zext_16i8_to_8i16:
16; SSSE3:       # BB#0: # %entry
17; SSSE3-NEXT:    pxor %xmm1, %xmm1
18; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
19; SSSE3-NEXT:    retq
20;
21; SSE41-LABEL: zext_16i8_to_8i16:
22; SSE41:       # BB#0: # %entry
23; SSE41-NEXT:    pmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
24; SSE41-NEXT:    retq
25;
26; AVX-LABEL: zext_16i8_to_8i16:
27; AVX:       # BB#0: # %entry
28; AVX-NEXT:    vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
29; AVX-NEXT:    retq
30entry:
31  %B = shufflevector <16 x i8> %A, <16 x i8> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
32  %C = zext <8 x i8> %B to <8 x i16>
33  ret <8 x i16> %C
34}
35
36; PR17654
37define <16 x i16> @zext_16i8_to_16i16(<16 x i8> %A) {
38; SSE2-LABEL: zext_16i8_to_16i16:
39; SSE2:       # BB#0: # %entry
40; SSE2-NEXT:    movdqa %xmm0, %xmm1
41; SSE2-NEXT:    pxor %xmm2, %xmm2
42; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
43; SSE2-NEXT:    punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm2[8],xmm1[9],xmm2[9],xmm1[10],xmm2[10],xmm1[11],xmm2[11],xmm1[12],xmm2[12],xmm1[13],xmm2[13],xmm1[14],xmm2[14],xmm1[15],xmm2[15]
44; SSE2-NEXT:    retq
45;
46; SSSE3-LABEL: zext_16i8_to_16i16:
47; SSSE3:       # BB#0: # %entry
48; SSSE3-NEXT:    movdqa %xmm0, %xmm1
49; SSSE3-NEXT:    pxor %xmm2, %xmm2
50; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
51; SSSE3-NEXT:    punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm2[8],xmm1[9],xmm2[9],xmm1[10],xmm2[10],xmm1[11],xmm2[11],xmm1[12],xmm2[12],xmm1[13],xmm2[13],xmm1[14],xmm2[14],xmm1[15],xmm2[15]
52; SSSE3-NEXT:    retq
53;
54; SSE41-LABEL: zext_16i8_to_16i16:
55; SSE41:       # BB#0: # %entry
56; SSE41-NEXT:    movdqa %xmm0, %xmm1
57; SSE41-NEXT:    pxor %xmm2, %xmm2
58; SSE41-NEXT:    pmovzxbw {{.*#+}} xmm0 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
59; SSE41-NEXT:    punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm2[8],xmm1[9],xmm2[9],xmm1[10],xmm2[10],xmm1[11],xmm2[11],xmm1[12],xmm2[12],xmm1[13],xmm2[13],xmm1[14],xmm2[14],xmm1[15],xmm2[15]
60; SSE41-NEXT:    retq
61;
62; AVX1-LABEL: zext_16i8_to_16i16:
63; AVX1:       # BB#0: # %entry
64; AVX1-NEXT:    vpxor %xmm1, %xmm1, %xmm1
65; AVX1-NEXT:    vpunpckhbw {{.*#+}} xmm1 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15]
66; AVX1-NEXT:    vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
67; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
68; AVX1-NEXT:    retq
69;
70; AVX2-LABEL: zext_16i8_to_16i16:
71; AVX2:       # BB#0: # %entry
72; AVX2-NEXT:    vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
73; AVX2-NEXT:    retq
74entry:
75  %B = zext <16 x i8> %A to <16 x i16>
76  ret <16 x i16> %B
77}
78
79define <4 x i32> @zext_16i8_to_4i32(<16 x i8> %A) nounwind uwtable readnone ssp {
80; SSE2-LABEL: zext_16i8_to_4i32:
81; SSE2:       # BB#0: # %entry
82; SSE2-NEXT:    pxor %xmm1, %xmm1
83; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
84; SSE2-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
85; SSE2-NEXT:    retq
86;
87; SSSE3-LABEL: zext_16i8_to_4i32:
88; SSSE3:       # BB#0: # %entry
89; SSSE3-NEXT:    pxor %xmm1, %xmm1
90; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
91; SSSE3-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
92; SSSE3-NEXT:    retq
93;
94; SSE41-LABEL: zext_16i8_to_4i32:
95; SSE41:       # BB#0: # %entry
96; SSE41-NEXT:    pmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
97; SSE41-NEXT:    retq
98;
99; AVX-LABEL: zext_16i8_to_4i32:
100; AVX:       # BB#0: # %entry
101; AVX-NEXT:    vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
102; AVX-NEXT:    retq
103entry:
104  %B = shufflevector <16 x i8> %A, <16 x i8> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
105  %C = zext <4 x i8> %B to <4 x i32>
106  ret <4 x i32> %C
107}
108
109define <8 x i32> @zext_16i8_to_8i32(<16 x i8> %A) nounwind uwtable readnone ssp {
110; SSE2-LABEL: zext_16i8_to_8i32:
111; SSE2:       # BB#0: # %entry
112; SSE2-NEXT:    movdqa %xmm0, %xmm1
113; SSE2-NEXT:    pxor %xmm2, %xmm2
114; SSE2-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
115; SSE2-NEXT:    movdqa %xmm1, %xmm0
116; SSE2-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
117; SSE2-NEXT:    punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
118; SSE2-NEXT:    retq
119;
120; SSSE3-LABEL: zext_16i8_to_8i32:
121; SSSE3:       # BB#0: # %entry
122; SSSE3-NEXT:    movdqa %xmm0, %xmm1
123; SSSE3-NEXT:    pxor %xmm2, %xmm2
124; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
125; SSSE3-NEXT:    movdqa %xmm1, %xmm0
126; SSSE3-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
127; SSSE3-NEXT:    punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
128; SSSE3-NEXT:    retq
129;
130; SSE41-LABEL: zext_16i8_to_8i32:
131; SSE41:       # BB#0: # %entry
132; SSE41-NEXT:    pmovzxbd {{.*#+}} xmm2 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
133; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,2,3]
134; SSE41-NEXT:    pmovzxbd {{.*#+}} xmm1 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
135; SSE41-NEXT:    movdqa %xmm2, %xmm0
136; SSE41-NEXT:    retq
137;
138; AVX1-LABEL: zext_16i8_to_8i32:
139; AVX1:       # BB#0: # %entry
140; AVX1-NEXT:    vpmovzxbw {{.*#+}} xmm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
141; AVX1-NEXT:    vpunpckhwd {{.*#+}} xmm1 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
142; AVX1-NEXT:    vmovdqa {{.*#+}} xmm2 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0]
143; AVX1-NEXT:    vpand %xmm2, %xmm1, %xmm1
144; AVX1-NEXT:    vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
145; AVX1-NEXT:    vpand %xmm2, %xmm0, %xmm0
146; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
147; AVX1-NEXT:    retq
148;
149; AVX2-LABEL: zext_16i8_to_8i32:
150; AVX2:       # BB#0: # %entry
151; AVX2-NEXT:    vpmovzxbd {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero
152; AVX2-NEXT:    vpand {{.*}}(%rip), %ymm0, %ymm0
153; AVX2-NEXT:    retq
154entry:
155  %B = shufflevector <16 x i8> %A, <16 x i8> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
156  %C = zext <8 x i8> %B to <8 x i32>
157  ret <8 x i32> %C
158}
159
160define <2 x i64> @zext_16i8_to_2i64(<16 x i8> %A) nounwind uwtable readnone ssp {
161; SSE2-LABEL: zext_16i8_to_2i64:
162; SSE2:       # BB#0: # %entry
163; SSE2-NEXT:    pxor %xmm1, %xmm1
164; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
165; SSE2-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
166; SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
167; SSE2-NEXT:    retq
168;
169; SSSE3-LABEL: zext_16i8_to_2i64:
170; SSSE3:       # BB#0: # %entry
171; SSSE3-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero
172; SSSE3-NEXT:    retq
173;
174; SSE41-LABEL: zext_16i8_to_2i64:
175; SSE41:       # BB#0: # %entry
176; SSE41-NEXT:    pmovzxbq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero
177; SSE41-NEXT:    retq
178;
179; AVX-LABEL: zext_16i8_to_2i64:
180; AVX:       # BB#0: # %entry
181; AVX-NEXT:    vpmovzxbq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero
182; AVX-NEXT:    retq
183entry:
184  %B = shufflevector <16 x i8> %A, <16 x i8> undef, <2 x i32> <i32 0, i32 1>
185  %C = zext <2 x i8> %B to <2 x i64>
186  ret <2 x i64> %C
187}
188
189define <4 x i64> @zext_16i8_to_4i64(<16 x i8> %A) nounwind uwtable readnone ssp {
190; SSE2-LABEL: zext_16i8_to_4i64:
191; SSE2:       # BB#0: # %entry
192; SSE2-NEXT:    movdqa %xmm0, %xmm1
193; SSE2-NEXT:    pxor %xmm2, %xmm2
194; SSE2-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
195; SSE2-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
196; SSE2-NEXT:    movdqa %xmm1, %xmm0
197; SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
198; SSE2-NEXT:    punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm2[2],xmm1[3],xmm2[3]
199; SSE2-NEXT:    retq
200;
201; SSSE3-LABEL: zext_16i8_to_4i64:
202; SSSE3:       # BB#0: # %entry
203; SSSE3-NEXT:    movdqa %xmm0, %xmm1
204; SSSE3-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero
205; SSSE3-NEXT:    pshufb {{.*#+}} xmm1 = xmm1[2],zero,zero,zero,zero,zero,zero,zero,xmm1[3],zero,zero,zero,zero,zero,zero,zero
206; SSSE3-NEXT:    retq
207;
208; SSE41-LABEL: zext_16i8_to_4i64:
209; SSE41:       # BB#0: # %entry
210; SSE41-NEXT:    pmovzxbq {{.*#+}} xmm2 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero
211; SSE41-NEXT:    psrld $16, %xmm0
212; SSE41-NEXT:    pmovzxbq {{.*#+}} xmm1 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero
213; SSE41-NEXT:    movdqa %xmm2, %xmm0
214; SSE41-NEXT:    retq
215;
216; AVX1-LABEL: zext_16i8_to_4i64:
217; AVX1:       # BB#0: # %entry
218; AVX1-NEXT:    vpmovzxbd {{.*#+}} xmm1 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
219; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[2,2,3,3]
220; AVX1-NEXT:    vmovdqa {{.*#+}} xmm2 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0]
221; AVX1-NEXT:    vpand %xmm2, %xmm1, %xmm1
222; AVX1-NEXT:    vpmovzxbq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero
223; AVX1-NEXT:    vpand %xmm2, %xmm0, %xmm0
224; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
225; AVX1-NEXT:    retq
226;
227; AVX2-LABEL: zext_16i8_to_4i64:
228; AVX2:       # BB#0: # %entry
229; AVX2-NEXT:    vpmovzxbq {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero,xmm0[2],zero,zero,zero,zero,zero,zero,zero,xmm0[3],zero,zero,zero,zero,zero,zero,zero
230; AVX2-NEXT:    vpand {{.*}}(%rip), %ymm0, %ymm0
231; AVX2-NEXT:    retq
232entry:
233  %B = shufflevector <16 x i8> %A, <16 x i8> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
234  %C = zext <4 x i8> %B to <4 x i64>
235  ret <4 x i64> %C
236}
237
238define <4 x i32> @zext_8i16_to_4i32(<8 x i16> %A) nounwind uwtable readnone ssp {
239; SSE2-LABEL: zext_8i16_to_4i32:
240; SSE2:       # BB#0: # %entry
241; SSE2-NEXT:    pxor %xmm1, %xmm1
242; SSE2-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
243; SSE2-NEXT:    retq
244;
245; SSSE3-LABEL: zext_8i16_to_4i32:
246; SSSE3:       # BB#0: # %entry
247; SSSE3-NEXT:    pxor %xmm1, %xmm1
248; SSSE3-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
249; SSSE3-NEXT:    retq
250;
251; SSE41-LABEL: zext_8i16_to_4i32:
252; SSE41:       # BB#0: # %entry
253; SSE41-NEXT:    pmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
254; SSE41-NEXT:    retq
255;
256; AVX-LABEL: zext_8i16_to_4i32:
257; AVX:       # BB#0: # %entry
258; AVX-NEXT:    vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
259; AVX-NEXT:    retq
260entry:
261  %B = shufflevector <8 x i16> %A, <8 x i16> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
262  %C = zext <4 x i16> %B to <4 x i32>
263  ret <4 x i32> %C
264}
265
266define <8 x i32> @zext_8i16_to_8i32(<8 x i16> %A) nounwind uwtable readnone ssp {
267; SSE2-LABEL: zext_8i16_to_8i32:
268; SSE2:       # BB#0: # %entry
269; SSE2-NEXT:    movdqa %xmm0, %xmm1
270; SSE2-NEXT:    pxor %xmm2, %xmm2
271; SSE2-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
272; SSE2-NEXT:    punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
273; SSE2-NEXT:    retq
274;
275; SSSE3-LABEL: zext_8i16_to_8i32:
276; SSSE3:       # BB#0: # %entry
277; SSSE3-NEXT:    movdqa %xmm0, %xmm1
278; SSSE3-NEXT:    pxor %xmm2, %xmm2
279; SSSE3-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
280; SSSE3-NEXT:    punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
281; SSSE3-NEXT:    retq
282;
283; SSE41-LABEL: zext_8i16_to_8i32:
284; SSE41:       # BB#0: # %entry
285; SSE41-NEXT:    movdqa %xmm0, %xmm1
286; SSE41-NEXT:    pxor %xmm2, %xmm2
287; SSE41-NEXT:    pmovzxwd {{.*#+}} xmm0 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
288; SSE41-NEXT:    punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
289; SSE41-NEXT:    retq
290;
291; AVX1-LABEL: zext_8i16_to_8i32:
292; AVX1:       # BB#0: # %entry
293; AVX1-NEXT:    vpxor %xmm1, %xmm1, %xmm1
294; AVX1-NEXT:    vpunpckhwd {{.*#+}} xmm1 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
295; AVX1-NEXT:    vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
296; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
297; AVX1-NEXT:    retq
298;
299; AVX2-LABEL: zext_8i16_to_8i32:
300; AVX2:       # BB#0: # %entry
301; AVX2-NEXT:    vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
302; AVX2-NEXT:    retq
303entry:
304  %B = zext <8 x i16> %A to <8 x i32>
305  ret <8 x i32>%B
306}
307
308define <2 x i64> @zext_8i16_to_2i64(<8 x i16> %A) nounwind uwtable readnone ssp {
309; SSE2-LABEL: zext_8i16_to_2i64:
310; SSE2:       # BB#0: # %entry
311; SSE2-NEXT:    pxor %xmm1, %xmm1
312; SSE2-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
313; SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
314; SSE2-NEXT:    retq
315;
316; SSSE3-LABEL: zext_8i16_to_2i64:
317; SSSE3:       # BB#0: # %entry
318; SSSE3-NEXT:    pxor %xmm1, %xmm1
319; SSSE3-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
320; SSSE3-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
321; SSSE3-NEXT:    retq
322;
323; SSE41-LABEL: zext_8i16_to_2i64:
324; SSE41:       # BB#0: # %entry
325; SSE41-NEXT:    pmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero
326; SSE41-NEXT:    retq
327;
328; AVX-LABEL: zext_8i16_to_2i64:
329; AVX:       # BB#0: # %entry
330; AVX-NEXT:    vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero
331; AVX-NEXT:    retq
332entry:
333  %B = shufflevector <8 x i16> %A, <8 x i16> undef, <2 x i32> <i32 0, i32 1>
334  %C = zext <2 x i16> %B to <2 x i64>
335  ret <2 x i64> %C
336}
337
338define <4 x i64> @zext_8i16_to_4i64(<8 x i16> %A) nounwind uwtable readnone ssp {
339; SSE2-LABEL: zext_8i16_to_4i64:
340; SSE2:       # BB#0: # %entry
341; SSE2-NEXT:    movdqa %xmm0, %xmm1
342; SSE2-NEXT:    pxor %xmm2, %xmm2
343; SSE2-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
344; SSE2-NEXT:    movdqa %xmm1, %xmm0
345; SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
346; SSE2-NEXT:    punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm2[2],xmm1[3],xmm2[3]
347; SSE2-NEXT:    retq
348;
349; SSSE3-LABEL: zext_8i16_to_4i64:
350; SSSE3:       # BB#0: # %entry
351; SSSE3-NEXT:    movdqa %xmm0, %xmm1
352; SSSE3-NEXT:    pxor %xmm2, %xmm2
353; SSSE3-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
354; SSSE3-NEXT:    movdqa %xmm1, %xmm0
355; SSSE3-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
356; SSSE3-NEXT:    punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm2[2],xmm1[3],xmm2[3]
357; SSSE3-NEXT:    retq
358;
359; SSE41-LABEL: zext_8i16_to_4i64:
360; SSE41:       # BB#0: # %entry
361; SSE41-NEXT:    pmovzxwq {{.*#+}} xmm2 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero
362; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,2,3]
363; SSE41-NEXT:    pmovzxwq {{.*#+}} xmm1 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero
364; SSE41-NEXT:    movdqa %xmm2, %xmm0
365; SSE41-NEXT:    retq
366;
367; AVX1-LABEL: zext_8i16_to_4i64:
368; AVX1:       # BB#0: # %entry
369; AVX1-NEXT:    vpmovzxwd {{.*#+}} xmm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
370; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[2,2,3,3]
371; AVX1-NEXT:    vpxor %xmm2, %xmm2, %xmm2
372; AVX1-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1,2,3],xmm1[4],xmm2[5,6,7]
373; AVX1-NEXT:    vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero
374; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1,2,3],xmm0[4],xmm2[5,6,7]
375; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
376; AVX1-NEXT:    retq
377;
378; AVX2-LABEL: zext_8i16_to_4i64:
379; AVX2:       # BB#0: # %entry
380; AVX2-NEXT:    vpmovzxwq {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
381; AVX2-NEXT:    vpxor %ymm1, %ymm1, %ymm1
382; AVX2-NEXT:    vpblendw {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4],ymm1[5,6,7],ymm0[8],ymm1[9,10,11],ymm0[12],ymm1[13,14,15]
383; AVX2-NEXT:    retq
384entry:
385  %B = shufflevector <8 x i16> %A, <8 x i16> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
386  %C = zext <4 x i16> %B to <4 x i64>
387  ret <4 x i64> %C
388}
389
390define <2 x i64> @zext_4i32_to_2i64(<4 x i32> %A) nounwind uwtable readnone ssp {
391; SSE2-LABEL: zext_4i32_to_2i64:
392; SSE2:       # BB#0: # %entry
393; SSE2-NEXT:    pxor %xmm1, %xmm1
394; SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
395; SSE2-NEXT:    retq
396;
397; SSSE3-LABEL: zext_4i32_to_2i64:
398; SSSE3:       # BB#0: # %entry
399; SSSE3-NEXT:    pxor %xmm1, %xmm1
400; SSSE3-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
401; SSSE3-NEXT:    retq
402;
403; SSE41-LABEL: zext_4i32_to_2i64:
404; SSE41:       # BB#0: # %entry
405; SSE41-NEXT:    pmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
406; SSE41-NEXT:    retq
407;
408; AVX-LABEL: zext_4i32_to_2i64:
409; AVX:       # BB#0: # %entry
410; AVX-NEXT:    vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
411; AVX-NEXT:    retq
412entry:
413  %B = shufflevector <4 x i32> %A, <4 x i32> undef, <2 x i32> <i32 0, i32 1>
414  %C = zext <2 x i32> %B to <2 x i64>
415  ret <2 x i64> %C
416}
417
418define <4 x i64> @zext_4i32_to_4i64(<4 x i32> %A) nounwind uwtable readnone ssp {
419; SSE2-LABEL: zext_4i32_to_4i64:
420; SSE2:       # BB#0: # %entry
421; SSE2-NEXT:    movdqa %xmm0, %xmm1
422; SSE2-NEXT:    pxor %xmm2, %xmm2
423; SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
424; SSE2-NEXT:    punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm2[2],xmm1[3],xmm2[3]
425; SSE2-NEXT:    retq
426;
427; SSSE3-LABEL: zext_4i32_to_4i64:
428; SSSE3:       # BB#0: # %entry
429; SSSE3-NEXT:    movdqa %xmm0, %xmm1
430; SSSE3-NEXT:    pxor %xmm2, %xmm2
431; SSSE3-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
432; SSSE3-NEXT:    punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm2[2],xmm1[3],xmm2[3]
433; SSSE3-NEXT:    retq
434;
435; SSE41-LABEL: zext_4i32_to_4i64:
436; SSE41:       # BB#0: # %entry
437; SSE41-NEXT:    movdqa %xmm0, %xmm1
438; SSE41-NEXT:    pxor %xmm2, %xmm2
439; SSE41-NEXT:    pmovzxdq {{.*#+}} xmm0 = xmm1[0],zero,xmm1[1],zero
440; SSE41-NEXT:    punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm2[2],xmm1[3],xmm2[3]
441; SSE41-NEXT:    retq
442;
443; AVX1-LABEL: zext_4i32_to_4i64:
444; AVX1:       # BB#0: # %entry
445; AVX1-NEXT:    vpxor %xmm1, %xmm1, %xmm1
446; AVX1-NEXT:    vpunpckhdq {{.*#+}} xmm1 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
447; AVX1-NEXT:    vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
448; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
449; AVX1-NEXT:    retq
450;
451; AVX2-LABEL: zext_4i32_to_4i64:
452; AVX2:       # BB#0: # %entry
453; AVX2-NEXT:    vpmovzxdq {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
454; AVX2-NEXT:    retq
455entry:
456  %B = zext <4 x i32> %A to <4 x i64>
457  ret <4 x i64>%B
458}
459
460define <2 x i64> @load_zext_2i8_to_2i64(<2 x i8> *%ptr) {
461; SSE2-LABEL: load_zext_2i8_to_2i64:
462; SSE2:       # BB#0: # %entry
463; SSE2-NEXT:    movzwl (%rdi), %eax
464; SSE2-NEXT:    movd %eax, %xmm0
465; SSE2-NEXT:    pxor %xmm1, %xmm1
466; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
467; SSE2-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
468; SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
469; SSE2-NEXT:    retq
470;
471; SSSE3-LABEL: load_zext_2i8_to_2i64:
472; SSSE3:       # BB#0: # %entry
473; SSSE3-NEXT:    movzwl (%rdi), %eax
474; SSSE3-NEXT:    movd %eax, %xmm0
475; SSSE3-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero
476; SSSE3-NEXT:    retq
477;
478; SSE41-LABEL: load_zext_2i8_to_2i64:
479; SSE41:       # BB#0: # %entry
480; SSE41-NEXT:    pmovzxbq {{.*#+}} xmm0 = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero
481; SSE41-NEXT:    retq
482;
483; AVX-LABEL: load_zext_2i8_to_2i64:
484; AVX:       # BB#0: # %entry
485; AVX-NEXT:    vpmovzxbq {{.*#+}} xmm0 = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero
486; AVX-NEXT:    retq
487entry:
488 %X = load <2 x i8>, <2 x i8>* %ptr
489 %Y = zext <2 x i8> %X to <2 x i64>
490 ret <2 x i64> %Y
491}
492
493define <4 x i32> @load_zext_4i8_to_4i32(<4 x i8> *%ptr) {
494; SSE2-LABEL: load_zext_4i8_to_4i32:
495; SSE2:       # BB#0: # %entry
496; SSE2-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
497; SSE2-NEXT:    pxor %xmm1, %xmm1
498; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
499; SSE2-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
500; SSE2-NEXT:    retq
501;
502; SSSE3-LABEL: load_zext_4i8_to_4i32:
503; SSSE3:       # BB#0: # %entry
504; SSSE3-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
505; SSSE3-NEXT:    pxor %xmm1, %xmm1
506; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
507; SSSE3-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
508; SSSE3-NEXT:    retq
509;
510; SSE41-LABEL: load_zext_4i8_to_4i32:
511; SSE41:       # BB#0: # %entry
512; SSE41-NEXT:    pmovzxbd {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
513; SSE41-NEXT:    retq
514;
515; AVX-LABEL: load_zext_4i8_to_4i32:
516; AVX:       # BB#0: # %entry
517; AVX-NEXT:    vpmovzxbd {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
518; AVX-NEXT:    retq
519entry:
520 %X = load <4 x i8>, <4 x i8>* %ptr
521 %Y = zext <4 x i8> %X to <4 x i32>
522 ret <4 x i32> %Y
523}
524
525define <4 x i64> @load_zext_4i8_to_4i64(<4 x i8> *%ptr) {
526; SSE2-LABEL: load_zext_4i8_to_4i64:
527; SSE2:       # BB#0: # %entry
528; SSE2-NEXT:    movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
529; SSE2-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
530; SSE2-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
531; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[0,1,1,3]
532; SSE2-NEXT:    movdqa {{.*#+}} xmm2 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0]
533; SSE2-NEXT:    pand %xmm2, %xmm0
534; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[2,1,3,3]
535; SSE2-NEXT:    pand %xmm2, %xmm1
536; SSE2-NEXT:    retq
537;
538; SSSE3-LABEL: load_zext_4i8_to_4i64:
539; SSSE3:       # BB#0: # %entry
540; SSSE3-NEXT:    movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
541; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
542; SSSE3-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
543; SSSE3-NEXT:    movdqa %xmm1, %xmm0
544; SSSE3-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[4],zero,zero,zero,zero,zero,zero,zero
545; SSSE3-NEXT:    pshufb {{.*#+}} xmm1 = xmm1[8],zero,zero,zero,zero,zero,zero,zero,xmm1[12],zero,zero,zero,zero,zero,zero,zero
546; SSSE3-NEXT:    retq
547;
548; SSE41-LABEL: load_zext_4i8_to_4i64:
549; SSE41:       # BB#0: # %entry
550; SSE41-NEXT:    pmovzxbq {{.*#+}} xmm0 = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero
551; SSE41-NEXT:    pmovzxbq {{.*#+}} xmm1 = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero
552; SSE41-NEXT:    retq
553;
554; AVX1-LABEL: load_zext_4i8_to_4i64:
555; AVX1:       # BB#0: # %entry
556; AVX1-NEXT:    vpmovzxbq {{.*#+}} xmm0 = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero
557; AVX1-NEXT:    vpmovzxbq {{.*#+}} xmm1 = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero
558; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
559; AVX1-NEXT:    retq
560;
561; AVX2-LABEL: load_zext_4i8_to_4i64:
562; AVX2:       # BB#0: # %entry
563; AVX2-NEXT:    vpmovzxbq {{.*#+}} ymm0 = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero,mem[2],zero,zero,zero,zero,zero,zero,zero,mem[3],zero,zero,zero,zero,zero,zero,zero
564; AVX2-NEXT:    retq
565entry:
566 %X = load <4 x i8>, <4 x i8>* %ptr
567 %Y = zext <4 x i8> %X to <4 x i64>
568 ret <4 x i64> %Y
569}
570
571define <8 x i16> @load_zext_8i8_to_8i16(<8 x i8> *%ptr) {
572; SSE2-LABEL: load_zext_8i8_to_8i16:
573; SSE2:       # BB#0: # %entry
574; SSE2-NEXT:    movq {{.*#+}} xmm0 = mem[0],zero
575; SSE2-NEXT:    pxor %xmm1, %xmm1
576; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
577; SSE2-NEXT:    retq
578;
579; SSSE3-LABEL: load_zext_8i8_to_8i16:
580; SSSE3:       # BB#0: # %entry
581; SSSE3-NEXT:    movq {{.*#+}} xmm0 = mem[0],zero
582; SSSE3-NEXT:    pxor %xmm1, %xmm1
583; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
584; SSSE3-NEXT:    retq
585;
586; SSE41-LABEL: load_zext_8i8_to_8i16:
587; SSE41:       # BB#0: # %entry
588; SSE41-NEXT:    pmovzxbw {{.*#+}} xmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
589; SSE41-NEXT:    retq
590;
591; AVX-LABEL: load_zext_8i8_to_8i16:
592; AVX:       # BB#0: # %entry
593; AVX-NEXT:    vpmovzxbw {{.*#+}} xmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
594; AVX-NEXT:    retq
595entry:
596 %X = load <8 x i8>, <8 x i8>* %ptr
597 %Y = zext <8 x i8> %X to <8 x i16>
598 ret <8 x i16> %Y
599}
600
601define <8 x i32> @load_zext_8i8_to_8i32(<8 x i8> *%ptr) {
602; SSE2-LABEL: load_zext_8i8_to_8i32:
603; SSE2:       # BB#0: # %entry
604; SSE2-NEXT:    movq {{.*#+}} xmm1 = mem[0],zero
605; SSE2-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
606; SSE2-NEXT:    movdqa %xmm1, %xmm0
607; SSE2-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
608; SSE2-NEXT:    movdqa {{.*#+}} xmm2 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0]
609; SSE2-NEXT:    pand %xmm2, %xmm0
610; SSE2-NEXT:    punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
611; SSE2-NEXT:    pand %xmm2, %xmm1
612; SSE2-NEXT:    retq
613;
614; SSSE3-LABEL: load_zext_8i8_to_8i32:
615; SSSE3:       # BB#0: # %entry
616; SSSE3-NEXT:    movq {{.*#+}} xmm1 = mem[0],zero
617; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
618; SSSE3-NEXT:    movdqa %xmm1, %xmm0
619; SSSE3-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[6],zero,zero,zero
620; SSSE3-NEXT:    pshufb {{.*#+}} xmm1 = xmm1[8],zero,zero,zero,xmm1[10],zero,zero,zero,xmm1[12],zero,zero,zero,xmm1[14],zero,zero,zero
621; SSSE3-NEXT:    retq
622;
623; SSE41-LABEL: load_zext_8i8_to_8i32:
624; SSE41:       # BB#0: # %entry
625; SSE41-NEXT:    pmovzxbd {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
626; SSE41-NEXT:    pmovzxbd {{.*#+}} xmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
627; SSE41-NEXT:    retq
628;
629; AVX1-LABEL: load_zext_8i8_to_8i32:
630; AVX1:       # BB#0: # %entry
631; AVX1-NEXT:    vpmovzxbd {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
632; AVX1-NEXT:    vpmovzxbd {{.*#+}} xmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
633; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
634; AVX1-NEXT:    retq
635;
636; AVX2-LABEL: load_zext_8i8_to_8i32:
637; AVX2:       # BB#0: # %entry
638; AVX2-NEXT:    vpmovzxbd {{.*#+}} ymm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero
639; AVX2-NEXT:    retq
640entry:
641 %X = load <8 x i8>, <8 x i8>* %ptr
642 %Y = zext <8 x i8> %X to <8 x i32>
643 ret <8 x i32> %Y
644}
645
646define <16 x i16> @load_zext_16i8_to_16i16(<16 x i8> *%ptr) {
647; SSE2-LABEL: load_zext_16i8_to_16i16:
648; SSE2:       # BB#0: # %entry
649; SSE2-NEXT:    movdqa (%rdi), %xmm1
650; SSE2-NEXT:    pxor %xmm2, %xmm2
651; SSE2-NEXT:    movdqa %xmm1, %xmm0
652; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
653; SSE2-NEXT:    punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm2[8],xmm1[9],xmm2[9],xmm1[10],xmm2[10],xmm1[11],xmm2[11],xmm1[12],xmm2[12],xmm1[13],xmm2[13],xmm1[14],xmm2[14],xmm1[15],xmm2[15]
654; SSE2-NEXT:    retq
655;
656; SSSE3-LABEL: load_zext_16i8_to_16i16:
657; SSSE3:       # BB#0: # %entry
658; SSSE3-NEXT:    movdqa (%rdi), %xmm1
659; SSSE3-NEXT:    pxor %xmm2, %xmm2
660; SSSE3-NEXT:    movdqa %xmm1, %xmm0
661; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
662; SSSE3-NEXT:    punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm2[8],xmm1[9],xmm2[9],xmm1[10],xmm2[10],xmm1[11],xmm2[11],xmm1[12],xmm2[12],xmm1[13],xmm2[13],xmm1[14],xmm2[14],xmm1[15],xmm2[15]
663; SSSE3-NEXT:    retq
664;
665; SSE41-LABEL: load_zext_16i8_to_16i16:
666; SSE41:       # BB#0: # %entry
667; SSE41-NEXT:    pmovzxbw {{.*#+}} xmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
668; SSE41-NEXT:    pmovzxbw {{.*#+}} xmm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
669; SSE41-NEXT:    retq
670;
671; AVX1-LABEL: load_zext_16i8_to_16i16:
672; AVX1:       # BB#0: # %entry
673; AVX1-NEXT:    vpmovzxbw {{.*#+}} xmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
674; AVX1-NEXT:    vpmovzxbw {{.*#+}} xmm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
675; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
676; AVX1-NEXT:    retq
677;
678; AVX2-LABEL: load_zext_16i8_to_16i16:
679; AVX2:       # BB#0: # %entry
680; AVX2-NEXT:    vpmovzxbw {{.*#+}} ymm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero
681; AVX2-NEXT:    retq
682entry:
683 %X = load <16 x i8>, <16 x i8>* %ptr
684 %Y = zext <16 x i8> %X to <16 x i16>
685 ret <16 x i16> %Y
686}
687
688define <2 x i64> @load_zext_2i16_to_2i64(<2 x i16> *%ptr) {
689; SSE2-LABEL: load_zext_2i16_to_2i64:
690; SSE2:       # BB#0: # %entry
691; SSE2-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
692; SSE2-NEXT:    pxor %xmm1, %xmm1
693; SSE2-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
694; SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
695; SSE2-NEXT:    retq
696;
697; SSSE3-LABEL: load_zext_2i16_to_2i64:
698; SSSE3:       # BB#0: # %entry
699; SSSE3-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
700; SSSE3-NEXT:    pxor %xmm1, %xmm1
701; SSSE3-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
702; SSSE3-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
703; SSSE3-NEXT:    retq
704;
705; SSE41-LABEL: load_zext_2i16_to_2i64:
706; SSE41:       # BB#0: # %entry
707; SSE41-NEXT:    pmovzxwq {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero
708; SSE41-NEXT:    retq
709;
710; AVX-LABEL: load_zext_2i16_to_2i64:
711; AVX:       # BB#0: # %entry
712; AVX-NEXT:    vpmovzxwq {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero
713; AVX-NEXT:    retq
714entry:
715 %X = load <2 x i16>, <2 x i16>* %ptr
716 %Y = zext <2 x i16> %X to <2 x i64>
717 ret <2 x i64> %Y
718}
719
720define <4 x i32> @load_zext_4i16_to_4i32(<4 x i16> *%ptr) {
721; SSE2-LABEL: load_zext_4i16_to_4i32:
722; SSE2:       # BB#0: # %entry
723; SSE2-NEXT:    movq {{.*#+}} xmm0 = mem[0],zero
724; SSE2-NEXT:    pxor %xmm1, %xmm1
725; SSE2-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
726; SSE2-NEXT:    retq
727;
728; SSSE3-LABEL: load_zext_4i16_to_4i32:
729; SSSE3:       # BB#0: # %entry
730; SSSE3-NEXT:    movq {{.*#+}} xmm0 = mem[0],zero
731; SSSE3-NEXT:    pxor %xmm1, %xmm1
732; SSSE3-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
733; SSSE3-NEXT:    retq
734;
735; SSE41-LABEL: load_zext_4i16_to_4i32:
736; SSE41:       # BB#0: # %entry
737; SSE41-NEXT:    pmovzxwd {{.*#+}} xmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
738; SSE41-NEXT:    retq
739;
740; AVX-LABEL: load_zext_4i16_to_4i32:
741; AVX:       # BB#0: # %entry
742; AVX-NEXT:    vpmovzxwd {{.*#+}} xmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
743; AVX-NEXT:    retq
744entry:
745 %X = load <4 x i16>, <4 x i16>* %ptr
746 %Y = zext <4 x i16> %X to <4 x i32>
747 ret <4 x i32> %Y
748}
749
750define <4 x i64> @load_zext_4i16_to_4i64(<4 x i16> *%ptr) {
751; SSE2-LABEL: load_zext_4i16_to_4i64:
752; SSE2:       # BB#0: # %entry
753; SSE2-NEXT:    movq {{.*#+}} xmm1 = mem[0],zero
754; SSE2-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
755; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[0,1,1,3]
756; SSE2-NEXT:    movdqa {{.*#+}} xmm2 = [65535,0,0,0,65535,0,0,0]
757; SSE2-NEXT:    pand %xmm2, %xmm0
758; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[2,1,3,3]
759; SSE2-NEXT:    pand %xmm2, %xmm1
760; SSE2-NEXT:    retq
761;
762; SSSE3-LABEL: load_zext_4i16_to_4i64:
763; SSSE3:       # BB#0: # %entry
764; SSSE3-NEXT:    movq {{.*#+}} xmm1 = mem[0],zero
765; SSSE3-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
766; SSSE3-NEXT:    movdqa %xmm1, %xmm0
767; SSSE3-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[0,1],zero,zero,zero,zero,zero,zero,xmm0[4,5],zero,zero,zero,zero,zero,zero
768; SSSE3-NEXT:    pshufb {{.*#+}} xmm1 = xmm1[8,9],zero,zero,zero,zero,zero,zero,xmm1[12,13],zero,zero,zero,zero,zero,zero
769; SSSE3-NEXT:    retq
770;
771; SSE41-LABEL: load_zext_4i16_to_4i64:
772; SSE41:       # BB#0: # %entry
773; SSE41-NEXT:    pmovzxwq {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero
774; SSE41-NEXT:    pmovzxwq {{.*#+}} xmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero
775; SSE41-NEXT:    retq
776;
777; AVX1-LABEL: load_zext_4i16_to_4i64:
778; AVX1:       # BB#0: # %entry
779; AVX1-NEXT:    vpmovzxwq {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero
780; AVX1-NEXT:    vpmovzxwq {{.*#+}} xmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero
781; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
782; AVX1-NEXT:    retq
783;
784; AVX2-LABEL: load_zext_4i16_to_4i64:
785; AVX2:       # BB#0: # %entry
786; AVX2-NEXT:    vpmovzxwq {{.*#+}} ymm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
787; AVX2-NEXT:    retq
788entry:
789 %X = load <4 x i16>, <4 x i16>* %ptr
790 %Y = zext <4 x i16> %X to <4 x i64>
791 ret <4 x i64> %Y
792}
793
794define <8 x i32> @load_zext_8i16_to_8i32(<8 x i16> *%ptr) {
795; SSE2-LABEL: load_zext_8i16_to_8i32:
796; SSE2:       # BB#0: # %entry
797; SSE2-NEXT:    movdqa (%rdi), %xmm1
798; SSE2-NEXT:    pxor %xmm2, %xmm2
799; SSE2-NEXT:    movdqa %xmm1, %xmm0
800; SSE2-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
801; SSE2-NEXT:    punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
802; SSE2-NEXT:    retq
803;
804; SSSE3-LABEL: load_zext_8i16_to_8i32:
805; SSSE3:       # BB#0: # %entry
806; SSSE3-NEXT:    movdqa (%rdi), %xmm1
807; SSSE3-NEXT:    pxor %xmm2, %xmm2
808; SSSE3-NEXT:    movdqa %xmm1, %xmm0
809; SSSE3-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
810; SSSE3-NEXT:    punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
811; SSSE3-NEXT:    retq
812;
813; SSE41-LABEL: load_zext_8i16_to_8i32:
814; SSE41:       # BB#0: # %entry
815; SSE41-NEXT:    pmovzxwd {{.*#+}} xmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
816; SSE41-NEXT:    pmovzxwd {{.*#+}} xmm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
817; SSE41-NEXT:    retq
818;
819; AVX1-LABEL: load_zext_8i16_to_8i32:
820; AVX1:       # BB#0: # %entry
821; AVX1-NEXT:    vpmovzxwd {{.*#+}} xmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
822; AVX1-NEXT:    vpmovzxwd {{.*#+}} xmm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
823; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
824; AVX1-NEXT:    retq
825;
826; AVX2-LABEL: load_zext_8i16_to_8i32:
827; AVX2:       # BB#0: # %entry
828; AVX2-NEXT:    vpmovzxwd {{.*#+}} ymm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
829; AVX2-NEXT:    retq
830entry:
831 %X = load <8 x i16>, <8 x i16>* %ptr
832 %Y = zext <8 x i16> %X to <8 x i32>
833 ret <8 x i32> %Y
834}
835
836define <2 x i64> @load_zext_2i32_to_2i64(<2 x i32> *%ptr) {
837; SSE2-LABEL: load_zext_2i32_to_2i64:
838; SSE2:       # BB#0: # %entry
839; SSE2-NEXT:    movq {{.*#+}} xmm0 = mem[0],zero
840; SSE2-NEXT:    pxor %xmm1, %xmm1
841; SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
842; SSE2-NEXT:    retq
843;
844; SSSE3-LABEL: load_zext_2i32_to_2i64:
845; SSSE3:       # BB#0: # %entry
846; SSSE3-NEXT:    movq {{.*#+}} xmm0 = mem[0],zero
847; SSSE3-NEXT:    pxor %xmm1, %xmm1
848; SSSE3-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
849; SSSE3-NEXT:    retq
850;
851; SSE41-LABEL: load_zext_2i32_to_2i64:
852; SSE41:       # BB#0: # %entry
853; SSE41-NEXT:    pmovzxdq {{.*#+}} xmm0 = mem[0],zero,mem[1],zero
854; SSE41-NEXT:    retq
855;
856; AVX-LABEL: load_zext_2i32_to_2i64:
857; AVX:       # BB#0: # %entry
858; AVX-NEXT:    vpmovzxdq {{.*#+}} xmm0 = mem[0],zero,mem[1],zero
859; AVX-NEXT:    retq
860entry:
861 %X = load <2 x i32>, <2 x i32>* %ptr
862 %Y = zext <2 x i32> %X to <2 x i64>
863 ret <2 x i64> %Y
864}
865
866define <4 x i64> @load_zext_4i32_to_4i64(<4 x i32> *%ptr) {
867; SSE2-LABEL: load_zext_4i32_to_4i64:
868; SSE2:       # BB#0: # %entry
869; SSE2-NEXT:    movdqa (%rdi), %xmm1
870; SSE2-NEXT:    pxor %xmm2, %xmm2
871; SSE2-NEXT:    movdqa %xmm1, %xmm0
872; SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
873; SSE2-NEXT:    punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm2[2],xmm1[3],xmm2[3]
874; SSE2-NEXT:    retq
875;
876; SSSE3-LABEL: load_zext_4i32_to_4i64:
877; SSSE3:       # BB#0: # %entry
878; SSSE3-NEXT:    movdqa (%rdi), %xmm1
879; SSSE3-NEXT:    pxor %xmm2, %xmm2
880; SSSE3-NEXT:    movdqa %xmm1, %xmm0
881; SSSE3-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
882; SSSE3-NEXT:    punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm2[2],xmm1[3],xmm2[3]
883; SSSE3-NEXT:    retq
884;
885; SSE41-LABEL: load_zext_4i32_to_4i64:
886; SSE41:       # BB#0: # %entry
887; SSE41-NEXT:    pmovzxdq {{.*#+}} xmm0 = mem[0],zero,mem[1],zero
888; SSE41-NEXT:    pmovzxdq {{.*#+}} xmm1 = mem[0],zero,mem[1],zero
889; SSE41-NEXT:    retq
890;
891; AVX1-LABEL: load_zext_4i32_to_4i64:
892; AVX1:       # BB#0: # %entry
893; AVX1-NEXT:    vpmovzxdq {{.*#+}} xmm0 = mem[0],zero,mem[1],zero
894; AVX1-NEXT:    vpmovzxdq {{.*#+}} xmm1 = mem[0],zero,mem[1],zero
895; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
896; AVX1-NEXT:    retq
897;
898; AVX2-LABEL: load_zext_4i32_to_4i64:
899; AVX2:       # BB#0: # %entry
900; AVX2-NEXT:    vpmovzxdq {{.*#+}} ymm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
901; AVX2-NEXT:    retq
902entry:
903 %X = load <4 x i32>, <4 x i32>* %ptr
904 %Y = zext <4 x i32> %X to <4 x i64>
905 ret <4 x i64> %Y
906}
907
908define <8 x i32> @zext_8i8_to_8i32(<8 x i8> %z) {
909; SSE2-LABEL: zext_8i8_to_8i32:
910; SSE2:       # BB#0: # %entry
911; SSE2-NEXT:    movdqa %xmm0, %xmm1
912; SSE2-NEXT:    pand {{.*}}(%rip), %xmm1
913; SSE2-NEXT:    pxor %xmm2, %xmm2
914; SSE2-NEXT:    movdqa %xmm1, %xmm0
915; SSE2-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
916; SSE2-NEXT:    punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
917; SSE2-NEXT:    retq
918;
919; SSSE3-LABEL: zext_8i8_to_8i32:
920; SSSE3:       # BB#0: # %entry
921; SSSE3-NEXT:    movdqa %xmm0, %xmm1
922; SSSE3-NEXT:    pand {{.*}}(%rip), %xmm1
923; SSSE3-NEXT:    pxor %xmm2, %xmm2
924; SSSE3-NEXT:    movdqa %xmm1, %xmm0
925; SSSE3-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
926; SSSE3-NEXT:    punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
927; SSSE3-NEXT:    retq
928;
929; SSE41-LABEL: zext_8i8_to_8i32:
930; SSE41:       # BB#0: # %entry
931; SSE41-NEXT:    movdqa %xmm0, %xmm1
932; SSE41-NEXT:    pand {{.*}}(%rip), %xmm1
933; SSE41-NEXT:    pxor %xmm2, %xmm2
934; SSE41-NEXT:    pmovzxwd {{.*#+}} xmm0 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
935; SSE41-NEXT:    punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
936; SSE41-NEXT:    retq
937;
938; AVX1-LABEL: zext_8i8_to_8i32:
939; AVX1:       # BB#0: # %entry
940; AVX1-NEXT:    vpand {{.*}}(%rip), %xmm0, %xmm0
941; AVX1-NEXT:    vpxor %xmm1, %xmm1, %xmm1
942; AVX1-NEXT:    vpunpckhwd {{.*#+}} xmm1 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
943; AVX1-NEXT:    vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
944; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
945; AVX1-NEXT:    retq
946;
947; AVX2-LABEL: zext_8i8_to_8i32:
948; AVX2:       # BB#0: # %entry
949; AVX2-NEXT:    vpand {{.*}}(%rip), %xmm0, %xmm0
950; AVX2-NEXT:    vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
951; AVX2-NEXT:    retq
952entry:
953  %t = zext <8 x i8> %z to <8 x i32>
954  ret <8 x i32> %t
955}
956
957define <8 x i32> @shuf_zext_8i16_to_8i32(<8 x i16> %A) nounwind uwtable readnone ssp {
958; SSE2-LABEL: shuf_zext_8i16_to_8i32:
959; SSE2:       # BB#0: # %entry
960; SSE2-NEXT:    movdqa %xmm0, %xmm1
961; SSE2-NEXT:    pxor %xmm2, %xmm2
962; SSE2-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
963; SSE2-NEXT:    punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
964; SSE2-NEXT:    retq
965;
966; SSSE3-LABEL: shuf_zext_8i16_to_8i32:
967; SSSE3:       # BB#0: # %entry
968; SSSE3-NEXT:    movdqa %xmm0, %xmm1
969; SSSE3-NEXT:    pxor %xmm2, %xmm2
970; SSSE3-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
971; SSSE3-NEXT:    punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
972; SSSE3-NEXT:    retq
973;
974; SSE41-LABEL: shuf_zext_8i16_to_8i32:
975; SSE41:       # BB#0: # %entry
976; SSE41-NEXT:    movdqa %xmm0, %xmm1
977; SSE41-NEXT:    pxor %xmm2, %xmm2
978; SSE41-NEXT:    pmovzxwd {{.*#+}} xmm0 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
979; SSE41-NEXT:    punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
980; SSE41-NEXT:    retq
981;
982; AVX1-LABEL: shuf_zext_8i16_to_8i32:
983; AVX1:       # BB#0: # %entry
984; AVX1-NEXT:    vpxor %xmm1, %xmm1, %xmm1
985; AVX1-NEXT:    vpunpckhwd {{.*#+}} xmm1 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
986; AVX1-NEXT:    vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
987; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
988; AVX1-NEXT:    retq
989;
990; AVX2-LABEL: shuf_zext_8i16_to_8i32:
991; AVX2:       # BB#0: # %entry
992; AVX2-NEXT:    vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
993; AVX2-NEXT:    retq
994entry:
995  %B = shufflevector <8 x i16> %A, <8 x i16> zeroinitializer, <16 x i32> <i32 0, i32 8, i32 1, i32 8, i32 2, i32 8, i32 3, i32 8, i32 4, i32 8, i32 5, i32 8, i32 6, i32 8, i32 7, i32 8>
996  %Z = bitcast <16 x i16> %B to <8 x i32>
997  ret <8 x i32> %Z
998}
999
1000define <4 x i64> @shuf_zext_4i32_to_4i64(<4 x i32> %A) nounwind uwtable readnone ssp {
1001; SSE2-LABEL: shuf_zext_4i32_to_4i64:
1002; SSE2:       # BB#0: # %entry
1003; SSE2-NEXT:    movdqa %xmm0, %xmm1
1004; SSE2-NEXT:    pxor %xmm2, %xmm2
1005; SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
1006; SSE2-NEXT:    punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm2[2],xmm1[3],xmm2[3]
1007; SSE2-NEXT:    retq
1008;
1009; SSSE3-LABEL: shuf_zext_4i32_to_4i64:
1010; SSSE3:       # BB#0: # %entry
1011; SSSE3-NEXT:    movdqa %xmm0, %xmm1
1012; SSSE3-NEXT:    pxor %xmm2, %xmm2
1013; SSSE3-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
1014; SSSE3-NEXT:    punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm2[2],xmm1[3],xmm2[3]
1015; SSSE3-NEXT:    retq
1016;
1017; SSE41-LABEL: shuf_zext_4i32_to_4i64:
1018; SSE41:       # BB#0: # %entry
1019; SSE41-NEXT:    movdqa %xmm0, %xmm1
1020; SSE41-NEXT:    pxor %xmm2, %xmm2
1021; SSE41-NEXT:    pmovzxdq {{.*#+}} xmm0 = xmm1[0],zero,xmm1[1],zero
1022; SSE41-NEXT:    punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm2[2],xmm1[3],xmm2[3]
1023; SSE41-NEXT:    retq
1024;
1025; AVX1-LABEL: shuf_zext_4i32_to_4i64:
1026; AVX1:       # BB#0: # %entry
1027; AVX1-NEXT:    vinsertps {{.*#+}} xmm1 = xmm0[0],zero,xmm0[1],zero
1028; AVX1-NEXT:    vxorpd %xmm2, %xmm2, %xmm2
1029; AVX1-NEXT:    vblendpd {{.*#+}} xmm0 = xmm2[0],xmm0[1]
1030; AVX1-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[2,0,3,0]
1031; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
1032; AVX1-NEXT:    retq
1033;
1034; AVX2-LABEL: shuf_zext_4i32_to_4i64:
1035; AVX2:       # BB#0: # %entry
1036; AVX2-NEXT:    vpmovzxdq {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
1037; AVX2-NEXT:    retq
1038entry:
1039  %B = shufflevector <4 x i32> %A, <4 x i32> zeroinitializer, <8 x i32> <i32 0, i32 4, i32 1, i32 4, i32 2, i32 4, i32 3, i32 4>
1040  %Z = bitcast <8 x i32> %B to <4 x i64>
1041  ret <4 x i64> %Z
1042}
1043
1044define <8 x i32> @shuf_zext_8i8_to_8i32(<8 x i8> %A) {
1045; SSE2-LABEL: shuf_zext_8i8_to_8i32:
1046; SSE2:       # BB#0: # %entry
1047; SSE2-NEXT:    movdqa %xmm0, %xmm1
1048; SSE2-NEXT:    pand {{.*}}(%rip), %xmm1
1049; SSE2-NEXT:    packuswb %xmm1, %xmm1
1050; SSE2-NEXT:    pxor %xmm2, %xmm2
1051; SSE2-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
1052; SSE2-NEXT:    movdqa %xmm1, %xmm0
1053; SSE2-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
1054; SSE2-NEXT:    punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
1055; SSE2-NEXT:    retq
1056;
1057; SSSE3-LABEL: shuf_zext_8i8_to_8i32:
1058; SSSE3:       # BB#0: # %entry
1059; SSSE3-NEXT:    movdqa %xmm0, %xmm1
1060; SSSE3-NEXT:    pshufb {{.*#+}} xmm1 = xmm1[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u]
1061; SSSE3-NEXT:    pxor %xmm2, %xmm2
1062; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
1063; SSSE3-NEXT:    movdqa %xmm1, %xmm0
1064; SSSE3-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
1065; SSSE3-NEXT:    punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
1066; SSSE3-NEXT:    retq
1067;
1068; SSE41-LABEL: shuf_zext_8i8_to_8i32:
1069; SSE41:       # BB#0: # %entry
1070; SSE41-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u]
1071; SSE41-NEXT:    pmovzxbd {{.*#+}} xmm2 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
1072; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,2,3]
1073; SSE41-NEXT:    pmovzxbd {{.*#+}} xmm1 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
1074; SSE41-NEXT:    movdqa %xmm2, %xmm0
1075; SSE41-NEXT:    retq
1076;
1077; AVX1-LABEL: shuf_zext_8i8_to_8i32:
1078; AVX1:       # BB#0: # %entry
1079; AVX1-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u]
1080; AVX1-NEXT:    vpmovzxbd {{.*#+}} xmm1 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
1081; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[1,1,2,3]
1082; AVX1-NEXT:    vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
1083; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
1084; AVX1-NEXT:    retq
1085;
1086; AVX2-LABEL: shuf_zext_8i8_to_8i32:
1087; AVX2:       # BB#0: # %entry
1088; AVX2-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u]
1089; AVX2-NEXT:    vpmovzxbd {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero
1090; AVX2-NEXT:    retq
1091entry:
1092  %B = shufflevector <8 x i8> %A, <8 x i8> zeroinitializer, <32 x i32> <i32 0, i32 8, i32 8, i32 8, i32 1, i32 8, i32 8, i32 8, i32 2, i32 8, i32 8, i32 8, i32 3, i32 8, i32 8, i32 8, i32 4, i32 8, i32 8, i32 8, i32 5, i32 8, i32 8, i32 8, i32 6, i32 8, i32 8, i32 8, i32 7, i32 8, i32 8, i32 8>
1093  %Z = bitcast <32 x i8> %B to <8 x i32>
1094  ret <8 x i32> %Z
1095}
1096
1097define <2 x i64> @shuf_zext_16i8_to_2i64_offset6(<16 x i8> %A) nounwind uwtable readnone ssp {
1098; SSE2-LABEL: shuf_zext_16i8_to_2i64_offset6:
1099; SSE2:       # BB#0: # %entry
1100; SSE2-NEXT:    pxor %xmm1, %xmm1
1101; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
1102; SSE2-NEXT:    punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
1103; SSE2-NEXT:    punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
1104; SSE2-NEXT:    retq
1105;
1106; SSSE3-LABEL: shuf_zext_16i8_to_2i64_offset6:
1107; SSSE3:       # BB#0: # %entry
1108; SSSE3-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[6],zero,zero,zero,zero,zero,zero,zero,xmm0[7],zero,zero,zero,zero,zero,zero,zero
1109; SSSE3-NEXT:    retq
1110;
1111; SSE41-LABEL: shuf_zext_16i8_to_2i64_offset6:
1112; SSE41:       # BB#0: # %entry
1113; SSE41-NEXT:    psrlq $48, %xmm0
1114; SSE41-NEXT:    pmovzxbq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero
1115; SSE41-NEXT:    retq
1116;
1117; AVX-LABEL: shuf_zext_16i8_to_2i64_offset6:
1118; AVX:       # BB#0: # %entry
1119; AVX-NEXT:    vpsrlq $48, %xmm0, %xmm0
1120; AVX-NEXT:    vpmovzxbq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero
1121; AVX-NEXT:    retq
1122entry:
1123  %B = shufflevector <16 x i8> %A, <16 x i8> zeroinitializer, <16 x i32> <i32 6, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 7, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16>
1124  %Z = bitcast <16 x i8> %B to <2 x i64>
1125  ret <2 x i64> %Z
1126}
1127
define <4 x i64> @shuf_zext_16i8_to_4i64_offset11(<16 x i8> %A) nounwind uwtable readnone ssp {
; SSE2-LABEL: shuf_zext_16i8_to_4i64_offset11:
; SSE2:       # BB#0: # %entry
; SSE2-NEXT:    movdqa %xmm0, %xmm1
; SSE2-NEXT:    psrldq {{.*#+}} xmm1 = xmm1[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero
; SSE2-NEXT:    pxor %xmm2, %xmm2
; SSE2-NEXT:    punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm2[8],xmm1[9],xmm2[9],xmm1[10],xmm2[10],xmm1[11],xmm2[11],xmm1[12],xmm2[12],xmm1[13],xmm2[13],xmm1[14],xmm2[14],xmm1[15],xmm2[15]
; SSE2-NEXT:    movdqa %xmm1, %xmm0
; SSE2-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
; SSE2-NEXT:    punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm2[2],xmm0[3],xmm2[3]
; SSE2-NEXT:    punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
; SSE2-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
; SSE2-NEXT:    retq
;
; SSSE3-LABEL: shuf_zext_16i8_to_4i64_offset11:
; SSSE3:       # BB#0: # %entry
; SSSE3-NEXT:    movdqa %xmm0, %xmm1
; SSSE3-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[11],zero,zero,zero,zero,zero,zero,zero,xmm0[12],zero,zero,zero,zero,zero,zero,zero
; SSSE3-NEXT:    pshufb {{.*#+}} xmm1 = xmm1[13],zero,zero,zero,zero,zero,zero,zero,xmm1[14],zero,zero,zero,zero,zero,zero,zero
; SSSE3-NEXT:    retq
;
; SSE41-LABEL: shuf_zext_16i8_to_4i64_offset11:
; SSE41:       # BB#0: # %entry
; SSE41-NEXT:    movdqa %xmm0, %xmm1
; SSE41-NEXT:    psrldq {{.*#+}} xmm1 = xmm1[11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; SSE41-NEXT:    pmovzxbq {{.*#+}} xmm2 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
; SSE41-NEXT:    psrldq {{.*#+}} xmm0 = xmm0[13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; SSE41-NEXT:    pmovzxbq {{.*#+}} xmm1 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero
; SSE41-NEXT:    movdqa %xmm2, %xmm0
; SSE41-NEXT:    retq
;
; AVX1-LABEL: shuf_zext_16i8_to_4i64_offset11:
; AVX1:       # BB#0: # %entry
; AVX1-NEXT:    vpsrldq {{.*#+}} xmm1 = xmm0[11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; AVX1-NEXT:    vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
; AVX1-NEXT:    vpsrldq {{.*#+}} xmm0 = xmm0[13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; AVX1-NEXT:    vpmovzxbq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero
; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: shuf_zext_16i8_to_4i64_offset11:
; AVX2:       # BB#0: # %entry
; AVX2-NEXT:    vpsrldq {{.*#+}} ymm0 = ymm0[11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm0[27,28,29,30,31],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; AVX2-NEXT:    vpmovzxbq {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero,xmm0[2],zero,zero,zero,zero,zero,zero,zero,xmm0[3],zero,zero,zero,zero,zero,zero,zero
; AVX2-NEXT:    retq
entry:
  %B = shufflevector <16 x i8> %A, <16 x i8> zeroinitializer, <32 x i32> <i32 11, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 12, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 13, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 14, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16>
  %Z = bitcast <32 x i8> %B to <4 x i64>
  ret <4 x i64> %Z
}

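; Zero-extend the two words starting at element 3 of %A into <2 x i64>.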
define <2 x i64> @shuf_zext_8i16_to_2i64_offset6(<8 x i16> %A) nounwind uwtable readnone ssp {
; SSE2-LABEL: shuf_zext_8i16_to_2i64_offset6:
; SSE2:       # BB#0: # %entry
; SSE2-NEXT:    psrldq {{.*#+}} xmm0 = xmm0[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero
; SSE2-NEXT:    pxor %xmm1, %xmm1
; SSE2-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; SSE2-NEXT:    punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; SSE2-NEXT:    retq
;
; SSSE3-LABEL: shuf_zext_8i16_to_2i64_offset6:
; SSSE3:       # BB#0: # %entry
; SSSE3-NEXT:    psrldq {{.*#+}} xmm0 = xmm0[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero
; SSSE3-NEXT:    pxor %xmm1, %xmm1
; SSSE3-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; SSSE3-NEXT:    punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; SSSE3-NEXT:    retq
;
; SSE41-LABEL: shuf_zext_8i16_to_2i64_offset6:
; SSE41:       # BB#0: # %entry
; SSE41-NEXT:    psrldq {{.*#+}} xmm0 = xmm0[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero
; SSE41-NEXT:    pmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero
; SSE41-NEXT:    retq
;
; AVX-LABEL: shuf_zext_8i16_to_2i64_offset6:
; AVX:       # BB#0: # %entry
; AVX-NEXT:    vpsrldq {{.*#+}} xmm0 = xmm0[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero
; AVX-NEXT:    vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero
; AVX-NEXT:    retq
entry:
  %B = shufflevector <8 x i16> %A, <8 x i16> zeroinitializer, <8 x i32> <i32 3, i32 8, i32 8, i32 8, i32 4, i32 8, i32 8, i32 8>
  %Z = bitcast <8 x i16> %B to <2 x i64>
  ret <2 x i64> %Z
}

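; Zero-extend the four words starting at element 2 of %A into <4 x i64>.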
define <4 x i64> @shuf_zext_8i16_to_4i64_offset2(<8 x i16> %A) nounwind uwtable readnone ssp {
; SSE2-LABEL: shuf_zext_8i16_to_4i64_offset2:
; SSE2:       # BB#0: # %entry
; SSE2-NEXT:    movdqa %xmm0, %xmm1
; SSE2-NEXT:    pxor %xmm2, %xmm2
; SSE2-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
; SSE2-NEXT:    punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm2[2],xmm0[3],xmm2[3]
; SSE2-NEXT:    punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
; SSE2-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
; SSE2-NEXT:    retq
;
; SSSE3-LABEL: shuf_zext_8i16_to_4i64_offset2:
; SSSE3:       # BB#0: # %entry
; SSSE3-NEXT:    movdqa %xmm0, %xmm1
; SSSE3-NEXT:    pxor %xmm2, %xmm2
; SSSE3-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
; SSSE3-NEXT:    punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm2[2],xmm0[3],xmm2[3]
; SSSE3-NEXT:    punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
; SSSE3-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
; SSSE3-NEXT:    retq
;
; SSE41-LABEL: shuf_zext_8i16_to_4i64_offset2:
; SSE41:       # BB#0: # %entry
; SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; SSE41-NEXT:    pmovzxwq {{.*#+}} xmm2 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
; SSE41-NEXT:    pmovzxwq {{.*#+}} xmm1 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero
; SSE41-NEXT:    movdqa %xmm2, %xmm0
; SSE41-NEXT:    retq
;
; AVX1-LABEL: shuf_zext_8i16_to_4i64_offset2:
; AVX1:       # BB#0: # %entry
; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; AVX1-NEXT:    vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
; AVX1-NEXT:    vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero
; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: shuf_zext_8i16_to_4i64_offset2:
; AVX2:       # BB#0: # %entry
; AVX2-NEXT:    vpshufd {{.*#+}} ymm0 = ymm0[1,2,2,3,5,6,6,7]
; AVX2-NEXT:    vpmovzxwq {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
; AVX2-NEXT:    retq
entry:
  %B = shufflevector <8 x i16> %A, <8 x i16> zeroinitializer, <16 x i32> <i32 2, i32 8, i32 8, i32 8, i32 3, i32 8, i32 8, i32 8, i32 4, i32 8, i32 8, i32 8, i32 5, i32 8, i32 8, i32 8>
  %Z = bitcast <16 x i16> %B to <4 x i64>
  ret <4 x i64> %Z
}

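; Zero-extend the four words starting at element 1 of %A into <4 x i32>.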
define <4 x i32> @shuf_zext_8i16_to_4i32_offset1(<8 x i16> %A) nounwind uwtable readnone ssp {
; SSE-LABEL: shuf_zext_8i16_to_4i32_offset1:
; SSE:       # BB#0: # %entry
; SSE-NEXT:    psrldq {{.*#+}} xmm0 = xmm0[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero
; SSE-NEXT:    pxor %xmm1, %xmm1
; SSE-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; SSE-NEXT:    retq
;
; AVX-LABEL: shuf_zext_8i16_to_4i32_offset1:
; AVX:       # BB#0: # %entry
; AVX-NEXT:    vpsrldq {{.*#+}} xmm0 = xmm0[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero
; AVX-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; AVX-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; AVX-NEXT:    retq
entry:
  %B = shufflevector <8 x i16> %A, <8 x i16> zeroinitializer, <8 x i32> <i32 1, i32 8, i32 2, i32 8, i32 3, i32 8, i32 4, i32 8>
  %Z = bitcast <8 x i16> %B to <4 x i32>
  ret <4 x i32> %Z
}

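; Zero-extend words 3-7 of %A into the low five i32 lanes; the upper lanes are undef.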
define <8 x i32> @shuf_zext_8i16_to_8i32_offset3(<8 x i16> %A) nounwind uwtable readnone ssp {
; SSE2-LABEL: shuf_zext_8i16_to_8i32_offset3:
; SSE2:       # BB#0: # %entry
; SSE2-NEXT:    movdqa %xmm0, %xmm1
; SSE2-NEXT:    psrldq {{.*#+}} xmm0 = xmm0[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero
; SSE2-NEXT:    pxor %xmm2, %xmm2
; SSE2-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
; SSE2-NEXT:    psrldq {{.*#+}} xmm1 = xmm1[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; SSE2-NEXT:    retq
;
; SSSE3-LABEL: shuf_zext_8i16_to_8i32_offset3:
; SSSE3:       # BB#0: # %entry
; SSSE3-NEXT:    movdqa %xmm0, %xmm1
; SSSE3-NEXT:    psrldq {{.*#+}} xmm0 = xmm0[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero
; SSSE3-NEXT:    pxor %xmm2, %xmm2
; SSSE3-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
; SSSE3-NEXT:    psrldq {{.*#+}} xmm1 = xmm1[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; SSSE3-NEXT:    retq
;
; SSE41-LABEL: shuf_zext_8i16_to_8i32_offset3:
; SSE41:       # BB#0: # %entry
; SSE41-NEXT:    movdqa %xmm0, %xmm1
; SSE41-NEXT:    pslldq {{.*#+}} xmm0 = zero,zero,xmm0[0,1,2,3,4,5,6,7,8,9,10,11,12,13]
; SSE41-NEXT:    pxor %xmm2, %xmm2
; SSE41-NEXT:    punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
; SSE41-NEXT:    psrldq {{.*#+}} xmm1 = xmm1[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; SSE41-NEXT:    retq
;
; AVX1-LABEL: shuf_zext_8i16_to_8i32_offset3:
; AVX1:       # BB#0: # %entry
; AVX1-NEXT:    vpslldq {{.*#+}} xmm1 = zero,zero,xmm0[0,1,2,3,4,5,6,7,8,9,10,11,12,13]
; AVX1-NEXT:    vpxor %xmm2, %xmm2, %xmm2
; AVX1-NEXT:    vpunpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
; AVX1-NEXT:    vpsrldq {{.*#+}} xmm0 = xmm0[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: shuf_zext_8i16_to_8i32_offset3:
; AVX2:       # BB#0: # %entry
; AVX2-NEXT:    vpsrldq {{.*#+}} ymm0 = ymm0[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm0[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero
; AVX2-NEXT:    vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; AVX2-NEXT:    retq
entry:
  %B = shufflevector <8 x i16> %A, <8 x i16> zeroinitializer, <16 x i32> <i32 3, i32 8, i32 4, i32 8, i32 5, i32 8, i32 6, i32 8, i32 7, i32 8, i32 undef, i32 8, i32 undef, i32 8, i32 undef, i32 8>
  %Z = bitcast <16 x i16> %B to <8 x i32>
  ret <8 x i32> %Z
}

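; Zero-extend the upper half of %A into <8 x i32> (result lanes 5 and 7 are undef).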
define <8 x i32> @shuf_zext_16i16_to_8i32_offset8(<16 x i16> %A) nounwind uwtable readnone ssp {
; SSE2-LABEL: shuf_zext_16i16_to_8i32_offset8:
; SSE2:       # BB#0: # %entry
; SSE2-NEXT:    pxor %xmm2, %xmm2
; SSE2-NEXT:    movdqa %xmm1, %xmm0
; SSE2-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
; SSE2-NEXT:    punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
; SSE2-NEXT:    retq
;
; SSSE3-LABEL: shuf_zext_16i16_to_8i32_offset8:
; SSSE3:       # BB#0: # %entry
; SSSE3-NEXT:    pxor %xmm2, %xmm2
; SSSE3-NEXT:    movdqa %xmm1, %xmm0
; SSSE3-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
; SSSE3-NEXT:    punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
; SSSE3-NEXT:    retq
;
; SSE41-LABEL: shuf_zext_16i16_to_8i32_offset8:
; SSE41:       # BB#0: # %entry
; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[2,2,3,3]
; SSE41-NEXT:    pxor %xmm2, %xmm2
; SSE41-NEXT:    pblendw {{.*#+}} xmm2 = xmm0[0],xmm2[1,2,3],xmm0[4],xmm2[5,6,7]
; SSE41-NEXT:    pmovzxwd {{.*#+}} xmm0 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
; SSE41-NEXT:    movdqa %xmm2, %xmm1
; SSE41-NEXT:    retq
;
; AVX1-LABEL: shuf_zext_16i16_to_8i32_offset8:
; AVX1:       # BB#0: # %entry
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,2,3,3]
; AVX1-NEXT:    vpxor %xmm2, %xmm2, %xmm2
; AVX1-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1,2,3],xmm1[4],xmm2[5,6,7]
; AVX1-NEXT:    vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: shuf_zext_16i16_to_8i32_offset8:
; AVX2:       # BB#0: # %entry
; AVX2-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm0[2,3,0,1]
; AVX2-NEXT:    vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; AVX2-NEXT:    retq
entry:
  %B = shufflevector <16 x i16> %A, <16 x i16> zeroinitializer, <16 x i32> <i32 8, i32 16, i32 9, i32 16, i32 10, i32 16, i32 11, i32 16, i32 12, i32 16, i32 undef, i32 16, i32 14, i32 16, i32 undef, i32 16>
  %Z = bitcast <16 x i16> %B to <8 x i32>
  ret <8 x i32> %Z
}

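; Zero-extend the two dwords starting at element 2 of %A into <2 x i64>.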
define <2 x i64> @shuf_zext_4i32_to_2i64_offset2(<4 x i32> %A) nounwind uwtable readnone ssp {
; SSE-LABEL: shuf_zext_4i32_to_2i64_offset2:
; SSE:       # BB#0: # %entry
; SSE-NEXT:    pxor %xmm1, %xmm1
; SSE-NEXT:    punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; SSE-NEXT:    retq
;
; AVX-LABEL: shuf_zext_4i32_to_2i64_offset2:
; AVX:       # BB#0: # %entry
; AVX-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; AVX-NEXT:    vpunpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; AVX-NEXT:    retq
entry:
  %B = shufflevector <4 x i32> %A, <4 x i32> zeroinitializer, <4 x i32> <i32 2, i32 4, i32 3, i32 4>
  %Z = bitcast <4 x i32> %B to <2 x i64>
  ret <2 x i64> %Z
}

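; Zero-extend dwords 2 and 3 of %A into i64 lanes 1 and 2; lanes 0 and 3 have undef low halves.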
define <4 x i64> @shuf_zext_4i32_to_4i64_offset1(<4 x i32> %A) nounwind uwtable readnone ssp {
; SSE2-LABEL: shuf_zext_4i32_to_4i64_offset1:
; SSE2:       # BB#0: # %entry
; SSE2-NEXT:    movdqa %xmm0, %xmm1
; SSE2-NEXT:    movdqa {{.*#+}} xmm0 = [0,0,4294967295,0]
; SSE2-NEXT:    pand %xmm1, %xmm0
; SSE2-NEXT:    psrldq {{.*#+}} xmm1 = xmm1[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; SSE2-NEXT:    retq
;
; SSSE3-LABEL: shuf_zext_4i32_to_4i64_offset1:
; SSSE3:       # BB#0: # %entry
; SSSE3-NEXT:    movdqa %xmm0, %xmm1
; SSSE3-NEXT:    movdqa {{.*#+}} xmm0 = [0,0,4294967295,0]
; SSSE3-NEXT:    pand %xmm1, %xmm0
; SSSE3-NEXT:    psrldq {{.*#+}} xmm1 = xmm1[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; SSSE3-NEXT:    retq
;
; SSE41-LABEL: shuf_zext_4i32_to_4i64_offset1:
; SSE41:       # BB#0: # %entry
; SSE41-NEXT:    movdqa %xmm0, %xmm1
; SSE41-NEXT:    pxor %xmm0, %xmm0
; SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5],xmm0[6,7]
; SSE41-NEXT:    psrldq {{.*#+}} xmm1 = xmm1[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; SSE41-NEXT:    retq
;
; AVX1-LABEL: shuf_zext_4i32_to_4i64_offset1:
; AVX1:       # BB#0: # %entry
; AVX1-NEXT:    vinsertps {{.*#+}} xmm1 = xmm0[3],zero,zero,zero
; AVX1-NEXT:    vxorps %xmm2, %xmm2, %xmm2
; AVX1-NEXT:    vblendps {{.*#+}} xmm0 = xmm2[0,1],xmm0[2],xmm2[3]
; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: shuf_zext_4i32_to_4i64_offset1:
; AVX2:       # BB#0: # %entry
; AVX2-NEXT:    vpshufd {{.*#+}} ymm0 = ymm0[1,2,3,3,5,6,7,7]
; AVX2-NEXT:    vpmovzxdq {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
; AVX2-NEXT:    retq
entry:
  %B = shufflevector <4 x i32> %A, <4 x i32> zeroinitializer, <8 x i32> <i32 undef, i32 4, i32 2, i32 4, i32 3, i32 4, i32 undef, i32 4>
  %Z = bitcast <8 x i32> %B to <4 x i64>
  ret <4 x i64> %Z
}
