; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -mattr=+sse2 | FileCheck %s --check-prefix=SSE --check-prefix=SSE2
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -mattr=+ssse3 | FileCheck %s --check-prefix=SSE --check-prefix=SSSE3
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -mattr=+sse4.1 | FileCheck %s --check-prefix=SSE --check-prefix=SSE41
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -mattr=+avx | FileCheck %s --check-prefix=AVX --check-prefix=AVX1
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -mattr=+avx2 | FileCheck %s --check-prefix=AVX --check-prefix=AVX2

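; Zero extension of a 128-bit integer vector to 256 bits. Pre-AVX2 targets
; split the extension across two registers, zeroing the high bits with a
; punpck against a zero register or a pand mask; AVX2 widens the whole
; vector with a single vpmovzx instruction.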
define <8 x i32> @zext_8i16_to_8i32(<8 x i16> %A) nounwind uwtable readnone ssp {
; SSE2-LABEL: zext_8i16_to_8i32:
; SSE2:       # BB#0: # %entry
; SSE2-NEXT:    movdqa %xmm0, %xmm1
; SSE2-NEXT:    pxor %xmm2, %xmm2
; SSE2-NEXT:    # kill
; SSE2-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
; SSE2-NEXT:    punpckhwd {{.*#+}} xmm1 = xmm1[4,4,5,5,6,6,7,7]
; SSE2-NEXT:    pand {{.*}}(%rip), %xmm1
; SSE2-NEXT:    retq
;
; SSSE3-LABEL: zext_8i16_to_8i32:
; SSSE3:       # BB#0: # %entry
; SSSE3-NEXT:    movdqa %xmm0, %xmm1
; SSSE3-NEXT:    pxor %xmm2, %xmm2
; SSSE3-NEXT:    # kill
; SSSE3-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
; SSSE3-NEXT:    punpckhwd {{.*#+}} xmm1 = xmm1[4,4,5,5,6,6,7,7]
; SSSE3-NEXT:    pand {{.*}}(%rip), %xmm1
; SSSE3-NEXT:    retq
;
; SSE41-LABEL: zext_8i16_to_8i32:
; SSE41:       # BB#0: # %entry
; SSE41-NEXT:    movdqa %xmm0, %xmm1
; SSE41-NEXT:    pmovzxwd {{.*#+}} xmm0 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
; SSE41-NEXT:    punpckhwd {{.*#+}} xmm1 = xmm1[4,4,5,5,6,6,7,7]
; SSE41-NEXT:    pand {{.*}}(%rip), %xmm1
; SSE41-NEXT:    retq
;
; AVX1-LABEL: zext_8i16_to_8i32:
; AVX1:       # BB#0: # %entry
; AVX1-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; AVX1-NEXT:    vpunpckhwd {{.*#+}} xmm1 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
; AVX1-NEXT:    vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: zext_8i16_to_8i32:
; AVX2:       # BB#0: # %entry
; AVX2-NEXT:    vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; AVX2-NEXT:    retq
entry:
  %B = zext <8 x i16> %A to <8 x i32>
  ret <8 x i32> %B
}

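; The v4i32 -> v4i64 version: pshufd/pand on plain SSE, pmovzxdq for the low
; half on SSE4.1, and a single vpmovzxdq on AVX2.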
define <4 x i64> @zext_4i32_to_4i64(<4 x i32> %A) nounwind uwtable readnone ssp {
; SSE2-LABEL: zext_4i32_to_4i64:
; SSE2:       # BB#0: # %entry
; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[0,1,1,3]
; SSE2-NEXT:    movdqa {{.*#+}} xmm3 = [4294967295,4294967295]
; SSE2-NEXT:    pand %xmm3, %xmm2
; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,2,3,3]
; SSE2-NEXT:    pand %xmm3, %xmm1
; SSE2-NEXT:    movdqa %xmm2, %xmm0
; SSE2-NEXT:    retq
;
; SSSE3-LABEL: zext_4i32_to_4i64:
; SSSE3:       # BB#0: # %entry
; SSSE3-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[0,1,1,3]
; SSSE3-NEXT:    movdqa {{.*#+}} xmm3 = [4294967295,4294967295]
; SSSE3-NEXT:    pand %xmm3, %xmm2
; SSSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,2,3,3]
; SSSE3-NEXT:    pand %xmm3, %xmm1
; SSSE3-NEXT:    movdqa %xmm2, %xmm0
; SSSE3-NEXT:    retq
;
; SSE41-LABEL: zext_4i32_to_4i64:
; SSE41:       # BB#0: # %entry
; SSE41-NEXT:    pmovzxdq {{.*#+}} xmm2 = xmm0[0],zero,xmm0[1],zero
; SSE41-NEXT:    movdqa {{.*#+}} xmm3 = [4294967295,4294967295]
; SSE41-NEXT:    pand %xmm3, %xmm2
; SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,2,3,3]
; SSE41-NEXT:    pand %xmm3, %xmm1
; SSE41-NEXT:    movdqa %xmm2, %xmm0
; SSE41-NEXT:    retq
;
; AVX1-LABEL: zext_4i32_to_4i64:
; AVX1:       # BB#0: # %entry
; AVX1-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; AVX1-NEXT:    vpunpckhdq {{.*#+}} xmm1 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; AVX1-NEXT:    vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: zext_4i32_to_4i64:
; AVX2:       # BB#0: # %entry
; AVX2-NEXT:    vpmovzxdq {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
; AVX2-NEXT:    retq
entry:
  %B = zext <4 x i32> %A to <4 x i64>
  ret <4 x i64> %B
}

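; The v8i8 payload arrives in the low byte of each 16-bit lane, so widening
; also needs a pand with a [255,255,255,255] mask to clear the undefined
; upper bits of each lane.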
define <8 x i32> @zext_8i8_to_8i32(<8 x i8> %z) {
; SSE2-LABEL: zext_8i8_to_8i32:
; SSE2:       # BB#0: # %entry
; SSE2-NEXT:    movdqa %xmm0, %xmm2
; SSE2-NEXT:    punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3]
; SSE2-NEXT:    movdqa {{.*#+}} xmm1 = [255,255,255,255]
; SSE2-NEXT:    pand %xmm1, %xmm2
; SSE2-NEXT:    punpckhwd {{.*#+}} xmm0 = xmm0[4,4,5,5,6,6,7,7]
; SSE2-NEXT:    pand %xmm0, %xmm1
; SSE2-NEXT:    movdqa %xmm2, %xmm0
; SSE2-NEXT:    retq
;
; SSSE3-LABEL: zext_8i8_to_8i32:
; SSSE3:       # BB#0: # %entry
; SSSE3-NEXT:    movdqa %xmm0, %xmm2
; SSSE3-NEXT:    punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3]
; SSSE3-NEXT:    movdqa {{.*#+}} xmm1 = [255,255,255,255]
; SSSE3-NEXT:    pand %xmm1, %xmm2
; SSSE3-NEXT:    punpckhwd {{.*#+}} xmm0 = xmm0[4,4,5,5,6,6,7,7]
; SSSE3-NEXT:    pand %xmm0, %xmm1
; SSSE3-NEXT:    movdqa %xmm2, %xmm0
; SSSE3-NEXT:    retq
;
; SSE41-LABEL: zext_8i8_to_8i32:
; SSE41:       # BB#0: # %entry
; SSE41-NEXT:    pmovzxwd {{.*#+}} xmm2 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
; SSE41-NEXT:    movdqa {{.*#+}} xmm1 = [255,255,255,255]
; SSE41-NEXT:    pand %xmm1, %xmm2
; SSE41-NEXT:    punpckhwd {{.*#+}} xmm0 = xmm0[4,4,5,5,6,6,7,7]
; SSE41-NEXT:    pand %xmm0, %xmm1
; SSE41-NEXT:    movdqa %xmm2, %xmm0
; SSE41-NEXT:    retq
;
; AVX1-LABEL: zext_8i8_to_8i32:
; AVX1:       # BB#0: # %entry
; AVX1-NEXT:    vpmovzxwd {{.*#+}} xmm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
; AVX1-NEXT:    vpunpckhwd {{.*#+}} xmm0 = xmm0[4,4,5,5,6,6,7,7]
; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
; AVX1-NEXT:    vandps {{.*}}(%rip), %ymm0, %ymm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: zext_8i8_to_8i32:
; AVX2:       # BB#0: # %entry
; AVX2-NEXT:    vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; AVX2-NEXT:    vpbroadcastd {{.*}}(%rip), %ymm1
; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm0
; AVX2-NEXT:    retq
entry:
  %t = zext <8 x i8> %z to <8 x i32>
  ret <8 x i32> %t
}

; PR17654
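; v16i8 -> v16i16: the low half comes from punpcklbw/pmovzxbw and the high
; half from punpckhbw; AVX2 does the whole extension with one vpmovzxbw.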
define <16 x i16> @zext_16i8_to_16i16(<16 x i8> %z) {
; SSE2-LABEL: zext_16i8_to_16i16:
; SSE2:       # BB#0: # %entry
; SSE2-NEXT:    movdqa %xmm0, %xmm1
; SSE2-NEXT:    pxor %xmm2, %xmm2
; SSE2-NEXT:    # kill
; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
; SSE2-NEXT:    punpckhbw {{.*#+}} xmm1 = xmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
; SSE2-NEXT:    pand {{.*}}(%rip), %xmm1
; SSE2-NEXT:    retq
;
; SSSE3-LABEL: zext_16i8_to_16i16:
; SSSE3:       # BB#0: # %entry
; SSSE3-NEXT:    movdqa %xmm0, %xmm1
; SSSE3-NEXT:    pxor %xmm2, %xmm2
; SSSE3-NEXT:    # kill
; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
; SSSE3-NEXT:    punpckhbw {{.*#+}} xmm1 = xmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
; SSSE3-NEXT:    pand {{.*}}(%rip), %xmm1
; SSSE3-NEXT:    retq
;
; SSE41-LABEL: zext_16i8_to_16i16:
; SSE41:       # BB#0: # %entry
; SSE41-NEXT:    movdqa %xmm0, %xmm1
; SSE41-NEXT:    pmovzxbw {{.*#+}} xmm0 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
; SSE41-NEXT:    punpckhbw {{.*#+}} xmm1 = xmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
; SSE41-NEXT:    pand {{.*}}(%rip), %xmm1
; SSE41-NEXT:    retq
;
; AVX1-LABEL: zext_16i8_to_16i16:
; AVX1:       # BB#0: # %entry
; AVX1-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; AVX1-NEXT:    vpunpckhbw {{.*#+}} xmm1 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15]
; AVX1-NEXT:    vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: zext_16i8_to_16i16:
; AVX2:       # BB#0: # %entry
; AVX2-NEXT:    vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
; AVX2-NEXT:    retq
entry:
  %t = zext <16 x i8> %z to <16 x i16>
  ret <16 x i16> %t
}

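; When the source is a load, SSE4.1 and AVX1 fold it and zero-extend each
; half straight from memory with pmovzxbw.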
define <16 x i16> @load_zext_16i8_to_16i16(<16 x i8>* %ptr) {
; SSE2-LABEL: load_zext_16i8_to_16i16:
; SSE2:       # BB#0: # %entry
; SSE2-NEXT:    movdqa (%rdi), %xmm1
; SSE2-NEXT:    pxor %xmm2, %xmm2
; SSE2-NEXT:    movdqa %xmm1, %xmm0
; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
; SSE2-NEXT:    punpckhbw {{.*#+}} xmm1 = xmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
; SSE2-NEXT:    pand {{.*}}(%rip), %xmm1
; SSE2-NEXT:    retq
;
; SSSE3-LABEL: load_zext_16i8_to_16i16:
; SSSE3:       # BB#0: # %entry
; SSSE3-NEXT:    movdqa (%rdi), %xmm1
; SSSE3-NEXT:    pxor %xmm2, %xmm2
; SSSE3-NEXT:    movdqa %xmm1, %xmm0
; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
; SSSE3-NEXT:    punpckhbw {{.*#+}} xmm1 = xmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
; SSSE3-NEXT:    pand {{.*}}(%rip), %xmm1
; SSSE3-NEXT:    retq
;
; SSE41-LABEL: load_zext_16i8_to_16i16:
; SSE41:       # BB#0: # %entry
; SSE41-NEXT:    pmovzxbw {{.*#+}} xmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
; SSE41-NEXT:    pmovzxbw {{.*#+}} xmm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
; SSE41-NEXT:    retq
;
; AVX1-LABEL: load_zext_16i8_to_16i16:
; AVX1:       # BB#0: # %entry
; AVX1-NEXT:    vpmovzxbw {{.*#+}} xmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
; AVX1-NEXT:    vpmovzxbw {{.*#+}} xmm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: load_zext_16i8_to_16i16:
; AVX2:       # BB#0: # %entry
; AVX2-NEXT:    vpmovzxbw {{.*#+}} ymm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero
; AVX2-NEXT:    retq
entry:
  %X = load <16 x i8>, <16 x i8>* %ptr
  %Y = zext <16 x i8> %X to <16 x i16>
  ret <16 x i16> %Y
}

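; Load-folded v8i16 -> v8i32 extension via pmovzxwd from memory.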
define <8 x i32> @load_zext_8i16_to_8i32(<8 x i16>* %ptr) {
; SSE2-LABEL: load_zext_8i16_to_8i32:
; SSE2:       # BB#0: # %entry
; SSE2-NEXT:    movdqa (%rdi), %xmm1
; SSE2-NEXT:    pxor %xmm2, %xmm2
; SSE2-NEXT:    movdqa %xmm1, %xmm0
; SSE2-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
; SSE2-NEXT:    punpckhwd {{.*#+}} xmm1 = xmm1[4,4,5,5,6,6,7,7]
; SSE2-NEXT:    pand {{.*}}(%rip), %xmm1
; SSE2-NEXT:    retq
;
; SSSE3-LABEL: load_zext_8i16_to_8i32:
; SSSE3:       # BB#0: # %entry
; SSSE3-NEXT:    movdqa (%rdi), %xmm1
; SSSE3-NEXT:    pxor %xmm2, %xmm2
; SSSE3-NEXT:    movdqa %xmm1, %xmm0
; SSSE3-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
; SSSE3-NEXT:    punpckhwd {{.*#+}} xmm1 = xmm1[4,4,5,5,6,6,7,7]
; SSSE3-NEXT:    pand {{.*}}(%rip), %xmm1
; SSSE3-NEXT:    retq
;
; SSE41-LABEL: load_zext_8i16_to_8i32:
; SSE41:       # BB#0: # %entry
; SSE41-NEXT:    pmovzxwd {{.*#+}} xmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
; SSE41-NEXT:    pmovzxwd {{.*#+}} xmm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
; SSE41-NEXT:    retq
;
; AVX1-LABEL: load_zext_8i16_to_8i32:
; AVX1:       # BB#0: # %entry
; AVX1-NEXT:    vpmovzxwd {{.*#+}} xmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
; AVX1-NEXT:    vpmovzxwd {{.*#+}} xmm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: load_zext_8i16_to_8i32:
; AVX2:       # BB#0: # %entry
; AVX2-NEXT:    vpmovzxwd {{.*#+}} ymm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
; AVX2-NEXT:    retq
entry:
  %X = load <8 x i16>, <8 x i16>* %ptr
  %Y = zext <8 x i16> %X to <8 x i32>
  ret <8 x i32> %Y
}

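; Load-folded v4i32 -> v4i64 extension via pmovzxdq from memory.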
define <4 x i64> @load_zext_4i32_to_4i64(<4 x i32>* %ptr) {
; SSE2-LABEL: load_zext_4i32_to_4i64:
; SSE2:       # BB#0: # %entry
; SSE2-NEXT:    movdqa (%rdi), %xmm1
; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[0,1,1,3]
; SSE2-NEXT:    movdqa {{.*#+}} xmm2 = [4294967295,4294967295]
; SSE2-NEXT:    pand %xmm2, %xmm0
; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[2,2,3,3]
; SSE2-NEXT:    pand %xmm2, %xmm1
; SSE2-NEXT:    retq
;
; SSSE3-LABEL: load_zext_4i32_to_4i64:
; SSSE3:       # BB#0: # %entry
; SSSE3-NEXT:    movdqa (%rdi), %xmm1
; SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[0,1,1,3]
; SSSE3-NEXT:    movdqa {{.*#+}} xmm2 = [4294967295,4294967295]
; SSSE3-NEXT:    pand %xmm2, %xmm0
; SSSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[2,2,3,3]
; SSSE3-NEXT:    pand %xmm2, %xmm1
; SSSE3-NEXT:    retq
;
; SSE41-LABEL: load_zext_4i32_to_4i64:
; SSE41:       # BB#0: # %entry
; SSE41-NEXT:    pmovzxdq {{.*#+}} xmm0 = mem[0],zero,mem[1],zero
; SSE41-NEXT:    pmovzxdq {{.*#+}} xmm1 = mem[0],zero,mem[1],zero
; SSE41-NEXT:    retq
;
; AVX1-LABEL: load_zext_4i32_to_4i64:
; AVX1:       # BB#0: # %entry
; AVX1-NEXT:    vpmovzxdq {{.*#+}} xmm0 = mem[0],zero,mem[1],zero
; AVX1-NEXT:    vpmovzxdq {{.*#+}} xmm1 = mem[0],zero,mem[1],zero
; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: load_zext_4i32_to_4i64:
; AVX2:       # BB#0: # %entry
; AVX2-NEXT:    vpmovzxdq {{.*#+}} ymm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
; AVX2-NEXT:    retq
entry:
  %X = load <4 x i32>, <4 x i32>* %ptr
  %Y = zext <4 x i32> %X to <4 x i64>
  ret <4 x i64> %Y
}

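; A shuffle that interleaves %A with elements of a zero vector is equivalent
; to a zero extension and should lower like zext_8i16_to_8i32 above.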
define <8 x i32> @shuf_zext_8i16_to_8i32(<8 x i16> %A) nounwind uwtable readnone ssp {
; SSE2-LABEL: shuf_zext_8i16_to_8i32:
; SSE2:       # BB#0: # %entry
; SSE2-NEXT:    movdqa %xmm0, %xmm1
; SSE2-NEXT:    pxor %xmm2, %xmm2
; SSE2-NEXT:    # kill
; SSE2-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
; SSE2-NEXT:    punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
; SSE2-NEXT:    retq
;
; SSSE3-LABEL: shuf_zext_8i16_to_8i32:
; SSSE3:       # BB#0: # %entry
; SSSE3-NEXT:    movdqa %xmm0, %xmm1
; SSSE3-NEXT:    pxor %xmm2, %xmm2
; SSSE3-NEXT:    # kill
; SSSE3-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
; SSSE3-NEXT:    punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
; SSSE3-NEXT:    retq
;
; SSE41-LABEL: shuf_zext_8i16_to_8i32:
; SSE41:       # BB#0: # %entry
; SSE41-NEXT:    movdqa %xmm0, %xmm1
; SSE41-NEXT:    pxor %xmm2, %xmm2
; SSE41-NEXT:    pmovzxwd {{.*#+}} xmm0 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
; SSE41-NEXT:    punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
; SSE41-NEXT:    retq
;
; AVX1-LABEL: shuf_zext_8i16_to_8i32:
; AVX1:       # BB#0: # %entry
; AVX1-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; AVX1-NEXT:    vpunpckhwd {{.*#+}} xmm1 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
; AVX1-NEXT:    vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: shuf_zext_8i16_to_8i32:
; AVX2:       # BB#0: # %entry
; AVX2-NEXT:    # kill
; AVX2-NEXT:    vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; AVX2-NEXT:    retq
entry:
  %B = shufflevector <8 x i16> %A, <8 x i16> zeroinitializer, <16 x i32> <i32 0, i32 8, i32 1, i32 8, i32 2, i32 8, i32 3, i32 8, i32 4, i32 8, i32 5, i32 8, i32 6, i32 8, i32 7, i32 8>
  %Z = bitcast <16 x i16> %B to <8 x i32>
  ret <8 x i32> %Z
}

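; The same zero-interleave pattern for v4i32 -> v4i64. Note that AVX1
; currently lowers this with a shuffle sequence instead of vpmovzxdq.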
define <4 x i64> @shuf_zext_4i32_to_4i64(<4 x i32> %A) nounwind uwtable readnone ssp {
; SSE2-LABEL: shuf_zext_4i32_to_4i64:
; SSE2:       # BB#0: # %entry
; SSE2-NEXT:    movdqa %xmm0, %xmm1
; SSE2-NEXT:    pxor %xmm2, %xmm2
; SSE2-NEXT:    # kill
; SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
; SSE2-NEXT:    punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm2[2],xmm1[3],xmm2[3]
; SSE2-NEXT:    retq
;
; SSSE3-LABEL: shuf_zext_4i32_to_4i64:
; SSSE3:       # BB#0: # %entry
; SSSE3-NEXT:    movdqa %xmm0, %xmm1
; SSSE3-NEXT:    pxor %xmm2, %xmm2
; SSSE3-NEXT:    # kill
; SSSE3-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
; SSSE3-NEXT:    punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm2[2],xmm1[3],xmm2[3]
; SSSE3-NEXT:    retq
;
; SSE41-LABEL: shuf_zext_4i32_to_4i64:
; SSE41:       # BB#0: # %entry
; SSE41-NEXT:    movdqa %xmm0, %xmm1
; SSE41-NEXT:    pxor %xmm2, %xmm2
; SSE41-NEXT:    pmovzxdq {{.*#+}} xmm0 = xmm1[0],zero,xmm1[1],zero
; SSE41-NEXT:    punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm2[2],xmm1[3],xmm2[3]
; SSE41-NEXT:    retq
;
; AVX1-LABEL: shuf_zext_4i32_to_4i64:
; AVX1:       # BB#0: # %entry
; AVX1-NEXT:    vinsertps {{.*#+}} xmm1 = xmm0[0],zero,xmm0[1],zero
; AVX1-NEXT:    vxorpd %xmm2, %xmm2, %xmm2
; AVX1-NEXT:    vblendpd {{.*#+}} xmm0 = xmm2[0],xmm0[1]
; AVX1-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[2,0,3,0]
; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: shuf_zext_4i32_to_4i64:
; AVX2:       # BB#0: # %entry
; AVX2-NEXT:    # kill
; AVX2-NEXT:    vpmovzxdq {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
; AVX2-NEXT:    retq
entry:
  %B = shufflevector <4 x i32> %A, <4 x i32> zeroinitializer, <8 x i32> <i32 0, i32 4, i32 1, i32 4, i32 2, i32 4, i32 3, i32 4>
  %Z = bitcast <8 x i32> %B to <4 x i64>
  ret <4 x i64> %Z
}

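; Each i8 element is followed by three zero elements, i.e. a v8i8 -> v8i32
; zero extension written as a <32 x i8> shuffle.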
define <8 x i32> @shuf_zext_8i8_to_8i32(<8 x i8> %A) {
; SSE2-LABEL: shuf_zext_8i8_to_8i32:
; SSE2:       # BB#0: # %entry
; SSE2-NEXT:    pand {{.*}}(%rip), %xmm0
; SSE2-NEXT:    packuswb %xmm0, %xmm0
; SSE2-NEXT:    pxor %xmm1, %xmm1
; SSE2-NEXT:    movdqa %xmm0, %xmm2
; SSE2-NEXT:    punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3],xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7]
; SSE2-NEXT:    punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; SSE2-NEXT:    punpckhwd {{.*#+}} xmm0 = xmm0[4,4,5,5,6,6,7,7]
; SSE2-NEXT:    movdqa {{.*#+}} xmm1 = [0,255,255,255,0,255,255,255,0,255,255,255,0,255,255,255]
; SSE2-NEXT:    pandn %xmm0, %xmm1
; SSE2-NEXT:    movdqa %xmm2, %xmm0
; SSE2-NEXT:    retq
;
; SSSE3-LABEL: shuf_zext_8i8_to_8i32:
; SSSE3:       # BB#0: # %entry
; SSSE3-NEXT:    movdqa %xmm0, %xmm1
; SSSE3-NEXT:    pshufb {{.*#+}} xmm1 = xmm1[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u]
; SSSE3-NEXT:    pxor %xmm2, %xmm2
; SSSE3-NEXT:    movdqa %xmm1, %xmm0
; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
; SSSE3-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
; SSSE3-NEXT:    pshufb {{.*#+}} xmm1 = xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero
; SSSE3-NEXT:    retq
;
; SSE41-LABEL: shuf_zext_8i8_to_8i32:
; SSE41:       # BB#0: # %entry
; SSE41-NEXT:    movdqa %xmm0, %xmm1
; SSE41-NEXT:    pshufb {{.*#+}} xmm1 = xmm1[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u]
; SSE41-NEXT:    pmovzxbd {{.*#+}} xmm0 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero
; SSE41-NEXT:    pshufb {{.*#+}} xmm1 = xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero
; SSE41-NEXT:    retq
;
; AVX1-LABEL: shuf_zext_8i8_to_8i32:
; AVX1:       # BB#0: # %entry
; AVX1-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u]
; AVX1-NEXT:    vpmovzxbd {{.*#+}} xmm1 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
; AVX1-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero
; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: shuf_zext_8i8_to_8i32:
; AVX2:       # BB#0: # %entry
; AVX2-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u]
; AVX2-NEXT:    vpmovzxbd {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero
; AVX2-NEXT:    retq
entry:
  %B = shufflevector <8 x i8> %A, <8 x i8> zeroinitializer, <32 x i32> <i32 0, i32 8, i32 8, i32 8, i32 1, i32 8, i32 8, i32 8, i32 2, i32 8, i32 8, i32 8, i32 3, i32 8, i32 8, i32 8, i32 4, i32 8, i32 8, i32 8, i32 5, i32 8, i32 8, i32 8, i32 6, i32 8, i32 8, i32 8, i32 7, i32 8, i32 8, i32 8>
  %Z = bitcast <32 x i8> %B to <8 x i32>
  ret <8 x i32> %Z
}