; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -mattr=+sse2 | FileCheck %s --check-prefix=SSE --check-prefix=SSE2
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -mattr=+ssse3 | FileCheck %s --check-prefix=SSE --check-prefix=SSSE3
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -mattr=+sse4.1 | FileCheck %s --check-prefix=SSE --check-prefix=SSE41
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -mattr=+avx | FileCheck %s --check-prefix=AVX --check-prefix=AVX1
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -mattr=+avx2 | FileCheck %s --check-prefix=AVX --check-prefix=AVX2

define <8 x i32> @zext_8i16_to_8i32(<8 x i16> %A) nounwind uwtable readnone ssp {
; SSE2-LABEL: zext_8i16_to_8i32:
; SSE2: # BB#0: # %entry
; SSE2-NEXT: movdqa %xmm0, %xmm1
; SSE2-NEXT: pxor %xmm2, %xmm2
; SSE2-NEXT: # kill
; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
; SSE2-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4,4,5,5,6,6,7,7]
; SSE2-NEXT: pand .LCPI0_0(%rip), %xmm1
; SSE2-NEXT: retq
;
; SSSE3-LABEL: zext_8i16_to_8i32:
; SSSE3: # BB#0: # %entry
; SSSE3-NEXT: movdqa %xmm0, %xmm1
; SSSE3-NEXT: pxor %xmm2, %xmm2
; SSSE3-NEXT: # kill
; SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
; SSSE3-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4,4,5,5,6,6,7,7]
; SSSE3-NEXT: pand .LCPI0_0(%rip), %xmm1
; SSSE3-NEXT: retq
;
; SSE41-LABEL: zext_8i16_to_8i32:
; SSE41: # BB#0: # %entry
; SSE41-NEXT: movdqa %xmm0, %xmm1
; SSE41-NEXT: pmovzxwd {{.*#+}} xmm0 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
; SSE41-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4,4,5,5,6,6,7,7]
; SSE41-NEXT: pand .LCPI0_0(%rip), %xmm1
; SSE41-NEXT: retq
;
; AVX1-LABEL: zext_8i16_to_8i32:
; AVX1: # BB#0: # %entry
; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: zext_8i16_to_8i32:
; AVX2: # BB#0: # %entry
; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; AVX2-NEXT: retq
entry:
  %B = zext <8 x i16> %A to <8 x i32>
  ret <8 x i32> %B
}
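
; The same widening pattern for i32 -> i64, and then for an input (<8 x i8>)
; that is not a legal vector type. For the <8 x i8> source even AVX2 keeps a
; masking vpand, since the high bits of the promoted input are not known zero.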
define <4 x i64> @zext_4i32_to_4i64(<4 x i32> %A) nounwind uwtable readnone ssp {
; SSE2-LABEL: zext_4i32_to_4i64:
; SSE2: # BB#0: # %entry
; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[0,1,1,3]
; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [4294967295,4294967295]
; SSE2-NEXT: pand %xmm3, %xmm2
; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,2,3,3]
; SSE2-NEXT: pand %xmm3, %xmm1
; SSE2-NEXT: movdqa %xmm2, %xmm0
; SSE2-NEXT: retq
;
; SSSE3-LABEL: zext_4i32_to_4i64:
; SSSE3: # BB#0: # %entry
; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm0[0,1,1,3]
; SSSE3-NEXT: movdqa {{.*#+}} xmm3 = [4294967295,4294967295]
; SSSE3-NEXT: pand %xmm3, %xmm2
; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,2,3,3]
; SSSE3-NEXT: pand %xmm3, %xmm1
; SSSE3-NEXT: movdqa %xmm2, %xmm0
; SSSE3-NEXT: retq
;
; SSE41-LABEL: zext_4i32_to_4i64:
; SSE41: # BB#0: # %entry
; SSE41-NEXT: pmovzxdq {{.*#+}} xmm2 = xmm0[0],zero,xmm0[1],zero
; SSE41-NEXT: movdqa {{.*#+}} xmm3 = [4294967295,4294967295]
; SSE41-NEXT: pand %xmm3, %xmm2
; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,2,3,3]
; SSE41-NEXT: pand %xmm3, %xmm1
; SSE41-NEXT: movdqa %xmm2, %xmm0
; SSE41-NEXT: retq
;
; AVX1-LABEL: zext_4i32_to_4i64:
; AVX1: # BB#0: # %entry
; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm1 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: zext_4i32_to_4i64:
; AVX2: # BB#0: # %entry
; AVX2-NEXT: vpmovzxdq {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
; AVX2-NEXT: retq
entry:
  %B = zext <4 x i32> %A to <4 x i64>
  ret <4 x i64> %B
}

define <8 x i32> @zext_8i8_to_8i32(<8 x i8> %z) {
; SSE2-LABEL: zext_8i8_to_8i32:
; SSE2: # BB#0: # %entry
; SSE2-NEXT: movdqa %xmm0, %xmm2
; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3]
; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [255,255,255,255]
; SSE2-NEXT: pand %xmm1, %xmm2
; SSE2-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4,4,5,5,6,6,7,7]
; SSE2-NEXT: pand %xmm0, %xmm1
; SSE2-NEXT: movdqa %xmm2, %xmm0
; SSE2-NEXT: retq
;
; SSSE3-LABEL: zext_8i8_to_8i32:
; SSSE3: # BB#0: # %entry
; SSSE3-NEXT: movdqa %xmm0, %xmm2
; SSSE3-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3]
; SSSE3-NEXT: movdqa {{.*#+}} xmm1 = [255,255,255,255]
; SSSE3-NEXT: pand %xmm1, %xmm2
; SSSE3-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4,4,5,5,6,6,7,7]
; SSSE3-NEXT: pand %xmm0, %xmm1
; SSSE3-NEXT: movdqa %xmm2, %xmm0
; SSSE3-NEXT: retq
;
; SSE41-LABEL: zext_8i8_to_8i32:
; SSE41: # BB#0: # %entry
; SSE41-NEXT: pmovzxwd {{.*#+}} xmm2 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
; SSE41-NEXT: movdqa {{.*#+}} xmm1 = [255,255,255,255]
; SSE41-NEXT: pand %xmm1, %xmm2
; SSE41-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4,4,5,5,6,6,7,7]
; SSE41-NEXT: pand %xmm0, %xmm1
; SSE41-NEXT: movdqa %xmm2, %xmm0
; SSE41-NEXT: retq
;
; AVX1-LABEL: zext_8i8_to_8i32:
; AVX1: # BB#0: # %entry
; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4,4,5,5,6,6,7,7]
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
; AVX1-NEXT: vandps {{.*}}(%rip), %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: zext_8i8_to_8i32:
; AVX2: # BB#0: # %entry
; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; AVX2-NEXT: vpbroadcastd {{.*}}(%rip), %ymm1
; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0
; AVX2-NEXT: retq
entry:
  %t = zext <8 x i8> %z to <8 x i32>
  ret <8 x i32> %t
}
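
; Full 128-bit i8 -> i16 zext: AVX2 should fold it into a single vpmovzxbw.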
; PR17654
define <16 x i16> @zext_16i8_to_16i16(<16 x i8> %z) {
; SSE2-LABEL: zext_16i8_to_16i16:
; SSE2: # BB#0: # %entry
; SSE2-NEXT: movdqa %xmm0, %xmm1
; SSE2-NEXT: pxor %xmm2, %xmm2
; SSE2-NEXT: # kill
; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
; SSE2-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
; SSE2-NEXT: pand .LCPI3_0(%rip), %xmm1
; SSE2-NEXT: retq
;
; SSSE3-LABEL: zext_16i8_to_16i16:
; SSSE3: # BB#0: # %entry
; SSSE3-NEXT: movdqa %xmm0, %xmm1
; SSSE3-NEXT: pxor %xmm2, %xmm2
; SSSE3-NEXT: # kill
; SSSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
; SSSE3-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
; SSSE3-NEXT: pand .LCPI3_0(%rip), %xmm1
; SSSE3-NEXT: retq
;
; SSE41-LABEL: zext_16i8_to_16i16:
; SSE41: # BB#0: # %entry
; SSE41-NEXT: movdqa %xmm0, %xmm1
; SSE41-NEXT: pmovzxbw %xmm1, %xmm0 {{.*#+}} xmm0 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
; SSE41-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
; SSE41-NEXT: pand .LCPI3_0(%rip), %xmm1
; SSE41-NEXT: retq
;
; AVX1-LABEL: zext_16i8_to_16i16:
; AVX1: # BB#0: # %entry
; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15]
; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: zext_16i8_to_16i16:
; AVX2: # BB#0: # %entry
; AVX2-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
; AVX2-NEXT: retq
entry:
  %t = zext <16 x i8> %z to <16 x i16>
  ret <16 x i16> %t
}
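
; The load_zext tests check that the zero extension is folded into the load:
; on SSE4.1 and AVX the pmovzx*/vpmovzx* instructions read directly from memory.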
define <16 x i16> @load_zext_16i8_to_16i16(<16 x i8> *%ptr) {
; SSE2-LABEL: load_zext_16i8_to_16i16:
; SSE2: # BB#0: # %entry
; SSE2-NEXT: movdqa (%rdi), %xmm1
; SSE2-NEXT: pxor %xmm2, %xmm2
; SSE2-NEXT: movdqa %xmm1, %xmm0
; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
; SSE2-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
; SSE2-NEXT: pand .LCPI4_0(%rip), %xmm1
; SSE2-NEXT: retq
;
; SSSE3-LABEL: load_zext_16i8_to_16i16:
; SSSE3: # BB#0: # %entry
; SSSE3-NEXT: movdqa (%rdi), %xmm1
; SSSE3-NEXT: pxor %xmm2, %xmm2
; SSSE3-NEXT: movdqa %xmm1, %xmm0
; SSSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
; SSSE3-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
; SSSE3-NEXT: pand .LCPI4_0(%rip), %xmm1
; SSSE3-NEXT: retq
;
; SSE41-LABEL: load_zext_16i8_to_16i16:
; SSE41: # BB#0: # %entry
; SSE41-NEXT: pmovzxbw {{.*#+}} xmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
; SSE41-NEXT: pmovzxbw {{.*#+}} xmm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
; SSE41-NEXT: retq
;
; AVX1-LABEL: load_zext_16i8_to_16i16:
; AVX1: # BB#0: # %entry
; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: load_zext_16i8_to_16i16:
; AVX2: # BB#0: # %entry
; AVX2-NEXT: vpmovzxbw {{.*#+}} ymm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero
; AVX2-NEXT: retq
entry:
  %X = load <16 x i8>, <16 x i8>* %ptr
  %Y = zext <16 x i8> %X to <16 x i16>
  ret <16 x i16> %Y
}

define <8 x i32> @load_zext_8i16_to_8i32(<8 x i16> *%ptr) {
; SSE2-LABEL: load_zext_8i16_to_8i32:
; SSE2: # BB#0: # %entry
; SSE2-NEXT: movdqa (%rdi), %xmm1
; SSE2-NEXT: pxor %xmm2, %xmm2
; SSE2-NEXT: movdqa %xmm1, %xmm0
; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
; SSE2-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4,4,5,5,6,6,7,7]
; SSE2-NEXT: pand .LCPI5_0(%rip), %xmm1
; SSE2-NEXT: retq
;
; SSSE3-LABEL: load_zext_8i16_to_8i32:
; SSSE3: # BB#0: # %entry
; SSSE3-NEXT: movdqa (%rdi), %xmm1
; SSSE3-NEXT: pxor %xmm2, %xmm2
; SSSE3-NEXT: movdqa %xmm1, %xmm0
; SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
; SSSE3-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4,4,5,5,6,6,7,7]
; SSSE3-NEXT: pand .LCPI5_0(%rip), %xmm1
; SSSE3-NEXT: retq
;
; SSE41-LABEL: load_zext_8i16_to_8i32:
; SSE41: # BB#0: # %entry
; SSE41-NEXT: pmovzxwd {{.*#+}} xmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
; SSE41-NEXT: pmovzxwd {{.*#+}} xmm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
; SSE41-NEXT: retq
;
; AVX1-LABEL: load_zext_8i16_to_8i32:
; AVX1: # BB#0: # %entry
; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: load_zext_8i16_to_8i32:
; AVX2: # BB#0: # %entry
; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
; AVX2-NEXT: retq
entry:
  %X = load <8 x i16>, <8 x i16>* %ptr
  %Y = zext <8 x i16> %X to <8 x i32>
  ret <8 x i32> %Y
}

define <4 x i64> @load_zext_4i32_to_4i64(<4 x i32> *%ptr) {
; SSE2-LABEL: load_zext_4i32_to_4i64:
; SSE2: # BB#0: # %entry
; SSE2-NEXT: movdqa (%rdi), %xmm1
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,1,1,3]
; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [4294967295,4294967295]
; SSE2-NEXT: pand %xmm2, %xmm0
; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,2,3,3]
; SSE2-NEXT: pand %xmm2, %xmm1
; SSE2-NEXT: retq
;
; SSSE3-LABEL: load_zext_4i32_to_4i64:
; SSSE3: # BB#0: # %entry
; SSSE3-NEXT: movdqa (%rdi), %xmm1
; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,1,1,3]
; SSSE3-NEXT: movdqa {{.*#+}} xmm2 = [4294967295,4294967295]
; SSSE3-NEXT: pand %xmm2, %xmm0
; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,2,3,3]
; SSSE3-NEXT: pand %xmm2, %xmm1
; SSSE3-NEXT: retq
;
; SSE41-LABEL: load_zext_4i32_to_4i64:
; SSE41: # BB#0: # %entry
; SSE41-NEXT: pmovzxdq {{.*#+}} xmm0 = mem[0],zero,mem[1],zero
; SSE41-NEXT: pmovzxdq {{.*#+}} xmm1 = mem[0],zero,mem[1],zero
; SSE41-NEXT: retq
;
; AVX1-LABEL: load_zext_4i32_to_4i64:
; AVX1: # BB#0: # %entry
; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm0 = mem[0],zero,mem[1],zero
; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm1 = mem[0],zero,mem[1],zero
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: load_zext_4i32_to_4i64:
; AVX2: # BB#0: # %entry
; AVX2-NEXT: vpmovzxdq {{.*#+}} ymm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
; AVX2-NEXT: retq
entry:
  %X = load <4 x i32>, <4 x i32>* %ptr
  %Y = zext <4 x i32> %X to <4 x i64>
  ret <4 x i64> %Y
}
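
; The shuf_zext tests express the zero extension as a shufflevector with a zero
; vector followed by a bitcast; codegen should still recognize the combined
; pattern as a zext.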
define <8 x i32> @shuf_zext_8i16_to_8i32(<8 x i16> %A) nounwind uwtable readnone ssp {
; SSE2-LABEL: shuf_zext_8i16_to_8i32:
; SSE2: # BB#0: # %entry
; SSE2-NEXT: movdqa %xmm0, %xmm1
; SSE2-NEXT: pxor %xmm2, %xmm2
; SSE2-NEXT: # kill
; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
; SSE2-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
; SSE2-NEXT: retq
;
; SSSE3-LABEL: shuf_zext_8i16_to_8i32:
; SSSE3: # BB#0: # %entry
; SSSE3-NEXT: movdqa %xmm0, %xmm1
; SSSE3-NEXT: pxor %xmm2, %xmm2
; SSSE3-NEXT: # kill
; SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
; SSSE3-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
; SSSE3-NEXT: retq
;
; SSE41-LABEL: shuf_zext_8i16_to_8i32:
; SSE41: # BB#0: # %entry
; SSE41-NEXT: movdqa %xmm0, %xmm1
; SSE41-NEXT: pxor %xmm2, %xmm2
; SSE41-NEXT: pmovzxwd {{.*#+}} xmm0 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
; SSE41-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
; SSE41-NEXT: retq
;
; AVX1-LABEL: shuf_zext_8i16_to_8i32:
; AVX1: # BB#0: # %entry
; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: shuf_zext_8i16_to_8i32:
; AVX2: # BB#0: # %entry
; AVX2-NEXT: # kill
; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; AVX2-NEXT: retq
entry:
  %B = shufflevector <8 x i16> %A, <8 x i16> zeroinitializer, <16 x i32> <i32 0, i32 8, i32 1, i32 8, i32 2, i32 8, i32 3, i32 8, i32 4, i32 8, i32 5, i32 8, i32 6, i32 8, i32 7, i32 8>
  %Z = bitcast <16 x i16> %B to <8 x i32>
  ret <8 x i32> %Z
}

define <4 x i64> @shuf_zext_4i32_to_4i64(<4 x i32> %A) nounwind uwtable readnone ssp {
; SSE2-LABEL: shuf_zext_4i32_to_4i64:
; SSE2: # BB#0: # %entry
; SSE2-NEXT: movdqa %xmm0, %xmm1
; SSE2-NEXT: pxor %xmm2, %xmm2
; SSE2-NEXT: # kill
; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
; SSE2-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm2[2],xmm1[3],xmm2[3]
; SSE2-NEXT: retq
;
; SSSE3-LABEL: shuf_zext_4i32_to_4i64:
; SSSE3: # BB#0: # %entry
; SSSE3-NEXT: movdqa %xmm0, %xmm1
; SSSE3-NEXT: pxor %xmm2, %xmm2
; SSSE3-NEXT: # kill
; SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
; SSSE3-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm2[2],xmm1[3],xmm2[3]
; SSSE3-NEXT: retq
;
; SSE41-LABEL: shuf_zext_4i32_to_4i64:
; SSE41: # BB#0: # %entry
; SSE41-NEXT: movdqa %xmm0, %xmm1
; SSE41-NEXT: pxor %xmm2, %xmm2
; SSE41-NEXT: pmovzxdq {{.*#+}} xmm0 = xmm1[0],zero,xmm1[1],zero
; SSE41-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm2[2],xmm1[3],xmm2[3]
; SSE41-NEXT: retq
;
; AVX1-LABEL: shuf_zext_4i32_to_4i64:
; AVX1: # BB#0: # %entry
; AVX1-NEXT: vinsertps {{.*#+}} xmm1 = xmm0[0],zero,xmm0[1],zero
; AVX1-NEXT: vxorpd %xmm2, %xmm2, %xmm2
; AVX1-NEXT: vblendpd {{.*#+}} xmm0 = xmm2[0],xmm0[1]
; AVX1-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[2,0,3,0]
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: shuf_zext_4i32_to_4i64:
; AVX2: # BB#0: # %entry
; AVX2-NEXT: # kill
; AVX2-NEXT: vpmovzxdq {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
; AVX2-NEXT: retq
entry:
  %B = shufflevector <4 x i32> %A, <4 x i32> zeroinitializer, <8 x i32> <i32 0, i32 4, i32 1, i32 4, i32 2, i32 4, i32 3, i32 4>
  %Z = bitcast <8 x i32> %B to <4 x i64>
  ret <4 x i64> %Z
}
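
; As above, but from an illegal <8 x i8> source: the bytes are first packed
; together (pshufb on SSSE3 and later) before being zero-extended.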
define <8 x i32> @shuf_zext_8i8_to_8i32(<8 x i8> %A) {
; SSE2-LABEL: shuf_zext_8i8_to_8i32:
; SSE2: # BB#0: # %entry
; SSE2-NEXT: pand .LCPI9_0(%rip), %xmm0
; SSE2-NEXT: packuswb %xmm0, %xmm0
; SSE2-NEXT: pxor %xmm1, %xmm1
; SSE2-NEXT: movdqa %xmm0, %xmm2
; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3],xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7]
; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; SSE2-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4,4,5,5,6,6,7,7]
; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [0,255,255,255,0,255,255,255,0,255,255,255,0,255,255,255]
; SSE2-NEXT: pandn %xmm0, %xmm1
; SSE2-NEXT: movdqa %xmm2, %xmm0
; SSE2-NEXT: retq
;
; SSSE3-LABEL: shuf_zext_8i8_to_8i32:
; SSSE3: # BB#0: # %entry
; SSSE3-NEXT: movdqa %xmm0, %xmm1
; SSSE3-NEXT: pshufb {{.*#+}} xmm1 = xmm1[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u]
; SSSE3-NEXT: pxor %xmm2, %xmm2
; SSSE3-NEXT: movdqa %xmm1, %xmm0
; SSSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
; SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
; SSSE3-NEXT: pshufb {{.*#+}} xmm1 = xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero
; SSSE3-NEXT: retq
;
; SSE41-LABEL: shuf_zext_8i8_to_8i32:
; SSE41: # BB#0: # %entry
; SSE41-NEXT: movdqa %xmm0, %xmm1
; SSE41-NEXT: pshufb {{.*#+}} xmm1 = xmm1[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u]
; SSE41-NEXT: pmovzxbd {{.*#+}} xmm0 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero
; SSE41-NEXT: pshufb {{.*#+}} xmm1 = xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero
; SSE41-NEXT: retq
;
; AVX1-LABEL: shuf_zext_8i8_to_8i32:
; AVX1: # BB#0: # %entry
; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u]
; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm1 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: shuf_zext_8i8_to_8i32:
; AVX2: # BB#0: # %entry
; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u]
; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero
; AVX2-NEXT: retq
entry:
  %B = shufflevector <8 x i8> %A, <8 x i8> zeroinitializer, <32 x i32> <i32 0, i32 8, i32 8, i32 8, i32 1, i32 8, i32 8, i32 8, i32 2, i32 8, i32 8, i32 8, i32 3, i32 8, i32 8, i32 8, i32 4, i32 8, i32 8, i32 8, i32 5, i32 8, i32 8, i32 8, i32 6, i32 8, i32 8, i32 8, i32 7, i32 8, i32 8, i32 8>
  %Z = bitcast <32 x i8> %B to <8 x i32>
  ret <8 x i32> %Z
}