; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -mattr=+sse2 | FileCheck %s --check-prefix=SSE --check-prefix=SSE2
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -mattr=+ssse3 | FileCheck %s --check-prefix=SSE --check-prefix=SSSE3
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -mattr=+sse4.1 | FileCheck %s --check-prefix=SSE --check-prefix=SSE41
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -mattr=+avx | FileCheck %s --check-prefix=AVX --check-prefix=AVX1
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -mattr=+avx2 | FileCheck %s --check-prefix=AVX --check-prefix=AVX2
;
; Just one 32-bit run to make sure we do reasonable things there.
; RUN: llc < %s -mtriple=i686-unknown-unknown -mcpu=i686 -mattr=+sse4.1 | FileCheck %s --check-prefix=X32-SSE41
;
; Codegen tests for vector sign extension (sext of vector values, sext of
; vector loads) across SSE2/SSSE3/SSE4.1/AVX/AVX2 and a 32-bit SSE4.1 target.

define <8 x i32> @sext_8i16_to_8i32(<8 x i16> %A) nounwind uwtable readnone ssp {
; SSE2-LABEL: sext_8i16_to_8i32:
; SSE2: # BB#0: # %entry
; SSE2-NEXT: movdqa %xmm0, %xmm1
; SSE2-NEXT: # kill: XMM0<def> XMM1<kill>
; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
; SSE2-NEXT: pslld $16, %xmm0
; SSE2-NEXT: psrad $16, %xmm0
; SSE2-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4,4,5,5,6,6,7,7]
; SSE2-NEXT: pslld $16, %xmm1
; SSE2-NEXT: psrad $16, %xmm1
; SSE2-NEXT: retq
;
; SSSE3-LABEL: sext_8i16_to_8i32:
; SSSE3: # BB#0: # %entry
; SSSE3-NEXT: movdqa %xmm0, %xmm1
; SSSE3-NEXT: # kill: XMM0<def> XMM1<kill>
; SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
; SSSE3-NEXT: pslld $16, %xmm0
; SSSE3-NEXT: psrad $16, %xmm0
; SSSE3-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4,4,5,5,6,6,7,7]
; SSSE3-NEXT: pslld $16, %xmm1
; SSSE3-NEXT: psrad $16, %xmm1
; SSSE3-NEXT: retq
;
; SSE41-LABEL: sext_8i16_to_8i32:
; SSE41: # BB#0: # %entry
; SSE41-NEXT: movdqa %xmm0, %xmm1
; SSE41-NEXT: pmovzxwd %xmm1, %xmm0
; SSE41-NEXT: pslld $16, %xmm0
; SSE41-NEXT: psrad $16, %xmm0
; SSE41-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4,4,5,5,6,6,7,7]
; SSE41-NEXT: pslld $16, %xmm1
; SSE41-NEXT: psrad $16, %xmm1
; SSE41-NEXT: retq
;
; AVX1-LABEL: sext_8i16_to_8i32:
; AVX1: # BB#0: # %entry
; AVX1-NEXT: vpmovsxwd %xmm0, %xmm1
; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
; AVX1-NEXT: vpmovsxwd %xmm0, %xmm0
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: sext_8i16_to_8i32:
; AVX2: # BB#0: # %entry
; AVX2-NEXT: vpmovsxwd %xmm0, %ymm0
; AVX2-NEXT: retq
;
; X32-SSE41-LABEL: sext_8i16_to_8i32:
; X32-SSE41: # BB#0: # %entry
; X32-SSE41-NEXT: movdqa %xmm0, %xmm1
; X32-SSE41-NEXT: pmovzxwd %xmm1, %xmm0
; X32-SSE41-NEXT: pslld $16, %xmm0
; X32-SSE41-NEXT: psrad $16, %xmm0
; X32-SSE41-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4,4,5,5,6,6,7,7]
; X32-SSE41-NEXT: pslld $16, %xmm1
; X32-SSE41-NEXT: psrad $16, %xmm1
; X32-SSE41-NEXT: retl
entry:
  %B = sext <8 x i16> %A to <8 x i32>
  ret <8 x i32>%B
}

define <4 x i64> @sext_4i32_to_4i64(<4 x i32> %A) nounwind uwtable readnone ssp {
; SSE2-LABEL: sext_4i32_to_4i64:
; SSE2: # BB#0: # %entry
; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,1,1,3]
; SSE2-NEXT: movd %xmm1, %rax
; SSE2-NEXT: cltq
; SSE2-NEXT: movd %rax, %xmm2
; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
; SSE2-NEXT: movd %xmm1, %rax
; SSE2-NEXT: cltq
; SSE2-NEXT: movd %rax, %xmm1
; SSE2-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm1[0]
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,2,3,3]
; SSE2-NEXT: movd %xmm0, %rax
; SSE2-NEXT: cltq
; SSE2-NEXT: movd %rax, %xmm1
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
; SSE2-NEXT: movd %xmm0, %rax
; SSE2-NEXT: cltq
; SSE2-NEXT: movd %rax, %xmm0
; SSE2-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0]
; SSE2-NEXT: movdqa %xmm2, %xmm0
; SSE2-NEXT: retq
;
; SSSE3-LABEL: sext_4i32_to_4i64:
; SSSE3: # BB#0: # %entry
; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,1,1,3]
; SSSE3-NEXT: movd %xmm1, %rax
; SSSE3-NEXT: cltq
; SSSE3-NEXT: movd %rax, %xmm2
; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
; SSSE3-NEXT: movd %xmm1, %rax
; SSSE3-NEXT: cltq
; SSSE3-NEXT: movd %rax, %xmm1
; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm1[0]
; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,2,3,3]
; SSSE3-NEXT: movd %xmm0, %rax
; SSSE3-NEXT: cltq
; SSSE3-NEXT: movd %rax, %xmm1
; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
; SSSE3-NEXT: movd %xmm0, %rax
; SSSE3-NEXT: cltq
; SSSE3-NEXT: movd %rax, %xmm0
; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0]
; SSSE3-NEXT: movdqa %xmm2, %xmm0
; SSSE3-NEXT: retq
;
; SSE41-LABEL: sext_4i32_to_4i64:
; SSE41: # BB#0: # %entry
; SSE41-NEXT: pmovzxdq %xmm0, %xmm1
; SSE41-NEXT: pextrq $1, %xmm1, %rax
; SSE41-NEXT: cltq
; SSE41-NEXT: movd %rax, %xmm3
; SSE41-NEXT: movd %xmm1, %rax
; SSE41-NEXT: cltq
; SSE41-NEXT: movd %rax, %xmm2
; SSE41-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0]
; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,2,3,3]
; SSE41-NEXT: pextrq $1, %xmm0, %rax
; SSE41-NEXT: cltq
; SSE41-NEXT: movd %rax, %xmm3
; SSE41-NEXT: movd %xmm0, %rax
; SSE41-NEXT: cltq
; SSE41-NEXT: movd %rax, %xmm1
; SSE41-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm3[0]
; SSE41-NEXT: movdqa %xmm2, %xmm0
; SSE41-NEXT: retq
;
; AVX1-LABEL: sext_4i32_to_4i64:
; AVX1: # BB#0: # %entry
; AVX1-NEXT: vpmovsxdq %xmm0, %xmm1
; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
; AVX1-NEXT: vpmovsxdq %xmm0, %xmm0
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: sext_4i32_to_4i64:
; AVX2: # BB#0: # %entry
; AVX2-NEXT: vpmovsxdq %xmm0, %ymm0
; AVX2-NEXT: retq
;
; X32-SSE41-LABEL: sext_4i32_to_4i64:
; X32-SSE41: # BB#0: # %entry
; X32-SSE41-NEXT: pmovzxdq %xmm0, %xmm2
; X32-SSE41-NEXT: movd %xmm2, %eax
; X32-SSE41-NEXT: sarl $31, %eax
; X32-SSE41-NEXT: pextrd $2, %xmm2, %ecx
; X32-SSE41-NEXT: pinsrd $1, %eax, %xmm2
; X32-SSE41-NEXT: sarl $31, %ecx
; X32-SSE41-NEXT: pinsrd $3, %ecx, %xmm2
; X32-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,2,3,3]
; X32-SSE41-NEXT: movd %xmm1, %eax
; X32-SSE41-NEXT: sarl $31, %eax
; X32-SSE41-NEXT: pextrd $2, %xmm1, %ecx
; X32-SSE41-NEXT: pinsrd $1, %eax, %xmm1
; X32-SSE41-NEXT: sarl $31, %ecx
; X32-SSE41-NEXT: pinsrd $3, %ecx, %xmm1
; X32-SSE41-NEXT: movdqa %xmm2, %xmm0
; X32-SSE41-NEXT: retl
entry:
  %B = sext <4 x i32> %A to <4 x i64>
  ret <4 x i64>%B
}

define <4 x i32> @load_sext_test1(<4 x i16> *%ptr) {
; SSE2-LABEL: load_sext_test1:
; SSE2: # BB#0: # %entry
; SSE2-NEXT: movq (%rdi), %xmm0
; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
; SSE2-NEXT: psrad $16, %xmm0
; SSE2-NEXT: retq
;
; SSSE3-LABEL: load_sext_test1:
; SSSE3: # BB#0: # %entry
; SSSE3-NEXT: movq (%rdi), %xmm0
; SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
; SSSE3-NEXT: psrad $16, %xmm0
; SSSE3-NEXT: retq
;
; SSE41-LABEL: load_sext_test1:
; SSE41: # BB#0: # %entry
; SSE41-NEXT: pmovsxwd (%rdi), %xmm0
; SSE41-NEXT: retq
;
; AVX-LABEL: load_sext_test1:
; AVX: # BB#0: # %entry
; AVX-NEXT: vpmovsxwd (%rdi), %xmm0
; AVX-NEXT: retq
;
; X32-SSE41-LABEL: load_sext_test1:
; X32-SSE41: # BB#0: # %entry
; X32-SSE41-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-SSE41-NEXT: pmovsxwd (%eax), %xmm0
; X32-SSE41-NEXT: retl
entry:
  %X = load <4 x i16>, <4 x i16>* %ptr
  %Y = sext <4 x i16> %X to <4 x i32>
  ret <4 x i32>%Y
}

define <4 x i32> @load_sext_test2(<4 x i8> *%ptr) {
; SSE2-LABEL: load_sext_test2:
; SSE2: # BB#0: # %entry
; SSE2-NEXT: movd (%rdi), %xmm0
; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
; SSE2-NEXT: psrad $24, %xmm0
; SSE2-NEXT: retq
;
; SSSE3-LABEL: load_sext_test2:
; SSSE3: # BB#0: # %entry
; SSSE3-NEXT: movd (%rdi), %xmm0
; SSSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
; SSSE3-NEXT: psrad $24, %xmm0
; SSSE3-NEXT: retq
;
; SSE41-LABEL: load_sext_test2:
; SSE41: # BB#0: # %entry
; SSE41-NEXT: pmovsxbd (%rdi), %xmm0
; SSE41-NEXT: retq
;
; AVX-LABEL: load_sext_test2:
; AVX: # BB#0: # %entry
; AVX-NEXT: vpmovsxbd (%rdi), %xmm0
; AVX-NEXT: retq
;
; X32-SSE41-LABEL: load_sext_test2:
; X32-SSE41: # BB#0: # %entry
; X32-SSE41-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-SSE41-NEXT: pmovsxbd (%eax), %xmm0
; X32-SSE41-NEXT: retl
entry:
  %X = load <4 x i8>, <4 x i8>* %ptr
  %Y = sext <4 x i8> %X to <4 x i32>
  ret <4 x i32>%Y
}

define <2 x i64> @load_sext_test3(<2 x i8> *%ptr) {
; SSE2-LABEL: load_sext_test3:
; SSE2: # BB#0: # %entry
; SSE2-NEXT: movsbq 1(%rdi), %rax
; SSE2-NEXT: movd %rax, %xmm1
; SSE2-NEXT: movsbq (%rdi), %rax
; SSE2-NEXT: movd %rax, %xmm0
; SSE2-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; SSE2-NEXT: retq
;
; SSSE3-LABEL: load_sext_test3:
; SSSE3: # BB#0: # %entry
; SSSE3-NEXT: movsbq 1(%rdi), %rax
; SSSE3-NEXT: movd %rax, %xmm1
; SSSE3-NEXT: movsbq (%rdi), %rax
; SSSE3-NEXT: movd %rax, %xmm0
; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; SSSE3-NEXT: retq
;
; SSE41-LABEL: load_sext_test3:
; SSE41: # BB#0: # %entry
; SSE41-NEXT: pmovsxbq (%rdi), %xmm0
; SSE41-NEXT: retq
;
; AVX-LABEL: load_sext_test3:
; AVX: # BB#0: # %entry
; AVX-NEXT: vpmovsxbq (%rdi), %xmm0
; AVX-NEXT: retq
;
; X32-SSE41-LABEL: load_sext_test3:
; X32-SSE41: # BB#0: # %entry
; X32-SSE41-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-SSE41-NEXT: pmovsxbq (%eax), %xmm0
; X32-SSE41-NEXT: retl
entry:
  %X = load <2 x i8>, <2 x i8>* %ptr
  %Y = sext <2 x i8> %X to <2 x i64>
  ret <2 x i64>%Y
}

define <2 x i64> @load_sext_test4(<2 x i16> *%ptr) {
; SSE2-LABEL: load_sext_test4:
; SSE2: # BB#0: # %entry
; SSE2-NEXT: movswq 2(%rdi), %rax
; SSE2-NEXT: movd %rax, %xmm1
; SSE2-NEXT: movswq (%rdi), %rax
; SSE2-NEXT: movd %rax, %xmm0
; SSE2-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; SSE2-NEXT: retq
;
; SSSE3-LABEL: load_sext_test4:
; SSSE3: # BB#0: # %entry
; SSSE3-NEXT: movswq 2(%rdi), %rax
; SSSE3-NEXT: movd %rax, %xmm1
; SSSE3-NEXT: movswq (%rdi), %rax
; SSSE3-NEXT: movd %rax, %xmm0
; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; SSSE3-NEXT: retq
;
; SSE41-LABEL: load_sext_test4:
; SSE41: # BB#0: # %entry
; SSE41-NEXT: pmovsxwq (%rdi), %xmm0
; SSE41-NEXT: retq
;
; AVX-LABEL: load_sext_test4:
; AVX: # BB#0: # %entry
; AVX-NEXT: vpmovsxwq (%rdi), %xmm0
; AVX-NEXT: retq
;
; X32-SSE41-LABEL: load_sext_test4:
; X32-SSE41: # BB#0: # %entry
; X32-SSE41-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-SSE41-NEXT: pmovsxwq (%eax), %xmm0
; X32-SSE41-NEXT: retl
entry:
  %X = load <2 x i16>, <2 x i16>* %ptr
  %Y = sext <2 x i16> %X to <2 x i64>
  ret <2 x i64>%Y
}

define <2 x i64> @load_sext_test5(<2 x i32> *%ptr) {
; SSE2-LABEL: load_sext_test5:
; SSE2: # BB#0: # %entry
; SSE2-NEXT: movslq 4(%rdi), %rax
; SSE2-NEXT: movd %rax, %xmm1
; SSE2-NEXT: movslq (%rdi), %rax
; SSE2-NEXT: movd %rax, %xmm0
; SSE2-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; SSE2-NEXT: retq
;
; SSSE3-LABEL: load_sext_test5:
; SSSE3: # BB#0: # %entry
; SSSE3-NEXT: movslq 4(%rdi), %rax
; SSSE3-NEXT: movd %rax, %xmm1
; SSSE3-NEXT: movslq (%rdi), %rax
; SSSE3-NEXT: movd %rax, %xmm0
; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; SSSE3-NEXT: retq
;
; SSE41-LABEL: load_sext_test5:
; SSE41: # BB#0: # %entry
; SSE41-NEXT: pmovsxdq (%rdi), %xmm0
; SSE41-NEXT: retq
;
; AVX-LABEL: load_sext_test5:
; AVX: # BB#0: # %entry
; AVX-NEXT: vpmovsxdq (%rdi), %xmm0
; AVX-NEXT: retq
;
; X32-SSE41-LABEL: load_sext_test5:
; X32-SSE41: # BB#0: # %entry
; X32-SSE41-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-SSE41-NEXT: pmovsxdq (%eax), %xmm0
; X32-SSE41-NEXT: retl
entry:
  %X = load <2 x i32>, <2 x i32>* %ptr
  %Y = sext <2 x i32> %X to <2 x i64>
  ret <2 x i64>%Y
}

define <8 x i16> @load_sext_test6(<8 x i8> *%ptr) {
; SSE2-LABEL: load_sext_test6:
; SSE2: # BB#0: # %entry
; SSE2-NEXT: movq (%rdi), %xmm0
; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; SSE2-NEXT: psraw $8, %xmm0
; SSE2-NEXT: retq
;
; SSSE3-LABEL: load_sext_test6:
; SSSE3: # BB#0: # %entry
; SSSE3-NEXT: movq (%rdi), %xmm0
; SSSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; SSSE3-NEXT: psraw $8, %xmm0
; SSSE3-NEXT: retq
;
; SSE41-LABEL: load_sext_test6:
; SSE41: # BB#0: # %entry
; SSE41-NEXT: pmovsxbw (%rdi), %xmm0
; SSE41-NEXT: retq
;
; AVX-LABEL: load_sext_test6:
; AVX: # BB#0: # %entry
; AVX-NEXT: vpmovsxbw (%rdi), %xmm0
; AVX-NEXT: retq
;
; X32-SSE41-LABEL: load_sext_test6:
; X32-SSE41: # BB#0: # %entry
; X32-SSE41-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-SSE41-NEXT: pmovsxbw (%eax), %xmm0
; X32-SSE41-NEXT: retl
entry:
  %X = load <8 x i8>, <8 x i8>* %ptr
  %Y = sext <8 x i8> %X to <8 x i16>
  ret <8 x i16>%Y
}

define <4 x i64> @sext_4i1_to_4i64(<4 x i1> %mask) {
; SSE2-LABEL: sext_4i1_to_4i64:
; SSE2: # BB#0:
; SSE2-NEXT: pslld $31, %xmm0
; SSE2-NEXT: psrad $31, %xmm0
; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,1,1,3]
; SSE2-NEXT: movd %xmm1, %rax
; SSE2-NEXT: cltq
; SSE2-NEXT: movd %rax, %xmm2
; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
; SSE2-NEXT: movd %xmm1, %rax
; SSE2-NEXT: cltq
; SSE2-NEXT: movd %rax, %xmm1
; SSE2-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm1[0]
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,2,3,3]
; SSE2-NEXT: movd %xmm0, %rax
; SSE2-NEXT: cltq
; SSE2-NEXT: movd %rax, %xmm1
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
; SSE2-NEXT: movd %xmm0, %rax
; SSE2-NEXT: cltq
; SSE2-NEXT: movd %rax, %xmm0
; SSE2-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0]
; SSE2-NEXT: movdqa %xmm2, %xmm0
; SSE2-NEXT: retq
;
; SSSE3-LABEL: sext_4i1_to_4i64:
; SSSE3: # BB#0:
; SSSE3-NEXT: pslld $31, %xmm0
; SSSE3-NEXT: psrad $31, %xmm0
; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,1,1,3]
; SSSE3-NEXT: movd %xmm1, %rax
; SSSE3-NEXT: cltq
; SSSE3-NEXT: movd %rax, %xmm2
; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
; SSSE3-NEXT: movd %xmm1, %rax
; SSSE3-NEXT: cltq
; SSSE3-NEXT: movd %rax, %xmm1
; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm1[0]
; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,2,3,3]
; SSSE3-NEXT: movd %xmm0, %rax
; SSSE3-NEXT: cltq
; SSSE3-NEXT: movd %rax, %xmm1
; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
; SSSE3-NEXT: movd %xmm0, %rax
; SSSE3-NEXT: cltq
; SSSE3-NEXT: movd %rax, %xmm0
; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0]
; SSSE3-NEXT: movdqa %xmm2, %xmm0
; SSSE3-NEXT: retq
;
; SSE41-LABEL: sext_4i1_to_4i64:
; SSE41: # BB#0:
; SSE41-NEXT: pslld $31, %xmm0
; SSE41-NEXT: psrad $31, %xmm0
; SSE41-NEXT: pmovzxdq %xmm0, %xmm1
; SSE41-NEXT: pextrq $1, %xmm1, %rax
; SSE41-NEXT: cltq
; SSE41-NEXT: movd %rax, %xmm3
; SSE41-NEXT: movd %xmm1, %rax
; SSE41-NEXT: cltq
; SSE41-NEXT: movd %rax, %xmm2
; SSE41-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0]
; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,2,3,3]
; SSE41-NEXT: pextrq $1, %xmm0, %rax
; SSE41-NEXT: cltq
; SSE41-NEXT: movd %rax, %xmm3
; SSE41-NEXT: movd %xmm0, %rax
; SSE41-NEXT: cltq
; SSE41-NEXT: movd %rax, %xmm1
; SSE41-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm3[0]
; SSE41-NEXT: movdqa %xmm2, %xmm0
; SSE41-NEXT: retq
;
; AVX1-LABEL: sext_4i1_to_4i64:
; AVX1: # BB#0:
; AVX1-NEXT: vpslld $31, %xmm0, %xmm0
; AVX1-NEXT: vpsrad $31, %xmm0, %xmm0
; AVX1-NEXT: vpmovsxdq %xmm0, %xmm1
; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
; AVX1-NEXT: vpmovsxdq %xmm0, %xmm0
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: sext_4i1_to_4i64:
; AVX2: # BB#0:
; AVX2-NEXT: vpslld $31, %xmm0, %xmm0
; AVX2-NEXT: vpsrad $31, %xmm0, %xmm0
; AVX2-NEXT: vpmovsxdq %xmm0, %ymm0
; AVX2-NEXT: retq
;
; X32-SSE41-LABEL: sext_4i1_to_4i64:
; X32-SSE41: # BB#0:
; X32-SSE41-NEXT: pslld $31, %xmm0
; X32-SSE41-NEXT: psrad $31, %xmm0
; X32-SSE41-NEXT: pmovzxdq %xmm0, %xmm2
; X32-SSE41-NEXT: movd %xmm2, %eax
; X32-SSE41-NEXT: sarl $31, %eax
; X32-SSE41-NEXT: pextrd $2, %xmm2, %ecx
; X32-SSE41-NEXT: pinsrd $1, %eax, %xmm2
; X32-SSE41-NEXT: sarl $31, %ecx
; X32-SSE41-NEXT: pinsrd $3, %ecx, %xmm2
; X32-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,2,3,3]
; X32-SSE41-NEXT: movd %xmm1, %eax
; X32-SSE41-NEXT: sarl $31, %eax
; X32-SSE41-NEXT: pextrd $2, %xmm1, %ecx
; X32-SSE41-NEXT: pinsrd $1, %eax, %xmm1
; X32-SSE41-NEXT: sarl $31, %ecx
; X32-SSE41-NEXT: pinsrd $3, %ecx, %xmm1
; X32-SSE41-NEXT: movdqa %xmm2, %xmm0
; X32-SSE41-NEXT: retl
  %extmask = sext <4 x i1> %mask to <4 x i64>
  ret <4 x i64> %extmask
}

define <16 x i16> @sext_16i8_to_16i16(<16 x i8> *%ptr) {
; SSE2-LABEL: sext_16i8_to_16i16:
; SSE2: # BB#0: # %entry
; SSE2-NEXT: movq (%rdi), %xmm0
; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; SSE2-NEXT: psraw $8, %xmm0
; SSE2-NEXT: movq 8(%rdi), %xmm1
; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; SSE2-NEXT: psraw $8, %xmm1
; SSE2-NEXT: retq
;
; SSSE3-LABEL: sext_16i8_to_16i16:
; SSSE3: # BB#0: # %entry
; SSSE3-NEXT: movq (%rdi), %xmm0
; SSSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; SSSE3-NEXT: psraw $8, %xmm0
; SSSE3-NEXT: movq 8(%rdi), %xmm1
; SSSE3-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; SSSE3-NEXT: psraw $8, %xmm1
; SSSE3-NEXT: retq
;
; SSE41-LABEL: sext_16i8_to_16i16:
; SSE41: # BB#0: # %entry
; SSE41-NEXT: pmovsxbw (%rdi), %xmm0
; SSE41-NEXT: pmovsxbw 8(%rdi), %xmm1
; SSE41-NEXT: retq
;
; AVX1-LABEL: sext_16i8_to_16i16:
; AVX1: # BB#0: # %entry
; AVX1-NEXT: vpmovsxbw (%rdi), %xmm0
; AVX1-NEXT: vpmovsxbw 8(%rdi), %xmm1
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: sext_16i8_to_16i16:
; AVX2: # BB#0: # %entry
; AVX2-NEXT: vpmovsxbw (%rdi), %ymm0
; AVX2-NEXT: retq
;
; X32-SSE41-LABEL: sext_16i8_to_16i16:
; X32-SSE41: # BB#0: # %entry
; X32-SSE41-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-SSE41-NEXT: pmovsxbw (%eax), %xmm0
; X32-SSE41-NEXT: pmovsxbw 8(%eax), %xmm1
; X32-SSE41-NEXT: retl
entry:
  %X = load <16 x i8>, <16 x i8>* %ptr
  %Y = sext <16 x i8> %X to <16 x i16>
  ret <16 x i16> %Y
}

define <4 x i64> @sext_4i8_to_4i64(<4 x i8> %mask) {
; SSE2-LABEL: sext_4i8_to_4i64:
; SSE2: # BB#0:
; SSE2-NEXT: pslld $24, %xmm0
; SSE2-NEXT: psrad $24, %xmm0
; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,1,1,3]
; SSE2-NEXT: movd %xmm1, %rax
; SSE2-NEXT: cltq
; SSE2-NEXT: movd %rax, %xmm2
; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
; SSE2-NEXT: movd %xmm1, %rax
; SSE2-NEXT: cltq
; SSE2-NEXT: movd %rax, %xmm1
; SSE2-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm1[0]
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,2,3,3]
; SSE2-NEXT: movd %xmm0, %rax
; SSE2-NEXT: cltq
; SSE2-NEXT: movd %rax, %xmm1
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
; SSE2-NEXT: movd %xmm0, %rax
; SSE2-NEXT: cltq
; SSE2-NEXT: movd %rax, %xmm0
; SSE2-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0]
; SSE2-NEXT: movdqa %xmm2, %xmm0
; SSE2-NEXT: retq
;
; SSSE3-LABEL: sext_4i8_to_4i64:
; SSSE3: # BB#0:
; SSSE3-NEXT: pslld $24, %xmm0
; SSSE3-NEXT: psrad $24, %xmm0
; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,1,1,3]
; SSSE3-NEXT: movd %xmm1, %rax
; SSSE3-NEXT: cltq
; SSSE3-NEXT: movd %rax, %xmm2
; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
; SSSE3-NEXT: movd %xmm1, %rax
; SSSE3-NEXT: cltq
; SSSE3-NEXT: movd %rax, %xmm1
; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm1[0]
; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,2,3,3]
; SSSE3-NEXT: movd %xmm0, %rax
; SSSE3-NEXT: cltq
; SSSE3-NEXT: movd %rax, %xmm1
; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
; SSSE3-NEXT: movd %xmm0, %rax
; SSSE3-NEXT: cltq
; SSSE3-NEXT: movd %rax, %xmm0
; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0]
; SSSE3-NEXT: movdqa %xmm2, %xmm0
; SSSE3-NEXT: retq
;
; SSE41-LABEL: sext_4i8_to_4i64:
; SSE41: # BB#0:
; SSE41-NEXT: pslld $24, %xmm0
; SSE41-NEXT: psrad $24, %xmm0
; SSE41-NEXT: pmovzxdq %xmm0, %xmm1
; SSE41-NEXT: pextrq $1, %xmm1, %rax
; SSE41-NEXT: cltq
; SSE41-NEXT: movd %rax, %xmm3
; SSE41-NEXT: movd %xmm1, %rax
; SSE41-NEXT: cltq
; SSE41-NEXT: movd %rax, %xmm2
; SSE41-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0]
; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,2,3,3]
; SSE41-NEXT: pextrq $1, %xmm0, %rax
; SSE41-NEXT: cltq
; SSE41-NEXT: movd %rax, %xmm3
; SSE41-NEXT: movd %xmm0, %rax
; SSE41-NEXT: cltq
; SSE41-NEXT: movd %rax, %xmm1
; SSE41-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm3[0]
; SSE41-NEXT: movdqa %xmm2, %xmm0
; SSE41-NEXT: retq
;
; AVX1-LABEL: sext_4i8_to_4i64:
; AVX1: # BB#0:
; AVX1-NEXT: vpslld $24, %xmm0, %xmm0
; AVX1-NEXT: vpsrad $24, %xmm0, %xmm0
; AVX1-NEXT: vpmovsxdq %xmm0, %xmm1
; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
; AVX1-NEXT: vpmovsxdq %xmm0, %xmm0
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: sext_4i8_to_4i64:
; AVX2: # BB#0:
; AVX2-NEXT: vpslld $24, %xmm0, %xmm0
; AVX2-NEXT: vpsrad $24, %xmm0, %xmm0
; AVX2-NEXT: vpmovsxdq %xmm0, %ymm0
; AVX2-NEXT: retq
;
; X32-SSE41-LABEL: sext_4i8_to_4i64:
; X32-SSE41: # BB#0:
; X32-SSE41-NEXT: pslld $24, %xmm0
; X32-SSE41-NEXT: psrad $24, %xmm0
; X32-SSE41-NEXT: pmovzxdq %xmm0, %xmm2
; X32-SSE41-NEXT: movd %xmm2, %eax
; X32-SSE41-NEXT: sarl $31, %eax
; X32-SSE41-NEXT: pextrd $2, %xmm2, %ecx
; X32-SSE41-NEXT: pinsrd $1, %eax, %xmm2
; X32-SSE41-NEXT: sarl $31, %ecx
; X32-SSE41-NEXT: pinsrd $3, %ecx, %xmm2
; X32-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,2,3,3]
; X32-SSE41-NEXT: movd %xmm1, %eax
; X32-SSE41-NEXT: sarl $31, %eax
; X32-SSE41-NEXT: pextrd $2, %xmm1, %ecx
; X32-SSE41-NEXT: pinsrd $1, %eax, %xmm1
; X32-SSE41-NEXT: sarl $31, %ecx
; X32-SSE41-NEXT: pinsrd $3, %ecx, %xmm1
; X32-SSE41-NEXT: movdqa %xmm2, %xmm0
; X32-SSE41-NEXT: retl
  %extmask = sext <4 x i8> %mask to <4 x i64>
  ret <4 x i64> %extmask
}

define <4 x i64> @load_sext_4i8_to_4i64(<4 x i8> *%ptr) {
; SSE2-LABEL: load_sext_4i8_to_4i64:
; SSE2: # BB#0: # %entry
; SSE2-NEXT: movsbq 1(%rdi), %rax
; SSE2-NEXT: movd %rax, %xmm1
; SSE2-NEXT: movsbq (%rdi), %rax
; SSE2-NEXT: movd %rax, %xmm0
; SSE2-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; SSE2-NEXT: movsbq 3(%rdi), %rax
; SSE2-NEXT: movd %rax, %xmm2
; SSE2-NEXT: movsbq 2(%rdi), %rax
; SSE2-NEXT: movd %rax, %xmm1
; SSE2-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
; SSE2-NEXT: retq
;
; SSSE3-LABEL: load_sext_4i8_to_4i64:
; SSSE3: # BB#0: # %entry
; SSSE3-NEXT: movsbq 1(%rdi), %rax
; SSSE3-NEXT: movd %rax, %xmm1
; SSSE3-NEXT: movsbq (%rdi), %rax
; SSSE3-NEXT: movd %rax, %xmm0
; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; SSSE3-NEXT: movsbq 3(%rdi), %rax
; SSSE3-NEXT: movd %rax, %xmm2
; SSSE3-NEXT: movsbq 2(%rdi), %rax
; SSSE3-NEXT: movd %rax, %xmm1
; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
; SSSE3-NEXT: retq
;
; SSE41-LABEL: load_sext_4i8_to_4i64:
; SSE41: # BB#0: # %entry
; SSE41-NEXT: pmovsxbq (%rdi), %xmm0
; SSE41-NEXT: pmovsxbq 2(%rdi), %xmm1
; SSE41-NEXT: retq
;
; AVX1-LABEL: load_sext_4i8_to_4i64:
; AVX1: # BB#0: # %entry
; AVX1-NEXT: vpmovsxbd (%rdi), %xmm0
; AVX1-NEXT: vpmovsxdq %xmm0, %xmm1
; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
; AVX1-NEXT: vpmovsxdq %xmm0, %xmm0
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: load_sext_4i8_to_4i64:
; AVX2: # BB#0: # %entry
; AVX2-NEXT: vpmovsxbq (%rdi), %ymm0
; AVX2-NEXT: retq
;
; X32-SSE41-LABEL: load_sext_4i8_to_4i64:
; X32-SSE41: # BB#0: # %entry
; X32-SSE41-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-SSE41-NEXT: pmovsxbq (%eax), %xmm0
; X32-SSE41-NEXT: pmovsxbq 2(%eax), %xmm1
; X32-SSE41-NEXT: retl
entry:
  %X = load <4 x i8>, <4 x i8>* %ptr
  %Y = sext <4 x i8> %X to <4 x i64>
  ret <4 x i64>%Y
}

define <4 x i64> @load_sext_4i16_to_4i64(<4 x i16> *%ptr) {
; SSE2-LABEL: load_sext_4i16_to_4i64:
; SSE2: # BB#0: # %entry
; SSE2-NEXT: movswq 2(%rdi), %rax
; SSE2-NEXT: movd %rax, %xmm1
; SSE2-NEXT: movswq (%rdi), %rax
; SSE2-NEXT: movd %rax, %xmm0
; SSE2-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; SSE2-NEXT: movswq 6(%rdi), %rax
; SSE2-NEXT: movd %rax, %xmm2
; SSE2-NEXT: movswq 4(%rdi), %rax
; SSE2-NEXT: movd %rax, %xmm1
; SSE2-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
; SSE2-NEXT: retq
;
; SSSE3-LABEL: load_sext_4i16_to_4i64:
; SSSE3: # BB#0: # %entry
; SSSE3-NEXT: movswq 2(%rdi), %rax
; SSSE3-NEXT: movd %rax, %xmm1
; SSSE3-NEXT: movswq (%rdi), %rax
; SSSE3-NEXT: movd %rax, %xmm0
; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; SSSE3-NEXT: movswq 6(%rdi), %rax
; SSSE3-NEXT: movd %rax, %xmm2
; SSSE3-NEXT: movswq 4(%rdi), %rax
; SSSE3-NEXT: movd %rax, %xmm1
; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
; SSSE3-NEXT: retq
;
; SSE41-LABEL: load_sext_4i16_to_4i64:
; SSE41: # BB#0: # %entry
; SSE41-NEXT: pmovsxwq (%rdi), %xmm0
; SSE41-NEXT: pmovsxwq 4(%rdi), %xmm1
; SSE41-NEXT: retq
;
; AVX1-LABEL: load_sext_4i16_to_4i64:
; AVX1: # BB#0: # %entry
; AVX1-NEXT: vpmovsxwd (%rdi), %xmm0
; AVX1-NEXT: vpmovsxdq %xmm0, %xmm1
; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
; AVX1-NEXT: vpmovsxdq %xmm0, %xmm0
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: load_sext_4i16_to_4i64:
; AVX2: # BB#0: # %entry
; AVX2-NEXT: vpmovsxwq (%rdi), %ymm0
; AVX2-NEXT: retq
;
; X32-SSE41-LABEL: load_sext_4i16_to_4i64:
; X32-SSE41: # BB#0: # %entry
; X32-SSE41-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-SSE41-NEXT: pmovsxwq (%eax), %xmm0
; X32-SSE41-NEXT: pmovsxwq 4(%eax), %xmm1
; X32-SSE41-NEXT: retl
entry:
  %X = load <4 x i16>, <4 x i16>* %ptr
  %Y = sext <4 x i16> %X to <4 x i64>
  ret <4 x i64>%Y
}