1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py 2; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefixes=SSE,SSE2 3; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefixes=SSE,SSE41 4; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefixes=AVX,AVX1 5; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=AVX,AVX2 6; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+xop,+avx | FileCheck %s --check-prefixes=XOP,XOPAVX1 7; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+xop,+avx2 | FileCheck %s --check-prefixes=XOP,XOPAVX2 8; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512dq | FileCheck %s --check-prefixes=AVX512,AVX512DQ 9; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw | FileCheck %s --check-prefixes=AVX512,AVX512BW 10; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512dq,+avx512vl | FileCheck %s --check-prefixes=AVX512VL,AVX512DQVL 11; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw,+avx512vl | FileCheck %s --check-prefixes=AVX512VL,AVX512BWVL 12; 13; Just one 32-bit run to make sure we do reasonable things for i64 shifts. 
14; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefix=X86-SSE 15 16; 17; Variable Shifts 18; 19 20define <2 x i32> @var_shift_v2i32(<2 x i32> %a, <2 x i32> %b) nounwind { 21; SSE2-LABEL: var_shift_v2i32: 22; SSE2: # %bb.0: 23; SSE2-NEXT: pshuflw {{.*#+}} xmm2 = xmm1[2,3,3,3,4,5,6,7] 24; SSE2-NEXT: movdqa %xmm0, %xmm3 25; SSE2-NEXT: psrad %xmm2, %xmm3 26; SSE2-NEXT: pshuflw {{.*#+}} xmm4 = xmm1[0,1,1,1,4,5,6,7] 27; SSE2-NEXT: movdqa %xmm0, %xmm2 28; SSE2-NEXT: psrad %xmm4, %xmm2 29; SSE2-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0] 30; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3] 31; SSE2-NEXT: pshuflw {{.*#+}} xmm3 = xmm1[2,3,3,3,4,5,6,7] 32; SSE2-NEXT: movdqa %xmm0, %xmm4 33; SSE2-NEXT: psrad %xmm3, %xmm4 34; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,1,1,1,4,5,6,7] 35; SSE2-NEXT: psrad %xmm1, %xmm0 36; SSE2-NEXT: punpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm4[1] 37; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,3],xmm0[0,3] 38; SSE2-NEXT: movaps %xmm2, %xmm0 39; SSE2-NEXT: retq 40; 41; SSE41-LABEL: var_shift_v2i32: 42; SSE41: # %bb.0: 43; SSE41-NEXT: pshuflw {{.*#+}} xmm2 = xmm1[2,3,3,3,4,5,6,7] 44; SSE41-NEXT: movdqa %xmm0, %xmm3 45; SSE41-NEXT: psrad %xmm2, %xmm3 46; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,3,2,3] 47; SSE41-NEXT: pshuflw {{.*#+}} xmm4 = xmm2[2,3,3,3,4,5,6,7] 48; SSE41-NEXT: movdqa %xmm0, %xmm5 49; SSE41-NEXT: psrad %xmm4, %xmm5 50; SSE41-NEXT: pblendw {{.*#+}} xmm5 = xmm3[0,1,2,3],xmm5[4,5,6,7] 51; SSE41-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,1,1,1,4,5,6,7] 52; SSE41-NEXT: movdqa %xmm0, %xmm3 53; SSE41-NEXT: psrad %xmm1, %xmm3 54; SSE41-NEXT: pshuflw {{.*#+}} xmm1 = xmm2[0,1,1,1,4,5,6,7] 55; SSE41-NEXT: psrad %xmm1, %xmm0 56; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm3[0,1,2,3],xmm0[4,5,6,7] 57; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm5[2,3],xmm0[4,5],xmm5[6,7] 58; SSE41-NEXT: retq 59; 60; AVX1-LABEL: var_shift_v2i32: 61; AVX1: # %bb.0: 62; AVX1-NEXT: vpsrldq {{.*#+}} xmm2 = 
xmm1[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero 63; AVX1-NEXT: vpsrad %xmm2, %xmm0, %xmm2 64; AVX1-NEXT: vpsrlq $32, %xmm1, %xmm3 65; AVX1-NEXT: vpsrad %xmm3, %xmm0, %xmm3 66; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1,2,3],xmm2[4,5,6,7] 67; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3 68; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm3 = xmm1[2],xmm3[2],xmm1[3],xmm3[3] 69; AVX1-NEXT: vpsrad %xmm3, %xmm0, %xmm3 70; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero 71; AVX1-NEXT: vpsrad %xmm1, %xmm0, %xmm0 72; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm3[4,5,6,7] 73; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7] 74; AVX1-NEXT: retq 75; 76; AVX2-LABEL: var_shift_v2i32: 77; AVX2: # %bb.0: 78; AVX2-NEXT: vpsravd %xmm1, %xmm0, %xmm0 79; AVX2-NEXT: retq 80; 81; XOPAVX1-LABEL: var_shift_v2i32: 82; XOPAVX1: # %bb.0: 83; XOPAVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 84; XOPAVX1-NEXT: vpsubd %xmm1, %xmm2, %xmm1 85; XOPAVX1-NEXT: vpshad %xmm1, %xmm0, %xmm0 86; XOPAVX1-NEXT: retq 87; 88; XOPAVX2-LABEL: var_shift_v2i32: 89; XOPAVX2: # %bb.0: 90; XOPAVX2-NEXT: vpsravd %xmm1, %xmm0, %xmm0 91; XOPAVX2-NEXT: retq 92; 93; AVX512-LABEL: var_shift_v2i32: 94; AVX512: # %bb.0: 95; AVX512-NEXT: vpsravd %xmm1, %xmm0, %xmm0 96; AVX512-NEXT: retq 97; 98; AVX512VL-LABEL: var_shift_v2i32: 99; AVX512VL: # %bb.0: 100; AVX512VL-NEXT: vpsravd %xmm1, %xmm0, %xmm0 101; AVX512VL-NEXT: retq 102; 103; X86-SSE-LABEL: var_shift_v2i32: 104; X86-SSE: # %bb.0: 105; X86-SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm1[2,3,3,3,4,5,6,7] 106; X86-SSE-NEXT: movdqa %xmm0, %xmm3 107; X86-SSE-NEXT: psrad %xmm2, %xmm3 108; X86-SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm1[0,1,1,1,4,5,6,7] 109; X86-SSE-NEXT: movdqa %xmm0, %xmm2 110; X86-SSE-NEXT: psrad %xmm4, %xmm2 111; X86-SSE-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0] 112; X86-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3] 113; X86-SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm1[2,3,3,3,4,5,6,7] 114; X86-SSE-NEXT: movdqa 
%xmm0, %xmm4 115; X86-SSE-NEXT: psrad %xmm3, %xmm4 116; X86-SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,1,1,1,4,5,6,7] 117; X86-SSE-NEXT: psrad %xmm1, %xmm0 118; X86-SSE-NEXT: punpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm4[1] 119; X86-SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,3],xmm0[0,3] 120; X86-SSE-NEXT: movaps %xmm2, %xmm0 121; X86-SSE-NEXT: retl 122 %shift = ashr <2 x i32> %a, %b 123 ret <2 x i32> %shift 124} 125 126define <4 x i16> @var_shift_v4i16(<4 x i16> %a, <4 x i16> %b) nounwind { 127; SSE2-LABEL: var_shift_v4i16: 128; SSE2: # %bb.0: 129; SSE2-NEXT: psllw $12, %xmm1 130; SSE2-NEXT: movdqa %xmm1, %xmm2 131; SSE2-NEXT: psraw $15, %xmm2 132; SSE2-NEXT: movdqa %xmm2, %xmm3 133; SSE2-NEXT: pandn %xmm0, %xmm3 134; SSE2-NEXT: psraw $8, %xmm0 135; SSE2-NEXT: pand %xmm2, %xmm0 136; SSE2-NEXT: por %xmm3, %xmm0 137; SSE2-NEXT: paddw %xmm1, %xmm1 138; SSE2-NEXT: movdqa %xmm1, %xmm2 139; SSE2-NEXT: psraw $15, %xmm2 140; SSE2-NEXT: movdqa %xmm2, %xmm3 141; SSE2-NEXT: pandn %xmm0, %xmm3 142; SSE2-NEXT: psraw $4, %xmm0 143; SSE2-NEXT: pand %xmm2, %xmm0 144; SSE2-NEXT: por %xmm3, %xmm0 145; SSE2-NEXT: paddw %xmm1, %xmm1 146; SSE2-NEXT: movdqa %xmm1, %xmm2 147; SSE2-NEXT: psraw $15, %xmm2 148; SSE2-NEXT: movdqa %xmm2, %xmm3 149; SSE2-NEXT: pandn %xmm0, %xmm3 150; SSE2-NEXT: psraw $2, %xmm0 151; SSE2-NEXT: pand %xmm2, %xmm0 152; SSE2-NEXT: por %xmm3, %xmm0 153; SSE2-NEXT: paddw %xmm1, %xmm1 154; SSE2-NEXT: psraw $15, %xmm1 155; SSE2-NEXT: movdqa %xmm1, %xmm2 156; SSE2-NEXT: pandn %xmm0, %xmm2 157; SSE2-NEXT: psraw $1, %xmm0 158; SSE2-NEXT: pand %xmm1, %xmm0 159; SSE2-NEXT: por %xmm2, %xmm0 160; SSE2-NEXT: retq 161; 162; SSE41-LABEL: var_shift_v4i16: 163; SSE41: # %bb.0: 164; SSE41-NEXT: movdqa %xmm1, %xmm2 165; SSE41-NEXT: movdqa %xmm0, %xmm1 166; SSE41-NEXT: movdqa %xmm2, %xmm0 167; SSE41-NEXT: psllw $12, %xmm0 168; SSE41-NEXT: psllw $4, %xmm2 169; SSE41-NEXT: por %xmm0, %xmm2 170; SSE41-NEXT: movdqa %xmm2, %xmm3 171; SSE41-NEXT: paddw %xmm2, %xmm3 172; SSE41-NEXT: movdqa %xmm1, 
%xmm4 173; SSE41-NEXT: psraw $8, %xmm4 174; SSE41-NEXT: movdqa %xmm2, %xmm0 175; SSE41-NEXT: pblendvb %xmm0, %xmm4, %xmm1 176; SSE41-NEXT: movdqa %xmm1, %xmm2 177; SSE41-NEXT: psraw $4, %xmm2 178; SSE41-NEXT: movdqa %xmm3, %xmm0 179; SSE41-NEXT: pblendvb %xmm0, %xmm2, %xmm1 180; SSE41-NEXT: movdqa %xmm1, %xmm2 181; SSE41-NEXT: psraw $2, %xmm2 182; SSE41-NEXT: paddw %xmm3, %xmm3 183; SSE41-NEXT: movdqa %xmm3, %xmm0 184; SSE41-NEXT: pblendvb %xmm0, %xmm2, %xmm1 185; SSE41-NEXT: movdqa %xmm1, %xmm2 186; SSE41-NEXT: psraw $1, %xmm2 187; SSE41-NEXT: paddw %xmm3, %xmm3 188; SSE41-NEXT: movdqa %xmm3, %xmm0 189; SSE41-NEXT: pblendvb %xmm0, %xmm2, %xmm1 190; SSE41-NEXT: movdqa %xmm1, %xmm0 191; SSE41-NEXT: retq 192; 193; AVX1-LABEL: var_shift_v4i16: 194; AVX1: # %bb.0: 195; AVX1-NEXT: vpsllw $12, %xmm1, %xmm2 196; AVX1-NEXT: vpsllw $4, %xmm1, %xmm1 197; AVX1-NEXT: vpor %xmm2, %xmm1, %xmm1 198; AVX1-NEXT: vpaddw %xmm1, %xmm1, %xmm2 199; AVX1-NEXT: vpsraw $8, %xmm0, %xmm3 200; AVX1-NEXT: vpblendvb %xmm1, %xmm3, %xmm0, %xmm0 201; AVX1-NEXT: vpsraw $4, %xmm0, %xmm1 202; AVX1-NEXT: vpblendvb %xmm2, %xmm1, %xmm0, %xmm0 203; AVX1-NEXT: vpsraw $2, %xmm0, %xmm1 204; AVX1-NEXT: vpaddw %xmm2, %xmm2, %xmm2 205; AVX1-NEXT: vpblendvb %xmm2, %xmm1, %xmm0, %xmm0 206; AVX1-NEXT: vpsraw $1, %xmm0, %xmm1 207; AVX1-NEXT: vpaddw %xmm2, %xmm2, %xmm2 208; AVX1-NEXT: vpblendvb %xmm2, %xmm1, %xmm0, %xmm0 209; AVX1-NEXT: retq 210; 211; AVX2-LABEL: var_shift_v4i16: 212; AVX2: # %bb.0: 213; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero 214; AVX2-NEXT: vpmovsxwd %xmm0, %ymm0 215; AVX2-NEXT: vpsravd %ymm1, %ymm0, %ymm0 216; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 217; AVX2-NEXT: vpackssdw %xmm1, %xmm0, %xmm0 218; AVX2-NEXT: vzeroupper 219; AVX2-NEXT: retq 220; 221; XOP-LABEL: var_shift_v4i16: 222; XOP: # %bb.0: 223; XOP-NEXT: vpxor %xmm2, %xmm2, %xmm2 224; XOP-NEXT: vpsubw %xmm1, %xmm2, %xmm1 225; XOP-NEXT: 
vpshaw %xmm1, %xmm0, %xmm0 226; XOP-NEXT: retq 227; 228; AVX512DQ-LABEL: var_shift_v4i16: 229; AVX512DQ: # %bb.0: 230; AVX512DQ-NEXT: vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero 231; AVX512DQ-NEXT: vpmovsxwd %xmm0, %ymm0 232; AVX512DQ-NEXT: vpsravd %ymm1, %ymm0, %ymm0 233; AVX512DQ-NEXT: vpmovdw %zmm0, %ymm0 234; AVX512DQ-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 235; AVX512DQ-NEXT: vzeroupper 236; AVX512DQ-NEXT: retq 237; 238; AVX512BW-LABEL: var_shift_v4i16: 239; AVX512BW: # %bb.0: 240; AVX512BW-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1 241; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 242; AVX512BW-NEXT: vpsravw %zmm1, %zmm0, %zmm0 243; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 244; AVX512BW-NEXT: vzeroupper 245; AVX512BW-NEXT: retq 246; 247; AVX512DQVL-LABEL: var_shift_v4i16: 248; AVX512DQVL: # %bb.0: 249; AVX512DQVL-NEXT: vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero 250; AVX512DQVL-NEXT: vpmovsxwd %xmm0, %ymm0 251; AVX512DQVL-NEXT: vpsravd %ymm1, %ymm0, %ymm0 252; AVX512DQVL-NEXT: vpmovdw %ymm0, %xmm0 253; AVX512DQVL-NEXT: vzeroupper 254; AVX512DQVL-NEXT: retq 255; 256; AVX512BWVL-LABEL: var_shift_v4i16: 257; AVX512BWVL: # %bb.0: 258; AVX512BWVL-NEXT: vpsravw %xmm1, %xmm0, %xmm0 259; AVX512BWVL-NEXT: retq 260; 261; X86-SSE-LABEL: var_shift_v4i16: 262; X86-SSE: # %bb.0: 263; X86-SSE-NEXT: psllw $12, %xmm1 264; X86-SSE-NEXT: movdqa %xmm1, %xmm2 265; X86-SSE-NEXT: psraw $15, %xmm2 266; X86-SSE-NEXT: movdqa %xmm2, %xmm3 267; X86-SSE-NEXT: pandn %xmm0, %xmm3 268; X86-SSE-NEXT: psraw $8, %xmm0 269; X86-SSE-NEXT: pand %xmm2, %xmm0 270; X86-SSE-NEXT: por %xmm3, %xmm0 271; X86-SSE-NEXT: paddw %xmm1, %xmm1 272; X86-SSE-NEXT: movdqa %xmm1, %xmm2 273; X86-SSE-NEXT: psraw $15, %xmm2 274; X86-SSE-NEXT: movdqa %xmm2, %xmm3 275; X86-SSE-NEXT: pandn %xmm0, %xmm3 276; 
X86-SSE-NEXT: psraw $4, %xmm0 277; X86-SSE-NEXT: pand %xmm2, %xmm0 278; X86-SSE-NEXT: por %xmm3, %xmm0 279; X86-SSE-NEXT: paddw %xmm1, %xmm1 280; X86-SSE-NEXT: movdqa %xmm1, %xmm2 281; X86-SSE-NEXT: psraw $15, %xmm2 282; X86-SSE-NEXT: movdqa %xmm2, %xmm3 283; X86-SSE-NEXT: pandn %xmm0, %xmm3 284; X86-SSE-NEXT: psraw $2, %xmm0 285; X86-SSE-NEXT: pand %xmm2, %xmm0 286; X86-SSE-NEXT: por %xmm3, %xmm0 287; X86-SSE-NEXT: paddw %xmm1, %xmm1 288; X86-SSE-NEXT: psraw $15, %xmm1 289; X86-SSE-NEXT: movdqa %xmm1, %xmm2 290; X86-SSE-NEXT: pandn %xmm0, %xmm2 291; X86-SSE-NEXT: psraw $1, %xmm0 292; X86-SSE-NEXT: pand %xmm1, %xmm0 293; X86-SSE-NEXT: por %xmm2, %xmm0 294; X86-SSE-NEXT: retl 295 %shift = ashr <4 x i16> %a, %b 296 ret <4 x i16> %shift 297} 298 299define <2 x i16> @var_shift_v2i16(<2 x i16> %a, <2 x i16> %b) nounwind { 300; SSE2-LABEL: var_shift_v2i16: 301; SSE2: # %bb.0: 302; SSE2-NEXT: psllw $12, %xmm1 303; SSE2-NEXT: movdqa %xmm1, %xmm2 304; SSE2-NEXT: psraw $15, %xmm2 305; SSE2-NEXT: movdqa %xmm2, %xmm3 306; SSE2-NEXT: pandn %xmm0, %xmm3 307; SSE2-NEXT: psraw $8, %xmm0 308; SSE2-NEXT: pand %xmm2, %xmm0 309; SSE2-NEXT: por %xmm3, %xmm0 310; SSE2-NEXT: paddw %xmm1, %xmm1 311; SSE2-NEXT: movdqa %xmm1, %xmm2 312; SSE2-NEXT: psraw $15, %xmm2 313; SSE2-NEXT: movdqa %xmm2, %xmm3 314; SSE2-NEXT: pandn %xmm0, %xmm3 315; SSE2-NEXT: psraw $4, %xmm0 316; SSE2-NEXT: pand %xmm2, %xmm0 317; SSE2-NEXT: por %xmm3, %xmm0 318; SSE2-NEXT: paddw %xmm1, %xmm1 319; SSE2-NEXT: movdqa %xmm1, %xmm2 320; SSE2-NEXT: psraw $15, %xmm2 321; SSE2-NEXT: movdqa %xmm2, %xmm3 322; SSE2-NEXT: pandn %xmm0, %xmm3 323; SSE2-NEXT: psraw $2, %xmm0 324; SSE2-NEXT: pand %xmm2, %xmm0 325; SSE2-NEXT: por %xmm3, %xmm0 326; SSE2-NEXT: paddw %xmm1, %xmm1 327; SSE2-NEXT: psraw $15, %xmm1 328; SSE2-NEXT: movdqa %xmm1, %xmm2 329; SSE2-NEXT: pandn %xmm0, %xmm2 330; SSE2-NEXT: psraw $1, %xmm0 331; SSE2-NEXT: pand %xmm1, %xmm0 332; SSE2-NEXT: por %xmm2, %xmm0 333; SSE2-NEXT: retq 334; 335; SSE41-LABEL: 
var_shift_v2i16: 336; SSE41: # %bb.0: 337; SSE41-NEXT: movdqa %xmm1, %xmm2 338; SSE41-NEXT: movdqa %xmm0, %xmm1 339; SSE41-NEXT: movdqa %xmm2, %xmm0 340; SSE41-NEXT: psllw $12, %xmm0 341; SSE41-NEXT: psllw $4, %xmm2 342; SSE41-NEXT: por %xmm0, %xmm2 343; SSE41-NEXT: movdqa %xmm2, %xmm3 344; SSE41-NEXT: paddw %xmm2, %xmm3 345; SSE41-NEXT: movdqa %xmm1, %xmm4 346; SSE41-NEXT: psraw $8, %xmm4 347; SSE41-NEXT: movdqa %xmm2, %xmm0 348; SSE41-NEXT: pblendvb %xmm0, %xmm4, %xmm1 349; SSE41-NEXT: movdqa %xmm1, %xmm2 350; SSE41-NEXT: psraw $4, %xmm2 351; SSE41-NEXT: movdqa %xmm3, %xmm0 352; SSE41-NEXT: pblendvb %xmm0, %xmm2, %xmm1 353; SSE41-NEXT: movdqa %xmm1, %xmm2 354; SSE41-NEXT: psraw $2, %xmm2 355; SSE41-NEXT: paddw %xmm3, %xmm3 356; SSE41-NEXT: movdqa %xmm3, %xmm0 357; SSE41-NEXT: pblendvb %xmm0, %xmm2, %xmm1 358; SSE41-NEXT: movdqa %xmm1, %xmm2 359; SSE41-NEXT: psraw $1, %xmm2 360; SSE41-NEXT: paddw %xmm3, %xmm3 361; SSE41-NEXT: movdqa %xmm3, %xmm0 362; SSE41-NEXT: pblendvb %xmm0, %xmm2, %xmm1 363; SSE41-NEXT: movdqa %xmm1, %xmm0 364; SSE41-NEXT: retq 365; 366; AVX1-LABEL: var_shift_v2i16: 367; AVX1: # %bb.0: 368; AVX1-NEXT: vpsllw $12, %xmm1, %xmm2 369; AVX1-NEXT: vpsllw $4, %xmm1, %xmm1 370; AVX1-NEXT: vpor %xmm2, %xmm1, %xmm1 371; AVX1-NEXT: vpaddw %xmm1, %xmm1, %xmm2 372; AVX1-NEXT: vpsraw $8, %xmm0, %xmm3 373; AVX1-NEXT: vpblendvb %xmm1, %xmm3, %xmm0, %xmm0 374; AVX1-NEXT: vpsraw $4, %xmm0, %xmm1 375; AVX1-NEXT: vpblendvb %xmm2, %xmm1, %xmm0, %xmm0 376; AVX1-NEXT: vpsraw $2, %xmm0, %xmm1 377; AVX1-NEXT: vpaddw %xmm2, %xmm2, %xmm2 378; AVX1-NEXT: vpblendvb %xmm2, %xmm1, %xmm0, %xmm0 379; AVX1-NEXT: vpsraw $1, %xmm0, %xmm1 380; AVX1-NEXT: vpaddw %xmm2, %xmm2, %xmm2 381; AVX1-NEXT: vpblendvb %xmm2, %xmm1, %xmm0, %xmm0 382; AVX1-NEXT: retq 383; 384; AVX2-LABEL: var_shift_v2i16: 385; AVX2: # %bb.0: 386; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero 387; AVX2-NEXT: vpmovsxwd 
%xmm0, %ymm0 388; AVX2-NEXT: vpsravd %ymm1, %ymm0, %ymm0 389; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 390; AVX2-NEXT: vpackssdw %xmm1, %xmm0, %xmm0 391; AVX2-NEXT: vzeroupper 392; AVX2-NEXT: retq 393; 394; XOP-LABEL: var_shift_v2i16: 395; XOP: # %bb.0: 396; XOP-NEXT: vpxor %xmm2, %xmm2, %xmm2 397; XOP-NEXT: vpsubw %xmm1, %xmm2, %xmm1 398; XOP-NEXT: vpshaw %xmm1, %xmm0, %xmm0 399; XOP-NEXT: retq 400; 401; AVX512DQ-LABEL: var_shift_v2i16: 402; AVX512DQ: # %bb.0: 403; AVX512DQ-NEXT: vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero 404; AVX512DQ-NEXT: vpmovsxwd %xmm0, %ymm0 405; AVX512DQ-NEXT: vpsravd %ymm1, %ymm0, %ymm0 406; AVX512DQ-NEXT: vpmovdw %zmm0, %ymm0 407; AVX512DQ-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 408; AVX512DQ-NEXT: vzeroupper 409; AVX512DQ-NEXT: retq 410; 411; AVX512BW-LABEL: var_shift_v2i16: 412; AVX512BW: # %bb.0: 413; AVX512BW-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1 414; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 415; AVX512BW-NEXT: vpsravw %zmm1, %zmm0, %zmm0 416; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 417; AVX512BW-NEXT: vzeroupper 418; AVX512BW-NEXT: retq 419; 420; AVX512DQVL-LABEL: var_shift_v2i16: 421; AVX512DQVL: # %bb.0: 422; AVX512DQVL-NEXT: vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero 423; AVX512DQVL-NEXT: vpmovsxwd %xmm0, %ymm0 424; AVX512DQVL-NEXT: vpsravd %ymm1, %ymm0, %ymm0 425; AVX512DQVL-NEXT: vpmovdw %ymm0, %xmm0 426; AVX512DQVL-NEXT: vzeroupper 427; AVX512DQVL-NEXT: retq 428; 429; AVX512BWVL-LABEL: var_shift_v2i16: 430; AVX512BWVL: # %bb.0: 431; AVX512BWVL-NEXT: vpsravw %xmm1, %xmm0, %xmm0 432; AVX512BWVL-NEXT: retq 433; 434; X86-SSE-LABEL: var_shift_v2i16: 435; X86-SSE: # %bb.0: 436; X86-SSE-NEXT: psllw $12, %xmm1 437; X86-SSE-NEXT: movdqa %xmm1, %xmm2 438; X86-SSE-NEXT: psraw $15, %xmm2 439; X86-SSE-NEXT: movdqa 
%xmm2, %xmm3 440; X86-SSE-NEXT: pandn %xmm0, %xmm3 441; X86-SSE-NEXT: psraw $8, %xmm0 442; X86-SSE-NEXT: pand %xmm2, %xmm0 443; X86-SSE-NEXT: por %xmm3, %xmm0 444; X86-SSE-NEXT: paddw %xmm1, %xmm1 445; X86-SSE-NEXT: movdqa %xmm1, %xmm2 446; X86-SSE-NEXT: psraw $15, %xmm2 447; X86-SSE-NEXT: movdqa %xmm2, %xmm3 448; X86-SSE-NEXT: pandn %xmm0, %xmm3 449; X86-SSE-NEXT: psraw $4, %xmm0 450; X86-SSE-NEXT: pand %xmm2, %xmm0 451; X86-SSE-NEXT: por %xmm3, %xmm0 452; X86-SSE-NEXT: paddw %xmm1, %xmm1 453; X86-SSE-NEXT: movdqa %xmm1, %xmm2 454; X86-SSE-NEXT: psraw $15, %xmm2 455; X86-SSE-NEXT: movdqa %xmm2, %xmm3 456; X86-SSE-NEXT: pandn %xmm0, %xmm3 457; X86-SSE-NEXT: psraw $2, %xmm0 458; X86-SSE-NEXT: pand %xmm2, %xmm0 459; X86-SSE-NEXT: por %xmm3, %xmm0 460; X86-SSE-NEXT: paddw %xmm1, %xmm1 461; X86-SSE-NEXT: psraw $15, %xmm1 462; X86-SSE-NEXT: movdqa %xmm1, %xmm2 463; X86-SSE-NEXT: pandn %xmm0, %xmm2 464; X86-SSE-NEXT: psraw $1, %xmm0 465; X86-SSE-NEXT: pand %xmm1, %xmm0 466; X86-SSE-NEXT: por %xmm2, %xmm0 467; X86-SSE-NEXT: retl 468 %shift = ashr <2 x i16> %a, %b 469 ret <2 x i16> %shift 470} 471 472define <8 x i8> @var_shift_v8i8(<8 x i8> %a, <8 x i8> %b) nounwind { 473; SSE2-LABEL: var_shift_v8i8: 474; SSE2: # %bb.0: 475; SSE2-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm0[8],xmm2[9],xmm0[9],xmm2[10],xmm0[10],xmm2[11],xmm0[11],xmm2[12],xmm0[12],xmm2[13],xmm0[13],xmm2[14],xmm0[14],xmm2[15],xmm0[15] 476; SSE2-NEXT: psllw $5, %xmm1 477; SSE2-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm1[8],xmm4[9],xmm1[9],xmm4[10],xmm1[10],xmm4[11],xmm1[11],xmm4[12],xmm1[12],xmm4[13],xmm1[13],xmm4[14],xmm1[14],xmm4[15],xmm1[15] 478; SSE2-NEXT: pxor %xmm3, %xmm3 479; SSE2-NEXT: pxor %xmm5, %xmm5 480; SSE2-NEXT: pcmpgtw %xmm4, %xmm5 481; SSE2-NEXT: movdqa %xmm5, %xmm6 482; SSE2-NEXT: pandn %xmm2, %xmm6 483; SSE2-NEXT: psraw $4, %xmm2 484; SSE2-NEXT: pand %xmm5, %xmm2 485; SSE2-NEXT: por %xmm6, %xmm2 486; SSE2-NEXT: paddw %xmm4, %xmm4 487; SSE2-NEXT: pxor %xmm5, %xmm5 488; SSE2-NEXT: pcmpgtw 
%xmm4, %xmm5 489; SSE2-NEXT: movdqa %xmm5, %xmm6 490; SSE2-NEXT: pandn %xmm2, %xmm6 491; SSE2-NEXT: psraw $2, %xmm2 492; SSE2-NEXT: pand %xmm5, %xmm2 493; SSE2-NEXT: por %xmm6, %xmm2 494; SSE2-NEXT: paddw %xmm4, %xmm4 495; SSE2-NEXT: pxor %xmm5, %xmm5 496; SSE2-NEXT: pcmpgtw %xmm4, %xmm5 497; SSE2-NEXT: movdqa %xmm5, %xmm4 498; SSE2-NEXT: pandn %xmm2, %xmm4 499; SSE2-NEXT: psraw $1, %xmm2 500; SSE2-NEXT: pand %xmm5, %xmm2 501; SSE2-NEXT: por %xmm4, %xmm2 502; SSE2-NEXT: psrlw $8, %xmm2 503; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] 504; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] 505; SSE2-NEXT: pxor %xmm4, %xmm4 506; SSE2-NEXT: pcmpgtw %xmm1, %xmm4 507; SSE2-NEXT: movdqa %xmm4, %xmm5 508; SSE2-NEXT: pandn %xmm0, %xmm5 509; SSE2-NEXT: psraw $4, %xmm0 510; SSE2-NEXT: pand %xmm4, %xmm0 511; SSE2-NEXT: por %xmm5, %xmm0 512; SSE2-NEXT: paddw %xmm1, %xmm1 513; SSE2-NEXT: pxor %xmm4, %xmm4 514; SSE2-NEXT: pcmpgtw %xmm1, %xmm4 515; SSE2-NEXT: movdqa %xmm4, %xmm5 516; SSE2-NEXT: pandn %xmm0, %xmm5 517; SSE2-NEXT: psraw $2, %xmm0 518; SSE2-NEXT: pand %xmm4, %xmm0 519; SSE2-NEXT: por %xmm5, %xmm0 520; SSE2-NEXT: paddw %xmm1, %xmm1 521; SSE2-NEXT: pcmpgtw %xmm1, %xmm3 522; SSE2-NEXT: movdqa %xmm3, %xmm1 523; SSE2-NEXT: pandn %xmm0, %xmm1 524; SSE2-NEXT: psraw $1, %xmm0 525; SSE2-NEXT: pand %xmm3, %xmm0 526; SSE2-NEXT: por %xmm1, %xmm0 527; SSE2-NEXT: psrlw $8, %xmm0 528; SSE2-NEXT: packuswb %xmm2, %xmm0 529; SSE2-NEXT: retq 530; 531; SSE41-LABEL: var_shift_v8i8: 532; SSE41: # %bb.0: 533; SSE41-NEXT: movdqa %xmm0, %xmm2 534; SSE41-NEXT: psllw $5, %xmm1 535; SSE41-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15] 536; SSE41-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm2[8],xmm3[9],xmm2[9],xmm3[10],xmm2[10],xmm3[11],xmm2[11],xmm3[12],xmm2[12],xmm3[13],xmm2[13],xmm3[14],xmm2[14],xmm3[15],xmm2[15] 
537; SSE41-NEXT: movdqa %xmm3, %xmm4 538; SSE41-NEXT: psraw $4, %xmm4 539; SSE41-NEXT: pblendvb %xmm0, %xmm4, %xmm3 540; SSE41-NEXT: movdqa %xmm3, %xmm4 541; SSE41-NEXT: psraw $2, %xmm4 542; SSE41-NEXT: paddw %xmm0, %xmm0 543; SSE41-NEXT: pblendvb %xmm0, %xmm4, %xmm3 544; SSE41-NEXT: movdqa %xmm3, %xmm4 545; SSE41-NEXT: psraw $1, %xmm4 546; SSE41-NEXT: paddw %xmm0, %xmm0 547; SSE41-NEXT: pblendvb %xmm0, %xmm4, %xmm3 548; SSE41-NEXT: psrlw $8, %xmm3 549; SSE41-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] 550; SSE41-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] 551; SSE41-NEXT: movdqa %xmm1, %xmm2 552; SSE41-NEXT: psraw $4, %xmm2 553; SSE41-NEXT: pblendvb %xmm0, %xmm2, %xmm1 554; SSE41-NEXT: movdqa %xmm1, %xmm2 555; SSE41-NEXT: psraw $2, %xmm2 556; SSE41-NEXT: paddw %xmm0, %xmm0 557; SSE41-NEXT: pblendvb %xmm0, %xmm2, %xmm1 558; SSE41-NEXT: movdqa %xmm1, %xmm2 559; SSE41-NEXT: psraw $1, %xmm2 560; SSE41-NEXT: paddw %xmm0, %xmm0 561; SSE41-NEXT: pblendvb %xmm0, %xmm2, %xmm1 562; SSE41-NEXT: psrlw $8, %xmm1 563; SSE41-NEXT: packuswb %xmm3, %xmm1 564; SSE41-NEXT: movdqa %xmm1, %xmm0 565; SSE41-NEXT: retq 566; 567; AVX-LABEL: var_shift_v8i8: 568; AVX: # %bb.0: 569; AVX-NEXT: vpsllw $5, %xmm1, %xmm1 570; AVX-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] 571; AVX-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] 572; AVX-NEXT: vpsraw $4, %xmm3, %xmm4 573; AVX-NEXT: vpblendvb %xmm2, %xmm4, %xmm3, %xmm3 574; AVX-NEXT: vpsraw $2, %xmm3, %xmm4 575; AVX-NEXT: vpaddw %xmm2, %xmm2, %xmm2 576; AVX-NEXT: vpblendvb %xmm2, %xmm4, %xmm3, %xmm3 577; AVX-NEXT: vpsraw $1, %xmm3, %xmm4 578; AVX-NEXT: vpaddw %xmm2, %xmm2, %xmm2 579; AVX-NEXT: vpblendvb %xmm2, %xmm4, %xmm3, %xmm2 580; AVX-NEXT: 
vpsrlw $8, %xmm2, %xmm2 581; AVX-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] 582; AVX-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] 583; AVX-NEXT: vpsraw $4, %xmm0, %xmm3 584; AVX-NEXT: vpblendvb %xmm1, %xmm3, %xmm0, %xmm0 585; AVX-NEXT: vpsraw $2, %xmm0, %xmm3 586; AVX-NEXT: vpaddw %xmm1, %xmm1, %xmm1 587; AVX-NEXT: vpblendvb %xmm1, %xmm3, %xmm0, %xmm0 588; AVX-NEXT: vpsraw $1, %xmm0, %xmm3 589; AVX-NEXT: vpaddw %xmm1, %xmm1, %xmm1 590; AVX-NEXT: vpblendvb %xmm1, %xmm3, %xmm0, %xmm0 591; AVX-NEXT: vpsrlw $8, %xmm0, %xmm0 592; AVX-NEXT: vpackuswb %xmm2, %xmm0, %xmm0 593; AVX-NEXT: retq 594; 595; XOP-LABEL: var_shift_v8i8: 596; XOP: # %bb.0: 597; XOP-NEXT: vpxor %xmm2, %xmm2, %xmm2 598; XOP-NEXT: vpsubb %xmm1, %xmm2, %xmm1 599; XOP-NEXT: vpshab %xmm1, %xmm0, %xmm0 600; XOP-NEXT: retq 601; 602; AVX512DQ-LABEL: var_shift_v8i8: 603; AVX512DQ: # %bb.0: 604; AVX512DQ-NEXT: vpmovzxbd {{.*#+}} zmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero,xmm1[8],zero,zero,zero,xmm1[9],zero,zero,zero,xmm1[10],zero,zero,zero,xmm1[11],zero,zero,zero,xmm1[12],zero,zero,zero,xmm1[13],zero,zero,zero,xmm1[14],zero,zero,zero,xmm1[15],zero,zero,zero 605; AVX512DQ-NEXT: vpmovsxbd %xmm0, %zmm0 606; AVX512DQ-NEXT: vpsravd %zmm1, %zmm0, %zmm0 607; AVX512DQ-NEXT: vpmovdb %zmm0, %xmm0 608; AVX512DQ-NEXT: vzeroupper 609; AVX512DQ-NEXT: retq 610; 611; AVX512BW-LABEL: var_shift_v8i8: 612; AVX512BW: # %bb.0: 613; AVX512BW-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero 614; AVX512BW-NEXT: vpmovsxbw %xmm0, %ymm0 615; AVX512BW-NEXT: vpsravw %zmm1, %zmm0, %zmm0 616; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0 617; AVX512BW-NEXT: # 
kill: def $xmm0 killed $xmm0 killed $ymm0 618; AVX512BW-NEXT: vzeroupper 619; AVX512BW-NEXT: retq 620; 621; AVX512DQVL-LABEL: var_shift_v8i8: 622; AVX512DQVL: # %bb.0: 623; AVX512DQVL-NEXT: vpmovzxbd {{.*#+}} zmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero,xmm1[8],zero,zero,zero,xmm1[9],zero,zero,zero,xmm1[10],zero,zero,zero,xmm1[11],zero,zero,zero,xmm1[12],zero,zero,zero,xmm1[13],zero,zero,zero,xmm1[14],zero,zero,zero,xmm1[15],zero,zero,zero 624; AVX512DQVL-NEXT: vpmovsxbd %xmm0, %zmm0 625; AVX512DQVL-NEXT: vpsravd %zmm1, %zmm0, %zmm0 626; AVX512DQVL-NEXT: vpmovdb %zmm0, %xmm0 627; AVX512DQVL-NEXT: vzeroupper 628; AVX512DQVL-NEXT: retq 629; 630; AVX512BWVL-LABEL: var_shift_v8i8: 631; AVX512BWVL: # %bb.0: 632; AVX512BWVL-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero 633; AVX512BWVL-NEXT: vpmovsxbw %xmm0, %ymm0 634; AVX512BWVL-NEXT: vpsravw %ymm1, %ymm0, %ymm0 635; AVX512BWVL-NEXT: vpmovwb %ymm0, %xmm0 636; AVX512BWVL-NEXT: vzeroupper 637; AVX512BWVL-NEXT: retq 638; 639; X86-SSE-LABEL: var_shift_v8i8: 640; X86-SSE: # %bb.0: 641; X86-SSE-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm0[8],xmm2[9],xmm0[9],xmm2[10],xmm0[10],xmm2[11],xmm0[11],xmm2[12],xmm0[12],xmm2[13],xmm0[13],xmm2[14],xmm0[14],xmm2[15],xmm0[15] 642; X86-SSE-NEXT: psllw $5, %xmm1 643; X86-SSE-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm1[8],xmm4[9],xmm1[9],xmm4[10],xmm1[10],xmm4[11],xmm1[11],xmm4[12],xmm1[12],xmm4[13],xmm1[13],xmm4[14],xmm1[14],xmm4[15],xmm1[15] 644; X86-SSE-NEXT: pxor %xmm3, %xmm3 645; X86-SSE-NEXT: pxor %xmm5, %xmm5 646; X86-SSE-NEXT: pcmpgtw %xmm4, %xmm5 647; X86-SSE-NEXT: movdqa %xmm5, %xmm6 648; X86-SSE-NEXT: pandn %xmm2, %xmm6 649; X86-SSE-NEXT: 
psraw $4, %xmm2 650; X86-SSE-NEXT: pand %xmm5, %xmm2 651; X86-SSE-NEXT: por %xmm6, %xmm2 652; X86-SSE-NEXT: paddw %xmm4, %xmm4 653; X86-SSE-NEXT: pxor %xmm5, %xmm5 654; X86-SSE-NEXT: pcmpgtw %xmm4, %xmm5 655; X86-SSE-NEXT: movdqa %xmm5, %xmm6 656; X86-SSE-NEXT: pandn %xmm2, %xmm6 657; X86-SSE-NEXT: psraw $2, %xmm2 658; X86-SSE-NEXT: pand %xmm5, %xmm2 659; X86-SSE-NEXT: por %xmm6, %xmm2 660; X86-SSE-NEXT: paddw %xmm4, %xmm4 661; X86-SSE-NEXT: pxor %xmm5, %xmm5 662; X86-SSE-NEXT: pcmpgtw %xmm4, %xmm5 663; X86-SSE-NEXT: movdqa %xmm5, %xmm4 664; X86-SSE-NEXT: pandn %xmm2, %xmm4 665; X86-SSE-NEXT: psraw $1, %xmm2 666; X86-SSE-NEXT: pand %xmm5, %xmm2 667; X86-SSE-NEXT: por %xmm4, %xmm2 668; X86-SSE-NEXT: psrlw $8, %xmm2 669; X86-SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] 670; X86-SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] 671; X86-SSE-NEXT: pxor %xmm4, %xmm4 672; X86-SSE-NEXT: pcmpgtw %xmm1, %xmm4 673; X86-SSE-NEXT: movdqa %xmm4, %xmm5 674; X86-SSE-NEXT: pandn %xmm0, %xmm5 675; X86-SSE-NEXT: psraw $4, %xmm0 676; X86-SSE-NEXT: pand %xmm4, %xmm0 677; X86-SSE-NEXT: por %xmm5, %xmm0 678; X86-SSE-NEXT: paddw %xmm1, %xmm1 679; X86-SSE-NEXT: pxor %xmm4, %xmm4 680; X86-SSE-NEXT: pcmpgtw %xmm1, %xmm4 681; X86-SSE-NEXT: movdqa %xmm4, %xmm5 682; X86-SSE-NEXT: pandn %xmm0, %xmm5 683; X86-SSE-NEXT: psraw $2, %xmm0 684; X86-SSE-NEXT: pand %xmm4, %xmm0 685; X86-SSE-NEXT: por %xmm5, %xmm0 686; X86-SSE-NEXT: paddw %xmm1, %xmm1 687; X86-SSE-NEXT: pcmpgtw %xmm1, %xmm3 688; X86-SSE-NEXT: movdqa %xmm3, %xmm1 689; X86-SSE-NEXT: pandn %xmm0, %xmm1 690; X86-SSE-NEXT: psraw $1, %xmm0 691; X86-SSE-NEXT: pand %xmm3, %xmm0 692; X86-SSE-NEXT: por %xmm1, %xmm0 693; X86-SSE-NEXT: psrlw $8, %xmm0 694; X86-SSE-NEXT: packuswb %xmm2, %xmm0 695; X86-SSE-NEXT: retl 696 %shift = ashr <8 x i8> %a, %b 697 ret <8 x i8> %shift 698} 699 700define <4 x i8> @var_shift_v4i8(<4 x i8> %a, <4 x i8> %b) nounwind { 701; SSE2-LABEL: var_shift_v4i8: 702; SSE2: # 
%bb.0: 703; SSE2-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm0[8],xmm2[9],xmm0[9],xmm2[10],xmm0[10],xmm2[11],xmm0[11],xmm2[12],xmm0[12],xmm2[13],xmm0[13],xmm2[14],xmm0[14],xmm2[15],xmm0[15] 704; SSE2-NEXT: psllw $5, %xmm1 705; SSE2-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm1[8],xmm4[9],xmm1[9],xmm4[10],xmm1[10],xmm4[11],xmm1[11],xmm4[12],xmm1[12],xmm4[13],xmm1[13],xmm4[14],xmm1[14],xmm4[15],xmm1[15] 706; SSE2-NEXT: pxor %xmm3, %xmm3 707; SSE2-NEXT: pxor %xmm5, %xmm5 708; SSE2-NEXT: pcmpgtw %xmm4, %xmm5 709; SSE2-NEXT: movdqa %xmm5, %xmm6 710; SSE2-NEXT: pandn %xmm2, %xmm6 711; SSE2-NEXT: psraw $4, %xmm2 712; SSE2-NEXT: pand %xmm5, %xmm2 713; SSE2-NEXT: por %xmm6, %xmm2 714; SSE2-NEXT: paddw %xmm4, %xmm4 715; SSE2-NEXT: pxor %xmm5, %xmm5 716; SSE2-NEXT: pcmpgtw %xmm4, %xmm5 717; SSE2-NEXT: movdqa %xmm5, %xmm6 718; SSE2-NEXT: pandn %xmm2, %xmm6 719; SSE2-NEXT: psraw $2, %xmm2 720; SSE2-NEXT: pand %xmm5, %xmm2 721; SSE2-NEXT: por %xmm6, %xmm2 722; SSE2-NEXT: paddw %xmm4, %xmm4 723; SSE2-NEXT: pxor %xmm5, %xmm5 724; SSE2-NEXT: pcmpgtw %xmm4, %xmm5 725; SSE2-NEXT: movdqa %xmm5, %xmm4 726; SSE2-NEXT: pandn %xmm2, %xmm4 727; SSE2-NEXT: psraw $1, %xmm2 728; SSE2-NEXT: pand %xmm5, %xmm2 729; SSE2-NEXT: por %xmm4, %xmm2 730; SSE2-NEXT: psrlw $8, %xmm2 731; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] 732; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] 733; SSE2-NEXT: pxor %xmm4, %xmm4 734; SSE2-NEXT: pcmpgtw %xmm1, %xmm4 735; SSE2-NEXT: movdqa %xmm4, %xmm5 736; SSE2-NEXT: pandn %xmm0, %xmm5 737; SSE2-NEXT: psraw $4, %xmm0 738; SSE2-NEXT: pand %xmm4, %xmm0 739; SSE2-NEXT: por %xmm5, %xmm0 740; SSE2-NEXT: paddw %xmm1, %xmm1 741; SSE2-NEXT: pxor %xmm4, %xmm4 742; SSE2-NEXT: pcmpgtw %xmm1, %xmm4 743; SSE2-NEXT: movdqa %xmm4, %xmm5 744; SSE2-NEXT: pandn %xmm0, %xmm5 745; SSE2-NEXT: psraw $2, %xmm0 746; SSE2-NEXT: pand %xmm4, %xmm0 747; SSE2-NEXT: por %xmm5, %xmm0 748; SSE2-NEXT: paddw %xmm1, %xmm1 749; SSE2-NEXT: pcmpgtw 
%xmm1, %xmm3 750; SSE2-NEXT: movdqa %xmm3, %xmm1 751; SSE2-NEXT: pandn %xmm0, %xmm1 752; SSE2-NEXT: psraw $1, %xmm0 753; SSE2-NEXT: pand %xmm3, %xmm0 754; SSE2-NEXT: por %xmm1, %xmm0 755; SSE2-NEXT: psrlw $8, %xmm0 756; SSE2-NEXT: packuswb %xmm2, %xmm0 757; SSE2-NEXT: retq 758; 759; SSE41-LABEL: var_shift_v4i8: 760; SSE41: # %bb.0: 761; SSE41-NEXT: movdqa %xmm0, %xmm2 762; SSE41-NEXT: psllw $5, %xmm1 763; SSE41-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15] 764; SSE41-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm2[8],xmm3[9],xmm2[9],xmm3[10],xmm2[10],xmm3[11],xmm2[11],xmm3[12],xmm2[12],xmm3[13],xmm2[13],xmm3[14],xmm2[14],xmm3[15],xmm2[15] 765; SSE41-NEXT: movdqa %xmm3, %xmm4 766; SSE41-NEXT: psraw $4, %xmm4 767; SSE41-NEXT: pblendvb %xmm0, %xmm4, %xmm3 768; SSE41-NEXT: movdqa %xmm3, %xmm4 769; SSE41-NEXT: psraw $2, %xmm4 770; SSE41-NEXT: paddw %xmm0, %xmm0 771; SSE41-NEXT: pblendvb %xmm0, %xmm4, %xmm3 772; SSE41-NEXT: movdqa %xmm3, %xmm4 773; SSE41-NEXT: psraw $1, %xmm4 774; SSE41-NEXT: paddw %xmm0, %xmm0 775; SSE41-NEXT: pblendvb %xmm0, %xmm4, %xmm3 776; SSE41-NEXT: psrlw $8, %xmm3 777; SSE41-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] 778; SSE41-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] 779; SSE41-NEXT: movdqa %xmm1, %xmm2 780; SSE41-NEXT: psraw $4, %xmm2 781; SSE41-NEXT: pblendvb %xmm0, %xmm2, %xmm1 782; SSE41-NEXT: movdqa %xmm1, %xmm2 783; SSE41-NEXT: psraw $2, %xmm2 784; SSE41-NEXT: paddw %xmm0, %xmm0 785; SSE41-NEXT: pblendvb %xmm0, %xmm2, %xmm1 786; SSE41-NEXT: movdqa %xmm1, %xmm2 787; SSE41-NEXT: psraw $1, %xmm2 788; SSE41-NEXT: paddw %xmm0, %xmm0 789; SSE41-NEXT: pblendvb %xmm0, %xmm2, %xmm1 790; 
SSE41-NEXT: psrlw $8, %xmm1 791; SSE41-NEXT: packuswb %xmm3, %xmm1 792; SSE41-NEXT: movdqa %xmm1, %xmm0 793; SSE41-NEXT: retq 794; 795; AVX-LABEL: var_shift_v4i8: 796; AVX: # %bb.0: 797; AVX-NEXT: vpsllw $5, %xmm1, %xmm1 798; AVX-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] 799; AVX-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] 800; AVX-NEXT: vpsraw $4, %xmm3, %xmm4 801; AVX-NEXT: vpblendvb %xmm2, %xmm4, %xmm3, %xmm3 802; AVX-NEXT: vpsraw $2, %xmm3, %xmm4 803; AVX-NEXT: vpaddw %xmm2, %xmm2, %xmm2 804; AVX-NEXT: vpblendvb %xmm2, %xmm4, %xmm3, %xmm3 805; AVX-NEXT: vpsraw $1, %xmm3, %xmm4 806; AVX-NEXT: vpaddw %xmm2, %xmm2, %xmm2 807; AVX-NEXT: vpblendvb %xmm2, %xmm4, %xmm3, %xmm2 808; AVX-NEXT: vpsrlw $8, %xmm2, %xmm2 809; AVX-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] 810; AVX-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] 811; AVX-NEXT: vpsraw $4, %xmm0, %xmm3 812; AVX-NEXT: vpblendvb %xmm1, %xmm3, %xmm0, %xmm0 813; AVX-NEXT: vpsraw $2, %xmm0, %xmm3 814; AVX-NEXT: vpaddw %xmm1, %xmm1, %xmm1 815; AVX-NEXT: vpblendvb %xmm1, %xmm3, %xmm0, %xmm0 816; AVX-NEXT: vpsraw $1, %xmm0, %xmm3 817; AVX-NEXT: vpaddw %xmm1, %xmm1, %xmm1 818; AVX-NEXT: vpblendvb %xmm1, %xmm3, %xmm0, %xmm0 819; AVX-NEXT: vpsrlw $8, %xmm0, %xmm0 820; AVX-NEXT: vpackuswb %xmm2, %xmm0, %xmm0 821; AVX-NEXT: retq 822; 823; XOP-LABEL: var_shift_v4i8: 824; XOP: # %bb.0: 825; XOP-NEXT: vpxor %xmm2, %xmm2, %xmm2 826; XOP-NEXT: vpsubb %xmm1, %xmm2, %xmm1 827; XOP-NEXT: vpshab %xmm1, %xmm0, %xmm0 828; XOP-NEXT: retq 829; 830; AVX512DQ-LABEL: var_shift_v4i8: 831; AVX512DQ: # %bb.0: 832; AVX512DQ-NEXT: vpmovzxbd {{.*#+}} zmm1 = 
xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero,xmm1[8],zero,zero,zero,xmm1[9],zero,zero,zero,xmm1[10],zero,zero,zero,xmm1[11],zero,zero,zero,xmm1[12],zero,zero,zero,xmm1[13],zero,zero,zero,xmm1[14],zero,zero,zero,xmm1[15],zero,zero,zero 833; AVX512DQ-NEXT: vpmovsxbd %xmm0, %zmm0 834; AVX512DQ-NEXT: vpsravd %zmm1, %zmm0, %zmm0 835; AVX512DQ-NEXT: vpmovdb %zmm0, %xmm0 836; AVX512DQ-NEXT: vzeroupper 837; AVX512DQ-NEXT: retq 838; 839; AVX512BW-LABEL: var_shift_v4i8: 840; AVX512BW: # %bb.0: 841; AVX512BW-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero 842; AVX512BW-NEXT: vpmovsxbw %xmm0, %ymm0 843; AVX512BW-NEXT: vpsravw %zmm1, %zmm0, %zmm0 844; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0 845; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 846; AVX512BW-NEXT: vzeroupper 847; AVX512BW-NEXT: retq 848; 849; AVX512DQVL-LABEL: var_shift_v4i8: 850; AVX512DQVL: # %bb.0: 851; AVX512DQVL-NEXT: vpmovzxbd {{.*#+}} zmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero,xmm1[8],zero,zero,zero,xmm1[9],zero,zero,zero,xmm1[10],zero,zero,zero,xmm1[11],zero,zero,zero,xmm1[12],zero,zero,zero,xmm1[13],zero,zero,zero,xmm1[14],zero,zero,zero,xmm1[15],zero,zero,zero 852; AVX512DQVL-NEXT: vpmovsxbd %xmm0, %zmm0 853; AVX512DQVL-NEXT: vpsravd %zmm1, %zmm0, %zmm0 854; AVX512DQVL-NEXT: vpmovdb %zmm0, %xmm0 855; AVX512DQVL-NEXT: vzeroupper 856; AVX512DQVL-NEXT: retq 857; 858; AVX512BWVL-LABEL: var_shift_v4i8: 859; AVX512BWVL: # %bb.0: 860; AVX512BWVL-NEXT: vpmovzxbw {{.*#+}} ymm1 = 
xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero 861; AVX512BWVL-NEXT: vpmovsxbw %xmm0, %ymm0 862; AVX512BWVL-NEXT: vpsravw %ymm1, %ymm0, %ymm0 863; AVX512BWVL-NEXT: vpmovwb %ymm0, %xmm0 864; AVX512BWVL-NEXT: vzeroupper 865; AVX512BWVL-NEXT: retq 866; 867; X86-SSE-LABEL: var_shift_v4i8: 868; X86-SSE: # %bb.0: 869; X86-SSE-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm0[8],xmm2[9],xmm0[9],xmm2[10],xmm0[10],xmm2[11],xmm0[11],xmm2[12],xmm0[12],xmm2[13],xmm0[13],xmm2[14],xmm0[14],xmm2[15],xmm0[15] 870; X86-SSE-NEXT: psllw $5, %xmm1 871; X86-SSE-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm1[8],xmm4[9],xmm1[9],xmm4[10],xmm1[10],xmm4[11],xmm1[11],xmm4[12],xmm1[12],xmm4[13],xmm1[13],xmm4[14],xmm1[14],xmm4[15],xmm1[15] 872; X86-SSE-NEXT: pxor %xmm3, %xmm3 873; X86-SSE-NEXT: pxor %xmm5, %xmm5 874; X86-SSE-NEXT: pcmpgtw %xmm4, %xmm5 875; X86-SSE-NEXT: movdqa %xmm5, %xmm6 876; X86-SSE-NEXT: pandn %xmm2, %xmm6 877; X86-SSE-NEXT: psraw $4, %xmm2 878; X86-SSE-NEXT: pand %xmm5, %xmm2 879; X86-SSE-NEXT: por %xmm6, %xmm2 880; X86-SSE-NEXT: paddw %xmm4, %xmm4 881; X86-SSE-NEXT: pxor %xmm5, %xmm5 882; X86-SSE-NEXT: pcmpgtw %xmm4, %xmm5 883; X86-SSE-NEXT: movdqa %xmm5, %xmm6 884; X86-SSE-NEXT: pandn %xmm2, %xmm6 885; X86-SSE-NEXT: psraw $2, %xmm2 886; X86-SSE-NEXT: pand %xmm5, %xmm2 887; X86-SSE-NEXT: por %xmm6, %xmm2 888; X86-SSE-NEXT: paddw %xmm4, %xmm4 889; X86-SSE-NEXT: pxor %xmm5, %xmm5 890; X86-SSE-NEXT: pcmpgtw %xmm4, %xmm5 891; X86-SSE-NEXT: movdqa %xmm5, %xmm4 892; X86-SSE-NEXT: pandn %xmm2, %xmm4 893; X86-SSE-NEXT: psraw $1, %xmm2 894; X86-SSE-NEXT: pand %xmm5, %xmm2 895; X86-SSE-NEXT: por %xmm4, %xmm2 896; X86-SSE-NEXT: psrlw $8, %xmm2 897; X86-SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] 898; X86-SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] 899; X86-SSE-NEXT: 
pxor %xmm4, %xmm4 900; X86-SSE-NEXT: pcmpgtw %xmm1, %xmm4 901; X86-SSE-NEXT: movdqa %xmm4, %xmm5 902; X86-SSE-NEXT: pandn %xmm0, %xmm5 903; X86-SSE-NEXT: psraw $4, %xmm0 904; X86-SSE-NEXT: pand %xmm4, %xmm0 905; X86-SSE-NEXT: por %xmm5, %xmm0 906; X86-SSE-NEXT: paddw %xmm1, %xmm1 907; X86-SSE-NEXT: pxor %xmm4, %xmm4 908; X86-SSE-NEXT: pcmpgtw %xmm1, %xmm4 909; X86-SSE-NEXT: movdqa %xmm4, %xmm5 910; X86-SSE-NEXT: pandn %xmm0, %xmm5 911; X86-SSE-NEXT: psraw $2, %xmm0 912; X86-SSE-NEXT: pand %xmm4, %xmm0 913; X86-SSE-NEXT: por %xmm5, %xmm0 914; X86-SSE-NEXT: paddw %xmm1, %xmm1 915; X86-SSE-NEXT: pcmpgtw %xmm1, %xmm3 916; X86-SSE-NEXT: movdqa %xmm3, %xmm1 917; X86-SSE-NEXT: pandn %xmm0, %xmm1 918; X86-SSE-NEXT: psraw $1, %xmm0 919; X86-SSE-NEXT: pand %xmm3, %xmm0 920; X86-SSE-NEXT: por %xmm1, %xmm0 921; X86-SSE-NEXT: psrlw $8, %xmm0 922; X86-SSE-NEXT: packuswb %xmm2, %xmm0 923; X86-SSE-NEXT: retl 924 %shift = ashr <4 x i8> %a, %b 925 ret <4 x i8> %shift 926} 927 928define <2 x i8> @var_shift_v2i8(<2 x i8> %a, <2 x i8> %b) nounwind { 929; SSE2-LABEL: var_shift_v2i8: 930; SSE2: # %bb.0: 931; SSE2-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm0[8],xmm2[9],xmm0[9],xmm2[10],xmm0[10],xmm2[11],xmm0[11],xmm2[12],xmm0[12],xmm2[13],xmm0[13],xmm2[14],xmm0[14],xmm2[15],xmm0[15] 932; SSE2-NEXT: psllw $5, %xmm1 933; SSE2-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm1[8],xmm4[9],xmm1[9],xmm4[10],xmm1[10],xmm4[11],xmm1[11],xmm4[12],xmm1[12],xmm4[13],xmm1[13],xmm4[14],xmm1[14],xmm4[15],xmm1[15] 934; SSE2-NEXT: pxor %xmm3, %xmm3 935; SSE2-NEXT: pxor %xmm5, %xmm5 936; SSE2-NEXT: pcmpgtw %xmm4, %xmm5 937; SSE2-NEXT: movdqa %xmm5, %xmm6 938; SSE2-NEXT: pandn %xmm2, %xmm6 939; SSE2-NEXT: psraw $4, %xmm2 940; SSE2-NEXT: pand %xmm5, %xmm2 941; SSE2-NEXT: por %xmm6, %xmm2 942; SSE2-NEXT: paddw %xmm4, %xmm4 943; SSE2-NEXT: pxor %xmm5, %xmm5 944; SSE2-NEXT: pcmpgtw %xmm4, %xmm5 945; SSE2-NEXT: movdqa %xmm5, %xmm6 946; SSE2-NEXT: pandn %xmm2, %xmm6 947; SSE2-NEXT: psraw $2, %xmm2 948; SSE2-NEXT: pand 
%xmm5, %xmm2 949; SSE2-NEXT: por %xmm6, %xmm2 950; SSE2-NEXT: paddw %xmm4, %xmm4 951; SSE2-NEXT: pxor %xmm5, %xmm5 952; SSE2-NEXT: pcmpgtw %xmm4, %xmm5 953; SSE2-NEXT: movdqa %xmm5, %xmm4 954; SSE2-NEXT: pandn %xmm2, %xmm4 955; SSE2-NEXT: psraw $1, %xmm2 956; SSE2-NEXT: pand %xmm5, %xmm2 957; SSE2-NEXT: por %xmm4, %xmm2 958; SSE2-NEXT: psrlw $8, %xmm2 959; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] 960; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] 961; SSE2-NEXT: pxor %xmm4, %xmm4 962; SSE2-NEXT: pcmpgtw %xmm1, %xmm4 963; SSE2-NEXT: movdqa %xmm4, %xmm5 964; SSE2-NEXT: pandn %xmm0, %xmm5 965; SSE2-NEXT: psraw $4, %xmm0 966; SSE2-NEXT: pand %xmm4, %xmm0 967; SSE2-NEXT: por %xmm5, %xmm0 968; SSE2-NEXT: paddw %xmm1, %xmm1 969; SSE2-NEXT: pxor %xmm4, %xmm4 970; SSE2-NEXT: pcmpgtw %xmm1, %xmm4 971; SSE2-NEXT: movdqa %xmm4, %xmm5 972; SSE2-NEXT: pandn %xmm0, %xmm5 973; SSE2-NEXT: psraw $2, %xmm0 974; SSE2-NEXT: pand %xmm4, %xmm0 975; SSE2-NEXT: por %xmm5, %xmm0 976; SSE2-NEXT: paddw %xmm1, %xmm1 977; SSE2-NEXT: pcmpgtw %xmm1, %xmm3 978; SSE2-NEXT: movdqa %xmm3, %xmm1 979; SSE2-NEXT: pandn %xmm0, %xmm1 980; SSE2-NEXT: psraw $1, %xmm0 981; SSE2-NEXT: pand %xmm3, %xmm0 982; SSE2-NEXT: por %xmm1, %xmm0 983; SSE2-NEXT: psrlw $8, %xmm0 984; SSE2-NEXT: packuswb %xmm2, %xmm0 985; SSE2-NEXT: retq 986; 987; SSE41-LABEL: var_shift_v2i8: 988; SSE41: # %bb.0: 989; SSE41-NEXT: movdqa %xmm0, %xmm2 990; SSE41-NEXT: psllw $5, %xmm1 991; SSE41-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15] 992; SSE41-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm2[8],xmm3[9],xmm2[9],xmm3[10],xmm2[10],xmm3[11],xmm2[11],xmm3[12],xmm2[12],xmm3[13],xmm2[13],xmm3[14],xmm2[14],xmm3[15],xmm2[15] 993; SSE41-NEXT: movdqa %xmm3, %xmm4 994; SSE41-NEXT: psraw $4, %xmm4 995; SSE41-NEXT: pblendvb %xmm0, %xmm4, %xmm3 996; SSE41-NEXT: movdqa 
%xmm3, %xmm4 997; SSE41-NEXT: psraw $2, %xmm4 998; SSE41-NEXT: paddw %xmm0, %xmm0 999; SSE41-NEXT: pblendvb %xmm0, %xmm4, %xmm3 1000; SSE41-NEXT: movdqa %xmm3, %xmm4 1001; SSE41-NEXT: psraw $1, %xmm4 1002; SSE41-NEXT: paddw %xmm0, %xmm0 1003; SSE41-NEXT: pblendvb %xmm0, %xmm4, %xmm3 1004; SSE41-NEXT: psrlw $8, %xmm3 1005; SSE41-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] 1006; SSE41-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] 1007; SSE41-NEXT: movdqa %xmm1, %xmm2 1008; SSE41-NEXT: psraw $4, %xmm2 1009; SSE41-NEXT: pblendvb %xmm0, %xmm2, %xmm1 1010; SSE41-NEXT: movdqa %xmm1, %xmm2 1011; SSE41-NEXT: psraw $2, %xmm2 1012; SSE41-NEXT: paddw %xmm0, %xmm0 1013; SSE41-NEXT: pblendvb %xmm0, %xmm2, %xmm1 1014; SSE41-NEXT: movdqa %xmm1, %xmm2 1015; SSE41-NEXT: psraw $1, %xmm2 1016; SSE41-NEXT: paddw %xmm0, %xmm0 1017; SSE41-NEXT: pblendvb %xmm0, %xmm2, %xmm1 1018; SSE41-NEXT: psrlw $8, %xmm1 1019; SSE41-NEXT: packuswb %xmm3, %xmm1 1020; SSE41-NEXT: movdqa %xmm1, %xmm0 1021; SSE41-NEXT: retq 1022; 1023; AVX-LABEL: var_shift_v2i8: 1024; AVX: # %bb.0: 1025; AVX-NEXT: vpsllw $5, %xmm1, %xmm1 1026; AVX-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] 1027; AVX-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] 1028; AVX-NEXT: vpsraw $4, %xmm3, %xmm4 1029; AVX-NEXT: vpblendvb %xmm2, %xmm4, %xmm3, %xmm3 1030; AVX-NEXT: vpsraw $2, %xmm3, %xmm4 1031; AVX-NEXT: vpaddw %xmm2, %xmm2, %xmm2 1032; AVX-NEXT: vpblendvb %xmm2, %xmm4, %xmm3, %xmm3 1033; AVX-NEXT: vpsraw $1, %xmm3, %xmm4 1034; AVX-NEXT: vpaddw %xmm2, %xmm2, %xmm2 1035; AVX-NEXT: vpblendvb %xmm2, %xmm4, %xmm3, %xmm2 1036; AVX-NEXT: vpsrlw $8, %xmm2, %xmm2 1037; AVX-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] 
1038; AVX-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] 1039; AVX-NEXT: vpsraw $4, %xmm0, %xmm3 1040; AVX-NEXT: vpblendvb %xmm1, %xmm3, %xmm0, %xmm0 1041; AVX-NEXT: vpsraw $2, %xmm0, %xmm3 1042; AVX-NEXT: vpaddw %xmm1, %xmm1, %xmm1 1043; AVX-NEXT: vpblendvb %xmm1, %xmm3, %xmm0, %xmm0 1044; AVX-NEXT: vpsraw $1, %xmm0, %xmm3 1045; AVX-NEXT: vpaddw %xmm1, %xmm1, %xmm1 1046; AVX-NEXT: vpblendvb %xmm1, %xmm3, %xmm0, %xmm0 1047; AVX-NEXT: vpsrlw $8, %xmm0, %xmm0 1048; AVX-NEXT: vpackuswb %xmm2, %xmm0, %xmm0 1049; AVX-NEXT: retq 1050; 1051; XOP-LABEL: var_shift_v2i8: 1052; XOP: # %bb.0: 1053; XOP-NEXT: vpxor %xmm2, %xmm2, %xmm2 1054; XOP-NEXT: vpsubb %xmm1, %xmm2, %xmm1 1055; XOP-NEXT: vpshab %xmm1, %xmm0, %xmm0 1056; XOP-NEXT: retq 1057; 1058; AVX512DQ-LABEL: var_shift_v2i8: 1059; AVX512DQ: # %bb.0: 1060; AVX512DQ-NEXT: vpmovzxbd {{.*#+}} zmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero,xmm1[8],zero,zero,zero,xmm1[9],zero,zero,zero,xmm1[10],zero,zero,zero,xmm1[11],zero,zero,zero,xmm1[12],zero,zero,zero,xmm1[13],zero,zero,zero,xmm1[14],zero,zero,zero,xmm1[15],zero,zero,zero 1061; AVX512DQ-NEXT: vpmovsxbd %xmm0, %zmm0 1062; AVX512DQ-NEXT: vpsravd %zmm1, %zmm0, %zmm0 1063; AVX512DQ-NEXT: vpmovdb %zmm0, %xmm0 1064; AVX512DQ-NEXT: vzeroupper 1065; AVX512DQ-NEXT: retq 1066; 1067; AVX512BW-LABEL: var_shift_v2i8: 1068; AVX512BW: # %bb.0: 1069; AVX512BW-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero 1070; AVX512BW-NEXT: vpmovsxbw %xmm0, %ymm0 1071; AVX512BW-NEXT: vpsravw %zmm1, %zmm0, %zmm0 1072; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0 1073; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 1074; AVX512BW-NEXT: 
vzeroupper 1075; AVX512BW-NEXT: retq 1076; 1077; AVX512DQVL-LABEL: var_shift_v2i8: 1078; AVX512DQVL: # %bb.0: 1079; AVX512DQVL-NEXT: vpmovzxbd {{.*#+}} zmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero,xmm1[8],zero,zero,zero,xmm1[9],zero,zero,zero,xmm1[10],zero,zero,zero,xmm1[11],zero,zero,zero,xmm1[12],zero,zero,zero,xmm1[13],zero,zero,zero,xmm1[14],zero,zero,zero,xmm1[15],zero,zero,zero 1080; AVX512DQVL-NEXT: vpmovsxbd %xmm0, %zmm0 1081; AVX512DQVL-NEXT: vpsravd %zmm1, %zmm0, %zmm0 1082; AVX512DQVL-NEXT: vpmovdb %zmm0, %xmm0 1083; AVX512DQVL-NEXT: vzeroupper 1084; AVX512DQVL-NEXT: retq 1085; 1086; AVX512BWVL-LABEL: var_shift_v2i8: 1087; AVX512BWVL: # %bb.0: 1088; AVX512BWVL-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero 1089; AVX512BWVL-NEXT: vpmovsxbw %xmm0, %ymm0 1090; AVX512BWVL-NEXT: vpsravw %ymm1, %ymm0, %ymm0 1091; AVX512BWVL-NEXT: vpmovwb %ymm0, %xmm0 1092; AVX512BWVL-NEXT: vzeroupper 1093; AVX512BWVL-NEXT: retq 1094; 1095; X86-SSE-LABEL: var_shift_v2i8: 1096; X86-SSE: # %bb.0: 1097; X86-SSE-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm0[8],xmm2[9],xmm0[9],xmm2[10],xmm0[10],xmm2[11],xmm0[11],xmm2[12],xmm0[12],xmm2[13],xmm0[13],xmm2[14],xmm0[14],xmm2[15],xmm0[15] 1098; X86-SSE-NEXT: psllw $5, %xmm1 1099; X86-SSE-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm1[8],xmm4[9],xmm1[9],xmm4[10],xmm1[10],xmm4[11],xmm1[11],xmm4[12],xmm1[12],xmm4[13],xmm1[13],xmm4[14],xmm1[14],xmm4[15],xmm1[15] 1100; X86-SSE-NEXT: pxor %xmm3, %xmm3 1101; X86-SSE-NEXT: pxor %xmm5, %xmm5 1102; X86-SSE-NEXT: pcmpgtw %xmm4, %xmm5 1103; X86-SSE-NEXT: movdqa %xmm5, %xmm6 1104; X86-SSE-NEXT: pandn %xmm2, %xmm6 1105; X86-SSE-NEXT: psraw $4, %xmm2 1106; 
X86-SSE-NEXT: pand %xmm5, %xmm2 1107; X86-SSE-NEXT: por %xmm6, %xmm2 1108; X86-SSE-NEXT: paddw %xmm4, %xmm4 1109; X86-SSE-NEXT: pxor %xmm5, %xmm5 1110; X86-SSE-NEXT: pcmpgtw %xmm4, %xmm5 1111; X86-SSE-NEXT: movdqa %xmm5, %xmm6 1112; X86-SSE-NEXT: pandn %xmm2, %xmm6 1113; X86-SSE-NEXT: psraw $2, %xmm2 1114; X86-SSE-NEXT: pand %xmm5, %xmm2 1115; X86-SSE-NEXT: por %xmm6, %xmm2 1116; X86-SSE-NEXT: paddw %xmm4, %xmm4 1117; X86-SSE-NEXT: pxor %xmm5, %xmm5 1118; X86-SSE-NEXT: pcmpgtw %xmm4, %xmm5 1119; X86-SSE-NEXT: movdqa %xmm5, %xmm4 1120; X86-SSE-NEXT: pandn %xmm2, %xmm4 1121; X86-SSE-NEXT: psraw $1, %xmm2 1122; X86-SSE-NEXT: pand %xmm5, %xmm2 1123; X86-SSE-NEXT: por %xmm4, %xmm2 1124; X86-SSE-NEXT: psrlw $8, %xmm2 1125; X86-SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] 1126; X86-SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] 1127; X86-SSE-NEXT: pxor %xmm4, %xmm4 1128; X86-SSE-NEXT: pcmpgtw %xmm1, %xmm4 1129; X86-SSE-NEXT: movdqa %xmm4, %xmm5 1130; X86-SSE-NEXT: pandn %xmm0, %xmm5 1131; X86-SSE-NEXT: psraw $4, %xmm0 1132; X86-SSE-NEXT: pand %xmm4, %xmm0 1133; X86-SSE-NEXT: por %xmm5, %xmm0 1134; X86-SSE-NEXT: paddw %xmm1, %xmm1 1135; X86-SSE-NEXT: pxor %xmm4, %xmm4 1136; X86-SSE-NEXT: pcmpgtw %xmm1, %xmm4 1137; X86-SSE-NEXT: movdqa %xmm4, %xmm5 1138; X86-SSE-NEXT: pandn %xmm0, %xmm5 1139; X86-SSE-NEXT: psraw $2, %xmm0 1140; X86-SSE-NEXT: pand %xmm4, %xmm0 1141; X86-SSE-NEXT: por %xmm5, %xmm0 1142; X86-SSE-NEXT: paddw %xmm1, %xmm1 1143; X86-SSE-NEXT: pcmpgtw %xmm1, %xmm3 1144; X86-SSE-NEXT: movdqa %xmm3, %xmm1 1145; X86-SSE-NEXT: pandn %xmm0, %xmm1 1146; X86-SSE-NEXT: psraw $1, %xmm0 1147; X86-SSE-NEXT: pand %xmm3, %xmm0 1148; X86-SSE-NEXT: por %xmm1, %xmm0 1149; X86-SSE-NEXT: psrlw $8, %xmm0 1150; X86-SSE-NEXT: packuswb %xmm2, %xmm0 1151; X86-SSE-NEXT: retl 1152 %shift = ashr <2 x i8> %a, %b 1153 ret <2 x i8> %shift 1154} 1155 1156; 1157; Uniform Variable Shifts 1158; 1159 1160define <2 x i32> 
@splatvar_shift_v2i32(<2 x i32> %a, <2 x i32> %b) nounwind { 1161; SSE2-LABEL: splatvar_shift_v2i32: 1162; SSE2: # %bb.0: 1163; SSE2-NEXT: xorps %xmm2, %xmm2 1164; SSE2-NEXT: movss {{.*#+}} xmm2 = xmm1[0],xmm2[1,2,3] 1165; SSE2-NEXT: psrad %xmm2, %xmm0 1166; SSE2-NEXT: retq 1167; 1168; SSE41-LABEL: splatvar_shift_v2i32: 1169; SSE41: # %bb.0: 1170; SSE41-NEXT: pmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero 1171; SSE41-NEXT: psrad %xmm1, %xmm0 1172; SSE41-NEXT: retq 1173; 1174; AVX-LABEL: splatvar_shift_v2i32: 1175; AVX: # %bb.0: 1176; AVX-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero 1177; AVX-NEXT: vpsrad %xmm1, %xmm0, %xmm0 1178; AVX-NEXT: retq 1179; 1180; XOP-LABEL: splatvar_shift_v2i32: 1181; XOP: # %bb.0: 1182; XOP-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero 1183; XOP-NEXT: vpsrad %xmm1, %xmm0, %xmm0 1184; XOP-NEXT: retq 1185; 1186; AVX512-LABEL: splatvar_shift_v2i32: 1187; AVX512: # %bb.0: 1188; AVX512-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero 1189; AVX512-NEXT: vpsrad %xmm1, %xmm0, %xmm0 1190; AVX512-NEXT: retq 1191; 1192; AVX512VL-LABEL: splatvar_shift_v2i32: 1193; AVX512VL: # %bb.0: 1194; AVX512VL-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero 1195; AVX512VL-NEXT: vpsrad %xmm1, %xmm0, %xmm0 1196; AVX512VL-NEXT: retq 1197; 1198; X86-SSE-LABEL: splatvar_shift_v2i32: 1199; X86-SSE: # %bb.0: 1200; X86-SSE-NEXT: xorps %xmm2, %xmm2 1201; X86-SSE-NEXT: movss {{.*#+}} xmm2 = xmm1[0],xmm2[1,2,3] 1202; X86-SSE-NEXT: psrad %xmm2, %xmm0 1203; X86-SSE-NEXT: retl 1204 %splat = shufflevector <2 x i32> %b, <2 x i32> undef, <2 x i32> zeroinitializer 1205 %shift = ashr <2 x i32> %a, %splat 1206 ret <2 x i32> %shift 1207} 1208 1209define <4 x i16> @splatvar_shift_v4i16(<4 x i16> %a, <4 x i16> %b) nounwind { 1210; SSE2-LABEL: splatvar_shift_v4i16: 1211; SSE2: # %bb.0: 1212; SSE2-NEXT: pslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0,1] 1213; SSE2-NEXT: psrldq {{.*#+}} 
xmm1 = xmm1[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero 1214; SSE2-NEXT: psraw %xmm1, %xmm0 1215; SSE2-NEXT: retq 1216; 1217; SSE41-LABEL: splatvar_shift_v4i16: 1218; SSE41: # %bb.0: 1219; SSE41-NEXT: pmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero 1220; SSE41-NEXT: psraw %xmm1, %xmm0 1221; SSE41-NEXT: retq 1222; 1223; AVX-LABEL: splatvar_shift_v4i16: 1224; AVX: # %bb.0: 1225; AVX-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero 1226; AVX-NEXT: vpsraw %xmm1, %xmm0, %xmm0 1227; AVX-NEXT: retq 1228; 1229; XOP-LABEL: splatvar_shift_v4i16: 1230; XOP: # %bb.0: 1231; XOP-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero 1232; XOP-NEXT: vpsraw %xmm1, %xmm0, %xmm0 1233; XOP-NEXT: retq 1234; 1235; AVX512-LABEL: splatvar_shift_v4i16: 1236; AVX512: # %bb.0: 1237; AVX512-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero 1238; AVX512-NEXT: vpsraw %xmm1, %xmm0, %xmm0 1239; AVX512-NEXT: retq 1240; 1241; AVX512VL-LABEL: splatvar_shift_v4i16: 1242; AVX512VL: # %bb.0: 1243; AVX512VL-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero 1244; AVX512VL-NEXT: vpsraw %xmm1, %xmm0, %xmm0 1245; AVX512VL-NEXT: retq 1246; 1247; X86-SSE-LABEL: splatvar_shift_v4i16: 1248; X86-SSE: # %bb.0: 1249; X86-SSE-NEXT: pslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0,1] 1250; X86-SSE-NEXT: psrldq {{.*#+}} xmm1 = xmm1[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero 1251; X86-SSE-NEXT: psraw %xmm1, %xmm0 1252; X86-SSE-NEXT: retl 1253 %splat = shufflevector <4 x i16> %b, <4 x i16> undef, <4 x i32> zeroinitializer 1254 %shift = ashr <4 x i16> %a, %splat 1255 ret <4 x i16> %shift 1256} 1257 1258define <2 x i16> @splatvar_shift_v2i16(<2 x i16> %a, <2 x i16> %b) nounwind { 1259; SSE2-LABEL: splatvar_shift_v2i16: 1260; SSE2: # %bb.0: 1261; SSE2-NEXT: pslldq {{.*#+}} 
xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0,1] 1262; SSE2-NEXT: psrldq {{.*#+}} xmm1 = xmm1[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero 1263; SSE2-NEXT: psraw %xmm1, %xmm0 1264; SSE2-NEXT: retq 1265; 1266; SSE41-LABEL: splatvar_shift_v2i16: 1267; SSE41: # %bb.0: 1268; SSE41-NEXT: pmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero 1269; SSE41-NEXT: psraw %xmm1, %xmm0 1270; SSE41-NEXT: retq 1271; 1272; AVX-LABEL: splatvar_shift_v2i16: 1273; AVX: # %bb.0: 1274; AVX-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero 1275; AVX-NEXT: vpsraw %xmm1, %xmm0, %xmm0 1276; AVX-NEXT: retq 1277; 1278; XOP-LABEL: splatvar_shift_v2i16: 1279; XOP: # %bb.0: 1280; XOP-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero 1281; XOP-NEXT: vpsraw %xmm1, %xmm0, %xmm0 1282; XOP-NEXT: retq 1283; 1284; AVX512-LABEL: splatvar_shift_v2i16: 1285; AVX512: # %bb.0: 1286; AVX512-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero 1287; AVX512-NEXT: vpsraw %xmm1, %xmm0, %xmm0 1288; AVX512-NEXT: retq 1289; 1290; AVX512VL-LABEL: splatvar_shift_v2i16: 1291; AVX512VL: # %bb.0: 1292; AVX512VL-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero 1293; AVX512VL-NEXT: vpsraw %xmm1, %xmm0, %xmm0 1294; AVX512VL-NEXT: retq 1295; 1296; X86-SSE-LABEL: splatvar_shift_v2i16: 1297; X86-SSE: # %bb.0: 1298; X86-SSE-NEXT: pslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0,1] 1299; X86-SSE-NEXT: psrldq {{.*#+}} xmm1 = xmm1[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero 1300; X86-SSE-NEXT: psraw %xmm1, %xmm0 1301; X86-SSE-NEXT: retl 1302 %splat = shufflevector <2 x i16> %b, <2 x i16> undef, <2 x i32> zeroinitializer 1303 %shift = ashr <2 x i16> %a, %splat 1304 ret <2 x i16> %shift 1305} 1306 1307define <8 x i8> @splatvar_shift_v8i8(<8 x i8> %a, <8 x 
i8> %b) nounwind { 1308; SSE2-LABEL: splatvar_shift_v8i8: 1309; SSE2: # %bb.0: 1310; SSE2-NEXT: pslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0] 1311; SSE2-NEXT: psrldq {{.*#+}} xmm1 = xmm1[15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero 1312; SSE2-NEXT: psrlw %xmm1, %xmm0 1313; SSE2-NEXT: pcmpeqd %xmm2, %xmm2 1314; SSE2-NEXT: psrlw %xmm1, %xmm2 1315; SSE2-NEXT: psrlw $8, %xmm2 1316; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] 1317; SSE2-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,0,0,0,4,5,6,7] 1318; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,0,0,0] 1319; SSE2-NEXT: pand %xmm2, %xmm0 1320; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [32896,32896,32896,32896,32896,32896,32896,32896] 1321; SSE2-NEXT: psrlw %xmm1, %xmm2 1322; SSE2-NEXT: pxor %xmm2, %xmm0 1323; SSE2-NEXT: psubb %xmm2, %xmm0 1324; SSE2-NEXT: retq 1325; 1326; SSE41-LABEL: splatvar_shift_v8i8: 1327; SSE41: # %bb.0: 1328; SSE41-NEXT: pmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero 1329; SSE41-NEXT: psrlw %xmm1, %xmm0 1330; SSE41-NEXT: pcmpeqd %xmm2, %xmm2 1331; SSE41-NEXT: psrlw %xmm1, %xmm2 1332; SSE41-NEXT: pshufb {{.*#+}} xmm2 = xmm2[1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] 1333; SSE41-NEXT: pand %xmm2, %xmm0 1334; SSE41-NEXT: movdqa {{.*#+}} xmm2 = [32896,32896,32896,32896,32896,32896,32896,32896] 1335; SSE41-NEXT: psrlw %xmm1, %xmm2 1336; SSE41-NEXT: pxor %xmm2, %xmm0 1337; SSE41-NEXT: psubb %xmm2, %xmm0 1338; SSE41-NEXT: retq 1339; 1340; AVX1-LABEL: splatvar_shift_v8i8: 1341; AVX1: # %bb.0: 1342; AVX1-NEXT: vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero 1343; AVX1-NEXT: vpsrlw %xmm1, %xmm0, %xmm0 1344; AVX1-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2 1345; AVX1-NEXT: vpsrlw %xmm1, %xmm2, %xmm2 1346; AVX1-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] 1347; AVX1-NEXT: 
vpand %xmm2, %xmm0, %xmm0 1348; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [32896,32896,32896,32896,32896,32896,32896,32896] 1349; AVX1-NEXT: vpsrlw %xmm1, %xmm2, %xmm1 1350; AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm0 1351; AVX1-NEXT: vpsubb %xmm1, %xmm0, %xmm0 1352; AVX1-NEXT: retq 1353; 1354; AVX2-LABEL: splatvar_shift_v8i8: 1355; AVX2: # %bb.0: 1356; AVX2-NEXT: vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero 1357; AVX2-NEXT: vpsrlw %xmm1, %xmm0, %xmm0 1358; AVX2-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2 1359; AVX2-NEXT: vpsrlw %xmm1, %xmm2, %xmm2 1360; AVX2-NEXT: vpsrlw $8, %xmm2, %xmm2 1361; AVX2-NEXT: vpbroadcastb %xmm2, %xmm2 1362; AVX2-NEXT: vpand %xmm2, %xmm0, %xmm0 1363; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = [32896,32896,32896,32896,32896,32896,32896,32896] 1364; AVX2-NEXT: vpsrlw %xmm1, %xmm2, %xmm1 1365; AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0 1366; AVX2-NEXT: vpsubb %xmm1, %xmm0, %xmm0 1367; AVX2-NEXT: retq 1368; 1369; XOPAVX1-LABEL: splatvar_shift_v8i8: 1370; XOPAVX1: # %bb.0: 1371; XOPAVX1-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] 1372; XOPAVX1-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,0,0,0,4,5,6,7] 1373; XOPAVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 1374; XOPAVX1-NEXT: vpsubb %xmm1, %xmm2, %xmm1 1375; XOPAVX1-NEXT: vpshab %xmm1, %xmm0, %xmm0 1376; XOPAVX1-NEXT: retq 1377; 1378; XOPAVX2-LABEL: splatvar_shift_v8i8: 1379; XOPAVX2: # %bb.0: 1380; XOPAVX2-NEXT: vpbroadcastb %xmm1, %xmm1 1381; XOPAVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2 1382; XOPAVX2-NEXT: vpsubb %xmm1, %xmm2, %xmm1 1383; XOPAVX2-NEXT: vpshab %xmm1, %xmm0, %xmm0 1384; XOPAVX2-NEXT: retq 1385; 1386; AVX512DQ-LABEL: splatvar_shift_v8i8: 1387; AVX512DQ: # %bb.0: 1388; AVX512DQ-NEXT: vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero 1389; AVX512DQ-NEXT: vpmovsxbd %xmm0, %zmm0 1390; AVX512DQ-NEXT: vpsrad %xmm1, %zmm0, %zmm0 1391; AVX512DQ-NEXT: vpmovdb %zmm0, %xmm0 1392; 
AVX512DQ-NEXT: vzeroupper 1393; AVX512DQ-NEXT: retq 1394; 1395; AVX512BW-LABEL: splatvar_shift_v8i8: 1396; AVX512BW: # %bb.0: 1397; AVX512BW-NEXT: vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero 1398; AVX512BW-NEXT: vpmovsxbw %xmm0, %ymm0 1399; AVX512BW-NEXT: vpsraw %xmm1, %ymm0, %ymm0 1400; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0 1401; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 1402; AVX512BW-NEXT: vzeroupper 1403; AVX512BW-NEXT: retq 1404; 1405; AVX512DQVL-LABEL: splatvar_shift_v8i8: 1406; AVX512DQVL: # %bb.0: 1407; AVX512DQVL-NEXT: vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero 1408; AVX512DQVL-NEXT: vpmovsxbd %xmm0, %zmm0 1409; AVX512DQVL-NEXT: vpsrad %xmm1, %zmm0, %zmm0 1410; AVX512DQVL-NEXT: vpmovdb %zmm0, %xmm0 1411; AVX512DQVL-NEXT: vzeroupper 1412; AVX512DQVL-NEXT: retq 1413; 1414; AVX512BWVL-LABEL: splatvar_shift_v8i8: 1415; AVX512BWVL: # %bb.0: 1416; AVX512BWVL-NEXT: vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero 1417; AVX512BWVL-NEXT: vpmovsxbw %xmm0, %ymm0 1418; AVX512BWVL-NEXT: vpsraw %xmm1, %ymm0, %ymm0 1419; AVX512BWVL-NEXT: vpmovwb %ymm0, %xmm0 1420; AVX512BWVL-NEXT: vzeroupper 1421; AVX512BWVL-NEXT: retq 1422; 1423; X86-SSE-LABEL: splatvar_shift_v8i8: 1424; X86-SSE: # %bb.0: 1425; X86-SSE-NEXT: pslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0] 1426; X86-SSE-NEXT: psrldq {{.*#+}} xmm1 = xmm1[15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero 1427; X86-SSE-NEXT: psrlw %xmm1, %xmm0 1428; X86-SSE-NEXT: pcmpeqd %xmm2, %xmm2 1429; X86-SSE-NEXT: psrlw %xmm1, %xmm2 1430; X86-SSE-NEXT: psrlw $8, %xmm2 1431; X86-SSE-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] 1432; X86-SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,0,0,0,4,5,6,7] 1433; X86-SSE-NEXT: 
pshufd {{.*#+}} xmm2 = xmm2[0,0,0,0] 1434; X86-SSE-NEXT: pand %xmm2, %xmm0 1435; X86-SSE-NEXT: movdqa {{.*#+}} xmm2 = [32896,32896,32896,32896,32896,32896,32896,32896] 1436; X86-SSE-NEXT: psrlw %xmm1, %xmm2 1437; X86-SSE-NEXT: pxor %xmm2, %xmm0 1438; X86-SSE-NEXT: psubb %xmm2, %xmm0 1439; X86-SSE-NEXT: retl 1440 %splat = shufflevector <8 x i8> %b, <8 x i8> undef, <8 x i32> zeroinitializer 1441 %shift = ashr <8 x i8> %a, %splat 1442 ret <8 x i8> %shift 1443} 1444 1445define <4 x i8> @splatvar_shift_v4i8(<4 x i8> %a, <4 x i8> %b) nounwind { 1446; SSE2-LABEL: splatvar_shift_v4i8: 1447; SSE2: # %bb.0: 1448; SSE2-NEXT: pslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0] 1449; SSE2-NEXT: psrldq {{.*#+}} xmm1 = xmm1[15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero 1450; SSE2-NEXT: psrlw %xmm1, %xmm0 1451; SSE2-NEXT: pcmpeqd %xmm2, %xmm2 1452; SSE2-NEXT: psrlw %xmm1, %xmm2 1453; SSE2-NEXT: psrlw $8, %xmm2 1454; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] 1455; SSE2-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,0,0,0,4,5,6,7] 1456; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,0,0,0] 1457; SSE2-NEXT: pand %xmm2, %xmm0 1458; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [32896,32896,32896,32896,32896,32896,32896,32896] 1459; SSE2-NEXT: psrlw %xmm1, %xmm2 1460; SSE2-NEXT: pxor %xmm2, %xmm0 1461; SSE2-NEXT: psubb %xmm2, %xmm0 1462; SSE2-NEXT: retq 1463; 1464; SSE41-LABEL: splatvar_shift_v4i8: 1465; SSE41: # %bb.0: 1466; SSE41-NEXT: pmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero 1467; SSE41-NEXT: psrlw %xmm1, %xmm0 1468; SSE41-NEXT: pcmpeqd %xmm2, %xmm2 1469; SSE41-NEXT: psrlw %xmm1, %xmm2 1470; SSE41-NEXT: pshufb {{.*#+}} xmm2 = xmm2[1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] 1471; SSE41-NEXT: pand %xmm2, %xmm0 1472; SSE41-NEXT: movdqa {{.*#+}} xmm2 = [32896,32896,32896,32896,32896,32896,32896,32896] 1473; SSE41-NEXT: psrlw %xmm1, %xmm2 
1474; SSE41-NEXT: pxor %xmm2, %xmm0 1475; SSE41-NEXT: psubb %xmm2, %xmm0 1476; SSE41-NEXT: retq 1477; 1478; AVX1-LABEL: splatvar_shift_v4i8: 1479; AVX1: # %bb.0: 1480; AVX1-NEXT: vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero 1481; AVX1-NEXT: vpsrlw %xmm1, %xmm0, %xmm0 1482; AVX1-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2 1483; AVX1-NEXT: vpsrlw %xmm1, %xmm2, %xmm2 1484; AVX1-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] 1485; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm0 1486; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [32896,32896,32896,32896,32896,32896,32896,32896] 1487; AVX1-NEXT: vpsrlw %xmm1, %xmm2, %xmm1 1488; AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm0 1489; AVX1-NEXT: vpsubb %xmm1, %xmm0, %xmm0 1490; AVX1-NEXT: retq 1491; 1492; AVX2-LABEL: splatvar_shift_v4i8: 1493; AVX2: # %bb.0: 1494; AVX2-NEXT: vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero 1495; AVX2-NEXT: vpsrlw %xmm1, %xmm0, %xmm0 1496; AVX2-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2 1497; AVX2-NEXT: vpsrlw %xmm1, %xmm2, %xmm2 1498; AVX2-NEXT: vpsrlw $8, %xmm2, %xmm2 1499; AVX2-NEXT: vpbroadcastb %xmm2, %xmm2 1500; AVX2-NEXT: vpand %xmm2, %xmm0, %xmm0 1501; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = [32896,32896,32896,32896,32896,32896,32896,32896] 1502; AVX2-NEXT: vpsrlw %xmm1, %xmm2, %xmm1 1503; AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0 1504; AVX2-NEXT: vpsubb %xmm1, %xmm0, %xmm0 1505; AVX2-NEXT: retq 1506; 1507; XOPAVX1-LABEL: splatvar_shift_v4i8: 1508; XOPAVX1: # %bb.0: 1509; XOPAVX1-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] 1510; XOPAVX1-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,0,0,0,4,5,6,7] 1511; XOPAVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 1512; XOPAVX1-NEXT: vpsubb %xmm1, %xmm2, %xmm1 1513; XOPAVX1-NEXT: vpshab %xmm1, %xmm0, %xmm0 1514; XOPAVX1-NEXT: retq 1515; 1516; XOPAVX2-LABEL: splatvar_shift_v4i8: 1517; XOPAVX2: # %bb.0: 1518; XOPAVX2-NEXT: vpbroadcastb %xmm1, %xmm1 
1519; XOPAVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2 1520; XOPAVX2-NEXT: vpsubb %xmm1, %xmm2, %xmm1 1521; XOPAVX2-NEXT: vpshab %xmm1, %xmm0, %xmm0 1522; XOPAVX2-NEXT: retq 1523; 1524; AVX512DQ-LABEL: splatvar_shift_v4i8: 1525; AVX512DQ: # %bb.0: 1526; AVX512DQ-NEXT: vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero 1527; AVX512DQ-NEXT: vpmovsxbd %xmm0, %zmm0 1528; AVX512DQ-NEXT: vpsrad %xmm1, %zmm0, %zmm0 1529; AVX512DQ-NEXT: vpmovdb %zmm0, %xmm0 1530; AVX512DQ-NEXT: vzeroupper 1531; AVX512DQ-NEXT: retq 1532; 1533; AVX512BW-LABEL: splatvar_shift_v4i8: 1534; AVX512BW: # %bb.0: 1535; AVX512BW-NEXT: vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero 1536; AVX512BW-NEXT: vpmovsxbw %xmm0, %ymm0 1537; AVX512BW-NEXT: vpsraw %xmm1, %ymm0, %ymm0 1538; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0 1539; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 1540; AVX512BW-NEXT: vzeroupper 1541; AVX512BW-NEXT: retq 1542; 1543; AVX512DQVL-LABEL: splatvar_shift_v4i8: 1544; AVX512DQVL: # %bb.0: 1545; AVX512DQVL-NEXT: vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero 1546; AVX512DQVL-NEXT: vpmovsxbd %xmm0, %zmm0 1547; AVX512DQVL-NEXT: vpsrad %xmm1, %zmm0, %zmm0 1548; AVX512DQVL-NEXT: vpmovdb %zmm0, %xmm0 1549; AVX512DQVL-NEXT: vzeroupper 1550; AVX512DQVL-NEXT: retq 1551; 1552; AVX512BWVL-LABEL: splatvar_shift_v4i8: 1553; AVX512BWVL: # %bb.0: 1554; AVX512BWVL-NEXT: vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero 1555; AVX512BWVL-NEXT: vpmovsxbw %xmm0, %ymm0 1556; AVX512BWVL-NEXT: vpsraw %xmm1, %ymm0, %ymm0 1557; AVX512BWVL-NEXT: vpmovwb %ymm0, %xmm0 1558; AVX512BWVL-NEXT: vzeroupper 1559; AVX512BWVL-NEXT: retq 1560; 1561; X86-SSE-LABEL: splatvar_shift_v4i8: 1562; X86-SSE: # %bb.0: 1563; X86-SSE-NEXT: pslldq {{.*#+}} xmm1 = 
zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0] 1564; X86-SSE-NEXT: psrldq {{.*#+}} xmm1 = xmm1[15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero 1565; X86-SSE-NEXT: psrlw %xmm1, %xmm0 1566; X86-SSE-NEXT: pcmpeqd %xmm2, %xmm2 1567; X86-SSE-NEXT: psrlw %xmm1, %xmm2 1568; X86-SSE-NEXT: psrlw $8, %xmm2 1569; X86-SSE-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] 1570; X86-SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,0,0,0,4,5,6,7] 1571; X86-SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,0,0,0] 1572; X86-SSE-NEXT: pand %xmm2, %xmm0 1573; X86-SSE-NEXT: movdqa {{.*#+}} xmm2 = [32896,32896,32896,32896,32896,32896,32896,32896] 1574; X86-SSE-NEXT: psrlw %xmm1, %xmm2 1575; X86-SSE-NEXT: pxor %xmm2, %xmm0 1576; X86-SSE-NEXT: psubb %xmm2, %xmm0 1577; X86-SSE-NEXT: retl 1578 %splat = shufflevector <4 x i8> %b, <4 x i8> undef, <4 x i32> zeroinitializer 1579 %shift = ashr <4 x i8> %a, %splat 1580 ret <4 x i8> %shift 1581} 1582 1583define <2 x i8> @splatvar_shift_v2i8(<2 x i8> %a, <2 x i8> %b) nounwind { 1584; SSE2-LABEL: splatvar_shift_v2i8: 1585; SSE2: # %bb.0: 1586; SSE2-NEXT: pslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0] 1587; SSE2-NEXT: psrldq {{.*#+}} xmm1 = xmm1[15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero 1588; SSE2-NEXT: psrlw %xmm1, %xmm0 1589; SSE2-NEXT: pcmpeqd %xmm2, %xmm2 1590; SSE2-NEXT: psrlw %xmm1, %xmm2 1591; SSE2-NEXT: psrlw $8, %xmm2 1592; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] 1593; SSE2-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,0,0,0,4,5,6,7] 1594; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,0,0,0] 1595; SSE2-NEXT: pand %xmm2, %xmm0 1596; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [32896,32896,32896,32896,32896,32896,32896,32896] 1597; SSE2-NEXT: psrlw %xmm1, %xmm2 1598; SSE2-NEXT: pxor %xmm2, %xmm0 1599; SSE2-NEXT: psubb %xmm2, %xmm0 1600; SSE2-NEXT: retq 1601; 1602; 
SSE41-LABEL: splatvar_shift_v2i8: 1603; SSE41: # %bb.0: 1604; SSE41-NEXT: pmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero 1605; SSE41-NEXT: psrlw %xmm1, %xmm0 1606; SSE41-NEXT: pcmpeqd %xmm2, %xmm2 1607; SSE41-NEXT: psrlw %xmm1, %xmm2 1608; SSE41-NEXT: pshufb {{.*#+}} xmm2 = xmm2[1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] 1609; SSE41-NEXT: pand %xmm2, %xmm0 1610; SSE41-NEXT: movdqa {{.*#+}} xmm2 = [32896,32896,32896,32896,32896,32896,32896,32896] 1611; SSE41-NEXT: psrlw %xmm1, %xmm2 1612; SSE41-NEXT: pxor %xmm2, %xmm0 1613; SSE41-NEXT: psubb %xmm2, %xmm0 1614; SSE41-NEXT: retq 1615; 1616; AVX1-LABEL: splatvar_shift_v2i8: 1617; AVX1: # %bb.0: 1618; AVX1-NEXT: vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero 1619; AVX1-NEXT: vpsrlw %xmm1, %xmm0, %xmm0 1620; AVX1-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2 1621; AVX1-NEXT: vpsrlw %xmm1, %xmm2, %xmm2 1622; AVX1-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] 1623; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm0 1624; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [32896,32896,32896,32896,32896,32896,32896,32896] 1625; AVX1-NEXT: vpsrlw %xmm1, %xmm2, %xmm1 1626; AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm0 1627; AVX1-NEXT: vpsubb %xmm1, %xmm0, %xmm0 1628; AVX1-NEXT: retq 1629; 1630; AVX2-LABEL: splatvar_shift_v2i8: 1631; AVX2: # %bb.0: 1632; AVX2-NEXT: vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero 1633; AVX2-NEXT: vpsrlw %xmm1, %xmm0, %xmm0 1634; AVX2-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2 1635; AVX2-NEXT: vpsrlw %xmm1, %xmm2, %xmm2 1636; AVX2-NEXT: vpsrlw $8, %xmm2, %xmm2 1637; AVX2-NEXT: vpbroadcastb %xmm2, %xmm2 1638; AVX2-NEXT: vpand %xmm2, %xmm0, %xmm0 1639; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = [32896,32896,32896,32896,32896,32896,32896,32896] 1640; AVX2-NEXT: vpsrlw %xmm1, %xmm2, %xmm1 1641; AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0 1642; AVX2-NEXT: vpsubb %xmm1, 
%xmm0, %xmm0 1643; AVX2-NEXT: retq 1644; 1645; XOP-LABEL: splatvar_shift_v2i8: 1646; XOP: # %bb.0: 1647; XOP-NEXT: insertq {{.*#+}} xmm1 = xmm1[0,0,2,3,4,5,6,7,u,u,u,u,u,u,u,u] 1648; XOP-NEXT: vpxor %xmm2, %xmm2, %xmm2 1649; XOP-NEXT: vpsubb %xmm1, %xmm2, %xmm1 1650; XOP-NEXT: vpshab %xmm1, %xmm0, %xmm0 1651; XOP-NEXT: retq 1652; 1653; AVX512DQ-LABEL: splatvar_shift_v2i8: 1654; AVX512DQ: # %bb.0: 1655; AVX512DQ-NEXT: vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero 1656; AVX512DQ-NEXT: vpmovsxbd %xmm0, %zmm0 1657; AVX512DQ-NEXT: vpsrad %xmm1, %zmm0, %zmm0 1658; AVX512DQ-NEXT: vpmovdb %zmm0, %xmm0 1659; AVX512DQ-NEXT: vzeroupper 1660; AVX512DQ-NEXT: retq 1661; 1662; AVX512BW-LABEL: splatvar_shift_v2i8: 1663; AVX512BW: # %bb.0: 1664; AVX512BW-NEXT: vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero 1665; AVX512BW-NEXT: vpmovsxbw %xmm0, %ymm0 1666; AVX512BW-NEXT: vpsraw %xmm1, %ymm0, %ymm0 1667; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0 1668; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 1669; AVX512BW-NEXT: vzeroupper 1670; AVX512BW-NEXT: retq 1671; 1672; AVX512DQVL-LABEL: splatvar_shift_v2i8: 1673; AVX512DQVL: # %bb.0: 1674; AVX512DQVL-NEXT: vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero 1675; AVX512DQVL-NEXT: vpmovsxbd %xmm0, %zmm0 1676; AVX512DQVL-NEXT: vpsrad %xmm1, %zmm0, %zmm0 1677; AVX512DQVL-NEXT: vpmovdb %zmm0, %xmm0 1678; AVX512DQVL-NEXT: vzeroupper 1679; AVX512DQVL-NEXT: retq 1680; 1681; AVX512BWVL-LABEL: splatvar_shift_v2i8: 1682; AVX512BWVL: # %bb.0: 1683; AVX512BWVL-NEXT: vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero 1684; AVX512BWVL-NEXT: vpmovsxbw %xmm0, %ymm0 1685; AVX512BWVL-NEXT: vpsraw %xmm1, %ymm0, %ymm0 1686; AVX512BWVL-NEXT: vpmovwb %ymm0, %xmm0 1687; AVX512BWVL-NEXT: vzeroupper 1688; 
AVX512BWVL-NEXT: retq 1689; 1690; X86-SSE-LABEL: splatvar_shift_v2i8: 1691; X86-SSE: # %bb.0: 1692; X86-SSE-NEXT: pslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0] 1693; X86-SSE-NEXT: psrldq {{.*#+}} xmm1 = xmm1[15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero 1694; X86-SSE-NEXT: psrlw %xmm1, %xmm0 1695; X86-SSE-NEXT: pcmpeqd %xmm2, %xmm2 1696; X86-SSE-NEXT: psrlw %xmm1, %xmm2 1697; X86-SSE-NEXT: psrlw $8, %xmm2 1698; X86-SSE-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] 1699; X86-SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,0,0,0,4,5,6,7] 1700; X86-SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,0,0,0] 1701; X86-SSE-NEXT: pand %xmm2, %xmm0 1702; X86-SSE-NEXT: movdqa {{.*#+}} xmm2 = [32896,32896,32896,32896,32896,32896,32896,32896] 1703; X86-SSE-NEXT: psrlw %xmm1, %xmm2 1704; X86-SSE-NEXT: pxor %xmm2, %xmm0 1705; X86-SSE-NEXT: psubb %xmm2, %xmm0 1706; X86-SSE-NEXT: retl 1707 %splat = shufflevector <2 x i8> %b, <2 x i8> undef, <2 x i32> zeroinitializer 1708 %shift = ashr <2 x i8> %a, %splat 1709 ret <2 x i8> %shift 1710} 1711 1712; 1713; Constant Shifts 1714; 1715 1716define <2 x i32> @constant_shift_v2i32(<2 x i32> %a) nounwind { 1717; SSE2-LABEL: constant_shift_v2i32: 1718; SSE2: # %bb.0: 1719; SSE2-NEXT: movdqa %xmm0, %xmm1 1720; SSE2-NEXT: psrad $4, %xmm1 1721; SSE2-NEXT: psrad $5, %xmm0 1722; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,1,1] 1723; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] 1724; SSE2-NEXT: movdqa %xmm1, %xmm0 1725; SSE2-NEXT: retq 1726; 1727; SSE41-LABEL: constant_shift_v2i32: 1728; SSE41: # %bb.0: 1729; SSE41-NEXT: movdqa %xmm0, %xmm1 1730; SSE41-NEXT: psrad $5, %xmm1 1731; SSE41-NEXT: psrad $4, %xmm0 1732; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5,6,7] 1733; SSE41-NEXT: retq 1734; 1735; AVX1-LABEL: constant_shift_v2i32: 1736; AVX1: # %bb.0: 1737; AVX1-NEXT: vpsrad $5, %xmm0, %xmm1 1738; AVX1-NEXT: 
vpsrad $4, %xmm0, %xmm0 1739; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5,6,7] 1740; AVX1-NEXT: retq 1741; 1742; AVX2-LABEL: constant_shift_v2i32: 1743; AVX2: # %bb.0: 1744; AVX2-NEXT: vpsravd {{.*}}(%rip), %xmm0, %xmm0 1745; AVX2-NEXT: retq 1746; 1747; XOPAVX1-LABEL: constant_shift_v2i32: 1748; XOPAVX1: # %bb.0: 1749; XOPAVX1-NEXT: vpshad {{.*}}(%rip), %xmm0, %xmm0 1750; XOPAVX1-NEXT: retq 1751; 1752; XOPAVX2-LABEL: constant_shift_v2i32: 1753; XOPAVX2: # %bb.0: 1754; XOPAVX2-NEXT: vpsravd {{.*}}(%rip), %xmm0, %xmm0 1755; XOPAVX2-NEXT: retq 1756; 1757; AVX512-LABEL: constant_shift_v2i32: 1758; AVX512: # %bb.0: 1759; AVX512-NEXT: vpsravd {{.*}}(%rip), %xmm0, %xmm0 1760; AVX512-NEXT: retq 1761; 1762; AVX512VL-LABEL: constant_shift_v2i32: 1763; AVX512VL: # %bb.0: 1764; AVX512VL-NEXT: vpsravd {{.*}}(%rip), %xmm0, %xmm0 1765; AVX512VL-NEXT: retq 1766; 1767; X86-SSE-LABEL: constant_shift_v2i32: 1768; X86-SSE: # %bb.0: 1769; X86-SSE-NEXT: movdqa %xmm0, %xmm1 1770; X86-SSE-NEXT: psrad $4, %xmm1 1771; X86-SSE-NEXT: psrad $5, %xmm0 1772; X86-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,1,1] 1773; X86-SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] 1774; X86-SSE-NEXT: movdqa %xmm1, %xmm0 1775; X86-SSE-NEXT: retl 1776 %shift = ashr <2 x i32> %a, <i32 4, i32 5> 1777 ret <2 x i32> %shift 1778} 1779 1780define <4 x i16> @constant_shift_v4i16(<4 x i16> %a) nounwind { 1781; SSE2-LABEL: constant_shift_v4i16: 1782; SSE2: # %bb.0: 1783; SSE2-NEXT: movdqa %xmm0, %xmm1 1784; SSE2-NEXT: psraw $2, %xmm1 1785; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,0],xmm0[0,0] 1786; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm0[2,3] 1787; SSE2-NEXT: movaps {{.*#+}} xmm2 = [65535,0,65535,0,65535,65535,65535,65535] 1788; SSE2-NEXT: movaps %xmm1, %xmm0 1789; SSE2-NEXT: andps %xmm2, %xmm0 1790; SSE2-NEXT: psraw $1, %xmm1 1791; SSE2-NEXT: andnps %xmm1, %xmm2 1792; SSE2-NEXT: orps %xmm2, %xmm0 1793; SSE2-NEXT: retq 1794; 1795; SSE41-LABEL: constant_shift_v4i16: 1796; 
SSE41: # %bb.0: 1797; SSE41-NEXT: movdqa {{.*#+}} xmm1 = <u,32768,16384,8192,u,u,u,u> 1798; SSE41-NEXT: pmulhw %xmm0, %xmm1 1799; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3,4,5,6,7] 1800; SSE41-NEXT: psraw $1, %xmm0 1801; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3,4,5,6,7] 1802; SSE41-NEXT: retq 1803; 1804; AVX-LABEL: constant_shift_v4i16: 1805; AVX: # %bb.0: 1806; AVX-NEXT: vpmulhw {{.*}}(%rip), %xmm0, %xmm1 1807; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3,4,5,6,7] 1808; AVX-NEXT: vpsraw $1, %xmm0, %xmm0 1809; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3,4,5,6,7] 1810; AVX-NEXT: retq 1811; 1812; XOP-LABEL: constant_shift_v4i16: 1813; XOP: # %bb.0: 1814; XOP-NEXT: vpshaw {{.*}}(%rip), %xmm0, %xmm0 1815; XOP-NEXT: retq 1816; 1817; AVX512DQ-LABEL: constant_shift_v4i16: 1818; AVX512DQ: # %bb.0: 1819; AVX512DQ-NEXT: vpmovsxwd %xmm0, %ymm0 1820; AVX512DQ-NEXT: vpsravd {{.*}}(%rip), %ymm0, %ymm0 1821; AVX512DQ-NEXT: vpmovdw %zmm0, %ymm0 1822; AVX512DQ-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 1823; AVX512DQ-NEXT: vzeroupper 1824; AVX512DQ-NEXT: retq 1825; 1826; AVX512BW-LABEL: constant_shift_v4i16: 1827; AVX512BW: # %bb.0: 1828; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 1829; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm1 = <0,1,2,3,u,u,u,u> 1830; AVX512BW-NEXT: vpsravw %zmm1, %zmm0, %zmm0 1831; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 1832; AVX512BW-NEXT: vzeroupper 1833; AVX512BW-NEXT: retq 1834; 1835; AVX512DQVL-LABEL: constant_shift_v4i16: 1836; AVX512DQVL: # %bb.0: 1837; AVX512DQVL-NEXT: vpmovsxwd %xmm0, %ymm0 1838; AVX512DQVL-NEXT: vpsravd {{.*}}(%rip), %ymm0, %ymm0 1839; AVX512DQVL-NEXT: vpmovdw %ymm0, %xmm0 1840; AVX512DQVL-NEXT: vzeroupper 1841; AVX512DQVL-NEXT: retq 1842; 1843; AVX512BWVL-LABEL: constant_shift_v4i16: 1844; AVX512BWVL: # %bb.0: 1845; AVX512BWVL-NEXT: vpsravw {{.*}}(%rip), %xmm0, %xmm0 1846; AVX512BWVL-NEXT: retq 1847; 1848; X86-SSE-LABEL: constant_shift_v4i16: 
1849; X86-SSE: # %bb.0: 1850; X86-SSE-NEXT: movdqa %xmm0, %xmm1 1851; X86-SSE-NEXT: psraw $2, %xmm1 1852; X86-SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,0],xmm0[0,0] 1853; X86-SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm0[2,3] 1854; X86-SSE-NEXT: movaps {{.*#+}} xmm2 = [65535,0,65535,0,65535,65535,65535,65535] 1855; X86-SSE-NEXT: movaps %xmm1, %xmm0 1856; X86-SSE-NEXT: andps %xmm2, %xmm0 1857; X86-SSE-NEXT: psraw $1, %xmm1 1858; X86-SSE-NEXT: andnps %xmm1, %xmm2 1859; X86-SSE-NEXT: orps %xmm2, %xmm0 1860; X86-SSE-NEXT: retl 1861 %shift = ashr <4 x i16> %a, <i16 0, i16 1, i16 2, i16 3> 1862 ret <4 x i16> %shift 1863} 1864 1865define <2 x i16> @constant_shift_v2i16(<2 x i16> %a) nounwind { 1866; SSE2-LABEL: constant_shift_v2i16: 1867; SSE2: # %bb.0: 1868; SSE2-NEXT: movdqa %xmm0, %xmm1 1869; SSE2-NEXT: psraw $3, %xmm1 1870; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [65535,0,65535,65535,65535,65535,65535,65535] 1871; SSE2-NEXT: psraw $2, %xmm0 1872; SSE2-NEXT: pand %xmm2, %xmm0 1873; SSE2-NEXT: pandn %xmm1, %xmm2 1874; SSE2-NEXT: por %xmm2, %xmm0 1875; SSE2-NEXT: retq 1876; 1877; SSE41-LABEL: constant_shift_v2i16: 1878; SSE41: # %bb.0: 1879; SSE41-NEXT: movdqa %xmm0, %xmm1 1880; SSE41-NEXT: psraw $3, %xmm1 1881; SSE41-NEXT: psraw $2, %xmm0 1882; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3,4,5,6,7] 1883; SSE41-NEXT: retq 1884; 1885; AVX-LABEL: constant_shift_v2i16: 1886; AVX: # %bb.0: 1887; AVX-NEXT: vpsraw $3, %xmm0, %xmm1 1888; AVX-NEXT: vpsraw $2, %xmm0, %xmm0 1889; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3,4,5,6,7] 1890; AVX-NEXT: retq 1891; 1892; XOP-LABEL: constant_shift_v2i16: 1893; XOP: # %bb.0: 1894; XOP-NEXT: vpshaw {{.*}}(%rip), %xmm0, %xmm0 1895; XOP-NEXT: retq 1896; 1897; AVX512DQ-LABEL: constant_shift_v2i16: 1898; AVX512DQ: # %bb.0: 1899; AVX512DQ-NEXT: vpsraw $3, %xmm0, %xmm1 1900; AVX512DQ-NEXT: vpsraw $2, %xmm0, %xmm0 1901; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3,4,5,6,7] 1902; AVX512DQ-NEXT: retq 
1903; 1904; AVX512BW-LABEL: constant_shift_v2i16: 1905; AVX512BW: # %bb.0: 1906; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 1907; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm1 = <2,3,u,u,u,u,u,u> 1908; AVX512BW-NEXT: vpsravw %zmm1, %zmm0, %zmm0 1909; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 1910; AVX512BW-NEXT: vzeroupper 1911; AVX512BW-NEXT: retq 1912; 1913; AVX512DQVL-LABEL: constant_shift_v2i16: 1914; AVX512DQVL: # %bb.0: 1915; AVX512DQVL-NEXT: vpsraw $3, %xmm0, %xmm1 1916; AVX512DQVL-NEXT: vpsraw $2, %xmm0, %xmm0 1917; AVX512DQVL-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3,4,5,6,7] 1918; AVX512DQVL-NEXT: retq 1919; 1920; AVX512BWVL-LABEL: constant_shift_v2i16: 1921; AVX512BWVL: # %bb.0: 1922; AVX512BWVL-NEXT: vpsravw {{.*}}(%rip), %xmm0, %xmm0 1923; AVX512BWVL-NEXT: retq 1924; 1925; X86-SSE-LABEL: constant_shift_v2i16: 1926; X86-SSE: # %bb.0: 1927; X86-SSE-NEXT: movdqa %xmm0, %xmm1 1928; X86-SSE-NEXT: psraw $3, %xmm1 1929; X86-SSE-NEXT: movdqa {{.*#+}} xmm2 = [65535,0,65535,65535,65535,65535,65535,65535] 1930; X86-SSE-NEXT: psraw $2, %xmm0 1931; X86-SSE-NEXT: pand %xmm2, %xmm0 1932; X86-SSE-NEXT: pandn %xmm1, %xmm2 1933; X86-SSE-NEXT: por %xmm2, %xmm0 1934; X86-SSE-NEXT: retl 1935 %shift = ashr <2 x i16> %a, <i16 2, i16 3> 1936 ret <2 x i16> %shift 1937} 1938 1939define <8 x i8> @constant_shift_v8i8(<8 x i8> %a) nounwind { 1940; SSE-LABEL: constant_shift_v8i8: 1941; SSE: # %bb.0: 1942; SSE-NEXT: pxor %xmm1, %xmm1 1943; SSE-NEXT: movdqa %xmm0, %xmm2 1944; SSE-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15] 1945; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] 1946; SSE-NEXT: psraw $8, %xmm0 1947; SSE-NEXT: pmullw {{.*}}(%rip), %xmm0 1948; SSE-NEXT: psrlw $8, %xmm0 1949; SSE-NEXT: packuswb %xmm2, %xmm0 1950; SSE-NEXT: retq 1951; 1952; AVX1-LABEL: constant_shift_v8i8: 1953; AVX1: # 
%bb.0: 1954; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 1955; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15] 1956; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] 1957; AVX1-NEXT: vpsraw $8, %xmm0, %xmm0 1958; AVX1-NEXT: vpmullw {{.*}}(%rip), %xmm0, %xmm0 1959; AVX1-NEXT: vpsrlw $8, %xmm0, %xmm0 1960; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 1961; AVX1-NEXT: retq 1962; 1963; AVX2-LABEL: constant_shift_v8i8: 1964; AVX2: # %bb.0: 1965; AVX2-NEXT: vpmovsxbw %xmm0, %ymm0 1966; AVX2-NEXT: vpmullw {{.*}}(%rip), %ymm0, %ymm0 1967; AVX2-NEXT: vpsrlw $8, %ymm0, %ymm0 1968; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 1969; AVX2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 1970; AVX2-NEXT: vzeroupper 1971; AVX2-NEXT: retq 1972; 1973; XOP-LABEL: constant_shift_v8i8: 1974; XOP: # %bb.0: 1975; XOP-NEXT: vpshab {{.*}}(%rip), %xmm0, %xmm0 1976; XOP-NEXT: retq 1977; 1978; AVX512DQ-LABEL: constant_shift_v8i8: 1979; AVX512DQ: # %bb.0: 1980; AVX512DQ-NEXT: vpmovsxbd %xmm0, %zmm0 1981; AVX512DQ-NEXT: vpsravd {{.*}}(%rip), %zmm0, %zmm0 1982; AVX512DQ-NEXT: vpmovdb %zmm0, %xmm0 1983; AVX512DQ-NEXT: vzeroupper 1984; AVX512DQ-NEXT: retq 1985; 1986; AVX512BW-LABEL: constant_shift_v8i8: 1987; AVX512BW: # %bb.0: 1988; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm1 = [0,1,2,3,4,5,6,7,0,0,0,0,0,0,0,0] 1989; AVX512BW-NEXT: vpmovsxbw %xmm0, %ymm0 1990; AVX512BW-NEXT: vpsravw %zmm1, %zmm0, %zmm0 1991; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0 1992; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 1993; AVX512BW-NEXT: vzeroupper 1994; AVX512BW-NEXT: retq 1995; 1996; AVX512DQVL-LABEL: constant_shift_v8i8: 1997; AVX512DQVL: # %bb.0: 1998; AVX512DQVL-NEXT: vpmovsxbd %xmm0, %zmm0 1999; AVX512DQVL-NEXT: vpsravd {{.*}}(%rip), %zmm0, %zmm0 2000; AVX512DQVL-NEXT: vpmovdb %zmm0, %xmm0 2001; AVX512DQVL-NEXT: vzeroupper 2002; AVX512DQVL-NEXT: retq 2003; 2004; AVX512BWVL-LABEL: 
constant_shift_v8i8: 2005; AVX512BWVL: # %bb.0: 2006; AVX512BWVL-NEXT: vpmovsxbw %xmm0, %ymm0 2007; AVX512BWVL-NEXT: vpsravw {{.*}}(%rip), %ymm0, %ymm0 2008; AVX512BWVL-NEXT: vpmovwb %ymm0, %xmm0 2009; AVX512BWVL-NEXT: vzeroupper 2010; AVX512BWVL-NEXT: retq 2011; 2012; X86-SSE-LABEL: constant_shift_v8i8: 2013; X86-SSE: # %bb.0: 2014; X86-SSE-NEXT: pxor %xmm1, %xmm1 2015; X86-SSE-NEXT: movdqa %xmm0, %xmm2 2016; X86-SSE-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15] 2017; X86-SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] 2018; X86-SSE-NEXT: psraw $8, %xmm0 2019; X86-SSE-NEXT: pmullw {{\.LCPI.*}}, %xmm0 2020; X86-SSE-NEXT: psrlw $8, %xmm0 2021; X86-SSE-NEXT: packuswb %xmm2, %xmm0 2022; X86-SSE-NEXT: retl 2023 %shift = ashr <8 x i8> %a, <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7> 2024 ret <8 x i8> %shift 2025} 2026 2027define <4 x i8> @constant_shift_v4i8(<4 x i8> %a) nounwind { 2028; SSE-LABEL: constant_shift_v4i8: 2029; SSE: # %bb.0: 2030; SSE-NEXT: pxor %xmm1, %xmm1 2031; SSE-NEXT: movdqa %xmm0, %xmm2 2032; SSE-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15] 2033; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] 2034; SSE-NEXT: psraw $8, %xmm0 2035; SSE-NEXT: pmullw {{.*}}(%rip), %xmm0 2036; SSE-NEXT: psrlw $8, %xmm0 2037; SSE-NEXT: packuswb %xmm2, %xmm0 2038; SSE-NEXT: retq 2039; 2040; AVX1-LABEL: constant_shift_v4i8: 2041; AVX1: # %bb.0: 2042; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 2043; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15] 2044; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] 2045; AVX1-NEXT: 
vpsraw $8, %xmm0, %xmm0 2046; AVX1-NEXT: vpmullw {{.*}}(%rip), %xmm0, %xmm0 2047; AVX1-NEXT: vpsrlw $8, %xmm0, %xmm0 2048; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 2049; AVX1-NEXT: retq 2050; 2051; AVX2-LABEL: constant_shift_v4i8: 2052; AVX2: # %bb.0: 2053; AVX2-NEXT: vpmovsxbw %xmm0, %ymm0 2054; AVX2-NEXT: vpmullw {{.*}}(%rip), %ymm0, %ymm0 2055; AVX2-NEXT: vpsrlw $8, %ymm0, %ymm0 2056; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 2057; AVX2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 2058; AVX2-NEXT: vzeroupper 2059; AVX2-NEXT: retq 2060; 2061; XOP-LABEL: constant_shift_v4i8: 2062; XOP: # %bb.0: 2063; XOP-NEXT: vpshab {{.*}}(%rip), %xmm0, %xmm0 2064; XOP-NEXT: retq 2065; 2066; AVX512DQ-LABEL: constant_shift_v4i8: 2067; AVX512DQ: # %bb.0: 2068; AVX512DQ-NEXT: vpmovsxbd %xmm0, %zmm0 2069; AVX512DQ-NEXT: vpsravd {{.*}}(%rip), %zmm0, %zmm0 2070; AVX512DQ-NEXT: vpmovdb %zmm0, %xmm0 2071; AVX512DQ-NEXT: vzeroupper 2072; AVX512DQ-NEXT: retq 2073; 2074; AVX512BW-LABEL: constant_shift_v4i8: 2075; AVX512BW: # %bb.0: 2076; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm1 = [0,1,2,3,0,0,0,0,0,0,0,0,0,0,0,0] 2077; AVX512BW-NEXT: vpmovsxbw %xmm0, %ymm0 2078; AVX512BW-NEXT: vpsravw %zmm1, %zmm0, %zmm0 2079; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0 2080; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 2081; AVX512BW-NEXT: vzeroupper 2082; AVX512BW-NEXT: retq 2083; 2084; AVX512DQVL-LABEL: constant_shift_v4i8: 2085; AVX512DQVL: # %bb.0: 2086; AVX512DQVL-NEXT: vpmovsxbd %xmm0, %zmm0 2087; AVX512DQVL-NEXT: vpsravd {{.*}}(%rip), %zmm0, %zmm0 2088; AVX512DQVL-NEXT: vpmovdb %zmm0, %xmm0 2089; AVX512DQVL-NEXT: vzeroupper 2090; AVX512DQVL-NEXT: retq 2091; 2092; AVX512BWVL-LABEL: constant_shift_v4i8: 2093; AVX512BWVL: # %bb.0: 2094; AVX512BWVL-NEXT: vpmovsxbw %xmm0, %ymm0 2095; AVX512BWVL-NEXT: vpsravw {{.*}}(%rip), %ymm0, %ymm0 2096; AVX512BWVL-NEXT: vpmovwb %ymm0, %xmm0 2097; AVX512BWVL-NEXT: vzeroupper 2098; AVX512BWVL-NEXT: retq 2099; 2100; X86-SSE-LABEL: constant_shift_v4i8: 2101; X86-SSE: # %bb.0: 
2102; X86-SSE-NEXT: pxor %xmm1, %xmm1 2103; X86-SSE-NEXT: movdqa %xmm0, %xmm2 2104; X86-SSE-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15] 2105; X86-SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] 2106; X86-SSE-NEXT: psraw $8, %xmm0 2107; X86-SSE-NEXT: pmullw {{\.LCPI.*}}, %xmm0 2108; X86-SSE-NEXT: psrlw $8, %xmm0 2109; X86-SSE-NEXT: packuswb %xmm2, %xmm0 2110; X86-SSE-NEXT: retl 2111 %shift = ashr <4 x i8> %a, <i8 0, i8 1, i8 2, i8 3> 2112 ret <4 x i8> %shift 2113} 2114 2115define <2 x i8> @constant_shift_v2i8(<2 x i8> %a) nounwind { 2116; SSE-LABEL: constant_shift_v2i8: 2117; SSE: # %bb.0: 2118; SSE-NEXT: pxor %xmm1, %xmm1 2119; SSE-NEXT: movdqa %xmm0, %xmm2 2120; SSE-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15] 2121; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] 2122; SSE-NEXT: psraw $8, %xmm0 2123; SSE-NEXT: pmullw {{.*}}(%rip), %xmm0 2124; SSE-NEXT: psrlw $8, %xmm0 2125; SSE-NEXT: packuswb %xmm2, %xmm0 2126; SSE-NEXT: retq 2127; 2128; AVX1-LABEL: constant_shift_v2i8: 2129; AVX1: # %bb.0: 2130; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 2131; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15] 2132; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] 2133; AVX1-NEXT: vpsraw $8, %xmm0, %xmm0 2134; AVX1-NEXT: vpmullw {{.*}}(%rip), %xmm0, %xmm0 2135; AVX1-NEXT: vpsrlw $8, %xmm0, %xmm0 2136; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 2137; AVX1-NEXT: retq 2138; 2139; AVX2-LABEL: constant_shift_v2i8: 2140; AVX2: # %bb.0: 2141; AVX2-NEXT: vpmovsxbw %xmm0, %ymm0 2142; AVX2-NEXT: vpmullw {{.*}}(%rip), %ymm0, %ymm0 2143; 
AVX2-NEXT: vpsrlw $8, %ymm0, %ymm0 2144; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 2145; AVX2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 2146; AVX2-NEXT: vzeroupper 2147; AVX2-NEXT: retq 2148; 2149; XOP-LABEL: constant_shift_v2i8: 2150; XOP: # %bb.0: 2151; XOP-NEXT: vpshab {{.*}}(%rip), %xmm0, %xmm0 2152; XOP-NEXT: retq 2153; 2154; AVX512DQ-LABEL: constant_shift_v2i8: 2155; AVX512DQ: # %bb.0: 2156; AVX512DQ-NEXT: vpmovsxbd %xmm0, %zmm0 2157; AVX512DQ-NEXT: vpsravd {{.*}}(%rip), %zmm0, %zmm0 2158; AVX512DQ-NEXT: vpmovdb %zmm0, %xmm0 2159; AVX512DQ-NEXT: vzeroupper 2160; AVX512DQ-NEXT: retq 2161; 2162; AVX512BW-LABEL: constant_shift_v2i8: 2163; AVX512BW: # %bb.0: 2164; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm1 = [2,3,0,0,0,0,0,0,0,0,0,0,0,0,0,0] 2165; AVX512BW-NEXT: vpmovsxbw %xmm0, %ymm0 2166; AVX512BW-NEXT: vpsravw %zmm1, %zmm0, %zmm0 2167; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0 2168; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 2169; AVX512BW-NEXT: vzeroupper 2170; AVX512BW-NEXT: retq 2171; 2172; AVX512DQVL-LABEL: constant_shift_v2i8: 2173; AVX512DQVL: # %bb.0: 2174; AVX512DQVL-NEXT: vpmovsxbd %xmm0, %zmm0 2175; AVX512DQVL-NEXT: vpsravd {{.*}}(%rip), %zmm0, %zmm0 2176; AVX512DQVL-NEXT: vpmovdb %zmm0, %xmm0 2177; AVX512DQVL-NEXT: vzeroupper 2178; AVX512DQVL-NEXT: retq 2179; 2180; AVX512BWVL-LABEL: constant_shift_v2i8: 2181; AVX512BWVL: # %bb.0: 2182; AVX512BWVL-NEXT: vpmovsxbw %xmm0, %ymm0 2183; AVX512BWVL-NEXT: vpsravw {{.*}}(%rip), %ymm0, %ymm0 2184; AVX512BWVL-NEXT: vpmovwb %ymm0, %xmm0 2185; AVX512BWVL-NEXT: vzeroupper 2186; AVX512BWVL-NEXT: retq 2187; 2188; X86-SSE-LABEL: constant_shift_v2i8: 2189; X86-SSE: # %bb.0: 2190; X86-SSE-NEXT: pxor %xmm1, %xmm1 2191; X86-SSE-NEXT: movdqa %xmm0, %xmm2 2192; X86-SSE-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15] 2193; X86-SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] 
2194; X86-SSE-NEXT: psraw $8, %xmm0 2195; X86-SSE-NEXT: pmullw {{\.LCPI.*}}, %xmm0 2196; X86-SSE-NEXT: psrlw $8, %xmm0 2197; X86-SSE-NEXT: packuswb %xmm2, %xmm0 2198; X86-SSE-NEXT: retl 2199 %shift = ashr <2 x i8> %a, <i8 2, i8 3> 2200 ret <2 x i8> %shift 2201} 2202 2203; 2204; Uniform Constant Shifts 2205; 2206 2207define <2 x i32> @splatconstant_shift_v2i32(<2 x i32> %a) nounwind { 2208; SSE-LABEL: splatconstant_shift_v2i32: 2209; SSE: # %bb.0: 2210; SSE-NEXT: psrad $5, %xmm0 2211; SSE-NEXT: retq 2212; 2213; AVX-LABEL: splatconstant_shift_v2i32: 2214; AVX: # %bb.0: 2215; AVX-NEXT: vpsrad $5, %xmm0, %xmm0 2216; AVX-NEXT: retq 2217; 2218; XOP-LABEL: splatconstant_shift_v2i32: 2219; XOP: # %bb.0: 2220; XOP-NEXT: vpsrad $5, %xmm0, %xmm0 2221; XOP-NEXT: retq 2222; 2223; AVX512-LABEL: splatconstant_shift_v2i32: 2224; AVX512: # %bb.0: 2225; AVX512-NEXT: vpsrad $5, %xmm0, %xmm0 2226; AVX512-NEXT: retq 2227; 2228; AVX512VL-LABEL: splatconstant_shift_v2i32: 2229; AVX512VL: # %bb.0: 2230; AVX512VL-NEXT: vpsrad $5, %xmm0, %xmm0 2231; AVX512VL-NEXT: retq 2232; 2233; X86-SSE-LABEL: splatconstant_shift_v2i32: 2234; X86-SSE: # %bb.0: 2235; X86-SSE-NEXT: psrad $5, %xmm0 2236; X86-SSE-NEXT: retl 2237 %shift = ashr <2 x i32> %a, <i32 5, i32 5> 2238 ret <2 x i32> %shift 2239} 2240 2241define <4 x i16> @splatconstant_shift_v4i16(<4 x i16> %a) nounwind { 2242; SSE-LABEL: splatconstant_shift_v4i16: 2243; SSE: # %bb.0: 2244; SSE-NEXT: psraw $3, %xmm0 2245; SSE-NEXT: retq 2246; 2247; AVX-LABEL: splatconstant_shift_v4i16: 2248; AVX: # %bb.0: 2249; AVX-NEXT: vpsraw $3, %xmm0, %xmm0 2250; AVX-NEXT: retq 2251; 2252; XOP-LABEL: splatconstant_shift_v4i16: 2253; XOP: # %bb.0: 2254; XOP-NEXT: vpsraw $3, %xmm0, %xmm0 2255; XOP-NEXT: retq 2256; 2257; AVX512-LABEL: splatconstant_shift_v4i16: 2258; AVX512: # %bb.0: 2259; AVX512-NEXT: vpsraw $3, %xmm0, %xmm0 2260; AVX512-NEXT: retq 2261; 2262; AVX512VL-LABEL: splatconstant_shift_v4i16: 2263; AVX512VL: # %bb.0: 2264; AVX512VL-NEXT: vpsraw $3, %xmm0, 
%xmm0 2265; AVX512VL-NEXT: retq 2266; 2267; X86-SSE-LABEL: splatconstant_shift_v4i16: 2268; X86-SSE: # %bb.0: 2269; X86-SSE-NEXT: psraw $3, %xmm0 2270; X86-SSE-NEXT: retl 2271 %shift = ashr <4 x i16> %a, <i16 3, i16 3, i16 3, i16 3> 2272 ret <4 x i16> %shift 2273} 2274 ; splatconstant_shift_v2i16 - ashr of both <2 x i16> lanes by the constant 3. Every run configuration checked here expects a single psraw/vpsraw $3 on xmm0 followed by the return. The assertions are autogenerated (see the note on the first line of this file); regenerate them with utils/update_llc_test_checks.py rather than editing by hand. 2275define <2 x i16> @splatconstant_shift_v2i16(<2 x i16> %a) nounwind { 2276; SSE-LABEL: splatconstant_shift_v2i16: 2277; SSE: # %bb.0: 2278; SSE-NEXT: psraw $3, %xmm0 2279; SSE-NEXT: retq 2280; 2281; AVX-LABEL: splatconstant_shift_v2i16: 2282; AVX: # %bb.0: 2283; AVX-NEXT: vpsraw $3, %xmm0, %xmm0 2284; AVX-NEXT: retq 2285; 2286; XOP-LABEL: splatconstant_shift_v2i16: 2287; XOP: # %bb.0: 2288; XOP-NEXT: vpsraw $3, %xmm0, %xmm0 2289; XOP-NEXT: retq 2290; 2291; AVX512-LABEL: splatconstant_shift_v2i16: 2292; AVX512: # %bb.0: 2293; AVX512-NEXT: vpsraw $3, %xmm0, %xmm0 2294; AVX512-NEXT: retq 2295; 2296; AVX512VL-LABEL: splatconstant_shift_v2i16: 2297; AVX512VL: # %bb.0: 2298; AVX512VL-NEXT: vpsraw $3, %xmm0, %xmm0 2299; AVX512VL-NEXT: retq 2300; 2301; X86-SSE-LABEL: splatconstant_shift_v2i16: 2302; X86-SSE: # %bb.0: 2303; X86-SSE-NEXT: psraw $3, %xmm0 2304; X86-SSE-NEXT: retl 2305 %shift = ashr <2 x i16> %a, <i16 3, i16 3> 2306 ret <2 x i16> %shift 2307} 2308 ; splatconstant_shift_v8i8 - ashr of every <8 x i8> lane by the constant 3. The SSE, AVX, AVX512 and 32-bit expectations are psrlw $3, an and with a constant loaded from memory, then an xor and a byte subtract against a splat of 16 (presumably to rebuild the sign bits after the logical shift - not stated in this file). The XOP runs expect one vpshab with a memory operand, and the AVX512VL runs expect a single vpternlogq $108 in place of the separate and/xor pair. 2309define <8 x i8> @splatconstant_shift_v8i8(<8 x i8> %a) nounwind { 2310; SSE-LABEL: splatconstant_shift_v8i8: 2311; SSE: # %bb.0: 2312; SSE-NEXT: psrlw $3, %xmm0 2313; SSE-NEXT: pand {{.*}}(%rip), %xmm0 2314; SSE-NEXT: movdqa {{.*#+}} xmm1 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16] 2315; SSE-NEXT: pxor %xmm1, %xmm0 2316; SSE-NEXT: psubb %xmm1, %xmm0 2317; SSE-NEXT: retq 2318; 2319; AVX-LABEL: splatconstant_shift_v8i8: 2320; AVX: # %bb.0: 2321; AVX-NEXT: vpsrlw $3, %xmm0, %xmm0 2322; AVX-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0 2323; AVX-NEXT: vmovdqa {{.*#+}} xmm1 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16] 2324; AVX-NEXT: vpxor %xmm1, %xmm0, %xmm0 2325; AVX-NEXT: vpsubb %xmm1, %xmm0, %xmm0 2326; AVX-NEXT: retq 2327; 2328; 
XOP-LABEL: splatconstant_shift_v8i8: 2329; XOP: # %bb.0: 2330; XOP-NEXT: vpshab {{.*}}(%rip), %xmm0, %xmm0 2331; XOP-NEXT: retq 2332; 2333; AVX512-LABEL: splatconstant_shift_v8i8: 2334; AVX512: # %bb.0: 2335; AVX512-NEXT: vpsrlw $3, %xmm0, %xmm0 2336; AVX512-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0 2337; AVX512-NEXT: vmovdqa {{.*#+}} xmm1 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16] 2338; AVX512-NEXT: vpxor %xmm1, %xmm0, %xmm0 2339; AVX512-NEXT: vpsubb %xmm1, %xmm0, %xmm0 2340; AVX512-NEXT: retq 2341; 2342; AVX512VL-LABEL: splatconstant_shift_v8i8: 2343; AVX512VL: # %bb.0: 2344; AVX512VL-NEXT: vpsrlw $3, %xmm0, %xmm0 2345; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm1 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16] 2346; AVX512VL-NEXT: vpternlogq $108, {{.*}}(%rip), %xmm1, %xmm0 2347; AVX512VL-NEXT: vpsubb %xmm1, %xmm0, %xmm0 2348; AVX512VL-NEXT: retq 2349; 2350; X86-SSE-LABEL: splatconstant_shift_v8i8: 2351; X86-SSE: # %bb.0: 2352; X86-SSE-NEXT: psrlw $3, %xmm0 2353; X86-SSE-NEXT: pand {{\.LCPI.*}}, %xmm0 2354; X86-SSE-NEXT: movdqa {{.*#+}} xmm1 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16] 2355; X86-SSE-NEXT: pxor %xmm1, %xmm0 2356; X86-SSE-NEXT: psubb %xmm1, %xmm0 2357; X86-SSE-NEXT: retl 2358 %shift = ashr <8 x i8> %a, <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3> 2359 ret <8 x i8> %shift 2360} 2361 ; splatconstant_shift_v4i8 - ashr of every <4 x i8> lane by the constant 3. The expected instruction sequences are the same as for the <8 x i8> variant - psrlw $3, and with a memory constant, xor and byte subtract with a splat of 16; one vpshab for the XOP runs; vpternlogq $108 instead of the and/xor pair for the AVX512VL runs. 2362define <4 x i8> @splatconstant_shift_v4i8(<4 x i8> %a) nounwind { 2363; SSE-LABEL: splatconstant_shift_v4i8: 2364; SSE: # %bb.0: 2365; SSE-NEXT: psrlw $3, %xmm0 2366; SSE-NEXT: pand {{.*}}(%rip), %xmm0 2367; SSE-NEXT: movdqa {{.*#+}} xmm1 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16] 2368; SSE-NEXT: pxor %xmm1, %xmm0 2369; SSE-NEXT: psubb %xmm1, %xmm0 2370; SSE-NEXT: retq 2371; 2372; AVX-LABEL: splatconstant_shift_v4i8: 2373; AVX: # %bb.0: 2374; AVX-NEXT: vpsrlw $3, %xmm0, %xmm0 2375; AVX-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0 2376; AVX-NEXT: vmovdqa {{.*#+}} xmm1 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16] 2377; AVX-NEXT: vpxor %xmm1, 
%xmm0, %xmm0 2378; AVX-NEXT: vpsubb %xmm1, %xmm0, %xmm0 2379; AVX-NEXT: retq 2380; 2381; XOP-LABEL: splatconstant_shift_v4i8: 2382; XOP: # %bb.0: 2383; XOP-NEXT: vpshab {{.*}}(%rip), %xmm0, %xmm0 2384; XOP-NEXT: retq 2385; 2386; AVX512-LABEL: splatconstant_shift_v4i8: 2387; AVX512: # %bb.0: 2388; AVX512-NEXT: vpsrlw $3, %xmm0, %xmm0 2389; AVX512-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0 2390; AVX512-NEXT: vmovdqa {{.*#+}} xmm1 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16] 2391; AVX512-NEXT: vpxor %xmm1, %xmm0, %xmm0 2392; AVX512-NEXT: vpsubb %xmm1, %xmm0, %xmm0 2393; AVX512-NEXT: retq 2394; 2395; AVX512VL-LABEL: splatconstant_shift_v4i8: 2396; AVX512VL: # %bb.0: 2397; AVX512VL-NEXT: vpsrlw $3, %xmm0, %xmm0 2398; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm1 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16] 2399; AVX512VL-NEXT: vpternlogq $108, {{.*}}(%rip), %xmm1, %xmm0 2400; AVX512VL-NEXT: vpsubb %xmm1, %xmm0, %xmm0 2401; AVX512VL-NEXT: retq 2402; 2403; X86-SSE-LABEL: splatconstant_shift_v4i8: 2404; X86-SSE: # %bb.0: 2405; X86-SSE-NEXT: psrlw $3, %xmm0 2406; X86-SSE-NEXT: pand {{\.LCPI.*}}, %xmm0 2407; X86-SSE-NEXT: movdqa {{.*#+}} xmm1 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16] 2408; X86-SSE-NEXT: pxor %xmm1, %xmm0 2409; X86-SSE-NEXT: psubb %xmm1, %xmm0 2410; X86-SSE-NEXT: retl 2411 %shift = ashr <4 x i8> %a, <i8 3, i8 3, i8 3, i8 3> 2412 ret <4 x i8> %shift 2413} 2414 ; splatconstant_shift_v2i8 - ashr of both <2 x i8> lanes by the constant 3. The expected instruction sequences match the wider i8 variants - psrlw $3, and with a memory constant, xor and byte subtract with a splat of 16; one vpshab for the XOP runs; vpternlogq $108 instead of the and/xor pair for the AVX512VL runs. 2415define <2 x i8> @splatconstant_shift_v2i8(<2 x i8> %a) nounwind { 2416; SSE-LABEL: splatconstant_shift_v2i8: 2417; SSE: # %bb.0: 2418; SSE-NEXT: psrlw $3, %xmm0 2419; SSE-NEXT: pand {{.*}}(%rip), %xmm0 2420; SSE-NEXT: movdqa {{.*#+}} xmm1 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16] 2421; SSE-NEXT: pxor %xmm1, %xmm0 2422; SSE-NEXT: psubb %xmm1, %xmm0 2423; SSE-NEXT: retq 2424; 2425; AVX-LABEL: splatconstant_shift_v2i8: 2426; AVX: # %bb.0: 2427; AVX-NEXT: vpsrlw $3, %xmm0, %xmm0 2428; AVX-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0 2429; AVX-NEXT: vmovdqa {{.*#+}} xmm1 = 
[16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16] 2430; AVX-NEXT: vpxor %xmm1, %xmm0, %xmm0 2431; AVX-NEXT: vpsubb %xmm1, %xmm0, %xmm0 2432; AVX-NEXT: retq 2433; 2434; XOP-LABEL: splatconstant_shift_v2i8: 2435; XOP: # %bb.0: 2436; XOP-NEXT: vpshab {{.*}}(%rip), %xmm0, %xmm0 2437; XOP-NEXT: retq 2438; 2439; AVX512-LABEL: splatconstant_shift_v2i8: 2440; AVX512: # %bb.0: 2441; AVX512-NEXT: vpsrlw $3, %xmm0, %xmm0 2442; AVX512-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0 2443; AVX512-NEXT: vmovdqa {{.*#+}} xmm1 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16] 2444; AVX512-NEXT: vpxor %xmm1, %xmm0, %xmm0 2445; AVX512-NEXT: vpsubb %xmm1, %xmm0, %xmm0 2446; AVX512-NEXT: retq 2447; 2448; AVX512VL-LABEL: splatconstant_shift_v2i8: 2449; AVX512VL: # %bb.0: 2450; AVX512VL-NEXT: vpsrlw $3, %xmm0, %xmm0 2451; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm1 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16] 2452; AVX512VL-NEXT: vpternlogq $108, {{.*}}(%rip), %xmm1, %xmm0 2453; AVX512VL-NEXT: vpsubb %xmm1, %xmm0, %xmm0 2454; AVX512VL-NEXT: retq 2455; 2456; X86-SSE-LABEL: splatconstant_shift_v2i8: 2457; X86-SSE: # %bb.0: 2458; X86-SSE-NEXT: psrlw $3, %xmm0 2459; X86-SSE-NEXT: pand {{\.LCPI.*}}, %xmm0 2460; X86-SSE-NEXT: movdqa {{.*#+}} xmm1 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16] 2461; X86-SSE-NEXT: pxor %xmm1, %xmm0 2462; X86-SSE-NEXT: psubb %xmm1, %xmm0 2463; X86-SSE-NEXT: retl 2464 %shift = ashr <2 x i8> %a, <i8 3, i8 3> 2465 ret <2 x i8> %shift 2466} 2467