; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefix=ALL --check-prefix=SSE --check-prefix=SSE2
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefix=ALL --check-prefix=SSE --check-prefix=SSE41
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX1
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX2
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+xop,+avx | FileCheck %s --check-prefix=ALL --check-prefix=XOP --check-prefix=XOPAVX1
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+xop,+avx2 | FileCheck %s --check-prefix=ALL --check-prefix=XOP --check-prefix=XOPAVX2
; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=knl -mattr=+avx512bw | FileCheck %s --check-prefix=ALL --check-prefix=AVX512 --check-prefix=AVX512BW
;
; Just one 32-bit run to make sure we do reasonable things for i64 shifts.
; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefix=ALL --check-prefix=X32-SSE --check-prefix=X32-SSE2

;
; Variable Shifts
;

define <2 x i64> @var_shift_v2i64(<2 x i64> %a, <2 x i64> %b) nounwind {
; SSE2-LABEL: var_shift_v2i64:
; SSE2: # BB#0:
; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm1[2,3,0,1]
; SSE2-NEXT: movdqa %xmm0, %xmm2
; SSE2-NEXT: psrlq %xmm3, %xmm2
; SSE2-NEXT: psrlq %xmm1, %xmm0
; SSE2-NEXT: movsd {{.*#+}} xmm2 = xmm0[0],xmm2[1]
; SSE2-NEXT: movapd %xmm2, %xmm0
; SSE2-NEXT: retq
;
; SSE41-LABEL: var_shift_v2i64:
; SSE41: # BB#0:
; SSE41-NEXT: movdqa %xmm0, %xmm2
; SSE41-NEXT: psrlq %xmm1, %xmm2
; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
; SSE41-NEXT: psrlq %xmm1, %xmm0
; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm2[0,1,2,3],xmm0[4,5,6,7]
; SSE41-NEXT: retq
;
; AVX1-LABEL: var_shift_v2i64:
; AVX1: # BB#0:
; AVX1-NEXT: vpsrlq %xmm1, %xmm0, %xmm2
; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
; AVX1-NEXT: vpsrlq %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm2[0,1,2,3],xmm0[4,5,6,7]
; AVX1-NEXT: retq
;
; AVX2-LABEL: var_shift_v2i64:
; AVX2: # BB#0:
; AVX2-NEXT: vpsrlvq %xmm1, %xmm0, %xmm0
; AVX2-NEXT: retq
;
; XOPAVX1-LABEL: var_shift_v2i64:
; XOPAVX1: # BB#0:
; XOPAVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2
; XOPAVX1-NEXT: vpsubq %xmm1, %xmm2, %xmm1
; XOPAVX1-NEXT: vpshlq %xmm1, %xmm0, %xmm0
; XOPAVX1-NEXT: retq
;
; XOPAVX2-LABEL: var_shift_v2i64:
; XOPAVX2: # BB#0:
; XOPAVX2-NEXT: vpsrlvq %xmm1, %xmm0, %xmm0
; XOPAVX2-NEXT: retq
;
; AVX512-LABEL: var_shift_v2i64:
; AVX512: ## BB#0:
; AVX512-NEXT: vpsrlvq %xmm1, %xmm0, %xmm0
; AVX512-NEXT: retq
;
; X32-SSE-LABEL: var_shift_v2i64:
; X32-SSE: # BB#0:
; X32-SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm1[2,3,0,1]
; X32-SSE-NEXT: movdqa %xmm0, %xmm2
; X32-SSE-NEXT: psrlq %xmm3, %xmm2
; X32-SSE-NEXT: movq {{.*#+}} xmm1 = xmm1[0],zero
; X32-SSE-NEXT: psrlq %xmm1, %xmm0
; X32-SSE-NEXT: movsd {{.*#+}} xmm2 = xmm0[0],xmm2[1]
; X32-SSE-NEXT: movapd %xmm2, %xmm0
; X32-SSE-NEXT: retl
  %shift = lshr <2 x i64> %a, %b
  ret <2 x i64> %shift
}
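
; PSRLQ takes a single shift count from the low 64 bits of its source
; operand rather than a per-lane count, so before AVX2's VPSRLVQ the
; lowering above shifts the whole vector once per lane (PSHUFD brings the
; upper count into the low qword) and blends the two results. XOP only has
; a variable *left* shift, so the counts are negated first; in effect the
; XOPAVX1 sequence computes vpshlq(%a, 0 - %b).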

define <4 x i32> @var_shift_v4i32(<4 x i32> %a, <4 x i32> %b) nounwind {
; SSE2-LABEL: var_shift_v4i32:
; SSE2: # BB#0:
; SSE2-NEXT: movdqa %xmm1, %xmm2
; SSE2-NEXT: psrldq {{.*#+}} xmm2 = xmm2[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; SSE2-NEXT: movdqa %xmm0, %xmm3
; SSE2-NEXT: psrld %xmm2, %xmm3
; SSE2-NEXT: movdqa %xmm1, %xmm2
; SSE2-NEXT: psrlq $32, %xmm2
; SSE2-NEXT: movdqa %xmm0, %xmm4
; SSE2-NEXT: psrld %xmm2, %xmm4
; SSE2-NEXT: movsd {{.*#+}} xmm3 = xmm4[0],xmm3[1]
; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm3[1,3,2,3]
; SSE2-NEXT: pxor %xmm3, %xmm3
; SSE2-NEXT: movdqa %xmm1, %xmm4
; SSE2-NEXT: punpckhdq {{.*#+}} xmm4 = xmm4[2],xmm3[2],xmm4[3],xmm3[3]
; SSE2-NEXT: movdqa %xmm0, %xmm5
; SSE2-NEXT: psrld %xmm4, %xmm5
; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1]
; SSE2-NEXT: psrld %xmm1, %xmm0
; SSE2-NEXT: movsd {{.*#+}} xmm5 = xmm0[0],xmm5[1]
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm5[0,2,2,3]
; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
; SSE2-NEXT: retq
;
; SSE41-LABEL: var_shift_v4i32:
; SSE41: # BB#0:
; SSE41-NEXT: movdqa %xmm1, %xmm2
; SSE41-NEXT: psrldq {{.*#+}} xmm2 = xmm2[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; SSE41-NEXT: movdqa %xmm0, %xmm3
; SSE41-NEXT: psrld %xmm2, %xmm3
; SSE41-NEXT: movdqa %xmm1, %xmm2
; SSE41-NEXT: psrlq $32, %xmm2
; SSE41-NEXT: movdqa %xmm0, %xmm4
; SSE41-NEXT: psrld %xmm2, %xmm4
; SSE41-NEXT: pblendw {{.*#+}} xmm4 = xmm4[0,1,2,3],xmm3[4,5,6,7]
; SSE41-NEXT: pxor %xmm2, %xmm2
; SSE41-NEXT: pmovzxdq {{.*#+}} xmm3 = xmm1[0],zero,xmm1[1],zero
; SSE41-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm2[2],xmm1[3],xmm2[3]
; SSE41-NEXT: movdqa %xmm0, %xmm2
; SSE41-NEXT: psrld %xmm1, %xmm2
; SSE41-NEXT: psrld %xmm3, %xmm0
; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm2[4,5,6,7]
; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm4[2,3],xmm0[4,5],xmm4[6,7]
; SSE41-NEXT: retq
;
; AVX1-LABEL: var_shift_v4i32:
; AVX1: # BB#0:
; AVX1-NEXT: vpsrldq {{.*#+}} xmm2 = xmm1[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; AVX1-NEXT: vpsrld %xmm2, %xmm0, %xmm2
; AVX1-NEXT: vpsrlq $32, %xmm1, %xmm3
; AVX1-NEXT: vpsrld %xmm3, %xmm0, %xmm3
; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1,2,3],xmm2[4,5,6,7]
; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3
; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm3 = xmm1[2],xmm3[2],xmm1[3],xmm3[3]
; AVX1-NEXT: vpsrld %xmm3, %xmm0, %xmm3
; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero
; AVX1-NEXT: vpsrld %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm3[4,5,6,7]
; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7]
; AVX1-NEXT: retq
;
; AVX2-LABEL: var_shift_v4i32:
; AVX2: # BB#0:
; AVX2-NEXT: vpsrlvd %xmm1, %xmm0, %xmm0
; AVX2-NEXT: retq
;
; XOPAVX1-LABEL: var_shift_v4i32:
; XOPAVX1: # BB#0:
; XOPAVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2
; XOPAVX1-NEXT: vpsubd %xmm1, %xmm2, %xmm1
; XOPAVX1-NEXT: vpshld %xmm1, %xmm0, %xmm0
; XOPAVX1-NEXT: retq
;
; XOPAVX2-LABEL: var_shift_v4i32:
; XOPAVX2: # BB#0:
; XOPAVX2-NEXT: vpsrlvd %xmm1, %xmm0, %xmm0
; XOPAVX2-NEXT: retq
;
; AVX512-LABEL: var_shift_v4i32:
; AVX512: ## BB#0:
; AVX512-NEXT: vpsrlvd %xmm1, %xmm0, %xmm0
; AVX512-NEXT: retq
;
; X32-SSE-LABEL: var_shift_v4i32:
; X32-SSE: # BB#0:
; X32-SSE-NEXT: movdqa %xmm1, %xmm2
; X32-SSE-NEXT: psrldq {{.*#+}} xmm2 = xmm2[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; X32-SSE-NEXT: movdqa %xmm0, %xmm3
; X32-SSE-NEXT: psrld %xmm2, %xmm3
; X32-SSE-NEXT: movdqa %xmm1, %xmm2
; X32-SSE-NEXT: psrlq $32, %xmm2
; X32-SSE-NEXT: movdqa %xmm0, %xmm4
; X32-SSE-NEXT: psrld %xmm2, %xmm4
; X32-SSE-NEXT: movsd {{.*#+}} xmm3 = xmm4[0],xmm3[1]
; X32-SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm3[1,3,2,3]
; X32-SSE-NEXT: pxor %xmm3, %xmm3
; X32-SSE-NEXT: movdqa %xmm1, %xmm4
; X32-SSE-NEXT: punpckhdq {{.*#+}} xmm4 = xmm4[2],xmm3[2],xmm4[3],xmm3[3]
; X32-SSE-NEXT: movdqa %xmm0, %xmm5
; X32-SSE-NEXT: psrld %xmm4, %xmm5
; X32-SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1]
; X32-SSE-NEXT: psrld %xmm1, %xmm0
; X32-SSE-NEXT: movsd {{.*#+}} xmm5 = xmm0[0],xmm5[1]
; X32-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm5[0,2,2,3]
; X32-SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
; X32-SSE-NEXT: retl
  %shift = lshr <4 x i32> %a, %b
  ret <4 x i32> %shift
}
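
; PSRLD similarly consumes one scalar count from the bottom of an XMM
; register, so the generic i32 expansion above performs four whole-vector
; shifts, one per extracted count (PSRLDQ/PSRLQ/PUNPCK*DQ move each count
; to the bottom with the upper bits zeroed), then recombines lane i of
; shift i with MOVSD/PSHUFD/PUNPCKLDQ (SSE2) or PBLENDW (SSE4.1/AVX1).
; AVX2 and AVX512 collapse all of this into a single VPSRLVD.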

define <8 x i16> @var_shift_v8i16(<8 x i16> %a, <8 x i16> %b) nounwind {
; SSE2-LABEL: var_shift_v8i16:
; SSE2: # BB#0:
; SSE2-NEXT: psllw $12, %xmm1
; SSE2-NEXT: movdqa %xmm1, %xmm2
; SSE2-NEXT: psraw $15, %xmm2
; SSE2-NEXT: movdqa %xmm2, %xmm3
; SSE2-NEXT: pandn %xmm0, %xmm3
; SSE2-NEXT: psrlw $8, %xmm0
; SSE2-NEXT: pand %xmm2, %xmm0
; SSE2-NEXT: por %xmm3, %xmm0
; SSE2-NEXT: paddw %xmm1, %xmm1
; SSE2-NEXT: movdqa %xmm1, %xmm2
; SSE2-NEXT: psraw $15, %xmm2
; SSE2-NEXT: movdqa %xmm2, %xmm3
; SSE2-NEXT: pandn %xmm0, %xmm3
; SSE2-NEXT: psrlw $4, %xmm0
; SSE2-NEXT: pand %xmm2, %xmm0
; SSE2-NEXT: por %xmm3, %xmm0
; SSE2-NEXT: paddw %xmm1, %xmm1
; SSE2-NEXT: movdqa %xmm1, %xmm2
; SSE2-NEXT: psraw $15, %xmm2
; SSE2-NEXT: movdqa %xmm2, %xmm3
; SSE2-NEXT: pandn %xmm0, %xmm3
; SSE2-NEXT: psrlw $2, %xmm0
; SSE2-NEXT: pand %xmm2, %xmm0
; SSE2-NEXT: por %xmm3, %xmm0
; SSE2-NEXT: paddw %xmm1, %xmm1
; SSE2-NEXT: psraw $15, %xmm1
; SSE2-NEXT: movdqa %xmm1, %xmm2
; SSE2-NEXT: pandn %xmm0, %xmm2
; SSE2-NEXT: psrlw $1, %xmm0
; SSE2-NEXT: pand %xmm1, %xmm0
; SSE2-NEXT: por %xmm2, %xmm0
; SSE2-NEXT: retq
;
; SSE41-LABEL: var_shift_v8i16:
; SSE41: # BB#0:
; SSE41-NEXT: movdqa %xmm0, %xmm2
; SSE41-NEXT: movdqa %xmm1, %xmm0
; SSE41-NEXT: psllw $12, %xmm0
; SSE41-NEXT: psllw $4, %xmm1
; SSE41-NEXT: por %xmm0, %xmm1
; SSE41-NEXT: movdqa %xmm1, %xmm3
; SSE41-NEXT: paddw %xmm3, %xmm3
; SSE41-NEXT: movdqa %xmm2, %xmm4
; SSE41-NEXT: psrlw $8, %xmm4
; SSE41-NEXT: movdqa %xmm1, %xmm0
; SSE41-NEXT: pblendvb %xmm4, %xmm2
; SSE41-NEXT: movdqa %xmm2, %xmm1
; SSE41-NEXT: psrlw $4, %xmm1
; SSE41-NEXT: movdqa %xmm3, %xmm0
; SSE41-NEXT: pblendvb %xmm1, %xmm2
; SSE41-NEXT: movdqa %xmm2, %xmm1
; SSE41-NEXT: psrlw $2, %xmm1
; SSE41-NEXT: paddw %xmm3, %xmm3
; SSE41-NEXT: movdqa %xmm3, %xmm0
; SSE41-NEXT: pblendvb %xmm1, %xmm2
; SSE41-NEXT: movdqa %xmm2, %xmm1
; SSE41-NEXT: psrlw $1, %xmm1
; SSE41-NEXT: paddw %xmm3, %xmm3
; SSE41-NEXT: movdqa %xmm3, %xmm0
; SSE41-NEXT: pblendvb %xmm1, %xmm2
; SSE41-NEXT: movdqa %xmm2, %xmm0
; SSE41-NEXT: retq
;
; AVX1-LABEL: var_shift_v8i16:
; AVX1: # BB#0:
; AVX1-NEXT: vpsllw $12, %xmm1, %xmm2
; AVX1-NEXT: vpsllw $4, %xmm1, %xmm1
; AVX1-NEXT: vpor %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vpaddw %xmm1, %xmm1, %xmm2
; AVX1-NEXT: vpsrlw $8, %xmm0, %xmm3
; AVX1-NEXT: vpblendvb %xmm1, %xmm3, %xmm0, %xmm0
; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm1
; AVX1-NEXT: vpblendvb %xmm2, %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpsrlw $2, %xmm0, %xmm1
; AVX1-NEXT: vpaddw %xmm2, %xmm2, %xmm2
; AVX1-NEXT: vpblendvb %xmm2, %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpsrlw $1, %xmm0, %xmm1
; AVX1-NEXT: vpaddw %xmm2, %xmm2, %xmm2
; AVX1-NEXT: vpblendvb %xmm2, %xmm1, %xmm0, %xmm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: var_shift_v8i16:
; AVX2: # BB#0:
; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; AVX2-NEXT: vpsrlvd %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero,ymm0[16,17,20,21,24,25,28,29],zero,zero,zero,zero,zero,zero,zero,zero
; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
; AVX2-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; XOP-LABEL: var_shift_v8i16:
; XOP: # BB#0:
; XOP-NEXT: vpxor %xmm2, %xmm2, %xmm2
; XOP-NEXT: vpsubw %xmm1, %xmm2, %xmm1
; XOP-NEXT: vpshlw %xmm1, %xmm0, %xmm0
; XOP-NEXT: retq
;
; AVX512-LABEL: var_shift_v8i16:
; AVX512: ## BB#0:
; AVX512-NEXT: ## kill: %XMM1<def> %XMM1<kill> %ZMM1<def>
; AVX512-NEXT: ## kill: %XMM0<def> %XMM0<kill> %ZMM0<def>
; AVX512-NEXT: vpsrlvw %zmm1, %zmm0, %zmm0
; AVX512-NEXT: ## kill: %XMM0<def> %XMM0<kill> %ZMM0<kill>
; AVX512-NEXT: retq
;
; X32-SSE-LABEL: var_shift_v8i16:
; X32-SSE: # BB#0:
; X32-SSE-NEXT: psllw $12, %xmm1
; X32-SSE-NEXT: movdqa %xmm1, %xmm2
; X32-SSE-NEXT: psraw $15, %xmm2
; X32-SSE-NEXT: movdqa %xmm2, %xmm3
; X32-SSE-NEXT: pandn %xmm0, %xmm3
; X32-SSE-NEXT: psrlw $8, %xmm0
; X32-SSE-NEXT: pand %xmm2, %xmm0
; X32-SSE-NEXT: por %xmm3, %xmm0
; X32-SSE-NEXT: paddw %xmm1, %xmm1
; X32-SSE-NEXT: movdqa %xmm1, %xmm2
; X32-SSE-NEXT: psraw $15, %xmm2
; X32-SSE-NEXT: movdqa %xmm2, %xmm3
; X32-SSE-NEXT: pandn %xmm0, %xmm3
; X32-SSE-NEXT: psrlw $4, %xmm0
; X32-SSE-NEXT: pand %xmm2, %xmm0
; X32-SSE-NEXT: por %xmm3, %xmm0
; X32-SSE-NEXT: paddw %xmm1, %xmm1
; X32-SSE-NEXT: movdqa %xmm1, %xmm2
; X32-SSE-NEXT: psraw $15, %xmm2
; X32-SSE-NEXT: movdqa %xmm2, %xmm3
; X32-SSE-NEXT: pandn %xmm0, %xmm3
; X32-SSE-NEXT: psrlw $2, %xmm0
; X32-SSE-NEXT: pand %xmm2, %xmm0
; X32-SSE-NEXT: por %xmm3, %xmm0
; X32-SSE-NEXT: paddw %xmm1, %xmm1
; X32-SSE-NEXT: psraw $15, %xmm1
; X32-SSE-NEXT: movdqa %xmm1, %xmm2
; X32-SSE-NEXT: pandn %xmm0, %xmm2
; X32-SSE-NEXT: psrlw $1, %xmm0
; X32-SSE-NEXT: pand %xmm1, %xmm0
; X32-SSE-NEXT: por %xmm2, %xmm0
; X32-SSE-NEXT: retl
  %shift = lshr <8 x i16> %a, %b
  ret <8 x i16> %shift
}
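
; There is no variable i16 shift until AVX512BW's VPSRLVW, so the SSE2
; expansion above tests the 4 shift-amount bits from the top down:
; PSLLW $12 moves amount bit 3 into each word's sign bit, PSRAW $15 turns
; that sign bit into an all-ones/all-zeros mask used to select between x
; and x >> 8, and PADDW doubles the control word to expose the next bit
; (steps of 8, 4, 2, 1). SSE4.1/AVX1 drive the same selection through
; PBLENDVB, while AVX2 zero-extends to <8 x i32>, uses VPSRLVD and repacks
; with VPSHUFB/VPERMQ.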

define <16 x i8> @var_shift_v16i8(<16 x i8> %a, <16 x i8> %b) nounwind {
; SSE2-LABEL: var_shift_v16i8:
; SSE2: # BB#0:
; SSE2-NEXT: psllw $5, %xmm1
; SSE2-NEXT: pxor %xmm2, %xmm2
; SSE2-NEXT: pxor %xmm3, %xmm3
; SSE2-NEXT: pcmpgtb %xmm1, %xmm3
; SSE2-NEXT: movdqa %xmm3, %xmm4
; SSE2-NEXT: pandn %xmm0, %xmm4
; SSE2-NEXT: psrlw $4, %xmm0
; SSE2-NEXT: pand {{.*}}(%rip), %xmm0
; SSE2-NEXT: pand %xmm3, %xmm0
; SSE2-NEXT: por %xmm4, %xmm0
; SSE2-NEXT: paddb %xmm1, %xmm1
; SSE2-NEXT: pxor %xmm3, %xmm3
; SSE2-NEXT: pcmpgtb %xmm1, %xmm3
; SSE2-NEXT: movdqa %xmm3, %xmm4
; SSE2-NEXT: pandn %xmm0, %xmm4
; SSE2-NEXT: psrlw $2, %xmm0
; SSE2-NEXT: pand {{.*}}(%rip), %xmm0
; SSE2-NEXT: pand %xmm3, %xmm0
; SSE2-NEXT: por %xmm4, %xmm0
; SSE2-NEXT: paddb %xmm1, %xmm1
; SSE2-NEXT: pcmpgtb %xmm1, %xmm2
; SSE2-NEXT: movdqa %xmm2, %xmm1
; SSE2-NEXT: pandn %xmm0, %xmm1
; SSE2-NEXT: psrlw $1, %xmm0
; SSE2-NEXT: pand {{.*}}(%rip), %xmm0
; SSE2-NEXT: pand %xmm2, %xmm0
; SSE2-NEXT: por %xmm1, %xmm0
; SSE2-NEXT: retq
;
; SSE41-LABEL: var_shift_v16i8:
; SSE41: # BB#0:
; SSE41-NEXT: movdqa %xmm0, %xmm2
; SSE41-NEXT: psllw $5, %xmm1
; SSE41-NEXT: movdqa %xmm2, %xmm3
; SSE41-NEXT: psrlw $4, %xmm3
; SSE41-NEXT: pand {{.*}}(%rip), %xmm3
; SSE41-NEXT: movdqa %xmm1, %xmm0
; SSE41-NEXT: pblendvb %xmm3, %xmm2
; SSE41-NEXT: movdqa %xmm2, %xmm3
; SSE41-NEXT: psrlw $2, %xmm3
; SSE41-NEXT: pand {{.*}}(%rip), %xmm3
; SSE41-NEXT: paddb %xmm1, %xmm1
; SSE41-NEXT: movdqa %xmm1, %xmm0
; SSE41-NEXT: pblendvb %xmm3, %xmm2
; SSE41-NEXT: movdqa %xmm2, %xmm3
; SSE41-NEXT: psrlw $1, %xmm3
; SSE41-NEXT: pand {{.*}}(%rip), %xmm3
; SSE41-NEXT: paddb %xmm1, %xmm1
; SSE41-NEXT: movdqa %xmm1, %xmm0
; SSE41-NEXT: pblendvb %xmm3, %xmm2
; SSE41-NEXT: movdqa %xmm2, %xmm0
; SSE41-NEXT: retq
;
; AVX-LABEL: var_shift_v16i8:
; AVX: # BB#0:
; AVX-NEXT: vpsllw $5, %xmm1, %xmm1
; AVX-NEXT: vpsrlw $4, %xmm0, %xmm2
; AVX-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2
; AVX-NEXT: vpblendvb %xmm1, %xmm2, %xmm0, %xmm0
; AVX-NEXT: vpsrlw $2, %xmm0, %xmm2
; AVX-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2
; AVX-NEXT: vpaddb %xmm1, %xmm1, %xmm1
; AVX-NEXT: vpblendvb %xmm1, %xmm2, %xmm0, %xmm0
; AVX-NEXT: vpsrlw $1, %xmm0, %xmm2
; AVX-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2
; AVX-NEXT: vpaddb %xmm1, %xmm1, %xmm1
; AVX-NEXT: vpblendvb %xmm1, %xmm2, %xmm0, %xmm0
; AVX-NEXT: retq
;
; XOP-LABEL: var_shift_v16i8:
; XOP: # BB#0:
; XOP-NEXT: vpxor %xmm2, %xmm2, %xmm2
; XOP-NEXT: vpsubb %xmm1, %xmm2, %xmm1
; XOP-NEXT: vpshlb %xmm1, %xmm0, %xmm0
; XOP-NEXT: retq
;
; AVX512-LABEL: var_shift_v16i8:
; AVX512: ## BB#0:
; AVX512-NEXT: vpsllw $5, %xmm1, %xmm1
; AVX512-NEXT: vpsrlw $4, %xmm0, %xmm2
; AVX512-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2
; AVX512-NEXT: vpblendvb %xmm1, %xmm2, %xmm0, %xmm0
; AVX512-NEXT: vpsrlw $2, %xmm0, %xmm2
; AVX512-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2
; AVX512-NEXT: vpaddb %xmm1, %xmm1, %xmm1
; AVX512-NEXT: vpblendvb %xmm1, %xmm2, %xmm0, %xmm0
; AVX512-NEXT: vpsrlw $1, %xmm0, %xmm2
; AVX512-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2
; AVX512-NEXT: vpaddb %xmm1, %xmm1, %xmm1
; AVX512-NEXT: vpblendvb %xmm1, %xmm2, %xmm0, %xmm0
; AVX512-NEXT: retq
;
; X32-SSE-LABEL: var_shift_v16i8:
; X32-SSE: # BB#0:
; X32-SSE-NEXT: psllw $5, %xmm1
; X32-SSE-NEXT: pxor %xmm2, %xmm2
; X32-SSE-NEXT: pxor %xmm3, %xmm3
; X32-SSE-NEXT: pcmpgtb %xmm1, %xmm3
; X32-SSE-NEXT: movdqa %xmm3, %xmm4
; X32-SSE-NEXT: pandn %xmm0, %xmm4
; X32-SSE-NEXT: psrlw $4, %xmm0
; X32-SSE-NEXT: pand {{\.LCPI.*}}, %xmm0
; X32-SSE-NEXT: pand %xmm3, %xmm0
; X32-SSE-NEXT: por %xmm4, %xmm0
; X32-SSE-NEXT: paddb %xmm1, %xmm1
; X32-SSE-NEXT: pxor %xmm3, %xmm3
; X32-SSE-NEXT: pcmpgtb %xmm1, %xmm3
; X32-SSE-NEXT: movdqa %xmm3, %xmm4
; X32-SSE-NEXT: pandn %xmm0, %xmm4
; X32-SSE-NEXT: psrlw $2, %xmm0
; X32-SSE-NEXT: pand {{\.LCPI.*}}, %xmm0
; X32-SSE-NEXT: pand %xmm3, %xmm0
; X32-SSE-NEXT: por %xmm4, %xmm0
; X32-SSE-NEXT: paddb %xmm1, %xmm1
; X32-SSE-NEXT: pcmpgtb %xmm1, %xmm2
; X32-SSE-NEXT: movdqa %xmm2, %xmm1
; X32-SSE-NEXT: pandn %xmm0, %xmm1
; X32-SSE-NEXT: psrlw $1, %xmm0
; X32-SSE-NEXT: pand {{\.LCPI.*}}, %xmm0
; X32-SSE-NEXT: pand %xmm2, %xmm0
; X32-SSE-NEXT: por %xmm1, %xmm0
; X32-SSE-NEXT: retl
  %shift = lshr <16 x i8> %a, %b
  ret <16 x i8> %shift
}
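
; For bytes the same bit-serial idea is used with a 3-bit amount: PSLLW $5
; puts amount bit 2 in each byte's sign bit, and PCMPGTB against zero
; (SSE2) or PBLENDVB (SSE4.1/AVX) selects the shifted bytes step by step.
; Since x86 has no 8-bit shifts, each step shifts words (PSRLW $4/$2/$1)
; and then masks with a pool constant such as 0x0F0F... to clear the bits
; that crossed in from the neighbouring byte.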

;
; Uniform Variable Shifts
;

define <2 x i64> @splatvar_shift_v2i64(<2 x i64> %a, <2 x i64> %b) nounwind {
; SSE-LABEL: splatvar_shift_v2i64:
; SSE: # BB#0:
; SSE-NEXT: psrlq %xmm1, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: splatvar_shift_v2i64:
; AVX: # BB#0:
; AVX-NEXT: vpsrlq %xmm1, %xmm0, %xmm0
; AVX-NEXT: retq
;
; XOP-LABEL: splatvar_shift_v2i64:
; XOP: # BB#0:
; XOP-NEXT: vpsrlq %xmm1, %xmm0, %xmm0
; XOP-NEXT: retq
;
; AVX512-LABEL: splatvar_shift_v2i64:
; AVX512: ## BB#0:
; AVX512-NEXT: vpsrlq %xmm1, %xmm0, %xmm0
; AVX512-NEXT: retq
;
; X32-SSE-LABEL: splatvar_shift_v2i64:
; X32-SSE: # BB#0:
; X32-SSE-NEXT: movq {{.*#+}} xmm1 = xmm1[0],zero
; X32-SSE-NEXT: psrlq %xmm1, %xmm0
; X32-SSE-NEXT: retl
  %splat = shufflevector <2 x i64> %b, <2 x i64> undef, <2 x i32> zeroinitializer
  %shift = lshr <2 x i64> %a, %splat
  ret <2 x i64> %shift
}
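
; With a splatted count the scalar-count semantics of PSRLQ become a
; feature: the shift amount is already in the low 64 bits, so no expansion
; is needed at all. The 32-bit run additionally zeroes the upper qword of
; the count register with MOVQ first.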

define <4 x i32> @splatvar_shift_v4i32(<4 x i32> %a, <4 x i32> %b) nounwind {
; SSE2-LABEL: splatvar_shift_v4i32:
; SSE2: # BB#0:
; SSE2-NEXT: xorps %xmm2, %xmm2
; SSE2-NEXT: movss {{.*#+}} xmm2 = xmm1[0],xmm2[1,2,3]
; SSE2-NEXT: psrld %xmm2, %xmm0
; SSE2-NEXT: retq
;
; SSE41-LABEL: splatvar_shift_v4i32:
; SSE41: # BB#0:
; SSE41-NEXT: pxor %xmm2, %xmm2
; SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm1[0,1],xmm2[2,3,4,5,6,7]
; SSE41-NEXT: psrld %xmm2, %xmm0
; SSE41-NEXT: retq
;
; AVX-LABEL: splatvar_shift_v4i32:
; AVX: # BB#0:
; AVX-NEXT: vpxor %xmm2, %xmm2, %xmm2
; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3,4,5,6,7]
; AVX-NEXT: vpsrld %xmm1, %xmm0, %xmm0
; AVX-NEXT: retq
;
; XOP-LABEL: splatvar_shift_v4i32:
; XOP: # BB#0:
; XOP-NEXT: vpxor %xmm2, %xmm2, %xmm2
; XOP-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3,4,5,6,7]
; XOP-NEXT: vpsrld %xmm1, %xmm0, %xmm0
; XOP-NEXT: retq
;
; AVX512-LABEL: splatvar_shift_v4i32:
; AVX512: ## BB#0:
; AVX512-NEXT: vxorps %xmm2, %xmm2, %xmm2
; AVX512-NEXT: vmovss {{.*#+}} xmm1 = xmm1[0],xmm2[1,2,3]
; AVX512-NEXT: vpsrld %xmm1, %xmm0, %xmm0
; AVX512-NEXT: retq
;
; X32-SSE-LABEL: splatvar_shift_v4i32:
; X32-SSE: # BB#0:
; X32-SSE-NEXT: xorps %xmm2, %xmm2
; X32-SSE-NEXT: movss {{.*#+}} xmm2 = xmm1[0],xmm2[1,2,3]
; X32-SSE-NEXT: psrld %xmm2, %xmm0
; X32-SSE-NEXT: retl
  %splat = shufflevector <4 x i32> %b, <4 x i32> undef, <4 x i32> zeroinitializer
  %shift = lshr <4 x i32> %a, %splat
  ret <4 x i32> %shift
}
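
; The count operand of PSRLD is read as a 64-bit quantity, so a splatted
; i32 count must have everything above its low 32 bits cleared before use;
; MOVSS (SSE2/AVX512) or PBLENDW against zero (SSE4.1/AVX/XOP) does that,
; after which a single vector shift suffices.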

define <8 x i16> @splatvar_shift_v8i16(<8 x i16> %a, <8 x i16> %b) nounwind {
; SSE2-LABEL: splatvar_shift_v8i16:
; SSE2: # BB#0:
; SSE2-NEXT: movd %xmm1, %eax
; SSE2-NEXT: movzwl %ax, %eax
; SSE2-NEXT: movd %eax, %xmm1
; SSE2-NEXT: psrlw %xmm1, %xmm0
; SSE2-NEXT: retq
;
; SSE41-LABEL: splatvar_shift_v8i16:
; SSE41: # BB#0:
; SSE41-NEXT: pxor %xmm2, %xmm2
; SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm1[0],xmm2[1,2,3,4,5,6,7]
; SSE41-NEXT: psrlw %xmm2, %xmm0
; SSE41-NEXT: retq
;
; AVX-LABEL: splatvar_shift_v8i16:
; AVX: # BB#0:
; AVX-NEXT: vpxor %xmm2, %xmm2, %xmm2
; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1,2,3,4,5,6,7]
; AVX-NEXT: vpsrlw %xmm1, %xmm0, %xmm0
; AVX-NEXT: retq
;
; XOP-LABEL: splatvar_shift_v8i16:
; XOP: # BB#0:
; XOP-NEXT: vpxor %xmm2, %xmm2, %xmm2
; XOP-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1,2,3,4,5,6,7]
; XOP-NEXT: vpsrlw %xmm1, %xmm0, %xmm0
; XOP-NEXT: retq
;
; AVX512-LABEL: splatvar_shift_v8i16:
; AVX512: ## BB#0:
; AVX512-NEXT: vpxor %xmm2, %xmm2, %xmm2
; AVX512-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1,2,3,4,5,6,7]
; AVX512-NEXT: vpsrlw %xmm1, %xmm0, %xmm0
; AVX512-NEXT: retq
;
; X32-SSE-LABEL: splatvar_shift_v8i16:
; X32-SSE: # BB#0:
; X32-SSE-NEXT: movd %xmm1, %eax
; X32-SSE-NEXT: movzwl %ax, %eax
; X32-SSE-NEXT: movd %eax, %xmm1
; X32-SSE-NEXT: psrlw %xmm1, %xmm0
; X32-SSE-NEXT: retl
  %splat = shufflevector <8 x i16> %b, <8 x i16> undef, <8 x i32> zeroinitializer
  %shift = lshr <8 x i16> %a, %splat
  ret <8 x i16> %shift
}
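
; Same trick for i16: only the bottom word may carry the count, so SSE2
; round-trips through a GPR (MOVD/MOVZWL/MOVD) while SSE4.1 and later
; blend the count word against zero, then issue one PSRLW.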

define <16 x i8> @splatvar_shift_v16i8(<16 x i8> %a, <16 x i8> %b) nounwind {
; SSE2-LABEL: splatvar_shift_v16i8:
; SSE2: # BB#0:
; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,0,0,0,4,5,6,7]
; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[0,0,1,1]
; SSE2-NEXT: psllw $5, %xmm2
; SSE2-NEXT: pxor %xmm1, %xmm1
; SSE2-NEXT: pxor %xmm3, %xmm3
; SSE2-NEXT: pcmpgtb %xmm2, %xmm3
; SSE2-NEXT: movdqa %xmm3, %xmm4
; SSE2-NEXT: pandn %xmm0, %xmm4
; SSE2-NEXT: psrlw $4, %xmm0
; SSE2-NEXT: pand {{.*}}(%rip), %xmm0
; SSE2-NEXT: pand %xmm3, %xmm0
; SSE2-NEXT: por %xmm4, %xmm0
; SSE2-NEXT: paddb %xmm2, %xmm2
; SSE2-NEXT: pxor %xmm3, %xmm3
; SSE2-NEXT: pcmpgtb %xmm2, %xmm3
; SSE2-NEXT: movdqa %xmm3, %xmm4
; SSE2-NEXT: pandn %xmm0, %xmm4
; SSE2-NEXT: psrlw $2, %xmm0
; SSE2-NEXT: pand {{.*}}(%rip), %xmm0
; SSE2-NEXT: pand %xmm3, %xmm0
; SSE2-NEXT: por %xmm4, %xmm0
; SSE2-NEXT: paddb %xmm2, %xmm2
; SSE2-NEXT: pcmpgtb %xmm2, %xmm1
; SSE2-NEXT: movdqa %xmm1, %xmm2
; SSE2-NEXT: pandn %xmm0, %xmm2
; SSE2-NEXT: psrlw $1, %xmm0
; SSE2-NEXT: pand {{.*}}(%rip), %xmm0
; SSE2-NEXT: pand %xmm1, %xmm0
; SSE2-NEXT: por %xmm2, %xmm0
; SSE2-NEXT: retq
;
; SSE41-LABEL: splatvar_shift_v16i8:
; SSE41: # BB#0:
; SSE41-NEXT: movdqa %xmm0, %xmm2
; SSE41-NEXT: pxor %xmm0, %xmm0
; SSE41-NEXT: pshufb %xmm0, %xmm1
; SSE41-NEXT: psllw $5, %xmm1
; SSE41-NEXT: movdqa %xmm1, %xmm3
; SSE41-NEXT: paddb %xmm3, %xmm3
; SSE41-NEXT: movdqa %xmm2, %xmm4
; SSE41-NEXT: psrlw $4, %xmm4
; SSE41-NEXT: pand {{.*}}(%rip), %xmm4
; SSE41-NEXT: movdqa %xmm1, %xmm0
; SSE41-NEXT: pblendvb %xmm4, %xmm2
; SSE41-NEXT: movdqa %xmm2, %xmm1
; SSE41-NEXT: psrlw $2, %xmm1
; SSE41-NEXT: pand {{.*}}(%rip), %xmm1
; SSE41-NEXT: movdqa %xmm3, %xmm0
; SSE41-NEXT: pblendvb %xmm1, %xmm2
; SSE41-NEXT: movdqa %xmm2, %xmm1
; SSE41-NEXT: psrlw $1, %xmm1
; SSE41-NEXT: pand {{.*}}(%rip), %xmm1
; SSE41-NEXT: paddb %xmm3, %xmm3
; SSE41-NEXT: movdqa %xmm3, %xmm0
; SSE41-NEXT: pblendvb %xmm1, %xmm2
; SSE41-NEXT: movdqa %xmm2, %xmm0
; SSE41-NEXT: retq
;
; AVX1-LABEL: splatvar_shift_v16i8:
; AVX1: # BB#0:
; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2
; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vpsllw $5, %xmm1, %xmm1
; AVX1-NEXT: vpaddb %xmm1, %xmm1, %xmm2
; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm3
; AVX1-NEXT: vpand {{.*}}(%rip), %xmm3, %xmm3
; AVX1-NEXT: vpblendvb %xmm1, %xmm3, %xmm0, %xmm0
; AVX1-NEXT: vpsrlw $2, %xmm0, %xmm1
; AVX1-NEXT: vpand {{.*}}(%rip), %xmm1, %xmm1
; AVX1-NEXT: vpblendvb %xmm2, %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpsrlw $1, %xmm0, %xmm1
; AVX1-NEXT: vpand {{.*}}(%rip), %xmm1, %xmm1
; AVX1-NEXT: vpaddb %xmm2, %xmm2, %xmm2
; AVX1-NEXT: vpblendvb %xmm2, %xmm1, %xmm0, %xmm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: splatvar_shift_v16i8:
; AVX2: # BB#0:
; AVX2-NEXT: vpbroadcastb %xmm1, %xmm1
; AVX2-NEXT: vpsllw $5, %xmm1, %xmm1
; AVX2-NEXT: vpsrlw $4, %xmm0, %xmm2
; AVX2-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2
; AVX2-NEXT: vpblendvb %xmm1, %xmm2, %xmm0, %xmm0
; AVX2-NEXT: vpsrlw $2, %xmm0, %xmm2
; AVX2-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2
; AVX2-NEXT: vpaddb %xmm1, %xmm1, %xmm1
; AVX2-NEXT: vpblendvb %xmm1, %xmm2, %xmm0, %xmm0
; AVX2-NEXT: vpsrlw $1, %xmm0, %xmm2
; AVX2-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2
; AVX2-NEXT: vpaddb %xmm1, %xmm1, %xmm1
; AVX2-NEXT: vpblendvb %xmm1, %xmm2, %xmm0, %xmm0
; AVX2-NEXT: retq
;
; XOPAVX1-LABEL: splatvar_shift_v16i8:
; XOPAVX1: # BB#0:
; XOPAVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2
; XOPAVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1
; XOPAVX1-NEXT: vpsubb %xmm1, %xmm2, %xmm1
; XOPAVX1-NEXT: vpshlb %xmm1, %xmm0, %xmm0
; XOPAVX1-NEXT: retq
;
; XOPAVX2-LABEL: splatvar_shift_v16i8:
; XOPAVX2: # BB#0:
; XOPAVX2-NEXT: vpbroadcastb %xmm1, %xmm1
; XOPAVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2
; XOPAVX2-NEXT: vpsubb %xmm1, %xmm2, %xmm1
; XOPAVX2-NEXT: vpshlb %xmm1, %xmm0, %xmm0
; XOPAVX2-NEXT: retq
;
; AVX512-LABEL: splatvar_shift_v16i8:
; AVX512: ## BB#0:
; AVX512-NEXT: vpbroadcastb %xmm1, %xmm1
; AVX512-NEXT: vpsllw $5, %xmm1, %xmm1
; AVX512-NEXT: vpsrlw $4, %xmm0, %xmm2
; AVX512-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2
; AVX512-NEXT: vpblendvb %xmm1, %xmm2, %xmm0, %xmm0
; AVX512-NEXT: vpsrlw $2, %xmm0, %xmm2
; AVX512-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2
; AVX512-NEXT: vpaddb %xmm1, %xmm1, %xmm1
; AVX512-NEXT: vpblendvb %xmm1, %xmm2, %xmm0, %xmm0
; AVX512-NEXT: vpsrlw $1, %xmm0, %xmm2
; AVX512-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2
; AVX512-NEXT: vpaddb %xmm1, %xmm1, %xmm1
; AVX512-NEXT: vpblendvb %xmm1, %xmm2, %xmm0, %xmm0
; AVX512-NEXT: retq
;
; X32-SSE-LABEL: splatvar_shift_v16i8:
; X32-SSE: # BB#0:
; X32-SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; X32-SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,0,0,0,4,5,6,7]
; X32-SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm1[0,0,1,1]
; X32-SSE-NEXT: psllw $5, %xmm2
; X32-SSE-NEXT: pxor %xmm1, %xmm1
; X32-SSE-NEXT: pxor %xmm3, %xmm3
; X32-SSE-NEXT: pcmpgtb %xmm2, %xmm3
; X32-SSE-NEXT: movdqa %xmm3, %xmm4
; X32-SSE-NEXT: pandn %xmm0, %xmm4
; X32-SSE-NEXT: psrlw $4, %xmm0
; X32-SSE-NEXT: pand {{\.LCPI.*}}, %xmm0
; X32-SSE-NEXT: pand %xmm3, %xmm0
; X32-SSE-NEXT: por %xmm4, %xmm0
; X32-SSE-NEXT: paddb %xmm2, %xmm2
; X32-SSE-NEXT: pxor %xmm3, %xmm3
; X32-SSE-NEXT: pcmpgtb %xmm2, %xmm3
; X32-SSE-NEXT: movdqa %xmm3, %xmm4
; X32-SSE-NEXT: pandn %xmm0, %xmm4
; X32-SSE-NEXT: psrlw $2, %xmm0
; X32-SSE-NEXT: pand {{\.LCPI.*}}, %xmm0
; X32-SSE-NEXT: pand %xmm3, %xmm0
; X32-SSE-NEXT: por %xmm4, %xmm0
; X32-SSE-NEXT: paddb %xmm2, %xmm2
; X32-SSE-NEXT: pcmpgtb %xmm2, %xmm1
; X32-SSE-NEXT: movdqa %xmm1, %xmm2
; X32-SSE-NEXT: pandn %xmm0, %xmm2
; X32-SSE-NEXT: psrlw $1, %xmm0
; X32-SSE-NEXT: pand {{\.LCPI.*}}, %xmm0
; X32-SSE-NEXT: pand %xmm1, %xmm0
; X32-SSE-NEXT: por %xmm2, %xmm0
; X32-SSE-NEXT: retl
  %splat = shufflevector <16 x i8> %b, <16 x i8> undef, <16 x i32> zeroinitializer
  %shift = lshr <16 x i8> %a, %splat
  ret <16 x i8> %shift
}
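
; There is no PSRLB at all, scalar count or otherwise, so a splatted byte
; count cannot use the trick above. Instead the count byte is broadcast
; (PUNPCKLBW/PSHUFLW/PSHUFD, PSHUFB, or VPBROADCASTB) and the generic
; variable v16i8 expansion is reused; XOP again negates and uses VPSHLB.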

;
; Constant Shifts
;

define <2 x i64> @constant_shift_v2i64(<2 x i64> %a) nounwind {
; SSE2-LABEL: constant_shift_v2i64:
; SSE2: # BB#0:
; SSE2-NEXT: movdqa %xmm0, %xmm1
; SSE2-NEXT: psrlq $7, %xmm1
; SSE2-NEXT: psrlq $1, %xmm0
; SSE2-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1]
; SSE2-NEXT: movapd %xmm1, %xmm0
; SSE2-NEXT: retq
;
; SSE41-LABEL: constant_shift_v2i64:
; SSE41: # BB#0:
; SSE41-NEXT: movdqa %xmm0, %xmm1
; SSE41-NEXT: psrlq $7, %xmm1
; SSE41-NEXT: psrlq $1, %xmm0
; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5,6,7]
; SSE41-NEXT: retq
;
; AVX1-LABEL: constant_shift_v2i64:
; AVX1: # BB#0:
; AVX1-NEXT: vpsrlq $7, %xmm0, %xmm1
; AVX1-NEXT: vpsrlq $1, %xmm0, %xmm0
; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5,6,7]
; AVX1-NEXT: retq
;
; AVX2-LABEL: constant_shift_v2i64:
; AVX2: # BB#0:
; AVX2-NEXT: vpsrlvq {{.*}}(%rip), %xmm0, %xmm0
; AVX2-NEXT: retq
;
; XOPAVX1-LABEL: constant_shift_v2i64:
; XOPAVX1: # BB#0:
; XOPAVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
; XOPAVX1-NEXT: vpsubq {{.*}}(%rip), %xmm1, %xmm1
; XOPAVX1-NEXT: vpshlq %xmm1, %xmm0, %xmm0
; XOPAVX1-NEXT: retq
;
; XOPAVX2-LABEL: constant_shift_v2i64:
; XOPAVX2: # BB#0:
; XOPAVX2-NEXT: vpsrlvq {{.*}}(%rip), %xmm0, %xmm0
; XOPAVX2-NEXT: retq
;
; AVX512-LABEL: constant_shift_v2i64:
; AVX512: ## BB#0:
; AVX512-NEXT: vpsrlvq {{.*}}(%rip), %xmm0, %xmm0
; AVX512-NEXT: retq
;
; X32-SSE-LABEL: constant_shift_v2i64:
; X32-SSE: # BB#0:
; X32-SSE-NEXT: movdqa %xmm0, %xmm1
; X32-SSE-NEXT: psrlq $7, %xmm1
; X32-SSE-NEXT: psrlq $1, %xmm0
; X32-SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1]
; X32-SSE-NEXT: movapd %xmm1, %xmm0
; X32-SSE-NEXT: retl
  %shift = lshr <2 x i64> %a, <i64 1, i64 7>
  ret <2 x i64> %shift
}
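
; With constant counts the per-lane shifts become immediate-form PSRLQ, so
; <i64 1, i64 7> costs two shifts plus a blend pre-AVX2, while AVX2 and
; AVX512 load the counts from the constant pool and issue one VPSRLVQ.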

define <4 x i32> @constant_shift_v4i32(<4 x i32> %a) nounwind {
; SSE2-LABEL: constant_shift_v4i32:
; SSE2: # BB#0:
; SSE2-NEXT: movdqa %xmm0, %xmm1
; SSE2-NEXT: psrld $7, %xmm1
; SSE2-NEXT: movdqa %xmm0, %xmm2
; SSE2-NEXT: psrld $5, %xmm2
; SSE2-NEXT: movsd {{.*#+}} xmm1 = xmm2[0],xmm1[1]
; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,3,2,3]
; SSE2-NEXT: movdqa %xmm0, %xmm2
; SSE2-NEXT: psrld $6, %xmm2
; SSE2-NEXT: psrld $4, %xmm0
; SSE2-NEXT: movsd {{.*#+}} xmm2 = xmm0[0],xmm2[1]
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3]
; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSE2-NEXT: retq
;
; SSE41-LABEL: constant_shift_v4i32:
; SSE41: # BB#0:
; SSE41-NEXT: movdqa %xmm0, %xmm1
; SSE41-NEXT: psrld $7, %xmm1
; SSE41-NEXT: movdqa %xmm0, %xmm2
; SSE41-NEXT: psrld $5, %xmm2
; SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm1[4,5,6,7]
; SSE41-NEXT: movdqa %xmm0, %xmm1
; SSE41-NEXT: psrld $6, %xmm1
; SSE41-NEXT: psrld $4, %xmm0
; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5,6,7]
; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7]
; SSE41-NEXT: retq
;
; AVX1-LABEL: constant_shift_v4i32:
; AVX1: # BB#0:
; AVX1-NEXT: vpsrld $7, %xmm0, %xmm1
; AVX1-NEXT: vpsrld $5, %xmm0, %xmm2
; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1,2,3],xmm1[4,5,6,7]
; AVX1-NEXT: vpsrld $6, %xmm0, %xmm2
; AVX1-NEXT: vpsrld $4, %xmm0, %xmm0
; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm2[4,5,6,7]
; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7]
; AVX1-NEXT: retq
;
; AVX2-LABEL: constant_shift_v4i32:
; AVX2: # BB#0:
; AVX2-NEXT: vpsrlvd {{.*}}(%rip), %xmm0, %xmm0
; AVX2-NEXT: retq
;
; XOPAVX1-LABEL: constant_shift_v4i32:
; XOPAVX1: # BB#0:
; XOPAVX1-NEXT: vpshld {{.*}}(%rip), %xmm0, %xmm0
; XOPAVX1-NEXT: retq
;
; XOPAVX2-LABEL: constant_shift_v4i32:
; XOPAVX2: # BB#0:
; XOPAVX2-NEXT: vpsrlvd {{.*}}(%rip), %xmm0, %xmm0
; XOPAVX2-NEXT: retq
;
; AVX512-LABEL: constant_shift_v4i32:
; AVX512: ## BB#0:
; AVX512-NEXT: vpsrlvd {{.*}}(%rip), %xmm0, %xmm0
; AVX512-NEXT: retq
;
; X32-SSE-LABEL: constant_shift_v4i32:
; X32-SSE: # BB#0:
; X32-SSE-NEXT: movdqa %xmm0, %xmm1
; X32-SSE-NEXT: psrld $7, %xmm1
; X32-SSE-NEXT: movdqa %xmm0, %xmm2
; X32-SSE-NEXT: psrld $5, %xmm2
; X32-SSE-NEXT: movsd {{.*#+}} xmm1 = xmm2[0],xmm1[1]
; X32-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,3,2,3]
; X32-SSE-NEXT: movdqa %xmm0, %xmm2
; X32-SSE-NEXT: psrld $6, %xmm2
; X32-SSE-NEXT: psrld $4, %xmm0
; X32-SSE-NEXT: movsd {{.*#+}} xmm2 = xmm0[0],xmm2[1]
; X32-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3]
; X32-SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; X32-SSE-NEXT: retl
  %shift = lshr <4 x i32> %a, <i32 4, i32 5, i32 6, i32 7>
  ret <4 x i32> %shift
}
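
; Likewise <i32 4, 5, 6, 7> becomes four immediate PSRLDs whose results
; are merged lane by lane; XOP can fold the (already negated) constant
; vector straight into VPSHLD, and AVX2/AVX512 use VPSRLVD with a pool
; constant.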

define <8 x i16> @constant_shift_v8i16(<8 x i16> %a) nounwind {
; SSE2-LABEL: constant_shift_v8i16:
; SSE2: # BB#0:
; SSE2-NEXT: movdqa %xmm0, %xmm1
; SSE2-NEXT: psrlw $4, %xmm1
; SSE2-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1]
; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[0,2,2,3]
; SSE2-NEXT: psrlw $2, %xmm1
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,3,2,3]
; SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1]
; SSE2-NEXT: movdqa {{.*#+}} xmm0 = [65535,0,65535,0,65535,0,65535,0]
; SSE2-NEXT: movdqa %xmm2, %xmm1
; SSE2-NEXT: pand %xmm0, %xmm1
; SSE2-NEXT: psrlw $1, %xmm2
; SSE2-NEXT: pandn %xmm2, %xmm0
; SSE2-NEXT: por %xmm1, %xmm0
; SSE2-NEXT: retq
;
; SSE41-LABEL: constant_shift_v8i16:
; SSE41: # BB#0:
; SSE41-NEXT: movdqa %xmm0, %xmm1
; SSE41-NEXT: psrlw $4, %xmm1
; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm0[0,1,2,3],xmm1[4,5,6,7]
; SSE41-NEXT: movdqa %xmm1, %xmm2
; SSE41-NEXT: psrlw $2, %xmm2
; SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7]
; SSE41-NEXT: movdqa %xmm2, %xmm0
; SSE41-NEXT: psrlw $1, %xmm0
; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm2[0],xmm0[1],xmm2[2],xmm0[3],xmm2[4],xmm0[5],xmm2[6],xmm0[7]
; SSE41-NEXT: retq
;
; AVX1-LABEL: constant_shift_v8i16:
; AVX1: # BB#0:
; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm1
; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5,6,7]
; AVX1-NEXT: vpsrlw $2, %xmm0, %xmm1
; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7]
; AVX1-NEXT: vpsrlw $1, %xmm0, %xmm1
; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3],xmm0[4],xmm1[5],xmm0[6],xmm1[7]
; AVX1-NEXT: retq
;
; AVX2-LABEL: constant_shift_v8i16:
; AVX2: # BB#0:
; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; AVX2-NEXT: vpsrlvd {{.*}}(%rip), %ymm0, %ymm0
; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero,ymm0[16,17,20,21,24,25,28,29],zero,zero,zero,zero,zero,zero,zero,zero
; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
; AVX2-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; XOP-LABEL: constant_shift_v8i16:
; XOP: # BB#0:
; XOP-NEXT: vpxor %xmm1, %xmm1, %xmm1
; XOP-NEXT: vpsubw {{.*}}(%rip), %xmm1, %xmm1
; XOP-NEXT: vpshlw %xmm1, %xmm0, %xmm0
; XOP-NEXT: retq
;
; AVX512-LABEL: constant_shift_v8i16:
; AVX512: ## BB#0:
; AVX512-NEXT: ## kill: %XMM0<def> %XMM0<kill> %ZMM0<def>
; AVX512-NEXT: vmovdqa {{.*#+}} xmm1 = [0,1,2,3,4,5,6,7]
; AVX512-NEXT: vpsrlvw %zmm1, %zmm0, %zmm0
; AVX512-NEXT: ## kill: %XMM0<def> %XMM0<kill> %ZMM0<kill>
; AVX512-NEXT: retq
;
; X32-SSE-LABEL: constant_shift_v8i16:
; X32-SSE: # BB#0:
; X32-SSE-NEXT: movdqa %xmm0, %xmm1
; X32-SSE-NEXT: psrlw $4, %xmm1
; X32-SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1]
; X32-SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm1[0,2,2,3]
; X32-SSE-NEXT: psrlw $2, %xmm1
; X32-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,3,2,3]
; X32-SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1]
; X32-SSE-NEXT: movdqa {{.*#+}} xmm0 = [65535,0,65535,0,65535,0,65535,0]
; X32-SSE-NEXT: movdqa %xmm2, %xmm1
; X32-SSE-NEXT: pand %xmm0, %xmm1
; X32-SSE-NEXT: psrlw $1, %xmm2
; X32-SSE-NEXT: pandn %xmm2, %xmm0
; X32-SSE-NEXT: por %xmm1, %xmm0
; X32-SSE-NEXT: retl
  %shift = lshr <8 x i16> %a, <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7>
  ret <8 x i16> %shift
}
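
; For constant i16 counts the bit-serial loop degenerates nicely: the
; lanes that need each partial shift are known, so PSRLW $4/$2/$1 plus
; fixed PBLENDW masks (or PAND/PANDN selects on SSE2) replace the variable
; blends. Amount bit 0 of <0,1,...,7> alternates between lanes, hence the
; final [65535,0,...] mask in the SSE2 version.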

define <16 x i8> @constant_shift_v16i8(<16 x i8> %a) nounwind {
; SSE2-LABEL: constant_shift_v16i8:
; SSE2: # BB#0:
; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [0,1,2,3,4,5,6,7,7,6,5,4,3,2,1,0]
; SSE2-NEXT: psllw $5, %xmm2
; SSE2-NEXT: pxor %xmm1, %xmm1
; SSE2-NEXT: pxor %xmm3, %xmm3
; SSE2-NEXT: pcmpgtb %xmm2, %xmm3
; SSE2-NEXT: movdqa %xmm3, %xmm4
; SSE2-NEXT: pandn %xmm0, %xmm4
; SSE2-NEXT: psrlw $4, %xmm0
; SSE2-NEXT: pand {{.*}}(%rip), %xmm0
; SSE2-NEXT: pand %xmm3, %xmm0
; SSE2-NEXT: por %xmm4, %xmm0
; SSE2-NEXT: paddb %xmm2, %xmm2
; SSE2-NEXT: pxor %xmm3, %xmm3
; SSE2-NEXT: pcmpgtb %xmm2, %xmm3
; SSE2-NEXT: movdqa %xmm3, %xmm4
; SSE2-NEXT: pandn %xmm0, %xmm4
; SSE2-NEXT: psrlw $2, %xmm0
; SSE2-NEXT: pand {{.*}}(%rip), %xmm0
; SSE2-NEXT: pand %xmm3, %xmm0
; SSE2-NEXT: por %xmm4, %xmm0
; SSE2-NEXT: paddb %xmm2, %xmm2
; SSE2-NEXT: pcmpgtb %xmm2, %xmm1
; SSE2-NEXT: movdqa %xmm1, %xmm2
; SSE2-NEXT: pandn %xmm0, %xmm2
; SSE2-NEXT: psrlw $1, %xmm0
; SSE2-NEXT: pand {{.*}}(%rip), %xmm0
; SSE2-NEXT: pand %xmm1, %xmm0
; SSE2-NEXT: por %xmm2, %xmm0
; SSE2-NEXT: retq
;
; SSE41-LABEL: constant_shift_v16i8:
; SSE41: # BB#0:
; SSE41-NEXT: movdqa %xmm0, %xmm1
; SSE41-NEXT: movdqa {{.*#+}} xmm0 = [0,1,2,3,4,5,6,7,7,6,5,4,3,2,1,0]
; SSE41-NEXT: psllw $5, %xmm0
; SSE41-NEXT: movdqa %xmm1, %xmm2
; SSE41-NEXT: psrlw $4, %xmm2
; SSE41-NEXT: pand {{.*}}(%rip), %xmm2
; SSE41-NEXT: pblendvb %xmm2, %xmm1
; SSE41-NEXT: movdqa %xmm1, %xmm2
; SSE41-NEXT: psrlw $2, %xmm2
; SSE41-NEXT: pand {{.*}}(%rip), %xmm2
; SSE41-NEXT: paddb %xmm0, %xmm0
; SSE41-NEXT: pblendvb %xmm2, %xmm1
; SSE41-NEXT: movdqa %xmm1, %xmm2
; SSE41-NEXT: psrlw $1, %xmm2
; SSE41-NEXT: pand {{.*}}(%rip), %xmm2
; SSE41-NEXT: paddb %xmm0, %xmm0
; SSE41-NEXT: pblendvb %xmm2, %xmm1
; SSE41-NEXT: movdqa %xmm1, %xmm0
; SSE41-NEXT: retq
;
; AVX-LABEL: constant_shift_v16i8:
; AVX: # BB#0:
; AVX-NEXT: vmovdqa {{.*#+}} xmm1 = [0,1,2,3,4,5,6,7,7,6,5,4,3,2,1,0]
; AVX-NEXT: vpsllw $5, %xmm1, %xmm1
; AVX-NEXT: vpsrlw $4, %xmm0, %xmm2
; AVX-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2
; AVX-NEXT: vpblendvb %xmm1, %xmm2, %xmm0, %xmm0
; AVX-NEXT: vpsrlw $2, %xmm0, %xmm2
; AVX-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2
; AVX-NEXT: vpaddb %xmm1, %xmm1, %xmm1
; AVX-NEXT: vpblendvb %xmm1, %xmm2, %xmm0, %xmm0
; AVX-NEXT: vpsrlw $1, %xmm0, %xmm2
; AVX-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2
; AVX-NEXT: vpaddb %xmm1, %xmm1, %xmm1
; AVX-NEXT: vpblendvb %xmm1, %xmm2, %xmm0, %xmm0
; AVX-NEXT: retq
;
; XOP-LABEL: constant_shift_v16i8:
; XOP: # BB#0:
; XOP-NEXT: vpxor %xmm1, %xmm1, %xmm1
; XOP-NEXT: vpsubb {{.*}}(%rip), %xmm1, %xmm1
; XOP-NEXT: vpshlb %xmm1, %xmm0, %xmm0
; XOP-NEXT: retq
;
; AVX512-LABEL: constant_shift_v16i8:
; AVX512: ## BB#0:
; AVX512-NEXT: vmovdqa {{.*#+}} xmm1 = [0,1,2,3,4,5,6,7,7,6,5,4,3,2,1,0]
; AVX512-NEXT: vpsllw $5, %xmm1, %xmm1
; AVX512-NEXT: vpsrlw $4, %xmm0, %xmm2
; AVX512-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2
; AVX512-NEXT: vpblendvb %xmm1, %xmm2, %xmm0, %xmm0
; AVX512-NEXT: vpsrlw $2, %xmm0, %xmm2
; AVX512-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2
; AVX512-NEXT: vpaddb %xmm1, %xmm1, %xmm1
; AVX512-NEXT: vpblendvb %xmm1, %xmm2, %xmm0, %xmm0
; AVX512-NEXT: vpsrlw $1, %xmm0, %xmm2
; AVX512-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2
; AVX512-NEXT: vpaddb %xmm1, %xmm1, %xmm1
; AVX512-NEXT: vpblendvb %xmm1, %xmm2, %xmm0, %xmm0
; AVX512-NEXT: retq
;
; X32-SSE-LABEL: constant_shift_v16i8:
; X32-SSE: # BB#0:
; X32-SSE-NEXT: movdqa {{.*#+}} xmm2 = [0,1,2,3,4,5,6,7,7,6,5,4,3,2,1,0]
; X32-SSE-NEXT: psllw $5, %xmm2
; X32-SSE-NEXT: pxor %xmm1, %xmm1
; X32-SSE-NEXT: pxor %xmm3, %xmm3
; X32-SSE-NEXT: pcmpgtb %xmm2, %xmm3
; X32-SSE-NEXT: movdqa %xmm3, %xmm4
; X32-SSE-NEXT: pandn %xmm0, %xmm4
; X32-SSE-NEXT: psrlw $4, %xmm0
; X32-SSE-NEXT: pand {{\.LCPI.*}}, %xmm0
; X32-SSE-NEXT: pand %xmm3, %xmm0
; X32-SSE-NEXT: por %xmm4, %xmm0
; X32-SSE-NEXT: paddb %xmm2, %xmm2
; X32-SSE-NEXT: pxor %xmm3, %xmm3
; X32-SSE-NEXT: pcmpgtb %xmm2, %xmm3
; X32-SSE-NEXT: movdqa %xmm3, %xmm4
; X32-SSE-NEXT: pandn %xmm0, %xmm4
; X32-SSE-NEXT: psrlw $2, %xmm0
; X32-SSE-NEXT: pand {{\.LCPI.*}}, %xmm0
; X32-SSE-NEXT: pand %xmm3, %xmm0
; X32-SSE-NEXT: por %xmm4, %xmm0
; X32-SSE-NEXT: paddb %xmm2, %xmm2
; X32-SSE-NEXT: pcmpgtb %xmm2, %xmm1
; X32-SSE-NEXT: movdqa %xmm1, %xmm2
; X32-SSE-NEXT: pandn %xmm0, %xmm2
; X32-SSE-NEXT: psrlw $1, %xmm0
; X32-SSE-NEXT: pand {{\.LCPI.*}}, %xmm0
; X32-SSE-NEXT: pand %xmm1, %xmm0
; X32-SSE-NEXT: por %xmm2, %xmm0
; X32-SSE-NEXT: retl
  %shift = lshr <16 x i8> %a, <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0>
  ret <16 x i8> %shift
}
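
; Constant byte counts still go through the generic blend loop; the only
; difference from the variable case is that the PSLLW $5 control vector is
; materialised from the constant pool. XOP simply negates the pool
; constant with VPSUBB and feeds it to VPSHLB.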

;
; Uniform Constant Shifts
;

define <2 x i64> @splatconstant_shift_v2i64(<2 x i64> %a) nounwind {
; SSE-LABEL: splatconstant_shift_v2i64:
; SSE: # BB#0:
; SSE-NEXT: psrlq $7, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: splatconstant_shift_v2i64:
; AVX: # BB#0:
; AVX-NEXT: vpsrlq $7, %xmm0, %xmm0
; AVX-NEXT: retq
;
; XOP-LABEL: splatconstant_shift_v2i64:
; XOP: # BB#0:
; XOP-NEXT: vpsrlq $7, %xmm0, %xmm0
; XOP-NEXT: retq
;
; AVX512-LABEL: splatconstant_shift_v2i64:
; AVX512: ## BB#0:
; AVX512-NEXT: vpsrlq $7, %xmm0, %xmm0
; AVX512-NEXT: retq
;
; X32-SSE-LABEL: splatconstant_shift_v2i64:
; X32-SSE: # BB#0:
; X32-SSE-NEXT: psrlq $7, %xmm0
; X32-SSE-NEXT: retl
  %shift = lshr <2 x i64> %a, <i64 7, i64 7>
  ret <2 x i64> %shift
}

define <4 x i32> @splatconstant_shift_v4i32(<4 x i32> %a) nounwind {
; SSE-LABEL: splatconstant_shift_v4i32:
; SSE: # BB#0:
; SSE-NEXT: psrld $5, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: splatconstant_shift_v4i32:
; AVX: # BB#0:
; AVX-NEXT: vpsrld $5, %xmm0, %xmm0
; AVX-NEXT: retq
;
; XOP-LABEL: splatconstant_shift_v4i32:
; XOP: # BB#0:
; XOP-NEXT: vpsrld $5, %xmm0, %xmm0
; XOP-NEXT: retq
;
; AVX512-LABEL: splatconstant_shift_v4i32:
; AVX512: ## BB#0:
; AVX512-NEXT: vpsrld $5, %xmm0, %xmm0
; AVX512-NEXT: retq
;
; X32-SSE-LABEL: splatconstant_shift_v4i32:
; X32-SSE: # BB#0:
; X32-SSE-NEXT: psrld $5, %xmm0
; X32-SSE-NEXT: retl
  %shift = lshr <4 x i32> %a, <i32 5, i32 5, i32 5, i32 5>
  ret <4 x i32> %shift
}

define <8 x i16> @splatconstant_shift_v8i16(<8 x i16> %a) nounwind {
; SSE-LABEL: splatconstant_shift_v8i16:
; SSE: # BB#0:
; SSE-NEXT: psrlw $3, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: splatconstant_shift_v8i16:
; AVX: # BB#0:
; AVX-NEXT: vpsrlw $3, %xmm0, %xmm0
; AVX-NEXT: retq
;
; XOP-LABEL: splatconstant_shift_v8i16:
; XOP: # BB#0:
; XOP-NEXT: vpsrlw $3, %xmm0, %xmm0
; XOP-NEXT: retq
;
; AVX512-LABEL: splatconstant_shift_v8i16:
; AVX512: ## BB#0:
; AVX512-NEXT: vpsrlw $3, %xmm0, %xmm0
; AVX512-NEXT: retq
;
; X32-SSE-LABEL: splatconstant_shift_v8i16:
; X32-SSE: # BB#0:
; X32-SSE-NEXT: psrlw $3, %xmm0
; X32-SSE-NEXT: retl
  %shift = lshr <8 x i16> %a, <i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3>
  ret <8 x i16> %shift
}
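
; Uniform constant shifts are the cheap case: a single immediate-form
; PSRLQ/PSRLD/PSRLW on every target. Only the v16i8 test below needs more,
; since there is no byte shift: it shifts words by 3 and masks each byte
; with 0x1F to drop the bits pulled in from its neighbour (XOP instead
; uses VPSHLB with a splatted negated count from the constant pool).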

define <16 x i8> @splatconstant_shift_v16i8(<16 x i8> %a) nounwind {
; SSE-LABEL: splatconstant_shift_v16i8:
; SSE: # BB#0:
; SSE-NEXT: psrlw $3, %xmm0
; SSE-NEXT: pand {{.*}}(%rip), %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: splatconstant_shift_v16i8:
; AVX: # BB#0:
; AVX-NEXT: vpsrlw $3, %xmm0, %xmm0
; AVX-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
; AVX-NEXT: retq
;
; XOP-LABEL: splatconstant_shift_v16i8:
; XOP: # BB#0:
; XOP-NEXT: vpxor %xmm1, %xmm1, %xmm1
; XOP-NEXT: vpsubb {{.*}}(%rip), %xmm1, %xmm1
; XOP-NEXT: vpshlb %xmm1, %xmm0, %xmm0
; XOP-NEXT: retq
;
; AVX512-LABEL: splatconstant_shift_v16i8:
; AVX512: ## BB#0:
; AVX512-NEXT: vpsrlw $3, %xmm0, %xmm0
; AVX512-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
; AVX512-NEXT: retq
;
; X32-SSE-LABEL: splatconstant_shift_v16i8:
; X32-SSE: # BB#0:
; X32-SSE-NEXT: psrlw $3, %xmm0
; X32-SSE-NEXT: pand {{\.LCPI.*}}, %xmm0
; X32-SSE-NEXT: retl
  %shift = lshr <16 x i8> %a, <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3>
  ret <16 x i8> %shift
}