1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py 2; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefix=AVX1 3; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=AVX2 4; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+xop,+avx | FileCheck %s --check-prefix=XOPAVX1 5; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+xop,+avx2 | FileCheck %s --check-prefix=XOPAVX2 6; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512dq | FileCheck %s --check-prefixes=AVX512,AVX512DQ 7; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw | FileCheck %s --check-prefixes=AVX512,AVX512BW 8; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512dq,+avx512vl | FileCheck %s --check-prefixes=AVX512VL,AVX512DQVL 9; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw,+avx512vl | FileCheck %s --check-prefixes=AVX512VL,AVX512BWVL 10; 11; 32-bit runs to make sure we do reasonable things for i64 shifts. 
12; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+avx | FileCheck %s --check-prefix=X86-AVX1 13; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=X86-AVX2 14 15; 16; Variable Shifts 17; 18 19define <4 x i64> @var_shift_v4i64(<4 x i64> %a, <4 x i64> %b) nounwind { 20; AVX1-LABEL: var_shift_v4i64: 21; AVX1: # %bb.0: 22; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 23; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [9223372036854775808,9223372036854775808] 24; AVX1-NEXT: vpsrlq %xmm2, %xmm3, %xmm4 25; AVX1-NEXT: vpshufd {{.*#+}} xmm5 = xmm2[2,3,2,3] 26; AVX1-NEXT: vpsrlq %xmm5, %xmm3, %xmm6 27; AVX1-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1,2,3],xmm6[4,5,6,7] 28; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm6 29; AVX1-NEXT: vpsrlq %xmm2, %xmm6, %xmm2 30; AVX1-NEXT: vpsrlq %xmm5, %xmm6, %xmm5 31; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm5[4,5,6,7] 32; AVX1-NEXT: vpxor %xmm4, %xmm2, %xmm2 33; AVX1-NEXT: vpsubq %xmm4, %xmm2, %xmm2 34; AVX1-NEXT: vpsrlq %xmm1, %xmm3, %xmm4 35; AVX1-NEXT: vpshufd {{.*#+}} xmm5 = xmm1[2,3,2,3] 36; AVX1-NEXT: vpsrlq %xmm5, %xmm3, %xmm3 37; AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0,1,2,3],xmm3[4,5,6,7] 38; AVX1-NEXT: vpsrlq %xmm1, %xmm0, %xmm1 39; AVX1-NEXT: vpsrlq %xmm5, %xmm0, %xmm0 40; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4,5,6,7] 41; AVX1-NEXT: vpxor %xmm3, %xmm0, %xmm0 42; AVX1-NEXT: vpsubq %xmm3, %xmm0, %xmm0 43; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 44; AVX1-NEXT: retq 45; 46; AVX2-LABEL: var_shift_v4i64: 47; AVX2: # %bb.0: 48; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm2 = [9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808] 49; AVX2-NEXT: vpsrlvq %ymm1, %ymm2, %ymm2 50; AVX2-NEXT: vpsrlvq %ymm1, %ymm0, %ymm0 51; AVX2-NEXT: vpxor %ymm2, %ymm0, %ymm0 52; AVX2-NEXT: vpsubq %ymm2, %ymm0, %ymm0 53; AVX2-NEXT: retq 54; 55; XOPAVX1-LABEL: var_shift_v4i64: 56; XOPAVX1: # %bb.0: 57; XOPAVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 58; XOPAVX1-NEXT: vpxor %xmm3, 
%xmm3, %xmm3 59; XOPAVX1-NEXT: vpsubq %xmm2, %xmm3, %xmm2 60; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm4 61; XOPAVX1-NEXT: vpshaq %xmm2, %xmm4, %xmm2 62; XOPAVX1-NEXT: vpsubq %xmm1, %xmm3, %xmm1 63; XOPAVX1-NEXT: vpshaq %xmm1, %xmm0, %xmm0 64; XOPAVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 65; XOPAVX1-NEXT: retq 66; 67; XOPAVX2-LABEL: var_shift_v4i64: 68; XOPAVX2: # %bb.0: 69; XOPAVX2-NEXT: vpbroadcastq {{.*#+}} ymm2 = [9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808] 70; XOPAVX2-NEXT: vpsrlvq %ymm1, %ymm2, %ymm2 71; XOPAVX2-NEXT: vpsrlvq %ymm1, %ymm0, %ymm0 72; XOPAVX2-NEXT: vpxor %ymm2, %ymm0, %ymm0 73; XOPAVX2-NEXT: vpsubq %ymm2, %ymm0, %ymm0 74; XOPAVX2-NEXT: retq 75; 76; AVX512-LABEL: var_shift_v4i64: 77; AVX512: # %bb.0: 78; AVX512-NEXT: # kill: def $ymm1 killed $ymm1 def $zmm1 79; AVX512-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 80; AVX512-NEXT: vpsravq %zmm1, %zmm0, %zmm0 81; AVX512-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 82; AVX512-NEXT: retq 83; 84; AVX512VL-LABEL: var_shift_v4i64: 85; AVX512VL: # %bb.0: 86; AVX512VL-NEXT: vpsravq %ymm1, %ymm0, %ymm0 87; AVX512VL-NEXT: retq 88; 89; X86-AVX1-LABEL: var_shift_v4i64: 90; X86-AVX1: # %bb.0: 91; X86-AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 92; X86-AVX1-NEXT: vmovddup {{.*#+}} xmm3 = [9223372036854775808,9223372036854775808] 93; X86-AVX1-NEXT: # xmm3 = mem[0,0] 94; X86-AVX1-NEXT: vpsrlq %xmm2, %xmm3, %xmm4 95; X86-AVX1-NEXT: vpshufd {{.*#+}} xmm5 = xmm2[2,3,2,3] 96; X86-AVX1-NEXT: vpsrlq %xmm5, %xmm3, %xmm6 97; X86-AVX1-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1,2,3],xmm6[4,5,6,7] 98; X86-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm6 99; X86-AVX1-NEXT: vpsrlq %xmm2, %xmm6, %xmm2 100; X86-AVX1-NEXT: vpsrlq %xmm5, %xmm6, %xmm5 101; X86-AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm5[4,5,6,7] 102; X86-AVX1-NEXT: vpxor %xmm4, %xmm2, %xmm2 103; X86-AVX1-NEXT: vpsubq %xmm4, %xmm2, %xmm2 104; X86-AVX1-NEXT: vpsrlq %xmm1, %xmm3, %xmm4 105; X86-AVX1-NEXT: vpshufd 
{{.*#+}} xmm5 = xmm1[2,3,2,3] 106; X86-AVX1-NEXT: vpsrlq %xmm5, %xmm3, %xmm3 107; X86-AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0,1,2,3],xmm3[4,5,6,7] 108; X86-AVX1-NEXT: vpsrlq %xmm1, %xmm0, %xmm1 109; X86-AVX1-NEXT: vpsrlq %xmm5, %xmm0, %xmm0 110; X86-AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4,5,6,7] 111; X86-AVX1-NEXT: vpxor %xmm3, %xmm0, %xmm0 112; X86-AVX1-NEXT: vpsubq %xmm3, %xmm0, %xmm0 113; X86-AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 114; X86-AVX1-NEXT: retl 115; 116; X86-AVX2-LABEL: var_shift_v4i64: 117; X86-AVX2: # %bb.0: 118; X86-AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [0,2147483648,0,2147483648,0,2147483648,0,2147483648] 119; X86-AVX2-NEXT: vpsrlvq %ymm1, %ymm2, %ymm2 120; X86-AVX2-NEXT: vpsrlvq %ymm1, %ymm0, %ymm0 121; X86-AVX2-NEXT: vpxor %ymm2, %ymm0, %ymm0 122; X86-AVX2-NEXT: vpsubq %ymm2, %ymm0, %ymm0 123; X86-AVX2-NEXT: retl 124 %shift = ashr <4 x i64> %a, %b 125 ret <4 x i64> %shift 126} 127 128define <8 x i32> @var_shift_v8i32(<8 x i32> %a, <8 x i32> %b) nounwind { 129; AVX1-LABEL: var_shift_v8i32: 130; AVX1: # %bb.0: 131; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 132; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3 133; AVX1-NEXT: vpsrldq {{.*#+}} xmm4 = xmm3[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero 134; AVX1-NEXT: vpsrad %xmm4, %xmm2, %xmm4 135; AVX1-NEXT: vpsrlq $32, %xmm3, %xmm5 136; AVX1-NEXT: vpsrad %xmm5, %xmm2, %xmm5 137; AVX1-NEXT: vpblendw {{.*#+}} xmm4 = xmm5[0,1,2,3],xmm4[4,5,6,7] 138; AVX1-NEXT: vpxor %xmm5, %xmm5, %xmm5 139; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm6 = xmm3[2],xmm5[2],xmm3[3],xmm5[3] 140; AVX1-NEXT: vpsrad %xmm6, %xmm2, %xmm6 141; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm3 = xmm3[0],zero,xmm3[1],zero 142; AVX1-NEXT: vpsrad %xmm3, %xmm2, %xmm2 143; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm6[4,5,6,7] 144; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm4[2,3],xmm2[4,5],xmm4[6,7] 145; AVX1-NEXT: vpsrldq {{.*#+}} xmm3 = 
xmm1[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero 146; AVX1-NEXT: vpsrad %xmm3, %xmm0, %xmm3 147; AVX1-NEXT: vpsrlq $32, %xmm1, %xmm4 148; AVX1-NEXT: vpsrad %xmm4, %xmm0, %xmm4 149; AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0,1,2,3],xmm3[4,5,6,7] 150; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm4 = xmm1[2],xmm5[2],xmm1[3],xmm5[3] 151; AVX1-NEXT: vpsrad %xmm4, %xmm0, %xmm4 152; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero 153; AVX1-NEXT: vpsrad %xmm1, %xmm0, %xmm0 154; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm4[4,5,6,7] 155; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm3[2,3],xmm0[4,5],xmm3[6,7] 156; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 157; AVX1-NEXT: retq 158; 159; AVX2-LABEL: var_shift_v8i32: 160; AVX2: # %bb.0: 161; AVX2-NEXT: vpsravd %ymm1, %ymm0, %ymm0 162; AVX2-NEXT: retq 163; 164; XOPAVX1-LABEL: var_shift_v8i32: 165; XOPAVX1: # %bb.0: 166; XOPAVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 167; XOPAVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3 168; XOPAVX1-NEXT: vpsubd %xmm2, %xmm3, %xmm2 169; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm4 170; XOPAVX1-NEXT: vpshad %xmm2, %xmm4, %xmm2 171; XOPAVX1-NEXT: vpsubd %xmm1, %xmm3, %xmm1 172; XOPAVX1-NEXT: vpshad %xmm1, %xmm0, %xmm0 173; XOPAVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 174; XOPAVX1-NEXT: retq 175; 176; XOPAVX2-LABEL: var_shift_v8i32: 177; XOPAVX2: # %bb.0: 178; XOPAVX2-NEXT: vpsravd %ymm1, %ymm0, %ymm0 179; XOPAVX2-NEXT: retq 180; 181; AVX512-LABEL: var_shift_v8i32: 182; AVX512: # %bb.0: 183; AVX512-NEXT: vpsravd %ymm1, %ymm0, %ymm0 184; AVX512-NEXT: retq 185; 186; AVX512VL-LABEL: var_shift_v8i32: 187; AVX512VL: # %bb.0: 188; AVX512VL-NEXT: vpsravd %ymm1, %ymm0, %ymm0 189; AVX512VL-NEXT: retq 190; 191; X86-AVX1-LABEL: var_shift_v8i32: 192; X86-AVX1: # %bb.0: 193; X86-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 194; X86-AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3 195; X86-AVX1-NEXT: vpsrldq {{.*#+}} xmm4 = 
xmm3[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero 196; X86-AVX1-NEXT: vpsrad %xmm4, %xmm2, %xmm4 197; X86-AVX1-NEXT: vpsrlq $32, %xmm3, %xmm5 198; X86-AVX1-NEXT: vpsrad %xmm5, %xmm2, %xmm5 199; X86-AVX1-NEXT: vpblendw {{.*#+}} xmm4 = xmm5[0,1,2,3],xmm4[4,5,6,7] 200; X86-AVX1-NEXT: vpxor %xmm5, %xmm5, %xmm5 201; X86-AVX1-NEXT: vpunpckhdq {{.*#+}} xmm6 = xmm3[2],xmm5[2],xmm3[3],xmm5[3] 202; X86-AVX1-NEXT: vpsrad %xmm6, %xmm2, %xmm6 203; X86-AVX1-NEXT: vpmovzxdq {{.*#+}} xmm3 = xmm3[0],zero,xmm3[1],zero 204; X86-AVX1-NEXT: vpsrad %xmm3, %xmm2, %xmm2 205; X86-AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm6[4,5,6,7] 206; X86-AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm4[2,3],xmm2[4,5],xmm4[6,7] 207; X86-AVX1-NEXT: vpsrldq {{.*#+}} xmm3 = xmm1[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero 208; X86-AVX1-NEXT: vpsrad %xmm3, %xmm0, %xmm3 209; X86-AVX1-NEXT: vpsrlq $32, %xmm1, %xmm4 210; X86-AVX1-NEXT: vpsrad %xmm4, %xmm0, %xmm4 211; X86-AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0,1,2,3],xmm3[4,5,6,7] 212; X86-AVX1-NEXT: vpunpckhdq {{.*#+}} xmm4 = xmm1[2],xmm5[2],xmm1[3],xmm5[3] 213; X86-AVX1-NEXT: vpsrad %xmm4, %xmm0, %xmm4 214; X86-AVX1-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero 215; X86-AVX1-NEXT: vpsrad %xmm1, %xmm0, %xmm0 216; X86-AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm4[4,5,6,7] 217; X86-AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm3[2,3],xmm0[4,5],xmm3[6,7] 218; X86-AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 219; X86-AVX1-NEXT: retl 220; 221; X86-AVX2-LABEL: var_shift_v8i32: 222; X86-AVX2: # %bb.0: 223; X86-AVX2-NEXT: vpsravd %ymm1, %ymm0, %ymm0 224; X86-AVX2-NEXT: retl 225 %shift = ashr <8 x i32> %a, %b 226 ret <8 x i32> %shift 227} 228 229define <16 x i16> @var_shift_v16i16(<16 x i16> %a, <16 x i16> %b) nounwind { 230; AVX1-LABEL: var_shift_v16i16: 231; AVX1: # %bb.0: 232; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 233; AVX1-NEXT: vpsllw $12, %xmm2, %xmm3 234; 
AVX1-NEXT: vpsllw $4, %xmm2, %xmm2 235; AVX1-NEXT: vpor %xmm3, %xmm2, %xmm2 236; AVX1-NEXT: vpaddw %xmm2, %xmm2, %xmm3 237; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm4 238; AVX1-NEXT: vpsraw $8, %xmm4, %xmm5 239; AVX1-NEXT: vpblendvb %xmm2, %xmm5, %xmm4, %xmm2 240; AVX1-NEXT: vpsraw $4, %xmm2, %xmm4 241; AVX1-NEXT: vpblendvb %xmm3, %xmm4, %xmm2, %xmm2 242; AVX1-NEXT: vpsraw $2, %xmm2, %xmm4 243; AVX1-NEXT: vpaddw %xmm3, %xmm3, %xmm3 244; AVX1-NEXT: vpblendvb %xmm3, %xmm4, %xmm2, %xmm2 245; AVX1-NEXT: vpsraw $1, %xmm2, %xmm4 246; AVX1-NEXT: vpaddw %xmm3, %xmm3, %xmm3 247; AVX1-NEXT: vpblendvb %xmm3, %xmm4, %xmm2, %xmm2 248; AVX1-NEXT: vpsllw $12, %xmm1, %xmm3 249; AVX1-NEXT: vpsllw $4, %xmm1, %xmm1 250; AVX1-NEXT: vpor %xmm3, %xmm1, %xmm1 251; AVX1-NEXT: vpaddw %xmm1, %xmm1, %xmm3 252; AVX1-NEXT: vpsraw $8, %xmm0, %xmm4 253; AVX1-NEXT: vpblendvb %xmm1, %xmm4, %xmm0, %xmm0 254; AVX1-NEXT: vpsraw $4, %xmm0, %xmm1 255; AVX1-NEXT: vpblendvb %xmm3, %xmm1, %xmm0, %xmm0 256; AVX1-NEXT: vpsraw $2, %xmm0, %xmm1 257; AVX1-NEXT: vpaddw %xmm3, %xmm3, %xmm3 258; AVX1-NEXT: vpblendvb %xmm3, %xmm1, %xmm0, %xmm0 259; AVX1-NEXT: vpsraw $1, %xmm0, %xmm1 260; AVX1-NEXT: vpaddw %xmm3, %xmm3, %xmm3 261; AVX1-NEXT: vpblendvb %xmm3, %xmm1, %xmm0, %xmm0 262; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 263; AVX1-NEXT: retq 264; 265; AVX2-LABEL: var_shift_v16i16: 266; AVX2: # %bb.0: 267; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2 268; AVX2-NEXT: vpunpckhwd {{.*#+}} ymm3 = ymm1[4],ymm2[4],ymm1[5],ymm2[5],ymm1[6],ymm2[6],ymm1[7],ymm2[7],ymm1[12],ymm2[12],ymm1[13],ymm2[13],ymm1[14],ymm2[14],ymm1[15],ymm2[15] 269; AVX2-NEXT: vpunpckhwd {{.*#+}} ymm4 = ymm2[4],ymm0[4],ymm2[5],ymm0[5],ymm2[6],ymm0[6],ymm2[7],ymm0[7],ymm2[12],ymm0[12],ymm2[13],ymm0[13],ymm2[14],ymm0[14],ymm2[15],ymm0[15] 270; AVX2-NEXT: vpsravd %ymm3, %ymm4, %ymm3 271; AVX2-NEXT: vpsrld $16, %ymm3, %ymm3 272; AVX2-NEXT: vpunpcklwd {{.*#+}} ymm1 = 
ymm1[0],ymm2[0],ymm1[1],ymm2[1],ymm1[2],ymm2[2],ymm1[3],ymm2[3],ymm1[8],ymm2[8],ymm1[9],ymm2[9],ymm1[10],ymm2[10],ymm1[11],ymm2[11] 273; AVX2-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm2[0],ymm0[0],ymm2[1],ymm0[1],ymm2[2],ymm0[2],ymm2[3],ymm0[3],ymm2[8],ymm0[8],ymm2[9],ymm0[9],ymm2[10],ymm0[10],ymm2[11],ymm0[11] 274; AVX2-NEXT: vpsravd %ymm1, %ymm0, %ymm0 275; AVX2-NEXT: vpsrld $16, %ymm0, %ymm0 276; AVX2-NEXT: vpackusdw %ymm3, %ymm0, %ymm0 277; AVX2-NEXT: retq 278; 279; XOPAVX1-LABEL: var_shift_v16i16: 280; XOPAVX1: # %bb.0: 281; XOPAVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 282; XOPAVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3 283; XOPAVX1-NEXT: vpsubw %xmm2, %xmm3, %xmm2 284; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm4 285; XOPAVX1-NEXT: vpshaw %xmm2, %xmm4, %xmm2 286; XOPAVX1-NEXT: vpsubw %xmm1, %xmm3, %xmm1 287; XOPAVX1-NEXT: vpshaw %xmm1, %xmm0, %xmm0 288; XOPAVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 289; XOPAVX1-NEXT: retq 290; 291; XOPAVX2-LABEL: var_shift_v16i16: 292; XOPAVX2: # %bb.0: 293; XOPAVX2-NEXT: vextracti128 $1, %ymm1, %xmm2 294; XOPAVX2-NEXT: vpxor %xmm3, %xmm3, %xmm3 295; XOPAVX2-NEXT: vpsubw %xmm2, %xmm3, %xmm2 296; XOPAVX2-NEXT: vextracti128 $1, %ymm0, %xmm4 297; XOPAVX2-NEXT: vpshaw %xmm2, %xmm4, %xmm2 298; XOPAVX2-NEXT: vpsubw %xmm1, %xmm3, %xmm1 299; XOPAVX2-NEXT: vpshaw %xmm1, %xmm0, %xmm0 300; XOPAVX2-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0 301; XOPAVX2-NEXT: retq 302; 303; AVX512DQ-LABEL: var_shift_v16i16: 304; AVX512DQ: # %bb.0: 305; AVX512DQ-NEXT: vpmovzxwd {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero 306; AVX512DQ-NEXT: vpmovsxwd %ymm0, %zmm0 307; AVX512DQ-NEXT: vpsravd %zmm1, %zmm0, %zmm0 308; AVX512DQ-NEXT: vpmovdw %zmm0, %ymm0 309; AVX512DQ-NEXT: retq 310; 311; AVX512BW-LABEL: var_shift_v16i16: 312; AVX512BW: # %bb.0: 313; AVX512BW-NEXT: # kill: def $ymm1 
killed $ymm1 def $zmm1 314; AVX512BW-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 315; AVX512BW-NEXT: vpsravw %zmm1, %zmm0, %zmm0 316; AVX512BW-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 317; AVX512BW-NEXT: retq 318; 319; AVX512DQVL-LABEL: var_shift_v16i16: 320; AVX512DQVL: # %bb.0: 321; AVX512DQVL-NEXT: vpmovzxwd {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero 322; AVX512DQVL-NEXT: vpmovsxwd %ymm0, %zmm0 323; AVX512DQVL-NEXT: vpsravd %zmm1, %zmm0, %zmm0 324; AVX512DQVL-NEXT: vpmovdw %zmm0, %ymm0 325; AVX512DQVL-NEXT: retq 326; 327; AVX512BWVL-LABEL: var_shift_v16i16: 328; AVX512BWVL: # %bb.0: 329; AVX512BWVL-NEXT: vpsravw %ymm1, %ymm0, %ymm0 330; AVX512BWVL-NEXT: retq 331; 332; X86-AVX1-LABEL: var_shift_v16i16: 333; X86-AVX1: # %bb.0: 334; X86-AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 335; X86-AVX1-NEXT: vpsllw $12, %xmm2, %xmm3 336; X86-AVX1-NEXT: vpsllw $4, %xmm2, %xmm2 337; X86-AVX1-NEXT: vpor %xmm3, %xmm2, %xmm2 338; X86-AVX1-NEXT: vpaddw %xmm2, %xmm2, %xmm3 339; X86-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm4 340; X86-AVX1-NEXT: vpsraw $8, %xmm4, %xmm5 341; X86-AVX1-NEXT: vpblendvb %xmm2, %xmm5, %xmm4, %xmm2 342; X86-AVX1-NEXT: vpsraw $4, %xmm2, %xmm4 343; X86-AVX1-NEXT: vpblendvb %xmm3, %xmm4, %xmm2, %xmm2 344; X86-AVX1-NEXT: vpsraw $2, %xmm2, %xmm4 345; X86-AVX1-NEXT: vpaddw %xmm3, %xmm3, %xmm3 346; X86-AVX1-NEXT: vpblendvb %xmm3, %xmm4, %xmm2, %xmm2 347; X86-AVX1-NEXT: vpsraw $1, %xmm2, %xmm4 348; X86-AVX1-NEXT: vpaddw %xmm3, %xmm3, %xmm3 349; X86-AVX1-NEXT: vpblendvb %xmm3, %xmm4, %xmm2, %xmm2 350; X86-AVX1-NEXT: vpsllw $12, %xmm1, %xmm3 351; X86-AVX1-NEXT: vpsllw $4, %xmm1, %xmm1 352; X86-AVX1-NEXT: vpor %xmm3, %xmm1, %xmm1 353; X86-AVX1-NEXT: vpaddw %xmm1, %xmm1, %xmm3 354; X86-AVX1-NEXT: vpsraw $8, %xmm0, %xmm4 355; X86-AVX1-NEXT: vpblendvb %xmm1, %xmm4, %xmm0, %xmm0 
356; X86-AVX1-NEXT: vpsraw $4, %xmm0, %xmm1 357; X86-AVX1-NEXT: vpblendvb %xmm3, %xmm1, %xmm0, %xmm0 358; X86-AVX1-NEXT: vpsraw $2, %xmm0, %xmm1 359; X86-AVX1-NEXT: vpaddw %xmm3, %xmm3, %xmm3 360; X86-AVX1-NEXT: vpblendvb %xmm3, %xmm1, %xmm0, %xmm0 361; X86-AVX1-NEXT: vpsraw $1, %xmm0, %xmm1 362; X86-AVX1-NEXT: vpaddw %xmm3, %xmm3, %xmm3 363; X86-AVX1-NEXT: vpblendvb %xmm3, %xmm1, %xmm0, %xmm0 364; X86-AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 365; X86-AVX1-NEXT: retl 366; 367; X86-AVX2-LABEL: var_shift_v16i16: 368; X86-AVX2: # %bb.0: 369; X86-AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2 370; X86-AVX2-NEXT: vpunpckhwd {{.*#+}} ymm3 = ymm1[4],ymm2[4],ymm1[5],ymm2[5],ymm1[6],ymm2[6],ymm1[7],ymm2[7],ymm1[12],ymm2[12],ymm1[13],ymm2[13],ymm1[14],ymm2[14],ymm1[15],ymm2[15] 371; X86-AVX2-NEXT: vpunpckhwd {{.*#+}} ymm4 = ymm2[4],ymm0[4],ymm2[5],ymm0[5],ymm2[6],ymm0[6],ymm2[7],ymm0[7],ymm2[12],ymm0[12],ymm2[13],ymm0[13],ymm2[14],ymm0[14],ymm2[15],ymm0[15] 372; X86-AVX2-NEXT: vpsravd %ymm3, %ymm4, %ymm3 373; X86-AVX2-NEXT: vpsrld $16, %ymm3, %ymm3 374; X86-AVX2-NEXT: vpunpcklwd {{.*#+}} ymm1 = ymm1[0],ymm2[0],ymm1[1],ymm2[1],ymm1[2],ymm2[2],ymm1[3],ymm2[3],ymm1[8],ymm2[8],ymm1[9],ymm2[9],ymm1[10],ymm2[10],ymm1[11],ymm2[11] 375; X86-AVX2-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm2[0],ymm0[0],ymm2[1],ymm0[1],ymm2[2],ymm0[2],ymm2[3],ymm0[3],ymm2[8],ymm0[8],ymm2[9],ymm0[9],ymm2[10],ymm0[10],ymm2[11],ymm0[11] 376; X86-AVX2-NEXT: vpsravd %ymm1, %ymm0, %ymm0 377; X86-AVX2-NEXT: vpsrld $16, %ymm0, %ymm0 378; X86-AVX2-NEXT: vpackusdw %ymm3, %ymm0, %ymm0 379; X86-AVX2-NEXT: retl 380 %shift = ashr <16 x i16> %a, %b 381 ret <16 x i16> %shift 382} 383 384define <32 x i8> @var_shift_v32i8(<32 x i8> %a, <32 x i8> %b) nounwind { 385; AVX1-LABEL: var_shift_v32i8: 386; AVX1: # %bb.0: 387; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 388; AVX1-NEXT: vpsllw $5, %xmm2, %xmm2 389; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm2[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] 390; AVX1-NEXT: vextractf128 $1, %ymm0, 
%xmm4 391; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm5 = xmm4[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] 392; AVX1-NEXT: vpsraw $4, %xmm5, %xmm6 393; AVX1-NEXT: vpblendvb %xmm3, %xmm6, %xmm5, %xmm5 394; AVX1-NEXT: vpsraw $2, %xmm5, %xmm6 395; AVX1-NEXT: vpaddw %xmm3, %xmm3, %xmm3 396; AVX1-NEXT: vpblendvb %xmm3, %xmm6, %xmm5, %xmm5 397; AVX1-NEXT: vpsraw $1, %xmm5, %xmm6 398; AVX1-NEXT: vpaddw %xmm3, %xmm3, %xmm3 399; AVX1-NEXT: vpblendvb %xmm3, %xmm6, %xmm5, %xmm3 400; AVX1-NEXT: vpsrlw $8, %xmm3, %xmm3 401; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] 402; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm4 = xmm4[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] 403; AVX1-NEXT: vpsraw $4, %xmm4, %xmm5 404; AVX1-NEXT: vpblendvb %xmm2, %xmm5, %xmm4, %xmm4 405; AVX1-NEXT: vpsraw $2, %xmm4, %xmm5 406; AVX1-NEXT: vpaddw %xmm2, %xmm2, %xmm2 407; AVX1-NEXT: vpblendvb %xmm2, %xmm5, %xmm4, %xmm4 408; AVX1-NEXT: vpsraw $1, %xmm4, %xmm5 409; AVX1-NEXT: vpaddw %xmm2, %xmm2, %xmm2 410; AVX1-NEXT: vpblendvb %xmm2, %xmm5, %xmm4, %xmm2 411; AVX1-NEXT: vpsrlw $8, %xmm2, %xmm2 412; AVX1-NEXT: vpackuswb %xmm3, %xmm2, %xmm2 413; AVX1-NEXT: vpsllw $5, %xmm1, %xmm1 414; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] 415; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm4 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] 416; AVX1-NEXT: vpsraw $4, %xmm4, %xmm5 417; AVX1-NEXT: vpblendvb %xmm3, %xmm5, %xmm4, %xmm4 418; AVX1-NEXT: vpsraw $2, %xmm4, %xmm5 419; AVX1-NEXT: vpaddw %xmm3, %xmm3, %xmm3 420; AVX1-NEXT: vpblendvb %xmm3, %xmm5, %xmm4, %xmm4 421; AVX1-NEXT: vpsraw $1, %xmm4, %xmm5 422; AVX1-NEXT: vpaddw %xmm3, %xmm3, %xmm3 423; AVX1-NEXT: vpblendvb %xmm3, %xmm5, %xmm4, %xmm3 424; AVX1-NEXT: vpsrlw $8, %xmm3, %xmm3 425; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] 426; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] 427; AVX1-NEXT: vpsraw $4, %xmm0, %xmm4 428; AVX1-NEXT: vpblendvb %xmm1, %xmm4, %xmm0, 
%xmm0 429; AVX1-NEXT: vpsraw $2, %xmm0, %xmm4 430; AVX1-NEXT: vpaddw %xmm1, %xmm1, %xmm1 431; AVX1-NEXT: vpblendvb %xmm1, %xmm4, %xmm0, %xmm0 432; AVX1-NEXT: vpsraw $1, %xmm0, %xmm4 433; AVX1-NEXT: vpaddw %xmm1, %xmm1, %xmm1 434; AVX1-NEXT: vpblendvb %xmm1, %xmm4, %xmm0, %xmm0 435; AVX1-NEXT: vpsrlw $8, %xmm0, %xmm0 436; AVX1-NEXT: vpackuswb %xmm3, %xmm0, %xmm0 437; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 438; AVX1-NEXT: retq 439; 440; AVX2-LABEL: var_shift_v32i8: 441; AVX2: # %bb.0: 442; AVX2-NEXT: vpsllw $5, %ymm1, %ymm1 443; AVX2-NEXT: vpunpckhbw {{.*#+}} ymm2 = ymm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31] 444; AVX2-NEXT: vpunpckhbw {{.*#+}} ymm3 = ymm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31] 445; AVX2-NEXT: vpsraw $4, %ymm3, %ymm4 446; AVX2-NEXT: vpblendvb %ymm2, %ymm4, %ymm3, %ymm3 447; AVX2-NEXT: vpsraw $2, %ymm3, %ymm4 448; AVX2-NEXT: vpaddw %ymm2, %ymm2, %ymm2 449; AVX2-NEXT: vpblendvb %ymm2, %ymm4, %ymm3, %ymm3 450; AVX2-NEXT: vpsraw $1, %ymm3, %ymm4 451; AVX2-NEXT: vpaddw %ymm2, %ymm2, %ymm2 452; AVX2-NEXT: vpblendvb %ymm2, %ymm4, %ymm3, %ymm2 453; AVX2-NEXT: vpsrlw $8, %ymm2, %ymm2 454; AVX2-NEXT: vpunpcklbw {{.*#+}} ymm1 = ymm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23] 455; AVX2-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23] 456; AVX2-NEXT: vpsraw $4, %ymm0, %ymm3 457; AVX2-NEXT: vpblendvb %ymm1, %ymm3, %ymm0, %ymm0 458; AVX2-NEXT: vpsraw $2, %ymm0, %ymm3 459; AVX2-NEXT: vpaddw %ymm1, %ymm1, %ymm1 460; AVX2-NEXT: vpblendvb %ymm1, %ymm3, %ymm0, %ymm0 461; AVX2-NEXT: vpsraw $1, %ymm0, %ymm3 462; AVX2-NEXT: vpaddw %ymm1, %ymm1, %ymm1 463; AVX2-NEXT: vpblendvb %ymm1, %ymm3, %ymm0, %ymm0 464; AVX2-NEXT: vpsrlw $8, %ymm0, %ymm0 465; AVX2-NEXT: vpackuswb %ymm2, %ymm0, %ymm0 466; AVX2-NEXT: retq 467; 468; XOPAVX1-LABEL: var_shift_v32i8: 469; 
XOPAVX1: # %bb.0: 470; XOPAVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 471; XOPAVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3 472; XOPAVX1-NEXT: vpsubb %xmm2, %xmm3, %xmm2 473; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm4 474; XOPAVX1-NEXT: vpshab %xmm2, %xmm4, %xmm2 475; XOPAVX1-NEXT: vpsubb %xmm1, %xmm3, %xmm1 476; XOPAVX1-NEXT: vpshab %xmm1, %xmm0, %xmm0 477; XOPAVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 478; XOPAVX1-NEXT: retq 479; 480; XOPAVX2-LABEL: var_shift_v32i8: 481; XOPAVX2: # %bb.0: 482; XOPAVX2-NEXT: vextracti128 $1, %ymm1, %xmm2 483; XOPAVX2-NEXT: vpxor %xmm3, %xmm3, %xmm3 484; XOPAVX2-NEXT: vpsubb %xmm2, %xmm3, %xmm2 485; XOPAVX2-NEXT: vextracti128 $1, %ymm0, %xmm4 486; XOPAVX2-NEXT: vpshab %xmm2, %xmm4, %xmm2 487; XOPAVX2-NEXT: vpsubb %xmm1, %xmm3, %xmm1 488; XOPAVX2-NEXT: vpshab %xmm1, %xmm0, %xmm0 489; XOPAVX2-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0 490; XOPAVX2-NEXT: retq 491; 492; AVX512DQ-LABEL: var_shift_v32i8: 493; AVX512DQ: # %bb.0: 494; AVX512DQ-NEXT: vpsllw $5, %ymm1, %ymm1 495; AVX512DQ-NEXT: vpunpckhbw {{.*#+}} ymm2 = ymm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31] 496; AVX512DQ-NEXT: vpunpckhbw {{.*#+}} ymm3 = ymm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31] 497; AVX512DQ-NEXT: vpsraw $4, %ymm3, %ymm4 498; AVX512DQ-NEXT: vpblendvb %ymm2, %ymm4, %ymm3, %ymm3 499; AVX512DQ-NEXT: vpsraw $2, %ymm3, %ymm4 500; AVX512DQ-NEXT: vpaddw %ymm2, %ymm2, %ymm2 501; AVX512DQ-NEXT: vpblendvb %ymm2, %ymm4, %ymm3, %ymm3 502; AVX512DQ-NEXT: vpsraw $1, %ymm3, %ymm4 503; AVX512DQ-NEXT: vpaddw %ymm2, %ymm2, %ymm2 504; AVX512DQ-NEXT: vpblendvb %ymm2, %ymm4, %ymm3, %ymm2 505; AVX512DQ-NEXT: vpsrlw $8, %ymm2, %ymm2 506; AVX512DQ-NEXT: vpunpcklbw {{.*#+}} ymm1 = ymm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23] 507; AVX512DQ-NEXT: vpunpcklbw {{.*#+}} ymm0 = 
ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23] 508; AVX512DQ-NEXT: vpsraw $4, %ymm0, %ymm3 509; AVX512DQ-NEXT: vpblendvb %ymm1, %ymm3, %ymm0, %ymm0 510; AVX512DQ-NEXT: vpsraw $2, %ymm0, %ymm3 511; AVX512DQ-NEXT: vpaddw %ymm1, %ymm1, %ymm1 512; AVX512DQ-NEXT: vpblendvb %ymm1, %ymm3, %ymm0, %ymm0 513; AVX512DQ-NEXT: vpsraw $1, %ymm0, %ymm3 514; AVX512DQ-NEXT: vpaddw %ymm1, %ymm1, %ymm1 515; AVX512DQ-NEXT: vpblendvb %ymm1, %ymm3, %ymm0, %ymm0 516; AVX512DQ-NEXT: vpsrlw $8, %ymm0, %ymm0 517; AVX512DQ-NEXT: vpackuswb %ymm2, %ymm0, %ymm0 518; AVX512DQ-NEXT: retq 519; 520; AVX512BW-LABEL: var_shift_v32i8: 521; AVX512BW: # %bb.0: 522; AVX512BW-NEXT: vpmovzxbw {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero,ymm1[16],zero,ymm1[17],zero,ymm1[18],zero,ymm1[19],zero,ymm1[20],zero,ymm1[21],zero,ymm1[22],zero,ymm1[23],zero,ymm1[24],zero,ymm1[25],zero,ymm1[26],zero,ymm1[27],zero,ymm1[28],zero,ymm1[29],zero,ymm1[30],zero,ymm1[31],zero 523; AVX512BW-NEXT: vpmovsxbw %ymm0, %zmm0 524; AVX512BW-NEXT: vpsravw %zmm1, %zmm0, %zmm0 525; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0 526; AVX512BW-NEXT: retq 527; 528; AVX512DQVL-LABEL: var_shift_v32i8: 529; AVX512DQVL: # %bb.0: 530; AVX512DQVL-NEXT: vpsllw $5, %ymm1, %ymm1 531; AVX512DQVL-NEXT: vpunpckhbw {{.*#+}} ymm2 = ymm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31] 532; AVX512DQVL-NEXT: vpunpckhbw {{.*#+}} ymm3 = ymm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31] 533; AVX512DQVL-NEXT: vpsraw $4, %ymm3, %ymm4 534; AVX512DQVL-NEXT: vpblendvb %ymm2, %ymm4, %ymm3, %ymm3 535; AVX512DQVL-NEXT: vpsraw $2, %ymm3, %ymm4 536; AVX512DQVL-NEXT: vpaddw %ymm2, %ymm2, %ymm2 537; AVX512DQVL-NEXT: vpblendvb %ymm2, %ymm4, %ymm3, %ymm3 538; 
AVX512DQVL-NEXT: vpsraw $1, %ymm3, %ymm4 539; AVX512DQVL-NEXT: vpaddw %ymm2, %ymm2, %ymm2 540; AVX512DQVL-NEXT: vpblendvb %ymm2, %ymm4, %ymm3, %ymm2 541; AVX512DQVL-NEXT: vpsrlw $8, %ymm2, %ymm2 542; AVX512DQVL-NEXT: vpunpcklbw {{.*#+}} ymm1 = ymm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23] 543; AVX512DQVL-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23] 544; AVX512DQVL-NEXT: vpsraw $4, %ymm0, %ymm3 545; AVX512DQVL-NEXT: vpblendvb %ymm1, %ymm3, %ymm0, %ymm0 546; AVX512DQVL-NEXT: vpsraw $2, %ymm0, %ymm3 547; AVX512DQVL-NEXT: vpaddw %ymm1, %ymm1, %ymm1 548; AVX512DQVL-NEXT: vpblendvb %ymm1, %ymm3, %ymm0, %ymm0 549; AVX512DQVL-NEXT: vpsraw $1, %ymm0, %ymm3 550; AVX512DQVL-NEXT: vpaddw %ymm1, %ymm1, %ymm1 551; AVX512DQVL-NEXT: vpblendvb %ymm1, %ymm3, %ymm0, %ymm0 552; AVX512DQVL-NEXT: vpsrlw $8, %ymm0, %ymm0 553; AVX512DQVL-NEXT: vpackuswb %ymm2, %ymm0, %ymm0 554; AVX512DQVL-NEXT: retq 555; 556; AVX512BWVL-LABEL: var_shift_v32i8: 557; AVX512BWVL: # %bb.0: 558; AVX512BWVL-NEXT: vpmovzxbw {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero,ymm1[16],zero,ymm1[17],zero,ymm1[18],zero,ymm1[19],zero,ymm1[20],zero,ymm1[21],zero,ymm1[22],zero,ymm1[23],zero,ymm1[24],zero,ymm1[25],zero,ymm1[26],zero,ymm1[27],zero,ymm1[28],zero,ymm1[29],zero,ymm1[30],zero,ymm1[31],zero 559; AVX512BWVL-NEXT: vpmovsxbw %ymm0, %zmm0 560; AVX512BWVL-NEXT: vpsravw %zmm1, %zmm0, %zmm0 561; AVX512BWVL-NEXT: vpmovwb %zmm0, %ymm0 562; AVX512BWVL-NEXT: retq 563; 564; X86-AVX1-LABEL: var_shift_v32i8: 565; X86-AVX1: # %bb.0: 566; X86-AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 567; X86-AVX1-NEXT: vpsllw $5, %xmm2, %xmm2 568; X86-AVX1-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm2[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] 569; 
X86-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm4 570; X86-AVX1-NEXT: vpunpckhbw {{.*#+}} xmm5 = xmm4[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] 571; X86-AVX1-NEXT: vpsraw $4, %xmm5, %xmm6 572; X86-AVX1-NEXT: vpblendvb %xmm3, %xmm6, %xmm5, %xmm5 573; X86-AVX1-NEXT: vpsraw $2, %xmm5, %xmm6 574; X86-AVX1-NEXT: vpaddw %xmm3, %xmm3, %xmm3 575; X86-AVX1-NEXT: vpblendvb %xmm3, %xmm6, %xmm5, %xmm5 576; X86-AVX1-NEXT: vpsraw $1, %xmm5, %xmm6 577; X86-AVX1-NEXT: vpaddw %xmm3, %xmm3, %xmm3 578; X86-AVX1-NEXT: vpblendvb %xmm3, %xmm6, %xmm5, %xmm3 579; X86-AVX1-NEXT: vpsrlw $8, %xmm3, %xmm3 580; X86-AVX1-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] 581; X86-AVX1-NEXT: vpunpcklbw {{.*#+}} xmm4 = xmm4[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] 582; X86-AVX1-NEXT: vpsraw $4, %xmm4, %xmm5 583; X86-AVX1-NEXT: vpblendvb %xmm2, %xmm5, %xmm4, %xmm4 584; X86-AVX1-NEXT: vpsraw $2, %xmm4, %xmm5 585; X86-AVX1-NEXT: vpaddw %xmm2, %xmm2, %xmm2 586; X86-AVX1-NEXT: vpblendvb %xmm2, %xmm5, %xmm4, %xmm4 587; X86-AVX1-NEXT: vpsraw $1, %xmm4, %xmm5 588; X86-AVX1-NEXT: vpaddw %xmm2, %xmm2, %xmm2 589; X86-AVX1-NEXT: vpblendvb %xmm2, %xmm5, %xmm4, %xmm2 590; X86-AVX1-NEXT: vpsrlw $8, %xmm2, %xmm2 591; X86-AVX1-NEXT: vpackuswb %xmm3, %xmm2, %xmm2 592; X86-AVX1-NEXT: vpsllw $5, %xmm1, %xmm1 593; X86-AVX1-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] 594; X86-AVX1-NEXT: vpunpckhbw {{.*#+}} xmm4 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] 595; X86-AVX1-NEXT: vpsraw $4, %xmm4, %xmm5 596; X86-AVX1-NEXT: vpblendvb %xmm3, %xmm5, %xmm4, %xmm4 597; X86-AVX1-NEXT: vpsraw $2, %xmm4, %xmm5 598; X86-AVX1-NEXT: vpaddw %xmm3, %xmm3, %xmm3 599; X86-AVX1-NEXT: vpblendvb %xmm3, %xmm5, %xmm4, %xmm4 600; X86-AVX1-NEXT: vpsraw $1, %xmm4, %xmm5 601; X86-AVX1-NEXT: vpaddw %xmm3, %xmm3, %xmm3 602; X86-AVX1-NEXT: vpblendvb %xmm3, %xmm5, %xmm4, %xmm3 603; X86-AVX1-NEXT: vpsrlw $8, %xmm3, %xmm3 604; X86-AVX1-NEXT: vpunpcklbw {{.*#+}} xmm1 = 
xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] 605; X86-AVX1-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] 606; X86-AVX1-NEXT: vpsraw $4, %xmm0, %xmm4 607; X86-AVX1-NEXT: vpblendvb %xmm1, %xmm4, %xmm0, %xmm0 608; X86-AVX1-NEXT: vpsraw $2, %xmm0, %xmm4 609; X86-AVX1-NEXT: vpaddw %xmm1, %xmm1, %xmm1 610; X86-AVX1-NEXT: vpblendvb %xmm1, %xmm4, %xmm0, %xmm0 611; X86-AVX1-NEXT: vpsraw $1, %xmm0, %xmm4 612; X86-AVX1-NEXT: vpaddw %xmm1, %xmm1, %xmm1 613; X86-AVX1-NEXT: vpblendvb %xmm1, %xmm4, %xmm0, %xmm0 614; X86-AVX1-NEXT: vpsrlw $8, %xmm0, %xmm0 615; X86-AVX1-NEXT: vpackuswb %xmm3, %xmm0, %xmm0 616; X86-AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 617; X86-AVX1-NEXT: retl 618; 619; X86-AVX2-LABEL: var_shift_v32i8: 620; X86-AVX2: # %bb.0: 621; X86-AVX2-NEXT: vpsllw $5, %ymm1, %ymm1 622; X86-AVX2-NEXT: vpunpckhbw {{.*#+}} ymm2 = ymm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31] 623; X86-AVX2-NEXT: vpunpckhbw {{.*#+}} ymm3 = ymm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31] 624; X86-AVX2-NEXT: vpsraw $4, %ymm3, %ymm4 625; X86-AVX2-NEXT: vpblendvb %ymm2, %ymm4, %ymm3, %ymm3 626; X86-AVX2-NEXT: vpsraw $2, %ymm3, %ymm4 627; X86-AVX2-NEXT: vpaddw %ymm2, %ymm2, %ymm2 628; X86-AVX2-NEXT: vpblendvb %ymm2, %ymm4, %ymm3, %ymm3 629; X86-AVX2-NEXT: vpsraw $1, %ymm3, %ymm4 630; X86-AVX2-NEXT: vpaddw %ymm2, %ymm2, %ymm2 631; X86-AVX2-NEXT: vpblendvb %ymm2, %ymm4, %ymm3, %ymm2 632; X86-AVX2-NEXT: vpsrlw $8, %ymm2, %ymm2 633; X86-AVX2-NEXT: vpunpcklbw {{.*#+}} ymm1 = ymm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23] 634; X86-AVX2-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23] 635; X86-AVX2-NEXT: vpsraw $4, %ymm0, %ymm3 636; X86-AVX2-NEXT: vpblendvb %ymm1, %ymm3, %ymm0, %ymm0 637; X86-AVX2-NEXT: vpsraw $2, %ymm0, %ymm3 638; X86-AVX2-NEXT: vpaddw %ymm1, %ymm1, %ymm1 639; 
X86-AVX2-NEXT: vpblendvb %ymm1, %ymm3, %ymm0, %ymm0 640; X86-AVX2-NEXT: vpsraw $1, %ymm0, %ymm3 641; X86-AVX2-NEXT: vpaddw %ymm1, %ymm1, %ymm1 642; X86-AVX2-NEXT: vpblendvb %ymm1, %ymm3, %ymm0, %ymm0 643; X86-AVX2-NEXT: vpsrlw $8, %ymm0, %ymm0 644; X86-AVX2-NEXT: vpackuswb %ymm2, %ymm0, %ymm0 645; X86-AVX2-NEXT: retl 646 %shift = ashr <32 x i8> %a, %b 647 ret <32 x i8> %shift 648} 649 650; 651; Uniform Variable Shifts 652; 653 654define <4 x i64> @splatvar_shift_v4i64(<4 x i64> %a, <4 x i64> %b) nounwind { 655; AVX1-LABEL: splatvar_shift_v4i64: 656; AVX1: # %bb.0: 657; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808] 658; AVX1-NEXT: vpsrlq %xmm1, %xmm2, %xmm2 659; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3 660; AVX1-NEXT: vpsrlq %xmm1, %xmm3, %xmm3 661; AVX1-NEXT: vpxor %xmm2, %xmm3, %xmm3 662; AVX1-NEXT: vpsubq %xmm2, %xmm3, %xmm3 663; AVX1-NEXT: vpsrlq %xmm1, %xmm0, %xmm0 664; AVX1-NEXT: vpxor %xmm2, %xmm0, %xmm0 665; AVX1-NEXT: vpsubq %xmm2, %xmm0, %xmm0 666; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm0 667; AVX1-NEXT: retq 668; 669; AVX2-LABEL: splatvar_shift_v4i64: 670; AVX2: # %bb.0: 671; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm2 = [9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808] 672; AVX2-NEXT: vpsrlq %xmm1, %ymm2, %ymm2 673; AVX2-NEXT: vpsrlq %xmm1, %ymm0, %ymm0 674; AVX2-NEXT: vpxor %ymm2, %ymm0, %ymm0 675; AVX2-NEXT: vpsubq %ymm2, %ymm0, %ymm0 676; AVX2-NEXT: retq 677; 678; XOPAVX1-LABEL: splatvar_shift_v4i64: 679; XOPAVX1: # %bb.0: 680; XOPAVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,1,0,1] 681; XOPAVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 682; XOPAVX1-NEXT: vpsubq %xmm1, %xmm2, %xmm1 683; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 684; XOPAVX1-NEXT: vpshaq %xmm1, %xmm2, %xmm2 685; XOPAVX1-NEXT: vpshaq %xmm1, %xmm0, %xmm0 686; XOPAVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 687; XOPAVX1-NEXT: retq 688; 689; XOPAVX2-LABEL: splatvar_shift_v4i64: 690; XOPAVX2: # %bb.0: 691; XOPAVX2-NEXT: vpbroadcastq 
{{.*#+}} ymm2 = [9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808] 692; XOPAVX2-NEXT: vpsrlq %xmm1, %ymm2, %ymm2 693; XOPAVX2-NEXT: vpsrlq %xmm1, %ymm0, %ymm0 694; XOPAVX2-NEXT: vpxor %ymm2, %ymm0, %ymm0 695; XOPAVX2-NEXT: vpsubq %ymm2, %ymm0, %ymm0 696; XOPAVX2-NEXT: retq 697; 698; AVX512-LABEL: splatvar_shift_v4i64: 699; AVX512: # %bb.0: 700; AVX512-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 701; AVX512-NEXT: vpsraq %xmm1, %zmm0, %zmm0 702; AVX512-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 703; AVX512-NEXT: retq 704; 705; AVX512VL-LABEL: splatvar_shift_v4i64: 706; AVX512VL: # %bb.0: 707; AVX512VL-NEXT: vpsraq %xmm1, %ymm0, %ymm0 708; AVX512VL-NEXT: retq 709; 710; X86-AVX1-LABEL: splatvar_shift_v4i64: 711; X86-AVX1: # %bb.0: 712; X86-AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [0,2147483648,0,2147483648] 713; X86-AVX1-NEXT: vpsrlq %xmm1, %xmm2, %xmm2 714; X86-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3 715; X86-AVX1-NEXT: vpsrlq %xmm1, %xmm3, %xmm3 716; X86-AVX1-NEXT: vpxor %xmm2, %xmm3, %xmm3 717; X86-AVX1-NEXT: vpsubq %xmm2, %xmm3, %xmm3 718; X86-AVX1-NEXT: vpsrlq %xmm1, %xmm0, %xmm0 719; X86-AVX1-NEXT: vpxor %xmm2, %xmm0, %xmm0 720; X86-AVX1-NEXT: vpsubq %xmm2, %xmm0, %xmm0 721; X86-AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm0 722; X86-AVX1-NEXT: retl 723; 724; X86-AVX2-LABEL: splatvar_shift_v4i64: 725; X86-AVX2: # %bb.0: 726; X86-AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [0,2147483648,0,2147483648,0,2147483648,0,2147483648] 727; X86-AVX2-NEXT: vpsrlq %xmm1, %ymm2, %ymm2 728; X86-AVX2-NEXT: vpsrlq %xmm1, %ymm0, %ymm0 729; X86-AVX2-NEXT: vpxor %ymm2, %ymm0, %ymm0 730; X86-AVX2-NEXT: vpsubq %ymm2, %ymm0, %ymm0 731; X86-AVX2-NEXT: retl 732 %splat = shufflevector <4 x i64> %b, <4 x i64> undef, <4 x i32> zeroinitializer 733 %shift = ashr <4 x i64> %a, %splat 734 ret <4 x i64> %shift 735} 736 737define <8 x i32> @splatvar_shift_v8i32(<8 x i32> %a, <8 x i32> %b) nounwind { 738; AVX1-LABEL: splatvar_shift_v8i32: 739; AVX1: # %bb.0: 740; 
AVX1-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero 741; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 742; AVX1-NEXT: vpsrad %xmm1, %xmm2, %xmm2 743; AVX1-NEXT: vpsrad %xmm1, %xmm0, %xmm0 744; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 745; AVX1-NEXT: retq 746; 747; AVX2-LABEL: splatvar_shift_v8i32: 748; AVX2: # %bb.0: 749; AVX2-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero 750; AVX2-NEXT: vpsrad %xmm1, %ymm0, %ymm0 751; AVX2-NEXT: retq 752; 753; XOPAVX1-LABEL: splatvar_shift_v8i32: 754; XOPAVX1: # %bb.0: 755; XOPAVX1-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero 756; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 757; XOPAVX1-NEXT: vpsrad %xmm1, %xmm2, %xmm2 758; XOPAVX1-NEXT: vpsrad %xmm1, %xmm0, %xmm0 759; XOPAVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 760; XOPAVX1-NEXT: retq 761; 762; XOPAVX2-LABEL: splatvar_shift_v8i32: 763; XOPAVX2: # %bb.0: 764; XOPAVX2-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero 765; XOPAVX2-NEXT: vpsrad %xmm1, %ymm0, %ymm0 766; XOPAVX2-NEXT: retq 767; 768; AVX512-LABEL: splatvar_shift_v8i32: 769; AVX512: # %bb.0: 770; AVX512-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero 771; AVX512-NEXT: vpsrad %xmm1, %ymm0, %ymm0 772; AVX512-NEXT: retq 773; 774; AVX512VL-LABEL: splatvar_shift_v8i32: 775; AVX512VL: # %bb.0: 776; AVX512VL-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero 777; AVX512VL-NEXT: vpsrad %xmm1, %ymm0, %ymm0 778; AVX512VL-NEXT: retq 779; 780; X86-AVX1-LABEL: splatvar_shift_v8i32: 781; X86-AVX1: # %bb.0: 782; X86-AVX1-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero 783; X86-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 784; X86-AVX1-NEXT: vpsrad %xmm1, %xmm2, %xmm2 785; X86-AVX1-NEXT: vpsrad %xmm1, %xmm0, %xmm0 786; X86-AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 787; X86-AVX1-NEXT: retl 788; 789; X86-AVX2-LABEL: splatvar_shift_v8i32: 790; X86-AVX2: # %bb.0: 791; X86-AVX2-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero 792; X86-AVX2-NEXT: vpsrad 
%xmm1, %ymm0, %ymm0 793; X86-AVX2-NEXT: retl 794 %splat = shufflevector <8 x i32> %b, <8 x i32> undef, <8 x i32> zeroinitializer 795 %shift = ashr <8 x i32> %a, %splat 796 ret <8 x i32> %shift 797} 798 799define <16 x i16> @splatvar_shift_v16i16(<16 x i16> %a, <16 x i16> %b) nounwind { 800; AVX1-LABEL: splatvar_shift_v16i16: 801; AVX1: # %bb.0: 802; AVX1-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero 803; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 804; AVX1-NEXT: vpsraw %xmm1, %xmm2, %xmm2 805; AVX1-NEXT: vpsraw %xmm1, %xmm0, %xmm0 806; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 807; AVX1-NEXT: retq 808; 809; AVX2-LABEL: splatvar_shift_v16i16: 810; AVX2: # %bb.0: 811; AVX2-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero 812; AVX2-NEXT: vpsraw %xmm1, %ymm0, %ymm0 813; AVX2-NEXT: retq 814; 815; XOPAVX1-LABEL: splatvar_shift_v16i16: 816; XOPAVX1: # %bb.0: 817; XOPAVX1-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero 818; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 819; XOPAVX1-NEXT: vpsraw %xmm1, %xmm2, %xmm2 820; XOPAVX1-NEXT: vpsraw %xmm1, %xmm0, %xmm0 821; XOPAVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 822; XOPAVX1-NEXT: retq 823; 824; XOPAVX2-LABEL: splatvar_shift_v16i16: 825; XOPAVX2: # %bb.0: 826; XOPAVX2-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero 827; XOPAVX2-NEXT: vpsraw %xmm1, %ymm0, %ymm0 828; XOPAVX2-NEXT: retq 829; 830; AVX512-LABEL: splatvar_shift_v16i16: 831; AVX512: # %bb.0: 832; AVX512-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero 833; AVX512-NEXT: vpsraw %xmm1, %ymm0, %ymm0 834; AVX512-NEXT: retq 835; 836; AVX512VL-LABEL: splatvar_shift_v16i16: 837; AVX512VL: # %bb.0: 838; AVX512VL-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero 839; AVX512VL-NEXT: vpsraw %xmm1, %ymm0, %ymm0 840; AVX512VL-NEXT: retq 841; 842; X86-AVX1-LABEL: splatvar_shift_v16i16: 843; X86-AVX1: 
# %bb.0: 844; X86-AVX1-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero 845; X86-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 846; X86-AVX1-NEXT: vpsraw %xmm1, %xmm2, %xmm2 847; X86-AVX1-NEXT: vpsraw %xmm1, %xmm0, %xmm0 848; X86-AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 849; X86-AVX1-NEXT: retl 850; 851; X86-AVX2-LABEL: splatvar_shift_v16i16: 852; X86-AVX2: # %bb.0: 853; X86-AVX2-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero 854; X86-AVX2-NEXT: vpsraw %xmm1, %ymm0, %ymm0 855; X86-AVX2-NEXT: retl 856 %splat = shufflevector <16 x i16> %b, <16 x i16> undef, <16 x i32> zeroinitializer 857 %shift = ashr <16 x i16> %a, %splat 858 ret <16 x i16> %shift 859} 860 861define <32 x i8> @splatvar_shift_v32i8(<32 x i8> %a, <32 x i8> %b) nounwind { 862; AVX1-LABEL: splatvar_shift_v32i8: 863; AVX1: # %bb.0: 864; AVX1-NEXT: vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero 865; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 866; AVX1-NEXT: vpsrlw %xmm1, %xmm2, %xmm2 867; AVX1-NEXT: vpcmpeqd %xmm3, %xmm3, %xmm3 868; AVX1-NEXT: vpsrlw %xmm1, %xmm3, %xmm3 869; AVX1-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] 870; AVX1-NEXT: vpand %xmm3, %xmm2, %xmm2 871; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [32896,32896,32896,32896,32896,32896,32896,32896] 872; AVX1-NEXT: vpsrlw %xmm1, %xmm4, %xmm4 873; AVX1-NEXT: vpxor %xmm4, %xmm2, %xmm2 874; AVX1-NEXT: vpsubb %xmm4, %xmm2, %xmm2 875; AVX1-NEXT: vpsrlw %xmm1, %xmm0, %xmm0 876; AVX1-NEXT: vpand %xmm3, %xmm0, %xmm0 877; AVX1-NEXT: vpxor %xmm4, %xmm0, %xmm0 878; AVX1-NEXT: vpsubb %xmm4, %xmm0, %xmm0 879; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 880; AVX1-NEXT: retq 881; 882; AVX2-LABEL: splatvar_shift_v32i8: 883; AVX2: # %bb.0: 884; AVX2-NEXT: vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero 885; AVX2-NEXT: vpsrlw %xmm1, %ymm0, %ymm0 886; 
AVX2-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2 887; AVX2-NEXT: vpsrlw %xmm1, %xmm2, %xmm2 888; AVX2-NEXT: vpsrlw $8, %xmm2, %xmm2 889; AVX2-NEXT: vpbroadcastb %xmm2, %ymm2 890; AVX2-NEXT: vpand %ymm2, %ymm0, %ymm0 891; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896] 892; AVX2-NEXT: vpsrlw %xmm1, %ymm2, %ymm1 893; AVX2-NEXT: vpxor %ymm1, %ymm0, %ymm0 894; AVX2-NEXT: vpsubb %ymm1, %ymm0, %ymm0 895; AVX2-NEXT: retq 896; 897; XOPAVX1-LABEL: splatvar_shift_v32i8: 898; XOPAVX1: # %bb.0: 899; XOPAVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 900; XOPAVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1 901; XOPAVX1-NEXT: vpsubb %xmm1, %xmm2, %xmm1 902; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 903; XOPAVX1-NEXT: vpshab %xmm1, %xmm2, %xmm2 904; XOPAVX1-NEXT: vpshab %xmm1, %xmm0, %xmm0 905; XOPAVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 906; XOPAVX1-NEXT: retq 907; 908; XOPAVX2-LABEL: splatvar_shift_v32i8: 909; XOPAVX2: # %bb.0: 910; XOPAVX2-NEXT: vpbroadcastb %xmm1, %xmm1 911; XOPAVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2 912; XOPAVX2-NEXT: vpsubb %xmm1, %xmm2, %xmm1 913; XOPAVX2-NEXT: vextracti128 $1, %ymm0, %xmm2 914; XOPAVX2-NEXT: vpshab %xmm1, %xmm2, %xmm2 915; XOPAVX2-NEXT: vpshab %xmm1, %xmm0, %xmm0 916; XOPAVX2-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0 917; XOPAVX2-NEXT: retq 918; 919; AVX512DQ-LABEL: splatvar_shift_v32i8: 920; AVX512DQ: # %bb.0: 921; AVX512DQ-NEXT: vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero 922; AVX512DQ-NEXT: vpsrlw %xmm1, %ymm0, %ymm0 923; AVX512DQ-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2 924; AVX512DQ-NEXT: vpsrlw %xmm1, %xmm2, %xmm2 925; AVX512DQ-NEXT: vpsrlw $8, %xmm2, %xmm2 926; AVX512DQ-NEXT: vpbroadcastb %xmm2, %ymm2 927; AVX512DQ-NEXT: vpand %ymm2, %ymm0, %ymm0 928; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm2 = [32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896] 929; AVX512DQ-NEXT: vpsrlw 
%xmm1, %ymm2, %ymm1 930; AVX512DQ-NEXT: vpxor %ymm1, %ymm0, %ymm0 931; AVX512DQ-NEXT: vpsubb %ymm1, %ymm0, %ymm0 932; AVX512DQ-NEXT: retq 933; 934; AVX512BW-LABEL: splatvar_shift_v32i8: 935; AVX512BW: # %bb.0: 936; AVX512BW-NEXT: vpmovsxbw %ymm0, %zmm0 937; AVX512BW-NEXT: vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero 938; AVX512BW-NEXT: vpsraw %xmm1, %zmm0, %zmm0 939; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0 940; AVX512BW-NEXT: retq 941; 942; AVX512DQVL-LABEL: splatvar_shift_v32i8: 943; AVX512DQVL: # %bb.0: 944; AVX512DQVL-NEXT: vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero 945; AVX512DQVL-NEXT: vpsrlw %xmm1, %ymm0, %ymm0 946; AVX512DQVL-NEXT: vmovdqa {{.*#+}} ymm2 = [32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896] 947; AVX512DQVL-NEXT: vpsrlw %xmm1, %ymm2, %ymm2 948; AVX512DQVL-NEXT: vpcmpeqd %xmm3, %xmm3, %xmm3 949; AVX512DQVL-NEXT: vpsrlw %xmm1, %xmm3, %xmm1 950; AVX512DQVL-NEXT: vpsrlw $8, %xmm1, %xmm1 951; AVX512DQVL-NEXT: vpbroadcastb %xmm1, %ymm1 952; AVX512DQVL-NEXT: vpternlogq $108, %ymm0, %ymm2, %ymm1 953; AVX512DQVL-NEXT: vpsubb %ymm2, %ymm1, %ymm0 954; AVX512DQVL-NEXT: retq 955; 956; AVX512BWVL-LABEL: splatvar_shift_v32i8: 957; AVX512BWVL: # %bb.0: 958; AVX512BWVL-NEXT: vpmovsxbw %ymm0, %zmm0 959; AVX512BWVL-NEXT: vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero 960; AVX512BWVL-NEXT: vpsraw %xmm1, %zmm0, %zmm0 961; AVX512BWVL-NEXT: vpmovwb %zmm0, %ymm0 962; AVX512BWVL-NEXT: retq 963; 964; X86-AVX1-LABEL: splatvar_shift_v32i8: 965; X86-AVX1: # %bb.0: 966; X86-AVX1-NEXT: vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero 967; X86-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 968; X86-AVX1-NEXT: vpsrlw %xmm1, %xmm2, %xmm2 969; X86-AVX1-NEXT: vpcmpeqd %xmm3, %xmm3, 
%xmm3 970; X86-AVX1-NEXT: vpsrlw %xmm1, %xmm3, %xmm3 971; X86-AVX1-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] 972; X86-AVX1-NEXT: vpand %xmm3, %xmm2, %xmm2 973; X86-AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [32896,32896,32896,32896,32896,32896,32896,32896] 974; X86-AVX1-NEXT: vpsrlw %xmm1, %xmm4, %xmm4 975; X86-AVX1-NEXT: vpxor %xmm4, %xmm2, %xmm2 976; X86-AVX1-NEXT: vpsubb %xmm4, %xmm2, %xmm2 977; X86-AVX1-NEXT: vpsrlw %xmm1, %xmm0, %xmm0 978; X86-AVX1-NEXT: vpand %xmm3, %xmm0, %xmm0 979; X86-AVX1-NEXT: vpxor %xmm4, %xmm0, %xmm0 980; X86-AVX1-NEXT: vpsubb %xmm4, %xmm0, %xmm0 981; X86-AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 982; X86-AVX1-NEXT: retl 983; 984; X86-AVX2-LABEL: splatvar_shift_v32i8: 985; X86-AVX2: # %bb.0: 986; X86-AVX2-NEXT: vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero 987; X86-AVX2-NEXT: vpsrlw %xmm1, %ymm0, %ymm0 988; X86-AVX2-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2 989; X86-AVX2-NEXT: vpsrlw %xmm1, %xmm2, %xmm2 990; X86-AVX2-NEXT: vpsrlw $8, %xmm2, %xmm2 991; X86-AVX2-NEXT: vpbroadcastb %xmm2, %ymm2 992; X86-AVX2-NEXT: vpand %ymm2, %ymm0, %ymm0 993; X86-AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896] 994; X86-AVX2-NEXT: vpsrlw %xmm1, %ymm2, %ymm1 995; X86-AVX2-NEXT: vpxor %ymm1, %ymm0, %ymm0 996; X86-AVX2-NEXT: vpsubb %ymm1, %ymm0, %ymm0 997; X86-AVX2-NEXT: retl 998 %splat = shufflevector <32 x i8> %b, <32 x i8> undef, <32 x i32> zeroinitializer 999 %shift = ashr <32 x i8> %a, %splat 1000 ret <32 x i8> %shift 1001} 1002 1003; 1004; Constant Shifts 1005; 1006 1007define <4 x i64> @constant_shift_v4i64(<4 x i64> %a) nounwind { 1008; AVX1-LABEL: constant_shift_v4i64: 1009; AVX1: # %bb.0: 1010; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 1011; AVX1-NEXT: vpsrlq $62, %xmm1, %xmm2 1012; AVX1-NEXT: vpsrlq $31, %xmm1, %xmm1 1013; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = 
xmm1[0,1,2,3],xmm2[4,5,6,7] 1014; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [4294967296,2] 1015; AVX1-NEXT: vpxor %xmm2, %xmm1, %xmm1 1016; AVX1-NEXT: vpsubq %xmm2, %xmm1, %xmm1 1017; AVX1-NEXT: vpsrlq $7, %xmm0, %xmm2 1018; AVX1-NEXT: vpsrlq $1, %xmm0, %xmm0 1019; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm2[4,5,6,7] 1020; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [4611686018427387904,72057594037927936] 1021; AVX1-NEXT: vpxor %xmm2, %xmm0, %xmm0 1022; AVX1-NEXT: vpsubq %xmm2, %xmm0, %xmm0 1023; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 1024; AVX1-NEXT: retq 1025; 1026; AVX2-LABEL: constant_shift_v4i64: 1027; AVX2: # %bb.0: 1028; AVX2-NEXT: vpsrlvq {{.*}}(%rip), %ymm0, %ymm0 1029; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [4611686018427387904,72057594037927936,4294967296,2] 1030; AVX2-NEXT: vpxor %ymm1, %ymm0, %ymm0 1031; AVX2-NEXT: vpsubq %ymm1, %ymm0, %ymm0 1032; AVX2-NEXT: retq 1033; 1034; XOPAVX1-LABEL: constant_shift_v4i64: 1035; XOPAVX1: # %bb.0: 1036; XOPAVX1-NEXT: vpshaq {{.*}}(%rip), %xmm0, %xmm1 1037; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 1038; XOPAVX1-NEXT: vpshaq {{.*}}(%rip), %xmm0, %xmm0 1039; XOPAVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 1040; XOPAVX1-NEXT: retq 1041; 1042; XOPAVX2-LABEL: constant_shift_v4i64: 1043; XOPAVX2: # %bb.0: 1044; XOPAVX2-NEXT: vpsrlvq {{.*}}(%rip), %ymm0, %ymm0 1045; XOPAVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [4611686018427387904,72057594037927936,4294967296,2] 1046; XOPAVX2-NEXT: vpxor %ymm1, %ymm0, %ymm0 1047; XOPAVX2-NEXT: vpsubq %ymm1, %ymm0, %ymm0 1048; XOPAVX2-NEXT: retq 1049; 1050; AVX512-LABEL: constant_shift_v4i64: 1051; AVX512: # %bb.0: 1052; AVX512-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 1053; AVX512-NEXT: vmovdqa {{.*#+}} ymm1 = [1,7,31,62] 1054; AVX512-NEXT: vpsravq %zmm1, %zmm0, %zmm0 1055; AVX512-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 1056; AVX512-NEXT: retq 1057; 1058; AVX512VL-LABEL: constant_shift_v4i64: 1059; AVX512VL: # %bb.0: 1060; AVX512VL-NEXT: vpsravq {{.*}}(%rip), %ymm0, %ymm0 1061; 
AVX512VL-NEXT: retq 1062; 1063; X86-AVX1-LABEL: constant_shift_v4i64: 1064; X86-AVX1: # %bb.0: 1065; X86-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 1066; X86-AVX1-NEXT: vpsrlq $62, %xmm1, %xmm2 1067; X86-AVX1-NEXT: vpsrlq $31, %xmm1, %xmm1 1068; X86-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm2[4,5,6,7] 1069; X86-AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,2,0] 1070; X86-AVX1-NEXT: vpxor %xmm2, %xmm1, %xmm1 1071; X86-AVX1-NEXT: vpsubq %xmm2, %xmm1, %xmm1 1072; X86-AVX1-NEXT: vpsrlq $7, %xmm0, %xmm2 1073; X86-AVX1-NEXT: vpsrlq $1, %xmm0, %xmm0 1074; X86-AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm2[4,5,6,7] 1075; X86-AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1073741824,0,16777216] 1076; X86-AVX1-NEXT: vpxor %xmm2, %xmm0, %xmm0 1077; X86-AVX1-NEXT: vpsubq %xmm2, %xmm0, %xmm0 1078; X86-AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 1079; X86-AVX1-NEXT: retl 1080; 1081; X86-AVX2-LABEL: constant_shift_v4i64: 1082; X86-AVX2: # %bb.0: 1083; X86-AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [1,0,7,0,31,0,62,0] 1084; X86-AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [0,2147483648,0,2147483648,0,2147483648,0,2147483648] 1085; X86-AVX2-NEXT: vpsrlvq %ymm1, %ymm2, %ymm2 1086; X86-AVX2-NEXT: vpsrlvq %ymm1, %ymm0, %ymm0 1087; X86-AVX2-NEXT: vpxor %ymm2, %ymm0, %ymm0 1088; X86-AVX2-NEXT: vpsubq %ymm2, %ymm0, %ymm0 1089; X86-AVX2-NEXT: retl 1090 %shift = ashr <4 x i64> %a, <i64 1, i64 7, i64 31, i64 62> 1091 ret <4 x i64> %shift 1092} 1093 1094define <8 x i32> @constant_shift_v8i32(<8 x i32> %a) nounwind { 1095; AVX1-LABEL: constant_shift_v8i32: 1096; AVX1: # %bb.0: 1097; AVX1-NEXT: vpsrad $7, %xmm0, %xmm1 1098; AVX1-NEXT: vpsrad $5, %xmm0, %xmm2 1099; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1,2,3],xmm1[4,5,6,7] 1100; AVX1-NEXT: vpsrad $6, %xmm0, %xmm2 1101; AVX1-NEXT: vpsrad $4, %xmm0, %xmm3 1102; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1,2,3],xmm2[4,5,6,7] 1103; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7] 1104; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 
1105; AVX1-NEXT: vpsrad $7, %xmm0, %xmm2 1106; AVX1-NEXT: vpsrad $9, %xmm0, %xmm3 1107; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1,2,3],xmm2[4,5,6,7] 1108; AVX1-NEXT: vpsrad $8, %xmm0, %xmm0 1109; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7] 1110; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 1111; AVX1-NEXT: retq 1112; 1113; AVX2-LABEL: constant_shift_v8i32: 1114; AVX2: # %bb.0: 1115; AVX2-NEXT: vpsravd {{.*}}(%rip), %ymm0, %ymm0 1116; AVX2-NEXT: retq 1117; 1118; XOPAVX1-LABEL: constant_shift_v8i32: 1119; XOPAVX1: # %bb.0: 1120; XOPAVX1-NEXT: vpshad {{.*}}(%rip), %xmm0, %xmm1 1121; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 1122; XOPAVX1-NEXT: vpshad {{.*}}(%rip), %xmm0, %xmm0 1123; XOPAVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 1124; XOPAVX1-NEXT: retq 1125; 1126; XOPAVX2-LABEL: constant_shift_v8i32: 1127; XOPAVX2: # %bb.0: 1128; XOPAVX2-NEXT: vpsravd {{.*}}(%rip), %ymm0, %ymm0 1129; XOPAVX2-NEXT: retq 1130; 1131; AVX512-LABEL: constant_shift_v8i32: 1132; AVX512: # %bb.0: 1133; AVX512-NEXT: vpsravd {{.*}}(%rip), %ymm0, %ymm0 1134; AVX512-NEXT: retq 1135; 1136; AVX512VL-LABEL: constant_shift_v8i32: 1137; AVX512VL: # %bb.0: 1138; AVX512VL-NEXT: vpsravd {{.*}}(%rip), %ymm0, %ymm0 1139; AVX512VL-NEXT: retq 1140; 1141; X86-AVX1-LABEL: constant_shift_v8i32: 1142; X86-AVX1: # %bb.0: 1143; X86-AVX1-NEXT: vpsrad $7, %xmm0, %xmm1 1144; X86-AVX1-NEXT: vpsrad $5, %xmm0, %xmm2 1145; X86-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1,2,3],xmm1[4,5,6,7] 1146; X86-AVX1-NEXT: vpsrad $6, %xmm0, %xmm2 1147; X86-AVX1-NEXT: vpsrad $4, %xmm0, %xmm3 1148; X86-AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1,2,3],xmm2[4,5,6,7] 1149; X86-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7] 1150; X86-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 1151; X86-AVX1-NEXT: vpsrad $7, %xmm0, %xmm2 1152; X86-AVX1-NEXT: vpsrad $9, %xmm0, %xmm3 1153; X86-AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1,2,3],xmm2[4,5,6,7] 1154; X86-AVX1-NEXT: vpsrad $8, 
%xmm0, %xmm0 1155; X86-AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7] 1156; X86-AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 1157; X86-AVX1-NEXT: retl 1158; 1159; X86-AVX2-LABEL: constant_shift_v8i32: 1160; X86-AVX2: # %bb.0: 1161; X86-AVX2-NEXT: vpsravd {{\.LCPI.*}}, %ymm0, %ymm0 1162; X86-AVX2-NEXT: retl 1163 %shift = ashr <8 x i32> %a, <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 8, i32 7> 1164 ret <8 x i32> %shift 1165} 1166 1167define <16 x i16> @constant_shift_v16i16(<16 x i16> %a) nounwind { 1168; AVX1-LABEL: constant_shift_v16i16: 1169; AVX1: # %bb.0: 1170; AVX1-NEXT: vpmulhw {{.*}}(%rip), %xmm0, %xmm1 1171; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3,4,5,6,7] 1172; AVX1-NEXT: vpsraw $1, %xmm0, %xmm2 1173; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2,3,4,5,6,7] 1174; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 1175; AVX1-NEXT: vpmulhw {{.*}}(%rip), %xmm0, %xmm0 1176; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 1177; AVX1-NEXT: retq 1178; 1179; AVX2-LABEL: constant_shift_v16i16: 1180; AVX2: # %bb.0: 1181; AVX2-NEXT: vpmulhw {{.*}}(%rip), %ymm0, %ymm1 1182; AVX2-NEXT: vpblendw {{.*#+}} xmm2 = xmm0[0],xmm1[1,2,3,4,5,6,7] 1183; AVX2-NEXT: vpsraw $1, %xmm0, %xmm0 1184; AVX2-NEXT: vpblendw {{.*#+}} xmm0 = xmm2[0],xmm0[1],xmm2[2,3,4,5,6,7] 1185; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] 1186; AVX2-NEXT: retq 1187; 1188; XOPAVX1-LABEL: constant_shift_v16i16: 1189; XOPAVX1: # %bb.0: 1190; XOPAVX1-NEXT: vpshaw {{.*}}(%rip), %xmm0, %xmm1 1191; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 1192; XOPAVX1-NEXT: vpshaw {{.*}}(%rip), %xmm0, %xmm0 1193; XOPAVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 1194; XOPAVX1-NEXT: retq 1195; 1196; XOPAVX2-LABEL: constant_shift_v16i16: 1197; XOPAVX2: # %bb.0: 1198; XOPAVX2-NEXT: vpshaw {{.*}}(%rip), %xmm0, %xmm1 1199; XOPAVX2-NEXT: vextracti128 $1, %ymm0, %xmm0 1200; XOPAVX2-NEXT: vpshaw {{.*}}(%rip), %xmm0, %xmm0 1201; XOPAVX2-NEXT: vinserti128 $1, %xmm0, 
%ymm1, %ymm0 1202; XOPAVX2-NEXT: retq 1203; 1204; AVX512DQ-LABEL: constant_shift_v16i16: 1205; AVX512DQ: # %bb.0: 1206; AVX512DQ-NEXT: vpmovsxwd %ymm0, %zmm0 1207; AVX512DQ-NEXT: vpsravd {{.*}}(%rip), %zmm0, %zmm0 1208; AVX512DQ-NEXT: vpmovdw %zmm0, %ymm0 1209; AVX512DQ-NEXT: retq 1210; 1211; AVX512BW-LABEL: constant_shift_v16i16: 1212; AVX512BW: # %bb.0: 1213; AVX512BW-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 1214; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm1 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15] 1215; AVX512BW-NEXT: vpsravw %zmm1, %zmm0, %zmm0 1216; AVX512BW-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 1217; AVX512BW-NEXT: retq 1218; 1219; AVX512DQVL-LABEL: constant_shift_v16i16: 1220; AVX512DQVL: # %bb.0: 1221; AVX512DQVL-NEXT: vpmovsxwd %ymm0, %zmm0 1222; AVX512DQVL-NEXT: vpsravd {{.*}}(%rip), %zmm0, %zmm0 1223; AVX512DQVL-NEXT: vpmovdw %zmm0, %ymm0 1224; AVX512DQVL-NEXT: retq 1225; 1226; AVX512BWVL-LABEL: constant_shift_v16i16: 1227; AVX512BWVL: # %bb.0: 1228; AVX512BWVL-NEXT: vpsravw {{.*}}(%rip), %ymm0, %ymm0 1229; AVX512BWVL-NEXT: retq 1230; 1231; X86-AVX1-LABEL: constant_shift_v16i16: 1232; X86-AVX1: # %bb.0: 1233; X86-AVX1-NEXT: vpmulhw {{\.LCPI.*}}, %xmm0, %xmm1 1234; X86-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3,4,5,6,7] 1235; X86-AVX1-NEXT: vpsraw $1, %xmm0, %xmm2 1236; X86-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2,3,4,5,6,7] 1237; X86-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 1238; X86-AVX1-NEXT: vpmulhw {{\.LCPI.*}}, %xmm0, %xmm0 1239; X86-AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 1240; X86-AVX1-NEXT: retl 1241; 1242; X86-AVX2-LABEL: constant_shift_v16i16: 1243; X86-AVX2: # %bb.0: 1244; X86-AVX2-NEXT: vpmulhw {{\.LCPI.*}}, %ymm0, %ymm1 1245; X86-AVX2-NEXT: vpblendw {{.*#+}} xmm2 = xmm0[0],xmm1[1,2,3,4,5,6,7] 1246; X86-AVX2-NEXT: vpsraw $1, %xmm0, %xmm0 1247; X86-AVX2-NEXT: vpblendw {{.*#+}} xmm0 = xmm2[0],xmm0[1],xmm2[2,3,4,5,6,7] 1248; X86-AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] 1249; 
X86-AVX2-NEXT: retl 1250 %shift = ashr <16 x i16> %a, <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15> 1251 ret <16 x i16> %shift 1252} 1253 1254define <32 x i8> @constant_shift_v32i8(<32 x i8> %a) nounwind { 1255; AVX1-LABEL: constant_shift_v32i8: 1256; AVX1: # %bb.0: 1257; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 1258; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] 1259; AVX1-NEXT: vpsraw $8, %xmm2, %xmm2 1260; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [2,4,8,16,32,64,128,256] 1261; AVX1-NEXT: vpmullw %xmm3, %xmm2, %xmm2 1262; AVX1-NEXT: vpsrlw $8, %xmm2, %xmm2 1263; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] 1264; AVX1-NEXT: vpsraw $8, %xmm1, %xmm1 1265; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [256,128,64,32,16,8,4,2] 1266; AVX1-NEXT: vpmullw %xmm4, %xmm1, %xmm1 1267; AVX1-NEXT: vpsrlw $8, %xmm1, %xmm1 1268; AVX1-NEXT: vpackuswb %xmm2, %xmm1, %xmm1 1269; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] 1270; AVX1-NEXT: vpsraw $8, %xmm2, %xmm2 1271; AVX1-NEXT: vpmullw %xmm3, %xmm2, %xmm2 1272; AVX1-NEXT: vpsrlw $8, %xmm2, %xmm2 1273; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] 1274; AVX1-NEXT: vpsraw $8, %xmm0, %xmm0 1275; AVX1-NEXT: vpmullw %xmm4, %xmm0, %xmm0 1276; AVX1-NEXT: vpsrlw $8, %xmm0, %xmm0 1277; AVX1-NEXT: vpackuswb %xmm2, %xmm0, %xmm0 1278; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 1279; AVX1-NEXT: retq 1280; 1281; AVX2-LABEL: constant_shift_v32i8: 1282; AVX2: # %bb.0: 1283; AVX2-NEXT: vpunpckhbw {{.*#+}} ymm1 = ymm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31] 1284; AVX2-NEXT: vpsraw $8, %ymm1, %ymm1 1285; AVX2-NEXT: vpmullw {{.*}}(%rip), %ymm1, %ymm1 1286; AVX2-NEXT: vpsrlw $8, %ymm1, %ymm1 1287; AVX2-NEXT: vpunpcklbw {{.*#+}} ymm0 = 
ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23] 1288; AVX2-NEXT: vpsraw $8, %ymm0, %ymm0 1289; AVX2-NEXT: vpmullw {{.*}}(%rip), %ymm0, %ymm0 1290; AVX2-NEXT: vpsrlw $8, %ymm0, %ymm0 1291; AVX2-NEXT: vpackuswb %ymm1, %ymm0, %ymm0 1292; AVX2-NEXT: retq 1293; 1294; XOPAVX1-LABEL: constant_shift_v32i8: 1295; XOPAVX1: # %bb.0: 1296; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 1297; XOPAVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [0,255,254,253,252,251,250,249,249,250,251,252,253,254,255,0] 1298; XOPAVX1-NEXT: vpshab %xmm2, %xmm1, %xmm1 1299; XOPAVX1-NEXT: vpshab %xmm2, %xmm0, %xmm0 1300; XOPAVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 1301; XOPAVX1-NEXT: retq 1302; 1303; XOPAVX2-LABEL: constant_shift_v32i8: 1304; XOPAVX2: # %bb.0: 1305; XOPAVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 1306; XOPAVX2-NEXT: vmovdqa {{.*#+}} xmm2 = [0,255,254,253,252,251,250,249,249,250,251,252,253,254,255,0] 1307; XOPAVX2-NEXT: vpshab %xmm2, %xmm1, %xmm1 1308; XOPAVX2-NEXT: vpshab %xmm2, %xmm0, %xmm0 1309; XOPAVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 1310; XOPAVX2-NEXT: retq 1311; 1312; AVX512DQ-LABEL: constant_shift_v32i8: 1313; AVX512DQ: # %bb.0: 1314; AVX512DQ-NEXT: vpunpckhbw {{.*#+}} ymm1 = ymm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31] 1315; AVX512DQ-NEXT: vpsraw $8, %ymm1, %ymm1 1316; AVX512DQ-NEXT: vpmullw {{.*}}(%rip), %ymm1, %ymm1 1317; AVX512DQ-NEXT: vpsrlw $8, %ymm1, %ymm1 1318; AVX512DQ-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23] 1319; AVX512DQ-NEXT: vpsraw $8, %ymm0, %ymm0 1320; AVX512DQ-NEXT: vpmullw {{.*}}(%rip), %ymm0, %ymm0 1321; AVX512DQ-NEXT: vpsrlw $8, %ymm0, %ymm0 1322; AVX512DQ-NEXT: vpackuswb %ymm1, %ymm0, %ymm0 1323; AVX512DQ-NEXT: retq 1324; 1325; AVX512BW-LABEL: constant_shift_v32i8: 1326; AVX512BW: # %bb.0: 1327; AVX512BW-NEXT: vpmovsxbw %ymm0, %zmm0 1328; AVX512BW-NEXT: vpsravw {{.*}}(%rip), %zmm0, %zmm0 
; AVX512BW-NEXT:    vpmovwb %zmm0, %ymm0
; AVX512BW-NEXT:    retq
;
; AVX512DQVL-LABEL: constant_shift_v32i8:
; AVX512DQVL:       # %bb.0:
; AVX512DQVL-NEXT:    vpunpckhbw {{.*#+}} ymm1 = ymm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
; AVX512DQVL-NEXT:    vpsraw $8, %ymm1, %ymm1
; AVX512DQVL-NEXT:    vpmullw {{.*}}(%rip), %ymm1, %ymm1
; AVX512DQVL-NEXT:    vpsrlw $8, %ymm1, %ymm1
; AVX512DQVL-NEXT:    vpunpcklbw {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
; AVX512DQVL-NEXT:    vpsraw $8, %ymm0, %ymm0
; AVX512DQVL-NEXT:    vpmullw {{.*}}(%rip), %ymm0, %ymm0
; AVX512DQVL-NEXT:    vpsrlw $8, %ymm0, %ymm0
; AVX512DQVL-NEXT:    vpackuswb %ymm1, %ymm0, %ymm0
; AVX512DQVL-NEXT:    retq
;
; AVX512BWVL-LABEL: constant_shift_v32i8:
; AVX512BWVL:       # %bb.0:
; AVX512BWVL-NEXT:    vpmovsxbw %ymm0, %zmm0
; AVX512BWVL-NEXT:    vpsravw {{.*}}(%rip), %zmm0, %zmm0
; AVX512BWVL-NEXT:    vpmovwb %zmm0, %ymm0
; AVX512BWVL-NEXT:    retq
;
; X86-AVX1-LABEL: constant_shift_v32i8:
; X86-AVX1:       # %bb.0:
; X86-AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
; X86-AVX1-NEXT:    vpunpckhbw {{.*#+}} xmm2 = xmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
; X86-AVX1-NEXT:    vpsraw $8, %xmm2, %xmm2
; X86-AVX1-NEXT:    vmovdqa {{.*#+}} xmm3 = [2,4,8,16,32,64,128,256]
; X86-AVX1-NEXT:    vpmullw %xmm3, %xmm2, %xmm2
; X86-AVX1-NEXT:    vpsrlw $8, %xmm2, %xmm2
; X86-AVX1-NEXT:    vpunpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; X86-AVX1-NEXT:    vpsraw $8, %xmm1, %xmm1
; X86-AVX1-NEXT:    vmovdqa {{.*#+}} xmm4 = [256,128,64,32,16,8,4,2]
; X86-AVX1-NEXT:    vpmullw %xmm4, %xmm1, %xmm1
; X86-AVX1-NEXT:    vpsrlw $8, %xmm1, %xmm1
; X86-AVX1-NEXT:    vpackuswb %xmm2, %xmm1, %xmm1
; X86-AVX1-NEXT:    vpunpckhbw {{.*#+}} xmm2 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
; X86-AVX1-NEXT:    vpsraw $8, %xmm2, %xmm2
; X86-AVX1-NEXT:    vpmullw %xmm3, %xmm2, %xmm2
; X86-AVX1-NEXT:    vpsrlw $8, %xmm2, %xmm2
; X86-AVX1-NEXT:    vpunpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; X86-AVX1-NEXT:    vpsraw $8, %xmm0, %xmm0
; X86-AVX1-NEXT:    vpmullw %xmm4, %xmm0, %xmm0
; X86-AVX1-NEXT:    vpsrlw $8, %xmm0, %xmm0
; X86-AVX1-NEXT:    vpackuswb %xmm2, %xmm0, %xmm0
; X86-AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
; X86-AVX1-NEXT:    retl
;
; X86-AVX2-LABEL: constant_shift_v32i8:
; X86-AVX2:       # %bb.0:
; X86-AVX2-NEXT:    vpunpckhbw {{.*#+}} ymm1 = ymm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
; X86-AVX2-NEXT:    vpsraw $8, %ymm1, %ymm1
; X86-AVX2-NEXT:    vpmullw {{\.LCPI.*}}, %ymm1, %ymm1
; X86-AVX2-NEXT:    vpsrlw $8, %ymm1, %ymm1
; X86-AVX2-NEXT:    vpunpcklbw {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
; X86-AVX2-NEXT:    vpsraw $8, %ymm0, %ymm0
; X86-AVX2-NEXT:    vpmullw {{\.LCPI.*}}, %ymm0, %ymm0
; X86-AVX2-NEXT:    vpsrlw $8, %ymm0, %ymm0
; X86-AVX2-NEXT:    vpackuswb %ymm1, %ymm0, %ymm0
; X86-AVX2-NEXT:    retl
  %shift = ashr <32 x i8> %a, <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0>
  ret <32 x i8> %shift
}

;
; Uniform Constant Shifts
;
; NOTE(review): the expected-output comment lines in the tests below are
; autogenerated by utils/update_llc_test_checks.py; regenerate them with that
; script after compiler changes rather than editing them by hand.

; ashr of <4 x i64> by the splat constant 7 (element count/width per the IR below).
define <4 x i64> @splatconstant_shift_v4i64(<4 x i64> %a) nounwind {
; AVX1-LABEL: splatconstant_shift_v4i64:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT:    vpsrad $7, %xmm1, %xmm2
; AVX1-NEXT:    vpsrlq $7, %xmm1, %xmm1
; AVX1-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7]
; AVX1-NEXT:    vpsrad $7, %xmm0, %xmm2
; AVX1-NEXT:    vpsrlq $7, %xmm0, %xmm0
; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7]
; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: splatconstant_shift_v4i64:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpsrad $7, %ymm0, %ymm1
; AVX2-NEXT:    vpsrlq $7, %ymm0, %ymm0
; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7]
; AVX2-NEXT:    retq
;
; XOPAVX1-LABEL: splatconstant_shift_v4i64:
; XOPAVX1:       # %bb.0:
; XOPAVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
; XOPAVX1-NEXT:    vmovdqa {{.*#+}} xmm2 = [18446744073709551609,18446744073709551609]
; XOPAVX1-NEXT:    vpshaq %xmm2, %xmm1, %xmm1
; XOPAVX1-NEXT:    vpshaq %xmm2, %xmm0, %xmm0
; XOPAVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
; XOPAVX1-NEXT:    retq
;
; XOPAVX2-LABEL: splatconstant_shift_v4i64:
; XOPAVX2:       # %bb.0:
; XOPAVX2-NEXT:    vpsrad $7, %ymm0, %ymm1
; XOPAVX2-NEXT:    vpsrlq $7, %ymm0, %ymm0
; XOPAVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7]
; XOPAVX2-NEXT:    retq
;
; AVX512-LABEL: splatconstant_shift_v4i64:
; AVX512:       # %bb.0:
; AVX512-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
; AVX512-NEXT:    vpsraq $7, %zmm0, %zmm0
; AVX512-NEXT:    # kill: def $ymm0 killed $ymm0 killed $zmm0
; AVX512-NEXT:    retq
;
; AVX512VL-LABEL: splatconstant_shift_v4i64:
; AVX512VL:       # %bb.0:
; AVX512VL-NEXT:    vpsraq $7, %ymm0, %ymm0
; AVX512VL-NEXT:    retq
;
; X86-AVX1-LABEL: splatconstant_shift_v4i64:
; X86-AVX1:       # %bb.0:
; X86-AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
; X86-AVX1-NEXT:    vpsrad $7, %xmm1, %xmm2
; X86-AVX1-NEXT:    vpsrlq $7, %xmm1, %xmm1
; X86-AVX1-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7]
; X86-AVX1-NEXT:    vpsrad $7, %xmm0, %xmm2
; X86-AVX1-NEXT:    vpsrlq $7, %xmm0, %xmm0
; X86-AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7]
; X86-AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
; X86-AVX1-NEXT:    retl
;
; X86-AVX2-LABEL: splatconstant_shift_v4i64:
; X86-AVX2:       # %bb.0:
; X86-AVX2-NEXT:    vpsrad $7, %ymm0, %ymm1
; X86-AVX2-NEXT:    vpsrlq $7, %ymm0, %ymm0
; X86-AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7]
; X86-AVX2-NEXT:    retl
  %shift = ashr <4 x i64> %a, <i64 7, i64 7, i64 7, i64 7>
  ret <4 x i64> %shift
}

; ashr of <8 x i32> by the splat constant 5.
define <8 x i32> @splatconstant_shift_v8i32(<8 x i32> %a) nounwind {
; AVX1-LABEL: splatconstant_shift_v8i32:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vpsrad $5, %xmm0, %xmm1
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
; AVX1-NEXT:    vpsrad $5, %xmm0, %xmm0
; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: splatconstant_shift_v8i32:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpsrad $5, %ymm0, %ymm0
; AVX2-NEXT:    retq
;
; XOPAVX1-LABEL: splatconstant_shift_v8i32:
; XOPAVX1:       # %bb.0:
; XOPAVX1-NEXT:    vpsrad $5, %xmm0, %xmm1
; XOPAVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
; XOPAVX1-NEXT:    vpsrad $5, %xmm0, %xmm0
; XOPAVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
; XOPAVX1-NEXT:    retq
;
; XOPAVX2-LABEL: splatconstant_shift_v8i32:
; XOPAVX2:       # %bb.0:
; XOPAVX2-NEXT:    vpsrad $5, %ymm0, %ymm0
; XOPAVX2-NEXT:    retq
;
; AVX512-LABEL: splatconstant_shift_v8i32:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vpsrad $5, %ymm0, %ymm0
; AVX512-NEXT:    retq
;
; AVX512VL-LABEL: splatconstant_shift_v8i32:
; AVX512VL:       # %bb.0:
; AVX512VL-NEXT:    vpsrad $5, %ymm0, %ymm0
; AVX512VL-NEXT:    retq
;
; X86-AVX1-LABEL: splatconstant_shift_v8i32:
; X86-AVX1:       # %bb.0:
; X86-AVX1-NEXT:    vpsrad $5, %xmm0, %xmm1
; X86-AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
; X86-AVX1-NEXT:    vpsrad $5, %xmm0, %xmm0
; X86-AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
; X86-AVX1-NEXT:    retl
;
; X86-AVX2-LABEL: splatconstant_shift_v8i32:
; X86-AVX2:       # %bb.0:
; X86-AVX2-NEXT:    vpsrad $5, %ymm0, %ymm0
; X86-AVX2-NEXT:    retl
  %shift = ashr <8 x i32> %a, <i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5>
  ret <8 x i32> %shift
}

; ashr of <16 x i16> by the splat constant 3.
define <16 x i16> @splatconstant_shift_v16i16(<16 x i16> %a) nounwind {
; AVX1-LABEL: splatconstant_shift_v16i16:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vpsraw $3, %xmm0, %xmm1
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
; AVX1-NEXT:    vpsraw $3, %xmm0, %xmm0
; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: splatconstant_shift_v16i16:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpsraw $3, %ymm0, %ymm0
; AVX2-NEXT:    retq
;
; XOPAVX1-LABEL: splatconstant_shift_v16i16:
; XOPAVX1:       # %bb.0:
; XOPAVX1-NEXT:    vpsraw $3, %xmm0, %xmm1
; XOPAVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
; XOPAVX1-NEXT:    vpsraw $3, %xmm0, %xmm0
; XOPAVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
; XOPAVX1-NEXT:    retq
;
; XOPAVX2-LABEL: splatconstant_shift_v16i16:
; XOPAVX2:       # %bb.0:
; XOPAVX2-NEXT:    vpsraw $3, %ymm0, %ymm0
; XOPAVX2-NEXT:    retq
;
; AVX512-LABEL: splatconstant_shift_v16i16:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vpsraw $3, %ymm0, %ymm0
; AVX512-NEXT:    retq
;
; AVX512VL-LABEL: splatconstant_shift_v16i16:
; AVX512VL:       # %bb.0:
; AVX512VL-NEXT:    vpsraw $3, %ymm0, %ymm0
; AVX512VL-NEXT:    retq
;
; X86-AVX1-LABEL: splatconstant_shift_v16i16:
; X86-AVX1:       # %bb.0:
; X86-AVX1-NEXT:    vpsraw $3, %xmm0, %xmm1
; X86-AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
; X86-AVX1-NEXT:    vpsraw $3, %xmm0, %xmm0
; X86-AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
; X86-AVX1-NEXT:    retl
;
; X86-AVX2-LABEL: splatconstant_shift_v16i16:
; X86-AVX2:       # %bb.0:
; X86-AVX2-NEXT:    vpsraw $3, %ymm0, %ymm0
; X86-AVX2-NEXT:    retl
  %shift = ashr <16 x i16> %a, <i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3>
  ret <16 x i16> %shift
}

; ashr of <32 x i8> by the splat constant 3. There is no byte-granularity
; arithmetic shift instruction short of XOP's vpshab, hence the
; srl+and+xor+sub sign-extension idiom in the non-XOP expected output.
define <32 x i8> @splatconstant_shift_v32i8(<32 x i8> %a) nounwind {
; AVX1-LABEL: splatconstant_shift_v32i8:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT:    vpsrlw $3, %xmm1, %xmm1
; AVX1-NEXT:    vmovdqa {{.*#+}} xmm2 = [31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31]
; AVX1-NEXT:    vpand %xmm2, %xmm1, %xmm1
; AVX1-NEXT:    vmovdqa {{.*#+}} xmm3 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
; AVX1-NEXT:    vpxor %xmm3, %xmm1, %xmm1
; AVX1-NEXT:    vpsubb %xmm3, %xmm1, %xmm1
; AVX1-NEXT:    vpsrlw $3, %xmm0, %xmm0
; AVX1-NEXT:    vpand %xmm2, %xmm0, %xmm0
; AVX1-NEXT:    vpxor %xmm3, %xmm0, %xmm0
; AVX1-NEXT:    vpsubb %xmm3, %xmm0, %xmm0
; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: splatconstant_shift_v32i8:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpsrlw $3, %ymm0, %ymm0
; AVX2-NEXT:    vpand {{.*}}(%rip), %ymm0, %ymm0
; AVX2-NEXT:    vmovdqa {{.*#+}} ymm1 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
; AVX2-NEXT:    vpxor %ymm1, %ymm0, %ymm0
; AVX2-NEXT:    vpsubb %ymm1, %ymm0, %ymm0
; AVX2-NEXT:    retq
;
; XOPAVX1-LABEL: splatconstant_shift_v32i8:
; XOPAVX1:       # %bb.0:
; XOPAVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
; XOPAVX1-NEXT:    vmovdqa {{.*#+}} xmm2 = [253,253,253,253,253,253,253,253,253,253,253,253,253,253,253,253]
; XOPAVX1-NEXT:    vpshab %xmm2, %xmm1, %xmm1
; XOPAVX1-NEXT:    vpshab %xmm2, %xmm0, %xmm0
; XOPAVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
; XOPAVX1-NEXT:    retq
;
; XOPAVX2-LABEL: splatconstant_shift_v32i8:
; XOPAVX2:       # %bb.0:
; XOPAVX2-NEXT:    vpsrlw $3, %ymm0, %ymm0
; XOPAVX2-NEXT:    vpand {{.*}}(%rip), %ymm0, %ymm0
; XOPAVX2-NEXT:    vmovdqa {{.*#+}} ymm1 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
; XOPAVX2-NEXT:    vpxor %ymm1, %ymm0, %ymm0
; XOPAVX2-NEXT:    vpsubb %ymm1, %ymm0, %ymm0
; XOPAVX2-NEXT:    retq
;
; AVX512-LABEL: splatconstant_shift_v32i8:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vpsrlw $3, %ymm0, %ymm0
; AVX512-NEXT:    vpand {{.*}}(%rip), %ymm0, %ymm0
; AVX512-NEXT:    vmovdqa {{.*#+}} ymm1 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
; AVX512-NEXT:    vpxor %ymm1, %ymm0, %ymm0
; AVX512-NEXT:    vpsubb %ymm1, %ymm0, %ymm0
; AVX512-NEXT:    retq
;
; AVX512VL-LABEL: splatconstant_shift_v32i8:
; AVX512VL:       # %bb.0:
; AVX512VL-NEXT:    vpsrlw $3, %ymm0, %ymm0
; AVX512VL-NEXT:    vmovdqa {{.*#+}} ymm1 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
; AVX512VL-NEXT:    vpternlogq $108, {{.*}}(%rip), %ymm1, %ymm0
; AVX512VL-NEXT:    vpsubb %ymm1, %ymm0, %ymm0
; AVX512VL-NEXT:    retq
;
; X86-AVX1-LABEL: splatconstant_shift_v32i8:
; X86-AVX1:       # %bb.0:
; X86-AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
; X86-AVX1-NEXT:    vpsrlw $3, %xmm1, %xmm1
; X86-AVX1-NEXT:    vmovdqa {{.*#+}} xmm2 = [31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31]
; X86-AVX1-NEXT:    vpand %xmm2, %xmm1, %xmm1
; X86-AVX1-NEXT:    vmovdqa {{.*#+}} xmm3 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
; X86-AVX1-NEXT:    vpxor %xmm3, %xmm1, %xmm1
; X86-AVX1-NEXT:    vpsubb %xmm3, %xmm1, %xmm1
; X86-AVX1-NEXT:    vpsrlw $3, %xmm0, %xmm0
; X86-AVX1-NEXT:    vpand %xmm2, %xmm0, %xmm0
; X86-AVX1-NEXT:    vpxor %xmm3, %xmm0, %xmm0
; X86-AVX1-NEXT:    vpsubb %xmm3, %xmm0, %xmm0
; X86-AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
; X86-AVX1-NEXT:    retl
;
; X86-AVX2-LABEL: splatconstant_shift_v32i8:
; X86-AVX2:       # %bb.0:
; X86-AVX2-NEXT:    vpsrlw $3, %ymm0, %ymm0
; X86-AVX2-NEXT:    vpand {{\.LCPI.*}}, %ymm0, %ymm0
; X86-AVX2-NEXT:    vmovdqa {{.*#+}} ymm1 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
; X86-AVX2-NEXT:    vpxor %ymm1, %ymm0, %ymm0
; X86-AVX2-NEXT:    vpsubb %ymm1, %ymm0, %ymm0
; X86-AVX2-NEXT:    retl
  %shift = ashr <32 x i8> %a, <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3>
  ret <32 x i8> %shift
}