; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefixes=CHECK,SSE
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=CHECK,AVX,AVX2-SLOW
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2,+fast-variable-shuffle | FileCheck %s --check-prefixes=CHECK,AVX,AVX2-FAST

; fold (srl 0, x) -> 0
define <4 x i32> @combine_vec_lshr_zero(<4 x i32> %x) {
; SSE-LABEL: combine_vec_lshr_zero:
; SSE: # %bb.0:
; SSE-NEXT: xorps %xmm0, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: combine_vec_lshr_zero:
; AVX: # %bb.0:
; AVX-NEXT: vxorps %xmm0, %xmm0, %xmm0
; AVX-NEXT: retq
  %1 = lshr <4 x i32> zeroinitializer, %x
  ret <4 x i32> %1
}

; fold (srl x, c >= size(x)) -> undef
define <4 x i32> @combine_vec_lshr_outofrange0(<4 x i32> %x) {
; CHECK-LABEL: combine_vec_lshr_outofrange0:
; CHECK: # %bb.0:
; CHECK-NEXT: retq
  %1 = lshr <4 x i32> %x, <i32 33, i32 33, i32 33, i32 33>
  ret <4 x i32> %1
}

define <4 x i32> @combine_vec_lshr_outofrange1(<4 x i32> %x) {
; CHECK-LABEL: combine_vec_lshr_outofrange1:
; CHECK: # %bb.0:
; CHECK-NEXT: retq
  %1 = lshr <4 x i32> %x, <i32 33, i32 34, i32 35, i32 36>
  ret <4 x i32> %1
}

define <4 x i32> @combine_vec_lshr_outofrange2(<4 x i32> %x) {
; CHECK-LABEL: combine_vec_lshr_outofrange2:
; CHECK: # %bb.0:
; CHECK-NEXT: retq
  %1 = lshr <4 x i32> %x, <i32 33, i32 34, i32 35, i32 undef>
  ret <4 x i32> %1
}

; fold (srl x, 0) -> x
define <4 x i32> @combine_vec_lshr_by_zero(<4 x i32> %x) {
; CHECK-LABEL: combine_vec_lshr_by_zero:
; CHECK: # %bb.0:
; CHECK-NEXT: retq
  %1 = lshr <4 x i32> %x, zeroinitializer
  ret <4 x i32> %1
}

; if (srl x, c) is known to be zero, return 0
define <4 x i32> @combine_vec_lshr_known_zero0(<4 x i32> %x) {
; SSE-LABEL: combine_vec_lshr_known_zero0:
; SSE: # %bb.0:
; SSE-NEXT: xorps %xmm0, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: combine_vec_lshr_known_zero0:
; AVX: # %bb.0:
; AVX-NEXT: vxorps %xmm0, %xmm0, %xmm0
; AVX-NEXT: retq
  %1 = and <4 x i32> %x, <i32 15, i32 15, i32 15, i32 15>
  %2 = lshr <4 x i32> %1, <i32 4, i32 4, i32 4, i32 4>
  ret <4 x i32> %2
}

define <4 x i32> @combine_vec_lshr_known_zero1(<4 x i32> %x) {
; SSE-LABEL: combine_vec_lshr_known_zero1:
; SSE: # %bb.0:
; SSE-NEXT: xorps %xmm0, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: combine_vec_lshr_known_zero1:
; AVX: # %bb.0:
; AVX-NEXT: vpbroadcastd {{.*#+}} xmm1 = [15,15,15,15]
; AVX-NEXT: vpand %xmm1, %xmm0, %xmm0
; AVX-NEXT: vpsrlvd {{.*}}(%rip), %xmm0, %xmm0
; AVX-NEXT: retq
  %1 = and <4 x i32> %x, <i32 15, i32 15, i32 15, i32 15>
  %2 = lshr <4 x i32> %1, <i32 8, i32 9, i32 10, i32 11>
  ret <4 x i32> %2
}

; fold (srl (srl x, c1), c2) -> (srl x, (add c1, c2))
define <4 x i32> @combine_vec_lshr_lshr0(<4 x i32> %x) {
; SSE-LABEL: combine_vec_lshr_lshr0:
; SSE: # %bb.0:
; SSE-NEXT: psrld $6, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: combine_vec_lshr_lshr0:
; AVX: # %bb.0:
; AVX-NEXT: vpsrld $6, %xmm0, %xmm0
; AVX-NEXT: retq
  %1 = lshr <4 x i32> %x, <i32 2, i32 2, i32 2, i32 2>
  %2 = lshr <4 x i32> %1, <i32 4, i32 4, i32 4, i32 4>
  ret <4 x i32> %2
}

define <4 x i32> @combine_vec_lshr_lshr1(<4 x i32> %x) {
; SSE-LABEL: combine_vec_lshr_lshr1:
; SSE: # %bb.0:
; SSE-NEXT: movdqa %xmm0, %xmm1
; SSE-NEXT: psrld $10, %xmm1
; SSE-NEXT: movdqa %xmm0, %xmm2
; SSE-NEXT: psrld $6, %xmm2
; SSE-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm1[4,5,6,7]
; SSE-NEXT: movdqa %xmm0, %xmm1
; SSE-NEXT: psrld $8, %xmm1
; SSE-NEXT: psrld $4, %xmm0
; SSE-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5,6,7]
; SSE-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7]
; SSE-NEXT: retq
;
; AVX-LABEL: combine_vec_lshr_lshr1:
; AVX: # %bb.0:
; AVX-NEXT: vpsrlvd {{.*}}(%rip), %xmm0, %xmm0
; AVX-NEXT: retq
  %1 = lshr <4 x i32> %x, <i32 0, i32 1, i32 2, i32 3>
  %2 = lshr <4 x i32> %1, <i32 4, i32 5, i32 6, i32 7>
  ret <4 x i32> %2
}

; fold (srl (srl x, c1), c2) -> 0
define <4 x i32> @combine_vec_lshr_lshr_zero0(<4 x i32> %x) {
; SSE-LABEL: combine_vec_lshr_lshr_zero0:
; SSE: # %bb.0:
; SSE-NEXT: xorps %xmm0, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: combine_vec_lshr_lshr_zero0:
; AVX: # %bb.0:
; AVX-NEXT: vxorps %xmm0, %xmm0, %xmm0
; AVX-NEXT: retq
  %1 = lshr <4 x i32> %x, <i32 16, i32 16, i32 16, i32 16>
  %2 = lshr <4 x i32> %1, <i32 20, i32 20, i32 20, i32 20>
  ret <4 x i32> %2
}

define <4 x i32> @combine_vec_lshr_lshr_zero1(<4 x i32> %x) {
; SSE-LABEL: combine_vec_lshr_lshr_zero1:
; SSE: # %bb.0:
; SSE-NEXT: xorps %xmm0, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: combine_vec_lshr_lshr_zero1:
; AVX: # %bb.0:
; AVX-NEXT: vxorps %xmm0, %xmm0, %xmm0
; AVX-NEXT: retq
  %1 = lshr <4 x i32> %x, <i32 17, i32 18, i32 19, i32 20>
  %2 = lshr <4 x i32> %1, <i32 25, i32 26, i32 27, i32 28>
  ret <4 x i32> %2
}

; fold (srl (trunc (srl x, c1)), c2) -> (trunc (srl x, (add c1, c2)))
define <4 x i32> @combine_vec_lshr_trunc_lshr0(<4 x i64> %x) {
; SSE-LABEL: combine_vec_lshr_trunc_lshr0:
; SSE: # %bb.0:
; SSE-NEXT: psrlq $48, %xmm1
; SSE-NEXT: psrlq $48, %xmm0
; SSE-NEXT: packusdw %xmm1, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: combine_vec_lshr_trunc_lshr0:
; AVX: # %bb.0:
; AVX-NEXT: vpsrlq $48, %ymm0, %ymm0
; AVX-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX-NEXT: vpackusdw %xmm1, %xmm0, %xmm0
; AVX-NEXT: vzeroupper
; AVX-NEXT: retq
  %1 = lshr <4 x i64> %x, <i64 32, i64 32, i64 32, i64 32>
  %2 = trunc <4 x i64> %1 to <4 x i32>
  %3 = lshr <4 x i32> %2, <i32 16, i32 16, i32 16, i32 16>
  ret <4 x i32> %3
}

define <4 x i32> @combine_vec_lshr_trunc_lshr1(<4 x i64> %x) {
; SSE-LABEL: combine_vec_lshr_trunc_lshr1:
; SSE: # %bb.0:
; SSE-NEXT: movdqa %xmm1, %xmm2
; SSE-NEXT: psrlq $35, %xmm2
; SSE-NEXT: psrlq $34, %xmm1
; SSE-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm2[4,5,6,7]
; SSE-NEXT: movdqa %xmm0, %xmm2
; SSE-NEXT: psrlq $33, %xmm2
; SSE-NEXT: psrlq $32, %xmm0
; SSE-NEXT: pblendw {{.*#+}} xmm2 = xmm0[0,1,2,3],xmm2[4,5,6,7]
; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,2],xmm1[0,2]
; SSE-NEXT: movaps %xmm2, %xmm1
; SSE-NEXT: psrld $19, %xmm1
; SSE-NEXT: movaps %xmm2, %xmm3
; SSE-NEXT: psrld $17, %xmm3
; SSE-NEXT: pblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm1[4,5,6,7]
; SSE-NEXT: psrld $18, %xmm2
; SSE-NEXT: psrld $16, %xmm0
; SSE-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm2[4,5,6,7]
; SSE-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm3[2,3],xmm0[4,5],xmm3[6,7]
; SSE-NEXT: retq
;
; AVX2-SLOW-LABEL: combine_vec_lshr_trunc_lshr1:
; AVX2-SLOW: # %bb.0:
; AVX2-SLOW-NEXT: vpsrlvq {{.*}}(%rip), %ymm0, %ymm0
; AVX2-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
; AVX2-SLOW-NEXT: vpsrlvd {{.*}}(%rip), %xmm0, %xmm0
; AVX2-SLOW-NEXT: vzeroupper
; AVX2-SLOW-NEXT: retq
;
; AVX2-FAST-LABEL: combine_vec_lshr_trunc_lshr1:
; AVX2-FAST: # %bb.0:
; AVX2-FAST-NEXT: vpsrlvq {{.*}}(%rip), %ymm0, %ymm0
; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = <0,2,4,6,u,u,u,u>
; AVX2-FAST-NEXT: vpermd %ymm0, %ymm1, %ymm0
; AVX2-FAST-NEXT: vpsrlvd {{.*}}(%rip), %xmm0, %xmm0
; AVX2-FAST-NEXT: vzeroupper
; AVX2-FAST-NEXT: retq
  %1 = lshr <4 x i64> %x, <i64 32, i64 33, i64 34, i64 35>
  %2 = trunc <4 x i64> %1 to <4 x i32>
  %3 = lshr <4 x i32> %2, <i32 16, i32 17, i32 18, i32 19>
  ret <4 x i32> %3
}

; fold (srl (trunc (srl x, c1)), c2) -> 0
define <4 x i32> @combine_vec_lshr_trunc_lshr_zero0(<4 x i64> %x) {
; SSE-LABEL: combine_vec_lshr_trunc_lshr_zero0:
; SSE: # %bb.0:
; SSE-NEXT: xorps %xmm0, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: combine_vec_lshr_trunc_lshr_zero0:
; AVX: # %bb.0:
; AVX-NEXT: vxorps %xmm0, %xmm0, %xmm0
; AVX-NEXT: retq
  %1 = lshr <4 x i64> %x, <i64 48, i64 48, i64 48, i64 48>
  %2 = trunc <4 x i64> %1 to <4 x i32>
  %3 = lshr <4 x i32> %2, <i32 24, i32 24, i32 24, i32 24>
  ret <4 x i32> %3
}

define <4 x i32> @combine_vec_lshr_trunc_lshr_zero1(<4 x i64> %x) {
; SSE-LABEL: combine_vec_lshr_trunc_lshr_zero1:
; SSE: # %bb.0:
; SSE-NEXT: xorps %xmm0, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: combine_vec_lshr_trunc_lshr_zero1:
; AVX: # %bb.0:
; AVX-NEXT: vpsrlvq {{.*}}(%rip), %ymm0, %ymm0
; AVX-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX-NEXT: vpackusdw %xmm1, %xmm0, %xmm0
; AVX-NEXT: vpsrlvd {{.*}}(%rip), %xmm0, %xmm0
; AVX-NEXT: vzeroupper
; AVX-NEXT: retq
  %1 = lshr <4 x i64> %x, <i64 48, i64 49, i64 50, i64 51>
  %2 = trunc <4 x i64> %1 to <4 x i32>
  %3 = lshr <4 x i32> %2, <i32 24, i32 25, i32 26, i32 27>
  ret <4 x i32> %3
}

; fold (srl (shl x, c), c) -> (and x, cst2)
define <4 x i32> @combine_vec_lshr_shl_mask0(<4 x i32> %x) {
; SSE-LABEL: combine_vec_lshr_shl_mask0:
; SSE: # %bb.0:
; SSE-NEXT: andps {{.*}}(%rip), %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: combine_vec_lshr_shl_mask0:
; AVX: # %bb.0:
; AVX-NEXT: vbroadcastss {{.*#+}} xmm1 = [1073741823,1073741823,1073741823,1073741823]
; AVX-NEXT: vandps %xmm1, %xmm0, %xmm0
; AVX-NEXT: retq
  %1 = shl <4 x i32> %x, <i32 2, i32 2, i32 2, i32 2>
  %2 = lshr <4 x i32> %1, <i32 2, i32 2, i32 2, i32 2>
  ret <4 x i32> %2
}

define <4 x i32> @combine_vec_lshr_shl_mask1(<4 x i32> %x) {
; SSE-LABEL: combine_vec_lshr_shl_mask1:
; SSE: # %bb.0:
; SSE-NEXT: andps {{.*}}(%rip), %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: combine_vec_lshr_shl_mask1:
; AVX: # %bb.0:
; AVX-NEXT: vandps {{.*}}(%rip), %xmm0, %xmm0
; AVX-NEXT: retq
  %1 = shl <4 x i32> %x, <i32 2, i32 3, i32 4, i32 5>
  %2 = lshr <4 x i32> %1, <i32 2, i32 3, i32 4, i32 5>
  ret <4 x i32> %2
}

; fold (srl (sra X, Y), 31) -> (srl X, 31)
define <4 x i32> @combine_vec_lshr_ashr_sign(<4 x i32> %x, <4 x i32> %y) {
; SSE-LABEL: combine_vec_lshr_ashr_sign:
; SSE: # %bb.0:
; SSE-NEXT: psrld $31, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: combine_vec_lshr_ashr_sign:
; AVX: # %bb.0:
; AVX-NEXT: vpsrld $31, %xmm0, %xmm0
; AVX-NEXT: retq
  %1 = ashr <4 x i32> %x, %y
  %2 = lshr <4 x i32> %1, <i32 31, i32 31, i32 31, i32 31>
  ret <4 x i32> %2
}
(ctlz x), "5") -> x iff x has one bit set (the low bit). 313define <4 x i32> @combine_vec_lshr_lzcnt_bit0(<4 x i32> %x) { 314; SSE-LABEL: combine_vec_lshr_lzcnt_bit0: 315; SSE: # %bb.0: 316; SSE-NEXT: pand {{.*}}(%rip), %xmm0 317; SSE-NEXT: psrld $4, %xmm0 318; SSE-NEXT: pxor {{.*}}(%rip), %xmm0 319; SSE-NEXT: retq 320; 321; AVX-LABEL: combine_vec_lshr_lzcnt_bit0: 322; AVX: # %bb.0: 323; AVX-NEXT: vpbroadcastd {{.*#+}} xmm1 = [16,16,16,16] 324; AVX-NEXT: vpand %xmm1, %xmm0, %xmm0 325; AVX-NEXT: vpsrld $4, %xmm0, %xmm0 326; AVX-NEXT: vpbroadcastd {{.*#+}} xmm1 = [1,1,1,1] 327; AVX-NEXT: vpxor %xmm1, %xmm0, %xmm0 328; AVX-NEXT: retq 329 %1 = and <4 x i32> %x, <i32 16, i32 16, i32 16, i32 16> 330 %2 = call <4 x i32> @llvm.ctlz.v4i32(<4 x i32> %1, i1 0) 331 %3 = lshr <4 x i32> %2, <i32 5, i32 5, i32 5, i32 5> 332 ret <4 x i32> %3 333} 334 335define <4 x i32> @combine_vec_lshr_lzcnt_bit1(<4 x i32> %x) { 336; SSE-LABEL: combine_vec_lshr_lzcnt_bit1: 337; SSE: # %bb.0: 338; SSE-NEXT: pand {{.*}}(%rip), %xmm0 339; SSE-NEXT: movdqa {{.*#+}} xmm2 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] 340; SSE-NEXT: movdqa %xmm2, %xmm3 341; SSE-NEXT: pshufb %xmm0, %xmm3 342; SSE-NEXT: movdqa %xmm0, %xmm1 343; SSE-NEXT: psrlw $4, %xmm1 344; SSE-NEXT: pxor %xmm4, %xmm4 345; SSE-NEXT: pshufb %xmm1, %xmm2 346; SSE-NEXT: pcmpeqb %xmm4, %xmm1 347; SSE-NEXT: pand %xmm3, %xmm1 348; SSE-NEXT: paddb %xmm2, %xmm1 349; SSE-NEXT: movdqa %xmm0, %xmm2 350; SSE-NEXT: pcmpeqb %xmm4, %xmm2 351; SSE-NEXT: psrlw $8, %xmm2 352; SSE-NEXT: pand %xmm1, %xmm2 353; SSE-NEXT: psrlw $8, %xmm1 354; SSE-NEXT: paddw %xmm2, %xmm1 355; SSE-NEXT: pcmpeqw %xmm4, %xmm0 356; SSE-NEXT: psrld $16, %xmm0 357; SSE-NEXT: pand %xmm1, %xmm0 358; SSE-NEXT: psrld $16, %xmm1 359; SSE-NEXT: paddd %xmm0, %xmm1 360; SSE-NEXT: psrld $5, %xmm1 361; SSE-NEXT: movdqa %xmm1, %xmm0 362; SSE-NEXT: retq 363; 364; AVX-LABEL: combine_vec_lshr_lzcnt_bit1: 365; AVX: # %bb.0: 366; AVX-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0 367; AVX-NEXT: vmovdqa {{.*#+}} xmm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] 368; AVX-NEXT: vpshufb %xmm0, %xmm1, %xmm2 369; AVX-NEXT: vpsrlw $4, %xmm0, %xmm3 370; AVX-NEXT: vpxor %xmm4, %xmm4, %xmm4 371; AVX-NEXT: vpcmpeqb %xmm4, %xmm3, %xmm5 372; AVX-NEXT: vpand %xmm5, %xmm2, %xmm2 373; AVX-NEXT: vpshufb %xmm3, %xmm1, %xmm1 374; AVX-NEXT: vpaddb %xmm1, %xmm2, %xmm1 375; AVX-NEXT: vpcmpeqb %xmm4, %xmm0, %xmm2 376; AVX-NEXT: vpsrlw $8, %xmm2, %xmm2 377; AVX-NEXT: vpand %xmm2, %xmm1, %xmm2 378; AVX-NEXT: vpsrlw $8, %xmm1, %xmm1 379; AVX-NEXT: vpaddw %xmm2, %xmm1, %xmm1 380; AVX-NEXT: vpcmpeqw %xmm4, %xmm0, %xmm0 381; AVX-NEXT: vpsrld $16, %xmm0, %xmm0 382; AVX-NEXT: vpand %xmm0, %xmm1, %xmm0 383; AVX-NEXT: vpsrld $16, %xmm1, %xmm1 384; AVX-NEXT: vpaddd %xmm0, %xmm1, %xmm0 385; AVX-NEXT: vpsrld $5, %xmm0, %xmm0 386; AVX-NEXT: retq 387 %1 = and <4 x i32> %x, <i32 4, i32 32, i32 64, i32 128> 388 %2 = call <4 x i32> @llvm.ctlz.v4i32(<4 x i32> %1, i1 0) 389 %3 = lshr <4 x i32> %2, <i32 5, i32 5, i32 5, i32 5> 390 ret <4 x i32> %3 391} 392declare <4 x i32> @llvm.ctlz.v4i32(<4 x i32>, i1) 393 394; fold (srl x, (trunc (and y, c))) -> (srl x, (and (trunc y), (trunc c))). 
define <4 x i32> @combine_vec_lshr_trunc_and(<4 x i32> %x, <4 x i64> %y) {
; SSE-LABEL: combine_vec_lshr_trunc_and:
; SSE: # %bb.0:
; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2],xmm2[0,2]
; SSE-NEXT: andps {{.*}}(%rip), %xmm1
; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm1[2,3,3,3,4,5,6,7]
; SSE-NEXT: movdqa %xmm0, %xmm3
; SSE-NEXT: psrld %xmm2, %xmm3
; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,3,2,3]
; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm2[2,3,3,3,4,5,6,7]
; SSE-NEXT: movdqa %xmm0, %xmm5
; SSE-NEXT: psrld %xmm4, %xmm5
; SSE-NEXT: pblendw {{.*#+}} xmm5 = xmm3[0,1,2,3],xmm5[4,5,6,7]
; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,1,1,1,4,5,6,7]
; SSE-NEXT: movdqa %xmm0, %xmm3
; SSE-NEXT: psrld %xmm1, %xmm3
; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm2[0,1,1,1,4,5,6,7]
; SSE-NEXT: psrld %xmm1, %xmm0
; SSE-NEXT: pblendw {{.*#+}} xmm0 = xmm3[0,1,2,3],xmm0[4,5,6,7]
; SSE-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm5[2,3],xmm0[4,5],xmm5[6,7]
; SSE-NEXT: retq
;
; AVX2-SLOW-LABEL: combine_vec_lshr_trunc_and:
; AVX2-SLOW: # %bb.0:
; AVX2-SLOW-NEXT: vextractf128 $1, %ymm1, %xmm2
; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,2],xmm2[0,2]
; AVX2-SLOW-NEXT: vandps {{.*}}(%rip), %xmm1, %xmm1
; AVX2-SLOW-NEXT: vpsrlvd %xmm1, %xmm0, %xmm0
; AVX2-SLOW-NEXT: vzeroupper
; AVX2-SLOW-NEXT: retq
;
; AVX2-FAST-LABEL: combine_vec_lshr_trunc_and:
; AVX2-FAST: # %bb.0:
; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = <0,2,4,6,u,u,u,u>
; AVX2-FAST-NEXT: vpermd %ymm1, %ymm2, %ymm1
; AVX2-FAST-NEXT: vpand {{.*}}(%rip), %xmm1, %xmm1
; AVX2-FAST-NEXT: vpsrlvd %xmm1, %xmm0, %xmm0
; AVX2-FAST-NEXT: vzeroupper
; AVX2-FAST-NEXT: retq
  %1 = and <4 x i64> %y, <i64 15, i64 255, i64 4095, i64 65535>
  %2 = trunc <4 x i64> %1 to <4 x i32>
  %3 = lshr <4 x i32> %x, %2
  ret <4 x i32> %3
}