; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=i686-unknown -mattr=+avx512bw | FileCheck %s --check-prefixes=CHECK,X86
; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx512bw | FileCheck %s --check-prefixes=CHECK,X64

; Check that under certain conditions we can factor out a rotate
; from the following idioms:
;   (a*c0) >> s1 | (a*c1)
;   (a/c0) << s1 | (a/c1)
; This targets cases where instcombine has folded a shl/srl/mul/udiv
; with one of the shifts from the rotate idiom
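;
; For example (an informal sketch, not a checked test case): with an i32 %a,
;   (%a << 3) >> 25 | (%a << 10)
; factors through %t = %a << 3 into (%t >> 25) | (%t << 7), and because
; 25 + 7 == 32 that is exactly a rotate-left of %t by 7, which matches the
; vpslld $3 + vprold $7 sequence checked in the first test below.
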
define <4 x i32> @vroll_v4i32_extract_shl(<4 x i32> %i) {
; CHECK-LABEL: vroll_v4i32_extract_shl:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpslld $3, %xmm0, %xmm0
; CHECK-NEXT:    vprold $7, %zmm0, %zmm0
; CHECK-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    ret{{[l|q]}}
  %lhs_mul = shl <4 x i32> %i, <i32 3, i32 3, i32 3, i32 3>
  %rhs_mul = shl <4 x i32> %i, <i32 10, i32 10, i32 10, i32 10>
  %lhs_shift = lshr <4 x i32> %lhs_mul, <i32 25, i32 25, i32 25, i32 25>
  %out = or <4 x i32> %lhs_shift, %rhs_mul
  ret <4 x i32> %out
}

define <4 x i64> @vrolq_v4i64_extract_shrl(<4 x i64> %i) nounwind {
; CHECK-LABEL: vrolq_v4i64_extract_shrl:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpsrlq $5, %ymm0, %ymm0
; CHECK-NEXT:    vprolq $29, %zmm0, %zmm0
; CHECK-NEXT:    # kill: def $ymm0 killed $ymm0 killed $zmm0
; CHECK-NEXT:    ret{{[l|q]}}
  %lhs_div = lshr <4 x i64> %i, <i64 40, i64 40, i64 40, i64 40>
  %rhs_div = lshr <4 x i64> %i, <i64 5, i64 5, i64 5, i64 5>
  %rhs_shift = shl <4 x i64> %rhs_div, <i64 29, i64 29, i64 29, i64 29>
  %out = or <4 x i64> %lhs_div, %rhs_shift
  ret <4 x i64> %out
}

define <8 x i32> @vroll_extract_mul(<8 x i32> %i) nounwind {
; CHECK-LABEL: vroll_extract_mul:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpbroadcastd {{.*#+}} ymm1 = [10,10,10,10,10,10,10,10]
; CHECK-NEXT:    vpmulld %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    vprold $6, %zmm0, %zmm0
; CHECK-NEXT:    # kill: def $ymm0 killed $ymm0 killed $zmm0
; CHECK-NEXT:    ret{{[l|q]}}
  %lhs_mul = mul <8 x i32> %i, <i32 640, i32 640, i32 640, i32 640, i32 640, i32 640, i32 640, i32 640>
  %rhs_mul = mul <8 x i32> %i, <i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10>
  %rhs_shift = lshr <8 x i32> %rhs_mul, <i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26>
  %out = or <8 x i32> %lhs_mul, %rhs_shift
  ret <8 x i32> %out
}

define <2 x i64> @vrolq_extract_udiv(<2 x i64> %i) nounwind {
; X86-LABEL: vrolq_extract_udiv:
; X86:       # %bb.0:
; X86-NEXT:    subl $44, %esp
; X86-NEXT:    vmovups %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
; X86-NEXT:    vextractps $1, %xmm0, {{[0-9]+}}(%esp)
; X86-NEXT:    vmovss %xmm0, (%esp)
; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
; X86-NEXT:    movl $3, {{[0-9]+}}(%esp)
; X86-NEXT:    calll __udivdi3
; X86-NEXT:    vmovups {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload
; X86-NEXT:    vextractps $3, %xmm0, {{[0-9]+}}(%esp)
; X86-NEXT:    vextractps $2, %xmm0, (%esp)
; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
; X86-NEXT:    movl $3, {{[0-9]+}}(%esp)
; X86-NEXT:    vmovd %eax, %xmm0
; X86-NEXT:    vpinsrd $1, %edx, %xmm0, %xmm0
; X86-NEXT:    vmovdqu %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
; X86-NEXT:    calll __udivdi3
; X86-NEXT:    vmovdqu {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload
; X86-NEXT:    vpinsrd $2, %eax, %xmm0, %xmm0
; X86-NEXT:    vpinsrd $3, %edx, %xmm0, %xmm0
; X86-NEXT:    vprolq $57, %zmm0, %zmm0
; X86-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
; X86-NEXT:    addl $44, %esp
; X86-NEXT:    vzeroupper
; X86-NEXT:    retl
;
; X64-LABEL: vrolq_extract_udiv:
; X64:       # %bb.0:
; X64-NEXT:    vpextrq $1, %xmm0, %rax
; X64-NEXT:    movabsq $-6148914691236517205, %rcx # imm = 0xAAAAAAAAAAAAAAAB
; X64-NEXT:    mulq %rcx
; X64-NEXT:    vmovq %rdx, %xmm1
; X64-NEXT:    vmovq %xmm0, %rax
; X64-NEXT:    mulq %rcx
; X64-NEXT:    vmovq %rdx, %xmm0
; X64-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; X64-NEXT:    vpsrlq $1, %xmm0, %xmm0
; X64-NEXT:    vprolq $57, %zmm0, %zmm0
; X64-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
; X64-NEXT:    vzeroupper
; X64-NEXT:    retq
  %lhs_div = udiv <2 x i64> %i, <i64 3, i64 3>
  %rhs_div = udiv <2 x i64> %i, <i64 384, i64 384>
  %lhs_shift = shl <2 x i64> %lhs_div, <i64 57, i64 57>
  %out = or <2 x i64> %lhs_shift, %rhs_div
  ret <2 x i64> %out
}

define <4 x i32> @vrolw_extract_mul_with_mask(<4 x i32> %i) nounwind {
; X86-LABEL: vrolw_extract_mul_with_mask:
; X86:       # %bb.0:
; X86-NEXT:    vpbroadcastd {{.*#+}} xmm1 = [9,9,9,9]
; X86-NEXT:    vpmulld %xmm1, %xmm0, %xmm0
; X86-NEXT:    vprold $7, %zmm0, %zmm0
; X86-NEXT:    vpand {{\.LCPI.*}}, %xmm0, %xmm0
; X86-NEXT:    vzeroupper
; X86-NEXT:    retl
;
; X64-LABEL: vrolw_extract_mul_with_mask:
; X64:       # %bb.0:
; X64-NEXT:    vpbroadcastd {{.*#+}} xmm1 = [9,9,9,9]
; X64-NEXT:    vpmulld %xmm1, %xmm0, %xmm0
; X64-NEXT:    vprold $7, %zmm0, %zmm0
; X64-NEXT:    vpand {{.*}}(%rip), %xmm0, %xmm0
; X64-NEXT:    vzeroupper
; X64-NEXT:    retq
  %lhs_mul = mul <4 x i32> %i, <i32 1152, i32 1152, i32 1152, i32 1152>
  %rhs_mul = mul <4 x i32> %i, <i32 9, i32 9, i32 9, i32 9>
  %lhs_and = and <4 x i32> %lhs_mul, <i32 160, i32 160, i32 160, i32 160>
  %rhs_shift = lshr <4 x i32> %rhs_mul, <i32 25, i32 25, i32 25, i32 25>
  %out = or <4 x i32> %lhs_and, %rhs_shift
  ret <4 x i32> %out
}

define <32 x i16> @illegal_no_extract_mul(<32 x i16> %i) nounwind {
; X86-LABEL: illegal_no_extract_mul:
; X86:       # %bb.0:
; X86-NEXT:    vpmullw {{\.LCPI.*}}, %zmm0, %zmm1
; X86-NEXT:    vpmullw {{\.LCPI.*}}, %zmm0, %zmm0
; X86-NEXT:    vpsrlw $10, %zmm0, %zmm0
; X86-NEXT:    vporq %zmm0, %zmm1, %zmm0
; X86-NEXT:    retl
;
; X64-LABEL: illegal_no_extract_mul:
; X64:       # %bb.0:
; X64-NEXT:    vpmullw {{.*}}(%rip), %zmm0, %zmm1
; X64-NEXT:    vpmullw {{.*}}(%rip), %zmm0, %zmm0
; X64-NEXT:    vpsrlw $10, %zmm0, %zmm0
; X64-NEXT:    vporq %zmm0, %zmm1, %zmm0
; X64-NEXT:    retq
  %lhs_mul = mul <32 x i16> %i, <i16 640, i16 640, i16 640, i16 640, i16 640, i16 640, i16 640, i16 640, i16 640, i16 640, i16 640, i16 640, i16 640, i16 640, i16 640, i16 640, i16 640, i16 640, i16 640, i16 640, i16 640, i16 640, i16 640, i16 640, i16 640, i16 640, i16 640, i16 640, i16 640, i16 640, i16 640, i16 640>
  %rhs_mul = mul <32 x i16> %i, <i16 10, i16 10, i16 10, i16 10, i16 10, i16 10, i16 10, i16 10, i16 10, i16 10, i16 10, i16 10, i16 10, i16 10, i16 10, i16 10, i16 10, i16 10, i16 10, i16 10, i16 10, i16 10, i16 10, i16 10, i16 10, i16 10, i16 10, i16 10, i16 10, i16 10, i16 10, i16 10>
  %rhs_shift = lshr <32 x i16> %rhs_mul, <i16 10, i16 10, i16 10, i16 10, i16 10, i16 10, i16 10, i16 10, i16 10, i16 10, i16 10, i16 10, i16 10, i16 10, i16 10, i16 10, i16 10, i16 10, i16 10, i16 10, i16 10, i16 10, i16 10, i16 10, i16 10, i16 10, i16 10, i16 10, i16 10, i16 10, i16 10, i16 10>
  %out = or <32 x i16> %lhs_mul, %rhs_shift
  ret <32 x i16> %out
}

; Result would undershift
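; (informal arithmetic, for illustration only): factoring the common
; "shl 11" gives (%t >> 50) | (%t << 13) with %t = %i << 11, and
; 50 + 13 == 63 rather than 64, so no i64 rotate can be extracted.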
define <4 x i64> @no_extract_shl(<4 x i64> %i) nounwind {
; CHECK-LABEL: no_extract_shl:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpsllq $11, %ymm0, %ymm1
; CHECK-NEXT:    vpsllq $24, %ymm0, %ymm0
; CHECK-NEXT:    vpsrlq $50, %ymm1, %ymm1
; CHECK-NEXT:    vpor %ymm0, %ymm1, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %lhs_mul = shl <4 x i64> %i, <i64 11, i64 11, i64 11, i64 11>
  %rhs_mul = shl <4 x i64> %i, <i64 24, i64 24, i64 24, i64 24>
  %lhs_shift = lshr <4 x i64> %lhs_mul, <i64 50, i64 50, i64 50, i64 50>
  %out = or <4 x i64> %lhs_shift, %rhs_mul
  ret <4 x i64> %out
}

; Result would overshift
define <4 x i32> @no_extract_shrl(<4 x i32> %i) nounwind {
; CHECK-LABEL: no_extract_shrl:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpbroadcastd {{.*#+}} xmm1 = [4026531840,4026531840,4026531840,4026531840]
; CHECK-NEXT:    vpslld $25, %xmm0, %xmm2
; CHECK-NEXT:    vpand %xmm1, %xmm2, %xmm1
; CHECK-NEXT:    vpsrld $9, %xmm0, %xmm0
; CHECK-NEXT:    vpor %xmm0, %xmm1, %xmm0
; CHECK-NEXT:    ret{{[l|q]}}
  %lhs_div = lshr <4 x i32> %i, <i32 3, i32 3, i32 3, i32 3>
  %rhs_div = lshr <4 x i32> %i, <i32 9, i32 9, i32 9, i32 9>
  %lhs_shift = shl <4 x i32> %lhs_div, <i32 28, i32 28, i32 28, i32 28>
  %out = or <4 x i32> %lhs_shift, %rhs_div
  ret <4 x i32> %out
}

; Can factor 512 from 1536, but result is 3 instead of 9
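; (informal arithmetic, for illustration only): pairing with the "lshr 23"
; on the rhs (23 + 9 == 32) would require the lhs multiplier to be
; 9 << 9 == 4608, but 1536 == 3 << 9, so the factored multiplier is 3
; rather than the required 9.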
define <8 x i32> @no_extract_mul(<8 x i32> %i) nounwind {
; CHECK-LABEL: no_extract_mul:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpbroadcastd {{.*#+}} ymm1 = [1536,1536,1536,1536,1536,1536,1536,1536]
; CHECK-NEXT:    vpmulld %ymm1, %ymm0, %ymm1
; CHECK-NEXT:    vpbroadcastd {{.*#+}} ymm2 = [9,9,9,9,9,9,9,9]
; CHECK-NEXT:    vpmulld %ymm2, %ymm0, %ymm0
; CHECK-NEXT:    vpsrld $23, %ymm0, %ymm0
; CHECK-NEXT:    vpor %ymm0, %ymm1, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %lhs_mul = mul <8 x i32> %i, <i32 1536, i32 1536, i32 1536, i32 1536, i32 1536, i32 1536, i32 1536, i32 1536>
  %rhs_mul = mul <8 x i32> %i, <i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9>
  %rhs_shift = lshr <8 x i32> %rhs_mul, <i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23>
  %out = or <8 x i32> %lhs_mul, %rhs_shift
  ret <8 x i32> %out
}

; Can't evenly factor 256 from 770
define <2 x i64> @no_extract_udiv(<2 x i64> %i) nounwind {
; X86-LABEL: no_extract_udiv:
; X86:       # %bb.0:
; X86-NEXT:    subl $60, %esp
; X86-NEXT:    vmovups %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
; X86-NEXT:    vextractps $1, %xmm0, {{[0-9]+}}(%esp)
; X86-NEXT:    vmovss %xmm0, (%esp)
; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
; X86-NEXT:    movl $3, {{[0-9]+}}(%esp)
; X86-NEXT:    calll __udivdi3
; X86-NEXT:    vmovups {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload
; X86-NEXT:    vextractps $3, %xmm0, {{[0-9]+}}(%esp)
; X86-NEXT:    vextractps $2, %xmm0, (%esp)
; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
; X86-NEXT:    movl $3, {{[0-9]+}}(%esp)
; X86-NEXT:    vmovd %eax, %xmm0
; X86-NEXT:    vmovdqu %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
; X86-NEXT:    calll __udivdi3
; X86-NEXT:    vmovdqu {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload
; X86-NEXT:    vpinsrd $2, %eax, %xmm0, %xmm0
; X86-NEXT:    vmovdqu %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
; X86-NEXT:    vmovups {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload
; X86-NEXT:    vextractps $1, %xmm0, {{[0-9]+}}(%esp)
; X86-NEXT:    vmovss %xmm0, (%esp)
; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
; X86-NEXT:    movl $770, {{[0-9]+}}(%esp) # imm = 0x302
; X86-NEXT:    calll __udivdi3
; X86-NEXT:    vmovups {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload
; X86-NEXT:    vextractps $3, %xmm0, {{[0-9]+}}(%esp)
; X86-NEXT:    vextractps $2, %xmm0, (%esp)
; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
; X86-NEXT:    movl $770, {{[0-9]+}}(%esp) # imm = 0x302
; X86-NEXT:    vmovd %eax, %xmm0
; X86-NEXT:    vpinsrd $1, %edx, %xmm0, %xmm0
; X86-NEXT:    vmovdqu %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
; X86-NEXT:    calll __udivdi3
; X86-NEXT:    vmovdqu {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload
; X86-NEXT:    vpinsrd $2, %eax, %xmm0, %xmm0
; X86-NEXT:    vpinsrd $3, %edx, %xmm0, %xmm0
; X86-NEXT:    vmovdqu {{[-0-9]+}}(%e{{[sb]}}p), %xmm1 # 16-byte Reload
; X86-NEXT:    vpsllq $56, %xmm1, %xmm1
; X86-NEXT:    vpor %xmm0, %xmm1, %xmm0
; X86-NEXT:    addl $60, %esp
; X86-NEXT:    retl
;
; X64-LABEL: no_extract_udiv:
; X64:       # %bb.0:
; X64-NEXT:    vpextrq $1, %xmm0, %rcx
; X64-NEXT:    movabsq $-6148914691236517205, %rdi # imm = 0xAAAAAAAAAAAAAAAB
; X64-NEXT:    movq %rcx, %rax
; X64-NEXT:    mulq %rdi
; X64-NEXT:    vmovq %rdx, %xmm1
; X64-NEXT:    vmovq %xmm0, %rsi
; X64-NEXT:    movq %rsi, %rax
; X64-NEXT:    mulq %rdi
; X64-NEXT:    vmovq %rdx, %xmm0
; X64-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; X64-NEXT:    vpsrlq $1, %xmm0, %xmm0
; X64-NEXT:    movabsq $-6180857105216966645, %rdi # imm = 0xAA392F35DC17F00B
; X64-NEXT:    movq %rcx, %rax
; X64-NEXT:    mulq %rdi
; X64-NEXT:    vmovq %rdx, %xmm1
; X64-NEXT:    movq %rsi, %rax
; X64-NEXT:    mulq %rdi
; X64-NEXT:    vmovq %rdx, %xmm2
; X64-NEXT:    vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0]
; X64-NEXT:    vpsrlq $9, %xmm1, %xmm1
; X64-NEXT:    vpsllq $56, %xmm0, %xmm0
; X64-NEXT:    vpor %xmm1, %xmm0, %xmm0
; X64-NEXT:    retq
  %lhs_div = udiv <2 x i64> %i, <i64 3, i64 3>
  %rhs_div = udiv <2 x i64> %i, <i64 770, i64 770>
  %lhs_shift = shl <2 x i64> %lhs_div, <i64 56, i64 56>
  %out = or <2 x i64> %lhs_shift, %rhs_div
  ret <2 x i64> %out
}

; DAGCombiner transforms shl X, 1 into add X, X.
define <4 x i32> @extract_add_1(<4 x i32> %i) nounwind {
; CHECK-LABEL: extract_add_1:
; CHECK:       # %bb.0:
; CHECK-NEXT:    # kill: def $xmm0 killed $xmm0 def $zmm0
; CHECK-NEXT:    vprold $1, %zmm0, %zmm0
; CHECK-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    ret{{[l|q]}}
  %ii = add <4 x i32> %i, %i
  %rhs = lshr <4 x i32> %i, <i32 31, i32 31, i32 31, i32 31>
  %out = or <4 x i32> %ii, %rhs
  ret <4 x i32> %out
}

define <4 x i32> @extract_add_1_comut(<4 x i32> %i) nounwind {
; CHECK-LABEL: extract_add_1_comut:
; CHECK:       # %bb.0:
; CHECK-NEXT:    # kill: def $xmm0 killed $xmm0 def $zmm0
; CHECK-NEXT:    vprold $1, %zmm0, %zmm0
; CHECK-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    ret{{[l|q]}}
  %ii = add <4 x i32> %i, %i
  %lhs = lshr <4 x i32> %i, <i32 31, i32 31, i32 31, i32 31>
  %out = or <4 x i32> %lhs, %ii
  ret <4 x i32> %out
}

define <4 x i32> @no_extract_add_1(<4 x i32> %i) nounwind {
; CHECK-LABEL: no_extract_add_1:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpaddd %xmm0, %xmm0, %xmm1
; CHECK-NEXT:    vpsrld $27, %xmm0, %xmm0
; CHECK-NEXT:    vpor %xmm0, %xmm1, %xmm0
; CHECK-NEXT:    ret{{[l|q]}}
  %ii = add <4 x i32> %i, %i
  %rhs = lshr <4 x i32> %i, <i32 27, i32 27, i32 27, i32 27>
  %out = or <4 x i32> %ii, %rhs
  ret <4 x i32> %out
}