; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefix=SSE
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefix=AVX
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=AVX --check-prefix=AVX2
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vl | FileCheck %s --check-prefix=AVX --check-prefix=AVX512VL
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vl,+avx512dq | FileCheck %s --check-prefix=AVX --check-prefix=AVX512DQVL

; Shuffles selecting the even i32 elements, sign-extended and multiplied, should combine into a single PMULDQ.
define <2 x i64> @combine_shuffle_sext_pmuldq(<4 x i32> %a0, <4 x i32> %a1) {
; SSE-LABEL: combine_shuffle_sext_pmuldq:
; SSE:       # %bb.0:
; SSE-NEXT:    pmuldq %xmm1, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: combine_shuffle_sext_pmuldq:
; AVX:       # %bb.0:
; AVX-NEXT:    vpmuldq %xmm1, %xmm0, %xmm0
; AVX-NEXT:    retq
  %1 = shufflevector <4 x i32> %a0, <4 x i32> undef, <2 x i32> <i32 0, i32 2>
  %2 = shufflevector <4 x i32> %a1, <4 x i32> undef, <2 x i32> <i32 0, i32 2>
  %3 = sext <2 x i32> %1 to <2 x i64>
  %4 = sext <2 x i32> %2 to <2 x i64>
  %5 = mul nuw <2 x i64> %3, %4
  ret <2 x i64> %5
}

; The same pattern with zero-extends should combine into a single PMULUDQ.
define <2 x i64> @combine_shuffle_zext_pmuludq(<4 x i32> %a0, <4 x i32> %a1) {
; SSE-LABEL: combine_shuffle_zext_pmuludq:
; SSE:       # %bb.0:
; SSE-NEXT:    pmuludq %xmm1, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: combine_shuffle_zext_pmuludq:
; AVX:       # %bb.0:
; AVX-NEXT:    vpmuludq %xmm1, %xmm0, %xmm0
; AVX-NEXT:    retq
  %1 = shufflevector <4 x i32> %a0, <4 x i32> undef, <2 x i32> <i32 0, i32 2>
  %2 = shufflevector <4 x i32> %a1, <4 x i32> undef, <2 x i32> <i32 0, i32 2>
  %3 = zext <2 x i32> %1 to <2 x i64>
  %4 = zext <2 x i32> %2 to <2 x i64>
  %5 = mul nuw <2 x i64> %3, %4
  ret <2 x i64> %5
}

; Shuffles that zero the odd i32 lanes leave the upper half of each i64 zero, so the 64-bit multiply can use PMULUDQ.
define <2 x i64> @combine_shuffle_zero_pmuludq(<4 x i32> %a0, <4 x i32> %a1) {
; SSE-LABEL: combine_shuffle_zero_pmuludq:
; SSE:       # %bb.0:
; SSE-NEXT:    pmuludq %xmm1, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: combine_shuffle_zero_pmuludq:
; AVX:       # %bb.0:
; AVX-NEXT:    vpmuludq %xmm1, %xmm0, %xmm0
; AVX-NEXT:    retq
  %1 = shufflevector <4 x i32> %a0, <4 x i32> zeroinitializer, <4 x i32> <i32 0, i32 5, i32 2, i32 7>
  %2 = shufflevector <4 x i32> %a1, <4 x i32> zeroinitializer, <4 x i32> <i32 0, i32 5, i32 2, i32 7>
  %3 = bitcast <4 x i32> %1 to <2 x i64>
  %4 = bitcast <4 x i32> %2 to <2 x i64>
  %5 = mul <2 x i64> %3, %4
  ret <2 x i64> %5
}

; 256-bit version of the zero-shuffle pattern above.
define <4 x i64> @combine_shuffle_zero_pmuludq_256(<8 x i32> %a0, <8 x i32> %a1) {
; SSE-LABEL: combine_shuffle_zero_pmuludq_256:
; SSE:       # %bb.0:
; SSE-NEXT:    pmuludq %xmm2, %xmm0
; SSE-NEXT:    pmuludq %xmm3, %xmm1
; SSE-NEXT:    retq
;
; AVX2-LABEL: combine_shuffle_zero_pmuludq_256:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpmuludq %ymm1, %ymm0, %ymm0
; AVX2-NEXT:    retq
;
; AVX512VL-LABEL: combine_shuffle_zero_pmuludq_256:
; AVX512VL:       # %bb.0:
; AVX512VL-NEXT:    vpmuludq %ymm1, %ymm0, %ymm0
; AVX512VL-NEXT:    retq
;
; AVX512DQVL-LABEL: combine_shuffle_zero_pmuludq_256:
; AVX512DQVL:       # %bb.0:
; AVX512DQVL-NEXT:    vpmuludq %ymm1, %ymm0, %ymm0
; AVX512DQVL-NEXT:    retq
  %1 = shufflevector <8 x i32> %a0, <8 x i32> zeroinitializer, <8 x i32> <i32 0, i32 9, i32 2, i32 11, i32 4, i32 13, i32 6, i32 15>
  %2 = shufflevector <8 x i32> %a1, <8 x i32> zeroinitializer, <8 x i32> <i32 0, i32 9, i32 2, i32 11, i32 4, i32 13, i32 6, i32 15>
  %3 = bitcast <8 x i32> %1 to <4 x i64>
  %4 = bitcast <8 x i32> %2 to <4 x i64>
  %5 = mul <4 x i64> %3, %4
  ret <4 x i64> %5
}

; A zero-extended v8i32 multiplied by a splatted 32-bit constant should lower to PMULUDQ.
define <8 x i64> @combine_zext_pmuludq_256(<8 x i32> %a) {
; SSE-LABEL: combine_zext_pmuludq_256:
; SSE:       # %bb.0:
; SSE-NEXT:    pshufd {{.*#+}} xmm3 = xmm1[2,1,3,3]
; SSE-NEXT:    pmovzxdq {{.*#+}} xmm2 = xmm1[0],zero,xmm1[1],zero
; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,1,3,3]
; SSE-NEXT:    pmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
; SSE-NEXT:    movdqa {{.*#+}} xmm4 = [715827883,715827883]
; SSE-NEXT:    pmuludq %xmm4, %xmm0
; SSE-NEXT:    pmuludq %xmm4, %xmm1
; SSE-NEXT:    pmuludq %xmm4, %xmm2
; SSE-NEXT:    pmuludq %xmm4, %xmm3
; SSE-NEXT:    retq
;
; AVX2-LABEL: combine_zext_pmuludq_256:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT:    vpmovzxdq {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
; AVX2-NEXT:    vpmovzxdq {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
; AVX2-NEXT:    vpbroadcastq {{.*#+}} ymm2 = [715827883,715827883,715827883,715827883]
; AVX2-NEXT:    vpmuludq %ymm2, %ymm0, %ymm0
; AVX2-NEXT:    vpmuludq %ymm2, %ymm1, %ymm1
; AVX2-NEXT:    retq
;
; AVX512VL-LABEL: combine_zext_pmuludq_256:
; AVX512VL:       # %bb.0:
; AVX512VL-NEXT:    vpmovzxdq {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero
; AVX512VL-NEXT:    vpmuludq {{.*}}(%rip){1to8}, %zmm0, %zmm0
; AVX512VL-NEXT:    retq
;
; AVX512DQVL-LABEL: combine_zext_pmuludq_256:
; AVX512DQVL:       # %bb.0:
; AVX512DQVL-NEXT:    vpmovzxdq {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero
; AVX512DQVL-NEXT:    vpmuludq {{.*}}(%rip){1to8}, %zmm0, %zmm0
; AVX512DQVL-NEXT:    retq
  %1 = zext <8 x i32> %a to <8 x i64>
  %2 = mul nuw nsw <8 x i64> %1, <i64 715827883, i64 715827883, i64 715827883, i64 715827883, i64 715827883, i64 715827883, i64 715827883, i64 715827883>
  ret <8 x i64> %2
}

; PR39398: all of the vector arithmetic is dead, so only the scalar compare and branch of the loop should remain.
define void @PR39398(i32 %a0) {
; SSE-LABEL: PR39398:
; SSE:       # %bb.0: # %bb
; SSE-NEXT:    .p2align 4, 0x90
; SSE-NEXT:  .LBB5_1: # %bb10
; SSE-NEXT:    # =>This Inner Loop Header: Depth=1
; SSE-NEXT:    cmpl $232, %edi
; SSE-NEXT:    jne .LBB5_1
; SSE-NEXT:  # %bb.2: # %bb34
; SSE-NEXT:    retq
;
; AVX-LABEL: PR39398:
; AVX:       # %bb.0: # %bb
; AVX-NEXT:    .p2align 4, 0x90
; AVX-NEXT:  .LBB5_1: # %bb10
; AVX-NEXT:    # =>This Inner Loop Header: Depth=1
; AVX-NEXT:    cmpl $232, %edi
; AVX-NEXT:    jne .LBB5_1
; AVX-NEXT:  # %bb.2: # %bb34
; AVX-NEXT:    retq
bb:
  %tmp9 = shufflevector <4 x i64> undef, <4 x i64> undef, <4 x i32> zeroinitializer
  br label %bb10

bb10:                                             ; preds = %bb10, %bb
  %tmp12 = phi <4 x i32> [ <i32 9, i32 8, i32 7, i32 6>, %bb ], [ zeroinitializer, %bb10 ]
  %tmp16 = add <4 x i32> %tmp12, <i32 -4, i32 -4, i32 -4, i32 -4>
  %tmp18 = zext <4 x i32> %tmp12 to <4 x i64>
  %tmp19 = zext <4 x i32> %tmp16 to <4 x i64>
  %tmp20 = xor <4 x i64> %tmp18, <i64 -1, i64 -1, i64 -1, i64 -1>
  %tmp21 = xor <4 x i64> %tmp19, <i64 -1, i64 -1, i64 -1, i64 -1>
  %tmp24 = mul <4 x i64> %tmp9, %tmp20
  %tmp25 = mul <4 x i64> %tmp9, %tmp21
  %tmp26 = select <4 x i1> undef, <4 x i64> zeroinitializer, <4 x i64> %tmp24
  %tmp27 = select <4 x i1> undef, <4 x i64> zeroinitializer, <4 x i64> %tmp25
  %tmp28 = add <4 x i64> zeroinitializer, %tmp26
  %tmp29 = add <4 x i64> zeroinitializer, %tmp27
  %tmp33 = icmp eq i32 %a0, 232
  br i1 %tmp33, label %bb34, label %bb10

bb34:                                             ; preds = %bb10
  %tmp35 = add <4 x i64> %tmp29, %tmp28
  ret void
}

; PR43159: udiv by non-uniform constants is lowered with PMULUDQ-based multiply/shift sequences, and the extracted results feed a tail call to @foo.
define i32 @PR43159(<4 x i32>* %a0) {
; SSE-LABEL: PR43159:
; SSE:       # %bb.0: # %entry
; SSE-NEXT:    movdqa (%rdi), %xmm0
; SSE-NEXT:    movdqa {{.*#+}} xmm1 = [1645975491,344322273,2164392969,1916962805]
; SSE-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
; SSE-NEXT:    pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
; SSE-NEXT:    pmuludq %xmm2, %xmm3
; SSE-NEXT:    movdqa %xmm0, %xmm2
; SSE-NEXT:    psrld $1, %xmm2
; SSE-NEXT:    pblendw {{.*#+}} xmm2 = xmm0[0,1,2,3],xmm2[4,5],xmm0[6,7]
; SSE-NEXT:    pmuludq %xmm1, %xmm2
; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm2[1,1,3,3]
; SSE-NEXT:    pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,3],xmm1[4,5],xmm3[6,7]
; SSE-NEXT:    psubd %xmm3, %xmm0
; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
; SSE-NEXT:    pmuludq {{.*}}(%rip), %xmm0
; SSE-NEXT:    pxor %xmm2, %xmm2
; SSE-NEXT:    pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm0[2,3],xmm2[4,5],xmm0[6,7]
; SSE-NEXT:    paddd %xmm1, %xmm2
; SSE-NEXT:    movdqa %xmm2, %xmm0
; SSE-NEXT:    psrld $7, %xmm0
; SSE-NEXT:    psrld $6, %xmm2
; SSE-NEXT:    movd %xmm2, %edi
; SSE-NEXT:    pextrd $1, %xmm0, %esi
; SSE-NEXT:    pextrd $2, %xmm2, %edx
; SSE-NEXT:    pextrd $3, %xmm0, %ecx
; SSE-NEXT:    jmp foo # TAILCALL
;
; AVX2-LABEL: PR43159:
; AVX2:       # %bb.0: # %entry
; AVX2-NEXT:    vmovdqa (%rdi), %xmm0
; AVX2-NEXT:    vmovdqa {{.*#+}} xmm1 = [1645975491,344322273,2164392969,1916962805]
; AVX2-NEXT:    vpshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
; AVX2-NEXT:    vpsrlvd {{.*}}(%rip), %xmm0, %xmm3
; AVX2-NEXT:    vpshufd {{.*#+}} xmm4 = xmm3[1,1,3,3]
; AVX2-NEXT:    vpmuludq %xmm2, %xmm4, %xmm2
; AVX2-NEXT:    vpmuludq %xmm1, %xmm3, %xmm1
; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
; AVX2-NEXT:    vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3]
; AVX2-NEXT:    vpsubd %xmm2, %xmm0, %xmm0
; AVX2-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
; AVX2-NEXT:    vpbroadcastd {{.*#+}} xmm2 = [2147483648,2147483648,2147483648,2147483648]
; AVX2-NEXT:    vpmuludq %xmm2, %xmm0, %xmm0
; AVX2-NEXT:    vpxor %xmm2, %xmm2, %xmm2
; AVX2-NEXT:    vpblendd {{.*#+}} xmm0 = xmm2[0],xmm0[1],xmm2[2],xmm0[3]
; AVX2-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    vpsrlvd {{.*}}(%rip), %xmm0, %xmm0
; AVX2-NEXT:    vmovd %xmm0, %edi
; AVX2-NEXT:    vpextrd $1, %xmm0, %esi
; AVX2-NEXT:    vpextrd $2, %xmm0, %edx
; AVX2-NEXT:    vpextrd $3, %xmm0, %ecx
; AVX2-NEXT:    jmp foo # TAILCALL
;
; AVX512VL-LABEL: PR43159:
; AVX512VL:       # %bb.0: # %entry
; AVX512VL-NEXT:    vmovdqa (%rdi), %xmm0
; AVX512VL-NEXT:    vmovdqa {{.*#+}} xmm1 = [1645975491,344322273,2164392969,1916962805]
; AVX512VL-NEXT:    vpshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
; AVX512VL-NEXT:    vpsrlvd {{.*}}(%rip), %xmm0, %xmm3
; AVX512VL-NEXT:    vpshufd {{.*#+}} xmm4 = xmm3[1,1,3,3]
; AVX512VL-NEXT:    vpmuludq %xmm2, %xmm4, %xmm2
; AVX512VL-NEXT:    vpmuludq %xmm1, %xmm3, %xmm1
; AVX512VL-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
; AVX512VL-NEXT:    vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3]
; AVX512VL-NEXT:    vpsubd %xmm2, %xmm0, %xmm0
; AVX512VL-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
; AVX512VL-NEXT:    vpbroadcastd {{.*#+}} xmm2 = [2147483648,2147483648,2147483648,2147483648]
; AVX512VL-NEXT:    vpmuludq %xmm2, %xmm0, %xmm0
; AVX512VL-NEXT:    vpxor %xmm2, %xmm2, %xmm2
; AVX512VL-NEXT:    vpblendd {{.*#+}} xmm0 = xmm2[0],xmm0[1],xmm2[2],xmm0[3]
; AVX512VL-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
; AVX512VL-NEXT:    vpsrlvd {{.*}}(%rip), %xmm0, %xmm0
; AVX512VL-NEXT:    vmovd %xmm0, %edi
; AVX512VL-NEXT:    vpextrd $1, %xmm0, %esi
; AVX512VL-NEXT:    vpextrd $2, %xmm0, %edx
; AVX512VL-NEXT:    vpextrd $3, %xmm0, %ecx
; AVX512VL-NEXT:    jmp foo # TAILCALL
;
; AVX512DQVL-LABEL: PR43159:
; AVX512DQVL:       # %bb.0: # %entry
; AVX512DQVL-NEXT:    vmovdqa (%rdi), %xmm0
; AVX512DQVL-NEXT:    vmovdqa {{.*#+}} xmm1 = [1645975491,344322273,2164392969,1916962805]
; AVX512DQVL-NEXT:    vpshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
; AVX512DQVL-NEXT:    vpsrlvd {{.*}}(%rip), %xmm0, %xmm3
; AVX512DQVL-NEXT:    vpshufd {{.*#+}} xmm4 = xmm3[1,1,3,3]
; AVX512DQVL-NEXT:    vpmuludq %xmm2, %xmm4, %xmm2
; AVX512DQVL-NEXT:    vpmuludq %xmm1, %xmm3, %xmm1
; AVX512DQVL-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
; AVX512DQVL-NEXT:    vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3]
; AVX512DQVL-NEXT:    vpsubd %xmm2, %xmm0, %xmm0
; AVX512DQVL-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
; AVX512DQVL-NEXT:    vpbroadcastd {{.*#+}} xmm2 = [2147483648,2147483648,2147483648,2147483648]
; AVX512DQVL-NEXT:    vpmuludq %xmm2, %xmm0, %xmm0
; AVX512DQVL-NEXT:    vpxor %xmm2, %xmm2, %xmm2
; AVX512DQVL-NEXT:    vpblendd {{.*#+}} xmm0 = xmm2[0],xmm0[1],xmm2[2],xmm0[3]
; AVX512DQVL-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
; AVX512DQVL-NEXT:    vpsrlvd {{.*}}(%rip), %xmm0, %xmm0
; AVX512DQVL-NEXT:    vmovd %xmm0, %edi
; AVX512DQVL-NEXT:    vpextrd $1, %xmm0, %esi
; AVX512DQVL-NEXT:    vpextrd $2, %xmm0, %edx
; AVX512DQVL-NEXT:    vpextrd $3, %xmm0, %ecx
; AVX512DQVL-NEXT:    jmp foo # TAILCALL
entry:
  %0 = load <4 x i32>, <4 x i32>* %a0, align 16
  %div = udiv <4 x i32> %0, <i32 167, i32 237, i32 254, i32 177>
  %ext0 = extractelement <4 x i32> %div, i32 0
  %ext1 = extractelement <4 x i32> %div, i32 1
  %ext2 = extractelement <4 x i32> %div, i32 2
  %ext3 = extractelement <4 x i32> %div, i32 3
  %call = tail call i32 @foo(i32 %ext0, i32 %ext1, i32 %ext2, i32 %ext3)
  ret i32 %call
}
declare dso_local i32 @foo(i32, i32, i32, i32)