; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mattr=sse4.1 | FileCheck %s --check-prefix=SSE
; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mattr=avx2 | FileCheck %s --check-prefix=AVX2
; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mattr=avx512f | FileCheck %s --check-prefix=AVX512

; Verify that we don't scalarize a packed vector shift left of 16-bit
; signed integers if the shift amount is a constant build_vector.
; Check that we produce an SSE2 packed integer multiply (pmullw) instead.

define <8 x i16> @test1(<8 x i16> %a) {
; SSE-LABEL: test1:
; SSE:       # BB#0:
; SSE-NEXT:    pmullw {{.*}}(%rip), %xmm0
; SSE-NEXT:    retq
;
; AVX2-LABEL: test1:
; AVX2:       # BB#0:
; AVX2-NEXT:    vpmullw {{.*}}(%rip), %xmm0, %xmm0
; AVX2-NEXT:    retq
;
; AVX512-LABEL: test1:
; AVX512:       # BB#0:
; AVX512-NEXT:    vpmullw {{.*}}(%rip), %xmm0, %xmm0
; AVX512-NEXT:    retq
  %shl = shl <8 x i16> %a, <i16 1, i16 1, i16 2, i16 3, i16 7, i16 0, i16 9, i16 11>
  ret <8 x i16> %shl
}

define <8 x i16> @test2(<8 x i16> %a) {
; SSE-LABEL: test2:
; SSE:       # BB#0:
; SSE-NEXT:    pmullw {{.*}}(%rip), %xmm0
; SSE-NEXT:    retq
;
; AVX2-LABEL: test2:
; AVX2:       # BB#0:
; AVX2-NEXT:    vpmullw {{.*}}(%rip), %xmm0, %xmm0
; AVX2-NEXT:    retq
;
; AVX512-LABEL: test2:
; AVX512:       # BB#0:
; AVX512-NEXT:    vpmullw {{.*}}(%rip), %xmm0, %xmm0
; AVX512-NEXT:    retq
  %shl = shl <8 x i16> %a, <i16 0, i16 undef, i16 0, i16 0, i16 1, i16 undef, i16 -1, i16 1>
  ret <8 x i16> %shl
}

; Verify that a vector shift left of 32-bit signed integers is simply expanded
; into an SSE4.1 pmulld (instead of cvttps2dq + pmulld) if the vector of shift
; counts is a constant build_vector.

define <4 x i32> @test3(<4 x i32> %a) {
; SSE-LABEL: test3:
; SSE:       # BB#0:
; SSE-NEXT:    pmulld {{.*}}(%rip), %xmm0
; SSE-NEXT:    retq
;
; AVX2-LABEL: test3:
; AVX2:       # BB#0:
; AVX2-NEXT:    vpsllvd {{.*}}(%rip), %xmm0, %xmm0
; AVX2-NEXT:    retq
;
; AVX512-LABEL: test3:
; AVX512:       # BB#0:
; AVX512-NEXT:    vpsllvd {{.*}}(%rip), %xmm0, %xmm0
; AVX512-NEXT:    retq
  %shl = shl <4 x i32> %a, <i32 1, i32 -1, i32 2, i32 -3>
  ret <4 x i32> %shl
}

define <4 x i32> @test4(<4 x i32> %a) {
; SSE-LABEL: test4:
; SSE:       # BB#0:
; SSE-NEXT:    pmulld {{.*}}(%rip), %xmm0
; SSE-NEXT:    retq
;
; AVX2-LABEL: test4:
; AVX2:       # BB#0:
; AVX2-NEXT:    vpsllvd {{.*}}(%rip), %xmm0, %xmm0
; AVX2-NEXT:    retq
;
; AVX512-LABEL: test4:
; AVX512:       # BB#0:
; AVX512-NEXT:    vpsllvd {{.*}}(%rip), %xmm0, %xmm0
; AVX512-NEXT:    retq
  %shl = shl <4 x i32> %a, <i32 0, i32 0, i32 1, i32 1>
  ret <4 x i32> %shl
}

; If we have AVX/SSE2 but not AVX2, verify that the following shift is split
; into two pmullw instructions. With AVX2, the test case below would produce
; a single vpmullw.
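; Note: for the pmullw/pmulld lowerings above and below, each constant shift
; count k is folded into a power-of-two multiplicand 2^k, so a count vector of
; <1,1,2,3,7,0,9,11> becomes the constant-pool operand [2,2,4,8,128,1,512,2048]
; that appears in the SSE checks for test5.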

define <16 x i16> @test5(<16 x i16> %a) {
; SSE-LABEL: test5:
; SSE:       # BB#0:
; SSE-NEXT:    movdqa {{.*#+}} xmm2 = [2,2,4,8,128,1,512,2048]
; SSE-NEXT:    pmullw %xmm2, %xmm0
; SSE-NEXT:    pmullw %xmm2, %xmm1
; SSE-NEXT:    retq
;
; AVX2-LABEL: test5:
; AVX2:       # BB#0:
; AVX2-NEXT:    vpmullw {{.*}}(%rip), %ymm0, %ymm0
; AVX2-NEXT:    retq
;
; AVX512-LABEL: test5:
; AVX512:       # BB#0:
; AVX512-NEXT:    vpmullw {{.*}}(%rip), %ymm0, %ymm0
; AVX512-NEXT:    retq
  %shl = shl <16 x i16> %a, <i16 1, i16 1, i16 2, i16 3, i16 7, i16 0, i16 9, i16 11, i16 1, i16 1, i16 2, i16 3, i16 7, i16 0, i16 9, i16 11>
  ret <16 x i16> %shl
}

; If we have AVX/SSE4.1 but not AVX2, verify that the following shift is split
; into two pmulld instructions. With AVX2, the test case below would produce
; a single vpsllvd instead.

define <8 x i32> @test6(<8 x i32> %a) {
; SSE-LABEL: test6:
; SSE:       # BB#0:
; SSE-NEXT:    movdqa {{.*#+}} xmm2 = [2,2,4,8]
; SSE-NEXT:    pmulld %xmm2, %xmm0
; SSE-NEXT:    pmulld %xmm2, %xmm1
; SSE-NEXT:    retq
;
; AVX2-LABEL: test6:
; AVX2:       # BB#0:
; AVX2-NEXT:    vpsllvd {{.*}}(%rip), %ymm0, %ymm0
; AVX2-NEXT:    retq
;
; AVX512-LABEL: test6:
; AVX512:       # BB#0:
; AVX512-NEXT:    vpsllvd {{.*}}(%rip), %ymm0, %ymm0
; AVX512-NEXT:    retq
  %shl = shl <8 x i32> %a, <i32 1, i32 1, i32 2, i32 3, i32 1, i32 1, i32 2, i32 3>
  ret <8 x i32> %shl
}

; With AVX2 and AVX512, the test case below should produce a sequence of
; two vpmullw instructions. With SSE2 instead, the shift is split into four
; parts and each part is converted into a pmullw.

define <32 x i16> @test7(<32 x i16> %a) {
; SSE-LABEL: test7:
; SSE:       # BB#0:
; SSE-NEXT:    movdqa {{.*#+}} xmm4 = [2,2,4,8,128,1,512,2048]
; SSE-NEXT:    pmullw %xmm4, %xmm0
; SSE-NEXT:    pmullw %xmm4, %xmm1
; SSE-NEXT:    pmullw %xmm4, %xmm2
; SSE-NEXT:    pmullw %xmm4, %xmm3
; SSE-NEXT:    retq
;
; AVX2-LABEL: test7:
; AVX2:       # BB#0:
; AVX2-NEXT:    vmovdqa {{.*#+}} ymm2 = [2,2,4,8,128,1,512,2048,2,2,4,8,128,1,512,2048]
; AVX2-NEXT:    vpmullw %ymm2, %ymm0, %ymm0
; AVX2-NEXT:    vpmullw %ymm2, %ymm1, %ymm1
; AVX2-NEXT:    retq
;
; AVX512-LABEL: test7:
; AVX512:       # BB#0:
; AVX512-NEXT:    vmovdqa {{.*#+}} ymm2 = [2,2,4,8,128,1,512,2048,2,2,4,8,128,1,512,2048]
; AVX512-NEXT:    vpmullw %ymm2, %ymm0, %ymm0
; AVX512-NEXT:    vpmullw %ymm2, %ymm1, %ymm1
; AVX512-NEXT:    retq
  %shl = shl <32 x i16> %a, <i16 1, i16 1, i16 2, i16 3, i16 7, i16 0, i16 9, i16 11, i16 1, i16 1, i16 2, i16 3, i16 7, i16 0, i16 9, i16 11, i16 1, i16 1, i16 2, i16 3, i16 7, i16 0, i16 9, i16 11, i16 1, i16 1, i16 2, i16 3, i16 7, i16 0, i16 9, i16 11>
  ret <32 x i16> %shl
}

; Similar to test7; the difference is that with AVX512 support
; we only produce a single vpsllvd/vpsllvq instead of a pair of vpsllvd/vpsllvq.
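; Note: the SSE checks for test6/test8 again use the multiplied form of the
; counts ([2,2,4,8] for pmulld), while the AVX2 variable-shift lowering keeps
; the raw shift counts ([1,1,2,3,...]) as the vpsllvd operand.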

define <16 x i32> @test8(<16 x i32> %a) {
; SSE-LABEL: test8:
; SSE:       # BB#0:
; SSE-NEXT:    movdqa {{.*#+}} xmm4 = [2,2,4,8]
; SSE-NEXT:    pmulld %xmm4, %xmm0
; SSE-NEXT:    pmulld %xmm4, %xmm1
; SSE-NEXT:    pmulld %xmm4, %xmm2
; SSE-NEXT:    pmulld %xmm4, %xmm3
; SSE-NEXT:    retq
;
; AVX2-LABEL: test8:
; AVX2:       # BB#0:
; AVX2-NEXT:    vmovdqa {{.*#+}} ymm2 = [1,1,2,3,1,1,2,3]
; AVX2-NEXT:    vpsllvd %ymm2, %ymm0, %ymm0
; AVX2-NEXT:    vpsllvd %ymm2, %ymm1, %ymm1
; AVX2-NEXT:    retq
;
; AVX512-LABEL: test8:
; AVX512:       # BB#0:
; AVX512-NEXT:    vpsllvd {{.*}}(%rip), %zmm0, %zmm0
; AVX512-NEXT:    retq
  %shl = shl <16 x i32> %a, <i32 1, i32 1, i32 2, i32 3, i32 1, i32 1, i32 2, i32 3, i32 1, i32 1, i32 2, i32 3, i32 1, i32 1, i32 2, i32 3>
  ret <16 x i32> %shl
}

; Without AVX2/AVX512f support, the 64-bit lanes of 'test9' are shifted
; separately and the results are blended back together.

define <8 x i64> @test9(<8 x i64> %a) {
; SSE-LABEL: test9:
; SSE:       # BB#0:
; SSE-NEXT:    movdqa %xmm1, %xmm4
; SSE-NEXT:    psllq $3, %xmm4
; SSE-NEXT:    psllq $2, %xmm1
; SSE-NEXT:    pblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm4[4,5,6,7]
; SSE-NEXT:    movdqa %xmm3, %xmm4
; SSE-NEXT:    psllq $3, %xmm4
; SSE-NEXT:    psllq $2, %xmm3
; SSE-NEXT:    pblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm4[4,5,6,7]
; SSE-NEXT:    paddq %xmm0, %xmm0
; SSE-NEXT:    paddq %xmm2, %xmm2
; SSE-NEXT:    retq
;
; AVX2-LABEL: test9:
; AVX2:       # BB#0:
; AVX2-NEXT:    vmovdqa {{.*#+}} ymm2 = [1,1,2,3]
; AVX2-NEXT:    vpsllvq %ymm2, %ymm0, %ymm0
; AVX2-NEXT:    vpsllvq %ymm2, %ymm1, %ymm1
; AVX2-NEXT:    retq
;
; AVX512-LABEL: test9:
; AVX512:       # BB#0:
; AVX512-NEXT:    vpsllvq {{.*}}(%rip), %zmm0, %zmm0
; AVX512-NEXT:    retq
  %shl = shl <8 x i64> %a, <i64 1, i64 1, i64 2, i64 3, i64 1, i64 1, i64 2, i64 3>
  ret <8 x i64> %shl
}
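; Note: in the SSE lowering of test9 above, the 128-bit halves shifted by 1 are
; lowered to paddq (x << 1 == x + x), while the halves with counts <2, 3> get
; two psllq copies that are recombined with pblendw.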