1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mattr=sse4.1 | FileCheck %s --check-prefix=SSE
3; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mattr=avx2 | FileCheck %s --check-prefix=AVX2
4; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mattr=avx512f | FileCheck %s --check-prefix=AVX512
5
6; Verify that we don't scalarize a packed vector shift left of 16-bit
7; signed integers if the shift amount is a constant build_vector.
8; Check that we produce an SSE2 packed integer multiply (pmullw) instead.
9
10define <8 x i16> @test1(<8 x i16> %a) {
11; SSE-LABEL: test1:
12; SSE:       # BB#0:
13; SSE-NEXT:    pmullw {{.*}}(%rip), %xmm0
14; SSE-NEXT:    retq
15;
16; AVX2-LABEL: test1:
17; AVX2:       # BB#0:
18; AVX2-NEXT:    vpmullw {{.*}}(%rip), %xmm0, %xmm0
19; AVX2-NEXT:    retq
20;
21; AVX512-LABEL: test1:
22; AVX512:       # BB#0:
23; AVX512-NEXT:    vpmullw {{.*}}(%rip), %xmm0, %xmm0
24; AVX512-NEXT:    retq
  ; Each of the eight 16-bit lanes is shifted left by its own constant amount.
  ; All three RUN configurations are expected to emit one 128-bit multiply
  ; (pmullw/vpmullw) with a RIP-relative memory operand and nothing else
  ; before the return.
25  %shl = shl <8 x i16> %a, <i16 1, i16 1, i16 2, i16 3, i16 7, i16 0, i16 9, i16 11>
26  ret <8 x i16> %shl
27}
28
29define <8 x i16> @test2(<8 x i16> %a) {
30; SSE-LABEL: test2:
31; SSE:       # BB#0:
32; SSE-NEXT:    pmullw {{.*}}(%rip), %xmm0
33; SSE-NEXT:    retq
34;
35; AVX2-LABEL: test2:
36; AVX2:       # BB#0:
37; AVX2-NEXT:    vpmullw {{.*}}(%rip), %xmm0, %xmm0
38; AVX2-NEXT:    retq
39;
40; AVX512-LABEL: test2:
41; AVX512:       # BB#0:
42; AVX512-NEXT:    vpmullw {{.*}}(%rip), %xmm0, %xmm0
43; AVX512-NEXT:    retq
  ; The constant amount vector has two undef lanes and one lane equal to
  ; i16 -1 (65535 when read as unsigned, i.e. not less than the 16-bit lane
  ; width). A single pmullw/vpmullw with a RIP-relative operand is still
  ; expected in every configuration.
44  %shl = shl <8 x i16> %a, <i16 0, i16 undef, i16 0, i16 0, i16 1, i16 undef, i16 -1, i16 1>
45  ret <8 x i16> %shl
46}
47
48; Verify that a vector shift left of 32-bit signed integers is simply expanded
49; into an SSE4.1 pmulld (instead of cvttps2dq + pmulld) if the vector of shift
50; counts is a constant build_vector. With AVX2/AVX512F we expect vpsllvd instead.
51
52define <4 x i32> @test3(<4 x i32> %a) {
53; SSE-LABEL: test3:
54; SSE:       # BB#0:
55; SSE-NEXT:    pmulld {{.*}}(%rip), %xmm0
56; SSE-NEXT:    retq
57;
58; AVX2-LABEL: test3:
59; AVX2:       # BB#0:
60; AVX2-NEXT:    vpsllvd {{.*}}(%rip), %xmm0, %xmm0
61; AVX2-NEXT:    retq
62;
63; AVX512-LABEL: test3:
64; AVX512:       # BB#0:
65; AVX512-NEXT:    vpsllvd {{.*}}(%rip), %xmm0, %xmm0
66; AVX512-NEXT:    retq
  ; Constant per-lane amounts; i32 -1 and i32 -3 are not less than 32 when
  ; read as unsigned. The SSE4.1 run expects a single pmulld, while the AVX2
  ; and AVX512F runs expect a single variable shift (vpsllvd), each with a
  ; RIP-relative memory operand.
67  %shl = shl <4 x i32> %a, <i32 1, i32 -1, i32 2, i32 -3>
68  ret <4 x i32> %shl
69}
70
71define <4 x i32> @test4(<4 x i32> %a) {
72; SSE-LABEL: test4:
73; SSE:       # BB#0:
74; SSE-NEXT:    pmulld {{.*}}(%rip), %xmm0
75; SSE-NEXT:    retq
76;
77; AVX2-LABEL: test4:
78; AVX2:       # BB#0:
79; AVX2-NEXT:    vpsllvd {{.*}}(%rip), %xmm0, %xmm0
80; AVX2-NEXT:    retq
81;
82; AVX512-LABEL: test4:
83; AVX512:       # BB#0:
84; AVX512-NEXT:    vpsllvd {{.*}}(%rip), %xmm0, %xmm0
85; AVX512-NEXT:    retq
  ; Constant amounts of 0 and 1 only (low two lanes unshifted, high two lanes
  ; shifted by one). The SSE4.1 run expects a single pmulld, and the AVX2 and
  ; AVX512F runs expect a single vpsllvd.
86  %shl = shl <4 x i32> %a, <i32 0, i32 0, i32 1, i32 1>
87  ret <4 x i32> %shl
88}
89
90; If we have AVX/SSE2 but not AVX2, verify that the following shift is split
91; into two pmullw instructions. With AVX2, the test case below would produce
92; a single vpmullw.
93
94define <16 x i16> @test5(<16 x i16> %a) {
95; SSE-LABEL: test5:
96; SSE:       # BB#0:
97; SSE-NEXT:    movdqa {{.*#+}} xmm2 = [2,2,4,8,128,1,512,2048]
98; SSE-NEXT:    pmullw %xmm2, %xmm0
99; SSE-NEXT:    pmullw %xmm2, %xmm1
100; SSE-NEXT:    retq
101;
102; AVX2-LABEL: test5:
103; AVX2:       # BB#0:
104; AVX2-NEXT:    vpmullw {{.*}}(%rip), %ymm0, %ymm0
105; AVX2-NEXT:    retq
106;
107; AVX512-LABEL: test5:
108; AVX512:       # BB#0:
109; AVX512-NEXT:    vpmullw {{.*}}(%rip), %ymm0, %ymm0
110; AVX512-NEXT:    retq
  ; The same eight amounts are repeated for both 128-bit halves of the vector.
  ; The SSE4.1 run loads the multipliers [2,2,4,8,128,1,512,2048] (each one is
  ; 1 << amount) into xmm2 once and applies pmullw to xmm0 and xmm1. The AVX2
  ; and AVX512F runs expect one 256-bit vpmullw on ymm0.
111  %shl = shl <16 x i16> %a, <i16 1, i16 1, i16 2, i16 3, i16 7, i16 0, i16 9, i16 11, i16 1, i16 1, i16 2, i16 3, i16 7, i16 0, i16 9, i16 11>
112  ret <16 x i16> %shl
113}
114
115; If we have AVX/SSE4.1 but not AVX2, verify that the following shift is split
116; into two pmulld instructions. With AVX2, the test case below would produce
117; a single vpsllvd instead.
118
119define <8 x i32> @test6(<8 x i32> %a) {
120; SSE-LABEL: test6:
121; SSE:       # BB#0:
122; SSE-NEXT:    movdqa {{.*#+}} xmm2 = [2,2,4,8]
123; SSE-NEXT:    pmulld %xmm2, %xmm0
124; SSE-NEXT:    pmulld %xmm2, %xmm1
125; SSE-NEXT:    retq
126;
127; AVX2-LABEL: test6:
128; AVX2:       # BB#0:
129; AVX2-NEXT:    vpsllvd {{.*}}(%rip), %ymm0, %ymm0
130; AVX2-NEXT:    retq
131;
132; AVX512-LABEL: test6:
133; AVX512:       # BB#0:
134; AVX512-NEXT:    vpsllvd {{.*}}(%rip), %ymm0, %ymm0
135; AVX512-NEXT:    retq
  ; The amounts <1, 1, 2, 3> are repeated for both 128-bit halves. The SSE4.1
  ; run loads the multipliers [2,2,4,8] (each one is 1 << amount) into xmm2
  ; once and applies pmulld to xmm0 and xmm1. The AVX2 and AVX512F runs expect
  ; one 256-bit vpsllvd on ymm0.
136  %shl = shl <8 x i32> %a, <i32 1, i32 1, i32 2, i32 3, i32 1, i32 1, i32 2, i32 3>
137  ret <8 x i32> %shl
138}
139
140; With AVX2 and AVX512, the test case below should produce a sequence of
141; two vpmullw instructions. On SSE2 instead, we split the shift into four
142; parts and then we convert each part into a pmullw.
143
144define <32 x i16> @test7(<32 x i16> %a) {
145; SSE-LABEL: test7:
146; SSE:       # BB#0:
147; SSE-NEXT:    movdqa {{.*#+}} xmm4 = [2,2,4,8,128,1,512,2048]
148; SSE-NEXT:    pmullw %xmm4, %xmm0
149; SSE-NEXT:    pmullw %xmm4, %xmm1
150; SSE-NEXT:    pmullw %xmm4, %xmm2
151; SSE-NEXT:    pmullw %xmm4, %xmm3
152; SSE-NEXT:    retq
153;
154; AVX2-LABEL: test7:
155; AVX2:       # BB#0:
156; AVX2-NEXT:    vmovdqa {{.*#+}} ymm2 = [2,2,4,8,128,1,512,2048,2,2,4,8,128,1,512,2048]
157; AVX2-NEXT:    vpmullw %ymm2, %ymm0, %ymm0
158; AVX2-NEXT:    vpmullw %ymm2, %ymm1, %ymm1
159; AVX2-NEXT:    retq
160;
161; AVX512-LABEL: test7:
162; AVX512:       # BB#0:
163; AVX512-NEXT:    vmovdqa {{.*#+}} ymm2 = [2,2,4,8,128,1,512,2048,2,2,4,8,128,1,512,2048]
164; AVX512-NEXT:    vpmullw %ymm2, %ymm0, %ymm0
165; AVX512-NEXT:    vpmullw %ymm2, %ymm1, %ymm1
166; AVX512-NEXT:    retq
  ; The same eight amounts are repeated four times across the 512-bit vector.
  ; The SSE4.1 run shares one multiplier constant in xmm4 across four pmullw
  ; instructions. The AVX2 and AVX512F runs both share one constant in ymm2
  ; across two 256-bit vpmullw instructions.
  ; NOTE(review): the AVX512F run is not expected to use a single 512-bit
  ; multiply here, presumably because -mattr=avx512f alone provides no
  ; 512-bit 16-bit-element multiply; confirm against the ISA reference.
167  %shl = shl <32 x i16> %a, <i16 1, i16 1, i16 2, i16 3, i16 7, i16 0, i16 9, i16 11, i16 1, i16 1, i16 2, i16 3, i16 7, i16 0, i16 9, i16 11, i16 1, i16 1, i16 2, i16 3, i16 7, i16 0, i16 9, i16 11, i16 1, i16 1, i16 2, i16 3, i16 7, i16 0, i16 9, i16 11>
168  ret <32 x i16> %shl
169}
170
171; Similar to test7; the difference is that with AVX512 support
172; we only produce a single vpsllvd/vpsllvq instead of a pair of vpsllvd/vpsllvq.
173
174define <16 x i32> @test8(<16 x i32> %a) {
175; SSE-LABEL: test8:
176; SSE:       # BB#0:
177; SSE-NEXT:    movdqa {{.*#+}} xmm4 = [2,2,4,8]
178; SSE-NEXT:    pmulld %xmm4, %xmm0
179; SSE-NEXT:    pmulld %xmm4, %xmm1
180; SSE-NEXT:    pmulld %xmm4, %xmm2
181; SSE-NEXT:    pmulld %xmm4, %xmm3
182; SSE-NEXT:    retq
183;
184; AVX2-LABEL: test8:
185; AVX2:       # BB#0:
186; AVX2-NEXT:    vmovdqa {{.*#+}} ymm2 = [1,1,2,3,1,1,2,3]
187; AVX2-NEXT:    vpsllvd %ymm2, %ymm0, %ymm0
188; AVX2-NEXT:    vpsllvd %ymm2, %ymm1, %ymm1
189; AVX2-NEXT:    retq
190;
191; AVX512-LABEL: test8:
192; AVX512:       # BB#0:
193; AVX512-NEXT:    vpsllvd {{.*}}(%rip), %zmm0, %zmm0
194; AVX512-NEXT:    retq
  ; The amounts <1, 1, 2, 3> are repeated four times across the 512-bit vector.
  ; The SSE4.1 run multiplies each of the four xmm parts by [2,2,4,8] held in
  ; xmm4. The AVX2 run keeps the raw amounts [1,1,2,3,1,1,2,3] in ymm2 and
  ; issues two vpsllvd. The AVX512F run expects a single 512-bit vpsllvd on
  ; zmm0 with a RIP-relative operand.
195  %shl = shl <16 x i32> %a, <i32 1, i32 1, i32 2, i32 3, i32 1, i32 1, i32 2, i32 3, i32 1, i32 1, i32 2, i32 3, i32 1, i32 1, i32 2, i32 3>
196  ret <16 x i32> %shl
197}
198
199; Without AVX2/AVX512f support, the shift in 'test9' is lowered to separate immediate shifts (psllq/paddq) whose results are blended.
200
201define <8 x i64> @test9(<8 x i64> %a) {
202; SSE-LABEL: test9:
203; SSE:       # BB#0:
204; SSE-NEXT:    movdqa %xmm1, %xmm4
205; SSE-NEXT:    psllq $3, %xmm4
206; SSE-NEXT:    psllq $2, %xmm1
207; SSE-NEXT:    pblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm4[4,5,6,7]
208; SSE-NEXT:    movdqa %xmm3, %xmm4
209; SSE-NEXT:    psllq $3, %xmm4
210; SSE-NEXT:    psllq $2, %xmm3
211; SSE-NEXT:    pblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm4[4,5,6,7]
212; SSE-NEXT:    paddq %xmm0, %xmm0
213; SSE-NEXT:    paddq %xmm2, %xmm2
214; SSE-NEXT:    retq
215;
216; AVX2-LABEL: test9:
217; AVX2:       # BB#0:
218; AVX2-NEXT:    vmovdqa {{.*#+}} ymm2 = [1,1,2,3]
219; AVX2-NEXT:    vpsllvq %ymm2, %ymm0, %ymm0
220; AVX2-NEXT:    vpsllvq %ymm2, %ymm1, %ymm1
221; AVX2-NEXT:    retq
222;
223; AVX512-LABEL: test9:
224; AVX512:       # BB#0:
225; AVX512-NEXT:    vpsllvq {{.*}}(%rip), %zmm0, %zmm0
226; AVX512-NEXT:    retq
  ; The amounts <1, 1, 2, 3> are repeated twice across the eight 64-bit lanes.
  ; In the SSE4.1 run, xmm1 and xmm3 are each shifted by both immediates
  ; (psllq $2 and psllq $3 on a copy) and the two results are merged with
  ; pblendw; xmm0 and xmm2 are added to themselves with paddq, which doubles
  ; them (a shift by 1).
  ; The AVX2 run expects two vpsllvq sharing the amounts [1,1,2,3] in ymm2.
  ; The AVX512F run expects a single 512-bit vpsllvq on zmm0.
227  %shl = shl <8 x i64> %a, <i64 1, i64 1, i64 2, i64 3, i64 1, i64 1, i64 2, i64 3>
228  ret <8 x i64> %shl
229}
230