; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mattr=sse2 | FileCheck %s --check-prefixes=SSE,SSE2
; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mattr=sse4.1 | FileCheck %s --check-prefixes=SSE,SSE41
; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mattr=avx2 | FileCheck %s --check-prefixes=AVX,AVX2
; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mattr=avx512f | FileCheck %s --check-prefixes=AVX,AVX512

; Verify that we don't scalarize a packed vector shift left of 16-bit
; signed integers if the amount is a constant build_vector.
; Check that we produce an SSE2 packed integer multiply (pmullw) instead.
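;
; A shift left by a constant build_vector is equivalent to a multiply by the
; per-lane powers of two (1 << c). As a rough sketch of the expected rewrite
; for test1 (illustrative only, not checked by FileCheck):
;
;   %shl = shl <8 x i16> %a, <i16 1, i16 1, i16 2, i16 3, i16 7, i16 0, i16 9, i16 11>
;   ; is turned into
;   %mul = mul <8 x i16> %a, <i16 2, i16 2, i16 4, i16 8, i16 128, i16 1, i16 512, i16 2048>
;
; which then selects to a single (v)pmullw with a constant-pool operand.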

define <8 x i16> @test1(<8 x i16> %a) {
; SSE-LABEL: test1:
; SSE:       # %bb.0:
; SSE-NEXT:    pmullw {{.*}}(%rip), %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: test1:
; AVX:       # %bb.0:
; AVX-NEXT:    vpmullw {{.*}}(%rip), %xmm0, %xmm0
; AVX-NEXT:    retq
  %shl = shl <8 x i16> %a, <i16 1, i16 1, i16 2, i16 3, i16 7, i16 0, i16 9, i16 11>
  ret <8 x i16> %shl
}

define <8 x i16> @test2(<8 x i16> %a) {
; SSE-LABEL: test2:
; SSE:       # %bb.0:
; SSE-NEXT:    pmullw {{.*}}(%rip), %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: test2:
; AVX:       # %bb.0:
; AVX-NEXT:    vpmullw {{.*}}(%rip), %xmm0, %xmm0
; AVX-NEXT:    retq
  %shl = shl <8 x i16> %a, <i16 0, i16 undef, i16 0, i16 0, i16 1, i16 undef, i16 -1, i16 1>
  ret <8 x i16> %shl
}

; Verify that a vector shift left of 32-bit signed integers is simply expanded
; into an SSE4.1 pmulld (instead of cvttps2dq + pmulld) if the vector of shift
; counts is a constant build_vector.
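;
; The same shift-to-multiply rewrite applies to the 32-bit lanes: the in-range
; amounts in test3 become multipliers of 1 << c (1 -> 2, 2 -> 4), while the
; out-of-range amounts -1 and -3 make those lanes poison, so the backend may
; pick any multiplier for them. Without AVX2's per-lane variable shifts,
; test4 is instead lowered with one full-vector shift plus a blend, roughly
; (%sh1 and %blend are illustrative names, not checked by FileCheck):
;
;   %sh1   = shl <4 x i32> %a, <i32 1, i32 1, i32 1, i32 1>            ; pslld $1
;   %blend = shufflevector <4 x i32> %a, <4 x i32> %sh1,
;                          <4 x i32> <i32 0, i32 1, i32 6, i32 7>      ; movsd / pblendw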

define <4 x i32> @test3(<4 x i32> %a) {
; SSE2-LABEL: test3:
; SSE2:       # %bb.0:
; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
; SSE2-NEXT:    pmuludq %xmm0, %xmm1
; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
; SSE2-NEXT:    pmuludq {{.*}}(%rip), %xmm0
; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSE2-NEXT:    retq
;
; SSE41-LABEL: test3:
; SSE41:       # %bb.0:
; SSE41-NEXT:    pmulld {{.*}}(%rip), %xmm0
; SSE41-NEXT:    retq
;
; AVX-LABEL: test3:
; AVX:       # %bb.0:
; AVX-NEXT:    vpsllvd {{.*}}(%rip), %xmm0, %xmm0
; AVX-NEXT:    retq
  %shl = shl <4 x i32> %a, <i32 1, i32 -1, i32 2, i32 -3>
  ret <4 x i32> %shl
}

define <4 x i32> @test4(<4 x i32> %a) {
; SSE2-LABEL: test4:
; SSE2:       # %bb.0:
; SSE2-NEXT:    movdqa %xmm0, %xmm1
; SSE2-NEXT:    pslld $1, %xmm1
; SSE2-NEXT:    movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1]
; SSE2-NEXT:    movapd %xmm1, %xmm0
; SSE2-NEXT:    retq
;
; SSE41-LABEL: test4:
; SSE41:       # %bb.0:
; SSE41-NEXT:    movdqa %xmm0, %xmm1
; SSE41-NEXT:    pslld $1, %xmm1
; SSE41-NEXT:    pblendw {{.*#+}} xmm1 = xmm0[0,1,2,3],xmm1[4,5,6,7]
; SSE41-NEXT:    movdqa %xmm1, %xmm0
; SSE41-NEXT:    retq
;
; AVX-LABEL: test4:
; AVX:       # %bb.0:
; AVX-NEXT:    vpsllvd {{.*}}(%rip), %xmm0, %xmm0
; AVX-NEXT:    retq
  %shl = shl <4 x i32> %a, <i32 0, i32 0, i32 1, i32 1>
  ret <4 x i32> %shl
}

; Without AVX2, verify that the following 256-bit shift is split into two
; pmullw instructions (one per 128-bit half). With AVX2, the test case below
; produces a single vpmullw on the full 256-bit vector.
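;
; A sketch of the legalization here, assuming no AVX2 (illustrative only;
; %a.lo and %a.hi name the two 128-bit halves of %a): the v16i16 shift is
; split and each half is rewritten as a multiply by the same constant,
;
;   %lo = mul <8 x i16> %a.lo, <i16 2, i16 2, i16 4, i16 8, i16 128, i16 1, i16 512, i16 2048>
;   %hi = mul <8 x i16> %a.hi, <i16 2, i16 2, i16 4, i16 8, i16 128, i16 1, i16 512, i16 2048>
;
; which is why both pmullw instructions below share one constant vector.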

define <16 x i16> @test5(<16 x i16> %a) {
; SSE-LABEL: test5:
; SSE:       # %bb.0:
; SSE-NEXT:    movdqa {{.*#+}} xmm2 = [2,2,4,8,128,1,512,2048]
; SSE-NEXT:    pmullw %xmm2, %xmm0
; SSE-NEXT:    pmullw %xmm2, %xmm1
; SSE-NEXT:    retq
;
; AVX-LABEL: test5:
; AVX:       # %bb.0:
; AVX-NEXT:    vpmullw {{.*}}(%rip), %ymm0, %ymm0
; AVX-NEXT:    retq
  %shl = shl <16 x i16> %a, <i16 1, i16 1, i16 2, i16 3, i16 7, i16 0, i16 9, i16 11, i16 1, i16 1, i16 2, i16 3, i16 7, i16 0, i16 9, i16 11>
  ret <16 x i16> %shl
}

; With SSE4.1 but without AVX2, verify that the following 256-bit shift is
; split into two pmulld instructions (one per 128-bit half). With AVX2, the
; test case below produces a single vpsllvd instead.
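;
; For the v8i32 case the split halves are likewise rewritten as multiplies by
; <2, 2, 4, 8>, matching the constant loaded into %xmm2 below; a rough sketch
; (%a.lo and %a.hi are illustrative names for the two 128-bit halves):
;
;   %lo = mul <4 x i32> %a.lo, <i32 2, i32 2, i32 4, i32 8>
;   %hi = mul <4 x i32> %a.hi, <i32 2, i32 2, i32 4, i32 8>
;
; With AVX2 the whole vector stays legal and a single variable shift (vpsllvd)
; with the original <1, 1, 2, 3, 1, 1, 2, 3> counts is used instead.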

define <8 x i32> @test6(<8 x i32> %a) {
; SSE2-LABEL: test6:
; SSE2:       # %bb.0:
; SSE2-NEXT:    movdqa {{.*#+}} xmm2 = [2,2,4,8]
; SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
; SSE2-NEXT:    pmuludq %xmm2, %xmm0
; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; SSE2-NEXT:    pshufd {{.*#+}} xmm4 = xmm2[1,1,3,3]
; SSE2-NEXT:    pmuludq %xmm4, %xmm3
; SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3]
; SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1]
; SSE2-NEXT:    pmuludq %xmm1, %xmm2
; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
; SSE2-NEXT:    pmuludq %xmm4, %xmm1
; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
; SSE2-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
; SSE2-NEXT:    movdqa %xmm2, %xmm1
; SSE2-NEXT:    retq
;
; SSE41-LABEL: test6:
; SSE41:       # %bb.0:
; SSE41-NEXT:    movdqa {{.*#+}} xmm2 = [2,2,4,8]
; SSE41-NEXT:    pmulld %xmm2, %xmm0
; SSE41-NEXT:    pmulld %xmm2, %xmm1
; SSE41-NEXT:    retq
;
; AVX-LABEL: test6:
; AVX:       # %bb.0:
; AVX-NEXT:    vpsllvd {{.*}}(%rip), %ymm0, %ymm0
; AVX-NEXT:    retq
  %shl = shl <8 x i32> %a, <i32 1, i32 1, i32 2, i32 3, i32 1, i32 1, i32 2, i32 3>
  ret <8 x i32> %shl
}

; With AVX2 and AVX512, the test case below should produce a sequence of
; two vpmullw instructions. On SSE targets instead, the shift is split into
; four 128-bit parts and each part is converted into a pmullw.
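;
; A sketch of the expected splitting (illustrative only): the shift amounts of
; test7 repeat every 128 bits, so every part multiplies by the same
; <2, 2, 4, 8, 128, 1, 512, 2048> constant. SSE gets four 128-bit pmullw,
; AVX2 gets two 256-bit vpmullw of a broadcast of that constant, and AVX512F
; (which lacks a 512-bit vpmullw without AVX512BW) extracts and reinserts the
; upper 256 bits around two 256-bit vpmullw.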

define <32 x i16> @test7(<32 x i16> %a) {
; SSE-LABEL: test7:
; SSE:       # %bb.0:
; SSE-NEXT:    movdqa {{.*#+}} xmm4 = [2,2,4,8,128,1,512,2048]
; SSE-NEXT:    pmullw %xmm4, %xmm0
; SSE-NEXT:    pmullw %xmm4, %xmm1
; SSE-NEXT:    pmullw %xmm4, %xmm2
; SSE-NEXT:    pmullw %xmm4, %xmm3
; SSE-NEXT:    retq
;
; AVX2-LABEL: test7:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vbroadcasti128 {{.*#+}} ymm2 = [2,2,4,8,128,1,512,2048,2,2,4,8,128,1,512,2048]
; AVX2-NEXT:    # ymm2 = mem[0,1,0,1]
; AVX2-NEXT:    vpmullw %ymm2, %ymm0, %ymm0
; AVX2-NEXT:    vpmullw %ymm2, %ymm1, %ymm1
; AVX2-NEXT:    retq
;
; AVX512-LABEL: test7:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
; AVX512-NEXT:    vbroadcasti128 {{.*#+}} ymm2 = [2,2,4,8,128,1,512,2048,2,2,4,8,128,1,512,2048]
; AVX512-NEXT:    # ymm2 = mem[0,1,0,1]
; AVX512-NEXT:    vpmullw %ymm2, %ymm1, %ymm1
; AVX512-NEXT:    vpmullw %ymm2, %ymm0, %ymm0
; AVX512-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm0
; AVX512-NEXT:    retq
  %shl = shl <32 x i16> %a, <i16 1, i16 1, i16 2, i16 3, i16 7, i16 0, i16 9, i16 11, i16 1, i16 1, i16 2, i16 3, i16 7, i16 0, i16 9, i16 11, i16 1, i16 1, i16 2, i16 3, i16 7, i16 0, i16 9, i16 11, i16 1, i16 1, i16 2, i16 3, i16 7, i16 0, i16 9, i16 11>
  ret <32 x i16> %shl
}

; Similar to test7; the difference is that with AVX512 support we produce a
; single vpsllvd/vpsllvq (on a 512-bit vector) instead of a pair of them.
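;
; A sketch of the difference (illustrative only): with AVX512F both v16i32
; and v8i64 are legal types and 512-bit variable shifts exist, so a single
; vpsllvd/vpsllvq with a constant-pool vector of shift counts suffices. With
; only AVX2 the vectors are split into two 256-bit halves and each half is
; shifted by the repeating <1, 1, 2, 3> count pattern, as checked below.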

define <16 x i32> @test8(<16 x i32> %a) {
; SSE2-LABEL: test8:
; SSE2:       # %bb.0:
; SSE2-NEXT:    movdqa {{.*#+}} xmm4 = [2,2,4,8]
; SSE2-NEXT:    pshufd {{.*#+}} xmm5 = xmm0[1,1,3,3]
; SSE2-NEXT:    pmuludq %xmm4, %xmm0
; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; SSE2-NEXT:    pshufd {{.*#+}} xmm6 = xmm4[1,1,3,3]
; SSE2-NEXT:    pmuludq %xmm6, %xmm5
; SSE2-NEXT:    pshufd {{.*#+}} xmm5 = xmm5[0,2,2,3]
; SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm5[0],xmm0[1],xmm5[1]
; SSE2-NEXT:    pshufd {{.*#+}} xmm5 = xmm1[1,1,3,3]
; SSE2-NEXT:    pmuludq %xmm4, %xmm1
; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
; SSE2-NEXT:    pmuludq %xmm6, %xmm5
; SSE2-NEXT:    pshufd {{.*#+}} xmm5 = xmm5[0,2,2,3]
; SSE2-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm5[0],xmm1[1],xmm5[1]
; SSE2-NEXT:    pshufd {{.*#+}} xmm5 = xmm2[1,1,3,3]
; SSE2-NEXT:    pmuludq %xmm4, %xmm2
; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
; SSE2-NEXT:    pmuludq %xmm6, %xmm5
; SSE2-NEXT:    pshufd {{.*#+}} xmm5 = xmm5[0,2,2,3]
; SSE2-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1]
; SSE2-NEXT:    pmuludq %xmm3, %xmm4
; SSE2-NEXT:    pshufd {{.*#+}} xmm4 = xmm4[0,2,2,3]
; SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3]
; SSE2-NEXT:    pmuludq %xmm6, %xmm3
; SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3]
; SSE2-NEXT:    punpckldq {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1]
; SSE2-NEXT:    movdqa %xmm4, %xmm3
; SSE2-NEXT:    retq
;
; SSE41-LABEL: test8:
; SSE41:       # %bb.0:
; SSE41-NEXT:    movdqa {{.*#+}} xmm4 = [2,2,4,8]
; SSE41-NEXT:    pmulld %xmm4, %xmm0
; SSE41-NEXT:    pmulld %xmm4, %xmm1
; SSE41-NEXT:    pmulld %xmm4, %xmm2
; SSE41-NEXT:    pmulld %xmm4, %xmm3
; SSE41-NEXT:    retq
;
; AVX2-LABEL: test8:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vbroadcasti128 {{.*#+}} ymm2 = [1,1,2,3,1,1,2,3]
; AVX2-NEXT:    # ymm2 = mem[0,1,0,1]
; AVX2-NEXT:    vpsllvd %ymm2, %ymm0, %ymm0
; AVX2-NEXT:    vpsllvd %ymm2, %ymm1, %ymm1
; AVX2-NEXT:    retq
;
; AVX512-LABEL: test8:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vpsllvd {{.*}}(%rip), %zmm0, %zmm0
; AVX512-NEXT:    retq
  %shl = shl <16 x i32> %a, <i32 1, i32 1, i32 2, i32 3, i32 1, i32 1, i32 2, i32 3, i32 1, i32 1, i32 2, i32 3, i32 1, i32 1, i32 2, i32 3>
  ret <16 x i32> %shl
}

; Without AVX2/AVX512F support, the shift in 'test9' is performed as separate
; immediate shifts whose results are then blended together.
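;
; A minimal sketch of the SSE lowering for one <2 x i64> half with amounts
; <2, 3> (%x, %s2, %s3 and %r are illustrative names, not checked by
; FileCheck): the half is shifted twice by an immediate and the results are
; blended,
;
;   %s2 = shl <2 x i64> %x, <i64 2, i64 2>                   ; psllq $2
;   %s3 = shl <2 x i64> %x, <i64 3, i64 3>                   ; psllq $3
;   %r  = shufflevector <2 x i64> %s2, <2 x i64> %s3, <2 x i32> <i32 0, i32 3>
;
; while the halves with amounts <1, 1> are lowered to paddq (x + x).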

define <8 x i64> @test9(<8 x i64> %a) {
; SSE2-LABEL: test9:
; SSE2:       # %bb.0:
; SSE2-NEXT:    movdqa %xmm1, %xmm4
; SSE2-NEXT:    psllq $2, %xmm4
; SSE2-NEXT:    psllq $3, %xmm1
; SSE2-NEXT:    movsd {{.*#+}} xmm1 = xmm4[0],xmm1[1]
; SSE2-NEXT:    movdqa %xmm3, %xmm4
; SSE2-NEXT:    psllq $2, %xmm4
; SSE2-NEXT:    psllq $3, %xmm3
; SSE2-NEXT:    movsd {{.*#+}} xmm3 = xmm4[0],xmm3[1]
; SSE2-NEXT:    paddq %xmm0, %xmm0
; SSE2-NEXT:    paddq %xmm2, %xmm2
; SSE2-NEXT:    retq
;
; SSE41-LABEL: test9:
; SSE41:       # %bb.0:
; SSE41-NEXT:    movdqa %xmm1, %xmm4
; SSE41-NEXT:    psllq $3, %xmm4
; SSE41-NEXT:    psllq $2, %xmm1
; SSE41-NEXT:    pblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm4[4,5,6,7]
; SSE41-NEXT:    movdqa %xmm3, %xmm4
; SSE41-NEXT:    psllq $3, %xmm4
; SSE41-NEXT:    psllq $2, %xmm3
; SSE41-NEXT:    pblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm4[4,5,6,7]
; SSE41-NEXT:    paddq %xmm0, %xmm0
; SSE41-NEXT:    paddq %xmm2, %xmm2
; SSE41-NEXT:    retq
;
; AVX2-LABEL: test9:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vmovdqa {{.*#+}} ymm2 = [1,1,2,3]
; AVX2-NEXT:    vpsllvq %ymm2, %ymm0, %ymm0
; AVX2-NEXT:    vpsllvq %ymm2, %ymm1, %ymm1
; AVX2-NEXT:    retq
;
; AVX512-LABEL: test9:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vpsllvq {{.*}}(%rip), %zmm0, %zmm0
; AVX512-NEXT:    retq
  %shl = shl <8 x i64> %a, <i64 1, i64 1, i64 2, i64 3, i64 1, i64 1, i64 2, i64 3>
  ret <8 x i64> %shl
}
