; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -mcpu=corei7 -mattr=+sse2,-sse4.1 -cost-model -analyze | FileCheck %s -check-prefix=CHECK -check-prefix=SSE2
; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -mcpu=corei7 -cost-model -analyze | FileCheck %s -check-prefix=CHECK -check-prefix=SSE41
; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -mcpu=corei7-avx -cost-model -analyze | FileCheck %s -check-prefix=CHECK -check-prefix=AVX
; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -mcpu=core-avx2 -cost-model -analyze | FileCheck %s -check-prefix=CHECK -check-prefix=AVX2
; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -mcpu=bdver2 -cost-model -analyze | FileCheck %s -check-prefix=CHECK -check-prefix=XOP -check-prefix=XOPAVX
; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -mcpu=bdver4 -cost-model -analyze | FileCheck %s -check-prefix=CHECK -check-prefix=XOP -check-prefix=XOPAVX2

; Verify the cost of vector shift left instructions.

;
;
; Variable Shifts
;

; shl of <2 x i64> where each lane's shift amount comes from the variable
; vector %b.
define <2 x i64> @var_shift_v2i64(<2 x i64> %a, <2 x i64> %b) {
; CHECK: 'Cost Model Analysis' for function 'var_shift_v2i64':
; SSE2: Found an estimated cost of 4 for instruction:   %shift
; SSE41: Found an estimated cost of 4 for instruction:   %shift
; AVX: Found an estimated cost of 4 for instruction:   %shift
; AVX2: Found an estimated cost of 1 for instruction:   %shift
; XOPAVX: Found an estimated cost of 1 for instruction:   %shift
; XOPAVX2: Found an estimated cost of 1 for instruction:   %shift
  %shift = shl <2 x i64> %a, %b
  ret <2 x i64> %shift
}

; shl of <4 x i64> where each lane's shift amount comes from the variable
; vector %b.
define <4 x i64> @var_shift_v4i64(<4 x i64> %a, <4 x i64> %b) {
; CHECK: 'Cost Model Analysis' for function 'var_shift_v4i64':
; SSE2: Found an estimated cost of 8 for instruction:   %shift
; SSE41: Found an estimated cost of 8 for instruction:   %shift
; AVX: Found an estimated cost of 8 for instruction:   %shift
; AVX2: Found an estimated cost of 1 for instruction:   %shift
; XOPAVX: Found an estimated cost of 2 for instruction:   %shift
; XOPAVX2: Found an estimated cost of 1 for instruction:   %shift
  %shift = shl <4 x i64> %a, %b
  ret <4 x i64> %shift
}

; shl of <4 x i32> where each lane's shift amount comes from the variable
; vector %b.
define <4 x i32> @var_shift_v4i32(<4 x i32> %a, <4 x i32> %b) {
; CHECK: 'Cost Model Analysis' for function 'var_shift_v4i32':
; SSE2: Found an estimated cost of 10 for instruction:   %shift
; SSE41: Found an estimated cost of 10 for instruction:   %shift
; AVX: Found an estimated cost of 10 for instruction:   %shift
; AVX2: Found an estimated cost of 1 for instruction:   %shift
; XOPAVX: Found an estimated cost of 1 for instruction:   %shift
; XOPAVX2: Found an estimated cost of 1 for instruction:   %shift
  %shift = shl <4 x i32> %a, %b
  ret <4 x i32> %shift
}

; shl of <8 x i32> where each lane's shift amount comes from the variable
; vector %b.
define <8 x i32> @var_shift_v8i32(<8 x i32> %a, <8 x i32> %b) {
; CHECK: 'Cost Model Analysis' for function 'var_shift_v8i32':
; SSE2: Found an estimated cost of 20 for instruction:   %shift
; SSE41: Found an estimated cost of 20 for instruction:   %shift
; AVX: Found an estimated cost of 20 for instruction:   %shift
; AVX2: Found an estimated cost of 1 for instruction:   %shift
; XOPAVX: Found an estimated cost of 2 for instruction:   %shift
; XOPAVX2: Found an estimated cost of 1 for instruction:   %shift
  %shift = shl <8 x i32> %a, %b
  ret <8 x i32> %shift
}

; shl of <8 x i16> where each lane's shift amount comes from the variable
; vector %b. Both XOP configurations share one expectation here.
define <8 x i16> @var_shift_v8i16(<8 x i16> %a, <8 x i16> %b) {
; CHECK: 'Cost Model Analysis' for function 'var_shift_v8i16':
; SSE2: Found an estimated cost of 32 for instruction:   %shift
; SSE41: Found an estimated cost of 32 for instruction:   %shift
; AVX: Found an estimated cost of 32 for instruction:   %shift
; AVX2: Found an estimated cost of 32 for instruction:   %shift
; XOP: Found an estimated cost of 1 for instruction:   %shift
  %shift = shl <8 x i16> %a, %b
  ret <8 x i16> %shift
}

; shl of <16 x i16> where each lane's shift amount comes from the variable
; vector %b. Both XOP configurations share one expectation here.
define <16 x i16> @var_shift_v16i16(<16 x i16> %a, <16 x i16> %b) {
; CHECK: 'Cost Model Analysis' for function 'var_shift_v16i16':
; SSE2: Found an estimated cost of 64 for instruction:   %shift
; SSE41: Found an estimated cost of 64 for instruction:   %shift
; AVX: Found an estimated cost of 64 for instruction:   %shift
; AVX2: Found an estimated cost of 10 for instruction:   %shift
; XOP: Found an estimated cost of 2 for instruction:   %shift
  %shift = shl <16 x i16> %a, %b
  ret <16 x i16> %shift
}

; shl of <16 x i8> where each lane's shift amount comes from the variable
; vector %b. Both XOP configurations share one expectation here.
define <16 x i8> @var_shift_v16i8(<16 x i8> %a, <16 x i8> %b) {
; CHECK: 'Cost Model Analysis' for function 'var_shift_v16i8':
; SSE2: Found an estimated cost of 26 for instruction:   %shift
; SSE41: Found an estimated cost of 26 for instruction:   %shift
; AVX: Found an estimated cost of 26 for instruction:   %shift
; AVX2: Found an estimated cost of 26 for instruction:   %shift
; XOP: Found an estimated cost of 1 for instruction:   %shift
  %shift = shl <16 x i8> %a, %b
  ret <16 x i8> %shift
}

; shl of <32 x i8> where each lane's shift amount comes from the variable
; vector %b. Both XOP configurations share one expectation here.
define <32 x i8> @var_shift_v32i8(<32 x i8> %a, <32 x i8> %b) {
; CHECK: 'Cost Model Analysis' for function 'var_shift_v32i8':
; SSE2: Found an estimated cost of 52 for instruction:   %shift
; SSE41: Found an estimated cost of 52 for instruction:   %shift
; AVX: Found an estimated cost of 52 for instruction:   %shift
; AVX2: Found an estimated cost of 11 for instruction:   %shift
; XOP: Found an estimated cost of 2 for instruction:   %shift
  %shift = shl <32 x i8> %a, %b
  ret <32 x i8> %shift
}

;
; Uniform Variable Shifts
;

; shl of <2 x i64> by a uniform variable amount: the all-zero shuffle mask
; broadcasts element 0 of %b to every lane before the shift.
define <2 x i64> @splatvar_shift_v2i64(<2 x i64> %a, <2 x i64> %b) {
; CHECK: 'Cost Model Analysis' for function 'splatvar_shift_v2i64':
; SSE2: Found an estimated cost of 4 for instruction:   %shift
; SSE41: Found an estimated cost of 4 for instruction:   %shift
; AVX: Found an estimated cost of 4 for instruction:   %shift
; AVX2: Found an estimated cost of 1 for instruction:   %shift
; XOPAVX: Found an estimated cost of 1 for instruction:   %shift
; XOPAVX2: Found an estimated cost of 1 for instruction:   %shift
  %splat = shufflevector <2 x i64> %b, <2 x i64> undef, <2 x i32> zeroinitializer
  %shift = shl <2 x i64> %a, %splat
  ret <2 x i64> %shift
}

; shl of <4 x i64> by a uniform variable amount: the all-zero shuffle mask
; broadcasts element 0 of %b to every lane before the shift.
define <4 x i64> @splatvar_shift_v4i64(<4 x i64> %a, <4 x i64> %b) {
; CHECK: 'Cost Model Analysis' for function 'splatvar_shift_v4i64':
; SSE2: Found an estimated cost of 8 for instruction:   %shift
; SSE41: Found an estimated cost of 8 for instruction:   %shift
; AVX: Found an estimated cost of 8 for instruction:   %shift
; AVX2: Found an estimated cost of 1 for instruction:   %shift
; XOPAVX: Found an estimated cost of 2 for instruction:   %shift
; XOPAVX2: Found an estimated cost of 1 for instruction:   %shift
  %splat = shufflevector <4 x i64> %b, <4 x i64> undef, <4 x i32> zeroinitializer
  %shift = shl <4 x i64> %a, %splat
  ret <4 x i64> %shift
}

; shl of <4 x i32> by a uniform variable amount: the all-zero shuffle mask
; broadcasts element 0 of %b to every lane before the shift.
define <4 x i32> @splatvar_shift_v4i32(<4 x i32> %a, <4 x i32> %b) {
; CHECK: 'Cost Model Analysis' for function 'splatvar_shift_v4i32':
; SSE2: Found an estimated cost of 10 for instruction:   %shift
; SSE41: Found an estimated cost of 10 for instruction:   %shift
; AVX: Found an estimated cost of 10 for instruction:   %shift
; AVX2: Found an estimated cost of 1 for instruction:   %shift
; XOPAVX: Found an estimated cost of 1 for instruction:   %shift
; XOPAVX2: Found an estimated cost of 1 for instruction:   %shift
  %splat = shufflevector <4 x i32> %b, <4 x i32> undef, <4 x i32> zeroinitializer
  %shift = shl <4 x i32> %a, %splat
  ret <4 x i32> %shift
}

; shl of <8 x i32> by a uniform variable amount: the all-zero shuffle mask
; broadcasts element 0 of %b to every lane before the shift.
define <8 x i32> @splatvar_shift_v8i32(<8 x i32> %a, <8 x i32> %b) {
; CHECK: 'Cost Model Analysis' for function 'splatvar_shift_v8i32':
; SSE2: Found an estimated cost of 20 for instruction:   %shift
; SSE41: Found an estimated cost of 20 for instruction:   %shift
; AVX: Found an estimated cost of 20 for instruction:   %shift
; AVX2: Found an estimated cost of 1 for instruction:   %shift
; XOPAVX: Found an estimated cost of 2 for instruction:   %shift
; XOPAVX2: Found an estimated cost of 1 for instruction:   %shift
  %splat = shufflevector <8 x i32> %b, <8 x i32> undef, <8 x i32> zeroinitializer
  %shift = shl <8 x i32> %a, %splat
  ret <8 x i32> %shift
}

; shl of <8 x i16> by a uniform variable amount: the all-zero shuffle mask
; broadcasts element 0 of %b to every lane before the shift.
define <8 x i16> @splatvar_shift_v8i16(<8 x i16> %a, <8 x i16> %b) {
; CHECK: 'Cost Model Analysis' for function 'splatvar_shift_v8i16':
; SSE2: Found an estimated cost of 32 for instruction:   %shift
; SSE41: Found an estimated cost of 32 for instruction:   %shift
; AVX: Found an estimated cost of 32 for instruction:   %shift
; AVX2: Found an estimated cost of 32 for instruction:   %shift
; XOP: Found an estimated cost of 1 for instruction:   %shift
  %splat = shufflevector <8 x i16> %b, <8 x i16> undef, <8 x i32> zeroinitializer
  %shift = shl <8 x i16> %a, %splat
  ret <8 x i16> %shift
}

; shl of <16 x i16> by a uniform variable amount: the all-zero shuffle mask
; broadcasts element 0 of %b to every lane before the shift.
define <16 x i16> @splatvar_shift_v16i16(<16 x i16> %a, <16 x i16> %b) {
; CHECK: 'Cost Model Analysis' for function 'splatvar_shift_v16i16':
; SSE2: Found an estimated cost of 64 for instruction:   %shift
; SSE41: Found an estimated cost of 64 for instruction:   %shift
; AVX: Found an estimated cost of 64 for instruction:   %shift
; AVX2: Found an estimated cost of 10 for instruction:   %shift
; XOP: Found an estimated cost of 2 for instruction:   %shift
  %splat = shufflevector <16 x i16> %b, <16 x i16> undef, <16 x i32> zeroinitializer
  %shift = shl <16 x i16> %a, %splat
  ret <16 x i16> %shift
}

; shl of <16 x i8> by a uniform variable amount: the all-zero shuffle mask
; broadcasts element 0 of %b to every lane before the shift.
define <16 x i8> @splatvar_shift_v16i8(<16 x i8> %a, <16 x i8> %b) {
; CHECK: 'Cost Model Analysis' for function 'splatvar_shift_v16i8':
; SSE2: Found an estimated cost of 26 for instruction:   %shift
; SSE41: Found an estimated cost of 26 for instruction:   %shift
; AVX: Found an estimated cost of 26 for instruction:   %shift
; AVX2: Found an estimated cost of 26 for instruction:   %shift
; XOP: Found an estimated cost of 1 for instruction:   %shift
  %splat = shufflevector <16 x i8> %b, <16 x i8> undef, <16 x i32> zeroinitializer
  %shift = shl <16 x i8> %a, %splat
  ret <16 x i8> %shift
}

; shl of <32 x i8> by a uniform variable amount: the all-zero shuffle mask
; broadcasts element 0 of %b to every lane before the shift.
define <32 x i8> @splatvar_shift_v32i8(<32 x i8> %a, <32 x i8> %b) {
; CHECK: 'Cost Model Analysis' for function 'splatvar_shift_v32i8':
; SSE2: Found an estimated cost of 52 for instruction:   %shift
; SSE41: Found an estimated cost of 52 for instruction:   %shift
; AVX: Found an estimated cost of 52 for instruction:   %shift
; AVX2: Found an estimated cost of 11 for instruction:   %shift
; XOP: Found an estimated cost of 2 for instruction:   %shift
  %splat = shufflevector <32 x i8> %b, <32 x i8> undef, <32 x i32> zeroinitializer
  %shift = shl <32 x i8> %a, %splat
  ret <32 x i8> %shift
}

;
; Constant Shifts
;

; shl of <2 x i64> by a constant vector whose lanes differ (1 and 7).
define <2 x i64> @constant_shift_v2i64(<2 x i64> %a) {
; CHECK: 'Cost Model Analysis' for function 'constant_shift_v2i64':
; SSE2: Found an estimated cost of 4 for instruction:   %shift
; SSE41: Found an estimated cost of 4 for instruction:   %shift
; AVX: Found an estimated cost of 4 for instruction:   %shift
; AVX2: Found an estimated cost of 1 for instruction:   %shift
; XOPAVX: Found an estimated cost of 1 for instruction:   %shift
; XOPAVX2: Found an estimated cost of 1 for instruction:   %shift
  %shift = shl <2 x i64> %a, <i64 1, i64 7>
  ret <2 x i64> %shift
}

; shl of <4 x i64> by a constant vector whose lanes differ (1, 7, 15, 31).
define <4 x i64> @constant_shift_v4i64(<4 x i64> %a) {
; CHECK: 'Cost Model Analysis' for function 'constant_shift_v4i64':
; SSE2: Found an estimated cost of 8 for instruction:   %shift
; SSE41: Found an estimated cost of 8 for instruction:   %shift
; AVX: Found an estimated cost of 8 for instruction:   %shift
; AVX2: Found an estimated cost of 1 for instruction:   %shift
; XOPAVX: Found an estimated cost of 2 for instruction:   %shift
; XOPAVX2: Found an estimated cost of 1 for instruction:   %shift
  %shift = shl <4 x i64> %a, <i64 1, i64 7, i64 15, i64 31>
  ret <4 x i64> %shift
}

; shl of <4 x i32> by a constant vector whose lanes differ (4, 5, 6, 7).
define <4 x i32> @constant_shift_v4i32(<4 x i32> %a) {
; CHECK: 'Cost Model Analysis' for function 'constant_shift_v4i32':
; SSE2: Found an estimated cost of 6 for instruction:   %shift
; SSE41: Found an estimated cost of 1 for instruction:   %shift
; AVX: Found an estimated cost of 1 for instruction:   %shift
; AVX2: Found an estimated cost of 1 for instruction:   %shift
; XOPAVX: Found an estimated cost of 1 for instruction:   %shift
; XOPAVX2: Found an estimated cost of 1 for instruction:   %shift
  %shift = shl <4 x i32> %a, <i32 4, i32 5, i32 6, i32 7>
  ret <4 x i32> %shift
}

; shl of <8 x i32> by a constant vector whose lanes differ
; (4, 5, 6, 7, 0, 1, 2, 3).
define <8 x i32> @constant_shift_v8i32(<8 x i32> %a) {
; CHECK: 'Cost Model Analysis' for function 'constant_shift_v8i32':
; SSE2: Found an estimated cost of 12 for instruction:   %shift
; SSE41: Found an estimated cost of 2 for instruction:   %shift
; AVX: Found an estimated cost of 4 for instruction:   %shift
; AVX2: Found an estimated cost of 1 for instruction:   %shift
; XOPAVX: Found an estimated cost of 2 for instruction:   %shift
; XOPAVX2: Found an estimated cost of 1 for instruction:   %shift
  %shift = shl <8 x i32> %a, <i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3>
  ret <8 x i32> %shift
}

; shl of <8 x i16> by a constant vector whose lanes differ (0 through 7).
define <8 x i16> @constant_shift_v8i16(<8 x i16> %a) {
; CHECK: 'Cost Model Analysis' for function 'constant_shift_v8i16':
; SSE2: Found an estimated cost of 1 for instruction:   %shift
; SSE41: Found an estimated cost of 1 for instruction:   %shift
; AVX: Found an estimated cost of 1 for instruction:   %shift
; AVX2: Found an estimated cost of 1 for instruction:   %shift
; XOP: Found an estimated cost of 1 for instruction:   %shift
  %shift = shl <8 x i16> %a, <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7>
  ret <8 x i16> %shift
}

; shl of <16 x i16> by a constant vector whose lanes differ (0 through 7,
; repeated for the upper eight lanes).
define <16 x i16> @constant_shift_v16i16(<16 x i16> %a) {
; CHECK: 'Cost Model Analysis' for function 'constant_shift_v16i16':
; SSE2: Found an estimated cost of 2 for instruction:   %shift
; SSE41: Found an estimated cost of 2 for instruction:   %shift
; AVX: Found an estimated cost of 4 for instruction:   %shift
; AVX2: Found an estimated cost of 1 for instruction:   %shift
; XOPAVX: Found an estimated cost of 2 for instruction:   %shift
; XOPAVX2: Found an estimated cost of 1 for instruction:   %shift
  %shift = shl <16 x i16> %a, <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7>
  ret <16 x i16> %shift
}

; shl of <16 x i8> by a constant vector whose lanes differ (0 up to 7, then
; 7 back down to 0).
define <16 x i8> @constant_shift_v16i8(<16 x i8> %a) {
; CHECK: 'Cost Model Analysis' for function 'constant_shift_v16i8':
; SSE2: Found an estimated cost of 26 for instruction:   %shift
; SSE41: Found an estimated cost of 26 for instruction:   %shift
; AVX: Found an estimated cost of 26 for instruction:   %shift
; AVX2: Found an estimated cost of 26 for instruction:   %shift
; XOP: Found an estimated cost of 1 for instruction:   %shift
  %shift = shl <16 x i8> %a, <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0>
  ret <16 x i8> %shift
}

; shl of <32 x i8> by a constant vector whose lanes differ (the 16-lane
; 0..7,7..0 pattern repeated twice).
define <32 x i8> @constant_shift_v32i8(<32 x i8> %a) {
; CHECK: 'Cost Model Analysis' for function 'constant_shift_v32i8':
; SSE2: Found an estimated cost of 52 for instruction:   %shift
; SSE41: Found an estimated cost of 52 for instruction:   %shift
; AVX: Found an estimated cost of 52 for instruction:   %shift
; AVX2: Found an estimated cost of 11 for instruction:   %shift
; XOP: Found an estimated cost of 2 for instruction:   %shift
  %shift = shl <32 x i8> %a, <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0>
  ret <32 x i8> %shift
}

;
; Uniform Constant Shifts
;

; shl of <2 x i64> where every lane is shifted by the same constant (7).
define <2 x i64> @splatconstant_shift_v2i64(<2 x i64> %a) {
; CHECK: 'Cost Model Analysis' for function 'splatconstant_shift_v2i64':
; SSE2: Found an estimated cost of 1 for instruction:   %shift
; SSE41: Found an estimated cost of 1 for instruction:   %shift
; AVX: Found an estimated cost of 1 for instruction:   %shift
; AVX2: Found an estimated cost of 1 for instruction:   %shift
; XOPAVX: Found an estimated cost of 1 for instruction:   %shift
; XOPAVX2: Found an estimated cost of 1 for instruction:   %shift
  %shift = shl <2 x i64> %a, <i64 7, i64 7>
  ret <2 x i64> %shift
}

; shl of <4 x i64> where every lane is shifted by the same constant (7).
define <4 x i64> @splatconstant_shift_v4i64(<4 x i64> %a) {
; CHECK: 'Cost Model Analysis' for function 'splatconstant_shift_v4i64':
; SSE2: Found an estimated cost of 2 for instruction:   %shift
; SSE41: Found an estimated cost of 2 for instruction:   %shift
; AVX: Found an estimated cost of 2 for instruction:   %shift
; AVX2: Found an estimated cost of 1 for instruction:   %shift
; XOPAVX: Found an estimated cost of 2 for instruction:   %shift
; XOPAVX2: Found an estimated cost of 1 for instruction:   %shift
  %shift = shl <4 x i64> %a, <i64 7, i64 7, i64 7, i64 7>
  ret <4 x i64> %shift
}

; shl of <4 x i32> where every lane is shifted by the same constant (5).
define <4 x i32> @splatconstant_shift_v4i32(<4 x i32> %a) {
; CHECK: 'Cost Model Analysis' for function 'splatconstant_shift_v4i32':
; SSE2: Found an estimated cost of 1 for instruction:   %shift
; SSE41: Found an estimated cost of 1 for instruction:   %shift
; AVX: Found an estimated cost of 1 for instruction:   %shift
; AVX2: Found an estimated cost of 1 for instruction:   %shift
; XOPAVX: Found an estimated cost of 1 for instruction:   %shift
; XOPAVX2: Found an estimated cost of 1 for instruction:   %shift
  %shift = shl <4 x i32> %a, <i32 5, i32 5, i32 5, i32 5>
  ret <4 x i32> %shift
}

; shl of <8 x i32> where every lane is shifted by the same constant (5).
define <8 x i32> @splatconstant_shift_v8i32(<8 x i32> %a) {
; CHECK: 'Cost Model Analysis' for function 'splatconstant_shift_v8i32':
; SSE2: Found an estimated cost of 2 for instruction:   %shift
; SSE41: Found an estimated cost of 2 for instruction:   %shift
; AVX: Found an estimated cost of 2 for instruction:   %shift
; AVX2: Found an estimated cost of 1 for instruction:   %shift
; XOPAVX: Found an estimated cost of 2 for instruction:   %shift
; XOPAVX2: Found an estimated cost of 1 for instruction:   %shift
  %shift = shl <8 x i32> %a, <i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5>
  ret <8 x i32> %shift
}

; shl of <8 x i16> where every lane is shifted by the same constant (3).
define <8 x i16> @splatconstant_shift_v8i16(<8 x i16> %a) {
; CHECK: 'Cost Model Analysis' for function 'splatconstant_shift_v8i16':
; SSE2: Found an estimated cost of 1 for instruction:   %shift
; SSE41: Found an estimated cost of 1 for instruction:   %shift
; AVX: Found an estimated cost of 1 for instruction:   %shift
; AVX2: Found an estimated cost of 1 for instruction:   %shift
; XOP: Found an estimated cost of 1 for instruction:   %shift
  %shift = shl <8 x i16> %a, <i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3>
  ret <8 x i16> %shift
}

; shl of <16 x i16> where every lane is shifted by the same constant (3).
define <16 x i16> @splatconstant_shift_v16i16(<16 x i16> %a) {
; CHECK: 'Cost Model Analysis' for function 'splatconstant_shift_v16i16':
; SSE2: Found an estimated cost of 2 for instruction:   %shift
; SSE41: Found an estimated cost of 2 for instruction:   %shift
; AVX: Found an estimated cost of 2 for instruction:   %shift
; AVX2: Found an estimated cost of 1 for instruction:   %shift
; XOPAVX: Found an estimated cost of 2 for instruction:   %shift
; XOPAVX2: Found an estimated cost of 1 for instruction:   %shift
  %shift = shl <16 x i16> %a, <i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3>
  ret <16 x i16> %shift
}

; shl of <16 x i8> where every lane is shifted by the same constant (3).
define <16 x i8> @splatconstant_shift_v16i8(<16 x i8> %a) {
; CHECK: 'Cost Model Analysis' for function 'splatconstant_shift_v16i8':
; SSE2: Found an estimated cost of 1 for instruction:   %shift
; SSE41: Found an estimated cost of 1 for instruction:   %shift
; AVX: Found an estimated cost of 1 for instruction:   %shift
; AVX2: Found an estimated cost of 1 for instruction:   %shift
; XOP: Found an estimated cost of 1 for instruction:   %shift
  %shift = shl <16 x i8> %a, <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3>
  ret <16 x i8> %shift
}

; shl of <32 x i8> where every lane is shifted by the same constant (3).
; NOTE(review): the AVX2 expectation (11) is higher than the SSE/AVX one (2);
; it is kept as-is here and should be confirmed against the cost tables.
define <32 x i8> @splatconstant_shift_v32i8(<32 x i8> %a) {
; CHECK: 'Cost Model Analysis' for function 'splatconstant_shift_v32i8':
; SSE2: Found an estimated cost of 2 for instruction:   %shift
; SSE41: Found an estimated cost of 2 for instruction:   %shift
; AVX: Found an estimated cost of 2 for instruction:   %shift
; AVX2: Found an estimated cost of 11 for instruction:   %shift
; XOP: Found an estimated cost of 2 for instruction:   %shift
  %shift = shl <32 x i8> %a, <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3>
  ret <32 x i8> %shift
}

;
; Special Cases
;

; We always emit a single pmullw in the case of v8i16 vector shifts by a
; non-uniform constant.

; shl of <8 x i16> by a non-uniform constant vector; the cost is expected to
; be 1 under every RUN configuration (common prefix).
define <8 x i16> @test1(<8 x i16> %a) {
  %shl = shl <8 x i16> %a, <i16 1, i16 1, i16 2, i16 3, i16 7, i16 0, i16 9, i16 11>
  ret <8 x i16> %shl
}
; CHECK: 'Cost Model Analysis' for function 'test1':
; CHECK: Found an estimated cost of 1 for instruction:   %shl


; shl of <8 x i16> by a constant vector that mixes undef lanes with constant
; lanes (including i16 -1); the cost is expected to be 1 under every RUN
; configuration (common prefix).
define <8 x i16> @test2(<8 x i16> %a) {
  %shl = shl <8 x i16> %a, <i16 0, i16 undef, i16 0, i16 0, i16 1, i16 undef, i16 -1, i16 1>
  ret <8 x i16> %shl
}
; CHECK: 'Cost Model Analysis' for function 'test2':
; CHECK: Found an estimated cost of 1 for instruction:   %shl


; With SSE4.1, v4i32 shifts can be lowered into a single pmulld instruction.
; Make sure that the estimated cost is always 1 except for the case where
; we only have SSE2 support. With SSE2, we are forced to custom-lower the
; v4i32 mul as a 2x shuffle, 2x pmuludq, 2x shuffle.

; shl of <4 x i32> by a non-uniform constant vector that includes negative
; lane values (-1 and -3).
define <4 x i32> @test3(<4 x i32> %a) {
  %shl = shl <4 x i32> %a, <i32 1, i32 -1, i32 2, i32 -3>
  ret <4 x i32> %shl
}
; CHECK: 'Cost Model Analysis' for function 'test3':
; SSE2: Found an estimated cost of 6 for instruction:   %shl
; SSE41: Found an estimated cost of 1 for instruction:   %shl
; AVX: Found an estimated cost of 1 for instruction:   %shl
; AVX2: Found an estimated cost of 1 for instruction:   %shl
; XOP: Found an estimated cost of 1 for instruction:   %shl


; shl of <4 x i32> by the non-uniform constant vector <0, 0, 1, 1>.
define <4 x i32> @test4(<4 x i32> %a) {
  %shl = shl <4 x i32> %a, <i32 0, i32 0, i32 1, i32 1>
  ret <4 x i32> %shl
}
; CHECK: 'Cost Model Analysis' for function 'test4':
; SSE2: Found an estimated cost of 6 for instruction:   %shl
; SSE41: Found an estimated cost of 1 for instruction:   %shl
; AVX: Found an estimated cost of 1 for instruction:   %shl
; AVX2: Found an estimated cost of 1 for instruction:   %shl
; XOP: Found an estimated cost of 1 for instruction:   %shl


; On AVX2 we are able to lower the following shift into a single
; vpsllvq. Therefore, the expected cost is only 1. XOP targets are also
; expected to report a cost of 1.
; With only SSE2, SSE4.1 or AVX, this shift is scalarized as the target does
; not support vpsllv instructions.

; shl of <2 x i64> by the non-uniform constant vector <2, 3>.
define <2 x i64> @test5(<2 x i64> %a) {
  %shl = shl <2 x i64> %a, <i64 2, i64 3>
  ret <2 x i64> %shl
}
; CHECK: 'Cost Model Analysis' for function 'test5':
; SSE2: Found an estimated cost of 4 for instruction:   %shl
; SSE41: Found an estimated cost of 4 for instruction:   %shl
; AVX: Found an estimated cost of 4 for instruction:   %shl
; AVX2: Found an estimated cost of 1 for instruction:   %shl
; XOP: Found an estimated cost of 1 for instruction:   %shl


; v16i16 and v8i32 shifts left by a non-uniform constant are lowered into
; vector multiply instructions.  With AVX (but not AVX2), the vector multiply
; is lowered into a sequence of: 1 extract + 2 vpmullw + 1 insert.
;
; With AVX2, the vpmullw instruction works with 256-bit quantities and
; therefore there is no need to split the resulting vector multiply into
; a sequence of two multiplies.
;
; With SSE2 and SSE4.1, the vector shift cost for 'test6' is twice
; the cost computed in the case of 'test1'. That is because the backend
; simply emits 2 pmullw with no extract/insert.


; shl of <16 x i16> by a non-uniform constant vector (the eight-lane pattern
; used by 'test1', repeated twice).
define <16 x i16> @test6(<16 x i16> %a) {
  %shl = shl <16 x i16> %a, <i16 1, i16 1, i16 2, i16 3, i16 7, i16 0, i16 9, i16 11, i16 1, i16 1, i16 2, i16 3, i16 7, i16 0, i16 9, i16 11>
  ret <16 x i16> %shl
}
; CHECK: 'Cost Model Analysis' for function 'test6':
; SSE2: Found an estimated cost of 2 for instruction:   %shl
; SSE41: Found an estimated cost of 2 for instruction:   %shl
; AVX: Found an estimated cost of 4 for instruction:   %shl
; AVX2: Found an estimated cost of 1 for instruction:   %shl
; XOPAVX: Found an estimated cost of 2 for instruction:   %shl
; XOPAVX2: Found an estimated cost of 1 for instruction:   %shl


; With SSE2 and SSE4.1, the vector shift cost for 'test7' is twice
; the cost computed in the case of 'test3'. That is because the multiply
; is type-legalized into two v4i32 vector multiplies.

; shl of <8 x i32> by a non-uniform constant vector (<1, 1, 2, 3> repeated
; twice).
define <8 x i32> @test7(<8 x i32> %a) {
  %shl = shl <8 x i32> %a, <i32 1, i32 1, i32 2, i32 3, i32 1, i32 1, i32 2, i32 3>
  ret <8 x i32> %shl
}
; CHECK: 'Cost Model Analysis' for function 'test7':
; SSE2: Found an estimated cost of 12 for instruction:   %shl
; SSE41: Found an estimated cost of 2 for instruction:   %shl
; AVX: Found an estimated cost of 4 for instruction:   %shl
; AVX2: Found an estimated cost of 1 for instruction:   %shl
; XOPAVX: Found an estimated cost of 2 for instruction:   %shl
; XOPAVX2: Found an estimated cost of 1 for instruction:   %shl


; On AVX2 we are able to lower the following shift into a single
; vpsllvq. Therefore, the expected cost is only 1. XOP targets are expected
; to report a cost of 2 without AVX2 and 1 with it.
; With only SSE2, SSE4.1 or AVX, this shift is scalarized as the target does
; not support vpsllv instructions.

; shl of <4 x i64> by the non-uniform constant vector <1, 2, 3, 4>.
define <4 x i64> @test8(<4 x i64> %a) {
  %shl = shl <4 x i64> %a, <i64 1, i64 2, i64 3, i64 4>
  ret <4 x i64> %shl
}
; CHECK: 'Cost Model Analysis' for function 'test8':
; SSE2: Found an estimated cost of 8 for instruction:   %shl
; SSE41: Found an estimated cost of 8 for instruction:   %shl
; AVX: Found an estimated cost of 8 for instruction:   %shl
; AVX2: Found an estimated cost of 1 for instruction:   %shl
; XOPAVX: Found an estimated cost of 2 for instruction:   %shl
; XOPAVX2: Found an estimated cost of 1 for instruction:   %shl


; Same as 'test6', except that the cost is doubled.

; shl of <32 x i16> by a non-uniform constant vector (the eight-lane pattern
; <1, 1, 2, 3, 7, 0, 9, 11> repeated four times).
define <32 x i16> @test9(<32 x i16> %a) {
  %shl = shl <32 x i16> %a, <i16 1, i16 1, i16 2, i16 3, i16 7, i16 0, i16 9, i16 11, i16 1, i16 1, i16 2, i16 3, i16 7, i16 0, i16 9, i16 11, i16 1, i16 1, i16 2, i16 3, i16 7, i16 0, i16 9, i16 11, i16 1, i16 1, i16 2, i16 3, i16 7, i16 0, i16 9, i16 11>
  ret <32 x i16> %shl
}
; CHECK: 'Cost Model Analysis' for function 'test9':
; SSE2: Found an estimated cost of 4 for instruction:   %shl
; SSE41: Found an estimated cost of 4 for instruction:   %shl
; AVX: Found an estimated cost of 8 for instruction:   %shl
; AVX2: Found an estimated cost of 2 for instruction:   %shl
; XOPAVX: Found an estimated cost of 4 for instruction:   %shl
; XOPAVX2: Found an estimated cost of 2 for instruction:   %shl


; Same as 'test7', except that the cost is doubled.

; shl of <16 x i32> by a non-uniform constant vector (<1, 1, 2, 3> repeated
; four times).
define <16 x i32> @test10(<16 x i32> %a) {
  %shl = shl <16 x i32> %a, <i32 1, i32 1, i32 2, i32 3, i32 1, i32 1, i32 2, i32 3, i32 1, i32 1, i32 2, i32 3, i32 1, i32 1, i32 2, i32 3>
  ret <16 x i32> %shl
}
; CHECK: 'Cost Model Analysis' for function 'test10':
; SSE2: Found an estimated cost of 24 for instruction:   %shl
; SSE41: Found an estimated cost of 4 for instruction:   %shl
; AVX: Found an estimated cost of 8 for instruction:   %shl
; AVX2: Found an estimated cost of 2 for instruction:   %shl
; XOPAVX: Found an estimated cost of 4 for instruction:   %shl
; XOPAVX2: Found an estimated cost of 2 for instruction:   %shl


; On AVX2 we are able to lower the following shift into a sequence of
; two vpsllvq instructions. Therefore, the expected cost is only 2. XOP
; targets are expected to report a cost of 4 without AVX2 and 2 with it.
; With only SSE2, SSE4.1 or AVX, this shift is scalarized as we don't have
; vpsllv instructions.

; shl of <8 x i64> by a non-uniform constant vector (<1, 1, 2, 3> repeated
; twice).
define <8 x i64> @test11(<8 x i64> %a) {
  %shl = shl <8 x i64> %a, <i64 1, i64 1, i64 2, i64 3, i64 1, i64 1, i64 2, i64 3>
  ret <8 x i64> %shl
}
; CHECK: 'Cost Model Analysis' for function 'test11':
; SSE2: Found an estimated cost of 16 for instruction:   %shl
; SSE41: Found an estimated cost of 16 for instruction:   %shl
; AVX: Found an estimated cost of 16 for instruction:   %shl
; AVX2: Found an estimated cost of 2 for instruction:   %shl
; XOPAVX: Found an estimated cost of 4 for instruction:   %shl
; XOPAVX2: Found an estimated cost of 2 for instruction:   %shl
