; RUN: opt -mtriple=x86_64-apple-darwin -mattr=+sse2 -cost-model -analyze < %s | FileCheck --check-prefix=SSE2 %s
; RUN: opt -mtriple=x86_64-apple-darwin -mattr=+sse4.1 -cost-model -analyze < %s | FileCheck --check-prefix=SSE41 %s

; zext <4 x i8> -> <4 x i64>: expected cost is 4 with +sse2 and 2 with +sse4.1.
; The label checks pin each cost check to this function's analysis output.
define void @zext_v4i8_to_v4i64(<4 x i8>* %a) {
; SSE2-LABEL: 'zext_v4i8_to_v4i64'
; SSE2: cost of 4 {{.*}} zext
;
; SSE41-LABEL: 'zext_v4i8_to_v4i64'
; SSE41: cost of 2 {{.*}} zext
;
  %1 = load <4 x i8>, <4 x i8>* %a
  %2 = zext <4 x i8> %1 to <4 x i64>
  store <4 x i64> %2, <4 x i64>* undef, align 4
  ret void
}

; sext <4 x i8> -> <4 x i64>: expected cost is 8 with +sse2 and 2 with +sse4.1.
; The label checks pin each cost check to this function's analysis output.
define void @sext_v4i8_to_v4i64(<4 x i8>* %a) {
; SSE2-LABEL: 'sext_v4i8_to_v4i64'
; SSE2: cost of 8 {{.*}} sext
;
; SSE41-LABEL: 'sext_v4i8_to_v4i64'
; SSE41: cost of 2 {{.*}} sext
;
  %1 = load <4 x i8>, <4 x i8>* %a
  %2 = sext <4 x i8> %1 to <4 x i64>
  store <4 x i64> %2, <4 x i64>* undef, align 4
  ret void
}

; zext <4 x i16> -> <4 x i64>: expected cost is 3 with +sse2 and 2 with +sse4.1.
; The label checks pin each cost check to this function's analysis output.
define void @zext_v4i16_to_v4i64(<4 x i16>* %a) {
; SSE2-LABEL: 'zext_v4i16_to_v4i64'
; SSE2: cost of 3 {{.*}} zext
;
; SSE41-LABEL: 'zext_v4i16_to_v4i64'
; SSE41: cost of 2 {{.*}} zext
;
  %1 = load <4 x i16>, <4 x i16>* %a
  %2 = zext <4 x i16> %1 to <4 x i64>
  store <4 x i64> %2, <4 x i64>* undef, align 4
  ret void
}

; sext <4 x i16> -> <4 x i64>: expected cost is 10 with +sse2 and 2 with +sse4.1.
; The label checks pin each cost check to this function's analysis output.
define void @sext_v4i16_to_v4i64(<4 x i16>* %a) {
; SSE2-LABEL: 'sext_v4i16_to_v4i64'
; SSE2: cost of 10 {{.*}} sext
;
; SSE41-LABEL: 'sext_v4i16_to_v4i64'
; SSE41: cost of 2 {{.*}} sext
;
  %1 = load <4 x i16>, <4 x i16>* %a
  %2 = sext <4 x i16> %1 to <4 x i64>
  store <4 x i64> %2, <4 x i64>* undef, align 4
  ret void
}


; zext <4 x i32> -> <4 x i64>: expected cost is 3 with +sse2 and 2 with +sse4.1.
; The label checks pin each cost check to this function's analysis output.
define void @zext_v4i32_to_v4i64(<4 x i32>* %a) {
; SSE2-LABEL: 'zext_v4i32_to_v4i64'
; SSE2: cost of 3 {{.*}} zext
;
; SSE41-LABEL: 'zext_v4i32_to_v4i64'
; SSE41: cost of 2 {{.*}} zext
;
  %1 = load <4 x i32>, <4 x i32>* %a
  %2 = zext <4 x i32> %1 to <4 x i64>
  store <4 x i64> %2, <4 x i64>* undef, align 4
  ret void
}

; sext <4 x i32> -> <4 x i64>: expected cost is 5 with +sse2 and 2 with +sse4.1.
; The label checks pin each cost check to this function's analysis output.
define void @sext_v4i32_to_v4i64(<4 x i32>* %a) {
; SSE2-LABEL: 'sext_v4i32_to_v4i64'
; SSE2: cost of 5 {{.*}} sext
;
; SSE41-LABEL: 'sext_v4i32_to_v4i64'
; SSE41: cost of 2 {{.*}} sext
;
  %1 = load <4 x i32>, <4 x i32>* %a
  %2 = sext <4 x i32> %1 to <4 x i64>
  store <4 x i64> %2, <4 x i64>* undef, align 4
  ret void
}

; zext <16 x i16> -> <16 x i32>: expected cost is 6 with +sse2 and 4 with +sse4.1.
; The label checks pin each cost check to this function's analysis output.
define void @zext_v16i16_to_v16i32(<16 x i16>* %a) {
; SSE2-LABEL: 'zext_v16i16_to_v16i32'
; SSE2: cost of 6 {{.*}} zext
;
; SSE41-LABEL: 'zext_v16i16_to_v16i32'
; SSE41: cost of 4 {{.*}} zext
;
  %1 = load <16 x i16>, <16 x i16>* %a
  %2 = zext <16 x i16> %1 to <16 x i32>
  store <16 x i32> %2, <16 x i32>* undef, align 4
  ret void
}

; sext <16 x i16> -> <16 x i32>: expected cost is 8 with +sse2 and 4 with +sse4.1.
; The label checks pin each cost check to this function's analysis output.
define void @sext_v16i16_to_v16i32(<16 x i16>* %a) {
; SSE2-LABEL: 'sext_v16i16_to_v16i32'
; SSE2: cost of 8 {{.*}} sext
;
; SSE41-LABEL: 'sext_v16i16_to_v16i32'
; SSE41: cost of 4 {{.*}} sext
;
  %1 = load <16 x i16>, <16 x i16>* %a
  %2 = sext <16 x i16> %1 to <16 x i32>
  store <16 x i32> %2, <16 x i32>* undef, align 4
  ret void
}

; zext <8 x i16> -> <8 x i32>: expected cost is 3 with +sse2 and 2 with +sse4.1.
; The label checks pin each cost check to this function's analysis output.
define void @zext_v8i16_to_v8i32(<8 x i16>* %a) {
; SSE2-LABEL: 'zext_v8i16_to_v8i32'
; SSE2: cost of 3 {{.*}} zext
;
; SSE41-LABEL: 'zext_v8i16_to_v8i32'
; SSE41: cost of 2 {{.*}} zext
;
  %1 = load <8 x i16>, <8 x i16>* %a
  %2 = zext <8 x i16> %1 to <8 x i32>
  store <8 x i32> %2, <8 x i32>* undef, align 4
  ret void
}

; sext <8 x i16> -> <8 x i32>: expected cost is 4 with +sse2 and 2 with +sse4.1.
; The label checks pin each cost check to this function's analysis output.
define void @sext_v8i16_to_v8i32(<8 x i16>* %a) {
; SSE2-LABEL: 'sext_v8i16_to_v8i32'
; SSE2: cost of 4 {{.*}} sext
;
; SSE41-LABEL: 'sext_v8i16_to_v8i32'
; SSE41: cost of 2 {{.*}} sext
;
  %1 = load <8 x i16>, <8 x i16>* %a
  %2 = sext <8 x i16> %1 to <8 x i32>
  store <8 x i32> %2, <8 x i32>* undef, align 4
  ret void
}

; zext <4 x i16> -> <4 x i32>: expected cost is 1 with +sse2 and 1 with +sse4.1.
; The label checks pin each cost check to this function's analysis output.
define void @zext_v4i16_to_v4i32(<4 x i16>* %a) {
; SSE2-LABEL: 'zext_v4i16_to_v4i32'
; SSE2: cost of 1 {{.*}} zext
;
; SSE41-LABEL: 'zext_v4i16_to_v4i32'
; SSE41: cost of 1 {{.*}} zext
;
  %1 = load <4 x i16>, <4 x i16>* %a
  %2 = zext <4 x i16> %1 to <4 x i32>
  store <4 x i32> %2, <4 x i32>* undef, align 4
  ret void
}

; sext <4 x i16> -> <4 x i32>: expected cost is 2 with +sse2 and 1 with +sse4.1.
; The label checks pin each cost check to this function's analysis output.
define void @sext_v4i16_to_v4i32(<4 x i16>* %a) {
; SSE2-LABEL: 'sext_v4i16_to_v4i32'
; SSE2: cost of 2 {{.*}} sext
;
; SSE41-LABEL: 'sext_v4i16_to_v4i32'
; SSE41: cost of 1 {{.*}} sext
;
  %1 = load <4 x i16>, <4 x i16>* %a
  %2 = sext <4 x i16> %1 to <4 x i32>
  store <4 x i32> %2, <4 x i32>* undef, align 4
  ret void
}

; zext <16 x i8> -> <16 x i32>: expected cost is 9 with +sse2 and 4 with +sse4.1.
; The label checks pin each cost check to this function's analysis output.
define void @zext_v16i8_to_v16i32(<16 x i8>* %a) {
; SSE2-LABEL: 'zext_v16i8_to_v16i32'
; SSE2: cost of 9 {{.*}} zext
;
; SSE41-LABEL: 'zext_v16i8_to_v16i32'
; SSE41: cost of 4 {{.*}} zext
;
  %1 = load <16 x i8>, <16 x i8>* %a
  %2 = zext <16 x i8> %1 to <16 x i32>
  store <16 x i32> %2, <16 x i32>* undef, align 4
  ret void
}

; sext <16 x i8> -> <16 x i32>: expected cost is 12 with +sse2 and 4 with +sse4.1.
; The label checks pin each cost check to this function's analysis output.
define void @sext_v16i8_to_v16i32(<16 x i8>* %a) {
; SSE2-LABEL: 'sext_v16i8_to_v16i32'
; SSE2: cost of 12 {{.*}} sext
;
; SSE41-LABEL: 'sext_v16i8_to_v16i32'
; SSE41: cost of 4 {{.*}} sext
;
  %1 = load <16 x i8>, <16 x i8>* %a
  %2 = sext <16 x i8> %1 to <16 x i32>
  store <16 x i32> %2, <16 x i32>* undef, align 4
  ret void
}

; zext <8 x i8> -> <8 x i32>: expected cost is 6 with +sse2 and 2 with +sse4.1.
; The label checks pin each cost check to this function's analysis output.
define void @zext_v8i8_to_v8i32(<8 x i8>* %a) {
; SSE2-LABEL: 'zext_v8i8_to_v8i32'
; SSE2: cost of 6 {{.*}} zext
;
; SSE41-LABEL: 'zext_v8i8_to_v8i32'
; SSE41: cost of 2 {{.*}} zext
;
  %1 = load <8 x i8>, <8 x i8>* %a
  %2 = zext <8 x i8> %1 to <8 x i32>
  store <8 x i32> %2, <8 x i32>* undef, align 4
  ret void
}

; sext <8 x i8> -> <8 x i32>: expected cost is 6 with +sse2 and 2 with +sse4.1.
; The label checks pin each cost check to this function's analysis output.
define void @sext_v8i8_to_v8i32(<8 x i8>* %a) {
; SSE2-LABEL: 'sext_v8i8_to_v8i32'
; SSE2: cost of 6 {{.*}} sext
;
; SSE41-LABEL: 'sext_v8i8_to_v8i32'
; SSE41: cost of 2 {{.*}} sext
;
  %1 = load <8 x i8>, <8 x i8>* %a
  %2 = sext <8 x i8> %1 to <8 x i32>
  store <8 x i32> %2, <8 x i32>* undef, align 4
  ret void
}

; zext <4 x i8> -> <4 x i32>: expected cost is 2 with +sse2 and 1 with +sse4.1.
; The label checks pin each cost check to this function's analysis output.
define void @zext_v4i8_to_v4i32(<4 x i8>* %a) {
; SSE2-LABEL: 'zext_v4i8_to_v4i32'
; SSE2: cost of 2 {{.*}} zext
;
; SSE41-LABEL: 'zext_v4i8_to_v4i32'
; SSE41: cost of 1 {{.*}} zext
;
  %1 = load <4 x i8>, <4 x i8>* %a
  %2 = zext <4 x i8> %1 to <4 x i32>
  store <4 x i32> %2, <4 x i32>* undef, align 4
  ret void
}

; sext <4 x i8> -> <4 x i32>: expected cost is 3 with +sse2 and 1 with +sse4.1.
; The label checks pin each cost check to this function's analysis output.
define void @sext_v4i8_to_v4i32(<4 x i8>* %a) {
; SSE2-LABEL: 'sext_v4i8_to_v4i32'
; SSE2: cost of 3 {{.*}} sext
;
; SSE41-LABEL: 'sext_v4i8_to_v4i32'
; SSE41: cost of 1 {{.*}} sext
;
  %1 = load <4 x i8>, <4 x i8>* %a
  %2 = sext <4 x i8> %1 to <4 x i32>
  store <4 x i32> %2, <4 x i32>* undef, align 4
  ret void
}

; zext <16 x i8> -> <16 x i16>: expected cost is 3 with +sse2 and 2 with +sse4.1.
; The label checks pin each cost check to this function's analysis output.
define void @zext_v16i8_to_v16i16(<16 x i8>* %a) {
; SSE2-LABEL: 'zext_v16i8_to_v16i16'
; SSE2: cost of 3 {{.*}} zext
;
; SSE41-LABEL: 'zext_v16i8_to_v16i16'
; SSE41: cost of 2 {{.*}} zext
;
  %1 = load <16 x i8>, <16 x i8>* %a
  %2 = zext <16 x i8> %1 to <16 x i16>
  store <16 x i16> %2, <16 x i16>* undef, align 4
  ret void
}

; sext <16 x i8> -> <16 x i16>: expected cost is 4 with +sse2 and 2 with +sse4.1.
; The label checks pin each cost check to this function's analysis output.
define void @sext_v16i8_to_v16i16(<16 x i8>* %a) {
; SSE2-LABEL: 'sext_v16i8_to_v16i16'
; SSE2: cost of 4 {{.*}} sext
;
; SSE41-LABEL: 'sext_v16i8_to_v16i16'
; SSE41: cost of 2 {{.*}} sext
;
  %1 = load <16 x i8>, <16 x i8>* %a
  %2 = sext <16 x i8> %1 to <16 x i16>
  store <16 x i16> %2, <16 x i16>* undef, align 4
  ret void
}

; zext <8 x i8> -> <8 x i16>: expected cost is 1 with +sse2 and 1 with +sse4.1.
; The label checks pin each cost check to this function's analysis output.
define void @zext_v8i8_to_v8i16(<8 x i8>* %a) {
; SSE2-LABEL: 'zext_v8i8_to_v8i16'
; SSE2: cost of 1 {{.*}} zext
;
; SSE41-LABEL: 'zext_v8i8_to_v8i16'
; SSE41: cost of 1 {{.*}} zext
;
  %1 = load <8 x i8>, <8 x i8>* %a
  %2 = zext <8 x i8> %1 to <8 x i16>
  store <8 x i16> %2, <8 x i16>* undef, align 4
  ret void
}

; sext <8 x i8> -> <8 x i16>: expected cost is 2 with +sse2 and 1 with +sse4.1.
; The label checks pin each cost check to this function's analysis output.
define void @sext_v8i8_to_v8i16(<8 x i8>* %a) {
; SSE2-LABEL: 'sext_v8i8_to_v8i16'
; SSE2: cost of 2 {{.*}} sext
;
; SSE41-LABEL: 'sext_v8i8_to_v8i16'
; SSE41: cost of 1 {{.*}} sext
;
  %1 = load <8 x i8>, <8 x i8>* %a
  %2 = sext <8 x i8> %1 to <8 x i16>
  store <8 x i16> %2, <8 x i16>* undef, align 4
  ret void
}

; zext <4 x i8> -> <4 x i16>: expected cost is 1 with +sse2 and 1 with +sse4.1.
; The label checks pin each cost check to this function's analysis output.
define void @zext_v4i8_to_v4i16(<4 x i8>* %a) {
; SSE2-LABEL: 'zext_v4i8_to_v4i16'
; SSE2: cost of 1 {{.*}} zext
;
; SSE41-LABEL: 'zext_v4i8_to_v4i16'
; SSE41: cost of 1 {{.*}} zext
;
  %1 = load <4 x i8>, <4 x i8>* %a
  %2 = zext <4 x i8> %1 to <4 x i16>
  store <4 x i16> %2, <4 x i16>* undef, align 4
  ret void
}

; sext <4 x i8> -> <4 x i16>: expected cost is 6 with +sse2 and 2 with +sse4.1.
; The label checks pin each cost check to this function's analysis output.
define void @sext_v4i8_to_v4i16(<4 x i8>* %a) {
; SSE2-LABEL: 'sext_v4i8_to_v4i16'
; SSE2: cost of 6 {{.*}} sext
;
; SSE41-LABEL: 'sext_v4i8_to_v4i16'
; SSE41: cost of 2 {{.*}} sext
;
  %1 = load <4 x i8>, <4 x i8>* %a
  %2 = sext <4 x i8> %1 to <4 x i16>
  store <4 x i16> %2, <4 x i16>* undef, align 4
  ret void
}

; trunc <16 x i32> -> <16 x i16>: expected cost is 10 with +sse2 and 6 with +sse4.1.
; The label checks pin each cost check to this function's analysis output.
define void @truncate_v16i32_to_v16i16(<16 x i32>* %a) {
; SSE2-LABEL: 'truncate_v16i32_to_v16i16'
; SSE2: cost of 10 {{.*}} trunc
;
; SSE41-LABEL: 'truncate_v16i32_to_v16i16'
; SSE41: cost of 6 {{.*}} trunc
;
  %1 = load <16 x i32>, <16 x i32>* %a
  %2 = trunc <16 x i32> %1 to <16 x i16>
  store <16 x i16> %2, <16 x i16>* undef, align 4
  ret void
}

; trunc <8 x i32> -> <8 x i16>: expected cost is 5 with +sse2 and 3 with +sse4.1.
; The label checks pin each cost check to this function's analysis output.
define void @truncate_v8i32_to_v8i16(<8 x i32>* %a) {
; SSE2-LABEL: 'truncate_v8i32_to_v8i16'
; SSE2: cost of 5 {{.*}} trunc
;
; SSE41-LABEL: 'truncate_v8i32_to_v8i16'
; SSE41: cost of 3 {{.*}} trunc
;
  %1 = load <8 x i32>, <8 x i32>* %a
  %2 = trunc <8 x i32> %1 to <8 x i16>
  store <8 x i16> %2, <8 x i16>* undef, align 4
  ret void
}

; trunc <4 x i32> -> <4 x i16>: expected cost is 3 with +sse2 and 1 with +sse4.1.
; The label checks pin each cost check to this function's analysis output.
define void @truncate_v4i32_to_v4i16(<4 x i32>* %a) {
; SSE2-LABEL: 'truncate_v4i32_to_v4i16'
; SSE2: cost of 3 {{.*}} trunc
;
; SSE41-LABEL: 'truncate_v4i32_to_v4i16'
; SSE41: cost of 1 {{.*}} trunc
;
  %1 = load <4 x i32>, <4 x i32>* %a
  %2 = trunc <4 x i32> %1 to <4 x i16>
  store <4 x i16> %2, <4 x i16>* undef, align 4
  ret void
}

; trunc <16 x i32> -> <16 x i8>: expected cost is 7 with +sse2 and 7 with +sse4.1.
; The label checks pin each cost check to this function's analysis output.
define void @truncate_v16i32_to_v16i8(<16 x i32>* %a) {
; SSE2-LABEL: 'truncate_v16i32_to_v16i8'
; SSE2: cost of 7 {{.*}} trunc
;
; SSE41-LABEL: 'truncate_v16i32_to_v16i8'
; SSE41: cost of 7 {{.*}} trunc
;
  %1 = load <16 x i32>, <16 x i32>* %a
  %2 = trunc <16 x i32> %1 to <16 x i8>
  store <16 x i8> %2, <16 x i8>* undef, align 4
  ret void
}

; trunc <8 x i32> -> <8 x i8>: expected cost is 4 with +sse2 and 3 with +sse4.1.
; The label checks pin each cost check to this function's analysis output.
define void @truncate_v8i32_to_v8i8(<8 x i32>* %a) {
; SSE2-LABEL: 'truncate_v8i32_to_v8i8'
; SSE2: cost of 4 {{.*}} trunc
;
; SSE41-LABEL: 'truncate_v8i32_to_v8i8'
; SSE41: cost of 3 {{.*}} trunc
;
  %1 = load <8 x i32>, <8 x i32>* %a
  %2 = trunc <8 x i32> %1 to <8 x i8>
  store <8 x i8> %2, <8 x i8>* undef, align 4
  ret void
}

; trunc <4 x i32> -> <4 x i8>: expected cost is 3 with +sse2 and 1 with +sse4.1.
; The label checks pin each cost check to this function's analysis output.
define void @truncate_v4i32_to_v4i8(<4 x i32>* %a) {
; SSE2-LABEL: 'truncate_v4i32_to_v4i8'
; SSE2: cost of 3 {{.*}} trunc
;
; SSE41-LABEL: 'truncate_v4i32_to_v4i8'
; SSE41: cost of 1 {{.*}} trunc
;
  %1 = load <4 x i32>, <4 x i32>* %a
  %2 = trunc <4 x i32> %1 to <4 x i8>
  store <4 x i8> %2, <4 x i8>* undef, align 4
  ret void
}

; trunc <16 x i16> -> <16 x i8>: expected cost is 3 with +sse2 and 3 with +sse4.1.
; The label checks pin each cost check to this function's analysis output.
define void @truncate_v16i16_to_v16i8(<16 x i16>* %a) {
; SSE2-LABEL: 'truncate_v16i16_to_v16i8'
; SSE2: cost of 3 {{.*}} trunc
;
; SSE41-LABEL: 'truncate_v16i16_to_v16i8'
; SSE41: cost of 3 {{.*}} trunc
;
  %1 = load <16 x i16>, <16 x i16>* %a
  %2 = trunc <16 x i16> %1 to <16 x i8>
  store <16 x i8> %2, <16 x i8>* undef, align 4
  ret void
}

; trunc <8 x i16> -> <8 x i8>: expected cost is 2 with +sse2 and 1 with +sse4.1.
; The label checks pin each cost check to this function's analysis output.
define void @truncate_v8i16_to_v8i8(<8 x i16>* %a) {
; SSE2-LABEL: 'truncate_v8i16_to_v8i8'
; SSE2: cost of 2 {{.*}} trunc
;
; SSE41-LABEL: 'truncate_v8i16_to_v8i8'
; SSE41: cost of 1 {{.*}} trunc
;
  %1 = load <8 x i16>, <8 x i16>* %a
  %2 = trunc <8 x i16> %1 to <8 x i8>
  store <8 x i8> %2, <8 x i8>* undef, align 4
  ret void
}

; trunc <4 x i16> -> <4 x i8>: expected cost is 4 with +sse2 and 2 with +sse4.1.
; The label checks pin each cost check to this function's analysis output.
define void @truncate_v4i16_to_v4i8(<4 x i16>* %a) {
; SSE2-LABEL: 'truncate_v4i16_to_v4i8'
; SSE2: cost of 4 {{.*}} trunc
;
; SSE41-LABEL: 'truncate_v4i16_to_v4i8'
; SSE41: cost of 2 {{.*}} trunc
;
  %1 = load <4 x i16>, <4 x i16>* %a
  %2 = trunc <4 x i16> %1 to <4 x i8>
  store <4 x i8> %2, <4 x i8>* undef, align 4
  ret void
}
