; Cost-model regression test: each RUN invocation prints the cost analysis for
; a different x86 CPU and FileCheck matches the per-instruction costs below.
; RUN: opt < %s  -cost-model -analyze -mtriple=x86_64-apple-macosx10.8.0 -mcpu=corei7-avx | FileCheck %s
; RUN: opt < %s  -cost-model -analyze -mtriple=x86_64-apple-macosx10.8.0 -mcpu=core2 | FileCheck %s --check-prefix=SSE3
; RUN: opt < %s  -cost-model -analyze -mtriple=x86_64-apple-macosx10.8.0 -mcpu=core-avx2 | FileCheck %s --check-prefix=AVX2
; NOTE(review): several functions below carry directives with an "AVX" prefix,
; but no RUN line passes --check-prefix=AVX, so those directives are never
; verified by any run -- confirm the intended prefix before relying on them.

target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
target triple = "x86_64-apple-macosx10.8.0"

; Vector integer add on the default (corei7-avx) run: 128-bit adds cost 1,
; 256-bit adds cost 4, and the 512-bit case costs 8 (split into legal parts).
define i32 @add(i32 %arg) {
  ;CHECK: cost of 1 {{.*}} add
  %A = add <4 x i32> undef, undef
  ;CHECK: cost of 4 {{.*}} add
  %B = add <8 x i32> undef, undef
  ;CHECK: cost of 1 {{.*}} add
  %C = add <2 x i64> undef, undef
  ;CHECK: cost of 4 {{.*}} add
  %D = add <4 x i64> undef, undef
  ;CHECK: cost of 8 {{.*}} add
  %E = add <8 x i64> undef, undef
  ;CHECK: cost of 0 {{.*}} ret
  ret i32 undef
}


; Bitwise xor is expected to cost 1 at every tested width, including the
; 256-bit vectors (handled as a single floating-point-domain logic op on AVX).
define i32 @xor(i32 %arg) {
  ;CHECK: cost of 1 {{.*}} xor
  %A = xor <4 x i32> undef, undef
  ;CHECK: cost of 1 {{.*}} xor
  %B = xor <8 x i32> undef, undef
  ;CHECK: cost of 1 {{.*}} xor
  %C = xor <2 x i64> undef, undef
  ;CHECK: cost of 1 {{.*}} xor
  %D = xor <4 x i64> undef, undef
  ;CHECK: cost of 0 {{.*}} ret
  ret i32 undef
}

; CHECK: mul
define void @mul() {
  ; A <2 x i32> gets expanded to a <2 x i64> vector.
  ; A <2 x i64> vector multiply is implemented using
  ; 3 PMULUDQ and 2 PADDS and 4 shifts.
  ;CHECK: cost of 9 {{.*}} mul
  %A0 = mul <2 x i32> undef, undef
  ;CHECK: cost of 9 {{.*}} mul
  %A1 = mul <2 x i64> undef, undef
  ;CHECK: cost of 18 {{.*}} mul
  %A2 = mul <4 x i64> undef, undef
  ret void
}

; SSE3: sse3mull
define void @sse3mull() {
  ; SSE3: cost of 6 {{.*}} mul
  %A0 = mul <4 x i32> undef, undef
  ret void
  ; The directive below anchors ordering: the next function's name must appear
  ; in the core2 run's output after the cost line above.
  ; SSE3: avx2mull
}

; AVX2: avx2mull
define void @avx2mull() {
  ; AVX2: cost of 9 {{.*}} mul
  %A0 = mul <4 x i64> undef, undef
  ret void
  ; The directive below anchors ordering: the next function's name must appear
  ; in the core-avx2 run's output after the cost line above.
  ; AVX2: fmul
}

; Floating-point multiply: cost 2 for both 128-bit and 256-bit vectors on the
; default (corei7-avx) run.
; CHECK: fmul
define i32 @fmul(i32 %arg) {
  ;CHECK: cost of 2 {{.*}} fmul
  %A = fmul <4 x float> undef, undef
  ;CHECK: cost of 2 {{.*}} fmul
  %B = fmul <8 x float> undef, undef
  ret i32 undef
}

; 128-bit vector shifts. NOTE(review): the AVX-prefixed lines in this function
; are not exercised by any RUN line (no --check-prefix=AVX), so only the AVX2
; expectations are actually verified here.
; AVX: shift
; AVX2: shift
define void @shift() {
  ; AVX: cost of 2 {{.*}} shl
  ; AVX2: cost of 1 {{.*}} shl
  %A0 = shl <4 x i32> undef, undef
  ; AVX: cost of 2 {{.*}} shl
  ; AVX2: cost of 1 {{.*}} shl
  %A1 = shl <2 x i64> undef, undef

  ; AVX: cost of 2 {{.*}} lshr
  ; AVX2: cost of 1 {{.*}} lshr
  %B0 = lshr <4 x i32> undef, undef
  ; AVX: cost of 2 {{.*}} lshr
  ; AVX2: cost of 1 {{.*}} lshr
  %B1 = lshr <2 x i64> undef, undef

  ; AVX: cost of 2 {{.*}} ashr
  ; AVX2: cost of 1 {{.*}} ashr
  %C0 = ashr <4 x i32> undef, undef
  ; There is no native 64-bit arithmetic right shift, hence the high cost.
  ; AVX: cost of 6 {{.*}} ashr
  ; AVX2: cost of 20 {{.*}} ashr
  %C1 = ashr <2 x i64> undef, undef

  ret void
}

; 256-bit vector shifts. NOTE(review): as in @shift, the AVX-prefixed lines
; here are dead (no RUN line passes --check-prefix=AVX); only the AVX2
; expectations are verified.
; AVX: avx2shift
; AVX2: avx2shift
define void @avx2shift() {
  ; AVX: cost of 2 {{.*}} shl
  ; AVX2: cost of 1 {{.*}} shl
  %A0 = shl <8 x i32> undef, undef
  ; AVX: cost of 2 {{.*}} shl
  ; AVX2: cost of 1 {{.*}} shl
  %A1 = shl <4 x i64> undef, undef

  ; AVX: cost of 2 {{.*}} lshr
  ; AVX2: cost of 1 {{.*}} lshr
  %B0 = lshr <8 x i32> undef, undef
  ; AVX: cost of 2 {{.*}} lshr
  ; AVX2: cost of 1 {{.*}} lshr
  %B1 = lshr <4 x i64> undef, undef

  ; AVX: cost of 2 {{.*}} ashr
  ; AVX2: cost of 1 {{.*}} ashr
  %C0 = ashr <8 x i32> undef, undef
  ; There is no native 64-bit arithmetic right shift, hence the high cost.
  ; AVX: cost of 12 {{.*}} ashr
  ; AVX2: cost of 40 {{.*}} ashr
  %C1 = ashr <4 x i64> undef, undef

  ret void
}
