1; RUN: llc -march=amdgcn -mcpu=verde -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,SIVI,FUNC %s
2; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,SIVI,FUNC %s
3; RUN: llc -march=amdgcn -mcpu=gfx900 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX9,FUNC %s
4; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -enable-var-scope -check-prefix=EG -check-prefix=FUNC %s
5
6; FUNC-LABEL: {{^}}s_add_i32:
7; EG: ADD_INT {{[* ]*}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
8
9; GCN: s_add_i32 s[[REG:[0-9]+]], {{s[0-9]+, s[0-9]+}}
10; GCN: v_mov_b32_e32 v[[V_REG:[0-9]+]], s[[REG]]
11; GCN: buffer_store_dword v[[V_REG]],
; Kernel under test: loads the i32 at %in and the i32 one element past it,
; adds the two values, and stores the sum to %out.
12define amdgpu_kernel void @s_add_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
13  %b_ptr = getelementptr i32, i32 addrspace(1)* %in, i32 1 ; address of element 1 of %in
14  %a = load i32, i32 addrspace(1)* %in
15  %b = load i32, i32 addrspace(1)* %b_ptr
16  %result = add i32 %a, %b
17  store i32 %result, i32 addrspace(1)* %out
18  ret void
19}
20
21; FUNC-LABEL: {{^}}s_add_v2i32:
22; EG: ADD_INT {{[* ]*}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
23; EG: ADD_INT {{[* ]*}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
24
25; GCN: s_add_i32 s{{[0-9]+, s[0-9]+, s[0-9]+}}
26; GCN: s_add_i32 s{{[0-9]+, s[0-9]+, s[0-9]+}}
; Kernel under test: loads two consecutive <2 x i32> vectors starting at %in,
; adds them element-wise, and stores the <2 x i32> result to %out.
27define amdgpu_kernel void @s_add_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(1)* %in) {
28  %b_ptr = getelementptr <2 x i32>, <2 x i32> addrspace(1)* %in, i32 1 ; second <2 x i32> vector
29  %a = load <2 x i32>, <2 x i32> addrspace(1)* %in
30  %b = load <2 x i32>, <2 x i32> addrspace(1)* %b_ptr
31  %result = add <2 x i32> %a, %b
32  store <2 x i32> %result, <2 x i32> addrspace(1)* %out
33  ret void
34}
35
36; FUNC-LABEL: {{^}}s_add_v4i32:
37; EG: ADD_INT {{[* ]*}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
38; EG: ADD_INT {{[* ]*}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
39; EG: ADD_INT {{[* ]*}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
40; EG: ADD_INT {{[* ]*}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
41
42; GCN: s_add_i32 s{{[0-9]+, s[0-9]+, s[0-9]+}}
43; GCN: s_add_i32 s{{[0-9]+, s[0-9]+, s[0-9]+}}
44; GCN: s_add_i32 s{{[0-9]+, s[0-9]+, s[0-9]+}}
45; GCN: s_add_i32 s{{[0-9]+, s[0-9]+, s[0-9]+}}
; Kernel under test: loads two consecutive <4 x i32> vectors starting at %in,
; adds them element-wise, and stores the <4 x i32> result to %out.
46define amdgpu_kernel void @s_add_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) {
47  %b_ptr = getelementptr <4 x i32>, <4 x i32> addrspace(1)* %in, i32 1 ; second <4 x i32> vector
48  %a = load <4 x i32>, <4 x i32> addrspace(1)* %in
49  %b = load <4 x i32>, <4 x i32> addrspace(1)* %b_ptr
50  %result = add <4 x i32> %a, %b
51  store <4 x i32> %result, <4 x i32> addrspace(1)* %out
52  ret void
53}
54
55; FUNC-LABEL: {{^}}s_add_v8i32:
56; EG: ADD_INT
57; EG: ADD_INT
58; EG: ADD_INT
59; EG: ADD_INT
60; EG: ADD_INT
61; EG: ADD_INT
62; EG: ADD_INT
63; EG: ADD_INT
64
65; GCN: s_add_i32
66; GCN: s_add_i32
67; GCN: s_add_i32
68; GCN: s_add_i32
69; GCN: s_add_i32
70; GCN: s_add_i32
71; GCN: s_add_i32
72; GCN: s_add_i32
; Kernel under test: adds two <8 x i32> values that are passed directly as
; kernel arguments (no loads in the IR) and stores the result to %out.
; The check lines before this function expect one add per vector element (8).
73define amdgpu_kernel void @s_add_v8i32(<8 x i32> addrspace(1)* %out, <8 x i32> %a, <8 x i32> %b) {
74entry:
75  %0 = add <8 x i32> %a, %b
76  store <8 x i32> %0, <8 x i32> addrspace(1)* %out
77  ret void
78}
79
80; FUNC-LABEL: {{^}}s_add_v16i32:
81; EG: ADD_INT
82; EG: ADD_INT
83; EG: ADD_INT
84; EG: ADD_INT
85; EG: ADD_INT
86; EG: ADD_INT
87; EG: ADD_INT
88; EG: ADD_INT
89; EG: ADD_INT
90; EG: ADD_INT
91; EG: ADD_INT
92; EG: ADD_INT
93; EG: ADD_INT
94; EG: ADD_INT
95; EG: ADD_INT
96; EG: ADD_INT
97
98; GCN: s_add_i32
99; GCN: s_add_i32
100; GCN: s_add_i32
101; GCN: s_add_i32
102; GCN: s_add_i32
103; GCN: s_add_i32
104; GCN: s_add_i32
105; GCN: s_add_i32
106; GCN: s_add_i32
107; GCN: s_add_i32
108; GCN: s_add_i32
109; GCN: s_add_i32
110; GCN: s_add_i32
111; GCN: s_add_i32
112; GCN: s_add_i32
113; GCN: s_add_i32
; Kernel under test: adds two <16 x i32> values that are passed directly as
; kernel arguments (no loads in the IR) and stores the result to %out.
; The check lines before this function expect one add per vector element (16).
114define amdgpu_kernel void @s_add_v16i32(<16 x i32> addrspace(1)* %out, <16 x i32> %a, <16 x i32> %b) {
115entry:
116  %0 = add <16 x i32> %a, %b
117  store <16 x i32> %0, <16 x i32> addrspace(1)* %out
118  ret void
119}
120
121; FUNC-LABEL: {{^}}v_add_i32:
122; GCN: {{buffer|flat|global}}_load_dword [[A:v[0-9]+]]
123; GCN: {{buffer|flat|global}}_load_dword [[B:v[0-9]+]]
124; SIVI: v_add_{{i|u}}32_e32 v{{[0-9]+}}, vcc, [[B]], [[A]]
125; GFX9: v_add_u32_e32 v{{[0-9]+}}, [[A]], [[B]]
; Kernel under test: indexes %in by the value returned from
; @llvm.r600.read.tidig.x, loads that element and the one after it, adds
; them, and stores the sum to %out.
; The check lines before this function expect a v_add_* instruction fed by
; two separate dword loads.
126define amdgpu_kernel void @v_add_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
127  %tid = call i32 @llvm.r600.read.tidig.x()
128  %gep = getelementptr inbounds i32, i32 addrspace(1)* %in, i32 %tid
129  %b_ptr = getelementptr i32, i32 addrspace(1)* %gep, i32 1 ; element following %gep
; NOTE(review): the loads are volatile, presumably so they stay as two
; distinct dword loads for the checks above to match -- confirm intent.
130  %a = load volatile i32, i32 addrspace(1)* %gep
131  %b = load volatile i32, i32 addrspace(1)* %b_ptr
132  %result = add i32 %a, %b
133  store i32 %result, i32 addrspace(1)* %out
134  ret void
135}
136
137; FUNC-LABEL: {{^}}v_add_imm_i32:
138; GCN: {{buffer|flat|global}}_load_dword [[A:v[0-9]+]]
139; SIVI: v_add_{{i|u}}32_e32 v{{[0-9]+}}, vcc, 0x7b, [[A]]
140; GFX9: v_add_u32_e32 v{{[0-9]+}}, 0x7b, [[A]]
; Kernel under test: indexes %in by the value returned from
; @llvm.r600.read.tidig.x, loads that single element (volatile), adds the
; constant 123 (0x7b in the check lines before this function), and stores
; the sum to %out.
; Only one value is loaded here, so no pointer to a second element is formed.
141define amdgpu_kernel void @v_add_imm_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
142  %tid = call i32 @llvm.r600.read.tidig.x()
143  %gep = getelementptr inbounds i32, i32 addrspace(1)* %in, i32 %tid
145  %a = load volatile i32, i32 addrspace(1)* %gep
146  %result = add i32 %a, 123
147  store i32 %result, i32 addrspace(1)* %out
148  ret void
149}
150
151; FUNC-LABEL: {{^}}add64:
152; GCN: s_add_u32
153; GCN: s_addc_u32
154
155; EG: MEM_RAT_CACHELESS STORE_RAW [[LO:T[0-9]+\.XY]]
156; EG-DAG: ADD_INT {{[* ]*}}
157; EG-DAG: ADDC_UINT
158; EG-DAG: ADD_INT
159; EG-DAG: ADD_INT {{[* ]*}}
160; EG-NOT: SUB
; Kernel under test: adds two i64 kernel arguments and stores the 64-bit sum
; to %out. The check lines before this function expect an add / add-with-carry
; pair on GCN targets.
161define amdgpu_kernel void @add64(i64 addrspace(1)* %out, i64 %a, i64 %b) {
162entry:
163  %add = add i64 %a, %b
164  store i64 %add, i64 addrspace(1)* %out
165  ret void
166}
167
168; The v_addc_u32 and v_add_i32 instructions can't read SGPRs, because they
169; use VCC.  The test is designed so that %a will be stored in an SGPR and
170; %0 will be stored in a VGPR, so the compiler will be forced to copy %a
171; to a VGPR before doing the add.
172
173; FUNC-LABEL: {{^}}add64_sgpr_vgpr:
174; GCN-NOT: v_addc_u32_e32 s
175
176; EG: MEM_RAT_CACHELESS STORE_RAW [[LO:T[0-9]+\.XY]]
177; EG-DAG: ADD_INT {{[* ]*}}
178; EG-DAG: ADDC_UINT
179; EG-DAG: ADD_INT
180; EG-DAG: ADD_INT {{[* ]*}}
181; EG-NOT: SUB
; Kernel under test: adds an i64 kernel argument (%a) to an i64 loaded from
; %in and stores the 64-bit sum to %out, so one operand comes from an
; argument and the other from memory.
182define amdgpu_kernel void @add64_sgpr_vgpr(i64 addrspace(1)* %out, i64 %a, i64 addrspace(1)* %in) {
183entry:
184  %0 = load i64, i64 addrspace(1)* %in
185  %1 = add i64 %a, %0
186  store i64 %1, i64 addrspace(1)* %out
187  ret void
188}
189
190; Test i64 add inside a branch.
191; FUNC-LABEL: {{^}}add64_in_branch:
192; GCN: s_add_u32
193; GCN: s_addc_u32
194
195; EG: MEM_RAT_CACHELESS STORE_RAW [[LO:T[0-9]+\.XY]]
196; EG-DAG: ADD_INT {{[* ]*}}
197; EG-DAG: ADDC_UINT
198; EG-DAG: ADD_INT
199; EG-DAG: ADD_INT {{[* ]*}}
200; EG-NOT: SUB
; Kernel under test: performs an i64 add on only one side of a conditional.
; If %a == 0 the value stored to %out is loaded from %in; otherwise it is
; %a + %b. The two paths are merged with a phi in %endif.
; NOTE(review): %c is not referenced anywhere in the body.
201define amdgpu_kernel void @add64_in_branch(i64 addrspace(1)* %out, i64 addrspace(1)* %in, i64 %a, i64 %b, i64 %c) {
202entry:
203  %0 = icmp eq i64 %a, 0
204  br i1 %0, label %if, label %else
205

206if:
; %a == 0: take the value from memory, no add on this path.
207  %1 = load i64, i64 addrspace(1)* %in
208  br label %endif
209

210else:
; %a != 0: this is the 64-bit add the test is about.
211  %2 = add i64 %a, %b
212  br label %endif
213

214endif:
215  %3 = phi i64 [%1, %if], [%2, %else]
216  store i64 %3, i64 addrspace(1)* %out
217  ret void
218}
219
; Intrinsic called by the v_add_* kernels in this file to obtain the index
; used to address %in.
220declare i32 @llvm.r600.read.tidig.x() #1
221

; Attribute group #0 is attached to several kernel definitions in this file;
; attribute group #1 is attached to the intrinsic declaration.
222attributes #0 = { nounwind }
223attributes #1 = { nounwind readnone speculatable }
224