; Codegen test for sibling (tail) calls between fastcc functions on AMDGPU.
; The three llc invocations below cover fiji, hawaii and gfx900; each one pipes
; the generated assembly into FileCheck with the shared GCN prefix plus the
; prefixes specific to that target.
1; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=fiji -mattr=-flat-for-global -enable-ipra=0 -amdgpu-sroa=0 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,VI,CIVI,MESA %s
2; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=hawaii -enable-ipra=0 -amdgpu-sroa=0 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,CI,CIVI,MESA %s
3; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -mattr=-flat-for-global -enable-ipra=0 -amdgpu-sroa=0 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX9,MESA %s
; "A5" selects address space 5 as the alloca address space, matching the
; addrspace(5) allocas used by the functions in this file.
4target datalayout = "A5"
5
; Leaf fastcc callee that returns %arg0 + %arg1. It is the call target of
; several callers later in this file. The checks expect a single add of v0 and
; v1 (carry-out form on CI/VI, plain form on GFX9) immediately followed by the
; return jump.
6; FIXME: Why is this commuted only sometimes?
7; GCN-LABEL: {{^}}i32_fastcc_i32_i32:
8; GCN: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
9; CIVI-NEXT: v_add_{{i|u}}32_e32 v0, vcc, v0, v1
10; GFX9-NEXT: v_add_u32_e32 v0, v0, v1
11; GCN-NEXT: s_setpc_b64
12define fastcc i32 @i32_fastcc_i32_i32(i32 %arg0, i32 %arg1) #1 {
13  %add0 = add i32 %arg0, %arg1
14  ret i32 %add0
15}
16
; Leaf fastcc callee that also owns a stack object: a [16 x i32] alloca in
; addrspace(5). Element 5 of it (byte offset 5 * 4 = 20, the offset the store
; check expects relative to s32) receives a volatile store of 9, and the
; function then returns %arg0 + %arg1.
17; GCN-LABEL: {{^}}i32_fastcc_i32_i32_stack_object:
18; GCN: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
19; GCN-NEXT: v_mov_b32_e32 [[K:v[0-9]+]], 9
20; CIVI-NEXT: v_add_{{i|u}}32_e32 v0, vcc, v0, v1
21; GFX9-NEXT: v_add_u32_e32 v0, v0, v1
22; GCN: buffer_store_dword [[K]], off, s[0:3], s32 offset:20
23; GCN: s_waitcnt vmcnt(0)
24; GCN: s_setpc_b64
25; GCN: ; ScratchSize: 68
26define fastcc i32 @i32_fastcc_i32_i32_stack_object(i32 %arg0, i32 %arg1) #1 {
27  %alloca = alloca [16 x i32], align 4, addrspace(5)
28  %gep = getelementptr inbounds [16 x i32], [16 x i32] addrspace(5)* %alloca, i32 0, i32 5
29  store volatile i32 9, i32 addrspace(5)* %gep
30  %add0 = add i32 %arg0, %arg1
31  ret i32 %add0
32}
33
; Simplest sibling-call case: the caller has no stack objects and forwards two
; i32 arguments to @i32_fastcc_i32_i32 in tail position. The call must be
; emitted as a jump (s_setpc_b64) and never as a call-and-return
; (s_swappc_b64); without these checks a regression to a normal call would go
; unnoticed.
34; GCN-LABEL: {{^}}sibling_call_i32_fastcc_i32_i32:
; GCN-NOT: s_swappc_b64
; GCN: s_setpc_b64
35define hidden fastcc i32 @sibling_call_i32_fastcc_i32_i32(i32 %a, i32 %b, i32 %c) #1 {
36entry:
37  %ret = tail call fastcc i32 @i32_fastcc_i32_i32(i32 %a, i32 %b)
38  ret i32 %ret
39}
40
; Sibling call from a caller that owns a stack object. The caller stores 9 to
; element 5 (byte offset 20) of its own [16 x i32] alloca and then tail calls
; @i32_fastcc_i32_i32. The checks expect the store to address the object
; relative to s32, and pin the reported scratch size.
41; GCN-LABEL: {{^}}sibling_call_i32_fastcc_i32_i32_stack_object:
42; GCN: v_mov_b32_e32 [[NINE:v[0-9]+]], 9
43; GCN: buffer_store_dword [[NINE]], off, s[0:3], s32 offset:20
44; GCN: s_setpc_b64
45; GCN: ; ScratchSize: 68
46define fastcc i32 @sibling_call_i32_fastcc_i32_i32_stack_object(i32 %a, i32 %b, i32 %c) #1 {
47entry:
48  %alloca = alloca [16 x i32], align 4, addrspace(5)
49  %gep = getelementptr inbounds [16 x i32], [16 x i32] addrspace(5)* %alloca, i32 0, i32 5
50  store volatile i32 9, i32 addrspace(5)* %gep
51  %ret = tail call fastcc i32 @i32_fastcc_i32_i32(i32 %a, i32 %b)
52  ret i32 %ret
53}
54
; Sibling call where both the caller and the callee
; (@i32_fastcc_i32_i32_stack_object) own a [16 x i32] stack object.
; NOTE(review): the expected scratch size of 136 is twice the 68 expected for
; each function alone, which presumably reflects caller + callee usage being
; summed; confirm against the backend's resource accounting.
55; GCN-LABEL: {{^}}sibling_call_i32_fastcc_i32_i32_callee_stack_object:
56; GCN: v_mov_b32_e32 [[NINE:v[0-9]+]], 9
57; GCN: buffer_store_dword [[NINE]], off, s[0:3], s32 offset:20
58; GCN: s_setpc_b64
59; GCN: ; ScratchSize: 136
60define fastcc i32 @sibling_call_i32_fastcc_i32_i32_callee_stack_object(i32 %a, i32 %b, i32 %c) #1 {
61entry:
62  %alloca = alloca [16 x i32], align 4, addrspace(5)
63  %gep = getelementptr inbounds [16 x i32], [16 x i32] addrspace(5)* %alloca, i32 0, i32 5
64  store volatile i32 9, i32 addrspace(5)* %gep
65  %ret = tail call fastcc i32 @i32_fastcc_i32_i32_stack_object(i32 %a, i32 %b)
66  ret i32 %ret
67}
68
; Sibling call whose i32 result is discarded: the caller returns void while
; the callee returns i32. The call is still in tail position, so it is
; expected to be emitted as a jump (s_setpc_b64) rather than a call-and-return
; (s_swappc_b64).
69; GCN-LABEL: {{^}}sibling_call_i32_fastcc_i32_i32_unused_result:
; GCN-NOT: s_swappc_b64
; GCN: s_setpc_b64
70define fastcc void @sibling_call_i32_fastcc_i32_i32_unused_result(i32 %a, i32 %b, i32 %c) #1 {
71entry:
72  %ret = tail call fastcc i32 @i32_fastcc_i32_i32(i32 %a, i32 %b)
73  ret void
74}
75
76; It doesn't make sense to do a tail call from a kernel, so even though the
; IR marks the call `tail` it must be lowered as a regular call
; (s_swappc_b64) and the kernel must still terminate itself with s_endpgm.
77; GCN-LABEL: {{^}}kernel_call_i32_fastcc_i32_i32_unused_result:
; GCN: s_swappc_b64
; GCN: s_endpgm
79define amdgpu_kernel void @kernel_call_i32_fastcc_i32_i32_unused_result(i32 %a, i32 %b, i32 %c) #1 {
80entry:
81  %ret = tail call fastcc i32 @i32_fastcc_i32_i32(i32 %a, i32 %b)
82  ret void
83}
84
; Leaf fastcc callee taking one i32 by value in a register and one i32 through
; a byval pointer in addrspace(5). It loads the byval value and returns
; %arg0 + that value. The checks expect the byval argument to be read from the
; stack at s32 with no offset.
85; GCN-LABEL: {{^}}i32_fastcc_i32_byval_i32:
86; GCN: s_waitcnt
87; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32{{$}}
88; GCN-NEXT: s_waitcnt vmcnt(0)
89
90; CIVI-NEXT: v_add_{{i|u}}32_e32 v0, vcc, v0, v1
91; GFX9-NEXT: v_add_u32_e32 v0, v0, v1
92
93; GCN-NEXT: s_setpc_b64 s[30:31]
94define hidden fastcc i32 @i32_fastcc_i32_byval_i32(i32 %arg0, i32 addrspace(5)* byval(i32) align 4 %arg1) #1 {
95  %arg1.load = load i32, i32 addrspace(5)* %arg1, align 4
96  %add0 = add i32 %arg0, %arg1.load
97  ret i32 %add0
98}
99
100; Tail call disallowed with byval in parent.
; The caller itself receives a byval argument and forwards it, so the checks
; expect a real call (s_swappc_b64) followed by an ordinary return
; (s_setpc_b64), with no v_writelane/v_readlane of s32 around the call.
101; GCN-LABEL: {{^}}sibling_call_i32_fastcc_i32_byval_i32_byval_parent:
102; GCN-NOT: v_writelane_b32 v{{[0-9]+}}, s32
103; GCN: buffer_store_dword v{{[0-9]+}}, off, s[0:3], s32{{$}}
104; GCN: s_swappc_b64
105; GCN-NOT: v_readlane_b32 s32
106; GCN: s_setpc_b64
107define fastcc i32 @sibling_call_i32_fastcc_i32_byval_i32_byval_parent(i32 %a, i32 addrspace(5)* byval(i32) %b.byval, i32 %c) #1 {
108entry:
109  %ret = tail call fastcc i32 @i32_fastcc_i32_byval_i32(i32 %a, i32 addrspace(5)* byval(i32) %b.byval)
110  ret i32 %ret
111}
112
113; Tail calls are disallowed when the parent has a byval argument, but not
114; when only the callee does. The stack space needed for the outgoing
115; (callee) arguments must be <= the stack space of the incoming (caller)
; arguments; here the caller's [32 x i32] argument provides a larger incoming
; area than the callee's single byval i32 needs. The byval source is the
; constant address 16, which the checks expect to be loaded and then stored
; to the outgoing slot at s32 immediately before the jump.
116
117; GCN-LABEL: {{^}}sibling_call_i32_fastcc_i32_byval_i32:
118; GCN-NOT: v0
119; GCN-NOT: s32
120; GCN: buffer_load_dword v1, off, s[0:3], 0 offset:16
121; GCN: buffer_store_dword v1, off, s[0:3], s32{{$}}
122; GCN-NEXT: s_setpc_b64
123define fastcc i32 @sibling_call_i32_fastcc_i32_byval_i32(i32 %a, [32 x i32] %large) #1 {
124entry:
125  %ret = tail call fastcc i32 @i32_fastcc_i32_byval_i32(i32 %a, i32 addrspace(5)* byval(i32) inttoptr (i32 16 to i32 addrspace(5)*))
126  ret i32 %ret
127}
128
; Leaf fastcc callee with two i32 arguments plus a [32 x i32] aggregate. It
; returns %arg0 + %arg1 + %large[30] + %large[31]. The checks expect two loads
; from the stack at s32 and s32 offset:4; on CI/VI these feed the second and
; third adds, which suggests elements 30 and 31 are the ones passed on the
; stack (as the value names also indicate).
129; GCN-LABEL: {{^}}i32_fastcc_i32_i32_a32i32:
130; GCN: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
131; GCN-DAG: buffer_load_dword [[LOAD_0:v[0-9]+]], off, s[0:3], s32{{$}}
132; GCN-DAG: buffer_load_dword [[LOAD_1:v[0-9]+]], off, s[0:3], s32 offset:4
133
134; CIVI-NEXT: v_add_{{i|u}}32_e32 v0, vcc, v0, v1
135; CIVI: v_add_{{i|u}}32_e32 v0, vcc, v0, [[LOAD_0]]
136; CIVI: v_add_{{i|u}}32_e32 v0, vcc, v0, [[LOAD_1]]
137
138
139; GFX9-NEXT: v_add_u32_e32 v0, v0, v1
140; GFX9: v_add3_u32 v0, v0, v3, v2
141
142; GCN-NEXT: s_setpc_b64
143define fastcc i32 @i32_fastcc_i32_i32_a32i32(i32 %arg0, i32 %arg1, [32 x i32] %large) #1 {
144  %val_firststack = extractvalue [32 x i32] %large, 30
145  %val_laststack = extractvalue [32 x i32] %large, 31
146  %add0 = add i32 %arg0, %arg1
147  %add1 = add i32 %add0, %val_firststack
148  %add2 = add i32 %add1, %val_laststack
149  ret i32 %add2
150}
151
; Sibling call that forwards its own arguments unchanged, including the
; [32 x i32] aggregate, to @i32_fastcc_i32_i32_a32i32. The checks expect the
; two stack-passed values to be loaded from s32 / s32 offset:4 and stored back
; to those same slots, with no other use of s32 before the jump.
152; FIXME: Why load and store same location for stack args?
153; GCN-LABEL: {{^}}sibling_call_i32_fastcc_i32_i32_a32i32:
154
155; GCN-DAG: buffer_load_dword [[LOAD_0:v[0-9]+]], off, s[0:3], s32{{$}}
156; GCN-DAG: buffer_load_dword [[LOAD_1:v[0-9]+]], off, s[0:3], s32 offset:4
157
158; GCN-NOT: s32
159
160; GCN-DAG: buffer_store_dword [[LOAD_0]], off, s[0:3], s32{{$}}
161; GCN-DAG: buffer_store_dword [[LOAD_1]], off, s[0:3], s32 offset:4
162
163; GCN-NOT: s32
164; GCN: s_setpc_b64
165define fastcc i32 @sibling_call_i32_fastcc_i32_i32_a32i32(i32 %a, i32 %b, [32 x i32] %c) #1 {
166entry:
167  %ret = tail call fastcc i32 @i32_fastcc_i32_i32_a32i32(i32 %a, i32 %b, [32 x i32] %c)
168  ret i32 %ret
169}
170
; Sibling call with stack-passed arguments from a caller that also owns a
; [16 x i32] stack object. The store of 9 to element 5 is expected at s32
; offset:28.
; NOTE(review): 28 is consistent with 8 bytes of incoming stack arguments
; followed by byte offset 20 inside the alloca; confirm the frame layout.
171; GCN-LABEL: {{^}}sibling_call_i32_fastcc_i32_i32_a32i32_stack_object:
172; GCN-DAG: v_mov_b32_e32 [[NINE:v[0-9]+]], 9
173; GCN: buffer_store_dword [[NINE]], off, s[0:3], s32 offset:28
174; GCN: s_setpc_b64
175define fastcc i32 @sibling_call_i32_fastcc_i32_i32_a32i32_stack_object(i32 %a, i32 %b, [32 x i32] %c) #1 {
176entry:
177  %alloca = alloca [16 x i32], align 4, addrspace(5)
178  %gep = getelementptr inbounds [16 x i32], [16 x i32] addrspace(5)* %alloca, i32 0, i32 5
179  store volatile i32 9, i32 addrspace(5)* %gep
180  %ret = tail call fastcc i32 @i32_fastcc_i32_i32_a32i32(i32 %a, i32 %b, [32 x i32] %c)
181  ret i32 %ret
182}
183
184; If the callee requires more stack argument space than the caller,
185; don't do a tail call.
186; TODO: Do we really need this restriction?
; The caller below takes only two i32 arguments but passes a zero-initialized
; [32 x i32] to the callee, so the checks expect a real call (s_swappc_b64)
; followed by an ordinary return.
187
188; GCN-LABEL: {{^}}no_sibling_call_callee_more_stack_space:
189; GCN: s_swappc_b64
190; GCN: s_setpc_b64
191define fastcc i32 @no_sibling_call_callee_more_stack_space(i32 %a, i32 %b) #1 {
192entry:
193  %ret = tail call fastcc i32 @i32_fastcc_i32_i32_a32i32(i32 %a, i32 %b, [32 x i32] zeroinitializer)
194  ret i32 %ret
195}
196
197; Have another non-tail call in the function
; The first call's result is consumed by the second call, so only the second
; one is in tail position. The checks expect, in order: a prologue that spills
; v42 under a full exec mask, sets up s33 and bumps s32 by 0x400; a real call
; (s_swappc_b64) to @i32_fastcc_i32_i32; reloads of v40/v41; and an epilogue
; that restores s32/s33/v42 before jumping to @sibling_call_i32_fastcc_i32_i32
; through s[4:5].
198; GCN-LABEL: {{^}}sibling_call_i32_fastcc_i32_i32_other_call:
199; GCN: s_or_saveexec_b64 s{{\[[0-9]+:[0-9]+\]}}, -1
200; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
201; GCN-NEXT: s_mov_b64 exec
202; GCN: s_mov_b32 s33, s32
203; GCN-DAG: s_add_u32 s32, s32, 0x400
204
205; GCN-DAG: buffer_store_dword v40, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
206; GCN-DAG: buffer_store_dword v41, off, s[0:3], s33 ; 4-byte Folded Spill
207; GCN-DAG: v_writelane_b32 v42, s34, 0
208; GCN-DAG: v_writelane_b32 v42, s35, 1
209
210; GCN-DAG: s_getpc_b64 s[4:5]
211; GCN-DAG: s_add_u32 s4, s4, i32_fastcc_i32_i32@gotpcrel32@lo+4
212; GCN-DAG: s_addc_u32 s5, s5, i32_fastcc_i32_i32@gotpcrel32@hi+12
213
214
215; GCN: s_swappc_b64
216
217; GCN-DAG: buffer_load_dword v41, off, s[0:3], s33 ; 4-byte Folded Reload
218; GCN-DAG: buffer_load_dword v40, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload
219
220; GCN: s_getpc_b64 s[4:5]
221; GCN-NEXT: s_add_u32 s4, s4, sibling_call_i32_fastcc_i32_i32@rel32@lo+4
222; GCN-NEXT: s_addc_u32 s5, s5, sibling_call_i32_fastcc_i32_i32@rel32@hi+12
223
224; GCN-DAG: v_readlane_b32 s34, v42, 0
225; GCN-DAG: v_readlane_b32 s35, v42, 1
226
227; GCN: s_sub_u32 s32, s32, 0x400
228; GCN-NEXT: v_readlane_b32 s33,
229; GCN-NEXT: s_or_saveexec_b64 s[6:7], -1
230; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload
231; GCN-NEXT: s_mov_b64 exec, s[6:7]
232; GCN-NEXT: s_setpc_b64 s[4:5]
233define fastcc i32 @sibling_call_i32_fastcc_i32_i32_other_call(i32 %a, i32 %b, i32 %c) #1 {
234entry:
235  %other.call = tail call fastcc i32 @i32_fastcc_i32_i32(i32 %a, i32 %b)
236  %ret = tail call fastcc i32 @sibling_call_i32_fastcc_i32_i32(i32 %a, i32 %b, i32 %other.call)
237  ret i32 %ret
238}
239
240; Have stack object in caller and stack passed arguments. SP should be
241; in same place at function exit.
; The checks require that s33 is never mentioned and that both the load of the
; incoming stack arguments and the store of the outgoing ones address memory
; relative to s32, before the jump through s[4:5].
242
243; GCN-LABEL: {{^}}sibling_call_stack_objecti32_fastcc_i32_i32_a32i32:
244; GCN-NOT: s33
245; GCN: buffer_load_dword v{{[0-9]+}}, off, s[0:3], s32 offset:
246
247; GCN-NOT: s33
248
249; GCN: buffer_store_dword v{{[0-9]+}}, off, s[0:3], s32 offset:
250; GCN: s_setpc_b64 s[4:5]
251define fastcc i32 @sibling_call_stack_objecti32_fastcc_i32_i32_a32i32(i32 %a, i32 %b, [32 x i32] %c) #1 {
252entry:
253  %alloca = alloca [16 x i32], align 4, addrspace(5)
254  %gep = getelementptr inbounds [16 x i32], [16 x i32] addrspace(5)* %alloca, i32 0, i32 5
255  store volatile i32 9, i32 addrspace(5)* %gep
256  %ret = tail call fastcc i32 @i32_fastcc_i32_i32_a32i32(i32 %a, i32 %b, [32 x i32] zeroinitializer)
257  ret i32 %ret
258}
259
; Variant where the caller's incoming aggregate ([36 x i32]) is larger than
; the [32 x i32] it passes on to the callee, and the caller also owns a
; [16 x i32] stack object. The checks require that s33 is never mentioned and
; that a store lands at s32 offset:44 before the jump through s[4:5].
; NOTE(review): 44 is consistent with 24 bytes of incoming stack arguments
; followed by byte offset 20 inside the alloca; confirm the frame layout.
260; GCN-LABEL: {{^}}sibling_call_stack_objecti32_fastcc_i32_i32_a32i32_larger_arg_area:
261; GCN-NOT: s33
262; GCN: buffer_store_dword v{{[0-9]+}}, off, s[0:3], s32 offset:44
263
264; GCN-NOT: s33
265; GCN: s_setpc_b64 s[4:5]
266define fastcc i32 @sibling_call_stack_objecti32_fastcc_i32_i32_a32i32_larger_arg_area(i32 %a, i32 %b, [36 x i32] %c) #1 {
267entry:
268  %alloca = alloca [16 x i32], align 4, addrspace(5)
269  %gep = getelementptr inbounds [16 x i32], [16 x i32] addrspace(5)* %alloca, i32 0, i32 5
270  store volatile i32 9, i32 addrspace(5)* %gep
271  %ret = tail call fastcc i32 @i32_fastcc_i32_i32_a32i32(i32 %a, i32 %b, [32 x i32] zeroinitializer)
272  ret i32 %ret
273}
274
; Attribute groups. Every function visible in this file uses #1; #0 is not
; referenced by any of them. `noinline` keeps the callees as separate
; functions so the calls under test are not removed by inlining.
275attributes #0 = { nounwind }
276attributes #1 = { nounwind noinline }
277