; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 < %s | FileCheck -enable-var-scope -check-prefix=GCN %s

; Load argument depends on waitcnt which should be skipped.
define amdgpu_kernel void @call_memory_arg_load(i32 addrspace(3)* %ptr, i32) #0 {
; GCN-LABEL: call_memory_arg_load:
; GCN:       ; %bb.0:
; GCN-NEXT:    s_load_dword s4, s[4:5], 0x0
; GCN-NEXT:    s_add_u32 flat_scratch_lo, s6, s9
; GCN-NEXT:    s_addc_u32 flat_scratch_hi, s7, 0
; GCN-NEXT:    s_add_u32 s0, s0, s9
; GCN-NEXT:    s_addc_u32 s1, s1, 0
; GCN-NEXT:    s_waitcnt lgkmcnt(0)
; GCN-NEXT:    v_mov_b32_e32 v0, s4
; GCN-NEXT:    ds_read_b32 v0, v0
; GCN-NEXT:    s_getpc_b64 s[4:5]
; GCN-NEXT:    s_add_u32 s4, s4, func@rel32@lo+4
; GCN-NEXT:    s_addc_u32 s5, s5, func@rel32@hi+12
; GCN-NEXT:    s_mov_b32 s32, 0
; GCN-NEXT:    s_swappc_b64 s[30:31], s[4:5]
; GCN-NEXT:    s_endpgm
  %vgpr = load volatile i32, i32 addrspace(3)* %ptr
  call void @func(i32 %vgpr)
  ret void
}

; Memory waitcnt with no register dependence on the call
define amdgpu_kernel void @call_memory_no_dep(i32 addrspace(1)* %ptr, i32) #0 {
; GCN-LABEL: call_memory_no_dep:
; GCN:       ; %bb.0:
; GCN-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x0
; GCN-NEXT:    s_add_u32 flat_scratch_lo, s6, s9
; GCN-NEXT:    s_addc_u32 flat_scratch_hi, s7, 0
; GCN-NEXT:    s_add_u32 s0, s0, s9
; GCN-NEXT:    v_mov_b32_e32 v0, 0
; GCN-NEXT:    s_addc_u32 s1, s1, 0
; GCN-NEXT:    s_waitcnt lgkmcnt(0)
; GCN-NEXT:    global_store_dword v0, v0, s[4:5]
; GCN-NEXT:    v_mov_b32_e32 v0, 0
; GCN-NEXT:    s_getpc_b64 s[6:7]
; GCN-NEXT:    s_add_u32 s6, s6, func@rel32@lo+4
; GCN-NEXT:    s_addc_u32 s7, s7, func@rel32@hi+12
; GCN-NEXT:    s_mov_b32 s32, 0
; GCN-NEXT:    s_swappc_b64 s[30:31], s[6:7]
; GCN-NEXT:    s_endpgm
  store i32 0, i32 addrspace(1)* %ptr
  call void @func(i32 0)
  ret void
}

; Should not wait after the call before memory
define amdgpu_kernel void @call_no_wait_after_call(i32 addrspace(1)* %ptr, i32) #0 {
; GCN-LABEL: call_no_wait_after_call:
; GCN:       ; %bb.0:
; GCN-NEXT:    s_add_u32 flat_scratch_lo, s6, s9
; GCN-NEXT:    s_load_dwordx2 s[34:35], s[4:5], 0x0
; GCN-NEXT:    s_addc_u32 flat_scratch_hi, s7, 0
; GCN-NEXT:    s_add_u32 s0, s0, s9
; GCN-NEXT:    s_addc_u32 s1, s1, 0
; GCN-NEXT:    v_mov_b32_e32 v0, 0
; GCN-NEXT:    s_getpc_b64 s[4:5]
; GCN-NEXT:    s_add_u32 s4, s4, func@rel32@lo+4
; GCN-NEXT:    s_addc_u32 s5, s5, func@rel32@hi+12
; GCN-NEXT:    s_mov_b32 s32, 0
; GCN-NEXT:    v_mov_b32_e32 v40, 0
; GCN-NEXT:    s_swappc_b64 s[30:31], s[4:5]
; GCN-NEXT:    global_store_dword v40, v40, s[34:35]
; GCN-NEXT:    s_endpgm
  call void @func(i32 0)
  store i32 0, i32 addrspace(1)* %ptr
  ret void
}

; Same as above, but the store uses the call's return value (v0): still no
; wait expected between the call and the dependent store.
define amdgpu_kernel void @call_no_wait_after_call_return_val(i32 addrspace(1)* %ptr, i32) #0 {
; GCN-LABEL: call_no_wait_after_call_return_val:
; GCN:       ; %bb.0:
; GCN-NEXT:    s_add_u32 flat_scratch_lo, s6, s9
; GCN-NEXT:    s_load_dwordx2 s[34:35], s[4:5], 0x0
; GCN-NEXT:    s_addc_u32 flat_scratch_hi, s7, 0
; GCN-NEXT:    s_add_u32 s0, s0, s9
; GCN-NEXT:    s_addc_u32 s1, s1, 0
; GCN-NEXT:    v_mov_b32_e32 v0, 0
; GCN-NEXT:    s_getpc_b64 s[4:5]
; GCN-NEXT:    s_add_u32 s4, s4, func.return@rel32@lo+4
; GCN-NEXT:    s_addc_u32 s5, s5, func.return@rel32@hi+12
; GCN-NEXT:    s_mov_b32 s32, 0
; GCN-NEXT:    v_mov_b32_e32 v40, 0
; GCN-NEXT:    s_swappc_b64 s[30:31], s[4:5]
; GCN-NEXT:    global_store_dword v40, v0, s[34:35]
; GCN-NEXT:    s_endpgm
  %rv = call i32 @func.return(i32 0)
  store i32 %rv, i32 addrspace(1)* %ptr
  ret void
}

; Need to wait for the address dependency
define amdgpu_kernel void @call_got_load(i32 addrspace(1)* %ptr, i32) #0 {
; GCN-LABEL: call_got_load:
; GCN:       ; %bb.0:
; GCN-NEXT:    s_add_u32 flat_scratch_lo, s6, s9
; GCN-NEXT:    s_addc_u32 flat_scratch_hi, s7, 0
; GCN-NEXT:    s_add_u32 s0, s0, s9
; GCN-NEXT:    s_addc_u32 s1, s1, 0
; GCN-NEXT:    s_getpc_b64 s[4:5]
; GCN-NEXT:    s_add_u32 s4, s4, got.func@gotpcrel32@lo+4
; GCN-NEXT:    s_addc_u32 s5, s5, got.func@gotpcrel32@hi+12
; GCN-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x0
; GCN-NEXT:    v_mov_b32_e32 v0, 0
; GCN-NEXT:    s_mov_b32 s32, 0
; GCN-NEXT:    s_waitcnt lgkmcnt(0)
; GCN-NEXT:    s_swappc_b64 s[30:31], s[4:5]
; GCN-NEXT:    s_endpgm
  call void @got.func(i32 0)
  ret void
}

; Need to wait for the address dependency
define void @tailcall_got_load(i32 addrspace(1)* %ptr, i32) #0 {
; GCN-LABEL: tailcall_got_load:
; GCN:       ; %bb.0:
; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT:    s_getpc_b64 s[4:5]
; GCN-NEXT:    s_add_u32 s4, s4, got.func@gotpcrel32@lo+4
; GCN-NEXT:    s_addc_u32 s5, s5, got.func@gotpcrel32@hi+12
; GCN-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x0
; GCN-NEXT:    v_mov_b32_e32 v0, 0
; GCN-NEXT:    s_waitcnt lgkmcnt(0)
; GCN-NEXT:    s_setpc_b64 s[4:5]
  tail call void @got.func(i32 0)
  ret void
}

; No need to wait for the load.
define void @tail_call_memory_arg_load(i32 addrspace(3)* %ptr, i32) #0 {
; GCN-LABEL: tail_call_memory_arg_load:
; GCN:       ; %bb.0:
; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT:    ds_read_b32 v0, v0
; GCN-NEXT:    s_getpc_b64 s[4:5]
; GCN-NEXT:    s_add_u32 s4, s4, func@rel32@lo+4
; GCN-NEXT:    s_addc_u32 s5, s5, func@rel32@hi+12
; GCN-NEXT:    s_setpc_b64 s[4:5]
  %vgpr = load volatile i32, i32 addrspace(3)* %ptr
  tail call void @func(i32 %vgpr)
  ret void
}

; Callee declarations: `hidden` ones are reached via direct rel32 calls,
; the default-visibility one via a GOT load (see call_got_load above).
declare hidden void @func(i32) #0
declare hidden i32 @func.return(i32) #0
declare void @got.func(i32) #0

attributes #0 = { nounwind }