1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2; RUN: llc -global-isel -amdgpu-load-store-vectorizer=0 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,DEFAULTSIZE %s
3; RUN: llc -global-isel -amdgpu-load-store-vectorizer=0 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -verify-machineinstrs -amdgpu-assume-dynamic-stack-object-size=1024 < %s | FileCheck -check-prefixes=GCN,ASSUME1024 %s
4
5; FIXME: Generated test checks do not check metadata at the end of the
6; function, so this also includes manually added checks.
7
8; Test that we can select a statically sized alloca outside of the
9; entry block.
10
11; FIXME: FunctionLoweringInfo unhelpfully doesn't preserve an
12; alignment less than the stack alignment.
; Kernel variant, alloca align 4: the [16 x i32] alloca lives in bb.0 (not the
; entry block) and is only reached when %arg.cond0 == 0; it is then used from
; bb.1 (see the "outside of the defining block" store/load below). The scratch
; cost shows up in the DEFAULTSIZE/ASSUME1024 checks after this function.
13define amdgpu_kernel void @kernel_non_entry_block_static_alloca_uniformly_reached_align4(i32 addrspace(1)* %out, i32 %arg.cond0, i32 %arg.cond1, i32 %in) {
14; GCN-LABEL: kernel_non_entry_block_static_alloca_uniformly_reached_align4:
15; GCN:       ; %bb.0: ; %entry
16; GCN-NEXT:    s_add_u32 flat_scratch_lo, s6, s9
17; GCN-NEXT:    s_load_dword s6, s[4:5], 0x8
18; GCN-NEXT:    s_addc_u32 flat_scratch_hi, s7, 0
19; GCN-NEXT:    s_add_u32 s0, s0, s9
20; GCN-NEXT:    s_addc_u32 s1, s1, 0
21; GCN-NEXT:    s_movk_i32 s32, 0x400
22; GCN-NEXT:    s_waitcnt lgkmcnt(0)
23; GCN-NEXT:    s_cmp_lg_u32 s6, 0
24; GCN-NEXT:    s_cselect_b32 s6, 1, 0
25; GCN-NEXT:    s_and_b32 s6, s6, 1
26; GCN-NEXT:    s_cmp_lg_u32 s6, 0
27; GCN-NEXT:    s_mov_b32 s33, 0
28; GCN-NEXT:    s_cbranch_scc1 BB0_3
29; GCN-NEXT:  ; %bb.1: ; %bb.0
30; GCN-NEXT:    s_load_dword s6, s[4:5], 0xc
31; GCN-NEXT:    s_waitcnt lgkmcnt(0)
32; GCN-NEXT:    s_cmp_lg_u32 s6, 0
33; GCN-NEXT:    s_cselect_b32 s6, 1, 0
34; GCN-NEXT:    s_and_b32 s6, s6, 1
35; GCN-NEXT:    s_cmp_lg_u32 s6, 0
36; GCN-NEXT:    s_cbranch_scc1 BB0_3
37; GCN-NEXT:  ; %bb.2: ; %bb.1
38; GCN-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
39; GCN-NEXT:    s_load_dword s4, s[4:5], 0x10
40; GCN-NEXT:    s_add_u32 s5, s32, 0x1000
41; GCN-NEXT:    s_add_u32 s8, s5, 4
42; GCN-NEXT:    v_mov_b32_e32 v1, 0
43; GCN-NEXT:    v_mov_b32_e32 v2, s5
44; GCN-NEXT:    s_waitcnt lgkmcnt(0)
45; GCN-NEXT:    s_lshl_b32 s4, s4, 2
46; GCN-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen
47; GCN-NEXT:    v_mov_b32_e32 v2, 1
48; GCN-NEXT:    v_mov_b32_e32 v3, s8
49; GCN-NEXT:    s_add_u32 s4, s5, s4
50; GCN-NEXT:    buffer_store_dword v2, v3, s[0:3], 0 offen
51; GCN-NEXT:    v_mov_b32_e32 v2, s4
52; GCN-NEXT:    buffer_load_dword v2, v2, s[0:3], 0 offen
53; GCN-NEXT:    s_waitcnt vmcnt(0)
54; GCN-NEXT:    v_add_u32_e32 v0, v2, v0
55; GCN-NEXT:    global_store_dword v1, v0, s[6:7]
56; GCN-NEXT:  BB0_3: ; %bb.2
57; GCN-NEXT:    v_mov_b32_e32 v0, 0
58; GCN-NEXT:    global_store_dword v[0:1], v0, off
59; GCN-NEXT:    s_endpgm
60
; Branch on a kernel SGPR argument; per the function name this is expected to
; be uniformly reached (all lanes take the same path).
61entry:
62  %cond0 = icmp eq i32 %arg.cond0, 0
63  br i1 %cond0, label %bb.0, label %bb.2
64
; Statically sized alloca deliberately placed OUTSIDE the entry block — the
; point of the test (see the file header comment).
65bb.0:
66  %alloca = alloca [16 x i32], align 4, addrspace(5)
67  %gep0 = getelementptr [16 x i32], [16 x i32] addrspace(5)* %alloca, i32 0, i32 0
68  %gep1 = getelementptr [16 x i32], [16 x i32] addrspace(5)* %alloca, i32 0, i32 1
69  %cond1 = icmp eq i32 %arg.cond1, 0
70  br i1 %cond1, label %bb.1, label %bb.2
71
72bb.1:
73  ; Use the alloca outside of the defining block.
74  store i32 0, i32 addrspace(5)* %gep0
75  store i32 1, i32 addrspace(5)* %gep1
76  %gep2 = getelementptr [16 x i32], [16 x i32] addrspace(5)* %alloca, i32 0, i32 %in
77  %load = load i32, i32 addrspace(5)* %gep2
78  %tid = call i32 @llvm.amdgcn.workitem.id.x()
79  %add = add i32 %load, %tid
80  store i32 %add, i32 addrspace(1)* %out
81  br label %bb.2
82
; Common exit block; the volatile store keeps it live regardless of the path.
83bb.2:
84  store volatile i32 0, i32 addrspace(1)* undef
85  ret void
86}
87; DEFAULTSIZE: .amdhsa_private_segment_fixed_size 4112
88; DEFAULTSIZE: ; ScratchSize: 4112
89
90; ASSUME1024: .amdhsa_private_segment_fixed_size 1040
91; ASSUME1024: ; ScratchSize: 1040
92
; Same non-entry-block alloca kernel test as above, but with align 64: the
; over-aligned alloca requires the selected code to round the scratch offset
; up (note the s_and_b32 ... 0xfffff000 mask in the checks), and the larger
; fixed scratch size is asserted in the checks following this function.
93define amdgpu_kernel void @kernel_non_entry_block_static_alloca_uniformly_reached_align64(i32 addrspace(1)* %out, i32 %arg.cond, i32 %in) {
94; GCN-LABEL: kernel_non_entry_block_static_alloca_uniformly_reached_align64:
95; GCN:       ; %bb.0: ; %entry
96; GCN-NEXT:    s_add_u32 flat_scratch_lo, s6, s9
97; GCN-NEXT:    s_load_dword s6, s[4:5], 0x8
98; GCN-NEXT:    s_addc_u32 flat_scratch_hi, s7, 0
99; GCN-NEXT:    s_add_u32 s0, s0, s9
100; GCN-NEXT:    s_addc_u32 s1, s1, 0
101; GCN-NEXT:    s_movk_i32 s32, 0x1000
102; GCN-NEXT:    s_waitcnt lgkmcnt(0)
103; GCN-NEXT:    s_cmp_lg_u32 s6, 0
104; GCN-NEXT:    s_cselect_b32 s6, 1, 0
105; GCN-NEXT:    s_and_b32 s6, s6, 1
106; GCN-NEXT:    s_cmp_lg_u32 s6, 0
107; GCN-NEXT:    s_mov_b32 s33, 0
108; GCN-NEXT:    s_cbranch_scc1 BB1_2
109; GCN-NEXT:  ; %bb.1: ; %bb.0
110; GCN-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
111; GCN-NEXT:    s_load_dword s4, s[4:5], 0xc
112; GCN-NEXT:    s_add_u32 s5, s32, 0x1000
113; GCN-NEXT:    s_and_b32 s5, s5, 0xfffff000
114; GCN-NEXT:    s_add_u32 s8, s5, 4
115; GCN-NEXT:    v_mov_b32_e32 v1, 0
116; GCN-NEXT:    s_waitcnt lgkmcnt(0)
117; GCN-NEXT:    s_lshl_b32 s4, s4, 2
118; GCN-NEXT:    v_mov_b32_e32 v2, s5
119; GCN-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen
120; GCN-NEXT:    v_mov_b32_e32 v2, 1
121; GCN-NEXT:    v_mov_b32_e32 v3, s8
122; GCN-NEXT:    s_add_u32 s4, s5, s4
123; GCN-NEXT:    buffer_store_dword v2, v3, s[0:3], 0 offen
124; GCN-NEXT:    v_mov_b32_e32 v2, s4
125; GCN-NEXT:    buffer_load_dword v2, v2, s[0:3], 0 offen
126; GCN-NEXT:    s_waitcnt vmcnt(0)
127; GCN-NEXT:    v_add_u32_e32 v0, v2, v0
128; GCN-NEXT:    global_store_dword v1, v0, s[6:7]
129; GCN-NEXT:  BB1_2: ; %bb.1
130; GCN-NEXT:    v_mov_b32_e32 v0, 0
131; GCN-NEXT:    global_store_dword v[0:1], v0, off
132; GCN-NEXT:    s_endpgm
133entry:
134  %cond = icmp eq i32 %arg.cond, 0
135  br i1 %cond, label %bb.0, label %bb.1
136
; Over-aligned (align 64) static alloca outside the entry block; two fixed
; stores plus a dynamically indexed (%in) load exercise its address.
137bb.0:
138  %alloca = alloca [16 x i32], align 64, addrspace(5)
139  %gep0 = getelementptr [16 x i32], [16 x i32] addrspace(5)* %alloca, i32 0, i32 0
140  %gep1 = getelementptr [16 x i32], [16 x i32] addrspace(5)* %alloca, i32 0, i32 1
141  store i32 0, i32 addrspace(5)* %gep0
142  store i32 1, i32 addrspace(5)* %gep1
143  %gep2 = getelementptr [16 x i32], [16 x i32] addrspace(5)* %alloca, i32 0, i32 %in
144  %load = load i32, i32 addrspace(5)* %gep2
145  %tid = call i32 @llvm.amdgcn.workitem.id.x()
146  %add = add i32 %load, %tid
147  store i32 %add, i32 addrspace(1)* %out
148  br label %bb.1
149
; Common exit block; the volatile store keeps it live regardless of the path.
150bb.1:
151  store volatile i32 0, i32 addrspace(1)* undef
152  ret void
153}
154
155; DEFAULTSIZE: .amdhsa_private_segment_fixed_size 4160
156; DEFAULTSIZE: ; ScratchSize: 4160
157
158; ASSUME1024: .amdhsa_private_segment_fixed_size 1088
159; ASSUME1024: ; ScratchSize: 1088
160
161
; Callable-function (non-kernel) variant of the align-4 test: the conditions
; arrive in VGPRs rather than SGPRs, so the checked code branches via exec-mask
; manipulation (s_and_saveexec_b64 / s_cbranch_execz) instead of scalar
; branches, and the frame is set up/torn down around a callee stack (s32/s33).
162define void @func_non_entry_block_static_alloca_align4(i32 addrspace(1)* %out, i32 %arg.cond0, i32 %arg.cond1, i32 %in) {
163; GCN-LABEL: func_non_entry_block_static_alloca_align4:
164; GCN:       ; %bb.0: ; %entry
165; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
166; GCN-NEXT:    s_mov_b32 s8, s33
167; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v2
168; GCN-NEXT:    s_mov_b32 s33, s32
169; GCN-NEXT:    s_add_u32 s32, s32, 0x400
170; GCN-NEXT:    s_and_saveexec_b64 s[4:5], vcc
171; GCN-NEXT:    s_cbranch_execz BB2_3
172; GCN-NEXT:  ; %bb.1: ; %bb.0
173; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v3
174; GCN-NEXT:    s_and_b64 exec, exec, vcc
175; GCN-NEXT:    s_cbranch_execz BB2_3
176; GCN-NEXT:  ; %bb.2: ; %bb.1
177; GCN-NEXT:    s_add_u32 s6, s32, 0x1000
178; GCN-NEXT:    v_mov_b32_e32 v2, 0
179; GCN-NEXT:    v_mov_b32_e32 v3, s6
180; GCN-NEXT:    s_add_u32 s7, s6, 4
181; GCN-NEXT:    buffer_store_dword v2, v3, s[0:3], 0 offen
182; GCN-NEXT:    v_mov_b32_e32 v2, 1
183; GCN-NEXT:    v_mov_b32_e32 v3, s7
184; GCN-NEXT:    buffer_store_dword v2, v3, s[0:3], 0 offen
185; GCN-NEXT:    v_lshlrev_b32_e32 v2, 2, v4
186; GCN-NEXT:    v_add_u32_e32 v2, s6, v2
187; GCN-NEXT:    buffer_load_dword v2, v2, s[0:3], 0 offen
188; GCN-NEXT:    v_and_b32_e32 v3, 0x3ff, v5
189; GCN-NEXT:    s_waitcnt vmcnt(0)
190; GCN-NEXT:    v_add_u32_e32 v2, v2, v3
191; GCN-NEXT:    global_store_dword v[0:1], v2, off
192; GCN-NEXT:  BB2_3: ; %bb.2
193; GCN-NEXT:    s_or_b64 exec, exec, s[4:5]
194; GCN-NEXT:    v_mov_b32_e32 v0, 0
195; GCN-NEXT:    global_store_dword v[0:1], v0, off
196; GCN-NEXT:    s_sub_u32 s32, s32, 0x400
197; GCN-NEXT:    s_mov_b32 s33, s8
198; GCN-NEXT:    s_waitcnt vmcnt(0)
199; GCN-NEXT:    s_setpc_b64 s[30:31]
200
; IR mirrors the align-4 kernel above: alloca in bb.0 (non-entry), used in bb.1.
201entry:
202  %cond0 = icmp eq i32 %arg.cond0, 0
203  br i1 %cond0, label %bb.0, label %bb.2
204
205bb.0:
206  %alloca = alloca [16 x i32], align 4, addrspace(5)
207  %gep0 = getelementptr [16 x i32], [16 x i32] addrspace(5)* %alloca, i32 0, i32 0
208  %gep1 = getelementptr [16 x i32], [16 x i32] addrspace(5)* %alloca, i32 0, i32 1
209  %cond1 = icmp eq i32 %arg.cond1, 0
210  br i1 %cond1, label %bb.1, label %bb.2
211
212bb.1:
213  ; Use the alloca outside of the defining block.
214  store i32 0, i32 addrspace(5)* %gep0
215  store i32 1, i32 addrspace(5)* %gep1
216  %gep2 = getelementptr [16 x i32], [16 x i32] addrspace(5)* %alloca, i32 0, i32 %in
217  %load = load i32, i32 addrspace(5)* %gep2
218  %tid = call i32 @llvm.amdgcn.workitem.id.x()
219  %add = add i32 %load, %tid
220  store i32 %add, i32 addrspace(1)* %out
221  br label %bb.2
222
; Common exit block; the volatile store keeps it live regardless of the path.
223bb.2:
224  store volatile i32 0, i32 addrspace(1)* undef
225  ret void
226}
227
; Callable-function variant with an over-aligned (align 64) non-entry-block
; alloca: the checks show both the incoming frame pointer and the alloca
; address being masked to the required alignment (s_and_b32 ... 0xfffff000),
; with a correspondingly larger stack adjustment (0x2000).
228define void @func_non_entry_block_static_alloca_align64(i32 addrspace(1)* %out, i32 %arg.cond, i32 %in) {
229; GCN-LABEL: func_non_entry_block_static_alloca_align64:
230; GCN:       ; %bb.0: ; %entry
231; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
232; GCN-NEXT:    s_add_u32 s4, s32, 0xfc0
233; GCN-NEXT:    s_mov_b32 s8, s33
234; GCN-NEXT:    s_and_b32 s33, s4, 0xfffff000
235; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v2
236; GCN-NEXT:    s_add_u32 s32, s32, 0x2000
237; GCN-NEXT:    s_and_saveexec_b64 s[4:5], vcc
238; GCN-NEXT:    s_cbranch_execz BB3_2
239; GCN-NEXT:  ; %bb.1: ; %bb.0
240; GCN-NEXT:    s_add_u32 s6, s32, 0x1000
241; GCN-NEXT:    s_and_b32 s6, s6, 0xfffff000
242; GCN-NEXT:    s_add_u32 s7, s6, 4
243; GCN-NEXT:    v_mov_b32_e32 v2, 0
244; GCN-NEXT:    v_mov_b32_e32 v5, s6
245; GCN-NEXT:    buffer_store_dword v2, v5, s[0:3], 0 offen
246; GCN-NEXT:    v_mov_b32_e32 v2, 1
247; GCN-NEXT:    v_mov_b32_e32 v5, s7
248; GCN-NEXT:    buffer_store_dword v2, v5, s[0:3], 0 offen
249; GCN-NEXT:    v_lshlrev_b32_e32 v2, 2, v3
250; GCN-NEXT:    v_add_u32_e32 v2, s6, v2
251; GCN-NEXT:    buffer_load_dword v2, v2, s[0:3], 0 offen
252; GCN-NEXT:    v_and_b32_e32 v3, 0x3ff, v4
253; GCN-NEXT:    s_waitcnt vmcnt(0)
254; GCN-NEXT:    v_add_u32_e32 v2, v2, v3
255; GCN-NEXT:    global_store_dword v[0:1], v2, off
256; GCN-NEXT:  BB3_2: ; %bb.1
257; GCN-NEXT:    s_or_b64 exec, exec, s[4:5]
258; GCN-NEXT:    v_mov_b32_e32 v0, 0
259; GCN-NEXT:    global_store_dword v[0:1], v0, off
260; GCN-NEXT:    s_sub_u32 s32, s32, 0x2000
261; GCN-NEXT:    s_mov_b32 s33, s8
262; GCN-NEXT:    s_waitcnt vmcnt(0)
263; GCN-NEXT:    s_setpc_b64 s[30:31]
; IR mirrors the align-64 kernel above: alloca in bb.0 (non-entry), used there.
264entry:
265  %cond = icmp eq i32 %arg.cond, 0
266  br i1 %cond, label %bb.0, label %bb.1
267
268bb.0:
269  %alloca = alloca [16 x i32], align 64, addrspace(5)
270  %gep0 = getelementptr [16 x i32], [16 x i32] addrspace(5)* %alloca, i32 0, i32 0
271  %gep1 = getelementptr [16 x i32], [16 x i32] addrspace(5)* %alloca, i32 0, i32 1
272  store i32 0, i32 addrspace(5)* %gep0
273  store i32 1, i32 addrspace(5)* %gep1
274  %gep2 = getelementptr [16 x i32], [16 x i32] addrspace(5)* %alloca, i32 0, i32 %in
275  %load = load i32, i32 addrspace(5)* %gep2
276  %tid = call i32 @llvm.amdgcn.workitem.id.x()
277  %add = add i32 %load, %tid
278  store i32 %add, i32 addrspace(1)* %out
279  br label %bb.1
280
; Common exit block; the volatile store keeps it live regardless of the path.
281bb.1:
282  store volatile i32 0, i32 addrspace(1)* undef
283  ret void
284}
285
286declare i32 @llvm.amdgcn.workitem.id.x() #0
287
288attributes #0 = { nounwind readnone speculatable }
289