; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,DEFAULTSIZE,MUBUF %s
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -verify-machineinstrs -amdgpu-assume-dynamic-stack-object-size=1024 < %s | FileCheck -check-prefixes=GCN,ASSUME1024,MUBUF %s
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -verify-machineinstrs -amdgpu-enable-flat-scratch < %s | FileCheck -check-prefixes=GCN,DEFAULTSIZE,FLATSCR %s
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -verify-machineinstrs -amdgpu-enable-flat-scratch -amdgpu-assume-dynamic-stack-object-size=1024 < %s | FileCheck -check-prefixes=GCN,ASSUME1024,FLATSCR %s

; FIXME: Generated test checks do not check metadata at the end of the
; function, so this also includes manually added checks.

; Test that we can select a statically sized alloca outside of the
; entry block.

; FIXME: FunctionLoweringInfo unhelpfully doesn't preserve an
; alignment less than the stack alignment.
define amdgpu_kernel void @kernel_non_entry_block_static_alloca_uniformly_reached_align4(i32 addrspace(1)* %out, i32 %arg.cond0, i32 %arg.cond1, i32 %in) {
; MUBUF-LABEL: kernel_non_entry_block_static_alloca_uniformly_reached_align4:
; MUBUF:       ; %bb.0: ; %entry
; MUBUF-NEXT:    s_add_u32 flat_scratch_lo, s6, s9
; MUBUF-NEXT:    s_addc_u32 flat_scratch_hi, s7, 0
; MUBUF-NEXT:    s_add_u32 s0, s0, s9
; MUBUF-NEXT:    s_load_dwordx4 s[8:11], s[4:5], 0x8
; MUBUF-NEXT:    s_addc_u32 s1, s1, 0
; MUBUF-NEXT:    s_movk_i32 s32, 0x400
; MUBUF-NEXT:    s_mov_b32 s33, 0
; MUBUF-NEXT:    s_waitcnt lgkmcnt(0)
; MUBUF-NEXT:    s_cmp_lg_u32 s8, 0
; MUBUF-NEXT:    s_cbranch_scc1 BB0_3
; MUBUF-NEXT:  ; %bb.1: ; %bb.0
; MUBUF-NEXT:    s_cmp_lg_u32 s9, 0
; MUBUF-NEXT:    s_cbranch_scc1 BB0_3
; MUBUF-NEXT:  ; %bb.2: ; %bb.1
; MUBUF-NEXT:    s_add_i32 s6, s32, 0x1000
; MUBUF-NEXT:    s_lshl_b32 s7, s10, 2
; MUBUF-NEXT:    s_mov_b32 s32, s6
; MUBUF-NEXT:    v_mov_b32_e32 v2, s6
; MUBUF-NEXT:    v_mov_b32_e32 v1, 0
; MUBUF-NEXT:    v_mov_b32_e32 v3, 1
; MUBUF-NEXT:    s_add_i32 s6, s6, s7
; MUBUF-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen
; MUBUF-NEXT:    buffer_store_dword v3, v2, s[0:3], 0 offen offset:4
; MUBUF-NEXT:    v_mov_b32_e32 v2, s6
; MUBUF-NEXT:    buffer_load_dword v2, v2, s[0:3], 0 offen
; MUBUF-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x0
; MUBUF-NEXT:    s_waitcnt vmcnt(0)
; MUBUF-NEXT:    v_add_u32_e32 v0, v2, v0
; MUBUF-NEXT:    s_waitcnt lgkmcnt(0)
; MUBUF-NEXT:    global_store_dword v1, v0, s[4:5]
; MUBUF-NEXT:  BB0_3: ; %bb.2
; MUBUF-NEXT:    v_mov_b32_e32 v0, 0
; MUBUF-NEXT:    global_store_dword v[0:1], v0, off
; MUBUF-NEXT:    s_endpgm
;
; FLATSCR-LABEL: kernel_non_entry_block_static_alloca_uniformly_reached_align4:
; FLATSCR:       ; %bb.0: ; %entry
; FLATSCR-NEXT:    s_add_u32 flat_scratch_lo, s2, s5
; FLATSCR-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x8
; FLATSCR-NEXT:    s_addc_u32 flat_scratch_hi, s3, 0
; FLATSCR-NEXT:    s_mov_b32 s32, 16
; FLATSCR-NEXT:    s_mov_b32 s33, 0
; FLATSCR-NEXT:    s_waitcnt lgkmcnt(0)
; FLATSCR-NEXT:    s_cmp_lg_u32 s4, 0
; FLATSCR-NEXT:    s_cbranch_scc1 BB0_3
; FLATSCR-NEXT:  ; %bb.1: ; %bb.0
; FLATSCR-NEXT:    s_cmp_lg_u32 s5, 0
; FLATSCR-NEXT:    s_cbranch_scc1 BB0_3
; FLATSCR-NEXT:  ; %bb.2: ; %bb.1
; FLATSCR-NEXT:    s_mov_b32 s2, s32
; FLATSCR-NEXT:    s_movk_i32 s3, 0x1000
; FLATSCR-NEXT:    s_add_i32 s4, s2, s3
; FLATSCR-NEXT:    v_mov_b32_e32 v1, 0
; FLATSCR-NEXT:    v_mov_b32_e32 v2, 1
; FLATSCR-NEXT:    s_add_u32 s2, s2, s3
; FLATSCR-NEXT:    scratch_store_dwordx2 off, v[1:2], s2
; FLATSCR-NEXT:    s_lshl_b32 s2, s6, 2
; FLATSCR-NEXT:    s_mov_b32 s32, s4
; FLATSCR-NEXT:    s_add_i32 s4, s4, s2
; FLATSCR-NEXT:    scratch_load_dword v2, off, s4
; FLATSCR-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
; FLATSCR-NEXT:    s_waitcnt vmcnt(0)
; FLATSCR-NEXT:    v_add_u32_e32 v0, v2, v0
; FLATSCR-NEXT:    s_waitcnt lgkmcnt(0)
; FLATSCR-NEXT:    global_store_dword v1, v0, s[0:1]
; FLATSCR-NEXT:  BB0_3: ; %bb.2
; FLATSCR-NEXT:    v_mov_b32_e32 v0, 0
; FLATSCR-NEXT:    global_store_dword v[0:1], v0, off
; FLATSCR-NEXT:    s_endpgm

entry:
  %cond0 = icmp eq i32 %arg.cond0, 0
  br i1 %cond0, label %bb.0, label %bb.2

bb.0:
  ; Statically sized (16 x i32) alloca defined outside the entry block.
  %alloca = alloca [16 x i32], align 4, addrspace(5)
  %gep0 = getelementptr [16 x i32], [16 x i32] addrspace(5)* %alloca, i32 0, i32 0
  %gep1 = getelementptr [16 x i32], [16 x i32] addrspace(5)* %alloca, i32 0, i32 1
  %cond1 = icmp eq i32 %arg.cond1, 0
  br i1 %cond1, label %bb.1, label %bb.2

bb.1:
  ; Use the alloca outside of the defining block.
  store i32 0, i32 addrspace(5)* %gep0
  store i32 1, i32 addrspace(5)* %gep1
  %gep2 = getelementptr [16 x i32], [16 x i32] addrspace(5)* %alloca, i32 0, i32 %in
  %load = load i32, i32 addrspace(5)* %gep2
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %add = add i32 %load, %tid
  store i32 %add, i32 addrspace(1)* %out
  br label %bb.2

bb.2:
  store volatile i32 0, i32 addrspace(1)* undef
  ret void
}
; DEFAULTSIZE: .amdhsa_private_segment_fixed_size 4112
; DEFAULTSIZE: ; ScratchSize: 4112

; ASSUME1024: .amdhsa_private_segment_fixed_size 1040
; ASSUME1024: ; ScratchSize: 1040

define amdgpu_kernel void @kernel_non_entry_block_static_alloca_uniformly_reached_align64(i32 addrspace(1)* %out, i32 %arg.cond, i32 %in) {
; MUBUF-LABEL: kernel_non_entry_block_static_alloca_uniformly_reached_align64:
; MUBUF:       ; %bb.0: ; %entry
; MUBUF-NEXT:    s_add_u32 flat_scratch_lo, s6, s9
; MUBUF-NEXT:    s_addc_u32 flat_scratch_hi, s7, 0
; MUBUF-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x8
; MUBUF-NEXT:    s_add_u32 s0, s0, s9
; MUBUF-NEXT:    s_addc_u32 s1, s1, 0
; MUBUF-NEXT:    s_movk_i32 s32, 0x1000
; MUBUF-NEXT:    s_mov_b32 s33, 0
; MUBUF-NEXT:    s_waitcnt lgkmcnt(0)
; MUBUF-NEXT:    s_cmp_lg_u32 s6, 0
; MUBUF-NEXT:    s_cbranch_scc1 BB1_2
; MUBUF-NEXT:  ; %bb.1: ; %bb.0
; MUBUF-NEXT:    s_add_i32 s6, s32, 0x1000
; MUBUF-NEXT:    s_and_b32 s6, s6, 0xfffff000
; MUBUF-NEXT:    s_lshl_b32 s7, s7, 2
; MUBUF-NEXT:    s_mov_b32 s32, s6
; MUBUF-NEXT:    v_mov_b32_e32 v2, s6
; MUBUF-NEXT:    v_mov_b32_e32 v1, 0
; MUBUF-NEXT:    v_mov_b32_e32 v3, 1
; MUBUF-NEXT:    s_add_i32 s6, s6, s7
; MUBUF-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen
; MUBUF-NEXT:    buffer_store_dword v3, v2, s[0:3], 0 offen offset:4
; MUBUF-NEXT:    v_mov_b32_e32 v2, s6
; MUBUF-NEXT:    buffer_load_dword v2, v2, s[0:3], 0 offen
; MUBUF-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x0
; MUBUF-NEXT:    s_waitcnt vmcnt(0)
; MUBUF-NEXT:    v_add_u32_e32 v0, v2, v0
; MUBUF-NEXT:    s_waitcnt lgkmcnt(0)
; MUBUF-NEXT:    global_store_dword v1, v0, s[4:5]
; MUBUF-NEXT:  BB1_2: ; %bb.1
; MUBUF-NEXT:    v_mov_b32_e32 v0, 0
; MUBUF-NEXT:    global_store_dword v[0:1], v0, off
; MUBUF-NEXT:    s_endpgm
;
; FLATSCR-LABEL: kernel_non_entry_block_static_alloca_uniformly_reached_align64:
; FLATSCR:       ; %bb.0: ; %entry
; FLATSCR-NEXT:    s_add_u32 flat_scratch_lo, s2, s5
; FLATSCR-NEXT:    s_addc_u32 flat_scratch_hi, s3, 0
; FLATSCR-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x8
; FLATSCR-NEXT:    s_mov_b32 s32, 64
; FLATSCR-NEXT:    s_mov_b32 s33, 0
; FLATSCR-NEXT:    s_waitcnt lgkmcnt(0)
; FLATSCR-NEXT:    s_cmp_lg_u32 s2, 0
; FLATSCR-NEXT:    s_cbranch_scc1 BB1_2
; FLATSCR-NEXT:  ; %bb.1: ; %bb.0
; FLATSCR-NEXT:    s_add_i32 s2, s32, 0x1000
; FLATSCR-NEXT:    s_and_b32 s2, s2, 0xfffff000
; FLATSCR-NEXT:    v_mov_b32_e32 v1, 0
; FLATSCR-NEXT:    v_mov_b32_e32 v2, 1
; FLATSCR-NEXT:    s_lshl_b32 s3, s3, 2
; FLATSCR-NEXT:    s_mov_b32 s32, s2
; FLATSCR-NEXT:    scratch_store_dwordx2 off, v[1:2], s2
; FLATSCR-NEXT:    s_add_i32 s2, s2, s3
; FLATSCR-NEXT:    scratch_load_dword v2, off, s2
; FLATSCR-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
; FLATSCR-NEXT:    s_waitcnt vmcnt(0)
; FLATSCR-NEXT:    v_add_u32_e32 v0, v2, v0
; FLATSCR-NEXT:    s_waitcnt lgkmcnt(0)
; FLATSCR-NEXT:    global_store_dword v1, v0, s[0:1]
; FLATSCR-NEXT:  BB1_2: ; %bb.1
; FLATSCR-NEXT:    v_mov_b32_e32 v0, 0
; FLATSCR-NEXT:    global_store_dword v[0:1], v0, off
; FLATSCR-NEXT:    s_endpgm
entry:
  %cond = icmp eq i32 %arg.cond, 0
  br i1 %cond, label %bb.0, label %bb.1

bb.0:
  ; Statically sized alloca with 64-byte alignment in a non-entry block.
  %alloca = alloca [16 x i32], align 64, addrspace(5)
  %gep0 = getelementptr [16 x i32], [16 x i32] addrspace(5)* %alloca, i32 0, i32 0
  %gep1 = getelementptr [16 x i32], [16 x i32] addrspace(5)* %alloca, i32 0, i32 1
  store i32 0, i32 addrspace(5)* %gep0
  store i32 1, i32 addrspace(5)* %gep1
  %gep2 = getelementptr [16 x i32], [16 x i32] addrspace(5)* %alloca, i32 0, i32 %in
  %load = load i32, i32 addrspace(5)* %gep2
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %add = add i32 %load, %tid
  store i32 %add, i32 addrspace(1)* %out
  br label %bb.1

bb.1:
  store volatile i32 0, i32 addrspace(1)* undef
  ret void
}

; DEFAULTSIZE: .amdhsa_private_segment_fixed_size 4160
; DEFAULTSIZE: ; ScratchSize: 4160

; ASSUME1024: .amdhsa_private_segment_fixed_size 1088
; ASSUME1024: ; ScratchSize: 1088


define void @func_non_entry_block_static_alloca_align4(i32 addrspace(1)* %out, i32 %arg.cond0, i32 %arg.cond1, i32 %in) {
; MUBUF-LABEL: func_non_entry_block_static_alloca_align4:
; MUBUF:       ; %bb.0: ; %entry
; MUBUF-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; MUBUF-NEXT:    s_mov_b32 s7, s33
; MUBUF-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v2
; MUBUF-NEXT:    s_mov_b32 s33, s32
; MUBUF-NEXT:    s_add_u32 s32, s32, 0x400
; MUBUF-NEXT:    s_and_saveexec_b64 s[4:5], vcc
; MUBUF-NEXT:    s_cbranch_execz BB2_3
; MUBUF-NEXT:  ; %bb.1: ; %bb.0
; MUBUF-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v3
; MUBUF-NEXT:    s_and_b64 exec, exec, vcc
; MUBUF-NEXT:    s_cbranch_execz BB2_3
; MUBUF-NEXT:  ; %bb.2: ; %bb.1
; MUBUF-NEXT:    s_add_i32 s6, s32, 0x1000
; MUBUF-NEXT:    v_mov_b32_e32 v2, 0
; MUBUF-NEXT:    v_mov_b32_e32 v3, s6
; MUBUF-NEXT:    buffer_store_dword v2, v3, s[0:3], 0 offen
; MUBUF-NEXT:    v_mov_b32_e32 v2, 1
; MUBUF-NEXT:    buffer_store_dword v2, v3, s[0:3], 0 offen offset:4
; MUBUF-NEXT:    v_lshl_add_u32 v2, v4, 2, s6
; MUBUF-NEXT:    buffer_load_dword v2, v2, s[0:3], 0 offen
; MUBUF-NEXT:    v_and_b32_e32 v3, 0x3ff, v5
; MUBUF-NEXT:    s_mov_b32 s32, s6
; MUBUF-NEXT:    s_waitcnt vmcnt(0)
; MUBUF-NEXT:    v_add_u32_e32 v2, v2, v3
; MUBUF-NEXT:    global_store_dword v[0:1], v2, off
; MUBUF-NEXT:  BB2_3: ; %bb.2
; MUBUF-NEXT:    s_or_b64 exec, exec, s[4:5]
; MUBUF-NEXT:    v_mov_b32_e32 v0, 0
; MUBUF-NEXT:    global_store_dword v[0:1], v0, off
; MUBUF-NEXT:    s_sub_u32 s32, s32, 0x400
; MUBUF-NEXT:    s_mov_b32 s33, s7
; MUBUF-NEXT:    s_waitcnt vmcnt(0)
; MUBUF-NEXT:    s_setpc_b64 s[30:31]
;
; FLATSCR-LABEL: func_non_entry_block_static_alloca_align4:
; FLATSCR:       ; %bb.0: ; %entry
; FLATSCR-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; FLATSCR-NEXT:    s_mov_b32 s5, s33
; FLATSCR-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v2
; FLATSCR-NEXT:    s_mov_b32 s33, s32
; FLATSCR-NEXT:    s_add_u32 s32, s32, 16
; FLATSCR-NEXT:    s_and_saveexec_b64 s[0:1], vcc
; FLATSCR-NEXT:    s_cbranch_execz BB2_3
; FLATSCR-NEXT:  ; %bb.1: ; %bb.0
; FLATSCR-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v3
; FLATSCR-NEXT:    s_and_b64 exec, exec, vcc
; FLATSCR-NEXT:    s_cbranch_execz BB2_3
; FLATSCR-NEXT:  ; %bb.2: ; %bb.1
; FLATSCR-NEXT:    s_mov_b32 s2, s32
; FLATSCR-NEXT:    s_movk_i32 s3, 0x1000
; FLATSCR-NEXT:    s_add_i32 s4, s2, s3
; FLATSCR-NEXT:    v_mov_b32_e32 v2, 0
; FLATSCR-NEXT:    v_mov_b32_e32 v3, 1
; FLATSCR-NEXT:    s_add_u32 s2, s2, s3
; FLATSCR-NEXT:    scratch_store_dwordx2 off, v[2:3], s2
; FLATSCR-NEXT:    v_lshl_add_u32 v2, v4, 2, s4
; FLATSCR-NEXT:    scratch_load_dword v2, v2, off
; FLATSCR-NEXT:    v_and_b32_e32 v3, 0x3ff, v5
; FLATSCR-NEXT:    s_mov_b32 s32, s4
; FLATSCR-NEXT:    s_waitcnt vmcnt(0)
; FLATSCR-NEXT:    v_add_u32_e32 v2, v2, v3
; FLATSCR-NEXT:    global_store_dword v[0:1], v2, off
; FLATSCR-NEXT:  BB2_3: ; %bb.2
; FLATSCR-NEXT:    s_or_b64 exec, exec, s[0:1]
; FLATSCR-NEXT:    v_mov_b32_e32 v0, 0
; FLATSCR-NEXT:    global_store_dword v[0:1], v0, off
; FLATSCR-NEXT:    s_sub_u32 s32, s32, 16
; FLATSCR-NEXT:    s_mov_b32 s33, s5
; FLATSCR-NEXT:    s_waitcnt vmcnt(0)
; FLATSCR-NEXT:    s_setpc_b64 s[30:31]

entry:
  %cond0 = icmp eq i32 %arg.cond0, 0
  br i1 %cond0, label %bb.0, label %bb.2

bb.0:
  ; Statically sized (16 x i32) alloca defined outside the entry block.
  %alloca = alloca [16 x i32], align 4, addrspace(5)
  %gep0 = getelementptr [16 x i32], [16 x i32] addrspace(5)* %alloca, i32 0, i32 0
  %gep1 = getelementptr [16 x i32], [16 x i32] addrspace(5)* %alloca, i32 0, i32 1
  %cond1 = icmp eq i32 %arg.cond1, 0
  br i1 %cond1, label %bb.1, label %bb.2

bb.1:
  ; Use the alloca outside of the defining block.
  store i32 0, i32 addrspace(5)* %gep0
  store i32 1, i32 addrspace(5)* %gep1
  %gep2 = getelementptr [16 x i32], [16 x i32] addrspace(5)* %alloca, i32 0, i32 %in
  %load = load i32, i32 addrspace(5)* %gep2
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %add = add i32 %load, %tid
  store i32 %add, i32 addrspace(1)* %out
  br label %bb.2

bb.2:
  store volatile i32 0, i32 addrspace(1)* undef
  ret void
}

define void @func_non_entry_block_static_alloca_align64(i32 addrspace(1)* %out, i32 %arg.cond, i32 %in) {
; MUBUF-LABEL: func_non_entry_block_static_alloca_align64:
; MUBUF:       ; %bb.0: ; %entry
; MUBUF-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; MUBUF-NEXT:    s_add_u32 s4, s32, 0xfc0
; MUBUF-NEXT:    s_mov_b32 s7, s33
; MUBUF-NEXT:    s_and_b32 s33, s4, 0xfffff000
; MUBUF-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v2
; MUBUF-NEXT:    s_add_u32 s32, s32, 0x2000
; MUBUF-NEXT:    s_and_saveexec_b64 s[4:5], vcc
; MUBUF-NEXT:    s_cbranch_execz BB3_2
; MUBUF-NEXT:  ; %bb.1: ; %bb.0
; MUBUF-NEXT:    s_add_i32 s6, s32, 0x1000
; MUBUF-NEXT:    s_and_b32 s6, s6, 0xfffff000
; MUBUF-NEXT:    v_mov_b32_e32 v2, 0
; MUBUF-NEXT:    v_mov_b32_e32 v5, s6
; MUBUF-NEXT:    buffer_store_dword v2, v5, s[0:3], 0 offen
; MUBUF-NEXT:    v_mov_b32_e32 v2, 1
; MUBUF-NEXT:    buffer_store_dword v2, v5, s[0:3], 0 offen offset:4
; MUBUF-NEXT:    v_lshl_add_u32 v2, v3, 2, s6
; MUBUF-NEXT:    buffer_load_dword v2, v2, s[0:3], 0 offen
; MUBUF-NEXT:    v_and_b32_e32 v3, 0x3ff, v4
; MUBUF-NEXT:    s_mov_b32 s32, s6
; MUBUF-NEXT:    s_waitcnt vmcnt(0)
; MUBUF-NEXT:    v_add_u32_e32 v2, v2, v3
; MUBUF-NEXT:    global_store_dword v[0:1], v2, off
; MUBUF-NEXT:  BB3_2: ; %bb.1
; MUBUF-NEXT:    s_or_b64 exec, exec, s[4:5]
; MUBUF-NEXT:    v_mov_b32_e32 v0, 0
; MUBUF-NEXT:    global_store_dword v[0:1], v0, off
; MUBUF-NEXT:    s_sub_u32 s32, s32, 0x2000
; MUBUF-NEXT:    s_mov_b32 s33, s7
; MUBUF-NEXT:    s_waitcnt vmcnt(0)
; MUBUF-NEXT:    s_setpc_b64 s[30:31]
;
; FLATSCR-LABEL: func_non_entry_block_static_alloca_align64:
; FLATSCR:       ; %bb.0: ; %entry
; FLATSCR-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; FLATSCR-NEXT:    s_add_u32 s0, s32, 63
; FLATSCR-NEXT:    s_mov_b32 s3, s33
; FLATSCR-NEXT:    s_and_b32 s33, s0, 0xffffffc0
; FLATSCR-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v2
; FLATSCR-NEXT:    s_add_u32 s32, s32, 0x80
; FLATSCR-NEXT:    s_and_saveexec_b64 s[0:1], vcc
; FLATSCR-NEXT:    s_cbranch_execz BB3_2
; FLATSCR-NEXT:  ; %bb.1: ; %bb.0
; FLATSCR-NEXT:    s_add_i32 s2, s32, 0x1000
; FLATSCR-NEXT:    s_and_b32 s2, s2, 0xfffff000
; FLATSCR-NEXT:    v_mov_b32_e32 v5, 0
; FLATSCR-NEXT:    v_mov_b32_e32 v6, 1
; FLATSCR-NEXT:    v_lshl_add_u32 v2, v3, 2, s2
; FLATSCR-NEXT:    scratch_store_dwordx2 off, v[5:6], s2
; FLATSCR-NEXT:    scratch_load_dword v2, v2, off
; FLATSCR-NEXT:    v_and_b32_e32 v3, 0x3ff, v4
; FLATSCR-NEXT:    s_mov_b32 s32, s2
; FLATSCR-NEXT:    s_waitcnt vmcnt(0)
; FLATSCR-NEXT:    v_add_u32_e32 v2, v2, v3
; FLATSCR-NEXT:    global_store_dword v[0:1], v2, off
; FLATSCR-NEXT:  BB3_2: ; %bb.1
; FLATSCR-NEXT:    s_or_b64 exec, exec, s[0:1]
; FLATSCR-NEXT:    v_mov_b32_e32 v0, 0
; FLATSCR-NEXT:    global_store_dword v[0:1], v0, off
; FLATSCR-NEXT:    s_sub_u32 s32, s32, 0x80
; FLATSCR-NEXT:    s_mov_b32 s33, s3
; FLATSCR-NEXT:    s_waitcnt vmcnt(0)
; FLATSCR-NEXT:    s_setpc_b64 s[30:31]
entry:
  %cond = icmp eq i32 %arg.cond, 0
  br i1 %cond, label %bb.0, label %bb.1

bb.0:
  ; Statically sized alloca with 64-byte alignment in a non-entry block.
  %alloca = alloca [16 x i32], align 64, addrspace(5)
  %gep0 = getelementptr [16 x i32], [16 x i32] addrspace(5)* %alloca, i32 0, i32 0
  %gep1 = getelementptr [16 x i32], [16 x i32] addrspace(5)* %alloca, i32 0, i32 1
  store i32 0, i32 addrspace(5)* %gep0
  store i32 1, i32 addrspace(5)* %gep1
  %gep2 = getelementptr [16 x i32], [16 x i32] addrspace(5)* %alloca, i32 0, i32 %in
  %load = load i32, i32 addrspace(5)* %gep2
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %add = add i32 %load, %tid
  store i32 %add, i32 addrspace(1)* %out
  br label %bb.1

bb.1:
  store volatile i32 0, i32 addrspace(1)* undef
  ret void
}

; Workitem-id intrinsic used by the bb.1 blocks above.
declare i32 @llvm.amdgcn.workitem.id.x() #0

attributes #0 = { nounwind readnone speculatable }
