1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2; RUN: llc -global-isel -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX9 %s
3; RUN: llc -global-isel -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX10 %s
4
; External i32 constant in addrspace(4). The func_* tests in this file load
; it to obtain the element count for their dynamically sized alloca.
5@gv = external addrspace(4) constant i32
6
; Kernel test: dynamically sized alloca of %n i32 elements in addrspace(5)
; with an explicit "align 4", followed by a store of 0 through the result.
; The expected code computes (n << 2) + 15, masks it with -16, shifts it left
; by 6 (GFX9) or 5 (GFX10), and adds it to s32 to form the store address.
; NOTE(review): the 6-vs-5 shift presumably reflects per-wave scaling of the
; scratch offset (wave64 vs wave32) -- confirm against the backend.
7define amdgpu_kernel void @kernel_dynamic_stackalloc_sgpr_align4(i32 %n) {
8; GFX9-LABEL: kernel_dynamic_stackalloc_sgpr_align4:
9; GFX9:       ; %bb.0:
10; GFX9-NEXT:    s_load_dword s4, s[4:5], 0x0
11; GFX9-NEXT:    s_add_u32 flat_scratch_lo, s6, s9
12; GFX9-NEXT:    s_addc_u32 flat_scratch_hi, s7, 0
13; GFX9-NEXT:    s_add_u32 s0, s0, s9
14; GFX9-NEXT:    s_addc_u32 s1, s1, 0
15; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
16; GFX9-NEXT:    s_lshl2_add_u32 s4, s4, 15
17; GFX9-NEXT:    s_and_b32 s4, s4, -16
18; GFX9-NEXT:    s_movk_i32 s32, 0x400
19; GFX9-NEXT:    s_lshl_b32 s4, s4, 6
20; GFX9-NEXT:    s_add_u32 s4, s32, s4
21; GFX9-NEXT:    v_mov_b32_e32 v0, 0
22; GFX9-NEXT:    v_mov_b32_e32 v1, s4
23; GFX9-NEXT:    s_mov_b32 s33, 0
24; GFX9-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen
25; GFX9-NEXT:    s_endpgm
26;
27; GFX10-LABEL: kernel_dynamic_stackalloc_sgpr_align4:
28; GFX10:       ; %bb.0:
29; GFX10-NEXT:    s_add_u32 s6, s6, s9
30; GFX10-NEXT:    s_movk_i32 s32, 0x200
31; GFX10-NEXT:    s_mov_b32 s33, 0
32; GFX10-NEXT:    s_addc_u32 s7, s7, 0
33; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s6
34; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s7
35; GFX10-NEXT:    s_load_dword s4, s[4:5], 0x0
36; GFX10-NEXT:    s_add_u32 s0, s0, s9
37; GFX10-NEXT:    s_addc_u32 s1, s1, 0
38; GFX10-NEXT:    v_mov_b32_e32 v0, 0
39; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
40; GFX10-NEXT:    s_lshl2_add_u32 s4, s4, 15
41; GFX10-NEXT:    s_and_b32 s4, s4, -16
42; GFX10-NEXT:    s_lshl_b32 s4, s4, 5
43; GFX10-NEXT:    s_add_u32 s4, s32, s4
44; GFX10-NEXT:    v_mov_b32_e32 v1, s4
45; GFX10-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen
46; GFX10-NEXT:    s_endpgm
; IR under test: the element count comes straight from the kernel argument.
47  %alloca = alloca i32, i32 %n, align 4, addrspace(5)
48  store i32 0, i32 addrspace(5)* %alloca
49  ret void
50}
51
; Function (non-kernel) test: loads the element count from @gv, performs a
; dynamically sized alloca of that many i32 elements in addrspace(5), and
; stores 0 through the result.
; The expected code saves s33 in s6, sets s33 to the incoming s32, bumps s32
; by 0x400 (GFX9) / 0x200 (GFX10), and undoes the s32 bump and restores s33
; before returning via s_setpc_b64. The allocation size is derived as
; ((n << 2) + 15) & -16, shifted left by 6 (GFX9) or 5 (GFX10).
; NOTE(review): the "align 4" below is on the load of @gv; the alloca itself
; carries no explicit alignment -- confirm this matches the test's intent.
52define void @func_dynamic_stackalloc_sgpr_align4() {
53; GFX9-LABEL: func_dynamic_stackalloc_sgpr_align4:
54; GFX9:       ; %bb.0:
55; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
56; GFX9-NEXT:    s_mov_b32 s6, s33
57; GFX9-NEXT:    s_mov_b32 s33, s32
58; GFX9-NEXT:    s_add_u32 s32, s32, 0x400
59; GFX9-NEXT:    s_getpc_b64 s[4:5]
60; GFX9-NEXT:    s_add_u32 s4, s4, gv@gotpcrel32@lo+4
61; GFX9-NEXT:    s_addc_u32 s5, s5, gv@gotpcrel32@hi+12
62; GFX9-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x0
63; GFX9-NEXT:    v_mov_b32_e32 v0, 0
64; GFX9-NEXT:    s_mov_b32 s33, s6
65; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
66; GFX9-NEXT:    s_load_dword s4, s[4:5], 0x0
67; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
68; GFX9-NEXT:    s_lshl2_add_u32 s4, s4, 15
69; GFX9-NEXT:    s_and_b32 s4, s4, -16
70; GFX9-NEXT:    s_lshl_b32 s4, s4, 6
71; GFX9-NEXT:    s_add_u32 s4, s32, s4
72; GFX9-NEXT:    v_mov_b32_e32 v1, s4
73; GFX9-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen
74; GFX9-NEXT:    s_sub_u32 s32, s32, 0x400
75; GFX9-NEXT:    s_waitcnt vmcnt(0)
76; GFX9-NEXT:    s_setpc_b64 s[30:31]
77;
78; GFX10-LABEL: func_dynamic_stackalloc_sgpr_align4:
79; GFX10:       ; %bb.0:
80; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
81; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
82; GFX10-NEXT:    s_mov_b32 s6, s33
83; GFX10-NEXT:    s_mov_b32 s33, s32
84; GFX10-NEXT:    s_add_u32 s32, s32, 0x200
85; GFX10-NEXT:    s_getpc_b64 s[4:5]
86; GFX10-NEXT:    s_add_u32 s4, s4, gv@gotpcrel32@lo+4
87; GFX10-NEXT:    s_addc_u32 s5, s5, gv@gotpcrel32@hi+12
88; GFX10-NEXT:    v_mov_b32_e32 v0, 0
89; GFX10-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x0
90; GFX10-NEXT:    s_mov_b32 s33, s6
91; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
92; GFX10-NEXT:    s_load_dword s4, s[4:5], 0x0
93; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
94; GFX10-NEXT:    s_lshl2_add_u32 s4, s4, 15
95; GFX10-NEXT:    s_and_b32 s4, s4, -16
96; GFX10-NEXT:    s_lshl_b32 s4, s4, 5
97; GFX10-NEXT:    s_add_u32 s4, s32, s4
98; GFX10-NEXT:    s_sub_u32 s32, s32, 0x200
99; GFX10-NEXT:    v_mov_b32_e32 v1, s4
100; GFX10-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen
101; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
102; GFX10-NEXT:    s_setpc_b64 s[30:31]
; IR under test: the element count is loaded from the external constant @gv.
103  %n = load i32, i32 addrspace(4)* @gv, align 4
104  %alloca = alloca i32, i32 %n, addrspace(5)
105  store i32 0, i32 addrspace(5)* %alloca
106  ret void
107}
108
; Kernel test: dynamically sized alloca of %n i32 elements in addrspace(5)
; with an explicit "align 16", followed by a store of 0 through the result.
; Apart from the label, the expected GFX9/GFX10 sequences are the same as for
; kernel_dynamic_stackalloc_sgpr_align4: the size is ((n << 2) + 15) & -16,
; shifted left by 6 (GFX9) or 5 (GFX10) and added to s32, with no additional
; masking of the resulting address.
109define amdgpu_kernel void @kernel_dynamic_stackalloc_sgpr_align16(i32 %n) {
110; GFX9-LABEL: kernel_dynamic_stackalloc_sgpr_align16:
111; GFX9:       ; %bb.0:
112; GFX9-NEXT:    s_load_dword s4, s[4:5], 0x0
113; GFX9-NEXT:    s_add_u32 flat_scratch_lo, s6, s9
114; GFX9-NEXT:    s_addc_u32 flat_scratch_hi, s7, 0
115; GFX9-NEXT:    s_add_u32 s0, s0, s9
116; GFX9-NEXT:    s_addc_u32 s1, s1, 0
117; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
118; GFX9-NEXT:    s_lshl2_add_u32 s4, s4, 15
119; GFX9-NEXT:    s_and_b32 s4, s4, -16
120; GFX9-NEXT:    s_movk_i32 s32, 0x400
121; GFX9-NEXT:    s_lshl_b32 s4, s4, 6
122; GFX9-NEXT:    s_add_u32 s4, s32, s4
123; GFX9-NEXT:    v_mov_b32_e32 v0, 0
124; GFX9-NEXT:    v_mov_b32_e32 v1, s4
125; GFX9-NEXT:    s_mov_b32 s33, 0
126; GFX9-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen
127; GFX9-NEXT:    s_endpgm
128;
129; GFX10-LABEL: kernel_dynamic_stackalloc_sgpr_align16:
130; GFX10:       ; %bb.0:
131; GFX10-NEXT:    s_add_u32 s6, s6, s9
132; GFX10-NEXT:    s_movk_i32 s32, 0x200
133; GFX10-NEXT:    s_mov_b32 s33, 0
134; GFX10-NEXT:    s_addc_u32 s7, s7, 0
135; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s6
136; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s7
137; GFX10-NEXT:    s_load_dword s4, s[4:5], 0x0
138; GFX10-NEXT:    s_add_u32 s0, s0, s9
139; GFX10-NEXT:    s_addc_u32 s1, s1, 0
140; GFX10-NEXT:    v_mov_b32_e32 v0, 0
141; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
142; GFX10-NEXT:    s_lshl2_add_u32 s4, s4, 15
143; GFX10-NEXT:    s_and_b32 s4, s4, -16
144; GFX10-NEXT:    s_lshl_b32 s4, s4, 5
145; GFX10-NEXT:    s_add_u32 s4, s32, s4
146; GFX10-NEXT:    v_mov_b32_e32 v1, s4
147; GFX10-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen
148; GFX10-NEXT:    s_endpgm
; IR under test: the element count comes straight from the kernel argument.
149  %alloca = alloca i32, i32 %n, align 16, addrspace(5)
150  store i32 0, i32 addrspace(5)* %alloca
151  ret void
152}
153
; Function (non-kernel) test: loads the element count from @gv, performs a
; dynamically sized alloca of that many i32 elements in addrspace(5), and
; stores 0 through the result.
; Apart from the label, the expected GFX9/GFX10 sequences are the same as for
; func_dynamic_stackalloc_sgpr_align4.
; NOTE(review): despite the "align16" in the name, the "align 16" below is on
; the load of @gv; the alloca itself carries no explicit alignment. Confirm
; whether the alloca was meant to be "align 16"; if the IR is changed, the
; autogenerated assertions must be regenerated.
154define void @func_dynamic_stackalloc_sgpr_align16() {
155; GFX9-LABEL: func_dynamic_stackalloc_sgpr_align16:
156; GFX9:       ; %bb.0:
157; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
158; GFX9-NEXT:    s_mov_b32 s6, s33
159; GFX9-NEXT:    s_mov_b32 s33, s32
160; GFX9-NEXT:    s_add_u32 s32, s32, 0x400
161; GFX9-NEXT:    s_getpc_b64 s[4:5]
162; GFX9-NEXT:    s_add_u32 s4, s4, gv@gotpcrel32@lo+4
163; GFX9-NEXT:    s_addc_u32 s5, s5, gv@gotpcrel32@hi+12
164; GFX9-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x0
165; GFX9-NEXT:    v_mov_b32_e32 v0, 0
166; GFX9-NEXT:    s_mov_b32 s33, s6
167; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
168; GFX9-NEXT:    s_load_dword s4, s[4:5], 0x0
169; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
170; GFX9-NEXT:    s_lshl2_add_u32 s4, s4, 15
171; GFX9-NEXT:    s_and_b32 s4, s4, -16
172; GFX9-NEXT:    s_lshl_b32 s4, s4, 6
173; GFX9-NEXT:    s_add_u32 s4, s32, s4
174; GFX9-NEXT:    v_mov_b32_e32 v1, s4
175; GFX9-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen
176; GFX9-NEXT:    s_sub_u32 s32, s32, 0x400
177; GFX9-NEXT:    s_waitcnt vmcnt(0)
178; GFX9-NEXT:    s_setpc_b64 s[30:31]
179;
180; GFX10-LABEL: func_dynamic_stackalloc_sgpr_align16:
181; GFX10:       ; %bb.0:
182; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
183; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
184; GFX10-NEXT:    s_mov_b32 s6, s33
185; GFX10-NEXT:    s_mov_b32 s33, s32
186; GFX10-NEXT:    s_add_u32 s32, s32, 0x200
187; GFX10-NEXT:    s_getpc_b64 s[4:5]
188; GFX10-NEXT:    s_add_u32 s4, s4, gv@gotpcrel32@lo+4
189; GFX10-NEXT:    s_addc_u32 s5, s5, gv@gotpcrel32@hi+12
190; GFX10-NEXT:    v_mov_b32_e32 v0, 0
191; GFX10-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x0
192; GFX10-NEXT:    s_mov_b32 s33, s6
193; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
194; GFX10-NEXT:    s_load_dword s4, s[4:5], 0x0
195; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
196; GFX10-NEXT:    s_lshl2_add_u32 s4, s4, 15
197; GFX10-NEXT:    s_and_b32 s4, s4, -16
198; GFX10-NEXT:    s_lshl_b32 s4, s4, 5
199; GFX10-NEXT:    s_add_u32 s4, s32, s4
200; GFX10-NEXT:    s_sub_u32 s32, s32, 0x200
201; GFX10-NEXT:    v_mov_b32_e32 v1, s4
202; GFX10-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen
203; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
204; GFX10-NEXT:    s_setpc_b64 s[30:31]
; IR under test: the element count is loaded from the external constant @gv.
205  %n = load i32, i32 addrspace(4)* @gv, align 16
206  %alloca = alloca i32, i32 %n, addrspace(5)
207  store i32 0, i32 addrspace(5)* %alloca
208  ret void
209}
210
; Kernel test: dynamically sized alloca of %n i32 elements in addrspace(5)
; with an explicit "align 32", followed by a store of 0 through the result.
; Compared with the align4/align16 kernels, the expected code initializes s32
; to 0x800 (GFX9) / 0x400 (GFX10) and additionally masks the computed address
; with 0xfffff800 (GFX9) / 0xfffffc00 (GFX10), i.e. clears its low 11 / 10
; bits, before using it for the store.
; NOTE(review): those masks presumably correspond to the 32-byte alignment
; scaled by the wave size (64 on GFX9, 32 on GFX10) -- confirm.
211define amdgpu_kernel void @kernel_dynamic_stackalloc_sgpr_align32(i32 %n) {
212; GFX9-LABEL: kernel_dynamic_stackalloc_sgpr_align32:
213; GFX9:       ; %bb.0:
214; GFX9-NEXT:    s_load_dword s4, s[4:5], 0x0
215; GFX9-NEXT:    s_add_u32 flat_scratch_lo, s6, s9
216; GFX9-NEXT:    s_addc_u32 flat_scratch_hi, s7, 0
217; GFX9-NEXT:    s_add_u32 s0, s0, s9
218; GFX9-NEXT:    s_addc_u32 s1, s1, 0
219; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
220; GFX9-NEXT:    s_lshl2_add_u32 s4, s4, 15
221; GFX9-NEXT:    s_and_b32 s4, s4, -16
222; GFX9-NEXT:    s_movk_i32 s32, 0x800
223; GFX9-NEXT:    s_lshl_b32 s4, s4, 6
224; GFX9-NEXT:    s_add_u32 s4, s32, s4
225; GFX9-NEXT:    s_and_b32 s4, s4, 0xfffff800
226; GFX9-NEXT:    v_mov_b32_e32 v0, 0
227; GFX9-NEXT:    v_mov_b32_e32 v1, s4
228; GFX9-NEXT:    s_mov_b32 s33, 0
229; GFX9-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen
230; GFX9-NEXT:    s_endpgm
231;
232; GFX10-LABEL: kernel_dynamic_stackalloc_sgpr_align32:
233; GFX10:       ; %bb.0:
234; GFX10-NEXT:    s_add_u32 s6, s6, s9
235; GFX10-NEXT:    s_movk_i32 s32, 0x400
236; GFX10-NEXT:    s_mov_b32 s33, 0
237; GFX10-NEXT:    s_addc_u32 s7, s7, 0
238; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s6
239; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s7
240; GFX10-NEXT:    s_load_dword s4, s[4:5], 0x0
241; GFX10-NEXT:    s_add_u32 s0, s0, s9
242; GFX10-NEXT:    s_addc_u32 s1, s1, 0
243; GFX10-NEXT:    v_mov_b32_e32 v0, 0
244; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
245; GFX10-NEXT:    s_lshl2_add_u32 s4, s4, 15
246; GFX10-NEXT:    s_and_b32 s4, s4, -16
247; GFX10-NEXT:    s_lshl_b32 s4, s4, 5
248; GFX10-NEXT:    s_add_u32 s4, s32, s4
249; GFX10-NEXT:    s_and_b32 s4, s4, 0xfffffc00
250; GFX10-NEXT:    v_mov_b32_e32 v1, s4
251; GFX10-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen
252; GFX10-NEXT:    s_endpgm
; IR under test: the element count comes straight from the kernel argument.
253  %alloca = alloca i32, i32 %n, align 32, addrspace(5)
254  store i32 0, i32 addrspace(5)* %alloca
255  ret void
256}
257
; Function (non-kernel) test: loads the element count from @gv, performs a
; dynamically sized alloca of that many i32 elements in addrspace(5) with an
; explicit "align 32", and stores 0 through the result.
; The expected prologue computes s33 as (s32 + 0x7c0) & 0xfffff800 on GFX9
; and (s32 + 0x3e0) & 0xfffffc00 on GFX10, and bumps s32 by 0x1000 / 0x800.
; The computed alloca address is masked with the same 0xfffff800 / 0xfffffc00
; value before the store; s32 and s33 are restored before returning.
; NOTE(review): the %out argument is not referenced by the body -- confirm
; whether it is intentional.
258define void @func_dynamic_stackalloc_sgpr_align32(i32 addrspace(1)* %out) {
259; GFX9-LABEL: func_dynamic_stackalloc_sgpr_align32:
260; GFX9:       ; %bb.0:
261; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
262; GFX9-NEXT:    s_add_u32 s4, s32, 0x7c0
263; GFX9-NEXT:    s_mov_b32 s6, s33
264; GFX9-NEXT:    s_and_b32 s33, s4, 0xfffff800
265; GFX9-NEXT:    s_add_u32 s32, s32, 0x1000
266; GFX9-NEXT:    s_getpc_b64 s[4:5]
267; GFX9-NEXT:    s_add_u32 s4, s4, gv@gotpcrel32@lo+4
268; GFX9-NEXT:    s_addc_u32 s5, s5, gv@gotpcrel32@hi+12
269; GFX9-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x0
270; GFX9-NEXT:    v_mov_b32_e32 v0, 0
271; GFX9-NEXT:    s_mov_b32 s33, s6
272; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
273; GFX9-NEXT:    s_load_dword s4, s[4:5], 0x0
274; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
275; GFX9-NEXT:    s_lshl2_add_u32 s4, s4, 15
276; GFX9-NEXT:    s_and_b32 s4, s4, -16
277; GFX9-NEXT:    s_lshl_b32 s4, s4, 6
278; GFX9-NEXT:    s_add_u32 s4, s32, s4
279; GFX9-NEXT:    s_and_b32 s4, s4, 0xfffff800
280; GFX9-NEXT:    v_mov_b32_e32 v1, s4
281; GFX9-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen
282; GFX9-NEXT:    s_sub_u32 s32, s32, 0x1000
283; GFX9-NEXT:    s_waitcnt vmcnt(0)
284; GFX9-NEXT:    s_setpc_b64 s[30:31]
285;
286; GFX10-LABEL: func_dynamic_stackalloc_sgpr_align32:
287; GFX10:       ; %bb.0:
288; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
289; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
290; GFX10-NEXT:    s_add_u32 s4, s32, 0x3e0
291; GFX10-NEXT:    s_mov_b32 s6, s33
292; GFX10-NEXT:    s_and_b32 s33, s4, 0xfffffc00
293; GFX10-NEXT:    s_add_u32 s32, s32, 0x800
294; GFX10-NEXT:    s_getpc_b64 s[4:5]
295; GFX10-NEXT:    s_add_u32 s4, s4, gv@gotpcrel32@lo+4
296; GFX10-NEXT:    s_addc_u32 s5, s5, gv@gotpcrel32@hi+12
297; GFX10-NEXT:    v_mov_b32_e32 v0, 0
298; GFX10-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x0
299; GFX10-NEXT:    s_mov_b32 s33, s6
300; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
301; GFX10-NEXT:    s_load_dword s4, s[4:5], 0x0
302; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
303; GFX10-NEXT:    s_lshl2_add_u32 s4, s4, 15
304; GFX10-NEXT:    s_and_b32 s4, s4, -16
305; GFX10-NEXT:    s_lshl_b32 s4, s4, 5
306; GFX10-NEXT:    s_add_u32 s4, s32, s4
307; GFX10-NEXT:    s_and_b32 s4, s4, 0xfffffc00
308; GFX10-NEXT:    s_sub_u32 s32, s32, 0x800
309; GFX10-NEXT:    v_mov_b32_e32 v1, s4
310; GFX10-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen
311; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
312; GFX10-NEXT:    s_setpc_b64 s[30:31]
; IR under test: the element count is loaded from the external constant @gv
; (no explicit alignment on the load); the alignment is on the alloca.
313  %n = load i32, i32 addrspace(4)* @gv
314  %alloca = alloca i32, i32 %n, align 32, addrspace(5)
315  store i32 0, i32 addrspace(5)* %alloca
316  ret void
317}
318