1; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -enable-misched=0 -post-RA-scheduler=0 -stress-regalloc=8 < %s | FileCheck -check-prefixes=GCN,MUBUF %s
2; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -enable-misched=0 -post-RA-scheduler=0 -stress-regalloc=8 -amdgpu-enable-flat-scratch < %s | FileCheck -check-prefixes=GCN,FLATSCR %s
3
4; Test that the VGPR spiller correctly switches to SGPR offsets when the
5; instruction offset field would overflow, and that it accounts for memory
6; swizzling.
7
8; GCN-LABEL: test_inst_offset_kernel
9define amdgpu_kernel void @test_inst_offset_kernel() {
10entry:
11  ; Occupy 4092 bytes of scratch, so the offset of the spill of %a just fits in
12  ; the instruction offset field.
  ; NOTE(review): the alloca below is only 4088 bytes, yet the MUBUF check
  ; expects the spill at offset 4092; presumably 4 more bytes of the kernel's
  ; frame are occupied by something else -- confirm.
13  %alloca = alloca i8, i32 4088, align 4, addrspace(5)
14  %buf = bitcast i8 addrspace(5)* %alloca to i32 addrspace(5)*
15
16  %aptr = getelementptr i32, i32 addrspace(5)* %buf, i32 1
  ; The MUBUF run expects the whole offset as an immediate (the soffset operand
  ; is the literal 0). The flat-scratch run expects an SGPR address operand and
  ; no immediate offset.
17  ; MUBUF:   buffer_store_dword v{{[0-9]+}}, off, s[{{[0-9]+:[0-9]+}}], 0 offset:4092 ; 4-byte Folded Spill
18  ; FLATSCR: scratch_store_dword off, v{{[0-9]+}}, s{{[0-9]+}} ; 4-byte Folded Spill
19  %a = load volatile i32, i32 addrspace(5)* %aptr
20
21  ; Force %a to spill.
  ; The empty asm clobbers v0-v7 while %a is still live (it is stored below).
22  call void asm sideeffect "", "~{v0},~{v1},~{v2},~{v3},~{v4},~{v5},~{v6},~{v7}" ()
23
  ; Use %a after the clobber so that it has to survive across the asm.
24  %outptr = getelementptr i32, i32 addrspace(5)* %buf, i32 1
25  store volatile i32 %a, i32 addrspace(5)* %outptr
26
27  ret void
28}
29
30; GCN-LABEL: test_sgpr_offset_kernel
31define amdgpu_kernel void @test_sgpr_offset_kernel() {
32entry:
33  ; Occupy 4096 bytes of scratch, so the offset of the spill of %a does not
34  ; fit in the instruction, and has to live in the SGPR offset.
  ; NOTE(review): the alloca below is 4092 bytes; the remaining 4 bytes are
  ; presumably occupied by something else in the kernel's frame -- confirm.
35  %alloca = alloca i8, i32 4092, align 4, addrspace(5)
36  %buf = bitcast i8 addrspace(5)* %alloca to i32 addrspace(5)*
37
38  %aptr = getelementptr i32, i32 addrspace(5)* %buf, i32 1
39  ; 0x40000 / 64 = 4096 (for wave64)
  ; That is, the MUBUF soffset value is the 4096-byte offset multiplied by 64,
  ; while the flat-scratch value 0x1000 is 4096 unscaled.
  ; Both runs pin the exact SGPR holding the offset (s6 and s2 respectively),
  ; so these checks are sensitive to register allocation changes.
40  ; MUBUF:   s_mov_b32 s6, 0x40000
41  ; MUBUF:   buffer_store_dword v{{[0-9]+}}, off, s[{{[0-9]+:[0-9]+}}], s6 ; 4-byte Folded Spill
42  ; FLATSCR: s_movk_i32 s2, 0x1000
43  ; FLATSCR: scratch_store_dword off, v{{[0-9]+}}, s2 ; 4-byte Folded Spill
44  %a = load volatile i32, i32 addrspace(5)* %aptr
45
46  ; Force %a to spill
  ; The empty asm clobbers v0-v7 while %a is still live (it is stored below).
47  call void asm sideeffect "", "~{v0},~{v1},~{v2},~{v3},~{v4},~{v5},~{v6},~{v7}" ()
48
  ; Use %a after the clobber so that it has to survive across the asm.
49  %outptr = getelementptr i32, i32 addrspace(5)* %buf, i32 1
50  store volatile i32 %a, i32 addrspace(5)* %outptr
51
52  ret void
53}
54
55; FIXME: If we fail to scavenge an SGPR in a kernel we don't have a stack
56; pointer to temporarily update, so we just crash.
57
58; GCN-LABEL: test_sgpr_offset_function_scavenge_fail
59define void @test_sgpr_offset_function_scavenge_fail() #2 {
60entry:
  ; Spill and reload %a while eight SGPR values produced by inline asm are live,
  ; in a function whose attribute group #2 sets "amdgpu-num-sgpr" to 14 and
  ; "amdgpu-num-vgpr" to 8. The MUBUF run expects s32 itself to be incremented
  ; by the offset, used as the soffset operand, and then decremented again; the
  ; flat-scratch run expects the sum to be formed in a separate SGPR.
61  ; Occupy 4096 bytes of scratch, so the offset of the spill of %a does not
62  ; fit in the instruction, and has to live in the SGPR offset.
63  %alloca = alloca i8, i32 4096, align 4, addrspace(5)
64  %buf = bitcast i8 addrspace(5)* %alloca to i32 addrspace(5)*
65
66  %aptr = getelementptr i32, i32 addrspace(5)* %buf, i32 1
67
  ; First group of eight SGPR values; they stay live until the asm that
  ; consumes them together with %a.
68  %asm.0 = call { i32, i32, i32, i32, i32, i32, i32, i32 } asm sideeffect "", "=s,=s,=s,=s,=s,=s,=s,=s"()
69  %asm0.0 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32 } %asm.0, 0
70  %asm1.0 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32 } %asm.0, 1
71  %asm2.0 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32 } %asm.0, 2
72  %asm3.0 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32 } %asm.0, 3
73  %asm4.0 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32 } %asm.0, 4
74  %asm5.0 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32 } %asm.0, 5
75  %asm6.0 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32 } %asm.0, 6
76  %asm7.0 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32 } %asm.0, 7
77
78  ; 0x40000 / 64 = 4096 (for wave64)
79  %a = load volatile i32, i32 addrspace(5)* %aptr
80
  ; Expected spill of %a. The flat-scratch capture accepts any SGPR number,
  ; including multi-digit ones.
81  ; MUBUF:   s_add_u32 s32, s32, 0x40000
82  ; MUBUF:   buffer_store_dword v{{[0-9]+}}, off, s[{{[0-9]+:[0-9]+}}], s32 ; 4-byte Folded Spill
83  ; MUBUF:   s_sub_u32 s32, s32, 0x40000
84  ; FLATSCR: s_add_u32 [[SOFF:s[0-9]+]], s32, 0x1000
85  ; FLATSCR: scratch_store_dword off, v{{[0-9]+}}, [[SOFF]] ; 4-byte Folded Spill
86  call void asm sideeffect "", "s,s,s,s,s,s,s,s,v"(i32 %asm0.0, i32 %asm1.0, i32 %asm2.0, i32 %asm3.0, i32 %asm4.0, i32 %asm5.0, i32 %asm6.0, i32 %asm7.0, i32 %a)
87
  ; Second group of eight SGPR values, live across the reload of %a.
88  %asm = call { i32, i32, i32, i32, i32, i32, i32, i32 } asm sideeffect "", "=s,=s,=s,=s,=s,=s,=s,=s"()
89  %asm0 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32 } %asm, 0
90  %asm1 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32 } %asm, 1
91  %asm2 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32 } %asm, 2
92  %asm3 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32 } %asm, 3
93  %asm4 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32 } %asm, 4
94  %asm5 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32 } %asm, 5
95  %asm6 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32 } %asm, 6
96  %asm7 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32 } %asm, 7
97
  ; Clobber v0-v7 while %a is still needed by the final asm below.
98  call void asm sideeffect "", "~{v0},~{v1},~{v2},~{v3},~{v4},~{v5},~{v6},~{v7}"() #0
99
  ; Expected reload of %a, mirroring the spill sequence above.
100  ; MUBUF:   s_add_u32 s32, s32, 0x40000
101  ; MUBUF:   buffer_load_dword v{{[0-9]+}}, off, s[{{[0-9]+:[0-9]+}}], s32 ; 4-byte Folded Reload
102  ; MUBUF:   s_sub_u32 s32, s32, 0x40000
103  ; FLATSCR: s_add_u32 [[SOFF:s[0-9]+]], s32, 0x1000
104  ; FLATSCR: scratch_load_dword v{{[0-9]+}}, off, [[SOFF]] ; 4-byte Folded Reload
105
106  ; Force %a to spill with no free SGPRs
107  call void asm sideeffect "", "s,s,s,s,s,s,s,s,v"(i32 %asm0, i32 %asm1, i32 %asm2, i32 %asm3, i32 %asm4, i32 %asm5, i32 %asm6, i32 %asm7, i32 %a)
108  ret void
109}
110
111; GCN-LABEL: test_sgpr_offset_subregs_kernel
112define amdgpu_kernel void @test_sgpr_offset_subregs_kernel() {
113entry:
114  ; Occupy 4088 bytes of scratch, so that the spill of the last subreg of %a
115  ; still fits below offset 4096 (4088 + 8 - 4 = 4092), and can be placed in
116  ; the instruction offset field.
  ; NOTE(review): despite "sgpr_offset" in the name, the MUBUF run here expects
  ; immediate offsets, while test_inst_offset_subregs_kernel expects an SGPR
  ; offset; the two names look swapped -- confirm before renaming.
  ; NOTE(review): the alloca below is 4084 bytes; the remaining 4 bytes are
  ; presumably occupied by something else in the kernel's frame -- confirm.
117  %alloca = alloca i8, i32 4084, align 4, addrspace(5)
118  %bufv1 = bitcast i8 addrspace(5)* %alloca to i32 addrspace(5)*
119  %bufv2 = bitcast i8 addrspace(5)* %alloca to <2 x i32> addrspace(5)*
120
  ; One 4-byte store is expected per 32-bit half of %a. The MUBUF run expects
  ; immediates 4088 and 4092; the flat-scratch run expects 0xff8 (4088) in an
  ; SGPR, with the second half at immediate offset 4 from it.
121  ; MUBUF:   buffer_store_dword v{{[0-9]+}}, off, s[{{[0-9]+:[0-9]+}}], 0 offset:4088 ; 4-byte Folded Spill
122  ; MUBUF:   buffer_store_dword v{{[0-9]+}}, off, s[{{[0-9]+:[0-9]+}}], 0 offset:4092 ; 4-byte Folded Spill
123  ; FLATSCR: s_movk_i32 [[SOFF:s[0-9]+]], 0xff8
124  ; FLATSCR: scratch_store_dword off, v{{[0-9]+}}, [[SOFF]]          ; 4-byte Folded Spill
125  ; FLATSCR: scratch_store_dword off, v{{[0-9]+}}, [[SOFF]] offset:4 ; 4-byte Folded Spill
126  %aptr = getelementptr <2 x i32>, <2 x i32> addrspace(5)* %bufv2, i32 1
127  %a = load volatile <2 x i32>, <2 x i32> addrspace(5)* %aptr
128
129  ; Force %a to spill.
  ; The empty asm clobbers v0-v7 while %a is still live (it is used below).
130  call void asm sideeffect "", "~{v0},~{v1},~{v2},~{v3},~{v4},~{v5},~{v6},~{v7}" ()
131
132  ; Ensure the alloca sticks around.
133  %bptr = getelementptr i32, i32 addrspace(5)* %bufv1, i32 1
134  %b = load volatile i32, i32 addrspace(5)* %bptr
135
136  ; Ensure the spill is of the full super-reg.
  ; The whole <2 x i32> value is passed to the asm as a single operand.
137  call void asm sideeffect "; $0", "r"(<2 x i32> %a)
138
139  ret void
140}
141
142; GCN-LABEL: test_inst_offset_subregs_kernel
143define amdgpu_kernel void @test_inst_offset_subregs_kernel() {
144entry:
145  ; Occupy 4092 bytes of scratch, so that the spill of the last subreg of %a
146  ; does not fit below offset 4096 (4092 + 8 - 4 = 4096), and has to live
147  ; in the SGPR offset.
  ; NOTE(review): despite "inst_offset" in the name, the MUBUF run here expects
  ; an SGPR offset, while test_sgpr_offset_subregs_kernel expects immediate
  ; offsets; the two names look swapped -- confirm before renaming.
  ; NOTE(review): the alloca below is 4088 bytes; the remaining 4 bytes are
  ; presumably occupied by something else in the kernel's frame -- confirm.
148  %alloca = alloca i8, i32 4088, align 4, addrspace(5)
149  %bufv1 = bitcast i8 addrspace(5)* %alloca to i32 addrspace(5)*
150  %bufv2 = bitcast i8 addrspace(5)* %alloca to <2 x i32> addrspace(5)*
151
152  ; 0x3ff00 / 64 = 4092 (for wave64)
  ; Both halves of %a are expected to share one SGPR base (4092), with the
  ; second half at immediate offset 4. The flat-scratch value 0xffc is 4092
  ; unscaled. The MUBUF run pins the SGPR to s6.
153  ; MUBUF:   s_mov_b32 s6, 0x3ff00
154  ; MUBUF:   buffer_store_dword v{{[0-9]+}}, off, s[{{[0-9]+:[0-9]+}}], s6 ; 4-byte Folded Spill
155  ; MUBUF:   buffer_store_dword v{{[0-9]+}}, off, s[{{[0-9]+:[0-9]+}}], s6 offset:4 ; 4-byte Folded Spill
156  ; FLATSCR: s_movk_i32 [[SOFF:s[0-9]+]], 0xffc
157  ; FLATSCR: scratch_store_dword off, v{{[0-9]+}}, [[SOFF]]          ; 4-byte Folded Spill
158  ; FLATSCR: scratch_store_dword off, v{{[0-9]+}}, [[SOFF]] offset:4 ; 4-byte Folded Spill
159  %aptr = getelementptr <2 x i32>, <2 x i32> addrspace(5)* %bufv2, i32 1
160  %a = load volatile <2 x i32>, <2 x i32> addrspace(5)* %aptr
161
162  ; Force %a to spill.
  ; The empty asm clobbers v0-v7 while %a is still live (it is used below).
163  call void asm sideeffect "", "~{v0},~{v1},~{v2},~{v3},~{v4},~{v5},~{v6},~{v7}" ()
164
165  ; Ensure the alloca sticks around.
166  %bptr = getelementptr i32, i32 addrspace(5)* %bufv1, i32 1
167  %b = load volatile i32, i32 addrspace(5)* %bptr
168
169  ; Ensure the spill is of the full super-reg.
  ; The whole <2 x i32> value is passed to the asm as a single operand.
170  call void asm sideeffect "; $0", "r"(<2 x i32> %a)
171
172  ret void
173}
174
175; GCN-LABEL: test_inst_offset_function
176define void @test_inst_offset_function() {
177entry:
178  ; Occupy 4092 bytes of scratch, so the offset of the spill of %a just fits in
179  ; the instruction offset field.
  ; Unlike the kernel variant of this test, the alloca size here equals the
  ; 4092 bytes named above.
180  %alloca = alloca i8, i32 4092, align 4, addrspace(5)
181  %buf = bitcast i8 addrspace(5)* %alloca to i32 addrspace(5)*
182
183  %aptr = getelementptr i32, i32 addrspace(5)* %buf, i32 1
  ; Both runs expect immediate offset 4092 on top of some SGPR operand; the
  ; SGPR number is not pinned.
184  ; MUBUF:   buffer_store_dword v{{[0-9]+}}, off, s[{{[0-9]+:[0-9]+}}], s{{[0-9]+}} offset:4092 ; 4-byte Folded Spill
185  ; FLATSCR: scratch_store_dword off, v{{[0-9]+}}, s{{[0-9]+}} offset:4092 ; 4-byte Folded Spill
186  %a = load volatile i32, i32 addrspace(5)* %aptr
187
188  ; Force %a to spill.
  ; The empty asm clobbers v0-v7 while %a is still live (it is stored below).
189  call void asm sideeffect "", "~{v0},~{v1},~{v2},~{v3},~{v4},~{v5},~{v6},~{v7}" ()
190
  ; Use %a after the clobber so that it has to survive across the asm.
191  %outptr = getelementptr i32, i32 addrspace(5)* %buf, i32 1
192  store volatile i32 %a, i32 addrspace(5)* %outptr
193
194  ret void
195}
196
197; GCN-LABEL: test_sgpr_offset_function
198define void @test_sgpr_offset_function() {
199entry:
200  ; Occupy 4096 bytes of scratch, so the offset of the spill of %a does not
201  ; fit in the instruction, and has to live in the SGPR offset.
202  %alloca = alloca i8, i32 4096, align 4, addrspace(5)
203  %buf = bitcast i8 addrspace(5)* %alloca to i32 addrspace(5)*
204
205  %aptr = getelementptr i32, i32 addrspace(5)* %buf, i32 1
206  ; 0x40000 / 64 = 4096 (for wave64)
  ; Both runs expect the offset to be added to s32 into a fresh SGPR (pinned to
  ; s4 and s0 respectively), leaving s32 itself unmodified. The flat-scratch
  ; value 0x1000 is 4096 unscaled.
207  ; MUBUF:   s_add_u32 s4, s32, 0x40000
208  ; MUBUF:   buffer_store_dword v{{[0-9]+}}, off, s[{{[0-9]+:[0-9]+}}], s4 ; 4-byte Folded Spill
209  ; FLATSCR: s_add_u32 s0, s32, 0x1000
210  ; FLATSCR: scratch_store_dword off, v{{[0-9]+}}, s0 ; 4-byte Folded Spill
211  %a = load volatile i32, i32 addrspace(5)* %aptr
212
213  ; Force %a to spill
  ; The empty asm clobbers v0-v7 while %a is still live (it is stored below).
214  call void asm sideeffect "", "~{v0},~{v1},~{v2},~{v3},~{v4},~{v5},~{v6},~{v7}" ()
215
  ; Use %a after the clobber so that it has to survive across the asm.
216  %outptr = getelementptr i32, i32 addrspace(5)* %buf, i32 1
217  store volatile i32 %a, i32 addrspace(5)* %outptr
218
219  ret void
220}
221
222; GCN-LABEL: test_sgpr_offset_subregs_function
223define void @test_sgpr_offset_subregs_function() {
224entry:
225  ; Occupy 4088 bytes of scratch, so that the spill of the last subreg of %a
226  ; still fits below offset 4096 (4088 + 8 - 4 = 4092), and can be placed in
227  ; the instruction offset field.
  ; NOTE(review): despite "sgpr_offset" in the name, the checks here expect
  ; immediate offsets, while test_inst_offset_subregs_function expects an SGPR
  ; offset; the two names look swapped -- confirm before renaming.
228  %alloca = alloca i8, i32 4088, align 4, addrspace(5)
229  %bufv1 = bitcast i8 addrspace(5)* %alloca to i32 addrspace(5)*
230  %bufv2 = bitcast i8 addrspace(5)* %alloca to <2 x i32> addrspace(5)*
231
  ; One 4-byte store is expected per 32-bit half of %a, at immediate offsets
  ; 4088 and 4092. The flat-scratch run additionally pins the base to s32.
232  ; MUBUF:   buffer_store_dword v{{[0-9]+}}, off, s[{{[0-9]+:[0-9]+}}], s{{[0-9]+}} offset:4088 ; 4-byte Folded Spill
233  ; MUBUF:   buffer_store_dword v{{[0-9]+}}, off, s[{{[0-9]+:[0-9]+}}], s{{[0-9]+}} offset:4092 ; 4-byte Folded Spill
234  ; FLATSCR: scratch_store_dword off, v{{[0-9]+}}, s32 offset:4088 ; 4-byte Folded Spill
235  ; FLATSCR: scratch_store_dword off, v{{[0-9]+}}, s32 offset:4092 ; 4-byte Folded Spill
236  %aptr = getelementptr <2 x i32>, <2 x i32> addrspace(5)* %bufv2, i32 1
237  %a = load volatile <2 x i32>, <2 x i32> addrspace(5)* %aptr
238
239  ; Force %a to spill.
  ; The empty asm clobbers v0-v7 while %a is still live (it is used below).
240  call void asm sideeffect "", "~{v0},~{v1},~{v2},~{v3},~{v4},~{v5},~{v6},~{v7}" ()
241
242  ; Ensure the alloca sticks around.
243  %bptr = getelementptr i32, i32 addrspace(5)* %bufv1, i32 1
244  %b = load volatile i32, i32 addrspace(5)* %bptr
245
246  ; Ensure the spill is of the full super-reg.
  ; The whole <2 x i32> value is passed to the asm as a single operand.
247  call void asm sideeffect "; $0", "r"(<2 x i32> %a)
248
249  ret void
250}
251
252; GCN-LABEL: test_inst_offset_subregs_function
253define void @test_inst_offset_subregs_function() {
254entry:
255  ; Occupy 4092 bytes of scratch, so that the spill of the last subreg of %a
256  ; does not fit below offset 4096 (4092 + 8 - 4 = 4096), and has to live
257  ; in the SGPR offset.
  ; NOTE(review): despite "inst_offset" in the name, the checks here expect an
  ; SGPR offset, while test_sgpr_offset_subregs_function expects immediate
  ; offsets; the two names look swapped -- confirm before renaming.
258  %alloca = alloca i8, i32 4092, align 4, addrspace(5)
259  %bufv1 = bitcast i8 addrspace(5)* %alloca to i32 addrspace(5)*
260  %bufv2 = bitcast i8 addrspace(5)* %alloca to <2 x i32> addrspace(5)*
261
262  ; 0x3ff00 / 64 = 4092 (for wave64)
  ; Both halves of %a are expected to share one SGPR formed by adding the
  ; offset to s32, with the second half at immediate offset 4. The flat-scratch
  ; value 0xffc is 4092 unscaled. The MUBUF run pins the SGPR to s4.
263  ; MUBUF: s_add_u32 s4, s32, 0x3ff00
264  ; MUBUF: buffer_store_dword v{{[0-9]+}}, off, s[{{[0-9]+:[0-9]+}}], s4 ; 4-byte Folded Spill
265  ; MUBUF: buffer_store_dword v{{[0-9]+}}, off, s[{{[0-9]+:[0-9]+}}], s4 offset:4 ; 4-byte Folded Spill
266  ; FLATSCR: s_add_u32 [[SOFF:s[0-9]+]], s32, 0xffc
267  ; FLATSCR: scratch_store_dword off, v{{[0-9]+}}, [[SOFF]]          ; 4-byte Folded Spill
268  ; FLATSCR: scratch_store_dword off, v{{[0-9]+}}, [[SOFF]] offset:4 ; 4-byte Folded Spill
269  %aptr = getelementptr <2 x i32>, <2 x i32> addrspace(5)* %bufv2, i32 1
270  %a = load volatile <2 x i32>, <2 x i32> addrspace(5)* %aptr
271
272  ; Force %a to spill.
  ; The empty asm clobbers v0-v7 while %a is still live (it is used below).
273  call void asm sideeffect "", "~{v0},~{v1},~{v2},~{v3},~{v4},~{v5},~{v6},~{v7}" ()
274
275  ; Ensure the alloca sticks around.
276  %bptr = getelementptr i32, i32 addrspace(5)* %bufv1, i32 1
277  %b = load volatile i32, i32 addrspace(5)* %bptr
278
279  ; Ensure the spill is of the full super-reg.
  ; The whole <2 x i32> value is passed to the asm as a single operand.
280  call void asm sideeffect "; $0", "r"(<2 x i32> %a)
281
282  ret void
283}
284
; #0 is attached to the VGPR-clobbering asm call in
; test_sgpr_offset_function_scavenge_fail.
285attributes #0 = { nounwind }
; #1 is not referenced by any function or call visible in this file.
286attributes #1 = { nounwind "amdgpu-num-sgpr"="17" "amdgpu-num-vgpr"="8" }
; #2 is attached to test_sgpr_offset_function_scavenge_fail; it sets
; "amdgpu-num-sgpr" to 14 and "amdgpu-num-vgpr" to 8.
287attributes #2 = { nounwind "amdgpu-num-sgpr"="14" "amdgpu-num-vgpr"="8" }
288