1; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -enable-misched=0 -post-RA-scheduler=0 -stress-regalloc=8 < %s | FileCheck %s
2
3; Test that the VGPR spiller correctly switches to SGPR offsets when the
4; instruction offset field would overflow, and that it accounts for memory
5; swizzling.
6
7; CHECK-LABEL: test_inst_offset_kernel
define amdgpu_kernel void @test_inst_offset_kernel() {
entry:
  ; Kernel variant, single dword, instruction-offset case.
  ;
  ; A 4088-byte private (addrspace 5) buffer is reserved so that the spill slot
  ; of %val ends up at byte offset 4092, which the FileCheck line below expects
  ; to be encoded directly in the instruction offset field.
  ; NOTE(review): the test describes this as 4092 bytes of occupied scratch;
  ; presumably 4 bytes are already in use ahead of the alloca -- confirm
  ; against the frame layout.
  %scratch = alloca i8, i32 4088, align 4, addrspace(5)
  %words = bitcast i8 addrspace(5)* %scratch to i32 addrspace(5)*

  ; Volatile load of the second dword of the buffer; this is the value that
  ; gets spilled.
  %src = getelementptr i32, i32 addrspace(5)* %words, i32 1
  ; CHECK: buffer_store_dword v{{[0-9]+}}, off, s[{{[0-9]+:[0-9]+}}], s{{[0-9]+}} offset:4092 ; 4-byte Folded Spill
  %val = load volatile i32, i32 addrspace(5)* %src

  ; Clobber v0-v7 so that %val has to be spilled.
  call void asm sideeffect "", "~{v0},~{v1},~{v2},~{v3},~{v4},~{v5},~{v6},~{v7}" ()

  ; Use %val after the clobber: volatile store back into the buffer.
  %dst = getelementptr i32, i32 addrspace(5)* %words, i32 1
  store volatile i32 %val, i32 addrspace(5)* %dst

  ret void
}
27
28; CHECK-LABEL: test_sgpr_offset_kernel
define amdgpu_kernel void @test_sgpr_offset_kernel() {
entry:
  ; Kernel variant, single dword, SGPR-offset case.
  ;
  ; A 4092-byte private (addrspace 5) buffer is reserved so that the spill slot
  ; of %val ends up at byte offset 4096. That no longer fits in the instruction
  ; offset field, so the offset is expected to be carried in the SGPR instead.
  ; NOTE(review): the test describes this as 4096 bytes of occupied scratch;
  ; presumably 4 bytes are already in use ahead of the alloca -- confirm
  ; against the frame layout.
  %scratch = alloca i8, i32 4092, align 4, addrspace(5)
  %words = bitcast i8 addrspace(5)* %scratch to i32 addrspace(5)*

  ; Volatile load of the second dword of the buffer; this is the value that
  ; gets spilled.
  %src = getelementptr i32, i32 addrspace(5)* %words, i32 1
  ; Expected sequence: bump s7 by the swizzled offset, store with no
  ; instruction offset, then restore s7.
  ; 0x40000 / 64 = 4096 (for wave64)
  ; CHECK: s_add_u32 s7, s7, 0x40000
  ; CHECK: buffer_store_dword v{{[0-9]+}}, off, s[{{[0-9]+:[0-9]+}}], s7 ; 4-byte Folded Spill
  ; CHECK: s_sub_u32 s7, s7, 0x40000
  %val = load volatile i32, i32 addrspace(5)* %src

  ; Clobber v0-v7 so that %val has to be spilled.
  call void asm sideeffect "", "~{v0},~{v1},~{v2},~{v3},~{v4},~{v5},~{v6},~{v7}" ()

  ; Use %val after the clobber: volatile store back into the buffer.
  %dst = getelementptr i32, i32 addrspace(5)* %words, i32 1
  store volatile i32 %val, i32 addrspace(5)* %dst

  ret void
}
51
52; CHECK-LABEL: test_sgpr_offset_subregs_kernel
define amdgpu_kernel void @test_sgpr_offset_subregs_kernel() {
entry:
  ; Kernel variant, two-dword value, both halves within instruction-offset
  ; range.
  ;
  ; A 4084-byte private (addrspace 5) buffer is reserved so that the 8-byte
  ; spill slot of %vec starts at byte offset 4088. Its last dword then sits at
  ; 4088 + 8 - 4 = 4092, still below 4096, so both stores are expected to use
  ; the instruction offset field.
  ; NOTE(review): the test describes this as 4088 bytes of occupied scratch;
  ; presumably 4 bytes are already in use ahead of the alloca -- confirm.
  ; NOTE(review): despite "sgpr_offset" in the function name, the FileCheck
  ; lines below expect instruction offsets; the name looks swapped with
  ; @test_inst_offset_subregs_kernel -- confirm before relying on it.
  %scratch = alloca i8, i32 4084, align 4, addrspace(5)
  %as_i32 = bitcast i8 addrspace(5)* %scratch to i32 addrspace(5)*
  %as_v2i32 = bitcast i8 addrspace(5)* %scratch to <2 x i32> addrspace(5)*

  ; CHECK: buffer_store_dword v{{[0-9]+}}, off, s[{{[0-9]+:[0-9]+}}], s{{[0-9]+}} offset:4088 ; 4-byte Folded Spill
  ; CHECK: buffer_store_dword v{{[0-9]+}}, off, s[{{[0-9]+:[0-9]+}}], s{{[0-9]+}} offset:4092 ; 4-byte Folded Spill
  ; Volatile load of the second <2 x i32> element of the buffer; this is the
  ; value that gets spilled.
  %src = getelementptr <2 x i32>, <2 x i32> addrspace(5)* %as_v2i32, i32 1
  %vec = load volatile <2 x i32>, <2 x i32> addrspace(5)* %src

  ; Clobber v0-v7 so that %vec has to be spilled.
  call void asm sideeffect "", "~{v0},~{v1},~{v2},~{v3},~{v4},~{v5},~{v6},~{v7}" ()

  ; A further volatile access to the buffer keeps the alloca from going away.
  %keep_ptr = getelementptr i32, i32 addrspace(5)* %as_i32, i32 1
  %keep = load volatile i32, i32 addrspace(5)* %keep_ptr

  ; Consume the whole vector at once so the spill covers the full super-reg.
  call void asm sideeffect "; $0", "r"(<2 x i32> %vec)

  ret void
}
79
80; CHECK-LABEL: test_inst_offset_subregs_kernel
define amdgpu_kernel void @test_inst_offset_subregs_kernel() {
entry:
  ; Kernel variant, two-dword value, last half out of instruction-offset range.
  ;
  ; A 4088-byte private (addrspace 5) buffer is reserved so that the 8-byte
  ; spill slot of %vec starts at byte offset 4092. Its last dword then sits at
  ; 4092 + 8 - 4 = 4096, which does not fit below 4096, so the base offset is
  ; expected to be carried in the SGPR instead.
  ; NOTE(review): the test describes this as 4092 bytes of occupied scratch;
  ; presumably 4 bytes are already in use ahead of the alloca -- confirm.
  ; NOTE(review): despite "inst_offset" in the function name, the FileCheck
  ; lines below expect the SGPR-offset form; the name looks swapped with
  ; @test_sgpr_offset_subregs_kernel -- confirm before relying on it.
  %scratch = alloca i8, i32 4088, align 4, addrspace(5)
  %as_i32 = bitcast i8 addrspace(5)* %scratch to i32 addrspace(5)*
  %as_v2i32 = bitcast i8 addrspace(5)* %scratch to <2 x i32> addrspace(5)*

  ; Expected sequence: bump s7 by the swizzled offset, store both dwords
  ; relative to it (instruction offsets 0 and 4), then restore s7.
  ; 0x3ff00 / 64 = 4092 (for wave64)
  ; CHECK: s_add_u32 s7, s7, 0x3ff00
  ; CHECK: buffer_store_dword v{{[0-9]+}}, off, s[{{[0-9]+:[0-9]+}}], s7 ; 4-byte Folded Spill
  ; CHECK: buffer_store_dword v{{[0-9]+}}, off, s[{{[0-9]+:[0-9]+}}], s7 offset:4 ; 4-byte Folded Spill
  ; CHECK: s_sub_u32 s7, s7, 0x3ff00
  ; Volatile load of the second <2 x i32> element of the buffer; this is the
  ; value that gets spilled.
  %src = getelementptr <2 x i32>, <2 x i32> addrspace(5)* %as_v2i32, i32 1
  %vec = load volatile <2 x i32>, <2 x i32> addrspace(5)* %src

  ; Clobber v0-v7 so that %vec has to be spilled.
  call void asm sideeffect "", "~{v0},~{v1},~{v2},~{v3},~{v4},~{v5},~{v6},~{v7}" ()

  ; A further volatile access to the buffer keeps the alloca from going away.
  %keep_ptr = getelementptr i32, i32 addrspace(5)* %as_i32, i32 1
  %keep = load volatile i32, i32 addrspace(5)* %keep_ptr

  ; Consume the whole vector at once so the spill covers the full super-reg.
  call void asm sideeffect "; $0", "r"(<2 x i32> %vec)

  ret void
}
110
111; CHECK-LABEL: test_inst_offset_function
define void @test_inst_offset_function() {
entry:
  ; Callable-function variant, single dword, instruction-offset case.
  ;
  ; A 4088-byte private (addrspace 5) buffer is reserved so that the spill slot
  ; of %val ends up at byte offset 4092, which the FileCheck line below expects
  ; to be encoded directly in the instruction offset field.
  ; NOTE(review): the test describes this as 4092 bytes of occupied scratch;
  ; presumably 4 bytes are already in use ahead of the alloca -- confirm
  ; against the frame layout.
  %scratch = alloca i8, i32 4088, align 4, addrspace(5)
  %words = bitcast i8 addrspace(5)* %scratch to i32 addrspace(5)*

  ; Volatile load of the second dword of the buffer; this is the value that
  ; gets spilled.
  %src = getelementptr i32, i32 addrspace(5)* %words, i32 1
  ; CHECK: buffer_store_dword v{{[0-9]+}}, off, s[{{[0-9]+:[0-9]+}}], s{{[0-9]+}} offset:4092 ; 4-byte Folded Spill
  %val = load volatile i32, i32 addrspace(5)* %src

  ; Clobber v0-v7 so that %val has to be spilled.
  call void asm sideeffect "", "~{v0},~{v1},~{v2},~{v3},~{v4},~{v5},~{v6},~{v7}" ()

  ; Use %val after the clobber: volatile store back into the buffer.
  %dst = getelementptr i32, i32 addrspace(5)* %words, i32 1
  store volatile i32 %val, i32 addrspace(5)* %dst

  ret void
}
131
132; CHECK-LABEL: test_sgpr_offset_function
define void @test_sgpr_offset_function() {
entry:
  ; Callable-function variant, single dword, SGPR-offset case.
  ;
  ; A 4092-byte private (addrspace 5) buffer is reserved so that the spill slot
  ; of %val ends up at byte offset 4096. That no longer fits in the instruction
  ; offset field, so the offset is expected to be carried in the SGPR instead.
  ; NOTE(review): the test describes this as 4096 bytes of occupied scratch;
  ; presumably 4 bytes are already in use ahead of the alloca -- confirm
  ; against the frame layout.
  %scratch = alloca i8, i32 4092, align 4, addrspace(5)
  %words = bitcast i8 addrspace(5)* %scratch to i32 addrspace(5)*

  ; Volatile load of the second dword of the buffer; this is the value that
  ; gets spilled.
  %src = getelementptr i32, i32 addrspace(5)* %words, i32 1
  ; Expected sequence: bump s5 by the swizzled offset, store with no
  ; instruction offset, then restore s5.
  ; 0x40000 / 64 = 4096 (for wave64)
  ; CHECK: s_add_u32 s5, s5, 0x40000
  ; CHECK: buffer_store_dword v{{[0-9]+}}, off, s[{{[0-9]+:[0-9]+}}], s5 ; 4-byte Folded Spill
  ; CHECK: s_sub_u32 s5, s5, 0x40000
  %val = load volatile i32, i32 addrspace(5)* %src

  ; Clobber v0-v7 so that %val has to be spilled.
  call void asm sideeffect "", "~{v0},~{v1},~{v2},~{v3},~{v4},~{v5},~{v6},~{v7}" ()

  ; Use %val after the clobber: volatile store back into the buffer.
  %dst = getelementptr i32, i32 addrspace(5)* %words, i32 1
  store volatile i32 %val, i32 addrspace(5)* %dst

  ret void
}
155
156; CHECK-LABEL: test_sgpr_offset_subregs_function
define void @test_sgpr_offset_subregs_function() {
entry:
  ; Callable-function variant, two-dword value, both halves within
  ; instruction-offset range.
  ;
  ; A 4084-byte private (addrspace 5) buffer is reserved so that the 8-byte
  ; spill slot of %vec starts at byte offset 4088. Its last dword then sits at
  ; 4088 + 8 - 4 = 4092, still below 4096, so both stores are expected to use
  ; the instruction offset field.
  ; NOTE(review): the test describes this as 4088 bytes of occupied scratch;
  ; presumably 4 bytes are already in use ahead of the alloca -- confirm.
  ; NOTE(review): despite "sgpr_offset" in the function name, the FileCheck
  ; lines below expect instruction offsets; the name looks swapped with
  ; @test_inst_offset_subregs_function -- confirm before relying on it.
  %scratch = alloca i8, i32 4084, align 4, addrspace(5)
  %as_i32 = bitcast i8 addrspace(5)* %scratch to i32 addrspace(5)*
  %as_v2i32 = bitcast i8 addrspace(5)* %scratch to <2 x i32> addrspace(5)*

  ; CHECK: buffer_store_dword v{{[0-9]+}}, off, s[{{[0-9]+:[0-9]+}}], s{{[0-9]+}} offset:4088 ; 4-byte Folded Spill
  ; CHECK: buffer_store_dword v{{[0-9]+}}, off, s[{{[0-9]+:[0-9]+}}], s{{[0-9]+}} offset:4092 ; 4-byte Folded Spill
  ; Volatile load of the second <2 x i32> element of the buffer; this is the
  ; value that gets spilled.
  %src = getelementptr <2 x i32>, <2 x i32> addrspace(5)* %as_v2i32, i32 1
  %vec = load volatile <2 x i32>, <2 x i32> addrspace(5)* %src

  ; Clobber v0-v7 so that %vec has to be spilled.
  call void asm sideeffect "", "~{v0},~{v1},~{v2},~{v3},~{v4},~{v5},~{v6},~{v7}" ()

  ; A further volatile access to the buffer keeps the alloca from going away.
  %keep_ptr = getelementptr i32, i32 addrspace(5)* %as_i32, i32 1
  %keep = load volatile i32, i32 addrspace(5)* %keep_ptr

  ; Consume the whole vector at once so the spill covers the full super-reg.
  call void asm sideeffect "; $0", "r"(<2 x i32> %vec)

  ret void
}
183
184; CHECK-LABEL: test_inst_offset_subregs_function
define void @test_inst_offset_subregs_function() {
entry:
  ; Callable-function variant, two-dword value, last half out of
  ; instruction-offset range.
  ;
  ; A 4088-byte private (addrspace 5) buffer is reserved so that the 8-byte
  ; spill slot of %vec starts at byte offset 4092. Its last dword then sits at
  ; 4092 + 8 - 4 = 4096, which does not fit below 4096, so the base offset is
  ; expected to be carried in the SGPR instead.
  ; NOTE(review): the test describes this as 4092 bytes of occupied scratch;
  ; presumably 4 bytes are already in use ahead of the alloca -- confirm.
  ; NOTE(review): despite "inst_offset" in the function name, the FileCheck
  ; lines below expect the SGPR-offset form; the name looks swapped with
  ; @test_sgpr_offset_subregs_function -- confirm before relying on it.
  %scratch = alloca i8, i32 4088, align 4, addrspace(5)
  %as_i32 = bitcast i8 addrspace(5)* %scratch to i32 addrspace(5)*
  %as_v2i32 = bitcast i8 addrspace(5)* %scratch to <2 x i32> addrspace(5)*

  ; Expected sequence: bump s5 by the swizzled offset, store both dwords
  ; relative to it (instruction offsets 0 and 4), then restore s5.
  ; 0x3ff00 / 64 = 4092 (for wave64)
  ; CHECK: s_add_u32 s5, s5, 0x3ff00
  ; CHECK: buffer_store_dword v{{[0-9]+}}, off, s[{{[0-9]+:[0-9]+}}], s5 ; 4-byte Folded Spill
  ; CHECK: buffer_store_dword v{{[0-9]+}}, off, s[{{[0-9]+:[0-9]+}}], s5 offset:4 ; 4-byte Folded Spill
  ; CHECK: s_sub_u32 s5, s5, 0x3ff00
  ; Volatile load of the second <2 x i32> element of the buffer; this is the
  ; value that gets spilled.
  %src = getelementptr <2 x i32>, <2 x i32> addrspace(5)* %as_v2i32, i32 1
  %vec = load volatile <2 x i32>, <2 x i32> addrspace(5)* %src

  ; Clobber v0-v7 so that %vec has to be spilled.
  call void asm sideeffect "", "~{v0},~{v1},~{v2},~{v3},~{v4},~{v5},~{v6},~{v7}" ()

  ; A further volatile access to the buffer keeps the alloca from going away.
  %keep_ptr = getelementptr i32, i32 addrspace(5)* %as_i32, i32 1
  %keep = load volatile i32, i32 addrspace(5)* %keep_ptr

  ; Consume the whole vector at once so the spill covers the full super-reg.
  call void asm sideeffect "; $0", "r"(<2 x i32> %vec)

  ret void
}
214