; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -enable-misched=0 -post-RA-scheduler=0 -stress-regalloc=8 < %s | FileCheck -check-prefixes=GCN,MUBUF %s
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -enable-misched=0 -post-RA-scheduler=0 -stress-regalloc=8 -amdgpu-enable-flat-scratch < %s | FileCheck -check-prefixes=GCN,FLATSCR %s

; Test that the VGPR spiller correctly switches to SGPR offsets when the
; instruction offset field would overflow, and that it accounts for memory
; swizzling.

; GCN-LABEL: test_inst_offset_kernel
define amdgpu_kernel void @test_inst_offset_kernel() {
entry:
  ; Occupy 4092 bytes of scratch, so the offset of the spill of %a just fits in
  ; the instruction offset field.
  %alloca = alloca i8, i32 4088, align 4, addrspace(5)
  %buf = bitcast i8 addrspace(5)* %alloca to i32 addrspace(5)*

  %aptr = getelementptr i32, i32 addrspace(5)* %buf, i32 1
  ; MUBUF: buffer_store_dword v{{[0-9]+}}, off, s[{{[0-9]+:[0-9]+}}], 0 offset:4092 ; 4-byte Folded Spill
  ; FLATSCR: scratch_store_dword off, v{{[0-9]+}}, s{{[0-9]+}} ; 4-byte Folded Spill
  %a = load volatile i32, i32 addrspace(5)* %aptr

  ; Force %a to spill.
  call void asm sideeffect "", "~{v0},~{v1},~{v2},~{v3},~{v4},~{v5},~{v6},~{v7}" ()

  %outptr = getelementptr i32, i32 addrspace(5)* %buf, i32 1
  store volatile i32 %a, i32 addrspace(5)* %outptr

  ret void
}

; GCN-LABEL: test_sgpr_offset_kernel
define amdgpu_kernel void @test_sgpr_offset_kernel() {
entry:
  ; Occupy 4096 bytes of scratch, so the offset of the spill of %a does not
  ; fit in the instruction, and has to live in the SGPR offset.
  %alloca = alloca i8, i32 4092, align 4, addrspace(5)
  %buf = bitcast i8 addrspace(5)* %alloca to i32 addrspace(5)*

  %aptr = getelementptr i32, i32 addrspace(5)* %buf, i32 1
  ; 0x40000 / 64 = 4096 (for wave64)
  ; MUBUF: s_mov_b32 s6, 0x40000
  ; MUBUF: buffer_store_dword v{{[0-9]+}}, off, s[{{[0-9]+:[0-9]+}}], s6 ; 4-byte Folded Spill
  ; FLATSCR: s_movk_i32 s2, 0x1000
  ; FLATSCR: scratch_store_dword off, v{{[0-9]+}}, s2 ; 4-byte Folded Spill
  %a = load volatile i32, i32 addrspace(5)* %aptr

  ; Force %a to spill
  call void asm sideeffect "", "~{v0},~{v1},~{v2},~{v3},~{v4},~{v5},~{v6},~{v7}" ()

  %outptr = getelementptr i32, i32 addrspace(5)* %buf, i32 1
  store volatile i32 %a, i32 addrspace(5)* %outptr

  ret void
}

; FIXME: If we fail to scavenge an SGPR in a kernel we don't have a stack
; pointer to temporarily update, so we just crash.

; GCN-LABEL: test_sgpr_offset_function_scavenge_fail
define void @test_sgpr_offset_function_scavenge_fail() #2 {
entry:
  ; Occupy 4096 bytes of scratch, so the offset of the spill of %a does not
  ; fit in the instruction, and has to live in the SGPR offset.
  %alloca = alloca i8, i32 4096, align 4, addrspace(5)
  %buf = bitcast i8 addrspace(5)* %alloca to i32 addrspace(5)*

  %aptr = getelementptr i32, i32 addrspace(5)* %buf, i32 1

  %asm.0 = call { i32, i32, i32, i32, i32, i32, i32, i32 } asm sideeffect "", "=s,=s,=s,=s,=s,=s,=s,=s"()
  %asm0.0 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32 } %asm.0, 0
  %asm1.0 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32 } %asm.0, 1
  %asm2.0 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32 } %asm.0, 2
  %asm3.0 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32 } %asm.0, 3
  %asm4.0 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32 } %asm.0, 4
  %asm5.0 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32 } %asm.0, 5
  %asm6.0 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32 } %asm.0, 6
  %asm7.0 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32 } %asm.0, 7

  ; 0x40000 / 64 = 4096 (for wave64)
  %a = load volatile i32, i32 addrspace(5)* %aptr

  ; MUBUF: s_add_u32 s32, s32, 0x40000
  ; MUBUF: buffer_store_dword v{{[0-9]+}}, off, s[{{[0-9]+:[0-9]+}}], s32 ; 4-byte Folded Spill
  ; MUBUF: s_sub_u32 s32, s32, 0x40000
  ; FLATSCR: s_add_u32 [[SOFF:s[0-9]+]], s32, 0x1000
  ; FLATSCR: scratch_store_dword off, v{{[0-9]+}}, [[SOFF]] ; 4-byte Folded Spill
  call void asm sideeffect "", "s,s,s,s,s,s,s,s,v"(i32 %asm0.0, i32 %asm1.0, i32 %asm2.0, i32 %asm3.0, i32 %asm4.0, i32 %asm5.0, i32 %asm6.0, i32 %asm7.0, i32 %a)

  %asm = call { i32, i32, i32, i32, i32, i32, i32, i32 } asm sideeffect "", "=s,=s,=s,=s,=s,=s,=s,=s"()
  %asm0 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32 } %asm, 0
  %asm1 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32 } %asm, 1
  %asm2 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32 } %asm, 2
  %asm3 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32 } %asm, 3
  %asm4 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32 } %asm, 4
  %asm5 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32 } %asm, 5
  %asm6 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32 } %asm, 6
  %asm7 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32 } %asm, 7

  call void asm sideeffect "", "~{v0},~{v1},~{v2},~{v3},~{v4},~{v5},~{v6},~{v7}"() #0

  ; MUBUF: s_add_u32 s32, s32, 0x40000
  ; MUBUF: buffer_load_dword v{{[0-9]+}}, off, s[{{[0-9]+:[0-9]+}}], s32 ; 4-byte Folded Reload
  ; MUBUF: s_sub_u32 s32, s32, 0x40000
  ; FLATSCR: s_add_u32 [[SOFF:s[0-9]+]], s32, 0x1000
  ; FLATSCR: scratch_load_dword v{{[0-9]+}}, off, [[SOFF]] ; 4-byte Folded Reload

  ; Force %a to spill with no free SGPRs
  call void asm sideeffect "", "s,s,s,s,s,s,s,s,v"(i32 %asm0, i32 %asm1, i32 %asm2, i32 %asm3, i32 %asm4, i32 %asm5, i32 %asm6, i32 %asm7, i32 %a)
  ret void
}

; GCN-LABEL: test_sgpr_offset_subregs_kernel
define amdgpu_kernel void @test_sgpr_offset_subregs_kernel() {
entry:
  ; Occupy 4088 bytes of scratch, so that the spill of the last subreg of %a
  ; still fits below offset 4096 (4088 + 8 - 4 = 4092), and can be placed in
  ; the instruction offset field.
  %alloca = alloca i8, i32 4084, align 4, addrspace(5)
  %bufv1 = bitcast i8 addrspace(5)* %alloca to i32 addrspace(5)*
  %bufv2 = bitcast i8 addrspace(5)* %alloca to <2 x i32> addrspace(5)*

  ; MUBUF: buffer_store_dword v{{[0-9]+}}, off, s[{{[0-9]+:[0-9]+}}], 0 offset:4088 ; 4-byte Folded Spill
  ; MUBUF: buffer_store_dword v{{[0-9]+}}, off, s[{{[0-9]+:[0-9]+}}], 0 offset:4092 ; 4-byte Folded Spill
  ; FLATSCR: s_movk_i32 [[SOFF:s[0-9]+]], 0xff8
  ; FLATSCR: scratch_store_dword off, v{{[0-9]+}}, [[SOFF]] ; 4-byte Folded Spill
  ; FLATSCR: scratch_store_dword off, v{{[0-9]+}}, [[SOFF]] offset:4 ; 4-byte Folded Spill
  %aptr = getelementptr <2 x i32>, <2 x i32> addrspace(5)* %bufv2, i32 1
  %a = load volatile <2 x i32>, <2 x i32> addrspace(5)* %aptr

  ; Force %a to spill.
  call void asm sideeffect "", "~{v0},~{v1},~{v2},~{v3},~{v4},~{v5},~{v6},~{v7}" ()

  ; Ensure the alloca sticks around.
  %bptr = getelementptr i32, i32 addrspace(5)* %bufv1, i32 1
  %b = load volatile i32, i32 addrspace(5)* %bptr

  ; Ensure the spill is of the full super-reg.
  call void asm sideeffect "; $0", "r"(<2 x i32> %a)

  ret void
}

; GCN-LABEL: test_inst_offset_subregs_kernel
define amdgpu_kernel void @test_inst_offset_subregs_kernel() {
entry:
  ; Occupy 4092 bytes of scratch, so that the spill of the last subreg of %a
  ; does not fit below offset 4096 (4092 + 8 - 4 = 4096), and has to live
  ; in the SGPR offset.
  %alloca = alloca i8, i32 4088, align 4, addrspace(5)
  %bufv1 = bitcast i8 addrspace(5)* %alloca to i32 addrspace(5)*
  %bufv2 = bitcast i8 addrspace(5)* %alloca to <2 x i32> addrspace(5)*

  ; 0x3ff00 / 64 = 4092 (for wave64)
  ; MUBUF: s_mov_b32 s6, 0x3ff00
  ; MUBUF: buffer_store_dword v{{[0-9]+}}, off, s[{{[0-9]+:[0-9]+}}], s6 ; 4-byte Folded Spill
  ; MUBUF: buffer_store_dword v{{[0-9]+}}, off, s[{{[0-9]+:[0-9]+}}], s6 offset:4 ; 4-byte Folded Spill
  ; FLATSCR: s_movk_i32 [[SOFF:s[0-9]+]], 0xffc
  ; FLATSCR: scratch_store_dword off, v{{[0-9]+}}, [[SOFF]] ; 4-byte Folded Spill
  ; FLATSCR: scratch_store_dword off, v{{[0-9]+}}, [[SOFF]] offset:4 ; 4-byte Folded Spill
  %aptr = getelementptr <2 x i32>, <2 x i32> addrspace(5)* %bufv2, i32 1
  %a = load volatile <2 x i32>, <2 x i32> addrspace(5)* %aptr

  ; Force %a to spill.
  call void asm sideeffect "", "~{v0},~{v1},~{v2},~{v3},~{v4},~{v5},~{v6},~{v7}" ()

  ; Ensure the alloca sticks around.
  %bptr = getelementptr i32, i32 addrspace(5)* %bufv1, i32 1
  %b = load volatile i32, i32 addrspace(5)* %bptr

  ; Ensure the spill is of the full super-reg.
  call void asm sideeffect "; $0", "r"(<2 x i32> %a)

  ret void
}

; GCN-LABEL: test_inst_offset_function
define void @test_inst_offset_function() {
entry:
  ; Occupy 4092 bytes of scratch, so the offset of the spill of %a just fits in
  ; the instruction offset field.
  %alloca = alloca i8, i32 4092, align 4, addrspace(5)
  %buf = bitcast i8 addrspace(5)* %alloca to i32 addrspace(5)*

  %aptr = getelementptr i32, i32 addrspace(5)* %buf, i32 1
  ; MUBUF: buffer_store_dword v{{[0-9]+}}, off, s[{{[0-9]+:[0-9]+}}], s{{[0-9]+}} offset:4092 ; 4-byte Folded Spill
  ; FLATSCR: scratch_store_dword off, v{{[0-9]+}}, s{{[0-9]+}} offset:4092 ; 4-byte Folded Spill
  %a = load volatile i32, i32 addrspace(5)* %aptr

  ; Force %a to spill.
  call void asm sideeffect "", "~{v0},~{v1},~{v2},~{v3},~{v4},~{v5},~{v6},~{v7}" ()

  %outptr = getelementptr i32, i32 addrspace(5)* %buf, i32 1
  store volatile i32 %a, i32 addrspace(5)* %outptr

  ret void
}

; GCN-LABEL: test_sgpr_offset_function
define void @test_sgpr_offset_function() {
entry:
  ; Occupy 4096 bytes of scratch, so the offset of the spill of %a does not
  ; fit in the instruction, and has to live in the SGPR offset.
  %alloca = alloca i8, i32 4096, align 4, addrspace(5)
  %buf = bitcast i8 addrspace(5)* %alloca to i32 addrspace(5)*

  %aptr = getelementptr i32, i32 addrspace(5)* %buf, i32 1
  ; 0x40000 / 64 = 4096 (for wave64)
  ; MUBUF: s_add_u32 s4, s32, 0x40000
  ; MUBUF: buffer_store_dword v{{[0-9]+}}, off, s[{{[0-9]+:[0-9]+}}], s4 ; 4-byte Folded Spill
  ; FLATSCR: s_add_u32 s0, s32, 0x1000
  ; FLATSCR: scratch_store_dword off, v{{[0-9]+}}, s0 ; 4-byte Folded Spill
  %a = load volatile i32, i32 addrspace(5)* %aptr

  ; Force %a to spill
  call void asm sideeffect "", "~{v0},~{v1},~{v2},~{v3},~{v4},~{v5},~{v6},~{v7}" ()

  %outptr = getelementptr i32, i32 addrspace(5)* %buf, i32 1
  store volatile i32 %a, i32 addrspace(5)* %outptr

  ret void
}

; GCN-LABEL: test_sgpr_offset_subregs_function
define void @test_sgpr_offset_subregs_function() {
entry:
  ; Occupy 4088 bytes of scratch, so that the spill of the last subreg of %a
  ; still fits below offset 4096 (4088 + 8 - 4 = 4092), and can be placed in
  ; the instruction offset field.
  %alloca = alloca i8, i32 4088, align 4, addrspace(5)
  %bufv1 = bitcast i8 addrspace(5)* %alloca to i32 addrspace(5)*
  %bufv2 = bitcast i8 addrspace(5)* %alloca to <2 x i32> addrspace(5)*

  ; MUBUF: buffer_store_dword v{{[0-9]+}}, off, s[{{[0-9]+:[0-9]+}}], s{{[0-9]+}} offset:4088 ; 4-byte Folded Spill
  ; MUBUF: buffer_store_dword v{{[0-9]+}}, off, s[{{[0-9]+:[0-9]+}}], s{{[0-9]+}} offset:4092 ; 4-byte Folded Spill
  ; FLATSCR: scratch_store_dword off, v{{[0-9]+}}, s32 offset:4088 ; 4-byte Folded Spill
  ; FLATSCR: scratch_store_dword off, v{{[0-9]+}}, s32 offset:4092 ; 4-byte Folded Spill
  %aptr = getelementptr <2 x i32>, <2 x i32> addrspace(5)* %bufv2, i32 1
  %a = load volatile <2 x i32>, <2 x i32> addrspace(5)* %aptr

  ; Force %a to spill.
  call void asm sideeffect "", "~{v0},~{v1},~{v2},~{v3},~{v4},~{v5},~{v6},~{v7}" ()

  ; Ensure the alloca sticks around.
  %bptr = getelementptr i32, i32 addrspace(5)* %bufv1, i32 1
  %b = load volatile i32, i32 addrspace(5)* %bptr

  ; Ensure the spill is of the full super-reg.
  call void asm sideeffect "; $0", "r"(<2 x i32> %a)

  ret void
}

; GCN-LABEL: test_inst_offset_subregs_function
define void @test_inst_offset_subregs_function() {
entry:
  ; Occupy 4092 bytes of scratch, so that the spill of the last subreg of %a
  ; does not fit below offset 4096 (4092 + 8 - 4 = 4096), and has to live
  ; in the SGPR offset.
  %alloca = alloca i8, i32 4092, align 4, addrspace(5)
  %bufv1 = bitcast i8 addrspace(5)* %alloca to i32 addrspace(5)*
  %bufv2 = bitcast i8 addrspace(5)* %alloca to <2 x i32> addrspace(5)*

  ; 0x3ff00 / 64 = 4092 (for wave64)
  ; MUBUF: s_add_u32 s4, s32, 0x3ff00
  ; MUBUF: buffer_store_dword v{{[0-9]+}}, off, s[{{[0-9]+:[0-9]+}}], s4 ; 4-byte Folded Spill
  ; MUBUF: buffer_store_dword v{{[0-9]+}}, off, s[{{[0-9]+:[0-9]+}}], s4 offset:4 ; 4-byte Folded Spill
  ; FLATSCR: s_add_u32 [[SOFF:s[0-9]+]], s32, 0xffc
  ; FLATSCR: scratch_store_dword off, v{{[0-9]+}}, [[SOFF]] ; 4-byte Folded Spill
  ; FLATSCR: scratch_store_dword off, v{{[0-9]+}}, [[SOFF]] offset:4 ; 4-byte Folded Spill
  %aptr = getelementptr <2 x i32>, <2 x i32> addrspace(5)* %bufv2, i32 1
  %a = load volatile <2 x i32>, <2 x i32> addrspace(5)* %aptr

  ; Force %a to spill.
  call void asm sideeffect "", "~{v0},~{v1},~{v2},~{v3},~{v4},~{v5},~{v6},~{v7}" ()

  ; Ensure the alloca sticks around.
  %bptr = getelementptr i32, i32 addrspace(5)* %bufv1, i32 1
  %b = load volatile i32, i32 addrspace(5)* %bptr

  ; Ensure the spill is of the full super-reg.
  call void asm sideeffect "; $0", "r"(<2 x i32> %a)

  ret void
}

attributes #0 = { nounwind }
attributes #1 = { nounwind "amdgpu-num-sgpr"="17" "amdgpu-num-vgpr"="8" }
attributes #2 = { nounwind "amdgpu-num-sgpr"="14" "amdgpu-num-vgpr"="8" }