; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=fiji -verify-machineinstrs < %s | FileCheck -check-prefix=TOSGPR -check-prefix=ALL %s

; FIXME: Vectorization can increase the required SGPR count beyond the limit.

; ALL-LABEL: {{^}}max_10_sgprs:

; ALL: SGPRBlocks: 1
; ALL: NumSGPRsForWavesPerEU: 10
define amdgpu_kernel void @max_10_sgprs() #0 {
  %one = load volatile i32, i32 addrspace(4)* undef
  %two = load volatile i32, i32 addrspace(4)* undef
  %three = load volatile i32, i32 addrspace(4)* undef
  %four = load volatile i32, i32 addrspace(4)* undef
  %five = load volatile i32, i32 addrspace(4)* undef
  %six = load volatile i32, i32 addrspace(4)* undef
  %seven = load volatile i32, i32 addrspace(4)* undef
  %eight = load volatile i32, i32 addrspace(4)* undef
  %nine = load volatile i32, i32 addrspace(4)* undef
  %ten = load volatile i32, i32 addrspace(4)* undef
  %eleven = load volatile i32, i32 addrspace(4)* undef
  call void asm sideeffect "", "s,s,s,s,s,s,s,s,s,s"(i32 %one, i32 %two, i32 %three, i32 %four, i32 %five, i32 %six, i32 %seven, i32 %eight, i32 %nine, i32 %ten)
  store volatile i32 %one, i32 addrspace(1)* undef
  store volatile i32 %two, i32 addrspace(1)* undef
  store volatile i32 %three, i32 addrspace(1)* undef
  store volatile i32 %four, i32 addrspace(1)* undef
  store volatile i32 %five, i32 addrspace(1)* undef
  store volatile i32 %six, i32 addrspace(1)* undef
  store volatile i32 %seven, i32 addrspace(1)* undef
  store volatile i32 %eight, i32 addrspace(1)* undef
  store volatile i32 %nine, i32 addrspace(1)* undef
  store volatile i32 %ten, i32 addrspace(1)* undef
  store volatile i32 %eleven, i32 addrspace(1)* undef
  ret void
}

; private resource: 4
; scratch wave offset: 1
; workgroup ids: 3
; dispatch id: 2
; queue ptr: 2
; flat scratch init: 2
; ---------------------
; total: 14

; + reserved vcc = 16

; Because we can't handle re-using the last few input registers for the
; special registers such as vcc (nor decide to drop the unused features
; once the number of registers is frozen), this ends up using more
; registers than expected.

; XALL-LABEL: {{^}}max_12_sgprs_14_input_sgprs:
; XTOSGPR: SGPRBlocks: 1
; XTOSGPR: NumSGPRsForWavesPerEU: 16

; This test case is disabled: when calculating the spill slot addresses,
; AMDGPU creates an extra vreg to save/restore m0, which at a point of
; maximum register pressure would trigger an endless loop; in practice the
; compiler aborts earlier with "Incomplete scavenging after 2nd pass".
;define amdgpu_kernel void @max_12_sgprs_14_input_sgprs(i32 addrspace(1)* %out1,
;                                                       i32 addrspace(1)* %out2,
;                                                       i32 addrspace(1)* %out3,
;                                                       i32 addrspace(1)* %out4,
;                                                       i32 %one, i32 %two, i32 %three, i32 %four) #2 {
;  %x.0 = call i32 @llvm.amdgcn.workgroup.id.x()
;  %x.1 = call i32 @llvm.amdgcn.workgroup.id.y()
;  %x.2 = call i32 @llvm.amdgcn.workgroup.id.z()
;  %x.3 = call i64 @llvm.amdgcn.dispatch.id()
;  %x.4 = call i8 addrspace(4)* @llvm.amdgcn.dispatch.ptr()
;  %x.5 = call i8 addrspace(4)* @llvm.amdgcn.queue.ptr()
;  store volatile i32 0, i32* undef
;  br label %stores
;
;stores:
;  store volatile i32 %x.0, i32 addrspace(1)* undef
;  store volatile i32 %x.1, i32 addrspace(1)* undef
;  store volatile i32 %x.2, i32 addrspace(1)* undef
;  store volatile i64 %x.3, i64 addrspace(1)* undef
;  store volatile i8 addrspace(4)* %x.4, i8 addrspace(4)* addrspace(1)* undef
;  store volatile i8 addrspace(4)* %x.5, i8 addrspace(4)* addrspace(1)* undef
;
;  store i32 %one, i32 addrspace(1)* %out1
;  store i32 %two, i32 addrspace(1)* %out2
;  store i32 %three, i32 addrspace(1)* %out3
;  store i32 %four, i32 addrspace(1)* %out4
;  ret void
;}

; The following test is commented out for now; see http://llvm.org/PR31230.
; XALL-LABEL: max_12_sgprs_12_input_sgprs{{$}}
; ; Make sure copies for the input buffer are not clobbered. This requires
; ; copying the registers in the opposite order from the usual one.

; XALL: SGPRBlocks: 2
; XALL: NumSGPRsForWavesPerEU: 18
;define amdgpu_kernel void @max_12_sgprs_12_input_sgprs(i32 addrspace(1)* %out1,
;                                                       i32 addrspace(1)* %out2,
;                                                       i32 addrspace(1)* %out3,
;                                                       i32 addrspace(1)* %out4,
;                                                       i32 %one, i32 %two, i32 %three, i32 %four) #2 {
;  store volatile i32 0, i32* undef
;  %x.0 = call i32 @llvm.amdgcn.workgroup.id.x()
;  store volatile i32 %x.0, i32 addrspace(1)* undef
;  %x.1 = call i32 @llvm.amdgcn.workgroup.id.y()
;  store volatile i32 %x.1, i32 addrspace(1)* undef
;  %x.2 = call i32 @llvm.amdgcn.workgroup.id.z()
;  store volatile i32 %x.2, i32 addrspace(1)* undef
;  %x.3 = call i64 @llvm.amdgcn.dispatch.id()
;  store volatile i64 %x.3, i64 addrspace(1)* undef
;  %x.4 = call i8 addrspace(4)* @llvm.amdgcn.dispatch.ptr()
;  store volatile i8 addrspace(4)* %x.4, i8 addrspace(4)* addrspace(1)* undef
;
;  store i32 %one, i32 addrspace(1)* %out1
;  store i32 %two, i32 addrspace(1)* %out2
;  store i32 %three, i32 addrspace(1)* %out3
;  store i32 %four, i32 addrspace(1)* %out4
;  ret void
;}

declare i32 @llvm.amdgcn.workgroup.id.x() #1
declare i32 @llvm.amdgcn.workgroup.id.y() #1
declare i32 @llvm.amdgcn.workgroup.id.z() #1
declare i64 @llvm.amdgcn.dispatch.id() #1
declare i8 addrspace(4)* @llvm.amdgcn.dispatch.ptr() #1
declare i8 addrspace(4)* @llvm.amdgcn.queue.ptr() #1

attributes #0 = { nounwind "amdgpu-num-sgpr"="14" }
attributes #1 = { nounwind readnone }
attributes #2 = { nounwind "amdgpu-num-sgpr"="12" }
attributes #3 = { nounwind "amdgpu-num-sgpr"="11" }
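
; A worked version of the SGPR accounting above (informational only, not a
; FileCheck input; the per-feature counts come from the comment block after
; max_10_sgprs):
;   4 (private resource) + 1 (scratch wave offset) + 3 (workgroup ids)
;     + 2 (dispatch id) + 2 (queue ptr) + 2 (flat scratch init) = 14
;   14 + 2 (reserved vcc, i.e. vcc_lo/vcc_hi) = 16, matching the disabled
;   NumSGPRsForWavesPerEU: 16 check.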