; RUN: llc -march=amdgcn -mcpu=gfx908 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX908,A2V %s
; RUN: llc -march=amdgcn -mcpu=gfx908 -amdgpu-spill-vgpr-to-agpr=0 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX908,A2M %s

; Checks the spill and reload code emitted around AGPR (a-register) values on
; gfx908 when each kernel is limited by an "amdgpu-num-vgpr" attribute.
;
; The file is compiled twice (see the two llc invocations above):
;   * default options; the checks specific to that run use the A2V prefix and
;     expect no SCRATCH_RSRC setup and a scratch size of 0.
;   * with -amdgpu-spill-vgpr-to-agpr=0; the checks specific to that run use
;     the A2M prefix and expect SCRATCH_RSRC setup plus a buffer_store_dword /
;     buffer_load_dword pair annotated "Folded Spill" / "Folded Reload".
; Checks under the GCN and GFX908 prefixes apply to both runs.

; Kernel using attribute group #0 (24 VGPRs). Chains two 16x16x1f32 MFMA
; calls and stores two elements of each result.
; GCN-LABEL: {{^}}max_24regs_32a_used:
; A2M-DAG: s_mov_b32 s{{[0-9]+}}, SCRATCH_RSRC_DWORD0
; A2M-DAG: s_mov_b32 s{{[0-9]+}}, SCRATCH_RSRC_DWORD1
; A2V-NOT: SCRATCH_RSRC
; GFX908-DAG: v_accvgpr_read_b32 v[[VSPILL:[0-9]+]], a0 ; Reload Reuse
; A2M: buffer_store_dword v[[VSPILL]], off, s[{{[0-9:]+}}], 0 offset:[[FI:[0-9]+]] ; 4-byte Folded Spill
; A2M: buffer_load_dword v[[VSPILL:[0-9]+]], off, s[{{[0-9:]+}}], 0 offset:[[FI]] ; 4-byte Folded Reload
; GFX908: v_accvgpr_write_b32 a{{[0-9]+}}, v[[VSPILL]] ; Reload Reuse
; A2V: ScratchSize: 0
define amdgpu_kernel void @max_24regs_32a_used(<16 x float> addrspace(1)* %arg, float addrspace(1)* %out) #0 {
bb:
  %in.1 = load <16 x float>, <16 x float> addrspace(1)* %arg
  %mai.1 = tail call <16 x float> @llvm.amdgcn.mfma.f32.16x16x1f32(float 1.0, float 1.0, <16 x float> %in.1, i32 0, i32 0, i32 0)
  %mai.2 = tail call <16 x float> @llvm.amdgcn.mfma.f32.16x16x1f32(float 1.0, float 1.0, <16 x float> %mai.1, i32 0, i32 0, i32 0)
  %elt1 = extractelement <16 x float> %mai.2, i32 0
  %elt2 = extractelement <16 x float> %mai.1, i32 15
  %elt3 = extractelement <16 x float> %mai.1, i32 14
  %elt4 = extractelement <16 x float> %mai.2, i32 1
  store float %elt1, float addrspace(1)* %out
  %gep1 = getelementptr float, float addrspace(1)* %out, i64 1
  store float %elt2, float addrspace(1)* %gep1
  %gep2 = getelementptr float, float addrspace(1)* %out, i64 2
  store float %elt3, float addrspace(1)* %gep2
  %gep3 = getelementptr float, float addrspace(1)* %out, i64 3
  store float %elt4, float addrspace(1)* %gep3

  ret void
}

; Kernel using attribute group #2 (12 VGPRs). Both 4x4x1f32 MFMA results are
; consumed by an "a"-constrained inline asm in block %st; block %use, taken
; when %cond is 0, first feeds five i32 constants to another "a"-constrained
; inline asm.
; Note: %gep1 and %gep2 in block %st have no users in this function.
; GCN-LABEL: {{^}}max_12regs_13a_used:
; A2M-DAG: s_mov_b32 s{{[0-9]+}}, SCRATCH_RSRC_DWORD0
; A2M-DAG: s_mov_b32 s{{[0-9]+}}, SCRATCH_RSRC_DWORD1
; A2V-NOT: SCRATCH_RSRC
; GFX908-DAG: v_accvgpr_read_b32 v[[VSPILL:[0-9]+]], a{{[0-9]+}} ; Reload Reuse
; A2M: buffer_store_dword v[[VSPILL]], off, s[{{[0-9:]+}}], 0 offset:[[FI:[0-9]+]] ; 4-byte Folded Spill
; A2M: buffer_load_dword v[[VSPILL:[0-9]+]], off, s[{{[0-9:]+}}], 0 offset:[[FI]] ; 4-byte Folded Reload
; A2V: v_accvgpr_write_b32 a{{[0-9]+}}, v[[VSPILL]] ; Reload Reuse
; A2V: ScratchSize: 0
define amdgpu_kernel void @max_12regs_13a_used(i32 %cond, <4 x float> addrspace(1)* %arg, <4 x float> addrspace(1)* %out) #2 {
bb:
  %in.1 = load <4 x float>, <4 x float> addrspace(1)* %arg
  %mai.1 = tail call <4 x float> @llvm.amdgcn.mfma.f32.4x4x1f32(float 1.0, float 1.0, <4 x float> %in.1, i32 0, i32 0, i32 0)
  %mai.2 = tail call <4 x float> @llvm.amdgcn.mfma.f32.4x4x1f32(float 1.0, float 1.0, <4 x float> %mai.1, i32 0, i32 0, i32 0)
  %cmp = icmp eq i32 %cond, 0
  br i1 %cmp, label %use, label %st

use:
  call void asm sideeffect "", "a,a,a,a,a"(i32 1, i32 2, i32 3, i32 4, i32 5)
  store volatile <4 x float> <float 1.0, float 1.0, float 1.0, float 1.0>, <4 x float> addrspace(1)* %out
  br label %st

st:
  %gep1 = getelementptr <4 x float>, <4 x float> addrspace(1)* %out, i64 16
  %gep2 = getelementptr <4 x float>, <4 x float> addrspace(1)* %out, i64 32
  call void asm sideeffect "", "a,a"(<4 x float> %mai.1, <4 x float> %mai.2)
  ret void
}

; Kernel using attribute group #1 (10 VGPRs). Four values are produced by
; "=a"-constrained inline asm (two <4 x i32>, one i32, one <2 x i32>) and then
; consumed by two "a"-constrained inline asm statements.
; GCN-LABEL: {{^}}max_10_vgprs_used_9a:
; A2M-DAG: s_mov_b32 s{{[0-9]+}}, SCRATCH_RSRC_DWORD0
; A2M-DAG: s_mov_b32 s{{[0-9]+}}, SCRATCH_RSRC_DWORD1
; A2V-NOT: SCRATCH_RSRC

; A2V: v_accvgpr_read_b32 v[[VSPILL:[0-9]+]], a{{[0-9]+}} ; Reload Reuse
; A2V: v_accvgpr_write_b32 a{{[0-9]+}}, v[[VSPILL]] ; Reload Reuse
; A2V: ScratchSize: 0

; A2M: buffer_store_dword v[[VSPILLSTORE:[0-9]+]], off, s[{{[0-9:]+}}], 0 offset:[[FI:[0-9]+]] ; 4-byte Folded Spill
; A2M: buffer_load_dword v[[VSPILL_RELOAD:[0-9]+]], off, s[{{[0-9:]+}}], 0 offset:[[FI]] ; 4-byte Folded Reload
; A2M: v_accvgpr_write_b32 a{{[0-9]+}}, v[[VSPILL_RELOAD]] ; Reload Reuse
define amdgpu_kernel void @max_10_vgprs_used_9a() #1 {
  %a1 = call <4 x i32> asm sideeffect "", "=a"()
  %a2 = call <4 x i32> asm sideeffect "", "=a"()
  %a3 = call i32 asm sideeffect "", "=a"()
  %a4 = call <2 x i32> asm sideeffect "", "=a"()
  call void asm sideeffect "", "a,a,a"(<4 x i32> %a1, <4 x i32> %a2, i32 %a3)
  call void asm sideeffect "", "a"(<2 x i32> %a4)
  ret void
}

; Kernel using attribute group #3 (32 VGPRs). An i32 produced by
; "=a"-constrained inline asm in the entry block is consumed by an
; "a"-constrained inline asm only after a 32x32x1f32 MFMA call in block %use.
; GCN-LABEL: {{^}}max_32regs_mfma32:
; A2M-DAG: s_mov_b32 s{{[0-9]+}}, SCRATCH_RSRC_DWORD0
; A2M-DAG: s_mov_b32 s{{[0-9]+}}, SCRATCH_RSRC_DWORD1
; A2V-NOT: SCRATCH_RSRC
; GFX908-DAG: v_accvgpr_read_b32 v[[VSPILL:[0-9]+]], a0 ; Reload Reuse
; A2M: buffer_store_dword v[[VSPILL]], off, s[{{[0-9:]+}}], 0 offset:[[FI:[0-9]+]] ; 4-byte Folded Spill
; A2M: buffer_load_dword v[[VSPILL:[0-9]+]], off, s[{{[0-9:]+}}], 0 offset:[[FI]] ; 4-byte Folded Reload
; GFX908: v_accvgpr_write_b32 a{{[0-9]+}}, v[[VSPILL]] ; Reload Reuse
; A2V: ScratchSize: 0
define amdgpu_kernel void @max_32regs_mfma32(float addrspace(1)* %arg) #3 {
bb:
  %v = call i32 asm sideeffect "", "=a"()
  br label %use

use:
  %mai.1 = tail call <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float 1.0, float 1.0, <32 x float> <float 1.0, float 2.0, float 3.0, float 4.0, float 5.0, float 6.0, float 7.0, float 8.0, float 9.0, float 10.0, float 11.0, float 12.0, float 13.0, float 14.0, float 15.0, float 16.0, float 17.0, float 18.0, float 19.0, float 20.0, float 21.0, float 22.0, float 23.0, float 24.0, float 25.0, float 26.0, float 27.0, float 28.0, float 29.0, float 30.0, float 31.0, float 2.0>, i32 0, i32 0, i32 0)
  call void asm sideeffect "", "a"(i32 %v)
  %elt1 = extractelement <32 x float> %mai.1, i32 0
  store float %elt1, float addrspace(1)* %arg
  ret void
}

; Intrinsic declarations. @llvm.amdgcn.workitem.id.x is declared but not
; called by any function visible in this file.
declare i32 @llvm.amdgcn.workitem.id.x()
declare <16 x float> @llvm.amdgcn.mfma.f32.16x16x1f32(float, float, <16 x float>, i32, i32, i32)
declare <4 x float> @llvm.amdgcn.mfma.f32.4x4x1f32(float, float, <4 x float>, i32, i32, i32)
declare <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float, float, <32 x float>, i32, i32, i32)

; Per-kernel VGPR limits referenced by the definitions above.
attributes #0 = { nounwind "amdgpu-num-vgpr"="24" }
attributes #1 = { nounwind "amdgpu-num-vgpr"="10" }
attributes #2 = { nounwind "amdgpu-num-vgpr"="12" }
attributes #3 = { nounwind "amdgpu-num-vgpr"="32" }