1; RUN: llc -march=amdgcn -mcpu=gfx908 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX908,A2V %s
2; RUN: llc -march=amdgcn -mcpu=gfx908 -amdgpu-spill-vgpr-to-agpr=0 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX908,A2M %s
3
; Test case: a kernel using attribute group #0 ("amdgpu-num-vgpr"="24") that
; chains two 16x16x1f32 MFMA calls and stores elements taken from both results.
; What the check lines below encode:
;   * under the A2M prefix (the llc run with -amdgpu-spill-vgpr-to-agpr=0) the
;     SCRATCH_RSRC dwords are materialized, and the VGPR that received a0 is
;     stored to and reloaded from the same scratch offset before being written
;     back to an AGPR;
;   * under the A2V prefix (the llc run without that flag) SCRATCH_RSRC must
;     not appear in the checked region, and the reported scratch size is 0.
; NOTE(review): the "32a_used" part of the name presumably refers to the
; expected AGPR count; no check line here verifies it -- confirm.
4; GCN-LABEL: {{^}}max_24regs_32a_used:
5; A2M-DAG:    s_mov_b32 s{{[0-9]+}}, SCRATCH_RSRC_DWORD0
6; A2M-DAG:    s_mov_b32 s{{[0-9]+}}, SCRATCH_RSRC_DWORD1
7; A2V-NOT:    SCRATCH_RSRC
8; GFX908-DAG: v_accvgpr_read_b32 v[[VSPILL:[0-9]+]], a0 ; Reload Reuse
9; A2M:        buffer_store_dword v[[VSPILL]], off, s[{{[0-9:]+}}], 0 offset:[[FI:[0-9]+]] ; 4-byte Folded Spill
10; A2M:        buffer_load_dword v[[VSPILL:[0-9]+]], off, s[{{[0-9:]+}}], 0 offset:[[FI]] ; 4-byte Folded Reload
11; GFX908:     v_accvgpr_write_b32 a{{[0-9]+}}, v[[VSPILL]] ; Reload Reuse
12; A2V:        ScratchSize: 0
13define amdgpu_kernel void @max_24regs_32a_used(<16 x float> addrspace(1)* %arg, float addrspace(1)* %out) #0 {
14bb:
15  %in.1 = load <16 x float>, <16 x float> addrspace(1)* %arg
  ; The second MFMA call takes the first call's result as its vector operand.
16  %mai.1 = tail call <16 x float> @llvm.amdgcn.mfma.f32.16x16x1f32(float 1.0, float 1.0, <16 x float> %in.1, i32 0, i32 0, i32 0)
17  %mai.2 = tail call <16 x float> @llvm.amdgcn.mfma.f32.16x16x1f32(float 1.0, float 1.0, <16 x float> %mai.1, i32 0, i32 0, i32 0)
  ; Elements are extracted from both %mai.1 (lanes 15, 14) and %mai.2
  ; (lanes 0, 1), so both results are still used after the second MFMA call.
18  %elt1 = extractelement <16 x float> %mai.2, i32 0
19  %elt2 = extractelement <16 x float> %mai.1, i32 15
20  %elt3 = extractelement <16 x float> %mai.1, i32 14
21  %elt4 = extractelement <16 x float> %mai.2, i32 1
  ; The four extracted floats are written to %out[0..3].
22  store float %elt1, float addrspace(1)* %out
23  %gep1 = getelementptr float, float addrspace(1)* %out, i64 1
24  store float %elt2, float addrspace(1)* %gep1
25  %gep2 = getelementptr float, float addrspace(1)* %out, i64 2
26  store float %elt3, float addrspace(1)* %gep2
27  %gep3 = getelementptr float, float addrspace(1)* %out, i64 3
28  store float %elt4, float addrspace(1)* %gep3
29
30  ret void
31}
32
; Test case: a kernel using attribute group #2 ("amdgpu-num-vgpr"="12") with
; two chained 4x4x1f32 MFMA calls whose results are consumed by inline asm
; only in the final block, after an optional block that passes five i32
; constants to inline asm through "a" constraints.
; What the check lines below encode:
;   * under the A2M prefix (the llc run with -amdgpu-spill-vgpr-to-agpr=0) the
;     SCRATCH_RSRC dwords are materialized and the VGPR filled by
;     v_accvgpr_read_b32 is stored to and reloaded from one scratch offset;
;   * under the A2V prefix (the llc run without that flag) SCRATCH_RSRC must
;     not appear in the checked region, the same VGPR is written back with
;     v_accvgpr_write_b32, and the reported scratch size is 0.
; NOTE(review): "13a_used" in the name presumably refers to the expected AGPR
; count; no check line here verifies it -- confirm.
33; GCN-LABEL: {{^}}max_12regs_13a_used:
34; A2M-DAG:    s_mov_b32 s{{[0-9]+}}, SCRATCH_RSRC_DWORD0
35; A2M-DAG:    s_mov_b32 s{{[0-9]+}}, SCRATCH_RSRC_DWORD1
36; A2V-NOT:    SCRATCH_RSRC
37; GFX908-DAG: v_accvgpr_read_b32 v[[VSPILL:[0-9]+]], a{{[0-9]+}} ; Reload Reuse
38; A2M:        buffer_store_dword v[[VSPILL]], off, s[{{[0-9:]+}}], 0 offset:[[FI:[0-9]+]] ; 4-byte Folded Spill
39; A2M:        buffer_load_dword v[[VSPILL:[0-9]+]], off, s[{{[0-9:]+}}], 0 offset:[[FI]] ; 4-byte Folded Reload
40; A2V:        v_accvgpr_write_b32 a{{[0-9]+}}, v[[VSPILL]] ; Reload Reuse
41; A2V:        ScratchSize: 0
42define amdgpu_kernel void @max_12regs_13a_used(i32 %cond, <4 x float> addrspace(1)* %arg, <4 x float> addrspace(1)* %out) #2 {
43bb:
44  %in.1 = load <4 x float>, <4 x float> addrspace(1)* %arg
  ; The second MFMA call takes the first call's result as its vector operand.
45  %mai.1 = tail call <4 x float> @llvm.amdgcn.mfma.f32.4x4x1f32(float 1.0, float 1.0, <4 x float> %in.1, i32 0, i32 0, i32 0)
46  %mai.2 = tail call <4 x float> @llvm.amdgcn.mfma.f32.4x4x1f32(float 1.0, float 1.0, <4 x float> %mai.1, i32 0, i32 0, i32 0)
  ; %cond == 0 goes through %use first; any other value jumps straight to %st.
47  %cmp = icmp eq i32 %cond, 0
48  br i1 %cmp, label %use, label %st
49
50use:
  ; Five extra "a"-constrained operands are requested here while %mai.1 and
  ; %mai.2 are still needed by the asm in %st.
51  call void asm sideeffect "", "a,a,a,a,a"(i32 1, i32 2, i32 3, i32 4, i32 5)
52  store volatile <4 x float> <float 1.0, float 1.0, float 1.0, float 1.0>, <4 x float> addrspace(1)* %out
53  br label %st
54
55st:
  ; NOTE(review): %gep1 and %gep2 are not used by any instruction in this
  ; function as written.
56  %gep1 = getelementptr <4 x float>, <4 x float> addrspace(1)* %out, i64 16
57  %gep2 = getelementptr <4 x float>, <4 x float> addrspace(1)* %out, i64 32
  ; Both MFMA results are consumed through "a" constraints.
58  call void asm sideeffect "", "a,a"(<4 x float> %mai.1, <4 x float> %mai.2)
59  ret void
60}
61
; Test case: a kernel using attribute group #1 ("amdgpu-num-vgpr"="10") that
; contains no MFMA calls; every value is produced by inline asm with an "=a"
; output constraint and consumed by inline asm with "a" input constraints.
; What the check lines below encode:
;   * under the A2V prefix (the llc run without -amdgpu-spill-vgpr-to-agpr=0)
;     SCRATCH_RSRC must not appear in the checked region, a v_accvgpr_read_b32
;     into a VGPR is followed by a v_accvgpr_write_b32 from that same VGPR,
;     and the reported scratch size is 0;
;   * under the A2M prefix (the llc run with that flag) the SCRATCH_RSRC dwords
;     are materialized, a VGPR is stored to a scratch offset, a VGPR is
;     reloaded from that same offset, and the reloaded VGPR feeds a
;     v_accvgpr_write_b32.
; NOTE(review): the four asm outputs below total 4 + 4 + 1 + 2 = 11 dwords,
; while the name says "9a"; confirm what the name is meant to count.
62; GCN-LABEL: {{^}}max_10_vgprs_used_9a:
63; A2M-DAG:    s_mov_b32 s{{[0-9]+}}, SCRATCH_RSRC_DWORD0
64; A2M-DAG:    s_mov_b32 s{{[0-9]+}}, SCRATCH_RSRC_DWORD1
65; A2V-NOT:    SCRATCH_RSRC
66
67; A2V: v_accvgpr_read_b32 v[[VSPILL:[0-9]+]], a{{[0-9]+}} ; Reload Reuse
68; A2V: v_accvgpr_write_b32 a{{[0-9]+}}, v[[VSPILL]] ; Reload Reuse
69; A2V: ScratchSize: 0
70
71; A2M: buffer_store_dword v[[VSPILLSTORE:[0-9]+]], off, s[{{[0-9:]+}}], 0 offset:[[FI:[0-9]+]] ; 4-byte Folded Spill
72; A2M: buffer_load_dword v[[VSPILL_RELOAD:[0-9]+]], off, s[{{[0-9:]+}}], 0 offset:[[FI]] ; 4-byte Folded Reload
73; A2M: v_accvgpr_write_b32 a{{[0-9]+}}, v[[VSPILL_RELOAD]] ; Reload Reuse
74define amdgpu_kernel void @max_10_vgprs_used_9a() #1 {
  ; Four values are defined up front, before any of them is consumed.
75  %a1 = call <4 x i32> asm sideeffect "", "=a"()
76  %a2 = call <4 x i32> asm sideeffect "", "=a"()
77  %a3 = call i32 asm sideeffect "", "=a"()
78  %a4 = call <2 x i32> asm sideeffect "", "=a"()
  ; %a1..%a3 are consumed together; %a4 is consumed by a separate, later asm.
79  call void asm sideeffect "", "a,a,a"(<4 x i32> %a1, <4 x i32> %a2, i32 %a3)
80  call void asm sideeffect "", "a"(<2 x i32> %a4)
81  ret void
82}
83
; Test case: a kernel using attribute group #3 ("amdgpu-num-vgpr"="32") in
; which an "=a" inline-asm result is defined before a 32x32x1f32 MFMA call
; (which has a <32 x float> operand and result) and consumed only after it.
; What the check lines below encode:
;   * under the A2M prefix (the llc run with -amdgpu-spill-vgpr-to-agpr=0) the
;     SCRATCH_RSRC dwords are materialized, and the VGPR that received a0 is
;     stored to and reloaded from the same scratch offset before being written
;     back to an AGPR;
;   * under the A2V prefix (the llc run without that flag) SCRATCH_RSRC must
;     not appear in the checked region, and the reported scratch size is 0.
84; GCN-LABEL: {{^}}max_32regs_mfma32:
85; A2M-DAG:    s_mov_b32 s{{[0-9]+}}, SCRATCH_RSRC_DWORD0
86; A2M-DAG:    s_mov_b32 s{{[0-9]+}}, SCRATCH_RSRC_DWORD1
87; A2V-NOT:    SCRATCH_RSRC
88; GFX908-DAG: v_accvgpr_read_b32 v[[VSPILL:[0-9]+]], a0 ; Reload Reuse
89; A2M:        buffer_store_dword v[[VSPILL]], off, s[{{[0-9:]+}}], 0 offset:[[FI:[0-9]+]] ; 4-byte Folded Spill
90; A2M:        buffer_load_dword v[[VSPILL:[0-9]+]], off, s[{{[0-9:]+}}], 0 offset:[[FI]] ; 4-byte Folded Reload
91; GFX908:     v_accvgpr_write_b32 a{{[0-9]+}}, v[[VSPILL]] ; Reload Reuse
92; A2V:        ScratchSize: 0
93define amdgpu_kernel void @max_32regs_mfma32(float addrspace(1)* %arg) #3 {
94bb:
  ; %v is produced here but not consumed until after the MFMA call in %use.
95  %v = call i32 asm sideeffect "", "=a"()
96  br label %use
97
98use:
  ; NOTE(review): the constant operand runs 1.0 .. 31.0 and its last lane is
  ; 2.0 rather than 32.0; presumably irrelevant to the spill checks -- confirm.
99  %mai.1 = tail call <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float 1.0, float 1.0, <32 x float> <float 1.0, float 2.0, float 3.0, float 4.0, float 5.0, float 6.0, float 7.0, float 8.0, float 9.0, float 10.0, float 11.0, float 12.0, float 13.0, float 14.0, float 15.0, float 16.0, float 17.0, float 18.0, float 19.0, float 20.0, float 21.0, float 22.0, float 23.0, float 24.0, float 25.0, float 26.0, float 27.0, float 28.0, float 29.0, float 30.0, float 31.0, float 2.0>, i32 0, i32 0, i32 0)
  ; %v is consumed through an "a" constraint after the MFMA result exists.
100  call void asm sideeffect "", "a"(i32 %v)
  ; Only lane 0 of the MFMA result is stored.
101  %elt1 = extractelement <32 x float> %mai.1, i32 0
102  store float %elt1, float addrspace(1)* %arg
103  ret void
104}
105
; Intrinsic declarations for the kernels in this file. The three MFMA
; intrinsics share one operand layout: two floats, one float vector (16, 4 or
; 32 lanes, matching the result type), then three i32 operands.
; NOTE(review): @llvm.amdgcn.workitem.id.x is declared but not called by any
; kernel defined in this file.
106declare i32 @llvm.amdgcn.workitem.id.x()
107declare <16 x float> @llvm.amdgcn.mfma.f32.16x16x1f32(float, float, <16 x float>, i32, i32, i32)
108declare <4 x float> @llvm.amdgcn.mfma.f32.4x4x1f32(float, float, <4 x float>, i32, i32, i32)
109declare <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float, float, <32 x float>, i32, i32, i32)
110
; Attribute groups, one per kernel. Each sets nounwind plus a different
; "amdgpu-num-vgpr" value (presumably the VGPR budget the kernel is compiled
; under, which the kernel names echo -- confirm against the AMDGPU backend
; documentation):
;   #0 -> @max_24regs_32a_used
;   #1 -> @max_10_vgprs_used_9a
;   #2 -> @max_12regs_13a_used
;   #3 -> @max_32regs_mfma32
111attributes #0 = { nounwind "amdgpu-num-vgpr"="24" }
112attributes #1 = { nounwind "amdgpu-num-vgpr"="10" }
113attributes #2 = { nounwind "amdgpu-num-vgpr"="12" }
114attributes #3 = { nounwind "amdgpu-num-vgpr"="32" }
115