; RUN: llc -O0 -march=amdgcn -mcpu=fiji -amdgpu-spill-sgpr-to-smem=0 -verify-machineinstrs < %s | FileCheck -check-prefix=ALL -check-prefix=VGPR %s
; RUN: llc -O0 -march=amdgcn -mcpu=fiji -amdgpu-spill-sgpr-to-smem=1 -verify-machineinstrs < %s | FileCheck -check-prefix=ALL -check-prefix=SMEM %s
; RUN: llc -O0 -march=amdgcn -mcpu=fiji -amdgpu-spill-sgpr-to-smem=0 -amdgpu-spill-sgpr-to-vgpr=0 -verify-machineinstrs < %s | FileCheck -check-prefix=ALL -check-prefix=VMEM %s

; ALL-LABEL: {{^}}spill_sgpr_x2:
; SMEM: s_add_u32 m0, s3, 0x100{{$}}
; SMEM: s_buffer_store_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s[8:11], m0 ; 8-byte Folded Spill
; SMEM: s_cbranch_scc1

; SMEM: s_add_u32 m0, s3, 0x100{{$}}
; SMEM: s_buffer_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s[8:11], m0 ; 8-byte Folded Reload

; SMEM: s_dcache_wb
; SMEM: s_endpgm

; FIXME: Should only need 4 bytes
; SMEM: ScratchSize: 12


; VGPR: v_writelane_b32 v{{[0-9]+}}, s{{[0-9]+}}, 0
; VGPR: v_writelane_b32 v{{[0-9]+}}, s{{[0-9]+}}, 1
; VGPR: s_cbranch_scc1

; VGPR: v_readlane_b32 s{{[0-9]+}}, v{{[0-9]+}}, 0
; VGPR: v_readlane_b32 s{{[0-9]+}}, v{{[0-9]+}}, 1

; VMEM: buffer_store_dword
; VMEM: buffer_store_dword
; VMEM: s_cbranch_scc1

; VMEM: buffer_load_dword
; VMEM: buffer_load_dword
; Kernel exercised by the spill_sgpr_x2 checks above. A <2 x i32> (64-bit)
; value is produced by inline asm with the "=s" output constraint before the
; conditional branch and consumed by inline asm with the "s" input constraint
; only in %bb0, so the value stays live across the branch. The checks above
; expect a spill before s_cbranch_scc1 and a reload after it.
; %out is never referenced in the body; it is kept as part of the signature.
define amdgpu_kernel void @spill_sgpr_x2(i32 addrspace(1)* %out, i32 %in) #0 {
  ; Define the wide value; the asm string itself is only a "; def" marker.
  %wide.sgpr = call <2 x i32>  asm sideeffect "; def $0", "=s" () #0
  %cmp = icmp eq i32 %in, 0
  br i1 %cmp, label %bb0, label %ret

bb0:
  ; Sole use of %wide.sgpr, reached only when %in == 0.
  call void asm sideeffect "; use $0", "s"(<2 x i32> %wide.sgpr) #0
  br label %ret

ret:
  ret void
}

; ALL-LABEL: {{^}}spill_sgpr_x4:
; SMEM: s_add_u32 m0, s3, 0x100{{$}}
; SMEM: s_buffer_store_dwordx4 s{{\[[0-9]+:[0-9]+\]}}, s[12:15], m0 ; 16-byte Folded Spill
; SMEM: s_cbranch_scc1

; SMEM: s_add_u32 m0, s3, 0x100{{$}}
; SMEM: s_buffer_load_dwordx4 s{{\[[0-9]+:[0-9]+\]}}, s[12:15], m0 ; 16-byte Folded Reload
; SMEM: s_dcache_wb
; SMEM: s_endpgm

; FIXME: Should only need 4 bytes
; SMEM: ScratchSize: 20

; VGPR: v_writelane_b32 v{{[0-9]+}}, s{{[0-9]+}}, 0
; VGPR: v_writelane_b32 v{{[0-9]+}}, s{{[0-9]+}}, 1
; VGPR: v_writelane_b32 v{{[0-9]+}}, s{{[0-9]+}}, 2
; VGPR: v_writelane_b32 v{{[0-9]+}}, s{{[0-9]+}}, 3
; VGPR: s_cbranch_scc1

; VGPR: v_readlane_b32 s{{[0-9]+}}, v{{[0-9]+}}, 0
; VGPR: v_readlane_b32 s{{[0-9]+}}, v{{[0-9]+}}, 1
; VGPR: v_readlane_b32 s{{[0-9]+}}, v{{[0-9]+}}, 2
; VGPR: v_readlane_b32 s{{[0-9]+}}, v{{[0-9]+}}, 3


; VMEM: buffer_store_dword
; VMEM: buffer_store_dword
; VMEM: buffer_store_dword
; VMEM: buffer_store_dword
; VMEM: s_cbranch_scc1

; VMEM: buffer_load_dword
; VMEM: buffer_load_dword
; VMEM: buffer_load_dword
; VMEM: buffer_load_dword
; Kernel exercised by the spill_sgpr_x4 checks above. A <4 x i32> (128-bit)
; value is produced by inline asm with the "=s" output constraint before the
; conditional branch and consumed by inline asm with the "s" input constraint
; only in %bb0, so the value stays live across the branch. The checks above
; expect a spill before s_cbranch_scc1 and a reload after it.
; %out is never referenced in the body; it is kept as part of the signature.
define amdgpu_kernel void @spill_sgpr_x4(i32 addrspace(1)* %out, i32 %in) #0 {
  ; Define the wide value; the asm string itself is only a "; def" marker.
  %wide.sgpr = call <4 x i32>  asm sideeffect "; def $0", "=s" () #0
  %cmp = icmp eq i32 %in, 0
  br i1 %cmp, label %bb0, label %ret

bb0:
  ; Sole use of %wide.sgpr, reached only when %in == 0.
  call void asm sideeffect "; use $0", "s"(<4 x i32> %wide.sgpr) #0
  br label %ret

ret:
  ret void
}

; ALL-LABEL: {{^}}spill_sgpr_x8:

; SMEM: s_add_u32 m0, s3, 0x100{{$}}
; SMEM: s_buffer_store_dwordx4 s{{\[[0-9]+:[0-9]+\]}}, s[16:19], m0 ; 16-byte Folded Spill
; SMEM: s_add_u32 m0, s3, 0x110{{$}}
; SMEM: s_buffer_store_dwordx4 s{{\[[0-9]+:[0-9]+\]}}, s[16:19], m0 ; 16-byte Folded Spill
; SMEM: s_cbranch_scc1

; SMEM: s_add_u32 m0, s3, 0x100{{$}}
; SMEM: s_buffer_load_dwordx4 s{{\[[0-9]+:[0-9]+\]}}, s[16:19], m0 ; 16-byte Folded Reload
; SMEM: s_add_u32 m0, s3, 0x110{{$}}
; SMEM: s_buffer_load_dwordx4 s{{\[[0-9]+:[0-9]+\]}}, s[16:19], m0 ; 16-byte Folded Reload

; SMEM: s_dcache_wb
; SMEM: s_endpgm

; SMEM: ScratchSize: 36

; VGPR: v_writelane_b32 v{{[0-9]+}}, s{{[0-9]+}}, 0
; VGPR: v_writelane_b32 v{{[0-9]+}}, s{{[0-9]+}}, 1
; VGPR: v_writelane_b32 v{{[0-9]+}}, s{{[0-9]+}}, 2
; VGPR: v_writelane_b32 v{{[0-9]+}}, s{{[0-9]+}}, 3
; VGPR: v_writelane_b32 v{{[0-9]+}}, s{{[0-9]+}}, 4
; VGPR: v_writelane_b32 v{{[0-9]+}}, s{{[0-9]+}}, 5
; VGPR: v_writelane_b32 v{{[0-9]+}}, s{{[0-9]+}}, 6
; VGPR: v_writelane_b32 v{{[0-9]+}}, s{{[0-9]+}}, 7
; VGPR: s_cbranch_scc1

; VGPR: v_readlane_b32 s{{[0-9]+}}, v{{[0-9]+}}, 0
; VGPR: v_readlane_b32 s{{[0-9]+}}, v{{[0-9]+}}, 1
; VGPR: v_readlane_b32 s{{[0-9]+}}, v{{[0-9]+}}, 2
; VGPR: v_readlane_b32 s{{[0-9]+}}, v{{[0-9]+}}, 3
; VGPR: v_readlane_b32 s{{[0-9]+}}, v{{[0-9]+}}, 4
; VGPR: v_readlane_b32 s{{[0-9]+}}, v{{[0-9]+}}, 5
; VGPR: v_readlane_b32 s{{[0-9]+}}, v{{[0-9]+}}, 6
; VGPR: v_readlane_b32 s{{[0-9]+}}, v{{[0-9]+}}, 7

; VMEM: buffer_store_dword
; VMEM: buffer_store_dword
; VMEM: buffer_store_dword
; VMEM: buffer_store_dword
; VMEM: buffer_store_dword
; VMEM: buffer_store_dword
; VMEM: buffer_store_dword
; VMEM: buffer_store_dword
; VMEM: s_cbranch_scc1

; VMEM: buffer_load_dword
; VMEM: buffer_load_dword
; VMEM: buffer_load_dword
; VMEM: buffer_load_dword
; VMEM: buffer_load_dword
; VMEM: buffer_load_dword
; VMEM: buffer_load_dword
; VMEM: buffer_load_dword
; Kernel exercised by the spill_sgpr_x8 checks above. An <8 x i32> (256-bit)
; value is produced by inline asm with the "=s" output constraint before the
; conditional branch and consumed by inline asm with the "s" input constraint
; only in %bb0, so the value stays live across the branch. The checks above
; expect a spill before s_cbranch_scc1 and a reload after it.
; %out is never referenced in the body; it is kept as part of the signature.
define amdgpu_kernel void @spill_sgpr_x8(i32 addrspace(1)* %out, i32 %in) #0 {
  ; Define the wide value; the asm string itself is only a "; def" marker.
  %wide.sgpr = call <8 x i32>  asm sideeffect "; def $0", "=s" () #0
  %cmp = icmp eq i32 %in, 0
  br i1 %cmp, label %bb0, label %ret

bb0:
  ; Sole use of %wide.sgpr, reached only when %in == 0.
  call void asm sideeffect "; use $0", "s"(<8 x i32> %wide.sgpr) #0
  br label %ret

ret:
  ret void
}

; FIXME: Inline asm with a <16 x i32> "s"-constrained operand seems broken,
; so the x16 case below is left commented out.
; define amdgpu_kernel void @spill_sgpr_x16(i32 addrspace(1)* %out, i32 %in) #0 {
;   %wide.sgpr = call <16 x i32>  asm sideeffect "; def $0", "=s" () #0
;   %cmp = icmp eq i32 %in, 0
;   br i1 %cmp, label %bb0, label %ret

; bb0:
;   call void asm sideeffect "; use $0", "s"(<16 x i32> %wide.sgpr) #0
;   br label %ret

; ret:
;   ret void
; }

; Attribute group #0 is attached to every kernel and inline-asm call in this file.
attributes #0 = { nounwind }
177