; RUN: llc -march=amdgcn -mtriple=amdgcn-amd-amdhsa -mcpu=kaveri --amdhsa-code-object-version=2 -mattr=-promote-alloca -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=HSA -check-prefix=CI %s
; RUN: llc -march=amdgcn -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 --amdhsa-code-object-version=2 -mattr=-promote-alloca -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=HSA -check-prefix=GFX9 %s

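; Casting a group (LDS) pointer to a flat pointer builds the 64-bit address from
; the shared aperture base (high half) and the 32-bit group offset (low half). A
; group pointer of -1 is the null value and must map to the flat null pointer,
; hence the compare against -1 and the cndmask selects of 0. On CI the aperture
; is loaded from the queue pointer (enable_sgpr_queue_ptr = 1); on GFX9 it is
; read from HW_REG_SH_MEM_BASES, so the queue pointer is not needed.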
; HSA-LABEL: {{^}}use_group_to_flat_addrspacecast:
; HSA: enable_sgpr_private_segment_buffer = 1
; HSA: enable_sgpr_dispatch_ptr = 0
; CI: enable_sgpr_queue_ptr = 1
; GFX9: enable_sgpr_queue_ptr = 0

; CI-DAG: s_load_dword [[PTR:s[0-9]+]], s[6:7], 0x0{{$}}
; CI-DAG: s_load_dword [[APERTURE:s[0-9]+]], s[4:5], 0x10{{$}}
; CI-DAG: v_mov_b32_e32 [[VAPERTURE:v[0-9]+]], [[APERTURE]]
; CI-DAG: v_cmp_ne_u32_e64 vcc, [[PTR]], -1
; CI-DAG: v_cndmask_b32_e32 v[[HI:[0-9]+]], 0, [[VAPERTURE]], vcc
; CI-DAG: v_mov_b32_e32 [[VPTR:v[0-9]+]], [[PTR]]
; CI-DAG: v_cndmask_b32_e32 v[[LO:[0-9]+]], 0, [[VPTR]]

; HSA-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 7
; GFX9-DAG: s_load_dword [[PTR:s[0-9]+]], s[4:5], 0x0{{$}}
; GFX9-DAG: s_getreg_b32 [[SSRC_SHARED:s[0-9]+]], hwreg(HW_REG_SH_MEM_BASES, 16, 16)
; GFX9-DAG: s_lshl_b32 [[SSRC_SHARED_BASE:s[0-9]+]], [[SSRC_SHARED]], 16
; GFX9-DAG: v_mov_b32_e32 [[VAPERTURE:v[0-9]+]], [[SSRC_SHARED_BASE]]

; GFX9-XXX: v_mov_b32_e32 [[VAPERTURE:v[0-9]+]], src_shared_base
; GFX9: v_cmp_ne_u32_e64 vcc, [[PTR]], -1
; GFX9: v_cndmask_b32_e32 v[[HI:[0-9]+]], 0, [[VAPERTURE]], vcc
; GFX9-DAG: v_mov_b32_e32 [[VPTR:v[0-9]+]], [[PTR]]
; GFX9-DAG: v_cndmask_b32_e32 v[[LO:[0-9]+]], 0, [[VPTR]]

; HSA: flat_store_dword v{{\[}}[[LO]]:[[HI]]{{\]}}, [[K]]

; NumSgprs should stay small (at most 2 digits). Make sure src_shared_base is
; not counted as a high-numbered SGPR.

; CI: NumSgprs: {{[0-9][0-9]+}}
; GFX9: NumSgprs: {{[0-9]+}}
define amdgpu_kernel void @use_group_to_flat_addrspacecast(i32 addrspace(3)* %ptr) #0 {
  %stof = addrspacecast i32 addrspace(3)* %ptr to i32*
  store volatile i32 7, i32* %stof
  ret void
}

; Test handling inside a non-kernel function.
; HSA-LABEL: {{^}}use_group_to_flat_addrspacecast_func:
; CI-DAG: s_load_dword [[APERTURE:s[0-9]+]], s[4:5], 0x10{{$}}
; CI-DAG: v_mov_b32_e32 [[VAPERTURE:v[0-9]+]], [[APERTURE]]
; CI-DAG: v_cmp_ne_u32_e32 vcc, -1, v0
; CI-DAG: v_cndmask_b32_e32 v[[HI:[0-9]+]], 0, [[VAPERTURE]], vcc
; CI-DAG: v_cndmask_b32_e32 v[[LO:[0-9]+]], 0, v0

; HSA-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 7
; GFX9-DAG: s_getreg_b32 [[SSRC_SHARED:s[0-9]+]], hwreg(HW_REG_SH_MEM_BASES, 16, 16)
; GFX9-DAG: s_lshl_b32 [[SSRC_SHARED_BASE:s[0-9]+]], [[SSRC_SHARED]], 16
; GFX9-DAG: v_mov_b32_e32 [[VAPERTURE:v[0-9]+]], [[SSRC_SHARED_BASE]]

; GFX9-XXX: v_mov_b32_e32 [[VAPERTURE:v[0-9]+]], src_shared_base
; GFX9-DAG: v_cmp_ne_u32_e32 vcc, -1, v0
; GFX9-DAG: v_cndmask_b32_e32 v[[LO:[0-9]+]], 0, v0, vcc
; GFX9-DAG: v_cndmask_b32_e32 v[[HI:[0-9]+]], 0, [[VAPERTURE]], vcc

; HSA: flat_store_dword v{{\[}}[[LO]]:[[HI]]{{\]}}, [[K]]
define void @use_group_to_flat_addrspacecast_func(i32 addrspace(3)* %ptr) #0 {
  %stof = addrspacecast i32 addrspace(3)* %ptr to i32*
  store volatile i32 7, i32* %stof
  ret void
}

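; Casting a private (scratch) pointer to flat uses the same null check, with the
; private aperture base for the high half: loaded from the queue pointer on CI,
; read from the low half of HW_REG_SH_MEM_BASES on GFX9.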
; HSA-LABEL: {{^}}use_private_to_flat_addrspacecast:
; HSA: enable_sgpr_private_segment_buffer = 1
; HSA: enable_sgpr_dispatch_ptr = 0
; CI: enable_sgpr_queue_ptr = 1
; GFX9: enable_sgpr_queue_ptr = 0

; CI-DAG: s_load_dword [[PTR:s[0-9]+]], s[6:7], 0x0{{$}}
; CI-DAG: s_load_dword [[APERTURE:s[0-9]+]], s[4:5], 0x11{{$}}
; CI-DAG: v_mov_b32_e32 [[VAPERTURE:v[0-9]+]], [[APERTURE]]

; CI-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 7
; CI-DAG: v_cmp_ne_u32_e64 vcc, [[PTR]], -1
; CI-DAG: v_cndmask_b32_e32 v[[HI:[0-9]+]], 0, [[VAPERTURE]], vcc
; CI-DAG: v_mov_b32_e32 [[VPTR:v[0-9]+]], [[PTR]]
; CI-DAG: v_cndmask_b32_e32 v[[LO:[0-9]+]], 0, [[VPTR]]

; GFX9-DAG: s_load_dword [[PTR:s[0-9]+]], s[4:5], 0x0{{$}}
; GFX9-DAG: s_getreg_b32 [[SSRC_PRIVATE:s[0-9]+]], hwreg(HW_REG_SH_MEM_BASES, 0, 16)
; GFX9-DAG: s_lshl_b32 [[SSRC_PRIVATE_BASE:s[0-9]+]], [[SSRC_PRIVATE]], 16
; GFX9-DAG: v_mov_b32_e32 [[VAPERTURE:v[0-9]+]], [[SSRC_PRIVATE_BASE]]

; GFX9-XXX: v_mov_b32_e32 [[VAPERTURE:v[0-9]+]], src_private_base

; GFX9-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 7
; GFX9: v_cmp_ne_u32_e64 vcc, [[PTR]], -1
; GFX9: v_cndmask_b32_e32 v[[HI:[0-9]+]], 0, [[VAPERTURE]], vcc
; GFX9: v_mov_b32_e32 [[VPTR:v[0-9]+]], [[PTR]]
; GFX9-DAG: v_cndmask_b32_e32 v[[LO:[0-9]+]], 0, [[VPTR]]

; HSA: flat_store_dword v{{\[}}[[LO]]:[[HI]]{{\]}}, [[K]]

; CI: NumSgprs: {{[0-9][0-9]+}}
; GFX9: NumSgprs: {{[0-9]+}}
define amdgpu_kernel void @use_private_to_flat_addrspacecast(i32 addrspace(5)* %ptr) #0 {
  %stof = addrspacecast i32 addrspace(5)* %ptr to i32*
  store volatile i32 7, i32* %stof
  ret void
}

; no-op
; HSA-LABEL: {{^}}use_global_to_flat_addrspacecast:
; HSA: enable_sgpr_queue_ptr = 0

; HSA: s_load_dwordx2 s{{\[}}[[PTRLO:[0-9]+]]:[[PTRHI:[0-9]+]]{{\]}}
; HSA-DAG: v_mov_b32_e32 v[[VPTRLO:[0-9]+]], s[[PTRLO]]
; HSA-DAG: v_mov_b32_e32 v[[VPTRHI:[0-9]+]], s[[PTRHI]]
; HSA-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 7
; HSA: flat_store_dword v{{\[}}[[VPTRLO]]:[[VPTRHI]]{{\]}}, [[K]]
define amdgpu_kernel void @use_global_to_flat_addrspacecast(i32 addrspace(1)* %ptr) #0 {
  %stof = addrspacecast i32 addrspace(1)* %ptr to i32*
  store volatile i32 7, i32* %stof
  ret void
}

; no-op
; HSA-LABEL: {{^}}use_constant_to_flat_addrspacecast:
; HSA: s_load_dwordx2 s{{\[}}[[PTRLO:[0-9]+]]:[[PTRHI:[0-9]+]]{{\]}}
; HSA-DAG: v_mov_b32_e32 v[[VPTRLO:[0-9]+]], s[[PTRLO]]
; HSA-DAG: v_mov_b32_e32 v[[VPTRHI:[0-9]+]], s[[PTRHI]]
; HSA: flat_load_dword v{{[0-9]+}}, v{{\[}}[[VPTRLO]]:[[VPTRHI]]{{\]}}
define amdgpu_kernel void @use_constant_to_flat_addrspacecast(i32 addrspace(4)* %ptr) #0 {
  %stof = addrspacecast i32 addrspace(4)* %ptr to i32*
  %ld = load volatile i32, i32* %stof
  ret void
}

; HSA-LABEL: {{^}}use_constant_to_global_addrspacecast:
; HSA: s_load_dwordx2 s{{\[}}[[PTRLO:[0-9]+]]:[[PTRHI:[0-9]+]]{{\]}}
; CI-DAG: v_mov_b32_e32 v[[VPTRLO:[0-9]+]], s[[PTRLO]]
; CI-DAG: v_mov_b32_e32 v[[VPTRHI:[0-9]+]], s[[PTRHI]]
; CI: {{flat|global}}_load_dword v{{[0-9]+}}, v{{\[}}[[VPTRLO]]:[[VPTRHI]]{{\]}}

; GFX9: v_mov_b32_e32 [[ZERO:v[0-9]+]], 0{{$}}
; GFX9: global_load_dword v{{[0-9]+}}, [[ZERO]], s{{\[}}[[PTRLO]]:[[PTRHI]]{{\]}}
define amdgpu_kernel void @use_constant_to_global_addrspacecast(i32 addrspace(4)* %ptr) #0 {
  %stof = addrspacecast i32 addrspace(4)* %ptr to i32 addrspace(1)*
  %ld = load volatile i32, i32 addrspace(1)* %stof
  ret void
}

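; Casting a flat pointer to a group pointer keeps only the low 32 bits; a null
; flat pointer (0) must map to the group null value (-1), hence the compare
; against 0 and the select of -1.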
; HSA-LABEL: {{^}}use_flat_to_group_addrspacecast:
; HSA: enable_sgpr_private_segment_buffer = 1
; HSA: enable_sgpr_dispatch_ptr = 0
; HSA: enable_sgpr_queue_ptr = 0

; HSA: s_load_dwordx2 s{{\[}}[[PTR_LO:[0-9]+]]:[[PTR_HI:[0-9]+]]{{\]}}
; CI-DAG: v_cmp_ne_u64_e64 vcc, s{{\[}}[[PTR_LO]]:[[PTR_HI]]{{\]}}, 0{{$}}
; CI-DAG: v_mov_b32_e32 v[[VPTR_LO:[0-9]+]], s[[PTR_LO]]
; CI-DAG: v_cndmask_b32_e32 [[CASTPTR:v[0-9]+]], -1, v[[VPTR_LO]]
; HSA-DAG: v_mov_b32_e32 v[[K:[0-9]+]], 0{{$}}
; GFX9-DAG: s_cmp_lg_u64 s{{\[}}[[CMP_LO:[0-9]+]]:[[CMP_HI:[0-9]+]]{{\]}}, 0
; GFX9-DAG: s_cselect_b32 s[[PTR_LO]], s[[PTR_LO]], -1
; GFX9-DAG: v_mov_b32_e32 [[CASTPTR:v[0-9]+]], s[[PTR_LO]]
; HSA: ds_write_b32 [[CASTPTR]], v[[K]]
define amdgpu_kernel void @use_flat_to_group_addrspacecast(i32* %ptr) #0 {
  %ftos = addrspacecast i32* %ptr to i32 addrspace(3)*
  store volatile i32 0, i32 addrspace(3)* %ftos
  ret void
}

; HSA-LABEL: {{^}}use_flat_to_private_addrspacecast:
; HSA: enable_sgpr_private_segment_buffer = 1
; HSA: enable_sgpr_dispatch_ptr = 0
; HSA: enable_sgpr_queue_ptr = 0

; HSA: s_load_dwordx2 s{{\[}}[[PTR_LO:[0-9]+]]:[[PTR_HI:[0-9]+]]{{\]}}
; CI-DAG: v_cmp_ne_u64_e64 vcc, s{{\[}}[[PTR_LO]]:[[PTR_HI]]{{\]}}, 0{{$}}
; CI-DAG: v_mov_b32_e32 v[[VPTR_LO:[0-9]+]], s[[PTR_LO]]
; CI-DAG: v_cndmask_b32_e32 [[CASTPTR:v[0-9]+]], -1, v[[VPTR_LO]]
; HSA-DAG: v_mov_b32_e32 v[[K:[0-9]+]], 0{{$}}
; GFX9-DAG: s_cmp_lg_u64 s{{\[}}[[CMP_LO:[0-9]+]]:[[CMP_HI:[0-9]+]]{{\]}}, 0
; GFX9-DAG: s_cselect_b32 s[[PTR_LO]], s[[PTR_LO]], -1
; GFX9-DAG: v_mov_b32_e32 [[CASTPTR:v[0-9]+]], s[[PTR_LO]]
; HSA: buffer_store_dword v[[K]], [[CASTPTR]], s{{\[[0-9]+:[0-9]+\]}}, 0 offen{{$}}
define amdgpu_kernel void @use_flat_to_private_addrspacecast(i32* %ptr) #0 {
  %ftos = addrspacecast i32* %ptr to i32 addrspace(5)*
  store volatile i32 0, i32 addrspace(5)* %ftos
  ret void
}

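; no-op; a flat pointer already addresses global memory, so the incoming pointer
; is reused directly.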
; HSA-LABEL: {{^}}use_flat_to_global_addrspacecast:
; HSA: enable_sgpr_queue_ptr = 0

; HSA: s_load_dwordx2 s{{\[}}[[PTRLO:[0-9]+]]:[[PTRHI:[0-9]+]]{{\]}}, s[4:5], 0x0
; CI-DAG: v_mov_b32_e32 v[[VPTRLO:[0-9]+]], s[[PTRLO]]
; CI-DAG: v_mov_b32_e32 v[[VPTRHI:[0-9]+]], s[[PTRHI]]
; CI-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 0
; CI: flat_store_dword v{{\[}}[[VPTRLO]]:[[VPTRHI]]{{\]}}, [[K]]

; GFX9: v_mov_b32_e32 [[ZERO:v[0-9]+]], 0
; GFX9: global_store_dword [[ZERO]], [[ZERO]], s{{\[}}[[PTRLO]]:[[PTRHI]]{{\]$}}
define amdgpu_kernel void @use_flat_to_global_addrspacecast(i32* %ptr) #0 {
  %ftos = addrspacecast i32* %ptr to i32 addrspace(1)*
  store volatile i32 0, i32 addrspace(1)* %ftos
  ret void
}

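; no-op; the flat pointer is used directly for the scalar (constant) load.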
; HSA-LABEL: {{^}}use_flat_to_constant_addrspacecast:
; HSA: enable_sgpr_queue_ptr = 0

; HSA: s_load_dwordx2 s{{\[}}[[PTRLO:[0-9]+]]:[[PTRHI:[0-9]+]]{{\]}}, s[4:5], 0x0
; HSA: s_load_dword s{{[0-9]+}}, s{{\[}}[[PTRLO]]:[[PTRHI]]{{\]}}, 0x0
define amdgpu_kernel void @use_flat_to_constant_addrspacecast(i32* %ptr) #0 {
  %ftos = addrspacecast i32* %ptr to i32 addrspace(4)*
  load volatile i32, i32 addrspace(4)* %ftos
  ret void
}

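; Casts of constant pointers should fold at compile time: -1 in the group or
; private space becomes the flat null pointer (0), and a null flat pointer
; becomes -1 in the group and private spaces.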
; HSA-LABEL: {{^}}cast_0_group_to_flat_addrspacecast:
; CI: s_load_dword [[APERTURE:s[0-9]+]], s[4:5], 0x10
; CI-DAG: v_mov_b32_e32 v[[HI:[0-9]+]], [[APERTURE]]
; GFX9-DAG: s_getreg_b32 [[SSRC_SHARED:s[0-9]+]], hwreg(HW_REG_SH_MEM_BASES, 16, 16)
; GFX9-DAG: s_lshl_b32 [[SSRC_SHARED_BASE:s[0-9]+]], [[SSRC_SHARED]], 16
; GFX9-DAG: v_mov_b32_e32 v[[HI:[0-9]+]], [[SSRC_SHARED_BASE]]

; GFX9-XXX: v_mov_b32_e32 v[[HI:[0-9]+]], src_shared_base

; HSA-DAG: v_mov_b32_e32 v[[LO:[0-9]+]], 0{{$}}
; HSA-DAG: v_mov_b32_e32 v[[K:[0-9]+]], 7{{$}}
; HSA: {{flat|global}}_store_dword v{{\[}}[[LO]]:[[HI]]{{\]}}, v[[K]]
define amdgpu_kernel void @cast_0_group_to_flat_addrspacecast() #0 {
  %cast = addrspacecast i32 addrspace(3)* null to i32*
  store volatile i32 7, i32* %cast
  ret void
}

; HSA-LABEL: {{^}}cast_0_flat_to_group_addrspacecast:
; HSA-DAG: v_mov_b32_e32 [[PTR:v[0-9]+]], -1{{$}}
; HSA-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 7{{$}}
; HSA: ds_write_b32 [[PTR]], [[K]]
define amdgpu_kernel void @cast_0_flat_to_group_addrspacecast() #0 {
  %cast = addrspacecast i32* null to i32 addrspace(3)*
  store volatile i32 7, i32 addrspace(3)* %cast
  ret void
}

; HSA-LABEL: {{^}}cast_neg1_group_to_flat_addrspacecast:
; HSA: v_mov_b32_e32 v[[LO:[0-9]+]], 0{{$}}
; HSA: v_mov_b32_e32 v[[K:[0-9]+]], 7{{$}}
; HSA: v_mov_b32_e32 v[[HI:[0-9]+]], 0{{$}}
; HSA: {{flat|global}}_store_dword v{{\[}}[[LO]]:[[HI]]{{\]}}, v[[K]]
define amdgpu_kernel void @cast_neg1_group_to_flat_addrspacecast() #0 {
  %cast = addrspacecast i32 addrspace(3)* inttoptr (i32 -1 to i32 addrspace(3)*) to i32*
  store volatile i32 7, i32* %cast
  ret void
}

; HSA-LABEL: {{^}}cast_neg1_flat_to_group_addrspacecast:
; HSA-DAG: v_mov_b32_e32 [[PTR:v[0-9]+]], -1{{$}}
; HSA-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 7{{$}}
; HSA: ds_write_b32 [[PTR]], [[K]]
define amdgpu_kernel void @cast_neg1_flat_to_group_addrspacecast() #0 {
  %cast = addrspacecast i32* inttoptr (i64 -1 to i32*) to i32 addrspace(3)*
  store volatile i32 7, i32 addrspace(3)* %cast
  ret void
}

; FIXME: Shouldn't need to enable queue ptr
; HSA-LABEL: {{^}}cast_0_private_to_flat_addrspacecast:
; CI: s_load_dword [[APERTURE:s[0-9]+]], s[4:5], 0x11
; CI-DAG: v_mov_b32_e32 v[[HI:[0-9]+]], [[APERTURE]]
; GFX9-DAG: s_getreg_b32 [[SSRC_PRIVATE:s[0-9]+]], hwreg(HW_REG_SH_MEM_BASES, 0, 16)
; GFX9-DAG: s_lshl_b32 [[SSRC_PRIVATE_BASE:s[0-9]+]], [[SSRC_PRIVATE]], 16
; GFX9-DAG: v_mov_b32_e32 v[[HI:[0-9]+]], [[SSRC_PRIVATE_BASE]]

; GFX9-XXX: v_mov_b32_e32 v[[HI:[0-9]+]], src_private_base

; HSA-DAG: v_mov_b32_e32 v[[LO:[0-9]+]], 0{{$}}
; HSA-DAG: v_mov_b32_e32 v[[K:[0-9]+]], 7{{$}}
; HSA: {{flat|global}}_store_dword v{{\[}}[[LO]]:[[HI]]{{\]}}, v[[K]]
define amdgpu_kernel void @cast_0_private_to_flat_addrspacecast() #0 {
  %cast = addrspacecast i32 addrspace(5)* null to i32*
  store volatile i32 7, i32* %cast
  ret void
}

; HSA-LABEL: {{^}}cast_0_flat_to_private_addrspacecast:
; HSA-DAG: v_mov_b32_e32 [[PTR:v[0-9]+]], -1{{$}}
; HSA-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 7{{$}}
; HSA: buffer_store_dword [[K]], [[PTR]], s{{\[[0-9]+:[0-9]+\]}}, 0
define amdgpu_kernel void @cast_0_flat_to_private_addrspacecast() #0 {
  %cast = addrspacecast i32* null to i32 addrspace(5)*
  store volatile i32 7, i32 addrspace(5)* %cast
  ret void
}

; HSA-LABEL: {{^}}cast_neg1_private_to_flat_addrspacecast:
; CI: enable_sgpr_queue_ptr = 1
; GFX9: enable_sgpr_queue_ptr = 0

; HSA-DAG: v_mov_b32_e32 v[[LO:[0-9]+]], 0{{$}}
; HSA-DAG: v_mov_b32_e32 v[[K:[0-9]+]], 7{{$}}
; HSA: v_mov_b32_e32 v[[HI:[0-9]+]], 0{{$}}
; HSA: {{flat|global}}_store_dword v{{\[}}[[LO]]:[[HI]]{{\]}}, v[[K]]
define amdgpu_kernel void @cast_neg1_private_to_flat_addrspacecast() #0 {
  %cast = addrspacecast i32 addrspace(5)* inttoptr (i32 -1 to i32 addrspace(5)*) to i32*
  store volatile i32 7, i32* %cast
  ret void
}

; HSA-LABEL: {{^}}cast_neg1_flat_to_private_addrspacecast:
; HSA-DAG: v_mov_b32_e32 [[PTR:v[0-9]+]], -1{{$}}
; HSA-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 7{{$}}
; HSA: buffer_store_dword [[K]], [[PTR]], s{{\[[0-9]+:[0-9]+\]}}, 0
define amdgpu_kernel void @cast_neg1_flat_to_private_addrspacecast() #0 {
  %cast = addrspacecast i32* inttoptr (i64 -1 to i32*) to i32 addrspace(5)*
  store volatile i32 7, i32 addrspace(5)* %cast
  ret void
}

; Disable optimizations in case there are optimizations added that
; specialize away generic pointer accesses.

; HSA-LABEL: {{^}}branch_use_flat_i32:
; HSA: {{flat|global}}_store_dword {{v\[[0-9]+:[0-9]+\]}}, {{v[0-9]+}}
; HSA: s_endpgm
define amdgpu_kernel void @branch_use_flat_i32(i32 addrspace(1)* noalias %out, i32 addrspace(1)* %gptr, i32 addrspace(3)* %lptr, i32 %x, i32 %c) #0 {
entry:
  %cmp = icmp ne i32 %c, 0
  br i1 %cmp, label %local, label %global

local:
  %flat_local = addrspacecast i32 addrspace(3)* %lptr to i32*
  br label %end

global:
  %flat_global = addrspacecast i32 addrspace(1)* %gptr to i32*
  br label %end

end:
  %fptr = phi i32* [ %flat_local, %local ], [ %flat_global, %global ]
  store volatile i32 %x, i32* %fptr, align 4
;  %val = load i32, i32* %fptr, align 4
;  store i32 %val, i32 addrspace(1)* %out, align 4
  ret void
}

; Check for prologue initializing special SGPRs pointing to scratch.
; HSA-LABEL: {{^}}store_flat_scratch:
; CI-DAG: s_mov_b32 flat_scratch_lo, s9
; CI-DAG: s_add_u32 [[ADD:s[0-9]+]], s8, s11
; CI-DAG: s_lshr_b32 flat_scratch_hi, [[ADD]], 8

; GFX9: s_add_u32 flat_scratch_lo, s6, s9
; GFX9: s_addc_u32 flat_scratch_hi, s7, 0

; HSA: {{flat|global}}_store_dword
; HSA: s_barrier
; HSA: {{flat|global}}_load_dword
define amdgpu_kernel void @store_flat_scratch(i32 addrspace(1)* noalias %out, i32) #0 {
  %alloca = alloca i32, i32 9, align 4, addrspace(5)
  %x = call i32 @llvm.amdgcn.workitem.id.x() #2
  %pptr = getelementptr i32, i32 addrspace(5)* %alloca, i32 %x
  %fptr = addrspacecast i32 addrspace(5)* %pptr to i32*
  store volatile i32 %x, i32* %fptr
  ; Dummy call
  call void @llvm.amdgcn.s.barrier() #1
  %reload = load volatile i32, i32* %fptr, align 4
  store volatile i32 %reload, i32 addrspace(1)* %out, align 4
  ret void
}

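; addrspace(6) is the 32-bit constant address space, so the cast keeps only the
; low 32 bits of the pointer; the checks verify the high half is zeroed before
; the offset is added.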
; HSA-LABEL: {{^}}use_constant_to_constant32_addrspacecast:
; GFX9: s_load_dwordx2 [[PTRPTR:s\[[0-9]+:[0-9]+\]]], s[4:5], 0x0{{$}}
; GFX9: s_load_dword [[OFFSET:s[0-9]+]], s[4:5], 0x8{{$}}
; GFX9: s_load_dwordx2 s{{\[}}[[PTR_LO:[0-9]+]]:[[PTR_HI:[0-9]+]]{{\]}}, [[PTRPTR]], 0x0{{$}}
; GFX9: s_mov_b32 s[[PTR_HI]], 0{{$}}
; GFX9: s_add_i32 s[[PTR_LO]], s[[PTR_LO]], [[OFFSET]]
; GFX9: s_load_dword s{{[0-9]+}}, s{{\[}}[[PTR_LO]]:[[PTR_HI]]{{\]}}, 0x0{{$}}
define amdgpu_kernel void @use_constant_to_constant32_addrspacecast(i8 addrspace(4)* addrspace(4)* %ptr.ptr, i32 %offset) #0 {
  %ptr = load volatile i8 addrspace(4)*, i8 addrspace(4)* addrspace(4)* %ptr.ptr
  %addrspacecast = addrspacecast i8 addrspace(4)* %ptr to i8 addrspace(6)*
  %gep = getelementptr i8, i8 addrspace(6)* %addrspacecast, i32 %offset
  %ptr.cast = bitcast i8 addrspace(6)* %gep to i32 addrspace(6)*
  %load = load volatile i32, i32 addrspace(6)* %ptr.cast, align 4
  ret void
}

; HSA-LABEL: {{^}}use_global_to_constant32_addrspacecast:
; GFX9: s_load_dwordx2 [[PTRPTR:s\[[0-9]+:[0-9]+\]]], s[4:5], 0x0{{$}}
; GFX9: s_load_dword [[OFFSET:s[0-9]+]], s[4:5], 0x8{{$}}
; GFX9: s_load_dwordx2 s{{\[}}[[PTR_LO:[0-9]+]]:[[PTR_HI:[0-9]+]]{{\]}}, [[PTRPTR]], 0x0{{$}}
; GFX9: s_mov_b32 s[[PTR_HI]], 0{{$}}
; GFX9: s_add_i32 s[[PTR_LO]], s[[PTR_LO]], [[OFFSET]]
; GFX9: s_load_dword s{{[0-9]+}}, s{{\[}}[[PTR_LO]]:[[PTR_HI]]{{\]}}, 0x0{{$}}
define amdgpu_kernel void @use_global_to_constant32_addrspacecast(i8 addrspace(1)* addrspace(4)* %ptr.ptr, i32 %offset) #0 {
  %ptr = load volatile i8 addrspace(1)*, i8 addrspace(1)* addrspace(4)* %ptr.ptr
  %addrspacecast = addrspacecast i8 addrspace(1)* %ptr to i8 addrspace(6)*
  %gep = getelementptr i8, i8 addrspace(6)* %addrspacecast, i32 %offset
  %ptr.cast = bitcast i8 addrspace(6)* %gep to i32 addrspace(6)*
  %load = load volatile i32, i32 addrspace(6)* %ptr.cast, align 4
  ret void
}

declare void @llvm.amdgcn.s.barrier() #1
declare i32 @llvm.amdgcn.workitem.id.x() #2

attributes #0 = { nounwind }
attributes #1 = { nounwind convergent }
attributes #2 = { nounwind readnone }