1; RUN: opt -S -codegenprepare -mtriple=amdgcn-unknown-unknown -mcpu=tahiti < %s | FileCheck -check-prefix=OPT -check-prefix=OPT-SI -check-prefix=OPT-SICIVI %s
2; RUN: opt -S -codegenprepare -mtriple=amdgcn-unknown-unknown -mcpu=bonaire < %s | FileCheck -check-prefix=OPT -check-prefix=OPT-CI -check-prefix=OPT-SICIVI %s
3; RUN: opt -S -codegenprepare -mtriple=amdgcn-unknown-unknown -mcpu=tonga -mattr=-flat-for-global < %s | FileCheck -check-prefix=OPT -check-prefix=OPT-VI -check-prefix=OPT-SICIVI %s
4; RUN: opt -S -codegenprepare -mtriple=amdgcn-unknown-unknown -mcpu=gfx900 < %s | FileCheck -check-prefix=OPT -check-prefix=OPT-GFX9 %s
5; RUN: llc -march=amdgcn -mcpu=tahiti -mattr=-promote-alloca -amdgpu-scalarize-global-loads=false -amdgpu-sroa=0 < %s | FileCheck -check-prefix=GCN -check-prefix=SI -check-prefix=SICIVI %s
6; RUN: llc -march=amdgcn -mcpu=bonaire -mattr=-promote-alloca -amdgpu-scalarize-global-loads=false -amdgpu-sroa=0 < %s | FileCheck -check-prefix=GCN -check-prefix=CI -check-prefix=SICIVI %s
7; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -amdgpu-scalarize-global-loads=false -mattr=-promote-alloca -amdgpu-sroa=0 < %s | FileCheck -check-prefix=GCN -check-prefix=VI -check-prefix=SICIVI %s
8; RUN: llc -march=amdgcn -mcpu=gfx900 -mattr=-promote-alloca -amdgpu-scalarize-global-loads=false -amdgpu-sroa=0 < %s | FileCheck -check-prefix=GCN -check-prefix=GFX9 %s
9
10target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5"
11
; Global (addrspace 1) i32 load at %in + 7 elements. The address %in.gep is
; computed in %entry but only dereferenced in %if, which is skipped when the
; llvm.amdgcn.mbcnt.lo result is 0. The IR directives below differ between the
; bonaire and tonga runs on whether the typed getelementptr is still present
; ahead of the branch.
12; OPT-LABEL: @test_sink_global_small_offset_i32(
13; OPT-CI-NOT: getelementptr i32, i32 addrspace(1)* %in
14; OPT-VI: getelementptr i32, i32 addrspace(1)* %in
15; OPT: br i1
16; OPT-CI: getelementptr i8,
17
18; GCN-LABEL: {{^}}test_sink_global_small_offset_i32:
19define amdgpu_kernel void @test_sink_global_small_offset_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
20entry:
21  %out.gep = getelementptr i32, i32 addrspace(1)* %out, i64 999999
22  %in.gep = getelementptr i32, i32 addrspace(1)* %in, i64 7
23  %tid = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) #0
24  %tmp0 = icmp eq i32 %tid, 0
25  br i1 %tmp0, label %endif, label %if
26
27if:
  ; Only user of %in.gep.
28  %tmp1 = load i32, i32 addrspace(1)* %in.gep
29  br label %endif
30
31endif:
  ; Loaded value when %if ran, otherwise 0.
32  %x = phi i32 [ %tmp1, %if ], [ 0, %entry ]
33  store i32 %x, i32 addrspace(1)* %out.gep
34  br label %done
35
36done:
37  ret void
38}
39
; Global (addrspace 1) i8 load at byte offset 65535, sign-extended to i32 in
; the %if block. The IR directives expect the getelementptr to stay in %entry
; ahead of the branch. The ISA directives expect the offset to be supplied
; through a register (plus an immediate offset:4095 on gfx900).
40; OPT-LABEL: @test_sink_global_small_max_i32_ds_offset(
41; OPT: %in.gep = getelementptr i8, i8 addrspace(1)* %in, i64 65535
42; OPT: br i1
43
44; GCN-LABEL: {{^}}test_sink_global_small_max_i32_ds_offset:
45; GCN: s_and_saveexec_b64
46; SICIVI: buffer_load_sbyte {{v[0-9]+}}, off, {{s\[[0-9]+:[0-9]+\]}}, s{{[0-9]+$}}
47
48; GFX9: v_mov_b32_e32 [[VOFFSET:v[0-9]+]], 0xf000{{$}}
49; GFX9: global_load_sbyte {{v[0-9]+}}, [[VOFFSET]], {{s\[[0-9]+:[0-9]+\]}} offset:4095{{$}}
50; GCN: {{^}}BB1_2:
51; GCN: s_or_b64 exec
52define amdgpu_kernel void @test_sink_global_small_max_i32_ds_offset(i32 addrspace(1)* %out, i8 addrspace(1)* %in) {
53entry:
54  %out.gep = getelementptr i32, i32 addrspace(1)* %out, i64 99999
55  %in.gep = getelementptr i8, i8 addrspace(1)* %in, i64 65535
56  %tid = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) #0
57  %tmp0 = icmp eq i32 %tid, 0
58  br i1 %tmp0, label %endif, label %if
59
60if:
61  %tmp1 = load i8, i8 addrspace(1)* %in.gep
62  %tmp2 = sext i8 %tmp1 to i32
63  br label %endif
64
65endif:
  ; Sign-extended byte when %if ran, otherwise 0.
66  %x = phi i32 [ %tmp2, %if ], [ 0, %entry ]
67  store i32 %x, i32 addrspace(1)* %out.gep
68  br label %done
69
70done:
71  ret void
72}
73
; Global (addrspace 1) i8 load at byte offset 4095, sign-extended to i32 in
; the %if block. The ISA directives expect the whole offset to appear as the
; immediate offset:4095 on the load instruction. There are no IR directives
; for this function.
74; GCN-LABEL: {{^}}test_sink_global_small_max_mubuf_offset:
75; GCN: s_and_saveexec_b64
76; SICIVI: buffer_load_sbyte {{v[0-9]+}}, off, {{s\[[0-9]+:[0-9]+\]}}, 0 offset:4095{{$}}
77; GFX9: v_mov_b32_e32 [[ZERO:v[0-9]+]], 0{{$}}
78; GFX9: global_load_sbyte {{v[0-9]+}}, [[ZERO]], {{s\[[0-9]+:[0-9]+\]}} offset:4095{{$}}
79; GCN: {{^}}BB2_2:
80; GCN: s_or_b64 exec
81define amdgpu_kernel void @test_sink_global_small_max_mubuf_offset(i32 addrspace(1)* %out, i8 addrspace(1)* %in) {
82entry:
83  %out.gep = getelementptr i32, i32 addrspace(1)* %out, i32 1024
84  %in.gep = getelementptr i8, i8 addrspace(1)* %in, i64 4095
85  %tid = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) #0
86  %tmp0 = icmp eq i32 %tid, 0
87  br i1 %tmp0, label %endif, label %if
88
89if:
90  %tmp1 = load i8, i8 addrspace(1)* %in.gep
91  %tmp2 = sext i8 %tmp1 to i32
92  br label %endif
93
94endif:
  ; Sign-extended byte when %if ran, otherwise 0.
95  %x = phi i32 [ %tmp2, %if ], [ 0, %entry ]
96  store i32 %x, i32 addrspace(1)* %out.gep
97  br label %done
98
99done:
100  ret void
101}
102
; Global (addrspace 1) i8 load at byte offset 4096, sign-extended to i32 in
; the %if block. The ISA directives expect the offset to be carried in a
; register with no immediate offset on the load (0x1000 materialized in a VGPR
; on gfx900).
103; GCN-LABEL: {{^}}test_sink_global_small_max_plus_1_mubuf_offset:
104; GCN: s_and_saveexec_b64
105; SICIVI: buffer_load_sbyte {{v[0-9]+}}, off, {{s\[[0-9]+:[0-9]+\]}}, s{{[0-9]+$}}
106; GFX9: v_mov_b32_e32 [[VOFFSET:v[0-9]+]], 0x1000{{$}}
107; GFX9: global_load_sbyte {{v[0-9]+}}, [[VOFFSET]], {{s\[[0-9]+:[0-9]+\]$}}
108; GCN: {{^}}BB3_2:
109; GCN: s_or_b64 exec
110define amdgpu_kernel void @test_sink_global_small_max_plus_1_mubuf_offset(i32 addrspace(1)* %out, i8 addrspace(1)* %in) {
111entry:
112  %out.gep = getelementptr i32, i32 addrspace(1)* %out, i64 99999
113  %in.gep = getelementptr i8, i8 addrspace(1)* %in, i64 4096
114  %tid = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) #0
115  %tmp0 = icmp eq i32 %tid, 0
116  br i1 %tmp0, label %endif, label %if
117
118if:
119  %tmp1 = load i8, i8 addrspace(1)* %in.gep
120  %tmp2 = sext i8 %tmp1 to i32
121  br label %endif
122
123endif:
  ; Sign-extended byte when %if ran, otherwise 0.
124  %x = phi i32 [ %tmp2, %if ], [ 0, %entry ]
125  store i32 %x, i32 addrspace(1)* %out.gep
126  br label %done
127
128done:
129  ret void
130}
131
; Private (addrspace 5) accesses through element 1022 of a [512 x i32] alloca.
; The address is computed in %entry and used by a volatile store and load in
; %if and by another volatile load in %endif. The IR directives expect no
; typed getelementptr ahead of the branch and an i8 getelementptr after it.
; The ISA directives expect the accesses to use an immediate offset:4092 with
; no VGPR address operand.
132; OPT-LABEL: @test_sink_scratch_small_offset_i32(
133; OPT-NOT:  getelementptr [512 x i32]
134; OPT: br i1
135; OPT: getelementptr i8,
136
137; GCN-LABEL: {{^}}test_sink_scratch_small_offset_i32:
138; GCN: s_and_saveexec_b64
139; GCN: buffer_store_dword {{v[0-9]+}}, off, {{s\[[0-9]+:[0-9]+\]}}, 0 offset:4092{{$}}
140; GCN: buffer_load_dword {{v[0-9]+}}, off, {{s\[[0-9]+:[0-9]+\]}}, 0 offset:4092{{$}}
141; GCN: {{^}}BB4_2:
142define amdgpu_kernel void @test_sink_scratch_small_offset_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in, i32 %arg) {
143entry:
144  %alloca = alloca [512 x i32], align 4, addrspace(5)
145  %out.gep.0 = getelementptr i32, i32 addrspace(1)* %out, i64 999998
146  %out.gep.1 = getelementptr i32, i32 addrspace(1)* %out, i64 999999
  ; %add.arg has no uses in this function.
147  %add.arg = add i32 %arg, 8
148  %alloca.gep = getelementptr [512 x i32], [512 x i32] addrspace(5)* %alloca, i32 0, i32 1022
149  %tid = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) #0
150  %tmp0 = icmp eq i32 %tid, 0
151  br i1 %tmp0, label %endif, label %if
152
153if:
154  store volatile i32 123, i32 addrspace(5)* %alloca.gep
155  %tmp1 = load volatile i32, i32 addrspace(5)* %alloca.gep
156  br label %endif
157
158endif:
159  %x = phi i32 [ %tmp1, %if ], [ 0, %entry ]
160  store i32 %x, i32 addrspace(1)* %out.gep.0
  ; Second use of %alloca.gep, outside the conditional block.
161  %load = load volatile i32, i32 addrspace(5)* %alloca.gep
162  store i32 %load, i32 addrspace(1)* %out.gep.1
163  br label %done
164
165done:
166  ret void
167}
168
; Private (addrspace 5) accesses through element 1023 of a [512 x i32] alloca,
; one element further than in test_sink_scratch_small_offset_i32. The IR
; directives are the same as for that test. The ISA directives expect a VGPR
; holding 4 as the address operand (offen) together with offset:4092.
169; This ends up not fitting due to the reserved 4 bytes at offset 0
170; OPT-LABEL: @test_sink_scratch_small_offset_i32_reserved(
171; OPT-NOT:  getelementptr [512 x i32]
172; OPT: br i1
173; OPT: getelementptr i8,
174
175; GCN-LABEL: {{^}}test_sink_scratch_small_offset_i32_reserved:
176; GCN: s_and_saveexec_b64
177; GCN: v_mov_b32_e32 [[BASE_FI0:v[0-9]+]], 4
178; GCN: buffer_store_dword {{v[0-9]+}}, [[BASE_FI0]], {{s\[[0-9]+:[0-9]+\]}}, 0 offen offset:4092{{$}}
179; GCN: v_mov_b32_e32 [[BASE_FI1:v[0-9]+]], 4
180; GCN: buffer_load_dword {{v[0-9]+}}, [[BASE_FI1]], {{s\[[0-9]+:[0-9]+\]}}, 0 offen offset:4092{{$}}
181; GCN: {{^BB[0-9]+}}_2:
182
183define amdgpu_kernel void @test_sink_scratch_small_offset_i32_reserved(i32 addrspace(1)* %out, i32 addrspace(1)* %in, i32 %arg) {
184entry:
185  %alloca = alloca [512 x i32], align 4, addrspace(5)
186  %out.gep.0 = getelementptr i32, i32 addrspace(1)* %out, i64 999998
187  %out.gep.1 = getelementptr i32, i32 addrspace(1)* %out, i64 999999
  ; %add.arg has no uses in this function.
188  %add.arg = add i32 %arg, 8
189  %alloca.gep = getelementptr [512 x i32], [512 x i32] addrspace(5)* %alloca, i32 0, i32 1023
190  %tid = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) #0
191  %tmp0 = icmp eq i32 %tid, 0
192  br i1 %tmp0, label %endif, label %if
193
194if:
195  store volatile i32 123, i32 addrspace(5)* %alloca.gep
196  %tmp1 = load volatile i32, i32 addrspace(5)* %alloca.gep
197  br label %endif
198
199endif:
200  %x = phi i32 [ %tmp1, %if ], [ 0, %entry ]
201  store i32 %x, i32 addrspace(1)* %out.gep.0
  ; Second use of %alloca.gep, outside the conditional block.
202  %load = load volatile i32, i32 addrspace(5)* %alloca.gep
203  store i32 %load, i32 addrspace(1)* %out.gep.1
204  br label %done
205
206done:
207  ret void
208}
209
; Private (addrspace 5) accesses through element 1024 of a [512 x i32] alloca.
; The IR directives expect the typed getelementptr to stay in %entry and no
; ptrtoint to appear after the branch. The ISA directives expect the accesses
; to take a VGPR address operand (offen) with no immediate offset.
210; OPT-LABEL: @test_no_sink_scratch_large_offset_i32(
211; OPT: %alloca.gep = getelementptr [512 x i32], [512 x i32] addrspace(5)* %alloca, i32 0, i32 1024
212; OPT: br i1
213; OPT-NOT: ptrtoint
214
215; GCN-LABEL: {{^}}test_no_sink_scratch_large_offset_i32:
216; GCN: s_and_saveexec_b64
217; GCN: buffer_store_dword {{v[0-9]+}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, 0 offen{{$}}
218; GCN: buffer_load_dword {{v[0-9]+}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, 0 offen{{$}}
219; GCN: {{^BB[0-9]+}}_2:
220define amdgpu_kernel void @test_no_sink_scratch_large_offset_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in, i32 %arg) {
221entry:
222  %alloca = alloca [512 x i32], align 4, addrspace(5)
223  %out.gep.0 = getelementptr i32, i32 addrspace(1)* %out, i64 999998
224  %out.gep.1 = getelementptr i32, i32 addrspace(1)* %out, i64 999999
  ; %add.arg has no uses in this function.
225  %add.arg = add i32 %arg, 8
226  %alloca.gep = getelementptr [512 x i32], [512 x i32] addrspace(5)* %alloca, i32 0, i32 1024
227  %tid = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) #0
228  %tmp0 = icmp eq i32 %tid, 0
229  br i1 %tmp0, label %endif, label %if
230
231if:
232  store volatile i32 123, i32 addrspace(5)* %alloca.gep
233  %tmp1 = load volatile i32, i32 addrspace(5)* %alloca.gep
234  br label %endif
235
236endif:
237  %x = phi i32 [ %tmp1, %if ], [ 0, %entry ]
238  store i32 %x, i32 addrspace(1)* %out.gep.0
  ; Second use of %alloca.gep, outside the conditional block.
239  %load = load volatile i32, i32 addrspace(5)* %alloca.gep
240  store i32 %load, i32 addrspace(1)* %out.gep.1
241  br label %done
242
243done:
244  ret void
245}
246
; Global (addrspace 1) i32 load whose element index is the kernel argument
; %offset zero-extended to i64, rather than a constant. Only ISA directives
; are present, and only for the bonaire and tonga runs.
247; GCN-LABEL: {{^}}test_sink_global_vreg_sreg_i32:
248; GCN: s_and_saveexec_b64
249; CI: buffer_load_dword {{v[0-9]+}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
250; VI: flat_load_dword v{{[0-9]+}}, v[{{[0-9]+:[0-9]+}}]
251; GCN: {{^BB[0-9]+}}_2:
252define amdgpu_kernel void @test_sink_global_vreg_sreg_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in, i32 %offset) {
253entry:
254  %offset.ext = zext i32 %offset to i64
255  %out.gep = getelementptr i32, i32 addrspace(1)* %out, i64 999999
256  %in.gep = getelementptr i32, i32 addrspace(1)* %in, i64 %offset.ext
257  %tid = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) #0
258  %tmp0 = icmp eq i32 %tid, 0
259  br i1 %tmp0, label %endif, label %if
260
261if:
262  %tmp1 = load i32, i32 addrspace(1)* %in.gep
263  br label %endif
264
265endif:
  ; Loaded value when %if ran, otherwise 0.
266  %x = phi i32 [ %tmp1, %if ], [ 0, %entry ]
267  store i32 %x, i32 addrspace(1)* %out.gep
268  br label %done
269
270done:
271  ret void
272}
273
; Constant (addrspace 4) i32 load at %in + 7 elements, dereferenced only in
; the %if block. The IR directives expect no typed addrspace(4) getelementptr
; ahead of the branch. The tahiti ISA directive expects the offset as the
; s_load_dword immediate 0x7.
274; OPT-LABEL: @test_sink_constant_small_offset_i32(
275; OPT-NOT:  getelementptr i32, i32 addrspace(4)*
276; OPT: br i1
277
278; GCN-LABEL: {{^}}test_sink_constant_small_offset_i32:
279; GCN: s_and_saveexec_b64
280; SI: s_load_dword s{{[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, 0x7{{$}}
281; GCN: s_or_b64 exec, exec
282define amdgpu_kernel void @test_sink_constant_small_offset_i32(i32 addrspace(1)* %out, i32 addrspace(4)* %in) {
283entry:
284  %out.gep = getelementptr i32, i32 addrspace(1)* %out, i64 999999
285  %in.gep = getelementptr i32, i32 addrspace(4)* %in, i64 7
286  %tid = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) #0
287  %tmp0 = icmp eq i32 %tid, 0
288  br i1 %tmp0, label %endif, label %if
289
290if:
291  %tmp1 = load i32, i32 addrspace(4)* %in.gep
292  br label %endif
293
294endif:
  ; Loaded value when %if ran, otherwise 0.
295  %x = phi i32 [ %tmp1, %if ], [ 0, %entry ]
296  store i32 %x, i32 addrspace(1)* %out.gep
297  br label %done
298
299done:
300  ret void
301}
302
; Constant (addrspace 4) i32 load at %in + 255 elements, dereferenced only in
; the %if block. The IR directives expect no typed addrspace(4) getelementptr
; ahead of the branch. The tahiti ISA directive expects the offset as the
; s_load_dword immediate 0xff.
303; OPT-LABEL: @test_sink_constant_max_8_bit_offset_i32(
304; OPT-NOT:  getelementptr i32, i32 addrspace(4)*
305; OPT: br i1
306
307; GCN-LABEL: {{^}}test_sink_constant_max_8_bit_offset_i32:
308; GCN: s_and_saveexec_b64
309; SI: s_load_dword s{{[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, 0xff{{$}}
310; GCN: s_or_b64 exec, exec
311define amdgpu_kernel void @test_sink_constant_max_8_bit_offset_i32(i32 addrspace(1)* %out, i32 addrspace(4)* %in) {
312entry:
313  %out.gep = getelementptr i32, i32 addrspace(1)* %out, i64 999999
314  %in.gep = getelementptr i32, i32 addrspace(4)* %in, i64 255
315  %tid = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) #0
316  %tmp0 = icmp eq i32 %tid, 0
317  br i1 %tmp0, label %endif, label %if
318
319if:
320  %tmp1 = load i32, i32 addrspace(4)* %in.gep
321  br label %endif
322
323endif:
  ; Loaded value when %if ran, otherwise 0.
324  %x = phi i32 [ %tmp1, %if ], [ 0, %entry ]
325  store i32 %x, i32 addrspace(1)* %out.gep
326  br label %done
327
328done:
329  ret void
330}
331
; Constant (addrspace 4) i32 load at %in + 256 elements, dereferenced only in
; the %if block. The IR directives expect the typed getelementptr to remain
; ahead of the branch on tahiti and to be absent there on bonaire and tonga.
; The tahiti ISA directives expect the byte offset 0x400 to be materialized in
; an SGPR that is then used as the s_load_dword offset operand.
332; OPT-LABEL: @test_sink_constant_max_8_bit_offset_p1_i32(
333; OPT-SI:  getelementptr i32, i32 addrspace(4)*
334; OPT-CI-NOT:  getelementptr i32, i32 addrspace(4)*
335; OPT-VI-NOT:  getelementptr i32, i32 addrspace(4)*
336; OPT: br i1
337
338; GCN-LABEL: {{^}}test_sink_constant_max_8_bit_offset_p1_i32:
339; GCN: s_and_saveexec_b64
340; SI: s_movk_i32 [[OFFSET:s[0-9]+]], 0x400
341
342; SI: s_load_dword s{{[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, [[OFFSET]]{{$}}
343; GCN: s_or_b64 exec, exec
344define amdgpu_kernel void @test_sink_constant_max_8_bit_offset_p1_i32(i32 addrspace(1)* %out, i32 addrspace(4)* %in) {
345entry:
346  %out.gep = getelementptr i32, i32 addrspace(1)* %out, i64 999999
347  %in.gep = getelementptr i32, i32 addrspace(4)* %in, i64 256
348  %tid = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) #0
349  %tmp0 = icmp eq i32 %tid, 0
350  br i1 %tmp0, label %endif, label %if
351
352if:
353  %tmp1 = load i32, i32 addrspace(4)* %in.gep
354  br label %endif
355
356endif:
  ; Loaded value when %if ran, otherwise 0.
357  %x = phi i32 [ %tmp1, %if ], [ 0, %entry ]
358  store i32 %x, i32 addrspace(1)* %out.gep
359  br label %done
360
361done:
362  ret void
363}
364
; Constant (addrspace 4) i32 load at %in + 4294967295 elements, dereferenced
; only in the %if block. The IR directives expect the typed getelementptr to
; remain ahead of the branch on tahiti and to be absent there on bonaire. The
; ISA directives expect a 64-bit scalar add of the offset followed by a load
; at offset 0x0 on tahiti and tonga, and a single load with the immediate
; 0xffffffff on bonaire.
365; OPT-LABEL: @test_sink_constant_max_32_bit_offset_i32(
366; OPT-SI: getelementptr i32, i32 addrspace(4)*
367; OPT-CI-NOT: getelementptr i32, i32 addrspace(4)*
368; OPT: br i1
369
370; GCN-LABEL: {{^}}test_sink_constant_max_32_bit_offset_i32:
371; GCN: s_and_saveexec_b64
372; SI: s_add_u32 s{{[0-9]+}}, s{{[0-9]+}}, -4{{$}}
373; SI: s_addc_u32 s{{[0-9]+}}, s{{[0-9]+}}, 3{{$}}
374; SI: s_load_dword s{{[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, 0x0{{$}}
375
376; VI: s_add_u32 s{{[0-9]+}}, s{{[0-9]+}}, -4{{$}}
377; VI: s_addc_u32 s{{[0-9]+}}, s{{[0-9]+}}, 3{{$}}
378; VI: s_load_dword s{{[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, 0x0{{$}}
379
380; CI: s_load_dword s{{[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, 0xffffffff{{$}}
381
382; GCN: s_or_b64 exec, exec
383define amdgpu_kernel void @test_sink_constant_max_32_bit_offset_i32(i32 addrspace(1)* %out, i32 addrspace(4)* %in) {
384entry:
385  %out.gep = getelementptr i32, i32 addrspace(1)* %out, i64 999999
386  %in.gep = getelementptr i32, i32 addrspace(4)* %in, i64 4294967295
387  %tid = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) #0
388  %tmp0 = icmp eq i32 %tid, 0
389  br i1 %tmp0, label %endif, label %if
390
391if:
392  %tmp1 = load i32, i32 addrspace(4)* %in.gep
393  br label %endif
394
395endif:
  ; Loaded value when %if ran, otherwise 0.
396  %x = phi i32 [ %tmp1, %if ], [ 0, %entry ]
397  store i32 %x, i32 addrspace(1)* %out.gep
398  br label %done
399
400done:
401  ret void
402}
403
; Constant (addrspace 4) i32 load at %in + 17179869181 elements, dereferenced
; only in the %if block. The IR directives expect the typed getelementptr to
; remain ahead of the branch in every run. The ISA directives expect a 64-bit
; scalar add (s_add_u32 / s_addc_u32) before the load.
404; OPT-LABEL: @test_sink_constant_max_32_bit_offset_p1_i32(
405; OPT: getelementptr i32, i32 addrspace(4)*
406; OPT: br i1
407
408; GCN-LABEL: {{^}}test_sink_constant_max_32_bit_offset_p1_i32:
409; GCN: s_and_saveexec_b64
410; GCN: s_add_u32
411; GCN: s_addc_u32
412; SI: s_load_dword s{{[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, 0x0{{$}}
413; GCN: s_or_b64 exec, exec
414define amdgpu_kernel void @test_sink_constant_max_32_bit_offset_p1_i32(i32 addrspace(1)* %out, i32 addrspace(4)* %in) {
415entry:
416  %out.gep = getelementptr i32, i32 addrspace(1)* %out, i64 999999
417  %in.gep = getelementptr i32, i32 addrspace(4)* %in, i64 17179869181
418  %tid = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) #0
419  %tmp0 = icmp eq i32 %tid, 0
420  br i1 %tmp0, label %endif, label %if
421
422if:
423  %tmp1 = load i32, i32 addrspace(4)* %in.gep
424  br label %endif
425
426endif:
  ; Loaded value when %if ran, otherwise 0.
427  %x = phi i32 [ %tmp1, %if ], [ 0, %entry ]
428  store i32 %x, i32 addrspace(1)* %out.gep
429  br label %done
430
431done:
432  ret void
433}
434
; Constant (addrspace 4) i32 load at %in + 262143 elements (byte offset
; 0xffffc), dereferenced only in the %if block. Only ISA directives are
; present. They expect the byte offset in an SGPR on tahiti, the dword
; immediate 0x3ffff on bonaire and the byte immediate 0xffffc on tonga.
435; GCN-LABEL: {{^}}test_sink_constant_max_20_bit_byte_offset_i32:
436; GCN: s_and_saveexec_b64
437; SI: s_mov_b32 [[OFFSET:s[0-9]+]], 0xffffc{{$}}
438; SI: s_load_dword s{{[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, [[OFFSET]]{{$}}
439
440; CI: s_load_dword s{{[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, 0x3ffff{{$}}
441; VI: s_load_dword s{{[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, 0xffffc{{$}}
442
443; GCN: s_or_b64 exec, exec
444define amdgpu_kernel void @test_sink_constant_max_20_bit_byte_offset_i32(i32 addrspace(1)* %out, i32 addrspace(4)* %in) {
445entry:
446  %out.gep = getelementptr i32, i32 addrspace(1)* %out, i64 999999
447  %in.gep = getelementptr i32, i32 addrspace(4)* %in, i64 262143
448  %tid = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) #0
449  %tmp0 = icmp eq i32 %tid, 0
450  br i1 %tmp0, label %endif, label %if
451
452if:
453  %tmp1 = load i32, i32 addrspace(4)* %in.gep
454  br label %endif
455
456endif:
  ; Loaded value when %if ran, otherwise 0.
457  %x = phi i32 [ %tmp1, %if ], [ 0, %entry ]
458  store i32 %x, i32 addrspace(1)* %out.gep
459  br label %done
460
461done:
462  ret void
463}
464
; Constant (addrspace 4) i32 load at %in + 262144 elements (byte offset
; 0x100000), dereferenced only in the %if block. The IR directives expect the
; typed getelementptr to remain ahead of the branch on tahiti and tonga and to
; be absent there on bonaire. The ISA directives expect the byte offset in an
; SGPR on tahiti and tonga and the dword immediate 0x40000 on bonaire.
465; OPT-LABEL: @test_sink_constant_max_20_bit_byte_offset_p1_i32(
466; OPT-SI: getelementptr i32, i32 addrspace(4)*
467; OPT-CI-NOT: getelementptr i32, i32 addrspace(4)*
468; OPT-VI: getelementptr i32, i32 addrspace(4)*
469; OPT: br i1
470
471; GCN-LABEL: {{^}}test_sink_constant_max_20_bit_byte_offset_p1_i32:
472; GCN: s_and_saveexec_b64
473; SI: s_mov_b32 [[OFFSET:s[0-9]+]], 0x100000{{$}}
474; SI: s_load_dword s{{[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, [[OFFSET]]{{$}}
475
476; CI: s_load_dword s{{[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, 0x40000{{$}}
477
478; VI: s_mov_b32 [[OFFSET:s[0-9]+]], 0x100000{{$}}
479; VI: s_load_dword s{{[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, [[OFFSET]]{{$}}
480
481; GCN: s_or_b64 exec, exec
482define amdgpu_kernel void @test_sink_constant_max_20_bit_byte_offset_p1_i32(i32 addrspace(1)* %out, i32 addrspace(4)* %in) {
483entry:
484  %out.gep = getelementptr i32, i32 addrspace(1)* %out, i64 999999
485  %in.gep = getelementptr i32, i32 addrspace(4)* %in, i64 262144
486  %tid = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) #0
487  %tmp0 = icmp eq i32 %tid, 0
488  br i1 %tmp0, label %endif, label %if
489
490if:
491  %tmp1 = load i32, i32 addrspace(4)* %in.gep
492  br label %endif
493
494endif:
  ; Loaded value when %if ran, otherwise 0.
495  %x = phi i32 [ %tmp1, %if ], [ 0, %entry ]
496  store i32 %x, i32 addrspace(1)* %out.gep
497  br label %done
498
499done:
500  ret void
501}
502
; Two float loads from a %struct.foo in local memory (addrspace 3), reading
; elements 0 and 2 of the second [3 x float] member, i.e. floats 3 and 5 of
; the struct. Both addresses are computed in %entry and used in %bb32. The IR
; directive expects an inbounds i8 getelementptr. The ISA directive expects
; both loads to be covered by one ds_read2_b32 with offset0:3 offset1:5.
503%struct.foo = type { [3 x float], [3 x float] }
504
505; OPT-LABEL: @sink_ds_address(
506; OPT: getelementptr inbounds i8,
507
508; GCN-LABEL: {{^}}sink_ds_address:
509; GCN: s_load_dword [[SREG1:s[0-9]+]],
510; GCN: v_mov_b32_e32 [[VREG1:v[0-9]+]], [[SREG1]]
511; GCN-DAG: ds_read2_b32 v[{{[0-9]+:[0-9]+}}], [[VREG1]] offset0:3 offset1:5
512define amdgpu_kernel void @sink_ds_address(%struct.foo addrspace(3)* nocapture %ptr) nounwind {
513entry:
514  %x = getelementptr inbounds %struct.foo, %struct.foo addrspace(3)* %ptr, i32 0, i32 1, i32 0
515  %y = getelementptr inbounds %struct.foo, %struct.foo addrspace(3)* %ptr, i32 0, i32 1, i32 2
516  br label %bb32
517
518bb32:
519  %a = load float, float addrspace(3)* %x, align 4
520  %b = load float, float addrspace(3)* %y, align 4
521  %cmp = fcmp one float %a, %b
522  br i1 %cmp, label %bb34, label %bb33
523
524bb33:
525  unreachable
526
527bb34:
528  unreachable
529}
530
; Constant (addrspace 4) i32 load with align 1 through an i8 getelementptr at
; byte offset 4095, bitcast to an i32 pointer inside the %if block. The IR
; directives expect an i8 getelementptr with offset 4095 to appear inside the
; %if block, after the branch. There are no ISA directives for this function.
531; Address offset is not a multiple of 4. This is a valid mubuf offset,
532; but not smrd.
533
534; OPT-LABEL: @test_sink_constant_small_max_mubuf_offset_load_i32_align_1(
535; OPT: br i1 %tmp0,
536; OPT: if:
537; OPT: getelementptr i8, {{.*}} 4095
538define amdgpu_kernel void @test_sink_constant_small_max_mubuf_offset_load_i32_align_1(i32 addrspace(1)* %out, i8 addrspace(4)* %in) {
539entry:
540  %out.gep = getelementptr i32, i32 addrspace(1)* %out, i32 1024
541  %in.gep = getelementptr i8, i8 addrspace(4)* %in, i64 4095
542  %tid = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) #0
543  %tmp0 = icmp eq i32 %tid, 0
544  br i1 %tmp0, label %endif, label %if
545
546if:
547  %bitcast = bitcast i8 addrspace(4)* %in.gep to i32 addrspace(4)*
548  %tmp1 = load i32, i32 addrspace(4)* %bitcast, align 1
549  br label %endif
550
551endif:
  ; Loaded value when %if ran, otherwise 0.
552  %x = phi i32 [ %tmp1, %if ], [ 0, %entry ]
553  store i32 %x, i32 addrspace(1)* %out.gep
554  br label %done
555
556done:
557  ret void
558}
559
; Local (addrspace 3) atomicrmw add through %in + 7 i32 elements, executed
; only in the %if block. The IR directives expect the address to be rebuilt as
; a bitcast of %in to an i8 pointer, an i8 getelementptr named %sunkaddr with
; byte offset 28, and a bitcast back, with the atomicrmw using that pointer.
560; OPT-LABEL: @test_sink_local_small_offset_atomicrmw_i32(
561; OPT: %0 = bitcast i32 addrspace(3)* %in to i8 addrspace(3)*
562; OPT: %sunkaddr = getelementptr i8, i8 addrspace(3)* %0, i32 28
563; OPT: %1 = bitcast i8 addrspace(3)* %sunkaddr to i32 addrspace(3)*
564; OPT: %tmp1 = atomicrmw add i32 addrspace(3)* %1, i32 2 seq_cst
565define amdgpu_kernel void @test_sink_local_small_offset_atomicrmw_i32(i32 addrspace(3)* %out, i32 addrspace(3)* %in) {
566entry:
567  %out.gep = getelementptr i32, i32 addrspace(3)* %out, i32 999999
568  %in.gep = getelementptr i32, i32 addrspace(3)* %in, i32 7
569  %tid = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) #0
570  %tmp0 = icmp eq i32 %tid, 0
571  br i1 %tmp0, label %endif, label %if
572
573if:
574  %tmp1 = atomicrmw add i32 addrspace(3)* %in.gep, i32 2 seq_cst
575  br label %endif
576
577endif:
  ; atomicrmw result when %if ran, otherwise 0.
578  %x = phi i32 [ %tmp1, %if ], [ 0, %entry ]
579  store i32 %x, i32 addrspace(3)* %out.gep
580  br label %done
581
582done:
583  ret void
584}
585
; Local (addrspace 3) cmpxchg whose pointer operand is %in + 7 i32 elements,
; executed only in the %if block. The IR directives expect the address to be
; rebuilt as a bitcast of %in to an i8 pointer, an i8 getelementptr named
; %sunkaddr with byte offset 28, and a bitcast back, with the cmpxchg using
; that pointer.
586; OPT-LABEL: @test_sink_local_small_offset_cmpxchg_i32(
587; OPT: %0 = bitcast i32 addrspace(3)* %in to i8 addrspace(3)*
588; OPT: %sunkaddr = getelementptr i8, i8 addrspace(3)* %0, i32 28
589; OPT: %1 = bitcast i8 addrspace(3)* %sunkaddr to i32 addrspace(3)*
590; OPT: %tmp1.struct = cmpxchg i32 addrspace(3)* %1, i32 undef, i32 2 seq_cst monotonic
591define amdgpu_kernel void @test_sink_local_small_offset_cmpxchg_i32(i32 addrspace(3)* %out, i32 addrspace(3)* %in) {
592entry:
593  %out.gep = getelementptr i32, i32 addrspace(3)* %out, i32 999999
594  %in.gep = getelementptr i32, i32 addrspace(3)* %in, i32 7
595  %tid = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) #0
596  %tmp0 = icmp eq i32 %tid, 0
597  br i1 %tmp0, label %endif, label %if
598
599if:
600  %tmp1.struct = cmpxchg i32 addrspace(3)* %in.gep, i32 undef, i32 2 seq_cst monotonic
  ; Field 0 of the cmpxchg result pair is the i32 value.
601  %tmp1 = extractvalue { i32, i1 } %tmp1.struct, 0
602  br label %endif
603
604endif:
605  %x = phi i32 [ %tmp1, %if ], [ 0, %entry ]
606  store i32 %x, i32 addrspace(3)* %out.gep
607  br label %done
608
609done:
610  ret void
611}
612
; Here %in.gep is the compare-value operand of the cmpxchg, not its pointer
; operand (which is undef). The IR directives expect the typed getelementptr
; to stay in %entry ahead of the branch and the cmpxchg to keep using %in.gep
; unchanged.
613; OPT-LABEL: @test_wrong_operand_local_small_offset_cmpxchg_i32(
614; OPT: %in.gep = getelementptr i32, i32 addrspace(3)* %in, i32 7
615; OPT: br i1
616; OPT: cmpxchg i32 addrspace(3)* addrspace(3)* undef, i32 addrspace(3)* %in.gep, i32 addrspace(3)* undef seq_cst monotonic
617define amdgpu_kernel void @test_wrong_operand_local_small_offset_cmpxchg_i32(i32 addrspace(3)* addrspace(3)* %out, i32 addrspace(3)* %in) {
618entry:
619  %out.gep = getelementptr i32 addrspace(3)*, i32 addrspace(3)* addrspace(3)* %out, i32 999999
620  %in.gep = getelementptr i32, i32 addrspace(3)* %in, i32 7
621  %tid = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) #0
622  %tmp0 = icmp eq i32 %tid, 0
623  br i1 %tmp0, label %endif, label %if
624
625if:
626  %tmp1.struct = cmpxchg i32 addrspace(3)* addrspace(3)* undef, i32 addrspace(3)* %in.gep, i32 addrspace(3)* undef seq_cst monotonic
627  %tmp1 = extractvalue { i32 addrspace(3)*, i1 } %tmp1.struct, 0
628  br label %endif
629
630endif:
  ; Pointer value from the cmpxchg when %if ran, otherwise null.
631  %x = phi i32 addrspace(3)* [ %tmp1, %if ], [ null, %entry ]
632  store i32 addrspace(3)* %x, i32 addrspace(3)* addrspace(3)* %out.gep
633  br label %done
634
635done:
636  ret void
637}
638
; Call to llvm.amdgcn.atomic.inc on a local (addrspace 3) pointer at %in + 7
; i32 elements, executed only in the %if block. The IR directives expect the
; address to be rebuilt as a bitcast of %in to an i8 pointer, an i8
; getelementptr named %sunkaddr with byte offset 28, and a bitcast back, with
; the intrinsic call using that pointer.
639; OPT-LABEL: @test_sink_local_small_offset_atomic_inc_i32(
640; OPT: %0 = bitcast i32 addrspace(3)* %in to i8 addrspace(3)*
641; OPT: %sunkaddr = getelementptr i8, i8 addrspace(3)* %0, i32 28
642; OPT: %1 = bitcast i8 addrspace(3)* %sunkaddr to i32 addrspace(3)*
643; OPT: %tmp1 = call i32 @llvm.amdgcn.atomic.inc.i32.p3i32(i32 addrspace(3)* %1, i32 2, i32 0, i32 0, i1 false)
644define amdgpu_kernel void @test_sink_local_small_offset_atomic_inc_i32(i32 addrspace(3)* %out, i32 addrspace(3)* %in) {
645entry:
646  %out.gep = getelementptr i32, i32 addrspace(3)* %out, i32 999999
647  %in.gep = getelementptr i32, i32 addrspace(3)* %in, i32 7
648  %tid = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) #0
649  %tmp0 = icmp eq i32 %tid, 0
650  br i1 %tmp0, label %endif, label %if
651
652if:
653  %tmp1 = call i32 @llvm.amdgcn.atomic.inc.i32.p3i32(i32 addrspace(3)* %in.gep, i32 2, i32 0, i32 0, i1 false)
654  br label %endif
655
656endif:
  ; Intrinsic result when %if ran, otherwise 0.
657  %x = phi i32 [ %tmp1, %if ], [ 0, %entry ]
658  store i32 %x, i32 addrspace(3)* %out.gep
659  br label %done
660
661done:
662  ret void
663}
664
; Call to llvm.amdgcn.atomic.dec on a local (addrspace 3) pointer at %in + 7
; i32 elements, executed only in the %if block. The IR directives expect the
; address to be rebuilt as a bitcast of %in to an i8 pointer, an i8
; getelementptr named %sunkaddr with byte offset 28, and a bitcast back, with
; the intrinsic call using that pointer.
665; OPT-LABEL: @test_sink_local_small_offset_atomic_dec_i32(
666; OPT: %0 = bitcast i32 addrspace(3)* %in to i8 addrspace(3)*
667; OPT: %sunkaddr = getelementptr i8, i8 addrspace(3)* %0, i32 28
668; OPT: %1 = bitcast i8 addrspace(3)* %sunkaddr to i32 addrspace(3)*
669; OPT: %tmp1 = call i32 @llvm.amdgcn.atomic.dec.i32.p3i32(i32 addrspace(3)* %1, i32 2, i32 0, i32 0, i1 false)
670define amdgpu_kernel void @test_sink_local_small_offset_atomic_dec_i32(i32 addrspace(3)* %out, i32 addrspace(3)* %in) {
671entry:
672  %out.gep = getelementptr i32, i32 addrspace(3)* %out, i32 999999
673  %in.gep = getelementptr i32, i32 addrspace(3)* %in, i32 7
674  %tid = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) #0
675  %tmp0 = icmp eq i32 %tid, 0
676  br i1 %tmp0, label %endif, label %if
677
678if:
679  %tmp1 = call i32 @llvm.amdgcn.atomic.dec.i32.p3i32(i32 addrspace(3)* %in.gep, i32 2, i32 0, i32 0, i1 false)
680  br label %endif
681
682endif:
  ; Intrinsic result when %if ran, otherwise 0.
683  %x = phi i32 [ %tmp1, %if ], [ 0, %entry ]
684  store i32 %x, i32 addrspace(3)* %out.gep
685  br label %done
686
687done:
688  ret void
689}
690
; Global (addrspace 1) i8 load at byte offset -4096, sign-extended to i32 in
; the %if block. On tahiti, bonaire and tonga the IR directives expect the
; getelementptr to stay in %entry, followed by the branch and then a load
; through %in.gep. On gfx900 they expect an i8 getelementptr named %sunkaddr
; after the branch, and the ISA directives expect offset:-4096 as the
; immediate on the global load.
691; OPT-LABEL: @test_sink_global_small_min_scratch_global_offset(
692; OPT-SICIVI: %in.gep = getelementptr i8, i8 addrspace(1)* %in, i64 -4096
693; OPT-SICIVI: br
694; OPT-SICIVI: %tmp1 = load i8, i8 addrspace(1)* %in.gep
695
696; OPT-GFX9: br
697; OPT-GFX9: %sunkaddr = getelementptr i8, i8 addrspace(1)* %in, i64 -4096
698; OPT-GFX9: load i8, i8 addrspace(1)* %sunkaddr
699
700; GCN-LABEL: {{^}}test_sink_global_small_min_scratch_global_offset:
701; GFX9: v_mov_b32_e32 [[ZERO:v[0-9]+]], 0{{$}}
702; GFX9: global_load_sbyte v{{[0-9]+}}, [[ZERO]], s{{\[[0-9]+:[0-9]+\]}} offset:-4096{{$}}
703define amdgpu_kernel void @test_sink_global_small_min_scratch_global_offset(i32 addrspace(1)* %out, i8 addrspace(1)* %in) {
704entry:
705  %out.gep = getelementptr i32, i32 addrspace(1)* %out, i32 1024
706  %in.gep = getelementptr i8, i8 addrspace(1)* %in, i64 -4096
707  %tid = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) #0
708  %tmp0 = icmp eq i32 %tid, 0
709  br i1 %tmp0, label %endif, label %if
710
711if:
712  %tmp1 = load i8, i8 addrspace(1)* %in.gep
713  %tmp2 = sext i8 %tmp1 to i32
714  br label %endif
715
716endif:
  ; Sign-extended byte when %if ran, otherwise 0.
717  %x = phi i32 [ %tmp2, %if ], [ 0, %entry ]
718  store i32 %x, i32 addrspace(1)* %out.gep
719  br label %done
720
721done:
722  ret void
723}
724
; Global (addrspace 1) i8 load at byte offset -4097, sign-extended to i32 in
; the %if block. The IR directives expect, in every run, the getelementptr to
; stay in %entry ahead of the branch and the load to keep using %in.gep. Only
; the function label is checked in the ISA output.
725; OPT-LABEL: @test_sink_global_small_min_scratch_global_neg1_offset(
726; OPT: %in.gep = getelementptr i8, i8 addrspace(1)* %in, i64 -4097
727; OPT: br
728; OPT: load i8, i8 addrspace(1)* %in.gep
729
730; GCN-LABEL: {{^}}test_sink_global_small_min_scratch_global_neg1_offset:
731define amdgpu_kernel void @test_sink_global_small_min_scratch_global_neg1_offset(i32 addrspace(1)* %out, i8 addrspace(1)* %in) {
732entry:
733  %out.gep = getelementptr i32, i32 addrspace(1)* %out, i64 99999
734  %in.gep = getelementptr i8, i8 addrspace(1)* %in, i64 -4097
735  %tid = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) #0
736  %tmp0 = icmp eq i32 %tid, 0
737  br i1 %tmp0, label %endif, label %if
738
739if:
740  %tmp1 = load i8, i8 addrspace(1)* %in.gep
741  %tmp2 = sext i8 %tmp1 to i32
742  br label %endif
743
744endif:
  ; Sign-extended byte when %if ran, otherwise 0.
745  %x = phi i32 [ %tmp2, %if ], [ 0, %entry ]
746  store i32 %x, i32 addrspace(1)* %out.gep
747  br label %done
748
749done:
750  ret void
751}
752
; Call to llvm.amdgcn.ds.append on a local (addrspace 3) pointer at %in + 7
; i32 elements, executed only in the %if block. The IR directives expect the
; address to be rebuilt as a bitcast of %in to an i8 pointer, an i8
; getelementptr named %sunkaddr with byte offset 28, and a bitcast back, with
; the intrinsic call using that pointer.
753; OPT-LABEL: @test_sink_small_offset_ds_append(
754; OPT: %0 = bitcast i32 addrspace(3)* %in to i8 addrspace(3)*
755; OPT: %sunkaddr = getelementptr i8, i8 addrspace(3)* %0, i32 28
756; OPT: %1 = bitcast i8 addrspace(3)* %sunkaddr to i32 addrspace(3)*
757; OPT: %tmp1 = call i32 @llvm.amdgcn.ds.append.p3i32(i32 addrspace(3)* %1, i1 false)
758define amdgpu_kernel void @test_sink_small_offset_ds_append(i32 addrspace(3)* %out, i32 addrspace(3)* %in) {
759entry:
760  %out.gep = getelementptr i32, i32 addrspace(3)* %out, i32 999999
761  %in.gep = getelementptr i32, i32 addrspace(3)* %in, i32 7
762  %tid = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) #0
763  %tmp0 = icmp eq i32 %tid, 0
764  br i1 %tmp0, label %endif, label %if
765
766if:
767  %tmp1 = call i32 @llvm.amdgcn.ds.append.p3i32(i32 addrspace(3)* %in.gep, i1 false)
768  br label %endif
769
770endif:
  ; Intrinsic result when %if ran, otherwise 0.
771  %x = phi i32 [ %tmp1, %if ], [ 0, %entry ]
772  store i32 %x, i32 addrspace(3)* %out.gep
773  br label %done
774
775done:
776  ret void
777}
778
; Call to llvm.amdgcn.ds.consume on a local (addrspace 3) pointer at %in + 7
; i32 elements, executed only in the %if block. The IR directives expect the
; address to be rebuilt as a bitcast of %in to an i8 pointer, an i8
; getelementptr named %sunkaddr with byte offset 28, and a bitcast back, with
; the intrinsic call using that pointer.
779; OPT-LABEL: @test_sink_small_offset_ds_consume(
780; OPT: %0 = bitcast i32 addrspace(3)* %in to i8 addrspace(3)*
781; OPT: %sunkaddr = getelementptr i8, i8 addrspace(3)* %0, i32 28
782; OPT: %1 = bitcast i8 addrspace(3)* %sunkaddr to i32 addrspace(3)*
783; OPT: %tmp1 = call i32 @llvm.amdgcn.ds.consume.p3i32(i32 addrspace(3)* %1, i1 false)
784define amdgpu_kernel void @test_sink_small_offset_ds_consume(i32 addrspace(3)* %out, i32 addrspace(3)* %in) {
785entry:
786  %out.gep = getelementptr i32, i32 addrspace(3)* %out, i32 999999
787  %in.gep = getelementptr i32, i32 addrspace(3)* %in, i32 7
788  %tid = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) #0
789  %tmp0 = icmp eq i32 %tid, 0
790  br i1 %tmp0, label %endif, label %if
791
792if:
793  %tmp1 = call i32 @llvm.amdgcn.ds.consume.p3i32(i32 addrspace(3)* %in.gep, i1 false)
794  br label %endif
795
796endif:
  ; Intrinsic result when %if ran, otherwise 0.
797  %x = phi i32 [ %tmp1, %if ], [ 0, %entry ]
798  store i32 %x, i32 addrspace(3)* %out.gep
799  br label %done
800
801done:
802  ret void
803}
804
; Declarations of the AMDGPU intrinsics called by the test functions in this
; file, followed by the attribute groups they reference. Group #0 is also
; attached to the mbcnt.lo call sites. Group #1 is not referenced by any line
; in the visible part of this file.
805declare i32 @llvm.amdgcn.mbcnt.lo(i32, i32) #0
806declare i32 @llvm.amdgcn.atomic.inc.i32.p3i32(i32 addrspace(3)* nocapture, i32, i32, i32, i1) #2
807declare i32 @llvm.amdgcn.atomic.dec.i32.p3i32(i32 addrspace(3)* nocapture, i32, i32, i32, i1) #2
808declare i32 @llvm.amdgcn.ds.append.p3i32(i32 addrspace(3)* nocapture, i1 immarg) #3
809declare i32 @llvm.amdgcn.ds.consume.p3i32(i32 addrspace(3)* nocapture, i1 immarg) #3
810
811attributes #0 = { nounwind readnone }
812attributes #1 = { nounwind }
813attributes #2 = { nounwind argmemonly }
814attributes #3 = { argmemonly convergent nounwind willreturn }
815