1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX9 %s
3; RUN: llc -march=amdgcn -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX10 %s
4
5; Test splitting global instruction offsets into the low and high bits
6; when the offset doesn't fit in the offset field.
7
; Byte load at constant offset 1 from the pointer argument (in v[0:1] per the
; checks). Both the gfx900 and gfx1010 checks expect the whole offset to be
; folded into the load's immediate (offset:1) with no separate address add.
8define i8 @global_inst_valu_offset_1(i8 addrspace(1)* %p) {
9; GFX9-LABEL: global_inst_valu_offset_1:
10; GFX9:       ; %bb.0:
11; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
12; GFX9-NEXT:    global_load_ubyte v0, v[0:1], off offset:1
13; GFX9-NEXT:    s_waitcnt vmcnt(0)
14; GFX9-NEXT:    s_setpc_b64 s[30:31]
15;
16; GFX10-LABEL: global_inst_valu_offset_1:
17; GFX10:       ; %bb.0:
18; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
19; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
20; GFX10-NEXT:    global_load_ubyte v0, v[0:1], off offset:1
21; GFX10-NEXT:    s_waitcnt vmcnt(0)
22; GFX10-NEXT:    s_setpc_b64 s[30:31]
23  %gep = getelementptr i8, i8 addrspace(1)* %p, i64 1
24  %load = load i8, i8 addrspace(1)* %gep, align 4
25  ret i8 %load
26}
27
; Byte load at constant offset 2047 (largest 11-bit unsigned value). Both the
; gfx900 and gfx1010 checks expect the offset folded into the load immediate
; (offset:2047) with no separate address add.
28define i8 @global_inst_valu_offset_11bit_max(i8 addrspace(1)* %p) {
29; GFX9-LABEL: global_inst_valu_offset_11bit_max:
30; GFX9:       ; %bb.0:
31; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
32; GFX9-NEXT:    global_load_ubyte v0, v[0:1], off offset:2047
33; GFX9-NEXT:    s_waitcnt vmcnt(0)
34; GFX9-NEXT:    s_setpc_b64 s[30:31]
35;
36; GFX10-LABEL: global_inst_valu_offset_11bit_max:
37; GFX10:       ; %bb.0:
38; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
39; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
40; GFX10-NEXT:    global_load_ubyte v0, v[0:1], off offset:2047
41; GFX10-NEXT:    s_waitcnt vmcnt(0)
42; GFX10-NEXT:    s_setpc_b64 s[30:31]
43  %gep = getelementptr i8, i8 addrspace(1)* %p, i64 2047
44  %load = load i8, i8 addrspace(1)* %gep, align 4
45  ret i8 %load
46}
47
; Byte load at constant offset 4095 (largest 12-bit unsigned value).
; The gfx900 checks expect the offset folded whole (offset:4095).
; The gfx1010 checks expect a split: a 64-bit add of 0x800 to the pointer,
; then the load with offset:2047 (0x800 + 2047 = 4095).
48define i8 @global_inst_valu_offset_12bit_max(i8 addrspace(1)* %p) {
49; GFX9-LABEL: global_inst_valu_offset_12bit_max:
50; GFX9:       ; %bb.0:
51; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
52; GFX9-NEXT:    global_load_ubyte v0, v[0:1], off offset:4095
53; GFX9-NEXT:    s_waitcnt vmcnt(0)
54; GFX9-NEXT:    s_setpc_b64 s[30:31]
55;
56; GFX10-LABEL: global_inst_valu_offset_12bit_max:
57; GFX10:       ; %bb.0:
58; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
59; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
60; GFX10-NEXT:    v_add_co_u32_e64 v0, vcc_lo, 0x800, v0
61; GFX10-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
62; GFX10-NEXT:    global_load_ubyte v0, v[0:1], off offset:2047
63; GFX10-NEXT:    s_waitcnt vmcnt(0)
64; GFX10-NEXT:    s_setpc_b64 s[30:31]
65  %gep = getelementptr i8, i8 addrspace(1)* %p, i64 4095
66  %load = load i8, i8 addrspace(1)* %gep, align 4
67  ret i8 %load
68}
69
; Byte load at constant offset 8191 (largest 13-bit unsigned value). Both
; check sets expect the offset to be split between a 64-bit pointer add and
; the load immediate:
;   gfx900  checks: add 0x1000, then offset:4095 (0x1000 + 4095 = 8191)
;   gfx1010 checks: add 0x1800, then offset:2047 (0x1800 + 2047 = 8191)
70define i8 @global_inst_valu_offset_13bit_max(i8 addrspace(1)* %p) {
71; GFX9-LABEL: global_inst_valu_offset_13bit_max:
72; GFX9:       ; %bb.0:
73; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
74; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, 0x1000, v0
75; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
76; GFX9-NEXT:    global_load_ubyte v0, v[0:1], off offset:4095
77; GFX9-NEXT:    s_waitcnt vmcnt(0)
78; GFX9-NEXT:    s_setpc_b64 s[30:31]
79;
80; GFX10-LABEL: global_inst_valu_offset_13bit_max:
81; GFX10:       ; %bb.0:
82; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
83; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
84; GFX10-NEXT:    v_add_co_u32_e64 v0, vcc_lo, 0x1800, v0
85; GFX10-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
86; GFX10-NEXT:    global_load_ubyte v0, v[0:1], off offset:2047
87; GFX10-NEXT:    s_waitcnt vmcnt(0)
88; GFX10-NEXT:    s_setpc_b64 s[30:31]
89  %gep = getelementptr i8, i8 addrspace(1)* %p, i64 8191
90  %load = load i8, i8 addrspace(1)* %gep, align 4
91  ret i8 %load
92}
93
; Byte load at constant offset -2048. Both the gfx900 and gfx1010 checks
; expect the negative offset folded into the load immediate (offset:-2048)
; with no separate address add.
94define i8 @global_inst_valu_offset_neg_11bit_max(i8 addrspace(1)* %p) {
95; GFX9-LABEL: global_inst_valu_offset_neg_11bit_max:
96; GFX9:       ; %bb.0:
97; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
98; GFX9-NEXT:    global_load_ubyte v0, v[0:1], off offset:-2048
99; GFX9-NEXT:    s_waitcnt vmcnt(0)
100; GFX9-NEXT:    s_setpc_b64 s[30:31]
101;
102; GFX10-LABEL: global_inst_valu_offset_neg_11bit_max:
103; GFX10:       ; %bb.0:
104; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
105; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
106; GFX10-NEXT:    global_load_ubyte v0, v[0:1], off offset:-2048
107; GFX10-NEXT:    s_waitcnt vmcnt(0)
108; GFX10-NEXT:    s_setpc_b64 s[30:31]
109  %gep = getelementptr i8, i8 addrspace(1)* %p, i64 -2048
110  %load = load i8, i8 addrspace(1)* %gep, align 4
111  ret i8 %load
112}
113
; Byte load at constant offset -4096.
; The gfx900 checks expect the offset folded whole (offset:-4096).
; The gfx1010 checks expect the entire offset applied by a 64-bit add
; (0xfffff000 to the low half, -1 to the high half) and a load with no
; immediate offset.
114define i8 @global_inst_valu_offset_neg_12bit_max(i8 addrspace(1)* %p) {
115; GFX9-LABEL: global_inst_valu_offset_neg_12bit_max:
116; GFX9:       ; %bb.0:
117; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
118; GFX9-NEXT:    global_load_ubyte v0, v[0:1], off offset:-4096
119; GFX9-NEXT:    s_waitcnt vmcnt(0)
120; GFX9-NEXT:    s_setpc_b64 s[30:31]
121;
122; GFX10-LABEL: global_inst_valu_offset_neg_12bit_max:
123; GFX10:       ; %bb.0:
124; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
125; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
126; GFX10-NEXT:    v_add_co_u32_e64 v0, vcc_lo, 0xfffff000, v0
127; GFX10-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo
128; GFX10-NEXT:    global_load_ubyte v0, v[0:1], off
129; GFX10-NEXT:    s_waitcnt vmcnt(0)
130; GFX10-NEXT:    s_setpc_b64 s[30:31]
131  %gep = getelementptr i8, i8 addrspace(1)* %p, i64 -4096
132  %load = load i8, i8 addrspace(1)* %gep, align 4
133  ret i8 %load
134}
135
; Byte load at constant offset -8192. Both the gfx900 and gfx1010 checks
; expect the entire offset applied by a 64-bit add (0xffffe000 to the low
; half, -1 to the high half) and a load with no immediate offset.
136define i8 @global_inst_valu_offset_neg_13bit_max(i8 addrspace(1)* %p) {
137; GFX9-LABEL: global_inst_valu_offset_neg_13bit_max:
138; GFX9:       ; %bb.0:
139; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
140; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, 0xffffe000, v0
141; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, -1, v1, vcc
142; GFX9-NEXT:    global_load_ubyte v0, v[0:1], off
143; GFX9-NEXT:    s_waitcnt vmcnt(0)
144; GFX9-NEXT:    s_setpc_b64 s[30:31]
145;
146; GFX10-LABEL: global_inst_valu_offset_neg_13bit_max:
147; GFX10:       ; %bb.0:
148; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
149; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
150; GFX10-NEXT:    v_add_co_u32_e64 v0, vcc_lo, 0xffffe000, v0
151; GFX10-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo
152; GFX10-NEXT:    global_load_ubyte v0, v[0:1], off
153; GFX10-NEXT:    s_waitcnt vmcnt(0)
154; GFX10-NEXT:    s_setpc_b64 s[30:31]
155  %gep = getelementptr i8, i8 addrspace(1)* %p, i64 -8192
156  %load = load i8, i8 addrspace(1)* %gep, align 4
157  ret i8 %load
158}
159
; Byte load at constant offset 4095 (the same offset as the 12bit_max test).
; The gfx900 checks expect the offset folded whole (offset:4095).
; The gfx1010 checks expect a 64-bit add of 0x800 followed by a load with
; offset:2047.
160define i8 @global_inst_valu_offset_2x_11bit_max(i8 addrspace(1)* %p) {
161; GFX9-LABEL: global_inst_valu_offset_2x_11bit_max:
162; GFX9:       ; %bb.0:
163; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
164; GFX9-NEXT:    global_load_ubyte v0, v[0:1], off offset:4095
165; GFX9-NEXT:    s_waitcnt vmcnt(0)
166; GFX9-NEXT:    s_setpc_b64 s[30:31]
167;
168; GFX10-LABEL: global_inst_valu_offset_2x_11bit_max:
169; GFX10:       ; %bb.0:
170; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
171; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
172; GFX10-NEXT:    v_add_co_u32_e64 v0, vcc_lo, 0x800, v0
173; GFX10-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
174; GFX10-NEXT:    global_load_ubyte v0, v[0:1], off offset:2047
175; GFX10-NEXT:    s_waitcnt vmcnt(0)
176; GFX10-NEXT:    s_setpc_b64 s[30:31]
177  %gep = getelementptr i8, i8 addrspace(1)* %p, i64 4095
178  %load = load i8, i8 addrspace(1)* %gep, align 4
179  ret i8 %load
180}
181
; Byte load at constant offset 8191 (the same offset as the 13bit_max test).
; Both check sets expect a 64-bit pointer add plus a load immediate:
;   gfx900  checks: add 0x1000, then offset:4095
;   gfx1010 checks: add 0x1800, then offset:2047
182define i8 @global_inst_valu_offset_2x_12bit_max(i8 addrspace(1)* %p) {
183; GFX9-LABEL: global_inst_valu_offset_2x_12bit_max:
184; GFX9:       ; %bb.0:
185; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
186; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, 0x1000, v0
187; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
188; GFX9-NEXT:    global_load_ubyte v0, v[0:1], off offset:4095
189; GFX9-NEXT:    s_waitcnt vmcnt(0)
190; GFX9-NEXT:    s_setpc_b64 s[30:31]
191;
192; GFX10-LABEL: global_inst_valu_offset_2x_12bit_max:
193; GFX10:       ; %bb.0:
194; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
195; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
196; GFX10-NEXT:    v_add_co_u32_e64 v0, vcc_lo, 0x1800, v0
197; GFX10-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
198; GFX10-NEXT:    global_load_ubyte v0, v[0:1], off offset:2047
199; GFX10-NEXT:    s_waitcnt vmcnt(0)
200; GFX10-NEXT:    s_setpc_b64 s[30:31]
201  %gep = getelementptr i8, i8 addrspace(1)* %p, i64 8191
202  %load = load i8, i8 addrspace(1)* %gep, align 4
203  ret i8 %load
204}
205
; Byte load at constant offset 16383. Both check sets expect a 64-bit
; pointer add plus a load immediate:
;   gfx900  checks: add 0x3000, then offset:4095 (0x3000 + 4095 = 16383)
;   gfx1010 checks: add 0x3800, then offset:2047 (0x3800 + 2047 = 16383)
206define i8 @global_inst_valu_offset_2x_13bit_max(i8 addrspace(1)* %p) {
207; GFX9-LABEL: global_inst_valu_offset_2x_13bit_max:
208; GFX9:       ; %bb.0:
209; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
210; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, 0x3000, v0
211; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
212; GFX9-NEXT:    global_load_ubyte v0, v[0:1], off offset:4095
213; GFX9-NEXT:    s_waitcnt vmcnt(0)
214; GFX9-NEXT:    s_setpc_b64 s[30:31]
215;
216; GFX10-LABEL: global_inst_valu_offset_2x_13bit_max:
217; GFX10:       ; %bb.0:
218; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
219; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
220; GFX10-NEXT:    v_add_co_u32_e64 v0, vcc_lo, 0x3800, v0
221; GFX10-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
222; GFX10-NEXT:    global_load_ubyte v0, v[0:1], off offset:2047
223; GFX10-NEXT:    s_waitcnt vmcnt(0)
224; GFX10-NEXT:    s_setpc_b64 s[30:31]
225  %gep = getelementptr i8, i8 addrspace(1)* %p, i64 16383
226  %load = load i8, i8 addrspace(1)* %gep, align 4
227  ret i8 %load
228}
229
; Byte load at constant offset -4096 (the same offset as the neg_12bit_max
; test).
; The gfx900 checks expect the offset folded whole (offset:-4096).
; The gfx1010 checks expect a 64-bit add of 0xfffff000 / -1 and a load with
; no immediate offset.
230define i8 @global_inst_valu_offset_2x_neg_11bit_max(i8 addrspace(1)* %p) {
231; GFX9-LABEL: global_inst_valu_offset_2x_neg_11bit_max:
232; GFX9:       ; %bb.0:
233; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
234; GFX9-NEXT:    global_load_ubyte v0, v[0:1], off offset:-4096
235; GFX9-NEXT:    s_waitcnt vmcnt(0)
236; GFX9-NEXT:    s_setpc_b64 s[30:31]
237;
238; GFX10-LABEL: global_inst_valu_offset_2x_neg_11bit_max:
239; GFX10:       ; %bb.0:
240; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
241; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
242; GFX10-NEXT:    v_add_co_u32_e64 v0, vcc_lo, 0xfffff000, v0
243; GFX10-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo
244; GFX10-NEXT:    global_load_ubyte v0, v[0:1], off
245; GFX10-NEXT:    s_waitcnt vmcnt(0)
246; GFX10-NEXT:    s_setpc_b64 s[30:31]
247  %gep = getelementptr i8, i8 addrspace(1)* %p, i64 -4096
248  %load = load i8, i8 addrspace(1)* %gep, align 4
249  ret i8 %load
250}
251
; Byte load at constant offset -8192 (the same offset as the neg_13bit_max
; test). Both the gfx900 and gfx1010 checks expect a 64-bit add of
; 0xffffe000 / -1 and a load with no immediate offset.
252define i8 @global_inst_valu_offset_2x_neg_12bit_max(i8 addrspace(1)* %p) {
253; GFX9-LABEL: global_inst_valu_offset_2x_neg_12bit_max:
254; GFX9:       ; %bb.0:
255; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
256; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, 0xffffe000, v0
257; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, -1, v1, vcc
258; GFX9-NEXT:    global_load_ubyte v0, v[0:1], off
259; GFX9-NEXT:    s_waitcnt vmcnt(0)
260; GFX9-NEXT:    s_setpc_b64 s[30:31]
261;
262; GFX10-LABEL: global_inst_valu_offset_2x_neg_12bit_max:
263; GFX10:       ; %bb.0:
264; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
265; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
266; GFX10-NEXT:    v_add_co_u32_e64 v0, vcc_lo, 0xffffe000, v0
267; GFX10-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo
268; GFX10-NEXT:    global_load_ubyte v0, v[0:1], off
269; GFX10-NEXT:    s_waitcnt vmcnt(0)
270; GFX10-NEXT:    s_setpc_b64 s[30:31]
271  %gep = getelementptr i8, i8 addrspace(1)* %p, i64 -8192
272  %load = load i8, i8 addrspace(1)* %gep, align 4
273  ret i8 %load
274}
275
; Byte load at constant offset -16384. Both the gfx900 and gfx1010 checks
; expect the entire offset applied by a 64-bit add (0xffffc000 to the low
; half, -1 to the high half) and a load with no immediate offset.
276define i8 @global_inst_valu_offset_2x_neg_13bit_max(i8 addrspace(1)* %p) {
277; GFX9-LABEL: global_inst_valu_offset_2x_neg_13bit_max:
278; GFX9:       ; %bb.0:
279; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
280; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, 0xffffc000, v0
281; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, -1, v1, vcc
282; GFX9-NEXT:    global_load_ubyte v0, v[0:1], off
283; GFX9-NEXT:    s_waitcnt vmcnt(0)
284; GFX9-NEXT:    s_setpc_b64 s[30:31]
285;
286; GFX10-LABEL: global_inst_valu_offset_2x_neg_13bit_max:
287; GFX10:       ; %bb.0:
288; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
289; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
290; GFX10-NEXT:    v_add_co_u32_e64 v0, vcc_lo, 0xffffc000, v0
291; GFX10-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo
292; GFX10-NEXT:    global_load_ubyte v0, v[0:1], off
293; GFX10-NEXT:    s_waitcnt vmcnt(0)
294; GFX10-NEXT:    s_setpc_b64 s[30:31]
295  %gep = getelementptr i8, i8 addrspace(1)* %p, i64 -16384
296  %load = load i8, i8 addrspace(1)* %gep, align 4
297  ret i8 %load
298}
299
300; Fill 11-bit low-bits (1ull << 33) | 2047
; The GEP constant 8589936639 is 2^33 + 2047. Both check sets expect a
; 64-bit add that adds 0 to the low half and 2 to the high half of the
; pointer, with the remaining 2047 carried in the load immediate.
301define i8 @global_inst_valu_offset_64bit_11bit_split0(i8 addrspace(1)* %p) {
302; GFX9-LABEL: global_inst_valu_offset_64bit_11bit_split0:
303; GFX9:       ; %bb.0:
304; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
305; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, 0, v0
306; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, 2, v1, vcc
307; GFX9-NEXT:    global_load_ubyte v0, v[0:1], off offset:2047
308; GFX9-NEXT:    s_waitcnt vmcnt(0)
309; GFX9-NEXT:    s_setpc_b64 s[30:31]
310;
311; GFX10-LABEL: global_inst_valu_offset_64bit_11bit_split0:
312; GFX10:       ; %bb.0:
313; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
314; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
315; GFX10-NEXT:    v_add_co_u32_e64 v0, vcc_lo, 0, v0
316; GFX10-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 2, v1, vcc_lo
317; GFX10-NEXT:    global_load_ubyte v0, v[0:1], off offset:2047
318; GFX10-NEXT:    s_waitcnt vmcnt(0)
319; GFX10-NEXT:    s_setpc_b64 s[30:31]
320  %gep = getelementptr i8, i8 addrspace(1)* %p, i64 8589936639
321  %load = load i8, i8 addrspace(1)* %gep, align 4
322  ret i8 %load
323}
324
325; Fill 11-bit low-bits (1ull << 33) | 2048
; The GEP constant 8589936640 is 2^33 + 2048; 2048 is one past the largest
; 11-bit value (2047). Both check sets add 2 to the high half of the pointer.
;   gfx900  checks: low-half add of 0, load with offset:2048
;   gfx1010 checks: low-half add of 0x800, load with no immediate offset
326define i8 @global_inst_valu_offset_64bit_11bit_split1(i8 addrspace(1)* %p) {
327; GFX9-LABEL: global_inst_valu_offset_64bit_11bit_split1:
328; GFX9:       ; %bb.0:
329; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
330; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, 0, v0
331; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, 2, v1, vcc
332; GFX9-NEXT:    global_load_ubyte v0, v[0:1], off offset:2048
333; GFX9-NEXT:    s_waitcnt vmcnt(0)
334; GFX9-NEXT:    s_setpc_b64 s[30:31]
335;
336; GFX10-LABEL: global_inst_valu_offset_64bit_11bit_split1:
337; GFX10:       ; %bb.0:
338; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
339; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
340; GFX10-NEXT:    v_add_co_u32_e64 v0, vcc_lo, 0x800, v0
341; GFX10-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 2, v1, vcc_lo
342; GFX10-NEXT:    global_load_ubyte v0, v[0:1], off
343; GFX10-NEXT:    s_waitcnt vmcnt(0)
344; GFX10-NEXT:    s_setpc_b64 s[30:31]
345  %gep = getelementptr i8, i8 addrspace(1)* %p, i64 8589936640
346  %load = load i8, i8 addrspace(1)* %gep, align 4
347  ret i8 %load
348}
349
350; Fill 12-bit low-bits (1ull << 33) | 4095
; The GEP constant 8589938687 is 2^33 + 4095. Both check sets add 2 to the
; high half of the pointer.
;   gfx900  checks: low-half add of 0, load with offset:4095
;   gfx1010 checks: low-half add of 0x800, load with offset:2047
351define i8 @global_inst_valu_offset_64bit_12bit_split0(i8 addrspace(1)* %p) {
352; GFX9-LABEL: global_inst_valu_offset_64bit_12bit_split0:
353; GFX9:       ; %bb.0:
354; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
355; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, 0, v0
356; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, 2, v1, vcc
357; GFX9-NEXT:    global_load_ubyte v0, v[0:1], off offset:4095
358; GFX9-NEXT:    s_waitcnt vmcnt(0)
359; GFX9-NEXT:    s_setpc_b64 s[30:31]
360;
361; GFX10-LABEL: global_inst_valu_offset_64bit_12bit_split0:
362; GFX10:       ; %bb.0:
363; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
364; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
365; GFX10-NEXT:    v_add_co_u32_e64 v0, vcc_lo, 0x800, v0
366; GFX10-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 2, v1, vcc_lo
367; GFX10-NEXT:    global_load_ubyte v0, v[0:1], off offset:2047
368; GFX10-NEXT:    s_waitcnt vmcnt(0)
369; GFX10-NEXT:    s_setpc_b64 s[30:31]
370  %gep = getelementptr i8, i8 addrspace(1)* %p, i64 8589938687
371  %load = load i8, i8 addrspace(1)* %gep, align 4
372  ret i8 %load
373}
374
375; Fill 12-bit low-bits (1ull << 33) | 4096
; The GEP constant 8589938688 is 2^33 + 4096; 4096 is one past the largest
; 12-bit value (4095). Both the gfx900 and gfx1010 checks expect the whole
; offset applied by a 64-bit add (0x1000 low, 2 high) and a load with no
; immediate offset.
376define i8 @global_inst_valu_offset_64bit_12bit_split1(i8 addrspace(1)* %p) {
377; GFX9-LABEL: global_inst_valu_offset_64bit_12bit_split1:
378; GFX9:       ; %bb.0:
379; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
380; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, 0x1000, v0
381; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, 2, v1, vcc
382; GFX9-NEXT:    global_load_ubyte v0, v[0:1], off
383; GFX9-NEXT:    s_waitcnt vmcnt(0)
384; GFX9-NEXT:    s_setpc_b64 s[30:31]
385;
386; GFX10-LABEL: global_inst_valu_offset_64bit_12bit_split1:
387; GFX10:       ; %bb.0:
388; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
389; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
390; GFX10-NEXT:    v_add_co_u32_e64 v0, vcc_lo, 0x1000, v0
391; GFX10-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 2, v1, vcc_lo
392; GFX10-NEXT:    global_load_ubyte v0, v[0:1], off
393; GFX10-NEXT:    s_waitcnt vmcnt(0)
394; GFX10-NEXT:    s_setpc_b64 s[30:31]
395  %gep = getelementptr i8, i8 addrspace(1)* %p, i64 8589938688
396  %load = load i8, i8 addrspace(1)* %gep, align 4
397  ret i8 %load
398}
399
400; Fill 13-bit low-bits (1ull << 33) | 8191
; The GEP constant 8589942783 is 2^33 + 8191. Both check sets add 2 to the
; high half of the pointer.
;   gfx900  checks: low-half add of 0x1000, load with offset:4095
;   gfx1010 checks: low-half add of 0x1800, load with offset:2047
401define i8 @global_inst_valu_offset_64bit_13bit_split0(i8 addrspace(1)* %p) {
402; GFX9-LABEL: global_inst_valu_offset_64bit_13bit_split0:
403; GFX9:       ; %bb.0:
404; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
405; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, 0x1000, v0
406; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, 2, v1, vcc
407; GFX9-NEXT:    global_load_ubyte v0, v[0:1], off offset:4095
408; GFX9-NEXT:    s_waitcnt vmcnt(0)
409; GFX9-NEXT:    s_setpc_b64 s[30:31]
410;
411; GFX10-LABEL: global_inst_valu_offset_64bit_13bit_split0:
412; GFX10:       ; %bb.0:
413; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
414; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
415; GFX10-NEXT:    v_add_co_u32_e64 v0, vcc_lo, 0x1800, v0
416; GFX10-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 2, v1, vcc_lo
417; GFX10-NEXT:    global_load_ubyte v0, v[0:1], off offset:2047
418; GFX10-NEXT:    s_waitcnt vmcnt(0)
419; GFX10-NEXT:    s_setpc_b64 s[30:31]
420  %gep = getelementptr i8, i8 addrspace(1)* %p, i64 8589942783
421  %load = load i8, i8 addrspace(1)* %gep, align 4
422  ret i8 %load
423}
424
425; Fill 13-bit low-bits (1ull << 33) | 8192
; The GEP constant 8589942784 is 2^33 + 8192; 8192 is one past the largest
; 13-bit value (8191). Both the gfx900 and gfx1010 checks expect the whole
; offset applied by a 64-bit add (0x2000 low, 2 high) and a load with no
; immediate offset.
426define i8 @global_inst_valu_offset_64bit_13bit_split1(i8 addrspace(1)* %p) {
427; GFX9-LABEL: global_inst_valu_offset_64bit_13bit_split1:
428; GFX9:       ; %bb.0:
429; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
430; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, 0x2000, v0
431; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, 2, v1, vcc
432; GFX9-NEXT:    global_load_ubyte v0, v[0:1], off
433; GFX9-NEXT:    s_waitcnt vmcnt(0)
434; GFX9-NEXT:    s_setpc_b64 s[30:31]
435;
436; GFX10-LABEL: global_inst_valu_offset_64bit_13bit_split1:
437; GFX10:       ; %bb.0:
438; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
439; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
440; GFX10-NEXT:    v_add_co_u32_e64 v0, vcc_lo, 0x2000, v0
441; GFX10-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 2, v1, vcc_lo
442; GFX10-NEXT:    global_load_ubyte v0, v[0:1], off
443; GFX10-NEXT:    s_waitcnt vmcnt(0)
444; GFX10-NEXT:    s_setpc_b64 s[30:31]
445  %gep = getelementptr i8, i8 addrspace(1)* %p, i64 8589942784
446  %load = load i8, i8 addrspace(1)* %gep, align 4
447  ret i8 %load
448}
449
450; Fill 11-bit low-bits, negative high bits (1ull << 63) | 2047
; The GEP constant -9223372036854773761 is -2^63 + 2047. Both check sets
; over-add to the low half and compensate with a negative load immediate.
;   gfx900  checks: low add 0x1000, high add of v2 (set by v_bfrev_b32 v2, 1),
;                   load with offset:-2049 (0x1000 - 2049 = 2047)
;   gfx1010 checks: low add 0x800, high add of literal 0x80000000,
;                   load with offset:-1 (0x800 - 1 = 2047)
451define i8 @global_inst_valu_offset_64bit_11bit_neg_high_split0(i8 addrspace(1)* %p) {
452; GFX9-LABEL: global_inst_valu_offset_64bit_11bit_neg_high_split0:
453; GFX9:       ; %bb.0:
454; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
455; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, 0x1000, v0
456; GFX9-NEXT:    v_bfrev_b32_e32 v2, 1
457; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, v2, v1, vcc
458; GFX9-NEXT:    global_load_ubyte v0, v[0:1], off offset:-2049
459; GFX9-NEXT:    s_waitcnt vmcnt(0)
460; GFX9-NEXT:    s_setpc_b64 s[30:31]
461;
462; GFX10-LABEL: global_inst_valu_offset_64bit_11bit_neg_high_split0:
463; GFX10:       ; %bb.0:
464; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
465; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
466; GFX10-NEXT:    v_add_co_u32_e64 v0, vcc_lo, 0x800, v0
467; GFX10-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 0x80000000, v1, vcc_lo
468; GFX10-NEXT:    global_load_ubyte v0, v[0:1], off offset:-1
469; GFX10-NEXT:    s_waitcnt vmcnt(0)
470; GFX10-NEXT:    s_setpc_b64 s[30:31]
471  %gep = getelementptr i8, i8 addrspace(1)* %p, i64 -9223372036854773761
472  %load = load i8, i8 addrspace(1)* %gep, align 4
473  ret i8 %load
474}
475
476; Fill 11-bit low-bits, negative high bits (1ull << 63) | 2048
; The GEP constant -9223372036854773760 is -2^63 + 2048; 2048 is one past
; the largest 11-bit value (2047).
;   gfx900  checks: low add 0x1000, high add of v2 (set by v_bfrev_b32 v2, 1),
;                   load with offset:-2048 (0x1000 - 2048 = 2048)
;   gfx1010 checks: low add 0x800, high add of literal 0x80000000,
;                   load with no immediate offset
477define i8 @global_inst_valu_offset_64bit_11bit_neg_high_split1(i8 addrspace(1)* %p) {
478; GFX9-LABEL: global_inst_valu_offset_64bit_11bit_neg_high_split1:
479; GFX9:       ; %bb.0:
480; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
481; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, 0x1000, v0
482; GFX9-NEXT:    v_bfrev_b32_e32 v2, 1
483; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, v2, v1, vcc
484; GFX9-NEXT:    global_load_ubyte v0, v[0:1], off offset:-2048
485; GFX9-NEXT:    s_waitcnt vmcnt(0)
486; GFX9-NEXT:    s_setpc_b64 s[30:31]
487;
488; GFX10-LABEL: global_inst_valu_offset_64bit_11bit_neg_high_split1:
489; GFX10:       ; %bb.0:
490; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
491; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
492; GFX10-NEXT:    v_add_co_u32_e64 v0, vcc_lo, 0x800, v0
493; GFX10-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 0x80000000, v1, vcc_lo
494; GFX10-NEXT:    global_load_ubyte v0, v[0:1], off
495; GFX10-NEXT:    s_waitcnt vmcnt(0)
496; GFX10-NEXT:    s_setpc_b64 s[30:31]
497  %gep = getelementptr i8, i8 addrspace(1)* %p, i64 -9223372036854773760
498  %load = load i8, i8 addrspace(1)* %gep, align 4
499  ret i8 %load
500}
501
502; Fill 12-bit low-bits, negative high bits (1ull << 63) | 4095
; The GEP constant -9223372036854771713 is -2^63 + 4095. Both check sets
; add 0x1000 to the low half and load with offset:-1 (0x1000 - 1 = 4095).
; They differ only in the high-half operand: the gfx900 checks use v2 (set
; by v_bfrev_b32 v2, 1), the gfx1010 checks use the literal 0x80000000.
503define i8 @global_inst_valu_offset_64bit_12bit_neg_high_split0(i8 addrspace(1)* %p) {
504; GFX9-LABEL: global_inst_valu_offset_64bit_12bit_neg_high_split0:
505; GFX9:       ; %bb.0:
506; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
507; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, 0x1000, v0
508; GFX9-NEXT:    v_bfrev_b32_e32 v2, 1
509; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, v2, v1, vcc
510; GFX9-NEXT:    global_load_ubyte v0, v[0:1], off offset:-1
511; GFX9-NEXT:    s_waitcnt vmcnt(0)
512; GFX9-NEXT:    s_setpc_b64 s[30:31]
513;
514; GFX10-LABEL: global_inst_valu_offset_64bit_12bit_neg_high_split0:
515; GFX10:       ; %bb.0:
516; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
517; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
518; GFX10-NEXT:    v_add_co_u32_e64 v0, vcc_lo, 0x1000, v0
519; GFX10-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 0x80000000, v1, vcc_lo
520; GFX10-NEXT:    global_load_ubyte v0, v[0:1], off offset:-1
521; GFX10-NEXT:    s_waitcnt vmcnt(0)
522; GFX10-NEXT:    s_setpc_b64 s[30:31]
523  %gep = getelementptr i8, i8 addrspace(1)* %p, i64 -9223372036854771713
524  %load = load i8, i8 addrspace(1)* %gep, align 4
525  ret i8 %load
526}
527
528; Fill 12-bit low-bits, negative high bits (1ull << 63) | 4096
; The GEP constant -9223372036854771712 is -2^63 + 4096; 4096 is one past
; the largest 12-bit value (4095). Both check sets add 0x1000 to the low
; half and load with no immediate offset. They differ only in the high-half
; operand: the gfx900 checks use v2 (set by v_bfrev_b32 v2, 1), the gfx1010
; checks use the literal 0x80000000.
529define i8 @global_inst_valu_offset_64bit_12bit_neg_high_split1(i8 addrspace(1)* %p) {
530; GFX9-LABEL: global_inst_valu_offset_64bit_12bit_neg_high_split1:
531; GFX9:       ; %bb.0:
532; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
533; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, 0x1000, v0
534; GFX9-NEXT:    v_bfrev_b32_e32 v2, 1
535; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, v2, v1, vcc
536; GFX9-NEXT:    global_load_ubyte v0, v[0:1], off
537; GFX9-NEXT:    s_waitcnt vmcnt(0)
538; GFX9-NEXT:    s_setpc_b64 s[30:31]
539;
540; GFX10-LABEL: global_inst_valu_offset_64bit_12bit_neg_high_split1:
541; GFX10:       ; %bb.0:
542; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
543; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
544; GFX10-NEXT:    v_add_co_u32_e64 v0, vcc_lo, 0x1000, v0
545; GFX10-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 0x80000000, v1, vcc_lo
546; GFX10-NEXT:    global_load_ubyte v0, v[0:1], off
547; GFX10-NEXT:    s_waitcnt vmcnt(0)
548; GFX10-NEXT:    s_setpc_b64 s[30:31]
549  %gep = getelementptr i8, i8 addrspace(1)* %p, i64 -9223372036854771712
550  %load = load i8, i8 addrspace(1)* %gep, align 4
551  ret i8 %load
552}
553
554; Fill 13-bit low-bits, negative high bits (1ull << 63) | 8191
; The GEP constant -9223372036854767617 is -2^63 + 8191. Both check sets
; add 0x2000 to the low half and load with offset:-1 (0x2000 - 1 = 8191).
; They differ only in the high-half operand: the gfx900 checks use v2 (set
; by v_bfrev_b32 v2, 1), the gfx1010 checks use the literal 0x80000000.
555define i8 @global_inst_valu_offset_64bit_13bit_neg_high_split0(i8 addrspace(1)* %p) {
556; GFX9-LABEL: global_inst_valu_offset_64bit_13bit_neg_high_split0:
557; GFX9:       ; %bb.0:
558; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
559; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, 0x2000, v0
560; GFX9-NEXT:    v_bfrev_b32_e32 v2, 1
561; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, v2, v1, vcc
562; GFX9-NEXT:    global_load_ubyte v0, v[0:1], off offset:-1
563; GFX9-NEXT:    s_waitcnt vmcnt(0)
564; GFX9-NEXT:    s_setpc_b64 s[30:31]
565;
566; GFX10-LABEL: global_inst_valu_offset_64bit_13bit_neg_high_split0:
567; GFX10:       ; %bb.0:
568; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
569; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
570; GFX10-NEXT:    v_add_co_u32_e64 v0, vcc_lo, 0x2000, v0
571; GFX10-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 0x80000000, v1, vcc_lo
572; GFX10-NEXT:    global_load_ubyte v0, v[0:1], off offset:-1
573; GFX10-NEXT:    s_waitcnt vmcnt(0)
574; GFX10-NEXT:    s_setpc_b64 s[30:31]
575  %gep = getelementptr i8, i8 addrspace(1)* %p, i64 -9223372036854767617
576  %load = load i8, i8 addrspace(1)* %gep, align 4
577  ret i8 %load
578}
579
580; Fill 13-bit low-bits, negative high bits (1ull << 63) | 8192
; The GEP constant -9223372036854767616 is -2^63 + 8192; 8192 is one past
; the largest 13-bit value (8191). Both check sets add 0x2000 to the low
; half and load with no immediate offset. They differ only in the high-half
; operand: the gfx900 checks use v2 (set by v_bfrev_b32 v2, 1), the gfx1010
; checks use the literal 0x80000000.
581define i8 @global_inst_valu_offset_64bit_13bit_neg_high_split1(i8 addrspace(1)* %p) {
582; GFX9-LABEL: global_inst_valu_offset_64bit_13bit_neg_high_split1:
583; GFX9:       ; %bb.0:
584; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
585; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, 0x2000, v0
586; GFX9-NEXT:    v_bfrev_b32_e32 v2, 1
587; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, v2, v1, vcc
588; GFX9-NEXT:    global_load_ubyte v0, v[0:1], off
589; GFX9-NEXT:    s_waitcnt vmcnt(0)
590; GFX9-NEXT:    s_setpc_b64 s[30:31]
591;
592; GFX10-LABEL: global_inst_valu_offset_64bit_13bit_neg_high_split1:
593; GFX10:       ; %bb.0:
594; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
595; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
596; GFX10-NEXT:    v_add_co_u32_e64 v0, vcc_lo, 0x2000, v0
597; GFX10-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 0x80000000, v1, vcc_lo
598; GFX10-NEXT:    global_load_ubyte v0, v[0:1], off
599; GFX10-NEXT:    s_waitcnt vmcnt(0)
600; GFX10-NEXT:    s_setpc_b64 s[30:31]
601  %gep = getelementptr i8, i8 addrspace(1)* %p, i64 -9223372036854767616
602  %load = load i8, i8 addrspace(1)* %gep, align 4
603  ret i8 %load
604}
605
; Kernel variant: the pointer is a kernel argument that the checks load into
; s[0:1] with s_load_dwordx2. Volatile byte load at constant offset 1; the
; result is stored to an undef pointer so the load has a use.
; Both the gfx900 and gfx1010 checks expect s[0:1] as the load's base, v0 = 0
; as the VGPR operand, and the offset folded into the immediate (offset:1).
606define amdgpu_kernel void @global_inst_salu_offset_1(i8 addrspace(1)* %p) {
607; GFX9-LABEL: global_inst_salu_offset_1:
608; GFX9:       ; %bb.0:
609; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
610; GFX9-NEXT:    v_mov_b32_e32 v0, 0
611; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
612; GFX9-NEXT:    global_load_ubyte v0, v0, s[0:1] offset:1
613; GFX9-NEXT:    s_waitcnt vmcnt(0)
614; GFX9-NEXT:    global_store_byte v[0:1], v0, off
615; GFX9-NEXT:    s_endpgm
616;
617; GFX10-LABEL: global_inst_salu_offset_1:
618; GFX10:       ; %bb.0:
619; GFX10-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
620; GFX10-NEXT:    v_mov_b32_e32 v0, 0
621; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
622; GFX10-NEXT:    global_load_ubyte v0, v0, s[0:1] offset:1
623; GFX10-NEXT:    s_waitcnt vmcnt(0)
624; GFX10-NEXT:    global_store_byte v[0:1], v0, off
625; GFX10-NEXT:    s_endpgm
626  %gep = getelementptr i8, i8 addrspace(1)* %p, i64 1
627  %load = load volatile i8, i8 addrspace(1)* %gep, align 1
628  store i8 %load, i8 addrspace(1)* undef
629  ret void
630}
631
; Kernel variant (pointer loaded into s[0:1] per the checks): volatile byte
; load at constant offset 2047, result stored to an undef pointer.
; Both the gfx900 and gfx1010 checks expect s[0:1] as the load's base, v0 = 0
; as the VGPR operand, and the offset folded into the immediate
; (offset:2047).
632define amdgpu_kernel void @global_inst_salu_offset_11bit_max(i8 addrspace(1)* %p) {
633; GFX9-LABEL: global_inst_salu_offset_11bit_max:
634; GFX9:       ; %bb.0:
635; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
636; GFX9-NEXT:    v_mov_b32_e32 v0, 0
637; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
638; GFX9-NEXT:    global_load_ubyte v0, v0, s[0:1] offset:2047
639; GFX9-NEXT:    s_waitcnt vmcnt(0)
640; GFX9-NEXT:    global_store_byte v[0:1], v0, off
641; GFX9-NEXT:    s_endpgm
642;
643; GFX10-LABEL: global_inst_salu_offset_11bit_max:
644; GFX10:       ; %bb.0:
645; GFX10-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
646; GFX10-NEXT:    v_mov_b32_e32 v0, 0
647; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
648; GFX10-NEXT:    global_load_ubyte v0, v0, s[0:1] offset:2047
649; GFX10-NEXT:    s_waitcnt vmcnt(0)
650; GFX10-NEXT:    global_store_byte v[0:1], v0, off
651; GFX10-NEXT:    s_endpgm
652  %gep = getelementptr i8, i8 addrspace(1)* %p, i64 2047
653  %load = load volatile i8, i8 addrspace(1)* %gep, align 1
654  store i8 %load, i8 addrspace(1)* undef
655  ret void
656}
657
; Kernel variant (pointer loaded into s[0:1] per the checks): volatile byte
; load at constant offset 4095, result stored to an undef pointer.
; Both check sets keep s[0:1] as the load's base and split the offset
; between the VGPR operand v0 and the immediate:
;   gfx900  checks: v0 = 0,     offset:4095
;   gfx1010 checks: v0 = 0x800, offset:2047
658define amdgpu_kernel void @global_inst_salu_offset_12bit_max(i8 addrspace(1)* %p) {
659; GFX9-LABEL: global_inst_salu_offset_12bit_max:
660; GFX9:       ; %bb.0:
661; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
662; GFX9-NEXT:    v_mov_b32_e32 v0, 0
663; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
664; GFX9-NEXT:    global_load_ubyte v0, v0, s[0:1] offset:4095
665; GFX9-NEXT:    s_waitcnt vmcnt(0)
666; GFX9-NEXT:    global_store_byte v[0:1], v0, off
667; GFX9-NEXT:    s_endpgm
668;
669; GFX10-LABEL: global_inst_salu_offset_12bit_max:
670; GFX10:       ; %bb.0:
671; GFX10-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
672; GFX10-NEXT:    v_mov_b32_e32 v0, 0x800
673; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
674; GFX10-NEXT:    global_load_ubyte v0, v0, s[0:1] offset:2047
675; GFX10-NEXT:    s_waitcnt vmcnt(0)
676; GFX10-NEXT:    global_store_byte v[0:1], v0, off
677; GFX10-NEXT:    s_endpgm
678  %gep = getelementptr i8, i8 addrspace(1)* %p, i64 4095
679  %load = load volatile i8, i8 addrspace(1)* %gep, align 1
680  store i8 %load, i8 addrspace(1)* undef
681  ret void
682}
683
; Kernel variant (pointer loaded into s[0:1] per the checks): volatile byte
; load at constant offset 8191, result stored to an undef pointer.
; Both check sets keep s[0:1] as the load's base and split the offset
; between the VGPR operand v0 and the immediate:
;   gfx900  checks: v0 = 0x1000, offset:4095
;   gfx1010 checks: v0 = 0x1800, offset:2047
684define amdgpu_kernel void @global_inst_salu_offset_13bit_max(i8 addrspace(1)* %p) {
685; GFX9-LABEL: global_inst_salu_offset_13bit_max:
686; GFX9:       ; %bb.0:
687; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
688; GFX9-NEXT:    v_mov_b32_e32 v0, 0x1000
689; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
690; GFX9-NEXT:    global_load_ubyte v0, v0, s[0:1] offset:4095
691; GFX9-NEXT:    s_waitcnt vmcnt(0)
692; GFX9-NEXT:    global_store_byte v[0:1], v0, off
693; GFX9-NEXT:    s_endpgm
694;
695; GFX10-LABEL: global_inst_salu_offset_13bit_max:
696; GFX10:       ; %bb.0:
697; GFX10-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
698; GFX10-NEXT:    v_mov_b32_e32 v0, 0x1800
699; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
700; GFX10-NEXT:    global_load_ubyte v0, v0, s[0:1] offset:2047
701; GFX10-NEXT:    s_waitcnt vmcnt(0)
702; GFX10-NEXT:    global_store_byte v[0:1], v0, off
703; GFX10-NEXT:    s_endpgm
704  %gep = getelementptr i8, i8 addrspace(1)* %p, i64 8191
705  %load = load volatile i8, i8 addrspace(1)* %gep, align 1
706  store i8 %load, i8 addrspace(1)* undef
707  ret void
708}
709
; Kernel variant (pointer loaded into s[0:1] per the checks): volatile byte
; load at constant offset -2048, result stored to an undef pointer.
; Both the gfx900 and gfx1010 checks expect s[0:1] as the load's base, v0 = 0
; as the VGPR operand, and the negative offset folded into the immediate
; (offset:-2048).
710define amdgpu_kernel void @global_inst_salu_offset_neg_11bit_max(i8 addrspace(1)* %p) {
711; GFX9-LABEL: global_inst_salu_offset_neg_11bit_max:
712; GFX9:       ; %bb.0:
713; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
714; GFX9-NEXT:    v_mov_b32_e32 v0, 0
715; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
716; GFX9-NEXT:    global_load_ubyte v0, v0, s[0:1] offset:-2048
717; GFX9-NEXT:    s_waitcnt vmcnt(0)
718; GFX9-NEXT:    global_store_byte v[0:1], v0, off
719; GFX9-NEXT:    s_endpgm
720;
721; GFX10-LABEL: global_inst_salu_offset_neg_11bit_max:
722; GFX10:       ; %bb.0:
723; GFX10-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
724; GFX10-NEXT:    v_mov_b32_e32 v0, 0
725; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
726; GFX10-NEXT:    global_load_ubyte v0, v0, s[0:1] offset:-2048
727; GFX10-NEXT:    s_waitcnt vmcnt(0)
728; GFX10-NEXT:    global_store_byte v[0:1], v0, off
729; GFX10-NEXT:    s_endpgm
730  %gep = getelementptr i8, i8 addrspace(1)* %p, i64 -2048
731  %load = load volatile i8, i8 addrspace(1)* %gep, align 1
732  store i8 %load, i8 addrspace(1)* undef
733  ret void
734}
735
; Kernel-argument base pointer plus byte offset -4096. The checks expect GFX9 to
; fold the whole offset into the immediate, while GFX10 computes the address
; with a 64-bit VALU add (0xfffff000 low, -1 high) and uses no immediate.
736define amdgpu_kernel void @global_inst_salu_offset_neg_12bit_max(i8 addrspace(1)* %p) {
737; GFX9-LABEL: global_inst_salu_offset_neg_12bit_max:
738; GFX9:       ; %bb.0:
739; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
740; GFX9-NEXT:    v_mov_b32_e32 v0, 0
741; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
742; GFX9-NEXT:    global_load_ubyte v0, v0, s[0:1] offset:-4096
743; GFX9-NEXT:    s_waitcnt vmcnt(0)
744; GFX9-NEXT:    global_store_byte v[0:1], v0, off
745; GFX9-NEXT:    s_endpgm
746;
747; GFX10-LABEL: global_inst_salu_offset_neg_12bit_max:
748; GFX10:       ; %bb.0:
749; GFX10-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
750; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
751; GFX10-NEXT:    v_add_co_u32_e64 v0, s0, 0xfffff000, s0
752; GFX10-NEXT:    v_add_co_ci_u32_e64 v1, s0, -1, s1, s0
753; GFX10-NEXT:    global_load_ubyte v0, v[0:1], off
754; GFX10-NEXT:    s_waitcnt vmcnt(0)
755; GFX10-NEXT:    global_store_byte v[0:1], v0, off
756; GFX10-NEXT:    s_endpgm
757  %gep = getelementptr i8, i8 addrspace(1)* %p, i64 -4096
758  %load = load volatile i8, i8 addrspace(1)* %gep, align 1
759  store i8 %load, i8 addrspace(1)* undef
760  ret void
761}
762
; Kernel-argument base pointer plus byte offset -8192. The checks expect both
; targets to compute the full address with a 64-bit VALU add (0xffffe000 low,
; -1 high) and to use no immediate offset on the load.
763define amdgpu_kernel void @global_inst_salu_offset_neg_13bit_max(i8 addrspace(1)* %p) {
764; GFX9-LABEL: global_inst_salu_offset_neg_13bit_max:
765; GFX9:       ; %bb.0:
766; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
767; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
768; GFX9-NEXT:    v_mov_b32_e32 v0, s0
769; GFX9-NEXT:    v_mov_b32_e32 v1, s1
770; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, 0xffffe000, v0
771; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, -1, v1, vcc
772; GFX9-NEXT:    global_load_ubyte v0, v[0:1], off
773; GFX9-NEXT:    s_waitcnt vmcnt(0)
774; GFX9-NEXT:    global_store_byte v[0:1], v0, off
775; GFX9-NEXT:    s_endpgm
776;
777; GFX10-LABEL: global_inst_salu_offset_neg_13bit_max:
778; GFX10:       ; %bb.0:
779; GFX10-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
780; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
781; GFX10-NEXT:    v_add_co_u32_e64 v0, s0, 0xffffe000, s0
782; GFX10-NEXT:    v_add_co_ci_u32_e64 v1, s0, -1, s1, s0
783; GFX10-NEXT:    global_load_ubyte v0, v[0:1], off
784; GFX10-NEXT:    s_waitcnt vmcnt(0)
785; GFX10-NEXT:    global_store_byte v[0:1], v0, off
786; GFX10-NEXT:    s_endpgm
787  %gep = getelementptr i8, i8 addrspace(1)* %p, i64 -8192
788  %load = load volatile i8, i8 addrspace(1)* %gep, align 1
789  store i8 %load, i8 addrspace(1)* undef
790  ret void
791}
792
; Kernel-argument base pointer plus byte offset 4095. The checks expect GFX9 to
; fold the whole offset into the immediate (zero VGPR offset), while GFX10
; splits it as VGPR 0x800 + immediate 2047.
793define amdgpu_kernel void @global_inst_salu_offset_2x_11bit_max(i8 addrspace(1)* %p) {
794; GFX9-LABEL: global_inst_salu_offset_2x_11bit_max:
795; GFX9:       ; %bb.0:
796; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
797; GFX9-NEXT:    v_mov_b32_e32 v0, 0
798; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
799; GFX9-NEXT:    global_load_ubyte v0, v0, s[0:1] offset:4095
800; GFX9-NEXT:    s_waitcnt vmcnt(0)
801; GFX9-NEXT:    global_store_byte v[0:1], v0, off
802; GFX9-NEXT:    s_endpgm
803;
804; GFX10-LABEL: global_inst_salu_offset_2x_11bit_max:
805; GFX10:       ; %bb.0:
806; GFX10-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
807; GFX10-NEXT:    v_mov_b32_e32 v0, 0x800
808; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
809; GFX10-NEXT:    global_load_ubyte v0, v0, s[0:1] offset:2047
810; GFX10-NEXT:    s_waitcnt vmcnt(0)
811; GFX10-NEXT:    global_store_byte v[0:1], v0, off
812; GFX10-NEXT:    s_endpgm
813  %gep = getelementptr i8, i8 addrspace(1)* %p, i64 4095
814  %load = load volatile i8, i8 addrspace(1)* %gep, align 1
815  store i8 %load, i8 addrspace(1)* undef
816  ret void
817}
818
; Kernel-argument base pointer plus byte offset 8191. The checks expect the
; offset to be split between a VGPR constant and the immediate offset:
; 0x1000 + 4095 on GFX9 and 0x1800 + 2047 on GFX10.
819define amdgpu_kernel void @global_inst_salu_offset_2x_12bit_max(i8 addrspace(1)* %p) {
820; GFX9-LABEL: global_inst_salu_offset_2x_12bit_max:
821; GFX9:       ; %bb.0:
822; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
823; GFX9-NEXT:    v_mov_b32_e32 v0, 0x1000
824; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
825; GFX9-NEXT:    global_load_ubyte v0, v0, s[0:1] offset:4095
826; GFX9-NEXT:    s_waitcnt vmcnt(0)
827; GFX9-NEXT:    global_store_byte v[0:1], v0, off
828; GFX9-NEXT:    s_endpgm
829;
830; GFX10-LABEL: global_inst_salu_offset_2x_12bit_max:
831; GFX10:       ; %bb.0:
832; GFX10-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
833; GFX10-NEXT:    v_mov_b32_e32 v0, 0x1800
834; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
835; GFX10-NEXT:    global_load_ubyte v0, v0, s[0:1] offset:2047
836; GFX10-NEXT:    s_waitcnt vmcnt(0)
837; GFX10-NEXT:    global_store_byte v[0:1], v0, off
838; GFX10-NEXT:    s_endpgm
839  %gep = getelementptr i8, i8 addrspace(1)* %p, i64 8191
840  %load = load volatile i8, i8 addrspace(1)* %gep, align 1
841  store i8 %load, i8 addrspace(1)* undef
842  ret void
843}
844
; Kernel-argument base pointer plus byte offset 16383 (0x3fff). The checks
; expect the offset to be split between a VGPR constant and the immediate
; offset: 0x3000 + 4095 on GFX9 and 0x3800 + 2047 on GFX10.
845define amdgpu_kernel void @global_inst_salu_offset_2x_13bit_max(i8 addrspace(1)* %p) {
846; GFX9-LABEL: global_inst_salu_offset_2x_13bit_max:
847; GFX9:       ; %bb.0:
848; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
849; GFX9-NEXT:    v_mov_b32_e32 v0, 0x3000
850; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
851; GFX9-NEXT:    global_load_ubyte v0, v0, s[0:1] offset:4095
852; GFX9-NEXT:    s_waitcnt vmcnt(0)
853; GFX9-NEXT:    global_store_byte v[0:1], v0, off
854; GFX9-NEXT:    s_endpgm
855;
856; GFX10-LABEL: global_inst_salu_offset_2x_13bit_max:
857; GFX10:       ; %bb.0:
858; GFX10-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
859; GFX10-NEXT:    v_mov_b32_e32 v0, 0x3800
860; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
861; GFX10-NEXT:    global_load_ubyte v0, v0, s[0:1] offset:2047
862; GFX10-NEXT:    s_waitcnt vmcnt(0)
863; GFX10-NEXT:    global_store_byte v[0:1], v0, off
864; GFX10-NEXT:    s_endpgm
865  %gep = getelementptr i8, i8 addrspace(1)* %p, i64 16383
866  %load = load volatile i8, i8 addrspace(1)* %gep, align 1
867  store i8 %load, i8 addrspace(1)* undef
868  ret void
869}
870
; Kernel-argument base pointer plus byte offset -4096. The checks expect GFX9 to
; fold the whole offset into the immediate, while GFX10 computes the address
; with a 64-bit VALU add (0xfffff000 low, -1 high) and uses no immediate.
871define amdgpu_kernel void @global_inst_salu_offset_2x_neg_11bit_max(i8 addrspace(1)* %p) {
872; GFX9-LABEL: global_inst_salu_offset_2x_neg_11bit_max:
873; GFX9:       ; %bb.0:
874; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
875; GFX9-NEXT:    v_mov_b32_e32 v0, 0
876; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
877; GFX9-NEXT:    global_load_ubyte v0, v0, s[0:1] offset:-4096
878; GFX9-NEXT:    s_waitcnt vmcnt(0)
879; GFX9-NEXT:    global_store_byte v[0:1], v0, off
880; GFX9-NEXT:    s_endpgm
881;
882; GFX10-LABEL: global_inst_salu_offset_2x_neg_11bit_max:
883; GFX10:       ; %bb.0:
884; GFX10-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
885; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
886; GFX10-NEXT:    v_add_co_u32_e64 v0, s0, 0xfffff000, s0
887; GFX10-NEXT:    v_add_co_ci_u32_e64 v1, s0, -1, s1, s0
888; GFX10-NEXT:    global_load_ubyte v0, v[0:1], off
889; GFX10-NEXT:    s_waitcnt vmcnt(0)
890; GFX10-NEXT:    global_store_byte v[0:1], v0, off
891; GFX10-NEXT:    s_endpgm
892  %gep = getelementptr i8, i8 addrspace(1)* %p, i64 -4096
893  %load = load volatile i8, i8 addrspace(1)* %gep, align 1
894  store i8 %load, i8 addrspace(1)* undef
895  ret void
896}
897
; Kernel-argument base pointer plus byte offset -8192. The checks expect both
; targets to compute the full address with a 64-bit VALU add (0xffffe000 low,
; -1 high) and to use no immediate offset on the load.
898define amdgpu_kernel void @global_inst_salu_offset_2x_neg_12bit_max(i8 addrspace(1)* %p) {
899; GFX9-LABEL: global_inst_salu_offset_2x_neg_12bit_max:
900; GFX9:       ; %bb.0:
901; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
902; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
903; GFX9-NEXT:    v_mov_b32_e32 v0, s0
904; GFX9-NEXT:    v_mov_b32_e32 v1, s1
905; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, 0xffffe000, v0
906; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, -1, v1, vcc
907; GFX9-NEXT:    global_load_ubyte v0, v[0:1], off
908; GFX9-NEXT:    s_waitcnt vmcnt(0)
909; GFX9-NEXT:    global_store_byte v[0:1], v0, off
910; GFX9-NEXT:    s_endpgm
911;
912; GFX10-LABEL: global_inst_salu_offset_2x_neg_12bit_max:
913; GFX10:       ; %bb.0:
914; GFX10-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
915; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
916; GFX10-NEXT:    v_add_co_u32_e64 v0, s0, 0xffffe000, s0
917; GFX10-NEXT:    v_add_co_ci_u32_e64 v1, s0, -1, s1, s0
918; GFX10-NEXT:    global_load_ubyte v0, v[0:1], off
919; GFX10-NEXT:    s_waitcnt vmcnt(0)
920; GFX10-NEXT:    global_store_byte v[0:1], v0, off
921; GFX10-NEXT:    s_endpgm
922  %gep = getelementptr i8, i8 addrspace(1)* %p, i64 -8192
923  %load = load volatile i8, i8 addrspace(1)* %gep, align 1
924  store i8 %load, i8 addrspace(1)* undef
925  ret void
926}
927
; Kernel-argument base pointer plus byte offset -16384. The checks expect both
; targets to compute the full address with a 64-bit VALU add (0xffffc000 low,
; -1 high) and to use no immediate offset on the load.
928define amdgpu_kernel void @global_inst_salu_offset_2x_neg_13bit_max(i8 addrspace(1)* %p) {
929; GFX9-LABEL: global_inst_salu_offset_2x_neg_13bit_max:
930; GFX9:       ; %bb.0:
931; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
932; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
933; GFX9-NEXT:    v_mov_b32_e32 v0, s0
934; GFX9-NEXT:    v_mov_b32_e32 v1, s1
935; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, 0xffffc000, v0
936; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, -1, v1, vcc
937; GFX9-NEXT:    global_load_ubyte v0, v[0:1], off
938; GFX9-NEXT:    s_waitcnt vmcnt(0)
939; GFX9-NEXT:    global_store_byte v[0:1], v0, off
940; GFX9-NEXT:    s_endpgm
941;
942; GFX10-LABEL: global_inst_salu_offset_2x_neg_13bit_max:
943; GFX10:       ; %bb.0:
944; GFX10-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
945; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
946; GFX10-NEXT:    v_add_co_u32_e64 v0, s0, 0xffffc000, s0
947; GFX10-NEXT:    v_add_co_ci_u32_e64 v1, s0, -1, s1, s0
948; GFX10-NEXT:    global_load_ubyte v0, v[0:1], off
949; GFX10-NEXT:    s_waitcnt vmcnt(0)
950; GFX10-NEXT:    global_store_byte v[0:1], v0, off
951; GFX10-NEXT:    s_endpgm
952  %gep = getelementptr i8, i8 addrspace(1)* %p, i64 -16384
953  %load = load volatile i8, i8 addrspace(1)* %gep, align 1
954  store i8 %load, i8 addrspace(1)* undef
955  ret void
956}
957
958; Fill 11-bit low-bits (1ull << 33) | 2047
; Byte offset 8589936639 = 2^33 + 2047. The checks expect both targets to add 2
; to the high dword (and 0 to the low dword) of the base address and to carry
; the remaining 2047 in the load's immediate offset.
959define amdgpu_kernel void @global_inst_salu_offset_64bit_11bit_split0(i8 addrspace(1)* %p) {
960; GFX9-LABEL: global_inst_salu_offset_64bit_11bit_split0:
961; GFX9:       ; %bb.0:
962; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
963; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
964; GFX9-NEXT:    v_mov_b32_e32 v1, s1
965; GFX9-NEXT:    v_add_co_u32_e64 v0, vcc, 0, s0
966; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, 2, v1, vcc
967; GFX9-NEXT:    global_load_ubyte v0, v[0:1], off offset:2047
968; GFX9-NEXT:    s_waitcnt vmcnt(0)
969; GFX9-NEXT:    global_store_byte v[0:1], v0, off
970; GFX9-NEXT:    s_endpgm
971;
972; GFX10-LABEL: global_inst_salu_offset_64bit_11bit_split0:
973; GFX10:       ; %bb.0:
974; GFX10-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
975; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
976; GFX10-NEXT:    v_add_co_u32_e64 v0, s0, 0, s0
977; GFX10-NEXT:    v_add_co_ci_u32_e64 v1, s0, 2, s1, s0
978; GFX10-NEXT:    global_load_ubyte v0, v[0:1], off offset:2047
979; GFX10-NEXT:    s_waitcnt vmcnt(0)
980; GFX10-NEXT:    global_store_byte v[0:1], v0, off
981; GFX10-NEXT:    s_endpgm
982  %gep = getelementptr i8, i8 addrspace(1)* %p, i64 8589936639
983  %load = load volatile i8, i8 addrspace(1)* %gep, align 1
984  store i8 %load, i8 addrspace(1)* undef
985  ret void
986}
987
988; Fill 11-bit low-bits (1ull << 33) | 2048
; Byte offset 8589936640 = 2^33 + 2048. Both targets are expected to add 2 to
; the high dword. GFX9 keeps 2048 in the immediate offset; GFX10 instead adds
; 0x800 to the low dword and uses no immediate offset.
989define amdgpu_kernel void @global_inst_salu_offset_64bit_11bit_split1(i8 addrspace(1)* %p) {
990; GFX9-LABEL: global_inst_salu_offset_64bit_11bit_split1:
991; GFX9:       ; %bb.0:
992; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
993; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
994; GFX9-NEXT:    v_mov_b32_e32 v1, s1
995; GFX9-NEXT:    v_add_co_u32_e64 v0, vcc, 0, s0
996; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, 2, v1, vcc
997; GFX9-NEXT:    global_load_ubyte v0, v[0:1], off offset:2048
998; GFX9-NEXT:    s_waitcnt vmcnt(0)
999; GFX9-NEXT:    global_store_byte v[0:1], v0, off
1000; GFX9-NEXT:    s_endpgm
1001;
1002; GFX10-LABEL: global_inst_salu_offset_64bit_11bit_split1:
1003; GFX10:       ; %bb.0:
1004; GFX10-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
1005; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
1006; GFX10-NEXT:    v_add_co_u32_e64 v0, s0, 0x800, s0
1007; GFX10-NEXT:    v_add_co_ci_u32_e64 v1, s0, 2, s1, s0
1008; GFX10-NEXT:    global_load_ubyte v0, v[0:1], off
1009; GFX10-NEXT:    s_waitcnt vmcnt(0)
1010; GFX10-NEXT:    global_store_byte v[0:1], v0, off
1011; GFX10-NEXT:    s_endpgm
1012  %gep = getelementptr i8, i8 addrspace(1)* %p, i64 8589936640
1013  %load = load volatile i8, i8 addrspace(1)* %gep, align 1
1014  store i8 %load, i8 addrspace(1)* undef
1015  ret void
1016}
1017
1018; Fill 12-bit low-bits (1ull << 33) | 4095
; Byte offset 8589938687 = 2^33 + 4095. Both targets are expected to add 2 to
; the high dword. GFX9 keeps all 4095 in the immediate offset; GFX10 splits the
; low part as 0x800 added to the low dword plus immediate 2047.
1019define amdgpu_kernel void @global_inst_salu_offset_64bit_12bit_split0(i8 addrspace(1)* %p) {
1020; GFX9-LABEL: global_inst_salu_offset_64bit_12bit_split0:
1021; GFX9:       ; %bb.0:
1022; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
1023; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
1024; GFX9-NEXT:    v_mov_b32_e32 v1, s1
1025; GFX9-NEXT:    v_add_co_u32_e64 v0, vcc, 0, s0
1026; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, 2, v1, vcc
1027; GFX9-NEXT:    global_load_ubyte v0, v[0:1], off offset:4095
1028; GFX9-NEXT:    s_waitcnt vmcnt(0)
1029; GFX9-NEXT:    global_store_byte v[0:1], v0, off
1030; GFX9-NEXT:    s_endpgm
1031;
1032; GFX10-LABEL: global_inst_salu_offset_64bit_12bit_split0:
1033; GFX10:       ; %bb.0:
1034; GFX10-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
1035; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
1036; GFX10-NEXT:    v_add_co_u32_e64 v0, s0, 0x800, s0
1037; GFX10-NEXT:    v_add_co_ci_u32_e64 v1, s0, 2, s1, s0
1038; GFX10-NEXT:    global_load_ubyte v0, v[0:1], off offset:2047
1039; GFX10-NEXT:    s_waitcnt vmcnt(0)
1040; GFX10-NEXT:    global_store_byte v[0:1], v0, off
1041; GFX10-NEXT:    s_endpgm
1042  %gep = getelementptr i8, i8 addrspace(1)* %p, i64 8589938687
1043  %load = load volatile i8, i8 addrspace(1)* %gep, align 1
1044  store i8 %load, i8 addrspace(1)* undef
1045  ret void
1046}
1047
1048; Fill 12-bit low-bits (1ull << 33) | 4096
; Byte offset 8589938688 = 2^33 + 4096. The checks expect both targets to add
; 0x1000 to the low dword and 2 to the high dword of the base address, leaving
; no immediate offset on the load.
1049define amdgpu_kernel void @global_inst_salu_offset_64bit_12bit_split1(i8 addrspace(1)* %p) {
1050; GFX9-LABEL: global_inst_salu_offset_64bit_12bit_split1:
1051; GFX9:       ; %bb.0:
1052; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
1053; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
1054; GFX9-NEXT:    v_mov_b32_e32 v0, s0
1055; GFX9-NEXT:    v_mov_b32_e32 v1, s1
1056; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, 0x1000, v0
1057; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, 2, v1, vcc
1058; GFX9-NEXT:    global_load_ubyte v0, v[0:1], off
1059; GFX9-NEXT:    s_waitcnt vmcnt(0)
1060; GFX9-NEXT:    global_store_byte v[0:1], v0, off
1061; GFX9-NEXT:    s_endpgm
1062;
1063; GFX10-LABEL: global_inst_salu_offset_64bit_12bit_split1:
1064; GFX10:       ; %bb.0:
1065; GFX10-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
1066; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
1067; GFX10-NEXT:    v_add_co_u32_e64 v0, s0, 0x1000, s0
1068; GFX10-NEXT:    v_add_co_ci_u32_e64 v1, s0, 2, s1, s0
1069; GFX10-NEXT:    global_load_ubyte v0, v[0:1], off
1070; GFX10-NEXT:    s_waitcnt vmcnt(0)
1071; GFX10-NEXT:    global_store_byte v[0:1], v0, off
1072; GFX10-NEXT:    s_endpgm
1073  %gep = getelementptr i8, i8 addrspace(1)* %p, i64 8589938688
1074  %load = load volatile i8, i8 addrspace(1)* %gep, align 1
1075  store i8 %load, i8 addrspace(1)* undef
1076  ret void
1077}
1078
1079; Fill 13-bit low-bits (1ull << 33) | 8191
; Byte offset 8589942783 = 2^33 + 8191. Both targets are expected to add 2 to
; the high dword and to split the low part between a low-dword add and the
; immediate offset: 0x1000 + 4095 on GFX9 and 0x1800 + 2047 on GFX10.
1080define amdgpu_kernel void @global_inst_salu_offset_64bit_13bit_split0(i8 addrspace(1)* %p) {
1081; GFX9-LABEL: global_inst_salu_offset_64bit_13bit_split0:
1082; GFX9:       ; %bb.0:
1083; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
1084; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
1085; GFX9-NEXT:    v_mov_b32_e32 v0, s0
1086; GFX9-NEXT:    v_mov_b32_e32 v1, s1
1087; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, 0x1000, v0
1088; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, 2, v1, vcc
1089; GFX9-NEXT:    global_load_ubyte v0, v[0:1], off offset:4095
1090; GFX9-NEXT:    s_waitcnt vmcnt(0)
1091; GFX9-NEXT:    global_store_byte v[0:1], v0, off
1092; GFX9-NEXT:    s_endpgm
1093;
1094; GFX10-LABEL: global_inst_salu_offset_64bit_13bit_split0:
1095; GFX10:       ; %bb.0:
1096; GFX10-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
1097; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
1098; GFX10-NEXT:    v_add_co_u32_e64 v0, s0, 0x1800, s0
1099; GFX10-NEXT:    v_add_co_ci_u32_e64 v1, s0, 2, s1, s0
1100; GFX10-NEXT:    global_load_ubyte v0, v[0:1], off offset:2047
1101; GFX10-NEXT:    s_waitcnt vmcnt(0)
1102; GFX10-NEXT:    global_store_byte v[0:1], v0, off
1103; GFX10-NEXT:    s_endpgm
1104  %gep = getelementptr i8, i8 addrspace(1)* %p, i64 8589942783
1105  %load = load volatile i8, i8 addrspace(1)* %gep, align 1
1106  store i8 %load, i8 addrspace(1)* undef
1107  ret void
1108}
1109
1110; Fill 13-bit low-bits (1ull << 33) | 8192
; Byte offset 8589942784 = 2^33 + 8192. The checks expect both targets to add
; 0x2000 to the low dword and 2 to the high dword of the base address, leaving
; no immediate offset on the load.
1111define amdgpu_kernel void @global_inst_salu_offset_64bit_13bit_split1(i8 addrspace(1)* %p) {
1112; GFX9-LABEL: global_inst_salu_offset_64bit_13bit_split1:
1113; GFX9:       ; %bb.0:
1114; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
1115; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
1116; GFX9-NEXT:    v_mov_b32_e32 v0, s0
1117; GFX9-NEXT:    v_mov_b32_e32 v1, s1
1118; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, 0x2000, v0
1119; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, 2, v1, vcc
1120; GFX9-NEXT:    global_load_ubyte v0, v[0:1], off
1121; GFX9-NEXT:    s_waitcnt vmcnt(0)
1122; GFX9-NEXT:    global_store_byte v[0:1], v0, off
1123; GFX9-NEXT:    s_endpgm
1124;
1125; GFX10-LABEL: global_inst_salu_offset_64bit_13bit_split1:
1126; GFX10:       ; %bb.0:
1127; GFX10-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
1128; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
1129; GFX10-NEXT:    v_add_co_u32_e64 v0, s0, 0x2000, s0
1130; GFX10-NEXT:    v_add_co_ci_u32_e64 v1, s0, 2, s1, s0
1131; GFX10-NEXT:    global_load_ubyte v0, v[0:1], off
1132; GFX10-NEXT:    s_waitcnt vmcnt(0)
1133; GFX10-NEXT:    global_store_byte v[0:1], v0, off
1134; GFX10-NEXT:    s_endpgm
1135  %gep = getelementptr i8, i8 addrspace(1)* %p, i64 8589942784
1136  %load = load volatile i8, i8 addrspace(1)* %gep, align 1
1137  store i8 %load, i8 addrspace(1)* undef
1138  ret void
1139}
1140
1141; Fill 11-bit low-bits, negative high bits (1ull << 63) | 2047
; Byte offset -9223372036854773761 = -2^63 + 2047. The low 2047 is expected to
; be reached with a negative immediate: 0x1000 with offset:-2049 on GFX9 and
; 0x800 with offset:-1 on GFX10. The high-dword addend is the literal 0x80000000
; on GFX10; GFX9 builds it with v_bfrev_b32 of 1 (presumably the same value).
1142define amdgpu_kernel void @global_inst_salu_offset_64bit_11bit_neg_high_split0(i8 addrspace(1)* %p) {
1143; GFX9-LABEL: global_inst_salu_offset_64bit_11bit_neg_high_split0:
1144; GFX9:       ; %bb.0:
1145; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
1146; GFX9-NEXT:    v_bfrev_b32_e32 v1, 1
1147; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
1148; GFX9-NEXT:    v_mov_b32_e32 v0, s0
1149; GFX9-NEXT:    v_mov_b32_e32 v2, s1
1150; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, 0x1000, v0
1151; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, v1, v2, vcc
1152; GFX9-NEXT:    global_load_ubyte v0, v[0:1], off offset:-2049
1153; GFX9-NEXT:    s_waitcnt vmcnt(0)
1154; GFX9-NEXT:    global_store_byte v[0:1], v0, off
1155; GFX9-NEXT:    s_endpgm
1156;
1157; GFX10-LABEL: global_inst_salu_offset_64bit_11bit_neg_high_split0:
1158; GFX10:       ; %bb.0:
1159; GFX10-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
1160; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
1161; GFX10-NEXT:    v_mov_b32_e32 v1, s1
1162; GFX10-NEXT:    v_add_co_u32_e64 v0, vcc_lo, 0x800, s0
1163; GFX10-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 0x80000000, v1, vcc_lo
1164; GFX10-NEXT:    global_load_ubyte v0, v[0:1], off offset:-1
1165; GFX10-NEXT:    s_waitcnt vmcnt(0)
1166; GFX10-NEXT:    global_store_byte v[0:1], v0, off
1167; GFX10-NEXT:    s_endpgm
1168  %gep = getelementptr i8, i8 addrspace(1)* %p, i64 -9223372036854773761
1169  %load = load volatile i8, i8 addrspace(1)* %gep, align 1
1170  store i8 %load, i8 addrspace(1)* undef
1171  ret void
1172}
1173
1174; Fill 11-bit low-bits, negative high bits (1ull << 63) | 2048
; Byte offset -9223372036854773760 = -2^63 + 2048. GFX9 is expected to reach the
; low 2048 as 0x1000 with offset:-2048; GFX10 adds 0x800 and uses no immediate.
; The high-dword addend is the literal 0x80000000 on GFX10; GFX9 builds it with
; v_bfrev_b32 of 1 (presumably the same value).
1175define amdgpu_kernel void @global_inst_salu_offset_64bit_11bit_neg_high_split1(i8 addrspace(1)* %p) {
1176; GFX9-LABEL: global_inst_salu_offset_64bit_11bit_neg_high_split1:
1177; GFX9:       ; %bb.0:
1178; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
1179; GFX9-NEXT:    v_bfrev_b32_e32 v1, 1
1180; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
1181; GFX9-NEXT:    v_mov_b32_e32 v0, s0
1182; GFX9-NEXT:    v_mov_b32_e32 v2, s1
1183; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, 0x1000, v0
1184; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, v1, v2, vcc
1185; GFX9-NEXT:    global_load_ubyte v0, v[0:1], off offset:-2048
1186; GFX9-NEXT:    s_waitcnt vmcnt(0)
1187; GFX9-NEXT:    global_store_byte v[0:1], v0, off
1188; GFX9-NEXT:    s_endpgm
1189;
1190; GFX10-LABEL: global_inst_salu_offset_64bit_11bit_neg_high_split1:
1191; GFX10:       ; %bb.0:
1192; GFX10-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
1193; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
1194; GFX10-NEXT:    v_mov_b32_e32 v1, s1
1195; GFX10-NEXT:    v_add_co_u32_e64 v0, vcc_lo, 0x800, s0
1196; GFX10-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 0x80000000, v1, vcc_lo
1197; GFX10-NEXT:    global_load_ubyte v0, v[0:1], off
1198; GFX10-NEXT:    s_waitcnt vmcnt(0)
1199; GFX10-NEXT:    global_store_byte v[0:1], v0, off
1200; GFX10-NEXT:    s_endpgm
1201  %gep = getelementptr i8, i8 addrspace(1)* %p, i64 -9223372036854773760
1202  %load = load volatile i8, i8 addrspace(1)* %gep, align 1
1203  store i8 %load, i8 addrspace(1)* undef
1204  ret void
1205}
1206
1207; Fill 12-bit low-bits, negative high bits (1ull << 63) | 4095
; Byte offset -9223372036854771713 = -2^63 + 4095. Both targets are expected to
; add 0x1000 to the low dword and use immediate offset:-1. The high-dword addend
; is the literal 0x80000000 on GFX10; GFX9 builds it with v_bfrev_b32 of 1
; (presumably the same value).
1208define amdgpu_kernel void @global_inst_salu_offset_64bit_12bit_neg_high_split0(i8 addrspace(1)* %p) {
1209; GFX9-LABEL: global_inst_salu_offset_64bit_12bit_neg_high_split0:
1210; GFX9:       ; %bb.0:
1211; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
1212; GFX9-NEXT:    v_bfrev_b32_e32 v1, 1
1213; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
1214; GFX9-NEXT:    v_mov_b32_e32 v0, s0
1215; GFX9-NEXT:    v_mov_b32_e32 v2, s1
1216; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, 0x1000, v0
1217; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, v1, v2, vcc
1218; GFX9-NEXT:    global_load_ubyte v0, v[0:1], off offset:-1
1219; GFX9-NEXT:    s_waitcnt vmcnt(0)
1220; GFX9-NEXT:    global_store_byte v[0:1], v0, off
1221; GFX9-NEXT:    s_endpgm
1222;
1223; GFX10-LABEL: global_inst_salu_offset_64bit_12bit_neg_high_split0:
1224; GFX10:       ; %bb.0:
1225; GFX10-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
1226; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
1227; GFX10-NEXT:    v_mov_b32_e32 v1, s1
1228; GFX10-NEXT:    v_add_co_u32_e64 v0, vcc_lo, 0x1000, s0
1229; GFX10-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 0x80000000, v1, vcc_lo
1230; GFX10-NEXT:    global_load_ubyte v0, v[0:1], off offset:-1
1231; GFX10-NEXT:    s_waitcnt vmcnt(0)
1232; GFX10-NEXT:    global_store_byte v[0:1], v0, off
1233; GFX10-NEXT:    s_endpgm
1234  %gep = getelementptr i8, i8 addrspace(1)* %p, i64 -9223372036854771713
1235  %load = load volatile i8, i8 addrspace(1)* %gep, align 1
1236  store i8 %load, i8 addrspace(1)* undef
1237  ret void
1238}
1239
1240; Fill 12-bit low-bits, negative high bits (1ull << 63) | 4096
; Byte offset -9223372036854771712 = -2^63 + 4096. Both targets are expected to
; add 0x1000 to the low dword and use no immediate offset. The high-dword addend
; is the literal 0x80000000 on GFX10; GFX9 builds it with v_bfrev_b32 of 1
; (presumably the same value).
1241define amdgpu_kernel void @global_inst_salu_offset_64bit_12bit_neg_high_split1(i8 addrspace(1)* %p) {
1242; GFX9-LABEL: global_inst_salu_offset_64bit_12bit_neg_high_split1:
1243; GFX9:       ; %bb.0:
1244; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
1245; GFX9-NEXT:    v_bfrev_b32_e32 v1, 1
1246; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
1247; GFX9-NEXT:    v_mov_b32_e32 v0, s0
1248; GFX9-NEXT:    v_mov_b32_e32 v2, s1
1249; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, 0x1000, v0
1250; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, v1, v2, vcc
1251; GFX9-NEXT:    global_load_ubyte v0, v[0:1], off
1252; GFX9-NEXT:    s_waitcnt vmcnt(0)
1253; GFX9-NEXT:    global_store_byte v[0:1], v0, off
1254; GFX9-NEXT:    s_endpgm
1255;
1256; GFX10-LABEL: global_inst_salu_offset_64bit_12bit_neg_high_split1:
1257; GFX10:       ; %bb.0:
1258; GFX10-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
1259; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
1260; GFX10-NEXT:    v_mov_b32_e32 v1, s1
1261; GFX10-NEXT:    v_add_co_u32_e64 v0, vcc_lo, 0x1000, s0
1262; GFX10-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 0x80000000, v1, vcc_lo
1263; GFX10-NEXT:    global_load_ubyte v0, v[0:1], off
1264; GFX10-NEXT:    s_waitcnt vmcnt(0)
1265; GFX10-NEXT:    global_store_byte v[0:1], v0, off
1266; GFX10-NEXT:    s_endpgm
1267  %gep = getelementptr i8, i8 addrspace(1)* %p, i64 -9223372036854771712
1268  %load = load volatile i8, i8 addrspace(1)* %gep, align 1
1269  store i8 %load, i8 addrspace(1)* undef
1270  ret void
1271}
1272
1273; Fill 13-bit low-bits, negative high bits (1ull << 63) | 8191
; Byte offset -9223372036854767617 = -2^63 + 8191. Both targets are expected to
; add 0x2000 to the low dword and use immediate offset:-1. The high-dword addend
; is the literal 0x80000000 on GFX10; GFX9 builds it with v_bfrev_b32 of 1
; (presumably the same value).
1274define amdgpu_kernel void @global_inst_salu_offset_64bit_13bit_neg_high_split0(i8 addrspace(1)* %p) {
1275; GFX9-LABEL: global_inst_salu_offset_64bit_13bit_neg_high_split0:
1276; GFX9:       ; %bb.0:
1277; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
1278; GFX9-NEXT:    v_bfrev_b32_e32 v1, 1
1279; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
1280; GFX9-NEXT:    v_mov_b32_e32 v0, s0
1281; GFX9-NEXT:    v_mov_b32_e32 v2, s1
1282; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, 0x2000, v0
1283; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, v1, v2, vcc
1284; GFX9-NEXT:    global_load_ubyte v0, v[0:1], off offset:-1
1285; GFX9-NEXT:    s_waitcnt vmcnt(0)
1286; GFX9-NEXT:    global_store_byte v[0:1], v0, off
1287; GFX9-NEXT:    s_endpgm
1288;
1289; GFX10-LABEL: global_inst_salu_offset_64bit_13bit_neg_high_split0:
1290; GFX10:       ; %bb.0:
1291; GFX10-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
1292; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
1293; GFX10-NEXT:    v_mov_b32_e32 v1, s1
1294; GFX10-NEXT:    v_add_co_u32_e64 v0, vcc_lo, 0x2000, s0
1295; GFX10-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 0x80000000, v1, vcc_lo
1296; GFX10-NEXT:    global_load_ubyte v0, v[0:1], off offset:-1
1297; GFX10-NEXT:    s_waitcnt vmcnt(0)
1298; GFX10-NEXT:    global_store_byte v[0:1], v0, off
1299; GFX10-NEXT:    s_endpgm
1300  %gep = getelementptr i8, i8 addrspace(1)* %p, i64 -9223372036854767617
1301  %load = load volatile i8, i8 addrspace(1)* %gep, align 1
1302  store i8 %load, i8 addrspace(1)* undef
1303  ret void
1304}
1305
1306; Fill 13-bit low-bits, negative high bits (1ull << 63) | 8192
; Byte offset -9223372036854767616 = -2^63 + 8192. Both targets are expected to
; add 0x2000 to the low dword and use no immediate offset. The high-dword addend
; is the literal 0x80000000 on GFX10; GFX9 builds it with v_bfrev_b32 of 1
; (presumably the same value).
1307define amdgpu_kernel void @global_inst_salu_offset_64bit_13bit_neg_high_split1(i8 addrspace(1)* %p) {
1308; GFX9-LABEL: global_inst_salu_offset_64bit_13bit_neg_high_split1:
1309; GFX9:       ; %bb.0:
1310; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
1311; GFX9-NEXT:    v_bfrev_b32_e32 v1, 1
1312; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
1313; GFX9-NEXT:    v_mov_b32_e32 v0, s0
1314; GFX9-NEXT:    v_mov_b32_e32 v2, s1
1315; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, 0x2000, v0
1316; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, v1, v2, vcc
1317; GFX9-NEXT:    global_load_ubyte v0, v[0:1], off
1318; GFX9-NEXT:    s_waitcnt vmcnt(0)
1319; GFX9-NEXT:    global_store_byte v[0:1], v0, off
1320; GFX9-NEXT:    s_endpgm
1321;
1322; GFX10-LABEL: global_inst_salu_offset_64bit_13bit_neg_high_split1:
1323; GFX10:       ; %bb.0:
1324; GFX10-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
1325; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
1326; GFX10-NEXT:    v_mov_b32_e32 v1, s1
1327; GFX10-NEXT:    v_add_co_u32_e64 v0, vcc_lo, 0x2000, s0
1328; GFX10-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 0x80000000, v1, vcc_lo
1329; GFX10-NEXT:    global_load_ubyte v0, v[0:1], off
1330; GFX10-NEXT:    s_waitcnt vmcnt(0)
1331; GFX10-NEXT:    global_store_byte v[0:1], v0, off
1332; GFX10-NEXT:    s_endpgm
1333  %gep = getelementptr i8, i8 addrspace(1)* %p, i64 -9223372036854767616
1334  %load = load volatile i8, i8 addrspace(1)* %gep, align 1
1335  store i8 %load, i8 addrspace(1)* undef
1336  ret void
1337}
1338