1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX9 %s
3; RUN: llc -march=amdgcn -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX10 %s
4
5; Test splitting flat instruction offsets into the low and high bits
6; when the offset doesn't fit in the offset field.
7
; Byte load from %p + 1 in a non-kernel function (pointer arrives in v[0:1]).
; gfx900 check lines expect the constant to be carried entirely in the flat
; load's immediate field (offset:1), with no address arithmetic.
; gfx1010 check lines expect a 64-bit add on v[0:1] (low add of 1, then a
; carry add of 0 into the high half) and a flat load with no offset.
8define i8 @flat_inst_valu_offset_1(i8* %p) {
9; GFX9-LABEL: flat_inst_valu_offset_1:
10; GFX9:       ; %bb.0:
11; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
12; GFX9-NEXT:    flat_load_ubyte v0, v[0:1] offset:1
13; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
14; GFX9-NEXT:    s_setpc_b64 s[30:31]
15;
16; GFX10-LABEL: flat_inst_valu_offset_1:
17; GFX10:       ; %bb.0:
18; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
19; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
20; GFX10-NEXT:    v_add_co_u32_e64 v0, vcc_lo, v0, 1
21; GFX10-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
22; GFX10-NEXT:    flat_load_ubyte v0, v[0:1]
23; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
24; GFX10-NEXT:    s_setpc_b64 s[30:31]
25  %gep = getelementptr i8, i8* %p, i64 1
26  %load = load i8, i8* %gep, align 4
27  ret i8 %load
28}
29
; Byte load from %p + 2047 (0x7ff).
; gfx900 check lines expect the whole constant in the flat load's immediate
; field (offset:2047).
; gfx1010 check lines expect a 64-bit add of 0x7ff to v[0:1] and a flat load
; with no offset.
30define i8 @flat_inst_valu_offset_11bit_max(i8* %p) {
31; GFX9-LABEL: flat_inst_valu_offset_11bit_max:
32; GFX9:       ; %bb.0:
33; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
34; GFX9-NEXT:    flat_load_ubyte v0, v[0:1] offset:2047
35; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
36; GFX9-NEXT:    s_setpc_b64 s[30:31]
37;
38; GFX10-LABEL: flat_inst_valu_offset_11bit_max:
39; GFX10:       ; %bb.0:
40; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
41; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
42; GFX10-NEXT:    v_add_co_u32_e64 v0, vcc_lo, 0x7ff, v0
43; GFX10-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
44; GFX10-NEXT:    flat_load_ubyte v0, v[0:1]
45; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
46; GFX10-NEXT:    s_setpc_b64 s[30:31]
47  %gep = getelementptr i8, i8* %p, i64 2047
48  %load = load i8, i8* %gep, align 4
49  ret i8 %load
50}
51
; Byte load from %p + 4095 (0xfff).
; gfx900 check lines expect the whole constant in the flat load's immediate
; field (offset:4095).
; gfx1010 check lines expect a 64-bit add of 0xfff to v[0:1] and a flat load
; with no offset.
52define i8 @flat_inst_valu_offset_12bit_max(i8* %p) {
53; GFX9-LABEL: flat_inst_valu_offset_12bit_max:
54; GFX9:       ; %bb.0:
55; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
56; GFX9-NEXT:    flat_load_ubyte v0, v[0:1] offset:4095
57; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
58; GFX9-NEXT:    s_setpc_b64 s[30:31]
59;
60; GFX10-LABEL: flat_inst_valu_offset_12bit_max:
61; GFX10:       ; %bb.0:
62; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
63; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
64; GFX10-NEXT:    v_add_co_u32_e64 v0, vcc_lo, 0xfff, v0
65; GFX10-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
66; GFX10-NEXT:    flat_load_ubyte v0, v[0:1]
67; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
68; GFX10-NEXT:    s_setpc_b64 s[30:31]
69  %gep = getelementptr i8, i8* %p, i64 4095
70  %load = load i8, i8* %gep, align 4
71  ret i8 %load
72}
73
; Byte load from %p + 8191 (0x1fff).
; gfx900 check lines expect the constant to be split: 0x1000 is added to the
; 64-bit base in v[0:1] and the remaining 4095 is carried in the flat load's
; immediate field (0x1000 + 0xfff = 0x1fff).
; gfx1010 check lines expect the full 0x1fff to be added to v[0:1] and a flat
; load with no offset.
74define i8 @flat_inst_valu_offset_13bit_max(i8* %p) {
75; GFX9-LABEL: flat_inst_valu_offset_13bit_max:
76; GFX9:       ; %bb.0:
77; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
78; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, 0x1000, v0
79; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
80; GFX9-NEXT:    flat_load_ubyte v0, v[0:1] offset:4095
81; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
82; GFX9-NEXT:    s_setpc_b64 s[30:31]
83;
84; GFX10-LABEL: flat_inst_valu_offset_13bit_max:
85; GFX10:       ; %bb.0:
86; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
87; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
88; GFX10-NEXT:    v_add_co_u32_e64 v0, vcc_lo, 0x1fff, v0
89; GFX10-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
90; GFX10-NEXT:    flat_load_ubyte v0, v[0:1]
91; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
92; GFX10-NEXT:    s_setpc_b64 s[30:31]
93  %gep = getelementptr i8, i8* %p, i64 8191
94  %load = load i8, i8* %gep, align 4
95  ret i8 %load
96}
97
; Byte load from %p - 2048.
; Both the gfx900 and gfx1010 check lines expect the negative constant to be
; applied with a 64-bit add on v[0:1] (0xfffff800 into the low half, -1 with
; carry into the high half) and a flat load with no offset.
98define i8 @flat_inst_valu_offset_neg_11bit_max(i8* %p) {
99; GFX9-LABEL: flat_inst_valu_offset_neg_11bit_max:
100; GFX9:       ; %bb.0:
101; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
102; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, 0xfffff800, v0
103; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, -1, v1, vcc
104; GFX9-NEXT:    flat_load_ubyte v0, v[0:1]
105; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
106; GFX9-NEXT:    s_setpc_b64 s[30:31]
107;
108; GFX10-LABEL: flat_inst_valu_offset_neg_11bit_max:
109; GFX10:       ; %bb.0:
110; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
111; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
112; GFX10-NEXT:    v_add_co_u32_e64 v0, vcc_lo, 0xfffff800, v0
113; GFX10-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo
114; GFX10-NEXT:    flat_load_ubyte v0, v[0:1]
115; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
116; GFX10-NEXT:    s_setpc_b64 s[30:31]
117  %gep = getelementptr i8, i8* %p, i64 -2048
118  %load = load i8, i8* %gep, align 4
119  ret i8 %load
120}
121
; Byte load from %p - 4096.
; Both the gfx900 and gfx1010 check lines expect a 64-bit add on v[0:1]
; (0xfffff000 into the low half, -1 with carry into the high half) and a flat
; load with no offset.
122define i8 @flat_inst_valu_offset_neg_12bit_max(i8* %p) {
123; GFX9-LABEL: flat_inst_valu_offset_neg_12bit_max:
124; GFX9:       ; %bb.0:
125; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
126; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, 0xfffff000, v0
127; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, -1, v1, vcc
128; GFX9-NEXT:    flat_load_ubyte v0, v[0:1]
129; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
130; GFX9-NEXT:    s_setpc_b64 s[30:31]
131;
132; GFX10-LABEL: flat_inst_valu_offset_neg_12bit_max:
133; GFX10:       ; %bb.0:
134; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
135; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
136; GFX10-NEXT:    v_add_co_u32_e64 v0, vcc_lo, 0xfffff000, v0
137; GFX10-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo
138; GFX10-NEXT:    flat_load_ubyte v0, v[0:1]
139; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
140; GFX10-NEXT:    s_setpc_b64 s[30:31]
141  %gep = getelementptr i8, i8* %p, i64 -4096
142  %load = load i8, i8* %gep, align 4
143  ret i8 %load
144}
145
; Byte load from %p - 8192.
; Both the gfx900 and gfx1010 check lines expect a 64-bit add on v[0:1]
; (0xffffe000 into the low half, -1 with carry into the high half) and a flat
; load with no offset.
146define i8 @flat_inst_valu_offset_neg_13bit_max(i8* %p) {
147; GFX9-LABEL: flat_inst_valu_offset_neg_13bit_max:
148; GFX9:       ; %bb.0:
149; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
150; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, 0xffffe000, v0
151; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, -1, v1, vcc
152; GFX9-NEXT:    flat_load_ubyte v0, v[0:1]
153; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
154; GFX9-NEXT:    s_setpc_b64 s[30:31]
155;
156; GFX10-LABEL: flat_inst_valu_offset_neg_13bit_max:
157; GFX10:       ; %bb.0:
158; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
159; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
160; GFX10-NEXT:    v_add_co_u32_e64 v0, vcc_lo, 0xffffe000, v0
161; GFX10-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo
162; GFX10-NEXT:    flat_load_ubyte v0, v[0:1]
163; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
164; GFX10-NEXT:    s_setpc_b64 s[30:31]
165  %gep = getelementptr i8, i8* %p, i64 -8192
166  %load = load i8, i8* %gep, align 4
167  ret i8 %load
168}
169
; Byte load from %p + 4095 (0xfff). This is the same constant used by
; flat_inst_valu_offset_12bit_max, so the expected instructions are identical
; to that test.
; gfx900 check lines expect the whole constant in the flat load's immediate
; field (offset:4095).
; gfx1010 check lines expect a 64-bit add of 0xfff to v[0:1] and a flat load
; with no offset.
170define i8 @flat_inst_valu_offset_2x_11bit_max(i8* %p) {
171; GFX9-LABEL: flat_inst_valu_offset_2x_11bit_max:
172; GFX9:       ; %bb.0:
173; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
174; GFX9-NEXT:    flat_load_ubyte v0, v[0:1] offset:4095
175; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
176; GFX9-NEXT:    s_setpc_b64 s[30:31]
177;
178; GFX10-LABEL: flat_inst_valu_offset_2x_11bit_max:
179; GFX10:       ; %bb.0:
180; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
181; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
182; GFX10-NEXT:    v_add_co_u32_e64 v0, vcc_lo, 0xfff, v0
183; GFX10-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
184; GFX10-NEXT:    flat_load_ubyte v0, v[0:1]
185; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
186; GFX10-NEXT:    s_setpc_b64 s[30:31]
187  %gep = getelementptr i8, i8* %p, i64 4095
188  %load = load i8, i8* %gep, align 4
189  ret i8 %load
190}
191
; Byte load from %p + 8191 (0x1fff). This is the same constant used by
; flat_inst_valu_offset_13bit_max, so the expected instructions are identical
; to that test.
; gfx900 check lines expect 0x1000 to be added to the 64-bit base in v[0:1]
; with the remaining 4095 carried in the flat load's immediate field.
; gfx1010 check lines expect the full 0x1fff to be added to v[0:1] and a flat
; load with no offset.
192define i8 @flat_inst_valu_offset_2x_12bit_max(i8* %p) {
193; GFX9-LABEL: flat_inst_valu_offset_2x_12bit_max:
194; GFX9:       ; %bb.0:
195; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
196; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, 0x1000, v0
197; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
198; GFX9-NEXT:    flat_load_ubyte v0, v[0:1] offset:4095
199; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
200; GFX9-NEXT:    s_setpc_b64 s[30:31]
201;
202; GFX10-LABEL: flat_inst_valu_offset_2x_12bit_max:
203; GFX10:       ; %bb.0:
204; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
205; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
206; GFX10-NEXT:    v_add_co_u32_e64 v0, vcc_lo, 0x1fff, v0
207; GFX10-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
208; GFX10-NEXT:    flat_load_ubyte v0, v[0:1]
209; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
210; GFX10-NEXT:    s_setpc_b64 s[30:31]
211  %gep = getelementptr i8, i8* %p, i64 8191
212  %load = load i8, i8* %gep, align 4
213  ret i8 %load
214}
215
; Byte load from %p + 16383 (0x3fff).
; gfx900 check lines expect the constant to be split: 0x3000 is added to the
; 64-bit base in v[0:1] and the remaining 4095 is carried in the flat load's
; immediate field (0x3000 + 0xfff = 0x3fff).
; gfx1010 check lines expect the full 0x3fff to be added to v[0:1] and a flat
; load with no offset.
216define i8 @flat_inst_valu_offset_2x_13bit_max(i8* %p) {
217; GFX9-LABEL: flat_inst_valu_offset_2x_13bit_max:
218; GFX9:       ; %bb.0:
219; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
220; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, 0x3000, v0
221; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
222; GFX9-NEXT:    flat_load_ubyte v0, v[0:1] offset:4095
223; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
224; GFX9-NEXT:    s_setpc_b64 s[30:31]
225;
226; GFX10-LABEL: flat_inst_valu_offset_2x_13bit_max:
227; GFX10:       ; %bb.0:
228; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
229; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
230; GFX10-NEXT:    v_add_co_u32_e64 v0, vcc_lo, 0x3fff, v0
231; GFX10-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
232; GFX10-NEXT:    flat_load_ubyte v0, v[0:1]
233; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
234; GFX10-NEXT:    s_setpc_b64 s[30:31]
235  %gep = getelementptr i8, i8* %p, i64 16383
236  %load = load i8, i8* %gep, align 4
237  ret i8 %load
238}
239
; Byte load from %p - 4096. This is the same constant used by
; flat_inst_valu_offset_neg_12bit_max, so the expected instructions are
; identical to that test.
; Both the gfx900 and gfx1010 check lines expect a 64-bit add on v[0:1]
; (0xfffff000 into the low half, -1 with carry into the high half) and a flat
; load with no offset.
240define i8 @flat_inst_valu_offset_2x_neg_11bit_max(i8* %p) {
241; GFX9-LABEL: flat_inst_valu_offset_2x_neg_11bit_max:
242; GFX9:       ; %bb.0:
243; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
244; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, 0xfffff000, v0
245; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, -1, v1, vcc
246; GFX9-NEXT:    flat_load_ubyte v0, v[0:1]
247; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
248; GFX9-NEXT:    s_setpc_b64 s[30:31]
249;
250; GFX10-LABEL: flat_inst_valu_offset_2x_neg_11bit_max:
251; GFX10:       ; %bb.0:
252; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
253; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
254; GFX10-NEXT:    v_add_co_u32_e64 v0, vcc_lo, 0xfffff000, v0
255; GFX10-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo
256; GFX10-NEXT:    flat_load_ubyte v0, v[0:1]
257; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
258; GFX10-NEXT:    s_setpc_b64 s[30:31]
259  %gep = getelementptr i8, i8* %p, i64 -4096
260  %load = load i8, i8* %gep, align 4
261  ret i8 %load
262}
263
; Byte load from %p - 8192. This is the same constant used by
; flat_inst_valu_offset_neg_13bit_max, so the expected instructions are
; identical to that test.
; Both the gfx900 and gfx1010 check lines expect a 64-bit add on v[0:1]
; (0xffffe000 into the low half, -1 with carry into the high half) and a flat
; load with no offset.
264define i8 @flat_inst_valu_offset_2x_neg_12bit_max(i8* %p) {
265; GFX9-LABEL: flat_inst_valu_offset_2x_neg_12bit_max:
266; GFX9:       ; %bb.0:
267; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
268; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, 0xffffe000, v0
269; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, -1, v1, vcc
270; GFX9-NEXT:    flat_load_ubyte v0, v[0:1]
271; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
272; GFX9-NEXT:    s_setpc_b64 s[30:31]
273;
274; GFX10-LABEL: flat_inst_valu_offset_2x_neg_12bit_max:
275; GFX10:       ; %bb.0:
276; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
277; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
278; GFX10-NEXT:    v_add_co_u32_e64 v0, vcc_lo, 0xffffe000, v0
279; GFX10-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo
280; GFX10-NEXT:    flat_load_ubyte v0, v[0:1]
281; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
282; GFX10-NEXT:    s_setpc_b64 s[30:31]
283  %gep = getelementptr i8, i8* %p, i64 -8192
284  %load = load i8, i8* %gep, align 4
285  ret i8 %load
286}
287
; Byte load from %p - 16384.
; Both the gfx900 and gfx1010 check lines expect a 64-bit add on v[0:1]
; (0xffffc000 into the low half, -1 with carry into the high half) and a flat
; load with no offset.
288define i8 @flat_inst_valu_offset_2x_neg_13bit_max(i8* %p) {
289; GFX9-LABEL: flat_inst_valu_offset_2x_neg_13bit_max:
290; GFX9:       ; %bb.0:
291; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
292; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, 0xffffc000, v0
293; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, -1, v1, vcc
294; GFX9-NEXT:    flat_load_ubyte v0, v[0:1]
295; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
296; GFX9-NEXT:    s_setpc_b64 s[30:31]
297;
298; GFX10-LABEL: flat_inst_valu_offset_2x_neg_13bit_max:
299; GFX10:       ; %bb.0:
300; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
301; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
302; GFX10-NEXT:    v_add_co_u32_e64 v0, vcc_lo, 0xffffc000, v0
303; GFX10-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo
304; GFX10-NEXT:    flat_load_ubyte v0, v[0:1]
305; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
306; GFX10-NEXT:    s_setpc_b64 s[30:31]
307  %gep = getelementptr i8, i8* %p, i64 -16384
308  %load = load i8, i8* %gep, align 4
309  ret i8 %load
310}
311
312; Fill 11-bit low-bits (1ull << 33) | 2047
; Byte load from %p + 8589936639, i.e. 2^33 + 2047.
; gfx900 check lines expect the low 2047 to be carried in the flat load's
; immediate field; the base add contributes 0 to the low half and 2 (with
; carry) to the high half of v[0:1].
; gfx1010 check lines expect the whole constant in the 64-bit add (0x7ff low,
; 2 high) and a flat load with no offset.
313define i8 @flat_inst_valu_offset_64bit_11bit_split0(i8* %p) {
314; GFX9-LABEL: flat_inst_valu_offset_64bit_11bit_split0:
315; GFX9:       ; %bb.0:
316; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
317; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, 0, v0
318; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, 2, v1, vcc
319; GFX9-NEXT:    flat_load_ubyte v0, v[0:1] offset:2047
320; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
321; GFX9-NEXT:    s_setpc_b64 s[30:31]
322;
323; GFX10-LABEL: flat_inst_valu_offset_64bit_11bit_split0:
324; GFX10:       ; %bb.0:
325; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
326; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
327; GFX10-NEXT:    v_add_co_u32_e64 v0, vcc_lo, 0x7ff, v0
328; GFX10-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 2, v1, vcc_lo
329; GFX10-NEXT:    flat_load_ubyte v0, v[0:1]
330; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
331; GFX10-NEXT:    s_setpc_b64 s[30:31]
332  %gep = getelementptr i8, i8* %p, i64 8589936639
333  %load = load i8, i8* %gep, align 4
334  ret i8 %load
335}
336
337; Fill 11-bit low-bits (1ull << 33) | 2048
; Byte load from %p + 8589936640, i.e. 2^33 + 2048.
; gfx900 check lines expect the low 2048 to be carried in the flat load's
; immediate field; the base add contributes 0 to the low half and 2 (with
; carry) to the high half of v[0:1].
; gfx1010 check lines expect the whole constant in the 64-bit add (0x800 low,
; 2 high) and a flat load with no offset.
338define i8 @flat_inst_valu_offset_64bit_11bit_split1(i8* %p) {
339; GFX9-LABEL: flat_inst_valu_offset_64bit_11bit_split1:
340; GFX9:       ; %bb.0:
341; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
342; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, 0, v0
343; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, 2, v1, vcc
344; GFX9-NEXT:    flat_load_ubyte v0, v[0:1] offset:2048
345; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
346; GFX9-NEXT:    s_setpc_b64 s[30:31]
347;
348; GFX10-LABEL: flat_inst_valu_offset_64bit_11bit_split1:
349; GFX10:       ; %bb.0:
350; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
351; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
352; GFX10-NEXT:    v_add_co_u32_e64 v0, vcc_lo, 0x800, v0
353; GFX10-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 2, v1, vcc_lo
354; GFX10-NEXT:    flat_load_ubyte v0, v[0:1]
355; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
356; GFX10-NEXT:    s_setpc_b64 s[30:31]
357  %gep = getelementptr i8, i8* %p, i64 8589936640
358  %load = load i8, i8* %gep, align 4
359  ret i8 %load
360}
361
362; Fill 12-bit low-bits (1ull << 33) | 4095
; Byte load from %p + 8589938687, i.e. 2^33 + 4095.
; gfx900 check lines expect the low 4095 to be carried in the flat load's
; immediate field; the base add contributes 0 to the low half and 2 (with
; carry) to the high half of v[0:1].
; gfx1010 check lines expect the whole constant in the 64-bit add (0xfff low,
; 2 high) and a flat load with no offset.
363define i8 @flat_inst_valu_offset_64bit_12bit_split0(i8* %p) {
364; GFX9-LABEL: flat_inst_valu_offset_64bit_12bit_split0:
365; GFX9:       ; %bb.0:
366; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
367; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, 0, v0
368; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, 2, v1, vcc
369; GFX9-NEXT:    flat_load_ubyte v0, v[0:1] offset:4095
370; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
371; GFX9-NEXT:    s_setpc_b64 s[30:31]
372;
373; GFX10-LABEL: flat_inst_valu_offset_64bit_12bit_split0:
374; GFX10:       ; %bb.0:
375; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
376; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
377; GFX10-NEXT:    v_add_co_u32_e64 v0, vcc_lo, 0xfff, v0
378; GFX10-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 2, v1, vcc_lo
379; GFX10-NEXT:    flat_load_ubyte v0, v[0:1]
380; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
381; GFX10-NEXT:    s_setpc_b64 s[30:31]
382  %gep = getelementptr i8, i8* %p, i64 8589938687
383  %load = load i8, i8* %gep, align 4
384  ret i8 %load
385}
386
387; Fill 12-bit low-bits (1ull << 33) | 4096
; Byte load from %p + 8589938688, i.e. 2^33 + 4096.
; Both the gfx900 and gfx1010 check lines expect the whole constant in the
; 64-bit add on v[0:1] (0x1000 low, 2 with carry high) and a flat load with
; no offset.
388define i8 @flat_inst_valu_offset_64bit_12bit_split1(i8* %p) {
389; GFX9-LABEL: flat_inst_valu_offset_64bit_12bit_split1:
390; GFX9:       ; %bb.0:
391; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
392; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, 0x1000, v0
393; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, 2, v1, vcc
394; GFX9-NEXT:    flat_load_ubyte v0, v[0:1]
395; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
396; GFX9-NEXT:    s_setpc_b64 s[30:31]
397;
398; GFX10-LABEL: flat_inst_valu_offset_64bit_12bit_split1:
399; GFX10:       ; %bb.0:
400; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
401; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
402; GFX10-NEXT:    v_add_co_u32_e64 v0, vcc_lo, 0x1000, v0
403; GFX10-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 2, v1, vcc_lo
404; GFX10-NEXT:    flat_load_ubyte v0, v[0:1]
405; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
406; GFX10-NEXT:    s_setpc_b64 s[30:31]
407  %gep = getelementptr i8, i8* %p, i64 8589938688
408  %load = load i8, i8* %gep, align 4
409  ret i8 %load
410}
411
412; Fill 13-bit low-bits (1ull << 33) | 8191
; Byte load from %p + 8589942783, i.e. 2^33 + 8191.
; gfx900 check lines expect the low bits to be split: 4095 is carried in the
; flat load's immediate field, and the base add contributes 0x1000 to the low
; half and 2 (with carry) to the high half of v[0:1].
; gfx1010 check lines expect the whole constant in the 64-bit add (0x1fff
; low, 2 high) and a flat load with no offset.
413define i8 @flat_inst_valu_offset_64bit_13bit_split0(i8* %p) {
414; GFX9-LABEL: flat_inst_valu_offset_64bit_13bit_split0:
415; GFX9:       ; %bb.0:
416; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
417; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, 0x1000, v0
418; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, 2, v1, vcc
419; GFX9-NEXT:    flat_load_ubyte v0, v[0:1] offset:4095
420; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
421; GFX9-NEXT:    s_setpc_b64 s[30:31]
422;
423; GFX10-LABEL: flat_inst_valu_offset_64bit_13bit_split0:
424; GFX10:       ; %bb.0:
425; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
426; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
427; GFX10-NEXT:    v_add_co_u32_e64 v0, vcc_lo, 0x1fff, v0
428; GFX10-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 2, v1, vcc_lo
429; GFX10-NEXT:    flat_load_ubyte v0, v[0:1]
430; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
431; GFX10-NEXT:    s_setpc_b64 s[30:31]
432  %gep = getelementptr i8, i8* %p, i64 8589942783
433  %load = load i8, i8* %gep, align 4
434  ret i8 %load
435}
436
437; Fill 13-bit low-bits (1ull << 33) | 8192
; Byte load from %p + 8589942784, i.e. 2^33 + 8192.
; Both the gfx900 and gfx1010 check lines expect the whole constant in the
; 64-bit add on v[0:1] (0x2000 low, 2 with carry high) and a flat load with
; no offset.
438define i8 @flat_inst_valu_offset_64bit_13bit_split1(i8* %p) {
439; GFX9-LABEL: flat_inst_valu_offset_64bit_13bit_split1:
440; GFX9:       ; %bb.0:
441; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
442; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, 0x2000, v0
443; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, 2, v1, vcc
444; GFX9-NEXT:    flat_load_ubyte v0, v[0:1]
445; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
446; GFX9-NEXT:    s_setpc_b64 s[30:31]
447;
448; GFX10-LABEL: flat_inst_valu_offset_64bit_13bit_split1:
449; GFX10:       ; %bb.0:
450; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
451; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
452; GFX10-NEXT:    v_add_co_u32_e64 v0, vcc_lo, 0x2000, v0
453; GFX10-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 2, v1, vcc_lo
454; GFX10-NEXT:    flat_load_ubyte v0, v[0:1]
455; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
456; GFX10-NEXT:    s_setpc_b64 s[30:31]
457  %gep = getelementptr i8, i8* %p, i64 8589942784
458  %load = load i8, i8* %gep, align 4
459  ret i8 %load
460}
461
462; Fill 11-bit low-bits, negative high bits (1ull << 63) | 2047
; Byte load from %p + (-9223372036854773761), i.e. -2^63 + 2047.
; Neither target's check lines use the flat load's immediate field here.
; gfx900 check lines expect 0x7ff added to the low half of v[0:1]; the
; high-half addend is first materialized in v2 with v_bfrev_b32 of 1 and then
; added with carry.
; gfx1010 check lines expect the same low add, with the literal 0x80000000
; used directly as the high-half addend.
463define i8 @flat_inst_valu_offset_64bit_11bit_neg_high_split0(i8* %p) {
464; GFX9-LABEL: flat_inst_valu_offset_64bit_11bit_neg_high_split0:
465; GFX9:       ; %bb.0:
466; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
467; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, 0x7ff, v0
468; GFX9-NEXT:    v_bfrev_b32_e32 v2, 1
469; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, v2, v1, vcc
470; GFX9-NEXT:    flat_load_ubyte v0, v[0:1]
471; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
472; GFX9-NEXT:    s_setpc_b64 s[30:31]
473;
474; GFX10-LABEL: flat_inst_valu_offset_64bit_11bit_neg_high_split0:
475; GFX10:       ; %bb.0:
476; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
477; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
478; GFX10-NEXT:    v_add_co_u32_e64 v0, vcc_lo, 0x7ff, v0
479; GFX10-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 0x80000000, v1, vcc_lo
480; GFX10-NEXT:    flat_load_ubyte v0, v[0:1]
481; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
482; GFX10-NEXT:    s_setpc_b64 s[30:31]
483  %gep = getelementptr i8, i8* %p, i64 -9223372036854773761
484  %load = load i8, i8* %gep, align 4
485  ret i8 %load
486}
487
488; Fill 11-bit low-bits, negative high bits (1ull << 63) | 2048
; Byte load from %p + (-9223372036854773760), i.e. -2^63 + 2048.
; Neither target's check lines use the flat load's immediate field here.
; gfx900 check lines expect 0x800 added to the low half of v[0:1]; the
; high-half addend is first materialized in v2 with v_bfrev_b32 of 1 and then
; added with carry.
; gfx1010 check lines expect the same low add, with the literal 0x80000000
; used directly as the high-half addend.
489define i8 @flat_inst_valu_offset_64bit_11bit_neg_high_split1(i8* %p) {
490; GFX9-LABEL: flat_inst_valu_offset_64bit_11bit_neg_high_split1:
491; GFX9:       ; %bb.0:
492; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
493; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, 0x800, v0
494; GFX9-NEXT:    v_bfrev_b32_e32 v2, 1
495; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, v2, v1, vcc
496; GFX9-NEXT:    flat_load_ubyte v0, v[0:1]
497; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
498; GFX9-NEXT:    s_setpc_b64 s[30:31]
499;
500; GFX10-LABEL: flat_inst_valu_offset_64bit_11bit_neg_high_split1:
501; GFX10:       ; %bb.0:
502; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
503; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
504; GFX10-NEXT:    v_add_co_u32_e64 v0, vcc_lo, 0x800, v0
505; GFX10-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 0x80000000, v1, vcc_lo
506; GFX10-NEXT:    flat_load_ubyte v0, v[0:1]
507; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
508; GFX10-NEXT:    s_setpc_b64 s[30:31]
509  %gep = getelementptr i8, i8* %p, i64 -9223372036854773760
510  %load = load i8, i8* %gep, align 4
511  ret i8 %load
512}
513
514; Fill 12-bit low-bits, negative high bits (1ull << 63) | 4095
; Byte load from %p + (-9223372036854771713), i.e. -2^63 + 4095.
; Neither target's check lines use the flat load's immediate field here.
; gfx900 check lines expect 0xfff added to the low half of v[0:1]; the
; high-half addend is first materialized in v2 with v_bfrev_b32 of 1 and then
; added with carry.
; gfx1010 check lines expect the same low add, with the literal 0x80000000
; used directly as the high-half addend.
515define i8 @flat_inst_valu_offset_64bit_12bit_neg_high_split0(i8* %p) {
516; GFX9-LABEL: flat_inst_valu_offset_64bit_12bit_neg_high_split0:
517; GFX9:       ; %bb.0:
518; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
519; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, 0xfff, v0
520; GFX9-NEXT:    v_bfrev_b32_e32 v2, 1
521; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, v2, v1, vcc
522; GFX9-NEXT:    flat_load_ubyte v0, v[0:1]
523; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
524; GFX9-NEXT:    s_setpc_b64 s[30:31]
525;
526; GFX10-LABEL: flat_inst_valu_offset_64bit_12bit_neg_high_split0:
527; GFX10:       ; %bb.0:
528; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
529; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
530; GFX10-NEXT:    v_add_co_u32_e64 v0, vcc_lo, 0xfff, v0
531; GFX10-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 0x80000000, v1, vcc_lo
532; GFX10-NEXT:    flat_load_ubyte v0, v[0:1]
533; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
534; GFX10-NEXT:    s_setpc_b64 s[30:31]
535  %gep = getelementptr i8, i8* %p, i64 -9223372036854771713
536  %load = load i8, i8* %gep, align 4
537  ret i8 %load
538}
539
540; Fill 12-bit low-bits, negative high bits (1ull << 63) | 4096
; Byte load from %p + (-9223372036854771712), i.e. -2^63 + 4096.
; Neither target's check lines use the flat load's immediate field here.
; gfx900 check lines expect 0x1000 added to the low half of v[0:1]; the
; high-half addend is first materialized in v2 with v_bfrev_b32 of 1 and then
; added with carry.
; gfx1010 check lines expect the same low add, with the literal 0x80000000
; used directly as the high-half addend.
541define i8 @flat_inst_valu_offset_64bit_12bit_neg_high_split1(i8* %p) {
542; GFX9-LABEL: flat_inst_valu_offset_64bit_12bit_neg_high_split1:
543; GFX9:       ; %bb.0:
544; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
545; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, 0x1000, v0
546; GFX9-NEXT:    v_bfrev_b32_e32 v2, 1
547; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, v2, v1, vcc
548; GFX9-NEXT:    flat_load_ubyte v0, v[0:1]
549; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
550; GFX9-NEXT:    s_setpc_b64 s[30:31]
551;
552; GFX10-LABEL: flat_inst_valu_offset_64bit_12bit_neg_high_split1:
553; GFX10:       ; %bb.0:
554; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
555; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
556; GFX10-NEXT:    v_add_co_u32_e64 v0, vcc_lo, 0x1000, v0
557; GFX10-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 0x80000000, v1, vcc_lo
558; GFX10-NEXT:    flat_load_ubyte v0, v[0:1]
559; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
560; GFX10-NEXT:    s_setpc_b64 s[30:31]
561  %gep = getelementptr i8, i8* %p, i64 -9223372036854771712
562  %load = load i8, i8* %gep, align 4
563  ret i8 %load
564}
565
566; Fill 13-bit low-bits, negative high bits (1ull << 63) | 8191
; Byte load from %p + (-9223372036854767617), i.e. -2^63 + 8191.
; Neither target's check lines use the flat load's immediate field here.
; gfx900 check lines expect 0x1fff added to the low half of v[0:1]; the
; high-half addend is first materialized in v2 with v_bfrev_b32 of 1 and then
; added with carry.
; gfx1010 check lines expect the same low add, with the literal 0x80000000
; used directly as the high-half addend.
567define i8 @flat_inst_valu_offset_64bit_13bit_neg_high_split0(i8* %p) {
568; GFX9-LABEL: flat_inst_valu_offset_64bit_13bit_neg_high_split0:
569; GFX9:       ; %bb.0:
570; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
571; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, 0x1fff, v0
572; GFX9-NEXT:    v_bfrev_b32_e32 v2, 1
573; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, v2, v1, vcc
574; GFX9-NEXT:    flat_load_ubyte v0, v[0:1]
575; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
576; GFX9-NEXT:    s_setpc_b64 s[30:31]
577;
578; GFX10-LABEL: flat_inst_valu_offset_64bit_13bit_neg_high_split0:
579; GFX10:       ; %bb.0:
580; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
581; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
582; GFX10-NEXT:    v_add_co_u32_e64 v0, vcc_lo, 0x1fff, v0
583; GFX10-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 0x80000000, v1, vcc_lo
584; GFX10-NEXT:    flat_load_ubyte v0, v[0:1]
585; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
586; GFX10-NEXT:    s_setpc_b64 s[30:31]
587  %gep = getelementptr i8, i8* %p, i64 -9223372036854767617
588  %load = load i8, i8* %gep, align 4
589  ret i8 %load
590}
591
592; Fill 13-bit low-bits, negative high bits (1ull << 63) | 8192
; Byte load from %p + (-9223372036854767616), i.e. -2^63 + 8192.
; Neither target's check lines use the flat load's immediate field here.
; gfx900 check lines expect 0x2000 added to the low half of v[0:1]; the
; high-half addend is first materialized in v2 with v_bfrev_b32 of 1 and then
; added with carry.
; gfx1010 check lines expect the same low add, with the literal 0x80000000
; used directly as the high-half addend.
593define i8 @flat_inst_valu_offset_64bit_13bit_neg_high_split1(i8* %p) {
594; GFX9-LABEL: flat_inst_valu_offset_64bit_13bit_neg_high_split1:
595; GFX9:       ; %bb.0:
596; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
597; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, 0x2000, v0
598; GFX9-NEXT:    v_bfrev_b32_e32 v2, 1
599; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, v2, v1, vcc
600; GFX9-NEXT:    flat_load_ubyte v0, v[0:1]
601; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
602; GFX9-NEXT:    s_setpc_b64 s[30:31]
603;
604; GFX10-LABEL: flat_inst_valu_offset_64bit_13bit_neg_high_split1:
605; GFX10:       ; %bb.0:
606; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
607; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
608; GFX10-NEXT:    v_add_co_u32_e64 v0, vcc_lo, 0x2000, v0
609; GFX10-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 0x80000000, v1, vcc_lo
610; GFX10-NEXT:    flat_load_ubyte v0, v[0:1]
611; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
612; GFX10-NEXT:    s_setpc_b64 s[30:31]
613  %gep = getelementptr i8, i8* %p, i64 -9223372036854767616
614  %load = load i8, i8* %gep, align 4
615  ret i8 %load
616}
617
; Kernel variant: the pointer is a kernel argument, fetched into s[0:1] with
; s_load_dwordx2. A volatile byte load from %p + 1 is followed by a store of
; the loaded byte through an undef pointer.
; gfx900 check lines expect s[0:1] to be copied unchanged into v[0:1] and the
; constant carried in the flat load's immediate field (offset:1).
; gfx1010 check lines expect the constant to be added with scalar
; s_add_u32 / s_addc_u32 before the copy into v[0:1], and a flat load with no
; offset.
618define amdgpu_kernel void @flat_inst_salu_offset_1(i8* %p) {
619; GFX9-LABEL: flat_inst_salu_offset_1:
620; GFX9:       ; %bb.0:
621; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
622; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
623; GFX9-NEXT:    v_mov_b32_e32 v0, s0
624; GFX9-NEXT:    v_mov_b32_e32 v1, s1
625; GFX9-NEXT:    flat_load_ubyte v0, v[0:1] offset:1
626; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
627; GFX9-NEXT:    flat_store_byte v[0:1], v0
628; GFX9-NEXT:    s_endpgm
629;
630; GFX10-LABEL: flat_inst_salu_offset_1:
631; GFX10:       ; %bb.0:
632; GFX10-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
633; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
634; GFX10-NEXT:    s_add_u32 s0, s0, 1
635; GFX10-NEXT:    s_addc_u32 s1, s1, 0
636; GFX10-NEXT:    v_mov_b32_e32 v0, s0
637; GFX10-NEXT:    v_mov_b32_e32 v1, s1
638; GFX10-NEXT:    flat_load_ubyte v0, v[0:1]
639; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
640; GFX10-NEXT:    flat_store_byte v[0:1], v0
641; GFX10-NEXT:    s_endpgm
642  %gep = getelementptr i8, i8* %p, i64 1
643  %load = load volatile i8, i8* %gep, align 1
644  store i8 %load, i8* undef
645  ret void
646}
647
; Kernel variant: the pointer is a kernel argument, fetched into s[0:1] with
; s_load_dwordx2. A volatile byte load from %p + 2047 (0x7ff) is followed by
; a store of the loaded byte through an undef pointer.
; gfx900 check lines expect s[0:1] to be copied unchanged into v[0:1] and the
; constant carried in the flat load's immediate field (offset:2047).
; gfx1010 check lines expect a scalar s_add_u32 / s_addc_u32 of 0x7ff before
; the copy into v[0:1], and a flat load with no offset.
648define amdgpu_kernel void @flat_inst_salu_offset_11bit_max(i8* %p) {
649; GFX9-LABEL: flat_inst_salu_offset_11bit_max:
650; GFX9:       ; %bb.0:
651; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
652; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
653; GFX9-NEXT:    v_mov_b32_e32 v0, s0
654; GFX9-NEXT:    v_mov_b32_e32 v1, s1
655; GFX9-NEXT:    flat_load_ubyte v0, v[0:1] offset:2047
656; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
657; GFX9-NEXT:    flat_store_byte v[0:1], v0
658; GFX9-NEXT:    s_endpgm
659;
660; GFX10-LABEL: flat_inst_salu_offset_11bit_max:
661; GFX10:       ; %bb.0:
662; GFX10-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
663; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
664; GFX10-NEXT:    s_add_u32 s0, s0, 0x7ff
665; GFX10-NEXT:    s_addc_u32 s1, s1, 0
666; GFX10-NEXT:    v_mov_b32_e32 v0, s0
667; GFX10-NEXT:    v_mov_b32_e32 v1, s1
668; GFX10-NEXT:    flat_load_ubyte v0, v[0:1]
669; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
670; GFX10-NEXT:    flat_store_byte v[0:1], v0
671; GFX10-NEXT:    s_endpgm
672  %gep = getelementptr i8, i8* %p, i64 2047
673  %load = load volatile i8, i8* %gep, align 1
674  store i8 %load, i8* undef
675  ret void
676}
677
; Kernel variant: the pointer is a kernel argument, fetched into s[0:1] with
; s_load_dwordx2. A volatile byte load from %p + 4095 (0xfff) is followed by
; a store of the loaded byte through an undef pointer.
; gfx900 check lines expect s[0:1] to be copied unchanged into v[0:1] and the
; constant carried in the flat load's immediate field (offset:4095).
; gfx1010 check lines expect a scalar s_add_u32 / s_addc_u32 of 0xfff before
; the copy into v[0:1], and a flat load with no offset.
678define amdgpu_kernel void @flat_inst_salu_offset_12bit_max(i8* %p) {
679; GFX9-LABEL: flat_inst_salu_offset_12bit_max:
680; GFX9:       ; %bb.0:
681; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
682; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
683; GFX9-NEXT:    v_mov_b32_e32 v0, s0
684; GFX9-NEXT:    v_mov_b32_e32 v1, s1
685; GFX9-NEXT:    flat_load_ubyte v0, v[0:1] offset:4095
686; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
687; GFX9-NEXT:    flat_store_byte v[0:1], v0
688; GFX9-NEXT:    s_endpgm
689;
690; GFX10-LABEL: flat_inst_salu_offset_12bit_max:
691; GFX10:       ; %bb.0:
692; GFX10-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
693; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
694; GFX10-NEXT:    s_add_u32 s0, s0, 0xfff
695; GFX10-NEXT:    s_addc_u32 s1, s1, 0
696; GFX10-NEXT:    v_mov_b32_e32 v0, s0
697; GFX10-NEXT:    v_mov_b32_e32 v1, s1
698; GFX10-NEXT:    flat_load_ubyte v0, v[0:1]
699; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
700; GFX10-NEXT:    flat_store_byte v[0:1], v0
701; GFX10-NEXT:    s_endpgm
702  %gep = getelementptr i8, i8* %p, i64 4095
703  %load = load volatile i8, i8* %gep, align 1
704  store i8 %load, i8* undef
705  ret void
706}
707
; Kernel variant: the pointer is a kernel argument, fetched into s[0:1] with
; s_load_dwordx2. A volatile byte load from %p + 8191 (0x1fff) is followed by
; a store of the loaded byte through an undef pointer.
; gfx900 check lines expect s[0:1] to be copied into v[0:1] first, then a
; vector 64-bit add of 0x1000 on v[0:1], with the remaining 4095 carried in
; the flat load's immediate field.
; gfx1010 check lines expect a scalar s_add_u32 / s_addc_u32 of the full
; 0x1fff before the copy into v[0:1], and a flat load with no offset.
708define amdgpu_kernel void @flat_inst_salu_offset_13bit_max(i8* %p) {
709; GFX9-LABEL: flat_inst_salu_offset_13bit_max:
710; GFX9:       ; %bb.0:
711; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
712; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
713; GFX9-NEXT:    v_mov_b32_e32 v0, s0
714; GFX9-NEXT:    v_mov_b32_e32 v1, s1
715; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, 0x1000, v0
716; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
717; GFX9-NEXT:    flat_load_ubyte v0, v[0:1] offset:4095
718; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
719; GFX9-NEXT:    flat_store_byte v[0:1], v0
720; GFX9-NEXT:    s_endpgm
721;
722; GFX10-LABEL: flat_inst_salu_offset_13bit_max:
723; GFX10:       ; %bb.0:
724; GFX10-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
725; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
726; GFX10-NEXT:    s_add_u32 s0, s0, 0x1fff
727; GFX10-NEXT:    s_addc_u32 s1, s1, 0
728; GFX10-NEXT:    v_mov_b32_e32 v0, s0
729; GFX10-NEXT:    v_mov_b32_e32 v1, s1
730; GFX10-NEXT:    flat_load_ubyte v0, v[0:1]
731; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
732; GFX10-NEXT:    flat_store_byte v[0:1], v0
733; GFX10-NEXT:    s_endpgm
734  %gep = getelementptr i8, i8* %p, i64 8191
735  %load = load volatile i8, i8* %gep, align 1
736  store i8 %load, i8* undef
737  ret void
738}
739
; Kernel variant: the pointer is a kernel argument, fetched into s[0:1] with
; s_load_dwordx2. A volatile byte load from %p - 2048 is followed by a store
; of the loaded byte through an undef pointer.
; Neither target's check lines use the flat load's immediate field here.
; gfx900 check lines expect s[0:1] to be copied into v[0:1] first, then a
; vector 64-bit add (0xfffff800 low, -1 with carry high).
; gfx1010 check lines expect the same constant to be added with scalar
; s_add_u32 / s_addc_u32 before the copy into v[0:1].
740define amdgpu_kernel void @flat_inst_salu_offset_neg_11bit_max(i8* %p) {
741; GFX9-LABEL: flat_inst_salu_offset_neg_11bit_max:
742; GFX9:       ; %bb.0:
743; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
744; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
745; GFX9-NEXT:    v_mov_b32_e32 v0, s0
746; GFX9-NEXT:    v_mov_b32_e32 v1, s1
747; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, 0xfffff800, v0
748; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, -1, v1, vcc
749; GFX9-NEXT:    flat_load_ubyte v0, v[0:1]
750; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
751; GFX9-NEXT:    flat_store_byte v[0:1], v0
752; GFX9-NEXT:    s_endpgm
753;
754; GFX10-LABEL: flat_inst_salu_offset_neg_11bit_max:
755; GFX10:       ; %bb.0:
756; GFX10-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
757; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
758; GFX10-NEXT:    s_add_u32 s0, s0, 0xfffff800
759; GFX10-NEXT:    s_addc_u32 s1, s1, -1
760; GFX10-NEXT:    v_mov_b32_e32 v0, s0
761; GFX10-NEXT:    v_mov_b32_e32 v1, s1
762; GFX10-NEXT:    flat_load_ubyte v0, v[0:1]
763; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
764; GFX10-NEXT:    flat_store_byte v[0:1], v0
765; GFX10-NEXT:    s_endpgm
766  %gep = getelementptr i8, i8* %p, i64 -2048
767  %load = load volatile i8, i8* %gep, align 1
768  store i8 %load, i8* undef
769  ret void
770}
771
; Kernel doing a volatile byte load at %p - 4096 (low word 0xfffff000, high
; word -1) and storing the result through an undef pointer.
; Both expected outputs add the full negative offset to the base and use no
; instruction offset: GFX9 with v_add_co_u32/v_addc_co_u32, GFX10 with
; s_add_u32/s_addc_u32.
772define amdgpu_kernel void @flat_inst_salu_offset_neg_12bit_max(i8* %p) {
773; GFX9-LABEL: flat_inst_salu_offset_neg_12bit_max:
774; GFX9:       ; %bb.0:
775; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
776; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
777; GFX9-NEXT:    v_mov_b32_e32 v0, s0
778; GFX9-NEXT:    v_mov_b32_e32 v1, s1
779; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, 0xfffff000, v0
780; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, -1, v1, vcc
781; GFX9-NEXT:    flat_load_ubyte v0, v[0:1]
782; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
783; GFX9-NEXT:    flat_store_byte v[0:1], v0
784; GFX9-NEXT:    s_endpgm
785;
786; GFX10-LABEL: flat_inst_salu_offset_neg_12bit_max:
787; GFX10:       ; %bb.0:
788; GFX10-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
789; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
790; GFX10-NEXT:    s_add_u32 s0, s0, 0xfffff000
791; GFX10-NEXT:    s_addc_u32 s1, s1, -1
792; GFX10-NEXT:    v_mov_b32_e32 v0, s0
793; GFX10-NEXT:    v_mov_b32_e32 v1, s1
794; GFX10-NEXT:    flat_load_ubyte v0, v[0:1]
795; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
796; GFX10-NEXT:    flat_store_byte v[0:1], v0
797; GFX10-NEXT:    s_endpgm
798  %gep = getelementptr i8, i8* %p, i64 -4096
799  %load = load volatile i8, i8* %gep, align 1
800  store i8 %load, i8* undef
801  ret void
802}
803
; Kernel doing a volatile byte load at %p - 8192 (low word 0xffffe000, high
; word -1) and storing the result through an undef pointer.
; Both expected outputs add the full negative offset to the base and use no
; instruction offset: GFX9 with v_add_co_u32/v_addc_co_u32, GFX10 with
; s_add_u32/s_addc_u32.
804define amdgpu_kernel void @flat_inst_salu_offset_neg_13bit_max(i8* %p) {
805; GFX9-LABEL: flat_inst_salu_offset_neg_13bit_max:
806; GFX9:       ; %bb.0:
807; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
808; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
809; GFX9-NEXT:    v_mov_b32_e32 v0, s0
810; GFX9-NEXT:    v_mov_b32_e32 v1, s1
811; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, 0xffffe000, v0
812; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, -1, v1, vcc
813; GFX9-NEXT:    flat_load_ubyte v0, v[0:1]
814; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
815; GFX9-NEXT:    flat_store_byte v[0:1], v0
816; GFX9-NEXT:    s_endpgm
817;
818; GFX10-LABEL: flat_inst_salu_offset_neg_13bit_max:
819; GFX10:       ; %bb.0:
820; GFX10-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
821; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
822; GFX10-NEXT:    s_add_u32 s0, s0, 0xffffe000
823; GFX10-NEXT:    s_addc_u32 s1, s1, -1
824; GFX10-NEXT:    v_mov_b32_e32 v0, s0
825; GFX10-NEXT:    v_mov_b32_e32 v1, s1
826; GFX10-NEXT:    flat_load_ubyte v0, v[0:1]
827; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
828; GFX10-NEXT:    flat_store_byte v[0:1], v0
829; GFX10-NEXT:    s_endpgm
830  %gep = getelementptr i8, i8* %p, i64 -8192
831  %load = load volatile i8, i8* %gep, align 1
832  store i8 %load, i8* undef
833  ret void
834}
835
; Kernel doing a volatile byte load at %p + 4095 (0xfff) and storing the result
; through an undef pointer.
; Expected GFX9 output needs no address add: the whole 4095 goes in the
; instruction offset. Expected GFX10 output adds 0xfff to the base with
; s_add_u32/s_addc_u32 and uses no instruction offset.
836define amdgpu_kernel void @flat_inst_salu_offset_2x_11bit_max(i8* %p) {
837; GFX9-LABEL: flat_inst_salu_offset_2x_11bit_max:
838; GFX9:       ; %bb.0:
839; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
840; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
841; GFX9-NEXT:    v_mov_b32_e32 v0, s0
842; GFX9-NEXT:    v_mov_b32_e32 v1, s1
843; GFX9-NEXT:    flat_load_ubyte v0, v[0:1] offset:4095
844; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
845; GFX9-NEXT:    flat_store_byte v[0:1], v0
846; GFX9-NEXT:    s_endpgm
847;
848; GFX10-LABEL: flat_inst_salu_offset_2x_11bit_max:
849; GFX10:       ; %bb.0:
850; GFX10-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
851; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
852; GFX10-NEXT:    s_add_u32 s0, s0, 0xfff
853; GFX10-NEXT:    s_addc_u32 s1, s1, 0
854; GFX10-NEXT:    v_mov_b32_e32 v0, s0
855; GFX10-NEXT:    v_mov_b32_e32 v1, s1
856; GFX10-NEXT:    flat_load_ubyte v0, v[0:1]
857; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
858; GFX10-NEXT:    flat_store_byte v[0:1], v0
859; GFX10-NEXT:    s_endpgm
860  %gep = getelementptr i8, i8* %p, i64 4095
861  %load = load volatile i8, i8* %gep, align 1
862  store i8 %load, i8* undef
863  ret void
864}
865
; Kernel doing a volatile byte load at %p + 8191 (0x1fff) and storing the result
; through an undef pointer.
; Expected GFX9 output adds 0x1000 to the base and keeps 4095 in the instruction
; offset; expected GFX10 output adds the whole 0x1fff with s_add_u32/s_addc_u32
; and uses no instruction offset.
866define amdgpu_kernel void @flat_inst_salu_offset_2x_12bit_max(i8* %p) {
867; GFX9-LABEL: flat_inst_salu_offset_2x_12bit_max:
868; GFX9:       ; %bb.0:
869; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
870; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
871; GFX9-NEXT:    v_mov_b32_e32 v0, s0
872; GFX9-NEXT:    v_mov_b32_e32 v1, s1
873; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, 0x1000, v0
874; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
875; GFX9-NEXT:    flat_load_ubyte v0, v[0:1] offset:4095
876; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
877; GFX9-NEXT:    flat_store_byte v[0:1], v0
878; GFX9-NEXT:    s_endpgm
879;
880; GFX10-LABEL: flat_inst_salu_offset_2x_12bit_max:
881; GFX10:       ; %bb.0:
882; GFX10-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
883; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
884; GFX10-NEXT:    s_add_u32 s0, s0, 0x1fff
885; GFX10-NEXT:    s_addc_u32 s1, s1, 0
886; GFX10-NEXT:    v_mov_b32_e32 v0, s0
887; GFX10-NEXT:    v_mov_b32_e32 v1, s1
888; GFX10-NEXT:    flat_load_ubyte v0, v[0:1]
889; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
890; GFX10-NEXT:    flat_store_byte v[0:1], v0
891; GFX10-NEXT:    s_endpgm
892  %gep = getelementptr i8, i8* %p, i64 8191
893  %load = load volatile i8, i8* %gep, align 1
894  store i8 %load, i8* undef
895  ret void
896}
897
; Kernel doing a volatile byte load at %p + 16383 (0x3fff) and storing the
; result through an undef pointer.
; Expected GFX9 output adds 0x3000 to the base and keeps 4095 in the instruction
; offset; expected GFX10 output adds the whole 0x3fff with s_add_u32/s_addc_u32
; and uses no instruction offset.
898define amdgpu_kernel void @flat_inst_salu_offset_2x_13bit_max(i8* %p) {
899; GFX9-LABEL: flat_inst_salu_offset_2x_13bit_max:
900; GFX9:       ; %bb.0:
901; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
902; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
903; GFX9-NEXT:    v_mov_b32_e32 v0, s0
904; GFX9-NEXT:    v_mov_b32_e32 v1, s1
905; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, 0x3000, v0
906; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
907; GFX9-NEXT:    flat_load_ubyte v0, v[0:1] offset:4095
908; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
909; GFX9-NEXT:    flat_store_byte v[0:1], v0
910; GFX9-NEXT:    s_endpgm
911;
912; GFX10-LABEL: flat_inst_salu_offset_2x_13bit_max:
913; GFX10:       ; %bb.0:
914; GFX10-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
915; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
916; GFX10-NEXT:    s_add_u32 s0, s0, 0x3fff
917; GFX10-NEXT:    s_addc_u32 s1, s1, 0
918; GFX10-NEXT:    v_mov_b32_e32 v0, s0
919; GFX10-NEXT:    v_mov_b32_e32 v1, s1
920; GFX10-NEXT:    flat_load_ubyte v0, v[0:1]
921; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
922; GFX10-NEXT:    flat_store_byte v[0:1], v0
923; GFX10-NEXT:    s_endpgm
924  %gep = getelementptr i8, i8* %p, i64 16383
925  %load = load volatile i8, i8* %gep, align 1
926  store i8 %load, i8* undef
927  ret void
928}
929
; Kernel doing a volatile byte load at %p - 4096 (low word 0xfffff000, high
; word -1) and storing the result through an undef pointer.
; Both expected outputs add the full negative offset to the base and use no
; instruction offset: GFX9 with v_add_co_u32/v_addc_co_u32, GFX10 with
; s_add_u32/s_addc_u32.
930define amdgpu_kernel void @flat_inst_salu_offset_2x_neg_11bit_max(i8* %p) {
931; GFX9-LABEL: flat_inst_salu_offset_2x_neg_11bit_max:
932; GFX9:       ; %bb.0:
933; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
934; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
935; GFX9-NEXT:    v_mov_b32_e32 v0, s0
936; GFX9-NEXT:    v_mov_b32_e32 v1, s1
937; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, 0xfffff000, v0
938; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, -1, v1, vcc
939; GFX9-NEXT:    flat_load_ubyte v0, v[0:1]
940; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
941; GFX9-NEXT:    flat_store_byte v[0:1], v0
942; GFX9-NEXT:    s_endpgm
943;
944; GFX10-LABEL: flat_inst_salu_offset_2x_neg_11bit_max:
945; GFX10:       ; %bb.0:
946; GFX10-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
947; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
948; GFX10-NEXT:    s_add_u32 s0, s0, 0xfffff000
949; GFX10-NEXT:    s_addc_u32 s1, s1, -1
950; GFX10-NEXT:    v_mov_b32_e32 v0, s0
951; GFX10-NEXT:    v_mov_b32_e32 v1, s1
952; GFX10-NEXT:    flat_load_ubyte v0, v[0:1]
953; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
954; GFX10-NEXT:    flat_store_byte v[0:1], v0
955; GFX10-NEXT:    s_endpgm
956  %gep = getelementptr i8, i8* %p, i64 -4096
957  %load = load volatile i8, i8* %gep, align 1
958  store i8 %load, i8* undef
959  ret void
960}
961
; Kernel doing a volatile byte load at %p - 8192 (low word 0xffffe000, high
; word -1) and storing the result through an undef pointer.
; Both expected outputs add the full negative offset to the base and use no
; instruction offset: GFX9 with v_add_co_u32/v_addc_co_u32, GFX10 with
; s_add_u32/s_addc_u32.
962define amdgpu_kernel void @flat_inst_salu_offset_2x_neg_12bit_max(i8* %p) {
963; GFX9-LABEL: flat_inst_salu_offset_2x_neg_12bit_max:
964; GFX9:       ; %bb.0:
965; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
966; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
967; GFX9-NEXT:    v_mov_b32_e32 v0, s0
968; GFX9-NEXT:    v_mov_b32_e32 v1, s1
969; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, 0xffffe000, v0
970; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, -1, v1, vcc
971; GFX9-NEXT:    flat_load_ubyte v0, v[0:1]
972; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
973; GFX9-NEXT:    flat_store_byte v[0:1], v0
974; GFX9-NEXT:    s_endpgm
975;
976; GFX10-LABEL: flat_inst_salu_offset_2x_neg_12bit_max:
977; GFX10:       ; %bb.0:
978; GFX10-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
979; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
980; GFX10-NEXT:    s_add_u32 s0, s0, 0xffffe000
981; GFX10-NEXT:    s_addc_u32 s1, s1, -1
982; GFX10-NEXT:    v_mov_b32_e32 v0, s0
983; GFX10-NEXT:    v_mov_b32_e32 v1, s1
984; GFX10-NEXT:    flat_load_ubyte v0, v[0:1]
985; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
986; GFX10-NEXT:    flat_store_byte v[0:1], v0
987; GFX10-NEXT:    s_endpgm
988  %gep = getelementptr i8, i8* %p, i64 -8192
989  %load = load volatile i8, i8* %gep, align 1
990  store i8 %load, i8* undef
991  ret void
992}
993
; Kernel doing a volatile byte load at %p - 16384 (low word 0xffffc000, high
; word -1) and storing the result through an undef pointer.
; Both expected outputs add the full negative offset to the base and use no
; instruction offset: GFX9 with v_add_co_u32/v_addc_co_u32, GFX10 with
; s_add_u32/s_addc_u32.
994define amdgpu_kernel void @flat_inst_salu_offset_2x_neg_13bit_max(i8* %p) {
995; GFX9-LABEL: flat_inst_salu_offset_2x_neg_13bit_max:
996; GFX9:       ; %bb.0:
997; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
998; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
999; GFX9-NEXT:    v_mov_b32_e32 v0, s0
1000; GFX9-NEXT:    v_mov_b32_e32 v1, s1
1001; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, 0xffffc000, v0
1002; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, -1, v1, vcc
1003; GFX9-NEXT:    flat_load_ubyte v0, v[0:1]
1004; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1005; GFX9-NEXT:    flat_store_byte v[0:1], v0
1006; GFX9-NEXT:    s_endpgm
1007;
1008; GFX10-LABEL: flat_inst_salu_offset_2x_neg_13bit_max:
1009; GFX10:       ; %bb.0:
1010; GFX10-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
1011; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
1012; GFX10-NEXT:    s_add_u32 s0, s0, 0xffffc000
1013; GFX10-NEXT:    s_addc_u32 s1, s1, -1
1014; GFX10-NEXT:    v_mov_b32_e32 v0, s0
1015; GFX10-NEXT:    v_mov_b32_e32 v1, s1
1016; GFX10-NEXT:    flat_load_ubyte v0, v[0:1]
1017; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1018; GFX10-NEXT:    flat_store_byte v[0:1], v0
1019; GFX10-NEXT:    s_endpgm
1020  %gep = getelementptr i8, i8* %p, i64 -16384
1021  %load = load volatile i8, i8* %gep, align 1
1022  store i8 %load, i8* undef
1023  ret void
1024}
1025
1026; Fill 11-bit low-bits (1ull << 33) | 2047
; The GEP constant 8589936639 is 2^33 + 2047, so the high 32-bit word of the
; offset is 2 and the low word is 0x7ff.
; Expected GFX9 output adds 0 to the low word and 2 to the high word, keeping
; 2047 in the instruction offset; expected GFX10 output adds 0x7ff / 2 with
; s_add_u32/s_addc_u32 and uses no instruction offset.
1027define amdgpu_kernel void @flat_inst_salu_offset_64bit_11bit_split0(i8* %p) {
1028; GFX9-LABEL: flat_inst_salu_offset_64bit_11bit_split0:
1029; GFX9:       ; %bb.0:
1030; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
1031; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
1032; GFX9-NEXT:    v_mov_b32_e32 v1, s1
1033; GFX9-NEXT:    v_add_co_u32_e64 v0, vcc, 0, s0
1034; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, 2, v1, vcc
1035; GFX9-NEXT:    flat_load_ubyte v0, v[0:1] offset:2047
1036; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1037; GFX9-NEXT:    flat_store_byte v[0:1], v0
1038; GFX9-NEXT:    s_endpgm
1039;
1040; GFX10-LABEL: flat_inst_salu_offset_64bit_11bit_split0:
1041; GFX10:       ; %bb.0:
1042; GFX10-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
1043; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
1044; GFX10-NEXT:    s_add_u32 s0, s0, 0x7ff
1045; GFX10-NEXT:    s_addc_u32 s1, s1, 2
1046; GFX10-NEXT:    v_mov_b32_e32 v0, s0
1047; GFX10-NEXT:    v_mov_b32_e32 v1, s1
1048; GFX10-NEXT:    flat_load_ubyte v0, v[0:1]
1049; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1050; GFX10-NEXT:    flat_store_byte v[0:1], v0
1051; GFX10-NEXT:    s_endpgm
1052  %gep = getelementptr i8, i8* %p, i64 8589936639
1053  %load = load volatile i8, i8* %gep, align 1
1054  store i8 %load, i8* undef
1055  ret void
1056}
1057
1058; Fill 11-bit low-bits (1ull << 33) | 2048
; The GEP constant 8589936640 is 2^33 + 2048, so the high 32-bit word of the
; offset is 2 and the low word is 0x800.
; Expected GFX9 output adds 0 to the low word and 2 to the high word, keeping
; 2048 in the instruction offset; expected GFX10 output adds 0x800 / 2 with
; s_add_u32/s_addc_u32 and uses no instruction offset.
1059define amdgpu_kernel void @flat_inst_salu_offset_64bit_11bit_split1(i8* %p) {
1060; GFX9-LABEL: flat_inst_salu_offset_64bit_11bit_split1:
1061; GFX9:       ; %bb.0:
1062; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
1063; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
1064; GFX9-NEXT:    v_mov_b32_e32 v1, s1
1065; GFX9-NEXT:    v_add_co_u32_e64 v0, vcc, 0, s0
1066; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, 2, v1, vcc
1067; GFX9-NEXT:    flat_load_ubyte v0, v[0:1] offset:2048
1068; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1069; GFX9-NEXT:    flat_store_byte v[0:1], v0
1070; GFX9-NEXT:    s_endpgm
1071;
1072; GFX10-LABEL: flat_inst_salu_offset_64bit_11bit_split1:
1073; GFX10:       ; %bb.0:
1074; GFX10-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
1075; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
1076; GFX10-NEXT:    s_add_u32 s0, s0, 0x800
1077; GFX10-NEXT:    s_addc_u32 s1, s1, 2
1078; GFX10-NEXT:    v_mov_b32_e32 v0, s0
1079; GFX10-NEXT:    v_mov_b32_e32 v1, s1
1080; GFX10-NEXT:    flat_load_ubyte v0, v[0:1]
1081; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1082; GFX10-NEXT:    flat_store_byte v[0:1], v0
1083; GFX10-NEXT:    s_endpgm
1084  %gep = getelementptr i8, i8* %p, i64 8589936640
1085  %load = load volatile i8, i8* %gep, align 1
1086  store i8 %load, i8* undef
1087  ret void
1088}
1089
1090; Fill 12-bit low-bits (1ull << 33) | 4095
; The GEP constant 8589938687 is 2^33 + 4095, so the high 32-bit word of the
; offset is 2 and the low word is 0xfff.
; Expected GFX9 output adds 0 to the low word and 2 to the high word, keeping
; 4095 in the instruction offset; expected GFX10 output adds 0xfff / 2 with
; s_add_u32/s_addc_u32 and uses no instruction offset.
1091define amdgpu_kernel void @flat_inst_salu_offset_64bit_12bit_split0(i8* %p) {
1092; GFX9-LABEL: flat_inst_salu_offset_64bit_12bit_split0:
1093; GFX9:       ; %bb.0:
1094; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
1095; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
1096; GFX9-NEXT:    v_mov_b32_e32 v1, s1
1097; GFX9-NEXT:    v_add_co_u32_e64 v0, vcc, 0, s0
1098; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, 2, v1, vcc
1099; GFX9-NEXT:    flat_load_ubyte v0, v[0:1] offset:4095
1100; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1101; GFX9-NEXT:    flat_store_byte v[0:1], v0
1102; GFX9-NEXT:    s_endpgm
1103;
1104; GFX10-LABEL: flat_inst_salu_offset_64bit_12bit_split0:
1105; GFX10:       ; %bb.0:
1106; GFX10-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
1107; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
1108; GFX10-NEXT:    s_add_u32 s0, s0, 0xfff
1109; GFX10-NEXT:    s_addc_u32 s1, s1, 2
1110; GFX10-NEXT:    v_mov_b32_e32 v0, s0
1111; GFX10-NEXT:    v_mov_b32_e32 v1, s1
1112; GFX10-NEXT:    flat_load_ubyte v0, v[0:1]
1113; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1114; GFX10-NEXT:    flat_store_byte v[0:1], v0
1115; GFX10-NEXT:    s_endpgm
1116  %gep = getelementptr i8, i8* %p, i64 8589938687
1117  %load = load volatile i8, i8* %gep, align 1
1118  store i8 %load, i8* undef
1119  ret void
1120}
1121
1122; Fill 12-bit low-bits (1ull << 33) | 4096
; The GEP constant 8589938688 is 2^33 + 4096, so the high 32-bit word of the
; offset is 2 and the low word is 0x1000.
; Neither expected output uses an instruction offset: GFX9 adds 0x1000 / 2 with
; v_add_co_u32/v_addc_co_u32, GFX10 adds 0x1000 / 2 with s_add_u32/s_addc_u32.
1123define amdgpu_kernel void @flat_inst_salu_offset_64bit_12bit_split1(i8* %p) {
1124; GFX9-LABEL: flat_inst_salu_offset_64bit_12bit_split1:
1125; GFX9:       ; %bb.0:
1126; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
1127; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
1128; GFX9-NEXT:    v_mov_b32_e32 v0, s0
1129; GFX9-NEXT:    v_mov_b32_e32 v1, s1
1130; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, 0x1000, v0
1131; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, 2, v1, vcc
1132; GFX9-NEXT:    flat_load_ubyte v0, v[0:1]
1133; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1134; GFX9-NEXT:    flat_store_byte v[0:1], v0
1135; GFX9-NEXT:    s_endpgm
1136;
1137; GFX10-LABEL: flat_inst_salu_offset_64bit_12bit_split1:
1138; GFX10:       ; %bb.0:
1139; GFX10-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
1140; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
1141; GFX10-NEXT:    s_add_u32 s0, s0, 0x1000
1142; GFX10-NEXT:    s_addc_u32 s1, s1, 2
1143; GFX10-NEXT:    v_mov_b32_e32 v0, s0
1144; GFX10-NEXT:    v_mov_b32_e32 v1, s1
1145; GFX10-NEXT:    flat_load_ubyte v0, v[0:1]
1146; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1147; GFX10-NEXT:    flat_store_byte v[0:1], v0
1148; GFX10-NEXT:    s_endpgm
1149  %gep = getelementptr i8, i8* %p, i64 8589938688
1150  %load = load volatile i8, i8* %gep, align 1
1151  store i8 %load, i8* undef
1152  ret void
1153}
1154
1155; Fill 13-bit low-bits (1ull << 33) | 8191
; The GEP constant 8589942783 is 2^33 + 8191, so the high 32-bit word of the
; offset is 2 and the low word is 0x1fff.
; Expected GFX9 output adds 0x1000 to the low word and 2 to the high word,
; keeping 4095 in the instruction offset; expected GFX10 output adds
; 0x1fff / 2 with s_add_u32/s_addc_u32 and uses no instruction offset.
1156define amdgpu_kernel void @flat_inst_salu_offset_64bit_13bit_split0(i8* %p) {
1157; GFX9-LABEL: flat_inst_salu_offset_64bit_13bit_split0:
1158; GFX9:       ; %bb.0:
1159; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
1160; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
1161; GFX9-NEXT:    v_mov_b32_e32 v0, s0
1162; GFX9-NEXT:    v_mov_b32_e32 v1, s1
1163; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, 0x1000, v0
1164; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, 2, v1, vcc
1165; GFX9-NEXT:    flat_load_ubyte v0, v[0:1] offset:4095
1166; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1167; GFX9-NEXT:    flat_store_byte v[0:1], v0
1168; GFX9-NEXT:    s_endpgm
1169;
1170; GFX10-LABEL: flat_inst_salu_offset_64bit_13bit_split0:
1171; GFX10:       ; %bb.0:
1172; GFX10-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
1173; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
1174; GFX10-NEXT:    s_add_u32 s0, s0, 0x1fff
1175; GFX10-NEXT:    s_addc_u32 s1, s1, 2
1176; GFX10-NEXT:    v_mov_b32_e32 v0, s0
1177; GFX10-NEXT:    v_mov_b32_e32 v1, s1
1178; GFX10-NEXT:    flat_load_ubyte v0, v[0:1]
1179; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1180; GFX10-NEXT:    flat_store_byte v[0:1], v0
1181; GFX10-NEXT:    s_endpgm
1182  %gep = getelementptr i8, i8* %p, i64 8589942783
1183  %load = load volatile i8, i8* %gep, align 1
1184  store i8 %load, i8* undef
1185  ret void
1186}
1187
1188; Fill 13-bit low-bits (1ull << 33) | 8192
; The GEP constant 8589942784 is 2^33 + 8192, so the high 32-bit word of the
; offset is 2 and the low word is 0x2000.
; Neither expected output uses an instruction offset: GFX9 adds 0x2000 / 2 with
; v_add_co_u32/v_addc_co_u32, GFX10 adds 0x2000 / 2 with s_add_u32/s_addc_u32.
1189define amdgpu_kernel void @flat_inst_salu_offset_64bit_13bit_split1(i8* %p) {
1190; GFX9-LABEL: flat_inst_salu_offset_64bit_13bit_split1:
1191; GFX9:       ; %bb.0:
1192; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
1193; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
1194; GFX9-NEXT:    v_mov_b32_e32 v0, s0
1195; GFX9-NEXT:    v_mov_b32_e32 v1, s1
1196; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, 0x2000, v0
1197; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, 2, v1, vcc
1198; GFX9-NEXT:    flat_load_ubyte v0, v[0:1]
1199; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1200; GFX9-NEXT:    flat_store_byte v[0:1], v0
1201; GFX9-NEXT:    s_endpgm
1202;
1203; GFX10-LABEL: flat_inst_salu_offset_64bit_13bit_split1:
1204; GFX10:       ; %bb.0:
1205; GFX10-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
1206; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
1207; GFX10-NEXT:    s_add_u32 s0, s0, 0x2000
1208; GFX10-NEXT:    s_addc_u32 s1, s1, 2
1209; GFX10-NEXT:    v_mov_b32_e32 v0, s0
1210; GFX10-NEXT:    v_mov_b32_e32 v1, s1
1211; GFX10-NEXT:    flat_load_ubyte v0, v[0:1]
1212; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1213; GFX10-NEXT:    flat_store_byte v[0:1], v0
1214; GFX10-NEXT:    s_endpgm
1215  %gep = getelementptr i8, i8* %p, i64 8589942784
1216  %load = load volatile i8, i8* %gep, align 1
1217  store i8 %load, i8* undef
1218  ret void
1219}
1220
1221; Fill 11-bit low-bits, negative high bits (1ull << 63) | 2047
; The GEP constant -9223372036854773761 is that bit pattern read as a signed
; i64: high 32-bit word 0x80000000, low word 0x7ff.
; Neither expected output uses an instruction offset. GFX9 adds 0x7ff to the
; low word and adds the high word from a register set up by v_bfrev_b32 of 1;
; GFX10 adds 0x7ff / 0x80000000 with s_add_u32/s_addc_u32.
1222define amdgpu_kernel void @flat_inst_salu_offset_64bit_11bit_neg_high_split0(i8* %p) {
1223; GFX9-LABEL: flat_inst_salu_offset_64bit_11bit_neg_high_split0:
1224; GFX9:       ; %bb.0:
1225; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
1226; GFX9-NEXT:    v_bfrev_b32_e32 v1, 1
1227; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
1228; GFX9-NEXT:    v_mov_b32_e32 v0, s0
1229; GFX9-NEXT:    v_mov_b32_e32 v2, s1
1230; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, 0x7ff, v0
1231; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, v1, v2, vcc
1232; GFX9-NEXT:    flat_load_ubyte v0, v[0:1]
1233; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1234; GFX9-NEXT:    flat_store_byte v[0:1], v0
1235; GFX9-NEXT:    s_endpgm
1236;
1237; GFX10-LABEL: flat_inst_salu_offset_64bit_11bit_neg_high_split0:
1238; GFX10:       ; %bb.0:
1239; GFX10-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
1240; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
1241; GFX10-NEXT:    s_add_u32 s0, s0, 0x7ff
1242; GFX10-NEXT:    s_addc_u32 s1, s1, 0x80000000
1243; GFX10-NEXT:    v_mov_b32_e32 v0, s0
1244; GFX10-NEXT:    v_mov_b32_e32 v1, s1
1245; GFX10-NEXT:    flat_load_ubyte v0, v[0:1]
1246; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1247; GFX10-NEXT:    flat_store_byte v[0:1], v0
1248; GFX10-NEXT:    s_endpgm
1249  %gep = getelementptr i8, i8* %p, i64 -9223372036854773761
1250  %load = load volatile i8, i8* %gep, align 1
1251  store i8 %load, i8* undef
1252  ret void
1253}
1254
1255; Fill 11-bit low-bits, negative high bits (1ull << 63) | 2048
; The GEP constant -9223372036854773760 is that bit pattern read as a signed
; i64: high 32-bit word 0x80000000, low word 0x800.
; Neither expected output uses an instruction offset. GFX9 adds 0x800 to the
; low word and adds the high word from a register set up by v_bfrev_b32 of 1;
; GFX10 adds 0x800 / 0x80000000 with s_add_u32/s_addc_u32.
1256define amdgpu_kernel void @flat_inst_salu_offset_64bit_11bit_neg_high_split1(i8* %p) {
1257; GFX9-LABEL: flat_inst_salu_offset_64bit_11bit_neg_high_split1:
1258; GFX9:       ; %bb.0:
1259; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
1260; GFX9-NEXT:    v_bfrev_b32_e32 v1, 1
1261; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
1262; GFX9-NEXT:    v_mov_b32_e32 v0, s0
1263; GFX9-NEXT:    v_mov_b32_e32 v2, s1
1264; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, 0x800, v0
1265; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, v1, v2, vcc
1266; GFX9-NEXT:    flat_load_ubyte v0, v[0:1]
1267; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1268; GFX9-NEXT:    flat_store_byte v[0:1], v0
1269; GFX9-NEXT:    s_endpgm
1270;
1271; GFX10-LABEL: flat_inst_salu_offset_64bit_11bit_neg_high_split1:
1272; GFX10:       ; %bb.0:
1273; GFX10-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
1274; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
1275; GFX10-NEXT:    s_add_u32 s0, s0, 0x800
1276; GFX10-NEXT:    s_addc_u32 s1, s1, 0x80000000
1277; GFX10-NEXT:    v_mov_b32_e32 v0, s0
1278; GFX10-NEXT:    v_mov_b32_e32 v1, s1
1279; GFX10-NEXT:    flat_load_ubyte v0, v[0:1]
1280; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1281; GFX10-NEXT:    flat_store_byte v[0:1], v0
1282; GFX10-NEXT:    s_endpgm
1283  %gep = getelementptr i8, i8* %p, i64 -9223372036854773760
1284  %load = load volatile i8, i8* %gep, align 1
1285  store i8 %load, i8* undef
1286  ret void
1287}
1288
1289; Fill 12-bit low-bits, negative high bits (1ull << 63) | 4095
; The GEP constant -9223372036854771713 is that bit pattern read as a signed
; i64: high 32-bit word 0x80000000, low word 0xfff.
; Neither expected output uses an instruction offset. GFX9 adds 0xfff to the
; low word and adds the high word from a register set up by v_bfrev_b32 of 1;
; GFX10 adds 0xfff / 0x80000000 with s_add_u32/s_addc_u32.
1290define amdgpu_kernel void @flat_inst_salu_offset_64bit_12bit_neg_high_split0(i8* %p) {
1291; GFX9-LABEL: flat_inst_salu_offset_64bit_12bit_neg_high_split0:
1292; GFX9:       ; %bb.0:
1293; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
1294; GFX9-NEXT:    v_bfrev_b32_e32 v1, 1
1295; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
1296; GFX9-NEXT:    v_mov_b32_e32 v0, s0
1297; GFX9-NEXT:    v_mov_b32_e32 v2, s1
1298; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, 0xfff, v0
1299; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, v1, v2, vcc
1300; GFX9-NEXT:    flat_load_ubyte v0, v[0:1]
1301; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1302; GFX9-NEXT:    flat_store_byte v[0:1], v0
1303; GFX9-NEXT:    s_endpgm
1304;
1305; GFX10-LABEL: flat_inst_salu_offset_64bit_12bit_neg_high_split0:
1306; GFX10:       ; %bb.0:
1307; GFX10-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
1308; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
1309; GFX10-NEXT:    s_add_u32 s0, s0, 0xfff
1310; GFX10-NEXT:    s_addc_u32 s1, s1, 0x80000000
1311; GFX10-NEXT:    v_mov_b32_e32 v0, s0
1312; GFX10-NEXT:    v_mov_b32_e32 v1, s1
1313; GFX10-NEXT:    flat_load_ubyte v0, v[0:1]
1314; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1315; GFX10-NEXT:    flat_store_byte v[0:1], v0
1316; GFX10-NEXT:    s_endpgm
1317  %gep = getelementptr i8, i8* %p, i64 -9223372036854771713
1318  %load = load volatile i8, i8* %gep, align 1
1319  store i8 %load, i8* undef
1320  ret void
1321}
1322
1323; Fill 12-bit low-bits, negative high bits (1ull << 63) | 4096
; The GEP constant -9223372036854771712 is that bit pattern read as a signed
; i64: high 32-bit word 0x80000000, low word 0x1000.
; Neither expected output uses an instruction offset. GFX9 adds 0x1000 to the
; low word and adds the high word from a register set up by v_bfrev_b32 of 1;
; GFX10 adds 0x1000 / 0x80000000 with s_add_u32/s_addc_u32.
1324define amdgpu_kernel void @flat_inst_salu_offset_64bit_12bit_neg_high_split1(i8* %p) {
1325; GFX9-LABEL: flat_inst_salu_offset_64bit_12bit_neg_high_split1:
1326; GFX9:       ; %bb.0:
1327; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
1328; GFX9-NEXT:    v_bfrev_b32_e32 v1, 1
1329; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
1330; GFX9-NEXT:    v_mov_b32_e32 v0, s0
1331; GFX9-NEXT:    v_mov_b32_e32 v2, s1
1332; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, 0x1000, v0
1333; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, v1, v2, vcc
1334; GFX9-NEXT:    flat_load_ubyte v0, v[0:1]
1335; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1336; GFX9-NEXT:    flat_store_byte v[0:1], v0
1337; GFX9-NEXT:    s_endpgm
1338;
1339; GFX10-LABEL: flat_inst_salu_offset_64bit_12bit_neg_high_split1:
1340; GFX10:       ; %bb.0:
1341; GFX10-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
1342; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
1343; GFX10-NEXT:    s_add_u32 s0, s0, 0x1000
1344; GFX10-NEXT:    s_addc_u32 s1, s1, 0x80000000
1345; GFX10-NEXT:    v_mov_b32_e32 v0, s0
1346; GFX10-NEXT:    v_mov_b32_e32 v1, s1
1347; GFX10-NEXT:    flat_load_ubyte v0, v[0:1]
1348; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1349; GFX10-NEXT:    flat_store_byte v[0:1], v0
1350; GFX10-NEXT:    s_endpgm
1351  %gep = getelementptr i8, i8* %p, i64 -9223372036854771712
1352  %load = load volatile i8, i8* %gep, align 1
1353  store i8 %load, i8* undef
1354  ret void
1355}
1356
1357; Fill 13-bit low-bits, negative high bits (1ull << 63) | 8191
; The GEP constant -9223372036854767617 is that bit pattern read as a signed
; i64: high 32-bit word 0x80000000, low word 0x1fff.
; Neither expected output uses an instruction offset. GFX9 adds 0x1fff to the
; low word and adds the high word from a register set up by v_bfrev_b32 of 1;
; GFX10 adds 0x1fff / 0x80000000 with s_add_u32/s_addc_u32.
1358define amdgpu_kernel void @flat_inst_salu_offset_64bit_13bit_neg_high_split0(i8* %p) {
1359; GFX9-LABEL: flat_inst_salu_offset_64bit_13bit_neg_high_split0:
1360; GFX9:       ; %bb.0:
1361; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
1362; GFX9-NEXT:    v_bfrev_b32_e32 v1, 1
1363; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
1364; GFX9-NEXT:    v_mov_b32_e32 v0, s0
1365; GFX9-NEXT:    v_mov_b32_e32 v2, s1
1366; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, 0x1fff, v0
1367; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, v1, v2, vcc
1368; GFX9-NEXT:    flat_load_ubyte v0, v[0:1]
1369; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1370; GFX9-NEXT:    flat_store_byte v[0:1], v0
1371; GFX9-NEXT:    s_endpgm
1372;
1373; GFX10-LABEL: flat_inst_salu_offset_64bit_13bit_neg_high_split0:
1374; GFX10:       ; %bb.0:
1375; GFX10-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
1376; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
1377; GFX10-NEXT:    s_add_u32 s0, s0, 0x1fff
1378; GFX10-NEXT:    s_addc_u32 s1, s1, 0x80000000
1379; GFX10-NEXT:    v_mov_b32_e32 v0, s0
1380; GFX10-NEXT:    v_mov_b32_e32 v1, s1
1381; GFX10-NEXT:    flat_load_ubyte v0, v[0:1]
1382; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1383; GFX10-NEXT:    flat_store_byte v[0:1], v0
1384; GFX10-NEXT:    s_endpgm
1385  %gep = getelementptr i8, i8* %p, i64 -9223372036854767617
1386  %load = load volatile i8, i8* %gep, align 1
1387  store i8 %load, i8* undef
1388  ret void
1389}
1390
1391; Fill 13-bit low-bits, negative high bits (1ull << 63) | 8192
; The GEP constant -9223372036854767616 is that bit pattern read as a signed
; i64: high 32-bit word 0x80000000, low word 0x2000.
; Neither expected output uses an instruction offset. GFX9 adds 0x2000 to the
; low word and adds the high word from a register set up by v_bfrev_b32 of 1;
; GFX10 adds 0x2000 / 0x80000000 with s_add_u32/s_addc_u32.
1392define amdgpu_kernel void @flat_inst_salu_offset_64bit_13bit_neg_high_split1(i8* %p) {
1393; GFX9-LABEL: flat_inst_salu_offset_64bit_13bit_neg_high_split1:
1394; GFX9:       ; %bb.0:
1395; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
1396; GFX9-NEXT:    v_bfrev_b32_e32 v1, 1
1397; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
1398; GFX9-NEXT:    v_mov_b32_e32 v0, s0
1399; GFX9-NEXT:    v_mov_b32_e32 v2, s1
1400; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, 0x2000, v0
1401; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, v1, v2, vcc
1402; GFX9-NEXT:    flat_load_ubyte v0, v[0:1]
1403; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1404; GFX9-NEXT:    flat_store_byte v[0:1], v0
1405; GFX9-NEXT:    s_endpgm
1406;
1407; GFX10-LABEL: flat_inst_salu_offset_64bit_13bit_neg_high_split1:
1408; GFX10:       ; %bb.0:
1409; GFX10-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
1410; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
1411; GFX10-NEXT:    s_add_u32 s0, s0, 0x2000
1412; GFX10-NEXT:    s_addc_u32 s1, s1, 0x80000000
1413; GFX10-NEXT:    v_mov_b32_e32 v0, s0
1414; GFX10-NEXT:    v_mov_b32_e32 v1, s1
1415; GFX10-NEXT:    flat_load_ubyte v0, v[0:1]
1416; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1417; GFX10-NEXT:    flat_store_byte v[0:1], v0
1418; GFX10-NEXT:    s_endpgm
1419  %gep = getelementptr i8, i8* %p, i64 -9223372036854767616
1420  %load = load volatile i8, i8* %gep, align 1
1421  store i8 %load, i8* undef
1422  ret void
1423}
1424