1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2; RUN: llc -march=amdgcn -mcpu=gfx900 -mattr=-promote-alloca -amdgpu-enable-flat-scratch -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX9 %s
3; RUN: llc -march=amdgcn -mcpu=gfx1030 -mattr=-promote-alloca -amdgpu-enable-flat-scratch -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX10 %s
4; RUN: llc -mtriple=amdgcn--amdpal -mcpu=gfx900 -mattr=-promote-alloca -amdgpu-enable-flat-scratch -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX9-PAL %s
5; RUN: llc -mtriple=amdgcn--amdpal -mcpu=gfx1030 -mattr=-promote-alloca -amdgpu-enable-flat-scratch -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX10-PAL %s
6
; Kernel entry point that zero-fills a 64-byte stack array ([32 x i16] in
; addrspace(5)) with llvm.memset. In every expected output the fill is four
; scratch_store_dwordx4 instructions at offsets 16, 32, 48 and 64, emitted
; after the flat-scratch register initialization. The gfx900 outputs pass an
; SGPR (vcc_hi, set to 0) as the scalar address operand, while the gfx1030
; outputs use "off". The amdpal outputs additionally start with an
; s_getpc_b64 / s_load_dwordx2 sequence before initializing flat scratch.
7define amdgpu_kernel void @zero_init_kernel() {
8; GFX9-LABEL: zero_init_kernel:
9; GFX9:       ; %bb.0:
10; GFX9-NEXT:    s_add_u32 flat_scratch_lo, s0, s3
11; GFX9-NEXT:    s_mov_b32 s0, 0
12; GFX9-NEXT:    s_addc_u32 flat_scratch_hi, s1, 0
13; GFX9-NEXT:    s_mov_b32 s1, s0
14; GFX9-NEXT:    s_mov_b32 s2, s0
15; GFX9-NEXT:    s_mov_b32 s3, s0
16; GFX9-NEXT:    v_mov_b32_e32 v0, s0
17; GFX9-NEXT:    v_mov_b32_e32 v1, s1
18; GFX9-NEXT:    v_mov_b32_e32 v2, s2
19; GFX9-NEXT:    v_mov_b32_e32 v3, s3
20; GFX9-NEXT:    s_mov_b32 vcc_hi, 0
21; GFX9-NEXT:    scratch_store_dwordx4 off, v[0:3], vcc_hi offset:64
22; GFX9-NEXT:    s_mov_b32 vcc_hi, 0
23; GFX9-NEXT:    scratch_store_dwordx4 off, v[0:3], vcc_hi offset:48
24; GFX9-NEXT:    s_mov_b32 vcc_hi, 0
25; GFX9-NEXT:    scratch_store_dwordx4 off, v[0:3], vcc_hi offset:32
26; GFX9-NEXT:    s_mov_b32 vcc_hi, 0
27; GFX9-NEXT:    scratch_store_dwordx4 off, v[0:3], vcc_hi offset:16
28; GFX9-NEXT:    s_endpgm
29;
30; GFX10-LABEL: zero_init_kernel:
31; GFX10:       ; %bb.0:
32; GFX10-NEXT:    s_add_u32 s0, s0, s3
33; GFX10-NEXT:    s_addc_u32 s1, s1, 0
34; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s0
35; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1
36; GFX10-NEXT:    s_mov_b32 s0, 0
37; GFX10-NEXT:    s_mov_b32 s1, s0
38; GFX10-NEXT:    s_mov_b32 s2, s0
39; GFX10-NEXT:    s_mov_b32 s3, s0
40; GFX10-NEXT:    v_mov_b32_e32 v0, s0
41; GFX10-NEXT:    v_mov_b32_e32 v1, s1
42; GFX10-NEXT:    v_mov_b32_e32 v2, s2
43; GFX10-NEXT:    v_mov_b32_e32 v3, s3
44; GFX10-NEXT:    scratch_store_dwordx4 off, v[0:3], off offset:64
45; GFX10-NEXT:    scratch_store_dwordx4 off, v[0:3], off offset:48
46; GFX10-NEXT:    scratch_store_dwordx4 off, v[0:3], off offset:32
47; GFX10-NEXT:    scratch_store_dwordx4 off, v[0:3], off offset:16
48; GFX10-NEXT:    s_endpgm
49;
50; GFX9-PAL-LABEL: zero_init_kernel:
51; GFX9-PAL:       ; %bb.0:
52; GFX9-PAL-NEXT:    s_getpc_b64 s[2:3]
53; GFX9-PAL-NEXT:    s_mov_b32 s2, s0
54; GFX9-PAL-NEXT:    s_load_dwordx2 s[2:3], s[2:3], 0x0
55; GFX9-PAL-NEXT:    s_mov_b32 s0, 0
56; GFX9-PAL-NEXT:    s_mov_b32 vcc_hi, 0
57; GFX9-PAL-NEXT:    s_waitcnt lgkmcnt(0)
58; GFX9-PAL-NEXT:    s_and_b32 s3, s3, 0xffff
59; GFX9-PAL-NEXT:    s_add_u32 flat_scratch_lo, s2, s1
60; GFX9-PAL-NEXT:    s_addc_u32 flat_scratch_hi, s3, 0
61; GFX9-PAL-NEXT:    s_mov_b32 s1, s0
62; GFX9-PAL-NEXT:    s_mov_b32 s2, s0
63; GFX9-PAL-NEXT:    s_mov_b32 s3, s0
64; GFX9-PAL-NEXT:    v_mov_b32_e32 v0, s0
65; GFX9-PAL-NEXT:    v_mov_b32_e32 v1, s1
66; GFX9-PAL-NEXT:    v_mov_b32_e32 v2, s2
67; GFX9-PAL-NEXT:    v_mov_b32_e32 v3, s3
68; GFX9-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], vcc_hi offset:64
69; GFX9-PAL-NEXT:    s_mov_b32 vcc_hi, 0
70; GFX9-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], vcc_hi offset:48
71; GFX9-PAL-NEXT:    s_mov_b32 vcc_hi, 0
72; GFX9-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], vcc_hi offset:32
73; GFX9-PAL-NEXT:    s_mov_b32 vcc_hi, 0
74; GFX9-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], vcc_hi offset:16
75; GFX9-PAL-NEXT:    s_endpgm
76;
77; GFX10-PAL-LABEL: zero_init_kernel:
78; GFX10-PAL:       ; %bb.0:
79; GFX10-PAL-NEXT:    s_getpc_b64 s[2:3]
80; GFX10-PAL-NEXT:    s_mov_b32 s2, s0
81; GFX10-PAL-NEXT:    s_load_dwordx2 s[2:3], s[2:3], 0x0
82; GFX10-PAL-NEXT:    s_waitcnt lgkmcnt(0)
83; GFX10-PAL-NEXT:    s_and_b32 s3, s3, 0xffff
84; GFX10-PAL-NEXT:    s_add_u32 s2, s2, s1
85; GFX10-PAL-NEXT:    s_addc_u32 s3, s3, 0
86; GFX10-PAL-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s2
87; GFX10-PAL-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s3
88; GFX10-PAL-NEXT:    s_mov_b32 s0, 0
89; GFX10-PAL-NEXT:    s_mov_b32 s1, s0
90; GFX10-PAL-NEXT:    s_mov_b32 s2, s0
91; GFX10-PAL-NEXT:    s_mov_b32 s3, s0
92; GFX10-PAL-NEXT:    v_mov_b32_e32 v0, s0
93; GFX10-PAL-NEXT:    v_mov_b32_e32 v1, s1
94; GFX10-PAL-NEXT:    v_mov_b32_e32 v2, s2
95; GFX10-PAL-NEXT:    v_mov_b32_e32 v3, s3
96; GFX10-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], off offset:64
97; GFX10-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], off offset:48
98; GFX10-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], off offset:32
99; GFX10-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], off offset:16
100; GFX10-PAL-NEXT:    s_endpgm
; The memset length (64 bytes) covers the whole [32 x i16] alloca; the memset
; is non-volatile (last argument is i1 false).
101  %alloca = alloca [32 x i16], align 2, addrspace(5)
102  %cast = bitcast [32 x i16] addrspace(5)* %alloca to i8 addrspace(5)*
103  call void @llvm.memset.p5i8.i64(i8 addrspace(5)* align 2 dereferenceable(64) %cast, i8 0, i64 64, i1 false)
104  ret void
105}
106
; Callable (non-kernel) function that zero-fills a 64-byte stack array
; ([32 x i16] in addrspace(5)) with llvm.memset. The expected output contains
; no flat-scratch initialization sequence; the four scratch_store_dwordx4
; instructions are addressed from s32 at offsets 0, 16, 32 and 48, and the
; function returns with s_setpc_b64 s[30:31].
107define void @zero_init_foo() {
108; GFX9-LABEL: zero_init_foo:
109; GFX9:       ; %bb.0:
110; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
111; GFX9-NEXT:    s_mov_b32 s0, 0
112; GFX9-NEXT:    s_mov_b32 s1, s0
113; GFX9-NEXT:    s_mov_b32 s2, s0
114; GFX9-NEXT:    s_mov_b32 s3, s0
115; GFX9-NEXT:    v_mov_b32_e32 v0, s0
116; GFX9-NEXT:    v_mov_b32_e32 v1, s1
117; GFX9-NEXT:    v_mov_b32_e32 v2, s2
118; GFX9-NEXT:    v_mov_b32_e32 v3, s3
119; GFX9-NEXT:    scratch_store_dwordx4 off, v[0:3], s32 offset:48
120; GFX9-NEXT:    scratch_store_dwordx4 off, v[0:3], s32 offset:32
121; GFX9-NEXT:    scratch_store_dwordx4 off, v[0:3], s32 offset:16
122; GFX9-NEXT:    scratch_store_dwordx4 off, v[0:3], s32
123; GFX9-NEXT:    s_waitcnt vmcnt(0)
124; GFX9-NEXT:    s_setpc_b64 s[30:31]
125;
126; GFX10-LABEL: zero_init_foo:
127; GFX10:       ; %bb.0:
128; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
129; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
130; GFX10-NEXT:    s_mov_b32 s0, 0
131; GFX10-NEXT:    s_mov_b32 s1, s0
132; GFX10-NEXT:    s_mov_b32 s2, s0
133; GFX10-NEXT:    s_mov_b32 s3, s0
134; GFX10-NEXT:    v_mov_b32_e32 v0, s0
135; GFX10-NEXT:    v_mov_b32_e32 v1, s1
136; GFX10-NEXT:    v_mov_b32_e32 v2, s2
137; GFX10-NEXT:    v_mov_b32_e32 v3, s3
138; GFX10-NEXT:    scratch_store_dwordx4 off, v[0:3], s32 offset:48
139; GFX10-NEXT:    scratch_store_dwordx4 off, v[0:3], s32 offset:32
140; GFX10-NEXT:    scratch_store_dwordx4 off, v[0:3], s32 offset:16
141; GFX10-NEXT:    scratch_store_dwordx4 off, v[0:3], s32
142; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
143; GFX10-NEXT:    s_setpc_b64 s[30:31]
144;
145; GFX9-PAL-LABEL: zero_init_foo:
146; GFX9-PAL:       ; %bb.0:
147; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
148; GFX9-PAL-NEXT:    s_mov_b32 s0, 0
149; GFX9-PAL-NEXT:    s_mov_b32 s1, s0
150; GFX9-PAL-NEXT:    s_mov_b32 s2, s0
151; GFX9-PAL-NEXT:    s_mov_b32 s3, s0
152; GFX9-PAL-NEXT:    v_mov_b32_e32 v0, s0
153; GFX9-PAL-NEXT:    v_mov_b32_e32 v1, s1
154; GFX9-PAL-NEXT:    v_mov_b32_e32 v2, s2
155; GFX9-PAL-NEXT:    v_mov_b32_e32 v3, s3
156; GFX9-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], s32 offset:48
157; GFX9-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], s32 offset:32
158; GFX9-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], s32 offset:16
159; GFX9-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], s32
160; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0)
161; GFX9-PAL-NEXT:    s_setpc_b64 s[30:31]
162;
163; GFX10-PAL-LABEL: zero_init_foo:
164; GFX10-PAL:       ; %bb.0:
165; GFX10-PAL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
166; GFX10-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
167; GFX10-PAL-NEXT:    s_mov_b32 s0, 0
168; GFX10-PAL-NEXT:    s_mov_b32 s1, s0
169; GFX10-PAL-NEXT:    s_mov_b32 s2, s0
170; GFX10-PAL-NEXT:    s_mov_b32 s3, s0
171; GFX10-PAL-NEXT:    v_mov_b32_e32 v0, s0
172; GFX10-PAL-NEXT:    v_mov_b32_e32 v1, s1
173; GFX10-PAL-NEXT:    v_mov_b32_e32 v2, s2
174; GFX10-PAL-NEXT:    v_mov_b32_e32 v3, s3
175; GFX10-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], s32 offset:48
176; GFX10-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], s32 offset:32
177; GFX10-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], s32 offset:16
178; GFX10-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], s32
179; GFX10-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
180; GFX10-PAL-NEXT:    s_setpc_b64 s[30:31]
; The memset length (64 bytes) covers the whole [32 x i16] alloca; the memset
; is non-volatile (last argument is i1 false).
181  %alloca = alloca [32 x i16], align 2, addrspace(5)
182  %cast = bitcast [32 x i16] addrspace(5)* %alloca to i8 addrspace(5)*
183  call void @llvm.memset.p5i8.i64(i8 addrspace(5)* align 2 dereferenceable(64) %cast, i8 0, i64 64, i1 false)
184  ret void
185}
186
; Kernel taking an i32 argument %idx. It performs a volatile store of 15 to
; element %idx of a [32 x float] stack array and then a volatile load from
; element (%idx & 15). In the expected output %idx is read with s_load_dword,
; both addresses are computed with scalar instructions (s_lshl_b32,
; s_and_b32, s_add_u32 with the constant 4), and the scratch_store_dword /
; scratch_load_dword take the resulting SGPR as their address operand.
187define amdgpu_kernel void @store_load_sindex_kernel(i32 %idx) {
188; GFX9-LABEL: store_load_sindex_kernel:
189; GFX9:       ; %bb.0: ; %bb
190; GFX9-NEXT:    s_load_dword s0, s[0:1], 0x24
191; GFX9-NEXT:    s_add_u32 flat_scratch_lo, s2, s5
192; GFX9-NEXT:    s_addc_u32 flat_scratch_hi, s3, 0
193; GFX9-NEXT:    v_mov_b32_e32 v0, 15
194; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
195; GFX9-NEXT:    s_lshl_b32 s1, s0, 2
196; GFX9-NEXT:    s_and_b32 s0, s0, 15
197; GFX9-NEXT:    s_lshl_b32 s0, s0, 2
198; GFX9-NEXT:    s_add_u32 s1, 4, s1
199; GFX9-NEXT:    scratch_store_dword off, v0, s1
200; GFX9-NEXT:    s_add_u32 s0, 4, s0
201; GFX9-NEXT:    scratch_load_dword v0, off, s0
202; GFX9-NEXT:    s_endpgm
203;
204; GFX10-LABEL: store_load_sindex_kernel:
205; GFX10:       ; %bb.0: ; %bb
206; GFX10-NEXT:    s_add_u32 s2, s2, s5
207; GFX10-NEXT:    s_addc_u32 s3, s3, 0
208; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s2
209; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s3
210; GFX10-NEXT:    s_load_dword s0, s[0:1], 0x24
211; GFX10-NEXT:    v_mov_b32_e32 v0, 15
212; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
213; GFX10-NEXT:    s_and_b32 s1, s0, 15
214; GFX10-NEXT:    s_lshl_b32 s0, s0, 2
215; GFX10-NEXT:    s_lshl_b32 s1, s1, 2
216; GFX10-NEXT:    s_add_u32 s0, 4, s0
217; GFX10-NEXT:    s_add_u32 s1, 4, s1
218; GFX10-NEXT:    scratch_store_dword off, v0, s0
219; GFX10-NEXT:    scratch_load_dword v0, off, s1
220; GFX10-NEXT:    s_endpgm
221;
222; GFX9-PAL-LABEL: store_load_sindex_kernel:
223; GFX9-PAL:       ; %bb.0: ; %bb
224; GFX9-PAL-NEXT:    s_getpc_b64 s[4:5]
225; GFX9-PAL-NEXT:    s_mov_b32 s4, s0
226; GFX9-PAL-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x0
227; GFX9-PAL-NEXT:    s_load_dword s0, s[0:1], 0x24
228; GFX9-PAL-NEXT:    v_mov_b32_e32 v0, 15
229; GFX9-PAL-NEXT:    s_waitcnt lgkmcnt(0)
230; GFX9-PAL-NEXT:    s_and_b32 s5, s5, 0xffff
231; GFX9-PAL-NEXT:    s_add_u32 flat_scratch_lo, s4, s3
232; GFX9-PAL-NEXT:    s_addc_u32 flat_scratch_hi, s5, 0
233; GFX9-PAL-NEXT:    s_lshl_b32 s1, s0, 2
234; GFX9-PAL-NEXT:    s_and_b32 s0, s0, 15
235; GFX9-PAL-NEXT:    s_lshl_b32 s0, s0, 2
236; GFX9-PAL-NEXT:    s_add_u32 s1, 4, s1
237; GFX9-PAL-NEXT:    scratch_store_dword off, v0, s1
238; GFX9-PAL-NEXT:    s_add_u32 s0, 4, s0
239; GFX9-PAL-NEXT:    scratch_load_dword v0, off, s0
240; GFX9-PAL-NEXT:    s_endpgm
241;
242; GFX10-PAL-LABEL: store_load_sindex_kernel:
243; GFX10-PAL:       ; %bb.0: ; %bb
244; GFX10-PAL-NEXT:    s_getpc_b64 s[4:5]
245; GFX10-PAL-NEXT:    s_mov_b32 s4, s0
246; GFX10-PAL-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x0
247; GFX10-PAL-NEXT:    s_waitcnt lgkmcnt(0)
248; GFX10-PAL-NEXT:    s_and_b32 s5, s5, 0xffff
249; GFX10-PAL-NEXT:    s_add_u32 s4, s4, s3
250; GFX10-PAL-NEXT:    s_addc_u32 s5, s5, 0
251; GFX10-PAL-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s4
252; GFX10-PAL-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s5
253; GFX10-PAL-NEXT:    s_load_dword s0, s[0:1], 0x24
254; GFX10-PAL-NEXT:    v_mov_b32_e32 v0, 15
255; GFX10-PAL-NEXT:    s_waitcnt lgkmcnt(0)
256; GFX10-PAL-NEXT:    s_and_b32 s1, s0, 15
257; GFX10-PAL-NEXT:    s_lshl_b32 s0, s0, 2
258; GFX10-PAL-NEXT:    s_lshl_b32 s1, s1, 2
259; GFX10-PAL-NEXT:    s_add_u32 s0, 4, s0
260; GFX10-PAL-NEXT:    s_add_u32 s1, 4, s1
261; GFX10-PAL-NEXT:    scratch_store_dword off, v0, s0
262; GFX10-PAL-NEXT:    scratch_load_dword v0, off, s1
263; GFX10-PAL-NEXT:    s_endpgm
264bb:
265  %i = alloca [32 x float], align 4, addrspace(5)
; NOTE(review): %i1 has no users in this function.
266  %i1 = bitcast [32 x float] addrspace(5)* %i to i8 addrspace(5)*
; Store 15 to %i[%idx]; volatile so the access is kept.
267  %i7 = getelementptr inbounds [32 x float], [32 x float] addrspace(5)* %i, i32 0, i32 %idx
268  %i8 = bitcast float addrspace(5)* %i7 to i32 addrspace(5)*
269  store volatile i32 15, i32 addrspace(5)* %i8, align 4
; Load from %i[%idx & 15]; the loaded value %i12 is otherwise unused.
270  %i9 = and i32 %idx, 15
271  %i10 = getelementptr inbounds [32 x float], [32 x float] addrspace(5)* %i, i32 0, i32 %i9
272  %i11 = bitcast float addrspace(5)* %i10 to i32 addrspace(5)*
273  %i12 = load volatile i32, i32 addrspace(5)* %i11, align 4
274  ret void
275}
276
; amdgpu_ps function taking %idx as an "inreg" i32 argument. It performs a
; volatile store of 15 to element %idx of a [32 x float] stack array and then
; a volatile load from element (%idx & 15). The expected output computes both
; addresses with scalar instructions directly from an input SGPR (no
; s_load_dword of the argument) and passes the resulting SGPR as the address
; operand of scratch_store_dword / scratch_load_dword.
277define amdgpu_ps void @store_load_sindex_foo(i32 inreg %idx) {
278; GFX9-LABEL: store_load_sindex_foo:
279; GFX9:       ; %bb.0: ; %bb
280; GFX9-NEXT:    s_add_u32 flat_scratch_lo, s0, s3
281; GFX9-NEXT:    s_addc_u32 flat_scratch_hi, s1, 0
282; GFX9-NEXT:    s_lshl_b32 s0, s2, 2
283; GFX9-NEXT:    s_add_u32 s0, 4, s0
284; GFX9-NEXT:    v_mov_b32_e32 v0, 15
285; GFX9-NEXT:    scratch_store_dword off, v0, s0
286; GFX9-NEXT:    s_and_b32 s0, s2, 15
287; GFX9-NEXT:    s_lshl_b32 s0, s0, 2
288; GFX9-NEXT:    s_add_u32 s0, 4, s0
289; GFX9-NEXT:    scratch_load_dword v0, off, s0
290; GFX9-NEXT:    s_endpgm
291;
292; GFX10-LABEL: store_load_sindex_foo:
293; GFX10:       ; %bb.0: ; %bb
294; GFX10-NEXT:    s_add_u32 s0, s0, s3
295; GFX10-NEXT:    s_addc_u32 s1, s1, 0
296; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s0
297; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1
298; GFX10-NEXT:    s_and_b32 s0, s2, 15
299; GFX10-NEXT:    v_mov_b32_e32 v0, 15
300; GFX10-NEXT:    s_lshl_b32 s1, s2, 2
301; GFX10-NEXT:    s_lshl_b32 s0, s0, 2
302; GFX10-NEXT:    s_add_u32 s1, 4, s1
303; GFX10-NEXT:    s_add_u32 s0, 4, s0
304; GFX10-NEXT:    scratch_store_dword off, v0, s1
305; GFX10-NEXT:    scratch_load_dword v0, off, s0
306; GFX10-NEXT:    s_endpgm
307;
308; GFX9-PAL-LABEL: store_load_sindex_foo:
309; GFX9-PAL:       ; %bb.0: ; %bb
310; GFX9-PAL-NEXT:    s_getpc_b64 s[2:3]
311; GFX9-PAL-NEXT:    s_mov_b32 s2, s0
312; GFX9-PAL-NEXT:    s_load_dwordx2 s[2:3], s[2:3], 0x0
313; GFX9-PAL-NEXT:    v_mov_b32_e32 v0, 15
314; GFX9-PAL-NEXT:    s_waitcnt lgkmcnt(0)
315; GFX9-PAL-NEXT:    s_and_b32 s3, s3, 0xffff
316; GFX9-PAL-NEXT:    s_add_u32 flat_scratch_lo, s2, s1
317; GFX9-PAL-NEXT:    s_addc_u32 flat_scratch_hi, s3, 0
318; GFX9-PAL-NEXT:    s_lshl_b32 s1, s0, 2
319; GFX9-PAL-NEXT:    s_and_b32 s0, s0, 15
320; GFX9-PAL-NEXT:    s_lshl_b32 s0, s0, 2
321; GFX9-PAL-NEXT:    s_add_u32 s1, 4, s1
322; GFX9-PAL-NEXT:    scratch_store_dword off, v0, s1
323; GFX9-PAL-NEXT:    s_add_u32 s0, 4, s0
324; GFX9-PAL-NEXT:    scratch_load_dword v0, off, s0
325; GFX9-PAL-NEXT:    s_endpgm
326;
327; GFX10-PAL-LABEL: store_load_sindex_foo:
328; GFX10-PAL:       ; %bb.0: ; %bb
329; GFX10-PAL-NEXT:    s_getpc_b64 s[2:3]
330; GFX10-PAL-NEXT:    s_mov_b32 s2, s0
331; GFX10-PAL-NEXT:    s_load_dwordx2 s[2:3], s[2:3], 0x0
332; GFX10-PAL-NEXT:    s_waitcnt lgkmcnt(0)
333; GFX10-PAL-NEXT:    s_and_b32 s3, s3, 0xffff
334; GFX10-PAL-NEXT:    s_add_u32 s2, s2, s1
335; GFX10-PAL-NEXT:    s_addc_u32 s3, s3, 0
336; GFX10-PAL-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s2
337; GFX10-PAL-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s3
338; GFX10-PAL-NEXT:    s_and_b32 s1, s0, 15
339; GFX10-PAL-NEXT:    v_mov_b32_e32 v0, 15
340; GFX10-PAL-NEXT:    s_lshl_b32 s0, s0, 2
341; GFX10-PAL-NEXT:    s_lshl_b32 s1, s1, 2
342; GFX10-PAL-NEXT:    s_add_u32 s0, 4, s0
343; GFX10-PAL-NEXT:    s_add_u32 s1, 4, s1
344; GFX10-PAL-NEXT:    scratch_store_dword off, v0, s0
345; GFX10-PAL-NEXT:    scratch_load_dword v0, off, s1
346; GFX10-PAL-NEXT:    s_endpgm
347bb:
348  %i = alloca [32 x float], align 4, addrspace(5)
; NOTE(review): %i1 has no users in this function.
349  %i1 = bitcast [32 x float] addrspace(5)* %i to i8 addrspace(5)*
; Store 15 to %i[%idx]; volatile so the access is kept.
350  %i7 = getelementptr inbounds [32 x float], [32 x float] addrspace(5)* %i, i32 0, i32 %idx
351  %i8 = bitcast float addrspace(5)* %i7 to i32 addrspace(5)*
352  store volatile i32 15, i32 addrspace(5)* %i8, align 4
; Load from %i[%idx & 15]; the loaded value %i12 is otherwise unused.
353  %i9 = and i32 %idx, 15
354  %i10 = getelementptr inbounds [32 x float], [32 x float] addrspace(5)* %i, i32 0, i32 %i9
355  %i11 = bitcast float addrspace(5)* %i10 to i32 addrspace(5)*
356  %i12 = load volatile i32, i32 addrspace(5)* %i11, align 4
357  ret void
358}
359
; Kernel that indexes a [32 x float] stack array with the work-item id
; (llvm.amdgcn.workitem.id.x). It performs a volatile store of 15 to element
; id and then a volatile load from element (31 - id). In the expected output
; both addresses are held in VGPRs: the store address is formed with a vector
; add of the scaled id and the constant 4, and the load address with a vector
; subtract plus an immediate offset of 124 (= 31 * 4) on scratch_load_dword.
360define amdgpu_kernel void @store_load_vindex_kernel() {
361; GFX9-LABEL: store_load_vindex_kernel:
362; GFX9:       ; %bb.0: ; %bb
363; GFX9-NEXT:    s_add_u32 flat_scratch_lo, s0, s3
364; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
365; GFX9-NEXT:    v_mov_b32_e32 v1, 4
366; GFX9-NEXT:    v_add_u32_e32 v2, v1, v0
367; GFX9-NEXT:    s_addc_u32 flat_scratch_hi, s1, 0
368; GFX9-NEXT:    v_mov_b32_e32 v3, 15
369; GFX9-NEXT:    scratch_store_dword v2, v3, off
370; GFX9-NEXT:    v_sub_u32_e32 v0, v1, v0
371; GFX9-NEXT:    scratch_load_dword v0, v0, off offset:124
372; GFX9-NEXT:    s_endpgm
373;
374; GFX10-LABEL: store_load_vindex_kernel:
375; GFX10:       ; %bb.0: ; %bb
376; GFX10-NEXT:    s_add_u32 s0, s0, s3
377; GFX10-NEXT:    s_addc_u32 s1, s1, 0
378; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s0
379; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1
380; GFX10-NEXT:    v_mov_b32_e32 v1, 4
381; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
382; GFX10-NEXT:    v_mov_b32_e32 v3, 15
383; GFX10-NEXT:    v_add_nc_u32_e32 v2, v1, v0
384; GFX10-NEXT:    v_sub_nc_u32_e32 v0, v1, v0
385; GFX10-NEXT:    scratch_store_dword v2, v3, off
386; GFX10-NEXT:    scratch_load_dword v0, v0, off offset:124
387; GFX10-NEXT:    s_endpgm
388;
389; GFX9-PAL-LABEL: store_load_vindex_kernel:
390; GFX9-PAL:       ; %bb.0: ; %bb
391; GFX9-PAL-NEXT:    s_getpc_b64 s[2:3]
392; GFX9-PAL-NEXT:    s_mov_b32 s2, s0
393; GFX9-PAL-NEXT:    s_load_dwordx2 s[2:3], s[2:3], 0x0
394; GFX9-PAL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
395; GFX9-PAL-NEXT:    v_mov_b32_e32 v1, 4
396; GFX9-PAL-NEXT:    v_add_u32_e32 v2, v1, v0
397; GFX9-PAL-NEXT:    v_mov_b32_e32 v3, 15
398; GFX9-PAL-NEXT:    s_waitcnt lgkmcnt(0)
399; GFX9-PAL-NEXT:    s_and_b32 s3, s3, 0xffff
400; GFX9-PAL-NEXT:    s_add_u32 flat_scratch_lo, s2, s1
401; GFX9-PAL-NEXT:    s_addc_u32 flat_scratch_hi, s3, 0
402; GFX9-PAL-NEXT:    scratch_store_dword v2, v3, off
403; GFX9-PAL-NEXT:    v_sub_u32_e32 v0, v1, v0
404; GFX9-PAL-NEXT:    scratch_load_dword v0, v0, off offset:124
405; GFX9-PAL-NEXT:    s_endpgm
406;
407; GFX10-PAL-LABEL: store_load_vindex_kernel:
408; GFX10-PAL:       ; %bb.0: ; %bb
409; GFX10-PAL-NEXT:    s_getpc_b64 s[2:3]
410; GFX10-PAL-NEXT:    s_mov_b32 s2, s0
411; GFX10-PAL-NEXT:    s_load_dwordx2 s[2:3], s[2:3], 0x0
412; GFX10-PAL-NEXT:    s_waitcnt lgkmcnt(0)
413; GFX10-PAL-NEXT:    s_and_b32 s3, s3, 0xffff
414; GFX10-PAL-NEXT:    s_add_u32 s2, s2, s1
415; GFX10-PAL-NEXT:    s_addc_u32 s3, s3, 0
416; GFX10-PAL-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s2
417; GFX10-PAL-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s3
418; GFX10-PAL-NEXT:    v_mov_b32_e32 v1, 4
419; GFX10-PAL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
420; GFX10-PAL-NEXT:    v_mov_b32_e32 v3, 15
421; GFX10-PAL-NEXT:    v_add_nc_u32_e32 v2, v1, v0
422; GFX10-PAL-NEXT:    v_sub_nc_u32_e32 v0, v1, v0
423; GFX10-PAL-NEXT:    scratch_store_dword v2, v3, off
424; GFX10-PAL-NEXT:    scratch_load_dword v0, v0, off offset:124
425; GFX10-PAL-NEXT:    s_endpgm
426bb:
427  %i = alloca [32 x float], align 4, addrspace(5)
; NOTE(review): %i1 (below) and %i3 (the zext of the work-item id) have no
; users in this function.
428  %i1 = bitcast [32 x float] addrspace(5)* %i to i8 addrspace(5)*
429  %i2 = tail call i32 @llvm.amdgcn.workitem.id.x()
430  %i3 = zext i32 %i2 to i64
; Store 15 to %i[id]; volatile so the access is kept.
431  %i7 = getelementptr inbounds [32 x float], [32 x float] addrspace(5)* %i, i32 0, i32 %i2
432  %i8 = bitcast float addrspace(5)* %i7 to i32 addrspace(5)*
433  store volatile i32 15, i32 addrspace(5)* %i8, align 4
; Load from %i[31 - id]; the loaded value %i12 is otherwise unused.
434  %i9 = sub nsw i32 31, %i2
435  %i10 = getelementptr inbounds [32 x float], [32 x float] addrspace(5)* %i, i32 0, i32 %i9
436  %i11 = bitcast float addrspace(5)* %i10 to i32 addrspace(5)*
437  %i12 = load volatile i32, i32 addrspace(5)* %i11, align 4
438  ret void
439}
440
; Callable (non-kernel) function taking an i32 %idx. It performs a volatile
; store of 15 to element %idx of a [32 x float] stack array and then a
; volatile load from element (%idx & 15). In the expected output s32 is
; copied into a VGPR and each address is formed with v_lshl_add_u32
; (index << 2, plus that copy); the scratch instructions then take the VGPR
; address with "off" as the scalar operand.
441define void @store_load_vindex_foo(i32 %idx) {
442; GFX9-LABEL: store_load_vindex_foo:
443; GFX9:       ; %bb.0: ; %bb
444; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
445; GFX9-NEXT:    v_mov_b32_e32 v1, s32
446; GFX9-NEXT:    v_mov_b32_e32 v3, 15
447; GFX9-NEXT:    v_lshl_add_u32 v2, v0, 2, v1
448; GFX9-NEXT:    v_and_b32_e32 v0, v0, v3
449; GFX9-NEXT:    scratch_store_dword v2, v3, off
450; GFX9-NEXT:    v_lshl_add_u32 v0, v0, 2, v1
451; GFX9-NEXT:    scratch_load_dword v0, v0, off
452; GFX9-NEXT:    s_waitcnt vmcnt(0)
453; GFX9-NEXT:    s_setpc_b64 s[30:31]
454;
455; GFX10-LABEL: store_load_vindex_foo:
456; GFX10:       ; %bb.0: ; %bb
457; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
458; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
459; GFX10-NEXT:    v_mov_b32_e32 v1, 15
460; GFX10-NEXT:    v_mov_b32_e32 v2, s32
461; GFX10-NEXT:    v_and_b32_e32 v3, v0, v1
462; GFX10-NEXT:    v_lshl_add_u32 v0, v0, 2, v2
463; GFX10-NEXT:    v_lshl_add_u32 v2, v3, 2, v2
464; GFX10-NEXT:    scratch_store_dword v0, v1, off
465; GFX10-NEXT:    scratch_load_dword v0, v2, off
466; GFX10-NEXT:    s_waitcnt vmcnt(0)
467; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
468; GFX10-NEXT:    s_setpc_b64 s[30:31]
469;
470; GFX9-PAL-LABEL: store_load_vindex_foo:
471; GFX9-PAL:       ; %bb.0: ; %bb
472; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
473; GFX9-PAL-NEXT:    v_mov_b32_e32 v1, s32
474; GFX9-PAL-NEXT:    v_mov_b32_e32 v3, 15
475; GFX9-PAL-NEXT:    v_lshl_add_u32 v2, v0, 2, v1
476; GFX9-PAL-NEXT:    v_and_b32_e32 v0, v0, v3
477; GFX9-PAL-NEXT:    scratch_store_dword v2, v3, off
478; GFX9-PAL-NEXT:    v_lshl_add_u32 v0, v0, 2, v1
479; GFX9-PAL-NEXT:    scratch_load_dword v0, v0, off
480; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0)
481; GFX9-PAL-NEXT:    s_setpc_b64 s[30:31]
482;
483; GFX10-PAL-LABEL: store_load_vindex_foo:
484; GFX10-PAL:       ; %bb.0: ; %bb
485; GFX10-PAL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
486; GFX10-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
487; GFX10-PAL-NEXT:    v_mov_b32_e32 v1, 15
488; GFX10-PAL-NEXT:    v_mov_b32_e32 v2, s32
489; GFX10-PAL-NEXT:    v_and_b32_e32 v3, v0, v1
490; GFX10-PAL-NEXT:    v_lshl_add_u32 v0, v0, 2, v2
491; GFX10-PAL-NEXT:    v_lshl_add_u32 v2, v3, 2, v2
492; GFX10-PAL-NEXT:    scratch_store_dword v0, v1, off
493; GFX10-PAL-NEXT:    scratch_load_dword v0, v2, off
494; GFX10-PAL-NEXT:    s_waitcnt vmcnt(0)
495; GFX10-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
496; GFX10-PAL-NEXT:    s_setpc_b64 s[30:31]
497bb:
498  %i = alloca [32 x float], align 4, addrspace(5)
; NOTE(review): %i1 has no users in this function.
499  %i1 = bitcast [32 x float] addrspace(5)* %i to i8 addrspace(5)*
; Store 15 to %i[%idx]; volatile so the access is kept.
500  %i7 = getelementptr inbounds [32 x float], [32 x float] addrspace(5)* %i, i32 0, i32 %idx
501  %i8 = bitcast float addrspace(5)* %i7 to i32 addrspace(5)*
502  store volatile i32 15, i32 addrspace(5)* %i8, align 4
; Load from %i[%idx & 15]; the loaded value %i12 is otherwise unused.
503  %i9 = and i32 %idx, 15
504  %i10 = getelementptr inbounds [32 x float], [32 x float] addrspace(5)* %i, i32 0, i32 %i9
505  %i11 = bitcast float addrspace(5)* %i10 to i32 addrspace(5)*
506  %i12 = load volatile i32, i32 addrspace(5)* %i11, align 4
507  ret void
508}
509
; Callable function that receives an addrspace(5) float pointer and stores
; the constant 10.0 to the element one past it (%arg + 1 float = 4 bytes).
; The expected output is a single scratch_store_dword that takes the incoming
; pointer in v0 as the address, folds the 4-byte displacement into the
; instruction's immediate offset, and stores 0x41200000 (the bit pattern
; moved into v1 for the 1.0e+01 constant).
510define void @private_ptr_foo(float addrspace(5)* nocapture %arg) {
511; GFX9-LABEL: private_ptr_foo:
512; GFX9:       ; %bb.0:
513; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
514; GFX9-NEXT:    v_mov_b32_e32 v1, 0x41200000
515; GFX9-NEXT:    scratch_store_dword v0, v1, off offset:4
516; GFX9-NEXT:    s_waitcnt vmcnt(0)
517; GFX9-NEXT:    s_setpc_b64 s[30:31]
518;
519; GFX10-LABEL: private_ptr_foo:
520; GFX10:       ; %bb.0:
521; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
522; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
523; GFX10-NEXT:    v_mov_b32_e32 v1, 0x41200000
524; GFX10-NEXT:    scratch_store_dword v0, v1, off offset:4
525; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
526; GFX10-NEXT:    s_setpc_b64 s[30:31]
527;
528; GFX9-PAL-LABEL: private_ptr_foo:
529; GFX9-PAL:       ; %bb.0:
530; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
531; GFX9-PAL-NEXT:    v_mov_b32_e32 v1, 0x41200000
532; GFX9-PAL-NEXT:    scratch_store_dword v0, v1, off offset:4
533; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0)
534; GFX9-PAL-NEXT:    s_setpc_b64 s[30:31]
535;
536; GFX10-PAL-LABEL: private_ptr_foo:
537; GFX10-PAL:       ; %bb.0:
538; GFX10-PAL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
539; GFX10-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
540; GFX10-PAL-NEXT:    v_mov_b32_e32 v1, 0x41200000
541; GFX10-PAL-NEXT:    scratch_store_dword v0, v1, off offset:4
542; GFX10-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
543; GFX10-PAL-NEXT:    s_setpc_b64 s[30:31]
; Non-volatile store to %arg[1].
544  %gep = getelementptr inbounds float, float addrspace(5)* %arg, i32 1
545  store float 1.000000e+01, float addrspace(5)* %gep, align 4
546  ret void
547}
548
; Kernel with two stack objects: a 256-byte padding array ([64 x i32]) that is
; read once with a volatile load, and a 64-byte array ([32 x i16]) that is
; zero-filled with llvm.memset. The padding pushes the memset target to a
; non-zero frame offset: the expected scratch_store_dwordx4 instructions use
; immediate offsets 272, 288, 304 and 320, and the padding load uses
; offset 4. As in zero_init_kernel, the gfx900 outputs pass vcc_hi (set to 0)
; as the scalar address operand while the gfx1030 outputs use "off".
549define amdgpu_kernel void @zero_init_small_offset_kernel() {
550; GFX9-LABEL: zero_init_small_offset_kernel:
551; GFX9:       ; %bb.0:
552; GFX9-NEXT:    s_add_u32 flat_scratch_lo, s0, s3
553; GFX9-NEXT:    s_addc_u32 flat_scratch_hi, s1, 0
554; GFX9-NEXT:    s_mov_b32 vcc_hi, 0
555; GFX9-NEXT:    scratch_load_dword v0, off, vcc_hi offset:4
556; GFX9-NEXT:    s_mov_b32 s0, 0
557; GFX9-NEXT:    s_mov_b32 s1, s0
558; GFX9-NEXT:    s_mov_b32 s2, s0
559; GFX9-NEXT:    s_mov_b32 s3, s0
560; GFX9-NEXT:    s_waitcnt vmcnt(0)
561; GFX9-NEXT:    v_mov_b32_e32 v0, s0
562; GFX9-NEXT:    v_mov_b32_e32 v1, s1
563; GFX9-NEXT:    v_mov_b32_e32 v2, s2
564; GFX9-NEXT:    v_mov_b32_e32 v3, s3
565; GFX9-NEXT:    s_mov_b32 vcc_hi, 0
566; GFX9-NEXT:    scratch_store_dwordx4 off, v[0:3], vcc_hi offset:272
567; GFX9-NEXT:    s_mov_b32 vcc_hi, 0
568; GFX9-NEXT:    scratch_store_dwordx4 off, v[0:3], vcc_hi offset:288
569; GFX9-NEXT:    s_mov_b32 vcc_hi, 0
570; GFX9-NEXT:    scratch_store_dwordx4 off, v[0:3], vcc_hi offset:304
571; GFX9-NEXT:    s_mov_b32 vcc_hi, 0
572; GFX9-NEXT:    scratch_store_dwordx4 off, v[0:3], vcc_hi offset:320
573; GFX9-NEXT:    s_endpgm
574;
575; GFX10-LABEL: zero_init_small_offset_kernel:
576; GFX10:       ; %bb.0:
577; GFX10-NEXT:    s_add_u32 s0, s0, s3
578; GFX10-NEXT:    s_addc_u32 s1, s1, 0
579; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s0
580; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1
581; GFX10-NEXT:    scratch_load_dword v0, off, off offset:4
582; GFX10-NEXT:    s_mov_b32 s0, 0
583; GFX10-NEXT:    s_mov_b32 s1, s0
584; GFX10-NEXT:    s_mov_b32 s2, s0
585; GFX10-NEXT:    s_mov_b32 s3, s0
586; GFX10-NEXT:    s_waitcnt vmcnt(0)
587; GFX10-NEXT:    v_mov_b32_e32 v0, s0
588; GFX10-NEXT:    v_mov_b32_e32 v1, s1
589; GFX10-NEXT:    v_mov_b32_e32 v2, s2
590; GFX10-NEXT:    v_mov_b32_e32 v3, s3
591; GFX10-NEXT:    scratch_store_dwordx4 off, v[0:3], off offset:272
592; GFX10-NEXT:    scratch_store_dwordx4 off, v[0:3], off offset:288
593; GFX10-NEXT:    scratch_store_dwordx4 off, v[0:3], off offset:304
594; GFX10-NEXT:    scratch_store_dwordx4 off, v[0:3], off offset:320
595; GFX10-NEXT:    s_endpgm
596;
597; GFX9-PAL-LABEL: zero_init_small_offset_kernel:
598; GFX9-PAL:       ; %bb.0:
599; GFX9-PAL-NEXT:    s_getpc_b64 s[2:3]
600; GFX9-PAL-NEXT:    s_mov_b32 s2, s0
601; GFX9-PAL-NEXT:    s_load_dwordx2 s[2:3], s[2:3], 0x0
602; GFX9-PAL-NEXT:    s_mov_b32 vcc_hi, 0
603; GFX9-PAL-NEXT:    s_mov_b32 s0, 0
604; GFX9-PAL-NEXT:    s_waitcnt lgkmcnt(0)
605; GFX9-PAL-NEXT:    s_and_b32 s3, s3, 0xffff
606; GFX9-PAL-NEXT:    s_add_u32 flat_scratch_lo, s2, s1
607; GFX9-PAL-NEXT:    s_addc_u32 flat_scratch_hi, s3, 0
608; GFX9-PAL-NEXT:    scratch_load_dword v0, off, vcc_hi offset:4
609; GFX9-PAL-NEXT:    s_mov_b32 s1, s0
610; GFX9-PAL-NEXT:    s_mov_b32 s2, s0
611; GFX9-PAL-NEXT:    s_mov_b32 s3, s0
612; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0)
613; GFX9-PAL-NEXT:    v_mov_b32_e32 v0, s0
614; GFX9-PAL-NEXT:    v_mov_b32_e32 v1, s1
615; GFX9-PAL-NEXT:    v_mov_b32_e32 v2, s2
616; GFX9-PAL-NEXT:    v_mov_b32_e32 v3, s3
617; GFX9-PAL-NEXT:    s_mov_b32 vcc_hi, 0
618; GFX9-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], vcc_hi offset:272
619; GFX9-PAL-NEXT:    s_mov_b32 vcc_hi, 0
620; GFX9-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], vcc_hi offset:288
621; GFX9-PAL-NEXT:    s_mov_b32 vcc_hi, 0
622; GFX9-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], vcc_hi offset:304
623; GFX9-PAL-NEXT:    s_mov_b32 vcc_hi, 0
624; GFX9-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], vcc_hi offset:320
625; GFX9-PAL-NEXT:    s_endpgm
626;
627; GFX10-PAL-LABEL: zero_init_small_offset_kernel:
628; GFX10-PAL:       ; %bb.0:
629; GFX10-PAL-NEXT:    s_getpc_b64 s[2:3]
630; GFX10-PAL-NEXT:    s_mov_b32 s2, s0
631; GFX10-PAL-NEXT:    s_load_dwordx2 s[2:3], s[2:3], 0x0
632; GFX10-PAL-NEXT:    s_waitcnt lgkmcnt(0)
633; GFX10-PAL-NEXT:    s_and_b32 s3, s3, 0xffff
634; GFX10-PAL-NEXT:    s_add_u32 s2, s2, s1
635; GFX10-PAL-NEXT:    s_addc_u32 s3, s3, 0
636; GFX10-PAL-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s2
637; GFX10-PAL-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s3
638; GFX10-PAL-NEXT:    scratch_load_dword v0, off, off offset:4
639; GFX10-PAL-NEXT:    s_mov_b32 s0, 0
640; GFX10-PAL-NEXT:    s_mov_b32 s1, s0
641; GFX10-PAL-NEXT:    s_mov_b32 s2, s0
642; GFX10-PAL-NEXT:    s_mov_b32 s3, s0
643; GFX10-PAL-NEXT:    s_waitcnt vmcnt(0)
644; GFX10-PAL-NEXT:    v_mov_b32_e32 v0, s0
645; GFX10-PAL-NEXT:    v_mov_b32_e32 v1, s1
646; GFX10-PAL-NEXT:    v_mov_b32_e32 v2, s2
647; GFX10-PAL-NEXT:    v_mov_b32_e32 v3, s3
648; GFX10-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], off offset:272
649; GFX10-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], off offset:288
650; GFX10-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], off offset:304
651; GFX10-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], off offset:320
652; GFX10-PAL-NEXT:    s_endpgm
; %padding is 64 x i32 = 256 bytes; its only use is the volatile load below,
; whose index is undef (the expected output shows it as a load at offset 4).
653  %padding = alloca [64 x i32], align 4, addrspace(5)
654  %alloca = alloca [32 x i16], align 2, addrspace(5)
655  %pad_gep = getelementptr inbounds [64 x i32], [64 x i32] addrspace(5)* %padding, i32 0, i32 undef
656  %pad_load = load volatile i32, i32 addrspace(5)* %pad_gep, align 4
; Zero-fill the whole 64-byte %alloca (non-volatile memset).
657  %cast = bitcast [32 x i16] addrspace(5)* %alloca to i8 addrspace(5)*
658  call void @llvm.memset.p5i8.i64(i8 addrspace(5)* align 2 dereferenceable(64) %cast, i8 0, i64 64, i1 false)
659  ret void
660}
661
; Callable-function (non-kernel) variant of the small-offset zero-init test.
; A 256-byte [64 x i32] padding object is allocated ahead of the 64-byte
; [32 x i16] object that is zeroed with a 64-byte memset. For all four run
; configurations the assertions below expect one s32-relative scratch load for
; the padding access and four scratch_store_dwordx4 at s32-relative offsets
; 256, 272, 288 and 304 for the memset.
662define void @zero_init_small_offset_foo() {
663; GFX9-LABEL: zero_init_small_offset_foo:
664; GFX9:       ; %bb.0:
665; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
666; GFX9-NEXT:    scratch_load_dword v0, off, s32
667; GFX9-NEXT:    s_mov_b32 s0, 0
668; GFX9-NEXT:    s_mov_b32 s1, s0
669; GFX9-NEXT:    s_mov_b32 s2, s0
670; GFX9-NEXT:    s_mov_b32 s3, s0
671; GFX9-NEXT:    s_waitcnt vmcnt(0)
672; GFX9-NEXT:    v_mov_b32_e32 v0, s0
673; GFX9-NEXT:    v_mov_b32_e32 v1, s1
674; GFX9-NEXT:    v_mov_b32_e32 v2, s2
675; GFX9-NEXT:    v_mov_b32_e32 v3, s3
676; GFX9-NEXT:    scratch_store_dwordx4 off, v[0:3], s32 offset:256
677; GFX9-NEXT:    scratch_store_dwordx4 off, v[0:3], s32 offset:272
678; GFX9-NEXT:    scratch_store_dwordx4 off, v[0:3], s32 offset:288
679; GFX9-NEXT:    scratch_store_dwordx4 off, v[0:3], s32 offset:304
680; GFX9-NEXT:    s_waitcnt vmcnt(0)
681; GFX9-NEXT:    s_setpc_b64 s[30:31]
682;
683; GFX10-LABEL: zero_init_small_offset_foo:
684; GFX10:       ; %bb.0:
685; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
686; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
687; GFX10-NEXT:    scratch_load_dword v0, off, s32
688; GFX10-NEXT:    s_mov_b32 s0, 0
689; GFX10-NEXT:    s_mov_b32 s1, s0
690; GFX10-NEXT:    s_mov_b32 s2, s0
691; GFX10-NEXT:    s_mov_b32 s3, s0
692; GFX10-NEXT:    s_waitcnt vmcnt(0)
693; GFX10-NEXT:    v_mov_b32_e32 v0, s0
694; GFX10-NEXT:    v_mov_b32_e32 v1, s1
695; GFX10-NEXT:    v_mov_b32_e32 v2, s2
696; GFX10-NEXT:    v_mov_b32_e32 v3, s3
697; GFX10-NEXT:    scratch_store_dwordx4 off, v[0:3], s32 offset:256
698; GFX10-NEXT:    scratch_store_dwordx4 off, v[0:3], s32 offset:272
699; GFX10-NEXT:    scratch_store_dwordx4 off, v[0:3], s32 offset:288
700; GFX10-NEXT:    scratch_store_dwordx4 off, v[0:3], s32 offset:304
701; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
702; GFX10-NEXT:    s_setpc_b64 s[30:31]
703;
704; GFX9-PAL-LABEL: zero_init_small_offset_foo:
705; GFX9-PAL:       ; %bb.0:
706; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
707; GFX9-PAL-NEXT:    scratch_load_dword v0, off, s32
708; GFX9-PAL-NEXT:    s_mov_b32 s0, 0
709; GFX9-PAL-NEXT:    s_mov_b32 s1, s0
710; GFX9-PAL-NEXT:    s_mov_b32 s2, s0
711; GFX9-PAL-NEXT:    s_mov_b32 s3, s0
712; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0)
713; GFX9-PAL-NEXT:    v_mov_b32_e32 v0, s0
714; GFX9-PAL-NEXT:    v_mov_b32_e32 v1, s1
715; GFX9-PAL-NEXT:    v_mov_b32_e32 v2, s2
716; GFX9-PAL-NEXT:    v_mov_b32_e32 v3, s3
717; GFX9-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], s32 offset:256
718; GFX9-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], s32 offset:272
719; GFX9-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], s32 offset:288
720; GFX9-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], s32 offset:304
721; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0)
722; GFX9-PAL-NEXT:    s_setpc_b64 s[30:31]
723;
724; GFX10-PAL-LABEL: zero_init_small_offset_foo:
725; GFX10-PAL:       ; %bb.0:
726; GFX10-PAL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
727; GFX10-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
728; GFX10-PAL-NEXT:    scratch_load_dword v0, off, s32
729; GFX10-PAL-NEXT:    s_mov_b32 s0, 0
730; GFX10-PAL-NEXT:    s_mov_b32 s1, s0
731; GFX10-PAL-NEXT:    s_mov_b32 s2, s0
732; GFX10-PAL-NEXT:    s_mov_b32 s3, s0
733; GFX10-PAL-NEXT:    s_waitcnt vmcnt(0)
734; GFX10-PAL-NEXT:    v_mov_b32_e32 v0, s0
735; GFX10-PAL-NEXT:    v_mov_b32_e32 v1, s1
736; GFX10-PAL-NEXT:    v_mov_b32_e32 v2, s2
737; GFX10-PAL-NEXT:    v_mov_b32_e32 v3, s3
738; GFX10-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], s32 offset:256
739; GFX10-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], s32 offset:272
740; GFX10-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], s32 offset:288
741; GFX10-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], s32 offset:304
742; GFX10-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
743; GFX10-PAL-NEXT:    s_setpc_b64 s[30:31]
  ; 64 x i32 = 256 bytes of padding, followed by the 64-byte object under test.
744  %padding = alloca [64 x i32], align 4, addrspace(5)
745  %alloca = alloca [32 x i16], align 2, addrspace(5)
  ; Volatile access to the padding at an undef index; presumably present so the
  ; padding object stays in the frame - NOTE(review): confirm intent.
746  %pad_gep = getelementptr inbounds [64 x i32], [64 x i32] addrspace(5)* %padding, i32 0, i32 undef
747  %pad_load = load volatile i32, i32 addrspace(5)* %pad_gep, align 4
  ; Zero all 64 bytes of %alloca (non-volatile memset).
748  %cast = bitcast [32 x i16] addrspace(5)* %alloca to i8 addrspace(5)*
749  call void @llvm.memset.p5i8.i64(i8 addrspace(5)* align 2 dereferenceable(64) %cast, i8 0, i64 64, i1 false)
750  ret void
751}
752
; Kernel variant of the scalar-index store/load test with a small frame offset.
; %idx is a kernel argument (the assertions expect it to be fetched with
; s_load_dword). The function stores 15 to %i[%idx] and then loads
; %i[%idx & 15], both volatile, with a 256-byte padding object allocated ahead
; of %i. The assertions expect the padding load at offset 4 and both element
; addresses to be formed in SGPRs by adding the shifted index to 0x104.
753define amdgpu_kernel void @store_load_sindex_small_offset_kernel(i32 %idx) {
754; GFX9-LABEL: store_load_sindex_small_offset_kernel:
755; GFX9:       ; %bb.0: ; %bb
756; GFX9-NEXT:    s_load_dword s0, s[0:1], 0x24
757; GFX9-NEXT:    s_add_u32 flat_scratch_lo, s2, s5
758; GFX9-NEXT:    s_addc_u32 flat_scratch_hi, s3, 0
759; GFX9-NEXT:    s_mov_b32 vcc_hi, 0
760; GFX9-NEXT:    scratch_load_dword v0, off, vcc_hi offset:4
761; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
762; GFX9-NEXT:    s_lshl_b32 s1, s0, 2
763; GFX9-NEXT:    s_and_b32 s0, s0, 15
764; GFX9-NEXT:    s_lshl_b32 s0, s0, 2
765; GFX9-NEXT:    s_waitcnt vmcnt(0)
766; GFX9-NEXT:    v_mov_b32_e32 v0, 15
767; GFX9-NEXT:    s_add_u32 s1, 0x104, s1
768; GFX9-NEXT:    scratch_store_dword off, v0, s1
769; GFX9-NEXT:    s_add_u32 s0, 0x104, s0
770; GFX9-NEXT:    scratch_load_dword v0, off, s0
771; GFX9-NEXT:    s_endpgm
772;
773; GFX10-LABEL: store_load_sindex_small_offset_kernel:
774; GFX10:       ; %bb.0: ; %bb
775; GFX10-NEXT:    s_add_u32 s2, s2, s5
776; GFX10-NEXT:    s_addc_u32 s3, s3, 0
777; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s2
778; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s3
779; GFX10-NEXT:    s_load_dword s0, s[0:1], 0x24
780; GFX10-NEXT:    scratch_load_dword v0, off, off offset:4
781; GFX10-NEXT:    s_waitcnt vmcnt(0)
782; GFX10-NEXT:    v_mov_b32_e32 v0, 15
783; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
784; GFX10-NEXT:    s_and_b32 s1, s0, 15
785; GFX10-NEXT:    s_lshl_b32 s0, s0, 2
786; GFX10-NEXT:    s_lshl_b32 s1, s1, 2
787; GFX10-NEXT:    s_add_u32 s0, 0x104, s0
788; GFX10-NEXT:    s_add_u32 s1, 0x104, s1
789; GFX10-NEXT:    scratch_store_dword off, v0, s0
790; GFX10-NEXT:    scratch_load_dword v0, off, s1
791; GFX10-NEXT:    s_endpgm
792;
793; GFX9-PAL-LABEL: store_load_sindex_small_offset_kernel:
794; GFX9-PAL:       ; %bb.0: ; %bb
795; GFX9-PAL-NEXT:    s_getpc_b64 s[4:5]
796; GFX9-PAL-NEXT:    s_mov_b32 s4, s0
797; GFX9-PAL-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x0
798; GFX9-PAL-NEXT:    s_load_dword s0, s[0:1], 0x24
799; GFX9-PAL-NEXT:    s_mov_b32 vcc_hi, 0
800; GFX9-PAL-NEXT:    s_waitcnt lgkmcnt(0)
801; GFX9-PAL-NEXT:    s_and_b32 s5, s5, 0xffff
802; GFX9-PAL-NEXT:    s_add_u32 flat_scratch_lo, s4, s3
803; GFX9-PAL-NEXT:    s_addc_u32 flat_scratch_hi, s5, 0
804; GFX9-PAL-NEXT:    s_lshl_b32 s1, s0, 2
805; GFX9-PAL-NEXT:    s_and_b32 s0, s0, 15
806; GFX9-PAL-NEXT:    scratch_load_dword v0, off, vcc_hi offset:4
807; GFX9-PAL-NEXT:    s_lshl_b32 s0, s0, 2
808; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0)
809; GFX9-PAL-NEXT:    v_mov_b32_e32 v0, 15
810; GFX9-PAL-NEXT:    s_add_u32 s1, 0x104, s1
811; GFX9-PAL-NEXT:    scratch_store_dword off, v0, s1
812; GFX9-PAL-NEXT:    s_add_u32 s0, 0x104, s0
813; GFX9-PAL-NEXT:    scratch_load_dword v0, off, s0
814; GFX9-PAL-NEXT:    s_endpgm
815;
816; GFX10-PAL-LABEL: store_load_sindex_small_offset_kernel:
817; GFX10-PAL:       ; %bb.0: ; %bb
818; GFX10-PAL-NEXT:    s_getpc_b64 s[4:5]
819; GFX10-PAL-NEXT:    s_mov_b32 s4, s0
820; GFX10-PAL-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x0
821; GFX10-PAL-NEXT:    s_waitcnt lgkmcnt(0)
822; GFX10-PAL-NEXT:    s_and_b32 s5, s5, 0xffff
823; GFX10-PAL-NEXT:    s_add_u32 s4, s4, s3
824; GFX10-PAL-NEXT:    s_addc_u32 s5, s5, 0
825; GFX10-PAL-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s4
826; GFX10-PAL-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s5
827; GFX10-PAL-NEXT:    s_load_dword s0, s[0:1], 0x24
828; GFX10-PAL-NEXT:    scratch_load_dword v0, off, off offset:4
829; GFX10-PAL-NEXT:    s_waitcnt vmcnt(0)
830; GFX10-PAL-NEXT:    v_mov_b32_e32 v0, 15
831; GFX10-PAL-NEXT:    s_waitcnt lgkmcnt(0)
832; GFX10-PAL-NEXT:    s_and_b32 s1, s0, 15
833; GFX10-PAL-NEXT:    s_lshl_b32 s0, s0, 2
834; GFX10-PAL-NEXT:    s_lshl_b32 s1, s1, 2
835; GFX10-PAL-NEXT:    s_add_u32 s0, 0x104, s0
836; GFX10-PAL-NEXT:    s_add_u32 s1, 0x104, s1
837; GFX10-PAL-NEXT:    scratch_store_dword off, v0, s0
838; GFX10-PAL-NEXT:    scratch_load_dword v0, off, s1
839; GFX10-PAL-NEXT:    s_endpgm
840bb:
  ; 256 bytes of padding, then the 32-element float array under test.
841  %padding = alloca [64 x i32], align 4, addrspace(5)
842  %i = alloca [32 x float], align 4, addrspace(5)
  ; Volatile access to the padding at an undef index; presumably present so the
  ; padding object stays in the frame - NOTE(review): confirm intent.
843  %pad_gep = getelementptr inbounds [64 x i32], [64 x i32] addrspace(5)* %padding, i32 0, i32 undef
844  %pad_load = load volatile i32, i32 addrspace(5)* %pad_gep, align 4
  ; %i1 has no users in this function.
845  %i1 = bitcast [32 x float] addrspace(5)* %i to i8 addrspace(5)*
  ; Volatile store of 15 to element %idx.
846  %i7 = getelementptr inbounds [32 x float], [32 x float] addrspace(5)* %i, i32 0, i32 %idx
847  %i8 = bitcast float addrspace(5)* %i7 to i32 addrspace(5)*
848  store volatile i32 15, i32 addrspace(5)* %i8, align 4
  ; Volatile load from element (%idx & 15).
849  %i9 = and i32 %idx, 15
850  %i10 = getelementptr inbounds [32 x float], [32 x float] addrspace(5)* %i, i32 0, i32 %i9
851  %i11 = bitcast float addrspace(5)* %i10 to i32 addrspace(5)*
852  %i12 = load volatile i32, i32 addrspace(5)* %i11, align 4
853  ret void
854}
855
; amdgpu_ps variant of the scalar-index store/load test with a small frame
; offset. %idx is passed `inreg`; the assertions expect it to be consumed
; directly from an SGPR (s2 in the non-PAL runs, s0 in the PAL runs). The body
; stores 15 to %i[%idx] and loads %i[%idx & 15], both volatile, with 256 bytes
; of padding allocated ahead of %i. As in the kernel variant, both element
; addresses are expected to be built by adding the shifted index to 0x104.
856define amdgpu_ps void @store_load_sindex_small_offset_foo(i32 inreg %idx) {
857; GFX9-LABEL: store_load_sindex_small_offset_foo:
858; GFX9:       ; %bb.0: ; %bb
859; GFX9-NEXT:    s_add_u32 flat_scratch_lo, s0, s3
860; GFX9-NEXT:    s_addc_u32 flat_scratch_hi, s1, 0
861; GFX9-NEXT:    s_mov_b32 vcc_hi, 0
862; GFX9-NEXT:    s_lshl_b32 s0, s2, 2
863; GFX9-NEXT:    scratch_load_dword v0, off, vcc_hi offset:4
864; GFX9-NEXT:    s_add_u32 s0, 0x104, s0
865; GFX9-NEXT:    s_waitcnt vmcnt(0)
866; GFX9-NEXT:    v_mov_b32_e32 v0, 15
867; GFX9-NEXT:    scratch_store_dword off, v0, s0
868; GFX9-NEXT:    s_and_b32 s0, s2, 15
869; GFX9-NEXT:    s_lshl_b32 s0, s0, 2
870; GFX9-NEXT:    s_add_u32 s0, 0x104, s0
871; GFX9-NEXT:    scratch_load_dword v0, off, s0
872; GFX9-NEXT:    s_endpgm
873;
874; GFX10-LABEL: store_load_sindex_small_offset_foo:
875; GFX10:       ; %bb.0: ; %bb
876; GFX10-NEXT:    s_add_u32 s0, s0, s3
877; GFX10-NEXT:    s_addc_u32 s1, s1, 0
878; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s0
879; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1
880; GFX10-NEXT:    scratch_load_dword v0, off, off offset:4
881; GFX10-NEXT:    s_and_b32 s0, s2, 15
882; GFX10-NEXT:    s_waitcnt vmcnt(0)
883; GFX10-NEXT:    v_mov_b32_e32 v0, 15
884; GFX10-NEXT:    s_lshl_b32 s1, s2, 2
885; GFX10-NEXT:    s_lshl_b32 s0, s0, 2
886; GFX10-NEXT:    s_add_u32 s1, 0x104, s1
887; GFX10-NEXT:    s_add_u32 s0, 0x104, s0
888; GFX10-NEXT:    scratch_store_dword off, v0, s1
889; GFX10-NEXT:    scratch_load_dword v0, off, s0
890; GFX10-NEXT:    s_endpgm
891;
892; GFX9-PAL-LABEL: store_load_sindex_small_offset_foo:
893; GFX9-PAL:       ; %bb.0: ; %bb
894; GFX9-PAL-NEXT:    s_getpc_b64 s[2:3]
895; GFX9-PAL-NEXT:    s_mov_b32 s2, s0
896; GFX9-PAL-NEXT:    s_load_dwordx2 s[2:3], s[2:3], 0x0
897; GFX9-PAL-NEXT:    s_mov_b32 vcc_hi, 0
898; GFX9-PAL-NEXT:    s_waitcnt lgkmcnt(0)
899; GFX9-PAL-NEXT:    s_and_b32 s3, s3, 0xffff
900; GFX9-PAL-NEXT:    s_add_u32 flat_scratch_lo, s2, s1
901; GFX9-PAL-NEXT:    s_addc_u32 flat_scratch_hi, s3, 0
902; GFX9-PAL-NEXT:    s_lshl_b32 s1, s0, 2
903; GFX9-PAL-NEXT:    s_and_b32 s0, s0, 15
904; GFX9-PAL-NEXT:    scratch_load_dword v0, off, vcc_hi offset:4
905; GFX9-PAL-NEXT:    s_lshl_b32 s0, s0, 2
906; GFX9-PAL-NEXT:    s_add_u32 s1, 0x104, s1
907; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0)
908; GFX9-PAL-NEXT:    v_mov_b32_e32 v0, 15
909; GFX9-PAL-NEXT:    scratch_store_dword off, v0, s1
910; GFX9-PAL-NEXT:    s_add_u32 s0, 0x104, s0
911; GFX9-PAL-NEXT:    scratch_load_dword v0, off, s0
912; GFX9-PAL-NEXT:    s_endpgm
913;
914; GFX10-PAL-LABEL: store_load_sindex_small_offset_foo:
915; GFX10-PAL:       ; %bb.0: ; %bb
916; GFX10-PAL-NEXT:    s_getpc_b64 s[2:3]
917; GFX10-PAL-NEXT:    s_mov_b32 s2, s0
918; GFX10-PAL-NEXT:    s_load_dwordx2 s[2:3], s[2:3], 0x0
919; GFX10-PAL-NEXT:    s_waitcnt lgkmcnt(0)
920; GFX10-PAL-NEXT:    s_and_b32 s3, s3, 0xffff
921; GFX10-PAL-NEXT:    s_add_u32 s2, s2, s1
922; GFX10-PAL-NEXT:    s_addc_u32 s3, s3, 0
923; GFX10-PAL-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s2
924; GFX10-PAL-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s3
925; GFX10-PAL-NEXT:    scratch_load_dword v0, off, off offset:4
926; GFX10-PAL-NEXT:    s_and_b32 s1, s0, 15
927; GFX10-PAL-NEXT:    s_waitcnt vmcnt(0)
928; GFX10-PAL-NEXT:    v_mov_b32_e32 v0, 15
929; GFX10-PAL-NEXT:    s_lshl_b32 s0, s0, 2
930; GFX10-PAL-NEXT:    s_lshl_b32 s1, s1, 2
931; GFX10-PAL-NEXT:    s_add_u32 s0, 0x104, s0
932; GFX10-PAL-NEXT:    s_add_u32 s1, 0x104, s1
933; GFX10-PAL-NEXT:    scratch_store_dword off, v0, s0
934; GFX10-PAL-NEXT:    scratch_load_dword v0, off, s1
935; GFX10-PAL-NEXT:    s_endpgm
936bb:
  ; 256 bytes of padding, then the 32-element float array under test.
937  %padding = alloca [64 x i32], align 4, addrspace(5)
938  %i = alloca [32 x float], align 4, addrspace(5)
  ; Volatile access to the padding at an undef index; presumably present so the
  ; padding object stays in the frame - NOTE(review): confirm intent.
939  %pad_gep = getelementptr inbounds [64 x i32], [64 x i32] addrspace(5)* %padding, i32 0, i32 undef
940  %pad_load = load volatile i32, i32 addrspace(5)* %pad_gep, align 4
  ; %i1 has no users in this function.
941  %i1 = bitcast [32 x float] addrspace(5)* %i to i8 addrspace(5)*
  ; Volatile store of 15 to element %idx.
942  %i7 = getelementptr inbounds [32 x float], [32 x float] addrspace(5)* %i, i32 0, i32 %idx
943  %i8 = bitcast float addrspace(5)* %i7 to i32 addrspace(5)*
944  store volatile i32 15, i32 addrspace(5)* %i8, align 4
  ; Volatile load from element (%idx & 15).
945  %i9 = and i32 %idx, 15
946  %i10 = getelementptr inbounds [32 x float], [32 x float] addrspace(5)* %i, i32 0, i32 %i9
947  %i11 = bitcast float addrspace(5)* %i10 to i32 addrspace(5)*
948  %i12 = load volatile i32, i32 addrspace(5)* %i11, align 4
949  ret void
950}
951
; Kernel variant of the vector-index store/load test with a small frame offset.
; The index is the workitem id in x, so the element addresses are expected to
; be computed in VGPRs. The body stores 15 to %i[id] and loads %i[31 - id],
; both volatile, with 256 bytes of padding allocated ahead of %i. The
; assertions expect the store address as 0x104 + (id << 2) and the load address
; as 0x104 - (id << 2) with an immediate offset of 124 (31 * 4).
952define amdgpu_kernel void @store_load_vindex_small_offset_kernel() {
953; GFX9-LABEL: store_load_vindex_small_offset_kernel:
954; GFX9:       ; %bb.0: ; %bb
955; GFX9-NEXT:    s_add_u32 flat_scratch_lo, s0, s3
956; GFX9-NEXT:    s_addc_u32 flat_scratch_hi, s1, 0
957; GFX9-NEXT:    s_mov_b32 vcc_hi, 0
958; GFX9-NEXT:    scratch_load_dword v1, off, vcc_hi offset:4
959; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
960; GFX9-NEXT:    s_waitcnt vmcnt(0)
961; GFX9-NEXT:    v_mov_b32_e32 v1, 0x104
962; GFX9-NEXT:    v_add_u32_e32 v2, v1, v0
963; GFX9-NEXT:    v_mov_b32_e32 v3, 15
964; GFX9-NEXT:    scratch_store_dword v2, v3, off
965; GFX9-NEXT:    v_sub_u32_e32 v0, v1, v0
966; GFX9-NEXT:    scratch_load_dword v0, v0, off offset:124
967; GFX9-NEXT:    s_endpgm
968;
969; GFX10-LABEL: store_load_vindex_small_offset_kernel:
970; GFX10:       ; %bb.0: ; %bb
971; GFX10-NEXT:    s_add_u32 s0, s0, s3
972; GFX10-NEXT:    s_addc_u32 s1, s1, 0
973; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s0
974; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1
975; GFX10-NEXT:    v_mov_b32_e32 v1, 0x104
976; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
977; GFX10-NEXT:    v_mov_b32_e32 v3, 15
978; GFX10-NEXT:    v_add_nc_u32_e32 v2, v1, v0
979; GFX10-NEXT:    v_sub_nc_u32_e32 v0, v1, v0
980; GFX10-NEXT:    scratch_load_dword v1, off, off offset:4
981; GFX10-NEXT:    scratch_store_dword v2, v3, off
982; GFX10-NEXT:    scratch_load_dword v0, v0, off offset:124
983; GFX10-NEXT:    s_endpgm
984;
985; GFX9-PAL-LABEL: store_load_vindex_small_offset_kernel:
986; GFX9-PAL:       ; %bb.0: ; %bb
987; GFX9-PAL-NEXT:    s_getpc_b64 s[2:3]
988; GFX9-PAL-NEXT:    s_mov_b32 s2, s0
989; GFX9-PAL-NEXT:    s_load_dwordx2 s[2:3], s[2:3], 0x0
990; GFX9-PAL-NEXT:    s_mov_b32 vcc_hi, 0
991; GFX9-PAL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
992; GFX9-PAL-NEXT:    v_mov_b32_e32 v3, 15
993; GFX9-PAL-NEXT:    s_waitcnt lgkmcnt(0)
994; GFX9-PAL-NEXT:    s_and_b32 s3, s3, 0xffff
995; GFX9-PAL-NEXT:    s_add_u32 flat_scratch_lo, s2, s1
996; GFX9-PAL-NEXT:    s_addc_u32 flat_scratch_hi, s3, 0
997; GFX9-PAL-NEXT:    scratch_load_dword v1, off, vcc_hi offset:4
998; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0)
999; GFX9-PAL-NEXT:    v_mov_b32_e32 v1, 0x104
1000; GFX9-PAL-NEXT:    v_add_u32_e32 v2, v1, v0
1001; GFX9-PAL-NEXT:    scratch_store_dword v2, v3, off
1002; GFX9-PAL-NEXT:    v_sub_u32_e32 v0, v1, v0
1003; GFX9-PAL-NEXT:    scratch_load_dword v0, v0, off offset:124
1004; GFX9-PAL-NEXT:    s_endpgm
1005;
1006; GFX10-PAL-LABEL: store_load_vindex_small_offset_kernel:
1007; GFX10-PAL:       ; %bb.0: ; %bb
1008; GFX10-PAL-NEXT:    s_getpc_b64 s[2:3]
1009; GFX10-PAL-NEXT:    s_mov_b32 s2, s0
1010; GFX10-PAL-NEXT:    s_load_dwordx2 s[2:3], s[2:3], 0x0
1011; GFX10-PAL-NEXT:    s_waitcnt lgkmcnt(0)
1012; GFX10-PAL-NEXT:    s_and_b32 s3, s3, 0xffff
1013; GFX10-PAL-NEXT:    s_add_u32 s2, s2, s1
1014; GFX10-PAL-NEXT:    s_addc_u32 s3, s3, 0
1015; GFX10-PAL-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s2
1016; GFX10-PAL-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s3
1017; GFX10-PAL-NEXT:    v_mov_b32_e32 v1, 0x104
1018; GFX10-PAL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
1019; GFX10-PAL-NEXT:    v_mov_b32_e32 v3, 15
1020; GFX10-PAL-NEXT:    v_add_nc_u32_e32 v2, v1, v0
1021; GFX10-PAL-NEXT:    v_sub_nc_u32_e32 v0, v1, v0
1022; GFX10-PAL-NEXT:    scratch_load_dword v1, off, off offset:4
1023; GFX10-PAL-NEXT:    scratch_store_dword v2, v3, off
1024; GFX10-PAL-NEXT:    scratch_load_dword v0, v0, off offset:124
1025; GFX10-PAL-NEXT:    s_endpgm
1026bb:
  ; 256 bytes of padding, then the 32-element float array under test.
1027  %padding = alloca [64 x i32], align 4, addrspace(5)
1028  %i = alloca [32 x float], align 4, addrspace(5)
  ; Volatile access to the padding at an undef index; presumably present so the
  ; padding object stays in the frame - NOTE(review): confirm intent.
1029  %pad_gep = getelementptr inbounds [64 x i32], [64 x i32] addrspace(5)* %padding, i32 0, i32 undef
1030  %pad_load = load volatile i32, i32 addrspace(5)* %pad_gep, align 4
  ; %i1 and %i3 have no users in this function.
1031  %i1 = bitcast [32 x float] addrspace(5)* %i to i8 addrspace(5)*
1032  %i2 = tail call i32 @llvm.amdgcn.workitem.id.x()
1033  %i3 = zext i32 %i2 to i64
  ; Volatile store of 15 to element [workitem id].
1034  %i7 = getelementptr inbounds [32 x float], [32 x float] addrspace(5)* %i, i32 0, i32 %i2
1035  %i8 = bitcast float addrspace(5)* %i7 to i32 addrspace(5)*
1036  store volatile i32 15, i32 addrspace(5)* %i8, align 4
  ; Volatile load from the mirrored element [31 - workitem id].
1037  %i9 = sub nsw i32 31, %i2
1038  %i10 = getelementptr inbounds [32 x float], [32 x float] addrspace(5)* %i, i32 0, i32 %i9
1039  %i11 = bitcast float addrspace(5)* %i10 to i32 addrspace(5)*
1040  %i12 = load volatile i32, i32 addrspace(5)* %i11, align 4
1041  ret void
1042}
1043
; Callable-function variant of the vector-index store/load test with a small
; frame offset. %idx is an ordinary (non-inreg) argument; the assertions expect
; it in v0. The body stores 15 to %i[%idx] and loads %i[%idx & 15], both
; volatile, with 256 bytes of padding allocated ahead of %i. The assertions
; expect the base of %i to be formed as s32 + 0x100 in a scalar register
; (vcc_hi on the gfx900 runs, vcc_lo on the gfx1030 runs), copied to a VGPR,
; and combined with the index via v_lshl_add_u32.
1044define void @store_load_vindex_small_offset_foo(i32 %idx) {
1045; GFX9-LABEL: store_load_vindex_small_offset_foo:
1046; GFX9:       ; %bb.0: ; %bb
1047; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1048; GFX9-NEXT:    scratch_load_dword v1, off, s32
1049; GFX9-NEXT:    s_add_u32 vcc_hi, s32, 0x100
1050; GFX9-NEXT:    s_waitcnt vmcnt(0)
1051; GFX9-NEXT:    v_mov_b32_e32 v1, vcc_hi
1052; GFX9-NEXT:    v_mov_b32_e32 v3, 15
1053; GFX9-NEXT:    v_lshl_add_u32 v2, v0, 2, v1
1054; GFX9-NEXT:    v_and_b32_e32 v0, v0, v3
1055; GFX9-NEXT:    scratch_store_dword v2, v3, off
1056; GFX9-NEXT:    v_lshl_add_u32 v0, v0, 2, v1
1057; GFX9-NEXT:    scratch_load_dword v0, v0, off
1058; GFX9-NEXT:    s_waitcnt vmcnt(0)
1059; GFX9-NEXT:    s_setpc_b64 s[30:31]
1060;
1061; GFX10-LABEL: store_load_vindex_small_offset_foo:
1062; GFX10:       ; %bb.0: ; %bb
1063; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1064; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
1065; GFX10-NEXT:    v_mov_b32_e32 v1, 15
1066; GFX10-NEXT:    s_add_u32 vcc_lo, s32, 0x100
1067; GFX10-NEXT:    v_mov_b32_e32 v2, vcc_lo
1068; GFX10-NEXT:    v_and_b32_e32 v3, v0, v1
1069; GFX10-NEXT:    v_lshl_add_u32 v0, v0, 2, v2
1070; GFX10-NEXT:    v_lshl_add_u32 v2, v3, 2, v2
1071; GFX10-NEXT:    scratch_load_dword v3, off, s32
1072; GFX10-NEXT:    scratch_store_dword v0, v1, off
1073; GFX10-NEXT:    scratch_load_dword v0, v2, off
1074; GFX10-NEXT:    s_waitcnt vmcnt(0)
1075; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
1076; GFX10-NEXT:    s_setpc_b64 s[30:31]
1077;
1078; GFX9-PAL-LABEL: store_load_vindex_small_offset_foo:
1079; GFX9-PAL:       ; %bb.0: ; %bb
1080; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1081; GFX9-PAL-NEXT:    scratch_load_dword v1, off, s32
1082; GFX9-PAL-NEXT:    s_add_u32 vcc_hi, s32, 0x100
1083; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0)
1084; GFX9-PAL-NEXT:    v_mov_b32_e32 v1, vcc_hi
1085; GFX9-PAL-NEXT:    v_mov_b32_e32 v3, 15
1086; GFX9-PAL-NEXT:    v_lshl_add_u32 v2, v0, 2, v1
1087; GFX9-PAL-NEXT:    v_and_b32_e32 v0, v0, v3
1088; GFX9-PAL-NEXT:    scratch_store_dword v2, v3, off
1089; GFX9-PAL-NEXT:    v_lshl_add_u32 v0, v0, 2, v1
1090; GFX9-PAL-NEXT:    scratch_load_dword v0, v0, off
1091; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0)
1092; GFX9-PAL-NEXT:    s_setpc_b64 s[30:31]
1093;
1094; GFX10-PAL-LABEL: store_load_vindex_small_offset_foo:
1095; GFX10-PAL:       ; %bb.0: ; %bb
1096; GFX10-PAL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1097; GFX10-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
1098; GFX10-PAL-NEXT:    v_mov_b32_e32 v1, 15
1099; GFX10-PAL-NEXT:    s_add_u32 vcc_lo, s32, 0x100
1100; GFX10-PAL-NEXT:    v_mov_b32_e32 v2, vcc_lo
1101; GFX10-PAL-NEXT:    v_and_b32_e32 v3, v0, v1
1102; GFX10-PAL-NEXT:    v_lshl_add_u32 v0, v0, 2, v2
1103; GFX10-PAL-NEXT:    v_lshl_add_u32 v2, v3, 2, v2
1104; GFX10-PAL-NEXT:    scratch_load_dword v3, off, s32
1105; GFX10-PAL-NEXT:    scratch_store_dword v0, v1, off
1106; GFX10-PAL-NEXT:    scratch_load_dword v0, v2, off
1107; GFX10-PAL-NEXT:    s_waitcnt vmcnt(0)
1108; GFX10-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
1109; GFX10-PAL-NEXT:    s_setpc_b64 s[30:31]
1110bb:
  ; 256 bytes of padding, then the 32-element float array under test.
1111  %padding = alloca [64 x i32], align 4, addrspace(5)
1112  %i = alloca [32 x float], align 4, addrspace(5)
  ; Volatile access to the padding at an undef index; presumably present so the
  ; padding object stays in the frame - NOTE(review): confirm intent.
1113  %pad_gep = getelementptr inbounds [64 x i32], [64 x i32] addrspace(5)* %padding, i32 0, i32 undef
1114  %pad_load = load volatile i32, i32 addrspace(5)* %pad_gep, align 4
  ; %i1 has no users in this function.
1115  %i1 = bitcast [32 x float] addrspace(5)* %i to i8 addrspace(5)*
  ; Volatile store of 15 to element %idx.
1116  %i7 = getelementptr inbounds [32 x float], [32 x float] addrspace(5)* %i, i32 0, i32 %idx
1117  %i8 = bitcast float addrspace(5)* %i7 to i32 addrspace(5)*
1118  store volatile i32 15, i32 addrspace(5)* %i8, align 4
  ; Volatile load from element (%idx & 15).
1119  %i9 = and i32 %idx, 15
1120  %i10 = getelementptr inbounds [32 x float], [32 x float] addrspace(5)* %i, i32 0, i32 %i9
1121  %i11 = bitcast float addrspace(5)* %i10 to i32 addrspace(5)*
1122  %i12 = load volatile i32, i32 addrspace(5)* %i11, align 4
1123  ret void
1124}
1125
; Kernel variant of the large-offset zero-init test. A [4096 x i32] padding
; object (16384 bytes) is allocated ahead of the 64-byte [32 x i16] object that
; is zeroed with memset. The assertions expect the padding load at offset 4 and
; the four scratch_store_dwordx4 of the memset to use an SGPR base holding
; 0x4010 with immediate offsets 0, 16, 32 and 48. Note that the expected output
; re-materializes 0x4010 into the scratch SGPR before each store after the
; first.
1126define amdgpu_kernel void @zero_init_large_offset_kernel() {
1127; GFX9-LABEL: zero_init_large_offset_kernel:
1128; GFX9:       ; %bb.0:
1129; GFX9-NEXT:    s_add_u32 flat_scratch_lo, s0, s3
1130; GFX9-NEXT:    s_addc_u32 flat_scratch_hi, s1, 0
1131; GFX9-NEXT:    s_mov_b32 vcc_hi, 0
1132; GFX9-NEXT:    scratch_load_dword v0, off, vcc_hi offset:4
1133; GFX9-NEXT:    s_mov_b32 s0, 0
1134; GFX9-NEXT:    s_mov_b32 s1, s0
1135; GFX9-NEXT:    s_mov_b32 s2, s0
1136; GFX9-NEXT:    s_mov_b32 s3, s0
1137; GFX9-NEXT:    s_waitcnt vmcnt(0)
1138; GFX9-NEXT:    v_mov_b32_e32 v0, s0
1139; GFX9-NEXT:    v_mov_b32_e32 v1, s1
1140; GFX9-NEXT:    v_mov_b32_e32 v2, s2
1141; GFX9-NEXT:    v_mov_b32_e32 v3, s3
1142; GFX9-NEXT:    s_movk_i32 vcc_hi, 0x4010
1143; GFX9-NEXT:    scratch_store_dwordx4 off, v[0:3], vcc_hi
1144; GFX9-NEXT:    s_movk_i32 vcc_hi, 0x4010
1145; GFX9-NEXT:    scratch_store_dwordx4 off, v[0:3], vcc_hi offset:16
1146; GFX9-NEXT:    s_movk_i32 vcc_hi, 0x4010
1147; GFX9-NEXT:    scratch_store_dwordx4 off, v[0:3], vcc_hi offset:32
1148; GFX9-NEXT:    s_movk_i32 vcc_hi, 0x4010
1149; GFX9-NEXT:    scratch_store_dwordx4 off, v[0:3], vcc_hi offset:48
1150; GFX9-NEXT:    s_endpgm
1151;
1152; GFX10-LABEL: zero_init_large_offset_kernel:
1153; GFX10:       ; %bb.0:
1154; GFX10-NEXT:    s_add_u32 s0, s0, s3
1155; GFX10-NEXT:    s_addc_u32 s1, s1, 0
1156; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s0
1157; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1
1158; GFX10-NEXT:    scratch_load_dword v0, off, off offset:4
1159; GFX10-NEXT:    s_mov_b32 s0, 0
1160; GFX10-NEXT:    s_movk_i32 vcc_lo, 0x4010
1161; GFX10-NEXT:    s_mov_b32 s1, s0
1162; GFX10-NEXT:    s_mov_b32 s2, s0
1163; GFX10-NEXT:    s_mov_b32 s3, s0
1164; GFX10-NEXT:    s_waitcnt vmcnt(0)
1165; GFX10-NEXT:    v_mov_b32_e32 v0, s0
1166; GFX10-NEXT:    v_mov_b32_e32 v1, s1
1167; GFX10-NEXT:    v_mov_b32_e32 v2, s2
1168; GFX10-NEXT:    v_mov_b32_e32 v3, s3
1169; GFX10-NEXT:    scratch_store_dwordx4 off, v[0:3], vcc_lo
1170; GFX10-NEXT:    s_movk_i32 vcc_lo, 0x4010
1171; GFX10-NEXT:    scratch_store_dwordx4 off, v[0:3], vcc_lo offset:16
1172; GFX10-NEXT:    s_movk_i32 vcc_lo, 0x4010
1173; GFX10-NEXT:    scratch_store_dwordx4 off, v[0:3], vcc_lo offset:32
1174; GFX10-NEXT:    s_movk_i32 vcc_lo, 0x4010
1175; GFX10-NEXT:    scratch_store_dwordx4 off, v[0:3], vcc_lo offset:48
1176; GFX10-NEXT:    s_endpgm
1177;
1178; GFX9-PAL-LABEL: zero_init_large_offset_kernel:
1179; GFX9-PAL:       ; %bb.0:
1180; GFX9-PAL-NEXT:    s_getpc_b64 s[2:3]
1181; GFX9-PAL-NEXT:    s_mov_b32 s2, s0
1182; GFX9-PAL-NEXT:    s_load_dwordx2 s[2:3], s[2:3], 0x0
1183; GFX9-PAL-NEXT:    s_mov_b32 vcc_hi, 0
1184; GFX9-PAL-NEXT:    s_mov_b32 s0, 0
1185; GFX9-PAL-NEXT:    s_waitcnt lgkmcnt(0)
1186; GFX9-PAL-NEXT:    s_and_b32 s3, s3, 0xffff
1187; GFX9-PAL-NEXT:    s_add_u32 flat_scratch_lo, s2, s1
1188; GFX9-PAL-NEXT:    s_addc_u32 flat_scratch_hi, s3, 0
1189; GFX9-PAL-NEXT:    scratch_load_dword v0, off, vcc_hi offset:4
1190; GFX9-PAL-NEXT:    s_mov_b32 s1, s0
1191; GFX9-PAL-NEXT:    s_mov_b32 s2, s0
1192; GFX9-PAL-NEXT:    s_mov_b32 s3, s0
1193; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0)
1194; GFX9-PAL-NEXT:    v_mov_b32_e32 v0, s0
1195; GFX9-PAL-NEXT:    v_mov_b32_e32 v1, s1
1196; GFX9-PAL-NEXT:    v_mov_b32_e32 v2, s2
1197; GFX9-PAL-NEXT:    v_mov_b32_e32 v3, s3
1198; GFX9-PAL-NEXT:    s_movk_i32 vcc_hi, 0x4010
1199; GFX9-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], vcc_hi
1200; GFX9-PAL-NEXT:    s_movk_i32 vcc_hi, 0x4010
1201; GFX9-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], vcc_hi offset:16
1202; GFX9-PAL-NEXT:    s_movk_i32 vcc_hi, 0x4010
1203; GFX9-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], vcc_hi offset:32
1204; GFX9-PAL-NEXT:    s_movk_i32 vcc_hi, 0x4010
1205; GFX9-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], vcc_hi offset:48
1206; GFX9-PAL-NEXT:    s_endpgm
1207;
1208; GFX10-PAL-LABEL: zero_init_large_offset_kernel:
1209; GFX10-PAL:       ; %bb.0:
1210; GFX10-PAL-NEXT:    s_getpc_b64 s[2:3]
1211; GFX10-PAL-NEXT:    s_mov_b32 s2, s0
1212; GFX10-PAL-NEXT:    s_load_dwordx2 s[2:3], s[2:3], 0x0
1213; GFX10-PAL-NEXT:    s_waitcnt lgkmcnt(0)
1214; GFX10-PAL-NEXT:    s_and_b32 s3, s3, 0xffff
1215; GFX10-PAL-NEXT:    s_add_u32 s2, s2, s1
1216; GFX10-PAL-NEXT:    s_addc_u32 s3, s3, 0
1217; GFX10-PAL-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s2
1218; GFX10-PAL-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s3
1219; GFX10-PAL-NEXT:    scratch_load_dword v0, off, off offset:4
1220; GFX10-PAL-NEXT:    s_mov_b32 s0, 0
1221; GFX10-PAL-NEXT:    s_movk_i32 vcc_lo, 0x4010
1222; GFX10-PAL-NEXT:    s_mov_b32 s1, s0
1223; GFX10-PAL-NEXT:    s_mov_b32 s2, s0
1224; GFX10-PAL-NEXT:    s_mov_b32 s3, s0
1225; GFX10-PAL-NEXT:    s_waitcnt vmcnt(0)
1226; GFX10-PAL-NEXT:    v_mov_b32_e32 v0, s0
1227; GFX10-PAL-NEXT:    v_mov_b32_e32 v1, s1
1228; GFX10-PAL-NEXT:    v_mov_b32_e32 v2, s2
1229; GFX10-PAL-NEXT:    v_mov_b32_e32 v3, s3
1230; GFX10-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], vcc_lo
1231; GFX10-PAL-NEXT:    s_movk_i32 vcc_lo, 0x4010
1232; GFX10-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], vcc_lo offset:16
1233; GFX10-PAL-NEXT:    s_movk_i32 vcc_lo, 0x4010
1234; GFX10-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], vcc_lo offset:32
1235; GFX10-PAL-NEXT:    s_movk_i32 vcc_lo, 0x4010
1236; GFX10-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], vcc_lo offset:48
1237; GFX10-PAL-NEXT:    s_endpgm
  ; 4096 x i32 = 16384 bytes of padding, followed by the 64-byte object under
  ; test.
1238  %padding = alloca [4096 x i32], align 4, addrspace(5)
1239  %alloca = alloca [32 x i16], align 2, addrspace(5)
  ; Volatile access to the padding at an undef index; presumably present so the
  ; padding object stays in the frame - NOTE(review): confirm intent.
1240  %pad_gep = getelementptr inbounds [4096 x i32], [4096 x i32] addrspace(5)* %padding, i32 0, i32 undef
1241  %pad_load = load volatile i32, i32 addrspace(5)* %pad_gep, align 4
  ; Zero all 64 bytes of %alloca (non-volatile memset).
1242  %cast = bitcast [32 x i16] addrspace(5)* %alloca to i8 addrspace(5)*
1243  call void @llvm.memset.p5i8.i64(i8 addrspace(5)* align 2 dereferenceable(64) %cast, i8 0, i64 64, i1 false)
1244  ret void
1245}
1246
; Callable-function (non-kernel) variant of the large-offset zero-init test.
; A [4096 x i32] padding object (16384 bytes) is allocated ahead of the 64-byte
; [32 x i16] object that is zeroed with memset. The assertions expect the
; padding load directly off s32 and the four scratch_store_dwordx4 of the
; memset to use a scalar base computed as s32 + 0x4000 with immediate offsets
; 0, 16, 32 and 48. Note that the expected output recomputes s32 + 0x4000
; before each store after the first.
1247define void @zero_init_large_offset_foo() {
1248; GFX9-LABEL: zero_init_large_offset_foo:
1249; GFX9:       ; %bb.0:
1250; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1251; GFX9-NEXT:    scratch_load_dword v0, off, s32
1252; GFX9-NEXT:    s_mov_b32 s0, 0
1253; GFX9-NEXT:    s_mov_b32 s1, s0
1254; GFX9-NEXT:    s_mov_b32 s2, s0
1255; GFX9-NEXT:    s_mov_b32 s3, s0
1256; GFX9-NEXT:    s_waitcnt vmcnt(0)
1257; GFX9-NEXT:    v_mov_b32_e32 v0, s0
1258; GFX9-NEXT:    v_mov_b32_e32 v1, s1
1259; GFX9-NEXT:    v_mov_b32_e32 v2, s2
1260; GFX9-NEXT:    v_mov_b32_e32 v3, s3
1261; GFX9-NEXT:    s_add_u32 vcc_hi, s32, 0x4000
1262; GFX9-NEXT:    scratch_store_dwordx4 off, v[0:3], vcc_hi
1263; GFX9-NEXT:    s_add_u32 vcc_hi, s32, 0x4000
1264; GFX9-NEXT:    scratch_store_dwordx4 off, v[0:3], vcc_hi offset:16
1265; GFX9-NEXT:    s_add_u32 vcc_hi, s32, 0x4000
1266; GFX9-NEXT:    scratch_store_dwordx4 off, v[0:3], vcc_hi offset:32
1267; GFX9-NEXT:    s_add_u32 vcc_hi, s32, 0x4000
1268; GFX9-NEXT:    scratch_store_dwordx4 off, v[0:3], vcc_hi offset:48
1269; GFX9-NEXT:    s_waitcnt vmcnt(0)
1270; GFX9-NEXT:    s_setpc_b64 s[30:31]
1271;
1272; GFX10-LABEL: zero_init_large_offset_foo:
1273; GFX10:       ; %bb.0:
1274; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1275; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
1276; GFX10-NEXT:    scratch_load_dword v0, off, s32
1277; GFX10-NEXT:    s_mov_b32 s0, 0
1278; GFX10-NEXT:    s_add_u32 vcc_lo, s32, 0x4000
1279; GFX10-NEXT:    s_mov_b32 s1, s0
1280; GFX10-NEXT:    s_mov_b32 s2, s0
1281; GFX10-NEXT:    s_mov_b32 s3, s0
1282; GFX10-NEXT:    s_waitcnt vmcnt(0)
1283; GFX10-NEXT:    v_mov_b32_e32 v0, s0
1284; GFX10-NEXT:    v_mov_b32_e32 v1, s1
1285; GFX10-NEXT:    v_mov_b32_e32 v2, s2
1286; GFX10-NEXT:    v_mov_b32_e32 v3, s3
1287; GFX10-NEXT:    scratch_store_dwordx4 off, v[0:3], vcc_lo
1288; GFX10-NEXT:    s_add_u32 vcc_lo, s32, 0x4000
1289; GFX10-NEXT:    scratch_store_dwordx4 off, v[0:3], vcc_lo offset:16
1290; GFX10-NEXT:    s_add_u32 vcc_lo, s32, 0x4000
1291; GFX10-NEXT:    scratch_store_dwordx4 off, v[0:3], vcc_lo offset:32
1292; GFX10-NEXT:    s_add_u32 vcc_lo, s32, 0x4000
1293; GFX10-NEXT:    scratch_store_dwordx4 off, v[0:3], vcc_lo offset:48
1294; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
1295; GFX10-NEXT:    s_setpc_b64 s[30:31]
1296;
1297; GFX9-PAL-LABEL: zero_init_large_offset_foo:
1298; GFX9-PAL:       ; %bb.0:
1299; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1300; GFX9-PAL-NEXT:    scratch_load_dword v0, off, s32
1301; GFX9-PAL-NEXT:    s_mov_b32 s0, 0
1302; GFX9-PAL-NEXT:    s_mov_b32 s1, s0
1303; GFX9-PAL-NEXT:    s_mov_b32 s2, s0
1304; GFX9-PAL-NEXT:    s_mov_b32 s3, s0
1305; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0)
1306; GFX9-PAL-NEXT:    v_mov_b32_e32 v0, s0
1307; GFX9-PAL-NEXT:    v_mov_b32_e32 v1, s1
1308; GFX9-PAL-NEXT:    v_mov_b32_e32 v2, s2
1309; GFX9-PAL-NEXT:    v_mov_b32_e32 v3, s3
1310; GFX9-PAL-NEXT:    s_add_u32 vcc_hi, s32, 0x4000
1311; GFX9-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], vcc_hi
1312; GFX9-PAL-NEXT:    s_add_u32 vcc_hi, s32, 0x4000
1313; GFX9-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], vcc_hi offset:16
1314; GFX9-PAL-NEXT:    s_add_u32 vcc_hi, s32, 0x4000
1315; GFX9-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], vcc_hi offset:32
1316; GFX9-PAL-NEXT:    s_add_u32 vcc_hi, s32, 0x4000
1317; GFX9-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], vcc_hi offset:48
1318; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0)
1319; GFX9-PAL-NEXT:    s_setpc_b64 s[30:31]
1320;
1321; GFX10-PAL-LABEL: zero_init_large_offset_foo:
1322; GFX10-PAL:       ; %bb.0:
1323; GFX10-PAL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1324; GFX10-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
1325; GFX10-PAL-NEXT:    scratch_load_dword v0, off, s32
1326; GFX10-PAL-NEXT:    s_mov_b32 s0, 0
1327; GFX10-PAL-NEXT:    s_add_u32 vcc_lo, s32, 0x4000
1328; GFX10-PAL-NEXT:    s_mov_b32 s1, s0
1329; GFX10-PAL-NEXT:    s_mov_b32 s2, s0
1330; GFX10-PAL-NEXT:    s_mov_b32 s3, s0
1331; GFX10-PAL-NEXT:    s_waitcnt vmcnt(0)
1332; GFX10-PAL-NEXT:    v_mov_b32_e32 v0, s0
1333; GFX10-PAL-NEXT:    v_mov_b32_e32 v1, s1
1334; GFX10-PAL-NEXT:    v_mov_b32_e32 v2, s2
1335; GFX10-PAL-NEXT:    v_mov_b32_e32 v3, s3
1336; GFX10-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], vcc_lo
1337; GFX10-PAL-NEXT:    s_add_u32 vcc_lo, s32, 0x4000
1338; GFX10-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], vcc_lo offset:16
1339; GFX10-PAL-NEXT:    s_add_u32 vcc_lo, s32, 0x4000
1340; GFX10-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], vcc_lo offset:32
1341; GFX10-PAL-NEXT:    s_add_u32 vcc_lo, s32, 0x4000
1342; GFX10-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], vcc_lo offset:48
1343; GFX10-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
1344; GFX10-PAL-NEXT:    s_setpc_b64 s[30:31]
  ; 4096 x i32 = 16384 bytes of padding, followed by the 64-byte object under
  ; test.
1345  %padding = alloca [4096 x i32], align 4, addrspace(5)
1346  %alloca = alloca [32 x i16], align 2, addrspace(5)
  ; Volatile access to the padding at an undef index; presumably present so the
  ; padding object stays in the frame - NOTE(review): confirm intent.
1347  %pad_gep = getelementptr inbounds [4096 x i32], [4096 x i32] addrspace(5)* %padding, i32 0, i32 undef
1348  %pad_load = load volatile i32, i32 addrspace(5)* %pad_gep, align 4
  ; Zero all 64 bytes of %alloca (non-volatile memset).
1349  %cast = bitcast [32 x i16] addrspace(5)* %alloca to i8 addrspace(5)*
1350  call void @llvm.memset.p5i8.i64(i8 addrspace(5)* align 2 dereferenceable(64) %cast, i8 0, i64 64, i1 false)
1351  ret void
1352}
1353
; Kernel test: volatile store and load on a 32-element scratch array that is
; declared after a [4096 x i32] padding alloca, indexed by the kernel argument
; %idx. The expected code builds both addresses with scalar ALU instructions
; (scaled index plus 0x4004) and passes them as the SGPR operand of
; scratch_store_dword / scratch_load_dword.
1354define amdgpu_kernel void @store_load_sindex_large_offset_kernel(i32 %idx) {
1355; GFX9-LABEL: store_load_sindex_large_offset_kernel:
1356; GFX9:       ; %bb.0: ; %bb
1357; GFX9-NEXT:    s_load_dword s0, s[0:1], 0x24
1358; GFX9-NEXT:    s_add_u32 flat_scratch_lo, s2, s5
1359; GFX9-NEXT:    s_addc_u32 flat_scratch_hi, s3, 0
1360; GFX9-NEXT:    s_mov_b32 vcc_hi, 0
1361; GFX9-NEXT:    scratch_load_dword v0, off, vcc_hi offset:4
1362; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
1363; GFX9-NEXT:    s_lshl_b32 s1, s0, 2
1364; GFX9-NEXT:    s_and_b32 s0, s0, 15
1365; GFX9-NEXT:    s_lshl_b32 s0, s0, 2
1366; GFX9-NEXT:    s_waitcnt vmcnt(0)
1367; GFX9-NEXT:    v_mov_b32_e32 v0, 15
1368; GFX9-NEXT:    s_add_u32 s1, 0x4004, s1
1369; GFX9-NEXT:    scratch_store_dword off, v0, s1
1370; GFX9-NEXT:    s_add_u32 s0, 0x4004, s0
1371; GFX9-NEXT:    scratch_load_dword v0, off, s0
1372; GFX9-NEXT:    s_endpgm
1373;
1374; GFX10-LABEL: store_load_sindex_large_offset_kernel:
1375; GFX10:       ; %bb.0: ; %bb
1376; GFX10-NEXT:    s_add_u32 s2, s2, s5
1377; GFX10-NEXT:    s_addc_u32 s3, s3, 0
1378; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s2
1379; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s3
1380; GFX10-NEXT:    s_load_dword s0, s[0:1], 0x24
1381; GFX10-NEXT:    scratch_load_dword v0, off, off offset:4
1382; GFX10-NEXT:    s_waitcnt vmcnt(0)
1383; GFX10-NEXT:    v_mov_b32_e32 v0, 15
1384; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
1385; GFX10-NEXT:    s_and_b32 s1, s0, 15
1386; GFX10-NEXT:    s_lshl_b32 s0, s0, 2
1387; GFX10-NEXT:    s_lshl_b32 s1, s1, 2
1388; GFX10-NEXT:    s_add_u32 s0, 0x4004, s0
1389; GFX10-NEXT:    s_add_u32 s1, 0x4004, s1
1390; GFX10-NEXT:    scratch_store_dword off, v0, s0
1391; GFX10-NEXT:    scratch_load_dword v0, off, s1
1392; GFX10-NEXT:    s_endpgm
1393;
1394; GFX9-PAL-LABEL: store_load_sindex_large_offset_kernel:
1395; GFX9-PAL:       ; %bb.0: ; %bb
1396; GFX9-PAL-NEXT:    s_getpc_b64 s[4:5]
1397; GFX9-PAL-NEXT:    s_mov_b32 s4, s0
1398; GFX9-PAL-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x0
1399; GFX9-PAL-NEXT:    s_load_dword s0, s[0:1], 0x24
1400; GFX9-PAL-NEXT:    s_mov_b32 vcc_hi, 0
1401; GFX9-PAL-NEXT:    s_waitcnt lgkmcnt(0)
1402; GFX9-PAL-NEXT:    s_and_b32 s5, s5, 0xffff
1403; GFX9-PAL-NEXT:    s_add_u32 flat_scratch_lo, s4, s3
1404; GFX9-PAL-NEXT:    s_addc_u32 flat_scratch_hi, s5, 0
1405; GFX9-PAL-NEXT:    s_lshl_b32 s1, s0, 2
1406; GFX9-PAL-NEXT:    s_and_b32 s0, s0, 15
1407; GFX9-PAL-NEXT:    scratch_load_dword v0, off, vcc_hi offset:4
1408; GFX9-PAL-NEXT:    s_lshl_b32 s0, s0, 2
1409; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0)
1410; GFX9-PAL-NEXT:    v_mov_b32_e32 v0, 15
1411; GFX9-PAL-NEXT:    s_add_u32 s1, 0x4004, s1
1412; GFX9-PAL-NEXT:    scratch_store_dword off, v0, s1
1413; GFX9-PAL-NEXT:    s_add_u32 s0, 0x4004, s0
1414; GFX9-PAL-NEXT:    scratch_load_dword v0, off, s0
1415; GFX9-PAL-NEXT:    s_endpgm
1416;
1417; GFX10-PAL-LABEL: store_load_sindex_large_offset_kernel:
1418; GFX10-PAL:       ; %bb.0: ; %bb
1419; GFX10-PAL-NEXT:    s_getpc_b64 s[4:5]
1420; GFX10-PAL-NEXT:    s_mov_b32 s4, s0
1421; GFX10-PAL-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x0
1422; GFX10-PAL-NEXT:    s_waitcnt lgkmcnt(0)
1423; GFX10-PAL-NEXT:    s_and_b32 s5, s5, 0xffff
1424; GFX10-PAL-NEXT:    s_add_u32 s4, s4, s3
1425; GFX10-PAL-NEXT:    s_addc_u32 s5, s5, 0
1426; GFX10-PAL-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s4
1427; GFX10-PAL-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s5
1428; GFX10-PAL-NEXT:    s_load_dword s0, s[0:1], 0x24
1429; GFX10-PAL-NEXT:    scratch_load_dword v0, off, off offset:4
1430; GFX10-PAL-NEXT:    s_waitcnt vmcnt(0)
1431; GFX10-PAL-NEXT:    v_mov_b32_e32 v0, 15
1432; GFX10-PAL-NEXT:    s_waitcnt lgkmcnt(0)
1433; GFX10-PAL-NEXT:    s_and_b32 s1, s0, 15
1434; GFX10-PAL-NEXT:    s_lshl_b32 s0, s0, 2
1435; GFX10-PAL-NEXT:    s_lshl_b32 s1, s1, 2
1436; GFX10-PAL-NEXT:    s_add_u32 s0, 0x4004, s0
1437; GFX10-PAL-NEXT:    s_add_u32 s1, 0x4004, s1
1438; GFX10-PAL-NEXT:    scratch_store_dword off, v0, s0
1439; GFX10-PAL-NEXT:    scratch_load_dword v0, off, s1
1440; GFX10-PAL-NEXT:    s_endpgm
1441bb:
  ; 4096 x i32 = 16384 bytes (0x4000) of padding declared ahead of %i.
1442  %padding = alloca [4096 x i32], align 4, addrspace(5)
1443  %i = alloca [32 x float], align 4, addrspace(5)
  ; Volatile read of the padding array at an undef index.
  ; NOTE(review): presumably present so the large alloca stays live - confirm.
1444  %pad_gep = getelementptr inbounds [4096 x i32], [4096 x i32] addrspace(5)* %padding, i32 0, i32 undef
1445  %pad_load = load volatile i32, i32 addrspace(5)* %pad_gep, align 4
  ; %i1 has no users in this function.
1446  %i1 = bitcast [32 x float] addrspace(5)* %i to i8 addrspace(5)*
  ; Store 15 to element %idx of %i.
1447  %i7 = getelementptr inbounds [32 x float], [32 x float] addrspace(5)* %i, i32 0, i32 %idx
1448  %i8 = bitcast float addrspace(5)* %i7 to i32 addrspace(5)*
1449  store volatile i32 15, i32 addrspace(5)* %i8, align 4
  ; Load back from element (%idx & 15) of %i.
1450  %i9 = and i32 %idx, 15
1451  %i10 = getelementptr inbounds [32 x float], [32 x float] addrspace(5)* %i, i32 0, i32 %i9
1452  %i11 = bitcast float addrspace(5)* %i10 to i32 addrspace(5)*
1453  %i12 = load volatile i32, i32 addrspace(5)* %i11, align 4
1454  ret void
1455}
1456
; amdgpu_ps variant of the scalar-index large-offset test: %idx is an inreg
; argument, so the expected code shifts and masks it directly in SGPRs
; (without first loading it from memory) before adding 0x4004 to form the
; SGPR operand of the scratch store and load.
1457define amdgpu_ps void @store_load_sindex_large_offset_foo(i32 inreg %idx) {
1458; GFX9-LABEL: store_load_sindex_large_offset_foo:
1459; GFX9:       ; %bb.0: ; %bb
1460; GFX9-NEXT:    s_add_u32 flat_scratch_lo, s0, s3
1461; GFX9-NEXT:    s_addc_u32 flat_scratch_hi, s1, 0
1462; GFX9-NEXT:    s_mov_b32 vcc_hi, 0
1463; GFX9-NEXT:    s_lshl_b32 s0, s2, 2
1464; GFX9-NEXT:    scratch_load_dword v0, off, vcc_hi offset:4
1465; GFX9-NEXT:    s_add_u32 s0, 0x4004, s0
1466; GFX9-NEXT:    s_waitcnt vmcnt(0)
1467; GFX9-NEXT:    v_mov_b32_e32 v0, 15
1468; GFX9-NEXT:    scratch_store_dword off, v0, s0
1469; GFX9-NEXT:    s_and_b32 s0, s2, 15
1470; GFX9-NEXT:    s_lshl_b32 s0, s0, 2
1471; GFX9-NEXT:    s_add_u32 s0, 0x4004, s0
1472; GFX9-NEXT:    scratch_load_dword v0, off, s0
1473; GFX9-NEXT:    s_endpgm
1474;
1475; GFX10-LABEL: store_load_sindex_large_offset_foo:
1476; GFX10:       ; %bb.0: ; %bb
1477; GFX10-NEXT:    s_add_u32 s0, s0, s3
1478; GFX10-NEXT:    s_addc_u32 s1, s1, 0
1479; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s0
1480; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1
1481; GFX10-NEXT:    scratch_load_dword v0, off, off offset:4
1482; GFX10-NEXT:    s_and_b32 s0, s2, 15
1483; GFX10-NEXT:    s_waitcnt vmcnt(0)
1484; GFX10-NEXT:    v_mov_b32_e32 v0, 15
1485; GFX10-NEXT:    s_lshl_b32 s1, s2, 2
1486; GFX10-NEXT:    s_lshl_b32 s0, s0, 2
1487; GFX10-NEXT:    s_add_u32 s1, 0x4004, s1
1488; GFX10-NEXT:    s_add_u32 s0, 0x4004, s0
1489; GFX10-NEXT:    scratch_store_dword off, v0, s1
1490; GFX10-NEXT:    scratch_load_dword v0, off, s0
1491; GFX10-NEXT:    s_endpgm
1492;
1493; GFX9-PAL-LABEL: store_load_sindex_large_offset_foo:
1494; GFX9-PAL:       ; %bb.0: ; %bb
1495; GFX9-PAL-NEXT:    s_getpc_b64 s[2:3]
1496; GFX9-PAL-NEXT:    s_mov_b32 s2, s0
1497; GFX9-PAL-NEXT:    s_load_dwordx2 s[2:3], s[2:3], 0x0
1498; GFX9-PAL-NEXT:    s_mov_b32 vcc_hi, 0
1499; GFX9-PAL-NEXT:    s_waitcnt lgkmcnt(0)
1500; GFX9-PAL-NEXT:    s_and_b32 s3, s3, 0xffff
1501; GFX9-PAL-NEXT:    s_add_u32 flat_scratch_lo, s2, s1
1502; GFX9-PAL-NEXT:    s_addc_u32 flat_scratch_hi, s3, 0
1503; GFX9-PAL-NEXT:    s_lshl_b32 s1, s0, 2
1504; GFX9-PAL-NEXT:    s_and_b32 s0, s0, 15
1505; GFX9-PAL-NEXT:    scratch_load_dword v0, off, vcc_hi offset:4
1506; GFX9-PAL-NEXT:    s_lshl_b32 s0, s0, 2
1507; GFX9-PAL-NEXT:    s_add_u32 s1, 0x4004, s1
1508; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0)
1509; GFX9-PAL-NEXT:    v_mov_b32_e32 v0, 15
1510; GFX9-PAL-NEXT:    scratch_store_dword off, v0, s1
1511; GFX9-PAL-NEXT:    s_add_u32 s0, 0x4004, s0
1512; GFX9-PAL-NEXT:    scratch_load_dword v0, off, s0
1513; GFX9-PAL-NEXT:    s_endpgm
1514;
1515; GFX10-PAL-LABEL: store_load_sindex_large_offset_foo:
1516; GFX10-PAL:       ; %bb.0: ; %bb
1517; GFX10-PAL-NEXT:    s_getpc_b64 s[2:3]
1518; GFX10-PAL-NEXT:    s_mov_b32 s2, s0
1519; GFX10-PAL-NEXT:    s_load_dwordx2 s[2:3], s[2:3], 0x0
1520; GFX10-PAL-NEXT:    s_waitcnt lgkmcnt(0)
1521; GFX10-PAL-NEXT:    s_and_b32 s3, s3, 0xffff
1522; GFX10-PAL-NEXT:    s_add_u32 s2, s2, s1
1523; GFX10-PAL-NEXT:    s_addc_u32 s3, s3, 0
1524; GFX10-PAL-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s2
1525; GFX10-PAL-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s3
1526; GFX10-PAL-NEXT:    scratch_load_dword v0, off, off offset:4
1527; GFX10-PAL-NEXT:    s_and_b32 s1, s0, 15
1528; GFX10-PAL-NEXT:    s_waitcnt vmcnt(0)
1529; GFX10-PAL-NEXT:    v_mov_b32_e32 v0, 15
1530; GFX10-PAL-NEXT:    s_lshl_b32 s0, s0, 2
1531; GFX10-PAL-NEXT:    s_lshl_b32 s1, s1, 2
1532; GFX10-PAL-NEXT:    s_add_u32 s0, 0x4004, s0
1533; GFX10-PAL-NEXT:    s_add_u32 s1, 0x4004, s1
1534; GFX10-PAL-NEXT:    scratch_store_dword off, v0, s0
1535; GFX10-PAL-NEXT:    scratch_load_dword v0, off, s1
1536; GFX10-PAL-NEXT:    s_endpgm
1537bb:
  ; 4096 x i32 = 16384 bytes (0x4000) of padding declared ahead of %i.
1538  %padding = alloca [4096 x i32], align 4, addrspace(5)
1539  %i = alloca [32 x float], align 4, addrspace(5)
  ; Volatile read of the padding array at an undef index.
  ; NOTE(review): presumably present so the large alloca stays live - confirm.
1540  %pad_gep = getelementptr inbounds [4096 x i32], [4096 x i32] addrspace(5)* %padding, i32 0, i32 undef
1541  %pad_load = load volatile i32, i32 addrspace(5)* %pad_gep, align 4
  ; %i1 has no users in this function.
1542  %i1 = bitcast [32 x float] addrspace(5)* %i to i8 addrspace(5)*
  ; Store 15 to element %idx of %i.
1543  %i7 = getelementptr inbounds [32 x float], [32 x float] addrspace(5)* %i, i32 0, i32 %idx
1544  %i8 = bitcast float addrspace(5)* %i7 to i32 addrspace(5)*
1545  store volatile i32 15, i32 addrspace(5)* %i8, align 4
  ; Load back from element (%idx & 15) of %i.
1546  %i9 = and i32 %idx, 15
1547  %i10 = getelementptr inbounds [32 x float], [32 x float] addrspace(5)* %i, i32 0, i32 %i9
1548  %i11 = bitcast float addrspace(5)* %i10 to i32 addrspace(5)*
1549  %i12 = load volatile i32, i32 addrspace(5)* %i11, align 4
1550  ret void
1551}
1552
; Kernel test with a per-lane index: the workitem id selects the element that
; is stored to, and (31 - id) the element that is loaded from, in a 32-element
; array declared after a [4096 x i32] padding alloca. The expected code forms
; both addresses in VGPRs from the constant 0x4004; the load carries the
; 31 * 4 byte term as the immediate offset 124.
1553define amdgpu_kernel void @store_load_vindex_large_offset_kernel() {
1554; GFX9-LABEL: store_load_vindex_large_offset_kernel:
1555; GFX9:       ; %bb.0: ; %bb
1556; GFX9-NEXT:    s_add_u32 flat_scratch_lo, s0, s3
1557; GFX9-NEXT:    s_addc_u32 flat_scratch_hi, s1, 0
1558; GFX9-NEXT:    s_mov_b32 vcc_hi, 0
1559; GFX9-NEXT:    scratch_load_dword v1, off, vcc_hi offset:4
1560; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
1561; GFX9-NEXT:    s_waitcnt vmcnt(0)
1562; GFX9-NEXT:    v_mov_b32_e32 v1, 0x4004
1563; GFX9-NEXT:    v_add_u32_e32 v2, v1, v0
1564; GFX9-NEXT:    v_mov_b32_e32 v3, 15
1565; GFX9-NEXT:    scratch_store_dword v2, v3, off
1566; GFX9-NEXT:    v_sub_u32_e32 v0, v1, v0
1567; GFX9-NEXT:    scratch_load_dword v0, v0, off offset:124
1568; GFX9-NEXT:    s_endpgm
1569;
1570; GFX10-LABEL: store_load_vindex_large_offset_kernel:
1571; GFX10:       ; %bb.0: ; %bb
1572; GFX10-NEXT:    s_add_u32 s0, s0, s3
1573; GFX10-NEXT:    s_addc_u32 s1, s1, 0
1574; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s0
1575; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1
1576; GFX10-NEXT:    v_mov_b32_e32 v1, 0x4004
1577; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
1578; GFX10-NEXT:    v_mov_b32_e32 v3, 15
1579; GFX10-NEXT:    v_add_nc_u32_e32 v2, v1, v0
1580; GFX10-NEXT:    v_sub_nc_u32_e32 v0, v1, v0
1581; GFX10-NEXT:    scratch_load_dword v1, off, off offset:4
1582; GFX10-NEXT:    scratch_store_dword v2, v3, off
1583; GFX10-NEXT:    scratch_load_dword v0, v0, off offset:124
1584; GFX10-NEXT:    s_endpgm
1585;
1586; GFX9-PAL-LABEL: store_load_vindex_large_offset_kernel:
1587; GFX9-PAL:       ; %bb.0: ; %bb
1588; GFX9-PAL-NEXT:    s_getpc_b64 s[2:3]
1589; GFX9-PAL-NEXT:    s_mov_b32 s2, s0
1590; GFX9-PAL-NEXT:    s_load_dwordx2 s[2:3], s[2:3], 0x0
1591; GFX9-PAL-NEXT:    s_mov_b32 vcc_hi, 0
1592; GFX9-PAL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
1593; GFX9-PAL-NEXT:    v_mov_b32_e32 v3, 15
1594; GFX9-PAL-NEXT:    s_waitcnt lgkmcnt(0)
1595; GFX9-PAL-NEXT:    s_and_b32 s3, s3, 0xffff
1596; GFX9-PAL-NEXT:    s_add_u32 flat_scratch_lo, s2, s1
1597; GFX9-PAL-NEXT:    s_addc_u32 flat_scratch_hi, s3, 0
1598; GFX9-PAL-NEXT:    scratch_load_dword v1, off, vcc_hi offset:4
1599; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0)
1600; GFX9-PAL-NEXT:    v_mov_b32_e32 v1, 0x4004
1601; GFX9-PAL-NEXT:    v_add_u32_e32 v2, v1, v0
1602; GFX9-PAL-NEXT:    scratch_store_dword v2, v3, off
1603; GFX9-PAL-NEXT:    v_sub_u32_e32 v0, v1, v0
1604; GFX9-PAL-NEXT:    scratch_load_dword v0, v0, off offset:124
1605; GFX9-PAL-NEXT:    s_endpgm
1606;
1607; GFX10-PAL-LABEL: store_load_vindex_large_offset_kernel:
1608; GFX10-PAL:       ; %bb.0: ; %bb
1609; GFX10-PAL-NEXT:    s_getpc_b64 s[2:3]
1610; GFX10-PAL-NEXT:    s_mov_b32 s2, s0
1611; GFX10-PAL-NEXT:    s_load_dwordx2 s[2:3], s[2:3], 0x0
1612; GFX10-PAL-NEXT:    s_waitcnt lgkmcnt(0)
1613; GFX10-PAL-NEXT:    s_and_b32 s3, s3, 0xffff
1614; GFX10-PAL-NEXT:    s_add_u32 s2, s2, s1
1615; GFX10-PAL-NEXT:    s_addc_u32 s3, s3, 0
1616; GFX10-PAL-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s2
1617; GFX10-PAL-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s3
1618; GFX10-PAL-NEXT:    v_mov_b32_e32 v1, 0x4004
1619; GFX10-PAL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
1620; GFX10-PAL-NEXT:    v_mov_b32_e32 v3, 15
1621; GFX10-PAL-NEXT:    v_add_nc_u32_e32 v2, v1, v0
1622; GFX10-PAL-NEXT:    v_sub_nc_u32_e32 v0, v1, v0
1623; GFX10-PAL-NEXT:    scratch_load_dword v1, off, off offset:4
1624; GFX10-PAL-NEXT:    scratch_store_dword v2, v3, off
1625; GFX10-PAL-NEXT:    scratch_load_dword v0, v0, off offset:124
1626; GFX10-PAL-NEXT:    s_endpgm
1627bb:
  ; 4096 x i32 = 16384 bytes (0x4000) of padding declared ahead of %i.
1628  %padding = alloca [4096 x i32], align 4, addrspace(5)
1629  %i = alloca [32 x float], align 4, addrspace(5)
  ; Volatile read of the padding array at an undef index.
  ; NOTE(review): presumably present so the large alloca stays live - confirm.
1630  %pad_gep = getelementptr inbounds [4096 x i32], [4096 x i32] addrspace(5)* %padding, i32 0, i32 undef
1631  %pad_load = load volatile i32, i32 addrspace(5)* %pad_gep, align 4
  ; %i1 and %i3 have no users in this function.
1632  %i1 = bitcast [32 x float] addrspace(5)* %i to i8 addrspace(5)*
1633  %i2 = tail call i32 @llvm.amdgcn.workitem.id.x()
1634  %i3 = zext i32 %i2 to i64
  ; Store 15 to element [workitem id] of %i.
1635  %i7 = getelementptr inbounds [32 x float], [32 x float] addrspace(5)* %i, i32 0, i32 %i2
1636  %i8 = bitcast float addrspace(5)* %i7 to i32 addrspace(5)*
1637  store volatile i32 15, i32 addrspace(5)* %i8, align 4
  ; Load from the mirrored element [31 - workitem id].
1638  %i9 = sub nsw i32 31, %i2
1639  %i10 = getelementptr inbounds [32 x float], [32 x float] addrspace(5)* %i, i32 0, i32 %i9
1640  %i11 = bitcast float addrspace(5)* %i10 to i32 addrspace(5)*
1641  %i12 = load volatile i32, i32 addrspace(5)* %i11, align 4
1642  ret void
1643}
1644
; Callable-function variant (no explicit calling convention) of the
; large-offset indexed test. In the expected code the index is taken from v0,
; the array base is derived as s32 + 0x4000 and copied to a VGPR, and both the
; store and the load address are built with v_lshl_add_u32.
1645define void @store_load_vindex_large_offset_foo(i32 %idx) {
1646; GFX9-LABEL: store_load_vindex_large_offset_foo:
1647; GFX9:       ; %bb.0: ; %bb
1648; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1649; GFX9-NEXT:    scratch_load_dword v1, off, s32
1650; GFX9-NEXT:    s_add_u32 vcc_hi, s32, 0x4000
1651; GFX9-NEXT:    s_waitcnt vmcnt(0)
1652; GFX9-NEXT:    v_mov_b32_e32 v1, vcc_hi
1653; GFX9-NEXT:    v_mov_b32_e32 v3, 15
1654; GFX9-NEXT:    v_lshl_add_u32 v2, v0, 2, v1
1655; GFX9-NEXT:    v_and_b32_e32 v0, v0, v3
1656; GFX9-NEXT:    scratch_store_dword v2, v3, off
1657; GFX9-NEXT:    v_lshl_add_u32 v0, v0, 2, v1
1658; GFX9-NEXT:    scratch_load_dword v0, v0, off
1659; GFX9-NEXT:    s_waitcnt vmcnt(0)
1660; GFX9-NEXT:    s_setpc_b64 s[30:31]
1661;
1662; GFX10-LABEL: store_load_vindex_large_offset_foo:
1663; GFX10:       ; %bb.0: ; %bb
1664; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1665; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
1666; GFX10-NEXT:    v_mov_b32_e32 v1, 15
1667; GFX10-NEXT:    s_add_u32 vcc_lo, s32, 0x4000
1668; GFX10-NEXT:    v_mov_b32_e32 v2, vcc_lo
1669; GFX10-NEXT:    v_and_b32_e32 v3, v0, v1
1670; GFX10-NEXT:    v_lshl_add_u32 v0, v0, 2, v2
1671; GFX10-NEXT:    v_lshl_add_u32 v2, v3, 2, v2
1672; GFX10-NEXT:    scratch_load_dword v3, off, s32
1673; GFX10-NEXT:    scratch_store_dword v0, v1, off
1674; GFX10-NEXT:    scratch_load_dword v0, v2, off
1675; GFX10-NEXT:    s_waitcnt vmcnt(0)
1676; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
1677; GFX10-NEXT:    s_setpc_b64 s[30:31]
1678;
1679; GFX9-PAL-LABEL: store_load_vindex_large_offset_foo:
1680; GFX9-PAL:       ; %bb.0: ; %bb
1681; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1682; GFX9-PAL-NEXT:    scratch_load_dword v1, off, s32
1683; GFX9-PAL-NEXT:    s_add_u32 vcc_hi, s32, 0x4000
1684; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0)
1685; GFX9-PAL-NEXT:    v_mov_b32_e32 v1, vcc_hi
1686; GFX9-PAL-NEXT:    v_mov_b32_e32 v3, 15
1687; GFX9-PAL-NEXT:    v_lshl_add_u32 v2, v0, 2, v1
1688; GFX9-PAL-NEXT:    v_and_b32_e32 v0, v0, v3
1689; GFX9-PAL-NEXT:    scratch_store_dword v2, v3, off
1690; GFX9-PAL-NEXT:    v_lshl_add_u32 v0, v0, 2, v1
1691; GFX9-PAL-NEXT:    scratch_load_dword v0, v0, off
1692; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0)
1693; GFX9-PAL-NEXT:    s_setpc_b64 s[30:31]
1694;
1695; GFX10-PAL-LABEL: store_load_vindex_large_offset_foo:
1696; GFX10-PAL:       ; %bb.0: ; %bb
1697; GFX10-PAL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1698; GFX10-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
1699; GFX10-PAL-NEXT:    v_mov_b32_e32 v1, 15
1700; GFX10-PAL-NEXT:    s_add_u32 vcc_lo, s32, 0x4000
1701; GFX10-PAL-NEXT:    v_mov_b32_e32 v2, vcc_lo
1702; GFX10-PAL-NEXT:    v_and_b32_e32 v3, v0, v1
1703; GFX10-PAL-NEXT:    v_lshl_add_u32 v0, v0, 2, v2
1704; GFX10-PAL-NEXT:    v_lshl_add_u32 v2, v3, 2, v2
1705; GFX10-PAL-NEXT:    scratch_load_dword v3, off, s32
1706; GFX10-PAL-NEXT:    scratch_store_dword v0, v1, off
1707; GFX10-PAL-NEXT:    scratch_load_dword v0, v2, off
1708; GFX10-PAL-NEXT:    s_waitcnt vmcnt(0)
1709; GFX10-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
1710; GFX10-PAL-NEXT:    s_setpc_b64 s[30:31]
1711bb:
  ; 4096 x i32 = 16384 bytes (0x4000) of padding declared ahead of %i.
1712  %padding = alloca [4096 x i32], align 4, addrspace(5)
1713  %i = alloca [32 x float], align 4, addrspace(5)
  ; Volatile read of the padding array at an undef index.
  ; NOTE(review): presumably present so the large alloca stays live - confirm.
1714  %pad_gep = getelementptr inbounds [4096 x i32], [4096 x i32] addrspace(5)* %padding, i32 0, i32 undef
1715  %pad_load = load volatile i32, i32 addrspace(5)* %pad_gep, align 4
  ; %i1 has no users in this function.
1716  %i1 = bitcast [32 x float] addrspace(5)* %i to i8 addrspace(5)*
  ; Store 15 to element %idx of %i.
1717  %i7 = getelementptr inbounds [32 x float], [32 x float] addrspace(5)* %i, i32 0, i32 %idx
1718  %i8 = bitcast float addrspace(5)* %i7 to i32 addrspace(5)*
1719  store volatile i32 15, i32 addrspace(5)* %i8, align 4
  ; Load back from element (%idx & 15) of %i.
1720  %i9 = and i32 %idx, 15
1721  %i10 = getelementptr inbounds [32 x float], [32 x float] addrspace(5)* %i, i32 0, i32 %i9
1722  %i11 = bitcast float addrspace(5)* %i10 to i32 addrspace(5)*
1723  %i12 = load volatile i32, i32 addrspace(5)* %i11, align 4
1724  ret void
1725}
1726
; Kernel test for a large constant offset: element 4000 of a [4096 x i32]
; alloca lies 4000 * 4 = 16000 bytes into the array. The expected code splits
; that distance between an SGPR and the instruction's immediate offset:
; 0x3000 + 3712 in the gfx900 runs and 0x3800 + 1664 in the gfx1030 runs.
1727define amdgpu_kernel void @store_load_large_imm_offset_kernel() {
1728; GFX9-LABEL: store_load_large_imm_offset_kernel:
1729; GFX9:       ; %bb.0: ; %bb
1730; GFX9-NEXT:    s_add_u32 flat_scratch_lo, s0, s3
1731; GFX9-NEXT:    s_addc_u32 flat_scratch_hi, s1, 0
1732; GFX9-NEXT:    s_movk_i32 s0, 0x3000
1733; GFX9-NEXT:    v_mov_b32_e32 v0, 13
1734; GFX9-NEXT:    s_mov_b32 vcc_hi, 0
1735; GFX9-NEXT:    scratch_store_dword off, v0, vcc_hi offset:4
1736; GFX9-NEXT:    s_add_u32 s0, 4, s0
1737; GFX9-NEXT:    v_mov_b32_e32 v0, 15
1738; GFX9-NEXT:    scratch_store_dword off, v0, s0 offset:3712
1739; GFX9-NEXT:    scratch_load_dword v0, off, s0 offset:3712
1740; GFX9-NEXT:    s_endpgm
1741;
1742; GFX10-LABEL: store_load_large_imm_offset_kernel:
1743; GFX10:       ; %bb.0: ; %bb
1744; GFX10-NEXT:    s_add_u32 s0, s0, s3
1745; GFX10-NEXT:    s_addc_u32 s1, s1, 0
1746; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s0
1747; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1
1748; GFX10-NEXT:    v_mov_b32_e32 v0, 13
1749; GFX10-NEXT:    v_mov_b32_e32 v1, 15
1750; GFX10-NEXT:    s_movk_i32 s0, 0x3800
1751; GFX10-NEXT:    s_add_u32 s0, 4, s0
1752; GFX10-NEXT:    scratch_store_dword off, v0, off offset:4
1753; GFX10-NEXT:    scratch_store_dword off, v1, s0 offset:1664
1754; GFX10-NEXT:    scratch_load_dword v0, off, s0 offset:1664
1755; GFX10-NEXT:    s_endpgm
1756;
1757; GFX9-PAL-LABEL: store_load_large_imm_offset_kernel:
1758; GFX9-PAL:       ; %bb.0: ; %bb
1759; GFX9-PAL-NEXT:    s_getpc_b64 s[2:3]
1760; GFX9-PAL-NEXT:    s_mov_b32 s2, s0
1761; GFX9-PAL-NEXT:    s_load_dwordx2 s[2:3], s[2:3], 0x0
1762; GFX9-PAL-NEXT:    v_mov_b32_e32 v0, 13
1763; GFX9-PAL-NEXT:    s_mov_b32 vcc_hi, 0
1764; GFX9-PAL-NEXT:    s_movk_i32 s0, 0x3000
1765; GFX9-PAL-NEXT:    s_waitcnt lgkmcnt(0)
1766; GFX9-PAL-NEXT:    s_and_b32 s3, s3, 0xffff
1767; GFX9-PAL-NEXT:    s_add_u32 flat_scratch_lo, s2, s1
1768; GFX9-PAL-NEXT:    s_addc_u32 flat_scratch_hi, s3, 0
1769; GFX9-PAL-NEXT:    scratch_store_dword off, v0, vcc_hi offset:4
1770; GFX9-PAL-NEXT:    s_add_u32 s0, 4, s0
1771; GFX9-PAL-NEXT:    v_mov_b32_e32 v0, 15
1772; GFX9-PAL-NEXT:    scratch_store_dword off, v0, s0 offset:3712
1773; GFX9-PAL-NEXT:    scratch_load_dword v0, off, s0 offset:3712
1774; GFX9-PAL-NEXT:    s_endpgm
1775;
1776; GFX10-PAL-LABEL: store_load_large_imm_offset_kernel:
1777; GFX10-PAL:       ; %bb.0: ; %bb
1778; GFX10-PAL-NEXT:    s_getpc_b64 s[2:3]
1779; GFX10-PAL-NEXT:    s_mov_b32 s2, s0
1780; GFX10-PAL-NEXT:    s_load_dwordx2 s[2:3], s[2:3], 0x0
1781; GFX10-PAL-NEXT:    s_waitcnt lgkmcnt(0)
1782; GFX10-PAL-NEXT:    s_and_b32 s3, s3, 0xffff
1783; GFX10-PAL-NEXT:    s_add_u32 s2, s2, s1
1784; GFX10-PAL-NEXT:    s_addc_u32 s3, s3, 0
1785; GFX10-PAL-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s2
1786; GFX10-PAL-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s3
1787; GFX10-PAL-NEXT:    v_mov_b32_e32 v0, 13
1788; GFX10-PAL-NEXT:    v_mov_b32_e32 v1, 15
1789; GFX10-PAL-NEXT:    s_movk_i32 s0, 0x3800
1790; GFX10-PAL-NEXT:    s_add_u32 s0, 4, s0
1791; GFX10-PAL-NEXT:    scratch_store_dword off, v0, off offset:4
1792; GFX10-PAL-NEXT:    scratch_store_dword off, v1, s0 offset:1664
1793; GFX10-PAL-NEXT:    scratch_load_dword v0, off, s0 offset:1664
1794; GFX10-PAL-NEXT:    s_endpgm
1795bb:
1796  %i = alloca [4096 x i32], align 4, addrspace(5)
  ; Volatile store of 13 at an undef index of the array.
1797  %i1 = getelementptr inbounds [4096 x i32], [4096 x i32] addrspace(5)* %i, i32 0, i32 undef
1798  store volatile i32 13, i32 addrspace(5)* %i1, align 4
  ; Volatile store of 15 to element 4000, then a volatile load of the same
  ; element through a separately computed (identical) GEP.
1799  %i7 = getelementptr inbounds [4096 x i32], [4096 x i32] addrspace(5)* %i, i32 0, i32 4000
1800  store volatile i32 15, i32 addrspace(5)* %i7, align 4
1801  %i10 = getelementptr inbounds [4096 x i32], [4096 x i32] addrspace(5)* %i, i32 0, i32 4000
1802  %i12 = load volatile i32, i32 addrspace(5)* %i10, align 4
1803  ret void
1804}
1805
; Callable-function variant (no explicit calling convention) of the large
; constant offset test. Element 4000 of the [4096 x i32] alloca is 16000 bytes
; into the array; the expected code adds an SGPR constant to s32 and puts the
; remainder in the immediate offset: 0x3000 + 3712 in the gfx900 runs and
; 0x3800 + 1664 in the gfx1030 runs.
1806define void @store_load_large_imm_offset_foo() {
1807; GFX9-LABEL: store_load_large_imm_offset_foo:
1808; GFX9:       ; %bb.0: ; %bb
1809; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1810; GFX9-NEXT:    s_movk_i32 s0, 0x3000
1811; GFX9-NEXT:    v_mov_b32_e32 v0, 13
1812; GFX9-NEXT:    scratch_store_dword off, v0, s32
1813; GFX9-NEXT:    s_add_u32 s0, s32, s0
1814; GFX9-NEXT:    v_mov_b32_e32 v0, 15
1815; GFX9-NEXT:    scratch_store_dword off, v0, s0 offset:3712
1816; GFX9-NEXT:    scratch_load_dword v0, off, s0 offset:3712
1817; GFX9-NEXT:    s_waitcnt vmcnt(0)
1818; GFX9-NEXT:    s_setpc_b64 s[30:31]
1819;
1820; GFX10-LABEL: store_load_large_imm_offset_foo:
1821; GFX10:       ; %bb.0: ; %bb
1822; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1823; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
1824; GFX10-NEXT:    v_mov_b32_e32 v0, 13
1825; GFX10-NEXT:    v_mov_b32_e32 v1, 15
1826; GFX10-NEXT:    s_movk_i32 s0, 0x3800
1827; GFX10-NEXT:    s_add_u32 s0, s32, s0
1828; GFX10-NEXT:    scratch_store_dword off, v0, s32
1829; GFX10-NEXT:    scratch_store_dword off, v1, s0 offset:1664
1830; GFX10-NEXT:    scratch_load_dword v0, off, s0 offset:1664
1831; GFX10-NEXT:    s_waitcnt vmcnt(0)
1832; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
1833; GFX10-NEXT:    s_setpc_b64 s[30:31]
1834;
1835; GFX9-PAL-LABEL: store_load_large_imm_offset_foo:
1836; GFX9-PAL:       ; %bb.0: ; %bb
1837; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1838; GFX9-PAL-NEXT:    s_movk_i32 s0, 0x3000
1839; GFX9-PAL-NEXT:    v_mov_b32_e32 v0, 13
1840; GFX9-PAL-NEXT:    scratch_store_dword off, v0, s32
1841; GFX9-PAL-NEXT:    s_add_u32 s0, s32, s0
1842; GFX9-PAL-NEXT:    v_mov_b32_e32 v0, 15
1843; GFX9-PAL-NEXT:    scratch_store_dword off, v0, s0 offset:3712
1844; GFX9-PAL-NEXT:    scratch_load_dword v0, off, s0 offset:3712
1845; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0)
1846; GFX9-PAL-NEXT:    s_setpc_b64 s[30:31]
1847;
1848; GFX10-PAL-LABEL: store_load_large_imm_offset_foo:
1849; GFX10-PAL:       ; %bb.0: ; %bb
1850; GFX10-PAL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1851; GFX10-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
1852; GFX10-PAL-NEXT:    v_mov_b32_e32 v0, 13
1853; GFX10-PAL-NEXT:    v_mov_b32_e32 v1, 15
1854; GFX10-PAL-NEXT:    s_movk_i32 s0, 0x3800
1855; GFX10-PAL-NEXT:    s_add_u32 s0, s32, s0
1856; GFX10-PAL-NEXT:    scratch_store_dword off, v0, s32
1857; GFX10-PAL-NEXT:    scratch_store_dword off, v1, s0 offset:1664
1858; GFX10-PAL-NEXT:    scratch_load_dword v0, off, s0 offset:1664
1859; GFX10-PAL-NEXT:    s_waitcnt vmcnt(0)
1860; GFX10-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
1861; GFX10-PAL-NEXT:    s_setpc_b64 s[30:31]
1862bb:
1863  %i = alloca [4096 x i32], align 4, addrspace(5)
  ; Volatile store of 13 at an undef index of the array.
1864  %i1 = getelementptr inbounds [4096 x i32], [4096 x i32] addrspace(5)* %i, i32 0, i32 undef
1865  store volatile i32 13, i32 addrspace(5)* %i1, align 4
  ; Volatile store of 15 to element 4000, then a volatile load of the same
  ; element through a separately computed (identical) GEP.
1866  %i7 = getelementptr inbounds [4096 x i32], [4096 x i32] addrspace(5)* %i, i32 0, i32 4000
1867  store volatile i32 15, i32 addrspace(5)* %i7, align 4
1868  %i10 = getelementptr inbounds [4096 x i32], [4096 x i32] addrspace(5)* %i, i32 0, i32 4000
1869  %i12 = load volatile i32, i32 addrspace(5)* %i10, align 4
1870  ret void
1871}
1872
; Kernel test whose array index is the sum of three parts: the kernel argument
; %sidx, the workitem id, and the constant 256. The expected code adds the two
; variable parts in a VGPR, scales the sum by 4 on top of a base of 4, and
; carries the constant part as the immediate offset 1024 (256 * 4) on both the
; scratch store and the scratch load.
1873define amdgpu_kernel void @store_load_vidx_sidx_offset(i32 %sidx) {
1874; GFX9-LABEL: store_load_vidx_sidx_offset:
1875; GFX9:       ; %bb.0: ; %bb
1876; GFX9-NEXT:    s_load_dword s0, s[0:1], 0x24
1877; GFX9-NEXT:    s_add_u32 flat_scratch_lo, s2, s5
1878; GFX9-NEXT:    v_mov_b32_e32 v1, 4
1879; GFX9-NEXT:    s_addc_u32 flat_scratch_hi, s3, 0
1880; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
1881; GFX9-NEXT:    v_add_u32_e32 v0, s0, v0
1882; GFX9-NEXT:    v_lshl_add_u32 v0, v0, 2, v1
1883; GFX9-NEXT:    v_mov_b32_e32 v1, 15
1884; GFX9-NEXT:    scratch_store_dword v0, v1, off offset:1024
1885; GFX9-NEXT:    scratch_load_dword v0, v0, off offset:1024
1886; GFX9-NEXT:    s_endpgm
1887;
1888; GFX10-LABEL: store_load_vidx_sidx_offset:
1889; GFX10:       ; %bb.0: ; %bb
1890; GFX10-NEXT:    s_add_u32 s2, s2, s5
1891; GFX10-NEXT:    s_addc_u32 s3, s3, 0
1892; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s2
1893; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s3
1894; GFX10-NEXT:    s_load_dword s0, s[0:1], 0x24
1895; GFX10-NEXT:    v_mov_b32_e32 v1, 15
1896; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
1897; GFX10-NEXT:    v_add_nc_u32_e32 v0, s0, v0
1898; GFX10-NEXT:    v_lshl_add_u32 v0, v0, 2, 4
1899; GFX10-NEXT:    scratch_store_dword v0, v1, off offset:1024
1900; GFX10-NEXT:    scratch_load_dword v0, v0, off offset:1024
1901; GFX10-NEXT:    s_endpgm
1902;
1903; GFX9-PAL-LABEL: store_load_vidx_sidx_offset:
1904; GFX9-PAL:       ; %bb.0: ; %bb
1905; GFX9-PAL-NEXT:    s_getpc_b64 s[4:5]
1906; GFX9-PAL-NEXT:    s_mov_b32 s4, s0
1907; GFX9-PAL-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x0
1908; GFX9-PAL-NEXT:    s_load_dword s0, s[0:1], 0x24
1909; GFX9-PAL-NEXT:    v_mov_b32_e32 v1, 4
1910; GFX9-PAL-NEXT:    s_waitcnt lgkmcnt(0)
1911; GFX9-PAL-NEXT:    s_and_b32 s5, s5, 0xffff
1912; GFX9-PAL-NEXT:    s_add_u32 flat_scratch_lo, s4, s3
1913; GFX9-PAL-NEXT:    v_add_u32_e32 v0, s0, v0
1914; GFX9-PAL-NEXT:    v_lshl_add_u32 v0, v0, 2, v1
1915; GFX9-PAL-NEXT:    s_addc_u32 flat_scratch_hi, s5, 0
1916; GFX9-PAL-NEXT:    v_mov_b32_e32 v1, 15
1917; GFX9-PAL-NEXT:    scratch_store_dword v0, v1, off offset:1024
1918; GFX9-PAL-NEXT:    scratch_load_dword v0, v0, off offset:1024
1919; GFX9-PAL-NEXT:    s_endpgm
1920;
1921; GFX10-PAL-LABEL: store_load_vidx_sidx_offset:
1922; GFX10-PAL:       ; %bb.0: ; %bb
1923; GFX10-PAL-NEXT:    s_getpc_b64 s[4:5]
1924; GFX10-PAL-NEXT:    s_mov_b32 s4, s0
1925; GFX10-PAL-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x0
1926; GFX10-PAL-NEXT:    s_waitcnt lgkmcnt(0)
1927; GFX10-PAL-NEXT:    s_and_b32 s5, s5, 0xffff
1928; GFX10-PAL-NEXT:    s_add_u32 s4, s4, s3
1929; GFX10-PAL-NEXT:    s_addc_u32 s5, s5, 0
1930; GFX10-PAL-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s4
1931; GFX10-PAL-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s5
1932; GFX10-PAL-NEXT:    s_load_dword s0, s[0:1], 0x24
1933; GFX10-PAL-NEXT:    v_mov_b32_e32 v1, 15
1934; GFX10-PAL-NEXT:    s_waitcnt lgkmcnt(0)
1935; GFX10-PAL-NEXT:    v_add_nc_u32_e32 v0, s0, v0
1936; GFX10-PAL-NEXT:    v_lshl_add_u32 v0, v0, 2, 4
1937; GFX10-PAL-NEXT:    scratch_store_dword v0, v1, off offset:1024
1938; GFX10-PAL-NEXT:    scratch_load_dword v0, v0, off offset:1024
1939; GFX10-PAL-NEXT:    s_endpgm
1940bb:
1941  %alloca = alloca [32 x i32], align 4, addrspace(5)
1942  %vidx = tail call i32 @llvm.amdgcn.workitem.id.x()
  ; index = %sidx + workitem id + 256, built as two nsw adds.
1943  %add1 = add nsw i32 %sidx, %vidx
1944  %add2 = add nsw i32 %add1, 256
1945  %gep = getelementptr inbounds [32 x i32], [32 x i32] addrspace(5)* %alloca, i32 0, i32 %add2
  ; Volatile store of 15 and volatile load through the same pointer.
1946  store volatile i32 15, i32 addrspace(5)* %gep, align 4
1947  %load = load volatile i32, i32 addrspace(5)* %gep, align 4
1948  ret void
1949}
1950
; Volatile i64 store of 15 followed by a volatile i64 load through an
; addrspace(5) pointer argument, both marked align 8. Every run configuration
; expects one scratch_store_dwordx2 and one scratch_load_dwordx2 addressed
; by v0.
1951define void @store_load_i64_aligned(i64 addrspace(5)* nocapture %arg) {
1952; GFX9-LABEL: store_load_i64_aligned:
1953; GFX9:       ; %bb.0: ; %bb
1954; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1955; GFX9-NEXT:    v_mov_b32_e32 v1, 15
1956; GFX9-NEXT:    v_mov_b32_e32 v2, 0
1957; GFX9-NEXT:    scratch_store_dwordx2 v0, v[1:2], off
1958; GFX9-NEXT:    scratch_load_dwordx2 v[0:1], v0, off
1959; GFX9-NEXT:    s_waitcnt vmcnt(0)
1960; GFX9-NEXT:    s_setpc_b64 s[30:31]
1961;
1962; GFX10-LABEL: store_load_i64_aligned:
1963; GFX10:       ; %bb.0: ; %bb
1964; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1965; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
1966; GFX10-NEXT:    v_mov_b32_e32 v1, 15
1967; GFX10-NEXT:    v_mov_b32_e32 v2, 0
1968; GFX10-NEXT:    scratch_store_dwordx2 v0, v[1:2], off
1969; GFX10-NEXT:    scratch_load_dwordx2 v[0:1], v0, off
1970; GFX10-NEXT:    s_waitcnt vmcnt(0)
1971; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
1972; GFX10-NEXT:    s_setpc_b64 s[30:31]
1973;
1974; GFX9-PAL-LABEL: store_load_i64_aligned:
1975; GFX9-PAL:       ; %bb.0: ; %bb
1976; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1977; GFX9-PAL-NEXT:    v_mov_b32_e32 v1, 15
1978; GFX9-PAL-NEXT:    v_mov_b32_e32 v2, 0
1979; GFX9-PAL-NEXT:    scratch_store_dwordx2 v0, v[1:2], off
1980; GFX9-PAL-NEXT:    scratch_load_dwordx2 v[0:1], v0, off
1981; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0)
1982; GFX9-PAL-NEXT:    s_setpc_b64 s[30:31]
1983;
1984; GFX10-PAL-LABEL: store_load_i64_aligned:
1985; GFX10-PAL:       ; %bb.0: ; %bb
1986; GFX10-PAL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1987; GFX10-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
1988; GFX10-PAL-NEXT:    v_mov_b32_e32 v1, 15
1989; GFX10-PAL-NEXT:    v_mov_b32_e32 v2, 0
1990; GFX10-PAL-NEXT:    scratch_store_dwordx2 v0, v[1:2], off
1991; GFX10-PAL-NEXT:    scratch_load_dwordx2 v[0:1], v0, off
1992; GFX10-PAL-NEXT:    s_waitcnt vmcnt(0)
1993; GFX10-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
1994; GFX10-PAL-NEXT:    s_setpc_b64 s[30:31]
1995bb:
  ; The loaded value is unused; the accesses are volatile.
1996  store volatile i64 15, i64 addrspace(5)* %arg, align 8
1997  %load = load volatile i64, i64 addrspace(5)* %arg, align 8
1998  ret void
1999}
2000
; Same store/load pair as @store_load_i64_aligned, but both accesses are
; marked align 1. The expected output is still a single scratch_store_dwordx2
; and a single scratch_load_dwordx2 in every run configuration, i.e. the
; access is not split into narrower pieces.
2001define void @store_load_i64_unaligned(i64 addrspace(5)* nocapture %arg) {
2002; GFX9-LABEL: store_load_i64_unaligned:
2003; GFX9:       ; %bb.0: ; %bb
2004; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2005; GFX9-NEXT:    v_mov_b32_e32 v1, 15
2006; GFX9-NEXT:    v_mov_b32_e32 v2, 0
2007; GFX9-NEXT:    scratch_store_dwordx2 v0, v[1:2], off
2008; GFX9-NEXT:    scratch_load_dwordx2 v[0:1], v0, off
2009; GFX9-NEXT:    s_waitcnt vmcnt(0)
2010; GFX9-NEXT:    s_setpc_b64 s[30:31]
2011;
2012; GFX10-LABEL: store_load_i64_unaligned:
2013; GFX10:       ; %bb.0: ; %bb
2014; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2015; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
2016; GFX10-NEXT:    v_mov_b32_e32 v1, 15
2017; GFX10-NEXT:    v_mov_b32_e32 v2, 0
2018; GFX10-NEXT:    scratch_store_dwordx2 v0, v[1:2], off
2019; GFX10-NEXT:    scratch_load_dwordx2 v[0:1], v0, off
2020; GFX10-NEXT:    s_waitcnt vmcnt(0)
2021; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
2022; GFX10-NEXT:    s_setpc_b64 s[30:31]
2023;
2024; GFX9-PAL-LABEL: store_load_i64_unaligned:
2025; GFX9-PAL:       ; %bb.0: ; %bb
2026; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2027; GFX9-PAL-NEXT:    v_mov_b32_e32 v1, 15
2028; GFX9-PAL-NEXT:    v_mov_b32_e32 v2, 0
2029; GFX9-PAL-NEXT:    scratch_store_dwordx2 v0, v[1:2], off
2030; GFX9-PAL-NEXT:    scratch_load_dwordx2 v[0:1], v0, off
2031; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0)
2032; GFX9-PAL-NEXT:    s_setpc_b64 s[30:31]
2033;
2034; GFX10-PAL-LABEL: store_load_i64_unaligned:
2035; GFX10-PAL:       ; %bb.0: ; %bb
2036; GFX10-PAL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2037; GFX10-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
2038; GFX10-PAL-NEXT:    v_mov_b32_e32 v1, 15
2039; GFX10-PAL-NEXT:    v_mov_b32_e32 v2, 0
2040; GFX10-PAL-NEXT:    scratch_store_dwordx2 v0, v[1:2], off
2041; GFX10-PAL-NEXT:    scratch_load_dwordx2 v[0:1], v0, off
2042; GFX10-PAL-NEXT:    s_waitcnt vmcnt(0)
2043; GFX10-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
2044; GFX10-PAL-NEXT:    s_setpc_b64 s[30:31]
2045bb:
  ; align 1 is the only difference from the aligned variant's IR.
2046  store volatile i64 15, i64 addrspace(5)* %arg, align 1
2047  %load = load volatile i64, i64 addrspace(5)* %arg, align 1
2048  ret void
2049}
2050
; Test: volatile <3 x i32> store followed by a volatile <3 x i32> load through
; the addrspace(5) pointer %arg, both marked "align 1".
;
; For every check prefix the expected output is one scratch_store_dwordx3 and
; one scratch_load_dwordx3 addressed by v0; the align-1 vector access is not
; split into narrower pieces. The stored elements 1, 2, 3 are built in v[1:3].
; The GFX10 and GFX10-PAL checks additionally expect
; "s_waitcnt_vscnt null, 0x0" on entry and before the return.
2051define void @store_load_v3i32_unaligned(<3 x i32> addrspace(5)* nocapture %arg) {
2052; GFX9-LABEL: store_load_v3i32_unaligned:
2053; GFX9:       ; %bb.0: ; %bb
2054; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2055; GFX9-NEXT:    v_mov_b32_e32 v1, 1
2056; GFX9-NEXT:    v_mov_b32_e32 v2, 2
2057; GFX9-NEXT:    v_mov_b32_e32 v3, 3
2058; GFX9-NEXT:    scratch_store_dwordx3 v0, v[1:3], off
2059; GFX9-NEXT:    scratch_load_dwordx3 v[0:2], v0, off
2060; GFX9-NEXT:    s_waitcnt vmcnt(0)
2061; GFX9-NEXT:    s_setpc_b64 s[30:31]
2062;
2063; GFX10-LABEL: store_load_v3i32_unaligned:
2064; GFX10:       ; %bb.0: ; %bb
2065; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2066; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
2067; GFX10-NEXT:    v_mov_b32_e32 v1, 1
2068; GFX10-NEXT:    v_mov_b32_e32 v2, 2
2069; GFX10-NEXT:    v_mov_b32_e32 v3, 3
2070; GFX10-NEXT:    scratch_store_dwordx3 v0, v[1:3], off
2071; GFX10-NEXT:    scratch_load_dwordx3 v[0:2], v0, off
2072; GFX10-NEXT:    s_waitcnt vmcnt(0)
2073; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
2074; GFX10-NEXT:    s_setpc_b64 s[30:31]
2075;
2076; GFX9-PAL-LABEL: store_load_v3i32_unaligned:
2077; GFX9-PAL:       ; %bb.0: ; %bb
2078; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2079; GFX9-PAL-NEXT:    v_mov_b32_e32 v1, 1
2080; GFX9-PAL-NEXT:    v_mov_b32_e32 v2, 2
2081; GFX9-PAL-NEXT:    v_mov_b32_e32 v3, 3
2082; GFX9-PAL-NEXT:    scratch_store_dwordx3 v0, v[1:3], off
2083; GFX9-PAL-NEXT:    scratch_load_dwordx3 v[0:2], v0, off
2084; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0)
2085; GFX9-PAL-NEXT:    s_setpc_b64 s[30:31]
2086;
2087; GFX10-PAL-LABEL: store_load_v3i32_unaligned:
2088; GFX10-PAL:       ; %bb.0: ; %bb
2089; GFX10-PAL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2090; GFX10-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
2091; GFX10-PAL-NEXT:    v_mov_b32_e32 v1, 1
2092; GFX10-PAL-NEXT:    v_mov_b32_e32 v2, 2
2093; GFX10-PAL-NEXT:    v_mov_b32_e32 v3, 3
2094; GFX10-PAL-NEXT:    scratch_store_dwordx3 v0, v[1:3], off
2095; GFX10-PAL-NEXT:    scratch_load_dwordx3 v[0:2], v0, off
2096; GFX10-PAL-NEXT:    s_waitcnt vmcnt(0)
2097; GFX10-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
2098; GFX10-PAL-NEXT:    s_setpc_b64 s[30:31]
2099bb:
  ; %load is never used and the function returns void; both accesses are
  ; volatile, so they still appear in the checked output above.
2100  store volatile <3 x i32> <i32 1, i32 2, i32 3>, <3 x i32> addrspace(5)* %arg, align 1
2101  %load = load volatile <3 x i32>, <3 x i32> addrspace(5)* %arg, align 1
2102  ret void
2103}
2104
; Test: volatile <4 x i32> store followed by a volatile <4 x i32> load through
; the addrspace(5) pointer %arg, both marked "align 1".
;
; For every check prefix the expected output is one scratch_store_dwordx4 and
; one scratch_load_dwordx4 addressed by v0; the align-1 vector access is not
; split into narrower pieces. The stored elements 1, 2, 3, 4 are built in
; v[1:4]. The GFX10 and GFX10-PAL checks additionally expect
; "s_waitcnt_vscnt null, 0x0" on entry and before the return.
2105define void @store_load_v4i32_unaligned(<4 x i32> addrspace(5)* nocapture %arg) {
2106; GFX9-LABEL: store_load_v4i32_unaligned:
2107; GFX9:       ; %bb.0: ; %bb
2108; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2109; GFX9-NEXT:    v_mov_b32_e32 v1, 1
2110; GFX9-NEXT:    v_mov_b32_e32 v2, 2
2111; GFX9-NEXT:    v_mov_b32_e32 v3, 3
2112; GFX9-NEXT:    v_mov_b32_e32 v4, 4
2113; GFX9-NEXT:    scratch_store_dwordx4 v0, v[1:4], off
2114; GFX9-NEXT:    scratch_load_dwordx4 v[0:3], v0, off
2115; GFX9-NEXT:    s_waitcnt vmcnt(0)
2116; GFX9-NEXT:    s_setpc_b64 s[30:31]
2117;
2118; GFX10-LABEL: store_load_v4i32_unaligned:
2119; GFX10:       ; %bb.0: ; %bb
2120; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2121; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
2122; GFX10-NEXT:    v_mov_b32_e32 v1, 1
2123; GFX10-NEXT:    v_mov_b32_e32 v2, 2
2124; GFX10-NEXT:    v_mov_b32_e32 v3, 3
2125; GFX10-NEXT:    v_mov_b32_e32 v4, 4
2126; GFX10-NEXT:    scratch_store_dwordx4 v0, v[1:4], off
2127; GFX10-NEXT:    scratch_load_dwordx4 v[0:3], v0, off
2128; GFX10-NEXT:    s_waitcnt vmcnt(0)
2129; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
2130; GFX10-NEXT:    s_setpc_b64 s[30:31]
2131;
2132; GFX9-PAL-LABEL: store_load_v4i32_unaligned:
2133; GFX9-PAL:       ; %bb.0: ; %bb
2134; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2135; GFX9-PAL-NEXT:    v_mov_b32_e32 v1, 1
2136; GFX9-PAL-NEXT:    v_mov_b32_e32 v2, 2
2137; GFX9-PAL-NEXT:    v_mov_b32_e32 v3, 3
2138; GFX9-PAL-NEXT:    v_mov_b32_e32 v4, 4
2139; GFX9-PAL-NEXT:    scratch_store_dwordx4 v0, v[1:4], off
2140; GFX9-PAL-NEXT:    scratch_load_dwordx4 v[0:3], v0, off
2141; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0)
2142; GFX9-PAL-NEXT:    s_setpc_b64 s[30:31]
2143;
2144; GFX10-PAL-LABEL: store_load_v4i32_unaligned:
2145; GFX10-PAL:       ; %bb.0: ; %bb
2146; GFX10-PAL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2147; GFX10-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
2148; GFX10-PAL-NEXT:    v_mov_b32_e32 v1, 1
2149; GFX10-PAL-NEXT:    v_mov_b32_e32 v2, 2
2150; GFX10-PAL-NEXT:    v_mov_b32_e32 v3, 3
2151; GFX10-PAL-NEXT:    v_mov_b32_e32 v4, 4
2152; GFX10-PAL-NEXT:    scratch_store_dwordx4 v0, v[1:4], off
2153; GFX10-PAL-NEXT:    scratch_load_dwordx4 v[0:3], v0, off
2154; GFX10-PAL-NEXT:    s_waitcnt vmcnt(0)
2155; GFX10-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
2156; GFX10-PAL-NEXT:    s_setpc_b64 s[30:31]
2157bb:
  ; %load is never used and the function returns void; both accesses are
  ; volatile, so they still appear in the checked output above.
2158  store volatile <4 x i32> <i32 1, i32 2, i32 3, i32 4>, <4 x i32> addrspace(5)* %arg, align 1
2159  %load = load volatile <4 x i32>, <4 x i32> addrspace(5)* %arg, align 1
2160  ret void
2161}
2162
; Intrinsic declarations. NOTE(review): neither intrinsic is called by the
; store_load_* tests directly above; presumably they are used by tests earlier
; in this file -- confirm before removing.
2163declare void @llvm.memset.p5i8.i64(i8 addrspace(5)* nocapture writeonly, i8, i64, i1 immarg)
2164declare i32 @llvm.amdgcn.workitem.id.x()
2165