; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX9 %s
; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=hawaii -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX7 %s

; FIXME:
; XUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX6 %s

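; Check selection of <4 x i32> stores to LDS (addrspace(3)) at alignments from
; 1 to 16 bytes. With no explicit alignment the store uses the type's natural
; alignment and selects a single ds_write_b128 on both targets.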
define amdgpu_kernel void @store_lds_v4i32(<4 x i32> addrspace(3)* %out, <4 x i32> %x) {
; GFX9-LABEL: store_lds_v4i32:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_load_dword s4, s[0:1], 0x24
; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x34
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    v_mov_b32_e32 v4, s4
; GFX9-NEXT:    v_mov_b32_e32 v0, s0
; GFX9-NEXT:    v_mov_b32_e32 v1, s1
; GFX9-NEXT:    v_mov_b32_e32 v2, s2
; GFX9-NEXT:    v_mov_b32_e32 v3, s3
; GFX9-NEXT:    ds_write_b128 v4, v[0:3]
; GFX9-NEXT:    s_endpgm
;
; GFX7-LABEL: store_lds_v4i32:
; GFX7:       ; %bb.0:
; GFX7-NEXT:    s_load_dword s4, s[0:1], 0x9
; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0xd
; GFX7-NEXT:    s_mov_b32 m0, -1
; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
; GFX7-NEXT:    v_mov_b32_e32 v4, s4
; GFX7-NEXT:    v_mov_b32_e32 v0, s0
; GFX7-NEXT:    v_mov_b32_e32 v1, s1
; GFX7-NEXT:    v_mov_b32_e32 v2, s2
; GFX7-NEXT:    v_mov_b32_e32 v3, s3
; GFX7-NEXT:    ds_write_b128 v4, v[0:3]
; GFX7-NEXT:    s_endpgm
  store <4 x i32> %x, <4 x i32> addrspace(3)* %out
  ret void
}

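; With 1-byte alignment the store is split into sixteen ds_write_b8 writes,
; with the upper bytes of each element extracted via s_lshr_b32.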
define amdgpu_kernel void @store_lds_v4i32_align1(<4 x i32> addrspace(3)* %out, <4 x i32> %x) {
; GFX9-LABEL: store_lds_v4i32_align1:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_load_dword s4, s[0:1], 0x24
; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x34
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    v_mov_b32_e32 v1, s4
; GFX9-NEXT:    v_mov_b32_e32 v0, s0
; GFX9-NEXT:    s_lshr_b32 s5, s0, 8
; GFX9-NEXT:    ds_write_b8 v1, v0
; GFX9-NEXT:    v_mov_b32_e32 v0, s5
; GFX9-NEXT:    s_lshr_b32 s6, s0, 16
; GFX9-NEXT:    s_lshr_b32 s7, s0, 24
; GFX9-NEXT:    ds_write_b8 v1, v0 offset:1
; GFX9-NEXT:    v_mov_b32_e32 v0, s6
; GFX9-NEXT:    ds_write_b8 v1, v0 offset:2
; GFX9-NEXT:    v_mov_b32_e32 v0, s7
; GFX9-NEXT:    ds_write_b8 v1, v0 offset:3
; GFX9-NEXT:    v_mov_b32_e32 v0, s1
; GFX9-NEXT:    s_lshr_b32 s0, s1, 8
; GFX9-NEXT:    ds_write_b8 v1, v0 offset:4
; GFX9-NEXT:    v_mov_b32_e32 v0, s0
; GFX9-NEXT:    s_lshr_b32 s4, s1, 16
; GFX9-NEXT:    s_lshr_b32 s5, s1, 24
; GFX9-NEXT:    ds_write_b8 v1, v0 offset:5
; GFX9-NEXT:    v_mov_b32_e32 v0, s4
; GFX9-NEXT:    ds_write_b8 v1, v0 offset:6
; GFX9-NEXT:    v_mov_b32_e32 v0, s5
; GFX9-NEXT:    ds_write_b8 v1, v0 offset:7
; GFX9-NEXT:    v_mov_b32_e32 v0, s2
; GFX9-NEXT:    s_lshr_b32 s0, s2, 8
; GFX9-NEXT:    ds_write_b8 v1, v0 offset:8
; GFX9-NEXT:    v_mov_b32_e32 v0, s0
; GFX9-NEXT:    s_lshr_b32 s1, s2, 16
; GFX9-NEXT:    s_lshr_b32 s4, s2, 24
; GFX9-NEXT:    ds_write_b8 v1, v0 offset:9
; GFX9-NEXT:    v_mov_b32_e32 v0, s1
; GFX9-NEXT:    ds_write_b8 v1, v0 offset:10
; GFX9-NEXT:    v_mov_b32_e32 v0, s4
; GFX9-NEXT:    ds_write_b8 v1, v0 offset:11
; GFX9-NEXT:    v_mov_b32_e32 v0, s3
; GFX9-NEXT:    s_lshr_b32 s0, s3, 8
; GFX9-NEXT:    ds_write_b8 v1, v0 offset:12
; GFX9-NEXT:    v_mov_b32_e32 v0, s0
; GFX9-NEXT:    s_lshr_b32 s1, s3, 16
; GFX9-NEXT:    ds_write_b8 v1, v0 offset:13
; GFX9-NEXT:    v_mov_b32_e32 v0, s1
; GFX9-NEXT:    s_lshr_b32 s2, s3, 24
; GFX9-NEXT:    ds_write_b8 v1, v0 offset:14
; GFX9-NEXT:    v_mov_b32_e32 v0, s2
; GFX9-NEXT:    ds_write_b8 v1, v0 offset:15
; GFX9-NEXT:    s_endpgm
;
; GFX7-LABEL: store_lds_v4i32_align1:
; GFX7:       ; %bb.0:
; GFX7-NEXT:    s_load_dword s4, s[0:1], 0x9
; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0xd
; GFX7-NEXT:    s_mov_b32 m0, -1
; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
; GFX7-NEXT:    v_mov_b32_e32 v1, s4
; GFX7-NEXT:    v_mov_b32_e32 v0, s0
; GFX7-NEXT:    s_lshr_b32 s5, s0, 8
; GFX7-NEXT:    ds_write_b8 v1, v0
; GFX7-NEXT:    v_mov_b32_e32 v0, s5
; GFX7-NEXT:    s_lshr_b32 s6, s0, 16
; GFX7-NEXT:    s_lshr_b32 s7, s0, 24
; GFX7-NEXT:    ds_write_b8 v1, v0 offset:1
; GFX7-NEXT:    v_mov_b32_e32 v0, s6
; GFX7-NEXT:    ds_write_b8 v1, v0 offset:2
; GFX7-NEXT:    v_mov_b32_e32 v0, s7
; GFX7-NEXT:    ds_write_b8 v1, v0 offset:3
; GFX7-NEXT:    v_mov_b32_e32 v0, s1
; GFX7-NEXT:    s_lshr_b32 s0, s1, 8
; GFX7-NEXT:    ds_write_b8 v1, v0 offset:4
; GFX7-NEXT:    v_mov_b32_e32 v0, s0
; GFX7-NEXT:    s_lshr_b32 s4, s1, 16
; GFX7-NEXT:    s_lshr_b32 s5, s1, 24
; GFX7-NEXT:    ds_write_b8 v1, v0 offset:5
; GFX7-NEXT:    v_mov_b32_e32 v0, s4
; GFX7-NEXT:    ds_write_b8 v1, v0 offset:6
; GFX7-NEXT:    v_mov_b32_e32 v0, s5
; GFX7-NEXT:    ds_write_b8 v1, v0 offset:7
; GFX7-NEXT:    v_mov_b32_e32 v0, s2
; GFX7-NEXT:    s_lshr_b32 s0, s2, 8
; GFX7-NEXT:    ds_write_b8 v1, v0 offset:8
; GFX7-NEXT:    v_mov_b32_e32 v0, s0
; GFX7-NEXT:    s_lshr_b32 s1, s2, 16
; GFX7-NEXT:    s_lshr_b32 s4, s2, 24
; GFX7-NEXT:    ds_write_b8 v1, v0 offset:9
; GFX7-NEXT:    v_mov_b32_e32 v0, s1
; GFX7-NEXT:    ds_write_b8 v1, v0 offset:10
; GFX7-NEXT:    v_mov_b32_e32 v0, s4
; GFX7-NEXT:    ds_write_b8 v1, v0 offset:11
; GFX7-NEXT:    v_mov_b32_e32 v0, s3
; GFX7-NEXT:    s_lshr_b32 s0, s3, 8
; GFX7-NEXT:    ds_write_b8 v1, v0 offset:12
; GFX7-NEXT:    v_mov_b32_e32 v0, s0
; GFX7-NEXT:    s_lshr_b32 s1, s3, 16
; GFX7-NEXT:    ds_write_b8 v1, v0 offset:13
; GFX7-NEXT:    v_mov_b32_e32 v0, s1
; GFX7-NEXT:    s_lshr_b32 s2, s3, 24
; GFX7-NEXT:    ds_write_b8 v1, v0 offset:14
; GFX7-NEXT:    v_mov_b32_e32 v0, s2
; GFX7-NEXT:    ds_write_b8 v1, v0 offset:15
; GFX7-NEXT:    s_endpgm
  store <4 x i32> %x, <4 x i32> addrspace(3)* %out, align 1
  ret void
}

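; With 2-byte alignment the store is split into eight ds_write_b16 writes.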
define amdgpu_kernel void @store_lds_v4i32_align2(<4 x i32> addrspace(3)* %out, <4 x i32> %x) {
; GFX9-LABEL: store_lds_v4i32_align2:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_load_dword s4, s[0:1], 0x24
; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x34
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    v_mov_b32_e32 v1, s4
; GFX9-NEXT:    v_mov_b32_e32 v0, s0
; GFX9-NEXT:    s_lshr_b32 s5, s0, 16
; GFX9-NEXT:    ds_write_b16 v1, v0
; GFX9-NEXT:    v_mov_b32_e32 v0, s5
; GFX9-NEXT:    ds_write_b16 v1, v0 offset:2
; GFX9-NEXT:    v_mov_b32_e32 v0, s1
; GFX9-NEXT:    s_lshr_b32 s0, s1, 16
; GFX9-NEXT:    ds_write_b16 v1, v0 offset:4
; GFX9-NEXT:    v_mov_b32_e32 v0, s0
; GFX9-NEXT:    ds_write_b16 v1, v0 offset:6
; GFX9-NEXT:    v_mov_b32_e32 v0, s2
; GFX9-NEXT:    s_lshr_b32 s0, s2, 16
; GFX9-NEXT:    ds_write_b16 v1, v0 offset:8
; GFX9-NEXT:    v_mov_b32_e32 v0, s0
; GFX9-NEXT:    ds_write_b16 v1, v0 offset:10
; GFX9-NEXT:    v_mov_b32_e32 v0, s3
; GFX9-NEXT:    s_lshr_b32 s0, s3, 16
; GFX9-NEXT:    ds_write_b16 v1, v0 offset:12
; GFX9-NEXT:    v_mov_b32_e32 v0, s0
; GFX9-NEXT:    ds_write_b16 v1, v0 offset:14
; GFX9-NEXT:    s_endpgm
;
; GFX7-LABEL: store_lds_v4i32_align2:
; GFX7:       ; %bb.0:
; GFX7-NEXT:    s_load_dword s4, s[0:1], 0x9
; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0xd
; GFX7-NEXT:    s_mov_b32 m0, -1
; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
; GFX7-NEXT:    v_mov_b32_e32 v1, s4
; GFX7-NEXT:    v_mov_b32_e32 v0, s0
; GFX7-NEXT:    s_lshr_b32 s5, s0, 16
; GFX7-NEXT:    ds_write_b16 v1, v0
; GFX7-NEXT:    v_mov_b32_e32 v0, s5
; GFX7-NEXT:    ds_write_b16 v1, v0 offset:2
; GFX7-NEXT:    v_mov_b32_e32 v0, s1
; GFX7-NEXT:    s_lshr_b32 s0, s1, 16
; GFX7-NEXT:    ds_write_b16 v1, v0 offset:4
; GFX7-NEXT:    v_mov_b32_e32 v0, s0
; GFX7-NEXT:    ds_write_b16 v1, v0 offset:6
; GFX7-NEXT:    v_mov_b32_e32 v0, s2
; GFX7-NEXT:    s_lshr_b32 s0, s2, 16
; GFX7-NEXT:    ds_write_b16 v1, v0 offset:8
; GFX7-NEXT:    v_mov_b32_e32 v0, s0
; GFX7-NEXT:    ds_write_b16 v1, v0 offset:10
; GFX7-NEXT:    v_mov_b32_e32 v0, s3
; GFX7-NEXT:    s_lshr_b32 s0, s3, 16
; GFX7-NEXT:    ds_write_b16 v1, v0 offset:12
; GFX7-NEXT:    v_mov_b32_e32 v0, s0
; GFX7-NEXT:    ds_write_b16 v1, v0 offset:14
; GFX7-NEXT:    s_endpgm
  store <4 x i32> %x, <4 x i32> addrspace(3)* %out, align 2
  ret void
}

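; With 4-byte alignment the four dwords are combined into two ds_write2_b32
; instructions.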
define amdgpu_kernel void @store_lds_v4i32_align4(<4 x i32> addrspace(3)* %out, <4 x i32> %x) {
; GFX9-LABEL: store_lds_v4i32_align4:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_load_dword s4, s[0:1], 0x24
; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x34
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    v_mov_b32_e32 v1, s4
; GFX9-NEXT:    v_mov_b32_e32 v0, s0
; GFX9-NEXT:    v_mov_b32_e32 v2, s1
; GFX9-NEXT:    ds_write2_b32 v1, v0, v2 offset1:1
; GFX9-NEXT:    v_mov_b32_e32 v3, s2
; GFX9-NEXT:    v_mov_b32_e32 v0, s3
; GFX9-NEXT:    ds_write2_b32 v1, v3, v0 offset0:2 offset1:3
; GFX9-NEXT:    s_endpgm
;
; GFX7-LABEL: store_lds_v4i32_align4:
; GFX7:       ; %bb.0:
; GFX7-NEXT:    s_load_dword s4, s[0:1], 0x9
; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0xd
; GFX7-NEXT:    s_mov_b32 m0, -1
; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
; GFX7-NEXT:    v_mov_b32_e32 v1, s4
; GFX7-NEXT:    v_mov_b32_e32 v0, s0
; GFX7-NEXT:    v_mov_b32_e32 v2, s1
; GFX7-NEXT:    ds_write2_b32 v1, v0, v2 offset1:1
; GFX7-NEXT:    v_mov_b32_e32 v0, s2
; GFX7-NEXT:    v_mov_b32_e32 v2, s3
; GFX7-NEXT:    ds_write2_b32 v1, v0, v2 offset0:2 offset1:3
; GFX7-NEXT:    s_endpgm
  store <4 x i32> %x, <4 x i32> addrspace(3)* %out, align 4
  ret void
}

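; With 8-byte alignment GFX9 still selects a single ds_write_b128, while GFX7
; uses a ds_write2_b64.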
define amdgpu_kernel void @store_lds_v4i32_align8(<4 x i32> addrspace(3)* %out, <4 x i32> %x) {
; GFX9-LABEL: store_lds_v4i32_align8:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_load_dword s4, s[0:1], 0x24
; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x34
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    v_mov_b32_e32 v4, s4
; GFX9-NEXT:    v_mov_b32_e32 v0, s0
; GFX9-NEXT:    v_mov_b32_e32 v1, s1
; GFX9-NEXT:    v_mov_b32_e32 v2, s2
; GFX9-NEXT:    v_mov_b32_e32 v3, s3
; GFX9-NEXT:    ds_write_b128 v4, v[0:3]
; GFX9-NEXT:    s_endpgm
;
; GFX7-LABEL: store_lds_v4i32_align8:
; GFX7:       ; %bb.0:
; GFX7-NEXT:    s_load_dword s4, s[0:1], 0x9
; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0xd
; GFX7-NEXT:    s_mov_b32 m0, -1
; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
; GFX7-NEXT:    v_mov_b32_e32 v4, s4
; GFX7-NEXT:    v_mov_b32_e32 v0, s0
; GFX7-NEXT:    v_mov_b32_e32 v1, s1
; GFX7-NEXT:    v_mov_b32_e32 v2, s2
; GFX7-NEXT:    v_mov_b32_e32 v3, s3
; GFX7-NEXT:    ds_write2_b64 v4, v[0:1], v[2:3] offset1:1
; GFX7-NEXT:    s_endpgm
  store <4 x i32> %x, <4 x i32> addrspace(3)* %out, align 8
  ret void
}

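; With 16-byte alignment both targets select a single ds_write_b128.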
define amdgpu_kernel void @store_lds_v4i32_align16(<4 x i32> addrspace(3)* %out, <4 x i32> %x) {
; GFX9-LABEL: store_lds_v4i32_align16:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_load_dword s4, s[0:1], 0x24
; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x34
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    v_mov_b32_e32 v4, s4
; GFX9-NEXT:    v_mov_b32_e32 v0, s0
; GFX9-NEXT:    v_mov_b32_e32 v1, s1
; GFX9-NEXT:    v_mov_b32_e32 v2, s2
; GFX9-NEXT:    v_mov_b32_e32 v3, s3
; GFX9-NEXT:    ds_write_b128 v4, v[0:3]
; GFX9-NEXT:    s_endpgm
;
; GFX7-LABEL: store_lds_v4i32_align16:
; GFX7:       ; %bb.0:
; GFX7-NEXT:    s_load_dword s4, s[0:1], 0x9
; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0xd
; GFX7-NEXT:    s_mov_b32 m0, -1
; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
; GFX7-NEXT:    v_mov_b32_e32 v4, s4
; GFX7-NEXT:    v_mov_b32_e32 v0, s0
; GFX7-NEXT:    v_mov_b32_e32 v1, s1
; GFX7-NEXT:    v_mov_b32_e32 v2, s2
; GFX7-NEXT:    v_mov_b32_e32 v3, s3
; GFX7-NEXT:    ds_write_b128 v4, v[0:3]
; GFX7-NEXT:    s_endpgm
  store <4 x i32> %x, <4 x i32> addrspace(3)* %out, align 16
  ret void
}