1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX9 %s
3; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=hawaii -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX7 %s
4; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX6 %s
5
; Stores the <4 x i32> kernel argument %x through the addrspace(3) pointer %out
; (LDS, going by the test name) with no explicit alignment on the store.
; Per the LLVM LangRef an omitted alignment means the ABI alignment of the
; stored type; presumably that is 16 bytes here, since the expected output is
; identical to the store_lds_v4i32_align16 test -- TODO confirm against the
; target datalayout.
; Expected selection in the checks below:
;   - gfx900 and hawaii - one ds_write_b128 of v[0:3]
;   - tahiti            - one ds_write2_b64 (offset1:1) covering both halves
; The hawaii and tahiti sequences also initialise m0 to -1 before the LDS
; write; the gfx900 sequence has no m0 write.
6define amdgpu_kernel void @store_lds_v4i32(<4 x i32> addrspace(3)* %out, <4 x i32> %x) {
7; GFX9-LABEL: store_lds_v4i32:
8; GFX9:       ; %bb.0:
9; GFX9-NEXT:    s_load_dword s4, s[0:1], 0x24
10; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x34
11; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
12; GFX9-NEXT:    v_mov_b32_e32 v4, s4
13; GFX9-NEXT:    v_mov_b32_e32 v0, s0
14; GFX9-NEXT:    v_mov_b32_e32 v1, s1
15; GFX9-NEXT:    v_mov_b32_e32 v2, s2
16; GFX9-NEXT:    v_mov_b32_e32 v3, s3
17; GFX9-NEXT:    ds_write_b128 v4, v[0:3]
18; GFX9-NEXT:    s_endpgm
19;
20; GFX7-LABEL: store_lds_v4i32:
21; GFX7:       ; %bb.0:
22; GFX7-NEXT:    s_load_dword s4, s[0:1], 0x9
23; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0xd
24; GFX7-NEXT:    s_mov_b32 m0, -1
25; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
26; GFX7-NEXT:    v_mov_b32_e32 v4, s4
27; GFX7-NEXT:    v_mov_b32_e32 v0, s0
28; GFX7-NEXT:    v_mov_b32_e32 v1, s1
29; GFX7-NEXT:    v_mov_b32_e32 v2, s2
30; GFX7-NEXT:    v_mov_b32_e32 v3, s3
31; GFX7-NEXT:    ds_write_b128 v4, v[0:3]
32; GFX7-NEXT:    s_endpgm
33;
34; GFX6-LABEL: store_lds_v4i32:
35; GFX6:       ; %bb.0:
36; GFX6-NEXT:    s_load_dword s4, s[0:1], 0x9
37; GFX6-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0xd
38; GFX6-NEXT:    s_mov_b32 m0, -1
39; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
40; GFX6-NEXT:    v_mov_b32_e32 v4, s4
41; GFX6-NEXT:    v_mov_b32_e32 v0, s2
42; GFX6-NEXT:    v_mov_b32_e32 v1, s3
43; GFX6-NEXT:    v_mov_b32_e32 v2, s0
44; GFX6-NEXT:    v_mov_b32_e32 v3, s1
45; GFX6-NEXT:    ds_write2_b64 v4, v[2:3], v[0:1] offset1:1
46; GFX6-NEXT:    s_endpgm
; Store under test - deliberately carries no align attribute.
47  store <4 x i32> %x, <4 x i32> addrspace(3)* %out
48  ret void
49}
50
; Stores the <4 x i32> kernel argument %x through the addrspace(3) pointer %out
; (LDS, going by the test name) with the store marked align 1.
; Expected selection in the checks below - the 16-byte store is split into
; sixteen single-byte ds_write_b8-class writes on every target:
;   - gfx900 - byte 0 of each dword is written straight from the dword,
;              byte 2 uses ds_write_b8_d16_hi, and bytes 1 and 3 come from
;              scalar right shifts by 8 and 24
;   - hawaii and tahiti - no d16_hi form appears; bytes 1, 2 and 3 of each
;              dword come from scalar right shifts by 8, 16 and 24
; The hawaii and tahiti sequences also initialise m0 to -1 before the LDS
; writes; the gfx900 sequence has no m0 write.
51define amdgpu_kernel void @store_lds_v4i32_align1(<4 x i32> addrspace(3)* %out, <4 x i32> %x) {
52; GFX9-LABEL: store_lds_v4i32_align1:
53; GFX9:       ; %bb.0:
54; GFX9-NEXT:    s_load_dword s4, s[0:1], 0x24
55; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x34
56; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
57; GFX9-NEXT:    v_mov_b32_e32 v0, s4
58; GFX9-NEXT:    v_mov_b32_e32 v1, s3
59; GFX9-NEXT:    v_mov_b32_e32 v2, s2
60; GFX9-NEXT:    ds_write_b8 v0, v1 offset:12
61; GFX9-NEXT:    ds_write_b8_d16_hi v0, v1 offset:14
62; GFX9-NEXT:    ds_write_b8 v0, v2 offset:8
63; GFX9-NEXT:    ds_write_b8_d16_hi v0, v2 offset:10
64; GFX9-NEXT:    v_mov_b32_e32 v1, s1
65; GFX9-NEXT:    ds_write_b8 v0, v1 offset:4
66; GFX9-NEXT:    ds_write_b8_d16_hi v0, v1 offset:6
67; GFX9-NEXT:    v_mov_b32_e32 v1, s0
68; GFX9-NEXT:    s_lshr_b32 s4, s3, 8
69; GFX9-NEXT:    ds_write_b8 v0, v1
70; GFX9-NEXT:    ds_write_b8_d16_hi v0, v1 offset:2
71; GFX9-NEXT:    v_mov_b32_e32 v1, s4
72; GFX9-NEXT:    s_lshr_b32 s3, s3, 24
73; GFX9-NEXT:    ds_write_b8 v0, v1 offset:13
74; GFX9-NEXT:    v_mov_b32_e32 v1, s3
75; GFX9-NEXT:    s_lshr_b32 s3, s2, 8
76; GFX9-NEXT:    ds_write_b8 v0, v1 offset:15
77; GFX9-NEXT:    v_mov_b32_e32 v1, s3
78; GFX9-NEXT:    s_lshr_b32 s2, s2, 24
79; GFX9-NEXT:    ds_write_b8 v0, v1 offset:9
80; GFX9-NEXT:    v_mov_b32_e32 v1, s2
81; GFX9-NEXT:    s_lshr_b32 s2, s1, 8
82; GFX9-NEXT:    ds_write_b8 v0, v1 offset:11
83; GFX9-NEXT:    v_mov_b32_e32 v1, s2
84; GFX9-NEXT:    s_lshr_b32 s1, s1, 24
85; GFX9-NEXT:    ds_write_b8 v0, v1 offset:5
86; GFX9-NEXT:    v_mov_b32_e32 v1, s1
87; GFX9-NEXT:    s_lshr_b32 s1, s0, 8
88; GFX9-NEXT:    ds_write_b8 v0, v1 offset:7
89; GFX9-NEXT:    v_mov_b32_e32 v1, s1
90; GFX9-NEXT:    s_lshr_b32 s0, s0, 24
91; GFX9-NEXT:    ds_write_b8 v0, v1 offset:1
92; GFX9-NEXT:    v_mov_b32_e32 v1, s0
93; GFX9-NEXT:    ds_write_b8 v0, v1 offset:3
94; GFX9-NEXT:    s_endpgm
95;
96; GFX7-LABEL: store_lds_v4i32_align1:
97; GFX7:       ; %bb.0:
98; GFX7-NEXT:    s_load_dword s4, s[0:1], 0x9
99; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0xd
100; GFX7-NEXT:    s_mov_b32 m0, -1
101; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
102; GFX7-NEXT:    v_mov_b32_e32 v0, s4
103; GFX7-NEXT:    v_mov_b32_e32 v1, s3
104; GFX7-NEXT:    v_mov_b32_e32 v2, s2
105; GFX7-NEXT:    ds_write_b8 v0, v1 offset:12
106; GFX7-NEXT:    ds_write_b8 v0, v2 offset:8
107; GFX7-NEXT:    v_mov_b32_e32 v1, s1
108; GFX7-NEXT:    ds_write_b8 v0, v1 offset:4
109; GFX7-NEXT:    v_mov_b32_e32 v1, s0
110; GFX7-NEXT:    s_lshr_b32 s4, s3, 8
111; GFX7-NEXT:    ds_write_b8 v0, v1
112; GFX7-NEXT:    v_mov_b32_e32 v1, s4
113; GFX7-NEXT:    s_lshr_b32 s4, s3, 24
114; GFX7-NEXT:    ds_write_b8 v0, v1 offset:13
115; GFX7-NEXT:    v_mov_b32_e32 v1, s4
116; GFX7-NEXT:    s_lshr_b32 s3, s3, 16
117; GFX7-NEXT:    ds_write_b8 v0, v1 offset:15
118; GFX7-NEXT:    v_mov_b32_e32 v1, s3
119; GFX7-NEXT:    s_lshr_b32 s3, s2, 8
120; GFX7-NEXT:    ds_write_b8 v0, v1 offset:14
121; GFX7-NEXT:    v_mov_b32_e32 v1, s3
122; GFX7-NEXT:    s_lshr_b32 s3, s2, 24
123; GFX7-NEXT:    ds_write_b8 v0, v1 offset:9
124; GFX7-NEXT:    v_mov_b32_e32 v1, s3
125; GFX7-NEXT:    s_lshr_b32 s2, s2, 16
126; GFX7-NEXT:    ds_write_b8 v0, v1 offset:11
127; GFX7-NEXT:    v_mov_b32_e32 v1, s2
128; GFX7-NEXT:    s_lshr_b32 s2, s1, 8
129; GFX7-NEXT:    ds_write_b8 v0, v1 offset:10
130; GFX7-NEXT:    v_mov_b32_e32 v1, s2
131; GFX7-NEXT:    s_lshr_b32 s2, s1, 24
132; GFX7-NEXT:    ds_write_b8 v0, v1 offset:5
133; GFX7-NEXT:    v_mov_b32_e32 v1, s2
134; GFX7-NEXT:    s_lshr_b32 s1, s1, 16
135; GFX7-NEXT:    ds_write_b8 v0, v1 offset:7
136; GFX7-NEXT:    v_mov_b32_e32 v1, s1
137; GFX7-NEXT:    s_lshr_b32 s1, s0, 8
138; GFX7-NEXT:    ds_write_b8 v0, v1 offset:6
139; GFX7-NEXT:    v_mov_b32_e32 v1, s1
140; GFX7-NEXT:    s_lshr_b32 s1, s0, 24
141; GFX7-NEXT:    ds_write_b8 v0, v1 offset:1
142; GFX7-NEXT:    v_mov_b32_e32 v1, s1
143; GFX7-NEXT:    s_lshr_b32 s0, s0, 16
144; GFX7-NEXT:    ds_write_b8 v0, v1 offset:3
145; GFX7-NEXT:    v_mov_b32_e32 v1, s0
146; GFX7-NEXT:    ds_write_b8 v0, v1 offset:2
147; GFX7-NEXT:    s_endpgm
148;
149; GFX6-LABEL: store_lds_v4i32_align1:
150; GFX6:       ; %bb.0:
151; GFX6-NEXT:    s_load_dword s4, s[0:1], 0x9
152; GFX6-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0xd
153; GFX6-NEXT:    s_mov_b32 m0, -1
154; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
155; GFX6-NEXT:    v_mov_b32_e32 v0, s4
156; GFX6-NEXT:    v_mov_b32_e32 v1, s3
157; GFX6-NEXT:    v_mov_b32_e32 v2, s2
158; GFX6-NEXT:    ds_write_b8 v0, v1 offset:12
159; GFX6-NEXT:    ds_write_b8 v0, v2 offset:8
160; GFX6-NEXT:    v_mov_b32_e32 v1, s1
161; GFX6-NEXT:    ds_write_b8 v0, v1 offset:4
162; GFX6-NEXT:    v_mov_b32_e32 v1, s0
163; GFX6-NEXT:    s_lshr_b32 s4, s3, 8
164; GFX6-NEXT:    ds_write_b8 v0, v1
165; GFX6-NEXT:    v_mov_b32_e32 v1, s4
166; GFX6-NEXT:    s_lshr_b32 s4, s3, 24
167; GFX6-NEXT:    ds_write_b8 v0, v1 offset:13
168; GFX6-NEXT:    v_mov_b32_e32 v1, s4
169; GFX6-NEXT:    s_lshr_b32 s3, s3, 16
170; GFX6-NEXT:    ds_write_b8 v0, v1 offset:15
171; GFX6-NEXT:    v_mov_b32_e32 v1, s3
172; GFX6-NEXT:    s_lshr_b32 s3, s2, 8
173; GFX6-NEXT:    ds_write_b8 v0, v1 offset:14
174; GFX6-NEXT:    v_mov_b32_e32 v1, s3
175; GFX6-NEXT:    s_lshr_b32 s3, s2, 24
176; GFX6-NEXT:    ds_write_b8 v0, v1 offset:9
177; GFX6-NEXT:    v_mov_b32_e32 v1, s3
178; GFX6-NEXT:    s_lshr_b32 s2, s2, 16
179; GFX6-NEXT:    ds_write_b8 v0, v1 offset:11
180; GFX6-NEXT:    v_mov_b32_e32 v1, s2
181; GFX6-NEXT:    s_lshr_b32 s2, s1, 8
182; GFX6-NEXT:    ds_write_b8 v0, v1 offset:10
183; GFX6-NEXT:    v_mov_b32_e32 v1, s2
184; GFX6-NEXT:    s_lshr_b32 s2, s1, 24
185; GFX6-NEXT:    ds_write_b8 v0, v1 offset:5
186; GFX6-NEXT:    v_mov_b32_e32 v1, s2
187; GFX6-NEXT:    s_lshr_b32 s1, s1, 16
188; GFX6-NEXT:    ds_write_b8 v0, v1 offset:7
189; GFX6-NEXT:    v_mov_b32_e32 v1, s1
190; GFX6-NEXT:    s_lshr_b32 s1, s0, 8
191; GFX6-NEXT:    ds_write_b8 v0, v1 offset:6
192; GFX6-NEXT:    v_mov_b32_e32 v1, s1
193; GFX6-NEXT:    s_lshr_b32 s1, s0, 24
194; GFX6-NEXT:    ds_write_b8 v0, v1 offset:1
195; GFX6-NEXT:    v_mov_b32_e32 v1, s1
196; GFX6-NEXT:    s_lshr_b32 s0, s0, 16
197; GFX6-NEXT:    ds_write_b8 v0, v1 offset:3
198; GFX6-NEXT:    v_mov_b32_e32 v1, s0
199; GFX6-NEXT:    ds_write_b8 v0, v1 offset:2
200; GFX6-NEXT:    s_endpgm
; Store under test - byte alignment only.
201  store <4 x i32> %x, <4 x i32> addrspace(3)* %out, align 1
202  ret void
203}
204
; Stores the <4 x i32> kernel argument %x through the addrspace(3) pointer %out
; (LDS, going by the test name) with the store marked align 2.
; Expected selection in the checks below - the 16-byte store is split into
; eight 16-bit writes on every target:
;   - gfx900 - each dword is written with a ds_write_b16 for the low half and
;              a ds_write_b16_d16_hi for the high half, with no scalar shifts
;   - hawaii and tahiti - only plain ds_write_b16 appears; the high half of
;              each dword comes from a scalar right shift by 16
; The hawaii and tahiti sequences also initialise m0 to -1 before the LDS
; writes; the gfx900 sequence has no m0 write.
205define amdgpu_kernel void @store_lds_v4i32_align2(<4 x i32> addrspace(3)* %out, <4 x i32> %x) {
206; GFX9-LABEL: store_lds_v4i32_align2:
207; GFX9:       ; %bb.0:
208; GFX9-NEXT:    s_load_dword s4, s[0:1], 0x24
209; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x34
210; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
211; GFX9-NEXT:    v_mov_b32_e32 v0, s4
212; GFX9-NEXT:    v_mov_b32_e32 v1, s3
213; GFX9-NEXT:    v_mov_b32_e32 v2, s2
214; GFX9-NEXT:    ds_write_b16 v0, v1 offset:12
215; GFX9-NEXT:    ds_write_b16_d16_hi v0, v1 offset:14
216; GFX9-NEXT:    ds_write_b16 v0, v2 offset:8
217; GFX9-NEXT:    ds_write_b16_d16_hi v0, v2 offset:10
218; GFX9-NEXT:    v_mov_b32_e32 v1, s1
219; GFX9-NEXT:    ds_write_b16 v0, v1 offset:4
220; GFX9-NEXT:    ds_write_b16_d16_hi v0, v1 offset:6
221; GFX9-NEXT:    v_mov_b32_e32 v1, s0
222; GFX9-NEXT:    ds_write_b16 v0, v1
223; GFX9-NEXT:    ds_write_b16_d16_hi v0, v1 offset:2
224; GFX9-NEXT:    s_endpgm
225;
226; GFX7-LABEL: store_lds_v4i32_align2:
227; GFX7:       ; %bb.0:
228; GFX7-NEXT:    s_load_dword s4, s[0:1], 0x9
229; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0xd
230; GFX7-NEXT:    s_mov_b32 m0, -1
231; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
232; GFX7-NEXT:    v_mov_b32_e32 v0, s4
233; GFX7-NEXT:    v_mov_b32_e32 v1, s3
234; GFX7-NEXT:    v_mov_b32_e32 v2, s2
235; GFX7-NEXT:    ds_write_b16 v0, v1 offset:12
236; GFX7-NEXT:    ds_write_b16 v0, v2 offset:8
237; GFX7-NEXT:    v_mov_b32_e32 v1, s1
238; GFX7-NEXT:    ds_write_b16 v0, v1 offset:4
239; GFX7-NEXT:    v_mov_b32_e32 v1, s0
240; GFX7-NEXT:    s_lshr_b32 s3, s3, 16
241; GFX7-NEXT:    ds_write_b16 v0, v1
242; GFX7-NEXT:    v_mov_b32_e32 v1, s3
243; GFX7-NEXT:    s_lshr_b32 s2, s2, 16
244; GFX7-NEXT:    ds_write_b16 v0, v1 offset:14
245; GFX7-NEXT:    v_mov_b32_e32 v1, s2
246; GFX7-NEXT:    s_lshr_b32 s1, s1, 16
247; GFX7-NEXT:    ds_write_b16 v0, v1 offset:10
248; GFX7-NEXT:    v_mov_b32_e32 v1, s1
249; GFX7-NEXT:    s_lshr_b32 s0, s0, 16
250; GFX7-NEXT:    ds_write_b16 v0, v1 offset:6
251; GFX7-NEXT:    v_mov_b32_e32 v1, s0
252; GFX7-NEXT:    ds_write_b16 v0, v1 offset:2
253; GFX7-NEXT:    s_endpgm
254;
255; GFX6-LABEL: store_lds_v4i32_align2:
256; GFX6:       ; %bb.0:
257; GFX6-NEXT:    s_load_dword s4, s[0:1], 0x9
258; GFX6-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0xd
259; GFX6-NEXT:    s_mov_b32 m0, -1
260; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
261; GFX6-NEXT:    v_mov_b32_e32 v0, s4
262; GFX6-NEXT:    v_mov_b32_e32 v1, s3
263; GFX6-NEXT:    v_mov_b32_e32 v2, s2
264; GFX6-NEXT:    ds_write_b16 v0, v1 offset:12
265; GFX6-NEXT:    ds_write_b16 v0, v2 offset:8
266; GFX6-NEXT:    v_mov_b32_e32 v1, s1
267; GFX6-NEXT:    ds_write_b16 v0, v1 offset:4
268; GFX6-NEXT:    v_mov_b32_e32 v1, s0
269; GFX6-NEXT:    s_lshr_b32 s3, s3, 16
270; GFX6-NEXT:    ds_write_b16 v0, v1
271; GFX6-NEXT:    v_mov_b32_e32 v1, s3
272; GFX6-NEXT:    s_lshr_b32 s2, s2, 16
273; GFX6-NEXT:    ds_write_b16 v0, v1 offset:14
274; GFX6-NEXT:    v_mov_b32_e32 v1, s2
275; GFX6-NEXT:    s_lshr_b32 s1, s1, 16
276; GFX6-NEXT:    ds_write_b16 v0, v1 offset:10
277; GFX6-NEXT:    v_mov_b32_e32 v1, s1
278; GFX6-NEXT:    s_lshr_b32 s0, s0, 16
279; GFX6-NEXT:    ds_write_b16 v0, v1 offset:6
280; GFX6-NEXT:    v_mov_b32_e32 v1, s0
281; GFX6-NEXT:    ds_write_b16 v0, v1 offset:2
282; GFX6-NEXT:    s_endpgm
; Store under test - 2-byte alignment only.
283  store <4 x i32> %x, <4 x i32> addrspace(3)* %out, align 2
284  ret void
285}
286
; Stores the <4 x i32> kernel argument %x through the addrspace(3) pointer %out
; (LDS, going by the test name) with the store marked align 4.
; Expected selection in the checks below - all three targets emit two
; ds_write2_b32 instructions, the first with offset1:1 and the second with
; offset0:2 offset1:3. The targets differ only in which VGPRs carry the
; data and the order of the preceding v_mov_b32 copies.
; The hawaii and tahiti sequences also initialise m0 to -1 before the LDS
; writes; the gfx900 sequence has no m0 write.
287define amdgpu_kernel void @store_lds_v4i32_align4(<4 x i32> addrspace(3)* %out, <4 x i32> %x) {
288; GFX9-LABEL: store_lds_v4i32_align4:
289; GFX9:       ; %bb.0:
290; GFX9-NEXT:    s_load_dword s4, s[0:1], 0x24
291; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x34
292; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
293; GFX9-NEXT:    v_mov_b32_e32 v0, s4
294; GFX9-NEXT:    v_mov_b32_e32 v1, s0
295; GFX9-NEXT:    v_mov_b32_e32 v2, s1
296; GFX9-NEXT:    ds_write2_b32 v0, v1, v2 offset1:1
297; GFX9-NEXT:    v_mov_b32_e32 v3, s2
298; GFX9-NEXT:    v_mov_b32_e32 v1, s3
299; GFX9-NEXT:    ds_write2_b32 v0, v3, v1 offset0:2 offset1:3
300; GFX9-NEXT:    s_endpgm
301;
302; GFX7-LABEL: store_lds_v4i32_align4:
303; GFX7:       ; %bb.0:
304; GFX7-NEXT:    s_load_dword s4, s[0:1], 0x9
305; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0xd
306; GFX7-NEXT:    s_mov_b32 m0, -1
307; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
308; GFX7-NEXT:    v_mov_b32_e32 v0, s4
309; GFX7-NEXT:    v_mov_b32_e32 v1, s0
310; GFX7-NEXT:    v_mov_b32_e32 v2, s1
311; GFX7-NEXT:    ds_write2_b32 v0, v1, v2 offset1:1
312; GFX7-NEXT:    v_mov_b32_e32 v1, s2
313; GFX7-NEXT:    v_mov_b32_e32 v2, s3
314; GFX7-NEXT:    ds_write2_b32 v0, v1, v2 offset0:2 offset1:3
315; GFX7-NEXT:    s_endpgm
316;
317; GFX6-LABEL: store_lds_v4i32_align4:
318; GFX6:       ; %bb.0:
319; GFX6-NEXT:    s_load_dword s4, s[0:1], 0x9
320; GFX6-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0xd
321; GFX6-NEXT:    s_mov_b32 m0, -1
322; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
323; GFX6-NEXT:    v_mov_b32_e32 v0, s4
324; GFX6-NEXT:    v_mov_b32_e32 v1, s1
325; GFX6-NEXT:    v_mov_b32_e32 v2, s0
326; GFX6-NEXT:    ds_write2_b32 v0, v2, v1 offset1:1
327; GFX6-NEXT:    v_mov_b32_e32 v1, s3
328; GFX6-NEXT:    v_mov_b32_e32 v2, s2
329; GFX6-NEXT:    ds_write2_b32 v0, v2, v1 offset0:2 offset1:3
330; GFX6-NEXT:    s_endpgm
; Store under test - dword alignment only.
331  store <4 x i32> %x, <4 x i32> addrspace(3)* %out, align 4
332  ret void
333}
334
; Stores the <4 x i32> kernel argument %x through the addrspace(3) pointer %out
; (LDS, going by the test name) with the store marked align 8.
; Expected selection in the checks below:
;   - gfx900            - one ds_write_b128 of v[0:3]
;   - hawaii and tahiti - one ds_write2_b64 (offset1:1) covering both halves
; This is the only test in the file where the hawaii expectation differs from
; the gfx900 one in the width of the LDS write.
; The hawaii and tahiti sequences also initialise m0 to -1 before the LDS
; write; the gfx900 sequence has no m0 write.
335define amdgpu_kernel void @store_lds_v4i32_align8(<4 x i32> addrspace(3)* %out, <4 x i32> %x) {
336; GFX9-LABEL: store_lds_v4i32_align8:
337; GFX9:       ; %bb.0:
338; GFX9-NEXT:    s_load_dword s4, s[0:1], 0x24
339; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x34
340; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
341; GFX9-NEXT:    v_mov_b32_e32 v4, s4
342; GFX9-NEXT:    v_mov_b32_e32 v0, s0
343; GFX9-NEXT:    v_mov_b32_e32 v1, s1
344; GFX9-NEXT:    v_mov_b32_e32 v2, s2
345; GFX9-NEXT:    v_mov_b32_e32 v3, s3
346; GFX9-NEXT:    ds_write_b128 v4, v[0:3]
347; GFX9-NEXT:    s_endpgm
348;
349; GFX7-LABEL: store_lds_v4i32_align8:
350; GFX7:       ; %bb.0:
351; GFX7-NEXT:    s_load_dword s4, s[0:1], 0x9
352; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0xd
353; GFX7-NEXT:    s_mov_b32 m0, -1
354; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
355; GFX7-NEXT:    v_mov_b32_e32 v4, s4
356; GFX7-NEXT:    v_mov_b32_e32 v0, s0
357; GFX7-NEXT:    v_mov_b32_e32 v2, s2
358; GFX7-NEXT:    v_mov_b32_e32 v1, s1
359; GFX7-NEXT:    v_mov_b32_e32 v3, s3
360; GFX7-NEXT:    ds_write2_b64 v4, v[0:1], v[2:3] offset1:1
361; GFX7-NEXT:    s_endpgm
362;
363; GFX6-LABEL: store_lds_v4i32_align8:
364; GFX6:       ; %bb.0:
365; GFX6-NEXT:    s_load_dword s4, s[0:1], 0x9
366; GFX6-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0xd
367; GFX6-NEXT:    s_mov_b32 m0, -1
368; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
369; GFX6-NEXT:    v_mov_b32_e32 v4, s4
370; GFX6-NEXT:    v_mov_b32_e32 v0, s2
371; GFX6-NEXT:    v_mov_b32_e32 v1, s3
372; GFX6-NEXT:    v_mov_b32_e32 v2, s0
373; GFX6-NEXT:    v_mov_b32_e32 v3, s1
374; GFX6-NEXT:    ds_write2_b64 v4, v[2:3], v[0:1] offset1:1
375; GFX6-NEXT:    s_endpgm
; Store under test - 8-byte alignment only.
376  store <4 x i32> %x, <4 x i32> addrspace(3)* %out, align 8
377  ret void
378}
379
; Stores the <4 x i32> kernel argument %x through the addrspace(3) pointer %out
; (LDS, going by the test name) with the store marked align 16.
; Expected selection in the checks below:
;   - gfx900 and hawaii - one ds_write_b128 of v[0:3]
;   - tahiti            - one ds_write2_b64 (offset1:1) covering both halves
; These expectations are identical to the ones for store_lds_v4i32, which
; carries no explicit alignment.
; The hawaii and tahiti sequences also initialise m0 to -1 before the LDS
; write; the gfx900 sequence has no m0 write.
380define amdgpu_kernel void @store_lds_v4i32_align16(<4 x i32> addrspace(3)* %out, <4 x i32> %x) {
381; GFX9-LABEL: store_lds_v4i32_align16:
382; GFX9:       ; %bb.0:
383; GFX9-NEXT:    s_load_dword s4, s[0:1], 0x24
384; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x34
385; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
386; GFX9-NEXT:    v_mov_b32_e32 v4, s4
387; GFX9-NEXT:    v_mov_b32_e32 v0, s0
388; GFX9-NEXT:    v_mov_b32_e32 v1, s1
389; GFX9-NEXT:    v_mov_b32_e32 v2, s2
390; GFX9-NEXT:    v_mov_b32_e32 v3, s3
391; GFX9-NEXT:    ds_write_b128 v4, v[0:3]
392; GFX9-NEXT:    s_endpgm
393;
394; GFX7-LABEL: store_lds_v4i32_align16:
395; GFX7:       ; %bb.0:
396; GFX7-NEXT:    s_load_dword s4, s[0:1], 0x9
397; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0xd
398; GFX7-NEXT:    s_mov_b32 m0, -1
399; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
400; GFX7-NEXT:    v_mov_b32_e32 v4, s4
401; GFX7-NEXT:    v_mov_b32_e32 v0, s0
402; GFX7-NEXT:    v_mov_b32_e32 v1, s1
403; GFX7-NEXT:    v_mov_b32_e32 v2, s2
404; GFX7-NEXT:    v_mov_b32_e32 v3, s3
405; GFX7-NEXT:    ds_write_b128 v4, v[0:3]
406; GFX7-NEXT:    s_endpgm
407;
408; GFX6-LABEL: store_lds_v4i32_align16:
409; GFX6:       ; %bb.0:
410; GFX6-NEXT:    s_load_dword s4, s[0:1], 0x9
411; GFX6-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0xd
412; GFX6-NEXT:    s_mov_b32 m0, -1
413; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
414; GFX6-NEXT:    v_mov_b32_e32 v4, s4
415; GFX6-NEXT:    v_mov_b32_e32 v0, s2
416; GFX6-NEXT:    v_mov_b32_e32 v1, s3
417; GFX6-NEXT:    v_mov_b32_e32 v2, s0
418; GFX6-NEXT:    v_mov_b32_e32 v3, s1
419; GFX6-NEXT:    ds_write2_b64 v4, v[2:3], v[0:1] offset1:1
420; GFX6-NEXT:    s_endpgm
; Store under test - full 16-byte alignment.
421  store <4 x i32> %x, <4 x i32> addrspace(3)* %out, align 16
422  ret void
423}
424