1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX9 %s
3; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=hawaii -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX7 %s
4
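5; FIXME: The tahiti (GFX6) invocation below is disabled (its directive is spelled XUN); the reason is not recorded here.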
6; XUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX6 %s
7
; Stores the <3 x i32> kernel argument %x through the addrspace(3) pointer %out
; with no explicit alignment on the store.
; The GFX9 and GFX7 checks both expect one ds_write_b96 covering all three
; dwords; the GFX7 checks additionally expect m0 to be set to -1 beforehand.
8define amdgpu_kernel void @store_lds_v3i32(<3 x i32> addrspace(3)* %out, <3 x i32> %x) {
9; GFX9-LABEL: store_lds_v3i32:
10; GFX9:       ; %bb.0:
11; GFX9-NEXT:    s_load_dword s4, s[0:1], 0x24
12; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x34
13; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
14; GFX9-NEXT:    v_mov_b32_e32 v3, s4
15; GFX9-NEXT:    v_mov_b32_e32 v0, s0
16; GFX9-NEXT:    v_mov_b32_e32 v1, s1
17; GFX9-NEXT:    v_mov_b32_e32 v2, s2
18; GFX9-NEXT:    ds_write_b96 v3, v[0:2]
19; GFX9-NEXT:    s_endpgm
20;
21; GFX7-LABEL: store_lds_v3i32:
22; GFX7:       ; %bb.0:
23; GFX7-NEXT:    s_load_dword s4, s[0:1], 0x9
24; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0xd
25; GFX7-NEXT:    s_mov_b32 m0, -1
26; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
27; GFX7-NEXT:    v_mov_b32_e32 v3, s4
28; GFX7-NEXT:    v_mov_b32_e32 v0, s0
29; GFX7-NEXT:    v_mov_b32_e32 v1, s1
30; GFX7-NEXT:    v_mov_b32_e32 v2, s2
31; GFX7-NEXT:    ds_write_b96 v3, v[0:2]
32; GFX7-NEXT:    s_endpgm
33  store <3 x i32> %x, <3 x i32> addrspace(3)* %out
34  ret void
35}
36
; Stores the <3 x i32> kernel argument %x through the addrspace(3) pointer %out
; with the store marked align 1.
; The GFX9 and GFX7 checks both expect the store to be split into twelve
; ds_write_b8 instructions at byte offsets 0 through 11; the upper bytes of
; each dword are produced with s_lshr_b32 by 8, 16 and 24.
37define amdgpu_kernel void @store_lds_v3i32_align1(<3 x i32> addrspace(3)* %out, <3 x i32> %x) {
38; GFX9-LABEL: store_lds_v3i32_align1:
39; GFX9:       ; %bb.0:
40; GFX9-NEXT:    s_load_dword s4, s[0:1], 0x24
41; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x34
42; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
43; GFX9-NEXT:    v_mov_b32_e32 v1, s4
44; GFX9-NEXT:    v_mov_b32_e32 v0, s0
45; GFX9-NEXT:    s_lshr_b32 s3, s0, 8
46; GFX9-NEXT:    ds_write_b8 v1, v0
47; GFX9-NEXT:    v_mov_b32_e32 v0, s3
48; GFX9-NEXT:    s_lshr_b32 s5, s0, 16
49; GFX9-NEXT:    s_lshr_b32 s6, s0, 24
50; GFX9-NEXT:    ds_write_b8 v1, v0 offset:1
51; GFX9-NEXT:    v_mov_b32_e32 v0, s5
52; GFX9-NEXT:    ds_write_b8 v1, v0 offset:2
53; GFX9-NEXT:    v_mov_b32_e32 v0, s6
54; GFX9-NEXT:    ds_write_b8 v1, v0 offset:3
55; GFX9-NEXT:    v_mov_b32_e32 v0, s1
56; GFX9-NEXT:    s_lshr_b32 s0, s1, 8
57; GFX9-NEXT:    ds_write_b8 v1, v0 offset:4
58; GFX9-NEXT:    v_mov_b32_e32 v0, s0
59; GFX9-NEXT:    s_lshr_b32 s3, s1, 16
60; GFX9-NEXT:    s_lshr_b32 s4, s1, 24
61; GFX9-NEXT:    ds_write_b8 v1, v0 offset:5
62; GFX9-NEXT:    v_mov_b32_e32 v0, s3
63; GFX9-NEXT:    ds_write_b8 v1, v0 offset:6
64; GFX9-NEXT:    v_mov_b32_e32 v0, s4
65; GFX9-NEXT:    ds_write_b8 v1, v0 offset:7
66; GFX9-NEXT:    v_mov_b32_e32 v0, s2
67; GFX9-NEXT:    s_lshr_b32 s0, s2, 8
68; GFX9-NEXT:    ds_write_b8 v1, v0 offset:8
69; GFX9-NEXT:    v_mov_b32_e32 v0, s0
70; GFX9-NEXT:    s_lshr_b32 s1, s2, 16
71; GFX9-NEXT:    ds_write_b8 v1, v0 offset:9
72; GFX9-NEXT:    v_mov_b32_e32 v0, s1
73; GFX9-NEXT:    s_lshr_b32 s3, s2, 24
74; GFX9-NEXT:    ds_write_b8 v1, v0 offset:10
75; GFX9-NEXT:    v_mov_b32_e32 v0, s3
76; GFX9-NEXT:    ds_write_b8 v1, v0 offset:11
77; GFX9-NEXT:    s_endpgm
78;
79; GFX7-LABEL: store_lds_v3i32_align1:
80; GFX7:       ; %bb.0:
81; GFX7-NEXT:    s_load_dword s4, s[0:1], 0x9
82; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0xd
83; GFX7-NEXT:    s_mov_b32 m0, -1
84; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
85; GFX7-NEXT:    v_mov_b32_e32 v1, s4
86; GFX7-NEXT:    v_mov_b32_e32 v0, s0
87; GFX7-NEXT:    s_lshr_b32 s3, s0, 8
88; GFX7-NEXT:    ds_write_b8 v1, v0
89; GFX7-NEXT:    v_mov_b32_e32 v0, s3
90; GFX7-NEXT:    s_lshr_b32 s5, s0, 16
91; GFX7-NEXT:    s_lshr_b32 s6, s0, 24
92; GFX7-NEXT:    ds_write_b8 v1, v0 offset:1
93; GFX7-NEXT:    v_mov_b32_e32 v0, s5
94; GFX7-NEXT:    ds_write_b8 v1, v0 offset:2
95; GFX7-NEXT:    v_mov_b32_e32 v0, s6
96; GFX7-NEXT:    ds_write_b8 v1, v0 offset:3
97; GFX7-NEXT:    v_mov_b32_e32 v0, s1
98; GFX7-NEXT:    s_lshr_b32 s0, s1, 8
99; GFX7-NEXT:    ds_write_b8 v1, v0 offset:4
100; GFX7-NEXT:    v_mov_b32_e32 v0, s0
101; GFX7-NEXT:    s_lshr_b32 s3, s1, 16
102; GFX7-NEXT:    s_lshr_b32 s4, s1, 24
103; GFX7-NEXT:    ds_write_b8 v1, v0 offset:5
104; GFX7-NEXT:    v_mov_b32_e32 v0, s3
105; GFX7-NEXT:    ds_write_b8 v1, v0 offset:6
106; GFX7-NEXT:    v_mov_b32_e32 v0, s4
107; GFX7-NEXT:    ds_write_b8 v1, v0 offset:7
108; GFX7-NEXT:    v_mov_b32_e32 v0, s2
109; GFX7-NEXT:    s_lshr_b32 s0, s2, 8
110; GFX7-NEXT:    ds_write_b8 v1, v0 offset:8
111; GFX7-NEXT:    v_mov_b32_e32 v0, s0
112; GFX7-NEXT:    s_lshr_b32 s1, s2, 16
113; GFX7-NEXT:    ds_write_b8 v1, v0 offset:9
114; GFX7-NEXT:    v_mov_b32_e32 v0, s1
115; GFX7-NEXT:    s_lshr_b32 s3, s2, 24
116; GFX7-NEXT:    ds_write_b8 v1, v0 offset:10
117; GFX7-NEXT:    v_mov_b32_e32 v0, s3
118; GFX7-NEXT:    ds_write_b8 v1, v0 offset:11
119; GFX7-NEXT:    s_endpgm
120  store <3 x i32> %x, <3 x i32> addrspace(3)* %out, align 1
121  ret void
122}
123
; Stores the <3 x i32> kernel argument %x through the addrspace(3) pointer %out
; with the store marked align 2.
; The GFX9 and GFX7 checks both expect the store to be split into six
; ds_write_b16 instructions at byte offsets 0, 2, 4, 6, 8 and 10; the upper
; half of each dword is produced with s_lshr_b32 by 16.
124define amdgpu_kernel void @store_lds_v3i32_align2(<3 x i32> addrspace(3)* %out, <3 x i32> %x) {
125; GFX9-LABEL: store_lds_v3i32_align2:
126; GFX9:       ; %bb.0:
127; GFX9-NEXT:    s_load_dword s4, s[0:1], 0x24
128; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x34
129; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
130; GFX9-NEXT:    v_mov_b32_e32 v1, s4
131; GFX9-NEXT:    v_mov_b32_e32 v0, s0
132; GFX9-NEXT:    s_lshr_b32 s3, s0, 16
133; GFX9-NEXT:    ds_write_b16 v1, v0
134; GFX9-NEXT:    v_mov_b32_e32 v0, s3
135; GFX9-NEXT:    ds_write_b16 v1, v0 offset:2
136; GFX9-NEXT:    v_mov_b32_e32 v0, s1
137; GFX9-NEXT:    s_lshr_b32 s0, s1, 16
138; GFX9-NEXT:    ds_write_b16 v1, v0 offset:4
139; GFX9-NEXT:    v_mov_b32_e32 v0, s0
140; GFX9-NEXT:    ds_write_b16 v1, v0 offset:6
141; GFX9-NEXT:    v_mov_b32_e32 v0, s2
142; GFX9-NEXT:    s_lshr_b32 s0, s2, 16
143; GFX9-NEXT:    ds_write_b16 v1, v0 offset:8
144; GFX9-NEXT:    v_mov_b32_e32 v0, s0
145; GFX9-NEXT:    ds_write_b16 v1, v0 offset:10
146; GFX9-NEXT:    s_endpgm
147;
148; GFX7-LABEL: store_lds_v3i32_align2:
149; GFX7:       ; %bb.0:
150; GFX7-NEXT:    s_load_dword s4, s[0:1], 0x9
151; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0xd
152; GFX7-NEXT:    s_mov_b32 m0, -1
153; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
154; GFX7-NEXT:    v_mov_b32_e32 v1, s4
155; GFX7-NEXT:    v_mov_b32_e32 v0, s0
156; GFX7-NEXT:    s_lshr_b32 s3, s0, 16
157; GFX7-NEXT:    ds_write_b16 v1, v0
158; GFX7-NEXT:    v_mov_b32_e32 v0, s3
159; GFX7-NEXT:    ds_write_b16 v1, v0 offset:2
160; GFX7-NEXT:    v_mov_b32_e32 v0, s1
161; GFX7-NEXT:    s_lshr_b32 s0, s1, 16
162; GFX7-NEXT:    ds_write_b16 v1, v0 offset:4
163; GFX7-NEXT:    v_mov_b32_e32 v0, s0
164; GFX7-NEXT:    ds_write_b16 v1, v0 offset:6
165; GFX7-NEXT:    v_mov_b32_e32 v0, s2
166; GFX7-NEXT:    s_lshr_b32 s0, s2, 16
167; GFX7-NEXT:    ds_write_b16 v1, v0 offset:8
168; GFX7-NEXT:    v_mov_b32_e32 v0, s0
169; GFX7-NEXT:    ds_write_b16 v1, v0 offset:10
170; GFX7-NEXT:    s_endpgm
171  store <3 x i32> %x, <3 x i32> addrspace(3)* %out, align 2
172  ret void
173}
174
; Stores the <3 x i32> kernel argument %x through the addrspace(3) pointer %out
; with the store marked align 4.
; The GFX9 and GFX7 checks both expect a ds_write2_b32 for the first two
; dwords followed by a ds_write_b32 at byte offset 8 for the third.
175define amdgpu_kernel void @store_lds_v3i32_align4(<3 x i32> addrspace(3)* %out, <3 x i32> %x) {
176; GFX9-LABEL: store_lds_v3i32_align4:
177; GFX9:       ; %bb.0:
178; GFX9-NEXT:    s_load_dword s4, s[0:1], 0x24
179; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x34
180; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
181; GFX9-NEXT:    v_mov_b32_e32 v2, s4
182; GFX9-NEXT:    v_mov_b32_e32 v0, s0
183; GFX9-NEXT:    v_mov_b32_e32 v1, s1
184; GFX9-NEXT:    v_mov_b32_e32 v3, s2
185; GFX9-NEXT:    ds_write2_b32 v2, v0, v1 offset1:1
186; GFX9-NEXT:    ds_write_b32 v2, v3 offset:8
187; GFX9-NEXT:    s_endpgm
188;
189; GFX7-LABEL: store_lds_v3i32_align4:
190; GFX7:       ; %bb.0:
191; GFX7-NEXT:    s_load_dword s4, s[0:1], 0x9
192; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0xd
193; GFX7-NEXT:    s_mov_b32 m0, -1
194; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
195; GFX7-NEXT:    v_mov_b32_e32 v2, s4
196; GFX7-NEXT:    v_mov_b32_e32 v0, s0
197; GFX7-NEXT:    v_mov_b32_e32 v1, s1
198; GFX7-NEXT:    v_mov_b32_e32 v3, s2
199; GFX7-NEXT:    ds_write2_b32 v2, v0, v1 offset1:1
200; GFX7-NEXT:    ds_write_b32 v2, v3 offset:8
201; GFX7-NEXT:    s_endpgm
202  store <3 x i32> %x, <3 x i32> addrspace(3)* %out, align 4
203  ret void
204}
205
; Stores the <3 x i32> kernel argument %x through the addrspace(3) pointer %out
; with the store marked align 8.
; The GFX9 and GFX7 checks both expect a ds_write_b64 for the first two
; dwords followed by a ds_write_b32 at byte offset 8 for the third.
206define amdgpu_kernel void @store_lds_v3i32_align8(<3 x i32> addrspace(3)* %out, <3 x i32> %x) {
207; GFX9-LABEL: store_lds_v3i32_align8:
208; GFX9:       ; %bb.0:
209; GFX9-NEXT:    s_load_dword s4, s[0:1], 0x24
210; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x34
211; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
212; GFX9-NEXT:    v_mov_b32_e32 v2, s4
213; GFX9-NEXT:    v_mov_b32_e32 v0, s0
214; GFX9-NEXT:    v_mov_b32_e32 v1, s1
215; GFX9-NEXT:    v_mov_b32_e32 v3, s2
216; GFX9-NEXT:    ds_write_b64 v2, v[0:1]
217; GFX9-NEXT:    ds_write_b32 v2, v3 offset:8
218; GFX9-NEXT:    s_endpgm
219;
220; GFX7-LABEL: store_lds_v3i32_align8:
221; GFX7:       ; %bb.0:
222; GFX7-NEXT:    s_load_dword s4, s[0:1], 0x9
223; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0xd
224; GFX7-NEXT:    s_mov_b32 m0, -1
225; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
226; GFX7-NEXT:    v_mov_b32_e32 v2, s4
227; GFX7-NEXT:    v_mov_b32_e32 v0, s0
228; GFX7-NEXT:    v_mov_b32_e32 v1, s1
229; GFX7-NEXT:    v_mov_b32_e32 v3, s2
230; GFX7-NEXT:    ds_write_b64 v2, v[0:1]
231; GFX7-NEXT:    ds_write_b32 v2, v3 offset:8
232; GFX7-NEXT:    s_endpgm
233  store <3 x i32> %x, <3 x i32> addrspace(3)* %out, align 8
234  ret void
235}
236
; Stores the <3 x i32> kernel argument %x through the addrspace(3) pointer %out
; with the store marked align 16.
; The GFX9 and GFX7 checks both expect one ds_write_b96 covering all three
; dwords, with no splitting into narrower writes.
237define amdgpu_kernel void @store_lds_v3i32_align16(<3 x i32> addrspace(3)* %out, <3 x i32> %x) {
238; GFX9-LABEL: store_lds_v3i32_align16:
239; GFX9:       ; %bb.0:
240; GFX9-NEXT:    s_load_dword s4, s[0:1], 0x24
241; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x34
242; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
243; GFX9-NEXT:    v_mov_b32_e32 v3, s4
244; GFX9-NEXT:    v_mov_b32_e32 v0, s0
245; GFX9-NEXT:    v_mov_b32_e32 v1, s1
246; GFX9-NEXT:    v_mov_b32_e32 v2, s2
247; GFX9-NEXT:    ds_write_b96 v3, v[0:2]
248; GFX9-NEXT:    s_endpgm
249;
250; GFX7-LABEL: store_lds_v3i32_align16:
251; GFX7:       ; %bb.0:
252; GFX7-NEXT:    s_load_dword s4, s[0:1], 0x9
253; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0xd
254; GFX7-NEXT:    s_mov_b32 m0, -1
255; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
256; GFX7-NEXT:    v_mov_b32_e32 v3, s4
257; GFX7-NEXT:    v_mov_b32_e32 v0, s0
258; GFX7-NEXT:    v_mov_b32_e32 v1, s1
259; GFX7-NEXT:    v_mov_b32_e32 v2, s2
260; GFX7-NEXT:    ds_write_b96 v3, v[0:2]
261; GFX7-NEXT:    s_endpgm
262  store <3 x i32> %x, <3 x i32> addrspace(3)* %out, align 16
263  ret void
264}
265