1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX9 %s
3; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=hawaii -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX7 %s
4; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX6 %s
5
; Loads a <3 x i32> through an addrspace(3) pointer with no explicit alignment
; on the load instruction, and returns the loaded vector.
; Expected code, per the RUN lines (gfx900, hawaii, tahiti):
;   - gfx900 and hawaii use a single ds_read_b96.
;   - tahiti splits the access into a ds_read_b64 at the base address plus a
;     ds_read_b32 at base + 8 (address formed with v_add_i32).
;   - hawaii and tahiti set m0 to -1 before the DS reads; gfx900 does not.
6define <3 x i32> @load_lds_v3i32(<3 x i32> addrspace(3)* %ptr) {
7; GFX9-LABEL: load_lds_v3i32:
8; GFX9:       ; %bb.0:
9; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
10; GFX9-NEXT:    ds_read_b96 v[0:2], v0
11; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
12; GFX9-NEXT:    s_setpc_b64 s[30:31]
13;
14; GFX7-LABEL: load_lds_v3i32:
15; GFX7:       ; %bb.0:
16; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
17; GFX7-NEXT:    s_mov_b32 m0, -1
18; GFX7-NEXT:    ds_read_b96 v[0:2], v0
19; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
20; GFX7-NEXT:    s_setpc_b64 s[30:31]
21;
22; GFX6-LABEL: load_lds_v3i32:
23; GFX6:       ; %bb.0:
24; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
25; GFX6-NEXT:    v_add_i32_e32 v1, vcc, 8, v0
26; GFX6-NEXT:    s_mov_b32 m0, -1
27; GFX6-NEXT:    ds_read_b32 v2, v1
28; GFX6-NEXT:    ds_read_b64 v[0:1], v0
29; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
30; GFX6-NEXT:    s_setpc_b64 s[30:31]
31  %load = load <3 x i32>, <3 x i32> addrspace(3)* %ptr
32  ret <3 x i32> %load
33}
34
; Loads a <3 x i32> through an addrspace(3) pointer with `align 1` and returns
; the loaded vector.
; Expected code on all three targets is twelve ds_read_u8 instructions whose
; results are reassembled into three dwords:
;   - gfx900 merges bytes with v_lshl_or_b32 and uses the DS offset field.
;   - hawaii merges with v_lshlrev_b32 / v_or_b32 pairs, also using the DS
;     offset field, and needs a final v_mov_b32 to place dword 0 in v0.
;   - tahiti merges the same way but forms every non-zero byte address with
;     v_add_i32 instead of a DS offset field.
;   - hawaii and tahiti set m0 to -1 before the DS reads.
35define <3 x i32> @load_lds_v3i32_align1(<3 x i32> addrspace(3)* %ptr) {
36; GFX9-LABEL: load_lds_v3i32_align1:
37; GFX9:       ; %bb.0:
38; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
39; GFX9-NEXT:    ds_read_u8 v1, v0
40; GFX9-NEXT:    ds_read_u8 v2, v0 offset:1
41; GFX9-NEXT:    ds_read_u8 v3, v0 offset:2
42; GFX9-NEXT:    ds_read_u8 v4, v0 offset:3
43; GFX9-NEXT:    ds_read_u8 v5, v0 offset:4
44; GFX9-NEXT:    ds_read_u8 v6, v0 offset:5
45; GFX9-NEXT:    ds_read_u8 v7, v0 offset:6
46; GFX9-NEXT:    ds_read_u8 v8, v0 offset:7
47; GFX9-NEXT:    ds_read_u8 v9, v0 offset:8
48; GFX9-NEXT:    ds_read_u8 v10, v0 offset:9
49; GFX9-NEXT:    ds_read_u8 v11, v0 offset:10
50; GFX9-NEXT:    ds_read_u8 v12, v0 offset:11
51; GFX9-NEXT:    s_waitcnt lgkmcnt(10)
52; GFX9-NEXT:    v_lshl_or_b32 v0, v2, 8, v1
53; GFX9-NEXT:    s_waitcnt lgkmcnt(8)
54; GFX9-NEXT:    v_lshl_or_b32 v1, v4, 8, v3
55; GFX9-NEXT:    v_lshl_or_b32 v0, v1, 16, v0
56; GFX9-NEXT:    s_waitcnt lgkmcnt(6)
57; GFX9-NEXT:    v_lshl_or_b32 v1, v6, 8, v5
58; GFX9-NEXT:    s_waitcnt lgkmcnt(4)
59; GFX9-NEXT:    v_lshl_or_b32 v2, v8, 8, v7
60; GFX9-NEXT:    v_lshl_or_b32 v1, v2, 16, v1
61; GFX9-NEXT:    s_waitcnt lgkmcnt(2)
62; GFX9-NEXT:    v_lshl_or_b32 v2, v10, 8, v9
63; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
64; GFX9-NEXT:    v_lshl_or_b32 v3, v12, 8, v11
65; GFX9-NEXT:    v_lshl_or_b32 v2, v3, 16, v2
66; GFX9-NEXT:    s_setpc_b64 s[30:31]
67;
68; GFX7-LABEL: load_lds_v3i32_align1:
69; GFX7:       ; %bb.0:
70; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
71; GFX7-NEXT:    s_mov_b32 m0, -1
72; GFX7-NEXT:    ds_read_u8 v1, v0 offset:7
73; GFX7-NEXT:    ds_read_u8 v2, v0 offset:6
74; GFX7-NEXT:    ds_read_u8 v4, v0 offset:5
75; GFX7-NEXT:    ds_read_u8 v5, v0 offset:4
76; GFX7-NEXT:    ds_read_u8 v3, v0 offset:3
77; GFX7-NEXT:    ds_read_u8 v6, v0 offset:2
78; GFX7-NEXT:    ds_read_u8 v7, v0 offset:1
79; GFX7-NEXT:    ds_read_u8 v8, v0
80; GFX7-NEXT:    s_waitcnt lgkmcnt(7)
81; GFX7-NEXT:    v_lshlrev_b32_e32 v1, 8, v1
82; GFX7-NEXT:    s_waitcnt lgkmcnt(5)
83; GFX7-NEXT:    v_lshlrev_b32_e32 v4, 8, v4
84; GFX7-NEXT:    v_or_b32_e32 v1, v1, v2
85; GFX7-NEXT:    s_waitcnt lgkmcnt(4)
86; GFX7-NEXT:    v_or_b32_e32 v4, v4, v5
87; GFX7-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
88; GFX7-NEXT:    v_or_b32_e32 v1, v1, v4
89; GFX7-NEXT:    ds_read_u8 v2, v0 offset:11
90; GFX7-NEXT:    ds_read_u8 v4, v0 offset:10
91; GFX7-NEXT:    ds_read_u8 v5, v0 offset:9
92; GFX7-NEXT:    ds_read_u8 v0, v0 offset:8
93; GFX7-NEXT:    s_waitcnt lgkmcnt(7)
94; GFX7-NEXT:    v_lshlrev_b32_e32 v3, 8, v3
95; GFX7-NEXT:    s_waitcnt lgkmcnt(5)
96; GFX7-NEXT:    v_lshlrev_b32_e32 v7, 8, v7
97; GFX7-NEXT:    v_or_b32_e32 v3, v3, v6
98; GFX7-NEXT:    s_waitcnt lgkmcnt(3)
99; GFX7-NEXT:    v_lshlrev_b32_e32 v2, 8, v2
100; GFX7-NEXT:    s_waitcnt lgkmcnt(2)
101; GFX7-NEXT:    v_or_b32_e32 v2, v2, v4
102; GFX7-NEXT:    s_waitcnt lgkmcnt(1)
103; GFX7-NEXT:    v_lshlrev_b32_e32 v5, 8, v5
104; GFX7-NEXT:    v_or_b32_e32 v7, v7, v8
105; GFX7-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
106; GFX7-NEXT:    v_or_b32_e32 v3, v3, v7
107; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
108; GFX7-NEXT:    v_or_b32_e32 v0, v5, v0
109; GFX7-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
110; GFX7-NEXT:    v_or_b32_e32 v2, v2, v0
111; GFX7-NEXT:    v_mov_b32_e32 v0, v3
112; GFX7-NEXT:    s_setpc_b64 s[30:31]
113;
114; GFX6-LABEL: load_lds_v3i32_align1:
115; GFX6:       ; %bb.0:
116; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
117; GFX6-NEXT:    v_add_i32_e32 v1, vcc, 5, v0
118; GFX6-NEXT:    v_add_i32_e32 v2, vcc, 4, v0
119; GFX6-NEXT:    v_add_i32_e32 v3, vcc, 7, v0
120; GFX6-NEXT:    v_add_i32_e32 v4, vcc, 6, v0
121; GFX6-NEXT:    v_add_i32_e32 v5, vcc, 9, v0
122; GFX6-NEXT:    v_add_i32_e32 v6, vcc, 8, v0
123; GFX6-NEXT:    v_add_i32_e32 v7, vcc, 11, v0
124; GFX6-NEXT:    s_mov_b32 m0, -1
125; GFX6-NEXT:    ds_read_u8 v2, v2
126; GFX6-NEXT:    ds_read_u8 v3, v3
127; GFX6-NEXT:    ds_read_u8 v4, v4
128; GFX6-NEXT:    ds_read_u8 v5, v5
129; GFX6-NEXT:    ds_read_u8 v6, v6
130; GFX6-NEXT:    ds_read_u8 v7, v7
131; GFX6-NEXT:    ds_read_u8 v1, v1
132; GFX6-NEXT:    ds_read_u8 v8, v0
133; GFX6-NEXT:    s_waitcnt lgkmcnt(1)
134; GFX6-NEXT:    v_lshlrev_b32_e32 v1, 8, v1
135; GFX6-NEXT:    v_or_b32_e32 v1, v1, v2
136; GFX6-NEXT:    v_lshlrev_b32_e32 v2, 8, v3
137; GFX6-NEXT:    v_or_b32_e32 v2, v2, v4
138; GFX6-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
139; GFX6-NEXT:    v_or_b32_e32 v1, v2, v1
140; GFX6-NEXT:    v_lshlrev_b32_e32 v2, 8, v5
141; GFX6-NEXT:    v_or_b32_e32 v2, v2, v6
142; GFX6-NEXT:    v_add_i32_e32 v4, vcc, 10, v0
143; GFX6-NEXT:    v_add_i32_e32 v5, vcc, 3, v0
144; GFX6-NEXT:    v_add_i32_e32 v6, vcc, 2, v0
145; GFX6-NEXT:    v_add_i32_e32 v0, vcc, 1, v0
146; GFX6-NEXT:    ds_read_u8 v4, v4
147; GFX6-NEXT:    ds_read_u8 v5, v5
148; GFX6-NEXT:    ds_read_u8 v6, v6
149; GFX6-NEXT:    ds_read_u8 v0, v0
150; GFX6-NEXT:    v_lshlrev_b32_e32 v3, 8, v7
151; GFX6-NEXT:    s_waitcnt lgkmcnt(3)
152; GFX6-NEXT:    v_or_b32_e32 v3, v3, v4
153; GFX6-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
154; GFX6-NEXT:    v_or_b32_e32 v2, v3, v2
155; GFX6-NEXT:    s_waitcnt lgkmcnt(2)
156; GFX6-NEXT:    v_lshlrev_b32_e32 v3, 8, v5
157; GFX6-NEXT:    s_waitcnt lgkmcnt(1)
158; GFX6-NEXT:    v_or_b32_e32 v3, v3, v6
159; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
160; GFX6-NEXT:    v_lshlrev_b32_e32 v0, 8, v0
161; GFX6-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
162; GFX6-NEXT:    v_or_b32_e32 v0, v0, v8
163; GFX6-NEXT:    v_or_b32_e32 v0, v3, v0
164; GFX6-NEXT:    s_setpc_b64 s[30:31]
165  %load = load <3 x i32>, <3 x i32> addrspace(3)* %ptr, align 1
166  ret <3 x i32> %load
167}
168
; Loads a <3 x i32> through an addrspace(3) pointer with `align 2` and returns
; the loaded vector.
; Expected code on all three targets is six ds_read_u16 instructions, with
; each pair of halfwords merged into one dword:
;   - gfx900 merges with v_lshl_or_b32 (shift by 16) and uses the DS offset
;     field.
;   - hawaii merges with v_lshlrev_b32 (by 16) followed by v_or_b32, also
;     using the DS offset field.
;   - tahiti merges the same way as hawaii but forms every non-zero halfword
;     address with v_add_i32.
;   - hawaii and tahiti set m0 to -1 before the DS reads.
169define <3 x i32> @load_lds_v3i32_align2(<3 x i32> addrspace(3)* %ptr) {
170; GFX9-LABEL: load_lds_v3i32_align2:
171; GFX9:       ; %bb.0:
172; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
173; GFX9-NEXT:    ds_read_u16 v1, v0
174; GFX9-NEXT:    ds_read_u16 v2, v0 offset:2
175; GFX9-NEXT:    ds_read_u16 v3, v0 offset:4
176; GFX9-NEXT:    ds_read_u16 v4, v0 offset:6
177; GFX9-NEXT:    ds_read_u16 v5, v0 offset:8
178; GFX9-NEXT:    ds_read_u16 v6, v0 offset:10
179; GFX9-NEXT:    s_waitcnt lgkmcnt(4)
180; GFX9-NEXT:    v_lshl_or_b32 v0, v2, 16, v1
181; GFX9-NEXT:    s_waitcnt lgkmcnt(2)
182; GFX9-NEXT:    v_lshl_or_b32 v1, v4, 16, v3
183; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
184; GFX9-NEXT:    v_lshl_or_b32 v2, v6, 16, v5
185; GFX9-NEXT:    s_setpc_b64 s[30:31]
186;
187; GFX7-LABEL: load_lds_v3i32_align2:
188; GFX7:       ; %bb.0:
189; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
190; GFX7-NEXT:    s_mov_b32 m0, -1
191; GFX7-NEXT:    ds_read_u16 v2, v0 offset:10
192; GFX7-NEXT:    ds_read_u16 v3, v0 offset:8
193; GFX7-NEXT:    ds_read_u16 v1, v0 offset:6
194; GFX7-NEXT:    ds_read_u16 v4, v0 offset:4
195; GFX7-NEXT:    ds_read_u16 v5, v0 offset:2
196; GFX7-NEXT:    ds_read_u16 v0, v0
197; GFX7-NEXT:    s_waitcnt lgkmcnt(5)
198; GFX7-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
199; GFX7-NEXT:    s_waitcnt lgkmcnt(3)
200; GFX7-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
201; GFX7-NEXT:    s_waitcnt lgkmcnt(2)
202; GFX7-NEXT:    v_or_b32_e32 v1, v1, v4
203; GFX7-NEXT:    s_waitcnt lgkmcnt(1)
204; GFX7-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
205; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
206; GFX7-NEXT:    v_or_b32_e32 v0, v5, v0
207; GFX7-NEXT:    v_or_b32_e32 v2, v2, v3
208; GFX7-NEXT:    s_setpc_b64 s[30:31]
209;
210; GFX6-LABEL: load_lds_v3i32_align2:
211; GFX6:       ; %bb.0:
212; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
213; GFX6-NEXT:    v_add_i32_e32 v1, vcc, 6, v0
214; GFX6-NEXT:    v_add_i32_e32 v2, vcc, 4, v0
215; GFX6-NEXT:    v_add_i32_e32 v3, vcc, 10, v0
216; GFX6-NEXT:    v_add_i32_e32 v4, vcc, 8, v0
217; GFX6-NEXT:    v_add_i32_e32 v5, vcc, 2, v0
218; GFX6-NEXT:    s_mov_b32 m0, -1
219; GFX6-NEXT:    ds_read_u16 v2, v2
220; GFX6-NEXT:    ds_read_u16 v3, v3
221; GFX6-NEXT:    ds_read_u16 v4, v4
222; GFX6-NEXT:    ds_read_u16 v5, v5
223; GFX6-NEXT:    ds_read_u16 v1, v1
224; GFX6-NEXT:    ds_read_u16 v0, v0
225; GFX6-NEXT:    s_waitcnt lgkmcnt(1)
226; GFX6-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
227; GFX6-NEXT:    v_or_b32_e32 v1, v1, v2
228; GFX6-NEXT:    v_lshlrev_b32_e32 v2, 16, v3
229; GFX6-NEXT:    v_lshlrev_b32_e32 v3, 16, v5
230; GFX6-NEXT:    v_or_b32_e32 v2, v2, v4
231; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
232; GFX6-NEXT:    v_or_b32_e32 v0, v3, v0
233; GFX6-NEXT:    s_setpc_b64 s[30:31]
234  %load = load <3 x i32>, <3 x i32> addrspace(3)* %ptr, align 2
235  ret <3 x i32> %load
236}
237
; Loads a <3 x i32> through an addrspace(3) pointer with `align 4` and returns
; the loaded vector.
; Expected code:
;   - gfx900 and hawaii read dwords 0 and 1 with one ds_read2_b32 (offset1:1)
;     and dword 2 with a ds_read_b32 at offset 8. The base address is first
;     copied to v2, since the ds_read2_b32 destination v[0:1] overlaps v0.
;   - tahiti issues three separate ds_read_b32, with the +4 and +8 addresses
;     formed by v_add_i32.
;   - hawaii and tahiti set m0 to -1 before the DS reads.
238define <3 x i32> @load_lds_v3i32_align4(<3 x i32> addrspace(3)* %ptr) {
239; GFX9-LABEL: load_lds_v3i32_align4:
240; GFX9:       ; %bb.0:
241; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
242; GFX9-NEXT:    v_mov_b32_e32 v2, v0
243; GFX9-NEXT:    ds_read2_b32 v[0:1], v0 offset1:1
244; GFX9-NEXT:    ds_read_b32 v2, v2 offset:8
245; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
246; GFX9-NEXT:    s_setpc_b64 s[30:31]
247;
248; GFX7-LABEL: load_lds_v3i32_align4:
249; GFX7:       ; %bb.0:
250; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
251; GFX7-NEXT:    v_mov_b32_e32 v2, v0
252; GFX7-NEXT:    s_mov_b32 m0, -1
253; GFX7-NEXT:    ds_read2_b32 v[0:1], v0 offset1:1
254; GFX7-NEXT:    ds_read_b32 v2, v2 offset:8
255; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
256; GFX7-NEXT:    s_setpc_b64 s[30:31]
257;
258; GFX6-LABEL: load_lds_v3i32_align4:
259; GFX6:       ; %bb.0:
260; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
261; GFX6-NEXT:    v_add_i32_e32 v1, vcc, 4, v0
262; GFX6-NEXT:    v_add_i32_e32 v2, vcc, 8, v0
263; GFX6-NEXT:    s_mov_b32 m0, -1
264; GFX6-NEXT:    ds_read_b32 v2, v2
265; GFX6-NEXT:    ds_read_b32 v1, v1
266; GFX6-NEXT:    ds_read_b32 v0, v0
267; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
268; GFX6-NEXT:    s_setpc_b64 s[30:31]
269  %load = load <3 x i32>, <3 x i32> addrspace(3)* %ptr, align 4
270  ret <3 x i32> %load
271}
272
; Loads a <3 x i32> through an addrspace(3) pointer with `align 8` and returns
; the loaded vector.
; Expected code (no ds_read_b64 appears in these checks despite the 8-byte
; alignment):
;   - gfx900 and hawaii read dwords 0 and 1 with one ds_read2_b32 (offset1:1)
;     and dword 2 with a ds_read_b32 at offset 8, after copying the base
;     address to v2.
;   - tahiti issues three separate ds_read_b32, with the +4 and +8 addresses
;     formed by v_add_i32.
;   - hawaii and tahiti set m0 to -1 before the DS reads.
273define <3 x i32> @load_lds_v3i32_align8(<3 x i32> addrspace(3)* %ptr) {
274; GFX9-LABEL: load_lds_v3i32_align8:
275; GFX9:       ; %bb.0:
276; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
277; GFX9-NEXT:    v_mov_b32_e32 v2, v0
278; GFX9-NEXT:    ds_read2_b32 v[0:1], v0 offset1:1
279; GFX9-NEXT:    ds_read_b32 v2, v2 offset:8
280; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
281; GFX9-NEXT:    s_setpc_b64 s[30:31]
282;
283; GFX7-LABEL: load_lds_v3i32_align8:
284; GFX7:       ; %bb.0:
285; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
286; GFX7-NEXT:    v_mov_b32_e32 v2, v0
287; GFX7-NEXT:    s_mov_b32 m0, -1
288; GFX7-NEXT:    ds_read2_b32 v[0:1], v0 offset1:1
289; GFX7-NEXT:    ds_read_b32 v2, v2 offset:8
290; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
291; GFX7-NEXT:    s_setpc_b64 s[30:31]
292;
293; GFX6-LABEL: load_lds_v3i32_align8:
294; GFX6:       ; %bb.0:
295; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
296; GFX6-NEXT:    v_add_i32_e32 v1, vcc, 4, v0
297; GFX6-NEXT:    v_add_i32_e32 v2, vcc, 8, v0
298; GFX6-NEXT:    s_mov_b32 m0, -1
299; GFX6-NEXT:    ds_read_b32 v2, v2
300; GFX6-NEXT:    ds_read_b32 v1, v1
301; GFX6-NEXT:    ds_read_b32 v0, v0
302; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
303; GFX6-NEXT:    s_setpc_b64 s[30:31]
304  %load = load <3 x i32>, <3 x i32> addrspace(3)* %ptr, align 8
305  ret <3 x i32> %load
306}
307
; Loads a <3 x i32> through an addrspace(3) pointer with `align 16` and
; returns the loaded vector.
; Expected code:
;   - gfx900 and hawaii use a single ds_read_b96.
;   - tahiti splits the access into a ds_read_b64 at the base address plus a
;     ds_read_b32 at base + 8 (address formed with v_add_i32).
;   - hawaii and tahiti set m0 to -1 before the DS reads.
308define <3 x i32> @load_lds_v3i32_align16(<3 x i32> addrspace(3)* %ptr) {
309; GFX9-LABEL: load_lds_v3i32_align16:
310; GFX9:       ; %bb.0:
311; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
312; GFX9-NEXT:    ds_read_b96 v[0:2], v0
313; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
314; GFX9-NEXT:    s_setpc_b64 s[30:31]
315;
316; GFX7-LABEL: load_lds_v3i32_align16:
317; GFX7:       ; %bb.0:
318; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
319; GFX7-NEXT:    s_mov_b32 m0, -1
320; GFX7-NEXT:    ds_read_b96 v[0:2], v0
321; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
322; GFX7-NEXT:    s_setpc_b64 s[30:31]
323;
324; GFX6-LABEL: load_lds_v3i32_align16:
325; GFX6:       ; %bb.0:
326; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
327; GFX6-NEXT:    v_add_i32_e32 v1, vcc, 8, v0
328; GFX6-NEXT:    s_mov_b32 m0, -1
329; GFX6-NEXT:    ds_read_b32 v2, v1
330; GFX6-NEXT:    ds_read_b64 v[0:1], v0
331; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
332; GFX6-NEXT:    s_setpc_b64 s[30:31]
333  %load = load <3 x i32>, <3 x i32> addrspace(3)* %ptr, align 16
334  ret <3 x i32> %load
335}
336