1; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -allow-deprecated-dag-overlap -check-prefix=GCN -check-prefix=GCN-NOHSA -check-prefix=FUNC %s
2; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=kaveri -verify-machineinstrs < %s | FileCheck -allow-deprecated-dag-overlap -check-prefix=GCN -check-prefix=GCN-HSA -check-prefix=FUNC %s
3; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -allow-deprecated-dag-overlap -check-prefix=GCN -check-prefix=GCN-NOHSA -check-prefix=FUNC %s
4; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -allow-deprecated-dag-overlap -check-prefix=EG -check-prefix=FUNC %s
5; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -allow-deprecated-dag-overlap -check-prefix=GCN -check-prefix=GCN-HSA -check-prefix=FUNC %s
6; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx908 -verify-machineinstrs < %s | FileCheck -allow-deprecated-dag-overlap -check-prefix=GCN -check-prefix=GCN-HSA -check-prefix=FUNC %s
7
8; FUNC-LABEL: {{^}}constant_load_i32:
9; GCN: s_load_dword s{{[0-9]+}}
10
11; EG: VTX_READ_32 T{{[0-9]+}}.X, T{{[0-9]+}}.X, 0
12define amdgpu_kernel void @constant_load_i32(i32 addrspace(1)* %out, i32 addrspace(4)* %in) #0 { ; copy one i32 from %in to %out
13entry:
14  %val = load i32, i32 addrspace(4)* %in   ; scalar read through the addrspace(4) pointer
15  store i32 %val, i32 addrspace(1)* %out   ; write the same value through the addrspace(1) pointer
16  ret void
17}
18
19; FUNC-LABEL: {{^}}constant_load_v2i32:
20; GCN: s_load_dwordx2
21
22; EG: VTX_READ_64
23define amdgpu_kernel void @constant_load_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(4)* %in) #0 { ; copy a <2 x i32> from %in to %out
24entry:
25  %vec = load <2 x i32>, <2 x i32> addrspace(4)* %in   ; two-element read through the addrspace(4) pointer
26  store <2 x i32> %vec, <2 x i32> addrspace(1)* %out   ; unchanged vector goes to the addrspace(1) pointer
27  ret void
28}
29
30; FUNC-LABEL: {{^}}constant_load_v3i32:
31; GCN: s_load_dwordx4
32
33; EG: VTX_READ_128
34define amdgpu_kernel void @constant_load_v3i32(<3 x i32> addrspace(1)* %out, <3 x i32> addrspace(4)* %in) #0 { ; copy a <3 x i32> from %in to %out
35entry:
36  %vec = load <3 x i32>, <3 x i32> addrspace(4)* %in   ; three-element read through the addrspace(4) pointer
37  store <3 x i32> %vec, <3 x i32> addrspace(1)* %out   ; unchanged vector goes to the addrspace(1) pointer
38  ret void
39}
40
41; FUNC-LABEL: {{^}}constant_load_v4i32:
42; GCN: s_load_dwordx4
43
44; EG: VTX_READ_128
45define amdgpu_kernel void @constant_load_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(4)* %in) #0 { ; copy a <4 x i32> from %in to %out
46entry:
47  %vec = load <4 x i32>, <4 x i32> addrspace(4)* %in   ; four-element read through the addrspace(4) pointer
48  store <4 x i32> %vec, <4 x i32> addrspace(1)* %out   ; unchanged vector goes to the addrspace(1) pointer
49  ret void
50}
51
52; FUNC-LABEL: {{^}}constant_load_v8i32:
53; GCN: s_load_dwordx8
54
55; EG: VTX_READ_128
56; EG: VTX_READ_128
57define amdgpu_kernel void @constant_load_v8i32(<8 x i32> addrspace(1)* %out, <8 x i32> addrspace(4)* %in) #0 { ; copy an <8 x i32> from %in to %out
58entry:
59  %vec = load <8 x i32>, <8 x i32> addrspace(4)* %in   ; eight-element read through the addrspace(4) pointer
60  store <8 x i32> %vec, <8 x i32> addrspace(1)* %out   ; unchanged vector goes to the addrspace(1) pointer
61  ret void
62}
63
64; FUNC-LABEL: {{^}}constant_load_v16i32:
65; GCN: s_load_dwordx16
66
67; EG: VTX_READ_128
68; EG: VTX_READ_128
69; EG: VTX_READ_128
70; EG: VTX_READ_128
71define amdgpu_kernel void @constant_load_v16i32(<16 x i32> addrspace(1)* %out, <16 x i32> addrspace(4)* %in) #0 { ; copy a <16 x i32> from %in to %out
72entry:
73  %vec = load <16 x i32>, <16 x i32> addrspace(4)* %in   ; sixteen-element read through the addrspace(4) pointer
74  store <16 x i32> %vec, <16 x i32> addrspace(1)* %out   ; unchanged vector goes to the addrspace(1) pointer
75  ret void
76}
77
78; FUNC-LABEL: {{^}}constant_zextload_i32_to_i64:
79; GCN-DAG: s_load_dword s[[SLO:[0-9]+]],
80; GCN-DAG: v_mov_b32_e32 v[[SHI:[0-9]+]], 0{{$}}
81; GCN: store_dwordx2
82
83; EG: MEM_RAT_CACHELESS STORE_RAW T{{[0-9]+}}.XY
84; EG: CF_END
85; EG: VTX_READ_32
86define amdgpu_kernel void @constant_zextload_i32_to_i64(i64 addrspace(1)* %out, i32 addrspace(4)* %in) #0 { ; load an i32, zero-extend it to i64, store the result
87  %narrow = load i32, i32 addrspace(4)* %in   ; 32-bit source value
88  %wide = zext i32 %narrow to i64             ; upper 32 bits are zero
89  store i64 %wide, i64 addrspace(1)* %out
90  ret void
91}
92
93; FUNC-LABEL: {{^}}constant_sextload_i32_to_i64:
94; GCN: s_load_dword s[[SLO:[0-9]+]]
95; GCN: s_ashr_i32 s[[HI:[0-9]+]], s[[SLO]], 31
96; GCN: store_dwordx2
97
98; EG: MEM_RAT_CACHELESS STORE_RAW T{{[0-9]+}}.XY
99; EG: CF_END
100; EG: VTX_READ_32
101; EG: ASHR {{[* ]*}}T{{[0-9]\.[XYZW]}}, T{{[0-9]\.[XYZW]}},  literal.
102; EG: 31
103define amdgpu_kernel void @constant_sextload_i32_to_i64(i64 addrspace(1)* %out, i32 addrspace(4)* %in) #0 { ; load an i32, sign-extend it to i64, store the result
104  %narrow = load i32, i32 addrspace(4)* %in   ; 32-bit source value
105  %wide = sext i32 %narrow to i64             ; upper 32 bits replicate the sign bit
106  store i64 %wide, i64 addrspace(1)* %out
107  ret void
108}
109
110; FUNC-LABEL: {{^}}constant_zextload_v1i32_to_v1i64:
111; GCN: s_load_dword
112; GCN: store_dwordx2
113define amdgpu_kernel void @constant_zextload_v1i32_to_v1i64(<1 x i64> addrspace(1)* %out, <1 x i32> addrspace(4)* %in) #0 { ; single-element vector form of the i32 -> i64 zero-extending load
114  %narrow = load <1 x i32>, <1 x i32> addrspace(4)* %in   ; one-element source vector
115  %wide = zext <1 x i32> %narrow to <1 x i64>             ; element widened with zero high bits
116  store <1 x i64> %wide, <1 x i64> addrspace(1)* %out
117  ret void
118}
119
120; FUNC-LABEL: {{^}}constant_sextload_v1i32_to_v1i64:
121; GCN: s_load_dword s[[LO:[0-9]+]]
122; GCN: s_ashr_i32 s[[HI:[0-9]+]], s[[LO]], 31
123; GCN: store_dwordx2
124define amdgpu_kernel void @constant_sextload_v1i32_to_v1i64(<1 x i64> addrspace(1)* %out, <1 x i32> addrspace(4)* %in) #0 { ; single-element vector form of the i32 -> i64 sign-extending load
125  %narrow = load <1 x i32>, <1 x i32> addrspace(4)* %in   ; one-element source vector
126  %wide = sext <1 x i32> %narrow to <1 x i64>             ; element widened by replicating its sign bit
127  store <1 x i64> %wide, <1 x i64> addrspace(1)* %out
128  ret void
129}
130
131; FUNC-LABEL: {{^}}constant_zextload_v2i32_to_v2i64:
132; GCN: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0x0{{$}}
133; GCN: store_dwordx4
134define amdgpu_kernel void @constant_zextload_v2i32_to_v2i64(<2 x i64> addrspace(1)* %out, <2 x i32> addrspace(4)* %in) #0 { ; load <2 x i32>, zero-extend each element to i64, store
135  %narrow = load <2 x i32>, <2 x i32> addrspace(4)* %in   ; two 32-bit source elements
136  %wide = zext <2 x i32> %narrow to <2 x i64>             ; per-element zero extension
137  store <2 x i64> %wide, <2 x i64> addrspace(1)* %out
138  ret void
139}
140
141; FUNC-LABEL: {{^}}constant_sextload_v2i32_to_v2i64:
142; GCN: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0x0{{$}}
143
144; GCN-DAG: s_ashr_i32
145; GCN-DAG: s_ashr_i32
146
147; GCN: store_dwordx4
148define amdgpu_kernel void @constant_sextload_v2i32_to_v2i64(<2 x i64> addrspace(1)* %out, <2 x i32> addrspace(4)* %in) #0 { ; load <2 x i32>, sign-extend each element to i64, store
149  %narrow = load <2 x i32>, <2 x i32> addrspace(4)* %in   ; two 32-bit source elements
150  %wide = sext <2 x i32> %narrow to <2 x i64>             ; per-element sign extension
151  store <2 x i64> %wide, <2 x i64> addrspace(1)* %out
152  ret void
153}
154
155; FUNC-LABEL: {{^}}constant_zextload_v4i32_to_v4i64:
156; GCN: s_load_dwordx4
157
158; GCN: store_dwordx4
159; GCN: store_dwordx4
160define amdgpu_kernel void @constant_zextload_v4i32_to_v4i64(<4 x i64> addrspace(1)* %out, <4 x i32> addrspace(4)* %in) #0 { ; load <4 x i32>, zero-extend each element to i64, store
161  %narrow = load <4 x i32>, <4 x i32> addrspace(4)* %in   ; four 32-bit source elements
162  %wide = zext <4 x i32> %narrow to <4 x i64>             ; per-element zero extension
163  store <4 x i64> %wide, <4 x i64> addrspace(1)* %out
164  ret void
165}
166
167; FUNC-LABEL: {{^}}constant_sextload_v4i32_to_v4i64:
168; GCN: s_load_dwordx4
169
170; GCN: s_ashr_i32
171; GCN: s_ashr_i32
172; GCN: s_ashr_i32
173; GCN: s_ashr_i32
174
175; GCN: store_dwordx4
176; GCN: store_dwordx4
177define amdgpu_kernel void @constant_sextload_v4i32_to_v4i64(<4 x i64> addrspace(1)* %out, <4 x i32> addrspace(4)* %in) #0 { ; load <4 x i32>, sign-extend each element to i64, store
178  %narrow = load <4 x i32>, <4 x i32> addrspace(4)* %in   ; four 32-bit source elements
179  %wide = sext <4 x i32> %narrow to <4 x i64>             ; per-element sign extension
180  store <4 x i64> %wide, <4 x i64> addrspace(1)* %out
181  ret void
182}
183
184; FUNC-LABEL: {{^}}constant_zextload_v8i32_to_v8i64:
185; GCN: s_load_dwordx8
186
187; GCN-NOHSA-DAG: buffer_store_dwordx4
188; GCN-NOHSA-DAG: buffer_store_dwordx4
189; GCN-NOHSA-DAG: buffer_store_dwordx4
190; GCN-NOHSA-DAG: buffer_store_dwordx4
191
192; GCN-HSA-DAG: {{flat|global}}_store_dwordx4
193; GCN-HSA-DAG: {{flat|global}}_store_dwordx4
194; GCN-HSA-DAG: {{flat|global}}_store_dwordx4
195; GCN-HSA-DAG: {{flat|global}}_store_dwordx4
196define amdgpu_kernel void @constant_zextload_v8i32_to_v8i64(<8 x i64> addrspace(1)* %out, <8 x i32> addrspace(4)* %in) #0 { ; load <8 x i32>, zero-extend each element to i64, store
197  %ld = load <8 x i32>, <8 x i32> addrspace(4)* %in    ; eight 32-bit source elements
198  %ext = zext <8 x i32> %ld to <8 x i64>               ; per-element zero extension
199  store <8 x i64> %ext, <8 x i64> addrspace(1)* %out   ; four dwordx4 stores are expected on both the HSA and non-HSA runs
200  ret void
201}
202
203; FUNC-LABEL: {{^}}constant_sextload_v8i32_to_v8i64:
204; GCN: s_load_dwordx8
205
206; GCN: s_ashr_i32
207; GCN: s_ashr_i32
208; GCN: s_ashr_i32
209; GCN: s_ashr_i32
210; GCN: s_ashr_i32
211; GCN: s_ashr_i32
212; GCN: s_ashr_i32
213; GCN: s_ashr_i32
214
215; GCN-NOHSA-DAG: buffer_store_dwordx4
216; GCN-NOHSA-DAG: buffer_store_dwordx4
217; GCN-NOHSA-DAG: buffer_store_dwordx4
218; GCN-NOHSA-DAG: buffer_store_dwordx4
219
220; GCN-HSA-DAG: {{flat|global}}_store_dwordx4
221; GCN-HSA-DAG: {{flat|global}}_store_dwordx4
222; GCN-HSA-DAG: {{flat|global}}_store_dwordx4
223; GCN-HSA-DAG: {{flat|global}}_store_dwordx4
224define amdgpu_kernel void @constant_sextload_v8i32_to_v8i64(<8 x i64> addrspace(1)* %out, <8 x i32> addrspace(4)* %in) #0 { ; load <8 x i32>, sign-extend each element to i64, store
225  %narrow = load <8 x i32>, <8 x i32> addrspace(4)* %in   ; eight 32-bit source elements
226  %wide = sext <8 x i32> %narrow to <8 x i64>             ; per-element sign extension
227  store <8 x i64> %wide, <8 x i64> addrspace(1)* %out
228  ret void
229}
230
231; FUNC-LABEL: {{^}}constant_sextload_v16i32_to_v16i64:
232; GCN: s_load_dwordx16
233
234
235; GCN-DAG: s_ashr_i32
236
237; GCN: store_dwordx4
238; GCN: store_dwordx4
239; GCN: store_dwordx4
240; GCN: store_dwordx4
241; GCN: store_dwordx4
242; GCN: store_dwordx4
243; GCN: store_dwordx4
244; GCN: store_dwordx4
245define amdgpu_kernel void @constant_sextload_v16i32_to_v16i64(<16 x i64> addrspace(1)* %out, <16 x i32> addrspace(4)* %in) #0 { ; load <16 x i32>, sign-extend each element to i64, store
246  %narrow = load <16 x i32>, <16 x i32> addrspace(4)* %in   ; sixteen 32-bit source elements
247  %wide = sext <16 x i32> %narrow to <16 x i64>             ; per-element sign extension
248  store <16 x i64> %wide, <16 x i64> addrspace(1)* %out
249  ret void
250}
251
252; FUNC-LABEL: {{^}}constant_zextload_v16i32_to_v16i64:
253; GCN: s_load_dwordx16
254
255; GCN-NOHSA: buffer_store_dwordx4
256; GCN-NOHSA: buffer_store_dwordx4
257; GCN-NOHSA: buffer_store_dwordx4
258; GCN-NOHSA: buffer_store_dwordx4
259; GCN-NOHSA: buffer_store_dwordx4
260; GCN-NOHSA: buffer_store_dwordx4
261; GCN-NOHSA: buffer_store_dwordx4
262; GCN-NOHSA: buffer_store_dwordx4
263
264; GCN-HSA: {{flat|global}}_store_dwordx4
265; GCN-HSA: {{flat|global}}_store_dwordx4
266; GCN-HSA: {{flat|global}}_store_dwordx4
267; GCN-HSA: {{flat|global}}_store_dwordx4
268; GCN-HSA: {{flat|global}}_store_dwordx4
269; GCN-HSA: {{flat|global}}_store_dwordx4
270; GCN-HSA: {{flat|global}}_store_dwordx4
271; GCN-HSA: {{flat|global}}_store_dwordx4
272define amdgpu_kernel void @constant_zextload_v16i32_to_v16i64(<16 x i64> addrspace(1)* %out, <16 x i32> addrspace(4)* %in) #0 { ; load <16 x i32>, zero-extend each element to i64, store
273  %ld = load <16 x i32>, <16 x i32> addrspace(4)* %in    ; sixteen 32-bit source elements
274  %ext = zext <16 x i32> %ld to <16 x i64>               ; per-element zero extension
275  store <16 x i64> %ext, <16 x i64> addrspace(1)* %out   ; the directives above expect eight dwordx4 stores per run
276  ret void
277}
278
279; FUNC-LABEL: {{^}}constant_sextload_v32i32_to_v32i64:
280
281; GCN: s_load_dwordx16
282; GCN-DAG: s_load_dwordx16
283
284; GCN-NOHSA-DAG: buffer_store_dwordx4
285; GCN-NOHSA-DAG: buffer_store_dwordx4
286; GCN-NOHSA-DAG: buffer_store_dwordx4
287; GCN-NOHSA-DAG: buffer_store_dwordx4
288
289; GCN-NOHSA-DAG: buffer_store_dwordx4
290; GCN-NOHSA-DAG: buffer_store_dwordx4
291; GCN-NOHSA-DAG: buffer_store_dwordx4
292; GCN-NOHSA-DAG: buffer_store_dwordx4
293
294; GCN-NOHSA-DAG: buffer_store_dwordx4
295; GCN-NOHSA-DAG: buffer_store_dwordx4
296; GCN-NOHSA-DAG: buffer_store_dwordx4
297; GCN-NOHSA-DAG: buffer_store_dwordx4
298
299; GCN-NOHSA-DAG: buffer_store_dwordx4
300; GCN-NOHSA-DAG: buffer_store_dwordx4
301; GCN-NOHSA-DAG: buffer_store_dwordx4
302; GCN-NOHSA-DAG: buffer_store_dwordx4
303
304; GCN-HSA-DAG: {{flat|global}}_store_dwordx4
305; GCN-HSA-DAG: {{flat|global}}_store_dwordx4
306; GCN-HSA-DAG: {{flat|global}}_store_dwordx4
307; GCN-HSA-DAG: {{flat|global}}_store_dwordx4
308
309; GCN-HSA-DAG: {{flat|global}}_store_dwordx4
310; GCN-HSA-DAG: {{flat|global}}_store_dwordx4
311; GCN-HSA-DAG: {{flat|global}}_store_dwordx4
312; GCN-HSA-DAG: {{flat|global}}_store_dwordx4
313
314; GCN-HSA-DAG: {{flat|global}}_store_dwordx4
315; GCN-HSA-DAG: {{flat|global}}_store_dwordx4
316; GCN-HSA-DAG: {{flat|global}}_store_dwordx4
317; GCN-HSA-DAG: {{flat|global}}_store_dwordx4
318
319; GCN-HSA-DAG: {{flat|global}}_store_dwordx4
320; GCN-HSA-DAG: {{flat|global}}_store_dwordx4
321; GCN-HSA-DAG: {{flat|global}}_store_dwordx4
322; GCN-HSA-DAG: {{flat|global}}_store_dwordx4
323
324define amdgpu_kernel void @constant_sextload_v32i32_to_v32i64(<32 x i64> addrspace(1)* %out, <32 x i32> addrspace(4)* %in) #0 { ; load <32 x i32>, sign-extend each element to i64, store
325  %narrow = load <32 x i32>, <32 x i32> addrspace(4)* %in   ; thirty-two 32-bit source elements
326  %wide = sext <32 x i32> %narrow to <32 x i64>             ; per-element sign extension
327  store <32 x i64> %wide, <32 x i64> addrspace(1)* %out
328  ret void
329}
330
331; FUNC-LABEL: {{^}}constant_zextload_v32i32_to_v32i64:
332; GCN: s_load_dwordx16
333; GCN: s_load_dwordx16
334
335; GCN-NOHSA-DAG: buffer_store_dwordx4
336; GCN-NOHSA-DAG: buffer_store_dwordx4
337; GCN-NOHSA-DAG: buffer_store_dwordx4
338; GCN-NOHSA-DAG: buffer_store_dwordx4
339
340; GCN-NOHSA-DAG: buffer_store_dwordx4
341; GCN-NOHSA-DAG: buffer_store_dwordx4
342; GCN-NOHSA-DAG: buffer_store_dwordx4
343; GCN-NOHSA-DAG: buffer_store_dwordx4
344
345; GCN-NOHSA-DAG: buffer_store_dwordx4
346; GCN-NOHSA-DAG: buffer_store_dwordx4
347; GCN-NOHSA-DAG: buffer_store_dwordx4
348; GCN-NOHSA-DAG: buffer_store_dwordx4
349
350; GCN-NOHSA-DAG: buffer_store_dwordx4
351; GCN-NOHSA-DAG: buffer_store_dwordx4
352; GCN-NOHSA-DAG: buffer_store_dwordx4
353; GCN-NOHSA-DAG: buffer_store_dwordx4
354
355
356; GCN-HSA-DAG: {{flat|global}}_store_dwordx4
357; GCN-HSA-DAG: {{flat|global}}_store_dwordx4
358; GCN-HSA-DAG: {{flat|global}}_store_dwordx4
359; GCN-HSA-DAG: {{flat|global}}_store_dwordx4
360
361; GCN-HSA-DAG: {{flat|global}}_store_dwordx4
362; GCN-HSA-DAG: {{flat|global}}_store_dwordx4
363; GCN-HSA-DAG: {{flat|global}}_store_dwordx4
364; GCN-HSA-DAG: {{flat|global}}_store_dwordx4
365
366; GCN-HSA-DAG: {{flat|global}}_store_dwordx4
367; GCN-HSA-DAG: {{flat|global}}_store_dwordx4
368; GCN-HSA-DAG: {{flat|global}}_store_dwordx4
369; GCN-HSA-DAG: {{flat|global}}_store_dwordx4
370
371; GCN-HSA-DAG: {{flat|global}}_store_dwordx4
372; GCN-HSA-DAG: {{flat|global}}_store_dwordx4
373; GCN-HSA-DAG: {{flat|global}}_store_dwordx4
374; GCN-HSA-DAG: {{flat|global}}_store_dwordx4
375define amdgpu_kernel void @constant_zextload_v32i32_to_v32i64(<32 x i64> addrspace(1)* %out, <32 x i32> addrspace(4)* %in) #0 { ; load <32 x i32>, zero-extend each element to i64, store
376  %narrow = load <32 x i32>, <32 x i32> addrspace(4)* %in   ; thirty-two 32-bit source elements
377  %wide = zext <32 x i32> %narrow to <32 x i64>             ; per-element zero extension
378  store <32 x i64> %wide, <32 x i64> addrspace(1)* %out
379  ret void
380}
381
382; FUNC-LABEL: {{^}}constant_load_v32i32:
383; GCN: s_load_dwordx16
384; GCN: s_load_dwordx16
385
386; GCN-NOHSA-DAG: buffer_store_dwordx4
387; GCN-NOHSA-DAG: buffer_store_dwordx4
388; GCN-NOHSA-DAG: buffer_store_dwordx4
389; GCN-NOHSA-DAG: buffer_store_dwordx4
390
391; GCN-NOHSA-DAG: buffer_store_dwordx4
392; GCN-NOHSA-DAG: buffer_store_dwordx4
393; GCN-NOHSA-DAG: buffer_store_dwordx4
394; GCN-NOHSA-DAG: buffer_store_dwordx4
395
396; GCN-NOHSA-DAG: buffer_store_dwordx4
397; GCN-NOHSA-DAG: buffer_store_dwordx4
398; GCN-NOHSA-DAG: buffer_store_dwordx4
399; GCN-NOHSA-DAG: buffer_store_dwordx4
400
401; GCN-NOHSA-DAG: buffer_store_dwordx4
402; GCN-NOHSA-DAG: buffer_store_dwordx4
403; GCN-NOHSA-DAG: buffer_store_dwordx4
404; GCN-NOHSA-DAG: buffer_store_dwordx4
405
406; GCN-NOT: accvgpr
407
408; GCN-HSA-DAG: {{flat|global}}_store_dwordx4
409; GCN-HSA-DAG: {{flat|global}}_store_dwordx4
410; GCN-HSA-DAG: {{flat|global}}_store_dwordx4
411; GCN-HSA-DAG: {{flat|global}}_store_dwordx4
412
413; GCN-HSA-DAG: {{flat|global}}_store_dwordx4
414; GCN-HSA-DAG: {{flat|global}}_store_dwordx4
415; GCN-HSA-DAG: {{flat|global}}_store_dwordx4
416; GCN-HSA-DAG: {{flat|global}}_store_dwordx4
417
418; GCN-HSA-DAG: {{flat|global}}_store_dwordx4
419; GCN-HSA-DAG: {{flat|global}}_store_dwordx4
420; GCN-HSA-DAG: {{flat|global}}_store_dwordx4
421; GCN-HSA-DAG: {{flat|global}}_store_dwordx4
422
423; GCN-HSA-DAG: {{flat|global}}_store_dwordx4
424; GCN-HSA-DAG: {{flat|global}}_store_dwordx4
425; GCN-HSA-DAG: {{flat|global}}_store_dwordx4
426; GCN-HSA-DAG: {{flat|global}}_store_dwordx4
427define amdgpu_kernel void @constant_load_v32i32(<32 x i32> addrspace(1)* %out, <32 x i32> addrspace(4)* %in) #0 { ; copy a <32 x i32> from %in to %out without widening
428  %vec = load <32 x i32>, <32 x i32> addrspace(4)* %in   ; thirty-two-element read through the addrspace(4) pointer
429  store <32 x i32> %vec, <32 x i32> addrspace(1)* %out   ; unchanged vector goes to the addrspace(1) pointer
430  ret void
431}
432
433attributes #0 = { nounwind } ; attribute group #0, referenced by each kernel definition in this file
434