; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GCN-NOHSA,GCN-NOHSA-SI,FUNC %s
; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=kaveri -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GCN-HSA,FUNC %s
; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GCN-NOHSA,GCN-NOHSA-VI,FUNC %s
; RUN: llc -march=r600 -mcpu=redwood -verify-machineinstrs < %s | FileCheck -check-prefixes=EG,FUNC %s
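
; Covers scalar and vector i16 loads from the constant address space, plus
; their zero and sign extensions to i32 and i64, on SI, HSA (kaveri), VI, and
; Evergreen targets.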

; FUNC-LABEL: {{^}}constant_load_i16:
; GCN-NOHSA: buffer_load_ushort v{{[0-9]+}}
; GCN-HSA: flat_load_ushort

; EG: VTX_READ_16 T{{[0-9]+}}.X, T{{[0-9]+}}.X, 0, #1
define amdgpu_kernel void @constant_load_i16(i16 addrspace(1)* %out, i16 addrspace(4)* %in) {
entry:
  %ld = load i16, i16 addrspace(4)* %in
  store i16 %ld, i16 addrspace(1)* %out
  ret void
}

; FUNC-LABEL: {{^}}constant_load_v2i16:
; GCN: s_load_dword s

; EG: VTX_READ_32 T{{[0-9]+}}.X, T{{[0-9]+}}.X, 0, #1
define amdgpu_kernel void @constant_load_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(4)* %in) {
entry:
  %ld = load <2 x i16>, <2 x i16> addrspace(4)* %in
  store <2 x i16> %ld, <2 x i16> addrspace(1)* %out
  ret void
}

; FUNC-LABEL: {{^}}constant_load_v3i16:
; GCN: s_load_dwordx2 s

; EG-DAG: VTX_READ_32 T{{[0-9]+}}.X, T{{[0-9]+}}.X, 0, #1
; EG-DAG: VTX_READ_16 T{{[0-9]+}}.X, T{{[0-9]+}}.X, 4, #1
define amdgpu_kernel void @constant_load_v3i16(<3 x i16> addrspace(1)* %out, <3 x i16> addrspace(4)* %in) {
entry:
  %ld = load <3 x i16>, <3 x i16> addrspace(4)* %in
  store <3 x i16> %ld, <3 x i16> addrspace(1)* %out
  ret void
}

; FUNC-LABEL: {{^}}constant_load_v4i16:
; GCN: s_load_dwordx2

; EG: VTX_READ_64 T{{[0-9]+}}.XY, T{{[0-9]+}}.X, 0, #1
define amdgpu_kernel void @constant_load_v4i16(<4 x i16> addrspace(1)* %out, <4 x i16> addrspace(4)* %in) {
entry:
  %ld = load <4 x i16>, <4 x i16> addrspace(4)* %in
  store <4 x i16> %ld, <4 x i16> addrspace(1)* %out
  ret void
}

; FUNC-LABEL: {{^}}constant_load_v8i16:
; GCN: s_load_dwordx4

; EG: VTX_READ_128 T{{[0-9]+}}.XYZW, T{{[0-9]+}}.X, 0, #1
define amdgpu_kernel void @constant_load_v8i16(<8 x i16> addrspace(1)* %out, <8 x i16> addrspace(4)* %in) {
entry:
  %ld = load <8 x i16>, <8 x i16> addrspace(4)* %in
  store <8 x i16> %ld, <8 x i16> addrspace(1)* %out
  ret void
}

; FUNC-LABEL: {{^}}constant_load_v16i16:
; GCN: s_load_dwordx8

; EG-DAG: VTX_READ_128 T{{[0-9]+}}.XYZW, T{{[0-9]+}}.X, 0, #1
; EG-DAG: VTX_READ_128 T{{[0-9]+}}.XYZW, T{{[0-9]+}}.X, 16, #1
define amdgpu_kernel void @constant_load_v16i16(<16 x i16> addrspace(1)* %out, <16 x i16> addrspace(4)* %in) {
entry:
  %ld = load <16 x i16>, <16 x i16> addrspace(4)* %in
  store <16 x i16> %ld, <16 x i16> addrspace(1)* %out
  ret void
}

; FUNC-LABEL: {{^}}constant_load_v16i16_align2:
; GCN-HSA: flat_load_dwordx4
; GCN-HSA: flat_load_dwordx4
; GCN-HSA: flat_store_dwordx4
; GCN-HSA: flat_store_dwordx4
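; Note: with only 2 byte alignment the scalar (SMEM) path, which presumably
; requires at least dword alignment, cannot be used, so the load stays on the
; vector flat-load path.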
define amdgpu_kernel void @constant_load_v16i16_align2(<16 x i16> addrspace(4)* %ptr0) #0 {
entry:
  %ld = load <16 x i16>, <16 x i16> addrspace(4)* %ptr0, align 2
  store <16 x i16> %ld, <16 x i16> addrspace(1)* undef, align 32
  ret void
}

; FUNC-LABEL: {{^}}constant_zextload_i16_to_i32:
; GCN-NOHSA: buffer_load_ushort
; GCN-NOHSA: buffer_store_dword

; GCN-HSA: flat_load_ushort
; GCN-HSA: flat_store_dword

; EG: VTX_READ_16 T{{[0-9]+\.X, T[0-9]+\.X}}, 0, #1
define amdgpu_kernel void @constant_zextload_i16_to_i32(i32 addrspace(1)* %out, i16 addrspace(4)* %in) #0 {
  %a = load i16, i16 addrspace(4)* %in
  %ext = zext i16 %a to i32
  store i32 %ext, i32 addrspace(1)* %out
  ret void
}

; FUNC-LABEL: {{^}}constant_sextload_i16_to_i32:
; GCN-NOHSA: buffer_load_sshort
; GCN-NOHSA: buffer_store_dword

; GCN-HSA: flat_load_sshort
; GCN-HSA: flat_store_dword

; EG: VTX_READ_16 [[DST:T[0-9]\.[XYZW]]], [[DST]], 0, #1
; EG: BFE_INT {{[* ]*}}T{{[0-9].[XYZW]}}, [[DST]], 0.0, literal
; EG: 16
define amdgpu_kernel void @constant_sextload_i16_to_i32(i32 addrspace(1)* %out, i16 addrspace(4)* %in) #0 {
  %a = load i16, i16 addrspace(4)* %in
  %ext = sext i16 %a to i32
  store i32 %ext, i32 addrspace(1)* %out
  ret void
}

; FUNC-LABEL: {{^}}constant_zextload_v1i16_to_v1i32:
; GCN-NOHSA: buffer_load_ushort
; GCN-HSA: flat_load_ushort

; EG: VTX_READ_16 T{{[0-9]+\.X, T[0-9]+\.X}}, 0, #1
define amdgpu_kernel void @constant_zextload_v1i16_to_v1i32(<1 x i32> addrspace(1)* %out, <1 x i16> addrspace(4)* %in) #0 {
  %load = load <1 x i16>, <1 x i16> addrspace(4)* %in
  %ext = zext <1 x i16> %load to <1 x i32>
  store <1 x i32> %ext, <1 x i32> addrspace(1)* %out
  ret void
}

; FUNC-LABEL: {{^}}constant_sextload_v1i16_to_v1i32:
; GCN-NOHSA: buffer_load_sshort
; GCN-HSA: flat_load_sshort

; EG: VTX_READ_16 [[DST:T[0-9]\.[XYZW]]], [[DST]], 0, #1
; EG: BFE_INT {{[* ]*}}T{{[0-9].[XYZW]}}, [[DST]], 0.0, literal
; EG: 16
define amdgpu_kernel void @constant_sextload_v1i16_to_v1i32(<1 x i32> addrspace(1)* %out, <1 x i16> addrspace(4)* %in) #0 {
  %load = load <1 x i16>, <1 x i16> addrspace(4)* %in
  %ext = sext <1 x i16> %load to <1 x i32>
  store <1 x i32> %ext, <1 x i32> addrspace(1)* %out
  ret void
}

; FUNC-LABEL: {{^}}constant_zextload_v2i16_to_v2i32:
; GCN: s_load_dword s
; GCN-DAG: s_and_b32 s{{[0-9]+}}, s{{[0-9]+}}, 0xffff{{$}}
; GCN-DAG: s_lshr_b32 s{{[0-9]+}}, s{{[0-9]+}}, 16

; v2i16 is naturally 4 byte aligned
; EG: VTX_READ_32 [[DST:T[0-9]\.[XYZW]]], [[DST]], 0, #1
; EG: BFE_UINT {{[* ]*}}T{{[0-9].[XYZW]}}, [[DST]], literal
; EG: 16
; EG: 16
define amdgpu_kernel void @constant_zextload_v2i16_to_v2i32(<2 x i32> addrspace(1)* %out, <2 x i16> addrspace(4)* %in) #0 {
  %load = load <2 x i16>, <2 x i16> addrspace(4)* %in
  %ext = zext <2 x i16> %load to <2 x i32>
  store <2 x i32> %ext, <2 x i32> addrspace(1)* %out
  ret void
}

; FUNC-LABEL: {{^}}constant_sextload_v2i16_to_v2i32:
; GCN: s_load_dword s
; GCN-DAG: s_ashr_i32
; GCN-DAG: s_sext_i32_i16
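; s_sext_i32_i16 presumably produces the low result element, while the
; s_ashr_i32 by 16 yields the sign-extended high element.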

; v2i16 is naturally 4 byte aligned
; EG: MEM_RAT_CACHELESS STORE_RAW [[ST:T[0-9]]].XY, {{T[0-9].[XYZW]}},
; EG: VTX_READ_32 [[DST:T[0-9]\.[XYZW]]], [[DST]], 0, #1
; EG-DAG: BFE_INT {{[* ]*}}[[ST]].X, [[DST]], 0.0, literal
; TODO: We should use ASHR instead of LSHR + BFE
; EG-DAG: BFE_INT {{[* ]*}}[[ST]].Y, {{PV\.[XYZW]}}, 0.0, literal
; EG-DAG: 16
; EG-DAG: 16
define amdgpu_kernel void @constant_sextload_v2i16_to_v2i32(<2 x i32> addrspace(1)* %out, <2 x i16> addrspace(4)* %in) #0 {
  %load = load <2 x i16>, <2 x i16> addrspace(4)* %in
  %ext = sext <2 x i16> %load to <2 x i32>
  store <2 x i32> %ext, <2 x i32> addrspace(1)* %out
  ret void
}

; FUNC-LABEL: {{^}}constant_zextload_v3i16_to_v3i32:
; GCN: s_load_dwordx2

; v3i16 is naturally 8 byte aligned
; EG-DAG: MEM_RAT_CACHELESS STORE_RAW [[ST_LO:T[0-9]]].XY, {{T[0-9].[XYZW]}},
; EG-DAG: MEM_RAT_CACHELESS STORE_RAW [[ST_HI:T[0-9]]].X, {{T[0-9].[XYZW]}},
; EG: CF_END
; EG-DAG: VTX_READ_32 [[DST_LO:T[0-9]\.[XYZW]]], {{T[0-9]\.[XYZW]}}, 0, #1
; EG-DAG: VTX_READ_16 [[DST_HI:T[0-9]\.[XYZW]]], {{T[0-9]\.[XYZW]}}, 4, #1
; TODO: This should use DST, but for some reason there are redundant MOVs
; EG-DAG: LSHR {{[* ]*}}[[ST_LO]].Y, {{T[0-9]\.[XYZW]}}, literal
; EG-DAG: 16
; EG-DAG: AND_INT {{[* ]*}}[[ST_LO]].X, {{T[0-9]\.[XYZW]}}, literal
; EG-DAG: AND_INT {{[* ]*}}[[ST_HI]].X, {{T[0-9]\.[XYZW]}}, literal
; EG-DAG: 65535
; EG-DAG: 65535
define amdgpu_kernel void @constant_zextload_v3i16_to_v3i32(<3 x i32> addrspace(1)* %out, <3 x i16> addrspace(4)* %in) {
entry:
  %ld = load <3 x i16>, <3 x i16> addrspace(4)* %in
  %ext = zext <3 x i16> %ld to <3 x i32>
  store <3 x i32> %ext, <3 x i32> addrspace(1)* %out
  ret void
}

; FUNC-LABEL: {{^}}constant_sextload_v3i16_to_v3i32:
; GCN: s_load_dwordx2

; EG-DAG: MEM_RAT_CACHELESS STORE_RAW [[ST_LO:T[0-9]]].XY, {{T[0-9].[XYZW]}},
; EG-DAG: MEM_RAT_CACHELESS STORE_RAW [[ST_HI:T[0-9]]].X, {{T[0-9].[XYZW]}},
; v3i16 is naturally 8 byte aligned
; EG-DAG: VTX_READ_32 [[DST_HI:T[0-9]\.[XYZW]]], [[PTR:T[0-9]\.[XYZW]]], 0, #1
; EG-DAG: VTX_READ_16 [[DST_LO:T[0-9]\.[XYZW]]], {{T[0-9]\.[XYZW]}}, 4, #1
; EG-DAG: ASHR {{[* ]*}}[[ST_LO]].Y, {{T[0-9]\.[XYZW]}}, literal
; EG-DAG: BFE_INT {{[* ]*}}[[ST_LO]].X, {{T[0-9]\.[XYZW]}}, 0.0, literal
; EG-DAG: BFE_INT {{[* ]*}}[[ST_HI]].X, {{T[0-9]\.[XYZW]}}, 0.0, literal
; EG-DAG: 16
; EG-DAG: 16
define amdgpu_kernel void @constant_sextload_v3i16_to_v3i32(<3 x i32> addrspace(1)* %out, <3 x i16> addrspace(4)* %in) {
entry:
  %ld = load <3 x i16>, <3 x i16> addrspace(4)* %in
  %ext = sext <3 x i16> %ld to <3 x i32>
  store <3 x i32> %ext, <3 x i32> addrspace(1)* %out
  ret void
}

; FUNC-LABEL: {{^}}constant_zextload_v4i16_to_v4i32:
; GCN: s_load_dwordx2
; GCN-DAG: s_and_b32
; GCN-DAG: s_lshr_b32

; v4i16 is naturally 8 byte aligned
; EG: MEM_RAT_CACHELESS STORE_RAW [[ST:T[0-9]]].XYZW, {{T[0-9].[XYZW]}}
; EG: VTX_READ_64 [[LD:T[0-9]]].XY, {{T[0-9].[XYZW]}}, 0, #1
; TODO: This should use LD, but for some reason there are redundant MOVs
; EG-DAG: BFE_UINT {{[* ]*}}[[ST]].Y, {{.*\.[XYZW]}}, literal
; EG-DAG: BFE_UINT {{[* ]*}}[[ST]].W, {{.*\.[XYZW]}}, literal
; EG-DAG: 16
; EG-DAG: 16
; EG-DAG: AND_INT {{[* ]*}}[[ST]].X, {{T[0-9]\.[XYZW]}}, literal
; EG-DAG: AND_INT {{[* ]*}}[[ST]].Z, {{T[0-9]\.[XYZW]}}, literal
; EG-DAG: 65535
; EG-DAG: 65535
define amdgpu_kernel void @constant_zextload_v4i16_to_v4i32(<4 x i32> addrspace(1)* %out, <4 x i16> addrspace(4)* %in) #0 {
  %load = load <4 x i16>, <4 x i16> addrspace(4)* %in
  %ext = zext <4 x i16> %load to <4 x i32>
  store <4 x i32> %ext, <4 x i32> addrspace(1)* %out
  ret void
}

; FUNC-LABEL: {{^}}constant_sextload_v4i16_to_v4i32:
; GCN: s_load_dwordx2
; GCN-DAG: s_ashr_i32
; GCN-DAG: s_sext_i32_i16

; v4i16 is naturally 8 byte aligned
; EG: MEM_RAT_CACHELESS STORE_RAW [[ST:T[0-9]]].XYZW, {{T[0-9]\.[XYZW]}},
; EG: VTX_READ_64 [[DST:T[0-9]]].XY, {{T[0-9].[XYZW]}}, 0, #1
; TODO: This should use DST, but for some reason there are redundant MOVs
; EG-DAG: BFE_INT {{[* ]*}}[[ST]].X, {{.*}}, 0.0, literal
; EG-DAG: BFE_INT {{[* ]*}}[[ST]].Z, {{.*}}, 0.0, literal
; TODO: We should use ASHR instead of LSHR + BFE
; EG-DAG: BFE_INT {{[* ]*}}[[ST]].Y, {{.*}}, 0.0, literal
; EG-DAG: BFE_INT {{[* ]*}}[[ST]].W, {{.*}}, 0.0, literal
; EG-DAG: 16
; EG-DAG: 16
; EG-DAG: 16
; EG-DAG: 16
define amdgpu_kernel void @constant_sextload_v4i16_to_v4i32(<4 x i32> addrspace(1)* %out, <4 x i16> addrspace(4)* %in) #0 {
  %load = load <4 x i16>, <4 x i16> addrspace(4)* %in
  %ext = sext <4 x i16> %load to <4 x i32>
  store <4 x i32> %ext, <4 x i32> addrspace(1)* %out
  ret void
}

; FUNC-LABEL: {{^}}constant_zextload_v8i16_to_v8i32:
; GCN: s_load_dwordx4
; GCN-DAG: s_and_b32
; GCN-DAG: s_lshr_b32

; v8i16 is naturally 16 byte aligned
; EG: MEM_RAT_CACHELESS STORE_RAW [[ST_HI:T[0-9]]].XYZW, {{T[0-9]+.[XYZW]}},
; EG: MEM_RAT_CACHELESS STORE_RAW [[ST_LO:T[0-9]]].XYZW, {{T[0-9]+.[XYZW]}},
; EG: VTX_READ_128 [[DST:T[0-9]]].XYZW, {{T[0-9].[XYZW]}}, 0, #1
; TODO: These should use LSHR instead of BFE_UINT
; TODO: This should use DST, but for some reason there are redundant MOVs
; EG-DAG: BFE_UINT {{[* ]*}}[[ST_LO]].Y, {{.*}}, literal
; EG-DAG: BFE_UINT {{[* ]*}}[[ST_LO]].W, {{.*}}, literal
; EG-DAG: BFE_UINT {{[* ]*}}[[ST_HI]].Y, {{.*}}, literal
; EG-DAG: BFE_UINT {{[* ]*}}[[ST_HI]].W, {{.*}}, literal
; EG-DAG: AND_INT {{[* ]*}}[[ST_LO]].X, {{.*}}, literal
; EG-DAG: AND_INT {{[* ]*}}[[ST_LO]].Z, {{.*}}, literal
; EG-DAG: AND_INT {{[* ]*}}[[ST_HI]].X, {{.*}}, literal
; EG-DAG: AND_INT {{[* ]*}}[[ST_HI]].Z, {{.*}}, literal
; EG-DAG: 16
; EG-DAG: 16
; EG-DAG: 16
; EG-DAG: 16
; EG-DAG: 65535
; EG-DAG: 65535
; EG-DAG: 65535
; EG-DAG: 65535
define amdgpu_kernel void @constant_zextload_v8i16_to_v8i32(<8 x i32> addrspace(1)* %out, <8 x i16> addrspace(4)* %in) #0 {
  %load = load <8 x i16>, <8 x i16> addrspace(4)* %in
  %ext = zext <8 x i16> %load to <8 x i32>
  store <8 x i32> %ext, <8 x i32> addrspace(1)* %out
  ret void
}

; FUNC-LABEL: {{^}}constant_sextload_v8i16_to_v8i32:
; GCN: s_load_dwordx4
; GCN-DAG: s_ashr_i32
; GCN-DAG: s_sext_i32_i16

; v8i16 is naturally 16 byte aligned
; EG: MEM_RAT_CACHELESS STORE_RAW [[ST_HI:T[0-9]]].XYZW, {{T[0-9]+.[XYZW]}},
; EG: MEM_RAT_CACHELESS STORE_RAW [[ST_LO:T[0-9]]].XYZW, {{T[0-9]+.[XYZW]}},
; EG: VTX_READ_128 [[DST:T[0-9]]].XYZW, {{T[0-9].[XYZW]}}, 0, #1
; TODO: 4 of these should use ASHR instead of LSHR + BFE_INT
; TODO: This should use DST, but for some reason there are redundant MOVs
; EG-DAG: BFE_INT {{[* ]*}}[[ST_LO]].Y, {{.*}}, 0.0, literal
; EG-DAG: BFE_INT {{[* ]*}}[[ST_LO]].W, {{.*}}, 0.0, literal
; EG-DAG: BFE_INT {{[* ]*}}[[ST_HI]].Y, {{.*}}, 0.0, literal
; EG-DAG: BFE_INT {{[* ]*}}[[ST_HI]].W, {{.*}}, 0.0, literal
; EG-DAG: BFE_INT {{[* ]*}}[[ST_LO]].X, {{.*}}, 0.0, literal
; EG-DAG: BFE_INT {{[* ]*}}[[ST_LO]].Z, {{.*}}, 0.0, literal
; EG-DAG: BFE_INT {{[* ]*}}[[ST_HI]].X, {{.*}}, 0.0, literal
; EG-DAG: BFE_INT {{[* ]*}}[[ST_HI]].Z, {{.*}}, 0.0, literal
; EG-DAG: 16
; EG-DAG: 16
; EG-DAG: 16
; EG-DAG: 16
; EG-DAG: 16
; EG-DAG: 16
; EG-DAG: 16
; EG-DAG: 16
define amdgpu_kernel void @constant_sextload_v8i16_to_v8i32(<8 x i32> addrspace(1)* %out, <8 x i16> addrspace(4)* %in) #0 {
  %load = load <8 x i16>, <8 x i16> addrspace(4)* %in
  %ext = sext <8 x i16> %load to <8 x i32>
  store <8 x i32> %ext, <8 x i32> addrspace(1)* %out
  ret void
}

; FUNC-LABEL: {{^}}constant_zextload_v16i16_to_v16i32:
; GCN: s_load_dwordx8
; GCN-DAG: s_and_b32
; GCN-DAG: s_lshr_b32

; v16i16 is naturally 32 byte aligned
; EG-DAG: VTX_READ_128 [[DST_HI:T[0-9]+\.XYZW]], {{T[0-9]+.[XYZW]}}, 0, #1
; EG-DAG: VTX_READ_128 [[DST_LO:T[0-9]+\.XYZW]], {{T[0-9]+.[XYZW]}}, 16, #1
define amdgpu_kernel void @constant_zextload_v16i16_to_v16i32(<16 x i32> addrspace(1)* %out, <16 x i16> addrspace(4)* %in) #0 {
  %load = load <16 x i16>, <16 x i16> addrspace(4)* %in
  %ext = zext <16 x i16> %load to <16 x i32>
  store <16 x i32> %ext, <16 x i32> addrspace(1)* %out
  ret void
}

; FUNC-LABEL: {{^}}constant_sextload_v16i16_to_v16i32:
; GCN: s_load_dwordx8
; GCN-DAG: s_ashr_i32
; GCN-DAG: s_sext_i32_i16

; v16i16 is naturally 32 byte aligned
; EG-DAG: VTX_READ_128 [[DST_HI:T[0-9]+\.XYZW]], {{T[0-9]+\.[XYZW]}}, 0, #1
; EG-DAG: VTX_READ_128 [[DST_LO:T[0-9]+\.XYZW]], {{T[0-9]+\.[XYZW]}}, 16, #1
define amdgpu_kernel void @constant_sextload_v16i16_to_v16i32(<16 x i32> addrspace(1)* %out, <16 x i16> addrspace(4)* %in) #0 {
  %load = load <16 x i16>, <16 x i16> addrspace(4)* %in
  %ext = sext <16 x i16> %load to <16 x i32>
  store <16 x i32> %ext, <16 x i32> addrspace(1)* %out
  ret void
}

; FUNC-LABEL: {{^}}constant_zextload_v32i16_to_v32i32:
; GCN-DAG: s_load_dwordx16
; GCN-DAG: s_mov_b32 [[K:s[0-9]+]], 0xffff{{$}}
; GCN-DAG: s_lshr_b32 s{{[0-9]+}}, s{{[0-9]+}}, 16
; GCN-DAG: s_and_b32 s{{[0-9]+}}, s{{[0-9]+}}, [[K]]

; EG-DAG: VTX_READ_128 {{T[0-9]+\.XYZW}}, {{T[0-9]+\.[XYZW]}}, 0, #1
; EG-DAG: VTX_READ_128 {{T[0-9]+\.XYZW}}, {{T[0-9]+\.[XYZW]}}, 16, #1
; EG-DAG: VTX_READ_128 {{T[0-9]+\.XYZW}}, {{T[0-9]+\.[XYZW]}}, 32, #1
; EG-DAG: VTX_READ_128 {{T[0-9]+\.XYZW}}, {{T[0-9]+\.[XYZW]}}, 48, #1
define amdgpu_kernel void @constant_zextload_v32i16_to_v32i32(<32 x i32> addrspace(1)* %out, <32 x i16> addrspace(4)* %in) #0 {
  %load = load <32 x i16>, <32 x i16> addrspace(4)* %in
  %ext = zext <32 x i16> %load to <32 x i32>
  store <32 x i32> %ext, <32 x i32> addrspace(1)* %out
  ret void
}

; FUNC-LABEL: {{^}}constant_sextload_v32i16_to_v32i32:
; GCN: s_load_dwordx16
; GCN-DAG: s_ashr_i32
; GCN-DAG: s_sext_i32_i16

; EG-DAG: VTX_READ_128 {{T[0-9]+\.XYZW}}, {{T[0-9]+\.[XYZW]}}, 0, #1
; EG-DAG: VTX_READ_128 {{T[0-9]+\.XYZW}}, {{T[0-9]+\.[XYZW]}}, 16, #1
; EG-DAG: VTX_READ_128 {{T[0-9]+\.XYZW}}, {{T[0-9]+\.[XYZW]}}, 32, #1
; EG-DAG: VTX_READ_128 {{T[0-9]+\.XYZW}}, {{T[0-9]+\.[XYZW]}}, 48, #1
define amdgpu_kernel void @constant_sextload_v32i16_to_v32i32(<32 x i32> addrspace(1)* %out, <32 x i16> addrspace(4)* %in) #0 {
  %load = load <32 x i16>, <32 x i16> addrspace(4)* %in
  %ext = sext <32 x i16> %load to <32 x i32>
  store <32 x i32> %ext, <32 x i32> addrspace(1)* %out
  ret void
}

; FUNC-LABEL: {{^}}constant_zextload_v64i16_to_v64i32:
; GCN: s_load_dwordx16
; GCN: s_load_dwordx16

; EG-DAG: VTX_READ_128 {{T[0-9]+\.XYZW}}, {{T[0-9]+\.[XYZW]}}, 0, #1
; EG-DAG: VTX_READ_128 {{T[0-9]+\.XYZW}}, {{T[0-9]+\.[XYZW]}}, 16, #1
; EG-DAG: VTX_READ_128 {{T[0-9]+\.XYZW}}, {{T[0-9]+\.[XYZW]}}, 32, #1
; EG-DAG: VTX_READ_128 {{T[0-9]+\.XYZW}}, {{T[0-9]+\.[XYZW]}}, 48, #1
; EG-DAG: VTX_READ_128 {{T[0-9]+\.XYZW}}, {{T[0-9]+\.[XYZW]}}, 64, #1
; EG-DAG: VTX_READ_128 {{T[0-9]+\.XYZW}}, {{T[0-9]+\.[XYZW]}}, 80, #1
; EG-DAG: VTX_READ_128 {{T[0-9]+\.XYZW}}, {{T[0-9]+\.[XYZW]}}, 96, #1
; EG-DAG: VTX_READ_128 {{T[0-9]+\.XYZW}}, {{T[0-9]+\.[XYZW]}}, 112, #1
define amdgpu_kernel void @constant_zextload_v64i16_to_v64i32(<64 x i32> addrspace(1)* %out, <64 x i16> addrspace(4)* %in) #0 {
  %load = load <64 x i16>, <64 x i16> addrspace(4)* %in
  %ext = zext <64 x i16> %load to <64 x i32>
  store <64 x i32> %ext, <64 x i32> addrspace(1)* %out
  ret void
}

; FUNC-LABEL: {{^}}constant_sextload_v64i16_to_v64i32:

; EG-DAG: VTX_READ_128 {{T[0-9]+\.XYZW}}, {{T[0-9]+\.[XYZW]}}, 0, #1
; EG-DAG: VTX_READ_128 {{T[0-9]+\.XYZW}}, {{T[0-9]+\.[XYZW]}}, 16, #1
; EG-DAG: VTX_READ_128 {{T[0-9]+\.XYZW}}, {{T[0-9]+\.[XYZW]}}, 32, #1
; EG-DAG: VTX_READ_128 {{T[0-9]+\.XYZW}}, {{T[0-9]+\.[XYZW]}}, 48, #1
; EG-DAG: VTX_READ_128 {{T[0-9]+\.XYZW}}, {{T[0-9]+\.[XYZW]}}, 64, #1
; EG-DAG: VTX_READ_128 {{T[0-9]+\.XYZW}}, {{T[0-9]+\.[XYZW]}}, 80, #1
; EG-DAG: VTX_READ_128 {{T[0-9]+\.XYZW}}, {{T[0-9]+\.[XYZW]}}, 96, #1
; EG-DAG: VTX_READ_128 {{T[0-9]+\.XYZW}}, {{T[0-9]+\.[XYZW]}}, 112, #1
define amdgpu_kernel void @constant_sextload_v64i16_to_v64i32(<64 x i32> addrspace(1)* %out, <64 x i16> addrspace(4)* %in) #0 {
  %load = load <64 x i16>, <64 x i16> addrspace(4)* %in
  %ext = sext <64 x i16> %load to <64 x i32>
  store <64 x i32> %ext, <64 x i32> addrspace(1)* %out
  ret void
}

; FUNC-LABEL: {{^}}constant_zextload_i16_to_i64:
; GCN-NOHSA-DAG: buffer_load_ushort v[[LO:[0-9]+]],
; GCN-HSA-DAG: flat_load_ushort v[[LO:[0-9]+]],
; GCN-DAG: v_mov_b32_e32 v[[HI:[0-9]+]], 0{{$}}

; GCN-NOHSA: buffer_store_dwordx2 v{{\[}}[[LO]]:[[HI]]]
; GCN-HSA: flat_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[}}[[LO]]:[[HI]]{{\]}}

; EG: VTX_READ_16 T{{[0-9]+}}.X, T{{[0-9]+}}.X, 0, #1
; EG: MOV {{.*}}, 0.0
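; The high dword of the zero-extended result is simply a constant 0 (the
; v_mov_b32 / MOV 0.0 checked above).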
define amdgpu_kernel void @constant_zextload_i16_to_i64(i64 addrspace(1)* %out, i16 addrspace(4)* %in) #0 {
  %a = load i16, i16 addrspace(4)* %in
  %ext = zext i16 %a to i64
  store i64 %ext, i64 addrspace(1)* %out
  ret void
}

; FUNC-LABEL: {{^}}constant_sextload_i16_to_i64:
; FIXME: Need to optimize this sequence to avoid extra bfe:
;  t28: i32,ch = load<LD2[%in(addrspace=4)], anyext from i16> t12, t27, undef:i64
;          t31: i64 = any_extend t28
;        t33: i64 = sign_extend_inreg t31, ValueType:ch:i16

; GCN-NOHSA-SI-DAG: buffer_load_sshort v[[LO:[0-9]+]],
; GCN-HSA-DAG: flat_load_sshort v[[LO:[0-9]+]],
; GCN-NOHSA-VI-DAG: buffer_load_ushort v[[ULO:[0-9]+]],
; GCN-NOHSA-VI-DAG: v_bfe_i32 v[[LO:[0-9]+]], v[[ULO]], 0, 16
; GCN-DAG: v_ashrrev_i32_e32 v[[HI:[0-9]+]], 31, v[[LO]]

; GCN-NOHSA: buffer_store_dwordx2 v{{\[}}[[LO]]:[[HI]]]
; GCN-HSA: flat_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[}}[[LO]]:[[HI]]{{\]}}

; EG: VTX_READ_16 T{{[0-9]+}}.X, T{{[0-9]+}}.X, 0, #1
; EG: ASHR {{\**}} {{T[0-9]\.[XYZW]}}, {{.*}}, literal
; TODO: These could be expanded earlier using ASHR 15
; EG: 31
define amdgpu_kernel void @constant_sextload_i16_to_i64(i64 addrspace(1)* %out, i16 addrspace(4)* %in) #0 {
  %a = load i16, i16 addrspace(4)* %in
  %ext = sext i16 %a to i64
  store i64 %ext, i64 addrspace(1)* %out
  ret void
}

; FUNC-LABEL: {{^}}constant_zextload_v1i16_to_v1i64:

; EG: VTX_READ_16 T{{[0-9]+}}.X, T{{[0-9]+}}.X, 0, #1
; EG: MOV {{.*}}, 0.0
define amdgpu_kernel void @constant_zextload_v1i16_to_v1i64(<1 x i64> addrspace(1)* %out, <1 x i16> addrspace(4)* %in) #0 {
  %load = load <1 x i16>, <1 x i16> addrspace(4)* %in
  %ext = zext <1 x i16> %load to <1 x i64>
  store <1 x i64> %ext, <1 x i64> addrspace(1)* %out
  ret void
}

; FUNC-LABEL: {{^}}constant_sextload_v1i16_to_v1i64:

; EG: VTX_READ_16 T{{[0-9]+}}.X, T{{[0-9]+}}.X, 0, #1
; EG: ASHR {{\**}} {{T[0-9]\.[XYZW]}}, {{.*}}, literal
; TODO: These could be expanded earlier using ASHR 15
; EG: 31
define amdgpu_kernel void @constant_sextload_v1i16_to_v1i64(<1 x i64> addrspace(1)* %out, <1 x i16> addrspace(4)* %in) #0 {
  %load = load <1 x i16>, <1 x i16> addrspace(4)* %in
  %ext = sext <1 x i16> %load to <1 x i64>
  store <1 x i64> %ext, <1 x i64> addrspace(1)* %out
  ret void
}

; FUNC-LABEL: {{^}}constant_zextload_v2i16_to_v2i64:

; EG: VTX_READ_32 T{{[0-9]+}}.X, T{{[0-9]+}}.X, 0, #1
define amdgpu_kernel void @constant_zextload_v2i16_to_v2i64(<2 x i64> addrspace(1)* %out, <2 x i16> addrspace(4)* %in) #0 {
  %load = load <2 x i16>, <2 x i16> addrspace(4)* %in
  %ext = zext <2 x i16> %load to <2 x i64>
  store <2 x i64> %ext, <2 x i64> addrspace(1)* %out
  ret void
}

; FUNC-LABEL: {{^}}constant_sextload_v2i16_to_v2i64:

; EG: VTX_READ_32 T{{[0-9]+}}.X, T{{[0-9]+}}.X, 0, #1
define amdgpu_kernel void @constant_sextload_v2i16_to_v2i64(<2 x i64> addrspace(1)* %out, <2 x i16> addrspace(4)* %in) #0 {
  %load = load <2 x i16>, <2 x i16> addrspace(4)* %in
  %ext = sext <2 x i16> %load to <2 x i64>
  store <2 x i64> %ext, <2 x i64> addrspace(1)* %out
  ret void
}

; FUNC-LABEL: {{^}}constant_zextload_v4i16_to_v4i64:

; EG: VTX_READ_64 T{{[0-9]+}}.XY, T{{[0-9]+}}.X, 0, #1
define amdgpu_kernel void @constant_zextload_v4i16_to_v4i64(<4 x i64> addrspace(1)* %out, <4 x i16> addrspace(4)* %in) #0 {
  %load = load <4 x i16>, <4 x i16> addrspace(4)* %in
  %ext = zext <4 x i16> %load to <4 x i64>
  store <4 x i64> %ext, <4 x i64> addrspace(1)* %out
  ret void
}

; FUNC-LABEL: {{^}}constant_sextload_v4i16_to_v4i64:

; EG: VTX_READ_64 T{{[0-9]+}}.XY, T{{[0-9]+}}.X, 0, #1
define amdgpu_kernel void @constant_sextload_v4i16_to_v4i64(<4 x i64> addrspace(1)* %out, <4 x i16> addrspace(4)* %in) #0 {
  %load = load <4 x i16>, <4 x i16> addrspace(4)* %in
  %ext = sext <4 x i16> %load to <4 x i64>
  store <4 x i64> %ext, <4 x i64> addrspace(1)* %out
  ret void
}

; FUNC-LABEL: {{^}}constant_zextload_v8i16_to_v8i64:

; EG: VTX_READ_128 T{{[0-9]+}}.XYZW, T{{[0-9]+}}.X, 0, #1
define amdgpu_kernel void @constant_zextload_v8i16_to_v8i64(<8 x i64> addrspace(1)* %out, <8 x i16> addrspace(4)* %in) #0 {
  %load = load <8 x i16>, <8 x i16> addrspace(4)* %in
  %ext = zext <8 x i16> %load to <8 x i64>
  store <8 x i64> %ext, <8 x i64> addrspace(1)* %out
  ret void
}

; FUNC-LABEL: {{^}}constant_sextload_v8i16_to_v8i64:

; EG: VTX_READ_128 T{{[0-9]+}}.XYZW, T{{[0-9]+}}.X, 0, #1
define amdgpu_kernel void @constant_sextload_v8i16_to_v8i64(<8 x i64> addrspace(1)* %out, <8 x i16> addrspace(4)* %in) #0 {
  %load = load <8 x i16>, <8 x i16> addrspace(4)* %in
  %ext = sext <8 x i16> %load to <8 x i64>
  store <8 x i64> %ext, <8 x i64> addrspace(1)* %out
  ret void
}

; FUNC-LABEL: {{^}}constant_zextload_v16i16_to_v16i64:

; EG-DAG: VTX_READ_128 T{{[0-9]+}}.XYZW, T{{[0-9]+}}.X, 0, #1
; EG-DAG: VTX_READ_128 T{{[0-9]+}}.XYZW, T{{[0-9]+}}.X, 16, #1
define amdgpu_kernel void @constant_zextload_v16i16_to_v16i64(<16 x i64> addrspace(1)* %out, <16 x i16> addrspace(4)* %in) #0 {
  %load = load <16 x i16>, <16 x i16> addrspace(4)* %in
  %ext = zext <16 x i16> %load to <16 x i64>
  store <16 x i64> %ext, <16 x i64> addrspace(1)* %out
  ret void
}

; FUNC-LABEL: {{^}}constant_sextload_v16i16_to_v16i64:

; EG-DAG: VTX_READ_128 T{{[0-9]+}}.XYZW, T{{[0-9]+}}.X, 0, #1
; EG-DAG: VTX_READ_128 T{{[0-9]+}}.XYZW, T{{[0-9]+}}.X, 16, #1
define amdgpu_kernel void @constant_sextload_v16i16_to_v16i64(<16 x i64> addrspace(1)* %out, <16 x i16> addrspace(4)* %in) #0 {
  %load = load <16 x i16>, <16 x i16> addrspace(4)* %in
  %ext = sext <16 x i16> %load to <16 x i64>
  store <16 x i64> %ext, <16 x i64> addrspace(1)* %out
  ret void
}

; FUNC-LABEL: {{^}}constant_zextload_v32i16_to_v32i64:

; EG-DAG: VTX_READ_128 T{{[0-9]+}}.XYZW, T{{[0-9]+}}.X, 0, #1
; EG-DAG: VTX_READ_128 T{{[0-9]+}}.XYZW, T{{[0-9]+}}.X, 16, #1
; EG-DAG: VTX_READ_128 T{{[0-9]+}}.XYZW, T{{[0-9]+}}.X, 32, #1
; EG-DAG: VTX_READ_128 T{{[0-9]+}}.XYZW, T{{[0-9]+}}.X, 48, #1
define amdgpu_kernel void @constant_zextload_v32i16_to_v32i64(<32 x i64> addrspace(1)* %out, <32 x i16> addrspace(4)* %in) #0 {
  %load = load <32 x i16>, <32 x i16> addrspace(4)* %in
  %ext = zext <32 x i16> %load to <32 x i64>
  store <32 x i64> %ext, <32 x i64> addrspace(1)* %out
  ret void
}

; FUNC-LABEL: {{^}}constant_sextload_v32i16_to_v32i64:

; EG-DAG: VTX_READ_128 T{{[0-9]+}}.XYZW, T{{[0-9]+}}.X, 0, #1
; EG-DAG: VTX_READ_128 T{{[0-9]+}}.XYZW, T{{[0-9]+}}.X, 16, #1
; EG-DAG: VTX_READ_128 T{{[0-9]+}}.XYZW, T{{[0-9]+}}.X, 32, #1
; EG-DAG: VTX_READ_128 T{{[0-9]+}}.XYZW, T{{[0-9]+}}.X, 48, #1
define amdgpu_kernel void @constant_sextload_v32i16_to_v32i64(<32 x i64> addrspace(1)* %out, <32 x i16> addrspace(4)* %in) #0 {
  %load = load <32 x i16>, <32 x i16> addrspace(4)* %in
  %ext = sext <32 x i16> %load to <32 x i64>
  store <32 x i64> %ext, <32 x i64> addrspace(1)* %out
  ret void
}

; These trigger undefined register machine verifier errors

; ; XFUNC-LABEL: {{^}}constant_zextload_v64i16_to_v64i64:
; define amdgpu_kernel void @constant_zextload_v64i16_to_v64i64(<64 x i64> addrspace(1)* %out, <64 x i16> addrspace(4)* %in) #0 {
;   %load = load <64 x i16>, <64 x i16> addrspace(4)* %in
;   %ext = zext <64 x i16> %load to <64 x i64>
;   store <64 x i64> %ext, <64 x i64> addrspace(1)* %out
;   ret void
; }

; ; XFUNC-LABEL: {{^}}constant_sextload_v64i16_to_v64i64:
; define amdgpu_kernel void @constant_sextload_v64i16_to_v64i64(<64 x i64> addrspace(1)* %out, <64 x i16> addrspace(4)* %in) #0 {
;   %load = load <64 x i16>, <64 x i16> addrspace(4)* %in
;   %ext = sext <64 x i16> %load to <64 x i64>
;   store <64 x i64> %ext, <64 x i64> addrspace(1)* %out
;   ret void
; }

attributes #0 = { nounwind }