; RUN: llc -march=amdgcn -mtriple=amdgcn---amdgiz -verify-machineinstrs < %s | FileCheck -allow-deprecated-dag-overlap -check-prefixes=GCN,SI,SICIVI,FUNC %s
; RUN: llc -march=amdgcn -mtriple=amdgcn---amdgiz -mcpu=tonga -verify-machineinstrs < %s | FileCheck -allow-deprecated-dag-overlap -check-prefixes=GCN,VI,SICIVI,FUNC %s
; RUN: llc -march=amdgcn -mtriple=amdgcn---amdgiz -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -allow-deprecated-dag-overlap -check-prefixes=GCN,GFX9,FUNC %s
; RUN: llc -march=r600 -mtriple=r600---amdgiz -mcpu=redwood -verify-machineinstrs < %s | FileCheck -allow-deprecated-dag-overlap -check-prefix=EG -check-prefix=FUNC %s

; Testing for ds_read/write_b128
; RUN: llc -march=amdgcn -mcpu=tonga -mattr=+enable-ds128 < %s | FileCheck -allow-deprecated-dag-overlap -check-prefixes=CIVI,FUNC %s
; RUN: llc -march=amdgcn -mcpu=gfx900 -mattr=+enable-ds128 < %s | FileCheck -allow-deprecated-dag-overlap -check-prefixes=CIVI,FUNC %s

; Round-trips a single i8 through addrspace(3): loads from %in and stores the
; value unchanged to %out.
; FUNC-LABEL: {{^}}local_load_i8:
; GCN-NOT: s_wqm_b64
; SICIVI: s_mov_b32 m0
; GFX9-NOT: m0
; GCN: ds_read_u8

; EG: LDS_UBYTE_READ_RET
define amdgpu_kernel void @local_load_i8(i8 addrspace(3)* %out, i8 addrspace(3)* %in) #0 {
entry:
  %ld = load i8, i8 addrspace(3)* %in
  store i8 %ld, i8 addrspace(3)* %out
  ret void
}

; Round-trips a <2 x i8> through addrspace(3): loads from %in and stores the
; vector unchanged to %out.
; FUNC-LABEL: {{^}}local_load_v2i8:
; GCN-NOT: s_wqm_b64
; SICIVI: s_mov_b32 m0
; GFX9-NOT: m0
; GCN: ds_read_u16

; EG: LDS_USHORT_READ_RET
define amdgpu_kernel void @local_load_v2i8(<2 x i8> addrspace(3)* %out, <2 x i8> addrspace(3)* %in) #0 {
entry:
  %ld = load <2 x i8>, <2 x i8> addrspace(3)* %in
  store <2 x i8> %ld, <2 x i8> addrspace(3)* %out
  ret void
}

; Round-trips a <3 x i8> through addrspace(3): loads from %in and stores the
; vector unchanged to %out.
; FUNC-LABEL: {{^}}local_load_v3i8:
; GFX9-NOT: m0
; GCN: ds_read_b32

; The full opcode name is spelled out so this check reads like every other
; redwood check in the file instead of relying on a substring match.
; EG: LDS_READ_RET
define amdgpu_kernel void @local_load_v3i8(<3 x i8> addrspace(3)* %out, <3 x i8> addrspace(3)* %in) #0 {
entry:
  %ld = load <3 x i8>, <3 x i8> addrspace(3)* %in
  store <3 x i8> %ld, <3 x i8> addrspace(3)* %out
  ret void
}

; Round-trips a <4 x i8> through addrspace(3): loads from %in and stores the
; vector unchanged to %out.
; FUNC-LABEL: {{^}}local_load_v4i8:
; GFX9-NOT: m0
; GCN: ds_read_b32

; EG: LDS_READ_RET
define amdgpu_kernel void @local_load_v4i8(<4 x i8> addrspace(3)* %out, <4 x i8> addrspace(3)* %in) #0 {
entry:
  %ld = load <4 x i8>, <4 x i8> addrspace(3)* %in
  store <4 x i8> %ld, <4 x i8> addrspace(3)* %out
  ret void
}

; Round-trips a <8 x i8> through addrspace(3): loads from %in and stores the
; vector unchanged to %out.
; FUNC-LABEL: {{^}}local_load_v8i8:
; GFX9-NOT: m0
; GCN: ds_read_b64

; EG: LDS_READ_RET
; EG: LDS_READ_RET
define amdgpu_kernel void @local_load_v8i8(<8 x i8> addrspace(3)* %out, <8 x i8> addrspace(3)* %in) #0 {
entry:
  %ld = load <8 x i8>, <8 x i8> addrspace(3)* %in
  store <8 x i8> %ld, <8 x i8> addrspace(3)* %out
  ret void
}

; Round-trips a <16 x i8> through addrspace(3) with default alignment. The LO/HI
; captures tie the register range written by the store to the range the load
; defined.
; FUNC-LABEL: {{^}}local_load_v16i8:
; GFX9-NOT: m0
; GCN: ds_read2_b64  v{{\[}}[[LO:[0-9]+]]:[[HI:[0-9]+]]{{\]}}, v{{[0-9]+}} offset1:1{{$}}
; GCN: ds_write2_b64 v{{[0-9]+}}, v{{\[}}[[LO]]:{{[0-9]+}}], v[{{[0-9]+}}:[[HI]]{{\]}} offset1:1{{$}}

; EG: LDS_READ_RET
; EG: LDS_READ_RET
; EG: LDS_READ_RET
; EG: LDS_READ_RET
define amdgpu_kernel void @local_load_v16i8(<16 x i8> addrspace(3)* %out, <16 x i8> addrspace(3)* %in) #0 {
entry:
  %ld = load <16 x i8>, <16 x i8> addrspace(3)* %in
  store <16 x i8> %ld, <16 x i8> addrspace(3)* %out
  ret void
}

; Loads an i8 from addrspace(3), zero-extends it to i32, and stores the result
; to addrspace(3).
; FUNC-LABEL: {{^}}local_zextload_i8_to_i32:
; GFX9-NOT: m0
; GCN-NOT: s_wqm_b64
; SICIVI: s_mov_b32 m0
; GCN: ds_read_u8

; EG: LDS_UBYTE_READ_RET
define amdgpu_kernel void @local_zextload_i8_to_i32(i32 addrspace(3)* %out, i8 addrspace(3)* %in) #0 {
  %a = load i8, i8 addrspace(3)* %in
  %ext = zext i8 %a to i32
  store i32 %ext, i32 addrspace(3)* %out
  ret void
}

; Loads an i8 from addrspace(3), sign-extends it to i32, and stores the result
; to addrspace(3).
; FUNC-LABEL: {{^}}local_sextload_i8_to_i32:
; GCN-NOT: s_wqm_b64
; GFX9-NOT: m0
; SICIVI: s_mov_b32 m0
; GCN: ds_read_i8

; EG: LDS_UBYTE_READ_RET
; EG: BFE_INT
define amdgpu_kernel void @local_sextload_i8_to_i32(i32 addrspace(3)* %out, i8 addrspace(3)* %in) #0 {
  %ld = load i8, i8 addrspace(3)* %in
  %ext = sext i8 %ld to i32
  store i32 %ext, i32 addrspace(3)* %out
  ret void
}

; Loads a <1 x i8> from addrspace(3), zero-extends it to <1 x i32>, and stores
; the result to addrspace(3).
; FUNC-LABEL: {{^}}local_zextload_v1i8_to_v1i32:
; GFX9-NOT: m0

; EG: LDS_UBYTE_READ_RET
define amdgpu_kernel void @local_zextload_v1i8_to_v1i32(<1 x i32> addrspace(3)* %out, <1 x i8> addrspace(3)* %in) #0 {
  %load = load <1 x i8>, <1 x i8> addrspace(3)* %in
  %ext = zext <1 x i8> %load to <1 x i32>
  store <1 x i32> %ext, <1 x i32> addrspace(3)* %out
  ret void
}

; Loads a <1 x i8> from addrspace(3), sign-extends it to <1 x i32>, and stores
; the result to addrspace(3).
; FUNC-LABEL: {{^}}local_sextload_v1i8_to_v1i32:
; GFX9-NOT: m0

; EG: LDS_UBYTE_READ_RET
; EG: BFE_INT
define amdgpu_kernel void @local_sextload_v1i8_to_v1i32(<1 x i32> addrspace(3)* %out, <1 x i8> addrspace(3)* %in) #0 {
  %load = load <1 x i8>, <1 x i8> addrspace(3)* %in
  %ext = sext <1 x i8> %load to <1 x i32>
  store <1 x i32> %ext, <1 x i32> addrspace(3)* %out
  ret void
}

; Loads a <2 x i8> from addrspace(3), zero-extends each lane to i32, and stores
; the <2 x i32> result to addrspace(3).
; FUNC-LABEL: {{^}}local_zextload_v2i8_to_v2i32:
; GFX9-NOT: m0
; GCN: ds_read_u16

; EG: LDS_USHORT_READ_RET
define amdgpu_kernel void @local_zextload_v2i8_to_v2i32(<2 x i32> addrspace(3)* %out, <2 x i8> addrspace(3)* %in) #0 {
  %load = load <2 x i8>, <2 x i8> addrspace(3)* %in
  %ext = zext <2 x i8> %load to <2 x i32>
  store <2 x i32> %ext, <2 x i32> addrspace(3)* %out
  ret void
}

; Loads a <2 x i8> from addrspace(3), sign-extends each lane to i32, and stores
; the <2 x i32> result to addrspace(3).
; FUNC-LABEL: {{^}}local_sextload_v2i8_to_v2i32:
; GCN-NOT: s_wqm_b64
; GFX9-NOT: m0
; SICIVI: s_mov_b32 m0
; GCN: ds_read_u16
; FIXME: Need to optimize this sequence to avoid extra shift on VI.
;         t23: i16 = srl t39, Constant:i32<8>
;          t31: i32 = any_extend t23
;        t33: i32 = sign_extend_inreg t31, ValueType:ch:i8

; SI-DAG: v_bfe_i32 v{{[0-9]+}}, v{{[0-9]+}}, 8, 8
; SI-DAG: v_bfe_i32 v{{[0-9]+}}, v{{[0-9]+}}, 0, 8

; VI-DAG: v_lshrrev_b16_e32 [[SHIFT:v[0-9]+]], 8, v{{[0-9]+}}
; VI-DAG: v_bfe_i32 v{{[0-9]+}}, v{{[0-9]+}}, 0, 8
; VI-DAG: v_bfe_i32 v{{[0-9]+}}, [[SHIFT]], 0, 8

; EG: LDS_USHORT_READ_RET
; EG-DAG: BFE_INT
; EG-DAG: BFE_INT
define amdgpu_kernel void @local_sextload_v2i8_to_v2i32(<2 x i32> addrspace(3)* %out, <2 x i8> addrspace(3)* %in) #0 {
  %load = load <2 x i8>, <2 x i8> addrspace(3)* %in
  %ext = sext <2 x i8> %load to <2 x i32>
  store <2 x i32> %ext, <2 x i32> addrspace(3)* %out
  ret void
}

; Loads a <3 x i8> from addrspace(3), zero-extends each lane to i32, and stores
; the <3 x i32> result to addrspace(3).
; FUNC-LABEL: {{^}}local_zextload_v3i8_to_v3i32:
; GFX9-NOT: m0
; GCN: ds_read_b32

; SI-DAG: v_bfe_u32 v{{[0-9]+}}, v{{[0-9]+}}, 8, 8
; VI-DAG: v_lshrrev_b16_e32 v{{[0-9]+}}, 8, {{v[0-9]+}}
; GCN-DAG: v_bfe_u32 v{{[0-9]+}}, v{{[0-9]+}}, 16, 8
; GCN-DAG: v_and_b32_e32 v{{[0-9]+}}, 0xff,

; EG: LDS_READ_RET
define amdgpu_kernel void @local_zextload_v3i8_to_v3i32(<3 x i32> addrspace(3)* %out, <3 x i8> addrspace(3)* %in) #0 {
entry:
  %ld = load <3 x i8>, <3 x i8> addrspace(3)* %in
  %ext = zext <3 x i8> %ld to <3 x i32>
  store <3 x i32> %ext, <3 x i32> addrspace(3)* %out
  ret void
}

; Loads a <3 x i8> from addrspace(3), sign-extends each lane to i32, and stores
; the <3 x i32> result to addrspace(3).
; FUNC-LABEL: {{^}}local_sextload_v3i8_to_v3i32:
; GCN-NOT: s_wqm_b64
; GFX9-NOT: m0
; SICIVI: s_mov_b32 m0
; GCN: ds_read_b32

; GCN-DAG: v_bfe_i32
; GCN-DAG: v_bfe_i32
; GCN-DAG: v_bfe_i32
; GCN-DAG: v_bfe_i32

; GCN-DAG: ds_write_b64
; GCN-DAG: ds_write_b32

; EG: LDS_READ_RET
; EG-DAG: BFE_INT
; EG-DAG: BFE_INT
; EG-DAG: BFE_INT
define amdgpu_kernel void @local_sextload_v3i8_to_v3i32(<3 x i32> addrspace(3)* %out, <3 x i8> addrspace(3)* %in) #0 {
entry:
  %ld = load <3 x i8>, <3 x i8> addrspace(3)* %in
  %ext = sext <3 x i8> %ld to <3 x i32>
  store <3 x i32> %ext, <3 x i32> addrspace(3)* %out
  ret void
}

; Loads a <4 x i8> from addrspace(3), zero-extends each lane to i32, and stores
; the <4 x i32> result to addrspace(3).
; FUNC-LABEL: {{^}}local_zextload_v4i8_to_v4i32:
; GCN-NOT: s_wqm_b64
; GFX9-NOT: m0
; SICIVI: s_mov_b32 m0
; GCN: ds_read_b32

; EG: LDS_READ_RET
; EG-DAG: BFE_UINT
; EG-DAG: BFE_UINT
; EG-DAG: BFE_UINT
define amdgpu_kernel void @local_zextload_v4i8_to_v4i32(<4 x i32> addrspace(3)* %out, <4 x i8> addrspace(3)* %in) #0 {
  %load = load <4 x i8>, <4 x i8> addrspace(3)* %in
  %ext = zext <4 x i8> %load to <4 x i32>
  store <4 x i32> %ext, <4 x i32> addrspace(3)* %out
  ret void
}

; Loads a <4 x i8> from addrspace(3), sign-extends each lane to i32, and stores
; the <4 x i32> result to addrspace(3).
; FUNC-LABEL: {{^}}local_sextload_v4i8_to_v4i32:
; GCN-NOT: s_wqm_b64
; GFX9-NOT: m0
; SICIVI: s_mov_b32 m0
; GCN: ds_read_b32

; EG-DAG: LDS_READ_RET
; EG-DAG: BFE_INT
; EG-DAG: BFE_INT
; EG-DAG: BFE_INT
; EG-DAG: BFE_INT
define amdgpu_kernel void @local_sextload_v4i8_to_v4i32(<4 x i32> addrspace(3)* %out, <4 x i8> addrspace(3)* %in) #0 {
  %load = load <4 x i8>, <4 x i8> addrspace(3)* %in
  %ext = sext <4 x i8> %load to <4 x i32>
  store <4 x i32> %ext, <4 x i32> addrspace(3)* %out
  ret void
}

; Loads a <8 x i8> from addrspace(3), zero-extends each lane to i32, and stores
; the <8 x i32> result to addrspace(3).
; FUNC-LABEL: {{^}}local_zextload_v8i8_to_v8i32:
; SICIVI: s_mov_b32 m0
; GFX9-NOT: m0

; EG-DAG: LDS_READ_RET
; EG-DAG: LDS_READ_RET
; EG-DAG: BFE_UINT
; EG-DAG: BFE_UINT
; EG-DAG: BFE_UINT
; EG-DAG: BFE_UINT
; EG-DAG: BFE_UINT
; EG-DAG: BFE_UINT
define amdgpu_kernel void @local_zextload_v8i8_to_v8i32(<8 x i32> addrspace(3)* %out, <8 x i8> addrspace(3)* %in) #0 {
  %load = load <8 x i8>, <8 x i8> addrspace(3)* %in
  %ext = zext <8 x i8> %load to <8 x i32>
  store <8 x i32> %ext, <8 x i32> addrspace(3)* %out
  ret void
}

; Loads a <8 x i8> from addrspace(3), sign-extends each lane to i32, and stores
; the <8 x i32> result to addrspace(3).
; FUNC-LABEL: {{^}}local_sextload_v8i8_to_v8i32:
; SICIVI: s_mov_b32 m0
; GFX9-NOT: m0

; EG-DAG: LDS_READ_RET
; EG-DAG: LDS_READ_RET
; EG-DAG: BFE_INT
; EG-DAG: BFE_INT
; EG-DAG: BFE_INT
; EG-DAG: BFE_INT
; EG-DAG: BFE_INT
; EG-DAG: BFE_INT
; EG-DAG: BFE_INT
; EG-DAG: BFE_INT
define amdgpu_kernel void @local_sextload_v8i8_to_v8i32(<8 x i32> addrspace(3)* %out, <8 x i8> addrspace(3)* %in) #0 {
  %load = load <8 x i8>, <8 x i8> addrspace(3)* %in
  %ext = sext <8 x i8> %load to <8 x i32>
  store <8 x i32> %ext, <8 x i32> addrspace(3)* %out
  ret void
}

; Loads a <16 x i8> from addrspace(3), zero-extends each lane to i32, and stores
; the <16 x i32> result to addrspace(3).
; FUNC-LABEL: {{^}}local_zextload_v16i8_to_v16i32:
; SICIVI: s_mov_b32 m0
; GFX9-NOT: m0

; EG-DAG: LDS_READ_RET
; EG-DAG: LDS_READ_RET
; EG-DAG: LDS_READ_RET
; EG-DAG: LDS_READ_RET
; EG-DAG: BFE_UINT
; EG-DAG: BFE_UINT
; EG-DAG: BFE_UINT
; EG-DAG: BFE_UINT
; EG-DAG: BFE_UINT
; EG-DAG: BFE_UINT
; EG-DAG: BFE_UINT
; EG-DAG: BFE_UINT
; EG-DAG: BFE_UINT
; EG-DAG: BFE_UINT
; EG-DAG: BFE_UINT
; EG-DAG: BFE_UINT
define amdgpu_kernel void @local_zextload_v16i8_to_v16i32(<16 x i32> addrspace(3)* %out, <16 x i8> addrspace(3)* %in) #0 {
  %load = load <16 x i8>, <16 x i8> addrspace(3)* %in
  %ext = zext <16 x i8> %load to <16 x i32>
  store <16 x i32> %ext, <16 x i32> addrspace(3)* %out
  ret void
}

; Loads a <16 x i8> from addrspace(3), sign-extends each lane to i32, and stores
; the <16 x i32> result to addrspace(3).
; FUNC-LABEL: {{^}}local_sextload_v16i8_to_v16i32:
; SICIVI: s_mov_b32 m0
; GFX9-NOT: m0

; EG-DAG: LDS_READ_RET
; EG-DAG: LDS_READ_RET
; EG-DAG: LDS_READ_RET
; EG-DAG: LDS_READ_RET
; EG-DAG: BFE_INT
; EG-DAG: BFE_INT
; EG-DAG: BFE_INT
; EG-DAG: BFE_INT
; EG-DAG: BFE_INT
; EG-DAG: BFE_INT
; EG-DAG: BFE_INT
; EG-DAG: BFE_INT
; EG-DAG: BFE_INT
; EG-DAG: BFE_INT
; EG-DAG: BFE_INT
; EG-DAG: BFE_INT
; EG-DAG: BFE_INT
; EG-DAG: BFE_INT
; EG-DAG: BFE_INT
; EG-DAG: BFE_INT
define amdgpu_kernel void @local_sextload_v16i8_to_v16i32(<16 x i32> addrspace(3)* %out, <16 x i8> addrspace(3)* %in) #0 {
  %load = load <16 x i8>, <16 x i8> addrspace(3)* %in
  %ext = sext <16 x i8> %load to <16 x i32>
  store <16 x i32> %ext, <16 x i32> addrspace(3)* %out
  ret void
}

; Loads a <32 x i8> from addrspace(3), zero-extends each lane to i32, and stores
; the <32 x i32> result to addrspace(3).
; FUNC-LABEL: {{^}}local_zextload_v32i8_to_v32i32:
; SICIVI: s_mov_b32 m0
; GFX9-NOT: m0

; EG-DAG: LDS_READ_RET
; EG-DAG: LDS_READ_RET
; EG-DAG: LDS_READ_RET
; EG-DAG: LDS_READ_RET
; EG-DAG: LDS_READ_RET
; EG-DAG: LDS_READ_RET
; EG-DAG: LDS_READ_RET
; EG-DAG: LDS_READ_RET
define amdgpu_kernel void @local_zextload_v32i8_to_v32i32(<32 x i32> addrspace(3)* %out, <32 x i8> addrspace(3)* %in) #0 {
  %load = load <32 x i8>, <32 x i8> addrspace(3)* %in
  %ext = zext <32 x i8> %load to <32 x i32>
  store <32 x i32> %ext, <32 x i32> addrspace(3)* %out
  ret void
}

; Loads a <32 x i8> from addrspace(3), sign-extends each lane to i32, and stores
; the <32 x i32> result to addrspace(3).
; FUNC-LABEL: {{^}}local_sextload_v32i8_to_v32i32:
; SICIVI: s_mov_b32 m0
; GFX9-NOT: m0

; EG-DAG: LDS_READ_RET
; EG-DAG: LDS_READ_RET
; EG-DAG: LDS_READ_RET
; EG-DAG: LDS_READ_RET
; EG-DAG: LDS_READ_RET
; EG-DAG: LDS_READ_RET
; EG-DAG: LDS_READ_RET
; EG-DAG: LDS_READ_RET
define amdgpu_kernel void @local_sextload_v32i8_to_v32i32(<32 x i32> addrspace(3)* %out, <32 x i8> addrspace(3)* %in) #0 {
  %load = load <32 x i8>, <32 x i8> addrspace(3)* %in
  %ext = sext <32 x i8> %load to <32 x i32>
  store <32 x i32> %ext, <32 x i32> addrspace(3)* %out
  ret void
}

; Loads a <64 x i8> from addrspace(3), zero-extends each lane to i32, and stores
; the <64 x i32> result to addrspace(3).
; FUNC-LABEL: {{^}}local_zextload_v64i8_to_v64i32:
; SICIVI: s_mov_b32 m0
; GFX9-NOT: m0

; EG-DAG: LDS_READ_RET
; EG-DAG: LDS_READ_RET
; EG-DAG: LDS_READ_RET
; EG-DAG: LDS_READ_RET
; EG-DAG: LDS_READ_RET
; EG-DAG: LDS_READ_RET
; EG-DAG: LDS_READ_RET
; EG-DAG: LDS_READ_RET
; EG-DAG: LDS_READ_RET
; EG-DAG: LDS_READ_RET
; EG-DAG: LDS_READ_RET
; EG-DAG: LDS_READ_RET
; EG-DAG: LDS_READ_RET
; EG-DAG: LDS_READ_RET
; EG-DAG: LDS_READ_RET
; EG-DAG: LDS_READ_RET
define amdgpu_kernel void @local_zextload_v64i8_to_v64i32(<64 x i32> addrspace(3)* %out, <64 x i8> addrspace(3)* %in) #0 {
  %load = load <64 x i8>, <64 x i8> addrspace(3)* %in
  %ext = zext <64 x i8> %load to <64 x i32>
  store <64 x i32> %ext, <64 x i32> addrspace(3)* %out
  ret void
}

; Loads a <64 x i8> from addrspace(3), sign-extends each lane to i32, and stores
; the <64 x i32> result to addrspace(3).
; FUNC-LABEL: {{^}}local_sextload_v64i8_to_v64i32:
; SICIVI: s_mov_b32 m0
; GFX9-NOT: m0

; EG-DAG: LDS_READ_RET
; EG-DAG: LDS_READ_RET
; EG-DAG: LDS_READ_RET
; EG-DAG: LDS_READ_RET
; EG-DAG: LDS_READ_RET
; EG-DAG: LDS_READ_RET
; EG-DAG: LDS_READ_RET
; EG-DAG: LDS_READ_RET
; EG-DAG: LDS_READ_RET
; EG-DAG: LDS_READ_RET
; EG-DAG: LDS_READ_RET
; EG-DAG: LDS_READ_RET
; EG-DAG: LDS_READ_RET
; EG-DAG: LDS_READ_RET
; EG-DAG: LDS_READ_RET
; EG-DAG: LDS_READ_RET
define amdgpu_kernel void @local_sextload_v64i8_to_v64i32(<64 x i32> addrspace(3)* %out, <64 x i8> addrspace(3)* %in) #0 {
  %load = load <64 x i8>, <64 x i8> addrspace(3)* %in
  %ext = sext <64 x i8> %load to <64 x i32>
  store <64 x i32> %ext, <64 x i32> addrspace(3)* %out
  ret void
}

; Loads an i8 from addrspace(3), zero-extends it to i64, and stores the result
; to addrspace(3). The LO/HI captures require the 64-bit store to pair the
; loaded byte with a register that was set to 0.
; FUNC-LABEL: {{^}}local_zextload_i8_to_i64:
; SICIVI: s_mov_b32 m0
; GFX9-NOT: m0

; GCN-DAG: v_mov_b32_e32 v[[HI:[0-9]+]], 0{{$}}
; GCN-DAG: ds_read_u8 v[[LO:[0-9]+]],
; GCN: ds_write_b64 v{{[0-9]+}}, v{{\[}}[[LO]]:[[HI]]{{\]}}

; EG: LDS_UBYTE_READ_RET
; EG: MOV {{.*}}, literal
; EG: 0.0
define amdgpu_kernel void @local_zextload_i8_to_i64(i64 addrspace(3)* %out, i8 addrspace(3)* %in) #0 {
  %a = load i8, i8 addrspace(3)* %in
  %ext = zext i8 %a to i64
  store i64 %ext, i64 addrspace(3)* %out
  ret void
}

; Loads an i8 from addrspace(3), sign-extends it to i64, and stores the result
; to addrspace(3). The LO/HI captures require the high half to be derived from
; the loaded low half by an arithmetic shift of 31.
; FUNC-LABEL: {{^}}local_sextload_i8_to_i64:
; SICIVI: s_mov_b32 m0
; GFX9-NOT: m0

; GCN: ds_read_i8 v[[LO:[0-9]+]],
; GCN: v_ashrrev_i32_e32 v[[HI:[0-9]+]], 31, v[[LO]]

; GCN: ds_write_b64 v{{[0-9]+}}, v{{\[}}[[LO]]:[[HI]]{{\]}}

; EG: LDS_UBYTE_READ_RET
; EG: ASHR
; TODO: why not 7?
; EG: 31
define amdgpu_kernel void @local_sextload_i8_to_i64(i64 addrspace(3)* %out, i8 addrspace(3)* %in) #0 {
  %a = load i8, i8 addrspace(3)* %in
  %ext = sext i8 %a to i64
  store i64 %ext, i64 addrspace(3)* %out
  ret void
}

; Loads a <1 x i8> from addrspace(3), zero-extends it to <1 x i64>, and stores
; the result to addrspace(3).
; FUNC-LABEL: {{^}}local_zextload_v1i8_to_v1i64:
; SICIVI: s_mov_b32 m0
; GFX9-NOT: m0

; EG: LDS_UBYTE_READ_RET
; EG: MOV {{.*}}, literal
; TODO: merge?
; EG: 0.0
define amdgpu_kernel void @local_zextload_v1i8_to_v1i64(<1 x i64> addrspace(3)* %out, <1 x i8> addrspace(3)* %in) #0 {
  %load = load <1 x i8>, <1 x i8> addrspace(3)* %in
  %ext = zext <1 x i8> %load to <1 x i64>
  store <1 x i64> %ext, <1 x i64> addrspace(3)* %out
  ret void
}

; Loads a <1 x i8> from addrspace(3), sign-extends it to <1 x i64>, and stores
; the result to addrspace(3).
; FUNC-LABEL: {{^}}local_sextload_v1i8_to_v1i64:
; SICIVI: s_mov_b32 m0
; GFX9-NOT: m0

; EG: LDS_UBYTE_READ_RET
; EG: ASHR
; TODO: why not 7?
; EG: 31
define amdgpu_kernel void @local_sextload_v1i8_to_v1i64(<1 x i64> addrspace(3)* %out, <1 x i8> addrspace(3)* %in) #0 {
  %load = load <1 x i8>, <1 x i8> addrspace(3)* %in
  %ext = sext <1 x i8> %load to <1 x i64>
  store <1 x i64> %ext, <1 x i64> addrspace(3)* %out
  ret void
}

; Loads a <2 x i8> from addrspace(3), zero-extends each lane to i64, and stores
; the <2 x i64> result to addrspace(3).
; FUNC-LABEL: {{^}}local_zextload_v2i8_to_v2i64:
; SICIVI: s_mov_b32 m0
; GFX9-NOT: m0

; EG: LDS_USHORT_READ_RET
define amdgpu_kernel void @local_zextload_v2i8_to_v2i64(<2 x i64> addrspace(3)* %out, <2 x i8> addrspace(3)* %in) #0 {
  %load = load <2 x i8>, <2 x i8> addrspace(3)* %in
  %ext = zext <2 x i8> %load to <2 x i64>
  store <2 x i64> %ext, <2 x i64> addrspace(3)* %out
  ret void
}

; Loads a <2 x i8> from addrspace(3), sign-extends each lane to i64, and stores
; the <2 x i64> result to addrspace(3).
; FUNC-LABEL: {{^}}local_sextload_v2i8_to_v2i64:
; SICIVI: s_mov_b32 m0
; GFX9-NOT: m0

; EG: LDS_USHORT_READ_RET
; EG: BFE_INT
; EG: BFE_INT
define amdgpu_kernel void @local_sextload_v2i8_to_v2i64(<2 x i64> addrspace(3)* %out, <2 x i8> addrspace(3)* %in) #0 {
  %load = load <2 x i8>, <2 x i8> addrspace(3)* %in
  %ext = sext <2 x i8> %load to <2 x i64>
  store <2 x i64> %ext, <2 x i64> addrspace(3)* %out
  ret void
}

; Loads a <4 x i8> from addrspace(3), zero-extends each lane to i64, and stores
; the <4 x i64> result to addrspace(3).
; FUNC-LABEL: {{^}}local_zextload_v4i8_to_v4i64:
; SICIVI: s_mov_b32 m0
; GFX9-NOT: m0

; EG: LDS_READ_RET
define amdgpu_kernel void @local_zextload_v4i8_to_v4i64(<4 x i64> addrspace(3)* %out, <4 x i8> addrspace(3)* %in) #0 {
  %load = load <4 x i8>, <4 x i8> addrspace(3)* %in
  %ext = zext <4 x i8> %load to <4 x i64>
  store <4 x i64> %ext, <4 x i64> addrspace(3)* %out
  ret void
}

; Loads a <4 x i8> from addrspace(3), sign-extends each lane to i64, and stores
; the <4 x i64> result to addrspace(3).
; FUNC-LABEL: {{^}}local_sextload_v4i8_to_v4i64:
; SICIVI: s_mov_b32 m0
; GFX9-NOT: m0

; EG: LDS_READ_RET
define amdgpu_kernel void @local_sextload_v4i8_to_v4i64(<4 x i64> addrspace(3)* %out, <4 x i8> addrspace(3)* %in) #0 {
  %load = load <4 x i8>, <4 x i8> addrspace(3)* %in
  %ext = sext <4 x i8> %load to <4 x i64>
  store <4 x i64> %ext, <4 x i64> addrspace(3)* %out
  ret void
}

; Loads a <8 x i8> from addrspace(3), zero-extends each lane to i64, and stores
; the <8 x i64> result to addrspace(3).
; FUNC-LABEL: {{^}}local_zextload_v8i8_to_v8i64:
; SICIVI: s_mov_b32 m0
; GFX9-NOT: m0

; EG: LDS_READ_RET
; EG: LDS_READ_RET
define amdgpu_kernel void @local_zextload_v8i8_to_v8i64(<8 x i64> addrspace(3)* %out, <8 x i8> addrspace(3)* %in) #0 {
  %load = load <8 x i8>, <8 x i8> addrspace(3)* %in
  %ext = zext <8 x i8> %load to <8 x i64>
  store <8 x i64> %ext, <8 x i64> addrspace(3)* %out
  ret void
}

; Loads a <8 x i8> from addrspace(3), sign-extends each lane to i64, and stores
; the <8 x i64> result to addrspace(3).
; FUNC-LABEL: {{^}}local_sextload_v8i8_to_v8i64:
; SICIVI: s_mov_b32 m0
; GFX9-NOT: m0

; EG: LDS_READ_RET
; EG: LDS_READ_RET
; EG-DAG: ASHR
; EG-DAG: ASHR
; EG-DAG: BFE_INT
; EG-DAG: BFE_INT
; EG-DAG: BFE_INT
; EG-DAG: BFE_INT
; EG-DAG: BFE_INT
; EG-DAG: BFE_INT
; EG-DAG: BFE_INT
define amdgpu_kernel void @local_sextload_v8i8_to_v8i64(<8 x i64> addrspace(3)* %out, <8 x i8> addrspace(3)* %in) #0 {
  %load = load <8 x i8>, <8 x i8> addrspace(3)* %in
  %ext = sext <8 x i8> %load to <8 x i64>
  store <8 x i64> %ext, <8 x i64> addrspace(3)* %out
  ret void
}

; Loads a <16 x i8> from addrspace(3), zero-extends each lane to i64, and stores
; the <16 x i64> result to addrspace(3).
; FUNC-LABEL: {{^}}local_zextload_v16i8_to_v16i64:
; SICIVI: s_mov_b32 m0
; GFX9-NOT: m0

; EG: LDS_READ_RET
; EG: LDS_READ_RET
; EG: LDS_READ_RET
; EG: LDS_READ_RET
define amdgpu_kernel void @local_zextload_v16i8_to_v16i64(<16 x i64> addrspace(3)* %out, <16 x i8> addrspace(3)* %in) #0 {
  %load = load <16 x i8>, <16 x i8> addrspace(3)* %in
  %ext = zext <16 x i8> %load to <16 x i64>
  store <16 x i64> %ext, <16 x i64> addrspace(3)* %out
  ret void
}

; Loads a <16 x i8> from addrspace(3), sign-extends each lane to i64, and stores
; the <16 x i64> result to addrspace(3).
; FUNC-LABEL: {{^}}local_sextload_v16i8_to_v16i64:
; SICIVI: s_mov_b32 m0
; GFX9-NOT: m0

; EG: LDS_READ_RET
; EG: LDS_READ_RET
; EG: LDS_READ_RET
; EG: LDS_READ_RET
define amdgpu_kernel void @local_sextload_v16i8_to_v16i64(<16 x i64> addrspace(3)* %out, <16 x i8> addrspace(3)* %in) #0 {
  %load = load <16 x i8>, <16 x i8> addrspace(3)* %in
  %ext = sext <16 x i8> %load to <16 x i64>
  store <16 x i64> %ext, <16 x i64> addrspace(3)* %out
  ret void
}

; Loads a <32 x i8> from addrspace(3), zero-extends each lane to i64, and stores
; the <32 x i64> result to addrspace(3).
; FUNC-LABEL: {{^}}local_zextload_v32i8_to_v32i64:
; SICIVI: s_mov_b32 m0
; GFX9-NOT: m0

; EG: LDS_READ_RET
; EG: LDS_READ_RET
; EG: LDS_READ_RET
; EG: LDS_READ_RET
; EG: LDS_READ_RET
; EG: LDS_READ_RET
; EG: LDS_READ_RET
; EG: LDS_READ_RET
define amdgpu_kernel void @local_zextload_v32i8_to_v32i64(<32 x i64> addrspace(3)* %out, <32 x i8> addrspace(3)* %in) #0 {
  %load = load <32 x i8>, <32 x i8> addrspace(3)* %in
  %ext = zext <32 x i8> %load to <32 x i64>
  store <32 x i64> %ext, <32 x i64> addrspace(3)* %out
  ret void
}

; Loads a <32 x i8> from addrspace(3), sign-extends each lane to i64, and stores
; the <32 x i64> result to addrspace(3).
; FUNC-LABEL: {{^}}local_sextload_v32i8_to_v32i64:
; SICIVI: s_mov_b32 m0
; GFX9-NOT: m0

; EG: LDS_READ_RET
; EG: LDS_READ_RET
; EG: LDS_READ_RET
; EG: LDS_READ_RET
; EG: LDS_READ_RET
; EG: LDS_READ_RET
; EG: LDS_READ_RET
; EG: LDS_READ_RET
define amdgpu_kernel void @local_sextload_v32i8_to_v32i64(<32 x i64> addrspace(3)* %out, <32 x i8> addrspace(3)* %in) #0 {
  %load = load <32 x i8>, <32 x i8> addrspace(3)* %in
  %ext = sext <32 x i8> %load to <32 x i64>
  store <32 x i64> %ext, <32 x i64> addrspace(3)* %out
  ret void
}

; XFUNC-LABEL: {{^}}local_zextload_v64i8_to_v64i64:
; define amdgpu_kernel void @local_zextload_v64i8_to_v64i64(<64 x i64> addrspace(3)* %out, <64 x i8> addrspace(3)* %in) #0 {
;   %load = load <64 x i8>, <64 x i8> addrspace(3)* %in
;   %ext = zext <64 x i8> %load to <64 x i64>
;   store <64 x i64> %ext, <64 x i64> addrspace(3)* %out
;   ret void
; }

; XFUNC-LABEL: {{^}}local_sextload_v64i8_to_v64i64:
; define amdgpu_kernel void @local_sextload_v64i8_to_v64i64(<64 x i64> addrspace(3)* %out, <64 x i8> addrspace(3)* %in) #0 {
;   %load = load <64 x i8>, <64 x i8> addrspace(3)* %in
;   %ext = sext <64 x i8> %load to <64 x i64>
;   store <64 x i64> %ext, <64 x i64> addrspace(3)* %out
;   ret void
; }

; Loads an i8 from addrspace(3), zero-extends it to i16, and stores the result
; to addrspace(3).
; FUNC-LABEL: {{^}}local_zextload_i8_to_i16:
; SICIVI: s_mov_b32 m0
; GFX9-NOT: m0
; VAL is captured once on the load and reused as the data operand of the store,
; so the check fails if the store writes any register other than the loaded one.
; GCN: ds_read_u8 v[[VAL:[0-9]+]],
; GCN: ds_write_b16 v{{[0-9]+}}, v[[VAL]]

; EG: LDS_UBYTE_READ_RET
; EG: LDS_SHORT_WRITE
define amdgpu_kernel void @local_zextload_i8_to_i16(i16 addrspace(3)* %out, i8 addrspace(3)* %in) #0 {
  %a = load i8, i8 addrspace(3)* %in
  %ext = zext i8 %a to i16
  store i16 %ext, i16 addrspace(3)* %out
  ret void
}

; Loads an i8 from addrspace(3), sign-extends it to i16, and stores the result
; to addrspace(3). VAL ties the stored register to the one the load defined.
; FUNC-LABEL: {{^}}local_sextload_i8_to_i16:
; SICIVI: s_mov_b32 m0
; GFX9-NOT: m0
; GCN: ds_read_i8 v[[VAL:[0-9]+]],
; GCN: ds_write_b16 v{{[0-9]+}}, v[[VAL]]

; EG: LDS_UBYTE_READ_RET
; EG: BFE_INT
; EG: LDS_SHORT_WRITE
define amdgpu_kernel void @local_sextload_i8_to_i16(i16 addrspace(3)* %out, i8 addrspace(3)* %in) #0 {
  %a = load i8, i8 addrspace(3)* %in
  %ext = sext i8 %a to i16
  store i16 %ext, i16 addrspace(3)* %out
  ret void
}

; Loads a <1 x i8> from addrspace(3), zero-extends it to <1 x i16>, and stores
; the result to addrspace(3).
; FUNC-LABEL: {{^}}local_zextload_v1i8_to_v1i16:
; SICIVI: s_mov_b32 m0
; GFX9-NOT: m0

; EG: LDS_UBYTE_READ_RET
; EG: LDS_SHORT_WRITE
define amdgpu_kernel void @local_zextload_v1i8_to_v1i16(<1 x i16> addrspace(3)* %out, <1 x i8> addrspace(3)* %in) #0 {
  %load = load <1 x i8>, <1 x i8> addrspace(3)* %in
  %ext = zext <1 x i8> %load to <1 x i16>
  store <1 x i16> %ext, <1 x i16> addrspace(3)* %out
  ret void
}

; Loads a <1 x i8> from addrspace(3), sign-extends it to <1 x i16>, and stores
; the result to addrspace(3).
; FUNC-LABEL: {{^}}local_sextload_v1i8_to_v1i16:
; SICIVI: s_mov_b32 m0
; GFX9-NOT: m0

; EG: LDS_UBYTE_READ_RET
; EG: BFE_INT
; EG: LDS_SHORT_WRITE
define amdgpu_kernel void @local_sextload_v1i8_to_v1i16(<1 x i16> addrspace(3)* %out, <1 x i8> addrspace(3)* %in) #0 {
  %load = load <1 x i8>, <1 x i8> addrspace(3)* %in
  %ext = sext <1 x i8> %load to <1 x i16>
  store <1 x i16> %ext, <1 x i16> addrspace(3)* %out
  ret void
}

; Loads a <2 x i8> from addrspace(3), zero-extends each lane to i16, and stores
; the <2 x i16> result to addrspace(3).
; FUNC-LABEL: {{^}}local_zextload_v2i8_to_v2i16:
; SICIVI: s_mov_b32 m0
; GFX9-NOT: m0

; EG: LDS_USHORT_READ_RET
; EG: LDS_WRITE
define amdgpu_kernel void @local_zextload_v2i8_to_v2i16(<2 x i16> addrspace(3)* %out, <2 x i8> addrspace(3)* %in) #0 {
  %load = load <2 x i8>, <2 x i8> addrspace(3)* %in
  %ext = zext <2 x i8> %load to <2 x i16>
  store <2 x i16> %ext, <2 x i16> addrspace(3)* %out
  ret void
}

; Loads a <2 x i8> from addrspace(3), sign-extends each lane to i16, and stores
; the <2 x i16> result to addrspace(3).
; FUNC-LABEL: {{^}}local_sextload_v2i8_to_v2i16:
; SICIVI: s_mov_b32 m0
; GFX9-NOT: m0

; EG: LDS_USHORT_READ_RET
; EG: BFE_INT
; EG: BFE_INT
; EG: LDS_WRITE
define amdgpu_kernel void @local_sextload_v2i8_to_v2i16(<2 x i16> addrspace(3)* %out, <2 x i8> addrspace(3)* %in) #0 {
  %load = load <2 x i8>, <2 x i8> addrspace(3)* %in
  %ext = sext <2 x i8> %load to <2 x i16>
  store <2 x i16> %ext, <2 x i16> addrspace(3)* %out
  ret void
}

; Loads a <4 x i8> from addrspace(3), zero-extends each lane to i16, and stores
; the <4 x i16> result to addrspace(3).
; FUNC-LABEL: {{^}}local_zextload_v4i8_to_v4i16:
; SICIVI: s_mov_b32 m0
; GFX9-NOT: m0

; EG: LDS_READ_RET
; EG: LDS_WRITE
; EG: LDS_WRITE
define amdgpu_kernel void @local_zextload_v4i8_to_v4i16(<4 x i16> addrspace(3)* %out, <4 x i8> addrspace(3)* %in) #0 {
  %load = load <4 x i8>, <4 x i8> addrspace(3)* %in
  %ext = zext <4 x i8> %load to <4 x i16>
  store <4 x i16> %ext, <4 x i16> addrspace(3)* %out
  ret void
}

; Loads a <4 x i8> from addrspace(3), sign-extends each lane to i16, and stores
; the <4 x i16> result to addrspace(3).
; FUNC-LABEL: {{^}}local_sextload_v4i8_to_v4i16:
; SICIVI: s_mov_b32 m0
; GFX9-NOT: m0

; EG: LDS_READ_RET
; TODO: these do LSHR + BFE_INT, instead of just BFE_INT/ASHR
; EG-DAG: BFE_INT
; EG-DAG: BFE_INT
; EG-DAG: BFE_INT
; EG-DAG: BFE_INT
; EG: LDS_WRITE
; EG: LDS_WRITE
define amdgpu_kernel void @local_sextload_v4i8_to_v4i16(<4 x i16> addrspace(3)* %out, <4 x i8> addrspace(3)* %in) #0 {
  %load = load <4 x i8>, <4 x i8> addrspace(3)* %in
  %ext = sext <4 x i8> %load to <4 x i16>
  store <4 x i16> %ext, <4 x i16> addrspace(3)* %out
  ret void
}

; Loads a <8 x i8> from addrspace(3), zero-extends each lane to i16, and stores
; the <8 x i16> result to addrspace(3).
; FUNC-LABEL: {{^}}local_zextload_v8i8_to_v8i16:
; SICIVI: s_mov_b32 m0
; GFX9-NOT: m0

; EG: LDS_READ_RET
; EG: LDS_READ_RET
; EG: LDS_WRITE
; EG: LDS_WRITE
; EG: LDS_WRITE
; EG: LDS_WRITE
define amdgpu_kernel void @local_zextload_v8i8_to_v8i16(<8 x i16> addrspace(3)* %out, <8 x i8> addrspace(3)* %in) #0 {
  %load = load <8 x i8>, <8 x i8> addrspace(3)* %in
  %ext = zext <8 x i8> %load to <8 x i16>
  store <8 x i16> %ext, <8 x i16> addrspace(3)* %out
  ret void
}

; Loads a <8 x i8> from addrspace(3), sign-extends each lane to i16, and stores
; the <8 x i16> result to addrspace(3).
; FUNC-LABEL: {{^}}local_sextload_v8i8_to_v8i16:
; SICIVI: s_mov_b32 m0
; GFX9-NOT: m0

; EG: LDS_READ_RET
; EG: LDS_READ_RET
; TODO: these do LSHR + BFE_INT, instead of just BFE_INT/ASHR
; EG-DAG: BFE_INT
; EG-DAG: BFE_INT
; EG-DAG: BFE_INT
; EG-DAG: BFE_INT
; EG-DAG: BFE_INT
; EG-DAG: BFE_INT
; EG-DAG: BFE_INT
; EG-DAG: BFE_INT
; EG: LDS_WRITE
; EG: LDS_WRITE
; EG: LDS_WRITE
; EG: LDS_WRITE
define amdgpu_kernel void @local_sextload_v8i8_to_v8i16(<8 x i16> addrspace(3)* %out, <8 x i8> addrspace(3)* %in) #0 {
  %load = load <8 x i8>, <8 x i8> addrspace(3)* %in
  %ext = sext <8 x i8> %load to <8 x i16>
  store <8 x i16> %ext, <8 x i16> addrspace(3)* %out
  ret void
}

; Loads a <16 x i8> from addrspace(3), zero-extends each lane to i16, and stores
; the <16 x i16> result to addrspace(3).
; FUNC-LABEL: {{^}}local_zextload_v16i8_to_v16i16:
; SICIVI: s_mov_b32 m0
; GFX9-NOT: m0

; EG: LDS_READ_RET
; EG: LDS_READ_RET
; EG: LDS_READ_RET
; EG: LDS_READ_RET
; EG: LDS_WRITE
; EG: LDS_WRITE
; EG: LDS_WRITE
; EG: LDS_WRITE
; EG: LDS_WRITE
; EG: LDS_WRITE
; EG: LDS_WRITE
; EG: LDS_WRITE
define amdgpu_kernel void @local_zextload_v16i8_to_v16i16(<16 x i16> addrspace(3)* %out, <16 x i8> addrspace(3)* %in) #0 {
  %load = load <16 x i8>, <16 x i8> addrspace(3)* %in
  %ext = zext <16 x i8> %load to <16 x i16>
  store <16 x i16> %ext, <16 x i16> addrspace(3)* %out
  ret void
}

; Loads a <16 x i8> from addrspace(3), sign-extends each lane to i16, and stores
; the <16 x i16> result to addrspace(3).
; FUNC-LABEL: {{^}}local_sextload_v16i8_to_v16i16:
; SICIVI: s_mov_b32 m0
; GFX9-NOT: m0

; EG: LDS_READ_RET
; EG: LDS_READ_RET
; EG: LDS_READ_RET
; EG: LDS_READ_RET
; TODO: these do LSHR + BFE_INT, instead of just BFE_INT/ASHR
; EG-DAG: BFE_INT
; EG-DAG: BFE_INT
; EG-DAG: BFE_INT
; EG-DAG: BFE_INT
; EG-DAG: BFE_INT
; EG-DAG: BFE_INT
; EG-DAG: BFE_INT
; EG-DAG: BFE_INT
; EG-DAG: BFE_INT
; EG-DAG: BFE_INT
; EG-DAG: BFE_INT
; EG-DAG: BFE_INT
; EG-DAG: BFE_INT
; EG-DAG: BFE_INT
; EG-DAG: BFE_INT
; EG-DAG: BFE_INT
; EG: LDS_WRITE
; EG: LDS_WRITE
; EG: LDS_WRITE
; EG: LDS_WRITE
; EG: LDS_WRITE
; EG: LDS_WRITE
; EG: LDS_WRITE
; EG: LDS_WRITE
define amdgpu_kernel void @local_sextload_v16i8_to_v16i16(<16 x i16> addrspace(3)* %out, <16 x i8> addrspace(3)* %in) #0 {
  %load = load <16 x i8>, <16 x i8> addrspace(3)* %in
  %ext = sext <16 x i8> %load to <16 x i16>
  store <16 x i16> %ext, <16 x i16> addrspace(3)* %out
  ret void
}

; Loads a <32 x i8> from addrspace(3), zero-extends each lane to i16, and stores
; the <32 x i16> result to addrspace(3).
; FUNC-LABEL: {{^}}local_zextload_v32i8_to_v32i16:
; SICIVI: s_mov_b32 m0
; GFX9-NOT: m0

; EG: LDS_READ_RET
; EG: LDS_READ_RET
; EG: LDS_READ_RET
; EG: LDS_READ_RET
; EG: LDS_READ_RET
; EG: LDS_READ_RET
; EG: LDS_READ_RET
; EG: LDS_READ_RET
; EG: LDS_WRITE
; EG: LDS_WRITE
; EG: LDS_WRITE
; EG: LDS_WRITE
; EG: LDS_WRITE
; EG: LDS_WRITE
; EG: LDS_WRITE
; EG: LDS_WRITE
; EG: LDS_WRITE
; EG: LDS_WRITE
; EG: LDS_WRITE
; EG: LDS_WRITE
; EG: LDS_WRITE
; EG: LDS_WRITE
; EG: LDS_WRITE
; EG: LDS_WRITE
define amdgpu_kernel void @local_zextload_v32i8_to_v32i16(<32 x i16> addrspace(3)* %out, <32 x i8> addrspace(3)* %in) #0 {
  %load = load <32 x i8>, <32 x i8> addrspace(3)* %in
  %ext = zext <32 x i8> %load to <32 x i16>
  store <32 x i16> %ext, <32 x i16> addrspace(3)* %out
  ret void
}

947; FUNC-LABEL: {{^}}local_sextload_v32i8_to_v32i16:
948; SICIVI: s_mov_b32 m0
949; GFX9-NOT: m0
950
951; EG: LDS_READ_RET
952; EG: LDS_READ_RET
953; EG: LDS_READ_RET
954; EG: LDS_READ_RET
955; EG: LDS_READ_RET
956; EG: LDS_READ_RET
957; EG: LDS_READ_RET
958; EG: LDS_READ_RET
959; TODO: these do LSHR + BFE_INT, instead of just BFE_INT/ASHR
960; EG-DAG: BFE_INT
961; EG-DAG: BFE_INT
962; EG-DAG: BFE_INT
963; EG-DAG: BFE_INT
964; EG-DAG: BFE_INT
965; EG-DAG: BFE_INT
966; EG-DAG: BFE_INT
967; EG-DAG: BFE_INT
968; EG-DAG: BFE_INT
969; EG-DAG: BFE_INT
970; EG-DAG: BFE_INT
971; EG-DAG: BFE_INT
972; EG-DAG: BFE_INT
973; EG-DAG: BFE_INT
974; EG-DAG: BFE_INT
975; EG-DAG: BFE_INT
976; EG-DAG: BFE_INT
977; EG-DAG: BFE_INT
978; EG-DAG: BFE_INT
979; EG-DAG: BFE_INT
980; EG-DAG: BFE_INT
981; EG-DAG: BFE_INT
982; EG-DAG: BFE_INT
983; EG-DAG: BFE_INT
984; EG-DAG: BFE_INT
985; EG-DAG: BFE_INT
986; EG-DAG: BFE_INT
987; EG-DAG: BFE_INT
988; EG: LDS_WRITE
989; EG: LDS_WRITE
990; EG: LDS_WRITE
991; EG: LDS_WRITE
992; EG: LDS_WRITE
993; EG: LDS_WRITE
994; EG: LDS_WRITE
995; EG: LDS_WRITE
996; EG: LDS_WRITE
997; EG: LDS_WRITE
998; EG: LDS_WRITE
999; EG: LDS_WRITE
1000; EG: LDS_WRITE
1001; EG: LDS_WRITE
1002; EG: LDS_WRITE
1003; EG: LDS_WRITE
1004define amdgpu_kernel void @local_sextload_v32i8_to_v32i16(<32 x i16> addrspace(3)* %out, <32 x i8> addrspace(3)* %in) #0 {
1005  %load = load <32 x i8>, <32 x i8> addrspace(3)* %in
1006  %ext = sext <32 x i8> %load to <32 x i16>
1007  store <32 x i16> %ext, <32 x i16> addrspace(3)* %out
1008  ret void
1009}
1010
; XFUNC-LABEL: {{^}}local_zextload_v64i8_to_v64i16:
; define amdgpu_kernel void @local_zextload_v64i8_to_v64i16(<64 x i16> addrspace(3)* %out, <64 x i8> addrspace(3)* %in) #0 {
;   %load = load <64 x i8>, <64 x i8> addrspace(3)* %in
;   %ext = zext <64 x i8> %load to <64 x i16>
;   store <64 x i16> %ext, <64 x i16> addrspace(3)* %out
;   ret void
; }

; XFUNC-LABEL: {{^}}local_sextload_v64i8_to_v64i16:
; define amdgpu_kernel void @local_sextload_v64i8_to_v64i16(<64 x i16> addrspace(3)* %out, <64 x i8> addrspace(3)* %in) #0 {
;   %load = load <64 x i8>, <64 x i8> addrspace(3)* %in
;   %ext = sext <64 x i8> %load to <64 x i16>
;   store <64 x i16> %ext, <64 x i16> addrspace(3)* %out
;   ret void
; }

; Tests if ds_read/write_b128 gets generated for the 16 byte aligned load.
; Both the load and the store carry an explicit align 16. Unlike the other
; kernels in this file, this one is declared without the #0 attribute group.
; FUNC-LABEL: {{^}}local_v16i8_to_128:

; SI-NOT: ds_read_b128
; SI-NOT: ds_write_b128

; CIVI: ds_read_b128
; CIVI: ds_write_b128

; EG: LDS_READ_RET
; EG: LDS_READ_RET
; EG: LDS_READ_RET
; EG: LDS_READ_RET
define amdgpu_kernel void @local_v16i8_to_128(<16 x i8> addrspace(3)* %out, <16 x i8> addrspace(3)* %in) {
  %ld = load <16 x i8>, <16 x i8> addrspace(3)* %in, align 16
  store <16 x i8> %ld, <16 x i8> addrspace(3)* %out, align 16
  ret void
}

; Attribute group referenced as #0 by the kernels in this file.
attributes #0 = { nounwind }
