1; RUN: llc -march=amdgcn -mcpu=gfx900 -amdgpu-sroa=0 -mattr=-promote-alloca -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX900,GFX900-MUBUF %s
2; RUN: llc -march=amdgcn -mcpu=gfx906 -amdgpu-sroa=0 -mattr=-promote-alloca -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX906,NO-D16-HI %s
3; RUN: llc -march=amdgcn -mcpu=fiji -amdgpu-sroa=0 -mattr=-promote-alloca -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX803,NO-D16-HI %s
4; RUN: llc -march=amdgcn -mcpu=gfx900 -amdgpu-sroa=0 -mattr=-promote-alloca -amdgpu-enable-flat-scratch -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX900,GFX900-FLATSCR %s
5
; Both halves of the returned <2 x i16> come from local (addrspace(3)) loads,
; and the low-half value has a second use: it is also stored to null.
; The gfx900 checks expect the high half to still be loaded with
; ds_read_u16_d16_hi, on top of a copy of the low value.
6; GCN-LABEL: {{^}}load_local_lo_hi_v2i16_multi_use_lo:
7; GFX900: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
8; GFX900-NEXT: ds_read_u16 v2, v0
9; GFX900-DAG: v_mov_b32_e32 [[ZERO:v[0-9]+]], 0
10; GFX900-DAG: s_waitcnt lgkmcnt(0)
11; GFX900-DAG: v_mov_b32_e32 v1, v2
12; GFX900-DAG: ds_read_u16_d16_hi v1, v0 offset:16
13; GFX900: ds_write_b16 [[ZERO]], v2
14; GFX900-NEXT: s_waitcnt lgkmcnt(1)
15; GFX900-NEXT: v_mov_b32_e32 v0, v1
16; GFX900-NEXT: s_waitcnt lgkmcnt(0)
17; GFX900-NEXT: s_setpc_b64 s[30:31]
18define <2 x i16> @load_local_lo_hi_v2i16_multi_use_lo(i16 addrspace(3)* noalias %in) #0 {
19entry:
  ; i16 element 8 is byte offset 16, matching "offset:16" in the checks.
20  %gep = getelementptr inbounds i16, i16 addrspace(3)* %in, i32 8
21  %load.lo = load i16, i16 addrspace(3)* %in
22  %load.hi = load i16, i16 addrspace(3)* %gep
  ; Extra use of the low half only.
23  store i16 %load.lo, i16 addrspace(3)* null
24  %build0 = insertelement <2 x i16> undef, i16 %load.lo, i32 0
25  %build1 = insertelement <2 x i16> %build0, i16 %load.hi, i32 1
26  ret <2 x i16> %build1
27}
28
; Same shape as the multi_use_lo case, but here the high-half value has the
; second use (it is stored to null). The gfx900 checks expect two plain
; ds_read_u16 loads packed with v_and_b32 + v_lshl_or_b32; no d16_hi load
; appears in the checks.
29; GCN-LABEL: {{^}}load_local_lo_hi_v2i16_multi_use_hi:
30; GFX900: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
31; GFX900-DAG: ds_read_u16 [[LO:v[0-9]+]], v0
32; GFX900-DAG: ds_read_u16 [[HI:v[0-9]+]], v0 offset:16
33; GFX900-DAG: v_mov_b32_e32 [[ZERO:v[0-9]+]], 0
34; GFX900-DAG: v_and_b32_e32 [[AND:v[0-9]+]], 0xffff, [[LO]]
35; GFX900-DAG: s_waitcnt lgkmcnt(0)
36; GFX900-DAG: ds_write_b16 [[ZERO]], [[HI]]
37; GFX900: v_lshl_or_b32 [[HI]], [[HI]], 16, [[AND]]
38; GFX900-NEXT: s_waitcnt lgkmcnt(0)
39; GFX900-NEXT: s_setpc_b64 s[30:31]
40define <2 x i16> @load_local_lo_hi_v2i16_multi_use_hi(i16 addrspace(3)* noalias %in) #0 {
41entry:
  ; i16 element 8 is byte offset 16, matching "offset:16" in the checks.
42  %gep = getelementptr inbounds i16, i16 addrspace(3)* %in, i32 8
43  %load.lo = load i16, i16 addrspace(3)* %in
44  %load.hi = load i16, i16 addrspace(3)* %gep
  ; Extra use of the high half only.
45  store i16 %load.hi, i16 addrspace(3)* null
46  %build0 = insertelement <2 x i16> undef, i16 %load.lo, i32 0
47  %build1 = insertelement <2 x i16> %build0, i16 %load.hi, i32 1
48  ret <2 x i16> %build1
49}
50
; Both loaded halves have an extra use: each is stored through its own
; noalias output pointer before being packed into the returned vector.
; The gfx900 checks pin the full sequence: two plain ds_read_u16 loads, the
; two ds_write_b16 stores, then v_and_b32 + v_lshl_or_b32 to pack.
51; GCN-LABEL: {{^}}load_local_lo_hi_v2i16_multi_use_lohi:
52; GFX900: ds_read_u16 v3, v0
53; GFX900-NEXT: ds_read_u16 v0, v0 offset:16
54; GFX900-NEXT: s_waitcnt lgkmcnt(1)
55; GFX900-NEXT: ds_write_b16 v1, v3
56; GFX900-NEXT: s_waitcnt lgkmcnt(1)
57; GFX900-NEXT: ds_write_b16 v2, v0
58; GFX900-NEXT: v_and_b32_e32 v1, 0xffff, v3
59; GFX900-NEXT: v_lshl_or_b32 v0, v0, 16, v1
60; GFX900-NEXT: s_waitcnt lgkmcnt(0)
61; GFX900-NEXT: s_setpc_b64 s[30:31]
62define <2 x i16> @load_local_lo_hi_v2i16_multi_use_lohi(i16 addrspace(3)* noalias %in, i16 addrspace(3)* noalias %out0, i16 addrspace(3)* noalias %out1) #0 {
63entry:
  ; i16 element 8 is byte offset 16, matching "offset:16" in the checks.
64  %gep = getelementptr inbounds i16, i16 addrspace(3)* %in, i32 8
65  %load.lo = load i16, i16 addrspace(3)* %in
66  %load.hi = load i16, i16 addrspace(3)* %gep
  ; Extra uses of both halves.
67  store i16 %load.lo, i16 addrspace(3)* %out0
68  store i16 %load.hi, i16 addrspace(3)* %out1
69  %build0 = insertelement <2 x i16> undef, i16 %load.lo, i32 0
70  %build1 = insertelement <2 x i16> %build0, i16 %load.hi, i32 1
71  ret <2 x i16> %build1
72}
73
; A local (addrspace(3)) i16 load is inserted into element 1 of an undef
; <2 x i16>, so the low half is unconstrained. The gfx900 checks expect a
; single ds_read_u16_d16_hi straight into v0; the NO-D16-HI runs (gfx906 and
; fiji per the RUN lines) expect a plain ds_read_u16.
74; GCN-LABEL: {{^}}load_local_hi_v2i16_undeflo:
75; GCN: s_waitcnt
76; GFX900-NEXT: ds_read_u16_d16_hi v0, v0
77; GFX900-NEXT: s_waitcnt
78; GFX900-NEXT: s_setpc_b64
79
80; NO-D16-HI: ds_read_u16 v
81define <2 x i16> @load_local_hi_v2i16_undeflo(i16 addrspace(3)* %in) #0 {
82entry:
83  %load = load i16, i16 addrspace(3)* %in
  ; Only the high element is defined.
84  %build = insertelement <2 x i16> undef, i16 %load, i32 1
85  ret <2 x i16> %build
86}
87
; The low half comes from the %reg argument and the high half from a local
; (addrspace(3)) i16 load. The gfx900 checks expect ds_read_u16_d16_hi into
; v1 followed by a copy into the return register v0; the NO-D16-HI runs
; expect a plain ds_read_u16.
88; GCN-LABEL: {{^}}load_local_hi_v2i16_reglo:
89; GCN: s_waitcnt
90; GFX900-NEXT: ds_read_u16_d16_hi v1, v0
91; GFX900-NEXT: s_waitcnt
92; GFX900-NEXT: v_mov_b32_e32 v0, v1
93; GFX900-NEXT: s_setpc_b64
94
95; NO-D16-HI: ds_read_u16 v
96define <2 x i16> @load_local_hi_v2i16_reglo(i16 addrspace(3)* %in, i16 %reg) #0 {
97entry:
98  %load = load i16, i16 addrspace(3)* %in
  ; Element 0 from the argument, element 1 from memory.
99  %build0 = insertelement <2 x i16> undef, i16 %reg, i32 0
100  %build1 = insertelement <2 x i16> %build0, i16 %load, i32 1
101  ret <2 x i16> %build1
102}
103
104; Show that we get reasonable regalloc without physreg constraints.
; Same vector build as load_local_hi_v2i16_reglo, but the result is stored to
; an undef global pointer instead of being returned, so the gfx900 checks
; expect the d16_hi load result (v1) to feed global_store_dword directly with
; no extra copy.
105; GCN-LABEL: {{^}}load_local_hi_v2i16_reglo_vreg:
106; GCN: s_waitcnt
107; GFX900-NEXT: ds_read_u16_d16_hi v1, v0
108; GFX900-NEXT: s_waitcnt
109; GFX900-NEXT: global_store_dword v[0:1], v1, off{{$}}
110; GFX900-NEXT: s_waitcnt
111; GFX900-NEXT: s_setpc_b64
112
113; NO-D16-HI: ds_read_u16 v
114define void @load_local_hi_v2i16_reglo_vreg(i16 addrspace(3)* %in, i16 %reg) #0 {
115entry:
116  %load = load i16, i16 addrspace(3)* %in
117  %build0 = insertelement <2 x i16> undef, i16 %reg, i32 0
118  %build1 = insertelement <2 x i16> %build0, i16 %load, i32 1
  ; Store instead of return, so the result is not tied to a return register.
119  store <2 x i16> %build1, <2 x i16> addrspace(1)* undef
120  ret void
121}
122
; The low half is the constant zero (insert into zeroinitializer) and the
; high half is a local (addrspace(3)) i16 load. The gfx900 checks expect the
; destination to be zeroed with v_mov_b32 first and then filled by
; ds_read_u16_d16_hi; the NO-D16-HI runs expect a plain ds_read_u16.
123; GCN-LABEL: {{^}}load_local_hi_v2i16_zerolo:
124; GCN: s_waitcnt
125; GFX900-NEXT: v_mov_b32_e32 v1, 0
126; GFX900-NEXT: ds_read_u16_d16_hi v1, v0
127; GFX900-NEXT: s_waitcnt
128; GFX900-NEXT: v_mov_b32_e32 v0, v1
129; GFX900-NEXT: s_setpc_b64
130
131; NO-D16-HI: ds_read_u16 v
132define <2 x i16> @load_local_hi_v2i16_zerolo(i16 addrspace(3)* %in) #0 {
133entry:
134  %load = load i16, i16 addrspace(3)* %in
  ; Element 0 stays zero; element 1 is the loaded value.
135  %build = insertelement <2 x i16> zeroinitializer, i16 %load, i32 1
136  ret <2 x i16> %build
137}
138
139; FIXME: Remove m0 initialization
; NOTE(review): none of the checks in this block mention m0; confirm whether
; the FIXME above is still current.
;
; Scalar form of "value in the high half, zero in the low half": a local
; (addrspace(3)) i16 load is zero-extended to i32 and shifted left by 16.
; All runs are checked for a plain ds_read_u16 followed by v_lshlrev_b32 by
; 16; no d16_hi load appears in the checks.
140; GCN-LABEL: {{^}}load_local_hi_v2i16_zerolo_shift:
141; GCN: s_waitcnt
142; GFX900-NEXT: ds_read_u16 v0, v0
143; GFX900-NEXT: s_waitcnt lgkmcnt(0)
144; GFX900-NEXT: v_lshlrev_b32_e32 v0, 16, v0
145; GFX900-NEXT: s_setpc_b64
146
147; NO-D16-HI: ds_read_u16 v
148; NO-D16-HI: v_lshlrev_b32_e32 v0, 16, v0
149define i32 @load_local_hi_v2i16_zerolo_shift(i16 addrspace(3)* %in) #0 {
150entry:
151  %load = load i16, i16 addrspace(3)* %in
152  %zext = zext i16 %load to i32
  ; Move the loaded 16 bits into bits 31..16 of the i32 result.
153  %shift = shl i32 %zext, 16
154  ret i32 %shift
155}
156
; Half-precision variant of load_local_hi_v2i16_reglo_vreg: the low element
; is the half argument %reg, the high element is a local (addrspace(3)) half
; load, and the packed <2 x half> is stored to an undef global pointer.
; The checks are the same as for the i16 version.
157; GCN-LABEL: {{^}}load_local_hi_v2f16_reglo_vreg:
158; GCN: s_waitcnt
159; GFX900-NEXT: ds_read_u16_d16_hi v1, v0
160; GFX900-NEXT: s_waitcnt
161; GFX900-NEXT: global_store_dword v[0:1], v1, off{{$}}
162; GFX900-NEXT: s_waitcnt
163; GFX900-NEXT: s_setpc_b64
164
165; NO-D16-HI: ds_read_u16 v
166define void @load_local_hi_v2f16_reglo_vreg(half addrspace(3)* %in, half %reg) #0 {
167entry:
168  %load = load half, half addrspace(3)* %in
169  %build0 = insertelement <2 x half> undef, half %reg, i32 0
170  %build1 = insertelement <2 x half> %build0, half %load, i32 1
171  store <2 x half> %build1, <2 x half> addrspace(1)* undef
172  ret void
173}
174
; The high element is a local (addrspace(3)) i8 load zero-extended to i16;
; the low element is %reg. The gfx900 checks expect the extension to be
; folded into ds_read_u8_d16_hi; the NO-D16-HI runs expect a plain
; ds_read_u8.
175; GCN-LABEL: {{^}}load_local_hi_v2i16_reglo_vreg_zexti8:
176; GCN: s_waitcnt
177; GFX900-NEXT: ds_read_u8_d16_hi v1, v0
178; GFX900-NEXT: s_waitcnt
179; GFX900-NEXT: global_store_dword v[0:1], v1, off{{$}}
180; GFX900-NEXT: s_waitcnt
181; GFX900-NEXT: s_setpc_b64
182
183; NO-D16-HI: ds_read_u8 v
184define void @load_local_hi_v2i16_reglo_vreg_zexti8(i8 addrspace(3)* %in, i16 %reg) #0 {
185entry:
186  %load = load i8, i8 addrspace(3)* %in
187  %ext = zext i8 %load to i16
188  %build0 = insertelement <2 x i16> undef, i16 %reg, i32 0
189  %build1 = insertelement <2 x i16> %build0, i16 %ext, i32 1
190  store <2 x i16> %build1, <2 x i16> addrspace(1)* undef
191  ret void
192}
193
; The high element is a local (addrspace(3)) i8 load sign-extended to i16;
; the low element is %reg. The gfx900 checks expect the extension to be
; folded into ds_read_i8_d16_hi; the NO-D16-HI runs expect a plain
; ds_read_i8.
194; GCN-LABEL: {{^}}load_local_hi_v2i16_reglo_vreg_sexti8:
195; GCN: s_waitcnt
196; GFX900-NEXT: ds_read_i8_d16_hi v1, v0
197; GFX900-NEXT: s_waitcnt
198; GFX900-NEXT: global_store_dword v[0:1], v1, off{{$}}
199; GFX900-NEXT: s_waitcnt
200; GFX900-NEXT: s_setpc_b64
201
202; NO-D16-HI: ds_read_i8 v
203define void @load_local_hi_v2i16_reglo_vreg_sexti8(i8 addrspace(3)* %in, i16 %reg) #0 {
204entry:
205  %load = load i8, i8 addrspace(3)* %in
206  %ext = sext i8 %load to i16
207  %build0 = insertelement <2 x i16> undef, i16 %reg, i32 0
208  %build1 = insertelement <2 x i16> %build0, i16 %ext, i32 1
209  store <2 x i16> %build1, <2 x i16> addrspace(1)* undef
210  ret void
211}
212
; <2 x half> variant of the local zext-i8 case: the zero-extended i8 load is
; bitcast from i16 to half before being inserted as the high element.
; The checks match the i16 version (ds_read_u8_d16_hi on gfx900).
213; GCN-LABEL: {{^}}load_local_hi_v2f16_reglo_vreg_zexti8:
214; GCN: s_waitcnt
215; GFX900-NEXT: ds_read_u8_d16_hi v1, v0
216; GFX900-NEXT: s_waitcnt
217; GFX900-NEXT: global_store_dword v[0:1], v1, off{{$}}
218; GFX900-NEXT: s_waitcnt
219; GFX900-NEXT: s_setpc_b64
220
221; NO-D16-HI: ds_read_u8 v
222define void @load_local_hi_v2f16_reglo_vreg_zexti8(i8 addrspace(3)* %in, half %reg) #0 {
223entry:
224  %load = load i8, i8 addrspace(3)* %in
225  %ext = zext i8 %load to i16
  ; Reinterpret the 16 extended bits as half; no value conversion.
226  %bitcast = bitcast i16 %ext to half
227
228  %build0 = insertelement <2 x half> undef, half %reg, i32 0
229  %build1 = insertelement <2 x half> %build0, half %bitcast, i32 1
230  store <2 x half> %build1, <2 x half> addrspace(1)* undef
231  ret void
232}
233
; <2 x half> variant of the local sext-i8 case: the sign-extended i8 load is
; bitcast from i16 to half before being inserted as the high element.
; The checks match the i16 version (ds_read_i8_d16_hi on gfx900).
234; GCN-LABEL: {{^}}load_local_hi_v2f16_reglo_vreg_sexti8:
235; GCN: s_waitcnt
236; GFX900-NEXT: ds_read_i8_d16_hi v1, v0
237; GFX900-NEXT: s_waitcnt
238; GFX900-NEXT: global_store_dword v[0:1], v1, off{{$}}
239; GFX900-NEXT: s_waitcnt
240; GFX900-NEXT: s_setpc_b64
241
242; NO-D16-HI: ds_read_i8 v
243define void @load_local_hi_v2f16_reglo_vreg_sexti8(i8 addrspace(3)* %in, half %reg) #0 {
244entry:
245  %load = load i8, i8 addrspace(3)* %in
246  %ext = sext i8 %load to i16
  ; Reinterpret the 16 extended bits as half; no value conversion.
247  %bitcast = bitcast i16 %ext to half
248
249  %build0 = insertelement <2 x half> undef, half %reg, i32 0
250  %build1 = insertelement <2 x half> %build0, half %bitcast, i32 1
251  store <2 x half> %build1, <2 x half> addrspace(1)* undef
252  ret void
253}
254
; Global (addrspace(1)) version: the high element is an i16 load at a
; negative constant offset, the low element is %reg. The gfx900 checks expect
; global_load_short_d16_hi with the offset folded into the instruction.
; Only the gfx900 runs have instruction checks for this function.
255; GCN-LABEL: {{^}}load_global_hi_v2i16_reglo_vreg:
256; GCN: s_waitcnt
257; GFX900-NEXT: global_load_short_d16_hi v2, v[0:1], off offset:-4094
258; GFX900-NEXT: s_waitcnt
259; GFX900-NEXT: global_store_dword
260; GFX900-NEXT: s_waitcnt
261; GFX900-NEXT: s_setpc_b64
262define void @load_global_hi_v2i16_reglo_vreg(i16 addrspace(1)* %in, i16 %reg) #0 {
263entry:
  ; -2047 i16 elements = -4094 bytes, the offset expected in the checks.
264  %gep = getelementptr inbounds i16, i16 addrspace(1)* %in, i64 -2047
265  %load = load i16, i16 addrspace(1)* %gep
266  %build0 = insertelement <2 x i16> undef, i16 %reg, i32 0
267  %build1 = insertelement <2 x i16> %build0, i16 %load, i32 1
268  store <2 x i16> %build1, <2 x i16> addrspace(1)* undef
269  ret void
270}
271
; Half-precision variant of load_global_hi_v2i16_reglo_vreg: a global
; (addrspace(1)) half load at a negative constant offset becomes the high
; element of a <2 x half> whose low element is %reg. The gfx900 checks are
; the same as for the i16 version.
272; GCN-LABEL: {{^}}load_global_hi_v2f16_reglo_vreg:
273; GCN: s_waitcnt
274; GFX900-NEXT: global_load_short_d16_hi v2, v[0:1], off offset:-4094
275; GFX900-NEXT: s_waitcnt
276; GFX900-NEXT: global_store_dword
277; GFX900-NEXT: s_waitcnt
278; GFX900-NEXT: s_setpc_b64
279define void @load_global_hi_v2f16_reglo_vreg(half addrspace(1)* %in, half %reg) #0 {
280entry:
  ; -2047 half elements = -4094 bytes, the offset expected in the checks.
281  %gep = getelementptr inbounds half, half addrspace(1)* %in, i64 -2047
282  %load = load half, half addrspace(1)* %gep
283  %build0 = insertelement <2 x half> undef, half %reg, i32 0
284  %build1 = insertelement <2 x half> %build0, half %load, i32 1
285  store <2 x half> %build1, <2 x half> addrspace(1)* undef
286  ret void
287}
288
; Global (addrspace(1)) i8 load at byte offset -4095, zero-extended to i16
; and used as the high element; the low element is %reg. The gfx900 checks
; expect global_load_ubyte_d16_hi with the offset folded in.
289; GCN-LABEL: {{^}}load_global_hi_v2i16_reglo_vreg_zexti8:
290; GCN: s_waitcnt
291; GFX900-NEXT: global_load_ubyte_d16_hi v2, v[0:1], off offset:-4095
292; GFX900-NEXT: s_waitcnt
293; GFX900-NEXT: global_store_dword
294; GFX900-NEXT: s_waitcnt
295; GFX900-NEXT: s_setpc_b64
296define void @load_global_hi_v2i16_reglo_vreg_zexti8(i8 addrspace(1)* %in, i16 %reg) #0 {
297entry:
298  %gep = getelementptr inbounds i8, i8 addrspace(1)* %in, i64 -4095
299  %load = load i8, i8 addrspace(1)* %gep
300  %ext = zext i8 %load to i16
301  %build0 = insertelement <2 x i16> undef, i16 %reg, i32 0
302  %build1 = insertelement <2 x i16> %build0, i16 %ext, i32 1
303  store <2 x i16> %build1, <2 x i16> addrspace(1)* undef
304  ret void
305}
306
; Global (addrspace(1)) i8 load at byte offset -4095, sign-extended to i16
; and used as the high element; the low element is %reg. The gfx900 checks
; expect global_load_sbyte_d16_hi with the offset folded in.
307; GCN-LABEL: {{^}}load_global_hi_v2i16_reglo_vreg_sexti8:
308; GCN: s_waitcnt
309; GFX900-NEXT: global_load_sbyte_d16_hi v2, v[0:1], off offset:-4095
310; GFX900-NEXT: s_waitcnt
311; GFX900-NEXT: global_store_dword
312; GFX900-NEXT: s_waitcnt
313; GFX900-NEXT: s_setpc_b64
314define void @load_global_hi_v2i16_reglo_vreg_sexti8(i8 addrspace(1)* %in, i16 %reg) #0 {
315entry:
316  %gep = getelementptr inbounds i8, i8 addrspace(1)* %in, i64 -4095
317  %load = load i8, i8 addrspace(1)* %gep
318  %ext = sext i8 %load to i16
319  %build0 = insertelement <2 x i16> undef, i16 %reg, i32 0
320  %build1 = insertelement <2 x i16> %build0, i16 %ext, i32 1
321  store <2 x i16> %build1, <2 x i16> addrspace(1)* undef
322  ret void
323}
324
; <2 x half> variant of the global sext-i8 case: the sign-extended i8 load is
; bitcast from i16 to half and inserted as the high element. The gfx900
; checks expect global_load_sbyte_d16_hi at offset -4095.
325; GCN-LABEL: {{^}}load_global_hi_v2f16_reglo_vreg_sexti8:
326; GCN: s_waitcnt
327; GFX900-NEXT: global_load_sbyte_d16_hi v2, v[0:1], off offset:-4095
328; GFX900-NEXT: s_waitcnt
329; GFX900-NEXT: global_store_dword
330; GFX900-NEXT: s_waitcnt
331; GFX900-NEXT: s_setpc_b64
332define void @load_global_hi_v2f16_reglo_vreg_sexti8(i8 addrspace(1)* %in, half %reg) #0 {
333entry:
334  %gep = getelementptr inbounds i8, i8 addrspace(1)* %in, i64 -4095
335  %load = load i8, i8 addrspace(1)* %gep
336  %ext = sext i8 %load to i16
  ; Reinterpret the 16 extended bits as half; no value conversion.
337  %bitcast = bitcast i16 %ext to half
338  %build0 = insertelement <2 x half> undef, half %reg, i32 0
339  %build1 = insertelement <2 x half> %build0, half %bitcast, i32 1
340  store <2 x half> %build1, <2 x half> addrspace(1)* undef
341  ret void
342}
343
; <2 x half> variant of the global zext-i8 case: the zero-extended i8 load is
; bitcast from i16 to half and inserted as the high element. The gfx900
; checks expect global_load_ubyte_d16_hi at offset -4095.
344; GCN-LABEL: {{^}}load_global_hi_v2f16_reglo_vreg_zexti8:
345; GCN: s_waitcnt
346; GFX900-NEXT: global_load_ubyte_d16_hi v2, v[0:1], off offset:-4095
347; GFX900-NEXT: s_waitcnt
348; GFX900-NEXT: global_store_dword
349; GFX900-NEXT: s_waitcnt
350; GFX900-NEXT: s_setpc_b64
351define void @load_global_hi_v2f16_reglo_vreg_zexti8(i8 addrspace(1)* %in, half %reg) #0 {
352entry:
353  %gep = getelementptr inbounds i8, i8 addrspace(1)* %in, i64 -4095
354  %load = load i8, i8 addrspace(1)* %gep
355  %ext = zext i8 %load to i16
  ; Reinterpret the 16 extended bits as half; no value conversion.
356  %bitcast = bitcast i16 %ext to half
357  %build0 = insertelement <2 x half> undef, half %reg, i32 0
358  %build1 = insertelement <2 x half> %build0, half %bitcast, i32 1
359  store <2 x half> %build1, <2 x half> addrspace(1)* undef
360  ret void
361}
362
; Flat (default address space) version: the high element is an i16 load
; through a generic pointer, the low element is %reg, and the packed vector
; is stored to an undef global pointer. The gfx900 checks expect
; flat_load_short_d16_hi. The NO-D16-HI runs expect a plain flat_load_ushort
; that is then packed: v_lshlrev_b32 + v_or_b32_sdwa on fiji (GFX803 prefix)
; and v_lshl_or_b32 on gfx906.
;
; The label check is anchored with {{^}} like the other label checks in this
; file, so it can only match the function's own label line.
363; GCN-LABEL: {{^}}load_flat_hi_v2i16_reglo_vreg:
364; GCN: s_waitcnt
365; GFX900-NEXT: flat_load_short_d16_hi v2, v[0:1]
366; GFX900-NEXT: s_waitcnt
367; GFX900-NEXT: global_store_dword v[0:1], v2
368; GFX900-NEXT: s_waitcnt
369; GFX900-NEXT: s_setpc_b64
370
371; NO-D16-HI: flat_load_ushort v{{[0-9]+}}
372; GFX803: v_lshlrev_b32_e32 v{{[0-9]+}}, 16,
373; GFX803: v_or_b32_sdwa
374; GFX906: v_lshl_or_b32 v{{[0-9]+}}, v{{[0-9]+}}, 16,
375define void @load_flat_hi_v2i16_reglo_vreg(i16* %in, i16 %reg) #0 {
376entry:
377  %load = load i16, i16* %in
378  %build0 = insertelement <2 x i16> undef, i16 %reg, i32 0
379  %build1 = insertelement <2 x i16> %build0, i16 %load, i32 1
380  store <2 x i16> %build1, <2 x i16> addrspace(1)* undef
381  ret void
382}
383
; Half-precision variant of the flat case: a half load through a generic
; pointer becomes the high element of a <2 x half> whose low element is %reg.
; The gfx900 checks expect flat_load_short_d16_hi. The NO-D16-HI runs expect
; flat_load_ushort plus packing (v_lshlrev_b32 + v_or_b32_sdwa on fiji,
; v_lshl_or_b32 on gfx906).
384; GCN-LABEL: {{^}}load_flat_hi_v2f16_reglo_vreg:
385; GCN: s_waitcnt
386; GFX900-NEXT: flat_load_short_d16_hi v2, v[0:1]
387; GFX900-NEXT: s_waitcnt
388; GFX900-NEXT: global_store_dword v[0:1], v2
389; GFX900-NEXT: s_waitcnt
390; GFX900-NEXT: s_setpc_b64
391
392; NO-D16-HI: flat_load_ushort v{{[0-9]+}}
393; GFX803: v_lshlrev_b32_e32 v{{[0-9]+}}, 16,
394; GFX803: v_or_b32_sdwa
395; GFX906: v_lshl_or_b32 v{{[0-9]+}}, v{{[0-9]+}}, 16,
396define void @load_flat_hi_v2f16_reglo_vreg(half* %in, half %reg) #0 {
397entry:
398  %load = load half, half* %in
399  %build0 = insertelement <2 x half> undef, half %reg, i32 0
400  %build1 = insertelement <2 x half> %build0, half %load, i32 1
401  store <2 x half> %build1, <2 x half> addrspace(1)* undef
402  ret void
403}
404
; Flat i8 load zero-extended to i16 and used as the high element; the low
; element is %reg. The gfx900 checks expect flat_load_ubyte_d16_hi. The
; NO-D16-HI runs expect flat_load_ubyte plus packing (v_lshlrev_b32 +
; v_or_b32_sdwa on fiji, v_lshl_or_b32 on gfx906).
405; GCN-LABEL: {{^}}load_flat_hi_v2i16_reglo_vreg_zexti8:
406; GCN: s_waitcnt
407; GFX900-NEXT: flat_load_ubyte_d16_hi v2, v[0:1]
408; GFX900-NEXT: s_waitcnt
409; GFX900-NEXT: global_store_dword v[0:1], v2
410; GFX900-NEXT: s_waitcnt
411; GFX900-NEXT: s_setpc_b64
412
413; NO-D16-HI: flat_load_ubyte v{{[0-9]+}}
414; GFX803: v_lshlrev_b32_e32 v{{[0-9]+}}, 16,
415; GFX803: v_or_b32_sdwa
416; GFX906: v_lshl_or_b32 v{{[0-9]+}}, v{{[0-9]+}}, 16,
417define void @load_flat_hi_v2i16_reglo_vreg_zexti8(i8* %in, i16 %reg) #0 {
418entry:
419  %load = load i8, i8* %in
420  %ext = zext i8 %load to i16
421  %build0 = insertelement <2 x i16> undef, i16 %reg, i32 0
422  %build1 = insertelement <2 x i16> %build0, i16 %ext, i32 1
423  store <2 x i16> %build1, <2 x i16> addrspace(1)* undef
424  ret void
425}
426
; Flat i8 load sign-extended to i16 and used as the high element; the low
; element is %reg. The gfx900 checks expect flat_load_sbyte_d16_hi. The
; NO-D16-HI runs expect flat_load_sbyte plus packing (v_lshlrev_b32 +
; v_or_b32_sdwa on fiji, v_lshl_or_b32 on gfx906).
427; GCN-LABEL: {{^}}load_flat_hi_v2i16_reglo_vreg_sexti8:
428; GCN: s_waitcnt
429; GFX900-NEXT: flat_load_sbyte_d16_hi v2, v[0:1]
430; GFX900-NEXT: s_waitcnt
431; GFX900-NEXT: global_store_dword v[0:1], v2
432; GFX900-NEXT: s_waitcnt
433; GFX900-NEXT: s_setpc_b64
434
435; NO-D16-HI: flat_load_sbyte v{{[0-9]+}}
436; GFX803: v_lshlrev_b32_e32 v{{[0-9]+}}, 16,
437; GFX803: v_or_b32_sdwa
438; GFX906: v_lshl_or_b32 v{{[0-9]+}}, v{{[0-9]+}}, 16,
439define void @load_flat_hi_v2i16_reglo_vreg_sexti8(i8* %in, i16 %reg) #0 {
440entry:
441  %load = load i8, i8* %in
442  %ext = sext i8 %load to i16
443  %build0 = insertelement <2 x i16> undef, i16 %reg, i32 0
444  %build1 = insertelement <2 x i16> %build0, i16 %ext, i32 1
445  store <2 x i16> %build1, <2 x i16> addrspace(1)* undef
446  ret void
447}
448
; <2 x half> variant of the flat zext-i8 case: the zero-extended i8 load is
; bitcast from i16 to half and inserted as the high element. The checks are
; the same as for the i16 version (flat_load_ubyte_d16_hi on gfx900).
449; GCN-LABEL: {{^}}load_flat_hi_v2f16_reglo_vreg_zexti8:
450; GCN: s_waitcnt
451; GFX900-NEXT: flat_load_ubyte_d16_hi v2, v[0:1]
452; GFX900-NEXT: s_waitcnt
453; GFX900-NEXT: global_store_dword v[0:1], v2
454; GFX900-NEXT: s_waitcnt
455; GFX900-NEXT: s_setpc_b64
456
457; NO-D16-HI: flat_load_ubyte v{{[0-9]+}}
458; GFX803: v_lshlrev_b32_e32 v{{[0-9]+}}, 16,
459; GFX803: v_or_b32_sdwa
460; GFX906: v_lshl_or_b32 v{{[0-9]+}}, v{{[0-9]+}}, 16,
461define void @load_flat_hi_v2f16_reglo_vreg_zexti8(i8* %in, half %reg) #0 {
462entry:
463  %load = load i8, i8* %in
464  %ext = zext i8 %load to i16
  ; Reinterpret the 16 extended bits as half; no value conversion.
465  %bitcast = bitcast i16 %ext to half
466  %build0 = insertelement <2 x half> undef, half %reg, i32 0
467  %build1 = insertelement <2 x half> %build0, half %bitcast, i32 1
468  store <2 x half> %build1, <2 x half> addrspace(1)* undef
469  ret void
470}
471
; <2 x half> variant of the flat sext-i8 case: the sign-extended i8 load is
; bitcast from i16 to half and inserted as the high element. The checks are
; the same as for the i16 version (flat_load_sbyte_d16_hi on gfx900).
472; GCN-LABEL: {{^}}load_flat_hi_v2f16_reglo_vreg_sexti8:
473; GCN: s_waitcnt
474; GFX900-NEXT: flat_load_sbyte_d16_hi v2, v[0:1]
475; GFX900-NEXT: s_waitcnt
476; GFX900-NEXT: global_store_dword v[0:1], v2
477; GFX900-NEXT: s_waitcnt
478; GFX900-NEXT: s_setpc_b64
479
480; NO-D16-HI: flat_load_sbyte v{{[0-9]+}}
481; GFX803: v_lshlrev_b32_e32 v{{[0-9]+}}, 16,
482; GFX803: v_or_b32_sdwa
483; GFX906: v_lshl_or_b32 v{{[0-9]+}}, v{{[0-9]+}}, 16,
484define void @load_flat_hi_v2f16_reglo_vreg_sexti8(i8* %in, half %reg) #0 {
485entry:
486  %load = load i8, i8* %in
487  %ext = sext i8 %load to i16
  ; Reinterpret the 16 extended bits as half; no value conversion.
488  %bitcast = bitcast i16 %ext to half
489  %build0 = insertelement <2 x half> undef, half %reg, i32 0
490  %build1 = insertelement <2 x half> %build0, half %bitcast, i32 1
491  store <2 x half> %build1, <2 x half> addrspace(1)* undef
492  ret void
493}
494
; Private (addrspace(5)) version: %in is a byval i16 argument and the high
; element is loaded 2047 elements past it; the low element is %reg.
; The gfx900 checks expect a d16_hi load addressed off s32 with offset 4094:
; buffer_load_short_d16_hi in the default (MUBUF) run and
; scratch_load_short_d16_hi in the -amdgpu-enable-flat-scratch run.
; The NO-D16-HI runs expect a plain buffer_load_ushort at the same address.
495; GCN-LABEL: {{^}}load_private_hi_v2i16_reglo_vreg:
496; GCN: s_waitcnt
497; GFX900-MUBUF:   buffer_load_short_d16_hi v0, off, s[0:3], s32 offset:4094{{$}}
498; GFX900-FLATSCR: scratch_load_short_d16_hi v0, off, s32 offset:4094{{$}}
499; GFX900-NEXT: s_waitcnt
500; GFX900-NEXT: global_store_dword v{{\[[0-9]+:[0-9]+\]}}, v0
501; GFX900-NEXT: s_waitcnt
502; GFX900-NEXT: s_setpc_b64
503
504; NO-D16-HI: buffer_load_ushort v{{[0-9]+}}, off, s[0:3], s32 offset:4094{{$}}
505define void @load_private_hi_v2i16_reglo_vreg(i16 addrspace(5)* byval(i16) %in, i16 %reg) #0 {
506entry:
  ; 2047 i16 elements = 4094 bytes, the offset expected in the checks.
507  %gep = getelementptr inbounds i16, i16 addrspace(5)* %in, i64 2047
508  %load = load i16, i16 addrspace(5)* %gep
509  %build0 = insertelement <2 x i16> undef, i16 %reg, i32 0
510  %build1 = insertelement <2 x i16> %build0, i16 %load, i32 1
511  store <2 x i16> %build1, <2 x i16> addrspace(1)* undef
512  ret void
513}
514
; Half-precision variant of load_private_hi_v2i16_reglo_vreg: %in is a byval
; half argument in private (addrspace(5)) memory and the high element is
; loaded 2047 elements past it. The checks are the same as for the i16
; version (d16_hi load off s32 with offset 4094 on gfx900).
515; GCN-LABEL: {{^}}load_private_hi_v2f16_reglo_vreg:
516; GCN: s_waitcnt
517; GFX900-MUBUF:   buffer_load_short_d16_hi v0, off, s[0:3], s32 offset:4094{{$}}
518; GFX900-FLATSCR: scratch_load_short_d16_hi v0, off, s32 offset:4094{{$}}
519; GFX900-NEXT: s_waitcnt
520; GFX900-NEXT: global_store_dword v{{\[[0-9]+:[0-9]+\]}}, v0
521; GFX900-NEXT: s_waitcnt
522; GFX900-NEXT: s_setpc_b64
523
524; NO-D16-HI: buffer_load_ushort v{{[0-9]+}}, off, s[0:3], s32 offset:4094{{$}}
525define void @load_private_hi_v2f16_reglo_vreg(half addrspace(5)* byval(half) %in, half %reg) #0 {
526entry:
  ; 2047 half elements = 4094 bytes, the offset expected in the checks.
527  %gep = getelementptr inbounds half, half addrspace(5)* %in, i64 2047
528  %load = load half, half addrspace(5)* %gep
529  %build0 = insertelement <2 x half> undef, half %reg, i32 0
530  %build1 = insertelement <2 x half> %build0, half %load, i32 1
531  store <2 x half> %build1, <2 x half> addrspace(1)* undef
532  ret void
533}
534
; The high element is a volatile i16 load from the constant private
; (addrspace(5)) address 4094 (built with inttoptr; the byval %in argument is
; not used by the load); the low element is %reg.
; The gfx900 MUBUF run expects buffer_load_short_d16_hi with a zero soffset
; and offset 4094. The flat-scratch run expects the address to be
; materialized in an SGPR with s_movk_i32 0xffe (= 4094) and used by
; scratch_load_short_d16_hi. The NO-D16-HI runs expect a plain
; buffer_load_ushort.
;
; The MUBUF check uses the GFX900-MUBUF prefix enabled by the first RUN line;
; a misspelled prefix would silently disable the check.
535; GCN-LABEL: {{^}}load_private_hi_v2i16_reglo_vreg_nooff:
536; GCN: s_waitcnt
537; GFX900-MUBUF:   buffer_load_short_d16_hi v0, off, s[0:3], 0 offset:4094{{$}}
538; GFX900-FLATSCR: s_movk_i32 [[SOFF:[^,]+]], 0xffe
539; GFX900-FLATSCR: scratch_load_short_d16_hi v0, off, [[SOFF]]{{$}}
540; GFX900: s_waitcnt
541; GFX900-NEXT: global_store_dword v{{\[[0-9]+:[0-9]+\]}}, v0
542; GFX900-NEXT: s_waitcnt
543; GFX900-NEXT: s_setpc_b64
544
545; NO-D16-HI: buffer_load_ushort v{{[0-9]+}}, off, s[0:3], 0 offset:4094{{$}}
546define void @load_private_hi_v2i16_reglo_vreg_nooff(i16 addrspace(5)* byval(i16) %in, i16 %reg) #0 {
547entry:
  ; Absolute address, not derived from %in.
548  %load = load volatile i16, i16 addrspace(5)* inttoptr (i32 4094 to i16 addrspace(5)*)
549  %build0 = insertelement <2 x i16> undef, i16 %reg, i32 0
550  %build1 = insertelement <2 x i16> %build0, i16 %load, i32 1
551  store <2 x i16> %build1, <2 x i16> addrspace(1)* undef
552  ret void
553}
554
; Half-precision variant of the "nooff" case: the high element is a volatile
; half load from the constant private (addrspace(5)) address 4094; %in is not
; used by the load. The gfx900 checks expect the d16_hi load into v1:
; buffer_load_short_d16_hi with zero soffset and offset 4094 in the MUBUF
; run, or s_movk_i32 0xffe (= 4094) + scratch_load_short_d16_hi in the
; flat-scratch run.
555; GCN-LABEL: {{^}}load_private_hi_v2f16_reglo_vreg_nooff:
556; GCN: s_waitcnt
557; GFX900-MUBUF-NEXT:   buffer_load_short_d16_hi v1, off, s[0:3], 0 offset:4094{{$}}
558; GFX900-FLATSCR-NEXT: s_movk_i32 [[SOFF:[^,]+]], 0xffe
559; GFX900-FLATSCR-NEXT: scratch_load_short_d16_hi v1, off, [[SOFF]]{{$}}
560; GFX900-NEXT: s_waitcnt
561; GFX900-NEXT: global_store_dword v{{\[[0-9]+:[0-9]+\]}}, v1
562; GFX900-NEXT: s_waitcnt
563; GFX900-NEXT: s_setpc_b64
564
565; NO-D16-HI: buffer_load_ushort v{{[0-9]+}}, off, s[0:3], 0 offset:4094{{$}}
566define void @load_private_hi_v2f16_reglo_vreg_nooff(half addrspace(5)* %in, half %reg) #0 {
567entry:
  ; Absolute address, not derived from %in.
568  %load = load volatile half, half addrspace(5)* inttoptr (i32 4094 to half addrspace(5)*)
569  %build0 = insertelement <2 x half> undef, half %reg, i32 0
570  %build1 = insertelement <2 x half> %build0, half %load, i32 1
571  store <2 x half> %build1, <2 x half> addrspace(1)* undef
572  ret void
573}
574
; Private (addrspace(5)) i8 load 4095 bytes past the byval %in argument,
; zero-extended to i16 and used as the high element; the low element is
; %reg. The gfx900 checks expect buffer_load_ubyte_d16_hi (MUBUF run) or
; scratch_load_ubyte_d16_hi (flat-scratch run) off s32 with offset 4095.
; The NO-D16-HI runs expect a plain buffer_load_ubyte.
575; GCN-LABEL: {{^}}load_private_hi_v2i16_reglo_vreg_zexti8:
576; GCN: s_waitcnt
577; GFX900-MUBUF:   buffer_load_ubyte_d16_hi v0, off, s[0:3], s32 offset:4095{{$}}
578; GFX900-FLATSCR: scratch_load_ubyte_d16_hi v0, off, s32 offset:4095{{$}}
579; GFX900-NEXT: s_waitcnt
580; GFX900-NEXT: global_store_dword v{{\[[0-9]+:[0-9]+\]}}, v0
581; GFX900-NEXT: s_waitcnt
582; GFX900-NEXT: s_setpc_b64
583
584; NO-D16-HI: buffer_load_ubyte v{{[0-9]+}}, off, s[0:3], s32 offset:4095{{$}}
585define void @load_private_hi_v2i16_reglo_vreg_zexti8(i8 addrspace(5)* byval(i8) %in, i16 %reg) #0 {
586entry:
587  %gep = getelementptr inbounds i8, i8 addrspace(5)* %in, i64 4095
588  %load = load i8, i8 addrspace(5)* %gep
589  %ext = zext i8 %load to i16
590  %build0 = insertelement <2 x i16> undef, i16 %reg, i32 0
591  %build1 = insertelement <2 x i16> %build0, i16 %ext, i32 1
592  store <2 x i16> %build1, <2 x i16> addrspace(1)* undef
593  ret void
594}
595
; <2 x half> variant of the private zext-i8 case: the zero-extended i8 load
; (4095 bytes past the byval %in) is bitcast from i16 to half and inserted as
; the high element. The checks are the same as for the i16 version.
596; GCN-LABEL: {{^}}load_private_hi_v2f16_reglo_vreg_zexti8:
597; GCN: s_waitcnt
598; GFX900-MUBUF:   buffer_load_ubyte_d16_hi v0, off, s[0:3], s32 offset:4095{{$}}
599; GFX900-FLATSCR: scratch_load_ubyte_d16_hi v0, off, s32 offset:4095{{$}}
600; GFX900-NEXT: s_waitcnt
601; GFX900-NEXT: global_store_dword v{{\[[0-9]+:[0-9]+\]}}, v0
602; GFX900-NEXT: s_waitcnt
603; GFX900-NEXT: s_setpc_b64
604
605; NO-D16-HI: buffer_load_ubyte v{{[0-9]+}}, off, s[0:3], s32 offset:4095{{$}}
606define void @load_private_hi_v2f16_reglo_vreg_zexti8(i8 addrspace(5)* byval(i8) %in, half %reg) #0 {
607entry:
608  %gep = getelementptr inbounds i8, i8 addrspace(5)* %in, i64 4095
609  %load = load i8, i8 addrspace(5)* %gep
610  %ext = zext i8 %load to i16
  ; Reinterpret the 16 extended bits as half; no value conversion.
611  %bitcast = bitcast i16 %ext to half
612  %build0 = insertelement <2 x half> undef, half %reg, i32 0
613  %build1 = insertelement <2 x half> %build0, half %bitcast, i32 1
614  store <2 x half> %build1, <2 x half> addrspace(1)* undef
615  ret void
616}
617
; <2 x half> variant of the private sext-i8 case: the sign-extended i8 load
; (4095 bytes past the byval %in) is bitcast from i16 to half and inserted as
; the high element. The gfx900 checks expect buffer_load_sbyte_d16_hi (MUBUF
; run) or scratch_load_sbyte_d16_hi (flat-scratch run) off s32 with offset
; 4095. The NO-D16-HI runs expect a plain buffer_load_sbyte.
618; GCN-LABEL: {{^}}load_private_hi_v2f16_reglo_vreg_sexti8:
619; GCN: s_waitcnt
620; GFX900-MUBUF:   buffer_load_sbyte_d16_hi v0, off, s[0:3], s32 offset:4095{{$}}
621; GFX900-FLATSCR: scratch_load_sbyte_d16_hi v0, off, s32 offset:4095{{$}}
622; GFX900-NEXT: s_waitcnt
623; GFX900-NEXT: global_store_dword v{{\[[0-9]+:[0-9]+\]}}, v0
624; GFX900-NEXT: s_waitcnt
625; GFX900-NEXT: s_setpc_b64
626
627; NO-D16-HI: buffer_load_sbyte v{{[0-9]+}}, off, s[0:3], s32 offset:4095{{$}}
628define void @load_private_hi_v2f16_reglo_vreg_sexti8(i8 addrspace(5)* byval(i8) %in, half %reg) #0 {
629entry:
630  %gep = getelementptr inbounds i8, i8 addrspace(5)* %in, i64 4095
631  %load = load i8, i8 addrspace(5)* %gep
632  %ext = sext i8 %load to i16
  ; Reinterpret the 16 extended bits as half; no value conversion.
633  %bitcast = bitcast i16 %ext to half
634  %build0 = insertelement <2 x half> undef, half %reg, i32 0
635  %build1 = insertelement <2 x half> %build0, half %bitcast, i32 1
636  store <2 x half> %build1, <2 x half> addrspace(1)* undef
637  ret void
638}
639
; Private (addrspace(5)) i8 load 4095 bytes past the byval %in argument,
; sign-extended to i16 and used as the high element; the low element is
; %reg. The gfx900 checks expect buffer_load_sbyte_d16_hi (MUBUF run) or
; scratch_load_sbyte_d16_hi (flat-scratch run) off s32 with offset 4095.
; The NO-D16-HI runs expect a plain buffer_load_sbyte.
640; GCN-LABEL: {{^}}load_private_hi_v2i16_reglo_vreg_sexti8:
641; GCN: s_waitcnt
642; GFX900-MUBUF:   buffer_load_sbyte_d16_hi v0, off, s[0:3], s32 offset:4095{{$}}
643; GFX900-FLATSCR: scratch_load_sbyte_d16_hi v0, off, s32 offset:4095{{$}}
644; GFX900-NEXT: s_waitcnt
645; GFX900-NEXT: global_store_dword v{{\[[0-9]+:[0-9]+\]}}, v0
646; GFX900-NEXT: s_waitcnt
647; GFX900-NEXT: s_setpc_b64
648
649; NO-D16-HI: buffer_load_sbyte v{{[0-9]+}}, off, s[0:3], s32 offset:4095{{$}}
650define void @load_private_hi_v2i16_reglo_vreg_sexti8(i8 addrspace(5)* byval(i8) %in, i16 %reg) #0 {
651entry:
652  %gep = getelementptr inbounds i8, i8 addrspace(5)* %in, i64 4095
653  %load = load i8, i8 addrspace(5)* %gep
654  %ext = sext i8 %load to i16
655  %build0 = insertelement <2 x i16> undef, i16 %reg, i32 0
656  %build1 = insertelement <2 x i16> %build0, i16 %ext, i32 1
657  store <2 x i16> %build1, <2 x i16> addrspace(1)* undef
658  ret void
659}
660
; The high element is a volatile i8 load from the constant private
; (addrspace(5)) address 4094, zero-extended to i16; %in is not used by the
; load. The gfx900 checks expect buffer_load_ubyte_d16_hi with zero soffset
; and offset 4094 (MUBUF run), or s_movk_i32 0xffe (= 4094) +
; scratch_load_ubyte_d16_hi (flat-scratch run). The NO-D16-HI runs expect a
; plain buffer_load_ubyte into v0.
661; GCN-LABEL: {{^}}load_private_hi_v2i16_reglo_vreg_nooff_zexti8:
662; GCN: s_waitcnt
663; GFX900-MUBUF-NEXT:   buffer_load_ubyte_d16_hi v1, off, s[0:3], 0 offset:4094{{$}}
664; GFX900-FLATSCR-NEXT: s_movk_i32 [[SOFF:[^,]+]], 0xffe
665; GFX900-FLATSCR-NEXT: scratch_load_ubyte_d16_hi v1, off, [[SOFF]]{{$}}
666; GFX900-NEXT: s_waitcnt
667; GFX900-NEXT: global_store_dword v{{\[[0-9]+:[0-9]+\]}}, v1
668; GFX900-NEXT: s_waitcnt
669; GFX900-NEXT: s_setpc_b64
670
671; NO-D16-HI: buffer_load_ubyte v0, off, s[0:3], 0 offset:4094{{$}}
672define void @load_private_hi_v2i16_reglo_vreg_nooff_zexti8(i8 addrspace(5)* %in, i16 %reg) #0 {
673entry:
  ; Absolute address, not derived from %in.
674  %load = load volatile i8, i8 addrspace(5)* inttoptr (i32 4094 to i8 addrspace(5)*)
675  %ext = zext i8 %load to i16
676  %build0 = insertelement <2 x i16> undef, i16 %reg, i32 0
677  %build1 = insertelement <2 x i16> %build0, i16 %ext, i32 1
678  store <2 x i16> %build1, <2 x i16> addrspace(1)* undef
679  ret void
680}
681
; The high element is a volatile i8 load from the constant private
; (addrspace(5)) address 4094, sign-extended to i16; %in is not used by the
; load. The gfx900 checks expect buffer_load_sbyte_d16_hi with zero soffset
; and offset 4094 (MUBUF run), or s_movk_i32 0xffe (= 4094) +
; scratch_load_sbyte_d16_hi (flat-scratch run). The NO-D16-HI runs expect a
; plain buffer_load_sbyte into v0.
682; GCN-LABEL: {{^}}load_private_hi_v2i16_reglo_vreg_nooff_sexti8:
683; GCN: s_waitcnt
684; GFX900-MUBUF-NEXT:   buffer_load_sbyte_d16_hi v1, off, s[0:3], 0 offset:4094{{$}}
685; GFX900-FLATSCR-NEXT: s_movk_i32 [[SOFF:[^,]+]], 0xffe
686; GFX900-FLATSCR-NEXT: scratch_load_sbyte_d16_hi v1, off, [[SOFF]]{{$}}
687; GFX900-NEXT: s_waitcnt
688; GFX900-NEXT: global_store_dword v{{\[[0-9]+:[0-9]+\]}}, v1
689; GFX900-NEXT: s_waitcnt
690; GFX900-NEXT: s_setpc_b64
691
692; NO-D16-HI: buffer_load_sbyte v0, off, s[0:3], 0 offset:4094{{$}}
693define void @load_private_hi_v2i16_reglo_vreg_nooff_sexti8(i8 addrspace(5)* %in, i16 %reg) #0 {
694entry:
  ; Absolute address, not derived from %in.
695  %load = load volatile i8, i8 addrspace(5)* inttoptr (i32 4094 to i8 addrspace(5)*)
696  %ext = sext i8 %load to i16
697  %build0 = insertelement <2 x i16> undef, i16 %reg, i32 0
698  %build1 = insertelement <2 x i16> %build0, i16 %ext, i32 1
699  store <2 x i16> %build1, <2 x i16> addrspace(1)* undef
700  ret void
701}
702
; <2 x half> variant of the "nooff" zext-i8 case: a volatile i8 load from the
; constant private (addrspace(5)) address 4094 is zero-extended to i16,
; bitcast to half and inserted as the high element. The checks are the same
; as for the i16 version (ubyte d16_hi load into v1 on gfx900).
703; GCN-LABEL: {{^}}load_private_hi_v2f16_reglo_vreg_nooff_zexti8:
704; GCN: s_waitcnt
705; GFX900-MUBUF-NEXT:   buffer_load_ubyte_d16_hi v1, off, s[0:3], 0 offset:4094{{$}}
706; GFX900-FLATSCR-NEXT: s_movk_i32 [[SOFF:[^,]+]], 0xffe
707; GFX900-FLATSCR-NEXT: scratch_load_ubyte_d16_hi v1, off, [[SOFF]]{{$}}
708; GFX900-NEXT: s_waitcnt
709; GFX900-NEXT: global_store_dword v{{\[[0-9]+:[0-9]+\]}}, v1
710; GFX900-NEXT: s_waitcnt
711; GFX900-NEXT: s_setpc_b64
712
713; NO-D16-HI: buffer_load_ubyte v0, off, s[0:3], 0 offset:4094{{$}}
714define void @load_private_hi_v2f16_reglo_vreg_nooff_zexti8(i8 addrspace(5)* %in, half %reg) #0 {
715entry:
  ; Absolute address, not derived from %in.
716  %load = load volatile i8, i8 addrspace(5)* inttoptr (i32 4094 to i8 addrspace(5)*)
717  %ext = zext i8 %load to i16
  ; Reinterpret the 16 extended bits as half; no value conversion.
718  %bc.ext = bitcast i16 %ext to half
719  %build0 = insertelement <2 x half> undef, half %reg, i32 0
720  %build1 = insertelement <2 x half> %build0, half %bc.ext, i32 1
721  store <2 x half> %build1, <2 x half> addrspace(1)* undef
722  ret void
723}
724
; Constant address space (addrspace(4)) version: the high element is an i16
; load at a negative constant offset, the low element is %reg. The gfx900
; checks expect global_load_short_d16_hi with offset -4094. The other runs
; are only checked for the plain load: flat_load_ushort on fiji (GFX803
; prefix) and global_load_ushort on gfx906.
725; GCN-LABEL: {{^}}load_constant_hi_v2i16_reglo_vreg:
726; GCN: s_waitcnt
727; GFX900-NEXT: global_load_short_d16_hi v2, v[0:1], off offset:-4094
728; GFX900-NEXT: s_waitcnt
729; GFX900-NEXT: global_store_dword
730; GFX900-NEXT: s_waitcnt
731; GFX900-NEXT: s_setpc_b64
732
733; GFX803: flat_load_ushort
734; GFX906: global_load_ushort
735define void @load_constant_hi_v2i16_reglo_vreg(i16 addrspace(4)* %in, i16 %reg) #0 {
736entry:
  ; -2047 i16 elements = -4094 bytes, the offset expected in the checks.
737  %gep = getelementptr inbounds i16, i16 addrspace(4)* %in, i64 -2047
738  %load = load i16, i16 addrspace(4)* %gep
739  %build0 = insertelement <2 x i16> undef, i16 %reg, i32 0
740  %build1 = insertelement <2 x i16> %build0, i16 %load, i32 1
741  store <2 x i16> %build1, <2 x i16> addrspace(1)* undef
742  ret void
743}
744
; Half-precision variant of the constant address space (addrspace(4)) case:
; a half load at a negative constant offset becomes the high element of a
; <2 x half> whose low element is %reg. The gfx900 checks expect
; global_load_short_d16_hi with offset -4094; fiji (GFX803 prefix) is checked
; for flat_load_ushort and gfx906 for global_load_ushort.
;
; The label check is anchored with {{^}} and ends with ':' like the other
; label checks in this file; without them the pattern would also match the
; _sexti8 and _zexti8 function names that share this prefix.
745; GCN-LABEL: {{^}}load_constant_hi_v2f16_reglo_vreg:
746; GCN: s_waitcnt
747; GFX900-NEXT: global_load_short_d16_hi v2, v[0:1], off offset:-4094
748; GFX900-NEXT: s_waitcnt
749; GFX900-NEXT: global_store_dword
750; GFX900-NEXT: s_waitcnt
751; GFX900-NEXT: s_setpc_b64
752
753; GFX803: flat_load_ushort
754; GFX906: global_load_ushort
755define void @load_constant_hi_v2f16_reglo_vreg(half addrspace(4)* %in, half %reg) #0 {
756entry:
  ; -2047 half elements = -4094 bytes, the offset expected in the checks.
757  %gep = getelementptr inbounds half, half addrspace(4)* %in, i64 -2047
758  %load = load half, half addrspace(4)* %gep
759  %build0 = insertelement <2 x half> undef, half %reg, i32 0
760  %build1 = insertelement <2 x half> %build0, half %load, i32 1
761  store <2 x half> %build1, <2 x half> addrspace(1)* undef
762  ret void
763}
764
; Constant address space (addrspace(4)) i8 load at byte offset -4095,
; sign-extended to i16, bitcast to half and inserted as the high element of
; a <2 x half>; the low element is %reg. The gfx900 checks expect
; global_load_sbyte_d16_hi with the offset folded in.
765; GCN-LABEL: {{^}}load_constant_hi_v2f16_reglo_vreg_sexti8:
766; GCN: s_waitcnt
767; GFX900-NEXT: global_load_sbyte_d16_hi v2, v[0:1], off offset:-4095
768; GFX900-NEXT: s_waitcnt
769; GFX900-NEXT: global_store_dword
770; GFX900-NEXT: s_waitcnt
771; GFX900-NEXT: s_setpc_b64
772define void @load_constant_hi_v2f16_reglo_vreg_sexti8(i8 addrspace(4)* %in, half %reg) #0 {
773entry:
774  %gep = getelementptr inbounds i8, i8 addrspace(4)* %in, i64 -4095
775  %load = load i8, i8 addrspace(4)* %gep
776  %ext = sext i8 %load to i16
  ; Reinterpret the 16 extended bits as half; no value conversion.
777  %bitcast = bitcast i16 %ext to half
778  %build0 = insertelement <2 x half> undef, half %reg, i32 0
779  %build1 = insertelement <2 x half> %build0, half %bitcast, i32 1
780  store <2 x half> %build1, <2 x half> addrspace(1)* undef
781  ret void
782}
783
; Zero-extending byte variant: the i8 read 4095 bytes below %in is widened to
; i16 with zext, bitcast to half, and inserted as element 1 next to %reg.
784; GCN-LABEL: {{^}}load_constant_hi_v2f16_reglo_vreg_zexti8:
785; GCN: s_waitcnt
786; GFX900-NEXT: global_load_ubyte_d16_hi v2, v[0:1], off offset:-4095
787; GFX900-NEXT: s_waitcnt
788; GFX900-NEXT: global_store_dword
789; GFX900-NEXT: s_waitcnt
790; GFX900-NEXT: s_setpc_b64
791define void @load_constant_hi_v2f16_reglo_vreg_zexti8(i8 addrspace(4)* %in, half %reg) #0 {
792entry:
793  %byte.ptr = getelementptr inbounds i8, i8 addrspace(4)* %in, i64 -4095
794  %byte = load i8, i8 addrspace(4)* %byte.ptr
795  %wide = zext i8 %byte to i16
796  %hi = bitcast i16 %wide to half
797  %vec.lo = insertelement <2 x half> undef, half %reg, i32 0
798  %vec = insertelement <2 x half> %vec.lo, half %hi, i32 1
799  store <2 x half> %vec, <2 x half> addrspace(1)* undef
800  ret void
801}
802
803; Local object gives known offset, so requires converting from offen
804; to offset variant.
805
; Loads element 2027 of a private [4096 x i16] array into the high half of a
; <2 x i16> whose low half is %reg. A [10 x i32] alloca (40 bytes) receives a
; volatile store first; 40 + 2027 * 2 = 4094, which presumably accounts for the
; offset:4094 expected below (assumes the i32 array is placed first in the
; frame - confirm against the frame layout).
806; GCN-LABEL: {{^}}load_private_hi_v2i16_reglo_vreg_to_offset:
807; GFX900-MUBUF:        buffer_store_dword
808; GFX900-MUBUF-NEXT:   buffer_load_short_d16_hi v{{[0-9]+}}, off, s[0:3], s32 offset:4094
809; GFX900-FLATSCR:      scratch_store_dword
810; GFX900-FLATSCR-NEXT: scratch_load_short_d16_hi v{{[0-9]+}}, off, s32 offset:4094
811define void @load_private_hi_v2i16_reglo_vreg_to_offset(i16 %reg) #0 {
812entry:
813  %pad = alloca [10 x i32], align 4, addrspace(5)
814  %array = alloca [4096 x i16], align 2, addrspace(5)
815  %pad.i32 = bitcast [10 x i32] addrspace(5)* %pad to i32 addrspace(5)*
816  store volatile i32 123, i32 addrspace(5)* %pad.i32
817  %hi.ptr = getelementptr inbounds [4096 x i16], [4096 x i16] addrspace(5)* %array, i32 0, i32 2027
818  %hi = load i16, i16 addrspace(5)* %hi.ptr
819  %vec.lo = insertelement <2 x i16> undef, i16 %reg, i32 0
820  %vec = insertelement <2 x i16> %vec.lo, i16 %hi, i32 1
821  store <2 x i16> %vec, <2 x i16> addrspace(1)* undef
822  ret void
823}
824
; Sign-extending byte variant of the private known-offset test: element 4055
; of a private [4096 x i8] array is sign-extended to i16 and inserted above
; %reg. With the 40-byte [10 x i32] alloca, 40 + 4055 = 4095, which presumably
; accounts for the offset:4095 expected below (assumes the i32 array is placed
; first in the frame - confirm against the frame layout).
825; GCN-LABEL: {{^}}load_private_hi_v2i16_reglo_vreg_sexti8_to_offset:
826; GFX900-MUBUF:        buffer_store_dword
827; GFX900-MUBUF-NEXT:   buffer_load_sbyte_d16_hi v{{[0-9]+}}, off, s[0:3], s32 offset:4095
828; GFX900-FLATSCR:      scratch_store_dword
829; GFX900-FLATSCR-NEXT: scratch_load_sbyte_d16_hi v{{[0-9]+}}, off, s32 offset:4095
830define void @load_private_hi_v2i16_reglo_vreg_sexti8_to_offset(i16 %reg) #0 {
831entry:
832  %pad = alloca [10 x i32], align 4, addrspace(5)
833  %array = alloca [4096 x i8], align 2, addrspace(5)
834  %pad.i32 = bitcast [10 x i32] addrspace(5)* %pad to i32 addrspace(5)*
835  store volatile i32 123, i32 addrspace(5)* %pad.i32
836  %byte.ptr = getelementptr inbounds [4096 x i8], [4096 x i8] addrspace(5)* %array, i32 0, i32 4055
837  %byte = load i8, i8 addrspace(5)* %byte.ptr
838  %hi = sext i8 %byte to i16
839  %vec.lo = insertelement <2 x i16> undef, i16 %reg, i32 0
840  %vec = insertelement <2 x i16> %vec.lo, i16 %hi, i32 1
841  store <2 x i16> %vec, <2 x i16> addrspace(1)* undef
842  ret void
843}
844
; Zero-extending byte variant of the private known-offset test: element 4055
; of a private [4096 x i8] array is zero-extended to i16 and inserted above
; %reg. With the 40-byte [10 x i32] alloca, 40 + 4055 = 4095, which presumably
; accounts for the offset:4095 expected below (assumes the i32 array is placed
; first in the frame - confirm against the frame layout).
845; GCN-LABEL: {{^}}load_private_hi_v2i16_reglo_vreg_zexti8_to_offset:
846; GFX900-MUBUF:        buffer_store_dword
847; GFX900-MUBUF-NEXT:   buffer_load_ubyte_d16_hi v{{[0-9]+}}, off, s[0:3], s32 offset:4095
848; GFX900-FLATSCR:      scratch_store_dword
849; GFX900-FLATSCR-NEXT: scratch_load_ubyte_d16_hi v{{[0-9]+}}, off, s32 offset:4095
850define void @load_private_hi_v2i16_reglo_vreg_zexti8_to_offset(i16 %reg) #0 {
851entry:
852  %pad = alloca [10 x i32], align 4, addrspace(5)
853  %array = alloca [4096 x i8], align 2, addrspace(5)
854  %pad.i32 = bitcast [10 x i32] addrspace(5)* %pad to i32 addrspace(5)*
855  store volatile i32 123, i32 addrspace(5)* %pad.i32
856  %byte.ptr = getelementptr inbounds [4096 x i8], [4096 x i8] addrspace(5)* %array, i32 0, i32 4055
857  %byte = load i8, i8 addrspace(5)* %byte.ptr
858  %hi = zext i8 %byte to i16
859  %vec.lo = insertelement <2 x i16> undef, i16 %reg, i32 0
860  %vec = insertelement <2 x i16> %vec.lo, i16 %hi, i32 1
861  store <2 x i16> %vec, <2 x i16> addrspace(1)* undef
862  ret void
863}
864
865; FIXME: Remove m0 init and waitcnt between reads
866; FIXME: Is there a cost to using the extload over not?
; Two volatile i16 loads from adjacent local (addrspace 3) elements, %in and
; %in + 1, are packed into elements 0 and 1 of the returned <2 x i16>.
867; GCN-LABEL: {{^}}load_local_v2i16_split_multi_chain:
868; GCN: s_waitcnt
869; GFX900-NEXT: ds_read_u16 v1, v0
870; GFX900-NEXT: s_waitcnt
871; GFX900-NEXT: ds_read_u16_d16_hi v1, v0 offset:2
872; GFX900-NEXT: s_waitcnt
873; GFX900-NEXT: v_mov_b32_e32 v0, v1
874; GFX900-NEXT: s_setpc_b64
875define <2 x i16> @load_local_v2i16_split_multi_chain(i16 addrspace(3)* %in) #0 {
876entry:
877  %hi.ptr = getelementptr inbounds i16, i16 addrspace(3)* %in, i32 1
878  %lo = load volatile i16, i16 addrspace(3)* %in
879  %hi = load volatile i16, i16 addrspace(3)* %hi.ptr
880  %vec.lo = insertelement <2 x i16> undef, i16 %lo, i32 0
881  %vec = insertelement <2 x i16> %vec.lo, i16 %hi, i32 1
882  ret <2 x i16> %vec
883}
884
; Two non-volatile i16 loads from local memory, at %in and 8 elements past it,
; are packed into elements 0 and 1 of the returned <2 x i16>.
885; GCN-LABEL: {{^}}load_local_lo_hi_v2i16_samechain:
886; GFX900: ds_read_u16 v1, v0
887; GFX900-NEXT: s_waitcnt lgkmcnt(0)
888; GFX900-NEXT: ds_read_u16_d16_hi v1, v0 offset:16
889; GFX900-NEXT: s_waitcnt lgkmcnt(0)
890; GFX900-NEXT: v_mov_b32_e32 v0, v1
891; GFX900-NEXT: s_setpc_b64
892

893; NO-D16-HI: ds_read_u16
894; NO-D16-HI: ds_read_u16
895define <2 x i16> @load_local_lo_hi_v2i16_samechain(i16 addrspace(3)* %in) #0 {
896entry:
897  %hi.ptr = getelementptr inbounds i16, i16 addrspace(3)* %in, i32 8
898  %lo = load i16, i16 addrspace(3)* %in
899  %hi = load i16, i16 addrspace(3)* %hi.ptr
900  %vec.lo = insertelement <2 x i16> undef, i16 %lo, i32 0
901  %vec = insertelement <2 x i16> %vec.lo, i16 %hi, i32 1
902  ret <2 x i16> %vec
903}
904
905; FIXME: Remove and
; FIXME: No RUN line in this file enables a GFX9 check prefix, so the two GFX9
; checks below are never evaluated. Switch them to an enabled prefix once the
; expected output has been re-verified with llc.
;
; A single i16 loaded from local memory is inserted into both elements of the
; returned <2 x i16>; the checks require exactly one ds_read. The unused
; getelementptr that used to precede the load has been dropped.
906; GCN-LABEL: {{^}}load_local_v2i16_broadcast:
907; GCN: ds_read_u16 [[LOAD:v[0-9]+]]
908; GCN-NOT: ds_read
909; GFX9: v_and_b32_e32 [[AND:v[0-9]+]], 0xffff, [[LOAD]]
910; GFX9: v_lshl_or_b32 v0, [[LOAD]], 16, [[AND]]
911define <2 x i16> @load_local_v2i16_broadcast(i16 addrspace(3)* %in) #0 {
912entry:
914  %load0 = load i16, i16 addrspace(3)* %in
915  %build0 = insertelement <2 x i16> undef, i16 %load0, i32 0
916  %build1 = insertelement <2 x i16> %build0, i16 %load0, i32 1
917  ret <2 x i16> %build1
918}
919
; Like the samechain test, but an i16 store through %may.alias sits between the
; low and high loads. The checks expect the ds_write to stay between the two
; reads.
920; GCN-LABEL: {{^}}load_local_lo_hi_v2i16_side_effect:
921; GFX900: ds_read_u16 [[LOAD0:v[0-9]+]], v0
922; GFX900: ds_write_b16
923; GFX900: ds_read_u16_d16_hi [[LOAD0]], v0 offset:16
924

925; NO-D16-HI: ds_read_u16
926; NO-D16-HI: ds_write_b16
927; NO-D16-HI: ds_read_u16
928define <2 x i16> @load_local_lo_hi_v2i16_side_effect(i16 addrspace(3)* %in, i16 addrspace(3)* %may.alias) #0 {
929entry:
930  %hi.ptr = getelementptr inbounds i16, i16 addrspace(3)* %in, i32 8
931  %lo = load i16, i16 addrspace(3)* %in
932  store i16 123, i16 addrspace(3)* %may.alias
933  %hi = load i16, i16 addrspace(3)* %hi.ptr
934  %vec.lo = insertelement <2 x i16> undef, i16 %lo, i32 0
935  %vec = insertelement <2 x i16> %vec.lo, i16 %hi, i32 1
936  ret <2 x i16> %vec
937}
938
939; FIXME: Remove waitcnt between reads
; Two volatile i16 loads from adjacent global (addrspace 1) elements, %in and
; %in + 1, are packed into elements 0 and 1 of the returned <2 x i16>.
940; GCN-LABEL: {{^}}load_global_v2i16_split:
941; GCN: s_waitcnt
942; GFX900-NEXT: global_load_ushort v2
943; GFX900-NEXT: s_waitcnt
944; GFX900-NEXT: global_load_short_d16_hi v2
945; GFX900-NEXT: s_waitcnt
946; GFX900-NEXT: v_mov_b32_e32 v0, v2
947; GFX900-NEXT: s_setpc_b64
948define <2 x i16> @load_global_v2i16_split(i16 addrspace(1)* %in) #0 {
949entry:
950  %hi.ptr = getelementptr inbounds i16, i16 addrspace(1)* %in, i64 1
951  %lo = load volatile i16, i16 addrspace(1)* %in
952  %hi = load volatile i16, i16 addrspace(1)* %hi.ptr
953  %vec.lo = insertelement <2 x i16> undef, i16 %lo, i32 0
954  %vec = insertelement <2 x i16> %vec.lo, i16 %hi, i32 1
955  ret <2 x i16> %vec
956}
957
958; FIXME: Remove waitcnt between reads
; Two volatile i16 loads through a default-address-space pointer, at %in and
; %in + 1, are packed into elements 0 and 1 of the returned <2 x i16>.
959; GCN-LABEL: {{^}}load_flat_v2i16_split:
960; GCN: s_waitcnt
961; GFX900-NEXT: flat_load_ushort v2
962; GFX900-NEXT: s_waitcnt
963; GFX900-NEXT: flat_load_short_d16_hi v2
964; GFX900-NEXT: s_waitcnt
965; GFX900-NEXT: v_mov_b32_e32 v0, v2
966; GFX900-NEXT: s_setpc_b64
967define <2 x i16> @load_flat_v2i16_split(i16* %in) #0 {
968entry:
969  %hi.ptr = getelementptr inbounds i16, i16* %in, i64 1
970  %lo = load volatile i16, i16* %in
971  %hi = load volatile i16, i16* %hi.ptr
972  %vec.lo = insertelement <2 x i16> undef, i16 %lo, i32 0
973  %vec = insertelement <2 x i16> %vec.lo, i16 %hi, i32 1
974  ret <2 x i16> %vec
975}
976
977; FIXME: Remove waitcnt between reads
; Two volatile i16 loads from adjacent constant (addrspace 4) elements, %in
; and %in + 1, are packed into elements 0 and 1 of the returned <2 x i16>.
978; GCN-LABEL: {{^}}load_constant_v2i16_split:
979; GCN: s_waitcnt
980; GFX900-NEXT: global_load_ushort v2
981; GFX900-NEXT: s_waitcnt
982; GFX900-NEXT: global_load_short_d16_hi v2
983; GFX900-NEXT: s_waitcnt
984; GFX900-NEXT: v_mov_b32_e32 v0, v2
985; GFX900-NEXT: s_setpc_b64
986define <2 x i16> @load_constant_v2i16_split(i16 addrspace(4)* %in) #0 {
987entry:
988  %hi.ptr = getelementptr inbounds i16, i16 addrspace(4)* %in, i64 1
989  %lo = load volatile i16, i16 addrspace(4)* %in
990  %hi = load volatile i16, i16 addrspace(4)* %hi.ptr
991  %vec.lo = insertelement <2 x i16> undef, i16 %lo, i32 0
992  %vec = insertelement <2 x i16> %vec.lo, i16 %hi, i32 1
993  ret <2 x i16> %vec
994}
995
996; FIXME: Remove m0 init and waitcnt between reads
997; FIXME: Is there a cost to using the extload over not?
; Two volatile i16 loads from a byval private (addrspace 5) argument, at %in
; and %in + 1, are packed into elements 0 and 1 of the returned <2 x i16>.
; The MUBUF and flat-scratch runs each check their own load instruction form.
998; GCN-LABEL: {{^}}load_private_v2i16_split:
999; GCN: s_waitcnt
1000; GFX900-MUBUF:   buffer_load_ushort v0, off, s[0:3], s32{{$}}
1001; GFX900-FLATSCR: scratch_load_ushort v0, off, s32{{$}}
1002; GFX900-NEXT: s_waitcnt
1003; GFX900-MUBUF-NEXT:   buffer_load_short_d16_hi v0, off, s[0:3], s32 offset:2
1004; GFX900-FLATSCR-NEXT: scratch_load_short_d16_hi v0, off, s32 offset:2
1005; GFX900-NEXT: s_waitcnt
1006; GFX900-NEXT: s_setpc_b64
1007define <2 x i16> @load_private_v2i16_split(i16 addrspace(5)* byval(i16) %in) #0 {
1008entry:
1009  %hi.ptr = getelementptr inbounds i16, i16 addrspace(5)* %in, i32 1
1010  %lo = load volatile i16, i16 addrspace(5)* %in
1011  %hi = load volatile i16, i16 addrspace(5)* %hi.ptr
1012  %vec.lo = insertelement <2 x i16> undef, i16 %lo, i32 0
1013  %vec = insertelement <2 x i16> %vec.lo, i16 %hi, i32 1
1014  ret <2 x i16> %vec
1015}
1016
1017; FIXME: This test should work without copying of v0.
1018;        ds_read_u16_d16_hi preserves low 16 bits of the destination
1019;        and ds_write_b16 only reads low 16 bits.
;
; Loads an i16 from local memory into the high half of a <2 x i16> whose low
; half is %reg, then volatile-stores %reg back to the same local address and
; returns the vector.
;
; The label check below was missing. Without it the checks that follow were
; not tied to this function's output and belonged to the block opened by the
; previous function's label.
; GCN-LABEL: {{^}}load_local_hi_v2i16_store_local_lo:
1020; GCN: s_waitcnt
1021; GFX900:      v_mov_b32_e32 [[COPY:v[0-9]+]], v0
1022; GFX900-NEXT: ds_read_u16_d16_hi [[COPY]], v1
1023; GFX900-NEXT: ds_write_b16 v1, v0
1024; GFX900-NEXT: s_waitcnt
1025; GFX900-NEXT: v_mov_b32_e32 v0, [[COPY]]
1026; GFX900-NEXT: s_waitcnt
1027; GFX900-NEXT: s_setpc_b64
1028define <2 x i16> @load_local_hi_v2i16_store_local_lo(i16 %reg, i16 addrspace(3)* %in) #0 {
1029entry:
1030  %load = load i16, i16 addrspace(3)* %in
1031  %build0 = insertelement <2 x i16> undef, i16 %reg, i32 0
1032  %build1 = insertelement <2 x i16> %build0, i16 %load, i32 1
1033  store volatile i16 %reg, i16 addrspace(3)* %in
1034  ret <2 x i16> %build1
1035}
1036
1037attributes #0 = { nounwind }
1038