1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -verify-machineinstrs -mattr=-unaligned-access-mode < %s | FileCheck -check-prefixes=GCN,GFX900 %s
3; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -verify-machineinstrs -mattr=-unaligned-access-mode -amdgpu-enable-flat-scratch < %s | FileCheck -check-prefixes=GCN,FLATSCR %s
4
; Builds a <2 x half> from two private (addrspace(5)) loads at constant
; addresses: element 0 is read from null + 1 half (byte offset 2) and
; element 1 from null itself.
; The expected code is an unsigned 16-bit load for the low half followed by
; a d16_hi load that writes the high half of the same register (v0). With
; flat scratch enabled the addresses are materialized in s0 instead of being
; encoded as a buffer offset.
5define <2 x half> @chain_hi_to_lo_private() {
6; GFX900-LABEL: chain_hi_to_lo_private:
7; GFX900:       ; %bb.0: ; %bb
8; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
9; GFX900-NEXT:    buffer_load_ushort v0, off, s[0:3], 0 offset:2
10; GFX900-NEXT:    s_waitcnt vmcnt(0)
11; GFX900-NEXT:    buffer_load_short_d16_hi v0, off, s[0:3], 0
12; GFX900-NEXT:    s_waitcnt vmcnt(0)
13; GFX900-NEXT:    s_setpc_b64 s[30:31]
14;
15; FLATSCR-LABEL: chain_hi_to_lo_private:
16; FLATSCR:       ; %bb.0: ; %bb
17; FLATSCR-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
18; FLATSCR-NEXT:    s_mov_b32 s0, 2
19; FLATSCR-NEXT:    scratch_load_ushort v0, off, s0
20; FLATSCR-NEXT:    s_mov_b32 s0, 0
21; FLATSCR-NEXT:    s_waitcnt vmcnt(0)
22; FLATSCR-NEXT:    scratch_load_short_d16_hi v0, off, s0
23; FLATSCR-NEXT:    s_waitcnt vmcnt(0)
24; FLATSCR-NEXT:    s_setpc_b64 s[30:31]
25bb:
26  %gep_lo = getelementptr inbounds half, half addrspace(5)* null, i64 1
27  %load_lo = load half, half addrspace(5)* %gep_lo
28  %gep_hi = getelementptr inbounds half, half addrspace(5)* null, i64 0
29  %load_hi = load half, half addrspace(5)* %gep_hi
30
  ; The low-half load goes into lane 0, the high-half load into lane 1.
31  %temp = insertelement <2 x half> undef, half %load_lo, i32 0
32  %result = insertelement <2 x half> %temp, half %load_hi, i32 1
33
34  ret <2 x half> %result
35}
36
; Same shape as the constant-address private case, but the two halves are
; loaded through unrelated pointer arguments (%base_lo and %base_hi).
; The expected code still issues a 16-bit load for the low half and then a
; d16_hi load into the same result register, each addressed by its own VGPR
; (v0 and v1).
37define <2 x half> @chain_hi_to_lo_private_different_bases(half addrspace(5)* %base_lo, half addrspace(5)* %base_hi) {
38; GFX900-LABEL: chain_hi_to_lo_private_different_bases:
39; GFX900:       ; %bb.0: ; %bb
40; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
41; GFX900-NEXT:    buffer_load_ushort v0, v0, s[0:3], 0 offen
42; GFX900-NEXT:    s_waitcnt vmcnt(0)
43; GFX900-NEXT:    buffer_load_short_d16_hi v0, v1, s[0:3], 0 offen
44; GFX900-NEXT:    s_waitcnt vmcnt(0)
45; GFX900-NEXT:    s_setpc_b64 s[30:31]
46;
47; FLATSCR-LABEL: chain_hi_to_lo_private_different_bases:
48; FLATSCR:       ; %bb.0: ; %bb
49; FLATSCR-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
50; FLATSCR-NEXT:    scratch_load_ushort v0, v0, off
51; FLATSCR-NEXT:    s_waitcnt vmcnt(0)
52; FLATSCR-NEXT:    scratch_load_short_d16_hi v0, v1, off
53; FLATSCR-NEXT:    s_waitcnt vmcnt(0)
54; FLATSCR-NEXT:    s_setpc_b64 s[30:31]
55bb:
56  %load_lo = load half, half addrspace(5)* %base_lo
57  %load_hi = load half, half addrspace(5)* %base_hi
58
  ; Lane 0 comes from %base_lo, lane 1 from %base_hi.
59  %temp = insertelement <2 x half> undef, half %load_lo, i32 0
60  %result = insertelement <2 x half> %temp, half %load_hi, i32 1
61
62  ret <2 x half> %result
63}
64
; The low half is produced by arithmetic (%in + 1.0) instead of a load; only
; the high half is loaded from private memory through %base.
; The expected code computes the f16 add into v1, loads the high half of v1
; with a d16_hi load, and finally copies v1 into the return register v0.
; NOTE(review): "arithmatic" is misspelled, but the name is part of the
; symbol and of the LABEL checks, so it is left as is.
65define <2 x half> @chain_hi_to_lo_arithmatic(half addrspace(5)* %base, half %in) {
66; GFX900-LABEL: chain_hi_to_lo_arithmatic:
67; GFX900:       ; %bb.0: ; %bb
68; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
69; GFX900-NEXT:    v_add_f16_e32 v1, 1.0, v1
70; GFX900-NEXT:    buffer_load_short_d16_hi v1, v0, s[0:3], 0 offen
71; GFX900-NEXT:    s_waitcnt vmcnt(0)
72; GFX900-NEXT:    v_mov_b32_e32 v0, v1
73; GFX900-NEXT:    s_setpc_b64 s[30:31]
74;
75; FLATSCR-LABEL: chain_hi_to_lo_arithmatic:
76; FLATSCR:       ; %bb.0: ; %bb
77; FLATSCR-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
78; FLATSCR-NEXT:    v_add_f16_e32 v1, 1.0, v1
79; FLATSCR-NEXT:    scratch_load_short_d16_hi v1, v0, off
80; FLATSCR-NEXT:    s_waitcnt vmcnt(0)
81; FLATSCR-NEXT:    v_mov_b32_e32 v0, v1
82; FLATSCR-NEXT:    s_setpc_b64 s[30:31]
83bb:
84  %arith_lo = fadd half %in, 1.0
85  %load_hi = load half, half addrspace(5)* %base
86
  ; Lane 0 is the computed value, lane 1 is the loaded value.
87  %temp = insertelement <2 x half> undef, half %arith_lo, i32 0
88  %result = insertelement <2 x half> %temp, half %load_hi, i32 1
89
90  ret <2 x half> %result
91}
92
; Group (addrspace(3)) variant of the constant-address case: element 0 is
; read from null + 1 half (byte offset 2) and element 1 from null.
; Both run lines share the GCN checks, which expect a ds_read_u16 for the low
; half followed by a ds_read_u16_d16_hi into the same register, both using a
; zero base address held in v1.
93define <2 x half> @chain_hi_to_lo_group() {
94; GCN-LABEL: chain_hi_to_lo_group:
95; GCN:       ; %bb.0: ; %bb
96; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
97; GCN-NEXT:    v_mov_b32_e32 v1, 0
98; GCN-NEXT:    ds_read_u16 v0, v1 offset:2
99; GCN-NEXT:    s_waitcnt lgkmcnt(0)
100; GCN-NEXT:    ds_read_u16_d16_hi v0, v1
101; GCN-NEXT:    s_waitcnt lgkmcnt(0)
102; GCN-NEXT:    s_setpc_b64 s[30:31]
103bb:
104  %gep_lo = getelementptr inbounds half, half addrspace(3)* null, i64 1
105  %load_lo = load half, half addrspace(3)* %gep_lo
106  %gep_hi = getelementptr inbounds half, half addrspace(3)* null, i64 0
107  %load_hi = load half, half addrspace(3)* %gep_hi
108
  ; The low-half load goes into lane 0, the high-half load into lane 1.
109  %temp = insertelement <2 x half> undef, half %load_lo, i32 0
110  %result = insertelement <2 x half> %temp, half %load_hi, i32 1
111
112  ret <2 x half> %result
113}
114
; Group (addrspace(3)) variant where the two halves are loaded through
; unrelated pointer arguments.
; The expected code reads the low half with ds_read_u16 from v0 and then
; fills the high half of the same register with ds_read_u16_d16_hi from v1.
115define <2 x half> @chain_hi_to_lo_group_different_bases(half addrspace(3)* %base_lo, half addrspace(3)* %base_hi) {
116; GCN-LABEL: chain_hi_to_lo_group_different_bases:
117; GCN:       ; %bb.0: ; %bb
118; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
119; GCN-NEXT:    ds_read_u16 v0, v0
120; GCN-NEXT:    s_waitcnt lgkmcnt(0)
121; GCN-NEXT:    ds_read_u16_d16_hi v0, v1
122; GCN-NEXT:    s_waitcnt lgkmcnt(0)
123; GCN-NEXT:    s_setpc_b64 s[30:31]
124bb:
125  %load_lo = load half, half addrspace(3)* %base_lo
126  %load_hi = load half, half addrspace(3)* %base_hi
127
  ; Lane 0 comes from %base_lo, lane 1 from %base_hi.
128  %temp = insertelement <2 x half> undef, half %load_lo, i32 0
129  %result = insertelement <2 x half> %temp, half %load_hi, i32 1
130
131  ret <2 x half> %result
132}
133
; Global (addrspace(1)) variant of the constant-address case: element 0 is
; read from null + 1 half (byte offset 2) and element 1 from null.
; The expected code materializes each 64-bit address in a VGPR pair
; (v[0:1] = 2, then v[1:2] = 0), loads the low half with global_load_ushort
; and then the high half of the same register with global_load_short_d16_hi.
134define <2 x half> @chain_hi_to_lo_global() {
135; GCN-LABEL: chain_hi_to_lo_global:
136; GCN:       ; %bb.0: ; %bb
137; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
138; GCN-NEXT:    v_mov_b32_e32 v0, 2
139; GCN-NEXT:    v_mov_b32_e32 v1, 0
140; GCN-NEXT:    global_load_ushort v0, v[0:1], off
141; GCN-NEXT:    v_mov_b32_e32 v1, 0
142; GCN-NEXT:    v_mov_b32_e32 v2, 0
143; GCN-NEXT:    s_waitcnt vmcnt(0)
144; GCN-NEXT:    global_load_short_d16_hi v0, v[1:2], off
145; GCN-NEXT:    s_waitcnt vmcnt(0)
146; GCN-NEXT:    s_setpc_b64 s[30:31]
147bb:
148  %gep_lo = getelementptr inbounds half, half addrspace(1)* null, i64 1
149  %load_lo = load half, half addrspace(1)* %gep_lo
150  %gep_hi = getelementptr inbounds half, half addrspace(1)* null, i64 0
151  %load_hi = load half, half addrspace(1)* %gep_hi
152
  ; The low-half load goes into lane 0, the high-half load into lane 1.
153  %temp = insertelement <2 x half> undef, half %load_lo, i32 0
154  %result = insertelement <2 x half> %temp, half %load_hi, i32 1
155
156  ret <2 x half> %result
157}
158
; Global (addrspace(1)) variant where the two halves are loaded through
; unrelated pointer arguments.
; The expected code loads the low half from v[0:1] with global_load_ushort
; and then the high half of the same register from v[2:3] with
; global_load_short_d16_hi.
159define <2 x half> @chain_hi_to_lo_global_different_bases(half addrspace(1)* %base_lo, half addrspace(1)* %base_hi) {
160; GCN-LABEL: chain_hi_to_lo_global_different_bases:
161; GCN:       ; %bb.0: ; %bb
162; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
163; GCN-NEXT:    global_load_ushort v0, v[0:1], off
164; GCN-NEXT:    s_waitcnt vmcnt(0)
165; GCN-NEXT:    global_load_short_d16_hi v0, v[2:3], off
166; GCN-NEXT:    s_waitcnt vmcnt(0)
167; GCN-NEXT:    s_setpc_b64 s[30:31]
168bb:
169  %load_lo = load half, half addrspace(1)* %base_lo
170  %load_hi = load half, half addrspace(1)* %base_hi
171
  ; Lane 0 comes from %base_lo, lane 1 from %base_hi.
172  %temp = insertelement <2 x half> undef, half %load_lo, i32 0
173  %result = insertelement <2 x half> %temp, half %load_hi, i32 1
174
175  ret <2 x half> %result
176}
177
; Flat (default address space) variant of the constant-address case:
; element 0 is read from null + 1 half (byte offset 2) and element 1 from
; null.
; The expected code mirrors the global variant but uses flat_load_ushort and
; flat_load_short_d16_hi, and waits on both vmcnt and lgkmcnt after each
; flat load.
178define <2 x half> @chain_hi_to_lo_flat() {
179; GCN-LABEL: chain_hi_to_lo_flat:
180; GCN:       ; %bb.0: ; %bb
181; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
182; GCN-NEXT:    v_mov_b32_e32 v0, 2
183; GCN-NEXT:    v_mov_b32_e32 v1, 0
184; GCN-NEXT:    flat_load_ushort v0, v[0:1]
185; GCN-NEXT:    v_mov_b32_e32 v1, 0
186; GCN-NEXT:    v_mov_b32_e32 v2, 0
187; GCN-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
188; GCN-NEXT:    flat_load_short_d16_hi v0, v[1:2]
189; GCN-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
190; GCN-NEXT:    s_setpc_b64 s[30:31]
191bb:
192  %gep_lo = getelementptr inbounds half, half* null, i64 1
193  %load_lo = load half, half* %gep_lo
194  %gep_hi = getelementptr inbounds half, half* null, i64 0
195  %load_hi = load half, half* %gep_hi
196
  ; The low-half load goes into lane 0, the high-half load into lane 1.
197  %temp = insertelement <2 x half> undef, half %load_lo, i32 0
198  %result = insertelement <2 x half> %temp, half %load_hi, i32 1
199
200  ret <2 x half> %result
201}
202
; Flat (default address space) variant where the two halves are loaded
; through unrelated pointer arguments.
; The expected code loads the low half from v[0:1] with flat_load_ushort and
; then the high half of the same register from v[2:3] with
; flat_load_short_d16_hi.
203define <2 x half> @chain_hi_to_lo_flat_different_bases(half* %base_lo, half* %base_hi) {
204; GCN-LABEL: chain_hi_to_lo_flat_different_bases:
205; GCN:       ; %bb.0: ; %bb
206; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
207; GCN-NEXT:    flat_load_ushort v0, v[0:1]
208; GCN-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
209; GCN-NEXT:    flat_load_short_d16_hi v0, v[2:3]
210; GCN-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
211; GCN-NEXT:    s_setpc_b64 s[30:31]
212bb:
213  %load_lo = load half, half* %base_lo
214  %load_hi = load half, half* %base_hi
215
  ; Lane 0 comes from %base_lo, lane 1 from %base_hi.
216  %temp = insertelement <2 x half> undef, half %load_lo, i32 0
217  %result = insertelement <2 x half> %temp, half %load_hi, i32 1
218
219  ret <2 x half> %result
220}
221
222; Make sure we don't lose any of the private stores.
; The kernel copies three i16 values from %in into a [3 x i16] private
; alloca using volatile stores, then reads the alloca back as two overlapping
; <2 x i16> vectors (elements 0-1 and elements 1-2) and writes them to %out
; and %out + 1.
; The checks expect all three 16-bit scratch stores (offsets 4, 6 and 8) to
; be emitted before the reloads, and the last reload to be a d16_hi load.
; NOTE(review): attribute group #0 is referenced but not defined in the part
; of the file visible here - confirm whether that is intentional.
223define amdgpu_kernel void @vload2_private(i16 addrspace(1)* nocapture readonly %in, <2 x i16> addrspace(1)* nocapture %out) #0 {
224; GFX900-LABEL: vload2_private:
225; GFX900:       ; %bb.0: ; %entry
226; GFX900-NEXT:    s_add_u32 flat_scratch_lo, s6, s9
227; GFX900-NEXT:    s_addc_u32 flat_scratch_hi, s7, 0
228; GFX900-NEXT:    s_load_dwordx4 s[4:7], s[4:5], 0x0
229; GFX900-NEXT:    v_mov_b32_e32 v2, 0
230; GFX900-NEXT:    s_add_u32 s0, s0, s9
231; GFX900-NEXT:    s_addc_u32 s1, s1, 0
232; GFX900-NEXT:    s_waitcnt lgkmcnt(0)
233; GFX900-NEXT:    global_load_ushort v0, v2, s[4:5]
234; GFX900-NEXT:    s_waitcnt vmcnt(0)
235; GFX900-NEXT:    buffer_store_short v0, off, s[0:3], 0 offset:4
236; GFX900-NEXT:    global_load_ushort v0, v2, s[4:5] offset:2
237; GFX900-NEXT:    s_waitcnt vmcnt(0)
238; GFX900-NEXT:    buffer_store_short v0, off, s[0:3], 0 offset:6
239; GFX900-NEXT:    global_load_ushort v0, v2, s[4:5] offset:4
240; GFX900-NEXT:    s_waitcnt vmcnt(0)
241; GFX900-NEXT:    buffer_store_short v0, off, s[0:3], 0 offset:8
242; GFX900-NEXT:    buffer_load_ushort v0, off, s[0:3], 0 offset:4
243; GFX900-NEXT:    buffer_load_ushort v3, off, s[0:3], 0 offset:6
244; GFX900-NEXT:    s_waitcnt vmcnt(1)
245; GFX900-NEXT:    v_and_b32_e32 v0, 0xffff, v0
246; GFX900-NEXT:    s_waitcnt vmcnt(0)
247; GFX900-NEXT:    v_mov_b32_e32 v1, v3
248; GFX900-NEXT:    buffer_load_short_d16_hi v1, off, s[0:3], 0 offset:8
249; GFX900-NEXT:    v_lshl_or_b32 v0, v3, 16, v0
250; GFX900-NEXT:    s_waitcnt vmcnt(0)
251; GFX900-NEXT:    global_store_dwordx2 v2, v[0:1], s[6:7]
252; GFX900-NEXT:    s_endpgm
253;
254; FLATSCR-LABEL: vload2_private:
255; FLATSCR:       ; %bb.0: ; %entry
256; FLATSCR-NEXT:    s_add_u32 flat_scratch_lo, s2, s5
257; FLATSCR-NEXT:    s_addc_u32 flat_scratch_hi, s3, 0
258; FLATSCR-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
259; FLATSCR-NEXT:    v_mov_b32_e32 v2, 0
260; FLATSCR-NEXT:    s_mov_b32 vcc_hi, 0
261; FLATSCR-NEXT:    s_waitcnt lgkmcnt(0)
262; FLATSCR-NEXT:    global_load_ushort v0, v2, s[0:1]
263; FLATSCR-NEXT:    s_waitcnt vmcnt(0)
264; FLATSCR-NEXT:    scratch_store_short off, v0, vcc_hi offset:4
265; FLATSCR-NEXT:    global_load_ushort v0, v2, s[0:1] offset:2
266; FLATSCR-NEXT:    s_mov_b32 vcc_hi, 0
267; FLATSCR-NEXT:    s_waitcnt vmcnt(0)
268; FLATSCR-NEXT:    scratch_store_short off, v0, vcc_hi offset:6
269; FLATSCR-NEXT:    global_load_ushort v0, v2, s[0:1] offset:4
270; FLATSCR-NEXT:    s_mov_b32 vcc_hi, 0
271; FLATSCR-NEXT:    s_waitcnt vmcnt(0)
272; FLATSCR-NEXT:    scratch_store_short off, v0, vcc_hi offset:8
273; FLATSCR-NEXT:    s_mov_b32 vcc_hi, 0
274; FLATSCR-NEXT:    scratch_load_ushort v0, off, vcc_hi offset:4
275; FLATSCR-NEXT:    s_mov_b32 vcc_hi, 0
276; FLATSCR-NEXT:    scratch_load_ushort v3, off, vcc_hi offset:6
277; FLATSCR-NEXT:    s_mov_b32 vcc_hi, 0
278; FLATSCR-NEXT:    s_waitcnt vmcnt(1)
279; FLATSCR-NEXT:    v_and_b32_e32 v0, 0xffff, v0
280; FLATSCR-NEXT:    s_waitcnt vmcnt(0)
281; FLATSCR-NEXT:    v_mov_b32_e32 v1, v3
282; FLATSCR-NEXT:    scratch_load_short_d16_hi v1, off, vcc_hi offset:8
283; FLATSCR-NEXT:    v_lshl_or_b32 v0, v3, 16, v0
284; FLATSCR-NEXT:    s_waitcnt vmcnt(0)
285; FLATSCR-NEXT:    global_store_dwordx2 v2, v[0:1], s[2:3]
286; FLATSCR-NEXT:    s_endpgm
287entry:
288  %loc = alloca [3 x i16], align 2, addrspace(5)
  ; %loc.0.sroa_cast1 has no users in this function.
289  %loc.0.sroa_cast1 = bitcast [3 x i16] addrspace(5)* %loc to i8 addrspace(5)*
  ; Copy in[0], in[1] and in[2] into loc[0], loc[1] and loc[2]; each store is
  ; volatile.
290  %tmp = load i16, i16 addrspace(1)* %in, align 2
291  %loc.0.sroa_idx = getelementptr inbounds [3 x i16], [3 x i16] addrspace(5)* %loc, i32 0, i32 0
292  store volatile i16 %tmp, i16 addrspace(5)* %loc.0.sroa_idx
293  %arrayidx.1 = getelementptr inbounds i16, i16 addrspace(1)* %in, i64 1
294  %tmp1 = load i16, i16 addrspace(1)* %arrayidx.1, align 2
295  %loc.2.sroa_idx3 = getelementptr inbounds [3 x i16], [3 x i16] addrspace(5)* %loc, i32 0, i32 1
296  store volatile i16 %tmp1, i16 addrspace(5)* %loc.2.sroa_idx3
297  %arrayidx.2 = getelementptr inbounds i16, i16 addrspace(1)* %in, i64 2
298  %tmp2 = load i16, i16 addrspace(1)* %arrayidx.2, align 2
299  %loc.4.sroa_idx = getelementptr inbounds [3 x i16], [3 x i16] addrspace(5)* %loc, i32 0, i32 2
300  store volatile i16 %tmp2, i16 addrspace(5)* %loc.4.sroa_idx
  ; Read back loc[0..1] as one vector and store it to out[0].
301  %loc.0.sroa_cast = bitcast [3 x i16] addrspace(5)* %loc to <2 x i16> addrspace(5)*
302  %loc.0. = load <2 x i16>, <2 x i16> addrspace(5)* %loc.0.sroa_cast, align 2
303  store <2 x i16> %loc.0., <2 x i16> addrspace(1)* %out, align 4
  ; Read back loc[1..2] as a second, overlapping vector and store it to out[1].
304  %loc.2.sroa_idx = getelementptr inbounds [3 x i16], [3 x i16] addrspace(5)* %loc, i32 0, i32 1
305  %loc.2.sroa_cast = bitcast i16 addrspace(5)* %loc.2.sroa_idx to <2 x i16> addrspace(5)*
306  %loc.2. = load <2 x i16>, <2 x i16> addrspace(5)* %loc.2.sroa_cast, align 2
307  %arrayidx6 = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %out, i64 1
308  store <2 x i16> %loc.2., <2 x i16> addrspace(1)* %arrayidx6, align 4
  ; %loc.0.sroa_cast2 has no users either; presumably both unused casts are
  ; leftovers from removed lifetime markers - TODO confirm.
309  %loc.0.sroa_cast2 = bitcast [3 x i16] addrspace(5)* %loc to i8 addrspace(5)*
310  ret void
311}
312
313; There is another instruction between the misordered instruction and
314; the value dependent load, so a simple operand check is insufficient.
; Here the high lane is loaded first, a vector add of 12 is applied to the
; partially built vector, and only then is the low lane inserted.
; The expected code loads the high half with ds_read_u16_d16_hi, performs the
; add as v_pk_sub_u16 with -12, and afterwards loads the low half into the
; same register with ds_read_u16_d16 at byte offset 2.
315define <2 x i16> @chain_hi_to_lo_group_other_dep(i16 addrspace(3)* %ptr) {
316; GCN-LABEL: chain_hi_to_lo_group_other_dep:
317; GCN:       ; %bb.0: ; %bb
318; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
319; GCN-NEXT:    ds_read_u16_d16_hi v1, v0
320; GCN-NEXT:    s_waitcnt lgkmcnt(0)
321; GCN-NEXT:    v_pk_sub_u16 v1, v1, -12 op_sel_hi:[1,0]
322; GCN-NEXT:    ds_read_u16_d16 v1, v0 offset:2
323; GCN-NEXT:    s_waitcnt lgkmcnt(0)
324; GCN-NEXT:    v_mov_b32_e32 v0, v1
325; GCN-NEXT:    s_setpc_b64 s[30:31]
326bb:
327  %gep_lo = getelementptr inbounds i16, i16 addrspace(3)* %ptr, i64 1
328  %load_lo = load i16, i16 addrspace(3)* %gep_lo
329  %gep_hi = getelementptr inbounds i16, i16 addrspace(3)* %ptr, i64 0
330  %load_hi = load i16, i16 addrspace(3)* %gep_hi
  ; The add sits between the high-lane insert and the low-lane insert.
331  %to.hi = insertelement <2 x i16> undef, i16 %load_hi, i32 1
332  %op.hi = add <2 x i16> %to.hi, <i16 12, i16 12>
333  %result = insertelement <2 x i16> %op.hi, i16 %load_lo, i32 0
334  ret <2 x i16> %result
335}
336
337; The volatile operations aren't put on the same chain
; Same data flow as chain_hi_to_lo_group_other_dep, but both loads are
; volatile.
; The expected code issues the low-half ds_read_u16 first and the high-half
; ds_read_u16_d16_hi second, matching the IR order of the volatile loads,
; then performs the add and merges the low half in with v_bfi_b32 using a
; 0xffff mask instead of a d16 load into the low half.
338define <2 x i16> @chain_hi_to_lo_group_other_dep_multi_chain(i16 addrspace(3)* %ptr) {
339; GCN-LABEL: chain_hi_to_lo_group_other_dep_multi_chain:
340; GCN:       ; %bb.0: ; %bb
341; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
342; GCN-NEXT:    ds_read_u16 v1, v0 offset:2
343; GCN-NEXT:    ds_read_u16_d16_hi v0, v0
344; GCN-NEXT:    v_mov_b32_e32 v2, 0xffff
345; GCN-NEXT:    s_waitcnt lgkmcnt(0)
346; GCN-NEXT:    v_pk_sub_u16 v0, v0, -12 op_sel_hi:[1,0]
347; GCN-NEXT:    v_bfi_b32 v0, v2, v1, v0
348; GCN-NEXT:    s_setpc_b64 s[30:31]
349bb:
350  %gep_lo = getelementptr inbounds i16, i16 addrspace(3)* %ptr, i64 1
351  %load_lo = load volatile i16, i16 addrspace(3)* %gep_lo
352  %gep_hi = getelementptr inbounds i16, i16 addrspace(3)* %ptr, i64 0
353  %load_hi = load volatile i16, i16 addrspace(3)* %gep_hi
  ; The add sits between the high-lane insert and the low-lane insert.
354  %to.hi = insertelement <2 x i16> undef, i16 %load_hi, i32 1
355  %op.hi = add <2 x i16> %to.hi, <i16 12, i16 12>
356  %result = insertelement <2 x i16> %op.hi, i16 %load_lo, i32 0
357  ret <2 x i16> %result
358}
359
; Private (addrspace(5)) version of the "other dependency" pattern: the high
; lane is loaded and inserted, a vector add of 12 is applied, and then the
; low lane is inserted. Neither load is volatile.
; The expected code loads the high half with a d16_hi load, performs the add
; as v_pk_sub_u16 with -12, and afterwards loads the low half into the same
; register with a d16 load at byte offset 2.
360define <2 x i16> @chain_hi_to_lo_private_other_dep(i16 addrspace(5)* %ptr) {
361; GFX900-LABEL: chain_hi_to_lo_private_other_dep:
362; GFX900:       ; %bb.0: ; %bb
363; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
364; GFX900-NEXT:    buffer_load_short_d16_hi v1, v0, s[0:3], 0 offen
365; GFX900-NEXT:    s_waitcnt vmcnt(0)
366; GFX900-NEXT:    v_pk_sub_u16 v1, v1, -12 op_sel_hi:[1,0]
367; GFX900-NEXT:    buffer_load_short_d16 v1, v0, s[0:3], 0 offen offset:2
368; GFX900-NEXT:    s_waitcnt vmcnt(0)
369; GFX900-NEXT:    v_mov_b32_e32 v0, v1
370; GFX900-NEXT:    s_setpc_b64 s[30:31]
371;
372; FLATSCR-LABEL: chain_hi_to_lo_private_other_dep:
373; FLATSCR:       ; %bb.0: ; %bb
374; FLATSCR-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
375; FLATSCR-NEXT:    scratch_load_short_d16_hi v1, v0, off
376; FLATSCR-NEXT:    s_waitcnt vmcnt(0)
377; FLATSCR-NEXT:    v_pk_sub_u16 v1, v1, -12 op_sel_hi:[1,0]
378; FLATSCR-NEXT:    scratch_load_short_d16 v1, v0, off offset:2
379; FLATSCR-NEXT:    s_waitcnt vmcnt(0)
380; FLATSCR-NEXT:    v_mov_b32_e32 v0, v1
381; FLATSCR-NEXT:    s_setpc_b64 s[30:31]
382bb:
383  %gep_lo = getelementptr inbounds i16, i16 addrspace(5)* %ptr, i64 1
384  %load_lo = load i16, i16 addrspace(5)* %gep_lo
385  %gep_hi = getelementptr inbounds i16, i16 addrspace(5)* %ptr, i64 0
386  %load_hi = load i16, i16 addrspace(5)* %gep_hi
  ; The add sits between the high-lane insert and the low-lane insert.
387  %to.hi = insertelement <2 x i16> undef, i16 %load_hi, i32 1
388  %op.hi = add <2 x i16> %to.hi, <i16 12, i16 12>
389  %result = insertelement <2 x i16> %op.hi, i16 %load_lo, i32 0
390  ret <2 x i16> %result
391}
392
; Global (addrspace(1)) version of the "other dependency" pattern. Unlike
; the private version, both loads here are volatile.
; The expected code issues the low-half global_load_ushort first and the
; high-half global_load_short_d16_hi second, performs the add as
; v_pk_sub_u16 with -12, and merges the low half in with v_bfi_b32 using a
; 0xffff mask.
393define <2 x i16> @chain_hi_to_lo_global_other_dep(i16 addrspace(1)* %ptr) {
394; GCN-LABEL: chain_hi_to_lo_global_other_dep:
395; GCN:       ; %bb.0: ; %bb
396; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
397; GCN-NEXT:    global_load_ushort v2, v[0:1], off offset:2
398; GCN-NEXT:    global_load_short_d16_hi v0, v[0:1], off
399; GCN-NEXT:    v_mov_b32_e32 v1, 0xffff
400; GCN-NEXT:    s_waitcnt vmcnt(0)
401; GCN-NEXT:    v_pk_sub_u16 v0, v0, -12 op_sel_hi:[1,0]
402; GCN-NEXT:    v_bfi_b32 v0, v1, v2, v0
403; GCN-NEXT:    s_setpc_b64 s[30:31]
404bb:
405  %gep_lo = getelementptr inbounds i16, i16 addrspace(1)* %ptr, i64 1
406  %load_lo = load volatile i16, i16 addrspace(1)* %gep_lo
407  %gep_hi = getelementptr inbounds i16, i16 addrspace(1)* %ptr, i64 0
408  %load_hi = load volatile i16, i16 addrspace(1)* %gep_hi
  ; The add sits between the high-lane insert and the low-lane insert.
409  %to.hi = insertelement <2 x i16> undef, i16 %load_hi, i32 1
410  %op.hi = add <2 x i16> %to.hi, <i16 12, i16 12>
411  %result = insertelement <2 x i16> %op.hi, i16 %load_lo, i32 0
412  ret <2 x i16> %result
413}
414
; Flat (addrspace(0)) version of the "other dependency" pattern, with both
; loads volatile.
; The expected code issues the low-half flat_load_ushort first and the
; high-half flat_load_short_d16_hi second, performs the add as v_pk_sub_u16
; with -12, and merges the low half in with v_bfi_b32 using a 0xffff mask.
415define <2 x i16> @chain_hi_to_lo_flat_other_dep(i16 addrspace(0)* %ptr) {
416; GCN-LABEL: chain_hi_to_lo_flat_other_dep:
417; GCN:       ; %bb.0: ; %bb
418; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
419; GCN-NEXT:    flat_load_ushort v2, v[0:1] offset:2
420; GCN-NEXT:    flat_load_short_d16_hi v0, v[0:1]
421; GCN-NEXT:    v_mov_b32_e32 v1, 0xffff
422; GCN-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
423; GCN-NEXT:    v_pk_sub_u16 v0, v0, -12 op_sel_hi:[1,0]
424; GCN-NEXT:    v_bfi_b32 v0, v1, v2, v0
425; GCN-NEXT:    s_setpc_b64 s[30:31]
426bb:
427  %gep_lo = getelementptr inbounds i16, i16 addrspace(0)* %ptr, i64 1
428  %load_lo = load volatile i16, i16 addrspace(0)* %gep_lo
429  %gep_hi = getelementptr inbounds i16, i16 addrspace(0)* %ptr, i64 0
430  %load_hi = load volatile i16, i16 addrspace(0)* %gep_hi
  ; The add sits between the high-lane insert and the low-lane insert.
431  %to.hi = insertelement <2 x i16> undef, i16 %load_hi, i32 1
432  %op.hi = add <2 x i16> %to.hi, <i16 12, i16 12>
433  %result = insertelement <2 x i16> %op.hi, i16 %load_lo, i32 0
434  ret <2 x i16> %result
435}
436
; A store of 123 (0x7b) through %may.alias sits between the high-half load
; and the low-half load, and nothing in the IR rules out %may.alias pointing
; at either loaded location.
; The expected code keeps the IR order (read high, write, read low) using
; plain ds_read_u16 loads and packs the two halves manually with
; v_and_b32 / v_lshl_or_b32; no d16_hi load is expected here.
437define <2 x i16> @chain_hi_to_lo_group_may_alias_store(i16 addrspace(3)* %ptr, i16 addrspace(3)* %may.alias) {
438; GCN-LABEL: chain_hi_to_lo_group_may_alias_store:
439; GCN:       ; %bb.0: ; %bb
440; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
441; GCN-NEXT:    v_mov_b32_e32 v3, 0x7b
442; GCN-NEXT:    ds_read_u16 v2, v0
443; GCN-NEXT:    ds_write_b16 v1, v3
444; GCN-NEXT:    ds_read_u16 v0, v0 offset:2
445; GCN-NEXT:    s_waitcnt lgkmcnt(0)
446; GCN-NEXT:    v_and_b32_e32 v0, 0xffff, v0
447; GCN-NEXT:    v_lshl_or_b32 v0, v2, 16, v0
448; GCN-NEXT:    s_setpc_b64 s[30:31]
449bb:
450  %gep_lo = getelementptr inbounds i16, i16 addrspace(3)* %ptr, i64 1
451  %gep_hi = getelementptr inbounds i16, i16 addrspace(3)* %ptr, i64 0
  ; Order matters: high-half load, then the possibly aliasing store, then the
  ; low-half load.
452  %load_hi = load i16, i16 addrspace(3)* %gep_hi
453  store i16 123, i16 addrspace(3)* %may.alias
454  %load_lo = load i16, i16 addrspace(3)* %gep_lo
455
456  %to.hi = insertelement <2 x i16> undef, i16 %load_hi, i32 1
457  %result = insertelement <2 x i16> %to.hi, i16 %load_lo, i32 0
458  ret <2 x i16> %result
459}
460