; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -march=amdgcn -mcpu=gfx900 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck %s -enable-var-scope -check-prefixes=GCN,GFX89,GFX9
; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck %s -enable-var-scope -check-prefixes=GCN,GFX89,VI

; FIXME: Need to handle non-uniform case for function below (load without gep).
; Per-thread <2 x i16> subtraction of two loaded vectors.
; GFX9 emits one packed v_pk_sub_i16; VI splits into a low-half v_sub_u16
; plus a high-half v_sub_u16_sdwa and ORs the two halves back together.
define amdgpu_kernel void @v_test_sub_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in0, <2 x i16> addrspace(1)* %in1) #1 {
; GFX9-LABEL: v_test_sub_v2i16:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT:    s_load_dwordx2 s[8:9], s[0:1], 0x34
; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT:    s_mov_b32 s3, 0xf000
; GFX9-NEXT:    s_mov_b32 s2, -1
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    s_mov_b32 s0, s4
; GFX9-NEXT:    global_load_dword v1, v0, s[6:7]
; GFX9-NEXT:    global_load_dword v0, v0, s[8:9]
; GFX9-NEXT:    s_mov_b32 s1, s5
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    v_pk_sub_i16 v0, v1, v0
; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], 0
; GFX9-NEXT:    s_endpgm
;
; VI-LABEL: v_test_sub_v2i16:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
; VI-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    v_mov_b32_e32 v1, s7
; VI-NEXT:    v_add_u32_e32 v0, vcc, s6, v2
; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT:    v_mov_b32_e32 v3, s1
; VI-NEXT:    v_add_u32_e32 v2, vcc, s0, v2
; VI-NEXT:    v_addc_u32_e32 v3, vcc, 0, v3, vcc
; VI-NEXT:    flat_load_dword v0, v[0:1]
; VI-NEXT:    flat_load_dword v1, v[2:3]
; VI-NEXT:    s_mov_b32 s7, 0xf000
; VI-NEXT:    s_mov_b32 s6, -1
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    v_sub_u16_e32 v2, v0, v1
; VI-NEXT:    v_sub_u16_sdwa v0, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
; VI-NEXT:    v_or_b32_e32 v0, v2, v0
; VI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
; VI-NEXT:    s_endpgm
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %gep.out = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %out, i32 %tid
  %gep.in0 = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in0, i32 %tid
  %gep.in1 = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in1, i32 %tid
  %a = load volatile <2 x i16>, <2 x i16> addrspace(1)* %gep.in0
  %b = load volatile <2 x i16>, <2 x i16> addrspace(1)* %gep.in1
  %add = sub <2 x i16> %a, %b
  ; NOTE(review): %gep.out is computed but unused -- the store targets %out
  ; directly. Fixing it would change codegen and require regenerating the
  ; autogenerated CHECK lines, so it is deliberately left as-is.
  store <2 x i16> %add, <2 x i16> addrspace(1)* %out
  ret void
}
56
; Uniform <2 x i16> subtraction: both operands come from constant address
; space (4), so GFX9 uses v_pk_sub_i16 with an SGPR source, while VI
; scalarizes into s_lshr/s_sub/s_and/s_lshl/s_or on the two halves.
define amdgpu_kernel void @s_test_sub_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(4)* %in0, <2 x i16> addrspace(4)* %in1) #1 {
; GFX9-LABEL: s_test_sub_v2i16:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT:    s_load_dwordx2 s[8:9], s[0:1], 0x34
; GFX9-NEXT:    s_mov_b32 s3, 0xf000
; GFX9-NEXT:    s_mov_b32 s2, -1
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    s_mov_b32 s0, s4
; GFX9-NEXT:    s_mov_b32 s1, s5
; GFX9-NEXT:    s_load_dword s4, s[6:7], 0x0
; GFX9-NEXT:    s_load_dword s5, s[8:9], 0x0
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    v_mov_b32_e32 v0, s5
; GFX9-NEXT:    v_pk_sub_i16 v0, s4, v0
; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], 0
; GFX9-NEXT:    s_endpgm
;
; VI-LABEL: s_test_sub_v2i16:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-NEXT:    s_load_dwordx2 s[8:9], s[0:1], 0x34
; VI-NEXT:    s_mov_b32 s3, 0xf000
; VI-NEXT:    s_mov_b32 s2, -1
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    s_mov_b32 s0, s4
; VI-NEXT:    s_load_dword s4, s[6:7], 0x0
; VI-NEXT:    s_load_dword s6, s[8:9], 0x0
; VI-NEXT:    s_mov_b32 s1, s5
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    s_lshr_b32 s5, s4, 16
; VI-NEXT:    s_lshr_b32 s7, s6, 16
; VI-NEXT:    s_sub_i32 s4, s4, s6
; VI-NEXT:    s_sub_i32 s5, s5, s7
; VI-NEXT:    s_and_b32 s4, s4, 0xffff
; VI-NEXT:    s_lshl_b32 s5, s5, 16
; VI-NEXT:    s_or_b32 s4, s4, s5
; VI-NEXT:    v_mov_b32_e32 v0, s4
; VI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
; VI-NEXT:    s_endpgm
  %a = load <2 x i16>, <2 x i16> addrspace(4)* %in0
  %b = load <2 x i16>, <2 x i16> addrspace(4)* %in1
  %add = sub <2 x i16> %a, %b
  store <2 x i16> %add, <2 x i16> addrspace(1)* %out
  ret void
}
103
; x - x folds to the zero vector at compile time: both targets (shared GCN
; checks) just store v0 = 0 without performing any subtraction.
define amdgpu_kernel void @s_test_sub_self_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(4)* %in0) #1 {
; GCN-LABEL: s_test_sub_self_v2i16:
; GCN:       ; %bb.0:
; GCN-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
; GCN-NEXT:    s_mov_b32 s3, 0xf000
; GCN-NEXT:    s_mov_b32 s2, -1
; GCN-NEXT:    v_mov_b32_e32 v0, 0
; GCN-NEXT:    s_waitcnt lgkmcnt(0)
; GCN-NEXT:    buffer_store_dword v0, off, s[0:3], 0
; GCN-NEXT:    s_endpgm
  %a = load <2 x i16>, <2 x i16> addrspace(4)* %in0
  %add = sub <2 x i16> %a, %a
  store <2 x i16> %add, <2 x i16> addrspace(1)* %out
  ret void
}
119
; FIXME: VI should not scalarize arg access.
; <2 x i16> subtraction with both operands passed directly as kernel
; arguments (loaded with s_load_dword), rather than through pointers.
define amdgpu_kernel void @s_test_sub_v2i16_kernarg(<2 x i16> addrspace(1)* %out, <2 x i16> %a, <2 x i16> %b) #1 {
; GFX9-LABEL: s_test_sub_v2i16_kernarg:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x24
; GFX9-NEXT:    s_load_dword s2, s[0:1], 0x2c
; GFX9-NEXT:    s_load_dword s0, s[0:1], 0x30
; GFX9-NEXT:    s_mov_b32 s7, 0xf000
; GFX9-NEXT:    s_mov_b32 s6, -1
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    v_mov_b32_e32 v0, s0
; GFX9-NEXT:    v_pk_sub_i16 v0, s2, v0
; GFX9-NEXT:    buffer_store_dword v0, off, s[4:7], 0
; GFX9-NEXT:    s_endpgm
;
; VI-LABEL: s_test_sub_v2i16_kernarg:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x24
; VI-NEXT:    s_load_dword s2, s[0:1], 0x2c
; VI-NEXT:    s_load_dword s0, s[0:1], 0x30
; VI-NEXT:    s_mov_b32 s7, 0xf000
; VI-NEXT:    s_mov_b32 s6, -1
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    s_lshr_b32 s1, s2, 16
; VI-NEXT:    s_lshr_b32 s3, s0, 16
; VI-NEXT:    s_sub_i32 s1, s1, s3
; VI-NEXT:    s_sub_i32 s0, s2, s0
; VI-NEXT:    s_lshl_b32 s1, s1, 16
; VI-NEXT:    s_and_b32 s0, s0, 0xffff
; VI-NEXT:    s_or_b32 s0, s0, s1
; VI-NEXT:    v_mov_b32_e32 v0, s0
; VI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
; VI-NEXT:    s_endpgm
  %add = sub <2 x i16> %a, %b
  store <2 x i16> %add, <2 x i16> addrspace(1)* %out
  ret void
}
157
; Subtraction of the splat-less constant <123, 456>. GFX9 packs it as
; 0x1c8007b (456 << 16 | 123); VI folds the sub into adds of the negated
; halves (0xff85 = -123 as i16, 0xfe38 = -456 as i16).
define amdgpu_kernel void @v_test_sub_v2i16_constant(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in0) #1 {
; GFX9-LABEL: v_test_sub_v2i16_constant:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT:    s_mov_b32 s3, 0xf000
; GFX9-NEXT:    s_mov_b32 s2, -1
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    global_load_dword v0, v0, s[6:7]
; GFX9-NEXT:    s_mov_b32 s0, s4
; GFX9-NEXT:    s_mov_b32 s4, 0x1c8007b
; GFX9-NEXT:    s_mov_b32 s1, s5
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    v_pk_sub_i16 v0, v0, s4
; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], 0
; GFX9-NEXT:    s_endpgm
;
; VI-LABEL: v_test_sub_v2i16_constant:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
; VI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    v_mov_b32_e32 v1, s3
; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v0
; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT:    flat_load_dword v0, v[0:1]
; VI-NEXT:    v_mov_b32_e32 v1, 0xfffffe38
; VI-NEXT:    s_mov_b32 s3, 0xf000
; VI-NEXT:    s_mov_b32 s2, -1
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    v_add_u16_e32 v2, 0xff85, v0
; VI-NEXT:    v_add_u16_sdwa v0, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; VI-NEXT:    v_or_b32_e32 v0, v2, v0
; VI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
; VI-NEXT:    s_endpgm
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %gep.out = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %out, i32 %tid
  %gep.in0 = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in0, i32 %tid
  %a = load volatile <2 x i16>, <2 x i16> addrspace(1)* %gep.in0
  %add = sub <2 x i16> %a, <i16 123, i16 456>
  ; NOTE(review): %gep.out is unused; the store targets %out directly.
  ; Kept as-is so the autogenerated CHECK lines remain valid.
  store <2 x i16> %add, <2 x i16> addrspace(1)* %out
  ret void
}
201
; FIXME: Need to handle non-uniform case for function below (load without gep).
; Subtraction of negative constants <-845, -991>. GFX9 packs them as
; 0xfc21fcb3; VI turns the sub into adds of 0x34d (845) and 0x3df (991).
define amdgpu_kernel void @v_test_sub_v2i16_neg_constant(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in0) #1 {
; GFX9-LABEL: v_test_sub_v2i16_neg_constant:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT:    s_mov_b32 s3, 0xf000
; GFX9-NEXT:    s_mov_b32 s2, -1
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    global_load_dword v0, v0, s[6:7]
; GFX9-NEXT:    s_mov_b32 s0, s4
; GFX9-NEXT:    s_mov_b32 s4, 0xfc21fcb3
; GFX9-NEXT:    s_mov_b32 s1, s5
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    v_pk_sub_i16 v0, v0, s4
; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], 0
; GFX9-NEXT:    s_endpgm
;
; VI-LABEL: v_test_sub_v2i16_neg_constant:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
; VI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    v_mov_b32_e32 v1, s3
; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v0
; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT:    flat_load_dword v0, v[0:1]
; VI-NEXT:    v_mov_b32_e32 v1, 0x3df
; VI-NEXT:    s_mov_b32 s3, 0xf000
; VI-NEXT:    s_mov_b32 s2, -1
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    v_add_u16_e32 v2, 0x34d, v0
; VI-NEXT:    v_add_u16_sdwa v0, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; VI-NEXT:    v_or_b32_e32 v0, v2, v0
; VI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
; VI-NEXT:    s_endpgm
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %gep.out = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %out, i32 %tid
  %gep.in0 = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in0, i32 %tid
  %a = load volatile <2 x i16>, <2 x i16> addrspace(1)* %gep.in0
  %add = sub <2 x i16> %a, <i16 -845, i16 -991>
  ; NOTE(review): %gep.out is unused; the store targets %out directly.
  ; Kept as-is so the autogenerated CHECK lines remain valid.
  store <2 x i16> %add, <2 x i16> addrspace(1)* %out
  ret void
}
246
; Subtraction of the inline-constant splat <-1, -1>. GFX9 uses the inline
; operand (-1 with op_sel_hi broadcasting); VI rewrites x - (-1) as x + 1
; on each half.
define amdgpu_kernel void @v_test_sub_v2i16_inline_neg1(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in0) #1 {
; GFX9-LABEL: v_test_sub_v2i16_inline_neg1:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT:    s_mov_b32 s3, 0xf000
; GFX9-NEXT:    s_mov_b32 s2, -1
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    global_load_dword v0, v0, s[6:7]
; GFX9-NEXT:    s_mov_b32 s0, s4
; GFX9-NEXT:    s_mov_b32 s1, s5
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    v_pk_sub_i16 v0, v0, -1 op_sel_hi:[1,0]
; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], 0
; GFX9-NEXT:    s_endpgm
;
; VI-LABEL: v_test_sub_v2i16_inline_neg1:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
; VI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    v_mov_b32_e32 v1, s3
; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v0
; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT:    flat_load_dword v0, v[0:1]
; VI-NEXT:    v_mov_b32_e32 v1, 1
; VI-NEXT:    s_mov_b32 s3, 0xf000
; VI-NEXT:    s_mov_b32 s2, -1
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    v_add_u16_e32 v2, 1, v0
; VI-NEXT:    v_add_u16_sdwa v0, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; VI-NEXT:    v_or_b32_e32 v0, v2, v0
; VI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
; VI-NEXT:    s_endpgm
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %gep.out = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %out, i32 %tid
  %gep.in0 = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in0, i32 %tid
  %a = load volatile <2 x i16>, <2 x i16> addrspace(1)* %gep.in0
  %add = sub <2 x i16> %a, <i16 -1, i16 -1>
  ; NOTE(review): %gep.out is unused; the store targets %out directly.
  ; Kept as-is so the autogenerated CHECK lines remain valid.
  store <2 x i16> %add, <2 x i16> addrspace(1)* %out
  ret void
}
289
; Subtraction of <32, 0>: only the low half changes. GFX9 still emits one
; packed op with inline constant 32; VI subtracts 32 from the low half and
; re-merges the untouched high half via AND 0xffff0000 + OR.
define amdgpu_kernel void @v_test_sub_v2i16_inline_lo_zero_hi(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in0) #1 {
; GFX9-LABEL: v_test_sub_v2i16_inline_lo_zero_hi:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT:    s_mov_b32 s3, 0xf000
; GFX9-NEXT:    s_mov_b32 s2, -1
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    global_load_dword v0, v0, s[6:7]
; GFX9-NEXT:    s_mov_b32 s0, s4
; GFX9-NEXT:    s_mov_b32 s1, s5
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    v_pk_sub_i16 v0, v0, 32
; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], 0
; GFX9-NEXT:    s_endpgm
;
; VI-LABEL: v_test_sub_v2i16_inline_lo_zero_hi:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
; VI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    v_mov_b32_e32 v1, s3
; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v0
; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT:    flat_load_dword v0, v[0:1]
; VI-NEXT:    s_mov_b32 s3, 0xf000
; VI-NEXT:    s_mov_b32 s2, -1
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    v_and_b32_e32 v1, 0xffff0000, v0
; VI-NEXT:    v_subrev_u16_e32 v0, 32, v0
; VI-NEXT:    v_or_b32_e32 v0, v0, v1
; VI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
; VI-NEXT:    s_endpgm
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %gep.out = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %out, i32 %tid
  %gep.in0 = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in0, i32 %tid
  %a = load volatile <2 x i16>, <2 x i16> addrspace(1)* %gep.in0
  %add = sub <2 x i16> %a, <i16 32, i16 0>
  ; NOTE(review): %gep.out is unused; the store targets %out directly.
  ; Kept as-is so the autogenerated CHECK lines remain valid.
  store <2 x i16> %add, <2 x i16> addrspace(1)* %out
  ret void
}
331
; The high element gives fp
; Subtraction of <0, 16256>: 16256 = 0x3f80, so the packed constant
; 0x3f800000 is materialized as the inline float 1.0 on GFX9. VI adds
; 0xc080 (= -16256 as i16) to the high half only.
define amdgpu_kernel void @v_test_sub_v2i16_inline_fp_split(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in0) #1 {
; GFX9-LABEL: v_test_sub_v2i16_inline_fp_split:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT:    s_mov_b32 s3, 0xf000
; GFX9-NEXT:    s_mov_b32 s2, -1
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    global_load_dword v0, v0, s[6:7]
; GFX9-NEXT:    s_mov_b32 s0, s4
; GFX9-NEXT:    s_mov_b32 s4, 1.0
; GFX9-NEXT:    s_mov_b32 s1, s5
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    v_pk_sub_i16 v0, v0, s4
; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], 0
; GFX9-NEXT:    s_endpgm
;
; VI-LABEL: v_test_sub_v2i16_inline_fp_split:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; VI-NEXT:    s_mov_b32 s3, 0xf000
; VI-NEXT:    s_mov_b32 s2, -1
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    v_mov_b32_e32 v1, s7
; VI-NEXT:    v_add_u32_e32 v0, vcc, s6, v0
; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT:    flat_load_dword v0, v[0:1]
; VI-NEXT:    v_mov_b32_e32 v1, 0xffffc080
; VI-NEXT:    s_mov_b32 s0, s4
; VI-NEXT:    s_mov_b32 s1, s5
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    v_add_u16_sdwa v1, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; VI-NEXT:    v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
; VI-NEXT:    s_endpgm
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %gep.out = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %out, i32 %tid
  %gep.in0 = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in0, i32 %tid
  %a = load volatile <2 x i16>, <2 x i16> addrspace(1)* %gep.in0
  %add = sub <2 x i16> %a, <i16 0, i16 16256>
  ; NOTE(review): %gep.out is unused; the store targets %out directly.
  ; Kept as-is so the autogenerated CHECK lines remain valid.
  store <2 x i16> %add, <2 x i16> addrspace(1)* %out
  ret void
}
377
; FIXME: Need to handle non-uniform case for function below (load without gep).
; Packed sub followed by zext to <2 x i32>: GFX9 unpacks the result with
; lshr/and; VI produces the two extended halves directly (sdwa dst DWORD).
define amdgpu_kernel void @v_test_sub_v2i16_zext_to_v2i32(<2 x i32> addrspace(1)* %out, <2 x i16> addrspace(1)* %in0, <2 x i16> addrspace(1)* %in1) #1 {
; GFX9-LABEL: v_test_sub_v2i16_zext_to_v2i32:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    global_load_dword v1, v0, s[6:7]
; GFX9-NEXT:    global_load_dword v0, v0, s[0:1]
; GFX9-NEXT:    s_mov_b32 s7, 0xf000
; GFX9-NEXT:    s_mov_b32 s6, -1
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    v_pk_sub_i16 v0, v1, v0
; GFX9-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
; GFX9-NEXT:    v_and_b32_e32 v0, 0xffff, v0
; GFX9-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; GFX9-NEXT:    s_endpgm
;
; VI-LABEL: v_test_sub_v2i16_zext_to_v2i32:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
; VI-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    v_mov_b32_e32 v1, s7
; VI-NEXT:    v_add_u32_e32 v0, vcc, s6, v2
; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT:    v_mov_b32_e32 v3, s1
; VI-NEXT:    v_add_u32_e32 v2, vcc, s0, v2
; VI-NEXT:    v_addc_u32_e32 v3, vcc, 0, v3, vcc
; VI-NEXT:    flat_load_dword v1, v[0:1]
; VI-NEXT:    flat_load_dword v2, v[2:3]
; VI-NEXT:    s_mov_b32 s7, 0xf000
; VI-NEXT:    s_mov_b32 s6, -1
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    v_sub_u16_e32 v0, v1, v2
; VI-NEXT:    v_sub_u16_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
; VI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; VI-NEXT:    s_endpgm
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %gep.out = getelementptr inbounds <2 x i32>, <2 x i32> addrspace(1)* %out, i32 %tid
  %gep.in0 = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in0, i32 %tid
  %gep.in1 = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in1, i32 %tid
  %a = load volatile <2 x i16>, <2 x i16> addrspace(1)* %gep.in0
  %b = load volatile <2 x i16>, <2 x i16> addrspace(1)* %gep.in1
  %add = sub <2 x i16> %a, %b
  %ext = zext <2 x i16> %add to <2 x i32>
  ; NOTE(review): %gep.out is unused; the store targets %out directly.
  ; Kept as-is so the autogenerated CHECK lines remain valid.
  store <2 x i32> %ext, <2 x i32> addrspace(1)* %out
  ret void
}
429
; FIXME: Need to handle non-uniform case for function below (load without gep).
; Packed sub followed by zext to <2 x i64>: the high dwords of both i64
; lanes are pre-zeroed (v1 = 0, v3 = v1) and a single dwordx4 store is used.
define amdgpu_kernel void @v_test_sub_v2i16_zext_to_v2i64(<2 x i64> addrspace(1)* %out, <2 x i16> addrspace(1)* %in0, <2 x i16> addrspace(1)* %in1) #1 {
; GFX9-LABEL: v_test_sub_v2i16_zext_to_v2i64:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT:    v_mov_b32_e32 v1, 0
; GFX9-NEXT:    v_mov_b32_e32 v3, v1
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    global_load_dword v2, v0, s[6:7]
; GFX9-NEXT:    global_load_dword v0, v0, s[0:1]
; GFX9-NEXT:    s_mov_b32 s7, 0xf000
; GFX9-NEXT:    s_mov_b32 s6, -1
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    v_pk_sub_i16 v2, v2, v0
; GFX9-NEXT:    v_and_b32_e32 v0, 0xffff, v2
; GFX9-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
; GFX9-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0
; GFX9-NEXT:    s_endpgm
;
; VI-LABEL: v_test_sub_v2i16_zext_to_v2i64:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
; VI-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    v_mov_b32_e32 v1, s7
; VI-NEXT:    v_add_u32_e32 v0, vcc, s6, v2
; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT:    v_mov_b32_e32 v3, s1
; VI-NEXT:    v_add_u32_e32 v2, vcc, s0, v2
; VI-NEXT:    v_addc_u32_e32 v3, vcc, 0, v3, vcc
; VI-NEXT:    flat_load_dword v4, v[0:1]
; VI-NEXT:    flat_load_dword v2, v[2:3]
; VI-NEXT:    v_mov_b32_e32 v1, 0
; VI-NEXT:    s_mov_b32 s7, 0xf000
; VI-NEXT:    s_mov_b32 s6, -1
; VI-NEXT:    v_mov_b32_e32 v3, v1
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    v_sub_u16_e32 v0, v4, v2
; VI-NEXT:    v_sub_u16_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
; VI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0
; VI-NEXT:    s_endpgm
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %gep.out = getelementptr inbounds <2 x i64>, <2 x i64> addrspace(1)* %out, i32 %tid
  %gep.in0 = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in0, i32 %tid
  %gep.in1 = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in1, i32 %tid
  %a = load volatile <2 x i16>, <2 x i16> addrspace(1)* %gep.in0
  %b = load volatile <2 x i16>, <2 x i16> addrspace(1)* %gep.in1
  %add = sub <2 x i16> %a, %b
  %ext = zext <2 x i16> %add to <2 x i64>
  ; NOTE(review): %gep.out is unused; the store targets %out directly.
  ; Kept as-is so the autogenerated CHECK lines remain valid.
  store <2 x i64> %ext, <2 x i64> addrspace(1)* %out
  ret void
}
485
; FIXME: Need to handle non-uniform case for function below (load without gep).
; Packed sub followed by sext to <2 x i32>: both targets sign-extend the
; 16-bit results with v_bfe_i32 (GFX9 extracts the high half via ashr first).
define amdgpu_kernel void @v_test_sub_v2i16_sext_to_v2i32(<2 x i32> addrspace(1)* %out, <2 x i16> addrspace(1)* %in0, <2 x i16> addrspace(1)* %in1) #1 {
; GFX9-LABEL: v_test_sub_v2i16_sext_to_v2i32:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    global_load_dword v1, v0, s[6:7]
; GFX9-NEXT:    global_load_dword v0, v0, s[0:1]
; GFX9-NEXT:    s_mov_b32 s7, 0xf000
; GFX9-NEXT:    s_mov_b32 s6, -1
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    v_pk_sub_i16 v0, v1, v0
; GFX9-NEXT:    v_ashrrev_i32_e32 v1, 16, v0
; GFX9-NEXT:    v_bfe_i32 v0, v0, 0, 16
; GFX9-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; GFX9-NEXT:    s_endpgm
;
; VI-LABEL: v_test_sub_v2i16_sext_to_v2i32:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
; VI-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    v_mov_b32_e32 v1, s7
; VI-NEXT:    v_add_u32_e32 v0, vcc, s6, v2
; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT:    v_mov_b32_e32 v3, s1
; VI-NEXT:    v_add_u32_e32 v2, vcc, s0, v2
; VI-NEXT:    v_addc_u32_e32 v3, vcc, 0, v3, vcc
; VI-NEXT:    flat_load_dword v0, v[0:1]
; VI-NEXT:    flat_load_dword v1, v[2:3]
; VI-NEXT:    s_mov_b32 s7, 0xf000
; VI-NEXT:    s_mov_b32 s6, -1
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    v_sub_u16_sdwa v2, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
; VI-NEXT:    v_sub_u16_e32 v0, v0, v1
; VI-NEXT:    v_bfe_i32 v0, v0, 0, 16
; VI-NEXT:    v_bfe_i32 v1, v2, 0, 16
; VI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; VI-NEXT:    s_endpgm
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %gep.out = getelementptr inbounds <2 x i32>, <2 x i32> addrspace(1)* %out, i32 %tid
  %gep.in0 = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in0, i32 %tid
  %gep.in1 = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in1, i32 %tid
  %a = load volatile <2 x i16>, <2 x i16> addrspace(1)* %gep.in0
  %b = load volatile <2 x i16>, <2 x i16> addrspace(1)* %gep.in1
  %add = sub <2 x i16> %a, %b
  %ext = sext <2 x i16> %add to <2 x i32>
  ; NOTE(review): %gep.out is unused; the store targets %out directly.
  ; Kept as-is so the autogenerated CHECK lines remain valid.
  store <2 x i32> %ext, <2 x i32> addrspace(1)* %out
  ret void
}
539
; FIXME: Need to handle non-uniform case for function below (load without gep).
; Packed sub followed by sext to <2 x i64>: the low dwords are produced
; with v_bfe_i32 and the high dwords by arithmetic shift right by 31.
; Note: unlike the other tests here, these loads are NOT volatile.
define amdgpu_kernel void @v_test_sub_v2i16_sext_to_v2i64(<2 x i64> addrspace(1)* %out, <2 x i16> addrspace(1)* %in0, <2 x i16> addrspace(1)* %in1) #1 {
; GFX9-LABEL: v_test_sub_v2i16_sext_to_v2i64:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    global_load_dword v1, v0, s[6:7]
; GFX9-NEXT:    global_load_dword v0, v0, s[0:1]
; GFX9-NEXT:    s_mov_b32 s7, 0xf000
; GFX9-NEXT:    s_mov_b32 s6, -1
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    v_pk_sub_i16 v1, v1, v0
; GFX9-NEXT:    v_lshrrev_b32_e32 v2, 16, v1
; GFX9-NEXT:    v_bfe_i32 v0, v1, 0, 16
; GFX9-NEXT:    v_bfe_i32 v2, v2, 0, 16
; GFX9-NEXT:    v_ashrrev_i32_e32 v1, 31, v0
; GFX9-NEXT:    v_ashrrev_i32_e32 v3, 31, v2
; GFX9-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0
; GFX9-NEXT:    s_endpgm
;
; VI-LABEL: v_test_sub_v2i16_sext_to_v2i64:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
; VI-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    v_mov_b32_e32 v1, s7
; VI-NEXT:    v_add_u32_e32 v0, vcc, s6, v2
; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT:    v_mov_b32_e32 v3, s1
; VI-NEXT:    v_add_u32_e32 v2, vcc, s0, v2
; VI-NEXT:    v_addc_u32_e32 v3, vcc, 0, v3, vcc
; VI-NEXT:    flat_load_dword v0, v[0:1]
; VI-NEXT:    flat_load_dword v1, v[2:3]
; VI-NEXT:    s_mov_b32 s7, 0xf000
; VI-NEXT:    s_mov_b32 s6, -1
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    v_sub_u16_sdwa v2, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
; VI-NEXT:    v_sub_u16_e32 v0, v0, v1
; VI-NEXT:    v_bfe_i32 v0, v0, 0, 16
; VI-NEXT:    v_bfe_i32 v2, v2, 0, 16
; VI-NEXT:    v_ashrrev_i32_e32 v1, 31, v0
; VI-NEXT:    v_ashrrev_i32_e32 v3, 31, v2
; VI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0
; VI-NEXT:    s_endpgm
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %gep.out = getelementptr inbounds <2 x i64>, <2 x i64> addrspace(1)* %out, i32 %tid
  %gep.in0 = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in0, i32 %tid
  %gep.in1 = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in1, i32 %tid
  %a = load <2 x i16>, <2 x i16> addrspace(1)* %gep.in0
  %b = load <2 x i16>, <2 x i16> addrspace(1)* %gep.in1
  %add = sub <2 x i16> %a, %b
  %ext = sext <2 x i16> %add to <2 x i64>
  ; NOTE(review): %gep.out is unused; the store targets %out directly.
  ; Kept as-is so the autogenerated CHECK lines remain valid.
  store <2 x i64> %ext, <2 x i64> addrspace(1)* %out
  ret void
}
598
; Intrinsic declaration and the attribute groups referenced by the kernels.
declare i32 @llvm.amdgcn.workitem.id.x() #0

attributes #0 = { nounwind readnone }
attributes #1 = { nounwind }