1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2; RUN: llc < %s -march=amdgcn -verify-machineinstrs | FileCheck %s -check-prefixes=GCN,SI
3; RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck %s -check-prefixes=GCN,VI
4; RUN: llc < %s -march=amdgcn -mcpu=gfx900 -verify-machineinstrs | FileCheck %s -check-prefixes=GCN,GFX9
5; RUN: llc < %s -march=amdgcn -mcpu=gfx1010 -verify-machineinstrs | FileCheck %s -check-prefixes=GCN,GFX10
6
7; Test that add/sub with a constant is swapped to sub/add with negated
8; constant to minimize code size.
9
; x - 64 on an i32 loaded per workitem, with the result stored to %out.
; Every checked target is expected to keep a reversed subtract (v_subrev*)
; that uses 64 as its immediate operand, rather than an add of -64.
10define amdgpu_kernel void @v_test_i32_x_sub_64(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
11; SI-LABEL: v_test_i32_x_sub_64:
12; SI:       ; %bb.0:
13; SI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
14; SI-NEXT:    s_mov_b32 s7, 0xf000
15; SI-NEXT:    s_mov_b32 s6, 0
16; SI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
17; SI-NEXT:    v_mov_b32_e32 v1, 0
18; SI-NEXT:    s_waitcnt lgkmcnt(0)
19; SI-NEXT:    s_mov_b64 s[4:5], s[2:3]
20; SI-NEXT:    buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
21; SI-NEXT:    s_mov_b64 s[2:3], s[6:7]
22; SI-NEXT:    s_waitcnt vmcnt(0)
23; SI-NEXT:    v_subrev_i32_e32 v2, vcc, 64, v2
24; SI-NEXT:    buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
25; SI-NEXT:    s_endpgm
26;
27; VI-LABEL: v_test_i32_x_sub_64:
28; VI:       ; %bb.0:
29; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
30; VI-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
31; VI-NEXT:    s_waitcnt lgkmcnt(0)
32; VI-NEXT:    v_mov_b32_e32 v1, s3
33; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v2
34; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
35; VI-NEXT:    flat_load_dword v0, v[0:1]
36; VI-NEXT:    v_mov_b32_e32 v3, s1
37; VI-NEXT:    v_add_u32_e32 v2, vcc, s0, v2
38; VI-NEXT:    v_addc_u32_e32 v3, vcc, 0, v3, vcc
39; VI-NEXT:    s_waitcnt vmcnt(0)
40; VI-NEXT:    v_subrev_u32_e32 v0, vcc, 64, v0
41; VI-NEXT:    flat_store_dword v[2:3], v0
42; VI-NEXT:    s_endpgm
43;
44; GFX9-LABEL: v_test_i32_x_sub_64:
45; GFX9:       ; %bb.0:
46; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
47; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
48; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
49; GFX9-NEXT:    global_load_dword v1, v0, s[2:3]
50; GFX9-NEXT:    s_waitcnt vmcnt(0)
51; GFX9-NEXT:    v_subrev_u32_e32 v1, 64, v1
52; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
53; GFX9-NEXT:    s_endpgm
54;
55; GFX10-LABEL: v_test_i32_x_sub_64:
56; GFX10:       ; %bb.0:
57; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
58; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
59; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
60; GFX10-NEXT:    global_load_dword v1, v0, s[2:3]
61; GFX10-NEXT:    s_waitcnt vmcnt(0)
62; GFX10-NEXT:    v_subrev_nc_u32_e32 v1, 64, v1
63; GFX10-NEXT:    global_store_dword v0, v1, s[0:1]
64; GFX10-NEXT:    s_endpgm
65  %tid = call i32 @llvm.amdgcn.workitem.id.x()
66  %tid.ext = sext i32 %tid to i64
67  %gep = getelementptr inbounds i32, i32 addrspace(1)* %in, i64 %tid.ext
68  %gep.out = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %tid.ext
69  %x = load i32, i32 addrspace(1)* %gep
70  %result = sub i32 %x, 64
71  store i32 %result, i32 addrspace(1)* %gep.out
72  ret void
73}
74
; The same x - 64 computation applied to two volatile loads of one address;
; both results are written back with volatile stores. Every checked target is
; expected to emit two separate v_subrev* instructions, each with 64 as its
; immediate operand.
75define amdgpu_kernel void @v_test_i32_x_sub_64_multi_use(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
76; SI-LABEL: v_test_i32_x_sub_64_multi_use:
77; SI:       ; %bb.0:
78; SI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
79; SI-NEXT:    s_mov_b32 s7, 0xf000
80; SI-NEXT:    s_mov_b32 s6, 0
81; SI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
82; SI-NEXT:    v_mov_b32_e32 v1, 0
83; SI-NEXT:    s_waitcnt lgkmcnt(0)
84; SI-NEXT:    s_mov_b64 s[4:5], s[2:3]
85; SI-NEXT:    buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
86; SI-NEXT:    buffer_load_dword v3, v[0:1], s[4:7], 0 addr64
87; SI-NEXT:    s_mov_b64 s[2:3], s[6:7]
88; SI-NEXT:    s_waitcnt vmcnt(1)
89; SI-NEXT:    v_subrev_i32_e32 v2, vcc, 64, v2
90; SI-NEXT:    s_waitcnt vmcnt(0)
91; SI-NEXT:    v_subrev_i32_e32 v3, vcc, 64, v3
92; SI-NEXT:    buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
93; SI-NEXT:    buffer_store_dword v3, v[0:1], s[0:3], 0 addr64
94; SI-NEXT:    s_endpgm
95;
96; VI-LABEL: v_test_i32_x_sub_64_multi_use:
97; VI:       ; %bb.0:
98; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
99; VI-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
100; VI-NEXT:    s_waitcnt lgkmcnt(0)
101; VI-NEXT:    v_mov_b32_e32 v1, s3
102; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v2
103; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
104; VI-NEXT:    flat_load_dword v3, v[0:1]
105; VI-NEXT:    flat_load_dword v4, v[0:1]
106; VI-NEXT:    v_mov_b32_e32 v1, s1
107; VI-NEXT:    v_add_u32_e32 v0, vcc, s0, v2
108; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
109; VI-NEXT:    s_waitcnt vmcnt(1)
110; VI-NEXT:    v_subrev_u32_e32 v2, vcc, 64, v3
111; VI-NEXT:    s_waitcnt vmcnt(0)
112; VI-NEXT:    v_subrev_u32_e32 v3, vcc, 64, v4
113; VI-NEXT:    flat_store_dword v[0:1], v2
114; VI-NEXT:    flat_store_dword v[0:1], v3
115; VI-NEXT:    s_endpgm
116;
117; GFX9-LABEL: v_test_i32_x_sub_64_multi_use:
118; GFX9:       ; %bb.0:
119; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
120; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
121; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
122; GFX9-NEXT:    global_load_dword v1, v0, s[2:3]
123; GFX9-NEXT:    global_load_dword v2, v0, s[2:3]
124; GFX9-NEXT:    s_waitcnt vmcnt(1)
125; GFX9-NEXT:    v_subrev_u32_e32 v1, 64, v1
126; GFX9-NEXT:    s_waitcnt vmcnt(0)
127; GFX9-NEXT:    v_subrev_u32_e32 v2, 64, v2
128; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
129; GFX9-NEXT:    global_store_dword v0, v2, s[0:1]
130; GFX9-NEXT:    s_endpgm
131;
132; GFX10-LABEL: v_test_i32_x_sub_64_multi_use:
133; GFX10:       ; %bb.0:
134; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
135; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
136; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
137; GFX10-NEXT:    s_clause 0x1
138; GFX10-NEXT:    global_load_dword v1, v0, s[2:3]
139; GFX10-NEXT:    global_load_dword v2, v0, s[2:3]
140; GFX10-NEXT:    s_waitcnt vmcnt(1)
141; GFX10-NEXT:    v_subrev_nc_u32_e32 v1, 64, v1
142; GFX10-NEXT:    s_waitcnt vmcnt(0)
143; GFX10-NEXT:    v_subrev_nc_u32_e32 v2, 64, v2
144; GFX10-NEXT:    global_store_dword v0, v1, s[0:1]
145; GFX10-NEXT:    global_store_dword v0, v2, s[0:1]
146; GFX10-NEXT:    s_endpgm
147  %tid = call i32 @llvm.amdgcn.workitem.id.x()
148  %tid.ext = sext i32 %tid to i64
149  %gep = getelementptr inbounds i32, i32 addrspace(1)* %in, i64 %tid.ext
150  %gep.out = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %tid.ext
151  %x = load volatile i32, i32 addrspace(1)* %gep
152  %y = load volatile i32, i32 addrspace(1)* %gep
153  %result0 = sub i32 %x, 64
154  %result1 = sub i32 %y, 64
155  store volatile i32 %result0, i32 addrspace(1)* %gep.out
156  store volatile i32 %result1, i32 addrspace(1)* %gep.out
157  ret void
158}
159
; 64 - x, i.e. the constant is the minuend instead of the subtrahend.
; Every checked target is expected to use a plain (non-reversed) v_sub* with
; 64 as the first source operand.
160define amdgpu_kernel void @v_test_i32_64_sub_x(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
161; SI-LABEL: v_test_i32_64_sub_x:
162; SI:       ; %bb.0:
163; SI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
164; SI-NEXT:    s_mov_b32 s7, 0xf000
165; SI-NEXT:    s_mov_b32 s6, 0
166; SI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
167; SI-NEXT:    v_mov_b32_e32 v1, 0
168; SI-NEXT:    s_waitcnt lgkmcnt(0)
169; SI-NEXT:    s_mov_b64 s[4:5], s[2:3]
170; SI-NEXT:    buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
171; SI-NEXT:    s_mov_b64 s[2:3], s[6:7]
172; SI-NEXT:    s_waitcnt vmcnt(0)
173; SI-NEXT:    v_sub_i32_e32 v2, vcc, 64, v2
174; SI-NEXT:    buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
175; SI-NEXT:    s_endpgm
176;
177; VI-LABEL: v_test_i32_64_sub_x:
178; VI:       ; %bb.0:
179; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
180; VI-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
181; VI-NEXT:    s_waitcnt lgkmcnt(0)
182; VI-NEXT:    v_mov_b32_e32 v1, s3
183; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v2
184; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
185; VI-NEXT:    flat_load_dword v0, v[0:1]
186; VI-NEXT:    v_mov_b32_e32 v3, s1
187; VI-NEXT:    v_add_u32_e32 v2, vcc, s0, v2
188; VI-NEXT:    v_addc_u32_e32 v3, vcc, 0, v3, vcc
189; VI-NEXT:    s_waitcnt vmcnt(0)
190; VI-NEXT:    v_sub_u32_e32 v0, vcc, 64, v0
191; VI-NEXT:    flat_store_dword v[2:3], v0
192; VI-NEXT:    s_endpgm
193;
194; GFX9-LABEL: v_test_i32_64_sub_x:
195; GFX9:       ; %bb.0:
196; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
197; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
198; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
199; GFX9-NEXT:    global_load_dword v1, v0, s[2:3]
200; GFX9-NEXT:    s_waitcnt vmcnt(0)
201; GFX9-NEXT:    v_sub_u32_e32 v1, 64, v1
202; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
203; GFX9-NEXT:    s_endpgm
204;
205; GFX10-LABEL: v_test_i32_64_sub_x:
206; GFX10:       ; %bb.0:
207; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
208; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
209; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
210; GFX10-NEXT:    global_load_dword v1, v0, s[2:3]
211; GFX10-NEXT:    s_waitcnt vmcnt(0)
212; GFX10-NEXT:    v_sub_nc_u32_e32 v1, 64, v1
213; GFX10-NEXT:    global_store_dword v0, v1, s[0:1]
214; GFX10-NEXT:    s_endpgm
215  %tid = call i32 @llvm.amdgcn.workitem.id.x()
216  %tid.ext = sext i32 %tid to i64
217  %gep = getelementptr inbounds i32, i32 addrspace(1)* %in, i64 %tid.ext
218  %gep.out = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %tid.ext
219  %x = load i32, i32 addrspace(1)* %gep
220  %result = sub i32 64, %x
221  store i32 %result, i32 addrspace(1)* %gep.out
222  ret void
223}
224
; x - 65, one past the constant used in v_test_i32_x_sub_64.
; Every checked target is expected to emit an add of the negated constant,
; printed as the 32-bit literal 0xffffffbf (-65), instead of a v_subrev*.
225define amdgpu_kernel void @v_test_i32_x_sub_65(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
226; SI-LABEL: v_test_i32_x_sub_65:
227; SI:       ; %bb.0:
228; SI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
229; SI-NEXT:    s_mov_b32 s7, 0xf000
230; SI-NEXT:    s_mov_b32 s6, 0
231; SI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
232; SI-NEXT:    v_mov_b32_e32 v1, 0
233; SI-NEXT:    s_waitcnt lgkmcnt(0)
234; SI-NEXT:    s_mov_b64 s[4:5], s[2:3]
235; SI-NEXT:    buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
236; SI-NEXT:    s_mov_b64 s[2:3], s[6:7]
237; SI-NEXT:    s_waitcnt vmcnt(0)
238; SI-NEXT:    v_add_i32_e32 v2, vcc, 0xffffffbf, v2
239; SI-NEXT:    buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
240; SI-NEXT:    s_endpgm
241;
242; VI-LABEL: v_test_i32_x_sub_65:
243; VI:       ; %bb.0:
244; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
245; VI-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
246; VI-NEXT:    s_waitcnt lgkmcnt(0)
247; VI-NEXT:    v_mov_b32_e32 v1, s3
248; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v2
249; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
250; VI-NEXT:    flat_load_dword v0, v[0:1]
251; VI-NEXT:    v_mov_b32_e32 v3, s1
252; VI-NEXT:    v_add_u32_e32 v2, vcc, s0, v2
253; VI-NEXT:    v_addc_u32_e32 v3, vcc, 0, v3, vcc
254; VI-NEXT:    s_waitcnt vmcnt(0)
255; VI-NEXT:    v_add_u32_e32 v0, vcc, 0xffffffbf, v0
256; VI-NEXT:    flat_store_dword v[2:3], v0
257; VI-NEXT:    s_endpgm
258;
259; GFX9-LABEL: v_test_i32_x_sub_65:
260; GFX9:       ; %bb.0:
261; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
262; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
263; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
264; GFX9-NEXT:    global_load_dword v1, v0, s[2:3]
265; GFX9-NEXT:    s_waitcnt vmcnt(0)
266; GFX9-NEXT:    v_add_u32_e32 v1, 0xffffffbf, v1
267; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
268; GFX9-NEXT:    s_endpgm
269;
270; GFX10-LABEL: v_test_i32_x_sub_65:
271; GFX10:       ; %bb.0:
272; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
273; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
274; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
275; GFX10-NEXT:    global_load_dword v1, v0, s[2:3]
276; GFX10-NEXT:    s_waitcnt vmcnt(0)
277; GFX10-NEXT:    v_add_nc_u32_e32 v1, 0xffffffbf, v1
278; GFX10-NEXT:    global_store_dword v0, v1, s[0:1]
279; GFX10-NEXT:    s_endpgm
280  %tid = call i32 @llvm.amdgcn.workitem.id.x()
281  %tid.ext = sext i32 %tid to i64
282  %gep = getelementptr inbounds i32, i32 addrspace(1)* %in, i64 %tid.ext
283  %gep.out = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %tid.ext
284  %x = load i32, i32 addrspace(1)* %gep
285  %result = sub i32 %x, 65
286  store i32 %result, i32 addrspace(1)* %gep.out
287  ret void
288}
289
; 65 - x, with the constant as the minuend.
; Every checked target is expected to use a plain v_sub* whose first source
; operand is the literal 0x41 (65).
290define amdgpu_kernel void @v_test_i32_65_sub_x(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
291; SI-LABEL: v_test_i32_65_sub_x:
292; SI:       ; %bb.0:
293; SI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
294; SI-NEXT:    s_mov_b32 s7, 0xf000
295; SI-NEXT:    s_mov_b32 s6, 0
296; SI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
297; SI-NEXT:    v_mov_b32_e32 v1, 0
298; SI-NEXT:    s_waitcnt lgkmcnt(0)
299; SI-NEXT:    s_mov_b64 s[4:5], s[2:3]
300; SI-NEXT:    buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
301; SI-NEXT:    s_mov_b64 s[2:3], s[6:7]
302; SI-NEXT:    s_waitcnt vmcnt(0)
303; SI-NEXT:    v_sub_i32_e32 v2, vcc, 0x41, v2
304; SI-NEXT:    buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
305; SI-NEXT:    s_endpgm
306;
307; VI-LABEL: v_test_i32_65_sub_x:
308; VI:       ; %bb.0:
309; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
310; VI-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
311; VI-NEXT:    s_waitcnt lgkmcnt(0)
312; VI-NEXT:    v_mov_b32_e32 v1, s3
313; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v2
314; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
315; VI-NEXT:    flat_load_dword v0, v[0:1]
316; VI-NEXT:    v_mov_b32_e32 v3, s1
317; VI-NEXT:    v_add_u32_e32 v2, vcc, s0, v2
318; VI-NEXT:    v_addc_u32_e32 v3, vcc, 0, v3, vcc
319; VI-NEXT:    s_waitcnt vmcnt(0)
320; VI-NEXT:    v_sub_u32_e32 v0, vcc, 0x41, v0
321; VI-NEXT:    flat_store_dword v[2:3], v0
322; VI-NEXT:    s_endpgm
323;
324; GFX9-LABEL: v_test_i32_65_sub_x:
325; GFX9:       ; %bb.0:
326; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
327; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
328; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
329; GFX9-NEXT:    global_load_dword v1, v0, s[2:3]
330; GFX9-NEXT:    s_waitcnt vmcnt(0)
331; GFX9-NEXT:    v_sub_u32_e32 v1, 0x41, v1
332; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
333; GFX9-NEXT:    s_endpgm
334;
335; GFX10-LABEL: v_test_i32_65_sub_x:
336; GFX10:       ; %bb.0:
337; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
338; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
339; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
340; GFX10-NEXT:    global_load_dword v1, v0, s[2:3]
341; GFX10-NEXT:    s_waitcnt vmcnt(0)
342; GFX10-NEXT:    v_sub_nc_u32_e32 v1, 0x41, v1
343; GFX10-NEXT:    global_store_dword v0, v1, s[0:1]
344; GFX10-NEXT:    s_endpgm
345  %tid = call i32 @llvm.amdgcn.workitem.id.x()
346  %tid.ext = sext i32 %tid to i64
347  %gep = getelementptr inbounds i32, i32 addrspace(1)* %in, i64 %tid.ext
348  %gep.out = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %tid.ext
349  %x = load i32, i32 addrspace(1)* %gep
350  %result = sub i32 65, %x
351  store i32 %result, i32 addrspace(1)* %gep.out
352  ret void
353}
354
; x - (-16). Every checked target is expected to emit an add with the
; immediate 16 rather than a subtract of -16.
355define amdgpu_kernel void @v_test_i32_x_sub_neg16(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
356; SI-LABEL: v_test_i32_x_sub_neg16:
357; SI:       ; %bb.0:
358; SI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
359; SI-NEXT:    s_mov_b32 s7, 0xf000
360; SI-NEXT:    s_mov_b32 s6, 0
361; SI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
362; SI-NEXT:    v_mov_b32_e32 v1, 0
363; SI-NEXT:    s_waitcnt lgkmcnt(0)
364; SI-NEXT:    s_mov_b64 s[4:5], s[2:3]
365; SI-NEXT:    buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
366; SI-NEXT:    s_mov_b64 s[2:3], s[6:7]
367; SI-NEXT:    s_waitcnt vmcnt(0)
368; SI-NEXT:    v_add_i32_e32 v2, vcc, 16, v2
369; SI-NEXT:    buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
370; SI-NEXT:    s_endpgm
371;
372; VI-LABEL: v_test_i32_x_sub_neg16:
373; VI:       ; %bb.0:
374; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
375; VI-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
376; VI-NEXT:    s_waitcnt lgkmcnt(0)
377; VI-NEXT:    v_mov_b32_e32 v1, s3
378; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v2
379; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
380; VI-NEXT:    flat_load_dword v0, v[0:1]
381; VI-NEXT:    v_mov_b32_e32 v3, s1
382; VI-NEXT:    v_add_u32_e32 v2, vcc, s0, v2
383; VI-NEXT:    v_addc_u32_e32 v3, vcc, 0, v3, vcc
384; VI-NEXT:    s_waitcnt vmcnt(0)
385; VI-NEXT:    v_add_u32_e32 v0, vcc, 16, v0
386; VI-NEXT:    flat_store_dword v[2:3], v0
387; VI-NEXT:    s_endpgm
388;
389; GFX9-LABEL: v_test_i32_x_sub_neg16:
390; GFX9:       ; %bb.0:
391; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
392; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
393; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
394; GFX9-NEXT:    global_load_dword v1, v0, s[2:3]
395; GFX9-NEXT:    s_waitcnt vmcnt(0)
396; GFX9-NEXT:    v_add_u32_e32 v1, 16, v1
397; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
398; GFX9-NEXT:    s_endpgm
399;
400; GFX10-LABEL: v_test_i32_x_sub_neg16:
401; GFX10:       ; %bb.0:
402; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
403; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
404; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
405; GFX10-NEXT:    global_load_dword v1, v0, s[2:3]
406; GFX10-NEXT:    s_waitcnt vmcnt(0)
407; GFX10-NEXT:    v_add_nc_u32_e32 v1, 16, v1
408; GFX10-NEXT:    global_store_dword v0, v1, s[0:1]
409; GFX10-NEXT:    s_endpgm
410  %tid = call i32 @llvm.amdgcn.workitem.id.x()
411  %tid.ext = sext i32 %tid to i64
412  %gep = getelementptr inbounds i32, i32 addrspace(1)* %in, i64 %tid.ext
413  %gep.out = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %tid.ext
414  %x = load i32, i32 addrspace(1)* %gep
415  %result = sub i32 %x, -16
416  store i32 %result, i32 addrspace(1)* %gep.out
417  ret void
418}
419
; -16 - x, with the constant as the minuend.
; Every checked target is expected to use a plain v_sub* with -16 as the
; first source operand.
420define amdgpu_kernel void @v_test_i32_neg16_sub_x(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
421; SI-LABEL: v_test_i32_neg16_sub_x:
422; SI:       ; %bb.0:
423; SI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
424; SI-NEXT:    s_mov_b32 s7, 0xf000
425; SI-NEXT:    s_mov_b32 s6, 0
426; SI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
427; SI-NEXT:    v_mov_b32_e32 v1, 0
428; SI-NEXT:    s_waitcnt lgkmcnt(0)
429; SI-NEXT:    s_mov_b64 s[4:5], s[2:3]
430; SI-NEXT:    buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
431; SI-NEXT:    s_mov_b64 s[2:3], s[6:7]
432; SI-NEXT:    s_waitcnt vmcnt(0)
433; SI-NEXT:    v_sub_i32_e32 v2, vcc, -16, v2
434; SI-NEXT:    buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
435; SI-NEXT:    s_endpgm
436;
437; VI-LABEL: v_test_i32_neg16_sub_x:
438; VI:       ; %bb.0:
439; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
440; VI-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
441; VI-NEXT:    s_waitcnt lgkmcnt(0)
442; VI-NEXT:    v_mov_b32_e32 v1, s3
443; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v2
444; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
445; VI-NEXT:    flat_load_dword v0, v[0:1]
446; VI-NEXT:    v_mov_b32_e32 v3, s1
447; VI-NEXT:    v_add_u32_e32 v2, vcc, s0, v2
448; VI-NEXT:    v_addc_u32_e32 v3, vcc, 0, v3, vcc
449; VI-NEXT:    s_waitcnt vmcnt(0)
450; VI-NEXT:    v_sub_u32_e32 v0, vcc, -16, v0
451; VI-NEXT:    flat_store_dword v[2:3], v0
452; VI-NEXT:    s_endpgm
453;
454; GFX9-LABEL: v_test_i32_neg16_sub_x:
455; GFX9:       ; %bb.0:
456; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
457; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
458; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
459; GFX9-NEXT:    global_load_dword v1, v0, s[2:3]
460; GFX9-NEXT:    s_waitcnt vmcnt(0)
461; GFX9-NEXT:    v_sub_u32_e32 v1, -16, v1
462; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
463; GFX9-NEXT:    s_endpgm
464;
465; GFX10-LABEL: v_test_i32_neg16_sub_x:
466; GFX10:       ; %bb.0:
467; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
468; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
469; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
470; GFX10-NEXT:    global_load_dword v1, v0, s[2:3]
471; GFX10-NEXT:    s_waitcnt vmcnt(0)
472; GFX10-NEXT:    v_sub_nc_u32_e32 v1, -16, v1
473; GFX10-NEXT:    global_store_dword v0, v1, s[0:1]
474; GFX10-NEXT:    s_endpgm
475  %tid = call i32 @llvm.amdgcn.workitem.id.x()
476  %tid.ext = sext i32 %tid to i64
477  %gep = getelementptr inbounds i32, i32 addrspace(1)* %in, i64 %tid.ext
478  %gep.out = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %tid.ext
479  %x = load i32, i32 addrspace(1)* %gep
480  %result = sub i32 -16, %x
481  store i32 %result, i32 addrspace(1)* %gep.out
482  ret void
483}
484
; x - (-17), one past the constant used in v_test_i32_x_sub_neg16.
; Every checked target is expected to emit an add with the immediate 17
; rather than a subtract of -17.
485define amdgpu_kernel void @v_test_i32_x_sub_neg17(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
486; SI-LABEL: v_test_i32_x_sub_neg17:
487; SI:       ; %bb.0:
488; SI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
489; SI-NEXT:    s_mov_b32 s7, 0xf000
490; SI-NEXT:    s_mov_b32 s6, 0
491; SI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
492; SI-NEXT:    v_mov_b32_e32 v1, 0
493; SI-NEXT:    s_waitcnt lgkmcnt(0)
494; SI-NEXT:    s_mov_b64 s[4:5], s[2:3]
495; SI-NEXT:    buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
496; SI-NEXT:    s_mov_b64 s[2:3], s[6:7]
497; SI-NEXT:    s_waitcnt vmcnt(0)
498; SI-NEXT:    v_add_i32_e32 v2, vcc, 17, v2
499; SI-NEXT:    buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
500; SI-NEXT:    s_endpgm
501;
502; VI-LABEL: v_test_i32_x_sub_neg17:
503; VI:       ; %bb.0:
504; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
505; VI-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
506; VI-NEXT:    s_waitcnt lgkmcnt(0)
507; VI-NEXT:    v_mov_b32_e32 v1, s3
508; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v2
509; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
510; VI-NEXT:    flat_load_dword v0, v[0:1]
511; VI-NEXT:    v_mov_b32_e32 v3, s1
512; VI-NEXT:    v_add_u32_e32 v2, vcc, s0, v2
513; VI-NEXT:    v_addc_u32_e32 v3, vcc, 0, v3, vcc
514; VI-NEXT:    s_waitcnt vmcnt(0)
515; VI-NEXT:    v_add_u32_e32 v0, vcc, 17, v0
516; VI-NEXT:    flat_store_dword v[2:3], v0
517; VI-NEXT:    s_endpgm
518;
519; GFX9-LABEL: v_test_i32_x_sub_neg17:
520; GFX9:       ; %bb.0:
521; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
522; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
523; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
524; GFX9-NEXT:    global_load_dword v1, v0, s[2:3]
525; GFX9-NEXT:    s_waitcnt vmcnt(0)
526; GFX9-NEXT:    v_add_u32_e32 v1, 17, v1
527; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
528; GFX9-NEXT:    s_endpgm
529;
530; GFX10-LABEL: v_test_i32_x_sub_neg17:
531; GFX10:       ; %bb.0:
532; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
533; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
534; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
535; GFX10-NEXT:    global_load_dword v1, v0, s[2:3]
536; GFX10-NEXT:    s_waitcnt vmcnt(0)
537; GFX10-NEXT:    v_add_nc_u32_e32 v1, 17, v1
538; GFX10-NEXT:    global_store_dword v0, v1, s[0:1]
539; GFX10-NEXT:    s_endpgm
540  %tid = call i32 @llvm.amdgcn.workitem.id.x()
541  %tid.ext = sext i32 %tid to i64
542  %gep = getelementptr inbounds i32, i32 addrspace(1)* %in, i64 %tid.ext
543  %gep.out = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %tid.ext
544  %x = load i32, i32 addrspace(1)* %gep
545  %result = sub i32 %x, -17
546  store i32 %result, i32 addrspace(1)* %gep.out
547  ret void
548}
549
; -17 - x, with the constant as the minuend.
; Every checked target is expected to use a plain v_sub* whose first source
; operand is the 32-bit literal 0xffffffef (-17).
550define amdgpu_kernel void @v_test_i32_neg17_sub_x(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
551; SI-LABEL: v_test_i32_neg17_sub_x:
552; SI:       ; %bb.0:
553; SI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
554; SI-NEXT:    s_mov_b32 s7, 0xf000
555; SI-NEXT:    s_mov_b32 s6, 0
556; SI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
557; SI-NEXT:    v_mov_b32_e32 v1, 0
558; SI-NEXT:    s_waitcnt lgkmcnt(0)
559; SI-NEXT:    s_mov_b64 s[4:5], s[2:3]
560; SI-NEXT:    buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
561; SI-NEXT:    s_mov_b64 s[2:3], s[6:7]
562; SI-NEXT:    s_waitcnt vmcnt(0)
563; SI-NEXT:    v_sub_i32_e32 v2, vcc, 0xffffffef, v2
564; SI-NEXT:    buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
565; SI-NEXT:    s_endpgm
566;
567; VI-LABEL: v_test_i32_neg17_sub_x:
568; VI:       ; %bb.0:
569; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
570; VI-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
571; VI-NEXT:    s_waitcnt lgkmcnt(0)
572; VI-NEXT:    v_mov_b32_e32 v1, s3
573; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v2
574; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
575; VI-NEXT:    flat_load_dword v0, v[0:1]
576; VI-NEXT:    v_mov_b32_e32 v3, s1
577; VI-NEXT:    v_add_u32_e32 v2, vcc, s0, v2
578; VI-NEXT:    v_addc_u32_e32 v3, vcc, 0, v3, vcc
579; VI-NEXT:    s_waitcnt vmcnt(0)
580; VI-NEXT:    v_sub_u32_e32 v0, vcc, 0xffffffef, v0
581; VI-NEXT:    flat_store_dword v[2:3], v0
582; VI-NEXT:    s_endpgm
583;
584; GFX9-LABEL: v_test_i32_neg17_sub_x:
585; GFX9:       ; %bb.0:
586; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
587; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
588; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
589; GFX9-NEXT:    global_load_dword v1, v0, s[2:3]
590; GFX9-NEXT:    s_waitcnt vmcnt(0)
591; GFX9-NEXT:    v_sub_u32_e32 v1, 0xffffffef, v1
592; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
593; GFX9-NEXT:    s_endpgm
594;
595; GFX10-LABEL: v_test_i32_neg17_sub_x:
596; GFX10:       ; %bb.0:
597; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
598; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
599; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
600; GFX10-NEXT:    global_load_dword v1, v0, s[2:3]
601; GFX10-NEXT:    s_waitcnt vmcnt(0)
602; GFX10-NEXT:    v_sub_nc_u32_e32 v1, 0xffffffef, v1
603; GFX10-NEXT:    global_store_dword v0, v1, s[0:1]
604; GFX10-NEXT:    s_endpgm
605  %tid = call i32 @llvm.amdgcn.workitem.id.x()
606  %tid.ext = sext i32 %tid to i64
607  %gep = getelementptr inbounds i32, i32 addrspace(1)* %in, i64 %tid.ext
608  %gep.out = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %tid.ext
609  %x = load i32, i32 addrspace(1)* %gep
610  %result = sub i32 -17, %x
611  store i32 %result, i32 addrspace(1)* %gep.out
612  ret void
613}
614
; Scalar variant of x - 64: x is a kernel argument and the result is consumed
; by inline asm through an "s" register constraint (printed as s0 in the
; expected output). Every checked target is expected to emit s_sub_i32 with
; 64 as the immediate operand.
615define amdgpu_kernel void @s_test_i32_x_sub_64(i32 %x) #0 {
616; SI-LABEL: s_test_i32_x_sub_64:
617; SI:       ; %bb.0:
618; SI-NEXT:    s_load_dword s0, s[0:1], 0x9
619; SI-NEXT:    s_waitcnt lgkmcnt(0)
620; SI-NEXT:    s_sub_i32 s0, s0, 64
621; SI-NEXT:    ;;#ASMSTART
622; SI-NEXT:    ; use s0
623; SI-NEXT:    ;;#ASMEND
624; SI-NEXT:    s_endpgm
625;
626; VI-LABEL: s_test_i32_x_sub_64:
627; VI:       ; %bb.0:
628; VI-NEXT:    s_load_dword s0, s[0:1], 0x24
629; VI-NEXT:    s_waitcnt lgkmcnt(0)
630; VI-NEXT:    s_sub_i32 s0, s0, 64
631; VI-NEXT:    ;;#ASMSTART
632; VI-NEXT:    ; use s0
633; VI-NEXT:    ;;#ASMEND
634; VI-NEXT:    s_endpgm
635;
636; GFX9-LABEL: s_test_i32_x_sub_64:
637; GFX9:       ; %bb.0:
638; GFX9-NEXT:    s_load_dword s0, s[0:1], 0x24
639; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
640; GFX9-NEXT:    s_sub_i32 s0, s0, 64
641; GFX9-NEXT:    ;;#ASMSTART
642; GFX9-NEXT:    ; use s0
643; GFX9-NEXT:    ;;#ASMEND
644; GFX9-NEXT:    s_endpgm
645;
646; GFX10-LABEL: s_test_i32_x_sub_64:
647; GFX10:       ; %bb.0:
648; GFX10-NEXT:    s_load_dword s0, s[0:1], 0x24
649; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
650; GFX10-NEXT:    s_sub_i32 s0, s0, 64
651; GFX10-NEXT:    ;;#ASMSTART
652; GFX10-NEXT:    ; use s0
653; GFX10-NEXT:    ;;#ASMEND
654; GFX10-NEXT:    s_endpgm
655  %result = sub i32 %x, 64
656  call void asm sideeffect "; use $0", "s"(i32 %result)
657  ret void
658}
659
; 16-bit x - 64 on a value loaded per workitem, stored back as i16.
; Expected subtract per run line: the default target (no -mcpu) uses the
; 32-bit v_subrev_i32 with 64; tonga and gfx900 use v_subrev_u16 with 64;
; gfx1010 uses v_sub_nc_u16_e64 with 64 as the last operand.
660define amdgpu_kernel void @v_test_i16_x_sub_64(i16 addrspace(1)* %out, i16 addrspace(1)* %in) #0 {
661; SI-LABEL: v_test_i16_x_sub_64:
662; SI:       ; %bb.0:
663; SI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
664; SI-NEXT:    s_mov_b32 s7, 0xf000
665; SI-NEXT:    s_mov_b32 s6, 0
666; SI-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
667; SI-NEXT:    v_mov_b32_e32 v1, 0
668; SI-NEXT:    s_waitcnt lgkmcnt(0)
669; SI-NEXT:    s_mov_b64 s[4:5], s[2:3]
670; SI-NEXT:    buffer_load_ushort v2, v[0:1], s[4:7], 0 addr64
671; SI-NEXT:    s_mov_b64 s[2:3], s[6:7]
672; SI-NEXT:    s_waitcnt vmcnt(0)
673; SI-NEXT:    v_subrev_i32_e32 v2, vcc, 64, v2
674; SI-NEXT:    buffer_store_short v2, v[0:1], s[0:3], 0 addr64
675; SI-NEXT:    s_endpgm
676;
677; VI-LABEL: v_test_i16_x_sub_64:
678; VI:       ; %bb.0:
679; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
680; VI-NEXT:    v_lshlrev_b32_e32 v2, 1, v0
681; VI-NEXT:    s_waitcnt lgkmcnt(0)
682; VI-NEXT:    v_mov_b32_e32 v1, s3
683; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v2
684; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
685; VI-NEXT:    flat_load_ushort v0, v[0:1]
686; VI-NEXT:    v_mov_b32_e32 v3, s1
687; VI-NEXT:    v_add_u32_e32 v2, vcc, s0, v2
688; VI-NEXT:    v_addc_u32_e32 v3, vcc, 0, v3, vcc
689; VI-NEXT:    s_waitcnt vmcnt(0)
690; VI-NEXT:    v_subrev_u16_e32 v0, 64, v0
691; VI-NEXT:    flat_store_short v[2:3], v0
692; VI-NEXT:    s_endpgm
693;
694; GFX9-LABEL: v_test_i16_x_sub_64:
695; GFX9:       ; %bb.0:
696; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
697; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
698; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
699; GFX9-NEXT:    global_load_ushort v1, v0, s[2:3]
700; GFX9-NEXT:    s_waitcnt vmcnt(0)
701; GFX9-NEXT:    v_subrev_u16_e32 v1, 64, v1
702; GFX9-NEXT:    global_store_short v0, v1, s[0:1]
703; GFX9-NEXT:    s_endpgm
704;
705; GFX10-LABEL: v_test_i16_x_sub_64:
706; GFX10:       ; %bb.0:
707; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
708; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
709; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
710; GFX10-NEXT:    global_load_ushort v1, v0, s[2:3]
711; GFX10-NEXT:    s_waitcnt vmcnt(0)
712; GFX10-NEXT:    v_sub_nc_u16_e64 v1, v1, 64
713; GFX10-NEXT:    global_store_short v0, v1, s[0:1]
714; GFX10-NEXT:    s_endpgm
715  %tid = call i32 @llvm.amdgcn.workitem.id.x()
716  %tid.ext = sext i32 %tid to i64
717  %gep = getelementptr inbounds i16, i16 addrspace(1)* %in, i64 %tid.ext
718  %gep.out = getelementptr inbounds i16, i16 addrspace(1)* %out, i64 %tid.ext
719  %x = load i16, i16 addrspace(1)* %gep
720  %result = sub i16 %x, 64
721  store i16 %result, i16 addrspace(1)* %gep.out
722  ret void
723}
724
; 16-bit x - 64 whose result is zero-extended to i32 before being stored.
; The default-target and gfx1010 checks expect an explicit v_and_b32 with
; 0xffff after the subtract. The tonga and gfx900 checks expect the
; v_subrev_u16 result to be stored directly, with no separate mask.
725define amdgpu_kernel void @v_test_i16_x_sub_64_zext_to_i32(i32 addrspace(1)* %out, i16 addrspace(1)* %in) #0 {
726; SI-LABEL: v_test_i16_x_sub_64_zext_to_i32:
727; SI:       ; %bb.0:
728; SI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
729; SI-NEXT:    s_mov_b32 s7, 0xf000
730; SI-NEXT:    s_mov_b32 s6, 0
731; SI-NEXT:    v_lshlrev_b32_e32 v1, 1, v0
732; SI-NEXT:    v_mov_b32_e32 v2, 0
733; SI-NEXT:    s_waitcnt lgkmcnt(0)
734; SI-NEXT:    s_mov_b64 s[4:5], s[2:3]
735; SI-NEXT:    buffer_load_ushort v3, v[1:2], s[4:7], 0 addr64
736; SI-NEXT:    s_mov_b64 s[2:3], s[6:7]
737; SI-NEXT:    v_lshlrev_b32_e32 v1, 2, v0
738; SI-NEXT:    s_waitcnt vmcnt(0)
739; SI-NEXT:    v_subrev_i32_e32 v0, vcc, 64, v3
740; SI-NEXT:    v_and_b32_e32 v0, 0xffff, v0
741; SI-NEXT:    buffer_store_dword v0, v[1:2], s[0:3], 0 addr64
742; SI-NEXT:    s_endpgm
743;
744; VI-LABEL: v_test_i16_x_sub_64_zext_to_i32:
745; VI:       ; %bb.0:
746; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
747; VI-NEXT:    v_lshlrev_b32_e32 v1, 1, v0
748; VI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
749; VI-NEXT:    s_waitcnt lgkmcnt(0)
750; VI-NEXT:    v_mov_b32_e32 v2, s3
751; VI-NEXT:    v_add_u32_e32 v1, vcc, s2, v1
752; VI-NEXT:    v_addc_u32_e32 v2, vcc, 0, v2, vcc
753; VI-NEXT:    v_add_u32_e32 v3, vcc, s0, v0
754; VI-NEXT:    flat_load_ushort v0, v[1:2]
755; VI-NEXT:    v_mov_b32_e32 v4, s1
756; VI-NEXT:    v_addc_u32_e32 v4, vcc, 0, v4, vcc
757; VI-NEXT:    s_waitcnt vmcnt(0)
758; VI-NEXT:    v_subrev_u16_e32 v0, 64, v0
759; VI-NEXT:    flat_store_dword v[3:4], v0
760; VI-NEXT:    s_endpgm
761;
762; GFX9-LABEL: v_test_i16_x_sub_64_zext_to_i32:
763; GFX9:       ; %bb.0:
764; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
765; GFX9-NEXT:    v_lshlrev_b32_e32 v1, 1, v0
766; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
767; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
768; GFX9-NEXT:    global_load_ushort v1, v1, s[2:3]
769; GFX9-NEXT:    s_waitcnt vmcnt(0)
770; GFX9-NEXT:    v_subrev_u16_e32 v1, 64, v1
771; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
772; GFX9-NEXT:    s_endpgm
773;
774; GFX10-LABEL: v_test_i16_x_sub_64_zext_to_i32:
775; GFX10:       ; %bb.0:
776; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
777; GFX10-NEXT:    v_lshlrev_b32_e32 v1, 1, v0
778; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
779; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
780; GFX10-NEXT:    global_load_ushort v1, v1, s[2:3]
781; GFX10-NEXT:    s_waitcnt vmcnt(0)
782; GFX10-NEXT:    v_sub_nc_u16_e64 v1, v1, 64
783; GFX10-NEXT:    v_and_b32_e32 v1, 0xffff, v1
784; GFX10-NEXT:    global_store_dword v0, v1, s[0:1]
785; GFX10-NEXT:    s_endpgm
786  %tid = call i32 @llvm.amdgcn.workitem.id.x()
787  %tid.ext = sext i32 %tid to i64
788  %gep = getelementptr inbounds i16, i16 addrspace(1)* %in, i64 %tid.ext
789  %gep.out = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %tid.ext
790  %x = load i16, i16 addrspace(1)* %gep
791  %result = sub i16 %x, 64
792  %zext = zext i16 %result to i32
793  store i32 %zext, i32 addrspace(1)* %gep.out
794  ret void
795}
796
; Scalar i16 case where the constant 64 feeds two independent subtractions.
; Two volatile i16 loads from the same input element each have 64 subtracted
; and each result is written to the same output element with a volatile store.
; The expected code keeps 64 as an inline operand of both subtract
; instructions on every target (v_subrev_* on SI, VI and GFX9, v_sub_nc_u16
; on GFX10).
797define amdgpu_kernel void @v_test_i16_x_sub_64_multi_use(i16 addrspace(1)* %out, i16 addrspace(1)* %in) #0 {
798; SI-LABEL: v_test_i16_x_sub_64_multi_use:
799; SI:       ; %bb.0:
800; SI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
801; SI-NEXT:    s_mov_b32 s7, 0xf000
802; SI-NEXT:    s_mov_b32 s6, 0
803; SI-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
804; SI-NEXT:    v_mov_b32_e32 v1, 0
805; SI-NEXT:    s_waitcnt lgkmcnt(0)
806; SI-NEXT:    s_mov_b64 s[4:5], s[2:3]
807; SI-NEXT:    buffer_load_ushort v2, v[0:1], s[4:7], 0 addr64
808; SI-NEXT:    buffer_load_ushort v3, v[0:1], s[4:7], 0 addr64
809; SI-NEXT:    s_mov_b64 s[2:3], s[6:7]
810; SI-NEXT:    s_waitcnt vmcnt(1)
811; SI-NEXT:    v_subrev_i32_e32 v2, vcc, 64, v2
812; SI-NEXT:    s_waitcnt vmcnt(0)
813; SI-NEXT:    v_subrev_i32_e32 v3, vcc, 64, v3
814; SI-NEXT:    buffer_store_short v2, v[0:1], s[0:3], 0 addr64
815; SI-NEXT:    buffer_store_short v3, v[0:1], s[0:3], 0 addr64
816; SI-NEXT:    s_endpgm
817;
818; VI-LABEL: v_test_i16_x_sub_64_multi_use:
819; VI:       ; %bb.0:
820; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
821; VI-NEXT:    v_lshlrev_b32_e32 v2, 1, v0
822; VI-NEXT:    s_waitcnt lgkmcnt(0)
823; VI-NEXT:    v_mov_b32_e32 v1, s3
824; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v2
825; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
826; VI-NEXT:    flat_load_ushort v3, v[0:1]
827; VI-NEXT:    flat_load_ushort v4, v[0:1]
828; VI-NEXT:    v_add_u32_e32 v0, vcc, s0, v2
829; VI-NEXT:    v_mov_b32_e32 v1, s1
830; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
831; VI-NEXT:    s_waitcnt vmcnt(1)
832; VI-NEXT:    v_subrev_u16_e32 v2, 64, v3
833; VI-NEXT:    s_waitcnt vmcnt(0)
834; VI-NEXT:    v_subrev_u16_e32 v3, 64, v4
835; VI-NEXT:    flat_store_short v[0:1], v2
836; VI-NEXT:    flat_store_short v[0:1], v3
837; VI-NEXT:    s_endpgm
838;
839; GFX9-LABEL: v_test_i16_x_sub_64_multi_use:
840; GFX9:       ; %bb.0:
841; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
842; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
843; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
844; GFX9-NEXT:    global_load_ushort v1, v0, s[2:3]
845; GFX9-NEXT:    global_load_ushort v2, v0, s[2:3]
846; GFX9-NEXT:    s_waitcnt vmcnt(1)
847; GFX9-NEXT:    v_subrev_u16_e32 v1, 64, v1
848; GFX9-NEXT:    s_waitcnt vmcnt(0)
849; GFX9-NEXT:    v_subrev_u16_e32 v2, 64, v2
850; GFX9-NEXT:    global_store_short v0, v1, s[0:1]
851; GFX9-NEXT:    global_store_short v0, v2, s[0:1]
852; GFX9-NEXT:    s_endpgm
853;
854; GFX10-LABEL: v_test_i16_x_sub_64_multi_use:
855; GFX10:       ; %bb.0:
856; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
857; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
858; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
859; GFX10-NEXT:    s_clause 0x1
860; GFX10-NEXT:    global_load_ushort v1, v0, s[2:3]
861; GFX10-NEXT:    global_load_ushort v2, v0, s[2:3]
862; GFX10-NEXT:    s_waitcnt vmcnt(1)
863; GFX10-NEXT:    v_sub_nc_u16_e64 v1, v1, 64
864; GFX10-NEXT:    s_waitcnt vmcnt(0)
865; GFX10-NEXT:    v_sub_nc_u16_e64 v2, v2, 64
866; GFX10-NEXT:    global_store_short v0, v1, s[0:1]
867; GFX10-NEXT:    global_store_short v0, v2, s[0:1]
868; GFX10-NEXT:    s_endpgm
869  %tid = call i32 @llvm.amdgcn.workitem.id.x()
870  %tid.ext = sext i32 %tid to i64
871  %gep = getelementptr inbounds i16, i16 addrspace(1)* %in, i64 %tid.ext
872  %gep.out = getelementptr inbounds i16, i16 addrspace(1)* %out, i64 %tid.ext
  ; The volatile accesses keep both loads and both stores as separate
  ; operations, so the subtraction by 64 really appears twice.
873  %x = load volatile i16, i16 addrspace(1)* %gep
874  %y = load volatile i16, i16 addrspace(1)* %gep
875  %result0 = sub i16 %x, 64
876  %result1 = sub i16 %y, 64
877  store volatile i16 %result0, i16 addrspace(1)* %gep.out
878  store volatile i16 %result1, i16 addrspace(1)* %gep.out
879  ret void
880}
881
; <2 x i16> subtraction of the splat constant <64, 64>.
; On GFX9 and GFX10 this is expected to be a single v_pk_sub_i16 with 64 as
; an inline operand. On VI the two halves are subtracted separately (the high
; half through an SDWA subtract with 64 held in a VGPR) and recombined with
; v_or. On SI the low half uses v_subrev with 64, and the high half is
; handled by adding 0xffc00000 (-64 in the upper 16 bits) after a v_bfi merge.
882define amdgpu_kernel void @v_test_v2i16_x_sub_64_64(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in) #0 {
883; SI-LABEL: v_test_v2i16_x_sub_64_64:
884; SI:       ; %bb.0:
885; SI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
886; SI-NEXT:    s_mov_b32 s7, 0xf000
887; SI-NEXT:    s_mov_b32 s6, 0
888; SI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
889; SI-NEXT:    v_mov_b32_e32 v1, 0
890; SI-NEXT:    s_waitcnt lgkmcnt(0)
891; SI-NEXT:    s_mov_b64 s[4:5], s[2:3]
892; SI-NEXT:    buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
893; SI-NEXT:    s_mov_b64 s[2:3], s[6:7]
894; SI-NEXT:    s_waitcnt vmcnt(0)
895; SI-NEXT:    v_subrev_i32_e32 v3, vcc, 64, v2
896; SI-NEXT:    s_mov_b32 s4, 0xffff0000
897; SI-NEXT:    v_bfi_b32 v2, s4, v2, v3
898; SI-NEXT:    v_add_i32_e32 v2, vcc, 0xffc00000, v2
899; SI-NEXT:    buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
900; SI-NEXT:    s_endpgm
901;
902; VI-LABEL: v_test_v2i16_x_sub_64_64:
903; VI:       ; %bb.0:
904; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
905; VI-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
906; VI-NEXT:    v_mov_b32_e32 v4, 64
907; VI-NEXT:    s_waitcnt lgkmcnt(0)
908; VI-NEXT:    v_mov_b32_e32 v1, s3
909; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v2
910; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
911; VI-NEXT:    flat_load_dword v3, v[0:1]
912; VI-NEXT:    v_add_u32_e32 v0, vcc, s0, v2
913; VI-NEXT:    v_mov_b32_e32 v1, s1
914; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
915; VI-NEXT:    s_waitcnt vmcnt(0)
916; VI-NEXT:    v_sub_u16_sdwa v2, v3, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
917; VI-NEXT:    v_subrev_u16_e32 v3, 64, v3
918; VI-NEXT:    v_or_b32_e32 v2, v3, v2
919; VI-NEXT:    flat_store_dword v[0:1], v2
920; VI-NEXT:    s_endpgm
921;
922; GFX9-LABEL: v_test_v2i16_x_sub_64_64:
923; GFX9:       ; %bb.0:
924; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
925; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
926; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
927; GFX9-NEXT:    global_load_dword v1, v0, s[2:3]
928; GFX9-NEXT:    s_waitcnt vmcnt(0)
929; GFX9-NEXT:    v_pk_sub_i16 v1, v1, 64 op_sel_hi:[1,0]
930; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
931; GFX9-NEXT:    s_endpgm
932;
933; GFX10-LABEL: v_test_v2i16_x_sub_64_64:
934; GFX10:       ; %bb.0:
935; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
936; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
937; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
938; GFX10-NEXT:    global_load_dword v1, v0, s[2:3]
939; GFX10-NEXT:    s_waitcnt vmcnt(0)
940; GFX10-NEXT:    v_pk_sub_i16 v1, v1, 64 op_sel_hi:[1,0]
941; GFX10-NEXT:    global_store_dword v0, v1, s[0:1]
942; GFX10-NEXT:    s_endpgm
943  %tid = call i32 @llvm.amdgcn.workitem.id.x()
944  %tid.ext = sext i32 %tid to i64
945  %gep = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in, i64 %tid.ext
946  %gep.out = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %out, i64 %tid.ext
947  %x = load <2 x i16>, <2 x i16> addrspace(1)* %gep
948  %result = sub <2 x i16> %x, <i16 64, i16 64>
949  store <2 x i16> %result, <2 x i16> addrspace(1)* %gep.out
950  ret void
951}
952
; <2 x i16> subtraction of the non-splat constant <7, 64>.
; The two lanes need different constants, so on GFX9 the packed value
; 0x400007 (64 in the high half, 7 in the low half) is first moved into an
; SGPR, while on GFX10 it is used directly as a literal operand of
; v_pk_sub_i16. On VI the low half becomes an add of -7 and the high half an
; SDWA subtract of 64. On SI the low half is an add of -7 and the high half
; an add of 0xffc00000.
953define amdgpu_kernel void @v_test_v2i16_x_sub_7_64(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in) #0 {
954; SI-LABEL: v_test_v2i16_x_sub_7_64:
955; SI:       ; %bb.0:
956; SI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
957; SI-NEXT:    s_mov_b32 s7, 0xf000
958; SI-NEXT:    s_mov_b32 s6, 0
959; SI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
960; SI-NEXT:    v_mov_b32_e32 v1, 0
961; SI-NEXT:    s_waitcnt lgkmcnt(0)
962; SI-NEXT:    s_mov_b64 s[4:5], s[2:3]
963; SI-NEXT:    buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
964; SI-NEXT:    s_mov_b64 s[2:3], s[6:7]
965; SI-NEXT:    s_waitcnt vmcnt(0)
966; SI-NEXT:    v_add_i32_e32 v3, vcc, -7, v2
967; SI-NEXT:    s_mov_b32 s4, 0xffff0000
968; SI-NEXT:    v_bfi_b32 v2, s4, v2, v3
969; SI-NEXT:    v_add_i32_e32 v2, vcc, 0xffc00000, v2
970; SI-NEXT:    buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
971; SI-NEXT:    s_endpgm
972;
973; VI-LABEL: v_test_v2i16_x_sub_7_64:
974; VI:       ; %bb.0:
975; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
976; VI-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
977; VI-NEXT:    v_mov_b32_e32 v4, 64
978; VI-NEXT:    s_waitcnt lgkmcnt(0)
979; VI-NEXT:    v_mov_b32_e32 v1, s3
980; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v2
981; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
982; VI-NEXT:    flat_load_dword v3, v[0:1]
983; VI-NEXT:    v_add_u32_e32 v0, vcc, s0, v2
984; VI-NEXT:    v_mov_b32_e32 v1, s1
985; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
986; VI-NEXT:    s_waitcnt vmcnt(0)
987; VI-NEXT:    v_add_u16_e32 v2, -7, v3
988; VI-NEXT:    v_sub_u16_sdwa v3, v3, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
989; VI-NEXT:    v_or_b32_e32 v2, v2, v3
990; VI-NEXT:    flat_store_dword v[0:1], v2
991; VI-NEXT:    s_endpgm
992;
993; GFX9-LABEL: v_test_v2i16_x_sub_7_64:
994; GFX9:       ; %bb.0:
995; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
996; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
997; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
998; GFX9-NEXT:    global_load_dword v1, v0, s[2:3]
999; GFX9-NEXT:    s_mov_b32 s2, 0x400007
1000; GFX9-NEXT:    s_waitcnt vmcnt(0)
1001; GFX9-NEXT:    v_pk_sub_i16 v1, v1, s2
1002; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
1003; GFX9-NEXT:    s_endpgm
1004;
1005; GFX10-LABEL: v_test_v2i16_x_sub_7_64:
1006; GFX10:       ; %bb.0:
1007; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
1008; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
1009; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
1010; GFX10-NEXT:    global_load_dword v1, v0, s[2:3]
1011; GFX10-NEXT:    s_waitcnt vmcnt(0)
1012; GFX10-NEXT:    v_pk_sub_i16 v1, v1, 0x400007
1013; GFX10-NEXT:    global_store_dword v0, v1, s[0:1]
1014; GFX10-NEXT:    s_endpgm
1015  %tid = call i32 @llvm.amdgcn.workitem.id.x()
1016  %tid.ext = sext i32 %tid to i64
1017  %gep = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in, i64 %tid.ext
1018  %gep.out = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %out, i64 %tid.ext
1019  %x = load <2 x i16>, <2 x i16> addrspace(1)* %gep
1020  %result = sub <2 x i16> %x, <i16 7, i16 64>
1021  store <2 x i16> %result, <2 x i16> addrspace(1)* %gep.out
1022  ret void
1023}
1024
; <2 x i16> subtraction of <64, 123>.
; On GFX9 the packed constant 0x7b0040 (123 in the high half, 64 in the low
; half) is materialized in an SGPR; on GFX10 it is a literal operand of
; v_pk_sub_i16. On VI the low half is a v_subrev with 64 and the high half
; an SDWA add of 0xffffff85 (-123). On SI the low half is a v_subrev with 64
; and the high half an add of 0xff850000.
1025define amdgpu_kernel void @v_test_v2i16_x_sub_64_123(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in) #0 {
1026; SI-LABEL: v_test_v2i16_x_sub_64_123:
1027; SI:       ; %bb.0:
1028; SI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
1029; SI-NEXT:    s_mov_b32 s7, 0xf000
1030; SI-NEXT:    s_mov_b32 s6, 0
1031; SI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
1032; SI-NEXT:    v_mov_b32_e32 v1, 0
1033; SI-NEXT:    s_waitcnt lgkmcnt(0)
1034; SI-NEXT:    s_mov_b64 s[4:5], s[2:3]
1035; SI-NEXT:    buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
1036; SI-NEXT:    s_mov_b64 s[2:3], s[6:7]
1037; SI-NEXT:    s_waitcnt vmcnt(0)
1038; SI-NEXT:    v_subrev_i32_e32 v3, vcc, 64, v2
1039; SI-NEXT:    s_mov_b32 s4, 0xffff0000
1040; SI-NEXT:    v_bfi_b32 v2, s4, v2, v3
1041; SI-NEXT:    v_add_i32_e32 v2, vcc, 0xff850000, v2
1042; SI-NEXT:    buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
1043; SI-NEXT:    s_endpgm
1044;
1045; VI-LABEL: v_test_v2i16_x_sub_64_123:
1046; VI:       ; %bb.0:
1047; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
1048; VI-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
1049; VI-NEXT:    v_mov_b32_e32 v4, 0xffffff85
1050; VI-NEXT:    s_waitcnt lgkmcnt(0)
1051; VI-NEXT:    v_mov_b32_e32 v1, s3
1052; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v2
1053; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
1054; VI-NEXT:    flat_load_dword v3, v[0:1]
1055; VI-NEXT:    v_add_u32_e32 v0, vcc, s0, v2
1056; VI-NEXT:    v_mov_b32_e32 v1, s1
1057; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
1058; VI-NEXT:    s_waitcnt vmcnt(0)
1059; VI-NEXT:    v_add_u16_sdwa v2, v3, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
1060; VI-NEXT:    v_subrev_u16_e32 v3, 64, v3
1061; VI-NEXT:    v_or_b32_e32 v2, v3, v2
1062; VI-NEXT:    flat_store_dword v[0:1], v2
1063; VI-NEXT:    s_endpgm
1064;
1065; GFX9-LABEL: v_test_v2i16_x_sub_64_123:
1066; GFX9:       ; %bb.0:
1067; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
1068; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
1069; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
1070; GFX9-NEXT:    global_load_dword v1, v0, s[2:3]
1071; GFX9-NEXT:    s_mov_b32 s2, 0x7b0040
1072; GFX9-NEXT:    s_waitcnt vmcnt(0)
1073; GFX9-NEXT:    v_pk_sub_i16 v1, v1, s2
1074; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
1075; GFX9-NEXT:    s_endpgm
1076;
1077; GFX10-LABEL: v_test_v2i16_x_sub_64_123:
1078; GFX10:       ; %bb.0:
1079; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
1080; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
1081; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
1082; GFX10-NEXT:    global_load_dword v1, v0, s[2:3]
1083; GFX10-NEXT:    s_waitcnt vmcnt(0)
1084; GFX10-NEXT:    v_pk_sub_i16 v1, v1, 0x7b0040
1085; GFX10-NEXT:    global_store_dword v0, v1, s[0:1]
1086; GFX10-NEXT:    s_endpgm
1087  %tid = call i32 @llvm.amdgcn.workitem.id.x()
1088  %tid.ext = sext i32 %tid to i64
1089  %gep = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in, i64 %tid.ext
1090  %gep.out = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %out, i64 %tid.ext
1091  %x = load <2 x i16>, <2 x i16> addrspace(1)* %gep
1092  %result = sub <2 x i16> %x, <i16 64, i16 123>
1093  store <2 x i16> %result, <2 x i16> addrspace(1)* %gep.out
1094  ret void
1095}
1096
1097; Can fold 0 and inline immediate in other half.
; <2 x i16> subtraction of <7, 0>, so only the low half changes.
; On GFX9 and GFX10 the expected code is v_pk_sub_i16 with the plain inline
; operand 7 and no op_sel modifiers. On VI the low half is an add of -7 and
; the untouched high half is masked out with 0xffff0000 and or'ed back in.
; On SI the low half is an add of -7 merged with the original high half
; through v_bfi with mask 0xffff.
1098define amdgpu_kernel void @v_test_v2i16_x_sub_7_0(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in) #0 {
1099; SI-LABEL: v_test_v2i16_x_sub_7_0:
1100; SI:       ; %bb.0:
1101; SI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
1102; SI-NEXT:    s_mov_b32 s7, 0xf000
1103; SI-NEXT:    s_mov_b32 s6, 0
1104; SI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
1105; SI-NEXT:    v_mov_b32_e32 v1, 0
1106; SI-NEXT:    s_waitcnt lgkmcnt(0)
1107; SI-NEXT:    s_mov_b64 s[4:5], s[2:3]
1108; SI-NEXT:    buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
1109; SI-NEXT:    s_mov_b64 s[2:3], s[6:7]
1110; SI-NEXT:    s_waitcnt vmcnt(0)
1111; SI-NEXT:    v_add_i32_e32 v3, vcc, -7, v2
1112; SI-NEXT:    s_mov_b32 s4, 0xffff
1113; SI-NEXT:    v_bfi_b32 v2, s4, v3, v2
1114; SI-NEXT:    buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
1115; SI-NEXT:    s_endpgm
1116;
1117; VI-LABEL: v_test_v2i16_x_sub_7_0:
1118; VI:       ; %bb.0:
1119; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
1120; VI-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
1121; VI-NEXT:    s_waitcnt lgkmcnt(0)
1122; VI-NEXT:    v_mov_b32_e32 v1, s3
1123; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v2
1124; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
1125; VI-NEXT:    flat_load_dword v3, v[0:1]
1126; VI-NEXT:    v_add_u32_e32 v0, vcc, s0, v2
1127; VI-NEXT:    v_mov_b32_e32 v1, s1
1128; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
1129; VI-NEXT:    s_waitcnt vmcnt(0)
1130; VI-NEXT:    v_and_b32_e32 v2, 0xffff0000, v3
1131; VI-NEXT:    v_add_u16_e32 v3, -7, v3
1132; VI-NEXT:    v_or_b32_e32 v2, v3, v2
1133; VI-NEXT:    flat_store_dword v[0:1], v2
1134; VI-NEXT:    s_endpgm
1135;
1136; GFX9-LABEL: v_test_v2i16_x_sub_7_0:
1137; GFX9:       ; %bb.0:
1138; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
1139; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
1140; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
1141; GFX9-NEXT:    global_load_dword v1, v0, s[2:3]
1142; GFX9-NEXT:    s_waitcnt vmcnt(0)
1143; GFX9-NEXT:    v_pk_sub_i16 v1, v1, 7
1144; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
1145; GFX9-NEXT:    s_endpgm
1146;
1147; GFX10-LABEL: v_test_v2i16_x_sub_7_0:
1148; GFX10:       ; %bb.0:
1149; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
1150; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
1151; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
1152; GFX10-NEXT:    global_load_dword v1, v0, s[2:3]
1153; GFX10-NEXT:    s_waitcnt vmcnt(0)
1154; GFX10-NEXT:    v_pk_sub_i16 v1, v1, 7
1155; GFX10-NEXT:    global_store_dword v0, v1, s[0:1]
1156; GFX10-NEXT:    s_endpgm
1157  %tid = call i32 @llvm.amdgcn.workitem.id.x()
1158  %tid.ext = sext i32 %tid to i64
1159  %gep = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in, i64 %tid.ext
1160  %gep.out = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %out, i64 %tid.ext
1161  %x = load <2 x i16>, <2 x i16> addrspace(1)* %gep
1162  %result = sub <2 x i16> %x, <i16 7, i16 0>
1163  store <2 x i16> %result, <2 x i16> addrspace(1)* %gep.out
1164  ret void
1165}
1166
1167; Can fold 0 and inline immediate in other half.
; <2 x i16> subtraction of <0, 16>, so only the high half changes.
; On GFX9 and GFX10 the expected code is v_pk_sub_i16 with the inline
; operand 16 plus op_sel:[0,1] op_sel_hi:[1,0] modifiers. On VI the high half
; is an SDWA add of -16 and the low half is passed through by an SDWA v_or.
; On SI the whole operation is a single 32-bit add of 0xfff00000.
1168define amdgpu_kernel void @v_test_v2i16_x_sub_0_16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in) #0 {
1169; SI-LABEL: v_test_v2i16_x_sub_0_16:
1170; SI:       ; %bb.0:
1171; SI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
1172; SI-NEXT:    s_mov_b32 s7, 0xf000
1173; SI-NEXT:    s_mov_b32 s6, 0
1174; SI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
1175; SI-NEXT:    v_mov_b32_e32 v1, 0
1176; SI-NEXT:    s_waitcnt lgkmcnt(0)
1177; SI-NEXT:    s_mov_b64 s[4:5], s[2:3]
1178; SI-NEXT:    buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
1179; SI-NEXT:    s_mov_b64 s[2:3], s[6:7]
1180; SI-NEXT:    s_waitcnt vmcnt(0)
1181; SI-NEXT:    v_add_i32_e32 v2, vcc, 0xfff00000, v2
1182; SI-NEXT:    buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
1183; SI-NEXT:    s_endpgm
1184;
1185; VI-LABEL: v_test_v2i16_x_sub_0_16:
1186; VI:       ; %bb.0:
1187; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
1188; VI-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
1189; VI-NEXT:    s_waitcnt lgkmcnt(0)
1190; VI-NEXT:    v_mov_b32_e32 v1, s3
1191; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v2
1192; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
1193; VI-NEXT:    flat_load_dword v0, v[0:1]
1194; VI-NEXT:    v_mov_b32_e32 v1, -16
1195; VI-NEXT:    v_mov_b32_e32 v3, s1
1196; VI-NEXT:    v_add_u32_e32 v2, vcc, s0, v2
1197; VI-NEXT:    v_addc_u32_e32 v3, vcc, 0, v3, vcc
1198; VI-NEXT:    s_waitcnt vmcnt(0)
1199; VI-NEXT:    v_add_u16_sdwa v1, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
1200; VI-NEXT:    v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
1201; VI-NEXT:    flat_store_dword v[2:3], v0
1202; VI-NEXT:    s_endpgm
1203;
1204; GFX9-LABEL: v_test_v2i16_x_sub_0_16:
1205; GFX9:       ; %bb.0:
1206; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
1207; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
1208; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
1209; GFX9-NEXT:    global_load_dword v1, v0, s[2:3]
1210; GFX9-NEXT:    s_waitcnt vmcnt(0)
1211; GFX9-NEXT:    v_pk_sub_i16 v1, v1, 16 op_sel:[0,1] op_sel_hi:[1,0]
1212; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
1213; GFX9-NEXT:    s_endpgm
1214;
1215; GFX10-LABEL: v_test_v2i16_x_sub_0_16:
1216; GFX10:       ; %bb.0:
1217; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
1218; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
1219; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
1220; GFX10-NEXT:    global_load_dword v1, v0, s[2:3]
1221; GFX10-NEXT:    s_waitcnt vmcnt(0)
1222; GFX10-NEXT:    v_pk_sub_i16 v1, v1, 16 op_sel:[0,1] op_sel_hi:[1,0]
1223; GFX10-NEXT:    global_store_dword v0, v1, s[0:1]
1224; GFX10-NEXT:    s_endpgm
1225  %tid = call i32 @llvm.amdgcn.workitem.id.x()
1226  %tid.ext = sext i32 %tid to i64
1227  %gep = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in, i64 %tid.ext
1228  %gep.out = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %out, i64 %tid.ext
1229  %x = load <2 x i16>, <2 x i16> addrspace(1)* %gep
1230  %result = sub <2 x i16> %x, <i16 0, i16 16>
1231  store <2 x i16> %result, <2 x i16> addrspace(1)* %gep.out
1232  ret void
1233}
1234
; <2 x i16> subtraction of <0, -15360>; -15360 is 0xc400 as an i16, so the
; equivalent add constant for the high half is 0x3c00.
; NOTE(review): 0x3c00 is the IEEE half-precision bit pattern of 1.0, which
; is presumably what "1_0" in the function name refers to - confirm.
; On SI the operation is one 32-bit add of 0x3c000000, and on VI an SDWA add
; of 0x3c00 into the high word. On GFX9 the subtract form is kept, with the
; constant built in an SGPR by s_brev_b32 of 35. On GFX10 it is also kept,
; with the literal 0xc400 selected into the high half through op_sel.
1235define amdgpu_kernel void @v_test_v2i16_x_sub_0_1_0(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in) #0 {
1236; SI-LABEL: v_test_v2i16_x_sub_0_1_0:
1237; SI:       ; %bb.0:
1238; SI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
1239; SI-NEXT:    s_mov_b32 s7, 0xf000
1240; SI-NEXT:    s_mov_b32 s6, 0
1241; SI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
1242; SI-NEXT:    v_mov_b32_e32 v1, 0
1243; SI-NEXT:    s_waitcnt lgkmcnt(0)
1244; SI-NEXT:    s_mov_b64 s[4:5], s[2:3]
1245; SI-NEXT:    buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
1246; SI-NEXT:    s_mov_b64 s[2:3], s[6:7]
1247; SI-NEXT:    s_waitcnt vmcnt(0)
1248; SI-NEXT:    v_add_i32_e32 v2, vcc, 0x3c000000, v2
1249; SI-NEXT:    buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
1250; SI-NEXT:    s_endpgm
1251;
1252; VI-LABEL: v_test_v2i16_x_sub_0_1_0:
1253; VI:       ; %bb.0:
1254; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
1255; VI-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
1256; VI-NEXT:    s_waitcnt lgkmcnt(0)
1257; VI-NEXT:    v_mov_b32_e32 v1, s3
1258; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v2
1259; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
1260; VI-NEXT:    flat_load_dword v0, v[0:1]
1261; VI-NEXT:    v_mov_b32_e32 v1, 0x3c00
1262; VI-NEXT:    v_mov_b32_e32 v3, s1
1263; VI-NEXT:    v_add_u32_e32 v2, vcc, s0, v2
1264; VI-NEXT:    v_addc_u32_e32 v3, vcc, 0, v3, vcc
1265; VI-NEXT:    s_waitcnt vmcnt(0)
1266; VI-NEXT:    v_add_u16_sdwa v1, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
1267; VI-NEXT:    v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
1268; VI-NEXT:    flat_store_dword v[2:3], v0
1269; VI-NEXT:    s_endpgm
1270;
1271; GFX9-LABEL: v_test_v2i16_x_sub_0_1_0:
1272; GFX9:       ; %bb.0:
1273; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
1274; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
1275; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
1276; GFX9-NEXT:    global_load_dword v1, v0, s[2:3]
1277; GFX9-NEXT:    s_brev_b32 s2, 35
1278; GFX9-NEXT:    s_waitcnt vmcnt(0)
1279; GFX9-NEXT:    v_pk_sub_i16 v1, v1, s2
1280; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
1281; GFX9-NEXT:    s_endpgm
1282;
1283; GFX10-LABEL: v_test_v2i16_x_sub_0_1_0:
1284; GFX10:       ; %bb.0:
1285; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
1286; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
1287; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
1288; GFX10-NEXT:    global_load_dword v1, v0, s[2:3]
1289; GFX10-NEXT:    s_waitcnt vmcnt(0)
1290; GFX10-NEXT:    v_pk_sub_i16 v1, v1, 0xc400 op_sel:[0,1] op_sel_hi:[1,0]
1291; GFX10-NEXT:    global_store_dword v0, v1, s[0:1]
1292; GFX10-NEXT:    s_endpgm
1293  %tid = call i32 @llvm.amdgcn.workitem.id.x()
1294  %tid.ext = sext i32 %tid to i64
1295  %gep = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in, i64 %tid.ext
1296  %gep.out = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %out, i64 %tid.ext
1297  %x = load <2 x i16>, <2 x i16> addrspace(1)* %gep
1298  %result = sub <2 x i16> %x, <i16 0, i16 -15360>
1299  store <2 x i16> %result, <2 x i16> addrspace(1)* %gep.out
1300  ret void
1301}
1302
; <2 x i16> subtraction of <0, 17408>; 17408 is 0x4400, so the equivalent
; add constant for the high half is 0xbc00.
; NOTE(review): 0xbc00 is the IEEE half-precision bit pattern of -1.0, which
; is presumably what "neg1_0" in the function name refers to - confirm.
; On SI the operation is one 32-bit add of 0xbc000000, and on VI an SDWA add
; of 0xffffbc00 into the high word. On GFX9 the subtract form is kept, with
; the constant built in an SGPR by s_brev_b32 of 34. On GFX10 it is also
; kept, with the literal 0x4400 selected into the high half through op_sel.
1303define amdgpu_kernel void @v_test_v2i16_x_sub_0_neg1_0(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in) #0 {
1304; SI-LABEL: v_test_v2i16_x_sub_0_neg1_0:
1305; SI:       ; %bb.0:
1306; SI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
1307; SI-NEXT:    s_mov_b32 s7, 0xf000
1308; SI-NEXT:    s_mov_b32 s6, 0
1309; SI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
1310; SI-NEXT:    v_mov_b32_e32 v1, 0
1311; SI-NEXT:    s_waitcnt lgkmcnt(0)
1312; SI-NEXT:    s_mov_b64 s[4:5], s[2:3]
1313; SI-NEXT:    buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
1314; SI-NEXT:    s_mov_b64 s[2:3], s[6:7]
1315; SI-NEXT:    s_waitcnt vmcnt(0)
1316; SI-NEXT:    v_add_i32_e32 v2, vcc, 0xbc000000, v2
1317; SI-NEXT:    buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
1318; SI-NEXT:    s_endpgm
1319;
1320; VI-LABEL: v_test_v2i16_x_sub_0_neg1_0:
1321; VI:       ; %bb.0:
1322; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
1323; VI-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
1324; VI-NEXT:    s_waitcnt lgkmcnt(0)
1325; VI-NEXT:    v_mov_b32_e32 v1, s3
1326; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v2
1327; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
1328; VI-NEXT:    flat_load_dword v0, v[0:1]
1329; VI-NEXT:    v_mov_b32_e32 v1, 0xffffbc00
1330; VI-NEXT:    v_mov_b32_e32 v3, s1
1331; VI-NEXT:    v_add_u32_e32 v2, vcc, s0, v2
1332; VI-NEXT:    v_addc_u32_e32 v3, vcc, 0, v3, vcc
1333; VI-NEXT:    s_waitcnt vmcnt(0)
1334; VI-NEXT:    v_add_u16_sdwa v1, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
1335; VI-NEXT:    v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
1336; VI-NEXT:    flat_store_dword v[2:3], v0
1337; VI-NEXT:    s_endpgm
1338;
1339; GFX9-LABEL: v_test_v2i16_x_sub_0_neg1_0:
1340; GFX9:       ; %bb.0:
1341; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
1342; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
1343; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
1344; GFX9-NEXT:    global_load_dword v1, v0, s[2:3]
1345; GFX9-NEXT:    s_brev_b32 s2, 34
1346; GFX9-NEXT:    s_waitcnt vmcnt(0)
1347; GFX9-NEXT:    v_pk_sub_i16 v1, v1, s2
1348; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
1349; GFX9-NEXT:    s_endpgm
1350;
1351; GFX10-LABEL: v_test_v2i16_x_sub_0_neg1_0:
1352; GFX10:       ; %bb.0:
1353; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
1354; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
1355; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
1356; GFX10-NEXT:    global_load_dword v1, v0, s[2:3]
1357; GFX10-NEXT:    s_waitcnt vmcnt(0)
1358; GFX10-NEXT:    v_pk_sub_i16 v1, v1, 0x4400 op_sel:[0,1] op_sel_hi:[1,0]
1359; GFX10-NEXT:    global_store_dword v0, v1, s[0:1]
1360; GFX10-NEXT:    s_endpgm
1361  %tid = call i32 @llvm.amdgcn.workitem.id.x()
1362  %tid.ext = sext i32 %tid to i64
1363  %gep = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in, i64 %tid.ext
1364  %gep.out = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %out, i64 %tid.ext
1365  %x = load <2 x i16>, <2 x i16> addrspace(1)* %gep
1366  %result = sub <2 x i16> %x, <i16 0, i16 17408>
1367  store <2 x i16> %result, <2 x i16> addrspace(1)* %gep.out
1368  ret void
1369}
1370
1371; -32 isn't an inline immediate, but 32 is
; <2 x i16> addition of the splat constant <-32, -32>.
; The expected code turns the add into a subtract of 32. On GFX9 and GFX10
; that is v_pk_sub_u16 with inline operand 32. On VI it is a v_subrev with 32
; for the low half and an SDWA subtract of 32 for the high half. On SI the
; low half is a v_subrev with 32 and the high half an add of 0xffe00000.
1372define amdgpu_kernel void @v_test_v2i16_x_add_neg32_neg32(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in) #0 {
1373; SI-LABEL: v_test_v2i16_x_add_neg32_neg32:
1374; SI:       ; %bb.0:
1375; SI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
1376; SI-NEXT:    s_mov_b32 s7, 0xf000
1377; SI-NEXT:    s_mov_b32 s6, 0
1378; SI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
1379; SI-NEXT:    v_mov_b32_e32 v1, 0
1380; SI-NEXT:    s_waitcnt lgkmcnt(0)
1381; SI-NEXT:    s_mov_b64 s[4:5], s[2:3]
1382; SI-NEXT:    buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
1383; SI-NEXT:    s_mov_b64 s[2:3], s[6:7]
1384; SI-NEXT:    s_waitcnt vmcnt(0)
1385; SI-NEXT:    v_subrev_i32_e32 v3, vcc, 32, v2
1386; SI-NEXT:    s_mov_b32 s4, 0xffff0000
1387; SI-NEXT:    v_bfi_b32 v2, s4, v2, v3
1388; SI-NEXT:    v_add_i32_e32 v2, vcc, 0xffe00000, v2
1389; SI-NEXT:    buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
1390; SI-NEXT:    s_endpgm
1391;
1392; VI-LABEL: v_test_v2i16_x_add_neg32_neg32:
1393; VI:       ; %bb.0:
1394; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
1395; VI-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
1396; VI-NEXT:    v_mov_b32_e32 v4, 32
1397; VI-NEXT:    s_waitcnt lgkmcnt(0)
1398; VI-NEXT:    v_mov_b32_e32 v1, s3
1399; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v2
1400; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
1401; VI-NEXT:    flat_load_dword v3, v[0:1]
1402; VI-NEXT:    v_add_u32_e32 v0, vcc, s0, v2
1403; VI-NEXT:    v_mov_b32_e32 v1, s1
1404; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
1405; VI-NEXT:    s_waitcnt vmcnt(0)
1406; VI-NEXT:    v_sub_u16_sdwa v2, v3, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
1407; VI-NEXT:    v_subrev_u16_e32 v3, 32, v3
1408; VI-NEXT:    v_or_b32_e32 v2, v3, v2
1409; VI-NEXT:    flat_store_dword v[0:1], v2
1410; VI-NEXT:    s_endpgm
1411;
1412; GFX9-LABEL: v_test_v2i16_x_add_neg32_neg32:
1413; GFX9:       ; %bb.0:
1414; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
1415; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
1416; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
1417; GFX9-NEXT:    global_load_dword v1, v0, s[2:3]
1418; GFX9-NEXT:    s_waitcnt vmcnt(0)
1419; GFX9-NEXT:    v_pk_sub_u16 v1, v1, 32 op_sel_hi:[1,0]
1420; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
1421; GFX9-NEXT:    s_endpgm
1422;
1423; GFX10-LABEL: v_test_v2i16_x_add_neg32_neg32:
1424; GFX10:       ; %bb.0:
1425; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
1426; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
1427; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
1428; GFX10-NEXT:    global_load_dword v1, v0, s[2:3]
1429; GFX10-NEXT:    s_waitcnt vmcnt(0)
1430; GFX10-NEXT:    v_pk_sub_u16 v1, v1, 32 op_sel_hi:[1,0]
1431; GFX10-NEXT:    global_store_dword v0, v1, s[0:1]
1432; GFX10-NEXT:    s_endpgm
1433  %tid = call i32 @llvm.amdgcn.workitem.id.x()
1434  %tid.ext = sext i32 %tid to i64
1435  %gep = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in, i64 %tid.ext
1436  %gep.out = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %out, i64 %tid.ext
1437  %x = load <2 x i16>, <2 x i16> addrspace(1)* %gep
1438  %result = add <2 x i16> %x, <i16 -32, i16 -32>
1439  store <2 x i16> %result, <2 x i16> addrspace(1)* %gep.out
1440  ret void
1441}
1442
; <2 x i16> addition of <0, -32>, so only the high half changes.
; On GFX9 and GFX10 the expected code is v_pk_sub_u16 with inline operand 32
; plus op_sel:[0,1] op_sel_hi:[1,0] modifiers. On VI the high half is an SDWA
; subtract of 32 and the low half is passed through by an SDWA v_or. On SI
; the whole operation is a single 32-bit add of 0xffe00000.
1443define amdgpu_kernel void @v_test_v2i16_x_add_0_neg32(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in) #0 {
1444; SI-LABEL: v_test_v2i16_x_add_0_neg32:
1445; SI:       ; %bb.0:
1446; SI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
1447; SI-NEXT:    s_mov_b32 s7, 0xf000
1448; SI-NEXT:    s_mov_b32 s6, 0
1449; SI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
1450; SI-NEXT:    v_mov_b32_e32 v1, 0
1451; SI-NEXT:    s_waitcnt lgkmcnt(0)
1452; SI-NEXT:    s_mov_b64 s[4:5], s[2:3]
1453; SI-NEXT:    buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
1454; SI-NEXT:    s_mov_b64 s[2:3], s[6:7]
1455; SI-NEXT:    s_waitcnt vmcnt(0)
1456; SI-NEXT:    v_add_i32_e32 v2, vcc, 0xffe00000, v2
1457; SI-NEXT:    buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
1458; SI-NEXT:    s_endpgm
1459;
1460; VI-LABEL: v_test_v2i16_x_add_0_neg32:
1461; VI:       ; %bb.0:
1462; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
1463; VI-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
1464; VI-NEXT:    s_waitcnt lgkmcnt(0)
1465; VI-NEXT:    v_mov_b32_e32 v1, s3
1466; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v2
1467; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
1468; VI-NEXT:    flat_load_dword v0, v[0:1]
1469; VI-NEXT:    v_mov_b32_e32 v1, 32
1470; VI-NEXT:    v_mov_b32_e32 v3, s1
1471; VI-NEXT:    v_add_u32_e32 v2, vcc, s0, v2
1472; VI-NEXT:    v_addc_u32_e32 v3, vcc, 0, v3, vcc
1473; VI-NEXT:    s_waitcnt vmcnt(0)
1474; VI-NEXT:    v_sub_u16_sdwa v1, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
1475; VI-NEXT:    v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
1476; VI-NEXT:    flat_store_dword v[2:3], v0
1477; VI-NEXT:    s_endpgm
1478;
1479; GFX9-LABEL: v_test_v2i16_x_add_0_neg32:
1480; GFX9:       ; %bb.0:
1481; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
1482; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
1483; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
1484; GFX9-NEXT:    global_load_dword v1, v0, s[2:3]
1485; GFX9-NEXT:    s_waitcnt vmcnt(0)
1486; GFX9-NEXT:    v_pk_sub_u16 v1, v1, 32 op_sel:[0,1] op_sel_hi:[1,0]
1487; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
1488; GFX9-NEXT:    s_endpgm
1489;
1490; GFX10-LABEL: v_test_v2i16_x_add_0_neg32:
1491; GFX10:       ; %bb.0:
1492; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
1493; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
1494; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
1495; GFX10-NEXT:    global_load_dword v1, v0, s[2:3]
1496; GFX10-NEXT:    s_waitcnt vmcnt(0)
1497; GFX10-NEXT:    v_pk_sub_u16 v1, v1, 32 op_sel:[0,1] op_sel_hi:[1,0]
1498; GFX10-NEXT:    global_store_dword v0, v1, s[0:1]
1499; GFX10-NEXT:    s_endpgm
1500  %tid = call i32 @llvm.amdgcn.workitem.id.x()
1501  %tid.ext = sext i32 %tid to i64
1502  %gep = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in, i64 %tid.ext
1503  %gep.out = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %out, i64 %tid.ext
1504  %x = load <2 x i16>, <2 x i16> addrspace(1)* %gep
1505  %result = add <2 x i16> %x, <i16 0, i16 -32>
1506  store <2 x i16> %result, <2 x i16> addrspace(1)* %gep.out
1507  ret void
1508}
1509
; Adds the constant vector <-32, 0> to a <2 x i16> loaded at the work-item's
; index and stores the sum at the same index of %out.
; The assertions below expect every run to subtract 32 from the low element
; instead of adding -32: the default run uses a 32-bit v_subrev and merges the
; low 16 bits back with v_bfi, the tonga run uses a 16-bit v_subrev and ORs the
; untouched high half back in, and the gfx900 / gfx1010 runs use a single
; v_pk_sub_u16 with operand 32.
1510define amdgpu_kernel void @v_test_v2i16_x_add_neg32_0(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in) #0 {
1511; SI-LABEL: v_test_v2i16_x_add_neg32_0:
1512; SI:       ; %bb.0:
1513; SI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
1514; SI-NEXT:    s_mov_b32 s7, 0xf000
1515; SI-NEXT:    s_mov_b32 s6, 0
1516; SI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
1517; SI-NEXT:    v_mov_b32_e32 v1, 0
1518; SI-NEXT:    s_waitcnt lgkmcnt(0)
1519; SI-NEXT:    s_mov_b64 s[4:5], s[2:3]
1520; SI-NEXT:    buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
1521; SI-NEXT:    s_mov_b64 s[2:3], s[6:7]
1522; SI-NEXT:    s_waitcnt vmcnt(0)
1523; SI-NEXT:    v_subrev_i32_e32 v3, vcc, 32, v2
1524; SI-NEXT:    s_mov_b32 s4, 0xffff
1525; SI-NEXT:    v_bfi_b32 v2, s4, v3, v2
1526; SI-NEXT:    buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
1527; SI-NEXT:    s_endpgm
1528;
1529; VI-LABEL: v_test_v2i16_x_add_neg32_0:
1530; VI:       ; %bb.0:
1531; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
1532; VI-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
1533; VI-NEXT:    s_waitcnt lgkmcnt(0)
1534; VI-NEXT:    v_mov_b32_e32 v1, s3
1535; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v2
1536; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
1537; VI-NEXT:    flat_load_dword v3, v[0:1]
1538; VI-NEXT:    v_add_u32_e32 v0, vcc, s0, v2
1539; VI-NEXT:    v_mov_b32_e32 v1, s1
1540; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
1541; VI-NEXT:    s_waitcnt vmcnt(0)
1542; VI-NEXT:    v_and_b32_e32 v2, 0xffff0000, v3
1543; VI-NEXT:    v_subrev_u16_e32 v3, 32, v3
1544; VI-NEXT:    v_or_b32_e32 v2, v3, v2
1545; VI-NEXT:    flat_store_dword v[0:1], v2
1546; VI-NEXT:    s_endpgm
1547;
1548; GFX9-LABEL: v_test_v2i16_x_add_neg32_0:
1549; GFX9:       ; %bb.0:
1550; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
1551; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
1552; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
1553; GFX9-NEXT:    global_load_dword v1, v0, s[2:3]
1554; GFX9-NEXT:    s_waitcnt vmcnt(0)
1555; GFX9-NEXT:    v_pk_sub_u16 v1, v1, 32
1556; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
1557; GFX9-NEXT:    s_endpgm
1558;
1559; GFX10-LABEL: v_test_v2i16_x_add_neg32_0:
1560; GFX10:       ; %bb.0:
1561; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
1562; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
1563; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
1564; GFX10-NEXT:    global_load_dword v1, v0, s[2:3]
1565; GFX10-NEXT:    s_waitcnt vmcnt(0)
1566; GFX10-NEXT:    v_pk_sub_u16 v1, v1, 32
1567; GFX10-NEXT:    global_store_dword v0, v1, s[0:1]
1568; GFX10-NEXT:    s_endpgm
1569  %tid = call i32 @llvm.amdgcn.workitem.id.x()
1570  %tid.ext = sext i32 %tid to i64
1571  %gep = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in, i64 %tid.ext
1572  %gep.out = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %out, i64 %tid.ext
1573  %x = load <2 x i16>, <2 x i16> addrspace(1)* %gep
1574  %result = add <2 x i16> %x, <i16 -32, i16 0>
1575  store <2 x i16> %result, <2 x i16> addrspace(1)* %gep.out
1576  ret void
1577}
1578
1579; 16 and -16 are both inline immediates
; Adds the splat constant <-16, -16> to a <2 x i16> loaded at the work-item's
; index and stores the sum at the same index of %out.
; The assertions below expect the default and tonga runs to keep the add with
; -16 (two 32-bit adds around a v_bfi, and a 16-bit add plus an SDWA add
; respectively), while the gfx900 / gfx1010 runs are expected to emit
; v_pk_sub_u16 with operand 16 applied to both halves (op_sel_hi:[1,0]).
1580define amdgpu_kernel void @v_test_v2i16_x_add_neg16_neg16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in) #0 {
1581; SI-LABEL: v_test_v2i16_x_add_neg16_neg16:
1582; SI:       ; %bb.0:
1583; SI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
1584; SI-NEXT:    s_mov_b32 s7, 0xf000
1585; SI-NEXT:    s_mov_b32 s6, 0
1586; SI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
1587; SI-NEXT:    v_mov_b32_e32 v1, 0
1588; SI-NEXT:    s_waitcnt lgkmcnt(0)
1589; SI-NEXT:    s_mov_b64 s[4:5], s[2:3]
1590; SI-NEXT:    buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
1591; SI-NEXT:    s_mov_b64 s[2:3], s[6:7]
1592; SI-NEXT:    s_waitcnt vmcnt(0)
1593; SI-NEXT:    v_add_i32_e32 v3, vcc, -16, v2
1594; SI-NEXT:    s_mov_b32 s4, 0xffff0000
1595; SI-NEXT:    v_bfi_b32 v2, s4, v2, v3
1596; SI-NEXT:    v_add_i32_e32 v2, vcc, 0xfff00000, v2
1597; SI-NEXT:    buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
1598; SI-NEXT:    s_endpgm
1599;
1600; VI-LABEL: v_test_v2i16_x_add_neg16_neg16:
1601; VI:       ; %bb.0:
1602; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
1603; VI-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
1604; VI-NEXT:    v_mov_b32_e32 v4, -16
1605; VI-NEXT:    s_waitcnt lgkmcnt(0)
1606; VI-NEXT:    v_mov_b32_e32 v1, s3
1607; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v2
1608; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
1609; VI-NEXT:    flat_load_dword v3, v[0:1]
1610; VI-NEXT:    v_add_u32_e32 v0, vcc, s0, v2
1611; VI-NEXT:    v_mov_b32_e32 v1, s1
1612; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
1613; VI-NEXT:    s_waitcnt vmcnt(0)
1614; VI-NEXT:    v_add_u16_e32 v2, -16, v3
1615; VI-NEXT:    v_add_u16_sdwa v3, v3, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
1616; VI-NEXT:    v_or_b32_e32 v2, v2, v3
1617; VI-NEXT:    flat_store_dword v[0:1], v2
1618; VI-NEXT:    s_endpgm
1619;
1620; GFX9-LABEL: v_test_v2i16_x_add_neg16_neg16:
1621; GFX9:       ; %bb.0:
1622; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
1623; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
1624; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
1625; GFX9-NEXT:    global_load_dword v1, v0, s[2:3]
1626; GFX9-NEXT:    s_waitcnt vmcnt(0)
1627; GFX9-NEXT:    v_pk_sub_u16 v1, v1, 16 op_sel_hi:[1,0]
1628; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
1629; GFX9-NEXT:    s_endpgm
1630;
1631; GFX10-LABEL: v_test_v2i16_x_add_neg16_neg16:
1632; GFX10:       ; %bb.0:
1633; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
1634; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
1635; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
1636; GFX10-NEXT:    global_load_dword v1, v0, s[2:3]
1637; GFX10-NEXT:    s_waitcnt vmcnt(0)
1638; GFX10-NEXT:    v_pk_sub_u16 v1, v1, 16 op_sel_hi:[1,0]
1639; GFX10-NEXT:    global_store_dword v0, v1, s[0:1]
1640; GFX10-NEXT:    s_endpgm
1641  %tid = call i32 @llvm.amdgcn.workitem.id.x()
1642  %tid.ext = sext i32 %tid to i64
1643  %gep = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in, i64 %tid.ext
1644  %gep.out = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %out, i64 %tid.ext
1645  %x = load <2 x i16>, <2 x i16> addrspace(1)* %gep
1646  %result = add <2 x i16> %x, <i16 -16, i16 -16>
1647  store <2 x i16> %result, <2 x i16> addrspace(1)* %gep.out
1648  ret void
1649}
1650
; Adds the constant vector <0, -16> to a <2 x i16> loaded at the work-item's
; index and stores the sum at the same index of %out, so only the high element
; changes.
; The assertions below expect the default run to fold this into one 32-bit add
; of 0xfff00000 (-16 shifted into the high half), the tonga run to add -16 to
; the high word via SDWA and OR the low word back in, and the gfx900 / gfx1010
; runs to emit v_pk_sub_u16 with operand 16 and op_sel:[0,1] op_sel_hi:[1,0].
1651define amdgpu_kernel void @v_test_v2i16_x_add_0_neg16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in) #0 {
1652; SI-LABEL: v_test_v2i16_x_add_0_neg16:
1653; SI:       ; %bb.0:
1654; SI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
1655; SI-NEXT:    s_mov_b32 s7, 0xf000
1656; SI-NEXT:    s_mov_b32 s6, 0
1657; SI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
1658; SI-NEXT:    v_mov_b32_e32 v1, 0
1659; SI-NEXT:    s_waitcnt lgkmcnt(0)
1660; SI-NEXT:    s_mov_b64 s[4:5], s[2:3]
1661; SI-NEXT:    buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
1662; SI-NEXT:    s_mov_b64 s[2:3], s[6:7]
1663; SI-NEXT:    s_waitcnt vmcnt(0)
1664; SI-NEXT:    v_add_i32_e32 v2, vcc, 0xfff00000, v2
1665; SI-NEXT:    buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
1666; SI-NEXT:    s_endpgm
1667;
1668; VI-LABEL: v_test_v2i16_x_add_0_neg16:
1669; VI:       ; %bb.0:
1670; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
1671; VI-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
1672; VI-NEXT:    s_waitcnt lgkmcnt(0)
1673; VI-NEXT:    v_mov_b32_e32 v1, s3
1674; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v2
1675; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
1676; VI-NEXT:    flat_load_dword v0, v[0:1]
1677; VI-NEXT:    v_mov_b32_e32 v1, -16
1678; VI-NEXT:    v_mov_b32_e32 v3, s1
1679; VI-NEXT:    v_add_u32_e32 v2, vcc, s0, v2
1680; VI-NEXT:    v_addc_u32_e32 v3, vcc, 0, v3, vcc
1681; VI-NEXT:    s_waitcnt vmcnt(0)
1682; VI-NEXT:    v_add_u16_sdwa v1, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
1683; VI-NEXT:    v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
1684; VI-NEXT:    flat_store_dword v[2:3], v0
1685; VI-NEXT:    s_endpgm
1686;
1687; GFX9-LABEL: v_test_v2i16_x_add_0_neg16:
1688; GFX9:       ; %bb.0:
1689; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
1690; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
1691; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
1692; GFX9-NEXT:    global_load_dword v1, v0, s[2:3]
1693; GFX9-NEXT:    s_waitcnt vmcnt(0)
1694; GFX9-NEXT:    v_pk_sub_u16 v1, v1, 16 op_sel:[0,1] op_sel_hi:[1,0]
1695; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
1696; GFX9-NEXT:    s_endpgm
1697;
1698; GFX10-LABEL: v_test_v2i16_x_add_0_neg16:
1699; GFX10:       ; %bb.0:
1700; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
1701; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
1702; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
1703; GFX10-NEXT:    global_load_dword v1, v0, s[2:3]
1704; GFX10-NEXT:    s_waitcnt vmcnt(0)
1705; GFX10-NEXT:    v_pk_sub_u16 v1, v1, 16 op_sel:[0,1] op_sel_hi:[1,0]
1706; GFX10-NEXT:    global_store_dword v0, v1, s[0:1]
1707; GFX10-NEXT:    s_endpgm
1708  %tid = call i32 @llvm.amdgcn.workitem.id.x()
1709  %tid.ext = sext i32 %tid to i64
1710  %gep = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in, i64 %tid.ext
1711  %gep.out = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %out, i64 %tid.ext
1712  %x = load <2 x i16>, <2 x i16> addrspace(1)* %gep
1713  %result = add <2 x i16> %x, <i16 0, i16 -16>
1714  store <2 x i16> %result, <2 x i16> addrspace(1)* %gep.out
1715  ret void
1716}
1717
; Adds the constant vector <-16, 0> to a <2 x i16> loaded at the work-item's
; index and stores the sum at the same index of %out, so only the low element
; changes.
; The assertions below expect the default run to add -16 in 32 bits and merge
; the low 16 bits back with v_bfi, the tonga run to add -16 in 16 bits and OR
; the masked high half back in, and the gfx900 / gfx1010 runs to emit
; v_pk_sub_u16 with operand 16.
1718define amdgpu_kernel void @v_test_v2i16_x_add_neg16_0(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in) #0 {
1719; SI-LABEL: v_test_v2i16_x_add_neg16_0:
1720; SI:       ; %bb.0:
1721; SI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
1722; SI-NEXT:    s_mov_b32 s7, 0xf000
1723; SI-NEXT:    s_mov_b32 s6, 0
1724; SI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
1725; SI-NEXT:    v_mov_b32_e32 v1, 0
1726; SI-NEXT:    s_waitcnt lgkmcnt(0)
1727; SI-NEXT:    s_mov_b64 s[4:5], s[2:3]
1728; SI-NEXT:    buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
1729; SI-NEXT:    s_mov_b64 s[2:3], s[6:7]
1730; SI-NEXT:    s_waitcnt vmcnt(0)
1731; SI-NEXT:    v_add_i32_e32 v3, vcc, -16, v2
1732; SI-NEXT:    s_mov_b32 s4, 0xffff
1733; SI-NEXT:    v_bfi_b32 v2, s4, v3, v2
1734; SI-NEXT:    buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
1735; SI-NEXT:    s_endpgm
1736;
1737; VI-LABEL: v_test_v2i16_x_add_neg16_0:
1738; VI:       ; %bb.0:
1739; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
1740; VI-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
1741; VI-NEXT:    s_waitcnt lgkmcnt(0)
1742; VI-NEXT:    v_mov_b32_e32 v1, s3
1743; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v2
1744; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
1745; VI-NEXT:    flat_load_dword v3, v[0:1]
1746; VI-NEXT:    v_add_u32_e32 v0, vcc, s0, v2
1747; VI-NEXT:    v_mov_b32_e32 v1, s1
1748; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
1749; VI-NEXT:    s_waitcnt vmcnt(0)
1750; VI-NEXT:    v_and_b32_e32 v2, 0xffff0000, v3
1751; VI-NEXT:    v_add_u16_e32 v3, -16, v3
1752; VI-NEXT:    v_or_b32_e32 v2, v3, v2
1753; VI-NEXT:    flat_store_dword v[0:1], v2
1754; VI-NEXT:    s_endpgm
1755;
1756; GFX9-LABEL: v_test_v2i16_x_add_neg16_0:
1757; GFX9:       ; %bb.0:
1758; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
1759; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
1760; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
1761; GFX9-NEXT:    global_load_dword v1, v0, s[2:3]
1762; GFX9-NEXT:    s_waitcnt vmcnt(0)
1763; GFX9-NEXT:    v_pk_sub_u16 v1, v1, 16
1764; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
1765; GFX9-NEXT:    s_endpgm
1766;
1767; GFX10-LABEL: v_test_v2i16_x_add_neg16_0:
1768; GFX10:       ; %bb.0:
1769; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
1770; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
1771; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
1772; GFX10-NEXT:    global_load_dword v1, v0, s[2:3]
1773; GFX10-NEXT:    s_waitcnt vmcnt(0)
1774; GFX10-NEXT:    v_pk_sub_u16 v1, v1, 16
1775; GFX10-NEXT:    global_store_dword v0, v1, s[0:1]
1776; GFX10-NEXT:    s_endpgm
1777  %tid = call i32 @llvm.amdgcn.workitem.id.x()
1778  %tid.ext = sext i32 %tid to i64
1779  %gep = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in, i64 %tid.ext
1780  %gep.out = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %out, i64 %tid.ext
1781  %x = load <2 x i16>, <2 x i16> addrspace(1)* %gep
1782  %result = add <2 x i16> %x, <i16 -16, i16 0>
1783  store <2 x i16> %result, <2 x i16> addrspace(1)* %gep.out
1784  ret void
1785}
1786
; Adds the splat constant <-15360, -15360> (0xc400 in each 16-bit element) to
; a <2 x i16> loaded at the work-item's index and stores the sum to %out.
; The negated constant is 0x3c00, which is the IEEE half-precision encoding of
; 1.0 and presumably what "fpone" in the name refers to.
; The assertions below expect the default and tonga runs to keep adds with the
; original 0xc400 pattern, the gfx900 run to materialize 0x3c003c00 in an SGPR
; and feed it to v_pk_sub_u16, and the gfx1010 run to use v_pk_sub_u16 with
; the literal 0x3c00 and op_sel_hi:[1,0].
1787define amdgpu_kernel void @v_test_v2i16_x_add_neg_fpone(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in) #0 {
1788; SI-LABEL: v_test_v2i16_x_add_neg_fpone:
1789; SI:       ; %bb.0:
1790; SI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
1791; SI-NEXT:    s_mov_b32 s7, 0xf000
1792; SI-NEXT:    s_mov_b32 s6, 0
1793; SI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
1794; SI-NEXT:    v_mov_b32_e32 v1, 0
1795; SI-NEXT:    s_waitcnt lgkmcnt(0)
1796; SI-NEXT:    s_mov_b64 s[4:5], s[2:3]
1797; SI-NEXT:    buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
1798; SI-NEXT:    s_mov_b64 s[2:3], s[6:7]
1799; SI-NEXT:    s_waitcnt vmcnt(0)
1800; SI-NEXT:    v_add_i32_e32 v3, vcc, 0xffffc400, v2
1801; SI-NEXT:    s_mov_b32 s4, 0xffff0000
1802; SI-NEXT:    v_bfi_b32 v2, s4, v2, v3
1803; SI-NEXT:    v_add_i32_e32 v2, vcc, 0xc4000000, v2
1804; SI-NEXT:    buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
1805; SI-NEXT:    s_endpgm
1806;
1807; VI-LABEL: v_test_v2i16_x_add_neg_fpone:
1808; VI:       ; %bb.0:
1809; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
1810; VI-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
1811; VI-NEXT:    s_waitcnt lgkmcnt(0)
1812; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v2
1813; VI-NEXT:    v_mov_b32_e32 v1, s3
1814; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
1815; VI-NEXT:    flat_load_dword v3, v[0:1]
1816; VI-NEXT:    s_movk_i32 s2, 0xc400
1817; VI-NEXT:    v_mov_b32_e32 v4, s2
1818; VI-NEXT:    v_add_u32_e32 v0, vcc, s0, v2
1819; VI-NEXT:    v_mov_b32_e32 v1, s1
1820; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
1821; VI-NEXT:    s_waitcnt vmcnt(0)
1822; VI-NEXT:    v_add_u16_e32 v2, s2, v3
1823; VI-NEXT:    v_add_u16_sdwa v3, v3, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
1824; VI-NEXT:    v_or_b32_e32 v2, v2, v3
1825; VI-NEXT:    flat_store_dword v[0:1], v2
1826; VI-NEXT:    s_endpgm
1827;
1828; GFX9-LABEL: v_test_v2i16_x_add_neg_fpone:
1829; GFX9:       ; %bb.0:
1830; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
1831; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
1832; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
1833; GFX9-NEXT:    global_load_dword v1, v0, s[2:3]
1834; GFX9-NEXT:    s_mov_b32 s2, 0x3c003c00
1835; GFX9-NEXT:    s_waitcnt vmcnt(0)
1836; GFX9-NEXT:    v_pk_sub_u16 v1, v1, s2
1837; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
1838; GFX9-NEXT:    s_endpgm
1839;
1840; GFX10-LABEL: v_test_v2i16_x_add_neg_fpone:
1841; GFX10:       ; %bb.0:
1842; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
1843; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
1844; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
1845; GFX10-NEXT:    global_load_dword v1, v0, s[2:3]
1846; GFX10-NEXT:    s_waitcnt vmcnt(0)
1847; GFX10-NEXT:    v_pk_sub_u16 v1, v1, 0x3c00 op_sel_hi:[1,0]
1848; GFX10-NEXT:    global_store_dword v0, v1, s[0:1]
1849; GFX10-NEXT:    s_endpgm
1850  %tid = call i32 @llvm.amdgcn.workitem.id.x()
1851  %tid.ext = sext i32 %tid to i64
1852  %gep = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in, i64 %tid.ext
1853  %gep.out = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %out, i64 %tid.ext
1854  %x = load <2 x i16>, <2 x i16> addrspace(1)* %gep
1855  %result = add <2 x i16> %x, <i16 -15360, i16 -15360>
1856  store <2 x i16> %result, <2 x i16> addrspace(1)* %gep.out
1857  ret void
1858}
1859
; Adds the splat constant <17408, 17408> (0x4400 in each 16-bit element) to a
; <2 x i16> loaded at the work-item's index and stores the sum to %out.
; The negated constant is 0xbc00, which is the IEEE half-precision encoding of
; -1.0 and presumably what "negfpone" in the name refers to.
; The assertions below expect the default and tonga runs to keep adds with the
; original 0x4400 pattern, the gfx900 run to materialize 0xbc00bc00 in an SGPR
; and feed it to v_pk_sub_u16, and the gfx1010 run to use v_pk_sub_u16 with
; the literal 0xbc00 and op_sel_hi:[1,0].
1860define amdgpu_kernel void @v_test_v2i16_x_add_neg_negfpone(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in) #0 {
1861; SI-LABEL: v_test_v2i16_x_add_neg_negfpone:
1862; SI:       ; %bb.0:
1863; SI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
1864; SI-NEXT:    s_mov_b32 s7, 0xf000
1865; SI-NEXT:    s_mov_b32 s6, 0
1866; SI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
1867; SI-NEXT:    v_mov_b32_e32 v1, 0
1868; SI-NEXT:    s_waitcnt lgkmcnt(0)
1869; SI-NEXT:    s_mov_b64 s[4:5], s[2:3]
1870; SI-NEXT:    buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
1871; SI-NEXT:    s_mov_b64 s[2:3], s[6:7]
1872; SI-NEXT:    s_waitcnt vmcnt(0)
1873; SI-NEXT:    v_add_i32_e32 v3, vcc, 0x4400, v2
1874; SI-NEXT:    s_mov_b32 s4, 0xffff0000
1875; SI-NEXT:    v_bfi_b32 v2, s4, v2, v3
1876; SI-NEXT:    v_add_i32_e32 v2, vcc, 0x44000000, v2
1877; SI-NEXT:    buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
1878; SI-NEXT:    s_endpgm
1879;
1880; VI-LABEL: v_test_v2i16_x_add_neg_negfpone:
1881; VI:       ; %bb.0:
1882; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
1883; VI-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
1884; VI-NEXT:    s_waitcnt lgkmcnt(0)
1885; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v2
1886; VI-NEXT:    v_mov_b32_e32 v1, s3
1887; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
1888; VI-NEXT:    flat_load_dword v3, v[0:1]
1889; VI-NEXT:    s_movk_i32 s2, 0x4400
1890; VI-NEXT:    v_mov_b32_e32 v4, s2
1891; VI-NEXT:    v_add_u32_e32 v0, vcc, s0, v2
1892; VI-NEXT:    v_mov_b32_e32 v1, s1
1893; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
1894; VI-NEXT:    s_waitcnt vmcnt(0)
1895; VI-NEXT:    v_add_u16_e32 v2, s2, v3
1896; VI-NEXT:    v_add_u16_sdwa v3, v3, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
1897; VI-NEXT:    v_or_b32_e32 v2, v2, v3
1898; VI-NEXT:    flat_store_dword v[0:1], v2
1899; VI-NEXT:    s_endpgm
1900;
1901; GFX9-LABEL: v_test_v2i16_x_add_neg_negfpone:
1902; GFX9:       ; %bb.0:
1903; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
1904; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
1905; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
1906; GFX9-NEXT:    global_load_dword v1, v0, s[2:3]
1907; GFX9-NEXT:    s_mov_b32 s2, 0xbc00bc00
1908; GFX9-NEXT:    s_waitcnt vmcnt(0)
1909; GFX9-NEXT:    v_pk_sub_u16 v1, v1, s2
1910; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
1911; GFX9-NEXT:    s_endpgm
1912;
1913; GFX10-LABEL: v_test_v2i16_x_add_neg_negfpone:
1914; GFX10:       ; %bb.0:
1915; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
1916; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
1917; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
1918; GFX10-NEXT:    global_load_dword v1, v0, s[2:3]
1919; GFX10-NEXT:    s_waitcnt vmcnt(0)
1920; GFX10-NEXT:    v_pk_sub_u16 v1, v1, 0xbc00 op_sel_hi:[1,0]
1921; GFX10-NEXT:    global_store_dword v0, v1, s[0:1]
1922; GFX10-NEXT:    s_endpgm
1923  %tid = call i32 @llvm.amdgcn.workitem.id.x()
1924  %tid.ext = sext i32 %tid to i64
1925  %gep = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in, i64 %tid.ext
1926  %gep.out = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %out, i64 %tid.ext
1927  %x = load <2 x i16>, <2 x i16> addrspace(1)* %gep
1928  %result = add <2 x i16> %x, <i16 17408, i16 17408>
1929  store <2 x i16> %result, <2 x i16> addrspace(1)* %gep.out
1930  ret void
1931}
1932
; Adds the splat constant <16384, 16384> (0x4000 in each 16-bit element, the
; IEEE half-precision encoding of 2.0) to a <2 x i16> loaded at the
; work-item's index and stores the sum to %out.
; The assertions below expect the default and tonga runs to keep adds with the
; 0x4000 pattern (the second add on the default run is printed as 2.0), the
; gfx900 run to materialize 0xc000c000 in an SGPR and feed it to v_pk_sub_u16,
; and the gfx1010 run to use v_pk_sub_u16 with the literal 0xc000 and
; op_sel_hi:[1,0].
; NOTE(review): in the fpone tests the constant named in the function is the
; one that ends up subtracted; here the subtracted value is 0xc000 (-2.0), so
; this name and v_test_v2i16_x_add_neg_negfptwo look swapped - confirm.
1933define amdgpu_kernel void @v_test_v2i16_x_add_neg_fptwo(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in) #0 {
1934; SI-LABEL: v_test_v2i16_x_add_neg_fptwo:
1935; SI:       ; %bb.0:
1936; SI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
1937; SI-NEXT:    s_mov_b32 s7, 0xf000
1938; SI-NEXT:    s_mov_b32 s6, 0
1939; SI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
1940; SI-NEXT:    v_mov_b32_e32 v1, 0
1941; SI-NEXT:    s_waitcnt lgkmcnt(0)
1942; SI-NEXT:    s_mov_b64 s[4:5], s[2:3]
1943; SI-NEXT:    buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
1944; SI-NEXT:    s_mov_b64 s[2:3], s[6:7]
1945; SI-NEXT:    s_waitcnt vmcnt(0)
1946; SI-NEXT:    v_add_i32_e32 v3, vcc, 0x4000, v2
1947; SI-NEXT:    s_mov_b32 s4, 0xffff0000
1948; SI-NEXT:    v_bfi_b32 v2, s4, v2, v3
1949; SI-NEXT:    v_add_i32_e32 v2, vcc, 2.0, v2
1950; SI-NEXT:    buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
1951; SI-NEXT:    s_endpgm
1952;
1953; VI-LABEL: v_test_v2i16_x_add_neg_fptwo:
1954; VI:       ; %bb.0:
1955; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
1956; VI-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
1957; VI-NEXT:    s_waitcnt lgkmcnt(0)
1958; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v2
1959; VI-NEXT:    v_mov_b32_e32 v1, s3
1960; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
1961; VI-NEXT:    flat_load_dword v3, v[0:1]
1962; VI-NEXT:    s_movk_i32 s2, 0x4000
1963; VI-NEXT:    v_mov_b32_e32 v4, s2
1964; VI-NEXT:    v_add_u32_e32 v0, vcc, s0, v2
1965; VI-NEXT:    v_mov_b32_e32 v1, s1
1966; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
1967; VI-NEXT:    s_waitcnt vmcnt(0)
1968; VI-NEXT:    v_add_u16_e32 v2, s2, v3
1969; VI-NEXT:    v_add_u16_sdwa v3, v3, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
1970; VI-NEXT:    v_or_b32_e32 v2, v2, v3
1971; VI-NEXT:    flat_store_dword v[0:1], v2
1972; VI-NEXT:    s_endpgm
1973;
1974; GFX9-LABEL: v_test_v2i16_x_add_neg_fptwo:
1975; GFX9:       ; %bb.0:
1976; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
1977; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
1978; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
1979; GFX9-NEXT:    global_load_dword v1, v0, s[2:3]
1980; GFX9-NEXT:    s_mov_b32 s2, 0xc000c000
1981; GFX9-NEXT:    s_waitcnt vmcnt(0)
1982; GFX9-NEXT:    v_pk_sub_u16 v1, v1, s2
1983; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
1984; GFX9-NEXT:    s_endpgm
1985;
1986; GFX10-LABEL: v_test_v2i16_x_add_neg_fptwo:
1987; GFX10:       ; %bb.0:
1988; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
1989; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
1990; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
1991; GFX10-NEXT:    global_load_dword v1, v0, s[2:3]
1992; GFX10-NEXT:    s_waitcnt vmcnt(0)
1993; GFX10-NEXT:    v_pk_sub_u16 v1, v1, 0xc000 op_sel_hi:[1,0]
1994; GFX10-NEXT:    global_store_dword v0, v1, s[0:1]
1995; GFX10-NEXT:    s_endpgm
1996  %tid = call i32 @llvm.amdgcn.workitem.id.x()
1997  %tid.ext = sext i32 %tid to i64
1998  %gep = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in, i64 %tid.ext
1999  %gep.out = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %out, i64 %tid.ext
2000  %x = load <2 x i16>, <2 x i16> addrspace(1)* %gep
2001  %result = add <2 x i16> %x, <i16 16384, i16 16384>
2002  store <2 x i16> %result, <2 x i16> addrspace(1)* %gep.out
2003  ret void
2004}
2005
; Adds the splat constant <-16384, -16384> (0xc000 in each 16-bit element, the
; IEEE half-precision encoding of -2.0) to a <2 x i16> loaded at the
; work-item's index and stores the sum to %out.
; The assertions below expect the default and tonga runs to keep adds with the
; 0xc000 pattern (the second add on the default run is printed as -2.0), the
; gfx900 run to materialize 0x40004000 in an SGPR and feed it to v_pk_sub_u16,
; and the gfx1010 run to use v_pk_sub_u16 with the literal 0x4000 and
; op_sel_hi:[1,0].
; NOTE(review): in the fpone tests the constant named in the function is the
; one that ends up subtracted; here the subtracted value is 0x4000 (2.0), so
; this name and v_test_v2i16_x_add_neg_fptwo look swapped - confirm.
2006define amdgpu_kernel void @v_test_v2i16_x_add_neg_negfptwo(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in) #0 {
2007; SI-LABEL: v_test_v2i16_x_add_neg_negfptwo:
2008; SI:       ; %bb.0:
2009; SI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
2010; SI-NEXT:    s_mov_b32 s7, 0xf000
2011; SI-NEXT:    s_mov_b32 s6, 0
2012; SI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
2013; SI-NEXT:    v_mov_b32_e32 v1, 0
2014; SI-NEXT:    s_waitcnt lgkmcnt(0)
2015; SI-NEXT:    s_mov_b64 s[4:5], s[2:3]
2016; SI-NEXT:    buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
2017; SI-NEXT:    s_mov_b64 s[2:3], s[6:7]
2018; SI-NEXT:    s_waitcnt vmcnt(0)
2019; SI-NEXT:    v_add_i32_e32 v3, vcc, 0xffffc000, v2
2020; SI-NEXT:    s_mov_b32 s4, 0xffff0000
2021; SI-NEXT:    v_bfi_b32 v2, s4, v2, v3
2022; SI-NEXT:    v_add_i32_e32 v2, vcc, -2.0, v2
2023; SI-NEXT:    buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
2024; SI-NEXT:    s_endpgm
2025;
2026; VI-LABEL: v_test_v2i16_x_add_neg_negfptwo:
2027; VI:       ; %bb.0:
2028; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
2029; VI-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
2030; VI-NEXT:    s_waitcnt lgkmcnt(0)
2031; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v2
2032; VI-NEXT:    v_mov_b32_e32 v1, s3
2033; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
2034; VI-NEXT:    flat_load_dword v3, v[0:1]
2035; VI-NEXT:    s_movk_i32 s2, 0xc000
2036; VI-NEXT:    v_mov_b32_e32 v4, s2
2037; VI-NEXT:    v_add_u32_e32 v0, vcc, s0, v2
2038; VI-NEXT:    v_mov_b32_e32 v1, s1
2039; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
2040; VI-NEXT:    s_waitcnt vmcnt(0)
2041; VI-NEXT:    v_add_u16_e32 v2, s2, v3
2042; VI-NEXT:    v_add_u16_sdwa v3, v3, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
2043; VI-NEXT:    v_or_b32_e32 v2, v2, v3
2044; VI-NEXT:    flat_store_dword v[0:1], v2
2045; VI-NEXT:    s_endpgm
2046;
2047; GFX9-LABEL: v_test_v2i16_x_add_neg_negfptwo:
2048; GFX9:       ; %bb.0:
2049; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
2050; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
2051; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
2052; GFX9-NEXT:    global_load_dword v1, v0, s[2:3]
2053; GFX9-NEXT:    s_mov_b32 s2, 0x40004000
2054; GFX9-NEXT:    s_waitcnt vmcnt(0)
2055; GFX9-NEXT:    v_pk_sub_u16 v1, v1, s2
2056; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
2057; GFX9-NEXT:    s_endpgm
2058;
2059; GFX10-LABEL: v_test_v2i16_x_add_neg_negfptwo:
2060; GFX10:       ; %bb.0:
2061; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
2062; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
2063; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
2064; GFX10-NEXT:    global_load_dword v1, v0, s[2:3]
2065; GFX10-NEXT:    s_waitcnt vmcnt(0)
2066; GFX10-NEXT:    v_pk_sub_u16 v1, v1, 0x4000 op_sel_hi:[1,0]
2067; GFX10-NEXT:    global_store_dword v0, v1, s[0:1]
2068; GFX10-NEXT:    s_endpgm
2069  %tid = call i32 @llvm.amdgcn.workitem.id.x()
2070  %tid.ext = sext i32 %tid to i64
2071  %gep = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in, i64 %tid.ext
2072  %gep.out = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %out, i64 %tid.ext
2073  %x = load <2 x i16>, <2 x i16> addrspace(1)* %gep
2074  %result = add <2 x i16> %x, <i16 -16384, i16 -16384>
2075  store <2 x i16> %result, <2 x i16> addrspace(1)* %gep.out
2076  ret void
2077}
2078
; Adds the constant vector <undef, -32> to a <2 x i16> loaded at the
; work-item's index and stores the sum to %out; the low element of the
; constant is undef, so only the high element has a defined addend.
; The assertions below expect the default run to mask the loaded value with
; 0xffff0000 and add 0xffe00000 (-32 shifted into the high half), the tonga
; run to subtract 32 from the high word via SDWA without merging the low word
; back, and the gfx900 / gfx1010 runs to emit v_pk_sub_u16 with operand 32 and
; op_sel:[0,1] op_sel_hi:[1,0].
2079define amdgpu_kernel void @v_test_v2i16_x_add_undef_neg32(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in) #0 {
2080; SI-LABEL: v_test_v2i16_x_add_undef_neg32:
2081; SI:       ; %bb.0:
2082; SI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
2083; SI-NEXT:    s_mov_b32 s7, 0xf000
2084; SI-NEXT:    s_mov_b32 s6, 0
2085; SI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
2086; SI-NEXT:    v_mov_b32_e32 v1, 0
2087; SI-NEXT:    s_waitcnt lgkmcnt(0)
2088; SI-NEXT:    s_mov_b64 s[4:5], s[2:3]
2089; SI-NEXT:    buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
2090; SI-NEXT:    s_mov_b64 s[2:3], s[6:7]
2091; SI-NEXT:    s_waitcnt vmcnt(0)
2092; SI-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
2093; SI-NEXT:    v_add_i32_e32 v2, vcc, 0xffe00000, v2
2094; SI-NEXT:    buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
2095; SI-NEXT:    s_endpgm
2096;
2097; VI-LABEL: v_test_v2i16_x_add_undef_neg32:
2098; VI:       ; %bb.0:
2099; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
2100; VI-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
2101; VI-NEXT:    s_waitcnt lgkmcnt(0)
2102; VI-NEXT:    v_mov_b32_e32 v1, s3
2103; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v2
2104; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
2105; VI-NEXT:    flat_load_dword v0, v[0:1]
2106; VI-NEXT:    v_mov_b32_e32 v1, 32
2107; VI-NEXT:    v_mov_b32_e32 v3, s1
2108; VI-NEXT:    v_add_u32_e32 v2, vcc, s0, v2
2109; VI-NEXT:    v_addc_u32_e32 v3, vcc, 0, v3, vcc
2110; VI-NEXT:    s_waitcnt vmcnt(0)
2111; VI-NEXT:    v_sub_u16_sdwa v0, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
2112; VI-NEXT:    flat_store_dword v[2:3], v0
2113; VI-NEXT:    s_endpgm
2114;
2115; GFX9-LABEL: v_test_v2i16_x_add_undef_neg32:
2116; GFX9:       ; %bb.0:
2117; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
2118; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
2119; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
2120; GFX9-NEXT:    global_load_dword v1, v0, s[2:3]
2121; GFX9-NEXT:    s_waitcnt vmcnt(0)
2122; GFX9-NEXT:    v_pk_sub_u16 v1, v1, 32 op_sel:[0,1] op_sel_hi:[1,0]
2123; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
2124; GFX9-NEXT:    s_endpgm
2125;
2126; GFX10-LABEL: v_test_v2i16_x_add_undef_neg32:
2127; GFX10:       ; %bb.0:
2128; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
2129; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
2130; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
2131; GFX10-NEXT:    global_load_dword v1, v0, s[2:3]
2132; GFX10-NEXT:    s_waitcnt vmcnt(0)
2133; GFX10-NEXT:    v_pk_sub_u16 v1, v1, 32 op_sel:[0,1] op_sel_hi:[1,0]
2134; GFX10-NEXT:    global_store_dword v0, v1, s[0:1]
2135; GFX10-NEXT:    s_endpgm
2136  %tid = call i32 @llvm.amdgcn.workitem.id.x()
2137  %tid.ext = sext i32 %tid to i64
2138  %gep = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in, i64 %tid.ext
2139  %gep.out = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %out, i64 %tid.ext
2140  %x = load <2 x i16>, <2 x i16> addrspace(1)* %gep
2141  %result = add <2 x i16> %x, <i16 undef, i16 -32>
2142  store <2 x i16> %result, <2 x i16> addrspace(1)* %gep.out
2143  ret void
2144}
2145
; Adds the constant vector <-32, undef> to a <2 x i16> loaded at the
; work-item's index and stores the sum to %out; the high element of the
; constant is undef, so only the low element has a defined addend.
; The assertions below expect the default run to subtract 32 in 32 bits and
; then mask the result with 0xffff, the tonga run to emit a single 16-bit
; v_subrev with 32, and the gfx900 / gfx1010 runs to emit v_pk_sub_u16 with
; operand 32.
2146define amdgpu_kernel void @v_test_v2i16_x_add_neg32_undef(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in) #0 {
2147; SI-LABEL: v_test_v2i16_x_add_neg32_undef:
2148; SI:       ; %bb.0:
2149; SI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
2150; SI-NEXT:    s_mov_b32 s7, 0xf000
2151; SI-NEXT:    s_mov_b32 s6, 0
2152; SI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
2153; SI-NEXT:    v_mov_b32_e32 v1, 0
2154; SI-NEXT:    s_waitcnt lgkmcnt(0)
2155; SI-NEXT:    s_mov_b64 s[4:5], s[2:3]
2156; SI-NEXT:    buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
2157; SI-NEXT:    s_mov_b64 s[2:3], s[6:7]
2158; SI-NEXT:    s_waitcnt vmcnt(0)
2159; SI-NEXT:    v_subrev_i32_e32 v2, vcc, 32, v2
2160; SI-NEXT:    v_and_b32_e32 v2, 0xffff, v2
2161; SI-NEXT:    buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
2162; SI-NEXT:    s_endpgm
2163;
2164; VI-LABEL: v_test_v2i16_x_add_neg32_undef:
2165; VI:       ; %bb.0:
2166; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
2167; VI-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
2168; VI-NEXT:    s_waitcnt lgkmcnt(0)
2169; VI-NEXT:    v_mov_b32_e32 v1, s3
2170; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v2
2171; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
2172; VI-NEXT:    flat_load_dword v0, v[0:1]
2173; VI-NEXT:    v_mov_b32_e32 v3, s1
2174; VI-NEXT:    v_add_u32_e32 v2, vcc, s0, v2
2175; VI-NEXT:    v_addc_u32_e32 v3, vcc, 0, v3, vcc
2176; VI-NEXT:    s_waitcnt vmcnt(0)
2177; VI-NEXT:    v_subrev_u16_e32 v0, 32, v0
2178; VI-NEXT:    flat_store_dword v[2:3], v0
2179; VI-NEXT:    s_endpgm
2180;
2181; GFX9-LABEL: v_test_v2i16_x_add_neg32_undef:
2182; GFX9:       ; %bb.0:
2183; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
2184; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
2185; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
2186; GFX9-NEXT:    global_load_dword v1, v0, s[2:3]
2187; GFX9-NEXT:    s_waitcnt vmcnt(0)
2188; GFX9-NEXT:    v_pk_sub_u16 v1, v1, 32
2189; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
2190; GFX9-NEXT:    s_endpgm
2191;
2192; GFX10-LABEL: v_test_v2i16_x_add_neg32_undef:
2193; GFX10:       ; %bb.0:
2194; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
2195; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
2196; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
2197; GFX10-NEXT:    global_load_dword v1, v0, s[2:3]
2198; GFX10-NEXT:    s_waitcnt vmcnt(0)
2199; GFX10-NEXT:    v_pk_sub_u16 v1, v1, 32
2200; GFX10-NEXT:    global_store_dword v0, v1, s[0:1]
2201; GFX10-NEXT:    s_endpgm
2202  %tid = call i32 @llvm.amdgcn.workitem.id.x()
2203  %tid.ext = sext i32 %tid to i64
2204  %gep = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in, i64 %tid.ext
2205  %gep.out = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %out, i64 %tid.ext
2206  %x = load <2 x i16>, <2 x i16> addrspace(1)* %gep
2207  %result = add <2 x i16> %x, <i16 -32, i16 undef>
2208  store <2 x i16> %result, <2 x i16> addrspace(1)* %gep.out
2209  ret void
2210}
2211
; Intrinsic called by each kernel in this section to obtain the i32 index used
; for addressing its element of %in and %out.
2212declare i32 @llvm.amdgcn.workitem.id.x() #1
2213
; Attribute group #0 is attached to the kernels, #1 to the intrinsic
; declaration above.
2214attributes #0 = { nounwind }
2215attributes #1 = { nounwind readnone }
2216