; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX9 %s
; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,VI,CIVI %s
; RUN: llc -march=amdgcn -mcpu=bonaire -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,CI,CIVI %s
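; Uniform case: both <2 x i16> operands are SGPR kernel arguments. GFX9 uses a
; single v_pk_lshrrev_b16; CI/VI scalarize with s_lshr_b32 and repack the halves.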
define amdgpu_kernel void @s_lshr_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> %lhs, <2 x i16> %rhs) #0 {
; GFX9-LABEL: s_lshr_v2i16:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX9-NEXT:    s_load_dword s4, s[0:1], 0x2c
; GFX9-NEXT:    s_load_dword s0, s[0:1], 0x30
; GFX9-NEXT:    v_mov_b32_e32 v0, 0
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    v_mov_b32_e32 v1, s4
; GFX9-NEXT:    v_pk_lshrrev_b16 v1, s0, v1
; GFX9-NEXT:    global_store_dword v0, v1, s[2:3]
; GFX9-NEXT:    s_endpgm
;
; VI-LABEL: s_lshr_v2i16:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-NEXT:    s_load_dword s5, s[0:1], 0x2c
; VI-NEXT:    s_load_dword s0, s[0:1], 0x30
; VI-NEXT:    s_mov_b32 s4, 0xffff
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    v_mov_b32_e32 v0, s2
; VI-NEXT:    s_lshr_b32 s1, s5, 16
; VI-NEXT:    s_lshr_b32 s6, s0, 16
; VI-NEXT:    s_lshr_b32 s1, s1, s6
; VI-NEXT:    s_and_b32 s5, s5, s4
; VI-NEXT:    s_and_b32 s0, s0, s4
; VI-NEXT:    s_lshr_b32 s0, s5, s0
; VI-NEXT:    s_lshl_b32 s1, s1, 16
; VI-NEXT:    s_or_b32 s0, s0, s1
; VI-NEXT:    v_mov_b32_e32 v1, s3
; VI-NEXT:    v_mov_b32_e32 v2, s0
; VI-NEXT:    flat_store_dword v[0:1], v2
; VI-NEXT:    s_endpgm
;
; CI-LABEL: s_lshr_v2i16:
; CI:       ; %bb.0:
; CI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
; CI-NEXT:    s_load_dword s2, s[0:1], 0xb
; CI-NEXT:    s_load_dword s0, s[0:1], 0xc
; CI-NEXT:    s_mov_b32 s3, 0xffff
; CI-NEXT:    s_mov_b32 s7, 0xf000
; CI-NEXT:    s_mov_b32 s6, -1
; CI-NEXT:    s_waitcnt lgkmcnt(0)
; CI-NEXT:    s_lshr_b32 s1, s2, 16
; CI-NEXT:    s_lshr_b32 s8, s0, 16
; CI-NEXT:    s_lshr_b32 s1, s1, s8
; CI-NEXT:    s_and_b32 s2, s2, s3
; CI-NEXT:    s_and_b32 s0, s0, s3
; CI-NEXT:    s_lshr_b32 s0, s2, s0
; CI-NEXT:    s_lshl_b32 s1, s1, 16
; CI-NEXT:    s_or_b32 s0, s0, s1
; CI-NEXT:    v_mov_b32_e32 v0, s0
; CI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
; CI-NEXT:    s_endpgm
  %result = lshr <2 x i16> %lhs, %rhs
  store <2 x i16> %result, <2 x i16> addrspace(1)* %out
  ret void
}

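; Divergent case: both operands are loaded per work-item, so the value and the
; shift amount live in VGPRs.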
define amdgpu_kernel void @v_lshr_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in) #0 {
; GFX9-LABEL: v_lshr_v2i16:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    global_load_dword v1, v0, s[2:3]
; GFX9-NEXT:    global_load_dword v2, v0, s[2:3] offset:4
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    v_pk_lshrrev_b16 v1, v2, v1
; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
; GFX9-NEXT:    s_endpgm
;
; VI-LABEL: v_lshr_v2i16:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
; VI-NEXT:    v_lshlrev_b32_e32 v4, 2, v0
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    v_mov_b32_e32 v1, s3
; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v4
; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT:    v_add_u32_e32 v2, vcc, 4, v0
; VI-NEXT:    v_addc_u32_e32 v3, vcc, 0, v1, vcc
; VI-NEXT:    flat_load_dword v5, v[0:1]
; VI-NEXT:    flat_load_dword v2, v[2:3]
; VI-NEXT:    v_mov_b32_e32 v1, s1
; VI-NEXT:    v_add_u32_e32 v0, vcc, s0, v4
; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    v_lshrrev_b16_e32 v3, v2, v5
; VI-NEXT:    v_lshrrev_b16_sdwa v2, v2, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
; VI-NEXT:    v_or_b32_e32 v2, v3, v2
; VI-NEXT:    flat_store_dword v[0:1], v2
; VI-NEXT:    s_endpgm
;
; CI-LABEL: v_lshr_v2i16:
; CI:       ; %bb.0:
; CI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
; CI-NEXT:    s_mov_b32 s3, 0xf000
; CI-NEXT:    s_mov_b32 s2, 0
; CI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; CI-NEXT:    v_mov_b32_e32 v1, 0
; CI-NEXT:    s_waitcnt lgkmcnt(0)
; CI-NEXT:    s_mov_b64 s[0:1], s[6:7]
; CI-NEXT:    buffer_load_dword v2, v[0:1], s[0:3], 0 addr64
; CI-NEXT:    buffer_load_dword v3, v[0:1], s[0:3], 0 addr64 offset:4
; CI-NEXT:    s_mov_b32 s0, 0xffff
; CI-NEXT:    s_mov_b64 s[6:7], s[2:3]
; CI-NEXT:    s_waitcnt vmcnt(1)
; CI-NEXT:    v_lshrrev_b32_e32 v4, 16, v2
; CI-NEXT:    s_waitcnt vmcnt(0)
; CI-NEXT:    v_lshrrev_b32_e32 v5, 16, v3
; CI-NEXT:    v_and_b32_e32 v2, s0, v2
; CI-NEXT:    v_and_b32_e32 v3, s0, v3
; CI-NEXT:    v_lshr_b32_e32 v2, v2, v3
; CI-NEXT:    v_lshr_b32_e32 v3, v4, v5
; CI-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
; CI-NEXT:    v_or_b32_e32 v2, v2, v3
; CI-NEXT:    buffer_store_dword v2, v[0:1], s[4:7], 0 addr64
; CI-NEXT:    s_endpgm
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %in.gep = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in, i64 %tid.ext
  %out.gep = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %out, i64 %tid.ext
  %b_ptr = getelementptr <2 x i16>, <2 x i16> addrspace(1)* %in.gep, i32 1
  %a = load <2 x i16>, <2 x i16> addrspace(1)* %in.gep
  %b = load <2 x i16>, <2 x i16> addrspace(1)* %b_ptr
  %result = lshr <2 x i16> %a, %b
  store <2 x i16> %result, <2 x i16> addrspace(1)* %out.gep
  ret void
}

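; VGPR value shifted by a uniform (SGPR) <2 x i16> shift amount.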
define amdgpu_kernel void @lshr_v_s_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in, <2 x i16> %sgpr) #0 {
; GFX9-LABEL: lshr_v_s_v2i16:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT:    s_load_dword s0, s[0:1], 0x34
; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    global_load_dword v1, v0, s[6:7]
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    v_pk_lshrrev_b16 v1, s0, v1
; GFX9-NEXT:    global_store_dword v0, v1, s[4:5]
; GFX9-NEXT:    s_endpgm
;
; VI-LABEL: lshr_v_s_v2i16:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-NEXT:    s_load_dword s0, s[0:1], 0x34
; VI-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    v_mov_b32_e32 v1, s7
; VI-NEXT:    v_add_u32_e32 v0, vcc, s6, v2
; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT:    flat_load_dword v3, v[0:1]
; VI-NEXT:    s_lshr_b32 s1, s0, 16
; VI-NEXT:    v_add_u32_e32 v0, vcc, s4, v2
; VI-NEXT:    v_mov_b32_e32 v2, s1
; VI-NEXT:    v_mov_b32_e32 v1, s5
; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    v_lshrrev_b16_e32 v4, s0, v3
; VI-NEXT:    v_lshrrev_b16_sdwa v2, v2, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
; VI-NEXT:    v_or_b32_e32 v2, v4, v2
; VI-NEXT:    flat_store_dword v[0:1], v2
; VI-NEXT:    s_endpgm
;
; CI-LABEL: lshr_v_s_v2i16:
; CI:       ; %bb.0:
; CI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
; CI-NEXT:    s_load_dword s8, s[0:1], 0xd
; CI-NEXT:    s_mov_b32 s3, 0xf000
; CI-NEXT:    s_mov_b32 s2, 0
; CI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; CI-NEXT:    s_waitcnt lgkmcnt(0)
; CI-NEXT:    s_mov_b64 s[0:1], s[6:7]
; CI-NEXT:    v_mov_b32_e32 v1, 0
; CI-NEXT:    buffer_load_dword v2, v[0:1], s[0:3], 0 addr64
; CI-NEXT:    s_mov_b32 s0, 0xffff
; CI-NEXT:    s_lshr_b32 s1, s8, 16
; CI-NEXT:    s_and_b32 s8, s8, s0
; CI-NEXT:    s_mov_b64 s[6:7], s[2:3]
; CI-NEXT:    s_waitcnt vmcnt(0)
; CI-NEXT:    v_lshrrev_b32_e32 v3, 16, v2
; CI-NEXT:    v_and_b32_e32 v2, s0, v2
; CI-NEXT:    v_lshrrev_b32_e32 v3, s1, v3
; CI-NEXT:    v_lshrrev_b32_e32 v2, s8, v2
; CI-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
; CI-NEXT:    v_or_b32_e32 v2, v2, v3
; CI-NEXT:    buffer_store_dword v2, v[0:1], s[4:7], 0 addr64
; CI-NEXT:    s_endpgm
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %in.gep = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in, i64 %tid.ext
  %out.gep = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %out, i64 %tid.ext
  %vgpr = load <2 x i16>, <2 x i16> addrspace(1)* %in.gep
  %result = lshr <2 x i16> %vgpr, %sgpr
  store <2 x i16> %result, <2 x i16> addrspace(1)* %out.gep
  ret void
}

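; Uniform (SGPR) value shifted by a divergent (VGPR) shift amount.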
define amdgpu_kernel void @lshr_s_v_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in, <2 x i16> %sgpr) #0 {
; GFX9-LABEL: lshr_s_v_v2i16:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT:    s_load_dword s0, s[0:1], 0x34
; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    global_load_dword v1, v0, s[6:7]
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    v_pk_lshrrev_b16 v1, v1, s0
; GFX9-NEXT:    global_store_dword v0, v1, s[4:5]
; GFX9-NEXT:    s_endpgm
;
; VI-LABEL: lshr_s_v_v2i16:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-NEXT:    s_load_dword s0, s[0:1], 0x34
; VI-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    v_mov_b32_e32 v1, s7
; VI-NEXT:    v_add_u32_e32 v0, vcc, s6, v2
; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT:    flat_load_dword v3, v[0:1]
; VI-NEXT:    s_lshr_b32 s1, s0, 16
; VI-NEXT:    v_add_u32_e32 v0, vcc, s4, v2
; VI-NEXT:    v_mov_b32_e32 v2, s1
; VI-NEXT:    v_mov_b32_e32 v1, s5
; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    v_lshrrev_b16_e64 v4, v3, s0
; VI-NEXT:    v_lshrrev_b16_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; VI-NEXT:    v_or_b32_e32 v2, v4, v2
; VI-NEXT:    flat_store_dword v[0:1], v2
; VI-NEXT:    s_endpgm
;
; CI-LABEL: lshr_s_v_v2i16:
; CI:       ; %bb.0:
; CI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
; CI-NEXT:    s_load_dword s8, s[0:1], 0xd
; CI-NEXT:    s_mov_b32 s3, 0xf000
; CI-NEXT:    s_mov_b32 s2, 0
; CI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; CI-NEXT:    s_waitcnt lgkmcnt(0)
; CI-NEXT:    s_mov_b64 s[0:1], s[6:7]
; CI-NEXT:    v_mov_b32_e32 v1, 0
; CI-NEXT:    buffer_load_dword v2, v[0:1], s[0:3], 0 addr64
; CI-NEXT:    s_mov_b32 s0, 0xffff
; CI-NEXT:    s_lshr_b32 s1, s8, 16
; CI-NEXT:    s_and_b32 s8, s8, s0
; CI-NEXT:    s_mov_b64 s[6:7], s[2:3]
; CI-NEXT:    s_waitcnt vmcnt(0)
; CI-NEXT:    v_lshrrev_b32_e32 v3, 16, v2
; CI-NEXT:    v_and_b32_e32 v2, s0, v2
; CI-NEXT:    v_lshr_b32_e32 v3, s1, v3
; CI-NEXT:    v_lshr_b32_e32 v2, s8, v2
; CI-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
; CI-NEXT:    v_or_b32_e32 v2, v2, v3
; CI-NEXT:    buffer_store_dword v2, v[0:1], s[4:7], 0 addr64
; CI-NEXT:    s_endpgm
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %in.gep = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in, i64 %tid.ext
  %out.gep = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %out, i64 %tid.ext
  %vgpr = load <2 x i16>, <2 x i16> addrspace(1)* %in.gep
  %result = lshr <2 x i16> %sgpr, %vgpr
  store <2 x i16> %result, <2 x i16> addrspace(1)* %out.gep
  ret void
}

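; Splat immediate <8, 8> shifted right by a per-work-item amount.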
define amdgpu_kernel void @lshr_imm_v_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in) #0 {
; GFX9-LABEL: lshr_imm_v_v2i16:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    global_load_dword v1, v0, s[2:3]
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    v_pk_lshrrev_b16 v1, v1, 8 op_sel_hi:[1,0]
; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
; GFX9-NEXT:    s_endpgm
;
; VI-LABEL: lshr_imm_v_v2i16:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
; VI-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
; VI-NEXT:    v_mov_b32_e32 v4, 8
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    v_mov_b32_e32 v1, s3
; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v2
; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT:    flat_load_dword v3, v[0:1]
; VI-NEXT:    v_add_u32_e32 v0, vcc, s0, v2
; VI-NEXT:    v_mov_b32_e32 v1, s1
; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    v_lshrrev_b16_e64 v2, v3, 8
; VI-NEXT:    v_lshrrev_b16_sdwa v3, v3, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; VI-NEXT:    v_or_b32_e32 v2, v2, v3
; VI-NEXT:    flat_store_dword v[0:1], v2
; VI-NEXT:    s_endpgm
;
; CI-LABEL: lshr_imm_v_v2i16:
; CI:       ; %bb.0:
; CI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
; CI-NEXT:    s_mov_b32 s3, 0xf000
; CI-NEXT:    s_mov_b32 s2, 0
; CI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; CI-NEXT:    v_mov_b32_e32 v1, 0
; CI-NEXT:    s_waitcnt lgkmcnt(0)
; CI-NEXT:    s_mov_b64 s[0:1], s[6:7]
; CI-NEXT:    buffer_load_dword v2, v[0:1], s[0:3], 0 addr64
; CI-NEXT:    s_mov_b64 s[6:7], s[2:3]
; CI-NEXT:    s_waitcnt vmcnt(0)
; CI-NEXT:    v_lshrrev_b32_e32 v3, 16, v2
; CI-NEXT:    v_and_b32_e32 v2, 0xffff, v2
; CI-NEXT:    v_lshr_b32_e32 v3, 8, v3
; CI-NEXT:    v_lshr_b32_e32 v2, 8, v2
; CI-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
; CI-NEXT:    v_or_b32_e32 v2, v2, v3
; CI-NEXT:    buffer_store_dword v2, v[0:1], s[4:7], 0 addr64
; CI-NEXT:    s_endpgm
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %in.gep = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in, i64 %tid.ext
  %out.gep = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %out, i64 %tid.ext
  %vgpr = load <2 x i16>, <2 x i16> addrspace(1)* %in.gep
  %result = lshr <2 x i16> <i16 8, i16 8>, %vgpr
  store <2 x i16> %result, <2 x i16> addrspace(1)* %out.gep
  ret void
}

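; Per-work-item value shifted right by the splat immediate 8; on CI this folds
; to a single 32-bit shift plus a 0xff00ff mask.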
define amdgpu_kernel void @lshr_v_imm_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in) #0 {
; GFX9-LABEL: lshr_v_imm_v2i16:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    global_load_dword v1, v0, s[2:3]
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    v_pk_lshrrev_b16 v1, 8, v1 op_sel_hi:[0,1]
; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
; GFX9-NEXT:    s_endpgm
;
; VI-LABEL: lshr_v_imm_v2i16:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
; VI-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    v_mov_b32_e32 v1, s3
; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v2
; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT:    flat_load_dword v0, v[0:1]
; VI-NEXT:    v_mov_b32_e32 v3, s1
; VI-NEXT:    v_add_u32_e32 v2, vcc, s0, v2
; VI-NEXT:    v_addc_u32_e32 v3, vcc, 0, v3, vcc
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    v_lshrrev_b32_e32 v1, 24, v0
; VI-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
; VI-NEXT:    v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD
; VI-NEXT:    flat_store_dword v[2:3], v0
; VI-NEXT:    s_endpgm
;
; CI-LABEL: lshr_v_imm_v2i16:
; CI:       ; %bb.0:
; CI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
; CI-NEXT:    s_mov_b32 s3, 0xf000
; CI-NEXT:    s_mov_b32 s2, 0
; CI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; CI-NEXT:    v_mov_b32_e32 v1, 0
; CI-NEXT:    s_waitcnt lgkmcnt(0)
; CI-NEXT:    s_mov_b64 s[0:1], s[6:7]
; CI-NEXT:    buffer_load_dword v2, v[0:1], s[0:3], 0 addr64
; CI-NEXT:    s_mov_b64 s[6:7], s[2:3]
; CI-NEXT:    s_waitcnt vmcnt(0)
; CI-NEXT:    v_lshrrev_b32_e32 v2, 8, v2
; CI-NEXT:    v_and_b32_e32 v2, 0xff00ff, v2
; CI-NEXT:    buffer_store_dword v2, v[0:1], s[4:7], 0 addr64
; CI-NEXT:    s_endpgm
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %in.gep = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in, i64 %tid.ext
  %out.gep = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %out, i64 %tid.ext
  %vgpr = load <2 x i16>, <2 x i16> addrspace(1)* %in.gep
  %result = lshr <2 x i16> %vgpr, <i16 8, i16 8>
  store <2 x i16> %result, <2 x i16> addrspace(1)* %out.gep
  ret void
}

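; <4 x i16> variant with both operands loaded per work-item; GFX9 emits one
; v_pk_lshrrev_b16 per 32-bit half.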
define amdgpu_kernel void @v_lshr_v4i16(<4 x i16> addrspace(1)* %out, <4 x i16> addrspace(1)* %in) #0 {
; GFX9-LABEL: v_lshr_v4i16:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
; GFX9-NEXT:    v_lshlrev_b32_e32 v4, 3, v0
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    global_load_dwordx2 v[0:1], v4, s[2:3]
; GFX9-NEXT:    global_load_dwordx2 v[2:3], v4, s[2:3] offset:8
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    v_pk_lshrrev_b16 v1, v3, v1
; GFX9-NEXT:    v_pk_lshrrev_b16 v0, v2, v0
; GFX9-NEXT:    global_store_dwordx2 v4, v[0:1], s[0:1]
; GFX9-NEXT:    s_endpgm
;
; VI-LABEL: v_lshr_v4i16:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
; VI-NEXT:    v_lshlrev_b32_e32 v4, 3, v0
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    v_mov_b32_e32 v1, s3
; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v4
; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT:    v_add_u32_e32 v2, vcc, 8, v0
; VI-NEXT:    v_addc_u32_e32 v3, vcc, 0, v1, vcc
; VI-NEXT:    flat_load_dwordx2 v[0:1], v[0:1]
; VI-NEXT:    flat_load_dwordx2 v[2:3], v[2:3]
; VI-NEXT:    v_mov_b32_e32 v5, s1
; VI-NEXT:    v_add_u32_e32 v4, vcc, s0, v4
; VI-NEXT:    v_addc_u32_e32 v5, vcc, 0, v5, vcc
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    v_lshrrev_b16_e32 v6, v3, v1
; VI-NEXT:    v_lshrrev_b16_sdwa v1, v3, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
; VI-NEXT:    v_lshrrev_b16_e32 v3, v2, v0
; VI-NEXT:    v_lshrrev_b16_sdwa v0, v2, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
; VI-NEXT:    v_or_b32_e32 v1, v6, v1
; VI-NEXT:    v_or_b32_e32 v0, v3, v0
; VI-NEXT:    flat_store_dwordx2 v[4:5], v[0:1]
; VI-NEXT:    s_endpgm
;
; CI-LABEL: v_lshr_v4i16:
; CI:       ; %bb.0:
; CI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
; CI-NEXT:    s_mov_b32 s3, 0xf000
; CI-NEXT:    s_mov_b32 s2, 0
; CI-NEXT:    v_lshlrev_b32_e32 v0, 3, v0
; CI-NEXT:    v_mov_b32_e32 v1, 0
; CI-NEXT:    s_waitcnt lgkmcnt(0)
; CI-NEXT:    s_mov_b64 s[0:1], s[6:7]
; CI-NEXT:    buffer_load_dwordx2 v[2:3], v[0:1], s[0:3], 0 addr64
; CI-NEXT:    buffer_load_dwordx2 v[4:5], v[0:1], s[0:3], 0 addr64 offset:8
; CI-NEXT:    s_mov_b32 s0, 0xffff
; CI-NEXT:    s_mov_b64 s[6:7], s[2:3]
; CI-NEXT:    s_waitcnt vmcnt(1)
; CI-NEXT:    v_lshrrev_b32_e32 v6, 16, v2
; CI-NEXT:    v_lshrrev_b32_e32 v7, 16, v3
; CI-NEXT:    s_waitcnt vmcnt(0)
; CI-NEXT:    v_lshrrev_b32_e32 v8, 16, v4
; CI-NEXT:    v_lshrrev_b32_e32 v9, 16, v5
; CI-NEXT:    v_and_b32_e32 v2, s0, v2
; CI-NEXT:    v_and_b32_e32 v4, s0, v4
; CI-NEXT:    v_and_b32_e32 v3, s0, v3
; CI-NEXT:    v_and_b32_e32 v5, s0, v5
; CI-NEXT:    v_lshr_b32_e32 v3, v3, v5
; CI-NEXT:    v_lshr_b32_e32 v5, v7, v9
; CI-NEXT:    v_lshr_b32_e32 v2, v2, v4
; CI-NEXT:    v_lshr_b32_e32 v4, v6, v8
; CI-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
; CI-NEXT:    v_lshlrev_b32_e32 v4, 16, v4
; CI-NEXT:    v_or_b32_e32 v3, v3, v5
; CI-NEXT:    v_or_b32_e32 v2, v2, v4
; CI-NEXT:    buffer_store_dwordx2 v[2:3], v[0:1], s[4:7], 0 addr64
; CI-NEXT:    s_endpgm
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %in.gep = getelementptr inbounds <4 x i16>, <4 x i16> addrspace(1)* %in, i64 %tid.ext
  %out.gep = getelementptr inbounds <4 x i16>, <4 x i16> addrspace(1)* %out, i64 %tid.ext
  %b_ptr = getelementptr <4 x i16>, <4 x i16> addrspace(1)* %in.gep, i32 1
  %a = load <4 x i16>, <4 x i16> addrspace(1)* %in.gep
  %b = load <4 x i16>, <4 x i16> addrspace(1)* %b_ptr
  %result = lshr <4 x i16> %a, %b
  store <4 x i16> %result, <4 x i16> addrspace(1)* %out.gep
  ret void
}

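; <4 x i16> value shifted right by the splat immediate 8.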
define amdgpu_kernel void @lshr_v_imm_v4i16(<4 x i16> addrspace(1)* %out, <4 x i16> addrspace(1)* %in) #0 {
; GFX9-LABEL: lshr_v_imm_v4i16:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
; GFX9-NEXT:    v_lshlrev_b32_e32 v2, 3, v0
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    global_load_dwordx2 v[0:1], v2, s[2:3]
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    v_pk_lshrrev_b16 v1, 8, v1 op_sel_hi:[0,1]
; GFX9-NEXT:    v_pk_lshrrev_b16 v0, 8, v0 op_sel_hi:[0,1]
; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
; GFX9-NEXT:    s_endpgm
;
; VI-LABEL: lshr_v_imm_v4i16:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
; VI-NEXT:    v_lshlrev_b32_e32 v2, 3, v0
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    v_mov_b32_e32 v1, s3
; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v2
; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT:    flat_load_dwordx2 v[0:1], v[0:1]
; VI-NEXT:    v_mov_b32_e32 v3, s1
; VI-NEXT:    v_add_u32_e32 v2, vcc, s0, v2
; VI-NEXT:    v_addc_u32_e32 v3, vcc, 0, v3, vcc
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    v_lshrrev_b32_e32 v4, 24, v1
; VI-NEXT:    v_lshrrev_b32_e32 v5, 24, v0
; VI-NEXT:    v_lshlrev_b32_e32 v4, 16, v4
; VI-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
; VI-NEXT:    v_or_b32_sdwa v1, v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD
; VI-NEXT:    v_or_b32_sdwa v0, v0, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD
; VI-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
; VI-NEXT:    s_endpgm
;
; CI-LABEL: lshr_v_imm_v4i16:
; CI:       ; %bb.0:
; CI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
; CI-NEXT:    s_mov_b32 s3, 0xf000
; CI-NEXT:    s_mov_b32 s2, 0
; CI-NEXT:    v_lshlrev_b32_e32 v0, 3, v0
; CI-NEXT:    v_mov_b32_e32 v1, 0
; CI-NEXT:    s_waitcnt lgkmcnt(0)
; CI-NEXT:    s_mov_b64 s[0:1], s[6:7]
; CI-NEXT:    buffer_load_dwordx2 v[2:3], v[0:1], s[0:3], 0 addr64
; CI-NEXT:    s_mov_b32 s0, 0xff00ff
; CI-NEXT:    s_mov_b64 s[6:7], s[2:3]
; CI-NEXT:    s_waitcnt vmcnt(0)
; CI-NEXT:    v_lshrrev_b32_e32 v3, 8, v3
; CI-NEXT:    v_lshrrev_b32_e32 v2, 8, v2
; CI-NEXT:    v_and_b32_e32 v3, s0, v3
; CI-NEXT:    v_and_b32_e32 v2, s0, v2
; CI-NEXT:    buffer_store_dwordx2 v[2:3], v[0:1], s[4:7], 0 addr64
; CI-NEXT:    s_endpgm
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %in.gep = getelementptr inbounds <4 x i16>, <4 x i16> addrspace(1)* %in, i64 %tid.ext
  %out.gep = getelementptr inbounds <4 x i16>, <4 x i16> addrspace(1)* %out, i64 %tid.ext
  %vgpr = load <4 x i16>, <4 x i16> addrspace(1)* %in.gep
  %result = lshr <4 x i16> %vgpr, <i16 8, i16 8, i16 8, i16 8>
  store <4 x i16> %result, <4 x i16> addrspace(1)* %out.gep
  ret void
}

declare i32 @llvm.amdgcn.workitem.id.x() #1

attributes #0 = { nounwind }
attributes #1 = { nounwind readnone }