1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2; RUN: llc -march=amdgcn -mcpu=gfx900 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX9 %s
3; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,CIVI,VI %s
4; RUN: llc -march=amdgcn -mcpu=bonaire -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,CIVI,CI %s
5
; s_shl_v2i16: shl of <2 x i16> where both operands are kernel arguments.
; The gfx900 run expects a single packed v_pk_lshlrev_b16. The tonga and
; bonaire runs expect each 16-bit half to be shifted separately with
; s_lshl_b32 and the halves recombined with s_and_b32 / s_or_b32.
6define amdgpu_kernel void @s_shl_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> %lhs, <2 x i16> %rhs) #0 {
7; GFX9-LABEL: s_shl_v2i16:
8; GFX9:       ; %bb.0:
9; GFX9-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x24
10; GFX9-NEXT:    s_load_dword s2, s[0:1], 0x2c
11; GFX9-NEXT:    s_load_dword s0, s[0:1], 0x30
12; GFX9-NEXT:    s_mov_b32 s7, 0xf000
13; GFX9-NEXT:    s_mov_b32 s6, -1
14; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
15; GFX9-NEXT:    v_mov_b32_e32 v0, s2
16; GFX9-NEXT:    v_pk_lshlrev_b16 v0, s0, v0
17; GFX9-NEXT:    buffer_store_dword v0, off, s[4:7], 0
18; GFX9-NEXT:    s_endpgm
19;
20; VI-LABEL: s_shl_v2i16:
21; VI:       ; %bb.0:
22; VI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x24
23; VI-NEXT:    s_load_dword s2, s[0:1], 0x2c
24; VI-NEXT:    s_load_dword s0, s[0:1], 0x30
25; VI-NEXT:    s_mov_b32 s3, 0xffff
26; VI-NEXT:    s_mov_b32 s7, 0xf000
27; VI-NEXT:    s_mov_b32 s6, -1
28; VI-NEXT:    s_waitcnt lgkmcnt(0)
29; VI-NEXT:    s_lshr_b32 s1, s2, 16
30; VI-NEXT:    s_lshr_b32 s8, s0, 16
31; VI-NEXT:    s_and_b32 s2, s2, s3
32; VI-NEXT:    s_and_b32 s0, s0, s3
33; VI-NEXT:    s_lshl_b32 s0, s2, s0
34; VI-NEXT:    s_lshl_b32 s1, s1, s8
35; VI-NEXT:    s_lshl_b32 s1, s1, 16
36; VI-NEXT:    s_and_b32 s0, s0, s3
37; VI-NEXT:    s_or_b32 s0, s0, s1
38; VI-NEXT:    v_mov_b32_e32 v0, s0
39; VI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
40; VI-NEXT:    s_endpgm
41;
42; CI-LABEL: s_shl_v2i16:
43; CI:       ; %bb.0:
44; CI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
45; CI-NEXT:    s_load_dword s2, s[0:1], 0xb
46; CI-NEXT:    s_load_dword s0, s[0:1], 0xc
47; CI-NEXT:    s_mov_b32 s3, 0xffff
48; CI-NEXT:    s_mov_b32 s7, 0xf000
49; CI-NEXT:    s_mov_b32 s6, -1
50; CI-NEXT:    s_waitcnt lgkmcnt(0)
51; CI-NEXT:    s_lshr_b32 s1, s2, 16
52; CI-NEXT:    s_and_b32 s8, s0, s3
53; CI-NEXT:    s_lshr_b32 s0, s0, 16
54; CI-NEXT:    s_lshl_b32 s0, s1, s0
55; CI-NEXT:    s_lshl_b32 s1, s2, s8
56; CI-NEXT:    s_lshl_b32 s0, s0, 16
57; CI-NEXT:    s_and_b32 s1, s1, s3
58; CI-NEXT:    s_or_b32 s0, s1, s0
59; CI-NEXT:    v_mov_b32_e32 v0, s0
60; CI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
61; CI-NEXT:    s_endpgm
  ; Element-wise shift: each i16 of %lhs is shifted left by the matching i16 of %rhs.
62  %result = shl <2 x i16> %lhs, %rhs
63  store <2 x i16> %result, <2 x i16> addrspace(1)* %out
64  ret void
65}
66
; v_shl_v2i16: shl of <2 x i16> where both operands are loaded from global
; memory at an address indexed by the workitem id. The gfx900 run expects one
; v_pk_lshlrev_b16. The tonga run expects a 16-bit v_lshlrev_b16 for the low
; half plus an SDWA v_lshlrev_b16 for the high half, joined with v_or_b32.
; The bonaire run expects 32-bit v_lshl_b32 shifts on the split halves
; followed by mask and or.
67define amdgpu_kernel void @v_shl_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in) #0 {
68; GFX9-LABEL: v_shl_v2i16:
69; GFX9:       ; %bb.0:
70; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
71; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
72; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
73; GFX9-NEXT:    global_load_dword v1, v0, s[2:3]
74; GFX9-NEXT:    global_load_dword v2, v0, s[2:3] offset:4
75; GFX9-NEXT:    s_waitcnt vmcnt(0)
76; GFX9-NEXT:    v_pk_lshlrev_b16 v1, v2, v1
77; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
78; GFX9-NEXT:    s_endpgm
79;
80; VI-LABEL: v_shl_v2i16:
81; VI:       ; %bb.0:
82; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
83; VI-NEXT:    v_lshlrev_b32_e32 v4, 2, v0
84; VI-NEXT:    s_waitcnt lgkmcnt(0)
85; VI-NEXT:    v_mov_b32_e32 v1, s3
86; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v4
87; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
88; VI-NEXT:    v_add_u32_e32 v2, vcc, 4, v0
89; VI-NEXT:    v_addc_u32_e32 v3, vcc, 0, v1, vcc
90; VI-NEXT:    flat_load_dword v5, v[0:1]
91; VI-NEXT:    flat_load_dword v2, v[2:3]
92; VI-NEXT:    v_mov_b32_e32 v1, s1
93; VI-NEXT:    v_add_u32_e32 v0, vcc, s0, v4
94; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
95; VI-NEXT:    s_waitcnt vmcnt(0)
96; VI-NEXT:    v_lshlrev_b16_e32 v3, v2, v5
97; VI-NEXT:    v_lshlrev_b16_sdwa v2, v2, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
98; VI-NEXT:    v_or_b32_e32 v2, v3, v2
99; VI-NEXT:    flat_store_dword v[0:1], v2
100; VI-NEXT:    s_endpgm
101;
102; CI-LABEL: v_shl_v2i16:
103; CI:       ; %bb.0:
104; CI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
105; CI-NEXT:    s_mov_b32 s3, 0xf000
106; CI-NEXT:    s_mov_b32 s2, 0
107; CI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
108; CI-NEXT:    v_mov_b32_e32 v1, 0
109; CI-NEXT:    s_waitcnt lgkmcnt(0)
110; CI-NEXT:    s_mov_b64 s[0:1], s[6:7]
111; CI-NEXT:    buffer_load_dword v2, v[0:1], s[0:3], 0 addr64
112; CI-NEXT:    buffer_load_dword v3, v[0:1], s[0:3], 0 addr64 offset:4
113; CI-NEXT:    s_mov_b32 s0, 0xffff
114; CI-NEXT:    s_mov_b64 s[6:7], s[2:3]
115; CI-NEXT:    s_waitcnt vmcnt(1)
116; CI-NEXT:    v_lshrrev_b32_e32 v4, 16, v2
117; CI-NEXT:    s_waitcnt vmcnt(0)
118; CI-NEXT:    v_and_b32_e32 v5, s0, v3
119; CI-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
120; CI-NEXT:    v_lshl_b32_e32 v3, v4, v3
121; CI-NEXT:    v_lshl_b32_e32 v2, v2, v5
122; CI-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
123; CI-NEXT:    v_and_b32_e32 v2, s0, v2
124; CI-NEXT:    v_or_b32_e32 v2, v2, v3
125; CI-NEXT:    buffer_store_dword v2, v[0:1], s[4:7], 0 addr64
126; CI-NEXT:    s_endpgm
  ; Per-workitem element index into %in and %out.
127  %tid = call i32 @llvm.amdgcn.workitem.id.x()
128  %tid.ext = sext i32 %tid to i64
129  %in.gep = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in, i64 %tid.ext
130  %out.gep = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %out, i64 %tid.ext
  ; The shift amounts are the <2 x i16> element immediately after the shifted
  ; value (the second load at offset:4 in the expected output).
131  %b_ptr = getelementptr <2 x i16>, <2 x i16> addrspace(1)* %in.gep, i32 1
132  %a = load <2 x i16>, <2 x i16> addrspace(1)* %in.gep
133  %b = load <2 x i16>, <2 x i16> addrspace(1)* %b_ptr
134  %result = shl <2 x i16> %a, %b
135  store <2 x i16> %result, <2 x i16> addrspace(1)* %out.gep
136  ret void
137}
138
; shl_v_s_v2i16: the shifted value is loaded from global memory and the shift
; amounts come from the kernel argument %sgpr. The gfx900 run expects
; v_pk_lshlrev_b16 with the scalar register as the shift-amount operand. The
; tonga run expects the high-half amount to be extracted with s_lshr_b32 and
; copied into a vector register for the SDWA shift. The bonaire run expects
; 32-bit v_lshlrev_b32 shifts with scalar amounts.
139define amdgpu_kernel void @shl_v_s_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in, <2 x i16> %sgpr) #0 {
140; GFX9-LABEL: shl_v_s_v2i16:
141; GFX9:       ; %bb.0:
142; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
143; GFX9-NEXT:    s_load_dword s0, s[0:1], 0x34
144; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
145; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
146; GFX9-NEXT:    global_load_dword v1, v0, s[6:7]
147; GFX9-NEXT:    s_waitcnt vmcnt(0)
148; GFX9-NEXT:    v_pk_lshlrev_b16 v1, s0, v1
149; GFX9-NEXT:    global_store_dword v0, v1, s[4:5]
150; GFX9-NEXT:    s_endpgm
151;
152; VI-LABEL: shl_v_s_v2i16:
153; VI:       ; %bb.0:
154; VI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
155; VI-NEXT:    s_load_dword s0, s[0:1], 0x34
156; VI-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
157; VI-NEXT:    s_waitcnt lgkmcnt(0)
158; VI-NEXT:    v_mov_b32_e32 v1, s7
159; VI-NEXT:    v_add_u32_e32 v0, vcc, s6, v2
160; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
161; VI-NEXT:    flat_load_dword v3, v[0:1]
162; VI-NEXT:    s_lshr_b32 s1, s0, 16
163; VI-NEXT:    v_add_u32_e32 v0, vcc, s4, v2
164; VI-NEXT:    v_mov_b32_e32 v2, s1
165; VI-NEXT:    v_mov_b32_e32 v1, s5
166; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
167; VI-NEXT:    s_waitcnt vmcnt(0)
168; VI-NEXT:    v_lshlrev_b16_e32 v4, s0, v3
169; VI-NEXT:    v_lshlrev_b16_sdwa v2, v2, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
170; VI-NEXT:    v_or_b32_e32 v2, v4, v2
171; VI-NEXT:    flat_store_dword v[0:1], v2
172; VI-NEXT:    s_endpgm
173;
174; CI-LABEL: shl_v_s_v2i16:
175; CI:       ; %bb.0:
176; CI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
177; CI-NEXT:    s_load_dword s8, s[0:1], 0xd
178; CI-NEXT:    s_mov_b32 s3, 0xf000
179; CI-NEXT:    s_mov_b32 s2, 0
180; CI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
181; CI-NEXT:    s_waitcnt lgkmcnt(0)
182; CI-NEXT:    s_mov_b64 s[0:1], s[6:7]
183; CI-NEXT:    v_mov_b32_e32 v1, 0
184; CI-NEXT:    buffer_load_dword v2, v[0:1], s[0:3], 0 addr64
185; CI-NEXT:    s_mov_b32 s0, 0xffff
186; CI-NEXT:    s_lshr_b32 s1, s8, 16
187; CI-NEXT:    s_and_b32 s8, s8, s0
188; CI-NEXT:    s_mov_b64 s[6:7], s[2:3]
189; CI-NEXT:    s_waitcnt vmcnt(0)
190; CI-NEXT:    v_lshrrev_b32_e32 v3, 16, v2
191; CI-NEXT:    v_lshlrev_b32_e32 v2, s8, v2
192; CI-NEXT:    v_lshlrev_b32_e32 v3, s1, v3
193; CI-NEXT:    v_and_b32_e32 v2, s0, v2
194; CI-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
195; CI-NEXT:    v_or_b32_e32 v2, v2, v3
196; CI-NEXT:    buffer_store_dword v2, v[0:1], s[4:7], 0 addr64
197; CI-NEXT:    s_endpgm
  ; Per-workitem element index into %in and %out.
198  %tid = call i32 @llvm.amdgcn.workitem.id.x()
199  %tid.ext = sext i32 %tid to i64
200  %in.gep = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in, i64 %tid.ext
201  %out.gep = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %out, i64 %tid.ext
202  %vgpr = load <2 x i16>, <2 x i16> addrspace(1)* %in.gep
  ; Loaded value on the left, kernel-argument shift amounts on the right.
203  %result = shl <2 x i16> %vgpr, %sgpr
204  store <2 x i16> %result, <2 x i16> addrspace(1)* %out.gep
205  ret void
206}
207
; shl_s_v_v2i16: operand order reversed relative to shl_v_s_v2i16. The kernel
; argument %sgpr is the shifted value and the shift amounts are loaded from
; global memory. The gfx900 run expects v_pk_lshlrev_b16 with the scalar
; register as the shifted operand. The tonga run expects the VOP3 (_e64) form
; of v_lshlrev_b16 for the low half plus an SDWA shift for the high half. The
; bonaire run expects 32-bit v_lshl_b32 shifts.
208define amdgpu_kernel void @shl_s_v_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in, <2 x i16> %sgpr) #0 {
209; GFX9-LABEL: shl_s_v_v2i16:
210; GFX9:       ; %bb.0:
211; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
212; GFX9-NEXT:    s_load_dword s0, s[0:1], 0x34
213; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
214; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
215; GFX9-NEXT:    global_load_dword v1, v0, s[6:7]
216; GFX9-NEXT:    s_waitcnt vmcnt(0)
217; GFX9-NEXT:    v_pk_lshlrev_b16 v1, v1, s0
218; GFX9-NEXT:    global_store_dword v0, v1, s[4:5]
219; GFX9-NEXT:    s_endpgm
220;
221; VI-LABEL: shl_s_v_v2i16:
222; VI:       ; %bb.0:
223; VI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
224; VI-NEXT:    s_load_dword s0, s[0:1], 0x34
225; VI-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
226; VI-NEXT:    s_waitcnt lgkmcnt(0)
227; VI-NEXT:    v_mov_b32_e32 v1, s7
228; VI-NEXT:    v_add_u32_e32 v0, vcc, s6, v2
229; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
230; VI-NEXT:    flat_load_dword v3, v[0:1]
231; VI-NEXT:    s_lshr_b32 s1, s0, 16
232; VI-NEXT:    v_add_u32_e32 v0, vcc, s4, v2
233; VI-NEXT:    v_mov_b32_e32 v2, s1
234; VI-NEXT:    v_mov_b32_e32 v1, s5
235; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
236; VI-NEXT:    s_waitcnt vmcnt(0)
237; VI-NEXT:    v_lshlrev_b16_e64 v4, v3, s0
238; VI-NEXT:    v_lshlrev_b16_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
239; VI-NEXT:    v_or_b32_e32 v2, v4, v2
240; VI-NEXT:    flat_store_dword v[0:1], v2
241; VI-NEXT:    s_endpgm
242;
243; CI-LABEL: shl_s_v_v2i16:
244; CI:       ; %bb.0:
245; CI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
246; CI-NEXT:    s_load_dword s8, s[0:1], 0xd
247; CI-NEXT:    s_mov_b32 s3, 0xf000
248; CI-NEXT:    s_mov_b32 s2, 0
249; CI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
250; CI-NEXT:    s_waitcnt lgkmcnt(0)
251; CI-NEXT:    s_mov_b64 s[0:1], s[6:7]
252; CI-NEXT:    v_mov_b32_e32 v1, 0
253; CI-NEXT:    buffer_load_dword v2, v[0:1], s[0:3], 0 addr64
254; CI-NEXT:    s_mov_b32 s0, 0xffff
255; CI-NEXT:    s_lshr_b32 s1, s8, 16
256; CI-NEXT:    s_mov_b64 s[6:7], s[2:3]
257; CI-NEXT:    s_waitcnt vmcnt(0)
258; CI-NEXT:    v_and_b32_e32 v3, s0, v2
259; CI-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
260; CI-NEXT:    v_lshl_b32_e32 v2, s1, v2
261; CI-NEXT:    v_lshl_b32_e32 v3, s8, v3
262; CI-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
263; CI-NEXT:    v_and_b32_e32 v3, s0, v3
264; CI-NEXT:    v_or_b32_e32 v2, v3, v2
265; CI-NEXT:    buffer_store_dword v2, v[0:1], s[4:7], 0 addr64
266; CI-NEXT:    s_endpgm
  ; Per-workitem element index into %in and %out.
267  %tid = call i32 @llvm.amdgcn.workitem.id.x()
268  %tid.ext = sext i32 %tid to i64
269  %in.gep = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in, i64 %tid.ext
270  %out.gep = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %out, i64 %tid.ext
271  %vgpr = load <2 x i16>, <2 x i16> addrspace(1)* %in.gep
  ; Kernel-argument value on the left, loaded shift amounts on the right.
272  %result = shl <2 x i16> %sgpr, %vgpr
273  store <2 x i16> %result, <2 x i16> addrspace(1)* %out.gep
274  ret void
275}
276
; shl_imm_v_v2i16: the constant splat <8, 8> is the shifted value and the
; shift amounts are loaded from global memory. The gfx900 run expects the
; constant as an inline operand of v_pk_lshlrev_b16 with op_sel_hi:[1,0]. The
; tonga run expects 8 to be materialized in a vector register for the SDWA
; shift of the high half. The bonaire run expects 32-bit v_lshl_b32 shifts of
; the literal 8, with the low result masked by 0xfff8.
277define amdgpu_kernel void @shl_imm_v_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in) #0 {
278; GFX9-LABEL: shl_imm_v_v2i16:
279; GFX9:       ; %bb.0:
280; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
281; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
282; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
283; GFX9-NEXT:    global_load_dword v1, v0, s[2:3]
284; GFX9-NEXT:    s_waitcnt vmcnt(0)
285; GFX9-NEXT:    v_pk_lshlrev_b16 v1, v1, 8 op_sel_hi:[1,0]
286; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
287; GFX9-NEXT:    s_endpgm
288;
289; VI-LABEL: shl_imm_v_v2i16:
290; VI:       ; %bb.0:
291; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
292; VI-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
293; VI-NEXT:    v_mov_b32_e32 v4, 8
294; VI-NEXT:    s_waitcnt lgkmcnt(0)
295; VI-NEXT:    v_mov_b32_e32 v1, s3
296; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v2
297; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
298; VI-NEXT:    flat_load_dword v3, v[0:1]
299; VI-NEXT:    v_add_u32_e32 v0, vcc, s0, v2
300; VI-NEXT:    v_mov_b32_e32 v1, s1
301; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
302; VI-NEXT:    s_waitcnt vmcnt(0)
303; VI-NEXT:    v_lshlrev_b16_e64 v2, v3, 8
304; VI-NEXT:    v_lshlrev_b16_sdwa v3, v3, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
305; VI-NEXT:    v_or_b32_e32 v2, v2, v3
306; VI-NEXT:    flat_store_dword v[0:1], v2
307; VI-NEXT:    s_endpgm
308;
309; CI-LABEL: shl_imm_v_v2i16:
310; CI:       ; %bb.0:
311; CI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
312; CI-NEXT:    s_mov_b32 s3, 0xf000
313; CI-NEXT:    s_mov_b32 s2, 0
314; CI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
315; CI-NEXT:    v_mov_b32_e32 v1, 0
316; CI-NEXT:    s_waitcnt lgkmcnt(0)
317; CI-NEXT:    s_mov_b64 s[0:1], s[6:7]
318; CI-NEXT:    buffer_load_dword v2, v[0:1], s[0:3], 0 addr64
319; CI-NEXT:    s_mov_b64 s[6:7], s[2:3]
320; CI-NEXT:    s_waitcnt vmcnt(0)
321; CI-NEXT:    v_and_b32_e32 v3, 0xffff, v2
322; CI-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
323; CI-NEXT:    v_lshl_b32_e32 v2, 8, v2
324; CI-NEXT:    v_lshl_b32_e32 v3, 8, v3
325; CI-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
326; CI-NEXT:    v_and_b32_e32 v3, 0xfff8, v3
327; CI-NEXT:    v_or_b32_e32 v2, v3, v2
328; CI-NEXT:    buffer_store_dword v2, v[0:1], s[4:7], 0 addr64
329; CI-NEXT:    s_endpgm
  ; Per-workitem element index into %in and %out.
330  %tid = call i32 @llvm.amdgcn.workitem.id.x()
331  %tid.ext = sext i32 %tid to i64
332  %in.gep = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in, i64 %tid.ext
333  %out.gep = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %out, i64 %tid.ext
334  %vgpr = load <2 x i16>, <2 x i16> addrspace(1)* %in.gep
  ; The constant is the value being shifted; the loaded vector supplies the amounts.
335  %result = shl <2 x i16> <i16 8, i16 8>, %vgpr
336  store <2 x i16> %result, <2 x i16> addrspace(1)* %out.gep
337  ret void
338}
339
; shl_v_imm_v2i16: a <2 x i16> loaded from global memory is shifted left by
; the constant splat <8, 8>. The gfx900 run expects v_pk_lshlrev_b16 with an
; inline 8 and op_sel_hi:[0,1]. The tonga run expects a 32-bit shift masked
; with 0xff000000 for the high half, or'ed with a 16-bit shift for the low
; half. The bonaire run expects one 32-bit shift by 8 followed by a
; 0xff00ff00 mask.
340define amdgpu_kernel void @shl_v_imm_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in) #0 {
341; GFX9-LABEL: shl_v_imm_v2i16:
342; GFX9:       ; %bb.0:
343; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
344; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
345; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
346; GFX9-NEXT:    global_load_dword v1, v0, s[2:3]
347; GFX9-NEXT:    s_waitcnt vmcnt(0)
348; GFX9-NEXT:    v_pk_lshlrev_b16 v1, 8, v1 op_sel_hi:[0,1]
349; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
350; GFX9-NEXT:    s_endpgm
351;
352; VI-LABEL: shl_v_imm_v2i16:
353; VI:       ; %bb.0:
354; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
355; VI-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
356; VI-NEXT:    s_waitcnt lgkmcnt(0)
357; VI-NEXT:    v_mov_b32_e32 v1, s3
358; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v2
359; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
360; VI-NEXT:    flat_load_dword v3, v[0:1]
361; VI-NEXT:    v_add_u32_e32 v0, vcc, s0, v2
362; VI-NEXT:    v_mov_b32_e32 v1, s1
363; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
364; VI-NEXT:    s_waitcnt vmcnt(0)
365; VI-NEXT:    v_lshlrev_b32_e32 v2, 8, v3
366; VI-NEXT:    v_and_b32_e32 v2, 0xff000000, v2
367; VI-NEXT:    v_lshlrev_b16_e32 v3, 8, v3
368; VI-NEXT:    v_or_b32_e32 v2, v3, v2
369; VI-NEXT:    flat_store_dword v[0:1], v2
370; VI-NEXT:    s_endpgm
371;
372; CI-LABEL: shl_v_imm_v2i16:
373; CI:       ; %bb.0:
374; CI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
375; CI-NEXT:    s_mov_b32 s3, 0xf000
376; CI-NEXT:    s_mov_b32 s2, 0
377; CI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
378; CI-NEXT:    v_mov_b32_e32 v1, 0
379; CI-NEXT:    s_waitcnt lgkmcnt(0)
380; CI-NEXT:    s_mov_b64 s[0:1], s[6:7]
381; CI-NEXT:    buffer_load_dword v2, v[0:1], s[0:3], 0 addr64
382; CI-NEXT:    s_mov_b64 s[6:7], s[2:3]
383; CI-NEXT:    s_waitcnt vmcnt(0)
384; CI-NEXT:    v_lshlrev_b32_e32 v2, 8, v2
385; CI-NEXT:    v_and_b32_e32 v2, 0xff00ff00, v2
386; CI-NEXT:    buffer_store_dword v2, v[0:1], s[4:7], 0 addr64
387; CI-NEXT:    s_endpgm
  ; Per-workitem element index into %in and %out.
388  %tid = call i32 @llvm.amdgcn.workitem.id.x()
389  %tid.ext = sext i32 %tid to i64
390  %in.gep = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in, i64 %tid.ext
391  %out.gep = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %out, i64 %tid.ext
392  %vgpr = load <2 x i16>, <2 x i16> addrspace(1)* %in.gep
  ; The loaded vector is the value being shifted; the constant supplies the amounts.
393  %result = shl <2 x i16> %vgpr, <i16 8, i16 8>
394  store <2 x i16> %result, <2 x i16> addrspace(1)* %out.gep
395  ret void
396}
397
; v_shl_v4i16: <4 x i16> variant of v_shl_v2i16, with both operands loaded
; from global memory. The gfx900 run expects two v_pk_lshlrev_b16
; instructions, one per 32-bit half of the vector. The tonga run expects a
; 16-bit shift plus an SDWA shift per 32-bit half. The bonaire run expects
; four 32-bit v_lshl_b32 shifts with mask and or recombination.
398define amdgpu_kernel void @v_shl_v4i16(<4 x i16> addrspace(1)* %out, <4 x i16> addrspace(1)* %in) #0 {
399; GFX9-LABEL: v_shl_v4i16:
400; GFX9:       ; %bb.0:
401; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
402; GFX9-NEXT:    v_lshlrev_b32_e32 v4, 3, v0
403; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
404; GFX9-NEXT:    global_load_dwordx2 v[0:1], v4, s[2:3]
405; GFX9-NEXT:    global_load_dwordx2 v[2:3], v4, s[2:3] offset:8
406; GFX9-NEXT:    s_waitcnt vmcnt(0)
407; GFX9-NEXT:    v_pk_lshlrev_b16 v1, v3, v1
408; GFX9-NEXT:    v_pk_lshlrev_b16 v0, v2, v0
409; GFX9-NEXT:    global_store_dwordx2 v4, v[0:1], s[0:1]
410; GFX9-NEXT:    s_endpgm
411;
412; VI-LABEL: v_shl_v4i16:
413; VI:       ; %bb.0:
414; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
415; VI-NEXT:    v_lshlrev_b32_e32 v4, 3, v0
416; VI-NEXT:    s_waitcnt lgkmcnt(0)
417; VI-NEXT:    v_mov_b32_e32 v1, s3
418; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v4
419; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
420; VI-NEXT:    v_add_u32_e32 v2, vcc, 8, v0
421; VI-NEXT:    v_addc_u32_e32 v3, vcc, 0, v1, vcc
422; VI-NEXT:    flat_load_dwordx2 v[0:1], v[0:1]
423; VI-NEXT:    flat_load_dwordx2 v[2:3], v[2:3]
424; VI-NEXT:    v_mov_b32_e32 v5, s1
425; VI-NEXT:    v_add_u32_e32 v4, vcc, s0, v4
426; VI-NEXT:    v_addc_u32_e32 v5, vcc, 0, v5, vcc
427; VI-NEXT:    s_waitcnt vmcnt(0)
428; VI-NEXT:    v_lshlrev_b16_e32 v6, v3, v1
429; VI-NEXT:    v_lshlrev_b16_sdwa v1, v3, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
430; VI-NEXT:    v_lshlrev_b16_e32 v3, v2, v0
431; VI-NEXT:    v_lshlrev_b16_sdwa v0, v2, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
432; VI-NEXT:    v_or_b32_e32 v1, v6, v1
433; VI-NEXT:    v_or_b32_e32 v0, v3, v0
434; VI-NEXT:    flat_store_dwordx2 v[4:5], v[0:1]
435; VI-NEXT:    s_endpgm
436;
437; CI-LABEL: v_shl_v4i16:
438; CI:       ; %bb.0:
439; CI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
440; CI-NEXT:    s_mov_b32 s3, 0xf000
441; CI-NEXT:    s_mov_b32 s2, 0
442; CI-NEXT:    v_lshlrev_b32_e32 v0, 3, v0
443; CI-NEXT:    v_mov_b32_e32 v1, 0
444; CI-NEXT:    s_waitcnt lgkmcnt(0)
445; CI-NEXT:    s_mov_b64 s[0:1], s[6:7]
446; CI-NEXT:    buffer_load_dwordx2 v[2:3], v[0:1], s[0:3], 0 addr64
447; CI-NEXT:    buffer_load_dwordx2 v[4:5], v[0:1], s[0:3], 0 addr64 offset:8
448; CI-NEXT:    s_mov_b32 s0, 0xffff
449; CI-NEXT:    s_mov_b64 s[6:7], s[2:3]
450; CI-NEXT:    s_waitcnt vmcnt(1)
451; CI-NEXT:    v_lshrrev_b32_e32 v6, 16, v2
452; CI-NEXT:    s_waitcnt vmcnt(0)
453; CI-NEXT:    v_and_b32_e32 v8, s0, v4
454; CI-NEXT:    v_lshrrev_b32_e32 v4, 16, v4
455; CI-NEXT:    v_and_b32_e32 v9, s0, v5
456; CI-NEXT:    v_lshrrev_b32_e32 v7, 16, v3
457; CI-NEXT:    v_lshrrev_b32_e32 v5, 16, v5
458; CI-NEXT:    v_lshl_b32_e32 v5, v7, v5
459; CI-NEXT:    v_lshl_b32_e32 v3, v3, v9
460; CI-NEXT:    v_lshl_b32_e32 v4, v6, v4
461; CI-NEXT:    v_lshl_b32_e32 v2, v2, v8
462; CI-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
463; CI-NEXT:    v_and_b32_e32 v3, s0, v3
464; CI-NEXT:    v_lshlrev_b32_e32 v4, 16, v4
465; CI-NEXT:    v_and_b32_e32 v2, s0, v2
466; CI-NEXT:    v_or_b32_e32 v3, v3, v5
467; CI-NEXT:    v_or_b32_e32 v2, v2, v4
468; CI-NEXT:    buffer_store_dwordx2 v[2:3], v[0:1], s[4:7], 0 addr64
469; CI-NEXT:    s_endpgm
  ; Per-workitem element index into %in and %out.
470  %tid = call i32 @llvm.amdgcn.workitem.id.x()
471  %tid.ext = sext i32 %tid to i64
472  %in.gep = getelementptr inbounds <4 x i16>, <4 x i16> addrspace(1)* %in, i64 %tid.ext
473  %out.gep = getelementptr inbounds <4 x i16>, <4 x i16> addrspace(1)* %out, i64 %tid.ext
  ; The shift amounts are the <4 x i16> element immediately after the shifted
  ; value (the second load at offset:8 in the expected output).
474  %b_ptr = getelementptr <4 x i16>, <4 x i16> addrspace(1)* %in.gep, i32 1
475  %a = load <4 x i16>, <4 x i16> addrspace(1)* %in.gep
476  %b = load <4 x i16>, <4 x i16> addrspace(1)* %b_ptr
477  %result = shl <4 x i16> %a, %b
478  store <4 x i16> %result, <4 x i16> addrspace(1)* %out.gep
479  ret void
480}
481
; shl_v_imm_v4i16: <4 x i16> variant of shl_v_imm_v2i16. A vector loaded from
; global memory is shifted left by the constant splat of 8. The gfx900 run
; expects two v_pk_lshlrev_b16 instructions with an inline 8. The tonga run
; expects, per 32-bit half, a 16-bit shift or'ed with a 32-bit shift masked by
; 0xff000000. The bonaire run expects 32-bit shifts combined with 0xff00 and
; 0xff00ff00 masks.
482define amdgpu_kernel void @shl_v_imm_v4i16(<4 x i16> addrspace(1)* %out, <4 x i16> addrspace(1)* %in) #0 {
483; GFX9-LABEL: shl_v_imm_v4i16:
484; GFX9:       ; %bb.0:
485; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
486; GFX9-NEXT:    v_lshlrev_b32_e32 v2, 3, v0
487; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
488; GFX9-NEXT:    global_load_dwordx2 v[0:1], v2, s[2:3]
489; GFX9-NEXT:    s_waitcnt vmcnt(0)
490; GFX9-NEXT:    v_pk_lshlrev_b16 v1, 8, v1 op_sel_hi:[0,1]
491; GFX9-NEXT:    v_pk_lshlrev_b16 v0, 8, v0 op_sel_hi:[0,1]
492; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
493; GFX9-NEXT:    s_endpgm
494;
495; VI-LABEL: shl_v_imm_v4i16:
496; VI:       ; %bb.0:
497; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
498; VI-NEXT:    v_lshlrev_b32_e32 v2, 3, v0
499; VI-NEXT:    s_waitcnt lgkmcnt(0)
500; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v2
501; VI-NEXT:    v_mov_b32_e32 v1, s3
502; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
503; VI-NEXT:    flat_load_dwordx2 v[0:1], v[0:1]
504; VI-NEXT:    s_mov_b32 s2, 0xff000000
505; VI-NEXT:    v_mov_b32_e32 v3, s1
506; VI-NEXT:    v_add_u32_e32 v2, vcc, s0, v2
507; VI-NEXT:    v_addc_u32_e32 v3, vcc, 0, v3, vcc
508; VI-NEXT:    s_waitcnt vmcnt(0)
509; VI-NEXT:    v_lshlrev_b32_e32 v4, 8, v1
510; VI-NEXT:    v_lshlrev_b16_e32 v5, 8, v0
511; VI-NEXT:    v_lshlrev_b32_e32 v0, 8, v0
512; VI-NEXT:    v_and_b32_e32 v0, s2, v0
513; VI-NEXT:    v_lshlrev_b16_e32 v1, 8, v1
514; VI-NEXT:    v_and_b32_e32 v4, s2, v4
515; VI-NEXT:    v_or_b32_e32 v1, v1, v4
516; VI-NEXT:    v_or_b32_e32 v0, v5, v0
517; VI-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
518; VI-NEXT:    s_endpgm
519;
520; CI-LABEL: shl_v_imm_v4i16:
521; CI:       ; %bb.0:
522; CI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
523; CI-NEXT:    s_mov_b32 s3, 0xf000
524; CI-NEXT:    s_mov_b32 s2, 0
525; CI-NEXT:    v_lshlrev_b32_e32 v0, 3, v0
526; CI-NEXT:    v_mov_b32_e32 v1, 0
527; CI-NEXT:    s_waitcnt lgkmcnt(0)
528; CI-NEXT:    s_mov_b64 s[0:1], s[6:7]
529; CI-NEXT:    buffer_load_dwordx2 v[2:3], v[0:1], s[0:3], 0 addr64
530; CI-NEXT:    s_mov_b32 s0, 0xff00
531; CI-NEXT:    s_mov_b64 s[6:7], s[2:3]
532; CI-NEXT:    s_waitcnt vmcnt(0)
533; CI-NEXT:    v_lshrrev_b32_e32 v4, 8, v3
534; CI-NEXT:    v_lshlrev_b32_e32 v3, 8, v3
535; CI-NEXT:    v_and_b32_e32 v4, s0, v4
536; CI-NEXT:    v_lshlrev_b32_e32 v2, 8, v2
537; CI-NEXT:    v_and_b32_e32 v3, s0, v3
538; CI-NEXT:    v_lshlrev_b32_e32 v4, 16, v4
539; CI-NEXT:    v_or_b32_e32 v3, v3, v4
540; CI-NEXT:    v_and_b32_e32 v2, 0xff00ff00, v2
541; CI-NEXT:    buffer_store_dwordx2 v[2:3], v[0:1], s[4:7], 0 addr64
542; CI-NEXT:    s_endpgm
  ; Per-workitem element index into %in and %out.
543  %tid = call i32 @llvm.amdgcn.workitem.id.x()
544  %tid.ext = sext i32 %tid to i64
545  %in.gep = getelementptr inbounds <4 x i16>, <4 x i16> addrspace(1)* %in, i64 %tid.ext
546  %out.gep = getelementptr inbounds <4 x i16>, <4 x i16> addrspace(1)* %out, i64 %tid.ext
547  %vgpr = load <4 x i16>, <4 x i16> addrspace(1)* %in.gep
  ; The loaded vector is the value being shifted; the constant supplies the amounts.
548  %result = shl <4 x i16> %vgpr, <i16 8, i16 8, i16 8, i16 8>
549  store <4 x i16> %result, <4 x i16> addrspace(1)* %out.gep
550  ret void
551}
552
; Intrinsic called by the memory-operand kernels above to obtain the value
; they use as a per-workitem element index.
553declare i32 @llvm.amdgcn.workitem.id.x() #1
554
; Attribute group #0 is applied to every kernel in this file; #1 is applied
; to the workitem-id intrinsic declaration.
555attributes #0 = { nounwind }
556attributes #1 = { nounwind readnone }
557