1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2; RUN: llc < %s -march=amdgcn -mcpu=tahiti -verify-machineinstrs | FileCheck %s -check-prefixes=GCN,GFX89,SI
3; RUN: llc < %s -march=amdgcn -mcpu=tonga  -verify-machineinstrs | FileCheck %s -check-prefixes=GCN,GFX89,VI
4; RUN: llc < %s -march=amdgcn -mcpu=gfx900 -verify-machineinstrs | FileCheck %s -check-prefixes=GCN,GFX89,GFX9
5; RUN: llc < %s -march=r600 -mcpu=redwood  -verify-machineinstrs | FileCheck %s -check-prefixes=GCN,R600
6
; Funnel-shift-left intrinsics exercised by the kernels in this file: the
; scalar i32 form plus the <2 x i32> and <4 x i32> vector forms.
7declare i32 @llvm.fshl.i32(i32, i32, i32) nounwind readnone
8declare <2 x i32> @llvm.fshl.v2i32(<2 x i32>, <2 x i32>, <2 x i32>) nounwind readnone
9declare <4 x i32> @llvm.fshl.v4i32(<4 x i32>, <4 x i32>, <4 x i32>) nounwind readnone
10
; Scalar case with a variable shift amount. The kernel calls
; llvm.fshl.i32(%x, %y, %z) on its arguments and stores the i32 result through
; the addrspace(1) pointer %in.
; The amdgcn assertions below expect a v_alignbit_b32 by 1, an s_lshr_b32 by 1,
; and a second v_alignbit_b32 whose shift amount is the s_not_b32 of the loaded
; amount. The r600 assertions expect the same shape built from BIT_ALIGN_INT,
; LSHR and NOT_INT.
; The assertion lines are autogenerated; regenerate them with
; utils/update_llc_test_checks.py rather than editing them by hand.
11define amdgpu_kernel void @fshl_i32(i32 addrspace(1)* %in, i32 %x, i32 %y, i32 %z) {
12; SI-LABEL: fshl_i32:
13; SI:       ; %bb.0: ; %entry
14; SI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
15; SI-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0xb
16; SI-NEXT:    s_load_dword s0, s[0:1], 0xd
17; SI-NEXT:    s_mov_b32 s7, 0xf000
18; SI-NEXT:    s_mov_b32 s6, -1
19; SI-NEXT:    s_waitcnt lgkmcnt(0)
20; SI-NEXT:    v_mov_b32_e32 v0, s3
21; SI-NEXT:    s_not_b32 s0, s0
22; SI-NEXT:    v_alignbit_b32 v0, s2, v0, 1
23; SI-NEXT:    s_lshr_b32 s1, s2, 1
24; SI-NEXT:    v_mov_b32_e32 v1, s0
25; SI-NEXT:    v_alignbit_b32 v0, s1, v0, v1
26; SI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
27; SI-NEXT:    s_endpgm
28;
29; VI-LABEL: fshl_i32:
30; VI:       ; %bb.0: ; %entry
31; VI-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
32; VI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x2c
33; VI-NEXT:    s_load_dword s0, s[0:1], 0x34
34; VI-NEXT:    s_waitcnt lgkmcnt(0)
35; VI-NEXT:    v_mov_b32_e32 v0, s5
36; VI-NEXT:    s_not_b32 s0, s0
37; VI-NEXT:    s_lshr_b32 s1, s4, 1
38; VI-NEXT:    v_alignbit_b32 v0, s4, v0, 1
39; VI-NEXT:    v_mov_b32_e32 v1, s0
40; VI-NEXT:    v_alignbit_b32 v2, s1, v0, v1
41; VI-NEXT:    v_mov_b32_e32 v0, s2
42; VI-NEXT:    v_mov_b32_e32 v1, s3
43; VI-NEXT:    flat_store_dword v[0:1], v2
44; VI-NEXT:    s_endpgm
45;
46; GFX9-LABEL: fshl_i32:
47; GFX9:       ; %bb.0: ; %entry
48; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
49; GFX9-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x2c
50; GFX9-NEXT:    s_load_dword s0, s[0:1], 0x34
51; GFX9-NEXT:    v_mov_b32_e32 v0, 0
52; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
53; GFX9-NEXT:    v_mov_b32_e32 v1, s5
54; GFX9-NEXT:    s_not_b32 s0, s0
55; GFX9-NEXT:    s_lshr_b32 s1, s4, 1
56; GFX9-NEXT:    v_alignbit_b32 v1, s4, v1, 1
57; GFX9-NEXT:    v_mov_b32_e32 v2, s0
58; GFX9-NEXT:    v_alignbit_b32 v1, s1, v1, v2
59; GFX9-NEXT:    global_store_dword v0, v1, s[2:3]
60; GFX9-NEXT:    s_endpgm
61;
62; R600-LABEL: fshl_i32:
63; R600:       ; %bb.0: ; %entry
64; R600-NEXT:    ALU 5, @4, KC0[CB0:0-32], KC1[]
65; R600-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1
66; R600-NEXT:    CF_END
67; R600-NEXT:    PAD
68; R600-NEXT:    ALU clause starting at 4:
69; R600-NEXT:     LSHR T0.Z, KC0[2].Z, 1,
70; R600-NEXT:     BIT_ALIGN_INT T0.W, KC0[2].Z, KC0[2].W, 1,
71; R600-NEXT:     NOT_INT * T1.W, KC0[3].X,
72; R600-NEXT:     BIT_ALIGN_INT T0.X, PV.Z, PV.W, PS,
73; R600-NEXT:     LSHR * T1.X, KC0[2].Y, literal.x,
74; R600-NEXT:    2(2.802597e-45), 0(0.000000e+00)
75entry:
76  %0 = call i32 @llvm.fshl.i32(i32 %x, i32 %y, i32 %z)
77  store i32 %0, i32 addrspace(1)* %in
78  ret void
79}
80
; Scalar case with a constant shift amount. The kernel calls
; llvm.fshl.i32(%x, %y, 7) and stores the i32 result through the addrspace(1)
; pointer %in.
; The amdgcn assertions below expect a single v_alignbit_b32 with an immediate
; shift of 25 (that is, 32 - 7); the r600 assertions expect a single
; BIT_ALIGN_INT with the literal 25.
; The assertion lines are autogenerated; regenerate them with
; utils/update_llc_test_checks.py rather than editing them by hand.
81define amdgpu_kernel void @fshl_i32_imm(i32 addrspace(1)* %in, i32 %x, i32 %y) {
82; SI-LABEL: fshl_i32_imm:
83; SI:       ; %bb.0: ; %entry
84; SI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
85; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xb
86; SI-NEXT:    s_mov_b32 s7, 0xf000
87; SI-NEXT:    s_mov_b32 s6, -1
88; SI-NEXT:    s_waitcnt lgkmcnt(0)
89; SI-NEXT:    v_mov_b32_e32 v0, s1
90; SI-NEXT:    v_alignbit_b32 v0, s0, v0, 25
91; SI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
92; SI-NEXT:    s_endpgm
93;
94; VI-LABEL: fshl_i32_imm:
95; VI:       ; %bb.0: ; %entry
96; VI-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
97; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x2c
98; VI-NEXT:    s_waitcnt lgkmcnt(0)
99; VI-NEXT:    v_mov_b32_e32 v0, s1
100; VI-NEXT:    v_alignbit_b32 v2, s0, v0, 25
101; VI-NEXT:    v_mov_b32_e32 v0, s2
102; VI-NEXT:    v_mov_b32_e32 v1, s3
103; VI-NEXT:    flat_store_dword v[0:1], v2
104; VI-NEXT:    s_endpgm
105;
106; GFX9-LABEL: fshl_i32_imm:
107; GFX9:       ; %bb.0: ; %entry
108; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
109; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x2c
110; GFX9-NEXT:    v_mov_b32_e32 v0, 0
111; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
112; GFX9-NEXT:    v_mov_b32_e32 v1, s1
113; GFX9-NEXT:    v_alignbit_b32 v1, s0, v1, 25
114; GFX9-NEXT:    global_store_dword v0, v1, s[2:3]
115; GFX9-NEXT:    s_endpgm
116;
117; R600-LABEL: fshl_i32_imm:
118; R600:       ; %bb.0: ; %entry
119; R600-NEXT:    ALU 3, @4, KC0[CB0:0-32], KC1[]
120; R600-NEXT:    MEM_RAT_CACHELESS STORE_RAW T1.X, T0.X, 1
121; R600-NEXT:    CF_END
122; R600-NEXT:    PAD
123; R600-NEXT:    ALU clause starting at 4:
124; R600-NEXT:     LSHR * T0.X, KC0[2].Y, literal.x,
125; R600-NEXT:    2(2.802597e-45), 0(0.000000e+00)
126; R600-NEXT:     BIT_ALIGN_INT * T1.X, KC0[2].Z, KC0[2].W, literal.x,
127; R600-NEXT:    25(3.503246e-44), 0(0.000000e+00)
128entry:
129  %0 = call i32 @llvm.fshl.i32(i32 %x, i32 %y, i32 7)
130  store i32 %0, i32 addrspace(1)* %in
131  ret void
132}
133
; Two-element vector case with variable shift amounts. The kernel calls
; llvm.fshl.v2i32(%x, %y, %z) and stores the <2 x i32> result through the
; addrspace(1) pointer %in.
; The assertions below expect the operation to be handled one lane at a time:
; each lane uses the same sequence as the scalar variable-amount case (an
; align-bit by 1, a logical shift right by 1, an inverted shift amount, and a
; second align-bit), followed by one two-dword store.
; The assertion lines are autogenerated; regenerate them with
; utils/update_llc_test_checks.py rather than editing them by hand.
134define amdgpu_kernel void @fshl_v2i32(<2 x i32> addrspace(1)* %in, <2 x i32> %x, <2 x i32> %y, <2 x i32> %z) {
135; SI-LABEL: fshl_v2i32:
136; SI:       ; %bb.0: ; %entry
137; SI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
138; SI-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0xb
139; SI-NEXT:    s_load_dwordx2 s[8:9], s[0:1], 0xd
140; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xf
141; SI-NEXT:    s_mov_b32 s7, 0xf000
142; SI-NEXT:    s_mov_b32 s6, -1
143; SI-NEXT:    s_waitcnt lgkmcnt(0)
144; SI-NEXT:    v_mov_b32_e32 v0, s9
145; SI-NEXT:    s_not_b32 s1, s1
146; SI-NEXT:    v_alignbit_b32 v0, s3, v0, 1
147; SI-NEXT:    v_mov_b32_e32 v1, s1
148; SI-NEXT:    s_lshr_b32 s3, s3, 1
149; SI-NEXT:    v_alignbit_b32 v1, s3, v0, v1
150; SI-NEXT:    v_mov_b32_e32 v0, s8
151; SI-NEXT:    s_not_b32 s0, s0
152; SI-NEXT:    v_alignbit_b32 v0, s2, v0, 1
153; SI-NEXT:    s_lshr_b32 s1, s2, 1
154; SI-NEXT:    v_mov_b32_e32 v2, s0
155; SI-NEXT:    v_alignbit_b32 v0, s1, v0, v2
156; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
157; SI-NEXT:    s_endpgm
158;
159; VI-LABEL: fshl_v2i32:
160; VI:       ; %bb.0: ; %entry
161; VI-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
162; VI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x2c
163; VI-NEXT:    s_load_dwordx2 s[6:7], s[0:1], 0x34
164; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x3c
165; VI-NEXT:    s_waitcnt lgkmcnt(0)
166; VI-NEXT:    v_mov_b32_e32 v0, s7
167; VI-NEXT:    s_not_b32 s1, s1
168; VI-NEXT:    v_mov_b32_e32 v1, s1
169; VI-NEXT:    s_lshr_b32 s7, s5, 1
170; VI-NEXT:    v_alignbit_b32 v0, s5, v0, 1
171; VI-NEXT:    v_alignbit_b32 v1, s7, v0, v1
172; VI-NEXT:    v_mov_b32_e32 v0, s6
173; VI-NEXT:    s_not_b32 s0, s0
174; VI-NEXT:    v_alignbit_b32 v0, s4, v0, 1
175; VI-NEXT:    s_lshr_b32 s1, s4, 1
176; VI-NEXT:    v_mov_b32_e32 v2, s0
177; VI-NEXT:    v_alignbit_b32 v0, s1, v0, v2
178; VI-NEXT:    v_mov_b32_e32 v2, s2
179; VI-NEXT:    v_mov_b32_e32 v3, s3
180; VI-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
181; VI-NEXT:    s_endpgm
182;
183; GFX9-LABEL: fshl_v2i32:
184; GFX9:       ; %bb.0: ; %entry
185; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
186; GFX9-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x2c
187; GFX9-NEXT:    s_load_dwordx2 s[6:7], s[0:1], 0x34
188; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x3c
189; GFX9-NEXT:    v_mov_b32_e32 v2, 0
190; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
191; GFX9-NEXT:    v_mov_b32_e32 v0, s7
192; GFX9-NEXT:    s_not_b32 s1, s1
193; GFX9-NEXT:    v_alignbit_b32 v0, s5, v0, 1
194; GFX9-NEXT:    v_mov_b32_e32 v1, s1
195; GFX9-NEXT:    s_lshr_b32 s5, s5, 1
196; GFX9-NEXT:    v_alignbit_b32 v1, s5, v0, v1
197; GFX9-NEXT:    v_mov_b32_e32 v0, s6
198; GFX9-NEXT:    s_not_b32 s0, s0
199; GFX9-NEXT:    v_alignbit_b32 v0, s4, v0, 1
200; GFX9-NEXT:    s_lshr_b32 s1, s4, 1
201; GFX9-NEXT:    v_mov_b32_e32 v3, s0
202; GFX9-NEXT:    v_alignbit_b32 v0, s1, v0, v3
203; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[2:3]
204; GFX9-NEXT:    s_endpgm
205;
206; R600-LABEL: fshl_v2i32:
207; R600:       ; %bb.0: ; %entry
208; R600-NEXT:    ALU 9, @4, KC0[CB0:0-32], KC1[]
209; R600-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1
210; R600-NEXT:    CF_END
211; R600-NEXT:    PAD
212; R600-NEXT:    ALU clause starting at 4:
213; R600-NEXT:     LSHR T0.Z, KC0[3].X, 1,
214; R600-NEXT:     BIT_ALIGN_INT * T0.W, KC0[3].X, KC0[3].Z, 1,
215; R600-NEXT:     NOT_INT * T1.W, KC0[4].X,
216; R600-NEXT:     BIT_ALIGN_INT T0.Y, T0.Z, T0.W, PV.W,
217; R600-NEXT:     LSHR T0.Z, KC0[2].W, 1,
218; R600-NEXT:     BIT_ALIGN_INT * T0.W, KC0[2].W, KC0[3].Y, 1,
219; R600-NEXT:     NOT_INT * T1.W, KC0[3].W,
220; R600-NEXT:     BIT_ALIGN_INT T0.X, T0.Z, T0.W, PV.W,
221; R600-NEXT:     LSHR * T1.X, KC0[2].Y, literal.x,
222; R600-NEXT:    2(2.802597e-45), 0(0.000000e+00)
223entry:
224  %0 = call <2 x i32> @llvm.fshl.v2i32(<2 x i32> %x, <2 x i32> %y, <2 x i32> %z)
225  store <2 x i32> %0, <2 x i32> addrspace(1)* %in
226  ret void
227}
228
; Two-element vector case with constant shift amounts. The kernel calls
; llvm.fshl.v2i32(%x, %y, <7, 9>) and stores the <2 x i32> result through the
; addrspace(1) pointer %in.
; The assertions below expect one align-bit instruction per lane with an
; immediate shift of 25 for the lane shifted by 7 and 23 for the lane shifted
; by 9 (32 minus the requested amount), followed by one two-dword store.
; The assertion lines are autogenerated; regenerate them with
; utils/update_llc_test_checks.py rather than editing them by hand.
229define amdgpu_kernel void @fshl_v2i32_imm(<2 x i32> addrspace(1)* %in, <2 x i32> %x, <2 x i32> %y) {
230; SI-LABEL: fshl_v2i32_imm:
231; SI:       ; %bb.0: ; %entry
232; SI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
233; SI-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0xb
234; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xd
235; SI-NEXT:    s_mov_b32 s7, 0xf000
236; SI-NEXT:    s_mov_b32 s6, -1
237; SI-NEXT:    s_waitcnt lgkmcnt(0)
238; SI-NEXT:    v_mov_b32_e32 v0, s1
239; SI-NEXT:    v_alignbit_b32 v1, s3, v0, 23
240; SI-NEXT:    v_mov_b32_e32 v0, s0
241; SI-NEXT:    v_alignbit_b32 v0, s2, v0, 25
242; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
243; SI-NEXT:    s_endpgm
244;
245; VI-LABEL: fshl_v2i32_imm:
246; VI:       ; %bb.0: ; %entry
247; VI-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
248; VI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x2c
249; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
250; VI-NEXT:    s_waitcnt lgkmcnt(0)
251; VI-NEXT:    v_mov_b32_e32 v0, s1
252; VI-NEXT:    v_mov_b32_e32 v2, s0
253; VI-NEXT:    v_alignbit_b32 v1, s5, v0, 23
254; VI-NEXT:    v_alignbit_b32 v0, s4, v2, 25
255; VI-NEXT:    v_mov_b32_e32 v2, s2
256; VI-NEXT:    v_mov_b32_e32 v3, s3
257; VI-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
258; VI-NEXT:    s_endpgm
259;
260; GFX9-LABEL: fshl_v2i32_imm:
261; GFX9:       ; %bb.0: ; %entry
262; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
263; GFX9-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x2c
264; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
265; GFX9-NEXT:    v_mov_b32_e32 v2, 0
266; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
267; GFX9-NEXT:    v_mov_b32_e32 v0, s1
268; GFX9-NEXT:    v_mov_b32_e32 v3, s0
269; GFX9-NEXT:    v_alignbit_b32 v1, s5, v0, 23
270; GFX9-NEXT:    v_alignbit_b32 v0, s4, v3, 25
271; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[2:3]
272; GFX9-NEXT:    s_endpgm
273;
274; R600-LABEL: fshl_v2i32_imm:
275; R600:       ; %bb.0: ; %entry
276; R600-NEXT:    ALU 5, @4, KC0[CB0:0-32], KC1[]
277; R600-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1
278; R600-NEXT:    CF_END
279; R600-NEXT:    PAD
280; R600-NEXT:    ALU clause starting at 4:
281; R600-NEXT:     BIT_ALIGN_INT * T0.Y, KC0[3].X, KC0[3].Z, literal.x,
282; R600-NEXT:    23(3.222986e-44), 0(0.000000e+00)
283; R600-NEXT:     BIT_ALIGN_INT * T0.X, KC0[2].W, KC0[3].Y, literal.x,
284; R600-NEXT:    25(3.503246e-44), 0(0.000000e+00)
285; R600-NEXT:     LSHR * T1.X, KC0[2].Y, literal.x,
286; R600-NEXT:    2(2.802597e-45), 0(0.000000e+00)
287entry:
288  %0 = call <2 x i32> @llvm.fshl.v2i32(<2 x i32> %x, <2 x i32> %y, <2 x i32> <i32 7, i32 9>)
289  store <2 x i32> %0, <2 x i32> addrspace(1)* %in
290  ret void
291}
292
; Four-element vector case with variable shift amounts. The kernel calls
; llvm.fshl.v4i32(%x, %y, %z) and stores the <4 x i32> result through the
; addrspace(1) pointer %in.
; The assertions below expect the operation to be handled one lane at a time:
; each of the four lanes uses the same sequence as the scalar variable-amount
; case (an align-bit by 1, a logical shift right by 1, an inverted shift
; amount, and a second align-bit), followed by one four-dword store.
; The assertion lines are autogenerated; regenerate them with
; utils/update_llc_test_checks.py rather than editing them by hand.
293define amdgpu_kernel void @fshl_v4i32(<4 x i32> addrspace(1)* %in, <4 x i32> %x, <4 x i32> %y, <4 x i32> %z) {
294; SI-LABEL: fshl_v4i32:
295; SI:       ; %bb.0: ; %entry
296; SI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
297; SI-NEXT:    s_load_dwordx4 s[8:11], s[0:1], 0xd
298; SI-NEXT:    s_load_dwordx4 s[12:15], s[0:1], 0x11
299; SI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x15
300; SI-NEXT:    s_mov_b32 s7, 0xf000
301; SI-NEXT:    s_mov_b32 s6, -1
302; SI-NEXT:    s_waitcnt lgkmcnt(0)
303; SI-NEXT:    v_mov_b32_e32 v0, s15
304; SI-NEXT:    s_not_b32 s3, s3
305; SI-NEXT:    v_alignbit_b32 v0, s11, v0, 1
306; SI-NEXT:    v_mov_b32_e32 v1, s3
307; SI-NEXT:    s_lshr_b32 s11, s11, 1
308; SI-NEXT:    v_alignbit_b32 v3, s11, v0, v1
309; SI-NEXT:    v_mov_b32_e32 v0, s14
310; SI-NEXT:    s_not_b32 s2, s2
311; SI-NEXT:    v_mov_b32_e32 v1, s2
312; SI-NEXT:    v_alignbit_b32 v0, s10, v0, 1
313; SI-NEXT:    s_lshr_b32 s3, s10, 1
314; SI-NEXT:    v_alignbit_b32 v2, s3, v0, v1
315; SI-NEXT:    v_mov_b32_e32 v0, s13
316; SI-NEXT:    s_not_b32 s1, s1
317; SI-NEXT:    v_mov_b32_e32 v1, s1
318; SI-NEXT:    v_alignbit_b32 v0, s9, v0, 1
319; SI-NEXT:    s_lshr_b32 s2, s9, 1
320; SI-NEXT:    v_alignbit_b32 v1, s2, v0, v1
321; SI-NEXT:    v_mov_b32_e32 v0, s12
322; SI-NEXT:    s_not_b32 s0, s0
323; SI-NEXT:    v_alignbit_b32 v0, s8, v0, 1
324; SI-NEXT:    s_lshr_b32 s1, s8, 1
325; SI-NEXT:    v_mov_b32_e32 v4, s0
326; SI-NEXT:    v_alignbit_b32 v0, s1, v0, v4
327; SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0
328; SI-NEXT:    s_endpgm
329;
330; VI-LABEL: fshl_v4i32:
331; VI:       ; %bb.0: ; %entry
332; VI-NEXT:    s_load_dwordx2 s[12:13], s[0:1], 0x24
333; VI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x34
334; VI-NEXT:    s_load_dwordx4 s[8:11], s[0:1], 0x44
335; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x54
336; VI-NEXT:    s_waitcnt lgkmcnt(0)
337; VI-NEXT:    v_mov_b32_e32 v0, s11
338; VI-NEXT:    s_not_b32 s3, s3
339; VI-NEXT:    v_mov_b32_e32 v1, s3
340; VI-NEXT:    s_lshr_b32 s11, s7, 1
341; VI-NEXT:    v_alignbit_b32 v0, s7, v0, 1
342; VI-NEXT:    v_alignbit_b32 v3, s11, v0, v1
343; VI-NEXT:    v_mov_b32_e32 v0, s10
344; VI-NEXT:    s_not_b32 s2, s2
345; VI-NEXT:    v_mov_b32_e32 v1, s2
346; VI-NEXT:    v_alignbit_b32 v0, s6, v0, 1
347; VI-NEXT:    s_lshr_b32 s3, s6, 1
348; VI-NEXT:    v_alignbit_b32 v2, s3, v0, v1
349; VI-NEXT:    v_mov_b32_e32 v0, s9
350; VI-NEXT:    s_not_b32 s1, s1
351; VI-NEXT:    v_mov_b32_e32 v1, s1
352; VI-NEXT:    v_alignbit_b32 v0, s5, v0, 1
353; VI-NEXT:    s_lshr_b32 s2, s5, 1
354; VI-NEXT:    v_alignbit_b32 v1, s2, v0, v1
355; VI-NEXT:    v_mov_b32_e32 v0, s8
356; VI-NEXT:    s_not_b32 s0, s0
357; VI-NEXT:    v_alignbit_b32 v0, s4, v0, 1
358; VI-NEXT:    s_lshr_b32 s1, s4, 1
359; VI-NEXT:    v_mov_b32_e32 v4, s0
360; VI-NEXT:    v_alignbit_b32 v0, s1, v0, v4
361; VI-NEXT:    v_mov_b32_e32 v4, s12
362; VI-NEXT:    v_mov_b32_e32 v5, s13
363; VI-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
364; VI-NEXT:    s_endpgm
365;
366; GFX9-LABEL: fshl_v4i32:
367; GFX9:       ; %bb.0: ; %entry
368; GFX9-NEXT:    s_load_dwordx2 s[12:13], s[0:1], 0x24
369; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x34
370; GFX9-NEXT:    s_load_dwordx4 s[8:11], s[0:1], 0x44
371; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x54
372; GFX9-NEXT:    v_mov_b32_e32 v4, 0
373; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
374; GFX9-NEXT:    v_mov_b32_e32 v0, s11
375; GFX9-NEXT:    s_not_b32 s3, s3
376; GFX9-NEXT:    v_alignbit_b32 v0, s7, v0, 1
377; GFX9-NEXT:    v_mov_b32_e32 v1, s3
378; GFX9-NEXT:    s_lshr_b32 s7, s7, 1
379; GFX9-NEXT:    v_alignbit_b32 v3, s7, v0, v1
380; GFX9-NEXT:    v_mov_b32_e32 v0, s10
381; GFX9-NEXT:    s_not_b32 s2, s2
382; GFX9-NEXT:    v_mov_b32_e32 v1, s2
383; GFX9-NEXT:    v_alignbit_b32 v0, s6, v0, 1
384; GFX9-NEXT:    s_lshr_b32 s3, s6, 1
385; GFX9-NEXT:    v_alignbit_b32 v2, s3, v0, v1
386; GFX9-NEXT:    v_mov_b32_e32 v0, s9
387; GFX9-NEXT:    s_not_b32 s1, s1
388; GFX9-NEXT:    v_mov_b32_e32 v1, s1
389; GFX9-NEXT:    v_alignbit_b32 v0, s5, v0, 1
390; GFX9-NEXT:    s_lshr_b32 s2, s5, 1
391; GFX9-NEXT:    v_alignbit_b32 v1, s2, v0, v1
392; GFX9-NEXT:    v_mov_b32_e32 v0, s8
393; GFX9-NEXT:    s_not_b32 s0, s0
394; GFX9-NEXT:    v_alignbit_b32 v0, s4, v0, 1
395; GFX9-NEXT:    s_lshr_b32 s1, s4, 1
396; GFX9-NEXT:    v_mov_b32_e32 v5, s0
397; GFX9-NEXT:    v_alignbit_b32 v0, s1, v0, v5
398; GFX9-NEXT:    global_store_dwordx4 v4, v[0:3], s[12:13]
399; GFX9-NEXT:    s_endpgm
400;
401; R600-LABEL: fshl_v4i32:
402; R600:       ; %bb.0: ; %entry
403; R600-NEXT:    ALU 17, @4, KC0[CB0:0-32], KC1[]
404; R600-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.XYZW, T1.X, 1
405; R600-NEXT:    CF_END
406; R600-NEXT:    PAD
407; R600-NEXT:    ALU clause starting at 4:
408; R600-NEXT:     LSHR T0.Z, KC0[4].X, 1,
409; R600-NEXT:     BIT_ALIGN_INT * T0.W, KC0[4].X, KC0[5].X, 1,
410; R600-NEXT:     NOT_INT * T1.W, KC0[6].X,
411; R600-NEXT:     LSHR T0.Y, KC0[3].W, 1,
412; R600-NEXT:     BIT_ALIGN_INT T1.Z, KC0[3].W, KC0[4].W, 1,
413; R600-NEXT:     BIT_ALIGN_INT * T0.W, T0.Z, T0.W, PV.W,
414; R600-NEXT:     NOT_INT * T1.W, KC0[5].W,
415; R600-NEXT:     LSHR T1.Y, KC0[3].Z, 1,
416; R600-NEXT:     BIT_ALIGN_INT T0.Z, T0.Y, T1.Z, PV.W,
417; R600-NEXT:     BIT_ALIGN_INT * T1.W, KC0[3].Z, KC0[4].Z, 1,
418; R600-NEXT:     NOT_INT * T2.W, KC0[5].Z,
419; R600-NEXT:     BIT_ALIGN_INT T0.Y, T1.Y, T1.W, PV.W,
420; R600-NEXT:     LSHR T1.Z, KC0[3].Y, 1,
421; R600-NEXT:     BIT_ALIGN_INT * T1.W, KC0[3].Y, KC0[4].Y, 1,
422; R600-NEXT:     NOT_INT * T2.W, KC0[5].Y,
423; R600-NEXT:     BIT_ALIGN_INT T0.X, T1.Z, T1.W, PV.W,
424; R600-NEXT:     LSHR * T1.X, KC0[2].Y, literal.x,
425; R600-NEXT:    2(2.802597e-45), 0(0.000000e+00)
426entry:
427  %0 = call <4 x i32> @llvm.fshl.v4i32(<4 x i32> %x, <4 x i32> %y, <4 x i32> %z)
428  store <4 x i32> %0, <4 x i32> addrspace(1)* %in
429  ret void
430}
431
; Four-element vector case with constant shift amounts. The kernel calls
; llvm.fshl.v4i32(%x, %y, <1, 7, 9, 33>) and stores the <4 x i32> result
; through the addrspace(1) pointer %in.
; The assertions below expect one align-bit instruction per lane with immediate
; shifts of 31, 25, 23 and 31 for lanes 0 to 3. The last lane's amount of 33 is
; out of range for i32; the expected shift of 31 matches the one used for an
; amount of 1, i.e. the amount taken modulo 32.
; The assertion lines are autogenerated; regenerate them with
; utils/update_llc_test_checks.py rather than editing them by hand.
432define amdgpu_kernel void @fshl_v4i32_imm(<4 x i32> addrspace(1)* %in, <4 x i32> %x, <4 x i32> %y) {
433; SI-LABEL: fshl_v4i32_imm:
434; SI:       ; %bb.0: ; %entry
435; SI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
436; SI-NEXT:    s_load_dwordx4 s[8:11], s[0:1], 0xd
437; SI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x11
438; SI-NEXT:    s_mov_b32 s7, 0xf000
439; SI-NEXT:    s_mov_b32 s6, -1
440; SI-NEXT:    s_waitcnt lgkmcnt(0)
441; SI-NEXT:    v_mov_b32_e32 v0, s3
442; SI-NEXT:    v_alignbit_b32 v3, s11, v0, 31
443; SI-NEXT:    v_mov_b32_e32 v0, s2
444; SI-NEXT:    v_alignbit_b32 v2, s10, v0, 23
445; SI-NEXT:    v_mov_b32_e32 v0, s1
446; SI-NEXT:    v_alignbit_b32 v1, s9, v0, 25
447; SI-NEXT:    v_mov_b32_e32 v0, s0
448; SI-NEXT:    v_alignbit_b32 v0, s8, v0, 31
449; SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0
450; SI-NEXT:    s_endpgm
451;
452; VI-LABEL: fshl_v4i32_imm:
453; VI:       ; %bb.0: ; %entry
454; VI-NEXT:    s_load_dwordx2 s[8:9], s[0:1], 0x24
455; VI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x34
456; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x44
457; VI-NEXT:    s_waitcnt lgkmcnt(0)
458; VI-NEXT:    v_mov_b32_e32 v4, s8
459; VI-NEXT:    v_mov_b32_e32 v5, s9
460; VI-NEXT:    v_mov_b32_e32 v0, s3
461; VI-NEXT:    v_mov_b32_e32 v1, s2
462; VI-NEXT:    v_alignbit_b32 v3, s7, v0, 31
463; VI-NEXT:    v_mov_b32_e32 v0, s1
464; VI-NEXT:    v_alignbit_b32 v2, s6, v1, 23
465; VI-NEXT:    v_alignbit_b32 v1, s5, v0, 25
466; VI-NEXT:    v_mov_b32_e32 v0, s0
467; VI-NEXT:    v_alignbit_b32 v0, s4, v0, 31
468; VI-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
469; VI-NEXT:    s_endpgm
470;
471; GFX9-LABEL: fshl_v4i32_imm:
472; GFX9:       ; %bb.0: ; %entry
473; GFX9-NEXT:    s_load_dwordx2 s[8:9], s[0:1], 0x24
474; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x34
475; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x44
476; GFX9-NEXT:    v_mov_b32_e32 v4, 0
477; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
478; GFX9-NEXT:    v_mov_b32_e32 v0, s3
479; GFX9-NEXT:    v_mov_b32_e32 v1, s2
480; GFX9-NEXT:    v_alignbit_b32 v3, s7, v0, 31
481; GFX9-NEXT:    v_mov_b32_e32 v0, s1
482; GFX9-NEXT:    v_alignbit_b32 v2, s6, v1, 23
483; GFX9-NEXT:    v_alignbit_b32 v1, s5, v0, 25
484; GFX9-NEXT:    v_mov_b32_e32 v0, s0
485; GFX9-NEXT:    v_alignbit_b32 v0, s4, v0, 31
486; GFX9-NEXT:    global_store_dwordx4 v4, v[0:3], s[8:9]
487; GFX9-NEXT:    s_endpgm
488;
489; R600-LABEL: fshl_v4i32_imm:
490; R600:       ; %bb.0: ; %entry
491; R600-NEXT:    ALU 9, @4, KC0[CB0:0-32], KC1[]
492; R600-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.XYZW, T1.X, 1
493; R600-NEXT:    CF_END
494; R600-NEXT:    PAD
495; R600-NEXT:    ALU clause starting at 4:
496; R600-NEXT:     BIT_ALIGN_INT * T0.W, KC0[4].X, KC0[5].X, literal.x,
497; R600-NEXT:    31(4.344025e-44), 0(0.000000e+00)
498; R600-NEXT:     BIT_ALIGN_INT * T0.Z, KC0[3].W, KC0[4].W, literal.x,
499; R600-NEXT:    23(3.222986e-44), 0(0.000000e+00)
500; R600-NEXT:     BIT_ALIGN_INT * T0.Y, KC0[3].Z, KC0[4].Z, literal.x,
501; R600-NEXT:    25(3.503246e-44), 0(0.000000e+00)
502; R600-NEXT:     BIT_ALIGN_INT * T0.X, KC0[3].Y, KC0[4].Y, literal.x,
503; R600-NEXT:    31(4.344025e-44), 0(0.000000e+00)
504; R600-NEXT:     LSHR * T1.X, KC0[2].Y, literal.x,
505; R600-NEXT:    2(2.802597e-45), 0(0.000000e+00)
506entry:
507  %0 = call <4 x i32> @llvm.fshl.v4i32(<4 x i32> %x, <4 x i32> %y, <4 x i32> <i32 1, i32 7, i32 9, i32 33>)
508  store <4 x i32> %0, <4 x i32> addrspace(1)* %in
509  ret void
510}
511