; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -amdgpu-scalarize-global-loads=false  -march=amdgcn -mtriple=amdgcn-- -mcpu=verde -verify-machineinstrs | FileCheck %s -allow-deprecated-dag-overlap -check-prefixes=FUNC,GCN,SI
; XUN: llc < %s -march=amdgcn -mtriple=amdgcn-- -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs | FileCheck %s -check-prefixes=FUNC,GCN,VI
; RUN: llc < %s -amdgpu-scalarize-global-loads=false  -march=r600 -mtriple=r600-- -mcpu=redwood -verify-machineinstrs | FileCheck %s -allow-deprecated-dag-overlap -check-prefixes=FUNC,EG
5
; Intrinsic declarations for the kernels in this file. Both take no arguments,
; return i32 and carry attribute group #0 (its definition is not visible from
; this part of the file).
declare i32 @llvm.amdgcn.workitem.id.x() #0

declare i32 @llvm.amdgcn.workgroup.id.x() #0
9
; shl of <2 x i32>: %a is loaded from %in, the per-lane shift amounts %b from
; the <2 x i32> element directly after it, and `shl %a, %b` is stored to %out.
; The autogenerated assertions for the amdgcn run expect one
; buffer_load_dwordx4 that covers both operands and one v_lshl_b32 per lane;
; the r600 run expects two VTX_READ_64 fetches and one LSHL per lane.
define amdgpu_kernel void @shl_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(1)* %in) {
; GCN-LABEL: shl_v2i32:
; GCN:       ; %bb.0:
; GCN-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
; GCN-NEXT:    s_mov_b32 s3, 0xf000
; GCN-NEXT:    s_mov_b32 s2, -1
; GCN-NEXT:    s_mov_b32 s10, s2
; GCN-NEXT:    s_mov_b32 s11, s3
; GCN-NEXT:    s_waitcnt lgkmcnt(0)
; GCN-NEXT:    s_mov_b32 s8, s6
; GCN-NEXT:    s_mov_b32 s9, s7
; GCN-NEXT:    buffer_load_dwordx4 v[0:3], off, s[8:11], 0
; GCN-NEXT:    s_mov_b32 s0, s4
; GCN-NEXT:    s_mov_b32 s1, s5
; GCN-NEXT:    s_waitcnt vmcnt(0)
; GCN-NEXT:    v_lshl_b32_e32 v1, v1, v3
; GCN-NEXT:    v_lshl_b32_e32 v0, v0, v2
; GCN-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; GCN-NEXT:    s_endpgm
;
; EG-LABEL: shl_v2i32:
; EG:       ; %bb.0:
; EG-NEXT:    ALU 0, @10, KC0[CB0:0-32], KC1[]
; EG-NEXT:    TEX 1 @6
; EG-NEXT:    ALU 3, @11, KC0[CB0:0-32], KC1[]
; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1
; EG-NEXT:    CF_END
; EG-NEXT:    PAD
; EG-NEXT:    Fetch clause starting at 6:
; EG-NEXT:     VTX_READ_64 T1.XY, T0.X, 8, #1
; EG-NEXT:     VTX_READ_64 T0.XY, T0.X, 0, #1
; EG-NEXT:    ALU clause starting at 10:
; EG-NEXT:     MOV * T0.X, KC0[2].Z,
; EG-NEXT:    ALU clause starting at 11:
; EG-NEXT:     LSHL * T0.Y, T0.Y, T1.Y,
; EG-NEXT:     LSHL T0.X, T0.X, T1.X,
; EG-NEXT:     LSHR * T1.X, KC0[2].Y, literal.x,
; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
  ; %b_ptr is element 1 of the <2 x i32> array at %in, i.e. the shift amounts
  ; sit immediately after the value being shifted.
  %b_ptr = getelementptr <2 x i32>, <2 x i32> addrspace(1)* %in, i32 1
  %a = load <2 x i32>, <2 x i32> addrspace(1)* %in
  %b = load <2 x i32>, <2 x i32> addrspace(1)* %b_ptr
  %result = shl <2 x i32> %a, %b
  store <2 x i32> %result, <2 x i32> addrspace(1)* %out
  ret void
}
55
; shl of <4 x i32>: %a is loaded from %in, the per-lane shift amounts %b from
; the <4 x i32> element directly after it, and `shl %a, %b` is stored to %out.
; The autogenerated assertions for the amdgcn run expect two
; buffer_load_dwordx4 (offsets 0 and 16) and four v_lshl_b32; the r600 run
; expects two VTX_READ_128 fetches and four LSHL.
define amdgpu_kernel void @shl_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) {
; GCN-LABEL: shl_v4i32:
; GCN:       ; %bb.0:
; GCN-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
; GCN-NEXT:    s_mov_b32 s3, 0xf000
; GCN-NEXT:    s_mov_b32 s2, -1
; GCN-NEXT:    s_mov_b32 s10, s2
; GCN-NEXT:    s_mov_b32 s11, s3
; GCN-NEXT:    s_waitcnt lgkmcnt(0)
; GCN-NEXT:    s_mov_b32 s8, s6
; GCN-NEXT:    s_mov_b32 s9, s7
; GCN-NEXT:    buffer_load_dwordx4 v[0:3], off, s[8:11], 0
; GCN-NEXT:    buffer_load_dwordx4 v[4:7], off, s[8:11], 0 offset:16
; GCN-NEXT:    s_mov_b32 s0, s4
; GCN-NEXT:    s_mov_b32 s1, s5
; GCN-NEXT:    s_waitcnt vmcnt(0)
; GCN-NEXT:    v_lshl_b32_e32 v3, v3, v7
; GCN-NEXT:    v_lshl_b32_e32 v2, v2, v6
; GCN-NEXT:    v_lshl_b32_e32 v1, v1, v5
; GCN-NEXT:    v_lshl_b32_e32 v0, v0, v4
; GCN-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0
; GCN-NEXT:    s_endpgm
;
; EG-LABEL: shl_v4i32:
; EG:       ; %bb.0:
; EG-NEXT:    ALU 0, @10, KC0[CB0:0-32], KC1[]
; EG-NEXT:    TEX 1 @6
; EG-NEXT:    ALU 5, @11, KC0[CB0:0-32], KC1[]
; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.XYZW, T1.X, 1
; EG-NEXT:    CF_END
; EG-NEXT:    PAD
; EG-NEXT:    Fetch clause starting at 6:
; EG-NEXT:     VTX_READ_128 T1.XYZW, T0.X, 16, #1
; EG-NEXT:     VTX_READ_128 T0.XYZW, T0.X, 0, #1
; EG-NEXT:    ALU clause starting at 10:
; EG-NEXT:     MOV * T0.X, KC0[2].Z,
; EG-NEXT:    ALU clause starting at 11:
; EG-NEXT:     LSHL * T0.W, T0.W, T1.W,
; EG-NEXT:     LSHL * T0.Z, T0.Z, T1.Z,
; EG-NEXT:     LSHL * T0.Y, T0.Y, T1.Y,
; EG-NEXT:     LSHL T0.X, T0.X, T1.X,
; EG-NEXT:     LSHR * T1.X, KC0[2].Y, literal.x,
; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
  ; %b_ptr is element 1 of the <4 x i32> array at %in (the vector right after
  ; the value being shifted).
  %b_ptr = getelementptr <4 x i32>, <4 x i32> addrspace(1)* %in, i32 1
  %a = load <4 x i32>, <4 x i32> addrspace(1)* %in
  %b = load <4 x i32>, <4 x i32> addrspace(1)* %b_ptr
  %result = shl <4 x i32> %a, %b
  store <4 x i32> %result, <4 x i32> addrspace(1)* %out
  ret void
}
106
; Scalar i16 shl with both operands loaded from memory: %a from %in, the shift
; amount %b from the i16 directly after it; the result is stored to %out.
; The autogenerated assertions for the amdgcn run expect two
; buffer_load_ushort (offsets 0 and 2), a single 32-bit v_lshlrev_b32 and a
; buffer_store_short; the r600 run expects the 16-bit store to go through
; MEM_RAT MSKOR after masking the shifted value with 65535.
define amdgpu_kernel void @shl_i16(i16 addrspace(1)* %out, i16 addrspace(1)* %in) {
; GCN-LABEL: shl_i16:
; GCN:       ; %bb.0:
; GCN-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
; GCN-NEXT:    s_mov_b32 s3, 0xf000
; GCN-NEXT:    s_mov_b32 s2, -1
; GCN-NEXT:    s_waitcnt lgkmcnt(0)
; GCN-NEXT:    s_mov_b32 s0, s4
; GCN-NEXT:    s_mov_b32 s1, s5
; GCN-NEXT:    s_mov_b32 s4, s6
; GCN-NEXT:    s_mov_b32 s5, s7
; GCN-NEXT:    s_mov_b32 s6, s2
; GCN-NEXT:    s_mov_b32 s7, s3
; GCN-NEXT:    buffer_load_ushort v0, off, s[4:7], 0
; GCN-NEXT:    buffer_load_ushort v1, off, s[4:7], 0 offset:2
; GCN-NEXT:    s_waitcnt vmcnt(0)
; GCN-NEXT:    v_lshlrev_b32_e32 v0, v1, v0
; GCN-NEXT:    buffer_store_short v0, off, s[0:3], 0
; GCN-NEXT:    s_endpgm
;
; EG-LABEL: shl_i16:
; EG:       ; %bb.0:
; EG-NEXT:    ALU 0, @10, KC0[CB0:0-32], KC1[]
; EG-NEXT:    TEX 1 @6
; EG-NEXT:    ALU 12, @11, KC0[CB0:0-32], KC1[]
; EG-NEXT:    MEM_RAT MSKOR T0.XW, T1.X
; EG-NEXT:    CF_END
; EG-NEXT:    PAD
; EG-NEXT:    Fetch clause starting at 6:
; EG-NEXT:     VTX_READ_16 T1.X, T0.X, 2, #1
; EG-NEXT:     VTX_READ_16 T0.X, T0.X, 0, #1
; EG-NEXT:    ALU clause starting at 10:
; EG-NEXT:     MOV * T0.X, KC0[2].Z,
; EG-NEXT:    ALU clause starting at 11:
; EG-NEXT:     AND_INT T0.W, KC0[2].Y, literal.x,
; EG-NEXT:     LSHL * T1.W, T0.X, T1.X,
; EG-NEXT:    3(4.203895e-45), 0(0.000000e+00)
; EG-NEXT:     AND_INT T1.W, PS, literal.x,
; EG-NEXT:     LSHL * T0.W, PV.W, literal.y,
; EG-NEXT:    65535(9.183409e-41), 3(4.203895e-45)
; EG-NEXT:     LSHL T0.X, PV.W, PS,
; EG-NEXT:     LSHL * T0.W, literal.x, PS,
; EG-NEXT:    65535(9.183409e-41), 0(0.000000e+00)
; EG-NEXT:     MOV T0.Y, 0.0,
; EG-NEXT:     MOV * T0.Z, 0.0,
; EG-NEXT:     LSHR * T1.X, KC0[2].Y, literal.x,
; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
  ; The shift amount lives in the i16 slot immediately after %in.
  %b_ptr = getelementptr i16, i16 addrspace(1)* %in, i16 1
  %a = load i16, i16 addrspace(1)* %in
  %b = load i16, i16 addrspace(1)* %b_ptr
  %result = shl i16 %a, %b
  store i16 %result, i16 addrspace(1)* %out
  ret void
}
161
; i16 shl where the shifted value is loaded from %in and the shift amount %b is
; a kernel argument; the result is stored to %out.
; The autogenerated assertions for the amdgcn run expect the argument to be
; fetched with s_load_dword, masked to 16 bits (s_and_b32 ... 0xffff) and then
; used as the scalar operand of v_lshlrev_b32.
define amdgpu_kernel void @shl_i16_v_s(i16 addrspace(1)* %out, i16 addrspace(1)* %in, i16 %b) {
; GCN-LABEL: shl_i16_v_s:
; GCN:       ; %bb.0:
; GCN-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
; GCN-NEXT:    s_load_dword s8, s[0:1], 0xd
; GCN-NEXT:    s_mov_b32 s3, 0xf000
; GCN-NEXT:    s_mov_b32 s2, -1
; GCN-NEXT:    s_waitcnt lgkmcnt(0)
; GCN-NEXT:    s_mov_b32 s0, s4
; GCN-NEXT:    s_mov_b32 s1, s5
; GCN-NEXT:    s_mov_b32 s4, s6
; GCN-NEXT:    s_mov_b32 s5, s7
; GCN-NEXT:    s_mov_b32 s6, s2
; GCN-NEXT:    s_mov_b32 s7, s3
; GCN-NEXT:    buffer_load_ushort v0, off, s[4:7], 0
; GCN-NEXT:    s_and_b32 s8, s8, 0xffff
; GCN-NEXT:    s_waitcnt vmcnt(0)
; GCN-NEXT:    v_lshlrev_b32_e32 v0, s8, v0
; GCN-NEXT:    buffer_store_short v0, off, s[0:3], 0
; GCN-NEXT:    s_endpgm
;
; EG-LABEL: shl_i16_v_s:
; EG:       ; %bb.0:
; EG-NEXT:    ALU 1, @10, KC0[CB0:0-32], KC1[]
; EG-NEXT:    TEX 1 @6
; EG-NEXT:    ALU 12, @12, KC0[CB0:0-32], KC1[]
; EG-NEXT:    MEM_RAT MSKOR T0.XW, T1.X
; EG-NEXT:    CF_END
; EG-NEXT:    PAD
; EG-NEXT:    Fetch clause starting at 6:
; EG-NEXT:     VTX_READ_16 T1.X, T1.X, 0, #1
; EG-NEXT:     VTX_READ_16 T0.X, T0.X, 44, #3
; EG-NEXT:    ALU clause starting at 10:
; EG-NEXT:     MOV T0.X, 0.0,
; EG-NEXT:     MOV * T1.X, KC0[2].Z,
; EG-NEXT:    ALU clause starting at 12:
; EG-NEXT:     AND_INT T0.W, KC0[2].Y, literal.x,
; EG-NEXT:     LSHL * T1.W, T1.X, T0.X,
; EG-NEXT:    3(4.203895e-45), 0(0.000000e+00)
; EG-NEXT:     AND_INT T1.W, PS, literal.x,
; EG-NEXT:     LSHL * T0.W, PV.W, literal.y,
; EG-NEXT:    65535(9.183409e-41), 3(4.203895e-45)
; EG-NEXT:     LSHL T0.X, PV.W, PS,
; EG-NEXT:     LSHL * T0.W, literal.x, PS,
; EG-NEXT:    65535(9.183409e-41), 0(0.000000e+00)
; EG-NEXT:     MOV T0.Y, 0.0,
; EG-NEXT:     MOV * T0.Z, 0.0,
; EG-NEXT:     LSHR * T1.X, KC0[2].Y, literal.x,
; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
  ; Only the shifted value comes from memory; %b is used exactly as passed in.
  %a = load i16, i16 addrspace(1)* %in
  %result = shl i16 %a, %b
  store i16 %result, i16 addrspace(1)* %out
  ret void
}
216
; i16 shl where the shifted value is loaded from %in and the shift amount is
; computed from the kernel argument as %b + 3; the result is stored to %out.
; The autogenerated assertions for the amdgcn run expect the addition to be
; done with s_add_i32, the sum masked to 16 bits with s_and_b32, and the masked
; value used as the scalar operand of v_lshlrev_b32.
define amdgpu_kernel void @shl_i16_v_compute_s(i16 addrspace(1)* %out, i16 addrspace(1)* %in, i16 %b) {
; GCN-LABEL: shl_i16_v_compute_s:
; GCN:       ; %bb.0:
; GCN-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
; GCN-NEXT:    s_load_dword s8, s[0:1], 0xd
; GCN-NEXT:    s_mov_b32 s3, 0xf000
; GCN-NEXT:    s_mov_b32 s2, -1
; GCN-NEXT:    s_waitcnt lgkmcnt(0)
; GCN-NEXT:    s_mov_b32 s0, s4
; GCN-NEXT:    s_mov_b32 s1, s5
; GCN-NEXT:    s_mov_b32 s4, s6
; GCN-NEXT:    s_mov_b32 s5, s7
; GCN-NEXT:    s_mov_b32 s6, s2
; GCN-NEXT:    s_mov_b32 s7, s3
; GCN-NEXT:    buffer_load_ushort v0, off, s[4:7], 0
; GCN-NEXT:    s_add_i32 s8, s8, 3
; GCN-NEXT:    s_and_b32 s4, s8, 0xffff
; GCN-NEXT:    s_waitcnt vmcnt(0)
; GCN-NEXT:    v_lshlrev_b32_e32 v0, s4, v0
; GCN-NEXT:    buffer_store_short v0, off, s[0:3], 0
; GCN-NEXT:    s_endpgm
;
; EG-LABEL: shl_i16_v_compute_s:
; EG:       ; %bb.0:
; EG-NEXT:    ALU 0, @12, KC0[], KC1[]
; EG-NEXT:    TEX 0 @8
; EG-NEXT:    ALU 0, @13, KC0[CB0:0-32], KC1[]
; EG-NEXT:    TEX 0 @10
; EG-NEXT:    ALU 15, @14, KC0[CB0:0-32], KC1[]
; EG-NEXT:    MEM_RAT MSKOR T0.XW, T1.X
; EG-NEXT:    CF_END
; EG-NEXT:    PAD
; EG-NEXT:    Fetch clause starting at 8:
; EG-NEXT:     VTX_READ_16 T0.X, T0.X, 44, #3
; EG-NEXT:    Fetch clause starting at 10:
; EG-NEXT:     VTX_READ_16 T1.X, T1.X, 0, #1
; EG-NEXT:    ALU clause starting at 12:
; EG-NEXT:     MOV * T0.X, 0.0,
; EG-NEXT:    ALU clause starting at 13:
; EG-NEXT:     MOV * T1.X, KC0[2].Z,
; EG-NEXT:    ALU clause starting at 14:
; EG-NEXT:     ADD_INT * T0.W, T0.X, literal.x,
; EG-NEXT:    3(4.203895e-45), 0(0.000000e+00)
; EG-NEXT:     AND_INT T0.W, PV.W, literal.x,
; EG-NEXT:     AND_INT * T1.W, KC0[2].Y, literal.y,
; EG-NEXT:    65535(9.183409e-41), 3(4.203895e-45)
; EG-NEXT:     LSHL * T0.W, T1.X, PV.W,
; EG-NEXT:     AND_INT T0.W, PV.W, literal.x,
; EG-NEXT:     LSHL * T1.W, T1.W, literal.y,
; EG-NEXT:    65535(9.183409e-41), 3(4.203895e-45)
; EG-NEXT:     LSHL T0.X, PV.W, PS,
; EG-NEXT:     LSHL * T0.W, literal.x, PS,
; EG-NEXT:    65535(9.183409e-41), 0(0.000000e+00)
; EG-NEXT:     MOV T0.Y, 0.0,
; EG-NEXT:     MOV * T0.Z, 0.0,
; EG-NEXT:     LSHR * T1.X, KC0[2].Y, literal.x,
; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
  %a = load i16, i16 addrspace(1)* %in
  ; The shift amount is derived from the argument rather than used directly.
  %b.add = add i16 %b, 3
  %result = shl i16 %a, %b.add
  store i16 %result, i16 addrspace(1)* %out
  ret void
}
280
; i16 shl where the shift amount is loaded per work-item and then incremented:
; %a is a volatile load from %in, %b a volatile load from the i16 after
; %in[workitem.id.x], and `shl %a, (%b + 3)` is stored to %out.
; The autogenerated assertions for the amdgcn run expect an addr64
; buffer_load_ushort for %b, then v_add_i32 / v_and_b32 0xffff on the amount
; before the v_lshl_b32.
; NOTE(review): %gep.out is computed but never used, and %a is read from %in
; rather than %gep. The check lines match the IR exactly as written, so any
; change here requires regenerating the assertions.
define amdgpu_kernel void @shl_i16_computed_amount(i16 addrspace(1)* %out, i16 addrspace(1)* %in) {
; GCN-LABEL: shl_i16_computed_amount:
; GCN:       ; %bb.0:
; GCN-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
; GCN-NEXT:    s_mov_b32 s3, 0xf000
; GCN-NEXT:    s_mov_b32 s2, -1
; GCN-NEXT:    s_mov_b32 s10, s2
; GCN-NEXT:    s_mov_b32 s11, s3
; GCN-NEXT:    s_waitcnt lgkmcnt(0)
; GCN-NEXT:    s_mov_b32 s8, s6
; GCN-NEXT:    s_mov_b32 s9, s7
; GCN-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
; GCN-NEXT:    v_mov_b32_e32 v1, 0
; GCN-NEXT:    s_mov_b32 s14, 0
; GCN-NEXT:    s_mov_b32 s15, s3
; GCN-NEXT:    s_mov_b64 s[12:13], s[6:7]
; GCN-NEXT:    buffer_load_ushort v2, off, s[8:11], 0
; GCN-NEXT:    buffer_load_ushort v0, v[0:1], s[12:15], 0 addr64 offset:2
; GCN-NEXT:    s_mov_b32 s0, s4
; GCN-NEXT:    s_mov_b32 s1, s5
; GCN-NEXT:    s_waitcnt vmcnt(0)
; GCN-NEXT:    v_add_i32_e32 v0, vcc, 3, v0
; GCN-NEXT:    v_and_b32_e32 v0, 0xffff, v0
; GCN-NEXT:    v_lshl_b32_e32 v0, v2, v0
; GCN-NEXT:    buffer_store_short v0, off, s[0:3], 0
; GCN-NEXT:    s_endpgm
;
; EG-LABEL: shl_i16_computed_amount:
; EG:       ; %bb.0:
; EG-NEXT:    ALU 0, @12, KC0[CB0:0-32], KC1[]
; EG-NEXT:    TEX 0 @8
; EG-NEXT:    ALU 1, @13, KC0[CB0:0-32], KC1[]
; EG-NEXT:    TEX 0 @10
; EG-NEXT:    ALU 15, @15, KC0[CB0:0-32], KC1[]
; EG-NEXT:    MEM_RAT MSKOR T0.XW, T1.X
; EG-NEXT:    CF_END
; EG-NEXT:    PAD
; EG-NEXT:    Fetch clause starting at 8:
; EG-NEXT:     VTX_READ_16 T1.X, T1.X, 0, #1
; EG-NEXT:    Fetch clause starting at 10:
; EG-NEXT:     VTX_READ_16 T0.X, T0.X, 2, #1
; EG-NEXT:    ALU clause starting at 12:
; EG-NEXT:     MOV * T1.X, KC0[2].Z,
; EG-NEXT:    ALU clause starting at 13:
; EG-NEXT:     LSHL * T0.W, T0.X, 1,
; EG-NEXT:     ADD_INT * T0.X, KC0[2].Z, PV.W,
; EG-NEXT:    ALU clause starting at 15:
; EG-NEXT:     ADD_INT * T0.W, T0.X, literal.x,
; EG-NEXT:    3(4.203895e-45), 0(0.000000e+00)
; EG-NEXT:     AND_INT T0.W, PV.W, literal.x,
; EG-NEXT:     AND_INT * T1.W, KC0[2].Y, literal.y,
; EG-NEXT:    65535(9.183409e-41), 3(4.203895e-45)
; EG-NEXT:     LSHL * T0.W, T1.X, PV.W,
; EG-NEXT:     AND_INT T0.W, PV.W, literal.x,
; EG-NEXT:     LSHL * T1.W, T1.W, literal.y,
; EG-NEXT:    65535(9.183409e-41), 3(4.203895e-45)
; EG-NEXT:     LSHL T0.X, PV.W, PS,
; EG-NEXT:     LSHL * T0.W, literal.x, PS,
; EG-NEXT:    65535(9.183409e-41), 0(0.000000e+00)
; EG-NEXT:     MOV T0.Y, 0.0,
; EG-NEXT:     MOV * T0.Z, 0.0,
; EG-NEXT:     LSHR * T1.X, KC0[2].Y, literal.x,
; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
  %tid = call i32 @llvm.amdgcn.workitem.id.x() #0
  %gep = getelementptr inbounds i16, i16 addrspace(1)* %in, i32 %tid
  ; Unused below: the store at the end of the function targets %out directly.
  %gep.out = getelementptr inbounds i16, i16 addrspace(1)* %out, i32 %tid
  ; The shift amount is the i16 following this work-item's slot.
  %b_ptr = getelementptr i16, i16 addrspace(1)* %gep, i16 1
  %a = load volatile i16, i16 addrspace(1)* %in
  %b = load volatile i16, i16 addrspace(1)* %b_ptr
  %b.add = add i16 %b, 3
  %result = shl i16 %a, %b.add
  store i16 %result, i16 addrspace(1)* %out
  ret void
}
355
; i16 shl of a zeroext kernel argument by the constant 12; the result is stored
; to %out.
; The autogenerated assertions for the amdgcn run expect no vector shift at
; all: the argument is shifted with s_lshl_b32 ... 12, copied to a VGPR and
; written with buffer_store_short. The r600 run expects LSHL by 12 followed by
; a mask with 61440 before the MEM_RAT MSKOR store.
define amdgpu_kernel void @shl_i16_i_s(i16 addrspace(1)* %out, i16 zeroext %a) {
; GCN-LABEL: shl_i16_i_s:
; GCN:       ; %bb.0:
; GCN-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
; GCN-NEXT:    s_load_dword s0, s[0:1], 0xb
; GCN-NEXT:    s_mov_b32 s7, 0xf000
; GCN-NEXT:    s_mov_b32 s6, -1
; GCN-NEXT:    s_waitcnt lgkmcnt(0)
; GCN-NEXT:    s_lshl_b32 s0, s0, 12
; GCN-NEXT:    v_mov_b32_e32 v0, s0
; GCN-NEXT:    buffer_store_short v0, off, s[4:7], 0
; GCN-NEXT:    s_endpgm
;
; EG-LABEL: shl_i16_i_s:
; EG:       ; %bb.0:
; EG-NEXT:    ALU 0, @8, KC0[], KC1[]
; EG-NEXT:    TEX 0 @6
; EG-NEXT:    ALU 14, @9, KC0[CB0:0-32], KC1[]
; EG-NEXT:    MEM_RAT MSKOR T0.XW, T1.X
; EG-NEXT:    CF_END
; EG-NEXT:    PAD
; EG-NEXT:    Fetch clause starting at 6:
; EG-NEXT:     VTX_READ_16 T0.X, T0.X, 40, #3
; EG-NEXT:    ALU clause starting at 8:
; EG-NEXT:     MOV * T0.X, 0.0,
; EG-NEXT:    ALU clause starting at 9:
; EG-NEXT:     BFE_INT T0.W, T0.X, 0.0, literal.x,
; EG-NEXT:     AND_INT * T1.W, KC0[2].Y, literal.y,
; EG-NEXT:    16(2.242078e-44), 3(4.203895e-45)
; EG-NEXT:     LSHL * T0.W, PV.W, literal.x,
; EG-NEXT:    12(1.681558e-44), 0(0.000000e+00)
; EG-NEXT:     AND_INT T0.W, PV.W, literal.x,
; EG-NEXT:     LSHL * T1.W, T1.W, literal.y,
; EG-NEXT:    61440(8.609578e-41), 3(4.203895e-45)
; EG-NEXT:     LSHL T0.X, PV.W, PS,
; EG-NEXT:     LSHL * T0.W, literal.x, PS,
; EG-NEXT:    65535(9.183409e-41), 0(0.000000e+00)
; EG-NEXT:     MOV T0.Y, 0.0,
; EG-NEXT:     MOV * T0.Z, 0.0,
; EG-NEXT:     LSHR * T1.X, KC0[2].Y, literal.x,
; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
  ; Both operands are uniform: a kernel argument and an immediate.
  %result = shl i16 %a, 12
  store i16 %result, i16 addrspace(1)* %out
  ret void
}
401
; shl of <2 x i16>: %a is loaded from %in, the per-lane shift amounts %b from
; the <2 x i16> element after %in[workitem.id.x], and `shl %a, %b` is stored
; to %out.
; The autogenerated assertions for the amdgcn run expect each 32-bit load to be
; split into its 16-bit halves (v_lshrrev_b32 16 / v_and_b32 0xffff), shifted
; lane by lane with v_lshl_b32, and re-packed with v_and / v_lshlrev 16 / v_or
; before a single buffer_store_dword.
; NOTE(review): %gep.out is computed but never used, and %a is read from %in
; rather than %gep, unlike @shl_v4i16 which indexes both the input and the
; output by the work-item id. The check lines match the IR as written, so any
; change here requires regenerating the assertions.
define amdgpu_kernel void @shl_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in) {
; GCN-LABEL: shl_v2i16:
; GCN:       ; %bb.0:
; GCN-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
; GCN-NEXT:    s_mov_b32 s3, 0xf000
; GCN-NEXT:    s_mov_b32 s2, -1
; GCN-NEXT:    s_mov_b32 s10, s2
; GCN-NEXT:    s_mov_b32 s11, s3
; GCN-NEXT:    s_waitcnt lgkmcnt(0)
; GCN-NEXT:    s_mov_b32 s8, s6
; GCN-NEXT:    s_mov_b32 s9, s7
; GCN-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GCN-NEXT:    s_mov_b64 s[12:13], s[6:7]
; GCN-NEXT:    v_mov_b32_e32 v1, 0
; GCN-NEXT:    s_mov_b32 s14, 0
; GCN-NEXT:    s_mov_b32 s15, s3
; GCN-NEXT:    buffer_load_dword v2, off, s[8:11], 0
; GCN-NEXT:    buffer_load_dword v0, v[0:1], s[12:15], 0 addr64 offset:4
; GCN-NEXT:    s_mov_b32 s6, 0xffff
; GCN-NEXT:    s_mov_b32 s0, s4
; GCN-NEXT:    s_mov_b32 s1, s5
; GCN-NEXT:    s_waitcnt vmcnt(1)
; GCN-NEXT:    v_lshrrev_b32_e32 v1, 16, v2
; GCN-NEXT:    s_waitcnt vmcnt(0)
; GCN-NEXT:    v_lshrrev_b32_e32 v3, 16, v0
; GCN-NEXT:    v_and_b32_e32 v0, s6, v0
; GCN-NEXT:    v_lshl_b32_e32 v0, v2, v0
; GCN-NEXT:    v_lshl_b32_e32 v1, v1, v3
; GCN-NEXT:    v_and_b32_e32 v0, s6, v0
; GCN-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
; GCN-NEXT:    v_or_b32_e32 v0, v0, v1
; GCN-NEXT:    buffer_store_dword v0, off, s[0:3], 0
; GCN-NEXT:    s_endpgm
;
; EG-LABEL: shl_v2i16:
; EG:       ; %bb.0:
; EG-NEXT:    ALU 2, @12, KC0[CB0:0-32], KC1[]
; EG-NEXT:    TEX 0 @8
; EG-NEXT:    ALU 0, @15, KC0[CB0:0-32], KC1[]
; EG-NEXT:    TEX 0 @10
; EG-NEXT:    ALU 12, @16, KC0[CB0:0-32], KC1[]
; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.X, T7.X, 1
; EG-NEXT:    CF_END
; EG-NEXT:    PAD
; EG-NEXT:    Fetch clause starting at 8:
; EG-NEXT:     VTX_READ_32 T0.X, T0.X, 4, #1
; EG-NEXT:    Fetch clause starting at 10:
; EG-NEXT:     VTX_READ_32 T7.X, T7.X, 0, #1
; EG-NEXT:    ALU clause starting at 12:
; EG-NEXT:     LSHL * T0.W, T0.X, literal.x,
; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
; EG-NEXT:     ADD_INT * T0.X, KC0[2].Z, PV.W,
; EG-NEXT:    ALU clause starting at 15:
; EG-NEXT:     MOV * T7.X, KC0[2].Z,
; EG-NEXT:    ALU clause starting at 16:
; EG-NEXT:     AND_INT T0.Y, T0.X, literal.x,
; EG-NEXT:     AND_INT T0.Z, T7.X, literal.x, BS:VEC_120/SCL_212
; EG-NEXT:     LSHR T0.W, T0.X, literal.y,
; EG-NEXT:     LSHR * T1.W, T7.X, literal.y,
; EG-NEXT:    65535(9.183409e-41), 16(2.242078e-44)
; EG-NEXT:     LSHL T0.W, PS, PV.W,
; EG-NEXT:     LSHL * T1.W, PV.Z, PV.Y,
; EG-NEXT:     AND_INT T1.W, PS, literal.x,
; EG-NEXT:     LSHL * T0.W, PV.W, literal.y,
; EG-NEXT:    65535(9.183409e-41), 16(2.242078e-44)
; EG-NEXT:     OR_INT T0.X, PV.W, PS,
; EG-NEXT:     LSHR * T7.X, KC0[2].Y, literal.x,
; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
  %tid = call i32 @llvm.amdgcn.workitem.id.x() #0
  %gep = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in, i32 %tid
  ; Unused below: the store at the end of the function targets %out directly.
  %gep.out = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %out, i32 %tid
  ; The shift amounts are the <2 x i16> following this work-item's slot.
  %b_ptr = getelementptr <2 x i16>, <2 x i16> addrspace(1)* %gep, i16 1
  %a = load <2 x i16>, <2 x i16> addrspace(1)* %in
  %b = load <2 x i16>, <2 x i16> addrspace(1)* %b_ptr
  %result = shl <2 x i16> %a, %b
  store <2 x i16> %result, <2 x i16> addrspace(1)* %out
  ret void
}
480
; shl of <4 x i16>, indexed by work-item: %a is loaded from
; %in[workitem.id.x], the per-lane shift amounts %b from the following
; <4 x i16> element, and `shl %a, %b` is stored to %out[workitem.id.x].
; The autogenerated assertions for the amdgcn run expect two addr64
; buffer_load_dwordx2 (offsets 0 and 8), four v_lshl_b32 on the unpacked 16-bit
; halves, re-packing with v_and / v_lshlrev 16 / v_or, and an addr64
; buffer_store_dwordx2.
define amdgpu_kernel void @shl_v4i16(<4 x i16> addrspace(1)* %out, <4 x i16> addrspace(1)* %in) {
; GCN-LABEL: shl_v4i16:
; GCN:       ; %bb.0:
; GCN-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
; GCN-NEXT:    s_mov_b32 s3, 0xf000
; GCN-NEXT:    s_mov_b32 s2, 0
; GCN-NEXT:    v_lshlrev_b32_e32 v0, 3, v0
; GCN-NEXT:    v_mov_b32_e32 v1, 0
; GCN-NEXT:    s_waitcnt lgkmcnt(0)
; GCN-NEXT:    s_mov_b64 s[0:1], s[6:7]
; GCN-NEXT:    buffer_load_dwordx2 v[2:3], v[0:1], s[0:3], 0 addr64
; GCN-NEXT:    buffer_load_dwordx2 v[4:5], v[0:1], s[0:3], 0 addr64 offset:8
; GCN-NEXT:    s_mov_b32 s0, 0xffff
; GCN-NEXT:    s_mov_b64 s[6:7], s[2:3]
; GCN-NEXT:    s_waitcnt vmcnt(1)
; GCN-NEXT:    v_lshrrev_b32_e32 v6, 16, v2
; GCN-NEXT:    s_waitcnt vmcnt(0)
; GCN-NEXT:    v_and_b32_e32 v8, s0, v4
; GCN-NEXT:    v_lshrrev_b32_e32 v4, 16, v4
; GCN-NEXT:    v_and_b32_e32 v9, s0, v5
; GCN-NEXT:    v_lshrrev_b32_e32 v7, 16, v3
; GCN-NEXT:    v_lshrrev_b32_e32 v5, 16, v5
; GCN-NEXT:    v_lshl_b32_e32 v5, v7, v5
; GCN-NEXT:    v_lshl_b32_e32 v3, v3, v9
; GCN-NEXT:    v_lshl_b32_e32 v4, v6, v4
; GCN-NEXT:    v_lshl_b32_e32 v2, v2, v8
; GCN-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
; GCN-NEXT:    v_and_b32_e32 v3, s0, v3
; GCN-NEXT:    v_lshlrev_b32_e32 v4, 16, v4
; GCN-NEXT:    v_and_b32_e32 v2, s0, v2
; GCN-NEXT:    v_or_b32_e32 v3, v3, v5
; GCN-NEXT:    v_or_b32_e32 v2, v2, v4
; GCN-NEXT:    buffer_store_dwordx2 v[2:3], v[0:1], s[4:7], 0 addr64
; GCN-NEXT:    s_endpgm
;
; EG-LABEL: shl_v4i16:
; EG:       ; %bb.0:
; EG-NEXT:    ALU 2, @12, KC0[CB0:0-32], KC1[]
; EG-NEXT:    TEX 0 @8
; EG-NEXT:    ALU 3, @15, KC0[], KC1[]
; EG-NEXT:    TEX 0 @10
; EG-NEXT:    ALU 49, @19, KC0[CB0:0-32], KC1[]
; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T10.XY, T0.X, 1
; EG-NEXT:    CF_END
; EG-NEXT:    PAD
; EG-NEXT:    Fetch clause starting at 8:
; EG-NEXT:     VTX_READ_64 T10.XY, T0.X, 0, #1
; EG-NEXT:    Fetch clause starting at 10:
; EG-NEXT:     VTX_READ_64 T10.XY, T0.X, 8, #1
; EG-NEXT:    ALU clause starting at 12:
; EG-NEXT:     LSHL * T0.W, T0.X, literal.x,
; EG-NEXT:    3(4.203895e-45), 0(0.000000e+00)
; EG-NEXT:     ADD_INT * T0.X, KC0[2].Z, PV.W,
; EG-NEXT:    ALU clause starting at 15:
; EG-NEXT:     MOV T4.X, T10.X,
; EG-NEXT:     MOV * T5.X, T10.Y,
; EG-NEXT:     MOV T0.Y, PV.X,
; EG-NEXT:     MOV * T0.Z, PS,
; EG-NEXT:    ALU clause starting at 19:
; EG-NEXT:     MOV T2.X, T10.X,
; EG-NEXT:     MOV * T3.X, T10.Y,
; EG-NEXT:     MOV T0.X, T6.X,
; EG-NEXT:     MOV * T1.Y, PV.X,
; EG-NEXT:     AND_INT T1.W, PV.Y, literal.x,
; EG-NEXT:     AND_INT * T2.W, T0.Y, literal.x,
; EG-NEXT:    65535(9.183409e-41), 0(0.000000e+00)
; EG-NEXT:     LSHL * T1.W, PS, PV.W,
; EG-NEXT:     AND_INT T1.W, PV.W, literal.x,
; EG-NEXT:     AND_INT * T2.W, T0.X, literal.y,
; EG-NEXT:    65535(9.183409e-41), -65536(nan)
; EG-NEXT:     OR_INT * T1.W, PS, PV.W,
; EG-NEXT:     MOV T0.X, T3.X,
; EG-NEXT:     MOV * T6.X, PV.W,
; EG-NEXT:     MOV T1.Z, PS,
; EG-NEXT:     LSHR T1.W, T1.Y, literal.x,
; EG-NEXT:     LSHR * T2.W, T0.Y, literal.x,
; EG-NEXT:    16(2.242078e-44), 0(0.000000e+00)
; EG-NEXT:     LSHL T1.W, PS, PV.W,
; EG-NEXT:     AND_INT * T2.W, PV.Z, literal.x,
; EG-NEXT:    65535(9.183409e-41), 0(0.000000e+00)
; EG-NEXT:     LSHL * T1.W, PV.W, literal.x,
; EG-NEXT:    16(2.242078e-44), 0(0.000000e+00)
; EG-NEXT:     OR_INT * T1.W, T2.W, PV.W,
; EG-NEXT:     MOV T6.X, PV.W,
; EG-NEXT:     MOV T0.Y, T7.X,
; EG-NEXT:     AND_INT T1.W, T0.X, literal.x, BS:VEC_120/SCL_212
; EG-NEXT:     AND_INT * T2.W, T0.Z, literal.x,
; EG-NEXT:    65535(9.183409e-41), 0(0.000000e+00)
; EG-NEXT:     LSHL T1.W, PS, PV.W,
; EG-NEXT:     AND_INT * T2.W, PV.Y, literal.x,
; EG-NEXT:    -65536(nan), 0(0.000000e+00)
; EG-NEXT:     AND_INT * T1.W, PV.W, literal.x,
; EG-NEXT:    65535(9.183409e-41), 0(0.000000e+00)
; EG-NEXT:     OR_INT * T1.W, T2.W, PV.W,
; EG-NEXT:     MOV * T7.X, PV.W,
; EG-NEXT:     MOV T0.Y, PV.X,
; EG-NEXT:     LSHR T1.W, T0.X, literal.x,
; EG-NEXT:     LSHR * T2.W, T0.Z, literal.x,
; EG-NEXT:    16(2.242078e-44), 0(0.000000e+00)
; EG-NEXT:     LSHL * T1.W, PS, PV.W,
; EG-NEXT:     AND_INT T0.Z, T0.Y, literal.x,
; EG-NEXT:     LSHL T1.W, PV.W, literal.y,
; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, T0.W,
; EG-NEXT:    65535(9.183409e-41), 16(2.242078e-44)
; EG-NEXT:     LSHR T0.X, PS, literal.x,
; EG-NEXT:     OR_INT * T10.Y, PV.Z, PV.W,
; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
; EG-NEXT:     MOV T7.X, PV.Y,
; EG-NEXT:     MOV * T10.X, T6.X,
  %tid = call i32 @llvm.amdgcn.workitem.id.x() #0
  ; Input and output are both addressed by the work-item id.
  %gep = getelementptr inbounds <4 x i16>, <4 x i16> addrspace(1)* %in, i32 %tid
  %gep.out = getelementptr inbounds <4 x i16>, <4 x i16> addrspace(1)* %out, i32 %tid
  ; The shift amounts are the <4 x i16> following this work-item's slot.
  %b_ptr = getelementptr <4 x i16>, <4 x i16> addrspace(1)* %gep, i16 1
  %a = load <4 x i16>, <4 x i16> addrspace(1)* %gep
  %b = load <4 x i16>, <4 x i16> addrspace(1)* %b_ptr
  %result = shl <4 x i16> %a, %b
  store <4 x i16> %result, <4 x i16> addrspace(1)* %gep.out
  ret void
}
600
; Scalar i64 shl with both operands loaded from memory: %a from %in, the shift
; amount %b from the i64 directly after it; the result is stored to %out.
; The autogenerated assertions for the amdgcn run expect one
; buffer_load_dwordx4 covering both operands and a single v_lshl_b64 whose
; amount operand is v2, the third dword of that load. The r600 run expects the
; shift to be expanded into 32-bit LSHL/LSHR/OR operations with CNDE_INT
; selects keyed on SETGT_UINT of the amount against 31.
define amdgpu_kernel void @shl_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in) {
; GCN-LABEL: shl_i64:
; GCN:       ; %bb.0:
; GCN-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
; GCN-NEXT:    s_mov_b32 s3, 0xf000
; GCN-NEXT:    s_mov_b32 s2, -1
; GCN-NEXT:    s_mov_b32 s10, s2
; GCN-NEXT:    s_mov_b32 s11, s3
; GCN-NEXT:    s_waitcnt lgkmcnt(0)
; GCN-NEXT:    s_mov_b32 s8, s6
; GCN-NEXT:    s_mov_b32 s9, s7
; GCN-NEXT:    buffer_load_dwordx4 v[0:3], off, s[8:11], 0
; GCN-NEXT:    s_mov_b32 s0, s4
; GCN-NEXT:    s_mov_b32 s1, s5
; GCN-NEXT:    s_waitcnt vmcnt(0)
; GCN-NEXT:    v_lshl_b64 v[0:1], v[0:1], v2
; GCN-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; GCN-NEXT:    s_endpgm
;
; EG-LABEL: shl_i64:
; EG:       ; %bb.0:
; EG-NEXT:    ALU 0, @8, KC0[CB0:0-32], KC1[]
; EG-NEXT:    TEX 0 @6
; EG-NEXT:    ALU 15, @9, KC0[CB0:0-32], KC1[]
; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1
; EG-NEXT:    CF_END
; EG-NEXT:    PAD
; EG-NEXT:    Fetch clause starting at 6:
; EG-NEXT:     VTX_READ_128 T0.XYZW, T0.X, 0, #1
; EG-NEXT:    ALU clause starting at 8:
; EG-NEXT:     MOV * T0.X, KC0[2].Z,
; EG-NEXT:    ALU clause starting at 9:
; EG-NEXT:     SUB_INT * T0.W, literal.x, T0.Z,
; EG-NEXT:    31(4.344025e-44), 0(0.000000e+00)
; EG-NEXT:     LSHR * T0.W, T0.X, PV.W,
; EG-NEXT:     ADD_INT T1.Z, T0.Z, literal.x,
; EG-NEXT:     LSHR T0.W, PV.W, 1,
; EG-NEXT:     LSHL * T1.W, T0.Y, T0.Z,
; EG-NEXT:    -32(nan), 0(0.000000e+00)
; EG-NEXT:     OR_INT T2.Z, PS, PV.W,
; EG-NEXT:     LSHL T0.W, T0.X, PV.Z,
; EG-NEXT:     SETGT_UINT * T1.W, T0.Z, literal.x,
; EG-NEXT:    31(4.344025e-44), 0(0.000000e+00)
; EG-NEXT:     CNDE_INT T0.Y, PS, PV.Z, PV.W,
; EG-NEXT:     LSHL * T0.W, T0.X, T0.Z,
; EG-NEXT:     CNDE_INT T0.X, T1.W, PV.W, 0.0,
; EG-NEXT:     LSHR * T1.X, KC0[2].Y, literal.x,
; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
  ; The shift amount is the i64 stored immediately after the shifted value.
  %b_ptr = getelementptr i64, i64 addrspace(1)* %in, i64 1
  %a = load i64, i64 addrspace(1)* %in
  %b = load i64, i64 addrspace(1)* %b_ptr
  %result = shl i64 %a, %b
  store i64 %result, i64 addrspace(1)* %out
  ret void
}
656
; shl of <2 x i64>: %a is loaded from %in, the per-lane shift amounts %b from
; the <2 x i64> element directly after it, and `shl %a, %b` is stored to %out.
; The autogenerated assertions for the amdgcn run expect two
; buffer_load_dwordx4 (offsets 0 and 16) and one v_lshl_b64 per lane, each
; taking a single dword of %b (v4 and v6) as its amount. The r600 run expects
; each lane to be expanded into 32-bit shifts with CNDE_INT selects.
define amdgpu_kernel void @shl_v2i64(<2 x i64> addrspace(1)* %out, <2 x i64> addrspace(1)* %in) {
; GCN-LABEL: shl_v2i64:
; GCN:       ; %bb.0:
; GCN-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
; GCN-NEXT:    s_mov_b32 s3, 0xf000
; GCN-NEXT:    s_mov_b32 s2, -1
; GCN-NEXT:    s_mov_b32 s10, s2
; GCN-NEXT:    s_mov_b32 s11, s3
; GCN-NEXT:    s_waitcnt lgkmcnt(0)
; GCN-NEXT:    s_mov_b32 s8, s6
; GCN-NEXT:    s_mov_b32 s9, s7
; GCN-NEXT:    buffer_load_dwordx4 v[0:3], off, s[8:11], 0
; GCN-NEXT:    buffer_load_dwordx4 v[4:7], off, s[8:11], 0 offset:16
; GCN-NEXT:    s_mov_b32 s0, s4
; GCN-NEXT:    s_mov_b32 s1, s5
; GCN-NEXT:    s_waitcnt vmcnt(0)
; GCN-NEXT:    v_lshl_b64 v[2:3], v[2:3], v6
; GCN-NEXT:    v_lshl_b64 v[0:1], v[0:1], v4
; GCN-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0
; GCN-NEXT:    s_endpgm
;
; EG-LABEL: shl_v2i64:
; EG:       ; %bb.0:
; EG-NEXT:    ALU 0, @10, KC0[CB0:0-32], KC1[]
; EG-NEXT:    TEX 1 @6
; EG-NEXT:    ALU 28, @11, KC0[CB0:0-32], KC1[]
; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T2.XYZW, T0.X, 1
; EG-NEXT:    CF_END
; EG-NEXT:    PAD
; EG-NEXT:    Fetch clause starting at 6:
; EG-NEXT:     VTX_READ_128 T1.XYZW, T0.X, 16, #1
; EG-NEXT:     VTX_READ_128 T0.XYZW, T0.X, 0, #1
; EG-NEXT:    ALU clause starting at 10:
; EG-NEXT:     MOV * T0.X, KC0[2].Z,
; EG-NEXT:    ALU clause starting at 11:
; EG-NEXT:     SUB_INT * T1.W, literal.x, T1.Z,
; EG-NEXT:    31(4.344025e-44), 0(0.000000e+00)
; EG-NEXT:     LSHR * T1.W, T0.Z, PV.W,
; EG-NEXT:     SUB_INT T2.Z, literal.x, T1.X,
; EG-NEXT:     LSHR T1.W, PV.W, 1,
; EG-NEXT:     LSHL * T0.W, T0.W, T1.Z,
; EG-NEXT:    31(4.344025e-44), 0(0.000000e+00)
; EG-NEXT:     OR_INT T3.Z, PS, PV.W,
; EG-NEXT:     LSHR T0.W, T0.X, PV.Z,
; EG-NEXT:     ADD_INT * T1.W, T1.Z, literal.x,
; EG-NEXT:    -32(nan), 0(0.000000e+00)
; EG-NEXT:     LSHL T2.X, T0.Z, PS,
; EG-NEXT:     SETGT_UINT T1.Y, T1.Z, literal.x, BS:VEC_120/SCL_212
; EG-NEXT:     ADD_INT T2.Z, T1.X, literal.y,
; EG-NEXT:     LSHR T0.W, PV.W, 1,
; EG-NEXT:     LSHL * T1.W, T0.Y, T1.X,
; EG-NEXT:    31(4.344025e-44), -32(nan)
; EG-NEXT:     OR_INT T0.Y, PS, PV.W,
; EG-NEXT:     LSHL T2.Z, T0.X, PV.Z,
; EG-NEXT:     SETGT_UINT T0.W, T1.X, literal.x, BS:VEC_120/SCL_212
; EG-NEXT:     CNDE_INT * T2.W, PV.Y, T3.Z, PV.X,
; EG-NEXT:    31(4.344025e-44), 0(0.000000e+00)
; EG-NEXT:     CNDE_INT T2.Y, PV.W, PV.Y, PV.Z,
; EG-NEXT:     LSHL * T1.W, T0.Z, T1.Z,
; EG-NEXT:     CNDE_INT T2.Z, T1.Y, PV.W, 0.0,
; EG-NEXT:     LSHL * T1.W, T0.X, T1.X,
; EG-NEXT:     CNDE_INT T2.X, T0.W, PV.W, 0.0,
; EG-NEXT:     LSHR * T0.X, KC0[2].Y, literal.x,
; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
  ; The shift amounts are the <2 x i64> stored immediately after the value.
  %b_ptr = getelementptr <2 x i64>, <2 x i64> addrspace(1)* %in, i64 1
  %a = load <2 x i64>, <2 x i64> addrspace(1)* %in
  %b = load <2 x i64>, <2 x i64> addrspace(1)* %b_ptr
  %result = shl <2 x i64> %a, %b
  store <2 x i64> %result, <2 x i64> addrspace(1)* %out
  ret void
}
728
; shl_v4i64: element-wise shift-left of <4 x i64>. The value vector is loaded
; from %in and the shift-amount vector from the following <4 x i64> slot
; (getelementptr index 1); the result is stored to %out. The GCN checks
; expect four v_lshl_b64 instructions, each taking its shift amount from a
; single VGPR.
729define amdgpu_kernel void @shl_v4i64(<4 x i64> addrspace(1)* %out, <4 x i64> addrspace(1)* %in) {
730; GCN-LABEL: shl_v4i64:
731; GCN:       ; %bb.0:
732; GCN-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
733; GCN-NEXT:    s_mov_b32 s3, 0xf000
734; GCN-NEXT:    s_mov_b32 s2, -1
735; GCN-NEXT:    s_mov_b32 s10, s2
736; GCN-NEXT:    s_mov_b32 s11, s3
737; GCN-NEXT:    s_waitcnt lgkmcnt(0)
738; GCN-NEXT:    s_mov_b32 s8, s6
739; GCN-NEXT:    s_mov_b32 s9, s7
740; GCN-NEXT:    buffer_load_dwordx4 v[0:3], off, s[8:11], 0
741; GCN-NEXT:    buffer_load_dwordx4 v[4:7], off, s[8:11], 0 offset:16
742; GCN-NEXT:    buffer_load_dwordx4 v[8:11], off, s[8:11], 0 offset:32
743; GCN-NEXT:    buffer_load_dwordx4 v[11:14], off, s[8:11], 0 offset:48
744; GCN-NEXT:    s_mov_b32 s0, s4
745; GCN-NEXT:    s_mov_b32 s1, s5
746; GCN-NEXT:    s_waitcnt vmcnt(1)
747; GCN-NEXT:    v_lshl_b64 v[2:3], v[2:3], v10
748; GCN-NEXT:    s_waitcnt vmcnt(0)
749; GCN-NEXT:    v_lshl_b64 v[6:7], v[6:7], v13
750; GCN-NEXT:    v_lshl_b64 v[4:5], v[4:5], v11
751; GCN-NEXT:    v_lshl_b64 v[0:1], v[0:1], v8
752; GCN-NEXT:    buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:16
753; GCN-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0
754; GCN-NEXT:    s_endpgm
755;
756; EG-LABEL: shl_v4i64:
757; EG:       ; %bb.0:
758; EG-NEXT:    ALU 0, @14, KC0[CB0:0-32], KC1[]
759; EG-NEXT:    TEX 3 @6
760; EG-NEXT:    ALU 58, @15, KC0[CB0:0-32], KC1[]
761; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T1.XYZW, T0.X, 0
762; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T3.XYZW, T4.X, 1
763; EG-NEXT:    CF_END
764; EG-NEXT:    Fetch clause starting at 6:
765; EG-NEXT:     VTX_READ_128 T1.XYZW, T0.X, 48, #1
766; EG-NEXT:     VTX_READ_128 T2.XYZW, T0.X, 0, #1
767; EG-NEXT:     VTX_READ_128 T3.XYZW, T0.X, 16, #1
768; EG-NEXT:     VTX_READ_128 T0.XYZW, T0.X, 32, #1
769; EG-NEXT:    ALU clause starting at 14:
770; EG-NEXT:     MOV * T0.X, KC0[2].Z,
771; EG-NEXT:    ALU clause starting at 15:
772; EG-NEXT:     SUB_INT * T0.W, literal.x, T1.Z,
773; EG-NEXT:    31(4.344025e-44), 0(0.000000e+00)
774; EG-NEXT:     SUB_INT T4.Z, literal.x, T0.Z,
775; EG-NEXT:     SUB_INT T1.W, literal.x, T0.X,
776; EG-NEXT:     LSHR * T0.W, T3.Z, PV.W,
777; EG-NEXT:    31(4.344025e-44), 0(0.000000e+00)
778; EG-NEXT:     SUB_INT T0.Y, literal.x, T1.X,
779; EG-NEXT:     LSHR T5.Z, PS, 1,
780; EG-NEXT:     LSHR T0.W, T2.X, PV.W,
781; EG-NEXT:     LSHR * T1.W, T2.Z, PV.Z,
782; EG-NEXT:    31(4.344025e-44), 0(0.000000e+00)
783; EG-NEXT:     LSHL T4.X, T3.W, T1.Z,
784; EG-NEXT:     LSHR T1.Y, PS, 1,
785; EG-NEXT:     LSHL T4.Z, T2.W, T0.Z, BS:VEC_120/SCL_212
786; EG-NEXT:     LSHR T0.W, PV.W, 1,
787; EG-NEXT:     LSHL * T1.W, T2.Y, T0.X,
788; EG-NEXT:     OR_INT T5.X, PS, PV.W,
789; EG-NEXT:     OR_INT T1.Y, PV.Z, PV.Y,
790; EG-NEXT:     OR_INT T4.Z, PV.X, T5.Z,
791; EG-NEXT:     LSHR T0.W, T3.X, T0.Y,
792; EG-NEXT:     ADD_INT * T1.W, T1.Z, literal.x,
793; EG-NEXT:    -32(nan), 0(0.000000e+00)
794; EG-NEXT:     LSHL T4.X, T3.Z, PS,
795; EG-NEXT:     SETGT_UINT T0.Y, T1.Z, literal.x, BS:VEC_120/SCL_212
796; EG-NEXT:     ADD_INT T5.Z, T1.X, literal.y,
797; EG-NEXT:     LSHR T0.W, PV.W, 1,
798; EG-NEXT:     LSHL * T1.W, T3.Y, T1.X,
799; EG-NEXT:    31(4.344025e-44), -32(nan)
800; EG-NEXT:     OR_INT T6.X, PS, PV.W,
801; EG-NEXT:     LSHL T2.Y, T3.X, PV.Z,
802; EG-NEXT:     SETGT_UINT T5.Z, T1.X, literal.x, BS:VEC_120/SCL_212
803; EG-NEXT:     ADD_INT T0.W, T0.Z, literal.y,
804; EG-NEXT:     CNDE_INT * T3.W, PV.Y, T4.Z, PV.X,
805; EG-NEXT:    31(4.344025e-44), -32(nan)
806; EG-NEXT:     LSHL T4.X, T2.Z, PV.W,
807; EG-NEXT:     CNDE_INT T3.Y, PV.Z, PV.X, PV.Y,
808; EG-NEXT:     SETGT_UINT * T4.Z, T0.Z, literal.x, BS:VEC_120/SCL_212
809; EG-NEXT:    31(4.344025e-44), 0(0.000000e+00)
810; EG-NEXT:     LSHL T0.W, T3.Z, T1.Z,
811; EG-NEXT:     ADD_INT * T1.W, T0.X, literal.x,
812; EG-NEXT:    -32(nan), 0(0.000000e+00)
813; EG-NEXT:     LSHL T6.X, T2.X, PS,
814; EG-NEXT:     SETGT_UINT T2.Y, T0.X, literal.x, BS:VEC_120/SCL_212
815; EG-NEXT:     CNDE_INT * T3.Z, T0.Y, PV.W, 0.0,
816; EG-NEXT:    31(4.344025e-44), 0(0.000000e+00)
817; EG-NEXT:     LSHL T0.W, T3.X, T1.X, BS:VEC_120/SCL_212
818; EG-NEXT:     CNDE_INT * T1.W, T4.Z, T1.Y, T4.X,
819; EG-NEXT:     CNDE_INT T3.X, T5.Z, PV.W, 0.0,
820; EG-NEXT:     CNDE_INT T1.Y, T2.Y, T5.X, T6.X,
821; EG-NEXT:     LSHL T0.W, T2.Z, T0.Z, BS:VEC_120/SCL_212
822; EG-NEXT:     ADD_INT * T2.W, KC0[2].Y, literal.x,
823; EG-NEXT:    16(2.242078e-44), 0(0.000000e+00)
824; EG-NEXT:     LSHR T4.X, PS, literal.x,
825; EG-NEXT:     CNDE_INT T1.Z, T4.Z, PV.W, 0.0,
826; EG-NEXT:     LSHL * T0.W, T2.X, T0.X,
827; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
828; EG-NEXT:     CNDE_INT T1.X, T2.Y, PV.W, 0.0,
829; EG-NEXT:     LSHR * T0.X, KC0[2].Y, literal.x,
830; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
831  %b_ptr = getelementptr <4 x i64>, <4 x i64> addrspace(1)* %in, i64 1
832  %a = load <4 x i64>, <4 x i64> addrspace(1)* %in
833  %b = load <4 x i64>, <4 x i64> addrspace(1)* %b_ptr
834  %result = shl <4 x i64> %a, %b
835  store <4 x i64> %result, <4 x i64> addrspace(1)* %out
836  ret void
837}
838
; s_shl_32_i64: shift an i64 kernel argument left by the constant 32 and
; store the result to %out.
839; Make sure load width gets reduced to i32 load.
; The GCN checks expect a single s_load_dword for %a; that dword is stored
; as the second register of the dwordx2 store, paired with a zero first
; register.
840define amdgpu_kernel void @s_shl_32_i64(i64 addrspace(1)* %out, [8 x i32], i64 %a) {
841; GCN-LABEL: s_shl_32_i64:
842; GCN:       ; %bb.0:
843; GCN-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
844; GCN-NEXT:    s_load_dword s0, s[0:1], 0x13
845; GCN-NEXT:    s_mov_b32 s7, 0xf000
846; GCN-NEXT:    s_mov_b32 s6, -1
847; GCN-NEXT:    v_mov_b32_e32 v0, 0
848; GCN-NEXT:    s_waitcnt lgkmcnt(0)
849; GCN-NEXT:    v_mov_b32_e32 v1, s0
850; GCN-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
851; GCN-NEXT:    s_endpgm
852;
853; EG-LABEL: s_shl_32_i64:
854; EG:       ; %bb.0:
855; EG-NEXT:    ALU 3, @4, KC0[CB0:0-32], KC1[]
856; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1
857; EG-NEXT:    CF_END
858; EG-NEXT:    PAD
859; EG-NEXT:    ALU clause starting at 4:
860; EG-NEXT:     MOV * T0.Y, KC0[4].W,
861; EG-NEXT:     MOV T0.X, 0.0,
862; EG-NEXT:     LSHR * T1.X, KC0[2].Y, literal.x,
863; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
864  %result = shl i64 %a, 32
865  store i64 %result, i64 addrspace(1)* %out
866  ret void
867}
868
; v_shl_32_i64: load an i64 from %in, shift it left by the constant 32 and
; store it to %out. Both pointers are indexed by %tid, which despite its
; name is the workgroup id (llvm.amdgcn.workgroup.id.x). The GCN checks
; expect a single buffer_load_dword whose result is stored as the second
; register of the dwordx2 store, paired with a zero first register.
869define amdgpu_kernel void @v_shl_32_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in) {
870; GCN-LABEL: v_shl_32_i64:
871; GCN:       ; %bb.0:
872; GCN-NEXT:    s_load_dwordx4 s[8:11], s[0:1], 0x9
873; GCN-NEXT:    s_ashr_i32 s3, s2, 31
874; GCN-NEXT:    s_lshl_b64 s[0:1], s[2:3], 3
875; GCN-NEXT:    v_mov_b32_e32 v0, s0
876; GCN-NEXT:    s_mov_b32 s7, 0xf000
877; GCN-NEXT:    s_mov_b32 s6, 0
878; GCN-NEXT:    s_waitcnt lgkmcnt(0)
879; GCN-NEXT:    s_mov_b64 s[4:5], s[10:11]
880; GCN-NEXT:    v_mov_b32_e32 v1, s1
881; GCN-NEXT:    buffer_load_dword v3, v[0:1], s[4:7], 0 addr64
882; GCN-NEXT:    s_mov_b64 s[10:11], s[6:7]
883; GCN-NEXT:    v_mov_b32_e32 v2, 0
884; GCN-NEXT:    s_waitcnt vmcnt(0)
885; GCN-NEXT:    buffer_store_dwordx2 v[2:3], v[0:1], s[8:11], 0 addr64
886; GCN-NEXT:    s_endpgm
887;
888; EG-LABEL: v_shl_32_i64:
889; EG:       ; %bb.0:
890; EG-NEXT:    ALU 2, @8, KC0[CB0:0-32], KC1[]
891; EG-NEXT:    TEX 0 @6
892; EG-NEXT:    ALU 4, @11, KC0[CB0:0-32], KC1[]
893; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T1.XY, T2.X, 1
894; EG-NEXT:    CF_END
895; EG-NEXT:    PAD
896; EG-NEXT:    Fetch clause starting at 6:
897; EG-NEXT:     VTX_READ_32 T0.X, T0.X, 0, #1
898; EG-NEXT:    ALU clause starting at 8:
899; EG-NEXT:     LSHL * T0.W, T1.X, literal.x,
900; EG-NEXT:    3(4.203895e-45), 0(0.000000e+00)
901; EG-NEXT:     ADD_INT * T0.X, KC0[2].Z, PV.W,
902; EG-NEXT:    ALU clause starting at 11:
903; EG-NEXT:     MOV T1.X, 0.0,
904; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, T0.W,
905; EG-NEXT:     LSHR T2.X, PV.W, literal.x,
906; EG-NEXT:     MOV * T1.Y, T0.X,
907; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
908  %tid = call i32 @llvm.amdgcn.workgroup.id.x() #0
909  %gep.in = getelementptr i64, i64 addrspace(1)* %in, i32 %tid
910  %gep.out = getelementptr i64, i64 addrspace(1)* %out, i32 %tid
911  %a = load i64, i64 addrspace(1)* %gep.in
912  %result = shl i64 %a, 32
913  store i64 %result, i64 addrspace(1)* %gep.out
914  ret void
915}
916
; s_shl_constant_i64: shift the constant 281474976710655 (0xFFFFFFFFFFFF)
; left by the i64 kernel argument %a and store the result to %out. The GCN
; checks expect the constant to be built in an SGPR pair (-1 in the first
; register, 0xffff in the second) and shifted with s_lshl_b64.
917define amdgpu_kernel void @s_shl_constant_i64(i64 addrspace(1)* %out, i64 %a) {
918; GCN-LABEL: s_shl_constant_i64:
919; GCN:       ; %bb.0:
920; GCN-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
921; GCN-NEXT:    s_mov_b32 s2, -1
922; GCN-NEXT:    s_mov_b32 s9, 0xffff
923; GCN-NEXT:    s_mov_b32 s8, s2
924; GCN-NEXT:    s_mov_b32 s3, 0xf000
925; GCN-NEXT:    s_waitcnt lgkmcnt(0)
926; GCN-NEXT:    s_mov_b32 s0, s4
927; GCN-NEXT:    s_mov_b32 s1, s5
928; GCN-NEXT:    s_lshl_b64 s[4:5], s[8:9], s6
929; GCN-NEXT:    v_mov_b32_e32 v0, s4
930; GCN-NEXT:    v_mov_b32_e32 v1, s5
931; GCN-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
932; GCN-NEXT:    s_endpgm
933;
934; EG-LABEL: s_shl_constant_i64:
935; EG:       ; %bb.0:
936; EG-NEXT:    ALU 17, @4, KC0[CB0:0-32], KC1[]
937; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1
938; EG-NEXT:    CF_END
939; EG-NEXT:    PAD
940; EG-NEXT:    ALU clause starting at 4:
941; EG-NEXT:     SUB_INT * T0.W, literal.x, KC0[2].W,
942; EG-NEXT:    31(4.344025e-44), 0(0.000000e+00)
943; EG-NEXT:     LSHR * T0.W, literal.x, PV.W,
944; EG-NEXT:    -1(nan), 0(0.000000e+00)
945; EG-NEXT:     ADD_INT T0.Z, KC0[2].W, literal.x,
946; EG-NEXT:     LSHR T0.W, PV.W, 1,
947; EG-NEXT:     LSHL * T1.W, literal.y, KC0[2].W,
948; EG-NEXT:    -32(nan), 65535(9.183409e-41)
949; EG-NEXT:     OR_INT T1.Z, PS, PV.W,
950; EG-NEXT:     LSHL T0.W, literal.x, PV.Z,
951; EG-NEXT:     SETGT_UINT * T1.W, KC0[2].W, literal.y,
952; EG-NEXT:    -1(nan), 31(4.344025e-44)
953; EG-NEXT:     CNDE_INT T0.Y, PS, PV.Z, PV.W,
954; EG-NEXT:     LSHL * T0.W, literal.x, KC0[2].W,
955; EG-NEXT:    -1(nan), 0(0.000000e+00)
956; EG-NEXT:     CNDE_INT T0.X, T1.W, PV.W, 0.0,
957; EG-NEXT:     LSHR * T1.X, KC0[2].Y, literal.x,
958; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
959  %shl = shl i64 281474976710655, %a
960  store i64 %shl, i64 addrspace(1)* %out, align 8
961  ret void
962}
963
; v_shl_constant_i64: shift the constant 1231231234567 (0x11EAB19B207) left
; by an i64 shift amount loaded from %aptr and store the result to %out.
; The GCN checks expect only one dword of the shift amount to be loaded and
; the constant to be materialized in s[6:7] as the v_lshl_b64 source.
964define amdgpu_kernel void @v_shl_constant_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr) {
965; GCN-LABEL: v_shl_constant_i64:
966; GCN:       ; %bb.0:
967; GCN-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
968; GCN-NEXT:    s_mov_b32 s3, 0xf000
969; GCN-NEXT:    s_mov_b32 s2, -1
970; GCN-NEXT:    s_mov_b32 s10, s2
971; GCN-NEXT:    s_mov_b32 s11, s3
972; GCN-NEXT:    s_waitcnt lgkmcnt(0)
973; GCN-NEXT:    s_mov_b32 s8, s6
974; GCN-NEXT:    s_mov_b32 s9, s7
975; GCN-NEXT:    buffer_load_dword v0, off, s[8:11], 0
976; GCN-NEXT:    s_movk_i32 s7, 0x11e
977; GCN-NEXT:    s_mov_b32 s6, 0xab19b207
978; GCN-NEXT:    s_mov_b32 s0, s4
979; GCN-NEXT:    s_mov_b32 s1, s5
980; GCN-NEXT:    s_waitcnt vmcnt(0)
981; GCN-NEXT:    v_lshl_b64 v[0:1], s[6:7], v0
982; GCN-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
983; GCN-NEXT:    s_endpgm
984;
985; EG-LABEL: v_shl_constant_i64:
986; EG:       ; %bb.0:
987; EG-NEXT:    ALU 0, @8, KC0[CB0:0-32], KC1[]
988; EG-NEXT:    TEX 0 @6
989; EG-NEXT:    ALU 17, @9, KC0[CB0:0-32], KC1[]
990; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1
991; EG-NEXT:    CF_END
992; EG-NEXT:    PAD
993; EG-NEXT:    Fetch clause starting at 6:
994; EG-NEXT:     VTX_READ_32 T0.X, T0.X, 0, #1
995; EG-NEXT:    ALU clause starting at 8:
996; EG-NEXT:     MOV * T0.X, KC0[2].Z,
997; EG-NEXT:    ALU clause starting at 9:
998; EG-NEXT:     SUB_INT * T0.W, literal.x, T0.X,
999; EG-NEXT:    31(4.344025e-44), 0(0.000000e+00)
1000; EG-NEXT:     LSHR * T0.W, literal.x, PV.W,
1001; EG-NEXT:    -1424379385(-5.460358e-13), 0(0.000000e+00)
1002; EG-NEXT:     ADD_INT T0.Z, T0.X, literal.x,
1003; EG-NEXT:     LSHR T0.W, PV.W, 1,
1004; EG-NEXT:     LSHL * T1.W, literal.y, T0.X,
1005; EG-NEXT:    -32(nan), 286(4.007714e-43)
1006; EG-NEXT:     OR_INT T1.Z, PS, PV.W,
1007; EG-NEXT:     SETGT_UINT T0.W, T0.X, literal.x,
1008; EG-NEXT:     LSHL * T1.W, literal.y, PV.Z,
1009; EG-NEXT:    31(4.344025e-44), -1424379385(-5.460358e-13)
1010; EG-NEXT:     CNDE_INT T0.Y, PV.W, PV.Z, PS,
1011; EG-NEXT:     LSHL * T1.W, literal.x, T0.X,
1012; EG-NEXT:    -1424379385(-5.460358e-13), 0(0.000000e+00)
1013; EG-NEXT:     CNDE_INT T0.X, T0.W, PV.W, 0.0,
1014; EG-NEXT:     LSHR * T1.X, KC0[2].Y, literal.x,
1015; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
1016  %a = load i64, i64 addrspace(1)* %aptr, align 8
1017  %shl = shl i64 1231231234567, %a
1018  store i64 %shl, i64 addrspace(1)* %out, align 8
1019  ret void
1020}
1021
; v_shl_i64_32_bit_constant: shift the constant 1234567 (0x12d687), which
; fits in 32 bits, left by an i64 shift amount loaded from %aptr and store
; the result to %out. The GCN checks expect the constant in s[6:7] with the
; second register set to 0.
1022define amdgpu_kernel void @v_shl_i64_32_bit_constant(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr) {
1023; GCN-LABEL: v_shl_i64_32_bit_constant:
1024; GCN:       ; %bb.0:
1025; GCN-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
1026; GCN-NEXT:    s_mov_b32 s3, 0xf000
1027; GCN-NEXT:    s_mov_b32 s2, -1
1028; GCN-NEXT:    s_mov_b32 s10, s2
1029; GCN-NEXT:    s_mov_b32 s11, s3
1030; GCN-NEXT:    s_waitcnt lgkmcnt(0)
1031; GCN-NEXT:    s_mov_b32 s8, s6
1032; GCN-NEXT:    s_mov_b32 s9, s7
1033; GCN-NEXT:    buffer_load_dword v0, off, s[8:11], 0
1034; GCN-NEXT:    s_mov_b32 s7, 0
1035; GCN-NEXT:    s_mov_b32 s6, 0x12d687
1036; GCN-NEXT:    s_mov_b32 s0, s4
1037; GCN-NEXT:    s_mov_b32 s1, s5
1038; GCN-NEXT:    s_waitcnt vmcnt(0)
1039; GCN-NEXT:    v_lshl_b64 v[0:1], s[6:7], v0
1040; GCN-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
1041; GCN-NEXT:    s_endpgm
1042;
1043; EG-LABEL: v_shl_i64_32_bit_constant:
1044; EG:       ; %bb.0:
1045; EG-NEXT:    ALU 0, @8, KC0[CB0:0-32], KC1[]
1046; EG-NEXT:    TEX 0 @6
1047; EG-NEXT:    ALU 14, @9, KC0[CB0:0-32], KC1[]
1048; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1
1049; EG-NEXT:    CF_END
1050; EG-NEXT:    PAD
1051; EG-NEXT:    Fetch clause starting at 6:
1052; EG-NEXT:     VTX_READ_32 T0.X, T0.X, 0, #1
1053; EG-NEXT:    ALU clause starting at 8:
1054; EG-NEXT:     MOV * T0.X, KC0[2].Z,
1055; EG-NEXT:    ALU clause starting at 9:
1056; EG-NEXT:     SUB_INT T0.W, literal.x, T0.X,
1057; EG-NEXT:     ADD_INT * T1.W, T0.X, literal.y,
1058; EG-NEXT:    31(4.344025e-44), -32(nan)
1059; EG-NEXT:     LSHR * T0.W, literal.x, PV.W,
1060; EG-NEXT:    1234567(1.729997e-39), 0(0.000000e+00)
1061; EG-NEXT:     LSHR T0.Z, PV.W, 1,
1062; EG-NEXT:     LSHL T0.W, literal.x, T1.W,
1063; EG-NEXT:     SETGT_UINT * T1.W, T0.X, literal.y,
1064; EG-NEXT:    1234567(1.729997e-39), 31(4.344025e-44)
1065; EG-NEXT:     CNDE_INT T0.Y, PS, PV.Z, PV.W,
1066; EG-NEXT:     LSHL * T0.W, literal.x, T0.X,
1067; EG-NEXT:    1234567(1.729997e-39), 0(0.000000e+00)
1068; EG-NEXT:     CNDE_INT T0.X, T1.W, PV.W, 0.0,
1069; EG-NEXT:     LSHR * T1.X, KC0[2].Y, literal.x,
1070; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
1071  %a = load i64, i64 addrspace(1)* %aptr, align 8
1072  %shl = shl i64 1234567, %a
1073  store i64 %shl, i64 addrspace(1)* %out, align 8
1074  ret void
1075}
1076
; v_shl_inline_imm_64_i64: shift the constant 64 left by an i64 shift amount
; loaded from %aptr and store the result to %out. The GCN checks expect 64
; to appear directly as the source operand of v_lshl_b64, with no register
; materialization of the constant.
1077define amdgpu_kernel void @v_shl_inline_imm_64_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr) {
1078; GCN-LABEL: v_shl_inline_imm_64_i64:
1079; GCN:       ; %bb.0:
1080; GCN-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
1081; GCN-NEXT:    s_mov_b32 s3, 0xf000
1082; GCN-NEXT:    s_mov_b32 s2, -1
1083; GCN-NEXT:    s_mov_b32 s10, s2
1084; GCN-NEXT:    s_mov_b32 s11, s3
1085; GCN-NEXT:    s_waitcnt lgkmcnt(0)
1086; GCN-NEXT:    s_mov_b32 s8, s6
1087; GCN-NEXT:    s_mov_b32 s9, s7
1088; GCN-NEXT:    buffer_load_dword v0, off, s[8:11], 0
1089; GCN-NEXT:    s_mov_b32 s0, s4
1090; GCN-NEXT:    s_mov_b32 s1, s5
1091; GCN-NEXT:    s_waitcnt vmcnt(0)
1092; GCN-NEXT:    v_lshl_b64 v[0:1], 64, v0
1093; GCN-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
1094; GCN-NEXT:    s_endpgm
1095;
1096; EG-LABEL: v_shl_inline_imm_64_i64:
1097; EG:       ; %bb.0:
1098; EG-NEXT:    ALU 0, @8, KC0[CB0:0-32], KC1[]
1099; EG-NEXT:    TEX 0 @6
1100; EG-NEXT:    ALU 14, @9, KC0[CB0:0-32], KC1[]
1101; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1
1102; EG-NEXT:    CF_END
1103; EG-NEXT:    PAD
1104; EG-NEXT:    Fetch clause starting at 6:
1105; EG-NEXT:     VTX_READ_32 T0.X, T0.X, 0, #1
1106; EG-NEXT:    ALU clause starting at 8:
1107; EG-NEXT:     MOV * T0.X, KC0[2].Z,
1108; EG-NEXT:    ALU clause starting at 9:
1109; EG-NEXT:     SUB_INT T0.W, literal.x, T0.X,
1110; EG-NEXT:     ADD_INT * T1.W, T0.X, literal.y,
1111; EG-NEXT:    31(4.344025e-44), -32(nan)
1112; EG-NEXT:     LSHR * T0.W, literal.x, PV.W,
1113; EG-NEXT:    64(8.968310e-44), 0(0.000000e+00)
1114; EG-NEXT:     LSHR T0.Z, PV.W, 1,
1115; EG-NEXT:     LSHL T0.W, literal.x, T1.W,
1116; EG-NEXT:     SETGT_UINT * T1.W, T0.X, literal.y,
1117; EG-NEXT:    64(8.968310e-44), 31(4.344025e-44)
1118; EG-NEXT:     CNDE_INT T0.Y, PS, PV.Z, PV.W,
1119; EG-NEXT:     LSHL * T0.W, literal.x, T0.X,
1120; EG-NEXT:    64(8.968310e-44), 0(0.000000e+00)
1121; EG-NEXT:     CNDE_INT T0.X, T1.W, PV.W, 0.0,
1122; EG-NEXT:     LSHR * T1.X, KC0[2].Y, literal.x,
1123; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
1124  %a = load i64, i64 addrspace(1)* %aptr, align 8
1125  %shl = shl i64 64, %a
1126  store i64 %shl, i64 addrspace(1)* %out, align 8
1127  ret void
1128}
1129
; s_shl_inline_imm_64_i64: shift the constant 64 left by the i64 kernel
; argument %a and store the result to %out (%aptr is not referenced by the
; body). The GCN checks expect 64 to appear directly as the source operand
; of s_lshl_b64.
1130define amdgpu_kernel void @s_shl_inline_imm_64_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 %a) {
1131; GCN-LABEL: s_shl_inline_imm_64_i64:
1132; GCN:       ; %bb.0:
1133; GCN-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
1134; GCN-NEXT:    s_load_dword s0, s[0:1], 0xd
1135; GCN-NEXT:    s_mov_b32 s7, 0xf000
1136; GCN-NEXT:    s_mov_b32 s6, -1
1137; GCN-NEXT:    s_waitcnt lgkmcnt(0)
1138; GCN-NEXT:    s_lshl_b64 s[0:1], 64, s0
1139; GCN-NEXT:    v_mov_b32_e32 v0, s0
1140; GCN-NEXT:    v_mov_b32_e32 v1, s1
1141; GCN-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
1142; GCN-NEXT:    s_endpgm
1143;
1144; EG-LABEL: s_shl_inline_imm_64_i64:
1145; EG:       ; %bb.0:
1146; EG-NEXT:    ALU 14, @4, KC0[CB0:0-32], KC1[]
1147; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1
1148; EG-NEXT:    CF_END
1149; EG-NEXT:    PAD
1150; EG-NEXT:    ALU clause starting at 4:
1151; EG-NEXT:     SUB_INT * T0.W, literal.x, KC0[2].W,
1152; EG-NEXT:    31(4.344025e-44), 0(0.000000e+00)
1153; EG-NEXT:     LSHR T0.W, literal.x, PV.W,
1154; EG-NEXT:     ADD_INT * T1.W, KC0[2].W, literal.y,
1155; EG-NEXT:    64(8.968310e-44), -32(nan)
1156; EG-NEXT:     LSHL T0.Z, literal.x, PS,
1157; EG-NEXT:     LSHR T0.W, PV.W, 1,
1158; EG-NEXT:     SETGT_UINT * T1.W, KC0[2].W, literal.y,
1159; EG-NEXT:    64(8.968310e-44), 31(4.344025e-44)
1160; EG-NEXT:     CNDE_INT T0.Y, PS, PV.W, PV.Z,
1161; EG-NEXT:     LSHL * T0.W, literal.x, KC0[2].W,
1162; EG-NEXT:    64(8.968310e-44), 0(0.000000e+00)
1163; EG-NEXT:     CNDE_INT T0.X, T1.W, PV.W, 0.0,
1164; EG-NEXT:     LSHR * T1.X, KC0[2].Y, literal.x,
1165; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
1166  %shl = shl i64 64, %a
1167  store i64 %shl, i64 addrspace(1)* %out, align 8
1168  ret void
1169}
1170
; s_shl_inline_imm_1_i64: shift the constant 1 left by the i64 kernel
; argument %a and store the result to %out (%aptr is not referenced by the
; body). The GCN checks expect 1 to appear directly as the source operand of
; s_lshl_b64.
1171define amdgpu_kernel void @s_shl_inline_imm_1_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 %a) {
1172; GCN-LABEL: s_shl_inline_imm_1_i64:
1173; GCN:       ; %bb.0:
1174; GCN-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
1175; GCN-NEXT:    s_load_dword s0, s[0:1], 0xd
1176; GCN-NEXT:    s_mov_b32 s7, 0xf000
1177; GCN-NEXT:    s_mov_b32 s6, -1
1178; GCN-NEXT:    s_waitcnt lgkmcnt(0)
1179; GCN-NEXT:    s_lshl_b64 s[0:1], 1, s0
1180; GCN-NEXT:    v_mov_b32_e32 v0, s0
1181; GCN-NEXT:    v_mov_b32_e32 v1, s1
1182; GCN-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
1183; GCN-NEXT:    s_endpgm
1184;
1185; EG-LABEL: s_shl_inline_imm_1_i64:
1186; EG:       ; %bb.0:
1187; EG-NEXT:    ALU 8, @4, KC0[CB0:0-32], KC1[]
1188; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1
1189; EG-NEXT:    CF_END
1190; EG-NEXT:    PAD
1191; EG-NEXT:    ALU clause starting at 4:
1192; EG-NEXT:     ADD_INT T0.Z, KC0[2].W, literal.x,
1193; EG-NEXT:     SETGT_UINT T0.W, KC0[2].W, literal.y,
1194; EG-NEXT:     LSHL * T1.W, 1, KC0[2].W,
1195; EG-NEXT:    -32(nan), 31(4.344025e-44)
1196; EG-NEXT:     CNDE_INT T0.X, PV.W, PS, 0.0,
1197; EG-NEXT:     LSHL T1.W, 1, PV.Z,
1198; EG-NEXT:     LSHR * T1.X, KC0[2].Y, literal.x,
1199; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
1200; EG-NEXT:     CNDE_INT * T0.Y, T0.W, 0.0, PV.W,
1201  %shl = shl i64 1, %a
1202  store i64 %shl, i64 addrspace(1)* %out, align 8
1203  ret void
1204}
1205
; s_shl_inline_imm_1_0_i64: shift the constant 4607182418800017408
; (0x3FF0000000000000, the bit pattern of double 1.0) left by the i64 kernel
; argument %a and store the result to %out. The GCN checks expect the
; constant to be printed as the operand 1.0 of s_lshl_b64.
1206define amdgpu_kernel void @s_shl_inline_imm_1_0_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 %a) {
1207; GCN-LABEL: s_shl_inline_imm_1_0_i64:
1208; GCN:       ; %bb.0:
1209; GCN-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
1210; GCN-NEXT:    s_load_dword s0, s[0:1], 0xd
1211; GCN-NEXT:    s_mov_b32 s7, 0xf000
1212; GCN-NEXT:    s_mov_b32 s6, -1
1213; GCN-NEXT:    s_waitcnt lgkmcnt(0)
1214; GCN-NEXT:    s_lshl_b64 s[0:1], 1.0, s0
1215; GCN-NEXT:    v_mov_b32_e32 v0, s0
1216; GCN-NEXT:    v_mov_b32_e32 v1, s1
1217; GCN-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
1218; GCN-NEXT:    s_endpgm
1219;
1220; EG-LABEL: s_shl_inline_imm_1_0_i64:
1221; EG:       ; %bb.0:
1222; EG-NEXT:    ALU 6, @4, KC0[CB0:0-32], KC1[]
1223; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1
1224; EG-NEXT:    CF_END
1225; EG-NEXT:    PAD
1226; EG-NEXT:    ALU clause starting at 4:
1227; EG-NEXT:     SETGT_UINT T0.W, KC0[2].W, literal.x,
1228; EG-NEXT:     LSHL * T1.W, literal.y, KC0[2].W,
1229; EG-NEXT:    31(4.344025e-44), 1072693248(1.875000e+00)
1230; EG-NEXT:     CNDE_INT * T0.Y, PV.W, PS, 0.0,
1231; EG-NEXT:     MOV T0.X, 0.0,
1232; EG-NEXT:     LSHR * T1.X, KC0[2].Y, literal.x,
1233; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
1234  %shl = shl i64 4607182418800017408, %a
1235  store i64 %shl, i64 addrspace(1)* %out, align 8
1236  ret void
1237}
1238
; s_shl_inline_imm_neg_1_0_i64: shift the constant 13830554455654793216
; (0xBFF0000000000000, the bit pattern of double -1.0) left by the i64
; kernel argument %a and store the result to %out. The GCN checks expect the
; constant to be printed as the operand -1.0 of s_lshl_b64.
1239define amdgpu_kernel void @s_shl_inline_imm_neg_1_0_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 %a) {
1240; GCN-LABEL: s_shl_inline_imm_neg_1_0_i64:
1241; GCN:       ; %bb.0:
1242; GCN-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
1243; GCN-NEXT:    s_load_dword s0, s[0:1], 0xd
1244; GCN-NEXT:    s_mov_b32 s7, 0xf000
1245; GCN-NEXT:    s_mov_b32 s6, -1
1246; GCN-NEXT:    s_waitcnt lgkmcnt(0)
1247; GCN-NEXT:    s_lshl_b64 s[0:1], -1.0, s0
1248; GCN-NEXT:    v_mov_b32_e32 v0, s0
1249; GCN-NEXT:    v_mov_b32_e32 v1, s1
1250; GCN-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
1251; GCN-NEXT:    s_endpgm
1252;
1253; EG-LABEL: s_shl_inline_imm_neg_1_0_i64:
1254; EG:       ; %bb.0:
1255; EG-NEXT:    ALU 6, @4, KC0[CB0:0-32], KC1[]
1256; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1
1257; EG-NEXT:    CF_END
1258; EG-NEXT:    PAD
1259; EG-NEXT:    ALU clause starting at 4:
1260; EG-NEXT:     SETGT_UINT T0.W, KC0[2].W, literal.x,
1261; EG-NEXT:     LSHL * T1.W, literal.y, KC0[2].W,
1262; EG-NEXT:    31(4.344025e-44), -1074790400(-1.875000e+00)
1263; EG-NEXT:     CNDE_INT * T0.Y, PV.W, PS, 0.0,
1264; EG-NEXT:     MOV T0.X, 0.0,
1265; EG-NEXT:     LSHR * T1.X, KC0[2].Y, literal.x,
1266; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
1267  %shl = shl i64 13830554455654793216, %a
1268  store i64 %shl, i64 addrspace(1)* %out, align 8
1269  ret void
1270}
1271
; s_shl_inline_imm_0_5_i64: shift the constant 4602678819172646912
; (0x3FE0000000000000, the bit pattern of double 0.5) left by the i64 kernel
; argument %a and store the result to %out. The GCN checks expect the
; constant to be printed as the operand 0.5 of s_lshl_b64.
1272define amdgpu_kernel void @s_shl_inline_imm_0_5_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 %a) {
1273; GCN-LABEL: s_shl_inline_imm_0_5_i64:
1274; GCN:       ; %bb.0:
1275; GCN-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
1276; GCN-NEXT:    s_load_dword s0, s[0:1], 0xd
1277; GCN-NEXT:    s_mov_b32 s7, 0xf000
1278; GCN-NEXT:    s_mov_b32 s6, -1
1279; GCN-NEXT:    s_waitcnt lgkmcnt(0)
1280; GCN-NEXT:    s_lshl_b64 s[0:1], 0.5, s0
1281; GCN-NEXT:    v_mov_b32_e32 v0, s0
1282; GCN-NEXT:    v_mov_b32_e32 v1, s1
1283; GCN-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
1284; GCN-NEXT:    s_endpgm
1285;
1286; EG-LABEL: s_shl_inline_imm_0_5_i64:
1287; EG:       ; %bb.0:
1288; EG-NEXT:    ALU 6, @4, KC0[CB0:0-32], KC1[]
1289; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1
1290; EG-NEXT:    CF_END
1291; EG-NEXT:    PAD
1292; EG-NEXT:    ALU clause starting at 4:
1293; EG-NEXT:     SETGT_UINT T0.W, KC0[2].W, literal.x,
1294; EG-NEXT:     LSHL * T1.W, literal.y, KC0[2].W,
1295; EG-NEXT:    31(4.344025e-44), 1071644672(1.750000e+00)
1296; EG-NEXT:     CNDE_INT * T0.Y, PV.W, PS, 0.0,
1297; EG-NEXT:     MOV T0.X, 0.0,
1298; EG-NEXT:     LSHR * T1.X, KC0[2].Y, literal.x,
1299; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
1300  %shl = shl i64 4602678819172646912, %a
1301  store i64 %shl, i64 addrspace(1)* %out, align 8
1302  ret void
1303}
1304
; s_shl_inline_imm_neg_0_5_i64: shift the constant 13826050856027422720
; (0xBFE0000000000000, the bit pattern of double -0.5) left by the i64
; kernel argument %a and store the result to %out. The GCN checks expect the
; constant to be printed as the operand -0.5 of s_lshl_b64.
1305define amdgpu_kernel void @s_shl_inline_imm_neg_0_5_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 %a) {
1306; GCN-LABEL: s_shl_inline_imm_neg_0_5_i64:
1307; GCN:       ; %bb.0:
1308; GCN-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
1309; GCN-NEXT:    s_load_dword s0, s[0:1], 0xd
1310; GCN-NEXT:    s_mov_b32 s7, 0xf000
1311; GCN-NEXT:    s_mov_b32 s6, -1
1312; GCN-NEXT:    s_waitcnt lgkmcnt(0)
1313; GCN-NEXT:    s_lshl_b64 s[0:1], -0.5, s0
1314; GCN-NEXT:    v_mov_b32_e32 v0, s0
1315; GCN-NEXT:    v_mov_b32_e32 v1, s1
1316; GCN-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
1317; GCN-NEXT:    s_endpgm
1318;
1319; EG-LABEL: s_shl_inline_imm_neg_0_5_i64:
1320; EG:       ; %bb.0:
1321; EG-NEXT:    ALU 6, @4, KC0[CB0:0-32], KC1[]
1322; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1
1323; EG-NEXT:    CF_END
1324; EG-NEXT:    PAD
1325; EG-NEXT:    ALU clause starting at 4:
1326; EG-NEXT:     SETGT_UINT T0.W, KC0[2].W, literal.x,
1327; EG-NEXT:     LSHL * T1.W, literal.y, KC0[2].W,
1328; EG-NEXT:    31(4.344025e-44), -1075838976(-1.750000e+00)
1329; EG-NEXT:     CNDE_INT * T0.Y, PV.W, PS, 0.0,
1330; EG-NEXT:     MOV T0.X, 0.0,
1331; EG-NEXT:     LSHR * T1.X, KC0[2].Y, literal.x,
1332; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
1333  %shl = shl i64 13826050856027422720, %a
1334  store i64 %shl, i64 addrspace(1)* %out, align 8
1335  ret void
1336}
1337
; s_shl_inline_imm_2_0_i64: shift the constant 4611686018427387904
; (0x4000000000000000, the bit pattern of double 2.0) left by the i64 kernel
; argument %a and store the result to %out. The GCN checks expect the
; constant to be printed as the operand 2.0 of s_lshl_b64.
1338define amdgpu_kernel void @s_shl_inline_imm_2_0_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 %a) {
1339; GCN-LABEL: s_shl_inline_imm_2_0_i64:
1340; GCN:       ; %bb.0:
1341; GCN-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
1342; GCN-NEXT:    s_load_dword s0, s[0:1], 0xd
1343; GCN-NEXT:    s_mov_b32 s7, 0xf000
1344; GCN-NEXT:    s_mov_b32 s6, -1
1345; GCN-NEXT:    s_waitcnt lgkmcnt(0)
1346; GCN-NEXT:    s_lshl_b64 s[0:1], 2.0, s0
1347; GCN-NEXT:    v_mov_b32_e32 v0, s0
1348; GCN-NEXT:    v_mov_b32_e32 v1, s1
1349; GCN-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
1350; GCN-NEXT:    s_endpgm
1351;
1352; EG-LABEL: s_shl_inline_imm_2_0_i64:
1353; EG:       ; %bb.0:
1354; EG-NEXT:    ALU 6, @4, KC0[CB0:0-32], KC1[]
1355; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1
1356; EG-NEXT:    CF_END
1357; EG-NEXT:    PAD
1358; EG-NEXT:    ALU clause starting at 4:
1359; EG-NEXT:     SETGT_UINT T0.W, KC0[2].W, literal.x,
1360; EG-NEXT:     LSHL * T1.W, literal.y, KC0[2].W,
1361; EG-NEXT:    31(4.344025e-44), 1073741824(2.000000e+00)
1362; EG-NEXT:     CNDE_INT * T0.Y, PV.W, PS, 0.0,
1363; EG-NEXT:     MOV T0.X, 0.0,
1364; EG-NEXT:     LSHR * T1.X, KC0[2].Y, literal.x,
1365; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
1366  %shl = shl i64 4611686018427387904, %a
1367  store i64 %shl, i64 addrspace(1)* %out, align 8
1368  ret void
1369}
1370
; s_shl_inline_imm_neg_2_0_i64: shift the constant 13835058055282163712
; (0xC000000000000000, the bit pattern of double -2.0) left by the i64
; kernel argument %a and store the result to %out. The GCN checks expect the
; constant to be printed as the operand -2.0 of s_lshl_b64.
1371define amdgpu_kernel void @s_shl_inline_imm_neg_2_0_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 %a) {
1372; GCN-LABEL: s_shl_inline_imm_neg_2_0_i64:
1373; GCN:       ; %bb.0:
1374; GCN-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
1375; GCN-NEXT:    s_load_dword s0, s[0:1], 0xd
1376; GCN-NEXT:    s_mov_b32 s7, 0xf000
1377; GCN-NEXT:    s_mov_b32 s6, -1
1378; GCN-NEXT:    s_waitcnt lgkmcnt(0)
1379; GCN-NEXT:    s_lshl_b64 s[0:1], -2.0, s0
1380; GCN-NEXT:    v_mov_b32_e32 v0, s0
1381; GCN-NEXT:    v_mov_b32_e32 v1, s1
1382; GCN-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
1383; GCN-NEXT:    s_endpgm
1384;
1385; EG-LABEL: s_shl_inline_imm_neg_2_0_i64:
1386; EG:       ; %bb.0:
1387; EG-NEXT:    ALU 6, @4, KC0[CB0:0-32], KC1[]
1388; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1
1389; EG-NEXT:    CF_END
1390; EG-NEXT:    PAD
1391; EG-NEXT:    ALU clause starting at 4:
1392; EG-NEXT:     SETGT_UINT T0.W, KC0[2].W, literal.x,
1393; EG-NEXT:     LSHL * T1.W, literal.y, KC0[2].W,
1394; EG-NEXT:    31(4.344025e-44), -1073741824(-2.000000e+00)
1395; EG-NEXT:     CNDE_INT * T0.Y, PV.W, PS, 0.0,
1396; EG-NEXT:     MOV T0.X, 0.0,
1397; EG-NEXT:     LSHR * T1.X, KC0[2].Y, literal.x,
1398; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
1399  %shl = shl i64 13835058055282163712, %a
1400  store i64 %shl, i64 addrspace(1)* %out, align 8
1401  ret void
1402}
1403
; s_shl_inline_imm_4_0_i64: shift the constant 4616189618054758400
; (0x4010000000000000, the bit pattern of double 4.0) left by the i64 kernel
; argument %a and store the result to %out. The GCN checks expect the
; constant to be printed as the operand 4.0 of s_lshl_b64.
1404define amdgpu_kernel void @s_shl_inline_imm_4_0_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 %a) {
1405; GCN-LABEL: s_shl_inline_imm_4_0_i64:
1406; GCN:       ; %bb.0:
1407; GCN-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
1408; GCN-NEXT:    s_load_dword s0, s[0:1], 0xd
1409; GCN-NEXT:    s_mov_b32 s7, 0xf000
1410; GCN-NEXT:    s_mov_b32 s6, -1
1411; GCN-NEXT:    s_waitcnt lgkmcnt(0)
1412; GCN-NEXT:    s_lshl_b64 s[0:1], 4.0, s0
1413; GCN-NEXT:    v_mov_b32_e32 v0, s0
1414; GCN-NEXT:    v_mov_b32_e32 v1, s1
1415; GCN-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
1416; GCN-NEXT:    s_endpgm
1417;
1418; EG-LABEL: s_shl_inline_imm_4_0_i64:
1419; EG:       ; %bb.0:
1420; EG-NEXT:    ALU 6, @4, KC0[CB0:0-32], KC1[]
1421; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1
1422; EG-NEXT:    CF_END
1423; EG-NEXT:    PAD
1424; EG-NEXT:    ALU clause starting at 4:
1425; EG-NEXT:     SETGT_UINT T0.W, KC0[2].W, literal.x,
1426; EG-NEXT:     LSHL * T1.W, literal.y, KC0[2].W,
1427; EG-NEXT:    31(4.344025e-44), 1074790400(2.250000e+00)
1428; EG-NEXT:     CNDE_INT * T0.Y, PV.W, PS, 0.0,
1429; EG-NEXT:     MOV T0.X, 0.0,
1430; EG-NEXT:     LSHR * T1.X, KC0[2].Y, literal.x,
1431; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
1432  %shl = shl i64 4616189618054758400, %a
1433  store i64 %shl, i64 addrspace(1)* %out, align 8
1434  ret void
1435}
1436
; s_shl_inline_imm_neg_4_0_i64: shift the constant 13839561654909534208
; (0xC010000000000000, the bit pattern of double -4.0) left by the i64
; kernel argument %a and store the result to %out. The GCN checks expect the
; constant to be printed as the operand -4.0 of s_lshl_b64.
1437define amdgpu_kernel void @s_shl_inline_imm_neg_4_0_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 %a) {
1438; GCN-LABEL: s_shl_inline_imm_neg_4_0_i64:
1439; GCN:       ; %bb.0:
1440; GCN-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
1441; GCN-NEXT:    s_load_dword s0, s[0:1], 0xd
1442; GCN-NEXT:    s_mov_b32 s7, 0xf000
1443; GCN-NEXT:    s_mov_b32 s6, -1
1444; GCN-NEXT:    s_waitcnt lgkmcnt(0)
1445; GCN-NEXT:    s_lshl_b64 s[0:1], -4.0, s0
1446; GCN-NEXT:    v_mov_b32_e32 v0, s0
1447; GCN-NEXT:    v_mov_b32_e32 v1, s1
1448; GCN-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
1449; GCN-NEXT:    s_endpgm
1450;
1451; EG-LABEL: s_shl_inline_imm_neg_4_0_i64:
1452; EG:       ; %bb.0:
1453; EG-NEXT:    ALU 6, @4, KC0[CB0:0-32], KC1[]
1454; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1
1455; EG-NEXT:    CF_END
1456; EG-NEXT:    PAD
1457; EG-NEXT:    ALU clause starting at 4:
1458; EG-NEXT:     SETGT_UINT T0.W, KC0[2].W, literal.x,
1459; EG-NEXT:     LSHL * T1.W, literal.y, KC0[2].W,
1460; EG-NEXT:    31(4.344025e-44), -1072693248(-2.250000e+00)
1461; EG-NEXT:     CNDE_INT * T0.Y, PV.W, PS, 0.0,
1462; EG-NEXT:     MOV T0.X, 0.0,
1463; EG-NEXT:     LSHR * T1.X, KC0[2].Y, literal.x,
1464; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
1465  %shl = shl i64 13839561654909534208, %a
1466  store i64 %shl, i64 addrspace(1)* %out, align 8
1467  ret void
1468}
1469
1470
1471; Test with the 64-bit integer bit pattern for a 32-bit float in the
1472; low 32 bits, which is not a valid 64-bit inline immediate.
; The constant 1082130432 is 0x40800000, the bit pattern of float 4.0. The
; GCN checks expect it to be materialized in an SGPR pair (4.0 in the first
; register, 0 in the second) and passed to s_lshl_b64 as a register operand.
1473define amdgpu_kernel void @s_shl_inline_imm_f32_4_0_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 %a) {
1474; GCN-LABEL: s_shl_inline_imm_f32_4_0_i64:
1475; GCN:       ; %bb.0:
1476; GCN-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
1477; GCN-NEXT:    s_load_dword s2, s[0:1], 0xd
1478; GCN-NEXT:    s_mov_b32 s1, 0
1479; GCN-NEXT:    s_mov_b32 s0, 4.0
1480; GCN-NEXT:    s_mov_b32 s7, 0xf000
1481; GCN-NEXT:    s_mov_b32 s6, -1
1482; GCN-NEXT:    s_waitcnt lgkmcnt(0)
1483; GCN-NEXT:    s_lshl_b64 s[0:1], s[0:1], s2
1484; GCN-NEXT:    v_mov_b32_e32 v0, s0
1485; GCN-NEXT:    v_mov_b32_e32 v1, s1
1486; GCN-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
1487; GCN-NEXT:    s_endpgm
1488;
1489; EG-LABEL: s_shl_inline_imm_f32_4_0_i64:
1490; EG:       ; %bb.0:
1491; EG-NEXT:    ALU 14, @4, KC0[CB0:0-32], KC1[]
1492; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1
1493; EG-NEXT:    CF_END
1494; EG-NEXT:    PAD
1495; EG-NEXT:    ALU clause starting at 4:
1496; EG-NEXT:     SUB_INT * T0.W, literal.x, KC0[2].W,
1497; EG-NEXT:    31(4.344025e-44), 0(0.000000e+00)
1498; EG-NEXT:     LSHR T0.W, literal.x, PV.W,
1499; EG-NEXT:     ADD_INT * T1.W, KC0[2].W, literal.y,
1500; EG-NEXT:    1082130432(4.000000e+00), -32(nan)
1501; EG-NEXT:     LSHL T0.Z, literal.x, PS,
1502; EG-NEXT:     LSHR T0.W, PV.W, 1,
1503; EG-NEXT:     SETGT_UINT * T1.W, KC0[2].W, literal.y,
1504; EG-NEXT:    1082130432(4.000000e+00), 31(4.344025e-44)
1505; EG-NEXT:     CNDE_INT T0.Y, PS, PV.W, PV.Z,
1506; EG-NEXT:     LSHL * T0.W, literal.x, KC0[2].W,
1507; EG-NEXT:    1082130432(4.000000e+00), 0(0.000000e+00)
1508; EG-NEXT:     CNDE_INT T0.X, T1.W, PV.W, 0.0,
1509; EG-NEXT:     LSHR * T1.X, KC0[2].Y, literal.x,
1510; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
1511  %shl = shl i64 1082130432, %a
1512  store i64 %shl, i64 addrspace(1)* %out, align 8
1513  ret void
1514}
1515
1516; FIXME: Copy of -1 register
; Shifts the i64 constant -1065353216 left by the kernel argument %a and
; stores the 64-bit result to %out. The low 32 bits of the constant are
; 0xC0800000 (the f32 bit pattern of -4.0); as a sign-extended i64 the high
; 32 bits are all ones. %aptr is not used by the body.
1517define amdgpu_kernel void @s_shl_inline_imm_f32_neg_4_0_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 %a) {
1518; GCN-LABEL: s_shl_inline_imm_f32_neg_4_0_i64:
1519; GCN:       ; %bb.0:
1520; GCN-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
1521; GCN-NEXT:    s_load_dword s2, s[0:1], 0xd
1522; GCN-NEXT:    s_mov_b32 s6, -1
1523; GCN-NEXT:    s_mov_b32 s0, -4.0
1524; GCN-NEXT:    s_mov_b32 s1, s6
1525; GCN-NEXT:    s_mov_b32 s7, 0xf000
1526; GCN-NEXT:    s_waitcnt lgkmcnt(0)
1527; GCN-NEXT:    s_lshl_b64 s[0:1], s[0:1], s2
1528; GCN-NEXT:    v_mov_b32_e32 v0, s0
1529; GCN-NEXT:    v_mov_b32_e32 v1, s1
1530; GCN-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
1531; GCN-NEXT:    s_endpgm
1532;
1533; EG-LABEL: s_shl_inline_imm_f32_neg_4_0_i64:
1534; EG:       ; %bb.0:
1535; EG-NEXT:    ALU 17, @4, KC0[CB0:0-32], KC1[]
1536; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1
1537; EG-NEXT:    CF_END
1538; EG-NEXT:    PAD
1539; EG-NEXT:    ALU clause starting at 4:
1540; EG-NEXT:     SUB_INT * T0.W, literal.x, KC0[2].W,
1541; EG-NEXT:    31(4.344025e-44), 0(0.000000e+00)
1542; EG-NEXT:     LSHR * T0.W, literal.x, PV.W,
1543; EG-NEXT:    -1065353216(-4.000000e+00), 0(0.000000e+00)
1544; EG-NEXT:     ADD_INT T0.Z, KC0[2].W, literal.x,
1545; EG-NEXT:     LSHR T0.W, PV.W, 1,
1546; EG-NEXT:     LSHL * T1.W, literal.y, KC0[2].W,
1547; EG-NEXT:    -32(nan), -1(nan)
1548; EG-NEXT:     OR_INT T1.Z, PS, PV.W,
1549; EG-NEXT:     LSHL T0.W, literal.x, PV.Z,
1550; EG-NEXT:     SETGT_UINT * T1.W, KC0[2].W, literal.y,
1551; EG-NEXT:    -1065353216(-4.000000e+00), 31(4.344025e-44)
1552; EG-NEXT:     CNDE_INT T0.Y, PS, PV.Z, PV.W,
1553; EG-NEXT:     LSHL * T0.W, literal.x, KC0[2].W,
1554; EG-NEXT:    -1065353216(-4.000000e+00), 0(0.000000e+00)
1555; EG-NEXT:     CNDE_INT T0.X, T1.W, PV.W, 0.0,
1556; EG-NEXT:     LSHR * T1.X, KC0[2].Y, literal.x,
1557; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
  ; In the GCN checks above, the high half (s1) is produced by copying s6,
  ; which already holds -1, instead of a separate move of the immediate -1.
1558  %shl = shl i64 -1065353216, %a
1559  store i64 %shl, i64 addrspace(1)* %out, align 8
1560  ret void
1561}
1562
; Shifts the i64 constant 4647714815446351872 (0x4080000000000000) left by
; the kernel argument %a and stores the 64-bit result to %out. The constant
; has the f32 bit pattern of 4.0 in its high 32 bits and zero in its low
; 32 bits. %aptr is not used by the body.
1563define amdgpu_kernel void @s_shl_inline_high_imm_f32_4_0_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 %a) {
1564; GCN-LABEL: s_shl_inline_high_imm_f32_4_0_i64:
1565; GCN:       ; %bb.0:
1566; GCN-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
1567; GCN-NEXT:    s_load_dword s2, s[0:1], 0xd
1568; GCN-NEXT:    s_mov_b32 s1, 4.0
1569; GCN-NEXT:    s_mov_b32 s0, 0
1570; GCN-NEXT:    s_mov_b32 s7, 0xf000
1571; GCN-NEXT:    s_mov_b32 s6, -1
1572; GCN-NEXT:    s_waitcnt lgkmcnt(0)
1573; GCN-NEXT:    s_lshl_b64 s[0:1], s[0:1], s2
1574; GCN-NEXT:    v_mov_b32_e32 v0, s0
1575; GCN-NEXT:    v_mov_b32_e32 v1, s1
1576; GCN-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
1577; GCN-NEXT:    s_endpgm
1578;
1579; EG-LABEL: s_shl_inline_high_imm_f32_4_0_i64:
1580; EG:       ; %bb.0:
1581; EG-NEXT:    ALU 6, @4, KC0[CB0:0-32], KC1[]
1582; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1
1583; EG-NEXT:    CF_END
1584; EG-NEXT:    PAD
1585; EG-NEXT:    ALU clause starting at 4:
1586; EG-NEXT:     SETGT_UINT T0.W, KC0[2].W, literal.x,
1587; EG-NEXT:     LSHL * T1.W, literal.y, KC0[2].W,
1588; EG-NEXT:    31(4.344025e-44), 1082130432(4.000000e+00)
1589; EG-NEXT:     CNDE_INT * T0.Y, PV.W, PS, 0.0,
1590; EG-NEXT:     MOV T0.X, 0.0,
1591; EG-NEXT:     LSHR * T1.X, KC0[2].Y, literal.x,
1592; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
  ; The GCN checks above expect s1 = 4.0 and s0 = 0 as the 64-bit shift input.
  ; The Evergreen checks expect the low result word to be a plain move of zero.
1593  %shl = shl i64 4647714815446351872, %a
1594  store i64 %shl, i64 addrspace(1)* %out, align 8
1595  ret void
1596}
1597
; Shifts the i64 constant 13871086852301127680 (0xC080000000000000) left by
; the kernel argument %a and stores the 64-bit result to %out. The constant
; has the f32 bit pattern of -4.0 in its high 32 bits and zero in its low
; 32 bits. %aptr is not used by the body.
1598define amdgpu_kernel void @s_shl_inline_high_imm_f32_neg_4_0_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 %a) {
1599; GCN-LABEL: s_shl_inline_high_imm_f32_neg_4_0_i64:
1600; GCN:       ; %bb.0:
1601; GCN-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
1602; GCN-NEXT:    s_load_dword s2, s[0:1], 0xd
1603; GCN-NEXT:    s_mov_b32 s1, -4.0
1604; GCN-NEXT:    s_mov_b32 s0, 0
1605; GCN-NEXT:    s_mov_b32 s7, 0xf000
1606; GCN-NEXT:    s_mov_b32 s6, -1
1607; GCN-NEXT:    s_waitcnt lgkmcnt(0)
1608; GCN-NEXT:    s_lshl_b64 s[0:1], s[0:1], s2
1609; GCN-NEXT:    v_mov_b32_e32 v0, s0
1610; GCN-NEXT:    v_mov_b32_e32 v1, s1
1611; GCN-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
1612; GCN-NEXT:    s_endpgm
1613;
1614; EG-LABEL: s_shl_inline_high_imm_f32_neg_4_0_i64:
1615; EG:       ; %bb.0:
1616; EG-NEXT:    ALU 6, @4, KC0[CB0:0-32], KC1[]
1617; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1
1618; EG-NEXT:    CF_END
1619; EG-NEXT:    PAD
1620; EG-NEXT:    ALU clause starting at 4:
1621; EG-NEXT:     SETGT_UINT T0.W, KC0[2].W, literal.x,
1622; EG-NEXT:     LSHL * T1.W, literal.y, KC0[2].W,
1623; EG-NEXT:    31(4.344025e-44), -1065353216(-4.000000e+00)
1624; EG-NEXT:     CNDE_INT * T0.Y, PV.W, PS, 0.0,
1625; EG-NEXT:     MOV T0.X, 0.0,
1626; EG-NEXT:     LSHR * T1.X, KC0[2].Y, literal.x,
1627; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
  ; The GCN checks above expect s1 = -4.0 and s0 = 0 as the 64-bit shift input.
  ; The Evergreen checks expect the low result word to be a plain move of zero.
1628  %shl = shl i64 13871086852301127680, %a
1629  store i64 %shl, i64 addrspace(1)* %out, align 8
1630  ret void
1631}
1632
; Multiplies the i32 kernel argument %p by 2 and stores the result with a
; volatile store to an undef global pointer. The checks below expect the
; multiply to be emitted as a left shift by 1 on both targets.
1633define amdgpu_kernel void @test_mul2(i32 %p) {
1634; GCN-LABEL: test_mul2:
1635; GCN:       ; %bb.0:
1636; GCN-NEXT:    s_load_dword s0, s[0:1], 0x9
1637; GCN-NEXT:    s_mov_b32 s3, 0xf000
1638; GCN-NEXT:    s_mov_b32 s2, -1
1639; GCN-NEXT:    s_waitcnt lgkmcnt(0)
1640; GCN-NEXT:    s_lshl_b32 s0, s0, 1
1641; GCN-NEXT:    v_mov_b32_e32 v0, s0
1642; GCN-NEXT:    buffer_store_dword v0, off, s[0:3], 0
1643; GCN-NEXT:    s_endpgm
1644;
1645; EG-LABEL: test_mul2:
1646; EG:       ; %bb.0:
1647; EG-NEXT:    ALU 2, @4, KC0[CB0:0-32], KC1[]
1648; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T1.X, T0.X, 1
1649; EG-NEXT:    CF_END
1650; EG-NEXT:    PAD
1651; EG-NEXT:    ALU clause starting at 4:
1652; EG-NEXT:     MOV T0.X, literal.x,
1653; EG-NEXT:     LSHL * T1.X, KC0[2].Y, 1,
1654; EG-NEXT:    0(0.000000e+00), 0(0.000000e+00)
1655   %i = mul i32 %p, 2
1656   store volatile i32 %i, i32 addrspace(1)* undef
1657   ret void
1658}
1659
; Non-kernel function: computes (%in | 1) << 2 and stores it to %out.
; The or has a single use, and the checks below expect the shift to be
; performed first, followed by an or with the pre-shifted constant 4,
; i.e. (%in << 2) | 4.
1660define void @shl_or_k(i32 addrspace(1)* %out, i32 %in) {
1661; GCN-LABEL: shl_or_k:
1662; GCN:       ; %bb.0:
1663; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1664; GCN-NEXT:    s_mov_b32 s6, 0
1665; GCN-NEXT:    v_lshlrev_b32_e32 v2, 2, v2
1666; GCN-NEXT:    s_mov_b32 s7, 0xf000
1667; GCN-NEXT:    s_mov_b32 s4, s6
1668; GCN-NEXT:    s_mov_b32 s5, s6
1669; GCN-NEXT:    v_or_b32_e32 v2, 4, v2
1670; GCN-NEXT:    buffer_store_dword v2, v[0:1], s[4:7], 0 addr64
1671; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
1672; GCN-NEXT:    s_setpc_b64 s[30:31]
1673;
1674; EG-LABEL: shl_or_k:
1675; EG:       ; %bb.0:
1676; EG-NEXT:    ALU 4, @4, KC0[CB0:0-32], KC1[]
1677; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1
1678; EG-NEXT:    CF_END
1679; EG-NEXT:    PAD
1680; EG-NEXT:    ALU clause starting at 4:
1681; EG-NEXT:     LSHL * T0.W, KC0[2].Z, literal.x,
1682; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
1683; EG-NEXT:     OR_INT T0.X, PV.W, literal.x,
1684; EG-NEXT:     LSHR * T1.X, KC0[2].Y, literal.y,
1685; EG-NEXT:    4(5.605194e-45), 2(2.802597e-45)
1686  %tmp0 = or i32 %in, 1
1687  %tmp2 = shl i32 %tmp0, 2
1688  store i32 %tmp2, i32 addrspace(1)* %out
1689  ret void
1690}
1691
; Non-kernel function: computes %tmp0 = %in | 1 and %tmp2 = %tmp0 << 2,
; storing %tmp2 to %out0 and %tmp0 to %out1. Because the or result has a
; second use (the store to %out1), the checks below expect the or with 1 to
; be kept and the shift to be applied to its result.
1692define void @shl_or_k_two_uses(i32 addrspace(1)* %out0, i32 addrspace(1)* %out1, i32 %in) {
1693; GCN-LABEL: shl_or_k_two_uses:
1694; GCN:       ; %bb.0:
1695; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1696; GCN-NEXT:    s_mov_b32 s6, 0
1697; GCN-NEXT:    v_or_b32_e32 v4, 1, v4
1698; GCN-NEXT:    s_mov_b32 s7, 0xf000
1699; GCN-NEXT:    s_mov_b32 s4, s6
1700; GCN-NEXT:    s_mov_b32 s5, s6
1701; GCN-NEXT:    v_lshlrev_b32_e32 v5, 2, v4
1702; GCN-NEXT:    buffer_store_dword v5, v[0:1], s[4:7], 0 addr64
1703; GCN-NEXT:    buffer_store_dword v4, v[2:3], s[4:7], 0 addr64
1704; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
1705; GCN-NEXT:    s_setpc_b64 s[30:31]
1706;
1707; EG-LABEL: shl_or_k_two_uses:
1708; EG:       ; %bb.0:
1709; EG-NEXT:    ALU 5, @4, KC0[CB0:0-32], KC1[]
1710; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T2.X, T3.X, 0
1711; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T1.X, T0.X, 1
1712; EG-NEXT:    CF_END
1713; EG-NEXT:    ALU clause starting at 4:
1714; EG-NEXT:     LSHR T0.X, KC0[2].Z, literal.x,
1715; EG-NEXT:     OR_INT * T1.X, KC0[2].W, 1,
1716; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
1717; EG-NEXT:     LSHL T2.X, PS, literal.x,
1718; EG-NEXT:     LSHR * T3.X, KC0[2].Y, literal.x,
1719; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
1720  %tmp0 = or i32 %in, 1
1721  %tmp2 = shl i32 %tmp0, 2
1722  store i32 %tmp2, i32 addrspace(1)* %out0
1723  store i32 %tmp0, i32 addrspace(1)* %out1
1724  ret void
1725}
1726
1727attributes #0 = { nounwind readnone }
1728