1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=hawaii -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX7 %s
3; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=fiji -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX8 %s
4; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX9 %s
5
6; Test optimization to reduce shifts to narrower sizes.
7
define amdgpu_ps i64 @s_shl_i64_zext_i32(i32 inreg %x) {
; GCN-LABEL: s_shl_i64_zext_i32:
; GCN:       ; %bb.0:
; GCN-NEXT:    s_andn2_b32 s0, s0, -2.0
; GCN-NEXT:    s_lshl_b32 s0, s0, 2
; GCN-NEXT:    s_mov_b32 s1, 0
; GCN-NEXT:    ; return to shader part epilog
  ; Scalar case: the mask clears the top 2 bits, so (zext << 2) cannot carry
  ; into the high 32 bits. Expect a single 32-bit s_lshl plus a zeroed high
  ; half instead of a 64-bit shift. (The -2.0 operand of s_andn2_b32 is the
  ; inline-constant encoding of 0xc0000000, i.e. ~0x3fffffff after the NOT.)
  %and = and i32 %x, 1073741823
  %ext = zext i32 %and to i64
  %shl = shl i64 %ext, 2
  ret i64 %shl
}
20
define i64 @v_shl_i64_zext_i32(i32 %x) {
; GCN-LABEL: v_shl_i64_zext_i32:
; GCN:       ; %bb.0:
; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT:    v_and_b32_e32 v0, 0x3fffffff, v0
; GCN-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GCN-NEXT:    v_mov_b32_e32 v1, 0
; GCN-NEXT:    s_setpc_b64 s[30:31]
  ; VGPR variant of the narrowing case above: 30-bit mask means the shift
  ; fits in 32 bits, so expect a 32-bit v_lshlrev and a zero high dword.
  %and = and i32 %x, 1073741823
  %ext = zext i32 %and to i64
  %shl = shl i64 %ext, 2
  ret i64 %shl
}
34
define amdgpu_ps i64 @s_shl_i64_sext_i32(i32 inreg %x) {
; GCN-LABEL: s_shl_i64_sext_i32:
; GCN:       ; %bb.0:
; GCN-NEXT:    s_and_b32 s0, s0, 0x1fffffff
; GCN-NEXT:    s_lshl_b32 s0, s0, 2
; GCN-NEXT:    s_mov_b32 s1, 0
; GCN-NEXT:    ; return to shader part epilog
  ; sext version: the 29-bit mask leaves the sign bit (and two more) clear,
  ; so the sign-extension is known to be all zeros and the shift still
  ; narrows to a 32-bit s_lshl with a zero high half.
  %and = and i32 %x, 536870911
  %ext = sext i32 %and to i64
  %shl = shl i64 %ext, 2
  ret i64 %shl
}
47
define i64 @v_shl_i64_sext_i32(i32 %x) {
; GCN-LABEL: v_shl_i64_sext_i32:
; GCN:       ; %bb.0:
; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT:    v_and_b32_e32 v0, 0x1fffffff, v0
; GCN-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GCN-NEXT:    v_mov_b32_e32 v1, 0
; GCN-NEXT:    s_setpc_b64 s[30:31]
  ; VGPR sext variant: same 29-bit mask reasoning, so the narrowed 32-bit
  ; shift plus zeroed high dword is still the expected lowering.
  %and = and i32 %x, 536870911
  %ext = sext i32 %and to i64
  %shl = shl i64 %ext, 2
  ret i64 %shl
}
61
define amdgpu_ps i64 @s_shl_i64_zext_i32_overflow(i32 inreg %x) {
; GCN-LABEL: s_shl_i64_zext_i32_overflow:
; GCN:       ; %bb.0:
; GCN-NEXT:    s_bitset0_b32 s0, 31
; GCN-NEXT:    s_bfe_u64 s[0:1], s[0:1], 0x200000
; GCN-NEXT:    s_lshl_b64 s[0:1], s[0:1], 2
; GCN-NEXT:    ; return to shader part epilog
  ; Negative case: a 31-bit mask still lets (value << 2) overflow 32 bits,
  ; so the combine must NOT fire and a full 64-bit s_lshl_b64 is expected.
  %and = and i32 %x, 2147483647
  %ext = zext i32 %and to i64
  %shl = shl i64 %ext, 2
  ret i64 %shl
}
74
define i64 @v_shl_i64_zext_i32_overflow(i32 %x) {
; GFX7-LABEL: v_shl_i64_zext_i32_overflow:
; GFX7:       ; %bb.0:
; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT:    v_and_b32_e32 v0, 0x7fffffff, v0
; GFX7-NEXT:    v_mov_b32_e32 v1, 0
; GFX7-NEXT:    v_lshl_b64 v[0:1], v[0:1], 2
; GFX7-NEXT:    s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_shl_i64_zext_i32_overflow:
; GFX8:       ; %bb.0:
; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT:    v_and_b32_e32 v0, 0x7fffffff, v0
; GFX8-NEXT:    v_mov_b32_e32 v1, 0
; GFX8-NEXT:    v_lshlrev_b64 v[0:1], 2, v[0:1]
; GFX8-NEXT:    s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_shl_i64_zext_i32_overflow:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    v_and_b32_e32 v0, 0x7fffffff, v0
; GFX9-NEXT:    v_mov_b32_e32 v1, 0
; GFX9-NEXT:    v_lshlrev_b64 v[0:1], 2, v[0:1]
; GFX9-NEXT:    s_setpc_b64 s[30:31]
  ; VGPR negative case: combine must not narrow. Per-subtarget prefixes are
  ; needed because the 64-bit VALU shift mnemonic differs (v_lshl_b64 on
  ; GFX7 vs. v_lshlrev_b64 on GFX8/GFX9).
  %and = and i32 %x, 2147483647
  %ext = zext i32 %and to i64
  %shl = shl i64 %ext, 2
  ret i64 %shl
}
104
define amdgpu_ps i64 @s_shl_i64_sext_i32_overflow(i32 inreg %x) {
; GCN-LABEL: s_shl_i64_sext_i32_overflow:
; GCN:       ; %bb.0:
; GCN-NEXT:    s_bitset0_b32 s0, 31
; GCN-NEXT:    s_bfe_i64 s[0:1], s[0:1], 0x200000
; GCN-NEXT:    s_lshl_b64 s[0:1], s[0:1], 2
; GCN-NEXT:    ; return to shader part epilog
  ; sext negative case: 31-bit mask, so the shifted value may exceed 32 bits
  ; and the full 64-bit sign-extend (s_bfe_i64) + s_lshl_b64 must remain.
  %and = and i32 %x, 2147483647
  %ext = sext i32 %and to i64
  %shl = shl i64 %ext, 2
  ret i64 %shl
}
117
define i64 @v_shl_i64_sext_i32_overflow(i32 %x) {
; GFX7-LABEL: v_shl_i64_sext_i32_overflow:
; GFX7:       ; %bb.0:
; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT:    v_and_b32_e32 v0, 0x7fffffff, v0
; GFX7-NEXT:    v_ashrrev_i32_e32 v1, 31, v0
; GFX7-NEXT:    v_lshl_b64 v[0:1], v[0:1], 2
; GFX7-NEXT:    s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_shl_i64_sext_i32_overflow:
; GFX8:       ; %bb.0:
; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT:    v_and_b32_e32 v0, 0x7fffffff, v0
; GFX8-NEXT:    v_ashrrev_i32_e32 v1, 31, v0
; GFX8-NEXT:    v_lshlrev_b64 v[0:1], 2, v[0:1]
; GFX8-NEXT:    s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_shl_i64_sext_i32_overflow:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    v_and_b32_e32 v0, 0x7fffffff, v0
; GFX9-NEXT:    v_ashrrev_i32_e32 v1, 31, v0
; GFX9-NEXT:    v_lshlrev_b64 v[0:1], 2, v[0:1]
; GFX9-NEXT:    s_setpc_b64 s[30:31]
  ; VGPR sext negative case: high half is produced by a 31-bit arithmetic
  ; shift-right (sign extend), then the full 64-bit shift must be kept.
  %and = and i32 %x, 2147483647
  %ext = sext i32 %and to i64
  %shl = shl i64 %ext, 2
  ret i64 %shl
}
147
define amdgpu_kernel void @mulu24_shl64(i32 addrspace(1)* nocapture %arg) {
; GFX7-LABEL: mulu24_shl64:
; GFX7:       ; %bb.0: ; %bb
; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
; GFX7-NEXT:    v_and_b32_e32 v0, 6, v0
; GFX7-NEXT:    v_mul_u32_u24_e32 v0, 7, v0
; GFX7-NEXT:    v_mov_b32_e32 v1, 0
; GFX7-NEXT:    v_lshl_b64 v[2:3], v[0:1], 2
; GFX7-NEXT:    s_mov_b32 s2, 0
; GFX7-NEXT:    s_mov_b32 s3, 0xf000
; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
; GFX7-NEXT:    buffer_store_dword v1, v[2:3], s[0:3], 0 addr64
; GFX7-NEXT:    s_endpgm
;
; GFX8-LABEL: mulu24_shl64:
; GFX8:       ; %bb.0: ; %bb
; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX8-NEXT:    v_and_b32_e32 v0, 6, v0
; GFX8-NEXT:    v_mul_u32_u24_e32 v0, 7, v0
; GFX8-NEXT:    v_mov_b32_e32 v1, 0
; GFX8-NEXT:    v_lshlrev_b64 v[2:3], 2, v[0:1]
; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
; GFX8-NEXT:    v_mov_b32_e32 v5, s1
; GFX8-NEXT:    v_mov_b32_e32 v4, s0
; GFX8-NEXT:    v_add_u32_e32 v2, vcc, v4, v2
; GFX8-NEXT:    v_addc_u32_e32 v3, vcc, v5, v3, vcc
; GFX8-NEXT:    flat_store_dword v[2:3], v1
; GFX8-NEXT:    s_endpgm
;
; GFX9-LABEL: mulu24_shl64:
; GFX9:       ; %bb.0: ; %bb
; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX9-NEXT:    v_and_b32_e32 v0, 6, v0
; GFX9-NEXT:    v_mul_u32_u24_e32 v0, 7, v0
; GFX9-NEXT:    v_mov_b32_e32 v1, 0
; GFX9-NEXT:    v_lshlrev_b64 v[2:3], 2, v[0:1]
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, v4, v2
; GFX9-NEXT:    v_mov_b32_e32 v5, s1
; GFX9-NEXT:    v_mov_b32_e32 v4, s0
; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, v5, v3, vcc
; GFX9-NEXT:    global_store_dword v[2:3], v1, off
; GFX9-NEXT:    s_endpgm
bb:
  ; Kernel variant: a small unsigned multiply (selected to v_mul_u32_u24)
  ; feeds the zext + GEP address computation; checks that the known-bits
  ; result flows through to the 64-bit address shift/add for the store of 0.
  %tmp = tail call i32 @llvm.amdgcn.workitem.id.x()
  %tmp1 = and i32 %tmp, 6
  %mulconv = mul nuw nsw i32 %tmp1, 7
  %tmp2 = zext i32 %mulconv to i64
  %tmp3 = getelementptr inbounds i32, i32 addrspace(1)* %arg, i64 %tmp2
  store i32 0, i32 addrspace(1)* %tmp3, align 4
  ret void
}
200
define amdgpu_kernel void @muli24_shl64(i64 addrspace(1)* nocapture %arg, i32 addrspace(1)* nocapture readonly %arg1) {
; GFX7-LABEL: muli24_shl64:
; GFX7:       ; %bb.0: ; %bb
; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
; GFX7-NEXT:    v_lshlrev_b32_e32 v1, 2, v0
; GFX7-NEXT:    v_mov_b32_e32 v2, 0
; GFX7-NEXT:    s_mov_b32 s6, 0
; GFX7-NEXT:    s_mov_b32 s7, 0xf000
; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
; GFX7-NEXT:    s_mov_b64 s[4:5], s[2:3]
; GFX7-NEXT:    buffer_load_dword v1, v[1:2], s[4:7], 0 addr64
; GFX7-NEXT:    v_lshlrev_b32_e32 v5, 3, v0
; GFX7-NEXT:    v_mov_b32_e32 v4, s1
; GFX7-NEXT:    v_mov_b32_e32 v3, s0
; GFX7-NEXT:    s_waitcnt vmcnt(0)
; GFX7-NEXT:    v_or_b32_e32 v0, 0xff800000, v1
; GFX7-NEXT:    v_mul_i32_i24_e32 v1, -7, v0
; GFX7-NEXT:    v_lshl_b64 v[0:1], v[1:2], 3
; GFX7-NEXT:    v_add_i32_e32 v2, vcc, v3, v5
; GFX7-NEXT:    v_addc_u32_e32 v3, vcc, 0, v4, vcc
; GFX7-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
; GFX7-NEXT:    s_endpgm
;
; GFX8-LABEL: muli24_shl64:
; GFX8:       ; %bb.0: ; %bb
; GFX8-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
; GFX8-NEXT:    v_lshlrev_b32_e32 v3, 2, v0
; GFX8-NEXT:    v_lshlrev_b32_e32 v5, 3, v0
; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
; GFX8-NEXT:    v_mov_b32_e32 v1, s2
; GFX8-NEXT:    v_mov_b32_e32 v2, s3
; GFX8-NEXT:    v_add_u32_e32 v1, vcc, v1, v3
; GFX8-NEXT:    v_addc_u32_e32 v2, vcc, 0, v2, vcc
; GFX8-NEXT:    flat_load_dword v4, v[1:2]
; GFX8-NEXT:    v_mov_b32_e32 v3, s1
; GFX8-NEXT:    v_mov_b32_e32 v2, s0
; GFX8-NEXT:    v_mov_b32_e32 v1, 0
; GFX8-NEXT:    v_add_u32_e32 v2, vcc, v2, v5
; GFX8-NEXT:    v_addc_u32_e32 v3, vcc, 0, v3, vcc
; GFX8-NEXT:    s_waitcnt vmcnt(0)
; GFX8-NEXT:    v_or_b32_e32 v0, 0xff800000, v4
; GFX8-NEXT:    v_mul_i32_i24_e32 v0, -7, v0
; GFX8-NEXT:    v_lshlrev_b64 v[0:1], 3, v[0:1]
; GFX8-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
; GFX8-NEXT:    s_endpgm
;
; GFX9-LABEL: muli24_shl64:
; GFX9:       ; %bb.0: ; %bb
; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
; GFX9-NEXT:    v_lshlrev_b32_e32 v1, 2, v0
; GFX9-NEXT:    v_mov_b32_e32 v2, 0
; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 3, v0
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    global_load_dword v1, v1, s[2:3]
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    v_or_b32_e32 v1, 0xff800000, v1
; GFX9-NEXT:    v_mul_i32_i24_e32 v1, -7, v1
; GFX9-NEXT:    v_lshlrev_b64 v[1:2], 3, v[1:2]
; GFX9-NEXT:    global_store_dwordx2 v0, v[1:2], s[0:1]
; GFX9-NEXT:    s_endpgm
bb:
  ; Signed 24-bit multiply variant: the OR with 0xff800000 forces the loaded
  ; value into i24 signed range so the mul selects to v_mul_i32_i24; the
  ; zext + shl-by-3 result is then stored through the second pointer.
  %tmp = tail call i32 @llvm.amdgcn.workitem.id.x()
  %tmp2 = sext i32 %tmp to i64
  %tmp3 = getelementptr inbounds i32, i32 addrspace(1)* %arg1, i64 %tmp2
  %tmp4 = load i32, i32 addrspace(1)* %tmp3, align 4
  %tmp5 = or i32 %tmp4, -8388608
  %tmp6 = mul nsw i32 %tmp5, -7
  %tmp7 = zext i32 %tmp6 to i64
  %tmp8 = shl nuw nsw i64 %tmp7, 3
  %tmp9 = getelementptr inbounds i64, i64 addrspace(1)* %arg, i64 %tmp2
  store i64 %tmp8, i64 addrspace(1)* %tmp9, align 8
  ret void
}
274
define amdgpu_ps <2 x i64> @s_shl_v2i64_zext_v2i32(<2 x i32> inreg %x) {
; GCN-LABEL: s_shl_v2i64_zext_v2i32:
; GCN:       ; %bb.0:
; GCN-NEXT:    s_brev_b32 s2, -4
; GCN-NEXT:    s_mov_b32 s3, s2
; GCN-NEXT:    s_and_b64 s[0:1], s[0:1], s[2:3]
; GCN-NEXT:    s_bfe_u64 s[2:3], s[0:1], 0x200000
; GCN-NEXT:    s_mov_b32 s0, s1
; GCN-NEXT:    s_bfe_u64 s[4:5], s[0:1], 0x200000
; GCN-NEXT:    s_lshl_b64 s[0:1], s[2:3], 2
; GCN-NEXT:    s_lshl_b64 s[2:3], s[4:5], 2
; GCN-NEXT:    ; return to shader part epilog
  ; Vector-splat version of the scalar zext case; currently still lowered as
  ; two full 64-bit shifts (the narrowing combine does not fire here).
  ; s_brev_b32 of -4 materializes the 0x3fffffff mask.
  %and = and <2 x i32> %x, <i32 1073741823, i32 1073741823>
  %ext = zext <2 x i32> %and to <2 x i64>
  %shl = shl <2 x i64> %ext, <i64 2, i64 2>
  ret <2 x i64> %shl
}
292
define <2 x i64> @v_shl_v2i64_zext_v2i32(<2 x i32> %x) {
; GFX7-LABEL: v_shl_v2i64_zext_v2i32:
; GFX7:       ; %bb.0:
; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT:    s_brev_b32 s4, -4
; GFX7-NEXT:    v_and_b32_e32 v2, s4, v1
; GFX7-NEXT:    v_mov_b32_e32 v1, 0
; GFX7-NEXT:    v_mov_b32_e32 v3, v1
; GFX7-NEXT:    v_and_b32_e32 v0, s4, v0
; GFX7-NEXT:    v_lshl_b64 v[0:1], v[0:1], 2
; GFX7-NEXT:    v_lshl_b64 v[2:3], v[2:3], 2
; GFX7-NEXT:    s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_shl_v2i64_zext_v2i32:
; GFX8:       ; %bb.0:
; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT:    s_brev_b32 s4, -4
; GFX8-NEXT:    v_and_b32_e32 v2, s4, v1
; GFX8-NEXT:    v_mov_b32_e32 v1, 0
; GFX8-NEXT:    v_mov_b32_e32 v3, v1
; GFX8-NEXT:    v_and_b32_e32 v0, s4, v0
; GFX8-NEXT:    v_lshlrev_b64 v[0:1], 2, v[0:1]
; GFX8-NEXT:    v_lshlrev_b64 v[2:3], 2, v[2:3]
; GFX8-NEXT:    s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_shl_v2i64_zext_v2i32:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    s_brev_b32 s4, -4
; GFX9-NEXT:    v_and_b32_e32 v2, s4, v1
; GFX9-NEXT:    v_mov_b32_e32 v1, 0
; GFX9-NEXT:    v_mov_b32_e32 v3, v1
; GFX9-NEXT:    v_and_b32_e32 v0, s4, v0
; GFX9-NEXT:    v_lshlrev_b64 v[0:1], 2, v[0:1]
; GFX9-NEXT:    v_lshlrev_b64 v[2:3], 2, v[2:3]
; GFX9-NEXT:    s_setpc_b64 s[30:31]
  ; VGPR vector-splat zext case: still two full 64-bit shifts per element
  ; (narrowing not applied to the vector form).
  %and = and <2 x i32> %x, <i32 1073741823, i32 1073741823>
  %ext = zext <2 x i32> %and to <2 x i64>
  %shl = shl <2 x i64> %ext, <i64 2, i64 2>
  ret <2 x i64> %shl
}
334
define amdgpu_ps <2 x i64> @s_shl_v2i64_sext_v2i32(<2 x i32> inreg %x) {
; GCN-LABEL: s_shl_v2i64_sext_v2i32:
; GCN:       ; %bb.0:
; GCN-NEXT:    s_brev_b32 s2, -8
; GCN-NEXT:    s_mov_b32 s3, s2
; GCN-NEXT:    s_and_b64 s[0:1], s[0:1], s[2:3]
; GCN-NEXT:    s_bfe_i64 s[2:3], s[0:1], 0x200000
; GCN-NEXT:    s_mov_b32 s0, s1
; GCN-NEXT:    s_bfe_i64 s[4:5], s[0:1], 0x200000
; GCN-NEXT:    s_lshl_b64 s[0:1], s[2:3], 2
; GCN-NEXT:    s_lshl_b64 s[2:3], s[4:5], 2
; GCN-NEXT:    ; return to shader part epilog
  ; Vector-splat sext case: s_brev_b32 of -8 materializes 0x1fffffff; each
  ; lane is sign-extended (s_bfe_i64) and shifted with a full 64-bit shift.
  %and = and <2 x i32> %x, <i32 536870911, i32 536870911>
  %ext = sext <2 x i32> %and to <2 x i64>
  %shl = shl <2 x i64> %ext, <i64 2, i64 2>
  ret <2 x i64> %shl
}
352
define <2 x i64> @v_shl_v2i64_sext_v2i32(<2 x i32> %x) {
; GFX7-LABEL: v_shl_v2i64_sext_v2i32:
; GFX7:       ; %bb.0:
; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT:    s_brev_b32 s4, -8
; GFX7-NEXT:    v_and_b32_e32 v2, s4, v1
; GFX7-NEXT:    v_and_b32_e32 v0, s4, v0
; GFX7-NEXT:    v_ashrrev_i32_e32 v1, 31, v0
; GFX7-NEXT:    v_ashrrev_i32_e32 v3, 31, v2
; GFX7-NEXT:    v_lshl_b64 v[0:1], v[0:1], 2
; GFX7-NEXT:    v_lshl_b64 v[2:3], v[2:3], 2
; GFX7-NEXT:    s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_shl_v2i64_sext_v2i32:
; GFX8:       ; %bb.0:
; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT:    s_brev_b32 s4, -8
; GFX8-NEXT:    v_and_b32_e32 v2, s4, v1
; GFX8-NEXT:    v_and_b32_e32 v0, s4, v0
; GFX8-NEXT:    v_ashrrev_i32_e32 v1, 31, v0
; GFX8-NEXT:    v_ashrrev_i32_e32 v3, 31, v2
; GFX8-NEXT:    v_lshlrev_b64 v[0:1], 2, v[0:1]
; GFX8-NEXT:    v_lshlrev_b64 v[2:3], 2, v[2:3]
; GFX8-NEXT:    s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_shl_v2i64_sext_v2i32:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    s_brev_b32 s4, -8
; GFX9-NEXT:    v_and_b32_e32 v2, s4, v1
; GFX9-NEXT:    v_and_b32_e32 v0, s4, v0
; GFX9-NEXT:    v_ashrrev_i32_e32 v1, 31, v0
; GFX9-NEXT:    v_ashrrev_i32_e32 v3, 31, v2
; GFX9-NEXT:    v_lshlrev_b64 v[0:1], 2, v[0:1]
; GFX9-NEXT:    v_lshlrev_b64 v[2:3], 2, v[2:3]
; GFX9-NEXT:    s_setpc_b64 s[30:31]
  ; VGPR vector-splat sext case: per-lane sign extend via v_ashrrev_i32 and
  ; full 64-bit shifts (vector form not narrowed).
  %and = and <2 x i32> %x, <i32 536870911, i32 536870911>
  %ext = sext <2 x i32> %and to <2 x i64>
  %shl = shl <2 x i64> %ext, <i64 2, i64 2>
  ret <2 x i64> %shl
}
394
define amdgpu_ps i32 @s_shl_i32_zext_i16(i16 inreg %x) {
; GFX7-LABEL: s_shl_i32_zext_i16:
; GFX7:       ; %bb.0:
; GFX7-NEXT:    s_and_b32 s0, s0, 0x3fff
; GFX7-NEXT:    s_lshl_b32 s0, s0, 2
; GFX7-NEXT:    s_and_b32 s0, s0, 0xffff
; GFX7-NEXT:    ; return to shader part epilog
;
; GFX8-LABEL: s_shl_i32_zext_i16:
; GFX8:       ; %bb.0:
; GFX8-NEXT:    s_and_b32 s0, s0, 0xffff
; GFX8-NEXT:    s_and_b32 s0, s0, 0x3fff
; GFX8-NEXT:    s_bfe_u32 s1, 2, 0x100000
; GFX8-NEXT:    s_lshl_b32 s0, s0, s1
; GFX8-NEXT:    s_bfe_u32 s0, s0, 0x100000
; GFX8-NEXT:    ; return to shader part epilog
;
; GFX9-LABEL: s_shl_i32_zext_i16:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_and_b32 s0, s0, 0xffff
; GFX9-NEXT:    s_and_b32 s0, s0, 0x3fff
; GFX9-NEXT:    s_bfe_u32 s1, 2, 0x100000
; GFX9-NEXT:    s_lshl_b32 s0, s0, s1
; GFX9-NEXT:    s_bfe_u32 s0, s0, 0x100000
; GFX9-NEXT:    ; return to shader part epilog
  ; i16 -> i32 version: the 14-bit mask makes the i16 shift overflow-free.
  ; Subtargets with legal 16-bit ops (GFX8/9) lower it differently from GFX7.
  %and = and i16 %x, 16383
  %ext = zext i16 %and to i32
  %shl = shl i32 %ext, 2
  ret i32 %shl
}
425
define i32 @v_shl_i32_zext_i16(i16 %x) {
; GFX7-LABEL: v_shl_i32_zext_i16:
; GFX7:       ; %bb.0:
; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT:    v_and_b32_e32 v0, 0x3fff, v0
; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GFX7-NEXT:    v_and_b32_e32 v0, 0xffff, v0
; GFX7-NEXT:    s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_shl_i32_zext_i16:
; GFX8:       ; %bb.0:
; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT:    v_and_b32_e32 v0, 0x3fff, v0
; GFX8-NEXT:    v_lshlrev_b16_e32 v0, 2, v0
; GFX8-NEXT:    s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_shl_i32_zext_i16:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    v_and_b32_e32 v0, 0x3fff, v0
; GFX9-NEXT:    v_lshlrev_b16_e32 v0, 2, v0
; GFX9-NEXT:    s_setpc_b64 s[30:31]
  ; VGPR i16 case: GFX8/9 use the native 16-bit shift (v_lshlrev_b16);
  ; GFX7 has no 16-bit VALU ops and masks the result back to 16 bits.
  %and = and i16 %x, 16383
  %ext = zext i16 %and to i32
  %shl = shl i32 %ext, 2
  ret i32 %shl
}
453
define amdgpu_ps <2 x i32> @s_shl_v2i32_zext_v2i16(<2 x i16> inreg %x) {
; GFX7-LABEL: s_shl_v2i32_zext_v2i16:
; GFX7:       ; %bb.0:
; GFX7-NEXT:    s_mov_b32 s2, 0xffff
; GFX7-NEXT:    s_lshl_b32 s1, s1, 16
; GFX7-NEXT:    s_and_b32 s0, s0, s2
; GFX7-NEXT:    s_or_b32 s0, s1, s0
; GFX7-NEXT:    s_and_b32 s0, s0, 0x3fff3fff
; GFX7-NEXT:    s_lshr_b32 s1, s0, 16
; GFX7-NEXT:    s_and_b32 s0, s0, s2
; GFX7-NEXT:    s_lshl_b32 s0, s0, 2
; GFX7-NEXT:    s_lshl_b32 s1, s1, 2
; GFX7-NEXT:    ; return to shader part epilog
;
; GFX8-LABEL: s_shl_v2i32_zext_v2i16:
; GFX8:       ; %bb.0:
; GFX8-NEXT:    s_movk_i32 s2, 0x3fff
; GFX8-NEXT:    s_mov_b32 s4, 0xffff
; GFX8-NEXT:    s_lshr_b32 s1, s0, 16
; GFX8-NEXT:    s_mov_b32 s3, s2
; GFX8-NEXT:    s_and_b32 s0, s0, s4
; GFX8-NEXT:    s_and_b64 s[0:1], s[0:1], s[2:3]
; GFX8-NEXT:    s_mov_b32 s5, s4
; GFX8-NEXT:    s_and_b64 s[0:1], s[0:1], s[4:5]
; GFX8-NEXT:    s_lshl_b32 s0, s0, 2
; GFX8-NEXT:    s_lshl_b32 s1, s1, 2
; GFX8-NEXT:    ; return to shader part epilog
;
; GFX9-LABEL: s_shl_v2i32_zext_v2i16:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_and_b32 s0, s0, 0x3fff3fff
; GFX9-NEXT:    s_lshr_b32 s1, s0, 16
; GFX9-NEXT:    s_and_b32 s0, s0, 0xffff
; GFX9-NEXT:    s_lshl_b32 s0, s0, 2
; GFX9-NEXT:    s_lshl_b32 s1, s1, 2
; GFX9-NEXT:    ; return to shader part epilog
  ; SGPR <2 x i16> case: the splat mask 0x3fff3fff is applied to the packed
  ; halves, then each lane is unpacked and shifted as a 32-bit value. The
  ; packing/unpacking differs markedly per subtarget.
  %and = and <2 x i16> %x, <i16 16383, i16 16383>
  %ext = zext <2 x i16> %and to <2 x i32>
  %shl = shl <2 x i32> %ext, <i32 2, i32 2>
  ret <2 x i32> %shl
}
495
; FIXME: This doesn't do what we want. The pre-legalizer combiner
; fails to handle the vector splat. The post-legalizer sees the zext
; legalized into the and. This is probably not that important, since
; we really do this combine in the machine level for lowered
; getelementptrs.
define <2 x i32> @v_shl_v2i32_zext_v2i16(<2 x i16> %x) {
; GFX7-LABEL: v_shl_v2i32_zext_v2i16:
; GFX7:       ; %bb.0:
; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT:    v_mov_b32_e32 v2, 0xffff
; GFX7-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
; GFX7-NEXT:    v_and_b32_e32 v0, v0, v2
; GFX7-NEXT:    v_or_b32_e32 v0, v1, v0
; GFX7-NEXT:    v_and_b32_e32 v0, 0x3fff3fff, v0
; GFX7-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
; GFX7-NEXT:    v_and_b32_e32 v0, v0, v2
; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GFX7-NEXT:    v_lshlrev_b32_e32 v1, 2, v1
; GFX7-NEXT:    s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_shl_v2i32_zext_v2i16:
; GFX8:       ; %bb.0:
; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT:    v_and_b32_e32 v1, 0x3fff3fff, v0
; GFX8-NEXT:    v_mov_b32_e32 v2, 2
; GFX8-NEXT:    v_lshlrev_b32_sdwa v0, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
; GFX8-NEXT:    v_lshlrev_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
; GFX8-NEXT:    s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_shl_v2i32_zext_v2i16:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    s_mov_b32 s4, 2
; GFX9-NEXT:    v_and_b32_e32 v1, 0x3fff3fff, v0
; GFX9-NEXT:    v_lshlrev_b32_sdwa v0, s4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
; GFX9-NEXT:    v_lshlrev_b32_sdwa v1, s4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
; GFX9-NEXT:    s_setpc_b64 s[30:31]
  ; VGPR <2 x i16> case; see FIXME above — GFX8/9 use SDWA word selects to
  ; shift each packed half directly.
  %and = and <2 x i16> %x, <i16 16383, i16 16383>
  %ext = zext <2 x i16> %and to <2 x i32>
  %shl = shl <2 x i32> %ext, <i32 2, i32 2>
  ret <2 x i32> %shl
}
538
; Intrinsic used by the kernel tests to obtain the per-lane workitem id.
declare i32 @llvm.amdgcn.workitem.id.x() #0

attributes #0 = { nounwind readnone speculatable willreturn }
542