1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2; RUN: llc < %s -march=amdgcn -verify-machineinstrs | FileCheck %s -enable-var-scope -check-prefixes=FUNC,GCN,SI
3; RUN: llc < %s -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs | FileCheck %s -enable-var-scope -check-prefixes=FUNC,GCN,VI
4; RUN: llc < %s -march=r600 -mcpu=cypress -verify-machineinstrs | FileCheck %s -enable-var-scope -check-prefixes=FUNC,EG
5
6declare i7 @llvm.ctlz.i7(i7, i1) nounwind readnone
7declare i8 @llvm.ctlz.i8(i8, i1) nounwind readnone
8declare i16 @llvm.ctlz.i16(i16, i1) nounwind readnone
9
10declare i32 @llvm.ctlz.i32(i32, i1) nounwind readnone
11declare <2 x i32> @llvm.ctlz.v2i32(<2 x i32>, i1) nounwind readnone
12declare <4 x i32> @llvm.ctlz.v4i32(<4 x i32>, i1) nounwind readnone
13
14declare i64 @llvm.ctlz.i64(i64, i1) nounwind readnone
15declare <2 x i64> @llvm.ctlz.v2i64(<2 x i64>, i1) nounwind readnone
16declare <4 x i64> @llvm.ctlz.v4i64(<4 x i64>, i1) nounwind readnone
17
18declare i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
19
20define amdgpu_kernel void @s_ctlz_i32(i32 addrspace(1)* noalias %out, i32 %val) nounwind {
21; SI-LABEL: s_ctlz_i32:
22; SI:       ; %bb.0:
23; SI-NEXT:    s_load_dword s2, s[0:1], 0xb
24; SI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
25; SI-NEXT:    s_mov_b32 s7, 0xf000
26; SI-NEXT:    s_waitcnt lgkmcnt(0)
27; SI-NEXT:    s_flbit_i32_b32 s0, s2
28; SI-NEXT:    s_mov_b32 s6, -1
29; SI-NEXT:    v_mov_b32_e32 v0, s0
30; SI-NEXT:    v_cmp_ne_u32_e64 vcc, s2, 0
31; SI-NEXT:    v_cndmask_b32_e32 v0, 32, v0, vcc
32; SI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
33; SI-NEXT:    s_endpgm
34;
35; VI-LABEL: s_ctlz_i32:
36; VI:       ; %bb.0:
37; VI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x24
38; VI-NEXT:    s_load_dword s0, s[0:1], 0x2c
39; VI-NEXT:    s_mov_b32 s7, 0xf000
40; VI-NEXT:    s_mov_b32 s6, -1
41; VI-NEXT:    s_waitcnt lgkmcnt(0)
42; VI-NEXT:    s_flbit_i32_b32 s1, s0
43; VI-NEXT:    s_cmp_lg_u32 s0, 0
44; VI-NEXT:    s_cselect_b32 s0, s1, 32
45; VI-NEXT:    v_mov_b32_e32 v0, s0
46; VI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
47; VI-NEXT:    s_endpgm
48;
49; EG-LABEL: s_ctlz_i32:
50; EG:       ; %bb.0:
51; EG-NEXT:    ALU 3, @4, KC0[CB0:0-32], KC1[]
52; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1
53; EG-NEXT:    CF_END
54; EG-NEXT:    PAD
55; EG-NEXT:    ALU clause starting at 4:
56; EG-NEXT:     FFBH_UINT * T0.W, KC0[2].Z,
57; EG-NEXT:     CNDE_INT T0.X, KC0[2].Z, literal.x, PV.W,
58; EG-NEXT:     LSHR * T1.X, KC0[2].Y, literal.y,
59; EG-NEXT:    32(4.484155e-44), 2(2.802597e-45)
60  %ctlz = call i32 @llvm.ctlz.i32(i32 %val, i1 false) nounwind readnone
61  store i32 %ctlz, i32 addrspace(1)* %out, align 4
62  ret void
63}
64
65define amdgpu_kernel void @v_ctlz_i32(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %valptr) nounwind {
66; SI-LABEL: v_ctlz_i32:
67; SI:       ; %bb.0:
68; SI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0xb
69; SI-NEXT:    s_mov_b32 s3, 0xf000
70; SI-NEXT:    s_mov_b32 s6, 0
71; SI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
72; SI-NEXT:    v_mov_b32_e32 v1, 0
73; SI-NEXT:    s_mov_b32 s7, s3
74; SI-NEXT:    s_waitcnt lgkmcnt(0)
75; SI-NEXT:    buffer_load_dword v0, v[0:1], s[4:7], 0 addr64
76; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
77; SI-NEXT:    s_mov_b32 s2, -1
78; SI-NEXT:    s_waitcnt vmcnt(0)
79; SI-NEXT:    v_ffbh_u32_e32 v1, v0
80; SI-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v0
81; SI-NEXT:    v_cndmask_b32_e32 v0, 32, v1, vcc
82; SI-NEXT:    s_waitcnt lgkmcnt(0)
83; SI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
84; SI-NEXT:    s_endpgm
85;
86; VI-LABEL: v_ctlz_i32:
87; VI:       ; %bb.0:
88; VI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x24
89; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x2c
90; VI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
91; VI-NEXT:    s_mov_b32 s7, 0xf000
92; VI-NEXT:    s_mov_b32 s6, -1
93; VI-NEXT:    s_waitcnt lgkmcnt(0)
94; VI-NEXT:    v_mov_b32_e32 v1, s1
95; VI-NEXT:    v_add_u32_e32 v0, vcc, s0, v0
96; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
97; VI-NEXT:    flat_load_dword v0, v[0:1]
98; VI-NEXT:    s_waitcnt vmcnt(0)
99; VI-NEXT:    v_ffbh_u32_e32 v1, v0
100; VI-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v0
101; VI-NEXT:    v_cndmask_b32_e32 v0, 32, v1, vcc
102; VI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
103; VI-NEXT:    s_endpgm
104;
105; EG-LABEL: v_ctlz_i32:
106; EG:       ; %bb.0:
107; EG-NEXT:    ALU 2, @8, KC0[CB0:0-32], KC1[]
108; EG-NEXT:    TEX 0 @6
109; EG-NEXT:    ALU 3, @11, KC0[CB0:0-32], KC1[]
110; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1
111; EG-NEXT:    CF_END
112; EG-NEXT:    PAD
113; EG-NEXT:    Fetch clause starting at 6:
114; EG-NEXT:     VTX_READ_32 T0.X, T0.X, 0, #1
115; EG-NEXT:    ALU clause starting at 8:
116; EG-NEXT:     LSHL * T0.W, T0.X, literal.x,
117; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
118; EG-NEXT:     ADD_INT * T0.X, KC0[2].Z, PV.W,
119; EG-NEXT:    ALU clause starting at 11:
120; EG-NEXT:     FFBH_UINT * T0.W, T0.X,
121; EG-NEXT:     CNDE_INT T0.X, T0.X, literal.x, PV.W,
122; EG-NEXT:     LSHR * T1.X, KC0[2].Y, literal.y,
123; EG-NEXT:    32(4.484155e-44), 2(2.802597e-45)
124  %tid = call i32 @llvm.amdgcn.workitem.id.x()
125  %in.gep = getelementptr i32, i32 addrspace(1)* %valptr, i32 %tid
126  %val = load i32, i32 addrspace(1)* %in.gep, align 4
127  %ctlz = call i32 @llvm.ctlz.i32(i32 %val, i1 false) nounwind readnone
128  store i32 %ctlz, i32 addrspace(1)* %out, align 4
129  ret void
130}
131
132define amdgpu_kernel void @v_ctlz_v2i32(<2 x i32> addrspace(1)* noalias %out, <2 x i32> addrspace(1)* noalias %valptr) nounwind {
133; SI-LABEL: v_ctlz_v2i32:
134; SI:       ; %bb.0:
135; SI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0xb
136; SI-NEXT:    s_mov_b32 s3, 0xf000
137; SI-NEXT:    s_mov_b32 s6, 0
138; SI-NEXT:    v_lshlrev_b32_e32 v0, 3, v0
139; SI-NEXT:    v_mov_b32_e32 v1, 0
140; SI-NEXT:    s_mov_b32 s7, s3
141; SI-NEXT:    s_waitcnt lgkmcnt(0)
142; SI-NEXT:    buffer_load_dwordx2 v[0:1], v[0:1], s[4:7], 0 addr64
143; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
144; SI-NEXT:    s_mov_b32 s2, -1
145; SI-NEXT:    s_waitcnt vmcnt(0)
146; SI-NEXT:    v_ffbh_u32_e32 v2, v1
147; SI-NEXT:    v_ffbh_u32_e32 v3, v0
148; SI-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v1
149; SI-NEXT:    v_cndmask_b32_e32 v1, 32, v2, vcc
150; SI-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v0
151; SI-NEXT:    v_cndmask_b32_e32 v0, 32, v3, vcc
152; SI-NEXT:    s_waitcnt lgkmcnt(0)
153; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
154; SI-NEXT:    s_endpgm
155;
156; VI-LABEL: v_ctlz_v2i32:
157; VI:       ; %bb.0:
158; VI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x24
159; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x2c
160; VI-NEXT:    v_lshlrev_b32_e32 v0, 3, v0
161; VI-NEXT:    s_mov_b32 s7, 0xf000
162; VI-NEXT:    s_mov_b32 s6, -1
163; VI-NEXT:    s_waitcnt lgkmcnt(0)
164; VI-NEXT:    v_mov_b32_e32 v1, s1
165; VI-NEXT:    v_add_u32_e32 v0, vcc, s0, v0
166; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
167; VI-NEXT:    flat_load_dwordx2 v[0:1], v[0:1]
168; VI-NEXT:    s_waitcnt vmcnt(0)
169; VI-NEXT:    v_ffbh_u32_e32 v2, v1
170; VI-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v1
171; VI-NEXT:    v_cndmask_b32_e32 v1, 32, v2, vcc
172; VI-NEXT:    v_ffbh_u32_e32 v3, v0
173; VI-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v0
174; VI-NEXT:    v_cndmask_b32_e32 v0, 32, v3, vcc
175; VI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
176; VI-NEXT:    s_endpgm
177;
178; EG-LABEL: v_ctlz_v2i32:
179; EG:       ; %bb.0:
180; EG-NEXT:    ALU 2, @8, KC0[CB0:0-32], KC1[]
181; EG-NEXT:    TEX 0 @6
182; EG-NEXT:    ALU 6, @11, KC0[CB0:0-32], KC1[]
183; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1
184; EG-NEXT:    CF_END
185; EG-NEXT:    PAD
186; EG-NEXT:    Fetch clause starting at 6:
187; EG-NEXT:     VTX_READ_64 T0.XY, T0.X, 0, #1
188; EG-NEXT:    ALU clause starting at 8:
189; EG-NEXT:     LSHL * T0.W, T0.X, literal.x,
190; EG-NEXT:    3(4.203895e-45), 0(0.000000e+00)
191; EG-NEXT:     ADD_INT * T0.X, KC0[2].Z, PV.W,
192; EG-NEXT:    ALU clause starting at 11:
193; EG-NEXT:     FFBH_UINT * T0.W, T0.Y,
194; EG-NEXT:     CNDE_INT T0.Y, T0.Y, literal.x, PV.W,
195; EG-NEXT:     FFBH_UINT * T0.W, T0.X,
196; EG-NEXT:    32(4.484155e-44), 0(0.000000e+00)
197; EG-NEXT:     CNDE_INT T0.X, T0.X, literal.x, PV.W,
198; EG-NEXT:     LSHR * T1.X, KC0[2].Y, literal.y,
199; EG-NEXT:    32(4.484155e-44), 2(2.802597e-45)
200  %tid = call i32 @llvm.amdgcn.workitem.id.x()
201  %in.gep = getelementptr <2 x i32>, <2 x i32> addrspace(1)* %valptr, i32 %tid
202  %val = load <2 x i32>, <2 x i32> addrspace(1)* %in.gep, align 8
203  %ctlz = call <2 x i32> @llvm.ctlz.v2i32(<2 x i32> %val, i1 false) nounwind readnone
204  store <2 x i32> %ctlz, <2 x i32> addrspace(1)* %out, align 8
205  ret void
206}
207
208define amdgpu_kernel void @v_ctlz_v4i32(<4 x i32> addrspace(1)* noalias %out, <4 x i32> addrspace(1)* noalias %valptr) nounwind {
209; SI-LABEL: v_ctlz_v4i32:
210; SI:       ; %bb.0:
211; SI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0xb
212; SI-NEXT:    s_mov_b32 s3, 0xf000
213; SI-NEXT:    s_mov_b32 s6, 0
214; SI-NEXT:    v_lshlrev_b32_e32 v0, 4, v0
215; SI-NEXT:    v_mov_b32_e32 v1, 0
216; SI-NEXT:    s_mov_b32 s7, s3
217; SI-NEXT:    s_waitcnt lgkmcnt(0)
218; SI-NEXT:    buffer_load_dwordx4 v[0:3], v[0:1], s[4:7], 0 addr64
219; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
220; SI-NEXT:    s_mov_b32 s2, -1
221; SI-NEXT:    s_waitcnt vmcnt(0)
222; SI-NEXT:    v_ffbh_u32_e32 v4, v3
223; SI-NEXT:    v_ffbh_u32_e32 v5, v2
224; SI-NEXT:    v_ffbh_u32_e32 v6, v1
225; SI-NEXT:    v_ffbh_u32_e32 v7, v0
226; SI-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v3
227; SI-NEXT:    v_cndmask_b32_e32 v3, 32, v4, vcc
228; SI-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v2
229; SI-NEXT:    v_cndmask_b32_e32 v2, 32, v5, vcc
230; SI-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v1
231; SI-NEXT:    v_cndmask_b32_e32 v1, 32, v6, vcc
232; SI-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v0
233; SI-NEXT:    v_cndmask_b32_e32 v0, 32, v7, vcc
234; SI-NEXT:    s_waitcnt lgkmcnt(0)
235; SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0
236; SI-NEXT:    s_endpgm
237;
238; VI-LABEL: v_ctlz_v4i32:
239; VI:       ; %bb.0:
240; VI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x24
241; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x2c
242; VI-NEXT:    v_lshlrev_b32_e32 v0, 4, v0
243; VI-NEXT:    s_mov_b32 s7, 0xf000
244; VI-NEXT:    s_mov_b32 s6, -1
245; VI-NEXT:    s_waitcnt lgkmcnt(0)
246; VI-NEXT:    v_mov_b32_e32 v1, s1
247; VI-NEXT:    v_add_u32_e32 v0, vcc, s0, v0
248; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
249; VI-NEXT:    flat_load_dwordx4 v[0:3], v[0:1]
250; VI-NEXT:    s_waitcnt vmcnt(0)
251; VI-NEXT:    v_ffbh_u32_e32 v4, v3
252; VI-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v3
253; VI-NEXT:    v_cndmask_b32_e32 v3, 32, v4, vcc
254; VI-NEXT:    v_ffbh_u32_e32 v5, v2
255; VI-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v2
256; VI-NEXT:    v_cndmask_b32_e32 v2, 32, v5, vcc
257; VI-NEXT:    v_ffbh_u32_e32 v6, v1
258; VI-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v1
259; VI-NEXT:    v_cndmask_b32_e32 v1, 32, v6, vcc
260; VI-NEXT:    v_ffbh_u32_e32 v7, v0
261; VI-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v0
262; VI-NEXT:    v_cndmask_b32_e32 v0, 32, v7, vcc
263; VI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0
264; VI-NEXT:    s_endpgm
265;
266; EG-LABEL: v_ctlz_v4i32:
267; EG:       ; %bb.0:
268; EG-NEXT:    ALU 2, @8, KC0[CB0:0-32], KC1[]
269; EG-NEXT:    TEX 0 @6
270; EG-NEXT:    ALU 12, @11, KC0[CB0:0-32], KC1[]
271; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.XYZW, T1.X, 1
272; EG-NEXT:    CF_END
273; EG-NEXT:    PAD
274; EG-NEXT:    Fetch clause starting at 6:
275; EG-NEXT:     VTX_READ_128 T0.XYZW, T0.X, 0, #1
276; EG-NEXT:    ALU clause starting at 8:
277; EG-NEXT:     LSHL * T0.W, T0.X, literal.x,
278; EG-NEXT:    4(5.605194e-45), 0(0.000000e+00)
279; EG-NEXT:     ADD_INT * T0.X, KC0[2].Z, PV.W,
280; EG-NEXT:    ALU clause starting at 11:
281; EG-NEXT:     FFBH_UINT * T1.W, T0.W,
282; EG-NEXT:     FFBH_UINT T2.W, T0.Z,
283; EG-NEXT:     CNDE_INT * T0.W, T0.W, literal.x, PV.W, BS:VEC_021/SCL_122
284; EG-NEXT:    32(4.484155e-44), 0(0.000000e+00)
285; EG-NEXT:     CNDE_INT T0.Z, T0.Z, literal.x, PV.W,
286; EG-NEXT:     FFBH_UINT * T1.W, T0.Y,
287; EG-NEXT:    32(4.484155e-44), 0(0.000000e+00)
288; EG-NEXT:     CNDE_INT T0.Y, T0.Y, literal.x, PV.W,
289; EG-NEXT:     FFBH_UINT * T1.W, T0.X,
290; EG-NEXT:    32(4.484155e-44), 0(0.000000e+00)
291; EG-NEXT:     CNDE_INT T0.X, T0.X, literal.x, PV.W,
292; EG-NEXT:     LSHR * T1.X, KC0[2].Y, literal.y,
293; EG-NEXT:    32(4.484155e-44), 2(2.802597e-45)
294  %tid = call i32 @llvm.amdgcn.workitem.id.x()
295  %in.gep = getelementptr <4 x i32>, <4 x i32> addrspace(1)* %valptr, i32 %tid
296  %val = load <4 x i32>, <4 x i32> addrspace(1)* %in.gep, align 16
297  %ctlz = call <4 x i32> @llvm.ctlz.v4i32(<4 x i32> %val, i1 false) nounwind readnone
298  store <4 x i32> %ctlz, <4 x i32> addrspace(1)* %out, align 16
299  ret void
300}
301
302define amdgpu_kernel void @v_ctlz_i8(i8 addrspace(1)* noalias %out, i8 addrspace(1)* noalias %valptr) nounwind {
303; SI-LABEL: v_ctlz_i8:
304; SI:       ; %bb.0:
305; SI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0xb
306; SI-NEXT:    s_mov_b32 s3, 0xf000
307; SI-NEXT:    s_mov_b32 s2, -1
308; SI-NEXT:    s_mov_b32 s6, s2
309; SI-NEXT:    s_mov_b32 s7, s3
310; SI-NEXT:    s_waitcnt lgkmcnt(0)
311; SI-NEXT:    buffer_load_ubyte v0, off, s[4:7], 0
312; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
313; SI-NEXT:    s_waitcnt vmcnt(0)
314; SI-NEXT:    v_ffbh_u32_e32 v1, v0
315; SI-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v0
316; SI-NEXT:    v_cndmask_b32_e32 v0, 32, v1, vcc
317; SI-NEXT:    v_subrev_i32_e32 v0, vcc, 24, v0
318; SI-NEXT:    s_waitcnt lgkmcnt(0)
319; SI-NEXT:    buffer_store_byte v0, off, s[0:3], 0
320; SI-NEXT:    s_endpgm
321;
322; VI-LABEL: v_ctlz_i8:
323; VI:       ; %bb.0:
324; VI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x24
325; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x2c
326; VI-NEXT:    s_mov_b32 s7, 0xf000
327; VI-NEXT:    s_mov_b32 s6, -1
328; VI-NEXT:    s_mov_b32 s2, s6
329; VI-NEXT:    s_mov_b32 s3, s7
330; VI-NEXT:    s_waitcnt lgkmcnt(0)
331; VI-NEXT:    buffer_load_ubyte v0, off, s[0:3], 0
332; VI-NEXT:    s_waitcnt vmcnt(0)
333; VI-NEXT:    v_ffbh_u32_sdwa v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0
334; VI-NEXT:    v_cmp_ne_u16_e32 vcc, 0, v0
335; VI-NEXT:    v_cndmask_b32_e32 v0, 32, v1, vcc
336; VI-NEXT:    v_add_u32_e32 v0, vcc, -16, v0
337; VI-NEXT:    v_add_u16_e32 v0, -8, v0
338; VI-NEXT:    buffer_store_byte v0, off, s[4:7], 0
339; VI-NEXT:    s_endpgm
340;
341; EG-LABEL: v_ctlz_i8:
342; EG:       ; %bb.0:
343; EG-NEXT:    ALU 0, @8, KC0[CB0:0-32], KC1[]
344; EG-NEXT:    TEX 0 @6
345; EG-NEXT:    ALU 15, @9, KC0[CB0:0-32], KC1[]
346; EG-NEXT:    MEM_RAT MSKOR T0.XW, T1.X
347; EG-NEXT:    CF_END
348; EG-NEXT:    PAD
349; EG-NEXT:    Fetch clause starting at 6:
350; EG-NEXT:     VTX_READ_8 T0.X, T0.X, 0, #1
351; EG-NEXT:    ALU clause starting at 8:
352; EG-NEXT:     MOV * T0.X, KC0[2].Z,
353; EG-NEXT:    ALU clause starting at 9:
354; EG-NEXT:     FFBH_UINT * T0.W, T0.X,
355; EG-NEXT:     CNDE_INT T0.W, T0.X, literal.x, PV.W,
356; EG-NEXT:     AND_INT * T1.W, KC0[2].Y, literal.y,
357; EG-NEXT:    32(4.484155e-44), 3(4.203895e-45)
358; EG-NEXT:     ADD_INT * T0.W, PV.W, literal.x,
359; EG-NEXT:    -24(nan), 0(0.000000e+00)
360; EG-NEXT:     AND_INT T0.W, PV.W, literal.x,
361; EG-NEXT:     LSHL * T1.W, T1.W, literal.y,
362; EG-NEXT:    255(3.573311e-43), 3(4.203895e-45)
363; EG-NEXT:     LSHL T0.X, PV.W, PS,
364; EG-NEXT:     LSHL * T0.W, literal.x, PS,
365; EG-NEXT:    255(3.573311e-43), 0(0.000000e+00)
366; EG-NEXT:     MOV T0.Y, 0.0,
367; EG-NEXT:     MOV * T0.Z, 0.0,
368; EG-NEXT:     LSHR * T1.X, KC0[2].Y, literal.x,
369; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
370  %val = load i8, i8 addrspace(1)* %valptr
371  %ctlz = call i8 @llvm.ctlz.i8(i8 %val, i1 false) nounwind readnone
372  store i8 %ctlz, i8 addrspace(1)* %out
373  ret void
374}
375
376define amdgpu_kernel void @s_ctlz_i64(i64 addrspace(1)* noalias %out, [8 x i32], i64 %val) nounwind {
377; SI-LABEL: s_ctlz_i64:
378; SI:       ; %bb.0:
379; SI-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x13
380; SI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
381; SI-NEXT:    s_mov_b32 s7, 0xf000
382; SI-NEXT:    s_mov_b32 s6, -1
383; SI-NEXT:    s_waitcnt lgkmcnt(0)
384; SI-NEXT:    s_flbit_i32_b32 s0, s2
385; SI-NEXT:    s_flbit_i32_b32 s1, s3
386; SI-NEXT:    s_add_i32 s0, s0, 32
387; SI-NEXT:    s_or_b32 s2, s2, s3
388; SI-NEXT:    v_mov_b32_e32 v0, s1
389; SI-NEXT:    v_mov_b32_e32 v1, s0
390; SI-NEXT:    v_cmp_eq_u32_e64 vcc, s3, 0
391; SI-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
392; SI-NEXT:    v_cmp_ne_u32_e64 vcc, s2, 0
393; SI-NEXT:    v_cndmask_b32_e32 v0, 64, v0, vcc
394; SI-NEXT:    v_mov_b32_e32 v1, 0
395; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
396; SI-NEXT:    s_endpgm
397;
398; VI-LABEL: s_ctlz_i64:
399; VI:       ; %bb.0:
400; VI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x24
401; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x4c
402; VI-NEXT:    s_mov_b32 s7, 0xf000
403; VI-NEXT:    s_mov_b32 s6, -1
404; VI-NEXT:    v_mov_b32_e32 v1, 0
405; VI-NEXT:    s_waitcnt lgkmcnt(0)
406; VI-NEXT:    s_flbit_i32_b32 s2, s0
407; VI-NEXT:    s_add_i32 s2, s2, 32
408; VI-NEXT:    s_flbit_i32_b32 s3, s1
409; VI-NEXT:    s_cmp_eq_u32 s1, 0
410; VI-NEXT:    s_cselect_b32 s2, s2, s3
411; VI-NEXT:    s_or_b32 s0, s0, s1
412; VI-NEXT:    s_cmp_lg_u32 s0, 0
413; VI-NEXT:    s_cselect_b32 s0, s2, 64
414; VI-NEXT:    v_mov_b32_e32 v0, s0
415; VI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
416; VI-NEXT:    s_endpgm
417;
418; EG-LABEL: s_ctlz_i64:
419; EG:       ; %bb.0:
420; EG-NEXT:    ALU 9, @4, KC0[CB0:0-32], KC1[]
421; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1
422; EG-NEXT:    CF_END
423; EG-NEXT:    PAD
424; EG-NEXT:    ALU clause starting at 4:
425; EG-NEXT:     FFBH_UINT * T0.W, KC0[4].W,
426; EG-NEXT:     CNDE_INT * T0.W, KC0[4].W, literal.x, PV.W,
427; EG-NEXT:    32(4.484155e-44), 0(0.000000e+00)
428; EG-NEXT:     FFBH_UINT T1.W, KC0[5].X,
429; EG-NEXT:     ADD_INT * T0.W, PV.W, literal.x,
430; EG-NEXT:    32(4.484155e-44), 0(0.000000e+00)
431; EG-NEXT:     CNDE_INT T0.X, KC0[5].X, PS, PV.W,
432; EG-NEXT:     MOV T0.Y, 0.0,
433; EG-NEXT:     LSHR * T1.X, KC0[2].Y, literal.x,
434; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
435  %ctlz = call i64 @llvm.ctlz.i64(i64 %val, i1 false)
436  store i64 %ctlz, i64 addrspace(1)* %out
437  ret void
438}
439
440define amdgpu_kernel void @s_ctlz_i64_trunc(i32 addrspace(1)* noalias %out, i64 %val) nounwind {
441; SI-LABEL: s_ctlz_i64_trunc:
442; SI:       ; %bb.0:
443; SI-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0xb
444; SI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
445; SI-NEXT:    s_mov_b32 s7, 0xf000
446; SI-NEXT:    s_mov_b32 s6, -1
447; SI-NEXT:    s_waitcnt lgkmcnt(0)
448; SI-NEXT:    s_flbit_i32_b32 s0, s2
449; SI-NEXT:    s_flbit_i32_b32 s1, s3
450; SI-NEXT:    s_add_i32 s0, s0, 32
451; SI-NEXT:    s_or_b32 s2, s2, s3
452; SI-NEXT:    v_mov_b32_e32 v0, s1
453; SI-NEXT:    v_mov_b32_e32 v1, s0
454; SI-NEXT:    v_cmp_eq_u32_e64 vcc, s3, 0
455; SI-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
456; SI-NEXT:    v_cmp_ne_u32_e64 vcc, s2, 0
457; SI-NEXT:    v_cndmask_b32_e32 v0, 64, v0, vcc
458; SI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
459; SI-NEXT:    s_endpgm
460;
461; VI-LABEL: s_ctlz_i64_trunc:
462; VI:       ; %bb.0:
463; VI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x24
464; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x2c
465; VI-NEXT:    s_mov_b32 s7, 0xf000
466; VI-NEXT:    s_mov_b32 s6, -1
467; VI-NEXT:    s_waitcnt lgkmcnt(0)
468; VI-NEXT:    s_flbit_i32_b32 s2, s0
469; VI-NEXT:    s_add_i32 s2, s2, 32
470; VI-NEXT:    s_flbit_i32_b32 s3, s1
471; VI-NEXT:    s_cmp_eq_u32 s1, 0
472; VI-NEXT:    s_cselect_b32 s2, s2, s3
473; VI-NEXT:    s_or_b32 s0, s0, s1
474; VI-NEXT:    s_cmp_lg_u32 s0, 0
475; VI-NEXT:    s_cselect_b32 s0, s2, 64
476; VI-NEXT:    v_mov_b32_e32 v0, s0
477; VI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
478; VI-NEXT:    s_endpgm
479;
480; EG-LABEL: s_ctlz_i64_trunc:
481; EG:       ; %bb.0:
482; EG-NEXT:    ALU 8, @4, KC0[CB0:0-32], KC1[]
483; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1
484; EG-NEXT:    CF_END
485; EG-NEXT:    PAD
486; EG-NEXT:    ALU clause starting at 4:
487; EG-NEXT:     FFBH_UINT * T0.W, KC0[2].W,
488; EG-NEXT:     CNDE_INT * T0.W, KC0[2].W, literal.x, PV.W,
489; EG-NEXT:    32(4.484155e-44), 0(0.000000e+00)
490; EG-NEXT:     FFBH_UINT T1.W, KC0[3].X,
491; EG-NEXT:     ADD_INT * T0.W, PV.W, literal.x,
492; EG-NEXT:    32(4.484155e-44), 0(0.000000e+00)
493; EG-NEXT:     CNDE_INT T0.X, KC0[3].X, PS, PV.W,
494; EG-NEXT:     LSHR * T1.X, KC0[2].Y, literal.x,
495; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
496  %ctlz = call i64 @llvm.ctlz.i64(i64 %val, i1 false)
497  %trunc = trunc i64 %ctlz to i32
498  store i32 %trunc, i32 addrspace(1)* %out
499  ret void
500}
501
502define amdgpu_kernel void @v_ctlz_i64(i64 addrspace(1)* noalias %out, i64 addrspace(1)* noalias %in) nounwind {
503; SI-LABEL: v_ctlz_i64:
504; SI:       ; %bb.0:
505; SI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0xb
506; SI-NEXT:    s_mov_b32 s7, 0xf000
507; SI-NEXT:    s_mov_b32 s6, 0
508; SI-NEXT:    v_lshlrev_b32_e32 v0, 3, v0
509; SI-NEXT:    v_mov_b32_e32 v1, 0
510; SI-NEXT:    s_waitcnt lgkmcnt(0)
511; SI-NEXT:    buffer_load_dwordx2 v[2:3], v[0:1], s[4:7], 0 addr64
512; SI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
513; SI-NEXT:    s_waitcnt vmcnt(0)
514; SI-NEXT:    v_ffbh_u32_e32 v4, v2
515; SI-NEXT:    v_ffbh_u32_e32 v5, v3
516; SI-NEXT:    v_or_b32_e32 v2, v2, v3
517; SI-NEXT:    v_add_i32_e32 v4, vcc, 32, v4
518; SI-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v3
519; SI-NEXT:    v_cndmask_b32_e32 v3, v5, v4, vcc
520; SI-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v2
521; SI-NEXT:    v_cndmask_b32_e32 v2, 64, v3, vcc
522; SI-NEXT:    v_mov_b32_e32 v3, v1
523; SI-NEXT:    s_waitcnt lgkmcnt(0)
524; SI-NEXT:    buffer_store_dwordx2 v[2:3], v[0:1], s[4:7], 0 addr64
525; SI-NEXT:    s_endpgm
526;
527; VI-LABEL: v_ctlz_i64:
528; VI:       ; %bb.0:
529; VI-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
530; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x2c
531; VI-NEXT:    v_lshlrev_b32_e32 v3, 3, v0
532; VI-NEXT:    v_mov_b32_e32 v4, 0
533; VI-NEXT:    v_mov_b32_e32 v2, 0
534; VI-NEXT:    s_waitcnt lgkmcnt(0)
535; VI-NEXT:    v_mov_b32_e32 v5, s3
536; VI-NEXT:    v_mov_b32_e32 v1, s1
537; VI-NEXT:    v_add_u32_e32 v0, vcc, s0, v3
538; VI-NEXT:    v_addc_u32_e32 v1, vcc, v1, v4, vcc
539; VI-NEXT:    flat_load_dwordx2 v[0:1], v[0:1]
540; VI-NEXT:    v_add_u32_e32 v3, vcc, s2, v3
541; VI-NEXT:    v_addc_u32_e32 v4, vcc, v5, v4, vcc
542; VI-NEXT:    s_waitcnt vmcnt(0)
543; VI-NEXT:    v_ffbh_u32_e32 v5, v0
544; VI-NEXT:    v_add_u32_e32 v5, vcc, 32, v5
545; VI-NEXT:    v_ffbh_u32_e32 v6, v1
546; VI-NEXT:    v_or_b32_e32 v0, v0, v1
547; VI-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v1
548; VI-NEXT:    v_cndmask_b32_e32 v1, v6, v5, vcc
549; VI-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v0
550; VI-NEXT:    v_cndmask_b32_e32 v1, 64, v1, vcc
551; VI-NEXT:    flat_store_dwordx2 v[3:4], v[1:2]
552; VI-NEXT:    s_endpgm
553;
554; EG-LABEL: v_ctlz_i64:
555; EG:       ; %bb.0:
556; EG-NEXT:    ALU 2, @8, KC0[CB0:0-32], KC1[]
557; EG-NEXT:    TEX 0 @6
558; EG-NEXT:    ALU 10, @11, KC0[CB0:0-32], KC1[]
559; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1
560; EG-NEXT:    CF_END
561; EG-NEXT:    PAD
562; EG-NEXT:    Fetch clause starting at 6:
563; EG-NEXT:     VTX_READ_64 T0.XY, T0.X, 0, #1
564; EG-NEXT:    ALU clause starting at 8:
565; EG-NEXT:     LSHL * T0.W, T0.X, literal.x,
566; EG-NEXT:    3(4.203895e-45), 0(0.000000e+00)
567; EG-NEXT:     ADD_INT * T0.X, KC0[2].Z, PV.W,
568; EG-NEXT:    ALU clause starting at 11:
569; EG-NEXT:     FFBH_UINT * T1.W, T0.X,
570; EG-NEXT:     CNDE_INT * T1.W, T0.X, literal.x, PV.W,
571; EG-NEXT:    32(4.484155e-44), 0(0.000000e+00)
572; EG-NEXT:     FFBH_UINT T2.W, T0.Y,
573; EG-NEXT:     ADD_INT * T1.W, PV.W, literal.x,
574; EG-NEXT:    32(4.484155e-44), 0(0.000000e+00)
575; EG-NEXT:     CNDE_INT T0.X, T0.Y, PS, PV.W,
576; EG-NEXT:     MOV T0.Y, 0.0,
577; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, T0.W,
578; EG-NEXT:     LSHR * T1.X, PV.W, literal.x,
579; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
580  %tid = call i32 @llvm.amdgcn.workitem.id.x()
581  %in.gep = getelementptr i64, i64 addrspace(1)* %in, i32 %tid
582  %out.gep = getelementptr i64, i64 addrspace(1)* %out, i32 %tid
583  %val = load i64, i64 addrspace(1)* %in.gep
584  %ctlz = call i64 @llvm.ctlz.i64(i64 %val, i1 false)
585  store i64 %ctlz, i64 addrspace(1)* %out.gep
586  ret void
587}
588
589define amdgpu_kernel void @v_ctlz_i64_trunc(i32 addrspace(1)* noalias %out, i64 addrspace(1)* noalias %in) nounwind {
590; SI-LABEL: v_ctlz_i64_trunc:
591; SI:       ; %bb.0:
592; SI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0xb
593; SI-NEXT:    s_mov_b32 s7, 0xf000
594; SI-NEXT:    s_mov_b32 s6, 0
595; SI-NEXT:    v_lshlrev_b32_e32 v1, 3, v0
596; SI-NEXT:    v_mov_b32_e32 v2, 0
597; SI-NEXT:    s_waitcnt lgkmcnt(0)
598; SI-NEXT:    buffer_load_dwordx2 v[3:4], v[1:2], s[4:7], 0 addr64
599; SI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
600; SI-NEXT:    v_lshlrev_b32_e32 v1, 2, v0
601; SI-NEXT:    s_waitcnt vmcnt(0)
602; SI-NEXT:    v_ffbh_u32_e32 v0, v3
603; SI-NEXT:    v_ffbh_u32_e32 v5, v4
604; SI-NEXT:    v_or_b32_e32 v3, v3, v4
605; SI-NEXT:    v_add_i32_e32 v0, vcc, 32, v0
606; SI-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v4
607; SI-NEXT:    v_cndmask_b32_e32 v0, v5, v0, vcc
608; SI-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v3
609; SI-NEXT:    v_cndmask_b32_e32 v0, 64, v0, vcc
610; SI-NEXT:    s_waitcnt lgkmcnt(0)
611; SI-NEXT:    buffer_store_dword v0, v[1:2], s[4:7], 0 addr64
612; SI-NEXT:    s_endpgm
613;
614; VI-LABEL: v_ctlz_i64_trunc:
615; VI:       ; %bb.0:
616; VI-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
617; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x2c
618; VI-NEXT:    v_lshlrev_b32_e32 v1, 3, v0
619; VI-NEXT:    v_mov_b32_e32 v4, 0
620; VI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
621; VI-NEXT:    s_waitcnt lgkmcnt(0)
622; VI-NEXT:    v_mov_b32_e32 v5, s3
623; VI-NEXT:    v_mov_b32_e32 v2, s1
624; VI-NEXT:    v_add_u32_e32 v1, vcc, s0, v1
625; VI-NEXT:    v_addc_u32_e32 v2, vcc, v2, v4, vcc
626; VI-NEXT:    flat_load_dwordx2 v[1:2], v[1:2]
627; VI-NEXT:    v_add_u32_e32 v3, vcc, s2, v0
628; VI-NEXT:    v_addc_u32_e32 v4, vcc, v5, v4, vcc
629; VI-NEXT:    s_waitcnt vmcnt(0)
630; VI-NEXT:    v_ffbh_u32_e32 v0, v1
631; VI-NEXT:    v_add_u32_e32 v0, vcc, 32, v0
632; VI-NEXT:    v_ffbh_u32_e32 v5, v2
633; VI-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v2
634; VI-NEXT:    v_or_b32_e32 v1, v1, v2
635; VI-NEXT:    v_cndmask_b32_e32 v0, v5, v0, vcc
636; VI-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v1
637; VI-NEXT:    v_cndmask_b32_e32 v0, 64, v0, vcc
638; VI-NEXT:    flat_store_dword v[3:4], v0
639; VI-NEXT:    s_endpgm
640;
641; EG-LABEL: v_ctlz_i64_trunc:
642; EG:       ; %bb.0:
643; EG-NEXT:    ALU 2, @8, KC0[CB0:0-32], KC1[]
644; EG-NEXT:    TEX 0 @6
645; EG-NEXT:    ALU 10, @11, KC0[CB0:0-32], KC1[]
646; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1
647; EG-NEXT:    CF_END
648; EG-NEXT:    PAD
649; EG-NEXT:    Fetch clause starting at 6:
650; EG-NEXT:     VTX_READ_64 T1.XY, T1.X, 0, #1
651; EG-NEXT:    ALU clause starting at 8:
652; EG-NEXT:     LSHL * T0.W, T0.X, literal.x,
653; EG-NEXT:    3(4.203895e-45), 0(0.000000e+00)
654; EG-NEXT:     ADD_INT * T1.X, KC0[2].Z, PV.W,
655; EG-NEXT:    ALU clause starting at 11:
656; EG-NEXT:     FFBH_UINT * T0.W, T1.X,
657; EG-NEXT:     CNDE_INT * T0.W, T1.X, literal.x, PV.W,
658; EG-NEXT:    32(4.484155e-44), 0(0.000000e+00)
659; EG-NEXT:     LSHL T0.Z, T0.X, literal.x,
660; EG-NEXT:     FFBH_UINT T1.W, T1.Y,
661; EG-NEXT:     ADD_INT * T0.W, PV.W, literal.y,
662; EG-NEXT:    2(2.802597e-45), 32(4.484155e-44)
663; EG-NEXT:     CNDE_INT T0.X, T1.Y, PS, PV.W,
664; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, PV.Z,
665; EG-NEXT:     LSHR * T1.X, PV.W, literal.x,
666; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
667  %tid = call i32 @llvm.amdgcn.workitem.id.x()
668  %in.gep = getelementptr i64, i64 addrspace(1)* %in, i32 %tid
669  %out.gep = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
670  %val = load i64, i64 addrspace(1)* %in.gep
671  %ctlz = call i64 @llvm.ctlz.i64(i64 %val, i1 false)
672  %trunc = trunc i64 %ctlz to i32
673  store i32 %trunc, i32 addrspace(1)* %out.gep
674  ret void
675}
676
677define amdgpu_kernel void @v_ctlz_i32_sel_eq_neg1(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %valptr) nounwind {
678; SI-LABEL: v_ctlz_i32_sel_eq_neg1:
679; SI:       ; %bb.0:
680; SI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0xb
681; SI-NEXT:    s_mov_b32 s3, 0xf000
682; SI-NEXT:    s_mov_b32 s6, 0
683; SI-NEXT:    s_mov_b32 s7, s3
684; SI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
685; SI-NEXT:    v_mov_b32_e32 v1, 0
686; SI-NEXT:    s_waitcnt lgkmcnt(0)
687; SI-NEXT:    buffer_load_dword v0, v[0:1], s[4:7], 0 addr64
688; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
689; SI-NEXT:    s_mov_b32 s2, -1
690; SI-NEXT:    s_waitcnt vmcnt(0)
691; SI-NEXT:    v_ffbh_u32_e32 v0, v0
692; SI-NEXT:    s_waitcnt lgkmcnt(0)
693; SI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
694; SI-NEXT:    s_endpgm
695;
696; VI-LABEL: v_ctlz_i32_sel_eq_neg1:
697; VI:       ; %bb.0:
698; VI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x24
699; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x2c
700; VI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
701; VI-NEXT:    s_mov_b32 s7, 0xf000
702; VI-NEXT:    s_mov_b32 s6, -1
703; VI-NEXT:    s_waitcnt lgkmcnt(0)
704; VI-NEXT:    v_mov_b32_e32 v1, s1
705; VI-NEXT:    v_add_u32_e32 v0, vcc, s0, v0
706; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
707; VI-NEXT:    flat_load_dword v0, v[0:1]
708; VI-NEXT:    s_waitcnt vmcnt(0)
709; VI-NEXT:    v_ffbh_u32_e32 v0, v0
710; VI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
711; VI-NEXT:    s_endpgm
712;
713; EG-LABEL: v_ctlz_i32_sel_eq_neg1:
714; EG:       ; %bb.0:
715; EG-NEXT:    ALU 2, @8, KC0[CB0:0-32], KC1[]
716; EG-NEXT:    TEX 0 @6
717; EG-NEXT:    ALU 5, @11, KC0[CB0:0-32], KC1[]
718; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1
719; EG-NEXT:    CF_END
720; EG-NEXT:    PAD
721; EG-NEXT:    Fetch clause starting at 6:
722; EG-NEXT:     VTX_READ_32 T0.X, T0.X, 0, #1
723; EG-NEXT:    ALU clause starting at 8:
724; EG-NEXT:     LSHL * T0.W, T0.X, literal.x,
725; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
726; EG-NEXT:     ADD_INT * T0.X, KC0[2].Z, PV.W,
727; EG-NEXT:    ALU clause starting at 11:
728; EG-NEXT:     FFBH_UINT * T0.W, T0.X,
729; EG-NEXT:     CNDE_INT * T0.W, T0.X, literal.x, PV.W,
730; EG-NEXT:    32(4.484155e-44), 0(0.000000e+00)
731; EG-NEXT:     CNDE_INT T0.X, T0.X, literal.x, PV.W,
732; EG-NEXT:     LSHR * T1.X, KC0[2].Y, literal.y,
733; EG-NEXT:    -1(nan), 2(2.802597e-45)
734  %tid = call i32 @llvm.amdgcn.workitem.id.x()
735  %in.gep = getelementptr i32, i32 addrspace(1)* %valptr, i32 %tid
736  %val = load i32, i32 addrspace(1)* %in.gep
737  %ctlz = call i32 @llvm.ctlz.i32(i32 %val, i1 false) nounwind readnone
738  %cmp = icmp eq i32 %val, 0
739  %sel = select i1 %cmp, i32 -1, i32 %ctlz
740  store i32 %sel, i32 addrspace(1)* %out
741  ret void
742}
743
744define amdgpu_kernel void @v_ctlz_i32_sel_ne_neg1(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %valptr) nounwind {
745; SI-LABEL: v_ctlz_i32_sel_ne_neg1:
746; SI:       ; %bb.0:
747; SI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0xb
748; SI-NEXT:    s_mov_b32 s3, 0xf000
749; SI-NEXT:    s_mov_b32 s6, 0
750; SI-NEXT:    s_mov_b32 s7, s3
751; SI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
752; SI-NEXT:    v_mov_b32_e32 v1, 0
753; SI-NEXT:    s_waitcnt lgkmcnt(0)
754; SI-NEXT:    buffer_load_dword v0, v[0:1], s[4:7], 0 addr64
755; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
756; SI-NEXT:    s_mov_b32 s2, -1
757; SI-NEXT:    s_waitcnt vmcnt(0)
758; SI-NEXT:    v_ffbh_u32_e32 v0, v0
759; SI-NEXT:    s_waitcnt lgkmcnt(0)
760; SI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
761; SI-NEXT:    s_endpgm
762;
763; VI-LABEL: v_ctlz_i32_sel_ne_neg1:
764; VI:       ; %bb.0:
765; VI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x24
766; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x2c
767; VI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
768; VI-NEXT:    s_mov_b32 s7, 0xf000
769; VI-NEXT:    s_mov_b32 s6, -1
770; VI-NEXT:    s_waitcnt lgkmcnt(0)
771; VI-NEXT:    v_mov_b32_e32 v1, s1
772; VI-NEXT:    v_add_u32_e32 v0, vcc, s0, v0
773; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
774; VI-NEXT:    flat_load_dword v0, v[0:1]
775; VI-NEXT:    s_waitcnt vmcnt(0)
776; VI-NEXT:    v_ffbh_u32_e32 v0, v0
777; VI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
778; VI-NEXT:    s_endpgm
779;
780; EG-LABEL: v_ctlz_i32_sel_ne_neg1:
781; EG:       ; %bb.0:
782; EG-NEXT:    ALU 2, @8, KC0[CB0:0-32], KC1[]
783; EG-NEXT:    TEX 0 @6
784; EG-NEXT:    ALU 5, @11, KC0[CB0:0-32], KC1[]
785; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1
786; EG-NEXT:    CF_END
787; EG-NEXT:    PAD
788; EG-NEXT:    Fetch clause starting at 6:
789; EG-NEXT:     VTX_READ_32 T0.X, T0.X, 0, #1
790; EG-NEXT:    ALU clause starting at 8:
791; EG-NEXT:     LSHL * T0.W, T0.X, literal.x,
792; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
793; EG-NEXT:     ADD_INT * T0.X, KC0[2].Z, PV.W,
794; EG-NEXT:    ALU clause starting at 11:
795; EG-NEXT:     FFBH_UINT * T0.W, T0.X,
796; EG-NEXT:     CNDE_INT * T0.W, T0.X, literal.x, PV.W,
797; EG-NEXT:    32(4.484155e-44), 0(0.000000e+00)
798; EG-NEXT:     CNDE_INT T0.X, T0.X, literal.x, PV.W,
799; EG-NEXT:     LSHR * T1.X, KC0[2].Y, literal.y,
800; EG-NEXT:    -1(nan), 2(2.802597e-45)
801  %tid = call i32 @llvm.amdgcn.workitem.id.x()
802  %in.gep = getelementptr i32, i32 addrspace(1)* %valptr, i32 %tid
803  %val = load i32, i32 addrspace(1)* %in.gep
804  %ctlz = call i32 @llvm.ctlz.i32(i32 %val, i1 false) nounwind readnone
805  %cmp = icmp ne i32 %val, 0
806  %sel = select i1 %cmp, i32 %ctlz, i32 -1
807  store i32 %sel, i32 addrspace(1)* %out
808  ret void
809}
810
811; TODO: Should be able to eliminate select here as well.
812define amdgpu_kernel void @v_ctlz_i32_sel_eq_bitwidth(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %valptr) nounwind {
813; SI-LABEL: v_ctlz_i32_sel_eq_bitwidth:
814; SI:       ; %bb.0:
815; SI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0xb
816; SI-NEXT:    s_mov_b32 s3, 0xf000
817; SI-NEXT:    s_mov_b32 s6, 0
818; SI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
819; SI-NEXT:    v_mov_b32_e32 v1, 0
820; SI-NEXT:    s_mov_b32 s7, s3
821; SI-NEXT:    s_waitcnt lgkmcnt(0)
822; SI-NEXT:    buffer_load_dword v0, v[0:1], s[4:7], 0 addr64
823; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
824; SI-NEXT:    s_mov_b32 s2, -1
825; SI-NEXT:    s_waitcnt vmcnt(0)
826; SI-NEXT:    v_ffbh_u32_e32 v1, v0
827; SI-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v0
828; SI-NEXT:    v_cndmask_b32_e32 v0, 32, v1, vcc
829; SI-NEXT:    v_cmp_ne_u32_e32 vcc, 32, v0
830; SI-NEXT:    v_cndmask_b32_e32 v0, -1, v0, vcc
831; SI-NEXT:    s_waitcnt lgkmcnt(0)
832; SI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
833; SI-NEXT:    s_endpgm
834;
835; VI-LABEL: v_ctlz_i32_sel_eq_bitwidth:
836; VI:       ; %bb.0:
837; VI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x24
838; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x2c
839; VI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
840; VI-NEXT:    s_mov_b32 s7, 0xf000
841; VI-NEXT:    s_mov_b32 s6, -1
842; VI-NEXT:    s_waitcnt lgkmcnt(0)
843; VI-NEXT:    v_mov_b32_e32 v1, s1
844; VI-NEXT:    v_add_u32_e32 v0, vcc, s0, v0
845; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
846; VI-NEXT:    flat_load_dword v0, v[0:1]
847; VI-NEXT:    s_waitcnt vmcnt(0)
848; VI-NEXT:    v_ffbh_u32_e32 v1, v0
849; VI-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v0
850; VI-NEXT:    v_cndmask_b32_e32 v0, 32, v1, vcc
851; VI-NEXT:    v_cmp_ne_u32_e32 vcc, 32, v0
852; VI-NEXT:    v_cndmask_b32_e32 v0, -1, v0, vcc
853; VI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
854; VI-NEXT:    s_endpgm
855;
856; EG-LABEL: v_ctlz_i32_sel_eq_bitwidth:
857; EG:       ; %bb.0:
858; EG-NEXT:    ALU 2, @8, KC0[CB0:0-32], KC1[]
859; EG-NEXT:    TEX 0 @6
860; EG-NEXT:    ALU 7, @11, KC0[CB0:0-32], KC1[]
861; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1
862; EG-NEXT:    CF_END
863; EG-NEXT:    PAD
864; EG-NEXT:    Fetch clause starting at 6:
865; EG-NEXT:     VTX_READ_32 T0.X, T0.X, 0, #1
866; EG-NEXT:    ALU clause starting at 8:
867; EG-NEXT:     LSHL * T0.W, T0.X, literal.x,
868; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
869; EG-NEXT:     ADD_INT * T0.X, KC0[2].Z, PV.W,
870; EG-NEXT:    ALU clause starting at 11:
871; EG-NEXT:     FFBH_UINT * T0.W, T0.X,
872; EG-NEXT:     CNDE_INT * T0.W, T0.X, literal.x, PV.W,
873; EG-NEXT:    32(4.484155e-44), 0(0.000000e+00)
874; EG-NEXT:     SETE_INT * T1.W, PV.W, literal.x,
875; EG-NEXT:    32(4.484155e-44), 0(0.000000e+00)
876; EG-NEXT:     CNDE_INT T0.X, PV.W, T0.W, literal.x,
877; EG-NEXT:     LSHR * T1.X, KC0[2].Y, literal.y,
878; EG-NEXT:    -1(nan), 2(2.802597e-45)
879  %tid = call i32 @llvm.amdgcn.workitem.id.x()
880  %in.gep = getelementptr i32, i32 addrspace(1)* %valptr, i32 %tid
881  %val = load i32, i32 addrspace(1)* %in.gep
882  %ctlz = call i32 @llvm.ctlz.i32(i32 %val, i1 false) nounwind readnone
883  %cmp = icmp eq i32 %ctlz, 32
884  %sel = select i1 %cmp, i32 -1, i32 %ctlz
885  store i32 %sel, i32 addrspace(1)* %out
886  ret void
887}
888
889define amdgpu_kernel void @v_ctlz_i32_sel_ne_bitwidth(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %valptr) nounwind {
890; SI-LABEL: v_ctlz_i32_sel_ne_bitwidth:
891; SI:       ; %bb.0:
892; SI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0xb
893; SI-NEXT:    s_mov_b32 s3, 0xf000
894; SI-NEXT:    s_mov_b32 s6, 0
895; SI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
896; SI-NEXT:    v_mov_b32_e32 v1, 0
897; SI-NEXT:    s_mov_b32 s7, s3
898; SI-NEXT:    s_waitcnt lgkmcnt(0)
899; SI-NEXT:    buffer_load_dword v0, v[0:1], s[4:7], 0 addr64
900; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
901; SI-NEXT:    s_mov_b32 s2, -1
902; SI-NEXT:    s_waitcnt vmcnt(0)
903; SI-NEXT:    v_ffbh_u32_e32 v1, v0
904; SI-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v0
905; SI-NEXT:    v_cndmask_b32_e32 v0, 32, v1, vcc
906; SI-NEXT:    v_cmp_ne_u32_e32 vcc, 32, v0
907; SI-NEXT:    v_cndmask_b32_e32 v0, -1, v0, vcc
908; SI-NEXT:    s_waitcnt lgkmcnt(0)
909; SI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
910; SI-NEXT:    s_endpgm
911;
912; VI-LABEL: v_ctlz_i32_sel_ne_bitwidth:
913; VI:       ; %bb.0:
914; VI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x24
915; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x2c
916; VI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
917; VI-NEXT:    s_mov_b32 s7, 0xf000
918; VI-NEXT:    s_mov_b32 s6, -1
919; VI-NEXT:    s_waitcnt lgkmcnt(0)
920; VI-NEXT:    v_mov_b32_e32 v1, s1
921; VI-NEXT:    v_add_u32_e32 v0, vcc, s0, v0
922; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
923; VI-NEXT:    flat_load_dword v0, v[0:1]
924; VI-NEXT:    s_waitcnt vmcnt(0)
925; VI-NEXT:    v_ffbh_u32_e32 v1, v0
926; VI-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v0
927; VI-NEXT:    v_cndmask_b32_e32 v0, 32, v1, vcc
928; VI-NEXT:    v_cmp_ne_u32_e32 vcc, 32, v0
929; VI-NEXT:    v_cndmask_b32_e32 v0, -1, v0, vcc
930; VI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
931; VI-NEXT:    s_endpgm
932;
933; EG-LABEL: v_ctlz_i32_sel_ne_bitwidth:
934; EG:       ; %bb.0:
935; EG-NEXT:    ALU 2, @8, KC0[CB0:0-32], KC1[]
936; EG-NEXT:    TEX 0 @6
937; EG-NEXT:    ALU 7, @11, KC0[CB0:0-32], KC1[]
938; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1
939; EG-NEXT:    CF_END
940; EG-NEXT:    PAD
941; EG-NEXT:    Fetch clause starting at 6:
942; EG-NEXT:     VTX_READ_32 T0.X, T0.X, 0, #1
943; EG-NEXT:    ALU clause starting at 8:
944; EG-NEXT:     LSHL * T0.W, T0.X, literal.x,
945; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
946; EG-NEXT:     ADD_INT * T0.X, KC0[2].Z, PV.W,
947; EG-NEXT:    ALU clause starting at 11:
948; EG-NEXT:     FFBH_UINT * T0.W, T0.X,
949; EG-NEXT:     CNDE_INT * T0.W, T0.X, literal.x, PV.W,
950; EG-NEXT:    32(4.484155e-44), 0(0.000000e+00)
951; EG-NEXT:     SETNE_INT * T1.W, PV.W, literal.x,
952; EG-NEXT:    32(4.484155e-44), 0(0.000000e+00)
953; EG-NEXT:     CNDE_INT T0.X, PV.W, literal.x, T0.W,
954; EG-NEXT:     LSHR * T1.X, KC0[2].Y, literal.y,
955; EG-NEXT:    -1(nan), 2(2.802597e-45)
956  %tid = call i32 @llvm.amdgcn.workitem.id.x()
957  %in.gep = getelementptr i32, i32 addrspace(1)* %valptr, i32 %tid
958  %val = load i32, i32 addrspace(1)* %in.gep
959  %ctlz = call i32 @llvm.ctlz.i32(i32 %val, i1 false) nounwind readnone
960  %cmp = icmp ne i32 %ctlz, 32
961  %sel = select i1 %cmp, i32 %ctlz, i32 -1
962  store i32 %sel, i32 addrspace(1)* %out
963  ret void
964}
965
966 define amdgpu_kernel void @v_ctlz_i8_sel_eq_neg1(i8 addrspace(1)* noalias %out, i8 addrspace(1)* noalias %valptr) nounwind {
967; SI-LABEL: v_ctlz_i8_sel_eq_neg1:
968; SI:       ; %bb.0:
969; SI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0xb
970; SI-NEXT:    s_mov_b32 s3, 0xf000
971; SI-NEXT:    v_mov_b32_e32 v1, 0
972; SI-NEXT:    s_mov_b32 s6, 0
973; SI-NEXT:    s_mov_b32 s7, s3
974; SI-NEXT:    s_waitcnt lgkmcnt(0)
975; SI-NEXT:    buffer_load_ubyte v0, v[0:1], s[4:7], 0 addr64
976; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
977; SI-NEXT:    s_mov_b32 s2, -1
978; SI-NEXT:    s_waitcnt vmcnt(0)
979; SI-NEXT:    v_ffbh_u32_e32 v0, v0
980; SI-NEXT:    s_waitcnt lgkmcnt(0)
981; SI-NEXT:    buffer_store_byte v0, off, s[0:3], 0
982; SI-NEXT:    s_endpgm
983;
984; VI-LABEL: v_ctlz_i8_sel_eq_neg1:
985; VI:       ; %bb.0:
986; VI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x24
987; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x2c
988; VI-NEXT:    s_mov_b32 s7, 0xf000
989; VI-NEXT:    s_mov_b32 s6, -1
990; VI-NEXT:    s_waitcnt lgkmcnt(0)
991; VI-NEXT:    v_mov_b32_e32 v1, s1
992; VI-NEXT:    v_add_u32_e32 v0, vcc, s0, v0
993; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
994; VI-NEXT:    flat_load_ubyte v0, v[0:1]
995; VI-NEXT:    s_waitcnt vmcnt(0)
996; VI-NEXT:    v_ffbh_u32_e32 v0, v0
997; VI-NEXT:    buffer_store_byte v0, off, s[4:7], 0
998; VI-NEXT:    s_endpgm
999;
1000; EG-LABEL: v_ctlz_i8_sel_eq_neg1:
1001; EG:       ; %bb.0:
1002; EG-NEXT:    ALU 0, @8, KC0[CB0:0-32], KC1[]
1003; EG-NEXT:    TEX 0 @6
1004; EG-NEXT:    ALU 12, @9, KC0[CB0:0-32], KC1[]
1005; EG-NEXT:    MEM_RAT MSKOR T0.XW, T1.X
1006; EG-NEXT:    CF_END
1007; EG-NEXT:    PAD
1008; EG-NEXT:    Fetch clause starting at 6:
1009; EG-NEXT:     VTX_READ_8 T0.X, T0.X, 0, #1
1010; EG-NEXT:    ALU clause starting at 8:
1011; EG-NEXT:     ADD_INT * T0.X, KC0[2].Z, T0.X,
1012; EG-NEXT:    ALU clause starting at 9:
1013; EG-NEXT:     FFBH_UINT T0.W, T0.X,
1014; EG-NEXT:     AND_INT * T1.W, KC0[2].Y, literal.x,
1015; EG-NEXT:    3(4.203895e-45), 0(0.000000e+00)
1016; EG-NEXT:     AND_INT T0.W, PV.W, literal.x,
1017; EG-NEXT:     LSHL * T1.W, PS, literal.y,
1018; EG-NEXT:    255(3.573311e-43), 3(4.203895e-45)
1019; EG-NEXT:     LSHL T0.X, PV.W, PS,
1020; EG-NEXT:     LSHL * T0.W, literal.x, PS,
1021; EG-NEXT:    255(3.573311e-43), 0(0.000000e+00)
1022; EG-NEXT:     MOV T0.Y, 0.0,
1023; EG-NEXT:     MOV * T0.Z, 0.0,
1024; EG-NEXT:     LSHR * T1.X, KC0[2].Y, literal.x,
1025; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
1026  %tid = call i32 @llvm.amdgcn.workitem.id.x()
1027  %valptr.gep = getelementptr i8, i8 addrspace(1)* %valptr, i32 %tid
1028  %val = load i8, i8 addrspace(1)* %valptr.gep
1029  %ctlz = call i8 @llvm.ctlz.i8(i8 %val, i1 false) nounwind readnone
1030  %cmp = icmp eq i8 %val, 0
1031  %sel = select i1 %cmp, i8 -1, i8 %ctlz
1032  store i8 %sel, i8 addrspace(1)* %out
1033  ret void
1034}
1035
1036 define amdgpu_kernel void @v_ctlz_i16_sel_eq_neg1(i16 addrspace(1)* noalias %out, i16 addrspace(1)* noalias %valptr) nounwind {
1037; SI-LABEL: v_ctlz_i16_sel_eq_neg1:
1038; SI:       ; %bb.0:
1039; SI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0xb
1040; SI-NEXT:    s_mov_b32 s3, 0xf000
1041; SI-NEXT:    s_mov_b32 s2, -1
1042; SI-NEXT:    s_mov_b32 s6, s2
1043; SI-NEXT:    s_mov_b32 s7, s3
1044; SI-NEXT:    s_waitcnt lgkmcnt(0)
1045; SI-NEXT:    buffer_load_ushort v0, off, s[4:7], 0
1046; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
1047; SI-NEXT:    s_waitcnt vmcnt(0)
1048; SI-NEXT:    v_ffbh_u32_e32 v0, v0
1049; SI-NEXT:    s_waitcnt lgkmcnt(0)
1050; SI-NEXT:    buffer_store_short v0, off, s[0:3], 0
1051; SI-NEXT:    s_endpgm
1052;
1053; VI-LABEL: v_ctlz_i16_sel_eq_neg1:
1054; VI:       ; %bb.0:
1055; VI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x24
1056; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x2c
1057; VI-NEXT:    s_mov_b32 s7, 0xf000
1058; VI-NEXT:    s_mov_b32 s6, -1
1059; VI-NEXT:    s_mov_b32 s2, s6
1060; VI-NEXT:    s_mov_b32 s3, s7
1061; VI-NEXT:    s_waitcnt lgkmcnt(0)
1062; VI-NEXT:    buffer_load_ushort v0, off, s[0:3], 0
1063; VI-NEXT:    s_waitcnt vmcnt(0)
1064; VI-NEXT:    v_ffbh_u32_e32 v1, v0
1065; VI-NEXT:    v_cmp_ne_u32_e64 s[0:1], 0, v0
1066; VI-NEXT:    v_cndmask_b32_e64 v0, 32, v1, s[0:1]
1067; VI-NEXT:    v_add_u32_e32 v0, vcc, -16, v0
1068; VI-NEXT:    v_mov_b32_e32 v1, 0xffff
1069; VI-NEXT:    v_cndmask_b32_e64 v0, v1, v0, s[0:1]
1070; VI-NEXT:    buffer_store_short v0, off, s[4:7], 0
1071; VI-NEXT:    s_endpgm
1072;
1073; EG-LABEL: v_ctlz_i16_sel_eq_neg1:
1074; EG:       ; %bb.0:
1075; EG-NEXT:    ALU 0, @8, KC0[CB0:0-32], KC1[]
1076; EG-NEXT:    TEX 0 @6
1077; EG-NEXT:    ALU 12, @9, KC0[CB0:0-32], KC1[]
1078; EG-NEXT:    MEM_RAT MSKOR T0.XW, T1.X
1079; EG-NEXT:    CF_END
1080; EG-NEXT:    PAD
1081; EG-NEXT:    Fetch clause starting at 6:
1082; EG-NEXT:     VTX_READ_16 T0.X, T0.X, 0, #1
1083; EG-NEXT:    ALU clause starting at 8:
1084; EG-NEXT:     MOV * T0.X, KC0[2].Z,
1085; EG-NEXT:    ALU clause starting at 9:
1086; EG-NEXT:     FFBH_UINT T0.W, T0.X,
1087; EG-NEXT:     AND_INT * T1.W, KC0[2].Y, literal.x,
1088; EG-NEXT:    3(4.203895e-45), 0(0.000000e+00)
1089; EG-NEXT:     AND_INT T0.W, PV.W, literal.x,
1090; EG-NEXT:     LSHL * T1.W, PS, literal.y,
1091; EG-NEXT:    65535(9.183409e-41), 3(4.203895e-45)
1092; EG-NEXT:     LSHL T0.X, PV.W, PS,
1093; EG-NEXT:     LSHL * T0.W, literal.x, PS,
1094; EG-NEXT:    65535(9.183409e-41), 0(0.000000e+00)
1095; EG-NEXT:     MOV T0.Y, 0.0,
1096; EG-NEXT:     MOV * T0.Z, 0.0,
1097; EG-NEXT:     LSHR * T1.X, KC0[2].Y, literal.x,
1098; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
1099  %val = load i16, i16 addrspace(1)* %valptr
1100  %ctlz = call i16 @llvm.ctlz.i16(i16 %val, i1 false) nounwind readnone
1101  %cmp = icmp eq i16 %val, 0
1102  %sel = select i1 %cmp, i16 -1, i16 %ctlz
1103  store i16 %sel, i16 addrspace(1)* %out
1104  ret void
1105}
1106
1107; FIXME: Need to handle non-uniform case for function below (load without gep).
1108define amdgpu_kernel void @v_ctlz_i7_sel_eq_neg1(i7 addrspace(1)* noalias %out, i7 addrspace(1)* noalias %valptr) nounwind {
1109; SI-LABEL: v_ctlz_i7_sel_eq_neg1:
1110; SI:       ; %bb.0:
1111; SI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0xb
1112; SI-NEXT:    s_mov_b32 s3, 0xf000
1113; SI-NEXT:    v_mov_b32_e32 v1, 0
1114; SI-NEXT:    s_mov_b32 s6, 0
1115; SI-NEXT:    s_mov_b32 s7, s3
1116; SI-NEXT:    s_waitcnt lgkmcnt(0)
1117; SI-NEXT:    buffer_load_ubyte v0, v[0:1], s[4:7], 0 addr64
1118; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
1119; SI-NEXT:    s_mov_b32 s2, -1
1120; SI-NEXT:    s_waitcnt vmcnt(0)
1121; SI-NEXT:    v_ffbh_u32_e32 v0, v0
1122; SI-NEXT:    v_and_b32_e32 v0, 0x7f, v0
1123; SI-NEXT:    s_waitcnt lgkmcnt(0)
1124; SI-NEXT:    buffer_store_byte v0, off, s[0:3], 0
1125; SI-NEXT:    s_endpgm
1126;
1127; VI-LABEL: v_ctlz_i7_sel_eq_neg1:
1128; VI:       ; %bb.0:
1129; VI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x24
1130; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x2c
1131; VI-NEXT:    s_mov_b32 s7, 0xf000
1132; VI-NEXT:    s_mov_b32 s6, -1
1133; VI-NEXT:    s_waitcnt lgkmcnt(0)
1134; VI-NEXT:    v_mov_b32_e32 v1, s1
1135; VI-NEXT:    v_add_u32_e32 v0, vcc, s0, v0
1136; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
1137; VI-NEXT:    flat_load_ubyte v0, v[0:1]
1138; VI-NEXT:    s_waitcnt vmcnt(0)
1139; VI-NEXT:    v_ffbh_u32_e32 v0, v0
1140; VI-NEXT:    v_and_b32_e32 v0, 0x7f, v0
1141; VI-NEXT:    buffer_store_byte v0, off, s[4:7], 0
1142; VI-NEXT:    s_endpgm
1143;
1144; EG-LABEL: v_ctlz_i7_sel_eq_neg1:
1145; EG:       ; %bb.0:
1146; EG-NEXT:    ALU 0, @8, KC0[CB0:0-32], KC1[]
1147; EG-NEXT:    TEX 0 @6
1148; EG-NEXT:    ALU 12, @9, KC0[CB0:0-32], KC1[]
1149; EG-NEXT:    MEM_RAT MSKOR T0.XW, T1.X
1150; EG-NEXT:    CF_END
1151; EG-NEXT:    PAD
1152; EG-NEXT:    Fetch clause starting at 6:
1153; EG-NEXT:     VTX_READ_8 T0.X, T0.X, 0, #1
1154; EG-NEXT:    ALU clause starting at 8:
1155; EG-NEXT:     ADD_INT * T0.X, KC0[2].Z, T0.X,
1156; EG-NEXT:    ALU clause starting at 9:
1157; EG-NEXT:     FFBH_UINT T0.W, T0.X,
1158; EG-NEXT:     AND_INT * T1.W, KC0[2].Y, literal.x,
1159; EG-NEXT:    3(4.203895e-45), 0(0.000000e+00)
1160; EG-NEXT:     AND_INT T0.W, PV.W, literal.x,
1161; EG-NEXT:     LSHL * T1.W, PS, literal.y,
1162; EG-NEXT:    127(1.779649e-43), 3(4.203895e-45)
1163; EG-NEXT:     LSHL T0.X, PV.W, PS,
1164; EG-NEXT:     LSHL * T0.W, literal.x, PS,
1165; EG-NEXT:    255(3.573311e-43), 0(0.000000e+00)
1166; EG-NEXT:     MOV T0.Y, 0.0,
1167; EG-NEXT:     MOV * T0.Z, 0.0,
1168; EG-NEXT:     LSHR * T1.X, KC0[2].Y, literal.x,
1169; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
1170  %tid = call i32 @llvm.amdgcn.workitem.id.x()
1171  %valptr.gep = getelementptr i7, i7 addrspace(1)* %valptr, i32 %tid
1172  %val = load i7, i7 addrspace(1)* %valptr.gep
1173  %ctlz = call i7 @llvm.ctlz.i7(i7 %val, i1 false) nounwind readnone
1174  %cmp = icmp eq i7 %val, 0
1175  %sel = select i1 %cmp, i7 -1, i7 %ctlz
1176  store i7 %sel, i7 addrspace(1)* %out
1177  ret void
1178}
1179