; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -march=amdgcn -verify-machineinstrs | FileCheck %s -enable-var-scope -check-prefixes=FUNC,GCN,SI
; RUN: llc < %s -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs | FileCheck %s -enable-var-scope -check-prefixes=FUNC,GCN,VI
; RUN: llc < %s -march=r600 -mcpu=cypress -verify-machineinstrs | FileCheck %s -enable-var-scope -check-prefixes=FUNC,EG

declare i7 @llvm.ctlz.i7(i7, i1) nounwind readnone
declare i8 @llvm.ctlz.i8(i8, i1) nounwind readnone
declare i16 @llvm.ctlz.i16(i16, i1) nounwind readnone

declare i32 @llvm.ctlz.i32(i32, i1) nounwind readnone
declare <2 x i32> @llvm.ctlz.v2i32(<2 x i32>, i1) nounwind readnone
declare <4 x i32> @llvm.ctlz.v4i32(<4 x i32>, i1) nounwind readnone

declare i64 @llvm.ctlz.i64(i64, i1) nounwind readnone
declare <2 x i64> @llvm.ctlz.v2i64(<2 x i64>, i1) nounwind readnone
declare <4 x i64> @llvm.ctlz.v4i64(<4 x i64>, i1) nounwind readnone

declare i32 @llvm.amdgcn.workitem.id.x() nounwind readnone

define amdgpu_kernel void @s_ctlz_i32(i32 addrspace(1)* noalias %out, i32 %val) nounwind {
; SI-LABEL: s_ctlz_i32:
; SI: ; %bb.0:
; SI-NEXT: s_load_dword s2, s[0:1], 0xb
; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9
; SI-NEXT: s_mov_b32 s7, 0xf000
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_flbit_i32_b32 s0, s2
; SI-NEXT: s_mov_b32 s6, -1
; SI-NEXT: v_mov_b32_e32 v0, s0
; SI-NEXT: v_cmp_ne_u32_e64 vcc, s2, 0
; SI-NEXT: v_cndmask_b32_e32 v0, 32, v0, vcc
; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0
; SI-NEXT: s_endpgm
;
; VI-LABEL: s_ctlz_i32:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
; VI-NEXT: s_load_dword s0, s[0:1], 0x2c
; VI-NEXT: s_mov_b32 s7, 0xf000
; VI-NEXT: s_mov_b32 s6, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_flbit_i32_b32 s1, s0
; VI-NEXT: s_cmp_lg_u32 s0, 0
; VI-NEXT: s_cselect_b32 s0, s1, 32
; VI-NEXT: v_mov_b32_e32 v0, s0
; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
; VI-NEXT: s_endpgm
;
; EG-LABEL: s_ctlz_i32:
; EG: ; %bb.0:
; EG-NEXT: ALU 3, @4, KC0[CB0:0-32], KC1[]
; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1
; EG-NEXT: CF_END
; EG-NEXT: PAD
; EG-NEXT: ALU clause starting at 4:
; EG-NEXT: FFBH_UINT * T0.W, KC0[2].Z,
; EG-NEXT: CNDE_INT T0.X, KC0[2].Z, literal.x, PV.W,
; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.y,
; EG-NEXT: 32(4.484155e-44), 2(2.802597e-45)
  %ctlz = call i32 @llvm.ctlz.i32(i32 %val, i1 false) nounwind readnone
  store i32 %ctlz, i32 addrspace(1)* %out, align 4
  ret void
}

define amdgpu_kernel void @v_ctlz_i32(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %valptr) nounwind {
; SI-LABEL: v_ctlz_i32:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb
; SI-NEXT: s_mov_b32 s3, 0xf000
; SI-NEXT: s_mov_b32 s6, 0
; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; SI-NEXT: v_mov_b32_e32 v1, 0
; SI-NEXT: s_mov_b32 s7, s3
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: buffer_load_dword v0, v[0:1], s[4:7], 0 addr64
; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
; SI-NEXT: s_mov_b32 s2, -1
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_ffbh_u32_e32 v1, v0
; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0
; SI-NEXT: v_cndmask_b32_e32 v0, 32, v1, vcc
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0
; SI-NEXT: s_endpgm
;
; VI-LABEL: v_ctlz_i32:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2c
; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; VI-NEXT: s_mov_b32 s7, 0xf000
; VI-NEXT: s_mov_b32 s6, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v0
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: flat_load_dword v0, v[0:1]
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_ffbh_u32_e32 v1, v0
; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0
; VI-NEXT: v_cndmask_b32_e32 v0, 32, v1, vcc
; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
; VI-NEXT: s_endpgm
;
; EG-LABEL: v_ctlz_i32:
; EG: ; %bb.0:
; EG-NEXT: ALU 2, @8, KC0[CB0:0-32], KC1[]
; EG-NEXT: TEX 0 @6
; EG-NEXT: ALU 3, @11, KC0[CB0:0-32], KC1[]
; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1
; EG-NEXT: CF_END
; EG-NEXT: PAD
; EG-NEXT: Fetch clause starting at 6:
; EG-NEXT: VTX_READ_32 T0.X, T0.X, 0, #1
; EG-NEXT: ALU clause starting at 8:
; EG-NEXT: LSHL * T0.W, T0.X, literal.x,
; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
; EG-NEXT: ADD_INT * T0.X, KC0[2].Z, PV.W,
; EG-NEXT: ALU clause starting at 11:
; EG-NEXT: FFBH_UINT * T0.W, T0.X,
; EG-NEXT: CNDE_INT T0.X, T0.X, literal.x, PV.W,
; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.y,
; EG-NEXT: 32(4.484155e-44), 2(2.802597e-45)
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %in.gep = getelementptr i32, i32 addrspace(1)* %valptr, i32 %tid
  %val = load i32, i32 addrspace(1)* %in.gep, align 4
  %ctlz = call i32 @llvm.ctlz.i32(i32 %val, i1 false) nounwind readnone
  store i32 %ctlz, i32 addrspace(1)* %out, align 4
  ret void
}

define amdgpu_kernel void @v_ctlz_v2i32(<2 x i32> addrspace(1)* noalias %out, <2 x i32> addrspace(1)* noalias %valptr) nounwind {
; SI-LABEL: v_ctlz_v2i32:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb
; SI-NEXT: s_mov_b32 s3, 0xf000
; SI-NEXT: s_mov_b32 s6, 0
; SI-NEXT: v_lshlrev_b32_e32 v0, 3, v0
; SI-NEXT: v_mov_b32_e32 v1, 0
; SI-NEXT: s_mov_b32 s7, s3
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: buffer_load_dwordx2 v[0:1], v[0:1], s[4:7], 0 addr64
; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
; SI-NEXT: s_mov_b32 s2, -1
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_ffbh_u32_e32 v2, v1
; SI-NEXT: v_ffbh_u32_e32 v3, v0
; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1
; SI-NEXT: v_cndmask_b32_e32 v1, 32, v2, vcc
; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0
; SI-NEXT: v_cndmask_b32_e32 v0, 32, v3, vcc
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; SI-NEXT: s_endpgm
;
; VI-LABEL: v_ctlz_v2i32:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2c
; VI-NEXT: v_lshlrev_b32_e32 v0, 3, v0
; VI-NEXT: s_mov_b32 s7, 0xf000
; VI-NEXT: s_mov_b32 s6, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v0
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_ffbh_u32_e32 v2, v1
; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1
; VI-NEXT: v_cndmask_b32_e32 v1, 32, v2, vcc
; VI-NEXT: v_ffbh_u32_e32 v3, v0
; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0
; VI-NEXT: v_cndmask_b32_e32 v0, 32, v3, vcc
; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; VI-NEXT: s_endpgm
;
; EG-LABEL: v_ctlz_v2i32:
; EG: ; %bb.0:
; EG-NEXT: ALU 2, @8, KC0[CB0:0-32], KC1[]
; EG-NEXT: TEX 0 @6
; EG-NEXT: ALU 6, @11, KC0[CB0:0-32], KC1[]
; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1
; EG-NEXT: CF_END
; EG-NEXT: PAD
; EG-NEXT: Fetch clause starting at 6:
; EG-NEXT: VTX_READ_64 T0.XY, T0.X, 0, #1
; EG-NEXT: ALU clause starting at 8:
; EG-NEXT: LSHL * T0.W, T0.X, literal.x,
; EG-NEXT: 3(4.203895e-45), 0(0.000000e+00)
; EG-NEXT: ADD_INT * T0.X, KC0[2].Z, PV.W,
; EG-NEXT: ALU clause starting at 11:
; EG-NEXT: FFBH_UINT * T0.W, T0.Y,
; EG-NEXT: CNDE_INT T0.Y, T0.Y, literal.x, PV.W,
; EG-NEXT: FFBH_UINT * T0.W, T0.X,
; EG-NEXT: 32(4.484155e-44), 0(0.000000e+00)
; EG-NEXT: CNDE_INT T0.X, T0.X, literal.x, PV.W,
; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.y,
; EG-NEXT: 32(4.484155e-44), 2(2.802597e-45)
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %in.gep = getelementptr <2 x i32>, <2 x i32> addrspace(1)* %valptr, i32 %tid
  %val = load <2 x i32>, <2 x i32> addrspace(1)* %in.gep, align 8
  %ctlz = call <2 x i32> @llvm.ctlz.v2i32(<2 x i32> %val, i1 false) nounwind readnone
  store <2 x i32> %ctlz, <2 x i32> addrspace(1)* %out, align 8
  ret void
}

define amdgpu_kernel void @v_ctlz_v4i32(<4 x i32> addrspace(1)* noalias %out, <4 x i32> addrspace(1)* noalias %valptr) nounwind {
; SI-LABEL: v_ctlz_v4i32:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb
; SI-NEXT: s_mov_b32 s3, 0xf000
; SI-NEXT: s_mov_b32 s6, 0
; SI-NEXT: v_lshlrev_b32_e32 v0, 4, v0
; SI-NEXT: v_mov_b32_e32 v1, 0
; SI-NEXT: s_mov_b32 s7, s3
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: buffer_load_dwordx4 v[0:3], v[0:1], s[4:7], 0 addr64
; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
; SI-NEXT: s_mov_b32 s2, -1
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_ffbh_u32_e32 v4, v3
; SI-NEXT: v_ffbh_u32_e32 v5, v2
; SI-NEXT: v_ffbh_u32_e32 v6, v1
; SI-NEXT: v_ffbh_u32_e32 v7, v0
; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v3
; SI-NEXT: v_cndmask_b32_e32 v3, 32, v4, vcc
; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2
; SI-NEXT: v_cndmask_b32_e32 v2, 32, v5, vcc
; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1
; SI-NEXT: v_cndmask_b32_e32 v1, 32, v6, vcc
; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0
; SI-NEXT: v_cndmask_b32_e32 v0, 32, v7, vcc
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
; SI-NEXT: s_endpgm
;
; VI-LABEL: v_ctlz_v4i32:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2c
; VI-NEXT: v_lshlrev_b32_e32 v0, 4, v0
; VI-NEXT: s_mov_b32 s7, 0xf000
; VI-NEXT: s_mov_b32 s6, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v0
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: flat_load_dwordx4 v[0:3], v[0:1]
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_ffbh_u32_e32 v4, v3
; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v3
; VI-NEXT: v_cndmask_b32_e32 v3, 32, v4, vcc
; VI-NEXT: v_ffbh_u32_e32 v5, v2
; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2
; VI-NEXT: v_cndmask_b32_e32 v2, 32, v5, vcc
; VI-NEXT: v_ffbh_u32_e32 v6, v1
; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1
; VI-NEXT: v_cndmask_b32_e32 v1, 32, v6, vcc
; VI-NEXT: v_ffbh_u32_e32 v7, v0
; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0
; VI-NEXT: v_cndmask_b32_e32 v0, 32, v7, vcc
; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0
; VI-NEXT: s_endpgm
;
; EG-LABEL: v_ctlz_v4i32:
; EG: ; %bb.0:
; EG-NEXT: ALU 2, @8, KC0[CB0:0-32], KC1[]
; EG-NEXT: TEX 0 @6
; EG-NEXT: ALU 12, @11, KC0[CB0:0-32], KC1[]
; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XYZW, T1.X, 1
; EG-NEXT: CF_END
; EG-NEXT: PAD
; EG-NEXT: Fetch clause starting at 6:
; EG-NEXT: VTX_READ_128 T0.XYZW, T0.X, 0, #1
; EG-NEXT: ALU clause starting at 8:
; EG-NEXT: LSHL * T0.W, T0.X, literal.x,
; EG-NEXT: 4(5.605194e-45), 0(0.000000e+00)
; EG-NEXT: ADD_INT * T0.X, KC0[2].Z, PV.W,
; EG-NEXT: ALU clause starting at 11:
; EG-NEXT: FFBH_UINT * T1.W, T0.W,
; EG-NEXT: FFBH_UINT T2.W, T0.Z,
; EG-NEXT: CNDE_INT * T0.W, T0.W, literal.x, PV.W, BS:VEC_021/SCL_122
; EG-NEXT: 32(4.484155e-44), 0(0.000000e+00)
; EG-NEXT: CNDE_INT T0.Z, T0.Z, literal.x, PV.W,
; EG-NEXT: FFBH_UINT * T1.W, T0.Y,
; EG-NEXT: 32(4.484155e-44), 0(0.000000e+00)
; EG-NEXT: CNDE_INT T0.Y, T0.Y, literal.x, PV.W,
; EG-NEXT: FFBH_UINT * T1.W, T0.X,
; EG-NEXT: 32(4.484155e-44), 0(0.000000e+00)
; EG-NEXT: CNDE_INT T0.X, T0.X, literal.x, PV.W,
; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.y,
; EG-NEXT: 32(4.484155e-44), 2(2.802597e-45)
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %in.gep = getelementptr <4 x i32>, <4 x i32> addrspace(1)* %valptr, i32 %tid
  %val = load <4 x i32>, <4 x i32> addrspace(1)* %in.gep, align 16
  %ctlz = call <4 x i32> @llvm.ctlz.v4i32(<4 x i32> %val, i1 false) nounwind readnone
  store <4 x i32> %ctlz, <4 x i32> addrspace(1)* %out, align 16
  ret void
}

define amdgpu_kernel void @v_ctlz_i8(i8 addrspace(1)* noalias %out, i8 addrspace(1)* noalias %valptr) nounwind {
; SI-LABEL: v_ctlz_i8:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb
; SI-NEXT: s_mov_b32 s3, 0xf000
; SI-NEXT: s_mov_b32 s2, -1
; SI-NEXT: s_mov_b32 s6, s2
; SI-NEXT: s_mov_b32 s7, s3
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: buffer_load_ubyte v0, off, s[4:7], 0
; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_ffbh_u32_e32 v1, v0
; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0
; SI-NEXT: v_cndmask_b32_e32 v0, 32, v1, vcc
; SI-NEXT: v_subrev_i32_e32 v0, vcc, 24, v0
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: buffer_store_byte v0, off, s[0:3], 0
; SI-NEXT: s_endpgm
;
; VI-LABEL: v_ctlz_i8:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2c
; VI-NEXT: s_mov_b32 s7, 0xf000
; VI-NEXT: s_mov_b32 s6, -1
; VI-NEXT: s_mov_b32 s2, s6
; VI-NEXT: s_mov_b32 s3, s7
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: buffer_load_ubyte v0, off, s[0:3], 0
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_ffbh_u32_sdwa v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0
; VI-NEXT: v_cmp_ne_u16_e32 vcc, 0, v0
; VI-NEXT: v_cndmask_b32_e32 v0, 32, v1, vcc
; VI-NEXT: v_add_u32_e32 v0, vcc, -16, v0
; VI-NEXT: v_add_u16_e32 v0, -8, v0
; VI-NEXT: buffer_store_byte v0, off, s[4:7], 0
; VI-NEXT: s_endpgm
;
; EG-LABEL: v_ctlz_i8:
; EG: ; %bb.0:
; EG-NEXT: ALU 0, @8, KC0[CB0:0-32], KC1[]
; EG-NEXT: TEX 0 @6
; EG-NEXT: ALU 15, @9, KC0[CB0:0-32], KC1[]
; EG-NEXT: MEM_RAT MSKOR T0.XW, T1.X
; EG-NEXT: CF_END
; EG-NEXT: PAD
; EG-NEXT: Fetch clause starting at 6:
; EG-NEXT: VTX_READ_8 T0.X, T0.X, 0, #1
; EG-NEXT: ALU clause starting at 8:
; EG-NEXT: MOV * T0.X, KC0[2].Z,
; EG-NEXT: ALU clause starting at 9:
; EG-NEXT: FFBH_UINT * T0.W, T0.X,
; EG-NEXT: CNDE_INT T0.W, T0.X, literal.x, PV.W,
; EG-NEXT: AND_INT * T1.W, KC0[2].Y, literal.y,
; EG-NEXT: 32(4.484155e-44), 3(4.203895e-45)
; EG-NEXT: ADD_INT * T0.W, PV.W, literal.x,
; EG-NEXT: -24(nan), 0(0.000000e+00)
; EG-NEXT: AND_INT T0.W, PV.W, literal.x,
; EG-NEXT: LSHL * T1.W, T1.W, literal.y,
; EG-NEXT: 255(3.573311e-43), 3(4.203895e-45)
; EG-NEXT: LSHL T0.X, PV.W, PS,
; EG-NEXT: LSHL * T0.W, literal.x, PS,
; EG-NEXT: 255(3.573311e-43), 0(0.000000e+00)
; EG-NEXT: MOV T0.Y, 0.0,
; EG-NEXT: MOV * T0.Z, 0.0,
; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x,
; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
  %val = load i8, i8 addrspace(1)* %valptr
  %ctlz = call i8 @llvm.ctlz.i8(i8 %val, i1 false) nounwind readnone
  store i8 %ctlz, i8 addrspace(1)* %out
  ret void
}

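; The i64 tests below check that a 64-bit ctlz is split across the two 32-bit
; halves: ffbh/flbit of the high word, ffbh/flbit of the low word plus 32
; selected when the high word is zero, and 64 selected when the whole input
; is zero.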
define amdgpu_kernel void @s_ctlz_i64(i64 addrspace(1)* noalias %out, [8 x i32], i64 %val) nounwind {
; SI-LABEL: s_ctlz_i64:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x13
; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9
; SI-NEXT: s_mov_b32 s7, 0xf000
; SI-NEXT: s_mov_b32 s6, -1
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_flbit_i32_b32 s0, s2
; SI-NEXT: s_flbit_i32_b32 s1, s3
; SI-NEXT: s_add_i32 s0, s0, 32
; SI-NEXT: s_or_b32 s2, s2, s3
; SI-NEXT: v_mov_b32_e32 v0, s1
; SI-NEXT: v_mov_b32_e32 v1, s0
; SI-NEXT: v_cmp_eq_u32_e64 vcc, s3, 0
; SI-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc
; SI-NEXT: v_cmp_ne_u32_e64 vcc, s2, 0
; SI-NEXT: v_cndmask_b32_e32 v0, 64, v0, vcc
; SI-NEXT: v_mov_b32_e32 v1, 0
; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; SI-NEXT: s_endpgm
;
; VI-LABEL: s_ctlz_i64:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x4c
; VI-NEXT: s_mov_b32 s7, 0xf000
; VI-NEXT: s_mov_b32 s6, -1
; VI-NEXT: v_mov_b32_e32 v1, 0
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_flbit_i32_b32 s2, s0
; VI-NEXT: s_add_i32 s2, s2, 32
; VI-NEXT: s_flbit_i32_b32 s3, s1
; VI-NEXT: s_cmp_eq_u32 s1, 0
; VI-NEXT: s_cselect_b32 s2, s2, s3
; VI-NEXT: s_or_b32 s0, s0, s1
; VI-NEXT: s_cmp_lg_u32 s0, 0
; VI-NEXT: s_cselect_b32 s0, s2, 64
; VI-NEXT: v_mov_b32_e32 v0, s0
; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; VI-NEXT: s_endpgm
;
; EG-LABEL: s_ctlz_i64:
; EG: ; %bb.0:
; EG-NEXT: ALU 9, @4, KC0[CB0:0-32], KC1[]
; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1
; EG-NEXT: CF_END
; EG-NEXT: PAD
; EG-NEXT: ALU clause starting at 4:
; EG-NEXT: FFBH_UINT * T0.W, KC0[4].W,
; EG-NEXT: CNDE_INT * T0.W, KC0[4].W, literal.x, PV.W,
; EG-NEXT: 32(4.484155e-44), 0(0.000000e+00)
; EG-NEXT: FFBH_UINT T1.W, KC0[5].X,
; EG-NEXT: ADD_INT * T0.W, PV.W, literal.x,
; EG-NEXT: 32(4.484155e-44), 0(0.000000e+00)
; EG-NEXT: CNDE_INT T0.X, KC0[5].X, PS, PV.W,
; EG-NEXT: MOV T0.Y, 0.0,
; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x,
; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
  %ctlz = call i64 @llvm.ctlz.i64(i64 %val, i1 false)
  store i64 %ctlz, i64 addrspace(1)* %out
  ret void
}

define amdgpu_kernel void @s_ctlz_i64_trunc(i32 addrspace(1)* noalias %out, i64 %val) nounwind {
; SI-LABEL: s_ctlz_i64_trunc:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xb
; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9
; SI-NEXT: s_mov_b32 s7, 0xf000
; SI-NEXT: s_mov_b32 s6, -1
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_flbit_i32_b32 s0, s2
; SI-NEXT: s_flbit_i32_b32 s1, s3
; SI-NEXT: s_add_i32 s0, s0, 32
; SI-NEXT: s_or_b32 s2, s2, s3
; SI-NEXT: v_mov_b32_e32 v0, s1
; SI-NEXT: v_mov_b32_e32 v1, s0
; SI-NEXT: v_cmp_eq_u32_e64 vcc, s3, 0
; SI-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc
; SI-NEXT: v_cmp_ne_u32_e64 vcc, s2, 0
; SI-NEXT: v_cndmask_b32_e32 v0, 64, v0, vcc
; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0
; SI-NEXT: s_endpgm
;
; VI-LABEL: s_ctlz_i64_trunc:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2c
; VI-NEXT: s_mov_b32 s7, 0xf000
; VI-NEXT: s_mov_b32 s6, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_flbit_i32_b32 s2, s0
; VI-NEXT: s_add_i32 s2, s2, 32
; VI-NEXT: s_flbit_i32_b32 s3, s1
; VI-NEXT: s_cmp_eq_u32 s1, 0
; VI-NEXT: s_cselect_b32 s2, s2, s3
; VI-NEXT: s_or_b32 s0, s0, s1
; VI-NEXT: s_cmp_lg_u32 s0, 0
; VI-NEXT: s_cselect_b32 s0, s2, 64
; VI-NEXT: v_mov_b32_e32 v0, s0
; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
; VI-NEXT: s_endpgm
;
; EG-LABEL: s_ctlz_i64_trunc:
; EG: ; %bb.0:
; EG-NEXT: ALU 8, @4, KC0[CB0:0-32], KC1[]
; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1
; EG-NEXT: CF_END
; EG-NEXT: PAD
; EG-NEXT: ALU clause starting at 4:
; EG-NEXT: FFBH_UINT * T0.W, KC0[2].W,
; EG-NEXT: CNDE_INT * T0.W, KC0[2].W, literal.x, PV.W,
; EG-NEXT: 32(4.484155e-44), 0(0.000000e+00)
; EG-NEXT: FFBH_UINT T1.W, KC0[3].X,
; EG-NEXT: ADD_INT * T0.W, PV.W, literal.x,
; EG-NEXT: 32(4.484155e-44), 0(0.000000e+00)
; EG-NEXT: CNDE_INT T0.X, KC0[3].X, PS, PV.W,
; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x,
; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
  %ctlz = call i64 @llvm.ctlz.i64(i64 %val, i1 false)
  %trunc = trunc i64 %ctlz to i32
  store i32 %trunc, i32 addrspace(1)* %out
  ret void
}

define amdgpu_kernel void @v_ctlz_i64(i64 addrspace(1)* noalias %out, i64 addrspace(1)* noalias %in) nounwind {
; SI-LABEL: v_ctlz_i64:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb
; SI-NEXT: s_mov_b32 s7, 0xf000
; SI-NEXT: s_mov_b32 s6, 0
; SI-NEXT: v_lshlrev_b32_e32 v0, 3, v0
; SI-NEXT: v_mov_b32_e32 v1, 0
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: buffer_load_dwordx2 v[2:3], v[0:1], s[4:7], 0 addr64
; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_ffbh_u32_e32 v4, v2
; SI-NEXT: v_ffbh_u32_e32 v5, v3
; SI-NEXT: v_or_b32_e32 v2, v2, v3
; SI-NEXT: v_add_i32_e32 v4, vcc, 32, v4
; SI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3
; SI-NEXT: v_cndmask_b32_e32 v3, v5, v4, vcc
; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2
; SI-NEXT: v_cndmask_b32_e32 v2, 64, v3, vcc
; SI-NEXT: v_mov_b32_e32 v3, v1
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: buffer_store_dwordx2 v[2:3], v[0:1], s[4:7], 0 addr64
; SI-NEXT: s_endpgm
;
; VI-LABEL: v_ctlz_i64:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2c
; VI-NEXT: v_lshlrev_b32_e32 v3, 3, v0
; VI-NEXT: v_mov_b32_e32 v4, 0
; VI-NEXT: v_mov_b32_e32 v2, 0
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v5, s3
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v3
; VI-NEXT: v_addc_u32_e32 v1, vcc, v1, v4, vcc
; VI-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
; VI-NEXT: v_add_u32_e32 v3, vcc, s2, v3
; VI-NEXT: v_addc_u32_e32 v4, vcc, v5, v4, vcc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_ffbh_u32_e32 v5, v0
; VI-NEXT: v_add_u32_e32 v5, vcc, 32, v5
; VI-NEXT: v_ffbh_u32_e32 v6, v1
; VI-NEXT: v_or_b32_e32 v0, v0, v1
; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1
; VI-NEXT: v_cndmask_b32_e32 v1, v6, v5, vcc
; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0
; VI-NEXT: v_cndmask_b32_e32 v1, 64, v1, vcc
; VI-NEXT: flat_store_dwordx2 v[3:4], v[1:2]
; VI-NEXT: s_endpgm
;
; EG-LABEL: v_ctlz_i64:
; EG: ; %bb.0:
; EG-NEXT: ALU 2, @8, KC0[CB0:0-32], KC1[]
; EG-NEXT: TEX 0 @6
; EG-NEXT: ALU 10, @11, KC0[CB0:0-32], KC1[]
; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1
; EG-NEXT: CF_END
; EG-NEXT: PAD
; EG-NEXT: Fetch clause starting at 6:
; EG-NEXT: VTX_READ_64 T0.XY, T0.X, 0, #1
; EG-NEXT: ALU clause starting at 8:
; EG-NEXT: LSHL * T0.W, T0.X, literal.x,
; EG-NEXT: 3(4.203895e-45), 0(0.000000e+00)
; EG-NEXT: ADD_INT * T0.X, KC0[2].Z, PV.W,
; EG-NEXT: ALU clause starting at 11:
; EG-NEXT: FFBH_UINT * T1.W, T0.X,
; EG-NEXT: CNDE_INT * T1.W, T0.X, literal.x, PV.W,
; EG-NEXT: 32(4.484155e-44), 0(0.000000e+00)
; EG-NEXT: FFBH_UINT T2.W, T0.Y,
; EG-NEXT: ADD_INT * T1.W, PV.W, literal.x,
; EG-NEXT: 32(4.484155e-44), 0(0.000000e+00)
; EG-NEXT: CNDE_INT T0.X, T0.Y, PS, PV.W,
; EG-NEXT: MOV T0.Y, 0.0,
; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, T0.W,
; EG-NEXT: LSHR * T1.X, PV.W, literal.x,
; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %in.gep = getelementptr i64, i64 addrspace(1)* %in, i32 %tid
  %out.gep = getelementptr i64, i64 addrspace(1)* %out, i32 %tid
  %val = load i64, i64 addrspace(1)* %in.gep
  %ctlz = call i64 @llvm.ctlz.i64(i64 %val, i1 false)
  store i64 %ctlz, i64 addrspace(1)* %out.gep
  ret void
}

define amdgpu_kernel void @v_ctlz_i64_trunc(i32 addrspace(1)* noalias %out, i64 addrspace(1)* noalias %in) nounwind {
; SI-LABEL: v_ctlz_i64_trunc:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb
; SI-NEXT: s_mov_b32 s7, 0xf000
; SI-NEXT: s_mov_b32 s6, 0
; SI-NEXT: v_lshlrev_b32_e32 v1, 3, v0
; SI-NEXT: v_mov_b32_e32 v2, 0
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: buffer_load_dwordx2 v[3:4], v[1:2], s[4:7], 0 addr64
; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9
; SI-NEXT: v_lshlrev_b32_e32 v1, 2, v0
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_ffbh_u32_e32 v0, v3
; SI-NEXT: v_ffbh_u32_e32 v5, v4
; SI-NEXT: v_or_b32_e32 v3, v3, v4
; SI-NEXT: v_add_i32_e32 v0, vcc, 32, v0
; SI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v4
; SI-NEXT: v_cndmask_b32_e32 v0, v5, v0, vcc
; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v3
; SI-NEXT: v_cndmask_b32_e32 v0, 64, v0, vcc
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: buffer_store_dword v0, v[1:2], s[4:7], 0 addr64
; SI-NEXT: s_endpgm
;
; VI-LABEL: v_ctlz_i64_trunc:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2c
; VI-NEXT: v_lshlrev_b32_e32 v1, 3, v0
; VI-NEXT: v_mov_b32_e32 v4, 0
; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v5, s3
; VI-NEXT: v_mov_b32_e32 v2, s1
; VI-NEXT: v_add_u32_e32 v1, vcc, s0, v1
; VI-NEXT: v_addc_u32_e32 v2, vcc, v2, v4, vcc
; VI-NEXT: flat_load_dwordx2 v[1:2], v[1:2]
; VI-NEXT: v_add_u32_e32 v3, vcc, s2, v0
; VI-NEXT: v_addc_u32_e32 v4, vcc, v5, v4, vcc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_ffbh_u32_e32 v0, v1
; VI-NEXT: v_add_u32_e32 v0, vcc, 32, v0
; VI-NEXT: v_ffbh_u32_e32 v5, v2
; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2
; VI-NEXT: v_or_b32_e32 v1, v1, v2
; VI-NEXT: v_cndmask_b32_e32 v0, v5, v0, vcc
; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1
; VI-NEXT: v_cndmask_b32_e32 v0, 64, v0, vcc
; VI-NEXT: flat_store_dword v[3:4], v0
; VI-NEXT: s_endpgm
;
; EG-LABEL: v_ctlz_i64_trunc:
; EG: ; %bb.0:
; EG-NEXT: ALU 2, @8, KC0[CB0:0-32], KC1[]
; EG-NEXT: TEX 0 @6
; EG-NEXT: ALU 10, @11, KC0[CB0:0-32], KC1[]
; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1
; EG-NEXT: CF_END
; EG-NEXT: PAD
; EG-NEXT: Fetch clause starting at 6:
; EG-NEXT: VTX_READ_64 T1.XY, T1.X, 0, #1
; EG-NEXT: ALU clause starting at 8:
; EG-NEXT: LSHL * T0.W, T0.X, literal.x,
; EG-NEXT: 3(4.203895e-45), 0(0.000000e+00)
; EG-NEXT: ADD_INT * T1.X, KC0[2].Z, PV.W,
; EG-NEXT: ALU clause starting at 11:
; EG-NEXT: FFBH_UINT * T0.W, T1.X,
; EG-NEXT: CNDE_INT * T0.W, T1.X, literal.x, PV.W,
; EG-NEXT: 32(4.484155e-44), 0(0.000000e+00)
; EG-NEXT: LSHL T0.Z, T0.X, literal.x,
; EG-NEXT: FFBH_UINT T1.W, T1.Y,
; EG-NEXT: ADD_INT * T0.W, PV.W, literal.y,
; EG-NEXT: 2(2.802597e-45), 32(4.484155e-44)
; EG-NEXT: CNDE_INT T0.X, T1.Y, PS, PV.W,
; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, PV.Z,
; EG-NEXT: LSHR * T1.X, PV.W, literal.x,
; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %in.gep = getelementptr i64, i64 addrspace(1)* %in, i32 %tid
  %out.gep = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
  %val = load i64, i64 addrspace(1)* %in.gep
  %ctlz = call i64 @llvm.ctlz.i64(i64 %val, i1 false)
  %trunc = trunc i64 %ctlz to i32
  store i32 %trunc, i32 addrspace(1)* %out.gep
  ret void
}

define amdgpu_kernel void @v_ctlz_i32_sel_eq_neg1(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %valptr) nounwind {
; SI-LABEL: v_ctlz_i32_sel_eq_neg1:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb
; SI-NEXT: s_mov_b32 s3, 0xf000
; SI-NEXT: s_mov_b32 s6, 0
; SI-NEXT: s_mov_b32 s7, s3
; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; SI-NEXT: v_mov_b32_e32 v1, 0
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: buffer_load_dword v0, v[0:1], s[4:7], 0 addr64
; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
; SI-NEXT: s_mov_b32 s2, -1
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_ffbh_u32_e32 v0, v0
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0
; SI-NEXT: s_endpgm
;
; VI-LABEL: v_ctlz_i32_sel_eq_neg1:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2c
; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; VI-NEXT: s_mov_b32 s7, 0xf000
; VI-NEXT: s_mov_b32 s6, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v0
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: flat_load_dword v0, v[0:1]
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_ffbh_u32_e32 v0, v0
; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
; VI-NEXT: s_endpgm
;
; EG-LABEL: v_ctlz_i32_sel_eq_neg1:
; EG: ; %bb.0:
; EG-NEXT: ALU 2, @8, KC0[CB0:0-32], KC1[]
; EG-NEXT: TEX 0 @6
; EG-NEXT: ALU 5, @11, KC0[CB0:0-32], KC1[]
; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1
; EG-NEXT: CF_END
; EG-NEXT: PAD
; EG-NEXT: Fetch clause starting at 6:
; EG-NEXT: VTX_READ_32 T0.X, T0.X, 0, #1
; EG-NEXT: ALU clause starting at 8:
; EG-NEXT: LSHL * T0.W, T0.X, literal.x,
; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
; EG-NEXT: ADD_INT * T0.X, KC0[2].Z, PV.W,
; EG-NEXT: ALU clause starting at 11:
; EG-NEXT: FFBH_UINT * T0.W, T0.X,
; EG-NEXT: CNDE_INT * T0.W, T0.X, literal.x, PV.W,
; EG-NEXT: 32(4.484155e-44), 0(0.000000e+00)
; EG-NEXT: CNDE_INT T0.X, T0.X, literal.x, PV.W,
; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.y,
; EG-NEXT: -1(nan), 2(2.802597e-45)
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %in.gep = getelementptr i32, i32 addrspace(1)* %valptr, i32 %tid
  %val = load i32, i32 addrspace(1)* %in.gep
  %ctlz = call i32 @llvm.ctlz.i32(i32 %val, i1 false) nounwind readnone
  %cmp = icmp eq i32 %val, 0
  %sel = select i1 %cmp, i32 -1, i32 %ctlz
  store i32 %sel, i32 addrspace(1)* %out
  ret void
}

define amdgpu_kernel void @v_ctlz_i32_sel_ne_neg1(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %valptr) nounwind {
; SI-LABEL: v_ctlz_i32_sel_ne_neg1:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb
; SI-NEXT: s_mov_b32 s3, 0xf000
; SI-NEXT: s_mov_b32 s6, 0
; SI-NEXT: s_mov_b32 s7, s3
; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; SI-NEXT: v_mov_b32_e32 v1, 0
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: buffer_load_dword v0, v[0:1], s[4:7], 0 addr64
; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
; SI-NEXT: s_mov_b32 s2, -1
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_ffbh_u32_e32 v0, v0
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0
; SI-NEXT: s_endpgm
;
; VI-LABEL: v_ctlz_i32_sel_ne_neg1:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2c
; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; VI-NEXT: s_mov_b32 s7, 0xf000
; VI-NEXT: s_mov_b32 s6, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v0
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: flat_load_dword v0, v[0:1]
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_ffbh_u32_e32 v0, v0
; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
; VI-NEXT: s_endpgm
;
; EG-LABEL: v_ctlz_i32_sel_ne_neg1:
; EG: ; %bb.0:
; EG-NEXT: ALU 2, @8, KC0[CB0:0-32], KC1[]
; EG-NEXT: TEX 0 @6
; EG-NEXT: ALU 5, @11, KC0[CB0:0-32], KC1[]
; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1
; EG-NEXT: CF_END
; EG-NEXT: PAD
; EG-NEXT: Fetch clause starting at 6:
; EG-NEXT: VTX_READ_32 T0.X, T0.X, 0, #1
; EG-NEXT: ALU clause starting at 8:
; EG-NEXT: LSHL * T0.W, T0.X, literal.x,
; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
; EG-NEXT: ADD_INT * T0.X, KC0[2].Z, PV.W,
; EG-NEXT: ALU clause starting at 11:
; EG-NEXT: FFBH_UINT * T0.W, T0.X,
; EG-NEXT: CNDE_INT * T0.W, T0.X, literal.x, PV.W,
; EG-NEXT: 32(4.484155e-44), 0(0.000000e+00)
; EG-NEXT: CNDE_INT T0.X, T0.X, literal.x, PV.W,
; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.y,
; EG-NEXT: -1(nan), 2(2.802597e-45)
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %in.gep = getelementptr i32, i32 addrspace(1)* %valptr, i32 %tid
  %val = load i32, i32 addrspace(1)* %in.gep
  %ctlz = call i32 @llvm.ctlz.i32(i32 %val, i1 false) nounwind readnone
  %cmp = icmp ne i32 %val, 0
  %sel = select i1 %cmp, i32 %ctlz, i32 -1
  store i32 %sel, i32 addrspace(1)* %out
  ret void
}

; TODO: Should be able to eliminate select here as well.
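; Comparing the ctlz result against the bit width is equivalent to testing the
; source for zero, so in principle these could also fold to a bare ffbh (which
; returns -1 for a zero input, as the *_sel_*_neg1 tests above demonstrate);
; the checks below still contain the extra compare and cndmask/CNDE.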
define amdgpu_kernel void @v_ctlz_i32_sel_eq_bitwidth(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %valptr) nounwind {
; SI-LABEL: v_ctlz_i32_sel_eq_bitwidth:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb
; SI-NEXT: s_mov_b32 s3, 0xf000
; SI-NEXT: s_mov_b32 s6, 0
; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; SI-NEXT: v_mov_b32_e32 v1, 0
; SI-NEXT: s_mov_b32 s7, s3
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: buffer_load_dword v0, v[0:1], s[4:7], 0 addr64
; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
; SI-NEXT: s_mov_b32 s2, -1
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_ffbh_u32_e32 v1, v0
; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0
; SI-NEXT: v_cndmask_b32_e32 v0, 32, v1, vcc
; SI-NEXT: v_cmp_ne_u32_e32 vcc, 32, v0
; SI-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0
; SI-NEXT: s_endpgm
;
; VI-LABEL: v_ctlz_i32_sel_eq_bitwidth:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2c
; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; VI-NEXT: s_mov_b32 s7, 0xf000
; VI-NEXT: s_mov_b32 s6, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v0
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: flat_load_dword v0, v[0:1]
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_ffbh_u32_e32 v1, v0
; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0
; VI-NEXT: v_cndmask_b32_e32 v0, 32, v1, vcc
; VI-NEXT: v_cmp_ne_u32_e32 vcc, 32, v0
; VI-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc
; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
; VI-NEXT: s_endpgm
;
; EG-LABEL: v_ctlz_i32_sel_eq_bitwidth:
; EG: ; %bb.0:
; EG-NEXT: ALU 2, @8, KC0[CB0:0-32], KC1[]
; EG-NEXT: TEX 0 @6
; EG-NEXT: ALU 7, @11, KC0[CB0:0-32], KC1[]
; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1
; EG-NEXT: CF_END
; EG-NEXT: PAD
; EG-NEXT: Fetch clause starting at 6:
; EG-NEXT: VTX_READ_32 T0.X, T0.X, 0, #1
; EG-NEXT: ALU clause starting at 8:
; EG-NEXT: LSHL * T0.W, T0.X, literal.x,
; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
; EG-NEXT: ADD_INT * T0.X, KC0[2].Z, PV.W,
; EG-NEXT: ALU clause starting at 11:
; EG-NEXT: FFBH_UINT * T0.W, T0.X,
; EG-NEXT: CNDE_INT * T0.W, T0.X, literal.x, PV.W,
; EG-NEXT: 32(4.484155e-44), 0(0.000000e+00)
; EG-NEXT: SETE_INT * T1.W, PV.W, literal.x,
; EG-NEXT: 32(4.484155e-44), 0(0.000000e+00)
; EG-NEXT: CNDE_INT T0.X, PV.W, T0.W, literal.x,
; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.y,
; EG-NEXT: -1(nan), 2(2.802597e-45)
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %in.gep = getelementptr i32, i32 addrspace(1)* %valptr, i32 %tid
  %val = load i32, i32 addrspace(1)* %in.gep
  %ctlz = call i32 @llvm.ctlz.i32(i32 %val, i1 false) nounwind readnone
  %cmp = icmp eq i32 %ctlz, 32
  %sel = select i1 %cmp, i32 -1, i32 %ctlz
  store i32 %sel, i32 addrspace(1)* %out
  ret void
}

define amdgpu_kernel void @v_ctlz_i32_sel_ne_bitwidth(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %valptr) nounwind {
; SI-LABEL: v_ctlz_i32_sel_ne_bitwidth:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb
; SI-NEXT: s_mov_b32 s3, 0xf000
; SI-NEXT: s_mov_b32 s6, 0
; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; SI-NEXT: v_mov_b32_e32 v1, 0
; SI-NEXT: s_mov_b32 s7, s3
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: buffer_load_dword v0, v[0:1], s[4:7], 0 addr64
; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
; SI-NEXT: s_mov_b32 s2, -1
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_ffbh_u32_e32 v1, v0
; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0
; SI-NEXT: v_cndmask_b32_e32 v0, 32, v1, vcc
; SI-NEXT: v_cmp_ne_u32_e32 vcc, 32, v0
; SI-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0
; SI-NEXT: s_endpgm
;
; VI-LABEL: v_ctlz_i32_sel_ne_bitwidth:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2c
; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; VI-NEXT: s_mov_b32 s7, 0xf000
; VI-NEXT: s_mov_b32 s6, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v0
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: flat_load_dword v0, v[0:1]
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_ffbh_u32_e32 v1, v0
; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0
; VI-NEXT: v_cndmask_b32_e32 v0, 32, v1, vcc
; VI-NEXT: v_cmp_ne_u32_e32 vcc, 32, v0
; VI-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc
; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
; VI-NEXT: s_endpgm
;
; EG-LABEL: v_ctlz_i32_sel_ne_bitwidth:
; EG: ; %bb.0:
; EG-NEXT: ALU 2, @8, KC0[CB0:0-32], KC1[]
; EG-NEXT: TEX 0 @6
; EG-NEXT: ALU 7, @11, KC0[CB0:0-32], KC1[]
; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1
; EG-NEXT: CF_END
; EG-NEXT: PAD
; EG-NEXT: Fetch clause starting at 6:
; EG-NEXT: VTX_READ_32 T0.X, T0.X, 0, #1
; EG-NEXT: ALU clause starting at 8:
; EG-NEXT: LSHL * T0.W, T0.X, literal.x,
; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
; EG-NEXT: ADD_INT * T0.X, KC0[2].Z, PV.W,
; EG-NEXT: ALU clause starting at 11:
; EG-NEXT: FFBH_UINT * T0.W, T0.X,
; EG-NEXT: CNDE_INT * T0.W, T0.X, literal.x, PV.W,
; EG-NEXT: 32(4.484155e-44), 0(0.000000e+00)
; EG-NEXT: SETNE_INT * T1.W, PV.W, literal.x,
; EG-NEXT: 32(4.484155e-44), 0(0.000000e+00)
; EG-NEXT: CNDE_INT T0.X, PV.W, literal.x, T0.W,
; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.y,
; EG-NEXT: -1(nan), 2(2.802597e-45)
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %in.gep = getelementptr i32, i32 addrspace(1)* %valptr, i32 %tid
  %val = load i32, i32 addrspace(1)* %in.gep
  %ctlz = call i32 @llvm.ctlz.i32(i32 %val, i1 false) nounwind readnone
  %cmp = icmp ne i32 %ctlz, 32
  %sel = select i1 %cmp, i32 %ctlz, i32 -1
  store i32 %sel, i32 addrspace(1)* %out
  ret void
}

define amdgpu_kernel void @v_ctlz_i8_sel_eq_neg1(i8 addrspace(1)* noalias %out, i8 addrspace(1)* noalias %valptr) nounwind {
; SI-LABEL: v_ctlz_i8_sel_eq_neg1:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb
; SI-NEXT: s_mov_b32 s3, 0xf000
; SI-NEXT: v_mov_b32_e32 v1, 0
; SI-NEXT: s_mov_b32 s6, 0
; SI-NEXT: s_mov_b32 s7, s3
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: buffer_load_ubyte v0, v[0:1], s[4:7], 0 addr64
; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
; SI-NEXT: s_mov_b32 s2, -1
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_ffbh_u32_e32 v0, v0
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: buffer_store_byte v0, off, s[0:3], 0
; SI-NEXT: s_endpgm
;
; VI-LABEL: v_ctlz_i8_sel_eq_neg1:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2c
; VI-NEXT: s_mov_b32 s7, 0xf000
; VI-NEXT: s_mov_b32 s6, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v0
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: flat_load_ubyte v0, v[0:1]
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_ffbh_u32_e32 v0, v0
; VI-NEXT: buffer_store_byte v0, off, s[4:7], 0
; VI-NEXT: s_endpgm
;
; EG-LABEL: v_ctlz_i8_sel_eq_neg1:
; EG: ; %bb.0:
; EG-NEXT: ALU 0, @8, KC0[CB0:0-32], KC1[]
; EG-NEXT: TEX 0 @6
; EG-NEXT: ALU 12, @9, KC0[CB0:0-32], KC1[]
; EG-NEXT: MEM_RAT MSKOR T0.XW, T1.X
; EG-NEXT: CF_END
; EG-NEXT: PAD
; EG-NEXT: Fetch clause starting at 6:
; EG-NEXT: VTX_READ_8 T0.X, T0.X, 0, #1
; EG-NEXT: ALU clause starting at 8:
; EG-NEXT: ADD_INT * T0.X, KC0[2].Z, T0.X,
; EG-NEXT: ALU clause starting at 9:
; EG-NEXT: FFBH_UINT T0.W, T0.X,
; EG-NEXT: AND_INT * T1.W, KC0[2].Y, literal.x,
; EG-NEXT: 3(4.203895e-45), 0(0.000000e+00)
; EG-NEXT: AND_INT T0.W, PV.W, literal.x,
; EG-NEXT: LSHL * T1.W, PS, literal.y,
; EG-NEXT: 255(3.573311e-43), 3(4.203895e-45)
; EG-NEXT: LSHL T0.X, PV.W, PS,
; EG-NEXT: LSHL * T0.W, literal.x, PS,
; EG-NEXT: 255(3.573311e-43), 0(0.000000e+00)
; EG-NEXT: MOV T0.Y, 0.0,
; EG-NEXT: MOV * T0.Z, 0.0,
; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x,
; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %valptr.gep = getelementptr i8, i8 addrspace(1)* %valptr, i32 %tid
  %val = load i8, i8 addrspace(1)* %valptr.gep
  %ctlz = call i8 @llvm.ctlz.i8(i8 %val, i1 false) nounwind readnone
  %cmp = icmp eq i8 %val, 0
  %sel = select i1 %cmp, i8 -1, i8 %ctlz
  store i8 %sel, i8 addrspace(1)* %out
  ret void
}

define amdgpu_kernel void @v_ctlz_i16_sel_eq_neg1(i16 addrspace(1)* noalias %out, i16 addrspace(1)* noalias %valptr) nounwind {
; SI-LABEL: v_ctlz_i16_sel_eq_neg1:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb
; SI-NEXT: s_mov_b32 s3, 0xf000
; SI-NEXT: s_mov_b32 s2, -1
; SI-NEXT: s_mov_b32 s6, s2
; SI-NEXT: s_mov_b32 s7, s3
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: buffer_load_ushort v0, off, s[4:7], 0
; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_ffbh_u32_e32 v0, v0
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: buffer_store_short v0, off, s[0:3], 0
; SI-NEXT: s_endpgm
;
; VI-LABEL: v_ctlz_i16_sel_eq_neg1:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2c
; VI-NEXT: s_mov_b32 s7, 0xf000
; VI-NEXT: s_mov_b32 s6, -1
; VI-NEXT: s_mov_b32 s2, s6
; VI-NEXT: s_mov_b32 s3, s7
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: buffer_load_ushort v0, off, s[0:3], 0
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_ffbh_u32_e32 v1, v0
; VI-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v0
; VI-NEXT: v_cndmask_b32_e64 v0, 32, v1, s[0:1]
; VI-NEXT: v_add_u32_e32 v0, vcc, -16, v0
; VI-NEXT: v_mov_b32_e32 v1, 0xffff
; VI-NEXT: v_cndmask_b32_e64 v0, v1, v0, s[0:1]
; VI-NEXT: buffer_store_short v0, off, s[4:7], 0
; VI-NEXT: s_endpgm
;
; EG-LABEL: v_ctlz_i16_sel_eq_neg1:
; EG: ; %bb.0:
; EG-NEXT: ALU 0, @8, KC0[CB0:0-32], KC1[]
; EG-NEXT: TEX 0 @6
; EG-NEXT: ALU 12, @9, KC0[CB0:0-32], KC1[]
; EG-NEXT: MEM_RAT MSKOR T0.XW, T1.X
; EG-NEXT: CF_END
; EG-NEXT: PAD
; EG-NEXT: Fetch clause starting at 6:
; EG-NEXT: VTX_READ_16 T0.X, T0.X, 0, #1
; EG-NEXT: ALU clause starting at 8:
; EG-NEXT: MOV * T0.X, KC0[2].Z,
; EG-NEXT: ALU clause starting at 9:
; EG-NEXT: FFBH_UINT T0.W, T0.X,
; EG-NEXT: AND_INT * T1.W, KC0[2].Y, literal.x,
; EG-NEXT: 3(4.203895e-45), 0(0.000000e+00)
; EG-NEXT: AND_INT T0.W, PV.W, literal.x,
; EG-NEXT: LSHL * T1.W, PS, literal.y,
; EG-NEXT: 65535(9.183409e-41), 3(4.203895e-45)
; EG-NEXT: LSHL T0.X, PV.W, PS,
; EG-NEXT: LSHL * T0.W, literal.x, PS,
; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00)
; EG-NEXT: MOV T0.Y, 0.0,
; EG-NEXT: MOV * T0.Z, 0.0,
; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x,
; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
  %val = load i16, i16 addrspace(1)* %valptr
  %ctlz = call i16 @llvm.ctlz.i16(i16 %val, i1 false) nounwind readnone
  %cmp = icmp eq i16 %val, 0
  %sel = select i1 %cmp, i16 -1, i16 %ctlz
  store i16 %sel, i16 addrspace(1)* %out
  ret void
}

; FIXME: Need to handle non-uniform case for function below (load without gep).
define amdgpu_kernel void @v_ctlz_i7_sel_eq_neg1(i7 addrspace(1)* noalias %out, i7 addrspace(1)* noalias %valptr) nounwind {
; SI-LABEL: v_ctlz_i7_sel_eq_neg1:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb
; SI-NEXT: s_mov_b32 s3, 0xf000
; SI-NEXT: v_mov_b32_e32 v1, 0
; SI-NEXT: s_mov_b32 s6, 0
; SI-NEXT: s_mov_b32 s7, s3
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: buffer_load_ubyte v0, v[0:1], s[4:7], 0 addr64
; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
; SI-NEXT: s_mov_b32 s2, -1
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_ffbh_u32_e32 v0, v0
; SI-NEXT: v_and_b32_e32 v0, 0x7f, v0
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: buffer_store_byte v0, off, s[0:3], 0
; SI-NEXT: s_endpgm
;
; VI-LABEL: v_ctlz_i7_sel_eq_neg1:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2c
; VI-NEXT: s_mov_b32 s7, 0xf000
; VI-NEXT: s_mov_b32 s6, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v0
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: flat_load_ubyte v0, v[0:1]
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_ffbh_u32_e32 v0, v0
; VI-NEXT: v_and_b32_e32 v0, 0x7f, v0
; VI-NEXT: buffer_store_byte v0, off, s[4:7], 0
; VI-NEXT: s_endpgm
;
; EG-LABEL: v_ctlz_i7_sel_eq_neg1:
; EG: ; %bb.0:
; EG-NEXT: ALU 0, @8, KC0[CB0:0-32], KC1[]
; EG-NEXT: TEX 0 @6
; EG-NEXT: ALU 12, @9, KC0[CB0:0-32], KC1[]
; EG-NEXT: MEM_RAT MSKOR T0.XW, T1.X
; EG-NEXT: CF_END
; EG-NEXT: PAD
; EG-NEXT: Fetch clause starting at 6:
; EG-NEXT: VTX_READ_8 T0.X, T0.X, 0, #1
; EG-NEXT: ALU clause starting at 8:
; EG-NEXT: ADD_INT * T0.X, KC0[2].Z, T0.X,
; EG-NEXT: ALU clause starting at 9:
; EG-NEXT: FFBH_UINT T0.W, T0.X,
; EG-NEXT: AND_INT * T1.W, KC0[2].Y, literal.x,
; EG-NEXT: 3(4.203895e-45), 0(0.000000e+00)
; EG-NEXT: AND_INT T0.W, PV.W, literal.x,
; EG-NEXT: LSHL * T1.W, PS, literal.y,
; EG-NEXT: 127(1.779649e-43), 3(4.203895e-45)
; EG-NEXT: LSHL T0.X, PV.W, PS,
; EG-NEXT: LSHL * T0.W, literal.x, PS,
; EG-NEXT: 255(3.573311e-43), 0(0.000000e+00)
; EG-NEXT: MOV T0.Y, 0.0,
; EG-NEXT: MOV * T0.Z, 0.0,
; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x,
; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %valptr.gep = getelementptr i7, i7 addrspace(1)* %valptr, i32 %tid
  %val = load i7, i7 addrspace(1)* %valptr.gep
  %ctlz = call i7 @llvm.ctlz.i7(i7 %val, i1 false) nounwind readnone
  %cmp = icmp eq i7 %val, 0
  %sel = select i1 %cmp, i7 -1, i7 %ctlz
  store i7 %sel, i7 addrspace(1)* %out
  ret void
}