; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=SI -check-prefix=SI-NOSDWA -check-prefix=FUNC %s
; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=SI -check-prefix=SI-SDWA -check-prefix=FUNC %s
; RUN: llc -march=r600 -mcpu=cypress -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=EG -check-prefix=FUNC %s

; Code generation tests for the llvm.cttz intrinsic.
;
; Three llc configurations are exercised by the commands above:
;   * amdgcn with the default CPU  -> prefixes SI, SI-NOSDWA and FUNC
;   * amdgcn with -mcpu=tonga      -> prefixes SI, SI-SDWA and FUNC
;   * r600 with -mcpu=cypress      -> prefixes EG and FUNC
;
; Most kernels call the intrinsic with its second operand set to true
; (result undefined for a zero input). The trailing *_sel_* kernels pass
; false and combine the result with a compare + select against zero.

declare i7 @llvm.cttz.i7(i7, i1) nounwind readnone
declare i8 @llvm.cttz.i8(i8, i1) nounwind readnone
declare i16 @llvm.cttz.i16(i16, i1) nounwind readnone
declare i32 @llvm.cttz.i32(i32, i1) nounwind readnone
declare i64 @llvm.cttz.i64(i64, i1) nounwind readnone
declare <2 x i32> @llvm.cttz.v2i32(<2 x i32>, i1) nounwind readnone
declare <4 x i32> @llvm.cttz.v4i32(<4 x i32>, i1) nounwind readnone
declare i32 @llvm.amdgcn.workitem.id.x() nounwind readnone

; i32 cttz of a kernel argument, stored to %out.
; FUNC-LABEL: {{^}}s_cttz_zero_undef_i32:
; SI: s_load_dword [[VAL:s[0-9]+]],
; SI: s_ff1_i32_b32 [[SRESULT:s[0-9]+]], [[VAL]]
; SI: v_mov_b32_e32 [[VRESULT:v[0-9]+]], [[SRESULT]]
; SI: buffer_store_dword [[VRESULT]],
; SI: s_endpgm
; EG: MEM_RAT_CACHELESS STORE_RAW [[RESULT:T[0-9]+\.[XYZW]]]
; EG: FFBL_INT {{\*? *}}[[RESULT]]
define amdgpu_kernel void @s_cttz_zero_undef_i32(i32 addrspace(1)* noalias %out, i32 %val) nounwind {
  %cttz = call i32 @llvm.cttz.i32(i32 %val, i1 true) nounwind readnone
  store i32 %cttz, i32 addrspace(1)* %out, align 4
  ret void
}

; i32 cttz of a value loaded from %valptr at the workitem-id index.
; FUNC-LABEL: {{^}}v_cttz_zero_undef_i32:
; SI: {{buffer|flat}}_load_dword [[VAL:v[0-9]+]],
; SI: v_ffbl_b32_e32 [[RESULT:v[0-9]+]], [[VAL]]
; SI: buffer_store_dword [[RESULT]],
; SI: s_endpgm
; EG: MEM_RAT_CACHELESS STORE_RAW [[RESULT:T[0-9]+\.[XYZW]]]
; EG: FFBL_INT {{\*? *}}[[RESULT]]
define amdgpu_kernel void @v_cttz_zero_undef_i32(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %valptr) nounwind {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %in.gep = getelementptr i32, i32 addrspace(1)* %valptr, i32 %tid
  %val = load i32, i32 addrspace(1)* %in.gep, align 4
  %cttz = call i32 @llvm.cttz.i32(i32 %val, i1 true) nounwind readnone
  store i32 %cttz, i32 addrspace(1)* %out, align 4
  ret void
}

; <2 x i32> cttz of a loaded vector; one ffbl is expected per element.
; FUNC-LABEL: {{^}}v_cttz_zero_undef_v2i32:
; SI: {{buffer|flat}}_load_dwordx2
; SI: v_ffbl_b32_e32
; SI: v_ffbl_b32_e32
; SI: buffer_store_dwordx2
; SI: s_endpgm
; EG: MEM_RAT_CACHELESS STORE_RAW [[RESULT:T[0-9]+]]{{\.[XYZW]}}
; EG: FFBL_INT {{\*? *}}[[RESULT]]
; EG: FFBL_INT {{\*? *}}[[RESULT]]
define amdgpu_kernel void @v_cttz_zero_undef_v2i32(<2 x i32> addrspace(1)* noalias %out, <2 x i32> addrspace(1)* noalias %valptr) nounwind {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %in.gep = getelementptr <2 x i32>, <2 x i32> addrspace(1)* %valptr, i32 %tid
  %val = load <2 x i32>, <2 x i32> addrspace(1)* %in.gep, align 8
  %cttz = call <2 x i32> @llvm.cttz.v2i32(<2 x i32> %val, i1 true) nounwind readnone
  store <2 x i32> %cttz, <2 x i32> addrspace(1)* %out, align 8
  ret void
}

; <4 x i32> cttz of a loaded vector; one ffbl is expected per element.
; FUNC-LABEL: {{^}}v_cttz_zero_undef_v4i32:
; SI: {{buffer|flat}}_load_dwordx4
; SI: v_ffbl_b32_e32
; SI: v_ffbl_b32_e32
; SI: v_ffbl_b32_e32
; SI: v_ffbl_b32_e32
; SI: buffer_store_dwordx4
; SI: s_endpgm
; EG: MEM_RAT_CACHELESS STORE_RAW [[RESULT:T[0-9]+]]{{\.[XYZW]}}
; EG: FFBL_INT {{\*? *}}[[RESULT]]
; EG: FFBL_INT {{\*? *}}[[RESULT]]
; EG: FFBL_INT {{\*? *}}[[RESULT]]
; EG: FFBL_INT {{\*? *}}[[RESULT]]
define amdgpu_kernel void @v_cttz_zero_undef_v4i32(<4 x i32> addrspace(1)* noalias %out, <4 x i32> addrspace(1)* noalias %valptr) nounwind {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %in.gep = getelementptr <4 x i32>, <4 x i32> addrspace(1)* %valptr, i32 %tid
  %val = load <4 x i32>, <4 x i32> addrspace(1)* %in.gep, align 16
  %cttz = call <4 x i32> @llvm.cttz.v4i32(<4 x i32> %val, i1 true) nounwind readnone
  store <4 x i32> %cttz, <4 x i32> addrspace(1)* %out, align 16
  ret void
}

; i8 cttz of a kernel argument, with a compare + select on the input.
; FUNC-LABEL: {{^}}s_cttz_zero_undef_i8_with_select:
; SI: s_ff1_i32_b32 s{{[0-9]+}}, s{{[0-9]+}}
; EG: MEM_RAT MSKOR
; EG: FFBL_INT
define amdgpu_kernel void @s_cttz_zero_undef_i8_with_select(i8 addrspace(1)* noalias %out, i8 %val) nounwind {
  %cttz = tail call i8 @llvm.cttz.i8(i8 %val, i1 true) nounwind readnone
  %cttz_ret = icmp ne i8 %val, 0
  %ret = select i1 %cttz_ret, i8 %cttz, i8 32
  ; NOTE(review): the store writes %cttz, so %ret (the select) is unused in
  ; this kernel, unlike the v_*_with_select kernels. Confirm whether that is
  ; intentional before changing it; it affects the matched output.
  store i8 %cttz, i8 addrspace(1)* %out, align 4
  ret void
}

; i16 cttz of a kernel argument, with a compare + select on the input.
; FUNC-LABEL: {{^}}s_cttz_zero_undef_i16_with_select:
; SI: s_ff1_i32_b32 s{{[0-9]+}}, s{{[0-9]+}}
; EG: MEM_RAT MSKOR
; EG: FFBL_INT
define amdgpu_kernel void @s_cttz_zero_undef_i16_with_select(i16 addrspace(1)* noalias %out, i16 %val) nounwind {
  %cttz = tail call i16 @llvm.cttz.i16(i16 %val, i1 true) nounwind readnone
  %cttz_ret = icmp ne i16 %val, 0
  %ret = select i1 %cttz_ret, i16 %cttz, i16 32
  ; NOTE(review): the store writes %cttz, so %ret (the select) is unused in
  ; this kernel. Confirm whether that is intentional before changing it.
  store i16 %cttz, i16 addrspace(1)* %out, align 4
  ret void
}

; i32 cttz of a kernel argument, with a compare + select on the input.
; FUNC-LABEL: {{^}}s_cttz_zero_undef_i32_with_select:
; SI: s_ff1_i32_b32
; EG: MEM_RAT_CACHELESS STORE_RAW [[RESULT:T[0-9]+\.[XYZW]]]
; EG: FFBL_INT {{\*? *}}[[RESULT]]
define amdgpu_kernel void @s_cttz_zero_undef_i32_with_select(i32 addrspace(1)* noalias %out, i32 %val) nounwind {
  %cttz = tail call i32 @llvm.cttz.i32(i32 %val, i1 true) nounwind readnone
  %cttz_ret = icmp ne i32 %val, 0
  %ret = select i1 %cttz_ret, i32 %cttz, i32 32
  ; NOTE(review): the store writes %cttz, so %ret (the select) is unused in
  ; this kernel. Confirm whether that is intentional before changing it.
  store i32 %cttz, i32 addrspace(1)* %out, align 4
  ret void
}

; i64 cttz of a kernel argument, with a compare + select on the input.
; FUNC-LABEL: {{^}}s_cttz_zero_undef_i64_with_select:
; SI: s_ff1_i32_b32 s{{[0-9]+}}, s{{[0-9]+}}
; SI: s_ff1_i32_b32 s{{[0-9]+}}, s{{[0-9]+}}
; EG: MEM_RAT_CACHELESS STORE_RAW [[RESULT:T[0-9]+\.[XYZW]]]
define amdgpu_kernel void @s_cttz_zero_undef_i64_with_select(i64 addrspace(1)* noalias %out, i64 %val) nounwind {
  %cttz = tail call i64 @llvm.cttz.i64(i64 %val, i1 true) nounwind readnone
  %cttz_ret = icmp ne i64 %val, 0
  %ret = select i1 %cttz_ret, i64 %cttz, i64 32
  ; NOTE(review): the store writes %cttz, so %ret (the select) is unused in
  ; this kernel. Confirm whether that is intentional before changing it.
  store i64 %cttz, i64 addrspace(1)* %out, align 4
  ret void
}

; i8 cttz of a loaded value; the select result (32 when the input is zero)
; is what gets stored.
; FUNC-LABEL: {{^}}v_cttz_zero_undef_i8_with_select:
; SI-NOSDWA: v_ffbl_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}
; SI-SDWA: v_ffbl_b32_e32
; EG: MEM_RAT MSKOR
define amdgpu_kernel void @v_cttz_zero_undef_i8_with_select(i8 addrspace(1)* noalias %out, i8 addrspace(1)* nocapture readonly %arrayidx) nounwind {
  %val = load i8, i8 addrspace(1)* %arrayidx, align 1
  %cttz = tail call i8 @llvm.cttz.i8(i8 %val, i1 true) nounwind readnone
  %cttz_ret = icmp ne i8 %val, 0
  %ret = select i1 %cttz_ret, i8 %cttz, i8 32
  store i8 %ret, i8 addrspace(1)* %out, align 4
  ret void
}

; i16 cttz of a value loaded with align 1; the select result is stored.
; FUNC-LABEL: {{^}}v_cttz_zero_undef_i16_with_select:
; SI-NOSDWA: v_ffbl_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}
; SI-SDWA: v_ffbl_b32_e32
; EG: MEM_RAT MSKOR
define amdgpu_kernel void @v_cttz_zero_undef_i16_with_select(i16 addrspace(1)* noalias %out, i16 addrspace(1)* nocapture readonly %arrayidx) nounwind {
  %val = load i16, i16 addrspace(1)* %arrayidx, align 1
  %cttz = tail call i16 @llvm.cttz.i16(i16 %val, i1 true) nounwind readnone
  %cttz_ret = icmp ne i16 %val, 0
  %ret = select i1 %cttz_ret, i16 %cttz, i16 32
  store i16 %ret, i16 addrspace(1)* %out, align 4
  ret void
}

; i32 cttz of a value loaded with align 1; the select result is stored.
; FUNC-LABEL: {{^}}v_cttz_zero_undef_i32_with_select:
; SI-DAG: v_ffbl_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}
; SI-DAG: v_cmp_ne_u32_e32 vcc, 0
; EG: MEM_RAT_CACHELESS STORE_RAW [[RESULT:T[0-9]+\.[XYZW]]]
define amdgpu_kernel void @v_cttz_zero_undef_i32_with_select(i32 addrspace(1)* noalias %out, i32 addrspace(1)* nocapture readonly %arrayidx) nounwind {
  %val = load i32, i32 addrspace(1)* %arrayidx, align 1
  %cttz = tail call i32 @llvm.cttz.i32(i32 %val, i1 true) nounwind readnone
  %cttz_ret = icmp ne i32 %val, 0
  %ret = select i1 %cttz_ret, i32 %cttz, i32 32
  store i32 %ret, i32 addrspace(1)* %out, align 4
  ret void
}

; i64 cttz of a value loaded with align 1; the select result is stored.
; The v_or sequences matched below precede the two ffbl instructions.
; FUNC-LABEL: {{^}}v_cttz_zero_undef_i64_with_select:
; SI-NOSDWA: v_or_b32_e32
; SI-NOSDWA: v_or_b32_e32
; SI-NOSDWA: v_or_b32_e32
; SI-NOSDWA: v_or_b32_e32
; SI-NOSDWA: v_or_b32_e32 [[VAL1:v[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}}
; SI-NOSDWA: v_or_b32_e32 [[VAL2:v[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}}
; SI-NOSDWA: v_ffbl_b32_e32 v{{[0-9]+}}, [[VAL1]]
; SI-NOSDWA: v_ffbl_b32_e32 v{{[0-9]+}}, [[VAL2]]
; SI-SDWA: v_or_b32_e32
; SI-SDWA: v_or_b32_sdwa
; SI-SDWA: v_or_b32_e32
; SI-SDWA: v_or_b32_sdwa
; SI-SDWA: v_or_b32_e32 [[VAL1:v[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}}
; SI-SDWA: v_or_b32_e32 [[VAL2:v[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}}
; SI-SDWA: v_ffbl_b32_e32 v{{[0-9]+}}, [[VAL1]]
; SI-SDWA: v_ffbl_b32_e32 v{{[0-9]+}}, [[VAL2]]
; SI: v_cmp_eq_u32_e32 vcc, 0
; SI: v_cmp_ne_u64_e32 vcc, 0
; EG: MEM_RAT_CACHELESS STORE_RAW [[RESULT:T[0-9]+\.[XYZW]]]
define amdgpu_kernel void @v_cttz_zero_undef_i64_with_select(i64 addrspace(1)* noalias %out, i64 addrspace(1)* nocapture readonly %arrayidx) nounwind {
  %val = load i64, i64 addrspace(1)* %arrayidx, align 1
  %cttz = tail call i64 @llvm.cttz.i64(i64 %val, i1 true) nounwind readnone
  %cttz_ret = icmp ne i64 %val, 0
  %ret = select i1 %cttz_ret, i64 %cttz, i64 32
  store i64 %ret, i64 addrspace(1)* %out, align 4
  ret void
}

; Zero-defined i32 cttz; select yields -1 when the input equals zero.
; The matched output stores the ffbl result directly.
; FUNC-LABEL: {{^}}v_cttz_i32_sel_eq_neg1:
; SI: v_ffbl_b32_e32 [[VAL:v[0-9]+]], v{{[0-9]+}}
; SI: buffer_store_dword [[VAL]],
; SI: s_endpgm
; EG: MEM_RAT_CACHELESS STORE_RAW
; EG: FFBL_INT
define amdgpu_kernel void @v_cttz_i32_sel_eq_neg1(i32 addrspace(1)* noalias %out, i32 addrspace(1)* nocapture readonly %arrayidx) nounwind {
  %val = load i32, i32 addrspace(1)* %arrayidx, align 1
  %cttz = call i32 @llvm.cttz.i32(i32 %val, i1 false) nounwind readnone
  %cmp = icmp eq i32 %val, 0
  %sel = select i1 %cmp, i32 -1, i32 %cttz
  store i32 %sel, i32 addrspace(1)* %out
  ret void
}

; Same as the previous kernel with the compare inverted (ne) and the
; select operands swapped.
; FUNC-LABEL: {{^}}v_cttz_i32_sel_ne_neg1:
; SI: v_ffbl_b32_e32 [[VAL:v[0-9]+]], v{{[0-9]+}}
; SI: buffer_store_dword [[VAL]],
; SI: s_endpgm
; EG: MEM_RAT_CACHELESS STORE_RAW
; EG: FFBL_INT
define amdgpu_kernel void @v_cttz_i32_sel_ne_neg1(i32 addrspace(1)* noalias %out, i32 addrspace(1)* nocapture readonly %arrayidx) nounwind {
  %val = load i32, i32 addrspace(1)* %arrayidx, align 1
  %cttz = call i32 @llvm.cttz.i32(i32 %val, i1 false) nounwind readnone
  %cmp = icmp ne i32 %val, 0
  %sel = select i1 %cmp, i32 %cttz, i32 -1
  store i32 %sel, i32 addrspace(1)* %out
  ret void
}

; The compare is on the cttz result against the bit width (32) rather than
; on the input; a compare and a cndmask are expected to remain.
; FUNC-LABEL: {{^}}v_cttz_i32_sel_ne_bitwidth:
; SI: v_ffbl_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}
; SI: v_cmp
; SI: v_cndmask
; SI: s_endpgm
; EG: MEM_RAT_CACHELESS STORE_RAW
; EG: FFBL_INT
define amdgpu_kernel void @v_cttz_i32_sel_ne_bitwidth(i32 addrspace(1)* noalias %out, i32 addrspace(1)* nocapture readonly %arrayidx) nounwind {
  %val = load i32, i32 addrspace(1)* %arrayidx, align 1
  %cttz = call i32 @llvm.cttz.i32(i32 %val, i1 false) nounwind readnone
  %cmp = icmp ne i32 %cttz, 32
  %sel = select i1 %cmp, i32 %cttz, i32 -1
  store i32 %sel, i32 addrspace(1)* %out
  ret void
}

; Zero-defined i8 cttz; select yields -1 when the input equals zero.
; FUNC-LABEL: {{^}}v_cttz_i8_sel_eq_neg1:
; SI: {{buffer|flat}}_load_ubyte
; SI-NOSDWA: v_ffbl_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}
; SI-SDWA: v_ffbl_b32_e32
; EG: MEM_RAT MSKOR
; EG: FFBL_INT
define amdgpu_kernel void @v_cttz_i8_sel_eq_neg1(i8 addrspace(1)* noalias %out, i8 addrspace(1)* nocapture readonly %arrayidx) nounwind {
  %val = load i8, i8 addrspace(1)* %arrayidx, align 1
  %cttz = call i8 @llvm.cttz.i8(i8 %val, i1 false) nounwind readnone
  %cmp = icmp eq i8 %val, 0
  %sel = select i1 %cmp, i8 -1, i8 %cttz
  store i8 %sel, i8 addrspace(1)* %out
  ret void
}

; Zero-defined i16 cttz of a value loaded with align 1; select yields -1
; when the input equals zero.
; FUNC-LABEL: {{^}}v_cttz_i16_sel_eq_neg1:
; SI: {{buffer|flat}}_load_ubyte
; SI: v_ffbl_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}
; SI: buffer_store_short
; EG: MEM_RAT MSKOR
; EG: FFBL_INT
define amdgpu_kernel void @v_cttz_i16_sel_eq_neg1(i16 addrspace(1)* noalias %out, i16 addrspace(1)* nocapture readonly %arrayidx) nounwind {
  %val = load i16, i16 addrspace(1)* %arrayidx, align 1
  %cttz = call i16 @llvm.cttz.i16(i16 %val, i1 false) nounwind readnone
  %cmp = icmp eq i16 %val, 0
  %sel = select i1 %cmp, i16 -1, i16 %cttz
  store i16 %sel, i16 addrspace(1)* %out
  ret void
}