; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,SI,SIVI %s
; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,VI,SIVI %s
; RUN: llc -march=amdgcn -mcpu=gfx1010 -mattr=-flat-for-global,+wavefrontsize64 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX10 %s

; Checks the v_cmp / v_cndmask_b32 sequences llc emits for select-of-compare
; patterns. Three configurations are exercised (the default amdgcn target,
; tonga, and gfx1010 with wavefrontsize64); every run enables the shared
; prefix plus its own target-specific prefixes.

; Intrinsics called by kernels in this file.
declare i32 @llvm.amdgcn.workitem.id.x() #1
declare half @llvm.fabs.f16(half)
declare float @llvm.fabs.f32(float)
declare double @llvm.fabs.f64(double)

; Selects between a NaN constant and a float loaded from memory, keyed on an
; i32 kernel argument being non-zero. The expected compare is the eq form and
; the constant is expected as -1 in the first v_cndmask_b32 source; the
; condition may live in vcc or in an SGPR pair.
; GCN-LABEL: {{^}}v_cnd_nan_nosgpr:
; GCN: v_cmp_eq_u32_e64 [[COND:vcc|s\[[0-9]+:[0-9]+\]]], s{{[0-9]+}}, 0
; GCN: v_cndmask_b32_e{{32|64}} v{{[0-9]+}}, -1, v{{[0-9]+}}, [[COND]]
; GCN-DAG: v{{[0-9]+}}
; All nan values are converted to 0xffffffff
; GCN: s_endpgm
define amdgpu_kernel void @v_cnd_nan_nosgpr(float addrspace(1)* %out, i32 %c, float addrspace(1)* %fptr) #0 {
  %idx = call i32 @llvm.amdgcn.workitem.id.x() #1
  %f.gep = getelementptr float, float addrspace(1)* %fptr, i32 %idx
  %f = load float, float addrspace(1)* %f.gep
  %setcc = icmp ne i32 %c, 0
  %select = select i1 %setcc, float 0xFFFFFFFFE0000000, float %f
  store float %select, float addrspace(1)* %out
  ret void
}


; This requires slightly trickier SGPR operand legalization since the
; single constant bus SGPR usage is the last operand, and it should
; never be moved.
; However on GFX10 constant bus is limited to 2 scalar operands, not one.
; Selects between a NaN constant and a float kernel argument, keyed on an i32
; kernel argument being non-zero. SI and VI are expected to use the e32
; v_cndmask_b32 with a VGPR source and vcc, while GFX10 is expected to use the
; e64 form with an SGPR source and an SGPR-pair condition.

; GCN-LABEL: {{^}}v_cnd_nan:
; SIVI: v_cmp_eq_u32_e64 vcc, s{{[0-9]+}}, 0
; SIVI: v_cndmask_b32_e32 v{{[0-9]+}}, -1, v{{[0-9]+}}, vcc
; GFX10: v_cmp_eq_u32_e64 [[CC:s\[[0-9:]+\]]], s{{[0-9]+}}, 0
; GFX10: v_cndmask_b32_e64 v{{[0-9]+}}, -1, s{{[0-9]+}}, [[CC]]
; GCN-DAG: v{{[0-9]+}}
; All nan values are converted to 0xffffffff
; GCN: s_endpgm
define amdgpu_kernel void @v_cnd_nan(float addrspace(1)* %out, i32 %c, float %f) #0 {
  %setcc = icmp ne i32 %c, 0
  %select = select i1 %setcc, float 0xFFFFFFFFE0000000, float %f
  store float %select, float addrspace(1)* %out
  ret void
}

; Test different compare and select operand types for optimal code
; shrinking.
; (select (cmp (sgprX, constant)), constant, sgprZ)

; Compare kernel argument %x against 0.0 and select between 1.0 and kernel
; argument %z. SI and VI are expected to copy %z into a VGPR and use the e32
; v_cndmask_b32; GFX10 is expected to consume the SGPR directly in the e64 form.
; GCN-LABEL: {{^}}fcmp_sgprX_k0_select_k1_sgprZ_f32:
; GCN: s_load_dwordx2 s{{\[}}[[X:[0-9]+]]:[[Z:[0-9]+]]{{\]}}, s{{\[[0-9]+:[0-9]+\]}}, {{0x4c|0x13}}

; SIVI-DAG: v_cmp_nlg_f32_e64 [[CC:vcc]], s[[X]], 0
; GFX10-DAG: v_cmp_nlg_f32_e64 [[CC:s\[[0-9:]+\]]], s[[X]], 0
; SIVI-DAG: v_mov_b32_e32 [[VZ:v[0-9]+]], s[[Z]]
; SIVI: v_cndmask_b32_e32 v{{[0-9]+}}, 1.0, [[VZ]], [[CC]]
; GFX10: v_cndmask_b32_e64 v{{[0-9]+}}, 1.0, s[[Z]], [[CC]]
define amdgpu_kernel void @fcmp_sgprX_k0_select_k1_sgprZ_f32(float addrspace(1)* %out, [8 x i32], float %x, float %z) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
  %tid.ext = sext i32 %tid to i64
  %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
  %setcc = fcmp one float %x, 0.0
  %select = select i1 %setcc, float 1.0, float %z
  store float %select, float addrspace(1)* %out.gep
  ret void
}

; Same compare, but the selected non-constant value is %x itself, so a single
; scalar load feeds both the compare and the select.
; GCN-LABEL: {{^}}fcmp_sgprX_k0_select_k1_sgprX_f32:
; GCN: s_load_dword [[X:s[0-9]+]]
; SIVI-DAG: v_cmp_nlg_f32_e64 [[CC:vcc]], [[X]], 0
; GFX10-DAG: v_cmp_nlg_f32_e64 [[CC:s\[[0-9:]+\]]], [[X]], 0
; SIVI-DAG: v_mov_b32_e32 [[VX:v[0-9]+]], [[X]]
; SIVI: v_cndmask_b32_e32 v{{[0-9]+}}, 1.0, [[VX]], [[CC]]
; GFX10: v_cndmask_b32_e64 v{{[0-9]+}}, 1.0, [[X]], [[CC]]
define amdgpu_kernel void @fcmp_sgprX_k0_select_k1_sgprX_f32(float addrspace(1)* %out, float %x) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
  %tid.ext = sext i32 %tid to i64
  %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
  %setcc = fcmp one float %x, 0.0
  %select = select i1 %setcc, float 1.0, float %x
  store float %select, float addrspace(1)* %out.gep
  ret void
}

; The compare constant and the select constant are both 0.0; the other select
; operand is kernel argument %z.
; GCN-LABEL: {{^}}fcmp_sgprX_k0_select_k0_sgprZ_f32:
; GCN-DAG: s_load_dwordx2 s{{\[}}[[X:[0-9]+]]:[[Z:[0-9]+]]{{\]}}, s{{\[[0-9]+:[0-9]+\]}}, {{0x13|0x4c}}
; SIVI-DAG: v_cmp_nlg_f32_e64 [[CC:vcc]], s[[X]], 0
; GFX10-DAG: v_cmp_nlg_f32_e64 [[CC:s\[[0-9:]+\]]], s[[X]], 0
; SIVI-DAG: v_mov_b32_e32 [[VZ:v[0-9]+]], s[[Z]]
; SIVI: v_cndmask_b32_e32 v{{[0-9]+}}, 0, [[VZ]], [[CC]]
; GFX10: v_cndmask_b32_e64 v{{[0-9]+}}, 0, s[[Z]], [[CC]]
define amdgpu_kernel void @fcmp_sgprX_k0_select_k0_sgprZ_f32(float addrspace(1)* %out, [8 x i32], float %x, float %z) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
  %tid.ext = sext i32 %tid to i64
  %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
  %setcc = fcmp one float %x, 0.0
  %select = select i1 %setcc, float 0.0, float %z
  store float %select, float addrspace(1)* %out.gep
  ret void
}

; The compare constant and the select constant are both 0.0; the other select
; operand is %x itself.
; GCN-LABEL: {{^}}fcmp_sgprX_k0_select_k0_sgprX_f32:
; GCN: s_load_dword [[X:s[0-9]+]]
; SIVI-DAG: v_cmp_nlg_f32_e64 [[CC:vcc]], [[X]], 0
; GFX10-DAG: v_cmp_nlg_f32_e64 [[CC:s\[[0-9:]+\]]], [[X]], 0
; SIVI-DAG: v_mov_b32_e32 [[VX:v[0-9]+]], [[X]]
; SIVI: v_cndmask_b32_e32 v{{[0-9]+}}, 0, [[VX]], [[CC]]
; GFX10: v_cndmask_b32_e64 v{{[0-9]+}}, 0, [[X]], [[CC]]
define amdgpu_kernel void @fcmp_sgprX_k0_select_k0_sgprX_f32(float addrspace(1)* %out, float %x) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
  %tid.ext = sext i32 %tid to i64
  %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
  %setcc = fcmp one float %x, 0.0
  %select = select i1 %setcc, float 0.0, float %x
  store float %select, float addrspace(1)* %out.gep
  ret void
}

; %x is a kernel argument while %z is loaded per work-item, so the select's
; non-constant operand is expected in a VGPR on every target. The select
; constant is 0.0.
; GCN-LABEL: {{^}}fcmp_sgprX_k0_select_k0_vgprZ_f32:
; GCN-DAG: s_load_dword [[X:s[0-9]+]]
; GCN-DAG: {{buffer|flat|global}}_load_dword [[Z:v[0-9]+]]
; GCN-DAG: v_cmp_nlg_f32_e64 [[COND:vcc|s\[[0-9]+:[0-9]+\]]], [[X]], 0
; GCN: v_cndmask_b32_e{{32|64}} v{{[0-9]+}}, 0, [[Z]], [[COND]]
define amdgpu_kernel void @fcmp_sgprX_k0_select_k0_vgprZ_f32(float addrspace(1)* %out, float %x, float addrspace(1)* %z.ptr) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
  %tid.ext = sext i32 %tid to i64
  %z.gep = getelementptr inbounds float, float addrspace(1)* %z.ptr, i64 %tid.ext
  %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
  %z = load float, float addrspace(1)* %z.gep
  %setcc = fcmp one float %x, 0.0
  %select = select i1 %setcc, float 0.0, float %z
  store float %select, float addrspace(1)* %out.gep
  ret void
}

; Same operand placement as the previous kernel, with 1.0 as the select
; constant instead of 0.0.
; GCN-LABEL: {{^}}fcmp_sgprX_k0_select_k1_vgprZ_f32:
; GCN-DAG: {{buffer|flat|global}}_load_dword [[Z:v[0-9]+]]
; GCN-DAG: s_load_dword [[X:s[0-9]+]]
; GCN-DAG: v_cmp_nlg_f32_e64 [[COND:vcc|s\[[0-9]+:[0-9]+\]]], [[X]], 0
; GCN: v_cndmask_b32_e{{32|64}} v{{[0-9]+}}, 1.0, [[Z]], [[COND]]
define amdgpu_kernel void @fcmp_sgprX_k0_select_k1_vgprZ_f32(float addrspace(1)* %out, float %x, float addrspace(1)* %z.ptr) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
  %tid.ext = sext i32 %tid to i64
  %z.gep = getelementptr inbounds float, float addrspace(1)* %z.ptr, i64 %tid.ext
  %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
  %z = load float, float addrspace(1)* %z.gep
  %setcc = fcmp one float %x, 0.0
  %select = select i1 %setcc, float 1.0, float %z
  store float %select, float addrspace(1)* %out.gep
  ret void
}
; %x is loaded per work-item and %z is a kernel argument. The IR predicate is
; olt against 0.0; the expected compare is v_cmp_ngt with the constant first,
; leaving 1.0 as the first v_cndmask_b32 source. SI and VI are expected to copy
; %z into a VGPR first, GFX10 to use the SGPR directly in the e64 form.
; GCN-LABEL: {{^}}fcmp_vgprX_k0_select_k1_sgprZ_f32:
; GCN-DAG: {{buffer|flat|global}}_load_dword [[X:v[0-9]+]]
; GCN-DAG: s_load_dword [[Z:s[0-9]+]]
; GCN-DAG: v_cmp_ngt_f32_e32 vcc, 0, [[X]]
; SIVI-DAG: v_mov_b32_e32 [[VZ:v[0-9]+]], [[Z]]
; SIVI: v_cndmask_b32_e32 v{{[0-9]+}}, 1.0, [[VZ]], vcc
; GFX10: v_cndmask_b32_e64 v{{[0-9]+}}, 1.0, [[Z]], vcc
define amdgpu_kernel void @fcmp_vgprX_k0_select_k1_sgprZ_f32(float addrspace(1)* %out, float addrspace(1)* %x.ptr, float %z) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
  %tid.ext = sext i32 %tid to i64
  %x.gep = getelementptr inbounds float, float addrspace(1)* %x.ptr, i64 %tid.ext
  %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
  %x = load float, float addrspace(1)* %x.gep
  %setcc = fcmp olt float %x, 0.0
  %select = select i1 %setcc, float 1.0, float %z
  store float %select, float addrspace(1)* %out.gep
  ret void
}

; Both %x and %z are loaded per work-item (volatile loads, so the two load
; checks are ordered). The IR predicate is ult against 0.0; the expected
; compare is v_cmp_le with the constant first and an e32 v_cndmask_b32.
; GCN-LABEL: {{^}}fcmp_vgprX_k0_select_k1_vgprZ_f32:
; GCN: {{buffer|flat|global}}_load_dword [[X:v[0-9]+]]
; GCN: {{buffer|flat|global}}_load_dword [[Z:v[0-9]+]]
; GCN: v_cmp_le_f32_e32 vcc, 0, [[X]]
; GCN: v_cndmask_b32_e32 v{{[0-9]+}}, 1.0, [[Z]], vcc
define amdgpu_kernel void @fcmp_vgprX_k0_select_k1_vgprZ_f32(float addrspace(1)* %out, float addrspace(1)* %x.ptr, float addrspace(1)* %z.ptr) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
  %tid.ext = sext i32 %tid to i64
  %x.gep = getelementptr inbounds float, float addrspace(1)* %x.ptr, i64 %tid.ext
  %z.gep = getelementptr inbounds float, float addrspace(1)* %z.ptr, i64 %tid.ext
  %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
  %x = load volatile float, float addrspace(1)* %x.gep
  %z = load volatile float, float addrspace(1)* %z.gep
  %setcc = fcmp ult float %x, 0.0
  %select = select i1 %setcc, float 1.0, float %z
  store float %select, float addrspace(1)* %out.gep
  ret void
}

; Integer variant: the IR compares %x slt 0 and selects between 2 and %z. The
; expected compare is v_cmp_lt_i32 with -1 as the first operand, and 2 as the
; first v_cndmask_b32 source.
; GCN-LABEL: {{^}}icmp_vgprX_k0_select_k1_vgprZ_i32:
; GCN: {{buffer|flat|global}}_load_dword [[X:v[0-9]+]]
; GCN: {{buffer|flat|global}}_load_dword [[Z:v[0-9]+]]
; GCN: v_cmp_lt_i32_e32 vcc, -1, [[X]]
; GCN: v_cndmask_b32_e32 v{{[0-9]+}}, 2, [[Z]], vcc
define amdgpu_kernel void @icmp_vgprX_k0_select_k1_vgprZ_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %x.ptr, i32 addrspace(1)* %z.ptr) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
  %tid.ext = sext i32 %tid to i64
  %x.gep = getelementptr inbounds i32, i32 addrspace(1)* %x.ptr, i64 %tid.ext
  %z.gep = getelementptr inbounds i32, i32 addrspace(1)* %z.ptr, i64 %tid.ext
  %out.gep = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %tid.ext
  %x = load volatile i32, i32 addrspace(1)* %x.gep
  %z = load volatile i32, i32 addrspace(1)* %z.gep
  %setcc = icmp slt i32 %x, 0
  %select = select i1 %setcc, i32 2, i32 %z
  store i32 %select, i32 addrspace(1)* %out.gep
  ret void
}

; FIXME: Why does VI make the wrong regalloc choice?
; 64-bit integer variant: the IR compares %x slt 0 and selects between 2 and
; %z, which is expected to become one 64-bit compare and two 32-bit
; v_cndmask_b32 (constant 0 for the high half, 2 for the low half).
; NOTE(review): the SI and VI expectations below are textually identical, and
; this kernel has no GFX10-specific expectations for the compare/select.
; GCN-LABEL: {{^}}icmp_vgprX_k0_select_k1_vgprZ_i64:
; GCN: {{buffer|flat|global}}_load_dwordx2 v{{\[}}[[X_LO:[0-9]+]]:[[X_HI:[0-9]+]]{{\]}}
; GCN-DAG: {{buffer|flat|global}}_load_dwordx2 v{{\[}}[[Z_LO:[0-9]+]]:[[Z_HI:[0-9]+]]{{\]}}
; SI-DAG: v_cmp_lt_i64_e32 vcc, -1, v{{\[}}[[X_LO]]:[[X_HI]]{{\]}}
; SI-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, 0, v[[Z_HI]], vcc
; SI-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, 2, v[[Z_LO]], vcc

; VI-DAG: v_cmp_lt_i64_e32 vcc, -1, v{{\[}}[[X_LO]]:[[X_HI]]{{\]}}
; VI-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, 0, v[[Z_HI]], vcc
; VI-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, 2, v[[Z_LO]], vcc
define amdgpu_kernel void @icmp_vgprX_k0_select_k1_vgprZ_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %x.ptr, i64 addrspace(1)* %z.ptr) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
  %tid.ext = sext i32 %tid to i64
  %x.gep = getelementptr inbounds i64, i64 addrspace(1)* %x.ptr, i64 %tid.ext
  %z.gep = getelementptr inbounds i64, i64 addrspace(1)* %z.ptr, i64 %tid.ext
  %out.gep = getelementptr inbounds i64, i64 addrspace(1)* %out, i64 %tid.ext
  %x = load volatile i64, i64 addrspace(1)* %x.gep
  %z = load volatile i64, i64 addrspace(1)* %z.gep
  %setcc = icmp slt i64 %x, 0
  %select = select i1 %setcc, i64 2, i64 %z
  store i64 %select, i64 addrspace(1)* %out.gep
  ret void
}

; Vector select with the constant vector as the false operand. The IR
; predicate is ugt against 4.0; the expected compare is v_cmp_nge with the
; constant first, followed by one v_cndmask_b32 per element.
; GCN-LABEL: {{^}}fcmp_vgprX_k0_select_vgprZ_k1_v4f32:
; GCN: {{buffer|flat|global}}_load_dword [[X:v[0-9]+]]
; GCN: {{buffer|flat|global}}_load_dwordx4

; GCN: v_cmp_nge_f32_e32 vcc, 4.0, [[X]]
; GCN-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, 1.0, v{{[0-9]+}}, vcc
; GCN-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, 2.0, v{{[0-9]+}}, vcc
; GCN-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, -0.5, v{{[0-9]+}}, vcc
; GCN-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, 4.0, v{{[0-9]+}}, vcc
define amdgpu_kernel void @fcmp_vgprX_k0_select_vgprZ_k1_v4f32(<4 x float> addrspace(1)* %out, float addrspace(1)* %x.ptr, <4 x float> addrspace(1)* %z.ptr) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
  %tid.ext = sext i32 %tid to i64
  %x.gep = getelementptr inbounds float, float addrspace(1)* %x.ptr, i64 %tid.ext
  %z.gep = getelementptr inbounds <4 x float>, <4 x float> addrspace(1)* %z.ptr, i64 %tid.ext
  %out.gep = getelementptr inbounds <4 x float>, <4 x float> addrspace(1)* %out, i64 %tid.ext
  %x = load volatile float, float addrspace(1)* %x.gep
  %z = load volatile <4 x float>, <4 x float> addrspace(1)* %z.gep
  %setcc = fcmp ugt float %x, 4.0
  %select = select i1 %setcc, <4 x float> %z, <4 x float> <float 1.0, float 2.0, float -0.5, float 4.0>
  store <4 x float> %select, <4 x float> addrspace(1)* %out.gep
  ret void
}

; Vector select with the constant vector as the true operand. The IR
; predicate is still ugt against 4.0; the expected compare is v_cmp_ge with
; the constant first, and the constants stay in the first v_cndmask_b32 source.
; GCN-LABEL: {{^}}fcmp_vgprX_k0_select_k1_vgprZ_v4f32:
; GCN: {{buffer|flat|global}}_load_dword [[X:v[0-9]+]]
; GCN: {{buffer|flat|global}}_load_dwordx4

; GCN: v_cmp_ge_f32_e32 vcc, 4.0, [[X]]
; GCN-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, 1.0, v{{[0-9]+}}, vcc
; GCN-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, 2.0, v{{[0-9]+}}, vcc
; GCN-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, -0.5, v{{[0-9]+}}, vcc
; GCN-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, 4.0, v{{[0-9]+}}, vcc
define amdgpu_kernel void @fcmp_vgprX_k0_select_k1_vgprZ_v4f32(<4 x float> addrspace(1)* %out, float addrspace(1)* %x.ptr, <4 x float> addrspace(1)* %z.ptr) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
  %tid.ext = sext i32 %tid to i64
  %x.gep = getelementptr inbounds float, float addrspace(1)* %x.ptr, i64 %tid.ext
  %z.gep = getelementptr inbounds <4 x float>, <4 x float> addrspace(1)* %z.ptr, i64 %tid.ext
  %out.gep = getelementptr inbounds <4 x float>, <4 x float> addrspace(1)* %out, i64 %tid.ext
  %x = load volatile float, float addrspace(1)* %x.gep
  %z = load volatile <4 x float>, <4 x float> addrspace(1)* %z.gep
  %setcc = fcmp ugt float %x, 4.0
  %select = select i1 %setcc, <4 x float> <float 1.0, float 2.0, float -0.5, float 4.0>, <4 x float> %z
  store <4 x float> %select, <4 x float> addrspace(1)* %out.gep
  ret void
}

; This must be swapped as a vector type before the condition has
; multiple uses.

; Here the constant is the first operand of the IR compare (4.0 ugt %x); the
; expected compare is v_cmp_le with the constant first.
; GCN-LABEL: {{^}}fcmp_k0_vgprX_select_k1_vgprZ_v4f32:
; GCN: {{buffer|flat|global}}_load_dword [[X:v[0-9]+]]
; GCN: {{buffer|flat|global}}_load_dwordx4

; GCN: v_cmp_le_f32_e32 vcc, 4.0, [[X]]
; GCN-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, 1.0, v{{[0-9]+}}, vcc
; GCN-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, 2.0, v{{[0-9]+}}, vcc
; GCN-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, -0.5, v{{[0-9]+}}, vcc
; GCN-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, 4.0, v{{[0-9]+}}, vcc
define amdgpu_kernel void @fcmp_k0_vgprX_select_k1_vgprZ_v4f32(<4 x float> addrspace(1)* %out, float addrspace(1)* %x.ptr, <4 x float> addrspace(1)* %z.ptr) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
  %tid.ext = sext i32 %tid to i64
  %x.gep = getelementptr inbounds float, float addrspace(1)* %x.ptr, i64 %tid.ext
  %z.gep = getelementptr inbounds <4 x float>, <4 x float> addrspace(1)* %z.ptr, i64 %tid.ext
  %out.gep = getelementptr inbounds <4 x float>, <4 x float> addrspace(1)* %out, i64 %tid.ext
  %x = load volatile float, float addrspace(1)* %x.gep
  %z = load volatile <4 x float>, <4 x float> addrspace(1)* %z.gep
  %setcc = fcmp ugt float 4.0, %x
  %select = select i1 %setcc, <4 x float> <float 1.0, float 2.0, float -0.5, float 4.0>, <4 x float> %z
  store <4 x float> %select, <4 x float> addrspace(1)* %out.gep
  ret void
}

; i1 select with a true constant. The expected code compares %x against 0,
; tests the low bit of the loaded byte, ORs the two conditions, and
; materializes the result as 0 or 1 before a byte store.
; GCN-LABEL: {{^}}icmp_vgprX_k0_select_k1_vgprZ_i1:
; GCN: load_dword
; GCN: load_ubyte
; GCN-DAG: v_cmp_gt_i32_e32 vcc, 0, v
; GCN-DAG: v_and_b32_e32 v{{[0-9]+}}, 1,
; GCN-DAG: v_cmp_eq_u32_e64 s{{\[[0-9]+:[0-9]+\]}}, 1, v
; GCN-DAG: s_or_b64 s{{\[[0-9]+:[0-9]+\]}}, vcc, s{{\[[0-9]+:[0-9]+\]}}
; GCN: v_cndmask_b32_e64 v{{[0-9]+}}, 0, 1, s
; GCN: store_byte
define amdgpu_kernel void @icmp_vgprX_k0_select_k1_vgprZ_i1(i1 addrspace(1)* %out, i32 addrspace(1)* %x.ptr, i1 addrspace(1)* %z.ptr) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
  %tid.ext = sext i32 %tid to i64
  %x.gep = getelementptr inbounds i32, i32 addrspace(1)* %x.ptr, i64 %tid.ext
  %z.gep = getelementptr inbounds i1, i1 addrspace(1)* %z.ptr, i64 %tid.ext
  %out.gep = getelementptr inbounds i1, i1 addrspace(1)* %out, i64 %tid.ext
  %x = load volatile i32, i32 addrspace(1)* %x.gep
  %z = load volatile i1, i1 addrspace(1)* %z.gep
  %setcc = icmp slt i32 %x, 0
  %select = select i1 %setcc, i1 true, i1 %z
  store i1 %select, i1 addrspace(1)* %out.gep
  ret void
}

; Different types compared vs. selected
; An f32 compare drives a select of double 1.0. SI and VI are expected to
; materialize the high word 0x3ff00000 in a VGPR, while GFX10 is expected to
; use it as a literal in v_cndmask_b32; the low word constant is 0.
; GCN-LABEL: {{^}}fcmp_vgprX_k0_selectf64_k1_vgprZ_f32:
; SIVI-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 0x3ff00000
; GCN-DAG: {{buffer|flat|global}}_load_dword [[X:v[0-9]+]]
; GCN-DAG: {{buffer|flat|global}}_load_dwordx2

; GCN: v_cmp_le_f32_e32 vcc, 0, [[X]]
; SIVI-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, [[K]], v{{[0-9]+}}, vcc
; GFX10-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, 0x3ff00000, v{{[0-9]+}}, vcc
; GCN-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, 0, v{{[0-9]+}}, vcc
define amdgpu_kernel void @fcmp_vgprX_k0_selectf64_k1_vgprZ_f32(double addrspace(1)* %out, float addrspace(1)* %x.ptr, double addrspace(1)* %z.ptr) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
  %tid.ext = sext i32 %tid to i64
  %x.gep = getelementptr inbounds float, float addrspace(1)* %x.ptr, i64 %tid.ext
  %z.gep = getelementptr inbounds double, double addrspace(1)* %z.ptr, i64 %tid.ext
  %out.gep = getelementptr inbounds double, double addrspace(1)* %out, i64 %tid.ext
  %x = load volatile float, float addrspace(1)* %x.gep
  %z = load volatile double, double addrspace(1)* %z.gep
  %setcc = fcmp ult float %x, 0.0
  %select = select i1 %setcc, double 1.0, double %z
  store double %select, double addrspace(1)* %out.gep
  ret void
}
; Different types compared vs. selected
; An f32 compare (one against 0.0) drives a select of the i64 constant 3. The
; expected compare is v_cmp_nlg with the constant first, followed by two
; v_cndmask_b32 with 3 and 0 as their first sources.
; GCN-LABEL: {{^}}fcmp_vgprX_k0_selecti64_k1_vgprZ_f32:
; GCN: {{buffer|flat|global}}_load_dword [[X:v[0-9]+]]
; GCN: {{buffer|flat|global}}_load_dwordx2

; GCN: v_cmp_nlg_f32_e32 vcc, 0, [[X]]
; GCN-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, 3, v{{[0-9]+}}, vcc
; GCN-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, 0, v{{[0-9]+}}, vcc
define amdgpu_kernel void @fcmp_vgprX_k0_selecti64_k1_vgprZ_f32(i64 addrspace(1)* %out, float addrspace(1)* %x.ptr, i64 addrspace(1)* %z.ptr) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
  %tid.ext = sext i32 %tid to i64
  %x.gep = getelementptr inbounds float, float addrspace(1)* %x.ptr, i64 %tid.ext
  %z.gep = getelementptr inbounds i64, i64 addrspace(1)* %z.ptr, i64 %tid.ext
  %out.gep = getelementptr inbounds i64, i64 addrspace(1)* %out, i64 %tid.ext
  %x = load volatile float, float addrspace(1)* %x.gep
  %z = load volatile i64, i64 addrspace(1)* %z.gep
  %setcc = fcmp one float %x, 0.0
  %select = select i1 %setcc, i64 3, i64 %z
  store i64 %select, i64 addrspace(1)* %out.gep
  ret void
}

; Different types compared vs. selected
; An i32 compare (ugt 1) drives a select of the float constant 4.0. The
; expected compare is v_cmp_gt_u32 with 2 as the first operand, and 4.0 as the
; first v_cndmask_b32 source.
; GCN-LABEL: {{^}}icmp_vgprX_k0_selectf32_k1_vgprZ_i32:
; GCN: {{buffer|flat|global}}_load_dword [[X:v[0-9]+]]
; GCN: {{buffer|flat|global}}_load_dword [[Z:v[0-9]+]]

; GCN: v_cmp_gt_u32_e32 vcc, 2, [[X]]
; GCN-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, 4.0, [[Z]], vcc
define amdgpu_kernel void @icmp_vgprX_k0_selectf32_k1_vgprZ_i32(float addrspace(1)* %out, i32 addrspace(1)* %x.ptr, float addrspace(1)* %z.ptr) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
  %tid.ext = sext i32 %tid to i64
  %x.gep = getelementptr inbounds i32, i32 addrspace(1)* %x.ptr, i64 %tid.ext
  %z.gep = getelementptr inbounds float, float addrspace(1)* %z.ptr, i64 %tid.ext
  %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
  %x = load volatile i32, i32 addrspace(1)* %x.gep
  %z = load volatile float, float addrspace(1)* %z.gep
  %setcc = icmp ugt i32 %x, 1
  %select = select i1 %setcc, float 4.0, float %z
  store float %select, float addrspace(1)* %out.gep
  ret void
}

; FIXME: Should be able to handle multiple uses

; One compare result feeds two selects with different constants. The current
; expectation is a v_cmp_nle compare and e64 v_cndmask_b32 instructions that
; keep each constant in the second source position.
; GCN-LABEL: {{^}}fcmp_k0_vgprX_select_k1_vgprZ_f32_cond_use_x2:
; GCN: {{buffer|flat|global}}_load_dword [[X:v[0-9]+]]

; GCN: v_cmp_nle_f32_e32 vcc, 4.0, [[X]]
; GCN-DAG: v_cndmask_b32_e64 v{{[0-9]+}}, v{{[0-9]+}}, -1.0, vcc
; GCN-DAG: v_cndmask_b32_e64 v{{[0-9]+}}, v{{[0-9]+}}, -2.0, vcc
define amdgpu_kernel void @fcmp_k0_vgprX_select_k1_vgprZ_f32_cond_use_x2(float addrspace(1)* %out, float addrspace(1)* %x.ptr, float addrspace(1)* %z.ptr) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
  %tid.ext = sext i32 %tid to i64
  %x.gep = getelementptr inbounds float, float addrspace(1)* %x.ptr, i64 %tid.ext
  %z.gep = getelementptr inbounds float, float addrspace(1)* %z.ptr, i64 %tid.ext
  %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
  %x = load volatile float, float addrspace(1)* %x.gep
  %z = load volatile float, float addrspace(1)* %z.gep
  %setcc = fcmp ugt float 4.0, %x
  %select0 = select i1 %setcc, float -1.0, float %z
  %select1 = select i1 %setcc, float -2.0, float %z
  store volatile float %select0, float addrspace(1)* %out.gep
  store volatile float %select1, float addrspace(1)* %out.gep
  ret void
}

; Source modifiers abs/neg only work for f32

; f16 case: selects between fabs(%f) and fneg(%f); the expected v_cndmask_b32
; is the e32 form with plain VGPR sources (no source modifiers).
; GCN-LABEL: {{^}}v_cndmask_abs_neg_f16:
; GCN-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}},
define amdgpu_kernel void @v_cndmask_abs_neg_f16(half addrspace(1)* %out, i32 %c, half addrspace(1)* %fptr) #0 {
  %idx = call i32 @llvm.amdgcn.workitem.id.x() #1
  %f.gep = getelementptr half, half addrspace(1)* %fptr, i32 %idx
  %f = load half, half addrspace(1)* %f.gep
  %f.abs = call half @llvm.fabs.f16(half %f)
  %f.neg = fneg half %f
  %setcc = icmp ne i32 %c, 0
  %select = select i1 %setcc, half %f.abs, half %f.neg
  store half %select, half addrspace(1)* %out
  ret void
}

; f32 case: the neg and abs operations are expected to be folded into the
; e64 v_cndmask_b32 as source modifiers.
; GCN-LABEL: {{^}}v_cndmask_abs_neg_f32:
; GCN-DAG: v_cndmask_b32_e64 v{{[0-9]+}}, -v{{[0-9]+}}, |v{{[0-9]+}}|,
define amdgpu_kernel void @v_cndmask_abs_neg_f32(float addrspace(1)* %out, i32 %c, float addrspace(1)* %fptr) #0 {
  %idx = call i32 @llvm.amdgcn.workitem.id.x() #1
  %f.gep = getelementptr float, float addrspace(1)* %fptr, i32 %idx
  %f = load float, float addrspace(1)* %f.gep
  %f.abs = call float @llvm.fabs.f32(float %f)
  %f.neg = fneg float %f
  %setcc = icmp ne i32 %c, 0
  %select = select i1 %setcc, float %f.abs, float %f.neg
  store float %select, float addrspace(1)* %out
  ret void
}

; f64 case: two e32 v_cndmask_b32 with plain VGPR sources are expected (no
; source modifiers).
; GCN-LABEL: {{^}}v_cndmask_abs_neg_f64:
; GCN-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}},
; GCN-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}},
define amdgpu_kernel void @v_cndmask_abs_neg_f64(double addrspace(1)* %out, i32 %c, double addrspace(1)* %fptr) #0 {
  %idx = call i32 @llvm.amdgcn.workitem.id.x() #1
  %f.gep = getelementptr double, double addrspace(1)* %fptr, i32 %idx
  %f = load double, double addrspace(1)* %f.gep
  %f.abs = call double @llvm.fabs.f64(double %f)
  %f.neg = fneg double %f
  %setcc = icmp ne i32 %c, 0
  %select = select i1 %setcc, double %f.abs, double %f.neg
  store double %select, double addrspace(1)* %out
  ret void
}

; Attribute groups referenced by the kernels (#0) and by the work-item id
; intrinsic and its call sites (#1).
attributes #0 = { nounwind }
attributes #1 = { nounwind readnone }