; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=verde -amdgpu-early-ifcvt=1 -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
; XUN: llc -march=amdgcn -mcpu=tonga -amdgpu-early-ifcvt=1 -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s

; FIXME: This leaves behind a now unnecessary and with exec

; GCN-LABEL: {{^}}test_vccnz_ifcvt_triangle:
; GCN: buffer_load_dword [[VAL:v[0-9]+]]
; GCN: v_cmp_neq_f32_e32 vcc, 1.0, [[VAL]]
; GCN: v_add_f32_e32 [[ADD:v[0-9]+]], [[VAL]], [[VAL]]
; GCN: v_cndmask_b32_e32 [[RESULT:v[0-9]+]], [[ADD]], [[VAL]], vcc
; GCN: buffer_store_dword [[RESULT]]
define amdgpu_kernel void @test_vccnz_ifcvt_triangle(float addrspace(1)* %out, float addrspace(1)* %in) #0 {
entry:
  %v = load float, float addrspace(1)* %in
  %cc = fcmp oeq float %v, 1.000000e+00
  br i1 %cc, label %if, label %endif

if:
  %u = fadd float %v, %v
  br label %endif

endif:
  %r = phi float [ %v, %entry ], [ %u, %if ]
  store float %r, float addrspace(1)* %out
  ret void
}

; GCN-LABEL: {{^}}test_vccnz_ifcvt_diamond:
; GCN: buffer_load_dword [[VAL:v[0-9]+]]
; GCN: v_cmp_neq_f32_e32 vcc, 1.0, [[VAL]]
; GCN-DAG: v_add_f32_e32 [[ADD:v[0-9]+]], [[VAL]], [[VAL]]
; GCN-DAG: v_mul_f32_e32 [[MUL:v[0-9]+]], [[VAL]], [[VAL]]
; GCN: v_cndmask_b32_e32 [[RESULT:v[0-9]+]], [[ADD]], [[MUL]], vcc
; GCN: buffer_store_dword [[RESULT]]
define amdgpu_kernel void @test_vccnz_ifcvt_diamond(float addrspace(1)* %out, float addrspace(1)* %in) #0 {
entry:
  %v = load float, float addrspace(1)* %in
  %cc = fcmp oeq float %v, 1.000000e+00
  br i1 %cc, label %if, label %else

if:
  %u0 = fadd float %v, %v
  br label %endif

else:
  %u1 = fmul float %v, %v
  br label %endif

endif:
  %r = phi float [ %u0, %if ], [ %u1, %else ]
  store float %r, float addrspace(1)* %out
  ret void
}

; GCN-LABEL: {{^}}test_vccnz_ifcvt_triangle_vcc_clobber:
; GCN: ; clobber vcc
; GCN: v_cmp_neq_f32_e64 [[CMP:s\[[0-9]+:[0-9]+\]]], s{{[0-9]+}}, 1.0
; GCN: v_add_i32_e32 v{{[0-9]+}}, vcc
; GCN: s_mov_b64 vcc, [[CMP]]
; GCN: v_cndmask_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, vcc
define amdgpu_kernel void @test_vccnz_ifcvt_triangle_vcc_clobber(i32 addrspace(1)* %out, i32 addrspace(1)* %in, float %k) #0 {
entry:
  %v = load i32, i32 addrspace(1)* %in
  %cc = fcmp oeq float %k, 1.000000e+00
  br i1 %cc, label %if, label %endif

if:
  call void asm "; clobber $0", "~{vcc}"() #0
  %u = add i32 %v, %v
  br label %endif

endif:
  %r = phi i32 [ %v, %entry ], [ %u, %if ]
  store i32 %r, i32 addrspace(1)* %out
  ret void
}

; Longest chain of cheap instructions to convert
; GCN-LABEL: {{^}}test_vccnz_ifcvt_triangle_max_cheap:
; GCN: v_mul_f32
; GCN: v_mul_f32
; GCN: v_mul_f32
; GCN: v_mul_f32
; GCN: v_mul_f32
; GCN: v_mul_f32
; GCN: v_mul_f32
; GCN: v_mul_f32
; GCN: v_mul_f32
; GCN: v_cndmask_b32_e32
define amdgpu_kernel void @test_vccnz_ifcvt_triangle_max_cheap(float addrspace(1)* %out, float addrspace(1)* %in) #0 {
entry:
  %v = load float, float addrspace(1)* %in
  %cc = fcmp oeq float %v, 1.000000e+00
  br i1 %cc, label %if, label %endif

if:
  %u.0 = fmul float %v, %v
  %u.1 = fmul float %v, %u.0
  %u.2 = fmul float %v, %u.1
  %u.3 = fmul float %v, %u.2
  %u.4 = fmul float %v, %u.3
  %u.5 = fmul float %v, %u.4
  %u.6 = fmul float %v, %u.5
  %u.7 = fmul float %v, %u.6
  %u.8 = fmul float %v, %u.7
  br label %endif

endif:
  %r = phi float [ %v, %entry ], [ %u.8, %if ]
  store float %r, float addrspace(1)* %out
  ret void
}

; Short chain of cheap instructions to not convert
; GCN-LABEL: {{^}}test_vccnz_ifcvt_triangle_min_expensive:
; GCN: s_cbranch_vccnz [[ENDIF:BB[0-9]+_[0-9]+]]

; GCN: v_mul_f32
; GCN: v_mul_f32
; GCN: v_mul_f32
; GCN: v_mul_f32
; GCN: v_mul_f32
; GCN: v_mul_f32
; GCN: v_mul_f32
; GCN: v_mul_f32
; GCN: v_mul_f32
; GCN: v_mul_f32

; GCN: [[ENDIF]]:
; GCN: buffer_store_dword
define amdgpu_kernel void @test_vccnz_ifcvt_triangle_min_expensive(float addrspace(1)* %out, float addrspace(1)* %in) #0 {
entry:
  %v = load float, float addrspace(1)* %in
  %cc = fcmp oeq float %v, 1.000000e+00
  br i1 %cc, label %if, label %endif

if:
  %u.0 = fmul float %v, %v
  %u.1 = fmul float %v, %u.0
  %u.2 = fmul float %v, %u.1
  %u.3 = fmul float %v, %u.2
  %u.4 = fmul float %v, %u.3
  %u.5 = fmul float %v, %u.4
  %u.6 = fmul float %v, %u.5
  %u.7 = fmul float %v, %u.6
  %u.8 = fmul float %v, %u.7
  %u.9 = fmul float %v, %u.8
  br label %endif

endif:
  %r = phi float [ %v, %entry ], [ %u.9, %if ]
  store float %r, float addrspace(1)* %out
  ret void
}

; Should still branch over fdiv expansion
; GCN-LABEL: {{^}}test_vccnz_ifcvt_triangle_expensive:
; GCN: v_cmp_neq_f32_e32
; GCN: s_cbranch_vccnz [[ENDIF:BB[0-9]+_[0-9]+]]

; GCN: v_div_scale_f32

; GCN: [[ENDIF]]:
; GCN: buffer_store_dword
define amdgpu_kernel void @test_vccnz_ifcvt_triangle_expensive(float addrspace(1)* %out, float addrspace(1)* %in) #0 {
entry:
  %v = load float, float addrspace(1)* %in
  %cc = fcmp oeq float %v, 1.000000e+00
  br i1 %cc, label %if, label %endif

if:
  %u = fdiv float %v, %v
  br label %endif

endif:
  %r = phi float [ %v, %entry ], [ %u, %if ]
  store float %r, float addrspace(1)* %out
  ret void
}

; vcc branch with SGPR inputs
; GCN-LABEL: {{^}}test_vccnz_sgpr_ifcvt_triangle:
; GCN: v_cmp_neq_f32_e64
; GCN: s_cbranch_vccnz [[ENDIF:BB[0-9]+_[0-9]+]]

; GCN: s_add_i32

; GCN: [[ENDIF]]:
; GCN: buffer_store_dword
define amdgpu_kernel void @test_vccnz_sgpr_ifcvt_triangle(i32 addrspace(1)* %out, i32 addrspace(4)* %in, float %cnd) #0 {
entry:
  %v = load i32, i32 addrspace(4)* %in
  %cc = fcmp oeq float %cnd, 1.000000e+00
  br i1 %cc, label %if, label %endif

if:
  %u = add i32 %v, %v
  br label %endif

endif:
  %r = phi i32 [ %v, %entry ], [ %u, %if ]
  store i32 %r, i32 addrspace(1)* %out
  ret void
}

; GCN-LABEL: {{^}}test_vccnz_ifcvt_triangle_constant_load:
; GCN: v_cndmask_b32
define amdgpu_kernel void @test_vccnz_ifcvt_triangle_constant_load(float addrspace(1)* %out, float addrspace(4)* %in) #0 {
entry:
  %v = load float, float addrspace(4)* %in
  %cc = fcmp oeq float %v, 1.000000e+00
  br i1 %cc, label %if, label %endif

if:
  %u = fadd float %v, %v
  br label %endif

endif:
  %r = phi float [ %v, %entry ], [ %u, %if ]
  store float %r, float addrspace(1)* %out
  ret void
}

; Due to broken cost heuristic, this is not if converted like
; test_vccnz_ifcvt_triangle_constant_load even though it should be.

; GCN-LABEL: {{^}}test_vccnz_ifcvt_triangle_argload:
; GCN: v_cndmask_b32
define amdgpu_kernel void @test_vccnz_ifcvt_triangle_argload(float addrspace(1)* %out, float %v) #0 {
entry:
  %cc = fcmp oeq float %v, 1.000000e+00
  br i1 %cc, label %if, label %endif

if:
  %u = fadd float %v, %v
  br label %endif

endif:
  %r = phi float [ %v, %entry ], [ %u, %if ]
  store float %r, float addrspace(1)* %out
  ret void
}

; Scalar branch and scalar inputs
; GCN-LABEL: {{^}}test_scc1_sgpr_ifcvt_triangle:
; GCN: s_load_dword [[VAL:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x0
; GCN: s_add_i32 [[ADD:s[0-9]+]], [[VAL]], [[VAL]]
; GCN: s_cmp_lg_u32 s{{[0-9]+}}, 1
; GCN-NEXT: s_cselect_b32 [[SELECT:s[0-9]+]], [[VAL]], [[ADD]]
define amdgpu_kernel void @test_scc1_sgpr_ifcvt_triangle(i32 addrspace(4)* %in, i32 %cond) #0 {
entry:
  %v = load i32, i32 addrspace(4)* %in
  %cc = icmp eq i32 %cond, 1
  br i1 %cc, label %if, label %endif

if:
  %u = add i32 %v, %v
  br label %endif

endif:
  %r = phi i32 [ %v, %entry ], [ %u, %if ]
  call void asm sideeffect "; reg use $0", "s"(i32 %r) #0
  ret void
}

; FIXME: Should be able to use VALU compare and select
; Scalar branch but VGPR select operands
; GCN-LABEL: {{^}}test_scc1_vgpr_ifcvt_triangle:
; GCN: s_cmp_lg_u32
; GCN: s_cbranch_scc1 [[ENDIF:BB[0-9]+_[0-9]+]]

; GCN: v_add_f32_e32

; GCN: [[ENDIF]]:
; GCN: buffer_store_dword
define amdgpu_kernel void @test_scc1_vgpr_ifcvt_triangle(float addrspace(1)* %out, float addrspace(1)* %in, i32 %cond) #0 {
entry:
  %v = load float, float addrspace(1)* %in
  %cc = icmp eq i32 %cond, 1
  br i1 %cc, label %if, label %endif

if:
  %u = fadd float %v, %v
  br label %endif

endif:
  %r = phi float [ %v, %entry ], [ %u, %if ]
  store float %r, float addrspace(1)* %out
  ret void
}

; GCN-LABEL: {{^}}test_scc1_sgpr_ifcvt_triangle64:
; GCN: s_add_u32
; GCN: s_addc_u32
; GCN: s_cmp_lg_u32 s{{[0-9]+}}, 1
; GCN-NEXT: s_cselect_b64 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}
define amdgpu_kernel void @test_scc1_sgpr_ifcvt_triangle64(i64 addrspace(4)* %in, i32 %cond) #0 {
entry:
  %v = load i64, i64 addrspace(4)* %in
  %cc = icmp eq i32 %cond, 1
  br i1 %cc, label %if, label %endif

if:
  %u = add i64 %v, %v
  br label %endif

endif:
  %r = phi i64 [ %v, %entry ], [ %u, %if ]
  call void asm sideeffect "; reg use $0", "s"(i64 %r) #0
  ret void
}

; TODO: Can do s_cselect_b64; s_cselect_b32
; GCN-LABEL: {{^}}test_scc1_sgpr_ifcvt_triangle96:
; GCN: s_add_i32
; GCN: s_add_i32
; GCN: s_add_i32
; GCN: s_cmp_lg_u32 s{{[0-9]+}}, 1
; GCN-NEXT: s_cselect_b32 s
; GCN-NEXT: s_cselect_b32 s
; GCN-NEXT: s_cselect_b32 s
define amdgpu_kernel void @test_scc1_sgpr_ifcvt_triangle96(<3 x i32> addrspace(4)* %in, i32 %cond) #0 {
entry:
  %v = load <3 x i32>, <3 x i32> addrspace(4)* %in
  %cc = icmp eq i32 %cond, 1
  br i1 %cc, label %if, label %endif

if:
  %u = add <3 x i32> %v, %v
  br label %endif

endif:
  %r = phi <3 x i32> [ %v, %entry ], [ %u, %if ]
  %r.ext = shufflevector <3 x i32> %r, <3 x i32> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  call void asm sideeffect "; reg use $0", "s"(<4 x i32> %r.ext) #0
  ret void
}

; GCN-LABEL: {{^}}test_scc1_sgpr_ifcvt_triangle128:
; GCN: s_add_i32
; GCN: s_add_i32
; GCN: s_add_i32
; GCN: s_add_i32
; GCN: s_cmp_lg_u32 s{{[0-9]+}}, 1
; GCN-NEXT: s_cselect_b64 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}
; GCN-NEXT: s_cselect_b64 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}
define amdgpu_kernel void @test_scc1_sgpr_ifcvt_triangle128(<4 x i32> addrspace(4)* %in, i32 %cond) #0 {
entry:
  %v = load <4 x i32>, <4 x i32> addrspace(4)* %in
  %cc = icmp eq i32 %cond, 1
  br i1 %cc, label %if, label %endif

if:
  %u = add <4 x i32> %v, %v
  br label %endif

endif:
  %r = phi <4 x i32> [ %v, %entry ], [ %u, %if ]
  call void asm sideeffect "; reg use $0", "s"(<4 x i32> %r) #0
  ret void
}

; GCN-LABEL: {{^}}uniform_if_swap_br_targets_scc_constant_select:
; GCN: s_cmp_lg_u32 s{{[0-9]+}}, 0
; GCN: s_cselect_b32 s{{[0-9]+}}, 0, 1{{$}}
define amdgpu_kernel void @uniform_if_swap_br_targets_scc_constant_select(i32 %cond, i32 addrspace(1)* %out) {
entry:
  %cmp0 = icmp eq i32 %cond, 0
  br i1 %cmp0, label %else, label %if

if:
  br label %done

else:
  br label %done

done:
  %value = phi i32 [0, %if], [1, %else]
  store i32 %value, i32 addrspace(1)* %out
  ret void
}

; GCN-LABEL: {{^}}ifcvt_undef_scc:
; GCN: {{^}}; %bb.0:
; GCN-NEXT: s_load_dwordx2
; GCN-NEXT: s_cselect_b32 s{{[0-9]+}}, 0, 1{{$}}
define amdgpu_kernel void @ifcvt_undef_scc(i32 %cond, i32 addrspace(1)* %out) {
entry:
  br i1 undef, label %else, label %if

if:
  br label %done

else:
  br label %done

done:
  %value = phi i32 [0, %if], [1, %else]
  store i32 %value, i32 addrspace(1)* %out
  ret void
}

; GCN-LABEL: {{^}}test_vccnz_ifcvt_triangle256:
; GCN: v_cmp_neq_f32
; GCN: s_cbranch_vccnz [[ENDIF:BB[0-9]+_[0-9]+]]

; GCN: v_add_i32
; GCN: v_add_i32

; GCN: [[ENDIF]]:
; GCN: buffer_store_dword
define amdgpu_kernel void @test_vccnz_ifcvt_triangle256(<8 x i32> addrspace(1)* %out, <8 x i32> addrspace(1)* %in, float %cnd) #0 {
entry:
  %v = load <8 x i32>, <8 x i32> addrspace(1)* %in
  %cc = fcmp oeq float %cnd, 1.000000e+00
  br i1 %cc, label %if, label %endif

if:
  %u = add <8 x i32> %v, %v
  br label %endif

endif:
  %r = phi <8 x i32> [ %v, %entry ], [ %u, %if ]
  store <8 x i32> %r, <8 x i32> addrspace(1)* %out
  ret void
}

; GCN-LABEL: {{^}}test_vccnz_ifcvt_triangle512:
; GCN: v_cmp_neq_f32
; GCN: s_cbranch_vccnz [[ENDIF:BB[0-9]+_[0-9]+]]

; GCN: v_add_i32
; GCN: v_add_i32

; GCN: [[ENDIF]]:
; GCN: buffer_store_dword
define amdgpu_kernel void @test_vccnz_ifcvt_triangle512(<16 x i32> addrspace(1)* %out, <16 x i32> addrspace(1)* %in, float %cnd) #0 {
entry:
  %v = load <16 x i32>, <16 x i32> addrspace(1)* %in
  %cc = fcmp oeq float %cnd, 1.000000e+00
  br i1 %cc, label %if, label %endif

if:
  %u = add <16 x i32> %v, %v
  br label %endif

endif:
  %r = phi <16 x i32> [ %v, %entry ], [ %u, %if ]
  store <16 x i32> %r, <16 x i32> addrspace(1)* %out
  ret void
}

attributes #0 = { nounwind }