; RUN: llc -march=amdgcn -verify-machineinstrs -enable-misched -asm-verbose -disable-block-placement < %s | FileCheck -check-prefix=SI %s

; Tests in this file check the exec-mask manipulation emitted for control flow
; that depends on the workitem id.

declare i32 @llvm.amdgcn.workitem.id.x() nounwind readnone

; SI-LABEL: {{^}}test_if:
; Make sure the i1 values created by the cfg structurizer pass are
; moved using VALU instructions


; waitcnt should be inserted after exec modification
; SI: v_cmp_lt_i32_e32 vcc, 1,
; SI-NEXT: s_mov_b64 {{s\[[0-9]+:[0-9]+\]}}, 0
; SI-NEXT: s_mov_b64 {{s\[[0-9]+:[0-9]+\]}}, 0
; SI-NEXT: s_and_saveexec_b64 [[SAVE1:s\[[0-9]+:[0-9]+\]]], vcc
; SI-NEXT: s_xor_b64 [[SAVE2:s\[[0-9]+:[0-9]+\]]], exec, [[SAVE1]]
; SI-NEXT: s_cbranch_execz [[FLOW_BB:BB[0-9]+_[0-9]+]]

; SI-NEXT: ; %bb.{{[0-9]+}}: ; %LeafBlock3
; SI: s_mov_b64 {{s\[[0-9]+:[0-9]+\]}}, -1
; SI: s_and_saveexec_b64
; SI-NEXT: s_cbranch_execnz

; v_mov should be after exec modification
; SI: [[FLOW_BB]]:
; SI-NEXT: s_or_saveexec_b64 [[SAVE3:s\[[0-9]+:[0-9]+\]]], [[SAVE2]]
; SI-NEXT: s_xor_b64 exec, exec, [[SAVE3]]
;
; The kernel switches on the workitem id. Cases 1 and 2 and both arms of the
; default path each store a different constant (13, 17, 19, 21) to dst[b].
define amdgpu_kernel void @test_if(i32 %b, i32 addrspace(1)* %src, i32 addrspace(1)* %dst) #1 {
entry:
  %tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
  switch i32 %tid, label %default [
    i32 1, label %case1
    i32 2, label %case2
  ]

case1:
  %arrayidx1 = getelementptr i32, i32 addrspace(1)* %dst, i32 %b
  store i32 13, i32 addrspace(1)* %arrayidx1, align 4
  br label %end

case2:
  %arrayidx5 = getelementptr i32, i32 addrspace(1)* %dst, i32 %b
  store i32 17, i32 addrspace(1)* %arrayidx5, align 4
  br label %end

default:
  ; Second-level branch inside the default path of the switch.
  %cmp8 = icmp eq i32 %tid, 2
  %arrayidx10 = getelementptr i32, i32 addrspace(1)* %dst, i32 %b
  br i1 %cmp8, label %if, label %else

if:
  store i32 19, i32 addrspace(1)* %arrayidx10, align 4
  br label %end

else:
  store i32 21, i32 addrspace(1)* %arrayidx10, align 4
  br label %end

end:
  ret void
}

; SI-LABEL: {{^}}simple_test_v_if:
; SI: v_cmp_ne_u32_e32 vcc, 0, v{{[0-9]+}}
; SI: s_and_saveexec_b64 [[BR_SREG:s\[[0-9]+:[0-9]+\]]], vcc
; SI-NEXT: s_cbranch_execz [[EXIT:BB[0-9]+_[0-9]+]]

; SI-NEXT: ; %bb.{{[0-9]+}}:
; SI: buffer_store_dword

; SI-NEXT: {{^}}[[EXIT]]:
; SI: s_endpgm
;
; Stores 999 to dst[tid] only for workitems whose id is non-zero; both paths
; join in a single exit block.
define amdgpu_kernel void @simple_test_v_if(i32 addrspace(1)* %dst, i32 addrspace(1)* %src) #1 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
  %is.0 = icmp ne i32 %tid, 0
  br i1 %is.0, label %then, label %exit

then:
  %gep = getelementptr i32, i32 addrspace(1)* %dst, i32 %tid
  store i32 999, i32 addrspace(1)* %gep
  br label %exit

exit:
  ret void
}

; FIXME: It would be better to endpgm in the then block.

; SI-LABEL: {{^}}simple_test_v_if_ret_else_ret:
; SI: v_cmp_ne_u32_e32 vcc, 0, v{{[0-9]+}}
; SI: s_and_saveexec_b64 [[BR_SREG:s\[[0-9]+:[0-9]+\]]], vcc
; SI-NEXT: s_cbranch_execz [[EXIT:BB[0-9]+_[0-9]+]]

; SI-NEXT: ; %bb.{{[0-9]+}}:
; SI: buffer_store_dword

; SI-NEXT: {{^}}[[EXIT]]:
; SI: s_endpgm
;
; Same condition and store as simple_test_v_if, but the then block returns
; directly instead of branching to the exit block.
define amdgpu_kernel void @simple_test_v_if_ret_else_ret(i32 addrspace(1)* %dst, i32 addrspace(1)* %src) #1 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %is.0 = icmp ne i32 %tid, 0
  br i1 %is.0, label %then, label %exit

then:
  %gep = getelementptr i32, i32 addrspace(1)* %dst, i32 %tid
  store i32 999, i32 addrspace(1)* %gep
  ret void

exit:
  ret void
}

; Final block has more than a ret to execute. This was miscompiled
; before function exit blocks were unified since the endpgm would
; terminate the then wavefront before reaching the store.
; SI-LABEL: {{^}}simple_test_v_if_ret_else_code_ret:
; SI: v_cmp_eq_u32_e32 vcc, 0, v{{[0-9]+}}
; SI: s_and_saveexec_b64 [[BR_SREG:s\[[0-9]+:[0-9]+\]]], vcc
; SI: s_xor_b64 [[BR_SREG]], exec, [[BR_SREG]]
; SI: s_cbranch_execnz [[EXIT:BB[0-9]+_[0-9]+]]

; SI-NEXT: {{^BB[0-9]+_[0-9]+}}: ; %Flow
; SI-NEXT: s_or_saveexec_b64
; SI-NEXT: s_xor_b64 exec, exec
; SI-NEXT: s_cbranch_execz [[UNIFIED_RETURN:BB[0-9]+_[0-9]+]]

; SI-NEXT: ; %bb.{{[0-9]+}}: ; %then
; SI: s_waitcnt
; SI-NEXT: buffer_store_dword

; SI-NEXT: {{^}}[[UNIFIED_RETURN]]: ; %UnifiedReturnBlock
; SI: s_endpgm

; SI-NEXT: {{^}}[[EXIT]]:
; SI: ds_write_b32
;
; Both branch targets return, but the exit block first performs a volatile
; store of 7 through an undef addrspace(3) pointer, so it is not just a ret.
define amdgpu_kernel void @simple_test_v_if_ret_else_code_ret(i32 addrspace(1)* %dst, i32 addrspace(1)* %src) #1 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %is.0 = icmp ne i32 %tid, 0
  br i1 %is.0, label %then, label %exit

then:
  %gep = getelementptr i32, i32 addrspace(1)* %dst, i32 %tid
  store i32 999, i32 addrspace(1)* %gep
  ret void

exit:
  store volatile i32 7, i32 addrspace(3)* undef
  ret void
}

; SI-LABEL: {{^}}simple_test_v_loop:
; SI: v_cmp_ne_u32_e32 vcc, 0, v{{[0-9]+}}
; SI: s_and_saveexec_b64 [[BR_SREG:s\[[0-9]+:[0-9]+\]]], vcc
; SI-NEXT: s_cbranch_execz [[LABEL_EXIT:BB[0-9]+_[0-9]+]]

; SI: s_mov_b64 {{s\[[0-9]+:[0-9]+\]}}, 0{{$}}

; SI: [[LABEL_LOOP:BB[0-9]+_[0-9]+]]:
; SI: buffer_load_dword
; SI-DAG: buffer_store_dword
; SI-DAG: s_cmpk_lg_i32 s{{[0-9]+}}, 0x100
; SI: s_cbranch_scc1 [[LABEL_LOOP]]
; SI: [[LABEL_EXIT]]:
; SI: s_endpgm
;
; Workitems with a non-zero id enter a loop whose counter runs from tid up to
; tid + 64; each iteration loads a value and stores it to dst[i].

define amdgpu_kernel void @simple_test_v_loop(i32 addrspace(1)* %dst, i32 addrspace(1)* %src) #1 {
entry:
  %tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
  %is.0 = icmp ne i32 %tid, 0
  %limit = add i32 %tid, 64
  br i1 %is.0, label %loop, label %exit

loop:
  %i = phi i32 [%tid, %entry], [%i.inc, %loop]
  ; NOTE(review): %gep.src is computed but unused; the load below reads %src
  ; directly. Confirm this is intentional before changing it, since the checks
  ; above depend on the code generated for this loop.
  %gep.src = getelementptr i32, i32 addrspace(1)* %src, i32 %i
  %gep.dst = getelementptr i32, i32 addrspace(1)* %dst, i32 %i
  %load = load i32, i32 addrspace(1)* %src
  store i32 %load, i32 addrspace(1)* %gep.dst
  %i.inc = add nsw i32 %i, 1
  %cmp = icmp eq i32 %limit, %i.inc
  br i1 %cmp, label %exit, label %loop

exit:
  ret void
}

; SI-LABEL: {{^}}multi_vcond_loop:

; Load loop limit from buffer
; Branch to exit if uniformly not taken
; SI: ; %bb.0:
; SI: buffer_load_dword [[VBOUND:v[0-9]+]]
; SI: v_cmp_lt_i32_e32 vcc
; SI: s_and_saveexec_b64 [[OUTER_CMP_SREG:s\[[0-9]+:[0-9]+\]]], vcc
; SI-NEXT: s_cbranch_execz [[LABEL_EXIT:BB[0-9]+_[0-9]+]]

; Initialize inner condition to false
; SI: ; %bb.{{[0-9]+}}: ; %bb10.preheader
; SI: s_mov_b64 [[COND_STATE:s\[[0-9]+:[0-9]+\]]], 0{{$}}

; Clear exec bits for workitems that load -1s
; SI: [[LABEL_LOOP:BB[0-9]+_[0-9]+]]:
; SI: buffer_load_dword [[B:v[0-9]+]]
; SI: buffer_load_dword [[A:v[0-9]+]]
; SI-DAG: v_cmp_ne_u32_e64 [[NEG1_CHECK_0:s\[[0-9]+:[0-9]+\]]], -1, [[A]]
; SI-DAG: v_cmp_ne_u32_e32 [[NEG1_CHECK_1:vcc]], -1, [[B]]
; SI: s_and_b64 [[ORNEG1:s\[[0-9]+:[0-9]+\]]], [[NEG1_CHECK_1]], [[NEG1_CHECK_0]]
; SI: s_and_saveexec_b64 [[ORNEG2:s\[[0-9]+:[0-9]+\]]], [[ORNEG1]]
; SI: s_cbranch_execz [[LABEL_FLOW:BB[0-9]+_[0-9]+]]

; SI: ; %bb.{{[0-9]+}}: ; %bb20
; SI: buffer_store_dword

; SI: [[LABEL_FLOW]]:
; SI-NEXT: ; in Loop: Header=[[LABEL_LOOP]]
; SI-NEXT: s_or_b64 exec, exec, [[ORNEG2]]
; SI-NEXT: s_and_b64 [[TMP1:s\[[0-9]+:[0-9]+\]]],
; SI-NEXT: s_or_b64 [[COND_STATE]], [[TMP1]], [[COND_STATE]]
; SI-NEXT: s_andn2_b64 exec, exec, [[COND_STATE]]
; SI-NEXT: s_cbranch_execnz [[LABEL_LOOP]]

; SI: [[LABEL_EXIT]]:
; SI-NOT: [[COND_STATE]]
; SI: s_endpgm
;
; Each workitem loads a bound from arg3[tid] and, if it is positive, loops:
; it loads arg1[i + tid] and arg2[i + tid], leaves the loop if either value is
; -1, otherwise stores their sum to arg[i + tid] and continues while the
; incremented counter is below the bound.

define amdgpu_kernel void @multi_vcond_loop(i32 addrspace(1)* noalias nocapture %arg, i32 addrspace(1)* noalias nocapture readonly %arg1, i32 addrspace(1)* noalias nocapture readonly %arg2, i32 addrspace(1)* noalias nocapture readonly %arg3) #1 {
bb:
  %tmp = tail call i32 @llvm.amdgcn.workitem.id.x() #0
  %tmp4 = sext i32 %tmp to i64
  %tmp5 = getelementptr inbounds i32, i32 addrspace(1)* %arg3, i64 %tmp4
  %tmp6 = load i32, i32 addrspace(1)* %tmp5, align 4
  %tmp7 = icmp sgt i32 %tmp6, 0
  %tmp8 = sext i32 %tmp6 to i64
  br i1 %tmp7, label %bb10, label %bb26

bb10:                                             ; preds = %bb, %bb20
  %tmp11 = phi i64 [ %tmp23, %bb20 ], [ 0, %bb ]
  %tmp12 = add nsw i64 %tmp11, %tmp4
  %tmp13 = getelementptr inbounds i32, i32 addrspace(1)* %arg1, i64 %tmp12
  %tmp14 = load i32, i32 addrspace(1)* %tmp13, align 4
  %tmp15 = getelementptr inbounds i32, i32 addrspace(1)* %arg2, i64 %tmp12
  %tmp16 = load i32, i32 addrspace(1)* %tmp15, align 4
  %tmp17 = icmp ne i32 %tmp14, -1
  %tmp18 = icmp ne i32 %tmp16, -1
  %tmp19 = and i1 %tmp17, %tmp18
  br i1 %tmp19, label %bb20, label %bb26

bb20:                                             ; preds = %bb10
  %tmp21 = add nsw i32 %tmp16, %tmp14
  %tmp22 = getelementptr inbounds i32, i32 addrspace(1)* %arg, i64 %tmp12
  store i32 %tmp21, i32 addrspace(1)* %tmp22, align 4
  %tmp23 = add nuw nsw i64 %tmp11, 1
  %tmp24 = icmp slt i64 %tmp23, %tmp8
  br i1 %tmp24, label %bb10, label %bb26

bb26:                                             ; preds = %bb10, %bb20, %bb
  ret void
}

attributes #0 = { nounwind readnone }
attributes #1 = { nounwind }