; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=tahiti -o - -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,LOOP %s
; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=hawaii -o - -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,LOOP %s
; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=fiji -o - -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,LOOP %s
; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx900 -o - -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,NOLOOP,NOLOOP-SDAG %s
; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1010 -asm-verbose=0 -o - -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,NOLOOP,NOLOOP-SDAG,GFX10 %s

; Make sure the op is emitted bundled with a waitcnt, both with and without the
; retry loop, and that the bundle is not removed by ExpandPostRAPseudos.
; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=tahiti -stop-after=postrapseudos -o - -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=MIR %s
; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx900 -stop-after=postrapseudos -o - -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=MIR %s


; Minimum offset
; GCN-LABEL: {{^}}gws_barrier_offset0:
; NOLOOP-DAG: s_load_dword [[BAR_NUM:s[0-9]+]]
; NOLOOP-DAG: s_mov_b32 m0, 0{{$}}
; NOLOOP: v_mov_b32_e32 v0, [[BAR_NUM]]
; NOLOOP: ds_gws_barrier v0 gds{{$}}

; LOOP: s_mov_b32 m0, 0{{$}}
; LOOP: [[LOOP:BB[0-9]+_[0-9]+]]:
; LOOP-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_TRAPSTS, 8, 1), 0
; LOOP-NEXT: ds_gws_barrier v0 gds
; LOOP-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; LOOP-NEXT: s_getreg_b32 [[GETREG:s[0-9]+]], hwreg(HW_REG_TRAPSTS, 8, 1)
; LOOP-NEXT: s_cmp_lg_u32 [[GETREG]], 0
; LOOP-NEXT: s_cbranch_scc1 [[LOOP]]

; MIR-LABEL: name: gws_barrier_offset0{{$}}
; MIR: BUNDLE implicit{{( killed)?( renamable)?}} $vgpr0, implicit $m0, implicit $exec {
; MIR-NEXT: DS_GWS_BARRIER renamable $vgpr0, 0, implicit $m0, implicit $exec :: (load 4 from custom "GWSResource")
; MIR-NEXT: S_WAITCNT 0
; MIR-NEXT: }
define amdgpu_kernel void @gws_barrier_offset0(i32 %val) #0 {
  call void @llvm.amdgcn.ds.gws.barrier(i32 %val, i32 0)
  ret void
}

; MIR-LABEL: name: gws_barrier_offset63{{$}}

; Maximum offset
; GCN-LABEL: {{^}}gws_barrier_offset63:
; NOLOOP-DAG: s_load_dword [[BAR_NUM:s[0-9]+]]
; NOLOOP-DAG: s_mov_b32 m0, 0{{$}}
; NOLOOP-DAG: v_mov_b32_e32 v0, [[BAR_NUM]]
; NOLOOP: ds_gws_barrier v0 offset:63 gds{{$}}
define amdgpu_kernel void @gws_barrier_offset63(i32 %val) #0 {
  call void @llvm.amdgcn.ds.gws.barrier(i32 %val, i32 63)
  ret void
}

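; Variable offset in SGPR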
; FIXME: Should be able to shift directly into m0
; GCN-LABEL: {{^}}gws_barrier_sgpr_offset:
; NOLOOP-DAG: s_load_dwordx2 s{{\[}}[[BAR_NUM:[0-9]+]]:[[OFFSET:[0-9]+]]{{\]}}

; NOLOOP-SDAG-DAG: s_lshl_b32 [[SHL:s[0-9]+]], s[[OFFSET]], 16
; NOLOOP-SDAG-DAG: s_mov_b32 m0, [[SHL]]{{$}}

; NOLOOP-GISEL-DAG: s_lshl_b32 m0, s[[OFFSET]], 16

; NOLOOP-DAG: v_mov_b32_e32 [[GWS_VAL:v[0-9]+]], s[[BAR_NUM]]
; NOLOOP: ds_gws_barrier [[GWS_VAL]] gds{{$}}
define amdgpu_kernel void @gws_barrier_sgpr_offset(i32 %val, i32 %offset) #0 {
  call void @llvm.amdgcn.ds.gws.barrier(i32 %val, i32 %offset)
  ret void
}

; Variable offset in SGPR with constant add
; GCN-LABEL: {{^}}gws_barrier_sgpr_offset_add1:
; NOLOOP-DAG: s_load_dwordx2 s{{\[}}[[BAR_NUM:[0-9]+]]:[[OFFSET:[0-9]+]]{{\]}}

; NOLOOP-SDAG-DAG: s_lshl_b32 [[SHL:s[0-9]+]], s[[OFFSET]], 16
; NOLOOP-SDAG-DAG: s_mov_b32 m0, [[SHL]]{{$}}

; NOLOOP-GISEL-DAG: s_lshl_b32 m0, s[[OFFSET]], 16

; NOLOOP-DAG: v_mov_b32_e32 [[GWS_VAL:v[0-9]+]], s[[BAR_NUM]]
; NOLOOP: ds_gws_barrier [[GWS_VAL]] offset:1 gds{{$}}
define amdgpu_kernel void @gws_barrier_sgpr_offset_add1(i32 %val, i32 %offset.base) #0 {
  %offset = add i32 %offset.base, 1
  call void @llvm.amdgcn.ds.gws.barrier(i32 %val, i32 %offset)
  ret void
}

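; Variable offset in VGPR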
; GCN-LABEL: {{^}}gws_barrier_vgpr_offset:
; NOLOOP-DAG: s_load_dword [[BAR_NUM:s[0-9]+]]
; NOLOOP-DAG: v_readfirstlane_b32 [[READLANE:s[0-9]+]], v0

; NOLOOP-SDAG-DAG: s_lshl_b32 [[SHL:s[0-9]+]], [[READLANE]], 16
; NOLOOP-SDAG-DAG: s_mov_b32 m0, [[SHL]]{{$}}

; NOLOOP-GISEL-DAG: s_lshl_b32 m0, [[READLANE]], 16

; NOLOOP-DAG: v_mov_b32_e32 [[GWS_VAL:v[0-9]+]], [[BAR_NUM]]
; NOLOOP: ds_gws_barrier [[GWS_VAL]] gds{{$}}
define amdgpu_kernel void @gws_barrier_vgpr_offset(i32 %val) #0 {
  %vgpr.offset = call i32 @llvm.amdgcn.workitem.id.x()
  call void @llvm.amdgcn.ds.gws.barrier(i32 %val, i32 %vgpr.offset)
  ret void
}

; Variable offset in VGPR with constant add
; GCN-LABEL: {{^}}gws_barrier_vgpr_offset_add:
; NOLOOP-DAG: s_load_dword [[BAR_NUM:s[0-9]+]]
; NOLOOP-DAG: v_readfirstlane_b32 [[READLANE:s[0-9]+]], v0

; NOLOOP-SDAG-DAG: s_lshl_b32 [[SHL:s[0-9]+]], [[READLANE]], 16
; NOLOOP-SDAG-DAG: s_mov_b32 m0, [[SHL]]{{$}}

; NOLOOP-GISEL-DAG: s_lshl_b32 m0, [[READLANE]], 16

; NOLOOP-DAG: v_mov_b32_e32 [[GWS_VAL:v[0-9]+]], [[BAR_NUM]]
; NOLOOP: ds_gws_barrier [[GWS_VAL]] offset:3 gds{{$}}
define amdgpu_kernel void @gws_barrier_vgpr_offset_add(i32 %val) #0 {
  %vgpr.offset.base = call i32 @llvm.amdgcn.workitem.id.x()
  %vgpr.offset = add i32 %vgpr.offset.base, 3
  call void @llvm.amdgcn.ds.gws.barrier(i32 %val, i32 %vgpr.offset)
  ret void
}

@lds = internal unnamed_addr addrspace(3) global i32 undef

; Check if m0 initialization is shared
; GCN-LABEL: {{^}}gws_barrier_save_m0_barrier_constant_offset:
; NOLOOP: s_mov_b32 m0, 0
; NOLOOP: ds_gws_barrier v{{[0-9]+}} offset:10 gds

; LOOP: s_mov_b32 m0, -1
; LOOP: ds_write_b32
; LOOP: s_mov_b32 m0, 0
; LOOP: s_setreg_imm32_b32
; LOOP: ds_gws_barrier v{{[0-9]+}} offset:10 gds
; LOOP: s_cbranch_scc1

; LOOP: s_mov_b32 m0, -1
; LOOP: ds_write_b32
define amdgpu_kernel void @gws_barrier_save_m0_barrier_constant_offset(i32 %val) #0 {
  store i32 1, i32 addrspace(3)* @lds
  call void @llvm.amdgcn.ds.gws.barrier(i32 %val, i32 10)
  store i32 2, i32 addrspace(3)* @lds
  ret void
}

; Make sure this increments lgkmcnt
; GCN-LABEL: {{^}}gws_barrier_lgkmcnt:
; NOLOOP: s_mov_b32 m0, 0{{$}}
; NOLOOP: ds_gws_barrier v0 gds{{$}}
; NOLOOP-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; NOLOOP-NEXT: s_setpc_b64
define void @gws_barrier_lgkmcnt(i32 %val) {
  call void @llvm.amdgcn.ds.gws.barrier(i32 %val, i32 0)
  ret void
}

; Does not imply memory fence on its own
; GCN-LABEL: {{^}}gws_barrier_wait_before:
; NOLOOP: s_waitcnt
; NOLOOP-NOT: s_waitcnt{{$}}
define amdgpu_kernel void @gws_barrier_wait_before(i32 %val, i32 addrspace(1)* %ptr) #0 {
  store i32 0, i32 addrspace(1)* %ptr
  call void @llvm.amdgcn.ds.gws.barrier(i32 %val, i32 7)
  ret void
}

; GCN-LABEL: {{^}}gws_barrier_wait_after:
; NOLOOP: s_mov_b32 m0, 0{{$}}
; NOLOOP: ds_gws_barrier v{{[0-9]+}} offset:7 gds
; NOLOOP-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; NOLOOP: load_dword
define amdgpu_kernel void @gws_barrier_wait_after(i32 %val, i32 addrspace(1)* %ptr) #0 {
  call void @llvm.amdgcn.ds.gws.barrier(i32 %val, i32 7)
  %load = load volatile i32, i32 addrspace(1)* %ptr
  ret void
}

; Does not imply memory fence on its own
; GCN-LABEL: {{^}}gws_barrier_fence_before:
; NOLOOP: s_mov_b32 m0, 0{{$}}
; NOLOOP: store_dword
; NOLOOP: s_waitcnt vmcnt(0) lgkmcnt(0)
; NOLOOP: ds_gws_barrier v{{[0-9]+}} offset:7 gds
; NOLOOP-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
define amdgpu_kernel void @gws_barrier_fence_before(i32 %val, i32 addrspace(1)* %ptr) #0 {
  store i32 0, i32 addrspace(1)* %ptr
  fence release
  call void @llvm.amdgcn.ds.gws.barrier(i32 %val, i32 7)
  ret void
}

; FIXME: Extra waitcnt
; GCN-LABEL: {{^}}gws_barrier_fence_after:
; NOLOOP: s_mov_b32 m0, 0{{$}}
; NOLOOP: ds_gws_barrier v{{[0-9]+}} offset:7 gds
; NOLOOP-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; NOLOOP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; NOLOOP-NEXT: load_dword
define amdgpu_kernel void @gws_barrier_fence_after(i32 %val, i32 addrspace(1)* %ptr) #0 {
  call void @llvm.amdgcn.ds.gws.barrier(i32 %val, i32 7)
  fence release
  %load = load volatile i32, i32 addrspace(1)* %ptr
  ret void
}

; FIXME: Should a wait be inserted here, or is an explicit fence needed?
; GCN-LABEL: {{^}}gws_init_barrier:
; NOLOOP: s_mov_b32 m0, 0
; NOLOOP: ds_gws_init v0 offset:7 gds
; NOLOOP-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; NOLOOP-NEXT: ds_gws_barrier v0 offset:7 gds
; NOLOOP-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
define amdgpu_kernel void @gws_init_barrier(i32 %val) #0 {
  call void @llvm.amdgcn.ds.gws.init(i32 %val, i32 7)
  call void @llvm.amdgcn.ds.gws.barrier(i32 %val, i32 7)
  ret void
}

; FIXME: Why vmcnt, not expcnt?
; GCN-LABEL: {{^}}gws_init_fence_barrier:
; NOLOOP: s_mov_b32 m0, 0
; NOLOOP: ds_gws_init v0 offset:7 gds
; NOLOOP-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; NOLOOP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; NOLOOP-NEXT: ds_gws_barrier v0 offset:7 gds
; NOLOOP-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
define amdgpu_kernel void @gws_init_fence_barrier(i32 %val) #0 {
  call void @llvm.amdgcn.ds.gws.init(i32 %val, i32 7)
  fence release
  call void @llvm.amdgcn.ds.gws.barrier(i32 %val, i32 7)
  ret void
}

declare void @llvm.amdgcn.ds.gws.barrier(i32, i32) #1
declare void @llvm.amdgcn.ds.gws.init(i32, i32) #2
declare i32 @llvm.amdgcn.workitem.id.x() #3

attributes #0 = { nounwind }
attributes #1 = { convergent inaccessiblememonly nounwind }
attributes #2 = { convergent inaccessiblememonly nounwind writeonly }
attributes #3 = { nounwind readnone speculatable }