; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=tahiti -o - -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,LOOP %s
; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=hawaii -o - -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,LOOP %s
; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=fiji -o - -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,LOOP %s
; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx900 -o - -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,NOLOOP,NOLOOP-SDAG %s
; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1010 -asm-verbose=0 -o - -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,NOLOOP,NOLOOP-SDAG %s

; Codegen tests for the llvm.amdgcn.ds.gws.init intrinsic.
;
; Check prefixes, as enabled by the run lines above
;   GCN           every run.
;   LOOP          the tahiti, hawaii and fiji runs. Where checked, ds_gws_init
;                 is expected inside a loop that writes 0 to the
;                 hwreg(HW_REG_TRAPSTS, 8, 1) field, issues the instruction,
;                 waits on all counters, reads the field back and branches to
;                 the loop head again while the value read is nonzero.
;   NOLOOP        the gfx900 and gfx1010 runs. ds_gws_init is checked without
;                 the surrounding loop sequence.
;   NOLOOP-SDAG   the same runs as NOLOOP.
;   NOLOOP-GISEL  not enabled by any run line in this file, so checks carrying
;                 this prefix are currently inert.

; Minimum offset
; GCN-LABEL: {{^}}gws_init_offset0:
; GCN-DAG: s_load_dword [[BAR_NUM:s[0-9]+]]
; GCN-DAG: s_mov_b32 m0, 0{{$}}
; GCN: v_mov_b32_e32 v0, [[BAR_NUM]]
; NOLOOP: ds_gws_init v0 gds{{$}}

; LOOP: [[LOOP:BB[0-9]+_[0-9]+]]:
; LOOP-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_TRAPSTS, 8, 1), 0
; LOOP-NEXT: ds_gws_init v0 gds
; LOOP-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; LOOP-NEXT: s_getreg_b32 [[GETREG:s[0-9]+]], hwreg(HW_REG_TRAPSTS, 8, 1)
; LOOP-NEXT: s_cmp_lg_u32 [[GETREG]], 0
; LOOP-NEXT: s_cbranch_scc1 [[LOOP]]
define amdgpu_kernel void @gws_init_offset0(i32 %val) #0 {
  call void @llvm.amdgcn.ds.gws.init(i32 %val, i32 0)
  ret void
}

; Maximum offset
; GCN-LABEL: {{^}}gws_init_offset63:
; NOLOOP-DAG: s_load_dword [[BAR_NUM:s[0-9]+]]
; NOLOOP-DAG: s_mov_b32 m0, 0{{$}}
; NOLOOP-DAG: v_mov_b32_e32 v0, [[BAR_NUM]]
; NOLOOP: ds_gws_init v0 offset:63 gds{{$}}


; LOOP: s_mov_b32 m0, 0{{$}}
; LOOP: [[LOOP:BB[0-9]+_[0-9]+]]:
; LOOP-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_TRAPSTS, 8, 1), 0
; LOOP-NEXT: ds_gws_init v0 offset:63 gds
; LOOP-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; LOOP-NEXT: s_getreg_b32 [[GETREG:s[0-9]+]], hwreg(HW_REG_TRAPSTS, 8, 1)
; LOOP-NEXT: s_cmp_lg_u32 [[GETREG]], 0
; LOOP-NEXT: s_cbranch_scc1 [[LOOP]]
define amdgpu_kernel void @gws_init_offset63(i32 %val) #0 {
  call void @llvm.amdgcn.ds.gws.init(i32 %val, i32 63)
  ret void
}

; Variable offset passed as a kernel argument. The checks expect the offset
; to be shifted left by 16 before it reaches m0.
; FIXME: Should be able to shift directly into m0
; GCN-LABEL: {{^}}gws_init_sgpr_offset:
; NOLOOP-DAG: s_load_dwordx2 s{{\[}}[[BAR_NUM:[0-9]+]]:[[OFFSET:[0-9]+]]{{\]}}

; NOLOOP-SDAG-DAG: s_lshl_b32 [[SHL:s[0-9]+]], s[[OFFSET]], 16
; NOLOOP-SDAG-DAG: s_mov_b32 m0, [[SHL]]{{$}}

; NOLOOP-GISEL-DAG: s_lshl_b32 m0, s[[OFFSET]], 16

; NOLOOP-DAG: v_mov_b32_e32 [[GWS_VAL:v[0-9]+]], s[[BAR_NUM]]
; NOLOOP: ds_gws_init [[GWS_VAL]] gds{{$}}
define amdgpu_kernel void @gws_init_sgpr_offset(i32 %val, i32 %offset) #0 {
  call void @llvm.amdgcn.ds.gws.init(i32 %val, i32 %offset)
  ret void
}

; Variable offset in SGPR with constant add
; The checks expect the constant to appear as the instruction's offset field
; while the variable part still goes through m0.
; GCN-LABEL: {{^}}gws_init_sgpr_offset_add1:
; NOLOOP-DAG: s_load_dwordx2 s{{\[}}[[BAR_NUM:[0-9]+]]:[[OFFSET:[0-9]+]]{{\]}}

; NOLOOP-SDAG-DAG: s_lshl_b32 [[SHL:s[0-9]+]], s[[OFFSET]], 16
; NOLOOP-SDAG-DAG: s_mov_b32 m0, [[SHL]]{{$}}

; NOLOOP-GISEL-DAG: s_lshl_b32 m0, s[[OFFSET]], 16

; NOLOOP-DAG: v_mov_b32_e32 [[GWS_VAL:v[0-9]+]], s[[BAR_NUM]]
; NOLOOP: ds_gws_init [[GWS_VAL]] offset:1 gds{{$}}
define amdgpu_kernel void @gws_init_sgpr_offset_add1(i32 %val, i32 %offset.base) #0 {
  %offset = add i32 %offset.base, 1
  call void @llvm.amdgcn.ds.gws.init(i32 %val, i32 %offset)
  ret void
}

; Variable offset taken from the workitem id. The checks expect it to be read
; with v_readfirstlane_b32 before being shifted into m0.
; GCN-LABEL: {{^}}gws_init_vgpr_offset:
; NOLOOP-DAG: s_load_dword [[BAR_NUM:s[0-9]+]]
; NOLOOP-DAG: v_readfirstlane_b32 [[READLANE:s[0-9]+]], v0

; NOLOOP-SDAG-DAG: s_lshl_b32 [[SHL:s[0-9]+]], [[READLANE]], 16
; NOLOOP-SDAG-DAG: s_mov_b32 m0, [[SHL]]{{$}}

; NOLOOP-GISEL-DAG: s_lshl_b32 m0, [[READLANE]], 16

; NOLOOP-DAG: v_mov_b32_e32 v0, [[BAR_NUM]]
; NOLOOP: ds_gws_init v0 gds{{$}}
define amdgpu_kernel void @gws_init_vgpr_offset(i32 %val) #0 {
  %vgpr.offset = call i32 @llvm.amdgcn.workitem.id.x()
  call void @llvm.amdgcn.ds.gws.init(i32 %val, i32 %vgpr.offset)
  ret void
}

; Variable offset in VGPR with constant add
; GCN-LABEL: {{^}}gws_init_vgpr_offset_add:
; NOLOOP-DAG: s_load_dword [[BAR_NUM:s[0-9]+]]
; NOLOOP-DAG: v_readfirstlane_b32 [[READLANE:s[0-9]+]], v0

; NOLOOP-SDAG-DAG: s_lshl_b32 [[SHL:s[0-9]+]], [[READLANE]], 16
; NOLOOP-SDAG-DAG: s_mov_b32 m0, [[SHL]]{{$}}

; NOLOOP-GISEL-DAG: s_lshl_b32 m0, [[READLANE]], 16

; NOLOOP-DAG: v_mov_b32_e32 v0, [[BAR_NUM]]
; NOLOOP: ds_gws_init v0 offset:3 gds{{$}}
define amdgpu_kernel void @gws_init_vgpr_offset_add(i32 %val) #0 {
  %vgpr.offset.base = call i32 @llvm.amdgcn.workitem.id.x()
  %vgpr.offset = add i32 %vgpr.offset.base, 3
  call void @llvm.amdgcn.ds.gws.init(i32 %val, i32 %vgpr.offset)
  ret void
}

; LDS variable written before and after the GWS operation in the test below.
@lds = internal unnamed_addr addrspace(3) global i32 undef

; Check if m0 initialization is shared.
; In the runs with the loop, m0 is expected to be -1 for each ds_write_b32 and
; 0 for the ds_gws_init in between.
; GCN-LABEL: {{^}}gws_init_save_m0_init_constant_offset:
; NOLOOP: s_mov_b32 m0, 0
; NOLOOP: ds_gws_init v{{[0-9]+}} offset:10 gds

; LOOP: s_mov_b32 m0, -1
; LOOP: ds_write_b32
; LOOP: s_mov_b32 m0, 0
; LOOP: s_setreg_imm32_b32
; LOOP: ds_gws_init v{{[0-9]+}} offset:10 gds
; LOOP: s_cbranch_scc1

; LOOP: s_mov_b32 m0, -1
; LOOP: ds_write_b32
define amdgpu_kernel void @gws_init_save_m0_init_constant_offset(i32 %val) #0 {
  store volatile i32 1, i32 addrspace(3)* @lds
  call void @llvm.amdgcn.ds.gws.init(i32 %val, i32 10)
  store i32 2, i32 addrspace(3)* @lds
  ret void
}

; Non-kernel function. A wait on all counters is expected directly after
; ds_gws_init, immediately followed by the return.
; GCN-LABEL: {{^}}gws_init_lgkmcnt:
; NOLOOP: s_mov_b32 m0, 0{{$}}
; NOLOOP: ds_gws_init v0 gds{{$}}
; NOLOOP-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; NOLOOP-NEXT: s_setpc_b64
define void @gws_init_lgkmcnt(i32 %val) {
  call void @llvm.amdgcn.ds.gws.init(i32 %val, i32 0)
  ret void
}

; Does not imply memory fence on its own
; Between the first lgkmcnt wait and ds_gws_init no further s_waitcnt may
; appear, even though a global store precedes the intrinsic call.
; GCN-LABEL: {{^}}gws_init_wait_before:
; NOLOOP: s_waitcnt lgkmcnt(0)
; NOLOOP-NOT: s_waitcnt
; NOLOOP: ds_gws_init
; NOLOOP-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
define amdgpu_kernel void @gws_init_wait_before(i32 %val, i32 addrspace(1)* %ptr) #0 {
  store i32 0, i32 addrspace(1)* %ptr
  call void @llvm.amdgcn.ds.gws.init(i32 %val, i32 7)
  ret void
}

declare void @llvm.amdgcn.ds.gws.init(i32, i32) #1
declare i32 @llvm.amdgcn.workitem.id.x() #2

attributes #0 = { nounwind }
attributes #1 = { convergent inaccessiblememonly nounwind writeonly }
attributes #2 = { nounwind readnone speculatable }