; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck --check-prefixes=GCN,GFX8,VARIANT0 %s
; RUN: llc -march=amdgcn -mattr=+auto-waitcnt-before-barrier -verify-machineinstrs < %s | FileCheck --check-prefixes=GCN,GFX8,VARIANT1 %s
; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck --check-prefixes=GCN,GFX9,VARIANT2 %s
; RUN: llc -march=amdgcn -mcpu=gfx900 -mattr=+auto-waitcnt-before-barrier -verify-machineinstrs < %s | FileCheck --check-prefixes=GCN,GFX9,VARIANT3 %s

; Code-generation test for the llvm.amdgcn.s.barrier intrinsic.
;
; The kernel is compiled four ways: with and without -mcpu=gfx900, and each of
; those with and without -mattr=+auto-waitcnt-before-barrier.
;
; In the expected output for the two invocations without the feature
; (VARIANT0 and VARIANT2) an s_waitcnt sits directly in front of s_barrier.
; In the two with the feature (VARIANT1 and VARIANT3) the store is followed
; by s_barrier with no s_waitcnt directly in front of it.
;
; Kernel body, in pseudo-C:
;   out[tid] = tid;
;   barrier();
;   out[tid] = out[(size - 1) - tid];
define amdgpu_kernel void @test_barrier(i32 addrspace(1)* %out, i32 %size) #0 {
; VARIANT0-LABEL: test_barrier:
; VARIANT0: ; %bb.0: ; %entry
; VARIANT0-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9
; VARIANT0-NEXT: s_load_dword s2, s[0:1], 0xb
; VARIANT0-NEXT: s_mov_b32 s7, 0xf000
; VARIANT0-NEXT: s_mov_b32 s6, 0
; VARIANT0-NEXT: v_lshlrev_b32_e32 v1, 2, v0
; VARIANT0-NEXT: v_mov_b32_e32 v2, 0
; VARIANT0-NEXT: v_not_b32_e32 v3, v0
; VARIANT0-NEXT: s_waitcnt lgkmcnt(0)
; VARIANT0-NEXT: buffer_store_dword v0, v[1:2], s[4:7], 0 addr64
; VARIANT0-NEXT: s_waitcnt vmcnt(0) expcnt(0)
; VARIANT0-NEXT: s_barrier
; VARIANT0-NEXT: v_add_i32_e32 v3, vcc, s2, v3
; VARIANT0-NEXT: v_ashrrev_i32_e32 v4, 31, v3
; VARIANT0-NEXT: v_lshl_b64 v[3:4], v[3:4], 2
; VARIANT0-NEXT: buffer_load_dword v0, v[3:4], s[4:7], 0 addr64
; VARIANT0-NEXT: s_waitcnt vmcnt(0)
; VARIANT0-NEXT: buffer_store_dword v0, v[1:2], s[4:7], 0 addr64
; VARIANT0-NEXT: s_endpgm
;
; VARIANT1-LABEL: test_barrier:
; VARIANT1: ; %bb.0: ; %entry
; VARIANT1-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9
; VARIANT1-NEXT: s_load_dword s2, s[0:1], 0xb
; VARIANT1-NEXT: s_mov_b32 s7, 0xf000
; VARIANT1-NEXT: s_mov_b32 s6, 0
; VARIANT1-NEXT: v_lshlrev_b32_e32 v1, 2, v0
; VARIANT1-NEXT: v_mov_b32_e32 v2, 0
; VARIANT1-NEXT: v_not_b32_e32 v3, v0
; VARIANT1-NEXT: s_waitcnt lgkmcnt(0)
; VARIANT1-NEXT: buffer_store_dword v0, v[1:2], s[4:7], 0 addr64
; VARIANT1-NEXT: s_barrier
; VARIANT1-NEXT: v_add_i32_e32 v3, vcc, s2, v3
; VARIANT1-NEXT: v_ashrrev_i32_e32 v4, 31, v3
; VARIANT1-NEXT: v_lshl_b64 v[3:4], v[3:4], 2
; VARIANT1-NEXT: s_waitcnt expcnt(0)
; VARIANT1-NEXT: buffer_load_dword v0, v[3:4], s[4:7], 0 addr64
; VARIANT1-NEXT: s_waitcnt vmcnt(0)
; VARIANT1-NEXT: buffer_store_dword v0, v[1:2], s[4:7], 0 addr64
; VARIANT1-NEXT: s_endpgm
;
; VARIANT2-LABEL: test_barrier:
; VARIANT2: ; %bb.0: ; %entry
; VARIANT2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; VARIANT2-NEXT: s_load_dword s0, s[0:1], 0x2c
; VARIANT2-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; VARIANT2-NEXT: s_waitcnt lgkmcnt(0)
; VARIANT2-NEXT: global_store_dword v2, v0, s[2:3]
; VARIANT2-NEXT: v_xad_u32 v0, v0, -1, s0
; VARIANT2-NEXT: v_ashrrev_i32_e32 v1, 31, v0
; VARIANT2-NEXT: v_lshlrev_b64 v[0:1], 2, v[0:1]
; VARIANT2-NEXT: v_mov_b32_e32 v3, s3
; VARIANT2-NEXT: v_add_co_u32_e32 v0, vcc, s2, v0
; VARIANT2-NEXT: v_addc_co_u32_e32 v1, vcc, v3, v1, vcc
; VARIANT2-NEXT: s_waitcnt vmcnt(0)
; VARIANT2-NEXT: s_barrier
; VARIANT2-NEXT: global_load_dword v0, v[0:1], off
; VARIANT2-NEXT: s_waitcnt vmcnt(0)
; VARIANT2-NEXT: global_store_dword v2, v0, s[2:3]
; VARIANT2-NEXT: s_endpgm
;
; VARIANT3-LABEL: test_barrier:
; VARIANT3: ; %bb.0: ; %entry
; VARIANT3-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; VARIANT3-NEXT: s_load_dword s0, s[0:1], 0x2c
; VARIANT3-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; VARIANT3-NEXT: s_waitcnt lgkmcnt(0)
; VARIANT3-NEXT: global_store_dword v2, v0, s[2:3]
; VARIANT3-NEXT: v_xad_u32 v0, v0, -1, s0
; VARIANT3-NEXT: v_ashrrev_i32_e32 v1, 31, v0
; VARIANT3-NEXT: v_lshlrev_b64 v[0:1], 2, v[0:1]
; VARIANT3-NEXT: v_mov_b32_e32 v3, s3
; VARIANT3-NEXT: v_add_co_u32_e32 v0, vcc, s2, v0
; VARIANT3-NEXT: v_addc_co_u32_e32 v1, vcc, v3, v1, vcc
; VARIANT3-NEXT: s_barrier
; VARIANT3-NEXT: global_load_dword v0, v[0:1], off
; VARIANT3-NEXT: s_waitcnt vmcnt(0)
; VARIANT3-NEXT: global_store_dword v2, v0, s[2:3]
; VARIANT3-NEXT: s_endpgm
entry:
  ; Phase 1: each work-item writes its own x id into its own element of %out.
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %own.slot = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
  store i32 %tid, i32 addrspace(1)* %own.slot

  ; The barrier under test sits between the store above and the load below.
  call void @llvm.amdgcn.s.barrier()

  ; Phase 2: read the mirrored element, index (size - 1) - tid, and copy it
  ; into this work-item's own element.
  %last.idx = sub i32 %size, 1
  %mirror.idx = sub i32 %last.idx, %tid
  %mirror.slot = getelementptr i32, i32 addrspace(1)* %out, i32 %mirror.idx
  %mirror.val = load i32, i32 addrspace(1)* %mirror.slot
  store i32 %mirror.val, i32 addrspace(1)* %own.slot
  ret void
}

; Intrinsics used by the kernel.
declare void @llvm.amdgcn.s.barrier() #1
declare i32 @llvm.amdgcn.workitem.id.x() #2

; #0: the kernel, #1: the barrier intrinsic, #2: the work-item id intrinsic.
attributes #0 = { nounwind }
attributes #1 = { convergent nounwind }
attributes #2 = { nounwind readnone }