//===------- target_impl.hip - AMDGCN OpenMP GPU implementation --- HIP -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// Definitions of target specific functions
//
//===----------------------------------------------------------------------===//

#include "target_impl.h"

// Implementations initially derived from hcc

// Returns a 64-bit mask with bits set in positions less than the
// thread's lane number in the warp
DEVICE __kmpc_impl_lanemask_t __kmpc_impl_lanemask_lt() {
  uint32_t lane = GetLaneId();
  uint64_t ballot = __kmpc_impl_activemask();
  uint64_t mask = ((uint64_t)1 << lane) - (uint64_t)1;
  return mask & ballot;
}

// Returns a 64-bit mask with bits set in positions greater than the
// thread's lane number in the warp
DEVICE __kmpc_impl_lanemask_t __kmpc_impl_lanemask_gt() {
  uint32_t lane = GetLaneId();
  if (lane == (WARPSIZE - 1))
    return 0;
  uint64_t ballot = __kmpc_impl_activemask();
  uint64_t mask = (~((uint64_t)0)) << (lane + 1);
  return mask & ballot;
}

DEVICE double __kmpc_impl_get_wtick() { return ((double)1E-9); }

DEVICE double __kmpc_impl_get_wtime() {
  // The intrinsics for measuring time have undocumented frequency.
  // This will probably need to be found by measurement on a number of
  // architectures. Until then, return 0, which is very inaccurate as a
  // timer but resolves the undefined symbol at link time.
  return 0;
}

// Warp vote function
DEVICE __kmpc_impl_lanemask_t __kmpc_impl_activemask() {
  return __builtin_amdgcn_read_exec();
}

DEVICE int32_t __kmpc_impl_shfl_sync(__kmpc_impl_lanemask_t, int32_t var,
                                     int32_t srcLane) {
  int width = WARPSIZE;
  int self = GetLaneId();
  int index = srcLane + (self & ~(width - 1));
  // ds_bpermute addresses lanes in bytes, hence the shift by 2
  return __builtin_amdgcn_ds_bpermute(index << 2, var);
}

DEVICE int32_t __kmpc_impl_shfl_down_sync(__kmpc_impl_lanemask_t, int32_t var,
                                          uint32_t laneDelta, int32_t width) {
  int self = GetLaneId();
  int index = self + laneDelta;
  // Clamp to self when the shift would cross the end of the width-sized group
  index = (int)(laneDelta + (self & (width - 1))) >= width ? self : index;
  return __builtin_amdgcn_ds_bpermute(index << 2, var);
}

static DEVICE SHARED uint32_t L1_Barrier;

DEVICE void __kmpc_impl_target_init() {
  // Don't have global ctors, and shared memory is not zero init
  __atomic_store_n(&L1_Barrier, 0u, __ATOMIC_RELEASE);
}

DEVICE void __kmpc_impl_named_sync(uint32_t num_threads) {
  __atomic_thread_fence(__ATOMIC_ACQUIRE);

  uint32_t num_waves = num_threads / WARPSIZE;

  // Partial barrier implementation for amdgcn.
  // Uses two 16-bit unsigned counters: one for the number of waves that have
  // reached the barrier, and one counting how many times the barrier has been
  // passed. These are packed in a single atomically accessed 32-bit integer.
  // Low bits for the number of waves, assumed zero before this call.
  // High bits to count the number of times the barrier has been passed.

  assert(num_waves != 0);
  assert(num_waves * WARPSIZE == num_threads);
  assert(num_waves < 0xffffu);

  // Increment the low 16 bits once, using the lowest active thread.
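  // __kmpc_impl_ffs follows ffs semantics, returning one plus the index of
  // the least significant set bit, so subtracting one yields the lane id of
  // the lowest active thread. The mask is nonzero because the calling lane
  // is itself active.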
  uint64_t lowestActiveThread = __kmpc_impl_ffs(__kmpc_impl_activemask()) - 1;
  bool isLowest = GetLaneId() == lowestActiveThread;

  if (isLowest) {
    uint32_t load =
        __atomic_fetch_add(&L1_Barrier, 1, __ATOMIC_RELAXED); // commutative

    // Record the number of times the barrier has been passed
    uint32_t generation = load & 0xffff0000u;

    if ((load & 0x0000ffffu) == (num_waves - 1)) {
      // Reached num_waves in low bits so this is the last wave.
      // Set low bits to zero and increment high bits
      load += 0x00010000u; // wrap is safe
      load &= 0xffff0000u; // because bits zeroed second

      // Reset the wave counter and release the waiting waves
      __atomic_store_n(&L1_Barrier, load, __ATOMIC_RELAXED);
    } else {
      // More waves still to go, spin until the generation counter changes
      do {
        __builtin_amdgcn_s_sleep(0);
        load = __atomic_load_n(&L1_Barrier, __ATOMIC_RELAXED);
      } while ((load & 0xffff0000u) == generation);
    }
  }
  __atomic_thread_fence(__ATOMIC_RELEASE);
}

namespace {
// Ceiling division: the number of d-sized groups needed to cover n items
DEVICE uint32_t get_grid_dim(uint32_t n, uint16_t d) {
  uint32_t q = n / d;
  return q + (n > q * d);
}
// Size of the given workgroup, accounting for a partial trailing group
DEVICE uint32_t get_workgroup_dim(uint32_t group_id, uint32_t grid_size,
                                  uint16_t group_size) {
  uint32_t r = grid_size - group_id * group_size;
  return (r < group_size) ? r : group_size;
}
} // namespace

DEVICE int GetNumberOfBlocksInKernel() {
  return get_grid_dim(__builtin_amdgcn_grid_size_x(),
                      __builtin_amdgcn_workgroup_size_x());
}

DEVICE int GetNumberOfThreadsInBlock() {
  return get_workgroup_dim(__builtin_amdgcn_workgroup_id_x(),
                           __builtin_amdgcn_grid_size_x(),
                           __builtin_amdgcn_workgroup_size_x());
}

DEVICE unsigned GetWarpId() { return GetThreadIdInBlock() / WARPSIZE; }

// mbcnt over a full 64-bit mask counts all lower lanes in the wavefront,
// which is the wave-relative lane id
DEVICE unsigned GetLaneId() {
  return __builtin_amdgcn_mbcnt_hi(~0u, __builtin_amdgcn_mbcnt_lo(~0u, 0u));
}

// Stub implementations
DEVICE void *__kmpc_impl_malloc(size_t) { return nullptr; }
DEVICE void __kmpc_impl_free(void *) {}