1//===------- target_impl.hip - AMDGCN OpenMP GPU implementation --- HIP -*-===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// Definitions of target specific functions
10//
11//===----------------------------------------------------------------------===//
12
13#include "target_impl.h"
14
15// Implementations initially derived from hcc
16
17// Initialized with a 64-bit mask with bits set in positions less than the
18// thread's lane number in the warp
19DEVICE __kmpc_impl_lanemask_t __kmpc_impl_lanemask_lt() {
20  uint32_t lane = GetLaneId();
21  int64_t ballot = __kmpc_impl_activemask();
22  uint64_t mask = ((uint64_t)1 << lane) - (uint64_t)1;
23  return mask & ballot;
24}
25
26// Initialized with a 64-bit mask with bits set in positions greater than the
27// thread's lane number in the warp
28DEVICE __kmpc_impl_lanemask_t __kmpc_impl_lanemask_gt() {
29  uint32_t lane = GetLaneId();
30  if (lane == (WARPSIZE - 1))
31    return 0;
32  uint64_t ballot = __kmpc_impl_activemask();
33  uint64_t mask = (~((uint64_t)0)) << (lane + 1);
34  return mask & ballot;
35}
36
37DEVICE double __kmpc_impl_get_wtick() { return ((double)1E-9); }
38
39DEVICE double __kmpc_impl_get_wtime() {
40  // The intrinsics for measuring time have undocumented frequency
41  // This will probably need to be found by measurement on a number of
42  // architectures. Until then, return 0, which is very inaccurate as a
43  // timer but resolves the undefined symbol at link time.
44  return 0;
45}
46
// Warp vote function
// Returns the current EXEC mask: one bit per lane in the wavefront, set for
// lanes that are active at the call site.
DEVICE __kmpc_impl_lanemask_t __kmpc_impl_activemask() {
  return __builtin_amdgcn_read_exec();
}
51
52DEVICE int32_t __kmpc_impl_shfl_sync(__kmpc_impl_lanemask_t, int32_t var,
53                                     int32_t srcLane) {
54  int width = WARPSIZE;
55  int self = GetLaneId();
56  int index = srcLane + (self & ~(width - 1));
57  return __builtin_amdgcn_ds_bpermute(index << 2, var);
58}
59
60DEVICE int32_t __kmpc_impl_shfl_down_sync(__kmpc_impl_lanemask_t, int32_t var,
61                                          uint32_t laneDelta, int32_t width) {
62  int self = GetLaneId();
63  int index = self + laneDelta;
64  index = (int)(laneDelta + (self & (width - 1))) >= width ? self : index;
65  return __builtin_amdgcn_ds_bpermute(index << 2, var);
66}
67
68static DEVICE SHARED uint32_t L1_Barrier;
69
// One-time device-side initialization: zero the named-barrier word.
DEVICE void __kmpc_impl_target_init() {
  // Don't have global ctors, and shared memory is not zero init
  __atomic_store_n(&L1_Barrier, 0u, __ATOMIC_RELEASE);
}
74
// Synchronize |num_threads| threads at a named barrier built on the
// shared-memory word L1_Barrier. Preconditions (asserted below):
// num_threads is a non-zero whole multiple of WARPSIZE and the resulting
// wave count fits in 16 bits.
DEVICE void __kmpc_impl_named_sync(uint32_t num_threads) {
  __atomic_thread_fence(__ATOMIC_ACQUIRE);

  uint32_t num_waves = num_threads / WARPSIZE;

  // Partial barrier implementation for amdgcn.
  // Uses two 16 bit unsigned counters. One for the number of waves to have
  // reached the barrier, and one to count how many times the barrier has been
  // passed. These are packed in a single atomically accessed 32 bit integer.
  // Low bits for the number of waves, assumed zero before this call.
  // High bits to count the number of times the barrier has been passed.

  assert(num_waves != 0);
  assert(num_waves * WARPSIZE == num_threads);
  assert(num_waves < 0xffffu);

  // Increment the low 16 bits once, using the lowest active thread.
  uint64_t lowestActiveThread = __kmpc_impl_ffs(__kmpc_impl_activemask()) - 1;
  bool isLowest = GetLaneId() == lowestActiveThread;

  // Only the lowest active lane of each wave touches the counter; the other
  // lanes fall through to the release fence below.
  if (isLowest) {
    uint32_t load =
        __atomic_fetch_add(&L1_Barrier, 1, __ATOMIC_RELAXED); // commutative

    // Record the number of times the barrier has been passed
    uint32_t generation = load & 0xffff0000u;

    if ((load & 0x0000ffffu) == (num_waves - 1)) {
      // Reached num_waves in low bits so this is the last wave.
      // Set low bits to zero and increment high bits
      load += 0x00010000u; // wrap is safe
      load &= 0xffff0000u; // because bits zeroed second

      // Reset the wave counter and release the waiting waves
      __atomic_store_n(&L1_Barrier, load, __ATOMIC_RELAXED);
    } else {
      // more waves still to go, spin until generation counter changes
      do {
        __builtin_amdgcn_s_sleep(0);
        load = __atomic_load_n(&L1_Barrier, __ATOMIC_RELAXED);
      } while ((load & 0xffff0000u) == generation);
    }
  }
  __atomic_thread_fence(__ATOMIC_RELEASE);
}
120
121namespace {
122DEVICE uint32_t get_grid_dim(uint32_t n, uint16_t d) {
123  uint32_t q = n / d;
124  return q + (n > q * d);
125}
126DEVICE uint32_t get_workgroup_dim(uint32_t group_id, uint32_t grid_size,
127                                  uint16_t group_size) {
128  uint32_t r = grid_size - group_id * group_size;
129  return (r < group_size) ? r : group_size;
130}
131} // namespace
132
133DEVICE int GetNumberOfBlocksInKernel() {
134  return get_grid_dim(__builtin_amdgcn_grid_size_x(), __builtin_amdgcn_workgroup_size_x());
135}
136
137DEVICE int GetNumberOfThreadsInBlock() {
138  return get_workgroup_dim(__builtin_amdgcn_workgroup_id_x(), __builtin_amdgcn_grid_size_x(),
139                           __builtin_amdgcn_workgroup_size_x());
140}
141
142DEVICE unsigned GetWarpId() { return GetThreadIdInBlock() / WARPSIZE; }
// Lane index of the calling thread within its wavefront. The chained
// mbcnt_lo/mbcnt_hi with an all-ones mask counts the bit positions below the
// caller's lane across the full 64-bit lane mask, which yields the lane id.
DEVICE unsigned GetLaneId() {
  return __builtin_amdgcn_mbcnt_hi(~0u, __builtin_amdgcn_mbcnt_lo(~0u, 0u));
}
146
147// Stub implementations
148DEVICE void *__kmpc_impl_malloc(size_t ) { return nullptr }
149DEVICE void __kmpc_impl_free(void *) {}
150