/* Copyright 2015 The TensorFlow Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/

// Suite of types that represent device memory allocations. These are
// allocated by the StreamExecutor interface, which produces values appropriate
// for the underlying platform (whether it be CUDA or OpenCL).
//
// The untyped base class (like a device void*) is DeviceMemoryBase, which can
// be specialized for a given allocation type (like a device T*) using
// DeviceMemory<T>.

#ifndef TENSORFLOW_STREAM_EXECUTOR_DEVICE_MEMORY_H_
#define TENSORFLOW_STREAM_EXECUTOR_DEVICE_MEMORY_H_

#include <stddef.h>

#include "tensorflow/stream_executor/platform/port.h"

namespace perftools {
namespace gputools {

// Temporarily pull stream_executor into perftools::gputools while we migrate
// code to the new namespace.  TODO(b/77980417): Remove this once we've
// completed the migration.
using namespace stream_executor;  // NOLINT[build/namespaces]

}  // namespace gputools
}  // namespace perftools

namespace stream_executor {

class DeviceMemoryAllocator;
class StreamExecutor;

// void*-analogous device memory allocation. For the typed variation, see
// DeviceMemory<T>.
//
// This is effectively a two-tuple of a pointer and size; however, note that the
// pointer may not be to the virtual address itself -- in OpenCL the pointer is
// to a cl_mem handle that describes the device allocation. Therefore,
// DeviceMemoryBase::opaque does not necessarily produce a pointer that can be
// referenced directly, so use it with caution.
//
// Thread-compatible.
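//
// Illustrative usage sketch (not part of the API above): `handle` stands in
// for a platform-specific allocation handle obtained from a StreamExecutor,
// and the byte count is arbitrary.
//
//   DeviceMemoryBase mem(handle, /*size=*/256);
//   if (!mem.is_null() && mem.size() >= 256) {
//     // Pass `mem` to StreamExecutor/Stream APIs; do not dereference
//     // mem.opaque() directly, since it may not be a device virtual address.
//   }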
class DeviceMemoryBase {
 public:
  // Default constructor instantiates a null-pointed, zero-sized device memory
  // region. An opaque pointer may be provided -- see header for details on the
  // opacity of that pointer.
  explicit DeviceMemoryBase(void *opaque = nullptr, uint64 size = 0)
      : opaque_(opaque), size_(size) {}

  // Returns whether the backing memory is the null pointer.
  // A `== nullptr` convenience method is also provided.
  bool is_null() const { return opaque_ == nullptr; }
  bool operator==(std::nullptr_t other) const { return is_null(); }
  bool operator!=(std::nullptr_t other) const { return !is_null(); }

  // Provides a partial order between device memory values.
  //
  // This operator is provided so that this object can be used as a key in an
  // ordered map.
  bool operator<(const DeviceMemoryBase &other) const {
    return opaque() < other.opaque();
  }

  // Returns the size, in bytes, for the backing memory.
  uint64 size() const { return size_; }

  // Warning: note that the pointer returned is not necessarily directly to
  // device virtual address space, but is platform-dependent.
  void *opaque() { return opaque_; }
  const void *opaque() const { return opaque_; }

  // Returns the payload of this memory region.
  uint64 payload() const { return payload_; }

  // Sets the payload to the given value.
  void SetPayload(uint64 payload) { payload_ = payload; }

  // Returns whether the two DeviceMemoryBase segments are identical (both in
  // their opaque pointer and size).
  bool IsSameAs(const DeviceMemoryBase &other) const {
    return opaque() == other.opaque() && size() == other.size();
  }

 protected:
  friend class StreamExecutor;

  // Resets the internal values of the opaque pointer and number of bytes in the
  // memory region, just as in the constructor.
  void Reset(void *opaque, uint64 bytes) {
    opaque_ = opaque;
    size_ = bytes;
  }

 private:
  void *opaque_;  // Platform-dependent value representing allocated memory.
  uint64 size_;   // Size in bytes of this allocation.
  uint64 payload_ = 0;  // Payload data associated with this allocation.
};

// Typed wrapper around "void *"-like DeviceMemoryBase.
//
// For example, DeviceMemory<int> is a simple wrapper around DeviceMemoryBase
// that represents one or more integers in device memory.
//
// Thread-compatible.
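//
// Illustrative usage sketch (hypothetical names; `handle` stands in for an
// opaque device allocation handle covering four ints):
//
//   DeviceMemory<int> ints =
//       DeviceMemory<int>::MakeFromByteSize(handle, 4 * sizeof(int));
//   uint64 n = ints.ElementCount();  // n == 4
//   bool scalar = ints.IsScalar();   // false, since n != 1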
template <typename ElemT>
class DeviceMemory final : public DeviceMemoryBase {
 public:
  // Default constructor instantiates a null-pointed, zero-sized memory region.
  DeviceMemory() : DeviceMemoryBase(nullptr, 0) {}
  explicit DeviceMemory(std::nullptr_t) : DeviceMemory() {}

  // Typed device memory regions may be constructed from untyped device memory
  // regions; this effectively amounts to a cast from a void*.
  explicit DeviceMemory(const DeviceMemoryBase &other)
      : DeviceMemoryBase(const_cast<DeviceMemoryBase &>(other).opaque(),
                         other.size()) {
    SetPayload(other.payload());
  }

  // Returns the number of elements of type ElemT that constitute this
  // allocation.
  uint64 ElementCount() const { return size() / sizeof(ElemT); }

  // Returns whether this is a single-element allocation.
  bool IsScalar() const { return ElementCount() == 1; }

  // Create a typed area of DeviceMemory with a given opaque pointer and the
  // quantity of bytes in the allocation. This function is broken out to
  // distinguish bytes from an element count.
  static DeviceMemory<ElemT> MakeFromByteSize(void *opaque, uint64 bytes) {
    return DeviceMemory<ElemT>(opaque, bytes);
  }

  // Resets the DeviceMemory data, in MakeFromByteSize fashion.
  // This simply clobbers the prior values.
  void ResetFromByteSize(void *opaque, uint64 bytes) {
    // TODO(leary) when NVCC is eliminated we can add this check (and the
    // logging include it requires).
    // CHECK_EQ(0, bytes % sizeof(ElemT));
    DeviceMemoryBase::Reset(opaque, bytes);
  }

  // ------------------------------------------------------------

 protected:
  // This constructor is solely used from derived classes; it is made protected
  // because it accepts a byte-size instead of an element count, which could
  // potentially be misused given the ElementCount() nature of this interface.
  //
  // In order to specify the desire to use byte size instead of element count
  // explicitly, use MakeFromByteSize.
  DeviceMemory(void *opaque, uint64 size) : DeviceMemoryBase(opaque, size) {}
};

// A class to encapsulate the type and size of a dynamic shared memory
// buffer. Because the buffer exists solely on the device and is not copyable
// to the host, memory objects of this type do not maintain buffer pointers
// on the host.
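//
// Illustrative sketch: describing 128 floats of dynamic shared memory for a
// kernel launch (the element count is arbitrary):
//
//   SharedDeviceMemory<float> shmem(/*elem_count=*/128);
//   // shmem.size() == 128 * sizeof(float); shmem.opaque() is null, since the
//   // buffer exists only on the device.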
template <typename ElemT>
class SharedDeviceMemory final : public DeviceMemoryBase {
 public:
  explicit SharedDeviceMemory(uint64 elem_count)
      : DeviceMemoryBase(nullptr, elem_count * kElemSize) {}

  static constexpr size_t kElemSize = sizeof(ElemT);

  // Returns the number of elements of type ElemT that constitute this
  // allocation.
  uint64 ElementCount() const { return size() / kElemSize; }

  // Returns whether this is a single-element allocation.
  bool IsScalar() const { return ElementCount() == 1; }
};

// Host-side representation of packed-and-aligned vector datatypes on the
// device side. Since these can appear in device kernel signatures, we support
// passing values of these types as kernel launch arguments.

struct Float2 {
  float x, y;
};

struct Float4 {
  Float2 xz, yw;
};

struct Double2 {
  double x, y;
};

static_assert(sizeof(Float2) == 2 * sizeof(float), "Float2 must be packed");
static_assert(sizeof(Float4) == 4 * sizeof(float), "Float4 must be packed");
static_assert(sizeof(Double2) == 2 * sizeof(double), "Double2 must be packed");

}  // namespace stream_executor

#endif  // TENSORFLOW_STREAM_EXECUTOR_DEVICE_MEMORY_H_