1 /* Copyright 2015 The TensorFlow Authors. All Rights Reserved.
2 
3 Licensed under the Apache License, Version 2.0 (the "License");
4 you may not use this file except in compliance with the License.
5 You may obtain a copy of the License at
6 
7     http://www.apache.org/licenses/LICENSE-2.0
8 
9 Unless required by applicable law or agreed to in writing, software
10 distributed under the License is distributed on an "AS IS" BASIS,
11 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 See the License for the specific language governing permissions and
13 limitations under the License.
14 ==============================================================================*/
15 
16 // Suite of types that represent device memory allocations. These are
17 // allocated by the StreamExecutor interface, which produces values appropriate
18 // for the underlying platform (whether it be CUDA or OpenCL).
19 //
20 // The untyped base class (like a device void*) is DeviceMemoryBase, which can
21 // be specialized for a given allocation type (like a device T*) using
22 // DeviceMemory<T>.
23 
24 #ifndef TENSORFLOW_STREAM_EXECUTOR_DEVICE_MEMORY_H_
25 #define TENSORFLOW_STREAM_EXECUTOR_DEVICE_MEMORY_H_
26 
#include <stddef.h>

#include <functional>

#include "tensorflow/stream_executor/platform/port.h"
30 
namespace perftools {
namespace gputools {

// Compatibility shim: the using-directive below makes every name declared in
// stream_executor reachable through the legacy perftools::gputools namespace,
// so code that still spells perftools::gputools::Foo keeps compiling while it
// is migrated to the new namespace.
//
// NOTE(review): this requires namespace stream_executor to have been declared
// already -- presumably by the port.h include above; confirm.
//
// TODO(b/77980417): Remove this once we've completed the migration.
using namespace stream_executor;  // NOLINT[build/namespaces]

}  // namespace gputools
}  // namespace perftools
41 
42 namespace stream_executor {
43 
44 class StreamExecutor;
45 
46 // void*-analogous device memory allocation. For the typed variation, see
47 // DeviceMemory<T>.
48 //
49 // This is effectively a two-tuple of a pointer and size; however, note that the
50 // pointer may not be to the virtual address itself -- in OpenCL the pointer is
51 // to a cl_mem handle that describes the device allocation. Therefore,
52 // DeviceMemoryBase::opaque does not necessarily produce a pointer that can be
53 // referenced directly, so use it with caution.
54 //
55 // Thread-compatible.
56 class DeviceMemoryBase {
57  public:
58   // Default constructor instantiates a null-pointed, zero-sized device memory
59   // region. An opaque pointer may be provided -- see header for details on the
60   // opacity of that pointer.
61   explicit DeviceMemoryBase(void *opaque = nullptr, uint64 size = 0,
62                             bool is_sub_buffer = false)
opaque_(opaque)63       : opaque_(opaque), size_(size), is_sub_buffer_(is_sub_buffer) {}
64 
65   // Returns whether the backing memory is the null pointer.
66   // A `== nullptr` convenience method is also provided.
is_null()67   bool is_null() const { return opaque_ == nullptr; }
68   bool operator==(std::nullptr_t other) const { return is_null(); }
69   bool operator!=(std::nullptr_t other) const { return !is_null(); }
70 
71   // Provides a partial order between device memory values.
72   //
73   // This operator is provided so that this object can be used as a key in an
74   // ordered map.
75   bool operator<(const DeviceMemoryBase &other) const {
76     return opaque() < other.opaque();
77   }
78 
79   // Returns the size, in bytes, for the backing memory.
size()80   uint64 size() const { return size_; }
81 
82   // Warning: note that the pointer returned is not necessarily directly to
83   // device virtual address space, but is platform-dependent.
opaque()84   void *opaque() { return opaque_; }
opaque()85   const void *opaque() const { return opaque_; }
86 
87   // Returns true if this is an offset into another primary allocation.
is_sub_buffer()88   bool is_sub_buffer() const { return is_sub_buffer_; }
89 
90   // Returns whether the two DeviceMemoryBase segments are identical (both in
91   // their opaque pointer and size).
IsSameAs(const DeviceMemoryBase & other)92   bool IsSameAs(const DeviceMemoryBase &other) const {
93     return opaque() == other.opaque() && size() == other.size();
94   }
95 
96  protected:
97   friend class StreamExecutor;
98 
99   // Resets the internal values of the opaque pointer and number of bytes in the
100   // memory region, just as in the constructor.
Reset(void * opaque,uint64 bytes)101   void Reset(void *opaque, uint64 bytes) {
102     opaque_ = opaque;
103     size_ = bytes;
104   }
105 
106  private:
107   void *opaque_;  // Platform-dependent value representing allocated memory.
108   uint64 size_;   // Size in bytes of this allocation.
109   bool is_sub_buffer_;  // Is this a primary allocation or a sub-buffer?
110 };
111 
112 // Typed wrapper around "void *"-like DeviceMemoryBase.
113 //
114 // For example, DeviceMemory<int> is a simple wrapper around DeviceMemoryBase
115 // that represents one or more integers in Device memory.
116 //
117 // Thread-compatible.
118 template <typename ElemT>
119 class DeviceMemory final : public DeviceMemoryBase {
120  public:
121   // Default constructor instantiates a null-pointed, zero-sized memory region.
DeviceMemory()122   DeviceMemory() : DeviceMemoryBase(nullptr, 0) {}
DeviceMemory(std::nullptr_t)123   DeviceMemory(std::nullptr_t) : DeviceMemory() {}
124 
125   // Typed device memory regions may be constructed from untyped device memory
126   // regions, this effectively amounts to a cast from a void*.
DeviceMemory(const DeviceMemoryBase & other)127   explicit DeviceMemory(const DeviceMemoryBase &other)
128       : DeviceMemoryBase(const_cast<DeviceMemoryBase &>(other).opaque(),
129                          other.size(), other.is_sub_buffer()) {}
130 
131   // Returns the number of elements of type ElemT that constitute this
132   // allocation.
ElementCount()133   uint64 ElementCount() const { return size() / sizeof(ElemT); }
134 
135   // Returns whether this is a single-element allocation.
IsScalar()136   bool IsScalar() const { return ElementCount() == 1; }
137 
138   // Create a typed area of DeviceMemory with a given opaque pointer and the
139   // quantity of bytes in the allocation. This function is broken out to
140   // distinguish bytes from an element count.
MakeFromByteSize(void * opaque,uint64 bytes)141   static DeviceMemory<ElemT> MakeFromByteSize(void *opaque, uint64 bytes) {
142     return DeviceMemory<ElemT>(opaque, bytes);
143   }
144 
145   // Resets the DeviceMemory data, in MakeFromByteSize fashion.
146   // This simply clobbers the prior values.
ResetFromByteSize(void * opaque,uint64 bytes)147   void ResetFromByteSize(void *opaque, uint64 bytes) {
148     // TODO(leary) when NVCC is eliminated we can add this check (and the
149     // logging include it requires).
150     // CHECK_EQ(0, bytes % sizeof(ElemT));
151     DeviceMemoryBase::Reset(opaque, bytes);
152   }
153 
154   // ------------------------------------------------------------
155 
156  protected:
157   // This constructor is solely used from derived classes; it is made protected
158   // because it accepts a byte-size instead of an element count, which could
159   // potentially be misused given the ElementCount() nature of this interface.
160   //
161   // In order to specify the desire to use byte size instead of element count
162   // explicitly, use MakeFromByteSize.
DeviceMemory(void * opaque,uint64 size)163   DeviceMemory(void *opaque, uint64 size) : DeviceMemoryBase(opaque, size) {}
164 };
165 
166 // A class to encapsulate the type and size of a dynamic shared memory
167 // buffer. Because the buffer exists solely on the device and is not copyable
168 // to the host, memory objects of this type do not maintain buffer pointers
169 // on the host.
170 template <typename ElemT>
171 class SharedDeviceMemory final : public DeviceMemoryBase {
172  public:
SharedDeviceMemory(uint64 elem_count)173   explicit SharedDeviceMemory(uint64 elem_count)
174       : DeviceMemoryBase(nullptr, elem_count * kElemSize) {}
175 
176   static constexpr size_t kElemSize = sizeof(ElemT);
177 
178   // Returns the number of elements of type ElemT that constitute this
179   // allocation.
ElementCount()180   uint64 ElementCount() const { return size() / kElemSize; }
181 
182   // Returns whether this is a single-element allocation.
IsScalar()183   bool IsScalar() const { return ElementCount() == 1; }
184 };
185 
186 // Similar to the typed DeviceMemory, but is the unique owner of its
187 // memory, if any. ScopedDeviceMemory is thread-compatible. It is also
188 // movable and uncopyable to represent unique ownership.
189 template <typename ElemT>
190 class ScopedDeviceMemory {
191  public:
192   // Default construction initializes the internal state to nullptr.  This
193   // mirrors the std::unique_ptr<> functionality, where default construction
194   // produces a nullptr unique_ptr, which can be assigned later.
195   ScopedDeviceMemory();
196 
197   // Parameters:
198   //  parent: Executor used to deallocate memory when this instance goes
199   //          out of scope.
200   //  value: Already-allocated device memory value for this scoped mechanism to
201   //         deallocate. This memory must have been allocated by parent.
202   ScopedDeviceMemory(StreamExecutor *parent, DeviceMemoryBase value);
203 
204   // Constructor overload that places a literal array into device memory
205   ScopedDeviceMemory(StreamExecutor *parent,
206                      std::initializer_list<ElemT> values);
207 
208   // Moves ownership of the memory from other to the constructed
209   // object.
210   //
211   // Postcondition: other == nullptr.
ScopedDeviceMemory(ScopedDeviceMemory && other)212   ScopedDeviceMemory(ScopedDeviceMemory &&other) noexcept:
213       ScopedDeviceMemory(other.parent_, other.Release()) {}
214 
215   // Releases the memory that was provided in the constructor, through the
216   // "parent" StreamExecutor.
217   ~ScopedDeviceMemory();
218 
219   // Moves ownership of the memory from other to this object.
220   //
221   // Postcondition: other == nullptr.
222   ScopedDeviceMemory& operator=(ScopedDeviceMemory &&other) {
223     Reset(other.Release());
224     parent_ = other.parent_;
225     return *this;
226   }
227 
228   // Returns the memory that backs this scoped allocation converted to
229   // DeviceMemory<T> apparent type. This is useful for cases where the
230   // DeviceMemory must be passed by const-ref, as the ScopedDeviceMemory doesn't
231   // allow copying, for scoped-object-lifetime reasons.
cref()232   const DeviceMemory<ElemT> &cref() const { return wrapped_; }
233 
234   // Returns a pointer to the DeviceMemory<T> apparent type for use in mutable
235   // operations. The value returned should not be used outside the scope of this
236   // ScopedDeviceMemory object's lifetime.
ptr()237   DeviceMemory<ElemT> *ptr() { return &wrapped_; }
ptr()238   const DeviceMemory<ElemT> *ptr() const { return &wrapped_; }
239 
240   // Smart-pointer-like operators for the wrapped DeviceMemory.
241   // This reference must not be used outside the lifetime of this
242   // ScopedDeviceMemory.
243   const DeviceMemory<ElemT> &operator*() const { return cref(); }
244   DeviceMemory<ElemT> *operator->() { return ptr(); }
245   const DeviceMemory<ElemT> *operator->() const { return ptr(); }
246   bool operator==(std::nullptr_t other) const { return wrapped_.is_null(); }
247   bool operator!=(std::nullptr_t other) const { return !wrapped_.is_null(); }
248 
249   // Analogous to std::unique_ptr::reset, frees the existing memory held in
250   // this scoped memory container and replaces it with updated. Ownership
251   // of updated is transferred to this object.
252   void Reset(DeviceMemory<ElemT> updated);
253   void Reset(std::nullptr_t);
254 
255   // Analogous to std::unique_ptr::release, releases ownership of the held
256   // memory and transfers it to the caller.
257   //
258   // Postcondition: *this == nullptr
Release()259   DeviceMemory<ElemT> Release() {
260     auto tmp = wrapped_;
261     wrapped_.ResetFromByteSize(nullptr, 0);
262     return tmp;
263   }
264 
265  private:
266   DeviceMemory<ElemT> wrapped_;  // Value we wrap with scoped-release.
267   StreamExecutor *parent_;       // See constructor.
268 
269   SE_DISALLOW_COPY_AND_ASSIGN(ScopedDeviceMemory);
270 };
271 
// Host-side mirrors of the packed-and-aligned vector datatypes used on the
// device side. These types can appear in device kernel signatures, so they
// are provided here to allow kernels to be launched with them.

// Two consecutive floats.
struct Float2 {
  float x;
  float y;
};

// Four floats, stored as two consecutive Float2 values.
struct Float4 {
  Float2 xz;
  Float2 yw;
};

// Two consecutive doubles.
struct Double2 {
  double x;
  double y;
};

// The host layouts must contain no padding: each struct has to be exactly as
// large as the scalars it is made of.
static_assert(2 * sizeof(float) == sizeof(Float2), "Float2 must be packed");
static_assert(4 * sizeof(float) == sizeof(Float4), "Float4 must be packed");
static_assert(2 * sizeof(double) == sizeof(Double2), "Double2 must be packed");
291 
292 }  // namespace stream_executor
293 
294 #endif  // TENSORFLOW_STREAM_EXECUTOR_DEVICE_MEMORY_H_
295