1 /* Copyright 2015 The TensorFlow Authors. All Rights Reserved.
2 
3 Licensed under the Apache License, Version 2.0 (the "License");
4 you may not use this file except in compliance with the License.
5 You may obtain a copy of the License at
6 
7     http://www.apache.org/licenses/LICENSE-2.0
8 
9 Unless required by applicable law or agreed to in writing, software
10 distributed under the License is distributed on an "AS IS" BASIS,
11 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 See the License for the specific language governing permissions and
13 limitations under the License.
14 ==============================================================================*/
15 
16 #if !GOOGLE_CUDA && !TENSORFLOW_USE_ROCM
17 #error This file must only be included when building with Cuda or ROCm support
18 #endif
19 
20 #ifndef TENSORFLOW_CORE_COMMON_RUNTIME_GPU_GPU_DEVICE_H_
21 #define TENSORFLOW_CORE_COMMON_RUNTIME_GPU_GPU_DEVICE_H_
22 
#include <functional>
#include <memory>
#include <set>
#include <string>
#include <unordered_map>
#include <utility>
#include <vector>

#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
#include "tensorflow/core/common_runtime/device_factory.h"
#include "tensorflow/core/common_runtime/gpu/gpu_event_mgr.h"
#include "tensorflow/core/common_runtime/gpu/gpu_id.h"
#include "tensorflow/core/common_runtime/gpu/gpu_id_manager.h"
#include "tensorflow/core/common_runtime/gpu/gpu_id_utils.h"
#include "tensorflow/core/common_runtime/gpu_device_context.h"
#include "tensorflow/core/common_runtime/local_device.h"
#include "tensorflow/core/common_runtime/scoped_allocator_mgr.h"
#include "tensorflow/core/common_runtime/shared_counter.h"
#include "tensorflow/core/framework/allocator.h"
#include "tensorflow/core/framework/device_base.h"
#include "tensorflow/core/framework/op_kernel.h"
#include "tensorflow/core/framework/tensor.h"
#include "tensorflow/core/lib/core/status.h"
#include "tensorflow/core/lib/gtl/inlined_vector.h"
#include "tensorflow/core/platform/mutex.h"
#include "tensorflow/core/platform/stream_executor.h"
#include "tensorflow/core/platform/types.h"
#include "tensorflow/core/public/session_options.h"
48 
49 namespace tensorflow {
50 class GPUKernelTracker;
51 
52 class BaseGPUDevice : public LocalDevice {
53  public:
54   BaseGPUDevice(const SessionOptions& options, const string& name,
55                 Bytes memory_limit, const DeviceLocality& locality,
56                 TfGpuId tf_gpu_id, const string& physical_device_desc,
57                 Allocator* gpu_allocator, Allocator* cpu_allocator,
58                 bool sync_every_op, int32 max_streams);
59 
60   ~BaseGPUDevice() override;
61 
62   // Initialize the device and return the status of initialization.
63   Status Init(const SessionOptions& options);
64 
65   // GPU devices require the Op Compute method to save a reference to
66   // any temporary tensors that are allocated until the Op execution
67   // completes.
68   bool RequiresRecordingAccessedTensors() const override;
69 
70   // GPU kernel execution requires us to use `tracing::ScopedAnnotation()`
71   // rather than `tracing::ScopedActivity()`, in order to relate asynchronously
72   // launched GPU kernels to the OpKernel.
TraceUsingAnnotations()73   bool TraceUsingAnnotations() const { return true; }
74 
75   void ConsumeListOfAccessedTensors(
76       DeviceContext* device_context,
77       const TensorReferenceVector& tensor_refs) override;
78 
79   Status FillContextMap(const Graph* graph,
80                         DeviceContextMap* device_context_map) override;
81 
82   void Compute(OpKernel* op_kernel, OpKernelContext* context) override;
83 
84   Status Sync() override;
85 
86   void ComputeAsync(AsyncOpKernel* op_kernel, OpKernelContext* context,
87                     AsyncOpKernel::DoneCallback done) override;
88 
89   Status MakeTensorFromProto(const TensorProto& tensor_proto,
90                              const AllocatorAttributes alloc_attrs,
91                              Tensor* tensor) override;
92 
93   // The caller owns the returned device.
94   PerOpGpuDevice* MakeGpuDevice() override;
95 
96   Status ReinitializeGpuDevice(OpKernelContext* context, PerOpGpuDevice* device,
97                                DeviceContext* dc,
98                                Allocator* allocator) override;
99 
100   // Returns the platform GPU id of this device within the native driver system;
101   // e.g., for CUDA and ROCm this is the ordinal of the GPU within the system.
gpu_id()102   int gpu_id() const {
103     PlatformGpuId platform_gpu_id;
104     TF_CHECK_OK(GpuIdManager::TfToPlatformGpuId(tf_gpu_id_, &platform_gpu_id));
105     return platform_gpu_id.value();
106   }
107 
108   // The executor that provides control for the device; e.g., for CUDA this
109   // corresponds to the cuda context.
executor()110   se::StreamExecutor* executor() const { return executor_; }
111 
112   Allocator* GetScopedAllocator(AllocatorAttributes attr,
113                                 int64 step_id) override;
114 
GetScopedAllocatorMgr()115   ScopedAllocatorMgr* GetScopedAllocatorMgr() const override {
116     return scoped_allocator_mgr_.get();
117   }
118 
119   // The following two functions always return 0 unless one of the
120   // related experimental config options has been specified.
121 
122   // If returned value is > 0 then GPU Memory chunks freed before this count
123   // are guaranteed not to be in use by any kernel pending on this device.
124   uint64 SafeAllocFrontier() override;
125 
126   // Returns the number of kernels that have been queued for execution on
127   // the compute stream and are not yet known to have completed.
128   int PendingKernels();
129 
130  protected:
131   Allocator* gpu_allocator_;  // not owned
132   Allocator* cpu_allocator_;  // not owned
133 
134   se::StreamExecutor* executor_;  // not owned
135   std::unique_ptr<ScopedAllocatorMgr> scoped_allocator_mgr_;
136 
137  private:
138   struct StreamGroup {
139     se::Stream* compute = nullptr;
140     se::Stream* host_to_device = nullptr;
141     se::Stream* device_to_host = nullptr;
142     gtl::InlinedVector<se::Stream*, 4> device_to_device;
143   };
144   class StreamGroupFactory;
145 
146   gtl::InlinedVector<StreamGroup*, 4> streams_;
147   mutex scratch_init_mutex_;
148   gtl::InlinedVector<char*, 4> scratch_;
149   std::vector<GPUDeviceContext*> device_contexts_;
150   GpuDeviceInfo* gpu_device_info_ = nullptr;
151   mutex trace_mu_;
152   TfGpuId tf_gpu_id_;
153   const bool sync_every_op_ = false;
154   const int32 max_streams_;
155   std::unique_ptr<EventMgr> em_;
156   std::unique_ptr<thread::ThreadPool> thread_pool_;
157   std::unique_ptr<GPUKernelTracker> kernel_tracker_;
158   int pending_cap_ = 0;
159   bool timestamped_allocator_ = false;
160 
161   // Initialize scractch buffers used by Eigen.
162   Status InitScratchBuffers();
163 
164   void ReinitializeDevice(OpKernelContext* context, PerOpGpuDevice* device,
165                           int stream_id, Allocator* allocator);
166 
167   void ComputeHelper(OpKernel* op_kernel, OpKernelContext* context);
168 
169   string ComputeOpKernelDebugString(const OpKernel& op_kernel,
170                                     const int& stream_id);
171 
172   // This method returns an initialization status, in addition to
173   // calling the "done" StatusCallback, if there is a failure to
174   // allocate memory or if the tensor "from" is not DMA-copyable.
175   // If there is no error prior to enqueueing the copy, an OK status
176   // is returned.
177   Status MaybeCopyTensorToGPU(const AllocatorAttributes& alloc_attrs,
178                               const Tensor& from, Tensor* to,
179                               StatusCallback done);
180 };
181 
182 // A per-compute-stream utility that keeps track of kernels that have been
183 // queued for execution but may not yet have terminated, and also the queued
184 // time of the most recently terminated kernel.
185 class GPUKernelTracker {
186  public:
187   // If we're going to share a SharedCounter with an allocator, it's owned
188   // by the allocator because allocators are initialized once per process.
189   // Devices are per-session.
GPUKernelTracker(Env * env,SharedCounter * timing_counter)190   explicit GPUKernelTracker(Env* env, SharedCounter* timing_counter)
191       : env_(env), timing_counter_(timing_counter), pending_kernels_(64) {
192     if (!timing_counter_) {
193       // There's not a preexisting counter owned by GPUProcessState, i.e.
194       // pending_cap > 0 but timestamped_allocator == false.
195       owned_counter_.reset(new SharedCounter);
196       timing_counter_ = owned_counter_.get();
197     }
198   }
199 
200   // Record that a GPU kernel has just been enqueued on the compute stream.
201   // Inserts a new timing counter value in a new PendingKernel record appended
202   // to the end of the ring buffer then returns that same count.
203   uint64 RecordQueued();
204 
205   // Takes a count value returned by RecordQueued and finds the corresponding
206   // PendingKernel record in the ring buffer.  Marks the kernel as completed and
207   // advances the completion frontier accordingly.
208   void RecordTerminated(uint64 at_count);
209 
210   // Returns the largest timing count such that all kernels queued no
211   // later than that count are known to have terminated.
212   uint64 LastTerminatedCount();
213 
214   // Returns the number of kernels enqueued that are not yet known to
215   // have terminated.
NumPending()216   int NumPending() {
217     mutex_lock l(mu_);
218     return num_pending_;
219   }
220 
221   // Yield current thread until number of pending kernels no longer
222   // exceeds the cap.
PauseWhilePendingExceeds(int cap)223   void PauseWhilePendingExceeds(int cap) {
224     mutex_lock l(mu_);
225     while (num_pending_ > cap) {
226       pending_decreased_.wait(l);
227     }
228   }
229 
230  private:
231   Env* env_;
232   SharedCounter* timing_counter_;
233   std::unique_ptr<SharedCounter> owned_counter_;
234 
235   // Records when a kernel was queued for execution.  Kernel launches are
236   // identified by a unique count value from a per-GPU device timing counter.
237   struct PendingKernel {
238     uint64 queued_count;
239     bool terminated;
PendingKernelPendingKernel240     PendingKernel(const PendingKernel& pk)
241         : queued_count(pk.queued_count), terminated(pk.terminated) {}
PendingKernelPendingKernel242     PendingKernel() : queued_count(0), terminated(false) {}
243   };
244   mutex mu_;
245   // Ring buffer of PendingKernel records.
246   std::vector<PendingKernel> pending_kernels_ GUARDED_BY(mu_);
247   // Next unused slot in pending_kernels_.
248   int first_available_ GUARDED_BY(mu_) = 0;
249   // Last completed PendingKernel such that all prior PendingKernels are
250   // also completed.  With out-of-order completion there may be a mixture
251   // of completed and uncompleted entries between last_completed_ and
252   // first_available_, hence num_pending_ is not guaranteed equal to
253   // their differerence.
254   int last_completed_ GUARDED_BY(mu_) = -1;
255   int num_pending_ GUARDED_BY(mu_) = 0;
256   condition_variable pending_decreased_ GUARDED_BY(mu_);
257 };
258 
259 class BaseGPUDeviceFactory : public DeviceFactory {
260  public:
261   Status CreateDevices(const SessionOptions& options, const string& name_prefix,
262                        std::vector<std::unique_ptr<Device>>* devices) override;
263 
264   struct InterconnectMap {
265     // Name of interconnect technology, if known.
266     string name;
267     // If possible, strength should approximate Gb/sec bandwidth rate.
268     // Where architecture-specific subclassing is not done that won't
269     // always be possible.  The minimum expectation is that
270     // faster links should have a higher value than slower links.
271     int32 strength;
272     static const int kSameDeviceStrength;
273     static const int kStreamExecutorStrength;
274     std::set<std::pair<PlatformGpuId, PlatformGpuId>> directed_links;
275   };
276 
277  protected:
278   // Populates *maps with interconnect maps for all local direct access
279   // pathways between GPUs.
280   virtual Status GetInterconnectMaps(
281       const std::vector<PlatformGpuId>& visible_gpu_order,
282       se::Platform* gpu_manager, std::vector<InterconnectMap>* maps);
283 
284   struct TfGpuIdHash {
operatorTfGpuIdHash285     std::size_t operator()(const TfGpuId& id) const noexcept {
286       return std::hash<int>{}(id.value());
287     }
288   };
289   typedef std::unordered_map<TfGpuId, DeviceLocality, TfGpuIdHash> LocalityMap;
290   // Populates *localities with the DeviceLocality descriptor for
291   // every TfGpuId.
292   virtual Status GetDeviceLocalities(
293       int num_tf_gpus, const std::vector<InterconnectMap>& interconnects,
294       LocalityMap* localities);
295 
296  private:
297   // Creates a BaseGPUDevice associated with 'tf_gpu_id', allocates (strictly)
298   // 'memory_limit' bytes of GPU memory to it, and adds it to the 'devices'
299   // vector.
300   Status CreateGPUDevice(const SessionOptions& options,
301                          const string& name_prefix, TfGpuId tf_gpu_id,
302                          int64 memory_limit, const DeviceLocality& dev_locality,
303                          std::vector<std::unique_ptr<Device>>* devices);
304 
305   virtual std::unique_ptr<BaseGPUDevice> CreateGPUDevice(
306       const SessionOptions& options, const string& name, Bytes memory_limit,
307       const DeviceLocality& dev_locality, TfGpuId tf_gpu_id,
308       const string& physical_device_desc, Allocator* gpu_allocator,
309       Allocator* cpu_allocator) = 0;
310 
311   // Returns into 'ids' the list of valid platform GPU ids, in the order that
312   // they should map to TF GPU ids "/device:GPU:0", "/device:GPU:1", etc,
313   // based upon 'visible_gpu_order' which was generated by parsing
314   // GPUOptions::visible_device_list which is a comma-separated list of CUDA or
315   // ROCm GPU ids.
316   Status GetValidDeviceIds(const std::vector<PlatformGpuId>& visible_gpu_order,
317                            std::vector<PlatformGpuId>* ids);
318 
319   // visible_gpu_initialized_[platform_gpu_id] is true if visible GPU
320   // platform_gpu_id has been initialized by the process.
321   std::unordered_map<int, bool> visible_gpu_initialized_;
322 };
323 
324 }  // namespace tensorflow
325 
326 #endif  // TENSORFLOW_CORE_COMMON_RUNTIME_GPU_GPU_DEVICE_H_
327