1 /* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
2 
3 Licensed under the Apache License, Version 2.0 (the "License");
4 you may not use this file except in compliance with the License.
5 You may obtain a copy of the License at
6 
7     http://www.apache.org/licenses/LICENSE-2.0
8 
9 Unless required by applicable law or agreed to in writing, software
10 distributed under the License is distributed on an "AS IS" BASIS,
11 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 See the License for the specific language governing permissions and
13 limitations under the License.
14 ==============================================================================*/
15 
16 // TODO(opensource): Use a more generic sounding preprocessor name than
17 // GOOGLE_CUDA
18 #if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
19 
20 #if TENSORFLOW_USE_ROCM
21 #include "rocm/include/hip/hip_runtime.h"
22 #endif
23 
24 #define EIGEN_USE_GPU
25 
26 #include "tensorflow/core/common_runtime/gpu/gpu_device.h"
27 
28 #include <stdlib.h>
29 #include <string.h>
30 #include <algorithm>
31 #include <list>
32 #include <map>
33 #include <tuple>
34 #include <vector>
35 
36 #include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
37 #include "tensorflow/core/common_runtime/device_factory.h"
38 #include "tensorflow/core/common_runtime/gpu/gpu_event_mgr.h"
39 #include "tensorflow/core/common_runtime/gpu/gpu_id.h"
40 #include "tensorflow/core/common_runtime/gpu/gpu_id_manager.h"
41 #include "tensorflow/core/common_runtime/gpu/gpu_id_utils.h"
42 #include "tensorflow/core/common_runtime/gpu/gpu_init.h"
43 #include "tensorflow/core/common_runtime/gpu/gpu_process_state.h"
44 #include "tensorflow/core/common_runtime/gpu/gpu_stream_util.h"
45 #include "tensorflow/core/common_runtime/gpu/gpu_util.h"
46 #include "tensorflow/core/common_runtime/gpu_device_context.h"
47 #include "tensorflow/core/common_runtime/local_device.h"
48 #include "tensorflow/core/framework/allocator.h"
49 #include "tensorflow/core/framework/device_base.h"
50 #include "tensorflow/core/framework/op_kernel.h"
51 #include "tensorflow/core/framework/tensor.h"
52 #include "tensorflow/core/framework/tensor.pb.h"
53 #include "tensorflow/core/framework/types.h"
54 #include "tensorflow/core/framework/variant_op_registry.h"
55 #include "tensorflow/core/graph/types.h"
56 #include "tensorflow/core/lib/core/errors.h"
57 #include "tensorflow/core/lib/core/status.h"
58 #include "tensorflow/core/lib/gtl/stl_util.h"
59 #include "tensorflow/core/lib/strings/numbers.h"
60 #include "tensorflow/core/lib/strings/str_util.h"
61 #include "tensorflow/core/lib/strings/strcat.h"
62 #if GOOGLE_CUDA
63 #include "tensorflow/core/platform/cuda.h"
64 #elif TENSORFLOW_USE_ROCM
65 #include "tensorflow/core/platform/rocm.h"
66 #endif
67 #include "tensorflow/core/platform/logging.h"
68 #include "tensorflow/core/platform/macros.h"
69 #include "tensorflow/core/platform/stream_executor.h"
70 #include "tensorflow/core/platform/tracing.h"
71 #include "tensorflow/core/platform/types.h"
72 #include "tensorflow/core/public/session_options.h"
73 #include "tensorflow/core/util/device_name_utils.h"
74 #include "tensorflow/core/util/env_var.h"
75 #include "tensorflow/core/util/stream_executor_util.h"
76 
77 #if !defined(PLATFORM_GOOGLE)
78 #if GOOGLE_CUDA
79 #include "cuda/cuda_config.h"
80 #endif
81 #endif
82 
83 namespace tensorflow {
84 
85 #if GOOGLE_CUDA
86 
87 typedef cudaStream_t gpuStream_t;
88 typedef cudaDeviceProp gpuDeviceProp_t;
89 #define EIGEN_GPU_SCRATCH_SIZE (Eigen::kGpuScratchSize)
90 using se::cuda::ScopedActivateExecutorContext;
91 
92 #elif TENSORFLOW_USE_ROCM
93 
94 typedef hipStream_t gpuStream_t;
95 typedef hipDeviceProp_t gpuDeviceProp_t;
96 #define EIGEN_GPU_SCRATCH_SIZE (Eigen::kGpuScratchSize)
97 using se::rocm::ScopedActivateExecutorContext;
98 
99 #endif
100 
101 // Eigen Ops directly allocate memory only for temporary buffers used
102 // during OpKernel::Compute().  The recommended way of allocating such
103 // memory is via OpKernelContext::allocate_temp().  However, Eigen Ops
104 // don't have access to OpKernelContext, instead they get access to
105 // memory directly through the device allocator.  As an Open Source
106 // project, Eigen assumes allocator semantics similar to those of the
107 // CUDA or ROCm memory allocator, and may not work correctly due to race
108 // conditions if used with some other allocator.  For safety, we need
109 // to delay deallocation calls out of Eigen until all events on the
110 // corresponding stream have completed.  The following two classes
111 // serve this purpose in two different compilation environments.
112 
113 class EigenGpuStreamDevice : public ::Eigen::StreamInterface {
114  public:
EigenGpuStreamDevice()115   EigenGpuStreamDevice()
116       : scratch_(nullptr), semaphore_(nullptr), context_(nullptr) {
117     Eigen::initializeDeviceProp();
118   }
~EigenGpuStreamDevice()119   ~EigenGpuStreamDevice() override {}
Reinitialize(OpKernelContext * context,const gpuStream_t * gpu_stream,TfGpuId tf_gpu_id,::tensorflow::Allocator * alloc,char * scratch)120   void Reinitialize(OpKernelContext* context, const gpuStream_t* gpu_stream,
121                     TfGpuId tf_gpu_id, ::tensorflow::Allocator* alloc,
122                     char* scratch) {
123     if (LogMemory::IsEnabled()) {
124       operation_ = context->op_kernel().name() + "/EigenAllocator";
125       step_id_ = context->step_id();
126     }
127     context_ = context;
128     scratch_ = scratch;
129     semaphore_ =
130         reinterpret_cast<unsigned int*>(scratch + Eigen::kGpuScratchSize);
131     stream_ = gpu_stream;
132     allocator_ = alloc;
133     PlatformGpuId platform_gpu_id;
134     TF_CHECK_OK(GpuIdManager::TfToPlatformGpuId(tf_gpu_id, &platform_gpu_id));
135     device_prop_ = &Eigen::m_deviceProperties[platform_gpu_id.value()];
136   }
137 
stream() const138   const gpuStream_t& stream() const override { return *stream_; }
deviceProperties() const139   const gpuDeviceProp_t& deviceProperties() const override {
140     return *device_prop_;
141   }
142 
allocate(size_t num_bytes) const143   void* allocate(size_t num_bytes) const override {
144     void* ret = allocator_->AllocateRaw(32 /* alignment */, num_bytes);
145     if (ret == nullptr) {
146       if (context_) {
147         context_->SetStatus(errors::ResourceExhausted(
148             strings::StrCat("Ran out of GPU memory when allocating ", num_bytes,
149                             " bytes for ", operation_)));
150       } else {
151         LOG(FATAL)
152             << "EigenAllocator for GPU ran out of memory when allocating "
153             << num_bytes << ". See error logs for more detailed info.";
154       }
155     }
156     if (LogMemory::IsEnabled() && ret != nullptr) {
157       LogMemory::RecordRawAllocation(operation_, step_id_, num_bytes, ret,
158                                      allocator_);
159     }
160     return ret;
161   }
deallocate(void * buffer) const162   void deallocate(void* buffer) const override {
163     if (LogMemory::IsEnabled() && buffer != nullptr) {
164       LogMemory::RecordRawDeallocation(operation_, step_id_, buffer, allocator_,
165                                        true);
166     }
167     AsyncFreeData* afData =
168         new AsyncFreeData(allocator_, buffer, operation_, step_id_);
169 #if GOOGLE_CUDA
170     cudaError_t err = cudaStreamAddCallback(*stream_, asyncFree, afData, 0);
171     CHECK_EQ(err, cudaSuccess);
172 #elif TENSORFLOW_USE_ROCM
173     hipError_t err = hipStreamAddCallback(*stream_, asyncFree, afData, 0);
174     CHECK_EQ(err, hipSuccess);
175 #endif
176   }
177 
178   // Return a pointer to a per stream scratchpad of 1024 bytes residing
179   // in global memory.
scratchpad() const180   void* scratchpad() const override { return scratch_; }
181 
182   // Return a semaphore. The semaphore is initially initialized to 0, and
183   // each kernel using it is responsible for resetting to 0 upon completion
184   // to maintain the invariant that the semaphore is always equal to 0 upon
185   // each kernel start.
semaphore() const186   unsigned int* semaphore() const override { return semaphore_; }
187 
188  private:
189   struct AsyncFreeData {
AsyncFreeDatatensorflow::EigenGpuStreamDevice::AsyncFreeData190     AsyncFreeData(::tensorflow::Allocator* a, void* p, const string& o,
191                   const int64 s)
192         : allocator_(a), address_(p), operation_(o), step_id_(s) {}
193     ::tensorflow::Allocator* allocator_;
194     void* address_;
195     const string operation_;
196     const int64 step_id_;
197   };
198 
199 #if GOOGLE_CUDA
asyncFree(gpuStream_t stream,cudaError_t status,void * userData)200   static void CUDART_CB asyncFree(gpuStream_t stream, cudaError_t status,
201                                   void* userData) {
202 #elif TENSORFLOW_USE_ROCM
203   static void asyncFree(gpuStream_t stream, hipError_t status, void* userData) {
204 #endif
205     AsyncFreeData* data = static_cast<AsyncFreeData*>(userData);
206     if (LogMemory::IsEnabled()) {
207       LogMemory::RecordRawDeallocation(data->operation_, data->step_id_,
208                                        data->address_, data->allocator_, false);
209     }
210     data->allocator_->DeallocateRaw(data->address_);
211     delete data;
212   }
213 
214   string operation_;
215   int64 step_id_;
216   const gpuStream_t* stream_;           // Not owned.
217   const gpuDeviceProp_t* device_prop_;  // Not owned.
218   ::tensorflow::Allocator* allocator_;  // Not owned.
219   mutable char* scratch_;
220   mutable unsigned int* semaphore_;
221   OpKernelContext* context_;
222 
223   TF_DISALLOW_COPY_AND_ASSIGN(EigenGpuStreamDevice);
224 };
225 
226 // This factory helps to ensure that different GPU device objects that refer to
227 // the same physical device and stream group id use the same stream group
228 // object (and therefore the same CUDA streams). This is necessary since there
229 // is a single memory allocator per device (see ProcessState::GetGPUAllocator)
230 // and allocators must not be shared across streams.
231 class BaseGPUDevice::StreamGroupFactory {
232  public:
233   // Returns the unique stream group for use with the stream defined by
234   // {tf_gpu_id, stream_group_within_gpu}, creating it if it does not yet
235   // exist.
236   // This function is thread safe.
GetOrCreate(TfGpuId tf_gpu_id,int stream_group_within_gpu,se::StreamExecutor * executor,const GPUOptions & options)237   BaseGPUDevice::StreamGroup* GetOrCreate(TfGpuId tf_gpu_id,
238                                           int stream_group_within_gpu,
239                                           se::StreamExecutor* executor,
240                                           const GPUOptions& options) {
241     mutex_lock guard(lock_);
242     StreamGroup* group =
243         &streams_[key_type(tf_gpu_id.value(), stream_group_within_gpu)];
244     if (!group->compute) {
245       group->compute = new se::Stream(executor);
246       group->compute->Init();
247       VLOG(2) << "Created stream[" << stream_group_within_gpu
248               << "] = " << group->compute;
249 
250       group->host_to_device = new se::Stream(executor);
251       group->host_to_device->Init();
252       VLOG(2) << "Created host_to_device_stream[" << stream_group_within_gpu
253               << "] = " << group->host_to_device;
254 
255       group->device_to_host = new se::Stream(executor);
256       group->device_to_host->Init();
257       VLOG(2) << "Created device_to_host_stream[" << stream_group_within_gpu
258               << "] = " << group->device_to_host;
259 
260       int num_d2d_streams =
261           options.experimental().num_dev_to_dev_copy_streams();
262       if (num_d2d_streams == 0) num_d2d_streams = 1;
263       if (num_d2d_streams < 1 || num_d2d_streams > 4) {
264         LOG(ERROR)
265             << "Illegal GPUOptions.experimental.num_dev_to_dev_copy_streams="
266             << num_d2d_streams << " set to 1 instead.";
267         num_d2d_streams = 1;
268       }
269       for (int i = 0; i < num_d2d_streams; ++i) {
270         se::Stream* stream = new se::Stream(executor);
271         stream->Init();
272         group->device_to_device.push_back(stream);
273         VLOG(2) << "Created device_to_device_stream[" << stream_group_within_gpu
274                 << "] = " << group->device_to_device.back();
275       }
276     }
277     return group;
278   }
279 
280   // Returns a reference to the StreamGroupFactory singleton. Note that this is
281   // never destroyed, so the objects it owns are never deleted.
Global()282   static StreamGroupFactory& Global() {
283     static StreamGroupFactory* instance = new StreamGroupFactory();
284     return *instance;
285   }
286 
287  private:
288   mutex lock_;
289   using key_type = std::tuple<int, int>;
290   std::map<key_type, StreamGroup> streams_;
291 
292   // StreamGroupFactory cannot be created directly; Call
293   // StreamGroupFactory::Global() to get the global instance.
294   StreamGroupFactory() = default;
295   TF_DISALLOW_COPY_AND_ASSIGN(StreamGroupFactory);
296 };
297 
BaseGPUDevice(const SessionOptions & options,const string & name,Bytes memory_limit,const DeviceLocality & locality,TfGpuId tf_gpu_id,const string & physical_device_desc,Allocator * gpu_allocator,Allocator * cpu_allocator,bool sync_every_op,int32 max_streams)298 BaseGPUDevice::BaseGPUDevice(const SessionOptions& options, const string& name,
299                              Bytes memory_limit, const DeviceLocality& locality,
300                              TfGpuId tf_gpu_id,
301                              const string& physical_device_desc,
302                              Allocator* gpu_allocator, Allocator* cpu_allocator,
303                              bool sync_every_op, int32 max_streams)
304     : LocalDevice(options, Device::BuildDeviceAttributes(name, DEVICE_GPU,
305                                                          memory_limit, locality,
306                                                          physical_device_desc)),
307       gpu_allocator_(gpu_allocator),
308       cpu_allocator_(cpu_allocator),
309       scoped_allocator_mgr_(new ScopedAllocatorMgr(name)),
310       tf_gpu_id_(tf_gpu_id),
311       sync_every_op_(sync_every_op),
312       max_streams_(max_streams) {
313   GPUProcessState::singleton()->EnableGPUDevice();
314   pending_cap_ = options.config.gpu_options().experimental().pending_cap();
315   timestamped_allocator_ =
316       options.config.gpu_options().experimental().timestamped_allocator();
317   if (timestamped_allocator_ || pending_cap_ > 0) {
318     SharedCounter* timing_counter = nullptr;
319     if (timestamped_allocator_) {
320       // In this case the SharedCounter was already created and set in the
321       // associated Allocator, with ownership by GPUProcessState.
322       // The GPUKernelTracker will use this SharedCounter, instead of
323       // owning its own.
324       timing_counter =
325           GPUProcessState::singleton()->GPUAllocatorCounter(tf_gpu_id);
326       DCHECK(timing_counter);
327     } else {
328       DCHECK_GT(pending_cap_, 0);
329     }
330     kernel_tracker_.reset(new GPUKernelTracker(Env::Default(), timing_counter));
331   }
332 }
333 
~BaseGPUDevice()334 BaseGPUDevice::~BaseGPUDevice() {
335   delete gpu_device_info_;
336   for (auto sb : scratch_) gpu_allocator_->DeallocateRaw(sb);
337   for (auto ctx : device_contexts_) ctx->Unref();
338 }
339 
340 // This should be idempotent if already initialized.
InitScratchBuffers()341 Status BaseGPUDevice::InitScratchBuffers() {
342   mutex_lock l(scratch_init_mutex_);
343   if (scratch_.size() < max_streams_) {
344     for (int i = 0; i < max_streams_; i++) {
345       DCHECK(streams_[i]);
346       if (scratch_.size() > i && scratch_[i]) continue;
347       size_t scratch_buffer_size =
348           Eigen::kGpuScratchSize + sizeof(unsigned int);
349       void* scratch_buffer = gpu_allocator_->AllocateRaw(
350           Allocator::kAllocatorAlignment, scratch_buffer_size);
351       if (scratch_buffer == nullptr) {
352         return errors::FailedPrecondition(
353             "Failed to allocate scratch buffer for device ",
354             tf_gpu_id_.value());
355       }
356       se::DeviceMemory<char> mem(
357           se::DeviceMemoryBase(scratch_buffer, scratch_buffer_size));
358 
359       bool ok = executor_->SynchronousMemZero(
360           &mem, Eigen::kGpuScratchSize + sizeof(unsigned int));
361       if (!ok) {
362         return errors::FailedPrecondition(
363             "Failed to memcopy into scratch buffer for device ",
364             tf_gpu_id_.value());
365       }
366       scratch_.push_back(static_cast<char*>(scratch_buffer));
367     }
368   }
369   return Status::OK();
370 }
371 
Init(const SessionOptions & options)372 Status BaseGPUDevice::Init(const SessionOptions& options) {
373   auto executor_status = GpuIdUtil::ExecutorForTfGpuId(tf_gpu_id_);
374   if (!executor_status.status().ok()) {
375     return errors::Internal("Failed to get StreamExecutor for device ",
376                             tf_gpu_id_.value());
377   }
378 
379   executor_ = executor_status.ValueOrDie();
380   em_.reset(new EventMgr(executor_, options.config.gpu_options()));
381 
382   if (max_streams_ < 1) {
383     return errors::InvalidArgument("Invalid value for max_streams.");
384   }
385 
386   // Create the specified number of GPU streams
387   for (int i = 0; i < max_streams_; i++) {
388     streams_.push_back(StreamGroupFactory::Global().GetOrCreate(
389         tf_gpu_id_, i, executor_, options.config.gpu_options()));
390     device_contexts_.push_back(new GPUDeviceContext(
391         i, streams_.back()->compute, streams_.back()->host_to_device,
392         streams_.back()->device_to_host, streams_.back()->device_to_device));
393   }
394   gpu_device_info_ = new GpuDeviceInfo;
395   gpu_device_info_->stream = streams_[0]->compute;
396   gpu_device_info_->default_context = device_contexts_[0];
397   gpu_device_info_->event_mgr = em_.get();
398   PlatformGpuId platform_gpu_id;
399   TF_RETURN_IF_ERROR(
400       GpuIdManager::TfToPlatformGpuId(tf_gpu_id_, &platform_gpu_id));
401   gpu_device_info_->gpu_id = platform_gpu_id.value();
402   set_tensorflow_gpu_device_info(gpu_device_info_);
403 
404   // Whether and how the GPU device uses its own threadpool.
405   // This option is experimental. Once we confirm the best setting, we
406   // may change the default behavior and completely remove this flag.
407   // Default values might change in future releases.
408   // Possible values:
409   //   * global: GPU uses threads shared with CPU in the main compute
410   //          thread-pool. This is currently the default.
411   //   * gpu_private: GPU uses threads dedicated to this device.
412   //   * gpu_shared: All GPUs share a dedicated thread pool.
413   string gpu_thread_mode;
414   TF_RETURN_IF_ERROR(
415       ReadStringFromEnvVar("TF_GPU_THREAD_MODE", "global", &gpu_thread_mode));
416   gpu_thread_mode = str_util::Lowercase(gpu_thread_mode);
417   if (gpu_thread_mode != "global") {
418     int64 gpu_thread_count = -1;
419     // Default to two threads. One for device compute and another for memory
420     // copies.
421     TF_RETURN_IF_ERROR(
422         ReadInt64FromEnvVar("TF_GPU_THREAD_COUNT", 2, &gpu_thread_count));
423     if (gpu_thread_mode == "gpu_private") {
424       // TODO(zhengxq): since these threads only serve a single GPU device,
425       //   we should set the device context once for each thread, and avoid
426       //   setting them for each kernel.
427       // TODO(zhengxq): pin the thread to the same socket of the target GPU.
428       thread_pool_.reset(new thread::ThreadPool(
429           options.env, strings::StrCat("gpu_private_", tf_gpu_id_.value()),
430           static_cast<int32>(gpu_thread_count)));
431       set_tensorflow_device_thread_pool(thread_pool_.get());
432     } else if (gpu_thread_mode == "gpu_shared") {
433       static thread::ThreadPool* thread_pool = new thread::ThreadPool(
434           options.env, "gpu_shared", static_cast<int32>(gpu_thread_count));
435       set_tensorflow_device_thread_pool(thread_pool);
436     } else {
437       string error_message =
438           strings::StrCat("Invalid gpu_thread_mode: ", gpu_thread_mode);
439       LOG(WARNING) << error_message;
440       return errors::InvalidArgument(error_message);
441     }
442   }
443 
444   return Status::OK();
445 }
446 
RequiresRecordingAccessedTensors() const447 bool BaseGPUDevice::RequiresRecordingAccessedTensors() const {
448   // When there is no more than one stream, we release the tensor reference
449   // at the end of the kernel launch, instead of at the end of the kernel
450   // execution.
451   return streams_.size() > 1;
452 }
453 
FillContextMap(const Graph * graph,DeviceContextMap * device_context_map)454 Status BaseGPUDevice::FillContextMap(const Graph* graph,
455                                      DeviceContextMap* device_context_map) {
456   VLOG(2) << "FillContextMap";
457 
458   const size_t num_streams = streams_.size();
459   // Special case for single stream.
460   if (num_streams == 1) {
461     return Status::OK();
462   }
463   const int64 before = Env::Default()->NowMicros();
464   gpu_stream_util::AssignStreamsOpts opts;
465   opts.max_streams = static_cast<int32>(num_streams);
466   std::unordered_map<int, int> node_to_stream_id;
467   TF_RETURN_IF_ERROR(
468       gpu_stream_util::AssignStreams(graph, opts, &node_to_stream_id));
469   int64 elapsed = Env::Default()->NowMicros() - before;
470   VLOG(3) << "AssignStreams took " << elapsed << "us";
471 
472   // Fill in the context map.  It is OK for this map to contain
473   // duplicate DeviceContexts so long as we increment the refcount.
474   device_context_map->resize(graph->num_node_ids());
475   for (Node* n : graph->nodes()) {
476     auto mapped_stream = node_to_stream_id[n->id()];
477     CHECK_LE(mapped_stream, num_streams);
478     auto ctx = device_contexts_[mapped_stream];
479     VLOG(3) << "Assigned stream " << node_to_stream_id[n->id()]
480             << " ==> stream[" << ctx->stream_id() << "] for node id " << n->id()
481             << " " << n->type_string() << " " << n->name();
482     ctx->Ref();
483     (*device_context_map)[n->id()] = ctx;
484   }
485 
486   return Status::OK();
487 }
488 
Compute(OpKernel * op_kernel,OpKernelContext * context)489 void BaseGPUDevice::Compute(OpKernel* op_kernel, OpKernelContext* context) {
490   // NOTE(tucker): We need to discriminate between Eigen GPU
491   // operations and all others.  If an operation is Eigen
492   // implemented (or otherwise tries to launch a GPU kernel
493   // directly), we need to establish a stacked-scoped environment
494   // that directs it to execute on the proper device.  Otherwise we
495   // expect the Op to use StreamExecutor directly and correctly.  The
496   // way we make this discrimination is quite hacky: At the moment
497   // the only non-Eigen GPU Op is the recv-op, which is known to be
498   // asynchronous.
499   if (op_kernel->is_internal() && op_kernel->type_string() == "_Recv") {
500     context->SetStatus(errors::Internal(
501         "Invalid synchronous 'Compute' on GPU for '_Recv' op"));
502   } else {
503     ComputeHelper(op_kernel, context);
504   }
505 }
506 
ComputeOpKernelDebugString(const OpKernel & op_kernel,const int & stream_id)507 string BaseGPUDevice::ComputeOpKernelDebugString(const OpKernel& op_kernel,
508                                                  const int& stream_id) {
509   return strings::StrCat(op_kernel.name(), " op ", op_kernel.type_string(),
510                          " on GPU ", tf_gpu_id_.value(), " stream[", stream_id,
511                          "]");
512 }
513 
ComputeHelper(OpKernel * op_kernel,OpKernelContext * context)514 void BaseGPUDevice::ComputeHelper(OpKernel* op_kernel,
515                                   OpKernelContext* context) {
516   GPUDeviceContext* gpu_device_context = device_contexts_[0];
517   if (context->op_device_context() != nullptr) {
518     gpu_device_context =
519         static_cast<GPUDeviceContext*>(context->op_device_context());
520   }
521   se::Stream* stream = gpu_device_context->stream();
522   const auto stream_id = gpu_device_context->stream_id();
523 
524   const bool vlog_1 = VLOG_IS_ON(1);
525   const bool vlog_2 = vlog_1 && VLOG_IS_ON(2);
526 
527   if (vlog_1) {
528     VLOG(1) << "GpuDevice::ComputeHelper "
529             << ComputeOpKernelDebugString(*op_kernel, stream_id);
530   }
531 
532   const auto num_streams = streams_.size();
533   if (num_streams > 1) {
534     // If this op's device context is different from the other contexts,
535     // we must wait on the stream.
536     for (int i = 0; i < context->num_inputs(); ++i) {
537       const GPUDeviceContext* idc =
538           static_cast<GPUDeviceContext*>(context->input_device_context(i));
539       OP_REQUIRES(context, idc != nullptr,
540                   errors::Internal("Input device context ", i,
541                                    " was not set properly."));
542       if (vlog_2) {
543         const void* base;
544         size_t len;
545         if (context->has_input(i)) {
546           if (IsRefType(context->input_dtype(i))) {
547             Tensor tensor = context->mutable_input(i, false);
548             base = DMAHelper::base(&tensor);
549             len = tensor.TotalBytes();
550           } else {
551             const Tensor& tensor = context->input(i);
552             base = DMAHelper::base(&tensor);
553             len = tensor.TotalBytes();
554           }
555           LOG(INFO) << "Input " << i << " " << base << "  " << len;
556           LOG(INFO) << "  stream[" << stream_id << "].ThenWaitFor(stream["
557                     << idc->stream_id() << "])"
558                     << ((idc->stream() == stream) ? " not needed" : "");
559         }
560       }
561       if (idc->stream() != stream) stream->ThenWaitFor(idc->stream());
562     }
563   }
564   if (pending_cap_ > 0) {
565     DCHECK(kernel_tracker_);
566     kernel_tracker_->PauseWhilePendingExceeds(pending_cap_);
567   }
568   ScopedActivateExecutorContext scoped_activation{stream->parent()};
569   op_kernel->Compute(context);
570   if (context->status().ok()) {
571     if (sync_every_op_) {
572       // Note: GPUUtil::Sync() only syncs the default stream.
573       // We need to either sync the stream used by this op, or
574       // all streams.  Given that this flag is typically used for
575       // debugging it makes more sense to sync all GPU activity.
576       context->SetStatus(GPUUtil::SyncAll(this));
577       if (vlog_1) {
578         VLOG(1) << "GpuDevice::ComputeHelper finished "
579                 << ComputeOpKernelDebugString(*op_kernel, stream_id);
580       }
581     } else if (vlog_1) {
582       VLOG(1) << "GpuDevice::ComputeHelper scheduled "
583               << ComputeOpKernelDebugString(*op_kernel, stream_id);
584     }
585     if (kernel_tracker_) {
586       GPUKernelTracker* tracker = kernel_tracker_.get();
587       DCHECK(tracker);
588       uint64 queued_count = tracker->RecordQueued();
589       em_->ThenExecute(stream, [op_kernel, tracker, queued_count]() {
590         tracker->RecordTerminated(queued_count);
591       });
592     }
593   } else {
594     if (vlog_1) {
595       VLOG(1) << "GpuDevice::ComputeHelper failed to schedule "
596               << ComputeOpKernelDebugString(*op_kernel, stream_id);
597     }
598   }
599 }
600 
ConsumeListOfAccessedTensors(DeviceContext * device_context,const TensorReferenceVector & tensor_refs)601 void BaseGPUDevice::ConsumeListOfAccessedTensors(
602     DeviceContext* device_context, const TensorReferenceVector& tensor_refs) {
603   GPUDeviceContext* gpu_device_context = device_contexts_[0];
604   if (device_context != nullptr) {
605     gpu_device_context = static_cast<GPUDeviceContext*>(device_context);
606   }
607   se::Stream* stream = gpu_device_context->stream();
608   em_->ThenDeleteTensors(stream, tensor_refs);
609 }
610 
611 // Based on the semantics of Device::Sync this call should wait for
612 // all streams not just the current one.
Sync()613 Status BaseGPUDevice::Sync() { return GPUUtil::SyncAll(this); }
614 
ComputeAsync(AsyncOpKernel * op_kernel,OpKernelContext * context,AsyncOpKernel::DoneCallback done)615 void BaseGPUDevice::ComputeAsync(AsyncOpKernel* op_kernel,
616                                  OpKernelContext* context,
617                                  AsyncOpKernel::DoneCallback done) {
618   GPUDeviceContext* gpu_device_context = device_contexts_[0];
619   if (context->op_device_context() != nullptr) {
620     gpu_device_context =
621         static_cast<GPUDeviceContext*>(context->op_device_context());
622   }
623   se::Stream* stream = gpu_device_context->stream();
624   const auto stream_id = gpu_device_context->stream_id();
625 
626   VLOG(1) << "GpuDevice::ComputeAsync " << op_kernel->name() << " op "
627           << op_kernel->type_string() << " on GPU" << tf_gpu_id_ << " stream["
628           << stream_id << "]";
629 
630   // When Xprof profiling is off (which is the default), constructing the
631   // activity is simple enough that its overhead is negligible.
632   tracing::ScopedActivity activity(op_kernel->name(), op_kernel->type_string(),
633                                    op_kernel->IsExpensive());
634   ScopedActivateExecutorContext scoped_activation{stream->parent()};
635   op_kernel->ComputeAsync(context, done);
636 }
637 
MaybeCopyTensorToGPU(const AllocatorAttributes & alloc_attrs,const Tensor & from,Tensor * to,StatusCallback done)638 Status BaseGPUDevice::MaybeCopyTensorToGPU(
639     const AllocatorAttributes& alloc_attrs, const Tensor& from, Tensor* to,
640     StatusCallback done) {
641   if (alloc_attrs.on_host()) {
642     *to = from;
643     done(Status::OK());
644     return Status::OK();
645   } else {
646     if (!DMAHelper::CanUseDMA(&from)) {
647       Status err = errors::Internal("GPU copy from non-DMA ",
648                                     DataTypeString(from.dtype()), " tensor");
649       done(err);
650       return err;
651     }
652     auto* copy =
653         new Tensor(GetAllocator(alloc_attrs), from.dtype(), from.shape());
654 
655     // If the tensor is not initialized, we likely ran out of memory.
656     if (!copy->IsInitialized()) {
657       delete copy;
658       Status err = errors::ResourceExhausted(
659           "OOM when allocating tensor of shape ", from.shape().DebugString(),
660           " and type ", DataTypeString(from.dtype()));
661       done(err);
662       return err;
663     }
664 
665     StatusCallback wrapped_done = std::bind(
666         [to, copy](StatusCallback done_,
667                    // Begin unbound arguments.
668                    const Status& s) {
669           if (s.ok()) {
670             *to = std::move(*copy);
671           }
672           delete copy;
673           done_(s);
674         },
675         std::move(done), std::placeholders::_1);
676 
677     tracing::ScopedAnnotation annotation("MakeTensorFromProto");
678     device_contexts_[0]->CopyCPUTensorToDevice(&from, this, copy,
679                                                std::move(wrapped_done));
680     return Status::OK();
681   }
682 }
683 
MakeTensorFromProto(const TensorProto & tensor_proto,const AllocatorAttributes alloc_attrs,Tensor * tensor)684 Status BaseGPUDevice::MakeTensorFromProto(const TensorProto& tensor_proto,
685                                           const AllocatorAttributes alloc_attrs,
686                                           Tensor* tensor) {
687   AllocatorAttributes attr;
688   attr.set_on_host(true);
689   attr.set_gpu_compatible(true);
690   Allocator* host_alloc = GetAllocator(attr);
691   Tensor parsed(tensor_proto.dtype());
692   if (!parsed.FromProto(host_alloc, tensor_proto)) {
693     return errors::InvalidArgument("Cannot parse tensor from proto: ",
694                                    tensor_proto.DebugString());
695   }
696 
697   if (parsed.dtype() == DT_VARIANT) {
698     const Variant* from = parsed.flat<Variant>().data();
699     int numa_node = attributes().locality().numa_node();
700     Tensor copy(cpu_allocator(numa_node), DT_VARIANT, parsed.shape());
701     Variant* copy_variant = copy.flat<Variant>().data();
702 
703     std::list<Notification> notifications;
704     Status copy_status;
705     auto copier = [this, &alloc_attrs, &notifications, &copy_status](
706                       const Tensor& from, Tensor* to) {
707       // Copier isn't run in a multithreaded environment, so we don't
708       // have to worry about the notifications list being modified in parallel.
709       notifications.emplace_back();
710       Notification& n = *notifications.rbegin();
711       return MaybeCopyTensorToGPU(alloc_attrs, from, to,
712                                   [&n, &copy_status](const Status& s) {
713                                     if (copy_status.ok()) {
714                                       copy_status.Update(s);
715                                     }
716                                     n.Notify();
717                                   });
718     };
719     Status s;
720     for (int64 ix = 0; ix < parsed.NumElements(); ++ix) {
721       s = VariantDeviceCopy(VariantDeviceCopyDirection::HOST_TO_DEVICE,
722                             from[ix], &copy_variant[ix], copier);
723       if (!s.ok()) {
724         break;
725       }
726     }
727     for (auto& n : notifications) {
728       n.WaitForNotification();
729     }
730     if (!s.ok()) {
731       return s;
732     }
733     *tensor = std::move(copy);
734     return copy_status;
735   } else {
736     Notification n;
737     Status status;
738     TF_RETURN_IF_ERROR(MaybeCopyTensorToGPU(alloc_attrs, parsed, tensor,
739                                             [&n, &status](const Status& s) {
740                                               status = s;
741                                               n.Notify();
742                                             }));
743     n.WaitForNotification();
744     return status;
745   }
746 }
747 
748 namespace {
749 class ConcretePerOpGpuDevice : public PerOpGpuDevice {
750  public:
ConcretePerOpGpuDevice()751   ConcretePerOpGpuDevice() : device_(&stream_device_) {}
752 
Reinitialize(OpKernelContext * context,const gpuStream_t * gpu_stream,TfGpuId tf_gpu_id,Allocator * base_allocator,char * scratch)753   void Reinitialize(OpKernelContext* context, const gpuStream_t* gpu_stream,
754                     TfGpuId tf_gpu_id, Allocator* base_allocator,
755                     char* scratch) {
756     stream_device_.Reinitialize(context, gpu_stream, tf_gpu_id, base_allocator,
757                                 scratch);
758   }
759 
device() const760   const Eigen::GpuDevice& device() const override { return device_; }
761 
762  private:
763   EigenGpuStreamDevice stream_device_;
764   Eigen::GpuDevice device_;
765 };
766 
767 // Parse 'visible_device_list' into a list of platform GPU ids.
ParseVisibleDeviceList(const string & visible_device_list,std::vector<PlatformGpuId> * visible_gpu_order)768 Status ParseVisibleDeviceList(const string& visible_device_list,
769                               std::vector<PlatformGpuId>* visible_gpu_order) {
770   visible_gpu_order->clear();
771   se::Platform* gpu_manager = GPUMachineManager();
772 
773   // If the user wants to remap the visible to virtual GPU mapping,
774   // check for that here.
775   if (visible_device_list.empty()) {
776     visible_gpu_order->resize(gpu_manager->VisibleDeviceCount());
777     // By default, visible to virtual mapping is unchanged.
778     int deviceNo = 0;
779     std::generate(visible_gpu_order->begin(), visible_gpu_order->end(),
780                   [&deviceNo] { return deviceNo++; });
781   } else {
782     const std::vector<string> order_str =
783         str_util::Split(visible_device_list, ',');
784     for (const string& platform_gpu_id_str : order_str) {
785       int32 platform_gpu_id;
786       if (!strings::safe_strto32(platform_gpu_id_str, &platform_gpu_id)) {
787         return errors::InvalidArgument(
788             "Could not parse entry in 'visible_device_list': '",
789             platform_gpu_id_str,
790             "'. visible_device_list = ", visible_device_list);
791       }
792       if (platform_gpu_id < 0 ||
793           platform_gpu_id >= gpu_manager->VisibleDeviceCount()) {
794         return errors::InvalidArgument(
795             "'visible_device_list' listed an invalid GPU id '", platform_gpu_id,
796             "' but visible device count is ",
797             gpu_manager->VisibleDeviceCount());
798       }
799       visible_gpu_order->push_back(PlatformGpuId(platform_gpu_id));
800     }
801   }
802 
803   // Validate no repeats.
804   std::set<PlatformGpuId> visible_device_set(visible_gpu_order->begin(),
805                                              visible_gpu_order->end());
806   if (visible_device_set.size() != visible_gpu_order->size()) {
807     return errors::InvalidArgument(
808         "visible_device_list contained a duplicate entry: ",
809         visible_device_list);
810   }
811   return Status::OK();
812 }
813 
VerifyVirtualDeviceSettings(const size_t num_gpus_to_use,const GPUOptions & gpu_options,const std::vector<PlatformGpuId> & visible_gpu_order,const std::vector<PlatformGpuId> & valid_platform_gpu_ids)814 Status VerifyVirtualDeviceSettings(
815     const size_t num_gpus_to_use, const GPUOptions& gpu_options,
816     const std::vector<PlatformGpuId>& visible_gpu_order,
817     const std::vector<PlatformGpuId>& valid_platform_gpu_ids) {
818   const auto& virtual_devices = gpu_options.experimental().virtual_devices();
819   CHECK(!virtual_devices.empty());
820   if (gpu_options.per_process_gpu_memory_fraction() > 0) {
821     return errors::InvalidArgument(
822         "It's invalid to set per_process_gpu_memory_fraction when "
823         "virtual_devices is set.");
824   }
825   if (num_gpus_to_use < virtual_devices.size()) {
826     return errors::Unknown(
827         "Not enough GPUs to create virtual devices."
828         " num_gpus_to_use: ",
829         num_gpus_to_use, " #virtual_devices: ", virtual_devices.size());
830   }
831   if (!gpu_options.visible_device_list().empty() &&
832       visible_gpu_order.size() != virtual_devices.size()) {
833     return errors::InvalidArgument(
834         "The number of GPUs in visible_device_list doesn't match the number "
835         "of elements in the virtual_devices list.",
836         " #GPUs in visible_device_list: ", visible_gpu_order.size(),
837         " virtual_devices.size(): ", virtual_devices.size());
838   }
839   if (valid_platform_gpu_ids.size() != virtual_devices.size()) {
840     return errors::Unknown(
841         "The number of valid GPUs doesn't match the number of elements in "
842         "the virtual_devices list.",
843         " #valid GPUs: ", valid_platform_gpu_ids.size(),
844         " virtual_devices.size(): ", virtual_devices.size());
845   }
846   return Status::OK();
847 }
848 
MinSystemMemory(int64 available_memory)849 int64 MinSystemMemory(int64 available_memory) {
850   // We use the following heuristic for now:
851   //
852   // If the available_memory is < 2GiB, we allocate 225MiB to system memory.
853   // Otherwise, allocate max(300MiB, 0.05 * available_memory) to system memory.
854   //
855   // In the future we could be more sophisticated by using a table of devices.
856   int64 min_system_memory;
857   if (available_memory < (1LL << 31)) {
858     // 225MiB
859     min_system_memory = 225 * 1024 * 1024;
860   } else {
861     // max(300 MiB, 0.05 * available_memory)
862     min_system_memory =
863         std::max(int64{314572800}, static_cast<int64>(available_memory * 0.05));
864   }
865 #if defined(__GNUC__) && defined(__OPTIMIZE__)
866 // Do nothing
867 #elif !defined(__GNUC__) && defined(NDEBUG)
868 // Do nothing
869 #else
870   // Double the amount of available GPU memory in non-opt builds (debug
871   // builds in windows); because in non-opt builds more system memory
872   // is necessary.
873   min_system_memory *= 2;
874 #endif
875 
876 #if defined(ANDROID_TEGRA)
877   // 1GB system mem for NVIDIA Tegra devices since they use the same mem for RAM
878   // and Video RAM
879   min_system_memory = 1 << 30;
880 #endif
881   return min_system_memory;
882 }
883 
884 // Get the memory limit for the virtual device being created on GPU with
885 // 'platform_gpu_id', when that virtual device is the only virtual device being
886 // created on that GPU.
SingleVirtualDeviceMemoryLimit(const GPUOptions & gpu_options,PlatformGpuId platform_gpu_id,int64 * memory_limit)887 Status SingleVirtualDeviceMemoryLimit(const GPUOptions& gpu_options,
888                                       PlatformGpuId platform_gpu_id,
889                                       int64* memory_limit) {
890   int64 total_memory = 0;
891   int64 available_memory = 0;
892   se::StreamExecutor* se =
893       GpuIdUtil::ExecutorForPlatformGpuId(platform_gpu_id).ValueOrDie();
894   if (!se->DeviceMemoryUsage(&available_memory, &total_memory)) {
895     return errors::Unknown("Failed to query available memory for GPU ",
896                            platform_gpu_id.value());
897   }
898 
899   int64 allocated_memory = 0;
900   const double per_process_gpu_memory_fraction =
901       gpu_options.per_process_gpu_memory_fraction();
902   if (per_process_gpu_memory_fraction > 1.0 ||
903       gpu_options.experimental().use_unified_memory()) {
904     int cc_major = 0, cc_minor = 0;
905     if (!se->GetDeviceDescription().cuda_compute_capability(&cc_major,
906                                                             &cc_minor)) {
907       return errors::Internal("Failed to get compute capability for device.");
908     }
909     if (cc_major < 6) {
910       return errors::Internal(
911           "Unified memory on GPUs with compute capability lower than 6.0 "
912           "(pre-Pascal class GPUs) does not support oversubscription.");
913     }
914   }
915 
916   if (per_process_gpu_memory_fraction == 0) {
917     allocated_memory = available_memory;
918     const int64 min_system_memory = MinSystemMemory(available_memory);
919     if (min_system_memory < allocated_memory) {
920       allocated_memory -= min_system_memory;
921     }
922   } else {
923     allocated_memory = total_memory * per_process_gpu_memory_fraction;
924   }
925   *memory_limit = allocated_memory;
926   return Status::OK();
927 }
928 }  // namespace
929 
ReinitializeDevice(OpKernelContext * context,PerOpGpuDevice * device,int stream_id,Allocator * allocator)930 void BaseGPUDevice::ReinitializeDevice(OpKernelContext* context,
931                                        PerOpGpuDevice* device, int stream_id,
932                                        Allocator* allocator) {
933   ConcretePerOpGpuDevice* concrete_device =
934       static_cast<ConcretePerOpGpuDevice*>(device);
935   DCHECK(concrete_device);
936   const gpuStream_t* gpu_stream = reinterpret_cast<const gpuStream_t*>(
937       streams_[stream_id]->compute->implementation()->GpuStreamMemberHack());
938   concrete_device->Reinitialize(context, gpu_stream, tf_gpu_id_, allocator,
939                                 scratch_[stream_id]);
940 }
941 
MakeGpuDevice()942 PerOpGpuDevice* BaseGPUDevice::MakeGpuDevice() {
943   return new ConcretePerOpGpuDevice();
944 }
945 
ReinitializeGpuDevice(OpKernelContext * context,PerOpGpuDevice * device,DeviceContext * dc,Allocator * allocator)946 Status BaseGPUDevice::ReinitializeGpuDevice(OpKernelContext* context,
947                                             PerOpGpuDevice* device,
948                                             DeviceContext* dc,
949                                             Allocator* allocator) {
950   TF_RETURN_IF_ERROR(InitScratchBuffers());
951   if (dc) {
952     const GPUDeviceContext* gpu_dc = static_cast<GPUDeviceContext*>(dc);
953     const int stream_id = gpu_dc->stream_id();
954     VLOG(1) << "  eigen_gpu_device(" << dc << ") => stream[" << stream_id
955             << "]";
956     CHECK_LT(stream_id, streams_.size());
957     ReinitializeDevice(context, device, stream_id, allocator);
958   } else {
959     ReinitializeDevice(context, device, 0, allocator);
960   }
961   return Status::OK();
962 }
963 
GetScopedAllocator(AllocatorAttributes attr,int64 step_id)964 Allocator* BaseGPUDevice::GetScopedAllocator(AllocatorAttributes attr,
965                                              int64 step_id) {
966   if (attr.scope_id > 0) {
967     return scoped_allocator_mgr_->GetContainer(step_id)->GetInstance(
968         attr.scope_id);
969   }
970   LOG(FATAL) << "Unexpected call to BaseGPUDevice::GetScopedAllocator "
971              << "attr.scope_id = " << attr.scope_id;
972   return gpu_allocator_;
973 }
974 
975 const int BaseGPUDeviceFactory::InterconnectMap::kSameDeviceStrength = 1000;
976 const int BaseGPUDeviceFactory::InterconnectMap::kStreamExecutorStrength = 1;
977 
CreateDevices(const SessionOptions & options,const string & name_prefix,std::vector<std::unique_ptr<Device>> * devices)978 Status BaseGPUDeviceFactory::CreateDevices(
979     const SessionOptions& options, const string& name_prefix,
980     std::vector<std::unique_ptr<Device>>* devices) {
981   TF_RETURN_IF_ERROR(ValidateGPUMachineManager());
982   se::Platform* gpu_manager = GPUMachineManager();
983   if (gpu_manager == nullptr) {
984     return Status::OK();
985   }
986   // If there are no GPUs visible, do nothing.
987   if (gpu_manager->VisibleDeviceCount() <= 0) {
988     return Status::OK();
989   }
990 
991   size_t num_gpus_to_use = INT_MAX;
992   auto iter = options.config.device_count().find("GPU");
993   if (iter != options.config.device_count().end()) {
994     num_gpus_to_use = iter->second;
995   }
996   const auto& gpu_options = options.config.gpu_options();
997   std::vector<PlatformGpuId> visible_gpu_order;
998   std::vector<PlatformGpuId> valid_platform_gpu_ids;
999   // If we aren't going to use any GPUs, don't initialize them.
1000   // We don't want to call ParseVisibleDeviceList if num_gpus_to_use is 0,
1001   // because it treats an empty gpu_options.visible_device_list as 'all GPUs are
1002   // visible'.
1003   if (num_gpus_to_use > 0) {
1004     TF_RETURN_IF_ERROR(ParseVisibleDeviceList(gpu_options.visible_device_list(),
1005                                               &visible_gpu_order));
1006     TF_RETURN_IF_ERROR(
1007         GetValidDeviceIds(visible_gpu_order, &valid_platform_gpu_ids));
1008   }
1009   if (num_gpus_to_use > valid_platform_gpu_ids.size()) {
1010     num_gpus_to_use = valid_platform_gpu_ids.size();
1011   }
1012   if (!valid_platform_gpu_ids.empty()) {
1013     // Save the original device.
1014     int original_device = 0;
1015 #if GOOGLE_CUDA
1016     cudaError_t err = cudaGetDevice(&original_device);
1017     if (err != cudaSuccess) {
1018       return errors::Internal("cudaGetDevice() failed. Status: ",
1019                               cudaGetErrorString(err));
1020     }
1021 #elif TENSORFLOW_USE_ROCM
1022     hipError_t err = hipGetDevice(&original_device);
1023     if (err != hipSuccess) {
1024       return errors::Internal("hipGetDevice() failed. Status: ",
1025                               hipGetErrorString(err));
1026     }
1027 #endif
1028 
1029     // Force to implicitly initialize CUDA runtime on each valid GPU before
1030     // CreateGPUDevice().
1031     for (PlatformGpuId platform_gpu_id : valid_platform_gpu_ids) {
1032 #if GOOGLE_CUDA
1033       err = cudaSetDevice(platform_gpu_id.value());
1034       if (err != cudaSuccess) {
1035         return errors::Internal(
1036             "cudaSetDevice() on GPU:", platform_gpu_id.value(),
1037             " failed. Status: ", cudaGetErrorString(err));
1038       }
1039       err = cudaFree(nullptr);
1040       if (err != cudaSuccess) {
1041         return errors::Internal("CUDA runtime implicit initialization on GPU:",
1042                                 platform_gpu_id.value(),
1043                                 " failed. Status: ", cudaGetErrorString(err));
1044       }
1045 #elif TENSORFLOW_USE_ROCM
1046       err = hipSetDevice(platform_gpu_id.value());
1047       if (err != hipSuccess) {
1048         return errors::Internal(
1049             "hipSetDevice() on GPU:", platform_gpu_id.value(),
1050             " failed. Status: ", hipGetErrorString(err));
1051       }
1052       err = hipFree(nullptr);
1053       if (err != hipSuccess) {
1054         return errors::Internal("ROCm runtime implicit initialization on GPU:",
1055                                 platform_gpu_id.value(),
1056                                 " failed. Status: ", hipGetErrorString(err));
1057       }
1058 #endif
1059     }
1060     // Reset to the original device.
1061 #if GOOGLE_CUDA
1062     err = cudaSetDevice(original_device);
1063     if (err != cudaSuccess) {
1064       return errors::Internal("cudaSetDevice() on GPU:", original_device,
1065                               " failed. Status: ", cudaGetErrorString(err));
1066     }
1067 #elif TENSORFLOW_USE_ROCM
1068     err = hipSetDevice(original_device);
1069     if (err != hipSuccess) {
1070       return errors::Internal("hipSetDevice() on GPU:", original_device,
1071                               " failed. Status: ", hipGetErrorString(err));
1072     }
1073 #endif
1074   }
1075 
1076   std::vector<InterconnectMap> interconnect_maps;
1077   TF_RETURN_IF_ERROR(
1078       GetInterconnectMaps(visible_gpu_order, gpu_manager, &interconnect_maps));
1079 
1080   // Print each interconnect map to the log.
1081   for (const InterconnectMap& im : interconnect_maps) {
1082     LOG(INFO) << "Device interconnect " << im.name << " with strength "
1083               << im.strength << " edge matrix:";
1084     string line_buf = "     ";
1085     for (int i = 0; i < visible_gpu_order.size(); ++i) {
1086       strings::StrAppend(&line_buf, visible_gpu_order[i].value(), " ");
1087     }
1088     LOG(INFO) << line_buf;
1089     for (int i = 0; i < visible_gpu_order.size(); ++i) {
1090       line_buf = strings::StrCat(visible_gpu_order[i].value(), ":   ");
1091       PlatformGpuId gpu_id_i = visible_gpu_order[i];
1092       for (int j = 0; j < visible_gpu_order.size(); ++j) {
1093         PlatformGpuId gpu_id_j = visible_gpu_order[j];
1094         if (im.directed_links.find({gpu_id_i, gpu_id_j}) !=
1095             im.directed_links.end()) {
1096           line_buf.append("Y ");
1097         } else {
1098           line_buf.append("N ");
1099         }
1100       }
1101       LOG(INFO) << line_buf;
1102     }
1103   }
1104 
1105   const auto& virtual_devices = gpu_options.experimental().virtual_devices();
1106   if (!virtual_devices.empty()) {
1107     TF_RETURN_IF_ERROR(VerifyVirtualDeviceSettings(num_gpus_to_use, gpu_options,
1108                                                    visible_gpu_order,
1109                                                    valid_platform_gpu_ids));
1110     // We've verified that num_gpus_to_use >= virtual_devices.size().
1111     num_gpus_to_use = virtual_devices.size();
1112     CHECK(gpu_options.visible_device_list().empty() ||
1113           valid_platform_gpu_ids == visible_gpu_order);
1114   }
1115   int next_tf_gpu_id = 0;
1116   std::vector<int64> memory_limit_bytes;
1117   for (int i = 0; i < num_gpus_to_use; ++i) {
1118     const PlatformGpuId platform_gpu_id = valid_platform_gpu_ids[i];
1119     if (virtual_devices.empty() ||
1120         virtual_devices.Get(i).memory_limit_mb_size() == 0) {
1121       int64 single_virtual_device_memory_limit = 0;
1122       TF_RETURN_IF_ERROR(SingleVirtualDeviceMemoryLimit(
1123           gpu_options, platform_gpu_id, &single_virtual_device_memory_limit));
1124       memory_limit_bytes.push_back(single_virtual_device_memory_limit);
1125     } else {
1126       const auto& memory_limit_mb = virtual_devices.Get(i).memory_limit_mb();
1127       std::transform(memory_limit_mb.begin(), memory_limit_mb.end(),
1128                      std::back_inserter(memory_limit_bytes), [](float mb) {
1129                        return static_cast<int64>(mb) * (1ll << 20);
1130                      });
1131     }
1132     while (next_tf_gpu_id < memory_limit_bytes.size()) {
1133       TfGpuId tf_gpu_id(next_tf_gpu_id);
1134       ++next_tf_gpu_id;
1135       TF_RETURN_IF_ERROR(
1136           GpuIdManager::InsertTfPlatformGpuIdPair(tf_gpu_id, platform_gpu_id));
1137     }
1138   }
1139   const int num_tf_gpus = next_tf_gpu_id;
1140 
1141   LocalityMap device_localities;
1142   TF_RETURN_IF_ERROR(
1143       GetDeviceLocalities(num_tf_gpus, interconnect_maps, &device_localities));
1144 
1145   // Build the GPUDevices
1146   CHECK_EQ(next_tf_gpu_id, memory_limit_bytes.size());
1147   for (int di = 0; di < num_tf_gpus; ++di) {
1148     TfGpuId tf_gpu_id(di);
1149     int64 bytes = memory_limit_bytes[di];
1150     auto it = device_localities.find(tf_gpu_id);
1151     if (it == device_localities.end()) {
1152       return errors::Internal("Failed to find DeviceLocality for GPU device ",
1153                               tf_gpu_id.value());
1154     }
1155     TF_RETURN_IF_ERROR(CreateGPUDevice(options, name_prefix, tf_gpu_id, bytes,
1156                                        it->second, devices));
1157   }
1158   return Status::OK();
1159 }
1160 
GetShortDeviceDescription(PlatformGpuId platform_gpu_id,const se::DeviceDescription & desc)1161 static string GetShortDeviceDescription(PlatformGpuId platform_gpu_id,
1162                                         const se::DeviceDescription& desc) {
1163 #if GOOGLE_CUDA
1164   int cc_major;
1165   int cc_minor;
1166   if (!desc.cuda_compute_capability(&cc_major, &cc_minor)) {
1167     cc_major = 0;
1168     cc_minor = 0;
1169   }
1170   // LINT.IfChange
1171   return strings::StrCat("device: ", platform_gpu_id.value(), ", name: ",
1172                          desc.name(), ", pci bus id: ", desc.pci_bus_id(),
1173                          ", compute capability: ", cc_major, ".", cc_minor);
1174   // LINT.ThenChange(//tensorflow/python/platform/test.py)
1175 #elif TENSORFLOW_USE_ROCM
1176   return strings::StrCat("device: ", platform_gpu_id.value(),
1177                          ", name: ", desc.name(),
1178                          ", pci bus id: ", desc.pci_bus_id());
1179 #endif
1180 }
1181 
CreateGPUDevice(const SessionOptions & options,const string & name_prefix,TfGpuId tf_gpu_id,int64 memory_limit,const DeviceLocality & dev_locality,std::vector<std::unique_ptr<Device>> * devices)1182 Status BaseGPUDeviceFactory::CreateGPUDevice(
1183     const SessionOptions& options, const string& name_prefix, TfGpuId tf_gpu_id,
1184     int64 memory_limit, const DeviceLocality& dev_locality,
1185     std::vector<std::unique_ptr<Device>>* devices) {
1186   CHECK_GE(tf_gpu_id.value(), 0);
1187   const string device_name =
1188       strings::StrCat(name_prefix, "/device:GPU:", tf_gpu_id.value());
1189   GpuIdUtil::CheckValidTfGpuId(tf_gpu_id);
1190   PlatformGpuId platform_gpu_id;
1191   TF_RETURN_IF_ERROR(
1192       GpuIdManager::TfToPlatformGpuId(tf_gpu_id, &platform_gpu_id));
1193   int numa_node = dev_locality.numa_node();
1194 
1195   se::StreamExecutor* se =
1196       GpuIdUtil::ExecutorForPlatformGpuId(platform_gpu_id).ValueOrDie();
1197   const se::DeviceDescription& desc = se->GetDeviceDescription();
1198   GPUProcessState* process_state = GPUProcessState::singleton();
1199   Allocator* gpu_allocator = process_state->GetGPUAllocator(
1200       options.config.gpu_options(), tf_gpu_id, memory_limit);
1201   if (gpu_allocator == nullptr) {
1202     return errors::Internal("Failed to get memory allocator for TF GPU ",
1203                             tf_gpu_id.value(), " with ", memory_limit,
1204                             " bytes of memory.");
1205   }
1206   absl::optional<AllocatorStats> stats = gpu_allocator->GetStats();
1207   if (!stats) {
1208     return errors::Internal("No allocator statistics");
1209   }
1210   // 'memory_limit' is the required memory size, but if the allocator with given
1211   // tf_gpu_id was created before, we'll use it instead of creating a new one
1212   // (as TF gpu device is a shared resource), in which case the actual memory
1213   // limit represented by 'stats.bytes_limit' used by that allocator may be
1214   // different (which should be an error).
1215   //
1216   // TODO(laigd): report error if memory_limit doesn't match stats->bytes_limit.
1217   int64 bytes_limit = stats->bytes_limit ? *stats->bytes_limit : 0;
1218   std::unique_ptr<BaseGPUDevice> gpu_device = CreateGPUDevice(
1219       options, device_name, static_cast<Bytes>(bytes_limit), dev_locality,
1220       tf_gpu_id, GetShortDeviceDescription(platform_gpu_id, desc),
1221       gpu_allocator, ProcessState::singleton()->GetCPUAllocator(numa_node));
1222   LOG(INFO) << "Created TensorFlow device (" << device_name << " with "
1223             << (bytes_limit >> 20) << " MB memory) -> physical GPU ("
1224             << GetShortDeviceDescription(platform_gpu_id, desc) << ")";
1225   TF_RETURN_IF_ERROR(gpu_device->Init(options));
1226   devices->push_back(std::move(gpu_device));
1227 
1228   return Status::OK();
1229 }
1230 
1231 namespace {
1232 std::unique_ptr<std::map<std::pair<PlatformGpuId, PlatformGpuId>, bool>>
GetPeerAccessMap(se::Platform * platform,const std::vector<PlatformGpuId> & visible_gpu_order)1233 GetPeerAccessMap(se::Platform* platform,
1234                  const std::vector<PlatformGpuId>& visible_gpu_order) {
1235   std::unique_ptr<std::map<std::pair<PlatformGpuId, PlatformGpuId>, bool>> map(
1236       new std::map<std::pair<PlatformGpuId, PlatformGpuId>, bool>);
1237   for (PlatformGpuId platform_gpu_i : visible_gpu_order) {
1238     for (PlatformGpuId platform_gpu_j : visible_gpu_order) {
1239       se::StreamExecutor* from =
1240           GpuIdUtil::ExecutorForPlatformGpuId(platform, platform_gpu_i)
1241               .ValueOrDie();
1242       se::StreamExecutor* to =
1243           GpuIdUtil::ExecutorForPlatformGpuId(platform, platform_gpu_j)
1244               .ValueOrDie();
1245       (*map)[{platform_gpu_i, platform_gpu_j}] =
1246           from->CanEnablePeerAccessTo(to);
1247     }
1248   }
1249 
1250   return map;
1251 }
1252 
1253 }  // namespace
1254 
GetInterconnectMaps(const std::vector<PlatformGpuId> & visible_gpu_order,se::Platform * gpu_manager,std::vector<InterconnectMap> * maps)1255 Status BaseGPUDeviceFactory::GetInterconnectMaps(
1256     const std::vector<PlatformGpuId>& visible_gpu_order,
1257     se::Platform* gpu_manager, std::vector<InterconnectMap>* maps) {
1258   // The default interconnect map is obtained from the StreamExecutor.
1259   auto access_map = GetPeerAccessMap(gpu_manager, visible_gpu_order);
1260   maps->resize(1);
1261   InterconnectMap& imap = maps->at(0);
1262   imap.name = "StreamExecutor";
1263   imap.strength = InterconnectMap::kStreamExecutorStrength;
1264   for (PlatformGpuId gpu_id_i : visible_gpu_order) {
1265     for (PlatformGpuId gpu_id_j : visible_gpu_order) {
1266       if (gpu_id_i == gpu_id_j) continue;
1267       if ((*access_map)[{gpu_id_i, gpu_id_j}]) {
1268         imap.directed_links.insert({gpu_id_i, gpu_id_j});
1269       }
1270     }
1271   }
1272   return Status::OK();
1273 }
1274 
GetDeviceLocalities(int num_tf_gpus,const std::vector<InterconnectMap> & interconnects,LocalityMap * localities)1275 Status BaseGPUDeviceFactory::GetDeviceLocalities(
1276     int num_tf_gpus, const std::vector<InterconnectMap>& interconnects,
1277     LocalityMap* localities) {
1278   std::vector<TfGpuId> all_tf_gpu_ids;
1279   all_tf_gpu_ids.reserve(num_tf_gpus);
1280   for (int i = 0; i < num_tf_gpus; ++i) {
1281     all_tf_gpu_ids.push_back(TfGpuId(i));
1282   }
1283   for (TfGpuId tf_gpu_id : all_tf_gpu_ids) {
1284     PlatformGpuId platform_gpu_id;
1285     TF_RETURN_IF_ERROR(
1286         GpuIdManager::TfToPlatformGpuId(tf_gpu_id, &platform_gpu_id));
1287     // Get GPU bus_id from its reported NUMA affinity.  Because GPUs are
1288     // virtualized in some environments, we can't just use the GPU id.
1289     // NUMA locales are indexed from 0, buses are indexed from 1.
1290     se::StreamExecutor* se =
1291         GpuIdUtil::ExecutorForPlatformGpuId(platform_gpu_id).ValueOrDie();
1292     const se::DeviceDescription& desc = se->GetDeviceDescription();
1293     int numa_node = desc.numa_node();
1294     if (numa_node < 0) {
1295       // For some reason the StreamExecutor couldn't get the NUMA
1296       // affinity of the GPU.  If this is not a multi-socket mobo with
1297       // GPUs local to different buses, it doesn't matter.  If it is, we
1298       // may run into trouble later with data transfer operations.  The
1299       // trouble may manifest as slower than expected performance, or
1300       // outright failures.
1301       LOG(INFO) << "Could not identify NUMA node of platform GPU id "
1302                 << platform_gpu_id
1303                 << ", defaulting to 0.  Your kernel may not have been built "
1304                 << "with NUMA support.";
1305       numa_node = 0;
1306     }
1307     DeviceLocality dev_locality;
1308     dev_locality.set_numa_node(numa_node);
1309     dev_locality.set_bus_id(numa_node + 1);
1310 
1311     // Set LocalLinks from InterconnectMaps.
1312     LocalLinks* links = dev_locality.mutable_links();
1313     for (const InterconnectMap& imap : interconnects) {
1314       for (TfGpuId tf_gpu_dst : all_tf_gpu_ids) {
1315         PlatformGpuId platform_gpu_dst;
1316         TF_RETURN_IF_ERROR(
1317             GpuIdManager::TfToPlatformGpuId(tf_gpu_dst, &platform_gpu_dst));
1318         if (imap.directed_links.find({platform_gpu_id, platform_gpu_dst}) !=
1319             imap.directed_links.end()) {
1320           InterconnectLink* ilink = links->add_link();
1321           ilink->set_device_id(tf_gpu_dst.value());
1322           ilink->set_type(imap.name);
1323           ilink->set_strength(imap.strength);
1324         }
1325       }
1326     }
1327 
1328     // If this is one of multiple virtual GPUs on the same physical GPU
1329     // add high strength links to the others.
1330     for (TfGpuId tf_gpu_dst : all_tf_gpu_ids) {
1331       if (tf_gpu_id == tf_gpu_dst) continue;
1332       PlatformGpuId platform_gpu_dst;
1333       TF_RETURN_IF_ERROR(
1334           GpuIdManager::TfToPlatformGpuId(tf_gpu_dst, &platform_gpu_dst));
1335       if (platform_gpu_id == platform_gpu_dst) {
1336         InterconnectLink* ilink = links->add_link();
1337         ilink->set_device_id(tf_gpu_dst.value());
1338         ilink->set_type("SAME_DEVICE");
1339         ilink->set_strength(InterconnectMap::kSameDeviceStrength);
1340       }
1341     }
1342 
1343     (*localities)[tf_gpu_id] = dev_locality;
1344     VLOG(1) << "GPUDevice PlatformGpuId " << platform_gpu_id << " TfGpuId "
1345             << tf_gpu_id << " on bus " << dev_locality.bus_id()
1346             << " numa: " << numa_node << " pci: " << desc.pci_bus_id()
1347             << " DeviceLocality: " << dev_locality.DebugString();
1348   }
1349   return Status::OK();
1350 }
1351 
GetDefaultMinGPUMultiprocessorCount(se::Platform * gpu_manager,const std::vector<PlatformGpuId> & visible_gpu_order)1352 static int GetDefaultMinGPUMultiprocessorCount(
1353     se::Platform* gpu_manager,
1354     const std::vector<PlatformGpuId>& visible_gpu_order) {
1355   static const int kDefaultMinGPUMultiprocessorCount = 8;
1356 
1357   // Find the highest multi-processor count across all visible GPUs.
1358   int max_count = -1;
1359   for (int i = 0; i < visible_gpu_order.size(); ++i) {
1360     auto exec_status =
1361         GpuIdUtil::ExecutorForPlatformGpuId(gpu_manager, visible_gpu_order[i]);
1362     if (!exec_status.ok()) {
1363       continue;
1364     }
1365 
1366     se::StreamExecutor* se = exec_status.ValueOrDie();
1367     const se::DeviceDescription& desc = se->GetDeviceDescription();
1368     max_count = std::max(max_count, desc.core_count());
1369   }
1370 
1371   if (max_count < 0 || kDefaultMinGPUMultiprocessorCount < max_count) {
1372     return kDefaultMinGPUMultiprocessorCount;
1373   } else {
1374     return max_count;
1375   }
1376 }
1377 
GetMinGPUMultiprocessorCount(se::Platform * gpu_manager,const std::vector<PlatformGpuId> & visible_gpu_order)1378 static int GetMinGPUMultiprocessorCount(
1379     se::Platform* gpu_manager,
1380     const std::vector<PlatformGpuId>& visible_gpu_order) {
1381   const char* tf_min_gpu_core_count = getenv("TF_MIN_GPU_MULTIPROCESSOR_COUNT");
1382 
1383   if (tf_min_gpu_core_count == nullptr ||
1384       strcmp(tf_min_gpu_core_count, "") == 0) {
1385     return GetDefaultMinGPUMultiprocessorCount(gpu_manager, visible_gpu_order);
1386   }
1387 
1388   int min_gpu_core_count = -1;
1389   if (strings::safe_strto32(tf_min_gpu_core_count, &min_gpu_core_count)) {
1390     if (min_gpu_core_count >= 0) {
1391       return min_gpu_core_count;
1392     }
1393   }
1394 
1395   int count =
1396       GetDefaultMinGPUMultiprocessorCount(gpu_manager, visible_gpu_order);
1397   LOG(ERROR) << "Invalid minimum GPU multiprocessor count: ["
1398              << tf_min_gpu_core_count << "]. "
1399              << "Using the default value: " << count;
1400   return count;
1401 }
1402 
1403 namespace {
1404 
1405 #if GOOGLE_CUDA
1406 struct CudaVersion {
1407   // Initialize from version_name in the form of "3.5"
CudaVersiontensorflow::__anon0a6e82d20a11::CudaVersion1408   explicit CudaVersion(const std::string& version_name) {
1409     size_t dot_pos = version_name.find('.');
1410     CHECK(dot_pos != string::npos)
1411         << "Illegal version name: [" << version_name << "]";
1412     string major_str = version_name.substr(0, dot_pos);
1413     CHECK(strings::safe_strto32(major_str, &major_part))
1414         << "Illegal version name: [" << version_name << "]";
1415     string minor_str = version_name.substr(dot_pos + 1);
1416     CHECK(strings::safe_strto32(minor_str, &minor_part))
1417         << "Illegal version name: [" << version_name << "]";
1418   }
CudaVersiontensorflow::__anon0a6e82d20a11::CudaVersion1419   CudaVersion() {}
operator <tensorflow::__anon0a6e82d20a11::CudaVersion1420   bool operator<(const CudaVersion& other) const {
1421     if (this->major_part != other.major_part) {
1422       return this->major_part < other.major_part;
1423     }
1424     return this->minor_part < other.minor_part;
1425   }
operator <<(std::ostream & os,const CudaVersion & version)1426   friend std::ostream& operator<<(std::ostream& os,
1427                                   const CudaVersion& version) {
1428     os << version.major_part << "." << version.minor_part;
1429     return os;
1430   }
1431   int major_part = -1;
1432   int minor_part = -1;
1433 };
1434 
1435 std::vector<CudaVersion> supported_cuda_compute_capabilities = {
1436     TF_CUDA_CAPABILITIES,};
1437 
GetSupportedCudaComputeCapabilities()1438 std::vector<CudaVersion> GetSupportedCudaComputeCapabilities() {
1439   auto cuda_caps = supported_cuda_compute_capabilities;
1440 #ifdef TF_EXTRA_CUDA_CAPABILITIES
1441 // TF_EXTRA_CUDA_CAPABILITIES should be defined a sequence separated by commas,
1442 // for example:
1443 //   TF_EXTRA_CUDA_CAPABILITIES=3.0,4.0,5.0
1444 // Use two-level macro expansion for stringification.
1445 #define TF_XSTRING(...) #__VA_ARGS__
1446 #define TF_STRING(s) TF_XSTRING(s)
1447   string extra_cuda_caps = TF_STRING(TF_EXTRA_CUDA_CAPABILITIES);
1448 #undef TF_STRING
1449 #undef TF_XSTRING
1450   auto extra_capabilities = str_util::Split(extra_cuda_caps, ',');
1451   for (const auto& capability : extra_capabilities) {
1452     cuda_caps.push_back(CudaVersion(capability));
1453   }
1454 #endif
1455   return cuda_caps;
1456 }
1457 #endif  // GOOGLE_CUDA
1458 
1459 #if TENSORFLOW_USE_ROCM
1460 std::vector<int> supported_amdgpu_isa_versions = {803, 900, 906};
1461 
GetSupportedAMDGPUISAVersions()1462 std::vector<int> GetSupportedAMDGPUISAVersions() {
1463   return supported_amdgpu_isa_versions;
1464 }
1465 #endif  // TENSORFLOW_USE_ROCM
1466 
EnablePeerAccess(se::Platform * platform,const std::vector<PlatformGpuId> & visible_gpu_order)1467 Status EnablePeerAccess(se::Platform* platform,
1468                         const std::vector<PlatformGpuId>& visible_gpu_order) {
1469   int possible_peer_count = 0;
1470   int enabled_peer_count = 0;
1471   for (int i = 0; i < visible_gpu_order.size(); ++i) {
1472     const PlatformGpuId platform_gpu_i = visible_gpu_order[i];
1473     for (int j = 0; j < visible_gpu_order.size(); ++j) {
1474       const PlatformGpuId platform_gpu_j = visible_gpu_order[j];
1475       // We have already validated that ExecutorForDevice() calls return OK.
1476       se::StreamExecutor* from =
1477           GpuIdUtil::ExecutorForPlatformGpuId(platform, platform_gpu_i)
1478               .ValueOrDie();
1479       se::StreamExecutor* to =
1480           GpuIdUtil::ExecutorForPlatformGpuId(platform, platform_gpu_j)
1481               .ValueOrDie();
1482 
1483       if (from->CanEnablePeerAccessTo(to)) {
1484         ++possible_peer_count;
1485         auto status = from->EnablePeerAccessTo(to);
1486         if (!status.ok()) {
1487           LOG(WARNING)
1488               << "Unable to enable peer access between device ordinals "
1489               << platform_gpu_i << " and " << platform_gpu_j
1490               << ", status: " << status;
1491         } else {
1492           ++enabled_peer_count;
1493         }
1494       }
1495     }
1496   }
1497 
1498   // Return an error in the extreme failure case where the driver
1499   // reported that peering was possible but not a single peering was
1500   // successful.  This is to catch possible system misconfigurations
1501   // or more fundamental issues.
1502   if (possible_peer_count > 0 && enabled_peer_count == 0) {
1503     return errors::Internal(possible_peer_count,
1504                             " potential peer access pairs were reported by the "
1505                             "driver, but no peering could be enabled.");
1506   }
1507   return Status::OK();
1508 }
1509 
1510 }  // namespace
1511 
GetValidDeviceIds(const std::vector<PlatformGpuId> & visible_gpu_order,std::vector<PlatformGpuId> * ids)1512 Status BaseGPUDeviceFactory::GetValidDeviceIds(
1513     const std::vector<PlatformGpuId>& visible_gpu_order,
1514     std::vector<PlatformGpuId>* ids) {
1515   se::Platform* gpu_manager = GPUMachineManager();
1516   bool new_gpu_found = false;
1517   for (int i = 0; i < visible_gpu_order.size(); ++i) {
1518     const PlatformGpuId visible_gpu_id = visible_gpu_order[i];
1519 
1520     // Only perform this once per visible platform gpu id.
1521     if (visible_gpu_initialized_[visible_gpu_id.value()]) {
1522       continue;
1523     }
1524 
1525     visible_gpu_initialized_[visible_gpu_id.value()] = true;
1526     new_gpu_found = true;
1527 
1528     auto executor =
1529         GpuIdUtil::ExecutorForPlatformGpuId(gpu_manager, visible_gpu_id);
1530     if (!executor.ok()) {
1531       return executor.status();
1532     }
1533 
1534     auto stream_exec = executor.ValueOrDie();
1535     int64 free_bytes;
1536     int64 total_bytes;
1537     if (!stream_exec->DeviceMemoryUsage(&free_bytes, &total_bytes)) {
1538       // Logs internally on failure.
1539       free_bytes = 0;
1540       total_bytes = 0;
1541     }
1542     const auto& description = stream_exec->GetDeviceDescription();
1543 #if GOOGLE_CUDA
1544     int cc_major;
1545     int cc_minor;
1546     if (!description.cuda_compute_capability(&cc_major, &cc_minor)) {
1547       // Logs internally on failure.
1548       cc_major = 0;
1549       cc_minor = 0;
1550     }
1551     LOG(INFO) << "Found device " << i << " with properties: "
1552               << "\nname: " << description.name() << " major: " << cc_major
1553               << " minor: " << cc_minor
1554               << " memoryClockRate(GHz): " << description.clock_rate_ghz()
1555               << "\npciBusID: " << description.pci_bus_id() << "\ntotalMemory: "
1556               << strings::HumanReadableNumBytes(total_bytes)
1557               << " freeMemory: " << strings::HumanReadableNumBytes(free_bytes);
1558 #elif TENSORFLOW_USE_ROCM
1559     int isa_version;
1560     if (!description.rocm_amdgpu_isa_version(&isa_version)) {
1561       // Logs internally on failure.
1562       isa_version = 0;
1563     }
1564     LOG(INFO) << "Found device " << i << " with properties: "
1565               << "\nname: " << description.name() << "\nAMDGPU ISA: gfx"
1566               << isa_version << "\nmemoryClockRate (GHz) "
1567               << description.clock_rate_ghz() << "\npciBusID "
1568               << description.pci_bus_id() << "\nTotal memory: "
1569               << strings::HumanReadableNumBytes(total_bytes)
1570               << "\nFree memory: "
1571               << strings::HumanReadableNumBytes(free_bytes);
1572 #endif
1573   }
1574   // Checking peering and shows matrix if more than one gpu found.
1575   if (new_gpu_found && visible_gpu_order.size() > 1) {
1576     // Enable peer access
1577     TF_RETURN_IF_ERROR(EnablePeerAccess(gpu_manager, visible_gpu_order));
1578   }
1579 
1580 #if GOOGLE_CUDA
1581   auto cuda_supported_capabilities = GetSupportedCudaComputeCapabilities();
1582   if (cuda_supported_capabilities.empty()) {
1583     return errors::FailedPrecondition(
1584         "No supported cuda capabilities in binary.");
1585   }
1586   CudaVersion min_supported_capability = *std::min_element(
1587       cuda_supported_capabilities.begin(), cuda_supported_capabilities.end());
1588 #elif TENSORFLOW_USE_ROCM
1589   auto rocm_supported_isas = GetSupportedAMDGPUISAVersions();
1590   if (rocm_supported_isas.empty()) {
1591     return errors::FailedPrecondition(
1592         "No supported rocm capabilities in binary.");
1593   }
1594   int min_supported_isa =
1595       *std::min_element(rocm_supported_isas.begin(), rocm_supported_isas.end());
1596 #endif
1597 
1598   int min_gpu_core_count =
1599       GetMinGPUMultiprocessorCount(gpu_manager, visible_gpu_order);
1600 
1601   // Filter out devices that don't have the right capability or power.
1602   for (int i = 0; i < visible_gpu_order.size(); ++i) {
1603     const PlatformGpuId visible_gpu_id = visible_gpu_order[i];
1604     auto exec_status =
1605         GpuIdUtil::ExecutorForPlatformGpuId(gpu_manager, visible_gpu_id);
1606     if (!exec_status.ok()) {
1607       LOG(INFO) << "Ignoring visible gpu device " << visible_gpu_id
1608                 << " whose executor is in invalid state: "
1609                 << exec_status.status().ToString();
1610       continue;
1611     }
1612     se::StreamExecutor* se = exec_status.ValueOrDie();
1613     const se::DeviceDescription& desc = se->GetDeviceDescription();
1614 
1615 #if GOOGLE_CUDA
1616     CudaVersion device_capability;
1617     if (!desc.cuda_compute_capability(&device_capability.major_part,
1618                                       &device_capability.minor_part)) {
1619       LOG(INFO) << "Ignoring visible gpu device "
1620                 << "(" << GetShortDeviceDescription(visible_gpu_id, desc)
1621                 << ") "
1622                 << "whose CUDA compute capability is not available.";
1623       continue;
1624     }
1625     // Only GPUs with no less than the minimum supported compute capability is
1626     // accepted.
1627     if (device_capability < min_supported_capability) {
1628       LOG(INFO) << "Ignoring visible gpu device "
1629                 << "(" << GetShortDeviceDescription(visible_gpu_id, desc)
1630                 << ") "
1631                 << "with Cuda compute capability " << device_capability
1632                 << ". The minimum required Cuda capability is "
1633                 << min_supported_capability << ".";
1634       continue;
1635     }
1636 #elif TENSORFLOW_USE_ROCM
1637     int device_isa;
1638     if (!desc.rocm_amdgpu_isa_version(&device_isa)) {
1639       continue;
1640     }
1641     // Only GPUs with no less than the minimum supported compute capability is
1642     // accepted.
1643     if (device_isa < min_supported_isa) {
1644       LOG(INFO) << "Ignoring visible gpu device "
1645                 << "(" << GetShortDeviceDescription(visible_gpu_id, desc)
1646                 << ") "
1647                 << "with AMDGPU ISA gfx" << device_isa
1648                 << ". The minimum required AMDGPU ISA is gfx"
1649                 << min_supported_isa << ".";
1650       continue;
1651     }
1652 #endif
1653 
1654     // Filter out slow GPUs. By default, GPUs with a lower multiprocessor
1655     // count than the fastest GPU are filtered out, unless they have 8 or more
1656     // multiprocessors. If the TF_MIN_GPU_MULTIPROCESSOR_COUNT environment
1657     // variable is set, its value will be used to filter out GPUs.
1658     if (desc.core_count() < min_gpu_core_count) {
1659       LOG(INFO) << "Ignoring visible gpu device "
1660                 << "(" << GetShortDeviceDescription(visible_gpu_id, desc)
1661                 << ") "
1662                 << "with core count: " << desc.core_count()
1663                 << ". The minimum required count is " << min_gpu_core_count
1664                 << ". You can adjust this requirement with the env var "
1665                    "TF_MIN_GPU_MULTIPROCESSOR_COUNT.";
1666       continue;
1667     }
1668     ids->push_back(visible_gpu_id);
1669   }
1670   if (!ids->empty()) {
1671     std::vector<int> raw_ids(ids->size());
1672     std::transform(ids->begin(), ids->end(), raw_ids.begin(),
1673                    [](PlatformGpuId id) -> int { return id.value(); });
1674     LOG(INFO) << "Adding visible gpu devices: "
1675               << str_util::Join(raw_ids, ", ");
1676   }
1677 
1678   return Status::OK();
1679 }
1680 
SafeAllocFrontier()1681 uint64 BaseGPUDevice::SafeAllocFrontier() {
1682   if (timestamped_allocator_) {
1683     return kernel_tracker_->LastTerminatedCount();
1684   } else {
1685     return 0;
1686   }
1687 }
1688 
PendingKernels()1689 int BaseGPUDevice::PendingKernels() {
1690   if (kernel_tracker_) {
1691     return kernel_tracker_->NumPending();
1692   }
1693   return 0;
1694 }
1695 
RecordQueued()1696 uint64 GPUKernelTracker::RecordQueued() {
1697   mutex_lock l(mu_);
1698   uint64 queued_count = timing_counter_->next();
1699   VLOG(2) << "RecordQueued queued_count=" << queued_count
1700           << " first_available_=" << first_available_
1701           << " last_completed_=" << last_completed_
1702           << " num_pending_=" << num_pending_;
1703   pending_kernels_[first_available_].queued_count = queued_count;
1704   pending_kernels_[first_available_].terminated = false;
1705   ++first_available_;
1706   ++num_pending_;
1707   if (first_available_ >= pending_kernels_.size()) {
1708     first_available_ = 0;
1709   }
1710   if (first_available_ == last_completed_) {
1711     // Ring buffer is full: double it.  All of the same valid PendingKernel
1712     // entries exist after the copy, they are just shifted to begin
1713     // at index 0 in the new array.
1714     std::vector<PendingKernel> new_buffer(pending_kernels_.size() * 2);
1715     for (int i = 0; i < pending_kernels_.size(); ++i) {
1716       int j = (i + last_completed_) % pending_kernels_.size();
1717       new_buffer[i] = pending_kernels_[j];
1718     }
1719     last_completed_ = 0;
1720     first_available_ = pending_kernels_.size();
1721     pending_kernels_.swap(new_buffer);
1722     VLOG(1) << "last_completed_=" << last_completed_
1723             << " first_available_=" << first_available_
1724             << " num_pending_=" << num_pending_;
1725   }
1726   DCHECK_NE(first_available_, last_completed_) << "exhausted pending_kernels";
1727   return queued_count;
1728 }
1729 
RecordTerminated(uint64 queued_count)1730 void GPUKernelTracker::RecordTerminated(uint64 queued_count) {
1731   mutex_lock l(mu_);
1732   VLOG(2) << "RecordTerminated queued_count=" << queued_count
1733           << " first_available_=" << first_available_
1734           << " last_completed_=" << last_completed_
1735           << " num_pending_=" << num_pending_ << " LC="
1736           << ((last_completed_ >= 0)
1737                   ? pending_kernels_[last_completed_].queued_count
1738                   : -1);
1739   DCHECK_NE(first_available_, last_completed_);
1740   DCHECK_GT(num_pending_, 0);
1741   // Starting just past the last completed entry, find the entry with
1742   // this queued_count and mark it done.
1743   int index = (last_completed_ + 1) % pending_kernels_.size();
1744   while (true) {
1745     if (index == first_available_) {
1746       // This should never happen.
1747       LOG(FATAL) << "Failed to find " << queued_count  // Crash OK
1748                  << " in queue";
1749     }
1750     if (pending_kernels_[index].queued_count == queued_count) {
1751       pending_kernels_[index].terminated = true;
1752       break;
1753     }
1754     index = (index + 1) % pending_kernels_.size();
1755   }
1756   // Next move last_completed_ forward past all completed kernels.  In theory
1757   // kernels should always complete in queued order so we should be able to
1758   // advance the completed frontier to the last queued PendingKernel.  In
1759   // practice we occassionally see the termination callbacks arrive out of order
1760   // probably because of thread scheduling.  Eventually we may support out-of-
1761   // order completion involving multple compute streams so here we follow a
1762   // conservative approach and wait for every single callback to arrive before
1763   // advancing the frontier.
1764   while (true) {
1765     int next_index = (last_completed_ + 1) % pending_kernels_.size();
1766     if (next_index == first_available_) break;
1767     if (pending_kernels_[next_index].terminated) {
1768       last_completed_ = next_index;
1769     } else {
1770       break;
1771     }
1772   }
1773   // Last decrease num_pending before maybe waking a waiter.
1774   --num_pending_;
1775   pending_decreased_.notify_one();
1776 }
1777 
LastTerminatedCount()1778 uint64 GPUKernelTracker::LastTerminatedCount() {
1779   mutex_lock l(mu_);
1780   if (last_completed_ < 0) {
1781     // This is an edge case that can be encountered only at the beginning of
1782     // execution.  There's not yet a safe threshold count. We don't want to
1783     // return 0 since that bypasses the count mechanism in BFCAllocator, so
1784     // return the least non-zero value.
1785     return 1;
1786   }
1787   return pending_kernels_[last_completed_].queued_count;
1788 }
1789 
1790 }  // namespace tensorflow
1791 
1792 #endif  // GOOGLE_CUDA
1793