1 /* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
2
3 Licensed under the Apache License, Version 2.0 (the "License");
4 you may not use this file except in compliance with the License.
5 You may obtain a copy of the License at
6
7 http://www.apache.org/licenses/LICENSE-2.0
8
9 Unless required by applicable law or agreed to in writing, software
10 distributed under the License is distributed on an "AS IS" BASIS,
11 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 See the License for the specific language governing permissions and
13 limitations under the License.
14 ==============================================================================*/
15
16 // TODO(opensource): Use a more generic sounding preprocessor name than
17 // GOOGLE_CUDA
18 #if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
19
20 #if TENSORFLOW_USE_ROCM
21 #include "rocm/include/hip/hip_runtime.h"
22 #endif
23
24 #define EIGEN_USE_GPU
25
26 #include "tensorflow/core/common_runtime/gpu/gpu_device.h"
27
28 #include <stdlib.h>
29 #include <string.h>
30 #include <algorithm>
31 #include <list>
32 #include <map>
33 #include <tuple>
34 #include <vector>
35
36 #include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
37 #include "tensorflow/core/common_runtime/device_factory.h"
38 #include "tensorflow/core/common_runtime/gpu/gpu_event_mgr.h"
39 #include "tensorflow/core/common_runtime/gpu/gpu_id.h"
40 #include "tensorflow/core/common_runtime/gpu/gpu_id_manager.h"
41 #include "tensorflow/core/common_runtime/gpu/gpu_id_utils.h"
42 #include "tensorflow/core/common_runtime/gpu/gpu_init.h"
43 #include "tensorflow/core/common_runtime/gpu/gpu_process_state.h"
44 #include "tensorflow/core/common_runtime/gpu/gpu_stream_util.h"
45 #include "tensorflow/core/common_runtime/gpu/gpu_util.h"
46 #include "tensorflow/core/common_runtime/gpu_device_context.h"
47 #include "tensorflow/core/common_runtime/local_device.h"
48 #include "tensorflow/core/framework/allocator.h"
49 #include "tensorflow/core/framework/device_base.h"
50 #include "tensorflow/core/framework/op_kernel.h"
51 #include "tensorflow/core/framework/tensor.h"
52 #include "tensorflow/core/framework/tensor.pb.h"
53 #include "tensorflow/core/framework/types.h"
54 #include "tensorflow/core/framework/variant_op_registry.h"
55 #include "tensorflow/core/graph/types.h"
56 #include "tensorflow/core/lib/core/errors.h"
57 #include "tensorflow/core/lib/core/status.h"
58 #include "tensorflow/core/lib/gtl/stl_util.h"
59 #include "tensorflow/core/lib/strings/numbers.h"
60 #include "tensorflow/core/lib/strings/str_util.h"
61 #include "tensorflow/core/lib/strings/strcat.h"
62 #if GOOGLE_CUDA
63 #include "tensorflow/core/platform/cuda.h"
64 #elif TENSORFLOW_USE_ROCM
65 #include "tensorflow/core/platform/rocm.h"
66 #endif
67 #include "tensorflow/core/platform/logging.h"
68 #include "tensorflow/core/platform/macros.h"
69 #include "tensorflow/core/platform/stream_executor.h"
70 #include "tensorflow/core/platform/tracing.h"
71 #include "tensorflow/core/platform/types.h"
72 #include "tensorflow/core/public/session_options.h"
73 #include "tensorflow/core/util/device_name_utils.h"
74 #include "tensorflow/core/util/env_var.h"
75 #include "tensorflow/core/util/stream_executor_util.h"
76
77 #if !defined(PLATFORM_GOOGLE)
78 #if GOOGLE_CUDA
79 #include "cuda/cuda_config.h"
80 #endif
81 #endif
82
83 namespace tensorflow {
84
85 #if GOOGLE_CUDA
86
87 typedef cudaStream_t gpuStream_t;
88 typedef cudaDeviceProp gpuDeviceProp_t;
89 #define EIGEN_GPU_SCRATCH_SIZE (Eigen::kGpuScratchSize)
90 using se::cuda::ScopedActivateExecutorContext;
91
92 #elif TENSORFLOW_USE_ROCM
93
94 typedef hipStream_t gpuStream_t;
95 typedef hipDeviceProp_t gpuDeviceProp_t;
96 #define EIGEN_GPU_SCRATCH_SIZE (Eigen::kGpuScratchSize)
97 using se::rocm::ScopedActivateExecutorContext;
98
99 #endif
100
101 // Eigen Ops directly allocate memory only for temporary buffers used
102 // during OpKernel::Compute(). The recommended way of allocating such
103 // memory is via OpKernelContext::allocate_temp(). However, Eigen Ops
104 // don't have access to OpKernelContext, instead they get access to
105 // memory directly through the device allocator. As an Open Source
106 // project, Eigen assumes allocator semantics similar to those of the
107 // CUDA or ROCm memory allocator, and may not work correctly due to race
108 // conditions if used with some other allocator. For safety, we need
109 // to delay deallocation calls out of Eigen until all events on the
110 // corresponding stream have completed. The following two classes
111 // serve this purpose in two different compilation environments.
112
113 class EigenGpuStreamDevice : public ::Eigen::StreamInterface {
114 public:
EigenGpuStreamDevice()115 EigenGpuStreamDevice()
116 : scratch_(nullptr), semaphore_(nullptr), context_(nullptr) {
117 Eigen::initializeDeviceProp();
118 }
~EigenGpuStreamDevice()119 ~EigenGpuStreamDevice() override {}
Reinitialize(OpKernelContext * context,const gpuStream_t * gpu_stream,TfGpuId tf_gpu_id,::tensorflow::Allocator * alloc,char * scratch)120 void Reinitialize(OpKernelContext* context, const gpuStream_t* gpu_stream,
121 TfGpuId tf_gpu_id, ::tensorflow::Allocator* alloc,
122 char* scratch) {
123 if (LogMemory::IsEnabled()) {
124 operation_ = context->op_kernel().name() + "/EigenAllocator";
125 step_id_ = context->step_id();
126 }
127 context_ = context;
128 scratch_ = scratch;
129 semaphore_ =
130 reinterpret_cast<unsigned int*>(scratch + Eigen::kGpuScratchSize);
131 stream_ = gpu_stream;
132 allocator_ = alloc;
133 PlatformGpuId platform_gpu_id;
134 TF_CHECK_OK(GpuIdManager::TfToPlatformGpuId(tf_gpu_id, &platform_gpu_id));
135 device_prop_ = &Eigen::m_deviceProperties[platform_gpu_id.value()];
136 }
137
stream() const138 const gpuStream_t& stream() const override { return *stream_; }
deviceProperties() const139 const gpuDeviceProp_t& deviceProperties() const override {
140 return *device_prop_;
141 }
142
allocate(size_t num_bytes) const143 void* allocate(size_t num_bytes) const override {
144 void* ret = allocator_->AllocateRaw(32 /* alignment */, num_bytes);
145 if (ret == nullptr) {
146 if (context_) {
147 context_->SetStatus(errors::ResourceExhausted(
148 strings::StrCat("Ran out of GPU memory when allocating ", num_bytes,
149 " bytes for ", operation_)));
150 } else {
151 LOG(FATAL)
152 << "EigenAllocator for GPU ran out of memory when allocating "
153 << num_bytes << ". See error logs for more detailed info.";
154 }
155 }
156 if (LogMemory::IsEnabled() && ret != nullptr) {
157 LogMemory::RecordRawAllocation(operation_, step_id_, num_bytes, ret,
158 allocator_);
159 }
160 return ret;
161 }
deallocate(void * buffer) const162 void deallocate(void* buffer) const override {
163 if (LogMemory::IsEnabled() && buffer != nullptr) {
164 LogMemory::RecordRawDeallocation(operation_, step_id_, buffer, allocator_,
165 true);
166 }
167 AsyncFreeData* afData =
168 new AsyncFreeData(allocator_, buffer, operation_, step_id_);
169 #if GOOGLE_CUDA
170 cudaError_t err = cudaStreamAddCallback(*stream_, asyncFree, afData, 0);
171 CHECK_EQ(err, cudaSuccess);
172 #elif TENSORFLOW_USE_ROCM
173 hipError_t err = hipStreamAddCallback(*stream_, asyncFree, afData, 0);
174 CHECK_EQ(err, hipSuccess);
175 #endif
176 }
177
178 // Return a pointer to a per stream scratchpad of 1024 bytes residing
179 // in global memory.
scratchpad() const180 void* scratchpad() const override { return scratch_; }
181
182 // Return a semaphore. The semaphore is initially initialized to 0, and
183 // each kernel using it is responsible for resetting to 0 upon completion
184 // to maintain the invariant that the semaphore is always equal to 0 upon
185 // each kernel start.
semaphore() const186 unsigned int* semaphore() const override { return semaphore_; }
187
188 private:
189 struct AsyncFreeData {
AsyncFreeDatatensorflow::EigenGpuStreamDevice::AsyncFreeData190 AsyncFreeData(::tensorflow::Allocator* a, void* p, const string& o,
191 const int64 s)
192 : allocator_(a), address_(p), operation_(o), step_id_(s) {}
193 ::tensorflow::Allocator* allocator_;
194 void* address_;
195 const string operation_;
196 const int64 step_id_;
197 };
198
199 #if GOOGLE_CUDA
asyncFree(gpuStream_t stream,cudaError_t status,void * userData)200 static void CUDART_CB asyncFree(gpuStream_t stream, cudaError_t status,
201 void* userData) {
202 #elif TENSORFLOW_USE_ROCM
203 static void asyncFree(gpuStream_t stream, hipError_t status, void* userData) {
204 #endif
205 AsyncFreeData* data = static_cast<AsyncFreeData*>(userData);
206 if (LogMemory::IsEnabled()) {
207 LogMemory::RecordRawDeallocation(data->operation_, data->step_id_,
208 data->address_, data->allocator_, false);
209 }
210 data->allocator_->DeallocateRaw(data->address_);
211 delete data;
212 }
213
214 string operation_;
215 int64 step_id_;
216 const gpuStream_t* stream_; // Not owned.
217 const gpuDeviceProp_t* device_prop_; // Not owned.
218 ::tensorflow::Allocator* allocator_; // Not owned.
219 mutable char* scratch_;
220 mutable unsigned int* semaphore_;
221 OpKernelContext* context_;
222
223 TF_DISALLOW_COPY_AND_ASSIGN(EigenGpuStreamDevice);
224 };
225
226 // This factory helps to ensure that different GPU device objects that refer to
227 // the same physical device and stream group id use the same stream group
228 // object (and therefore the same CUDA streams). This is necessary since there
229 // is a single memory allocator per device (see ProcessState::GetGPUAllocator)
230 // and allocators must not be shared across streams.
231 class BaseGPUDevice::StreamGroupFactory {
232 public:
233 // Returns the unique stream group for use with the stream defined by
234 // {tf_gpu_id, stream_group_within_gpu}, creating it if it does not yet
235 // exist.
236 // This function is thread safe.
GetOrCreate(TfGpuId tf_gpu_id,int stream_group_within_gpu,se::StreamExecutor * executor,const GPUOptions & options)237 BaseGPUDevice::StreamGroup* GetOrCreate(TfGpuId tf_gpu_id,
238 int stream_group_within_gpu,
239 se::StreamExecutor* executor,
240 const GPUOptions& options) {
241 mutex_lock guard(lock_);
242 StreamGroup* group =
243 &streams_[key_type(tf_gpu_id.value(), stream_group_within_gpu)];
244 if (!group->compute) {
245 group->compute = new se::Stream(executor);
246 group->compute->Init();
247 VLOG(2) << "Created stream[" << stream_group_within_gpu
248 << "] = " << group->compute;
249
250 group->host_to_device = new se::Stream(executor);
251 group->host_to_device->Init();
252 VLOG(2) << "Created host_to_device_stream[" << stream_group_within_gpu
253 << "] = " << group->host_to_device;
254
255 group->device_to_host = new se::Stream(executor);
256 group->device_to_host->Init();
257 VLOG(2) << "Created device_to_host_stream[" << stream_group_within_gpu
258 << "] = " << group->device_to_host;
259
260 int num_d2d_streams =
261 options.experimental().num_dev_to_dev_copy_streams();
262 if (num_d2d_streams == 0) num_d2d_streams = 1;
263 if (num_d2d_streams < 1 || num_d2d_streams > 4) {
264 LOG(ERROR)
265 << "Illegal GPUOptions.experimental.num_dev_to_dev_copy_streams="
266 << num_d2d_streams << " set to 1 instead.";
267 num_d2d_streams = 1;
268 }
269 for (int i = 0; i < num_d2d_streams; ++i) {
270 se::Stream* stream = new se::Stream(executor);
271 stream->Init();
272 group->device_to_device.push_back(stream);
273 VLOG(2) << "Created device_to_device_stream[" << stream_group_within_gpu
274 << "] = " << group->device_to_device.back();
275 }
276 }
277 return group;
278 }
279
280 // Returns a reference to the StreamGroupFactory singleton. Note that this is
281 // never destroyed, so the objects it owns are never deleted.
Global()282 static StreamGroupFactory& Global() {
283 static StreamGroupFactory* instance = new StreamGroupFactory();
284 return *instance;
285 }
286
287 private:
288 mutex lock_;
289 using key_type = std::tuple<int, int>;
290 std::map<key_type, StreamGroup> streams_;
291
292 // StreamGroupFactory cannot be created directly; Call
293 // StreamGroupFactory::Global() to get the global instance.
294 StreamGroupFactory() = default;
295 TF_DISALLOW_COPY_AND_ASSIGN(StreamGroupFactory);
296 };
297
BaseGPUDevice(const SessionOptions & options,const string & name,Bytes memory_limit,const DeviceLocality & locality,TfGpuId tf_gpu_id,const string & physical_device_desc,Allocator * gpu_allocator,Allocator * cpu_allocator,bool sync_every_op,int32 max_streams)298 BaseGPUDevice::BaseGPUDevice(const SessionOptions& options, const string& name,
299 Bytes memory_limit, const DeviceLocality& locality,
300 TfGpuId tf_gpu_id,
301 const string& physical_device_desc,
302 Allocator* gpu_allocator, Allocator* cpu_allocator,
303 bool sync_every_op, int32 max_streams)
304 : LocalDevice(options, Device::BuildDeviceAttributes(name, DEVICE_GPU,
305 memory_limit, locality,
306 physical_device_desc)),
307 gpu_allocator_(gpu_allocator),
308 cpu_allocator_(cpu_allocator),
309 scoped_allocator_mgr_(new ScopedAllocatorMgr(name)),
310 tf_gpu_id_(tf_gpu_id),
311 sync_every_op_(sync_every_op),
312 max_streams_(max_streams) {
313 GPUProcessState::singleton()->EnableGPUDevice();
314 pending_cap_ = options.config.gpu_options().experimental().pending_cap();
315 timestamped_allocator_ =
316 options.config.gpu_options().experimental().timestamped_allocator();
317 if (timestamped_allocator_ || pending_cap_ > 0) {
318 SharedCounter* timing_counter = nullptr;
319 if (timestamped_allocator_) {
320 // In this case the SharedCounter was already created and set in the
321 // associated Allocator, with ownership by GPUProcessState.
322 // The GPUKernelTracker will use this SharedCounter, instead of
323 // owning its own.
324 timing_counter =
325 GPUProcessState::singleton()->GPUAllocatorCounter(tf_gpu_id);
326 DCHECK(timing_counter);
327 } else {
328 DCHECK_GT(pending_cap_, 0);
329 }
330 kernel_tracker_.reset(new GPUKernelTracker(Env::Default(), timing_counter));
331 }
332 }
333
~BaseGPUDevice()334 BaseGPUDevice::~BaseGPUDevice() {
335 delete gpu_device_info_;
336 for (auto sb : scratch_) gpu_allocator_->DeallocateRaw(sb);
337 for (auto ctx : device_contexts_) ctx->Unref();
338 }
339
340 // This should be idempotent if already initialized.
InitScratchBuffers()341 Status BaseGPUDevice::InitScratchBuffers() {
342 mutex_lock l(scratch_init_mutex_);
343 if (scratch_.size() < max_streams_) {
344 for (int i = 0; i < max_streams_; i++) {
345 DCHECK(streams_[i]);
346 if (scratch_.size() > i && scratch_[i]) continue;
347 size_t scratch_buffer_size =
348 Eigen::kGpuScratchSize + sizeof(unsigned int);
349 void* scratch_buffer = gpu_allocator_->AllocateRaw(
350 Allocator::kAllocatorAlignment, scratch_buffer_size);
351 if (scratch_buffer == nullptr) {
352 return errors::FailedPrecondition(
353 "Failed to allocate scratch buffer for device ",
354 tf_gpu_id_.value());
355 }
356 se::DeviceMemory<char> mem(
357 se::DeviceMemoryBase(scratch_buffer, scratch_buffer_size));
358
359 bool ok = executor_->SynchronousMemZero(
360 &mem, Eigen::kGpuScratchSize + sizeof(unsigned int));
361 if (!ok) {
362 return errors::FailedPrecondition(
363 "Failed to memcopy into scratch buffer for device ",
364 tf_gpu_id_.value());
365 }
366 scratch_.push_back(static_cast<char*>(scratch_buffer));
367 }
368 }
369 return Status::OK();
370 }
371
Init(const SessionOptions & options)372 Status BaseGPUDevice::Init(const SessionOptions& options) {
373 auto executor_status = GpuIdUtil::ExecutorForTfGpuId(tf_gpu_id_);
374 if (!executor_status.status().ok()) {
375 return errors::Internal("Failed to get StreamExecutor for device ",
376 tf_gpu_id_.value());
377 }
378
379 executor_ = executor_status.ValueOrDie();
380 em_.reset(new EventMgr(executor_, options.config.gpu_options()));
381
382 if (max_streams_ < 1) {
383 return errors::InvalidArgument("Invalid value for max_streams.");
384 }
385
386 // Create the specified number of GPU streams
387 for (int i = 0; i < max_streams_; i++) {
388 streams_.push_back(StreamGroupFactory::Global().GetOrCreate(
389 tf_gpu_id_, i, executor_, options.config.gpu_options()));
390 device_contexts_.push_back(new GPUDeviceContext(
391 i, streams_.back()->compute, streams_.back()->host_to_device,
392 streams_.back()->device_to_host, streams_.back()->device_to_device));
393 }
394 gpu_device_info_ = new GpuDeviceInfo;
395 gpu_device_info_->stream = streams_[0]->compute;
396 gpu_device_info_->default_context = device_contexts_[0];
397 gpu_device_info_->event_mgr = em_.get();
398 PlatformGpuId platform_gpu_id;
399 TF_RETURN_IF_ERROR(
400 GpuIdManager::TfToPlatformGpuId(tf_gpu_id_, &platform_gpu_id));
401 gpu_device_info_->gpu_id = platform_gpu_id.value();
402 set_tensorflow_gpu_device_info(gpu_device_info_);
403
404 // Whether and how the GPU device uses its own threadpool.
405 // This option is experimental. Once we confirm the best setting, we
406 // may change the default behavior and completely remove this flag.
407 // Default values might change in future releases.
408 // Possible values:
409 // * global: GPU uses threads shared with CPU in the main compute
410 // thread-pool. This is currently the default.
411 // * gpu_private: GPU uses threads dedicated to this device.
412 // * gpu_shared: All GPUs share a dedicated thread pool.
413 string gpu_thread_mode;
414 TF_RETURN_IF_ERROR(
415 ReadStringFromEnvVar("TF_GPU_THREAD_MODE", "global", &gpu_thread_mode));
416 gpu_thread_mode = str_util::Lowercase(gpu_thread_mode);
417 if (gpu_thread_mode != "global") {
418 int64 gpu_thread_count = -1;
419 // Default to two threads. One for device compute and another for memory
420 // copies.
421 TF_RETURN_IF_ERROR(
422 ReadInt64FromEnvVar("TF_GPU_THREAD_COUNT", 2, &gpu_thread_count));
423 if (gpu_thread_mode == "gpu_private") {
424 // TODO(zhengxq): since these threads only serve a single GPU device,
425 // we should set the device context once for each thread, and avoid
426 // setting them for each kernel.
427 // TODO(zhengxq): pin the thread to the same socket of the target GPU.
428 thread_pool_.reset(new thread::ThreadPool(
429 options.env, strings::StrCat("gpu_private_", tf_gpu_id_.value()),
430 static_cast<int32>(gpu_thread_count)));
431 set_tensorflow_device_thread_pool(thread_pool_.get());
432 } else if (gpu_thread_mode == "gpu_shared") {
433 static thread::ThreadPool* thread_pool = new thread::ThreadPool(
434 options.env, "gpu_shared", static_cast<int32>(gpu_thread_count));
435 set_tensorflow_device_thread_pool(thread_pool);
436 } else {
437 string error_message =
438 strings::StrCat("Invalid gpu_thread_mode: ", gpu_thread_mode);
439 LOG(WARNING) << error_message;
440 return errors::InvalidArgument(error_message);
441 }
442 }
443
444 return Status::OK();
445 }
446
RequiresRecordingAccessedTensors() const447 bool BaseGPUDevice::RequiresRecordingAccessedTensors() const {
448 // When there is no more than one stream, we release the tensor reference
449 // at the end of the kernel launch, instead of at the end of the kernel
450 // execution.
451 return streams_.size() > 1;
452 }
453
FillContextMap(const Graph * graph,DeviceContextMap * device_context_map)454 Status BaseGPUDevice::FillContextMap(const Graph* graph,
455 DeviceContextMap* device_context_map) {
456 VLOG(2) << "FillContextMap";
457
458 const size_t num_streams = streams_.size();
459 // Special case for single stream.
460 if (num_streams == 1) {
461 return Status::OK();
462 }
463 const int64 before = Env::Default()->NowMicros();
464 gpu_stream_util::AssignStreamsOpts opts;
465 opts.max_streams = static_cast<int32>(num_streams);
466 std::unordered_map<int, int> node_to_stream_id;
467 TF_RETURN_IF_ERROR(
468 gpu_stream_util::AssignStreams(graph, opts, &node_to_stream_id));
469 int64 elapsed = Env::Default()->NowMicros() - before;
470 VLOG(3) << "AssignStreams took " << elapsed << "us";
471
472 // Fill in the context map. It is OK for this map to contain
473 // duplicate DeviceContexts so long as we increment the refcount.
474 device_context_map->resize(graph->num_node_ids());
475 for (Node* n : graph->nodes()) {
476 auto mapped_stream = node_to_stream_id[n->id()];
477 CHECK_LE(mapped_stream, num_streams);
478 auto ctx = device_contexts_[mapped_stream];
479 VLOG(3) << "Assigned stream " << node_to_stream_id[n->id()]
480 << " ==> stream[" << ctx->stream_id() << "] for node id " << n->id()
481 << " " << n->type_string() << " " << n->name();
482 ctx->Ref();
483 (*device_context_map)[n->id()] = ctx;
484 }
485
486 return Status::OK();
487 }
488
Compute(OpKernel * op_kernel,OpKernelContext * context)489 void BaseGPUDevice::Compute(OpKernel* op_kernel, OpKernelContext* context) {
490 // NOTE(tucker): We need to discriminate between Eigen GPU
491 // operations and all others. If an operation is Eigen
492 // implemented (or otherwise tries to launch a GPU kernel
493 // directly), we need to establish a stacked-scoped environment
494 // that directs it to execute on the proper device. Otherwise we
495 // expect the Op to use StreamExecutor directly and correctly. The
496 // way we make this discrimination is quite hacky: At the moment
497 // the only non-Eigen GPU Op is the recv-op, which is known to be
498 // asynchronous.
499 if (op_kernel->is_internal() && op_kernel->type_string() == "_Recv") {
500 context->SetStatus(errors::Internal(
501 "Invalid synchronous 'Compute' on GPU for '_Recv' op"));
502 } else {
503 ComputeHelper(op_kernel, context);
504 }
505 }
506
ComputeOpKernelDebugString(const OpKernel & op_kernel,const int & stream_id)507 string BaseGPUDevice::ComputeOpKernelDebugString(const OpKernel& op_kernel,
508 const int& stream_id) {
509 return strings::StrCat(op_kernel.name(), " op ", op_kernel.type_string(),
510 " on GPU ", tf_gpu_id_.value(), " stream[", stream_id,
511 "]");
512 }
513
ComputeHelper(OpKernel * op_kernel,OpKernelContext * context)514 void BaseGPUDevice::ComputeHelper(OpKernel* op_kernel,
515 OpKernelContext* context) {
516 GPUDeviceContext* gpu_device_context = device_contexts_[0];
517 if (context->op_device_context() != nullptr) {
518 gpu_device_context =
519 static_cast<GPUDeviceContext*>(context->op_device_context());
520 }
521 se::Stream* stream = gpu_device_context->stream();
522 const auto stream_id = gpu_device_context->stream_id();
523
524 const bool vlog_1 = VLOG_IS_ON(1);
525 const bool vlog_2 = vlog_1 && VLOG_IS_ON(2);
526
527 if (vlog_1) {
528 VLOG(1) << "GpuDevice::ComputeHelper "
529 << ComputeOpKernelDebugString(*op_kernel, stream_id);
530 }
531
532 const auto num_streams = streams_.size();
533 if (num_streams > 1) {
534 // If this op's device context is different from the other contexts,
535 // we must wait on the stream.
536 for (int i = 0; i < context->num_inputs(); ++i) {
537 const GPUDeviceContext* idc =
538 static_cast<GPUDeviceContext*>(context->input_device_context(i));
539 OP_REQUIRES(context, idc != nullptr,
540 errors::Internal("Input device context ", i,
541 " was not set properly."));
542 if (vlog_2) {
543 const void* base;
544 size_t len;
545 if (context->has_input(i)) {
546 if (IsRefType(context->input_dtype(i))) {
547 Tensor tensor = context->mutable_input(i, false);
548 base = DMAHelper::base(&tensor);
549 len = tensor.TotalBytes();
550 } else {
551 const Tensor& tensor = context->input(i);
552 base = DMAHelper::base(&tensor);
553 len = tensor.TotalBytes();
554 }
555 LOG(INFO) << "Input " << i << " " << base << " " << len;
556 LOG(INFO) << " stream[" << stream_id << "].ThenWaitFor(stream["
557 << idc->stream_id() << "])"
558 << ((idc->stream() == stream) ? " not needed" : "");
559 }
560 }
561 if (idc->stream() != stream) stream->ThenWaitFor(idc->stream());
562 }
563 }
564 if (pending_cap_ > 0) {
565 DCHECK(kernel_tracker_);
566 kernel_tracker_->PauseWhilePendingExceeds(pending_cap_);
567 }
568 ScopedActivateExecutorContext scoped_activation{stream->parent()};
569 op_kernel->Compute(context);
570 if (context->status().ok()) {
571 if (sync_every_op_) {
572 // Note: GPUUtil::Sync() only syncs the default stream.
573 // We need to either sync the stream used by this op, or
574 // all streams. Given that this flag is typically used for
575 // debugging it makes more sense to sync all GPU activity.
576 context->SetStatus(GPUUtil::SyncAll(this));
577 if (vlog_1) {
578 VLOG(1) << "GpuDevice::ComputeHelper finished "
579 << ComputeOpKernelDebugString(*op_kernel, stream_id);
580 }
581 } else if (vlog_1) {
582 VLOG(1) << "GpuDevice::ComputeHelper scheduled "
583 << ComputeOpKernelDebugString(*op_kernel, stream_id);
584 }
585 if (kernel_tracker_) {
586 GPUKernelTracker* tracker = kernel_tracker_.get();
587 DCHECK(tracker);
588 uint64 queued_count = tracker->RecordQueued();
589 em_->ThenExecute(stream, [op_kernel, tracker, queued_count]() {
590 tracker->RecordTerminated(queued_count);
591 });
592 }
593 } else {
594 if (vlog_1) {
595 VLOG(1) << "GpuDevice::ComputeHelper failed to schedule "
596 << ComputeOpKernelDebugString(*op_kernel, stream_id);
597 }
598 }
599 }
600
ConsumeListOfAccessedTensors(DeviceContext * device_context,const TensorReferenceVector & tensor_refs)601 void BaseGPUDevice::ConsumeListOfAccessedTensors(
602 DeviceContext* device_context, const TensorReferenceVector& tensor_refs) {
603 GPUDeviceContext* gpu_device_context = device_contexts_[0];
604 if (device_context != nullptr) {
605 gpu_device_context = static_cast<GPUDeviceContext*>(device_context);
606 }
607 se::Stream* stream = gpu_device_context->stream();
608 em_->ThenDeleteTensors(stream, tensor_refs);
609 }
610
611 // Based on the semantics of Device::Sync this call should wait for
612 // all streams not just the current one.
Sync()613 Status BaseGPUDevice::Sync() { return GPUUtil::SyncAll(this); }
614
ComputeAsync(AsyncOpKernel * op_kernel,OpKernelContext * context,AsyncOpKernel::DoneCallback done)615 void BaseGPUDevice::ComputeAsync(AsyncOpKernel* op_kernel,
616 OpKernelContext* context,
617 AsyncOpKernel::DoneCallback done) {
618 GPUDeviceContext* gpu_device_context = device_contexts_[0];
619 if (context->op_device_context() != nullptr) {
620 gpu_device_context =
621 static_cast<GPUDeviceContext*>(context->op_device_context());
622 }
623 se::Stream* stream = gpu_device_context->stream();
624 const auto stream_id = gpu_device_context->stream_id();
625
626 VLOG(1) << "GpuDevice::ComputeAsync " << op_kernel->name() << " op "
627 << op_kernel->type_string() << " on GPU" << tf_gpu_id_ << " stream["
628 << stream_id << "]";
629
630 // When Xprof profiling is off (which is the default), constructing the
631 // activity is simple enough that its overhead is negligible.
632 tracing::ScopedActivity activity(op_kernel->name(), op_kernel->type_string(),
633 op_kernel->IsExpensive());
634 ScopedActivateExecutorContext scoped_activation{stream->parent()};
635 op_kernel->ComputeAsync(context, done);
636 }
637
MaybeCopyTensorToGPU(const AllocatorAttributes & alloc_attrs,const Tensor & from,Tensor * to,StatusCallback done)638 Status BaseGPUDevice::MaybeCopyTensorToGPU(
639 const AllocatorAttributes& alloc_attrs, const Tensor& from, Tensor* to,
640 StatusCallback done) {
641 if (alloc_attrs.on_host()) {
642 *to = from;
643 done(Status::OK());
644 return Status::OK();
645 } else {
646 if (!DMAHelper::CanUseDMA(&from)) {
647 Status err = errors::Internal("GPU copy from non-DMA ",
648 DataTypeString(from.dtype()), " tensor");
649 done(err);
650 return err;
651 }
652 auto* copy =
653 new Tensor(GetAllocator(alloc_attrs), from.dtype(), from.shape());
654
655 // If the tensor is not initialized, we likely ran out of memory.
656 if (!copy->IsInitialized()) {
657 delete copy;
658 Status err = errors::ResourceExhausted(
659 "OOM when allocating tensor of shape ", from.shape().DebugString(),
660 " and type ", DataTypeString(from.dtype()));
661 done(err);
662 return err;
663 }
664
665 StatusCallback wrapped_done = std::bind(
666 [to, copy](StatusCallback done_,
667 // Begin unbound arguments.
668 const Status& s) {
669 if (s.ok()) {
670 *to = std::move(*copy);
671 }
672 delete copy;
673 done_(s);
674 },
675 std::move(done), std::placeholders::_1);
676
677 tracing::ScopedAnnotation annotation("MakeTensorFromProto");
678 device_contexts_[0]->CopyCPUTensorToDevice(&from, this, copy,
679 std::move(wrapped_done));
680 return Status::OK();
681 }
682 }
683
MakeTensorFromProto(const TensorProto & tensor_proto,const AllocatorAttributes alloc_attrs,Tensor * tensor)684 Status BaseGPUDevice::MakeTensorFromProto(const TensorProto& tensor_proto,
685 const AllocatorAttributes alloc_attrs,
686 Tensor* tensor) {
687 AllocatorAttributes attr;
688 attr.set_on_host(true);
689 attr.set_gpu_compatible(true);
690 Allocator* host_alloc = GetAllocator(attr);
691 Tensor parsed(tensor_proto.dtype());
692 if (!parsed.FromProto(host_alloc, tensor_proto)) {
693 return errors::InvalidArgument("Cannot parse tensor from proto: ",
694 tensor_proto.DebugString());
695 }
696
697 if (parsed.dtype() == DT_VARIANT) {
698 const Variant* from = parsed.flat<Variant>().data();
699 int numa_node = attributes().locality().numa_node();
700 Tensor copy(cpu_allocator(numa_node), DT_VARIANT, parsed.shape());
701 Variant* copy_variant = copy.flat<Variant>().data();
702
703 std::list<Notification> notifications;
704 Status copy_status;
705 auto copier = [this, &alloc_attrs, ¬ifications, ©_status](
706 const Tensor& from, Tensor* to) {
707 // Copier isn't run in a multithreaded environment, so we don't
708 // have to worry about the notifications list being modified in parallel.
709 notifications.emplace_back();
710 Notification& n = *notifications.rbegin();
711 return MaybeCopyTensorToGPU(alloc_attrs, from, to,
712 [&n, ©_status](const Status& s) {
713 if (copy_status.ok()) {
714 copy_status.Update(s);
715 }
716 n.Notify();
717 });
718 };
719 Status s;
720 for (int64 ix = 0; ix < parsed.NumElements(); ++ix) {
721 s = VariantDeviceCopy(VariantDeviceCopyDirection::HOST_TO_DEVICE,
722 from[ix], ©_variant[ix], copier);
723 if (!s.ok()) {
724 break;
725 }
726 }
727 for (auto& n : notifications) {
728 n.WaitForNotification();
729 }
730 if (!s.ok()) {
731 return s;
732 }
733 *tensor = std::move(copy);
734 return copy_status;
735 } else {
736 Notification n;
737 Status status;
738 TF_RETURN_IF_ERROR(MaybeCopyTensorToGPU(alloc_attrs, parsed, tensor,
739 [&n, &status](const Status& s) {
740 status = s;
741 n.Notify();
742 }));
743 n.WaitForNotification();
744 return status;
745 }
746 }
747
748 namespace {
749 class ConcretePerOpGpuDevice : public PerOpGpuDevice {
750 public:
ConcretePerOpGpuDevice()751 ConcretePerOpGpuDevice() : device_(&stream_device_) {}
752
Reinitialize(OpKernelContext * context,const gpuStream_t * gpu_stream,TfGpuId tf_gpu_id,Allocator * base_allocator,char * scratch)753 void Reinitialize(OpKernelContext* context, const gpuStream_t* gpu_stream,
754 TfGpuId tf_gpu_id, Allocator* base_allocator,
755 char* scratch) {
756 stream_device_.Reinitialize(context, gpu_stream, tf_gpu_id, base_allocator,
757 scratch);
758 }
759
device() const760 const Eigen::GpuDevice& device() const override { return device_; }
761
762 private:
763 EigenGpuStreamDevice stream_device_;
764 Eigen::GpuDevice device_;
765 };
766
767 // Parse 'visible_device_list' into a list of platform GPU ids.
ParseVisibleDeviceList(const string & visible_device_list,std::vector<PlatformGpuId> * visible_gpu_order)768 Status ParseVisibleDeviceList(const string& visible_device_list,
769 std::vector<PlatformGpuId>* visible_gpu_order) {
770 visible_gpu_order->clear();
771 se::Platform* gpu_manager = GPUMachineManager();
772
773 // If the user wants to remap the visible to virtual GPU mapping,
774 // check for that here.
775 if (visible_device_list.empty()) {
776 visible_gpu_order->resize(gpu_manager->VisibleDeviceCount());
777 // By default, visible to virtual mapping is unchanged.
778 int deviceNo = 0;
779 std::generate(visible_gpu_order->begin(), visible_gpu_order->end(),
780 [&deviceNo] { return deviceNo++; });
781 } else {
782 const std::vector<string> order_str =
783 str_util::Split(visible_device_list, ',');
784 for (const string& platform_gpu_id_str : order_str) {
785 int32 platform_gpu_id;
786 if (!strings::safe_strto32(platform_gpu_id_str, &platform_gpu_id)) {
787 return errors::InvalidArgument(
788 "Could not parse entry in 'visible_device_list': '",
789 platform_gpu_id_str,
790 "'. visible_device_list = ", visible_device_list);
791 }
792 if (platform_gpu_id < 0 ||
793 platform_gpu_id >= gpu_manager->VisibleDeviceCount()) {
794 return errors::InvalidArgument(
795 "'visible_device_list' listed an invalid GPU id '", platform_gpu_id,
796 "' but visible device count is ",
797 gpu_manager->VisibleDeviceCount());
798 }
799 visible_gpu_order->push_back(PlatformGpuId(platform_gpu_id));
800 }
801 }
802
803 // Validate no repeats.
804 std::set<PlatformGpuId> visible_device_set(visible_gpu_order->begin(),
805 visible_gpu_order->end());
806 if (visible_device_set.size() != visible_gpu_order->size()) {
807 return errors::InvalidArgument(
808 "visible_device_list contained a duplicate entry: ",
809 visible_device_list);
810 }
811 return Status::OK();
812 }
813
VerifyVirtualDeviceSettings(const size_t num_gpus_to_use,const GPUOptions & gpu_options,const std::vector<PlatformGpuId> & visible_gpu_order,const std::vector<PlatformGpuId> & valid_platform_gpu_ids)814 Status VerifyVirtualDeviceSettings(
815 const size_t num_gpus_to_use, const GPUOptions& gpu_options,
816 const std::vector<PlatformGpuId>& visible_gpu_order,
817 const std::vector<PlatformGpuId>& valid_platform_gpu_ids) {
818 const auto& virtual_devices = gpu_options.experimental().virtual_devices();
819 CHECK(!virtual_devices.empty());
820 if (gpu_options.per_process_gpu_memory_fraction() > 0) {
821 return errors::InvalidArgument(
822 "It's invalid to set per_process_gpu_memory_fraction when "
823 "virtual_devices is set.");
824 }
825 if (num_gpus_to_use < virtual_devices.size()) {
826 return errors::Unknown(
827 "Not enough GPUs to create virtual devices."
828 " num_gpus_to_use: ",
829 num_gpus_to_use, " #virtual_devices: ", virtual_devices.size());
830 }
831 if (!gpu_options.visible_device_list().empty() &&
832 visible_gpu_order.size() != virtual_devices.size()) {
833 return errors::InvalidArgument(
834 "The number of GPUs in visible_device_list doesn't match the number "
835 "of elements in the virtual_devices list.",
836 " #GPUs in visible_device_list: ", visible_gpu_order.size(),
837 " virtual_devices.size(): ", virtual_devices.size());
838 }
839 if (valid_platform_gpu_ids.size() != virtual_devices.size()) {
840 return errors::Unknown(
841 "The number of valid GPUs doesn't match the number of elements in "
842 "the virtual_devices list.",
843 " #valid GPUs: ", valid_platform_gpu_ids.size(),
844 " virtual_devices.size(): ", virtual_devices.size());
845 }
846 return Status::OK();
847 }
848
MinSystemMemory(int64 available_memory)849 int64 MinSystemMemory(int64 available_memory) {
850 // We use the following heuristic for now:
851 //
852 // If the available_memory is < 2GiB, we allocate 225MiB to system memory.
853 // Otherwise, allocate max(300MiB, 0.05 * available_memory) to system memory.
854 //
855 // In the future we could be more sophisticated by using a table of devices.
856 int64 min_system_memory;
857 if (available_memory < (1LL << 31)) {
858 // 225MiB
859 min_system_memory = 225 * 1024 * 1024;
860 } else {
861 // max(300 MiB, 0.05 * available_memory)
862 min_system_memory =
863 std::max(int64{314572800}, static_cast<int64>(available_memory * 0.05));
864 }
865 #if defined(__GNUC__) && defined(__OPTIMIZE__)
866 // Do nothing
867 #elif !defined(__GNUC__) && defined(NDEBUG)
868 // Do nothing
869 #else
870 // Double the amount of available GPU memory in non-opt builds (debug
871 // builds in windows); because in non-opt builds more system memory
872 // is necessary.
873 min_system_memory *= 2;
874 #endif
875
876 #if defined(ANDROID_TEGRA)
877 // 1GB system mem for NVIDIA Tegra devices since they use the same mem for RAM
878 // and Video RAM
879 min_system_memory = 1 << 30;
880 #endif
881 return min_system_memory;
882 }
883
884 // Get the memory limit for the virtual device being created on GPU with
885 // 'platform_gpu_id', when that virtual device is the only virtual device being
886 // created on that GPU.
SingleVirtualDeviceMemoryLimit(const GPUOptions & gpu_options,PlatformGpuId platform_gpu_id,int64 * memory_limit)887 Status SingleVirtualDeviceMemoryLimit(const GPUOptions& gpu_options,
888 PlatformGpuId platform_gpu_id,
889 int64* memory_limit) {
890 int64 total_memory = 0;
891 int64 available_memory = 0;
892 se::StreamExecutor* se =
893 GpuIdUtil::ExecutorForPlatformGpuId(platform_gpu_id).ValueOrDie();
894 if (!se->DeviceMemoryUsage(&available_memory, &total_memory)) {
895 return errors::Unknown("Failed to query available memory for GPU ",
896 platform_gpu_id.value());
897 }
898
899 int64 allocated_memory = 0;
900 const double per_process_gpu_memory_fraction =
901 gpu_options.per_process_gpu_memory_fraction();
902 if (per_process_gpu_memory_fraction > 1.0 ||
903 gpu_options.experimental().use_unified_memory()) {
904 int cc_major = 0, cc_minor = 0;
905 if (!se->GetDeviceDescription().cuda_compute_capability(&cc_major,
906 &cc_minor)) {
907 return errors::Internal("Failed to get compute capability for device.");
908 }
909 if (cc_major < 6) {
910 return errors::Internal(
911 "Unified memory on GPUs with compute capability lower than 6.0 "
912 "(pre-Pascal class GPUs) does not support oversubscription.");
913 }
914 }
915
916 if (per_process_gpu_memory_fraction == 0) {
917 allocated_memory = available_memory;
918 const int64 min_system_memory = MinSystemMemory(available_memory);
919 if (min_system_memory < allocated_memory) {
920 allocated_memory -= min_system_memory;
921 }
922 } else {
923 allocated_memory = total_memory * per_process_gpu_memory_fraction;
924 }
925 *memory_limit = allocated_memory;
926 return Status::OK();
927 }
928 } // namespace
929
ReinitializeDevice(OpKernelContext * context,PerOpGpuDevice * device,int stream_id,Allocator * allocator)930 void BaseGPUDevice::ReinitializeDevice(OpKernelContext* context,
931 PerOpGpuDevice* device, int stream_id,
932 Allocator* allocator) {
933 ConcretePerOpGpuDevice* concrete_device =
934 static_cast<ConcretePerOpGpuDevice*>(device);
935 DCHECK(concrete_device);
936 const gpuStream_t* gpu_stream = reinterpret_cast<const gpuStream_t*>(
937 streams_[stream_id]->compute->implementation()->GpuStreamMemberHack());
938 concrete_device->Reinitialize(context, gpu_stream, tf_gpu_id_, allocator,
939 scratch_[stream_id]);
940 }
941
MakeGpuDevice()942 PerOpGpuDevice* BaseGPUDevice::MakeGpuDevice() {
943 return new ConcretePerOpGpuDevice();
944 }
945
ReinitializeGpuDevice(OpKernelContext * context,PerOpGpuDevice * device,DeviceContext * dc,Allocator * allocator)946 Status BaseGPUDevice::ReinitializeGpuDevice(OpKernelContext* context,
947 PerOpGpuDevice* device,
948 DeviceContext* dc,
949 Allocator* allocator) {
950 TF_RETURN_IF_ERROR(InitScratchBuffers());
951 if (dc) {
952 const GPUDeviceContext* gpu_dc = static_cast<GPUDeviceContext*>(dc);
953 const int stream_id = gpu_dc->stream_id();
954 VLOG(1) << " eigen_gpu_device(" << dc << ") => stream[" << stream_id
955 << "]";
956 CHECK_LT(stream_id, streams_.size());
957 ReinitializeDevice(context, device, stream_id, allocator);
958 } else {
959 ReinitializeDevice(context, device, 0, allocator);
960 }
961 return Status::OK();
962 }
963
GetScopedAllocator(AllocatorAttributes attr,int64 step_id)964 Allocator* BaseGPUDevice::GetScopedAllocator(AllocatorAttributes attr,
965 int64 step_id) {
966 if (attr.scope_id > 0) {
967 return scoped_allocator_mgr_->GetContainer(step_id)->GetInstance(
968 attr.scope_id);
969 }
970 LOG(FATAL) << "Unexpected call to BaseGPUDevice::GetScopedAllocator "
971 << "attr.scope_id = " << attr.scope_id;
972 return gpu_allocator_;
973 }
974
975 const int BaseGPUDeviceFactory::InterconnectMap::kSameDeviceStrength = 1000;
976 const int BaseGPUDeviceFactory::InterconnectMap::kStreamExecutorStrength = 1;
977
CreateDevices(const SessionOptions & options,const string & name_prefix,std::vector<std::unique_ptr<Device>> * devices)978 Status BaseGPUDeviceFactory::CreateDevices(
979 const SessionOptions& options, const string& name_prefix,
980 std::vector<std::unique_ptr<Device>>* devices) {
981 TF_RETURN_IF_ERROR(ValidateGPUMachineManager());
982 se::Platform* gpu_manager = GPUMachineManager();
983 if (gpu_manager == nullptr) {
984 return Status::OK();
985 }
986 // If there are no GPUs visible, do nothing.
987 if (gpu_manager->VisibleDeviceCount() <= 0) {
988 return Status::OK();
989 }
990
991 size_t num_gpus_to_use = INT_MAX;
992 auto iter = options.config.device_count().find("GPU");
993 if (iter != options.config.device_count().end()) {
994 num_gpus_to_use = iter->second;
995 }
996 const auto& gpu_options = options.config.gpu_options();
997 std::vector<PlatformGpuId> visible_gpu_order;
998 std::vector<PlatformGpuId> valid_platform_gpu_ids;
999 // If we aren't going to use any GPUs, don't initialize them.
1000 // We don't want to call ParseVisibleDeviceList if num_gpus_to_use is 0,
1001 // because it treats an empty gpu_options.visible_device_list as 'all GPUs are
1002 // visible'.
1003 if (num_gpus_to_use > 0) {
1004 TF_RETURN_IF_ERROR(ParseVisibleDeviceList(gpu_options.visible_device_list(),
1005 &visible_gpu_order));
1006 TF_RETURN_IF_ERROR(
1007 GetValidDeviceIds(visible_gpu_order, &valid_platform_gpu_ids));
1008 }
1009 if (num_gpus_to_use > valid_platform_gpu_ids.size()) {
1010 num_gpus_to_use = valid_platform_gpu_ids.size();
1011 }
1012 if (!valid_platform_gpu_ids.empty()) {
1013 // Save the original device.
1014 int original_device = 0;
1015 #if GOOGLE_CUDA
1016 cudaError_t err = cudaGetDevice(&original_device);
1017 if (err != cudaSuccess) {
1018 return errors::Internal("cudaGetDevice() failed. Status: ",
1019 cudaGetErrorString(err));
1020 }
1021 #elif TENSORFLOW_USE_ROCM
1022 hipError_t err = hipGetDevice(&original_device);
1023 if (err != hipSuccess) {
1024 return errors::Internal("hipGetDevice() failed. Status: ",
1025 hipGetErrorString(err));
1026 }
1027 #endif
1028
1029 // Force to implicitly initialize CUDA runtime on each valid GPU before
1030 // CreateGPUDevice().
1031 for (PlatformGpuId platform_gpu_id : valid_platform_gpu_ids) {
1032 #if GOOGLE_CUDA
1033 err = cudaSetDevice(platform_gpu_id.value());
1034 if (err != cudaSuccess) {
1035 return errors::Internal(
1036 "cudaSetDevice() on GPU:", platform_gpu_id.value(),
1037 " failed. Status: ", cudaGetErrorString(err));
1038 }
1039 err = cudaFree(nullptr);
1040 if (err != cudaSuccess) {
1041 return errors::Internal("CUDA runtime implicit initialization on GPU:",
1042 platform_gpu_id.value(),
1043 " failed. Status: ", cudaGetErrorString(err));
1044 }
1045 #elif TENSORFLOW_USE_ROCM
1046 err = hipSetDevice(platform_gpu_id.value());
1047 if (err != hipSuccess) {
1048 return errors::Internal(
1049 "hipSetDevice() on GPU:", platform_gpu_id.value(),
1050 " failed. Status: ", hipGetErrorString(err));
1051 }
1052 err = hipFree(nullptr);
1053 if (err != hipSuccess) {
1054 return errors::Internal("ROCm runtime implicit initialization on GPU:",
1055 platform_gpu_id.value(),
1056 " failed. Status: ", hipGetErrorString(err));
1057 }
1058 #endif
1059 }
1060 // Reset to the original device.
1061 #if GOOGLE_CUDA
1062 err = cudaSetDevice(original_device);
1063 if (err != cudaSuccess) {
1064 return errors::Internal("cudaSetDevice() on GPU:", original_device,
1065 " failed. Status: ", cudaGetErrorString(err));
1066 }
1067 #elif TENSORFLOW_USE_ROCM
1068 err = hipSetDevice(original_device);
1069 if (err != hipSuccess) {
1070 return errors::Internal("hipSetDevice() on GPU:", original_device,
1071 " failed. Status: ", hipGetErrorString(err));
1072 }
1073 #endif
1074 }
1075
1076 std::vector<InterconnectMap> interconnect_maps;
1077 TF_RETURN_IF_ERROR(
1078 GetInterconnectMaps(visible_gpu_order, gpu_manager, &interconnect_maps));
1079
1080 // Print each interconnect map to the log.
1081 for (const InterconnectMap& im : interconnect_maps) {
1082 LOG(INFO) << "Device interconnect " << im.name << " with strength "
1083 << im.strength << " edge matrix:";
1084 string line_buf = " ";
1085 for (int i = 0; i < visible_gpu_order.size(); ++i) {
1086 strings::StrAppend(&line_buf, visible_gpu_order[i].value(), " ");
1087 }
1088 LOG(INFO) << line_buf;
1089 for (int i = 0; i < visible_gpu_order.size(); ++i) {
1090 line_buf = strings::StrCat(visible_gpu_order[i].value(), ": ");
1091 PlatformGpuId gpu_id_i = visible_gpu_order[i];
1092 for (int j = 0; j < visible_gpu_order.size(); ++j) {
1093 PlatformGpuId gpu_id_j = visible_gpu_order[j];
1094 if (im.directed_links.find({gpu_id_i, gpu_id_j}) !=
1095 im.directed_links.end()) {
1096 line_buf.append("Y ");
1097 } else {
1098 line_buf.append("N ");
1099 }
1100 }
1101 LOG(INFO) << line_buf;
1102 }
1103 }
1104
1105 const auto& virtual_devices = gpu_options.experimental().virtual_devices();
1106 if (!virtual_devices.empty()) {
1107 TF_RETURN_IF_ERROR(VerifyVirtualDeviceSettings(num_gpus_to_use, gpu_options,
1108 visible_gpu_order,
1109 valid_platform_gpu_ids));
1110 // We've verified that num_gpus_to_use >= virtual_devices.size().
1111 num_gpus_to_use = virtual_devices.size();
1112 CHECK(gpu_options.visible_device_list().empty() ||
1113 valid_platform_gpu_ids == visible_gpu_order);
1114 }
1115 int next_tf_gpu_id = 0;
1116 std::vector<int64> memory_limit_bytes;
1117 for (int i = 0; i < num_gpus_to_use; ++i) {
1118 const PlatformGpuId platform_gpu_id = valid_platform_gpu_ids[i];
1119 if (virtual_devices.empty() ||
1120 virtual_devices.Get(i).memory_limit_mb_size() == 0) {
1121 int64 single_virtual_device_memory_limit = 0;
1122 TF_RETURN_IF_ERROR(SingleVirtualDeviceMemoryLimit(
1123 gpu_options, platform_gpu_id, &single_virtual_device_memory_limit));
1124 memory_limit_bytes.push_back(single_virtual_device_memory_limit);
1125 } else {
1126 const auto& memory_limit_mb = virtual_devices.Get(i).memory_limit_mb();
1127 std::transform(memory_limit_mb.begin(), memory_limit_mb.end(),
1128 std::back_inserter(memory_limit_bytes), [](float mb) {
1129 return static_cast<int64>(mb) * (1ll << 20);
1130 });
1131 }
1132 while (next_tf_gpu_id < memory_limit_bytes.size()) {
1133 TfGpuId tf_gpu_id(next_tf_gpu_id);
1134 ++next_tf_gpu_id;
1135 TF_RETURN_IF_ERROR(
1136 GpuIdManager::InsertTfPlatformGpuIdPair(tf_gpu_id, platform_gpu_id));
1137 }
1138 }
1139 const int num_tf_gpus = next_tf_gpu_id;
1140
1141 LocalityMap device_localities;
1142 TF_RETURN_IF_ERROR(
1143 GetDeviceLocalities(num_tf_gpus, interconnect_maps, &device_localities));
1144
1145 // Build the GPUDevices
1146 CHECK_EQ(next_tf_gpu_id, memory_limit_bytes.size());
1147 for (int di = 0; di < num_tf_gpus; ++di) {
1148 TfGpuId tf_gpu_id(di);
1149 int64 bytes = memory_limit_bytes[di];
1150 auto it = device_localities.find(tf_gpu_id);
1151 if (it == device_localities.end()) {
1152 return errors::Internal("Failed to find DeviceLocality for GPU device ",
1153 tf_gpu_id.value());
1154 }
1155 TF_RETURN_IF_ERROR(CreateGPUDevice(options, name_prefix, tf_gpu_id, bytes,
1156 it->second, devices));
1157 }
1158 return Status::OK();
1159 }
1160
GetShortDeviceDescription(PlatformGpuId platform_gpu_id,const se::DeviceDescription & desc)1161 static string GetShortDeviceDescription(PlatformGpuId platform_gpu_id,
1162 const se::DeviceDescription& desc) {
1163 #if GOOGLE_CUDA
1164 int cc_major;
1165 int cc_minor;
1166 if (!desc.cuda_compute_capability(&cc_major, &cc_minor)) {
1167 cc_major = 0;
1168 cc_minor = 0;
1169 }
1170 // LINT.IfChange
1171 return strings::StrCat("device: ", platform_gpu_id.value(), ", name: ",
1172 desc.name(), ", pci bus id: ", desc.pci_bus_id(),
1173 ", compute capability: ", cc_major, ".", cc_minor);
1174 // LINT.ThenChange(//tensorflow/python/platform/test.py)
1175 #elif TENSORFLOW_USE_ROCM
1176 return strings::StrCat("device: ", platform_gpu_id.value(),
1177 ", name: ", desc.name(),
1178 ", pci bus id: ", desc.pci_bus_id());
1179 #endif
1180 }
1181
CreateGPUDevice(const SessionOptions & options,const string & name_prefix,TfGpuId tf_gpu_id,int64 memory_limit,const DeviceLocality & dev_locality,std::vector<std::unique_ptr<Device>> * devices)1182 Status BaseGPUDeviceFactory::CreateGPUDevice(
1183 const SessionOptions& options, const string& name_prefix, TfGpuId tf_gpu_id,
1184 int64 memory_limit, const DeviceLocality& dev_locality,
1185 std::vector<std::unique_ptr<Device>>* devices) {
1186 CHECK_GE(tf_gpu_id.value(), 0);
1187 const string device_name =
1188 strings::StrCat(name_prefix, "/device:GPU:", tf_gpu_id.value());
1189 GpuIdUtil::CheckValidTfGpuId(tf_gpu_id);
1190 PlatformGpuId platform_gpu_id;
1191 TF_RETURN_IF_ERROR(
1192 GpuIdManager::TfToPlatformGpuId(tf_gpu_id, &platform_gpu_id));
1193 int numa_node = dev_locality.numa_node();
1194
1195 se::StreamExecutor* se =
1196 GpuIdUtil::ExecutorForPlatformGpuId(platform_gpu_id).ValueOrDie();
1197 const se::DeviceDescription& desc = se->GetDeviceDescription();
1198 GPUProcessState* process_state = GPUProcessState::singleton();
1199 Allocator* gpu_allocator = process_state->GetGPUAllocator(
1200 options.config.gpu_options(), tf_gpu_id, memory_limit);
1201 if (gpu_allocator == nullptr) {
1202 return errors::Internal("Failed to get memory allocator for TF GPU ",
1203 tf_gpu_id.value(), " with ", memory_limit,
1204 " bytes of memory.");
1205 }
1206 absl::optional<AllocatorStats> stats = gpu_allocator->GetStats();
1207 if (!stats) {
1208 return errors::Internal("No allocator statistics");
1209 }
1210 // 'memory_limit' is the required memory size, but if the allocator with given
1211 // tf_gpu_id was created before, we'll use it instead of creating a new one
1212 // (as TF gpu device is a shared resource), in which case the actual memory
1213 // limit represented by 'stats.bytes_limit' used by that allocator may be
1214 // different (which should be an error).
1215 //
1216 // TODO(laigd): report error if memory_limit doesn't match stats->bytes_limit.
1217 int64 bytes_limit = stats->bytes_limit ? *stats->bytes_limit : 0;
1218 std::unique_ptr<BaseGPUDevice> gpu_device = CreateGPUDevice(
1219 options, device_name, static_cast<Bytes>(bytes_limit), dev_locality,
1220 tf_gpu_id, GetShortDeviceDescription(platform_gpu_id, desc),
1221 gpu_allocator, ProcessState::singleton()->GetCPUAllocator(numa_node));
1222 LOG(INFO) << "Created TensorFlow device (" << device_name << " with "
1223 << (bytes_limit >> 20) << " MB memory) -> physical GPU ("
1224 << GetShortDeviceDescription(platform_gpu_id, desc) << ")";
1225 TF_RETURN_IF_ERROR(gpu_device->Init(options));
1226 devices->push_back(std::move(gpu_device));
1227
1228 return Status::OK();
1229 }
1230
1231 namespace {
1232 std::unique_ptr<std::map<std::pair<PlatformGpuId, PlatformGpuId>, bool>>
GetPeerAccessMap(se::Platform * platform,const std::vector<PlatformGpuId> & visible_gpu_order)1233 GetPeerAccessMap(se::Platform* platform,
1234 const std::vector<PlatformGpuId>& visible_gpu_order) {
1235 std::unique_ptr<std::map<std::pair<PlatformGpuId, PlatformGpuId>, bool>> map(
1236 new std::map<std::pair<PlatformGpuId, PlatformGpuId>, bool>);
1237 for (PlatformGpuId platform_gpu_i : visible_gpu_order) {
1238 for (PlatformGpuId platform_gpu_j : visible_gpu_order) {
1239 se::StreamExecutor* from =
1240 GpuIdUtil::ExecutorForPlatformGpuId(platform, platform_gpu_i)
1241 .ValueOrDie();
1242 se::StreamExecutor* to =
1243 GpuIdUtil::ExecutorForPlatformGpuId(platform, platform_gpu_j)
1244 .ValueOrDie();
1245 (*map)[{platform_gpu_i, platform_gpu_j}] =
1246 from->CanEnablePeerAccessTo(to);
1247 }
1248 }
1249
1250 return map;
1251 }
1252
1253 } // namespace
1254
GetInterconnectMaps(const std::vector<PlatformGpuId> & visible_gpu_order,se::Platform * gpu_manager,std::vector<InterconnectMap> * maps)1255 Status BaseGPUDeviceFactory::GetInterconnectMaps(
1256 const std::vector<PlatformGpuId>& visible_gpu_order,
1257 se::Platform* gpu_manager, std::vector<InterconnectMap>* maps) {
1258 // The default interconnect map is obtained from the StreamExecutor.
1259 auto access_map = GetPeerAccessMap(gpu_manager, visible_gpu_order);
1260 maps->resize(1);
1261 InterconnectMap& imap = maps->at(0);
1262 imap.name = "StreamExecutor";
1263 imap.strength = InterconnectMap::kStreamExecutorStrength;
1264 for (PlatformGpuId gpu_id_i : visible_gpu_order) {
1265 for (PlatformGpuId gpu_id_j : visible_gpu_order) {
1266 if (gpu_id_i == gpu_id_j) continue;
1267 if ((*access_map)[{gpu_id_i, gpu_id_j}]) {
1268 imap.directed_links.insert({gpu_id_i, gpu_id_j});
1269 }
1270 }
1271 }
1272 return Status::OK();
1273 }
1274
GetDeviceLocalities(int num_tf_gpus,const std::vector<InterconnectMap> & interconnects,LocalityMap * localities)1275 Status BaseGPUDeviceFactory::GetDeviceLocalities(
1276 int num_tf_gpus, const std::vector<InterconnectMap>& interconnects,
1277 LocalityMap* localities) {
1278 std::vector<TfGpuId> all_tf_gpu_ids;
1279 all_tf_gpu_ids.reserve(num_tf_gpus);
1280 for (int i = 0; i < num_tf_gpus; ++i) {
1281 all_tf_gpu_ids.push_back(TfGpuId(i));
1282 }
1283 for (TfGpuId tf_gpu_id : all_tf_gpu_ids) {
1284 PlatformGpuId platform_gpu_id;
1285 TF_RETURN_IF_ERROR(
1286 GpuIdManager::TfToPlatformGpuId(tf_gpu_id, &platform_gpu_id));
1287 // Get GPU bus_id from its reported NUMA affinity. Because GPUs are
1288 // virtualized in some environments, we can't just use the GPU id.
1289 // NUMA locales are indexed from 0, buses are indexed from 1.
1290 se::StreamExecutor* se =
1291 GpuIdUtil::ExecutorForPlatformGpuId(platform_gpu_id).ValueOrDie();
1292 const se::DeviceDescription& desc = se->GetDeviceDescription();
1293 int numa_node = desc.numa_node();
1294 if (numa_node < 0) {
1295 // For some reason the StreamExecutor couldn't get the NUMA
1296 // affinity of the GPU. If this is not a multi-socket mobo with
1297 // GPUs local to different buses, it doesn't matter. If it is, we
1298 // may run into trouble later with data transfer operations. The
1299 // trouble may manifest as slower than expected performance, or
1300 // outright failures.
1301 LOG(INFO) << "Could not identify NUMA node of platform GPU id "
1302 << platform_gpu_id
1303 << ", defaulting to 0. Your kernel may not have been built "
1304 << "with NUMA support.";
1305 numa_node = 0;
1306 }
1307 DeviceLocality dev_locality;
1308 dev_locality.set_numa_node(numa_node);
1309 dev_locality.set_bus_id(numa_node + 1);
1310
1311 // Set LocalLinks from InterconnectMaps.
1312 LocalLinks* links = dev_locality.mutable_links();
1313 for (const InterconnectMap& imap : interconnects) {
1314 for (TfGpuId tf_gpu_dst : all_tf_gpu_ids) {
1315 PlatformGpuId platform_gpu_dst;
1316 TF_RETURN_IF_ERROR(
1317 GpuIdManager::TfToPlatformGpuId(tf_gpu_dst, &platform_gpu_dst));
1318 if (imap.directed_links.find({platform_gpu_id, platform_gpu_dst}) !=
1319 imap.directed_links.end()) {
1320 InterconnectLink* ilink = links->add_link();
1321 ilink->set_device_id(tf_gpu_dst.value());
1322 ilink->set_type(imap.name);
1323 ilink->set_strength(imap.strength);
1324 }
1325 }
1326 }
1327
1328 // If this is one of multiple virtual GPUs on the same physical GPU
1329 // add high strength links to the others.
1330 for (TfGpuId tf_gpu_dst : all_tf_gpu_ids) {
1331 if (tf_gpu_id == tf_gpu_dst) continue;
1332 PlatformGpuId platform_gpu_dst;
1333 TF_RETURN_IF_ERROR(
1334 GpuIdManager::TfToPlatformGpuId(tf_gpu_dst, &platform_gpu_dst));
1335 if (platform_gpu_id == platform_gpu_dst) {
1336 InterconnectLink* ilink = links->add_link();
1337 ilink->set_device_id(tf_gpu_dst.value());
1338 ilink->set_type("SAME_DEVICE");
1339 ilink->set_strength(InterconnectMap::kSameDeviceStrength);
1340 }
1341 }
1342
1343 (*localities)[tf_gpu_id] = dev_locality;
1344 VLOG(1) << "GPUDevice PlatformGpuId " << platform_gpu_id << " TfGpuId "
1345 << tf_gpu_id << " on bus " << dev_locality.bus_id()
1346 << " numa: " << numa_node << " pci: " << desc.pci_bus_id()
1347 << " DeviceLocality: " << dev_locality.DebugString();
1348 }
1349 return Status::OK();
1350 }
1351
GetDefaultMinGPUMultiprocessorCount(se::Platform * gpu_manager,const std::vector<PlatformGpuId> & visible_gpu_order)1352 static int GetDefaultMinGPUMultiprocessorCount(
1353 se::Platform* gpu_manager,
1354 const std::vector<PlatformGpuId>& visible_gpu_order) {
1355 static const int kDefaultMinGPUMultiprocessorCount = 8;
1356
1357 // Find the highest multi-processor count across all visible GPUs.
1358 int max_count = -1;
1359 for (int i = 0; i < visible_gpu_order.size(); ++i) {
1360 auto exec_status =
1361 GpuIdUtil::ExecutorForPlatformGpuId(gpu_manager, visible_gpu_order[i]);
1362 if (!exec_status.ok()) {
1363 continue;
1364 }
1365
1366 se::StreamExecutor* se = exec_status.ValueOrDie();
1367 const se::DeviceDescription& desc = se->GetDeviceDescription();
1368 max_count = std::max(max_count, desc.core_count());
1369 }
1370
1371 if (max_count < 0 || kDefaultMinGPUMultiprocessorCount < max_count) {
1372 return kDefaultMinGPUMultiprocessorCount;
1373 } else {
1374 return max_count;
1375 }
1376 }
1377
GetMinGPUMultiprocessorCount(se::Platform * gpu_manager,const std::vector<PlatformGpuId> & visible_gpu_order)1378 static int GetMinGPUMultiprocessorCount(
1379 se::Platform* gpu_manager,
1380 const std::vector<PlatformGpuId>& visible_gpu_order) {
1381 const char* tf_min_gpu_core_count = getenv("TF_MIN_GPU_MULTIPROCESSOR_COUNT");
1382
1383 if (tf_min_gpu_core_count == nullptr ||
1384 strcmp(tf_min_gpu_core_count, "") == 0) {
1385 return GetDefaultMinGPUMultiprocessorCount(gpu_manager, visible_gpu_order);
1386 }
1387
1388 int min_gpu_core_count = -1;
1389 if (strings::safe_strto32(tf_min_gpu_core_count, &min_gpu_core_count)) {
1390 if (min_gpu_core_count >= 0) {
1391 return min_gpu_core_count;
1392 }
1393 }
1394
1395 int count =
1396 GetDefaultMinGPUMultiprocessorCount(gpu_manager, visible_gpu_order);
1397 LOG(ERROR) << "Invalid minimum GPU multiprocessor count: ["
1398 << tf_min_gpu_core_count << "]. "
1399 << "Using the default value: " << count;
1400 return count;
1401 }
1402
1403 namespace {
1404
1405 #if GOOGLE_CUDA
1406 struct CudaVersion {
1407 // Initialize from version_name in the form of "3.5"
CudaVersiontensorflow::__anon0a6e82d20a11::CudaVersion1408 explicit CudaVersion(const std::string& version_name) {
1409 size_t dot_pos = version_name.find('.');
1410 CHECK(dot_pos != string::npos)
1411 << "Illegal version name: [" << version_name << "]";
1412 string major_str = version_name.substr(0, dot_pos);
1413 CHECK(strings::safe_strto32(major_str, &major_part))
1414 << "Illegal version name: [" << version_name << "]";
1415 string minor_str = version_name.substr(dot_pos + 1);
1416 CHECK(strings::safe_strto32(minor_str, &minor_part))
1417 << "Illegal version name: [" << version_name << "]";
1418 }
CudaVersiontensorflow::__anon0a6e82d20a11::CudaVersion1419 CudaVersion() {}
operator <tensorflow::__anon0a6e82d20a11::CudaVersion1420 bool operator<(const CudaVersion& other) const {
1421 if (this->major_part != other.major_part) {
1422 return this->major_part < other.major_part;
1423 }
1424 return this->minor_part < other.minor_part;
1425 }
operator <<(std::ostream & os,const CudaVersion & version)1426 friend std::ostream& operator<<(std::ostream& os,
1427 const CudaVersion& version) {
1428 os << version.major_part << "." << version.minor_part;
1429 return os;
1430 }
1431 int major_part = -1;
1432 int minor_part = -1;
1433 };
1434
1435 std::vector<CudaVersion> supported_cuda_compute_capabilities = {
1436 TF_CUDA_CAPABILITIES,};
1437
GetSupportedCudaComputeCapabilities()1438 std::vector<CudaVersion> GetSupportedCudaComputeCapabilities() {
1439 auto cuda_caps = supported_cuda_compute_capabilities;
1440 #ifdef TF_EXTRA_CUDA_CAPABILITIES
1441 // TF_EXTRA_CUDA_CAPABILITIES should be defined a sequence separated by commas,
1442 // for example:
1443 // TF_EXTRA_CUDA_CAPABILITIES=3.0,4.0,5.0
1444 // Use two-level macro expansion for stringification.
1445 #define TF_XSTRING(...) #__VA_ARGS__
1446 #define TF_STRING(s) TF_XSTRING(s)
1447 string extra_cuda_caps = TF_STRING(TF_EXTRA_CUDA_CAPABILITIES);
1448 #undef TF_STRING
1449 #undef TF_XSTRING
1450 auto extra_capabilities = str_util::Split(extra_cuda_caps, ',');
1451 for (const auto& capability : extra_capabilities) {
1452 cuda_caps.push_back(CudaVersion(capability));
1453 }
1454 #endif
1455 return cuda_caps;
1456 }
1457 #endif // GOOGLE_CUDA
1458
1459 #if TENSORFLOW_USE_ROCM
1460 std::vector<int> supported_amdgpu_isa_versions = {803, 900, 906};
1461
GetSupportedAMDGPUISAVersions()1462 std::vector<int> GetSupportedAMDGPUISAVersions() {
1463 return supported_amdgpu_isa_versions;
1464 }
1465 #endif // TENSORFLOW_USE_ROCM
1466
EnablePeerAccess(se::Platform * platform,const std::vector<PlatformGpuId> & visible_gpu_order)1467 Status EnablePeerAccess(se::Platform* platform,
1468 const std::vector<PlatformGpuId>& visible_gpu_order) {
1469 int possible_peer_count = 0;
1470 int enabled_peer_count = 0;
1471 for (int i = 0; i < visible_gpu_order.size(); ++i) {
1472 const PlatformGpuId platform_gpu_i = visible_gpu_order[i];
1473 for (int j = 0; j < visible_gpu_order.size(); ++j) {
1474 const PlatformGpuId platform_gpu_j = visible_gpu_order[j];
1475 // We have already validated that ExecutorForDevice() calls return OK.
1476 se::StreamExecutor* from =
1477 GpuIdUtil::ExecutorForPlatformGpuId(platform, platform_gpu_i)
1478 .ValueOrDie();
1479 se::StreamExecutor* to =
1480 GpuIdUtil::ExecutorForPlatformGpuId(platform, platform_gpu_j)
1481 .ValueOrDie();
1482
1483 if (from->CanEnablePeerAccessTo(to)) {
1484 ++possible_peer_count;
1485 auto status = from->EnablePeerAccessTo(to);
1486 if (!status.ok()) {
1487 LOG(WARNING)
1488 << "Unable to enable peer access between device ordinals "
1489 << platform_gpu_i << " and " << platform_gpu_j
1490 << ", status: " << status;
1491 } else {
1492 ++enabled_peer_count;
1493 }
1494 }
1495 }
1496 }
1497
1498 // Return an error in the extreme failure case where the driver
1499 // reported that peering was possible but not a single peering was
1500 // successful. This is to catch possible system misconfigurations
1501 // or more fundamental issues.
1502 if (possible_peer_count > 0 && enabled_peer_count == 0) {
1503 return errors::Internal(possible_peer_count,
1504 " potential peer access pairs were reported by the "
1505 "driver, but no peering could be enabled.");
1506 }
1507 return Status::OK();
1508 }
1509
1510 } // namespace
1511
GetValidDeviceIds(const std::vector<PlatformGpuId> & visible_gpu_order,std::vector<PlatformGpuId> * ids)1512 Status BaseGPUDeviceFactory::GetValidDeviceIds(
1513 const std::vector<PlatformGpuId>& visible_gpu_order,
1514 std::vector<PlatformGpuId>* ids) {
1515 se::Platform* gpu_manager = GPUMachineManager();
1516 bool new_gpu_found = false;
1517 for (int i = 0; i < visible_gpu_order.size(); ++i) {
1518 const PlatformGpuId visible_gpu_id = visible_gpu_order[i];
1519
1520 // Only perform this once per visible platform gpu id.
1521 if (visible_gpu_initialized_[visible_gpu_id.value()]) {
1522 continue;
1523 }
1524
1525 visible_gpu_initialized_[visible_gpu_id.value()] = true;
1526 new_gpu_found = true;
1527
1528 auto executor =
1529 GpuIdUtil::ExecutorForPlatformGpuId(gpu_manager, visible_gpu_id);
1530 if (!executor.ok()) {
1531 return executor.status();
1532 }
1533
1534 auto stream_exec = executor.ValueOrDie();
1535 int64 free_bytes;
1536 int64 total_bytes;
1537 if (!stream_exec->DeviceMemoryUsage(&free_bytes, &total_bytes)) {
1538 // Logs internally on failure.
1539 free_bytes = 0;
1540 total_bytes = 0;
1541 }
1542 const auto& description = stream_exec->GetDeviceDescription();
1543 #if GOOGLE_CUDA
1544 int cc_major;
1545 int cc_minor;
1546 if (!description.cuda_compute_capability(&cc_major, &cc_minor)) {
1547 // Logs internally on failure.
1548 cc_major = 0;
1549 cc_minor = 0;
1550 }
1551 LOG(INFO) << "Found device " << i << " with properties: "
1552 << "\nname: " << description.name() << " major: " << cc_major
1553 << " minor: " << cc_minor
1554 << " memoryClockRate(GHz): " << description.clock_rate_ghz()
1555 << "\npciBusID: " << description.pci_bus_id() << "\ntotalMemory: "
1556 << strings::HumanReadableNumBytes(total_bytes)
1557 << " freeMemory: " << strings::HumanReadableNumBytes(free_bytes);
1558 #elif TENSORFLOW_USE_ROCM
1559 int isa_version;
1560 if (!description.rocm_amdgpu_isa_version(&isa_version)) {
1561 // Logs internally on failure.
1562 isa_version = 0;
1563 }
1564 LOG(INFO) << "Found device " << i << " with properties: "
1565 << "\nname: " << description.name() << "\nAMDGPU ISA: gfx"
1566 << isa_version << "\nmemoryClockRate (GHz) "
1567 << description.clock_rate_ghz() << "\npciBusID "
1568 << description.pci_bus_id() << "\nTotal memory: "
1569 << strings::HumanReadableNumBytes(total_bytes)
1570 << "\nFree memory: "
1571 << strings::HumanReadableNumBytes(free_bytes);
1572 #endif
1573 }
1574 // Checking peering and shows matrix if more than one gpu found.
1575 if (new_gpu_found && visible_gpu_order.size() > 1) {
1576 // Enable peer access
1577 TF_RETURN_IF_ERROR(EnablePeerAccess(gpu_manager, visible_gpu_order));
1578 }
1579
1580 #if GOOGLE_CUDA
1581 auto cuda_supported_capabilities = GetSupportedCudaComputeCapabilities();
1582 if (cuda_supported_capabilities.empty()) {
1583 return errors::FailedPrecondition(
1584 "No supported cuda capabilities in binary.");
1585 }
1586 CudaVersion min_supported_capability = *std::min_element(
1587 cuda_supported_capabilities.begin(), cuda_supported_capabilities.end());
1588 #elif TENSORFLOW_USE_ROCM
1589 auto rocm_supported_isas = GetSupportedAMDGPUISAVersions();
1590 if (rocm_supported_isas.empty()) {
1591 return errors::FailedPrecondition(
1592 "No supported rocm capabilities in binary.");
1593 }
1594 int min_supported_isa =
1595 *std::min_element(rocm_supported_isas.begin(), rocm_supported_isas.end());
1596 #endif
1597
1598 int min_gpu_core_count =
1599 GetMinGPUMultiprocessorCount(gpu_manager, visible_gpu_order);
1600
1601 // Filter out devices that don't have the right capability or power.
1602 for (int i = 0; i < visible_gpu_order.size(); ++i) {
1603 const PlatformGpuId visible_gpu_id = visible_gpu_order[i];
1604 auto exec_status =
1605 GpuIdUtil::ExecutorForPlatformGpuId(gpu_manager, visible_gpu_id);
1606 if (!exec_status.ok()) {
1607 LOG(INFO) << "Ignoring visible gpu device " << visible_gpu_id
1608 << " whose executor is in invalid state: "
1609 << exec_status.status().ToString();
1610 continue;
1611 }
1612 se::StreamExecutor* se = exec_status.ValueOrDie();
1613 const se::DeviceDescription& desc = se->GetDeviceDescription();
1614
1615 #if GOOGLE_CUDA
1616 CudaVersion device_capability;
1617 if (!desc.cuda_compute_capability(&device_capability.major_part,
1618 &device_capability.minor_part)) {
1619 LOG(INFO) << "Ignoring visible gpu device "
1620 << "(" << GetShortDeviceDescription(visible_gpu_id, desc)
1621 << ") "
1622 << "whose CUDA compute capability is not available.";
1623 continue;
1624 }
1625 // Only GPUs with no less than the minimum supported compute capability is
1626 // accepted.
1627 if (device_capability < min_supported_capability) {
1628 LOG(INFO) << "Ignoring visible gpu device "
1629 << "(" << GetShortDeviceDescription(visible_gpu_id, desc)
1630 << ") "
1631 << "with Cuda compute capability " << device_capability
1632 << ". The minimum required Cuda capability is "
1633 << min_supported_capability << ".";
1634 continue;
1635 }
1636 #elif TENSORFLOW_USE_ROCM
1637 int device_isa;
1638 if (!desc.rocm_amdgpu_isa_version(&device_isa)) {
1639 continue;
1640 }
1641 // Only GPUs with no less than the minimum supported compute capability is
1642 // accepted.
1643 if (device_isa < min_supported_isa) {
1644 LOG(INFO) << "Ignoring visible gpu device "
1645 << "(" << GetShortDeviceDescription(visible_gpu_id, desc)
1646 << ") "
1647 << "with AMDGPU ISA gfx" << device_isa
1648 << ". The minimum required AMDGPU ISA is gfx"
1649 << min_supported_isa << ".";
1650 continue;
1651 }
1652 #endif
1653
1654 // Filter out slow GPUs. By default, GPUs with a lower multiprocessor
1655 // count than the fastest GPU are filtered out, unless they have 8 or more
1656 // multiprocessors. If the TF_MIN_GPU_MULTIPROCESSOR_COUNT environment
1657 // variable is set, its value will be used to filter out GPUs.
1658 if (desc.core_count() < min_gpu_core_count) {
1659 LOG(INFO) << "Ignoring visible gpu device "
1660 << "(" << GetShortDeviceDescription(visible_gpu_id, desc)
1661 << ") "
1662 << "with core count: " << desc.core_count()
1663 << ". The minimum required count is " << min_gpu_core_count
1664 << ". You can adjust this requirement with the env var "
1665 "TF_MIN_GPU_MULTIPROCESSOR_COUNT.";
1666 continue;
1667 }
1668 ids->push_back(visible_gpu_id);
1669 }
1670 if (!ids->empty()) {
1671 std::vector<int> raw_ids(ids->size());
1672 std::transform(ids->begin(), ids->end(), raw_ids.begin(),
1673 [](PlatformGpuId id) -> int { return id.value(); });
1674 LOG(INFO) << "Adding visible gpu devices: "
1675 << str_util::Join(raw_ids, ", ");
1676 }
1677
1678 return Status::OK();
1679 }
1680
SafeAllocFrontier()1681 uint64 BaseGPUDevice::SafeAllocFrontier() {
1682 if (timestamped_allocator_) {
1683 return kernel_tracker_->LastTerminatedCount();
1684 } else {
1685 return 0;
1686 }
1687 }
1688
PendingKernels()1689 int BaseGPUDevice::PendingKernels() {
1690 if (kernel_tracker_) {
1691 return kernel_tracker_->NumPending();
1692 }
1693 return 0;
1694 }
1695
RecordQueued()1696 uint64 GPUKernelTracker::RecordQueued() {
1697 mutex_lock l(mu_);
1698 uint64 queued_count = timing_counter_->next();
1699 VLOG(2) << "RecordQueued queued_count=" << queued_count
1700 << " first_available_=" << first_available_
1701 << " last_completed_=" << last_completed_
1702 << " num_pending_=" << num_pending_;
1703 pending_kernels_[first_available_].queued_count = queued_count;
1704 pending_kernels_[first_available_].terminated = false;
1705 ++first_available_;
1706 ++num_pending_;
1707 if (first_available_ >= pending_kernels_.size()) {
1708 first_available_ = 0;
1709 }
1710 if (first_available_ == last_completed_) {
1711 // Ring buffer is full: double it. All of the same valid PendingKernel
1712 // entries exist after the copy, they are just shifted to begin
1713 // at index 0 in the new array.
1714 std::vector<PendingKernel> new_buffer(pending_kernels_.size() * 2);
1715 for (int i = 0; i < pending_kernels_.size(); ++i) {
1716 int j = (i + last_completed_) % pending_kernels_.size();
1717 new_buffer[i] = pending_kernels_[j];
1718 }
1719 last_completed_ = 0;
1720 first_available_ = pending_kernels_.size();
1721 pending_kernels_.swap(new_buffer);
1722 VLOG(1) << "last_completed_=" << last_completed_
1723 << " first_available_=" << first_available_
1724 << " num_pending_=" << num_pending_;
1725 }
1726 DCHECK_NE(first_available_, last_completed_) << "exhausted pending_kernels";
1727 return queued_count;
1728 }
1729
RecordTerminated(uint64 queued_count)1730 void GPUKernelTracker::RecordTerminated(uint64 queued_count) {
1731 mutex_lock l(mu_);
1732 VLOG(2) << "RecordTerminated queued_count=" << queued_count
1733 << " first_available_=" << first_available_
1734 << " last_completed_=" << last_completed_
1735 << " num_pending_=" << num_pending_ << " LC="
1736 << ((last_completed_ >= 0)
1737 ? pending_kernels_[last_completed_].queued_count
1738 : -1);
1739 DCHECK_NE(first_available_, last_completed_);
1740 DCHECK_GT(num_pending_, 0);
1741 // Starting just past the last completed entry, find the entry with
1742 // this queued_count and mark it done.
1743 int index = (last_completed_ + 1) % pending_kernels_.size();
1744 while (true) {
1745 if (index == first_available_) {
1746 // This should never happen.
1747 LOG(FATAL) << "Failed to find " << queued_count // Crash OK
1748 << " in queue";
1749 }
1750 if (pending_kernels_[index].queued_count == queued_count) {
1751 pending_kernels_[index].terminated = true;
1752 break;
1753 }
1754 index = (index + 1) % pending_kernels_.size();
1755 }
1756 // Next move last_completed_ forward past all completed kernels. In theory
1757 // kernels should always complete in queued order so we should be able to
1758 // advance the completed frontier to the last queued PendingKernel. In
1759 // practice we occassionally see the termination callbacks arrive out of order
1760 // probably because of thread scheduling. Eventually we may support out-of-
1761 // order completion involving multple compute streams so here we follow a
1762 // conservative approach and wait for every single callback to arrive before
1763 // advancing the frontier.
1764 while (true) {
1765 int next_index = (last_completed_ + 1) % pending_kernels_.size();
1766 if (next_index == first_available_) break;
1767 if (pending_kernels_[next_index].terminated) {
1768 last_completed_ = next_index;
1769 } else {
1770 break;
1771 }
1772 }
1773 // Last decrease num_pending before maybe waking a waiter.
1774 --num_pending_;
1775 pending_decreased_.notify_one();
1776 }
1777
LastTerminatedCount()1778 uint64 GPUKernelTracker::LastTerminatedCount() {
1779 mutex_lock l(mu_);
1780 if (last_completed_ < 0) {
1781 // This is an edge case that can be encountered only at the beginning of
1782 // execution. There's not yet a safe threshold count. We don't want to
1783 // return 0 since that bypasses the count mechanism in BFCAllocator, so
1784 // return the least non-zero value.
1785 return 1;
1786 }
1787 return pending_kernels_[last_completed_].queued_count;
1788 }
1789
1790 } // namespace tensorflow
1791
1792 #endif // GOOGLE_CUDA
1793