1 /* Copyright 2015 The TensorFlow Authors. All Rights Reserved.
2 
3 Licensed under the Apache License, Version 2.0 (the "License");
4 you may not use this file except in compliance with the License.
5 You may obtain a copy of the License at
6 
7     http://www.apache.org/licenses/LICENSE-2.0
8 
9 Unless required by applicable law or agreed to in writing, software
10 distributed under the License is distributed on an "AS IS" BASIS,
11 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 See the License for the specific language governing permissions and
13 limitations under the License.
14 ==============================================================================*/
15 
16 #include "tensorflow/core/common_runtime/gpu/gpu_process_state.h"
17 
18 #include <cstring>
19 #include <vector>
20 
21 #include "absl/container/flat_hash_set.h"
22 #include "tensorflow/core/common_runtime/device/device_host_allocator.h"
23 #include "tensorflow/core/common_runtime/device/device_id_utils.h"
24 #include "tensorflow/core/common_runtime/gpu/gpu_bfc_allocator.h"
25 #include "tensorflow/core/common_runtime/gpu/gpu_cudamalloc_allocator.h"
26 #include "tensorflow/core/common_runtime/gpu/gpu_debug_allocator.h"
27 #include "tensorflow/core/common_runtime/gpu/gpu_id.h"
28 #include "tensorflow/core/common_runtime/gpu/gpu_id_manager.h"
29 #include "tensorflow/core/common_runtime/gpu/gpu_init.h"
30 #include "tensorflow/core/common_runtime/gpu/gpu_virtual_mem_allocator.h"
31 #include "tensorflow/core/common_runtime/pool_allocator.h"
32 #include "tensorflow/core/common_runtime/shared_counter.h"
33 #include "tensorflow/core/framework/allocator.h"
34 #include "tensorflow/core/framework/log_memory.h"
35 #include "tensorflow/core/framework/tracking_allocator.h"
36 #include "tensorflow/core/lib/strings/strcat.h"
37 #include "tensorflow/core/platform/logging.h"
38 #include "tensorflow/core/platform/mutex.h"
39 #include "tensorflow/core/platform/stream_executor.h"
40 #include "tensorflow/core/platform/types.h"
41 #include "tensorflow/core/util/env_var.h"
42 
43 namespace tensorflow {
44 namespace {
45 
useCudaMallocAllocator()46 bool useCudaMallocAllocator() {
47   const char* debug_allocator_str = std::getenv("TF_GPU_ALLOCATOR");
48   return debug_allocator_str != nullptr &&
49          std::strcmp(debug_allocator_str, "cuda_malloc") == 0;
50 }
51 
useCudaMemoryGuardAllocator()52 bool useCudaMemoryGuardAllocator() {
53   const char* debug_allocator_str = std::getenv("TF_GPU_ALLOCATOR");
54   return debug_allocator_str != nullptr &&
55          std::strcmp(debug_allocator_str, "memory_guard") == 0;
56 }
57 
58 }  // namespace
59 
singleton(GPUProcessState * ps)60 /*static*/ GPUProcessState* GPUProcessState::singleton(GPUProcessState* ps) {
61   static GPUProcessState* instance = ps ? ps : new GPUProcessState;
62   DCHECK((!ps) || (ps == instance))
63       << "Multiple calls to GPUProcessState with non-null ps";
64   return instance;
65 }
66 
GPUProcessState()67 GPUProcessState::GPUProcessState() : gpu_device_enabled_(false) {
68   process_state_ = ProcessState::singleton();
69 }
70 
BusIdForGPU(TfGpuId tf_gpu_id)71 int GPUProcessState::BusIdForGPU(TfGpuId tf_gpu_id) {
72   // Return the NUMA node associated with the GPU's StreamExecutor.
73   se::StreamExecutor* se = DeviceIdUtil::ExecutorForTfDeviceId(
74                                DEVICE_GPU, GPUMachineManager(), tf_gpu_id)
75                                .ValueOrDie();
76   int numa_node = se->GetDeviceDescription().numa_node();
77   // bus_id must be non-negative.  If the numa_node is not known,
78   // use 0.
79   return numa_node >= 0 ? numa_node : 0;
80 }
81 
82 // NOLINTNEXTLINE: clang-tidy complains this is unused because of build flags.
CreateSubAllocator(const GPUOptions & options,PlatformGpuId platform_gpu_id,const std::vector<SubAllocator::Visitor> & alloc_visitors,size_t total_bytes,const std::vector<TfGpuId> & peer_gpu_ids)83 static SubAllocator* CreateSubAllocator(
84     const GPUOptions& options, PlatformGpuId platform_gpu_id,
85     const std::vector<SubAllocator::Visitor>& alloc_visitors,
86     size_t total_bytes, const std::vector<TfGpuId>& peer_gpu_ids) {
87   auto executor = DeviceIdUtil::ExecutorForPlatformDeviceId(GPUMachineManager(),
88                                                             platform_gpu_id)
89                       .ValueOrDie();
90 
91   // FIXME(imintz): Observed OOM issues when using the virtual memory
92   // allocators. This should be reenabled when resolved.
93 #if 0 && defined(GOOGLE_CUDA) && CUDA_VERSION >= 10020
94   // Use the old allocator when unified memory is required.
95   // TODO(imintz): Remove the cuMemAlloc capability of this allocator.
96   if (options.per_process_gpu_memory_fraction() > 1.0 ||
97       options.experimental().use_unified_memory()) {
98     return new DeviceMemAllocator(executor, platform_gpu_id,
99                                   /*use_unified_memory=*/true, alloc_visitors,
100                                   {});
101   } else {
102     auto* gpu_context = reinterpret_cast<stream_executor::gpu::GpuContext*>(
103         executor->implementation()->GpuContextHack());
104 
105     absl::flat_hash_set<PlatformGpuId> platform_peer_gpu_ids;
106     platform_peer_gpu_ids.reserve(peer_gpu_ids.size());
107     for (const TfGpuId tf_gpu_id : peer_gpu_ids) {
108       PlatformGpuId platform_gpu_id;
109       TF_CHECK_OK(GpuIdManager::TfToPlatformGpuId(tf_gpu_id, &platform_gpu_id));
110       platform_peer_gpu_ids.insert(platform_gpu_id);
111     }
112     std::vector<PlatformGpuId> platform_peer_gpu_ids_vec(
113         platform_peer_gpu_ids.begin(), platform_peer_gpu_ids.end());
114 
115     // Adjust virtual address space to be slightly larger than the physical
116     // address space in case the BFC allocator performs suboptimal garbage
117     // collection.
118     // TODO(imintz): Update BFC allocator to ensure it doesn't create holes in
119     // the va space.
120     return GpuVirtualMemAllocator::Create(
121                alloc_visitors, {}, *gpu_context, platform_gpu_id,
122                /*virtual_address_space_size=*/total_bytes * 2,
123                platform_peer_gpu_ids_vec)
124         .ValueOrDie()
125         .release();
126   }
127 #else
128   return new DeviceMemAllocator(
129       executor, platform_gpu_id,
130       (options.per_process_gpu_memory_fraction() > 1.0 ||
131        options.experimental().use_unified_memory()),
132       alloc_visitors, {});
133 #endif
134 }
135 
GetGPUAllocator(const GPUOptions & options,TfGpuId tf_gpu_id,size_t total_bytes,const std::vector<TfGpuId> & peer_gpu_ids)136 Allocator* GPUProcessState::GetGPUAllocator(
137     const GPUOptions& options, TfGpuId tf_gpu_id, size_t total_bytes,
138     const std::vector<TfGpuId>& peer_gpu_ids) {
139   CHECK(process_state_);
140 #if (defined(GOOGLE_CUDA) && GOOGLE_CUDA) || \
141     (defined(TENSORFLOW_USE_ROCM) && TENSORFLOW_USE_ROCM)
142   const string& allocator_type = options.allocator_type();
143   mutex_lock lock(mu_);
144   DeviceIdUtil::CheckValidTfDeviceId(DEVICE_GPU, GPUMachineManager(),
145                                      tf_gpu_id);
146 
147   if (tf_gpu_id.value() >= static_cast<int64>(gpu_allocators_.size())) {
148     gpu_allocators_.resize(tf_gpu_id.value() + 1);
149   }
150 
151   AllocatorParts& allocator_parts = gpu_allocators_[tf_gpu_id.value()];
152   if (allocator_parts.allocator == nullptr) {
153     // Validate allocator types.
154     if (!allocator_type.empty() && allocator_type != "BFC") {
155       LOG(ERROR) << "Invalid allocator type: " << allocator_type;
156       return nullptr;
157     }
158 
159     PlatformGpuId platform_gpu_id;
160     TF_CHECK_OK(GpuIdManager::TfToPlatformGpuId(tf_gpu_id, &platform_gpu_id));
161     int bus_id = BusIdForGPU(tf_gpu_id);
162     DCHECK_GE(bus_id, 0);
163     while (bus_id >= gpu_visitors_.size()) {
164       gpu_visitors_.push_back({});
165     }
166     auto* sub_allocator =
167         CreateSubAllocator(options, platform_gpu_id, gpu_visitors_[bus_id],
168                            total_bytes, peer_gpu_ids);
169     GPUBFCAllocator* gpu_bfc_allocator =
170         new GPUBFCAllocator(sub_allocator, total_bytes, options,
171                             strings::StrCat("GPU_", tf_gpu_id.value(), "_bfc"));
172     Allocator* gpu_allocator = gpu_bfc_allocator;
173     SharedCounter* timing_counter = nullptr;
174     if (options.experimental().timestamped_allocator()) {
175       timing_counter = new SharedCounter;
176       gpu_bfc_allocator->SetTimingCounter(timing_counter);
177     }
178 
179     // If true, checks for memory overwrites by writing
180     // distinctive patterns on both ends of allocated memory.
181     if (useCudaMemoryGuardAllocator()) {
182       LOG(INFO) << "Using memory guard allocator for GPU.";
183       gpu_allocator = new GPUDebugAllocator(gpu_allocator, platform_gpu_id);
184       gpu_allocator = new GPUNanResetAllocator(gpu_allocator, platform_gpu_id);
185     } else if (useCudaMallocAllocator()) {
186       LOG(INFO) << "Using CUDA malloc allocator for GPU.";
187       // If true, passes all allocation requests through to cudaMalloc
188       // useful for doing memory debugging with tools like cuda-memcheck
189       // **WARNING** probably will not work in a multi-gpu scenario
190       gpu_allocator =
191           new GPUcudaMallocAllocator(gpu_allocator, platform_gpu_id);
192     }
193 
194     Allocator* recording_allocator = nullptr;
195     if (process_state_->ProcessState::FLAGS_brain_gpu_record_mem_types) {
196       ProcessState::MemDesc md;
197       md.loc = ProcessState::MemDesc::GPU;
198       md.dev_index = platform_gpu_id.value();
199       md.gpu_registered = false;
200       md.nic_registered = true;
201       recording_allocator = new internal::RecordingAllocator(
202           &process_state_->mem_desc_map_, gpu_allocator, md, &mu_);
203     }
204     allocator_parts = {std::unique_ptr<Allocator>(gpu_allocator),
205                        std::unique_ptr<SharedCounter>(timing_counter),
206                        gpu_bfc_allocator, sub_allocator,
207                        std::unique_ptr<Allocator>(recording_allocator)};
208   }
209   if (process_state_->ProcessState::FLAGS_brain_gpu_record_mem_types) {
210     return allocator_parts.recording_allocator.get();
211   } else {
212     return allocator_parts.allocator.get();
213   }
214 #else
215   LOG(FATAL) << "GPUAllocator unavailable. Not compiled with --config=cuda or "
216                 "--config=rocm.";
217   return nullptr;
218 #endif  // GOOGLE_CUDA || TENSORFLOW_USE_ROCM
219 }
220 
GPUAllocatorCounter(TfGpuId tf_gpu_id)221 SharedCounter* GPUProcessState::GPUAllocatorCounter(TfGpuId tf_gpu_id) {
222   DCHECK(process_state_);
223 #if (defined(GOOGLE_CUDA) && GOOGLE_CUDA) || \
224     (defined(TENSORFLOW_USE_ROCM) && TENSORFLOW_USE_ROCM)
225   DeviceIdUtil::CheckValidTfDeviceId(DEVICE_GPU, GPUMachineManager(),
226                                      tf_gpu_id);
227   mutex_lock l(mu_);
228   if (tf_gpu_id.value() >= static_cast<int64>(gpu_allocators_.size())) {
229     LOG(ERROR) << "Asked for counter for GPU allocator " << tf_gpu_id.value()
230                << " but only have " << gpu_allocators_.size();
231     return nullptr;
232   }
233 
234   AllocatorParts& allocator_parts = gpu_allocators_[tf_gpu_id.value()];
235   if (allocator_parts.counter.get() == nullptr) {
236     SharedCounter* timing_counter = new SharedCounter;
237     allocator_parts.bfc_allocator->SetTimingCounter(timing_counter);
238     allocator_parts.counter.reset(timing_counter);
239   }
240   return allocator_parts.counter.get();
241 #else
242   return nullptr;
243 #endif  // GOOGLE_CUDA || TENSORFLOW_USE_ROCM
244 }
245 
GetGpuHostAllocator(int numa_node)246 Allocator* GPUProcessState::GetGpuHostAllocator(int numa_node) {
247   CHECK(process_state_);
248   if (!HasGPUDevice() ||
249       !process_state_->ProcessState::FLAGS_brain_mem_reg_gpu_dma) {
250     return process_state_->GetCPUAllocator(numa_node);
251   }
252   if (numa_node == port::kNUMANoAffinity) {
253     numa_node = 0;
254   }
255   {
256     // Here we optimize the most common use case where gpu_host_allocators_
257     // have already been populated and since we're only reading
258     // these vectors, we can get by with a shared lock. In the slower case,
259     // we take a unique lock and populate these vectors.
260     tf_shared_lock lock(mu_);
261 
262     if (process_state_->ProcessState::FLAGS_brain_gpu_record_mem_types &&
263         !gpu_host_allocators_.empty() &&
264         gpu_host_allocators_[0].recording_allocator != nullptr) {
265       return gpu_host_allocators_[0].recording_allocator.get();
266     }
267     if (static_cast<int>(gpu_host_allocators_.size()) > numa_node) {
268       return gpu_host_allocators_[0].allocator.get();
269     }
270   }
271 
272   mutex_lock lock(mu_);
273   // Find the first valid StreamExecutor to request CUDA or ROCm host memory
274   // through, since any will work.
275   //
276   // This search isn't super clean, and it would be nice to use a
277   // better source of information about which executor to use.  For
278   // example, process_state could maybe save the first stream executor
279   // it knows is valid.
280   se::StreamExecutor* se = nullptr;
281   for (int i = 0; i < static_cast<int>(gpu_allocators_.size()); ++i) {
282     if (gpu_allocators_[i].allocator != nullptr) {
283       se = DeviceIdUtil::ExecutorForTfDeviceId(DEVICE_GPU, GPUMachineManager(),
284                                                TfGpuId(i))
285                .ValueOrDie();
286       break;
287     }
288   }
289 
290   CHECK_NE(nullptr, se);
291 
292   while (static_cast<int>(gpu_host_allocators_.size()) <= numa_node) {
293     while (gpu_host_alloc_visitors_.size() <= numa_node) {
294       gpu_host_alloc_visitors_.push_back({});
295     }
296     while (gpu_host_free_visitors_.size() <= numa_node) {
297       gpu_host_free_visitors_.push_back({});
298     }
299     SubAllocator* sub_allocator = new DeviceHostAllocator(
300         se, numa_node, gpu_host_alloc_visitors_[numa_node],
301         gpu_host_free_visitors_[numa_node]);
302     // TODO(zheng-xq): evaluate whether 64GB by default is the best choice.
303     int64 gpu_host_mem_limit_in_mb = -1;
304     Status status = ReadInt64FromEnvVar("TF_GPU_HOST_MEM_LIMIT_IN_MB",
305                                         1LL << 16 /*64GB max by default*/,
306                                         &gpu_host_mem_limit_in_mb);
307     if (!status.ok()) {
308       LOG(ERROR) << "GetGpuHostAllocator: " << status.error_message();
309     }
310     int64 gpu_host_mem_limit = gpu_host_mem_limit_in_mb * (1LL << 20);
311 
312     Allocator* allocator =
313         new BFCAllocator(sub_allocator, gpu_host_mem_limit,
314                          /*allow_growth=*/true, /*name=*/"gpu_host_bfc");
315 
316     if (LogMemory::IsEnabled() && !allocator->TracksAllocationSizes()) {
317       // Wrap the allocator to track allocation ids for better logging
318       // at the cost of performance.
319       allocator = new TrackingAllocator(allocator, true);
320     }
321     gpu_host_allocators_.push_back({std::unique_ptr<Allocator>(allocator),
322                                     std::unique_ptr<SharedCounter>(nullptr),
323                                     nullptr, sub_allocator,
324                                     std::unique_ptr<Allocator>(nullptr)});
325     AllocatorParts& allocator_parts = gpu_host_allocators_.back();
326     if (process_state_->ProcessState::FLAGS_brain_gpu_record_mem_types) {
327       ProcessState::MemDesc md;
328       md.loc = ProcessState::MemDesc::CPU;
329       md.dev_index = 0;
330       md.gpu_registered = true;
331       md.nic_registered = false;
332       allocator_parts.recording_allocator.reset(
333           new internal::RecordingAllocator(&process_state_->mem_desc_map_,
334                                            allocator_parts.allocator.get(), md,
335                                            &mu_));
336     }
337   }
338   if (process_state_->ProcessState::FLAGS_brain_gpu_record_mem_types) {
339     return gpu_host_allocators_[0].recording_allocator.get();
340   } else {
341     return gpu_host_allocators_[0].allocator.get();
342   }
343 }
344 
AddGPUAllocVisitor(int bus_id,const SubAllocator::Visitor & visitor)345 void GPUProcessState::AddGPUAllocVisitor(int bus_id,
346                                          const SubAllocator::Visitor& visitor) {
347 #if (defined(GOOGLE_CUDA) && GOOGLE_CUDA) || \
348     (defined(TENSORFLOW_USE_ROCM) && TENSORFLOW_USE_ROCM)
349   mutex_lock lock(mu_);
350   CHECK(gpu_allocators_.empty())  // Crash OK
351       << "AddGPUAllocVisitor must be called before "
352          "first call to GetGPUAllocator.";
353   DCHECK_GE(bus_id, 0);
354   while (bus_id >= static_cast<int64>(gpu_visitors_.size())) {
355     gpu_visitors_.push_back(std::vector<SubAllocator::Visitor>());
356   }
357   gpu_visitors_[bus_id].push_back(visitor);
358 #endif  // GOOGLE_CUDA || TENSORFLOW_USE_ROCM
359 }
360 
AddGpuHostAllocVisitor(int numa_node,const SubAllocator::Visitor & visitor)361 void GPUProcessState::AddGpuHostAllocVisitor(
362     int numa_node, const SubAllocator::Visitor& visitor) {
363 #if (defined(GOOGLE_CUDA) && GOOGLE_CUDA) || \
364     (defined(TENSORFLOW_USE_ROCM) && TENSORFLOW_USE_ROCM)
365   mutex_lock lock(mu_);
366   CHECK(gpu_host_allocators_.empty())  // Crash OK
367       << "AddGpuHostAllocVisitor must be called before "
368          "first call to GetGpuHostAllocator.";
369   while (numa_node >= static_cast<int64>(gpu_host_alloc_visitors_.size())) {
370     gpu_host_alloc_visitors_.push_back(std::vector<SubAllocator::Visitor>());
371   }
372   gpu_host_alloc_visitors_[numa_node].push_back(visitor);
373 #endif  // GOOGLE_CUDA || TENSORFLOW_USE_ROCM
374 }
375 
AddGpuHostFreeVisitor(int numa_node,const SubAllocator::Visitor & visitor)376 void GPUProcessState::AddGpuHostFreeVisitor(
377     int numa_node, const SubAllocator::Visitor& visitor) {
378 #if (defined(GOOGLE_CUDA) && GOOGLE_CUDA) || \
379     (defined(TENSORFLOW_USE_ROCM) && TENSORFLOW_USE_ROCM)
380   mutex_lock lock(mu_);
381   CHECK(gpu_host_allocators_.empty())  // Crash OK
382       << "AddGpuHostFreeVisitor must be called before "
383          "first call to GetGpuHostAllocator.";
384   while (numa_node >= static_cast<int64>(gpu_host_free_visitors_.size())) {
385     gpu_host_free_visitors_.push_back(std::vector<SubAllocator::Visitor>());
386   }
387   gpu_host_free_visitors_[numa_node].push_back(visitor);
388 #endif  // GOOGLE_CUDA || TENSORFLOW_USE_ROCM
389 }
390 
TestOnlyReset()391 void GPUProcessState::TestOnlyReset() {
392   if (process_state_) {
393     process_state_->ProcessState::TestOnlyReset();
394   }
395   {
396     mutex_lock lock(mu_);
397     gpu_device_enabled_ = false;
398     gpu_allocators_.clear();
399     gpu_visitors_.clear();
400     gpu_host_allocators_.clear();
401     gpu_host_alloc_visitors_.clear();
402     gpu_host_free_visitors_.clear();
403   }
404 }
405 
406 }  // namespace tensorflow
407