1 /* Copyright 2015 The TensorFlow Authors. All Rights Reserved.
2 
3 Licensed under the Apache License, Version 2.0 (the "License");
4 you may not use this file except in compliance with the License.
5 You may obtain a copy of the License at
6 
7     http://www.apache.org/licenses/LICENSE-2.0
8 
9 Unless required by applicable law or agreed to in writing, software
10 distributed under the License is distributed on an "AS IS" BASIS,
11 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 See the License for the specific language governing permissions and
13 limitations under the License.
14 ==============================================================================*/
15 
#include "tensorflow/core/common_runtime/gpu/gpu_debug_allocator.h"

#include <cmath>
#include <cstddef>
#include <vector>

#include "tensorflow/core/common_runtime/device/device_id_utils.h"
#include "tensorflow/core/common_runtime/gpu/gpu_id.h"
#include "tensorflow/core/common_runtime/gpu/gpu_init.h"
#include "tensorflow/core/platform/stream_executor.h"
25 
// Number of int64 words in one guard mask.
#define MASK_WORDS 2
// Size in bytes of one guard mask; one mask is written before and one after
// every buffer handed out by GPUDebugAllocator.
#define MASK_BYTES (MASK_WORDS * sizeof(int64))
28 
29 namespace tensorflow {
30 namespace {
31 
NewMask(int64 word)32 int64* NewMask(int64 word) {
33   int64* m = new int64[MASK_WORDS];
34   for (int i = 0; i < MASK_WORDS; ++i) {
35     m[i] = word;
36   }
37   return m;
38 }
39 
40 int64* before_mask = NewMask(0xabababababababab);
41 int64* after_mask = NewMask(0xcdcdcdcdcdcdcdcd);
42 
CheckMask(se::StreamExecutor * exec,void * ptr,int64 * mask)43 bool CheckMask(se::StreamExecutor* exec, void* ptr, int64* mask) {
44   se::DeviceMemory<int64> gpu_ptr{se::DeviceMemoryBase{ptr, MASK_BYTES}};
45   int64 tmp[MASK_WORDS];
46 
47   Status result = exec->SynchronousMemcpyD2H(gpu_ptr, MASK_BYTES, tmp);
48   if (!result.ok()) {
49     LOG(FATAL) << "Could not copy debug mask, " << result;
50   }
51 
52   bool ok = true;
53   for (int i = 0; i < MASK_WORDS; ++i) {
54     ok &= (mask[i] == tmp[i]);
55     if (!ok) {
56       LOG(ERROR) << "i=" << i
57                  << " mask=" << reinterpret_cast<const void*>(mask[i])
58                  << " field=" << reinterpret_cast<const void*>(tmp[i]);
59     }
60   }
61 
62   return ok;
63 }
64 
InitMask(se::StreamExecutor * exec,void * ptr,int64 * mask)65 void InitMask(se::StreamExecutor* exec, void* ptr, int64* mask) {
66   se::DeviceMemory<int64> gpu_ptr{se::DeviceMemoryBase{ptr, MASK_BYTES}};
67   Status result = exec->SynchronousMemcpyH2D(mask, MASK_BYTES, &gpu_ptr);
68   if (!result.ok()) {
69     LOG(FATAL) << "Could not copy debug mask, " << result;
70   }
71 }
72 
73 }  // namespace
74 
75 // -----------------------------------------------------------------------------
76 // GPUDebugAllocator
77 // -----------------------------------------------------------------------------
GPUDebugAllocator(Allocator * allocator,PlatformGpuId platform_gpu_id)78 GPUDebugAllocator::GPUDebugAllocator(Allocator* allocator,
79                                      PlatformGpuId platform_gpu_id)
80     : base_allocator_(allocator) {
81   stream_exec_ = DeviceIdUtil::ExecutorForPlatformDeviceId(GPUMachineManager(),
82                                                            platform_gpu_id)
83                      .ValueOrDie();
84 }
85 
~GPUDebugAllocator()86 GPUDebugAllocator::~GPUDebugAllocator() { delete base_allocator_; }
87 
AllocateRaw(size_t alignment,size_t num_bytes)88 void* GPUDebugAllocator::AllocateRaw(size_t alignment, size_t num_bytes) {
89   num_bytes += (2 * MASK_BYTES);
90   void* allocated_ptr = base_allocator_->AllocateRaw(alignment, num_bytes);
91   if (allocated_ptr == nullptr) return allocated_ptr;
92 
93   // Return the pointer after the header
94   void* rv = static_cast<char*>(allocated_ptr) + MASK_BYTES;
95 
96   // Write the header at allocated_ptr
97   InitMask(stream_exec_, allocated_ptr, before_mask);
98 
99   // Write the footer at the end.
100   size_t req_size = base_allocator_->RequestedSize(allocated_ptr);
101   InitMask(stream_exec_,
102            static_cast<char*>(allocated_ptr) + req_size - MASK_BYTES,
103            after_mask);
104   return rv;
105 }
DeallocateRaw(void * ptr)106 void GPUDebugAllocator::DeallocateRaw(void* ptr) {
107   if (ptr != nullptr) {
108     CHECK(CheckHeader(ptr)) << "before_mask has been overwritten";
109     CHECK(CheckFooter(ptr)) << "after_mask has been overwritten";
110 
111     // Backtrack to the beginning of the header.
112     ptr = static_cast<void*>(static_cast<char*>(ptr) - MASK_BYTES);
113   }
114   // Deallocate the memory
115   base_allocator_->DeallocateRaw(ptr);
116 }
117 
TracksAllocationSizes() const118 bool GPUDebugAllocator::TracksAllocationSizes() const { return true; }
119 
RequestedSize(const void * ptr) const120 size_t GPUDebugAllocator::RequestedSize(const void* ptr) const {
121   auto req_size = base_allocator_->RequestedSize(static_cast<const char*>(ptr) -
122                                                  MASK_BYTES);
123   return req_size - 2 * MASK_BYTES;
124 }
125 
AllocatedSize(const void * ptr) const126 size_t GPUDebugAllocator::AllocatedSize(const void* ptr) const {
127   return base_allocator_->AllocatedSize(static_cast<const char*>(ptr) -
128                                         MASK_BYTES);
129 }
130 
AllocationId(const void * ptr) const131 int64 GPUDebugAllocator::AllocationId(const void* ptr) const {
132   return base_allocator_->AllocationId(static_cast<const char*>(ptr) -
133                                        MASK_BYTES);
134 }
135 
GetStats()136 absl::optional<AllocatorStats> GPUDebugAllocator::GetStats() {
137   return base_allocator_->GetStats();
138 }
139 
ClearStats()140 void GPUDebugAllocator::ClearStats() { base_allocator_->ClearStats(); }
141 
CheckHeader(void * ptr)142 bool GPUDebugAllocator::CheckHeader(void* ptr) {
143   return CheckMask(stream_exec_, static_cast<char*>(ptr) - MASK_BYTES,
144                    before_mask);
145 }
146 
CheckFooter(void * ptr)147 bool GPUDebugAllocator::CheckFooter(void* ptr) {
148   char* original_ptr = static_cast<char*>(ptr) - MASK_BYTES;
149   size_t req_size = base_allocator_->RequestedSize(original_ptr);
150   return CheckMask(stream_exec_, original_ptr + req_size - MASK_BYTES,
151                    after_mask);
152 }
153 
154 // -----------------------------------------------------------------------------
155 // GPUNanResetAllocator
156 // -----------------------------------------------------------------------------
GPUNanResetAllocator(Allocator * allocator,PlatformGpuId platform_gpu_id)157 GPUNanResetAllocator::GPUNanResetAllocator(Allocator* allocator,
158                                            PlatformGpuId platform_gpu_id)
159     : base_allocator_(allocator) {
160   stream_exec_ = DeviceIdUtil::ExecutorForPlatformDeviceId(GPUMachineManager(),
161                                                            platform_gpu_id)
162                      .ValueOrDie();
163 }
164 
~GPUNanResetAllocator()165 GPUNanResetAllocator::~GPUNanResetAllocator() { delete base_allocator_; }
166 
AllocateRaw(size_t alignment,size_t num_bytes)167 void* GPUNanResetAllocator::AllocateRaw(size_t alignment, size_t num_bytes) {
168   void* allocated_ptr = base_allocator_->AllocateRaw(alignment, num_bytes);
169   if (allocated_ptr == nullptr) return allocated_ptr;
170 
171   // Initialize the buffer to Nans
172   size_t req_size = base_allocator_->RequestedSize(allocated_ptr);
173   std::vector<float> nans((req_size + sizeof(float) - 1) / sizeof(float),
174                           std::nanf(""));
175   se::DeviceMemory<float> nan_ptr{
176       se::DeviceMemoryBase{static_cast<float*>(allocated_ptr), req_size}};
177 
178   Status result =
179       stream_exec_->SynchronousMemcpyH2D(&nans[0], req_size, &nan_ptr);
180   if (!result.ok()) {
181     LOG(ERROR) << "Could not initialize to NaNs, " << result;
182   }
183 
184   return allocated_ptr;
185 }
DeallocateRaw(void * ptr)186 void GPUNanResetAllocator::DeallocateRaw(void* ptr) {
187   if (ptr != nullptr) {
188     // Reset the buffer to Nans
189     size_t req_size = base_allocator_->RequestedSize(ptr);
190     std::vector<float> nans((req_size + sizeof(float) - 1) / sizeof(float),
191                             std::nanf(""));
192     se::DeviceMemory<float> nan_ptr{
193         se::DeviceMemoryBase{static_cast<float*>(ptr), req_size}};
194     Status result =
195         stream_exec_->SynchronousMemcpyH2D(&nans[0], req_size, &nan_ptr);
196     if (!result.ok()) {
197       LOG(ERROR) << "Could not initialize to NaNs, " << result;
198     }
199   }
200 
201   // Deallocate the memory
202   base_allocator_->DeallocateRaw(ptr);
203 }
204 
RequestedSize(const void * ptr) const205 size_t GPUNanResetAllocator::RequestedSize(const void* ptr) const {
206   return base_allocator_->RequestedSize(ptr);
207 }
208 
AllocatedSize(const void * ptr) const209 size_t GPUNanResetAllocator::AllocatedSize(const void* ptr) const {
210   return base_allocator_->AllocatedSize(ptr);
211 }
212 
GetStats()213 absl::optional<AllocatorStats> GPUNanResetAllocator::GetStats() {
214   return base_allocator_->GetStats();
215 }
216 
ClearStats()217 void GPUNanResetAllocator::ClearStats() { base_allocator_->ClearStats(); }
218 
219 }  // namespace tensorflow
220