1 /* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
2 
3 Licensed under the Apache License, Version 2.0 (the "License");
4 you may not use this file except in compliance with the License.
5 You may obtain a copy of the License at
6 
7     http://www.apache.org/licenses/LICENSE-2.0
8 
9 Unless required by applicable law or agreed to in writing, software
10 distributed under the License is distributed on an "AS IS" BASIS,
11 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 See the License for the specific language governing permissions and
13 limitations under the License.
14 ==============================================================================*/
15 
16 #include "tensorflow/compiler/tf2tensorrt/utils/trt_allocator.h"
17 
18 #include "tensorflow/core/platform/logging.h"
19 
20 #if GOOGLE_CUDA
21 #if GOOGLE_TENSORRT
22 #include "cuda/include/cuda_runtime_api.h"
23 #endif  // GOOGLE_TENSORRT
24 #endif  // GOOGLE_CUDA
25 
26 namespace tensorflow {
27 namespace tensorrt {
28 
29 // std::align is not supported, so this method mimic its behavior.
30 //
31 // NOTE(aaroey): according to the TensorRT API,
32 // nvinfer1::IGpuAllocator::allocate() uses uint64_t type for size and alignment
33 // parameters, so here we use the same type to make it compatible.
Align(uint64_t alignment,uint64_t size,void * & ptr,uint64_t & space)34 void* Align(uint64_t alignment, uint64_t size, void*& ptr, uint64_t& space) {
35   QCHECK_GT(alignment, 0ul) << "alignment must be greater than 0.";
36   QCHECK_EQ(0, alignment & (alignment - 1)) << "Alignment must be power of 2.";
37   QCHECK_GT(size, 0ul) << "size must be greater than 0.";
38   QCHECK(ptr) << "ptr must not be nullptr.";
39   QCHECK_GT(space, 0ul) << "space must be greater than 0.";
40   const uintptr_t ptr_val = reinterpret_cast<uintptr_t>(ptr);
41   QCHECK_GE(ptr_val + space, ptr_val) << "Provided space overflows.";
42 
43   if (size > space) return nullptr;
44   const uintptr_t aligned_ptr_val = ((ptr_val + alignment - 1) & -alignment);
45   if (aligned_ptr_val > ptr_val + space - size) return nullptr;
46   ptr = reinterpret_cast<void*>(aligned_ptr_val);
47   const uintptr_t diff = aligned_ptr_val - ptr_val;
48   space -= diff;
49   return ptr;
50 }
51 
52 }  // namespace tensorrt
53 }  // namespace tensorflow
54 
55 #if GOOGLE_CUDA
56 #if GOOGLE_TENSORRT
57 
58 namespace tensorflow {
59 namespace tensorrt {
60 
allocate(uint64_t size,uint64_t alignment,uint32_t flags)61 void* TRTCudaAllocator::allocate(uint64_t size, uint64_t alignment,
62                                  uint32_t flags) {
63   assert((alignment & (alignment - 1)) == 0);  // zero or a power of 2.
64   void* memory;
65   cudaMalloc(&memory, size);
66   return memory;
67 }
68 
free(void * memory)69 void TRTCudaAllocator::free(void* memory) { cudaFree(memory); }
70 
allocate(uint64_t size,uint64_t alignment,uint32_t flags)71 void* TRTDeviceAllocator::allocate(uint64_t size, uint64_t alignment,
72                                    uint32_t flags) {
73   if (size == 0) return nullptr;
74   // WAR for allocator alignment requirement. Certain cuda API calls require GPU
75   // memory with alignment to cudaDeviceProp::textureAlignment.
76   // See issue #20856
77   alignment = 512;
78   assert((alignment & (alignment - 1)) == 0);  // zero or a power of 2.
79   uint64_t total_size = size + alignment;
80   // TODO(aaroey): AllocateRaw takes size_t size as input, so it'll produce
81   // unexpected result when TRT tries to allocate more bytes than size_t can
82   // carry. Fix this.
83   void* mem = allocator_->AllocateRaw(alignment, total_size);
84   if (!mem) return nullptr;
85 
86   void* alloc_mem = mem;
87   QCHECK(Align(alignment, size, mem, total_size));
88   if (mem != alloc_mem) {
89     QCHECK(mem_map_.insert({mem, alloc_mem}).second);
90   }
91   VLOG(2) << "Allocated " << total_size << " bytes memory @" << alloc_mem
92           << "; aligned to " << size << " bytes @" << mem << " with alignment "
93           << alignment;
94   return mem;
95 }
96 
TRTDeviceAllocator(Allocator * allocator)97 TRTDeviceAllocator::TRTDeviceAllocator(Allocator* allocator)
98     : allocator_(allocator) {
99   VLOG(1) << "Using " << allocator->Name() << " allocator from TensorFlow";
100 }
101 
free(void * memory)102 void TRTDeviceAllocator::free(void* memory) {
103   VLOG(2) << "Deallocating @ " << memory;
104   // allocated memory adjusted for alignment, restore the original pointer
105   if (memory) {
106     auto alloc_mem = mem_map_.find(memory);
107     if (alloc_mem != mem_map_.end()) {
108       memory = alloc_mem->second;
109       mem_map_.erase(alloc_mem->first);
110     }
111     allocator_->DeallocateRaw(memory);
112   }
113 }
114 
115 }  // namespace tensorrt
116 }  // namespace tensorflow
117 
118 #endif  // GOOGLE_TENSORRT
119 #endif  // GOOGLE_CUDA
120