1 /* Copyright 2015 The TensorFlow Authors. All Rights Reserved.
2 
3 Licensed under the Apache License, Version 2.0 (the "License");
4 you may not use this file except in compliance with the License.
5 You may obtain a copy of the License at
6 
7     http://www.apache.org/licenses/LICENSE-2.0
8 
9 Unless required by applicable law or agreed to in writing, software
10 distributed under the License is distributed on an "AS IS" BASIS,
11 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 See the License for the specific language governing permissions and
13 limitations under the License.
14 ==============================================================================*/
15 
16 #include "tensorflow/core/common_runtime/gpu/gpu_util.h"
17 
18 #include "tensorflow/core/common_runtime/copy_tensor.h"
19 #include "tensorflow/core/common_runtime/device.h"
20 #include "tensorflow/core/common_runtime/dma_helper.h"
21 #include "tensorflow/core/common_runtime/gpu/gpu_event_mgr.h"
22 #include "tensorflow/core/common_runtime/gpu/gpu_process_state.h"
23 #include "tensorflow/core/common_runtime/gpu_device_context.h"
24 #include "tensorflow/core/framework/tensor.h"
25 #include "tensorflow/core/framework/tensor.pb.h"
26 #include "tensorflow/core/framework/tensor_reference.h"
27 #include "tensorflow/core/framework/types.h"
28 #include "tensorflow/core/lib/core/errors.h"
29 #include "tensorflow/core/lib/core/refcount.h"
30 #include "tensorflow/core/lib/gtl/array_slice.h"
31 #include "tensorflow/core/lib/gtl/stl_util.h"
32 #include "tensorflow/core/lib/hash/hash.h"
33 #include "tensorflow/core/lib/strings/strcat.h"
34 #include "tensorflow/core/lib/strings/stringprintf.h"
35 #include "tensorflow/core/platform/logging.h"
36 #include "tensorflow/core/platform/stream_executor.h"
37 #include "tensorflow/core/platform/tensor_coding.h"
38 #include "tensorflow/core/platform/tracing.h"
39 #include "tensorflow/core/util/util.h"
40 
41 // IMPLEMENTATION NOTE:
42 //
// 1. Within this module, we intentionally LOG(FATAL) if any stream
//    involved in a memcpy becomes !stream->ok(), because a TF process
//    today (1/2016) cannot properly recover from such an error.
46 //
// 2. When a 0-size tensor is being copied, we should not schedule a
//    ThenMemcpy since there are no bytes to move. However, we must still
//    preserve causal ordering by arranging for the copy-done callback to
//    happen after all activity already scheduled on the given stream has
//    finished.
52 
// If this needs to be runtime configurable, consider adding an option to
// ConfigProto.
55 const tensorflow::int64 FLAGS_brain_gpu_util_debug_string_maxlen = 128;
56 extern bool FLAGS_brain_gpu_record_mem_types;
57 
58 namespace tensorflow {
59 
60 using se::DeviceMemoryBase;
61 using se::Stream;
62 
PrepareCopy(Device * device,const DeviceContext * ctx,const Tensor & src,const Tensor * dst,const DeviceBase::GpuDeviceInfo ** dev_info,se::Stream ** stream)63 Status PrepareCopy(Device* device, const DeviceContext* ctx, const Tensor& src,
64                    const Tensor* dst,
65                    const DeviceBase::GpuDeviceInfo** dev_info,
66                    se::Stream** stream) {
67   if (device == nullptr) {
68     return errors::Internal("Unexpected null device.");
69   }
70   auto di = device->tensorflow_gpu_device_info();
71   if (di == nullptr) {
72     return errors::Internal("Unexpected null device info.");
73   }
74   *dev_info = di;
75   if (ctx == nullptr) {
76     return errors::Internal("Unexpected null device context.");
77   }
78   auto gs = static_cast<const GPUDeviceContext*>(ctx)->stream();
79   if (gs == nullptr) {
80     return errors::Internal("No gpu stream is available.");
81   }
82   *stream = gs;
83   if (dst != nullptr) {
84     if (src.dtype() != dst->dtype()) {
85       return errors::Internal("Can't copy a tensor of ",
86                               DataTypeString(src.dtype()), " into a tensor of ",
87                               DataTypeString(dst->dtype()));
88     }
89     if (src.TotalBytes() != dst->TotalBytes()) {
90       return errors::Internal("Can't copy ", src.TotalBytes(),
91                               " bytes of a tensor into another with ",
92                               dst->TotalBytes(), " bytes buffer.");
93     }
94     if ((src.TotalBytes() > 0) && !src.IsInitialized()) {
95       return errors::Internal("Src tensor is not initialized.");
96     }
97     if ((dst->TotalBytes() > 0) && !dst->IsInitialized()) {
98       return errors::Internal("Dst tensor is not initialized.");
99     }
100   }
101   if (!DMAHelper::CanUseDMA(&src)) {
102     return errors::Internal("GPU copy from non-DMA ",
103                             DataTypeString(src.dtype()), "tensor");
104   }
105   return Status::OK();
106 }
107 
GetBase(const Tensor * src)108 void* GetBase(const Tensor* src) {
109   return const_cast<void*>(DMAHelper::base(src));
110 }
111 
GetBase(Tensor * dst)112 void* GetBase(Tensor* dst) { return DMAHelper::base(dst); }
113 
114 /*static*/
SetProtoFromGPU(const Tensor & tensor,Device * dev,const DeviceContext * device_context,TensorProto * proto,bool is_dead,StatusCallback done)115 void GPUUtil::SetProtoFromGPU(const Tensor& tensor, Device* dev,
116                               const DeviceContext* device_context,
117                               TensorProto* proto, bool is_dead,
118                               StatusCallback done) {
119   VLOG(1) << "SetProtoFromGPU device_context " << device_context;
120   const DeviceBase::GpuDeviceInfo* dev_info = nullptr;
121   se::Stream* send_stream = nullptr;
122   Status s = PrepareCopy(dev, device_context, tensor, nullptr, &dev_info,
123                          &send_stream);
124   if (!s.ok()) {
125     done(s);
126     return;
127   }
128 
129   auto send_device_to_host_stream =
130       static_cast<const GPUDeviceContext*>(device_context)
131           ->device_to_host_stream();
132   if (send_device_to_host_stream == nullptr) {
133     done(errors::Internal("No send gpu copy-out-stream is available."));
134     return;
135   }
136   // Wait for the sender's main stream to make sure the data are available.
137   send_device_to_host_stream->ThenWaitFor(send_stream);
138 
139   // Tensor values need to be copied from GPU to CPU ram so that
140   // we can build the protobuf response for a RecvTensor RPC.
141   // "device context" identifies the stream where the _Send op executed.
142   proto->set_dtype(tensor.dtype());
143   tensor.shape().AsProto(proto->mutable_tensor_shape());
144 
145   // Prepare a proto with the right data buf size, and DMA the data
146   // over from the GPU buffer.  Note that 0-size tensors do not have a
147   // backing buffer.
148   Allocator* alloc = nullptr;
149   char* buf = nullptr;
150   const int64 total_bytes = is_dead ? 0 : tensor.TotalBytes();
151   if (total_bytes > 0) {
152     tracing::ScopedAnnotation annotation("SetProtoFromGPU");
153     alloc = GPUProcessState::singleton()->GetGpuHostAllocator(0);
154     buf = alloc->Allocate<char>(total_bytes);
155     if (LogMemory::IsEnabled()) {
156       LogMemory::RecordRawAllocation("SetProtoFromGPU",
157                                      LogMemory::PROTO_BUFFER_STEP_ID,
158                                      total_bytes, buf, alloc);
159     }
160     void* src_ptr = GetBase(&tensor);
161     DeviceMemoryBase gpu_src_ptr(src_ptr, total_bytes);
162     send_device_to_host_stream->ThenMemcpy(buf, gpu_src_ptr, total_bytes);
163   }
164   // Use of tensor may outlive stack scope, so keep a ref.
165   TensorReference tensor_ref(tensor);
166   dev_info->event_mgr->ThenExecute(
167       send_device_to_host_stream, [send_device_to_host_stream, done, proto, buf,
168                                    total_bytes, alloc, tensor_ref]() {
169         if (!send_device_to_host_stream->ok()) {
170           LOG(FATAL) << "SetProtoFromGPU: GPU Memcpy failed";
171         }
172         tensor_ref.Unref();
173         if (total_bytes > 0) {
174           port::CopyFromArray(proto->mutable_tensor_content(), buf,
175                               total_bytes);
176           if (LogMemory::IsEnabled()) {
177             LogMemory::RecordRawDeallocation("SetProtoFromGPU",
178                                              LogMemory::PROTO_BUFFER_STEP_ID,
179                                              buf, alloc, false);
180           }
181           alloc->Deallocate<char>(buf, total_bytes);
182         }
183         done(Status::OK());
184       });
185 }
186 
187 // static
DeviceToDeviceCopy(DeviceContext * send_dev_context,DeviceContext * recv_dev_context,Device * src,Device * dst,AllocatorAttributes src_alloc_attr,AllocatorAttributes dst_alloc_attr,const Tensor * input,Tensor * output,int dev_to_dev_stream_index,StatusCallback done)188 void GPUUtil::DeviceToDeviceCopy(
189     DeviceContext* send_dev_context, DeviceContext* recv_dev_context,
190     Device* src, Device* dst, AllocatorAttributes src_alloc_attr,
191     AllocatorAttributes dst_alloc_attr, const Tensor* input, Tensor* output,
192     int dev_to_dev_stream_index, StatusCallback done) {
193   const DeviceBase::GpuDeviceInfo* dev_info = nullptr;
194   se::Stream* send_stream = nullptr;
195   Status s = PrepareCopy(src, send_dev_context, *input, output, &dev_info,
196                          &send_stream);
197   if (!s.ok()) {
198     done(s);
199     return;
200   }
201   auto send_device_to_device_stream =
202       static_cast<const GPUDeviceContext*>(send_dev_context)
203           ->device_to_device_stream(dev_to_dev_stream_index);
204   if (send_device_to_device_stream == nullptr) {
205     done(errors::Internal("No send gpu copy-out-stream is available."));
206     return;
207   }
208   // Wait for the main stream on the sender to make sure the result is
209   // available.
210   send_device_to_device_stream->ThenWaitFor(send_stream);
211 
212   const int64 total_bytes = input->TotalBytes();
213   if (total_bytes > 0) {
214     void* src_ptr = GetBase(input);
215     DeviceMemoryBase gpu_src_ptr(src_ptr, total_bytes);
216     void* dst_ptr = GetBase(output);
217     DeviceMemoryBase gpu_dst_ptr(dst_ptr, total_bytes);
218     auto recv_stream =
219         static_cast<const GPUDeviceContext*>(recv_dev_context)->stream();
220     if (recv_stream == nullptr) {
221       done(errors::Internal("No recv gpu stream is available."));
222       return;
223     }
224     // Since we want to use the memory from recv_stream in the
225     // send_device_to_device_stream, add a dependency to make sure the memory is
226     // truly free.
227     // TODO(zhengxq): remove this dependency when we switch to a better way
228     // to make sure the memory is free.
229     send_device_to_device_stream->ThenWaitFor(recv_stream);
230 
231     VLOG(2) << "src_ptr " << src_ptr << " dst_ptr " << dst_ptr;
232     send_device_to_device_stream->ThenMemcpy(&gpu_dst_ptr, gpu_src_ptr,
233                                              total_bytes);
234   }
235 
236   // Use of input may outlive stack scope, so keep a ref.
237   TensorReference input_ref(*input);
238   dev_info->event_mgr->ThenExecute(
239       send_device_to_device_stream,
240       [done, send_device_to_device_stream, input_ref]() {
241         input_ref.Unref();
242         if (!send_device_to_device_stream->ok()) {
243           LOG(FATAL) << "GPU->GPU Memcpy failed";
244         }
245         done(Status::OK());
246       });
247   send_dev_context->MaintainLifetimeOnStream(input,
248                                              send_device_to_device_stream);
249 }
250 
251 static CopyTensor::Registration register_gpu_gpu_copy(
252     DEVICE_GPU, DEVICE_GPU, GPUUtil::DeviceToDeviceCopy);
253 
254 // static
CopyGPUTensorToCPU(Device * gpu_device,const DeviceContext * device_context,const Tensor * gpu_tensor,Tensor * cpu_tensor,StatusCallback done)255 void GPUUtil::CopyGPUTensorToCPU(Device* gpu_device,
256                                  const DeviceContext* device_context,
257                                  const Tensor* gpu_tensor, Tensor* cpu_tensor,
258                                  StatusCallback done) {
259   VLOG(1) << "CopyGPUTensorToCPU";
260   const DeviceBase::GpuDeviceInfo* dev_info = nullptr;
261   se::Stream* send_stream = nullptr;
262   Status s = PrepareCopy(gpu_device, device_context, *gpu_tensor, cpu_tensor,
263                          &dev_info, &send_stream);
264   if (!s.ok()) {
265     done(s);
266     return;
267   }
268 
269   auto send_device_to_host_stream =
270       static_cast<const GPUDeviceContext*>(device_context)
271           ->device_to_host_stream();
272   if (send_device_to_host_stream == nullptr) {
273     done(errors::Internal("No send gpu copy-out-stream is available."));
274     return;
275   }
276   // Wait for the sender's main stream to make sure the data are available.
277   send_device_to_host_stream->ThenWaitFor(send_stream);
278 
279   const int64 total_bytes = gpu_tensor->TotalBytes();
280   if (total_bytes > 0) {
281     void* src_ptr = GetBase(gpu_tensor);
282     DeviceMemoryBase gpu_src_ptr(src_ptr, total_bytes);
283     void* dst_ptr = GetBase(cpu_tensor);
284     send_device_to_host_stream->ThenMemcpy(dst_ptr, gpu_src_ptr, total_bytes);
285   }
286   // Use of the input may outlive stack scope, so keep a ref.
287   TensorReference input_ref(*gpu_tensor);
288   dev_info->event_mgr->ThenExecute(
289       send_device_to_host_stream,
290       [send_device_to_host_stream, done, input_ref]() {
291         if (!send_device_to_host_stream->ok()) {
292           LOG(FATAL) << "GPU->CPU Memcpy failed";
293         }
294         input_ref.Unref();
295         done(Status::OK());
296       });
297 }
298 
299 /*  static */
CopyCPUTensorToGPU(const Tensor * cpu_tensor,const DeviceContext * device_context,Device * gpu_device,Tensor * gpu_tensor,StatusCallback done)300 void GPUUtil::CopyCPUTensorToGPU(const Tensor* cpu_tensor,
301                                  const DeviceContext* device_context,
302                                  Device* gpu_device, Tensor* gpu_tensor,
303                                  StatusCallback done) {
304   VLOG(1) << "CopyCPUTensorToGPU";
305   const DeviceBase::GpuDeviceInfo* dev_info = nullptr;
306   se::Stream* recv_stream = nullptr;
307   Status s = PrepareCopy(gpu_device, device_context, *cpu_tensor, gpu_tensor,
308                          &dev_info, &recv_stream);
309   if (!s.ok()) {
310     done(s);
311     return;
312   }
313 
314   auto recv_host_to_device_stream =
315       static_cast<const GPUDeviceContext*>(device_context)
316           ->host_to_device_stream();
317   if (recv_host_to_device_stream == nullptr) {
318     done(errors::Internal("No send gpu copy-out-stream is available."));
319     return;
320   }
321   // Wait for the recv-stream to make sure the buffer is truly available.
322   recv_host_to_device_stream->ThenWaitFor(recv_stream);
323 
324   const int64 total_bytes = cpu_tensor->TotalBytes();
325   // Note that 0-size tensors have no backing buffer.
326   if (total_bytes > 0) {
327     void* src_ptr = GetBase(cpu_tensor);
328     void* dst_ptr = GetBase(gpu_tensor);
329     DeviceMemoryBase gpu_dst_ptr(dst_ptr, total_bytes);
330     recv_host_to_device_stream->ThenMemcpy(&gpu_dst_ptr, src_ptr, total_bytes);
331   }
332   // Use of cpu_tensor may outlive stack scope, so keep a ref.
333   TensorReference input_ref(*cpu_tensor);
334   dev_info->event_mgr->ThenExecute(
335       recv_host_to_device_stream,
336       [recv_host_to_device_stream, done, input_ref]() {
337         input_ref.Unref();
338         if (!recv_host_to_device_stream->ok()) {
339           LOG(FATAL) << "CPU->GPU Memcpy failed";
340         }
341         done(Status::OK());
342       });
343 }
344 
Sync(Device * gpu_device)345 Status GPUUtil::Sync(Device* gpu_device) {
346   VLOG(1) << "GPUUtil::Sync";
347   auto* dev_info = gpu_device->tensorflow_gpu_device_info();
348   if (!dev_info) {
349     return errors::Internal("Failed to find dest device GPUDeviceInfo");
350   }
351   return dev_info->stream->BlockHostUntilDone();
352 }
353 
SyncAll(Device * gpu_device)354 Status GPUUtil::SyncAll(Device* gpu_device) {
355   VLOG(1) << "GPUUtil::SyncAll";
356   auto* dev_info = gpu_device->tensorflow_gpu_device_info();
357   if (!dev_info) {
358     return errors::Internal("Failed to find dest device GPUDeviceInfo");
359   }
360   if (!dev_info->stream->parent()->SynchronizeAllActivity() ||
361       !dev_info->stream->ok()) {
362     return errors::Internal("GPU sync failed");
363   }
364   return Status::OK();
365 }
366 
MemoryDebugString(const Device * device,Tensor * tensor)367 string GPUUtil::MemoryDebugString(const Device* device, Tensor* tensor) {
368   string ret;
369   CHECK(tensor);
370   const int64 num_bytes = std::min<int64>(
371       FLAGS_brain_gpu_util_debug_string_maxlen, tensor->TotalBytes());
372   void* ptr = (num_bytes > 0) ? GetBase(tensor) : nullptr;
373   strings::Appendf(&ret, "%p:", ptr);
374   if (num_bytes > 0) {
375     auto* dev_info = device->tensorflow_gpu_device_info();
376     if (!dev_info) {
377       strings::StrAppend(
378           &ret, PrintMemory(reinterpret_cast<const char*>(ptr), num_bytes));
379     } else {
380       string buf;
381       buf.resize(num_bytes);
382       DeviceMemoryBase gpu_ptr(ptr, num_bytes);
383       auto s = dev_info->stream->parent()->SynchronousMemcpyD2H(
384           gpu_ptr, num_bytes, gtl::string_as_array(&buf));
385       strings::StrAppend(&ret,
386                          PrintMemory(gtl::string_as_array(&buf), num_bytes));
387     }
388   }
389   return ret;
390 }
391 
392 // TODO(pbar) Checksum is called from places without a valid device context.
Checksum(Device * gpu_device,const DeviceContext * device_context,const Tensor & tensor)393 uint64 GPUUtil::Checksum(Device* gpu_device,
394                          const DeviceContext* device_context,
395                          const Tensor& tensor) {
396   Tensor copy(tensor.dtype(), tensor.shape());
397   Status s;
398   Notification n;
399   CopyGPUTensorToCPU(gpu_device, device_context, &tensor, &copy,
400                      [&s, &n](Status status) {
401                        s.Update(status);
402                        n.Notify();
403                      });
404   n.WaitForNotification();
405   CHECK(s.ok()) << s;
406   return Checksum(copy);
407 }
408 
Checksum(const Tensor & tensor)409 uint64 GPUUtil::Checksum(const Tensor& tensor) {
410   const float* fptr = reinterpret_cast<const float*>(GetBase(&tensor));
411   size_t num_bytes = tensor.TotalBytes();
412   size_t num_floats = num_bytes / sizeof(float);
413   for (size_t i = 0; i < num_floats; ++i) {
414     CHECK(!std::isnan(fptr[i])) << " i " << i;
415   }
416   // TODO(tucker): consider using crc32c instead.
417   return Hash64(reinterpret_cast<const char*>(GetBase(&tensor)),
418                 tensor.TotalBytes(), 0);
419 }
420 
421 // static
CopyGPUTensorToSameGPU(Device * gpu_device,const DeviceContext * device_context,const Tensor * src_gpu_tensor,Tensor * dst_gpu_tensor,StatusCallback done)422 void GPUUtil::CopyGPUTensorToSameGPU(Device* gpu_device,
423                                      const DeviceContext* device_context,
424                                      const Tensor* src_gpu_tensor,
425                                      Tensor* dst_gpu_tensor,
426                                      StatusCallback done) {
427   VLOG(1) << "CopyGPUTensorToSameGPU";
428   const DeviceBase::GpuDeviceInfo* dev_info = nullptr;
429   se::Stream* send_stream = nullptr;
430   Status s = PrepareCopy(gpu_device, device_context, *src_gpu_tensor,
431                          dst_gpu_tensor, &dev_info, &send_stream);
432   if (!s.ok()) {
433     done(s);
434     return;
435   }
436 
437   const int64 total_bytes = src_gpu_tensor->TotalBytes();
438   if (total_bytes > 0) {
439     void* src_ptr = GetBase(src_gpu_tensor);
440     DeviceMemoryBase gpu_src_ptr(src_ptr, total_bytes);
441     void* dst_ptr = GetBase(dst_gpu_tensor);
442     DeviceMemoryBase gpu_dst_ptr(dst_ptr, total_bytes);
443     send_stream->ThenMemcpy(&gpu_dst_ptr, gpu_src_ptr, total_bytes);
444   }
445 
446   done(Status::OK());
447 }
448 
449 }  // namespace tensorflow
450