1 /* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
2 
3 Licensed under the Apache License, Version 2.0 (the "License");
4 you may not use this file except in compliance with the License.
5 You may obtain a copy of the License at
6 
7     http://www.apache.org/licenses/LICENSE-2.0
8 
9 Unless required by applicable law or agreed to in writing, software
10 distributed under the License is distributed on an "AS IS" BASIS,
11 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 See the License for the specific language governing permissions and
13 limitations under the License.
14 ==============================================================================*/
15 #include "tensorflow/core/common_runtime/collective_rma_local.h"
16 
17 #include "tensorflow/core/common_runtime/copy_tensor.h"
18 #include "tensorflow/core/common_runtime/dma_helper.h"
19 
20 namespace tensorflow {
21 
StartAbort(const Status & s)22 void CollectiveRemoteAccessLocal::StartAbort(const Status& s) {
23   buf_rendezvous_.StartAbort(s);
24 }
25 
RecvFromPeer(const string & peer_device,const string & peer_task,bool peer_is_local,const string & key,Device * to_device,DeviceContext * to_device_ctx,const AllocatorAttributes & to_alloc_attr,Tensor * to_tensor,const DeviceLocality & client_locality,int dev_to_dev_stream_index,const StatusCallback & done)26 void CollectiveRemoteAccessLocal::RecvFromPeer(
27     const string& peer_device, const string& peer_task, bool peer_is_local,
28     const string& key, Device* to_device, DeviceContext* to_device_ctx,
29     const AllocatorAttributes& to_alloc_attr, Tensor* to_tensor,
30     const DeviceLocality& client_locality, int dev_to_dev_stream_index,
31     const StatusCallback& done) {
32   VLOG(1) << "RecvFromPeer " << this << " from " << peer_device << " key "
33           << key;
34   if (!peer_is_local) {
35     done(
36         errors::Internal("CollectiveRemoteAccessLocal::RecvFromPeer "
37                          "called with peer_is_local=false"));
38     return;
39   }
40   buf_rendezvous_.ConsumeBuf(
41       key, [to_tensor, to_device_ctx, to_device, to_alloc_attr,
42             dev_to_dev_stream_index,
43             done](const Status& s, BufRendezvous::Hook* hook) {
44         if (!s.ok()) {
45           done(s);
46           delete hook;
47         } else {
48           int64 recv_bytes = to_tensor->TotalBytes();
49           CHECK_EQ(recv_bytes, hook->prod_value->TotalBytes());
50           MemCpyAsync(hook->prod_ctx,    // src DeviceContext
51                       to_device_ctx,     // dst DeviceContext
52                       hook->prod_dev,    // src Device
53                       to_device,         // dst Device
54                       hook->prod_attr,   // src AllocatorAttributes
55                       to_alloc_attr,     // dst AllocatorAttributes
56                       hook->prod_value,  // src Tensor*
57                       to_tensor,         // dst Tensor*
58                       dev_to_dev_stream_index, [hook, done](const Status& s) {
59                         // This callback may be executing in the GPUEventMgr
60                         // pool in which case it must be very short duration
61                         // and non-blocking (except e.g. for queue insertion).
62                         // It would be safer, though expensive, to transfer
63                         // to another thread here.
64                         done(s);
65                         BufRendezvous::DoneWithHook(hook);
66                       });
67         }
68       });
69 }
70 
PostToPeer(const string & peer_device,const string & peer_task,const string & key,Device * from_device,DeviceContext * from_device_ctx,const AllocatorAttributes & from_alloc_attr,const Tensor * from_tensor,const DeviceLocality & client_locality,const StatusCallback & done)71 void CollectiveRemoteAccessLocal::PostToPeer(
72     const string& peer_device, const string& peer_task, const string& key,
73     Device* from_device, DeviceContext* from_device_ctx,
74     const AllocatorAttributes& from_alloc_attr, const Tensor* from_tensor,
75     const DeviceLocality& client_locality, const StatusCallback& done) {
76   VLOG(1) << "PostToPeer " << this << " key " << key
77           << " step_id_=" << step_id_;
78   buf_rendezvous_.ProvideBuf(key, from_device, from_device_ctx, from_tensor,
79                              from_alloc_attr, done);
80 }
81 
82 /*static*/
MemCpyAsync(DeviceContext * src_dev_ctx,DeviceContext * dst_dev_ctx,Device * src_dev,Device * dst_dev,const AllocatorAttributes & src_attr,const AllocatorAttributes & dst_attr,const Tensor * src,Tensor * dst,int dev_to_dev_stream_index,const StatusCallback & done)83 void CollectiveRemoteAccessLocal::MemCpyAsync(
84     DeviceContext* src_dev_ctx, DeviceContext* dst_dev_ctx, Device* src_dev,
85     Device* dst_dev, const AllocatorAttributes& src_attr,
86     const AllocatorAttributes& dst_attr, const Tensor* src, Tensor* dst,
87     int dev_to_dev_stream_index, const StatusCallback& done) {
88   // We want a real copy to happen, i.e. the bytes inside of src should be
89   // transferred to the buffer backing dst.  If src and dst are on different
90   // devices then CopyTensor::ViaDMA will do just that.  But if they're both
91   // the same CPU, then it will actually just reset dst to point to src.
92   // Since this routine is used for copying between devices and within a
93   // device, we need to detect and bypass the wrong-semantics case.
94   const DeviceType src_device_type(
95       src_attr.on_host() ? DEVICE_CPU : src_dev->attributes().device_type());
96   const DeviceType dst_device_type(
97       dst_attr.on_host() ? DEVICE_CPU : dst_dev->attributes().device_type());
98   const bool non_cpu_src = src_device_type != DeviceType(DEVICE_CPU);
99   const bool non_cpu_dst = dst_device_type != DeviceType(DEVICE_CPU);
100   // For GPU devices when only one compute stream is used (the default)
101   // the OpKernelContext does not supply a DeviceContext.  It's assumed
102   // that all nodes use the default context.
103   if (src_dev_ctx == nullptr && src_device_type == DEVICE_GPU) {
104     const DeviceBase::GpuDeviceInfo* dev_info =
105         src_dev->tensorflow_gpu_device_info();
106     CHECK(dev_info);
107     src_dev_ctx = dev_info->default_context;
108   }
109   if (dst_dev_ctx == nullptr && dst_device_type == DEVICE_GPU) {
110     const DeviceBase::GpuDeviceInfo* dev_info =
111         src_dev->tensorflow_gpu_device_info();
112     CHECK(dev_info);
113     dst_dev_ctx = dev_info->default_context;
114   }
115   if (non_cpu_src) CHECK(src_dev_ctx);
116   if (non_cpu_dst) CHECK(dst_dev_ctx);
117   if (non_cpu_src || non_cpu_dst) {
118     CopyTensor::ViaDMA("",  // edge name (non-existent)
119                        src_dev_ctx, dst_dev_ctx, src_dev, dst_dev, src_attr,
120                        dst_attr, src, dst, dev_to_dev_stream_index, done);
121   } else {
122     int64 bytes = src->TotalBytes();
123     DCHECK_EQ(dst->TotalBytes(), bytes);
124     memcpy(DMAHelper::base(dst), DMAHelper::base(src), bytes);
125     done(Status::OK());
126   }
127 }
128 
129 }  // namespace tensorflow
130