1 /* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
2 
3 Licensed under the Apache License, Version 2.0 (the "License");
4 you may not use this file except in compliance with the License.
5 You may obtain a copy of the License at
6 
7     http://www.apache.org/licenses/LICENSE-2.0
8 
9 Unless required by applicable law or agreed to in writing, software
10 distributed under the License is distributed on an "AS IS" BASIS,
11 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 See the License for the specific language governing permissions and
13 limitations under the License.
14 ==============================================================================*/
15 
16 #include "tensorflow/compiler/xla/service/gpu/outfeed_thunk.h"
17 
18 #include "tensorflow/compiler/xla/literal.h"
19 #include "tensorflow/compiler/xla/service/gpu/hlo_execution_profiler.h"
20 #include "tensorflow/compiler/xla/service/gpu/outfeed_manager.h"
21 #include "tensorflow/compiler/xla/service/hlo_instruction.h"
22 #include "tensorflow/compiler/xla/util.h"
23 #include "tensorflow/core/platform/stream_executor_no_cuda.h"
24 
25 namespace xla {
26 namespace gpu {
27 
OutfeedThunk(ThunkInfo thunk_info,std::vector<ShapedSlice> source_slices)28 OutfeedThunk::OutfeedThunk(ThunkInfo thunk_info,
29                            std::vector<ShapedSlice> source_slices)
30     : Thunk(Kind::kOutfeed, thunk_info),
31       source_slices_(std::move(source_slices)) {}
32 
ExecuteOnStream(const ExecuteParams & params)33 Status OutfeedThunk::ExecuteOnStream(const ExecuteParams& params) {
34   auto& stream = *params.stream;
35   auto& buffer_allocations = *params.buffer_allocations;
36 
37   VLOG(2) << "Outfeeding from GPU";
38 
39   auto op_profiler =
40       params.profiler->MakeScopedInstructionProfiler(profile_index());
41   OutfeedManager* outfeed_manager = GetOrCreateOutfeedManager();
42   ShapeTree<std::unique_ptr<OutfeedBuffer>>* output_buffers =
43       outfeed_manager->BlockingGetNextDestination();
44 
45   // Nothing to be done for an outfeed with no inputs.
46   // Note: Cannot do this before `BlockingGetNextDestination` above to dequeue
47   // an entry from the outfeed manager.
48   if (source_slices_.empty()) {
49     return Status::OK();
50   }
51 
52   const int64 leaf_count = output_buffers->leaf_count();
53   TF_RET_CHECK(source_slices_.size() == leaf_count)
54       << "Mismatch between number of outfeed inputs (" << source_slices_.size()
55       << ") and outputs (" << leaf_count << ")";
56 
57   auto output_leaf_it = output_buffers->leaf_begin();
58   for (int64 index = 0; index < leaf_count; ++index) {
59     // Assert that the shapes are compatible.
60     const ShapeIndex& shape_index = output_leaf_it->first;
61     std::unique_ptr<OutfeedBuffer>& buffer = output_leaf_it->second;
62 
63     // NOTE: This code needs deal with the `output_buffers` object getting
64     // deleted when its executing. Specifically, objects in the outfeed queue
65     // are pointers to instance of stack allocated objects in
66     // `GpuTransferManager::TransferLiteralFromOutfeed`. When all leaf node
67     // buffers are notified via "buffer->Done()" below in the stream host
68     // callback, `TransferLiteralFromOutfeed` deletes this stack allocated
69     // object when it returns. This means that its possible that during the last
70     // iteration, after the call to "buffer->Done()" is scheduled onto the
71     // stream, the `output_buffers` object might get deleted, so we should avoid
72     // accessing the object after that.
73     //
74     // To achieve that, increment the leaf iterator here before the last "Done"
75     // is enqueued, instead of in the loop increment, which would be after the
76     // "Done" is scheduled.
77     ++output_leaf_it;
78     const Shape& output_shape =
79         ShapeUtil::GetSubshape(output_buffers->shape(), shape_index);
80     TF_RET_CHECK(ShapeUtil::Equal(source_slices_[index].shape, output_shape))
81         << "Mismatch between outfeed output buffer shape "
82         << ShapeUtil::HumanStringWithLayout(output_shape)
83         << " and outfeed source buffer shape "
84         << ShapeUtil::HumanStringWithLayout(source_slices_[index].shape);
85 
86     BufferAllocation::Slice source_slice = source_slices_[index].slice;
87     if (!source_slice.allocation())
88       return InternalError("outfeed source missing buffer allocation");
89     se::DeviceMemoryBase data_address =
90         buffer_allocations.GetDeviceAddress(source_slice);
91 
92     // TODO(b/111309141): Run this on a separate stream so it doesn't block
93     // the GPU from doing work during the transfer. This could be handled by
94     // making StreamAssignment do something intelligent with outfeed thunks.
95     stream
96         .ThenMemcpy(buffer->destination()->untyped_data(), data_address,
97                     buffer->length())
98         .ThenDoHostCallback([&buffer]() { buffer->Done(); });
99   }
100 
101   Status block_status = stream.BlockHostUntilDone();
102   if (!block_status.ok()) {
103     return InternalError("Failed to complete data transfer on stream %p: %s",
104                          &stream, block_status.error_message());
105   }
106 
107   VLOG(2) << "Outfeeding from GPU complete";
108   return Status::OK();
109 }
110 
111 }  // namespace gpu
112 }  // namespace xla
113