1 /* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
2 
3 Licensed under the Apache License, Version 2.0 (the "License");
4 you may not use this file except in compliance with the License.
5 You may obtain a copy of the License at
6 
7     http://www.apache.org/licenses/LICENSE-2.0
8 
9 Unless required by applicable law or agreed to in writing, software
10 distributed under the License is distributed on an "AS IS" BASIS,
11 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 See the License for the specific language governing permissions and
13 limitations under the License.
14 ==============================================================================*/
15 
16 #include "tensorflow/compiler/xla/service/gpu/gpu_executable.h"
17 
18 #include <set>
19 #include <utility>
20 #include <vector>
21 
22 #include "absl/container/flat_hash_map.h"
23 #include "absl/memory/memory.h"
24 #include "tensorflow/compiler/xla/map_util.h"
25 #include "tensorflow/compiler/xla/service/gpu/buffer_allocations.h"
26 #include "tensorflow/compiler/xla/service/gpu/gpu_constants.h"
27 #include "tensorflow/compiler/xla/service/gpu/gpu_executable_run_options.h"
28 #include "tensorflow/compiler/xla/service/gpu/gpu_types.h"
29 #include "tensorflow/compiler/xla/service/gpu/hlo_execution_profiler.h"
30 #include "tensorflow/compiler/xla/service/hlo_instruction.h"
31 #include "tensorflow/compiler/xla/service/llvm_ir/buffer_assignment_util.h"
32 #include "tensorflow/compiler/xla/service/logical_buffer.h"
33 #include "tensorflow/compiler/xla/service/shaped_buffer.h"
34 #include "tensorflow/compiler/xla/service/transfer_manager.h"
35 #include "tensorflow/compiler/xla/service/xla_debug_info_manager.h"
36 #include "tensorflow/compiler/xla/shape_tree.h"
37 #include "tensorflow/compiler/xla/shape_util.h"
38 #include "tensorflow/compiler/xla/status_macros.h"
39 #include "tensorflow/compiler/xla/util.h"
40 #include "tensorflow/core/lib/gtl/map_util.h"
41 #include "tensorflow/core/platform/errors.h"
42 #include "tensorflow/core/platform/logging.h"
43 #include "tensorflow/core/profiler/lib/scoped_annotation.h"
44 #include "tensorflow/core/profiler/lib/traceme.h"
45 #include "tensorflow/stream_executor/platform.h"
46 
47 namespace xla {
48 namespace gpu {
49 namespace {
50 
51 using ::tensorflow::profiler::ScopedAnnotation;
52 
53 }  // namespace
54 
55 // Implementation note: HLO profiling is always enabled for GPU executables,
56 // since we can use timers around thunks.
GpuExecutable(GpuExecutable::Params params)57 GpuExecutable::GpuExecutable(GpuExecutable::Params params)
58     : Executable(std::move(params.debug_module),
59                  std::move(params.hlo_profile_printer_data),
60                  std::move(params.hlo_profile_index_map)),
61       text_(std::move(params.asm_text)),
62       binary_(std::move(params.binary)),
63       gpu_version_(params.gpu_version),
64       thunk_schedule_(std::move(params.thunk_schedule)),
65       module_name_(params.module_name),
66       output_shape_(params.output_shape),
67       allocations_(std::move(params.allocations)),
68       debug_buffer_assignment_(std::move(params.debug_buffer_assignment)),
69       entry_computation_profile_index_(params.entry_computation_profile_index),
70       constants_(std::move(params.constants)),
71       output_info_(std::move(params.output_info)) {
72   XlaDebugInfoManager::Get()->RegisterModule(module_name_, shared_module(),
73                                              debug_buffer_assignment_);
74 }
75 
~GpuExecutable()76 GpuExecutable::~GpuExecutable() {
77   XlaDebugInfoManager::Get()->UnregisterModule(module_name_, shared_module(),
78                                                debug_buffer_assignment_);
79 
80   {
81     // We could have issued host->device mem copies in ResolveConstantGlobals.
82     // Wait for those to finish so that we can safely deallocate the backing HLO
83     // module.
84     //
85     // We need for the host->device memcpies to finish they are concurrently
86     // reading memory (xla::Literal's) owned by the HLO module.
87     tensorflow::mutex_lock lock(module_handle_mutex_);
88     for (const auto& pair : module_globals_) {
89       CHECK(pair.first->SynchronizeAllActivity());
90     }
91   }
92 }
93 
CheckCompatibilityWithServiceExecutableRunOptions(const ServiceExecutableRunOptions * run_options)94 Status GpuExecutable::CheckCompatibilityWithServiceExecutableRunOptions(
95     const ServiceExecutableRunOptions* run_options) {
96   se::Stream* main_stream = run_options->stream();
97 
98   stream_executor::PlatformKind platform_kind =
99       main_stream->parent()->platform_kind();
100   if (platform_kind == stream_executor::PlatformKind::kROCm) {
101     int stream_isa_version;
102     main_stream->parent()->GetDeviceDescription().rocm_amdgpu_isa_version(
103         &stream_isa_version);
104     int gpu_exec_isa_version =
105         absl::get<std::pair<int, std::string>>(gpu_version_).first;
106     TF_RET_CHECK(stream_isa_version == gpu_exec_isa_version)
107         << "AMDGPU GCN ISA version mismatch; expected {" << gpu_exec_isa_version
108         << ", but was " << stream_isa_version;
109   } else if (platform_kind == stream_executor::PlatformKind::kCuda) {
110     std::pair<int, int> stream_compute_compatibility;
111     main_stream->parent()->GetDeviceDescription().cuda_compute_capability(
112         &stream_compute_compatibility.first,
113         &stream_compute_compatibility.second);
114     GpuVersion nvidia_compute_compatibility = stream_compute_compatibility;
115     TF_RET_CHECK(nvidia_compute_compatibility == gpu_version_)
116         << "Compute capability mismatch; expected {"
117         << absl::get<std::pair<int, int>>(gpu_version_).first << ", "
118         << absl::get<std::pair<int, int>>(gpu_version_).second << "}, but was {"
119         << stream_compute_compatibility.first << ", "
120         << stream_compute_compatibility.second << "}";
121   } else {
122     return InternalError("Unknown platform: %d", platform_kind);
123   }
124 
125   return Status::OK();
126 }
127 
ExecuteThunks(const ServiceExecutableRunOptions * run_options,const BufferAllocations & buffer_allocations,bool block_host_until_done,HloExecutionProfile * hlo_execution_profile)128 Status GpuExecutable::ExecuteThunks(
129     const ServiceExecutableRunOptions* run_options,
130     const BufferAllocations& buffer_allocations, bool block_host_until_done,
131     HloExecutionProfile* hlo_execution_profile) {
132   TF_RETURN_IF_ERROR(
133       CheckCompatibilityWithServiceExecutableRunOptions(run_options));
134   XlaDebugInfoManager::Get()->OnModuleStart(module_name_);
135   auto cleanup = MakeCleanup(
136       [&]() { XlaDebugInfoManager::Get()->OnModuleStop(module_name_); });
137 
138   se::Stream* main_stream = run_options->stream();
139   se::StreamExecutor* executor = main_stream->parent();
140 
141   bool do_profile = hlo_execution_profile != nullptr;
142   if (do_profile) {
143     LOG(WARNING) << "PROFILING: profiling is enabled";
144   }
145 
146   // Stream 0 indicates `main_stream` and substreams start from stream 1.
147   std::vector<StreamPool::Ptr> sub_streams;
148   sub_streams.reserve(thunk_schedule_->StreamCount() - 1);
149   while (sub_streams.size() + 1 < thunk_schedule_->StreamCount()) {
150     sub_streams.emplace_back();
151     TF_ASSIGN_OR_RETURN(sub_streams.back(),
152                         run_options->BorrowStream(executor->device_ordinal()));
153     // Require substreams to wait for the main stream, otherwise substreams may
154     // execute before the program is scheduled to start on the main stream.
155     sub_streams.back()->ThenWaitFor(main_stream);
156   }
157 
158   HloExecutionProfiler profiler(do_profile, hlo_execution_profile, main_stream,
159                                 sub_streams, entry_computation_profile_index_);
160   uint64 start_micros = tensorflow::Env::Default()->NowMicros();
161 
162   tensorflow::profiler::TraceMe hlo_module_activity(
163       [&] { return absl::StrCat(module_name_, ":XLA GPU module"); },
164       tensorflow::profiler::TraceMeLevel::kInfo);
165 
166   std::map<const Thunk*, std::unique_ptr<se::Event>> thunk_to_finish_event;
167   std::vector<std::function<void()>> deferred_host_callbacks;
168   for (Thunk* thunk : thunk_schedule_->TotalOrder()) {
169     // Annotate execution of this op if tracing was enabled when we started
170     // running this module.  If tracing is enabled *while* we're running the
171     // module, we won't get any data, but that's probably an OK trade-off.
172     ScopedAnnotation annotation([&] { return thunk->profile_annotation(); });
173 
174     int32 stream_no = thunk_schedule_->StreamNumberForThunk(thunk);
175     se::Stream* stream =
176         (stream_no == 0 ? main_stream : sub_streams[stream_no - 1].get());
177 
178     for (const Thunk* dependency : thunk_schedule_->DependsOn(thunk)) {
179       stream->ThenWaitFor(FindOrDie(thunk_to_finish_event, dependency).get());
180     }
181 
182     VLOG(2) << "Executing the thunk for " << thunk->profile_annotation()
183             << " on stream " << stream_no;
184     const GpuExecutableRunOptions* gpu_options =
185         run_options->run_options().gpu_executable_run_options();
186     Thunk::ExecuteParams thunk_params{
187         &buffer_allocations,
188         stream,
189         run_options->run_options().run_id(),
190         &profiler,
191         run_options->run_options().device_assignment(),
192         &deferred_host_callbacks,
193         gpu_options && gpu_options->gpu_global_device_ids()
194             ? &*gpu_options->gpu_global_device_ids()
195             : nullptr,
196         gpu_options && gpu_options->nccl_unique_id_callback()
197             ? &gpu_options->nccl_unique_id_callback()
198             : nullptr};
199     TF_RETURN_IF_ERROR(thunk->ExecuteOnStream(thunk_params));
200     if (thunk_schedule_->Depended(thunk)) {
201       auto finish_event = absl::make_unique<se::Event>(main_stream->parent());
202       finish_event->Init();
203       stream->ThenRecordEvent(finish_event.get());
204       thunk_to_finish_event[thunk] = std::move(finish_event);
205     }
206   }
207 
208   main_stream->ThenWaitFor(&sub_streams);
209   if (!deferred_host_callbacks.empty()) {
210     auto fn = [deferred_host_callbacks{std::move(deferred_host_callbacks)}]() {
211       for (auto& callback : deferred_host_callbacks) {
212         callback();
213       }
214     };
215     if (run_options->run_options().then_execute_function()) {
216       (*run_options->run_options().then_execute_function())(main_stream,
217                                                             std::move(fn));
218     } else {
219       main_stream->ThenDoHostCallback(std::move(fn));
220     }
221   }
222   // Make sure kernels are completed before deallocating temporary buffers or
223   // the profiler state.
224   // TODO(b/30100571): we could potentially postpone deallocating the temp
225   // buffers until a different computation is executed.
226   if (do_profile || block_host_until_done) {
227     Status block_status = main_stream->BlockHostUntilDone();
228     if (!block_status.ok()) {
229       return InternalError(
230           "Failed to complete all kernels launched on stream %p: %s",
231           main_stream, block_status.error_message());
232     }
233   }
234 
235   // FinishExecution() blocks until main_stream has completed if profiling is
236   // enabled; we therefore do not need to defer profile collection onto a
237   // stream.
238   profiler.FinishExecution();
239   uint64 end_micros = tensorflow::Env::Default()->NowMicros();
240 
241   if (run_options->run_options().execution_profile()) {
242     ExecutionProfile* profile = run_options->run_options().execution_profile();
243     const double nanoseconds = (end_micros - start_micros) * 1000.0;
244     profile->set_compute_time_ns(std::max(nanoseconds, 1.0));
245 
246     // If hlo profiling was disabled then the cycle count is left empty.
247     if (do_profile) {
248       profile->set_compute_cycle_count(hlo_execution_profile->GetCyclesTakenBy(
249           entry_computation_profile_index_));
250     }
251   }
252 
253   return Status::OK();
254 }
255 
256 StatusOr<const GpuExecutable::BufferAllocToDeviceMemoryMap*>
ResolveConstantGlobals(se::Stream * stream)257 GpuExecutable::ResolveConstantGlobals(se::Stream* stream) {
258   se::StreamExecutor* executor = stream->parent();
259 
260   tensorflow::mutex_lock lock(module_handle_mutex_);
261   auto it = module_globals_.find(executor);
262   if (it != module_globals_.end()) {
263     return &it->second;
264   }
265 
266   se::MultiModuleLoaderSpec module_spec;
267   if (!binary().empty()) {
268     module_spec.AddCudaCubinInMemory(binary());
269   }
270   module_spec.AddCudaPtxInMemory(text().c_str());
271 
272   absl::flat_hash_map<int64, se::DeviceMemoryBase> globals;
273   if (executor->platform_kind() == se::PlatformKind::kCuda &&
274       module_spec.cuda_ptx_in_memory() == nullptr) {
275     // No custom PTX => no globals.
276     return &module_globals_.emplace(executor, std::move(globals)).first->second;
277   }
278 
279   se::ModuleHandle module_handle;
280   TF_RETURN_IF_ERROR(executor->LoadModule(module_spec, &module_handle));
281 
282   for (const auto& info : constants_) {
283     TF_ASSIGN_OR_RETURN(auto global, executor->GetUntypedSymbol(
284                                          info.symbol_name, module_handle));
285     VLOG(3) << "Resolved global " << info.symbol_name << " to "
286             << global.opaque();
287 
288     if (!info.content.empty()) {
289       stream->ThenMemcpy(&global, info.content.data(), info.content.size());
290     }
291 
292     if (info.allocation_index != -1) {
293       InsertOrDie(&globals, info.allocation_index, global);
294     }
295   }
296 
297   module_handles_.emplace(executor,
298                           se::ScopedModuleHandle(executor, module_handle));
299   return &module_globals_.emplace(executor, std::move(globals)).first->second;
300 }
301 
BufferForAllocation(VariantArguments arguments,const GpuExecutable::BufferAllocToDeviceMemoryMap * globals,const BufferAllocation & allocation,se::DeviceMemoryAllocator * const memory_allocator,int device_ordinal,int64 arg_idx)302 StatusOr<se::DeviceMemoryBase> GpuExecutable::BufferForAllocation(
303     VariantArguments arguments,
304     const GpuExecutable::BufferAllocToDeviceMemoryMap* globals,
305     const BufferAllocation& allocation,
306     se::DeviceMemoryAllocator* const memory_allocator, int device_ordinal,
307     int64 arg_idx) {
308   if (allocation.is_thread_local()) {
309     return se::DeviceMemoryBase{};
310   } else if (allocation.is_entry_computation_parameter()) {
311     int64 param_no = allocation.parameter_number();
312     se::DeviceMemoryBase registered_buffer = [&] {
313       if (auto unowned_shapedbuffers =
314               absl::get_if<absl::Span<const ShapedBuffer* const>>(&arguments)) {
315         return (*unowned_shapedbuffers)[param_no]->buffers().element(
316             allocation.param_shape_index());
317       } else {
318         return absl::get<absl::Span<ExecutionInput>>(arguments)[param_no]
319             .Buffer(allocation.param_shape_index())
320             .AsDeviceMemoryBase();
321       }
322     }();
323     if (registered_buffer.is_null() && registered_buffer.size() > 0) {
324       return FailedPrecondition(
325           "Cannot run XLA computation because pointer to (sub-)buffer at "
326           "index %s of parameter %d was null.  All pointers to "
327           "(sub-)buffers must not be null, unless the (sub-)buffer has "
328           "zero elements.",
329           allocation.param_shape_index().ToString(), param_no);
330     }
331     return registered_buffer;
332   } else if (allocation.is_constant()) {
333     auto it = globals->find(arg_idx);
334     if (it == globals->end()) {
335       return se::DeviceMemoryBase();
336     }
337     return it->second;
338   } else {
339     // Allocate each allocation that might escape, or is the temp buffer.
340     CHECK(allocation.maybe_live_out() || allocation.IsPreallocatedTempBuffer());
341     const int64 buffer_size = allocation.size();
342     se::DeviceMemoryBase buffer_address;
343     if (buffer_size > 0) {
344       TF_ASSIGN_OR_RETURN(
345           se::OwningDeviceMemory buffer,
346           memory_allocator->Allocate(device_ordinal, buffer_size));
347       buffer_address = buffer.Release();
348     }
349     return buffer_address;
350   }
351 }
352 
CheckAlignment(const BufferAllocation & allocation,se::DeviceMemoryBase buffer,int arg_idx)353 static Status CheckAlignment(const BufferAllocation& allocation,
354                              se::DeviceMemoryBase buffer, int arg_idx) {
355   const int64 expected_alignment = [&] {
356     if (allocation.is_entry_computation_parameter()) {
357       return kEntryParameterAlignBytes;
358     } else if (allocation.is_constant()) {
359       return kConstantBufferAlignBytes;
360     } else {
361       return kXlaAllocatedBufferAlignBytes;
362     }
363   }();
364   if (!buffer.is_null() &&
365       reinterpret_cast<uintptr_t>(buffer.opaque()) % expected_alignment != 0) {
366     return InternalError(
367         "Address of buffer %d must be a multiple of %x, but "
368         "was %p",
369         arg_idx, expected_alignment, buffer.opaque());
370   }
371   return Status::OK();
372 }
373 
GenerateBufferAllocations(VariantArguments arguments,const GpuExecutable::BufferAllocToDeviceMemoryMap * globals,se::DeviceMemoryAllocator * const memory_allocator,se::StreamExecutor * executor)374 StatusOr<BufferAllocations> GpuExecutable::GenerateBufferAllocations(
375     VariantArguments arguments,
376     const GpuExecutable::BufferAllocToDeviceMemoryMap* globals,
377     se::DeviceMemoryAllocator* const memory_allocator,
378     se::StreamExecutor* executor) {
379   tensorflow::profiler::TraceMe hlo_module_activity(
380       [&] { return std::string("Build buffer allocations"); },
381       tensorflow::profiler::TraceMeLevel::kInfo);
382 
383   const int64 num_buffers = allocations_.size();
384   std::vector<se::DeviceMemoryBase> buffers;
385   buffers.reserve(num_buffers);
386   for (int64 i = 0; i < num_buffers; ++i) {
387     const BufferAllocation& allocation = allocations_[i];
388     TF_ASSIGN_OR_RETURN(
389         se::DeviceMemoryBase buffer,
390         BufferForAllocation(arguments, globals, allocation, memory_allocator,
391                             executor->device_ordinal(), i));
392     buffers.push_back(buffer);
393     TF_RETURN_IF_ERROR(CheckAlignment(allocation, buffer, i));
394   }
395   return {{buffers, executor->device_ordinal(), memory_allocator}};
396 }
397 
ExecuteAsyncOnStream(const ServiceExecutableRunOptions * run_options,std::vector<ExecutionInput> arguments,HloExecutionProfile * hlo_execution_profile)398 StatusOr<ExecutionOutput> GpuExecutable::ExecuteAsyncOnStream(
399     const ServiceExecutableRunOptions* run_options,
400     std::vector<ExecutionInput> arguments,
401     HloExecutionProfile* hlo_execution_profile) {
402   return ExecuteAsyncOnStreamImpl(run_options, absl::MakeSpan(arguments),
403                                   hlo_execution_profile);
404 }
405 
ExecuteAsyncOnStream(const ServiceExecutableRunOptions * run_options,absl::Span<const ShapedBuffer * const> arguments,HloExecutionProfile * hlo_execution_profile)406 StatusOr<ScopedShapedBuffer> GpuExecutable::ExecuteAsyncOnStream(
407     const ServiceExecutableRunOptions* run_options,
408     absl::Span<const ShapedBuffer* const> arguments,
409     HloExecutionProfile* hlo_execution_profile) {
410   TF_ASSIGN_OR_RETURN(
411       ExecutionOutput out,
412       ExecuteAsyncOnStreamImpl(run_options, arguments, hlo_execution_profile));
413   return out.ConsumeResult();
414 }
415 
// Shared implementation behind both ExecuteAsyncOnStream overloads.
//
// In order, this function:
//   1. resolves the constant globals for the executor that owns the run's
//      stream;
//   2. builds a BufferAllocations covering every entry of `allocations_`;
//   3. fills in the device buffer for each output index that has an entry in
//      `output_info_`, reusing a donated input buffer, allocating a fresh
//      copy-protected buffer, or pointing at the allocation's buffer;
//   4. initializes and executes all thunks;
//   5. tears down the buffer allocations, passing along the set of buffers
//      that belong to the result;
//   6. when the arguments are ExecutionInputs, marks them to be released.
//
// `arguments` holds either a span of unowned ShapedBuffers or a span of
// ExecutionInputs. Returns the populated ExecutionOutput, or the first error
// encountered along the way.
StatusOr<ExecutionOutput> GpuExecutable::ExecuteAsyncOnStreamImpl(
    const ServiceExecutableRunOptions* run_options, VariantArguments arguments,
    HloExecutionProfile* hlo_execution_profile) {
  XLA_SCOPED_LOGGING_TIMER(absl::StrCat(
      "GpuExecutable::ExecuteAsyncOnStreamImpl(", module_name_, ")"));
  se::DeviceMemoryAllocator* const memory_allocator = run_options->allocator();
  // Force synchronous execution if the allocator requires it.
  const bool block_host_until_done =
      !memory_allocator->AllowsAsynchronousDeallocation();

  const GpuExecutable::BufferAllocToDeviceMemoryMap* globals;
  {
    // The inner scope limits the TraceMe to the globals resolution only.
    tensorflow::profiler::TraceMe hlo_module_activity(
        [&] { return std::string("Resolve constant globals"); },
        tensorflow::profiler::TraceMeLevel::kInfo);

    TF_ASSIGN_OR_RETURN(globals, ResolveConstantGlobals(run_options->stream()));
  }

  se::StreamExecutor* executor = run_options->stream()->parent();

  auto device_ordinal = executor->device_ordinal();
  ExecutionOutput result(/*on_device_shape=*/output_shape_, memory_allocator,
                         device_ordinal);

  TF_ASSIGN_OR_RETURN(BufferAllocations buffer_allocations,
                      GenerateBufferAllocations(arguments, globals,
                                                memory_allocator, executor));
  VLOG(2) << buffer_allocations.ToString();
  // Every buffer placed into `result`; handed to TearDown below.
  std::set<se::DeviceMemoryBase> buffers_in_result;

  // True only when every leaf of the result shape has an alias config.
  const bool is_entire_tuple_contents_aliased = [&] {
    for (auto& p : result.MutableResult()->buffers().leaves()) {
      const OutputInfo& output_info = output_info_.at(p.first);
      if (!output_info.alias_config.has_value()) {
        return false;
      }
    }
    return true;
  }();

  for (auto& p : result.MutableResult()->buffers()) {
    const ShapeIndex& index = p.first;
    // Output indices without an OutputInfo entry are left untouched.
    if (!output_info_.contains(index)) {
      continue;
    }
    const OutputInfo& output_info = output_info_.at(index);
    const BufferAllocation* allocation =
        &allocations_[output_info.allocation_index];
    se::DeviceMemoryBase& result_buffer = p.second;

    VLOG(4) << "Looking at: allocation " << output_info.allocation_index
            << " @ index: " << index.ToString();

    if (output_info.alias_config) {
      // Locate the input buffer this output aliases. This is null when the
      // arguments were passed as unowned ShapedBuffers.
      MaybeOwningDeviceMemory* maybe_owning_memory =
          [&]() -> xla::MaybeOwningDeviceMemory* {
        // ScopedBuffer is never an owned buffer.
        if (auto* unowned_shapedbuffers =
                absl::get_if<absl::Span<const ShapedBuffer* const>>(
                    &arguments)) {
          return nullptr;
        } else {
          auto unowned_execution_input =
              absl::get<absl::Span<ExecutionInput>>(arguments);
          ExecutionInput& input =
              unowned_execution_input[allocation->parameter_number()];
          return input.MutableBuffer(allocation->param_shape_index());
        }
      }();
      // A must-alias input that was passed as an ExecutionInput has to be
      // donated (owned); otherwise the run is rejected.
      if (output_info.alias_config->must_alias() && maybe_owning_memory &&
          !maybe_owning_memory->HasOwnership()) {
        return InvalidArgument(
            "An input was configured to be must-alias at "
            "compile time but not donated at runtime: allocation %d",
            output_info.allocation_index);
      }
      if (maybe_owning_memory && maybe_owning_memory->HasOwnership()) {
        absl::optional<tensorflow::se::OwningDeviceMemory> owning =
            maybe_owning_memory->Release();
        // If the caller passes the ownership of the device memory, reuse it
        // as the output buffer. It is up to the caller whether or not to
        // donate a buffer; the aliasing information describes which buffers
        // may alias, not buffers that must alias.
        se::DeviceMemoryBase argument_buffer = owning->Release();
        // Leave the input holding the same address, now as unowned memory.
        *maybe_owning_memory = argument_buffer;
        result_buffer = argument_buffer;
        // The caller is giving us the
        // input buffer, but in case of error from the execute call, we should
        // not be releasing it as it contains valid data (for example, it is a
        // parameter which the user wants us to alias, in a gradient update
        // computation). So we store the index into the result in the aliased
        // vector, which will be fed to the ExecutionOutput, which will use
        // the indices to drop the addresses from its own ScopedShapedBuffer
        // result, if the ExecutionOutput is not committed.
        result.AddAliasedIndex(index);
      } else if (!output_info.passthrough) {
        // The guard above is there so that we do not insert copy-protection
        // when aliasing pass-through params, as we do not need to write into
        // the output buffer.
        VLOG(3) << "Using copy-protection: aliasing is specified, but the "
                   "buffer is not donated; allocating a fresh buffer";
        int64 allocation_size =
            ShapeUtil::ByteSizeOf(ShapeUtil::GetSubshape(output_shape_, index));
        TF_ASSIGN_OR_RETURN(
            se::OwningDeviceMemory allocated_buffer,
            memory_allocator->Allocate(device_ordinal, allocation_size));
        result_buffer = allocated_buffer.Release();
        se::DeviceMemoryBase& aliased_buffer =
            buffer_allocations.GetMutableDeviceAddress(
                output_info.allocation_index);
        CHECK_EQ(aliased_buffer.size(), result_buffer.size());
        // Copy the input's contents into the fresh buffer, then redirect the
        // allocation to the fresh buffer so the thunks use it instead of the
        // caller's input.
        run_options->stream()->ThenMemcpyD2D(&result_buffer, aliased_buffer,
                                             aliased_buffer.size());
        aliased_buffer = result_buffer;
      }
    }

    if (result_buffer.is_null()) {
      // The source instruction should have a non-parameter buffer
      // assigned.
      result_buffer =
          buffer_allocations.GetDeviceAddress(output_info.allocation_index);

      // If the entire tuple contents is aliased, the copy insertion will *not*
      // materialize a new tuple, so we mark it as aliased as well.
      if (is_entire_tuple_contents_aliased) {
        result.AddAliasedIndex(index);
      }
    }
    buffers_in_result.insert(result_buffer);
  }

  // Every thunk is initialized before any thunk is executed.
  for (Thunk* thunk : thunk_schedule_->TotalOrder()) {
    TF_RETURN_IF_ERROR(thunk->Initialize(*this, executor));
  }
  TF_RETURN_IF_ERROR(ExecuteThunks(run_options, buffer_allocations,
                                   block_host_until_done,
                                   hlo_execution_profile));

  // Free all temporary allocations.
  TF_RETURN_IF_ERROR(
      buffer_allocations.TearDown(buffers_in_result, allocations_));

  // Free allocations for arguments.
  if (auto args = absl::get_if<absl::Span<ExecutionInput>>(&arguments)) {
    MarkToBeReleasedArguments(*args, result);
  }
  return std::move(result);
}
566 
SizeOfGeneratedCodeInBytes() const567 int64 GpuExecutable::SizeOfGeneratedCodeInBytes() const {
568   // Non-empty PTX but empty cubin: compilation must have failed, return
569   // "unknown".
570   if (binary().empty() && !text_.empty()) {
571     return -1;
572   }
573   int64 size = binary().size();
574   for (BufferAllocation::Index i = 0; i < allocations_.size(); ++i) {
575     const BufferAllocation& allocation = allocations_[i];
576     if (allocation.is_constant()) {
577       size += allocation.size();
578     }
579   }
580   return size;
581 }
582 
583 StatusOr<absl::flat_hash_map<ShapeIndex, GpuExecutable::OutputInfo>>
GetOutputInfo(const HloModule & hlo_module,const BufferAssignment & assignment)584 GetOutputInfo(const HloModule& hlo_module, const BufferAssignment& assignment) {
585   const HloInstruction* root =
586       hlo_module.entry_computation()->root_instruction();
587 
588   InstructionValueSet root_value_set =
589       assignment.dataflow_analysis().GetInstructionValueSet(root);
590 
591   if (root_value_set.IsAmbiguous()) {
592     return Unimplemented("Points-to set of root instruction is ambiguous");
593   }
594 
595   using OutputInfoMap =
596       absl::flat_hash_map<ShapeIndex, GpuExecutable::OutputInfo>;
597   OutputInfoMap output;
598   TF_RETURN_IF_ERROR(ShapeUtil::ForEachSubshapeWithStatus(
599       root->shape(),
600       [&](const Shape& /*sub_shape*/, const ShapeIndex& index) -> Status {
601         const auto& sources = root_value_set.element(index);
602         // The points-to set is unambiguous so the set should be a
603         // singleton. That is, we know exactly which instruction
604         // produced the array at this element.
605         CHECK_EQ(1, sources.values().size());
606         HloInstruction* src_hlo = sources.values()[0]->instruction();
607 
608         GpuExecutable::OutputInfo& info = output[index];
609         info.passthrough = src_hlo->opcode() == HloOpcode::kParameter;
610         TF_ASSIGN_OR_RETURN(
611             const BufferAllocation::Slice slice,
612             assignment.GetUniqueSlice(src_hlo, sources.values()[0]->index()));
613         CHECK_EQ(slice.offset(), 0) << "Parameter should get its own slice";
614         info.allocation_index = slice.index();
615 
616         output[index].alias_config =
617             hlo_module.input_output_alias_config().GetAliasedParameter(index);
618 
619         return Status::OK();
620       }));
621   return output;
622 }
623 
624 }  // namespace gpu
625 }  // namespace xla
626