/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/

#ifndef TENSORFLOW_COMPILER_XLA_SERVICE_EXECUTABLE_H_
#define TENSORFLOW_COMPILER_XLA_SERVICE_EXECUTABLE_H_

#include <memory>
#include <set>
#include <utility>
#include <vector>

#include "absl/types/span.h"
#include "absl/types/variant.h"
#include "tensorflow/compiler/xla/debug_options_flags.h"
#include "tensorflow/compiler/xla/service/computation_layout.h"
#include "tensorflow/compiler/xla/service/hlo.pb.h"
#include "tensorflow/compiler/xla/service/hlo_execution_profile.h"
#include "tensorflow/compiler/xla/service/hlo_graph_dumper.h"
#include "tensorflow/compiler/xla/service/hlo_module.h"
#include "tensorflow/compiler/xla/service/maybe_owning_device_memory.h"
#include "tensorflow/compiler/xla/service/service_executable_run_options.h"
#include "tensorflow/compiler/xla/service/shaped_buffer.h"
#include "tensorflow/compiler/xla/shape_tree.h"
#include "tensorflow/compiler/xla/statusor.h"
#include "tensorflow/compiler/xla/util.h"
#include "tensorflow/compiler/xla/xla_data.pb.h"
#include "tensorflow/core/platform/mutex.h"
#include "tensorflow/core/platform/stream_executor_no_cuda.h"
#include "tensorflow/core/platform/thread_annotations.h"
#include "tensorflow/stream_executor/device_memory_allocator.h"

namespace xla {

// TODO(b/150633678): Both the ExecutionInput and ExecutionOutput need to be
// revisited, with the execute APIs taking a data structure which can better
// model shareable buffers.
//
// ExecutionInput buffers are in one of three states:
//
// 1) Owned by the caller and immutable.
// 2) Donated by the caller but returned on error.
// 3) Donated by the caller and freed on error.
//
// Case (1) buffers are stored as MaybeOwningDeviceMemory(DeviceMemoryBase).
// Case (2) buffers are stored as MaybeOwningDeviceMemory(OwningDeviceMemory),
//   with their indices present in unowned_indices_.
// Case (3) buffers are stored as MaybeOwningDeviceMemory(OwningDeviceMemory),
//   with their indices absent from unowned_indices_.
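//
// A minimal sketch of how each case might be populated (illustrative only;
// `device_shape`, `base`, `owning1`, and `owning2` are hypothetical
// caller-provided values, not part of this API):
//
//   ExecutionInput input(device_shape);
//   // Case (1): caller-owned DeviceMemoryBase, immutable.
//   input.SetBuffer({0}, MaybeOwningDeviceMemory(base));
//   // Case (2): donated OwningDeviceMemory, returned to the caller on error.
//   input.SetUnownedBuffer({1}, MaybeOwningDeviceMemory(std::move(owning1)));
//   // Case (3): donated OwningDeviceMemory, freed on error.
//   input.SetBuffer({2}, MaybeOwningDeviceMemory(std::move(owning2)));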
class ExecutionInput {
 public:
  explicit ExecutionInput(xla::Shape shape) : buffers_(std::move(shape)) {
    SetHostShape(ShapeUtil::DeviceShapeToHostShape(buffers_.shape()));
  }
  // TODO(b/170310047): remove this overload.
  ExecutionInput(xla::Shape shape, xla::Shape host_shape)
      : buffers_(std::move(shape)) {
    SetHostShape(ShapeUtil::DeviceShapeToHostShape(buffers_.shape()));
  }

  explicit ExecutionInput(ShapeTree<MaybeOwningDeviceMemory> buffers)
      : buffers_(std::move(buffers)) {
    SetHostShape(ShapeUtil::DeviceShapeToHostShape(buffers_.shape()));
  }
  // TODO(b/170310047): remove this overload.
  ExecutionInput(ShapeTree<MaybeOwningDeviceMemory> buffers,
                 xla::Shape host_shape)
      : buffers_(std::move(buffers)) {
    SetHostShape(ShapeUtil::DeviceShapeToHostShape(buffers_.shape()));
  }

  ExecutionInput(ExecutionInput&&) = default;

  ~ExecutionInput();

  ExecutionInput& operator=(ExecutionInput&&) = default;

  const Shape& shape() const {
    return dynamic_shape_ != nullptr ? *dynamic_shape_ : buffers_.shape();
  }

  const Shape& host_shape() const {
    return host_shape_ != nullptr ? *host_shape_ : shape();
  }

  Status SetDynamicShape(Shape dynamic_shape);

  xla::StatusOr<xla::ShapedBuffer> ToShapedBuffer(
      se::DeviceMemoryAllocator* allocator, int device_ordinal) const;

  void SetBuffer(const ShapeIndex& index, MaybeOwningDeviceMemory buffer) {
    *buffers_.mutable_element(index) = std::move(buffer);
  }

  void SetUnownedBuffer(const ShapeIndex& index,
                        MaybeOwningDeviceMemory buffer);

  void SetUnownedIndex(const ShapeIndex& index) {
    unowned_indices_.insert(index);
  }

  void ClearUnownedIndex(const ShapeIndex& index) {
    unowned_indices_.erase(index);
  }

  const std::set<ShapeIndex>& unowned_indices() { return unowned_indices_; }

  const ShapeTree<MaybeOwningDeviceMemory>& Buffers() const { return buffers_; }

  ShapeTree<MaybeOwningDeviceMemory>* MutableBuffers() { return &buffers_; }

  MaybeOwningDeviceMemory* MutableBuffer(const ShapeIndex& index) {
    return buffers_.mutable_element(index);
  }

  const MaybeOwningDeviceMemory& Buffer(const ShapeIndex& index) const {
    return buffers_.element(index);
  }

 private:
  void SetHostShape(xla::Shape host_shape) {
    if (shape() != host_shape) {
      host_shape_ = absl::make_unique<Shape>(std::move(host_shape));
    }
  }

  ShapeTree<MaybeOwningDeviceMemory> buffers_;
  // Set of indices of buffers that should be returned to the caller if an
  // error occurs when enqueuing the computation.
  std::set<ShapeIndex> unowned_indices_;
  std::unique_ptr<Shape> dynamic_shape_;
  std::unique_ptr<Shape> host_shape_;
};

// ExecutionOutput encapsulates the output buffers of an execution and the
// leftover buffers to be released by the caller.
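//
// A minimal sketch of the intended commit-on-success pattern (illustrative
// only; `RunOnDevice` is a hypothetical helper, not part of this API):
//
//   StatusOr<ExecutionOutput> RunAndCommit(...) {
//     TF_ASSIGN_OR_RETURN(ExecutionOutput result, RunOnDevice(...));
//     // Commit only after a successful launch; on error, ~ExecutionOutput
//     // clears aliased indices so the caller keeps ownership of them.
//     result.Commit();
//     return std::move(result);
//   }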
class ExecutionOutput {
 public:
  explicit ExecutionOutput(ScopedShapedBuffer result)
      : result_(std::move(result)) {}
  ExecutionOutput(ScopedShapedBuffer result,
                  std::vector<se::OwningDeviceMemory> to_be_released)
      : result_(std::move(result)),
        to_be_released_(std::move(to_be_released)) {}
  // TODO(b/170310047): remove this overload.
  ExecutionOutput(Shape on_host_shape, Shape on_device_shape,
                  se::DeviceMemoryAllocator* allocator, int device_ordinal)
      : result_(std::move(on_device_shape), allocator, device_ordinal) {}
  ExecutionOutput(Shape on_device_shape, se::DeviceMemoryAllocator* allocator,
                  int device_ordinal)
      : result_(std::move(on_device_shape), allocator, device_ordinal) {}
  ExecutionOutput(ExecutionOutput&&) = default;
  ExecutionOutput& operator=(ExecutionOutput&&) = default;

  ~ExecutionOutput() {
    // If the ExecutionOutput has not been committed, and if there are aliased
    // indices, clear them off the ScopedShapedBuffer to prevent them from
    // being released.
    for (auto& index : aliased_indices_) {
      result_.set_buffer(se::OwningDeviceMemory(), index);
    }
  }

  void AddAliasedIndex(ShapeIndex index) {
    aliased_indices_.push_back(std::move(index));
  }

  void AddToBeReleased(se::OwningDeviceMemory mem) {
    to_be_released_.push_back(std::move(mem));
  }

  // Should be called once it is known that the execute operation succeeded,
  // before returning the ExecutionOutput to the caller.
  ExecutionOutput& Commit() {
    aliased_indices_.clear();
    return *this;
  }

  const ScopedShapedBuffer& Result() const { return result_; }

  ScopedShapedBuffer* MutableResult() { return &result_; }

  ScopedShapedBuffer ConsumeResult() {
    aliased_indices_.clear();
    return std::move(result_);
  }

  const std::vector<se::OwningDeviceMemory>& ToBeReleased() const {
    return to_be_released_;
  }

  std::vector<se::OwningDeviceMemory> ConsumeToBeReleased() {
    return std::move(to_be_released_);
  }

  std::vector<ShapeIndex> ConsumeAliasedIndices() {
    auto aliased = std::move(aliased_indices_);
    aliased_indices_.clear();
    return aliased;
  }

 private:
  ScopedShapedBuffer result_;

  // Leftover buffers for the caller to release. Elements in this list are
  // donated input memory buffers that are not reused by XLA as outputs.
  std::vector<se::OwningDeviceMemory> to_be_released_;

  // These are the indices in result_ which have been aliased from the caller.
  // If the execution operation fails, the caller should maintain ownership of
  // the buffer, so we track the indices here, and unless the ExecutionOutput
  // is committed, we remove them from the result_ before destruction.
  std::vector<ShapeIndex> aliased_indices_;

  // A shape table is a contiguous region in memory that is used to hold the
  // runtime dimension sizes of dynamic output shapes.
  se::OwningDeviceMemory output_shape_table_;
};

// A given platform's compiler will produce an Executable -- this is a uniform
// interface that is used for launching compiled programs across platforms.
class Executable {
 public:
  explicit Executable(std::shared_ptr<HloModule> hlo_module)
      : hlo_module_(std::move(hlo_module)) {}

  // TODO(b/172012028): Remove this constructor.
  explicit Executable(
      std::shared_ptr<HloModule> hlo_module,
      std::unique_ptr<HloProfilePrinterData> hlo_profile_printer_data,
      std::unique_ptr<HloProfileIndexMap> hlo_profile_index_map)
      : hlo_module_(std::move(hlo_module)),
        hlo_profile_printer_data_(std::move(hlo_profile_printer_data)),
        hlo_profile_index_map_(std::move(hlo_profile_index_map)) {
    CHECK_EQ(hlo_profile_printer_data_.get() == nullptr,
             hlo_profile_index_map_.get() == nullptr);
  }
  virtual ~Executable() {}

  // Enqueues the compilation result on the provided stream, passing the given
  // arguments. This call is blocking and returns after the execution is done.
  //
  // If the hlo_execution_profile is provided as non-nullptr, profiling will be
  // enabled.
  //
  // Returns a shaped buffer containing the result of the computation.
  StatusOr<ScopedShapedBuffer> ExecuteOnStream(
      const ServiceExecutableRunOptions* run_options,
      absl::Span<const ShapedBuffer* const> arguments,
      HloExecutionProfile* hlo_execution_profile);

  // Starts the given program executing on the given stream/executor.
  //
  // `arguments` are ShapeTrees containing the input parameters. For each
  // element in a shape tree, if the element holds ownership of the memory, it
  // is considered donated and XLA will potentially reuse it as an output
  // buffer. For all donated inputs, XLA is also responsible for freeing them.
  //
  // If an input is donated to XLA but is not reused as an output, it is
  // returned as a leftover buffer for the caller to release.
  //
  // This call should be non-blocking and may return as soon as all of the
  // operations are enqueued for launch on the stream. Note that some
  // implementations may in fact block or may block in some circumstances
  // (e.g., when profiling); i.e., asynchronous is a "may" not a "must".
  //
  // If the hlo_execution_profile is provided as non-nullptr, profiling will be
  // enabled. Note that profiling is tricky to use correctly, as the profiling
  // objects (when they exist) must out-live the task.
  virtual StatusOr<ScopedShapedBuffer> ExecuteAsyncOnStream(
      const ServiceExecutableRunOptions* run_options,
      absl::Span<const ShapedBuffer* const> arguments,
      HloExecutionProfile* hlo_execution_profile);

  // Same as ExecuteAsyncOnStream(), but blocks waiting for the computation to
  // complete.
  StatusOr<ExecutionOutput> ExecuteOnStream(
      const ServiceExecutableRunOptions* run_options,
      std::vector<ExecutionInput> arguments,
      HloExecutionProfile* hlo_execution_profile);

  virtual StatusOr<ExecutionOutput> ExecuteAsyncOnStream(
      const ServiceExecutableRunOptions* run_options,
      std::vector<ExecutionInput> arguments,
      HloExecutionProfile* hlo_execution_profile) = 0;
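
  // A minimal usage sketch for the ExecutionInput overload above (illustrative
  // only; error handling elided, and `executable`, `run_options`, and `input`
  // are assumed to be set up by the caller):
  //
  //   std::vector<ExecutionInput> args;
  //   args.push_back(std::move(input));
  //   TF_ASSIGN_OR_RETURN(
  //       ExecutionOutput out,
  //       executable->ExecuteAsyncOnStream(&run_options, std::move(args),
  //                                        /*hlo_execution_profile=*/nullptr));
  //   TF_RETURN_IF_ERROR(run_options.stream()->BlockHostUntilDone());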

  // Same as ExecuteOnStream(), but runs this executable on multiple
  // streams. arguments[i] contains the arguments to the execution on
  // run_options[i]->stream() and the returned value is at index i of the
  // returned vector.
  virtual StatusOr<std::vector<ScopedShapedBuffer>> ExecuteOnStreams(
      absl::Span<const ServiceExecutableRunOptions> run_options,
      absl::Span<const absl::Span<const ShapedBuffer* const>> arguments);

  // Populates `hlo_execution_profile` from `stream`. This is implicit in any
  // Execute* API call that takes an hlo_execution_profile argument, but must
  // be called explicitly for other (async, for example) variants after the
  // stream has completed.
  virtual Status PopulateExecutionProfile(
      ExecutionProfile* execution_profile,
      HloExecutionProfile* hlo_execution_profile, se::Stream* stream) {
    return Status::OK();
  }
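
  // For the async variants, a caller might populate the profile explicitly
  // once the stream has completed (illustrative only; `executable`, `stream`,
  // and the profile objects are assumed to be set up by the caller):
  //
  //   TF_RETURN_IF_ERROR(stream->BlockHostUntilDone());
  //   TF_RETURN_IF_ERROR(executable->PopulateExecutionProfile(
  //       &execution_profile, &hlo_execution_profile, stream));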

  // Convenience wrapper for calling Executable::ExecuteOnStream. Sets up a
  // timer for the execution, sets up HLO profiling if enabled, and fills in
  // the given ExecutionProfile if non-null.
  StatusOr<ScopedShapedBuffer> ExecuteOnStreamWrapper(
      const ServiceExecutableRunOptions* run_options,
      absl::Span<const ShapedBuffer* const> arguments);

  StatusOr<ExecutionOutput> ExecuteOnStreamWrapper(
      const ServiceExecutableRunOptions* run_options,
      std::vector<ExecutionInput> arguments);

  StatusOr<ScopedShapedBuffer> ExecuteAsyncOnStreamWrapper(
      const ServiceExecutableRunOptions* run_options,
      absl::Span<const ShapedBuffer* const> arguments);

  StatusOr<ExecutionOutput> ExecuteAsyncOnStreamWrapper(
      const ServiceExecutableRunOptions* run_options,
      std::vector<ExecutionInput> arguments);

  const HloProfilePrinterData& hlo_profile_printer_data() const {
    CHECK(hlo_profiling_enabled());
    return *hlo_profile_printer_data_;
  }

  const HloProfileIndexMap& hlo_profile_index_map() const {
    CHECK(hlo_profiling_enabled());
    return *hlo_profile_index_map_;
  }

  // Returns whether this executable was compiled with HLO profiling support
  // enabled. If not, the caller should not expect an hlo_execution_profile
  // passed to ExecuteOnStream above to be populated during execution.
  bool hlo_profiling_enabled() const {
    return hlo_profile_printer_data_ != nullptr;
  }

  HloModule& module() const { return *hlo_module_; }
  std::shared_ptr<HloModule> shared_module() const { return hlo_module_; }

  bool has_module() const { return hlo_module_ != nullptr; }

  const HloModuleConfig& module_config() const { return hlo_module_->config(); }

  // The shape (including layout) that results from this execution. This is the
  // shape of the DeviceMemoryBase result value in ExecuteOnStream above.
  const Shape& result_shape() const {
    return hlo_module_->config().entry_computation_layout().result_shape();
  }

  // Returns the size of the executable in bytes. Returns -1 if this query is
  // not supported by the executable.
  //
  // Does not include the size of used libraries (e.g. cuDNN, Eigen, etc.).
  virtual int64 SizeOfGeneratedCodeInBytes() const;

  // Dumping helpers.
  void set_hlo_proto(std::unique_ptr<xla::HloProto> hlo_proto) {
    hlo_proto_ = std::move(hlo_proto);
  }
  bool dumping_snapshot() const { return hlo_proto_ != nullptr; }
  HloProto const* hlo_proto() const { return hlo_proto_.get(); }

  std::string& debug_info() { return debug_info_; }
  void set_debug_info(const std::string& debug_info) {
    debug_info_ = debug_info;
  }
  // Gathers unused but donated buffers and returns them to the caller of this
  // API. Buffers are not freed inside this function, since the caller may have
  // different preferences for buffer deallocation. For example, in TensorFlow,
  // buffers are most efficiently deallocated as soon as a program has been
  // launched. However, in XRT, buffers are expected to be deallocated after
  // the program has finished, since XRT doesn't support async deallocation.
  void MarkToBeReleasedArguments(absl::Span<ExecutionInput> arguments,
                                 ExecutionOutput& result);
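
  // A minimal sketch of the intended call pattern (illustrative only;
  // `executable`, `arguments`, and `result` are assumed to be live values in
  // the caller):
  //
  //   executable->MarkToBeReleasedArguments(absl::MakeSpan(arguments), result);
  //   // The caller decides when the leftover buffers are actually freed.
  //   auto leftovers = result.ConsumeToBeReleased();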

 protected:
  // HloModule this was compiled from. BufferAssignment keeps pointers to
  // HloInstructions owned by the HloModule so we need to keep the HloModule
  // around.
  const std::shared_ptr<HloModule> hlo_module_;

  // The serialized HLO proto. Non-null only if dumping snapshots is enabled.
  std::unique_ptr<HloProto const> hlo_proto_;

  // Execution count, used to generate a unique filename for each dumped
  // execution.
  int64 execution_count_ = 0;

  std::unique_ptr<HloProfilePrinterData> hlo_profile_printer_data_;
  std::unique_ptr<HloProfileIndexMap> hlo_profile_index_map_;

  // Generic debug information as a string.
  std::string debug_info_;
};

}  // namespace xla

#endif  // TENSORFLOW_COMPILER_XLA_SERVICE_EXECUTABLE_H_