1 /* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
2 
3 Licensed under the Apache License, Version 2.0 (the "License");
4 you may not use this file except in compliance with the License.
5 You may obtain a copy of the License at
6 
7     http://www.apache.org/licenses/LICENSE-2.0
8 
9 Unless required by applicable law or agreed to in writing, software
10 distributed under the License is distributed on an "AS IS" BASIS,
11 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 See the License for the specific language governing permissions and
13 limitations under the License.
14 ==============================================================================*/
15 
16 #ifndef TENSORFLOW_COMPILER_XLA_SERVICE_GPU_GPU_EXECUTABLE_H_
17 #define TENSORFLOW_COMPILER_XLA_SERVICE_GPU_GPU_EXECUTABLE_H_
18 
19 #include <memory>
20 #include <string>
21 
22 #include "absl/container/flat_hash_map.h"
23 #include "absl/strings/string_view.h"
24 #include "absl/types/optional.h"
25 #include "absl/types/span.h"
26 #include "tensorflow/compiler/xla/service/buffer_assignment.h"
27 #include "tensorflow/compiler/xla/service/executable.h"
28 #include "tensorflow/compiler/xla/service/gpu/buffer_allocations.h"
29 #include "tensorflow/compiler/xla/service/gpu/gpu_types.h"
30 #include "tensorflow/compiler/xla/service/gpu/stream_assignment.h"
31 #include "tensorflow/compiler/xla/service/gpu/thunk.h"
32 #include "tensorflow/compiler/xla/service/gpu/thunk_schedule.h"
33 #include "tensorflow/compiler/xla/service/hlo_dataflow_analysis.h"
34 #include "tensorflow/compiler/xla/service/hlo_execution_profile.h"
35 #include "tensorflow/compiler/xla/service/hlo_module.h"
36 #include "tensorflow/compiler/xla/service/shaped_buffer.h"
37 #include "tensorflow/compiler/xla/statusor.h"
38 #include "tensorflow/core/platform/macros.h"
39 #include "tensorflow/core/platform/stream_executor_no_cuda.h"
40 #include "tensorflow/stream_executor/device_memory_allocator.h"
41 
42 namespace xla {
43 namespace gpu {
44 
45 // GPU-targeting implementation of the XLA Executable interface.
46 //
47 // Launches the given GPU kernel via the StreamExecutor.
48 //
49 // This is an immutable data type after initialization, and thus thread safe.
50 class GpuExecutable : public Executable {
51  public:
52   struct ConstantInfo {
53     std::string symbol_name;
54     std::vector<uint8> content;
55     int allocation_index = -1;
56   };
57 
58   struct OutputInfo {
59     // Corresponding allocation index.
60     int allocation_index;
61 
62     // Output is passed-through from a parameter.
63     bool passthrough = false;
64 
65     // Whether this output is hinted to alias a parameter (BufferAllocation*
66     // would indicate the aliased parameter), and what kind of alias it is.
67     absl::optional<HloInputOutputAliasConfig::Alias> alias_config;
68   };
69 
70   struct Params {
71     std::string asm_text;
72     std::vector<uint8> binary;
73     GpuVersion gpu_version;
74     std::unique_ptr<const ThunkSchedule> thunk_schedule;
75     std::vector<ConstantInfo> constants;
76     absl::flat_hash_map<ShapeIndex, OutputInfo> output_info;
77     std::string module_name;
78     xla::Shape output_shape;
79     std::vector<BufferAllocation> allocations;
80     std::unique_ptr<BufferAssignmentProto> debug_buffer_assignment;
81     std::unique_ptr<HloModule> debug_module = nullptr;
82     size_t entry_computation_profile_index = 0;
83     std::unique_ptr<HloProfilePrinterData> hlo_profile_printer_data = nullptr;
84     std::unique_ptr<HloProfileIndexMap> hlo_profile_index_map = nullptr;
85   };
86 
87   // We need to share ownership of hlo_module and assignment with profiler to
88   // safely keep a reference to these objects during tracing period, thus they
89   // are passed as shared pointers.
90   explicit GpuExecutable(Params params);
91   ~GpuExecutable() override;
92 
93   int64 SizeOfGeneratedCodeInBytes() const override;
94 
95   // This should be called after set_ir_module_string.
ir_module_string()96   const string& ir_module_string() const { return ir_module_string_; }
97 
98   // This should be called before ExecuteOnStream.
set_ir_module_string(const string & ir_module_string)99   void set_ir_module_string(const string& ir_module_string) {
100     ir_module_string_ = ir_module_string;
101   }
102 
103   // Returns the compiled code for the computation. The compiled code is PTX in
104   // Cuda and unused empty string in ROCm.
text()105   const string& text() const { return text_; }
106 
107   // Returns the binary stored in this GpuExecutable. The binary is cubin in
108   // Cuda, and HSA code object in ROCm. It may be empty, in which case
109   // compilation is left up to the GPU driver.
binary()110   const std::vector<uint8>& binary() const { return binary_; }
111 
112   // ExecuteAsyncOnStream will fail if the compute capability of the stream
113   // doesn't match the compute capability passed to this object's constructor.
114   StatusOr<ExecutionOutput> ExecuteAsyncOnStream(
115       const ServiceExecutableRunOptions* run_options,
116       std::vector<ExecutionInput> arguments,
117       HloExecutionProfile* hlo_execution_profile) override;
118 
119   StatusOr<ScopedShapedBuffer> ExecuteAsyncOnStream(
120       const ServiceExecutableRunOptions* run_options,
121       absl::Span<const ShapedBuffer* const> arguments,
122       HloExecutionProfile* hlo_execution_profile);
123 
124   using VariantArguments = absl::variant<absl::Span<const ShapedBuffer* const>,
125                                          absl::Span<ExecutionInput>>;
126   StatusOr<ExecutionOutput> ExecuteAsyncOnStreamImpl(
127       const ServiceExecutableRunOptions* run_options,
128       VariantArguments arguments, HloExecutionProfile* hlo_execution_profile);
129 
GetAllocations()130   absl::Span<const BufferAllocation> GetAllocations() const {
131     return allocations_;
132   }
133 
134  private:
135   // If `block_host_until_done` is false, execution will not block the host
136   // until the kernels have completed. This is used as an optimization for
137   // clients, such as Tensorflow, that use a single stream of execution for
138   // computations, and allow host-side deallocation from the allocator before
139   // GPU execution completes.
140   Status ExecuteThunks(const ServiceExecutableRunOptions* run_options,
141                        const BufferAllocations& buffer_allocations,
142                        bool block_host_until_done,
143                        HloExecutionProfile* hlo_execution_profile);
144 
145   using BufferAllocToDeviceMemoryMap =
146       absl::flat_hash_map<BufferAllocation::Index, se::DeviceMemoryBase>;
147 
148   // Loads the PTX or CUBIN for this executable into `executor` and resolves the
149   // globals corresponding to constant buffers.  Returns a map mapping buffer
150   // allocation indices to GPU pointers.
151   StatusOr<const BufferAllocToDeviceMemoryMap*> ResolveConstantGlobals(
152       stream_executor::Stream* stream);
153 
154   // GpuExecutable check with either AMD's ISA version, or Nvidia's major minor
155   // version for compute capability, depending on the hardware.
156   Status CheckCompatibilityWithServiceExecutableRunOptions(
157       const ServiceExecutableRunOptions* run_options);
158 
159   StatusOr<BufferAllocations> GenerateBufferAllocations(
160       VariantArguments arguments,
161       const GpuExecutable::BufferAllocToDeviceMemoryMap* globals,
162       se::DeviceMemoryAllocator* const memory_allocator,
163       se::StreamExecutor* executor);
164 
165   StatusOr<se::DeviceMemoryBase> BufferForAllocation(
166       VariantArguments arguments,
167       const GpuExecutable::BufferAllocToDeviceMemoryMap* globals,
168       const BufferAllocation& allocation,
169       se::DeviceMemoryAllocator* const memory_allocator, int device_ordinal,
170       int64 arg_idx);
171 
172   // The LLVM IR, in string format, of the unoptimized module generated for
173   // this GpuExecutable. We save a string instead of an llvm::Module* because
174   // leaving llvm::Module* in a singleton can cause the heap checker to emit
175   // false positives.
176   //
177   // This string should be modified only before ExecuteOnStream.
178   string ir_module_string_;
179 
180   // The compiled code for the computation.
181   const string text_;
182 
183   // The GPU machine code for the computation, targeting GPUs at
184   // compute_capability_.
185   //
186   // May be empty, in which case we leave compilation up to the GPU driver.
187   const std::vector<uint8> binary_;
188 
189   // The GPU version for compute compatibility check.
190   GpuVersion gpu_version_;
191 
192   // The thunks to be invoked by this GpuExecutable. They are generated by the
193   // IrEmitter.
194   const std::unique_ptr<const ThunkSchedule> thunk_schedule_;
195 
196   std::string module_name_;
197 
198   xla::Shape output_shape_;
199 
200   // Owns the buffer data at runtime. It provides information to allocate
201   // memory for every output/temp buffers.
202   const std::vector<BufferAllocation> allocations_;
203 
204   std::shared_ptr<BufferAssignmentProto> debug_buffer_assignment_;
205 
206   size_t entry_computation_profile_index_ = -1;
207 
208   // Cache of module handles and constant buffer allocation maps used by
209   // `ResolveConstantGlobals`.
210   tensorflow::mutex module_handle_mutex_;
211   std::map<stream_executor::StreamExecutor*, se::ScopedModuleHandle>
212       module_handles_ TF_GUARDED_BY(module_handle_mutex_);
213   std::map<stream_executor::StreamExecutor*, BufferAllocToDeviceMemoryMap>
214       module_globals_ TF_GUARDED_BY(module_handle_mutex_);
215 
216   std::vector<ConstantInfo> constants_;
217   const absl::flat_hash_map<ShapeIndex, OutputInfo> output_info_;
218 
219   TF_DISALLOW_COPY_AND_ASSIGN(GpuExecutable);
220 };
221 
222 StatusOr<absl::flat_hash_map<ShapeIndex, GpuExecutable::OutputInfo>>
223 GetOutputInfo(const HloModule& hlo_module, const BufferAssignment& assignment);
224 
225 }  // namespace gpu
226 }  // namespace xla
227 
228 #endif  // TENSORFLOW_COMPILER_XLA_SERVICE_GPU_GPU_EXECUTABLE_H_
229