1 /* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
2 
3 Licensed under the Apache License, Version 2.0 (the "License");
4 you may not use this file except in compliance with the License.
5 You may obtain a copy of the License at
6 
7     http://www.apache.org/licenses/LICENSE-2.0
8 
9 Unless required by applicable law or agreed to in writing, software
10 distributed under the License is distributed on an "AS IS" BASIS,
11 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 See the License for the specific language governing permissions and
13 limitations under the License.
14 ==============================================================================*/
15 
16 // Contains utilities for launching compiled XLA kernels for a KernelContext.
17 
18 #ifndef TENSORFLOW_COMPILER_JIT_XLA_LAUNCH_UTIL_H_
19 #define TENSORFLOW_COMPILER_JIT_XLA_LAUNCH_UTIL_H_
20 
21 #include "absl/base/thread_annotations.h"
22 #include "tensorflow/compiler/jit/xla_compilation_cache.h"
23 #include "tensorflow/compiler/jit/xla_tensor.h"
24 #include "tensorflow/compiler/tf2xla/xla_compiler.h"
25 #include "tensorflow/compiler/xla/client/local_client.h"
26 #include "tensorflow/compiler/xla/service/device_memory_allocator.h"
27 #include "tensorflow/compiler/xla/service/owning_device_memory.h"
28 #include "tensorflow/core/framework/allocation_description.pb.h"
29 #include "tensorflow/core/framework/resource_var.h"
30 #include "tensorflow/core/framework/tensor.h"
31 #include "tensorflow/core/framework/types.h"
32 #include "tensorflow/core/lib/core/status.h"
33 #include "tensorflow/core/lib/gtl/array_slice.h"
34 
35 namespace tensorflow {
36 class XlaAllocator;
37 
38 // Struct that represents a possibly-absent Tensor.
39 struct OptionalTensor {
40   string name;           // A descriptive name
41   bool present = false;  // Is the tensor present?
42   Tensor value;          // If present, what is the Tensor's value?
43 };
44 
// Takes a snapshot of the values of the resource variable arguments whose
// indices are listed in `variable_indices`. We snapshot tensors that back
// resource variables since concurrent updates may modify the shape, and it is
// important that the shapes used for compilation match the true shapes of the
// buffers.
//
// We snapshot the entire set of resource variables as one atomic operation.
// This models Read->* dependencies between resource variable operations.  See
// jit/resource_operation_safety_analysis for details.
//
// The snapshot is delivered through `*result` as a map of TensorFlow argument
// index to resource variable. If a resource variable is not initialized, the
// corresponding OptionalTensor will have its `present` field set to false.
//
// Arguments:
//   ctx:              kernel context the variables are looked up in
//                     (NOTE(review): presumably its inputs -- confirm against
//                     the definition).
//   variable_indices: indices of the resource variable arguments to snapshot.
//   result:           output map, keyed by TensorFlow argument index.
//
// Returns a Status; the conditions that produce a non-OK Status are defined
// by the implementation, which is not part of this header.
Status SnapshotResourceVariables(OpKernelContext* ctx,
                                 absl::Span<const int> variable_indices,
                                 std::map<int, OptionalTensor>* result);
61 
62 // Information about the state of a variable passed as input to the _XlaCompile
63 // and _XlaRun operators.  Unlocks the resource variable and decrements its
64 // refcount on destruction.
65 class VariableInfo {
66  public:
67   explicit VariableInfo(int index, Var* var);
68   VariableInfo(VariableInfo&& other);
69 
70   VariableInfo& operator=(VariableInfo&& other);
71 
72   VariableInfo(const VariableInfo&) = delete;
73   VariableInfo& operator=(const VariableInfo&) = delete;
74 
75   // The index of the DT_RESOURCE input to the _XlaCompile/_XlaRun operator.
76   // Note that the indices can be different between _XlaCompile and _XlaRun.
index()77   int index() const { return index_; }
78 
79   // A pointer to the resource variable.  May be null if this VariableInfo is
80   // "empty", i.e. it does not track a resource variable.
var()81   Var* var() const { return var_; }
82 
83   // Returns true if the resource variable lock was successfully acquired by
84   // this thread.
lock_held()85   bool lock_held() const { return lock_held_; }
set_lock_held()86   void set_lock_held() { lock_held_ = true; }
87 
88   ~VariableInfo();
89 
90  private:
91   int index_;
92   Var* var_;
93 
94   // We can't use a optional<mutex_lock> here because it confuses the compiler's
95   // thread safety analysis. Instead we use a boolean flag and release the lock
96   // in the VariableInfo destructor.
97   bool lock_held_ = false;
98 };
99 
// Acquires the mutexes for all the variables in `variables` using a
// deadlock-safe protocol (acquire the mutexes in increasing-address order).
//
// `variables` is allowed to contain instances that don't track a resource
// variable (i.e. variables[i].var() can be null for some i).
//
// The EXCLUSIVE_LOCK_FUNCTION() annotation tells thread safety analysis that
// this function acquires locks which are still held when it returns.
// NOTE(review): presumably each successfully locked entry is flagged through
// VariableInfo::set_lock_held() so that its destructor releases the lock --
// confirm against the definition.
Status LockVariables(absl::Span<VariableInfo> variables)
    EXCLUSIVE_LOCK_FUNCTION();
107 
// Adapter class that wraps a TensorFlow allocator as an XLA allocator.
// Assumes that the TensorFlow allocator permits asynchronous deallocation:
// see comment on `AllowsAsynchronousDeallocation()`.
class XlaAllocator : public xla::DeviceMemoryAllocator {
 public:
  // `wrapped` is the TensorFlow allocator being adapted.
  // NOTE(review): ownership of `wrapped` is not expressed by the type;
  // presumably it is stored in `wrapped_` and must outlive this object --
  // confirm against the definition.
  XlaAllocator(const se::Platform* platform, Allocator* wrapped);
  ~XlaAllocator() override;

  // xla::DeviceMemoryAllocator overrides.  Their behaviour is defined out of
  // line; presumably they forward to the wrapped allocator.
  xla::StatusOr<xla::OwningDeviceMemory> Allocate(
      int device_ordinal, uint64 size, bool retry_on_failure) override;
  Status Deallocate(int device_ordinal, se::DeviceMemoryBase mem) override;

  // The TensorFlow BFC allocator used on GPU allows host-side deallocation
  // before GPU execution takes place. TensorFlow uses the ordering of the main
  // compute stream to enforce a happens-before relationship between a memory
  // allocation and code that reuses the same memory. If TensorFlow adds
  // support for multiple GPU streams or allocators with different ordering
  // requirements, this code may need to change.
  // (This attribute has no effect on CPU.)
  bool AllowsAsynchronousDeallocation() const override { return true; }

 private:
  // The wrapped TensorFlow allocator (non-owning as far as this header shows).
  Allocator* wrapped_;
};
131 
// Helper class to perform the marshalling of TensorFlow inputs and outputs to
// ShapedBuffers suitable for passing to an XLA computation.
class XlaComputationLaunchContext {
 public:
  // Create a new launch context. 'allocate_xla_tensors' is true if allocated
  // output tensors and variables are always XlaTensors. If false they are
  // assumed to be "normal" device pointers.
  // If 'use_multiple_streams' is true, tensors may be defined and used on
  // multiple streams and so se::Events must be defined and waited for. If
  // 'use_multiple_streams' is true, 'allocate_xla_tensors' must also be true
  // because we track inter-stream dependencies through events inside XlaTensor
  // objects.
  //
  // NOTE(review): `client` and `xla_allocator` are taken as raw pointers;
  // presumably they are not owned and must outlive this object -- confirm
  // against the definition.
  XlaComputationLaunchContext(xla::LocalClient* client,
                              xla::DeviceMemoryAllocator* xla_allocator,
                              bool allocate_xla_tensors,
                              bool use_multiple_streams);

  // Builds a XlaCompiler::Argument vector from the arguments to an XlaLaunch
  // op.
  //
  // `constant_args` and `variable_args` are keyed by int; presumably this is
  // the TensorFlow argument index, as for `variables` in PopulateInputs() --
  // confirm against the definition.  The result is written to `*args`.
  static Status BuildXlaCompilerArguments(
      const std::map<int, Tensor>& constant_args,
      const std::map<int, OptionalTensor>& variable_args, OpKernelContext* ctx,
      std::vector<XlaCompiler::Argument>* args);

  // Add all inputs within `ctx` as XLA arguments (returned by arguments()).
  // `variables` is a map from TensorFlow argument number to resource variable.
  //
  // Assumes that the first `missing_ctx_input_prefix` inputs to the kernel are
  // missing and adjusts input indices accordingly.  All elements in kernel's
  // input_mapping must be greater than or equal to `missing_ctx_input_prefix`
  // (in other words, no inputs actually required by the kernel can be missing).
  void PopulateInputs(OpKernelContext* ctx,
                      const XlaCompiler::CompilationResult* kernel,
                      const std::map<int, OptionalTensor>& variables,
                      int missing_ctx_input_prefix);

  // Given the XLA output in `output`, populate all outputs of `ctx`.  Also
  // writes out the resource variable updates.
  //
  // Updates to all resource variables are written in a single atomic operation.
  // This models *->Write dependencies between resource variable operations.
  // See jit/resource_operation_safety_analysis for details.
  //
  // Assumes that the first `missing_ctx_input_prefix` inputs to the kernel are
  // missing and adjusts input indices accordingly.
  //
  // `output` is taken by value, so the caller hands the ScopedShapedBuffer
  // over to this call.
  Status PopulateOutputs(OpKernelContext* ctx,
                         const XlaCompiler::CompilationResult* kernel,
                         xla::ScopedShapedBuffer output,
                         int missing_ctx_input_prefix);

  // Return the argument list. Only valid after PopulateInputs() has been
  // called.
  const std::vector<xla::ShapedBuffer*>& arguments() const { return arg_ptrs_; }

 private:
  // Construction parameters.  NOTE(review): presumably stored verbatim by the
  // constructor -- confirm against the definition.
  xla::LocalClient* client_;
  xla::DeviceMemoryAllocator* xla_allocator_;
  bool allocate_xla_tensors_;
  bool use_multiple_streams_;

  // `arg_ptrs_` is what arguments() exposes.  NOTE(review): `arg_buffers_`
  // presumably owns the ShapedBuffers that `arg_ptrs_` refers to -- confirm
  // against PopulateInputs().
  std::vector<std::unique_ptr<xla::ShapedBuffer>> arg_buffers_;
  std::vector<xla::ShapedBuffer*> arg_ptrs_;
};
195 
196 // A simple TensorBuffer implementation that allows us to create Tensors that
197 // take ownership of pre-allocated memory.
198 class XlaTensorBuffer : public TensorBuffer {
199  public:
XlaTensorBuffer(const void * ptr,size_t expected_size,size_t actual_size,Allocator * allocator)200   XlaTensorBuffer(const void* ptr, size_t expected_size, size_t actual_size,
201                   Allocator* allocator)
202       : TensorBuffer(const_cast<void*>(ptr)),
203         expected_size_(expected_size),
204         actual_size_(actual_size),
205         allocator_(allocator) {}
206 
~XlaTensorBuffer()207   ~XlaTensorBuffer() override {
208     if (data()) {
209       allocator_->DeallocateRaw(data());
210     }
211   }
212 
size()213   size_t size() const override { return expected_size_; }
214 
root_buffer()215   TensorBuffer* root_buffer() override { return this; }
216 
FillAllocationDescription(AllocationDescription * proto)217   void FillAllocationDescription(AllocationDescription* proto) const override {
218     proto->set_allocated_bytes(actual_size_);
219   }
220 
MakeTensor(DataType dtype,const TensorShape & shape,se::DeviceMemoryBase buffer,Allocator * allocator)221   static Tensor MakeTensor(DataType dtype, const TensorShape& shape,
222                            se::DeviceMemoryBase buffer, Allocator* allocator) {
223     size_t expected_size = shape.num_elements() * DataTypeSize(dtype);
224     auto* tensor_buffer = new XlaTensorBuffer(buffer.opaque(), expected_size,
225                                               buffer.size(), allocator);
226     Tensor t(dtype, shape, tensor_buffer);
227     tensor_buffer->Unref();
228     return t;
229   }
230 
231  private:
232   size_t expected_size_;
233   size_t actual_size_;
234   Allocator* allocator_;
235 };
236 
237 }  // namespace tensorflow
238 
239 #endif  // TENSORFLOW_COMPILER_JIT_XLA_LAUNCH_UTIL_H_
240