1 /* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
2 
3 Licensed under the Apache License, Version 2.0 (the "License");
4 you may not use this file except in compliance with the License.
5 You may obtain a copy of the License at
6 
7     http://www.apache.org/licenses/LICENSE-2.0
8 
9 Unless required by applicable law or agreed to in writing, software
10 distributed under the License is distributed on an "AS IS" BASIS,
11 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 See the License for the specific language governing permissions and
13 limitations under the License.
14 ==============================================================================*/
15 
16 #ifndef TENSORFLOW_COMPILER_XLA_SERVICE_GPU_THUNK_H_
17 #define TENSORFLOW_COMPILER_XLA_SERVICE_GPU_THUNK_H_
18 
19 #include <memory>
20 #include <vector>
21 
22 #include "tensorflow/compiler/xla/service/gpu/buffer_allocations.h"
23 #include "tensorflow/compiler/xla/service/hlo_instruction.h"
24 #include "tensorflow/core/lib/core/status.h"
25 #include "tensorflow/core/platform/stream_executor_no_cuda.h"
26 
27 namespace xla {
28 namespace gpu {
29 
30 class GpuExecutable;
31 
32 // Thunk acts as the bridge between IrEmitter and GpuExecutable. It stores the
33 // metadata IrEmitter generates for GpuExecutable to invoke an HloInstruction.
34 //
35 // Thunk provides the Initialize and ExecuteOnStream interface for GpuExecutable
36 // to initialize and execute the invocation respectively. Its subclasses are
37 // supposed to override these interfaces to launch a generated kernel or call an
38 // external library function (such as operations in cuBLAS).
39 //
40 // This is thread-compatible.
41 class Thunk {
42  public:
43   enum class Kind {
44     kConditional,
45     kConvolution,
46     kCopy,
47     kCudnnBatchNormBackward,
48     kCudnnBatchNormForwardInference,
49     kCudnnBatchNormForwardTraining,
50     kFft,
51     kGemm,
52     kInfeed,
53     kKernel,
54     kSequential,
55     kTuple,
56     kWhile,
57   };
58 
59   // The hlo_instruction argument is meant to be the instruction this thunk was
60   // generated from, but Thunk never uses this argument other than to save it
61   // to Thunk::hlo_instruction, so it can be null.
Thunk(Kind kind,const HloInstruction * hlo_instruction)62   explicit Thunk(Kind kind, const HloInstruction* hlo_instruction)
63       : kind_(kind), hlo_instruction_(hlo_instruction) {}
~Thunk()64   virtual ~Thunk() {}
65   Thunk(const Thunk&) = delete;
66   Thunk& operator=(const Thunk&) = delete;
67 
kind()68   Kind kind() const { return kind_; }
hlo_instruction()69   const HloInstruction* hlo_instruction() const { return hlo_instruction_; }
70 
71   // Prepares for executing the thunk. This method is called only once over
72   // Thunk's lifetime. For example, KernelThunk::Initialize loads the PTX of a
73   // kernel, which is the same in every execution.
Initialize(const GpuExecutable & executable)74   virtual tensorflow::Status Initialize(const GpuExecutable& executable) {
75     return tensorflow::Status::OK();
76   }
77 
78   // Users of Thunk should call ShouldHaltAllActivityBeforeRunning(stream)
79   // before calling ExecuteOnStream(stream).  If it returns true, it's the
80   // user's responsibility to wait for all activity on the GPU to finish before
81   // calling ExecuteOnStream.
82   //
83   // This value is not required to be constant for a given Thunk.  For example,
84   // a Thunk that performs autotuning may return true for its first run and
85   // false thereafter.
ShouldHaltAllActivityBeforeRunning(perftools::gputools::Stream *)86   virtual bool ShouldHaltAllActivityBeforeRunning(
87       perftools::gputools::Stream* /*stream*/) {
88     return false;
89   }
90 
91   // Indicates whether thunks scheduled after this one should wait for this one
92   // to complete before running. For example, a convolution thunk creates a
93   // scratch allocator, then kicks off a convolution in cudnn via the stream
94   // executor. When the stream executor call returns, the scratch allocator goes
95   // out of scope, and the scratch memory is deallocated. In this case, the
96   // convolution thunk needs to return true so that future thunks wait for the
97   // convolution thunk to avoid reusing the deallocated memory until the
98   // convolution thunk is done with it.
ShouldBlockFutureThunks()99   virtual bool ShouldBlockFutureThunks() { return false; }
100 
101   // Execute the kernel for the thunk on the given stream. This method must be
102   // called after Initialize and can be called multiple times over Thunk's
103   // lifetime. Stream argument must be non-null.
104   virtual tensorflow::Status ExecuteOnStream(
105       const BufferAllocations& buffer_allocations,
106       perftools::gputools::Stream* stream) = 0;
107 
108  private:
109   Kind kind_;
110   const HloInstruction* hlo_instruction_;
111 };
112 
113 // A sequence of thunks.
114 using ThunkSequence = std::vector<std::unique_ptr<Thunk>>;
115 
116 }  // namespace gpu
117 }  // namespace xla
118 
119 #endif  // TENSORFLOW_COMPILER_XLA_SERVICE_GPU_THUNK_H_
120