1 /* Copyright 2015 The TensorFlow Authors. All Rights Reserved.
2 
3 Licensed under the Apache License, Version 2.0 (the "License");
4 you may not use this file except in compliance with the License.
5 You may obtain a copy of the License at
6 
7     http://www.apache.org/licenses/LICENSE-2.0
8 
9 Unless required by applicable law or agreed to in writing, software
10 distributed under the License is distributed on an "AS IS" BASIS,
11 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 See the License for the specific language governing permissions and
13 limitations under the License.
14 ==============================================================================*/
15 
16 // Kernel-loader specs are structures that describe how to load a data-parallel
17 // kernel on a given platform for subsequent launching. Headers that instantiate
18 // these data structures will typically be auto-generated. However, users can
19 // also instantiate them by hand.
20 //
21 // A kernel with the same exact functionality and type signature may be
22 // implemented on several different platforms. Typical usage is to create a
23 // singleton that describes how to load a kernel on the various supported
24 // platforms:
25 //
//  static const MultiKernelLoaderSpec &SaxpySpec() {
//    static auto *mkls =
//        (new MultiKernelLoaderSpec{4 /* = arity */})
//            ->AddCudaPtxOnDisk(ptx_file_path, ptx_kernelname)
//            ->AddOpenCLTextOnDisk(opencl_text_file_path, ocl_kernelname);
//
//    return *mkls;
//  }
35 //
// This lazily instantiates an object that describes how to load CUDA PTX
// present on disk that implements saxpy for the CUDA platform, or OpenCL text
// present on disk that implements saxpy for an OpenCL-based platform. The
// CudaPtxOnDisk and OpenCLTextOnDisk objects are subtypes of
// KernelLoaderSpec -- KernelLoaderSpec describes how to load a kernel for
// subsequent launching on a single platform.
42 //
43 // For the loader functionality that accepts these KernelLoaderSpecs in order
44 // to grab the kernel appropriately, see StreamExecutor::GetKernel().
45 
46 #ifndef TENSORFLOW_STREAM_EXECUTOR_KERNEL_SPEC_H_
47 #define TENSORFLOW_STREAM_EXECUTOR_KERNEL_SPEC_H_
48 
49 #include <stddef.h>
50 
51 #include <map>
52 #include <memory>
53 
54 #include "absl/strings/string_view.h"
55 #include "absl/synchronization/mutex.h"
56 #include "tensorflow/stream_executor/platform/logging.h"
57 #include "tensorflow/stream_executor/platform/port.h"
58 
59 namespace stream_executor {
60 
61 // Describes how to load a kernel on a target platform.
62 //
63 // This is an abstract base class, subclassed for specific platforms.
64 // The filename_or_text field represents the program location (i.e. PTX or
65 // OpenCL loadable translation unit path) and is simply stored; whether it is a
66 // filename or text is exposed via more specifically named accessors in
67 // subclasses.
68 //
69 // These kernel loader specifications are typically auto-generated into header
70 // files at build time, but can also be specified manually.
71 class KernelLoaderSpec {
72  public:
~KernelLoaderSpec()73   virtual ~KernelLoaderSpec() {}
74 
75   // Returns the kernel name to load out of the program.
kernelname()76   const std::string &kernelname() const { return kernelname_; }
77 
78  protected:
79   explicit KernelLoaderSpec(absl::string_view kernelname);
80 
81  private:
82   // The kernel name that should be loaded out of the program description given
83   // above.
84   std::string kernelname_;
85 
86   SE_DISALLOW_COPY_AND_ASSIGN(KernelLoaderSpec);
87 };
88 
89 // An abstract kernel loader spec that has an associated file path, where
90 // there's a canonical suffix for the filename; e.g. see CudaPtxOnDisk whose
91 // canonical filename suffix is ".ptx".
92 class OnDiskKernelLoaderSpec : public KernelLoaderSpec {
93  public:
~OnDiskKernelLoaderSpec()94   ~OnDiskKernelLoaderSpec() override {}
95 
96   // Returns the path to the on-disk loadable kernel file.
filename()97   const std::string &filename() const { return filename_; }
98 
99   // Returns the canonical suffix for this on-disk kernel loader spec format;
100   // e.g. PTX files on disk have a canonical suffix of ".ptx".
101   virtual const char *CanonicalSuffix() const = 0;
102 
103  protected:
104   OnDiskKernelLoaderSpec(absl::string_view filename,
105                          absl::string_view kernelname);
106 
107   std::string filename_;
108 
109  private:
110   SE_DISALLOW_COPY_AND_ASSIGN(OnDiskKernelLoaderSpec);
111 };
112 
113 // Kernel loader specification for PTX text that resides on disk.
114 class CudaPtxOnDisk : public OnDiskKernelLoaderSpec {
115  public:
116   CudaPtxOnDisk(absl::string_view filename, absl::string_view kernelname);
~CudaPtxOnDisk()117   ~CudaPtxOnDisk() override {}
118 
CanonicalSuffix()119   const char *CanonicalSuffix() const override { return ".ptx"; }
120 
121  private:
122   SE_DISALLOW_COPY_AND_ASSIGN(CudaPtxOnDisk);
123 };
124 
125 // Kernel loader specification for CUBIN binary that resides on disk.
126 class CudaCubinOnDisk : public OnDiskKernelLoaderSpec {
127  public:
128   CudaCubinOnDisk(absl::string_view filename, absl::string_view kernelname);
~CudaCubinOnDisk()129   ~CudaCubinOnDisk() override {}
130 
filename()131   const std::string &filename() const { return filename_; }
132 
CanonicalSuffix()133   const char *CanonicalSuffix() const override { return ".cubin"; }
134 
135  private:
136   std::string filename_;
137 
138   SE_DISALLOW_COPY_AND_ASSIGN(CudaCubinOnDisk);
139 };
140 
141 // Kernel loader specification for PTX text that resides in memory.
142 class CudaPtxInMemory : public KernelLoaderSpec {
143  public:
144   // Components: compute capability major number, compute capability minor
145   // number, and PTX source.
146   typedef std::tuple<int, int, absl::string_view> PtxSpec;
147 
148   // Single-PTX constructor. Adds the provided PTX version with an unknown
149   // compute capability. Since the CC is unknown, the PTX is assumed to be very
150   // generally usable - in other words, PTX specified in this manner is VERY
151   // likely to be used as the default! Note that the PTX can be compressed,
152   // which is indicated by the argument ptx_compressed.
153   //
154   // Warning: the string backing the provided absl::string_view ptx must outlive
155   // this instance.
156   CudaPtxInMemory(absl::string_view ptx, absl::string_view kernelname,
157                   bool ptx_compressed = false);
158 
159   // Multiple-PTX-version constructor. Adds each item in spec_list to this
160   // object. Note that the PTX can be compressed, which is indicated by the
161   // argument ptx_compressed.
162   CudaPtxInMemory(const std::initializer_list<PtxSpec> &spec_list,
163                   absl::string_view kernel_name, bool ptx_compressed = false);
~CudaPtxInMemory()164   ~CudaPtxInMemory() override {}
165 
166   // Add the PTX implementation described by ptx_spec to this object. On
167   // collision (i.e., if a version with the same compute_capability already
168   // exists), the existing implementation will be overwritten.
169   void AddSpec(PtxSpec ptx_spec);
170 
171   // Returns pointer to the ptx of available implementation with the
172   // lowest-valued compute capability. For example, if PTX written to CC2.0,
173   // 3.0, and 3.5 are all available, the version for CC2.0 will be set. Returns
174   // nullptr on failed lookup (if any version is not available).
175   // When the ptx is compressed, returns the decompressed ptx.
176   const char *default_text() const;
177 
178   // Similar to default_text().
179   // When the ptx is compressed, returns the decompressed ptx.
180   const char *original_default_text() const;
181 
182   // Returns pointer to the ptx for the requested compute capability.
183   // Returns nullptr on failed lookup (if the requested version is not
184   // available).
185   // When the ptx is compressed, returns the decompressed ptx.
186   const char *text(int compute_capability_major,
187                    int compute_capability_minor) const;
188 
189   // Similar to text().
190   // When the ptx is compressed, returns the original compressed ptx.
191   const char *original_text(int compute_capability_major,
192                             int compute_capability_minor) const;
193 
194   // Decompresses the PTX string using bzip2.
195   static std::string DecompressPtx(const char *ptx);
196 
197  private:
198   // PTX translation unit text contents in memory. The key is of as a tuple
199   // "<cc_major>,<cc_minor>", i.e., "2,0", "3,0", "3,5". Because CC's
200   // represented in this way have a clear sorting order, map::begin() will give
201   // the lowest-numbered version available, i.e. the default.
202   std::map<std::tuple<int, int>, const char *,
203            bool (*)(const std::tuple<int, int> &, const std::tuple<int, int> &)>
204       ptx_by_compute_capability_;
205 
206   // Stores all decompressed ptx strings, with original ptx string as keys.
207   // It is marked as mutable for lazy decompression.
208   mutable std::map<const char *, std::string> decompressed_ptx_;
209   mutable absl::Mutex mu_;
210 
211   // Defines the minimum compute capability possible. Used when PTX has no
212   // compute capability specified (in the single-PTX constructor).
213   static const std::tuple<int, int> kMinimumCapability;
214 
215   SE_DISALLOW_COPY_AND_ASSIGN(CudaPtxInMemory);
216 };
217 
218 // Kernel loader specification for OpenCL text that resides on disk.
219 class OpenCLTextOnDisk : public OnDiskKernelLoaderSpec {
220  public:
221   OpenCLTextOnDisk(absl::string_view filename, absl::string_view kernelname);
~OpenCLTextOnDisk()222   ~OpenCLTextOnDisk() override {}
223 
CanonicalSuffix()224   const char *CanonicalSuffix() const override { return ".ocl"; }
225 
226  private:
227   SE_DISALLOW_COPY_AND_ASSIGN(OpenCLTextOnDisk);
228 };
229 
230 // Kernel loader specification for OpenCL binary that resides on disk.
231 class OpenCLBinaryOnDisk : public OnDiskKernelLoaderSpec {
232  public:
233   OpenCLBinaryOnDisk(absl::string_view filename, absl::string_view kernelname);
~OpenCLBinaryOnDisk()234   ~OpenCLBinaryOnDisk() override {}
235 
CanonicalSuffix()236   const char *CanonicalSuffix() const override { return ".aocx"; }
237 
238  private:
239   SE_DISALLOW_COPY_AND_ASSIGN(OpenCLBinaryOnDisk);
240 };
241 
242 // Kernel loader specification for OpenCL text that resides in memory.
243 class OpenCLTextInMemory : public KernelLoaderSpec {
244  public:
245   OpenCLTextInMemory(absl::string_view text, absl::string_view kernelname);
~OpenCLTextInMemory()246   ~OpenCLTextInMemory() override {}
247 
248   // Returns the OpenCL text contents.
text()249   const std::string &text() const { return text_; }
250 
251  private:
252   // OpenCL translation unit text contents in memory.
253   std::string text_;
254 
255   SE_DISALLOW_COPY_AND_ASSIGN(OpenCLTextInMemory);
256 };
257 
258 // Kernel loader specification for a CUBIN blob that resides in memory.
259 class CudaCubinInMemory : public KernelLoaderSpec {
260  public:
261   CudaCubinInMemory(const char *bytes, absl::string_view kernelname);
~CudaCubinInMemory()262   ~CudaCubinInMemory() override {}
263 
bytes()264   const char *bytes() const { return bytes_; }
265 
266  private:
267   const char *bytes_;
268 
269   SE_DISALLOW_COPY_AND_ASSIGN(CudaCubinInMemory);
270 };
271 
272 // Describes how to load a kernel on any subset of a number of target platforms.
273 class MultiKernelLoaderSpec {
274  public:
275   explicit MultiKernelLoaderSpec(size_t arity);
276 
277   // Returns the number of arguments that this kernel accepts.
arity()278   size_t arity() const { return arity_; }
279 
280   // Convenience getters for testing whether these platform variants have
281   // kernel loader specifications available.
has_cuda_ptx_on_disk()282   bool has_cuda_ptx_on_disk() const { return cuda_ptx_on_disk_ != nullptr; }
has_cuda_cubin_on_disk()283   bool has_cuda_cubin_on_disk() const { return cuda_cubin_on_disk_ != nullptr; }
has_cuda_cubin_in_memory()284   bool has_cuda_cubin_in_memory() const {
285     return cuda_cubin_in_memory_ != nullptr;
286   }
has_cuda_ptx_in_memory()287   bool has_cuda_ptx_in_memory() const { return cuda_ptx_in_memory_ != nullptr; }
has_ocl_text_on_disk()288   bool has_ocl_text_on_disk() const { return ocl_text_on_disk_ != nullptr; }
has_ocl_binary_on_disk()289   bool has_ocl_binary_on_disk() const { return ocl_binary_on_disk_ != nullptr; }
has_ocl_text_in_memory()290   bool has_ocl_text_in_memory() const { return ocl_text_in_memory_ != nullptr; }
291 
292   // Accessors for platform variant kernel load specifications.
293   // Precondition: corresponding has_* is true.
cuda_ptx_on_disk()294   const CudaPtxOnDisk &cuda_ptx_on_disk() const {
295     CHECK(has_cuda_ptx_on_disk());
296     return *cuda_ptx_on_disk_;
297   }
cuda_cubin_on_disk()298   const CudaCubinOnDisk &cuda_cubin_on_disk() const {
299     CHECK(has_cuda_cubin_on_disk());
300     return *cuda_cubin_on_disk_;
301   }
cuda_cubin_in_memory()302   const CudaCubinInMemory &cuda_cubin_in_memory() const {
303     CHECK(has_cuda_cubin_in_memory());
304     return *cuda_cubin_in_memory_;
305   }
cuda_ptx_in_memory()306   const CudaPtxInMemory &cuda_ptx_in_memory() const {
307     CHECK(has_cuda_ptx_in_memory());
308     return *cuda_ptx_in_memory_;
309   }
ocl_text_on_disk()310   const OpenCLTextOnDisk &ocl_text_on_disk() const {
311     CHECK(has_ocl_text_on_disk());
312     return *ocl_text_on_disk_;
313   }
ocl_binary_on_disk()314   const OpenCLBinaryOnDisk &ocl_binary_on_disk() const {
315     CHECK(has_ocl_binary_on_disk());
316     return *ocl_binary_on_disk_;
317   }
ocl_text_in_memory()318   const OpenCLTextInMemory &ocl_text_in_memory() const {
319     CHECK(has_ocl_text_in_memory());
320     return *ocl_text_in_memory_;
321   }
322 
323   // Builder-pattern-like methods for use in initializing a
324   // MultiKernelLoaderSpec. Each of these should be used at most once for a
325   // single MultiKernelLoaderSpec object. See file comment for example usage.
326   //
327   // Note that the kernelname parameter must be consistent with the kernel in
328   // the PTX or OpenCL being loaded. Also be aware that in CUDA C++ the kernel
329   // name may be mangled by the compiler if it is not declared in an
330   // extern "C" scope.
331   MultiKernelLoaderSpec *AddOpenCLTextOnDisk(absl::string_view filename,
332                                              absl::string_view kernelname);
333   MultiKernelLoaderSpec *AddOpenCLBinaryOnDisk(absl::string_view filename,
334                                                absl::string_view kernelname);
335   MultiKernelLoaderSpec *AddOpenCLTextInMemory(absl::string_view ocl_text,
336                                                absl::string_view kernelname);
337   MultiKernelLoaderSpec *AddCudaPtxOnDisk(absl::string_view filename,
338                                           absl::string_view kernelname);
339   MultiKernelLoaderSpec *AddCudaCubinOnDisk(absl::string_view filename,
340                                             absl::string_view kernelname);
341   MultiKernelLoaderSpec *AddCudaCubinInMemory(const char *cubin_bytes,
342                                               absl::string_view kernelname);
343   MultiKernelLoaderSpec *AddCudaPtxInMemory(absl::string_view ptx,
344                                             absl::string_view kernelname);
345   MultiKernelLoaderSpec *AddCudaCompressedPtxInMemory(
346       absl::string_view ptx, absl::string_view kernelname);
347   MultiKernelLoaderSpec *AddCudaPtxInMemory(
348       std::initializer_list<CudaPtxInMemory::PtxSpec> spec_list,
349       absl::string_view kernelname);
350   MultiKernelLoaderSpec *AddCudaCompressedPtxInMemory(
351       std::initializer_list<CudaPtxInMemory::PtxSpec> spec_list,
352       absl::string_view kernelname);
353 
354  private:
355   std::unique_ptr<CudaPtxOnDisk>
356       cuda_ptx_on_disk_;  // PTX text that resides in a file.
357   std::unique_ptr<CudaCubinOnDisk>
358       cuda_cubin_on_disk_;  // Binary CUDA program in a file.
359   std::unique_ptr<CudaCubinInMemory>
360       cuda_cubin_in_memory_;  // Binary CUDA program in memory.
361   std::unique_ptr<CudaPtxInMemory>
362       cuda_ptx_in_memory_;  // PTX text that resides in memory.
363   std::unique_ptr<OpenCLTextOnDisk>
364       ocl_text_on_disk_;  // OpenCL text that resides on disk.
365   std::unique_ptr<OpenCLBinaryOnDisk>
366       ocl_binary_on_disk_;  // OpenCL binary that resides on disk.
367   std::unique_ptr<OpenCLTextInMemory>
368       ocl_text_in_memory_;  // OpenCL text that resides in memory.
369 
370   // Number of parameters that the kernel takes. (This is nicer to have in a
371   // constexpr than having to determine it from the types via template
372   // metaprogramming).
373   size_t arity_;
374 };
375 
376 }  // namespace stream_executor
377 
378 #endif  // TENSORFLOW_STREAM_EXECUTOR_KERNEL_SPEC_H_
379