1 /* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
2 
3 Licensed under the Apache License, Version 2.0 (the "License");
4 you may not use this file except in compliance with the License.
5 You may obtain a copy of the License at
6 
7     http://www.apache.org/licenses/LICENSE-2.0
8 
9 Unless required by applicable law or agreed to in writing, software
10 distributed under the License is distributed on an "AS IS" BASIS,
11 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 See the License for the specific language governing permissions and
13 limitations under the License.
14 ==============================================================================*/
15 
16 // This header declares functions which may be called by the generated code on
17 // the CPU. Calls to these functions must be resolved explicitly in the JIT in
18 // xla::cpu::SimpleResolver.  It also defines a per-CpuExecutable context
19 // which is used to cache expensive state and resources utilized by the
20 // aforementioned functions.
21 //
22 // Other functions are declared in individual libraries as well, such as
23 // runtime_conv2d and runtime_matmul. As individual libraries, callers for
24 // ahead-of-time compilation can link only the required subset.
25 
26 #ifndef TENSORFLOW_COMPILER_XLA_SERVICE_CPU_CPU_RUNTIME_H_
27 #define TENSORFLOW_COMPILER_XLA_SERVICE_CPU_CPU_RUNTIME_H_
28 
29 #include "tensorflow/compiler/xla/executable_run_options.h"
30 #include "tensorflow/compiler/xla/service/cpu/xfeed_manager.h"
31 #include "tensorflow/compiler/xla/types.h"
32 
33 namespace xla {
34 namespace cpu {
35 namespace runtime {
36 
37 // Names of runtime functions; the constants are only declared here. These
38 // names get resolved from the generated code to the right symbol at link time
39 // in one of two ways:
40 // 1. When using the JIT, the symbol resolver (SimpleResolver in
41 //    third_party/tensorflow/compiler/xla/service/cpu/simple_orc_jit.cc) maps
42 //    this symbol name to the actual symbol.
43 // 2. When using ahead-of-time compilation, the linker can resolve the name
44 //    because it is a symbol in the cpu_runtime library.
45 extern const char* const kEigenMatMulF16SymbolName;
46 extern const char* const kEigenMatMulF32SymbolName;
47 extern const char* const kEigenMatMulF64SymbolName;
48 extern const char* const kMKLConvF32SymbolName;
49 extern const char* const kMKLMatMulF32SymbolName;
50 extern const char* const kMKLMatMulF64SymbolName;
51 extern const char* const kMKLSingleThreadedMatMulF32SymbolName;
52 extern const char* const kMKLSingleThreadedMatMulF64SymbolName;
53 extern const char* const kEigenConvF16SymbolName;
54 extern const char* const kEigenConvF32SymbolName;
55 extern const char* const kEigenFftSymbolName;
56 extern const char* const kEigenSingleThreadedFftSymbolName;
57 extern const char* const kEigenSingleThreadedMatMulF16SymbolName;
58 extern const char* const kEigenSingleThreadedMatMulF32SymbolName;
59 extern const char* const kEigenSingleThreadedMatMulF64SymbolName;
60 extern const char* const kEigenSingleThreadedConvF16SymbolName;
61 extern const char* const kEigenSingleThreadedConvF32SymbolName;
62 extern const char* const kAcquireInfeedBufferForDequeueSymbolName;
63 extern const char* const kReleaseInfeedBufferAfterDequeueSymbolName;
64 extern const char* const kAcquireOutfeedBufferForPopulationSymbolName;
65 extern const char* const kReleaseOutfeedBufferAfterPopulationSymbolName;
66 extern const char* const kParallelForkJoinSymbolName;
67 extern const char* const kKeyValueSortSymbolName;
68 
69 // Prefix that the symbol name of every XLA CPU runtime function needs to
70 // start with. Only declared here; its value is defined elsewhere.
71 extern const char* const kXlaCpuRuntimeSymbolNamePrefix;
72 
73 // Returns the xfeed manager used by the CPU runtime for the CPU device
74 // `device_ordinal`. The ordinal does not name a CPU. TODO: say what it names.
75 XfeedManager* GetXfeedManager(int device_ordinal);
76 
77 }  // namespace runtime
78 }  // namespace cpu
79 }  // namespace xla
80 
81 extern "C" {
82 
83 // Some things common to all of the runtime entry points below:
84 //
85 //  * The shape pointer and shape_length reflect values that can be deserialized
86 //    via llvm_ir::DecodeSelfDescribingShapeConstant. This is the way we pass
87 //    reified type information from the generated program to the runtime, which
88 //    helps check the type safety and contract for the emitted-code/runtime
89 //    communication.
90 //
91 //  * run_options is used to look up the device ordinal for the stream executor
92 //    we're executing under.  If it is null the device ordinal is assumed to be
93 //    0 (this behavior helps in writing tests).
94 
95 // Note: in the runtime entry points below, the shape pointer and shape_length
96 // reflect values that can be deserialized via
97 // llvm_ir::DecodeSelfDescribingShapeConstant. This is the way we pass reified
98 // type information from the generated program to the runtime, which helps check
99 // the type safety and contract for the emitted-code/runtime communication.
100 
101 // Blocks until the next infeed buffer is ready to be dequeued, then returns
102 // it. Fails catastrophically if the next enqueued buffer is not of the correct
103 // length in bytes. Checking the shape rather than the length would be more
104 // exact, but the length check is a tradeoff between error checking and
105 // speed/simplicity. `shape` is what the other entry points call `shape_ptr`.
106 extern void* __xla_cpu_runtime_AcquireInfeedBufferForDequeue(
107     const xla::ExecutableRunOptions* run_options, xla::int32 buffer_length,
108     const void* shape, xla::int32 shape_length);
109 
110 // Relinquishes the next infeed buffer that was returned by
111 // __xla_cpu_runtime_AcquireInfeedBufferForDequeue. Once this call
112 // completes, the data at buffer_ptr may no longer be accessed.
113 // buffer_length must match the length passed to the call to
114 // __xla_cpu_runtime_AcquireInfeedBufferForDequeue that returned
115 // buffer_ptr. This function must be called before the next buffer is
116 // acquired, i.e., there may only be one outstanding infeed buffer in
117 // use by the runtime.
118 // TODO(b/31340454) investigate whether or not it is worth supporting
119 // zero-copy infeed where the buffer is retained by the compiled code
120 // until it has been used. If zero-copy infeed is implemented we will add
121 // support for multiple outstanding buffers that can be returned out of order.
122 extern void __xla_cpu_runtime_ReleaseInfeedBufferAfterDequeue(
123     const xla::ExecutableRunOptions* run_options, xla::int32 buffer_length,
124     void* buffer_ptr, const void* shape_ptr, xla::int32 shape_length);
125 
126 // Blocks until the next outfeed buffer is available to be populated, then
127 // returns it. Pair with __xla_cpu_runtime_ReleaseOutfeedBufferAfterPopulation.
128 extern void* __xla_cpu_runtime_AcquireOutfeedBufferForPopulation(
129     const xla::ExecutableRunOptions* run_options, xla::int32 buffer_length,
130     const void* shape_ptr, xla::int32 shape_length);
131 
132 // Relinquishes the outfeed buffer after it has been populated.
133 // buffer_ptr must have been previously returned by
134 // __xla_cpu_runtime_AcquireOutfeedBufferForPopulation.
135 // Once this call completes, buffer_ptr may no longer be accessed.
136 // buffer_length must match the length passed to the call to
137 // __xla_cpu_runtime_AcquireOutfeedBufferForPopulation that returned
138 // buffer_ptr. This function must be called before the next buffer is
139 // acquired, i.e., there may only be one outstanding outfeed buffer in
140 // use by the runtime.
141 extern void __xla_cpu_runtime_ReleaseOutfeedBufferAfterPopulation(
142     const xla::ExecutableRunOptions* run_options, xla::int32 buffer_length,
143     void* buffer_ptr, const void* shape_ptr, xla::int32 shape_length);
144 
145 }  // extern "C"
146 
147 #endif  // TENSORFLOW_COMPILER_XLA_SERVICE_CPU_CPU_RUNTIME_H_
148