1 /* Copyright 2015 The TensorFlow Authors. All Rights Reserved.
2 
3 Licensed under the Apache License, Version 2.0 (the "License");
4 you may not use this file except in compliance with the License.
5 You may obtain a copy of the License at
6 
7     http://www.apache.org/licenses/LICENSE-2.0
8 
9 Unless required by applicable law or agreed to in writing, software
10 distributed under the License is distributed on an "AS IS" BASIS,
11 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 See the License for the specific language governing permissions and
13 limitations under the License.
14 ==============================================================================*/
15 
16 #ifndef TENSORFLOW_CORE_FRAMEWORK_DEVICE_BASE_H_
17 #define TENSORFLOW_CORE_FRAMEWORK_DEVICE_BASE_H_
18 
19 #include <memory>
20 #include <string>
21 #include <vector>
22 
23 #include "absl/base/macros.h"
24 #include "absl/strings/string_view.h"
25 #include "tensorflow/core/framework/device_attributes.pb.h"
26 #include "tensorflow/core/framework/tensor.h"
27 #include "tensorflow/core/lib/core/errors.h"
28 #include "tensorflow/core/lib/core/refcount.h"
29 #include "tensorflow/core/lib/core/status.h"
30 #include "tensorflow/core/lib/core/stringpiece.h"
31 #include "tensorflow/core/platform/logging.h"
32 
// Forward declarations of types that live outside the tensorflow namespace.
// This header only names them through pointers, so the declarations suffice.

// Eigen's thread-pool backed device.
namespace Eigen {
struct ThreadPoolDevice;
}  // namespace Eigen

// StreamExecutor's stream type.
namespace stream_executor {
class Stream;
}  // namespace stream_executor
40 
41 namespace tensorflow {
42 
// Forward declarations of TensorFlow types that the classes in this header
// mention in their signatures or members.
class Device;
class DeviceAttributes;
class Env;
class EventMgr;
class OpKernelContext;
class ResourceMgr;
class ScopedAllocatorMgr;
class TensorProto;

namespace thread {
class ThreadPool;
}  // namespace thread
55 
56 // A wrapper for an Eigen Gpu Device that includes per-op state. The
57 // class is defined even for non-GPU devices since the
58 // OpKernelContext::Params structure wants to fill it in.
59 class PerOpGpuDevice {
60  public:
~PerOpGpuDevice()61   virtual ~PerOpGpuDevice() {}
62   virtual const Eigen::GpuDevice& device() const = 0;
63 };
64 
65 // A class that devices can subclass to pass around
66 // Device-specific context to OpKernels.
67 class DeviceContext : public core::RefCounted {
68  public:
~DeviceContext()69   ~DeviceContext() override {}
stream()70   virtual stream_executor::Stream* stream() const { return nullptr; }
MaintainLifetimeOnStream(const Tensor * t,stream_executor::Stream * stream)71   virtual void MaintainLifetimeOnStream(const Tensor* t,
72                                         stream_executor::Stream* stream) const {
73   }
74 
75   // "cpu_tensor" is a tensor on a CPU. Copies "cpu_tensor" into
76   // "device_tensor" which is on a non-CPU device "device". "device_tensor"
77   // must be allocated to be of the same size as "cpu_tensor".
78   virtual void CopyCPUTensorToDevice(const Tensor* cpu_tensor, Device* device,
79                                      Tensor* device_tensor, StatusCallback done,
80                                      bool sync_dst_compute = true) const {
81     done(errors::Internal("Unrecognized device type in CPU-to-device Copy"));
82   }
83 
84   // Same as CopyCPUTensorToDevice, but in a synchronous way.
85   Status CopyCPUTensorToDeviceSync(const Tensor* cpu_tensor, Device* device,
86                                    Tensor* device_tensor) const;
87 
88   // Copies a tensor in this device.
CopyTensorInSameDevice(const Tensor * input_tensor,Device * device,Tensor * output_tensor,StatusCallback done)89   virtual void CopyTensorInSameDevice(const Tensor* input_tensor,
90                                       Device* device, Tensor* output_tensor,
91                                       StatusCallback done) const {
92     done(errors::Unimplemented("Copy in same device not implemented."));
93   }
94 
95   // "device_tensor" is a tensor on a non-CPU device.  Copies
96   // device_tensor into "cpu_tensor".  "cpu_tensor" must be allocated
97   // to be of the same size as "device_tensor".
CopyDeviceTensorToCPU(const Tensor * device_tensor,StringPiece tensor_name,Device * device,Tensor * cpu_tensor,StatusCallback done)98   virtual void CopyDeviceTensorToCPU(const Tensor* device_tensor,
99                                      StringPiece tensor_name, Device* device,
100                                      Tensor* cpu_tensor, StatusCallback done) {
101     done(errors::Internal("Unrecognized device type in device-to-CPU Copy"));
102   }
103 
104   // Same as `CopyDeviceTensorToCPU`, but blocks until the copy is done.
105   Status CopyDeviceTensorToCPUSync(const Tensor* device_tensor,
106                                    StringPiece tensor_name, Device* device,
107                                    Tensor* cpu_tensor);
108 
109   // If possible, wait for all events on *stream to complete then execute func.
110   // A non-OK Status is returned otherwise.  The stream argument should be the
111   // one provided by GpuDeviceInfo.  This function is not applicable to devices
112   // that don't provide such a value.
ThenExecute(Device * device,stream_executor::Stream * stream,std::function<void ()> func)113   virtual Status ThenExecute(Device* device, stream_executor::Stream* stream,
114                              std::function<void()> func) {
115     return errors::Internal("ThenExecute not supported by device");
116   }
117 
118   // check if device is a pluggable device
IsPluggableDevice()119   virtual bool IsPluggableDevice() { return false; }
120 };
121 
122 class DeviceBase {
123  public:
DeviceBase(Env * env)124   explicit DeviceBase(Env* env) : env_(env) {}
125   virtual ~DeviceBase();
126 
env()127   Env* env() const { return env_; }
128 
129   struct CpuWorkerThreads {
130     int num_threads = 0;
131     thread::ThreadPool* workers = nullptr;
132   };
133 
134   // Does not take ownership.
set_tensorflow_cpu_worker_threads(CpuWorkerThreads * t)135   void set_tensorflow_cpu_worker_threads(CpuWorkerThreads* t) {
136     cpu_worker_threads_ = t;
137   }
138 
tensorflow_cpu_worker_threads()139   virtual const CpuWorkerThreads* tensorflow_cpu_worker_threads() const {
140     CHECK(cpu_worker_threads_ != nullptr);
141     return cpu_worker_threads_;
142   }
143 
144   // "stream" is used in special circumstances (such as the
145   // constructors of Ops) where there is no available OpKernelContext.
146   // "default_context" is used by OpKernelContext whenever a device does not
147   // supply a DeviceContext for an op in TryGetDeviceContext() (e.g. when only
148   // using a single stream.)
149   // "event_mgr" is used to delay deallocation of temporary GPU buffers.
150   // TODO(pbar) Work out how to move this out of DeviceBase.
151   // GpuDeviceInfo name is an unfortunate legacy, it is used not only by GPUs
152   // but also by TPU devices (to provide default device context).
153   struct GpuDeviceInfo {
154     // Make sure all the defaults are NULL, so we can spot missing assignments.
155     stream_executor::Stream* stream = nullptr;
156     DeviceContext* default_context = nullptr;
157     EventMgr* event_mgr = nullptr;
158     int gpu_id = -1;
159   };
160 
161   // Does not take ownership.
set_tensorflow_gpu_device_info(GpuDeviceInfo * g)162   void set_tensorflow_gpu_device_info(GpuDeviceInfo* g) {
163     gpu_device_info_ = g;
164   }
165 
tensorflow_gpu_device_info()166   virtual const GpuDeviceInfo* tensorflow_gpu_device_info() const {
167     return gpu_device_info_;
168   }
169 
170   // The preferred thread pool for this device. If it is nullptr, the system
171   // automatically assigns a thread pool for execution.
tensorflow_device_thread_pool()172   virtual thread::ThreadPool* tensorflow_device_thread_pool() {
173     return device_thread_pool_;
174   }
175 
176   // Does not take ownership.
177   void set_eigen_cpu_device(Eigen::ThreadPoolDevice* d);
178 
179   // Return the Allocator implementation to use based on the allocator
180   // attributes requested.  See allocator.h for more details.
GetAllocator(AllocatorAttributes)181   virtual Allocator* GetAllocator(AllocatorAttributes /*attr*/) {
182     LOG(FATAL) << "GetAllocator() is not implemented.";
183     return nullptr;
184   }
185 
186   // This method is provided for backwards compatibility, and will be removed
187   // in a future release.
188   ABSL_DEPRECATED("Use `this->GetAllocator()` or `this->GetScopedAllocator()`.")
GetStepAllocator(AllocatorAttributes attr,ResourceMgr *)189   Allocator* GetStepAllocator(AllocatorAttributes attr, ResourceMgr*) {
190     return GetAllocator(attr);
191   }
192 
193   // Return an Allocator prepared for use in particular places by graph
194   // optimization
GetScopedAllocator(AllocatorAttributes attr,int64 step_id)195   virtual Allocator* GetScopedAllocator(AllocatorAttributes attr,
196                                         int64 step_id) {
197     LOG(FATAL) << "Device does not implement GetScopedAllocator()";
198     return nullptr;
199   }
200 
GetScopedAllocatorMgr()201   virtual ScopedAllocatorMgr* GetScopedAllocatorMgr() const { return nullptr; }
202 
has_eigen_cpu_device()203   virtual bool has_eigen_cpu_device() const {
204     return !eigen_cpu_devices_.empty();
205   }
206 
207   virtual const Eigen::ThreadPoolDevice* eigen_cpu_device();
208 
209   // Caller owns the return value. The OpKernelContext calls this even
210   // for devices that do not implement an eigen_gpu_device. Overridden
211   // by GPU devices to return a derived type.
MakeGpuDevice()212   virtual PerOpGpuDevice* MakeGpuDevice() { return nullptr; }
213 
UnderlyingDevice()214   virtual DeviceBase* UnderlyingDevice() { return this; }
UnderlyingDevice()215   virtual const DeviceBase* UnderlyingDevice() const { return this; }
216 
217   // This is overridden by GPU devices to reinitialize the derived
218   // type returned by MakeGpuDevice.
ReinitializeGpuDevice(OpKernelContext *,PerOpGpuDevice *,DeviceContext *,Allocator *)219   virtual Status ReinitializeGpuDevice(OpKernelContext* /*context*/,
220                                        PerOpGpuDevice* /*device*/,
221                                        DeviceContext* /*dc*/,
222                                        Allocator* /*allocator*/) {
223     return Status::OK();
224   }
225 
226   // Unimplemented by default
227   virtual const DeviceAttributes& attributes() const;
NumaNode()228   virtual int NumaNode() const { return attributes().locality().numa_node(); }
229   virtual const std::string& name() const;
230 
231   // Materializes the given TensorProto into 'tensor' stored in Device
232   // memory.  Most devices will want to override this.
233   //
234   // TODO(vrv): We should be able to put this function into
235   // OpKernelContext and handle the copies from device memory via send
236   // and receive nodes, instead of requiring that each device handle
237   // the copies here as well as in copy ops.
MakeTensorFromProto(const TensorProto & tensor_proto,const AllocatorAttributes alloc_attrs,Tensor * tensor)238   virtual Status MakeTensorFromProto(const TensorProto& tensor_proto,
239                                      const AllocatorAttributes alloc_attrs,
240                                      Tensor* tensor) {
241     return errors::Internal("Device does not implement MakeTensorFromProto()");
242   }
243 
244   // Some devices (i.e. GPUs) may free device memory prior to its actual use
245   // being completed on the assumption that subsequent allocations can only be
246   // used serially with respect to pending uses.  If this function returns a
247   // non-zero value it is the value of a device-specific counter such that any
248   // device memory tagged with an earlier freed-at count is really unencumbered
249   // by pending uses.  For this to be useful the device memory allocator must
250   // be tagging deallocated memory chunks using the same counter.
SafeAllocFrontier(uint64 old_value)251   virtual uint64 SafeAllocFrontier(uint64 old_value) { return 0; }
252 
253   // Copies `input_tensor` to `output_tensor`, where both tensors are on this
254   // device. This function assumes that `output_tensor` has already been
255   // allocated with a buffer that is large enough to hold `input_tensor`'s data.
256   // Calls `done` from a device-specific thread after copy is finished, which
257   // may be the same as calling thread.
258   //
259   // NOTE(ayushd): This function is for TensorFlow internal use only.  Deep copy
260   // is discouraged and should not be used in OpKernels.
CopyTensorInSameDevice(const Tensor * input_tensor,Tensor * output_tensor,const DeviceContext * device_context,StatusCallback done)261   virtual void CopyTensorInSameDevice(const Tensor* input_tensor,
262                                       Tensor* output_tensor,
263                                       const DeviceContext* device_context,
264                                       StatusCallback done) {
265     done(errors::Internal("Device ", name(), " does not implement ",
266                           "CopyTensorInSameDevice"));
267   }
268 
269  protected:
270   // Does not take ownership.
set_tensorflow_device_thread_pool(thread::ThreadPool * thread_pool)271   void set_tensorflow_device_thread_pool(thread::ThreadPool* thread_pool) {
272     device_thread_pool_ = thread_pool;
273   }
274 
275  private:
276   Env* const env_;
277   CpuWorkerThreads* cpu_worker_threads_ = nullptr;
278   // Set by GPUs as well as by TPU devices.
279   GpuDeviceInfo* gpu_device_info_ = nullptr;
280   thread::ThreadPool* device_thread_pool_ = nullptr;
281   std::vector<Eigen::ThreadPoolDevice*> eigen_cpu_devices_;
282 };
283 
284 // Methods to create and check for Symbolic execution devices.
285 // Such devices are mostly used for TF-XLA bridge. TF should not treat these as
286 // normal devices.
287 void AddSymbolicExecutionDevice(absl::string_view device_name);
288 bool IsSymbolicExecutionDevice(absl::string_view device_name);
289 
290 }  // namespace tensorflow
291 
292 #endif  // TENSORFLOW_CORE_FRAMEWORK_DEVICE_BASE_H_
293