1 /* Copyright 2015 The TensorFlow Authors. All Rights Reserved.
2 
3 Licensed under the Apache License, Version 2.0 (the "License");
4 you may not use this file except in compliance with the License.
5 You may obtain a copy of the License at
6 
7     http://www.apache.org/licenses/LICENSE-2.0
8 
9 Unless required by applicable law or agreed to in writing, software
10 distributed under the License is distributed on an "AS IS" BASIS,
11 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 See the License for the specific language governing permissions and
13 limitations under the License.
14 ==============================================================================*/
15 
16 #ifndef TENSORFLOW_CORE_FRAMEWORK_DEVICE_BASE_H_
17 #define TENSORFLOW_CORE_FRAMEWORK_DEVICE_BASE_H_
18 
19 #include <memory>
20 #include <string>
21 #include <vector>
22 
23 #include "absl/base/macros.h"
24 #include "tensorflow/core/framework/tensor.h"
25 #include "tensorflow/core/lib/core/errors.h"
26 #include "tensorflow/core/lib/core/refcount.h"
27 #include "tensorflow/core/lib/core/status.h"
28 #include "tensorflow/core/lib/core/stringpiece.h"
29 #include "tensorflow/core/platform/logging.h"
30 
// Forward declarations of the Eigen device types that this header refers to
// only through pointers.
namespace Eigen {
struct ThreadPoolDevice;
#ifdef TENSORFLOW_USE_SYCL
// Declared only when TENSORFLOW_USE_SYCL is defined.
struct SyclDevice;
#endif
}  // end namespace Eigen
37 
// Forward declaration; this header only uses stream_executor::Stream through
// pointers.
namespace stream_executor {
class Stream;
}  // namespace stream_executor
41 
42 namespace tensorflow {
43 
// Forward declarations of TensorFlow types that this header names only
// through pointers or references.
class Device;
class DeviceAttributes;
class Env;
class EventMgr;
class OpKernelContext;
class ResourceMgr;
class ScopedAllocatorMgr;
class TensorProto;

namespace thread {
// Used via pointer for the worker-thread and device thread-pool handles.
class ThreadPool;
}
56 
57 // A wrapper for an Eigen Gpu Device that includes per-op state. The
58 // class is defined even for non-GPU devices since the
59 // OpKernelContext::Params structure wants to fill it in.
60 class PerOpGpuDevice {
61  public:
~PerOpGpuDevice()62   virtual ~PerOpGpuDevice() {}
63   virtual const Eigen::GpuDevice& device() const = 0;
64 };
65 
66 // A class that devices can subclass to pass around
67 // Device-specific context to OpKernels.
68 class DeviceContext : public core::RefCounted {
69  public:
~DeviceContext()70   ~DeviceContext() override {}
stream()71   virtual stream_executor::Stream* stream() const { return nullptr; }
MaintainLifetimeOnStream(const Tensor * t,stream_executor::Stream * stream)72   virtual void MaintainLifetimeOnStream(const Tensor* t,
73                                         stream_executor::Stream* stream) const {
74   }
75 
76   // "cpu_tensor" is a tensor on a CPU. Copies "cpu_tensor" into
77   // "device_tensor" which is on a GPU device "device". "device_tensor"
78   // must be allocated to be of the same size as "cpu_tensor".
CopyCPUTensorToDevice(const Tensor * cpu_tensor,Device * device,Tensor * device_tensor,StatusCallback done)79   virtual void CopyCPUTensorToDevice(const Tensor* cpu_tensor, Device* device,
80                                      Tensor* device_tensor,
81                                      StatusCallback done) const {
82     done(errors::Internal("Unrecognized device type in CPU-to-device Copy"));
83   }
84 
85   // Copies a tensor in this device.
CopyTensorInSameDevice(const Tensor * input_tensor,Device * device,Tensor * output_tensor,StatusCallback done)86   virtual void CopyTensorInSameDevice(const Tensor* input_tensor,
87                                       Device* device, Tensor* output_tensor,
88                                       StatusCallback done) const {
89     done(errors::Unimplemented("Copy in same device not implemented."));
90   }
91 
92   // "device_tensor" is a tensor on a non-CPU device.  Copies
93   // device_tensor into "cpu_tensor".  "cpu_tensor" must be allocated
94   // to be of the same size as "device_tensor".
CopyDeviceTensorToCPU(const Tensor * device_tensor,StringPiece tensor_name,Device * device,Tensor * cpu_tensor,StatusCallback done)95   virtual void CopyDeviceTensorToCPU(const Tensor* device_tensor,
96                                      StringPiece tensor_name, Device* device,
97                                      Tensor* cpu_tensor, StatusCallback done) {
98     done(errors::Internal("Unrecognized device type in device-to-CPU Copy"));
99   }
100 
101   // If possible, wait for all events on *stream to complete then execute func.
102   // A non-OK Status is returned otherwise.  The stream argument should be the
103   // one provided by GpuDeviceInfo.  This function is not applicable to devices
104   // that don't provide such a value.
ThenExecute(Device * device,stream_executor::Stream * stream,std::function<void ()> func)105   virtual Status ThenExecute(Device* device, stream_executor::Stream* stream,
106                              std::function<void()> func) {
107     return errors::Internal("ThenExecute not supported by device");
108   }
109 };
110 
111 // map[i] is the DeviceContext* for the node with id i, if i < map.size().
112 typedef std::vector<DeviceContext*> DeviceContextMap;
113 
114 class DeviceBase {
115  public:
DeviceBase(Env * env)116   explicit DeviceBase(Env* env) : env_(env) {}
117   virtual ~DeviceBase();
118 
env()119   Env* env() const { return env_; }
120 
121   // Override this to return true for devices that require an Op's
122   // compute method to save references to the temporary tensors it
123   // allocates until the Op execution completes
RequiresRecordingAccessedTensors()124   virtual bool RequiresRecordingAccessedTensors() const { return false; }
125 
126   struct CpuWorkerThreads {
127     int num_threads = 0;
128     thread::ThreadPool* workers = nullptr;
129   };
130 
131   // Does not take ownership.
set_tensorflow_cpu_worker_threads(CpuWorkerThreads * t)132   void set_tensorflow_cpu_worker_threads(CpuWorkerThreads* t) {
133     cpu_worker_threads_ = t;
134   }
135 
tensorflow_cpu_worker_threads()136   virtual const CpuWorkerThreads* tensorflow_cpu_worker_threads() const {
137     CHECK(cpu_worker_threads_ != nullptr);
138     return cpu_worker_threads_;
139   }
140 
141   // "stream" is used in special circumstances (such as the
142   // constructors of Ops) where there is no available OpKernelContext.
143   // "default_context" is used by OpKernelContext whenever a device does not
144   // supply a DeviceContext for an op in FillContextMap (e.g. when only
145   // using a single stream.)
146   // "event_mgr" is used to delay deallocation of temporary GPU buffers.
147   // TODO(pbar) Work out how to move this out of DeviceBase.
148   // GpuDeviceInfo name is an unfortunate legacy, it is used not only by GPUs
149   // but also by TPU devices (to provide default device context).
150   struct GpuDeviceInfo {
151     // Make sure all the defaults are NULL, so we can spot missing assignments.
152     stream_executor::Stream* stream = nullptr;
153     DeviceContext* default_context = nullptr;
154     EventMgr* event_mgr = nullptr;
155     int gpu_id = -1;
156   };
157 
158   // Does not take ownership.
set_tensorflow_gpu_device_info(GpuDeviceInfo * g)159   void set_tensorflow_gpu_device_info(GpuDeviceInfo* g) {
160     gpu_device_info_ = g;
161   }
162 
tensorflow_gpu_device_info()163   virtual const GpuDeviceInfo* tensorflow_gpu_device_info() const {
164     return gpu_device_info_;
165   }
166 
167   // The preferred thread pool for this device. If it is nullptr, the system
168   // automatically assigns a thread pool for execution.
tensorflow_device_thread_pool()169   virtual thread::ThreadPool* tensorflow_device_thread_pool() {
170     return device_thread_pool_;
171   }
172 
173   // Does not take ownership.
174   void set_eigen_cpu_device(Eigen::ThreadPoolDevice* d);
175 
176 #ifdef TENSORFLOW_USE_SYCL
set_eigen_sycl_device(Eigen::SyclDevice * d)177   void set_eigen_sycl_device(Eigen::SyclDevice* d) { eigen_sycl_device_ = d; }
178 #endif
179 
180   // Return the Allocator implementation to use based on the allocator
181   // attributes requested.  See allocator.h for more details.
GetAllocator(AllocatorAttributes)182   virtual Allocator* GetAllocator(AllocatorAttributes /*attr*/) {
183     LOG(FATAL) << "GetAllocator() is not implemented.";
184     return nullptr;
185   }
186 
187   // This method is provided for backwards compatibility, and will be removed
188   // in a future release.
189   ABSL_DEPRECATED("Use `this->GetAllocator()` or `this->GetScopedAllocator()`.")
GetStepAllocator(AllocatorAttributes attr,ResourceMgr *)190   Allocator* GetStepAllocator(AllocatorAttributes attr, ResourceMgr*) {
191     return GetAllocator(attr);
192   }
193 
194   // Return an Allocator prepared for use in particular places by graph
195   // optimization
GetScopedAllocator(AllocatorAttributes attr,int64 step_id)196   virtual Allocator* GetScopedAllocator(AllocatorAttributes attr,
197                                         int64 step_id) {
198     LOG(FATAL) << "Device does not implement GetScopedAllocator()";
199     return nullptr;
200   }
201 
GetScopedAllocatorMgr()202   virtual ScopedAllocatorMgr* GetScopedAllocatorMgr() const { return nullptr; }
203 
has_eigen_cpu_device()204   bool has_eigen_cpu_device() const { return !eigen_cpu_devices_.empty(); }
205 
206   virtual const Eigen::ThreadPoolDevice* eigen_cpu_device();
207 
208 #ifdef TENSORFLOW_USE_SYCL
eigen_sycl_device()209   virtual const Eigen::SyclDevice* eigen_sycl_device() const {
210     CHECK(eigen_sycl_device_ != nullptr);
211     return eigen_sycl_device_;
212   }
213 #endif
214 
215   // Caller owns the return value. The OpKernelContext calls this even
216   // for devices that do not implement an eigen_gpu_device. Overridden
217   // by GPU devices to return a derived type.
MakeGpuDevice()218   virtual PerOpGpuDevice* MakeGpuDevice() { return nullptr; }
219 
UnderlyingDevice()220   virtual DeviceBase* UnderlyingDevice() { return this; }
UnderlyingDevice()221   virtual const DeviceBase* UnderlyingDevice() const { return this; }
222 
223   // This is overridden by GPU devices to reinitialize the derived
224   // type returned by MakeGpuDevice.
ReinitializeGpuDevice(OpKernelContext *,PerOpGpuDevice *,DeviceContext *,Allocator *)225   virtual Status ReinitializeGpuDevice(OpKernelContext* /*context*/,
226                                        PerOpGpuDevice* /*device*/,
227                                        DeviceContext* /*dc*/,
228                                        Allocator* /*allocator*/) {
229     return Status::OK();
230   }
231 
232   // Unimplemented by default
233   virtual const DeviceAttributes& attributes() const;
234   virtual const string& name() const;
235 
236   // Materializes the given TensorProto into 'tensor' stored in Device
237   // memory.  Most devices will want to override this.
238   //
239   // TODO(vrv): We should be able to put this function into
240   // OpKernelContext and handle the copies from device memory via send
241   // and receive nodes, instead of requiring that each device handle
242   // the copies here as well as in copy ops.
MakeTensorFromProto(const TensorProto & tensor_proto,const AllocatorAttributes alloc_attrs,Tensor * tensor)243   virtual Status MakeTensorFromProto(const TensorProto& tensor_proto,
244                                      const AllocatorAttributes alloc_attrs,
245                                      Tensor* tensor) {
246     return errors::Internal("Device does not implement MakeTensorFromProto()");
247   }
248 
249   // Some devices (i.e. GPUs) may free device memory prior to its actual use
250   // being completed on the assumption that subsequent allocations can only be
251   // used serially with respect to pending uses.  If this function returns a
252   // non-zero value it is the value of a device-specific counter such that any
253   // device memory tagged with an earlier freed-at count is really unencumbered
254   // by pending uses.  For this to be useful the device memory allocator must
255   // be tagging deallocated memory chunks using the same counter.
SafeAllocFrontier()256   virtual uint64 SafeAllocFrontier() { return 0; }
257 
258  protected:
259   // Does not take ownership.
set_tensorflow_device_thread_pool(thread::ThreadPool * thread_pool)260   void set_tensorflow_device_thread_pool(thread::ThreadPool* thread_pool) {
261     device_thread_pool_ = thread_pool;
262   }
263 
264  private:
265   Env* const env_;
266   CpuWorkerThreads* cpu_worker_threads_ = nullptr;
267   // Set by GPUs as well as by TPU devices.
268   GpuDeviceInfo* gpu_device_info_ = nullptr;
269   thread::ThreadPool* device_thread_pool_ = nullptr;
270   std::vector<Eigen::ThreadPoolDevice*> eigen_cpu_devices_;
271 #ifdef TENSORFLOW_USE_SYCL
272   Eigen::SyclDevice* eigen_sycl_device_ = nullptr;
273 #endif
274 };
275 
276 }  // namespace tensorflow
277 
278 #endif  // TENSORFLOW_CORE_FRAMEWORK_DEVICE_BASE_H_
279