1 /* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
2 
3 Licensed under the Apache License, Version 2.0 (the "License");
4 you may not use this file except in compliance with the License.
5 You may obtain a copy of the License at
6 
7     http://www.apache.org/licenses/LICENSE-2.0
8 
9 Unless required by applicable law or agreed to in writing, software
10 distributed under the License is distributed on an "AS IS" BASIS,
11 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 See the License for the specific language governing permissions and
13 limitations under the License.
14 ==============================================================================*/
15 
16 #include <iterator>
17 #include <memory>
18 #include <vector>
19 
20 #include "tensorflow/core/common_runtime/device_mgr.h"
21 #include "tensorflow/core/common_runtime/local_device.h"
22 #include "tensorflow/core/framework/device_attributes.pb.h"
23 #include "tensorflow/core/lib/core/errors.h"
24 #include "tensorflow/core/platform/logging.h"
25 #include "tensorflow/core/util/device_name_utils.h"
26 
27 namespace tensorflow {
28 
DynamicDeviceMgr()29 DynamicDeviceMgr::DynamicDeviceMgr() : cpu_device_(nullptr) {}
30 
~DynamicDeviceMgr()31 DynamicDeviceMgr::~DynamicDeviceMgr() {
32   // Release resources ahead of destroying the device manager as the resource
33   // destructors (e.g. ~IteratorResource) assume devices still exist.
34   for (auto& pair : dynamic_devices_) {
35     pair.first->ClearResourceMgr();
36   }
37 }
38 
ListDeviceAttributes(std::vector<DeviceAttributes> * devices) const39 void DynamicDeviceMgr::ListDeviceAttributes(
40     std::vector<DeviceAttributes>* devices) const {
41   tf_shared_lock l(devices_mu_);
42   devices->reserve(dynamic_devices_.size());
43   for (const auto& pair : dynamic_devices_) {
44     devices->emplace_back(pair.first->attributes());
45   }
46 }
47 
ListDevices() const48 std::vector<Device*> DynamicDeviceMgr::ListDevices() const {
49   tf_shared_lock l(devices_mu_);
50   std::vector<Device*> devices;
51   devices.reserve(dynamic_devices_.size());
52   for (const auto& pair : dynamic_devices_) {
53     devices.emplace_back(pair.first);
54   }
55   return devices;
56 }
57 
DebugString() const58 string DynamicDeviceMgr::DebugString() const {
59   string out;
60   tf_shared_lock l(devices_mu_);
61   for (const auto& pair : dynamic_devices_) {
62     strings::StrAppend(&out, pair.first->name(), "\n");
63   }
64   return out;
65 }
66 
DeviceMappingString() const67 string DynamicDeviceMgr::DeviceMappingString() const {
68   string out;
69   tf_shared_lock l(devices_mu_);
70   for (const auto& pair : dynamic_devices_) {
71     if (!pair.first->attributes().physical_device_desc().empty()) {
72       strings::StrAppend(&out, pair.first->name(), " -> ",
73                          pair.first->attributes().physical_device_desc(), "\n");
74     }
75   }
76   return out;
77 }
78 
LookupDevice(StringPiece name,Device ** device) const79 Status DynamicDeviceMgr::LookupDevice(StringPiece name, Device** device) const {
80   tf_shared_lock l(devices_mu_);
81   auto iter = device_map_.find(string(name));
82   if (iter == device_map_.end()) {
83     std::vector<StringPiece> device_names;
84     for (auto&& itr : device_map_) {
85       device_names.push_back(itr.first);
86     }
87     VLOG(1) << "Unknown device: " << name
88             << " all devices: " << absl::StrJoin(device_names, ", ");
89     return errors::InvalidArgument(name, " unknown device.");
90   }
91   *device = iter->second;
92   return Status::OK();
93 }
94 
ContainsDevice(int64 device_incarnation) const95 bool DynamicDeviceMgr::ContainsDevice(int64 device_incarnation) const {
96   tf_shared_lock l(devices_mu_);
97   return device_incarnation_set_.contains(device_incarnation);
98 }
99 
ClearContainers(gtl::ArraySlice<string> containers) const100 void DynamicDeviceMgr::ClearContainers(
101     gtl::ArraySlice<string> containers) const {
102   Status s;
103   tf_shared_lock l(devices_mu_);
104   for (const auto& pair : dynamic_devices_) {
105     if (containers.empty()) {
106       s.Update(pair.first->resource_manager()->Cleanup(
107           pair.first->resource_manager()->default_container()));
108     } else {
109       for (const string& c : containers) {
110         s.Update(pair.first->resource_manager()->Cleanup(c));
111       }
112     }
113     if (!s.ok()) {
114       LOG(WARNING) << s;
115     }
116   }
117 }
118 
NumDeviceType(const string & type) const119 int DynamicDeviceMgr::NumDeviceType(const string& type) const {
120   tf_shared_lock l(devices_mu_);
121   auto iter = device_type_counts_.find(type);
122   if (iter != device_type_counts_.end()) return iter->second;
123   return 0;
124 }
125 
AddDevices(std::vector<std::unique_ptr<Device>> devices)126 Status DynamicDeviceMgr::AddDevices(
127     std::vector<std::unique_ptr<Device>> devices) {
128   mutex_lock l(devices_mu_);
129   for (auto& d : devices) {
130     if (device_map_.find(d->name()) != device_map_.end()) {
131       return errors::InvalidArgument(
132           "Trying to add device ", d->name(),
133           " to manager but its name conflicts with an existing device.");
134     }
135     // Register under the (1) full name and (2) canonical name.
136     for (const string& name :
137          DeviceNameUtils::GetNamesForDeviceMappings(d->parsed_name())) {
138       device_map_[name] = d.get();
139     }
140     // Register under the (3) local name and (4) legacy local name.
141     for (const string& name :
142          DeviceNameUtils::GetLocalNamesForDeviceMappings(d->parsed_name())) {
143       device_map_[name] = d.get();
144     }
145     device_type_counts_[d->device_type()]++;
146     device_incarnation_set_.insert(d->attributes().incarnation());
147     dynamic_devices_.emplace(d.get(), std::move(d));
148   }
149   return Status::OK();
150 }
151 
RemoveDevices(std::vector<Device * > devices)152 Status DynamicDeviceMgr::RemoveDevices(std::vector<Device*> devices) {
153   mutex_lock l(devices_mu_);
154 
155   for (const auto& d : devices) {
156     if (d == cpu_device_) {
157       TF_RETURN_IF_ERROR(
158           errors::InvalidArgument("Can not remove HostCPU device ", d->name()));
159     }
160     auto it = dynamic_devices_.find(d);
161     if (it == dynamic_devices_.end()) {
162       TF_RETURN_IF_ERROR(errors::InvalidArgument("Unknown device ", d->name()));
163     }
164   }
165 
166   for (const auto& d : devices) {
167     auto it = dynamic_devices_.find(d);
168 
169     // Clear registration of (1) full name and (2) canonical name
170     for (const string& name :
171          DeviceNameUtils::GetNamesForDeviceMappings(d->parsed_name())) {
172       device_map_.erase(name);
173     }
174     // Clear registration of (3) local name and (4) legacy local name
175     for (const string& name :
176          DeviceNameUtils::GetLocalNamesForDeviceMappings(d->parsed_name())) {
177       device_map_.erase(name);
178     }
179     device_type_counts_[d->device_type()]--;
180     device_incarnation_set_.erase(d->attributes().incarnation());
181     stale_devices_.add(std::move(it->second));
182     dynamic_devices_.erase(it);
183   }
184   return Status::OK();
185 }
186 
RemoveDevicesByName(const std::vector<string> & device_names)187 Status DynamicDeviceMgr::RemoveDevicesByName(
188     const std::vector<string>& device_names) {
189   std::vector<Device*> devices_to_remove;
190   for (const string& name : device_names) {
191     Device* device;
192     TF_RETURN_IF_ERROR(LookupDevice(name, &device));
193     devices_to_remove.emplace_back(device);
194   }
195   return RemoveDevices(devices_to_remove);
196 }
197 
HostCPU() const198 Device* DynamicDeviceMgr::HostCPU() const {
199   mutex_lock l(devices_mu_);
200   if (dynamic_devices_.find(cpu_device_) != dynamic_devices_.end()) {
201     return cpu_device_;
202   }
203   cpu_device_ = nullptr;
204   for (const auto& pair : dynamic_devices_) {
205     if (pair.first->device_type() == DEVICE_CPU &&
206         pair.first->parsed_name().id == 0) {
207       cpu_device_ = pair.first;
208       break;
209     }
210   }
211   return cpu_device_;
212 }
213 
214 }  // namespace tensorflow
215