1 /* Copyright 2015 The TensorFlow Authors. All Rights Reserved.
2
3 Licensed under the Apache License, Version 2.0 (the "License");
4 you may not use this file except in compliance with the License.
5 You may obtain a copy of the License at
6
7 http://www.apache.org/licenses/LICENSE-2.0
8
9 Unless required by applicable law or agreed to in writing, software
10 distributed under the License is distributed on an "AS IS" BASIS,
11 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 See the License for the specific language governing permissions and
13 limitations under the License.
14 ==============================================================================*/
15
16 #include "tensorflow/core/common_runtime/placer.h"
17
18 #include <memory>
19 #include <vector>
20
21 #include "tensorflow/core/common_runtime/colocation_graph.h"
22 #include "tensorflow/core/common_runtime/device.h"
23 #include "tensorflow/core/framework/attr_value_util.h"
24 #include "tensorflow/core/framework/device_attributes.pb.h"
25 #include "tensorflow/core/framework/function.h"
26 #include "tensorflow/core/framework/graph.pb.h"
27 #include "tensorflow/core/framework/types.h"
28 #include "tensorflow/core/framework/types.pb.h"
29 #include "tensorflow/core/graph/graph_node_util.h"
30 #include "tensorflow/core/lib/core/errors.h"
31 #include "tensorflow/core/util/dump_graph.h"
32 #include "tensorflow/core/util/port.h"
33
34 namespace tensorflow {
35
36 namespace {
37
38 struct NameCounts {
39 mutex counts_mutex;
40 std::unordered_map<string, int> counts;
41 };
42
MakeUniqueFilename(string name)43 string MakeUniqueFilename(string name) {
44 static NameCounts& instance = *new NameCounts;
45
46 // Remove illegal characters from `name`.
47 for (int i = 0; i < name.size(); ++i) {
48 char ch = name[i];
49 if (ch == '/' || ch == '[' || ch == ']' || ch == '*' || ch == '?') {
50 name[i] = '_';
51 }
52 }
53
54 int count;
55 {
56 mutex_lock lock(instance.counts_mutex);
57 count = instance.counts[name]++;
58 }
59
60 string filename = name;
61 if (count > 0) {
62 absl::StrAppend(&filename, "_", count);
63 }
64 absl::StrAppend(&filename, ".txt");
65 return filename;
66 }
67
GetFileName(string base_name,string * fname)68 Status GetFileName(string base_name, string* fname) {
69 const char* dir = nullptr;
70 dir = getenv("TF_DUMP_GRAPH_PREFIX");
71 if (!dir) {
72 return errors::Internal("Failed to get the directory for ", base_name,
73 " because dump location is not specified through "
74 "TF_DUMP_GRAPH_PREFIX environment variable");
75 }
76 base_name = MakeUniqueFilename(base_name);
77 *fname = absl::StrCat(dir, "/", base_name);
78 return Status::OK();
79 }
80
DumpColocationGraph(const string & base_name,const ColocationGraph & colocation_graph)81 void DumpColocationGraph(const string& base_name,
82 const ColocationGraph& colocation_graph) {
83 string fname;
84 Status status = GetFileName(base_name, &fname);
85 if (status.ok()) {
86 status = WriteStringToFile(Env::Default(), fname,
87 colocation_graph.DebugString());
88 if (status.ok()) {
89 LOG(INFO) << "Wrote ColocationGraph to " << fname;
90 }
91 }
92 if (!status.ok()) {
93 LOG(ERROR) << "Failed to write final colocation graph to file " << fname
94 << " with " << status.ToString();
95 }
96 }
97
98 // Returns true if the node has no inputs and produces outputs
99 // that are consumed by a single node.
100 //
101 // TODO(vrv): Currently this handles only nodes with one output, but
102 // this could be extended to handle the case where a node has many
103 // outputs that are connected to nodes in the same colocation group.
IsGeneratorNode(const Node * node)104 bool IsGeneratorNode(const Node* node) {
105 return node->num_inputs() == 0 && node->num_outputs() == 1 &&
106 !IsRefType(node->output_type(0));
107 }
108
LogDeviceAssignment(const Node * node,bool log_device_placement)109 void LogDeviceAssignment(const Node* node, bool log_device_placement) {
110 // Log placement if log_device_placement is set.
111 if (log_device_placement) {
112 printf("%s: (%s): %s\n", node->name().c_str(), node->type_string().c_str(),
113 node->assigned_device_name().c_str());
114 LOG(INFO) << node->name() << ": "
115 << "(" << node->type_string()
116 << "): " << node->assigned_device_name();
117 }
118 }
119
AssignAndLog(int assigned_device,Node * node,ColocationGraph * colocation_graph,bool log_device_placement)120 Status AssignAndLog(int assigned_device, Node* node,
121 ColocationGraph* colocation_graph,
122 bool log_device_placement) {
123 node->set_assigned_device_name_index(assigned_device);
124
125 // Constraint the group of node to the assigned device.
126 TF_RETURN_IF_ERROR(colocation_graph->LimitToAssignedDevice(*node));
127
128 LogDeviceAssignment(node, log_device_placement);
129 return Status::OK();
130 }
131
132 } // namespace
133
Placer(Graph * graph,const string & function_name,const FunctionLibraryDefinition * flib_def,const DeviceSet * devices,const Device * default_local_device,bool allow_soft_placement,bool log_device_placement)134 Placer::Placer(Graph* graph, const string& function_name,
135 const FunctionLibraryDefinition* flib_def,
136 const DeviceSet* devices, const Device* default_local_device,
137 bool allow_soft_placement, bool log_device_placement)
138 : graph_(graph),
139 function_name_(function_name),
140 flib_def_(flib_def),
141 devices_(devices),
142 default_local_device_(default_local_device),
143 allow_soft_placement_(allow_soft_placement),
144 log_device_placement_(log_device_placement) {}
145
Placer(Graph * graph,const string & function_name,const DeviceSet * devices,const Device * default_local_device)146 Placer::Placer(Graph* graph, const string& function_name,
147 const DeviceSet* devices, const Device* default_local_device)
148 : Placer(graph, function_name, &graph->flib_def(), devices,
149 default_local_device, true, false) {}
150
Placer(Graph * graph,const string & function_name,const DeviceSet * devices)151 Placer::Placer(Graph* graph, const string& function_name,
152 const DeviceSet* devices)
153 : Placer(graph, function_name, &graph->flib_def(), devices, nullptr, true,
154 false) {}
155
~Placer()156 Placer::~Placer() {}
157
Run()158 Status Placer::Run() {
159 if (devices_->devices().empty()) {
160 return errors::FailedPrecondition("No devices are registered");
161 }
162
163 if (VLOG_IS_ON(3)) {
164 DumpGraphToFile("placer_input", *graph_, nullptr);
165 }
166 if (VLOG_IS_ON(5)) {
167 for (const Node* node : graph_->op_nodes()) {
168 VLOG(5) << " " << node->name() << ": requested: '"
169 << node->requested_device() << "' assigned: '"
170 << node->assigned_device_name() << "'";
171 }
172 }
173
174 FunctionStack stack(function_name_);
175 ColocationGraph colocation_graph(graph_, stack, flib_def_, devices_,
176 default_local_device_, allow_soft_placement_,
177 log_device_placement_);
178
179 TF_RETURN_IF_ERROR(colocation_graph.Initialize());
180
181 // For each node, assign a device based on the constraints in the disjoint
182 // node set.
183 std::vector<Node*> second_pass;
184 for (Node* node : graph_->op_nodes()) {
185 // The graph may have come pre-populated by the framework with assigned
186 // devices (e.g., for stateful placements), so the placer should not try to
187 // place nodes that are already placed.
188 if (node->has_assigned_device_name()) {
189 TF_RETURN_IF_ERROR(colocation_graph.LimitToAssignedDevice(*node));
190 LogDeviceAssignment(node, log_device_placement_);
191 continue;
192 }
193
194 // Heuristic A: prefer to place "generators" with their only
195 // consumers.
196 //
197 // If this is a node with no inputs and one output, we save
198 // this for a second pass, so that the consumer's placement
199 // is chosen.
200 if (IsGeneratorNode(node)) {
201 second_pass.push_back(node);
202 continue;
203 }
204
205 const std::vector<Device*>* devices;
206 Status status = colocation_graph.GetDevicesForNode(node, &devices);
207 if (!status.ok()) {
208 return AttachDef(
209 errors::InvalidArgument("Cannot assign a device for operation ",
210 node->name(), ": ", status.error_message()),
211 *node);
212 }
213
214 // Returns the first device in sorted devices list so we will always
215 // choose the same device.
216 //
217 // TODO(vrv): Factor this assignment out into a pluggable
218 // algorithm, so that Placer is responsible for enforcing
219 // preconditions and we can experiment with other algorithms when
220 // given a choice of devices. Once we have a better idea of the
221 // types of heuristics we want to use and the information needed
222 // to perform good placement we can add an interface for this.
223 int assigned_device = -1;
224
225 // Heuristic B: If the node only operates on metadata, not data,
226 // then it is desirable to place that metadata node with its
227 // input.
228 if (IsMetadata(node)) {
229 // Make sure that the input device type is in the list of supported
230 // device types for this node.
231 const Node* input = (*node->in_edges().begin())->src();
232 // TODO(vrv): if the input is empty, consider postponing this
233 // node's assignment to the second pass, so that we handle the
234 // case where a metadata node's input comes from a backedge
235 // of a loop.
236 if (CanAssignToDevice(input->assigned_device_name(), *devices)) {
237 assigned_device = input->assigned_device_name_index();
238 }
239 }
240
241 // Provide the default, if necessary.
242 if (assigned_device == -1) {
243 assigned_device = graph_->InternDeviceName((*devices)[0]->name());
244 }
245
246 TF_RETURN_IF_ERROR(AssignAndLog(assigned_device, node, &colocation_graph,
247 log_device_placement_));
248 }
249
250 // Perform a second pass assignment for those nodes explicitly
251 // skipped during the first pass.
252 for (Node* node : second_pass) {
253 const std::vector<Device*>* devices;
254 Status status = colocation_graph.GetDevicesForNode(node, &devices);
255 if (!status.ok()) {
256 return AttachDef(
257 errors::InvalidArgument("Cannot assign a device for operation ",
258 node->name(), ": ", status.error_message()),
259 *node);
260 }
261
262 int assigned_device = -1;
263
264 // Heuristic A application.
265 if (IsGeneratorNode(node) && !node->out_edges().empty()) {
266 const Node* output = (*node->out_edges().begin())->dst();
267 int output_device_name = output->assigned_device_name_index();
268
269 const bool consumers_on_same_device = std::all_of(
270 node->out_edges().begin(), node->out_edges().end(),
271 [output_device_name](const Edge* e) {
272 return e->dst()->assigned_device_name_index() == output_device_name;
273 });
274
275 if (consumers_on_same_device &&
276 CanAssignToDevice(output->assigned_device_name(), *devices)) {
277 assigned_device = output_device_name;
278 }
279 }
280
281 // Provide the default, if necessary.
282 if (assigned_device == -1) {
283 assigned_device = graph_->InternDeviceName((*devices)[0]->name());
284 }
285
286 TF_RETURN_IF_ERROR(AssignAndLog(assigned_device, node, &colocation_graph,
287 log_device_placement_));
288 }
289
290 if (VLOG_IS_ON(3)) {
291 DumpGraphToFile("placer_output", *graph_, nullptr);
292 DumpColocationGraph("colocation_graph", colocation_graph);
293 }
294 return Status::OK();
295 }
296
CanAssignToDevice(const string & candidate_device_name,const std::vector<Device * > & devices) const297 bool Placer::CanAssignToDevice(const string& candidate_device_name,
298 const std::vector<Device*>& devices) const {
299 if (!candidate_device_name.empty()) {
300 // 'devices' lists the set of devices that the placer or the user has
301 // constrained the operation to. "candidate_device_name" must
302 // refer to a concrete Device that is in the list of 'devices'.
303 const Device* other_device =
304 devices_->FindDeviceByName(candidate_device_name);
305 if (std::find(devices.begin(), devices.end(), other_device) !=
306 devices.end()) {
307 return true;
308 }
309 }
310
311 return false;
312 }
313
314 } // namespace tensorflow
315