1 /* Copyright 2015 The TensorFlow Authors. All Rights Reserved.
2 
3 Licensed under the Apache License, Version 2.0 (the "License");
4 you may not use this file except in compliance with the License.
5 You may obtain a copy of the License at
6 
7     http://www.apache.org/licenses/LICENSE-2.0
8 
9 Unless required by applicable law or agreed to in writing, software
10 distributed under the License is distributed on an "AS IS" BASIS,
11 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 See the License for the specific language governing permissions and
13 limitations under the License.
14 ==============================================================================*/
15 
16 #include "tensorflow/core/common_runtime/placer.h"
17 
18 #include <memory>
19 #include <vector>
20 
21 #include "tensorflow/core/common_runtime/colocation_graph.h"
22 #include "tensorflow/core/common_runtime/device.h"
23 #include "tensorflow/core/framework/attr_value_util.h"
24 #include "tensorflow/core/framework/device_attributes.pb.h"
25 #include "tensorflow/core/framework/function.h"
26 #include "tensorflow/core/framework/graph.pb.h"
27 #include "tensorflow/core/framework/types.h"
28 #include "tensorflow/core/framework/types.pb.h"
29 #include "tensorflow/core/graph/graph_node_util.h"
30 #include "tensorflow/core/lib/core/errors.h"
31 #include "tensorflow/core/util/dump_graph.h"
32 #include "tensorflow/core/util/port.h"
33 
34 namespace tensorflow {
35 
36 namespace {
37 
38 struct NameCounts {
39   mutex counts_mutex;
40   std::unordered_map<string, int> counts;
41 };
42 
MakeUniqueFilename(string name)43 string MakeUniqueFilename(string name) {
44   static NameCounts& instance = *new NameCounts;
45 
46   // Remove illegal characters from `name`.
47   for (int i = 0; i < name.size(); ++i) {
48     char ch = name[i];
49     if (ch == '/' || ch == '[' || ch == ']' || ch == '*' || ch == '?') {
50       name[i] = '_';
51     }
52   }
53 
54   int count;
55   {
56     mutex_lock lock(instance.counts_mutex);
57     count = instance.counts[name]++;
58   }
59 
60   string filename = name;
61   if (count > 0) {
62     absl::StrAppend(&filename, "_", count);
63   }
64   absl::StrAppend(&filename, ".txt");
65   return filename;
66 }
67 
GetFileName(string base_name,string * fname)68 Status GetFileName(string base_name, string* fname) {
69   const char* dir = nullptr;
70   dir = getenv("TF_DUMP_GRAPH_PREFIX");
71   if (!dir) {
72     return errors::Internal("Failed to get the directory for ", base_name,
73                             " because dump location is not specified through "
74                             "TF_DUMP_GRAPH_PREFIX environment variable");
75   }
76   base_name = MakeUniqueFilename(base_name);
77   *fname = absl::StrCat(dir, "/", base_name);
78   return Status::OK();
79 }
80 
DumpColocationGraph(const string & base_name,const ColocationGraph & colocation_graph)81 void DumpColocationGraph(const string& base_name,
82                          const ColocationGraph& colocation_graph) {
83   string fname;
84   Status status = GetFileName(base_name, &fname);
85   if (status.ok()) {
86     status = WriteStringToFile(Env::Default(), fname,
87                                colocation_graph.DebugString());
88     if (status.ok()) {
89       LOG(INFO) << "Wrote ColocationGraph to " << fname;
90     }
91   }
92   if (!status.ok()) {
93     LOG(ERROR) << "Failed to write final colocation graph to file " << fname
94                << " with " << status.ToString();
95   }
96 }
97 
98 // Returns true if the node has no inputs and produces outputs
99 // that are consumed by a single node.
100 //
101 // TODO(vrv): Currently this handles only nodes with one output, but
102 // this could be extended to handle the case where a node has many
103 // outputs that are connected to nodes in the same colocation group.
IsGeneratorNode(const Node * node)104 bool IsGeneratorNode(const Node* node) {
105   return node->num_inputs() == 0 && node->num_outputs() == 1 &&
106          !IsRefType(node->output_type(0));
107 }
108 
LogDeviceAssignment(const Node * node,bool log_device_placement)109 void LogDeviceAssignment(const Node* node, bool log_device_placement) {
110   // Log placement if log_device_placement is set.
111   if (log_device_placement) {
112     printf("%s: (%s): %s\n", node->name().c_str(), node->type_string().c_str(),
113            node->assigned_device_name().c_str());
114     LOG(INFO) << node->name() << ": "
115               << "(" << node->type_string()
116               << "): " << node->assigned_device_name();
117   }
118 }
119 
AssignAndLog(int assigned_device,Node * node,ColocationGraph * colocation_graph,bool log_device_placement)120 Status AssignAndLog(int assigned_device, Node* node,
121                     ColocationGraph* colocation_graph,
122                     bool log_device_placement) {
123   node->set_assigned_device_name_index(assigned_device);
124 
125   // Constraint the group of node to the assigned device.
126   TF_RETURN_IF_ERROR(colocation_graph->LimitToAssignedDevice(*node));
127 
128   LogDeviceAssignment(node, log_device_placement);
129   return Status::OK();
130 }
131 
132 }  // namespace
133 
Placer(Graph * graph,const string & function_name,const FunctionLibraryDefinition * flib_def,const DeviceSet * devices,const Device * default_local_device,bool allow_soft_placement,bool log_device_placement)134 Placer::Placer(Graph* graph, const string& function_name,
135                const FunctionLibraryDefinition* flib_def,
136                const DeviceSet* devices, const Device* default_local_device,
137                bool allow_soft_placement, bool log_device_placement)
138     : graph_(graph),
139       function_name_(function_name),
140       flib_def_(flib_def),
141       devices_(devices),
142       default_local_device_(default_local_device),
143       allow_soft_placement_(allow_soft_placement),
144       log_device_placement_(log_device_placement) {}
145 
Placer(Graph * graph,const string & function_name,const DeviceSet * devices,const Device * default_local_device)146 Placer::Placer(Graph* graph, const string& function_name,
147                const DeviceSet* devices, const Device* default_local_device)
148     : Placer(graph, function_name, &graph->flib_def(), devices,
149              default_local_device, true, false) {}
150 
Placer(Graph * graph,const string & function_name,const DeviceSet * devices)151 Placer::Placer(Graph* graph, const string& function_name,
152                const DeviceSet* devices)
153     : Placer(graph, function_name, &graph->flib_def(), devices, nullptr, true,
154              false) {}
155 
~Placer()156 Placer::~Placer() {}
157 
Run()158 Status Placer::Run() {
159   if (devices_->devices().empty()) {
160     return errors::FailedPrecondition("No devices are registered");
161   }
162 
163   if (VLOG_IS_ON(3)) {
164     DumpGraphToFile("placer_input", *graph_, nullptr);
165   }
166   if (VLOG_IS_ON(5)) {
167     for (const Node* node : graph_->op_nodes()) {
168       VLOG(5) << "    " << node->name() << ": requested: '"
169               << node->requested_device() << "' assigned: '"
170               << node->assigned_device_name() << "'";
171     }
172   }
173 
174   FunctionStack stack(function_name_);
175   ColocationGraph colocation_graph(graph_, stack, flib_def_, devices_,
176                                    default_local_device_, allow_soft_placement_,
177                                    log_device_placement_);
178 
179   TF_RETURN_IF_ERROR(colocation_graph.Initialize());
180 
181   // For each node, assign a device based on the constraints in the disjoint
182   // node set.
183   std::vector<Node*> second_pass;
184   for (Node* node : graph_->op_nodes()) {
185     // The graph may have come pre-populated by the framework with assigned
186     // devices (e.g., for stateful placements), so the placer should not try to
187     // place nodes that are already placed.
188     if (node->has_assigned_device_name()) {
189       TF_RETURN_IF_ERROR(colocation_graph.LimitToAssignedDevice(*node));
190       LogDeviceAssignment(node, log_device_placement_);
191       continue;
192     }
193 
194     // Heuristic A: prefer to place "generators" with their only
195     // consumers.
196     //
197     // If this is a node with no inputs and one output, we save
198     // this for a second pass, so that the consumer's placement
199     // is chosen.
200     if (IsGeneratorNode(node)) {
201       second_pass.push_back(node);
202       continue;
203     }
204 
205     const std::vector<Device*>* devices;
206     Status status = colocation_graph.GetDevicesForNode(node, &devices);
207     if (!status.ok()) {
208       return AttachDef(
209           errors::InvalidArgument("Cannot assign a device for operation ",
210                                   node->name(), ": ", status.error_message()),
211           *node);
212     }
213 
214     // Returns the first device in sorted devices list so we will always
215     // choose the same device.
216     //
217     // TODO(vrv): Factor this assignment out into a pluggable
218     // algorithm, so that Placer is responsible for enforcing
219     // preconditions and we can experiment with other algorithms when
220     // given a choice of devices. Once we have a better idea of the
221     // types of heuristics we want to use and the information needed
222     // to perform good placement we can add an interface for this.
223     int assigned_device = -1;
224 
225     // Heuristic B: If the node only operates on metadata, not data,
226     // then it is desirable to place that metadata node with its
227     // input.
228     if (IsMetadata(node)) {
229       // Make sure that the input device type is in the list of supported
230       // device types for this node.
231       const Node* input = (*node->in_edges().begin())->src();
232       // TODO(vrv): if the input is empty, consider postponing this
233       // node's assignment to the second pass, so that we handle the
234       // case where a metadata node's input comes from a backedge
235       // of a loop.
236       if (CanAssignToDevice(input->assigned_device_name(), *devices)) {
237         assigned_device = input->assigned_device_name_index();
238       }
239     }
240 
241     // Provide the default, if necessary.
242     if (assigned_device == -1) {
243       assigned_device = graph_->InternDeviceName((*devices)[0]->name());
244     }
245 
246     TF_RETURN_IF_ERROR(AssignAndLog(assigned_device, node, &colocation_graph,
247                                     log_device_placement_));
248   }
249 
250   // Perform a second pass assignment for those nodes explicitly
251   // skipped during the first pass.
252   for (Node* node : second_pass) {
253     const std::vector<Device*>* devices;
254     Status status = colocation_graph.GetDevicesForNode(node, &devices);
255     if (!status.ok()) {
256       return AttachDef(
257           errors::InvalidArgument("Cannot assign a device for operation ",
258                                   node->name(), ": ", status.error_message()),
259           *node);
260     }
261 
262     int assigned_device = -1;
263 
264     // Heuristic A application.
265     if (IsGeneratorNode(node) && !node->out_edges().empty()) {
266       const Node* output = (*node->out_edges().begin())->dst();
267       int output_device_name = output->assigned_device_name_index();
268 
269       const bool consumers_on_same_device = std::all_of(
270           node->out_edges().begin(), node->out_edges().end(),
271           [output_device_name](const Edge* e) {
272             return e->dst()->assigned_device_name_index() == output_device_name;
273           });
274 
275       if (consumers_on_same_device &&
276           CanAssignToDevice(output->assigned_device_name(), *devices)) {
277         assigned_device = output_device_name;
278       }
279     }
280 
281     // Provide the default, if necessary.
282     if (assigned_device == -1) {
283       assigned_device = graph_->InternDeviceName((*devices)[0]->name());
284     }
285 
286     TF_RETURN_IF_ERROR(AssignAndLog(assigned_device, node, &colocation_graph,
287                                     log_device_placement_));
288   }
289 
290   if (VLOG_IS_ON(3)) {
291     DumpGraphToFile("placer_output", *graph_, nullptr);
292     DumpColocationGraph("colocation_graph", colocation_graph);
293   }
294   return Status::OK();
295 }
296 
CanAssignToDevice(const string & candidate_device_name,const std::vector<Device * > & devices) const297 bool Placer::CanAssignToDevice(const string& candidate_device_name,
298                                const std::vector<Device*>& devices) const {
299   if (!candidate_device_name.empty()) {
300     // 'devices' lists the set of devices that the placer or the user has
301     // constrained the operation to.  "candidate_device_name" must
302     // refer to a concrete Device that is in the list of 'devices'.
303     const Device* other_device =
304         devices_->FindDeviceByName(candidate_device_name);
305     if (std::find(devices.begin(), devices.end(), other_device) !=
306         devices.end()) {
307       return true;
308     }
309   }
310 
311   return false;
312 }
313 
314 }  // namespace tensorflow
315