1 /* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
2 
3 Licensed under the Apache License, Version 2.0 (the "License");
4 you may not use this file except in compliance with the License.
5 You may obtain a copy of the License at
6 
7     http://www.apache.org/licenses/LICENSE-2.0
8 
9 Unless required by applicable law or agreed to in writing, software
10 distributed under the License is distributed on an "AS IS" BASIS,
11 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 See the License for the specific language governing permissions and
13 limitations under the License.
14 ==============================================================================*/
15 
16 #include "tensorflow/core/grappler/costs/utils.h"
17 
18 #include <stddef.h>
19 #include <utility>
20 
21 #include "third_party/eigen3/Eigen/Core"
22 
23 #include "tensorflow/core/common_runtime/gpu/gpu_id.h"
24 #include "tensorflow/core/common_runtime/gpu/gpu_id_manager.h"
25 #include "tensorflow/core/framework/allocation_description.pb.h"
26 #include "tensorflow/core/framework/attr_value.pb.h"
27 #include "tensorflow/core/framework/op.h"
28 #include "tensorflow/core/framework/op_def.pb.h"
29 #include "tensorflow/core/framework/step_stats.pb.h"
30 #include "tensorflow/core/framework/tensor.pb.h"
31 #include "tensorflow/core/framework/tensor_description.pb.h"
32 #include "tensorflow/core/framework/tensor_shape.pb.h"
33 #include "tensorflow/core/framework/types.pb.h"
34 #include "tensorflow/core/graph/graph.h"
35 #include "tensorflow/core/graph/tensor_id.h"
36 #include "tensorflow/core/grappler/clusters/utils.h"
37 #include "tensorflow/core/grappler/utils.h"
38 #include "tensorflow/core/lib/core/bits.h"
39 #include "tensorflow/core/lib/strings/numbers.h"
40 #include "tensorflow/core/lib/strings/strcat.h"
41 #include "tensorflow/core/platform/byte_order.h"
42 #include "tensorflow/core/platform/env.h"
43 #include "tensorflow/core/platform/logging.h"
44 #include "tensorflow/core/platform/protobuf.h"
45 #include "tensorflow/core/protobuf/config.pb.h"
46 #include "tensorflow/core/util/device_name_utils.h"
47 
48 namespace tensorflow {
49 namespace grappler {
50 
UnknownInput()51 static OpInfo::TensorProperties UnknownInput() {
52   OpInfo::TensorProperties input;
53   input.set_dtype(DataType::DT_INVALID);
54   input.mutable_shape()->set_unknown_rank(true);
55   return input;
56 }
57 
ExtractTensors(const AttrValue & attr_value)58 static std::vector<TensorProto> ExtractTensors(const AttrValue& attr_value) {
59   std::vector<TensorProto> tensors;
60   switch (attr_value.value_case()) {
61     case AttrValue::kTensor: {
62       tensors.push_back(attr_value.tensor());
63       break;
64     }
65     case AttrValue::kList: {
66       for (const auto& tensor_proto : attr_value.list().tensor()) {
67         tensors.push_back(tensor_proto);
68       }
69       break;
70     }
71     default: {
72     }
73   }
74   return tensors;
75 }
76 
77 // Annotate the op_info inputs with extra information when possible (e.g. the
78 // input value if it's known statically).
ExtractExtraProperties(const NodeDef & node,const std::unordered_map<string,const NodeDef * > & name_to_node,OpInfo * op_info)79 static void ExtractExtraProperties(
80     const NodeDef& node,
81     const std::unordered_map<string, const NodeDef*>& name_to_node,
82     OpInfo* op_info) {
83   OpRegistry* op_registry = OpRegistry::Global();
84   const OpDef* op_def = nullptr;
85   auto s = op_registry->LookUpOpDef(node.op(), &op_def);
86   if (!s.ok()) {
87     op_def = nullptr;
88   }
89 
90   for (int i = 0; i < node.input_size(); ++i) {
91     const string input_name = node.input(i);
92     CHECK(!input_name.empty());
93     if (IsControlInput(input_name)) {
94       continue;
95     }
96     TensorId input_tensor_id = ParseTensorName(input_name);
97     const string input_node_name(input_tensor_id.first);
98 
99     auto iter = name_to_node.find(input_node_name);
100     if (iter == name_to_node.end()) continue;
101     const NodeDef* input_node = iter->second;
102 
103     if (i >= op_info->inputs_size()) {
104       LOG(ERROR) << "OpInfo's inputs doesn't match the graph! OpInfo: "
105                  << op_info->DebugString()
106                  << "\nCurrent node: " << node.DebugString()
107                  << "\nInput node: " << input_node->DebugString();
108     }
109 
110     // The value attribute in Const input is useful for cost prediction.
111     if (input_node->op() == "Const" && i < op_info->inputs_size()) {
112       auto it = input_node->attr().find("value");
113       if (it == input_node->attr().end()) continue;
114 
115       const AttrValue& attr_value = it->second;
116       std::vector<TensorProto> tensors = ExtractTensors(attr_value);
117       if (tensors.empty()) continue;
118 
119       const TensorProto& t = tensors[0];
120       OpInfo::TensorProperties* input = op_info->mutable_inputs(i);
121       *(input->mutable_value()) = t;
122 
123       // For filename input, the file size can also be useful.
124       if (op_def && i < op_def->input_arg_size() &&
125           op_def->input_arg(i).name().find("filename") != string::npos) {
126         Tensor tensor;
127         if (!tensor.FromProto(t)) {
128           continue;
129         }
130         if (tensor.NumElements() != 1) {
131           continue;
132         }
133         const string filename = tensor.scalar<string>()();
134 
135         Env* env = Env::Default();
136         FileStatistics stat;
137         Status s = env->Stat(filename, &stat);
138         if (!s.ok()) {
139           continue;
140         }
141         AttrValue attr;
142         attr.set_i(stat.length);
143         string attr_key = strings::StrCat("input_", i, "_filesize");
144         (*op_info->mutable_attr())[attr_key] = attr;
145       }
146     }
147 
148     // When the input is a handle (e.g. look up table handle), the information
149     // in the op itself is not sufficient to predict the op memory.
150     if (op_def && i < op_def->input_arg_size() &&
151         op_def->input_arg(i).name().find("handle") != string::npos) {
152       string new_key = strings::StrCat("parent_", i, "_op");
153       AttrValue attr;
154       attr.set_s(input_node->op());
155       (*op_info->mutable_attr())[new_key] = attr;
156       // TODO(yuefengz): Only parent node's op name is copied. Copy inputs
157       // and attributes when necessary.
158     }
159   }
160 }
161 
FindInputFeatures(const NodeDef & node,const std::unordered_map<string,const CostGraphDef::Node * > & name_to_cost,const std::unordered_map<string,const NodeDef * > & name_to_node)162 std::vector<OpInfo::TensorProperties> FindInputFeatures(
163     const NodeDef& node,
164     const std::unordered_map<string, const CostGraphDef::Node*>& name_to_cost,
165     const std::unordered_map<string, const NodeDef*>& name_to_node) {
166   std::vector<OpInfo::TensorProperties> inputs;
167   for (const auto& input_name : node.input()) {
168     CHECK(!input_name.empty());
169     TensorId input_tensor_id = ParseTensorName(input_name);
170     const string input_node_name(input_tensor_id.first);
171     const int output_index = input_tensor_id.second;
172 
173     // Skip control inputs.
174     if (output_index == Graph::kControlSlot) {
175       continue;
176     }
177 
178     auto it = name_to_cost.find(input_node_name);
179     if (it == name_to_cost.end() || output_index < 0) {
180       inputs.push_back(UnknownInput());
181     } else {
182       const CostGraphDef::Node* input_cost = it->second;
183       if (input_cost->output_info_size() == 0) {
184         inputs.push_back(UnknownInput());
185       } else {
186         const CostGraphDef::Node::OutputInfo& output =
187             input_cost->output_info(output_index);
188         OpInfo::TensorProperties input;
189         input.set_dtype(output.dtype());
190         *input.mutable_shape() = output.shape();
191         inputs.push_back(input);
192       }
193     }
194   }
195 
196   return inputs;
197 }
198 
CalculateTensorSize(const OpInfo::TensorProperties & prop)199 int64 CalculateTensorSize(const OpInfo::TensorProperties& prop) {
200   int64 size = DataTypeSize(BaseType(prop.dtype()));
201   TensorShapeProto shape = prop.shape();
202 
203   // Can't infer the size if the rank is unknown. It has to be at least a
204   // scalar though.
205   if (shape.unknown_rank()) {
206     VLOG(2) << "CalculateTensorSize() -- unknown rank";
207     return size;
208   }
209 
210   // If one of the dimensions is unknown statically, assume it's at least one.
211   for (int i = 0; i < shape.dim_size(); ++i) {
212     if (shape.dim(i).size() < 0) {
213       shape.mutable_dim(i)->set_size(1);
214       VLOG(2) << "CalculateTensorSize() -- unknown dim: " << i;
215     }
216   }
217 
218   int64 num_elems = TensorShape(shape).num_elements();
219   return num_elems * size;
220 }
221 
CalculateOutputSize(const std::vector<OpInfo::TensorProperties> & output_properties,const int port_num)222 int64 CalculateOutputSize(
223     const std::vector<OpInfo::TensorProperties>& output_properties,
224     const int port_num) {
225   if (port_num < 0) return 4;  // 4B for control dependency.
226 
227   if (port_num >= output_properties.size()) {
228     LOG(ERROR) << "CalculateOutputSize() -- port_num: " << port_num
229                << " >= output_properties.size(): " << output_properties.size();
230     return 0;
231   }
232 
233   return CalculateTensorSize(output_properties[port_num]);
234 }
235 
GetDeviceInfo(const string & device_str)236 DeviceProperties GetDeviceInfo(const string& device_str) {
237   DeviceProperties unknown;
238   unknown.set_type("UNKNOWN");
239 
240   DeviceNameUtils::ParsedName parsed;
241   if (DeviceNameUtils::ParseFullName(device_str, &parsed)) {
242     if (parsed.type == "GPU") {
243       TfGpuId tf_gpu_id(parsed.id);
244       PlatformGpuId platform_gpu_id;
245       Status s = GpuIdManager::TfToPlatformGpuId(tf_gpu_id, &platform_gpu_id);
246       if (!s.ok()) {
247         // We are probably running simulation without linking cuda libraries.
248         platform_gpu_id = PlatformGpuId(parsed.id);
249       }
250       return GetLocalGPUInfo(platform_gpu_id);
251     } else if (parsed.type == "CPU") {
252       return GetLocalCPUInfo();
253     }
254   }
255   return unknown;
256 }
257 
GetDeviceInfo(const CostGraphDef::Node & node)258 DeviceProperties GetDeviceInfo(const CostGraphDef::Node& node) {
259   return GetDeviceInfo(node.device());
260 }
261 
BuildOpInfoWithoutDevice(const NodeDef & node,const std::unordered_map<string,const NodeDef * > & name_to_node,const std::vector<OpInfo::TensorProperties> & inputs)262 OpInfo BuildOpInfoWithoutDevice(
263     const NodeDef& node,
264     const std::unordered_map<string, const NodeDef*>& name_to_node,
265     const std::vector<OpInfo::TensorProperties>& inputs) {
266   OpInfo op_info;
267   op_info.set_op(node.op());
268   *op_info.mutable_attr() = node.attr();
269   for (auto& input : inputs) {
270     *op_info.add_inputs() = input;
271   }
272   ExtractExtraProperties(node, name_to_node, &op_info);
273   return op_info;
274 }
275 
GetOpDescription(const OpInfo & op_info)276 string GetOpDescription(const OpInfo& op_info) {
277   string description = "[";
278   description += "Op=" + op_info.op() + ", ";
279   description += "input_shapes=[";
280   for (auto const& input : op_info.inputs()) {
281     description += PartialTensorShape::DebugString(input.shape());
282   }
283   description += "]";
284   return description;
285 }
286 
CostGraphToOpPerformanceData(const CostGraphDef & cost_graph,const GraphDef & graph)287 OpPerformanceList CostGraphToOpPerformanceData(const CostGraphDef& cost_graph,
288                                                const GraphDef& graph) {
289   OpPerformanceList ret;
290   std::unordered_map<string, const CostGraphDef::Node*> name_to_cost;
291   std::unordered_map<string, const NodeDef*> name_to_node;
292   for (auto& node : cost_graph.node()) {
293     name_to_cost[node.name()] = &node;
294   }
295   for (auto& node : graph.node()) {
296     name_to_node[node.name()] = &node;
297   }
298 
299   for (const auto& node : graph.node()) {
300     // Skip the nodes that are not in the cost graph: these are nodes that
301     // aren't run, because they aren't in the intersection of transitive
302     // fan-in of a fetch node and the transitive fan-out of an input, or nodes
303     // that were optimized away by the optimizer. Since they don't contribute
304     // to the execution time we simply discard them.
305     auto it = name_to_cost.find(node.name());
306     if (it == name_to_cost.end()) {
307       continue;
308     }
309     const CostGraphDef::Node* cost_node = it->second;
310 
311     OpPerformance* perf = ret.add_op_performance();
312     perf->set_node(node.name());
313 
314     std::vector<OpInfo::TensorProperties> inputs =
315         FindInputFeatures(node, name_to_cost, name_to_node);
316     *perf->mutable_op() = BuildOpInfoWithoutDevice(node, name_to_node, inputs);
317     *perf->mutable_op()->mutable_device() = GetDeviceInfo(cost_node->device());
318 
319     perf->set_temporary_memory_size(cost_node->temporary_memory_size());
320     // Note that CostGraphDef::Node::compute_cost is microseconds, while
321     // OpPerformance.compute_cost is nanoseconds.
322     perf->set_compute_cost(cost_node->compute_cost() * 1000);
323     perf->set_compute_time(cost_node->compute_time() * 1000);
324     perf->set_memory_time(cost_node->memory_time() * 1000);
325 
326     for (const auto& output_info : cost_node->output_info()) {
327       perf->mutable_op_memory()->add_output_memory(output_info.size());
328     }
329 
330     perf->mutable_op_memory()->set_temp_memory(
331         cost_node->temporary_memory_size());
332     perf->mutable_op_memory()->set_persistent_memory(
333         cost_node->persistent_memory_size());
334   }
335   return ret;
336 }
337 
Add(const uint64 value)338 void TensorSizeHistogram::Add(const uint64 value) {
339   num_elem_++;
340   sum_elem_ += value;
341   min_ = std::min(min_, value);
342   max_ = std::max(max_, value);
343   buckets_[Index(value)]++;
344 }
345 
Merge(const TensorSizeHistogram & src)346 void TensorSizeHistogram::Merge(const TensorSizeHistogram& src) {
347   num_elem_ += src.num_elem_;
348   sum_elem_ += src.sum_elem_;
349   min_ = std::min(min_, src.min_);
350   max_ = std::max(max_, src.max_);
351   std::transform(buckets_.begin(), buckets_.end(), src.buckets_.begin(),
352                  buckets_.begin(), std::plus<uint64>());
353 }
354 
ToString() const355 string TensorSizeHistogram::ToString() const {
356   string r;
357   char buf[200];
358   snprintf(buf, sizeof(buf), "Count: %lld, Average: ", num_elem_);
359   r.append(buf);
360   r.append(strings::HumanReadableNumBytes(Average()));
361   r.append(", Min: ");
362   r.append(strings::HumanReadableNumBytes(min_));
363   r.append(", Max: ");
364   r.append(strings::HumanReadableNumBytes(max_));
365   r.append("\n------------------------------------------------------\n");
366   const double mult = num_elem_ > 0 ? 100.0 / num_elem_ : 0.0;
367   uint64 cumul_sum = 0;
368 
369   const int size_string_width = 12;
370   for (int i = 0; i < buckets_.size(); i++) {
371     if (buckets_[i] == 0) continue;
372     cumul_sum += buckets_[i];
373     r.append("[ ");
374     if (i == 0) {
375       r.append(size_string_width - 2, ' ');
376       r.append("0B");
377     } else {
378       uint64 left = 1ULL << (i - 1);
379       const auto left_string = strings::HumanReadableNumBytes(left);
380       r.append(size_string_width - left_string.size(), ' ');
381       r.append(left_string);
382     }
383     r.append(", ");
384     uint64 right = 1ULL << i;
385     const auto right_string = strings::HumanReadableNumBytes(right);
386     r.append(size_string_width - right_string.size(), ' ');
387     r.append(right_string);
388     snprintf(buf, sizeof(buf), ") %7lld %7.3f%% %7.3f%% ",
389              buckets_[i],         // count
390              mult * buckets_[i],  // percentage
391              mult * cumul_sum);   // cum percentage
392     r.append(buf);
393 
394     // Add hash marks based on percentage; 40 marks for 100%.
395     auto marks = static_cast<int>(
396         (static_cast<double>(40 * buckets_[i] + (num_elem_ >> 1)) / num_elem_));
397     r.append(marks, '#');
398     r.push_back('\n');
399   }
400   return r;
401 }
402 
Index(const uint64 value) const403 const int TensorSizeHistogram::Index(const uint64 value) const {
404   // Log2Floor64 returns -1 for 0, 0 for 1, 1 for 2-3, 2 for 4-7, ...
405   const auto index = Log2Floor64(value) + 1;
406   return std::min(index, kMaxBuckets - 1);
407 }
408 
GetDeviceClassForNonChannelDevice(const string & device_name)409 string GetDeviceClassForNonChannelDevice(const string& device_name) {
410   DeviceNameUtils::ParsedName parsed_name;
411   bool parsed = DeviceNameUtils::ParseFullName(device_name, &parsed_name);
412   if (!parsed) {
413     string name = str_util::StringReplace(device_name, "/job_", "/job:", true);
414     name = str_util::StringReplace(name, "/replica_", "/replica:", true);
415     name = str_util::StringReplace(name, "/task_", "/task:", true);
416     name = str_util::StringReplace(name, "/device_", "/device:", true);
417     name = str_util::StringReplace(name, "GPU_", "GPU:", true);
418     name = str_util::StringReplace(name, "CPU_", "CPU:", true);
419     name = str_util::StringReplace(name, "gpu_", "gpu:", true);
420     name = str_util::StringReplace(name, "cpu_", "cpu:", true);
421     parsed = DeviceNameUtils::ParseFullName(name, &parsed_name);
422   }
423   if (parsed) {
424     const string jobname = parsed_name.has_job ? parsed_name.job : "";
425     return strings::StrCat("/", jobname, "/", parsed_name.type);
426   } else {
427     return "Unclassified";
428   }
429 }
430 
GetDeviceClass(const string & device_name)431 string GetDeviceClass(const string& device_name) {
432   // TODO(dyoon): channel device name follows the convention we currently have
433   // in VirtualScheduler. This should be revised with VirtualScheduler as well
434   // as VirtualPlacer in the future.
435   if (device_name.find("Channel") != string::npos) {
436     const string from = "_from_";
437     const string to = "_to_";
438     const auto from_loc = device_name.find(from);
439     const auto to_loc = device_name.find(to);
440     const auto src_device_full = device_name.substr(
441         from_loc + from.size(), to_loc - (from_loc + from.size()));
442     const auto dst_device_full = device_name.substr(to_loc + to.size());
443     return strings::StrCat(
444         "Channel", ": ", GetDeviceClassForNonChannelDevice(src_device_full),
445         " -> ", GetDeviceClassForNonChannelDevice(dst_device_full));
446   } else {
447     return GetDeviceClassForNonChannelDevice(device_name);
448   }
449 }
450 
GetStatsStringFromRunMetadata(const RunMetadata & run_metadata,bool verbosity)451 string GetStatsStringFromRunMetadata(const RunMetadata& run_metadata,
452                                      bool verbosity) {
453   // TODO(dyoon): print out other stats as needed.
454   std::ostringstream output;
455 
456   // Tensor size histogram:
457   // if verbosity, it outputs per-device histogram,
458   // otherwise, only per-class histogram.
459   std::unordered_map<string, TensorSizeHistogram> device_to_hist_map;
460   const auto& step_stats = run_metadata.step_stats();
461   for (const auto& dev_stat : step_stats.dev_stats()) {
462     const auto& device_name = dev_stat.device();
463     auto& hist = device_to_hist_map[device_name];
464     for (const auto& node_stat : dev_stat.node_stats()) {
465       for (const auto& node_output : node_stat.output()) {
466         // TODO(dyoon): Calculate tensor size from tensor_description's dtype
467         // and shape, instead of using optional allocation_description.
468         const auto size = node_output.tensor_description()
469                               .allocation_description()
470                               .allocated_bytes();
471         hist.Add(size);
472       }
473     }
474   }
475   if (verbosity) {
476     output << "\n";
477     output << "Per device tensor size histogram.\n";
478   }
479 
480   std::unordered_map<string, TensorSizeHistogram> device_class_to_hist_map;
481   for (const auto& device_hist : device_to_hist_map) {
482     const auto& device_name = device_hist.first;
483     const auto& hist = device_hist.second;
484     if (verbosity) {
485       output << "Device: " << device_name << "\n" << hist.ToString() << "\n";
486     }
487     const auto device_class = GetDeviceClass(device_name);
488     auto it = device_class_to_hist_map.find(device_class);
489     if (it == device_class_to_hist_map.end()) {
490       device_class_to_hist_map.emplace(device_class, TensorSizeHistogram(hist));
491     } else {
492       it->second.Merge(hist);
493     }
494   }
495   output << "\n";
496   output << "Aggregated per device / channel type tensor size histogram:\n";
497   for (const auto& device_hist : device_class_to_hist_map) {
498     const auto& device_name = device_hist.first;
499     const auto& hist = device_hist.second;
500     output << "Device: " << device_name << "\n" << hist.ToString() << "\n";
501   }
502   output << "\n";
503 
504   return output.str();
505 }
506 
507 }  // end namespace grappler
508 }  // end namespace tensorflow
509