/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/

#include "tensorflow/core/grappler/costs/utils.h"

#include <stddef.h>

#include <algorithm>
#include <functional>
#include <sstream>
#include <string>
#include <unordered_map>
#include <utility>
#include <vector>

22 #include "absl/strings/str_cat.h"
23 #include "absl/strings/str_format.h"
24 #include "third_party/eigen3/Eigen/Core"
25 #include "tensorflow/core/common_runtime/gpu/gpu_id.h"
26 #include "tensorflow/core/common_runtime/gpu/gpu_id_manager.h"
27 #include "tensorflow/core/framework/allocation_description.pb.h"
28 #include "tensorflow/core/framework/attr_value.pb.h"
29 #include "tensorflow/core/framework/op.h"
30 #include "tensorflow/core/framework/op_def.pb.h"
31 #include "tensorflow/core/framework/step_stats.pb.h"
32 #include "tensorflow/core/framework/tensor.pb.h"
33 #include "tensorflow/core/framework/tensor_description.pb.h"
34 #include "tensorflow/core/framework/tensor_shape.pb.h"
35 #include "tensorflow/core/framework/types.pb.h"
36 #include "tensorflow/core/graph/graph.h"
37 #include "tensorflow/core/graph/tensor_id.h"
38 #include "tensorflow/core/grappler/clusters/utils.h"
39 #include "tensorflow/core/grappler/utils.h"
40 #include "tensorflow/core/lib/core/bits.h"
41 #include "tensorflow/core/lib/strings/numbers.h"
42 #include "tensorflow/core/platform/byte_order.h"
43 #include "tensorflow/core/platform/env.h"
44 #include "tensorflow/core/platform/logging.h"
45 #include "tensorflow/core/platform/protobuf.h"
46 #include "tensorflow/core/protobuf/config.pb.h"
47 #include "tensorflow/core/util/device_name_utils.h"
48 
49 namespace tensorflow {
50 namespace grappler {
51 
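// Returns a placeholder TensorProperties with DT_INVALID dtype and unknown
// rank, used when nothing is known about an input.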
static OpInfo::TensorProperties UnknownInput() {
  OpInfo::TensorProperties input;
  input.set_dtype(DataType::DT_INVALID);
  input.mutable_shape()->set_unknown_rank(true);
  return input;
}

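// Collects the TensorProtos held by an AttrValue: the single tensor for the
// kTensor case, all listed tensors for the kList case, and nothing otherwise.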
static std::vector<TensorProto> ExtractTensors(const AttrValue& attr_value) {
  std::vector<TensorProto> tensors;
  switch (attr_value.value_case()) {
    case AttrValue::kTensor: {
      tensors.push_back(attr_value.tensor());
      break;
    }
    case AttrValue::kList: {
      for (const auto& tensor_proto : attr_value.list().tensor()) {
        tensors.push_back(tensor_proto);
      }
      break;
    }
    default: {
    }
  }
  return tensors;
}

// Annotates the op_info inputs with extra information when possible (e.g. the
// input value if it's known statically).
static void ExtractExtraProperties(
    const NodeDef& node,
    const std::unordered_map<string, const NodeDef*>& name_to_node,
    OpInfo* op_info) {
  OpRegistry* op_registry = OpRegistry::Global();
  const OpDef* op_def = nullptr;
  auto s = op_registry->LookUpOpDef(node.op(), &op_def);
  if (!s.ok()) {
    op_def = nullptr;
  }

  for (int i = 0; i < node.input_size(); ++i) {
    const string input_name = node.input(i);
    CHECK(!input_name.empty());
    if (IsControlInput(input_name)) {
      continue;
    }
    TensorId input_tensor_id = ParseTensorName(input_name);
    const string input_node_name(input_tensor_id.first);

    auto iter = name_to_node.find(input_node_name);
    if (iter == name_to_node.end()) continue;
    const NodeDef* input_node = iter->second;

    if (i >= op_info->inputs_size()) {
      LOG(ERROR) << "OpInfo's inputs don't match the graph! OpInfo: "
                 << op_info->DebugString()
                 << "\nCurrent node: " << node.DebugString()
                 << "\nInput node: " << input_node->DebugString();
    }

    // The value attribute of a Const input is useful for cost prediction.
    if (input_node->op() == "Const" && i < op_info->inputs_size()) {
      auto it = input_node->attr().find("value");
      if (it == input_node->attr().end()) continue;

      const AttrValue& attr_value = it->second;
      std::vector<TensorProto> tensors = ExtractTensors(attr_value);
      if (tensors.empty()) continue;

      const TensorProto& t = tensors[0];
      OpInfo::TensorProperties* input = op_info->mutable_inputs(i);
      *(input->mutable_value()) = t;

      // For a filename input, the file size can also be useful.
      if (op_def && i < op_def->input_arg_size() &&
          op_def->input_arg(i).name().find("filename") != string::npos) {
        Tensor tensor;
        if (!tensor.FromProto(t)) {
          continue;
        }
        if (tensor.NumElements() != 1) {
          continue;
        }
        const string& filename = tensor.scalar<tstring>()();

        Env* env = Env::Default();
        FileStatistics stat;
        Status s = env->Stat(filename, &stat);
        if (!s.ok()) {
          continue;
        }
        AttrValue attr;
        attr.set_i(stat.length);
        string attr_key = absl::StrCat("input_", i, "_filesize");
        (*op_info->mutable_attr())[attr_key] = attr;
      }
    }

    // When the input is a handle (e.g. a lookup table handle), the information
    // in the op itself is not sufficient to predict the op's memory usage.
    if (op_def && i < op_def->input_arg_size() &&
        op_def->input_arg(i).name().find("handle") != string::npos) {
      string new_key = absl::StrCat("parent_", i, "_op");
      AttrValue attr;
      attr.set_s(input_node->op());
      (*op_info->mutable_attr())[new_key] = attr;
      // TODO(yuefengz): Only the parent node's op name is copied. Copy inputs
      // and attributes when necessary.
    }
  }
}

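// Builds the list of input TensorProperties for `node` from the shapes and
// dtypes recorded in the cost graph; inputs whose properties cannot be found
// are represented by UnknownInput().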
std::vector<OpInfo::TensorProperties> FindInputFeatures(
    const NodeDef& node,
    const std::unordered_map<string, const CostGraphDef::Node*>& name_to_cost,
    const std::unordered_map<string, const NodeDef*>& name_to_node) {
  std::vector<OpInfo::TensorProperties> inputs;
  for (const auto& input_name : node.input()) {
    CHECK(!input_name.empty());
    TensorId input_tensor_id = ParseTensorName(input_name);
    const string input_node_name(input_tensor_id.first);
    const int output_index = input_tensor_id.second;

    // Skip control inputs.
    if (output_index == Graph::kControlSlot) {
      continue;
    }

    auto it = name_to_cost.find(input_node_name);
    if (it == name_to_cost.end() || output_index < 0) {
      inputs.push_back(UnknownInput());
    } else {
      const CostGraphDef::Node* input_cost = it->second;
      if (input_cost->output_info_size() == 0) {
        inputs.push_back(UnknownInput());
      } else {
        const CostGraphDef::Node::OutputInfo& output =
            input_cost->output_info(output_index);
        OpInfo::TensorProperties input;
        input.set_dtype(output.dtype());
        *input.mutable_shape() = output.shape();
        inputs.push_back(input);
      }
    }
  }

  return inputs;
}

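// Returns a lower-bound estimate of the tensor's size in bytes: unknown
// dimensions are assumed to be 1 and an unknown rank is treated as a scalar.
// For example, a DT_FLOAT tensor of shape [32, ?] is estimated as
// 32 * 1 * sizeof(float) = 128 bytes.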
int64 CalculateTensorSize(const OpInfo::TensorProperties& prop) {
  int64 size = DataTypeSize(BaseType(prop.dtype()));
  TensorShapeProto shape = prop.shape();

  // Can't infer the size if the rank is unknown. It has to be at least a
  // scalar though.
  if (shape.unknown_rank()) {
    VLOG(2) << "CalculateTensorSize() -- unknown rank";
    return size;
  }

  // If one of the dimensions is unknown statically, assume it's at least one.
  for (int i = 0; i < shape.dim_size(); ++i) {
    if (shape.dim(i).size() < 0) {
      shape.mutable_dim(i)->set_size(1);
      VLOG(2) << "CalculateTensorSize() -- unknown dim: " << i;
    }
  }

  int64 num_elems = TensorShape(shape).num_elements();
  return num_elems * size;
}

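// Returns the size in bytes of the output at `port_num`, 4 bytes for a
// control dependency (port_num < 0), or 0 if the port is out of range.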
int64 CalculateOutputSize(
    const std::vector<OpInfo::TensorProperties>& output_properties,
    const int port_num) {
  if (port_num < 0) return 4;  // 4B for control dependency.

  if (port_num >= output_properties.size()) {
    LOG(ERROR) << "CalculateOutputSize() -- port_num: " << port_num
               << " >= output_properties.size(): " << output_properties.size();
    return 0;
  }

  return CalculateTensorSize(output_properties[port_num]);
}

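// Resolves a fully qualified device name to its DeviceProperties, falling
// back to a properties object of type "UNKNOWN" when the name cannot be
// parsed or names an unsupported device type.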
DeviceProperties GetDeviceInfo(const string& device_str) {
  DeviceProperties unknown;
  unknown.set_type("UNKNOWN");

  DeviceNameUtils::ParsedName parsed;
  if (DeviceNameUtils::ParseFullName(device_str, &parsed)) {
    if (parsed.type == "GPU") {
      TfGpuId tf_gpu_id(parsed.id);
      PlatformGpuId platform_gpu_id;
      Status s = GpuIdManager::TfToPlatformGpuId(tf_gpu_id, &platform_gpu_id);
      if (!s.ok()) {
        // We are probably running a simulation without linking the CUDA
        // libraries.
        platform_gpu_id = PlatformGpuId(parsed.id);
      }
      return GetLocalGPUInfo(platform_gpu_id);
    } else if (parsed.type == "CPU") {
      return GetLocalCPUInfo();
    }
  }
  return unknown;
}

DeviceProperties GetDeviceInfo(const CostGraphDef::Node& node) {
  return GetDeviceInfo(node.device());
}

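// Assembles an OpInfo (op name, attributes, and input properties) for `node`.
// Device information is intentionally left unset; callers fill it in.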
OpInfo BuildOpInfoWithoutDevice(
    const NodeDef& node,
    const std::unordered_map<string, const NodeDef*>& name_to_node,
    const std::vector<OpInfo::TensorProperties>& inputs) {
  OpInfo op_info;
  op_info.set_op(node.op());
  *op_info.mutable_attr() = node.attr();
  for (auto& input : inputs) {
    *op_info.add_inputs() = input;
  }
  ExtractExtraProperties(node, name_to_node, &op_info);
  return op_info;
}

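// Returns a short human-readable description of the op and its input shapes,
// e.g. "[Op=MatMul, input_shapes=[[?,32][32,64]]]".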
string GetOpDescription(const OpInfo& op_info) {
  string description = "[";
  description += "Op=" + op_info.op() + ", ";
  description += "input_shapes=[";
  for (auto const& input : op_info.inputs()) {
    description += PartialTensorShape::DebugString(input.shape());
  }
  description += "]]";  // Close both the input_shapes list and the outer "[".
  return description;
}

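// Converts a CostGraphDef and its originating GraphDef into an
// OpPerformanceList, with one OpPerformance entry per node that appears in
// both graphs.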
OpPerformanceList CostGraphToOpPerformanceData(const CostGraphDef& cost_graph,
                                               const GraphDef& graph) {
  OpPerformanceList ret;
  std::unordered_map<string, const CostGraphDef::Node*> name_to_cost;
  std::unordered_map<string, const NodeDef*> name_to_node;
  for (auto& node : cost_graph.node()) {
    name_to_cost[node.name()] = &node;
  }
  for (auto& node : graph.node()) {
    name_to_node[node.name()] = &node;
  }

  for (const auto& node : graph.node()) {
    // Skip the nodes that are not in the cost graph: these are nodes that
    // aren't run, because they aren't in the intersection of the transitive
    // fan-in of a fetch node and the transitive fan-out of an input, or nodes
    // that were optimized away by the optimizer. Since they don't contribute
    // to the execution time we simply discard them.
    auto it = name_to_cost.find(node.name());
    if (it == name_to_cost.end()) {
      continue;
    }
    const CostGraphDef::Node* cost_node = it->second;

    OpPerformance* perf = ret.add_op_performance();
    perf->set_node(node.name());

    std::vector<OpInfo::TensorProperties> inputs =
        FindInputFeatures(node, name_to_cost, name_to_node);
    *perf->mutable_op() = BuildOpInfoWithoutDevice(node, name_to_node, inputs);
    *perf->mutable_op()->mutable_device() = GetDeviceInfo(cost_node->device());

    perf->set_temporary_memory_size(cost_node->temporary_memory_size());
    // Note that CostGraphDef::Node::compute_cost is in microseconds, while
    // OpPerformance.compute_cost is in nanoseconds.
    perf->set_compute_cost(cost_node->compute_cost() * 1000);
    perf->set_compute_time(cost_node->compute_time() * 1000);
    perf->set_memory_time(cost_node->memory_time() * 1000);

    for (const auto& output_info : cost_node->output_info()) {
      perf->mutable_op_memory()->add_output_memory(output_info.size());
    }

    perf->mutable_op_memory()->set_temp_memory(
        cost_node->temporary_memory_size());
    perf->mutable_op_memory()->set_persistent_memory(
        cost_node->persistent_memory_size());
  }
  return ret;
}

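// Records one tensor size in the histogram, updating the count, sum, min,
// max, and the power-of-two bucket the value falls into.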
void TensorSizeHistogram::Add(const uint64 value) {
  num_elem_++;
  sum_elem_ += value;
  min_ = std::min(min_, value);
  max_ = std::max(max_, value);
  buckets_[Index(value)]++;
}

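// Merges another histogram into this one by combining the summary statistics
// and adding the bucket counts element-wise.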
void TensorSizeHistogram::Merge(const TensorSizeHistogram& src) {
  num_elem_ += src.num_elem_;
  sum_elem_ += src.sum_elem_;
  min_ = std::min(min_, src.min_);
  max_ = std::max(max_, src.max_);
  std::transform(buckets_.begin(), buckets_.end(), src.buckets_.begin(),
                 buckets_.begin(), std::plus<uint64>());
}

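// Renders the histogram as text: a summary line followed by one row per
// non-empty bucket with its count, percentage, cumulative percentage, and a
// bar of '#' marks (40 marks correspond to 100%).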
string TensorSizeHistogram::ToString() const {
  string r = absl::StrFormat(
      "Count: %lld, Average: %s, Min: %s, Max: %s"
      "\n------------------------------------------------------\n",
      num_elem_, strings::HumanReadableNumBytes(Average()),
      strings::HumanReadableNumBytes(min_),
      strings::HumanReadableNumBytes(max_));
  const double mult = num_elem_ > 0 ? 100.0 / num_elem_ : 0.0;
  uint64 cumul_sum = 0;

  for (int i = 0; i < buckets_.size(); i++) {
    if (buckets_[i] == 0) continue;
    cumul_sum += buckets_[i];
    uint64 left = i == 0 ? 0ULL : 1ULL << (i - 1);
    uint64 right = 1ULL << i;
    absl::StrAppendFormat(&r, "[ %12s, %12s) %7d %7.3f%% %7.3f%% ",
                          strings::HumanReadableNumBytes(left),
                          strings::HumanReadableNumBytes(right),
                          buckets_[i],         // count
                          mult * buckets_[i],  // percentage
                          mult * cumul_sum);   // cumulative percentage

    // Add hash marks based on percentage; 40 marks for 100%.
    auto marks = static_cast<int>(
        (static_cast<double>(40 * buckets_[i] + (num_elem_ >> 1)) / num_elem_));
    absl::StrAppendFormat(&r, "%s\n", std::string(marks, '#'));
  }
  return r;
}

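// Maps a value to its power-of-two bucket: 0 -> bucket 0, 1 -> bucket 1,
// 2-3 -> bucket 2, 4-7 -> bucket 3, and so on, capped at kMaxBuckets - 1.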
const int TensorSizeHistogram::Index(const uint64 value) const {
  // Log2Floor64 returns -1 for 0, 0 for 1, 1 for 2-3, 2 for 4-7, ...
  const auto index = Log2Floor64(value) + 1;
  return std::min(index, kMaxBuckets - 1);
}

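// Reduces a (non-channel) device name to a "/<job>/<device type>" class
// string, e.g. "/job:worker/replica:0/task:0/device:GPU:0" -> "/worker/GPU".
// Names that use '_' in place of ':' separators are rewritten and re-parsed
// before giving up.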
string GetDeviceClassForNonChannelDevice(const string& device_name) {
  DeviceNameUtils::ParsedName parsed_name;
  bool parsed = DeviceNameUtils::ParseFullName(device_name, &parsed_name);
  if (!parsed) {
    string name = str_util::StringReplace(device_name, "/job_", "/job:", true);
    name = str_util::StringReplace(name, "/replica_", "/replica:", true);
    name = str_util::StringReplace(name, "/task_", "/task:", true);
    name = str_util::StringReplace(name, "/device_", "/device:", true);
    name = str_util::StringReplace(name, "GPU_", "GPU:", true);
    name = str_util::StringReplace(name, "CPU_", "CPU:", true);
    name = str_util::StringReplace(name, "gpu_", "gpu:", true);
    name = str_util::StringReplace(name, "cpu_", "cpu:", true);
    parsed = DeviceNameUtils::ParseFullName(name, &parsed_name);
  }
  if (parsed) {
    const string jobname = parsed_name.has_job ? parsed_name.job : "";
    return absl::StrCat("/", jobname, "/", parsed_name.type);
  } else {
    return "Unclassified";
  }
}

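// Classifies a device name, with special handling for channel devices whose
// names embed a source and a destination: a name of the form
// "Channel..._from_<src>_to_<dst>" maps to
// "Channel: <src class> -> <dst class>".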
string GetDeviceClass(const string& device_name) {
  // TODO(dyoon): the channel device name follows the convention we currently
  // have in VirtualScheduler. This should be revised together with
  // VirtualScheduler as well as VirtualPlacer in the future.
  if (device_name.find("Channel") != string::npos) {
    const string from = "_from_";
    const string to = "_to_";
    const auto from_loc = device_name.find(from);
    const auto to_loc = device_name.find(to);
    const auto src_device_full = device_name.substr(
        from_loc + from.size(), to_loc - (from_loc + from.size()));
    const auto dst_device_full = device_name.substr(to_loc + to.size());
    return absl::StrCat(
        "Channel", ": ", GetDeviceClassForNonChannelDevice(src_device_full),
        " -> ", GetDeviceClassForNonChannelDevice(dst_device_full));
  } else {
    return GetDeviceClassForNonChannelDevice(device_name);
  }
}

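// Summarizes the tensor sizes recorded in run_metadata's step stats as
// histograms. If `verbosity` is true, a per-device histogram is printed in
// addition to the aggregated per-device-class histogram.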
string GetStatsStringFromRunMetadata(const RunMetadata& run_metadata,
                                     bool verbosity) {
  // TODO(dyoon): print out other stats as needed.
  std::ostringstream output;

  // Tensor size histograms: if verbosity is set, output a per-device
  // histogram in addition to the per-class histogram.
  std::unordered_map<string, TensorSizeHistogram> device_to_hist_map;
  const auto& step_stats = run_metadata.step_stats();
  for (const auto& dev_stat : step_stats.dev_stats()) {
    const auto& device_name = dev_stat.device();
    auto& hist = device_to_hist_map[device_name];
    for (const auto& node_stat : dev_stat.node_stats()) {
      for (const auto& node_output : node_stat.output()) {
        // TODO(dyoon): Calculate the tensor size from tensor_description's
        // dtype and shape, instead of using the optional
        // allocation_description.
        const auto size = node_output.tensor_description()
                              .allocation_description()
                              .allocated_bytes();
        hist.Add(size);
      }
    }
  }
  if (verbosity) {
    output << "\n";
    output << "Per device tensor size histogram.\n";
  }

  std::unordered_map<string, TensorSizeHistogram> device_class_to_hist_map;
  for (const auto& device_hist : device_to_hist_map) {
    const auto& device_name = device_hist.first;
    const auto& hist = device_hist.second;
    if (verbosity) {
      output << "Device: " << device_name << "\n" << hist.ToString() << "\n";
    }
    const auto device_class = GetDeviceClass(device_name);
    auto it = device_class_to_hist_map.find(device_class);
    if (it == device_class_to_hist_map.end()) {
      device_class_to_hist_map.emplace(device_class, TensorSizeHistogram(hist));
    } else {
      it->second.Merge(hist);
    }
  }
  output << "\n";
  output << "Aggregated per device / channel type tensor size histogram:\n";
  for (const auto& device_hist : device_class_to_hist_map) {
    const auto& device_name = device_hist.first;
    const auto& hist = device_hist.second;
    output << "Device: " << device_name << "\n" << hist.ToString() << "\n";
  }
  output << "\n";

  return output.str();
}

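// Sets costs->execution_time from the component times. When compute and
// memory accesses are assumed to overlap, the slowest component dominates;
// otherwise the component times are additive.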
void CombineCostsAndUpdateExecutionTime(bool compute_memory_overlap,
                                        Costs* costs) {
  if (compute_memory_overlap) {
    costs->execution_time =
        std::max(costs->intermediate_memory_time,
                 std::max(costs->compute_time, costs->memory_time));
  } else {
    costs->execution_time = costs->compute_time + costs->memory_time +
                            costs->intermediate_memory_time;
  }
}
}  // end namespace grappler
}  // end namespace tensorflow