1 /* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
2
3 Licensed under the Apache License, Version 2.0 (the "License");
4 you may not use this file except in compliance with the License.
5 You may obtain a copy of the License at
6
7 http://www.apache.org/licenses/LICENSE-2.0
8
9 Unless required by applicable law or agreed to in writing, software
10 distributed under the License is distributed on an "AS IS" BASIS,
11 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 See the License for the specific language governing permissions and
13 limitations under the License.
14 ==============================================================================*/
15
16 #include "tensorflow/compiler/tf2tensorrt/convert/convert_graph.h"
17
18 #include <fstream>
19 #include <list>
20 #include <map>
21 #include <set>
22 #include <unordered_map>
23 #include <unordered_set>
24 #include <utility>
25 #include <vector>
26
27 #include "absl/strings/str_cat.h"
28 #include "tensorflow/compiler/tf2tensorrt/common/utils.h"
29 #include "tensorflow/compiler/tf2tensorrt/convert/convert_nodes.h"
30 #include "tensorflow/compiler/tf2tensorrt/convert/logger_registry.h"
31 #include "tensorflow/compiler/tf2tensorrt/convert/utils.h"
32 #include "tensorflow/compiler/tf2tensorrt/segment/segment.h"
33 #include "tensorflow/core/common_runtime/gpu/gpu_id.h"
34 #include "tensorflow/core/common_runtime/gpu/gpu_id_manager.h"
35 #include "tensorflow/core/common_runtime/gpu/gpu_process_state.h"
36 #include "tensorflow/core/common_runtime/graph_constructor.h"
37 #include "tensorflow/core/framework/function.h"
38 #include "tensorflow/core/framework/graph_to_functiondef.h"
39 #include "tensorflow/core/framework/node_def_builder.h"
40 #include "tensorflow/core/graph/algorithm.h"
41 #include "tensorflow/core/graph/graph.h"
42 #include "tensorflow/core/grappler/clusters/virtual_cluster.h"
43 #include "tensorflow/core/grappler/costs/graph_properties.h"
44 #include "tensorflow/core/grappler/devices.h"
45 #include "tensorflow/core/grappler/optimizers/meta_optimizer.h"
46 #include "tensorflow/core/grappler/utils.h"
47 #include "tensorflow/core/lib/core/errors.h"
48 #include "tensorflow/core/lib/gtl/cleanup.h"
49 #include "tensorflow/core/lib/strings/numbers.h"
50 #include "tensorflow/core/platform/logging.h"
51 #include "tensorflow/core/protobuf/config.pb.h" // NOLINT
52 #include "tensorflow/core/protobuf/device_properties.pb.h" // NOLINT
53 #include "tensorflow/core/protobuf/rewriter_config.pb.h" // NOLINT
54 #include "tensorflow/core/util/device_name_utils.h"
55 #include "tensorflow/tools/graph_transforms/transform_utils.h"
56
57 #if GOOGLE_CUDA && GOOGLE_TENSORRT
58 #include "third_party/gpus/cuda/include/cuda_runtime_api.h"
59 #include "third_party/tensorrt/NvInfer.h"
60 namespace tensorflow {
61 namespace tensorrt {
62 namespace convert {
63
64 using absl::StrAppend;
65 using absl::StrCat;
66 using ::tensorflow::tensorrt::segment::ClusterProperty;
67 using ::tensorflow::tensorrt::segment::NodePtrCompare;
68 using ::tensorflow::tensorrt::segment::Segment;
69
70 namespace {
71
BuildNodeMap(const Graph & graph,std::unordered_map<string,Node * > * node_map)72 Status BuildNodeMap(const Graph& graph,
73 std::unordered_map<string, Node*>* node_map) {
74 for (auto* node : graph.op_nodes()) {
75 if (!node_map->insert({node->name(), node}).second) {
76 return errors::AlreadyExists("Node name is not unique in graph: " +
77 node->name());
78 }
79 }
80 return Status::OK();
81 }
82
GetEngineType(const ConversionParams & params)83 EngineInfo::EngineType GetEngineType(const ConversionParams& params) {
84 return (params.is_dyn_op || params.use_calibration)
85 ? EngineInfo::EngineType::TRTDynamic
86 : EngineInfo::EngineType::TRTStatic;
87 }
88
89 // Returns true when use_implicit_batch is false or when we are building dynamic
90 // engine, to allow unknown size for dimensions rather than dimension 0.
AllowDynamicNonBatchDimension(const ConversionParams & params)91 bool AllowDynamicNonBatchDimension(const ConversionParams& params) {
92 return !params.use_implicit_batch ||
93 GetEngineType(params) == EngineInfo::EngineType::TRTDynamic;
94 }
95
96 struct EdgePtrCompare {
operator ()tensorflow::tensorrt::convert::__anonff1337d10111::EdgePtrCompare97 bool operator()(const Edge* lhs, const Edge* rhs) const {
98 return lhs->id() < rhs->id();
99 }
100 };
101
102 // TODO(laigd): instead of deciding the device here, the converter should accept
103 // a device name as one of the conversion parameter so users can control on
104 // which device they want to run the conversion.
GetFirstValidDeviceId()105 std::pair<TfGpuId, PlatformGpuId> GetFirstValidDeviceId() {
106 for (int tf_gpu_id_value = 0; tf_gpu_id_value < 100; ++tf_gpu_id_value) {
107 TfGpuId tf_gpu_id(tf_gpu_id_value);
108 PlatformGpuId platform_gpu_id;
109 Status s = GpuIdManager::TfToPlatformGpuId(tf_gpu_id, &platform_gpu_id);
110 if (s.ok()) {
111 VLOG(1) << "Found TF GPU " << tf_gpu_id.value() << " at cuda device "
112 << platform_gpu_id.value();
113 return std::make_pair(tf_gpu_id, platform_gpu_id);
114 }
115 }
116 LOG(ERROR) << "Could not find any TF GPUs";
117 return std::make_pair(TfGpuId(-1), PlatformGpuId(-1));
118 }
119
120 // Returns false for const nodes (we intend to drop control edges from those).
ShallKeepControlEdgeFrom(const Node * input_node)121 bool ShallKeepControlEdgeFrom(const Node* input_node) {
122 if (!input_node) {
123 LOG(ERROR) << "Node pointer is null, this should not happen";
124 return false;
125 }
126 return input_node->type_string() != "Const";
127 }
128
129 // Function to get subsegment information structure.
GetEngineInfo(const Graph * g,const grappler::GraphProperties & graph_properties,const Segment & segment,const std::unordered_map<string,Node * > & node_map,const std::vector<Node * > & reverse_topo_order,EngineInfo * info)130 Status GetEngineInfo(const Graph* g,
131 const grappler::GraphProperties& graph_properties,
132 const Segment& segment,
133 const std::unordered_map<string, Node*>& node_map,
134 const std::vector<Node*>& reverse_topo_order,
135 EngineInfo* info) {
136 std::vector<const Node*> subgraph_nodes; // Topologically sorted nodes.
137 std::set<const Node*> added_const_nodes; // Used to prevent double insertion.
138
139 const ClusterProperty& segment_property = segment.property;
140 const std::set<const Node*, NodePtrCompare>& segment_nodes = segment.nodes;
141
142 // The device assignment accumulated from the compatible device assignments
143 // for the nodes in the segment.
144 const DeviceNameUtils::ParsedName segment_device =
145 segment_property.DeviceName();
146 info->max_batch_size = segment_property.BatchSize().GetOptionalMaxBatchSize();
147
148 // Map from src_node_name+port to the unique port numbers of the TRT op, where
149 // the src_node_name is the name of the source node of the input/output
150 // edge, thus there must not be any duplicates since source nodes of
151 // input/output edges must be in different split of the graph.
152 // TODO(aaroey): consider using node id and port instead.
153 // TODO(aaroey): using topo order instead of reverting reverse topo order.
154 std::unordered_map<string, int> input_to_engine_port, output_to_engine_port;
155 for (auto it = reverse_topo_order.rbegin(); it != reverse_topo_order.rend();
156 ++it) {
157 const Node* node = *it;
158 if (segment_nodes.count(node) == 0) continue;
159 subgraph_nodes.push_back(node);
160
161 const int node_id = node->id();
162 const string& node_name = node->name();
163
164 // Create input connections. Sort edges first to make deterministic since
165 // in_edges is a set of pointers.
166 std::vector<const Edge*> in_edges(node->in_edges().begin(),
167 node->in_edges().end());
168 std::sort(in_edges.begin(), in_edges.end(), EdgePtrCompare());
169 for (const auto edge : in_edges) {
170 auto input_node = edge->src();
171 if (input_node->IsSource() || segment_nodes.count(input_node)) {
172 continue;
173 }
174 if (edge->IsControlEdge()) {
175 if (ShallKeepControlEdgeFrom(input_node)) {
176 // Non-Const control input.
177 info->connections.emplace_back(input_node->name(), input_node->id(),
178 node_name, node_id,
179 /*input_edge=*/true);
180 }
181 } else if (input_node->type_string() == "Const") {
182 // Add constant data input nodes into the segment graphdef (thus also in
183 // the engine). We don't care if it has other output edges going into
184 // other engines or TF nodes. Since we add it only to the segment
185 // graphdef, not the segment itself, it won't be removed from the graph.
186 // If it doesn't have any edges, TF will prune it out.
187 //
188 // Note that the segmenter already ensure that the constant data input
189 // is valid and supported by the engine.
190 if (!added_const_nodes.insert(input_node).second) {
191 // Already added before.
192 continue;
193 }
194 VLOG(1) << "Adding const node " << input_node->name();
195 } else {
196 // Non-const data input.
197 int port = Graph::kControlSlot - 1;
198 // Use the source non-segment node name/port as key.
199 const string s = StrCat(input_node->name(), ":", edge->src_output());
200 VLOG(1) << "Input edge = " << s;
201 if (input_to_engine_port.count(s)) {
202 port = input_to_engine_port.at(s);
203 } else {
204 port = input_to_engine_port.size();
205 input_to_engine_port.insert({s, port});
206 }
207 info->connections.emplace_back(
208 input_node->name(), input_node->id(), edge->src_output(), node_name,
209 node_id, edge->dst_input(), /*input_edge=*/true, port);
210 }
211 }
212 // Create output connections. Sort edges first to make deterministic since
213 // out_edges is a set of pointers.
214 std::vector<const Edge*> out_edges(node->out_edges().begin(),
215 node->out_edges().end());
216 std::sort(out_edges.begin(), out_edges.end(), EdgePtrCompare());
217 for (const auto edge : out_edges) {
218 auto output_node = edge->dst();
219 if (output_node->IsSink() || segment_nodes.count(output_node)) {
220 continue;
221 }
222 if (edge->IsControlEdge()) {
223 // Control output.
224 if (ShallKeepControlEdgeFrom(node)) {
225 info->connections.emplace_back(output_node->name(), output_node->id(),
226 node_name, node_id,
227 /*input_edge=*/false);
228 }
229 } else {
230 // Data output.
231 int port = Graph::kControlSlot - 1;
232 // Use the source segment node name/port as key.
233 const string s = StrCat(node_name, ":", edge->src_output());
234 VLOG(1) << "Output edge = " << s;
235 if (output_to_engine_port.count(s)) {
236 port = output_to_engine_port.at(s);
237 } else {
238 port = output_to_engine_port.size();
239 output_to_engine_port.insert({s, port});
240 }
241 info->connections.emplace_back(
242 output_node->name(), output_node->id(), edge->dst_input(),
243 node_name, node_id, edge->src_output(), /*input_edge=*/false, port);
244 }
245 }
246 } // For each segment node in topological order.
247
248 // Construct the const nodes first.
249 subgraph_nodes.insert(subgraph_nodes.begin(), added_const_nodes.begin(),
250 added_const_nodes.end());
251 string scope_name;
252 TF_RETURN_IF_ERROR(ConvertSegmentToGraphDef(
253 g, graph_properties, subgraph_nodes, &info->connections,
254 &info->segment_graph_def, &scope_name));
255 info->engine_name = StrCat(scope_name, info->engine_name);
256 VLOG(1) << "Converted TensorRT candidate segment '" << info->engine_name
257 << "' to a GraphDef";
258 if (segment_device.has_type) {
259 // If the accumulated device assignment for the segment has a device type,
260 // the segmenter guarantees the device type is GPU. Use the device
261 // assignment in this case.
262 if (segment_device.type != "GPU") {
263 return errors::Internal(
264 "segment device is not GPU: ",
265 DeviceNameUtils::ParsedNameToString(segment_device));
266 }
267 info->device = DeviceNameUtils::ParsedNameToString(segment_device);
268 } else {
269 TfGpuId tf_gpu_id;
270 PlatformGpuId platform_gpu_id;
271 std::tie(tf_gpu_id, platform_gpu_id) = GetFirstValidDeviceId();
272 if (tf_gpu_id.value() >= 0) {
273 DeviceNameUtils::ParsedName parsed_name;
274 parsed_name.type = "GPU";
275 parsed_name.has_type = true;
276 parsed_name.id = tf_gpu_id.value();
277 parsed_name.has_id = true;
278 info->device = DeviceNameUtils::ParsedNameToString(parsed_name);
279 } else {
280 VLOG(1) << "No device is assigned to the segment. A device will be "
281 "assigned during graph execution (inference).";
282 }
283 }
284 return Status::OK();
285 }
286
287 // Helper function to update edge connection from the removed node to the
288 // engine node. If an outside node is gone, it must have been absorbed into
289 // an engine node. Find the engine node.
UpdateToEngineNode(const std::vector<EngineInfo> & infos,const size_t my_engine_id,const std::vector<Node * > & engine_nodes,const bool is_input_edge,const string & node_name,Node ** node,int * port)290 void UpdateToEngineNode(const std::vector<EngineInfo>& infos,
291 const size_t my_engine_id,
292 const std::vector<Node*>& engine_nodes,
293 const bool is_input_edge, const string& node_name,
294 Node** node, int* port) {
295 for (size_t t = 0; t < infos.size(); ++t) {
296 if (t == my_engine_id) {
297 continue;
298 }
299 const auto& info = infos.at(t);
300 for (const auto& eng_conn : info.connections) {
301 // If the connection being updated is an input connection, the source of
302 // the connection must be an output connection of another engine. And vise
303 // versa.
304 if (is_input_edge == eng_conn.is_input_edge) continue;
305 if (eng_conn.inside_node_name == node_name &&
306 eng_conn.inside_port == *port) {
307 *node = CHECK_NOTNULL(engine_nodes[t]);
308 QCHECK_EQ(info.engine_name, (**node).name())
309 << "Engine name mismatch: " << info.engine_name << " vs "
310 << (**node).name();
311 *port = eng_conn.port_number;
312 return;
313 }
314 }
315 }
316 LOG(FATAL) << "Node " << node_name << " not found in any engine.";
317 }
318
319 // Function to insert a TRT engine node into the graph.
320 // Create engine nodes in the following way:
321 // 1. Each invocation of CreateTRTNode creates an engine node for infos[pos]
322 // 2. When an engine node is created, add it into the graph with necessary
323 // re-wiring.
324 // 2.1. If the outside connected node is existing, connect the engine
325 // node to it.
326 // 2.2. If the outside connected node is gone, it must have been absorted
327 // into another engine node (which was processed before the processing
328 // one). Connect to the pre-existing engine node instead.
329 // 3. In this way, we ensure the graph is topologically sort-able after each
330 // invocation of CreateTRTNode().
CreateTRTNode(const ConversionParams & params,const std::vector<EngineInfo> & infos,int pos,int default_max_batch_size,Graph * graph,std::vector<Node * > * engine_nodes)331 Status CreateTRTNode(const ConversionParams& params,
332 const std::vector<EngineInfo>& infos, int pos,
333 int default_max_batch_size, Graph* graph,
334 std::vector<Node*>* engine_nodes) {
335 const auto& info = infos.at(pos);
336 std::vector<tensorflow::TensorShapeProto> input_shape_protos;
337 std::vector<PartialTensorShape> input_shapes;
338 std::vector<NodeDefBuilder::NodeOut> inputs;
339 std::vector<Node*> input_nodes;
340 std::vector<Node*> control_input_nodes;
341 std::unordered_set<string> control_input_names;
342 std::vector<DataType> out_types;
343
344 VLOG(1) << "Processing " << info.engine_name;
345 // Collect needed info for creating the engine node in the graph
346 for (const auto& conn : info.connections) {
347 // Control edges
348 if (conn.is_control_edge()) {
349 // Skip control outputs for now. control output info are not needed for
350 // node creation and will be processed later.
351 if (!conn.is_input_edge) continue;
352
353 // Rewrire control input if it's not found in original graph.
354 Node* input_node = graph->FindNodeId(conn.outside_id);
355 int port = Graph::kControlSlot;
356 if (!input_node) {
357 UpdateToEngineNode(infos, pos, *engine_nodes, /*is_input_edge=*/true,
358 conn.outside_node_name, &input_node, &port);
359 QCHECK_EQ(Graph::kControlSlot, port);
360 }
361 if (!control_input_names.insert(input_node->name()).second) {
362 continue;
363 }
364 control_input_nodes.push_back(input_node);
365 VLOG(1) << "Engine Control Input " << input_node->name() << " -> "
366 << info.engine_name;
367 } else {
368 // Data edges
369 if (!conn.is_input_edge) {
370 // Set the data types of output edge.
371 if (out_types.size() <= conn.port_number) {
372 out_types.resize(conn.port_number + 1);
373 }
374 out_types.at(conn.port_number) = conn.connection_type;
375 } else {
376 // Set the shapes and data types of input edge.
377 if (input_shapes.size() <= conn.port_number) {
378 input_shape_protos.resize(conn.port_number + 1);
379 input_shapes.resize(conn.port_number + 1);
380 }
381 conn.outside_shape.AsProto(&input_shape_protos.at(conn.port_number));
382 input_shapes.at(conn.port_number) = conn.outside_shape;
383 // Shape must be fully defined (excluding batch dimension) for static
384 // mode.
385 if (params.use_implicit_batch &&
386 info.engine_type == EngineInfo::EngineType::TRTStatic) {
387 for (int i = 1; i < conn.outside_shape.dims(); i++) {
388 if (conn.outside_shape.dim_size(i) <= 0) {
389 return errors::Internal(
390 "Not fully defined input shape when in static mode which "
391 "should have been excluded by the segmenter. ");
392 }
393 }
394 }
395
396 // Rewrire data input if it's not found in original graph.
397 Node* input_node = graph->FindNodeId(conn.outside_id);
398 int port = conn.outside_port;
399 if (!input_node) {
400 UpdateToEngineNode(infos, pos, *engine_nodes, /*is_input_edge=*/true,
401 conn.outside_node_name, &input_node, &port);
402 }
403 if (std::find_if(
404 std::begin(inputs), std::end(inputs),
405 [input_node, &port](const NodeDefBuilder::NodeOut& inp) {
406 return inp.node == input_node->name() && inp.index == port;
407 }) == std::end(inputs)) {
408 inputs.emplace_back(input_node->name(), port, conn.connection_type);
409 input_nodes.push_back(CHECK_NOTNULL(input_node));
410 VLOG(1) << "Engine Input " << input_node->name() << ":" << port
411 << " -> " << info.engine_name << ":" << inputs.size() - 1;
412 }
413 }
414 }
415 }
416 // We don't support segments with no inputs. Fall back to native TF here to
417 // avoid crash later. Constant folding should've folded the ops that make up
418 // these segments.
419 if (inputs.empty()) {
420 return errors::Internal(
421 "Segment has no inputs (possible constfold failure)");
422 }
423
424 const bool calibrate_int8 =
425 (info.precision_mode == TrtPrecisionMode::INT8 && info.use_calibration);
426 // Build the engine and get its serialized representation.
427 string segment_string;
428
429 int max_batch_size = info.max_batch_size.has_value()
430 ? info.max_batch_size.value()
431 : default_max_batch_size;
432
433 if (info.engine_type == EngineInfo::EngineType::TRTStatic) {
434 std::pair<int, Allocator*> device_allocator =
435 GetDeviceAndAllocator(params, info);
436 int cuda_device_id = 0;
437 std::unique_ptr<TRTBaseAllocator> trt_allocator;
438 if (device_allocator.first >= 0) {
439 cuda_device_id = device_allocator.first;
440 trt_allocator.reset(new TRTDeviceAllocator(device_allocator.second));
441 } else {
442 // The value in trt_allocator is a nullptr and cudamalloc will be used.
443 LOG_WARNING_WITH_PREFIX << "Can't identify the cuda device. Running on "
444 "device 0 and use cudamalloc as an allocator";
445 }
446 cudaSetDevice(cuda_device_id);
447
448 auto trt_logger = GetLoggerRegistry()->LookUp(params.trt_logger_name);
449
450 // Create static engines with precision_mode fp32/fp16.
451 TrtUniquePtrType<nvinfer1::ICudaEngine> engine;
452 TF_RETURN_IF_ERROR(ConvertGraphDefToEngine(
453 info.segment_graph_def,
454 calibrate_int8 ? TrtPrecisionMode::FP32 : info.precision_mode,
455 max_batch_size, info.max_workspace_size_bytes, input_shapes, trt_logger,
456 trt_allocator.get(), /*calibrator=*/nullptr, &engine,
457 info.use_calibration, params.use_implicit_batch,
458 /*convert_successfully=*/nullptr,
459 /*profile=*/nullptr, info.engine_name));
460 TrtUniquePtrType<nvinfer1::IHostMemory> engine_data(engine->serialize());
461 segment_string = string(static_cast<const char*>(engine_data->data()),
462 engine_data->size());
463 }
464
465 string prec_string;
466 TF_RETURN_IF_ERROR(TrtPrecisionModeToName(info.precision_mode, &prec_string));
467 NodeDefBuilder node_builder(info.engine_name, "TRTEngineOp");
468 if (!info.device.empty()) node_builder.Device(info.device);
469 if (VLOG_IS_ON(1)) {
470 string ins = StrCat(info.engine_name, " inputs= ");
471 for (const auto& ii : inputs) {
472 StrAppend(&ins, ii.node, ":", ii.index, " ");
473 }
474 VLOG(1) << ins;
475 }
476 node_builder.Input(inputs);
477 for (const string& c : control_input_names) {
478 node_builder.ControlInput(c);
479 }
480
481 NodeDef trt_node;
482 NameAttrList function;
483 function.set_name(StrCat(info.engine_name, "_native_segment"));
484 Status status =
485 node_builder.Attr("input_shapes", input_shape_protos)
486 .Attr("static_engine",
487 info.engine_type == EngineInfo::EngineType::TRTStatic)
488 .Attr("segment_func", function)
489 .Attr("serialized_segment", segment_string)
490 .Attr("calibration_data", "")
491 .Attr("max_cached_engines_count", info.maximum_cached_engines)
492 .Attr("workspace_size_bytes", info.max_workspace_size_bytes)
493 .Attr("max_batch_size", max_batch_size)
494 .Attr("precision_mode", prec_string)
495 .Attr("use_calibration", info.use_calibration)
496 .Attr("_use_implicit_batch", params.use_implicit_batch)
497 .Attr("_allow_build_at_runtime", info.allow_build_at_runtime)
498 .Attr("OutT", out_types)
499 .Finalize(&trt_node);
500 if (!status.ok()) {
501 LOG(ERROR) << "Node construction failed with" << status;
502 return status;
503 }
504 VLOG(1) << "Adding TRTEngine " << info.engine_name << " to graph";
505
506 // Up until this point, graph is not modified. If we return !status.ok() from
507 // here, this segment will be skipped
508 // TODO(aaroey): let it return proper error status for the following logic
509 // instead of checking fail.
510 Node* engine_node = graph->AddNode(trt_node, &status);
511 (*engine_nodes)[pos] = engine_node;
512 if (!status.ok()) {
513 LOG(ERROR) << "Adding node failed " << status;
514 return status;
515 }
516 // Add control input and input edges to the engine node.
517 for (const auto in : control_input_nodes) {
518 VLOG(1) << "Connecting control edge from " << in->name() << " to "
519 << engine_node->name();
520 graph->AddControlEdge(in, engine_node);
521 }
522 VLOG(1) << "input_nodes size = " << input_nodes.size();
523 for (int i = 0; i < input_nodes.size(); ++i) {
524 Node* n = CHECK_NOTNULL(input_nodes[i]);
525 const auto& in = inputs[i];
526 VLOG(1) << "Connecting data edge from " << n->name() << ":" << in.index
527 << " to " << engine_node->name() << ":" << i;
528 graph->AddEdge(n, in.index, engine_node, i);
529 }
530
531 // Updates the inputs of output edges destination nodes, and point them to the
532 // engine node.
533 for (auto& conn : info.connections) {
534 if (conn.is_input_edge) {
535 continue;
536 }
537 Node* output_node = graph->FindNodeId(conn.outside_id);
538 int port = conn.outside_port;
539 if (!output_node) {
540 UpdateToEngineNode(infos, pos, *engine_nodes, /*is_input_edge=*/false,
541 conn.outside_node_name, &output_node, &port);
542 }
543 if (conn.is_control_edge()) {
544 VLOG(1) << "Updating control edge from " << engine_node->name() << " to "
545 << output_node->name();
546 QCHECK_EQ(Graph::kControlSlot, port);
547 graph->AddControlEdge(engine_node, output_node);
548 } else {
549 VLOG(1) << "Updating data edge from " << engine_node->name() << ":"
550 << conn.port_number << " to " << output_node->name() << ":"
551 << port;
552 // Use UpdateEdge() to avoid adding the same edge multiple times.
553 TF_CHECK_OK(
554 graph->UpdateEdge(engine_node, conn.port_number, output_node, port));
555 }
556 }
557 return Status::OK();
558 }
559
GetNextGraphSequenceNumber()560 int64 GetNextGraphSequenceNumber() {
561 static std::atomic<int64> graph_sequence_num;
562 return graph_sequence_num++;
563 }
564
565 constexpr char kCastInputTypeAttrName[] = "SrcT";
566
567 // Transforms node = cast(x, fp32) where datatype(x) != fp16 to:
568 // castToFp16 = cast(x, fp16)
569 // node = cast(castToFp16, fp32)
570 //
MaybeRewriteCastToFp32(GraphDef * graph_def,NodeDef * node_def)571 Status MaybeRewriteCastToFp32(GraphDef* graph_def, NodeDef* node_def) {
572 if (node_def->op() != "Cast") {
573 return Status::OK();
574 }
575
576 DataTypeVector input_types;
577 DataTypeVector output_types;
578 TF_RETURN_IF_ERROR(
579 graph_transforms::GetInOutTypes(*node_def, &input_types, &output_types));
580
581 if (input_types.size() != 1 || output_types.size() != 1) {
582 return errors::Internal("Bad cast operation");
583 }
584
585 if (input_types[0] == DT_HALF || output_types[0] != DT_FLOAT) {
586 return Status::OK();
587 }
588
589 VLOG(2) << "Rewriting cast to FP32 " << node_def->DebugString();
590
591 NodeDef* castToFp16 = graph_def->add_node();
592 for (auto attr_value : node_def->attr()) {
593 (*castToFp16->mutable_attr())[attr_value.first] = attr_value.second;
594 }
595 castToFp16->set_name(node_def->name() + "_split");
596 castToFp16->set_op("Cast");
597 castToFp16->set_device(node_def->device());
598 castToFp16->add_input(node_def->input(0));
599 (*castToFp16->mutable_attr())[kCastOutputTypeAttrName].set_type(DT_HALF);
600
601 node_def->set_input(0, castToFp16->name() + ":0");
602 (*node_def->mutable_attr())[kCastInputTypeAttrName].set_type(DT_HALF);
603
604 VLOG(2) << castToFp16->DebugString();
605 VLOG(2) << node_def->DebugString();
606
607 return Status::OK();
608 }
609
610 } // namespace
611
RegisterGraphToFunctionLibrary(const GraphDef & segment_graph_def,Graph * graph,const string & engine_name)612 Status RegisterGraphToFunctionLibrary(const GraphDef& segment_graph_def,
613 Graph* graph, const string& engine_name) {
614 Graph segment_graph(graph->flib_def());
615 TF_RETURN_IF_ERROR(ConvertGraphDefToGraph(GraphConstructorOptions(),
616 segment_graph_def, &segment_graph));
617 FunctionDefLibrary library;
618 auto segment_func = library.add_function();
619 TF_RETURN_IF_ERROR(GraphToFunctionDef(
620 segment_graph, StrCat(engine_name, "_native_segment"), segment_func));
621 if (VLOG_IS_ON(7)) {
622 VLOG(7) << engine_name << " Function_Def ";
623 VLOG(7) << segment_func->DebugString();
624 }
625 VLOG(1) << "Adding funcdef " << segment_func->signature().name()
626 << " to graphlib";
627 TF_RETURN_IF_ERROR(graph->AddFunctionLibrary(library));
628 return Status::OK();
629 }
630
GetDeviceAndAllocator(const ConversionParams & params,const EngineInfo & engine)631 std::pair<int, Allocator*> GetDeviceAndAllocator(const ConversionParams& params,
632 const EngineInfo& engine) {
633 int cuda_device_id = -1;
634 Allocator* dev_allocator = nullptr;
635 if (params.cluster == nullptr || params.cluster->GetDeviceSet() == nullptr ||
636 engine.device.empty()) {
637 // If device is not set, use the first found GPU device for the conversion.
638 TfGpuId tf_gpu_id;
639 PlatformGpuId platform_gpu_id;
640 std::tie(tf_gpu_id, platform_gpu_id) = GetFirstValidDeviceId();
641 cuda_device_id = platform_gpu_id.value();
642 if (cuda_device_id >= 0) {
643 GPUOptions gpu_options;
644 // If the TF to Cuda gpu id mapping exist, the device and corresponding
645 // allocator must have been initialized already, so the
646 // GetGPUAllocator() call won't create a new allocator.
647 dev_allocator = GPUProcessState::singleton()->GetGPUAllocator(
648 gpu_options, tf_gpu_id, /*total_bytes=*/1, /*peer_gpu_ids=*/{});
649 }
650 return std::make_pair(cuda_device_id, dev_allocator);
651 }
652
653 // Use the device requested by the engine.
654 auto device_set = params.cluster->GetDeviceSet();
655 std::vector<Device*> devices;
656 DeviceNameUtils::ParsedName parsed_name;
657 if (DeviceNameUtils::ParseFullName(engine.device, &parsed_name) &&
658 parsed_name.has_id) {
659 device_set->FindMatchingDevices(parsed_name, &devices);
660 }
661 if (!devices.empty()) {
662 if (devices.size() > 1) {
663 string msg = "Found multiple matching devices using name '";
664 StrAppend(&msg, engine.device, "': ");
665 for (auto d : devices) StrAppend(&msg, d->name(), ", ");
666 StrAppend(&msg, ". Will get the allocator from first one.");
667 LOG_WARNING_WITH_PREFIX << msg;
668 }
669 AllocatorAttributes alloc_attr;
670 cuda_device_id = devices[0]->tensorflow_gpu_device_info()->gpu_id;
671 dev_allocator = devices[0]->GetAllocator(alloc_attr);
672 VLOG(1) << "Using allocator " << dev_allocator->Name()
673 << " and cuda_device_id " << cuda_device_id;
674 } else {
675 LOG_WARNING_WITH_PREFIX << "Cluster is set but device '" << engine.device
676 << "' is not found in the cluster";
677 }
678 return std::make_pair(cuda_device_id, dev_allocator);
679 }
680
681 // Entry function from optimization pass.
ConvertAfterShapes(const ConversionParams & params)682 Status ConvertAfterShapes(const ConversionParams& params) {
683 // Sanity checks.
684 if (params.precision_mode != TrtPrecisionMode::INT8 &&
685 params.use_calibration) {
686 return errors::InvalidArgument(
687 "Calibration with FP32 or FP16 is not supported.");
688 }
689
690 // Make a copy of the input_graph_def because grappler doesn't allow changes
691 // to the input_graph_def and GraphProperties only accepts GraphDef, but not
692 // Graph, as inputs.
693 //
694 // If the overhead of copying the input_graph_def becomes a concern, we can
695 // avoid the copy by (1) enhancing the GraphPropertiers representation to
696 // allow adding shape properties for newly created graph nodes and (2) rewrite
697 // the GraphDef transformation to Graph transformation.
698 GraphDef modified_graph_def = params.grappler_item->graph;
699 // When precision_mode is FP16, transform cast(x, fp32) to
700 // cast(cast(x, fp16), fp32). This creates cast(fp16, f32) that can be
701 // included in the TRTEngineOp as an TensorRT Identity layer for performance:
702 // . Avoid cast(fp32, fp16) in the TRT engine implementation for fp16
703 // precision.
704 // . Changing the input to the TRTEngine from fp32 to fp16 may reduce data
705 // moving from the host to the GPU.
706 if (params.precision_mode == TrtPrecisionMode::FP16) {
707 for (int i = 0; i < modified_graph_def.node_size(); i++) {
708 NodeDef* node_def = modified_graph_def.mutable_node(i);
709 TF_RETURN_IF_ERROR(MaybeRewriteCastToFp32(&modified_graph_def, node_def));
710 }
711 }
712
713 // Construct a GrapplerItem using the modified graph_def and the input
714 // grappler_item.
715 grappler::GrapplerItem grappler_item =
716 params.grappler_item->WithGraph(std::move(modified_graph_def));
717 const GraphDef& graph_def = grappler_item.graph;
718
719 grappler::GraphProperties static_graph_properties(grappler_item);
720 TF_RETURN_IF_ERROR(static_graph_properties.InferStatically(true));
721
722 // Convert graphdef to graph.
723 FunctionLibraryDefinition flib(OpRegistry::Global(), graph_def.library());
724 Graph graph(flib);
725 TF_RETURN_IF_ERROR(
726 ConvertGraphDefToGraph(GraphConstructorOptions(), graph_def, &graph));
727
728 // Segment the graph into subgraphs that can be converted to TensorRT
729 segment::SegmentOptions segment_options;
730 // TODO(ben,jie,sami): exclude output nodes (DISCUSS IT)
731 for (const auto& node : *(params.output_names)) {
732 segment_options.exclude_node_list.insert(node);
733 }
734 segment_options.minimum_segment_size = params.minimum_segment_size;
735 segment_options.use_implicit_batch = params.use_implicit_batch;
736 if (segment_options.use_implicit_batch)
737 segment_options.maximum_batch_size = params.max_batch_size;
738 segment_options.allow_dynamic_non_batch_dim =
739 AllowDynamicNonBatchDimension(params);
740
741 segment::SegmentVector initial_segments;
742 TrtNodeValidator validator(static_graph_properties, params.precision_mode,
743 params.use_calibration, params.use_implicit_batch);
744 TF_RETURN_IF_ERROR(segment::SegmentGraph(
745 &graph, &static_graph_properties,
746 std::bind(&TrtNodeValidator::IsTensorRTCandidate, &validator,
747 std::placeholders::_1),
748 // Input validation is already done by TrtNodeValidator, so we don't
749 // need to check the input edges.
750 [](const Edge* edge) { return true; }, OutputEdgeValidator(),
751 segment_options, &initial_segments));
752 LOG(INFO) << "Number of TensorRT candidate segments: "
753 << initial_segments.size();
754
755 // Get the EngineInfo for each segment.
756 std::unordered_map<string, Node*> node_map;
757 TF_RETURN_IF_ERROR(BuildNodeMap(graph, &node_map));
758 std::vector<EngineInfo> engine_segments;
759 engine_segments.reserve(initial_segments.size());
760 std::vector<Node*> reverse_topo_order;
761 GetPostOrder(graph, &reverse_topo_order);
762 segment::SegmentVector converted_segments;
763 converted_segments.reserve(initial_segments.size());
764 string engine_name_prefix =
765 StrCat("TRTEngineOp_", GetNextGraphSequenceNumber(), "_");
766 for (size_t t = 0; t < initial_segments.size(); t++) {
767 auto& curr_segment = initial_segments.at(t);
768 EngineInfo curr_engine;
769 curr_engine.engine_name = StrCat(engine_name_prefix, t);
770 Status status = GetEngineInfo(&graph, static_graph_properties, curr_segment,
771 node_map, reverse_topo_order, &curr_engine);
772 if (!status.ok()) {
773 LOG_WARNING_WITH_PREFIX << "Failed to get engine info for segment " << t
774 << ": " << status;
775 continue;
776 }
777 curr_engine.precision_mode = params.precision_mode;
778 curr_engine.engine_type = GetEngineType(params);
779 curr_engine.use_calibration = params.use_calibration;
780 curr_engine.maximum_cached_engines = params.max_cached_engines;
781 curr_engine.allow_build_at_runtime = params.allow_build_at_runtime;
782 if (!curr_engine.max_batch_size.has_value()) {
783 curr_engine.max_batch_size = params.max_batch_size;
784 }
785
786 status = RegisterGraphToFunctionLibrary(curr_engine.segment_graph_def,
787 &graph, curr_engine.engine_name);
788
789 if (!status.ok()) {
790 LOG_WARNING_WITH_PREFIX
791 << "Failed to register segment graphdef to the library " << t << ": "
792 << status;
793 continue;
794 }
795
796 engine_segments.push_back(std::move(curr_engine));
797 converted_segments.push_back(std::move(curr_segment));
798
799 if (VLOG_IS_ON(8)) {
800 string fname = engine_segments.back().engine_name;
801 StrAppend(&fname, ".pb");
802 std::fstream f;
803 f.open(fname.c_str(), std::fstream::out | std::fstream::binary);
804 f << engine_segments.at(t).segment_graph_def.SerializeAsString();
805 f.close();
806 }
807 }
808
809 // Save the cuda device if we may need to switch to another cuda device to
810 // build static engines.
811 absl::optional<int> old_cuda_device = absl::nullopt;
812 if (!params.is_dyn_op) {
813 int cuda_device_id;
814 cudaError_t cuda_error = cudaGetDevice(&cuda_device_id);
815 if (cuda_error != cudaSuccess) {
816 LOG_WARNING_WITH_PREFIX << "Couldn't get current device: "
817 << cudaGetErrorString(cuda_error);
818 } else {
819 VLOG(1) << "Current cuda device is " << cuda_device_id;
820 old_cuda_device = cuda_device_id;
821 }
822 }
823
824 auto restore_cuda_device = gtl::MakeCleanup([old_cuda_device] {
825 if (old_cuda_device.has_value()) {
826 cudaSetDevice(old_cuda_device.value());
827 }
828 });
829
830 std::vector<Node*> engine_nodes;
831 engine_nodes.resize(engine_segments.size());
832 for (int i = 0; i < engine_segments.size(); ++i) {
833 auto& engine = engine_segments.at(i);
834 // TODO(b/170762693): implement the heuristic to calculate
835 // max_workspace_size_bytes.
836 engine.max_workspace_size_bytes = params.max_workspace_size_bytes;
837 VLOG(1) << "Assigned " << engine.max_workspace_size_bytes << " bytes to "
838 << engine.engine_name;
839 auto status = CreateTRTNode(params, engine_segments, i,
840 params.max_batch_size, &graph, &engine_nodes);
841
842 string msg = StrCat("segment ", i, " consisting of ",
843 converted_segments.at(i).nodes.size(), " nodes by ",
844 engine.engine_name);
845 if (status.ok()) {
846 LOG(INFO) << "Replaced " << msg << ".";
847 } else {
848 // Graph is not modified.
849 LOG_WARNING_WITH_PREFIX << "Cannot replace " << msg
850 << " reason: " << status.error_message()
851 << " (keeping original segment).";
852 }
853 if (VLOG_IS_ON(1)) {
854 msg = "Segment consists of nodes: ";
855 for (const Node* node : converted_segments.at(i).nodes) {
856 StrAppend(&msg, node->name(), ", ");
857 }
858 VLOG(1) << msg;
859 }
860
861 // If status is ok, we successfully added the node to the graph and can
862 // remove segment ops. Otherwise graph is not modified.
863 if (status.ok()) {
864 for (const Node* node : converted_segments.at(i).nodes) {
865 graph.RemoveNode(const_cast<Node*>(node));
866 }
867 }
868 }
869 graph.ToGraphDef(params.output_graph_def);
870 VLOG(1) << "Returning from conversion";
871 return Status::OK();
872 }
873
874 } // namespace convert
875 } // namespace tensorrt
876 } // namespace tensorflow
877
878 #endif // GOOGLE_CUDA && GOOGLE_TENSORRT
879