1 /* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
2
3 Licensed under the Apache License, Version 2.0 (the "License");
4 you may not use this file except in compliance with the License.
5 You may obtain a copy of the License at
6
7 http://www.apache.org/licenses/LICENSE-2.0
8
9 Unless required by applicable law or agreed to in writing, software
10 distributed under the License is distributed on an "AS IS" BASIS,
11 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 See the License for the specific language governing permissions and
13 limitations under the License.
14 ==============================================================================*/
15
16 #include "tensorflow/core/grappler/costs/graph_memory.h"
17
18 #include <deque>
19 #include "tensorflow/core/framework/allocation_description.pb.h"
20 #include "tensorflow/core/framework/attr_value.pb.h"
21 #include "tensorflow/core/framework/node_def.pb.h"
22 #include "tensorflow/core/framework/step_stats.pb.h"
23 #include "tensorflow/core/framework/tensor.pb.h" // NOLINT
24 #include "tensorflow/core/framework/tensor_description.pb.h"
25 #include "tensorflow/core/framework/tensor_shape.h"
26 #include "tensorflow/core/framework/tensor_shape.pb.h"
27 #include "tensorflow/core/grappler/clusters/virtual_cluster.h"
28 #include "tensorflow/core/grappler/costs/graph_properties.h"
29 #include "tensorflow/core/grappler/utils.h"
30
31 namespace tensorflow {
32 namespace grappler {
33
InferStatically(const std::unordered_map<string,DeviceProperties> & devices)34 Status GraphMemory::InferStatically(
35 const std::unordered_map<string, DeviceProperties>& devices) {
36 VirtualCluster cluster(devices);
37 TF_RETURN_IF_ERROR(cluster.Provision());
38 TF_RETURN_IF_ERROR(cluster.Initialize(item_));
39 RunMetadata metadata;
40 Status s = cluster.Run(item_.graph, item_.feed, item_.fetch, &metadata);
41 // The virtual cluster returns the RESOURCE_EXHAUSTED error when it detects
42 // that the model would run out of memory. We still get the metadata we need
43 // out of the simulation, so we just ignore this error.
44 if (!s.ok() && s.code() != error::RESOURCE_EXHAUSTED) {
45 return s;
46 }
47 InferFromTrace(metadata.step_stats());
48 return Status::OK();
49 }
50
InferDynamically(Cluster * cluster)51 Status GraphMemory::InferDynamically(Cluster* cluster) {
52 if (!cluster->DetailedStatsEnabled()) {
53 return errors::Unavailable("Detailed stats collection must be enabled");
54 }
55
56 TF_RETURN_IF_ERROR(cluster->Initialize(item_));
57 RunMetadata metadata;
58 TF_RETURN_IF_ERROR(
59 cluster->Run(item_.graph, item_.feed, item_.fetch, &metadata));
60 InferFromTrace(metadata.step_stats());
61 return Status::OK();
62 }
63
GetWorstCaseMemoryUsage() const64 int64 GraphMemory::GetWorstCaseMemoryUsage() const {
65 int64 worst_case = -1;
66 for (const auto& peak_usage : peak_usage_) {
67 worst_case = std::max(worst_case, peak_usage.second.used_memory);
68 }
69 return worst_case;
70 }
71
InferMemUsageForNodes(const std::vector<const NodeDef * > & nodes,GraphProperties * properties,int64 * worst_case_memory_usage,int64 * best_case_memory_usage) const72 void GraphMemory::InferMemUsageForNodes(
73 const std::vector<const NodeDef*>& nodes, GraphProperties* properties,
74 int64* worst_case_memory_usage, int64* best_case_memory_usage) const {
75 // TODO(bsteiner) refine this: we should consider the multidevice case.
76 *worst_case_memory_usage = 0;
77 *best_case_memory_usage = 0;
78 for (const auto& node : item_.graph.node()) {
79 // Estimate the memory required to store the tensors generated by the node.
80 std::vector<OpInfo::TensorProperties> outputs =
81 properties->GetOutputProperties(node.name());
82 int64 node_memory_usage = InferMemUsageForNeighbors(outputs);
83
84 // Worst case memory usage corresponds to the case where all the nodes are
85 // alive.
86 *worst_case_memory_usage += node_memory_usage;
87
88 // Estimate the memory required to store the input tensors needed by the
89 // node.
90 std::vector<OpInfo::TensorProperties> inputs =
91 properties->GetInputProperties(node.name());
92 node_memory_usage += InferMemUsageForNeighbors(inputs);
93
94 *best_case_memory_usage =
95 std::max(*best_case_memory_usage, node_memory_usage);
96 }
97 }
98
InferMemUsageForNeighbors(const std::vector<OpInfo::TensorProperties> & props) const99 int64 GraphMemory::InferMemUsageForNeighbors(
100 const std::vector<OpInfo::TensorProperties>& props) const {
101 int64 neighbors_memory_usage = 0;
102 for (const auto& prop : props) {
103 DataType dtype = prop.dtype();
104 int size = DataTypeSize(dtype);
105 TensorShapeProto shape = prop.shape();
106 if (shape.unknown_rank()) {
107 // Can't infer the size if the rank is unknown, just skip.
108 continue;
109 }
110 // If one of the dimensions is unknown statically, assume it's one.
111 for (int i = 0; i < shape.dim_size(); ++i) {
112 if (shape.dim(i).size() < 0) {
113 shape.mutable_dim(i)->set_size(1);
114 }
115 }
116 int num_elems = TensorShape(shape).num_elements();
117 neighbors_memory_usage += num_elems * size;
118 }
119 return neighbors_memory_usage;
120 }
121
FindOrCreateLiveTensor(const string & node_name,int output_id,std::unordered_map<string,GraphMemory::LiveTensor * > * live_tensors,std::deque<GraphMemory::LiveTensor> * device_tensors)122 static GraphMemory::LiveTensor* FindOrCreateLiveTensor(
123 const string& node_name, int output_id,
124 std::unordered_map<string, GraphMemory::LiveTensor*>* live_tensors,
125 std::deque<GraphMemory::LiveTensor>* device_tensors) {
126 string name = strings::StrCat(node_name, ":", output_id);
127 GraphMemory::LiveTensor* live;
128 auto it = live_tensors->find(name);
129 if (it == live_tensors->end()) {
130 GraphMemory::LiveTensor temp;
131 temp.node = node_name;
132 temp.output_id = output_id;
133 temp.allocation_time = 0;
134 temp.deallocation_time = 0;
135 device_tensors->push_front(temp);
136 live = &device_tensors->front();
137 (*live_tensors)[name] = live;
138 } else {
139 live = it->second;
140 }
141 return live;
142 }
143
144 namespace {
145 struct Event {
Eventtensorflow::grappler::__anon5da8d3120111::Event146 Event(int64 _timestamp, bool _allocated,
147 const GraphMemory::LiveTensor* _tensor)
148 : timestamp(_timestamp), allocated(_allocated), tensor(_tensor) {}
149
150 int64 timestamp;
151 bool allocated;
152 const GraphMemory::LiveTensor* tensor;
153
operator <tensorflow::grappler::__anon5da8d3120111::Event154 bool operator<(const Event& other) const {
155 return timestamp < other.timestamp;
156 }
157 };
158 } // namespace
159
InferFromTrace(const StepStats & timeline)160 void GraphMemory::InferFromTrace(const StepStats& timeline) {
161 std::unordered_map<string, string> node_placement;
162 for (const auto& dev_stats : timeline.dev_stats()) {
163 for (const auto& node_stats : dev_stats.node_stats()) {
164 node_placement[node_stats.node_name()] = dev_stats.device();
165 }
166 }
167
168 std::unordered_map<string, LiveTensor*> live_tensors;
169 std::unordered_map<string, std::deque<LiveTensor>> live_tensors_per_device;
170 std::unordered_map<string, const NodeDef*> node_map;
171 for (const NodeDef& node : item_.graph.node()) {
172 node_map[node.name()] = &node;
173 }
174 for (const auto& dev_stats : timeline.dev_stats()) {
175 const string& device_name = dev_stats.device();
176 const bool is_gpu = (device_name.find("GPU:") || device_name.find("gpu:"));
177 std::deque<LiveTensor>& device_tensors =
178 live_tensors_per_device[dev_stats.device()];
179 for (const auto& node_stats : dev_stats.node_stats()) {
180 for (int i = 0; i < node_stats.output_size(); ++i) {
181 const auto& output = node_stats.output(i);
182
183 LiveTensor* live = FindOrCreateLiveTensor(
184 node_stats.node_name(), i, &live_tensors, &device_tensors);
185 live->memory_used = output.tensor_description()
186 .allocation_description()
187 .allocated_bytes();
188
189 // Allocations typically take place at the very beginning of the op
190 // execution.
191 live->allocation_time =
192 Costs::MicroSeconds(node_stats.all_start_micros());
193 // Add one nanosecond to the completion time of the ops to account for
194 // TF overhead that slightly delays deallocations.
195 live->deallocation_time = std::max<Costs::Duration>(
196 live->deallocation_time,
197 Costs::NanoSeconds(1) +
198 Costs::MicroSeconds(node_stats.all_start_micros() +
199 node_stats.op_end_rel_micros()));
200 }
201
202 auto it = node_map.find(node_stats.node_name());
203 if (it == node_map.end()) {
204 // Skip nodes inserted by TF since they don't exist in the original
205 // graph (e.g _Send/_Recv nodes).
206 continue;
207 }
208 const NodeDef* node = it->second;
209 std::unordered_set<int> swapped_inputs;
210 if (is_gpu) {
211 auto it = node->attr().find("_swap_to_host");
212 if (it != node->attr().end()) {
213 const AttrValue& val = it->second;
214 for (int port_id : val.list().i()) {
215 swapped_inputs.insert(port_id);
216 }
217 }
218 }
219 for (int i = 0; i < node->input_size(); ++i) {
220 if (swapped_inputs.find(i) != swapped_inputs.end()) {
221 // The memory of swapped inputs will be released as early as possible:
222 // therefore ignore this input when determining the deallocation time
223 // of the tensor.
224 continue;
225 }
226 const string& input = node->input(i);
227 int position;
228 string input_node = ParseNodeName(input, &position);
229 if (position < 0) {
230 // Skip control dependencies
231 continue;
232 }
233 LiveTensor* live = FindOrCreateLiveTensor(
234 input_node, position, &live_tensors,
235 &live_tensors_per_device[node_placement[input_node]]);
236 live->deallocation_time = std::max<Costs::Duration>(
237 live->deallocation_time,
238 Costs::NanoSeconds(1) +
239 Costs::MicroSeconds(node_stats.all_start_micros() +
240 node_stats.op_end_rel_micros()));
241 }
242 }
243 }
244
245 for (const auto& live_per_device : live_tensors_per_device) {
246 std::vector<Event> events;
247 events.reserve(2 * live_per_device.second.size());
248 for (const auto& live : live_per_device.second) {
249 events.emplace_back(static_cast<int64>(live.allocation_time.count()),
250 true, &live);
251 events.emplace_back(static_cast<int64>(live.deallocation_time.count()),
252 false, &live);
253 }
254 std::stable_sort(events.begin(), events.end());
255 size_t peak = 0;
256 std::unordered_set<const LiveTensor*> live_at_peak;
257 size_t current = 0;
258 std::unordered_set<const LiveTensor*> currently_live;
259 for (int i = 0; i < events.size(); ++i) {
260 const auto& event = events[i];
261
262 if (event.allocated) {
263 VLOG(1) << "At time " << event.timestamp << " allocated "
264 << event.tensor->memory_used << " for tensor "
265 << event.tensor->node << ":" << event.tensor->output_id;
266 current += event.tensor->memory_used;
267 currently_live.insert(event.tensor);
268 } else {
269 VLOG(1) << "At time " << event.timestamp << " deallocated "
270 << event.tensor->memory_used << " for tensor "
271 << event.tensor->node << ":" << event.tensor->output_id;
272 current -= event.tensor->memory_used;
273 currently_live.erase(event.tensor);
274 }
275 if (i + 1 == events.size() ||
276 event.timestamp != events[i + 1].timestamp) {
277 if (current > peak) {
278 peak = current;
279 live_at_peak = currently_live;
280 }
281 }
282 }
283 MemoryUsage& peak_mem_usage = peak_usage_[live_per_device.first];
284 peak_mem_usage.used_memory = peak;
285 peak_mem_usage.live_tensors.clear();
286 peak_mem_usage.live_tensors.reserve(live_at_peak.size());
287 for (const auto& live : live_at_peak) {
288 peak_mem_usage.live_tensors.push_back(*live);
289 }
290 }
291 }
292
293 } // end namespace grappler
294 } // end namespace tensorflow
295