1 /* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
2 
3 Licensed under the Apache License, Version 2.0 (the "License");
4 you may not use this file except in compliance with the License.
5 You may obtain a copy of the License at
6 
7     http://www.apache.org/licenses/LICENSE-2.0
8 
9 Unless required by applicable law or agreed to in writing, software
10 distributed under the License is distributed on an "AS IS" BASIS,
11 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 See the License for the specific language governing permissions and
13 limitations under the License.
14 ==============================================================================*/
15 
16 #include "tensorflow/core/framework/metrics.h"
17 #include "tensorflow/core/lib/monitoring/counter.h"
18 #include "tensorflow/core/lib/monitoring/sampler.h"
19 
20 namespace tensorflow {
21 namespace metrics {
22 namespace {
23 
// Unlabeled counter of graph executions. UpdateGraphExecTime() adds one to it
// for every call that reports a nonzero running time.
auto* graph_runs = monitoring::Counter<0>::New(
    "/tensorflow/core/graph_runs",
    "The number of graph executions used to collect "
    "/tensorflow/core/graph_run_time_usecs");
28 
// Unlabeled counter that accumulates the nonzero running times (microseconds)
// passed to UpdateGraphExecTime().
auto* graph_run_time_usecs = monitoring::Counter<0>::New(
    "/tensorflow/core/graph_run_time_usecs",
    "The total time spent on executing graphs in microseconds.");
32 
// Counter with two labels, "kind" and "name". Within this file "kind" is
// either "GraphOptimizationPass" (UpdateGraphOptimizationPassTime()) or
// "Grappler" (UpdateGrapplerPassTime()), and "name" is the pass name.
auto* graph_optimization_usecs =
    monitoring::Counter<2>::New("/tensorflow/core/graph_optimization_usecs",
                                "The total time spent running each graph "
                                "optimization pass in microseconds.",
                                "kind", "name");
38 
// Unlabeled sampler fed by UpdateGraphExecTime() with each nonzero running
// time (microseconds).
auto* graph_run_time_usecs_histogram = monitoring::Sampler<0>::New(
    {"/tensorflow/core/graph_run_time_usecs_histogram",
     "The wall-clock time spent on executing graphs in microseconds."},
    // Power of 2 with bucket count 20, scaled by 1000 usecs
    // (1000 * 2^20 usecs > 17 minutes)
    {monitoring::Buckets::Exponential(1000, 2, 20)});
44 
// Unlabeled sampler fed by UpdateGraphPendingQueueLength() with each reported
// queue length.
auto* graph_pending_queue_length_histogram = monitoring::Sampler<0>::New(
    {"/tensorflow/core/graph_pending_queue_length_histogram",
     "The number of pending (ready but not running) tasks in graph executor."},
    // Power of 1.5 with bucket count 30 (1.5^30 > 191k)
    {monitoring::Buckets::Exponential(1, 1.5, 30)});
50 
// Unlabeled sampler fed by RecordGraphInputTensors() with each reported size.
auto* graph_run_input_tensor_bytes = monitoring::Sampler<0>::New(
    {"/tensorflow/core/graph_run_input_tensor_bytes",
     "The size of input tensors in bytes."},
    // Power of 4 (growth factor 4, not 2) with bucket count 14
    // (4^14 bytes = 256MB)
    {monitoring::Buckets::Exponential(1, 4, 14)});
56 
// Unlabeled sampler fed by RecordGraphOutputTensors() with each reported size.
auto* graph_run_output_tensor_bytes = monitoring::Sampler<0>::New(
    {"/tensorflow/core/graph_run_output_tensor_bytes",
     "The size of output tensors in bytes."},
    // Power of 4 (growth factor 4, not 2) with bucket count 14
    // (4^14 bytes = 256MB)
    {monitoring::Buckets::Exponential(1, 4, 14)});
62 
// Counter with one label, "name". RecordUnusedOutput() adds one to the cell
// selected by the op name it is given.
auto* graph_unused_outputs = monitoring::Counter<1>::New(
    "/tensorflow/core/graph_unused_outputs",
    "The number of unused outputs for ops of a given type.", "name");
66 
// Counter with one label, "name". RecordTFDataAutotune() adds one per call.
auto* tf_data_autotune_counter = monitoring::Counter<1>::New(
    "/tensorflow/data/autotune", "tf.data autotuning", "name");
69 
// Counter with one label, "name". This file only hands out its cells through
// GetTFDataBytesConsumedCounter(); no increment is performed here.
auto* tf_data_bytes_consumed_counter = monitoring::Counter<1>::New(
    "/tensorflow/data/bytes_consumed",
    "The number of bytes consumed by a tf.data Dataset.", "name");
73 
// Counter with one label, "name". This file only hands out its cells through
// GetTFDataBytesProducedCounter(); no increment is performed here.
auto* tf_data_bytes_produced_counter = monitoring::Counter<1>::New(
    "/tensorflow/data/bytes_produced",
    "The number of bytes produced by a tf.data Dataset.", "name");
77 
// Counter with one label, "name". This file only hands out its cells through
// GetTFDataBytesReadCounter(); no increment is performed here.
auto* tf_data_bytes_read_counter = monitoring::Counter<1>::New(
    "/tensorflow/data/bytes_read",
    "The number of bytes read by tf.data Dataset sources.", "name");
81 
// Unlabeled counter. RecordTFDataBytesFetched() adds the byte count it is
// given.
auto* tf_data_bytes_fetched_counter = monitoring::Counter<0>::New(
    "/tensorflow/data/bytes_fetched",
    "The number of bytes fetched from tf.data Dataset iterator.");
85 
// Counter with one label, "name". This file only hands out its cells through
// GetTFDataElementsCounter(); no increment is performed here.
auto* tf_data_elements_counter = monitoring::Counter<1>::New(
    "/tensorflow/data/elements", "tf.data elements", "name");
88 
// Counter with one label, "name". RecordTFDataExperiment() adds one per call.
auto* tf_data_experiment_counter = monitoring::Counter<1>::New(
    "/tensorflow/data/experiment",
    "The number of times tf.data experiment is applied to input pipelines.",
    "name");
93 
// Counter with one label, "name". RecordTFDataFingerprint() adds one per call.
auto* tf_data_fingerprint_counter = monitoring::Counter<1>::New(
    "/tensorflow/data/fingerprint", "tf.data fingerprint", "name");
96 
// Unlabeled sampler fed by RecordTFDataGetNextDuration() with each reported
// duration (microseconds).
auto* tf_data_get_next_duration_usecs_histogram = monitoring::Sampler<0>::New(
    {"/tensorflow/data/getnext_duration",
     "Microseconds spent fetching an element from tf.data iterator."},
    // Power of 2 with bucket count 10 (1024 microseconds) and 1 second.
    {monitoring::Buckets::Explicit(
        {2., 4., 8., 16., 32., 64., 128., 256., 512., 1024., 1e6})});
103 
// Unlabeled counter. RecordTFDataIteratorBusy() adds the duration
// (microseconds) it is given.
auto* tf_data_iterator_busy_counter =
    monitoring::Counter<0>::New("/tensorflow/data/iterator_busy",
                                "The time (in microseconds) during which a "
                                "tf.data iterator was busy processing at "
                                "least one `GetNext()` request.");
109 
// Unlabeled counter. RecordTFDataIteratorLifetime() adds the duration
// (microseconds) it is given.
auto* tf_data_iterator_lifetime_counter = monitoring::Counter<0>::New(
    "/tensorflow/data/iterator_lifetime",
    "The time (in microseconds) between a tf.data iterator receiving the first "
    "`GetNext()` request and responding to the last `GetNext()` request.");
114 
// Counter with one label, "name". RecordTFDataOptimization() adds the number
// of changes it is given to the cell selected by the name.
auto* tf_data_optimization_counter = monitoring::Counter<1>::New(
    "/tensorflow/data/optimization", "tf.data optimization", "name");
117 
// Counter with two labels, "name" and "filename". RecordTFDataFilename() adds
// one to the cell selected by the (name, filename) pair it is given.
auto* tf_data_filename_counter = monitoring::Counter<2>::New(
    "/tensorflow/data/filename", "The file name read by a tf.data Dataset.",
    "name", "filename");
121 
// Unlabeled counter. RecordParseDenseFeature() adds the feature count it is
// given.
auto* parse_dense_feature_counter = monitoring::Counter<0>::New(
    "/tensorflow/data/dense_feature",
    "The number of dense features parsed by ops for parsing tf.Example.");
125 
// Unlabeled counter. RecordParseSparseFeature() adds the feature count it is
// given.
auto* parse_sparse_feature_counter = monitoring::Counter<0>::New(
    "/tensorflow/data/sparse_feature",
    "The number of sparse features parsed by ops for parsing tf.Example.");
129 
// Unlabeled counter. RecordParseRaggedFeature() adds the feature count it is
// given.
auto* parse_ragged_feature_counter = monitoring::Counter<0>::New(
    "/tensorflow/data/ragged_feature",
    "The number of ragged features parsed by ops for parsing tf.Example.");
133 
// Unlabeled counter. UpdateGraphBuildTime() adds one to it for every call that
// reports a nonzero time.
auto* build_graph_calls = monitoring::Counter<0>::New(
    "/tensorflow/core/graph_build_calls",
    "The number of times TensorFlow has created a new client graph. "
    "A client graph is a sub-graph of the full graph, induced by a set of "
    "options, including the requested feeds and fetches. It includes time "
    "spent optimizing the graph with Grappler, and time spent pruning the "
    "sub-graph.");
141 
// Unlabeled counter that accumulates the nonzero times (microseconds) passed
// to UpdateGraphBuildTime().
auto* build_graph_time_usecs = monitoring::Counter<0>::New(
    "/tensorflow/core/graph_build_time_usecs",
    "The amount of time TensorFlow has spent creating new client graphs in "
    "microseconds. "
    "A client graph is a sub-graph of the full graph, induced by a set of "
    "options, including the requested feeds and fetches. It includes time "
    "spent optimizing the graph with Grappler, and time spent pruning the "
    "sub-graph.");
150 
// Unlabeled counter. UpdateXlaCompilationTime() adds one to it for every call
// that reports a nonzero compilation time.
auto* xla_compilations = monitoring::Counter<0>::New(
    "/tensorflow/core/xla_compilations",
    "The number of XLA compilations used to collect "
    "/tensorflow/core/xla_compilation_time_usecs");
155 
// Unlabeled counter that accumulates the nonzero compilation times
// (microseconds) passed to UpdateXlaCompilationTime().
auto* xla_compilation_time_usecs = monitoring::Counter<0>::New(
    "/tensorflow/core/xla_compilation_time_usecs",
    "The total time spent on compiling XLA graphs in microseconds.");
159 
// Unlabeled counter. IncrementMLIRImportFailureCount() adds one per call.
auto* mlir_import_failure_count = monitoring::Counter<0>::New(
    "/tensorflow/mlir/import_failure_count",
    "The number of jobs that failed during mlir import or verification.");
163 
164 auto* bfc_allocator_delay =
165     monitoring::Counter<0>::New("/tensorflow/core/bfc_allocator_delay",
166                                 "The total time spent running each graph "
167                                 "optimization pass in microseconds.");
168 
169 }  // namespace
170 
RecordTFDataAutotune(const string & name)171 void RecordTFDataAutotune(const string& name) {
172   tf_data_autotune_counter->GetCell(name)->IncrementBy(1);
173 }
174 
GetTFDataBytesConsumedCounter(const string & name)175 monitoring::CounterCell* GetTFDataBytesConsumedCounter(const string& name) {
176   return tf_data_bytes_consumed_counter->GetCell(name);
177 }
178 
GetTFDataBytesProducedCounter(const string & name)179 monitoring::CounterCell* GetTFDataBytesProducedCounter(const string& name) {
180   return tf_data_bytes_produced_counter->GetCell(name);
181 }
182 
GetTFDataBytesReadCounter(const string & name)183 monitoring::CounterCell* GetTFDataBytesReadCounter(const string& name) {
184   return tf_data_bytes_read_counter->GetCell(name);
185 }
186 
GetTFDataElementsCounter(const string & name)187 monitoring::CounterCell* GetTFDataElementsCounter(const string& name) {
188   return tf_data_elements_counter->GetCell(name);
189 }
190 
RecordTFDataBytesFetched(int64 num_bytes)191 void RecordTFDataBytesFetched(int64 num_bytes) {
192   tf_data_bytes_fetched_counter->GetCell()->IncrementBy(num_bytes);
193 }
194 
RecordTFDataExperiment(const string & name)195 void RecordTFDataExperiment(const string& name) {
196   tf_data_experiment_counter->GetCell(name)->IncrementBy(1);
197 }
198 
RecordTFDataFingerprint(const string & name)199 void RecordTFDataFingerprint(const string& name) {
200   tf_data_fingerprint_counter->GetCell(name)->IncrementBy(1);
201 }
202 
RecordTFDataGetNextDuration(uint64 duration_us)203 void RecordTFDataGetNextDuration(uint64 duration_us) {
204   static auto* tf_data_get_next_duration_cell =
205       tf_data_get_next_duration_usecs_histogram->GetCell();
206   tf_data_get_next_duration_cell->Add(duration_us);
207 }
208 
RecordTFDataIteratorBusy(uint64 duration_us)209 void RecordTFDataIteratorBusy(uint64 duration_us) {
210   static auto* tf_data_iterator_busy_cell =
211       tf_data_iterator_busy_counter->GetCell();
212   tf_data_iterator_busy_cell->IncrementBy(duration_us);
213 }
214 
RecordTFDataIteratorLifetime(uint64 duration_us)215 void RecordTFDataIteratorLifetime(uint64 duration_us) {
216   static auto* tf_data_iterator_lifetime_cell =
217       tf_data_iterator_lifetime_counter->GetCell();
218   tf_data_iterator_lifetime_cell->IncrementBy(duration_us);
219 }
220 
RecordTFDataOptimization(const string & name,int64 num_changes)221 void RecordTFDataOptimization(const string& name, int64 num_changes) {
222   tf_data_optimization_counter->GetCell(name)->IncrementBy(num_changes);
223 }
224 
RecordTFDataFilename(const string & name,const string & filename)225 void RecordTFDataFilename(const string& name, const string& filename) {
226   tf_data_filename_counter->GetCell(name, filename)->IncrementBy(1);
227 }
228 
RecordParseDenseFeature(int64 num_features)229 void RecordParseDenseFeature(int64 num_features) {
230   static auto* parse_dense_feature_counter_cell =
231       parse_dense_feature_counter->GetCell();
232   parse_dense_feature_counter_cell->IncrementBy(num_features);
233 }
234 
RecordParseSparseFeature(int64 num_features)235 void RecordParseSparseFeature(int64 num_features) {
236   static auto* parse_sparse_feature_counter_cell =
237       parse_sparse_feature_counter->GetCell();
238   parse_sparse_feature_counter_cell->IncrementBy(num_features);
239 }
240 
RecordParseRaggedFeature(int64 num_features)241 void RecordParseRaggedFeature(int64 num_features) {
242   static auto* parse_ragged_feature_counter_cell =
243       parse_ragged_feature_counter->GetCell();
244   parse_ragged_feature_counter_cell->IncrementBy(num_features);
245 }
246 
RecordGraphInputTensors(const size_t size)247 void RecordGraphInputTensors(const size_t size) {
248   static auto* graph_run_input_tensor_bytes_cell =
249       graph_run_input_tensor_bytes->GetCell();
250   graph_run_input_tensor_bytes_cell->Add(size);
251 }
252 
RecordGraphOutputTensors(const size_t size)253 void RecordGraphOutputTensors(const size_t size) {
254   static auto* graph_run_output_tensor_bytes_cell =
255       graph_run_output_tensor_bytes->GetCell();
256   graph_run_output_tensor_bytes_cell->Add(size);
257 }
258 
UpdateGraphExecTime(const uint64 running_time_usecs)259 void UpdateGraphExecTime(const uint64 running_time_usecs) {
260   if (running_time_usecs > 0) {
261     static auto* graph_runs_cell = graph_runs->GetCell();
262     static auto* graph_run_time_usecs_cell = graph_run_time_usecs->GetCell();
263     static auto* graph_run_time_usecs_histogram_cell =
264         graph_run_time_usecs_histogram->GetCell();
265     graph_runs_cell->IncrementBy(1);
266     graph_run_time_usecs_cell->IncrementBy(running_time_usecs);
267     graph_run_time_usecs_histogram_cell->Add(running_time_usecs);
268   }
269 }
270 
UpdateGraphPendingQueueLength(uint64 len)271 void UpdateGraphPendingQueueLength(uint64 len) {
272   static auto* graph_pending_queue_length_cell =
273       graph_pending_queue_length_histogram->GetCell();
274   graph_pending_queue_length_cell->Add(len);
275 }
276 
UpdateGraphOptimizationPassTime(const string & pass_name,const uint64 running_time_usecs)277 void UpdateGraphOptimizationPassTime(const string& pass_name,
278                                      const uint64 running_time_usecs) {
279   if (running_time_usecs > 0) {
280     graph_optimization_usecs->GetCell("GraphOptimizationPass", pass_name)
281         ->IncrementBy(running_time_usecs);
282   }
283 }
284 
UpdateGrapplerPassTime(const string & pass_name,const uint64 running_time_usecs)285 void UpdateGrapplerPassTime(const string& pass_name,
286                             const uint64 running_time_usecs) {
287   if (running_time_usecs > 0) {
288     graph_optimization_usecs->GetCell("Grappler", pass_name)
289         ->IncrementBy(running_time_usecs);
290   }
291 }
292 
UpdateGraphBuildTime(const uint64 running_time_usecs)293 void UpdateGraphBuildTime(const uint64 running_time_usecs) {
294   if (running_time_usecs > 0) {
295     static auto* build_graph_calls_cell = build_graph_calls->GetCell();
296     static auto* build_graph_time_usecs_cell =
297         build_graph_time_usecs->GetCell();
298     build_graph_calls_cell->IncrementBy(1);
299     build_graph_time_usecs_cell->IncrementBy(running_time_usecs);
300   }
301 }
302 
UpdateXlaCompilationTime(const uint64 compilation_time_usecs)303 void UpdateXlaCompilationTime(const uint64 compilation_time_usecs) {
304   if (compilation_time_usecs > 0) {
305     static auto* xla_compilations_cell = xla_compilations->GetCell();
306     static auto* xla_compilation_time_usecs_cell =
307         xla_compilation_time_usecs->GetCell();
308     xla_compilations_cell->IncrementBy(1);
309     xla_compilation_time_usecs_cell->IncrementBy(compilation_time_usecs);
310   }
311 }
312 
UpdateBfcAllocatorDelayTime(const uint64 delay_usecs)313 void UpdateBfcAllocatorDelayTime(const uint64 delay_usecs) {
314   static auto* bfc_allocator_delay_cell = bfc_allocator_delay->GetCell();
315   if (delay_usecs > 0) {
316     bfc_allocator_delay_cell->IncrementBy(delay_usecs);
317   }
318 }
319 
IncrementMLIRImportFailureCount()320 void IncrementMLIRImportFailureCount() {
321   static auto* mlir_import_failure_count_cell =
322       mlir_import_failure_count->GetCell();
323   mlir_import_failure_count_cell->IncrementBy(1);
324 }
325 
RecordUnusedOutput(const string & op_name)326 void RecordUnusedOutput(const string& op_name) {
327   graph_unused_outputs->GetCell(op_name)->IncrementBy(1);
328 }
329 
330 }  // namespace metrics
331 }  // namespace tensorflow
332