1 /* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
2 
3 Licensed under the Apache License, Version 2.0 (the "License");
4 you may not use this file except in compliance with the License.
5 You may obtain a copy of the License at
6 
7     http://www.apache.org/licenses/LICENSE-2.0
8 
9 Unless required by applicable law or agreed to in writing, software
10 distributed under the License is distributed on an "AS IS" BASIS,
11 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 See the License for the specific language governing permissions and
13 limitations under the License.
14 ==============================================================================*/
15 
16 #ifndef TENSORFLOW_CORE_UTIL_DEBUG_EVENTS_WRITER_H_
17 #define TENSORFLOW_CORE_UTIL_DEBUG_EVENTS_WRITER_H_
18 
#include <atomic>
#include <deque>
#include <memory>
#include <unordered_map>

#include "absl/container/flat_hash_map.h"
#include "tensorflow/core/framework/tensor.h"
#include "tensorflow/core/lib/core/status.h"
#include "tensorflow/core/lib/io/record_writer.h"
#include "tensorflow/core/platform/env.h"
#include "tensorflow/core/platform/macros.h"
#include "tensorflow/core/platform/types.h"
#include "tensorflow/core/protobuf/debug_event.pb.h"
29 
30 namespace tensorflow {
31 namespace tfdbg {
32 
// Identifies each kind of file in the set generated by a debugged TensorFlow
// program. The enumerators are numbered consecutively from zero, in
// declaration order.
enum DebugEventFileType {
  METADATA = 0,
  SOURCE_FILES = 1,
  STACK_FRAMES = 2,
  GRAPHS = 3,
  EXECUTION = 4,
  GRAPH_EXECUTION_TRACES = 5,
};
42 
43 // Helper class for DebugEventsWriter.
44 // This class manages the writing of data to a single TFRecord file.
45 // Each object of the DebugEventsWriter class below involves multiple
46 // TFRecord files, and hence utilizes multiple objects of this helper class.
47 class SingleDebugEventFileWriter {
48  public:
49   explicit SingleDebugEventFileWriter(const string& file_path);
50 
51   Status Init();
52 
53   void WriteSerializedDebugEvent(tensorflow::StringPiece debug_event_str);
54 
55   Status Flush();
56   Status Close();
57 
58   const string FileName();
59 
60  private:
61   Env* env_;
62   const string file_path_;
63   std::atomic_int_fast32_t num_outstanding_events_;
64 
65   std::unique_ptr<WritableFile> writable_file_;
66   std::unique_ptr<io::RecordWriter> record_writer_ TF_PT_GUARDED_BY(writer_mu_);
67   mutex writer_mu_;
68 };
69 
70 // The DebugEvents writer class.
71 class DebugEventsWriter {
72  public:
73 #ifndef SWIG
74   // Prefix of version string present in the first entry of every event file.
75   // Default size of each circular buffer (unit: number of DebugEvent protos).
76   static constexpr const int64 kDefaultCyclicBufferSize = 1000;
77 
78   static constexpr const char* kFileNamePrefix = "tfdbg_events";
79   static constexpr const char* kMetadataSuffix = "metadata";
80   static constexpr const char* kSourceFilesSuffix = "source_files";
81   static constexpr const char* kStackFramesSuffix = "stack_frames";
82   static constexpr const char* kGraphsSuffix = "graphs";
83   static constexpr const char* kExecutionSuffix = "execution";
84   static constexpr const char* kGraphExecutionTracesSuffix =
85       "graph_execution_traces";
86 
87   static constexpr const char* kVersionPrefix = "debug.Event:";
88   static constexpr const int kCurrentFormatVersion = 1;
89 #endif
90 
91   // Get the DebugEventsWriter for the given dump_root.
92   // For a given dump_root value, it is a singleton. tfdbg event files come in
93   // sets of six. The singleton pattern avoids storing multiple sets in a single
94   // folder, which might cause confusion.
95   //
96   // If an instance of DebugEventsWriter has already been created at a
97   // `dump_root`, calling this method with the same `dump_root` will return
98   // the existing instance.
99   //
100   // Args:
101   //   dump_root: Dump root directory. If it doesn't exist, will be created.
102   //   tfdbg_run_id: Debugging run ID of the writer.
103   //   circular_buffer_size: Circular buffer size (in number of DebugEvent
104   //     protos). If set to a value <=0, will abolish the circular-buffer
105   //     behavior.
106   // Returns:
107   //   A pointer to a DebugEventsWriter object: a per-dump_root singleton.
108   static DebugEventsWriter* GetDebugEventsWriter(const string& dump_root,
109                                                  const string& tfdbg_run_id,
110                                                  int64 circular_buffer_size);
111   // Look up existing events writer by dump_root.
112   // If no DebugEventsWriter has been created at the dump_root, a non-OK
113   // Status will be returned. Else an OK status will be returned, with
114   // the pointer to the existing instance provided by reference.
115   static Status LookUpDebugEventsWriter(
116       const string& dump_root, DebugEventsWriter** debug_events_writer);
117   ~DebugEventsWriter();
118 
119   // Sets the debug event filenames and opens file for writing.
120   // All files (see the DebugEventFileType enum) share the same prefix and
121   // differ only in their suffixes. If not called by user, will be invoked
122   // automatically by a call to FileName() or any of the Write*() methods().
123   // Idempotent: if the metadata file exists and is open, this is a no-op.
124   // If on the other hand the file was opened, but has since disappeared (e.g.
125   // deleted by another process), this will open a new file.
126   Status Init();
127 
128   // The four DebugEvent fields below are written _without_ the circular
129   // buffer. Source file contents are written to the *.source_files file.
130   // Takes ownership of source_file.
131   Status WriteSourceFile(SourceFile* source_file);
132   // Stack frames are written to the *.code_locations file.
133   // Takes ownership of stack_frame_with_id.
134   Status WriteStackFrameWithId(StackFrameWithId* stack_frame_with_id);
135   // Graph op creation events are written to the *.graphs file.
136   // Takes ownership of graph_op_creation.
137   Status WriteGraphOpCreation(GraphOpCreation* graph_op_creation);
138   // Debugged graphs are written to the *.graphs file.
139   // Takes ownership of debugged_graph.
140   Status WriteDebuggedGraph(DebuggedGraph* debugged_graph);
141 
142   // The two DebugEvent fields below are written to the circular buffer
143   // and saved to disk only at the FlushExecutionFiles() call.
144   // Execution events (eager execution of an op or a tf.function) are written
145   // to the *.execution file. Takes ownership of execution.
146   Status WriteExecution(Execution* execution);
147   // Graph execution traces (graph-internal tensor values or their summaries)
148   // are written to the *.graph_execution_traces file.
149   // Takes ownership of graph_execution_trace.
150   Status WriteGraphExecutionTrace(GraphExecutionTrace* graph_execution_trace);
151 
152   // Write a graph execution trace without using a protocol buffer.
153   // Instead, pass the raw values related to the graph execution trace.
154   // Args:
155   //   tfdbg_context_id: A unique ID for the context of interest, e.g., a
156   //   concreted compiled tf.function that the op of interest belongs to.
157   //   op_name: Name of the op that this graph execution trace is concerned
158   //     with. Applicable only to the single-tensor trace case. For cases in
159   //     which the trace concerns multiple tensors, this is an empty string.
160   //   output_slot: Output slot index of the op that this trace is concerned
161   //     with.
162   //   tensor_debug_mode: An integer that represents the tensor-debug mode
163   //   enum. tensor_value: The value of the tensor that describes the
164   //   tensor(s)
165   //     that this trace is concerned with. The semantics of this tensor value
166   //     depends on the value of `tensor_debug_mode`.
167   Status WriteGraphExecutionTrace(const string& tfdbg_context_id,
168                                   const string& device_name,
169                                   const string& op_name, int32 output_slot,
170                                   int32 tensor_debug_mode,
171                                   const Tensor& tensor_value);
172 
173   // Writes a serialized DebugEvent to one of the debug-events files
174   // concerned with the non-execution events: the SOURCE_FILES, STACK_FRAMES
175   // and GRAPHS files.
176   // NOTE: Actually used in the Python binding, to avoid overhead of
177   // serializing and parsing protos at the language interface.
178   void WriteSerializedNonExecutionDebugEvent(const string& debug_event_str,
179                                              DebugEventFileType type);
180 
181   // Writes a serialized DebugEvent to one of the debug-events files
182   // concerned with the execution-related events: the EXECUTION and
183   // GRAPH_EXECUTION_TRACES files. This involves the cyclic-buffer behavior if
184   // circular_buffer_size is configured to be >0.
185   // NOTE: Actually used in the Python binding, to avoid overhead of
186   // serializing and parsing protos at the language interface.
187   void WriteSerializedExecutionDebugEvent(const string& debug_event_str,
188                                           DebugEventFileType type);
189 
190   // Given name of the device, retrieve a unique integer ID. As a side effect,
191   // if this is the first time this object encounters the device name,
192   // writes a DebuggedDevice proto to the .graphs file in the file set.
193   int RegisterDeviceAndGetId(const string& device_name);
194 
195   // EventWriter automatically flushes and closes on destruction, but
196   // this method is provided for users who want to write to disk sooner
197   // and/or check for success.
198   // FlushNonExecutionFiles() pushes outstanding DebugEvents not written
199   // events to the circular buffer to their respective files.
200   Status FlushNonExecutionFiles();
201 
202   // Writes current contents of the circular buffers to their respective
203   // debug event files and clears the circular buffers.
204   Status FlushExecutionFiles();
205 
206   // Close() calls FlushNonExecutionFiles() and FlushExecutionFiles()
207   // and then closes the current debug events files.
208   Status Close();
209 
210  private:
211   static std::unordered_map<string, std::unique_ptr<DebugEventsWriter>>*
212 
213   // Get a static map from dump-root path to DebugEventsWriter objects.
214   // This helps the per-dump-root singletone pattern.
215   GetDebugEventsWriterMap();
216 
217   // Guards calls to the GetDebugEventsWriter() method.
218   static mutex factory_mu_;
219 
220   DebugEventsWriter(const string& dump_root, const string& tfdbg_run_id,
221                     int64 circular_buffer_size);
222 
223   // Get the path prefix. The same for all files, which differ only in the
224   // suffix.
225   string FileName(DebugEventFileType type);
226 
227   // Initialize the TFRecord writer for non-metadata file type.
228   Status InitNonMetadataFile(DebugEventFileType type);
229 
230   Status SerializeAndWriteDebugEvent(DebugEvent* debug_event,
231                                      DebugEventFileType type);
232 
233   void SelectWriter(DebugEventFileType type,
234                     std::unique_ptr<SingleDebugEventFileWriter>** writer);
235   const string GetSuffix(DebugEventFileType type);
236   string GetFileNameInternal(DebugEventFileType type);
237 
238   Env* env_;
239   const string dump_root_;
240   const string tfdbg_run_id_;
241 
242   string file_prefix_;
243   bool is_initialized_ TF_GUARDED_BY(initialization_mu_);
244   mutex initialization_mu_;
245 
246   const int64 circular_buffer_size_;
247   std::deque<string> execution_buffer_ TF_GUARDED_BY(execution_buffer_mu_);
248   mutex execution_buffer_mu_;
249   std::deque<string> graph_execution_trace_buffer_
250       TF_GUARDED_BY(graph_execution_trace_buffer_mu_);
251   mutex graph_execution_trace_buffer_mu_;
252 
253   absl::flat_hash_map<string, int> device_name_to_id_ TF_GUARDED_BY(device_mu_);
254   mutex device_mu_;
255 
256   std::unique_ptr<SingleDebugEventFileWriter> metadata_writer_;
257   std::unique_ptr<SingleDebugEventFileWriter> source_files_writer_;
258   std::unique_ptr<SingleDebugEventFileWriter> stack_frames_writer_;
259   std::unique_ptr<SingleDebugEventFileWriter> graphs_writer_;
260   std::unique_ptr<SingleDebugEventFileWriter> execution_writer_;
261   std::unique_ptr<SingleDebugEventFileWriter> graph_execution_traces_writer_;
262 
263   TF_DISALLOW_COPY_AND_ASSIGN(DebugEventsWriter);
264 
265   friend class DebugEventsWriterTest;
266 };
267 
268 }  // namespace tfdbg
269 }  // namespace tensorflow
270 
271 #endif  // TENSORFLOW_CORE_UTIL_DEBUG_EVENTS_WRITER_H_
272