// Result messages describing per-step profiling data: step timing, flow
// events, all-reduce ops and device memory transfers.
syntax = "proto3";

package tensorflow.profiler;

// Provides google.protobuf.Any, used by StepInfoResult.step_breakdown.
import "google/protobuf/any.proto";
// NOTE(review): expected to declare OpMetricsDb, which is referenced by
// PerCoreStepInfo.hlo_metrics_db -- confirm.
import "tensorflow/core/profiler/protobuf/op_metrics.proto";

// Breakdown of step time on generic hardware. The components are mutually
// exclusive, so adding them together yields the step time. When several event
// types occur within the same execution-time interval, that interval is
// attributed to exactly one of those event types.
message GenericStepBreakdown {
  // Maps an event type to the accumulated duration, in picoseconds, of that
  // type.
  // NOTE(review): the key is a bare int32; the enum defining the event types
  // is not declared in this file -- confirm which one applies.
  map<int32, uint64> type_ps = 1;
}

// Information about memory transfer to/from device memory.
message DeviceMemoryTransfer {
  // NOTE(review): presumably the number of transfers observed -- confirm.
  uint64 occurrence = 1;
  // NOTE(review): presumably the time spent on these transfers, in
  // microseconds (per the `_us` suffix) -- confirm whether this is a total or
  // an average.
  double time_us = 2;
  // NOTE(review): presumably the number of bytes moved by these transfers --
  // confirm.
  uint64 bytes_transferred = 3;
}

// Result proto for StepInfo.
// Next ID: 5
message StepInfoResult {
  // The step number.
  uint32 step_num = 1;
  // Duration of the step, in picoseconds.
  uint64 duration_ps = 2;
  // Start time of the step, in picoseconds.
  uint64 begin_ps = 3;
  // Breakdown of the step time. Can be unpacked into a GenericStepBreakdown.
  google.protobuf.Any step_breakdown = 4;
}

// Result proto for metrics on flow events.
message FlowEventInfo {
  // Unique id shared by each send/recv pair.
  uint64 flow_id = 1;
  // Channel id generated by the XLA compiler; it is statically unique within
  // an HloModule.
  int64 channel_id = 2;
  // Name of the HLO op.
  string name = 3;
  // Category of the HLO op.
  string category = 4;
  // Start time of the op event, in picoseconds.
  uint64 start_time_ps = 5;
  // End time of the op event, in picoseconds.
  uint64 end_time_ps = 6;
  // Size of the op, in bytes.
  uint64 byte_size = 7;
  // Replica id of the program running the flow event.
  uint32 replica_id = 8;
}

// Result database for core-to-core flow events.
message FlowDbResult {
  // The flow events held by this database, one FlowEventInfo per event.
  repeated FlowEventInfo flow_info = 1;
}

// Result proto for all-reduce ops.
message AllReduceInfo {
  // Unique id of the all-reduce op.
  uint64 id = 1;
  // Name of the HLO op.
  string name = 2;
  // All-reduce nodes from different modules that share the same all_reduce_id
  // are all-reduced together. If empty, AllReduce is not applied across
  // modules.
  uint64 all_reduce_id = 3;
  // Start time of the op event, in picoseconds.
  uint64 start_time_ps = 4;
  // End time of the op event, in picoseconds.
  uint64 end_time_ps = 5;
  // Size of the op, in bytes.
  uint64 byte_size = 6;
}

// Result database for all-reduce ops.
message AllReduceDbResult {
  // The all-reduce ops held by this database, one AllReduceInfo per op.
  repeated AllReduceInfo all_reduce_info = 1;
}

// Result proto for information about one step across all cores.
message PerCoreStepInfo {
  // The step number.
  uint32 step_num = 1;
  // Maps core_id to the StepInfo result for that core.
  map<uint32, StepInfoResult> step_info_per_core = 2;
  // The result for the per-step HLO-metric database.
  OpMetricsDb hlo_metrics_db = 3;
  // The result for send and recv flows.
  // NOTE(review): the key is presumably the core id, as in the sibling
  // *_per_core maps -- confirm.
  map<uint32, FlowDbResult> flow_db_per_core = 4;
  // Maps core id to program replica id. The replica id map could change
  // during a profile session, but should stay stable within a step.
  map<uint32, uint32> core_id_to_replica_id_map = 5;
  // Maps core_id to the all-reduce ops on that core.
  map<uint32, AllReduceDbResult> all_reduce_db_per_core = 6;
  // Information about device memory transfers, categorized by source and
  // destination. Entries are ordered by the following categories:
  // 1. HostToDevice
  // 2. DeviceToHost
  // 3. DeviceToDevice
  repeated DeviceMemoryTransfer device_memory_transfers = 7;
}

// Result proto for a StepDatabase.
message StepDatabaseResult {
  // A sequence of PerCoreStepInfo.
  repeated PerCoreStepInfo step_sequence = 1;
  // Whether the step db uses incomplete step information.
  // This flag is set to true when:
  //   1) no step marker or annotation is present;
  //   2) the profiling duration is too short to cover a full step.
  // If false, the profile is grouped and broken down by complete steps only,
  // and incomplete steps are ignored.
  // If true, the whole profile is aggregated and broken down as a single
  // step.
  bool use_incomplete_step = 2;
  // Number of steps dropped during post-processing.
  uint32 num_steps_dropped = 3;
}