// This proto describes the format of the output profile file from
// the Pod Viewer tool.
syntax = "proto3";

package tensorflow.profiler;

import "tensorflow/core/profiler/protobuf/diagnostics.proto";
import "tensorflow/core/profiler/protobuf/pod_stats.proto";

// Describes one replica group in a cross replica op (e.g., all-reduce and
// all-to-all).
message ReplicaGroup {
  // The ids of the replicas that belong to the same group. The ordering of the
  // ids matters in some ops (e.g., all-to-all).
  repeated int64 replica_ids = 1;
}

// Information about a single all-reduce op. Entries of this type make up
// PodStatsMap.all_reduce_op_db.
message AllReduceOpInfo {
  // Name of this op.
  string name = 1;
  // Number of times this op occurred.
  uint32 occurrences = 2;
  // The time in microseconds spent in this op (averaged across all of its
  // occurrences).
  double duration_us = 3;
  // Size in bytes of the data transferred.
  uint64 data_size = 4;
  // Replica groups of this op.
  repeated ReplicaGroup replica_groups = 5;
  // Description (e.g. XLA expression).
  string description = 6;
}

// Result proto for the information collected in one step across all cores.
message PodStatsMap {
  // The (micro) step number.
  uint32 step_num = 1;
  // A map from core id to the PodStatsRecord of that core.
  map<uint32, PodStatsRecord> pod_stats_per_core = 2;
  // A database of channel info.
  repeated ChannelInfo channel_db = 3;
  // A map from core id to program replica id. The replica id map could change
  // during a profile session, but should stay stable within a step.
  map<uint32, uint32> core_id_to_replica_id_map = 4;
  // A database of all-reduce ops.
  repeated AllReduceOpInfo all_reduce_op_db = 5;
}

// A sequence of PodStatsMap, one for each step.
message PodStatsSequence {
  // The per-step pod stats; each entry carries its own step_num.
  repeated PodStatsMap pod_stats_map = 1;
}

// Next ID: 14
// Information about a send and recv channel.
message ChannelInfo {
  // Id of the channel.
  int64 channel_id = 1;
  // Core ids of send ops.
  repeated uint32 src_core_ids = 11;
  // Core ids of recv ops.
  repeated uint32 dst_core_ids = 12;
  // Size in bytes of the data transferred.
  uint64 data_size = 4;
  // Duration from the beginning of send to the end of recv-done in
  // microseconds.
  double duration_us = 5;
  // Number of occurrences of a channel.
  uint32 occurrences = 6;
  // Percentage of the link bandwidth utilized over the peak link bandwidth.
  double utilization = 7;
  // A list of hlo names associated with this channel id.
  repeated string hlo_names = 8;
  // Duration from the beginning of the recv-done to the beginning of send in
  // microseconds. If the recv-done op starts after the beginning of the send
  // op, the delay is zero.
  double send_delay_us = 9;
  // Description (e.g. XLA expression).
  string description = 13;

  // Field numbers that must not be reused (presumably left over from removed
  // fields).
  reserved 2, 3, 10;
}

// Top level summary of pod viewer; stored in PodViewerDatabase.summary.
message PodViewerSummary {
  // Warning messages.
  repeated string warnings = 1;
}

// Next ID: 8
// Topology graph draws all the cores in the system in a 2-D rectangle or
// 3-D cube. It is hierarchically grouped by host, chip and core.
message PodViewerTopology {
  // Number of cores in the x dimension of the rectangle/cube.
  int32 x_dimension = 1;
  // Number of cores in the y dimension of the rectangle/cube.
  int32 y_dimension = 2;
  // Number of cores in the z dimension of the cube.
  int32 z_dimension = 3;
  // Number of cores in the x dimension of each host.
  int32 host_x_stride = 4;
  // Number of cores in the y dimension of each host.
  int32 host_y_stride = 5;
  // Number of cores in the z dimension of each host.
  int32 host_z_stride = 6;
  // Number of cores per chip.
  int32 num_cores_per_chip = 7;
}

// Next ID: 12
// A database of pod viewer records.
message PodViewerDatabase {
  // The type of device used.
  string device_type = 10;
  // Pod level stats for each step.
  PodStatsSequence pod_stats_sequence = 3;
  // Top level summary of pod viewer.
  PodViewerSummary summary = 7;
  // Error and warning messages for diagnosing profiling issues.
  Diagnostics diagnostics = 8;
  // Mappings from event type number to event name string for step breakdown.
  // NOTE(review): StepBreakdownEvents is declared outside this file
  // (presumably in pod_stats.proto) -- confirm its exact fields there.
  repeated StepBreakdownEvents step_breakdown_events = 9;
  // Info to draw the topology graph.
  PodViewerTopology topology = 11;

  // Field numbers that must not be reused (presumably left over from removed
  // fields).
  reserved 1, 2, 4, 5, 6;
}
