// Schema for profiler operator statistics: performance environment, run
// environment, per-core details and the aggregated OpStats result.
syntax = "proto3";

package tensorflow.profiler;

import "tensorflow/core/profiler/protobuf/diagnostics.proto";
import "tensorflow/core/profiler/protobuf/kernel_stats.proto";
import "tensorflow/core/profiler/protobuf/op_metrics.proto";
import "tensorflow/core/profiler/protobuf/steps_db.proto";
import "tensorflow/core/profiler/protobuf/tf_function.proto";

// Performance environment, e.g., the peak performance capabilities of the
// device.
message PerfEnv {
  // Peak compute performance of a TPU core or a GPU, in TFLOP/s.
  double peak_tera_flops_per_second = 1;
  // Peak memory bandwidth of a TPU core or a GPU, documented as GiB/s.
  // NOTE(review): the field name says "giga_bytes" (10^9 bytes) while the
  // documented unit is GiB (2^30 bytes) -- confirm which one producers emit.
  double peak_hbm_bw_giga_bytes_per_second = 2;
  // The ridge point of the roofline model in FLOP/Byte, i.e., the minimum
  // operational intensity required to achieve maximum performance.
  double ridge_point = 3;
}

// Result proto for job information that does not depend on the host.
message HostIndependentJobInfoResult {
  // The change-list number of this build.
  int64 change_list = 1;
  // The time of this build, in nanoseconds since the Unix epoch.
  int64 build_time = 2;
  // The target of this build.
  string build_target = 3;
  // Profiling duration, in milliseconds.
  uint32 profile_duration_ms = 4;
}

// Result proto for job information that is specific to one host.
message HostDependentJobInfoResult {
  // The ID of the host on which the job was run.
  string host_id = 1;
  // The command line used to run the job.
  string command_line = 2;
  // The start time of this run, in nanoseconds since the Unix epoch.
  int64 start_time = 3;
  // BNS address specified by the client at the time of the profiling request.
  string bns_address = 4;
  // Wall time at which profiling started, in nanoseconds.
  uint64 profile_time_ns = 5;
}

// System topology, which describes the number of chips in a pod and the
// connectivity style.
message SystemTopology {
  // Size of the topology along the X dimension. 0 means that the dimension
  // does not exist.
  int64 x_dimension = 1;
  // Size of the topology along the Y dimension. 0 means that the dimension
  // does not exist.
  int64 y_dimension = 2;
  // Size of the topology along the Z dimension. 0 means that the dimension
  // does not exist.
  int64 z_dimension = 3;
  // The number of expected bad chips in this system.
  int64 num_expected_reduced_chips = 4;
}

// The run environment of a profiling session.
message RunEnvironment {
  // Number of hosts used.
  int32 host_count = 1;
  // Number of tasks used.
  int32 task_count = 2;
  // Distinct hostnames seen, stored as the keys of the map.
  // NOTE(review): the meaning of the bool value is not documented here --
  // confirm against the code that populates this field.
  map<string, bool> hostnames = 3;
  // The type of device used.
  string device_type = 4;
  // The number of device cores used.
  //   For TPUs, this is the number of TPU cores.
  //   For GPUs, this is the number of GPUs (not the number of SMs).
  int32 device_core_count = 5;
  // The per-device-core batch size.
  int32 per_core_batch_size = 6;
  // Host-independent information about this job.
  HostIndependentJobInfoResult host_independent_job_info = 7;
  // Host-dependent information about this job, one entry per result.
  repeated HostDependentJobInfoResult host_dependent_job_info = 8;
  // The number of replicas; corresponds to input parallelism.
  // If there is no model parallelism, replica_count = device_core_count.
  int32 replica_count = 9;
  // The number of cores used for a single replica, e.g., for model
  // parallelism. If there is no model parallelism,
  // num_cores_per_replica = 1.
  int32 num_cores_per_replica = 10;
  // The chip interconnection topology.
  SystemTopology topology = 11;
  // Host trace level.
  uint32 host_trace_level = 12;
}

// Identifying details of a single core (host, chip and core numbering).
// Next ID: 7
message CoreDetails {
  // Name of the host.
  string hostname = 1;
  // Device ordinal; unique within host, TPU core only.
  uint32 device_ordinal = 2;
  // Core number; unique within chip per core type.
  uint32 core_num = 3;
  // Local chip ID; unique within host.
  uint32 local_chip_id = 4;
  // Global chip ID; unique within mesh.
  uint32 global_chip_id = 5;
  // Global core ID; unique within mesh, TPU core only.
  uint32 global_core_id = 6;
}

// Operator statistics.
// Next ID: 12
message OpStats {
  // The database for the op metrics collected from the host over the entire
  // profiling session, including incomplete steps.
  OpMetricsDb host_op_metrics_db = 1;
  // The database for the op metrics collected from the device over the entire
  // profiling session, including incomplete steps.
  OpMetricsDb device_op_metrics_db = 2;
  // The result for the HLO-metric database over the complete steps only.
  OpMetricsDb hlo_metrics_db_complete_steps_only = 10;
  // Performance environment of the op metrics collected.
  PerfEnv perf_env = 3;
  // The database of step sequences.
  StepDatabaseResult step_db = 4;
  // The run environment of this profiling session.
  RunEnvironment run_environment = 5;
  // Kernel stats results from all GPUs.
  KernelStatsDb kernel_stats_db = 6;
  // Statistics for all tf-functions.
  TfFunctionDb tf_function_db = 8;
  // A map from core ID to the details of that core.
  map<uint32, CoreDetails> core_id_to_details = 11;
  // Error and warning messages for diagnosing profiling issues.
  Diagnostics diagnostics = 9;
  // Field number 7 is reserved and must not be assigned to a new field.
  reserved 7;
}
