syntax = "proto3";

package tensorflow.profiler;

import "tensorflow/core/profiler/protobuf/diagnostics.proto";
import "tensorflow/core/profiler/protobuf/kernel_stats.proto";
import "tensorflow/core/profiler/protobuf/op_metrics.proto";
import "tensorflow/core/profiler/protobuf/steps_db.proto";
import "tensorflow/core/profiler/protobuf/tf_function.proto";

// Performance environment, e.g., the peak performance capabilities of the
// device.
message PerfEnv {
  // Peak performance of a TPU core or a GPU in TFLOP/s.
  double peak_tera_flops_per_second = 1;
  // Peak memory bandwidth of a TPU core or a GPU in GiB/s.
  double peak_hbm_bw_giga_bytes_per_second = 2;
  // The ridge point of the roofline model in FLOP/Byte (i.e., the minimum
  // operational intensity required to achieve maximum performance).
  double ridge_point = 3;
}

// Result proto for host-independent job information.
message HostIndependentJobInfoResult {
  // The change-list number of this build.
  int64 change_list = 1;
  // The time of this build (nanoseconds since the Unix epoch).
  int64 build_time = 2;
  // The target of this build.
  string build_target = 3;
  // Profiling duration (in ms).
  uint32 profile_duration_ms = 4;
}

// Result proto for host-dependent job information.
message HostDependentJobInfoResult {
  // The ID of the host on which the job was run.
  string host_id = 1;
  // The command line used to run the job.
  string command_line = 2;
  // The start time of this run (nanoseconds since the Unix epoch).
  int64 start_time = 3;
  // BNS address specified by the client at the time of the profiling request.
  string bns_address = 4;
  // Profiling start walltime (in ns).
  uint64 profile_time_ns = 5;
}

// System topology, which describes the number of chips in a pod
// and the connectivity style.
message SystemTopology {
  // The X, Y, and Z dimensions of this topology. 0 means that dimension does
  // not exist.
  int64 x_dimension = 1;
  int64 y_dimension = 2;
  int64 z_dimension = 3;
  // The number of expected bad chips in this system.
  int64 num_expected_reduced_chips = 4;
}

// The run environment of a profiling session.
message RunEnvironment {
  // Number of hosts used.
  int32 host_count = 1;
  // Number of tasks used.
  int32 task_count = 2;
  // Distinct hostnames seen.
  map<string, bool> hostnames = 3;
  // The type of device used.
  string device_type = 4;
  // The number of device cores used.
  // In the TPU case, this corresponds to the number of TPU cores.
  // In the GPU case, this corresponds to the number of GPUs (not the number
  // of SMs).
  int32 device_core_count = 5;
  // The per-device-core batch size.
  int32 per_core_batch_size = 6;
  // Host-independent information about this job.
  HostIndependentJobInfoResult host_independent_job_info = 7;
  // Host-dependent information about this job.
  repeated HostDependentJobInfoResult host_dependent_job_info = 8;
  // The number of replicas; corresponds to input parallelism.
  // If there is no model parallelism, replica_count = device_core_count.
  int32 replica_count = 9;
  // The number of cores used for a single replica, e.g. model parallelism.
  // If there is no model parallelism, then num_cores_per_replica = 1.
  int32 num_cores_per_replica = 10;
  // The chip interconnection topology.
  SystemTopology topology = 11;
  // Host trace level.
  uint32 host_trace_level = 12;
}

// Details of a single core; each field notes the scope within which its
// value is unique.
// Next ID: 7
message CoreDetails {
  string hostname = 1;
  uint32 device_ordinal = 2;  // unique within host, TPU core only
  uint32 core_num = 3;        // unique within chip per core type
  uint32 local_chip_id = 4;   // unique within host
  uint32 global_chip_id = 5;  // unique within mesh
  uint32 global_core_id = 6;  // unique within mesh, TPU core only
}

// Next ID: 12
// Operator Statistics.
message OpStats {
  // Field number 7 is reserved and must not be assigned to a new field.
  reserved 7;

  // Host-side op metrics database, covering the entire profiling session
  // (incomplete steps included).
  OpMetricsDb host_op_metrics_db = 1;
  // Device-side op metrics database, covering the entire profiling session
  // (incomplete steps included).
  OpMetricsDb device_op_metrics_db = 2;
  // HLO-metric database restricted to complete steps.
  OpMetricsDb hlo_metrics_db_complete_steps_only = 10;
  // Performance environment in which the op metrics were collected.
  PerfEnv perf_env = 3;
  // Database of step sequences.
  StepDatabaseResult step_db = 4;
  // Run environment of this profiling session.
  RunEnvironment run_environment = 5;
  // Kernel stats results from all GPUs.
  KernelStatsDb kernel_stats_db = 6;
  // Statistics for all tf-functions.
  TfFunctionDb tf_function_db = 8;
  // Per-core details, keyed by core ID.
  map<uint32, CoreDetails> core_id_to_details = 11;
  // Error and warning messages for diagnosing profiling issues.
  Diagnostics diagnostics = 9;
}