syntax = "proto3";

package tensorflow.profiler;

import "google/protobuf/any.proto";
import "tensorflow/core/profiler/protobuf/diagnostics.proto";
import "tensorflow/core/profiler/protobuf/input_pipeline.proto";

// NOTE: Field numbers identify fields on the wire. Several messages below
// declare fields out of numeric order; never renumber them to "tidy up".

// Overview result for a single TensorFlow Op.
message OverviewTfOp {
  // Name of the Op.
  string name = 1;
  // Category of the Op.
  string category = 2;
  // The amount of time that this Op takes by itself, as a fraction of the
  // total execution time on the device or host.
  double self_time_fraction = 3;
  // The cumulative time up to this Op, as a fraction of the total execution
  // time.
  double cumulative_time_fraction = 4;
  // How many GFlops/sec this Op achieves.
  double flop_rate = 5;
  // Whether the Op is eligible to use TensorCores.
  bool is_op_tensorcore_eligible = 6;
  // Whether at least one of the kernels launched in this Op is using
  // TensorCore.
  bool is_op_using_tensorcore = 7;
}

// Overview result for general analysis.
message OverviewPageAnalysis {
  // MXU utilization, in percent.
  double mxu_utilization_percent = 1;
  // Percentage of the device time that is idle.
  double device_idle_time_percent = 2;
  // Percentage of the host time that is idle.
  double host_idle_time_percent = 3;
  // Top TF Ops executed on the device.
  repeated OverviewTfOp top_device_ops = 4;
  // Remark text in the performance summary section.
  string remark_text = 5;
  // Color of the remark text.
  string remark_color = 6;
  // FLOP rate utilization relative to the roofline, in percent.
  double flop_rate_utilization_relative_to_roofline_percent = 7;
  // Memory bandwidth utilization relative to the hardware limit, in percent.
  double memory_bw_utilization_relative_to_hw_limit_percent = 8;
  // Percentage of device computation that is 16-bit.
  double device_compute_16bit_percent = 9;
  // Percentage of device computation that is 32-bit.
  double device_compute_32bit_percent = 10;
  // Percentage of TF ops executed on the host.
  double host_tf_op_percent = 11;
  // Percentage of TF ops executed on the device.
  double device_tf_op_percent = 12;
  // Host trace level.
  uint32 host_trace_level = 13;
  // Percentage of TF-op execution time on the host (excluding the idle time)
  // that is spent in eager mode.
  double host_op_time_eager_percent = 14;
  // Percentage of TF-op execution time on the device (excluding the idle
  // time) that is spent in eager mode.
  double device_op_time_eager_percent = 15;
  // Percentage of TF-op execution time on the device (excluding the idle
  // time) that is spent on outside compilation.
  double device_op_time_outside_compilation_percent = 16;
}

// Overview result for a performance tip shown to users.
message OverviewPageTip {
  // Link to the tip.
  string link = 1;
}

// Generic recommendation details. The `recommendation` field of
// OverviewPageRecommendation is a google.protobuf.Any that can be unpacked
// into this message.
message GenericRecommendation {
  // Indicates if kernel launch is a performance bottleneck. Possible values:
  // "no", "moderate", "high".
  string kernel_launch_bottleneck = 1;
  // A statement that recommends whether kernel-launch performance needs
  // further investigation.
  string kernel_launch_statement = 2;
  // Indicates if "all other" is a performance bottleneck. Possible values:
  // "no", "moderate", "high".
  string all_other_bottleneck = 3;
  // A statement that recommends whether all-other performance needs further
  // investigation.
  string all_other_statement = 4;
  // A statement that recommends whether the user should try using lower
  // precision. Shown to users only if it is not empty.
  string precision_statement = 5;
  // Indicates if device collectives are a performance bottleneck. Possible
  // values: "no", "moderate", "high".
  string device_collectives_bottleneck = 6;
  // A statement that recommends whether device-collectives performance needs
  // further investigation.
  string device_collectives_statement = 7;
}

// Overview result for the recommendation section.
message OverviewPageRecommendation {
  // Possible performance bottleneck: "host", "device", "both".
  string bottleneck = 1;
  // A statement for input that recommends the next steps for investigating
  // the bottleneck.
  string statement = 2;
  // A list of tips for tackling an input bottleneck.
  repeated OverviewPageTip input_tips = 11;
  // A statement for output that recommends the next steps for investigating
  // the bottleneck.
  string output_statement = 9;
  // A statement that recommends the next steps for investigating an
  // eager-mode related bottleneck. It is HTML so that it can link to other
  // tools/docs.
  string eager_statement_html = 12;
  // A statement that recommends the next steps for investigating an
  // outside-compilation related bottleneck. It is HTML so that it can link to
  // other tools/docs.
  string outside_compilation_statement_html = 13;
  // A statement that recommends the next steps for investigating a
  // tf-function related bottleneck. It is HTML so that it can link to other
  // tools/docs.
  string tf_function_statement_html = 10;
  // A list of tips for improving host performance.
  repeated OverviewPageTip host_tips = 3;
  // A list of tips for improving device performance.
  repeated OverviewPageTip device_tips = 4;
  // A list of links to related useful documents.
  repeated OverviewPageTip documentation_tips = 5;
  // The recommendation made to the user. Can be unpacked into a
  // GenericRecommendation.
  google.protobuf.Any recommendation = 6;
  // A list of tips for FAQ.
  repeated OverviewPageTip faq_tips = 7;
  // A list of tips for an inference run.
  repeated OverviewPageTip inference_tips = 8;
}

// Result proto for host-independent job information.
message OverviewPageHostIndependentJobInfo {
  // The change-list number of this build.
  int64 change_list = 1;
  // The time of this build (nanoseconds since the Unix epoch).
  int64 build_time = 2;
  // The target of this build.
  string build_target = 3;
  // Profiling duration (in ms).
  uint32 profile_duration_ms = 4;
}

// Result proto for host-dependent job information.
message OverviewPageHostDependentJobInfo {
  // The ID of the host on which the job was run.
  string host_id = 1;
  // The command line used to run the job.
  string command_line = 2;
  // The start time of this run (nanoseconds since the Unix epoch).
  int64 start_time = 3;
  // BNS address specified by the client at the time of the profiling request.
  string bns_address = 4;
  // Profiling start walltime (in ns).
  uint64 profile_time_ns = 5;
}

// The run environment of a profiling session.
message OverviewPageRunEnvironment {
  // Number of hosts used.
  int32 host_count = 1;
  // Number of tasks used.
  int32 task_count = 2;
  // Distinct hostnames seen.
  map<string, bool> hostnames = 3;
  // The type of device used.
  string device_type = 4;
  // The number of device cores used.
  // In the TPU case, this corresponds to the number of TPU cores.
  // In the GPU case, this corresponds to the number of GPUs (not the number
  // of SMs).
  int32 device_core_count = 5;
  // The per-device-core batch size.
  int32 per_core_batch_size = 6;
  // Host-independent information about this job.
  OverviewPageHostIndependentJobInfo host_independent_job_info = 7;
  // Host-dependent information about this job.
  repeated OverviewPageHostDependentJobInfo host_dependent_job_info = 8;
  // The number of replicas; corresponds to input parallelism.
  // If there is no model parallelism, replica_count = device_core_count.
  int32 replica_count = 9;
  // The number of cores used for a single replica, e.g. model parallelism.
  // If there is no model parallelism, then num_cores_per_replica = 1.
  int32 num_cores_per_replica = 10;
}

// Top-level overview result: combines the run environment, the step-time
// (input pipeline) analysis, the general analysis, the recommendation, and
// profiling diagnostics.
message OverviewPage {
  // The run environment of the profiled session.
  OverviewPageRunEnvironment run_environment = 6;
  // The step-time result.
  InputPipelineAnalysisResult input_analysis = 2;
  // The other analysis result.
  OverviewPageAnalysis analysis = 3;
  // The recommendation made to the user.
  OverviewPageRecommendation recommendation = 4;
  // Error and warning messages for diagnosing profiling issues.
  Diagnostics diagnostics = 8;
  // Field numbers 1, 5, and 7 are reserved and must never be reused.
  reserved 1, 5, 7;
}