// android-12.0.0_r2/s

syntax = "proto3";

package tensorflow.profiler;

import "google/protobuf/any.proto";
import "tensorflow/core/profiler/protobuf/diagnostics.proto";
import "tensorflow/core/profiler/protobuf/input_pipeline.proto";

// Overview result for a single TensorFlow Op, as listed on the overview page
// (see OverviewPageAnalysis.top_device_ops).
message OverviewTfOp {
  // Name of the Op.
  string name = 1;
  // Category of the Op.
  string category = 2;
  // The amount of time that this Op takes by itself, as a fraction of the
  // total execution time on the device or host.
  // NOTE(review): presumably in the range [0, 1] -- confirm with the producer.
  double self_time_fraction = 3;
  // The cumulative time up to and including this Op, as a fraction of the
  // total execution time.
  // NOTE(review): the ordering of Ops that the accumulation follows is not
  // stated in this file -- confirm with the producer.
  double cumulative_time_fraction = 4;
  // FLOP rate that this Op achieves, in GFLOPs/sec.
  double flop_rate = 5;
  // Whether the Op is eligible to use TensorCores.
  bool is_op_tensorcore_eligible = 6;
  // Whether at least one of the kernels launched in this Op is using
  // TensorCore.
  bool is_op_using_tensorcore = 7;
}

// Overview result for the general analysis section: utilization, idle time,
// precision mix, host/device Op placement, and the top device Ops.
// All "*_percent" fields are documented as percentages.
// NOTE(review): presumably on a 0-100 scale -- confirm with the producer.
message OverviewPageAnalysis {
  // MXU utilization, as a percentage.
  double mxu_utilization_percent = 1;
  // Percentage of the device time that is idle.
  double device_idle_time_percent = 2;
  // Percentage of the host time that is idle.
  double host_idle_time_percent = 3;
  // Top TF Ops executed on the device.
  repeated OverviewTfOp top_device_ops = 4;
  // Remark text in the performance summary section.
  string remark_text = 5;
  // Color of the remark text.
  // NOTE(review): the color format (e.g. name vs. hex code) is not specified
  // in this file -- confirm with the consumer.
  string remark_color = 6;
  // FLOP rate utilization relative to the roofline, as a percentage.
  double flop_rate_utilization_relative_to_roofline_percent = 7;
  // Memory bandwidth utilization relative to the hardware limit, as a
  // percentage.
  double memory_bw_utilization_relative_to_hw_limit_percent = 8;
  // Percentage of device computation that is 16-bit.
  double device_compute_16bit_percent = 9;
  // Percentage of device computation that is 32-bit.
  double device_compute_32bit_percent = 10;
  // Percentage of TF Ops executed on the host.
  double host_tf_op_percent = 11;
  // Percentage of TF Ops executed on the device.
  double device_tf_op_percent = 12;
  // Host trace level.
  uint32 host_trace_level = 13;
  // Percentage of TF-op execution time on the host (excluding the idle time)
  // that is spent in eager mode.
  double host_op_time_eager_percent = 14;
  // Percentage of TF-op execution time on the device (excluding the idle time)
  // that is spent in eager mode.
  double device_op_time_eager_percent = 15;
  // Percentage of TF-op execution time on the device (excluding the idle time)
  // that is spent on outside compilation.
  double device_op_time_outside_compilation_percent = 16;
}

// Overview result for a single performance tip shown to users. Used as the
// element type of the tip lists in OverviewPageRecommendation.
message OverviewPageTip {
  // Link to the tip.
  string link = 1;
}

// Recommendation details covering kernel-launch, all-other, precision, and
// device-collectives aspects of performance. Per the comment on
// OverviewPageRecommendation.recommendation, that Any field can be unpacked
// into this message.
message GenericRecommendation {
  // Indicates if kernel launch is a performance bottleneck. Possible values:
  // "no", "moderate", "high".
  string kernel_launch_bottleneck = 1;
  // A statement that recommends if we need to further investigate kernel-launch
  // performance.
  string kernel_launch_statement = 2;
  // Indicates if the "all other" time is a performance bottleneck. Possible
  // values: "no", "moderate", "high".
  string all_other_bottleneck = 3;
  // A statement that recommends if we need to further investigate all-other
  // performance.
  string all_other_statement = 4;
  // A statement that recommends if the user should try using lower precision.
  // Shown to users only if it is not empty.
  string precision_statement = 5;
  // Indicates if device collectives are a performance bottleneck. Possible
  // values: "no", "moderate", "high".
  string device_collectives_bottleneck = 6;
  // A statement that recommends if we need to further investigate
  // device-collectives performance.
  string device_collectives_statement = 7;
}

// Overview result for the recommendation section.
//
// Field numbers below are not in declaration order. The numbers identify the
// fields on the wire, so they must not be renumbered to match the ordering.
message OverviewPageRecommendation {
  // Possible performance bottleneck: "host", "device", "both".
  string bottleneck = 1;
  // A statement for input that recommends the next steps for investigating the
  // bottleneck.
  string statement = 2;
  // A list of tips for tackling input bottleneck.
  repeated OverviewPageTip input_tips = 11;
  // A statement for output that recommends the next steps for investigating the
  // bottleneck.
  string output_statement = 9;
  // A statement that recommends the next steps for investigating eager-mode
  // related bottleneck (it is HTML so that it can link to other tools/docs).
  string eager_statement_html = 12;
  // A statement that recommends the next steps for investigating
  // outside-compilation related bottleneck (it is HTML so that it can link
  // to other tools/docs).
  string outside_compilation_statement_html = 13;
  // A statement that recommends the next steps for investigating tf-function
  // related bottleneck (it is HTML so that it can link to other tools/docs).
  string tf_function_statement_html = 10;
  // A list of tips for improving host performance.
  repeated OverviewPageTip host_tips = 3;
  // A list of tips for improving device performance.
  repeated OverviewPageTip device_tips = 4;
  // A list of links to related useful documents.
  repeated OverviewPageTip documentation_tips = 5;
  // The recommendation made to the user. Can be unpacked into a
  // GenericRecommendation.
  google.protobuf.Any recommendation = 6;
  // A list of tips for FAQ.
  repeated OverviewPageTip faq_tips = 7;
  // A list of tips for inference run.
  repeated OverviewPageTip inference_tips = 8;
}

// Result proto for host-independent job information, i.e. build and profiling
// details that are recorded once per job rather than once per host.
message OverviewPageHostIndependentJobInfo {
  // The change-list number of this build.
  int64 change_list = 1;
  // The time of this build (nanoseconds since the Unix epoch).
  int64 build_time = 2;
  // The target of this build.
  string build_target = 3;
  // Profiling duration (in milliseconds).
  uint32 profile_duration_ms = 4;
}

// Result proto for host-dependent job information. OverviewPageRunEnvironment
// holds a repeated list of these entries.
message OverviewPageHostDependentJobInfo {
  // The ID of the host on which the job was run.
  string host_id = 1;
  // The command line used to run the job.
  string command_line = 2;
  // The start time of this run (nanoseconds since the Unix epoch).
  int64 start_time = 3;
  // BNS address specified by the client at the time of the profiling request.
  string bns_address = 4;
  // Profiling start walltime (in nanoseconds).
  uint64 profile_time_ns = 5;
}

// The run environment of a profiling session: host/task/device counts, device
// type, batch size, replica layout, and per-job build/run information.
message OverviewPageRunEnvironment {
  // Number of hosts used.
  int32 host_count = 1;
  // Number of tasks used.
  int32 task_count = 2;
  // Distinct hostnames seen. The map keys are the hostnames.
  // NOTE(review): the meaning of the bool value is not documented here;
  // presumably the map is used as a set -- confirm with the producer.
  map<string, bool> hostnames = 3;
  // The type of device used.
  string device_type = 4;
  // The number of device cores used.
  //   In the TPU case, this corresponds to the number of TPU cores.
  //   In the GPU case, this corresponds to the number of GPUs (not the number
  //   of SMs).
  int32 device_core_count = 5;
  // The per-device-core batch size.
  int32 per_core_batch_size = 6;
  // Host-independent information about this job.
  OverviewPageHostIndependentJobInfo host_independent_job_info = 7;
  // Host-dependent information about this job, one entry per element.
  repeated OverviewPageHostDependentJobInfo host_dependent_job_info = 8;
  // The number of replicas; corresponds to input parallelism.
  // If there is no model parallelism, replica_count = device_core_count.
  int32 replica_count = 9;
  // The number of cores used for a single replica, e.g. for model parallelism.
  // If there is no model parallelism, then num_cores_per_replica = 1.
  int32 num_cores_per_replica = 10;
}

// Top-level overview-page result. Aggregates the run environment, the
// step-time (input pipeline) analysis, the general analysis, the
// recommendation, and diagnostics.
//
// NOTE(review): InputPipelineAnalysisResult and Diagnostics are not declared
// in this file; presumably they come from the imported input_pipeline.proto
// and diagnostics.proto -- confirm.
message OverviewPage {
  // The run environment of the profiled session.
  OverviewPageRunEnvironment run_environment = 6;
  // The step-time result.
  InputPipelineAnalysisResult input_analysis = 2;
  // The other analysis result.
  OverviewPageAnalysis analysis = 3;
  // The recommendation made to the user.
  OverviewPageRecommendation recommendation = 4;
  // Error and warning messages for diagnosing profiling issues.
  Diagnostics diagnostics = 8;
  // Field numbers 1, 5 and 7 are reserved and cannot be assigned to new
  // fields (presumably they belonged to fields that have been removed).
  reserved 1, 5, 7;
}