syntax = "proto3";

package tensorflow.profiler;

import "google/protobuf/any.proto";
import "tensorflow/core/profiler/protobuf/diagnostics.proto";
import "tensorflow/core/profiler/protobuf/input_pipeline.proto";

// Overview result for a single TensorFlow Op.
message OverviewTfOp {
  // Name of the Op.
  string name = 1;
  // Category of the Op.
  string category = 2;
  // The amount of time that this Op takes by itself, as a fraction of the
  // total execution time on the device or host.
  double self_time_fraction = 3;
  // The cumulative time up to this Op, as a fraction of the total execution
  // time.
  double cumulative_time_fraction = 4;
  // The rate that this Op achieves, in GFlops/sec.
  double flop_rate = 5;
  // Whether the Op is eligible to use TensorCores.
  bool is_op_tensorcore_eligible = 6;
  // Whether at least one of the kernels launched in this Op is using
  // TensorCore.
  bool is_op_using_tensorcore = 7;
}

// Overview result for general analysis.
message OverviewPageAnalysis {
  // MXU utilization, as a percentage.
  double mxu_utilization_percent = 1;
  // Percentage of the device time that is idle.
  double device_idle_time_percent = 2;
  // Percentage of the host time that is idle.
  double host_idle_time_percent = 3;
  // Top TF Ops executed on the device.
  repeated OverviewTfOp top_device_ops = 4;
  // Remark text in the performance summary section.
  string remark_text = 5;
  // Color of the remark text.
  string remark_color = 6;
  // FLOP rate utilization relative to the roofline, as a percentage.
  double flop_rate_utilization_relative_to_roofline_percent = 7;
  // Memory bandwidth utilization relative to the hardware limit, as a
  // percentage.
  double memory_bw_utilization_relative_to_hw_limit_percent = 8;
  // Percentage of device computation that is 16-bit.
  double device_compute_16bit_percent = 9;
  // Percentage of device computation that is 32-bit.
  double device_compute_32bit_percent = 10;
  // Percentage of TF ops executed on the host.
  double host_tf_op_percent = 11;
  // Percentage of TF ops executed on the device.
  double device_tf_op_percent = 12;
  // Host trace level.
  uint32 host_trace_level = 13;
  // Percentage of TF-op execution time on the host (excluding the idle time)
  // that is spent in eager mode.
  double host_op_time_eager_percent = 14;
  // Percentage of TF-op execution time on the device (excluding the idle time)
  // that is spent in eager mode.
  double device_op_time_eager_percent = 15;
  // Percentage of TF-op execution time on the device (excluding the idle time)
  // that is spent on outside compilation.
  double device_op_time_outside_compilation_percent = 16;
}

// Overview result for a single performance tip shown to users.
message OverviewPageTip {
  // Link to the tip.
  string link = 1;
}

// Recommendation details covering kernel-launch, all-other, precision and
// device-collectives bottlenecks. The `recommendation` field of
// OverviewPageRecommendation (a google.protobuf.Any) can be unpacked into
// this message.
message GenericRecommendation {
  // Indicates if kernel launch is a performance bottleneck. Possible values:
  // "no", "moderate", "high".
  string kernel_launch_bottleneck = 1;
  // A statement that recommends if we need to further investigate kernel-launch
  // performance.
  string kernel_launch_statement = 2;
  // Indicates if all other is a performance bottleneck. Possible values: "no",
  // "moderate", "high".
  string all_other_bottleneck = 3;
  // A statement that recommends if we need to further investigate all-other
  // performance.
  string all_other_statement = 4;
  // A statement that recommends if the user should try using lower precision.
  // Shown to users only if it is not empty.
  string precision_statement = 5;
  // Indicates if device collectives are a performance bottleneck. Possible
  // values: "no", "moderate", "high".
  string device_collectives_bottleneck = 6;
  // A statement that recommends if we need to further investigate
  // device-collectives performance.
  string device_collectives_statement = 7;
}

// Overview result for the recommendation section.
//
// Fields are intentionally not declared in field-number order; the numbers are
// the wire contract and must not be changed.
message OverviewPageRecommendation {
  // Possible performance bottleneck: "host", "device", "both".
  string bottleneck = 1;
  // A statement for input that recommends the next steps for investigating the
  // bottleneck.
  string statement = 2;
  // A list of tips for tackling input bottleneck.
  repeated OverviewPageTip input_tips = 11;
  // A statement for output that recommends the next steps for investigating the
  // bottleneck.
  string output_statement = 9;
  // A statement that recommends the next steps for investigating eager-mode
  // related bottleneck (it is HTML so that it can link to other tools/docs).
  string eager_statement_html = 12;
  // A statement that recommends the next steps for investigating
  // outside-compilation related bottleneck (it is HTML so that it can link to
  // other tools/docs).
  string outside_compilation_statement_html = 13;
  // A statement that recommends the next steps for investigating tf-function
  // related bottleneck (it is HTML so that it can link to other tools/docs).
  string tf_function_statement_html = 10;
  // A list of tips for improving host performance.
  repeated OverviewPageTip host_tips = 3;
  // A list of tips for improving device performance.
  repeated OverviewPageTip device_tips = 4;
  // A list of links to related useful documents.
  repeated OverviewPageTip documentation_tips = 5;
  // The recommendation made to the user. Can be unpacked into a
  // GenericRecommendation.
  google.protobuf.Any recommendation = 6;
  // A list of tips for FAQ.
  repeated OverviewPageTip faq_tips = 7;
  // A list of tips for inference run.
  repeated OverviewPageTip inference_tips = 8;
}

// Result proto for host-independent job information.
message OverviewPageHostIndependentJobInfo {
  // The change-list number of this build.
  int64 change_list = 1;
  // The time of this build (nanoseconds since the Unix epoch).
  int64 build_time = 2;
  // The target of this build.
  string build_target = 3;
  // Profiling duration (in ms).
  uint32 profile_duration_ms = 4;
}

// Result proto for host-dependent job information.
message OverviewPageHostDependentJobInfo {
  // The ID of the host on which the job was run.
  string host_id = 1;
  // The command line used to run the job.
  string command_line = 2;
  // The start time of this run (nanoseconds since the Unix epoch).
  int64 start_time = 3;
  // BNS address specified by the client at the time of the profiling request.
  string bns_address = 4;
  // Profiling start walltime (in ns).
  uint64 profile_time_ns = 5;
}

// The run environment of a profiling session.
message OverviewPageRunEnvironment {
  // Number of hosts used.
  int32 host_count = 1;
  // Number of tasks used.
  int32 task_count = 2;
  // Distinct hostnames seen.
  // NOTE(review): presumably used as a set keyed by hostname; the meaning of
  // the bool value is not visible in this file -- confirm against writers.
  map<string, bool> hostnames = 3;
  // The type of device used.
  string device_type = 4;
  // The number of device cores used.
  //   In the TPU case, this corresponds to the number of TPU cores.
  //   In the GPU case, this corresponds to the number of GPUs (not the number
  //   of SMs).
  int32 device_core_count = 5;
  // The per-device-core batch size.
  int32 per_core_batch_size = 6;
  // Host-independent information about this job.
  OverviewPageHostIndependentJobInfo host_independent_job_info = 7;
  // Host-dependent information about this job.
  repeated OverviewPageHostDependentJobInfo host_dependent_job_info = 8;
  // The number of replicas; corresponds to input parallelism.
  // If there is no model parallelism, replica_count = device_core_count.
  int32 replica_count = 9;
  // The number of cores used for a single replica, e.g. model parallelism.
  // If there is no model parallelism, then num_cores_per_replica = 1.
  int32 num_cores_per_replica = 10;
}

// Top-level overview page result. Groups the run environment, the step-time
// (input pipeline) analysis, the general analysis, the recommendation and the
// profiling diagnostics.
message OverviewPage {
  // The run environment of the profiled session.
  OverviewPageRunEnvironment run_environment = 6;
  // The step-time result.
  InputPipelineAnalysisResult input_analysis = 2;
  // The other analysis result.
  OverviewPageAnalysis analysis = 3;
  // The recommendation made to the user.
  OverviewPageRecommendation recommendation = 4;
  // Error and warning messages for diagnosing profiling issues.
  Diagnostics diagnostics = 8;
  // Field numbers 1, 5 and 7 are reserved and must not be reused
  // (presumably they belonged to removed fields).
  reserved 1, 5, 7;
}