// Definitions for per-op profiling metrics in the tensorflow.profiler package.
syntax = "proto3";

package tensorflow.profiler;

// What a layout dimension represents, e.g. spatial, feature or batch.
enum LayoutDimensionSemantics {
  // Zero value; the proto3 default when the field is not set.
  UNKNOWN_SEMANTICS = 0;
  // The dimension is a feature dimension.
  FEATURE = 1;
  // The dimension is a batch dimension.
  BATCH = 2;
  // The dimension is a spatial dimension.
  SPATIAL = 3;
}

// Data layout of an op.
message LayoutAnalysis {
  // Physical data layout of a single tensor dimension.
  message Dimension {
    // Size of the data in this dimension.
    int32 size = 1;
    // Data must be padded to a multiple of this alignment.
    int32 alignment = 2;
    // What the dimension represents (feature, batch or spatial).
    LayoutDimensionSemantics semantics = 3;
  }
  // The physical data layout, ordered from the most-minor dimension to the
  // most-major dimension.
  repeated Dimension dimensions = 1;
}

// Metrics for an operation (accumulated over all occurrences).
// Next ID: 21
message OpMetrics {
  // HLO module id. 0 for TF ops.
  uint64 hlo_module_id = 13;
  // Name of this op.
  string name = 6;
  // Long name of this op (e.g., HLO expression).
  string long_name = 20;
  // Category of this op.
  string category = 11;
  // Provenance of this op (e.g., if HLO op, original TF op).
  string provenance = 12;
  // Whether it is executed eagerly.
  bool is_eager = 18;
  // Number of executions.
  uint32 occurrences = 3;
  // Total time (self + children) in picoseconds.
  uint64 time_ps = 7;
  // Minimum time (self + children) among all occurrences, in picoseconds.
  uint64 min_time_ps = 17;
  // Total self time in picoseconds.
  uint64 self_time_ps = 1;
  // Total FLOPs.
  uint64 flops = 2;
  // Total bytes accessed.
  uint64 bytes_accessed = 5;
  // One entry of the breakdown of memory accessed by operation type and
  // memory space.
  message MemoryAccessed {
    // Kind of memory operation an entry accounts for.
    enum OperationType {
      // Zero value; the proto3 default when the field is not set.
      UNKNOWN = 0;
      READ = 1;
      WRITE = 2;
    }
    // Operation type (read or write) this entry accounts for.
    OperationType operation_type = 1;
    // Device-specific id of memory space.
    uint64 memory_space = 2;
    // Bytes accessed; presumably for the operation type and memory space
    // given above (TODO confirm against the producer).
    uint64 bytes_accessed = 3;
  }
  // Breakdown of memory accessed by operation type and memory space.
  repeated MemoryAccessed memory_accessed_breakdown = 19;
  // Total dma stall time in picoseconds.
  uint64 dma_stall_ps = 10;
  // The data layout for this op. Only set for convolution ops for now.
  LayoutAnalysis layout = 14;
  // Deduplicated HLO name for this op. Not set for TF ops.
  string deduplicated_name = 15;
  // Children of the op, e.g. fused ops if this op is a fusion. OpMetricsDb in
  // turn holds OpMetrics, so this message is recursive.
  OpMetricsDb children = 16;
  // Reserved field numbers (presumably former fields); do not reuse them.
  reserved 4, 8, 9;
}

// Statistics about the various precisions used in computation.
message PrecisionStats {
  // Amount of time spent on 16-bit computation, in picoseconds.
  uint64 compute_16bit_ps = 1;
  // Amount of time spent on 32-bit computation, in picoseconds.
  uint64 compute_32bit_ps = 2;
}

// A database for OpMetrics.
// Next ID: 14
message OpMetricsDb {
  // The OpMetrics entries held by this database.
  repeated OpMetrics metrics_db = 10;
  // The total host infeed-enqueue duration in picoseconds.
  uint64 total_host_infeed_enq_duration_ps = 2;
  // The total of the difference between the start times of two
  // consecutive infeed-enqueues (per host) in picoseconds.
  uint64 total_host_infeed_enq_start_timestamp_ps_diff = 3;
  // The total time in picoseconds.
  uint64 total_time_ps = 11;
  // The total time incurred by OPs in picoseconds.
  uint64 total_op_time_ps = 12;
  // Precision-related stats.
  PrecisionStats precision_stats = 13;
  // Reserved field numbers (presumably former fields); do not reuse them.
  reserved 1, 4, 5, 6, 7, 8, 9;
}
