syntax = "proto3";

package tensorflow.tpu;

import "tensorflow/core/protobuf/tpu/optimization_parameters.proto";
import "tensorflow/core/protobuf/tpu/tpu_embedding_output_layout.proto";

message TPUEmbeddingConfiguration {
  // Description of the various embedding tables.
  message TableDescriptor {
    // Name of the table.
    string name = 1;
    // Size of the vocabulary (i.e., number of rows) in the table.
    int64 vocabulary_size = 2;
    // The embedding dimension (i.e., the width of the embedding table).
    int32 dimension = 3;
    // Number of features mapped to this table.
    int32 num_features = 4;
    // Details of the learning algorithm used to update the embedding
    // parameters.
    OptimizationParameters optimization_parameters = 5;
  }
  repeated TableDescriptor table_descriptor = 1;

  // Mode in which the embedding layer program should be run: inference
  // (forward pass only), training (both forward and backward passes), or the
  // backward pass only.
  enum Mode {
    UNSPECIFIED = 0;
    INFERENCE = 1;
    TRAINING = 2;
    BACKWARD_PASS_ONLY = 3;
  }
  Mode mode = 2;

  // Number of samples in each batch of embedding layer activations sent to
  // the TensorCore.
  int32 batch_size_per_tensor_core = 3;

  // Number of TPU hosts used for inference/training.
  int32 num_hosts = 4;
  // Number of TensorCores used for inference/training.
  int32 num_tensor_cores = 5;

  // Sharding strategy of the embedding tables among the hosts.
  // If the sharding_strategy is "mod", each id is assigned to host
  // "id % num_hosts". For instance, 13 ids are split across 5 hosts as:
  // [[0, 5, 10], [1, 6, 11], [2, 7, 12], [3, 8], [4, 9]].
  // If the sharding_strategy is "div", ids are assigned to hosts in a
  // contiguous manner. In this case, 13 ids are split across 5 hosts as:
  // [[0, 1, 2], [3, 4, 5], [6, 7, 8], [9, 10], [11, 12]].
  // In both strategies, if the id space does not evenly divide the number of
  // hosts, each of the first "table_descriptor.vocabulary_size % num_hosts"
  // hosts is assigned one more id.
  // This partitioning strategy exactly follows that of the embedding_lookup
  // TensorFlow function at tensorflow/python/ops/embedding_ops.py.
  // A sketch of both assignments follows the sharding_strategy field below.
  enum ShardingStrategy {
    DIV_DEFAULT = 0;
    MOD = 1;
  }
  ShardingStrategy sharding_strategy = 6;

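  // A minimal sketch of the two host-assignment rules described above,
  // written as Python pseudocode for illustration only (the function names
  // are hypothetical; "num_hosts" and "vocabulary_size" refer to the fields
  // defined in this proto):
  //
  //   def mod_host(id, num_hosts):
  //       return id % num_hosts
  //
  //   def div_host(id, vocabulary_size, num_hosts):
  //       # The first (vocabulary_size % num_hosts) hosts get one extra id.
  //       ids_per_host = vocabulary_size // num_hosts
  //       extras = vocabulary_size % num_hosts
  //       threshold = extras * (ids_per_host + 1)
  //       if id < threshold:
  //           return id // (ids_per_host + 1)
  //       return extras + (id - threshold) // ids_per_host
  //
  // For vocabulary_size=13 and num_hosts=5, mod_host reproduces
  // [[0, 5, 10], [1, 6, 11], [2, 7, 12], [3, 8], [4, 9]] and div_host
  // reproduces [[0, 1, 2], [3, 4, 5], [6, 7, 8], [9, 10], [11, 12]].
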
  // This parameter determines if the execution of the sparse core will be
  // pipelined with that of the TensorCore. This parameter only affects results
  // when mode=TRAINING. If mode=INFERENCE or BACKWARD_PASS_ONLY, this
  // parameter does not affect execution and is hence ignored.
  //
  // false: The execution of the sparse core is not pipelined with that of the
  // TensorCore. The forward pass of every step on the sparse core is executed
  // only after the backward pass of the previous step is complete. Similarly,
  // the backward pass on the sparse core is executed only after the embedding
  // gradients have been computed on the TensorCore on every step. This ensures
  // that the activations on every step observe the gradient updates from the
  // previous step on both the sparse core and the TensorCore.
  //
  // true: The execution of the sparse core is pipelined with that of the
  // TensorCore. The forward pass of every step on the sparse core can be
  // executed after the forward pass of the previous step is complete without
  // waiting for the backward pass. This improves the utilization of the sparse
  // core, allowing it to process step N+1 while the embedding gradients for
  // step N are computed on the TensorCore. The backward pass of every step on
  // the sparse core is executed directly after the forward pass for the next
  // step is complete. The drawback is that embedding activations for step N+1
  // do not observe the embedding gradient updates from step N. This could
  // affect model quality if steps N and N+1 involve the same set of embedding
  // IDs. However, since the embedding updates are sparse, this is generally
  // not considered a problem.
  bool pipeline_execution_with_tensor_core = 7;
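
  // An illustrative ordering of the two settings for consecutive steps N and
  // N+1, derived from the description above (SC = sparse core,
  // TC = TensorCore; this is a schematic timeline, not actual code):
  //
  //   false: SC fwd(N) -> TC fwd(N) -> TC bwd(N) -> SC bwd(N) -> SC fwd(N+1)
  //   true:  SC fwd(N) -> SC fwd(N+1) -> SC bwd(N) -> SC fwd(N+2) -> ...
  //          (TC fwd(N) and TC bwd(N) overlap with SC fwd(N+1))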

  // Directory where embedding lookup statistics are stored. These statistics
  // summarize information about the inputs to the embedding lookup
  // operation, in particular, the average number of embedding IDs per example
  // and how well the embedding IDs are load balanced across the system. The
  // lookup statistics are used during TPU initialization for embedding table
  // partitioning. Collection of lookup statistics is done at runtime by
  // profiling the embedding inputs: only 3% of input samples are profiled to
  // minimize host CPU overhead. Once a suitable number of samples are
  // profiled, the lookup statistics are saved to table-specific files in the
  // profile data directory, generally at the end of a TPU training loop. The
  // filename corresponding to each table is obtained by hashing table-specific
  // parameters (e.g., table name and number of features) and global
  // configuration parameters (e.g., sharding strategy and TPU worker task
  // count). The same profile data directory can be shared amongst several
  // models to reuse embedding lookup statistics.
  string profile_data_directory = 9;

  // Extended output layout information; deprecated and now ignored.
  TPUEmbeddingOutputLayout output_layout = 8 [deprecated = true];
}

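// A minimal example of this configuration written as a text proto. All
// values below are illustrative, the table name and directory are
// hypothetical, and the optimization_parameters submessage (defined in
// optimization_parameters.proto) is omitted:
//
//   table_descriptor {
//     name: "video_id_table"  # hypothetical table
//     vocabulary_size: 1000000
//     dimension: 64
//     num_features: 1
//   }
//   mode: TRAINING
//   batch_size_per_tensor_core: 128
//   num_hosts: 4
//   num_tensor_cores: 32
//   sharding_strategy: DIV_DEFAULT
//   pipeline_execution_with_tensor_core: true
//   profile_data_directory: "/tmp/embedding_profiles"  # hypothetical path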