1syntax = "proto3"; 2 3package tensorflow.tpu; 4 5import "tensorflow/core/protobuf/tpu/optimization_parameters.proto"; 6import "tensorflow/core/protobuf/tpu/tpu_embedding_output_layout.proto"; 7 8message TPUEmbeddingConfiguration { 9 // Description of the various embedding tables. 10 message TableDescriptor { 11 // Name of the table. 12 string name = 1; 13 // Size of the vocabulary (i.e., number of rows) in the table. 14 int64 vocabulary_size = 2; 15 // The embedding dimension (i.e., the width of the embedding table). 16 int32 dimension = 3; 17 // Number of features mapped to this table. 18 int32 num_features = 4; 19 // Details of the learning algorithm used to update the embedding 20 // parameters. 21 OptimizationParameters optimization_parameters = 5; 22 } 23 repeated TableDescriptor table_descriptor = 1; 24 25 // Mode. Should the embedding layer program be run for inference (just forward 26 // pass), training (both forward and backward pass) or just the backward_pass. 27 enum Mode { 28 UNSPECIFIED = 0; 29 INFERENCE = 1; 30 TRAINING = 2; 31 BACKWARD_PASS_ONLY = 3; 32 } 33 Mode mode = 2; 34 35 // Number of samples in each batch of embedding layer activations sent to 36 // the TensorCore. 37 int32 batch_size_per_tensor_core = 3; 38 39 // Number of TPU hosts used for inference/training. 40 int32 num_hosts = 4; 41 42 // Number of TensorCore used for inference/training. 43 int32 num_tensor_cores = 5; 44 45 // Sharding strategy of the embedding tables among the hosts. 46 // If the sharding_strategy is "mod", each id is assigned to host 47 // "id % num_hosts". For instance, 13 ids are split across 5 hosts as: 48 // [[0, 5, 10], [1, 6, 11], [2, 7, 12], [3, 8], [4, 9]]. 49 // If the sharding_strategy is "div", ids are assigned to hosts in a 50 // contiguous manner. In this case, 13 ids are split across 5 hosts as: 51 // [[0, 1, 2], [3, 4, 5], [6, 7, 8], [9, 10], [11, 12]]. 52 // In both the strategies, if the id space does not evenly divide the number 53 // of hosts, each of the first "table_descriptor.vocabulary_size % num_hosts" 54 // hosts will be assigned one more id. 55 // This partitioning strategy exactly follows that in the embedding_lookup 56 // TensorFlow function at tensorflow/python/ops/embedding_ops.py. 57 enum ShardingStrategy { 58 DIV_DEFAULT = 0; 59 MOD = 1; 60 } 61 ShardingStrategy sharding_strategy = 6; 62 63 // This parameter determines if the execution of the sparse core will be 64 // pipelined with that of the TensorCore. This parameter only affects results 65 // when mode=TRAINING. If mode=INFERENCE or BACKWARD_PASS_ONLY, this parameter 66 // does not affect execution and hence, is a don't care value. 67 // 68 // false: The execution of the sparse core is not pipelined with that of the 69 // TensorCore. The forward pass of every step on the sparse core is executed 70 // only after the backward pass of the previous step is complete. And the 71 // backward pass on the sparse core is executed only after the embedding 72 // gradients have been computed on the TensorCore on every step. This ensures 73 // that the activations on every step observe the gradient updates from the 74 // previous step on both the sparse core and the TensorCore. 75 // 76 // true: The execution of the sparse core is pipelined with that of the 77 // TensorCore. The forward pass of every step on the sparse core can be 78 // executed after the forward pass of the previous step is complete without 79 // waiting for the backward pass. 
This improves the utilization of the sparse 80 // core allowing it to process step N+1 while the embedding gradients for step 81 // N are computed on the TensorCore. The backward pass of every step on the 82 // sparse core is executed directly after the forward pass for the next step 83 // is complete. The drawback is that embedding activations for step N+1 do not 84 // observe the embedding gradient updates from step N. This could affect model 85 // quality if step N and N+1 involve the same set of embedding IDs. However, 86 // since the embedding updates are sparse, this is generally not considered a 87 // problem. 88 bool pipeline_execution_with_tensor_core = 7; 89 90 // Directory where embedding lookup statistics are stored. These statistics 91 // summarize information about the inputs to the embedding lookup 92 // operation, in particular, the average number of embedding IDs per example 93 // and how well the embedding IDs are load balanced across the system. The 94 // lookup statistics are used during TPU initialization for embedding table 95 // partitioning. Collection of lookup statistics is done at runtime by 96 // profiling the embedding inputs: only 3% of input samples are profiled to 97 // minimize host CPU overhead. Once a suitable number of samples are 98 // profiled, the lookup statistics are saved to table-specific files in the 99 // profile data directory generally at the end of a TPU training loop. The 100 // filename corresponding to each table is obtained by hashing table specific 101 // parameters (e.g., table name and number of features) and global 102 // configuration parameters (e.g., sharding strategy and TPU worker task 103 // count). The same profile data directory can be shared amongst several 104 // models to reuse embedding lookup statistics. 105 string profile_data_directory = 9; 106 107 // Extended output layout information; deprecated and now ignored. 108 TPUEmbeddingOutputLayout output_layout = 8 [deprecated = true]; 109} 110
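// A worked sketch of the id-to-host mapping selected by sharding_strategy.
// This is an illustration of the comment on that field above, not code
// generated from this file; `ids_per_host` and `extras` are names introduced
// here for exposition only.
//
//   ids_per_host = vocabulary_size / num_hosts   (integer division)
//   extras       = vocabulary_size % num_hosts
//
//   MOD: host(id) = id % num_hosts
//   DIV: the first `extras` hosts each hold (ids_per_host + 1) consecutive
//        ids; the remaining hosts each hold ids_per_host consecutive ids:
//        host(id) = id < extras * (ids_per_host + 1)
//                     ? id / (ids_per_host + 1)
//                     : extras + (id - extras * (ids_per_host + 1)) / ids_per_host
//
// E.g., vocabulary_size = 13 and num_hosts = 5 give ids_per_host = 2 and
// extras = 3, reproducing the splits shown in the sharding_strategy comment:
// mod yields [[0, 5, 10], [1, 6, 11], [2, 7, 12], [3, 8], [4, 9]] and div
// yields [[0, 1, 2], [3, 4, 5], [6, 7, 8], [9, 10], [11, 12]].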
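// Example: a minimal TPUEmbeddingConfiguration in text format. This is an
// illustrative sketch only: the table name and all numeric values below are
// hypothetical, and the optimizer fields assume the constant LearningRate and
// AdagradParameters messages defined in optimization_parameters.proto.
//
//   table_descriptor {
//     name: "video_id_table"
//     vocabulary_size: 100000
//     dimension: 64
//     num_features: 1
//     optimization_parameters {
//       learning_rate { constant: 0.1 }
//       adagrad { initial_accumulator: 0.1 }
//     }
//   }
//   mode: TRAINING
//   batch_size_per_tensor_core: 128
//   num_hosts: 4
//   num_tensor_cores: 32
//   sharding_strategy: DIV_DEFAULT
//   pipeline_execution_with_tensor_core: false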