1syntax = "proto3";
2
3package tensorflow;
4option cc_enable_arenas = true;
5option java_outer_classname = "ConfigProtos";
6option java_multiple_files = true;
7option java_package = "org.tensorflow.framework";
8
9import "tensorflow/core/framework/cost_graph.proto";
10import "tensorflow/core/framework/graph.proto";
11import "tensorflow/core/framework/step_stats.proto";
12import "tensorflow/core/protobuf/debug.proto";
13import "tensorflow/core/protobuf/cluster.proto";
14import "tensorflow/core/protobuf/rewriter_config.proto";
15
16message GPUOptions {
  // A value between 0 and 1 that indicates what fraction of the
  // available GPU memory to pre-allocate for each process.  1 means
  // to pre-allocate all of the GPU memory, 0.5 means the process
  // allocates ~50% of the available GPU memory.
  double per_process_gpu_memory_fraction = 1;

  // The type of GPU allocation strategy to use.
  //
  // Allowed values:
  // "": The empty string (default) uses a system-chosen default
  //     which may change over time.
  //
  // "BFC": A "Best-fit with coalescing" algorithm, simplified from a
  //        version of dlmalloc.
  string allocator_type = 2;

  // Delay deletion of up to this many bytes to reduce the number of
  // interactions with gpu driver code.  If 0, the system chooses
  // a reasonable default (several MBs).
  int64 deferred_deletion_bytes = 3;

  // If true, the allocator does not pre-allocate the entire specified
  // GPU memory region, instead starting small and growing as needed.
  bool allow_growth = 4;
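
  // For illustration (not part of this proto definition): a minimal Python
  // sketch, assuming the TF 1.x tf.ConfigProto / tf.GPUOptions bindings, of
  // how the two memory knobs above are commonly set.
  //
  //   import tensorflow as tf
  //
  //   config = tf.ConfigProto()
  //   # Either cap how much of each GPU the process may pre-allocate...
  //   config.gpu_options.per_process_gpu_memory_fraction = 0.4
  //   # ...or start small and grow the allocation on demand.
  //   config.gpu_options.allow_growth = True
  //   sess = tf.Session(config=config)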

  // A comma-separated list of GPU ids that determines the 'visible'
  // to 'virtual' mapping of GPU devices.  For example, if TensorFlow
  // can see 8 GPU devices in the process, and one wanted to map
  // visible GPU devices 5 and 3 as "/device:GPU:0" and "/device:GPU:1",
  // then one would specify this field as "5,3".  This field is similar in
  // spirit to the CUDA_VISIBLE_DEVICES environment variable, except
  // it applies to the visible GPU devices in the process.
  //
  // NOTE:
  // 1. The GPU driver provides the process with the visible GPUs
  //    in an order which is not guaranteed to have any correlation to
  //    the *physical* GPU id in the machine.  This field is used for
  //    remapping "visible" to "virtual", which means this operates only
  //    after the process starts.  Users are required to use vendor
  //    specific mechanisms (e.g., CUDA_VISIBLE_DEVICES) to control the
  //    physical to visible device mapping prior to invoking TensorFlow.
  // 2. In the code, the ids in this list are also called "CUDA GPU id"s,
  //    and the 'virtual' ids of GPU devices (i.e. the ids in the device
  //    name "/device:GPU:<id>") are also called "TF GPU id"s. Please
  //    refer to third_party/tensorflow/core/common_runtime/gpu/gpu_id.h
  //    for more information.
  string visible_device_list = 5;
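
  // For example (a sketch using the Python tf.ConfigProto binding; any client
  // that builds a ConfigProto can set the same string):
  //
  //   config = tf.ConfigProto()
  //   # Expose only visible GPUs 5 and 3, as /device:GPU:0 and /device:GPU:1.
  //   config.gpu_options.visible_device_list = "5,3"
  //   sess = tf.Session(config=config)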

  // In the event polling loop, sleep this many microseconds between
  // PollEvents calls, when the queue is not empty.  If the value is not
  // set or set to 0, it is set to a non-zero default.
  int32 polling_active_delay_usecs = 6;

  // In the event polling loop, sleep this many milliseconds between
  // PollEvents calls, when the queue is empty.  If the value is not
  // set or set to 0, it is set to a non-zero default.
  int32 polling_inactive_delay_msecs = 7;

  // Force all tensors to be gpu_compatible. On a GPU-enabled TensorFlow,
  // enabling this option forces all CPU tensors to be allocated with Cuda
  // pinned memory. Normally, TensorFlow will infer which tensors should be
  // allocated as pinned memory. But in cases where the inference is
  // incomplete, this option can significantly speed up cross-device memory
  // copy performance, as long as the data fits in memory.
  // Note that this option should not be enabled by default for unknown or
  // very large models, since all Cuda pinned memory is unpageable; having
  // too much pinned memory might negatively impact overall host system
  // performance.
  bool force_gpu_compatible = 8;

  // Everything inside Experimental is subject to change and is not covered
  // by the API stability guarantees in
  // https://www.tensorflow.org/programmers_guide/version_compat.
  message Experimental {
    // Configuration for breaking down a visible GPU into multiple "virtual"
    // devices.
    message VirtualDevices {
      // Per "virtual" device memory limit, in MB. The number of elements in
      // the list is the number of virtual devices to create on the
      // corresponding visible GPU (see "virtual_devices" below).
      // If empty, a single virtual device will be created that takes all
      // available memory from the device.
      //
      // For the concepts of "visible" and "virtual" GPU, see the comments
      // for "visible_device_list" above.
      repeated float memory_limit_mb = 1;
    }

    // The multi virtual device settings. If empty (not set), a single virtual
    // device will be created on each visible GPU, according to the settings
    // in "visible_device_list" above. Otherwise, the number of elements in the
    // list must be the same as the number of visible GPUs (after
    // "visible_device_list" filtering if it is set), and the string-represented
    // device names (e.g. /device:GPU:<id>) will refer to the virtual
    // devices and have the <id> field assigned sequentially starting from 0,
    // according to the order they appear in this list and the "memory_limit_mb"
    // list inside each element. For example,
    //   visible_device_list = "1,0"
    //   virtual_devices { memory_limit_mb: 1024 memory_limit_mb: 2048 }
    //   virtual_devices {}
    // will create three virtual devices as:
    //   /device:GPU:0 -> visible GPU 1 with 1GB memory
    //   /device:GPU:1 -> visible GPU 1 with 2GB memory
    //   /device:GPU:2 -> visible GPU 0 with all available memory
    //
    // NOTE:
    // 1. It's invalid to set both this and "per_process_gpu_memory_fraction"
    //    at the same time.
    // 2. Currently this setting is per-process, not per-session. Using
    //    different settings in different sessions within the same process will
    //    result in undefined behavior.
    repeated VirtualDevices virtual_devices = 1;
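
    // A minimal Python sketch of the example above (assuming the
    // tf.ConfigProto binding; memory limits are given in MB):
    //
    //   config = tf.ConfigProto()
    //   config.gpu_options.visible_device_list = "1,0"
    //   # Split visible GPU 1 into two virtual devices of 1GB and 2GB.
    //   vd = config.gpu_options.experimental.virtual_devices.add()
    //   vd.memory_limit_mb.extend([1024, 2048])
    //   # Visible GPU 0 becomes one virtual device with all available memory.
    //   config.gpu_options.experimental.virtual_devices.add()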
  }

  Experimental experimental = 9;
};

// Options passed to the graph optimizer
message OptimizerOptions {
  // If true, optimize the graph using common subexpression elimination.
  bool do_common_subexpression_elimination = 1;

  // If true, perform constant folding optimization on the graph.
  bool do_constant_folding = 2;

  // Constant folding optimization replaces tensors whose values can be
  // predetermined with constant nodes. To avoid inserting overly large
  // constants, the size of each constant created can be limited. If this
  // value is zero, a default limit of 10 MiB will be applied. If constant
  // folding optimization is disabled, this value is ignored.
  int64 max_folded_constant_in_bytes = 6;

  // If true, perform function inlining on the graph.
  bool do_function_inlining = 4;

  // Optimization level
  enum Level {
    // L1 is the default level.
    // Optimizations performed at L1:
    // 1. Common subexpression elimination
    // 2. Constant folding
    L1 = 0;

    // No optimizations
    L0 = -1;
  }

  // Overall optimization level. The actual optimizations applied will be the
  // logical OR of the flags that this level implies and any flags already set.
  Level opt_level = 3;

  // Control the use of the compiler/jit.  Experimental.
  enum GlobalJitLevel {
    DEFAULT = 0;  // Default setting ("off" now, but later expected to be "on")
    OFF = -1;
    // The following settings turn on compilation, with higher values being
    // more aggressive.  Higher values may reduce opportunities for parallelism
    // and may use more memory.  (At present, there is no distinction, but this
    // is expected to change.)
    ON_1 = 1;
    ON_2 = 2;
  }
  GlobalJitLevel global_jit_level = 5;
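
  // For illustration, both knobs above are reachable from the Python client;
  // a minimal sketch, assuming the TF 1.x tf.ConfigProto / tf.OptimizerOptions
  // bindings:
  //
  //   config = tf.ConfigProto()
  //   opts = config.graph_options.optimizer_options
  //   opts.opt_level = tf.OptimizerOptions.L1
  //   opts.global_jit_level = tf.OptimizerOptions.ON_1  # turn on the JIT
  //   sess = tf.Session(config=config)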
}

message GraphOptions {
  // Removed, use optimizer_options below.
  reserved "skip_common_subexpression_elimination";
  reserved 1;

  // If true, use control flow to schedule the activation of Recv nodes.
  // (Currently ignored.)
  bool enable_recv_scheduling = 2;

  // Options controlling how the graph is optimized.
  OptimizerOptions optimizer_options = 3;

  // The number of steps to run before returning a cost model detailing
  // the memory usage and performance of each node of the graph. 0 means
  // no cost model.
  int64 build_cost_model = 4;

  // The number of steps to skip before collecting statistics for the
  // cost model.
  int64 build_cost_model_after = 9;

  // Annotate each Node with Op output shape data, to the extent it can
  // be statically inferred.
  bool infer_shapes = 5;

  // Only place the subgraphs that are run, rather than the entire graph.
  //
  // This is useful for interactive graph building, where one might
  // produce graphs that cannot be placed during the debugging
  // process.  In particular, it allows the client to continue work in
  // a session after adding a node to a graph whose placement
  // constraints are unsatisfiable.
  bool place_pruned_graph = 6;

  // If true, transfer float values between processes as bfloat16.
  bool enable_bfloat16_sendrecv = 7;

  // If > 0, record a timeline every this many steps.
  // EXPERIMENTAL: This currently has no effect in MasterSession.
  int32 timeline_step = 8;

  // Options that control the type and amount of graph rewriting.
  // Not currently configurable via the public Python API (i.e. there is no API
  // stability guarantee if you import RewriterConfig explicitly).
  RewriterConfig rewrite_options = 10;
};

message ThreadPoolOptionProto {
  // The number of threads in the pool.
  //
  // 0 means the system picks a value based on where this option proto is used
  // (see the declaration of the specific field for more info).
  int32 num_threads = 1;

  // The global name of the threadpool.
  //
  // If empty, then the threadpool is made and used according to the scope it's
  // in - e.g., for a session threadpool, it is used by that session only.
  //
  // If non-empty, then:
  // - a global threadpool associated with this name is looked
  //   up or created. This allows, for example, sharing one threadpool across
  //   many sessions (e.g., like the default behavior, if
  //   inter_op_parallelism_threads is not configured), but still partitioning
  //   into a large and small pool.
  // - if the threadpool for this global_name already exists, then it is an
  //   error if the existing pool was created using a different num_threads
  //   value than is specified on this call.
  // - threadpools created this way are never garbage collected.
  string global_name = 2;
};

message RPCOptions {
  // If true, always use RPC to contact the session target.
  //
  // If false (the default option), TensorFlow may use an optimized
  // transport for client-master communication that avoids the RPC
  // stack. This option is primarily used for testing the RPC stack.
  bool use_rpc_for_inprocess_master = 1;
};

// Session configuration parameters.
// The system picks appropriate values for fields that are not set.
message ConfigProto {
  // Map from device type name (e.g., "CPU" or "GPU") to maximum
  // number of devices of that type to use.  If a particular device
  // type is not found in the map, the system picks an appropriate
  // number.
  map<string, int32> device_count = 1;
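
  // For example, to run a session without any GPU devices (a sketch using the
  // Python tf.ConfigProto binding):
  //
  //   config = tf.ConfigProto(device_count={"GPU": 0})
  //   sess = tf.Session(config=config)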

  // The execution of an individual op (for some op types) can be
  // parallelized on a pool of intra_op_parallelism_threads.
  // 0 means the system picks an appropriate number.
  int32 intra_op_parallelism_threads = 2;

  // Nodes that perform blocking operations are enqueued on a pool of
  // inter_op_parallelism_threads available in each process.
  //
  // 0 means the system picks an appropriate number.
  //
  // Note that the first Session created in the process sets the
  // number of threads for all future sessions unless use_per_session_threads is
  // true or session_inter_op_thread_pool is configured.
  int32 inter_op_parallelism_threads = 5;
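
  // For example, to pin both pools to two threads each (a sketch using the
  // Python tf.ConfigProto binding):
  //
  //   config = tf.ConfigProto(intra_op_parallelism_threads=2,
  //                           inter_op_parallelism_threads=2)
  //   sess = tf.Session(config=config)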

  // If true, use a new set of threads for this session rather than the global
  // pool of threads. Only supported by direct sessions.
  //
  // If false, use the global threads created by the first session, or the
  // per-session thread pools configured by session_inter_op_thread_pool.
  //
  // This option is deprecated. The same effect can be achieved by setting
  // session_inter_op_thread_pool to have one element, whose num_threads equals
  // inter_op_parallelism_threads.
  bool use_per_session_threads = 9;

  // This option is experimental - it may be replaced with a different mechanism
  // in the future.
  //
  // Configures session thread pools. If this is configured, then RunOptions for
  // a Run call can select the thread pool to use.
  //
  // The intended use is for when some session invocations need to run in a
  // background pool limited to a small number of threads:
  // - For example, a session may be configured to have one large pool (for
  // regular compute) and one small pool (for periodic, low priority work);
  // using the small pool is currently the mechanism for limiting the inter-op
  // parallelism of the low priority work.  Note that it does not limit the
  // parallelism of work spawned by a single op kernel implementation.
  // - Using this setting is normally not needed in training, but may help some
  // serving use cases.
  // - It is also generally recommended to set the global_name field of this
  // proto, to avoid creating multiple large pools. It is typically better to
  // run the non-low-priority work, even across sessions, in a single large
  // pool.
  repeated ThreadPoolOptionProto session_inter_op_thread_pool = 12;
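
  // A sketch of the "one large pool, one small pool" setup described above,
  // using the Python tf.ConfigProto binding (the pool names are illustrative):
  //
  //   config = tf.ConfigProto()
  //   config.session_inter_op_thread_pool.add(
  //       num_threads=16, global_name="shared_large_pool")
  //   config.session_inter_op_thread_pool.add(
  //       num_threads=1, global_name="shared_small_pool")
  //   sess = tf.Session(config=config)
  //   # A Run() call selects a pool by index via RunOptions.inter_op_thread_pool
  //   # (see RunOptions below), e.g. index 1 for the small background pool.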

  // Assignment of Nodes to Devices is recomputed every placement_period
  // steps until the system warms up (at which point the recomputation
  // typically slows down automatically).
  int32 placement_period = 3;

  // When any filters are present, sessions will ignore all devices which do
  // not match the filters. Each filter can be partially specified, e.g.
  // "/job:ps", "/job:worker/replica:3", etc.
  repeated string device_filters = 4;

  // Options that apply to all GPUs.
  GPUOptions gpu_options = 6;

  // Whether soft placement is allowed. If allow_soft_placement is true,
  // an op will be placed on CPU if
  //   1. there's no GPU implementation for the op
  // or
  //   2. no GPU devices are known or registered
  // or
  //   3. it needs to be co-located with reftype input(s) which are on the CPU.
  bool allow_soft_placement = 7;

  // Whether device placements should be logged.
  bool log_device_placement = 8;
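
  // For example (a sketch using the Python tf.ConfigProto binding):
  //
  //   config = tf.ConfigProto(allow_soft_placement=True,
  //                           log_device_placement=True)
  //   sess = tf.Session(config=config)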

  // Options that apply to all graphs.
  GraphOptions graph_options = 10;

  // Global timeout for all blocking operations in this session.  If non-zero,
  // and not overridden on a per-operation basis, this value will be used as the
  // deadline for all blocking operations.
  int64 operation_timeout_in_ms = 11;

  // Options that apply when this session uses the distributed runtime.
  RPCOptions rpc_options = 13;

  // Optional list of all workers to use in this session.
  ClusterDef cluster_def = 14;
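
  // For example, a cluster_def can be built with the Python
  // tf.train.ClusterSpec helper (the addresses here are placeholders):
  //
  //   cluster = tf.train.ClusterSpec({"ps": ["ps0.example.com:2222"],
  //                                   "worker": ["worker0.example.com:2222"]})
  //   config = tf.ConfigProto()
  //   config.cluster_def.CopyFrom(cluster.as_cluster_def())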

  // If true, any resources such as Variables used in the session will not be
  // shared with other sessions.
  bool isolate_session_state = 15;

  // Next: 16
};

// Options for a single Run() call.
message RunOptions {
  // TODO(pbar) Turn this into a TraceOptions proto which allows
  // tracing to be controlled in a more orthogonal manner?
  enum TraceLevel {
    NO_TRACE = 0;
    SOFTWARE_TRACE = 1;
    HARDWARE_TRACE = 2;
    FULL_TRACE = 3;
  }
  TraceLevel trace_level = 1;

  // Time to wait for the operation to complete, in milliseconds.
  int64 timeout_in_ms = 2;

  // The thread pool to use, if session_inter_op_thread_pool is configured.
  int32 inter_op_thread_pool = 3;

  // Whether the partition graph(s) executed by the executor(s) should be
  // output via RunMetadata.
  bool output_partition_graphs = 5;

  // EXPERIMENTAL.  Options used to initialize DebuggerState, if enabled.
  DebugOptions debug_options = 6;

  // When enabled, causes tensor allocation information to be included in
  // the error message when the Run() call fails because the allocator ran
  // out of memory (OOM).
  //
  // Enabling this option can slow down the Run() call.
  bool report_tensor_allocations_upon_oom = 7;

  reserved 4;
}

// Metadata output (i.e., non-Tensor) for a single Run() call.
message RunMetadata {
  // Statistics traced for this step. Populated if tracing is turned on via the
  // "RunOptions" proto.
  // EXPERIMENTAL: The format and set of events may change in future versions.
  StepStats step_stats = 1;

  // The cost graph for the computation defined by the run call.
  CostGraphDef cost_graph = 2;

  // Graphs of the partitions executed by executors.
  repeated GraphDef partition_graphs = 3;
}
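
// As an end-to-end illustration of RunOptions and RunMetadata, a minimal
// Python sketch using the TF 1.x client (the timeline helper lives in the
// non-public tensorflow.python namespace and is shown only as one possible
// consumer of step_stats):
//
//   run_options = tf.RunOptions(trace_level=tf.RunOptions.FULL_TRACE)
//   run_metadata = tf.RunMetadata()
//   sess.run(fetches, options=run_options, run_metadata=run_metadata)
//   # run_metadata.step_stats now holds the per-node execution trace; it can
//   # be converted to a Chrome trace with, e.g.,
//   # tensorflow.python.client.timeline.Timeline(run_metadata.step_stats).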