1syntax = "proto3";
2
3package tensorflow;
4option cc_enable_arenas = true;
5option java_outer_classname = "ConfigProtos";
6option java_multiple_files = true;
7option java_package = "org.tensorflow.framework";
8// add go_package externally with copybara
9import "tensorflow/core/framework/cost_graph.proto";
10import "tensorflow/core/framework/graph.proto";
11import "tensorflow/core/framework/step_stats.proto";
12import "tensorflow/core/protobuf/debug.proto";
13import "tensorflow/core/protobuf/cluster.proto";
14import "tensorflow/core/protobuf/rewriter_config.proto";
15
// Options that apply to all GPUs: memory allocation, device visibility and
// event polling behavior.
message GPUOptions {
  // Upper bound on the share of available GPU memory that each process may
  // allocate. A value of 1 allocates all of the GPU memory; 0.5 lets the
  // process allocate up to ~50% of the available GPU memory.
  //
  // Unless the allow_growth option is enabled, GPU memory is pre-allocated.
  //
  // A value greater than 1.0 uses CUDA unified memory to potentially
  // oversubscribe the memory available on the GPU device, with host memory
  // acting as swap space. Accessing memory that is not available on the device
  // is significantly slower because it requires a memory transfer between the
  // host and the device. Because of this potential negative performance
  // impact, consider options that reduce the memory requirement before
  // enabling oversubscription. Oversubscription using unified memory requires
  // Pascal class or newer GPUs and is currently only supported on the Linux
  // operating system. See
  // https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#um-requirements
  // for the detailed requirements.
  double per_process_gpu_memory_fraction = 1;

  // When true, the allocator starts small and grows as needed instead of
  // pre-allocating the entire specified GPU memory region.
  bool allow_growth = 4;

  // Selects the GPU allocation strategy.
  //
  // Allowed values:
  // "": The empty string (default) uses a system-chosen default
  //     which may change over time.
  //
  // "BFC": A "Best-fit with coalescing" algorithm, simplified from a
  //        version of dlmalloc.
  string allocator_type = 2;

  // Deletion of up to this many bytes is delayed, to reduce the number of
  // interactions with gpu driver code. If 0, the system chooses a reasonable
  // default (several MBs).
  int64 deferred_deletion_bytes = 3;

  // Comma-separated list of GPU ids that defines the 'visible' to 'virtual'
  // mapping of GPU devices. For example, if TensorFlow can see 8 GPU devices
  // in the process and one wants visible GPU devices 5 and 3 to be mapped to
  // "/device:GPU:0" and "/device:GPU:1", this field is set to "5,3". The
  // field is similar in spirit to the CUDA_VISIBLE_DEVICES environment
  // variable, except that it applies to the visible GPU devices in the
  // process.
  //
  // NOTE:
  // 1. The order in which the GPU driver presents the visible GPUs to the
  //    process is not guaranteed to correlate with the *physical* GPU id in
  //    the machine. This field remaps "visible" to "virtual", so it only
  //    operates after the process starts. To control the physical to visible
  //    device mapping, users must use vendor specific mechanisms (e.g.,
  //    CUDA_VISIBLE_DEVICES) before invoking TensorFlow.
  // 2. In the code, the ids in this list are also called "platform GPU id"s,
  //    and the 'virtual' ids of GPU devices (i.e. the ids in the device name
  //    "/device:GPU:<id>") are also called "TF GPU id"s. See
  //    third_party/tensorflow/core/common_runtime/gpu/gpu_id.h for more
  //    information.
  string visible_device_list = 5;

  // Number of microseconds the event polling loop sleeps between PollEvents
  // calls while the queue is not empty. If unset or set to 0, a non-zero
  // default is used.
  int32 polling_active_delay_usecs = 6;

  // Deprecated: this field is ignored. It is kept only so that its name and
  // field number are not reused.
  int32 polling_inactive_delay_msecs = 7 [deprecated = true];

  // Force all tensors to be gpu_compatible. On a GPU-enabled TensorFlow,
  // enabling this option forces all CPU tensors to be allocated with Cuda
  // pinned memory. Normally TensorFlow infers which tensors should be
  // allocated as pinned memory; where that inference is incomplete, this
  // option can significantly speed up cross-device memory copies, as long as
  // everything fits in memory.
  //
  // This option should not be enabled by default for unknown or very large
  // models: all Cuda pinned memory is unpageable, so too much pinned memory
  // might negatively impact the overall host system performance.
  bool force_gpu_compatible = 8;

  // Experimental GPU settings; see the comment on the `experimental` field
  // of GPUOptions regarding API stability.
  message Experimental {
    // Configuration for breaking down a visible GPU into multiple "virtual"
    // devices.
    message VirtualDevices {
      // Per "virtual" device memory limit, in MB. The number of elements in
      // this list is the number of virtual devices to create on the
      // corresponding visible GPU (see the "virtual_devices" field of
      // Experimental). If empty, a single virtual device is created that takes
      // all available memory from the device.
      //
      // For the concept of "visible" and "virtual" GPU, see the comments on
      // GPUOptions.visible_device_list.
      repeated float memory_limit_mb = 1;
    }

    // The multi virtual device settings. If empty (not set), a single virtual
    // device is created on each visible GPU, according to the settings in
    // "visible_device_list". Otherwise, the number of elements in the list
    // must equal the number of visible GPUs (after "visible_device_list"
    // filtering if it is set), and the string represented device names (e.g.
    // /device:GPU:<id>) refer to the virtual devices, with the <id> field
    // assigned sequentially starting from 0 in the order the devices appear
    // in this list and in the "memory_limit_mb" list inside each element.
    // For example,
    //   visible_device_list = "1,0"
    //   virtual_devices { memory_limit_mb: 1024 memory_limit_mb: 2048 }
    //   virtual_devices {}
    // will create three virtual devices as:
    //   /device:GPU:0 -> visible GPU 1 with 1GB memory
    //   /device:GPU:1 -> visible GPU 1 with 2GB memory
    //   /device:GPU:2 -> visible GPU 0 with all available memory
    //
    // NOTE:
    // 1. It's invalid to set both this and "per_process_gpu_memory_fraction"
    //    at the same time.
    // 2. Currently this setting is per-process, not per-session. Using
    //    different settings in different sessions within the same process
    //    will result in undefined behavior.
    repeated VirtualDevices virtual_devices = 1;

    // When true, CUDA unified memory is used for memory allocations. If the
    // per_process_gpu_memory_fraction option is greater than 1.0, unified
    // memory is used regardless of the value of this field. See the comments
    // on per_process_gpu_memory_fraction for more details and for the
    // requirements of unified memory. This option is useful to oversubscribe
    // memory when multiple processes share a single GPU while each
    // individually uses a per process memory fraction of less than 1.0.
    bool use_unified_memory = 2;

    // If > 1, the number of device-to-device copy streams to create for each
    // GPUDevice. The default value is 0, which is automatically converted
    // to 1.
    int32 num_dev_to_dev_copy_streams = 3;

    // If non-empty, defines a good GPU ring order on a single worker based on
    // device interconnect. This assumes that all workers have the same GPU
    // topology. Specified as a comma-separated string, e.g.
    // "3,2,1,0,7,6,5,4". This ring order is used by the RingReducer
    // implementation of CollectiveReduce, and overrides the automatic ring
    // order generation in OrderTaskDeviceMap() during CollectiveParam
    // resolution.
    string collective_ring_order = 4;

    // When true, GPUDevice and GPUBFCAllocator do extra work to keep track of
    // when GPU memory is freed and when kernels actually complete, so that it
    // is known when a nominally free memory chunk is really not subject to
    // pending use.
    bool timestamped_allocator = 5;

    // If > 0, limits the number of pending kernels on any compute stream to
    // this number.
    int32 pending_cap = 6;
  }

  // Everything inside experimental is subject to change and is not subject
  // to API stability guarantees in
  // https://www.tensorflow.org/guide/version_compat.
  Experimental experimental = 9;
}
176
// Settings handed to the graph optimizer.
message OptimizerOptions {
  // When true, common subexpression elimination is applied to the graph.
  bool do_common_subexpression_elimination = 1;

  // When true, the constant folding optimization is applied to the graph.
  bool do_constant_folding = 2;

  // Size limit for constants created by constant folding. Constant folding
  // replaces tensors whose values can be predetermined with constant nodes;
  // limiting the size of each created constant avoids inserting constants
  // that are too large. Zero selects a default limit of 10 MiB. The value is
  // ignored when the constant folding optimization is disabled.
  int64 max_folded_constant_in_bytes = 6;

  // When true, function inlining is performed on the graph.
  bool do_function_inlining = 4;

  // Optimization level.
  enum Level {
    // The default level. Optimizations performed at L1:
    //   1. Common subexpression elimination
    //   2. Constant folding
    L1 = 0;

    // No optimizations.
    L0 = -1;
  }

  // Overall optimization level. The optimizations actually applied are the
  // logical OR of the flags implied by this level and any flags already set.
  Level opt_level = 3;

  // Controls the use of the compiler/jit. Experimental.
  enum GlobalJitLevel {
    // Default setting ("off" now, but later expected to be "on").
    DEFAULT = 0;

    OFF = -1;

    // The settings below turn on compilation; higher values are more
    // aggressive. Higher values may reduce opportunities for parallelism and
    // may use more memory. (At present there is no distinction, but this is
    // expected to change.)
    ON_1 = 1;
    ON_2 = 2;
  }

  // The jit level to use; see GlobalJitLevel.
  GlobalJitLevel global_jit_level = 5;
}
224
// Options that apply to all graphs: optimization, cost model collection,
// shape inference, placement and rewriting.
message GraphOptions {
  // "skip_common_subexpression_elimination" was removed; use
  // optimizer_options instead. Its name and field number stay reserved so
  // they cannot be reused.
  reserved 1;
  reserved "skip_common_subexpression_elimination";

  // If true, use control flow to schedule the activation of Recv nodes.
  // (Currently ignored.)
  bool enable_recv_scheduling = 2;

  // Options controlling how the graph is optimized.
  OptimizerOptions optimizer_options = 3;

  // Number of steps to run before returning a cost model detailing the
  // memory usage and performance of each node of the graph. 0 means no cost
  // model.
  int64 build_cost_model = 4;

  // Number of steps to skip before collecting statistics for the cost model.
  int64 build_cost_model_after = 9;

  // Annotate each Node with Op output shape data, to the extent it can be
  // statically inferred.
  bool infer_shapes = 5;

  // Only place the subgraphs that are run, rather than the entire graph.
  //
  // This is useful for interactive graph building, where one might produce
  // graphs that cannot be placed during the debugging process. In
  // particular, it allows the client to continue work in a session after
  // adding a node to a graph whose placement constraints are unsatisfiable.
  bool place_pruned_graph = 6;

  // If true, transfer float values between processes as bfloat16.
  bool enable_bfloat16_sendrecv = 7;

  // If > 0, record a timeline every this many steps.
  // EXPERIMENTAL: This currently has no effect in MasterSession.
  int32 timeline_step = 8;

  // Options that control the type and amount of graph rewriting.
  // Not currently configurable via the public Python API (i.e. there is no
  // API stability guarantee if you import RewriterConfig explicitly).
  RewriterConfig rewrite_options = 10;
}
271
// Configuration of a single thread pool: its size and, optionally, a global
// name under which it is shared.
message ThreadPoolOptionProto {
  // The number of threads in the pool.
  //
  // 0 means the system picks a value based on where this option proto is used
  // (see the declaration of the specific field for more info).
  int32 num_threads = 1;

  // The global name of the threadpool.
  //
  // If empty, then the threadpool is made and used according to the scope it's
  // in - e.g., for a session threadpool, it is used by that session only.
  //
  // If non-empty, then:
  // - a global threadpool associated with this name is looked up or created.
  //   This allows, for example, sharing one threadpool across many sessions
  //   (e.g., like the default behavior, if inter_op_parallelism_threads is
  //   not configured), but still partitioning into a large and small pool.
  // - if the threadpool for this global_name already exists, then it is an
  //   error if the existing pool was created using a different num_threads
  //   value than is specified on this call.
  // - threadpools created this way are never garbage collected.
  string global_name = 2;
}
296
// Options controlling RPC usage and RPC compression.
message RPCOptions {
  // If true, always use RPC to contact the session target.
  //
  // If false (the default option), TensorFlow may use an optimized
  // transport for client-master communication that avoids the RPC
  // stack. This option is primarily used for testing the RPC stack.
  bool use_rpc_for_inprocess_master = 1;

  // The compression algorithm to be used. One of "deflate", "gzip".
  string compression_algorithm = 2;

  // If compression_algorithm is set, the compression level to be used.
  // From 0 (no compression), up to 3.
  int32 compression_level = 3;
}
312
// Session configuration parameters.
// The system picks appropriate values for fields that are not set.
message ConfigProto {
  // Map from device type name (e.g., "CPU" or "GPU") to the maximum number
  // of devices of that type to use. If a particular device type is not found
  // in the map, the system picks an appropriate number.
  map<string, int32> device_count = 1;

  // The execution of an individual op (for some op types) can be
  // parallelized on a pool of intra_op_parallelism_threads.
  // 0 means the system picks an appropriate number.
  int32 intra_op_parallelism_threads = 2;

  // Nodes that perform blocking operations are enqueued on a pool of
  // inter_op_parallelism_threads available in each process.
  //
  // 0 means the system picks an appropriate number.
  //
  // Note that the first Session created in the process sets the number of
  // threads for all future sessions unless use_per_session_threads is true
  // or session_inter_op_thread_pool is configured.
  int32 inter_op_parallelism_threads = 5;

  // If true, use a new set of threads for this session rather than the global
  // pool of threads. Only supported by direct sessions.
  //
  // If false, use the global threads created by the first session, or the
  // per-session thread pools configured by session_inter_op_thread_pool.
  //
  // This option is deprecated. The same effect can be achieved by setting
  // session_inter_op_thread_pool to have one element, whose num_threads equals
  // inter_op_parallelism_threads.
  bool use_per_session_threads = 9;

  // This option is experimental - it may be replaced with a different mechanism
  // in the future.
  //
  // Configures session thread pools. If this is configured, then RunOptions for
  // a Run call can select the thread pool to use.
  //
  // The intended use is for when some session invocations need to run in a
  // background pool limited to a small number of threads:
  // - For example, a session may be configured to have one large pool (for
  //   regular compute) and one small pool (for periodic, low priority work);
  //   using the small pool is currently the mechanism for limiting the
  //   inter-op parallelism of the low priority work. Note that it does not
  //   limit the parallelism of work spawned by a single op kernel
  //   implementation.
  // - Using this setting is normally not needed in training, but may help
  //   some serving use cases.
  // - It is also generally recommended to set the global_name field of this
  //   proto, to avoid creating multiple large pools. It is typically better
  //   to run the non-low-priority work, even across sessions, in a single
  //   large pool.
  repeated ThreadPoolOptionProto session_inter_op_thread_pool = 12;

  // Assignment of Nodes to Devices is recomputed every placement_period
  // steps until the system warms up (at which point the recomputation
  // typically slows down automatically).
  int32 placement_period = 3;

  // When any filters are present sessions will ignore all devices which do not
  // match the filters. Each filter can be partially specified, e.g. "/job:ps"
  // "/job:worker/replica:3", etc.
  repeated string device_filters = 4;

  // Options that apply to all GPUs.
  GPUOptions gpu_options = 6;

  // Whether soft placement is allowed. If allow_soft_placement is true,
  // an op will be placed on CPU if
  //   1. there's no GPU implementation for the OP
  // or
  //   2. no GPU devices are known or registered
  // or
  //   3. need to co-locate with reftype input(s) which are from CPU.
  bool allow_soft_placement = 7;

  // Whether device placements should be logged.
  bool log_device_placement = 8;

  // Options that apply to all graphs.
  GraphOptions graph_options = 10;

  // Global timeout for all blocking operations in this session. If non-zero,
  // and not overridden on a per-operation basis, this value will be used as the
  // deadline for all blocking operations.
  int64 operation_timeout_in_ms = 11;

  // Options that apply when this session uses the distributed runtime.
  RPCOptions rpc_options = 13;

  // Optional list of all workers to use in this session.
  ClusterDef cluster_def = 14;

  // If true, any resources such as Variables used in the session will not be
  // shared with other sessions.
  bool isolate_session_state = 15;

  // Everything inside Experimental is subject to change and is not subject
  // to API stability guarantees in
  // https://www.tensorflow.org/guide/version_compat.
  message Experimental {
    // Task name for group resolution.
    string collective_group_leader = 1;

    // The flag client_handles_error_formatting was removed. Both its tag
    // number and its name are reserved so that neither can be reused by
    // accident.
    // TODO(shikharagarwal): Should we just remove this tag so that it can be
    // used in future for other purpose?
    reserved 2;
    reserved "client_handles_error_formatting";

    // Which executor to use. The default executor is used if this is an
    // empty string or "DEFAULT".
    string executor_type = 3;

    // Guidance to formatting of large RecvBuf fields for transfer.
    // Any positive value sets the max chunk size. 0 defaults to 4096.
    // Any negative value indicates no max, i.e. one chunk only.
    int32 recv_buf_max_chunk = 4;

    // If true, and supported by the platform, the runtime will attempt to
    // use NUMA affinity where applicable. One consequence will be the
    // existence of as many CPU devices as there are available NUMA nodes.
    bool use_numa_affinity = 5;

    // If true, make collective op execution order sequential and deterministic
    // for potentially concurrent collective instances.
    bool collective_deterministic_sequential_execution = 6;

    // If true, use NCCL for CollectiveOps. This feature is highly
    // experimental.
    bool collective_nccl = 7;
  }

  // Experimental settings; see the comment on the Experimental message.
  Experimental experimental = 16;

  // Next: 17
}
452
// Options for a single Run() call.
message RunOptions {
  // TODO(pbar) Turn this into a TraceOptions proto which allows
  // tracing to be controlled in a more orthogonal manner?
  enum TraceLevel {
    NO_TRACE = 0;
    SOFTWARE_TRACE = 1;
    HARDWARE_TRACE = 2;
    FULL_TRACE = 3;
  }

  // The trace level to use for this call; see TraceLevel.
  TraceLevel trace_level = 1;

  // Time to wait for operation to complete in milliseconds.
  int64 timeout_in_ms = 2;

  // The thread pool to use, if session_inter_op_thread_pool is configured.
  // To use the caller thread set this to -1 - this uses the caller thread
  // to execute Session::Run() and thus avoids a context switch. Using the
  // caller thread to execute Session::Run() should be done ONLY for simple
  // graphs, where the overhead of an additional context switch is
  // comparable with the overhead of Session::Run().
  int32 inter_op_thread_pool = 3;

  // Whether the partition graph(s) executed by the executor(s) should be
  // outputted via RunMetadata.
  bool output_partition_graphs = 5;

  // EXPERIMENTAL. Options used to initialize DebuggerState, if enabled.
  DebugOptions debug_options = 6;

  // When enabled, causes tensor allocation information to be included in
  // the error message when the Run() call fails because the allocator ran
  // out of memory (OOM).
  //
  // Enabling this option can slow down the Run() call.
  bool report_tensor_allocations_upon_oom = 7;

  // Everything inside Experimental is subject to change and is not subject
  // to API stability guarantees in
  // https://www.tensorflow.org/guide/version_compat.
  message Experimental {
    // If non-zero, declares that this graph is going to use collective ops
    // and must synchronize step_ids with any other graph that has this same
    // key value (in a distributed computation where tasks run disjoint
    // graphs).
    int64 collective_graph_key = 1;

    // If true, then operations (using the inter-op pool) across all
    // session::run() calls will be centrally scheduled, optimizing for (median
    // and tail) latency.
    // Consider using this option for CPU-bound workloads like inference.
    bool use_run_handler_pool = 2;
  }

  // Experimental settings; see the comment on the Experimental message.
  Experimental experimental = 8;

  // Field number 4 is reserved and must not be reused.
  reserved 4;
}
510
// Non-Tensor (metadata) output produced by a single Run() call.
message RunMetadata {
  // Statistics traced for this step; populated when tracing is turned on via
  // the "RunOptions" proto.
  // EXPERIMENTAL: The format and set of events may change in future versions.
  StepStats step_stats = 1;

  // Cost graph for the computation defined by the run call.
  CostGraphDef cost_graph = 2;

  // Graphs of the partitions that were executed by executors.
  repeated GraphDef partition_graphs = 3;

  // The graphs recorded for one traced function.
  message FunctionGraphs {
    // TODO(nareshmodi): Include some sort of function/cache-key identifier?
    repeated GraphDef partition_graphs = 1;

    GraphDef pre_optimization_graph = 2;
    GraphDef post_optimization_graph = 3;
  }

  // Only populated for graphs that are run as functions in TensorFlow V2,
  // with one entry for each function that is traced.
  //
  // The post_optimization_graph and the partition_graphs mainly serve to give
  // the caller insight into the graphs that were actually run by the runtime;
  // additional information (such as that in step_stats) will match these
  // graphs.
  //
  // The pre_optimization_graph is included as well because it is usually
  // easier to read, and it helps when the caller wants a high level idea of
  // what the built graph looks like (the various graph optimization passes
  // might change the structure of the graph significantly).
  repeated FunctionGraphs function_graphs = 4;
}
543
// Describes a connection between two tensors in a `GraphDef`.
message TensorConnection {
  // Name of the source tensor. Its value is substituted for the tensor named
  // in `to_tensor`.
  string from_tensor = 1;

  // Name of the destination tensor. Its value is bound to the value of the
  // tensor named in `from_tensor`.
  string to_tensor = 2;
}
554
// Describes a subgraph in another `GraphDef` as a set of feed points and
// nodes to be fetched or executed.
//
// Compare with the arguments to `Session::Run()`.
message CallableOptions {
  // Names of the tensors to be fed in the callable, one tensor name per feed.
  repeated string feed = 1;

  // Fetches, as a list of tensor names. The caller of the callable expects a
  // tensor to be returned for each fetch[i] (see RunStepResponse.tensor). The
  // execution order is not changed by the order of the specified fetches.
  repeated string fetch = 2;

  // Target Nodes, as a list of node names. The callable runs the named nodes
  // but does not return their outputs.
  repeated string target = 3;

  // Options applied to each run.
  RunOptions run_options = 4;

  // Tensors to be connected in the callable. Each TensorConnection denotes a
  // pair of tensors in the graph between which an edge will be created in
  // the callable.
  repeated TensorConnection tensor_connection = 5;

  // By default, the Tensor objects fed in the callable and fetched from the
  // callable are expected to be backed by host (CPU) memory.
  //
  // The feed_devices and fetch_devices options allow changing that - feeding
  // tensors backed by device memory, or returning tensors that are backed by
  // device memory.
  //
  // Each map takes the name of a feed/fetch tensor (which appears in the
  // 'feed' or 'fetch' fields) to the fully qualified name of the device
  // owning the memory backing the contents of the tensor.
  //
  // For example, creating a callable with the following options:
  //
  // CallableOptions {
  //   feed: "a:0"
  //   feed: "b:0"
  //
  //   fetch: "x:0"
  //   fetch: "y:0"
  //
  //   feed_devices: {
  //     "a:0": "/job:localhost/replica:0/task:0/device:GPU:0"
  //   }
  //
  //   fetch_devices: {
  //     "y:0": "/job:localhost/replica:0/task:0/device:GPU:0"
  //   }
  // }
  //
  // means that the Callable expects:
  // - The first argument ("a:0") is a Tensor backed by GPU memory.
  // - The second argument ("b:0") is a Tensor backed by host memory.
  // and of its return values:
  // - The first output ("x:0") will be backed by host memory.
  // - The second output ("y:0") will be backed by GPU memory.
  //
  // FEEDS:
  // The caller is responsible for ensuring that the memory of the fed tensors
  // is correctly initialized and synchronized before it is accessed by
  // operations executed during the call to Session::RunCallable().
  //
  // This is typically ensured by using the TensorFlow memory allocators
  // (Device::GetAllocator()) to create the Tensor to be fed.
  //
  // Alternatively, for CUDA-enabled GPU devices, this typically means that the
  // operation that produced the contents of the tensor has completed, i.e., the
  // CUDA stream has been synchronized (e.g., via cuCtxSynchronize() or
  // cuStreamSynchronize()).
  map<string, string> feed_devices = 6;

  // Map from the name of a fetch tensor to the fully qualified name of the
  // device owning the memory backing it; see the comment on feed_devices.
  map<string, string> fetch_devices = 7;

  // By default, RunCallable() synchronizes the GPU stream before returning
  // fetched tensors on a GPU device, to ensure that the values in those
  // tensors have been produced. This simplifies interacting with the tensors,
  // but potentially incurs a performance hit.
  //
  // If this option is set to true, the caller is responsible for ensuring
  // that the values in the fetched tensors have been produced before they are
  // used. The caller can do this by invoking `Device::Sync()` on the underlying
  // device(s), or by feeding the tensors back to the same Session using
  // `feed_devices` with the same corresponding device name.
  bool fetch_skip_sync = 8;

  // Next: 9
}
644