syntax = "proto3";

package tensorflow;
option cc_enable_arenas = true;
option java_outer_classname = "ConfigProtos";
option java_multiple_files = true;
option java_package = "org.tensorflow.framework";

import "tensorflow/core/framework/cost_graph.proto";
import "tensorflow/core/framework/graph.proto";
import "tensorflow/core/framework/step_stats.proto";
import "tensorflow/core/protobuf/debug.proto";
import "tensorflow/core/protobuf/cluster.proto";
import "tensorflow/core/protobuf/rewriter_config.proto";

// GPU-related options. Referenced from ConfigProto.gpu_options, where they
// apply to all GPUs.
message GPUOptions {
  // A value between 0 and 1 that indicates what fraction of the
  // available GPU memory to pre-allocate for each process.  1 means
  // to pre-allocate all of the GPU memory, 0.5 means the process
  // allocates ~50% of the available GPU memory.
  double per_process_gpu_memory_fraction = 1;

  // The type of GPU allocation strategy to use.
  //
  // Allowed values:
  // "": The empty string (default) uses a system-chosen default
  //     which may change over time.
  //
  // "BFC": A "Best-fit with coalescing" algorithm, simplified from a
  //        version of dlmalloc.
  string allocator_type = 2;

  // Delay deletion of up to this many bytes to reduce the number of
  // interactions with gpu driver code.  If 0, the system chooses
  // a reasonable default (several MBs).
  int64 deferred_deletion_bytes = 3;

  // If true, the allocator does not pre-allocate the entire specified
  // GPU memory region, instead starting small and growing as needed.
  bool allow_growth = 4;

  // A comma-separated list of GPU ids that determines the 'visible'
  // to 'virtual' mapping of GPU devices.  For example, if TensorFlow
  // can see 8 GPU devices in the process, and one wanted to map
  // visible GPU devices 5 and 3 as "/device:GPU:0", and "/device:GPU:1",
  // then one would specify this field as "5,3".  This field is similar in
  // spirit to the CUDA_VISIBLE_DEVICES environment variable, except
  // it applies to the visible GPU devices in the process.
  //
  // NOTE:
  // 1. The GPU driver provides the process with the visible GPUs
  //    in an order which is not guaranteed to have any correlation to
  //    the *physical* GPU id in the machine.  This field is used for
  //    remapping "visible" to "virtual", which means this operates only
  //    after the process starts.  Users are required to use vendor
  //    specific mechanisms (e.g., CUDA_VISIBLE_DEVICES) to control the
  //    physical to visible device mapping prior to invoking TensorFlow.
  // 2. In the code, the ids in this list are also called "CUDA GPU id"s,
  //    and the 'virtual' ids of GPU devices (i.e. the ids in the device
  //    name "/device:GPU:<id>") are also called "TF GPU id"s. Please
  //    refer to third_party/tensorflow/core/common_runtime/gpu/gpu_id.h
  //    for more information.
  string visible_device_list = 5;

  // In the event polling loop sleep this many microseconds between
  // PollEvents calls, when the queue is not empty.  If value is not
  // set or set to 0, gets set to a non-zero default.
  int32 polling_active_delay_usecs = 6;

  // In the event polling loop sleep this many milliseconds between
  // PollEvents calls, when the queue is empty.  If value is not
  // set or set to 0, gets set to a non-zero default.
  int32 polling_inactive_delay_msecs = 7;

  // Force all tensors to be gpu_compatible. On a GPU-enabled TensorFlow,
  // enabling this option forces all CPU tensors to be allocated with Cuda
  // pinned memory. Normally, TensorFlow will infer which tensors should be
  // allocated as the pinned memory. But in case where the inference is
  // incomplete, this option can significantly speed up the cross-device memory
  // copy performance as long as it fits the memory.
  // Note that this option is not something that should be
  // enabled by default for unknown or very large models, since all Cuda pinned
  // memory is unpageable, having too much pinned memory might negatively impact
  // the overall host system performance.
  bool force_gpu_compatible = 8;

  // Everything inside Experimental is subject to change and is not subject
  // to API stability guarantees in
  // https://www.tensorflow.org/programmers_guide/version_compat.
  message Experimental {
    // Configuration for breaking down a visible GPU into multiple "virtual"
    // devices.
    message VirtualDevices {
      // Per "virtual" device memory limit, in MB. The number of elements in
      // the list is the number of virtual devices to create on the
      // corresponding visible GPU (see "virtual_devices" below).
      // If empty, it will create a single virtual device taking all available
      // memory from the device.
      //
      // For the concept of "visible" and "virtual" GPU, see the comments for
      // "visible_device_list" above for more information.
      repeated float memory_limit_mb = 1;
    }

    // The multi virtual device settings. If empty (not set), it will create
    // a single virtual device on each visible GPU, according to the settings
    // in "visible_device_list" above. Otherwise, the number of elements in the
    // list must be the same as the number of visible GPUs (after
    // "visible_device_list" filtering if it is set), and the string represented
    // device names (e.g. /device:GPU:<id>) will refer to the virtual
    // devices and have the <id> field assigned sequentially starting from 0,
    // according to the order they appear in this list and the
    // "memory_limit_mb" list inside each element. For example,
    //   visible_device_list = "1,0"
    //   virtual_devices { memory_limit_mb: 1024 memory_limit_mb: 2048 }
    //   virtual_devices {}
    // will create three virtual devices as:
    //   /device:GPU:0 -> visible GPU 1 with 1GB memory
    //   /device:GPU:1 -> visible GPU 1 with 2GB memory
    //   /device:GPU:2 -> visible GPU 0 with all available memory
    //
    // NOTE:
    // 1. It's invalid to set both this and "per_process_gpu_memory_fraction"
    //    at the same time.
    // 2. Currently this setting is per-process, not per-session. Using
    //    different settings in different sessions within same process will
    //    result in undefined behavior.
    repeated VirtualDevices virtual_devices = 1;
  }

  // Experimental settings; see the stability caveat on the Experimental
  // message.
  Experimental experimental = 9;
}

// Options passed to the graph optimizer
message OptimizerOptions {
  // If true, optimize the graph using common subexpression elimination.
  bool do_common_subexpression_elimination = 1;

  // If true, perform constant folding optimization on the graph.
  bool do_constant_folding = 2;

  // Constant folding optimization replaces tensors whose values can be
  // predetermined, with constant nodes. To avoid inserting too large constants,
  // the size of each constant created can be limited. If this value is zero, a
  // default limit of 10 MiB will be applied. If constant folding optimization
  // is disabled, this value is ignored.
  int64 max_folded_constant_in_bytes = 6;

  // If true, perform function inlining on the graph.
  bool do_function_inlining = 4;

  // Optimization level
  enum Level {
    // L1 is the default level.
    // Optimization performed at L1 :
    // 1. Common subexpression elimination
    // 2. Constant folding
    L1 = 0;

    // No optimizations
    L0 = -1;
  }

  // Overall optimization level. The actual optimizations applied will be the
  // logical OR of the flags that this level implies and any flags already set.
  Level opt_level = 3;

  // Control the use of the compiler/jit.  Experimental.
  enum GlobalJitLevel {
    // Default setting ("off" now, but later expected to be "on")
    DEFAULT = 0;
    OFF = -1;
    // The following settings turn on compilation, with higher values being
    // more aggressive.  Higher values may reduce opportunities for parallelism
    // and may use more memory.  (At present, there is no distinction, but this
    // is expected to change.)
    ON_1 = 1;
    ON_2 = 2;
  }
  GlobalJitLevel global_jit_level = 5;
}

// Graph-level options. Referenced from ConfigProto.graph_options, where they
// apply to all graphs.
message GraphOptions {
  // Removed, use optimizer_options below.
  reserved "skip_common_subexpression_elimination";
  reserved 1;

  // If true, use control flow to schedule the activation of Recv nodes.
  // (Currently ignored.)
  bool enable_recv_scheduling = 2;

  // Options controlling how graph is optimized.
  OptimizerOptions optimizer_options = 3;

  // The number of steps to run before returning a cost model detailing
  // the memory usage and performance of each node of the graph. 0 means
  // no cost model.
  int64 build_cost_model = 4;

  // The number of steps to skip before collecting statistics for the
  // cost model.
  int64 build_cost_model_after = 9;

  // Annotate each Node with Op output shape data, to the extent it can
  // be statically inferred.
  bool infer_shapes = 5;

  // Only place the subgraphs that are run, rather than the entire graph.
  //
  // This is useful for interactive graph building, where one might
  // produce graphs that cannot be placed during the debugging
  // process.  In particular, it allows the client to continue work in
  // a session after adding a node to a graph whose placement
  // constraints are unsatisfiable.
  bool place_pruned_graph = 6;

  // If true, transfer float values between processes as bfloat16.
  bool enable_bfloat16_sendrecv = 7;

  // If > 0, record a timeline every this many steps.
  // EXPERIMENTAL: This currently has no effect in MasterSession.
  int32 timeline_step = 8;

  // Options that control the type and amount of graph rewriting.
  // Not currently configurable via the public Python API (i.e. there is no API
  // stability guarantee if you import RewriterConfig explicitly).
  RewriterConfig rewrite_options = 10;
}

// Configuration of a single thread pool. Used as the element type of
// ConfigProto.session_inter_op_thread_pool.
message ThreadPoolOptionProto {
  // The number of threads in the pool.
  //
  // 0 means the system picks a value based on where this option proto is used
  // (see the declaration of the specific field for more info).
  int32 num_threads = 1;

  // The global name of the threadpool.
  //
  // If empty, then the threadpool is made and used according to the scope it's
  // in - e.g., for a session threadpool, it is used by that session only.
  //
  // If non-empty, then:
  // - a global threadpool associated with this name is looked
  //   up or created. This allows, for example, sharing one threadpool across
  //   many sessions (e.g., like the default behavior, if
  //   inter_op_parallelism_threads is not configured), but still partitioning
  //   into a large and small pool.
  // - if the threadpool for this global_name already exists, then it is an
  //   error if the existing pool was created using a different num_threads
  //   value as is specified on this call.
  // - threadpools created this way are never garbage collected.
  string global_name = 2;
}

// RPC-related options. Referenced from ConfigProto.rpc_options, where they
// apply when the session uses the distributed runtime.
message RPCOptions {
  // If true, always use RPC to contact the session target.
  //
  // If false (the default option), TensorFlow may use an optimized
  // transport for client-master communication that avoids the RPC
  // stack. This option is primarily used for testing the RPC stack.
  bool use_rpc_for_inprocess_master = 1;
}

// Session configuration parameters.
// The system picks appropriate values for fields that are not set.
message ConfigProto {
  // Map from device type name (e.g., "CPU" or "GPU" ) to maximum
  // number of devices of that type to use.  If a particular device
  // type is not found in the map, the system picks an appropriate
  // number.
  map<string, int32> device_count = 1;

  // The execution of an individual op (for some op types) can be
  // parallelized on a pool of intra_op_parallelism_threads.
  // 0 means the system picks an appropriate number.
  int32 intra_op_parallelism_threads = 2;

  // Nodes that perform blocking operations are enqueued on a pool of
  // inter_op_parallelism_threads available in each process.
  //
  // 0 means the system picks an appropriate number.
  //
  // Note that the first Session created in the process sets the
  // number of threads for all future sessions unless use_per_session_threads is
  // true or session_inter_op_thread_pool is configured.
  int32 inter_op_parallelism_threads = 5;

  // If true, use a new set of threads for this session rather than the global
  // pool of threads. Only supported by direct sessions.
  //
  // If false, use the global threads created by the first session, or the
  // per-session thread pools configured by session_inter_op_thread_pool.
  //
  // This option is deprecated. The same effect can be achieved by setting
  // session_inter_op_thread_pool to have one element, whose num_threads equals
  // inter_op_parallelism_threads.
  bool use_per_session_threads = 9;

  // This option is experimental - it may be replaced with a different mechanism
  // in the future.
  //
  // Configures session thread pools. If this is configured, then RunOptions for
  // a Run call can select the thread pool to use.
  //
  // The intended use is for when some session invocations need to run in a
  // background pool limited to a small number of threads:
  // - For example, a session may be configured to have one large pool (for
  // regular compute) and one small pool (for periodic, low priority work);
  // using the small pool is currently the mechanism for limiting the inter-op
  // parallelism of the low priority work.  Note that it does not limit the
  // parallelism of work spawned by a single op kernel implementation.
  // - Using this setting is normally not needed in training, but may help some
  // serving use cases.
  // - It is also generally recommended to set the global_name field of this
  // proto, to avoid creating multiple large pools. It is typically better to
  // run the non-low-priority work, even across sessions, in a single large
  // pool.
  repeated ThreadPoolOptionProto session_inter_op_thread_pool = 12;

  // Assignment of Nodes to Devices is recomputed every placement_period
  // steps until the system warms up (at which point the recomputation
  // typically slows down automatically).
  int32 placement_period = 3;

  // When any filters are present sessions will ignore all devices which do not
  // match the filters. Each filter can be partially specified, e.g. "/job:ps"
  // "/job:worker/replica:3", etc.
  repeated string device_filters = 4;

  // Options that apply to all GPUs.
  GPUOptions gpu_options = 6;

  // Whether soft placement is allowed. If allow_soft_placement is true,
  // an op will be placed on CPU if
  //   1. there's no GPU implementation for the OP
  // or
  //   2. no GPU devices are known or registered
  // or
  //   3. need to co-locate with reftype input(s) which are from CPU.
  bool allow_soft_placement = 7;

  // Whether device placements should be logged.
  bool log_device_placement = 8;

  // Options that apply to all graphs.
  GraphOptions graph_options = 10;

  // Global timeout for all blocking operations in this session.  If non-zero,
  // and not overridden on a per-operation basis, this value will be used as the
  // deadline for all blocking operations.
  int64 operation_timeout_in_ms = 11;

  // Options that apply when this session uses the distributed runtime.
  RPCOptions rpc_options = 13;

  // Optional list of all workers to use in this session.
  ClusterDef cluster_def = 14;

  // If true, any resources such as Variables used in the session will not be
  // shared with other sessions.
  bool isolate_session_state = 15;

  // Next: 16
}

// Options for a single Run() call.
message RunOptions {
  // TODO(pbar) Turn this into a TraceOptions proto which allows
  // tracing to be controlled in a more orthogonal manner?
  enum TraceLevel {
    NO_TRACE = 0;
    SOFTWARE_TRACE = 1;
    HARDWARE_TRACE = 2;
    FULL_TRACE = 3;
  }
  TraceLevel trace_level = 1;

  // Time to wait for operation to complete in milliseconds.
  int64 timeout_in_ms = 2;

  // The thread pool to use, if session_inter_op_thread_pool is configured.
  int32 inter_op_thread_pool = 3;

  // Whether the partition graph(s) executed by the executor(s) should be
  // outputted via RunMetadata.
  bool output_partition_graphs = 5;

  // EXPERIMENTAL.  Options used to initialize DebuggerState, if enabled.
  DebugOptions debug_options = 6;

  // When enabled, causes tensor allocation information to be included in
  // the error message when the Run() call fails because the allocator ran
  // out of memory (OOM).
  //
  // Enabling this option can slow down the Run() call.
  bool report_tensor_allocations_upon_oom = 7;

  reserved 4;
}

// Metadata output (i.e., non-Tensor) for a single Run() call.
message RunMetadata {
  // Statistics traced for this step. Populated if tracing is turned on via the
  // "RunOptions" proto.
  // EXPERIMENTAL: The format and set of events may change in future versions.
  StepStats step_stats = 1;

  // The cost graph for the computation defined by the run call.
  CostGraphDef cost_graph = 2;

  // Graphs of the partitions executed by executors.
  repeated GraphDef partition_graphs = 3;
}