1/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
2
3Licensed under the Apache License, Version 2.0 (the "License");
4you may not use this file except in compliance with the License.
5You may obtain a copy of the License at
6
7    http://www.apache.org/licenses/LICENSE-2.0
8
9Unless required by applicable law or agreed to in writing, software
10distributed under the License is distributed on an "AS IS" BASIS,
11WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12See the License for the specific language governing permissions and
13limitations under the License.
14==============================================================================*/
15
16syntax = "proto3";
17
18package xla;
19
20import "tensorflow/compiler/xla/service/hlo.proto";
21import "tensorflow/compiler/xla/xla_data.proto";
22
// Options for the HLO insert-reduce-precision-operations pass.
message HloReducePrecisionOptions {
  // Where and when the reduce-precision operations will be added.
  enum Location {
    // Add reduce-precision operations to the inputs of selected instructions.
    // This is done before any optimization occurs.
    OP_INPUTS = 0;
    // Add reduce-precision operations to the outputs of selected instructions.
    // This is done before any optimization occurs.
    OP_OUTPUTS = 1;
    // After operation-fusion occurs, add reduce-precision operations to the
    // outputs of any selected instructions that have not been fused into
    // fusion instructions.
    UNFUSED_OP_OUTPUTS = 2;
    // After operation-fusion occurs, add reduce-precision operations to the
    // inputs of any fusion instructions that contain operations matching the
    // selection criteria.
    FUSION_INPUTS_BY_CONTENT = 3;
    // After operation-fusion occurs, add reduce-precision operations to the
    // outputs of any fusion instructions that contain operations matching the
    // selection criteria.
    FUSION_OUTPUTS_BY_CONTENT = 4;
  }
  // Selects where/when the operations are inserted.  As the zero value,
  // OP_INPUTS is what an unset field reads as.
  Location location = 1;

  // Exponent and mantissa bit counts for the reduced precision.
  uint32 exponent_bits = 2;
  uint32 mantissa_bits = 3;

  // Operations matching these opcodes should be suffixed with reduce-precision
  // operations.
  repeated uint32 opcodes_to_suffix = 4;

  // Operations with names containing these substrings should be suffixed with
  // reduce-precision operations.
  repeated string opname_substrings_to_suffix = 5;
}
60
// Debugging options for XLA. These options may change at any time - there are
// no guarantees about backward or forward compatibility for these fields.
message DebugOptions {
  // Show addresses of HLO ops in graph dump.
  bool xla_hlo_graph_addresses = 2;

  // Instrument the computation to collect per-HLO cycle counts.
  bool xla_hlo_profile = 9;

  // List of HLO passes to disable. These names must exactly match the pass
  // names as specified by the HloPassInterface::name() method.
  repeated string xla_disable_hlo_passes = 30;

  // Disables all HLO passes.  Note that some passes are necessary for
  // correctness and the invariants that must be satisfied by "fully optimized"
  // HLO are different for different devices and may change over time.  The only
  // "guarantee", such as it is, is that if you compile XLA and dump the
  // optimized HLO for some graph, you should be able to run it again on the
  // same device with the same build of XLA.
  bool xla_disable_all_hlo_passes = 104;

  // Numerical optimization level for the XLA compiler backend; the specific
  // interpretation of this value is left to the backends.
  int32 xla_backend_optimization_level = 31;

  // Embed the compiler IR as a string in the executable.
  bool xla_embed_ir_in_executable = 33;

  // Eliminate implicit broadcasts when lowering user computations to HLO
  // instructions; use explicit broadcast instead.
  bool xla_eliminate_hlo_implicit_broadcast = 35;

  // When generating calls to Eigen in the CPU backend, use multi-threaded Eigen
  // mode.
  bool xla_cpu_multi_thread_eigen = 60;

  // Path to directory with cuda/ptx tools and libraries.
  string xla_gpu_cuda_data_dir = 61;

  // Enable flush-to-zero semantics in the GPU backend.
  bool xla_gpu_ftz = 62;

  // Disable multi-streaming in the GPU backend.
  bool xla_gpu_disable_multi_streaming = 63;

  // If true, in LLVM-based backends, emit !alias.scope metadata in
  // generated IR.
  bool xla_llvm_enable_alias_scope_metadata = 70;

  // If true, in LLVM-based backends, emit !noalias metadata in the
  // generated IR.
  bool xla_llvm_enable_noalias_metadata = 71;

  // If true, in LLVM-based backends, emit !invariant.load metadata in
  // the generated IR.
  bool xla_llvm_enable_invariant_load_metadata = 72;

  // If true, a set of expensive LLVM optimization passes will not be run.
  bool xla_llvm_disable_expensive_passes = 73;

  // Options for inserting reduce-precision operations for numerical
  // experimentation.  This is a repeated field, as we may want to have
  // multiple passes with different parameters.
  repeated HloReducePrecisionOptions hlo_reduce_precision_options = 80;

  // This is used by ClientLibraryTestBase::ComputeAndCompare*. If true, the
  // computation will run n! times with all permutations of layouts for the
  // output shape in rank n. For example, with a 3D shape, all permutations of
  // the set {0, 1, 2} are tried.
  bool xla_test_all_output_layouts = 90;

  // This is used by ClientLibraryTestBase::ComputeAndCompare*. If true, the
  // computation will run for all permutations of layouts of all input
  // arguments. For example, with 2 input arguments in 2D and 4D shapes, the
  // computation will run 2! * 4! times.
  bool xla_test_all_input_layouts = 91;

  // Assign colors based on sharding information when generating the Graphviz
  // HLO graph.
  bool xla_hlo_graph_sharding_color = 92;

  reserved 93;  // Was xla_hlo_tfgraph_device_scopes

  // If true, the GPU backend is free to use cudnn for HLO batch normalization
  // ops.
  bool xla_gpu_use_cudnn_batchnorm = 94;

  // Generate calls to MKL-DNN in the CPU backend.
  bool xla_cpu_use_mkl_dnn = 97;

  // Maximum kernel unroll factor for the GPU backend.
  int32 xla_gpu_max_kernel_unroll_factor = 98;

  // When true, "unsafe" mathematical optimizations are enabled. These
  // transformations include but are not limited to:
  //
  //  - Reducing the precision of operations (e.g. using an approximate sin
  //    function, or transforming x/y into x * (1/y)).
  //  - Assuming that operations never produce or consume NaN or +/- Inf (this
  //    behavior can be adjusted using xla_cpu_fast_math_honor_{nans|infs}).
  //  - Assuming that +0 and -0 are indistinguishable.
  bool xla_cpu_enable_fast_math = 99;

  // When xla_cpu_enable_fast_math is true then this controls whether we allow
  // operations to produce NaNs.  Ignored when xla_cpu_enable_fast_math is
  // false.
  bool xla_cpu_fast_math_honor_nans = 120;

  // When xla_cpu_enable_fast_math is true then this controls whether we allow
  // operations to produce infinities.  Ignored when xla_cpu_enable_fast_math
  // is false.
  bool xla_cpu_fast_math_honor_infs = 121;

  // When true we lower the Minimum and Maximum hlos in the GPU backend such
  // that Min(NotNaN, NaN) = Min(NaN, NotNaN) = NotNaN.  In other words, if
  // this flag is true we don't propagate NaNs through Min and Max.
  bool xla_gpu_enable_fast_min_max = 100;

  // Crashes the program when any kind of verification fails, instead of just
  // logging the failures. One example is cross checking of convolution results
  // among different algorithms.
  bool xla_gpu_crash_on_verification_failures = 101;

  // Force the host platform to pretend that there are these many host
  // "devices".  All these devices are backed by the same threadpool.  Defaults
  // to 1.
  //
  // Setting this to anything other than 1 can increase overhead from context
  // switching but we let the user override this behavior to help run tests on
  // the host that run models in parallel across multiple devices.
  int32 xla_force_host_platform_device_count = 102;

  // If set to true XLA:GPU invokes `ptxas` with -O0 (default is -O3).
  bool xla_gpu_disable_ptxas_optimizations = 103;

  // Enable fast math with eigen in the HLO evaluator.
  bool xla_hlo_evaluator_use_fast_path = 106;

  // Temporary option to allow support for both the R1 and the scalar index
  // versions of DynamicSlice and DynamicUpdateSlice. Only used for testing.
  bool xla_allow_scalar_index_dynamic_ops = 107;

  // Where (if anywhere) a step marker is generated.
  enum StepMarkerLocation {
    // Generate step mark at program entry. This handles the case where each
    // step is done by one or multiple program executions. Only the first
    // program will be tagged for generating step mark at program entry.
    // This is the default (it is the zero value of the enum).
    STEP_MARK_AT_ENTRY = 0;
    // Generate step mark at each iteration of top level while loop, which
    // is assumed to be a training loop.
    STEP_MARK_AT_TOP_LEVEL_WHILE_LOOP = 1;
    // No step mark.
    STEP_MARK_NONE = 2;
  }
  // Option to emit a target-specific marker to indicate the start of a training
  // step. The location of the marker (if any) is determined by the option
  // value.
  StepMarkerLocation xla_step_marker_location = 108;

  //
  // BEGIN flags controlling dumping HLO modules for debugging.
  //
  // When dumping is enabled, HLO modules are dumped at the very beginning and
  // end of compilation, and optionally also during the pass pipeline.
  //
  // In general, if you set one of these flags, we will try to infer reasonable
  // defaults for the others.  For example:
  //
  //  * Setting --xla_dump_to=/tmp/foo without specifying a format
  //    with --xla_dump_hlo_as_* will turn on --xla_dump_hlo_as_text.
  //
  //  * Setting --xla_dump_hlo_as_text without specifying --xla_dump_to will
  //    dump to stdout.
  //

  // Directory to dump into.
  string xla_dump_to = 109;

  // If specified, will only dump modules which match this regexp.
  string xla_dump_hlo_module_re = 110;

  // If this flag is specified, will also dump HLO before and after passes that
  // match this regular expression.  Set to .* to dump before/after all passes.
  string xla_dump_hlo_pass_re = 111;

  // Specifies the format that HLO is dumped in.  Multiple of these may be
  // specified.
  bool xla_dump_hlo_as_text = 112;
  bool xla_dump_hlo_as_proto = 113;
  bool xla_dump_hlo_as_dot = 114;
  bool xla_dump_hlo_as_url = 115;

  // Dump HLO graphs as an HTML (DOT -> SVG inlined in HTML)
  bool xla_dump_hlo_as_html = 116;

  // If true, every time an HLO module is run, we will dump an HloSnapshot
  // (essentially, a serialized module plus its inputs) to the --xla_dump_to
  // directory.
  bool xla_dump_hlo_snapshots = 118;

  //
  // END flags controlling dumping HLO modules.
  //

  // Next id: 122

  // Extra options to pass to the compilation backend (e.g. LLVM); specific
  // interpretation of these values is left to the backend.
  map<string, string> xla_backend_extra_options = 500;

  reserved 117;  // Was xla_dump_to (that field is now number 109)
  reserved 5;    // Was xla_hlo_dump_as_graphdef
}
273
// Settings that control how XLA compiles and/or runs code.  A given setting
// may have no effect on some platforms.
//
// Boolean fields default to false; keep that in mind when adding new ones.
message ExecutionOptions {
  // Optional.  The layout of this shape is a hint for how to store the
  // computation's output; later transfers of that output array to the client
  // may be faster when this layout is used.
  //
  // A Shape is used here so that computations returning a tuple can be
  // accommodated.
  ShapeProto shape_with_output_layout = 2;

  // Seed for the random-number generators used in this computation.  A value
  // of 0 means we generate a seed ourselves.
  //
  // TODO(b/32083678): Changing the seed unnecessarily forces a recompilation.
  uint64 seed = 3;

  // Debugging options that accompany these execution options.
  DebugOptions debug_options = 4;

  // Optional.  A particular set of devices to run the computation on; the
  // computation will be partitioned across them.  When not provided, the
  // default device will be chosen.
  repeated DeviceHandle device_handles = 5;

  // How many replicas of the computation to run.  Zero selects the XLA
  // service's default number of replicas.
  int32 num_replicas = 6;

  // Optional.  The device assignment, if it is known at compile time.
  DeviceAssignmentProto device_assignment = 7;
}
307
// Request message carrying a device count.
// NOTE(review): presumably the number of device handles the caller wants
// back -- confirm against the service implementation.
message GetDeviceHandlesRequest {
  // Count of devices.
  int64 device_count = 1;
}
311
// Response message carrying a list of device handles.
message GetDeviceHandlesResponse {
  // Zero or more device handles.
  repeated DeviceHandle device_handles = 1;
}
315
// Request naming a piece of global data and, optionally, the layout in which
// the service should return it.
message TransferToClientRequest {
  // Handle of the data.
  GlobalDataHandle data = 1;

  // Optional.  Directs the service to return the literal in this layout.  The
  // layout is held in a shape so that tuples can be accommodated.
  ShapeProto shape_with_layout = 2;
}
323
// Response message carrying a single literal.
message TransferToClientResponse {
  // The literal payload.
  LiteralProto literal = 1;
}
327
// Request message pairing a literal with a device handle.
// NOTE(review): presumably the literal is transferred to that device --
// confirm against the service implementation.
message TransferToServerRequest {
  // The literal payload.
  LiteralProto literal = 1;

  // Handle of a device.
  DeviceHandle device_handle = 2;
}
332
// Response message carrying a global data handle.
message TransferToServerResponse {
  // Handle of the data.
  GlobalDataHandle data = 1;
}
336
// Request message carrying a literal together with a replica id and a device
// handle.
// NOTE(review): how replica_id and device_handle interact is not visible
// here -- confirm against the service implementation.
message TransferToInfeedRequest {
  // The literal payload.
  LiteralProto literal = 1;

  // Id of a replica.
  int64 replica_id = 2;

  // Handle of a device.
  DeviceHandle device_handle = 3;
}
342
// Response message with no fields.
message TransferToInfeedResponse {
}
344
// Request message carrying an optional result layout, a replica id and a
// device handle.
message TransferFromOutfeedRequest {
  // Optional.  Directs the service to return the literal in this layout.  The
  // layout is held in a shape so that tuples can be accommodated.
  ShapeProto shape_with_layout = 1;

  // Id of a replica.
  int64 replica_id = 2;

  // Handle of a device.
  DeviceHandle device_handle = 3;
}
353
// Response message carrying a single literal.
message TransferFromOutfeedResponse {
  // The literal payload.
  LiteralProto literal = 1;
}
357
// Request message naming a device by handle.
message ResetDeviceRequest {
  // Handle of a device.
  DeviceHandle device_handle = 1;
}
361
// Response message with no fields.
message ResetDeviceResponse {
}
363
// Request message carrying an HLO module and the debug options to go with it.
message ComputationGraphStatsRequest {
  // The HLO module.
  HloModuleProto computation = 1;

  // Debugging options accompanying the request.
  DebugOptions debug_options = 2;
}
368
// Response message carrying computation statistics.
message ComputationStatsResponse {
  // The statistics payload.
  ComputationStats stats = 1;
}
372
// Request message carrying a channel type.
message CreateChannelHandleRequest {
  // A ChannelHandle.ChannelType value.
  ChannelHandle.ChannelType channel_type = 1;
}
376
// Response message carrying a channel handle.
message CreateChannelHandleResponse {
  // The channel handle.
  ChannelHandle channel = 1;
}
380
// Request message carrying a list of global data handles.
// NOTE(review): presumably these are the handles to unregister -- confirm
// against the service implementation.
message UnregisterRequest {
  // Zero or more data handles.
  repeated GlobalDataHandle data = 1;
}
384
// Response message with no fields.
message UnregisterResponse {
}
386
// Request message describing a graph to compile, the options to compile it
// with, and (optionally) the layouts of its inputs.
message CompileRequest {
  // Graph that is to be compiled.
  HloModuleProto computation = 1;

  // Options affecting how XLA compiles code to service this request.
  ExecutionOptions execution_options = 2;

  // Layouts of the input arguments; the default layout is used when this is
  // not set.  The real arguments are not needed in compilation, but their
  // layouts can affect it.
  repeated ShapeProto input_shape_with_layout = 3;
}
399
// Response message carrying the handle of an executable.
message CompileResponse {
  // Handle to the executable.
  ExecutionHandle handle = 1;
}
404
// Request message pairing an execution handle with argument handles.
message ExecuteRequest {
  // An execution handle.
  ExecutionHandle handle = 1;

  // Arguments, whose shape and layout must be the same as those of the
  // executable's parameters.
  repeated GlobalDataHandle arguments = 2;
}
412
// Request message carrying an HLO module, its arguments and execution
// options.
//
// TODO(b/118493728): Remove this and ExecuteGraphParallelRequest and replace
// the uses with calls to Compile and Execute.
message ExecuteGraphRequest {
  // The HLO module.
  HloModuleProto computation = 1;

  // Handles of the arguments.
  repeated GlobalDataHandle arguments = 2;

  // Options affecting how XLA compiles and runs code to service this request.
  ExecutionOptions execution_options = 3;
}
422
// Request message bundling several ExecuteGraphRequest messages.
message ExecuteGraphParallelRequest {
  // Zero or more individual requests.
  repeated ExecuteGraphRequest requests = 1;
}
426
// Response message carrying an output data handle and an execution profile.
message ExecuteResponse {
  // Handle of the output data.
  GlobalDataHandle output = 1;

  // An execution profile.
  ExecutionProfile profile = 2;
}
431
// Response message bundling several ExecuteResponse messages.
message ExecuteParallelResponse {
  // Zero or more individual responses.
  repeated ExecuteResponse responses = 1;
}
435
// Request message naming an execution by handle.
message WaitForExecutionRequest {
  // Handle of the execution.
  ExecutionHandle execution = 1;
}
439
// Response message carrying an output data handle and an execution profile.
message WaitForExecutionResponse {
  // Handle of the output data.
  GlobalDataHandle output = 1;

  // An execution profile.
  ExecutionProfile profile = 2;
}
444
// Request message carrying an HLO module and an output layout.
message ComputeConstantGraphRequest {
  // The HLO module.
  HloModuleProto computation = 1;

  // Layout for the output.
  LayoutProto output_layout = 2;
}
449
// Response message for a compute-constant request.
message ComputeConstantResponse {
  // For this request a LiteralProto is returned directly.
  LiteralProto literal = 1;
}
454
// Request message naming a tuple by its global data handle.
message DeconstructTupleRequest {
  // Handle of the tuple.  This message's only field uses number 2.
  GlobalDataHandle tuple_handle = 2;
}
458
// Response message carrying a list of element handles.
message DeconstructTupleResponse {
  // Zero or more element handles.
  repeated GlobalDataHandle element_handles = 1;
}
462
// Request message describing data to load from a ColumnIO tablet.
message LoadDataRequest {
  // Path of the ColumnIO tablet to load.
  string columnio_tablet_path = 1;

  // Field to load within the ColumnIO tablet.
  string columnio_field = 2;

  // Shape of an individual element, excluding rows.
  ShapeProto element_shape = 3;

  // Warning: ColumnIO does not support random-access, so use offset with
  // caution in performance-critical scenarios.
  int64 offset = 4;

  // Upper bound on the number of elements (each of shape element_shape) to
  // load.
  int64 limit = 5;

  // When more than one item is requested (via limit > 1), this request
  // attribute zips together the produced vectors.
  bool zip = 6;
}
484
// Response message carrying a data handle, its shape, row counts and a
// nanosecond figure.
message LoadDataResponse {
  // Handle of the data.
  GlobalDataHandle data = 1;

  // Shape of the data.
  ShapeProto data_shape = 2;

  // Number of rows available.
  int64 available_rows = 3;

  // Number of rows loaded.
  int64 rows_loaded = 4;

  // A value in nanoseconds.
  // NOTE(review): presumably the time the load took -- confirm.
  int64 nanoseconds = 5;
}
492
// Request message naming a piece of global data by handle.
message GetShapeRequest {
  // Handle of the data.
  GlobalDataHandle data = 1;
}
496
// Response message carrying a shape.
message GetShapeResponse {
  // The shape.
  ShapeProto shape = 1;
}
500
// Request message naming a piece of global data by handle.
message UnpackRequest {
  // Handle of the data.
  GlobalDataHandle data = 1;
}
504
// Response message carrying a list of global data handles.
message UnpackResponse {
  // Zero or more data handles.
  repeated GlobalDataHandle tied_data = 1;
}
508