1/* Copyright 2016 The TensorFlow Authors. All Rights Reserved.
2
3Licensed under the Apache License, Version 2.0 (the "License");
4you may not use this file except in compliance with the License.
5You may obtain a copy of the License at
6
7    http://www.apache.org/licenses/LICENSE-2.0
8
9Unless required by applicable law or agreed to in writing, software
10distributed under the License is distributed on an "AS IS" BASIS,
11WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12See the License for the specific language governing permissions and
13limitations under the License.
14==============================================================================*/
15
16syntax = "proto3";
17
18package tensorflow;
19option cc_enable_arenas = true;
20option java_outer_classname = "WorkerProtos";
21option java_multiple_files = true;
22option java_package = "org.tensorflow.distruntime";
23option go_package = "github.com/tensorflow/tensorflow/tensorflow/go/core/protobuf";
24import "google/protobuf/any.proto";
25import "tensorflow/core/framework/cost_graph.proto";
26import "tensorflow/core/framework/step_stats.proto";
27import "tensorflow/core/framework/device_attributes.proto";
28import "tensorflow/core/framework/graph.proto";
29import "tensorflow/core/framework/tensor.proto";
30import "tensorflow/core/framework/tensor_shape.proto";
31import "tensorflow/core/framework/types.proto";
32import "tensorflow/core/lib/core/error_codes.proto";
33import "tensorflow/core/protobuf/config.proto";
34import "tensorflow/core/protobuf/debug.proto";
35import "tensorflow/core/protobuf/named_tensor.proto";
36import "tensorflow/core/protobuf/tensorflow_server.proto";
37
38////////////////////////////////////////////////////////////////////////////////
39//
40// GetStatus method request/response messages
41//
42////////////////////////////////////////////////////////////////////////////////
43
44message GetStatusRequest {
45}
46
47message GetStatusResponse {
48  repeated DeviceAttributes device_attributes = 1;
49}
50
51////////////////////////////////////////////////////////////////////////////////
52//
53// CreateSession method request/response messages
54//
55// For each session,
56//
57////////////////////////////////////////////////////////////////////////////////
58
59message CreateWorkerSessionRequest {
60  // Sessions are identified by a given handle.
61  string session_handle = 1;
62
63  // Defines the configuration of a TensorFlow worker.
64  ServerDef server_def = 2;
65
66  // If true, any resources such as Variables used in the session will not be
67  // shared with other sessions.
68  bool isolate_session_state = 3;
69}
70
71message CreateWorkerSessionResponse {
72}
73
74////////////////////////////////////////////////////////////////////////////////
75//
76// DeleteSession method request/response messages
77//
78// Deletes all worker-side state associated with the given session handle.
79//
80////////////////////////////////////////////////////////////////////////////////
81
82message DeleteWorkerSessionRequest {
83  // Sessions are identified by a given handle.
84  string session_handle = 1;
85}
86
87message DeleteWorkerSessionResponse {
88}
89
90////////////////////////////////////////////////////////////////////////////////
91//
92// RegisterGraph method request/response messages
93//
94// For each session, after the master placed every node on a device,
95// it partitions the whole graph into many subgraphs. All the nodes in
96// a subgraph were in the same worker, but potentially on many devices
97// owned by that worker (e.g. cpu0, plus gpu0, gpu1, ..., gpu7). The
98// master registers subgraphs for a worker before running any steps. A
99// successful registration returns a graph handle to be used in latter
100// RunGraph requests.
101//
102////////////////////////////////////////////////////////////////////////////////
103
104message RegisterGraphRequest {
105  // Subgraphs are scoped within one session.
106  string session_handle = 1;
107
108  // Set to true if `CreateWorkerSession` was called for `session_handle`.
109  bool create_worker_session_called = 6;
110
111  // "graph_def" has the subgraph of nodes for this worker, with each node
112  // having its device_name filled in.
113  GraphDef graph_def = 2;
114
115  // True iff the graph (before partitioning) contains control flow nodes.
116  //
117  // As of 01/11/2015, this is no longer set by clients.
118  bool has_control_flow = 3 [deprecated = true];
119
120  // Configuration options for the session in which this graph was created.
121  GraphOptions graph_options = 4;
122
123  // Field(s) used by TensorFlow Debugger (tfdbg).
124  DebugOptions debug_options = 5;
125
126  // If graph_def contains any collective ops this must be a positive
127  // integer used to coordinate execution with other graphs.  All
128  // graphs in a distributed execution with the same
129  // collective_graph_key will coordinate to use the same step_id
130  // concurrently so that BufRendezvous entries will make the correct
131  // values accessible.
132  int64 collective_graph_key = 7;
133}
134
135message RegisterGraphResponse {
136  // If the registration succeeds, returns an opaque graph_handle to
137  // the master. The master calls RunGraph with graph_handle to
138  // compute different steps.
139  string graph_handle = 1;
140}
141
142////////////////////////////////////////////////////////////////////////////////
143//
144// DeregisterGraph method request/response messages
145//
146// The master deregisters the given graph_handle when the graph is no
147// longer needed (e.g., the overall graph is re-scheduled and nodes
148// are re-placed).
149//
150// The worker deregisters a graph_handle automatically according to on
151// a TTL-base policy in case of master restarts.
152//
153////////////////////////////////////////////////////////////////////////////////
154
155message DeregisterGraphRequest {
156  // The session_handle used when registering the graph. If session_handle is
157  // empty, a single global namespace is used.
158  string session_handle = 2;
159
160  // Set to true if `CreateWorkerSession` was called for `session_handle`.
161  bool create_worker_session_called = 3;
162
163  // REQUIRED: graph_handle must be returned by a RegisterGraph call
164  // to the same WorkerService.
165  string graph_handle = 1;
166}
167
168message DeregisterGraphResponse {
169  // TODO(mrry): Optionally add summary stats for the graph.
170}
171
172////////////////////////////////////////////////////////////////////////////////
173//
174// CleanupAll method request/response messages
175//
176////////////////////////////////////////////////////////////////////////////////
177
178message CleanupAllRequest {
179  // A list of container names.
180  //
181  // If 'container' is not empty, releases resources in the given
182  // containers in all devices.
183  //
184  // If 'container' is empty, releases resources in the default
185  // container in all devices.
186  repeated string container = 1;
187}
188
189message CleanupAllResponse {
190}
191
192////////////////////////////////////////////////////////////////////////////////
193//
194// RunGraph request / response messages
195//
196// The worker executes all subgraphs registered under graph_handle.
197// RunGraph returns after the execution finishes or an error is
198// encountered.
199// A sequence of RunGraphRequests with is_partial may be sent to RunGraph for
200// partial graph execution.
201//
202////////////////////////////////////////////////////////////////////////////////
203
204// Options specific to the execution of a single step.
205message ExecutorOpts {
206  bool record_costs = 1;
207  bool record_timeline = 3;
208  bool record_partition_graphs = 4;
209  bool report_tensor_allocations_upon_oom = 5;
210};
211
212message RunGraphRequest {
213  // session_handle is the master-generated unique id for this session.
214  // If session_handle is non-empty, it must be the same as used when
215  // registering the graph. If it is empty, a single global namespace is used to
216  // search for the graph_handle.
217  string session_handle = 8;
218
219  // Set to true if `CreateWorkerSession` was called for `session_handle`.
220  bool create_worker_session_called = 10;
221
222  // REQUIRED: graph_handle must be returned by a RegisterGraph call
223  // to the same WorkerService.
224  string graph_handle = 1;
225
226  // A unique ID to distinguish different runs of the same graph.
227  //
228  // The master generates a global unique `step_id` to distinguish
229  // different runs of the graph computation. Subgraphs communicate
230  // (e.g., send/recv ops) with each other using `step_id` to
231  // distinguish tensors generated by different runs.
232  int64 step_id = 2;
233
234  // Options for this step.
235  ExecutorOpts exec_opts = 5;
236
237  // Runs the graph.
238  //
239  // Sends the tensors in "send" into the graph before the run and
240  // fetches the keys into `RunGraphResponse.recv` after the run.
241  repeated NamedTensorProto send = 3;
242  repeated string recv_key = 4;
243
244  // True if the RunGraphRequest is a partial run request.
245  bool is_partial = 6;
246  // True if this is the last partial run request in a sequence of requests.
247  bool is_last_partial_run = 7;
248
249  // If true then some errors, e.g., execution errors that have long
250  // error messages, may return an OK RunGraphResponse with the actual
251  // error saved in the status_code/status_error_message fields of the
252  // response body. This is a workaround since the RPC subsystem may
253  // truncate long metadata messages.
254  bool store_errors_in_response_body = 9;
255
256  // Next: 11
257}
258
259message RunGraphResponse {
260  // A list of tensors corresponding to those requested by
261  // `RunGraphRequest.recv_key`.
262  repeated NamedTensorProto recv = 1;
263
264  // If the request asked for execution stats, the cost graph, or the partition
265  // graphs, these are returned here.
266  // TODO(suharshs): Package these in a RunMetadata instead.
267  StepStats step_stats = 2;
268  CostGraphDef cost_graph = 3;
269  repeated GraphDef partition_graph = 4;
270
271  // If store_errors_in_response_body is true in the request, then
272  // optionally the server may return an OK status for the RPC and
273  // fill the true status into the fields below, to allow for messages
274  // that are too long to fit in metadata.
275  error.Code status_code = 5;
276  string status_error_message = 6;
277}
278
279////////////////////////////////////////////////////////////////////////////////
280//
281// CleanupGraph method request/response messages
282//
283// After the master receives RunGraph responses from all workers, the
284// master instructs every worker to cleanup any remaining state of a
285// step (e.g. tensors buffered by a `Send` op but not picked up by
286// other workers). The master does not necessarily need to wait for
287// completion of CleanupGraph calls.
288//
289// Workers should cleanup step states automatically according to a
290// TTL-based policy in case of master restarts.
291//
292////////////////////////////////////////////////////////////////////////////////
293
294message CleanupGraphRequest {
295  int64 step_id = 1;
296}
297
298message CleanupGraphResponse {
299}
300
301////////////////////////////////////////////////////////////////////////////////
302//
303// RecvTensor method request/response messages
304//
305////////////////////////////////////////////////////////////////////////////////
306
307message RecvTensorRequest {
308  // The step in which the tensor will be produced.
309  //
310  // REQUIRED: This must eventually correspond to the `step_id` passed
311  // into a RunGraph call on the same WorkerService.
312  int64 step_id = 1;
313
314  // A key identifying the channel to receive tensors from. A RecvTensor request
315  // retrieves one tensor from the channel, but multiple tensors can be sent and
316  // received over the same channel with multiple RecvTensor requests. See
317  // rendezvous.h for details.
318  string rendezvous_key = 2;
319
320  // If true, use an out-of-band DMA mechanism to transfer the
321  // received tensor.
322  bool dma_ok = 3;
323
324  // Optional information on client-side device locality.
325  DeviceLocality client_locality = 4;
326
327  // Optional information on server-side device locality.
328  DeviceLocality server_locality = 5;
329
330  // Optional information needed by the RPC subsystem.
331  google.protobuf.Any transport_options = 6;
332
333  // Unique identifier for this request. Every RecvTensorRequest must have a
334  // unique request_id, and retried RecvTensorRequests must have the same
335  // request_id. If request_id is zero, retry detection is disabled.
336  //
337  // Retried RecvTensorRequests are problematic because a RecvTensor with no
338  // corresponding sender will wait forever, and the tensor may have been
339  // delivered to a previous retry. Workers use request_ids to reject retried
340  // RecvTensor requests instead of waiting forever.
341  int64 request_id = 7;
342}
343
344message RecvTensorResponse {
345  // The tensor as a proto.
346  TensorProto tensor = 1;
347
348  // If true, this tensor was the output of a dead node, and the
349  // content is invalid.
350  bool is_dead = 2;
351
352  // The time at which tensor was available and started to be returned.
353  int64 send_start_micros = 3;
354
355  // Optional additional information about how to receive the tensor,
356  // e.g. in the event that `RecvTensorRequest.dma_ok` was true.
357  google.protobuf.Any transport_options = 4;
358}
359
360////////////////////////////////////////////////////////////////////////////////
361//
362// Logging method request/response messages
363//
364// NOTE(mrry): This feature is not supported in the open-source
365// version, and these messages are expected to change.
366//
367////////////////////////////////////////////////////////////////////////////////
368
369// Out-of-band request to begin or end logging, or
370// to retrieve logs for particular steps.
371message LoggingRequest {
372  // If true, RPC logging will be enabled.
373  bool enable_rpc_logging = 1;
374
375  // If true, RPC logging will be disabled.
376  bool disable_rpc_logging = 4;
377
378  // If true, discard any saved logging data (for all steps).
379  bool clear = 2;
380
381  // When set, requests all saved log data pertaining to the step.
382  // Any log data retrieved is eliminated from the store and cannot be
383  // retrieved again.
384  repeated int64 fetch_step_id = 3;
385}
386
387message LabeledStepStats {
388  int64 step_id = 1;
389  StepStats step_stats = 2;
390}
391
392message LoggingResponse {
393  repeated LabeledStepStats step = 1;
394}
395
396////////////////////////////////////////////////////////////////////////////////
397//
398// Tracing method request/response messages
399//
400// NOTE(mrry): This feature is not supported in the open-source
401// version, and these messages are expected to change.
402//
403////////////////////////////////////////////////////////////////////////////////
404
405message TraceOpts {
406  // Length of the trace to be taken, in seconds.
407  double duration = 1;
408  // If true, capture step profile locally in each worker. Currently
409  // unimplemented.
410  bool use_step_profiler = 2;
411  // If true, capture kernel events from each worker.
412  bool use_kernel_profiler = 3;
413  // If true, capture extended profiling events from TensorFlow process.
414  bool use_extended_profiler = 4;
415  // If true, capture GPU profiling events locally on each
416  // machine. Currently unimplemented.
417  bool use_gpu_profiler = 5;
418  // If true, collect sampled profile events. Currently unimplemented.
419  bool use_sample_profiler = 6;
420}
421
422// Out-of-band request to configure distributed tracing.
423message TracingRequest {
424  TraceOpts options = 1;
425}
426
427message TracingResponse {
428}
429
430////////////////////////////////////////////////////////////////////////////////
431//
432// Raw data transfers in support of Collective Ops.
433// These methods are experimental and subject to change.
434//
435// The intention is to allow collectives to take advantage of the most
436// efficient methods available on a platform, e.g. RDMA, and not be
437// constrained to use the RPC system in use by other methods.
438//
439////////////////////////////////////////////////////////////////////////////////
440
441message RecvBufRequest {
442  // Use of the fields below may vary by implementation.  For example
443  // the buf_ptr and num_bytes may be set only for local operations and
444  // not sent on the wire, or only sent on the wire in one direction.
445
446  // Used at server side to find the correct BufRendezvous.
447  int64 step_id = 1;
448
449  // Arbitrary string identifying a BufRendezvous entry.
450  string buf_rendezvous_key = 2;
451
452  // Size of value expected, must agree with BufRendezvous entry.
453  int64 num_bytes = 3;
454
455  // When RDMA is in use, address of destination field on client.
456  fixed64 buf_ptr = 4;
457
458  // Optional information on client-side device locality.
459  DeviceLocality client_locality = 5;
460
461  // Optional information on server-side device locality.
462  DeviceLocality server_locality = 6;
463
464  // Optional, implementation-specific data.
465  google.protobuf.Any transport_options = 7;
466  // Optional, for annotating the timeline.
467  string src_device = 8;
468  string dst_device = 9;
469
470  // Depending on the RPC system in use, it may be necessary to set this
471  // id to detect resends of RPCs where the server is not aware that
472  // the prior RPC failed.
473  int64 request_id = 10;
474}
475
476message RecvBufResponse {
477  // Use of the fields below may vary by implementation.  Comments give
478  // intended use.
479
480  fixed64 buf_ptr = 1;  // Address of source field on server.
481  int64 num_bytes = 2;  // Byte length of buf_ptr field, if set.
482  bool is_dead = 3;     // True if value is 'dead' like a tensor.
483  // Optional, implementation-specific data.
484  google.protobuf.Any transport_options = 4;
485  // Optional, for timeline.
486  int64 send_start_micros = 5;
487}
488
489////////////////////////////////////////////////////////////////////////////////
490//
491// Collective Op dynamic group resolution messages.
492//
493////////////////////////////////////////////////////////////////////////////////
494
495// Supplies one or more device names as members of the group identified by
496// group_key.  Service will respond when all group_size devices become known.
497// All devices in group must have same type.
498message CompleteGroupRequest {
499  int32 group_key = 1;
500  int32 group_size = 2;
501  string device_type = 3;
502  repeated string device_name = 4;
503}
504
505// Gives the complete membership of the group identified by group_key.
506message CompleteGroupResponse {
507  int32 group_key = 1;
508  int32 group_size = 2;
509  string device_type = 3;
510  int32 num_tasks = 4;  // number of distinct tasks hosting the devices
511  repeated string device_name = 5;
512  repeated string task_name = 6;  // task name prefixes of device_names
513}
514
515// Supplies data about one collective op belonging to the instance identified
516// by instance_key.  Service will respond when all group_size ops have
517// become known.  Most of the data being sent is for correctness checking,
518// to ensure that all ops in the instance share common attributes.
519message CompleteInstanceRequest {
520  string name = 1;
521  int32 type = 2;
522  DataType data_type = 3;
523  TensorShapeProto shape = 4;
524  int32 group_key = 5;
525  int32 group_size = 6;
526  int32 instance_key = 7;
527  string device_type = 8;
528  repeated int32 subdiv_offset = 9;
529  string device = 10;
530  bool is_source = 11;
531}
532
533// Confirms that every op in the instance has consistently declared itself.
534// Also gives the source_rank in case of broadcast.
535message CompleteInstanceResponse {
536  int32 instance_key = 1;
537  int32 source_rank = 2;
538  bytes communicator_key = 3;
539}
540
541// Request for next agreed-upon step_id for the specified graph_keys.
542// This is used to enable multiple graphs containing nodes from
543// a common collective instance to coordinate using the same step_ids.
544message GetStepSequenceRequest {
545  repeated int64 graph_key = 1;
546}
547
548message StepSequence {
549  int64 graph_key = 1;
550  int64 next_step_id = 2;
551}
552
553// Next valid step_ids for one or more graph_keys.
554message GetStepSequenceResponse {
555  repeated StepSequence step_sequence = 1;
556}
557