1syntax = "proto3";
2
3package tensorflow.eager;
4
5import "tensorflow/core/framework/attr_value.proto";
6import "tensorflow/core/framework/device_attributes.proto";
7import "tensorflow/core/framework/function.proto";
8import "tensorflow/core/framework/versions.proto";
9import "tensorflow/core/protobuf/tensorflow_server.proto";
10import "tensorflow/core/framework/tensor_shape.proto";
11import "tensorflow/core/framework/tensor.proto";
12
// Identifies a remote tensor by the <op ID, output number> pair of the
// operation that produced it.
message RemoteTensorHandle {
  // ID of the operation that produced the tensor.
  int64 op_id = 1;

  // Position of the tensor within the outputs of the producing operation.
  int32 output_num = 2;
}
19
// Proto form of a single eager operation.
message Operation {
  // Client-assigned identifier, unique per operation, that lets the client
  // refer unambiguously to the outputs of the scheduled operation.
  //
  // In the initial implementation, sending the same ID more than once has
  // undefined behaviour; further constraints may be imposed in the future.
  int64 id = 1;

  // Name of the operation.
  // NOTE(review): presumably the registered op (or function) name rather than
  // a per-node name -- confirm against the service implementation.
  string name = 2;

  // Handles of the tensors this operation takes as inputs.
  repeated RemoteTensorHandle inputs = 3;

  // IDs of control operations. These are respected when async execution
  // re-orders ops; when async execution (and the op re-ordering that comes
  // with it) is not enabled, they should have no effect.
  repeated int64 control_op_ids = 4;

  // Attributes of the operation.
  // NOTE(review): presumably keyed by attribute name -- confirm.
  map<string, AttrValue> attrs = 5;

  // Device for the operation.
  // NOTE(review): presumably the device the op should run on -- confirm.
  string device = 6;
}
39
// A single entry of an EnqueueRequest queue.
//
// Tensor lifetime is maintained by the client, so the remote executor has to
// handle both kinds of entry: an operation to execute directly, or an unused
// tensor handle to release.
message QueueItem {
  oneof item {
    // Tensor handle that is no longer used and should be released.
    RemoteTensorHandle handle_to_decref = 1;

    // Operation to execute.
    Operation operation = 2;
  }
}
49
// Per-item result carried in EnqueueResponse.queue_response.
message QueueResponse {
  // NOTE(review): presumably the shapes of the tensors produced by the
  // corresponding operation, one entry per output -- confirm against the
  // service implementation.
  repeated TensorShapeProto shape = 1;
}
53
// Request for EagerService.CreateContext.
message CreateContextRequest {
  // Describes the full cluster and the position of this particular worker
  // within it.
  ServerDef server_def = 1;

  // Selects asynchronous (true) or synchronous (false) execution of ops on
  // the worker. Synchronous execution is the default.
  //
  // NOTE(review): `async` is a reserved word in Python 3.7+, so generated
  // Python code can only reach this field through getattr()/setattr(). The
  // name is kept as-is because renaming it would break existing callers.
  bool async = 2;

  // How long, in seconds, the context is kept alive. A context that has not
  // been communicated with for more than keep_alive_secs is garbage
  // collected.
  int64 keep_alive_secs = 3;

  // Version that applies to all ops the client will enqueue.
  VersionDef version_def = 4;

  // ID used for all future communications. Both ends must use this ID when
  // selecting a rendezvous, otherwise things will not match up.
  int64 rendezvous_id = 5;
}
75
// Response for EagerService.CreateContext.
message CreateContextResponse {
  // ID of the newly created context, usually a randomly generated number.
  // Future requests to the service use it to identify the context. Contexts
  // do not survive a server restart.
  fixed64 context_id = 1;

  // Devices that the worker can access locally.
  repeated DeviceAttributes device_attributes = 2;
}
85
// Request for EagerService.Enqueue.
message EnqueueRequest {
  // Field number 2 is not assigned in this message. It is reserved so that a
  // future field cannot pick it up by accident.
  // NOTE(review): confirm whether 2 belonged to a since-removed field.
  reserved 2;

  // ID of the context the items are enqueued on, as handed out in
  // CreateContextResponse.
  fixed64 context_id = 1;

  // Items to process: operations to run and tensor handles to release.
  repeated QueueItem queue = 3;
}
91
// Response for EagerService.Enqueue.
message EnqueueResponse {
  // One response for every item in the request.
  repeated QueueResponse queue_response = 1;
}
96
// Request for EagerService.WaitQueueDone.
message WaitQueueDoneRequest {
  // ID of the context whose ops are waited on.
  fixed64 context_id = 1;

  // IDs of the ops to wait for. An empty list means: wait for everything that
  // is currently pending.
  repeated int64 op_id = 2;
}
103
// Response for EagerService.WaitQueueDone. Carries no fields at present.
message WaitQueueDoneResponse {
  // TODO(nareshmodi): Consider adding NodeExecStats here so that some stats
  // can be propagated.
}
108
// Request for EagerService.KeepAlive.
message KeepAliveRequest {
  // ID of the context to keep alive.
  fixed64 context_id = 1;
}
112
// Response for EagerService.KeepAlive. Carries no fields at present.
message KeepAliveResponse {}
115
// Request for EagerService.CloseContext.
message CloseContextRequest {
  // ID of the context to close.
  fixed64 context_id = 1;
}
119
// Response for EagerService.CloseContext. Carries no fields at present.
message CloseContextResponse {}
122
// Request for EagerService.RegisterFunction.
message RegisterFunctionRequest {
  // ID of the context the function is registered with.
  fixed64 context_id = 1;

  // Definition of the function to register.
  FunctionDef function_def = 2;
}
128
// Response for EagerService.RegisterFunction. Carries no fields at present.
message RegisterFunctionResponse {}
131
// Request for EagerService.SendTensor.
message SendTensorRequest {
  // ID of the context the tensors are sent to.
  fixed64 context_id = 1;

  // Every remote tensor is identified by <op ID, output number>. Tensors that
  // are sent directly have no producing op, so an "artificial" op ID is
  // supplied here to mimic that scheme; it corresponds to the _Recv op that
  // would have been used had SendTensor not been used.
  int64 op_id = 2;

  // Tensors to send. The index of a tensor within this list is its output
  // number, which together with op_id uniquely identifies the tensor.
  repeated TensorProto tensors = 3;

  // Device on which the tensors should be resident.
  string device_name = 4;
}
146
// Response for EagerService.SendTensor. Carries no fields at present.
message SendTensorResponse {}
149
////////////////////////////////////////////////////////////////////////////////
//
// EagerService is a TensorFlow service that eagerly executes operations on a
// set of local devices, on behalf of a remote Eager executor.
//
// The service implementation keeps track of the clients and devices it has
// access to. It lets a client enqueue ops on any device the client is able to
// access, and schedule data transfers from/to any of the peers.
//
// A client may create several contexts in order to execute operations
// independently of each other, but it cannot share data between those
// contexts.
//
// NOTE: Contexts created by clients are meant to be independent, but the
// lower level TensorFlow execution engine is not, so contexts might still
// share some data (e.g. a Device's ResourceMgr).
//
////////////////////////////////////////////////////////////////////////////////
service EagerService {
  // Initializes the worker: informs it about the other workers in the cluster
  // and exchanges authentication tokens, which are used in all other RPCs to
  // detect whether the worker has restarted.
  rpc CreateContext(CreateContextRequest) returns (CreateContextResponse);

  // Takes a list of queue items -- operations to execute and tensor handles
  // to release -- and enqueues (in async mode) or executes (in sync mode)
  // them on the remote server. Outputs of ops that have not been explicitly
  // released are assumed to be alive and remain usable by future calls to
  // Enqueue.
  rpc Enqueue(EnqueueRequest) returns (EnqueueResponse);

  // Takes a set of op IDs and waits until those ops are done. Returns any
  // error in the stream so far.
  rpc WaitQueueDone(WaitQueueDoneRequest) returns (WaitQueueDoneResponse);

  // Every context is created with a deadline; a context that receives no RPC
  // within its deadline is garbage collected. KeepAlive calls can be used to
  // delay this.
  rpc KeepAlive(KeepAliveRequest) returns (KeepAliveResponse);

  // Closes the context. After this, no call to any other method using the
  // existing context ID is valid.
  rpc CloseContext(CloseContextRequest) returns (CloseContextResponse);

  // Takes a FunctionDef and makes it enqueueable on the remote worker.
  rpc RegisterFunction(RegisterFunctionRequest)
      returns (RegisterFunctionResponse);

  // Pushes tensors to the server. This exists because certain environments
  // do not allow the server to connect back to the client.
  rpc SendTensor(SendTensorRequest) returns (SendTensorResponse);
}
201