1 /* Copyright 2016 The TensorFlow Authors. All Rights Reserved.
2 
3 Licensed under the Apache License, Version 2.0 (the "License");
4 you may not use this file except in compliance with the License.
5 You may obtain a copy of the License at
6 
7     http://www.apache.org/licenses/LICENSE-2.0
8 
9 Unless required by applicable law or agreed to in writing, software
10 distributed under the License is distributed on an "AS IS" BASIS,
11 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 See the License for the specific language governing permissions and
13 limitations under the License.
14 ==============================================================================*/
15 
16 #include "tensorflow/core/distributed_runtime/session_mgr.h"
17 
18 #include <utility>
19 
20 #include "tensorflow/core/common_runtime/device_mgr.h"
21 #include "tensorflow/core/common_runtime/renamed_device.h"
22 #include "tensorflow/core/distributed_runtime/graph_mgr.h"
23 #include "tensorflow/core/distributed_runtime/worker_cache_wrapper.h"
24 #include "tensorflow/core/lib/strings/strcat.h"
25 #include "tensorflow/core/protobuf/cluster.pb.h"
26 #include "tensorflow/core/protobuf/tensorflow_server.pb.h"
27 #include "tensorflow/core/util/ptr_util.h"
28 
29 namespace tensorflow {
30 
SessionMgr(WorkerEnv * worker_env,const string & default_worker_name,std::unique_ptr<WorkerCacheInterface> default_worker_cache,WorkerCacheFactory worker_cache_factory)31 SessionMgr::SessionMgr(
32     WorkerEnv* worker_env, const string& default_worker_name,
33     std::unique_ptr<WorkerCacheInterface> default_worker_cache,
34     WorkerCacheFactory worker_cache_factory)
35     : worker_env_(worker_env),
36       default_worker_cache_(std::move(default_worker_cache)),
37       legacy_session_(WorkerSession::CreateWithBorrowedDeviceMgr(
38           "", default_worker_name,
39           std::unique_ptr<WorkerCacheInterface>(
40               new WorkerCacheWrapper(default_worker_cache_.get())),
41           worker_env->device_mgr,
42           std::unique_ptr<GraphMgr>(
43               new GraphMgr(worker_env, worker_env->device_mgr)))),
44       worker_cache_factory_(std::move(worker_cache_factory)) {}
45 
46 /* static */
WorkerNameFromServerDef(const ServerDef & server_def)47 string SessionMgr::WorkerNameFromServerDef(const ServerDef& server_def) {
48   return strings::StrCat("/job:", server_def.job_name(), "/replica:0/task:",
49                          server_def.task_index());
50 }
51 
CreateSession(const string & session,const ServerDef & server_def,bool isolate_session_state)52 Status SessionMgr::CreateSession(const string& session,
53                                  const ServerDef& server_def,
54                                  bool isolate_session_state) {
55   mutex_lock l(mu_);
56   if (session.empty()) {
57     return errors::InvalidArgument("Session must be non-empty.");
58   }
59 
60   WorkerCacheInterface* worker_cache = nullptr;
61   string worker_name;
62   if (server_def.cluster().job().empty()) {
63     worker_cache = new WorkerCacheWrapper(default_worker_cache_.get());
64     worker_name = legacy_session_->worker_name;
65   } else {
66     TF_RETURN_IF_ERROR(worker_cache_factory_(server_def, &worker_cache));
67     worker_name = WorkerNameFromServerDef(server_def);
68   }
69 
70   if (worker_cache != nullptr && default_worker_cache_ != nullptr) {
71     worker_cache->SetLogging(this->is_logging_active_);
72   }
73 
74   CHECK(!worker_env_->local_devices.empty())
75       << "The WorkerEnv must have at least one device in `local_devices`.";
76 
77   std::shared_ptr<WorkerSession> worker_session;
78 
79   if (isolate_session_state) {
80     // Create a private copy of the DeviceMgr for the WorkerSession.
81     std::vector<std::unique_ptr<Device>> renamed_devices;
82     for (Device* d : worker_env_->local_devices) {
83       renamed_devices.push_back(RenamedDevice::NewRenamedDevice(
84           worker_name, d, false, isolate_session_state));
85     }
86 
87     auto device_mgr = MakeUnique<DeviceMgr>(std::move(renamed_devices));
88     auto graph_mgr = MakeUnique<GraphMgr>(worker_env_, device_mgr.get());
89     worker_session.reset(
90         new WorkerSession(session, worker_name,
91                           std::unique_ptr<WorkerCacheInterface>(worker_cache),
92                           std::move(device_mgr), std::move(graph_mgr)));
93   } else {
94     // Borrown the WorkerEnv's DeviceMgr for the WorkerSession, so
95     // that resources using it can use its devices after the
96     // WorkerSession has been deleted.
97     auto graph_mgr = MakeUnique<GraphMgr>(worker_env_, worker_env_->device_mgr);
98     worker_session = WorkerSession::CreateWithBorrowedDeviceMgr(
99         session, worker_name,
100         std::unique_ptr<WorkerCacheInterface>(worker_cache),
101         worker_env_->device_mgr, std::move(graph_mgr));
102   }
103 
104   sessions_.insert(std::make_pair(session, std::move(worker_session)));
105   return Status::OK();
106 }
107 
DeleteSession(const string & session)108 Status SessionMgr::DeleteSession(const string& session) {
109   mutex_lock l(mu_);
110   auto it = sessions_.find(session);
111   if (it != sessions_.end()) {
112     sessions_.erase(it);
113   }
114   return Status::OK();
115 }
116 
WorkerSessionForSessionLocked(const string & session_handle,std::shared_ptr<WorkerSession> * out_session)117 Status SessionMgr::WorkerSessionForSessionLocked(
118     const string& session_handle, std::shared_ptr<WorkerSession>* out_session) {
119   if (session_handle.empty()) {
120     *out_session = legacy_session_;
121   } else {
122     auto it = sessions_.find(session_handle);
123     if (it == sessions_.end()) {
124       return errors::Aborted("Session handle is not found: ", session_handle,
125                              ". Possibly this worker (\"",
126                              legacy_session_->worker_name,
127                              "\") just restarted.");
128     } else {
129       *out_session = it->second;
130     }
131   }
132   return Status::OK();
133 }
134 
WorkerSessionForSession(const string & session_handle,std::shared_ptr<WorkerSession> * out_session)135 Status SessionMgr::WorkerSessionForSession(
136     const string& session_handle, std::shared_ptr<WorkerSession>* out_session) {
137   mutex_lock l(mu_);
138   return WorkerSessionForSessionLocked(session_handle, out_session);
139 }
140 
LegacySession()141 std::shared_ptr<WorkerSession> SessionMgr::LegacySession() {
142   return legacy_session_;
143 }
144 
SetLogging(bool active)145 void SessionMgr::SetLogging(bool active) {
146   mutex_lock l(mu_);
147   this->is_logging_active_ = active;
148   // Legacy Session
149   if (legacy_session_) {
150     auto* worker_cache = legacy_session_->worker_cache.get();
151     if (worker_cache) {
152       worker_cache->SetLogging(active);
153     }
154   }
155 
156   for (const auto& session_kv : sessions_) {
157     auto session = session_kv.second.get();
158     if (session) {
159       auto* worker_cache = session->worker_cache.get();
160       if (worker_cache) {
161         worker_cache->SetLogging(active);
162       }
163     }
164   }
165 }
166 
RetrieveLogs(tensorflow::int64 step_id,LoggingResponse * response)167 void SessionMgr::RetrieveLogs(tensorflow::int64 step_id,
168                               LoggingResponse* response) {
169   mutex_lock l(mu_);
170   // Legacy Session
171   if (legacy_session_) {
172     auto* worker_cache = legacy_session_->worker_cache.get();
173     if (worker_cache) {
174       auto step_stats = StepStats();
175       if (worker_cache->RetrieveLogs(step_id, &step_stats)) {
176         auto* labeled_step_stats = response->add_step();
177         labeled_step_stats->set_step_id(step_id);
178         labeled_step_stats->mutable_step_stats()->Swap(&step_stats);
179       }
180     }
181   }
182   for (const auto& session_kv : sessions_) {
183     auto session = session_kv.second.get();
184     if (session) {
185       auto* worker_cache = session->worker_cache.get();
186       if (worker_cache) {
187         auto step_stats = StepStats();
188         if (worker_cache->RetrieveLogs(step_id, &step_stats)) {
189           auto* labeled_step_stats = response->add_step();
190           labeled_step_stats->set_step_id(step_id);
191           labeled_step_stats->mutable_step_stats()->Swap(&step_stats);
192         }
193       }
194     }
195   }
196 }
197 
ClearLogs()198 void SessionMgr::ClearLogs() {
199   mutex_lock l(mu_);
200   // Legacy Session
201   if (legacy_session_) {
202     auto* worker_cache = legacy_session_->worker_cache.get();
203     if (worker_cache) {
204       worker_cache->ClearLogs();
205     }
206   }
207 
208   for (const auto& session_kv : sessions_) {
209     auto session = session_kv.second.get();
210     if (session) {
211       auto* worker_cache = session->worker_cache.get();
212       if (worker_cache) {
213         worker_cache->ClearLogs();
214       }
215     }
216   }
217 }
218 }  // namespace tensorflow
219