1 /* Copyright 2015 The TensorFlow Authors. All Rights Reserved.
2
3 Licensed under the Apache License, Version 2.0 (the "License");
4 you may not use this file except in compliance with the License.
5 You may obtain a copy of the License at
6
7 http://www.apache.org/licenses/LICENSE-2.0
8
9 Unless required by applicable law or agreed to in writing, software
10 distributed under the License is distributed on an "AS IS" BASIS,
11 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 See the License for the specific language governing permissions and
13 limitations under the License.
14 ==============================================================================*/
15
16 #include "tensorflow/stream_executor/cuda/cuda_diagnostics.h"
17
18 #if !defined(PLATFORM_WINDOWS)
19 #include <dirent.h>
20 #endif
21
22 #include <limits.h>
23 #include <stddef.h>
24 #include <stdio.h>
25 #include <stdlib.h>
26 #include <string.h>
27 #ifdef __APPLE__
28 #include <IOKit/kext/KextManager.h>
29 #include <mach-o/dyld.h>
30 #else
31 #if !defined(PLATFORM_WINDOWS)
32 #include <link.h>
33 #include <sys/sysmacros.h>
34 #include <unistd.h>
35 #endif
36 #include <sys/stat.h>
37 #endif
38 #include <algorithm>
39 #include <memory>
40 #include <vector>
41
42 #include "absl/container/inlined_vector.h"
43 #include "absl/strings/str_cat.h"
44 #include "tensorflow/stream_executor/lib/error.h"
45 #include "tensorflow/stream_executor/lib/numbers.h"
46 #include "tensorflow/stream_executor/lib/process_state.h"
47 #include "tensorflow/stream_executor/lib/status.h"
48 #include "tensorflow/stream_executor/lib/str_util.h"
49 #include "tensorflow/stream_executor/lib/stringprintf.h"
50 #include "tensorflow/stream_executor/platform/logging.h"
51
52 namespace stream_executor {
53 namespace cuda {
54
DriverVersionToString(DriverVersion version)55 string DriverVersionToString(DriverVersion version) {
56 return port::Printf("%d.%d.%d", std::get<0>(version), std::get<1>(version), std::get<2>(version));
57 }
58
DriverVersionStatusToString(port::StatusOr<DriverVersion> version)59 string DriverVersionStatusToString(port::StatusOr<DriverVersion> version) {
60 if (!version.ok()) {
61 return version.status().ToString();
62 }
63
64 return DriverVersionToString(version.ValueOrDie());
65 }
66
StringToDriverVersion(const string & value)67 port::StatusOr<DriverVersion> StringToDriverVersion(const string &value) {
68 std::vector<string> pieces = port::Split(value, '.');
69 if (pieces.size() < 2 || pieces.size() > 4) {
70 return port::Status(
71 port::error::INVALID_ARGUMENT,
72 port::Printf("expected %%d.%%d, %%d.%%d.%%d, or %%d.%%d.%%d.%%d form "
73 "for driver version; got \"%s\"",
74 value.c_str()));
75 }
76
77 int major;
78 int minor;
79 int patch = 0;
80 if (!port::safe_strto32(pieces[0], &major)) {
81 return port::Status(
82 port::error::INVALID_ARGUMENT,
83 port::Printf("could not parse major version number \"%s\" as an "
84 "integer from string \"%s\"",
85 pieces[0].c_str(), value.c_str()));
86 }
87 if (!port::safe_strto32(pieces[1], &minor)) {
88 return port::Status(
89 port::error::INVALID_ARGUMENT,
90 port::Printf("could not parse minor version number \"%s\" as an "
91 "integer from string \"%s\"",
92 pieces[1].c_str(), value.c_str()));
93 }
94 if (pieces.size() == 3 && !port::safe_strto32(pieces[2], &patch)) {
95 return port::Status(
96 port::error::INVALID_ARGUMENT,
97 port::Printf("could not parse patch version number \"%s\" as an "
98 "integer from string \"%s\"",
99 pieces[2].c_str(), value.c_str()));
100 }
101
102 DriverVersion result{major, minor, patch};
103 VLOG(2) << "version string \"" << value << "\" made value "
104 << DriverVersionToString(result);
105 return result;
106 }
107
108 } // namespace cuda
109 } // namespace stream_executor
110
111 namespace stream_executor {
112 namespace gpu {
113
#if defined(__APPLE__)
// Bundle identifier used to query the loaded-kext table for the CUDA driver.
static const CFStringRef kDriverKextIdentifier = CFSTR("com.nvidia.CUDA");
#elif !defined(PLATFORM_WINDOWS)
// File that is probed for existence and read to obtain the kernel driver
// version.
static const char *kDriverVersionPath = "/proc/driver/nvidia/version";
#endif
119
120 // -- class Diagnostician
121
GetDevNodePath(int dev_node_ordinal)122 string Diagnostician::GetDevNodePath(int dev_node_ordinal) {
123 return absl::StrCat("/dev/nvidia", dev_node_ordinal);
124 }
125
LogDiagnosticInformation()126 void Diagnostician::LogDiagnosticInformation() {
127 #ifdef __APPLE__
128 CFStringRef kext_ids[1];
129 kext_ids[0] = kDriverKextIdentifier;
130 CFArrayRef kext_id_query = CFArrayCreate(nullptr, (const void **)kext_ids, 1,
131 &kCFTypeArrayCallBacks);
132 CFDictionaryRef kext_infos =
133 KextManagerCopyLoadedKextInfo(kext_id_query, nullptr);
134 CFRelease(kext_id_query);
135
136 CFDictionaryRef cuda_driver_info = nullptr;
137 if (CFDictionaryGetValueIfPresent(kext_infos, kDriverKextIdentifier,
138 (const void **)&cuda_driver_info)) {
139 bool started = CFBooleanGetValue((CFBooleanRef)CFDictionaryGetValue(
140 cuda_driver_info, CFSTR("OSBundleStarted")));
141 if (!started) {
142 LOG(INFO) << "kernel driver is installed, but does not appear to be "
143 "running on this host "
144 << "(" << port::Hostname() << ")";
145 }
146 } else {
147 LOG(INFO) << "kernel driver does not appear to be installed on this host "
148 << "(" << port::Hostname() << ")";
149 }
150 CFRelease(kext_infos);
151 #elif !defined(PLATFORM_WINDOWS)
152 if (access(kDriverVersionPath, F_OK) != 0) {
153 LOG(INFO) << "kernel driver does not appear to be running on this host "
154 << "(" << port::Hostname() << "): "
155 << "/proc/driver/nvidia/version does not exist";
156 return;
157 }
158 auto dev0_path = GetDevNodePath(0);
159 if (access(dev0_path.c_str(), F_OK) != 0) {
160 LOG(INFO) << "no NVIDIA GPU device is present: " << dev0_path
161 << " does not exist";
162 return;
163 }
164 #endif
165
166 LOG(INFO) << "retrieving CUDA diagnostic information for host: "
167 << port::Hostname();
168
169 LogDriverVersionInformation();
170 }
171
LogDriverVersionInformation()172 /* static */ void Diagnostician::LogDriverVersionInformation() {
173 LOG(INFO) << "hostname: " << port::Hostname();
174 #ifndef PLATFORM_WINDOWS
175 if (VLOG_IS_ON(1)) {
176 const char *value = getenv("LD_LIBRARY_PATH");
177 string library_path = value == nullptr ? "" : value;
178 VLOG(1) << "LD_LIBRARY_PATH is: \"" << library_path << "\"";
179
180 std::vector<string> pieces = port::Split(library_path, ':');
181 for (const auto &piece : pieces) {
182 if (piece.empty()) {
183 continue;
184 }
185 DIR *dir = opendir(piece.c_str());
186 if (dir == nullptr) {
187 VLOG(1) << "could not open \"" << piece << "\"";
188 continue;
189 }
190 while (dirent *entity = readdir(dir)) {
191 VLOG(1) << piece << " :: " << entity->d_name;
192 }
193 closedir(dir);
194 }
195 }
196 port::StatusOr<DriverVersion> dso_version = FindDsoVersion();
197 LOG(INFO) << "libcuda reported version is: "
198 << cuda::DriverVersionStatusToString(dso_version);
199
200 port::StatusOr<DriverVersion> kernel_version = FindKernelDriverVersion();
201 LOG(INFO) << "kernel reported version is: "
202 << cuda::DriverVersionStatusToString(kernel_version);
203 #endif
204
205 // OS X kernel driver does not report version accurately
206 #if !defined(__APPLE__) && !defined(PLATFORM_WINDOWS)
207 if (kernel_version.ok() && dso_version.ok()) {
208 WarnOnDsoKernelMismatch(dso_version, kernel_version);
209 }
210 #endif
211 }
212
213 // Iterates through loaded DSOs with DlIteratePhdrCallback to find the
214 // driver-interfacing DSO version number. Returns it as a string.
FindDsoVersion()215 port::StatusOr<DriverVersion> Diagnostician::FindDsoVersion() {
216 port::StatusOr<DriverVersion> result(port::Status(
217 port::error::NOT_FOUND,
218 "was unable to find libcuda.so DSO loaded into this program"));
219
220 #if defined(__APPLE__)
221 // OSX CUDA libraries have names like: libcuda_310.41.15_mercury.dylib
222 const string prefix("libcuda_");
223 const string suffix("_mercury.dylib");
224 for (uint32_t image_index = 0; image_index < _dyld_image_count();
225 ++image_index) {
226 const string path(_dyld_get_image_name(image_index));
227 const size_t suffix_pos = path.rfind(suffix);
228 const size_t prefix_pos = path.rfind(prefix, suffix_pos);
229 if (prefix_pos == string::npos || suffix_pos == string::npos) {
230 // no match
231 continue;
232 }
233 const size_t start = prefix_pos + prefix.size();
234 if (start >= suffix_pos) {
235 // version not included
236 continue;
237 }
238 const size_t length = suffix_pos - start;
239 const string version = path.substr(start, length);
240 result = cuda::StringToDriverVersion(version);
241 }
242 #else
243 #if !defined(PLATFORM_WINDOWS) && !defined(ANDROID_TEGRA)
244 // Callback used when iterating through DSOs. Looks for the driver-interfacing
245 // DSO and yields its version number into the callback data, when found.
246 auto iterate_phdr =
247 [](struct dl_phdr_info *info, size_t size, void *data) -> int {
248 if (strstr(info->dlpi_name, "libcuda.so.1")) {
249 VLOG(1) << "found DLL info with name: " << info->dlpi_name;
250 char resolved_path[PATH_MAX] = {0};
251 if (realpath(info->dlpi_name, resolved_path) == nullptr) {
252 return 0;
253 }
254 VLOG(1) << "found DLL info with resolved path: " << resolved_path;
255 const char *slash = rindex(resolved_path, '/');
256 if (slash == nullptr) {
257 return 0;
258 }
259 const char *so_suffix = ".so.";
260 const char *dot = strstr(slash, so_suffix);
261 if (dot == nullptr) {
262 return 0;
263 }
264 string dso_version = dot + strlen(so_suffix);
265 // TODO(b/22689637): Eliminate the explicit namespace if possible.
266 auto stripped_dso_version = port::StripSuffixString(dso_version, ".ld64");
267 auto result = static_cast<port::StatusOr<DriverVersion> *>(data);
268 *result = cuda::StringToDriverVersion(stripped_dso_version);
269 return 1;
270 }
271 return 0;
272 };
273
274 dl_iterate_phdr(iterate_phdr, &result);
275 #endif
276 #endif
277
278 return result;
279 }
280
FindKernelModuleVersion(const string & driver_version_file_contents)281 port::StatusOr<DriverVersion> Diagnostician::FindKernelModuleVersion(
282 const string &driver_version_file_contents) {
283 static const char *kDriverFilePrelude = "Kernel Module ";
284 size_t offset = driver_version_file_contents.find(kDriverFilePrelude);
285 if (offset == string::npos) {
286 return port::Status(
287 port::error::NOT_FOUND,
288 absl::StrCat("could not find kernel module information in "
289 "driver version file contents: \"",
290 driver_version_file_contents, "\""));
291 }
292
293 string version_and_rest = driver_version_file_contents.substr(
294 offset + strlen(kDriverFilePrelude), string::npos);
295 size_t space_index = version_and_rest.find(" ");
296 auto kernel_version = version_and_rest.substr(0, space_index);
297 // TODO(b/22689637): Eliminate the explicit namespace if possible.
298 auto stripped_kernel_version =
299 port::StripSuffixString(kernel_version, ".ld64");
300 return cuda::StringToDriverVersion(stripped_kernel_version);
301 }
302
WarnOnDsoKernelMismatch(port::StatusOr<DriverVersion> dso_version,port::StatusOr<DriverVersion> kernel_version)303 void Diagnostician::WarnOnDsoKernelMismatch(
304 port::StatusOr<DriverVersion> dso_version,
305 port::StatusOr<DriverVersion> kernel_version) {
306 if (kernel_version.ok() && dso_version.ok() &&
307 dso_version.ValueOrDie() == kernel_version.ValueOrDie()) {
308 LOG(INFO) << "kernel version seems to match DSO: "
309 << cuda::DriverVersionToString(kernel_version.ValueOrDie());
310 } else {
311 LOG(ERROR) << "kernel version "
312 << cuda::DriverVersionStatusToString(kernel_version)
313 << " does not match DSO version "
314 << cuda::DriverVersionStatusToString(dso_version)
315 << " -- cannot find working devices in this configuration";
316 }
317 }
318
319
FindKernelDriverVersion()320 port::StatusOr<DriverVersion> Diagnostician::FindKernelDriverVersion() {
321 #if defined(__APPLE__)
322 CFStringRef kext_ids[1];
323 kext_ids[0] = kDriverKextIdentifier;
324 CFArrayRef kext_id_query = CFArrayCreate(nullptr, (const void **)kext_ids, 1,
325 &kCFTypeArrayCallBacks);
326 CFDictionaryRef kext_infos =
327 KextManagerCopyLoadedKextInfo(kext_id_query, nullptr);
328 CFRelease(kext_id_query);
329
330 CFDictionaryRef cuda_driver_info = nullptr;
331 if (CFDictionaryGetValueIfPresent(kext_infos, kDriverKextIdentifier,
332 (const void **)&cuda_driver_info)) {
333 // NOTE: OSX CUDA driver does not currently store the same driver version
334 // in kCFBundleVersionKey as is returned by cuDriverGetVersion
335 CFRelease(kext_infos);
336 const CFStringRef str = (CFStringRef)CFDictionaryGetValue(
337 cuda_driver_info, kCFBundleVersionKey);
338 const char *version = CFStringGetCStringPtr(str, kCFStringEncodingUTF8);
339
340 // version can be NULL in which case treat it as empty string
341 // see
342 // https://developer.apple.com/library/mac/documentation/CoreFoundation/Conceptual/CFStrings/Articles/AccessingContents.html#//apple_ref/doc/uid/20001184-100980-TPXREF112
343 if (version == NULL) {
344 return cuda::StringToDriverVersion("");
345 }
346 return cuda::StringToDriverVersion(version);
347 }
348 CFRelease(kext_infos);
349 auto status = port::Status(
350 port::error::INTERNAL,
351 absl::StrCat(
352 "failed to read driver bundle version: ",
353 CFStringGetCStringPtr(kDriverKextIdentifier, kCFStringEncodingUTF8)));
354 return status;
355 #elif defined(PLATFORM_WINDOWS)
356 auto status =
357 port::Status(port::error::UNIMPLEMENTED,
358 "kernel reported driver version not implemented on Windows");
359 return status;
360 #else
361 FILE *driver_version_file = fopen(kDriverVersionPath, "r");
362 if (driver_version_file == nullptr) {
363 return port::Status(
364 port::error::PERMISSION_DENIED,
365 absl::StrCat("could not open driver version path for reading: ",
366 kDriverVersionPath));
367 }
368
369 static const int kContentsSize = 1024;
370 absl::InlinedVector<char, 4> contents(kContentsSize);
371 size_t retcode =
372 fread(contents.begin(), 1, kContentsSize - 2, driver_version_file);
373 if (retcode < kContentsSize - 1) {
374 contents[retcode] = '\0';
375 }
376 contents[kContentsSize - 1] = '\0';
377
378 if (retcode != 0) {
379 VLOG(1) << "driver version file contents: \"\"\"" << contents.begin()
380 << "\"\"\"";
381 fclose(driver_version_file);
382 return FindKernelModuleVersion(contents.begin());
383 }
384
385 auto status = port::Status(
386 port::error::INTERNAL,
387 absl::StrCat(
388 "failed to read driver version file contents: ", kDriverVersionPath,
389 "; ferror: ", ferror(driver_version_file)));
390 fclose(driver_version_file);
391 return status;
392 #endif
393 }
394
395 } // namespace gpu
396 } // namespace stream_executor
397