1 /* Copyright 2015 The TensorFlow Authors. All Rights Reserved.
2 
3 Licensed under the Apache License, Version 2.0 (the "License");
4 you may not use this file except in compliance with the License.
5 You may obtain a copy of the License at
6 
7     http://www.apache.org/licenses/LICENSE-2.0
8 
9 Unless required by applicable law or agreed to in writing, software
10 distributed under the License is distributed on an "AS IS" BASIS,
11 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 See the License for the specific language governing permissions and
13 limitations under the License.
14 ==============================================================================*/
15 
16 #include "tensorflow/stream_executor/cuda/cuda_diagnostics.h"
17 
18 #if !defined(PLATFORM_WINDOWS)
19 #include <dirent.h>
20 #endif
21 
22 #include <limits.h>
23 #include <stddef.h>
24 #include <stdio.h>
25 #include <stdlib.h>
26 #include <string.h>
27 #ifdef __APPLE__
28 #include <IOKit/kext/KextManager.h>
29 #include <mach-o/dyld.h>
30 #else
31 #if !defined(PLATFORM_WINDOWS)
32 #include <link.h>
33 #include <sys/sysmacros.h>
34 #include <unistd.h>
35 #endif
36 #include <sys/stat.h>
37 #endif
38 #include <algorithm>
39 #include <memory>
40 #include <vector>
41 
42 #include "absl/container/inlined_vector.h"
43 #include "absl/strings/str_cat.h"
44 #include "tensorflow/stream_executor/lib/error.h"
45 #include "tensorflow/stream_executor/lib/numbers.h"
46 #include "tensorflow/stream_executor/lib/process_state.h"
47 #include "tensorflow/stream_executor/lib/status.h"
48 #include "tensorflow/stream_executor/lib/str_util.h"
49 #include "tensorflow/stream_executor/lib/stringprintf.h"
50 #include "tensorflow/stream_executor/platform/logging.h"
51 
52 namespace stream_executor {
53 namespace cuda {
54 
DriverVersionToString(DriverVersion version)55 string DriverVersionToString(DriverVersion version) {
56   return port::Printf("%d.%d.%d", std::get<0>(version), std::get<1>(version), std::get<2>(version));
57 }
58 
DriverVersionStatusToString(port::StatusOr<DriverVersion> version)59 string DriverVersionStatusToString(port::StatusOr<DriverVersion> version) {
60   if (!version.ok()) {
61     return version.status().ToString();
62   }
63 
64   return DriverVersionToString(version.ValueOrDie());
65 }
66 
StringToDriverVersion(const string & value)67 port::StatusOr<DriverVersion> StringToDriverVersion(const string &value) {
68   std::vector<string> pieces = port::Split(value, '.');
69   if (pieces.size() < 2 || pieces.size() > 4) {
70     return port::Status(
71         port::error::INVALID_ARGUMENT,
72         port::Printf("expected %%d.%%d, %%d.%%d.%%d, or %%d.%%d.%%d.%%d form "
73                      "for driver version; got \"%s\"",
74                      value.c_str()));
75   }
76 
77   int major;
78   int minor;
79   int patch = 0;
80   if (!port::safe_strto32(pieces[0], &major)) {
81     return port::Status(
82         port::error::INVALID_ARGUMENT,
83         port::Printf("could not parse major version number \"%s\" as an "
84                      "integer from string \"%s\"",
85                      pieces[0].c_str(), value.c_str()));
86   }
87   if (!port::safe_strto32(pieces[1], &minor)) {
88     return port::Status(
89         port::error::INVALID_ARGUMENT,
90         port::Printf("could not parse minor version number \"%s\" as an "
91                      "integer from string \"%s\"",
92                      pieces[1].c_str(), value.c_str()));
93   }
94   if (pieces.size() == 3 && !port::safe_strto32(pieces[2], &patch)) {
95     return port::Status(
96         port::error::INVALID_ARGUMENT,
97         port::Printf("could not parse patch version number \"%s\" as an "
98                      "integer from string \"%s\"",
99                      pieces[2].c_str(), value.c_str()));
100   }
101 
102   DriverVersion result{major, minor, patch};
103   VLOG(2) << "version string \"" << value << "\" made value "
104           << DriverVersionToString(result);
105   return result;
106 }
107 
108 }  // namespace cuda
109 }  // namespace stream_executor
110 
111 namespace stream_executor {
112 namespace gpu {
113 
#if defined(__APPLE__)
// Identifier used to look up the CUDA kernel extension among loaded kexts.
static const CFStringRef kDriverKextIdentifier = CFSTR("com.nvidia.CUDA");
#elif !defined(PLATFORM_WINDOWS)
// File that is checked for existence and read to obtain the kernel driver
// version on non-Apple, non-Windows platforms.
static const char *const kDriverVersionPath = "/proc/driver/nvidia/version";
#endif
119 
120 // -- class Diagnostician
121 
GetDevNodePath(int dev_node_ordinal)122 string Diagnostician::GetDevNodePath(int dev_node_ordinal) {
123   return absl::StrCat("/dev/nvidia", dev_node_ordinal);
124 }
125 
LogDiagnosticInformation()126 void Diagnostician::LogDiagnosticInformation() {
127 #ifdef __APPLE__
128   CFStringRef kext_ids[1];
129   kext_ids[0] = kDriverKextIdentifier;
130   CFArrayRef kext_id_query = CFArrayCreate(nullptr, (const void **)kext_ids, 1,
131                                            &kCFTypeArrayCallBacks);
132   CFDictionaryRef kext_infos =
133       KextManagerCopyLoadedKextInfo(kext_id_query, nullptr);
134   CFRelease(kext_id_query);
135 
136   CFDictionaryRef cuda_driver_info = nullptr;
137   if (CFDictionaryGetValueIfPresent(kext_infos, kDriverKextIdentifier,
138                                     (const void **)&cuda_driver_info)) {
139     bool started = CFBooleanGetValue((CFBooleanRef)CFDictionaryGetValue(
140         cuda_driver_info, CFSTR("OSBundleStarted")));
141     if (!started) {
142       LOG(INFO) << "kernel driver is installed, but does not appear to be "
143                    "running on this host "
144                 << "(" << port::Hostname() << ")";
145     }
146   } else {
147     LOG(INFO) << "kernel driver does not appear to be installed on this host "
148               << "(" << port::Hostname() << ")";
149   }
150   CFRelease(kext_infos);
151 #elif !defined(PLATFORM_WINDOWS)
152   if (access(kDriverVersionPath, F_OK) != 0) {
153     LOG(INFO) << "kernel driver does not appear to be running on this host "
154               << "(" << port::Hostname() << "): "
155               << "/proc/driver/nvidia/version does not exist";
156     return;
157   }
158   auto dev0_path = GetDevNodePath(0);
159   if (access(dev0_path.c_str(), F_OK) != 0) {
160     LOG(INFO) << "no NVIDIA GPU device is present: " << dev0_path
161               << " does not exist";
162     return;
163   }
164 #endif
165 
166   LOG(INFO) << "retrieving CUDA diagnostic information for host: "
167             << port::Hostname();
168 
169   LogDriverVersionInformation();
170 }
171 
LogDriverVersionInformation()172 /* static */ void Diagnostician::LogDriverVersionInformation() {
173   LOG(INFO) << "hostname: " << port::Hostname();
174 #ifndef PLATFORM_WINDOWS
175   if (VLOG_IS_ON(1)) {
176     const char *value = getenv("LD_LIBRARY_PATH");
177     string library_path = value == nullptr ? "" : value;
178     VLOG(1) << "LD_LIBRARY_PATH is: \"" << library_path << "\"";
179 
180     std::vector<string> pieces = port::Split(library_path, ':');
181     for (const auto &piece : pieces) {
182       if (piece.empty()) {
183         continue;
184       }
185       DIR *dir = opendir(piece.c_str());
186       if (dir == nullptr) {
187         VLOG(1) << "could not open \"" << piece << "\"";
188         continue;
189       }
190       while (dirent *entity = readdir(dir)) {
191         VLOG(1) << piece << " :: " << entity->d_name;
192       }
193       closedir(dir);
194     }
195   }
196   port::StatusOr<DriverVersion> dso_version = FindDsoVersion();
197   LOG(INFO) << "libcuda reported version is: "
198             << cuda::DriverVersionStatusToString(dso_version);
199 
200   port::StatusOr<DriverVersion> kernel_version = FindKernelDriverVersion();
201   LOG(INFO) << "kernel reported version is: "
202             << cuda::DriverVersionStatusToString(kernel_version);
203 #endif
204 
205   // OS X kernel driver does not report version accurately
206 #if !defined(__APPLE__) && !defined(PLATFORM_WINDOWS)
207   if (kernel_version.ok() && dso_version.ok()) {
208     WarnOnDsoKernelMismatch(dso_version, kernel_version);
209   }
210 #endif
211 }
212 
213 // Iterates through loaded DSOs with DlIteratePhdrCallback to find the
214 // driver-interfacing DSO version number. Returns it as a string.
FindDsoVersion()215 port::StatusOr<DriverVersion> Diagnostician::FindDsoVersion() {
216   port::StatusOr<DriverVersion> result(port::Status(
217       port::error::NOT_FOUND,
218       "was unable to find libcuda.so DSO loaded into this program"));
219 
220 #if defined(__APPLE__)
221   // OSX CUDA libraries have names like: libcuda_310.41.15_mercury.dylib
222   const string prefix("libcuda_");
223   const string suffix("_mercury.dylib");
224   for (uint32_t image_index = 0; image_index < _dyld_image_count();
225        ++image_index) {
226     const string path(_dyld_get_image_name(image_index));
227     const size_t suffix_pos = path.rfind(suffix);
228     const size_t prefix_pos = path.rfind(prefix, suffix_pos);
229     if (prefix_pos == string::npos || suffix_pos == string::npos) {
230       // no match
231       continue;
232     }
233     const size_t start = prefix_pos + prefix.size();
234     if (start >= suffix_pos) {
235       // version not included
236       continue;
237     }
238     const size_t length = suffix_pos - start;
239     const string version = path.substr(start, length);
240     result = cuda::StringToDriverVersion(version);
241   }
242 #else
243 #if !defined(PLATFORM_WINDOWS) && !defined(ANDROID_TEGRA)
244   // Callback used when iterating through DSOs. Looks for the driver-interfacing
245   // DSO and yields its version number into the callback data, when found.
246   auto iterate_phdr =
247       [](struct dl_phdr_info *info, size_t size, void *data) -> int {
248     if (strstr(info->dlpi_name, "libcuda.so.1")) {
249       VLOG(1) << "found DLL info with name: " << info->dlpi_name;
250       char resolved_path[PATH_MAX] = {0};
251       if (realpath(info->dlpi_name, resolved_path) == nullptr) {
252         return 0;
253       }
254       VLOG(1) << "found DLL info with resolved path: " << resolved_path;
255       const char *slash = rindex(resolved_path, '/');
256       if (slash == nullptr) {
257         return 0;
258       }
259       const char *so_suffix = ".so.";
260       const char *dot = strstr(slash, so_suffix);
261       if (dot == nullptr) {
262         return 0;
263       }
264       string dso_version = dot + strlen(so_suffix);
265       // TODO(b/22689637): Eliminate the explicit namespace if possible.
266       auto stripped_dso_version = port::StripSuffixString(dso_version, ".ld64");
267       auto result = static_cast<port::StatusOr<DriverVersion> *>(data);
268       *result = cuda::StringToDriverVersion(stripped_dso_version);
269       return 1;
270     }
271     return 0;
272   };
273 
274   dl_iterate_phdr(iterate_phdr, &result);
275 #endif
276 #endif
277 
278   return result;
279 }
280 
FindKernelModuleVersion(const string & driver_version_file_contents)281 port::StatusOr<DriverVersion> Diagnostician::FindKernelModuleVersion(
282     const string &driver_version_file_contents) {
283   static const char *kDriverFilePrelude = "Kernel Module  ";
284   size_t offset = driver_version_file_contents.find(kDriverFilePrelude);
285   if (offset == string::npos) {
286     return port::Status(
287         port::error::NOT_FOUND,
288         absl::StrCat("could not find kernel module information in "
289                      "driver version file contents: \"",
290                      driver_version_file_contents, "\""));
291   }
292 
293   string version_and_rest = driver_version_file_contents.substr(
294       offset + strlen(kDriverFilePrelude), string::npos);
295   size_t space_index = version_and_rest.find(" ");
296   auto kernel_version = version_and_rest.substr(0, space_index);
297   // TODO(b/22689637): Eliminate the explicit namespace if possible.
298   auto stripped_kernel_version =
299       port::StripSuffixString(kernel_version, ".ld64");
300   return cuda::StringToDriverVersion(stripped_kernel_version);
301 }
302 
WarnOnDsoKernelMismatch(port::StatusOr<DriverVersion> dso_version,port::StatusOr<DriverVersion> kernel_version)303 void Diagnostician::WarnOnDsoKernelMismatch(
304     port::StatusOr<DriverVersion> dso_version,
305     port::StatusOr<DriverVersion> kernel_version) {
306   if (kernel_version.ok() && dso_version.ok() &&
307       dso_version.ValueOrDie() == kernel_version.ValueOrDie()) {
308     LOG(INFO) << "kernel version seems to match DSO: "
309               << cuda::DriverVersionToString(kernel_version.ValueOrDie());
310   } else {
311     LOG(ERROR) << "kernel version "
312                << cuda::DriverVersionStatusToString(kernel_version)
313                << " does not match DSO version "
314                << cuda::DriverVersionStatusToString(dso_version)
315                << " -- cannot find working devices in this configuration";
316   }
317 }
318 
319 
FindKernelDriverVersion()320 port::StatusOr<DriverVersion> Diagnostician::FindKernelDriverVersion() {
321 #if defined(__APPLE__)
322   CFStringRef kext_ids[1];
323   kext_ids[0] = kDriverKextIdentifier;
324   CFArrayRef kext_id_query = CFArrayCreate(nullptr, (const void **)kext_ids, 1,
325                                            &kCFTypeArrayCallBacks);
326   CFDictionaryRef kext_infos =
327       KextManagerCopyLoadedKextInfo(kext_id_query, nullptr);
328   CFRelease(kext_id_query);
329 
330   CFDictionaryRef cuda_driver_info = nullptr;
331   if (CFDictionaryGetValueIfPresent(kext_infos, kDriverKextIdentifier,
332                                     (const void **)&cuda_driver_info)) {
333     // NOTE: OSX CUDA driver does not currently store the same driver version
334     // in kCFBundleVersionKey as is returned by cuDriverGetVersion
335     CFRelease(kext_infos);
336     const CFStringRef str = (CFStringRef)CFDictionaryGetValue(
337         cuda_driver_info, kCFBundleVersionKey);
338     const char *version = CFStringGetCStringPtr(str, kCFStringEncodingUTF8);
339 
340     // version can be NULL in which case treat it as empty string
341     // see
342     // https://developer.apple.com/library/mac/documentation/CoreFoundation/Conceptual/CFStrings/Articles/AccessingContents.html#//apple_ref/doc/uid/20001184-100980-TPXREF112
343     if (version == NULL) {
344       return cuda::StringToDriverVersion("");
345     }
346     return cuda::StringToDriverVersion(version);
347   }
348   CFRelease(kext_infos);
349   auto status = port::Status(
350       port::error::INTERNAL,
351       absl::StrCat(
352           "failed to read driver bundle version: ",
353           CFStringGetCStringPtr(kDriverKextIdentifier, kCFStringEncodingUTF8)));
354   return status;
355 #elif defined(PLATFORM_WINDOWS)
356   auto status =
357       port::Status(port::error::UNIMPLEMENTED,
358                    "kernel reported driver version not implemented on Windows");
359   return status;
360 #else
361   FILE *driver_version_file = fopen(kDriverVersionPath, "r");
362   if (driver_version_file == nullptr) {
363     return port::Status(
364         port::error::PERMISSION_DENIED,
365         absl::StrCat("could not open driver version path for reading: ",
366                      kDriverVersionPath));
367   }
368 
369   static const int kContentsSize = 1024;
370   absl::InlinedVector<char, 4> contents(kContentsSize);
371   size_t retcode =
372       fread(contents.begin(), 1, kContentsSize - 2, driver_version_file);
373   if (retcode < kContentsSize - 1) {
374     contents[retcode] = '\0';
375   }
376   contents[kContentsSize - 1] = '\0';
377 
378   if (retcode != 0) {
379     VLOG(1) << "driver version file contents: \"\"\"" << contents.begin()
380             << "\"\"\"";
381     fclose(driver_version_file);
382     return FindKernelModuleVersion(contents.begin());
383   }
384 
385   auto status = port::Status(
386       port::error::INTERNAL,
387       absl::StrCat(
388           "failed to read driver version file contents: ", kDriverVersionPath,
389           "; ferror: ", ferror(driver_version_file)));
390   fclose(driver_version_file);
391   return status;
392 #endif
393 }
394 
395 }  // namespace gpu
396 }  // namespace stream_executor
397