1 /* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
2 
3 Licensed under the Apache License, Version 2.0 (the "License");
4 you may not use this file except in compliance with the License.
5 You may obtain a copy of the License at
6 
7     http://www.apache.org/licenses/LICENSE-2.0
8 
9 Unless required by applicable law or agreed to in writing, software
10 distributed under the License is distributed on an "AS IS" BASIS,
11 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 See the License for the specific language governing permissions and
13 limitations under the License.
14 ==============================================================================*/
15 
16 #include "tensorflow/stream_executor/gpu/asm_compiler.h"
17 
18 #include "absl/container/flat_hash_map.h"
19 #include "absl/container/flat_hash_set.h"
20 #include "absl/strings/str_format.h"
21 #include "absl/synchronization/mutex.h"
22 #include "tensorflow/core/lib/core/errors.h"
23 #include "tensorflow/core/lib/gtl/cleanup.h"
24 #include "tensorflow/core/lib/io/path.h"
25 #include "tensorflow/core/platform/cuda_libdevice_path.h"
26 #include "tensorflow/core/platform/env.h"
27 #include "tensorflow/core/platform/mutex.h"
28 #include "tensorflow/core/platform/regexp.h"
29 #include "tensorflow/core/platform/subprocess.h"
30 #include "tensorflow/stream_executor/gpu/gpu_driver.h"
31 #include "tensorflow/stream_executor/lib/statusor.h"
32 
33 namespace stream_executor {
34 
35 // Prints a warning if the ptxas at ptxas_path has known bugs.
36 //
37 // Only prints a warning the first time it's called for a particular value of
38 // ptxas_path.
39 //
40 // Locks on entry.
WarnIfBadPtxasVersion(const std::string & ptxas_path)41 static void WarnIfBadPtxasVersion(const std::string& ptxas_path) {
42   static tensorflow::mutex mu(tensorflow::LINKER_INITIALIZED);
43   static std::unordered_set<std::string>* seen_ptxas_paths TF_GUARDED_BY(mu) =
44       new std::unordered_set<std::string>();
45 
46   tensorflow::mutex_lock lock(mu);
47   if (!seen_ptxas_paths->insert(ptxas_path).second) {
48     // Already checked this ptx binary, nothing to do.
49     return;
50   }
51 
52   tensorflow::SubProcess ptxas;
53   ptxas.SetProgram(ptxas_path, {ptxas_path, "--version"});
54   ptxas.SetChannelAction(tensorflow::CHAN_STDOUT, tensorflow::ACTION_PIPE);
55   if (!ptxas.Start()) {
56     LOG(WARNING) << "Couldn't invoke " << ptxas_path << " --version";
57     return;
58   }
59 
60   std::string out;
61   int exit_code = ptxas.Communicate(/*stdin_input=*/nullptr, &out,
62                                     /*stderr_output=*/nullptr);
63   if (exit_code != 0) {
64     LOG(WARNING) << "Running " << ptxas_path << " --version returned "
65                  << exit_code;
66     return;
67   }
68 
69   int64 vmaj, vmin, vdot;
70   std::string vmaj_str, vmin_str, vdot_str;
71   if (!RE2::PartialMatch(out, R"(\bV(\d+)\.(\d+)\.(\d+)\b)", &vmaj_str,
72                          &vmin_str, &vdot_str) ||
73       !absl::SimpleAtoi(vmaj_str, &vmaj) ||
74       !absl::SimpleAtoi(vmin_str, &vmin) ||
75       !absl::SimpleAtoi(vdot_str, &vdot)) {
76     LOG(WARNING) << "Couldn't parse ptxas version in output of " << ptxas_path
77                  << " --version:\n"
78                  << out;
79     return;
80   }
81 
82   // We need ptxas >= 9.0 as a hard requirement, because we compile targeting
83   // PTX 6.0.  An older ptxas will just fail to compile any of our code.
84   //
85   // ptxas 9.0 before 9.0.276 and ptxas 9.1 before 9.1.121 miscompile some
86   // address calculations with large offsets (e.g. "load ptr + large_constant"),
87   // b/70245379.
88   //
89   // ptxas 9.1.121 miscompiles some large multioutput fusions, again in a way
90   // that appears related to address calculations, b/111107644.  ptxas 9.2.88
91   // appears to work, as far as we can tell.
92   if (vmaj < 9) {
93     LOG(ERROR)
94         << "You are using ptxas 8.x, but TF requires ptxas 9.x (and strongly "
95            "prefers >= 9.2.88).  Compilation of XLA kernels below will likely "
96            "fail.\n\nYou do not need to update CUDA; cherry-picking the ptxas "
97            "binary is sufficient.";
98   } else if (std::make_tuple(vmaj, vmin, vdot) < std::make_tuple(9, 2, 88)) {
99     LOG(WARNING)
100         << "*** WARNING *** You are using ptxas " << vmaj << "." << vmin << "."
101         << vdot
102         << ", which is older than 9.2.88. ptxas 9.x before 9.2.88 is known to "
103            "miscompile XLA code, leading to incorrect results or "
104            "invalid-address errors.\n\nYou do not need to update to CUDA "
105            "9.2.88; cherry-picking the ptxas binary is sufficient.";
106   }
107 }
108 
CompileGpuAsmOrGetCached(int device_ordinal,const char * ptx,GpuAsmOpts compilation_options)109 port::StatusOr<absl::Span<const uint8>> CompileGpuAsmOrGetCached(
110     int device_ordinal, const char* ptx, GpuAsmOpts compilation_options) {
111   using PtxCacheKey = std::tuple<int, std::string, GpuAsmOpts::PtxOptionsTuple>;
112   using PtxCompilerResult = port::StatusOr<std::vector<uint8>>;
113   static tensorflow::mutex ptx_cache_mutex(tensorflow::LINKER_INITIALIZED);
114   static auto& ptx_cache TF_GUARDED_BY(ptx_cache_mutex) =
115       *new absl::flat_hash_map<PtxCacheKey, PtxCompilerResult>();
116 
117   tensorflow::mutex_lock lock(ptx_cache_mutex);
118   PtxCacheKey cache_key{device_ordinal, std::string(ptx),
119                         compilation_options.ToTuple()};
120   auto it = ptx_cache.find(cache_key);
121   if (it == ptx_cache.end()) {
122     PtxCompilerResult compiled =
123         CompileGpuAsm(device_ordinal, ptx, compilation_options);
124     it = ptx_cache.emplace(cache_key, std::move(compiled)).first;
125   }
126 
127   CHECK(it != ptx_cache.end());
128 
129   // Failed compilation attempts are cached.
130   // Use separate status check and ValueOrDie invocation on ptx_cache
131   // entry to avoid value moving introduced by TF_ASSIGN_OR_RETURN.
132 
133   if (TF_PREDICT_FALSE(!it->second.ok())) {
134     return it->second.status();
135   }
136 
137   const std::vector<uint8>& compiled = it->second.ValueOrDie();
138   return absl::MakeSpan(compiled);
139 }
140 
CompileGpuAsm(int device_ordinal,const char * ptx_contents,GpuAsmOpts options)141 port::StatusOr<std::vector<uint8>> CompileGpuAsm(int device_ordinal,
142                                                  const char* ptx_contents,
143                                                  GpuAsmOpts options) {
144   gpu::GpuDeviceHandle handle;
145   TF_RETURN_IF_ERROR(gpu::GpuDriver::GetDevice(device_ordinal, &handle));
146   int cc_major;
147   int cc_minor;
148   TF_RETURN_IF_ERROR(
149       gpu::GpuDriver::GetComputeCapability(&cc_major, &cc_minor, handle));
150   return CompileGpuAsm(cc_major, cc_minor, ptx_contents, options);
151 }
152 
findCudaExecutable(const std::string binary_name,const std::string preferred_cuda_dir)153 static std::string findCudaExecutable(const std::string binary_name,
154                                       const std::string preferred_cuda_dir) {
155 #if defined(PLATFORM_WINDOWS)
156   const std::string binary_filename = binary_name + ".exe";
157 #else
158   const std::string& binary_filename = binary_name;
159 #endif
160 
161   // Search in cuda root candidates.
162   auto env = tensorflow::Env::Default();
163   std::string binary_path;
164   for (const std::string& cuda_root :
165        tensorflow::CandidateCudaRoots(preferred_cuda_dir)) {
166     binary_path = tensorflow::io::JoinPath(cuda_root, "bin", binary_filename);
167     VLOG(2) << "Looking for " << binary_filename << " at " << binary_path;
168     if (env->FileExists(binary_path).ok()) {
169       break;
170     }
171   }
172   if (!env->FileExists(binary_path).ok()) {
173     // Rely on subprocess invocation to find the correct binary.
174     binary_path = binary_filename;
175   }
176   VLOG(2) << "Using " << binary_filename << " at " << binary_path;
177   return binary_path;
178 }
179 
LogPtxasTooOld(const std::string & ptxas_path,int cc_major,int cc_minor)180 static void LogPtxasTooOld(const std::string& ptxas_path, int cc_major,
181                            int cc_minor) {
182   using AlreadyLoggedSetTy =
183       absl::flat_hash_set<std::tuple<std::string, int, int>>;
184 
185   static absl::Mutex* mutex = new absl::Mutex;
186   static AlreadyLoggedSetTy* already_logged = new AlreadyLoggedSetTy;
187 
188   absl::MutexLock lock(mutex);
189 
190   if (already_logged->insert({ptxas_path, cc_major, cc_minor}).second) {
191     LOG(WARNING) << "Falling back to the CUDA driver for PTX compilation; "
192                     "ptxas does not support CC "
193                  << cc_major << "." << cc_minor;
194     LOG(WARNING) << "Used ptxas at " << ptxas_path;
195   }
196 }
197 
CompileGpuAsm(int cc_major,int cc_minor,const char * ptx_contents,GpuAsmOpts options)198 port::StatusOr<std::vector<uint8>> CompileGpuAsm(int cc_major, int cc_minor,
199                                                  const char* ptx_contents,
200                                                  GpuAsmOpts options) {
201   std::string ptxas_path =
202       findCudaExecutable("ptxas", options.preferred_cuda_dir);
203 
204   WarnIfBadPtxasVersion(ptxas_path);
205 
206   // Write ptx into a temporary file.
207   std::string ptx_path;
208   auto env = tensorflow::Env::Default();
209   if (!env->LocalTempFilename(&ptx_path)) {
210     return port::InternalError("couldn't get temp PTX file name");
211   }
212   TF_RETURN_IF_ERROR(
213       tensorflow::WriteStringToFile(env, ptx_path, ptx_contents));
214   VLOG(2) << "ptx written to: " << ptx_path;
215 
216   auto ptx_cleaner = tensorflow::gtl::MakeCleanup([&ptx_path] {
217     TF_CHECK_OK(tensorflow::Env::Default()->DeleteFile(ptx_path));
218   });
219 
220   // Invoke ptxas and collect its output.
221   std::string cubin_path;
222   if (!env->LocalTempFilename(&cubin_path)) {
223     return port::InternalError("couldn't get temp CUBIN file name");
224   }
225   auto cubin_cleaner = tensorflow::gtl::MakeCleanup([&cubin_path] {
226     // CUBIN file may never be created, so the failure to delete it should not
227     // produce TF error.
228     tensorflow::Env::Default()->DeleteFile(cubin_path).IgnoreError();
229   });
230   tensorflow::SubProcess ptxas_info_dumper;
231   std::vector<std::string> ptxas_args = {
232       ptxas_path, ptx_path, "-o", cubin_path,
233       absl::StrCat("-arch=sm_", cc_major, cc_minor)};
234   if (VLOG_IS_ON(2)) {
235     ptxas_args.push_back("-v");
236   }
237   if (options.disable_gpuasm_optimizations) {
238     ptxas_args.push_back("-O0");
239   }
240   ptxas_args.insert(ptxas_args.end(), options.extra_flags.begin(),
241                     options.extra_flags.end());
242   if (VLOG_IS_ON(3)) {
243     VLOG(3) << absl::StrJoin(ptxas_args, " ");
244   }
245 
246   ptxas_info_dumper.SetProgram(ptxas_path, ptxas_args);
247   ptxas_info_dumper.SetChannelAction(tensorflow::CHAN_STDERR,
248                                      tensorflow::ACTION_PIPE);
249   if (!ptxas_info_dumper.Start()) {
250     return port::InternalError("Failed to launch ptxas");
251   }
252   std::string stderr_output;
253   int exit_status = ptxas_info_dumper.Communicate(
254       /*stdin_input=*/nullptr, /*stdout_output=*/nullptr, &stderr_output);
255   if (exit_status != 0) {
256     //  It happens when the ptxas installed is too old for the current GPU.
257     //  Example error message associated with this error code:
258     //      ptxas fatal   : Value 'sm_80' is not defined for option 'gpu-name'
259     // In that case, fallback to the driver for compilation
260     if (absl::StartsWith(stderr_output, "ptxas fatal   : Value '") &&
261         absl::StrContains(stderr_output,
262                           "is not defined for option 'gpu-name'")) {
263       LogPtxasTooOld(ptxas_path, cc_major, cc_minor);
264       return tensorflow::errors::Unimplemented(
265           ptxas_path, " ptxas too old. Falling back to the driver to compile.");
266     }
267 
268     return port::InternalError(
269         absl::StrFormat("ptxas exited with non-zero error code %d, output: %s",
270                         exit_status, stderr_output));
271   }
272   // Print the verbose output of ptxas.
273   if (!stderr_output.empty()) {
274     VLOG(2) << stderr_output;
275   }
276 
277   // Read in the result of compilation and return it as a byte vector.
278   std::string cubin;
279   TF_RETURN_IF_ERROR(tensorflow::ReadFileToString(tensorflow::Env::Default(),
280                                                   cubin_path, &cubin));
281   std::vector<uint8> cubin_vector(cubin.begin(), cubin.end());
282   return cubin_vector;
283 }
284 
BundleGpuAsm(std::vector<CubinOrPTXImage> images,const std::string preferred_cuda_dir)285 port::StatusOr<std::vector<uint8>> BundleGpuAsm(
286     std::vector<CubinOrPTXImage> images, const std::string preferred_cuda_dir) {
287   std::string fatbinary_path =
288       findCudaExecutable("fatbinary", preferred_cuda_dir);
289 
290   // Write images to temporary files.
291   std::vector<std::string> image_paths;
292   auto env = tensorflow::Env::Default();
293   for (const CubinOrPTXImage& img : images) {
294     std::string img_path;
295     if (!env->LocalTempFilename(&img_path)) {
296       return port::InternalError(
297           "Could not get temporary filenames for images.");
298     }
299     TF_RETURN_IF_ERROR(tensorflow::WriteStringToFile(
300         env, img_path, std::string(img.bytes.begin(), img.bytes.end())));
301     VLOG(2) << "image written to " << img_path;
302     image_paths.push_back(std::move(img_path));
303   }
304   auto image_files_cleaner = tensorflow::gtl::MakeCleanup([&image_paths] {
305     for (const auto& path : image_paths) {
306       TF_CHECK_OK(tensorflow::Env::Default()->DeleteFile(path));
307     }
308   });
309 
310   // Prepare temorary result file.
311   std::string result_path;
312   if (!env->LocalTempFilename(&result_path)) {
313     return port::InternalError(
314         "Could not get temporary filename for fatbin result.");
315   }
316   auto result_file_cleaner = tensorflow::gtl::MakeCleanup([&result_path] {
317     // This file may never be created, so the failure to delete it should not
318     // propagate to TF.
319     tensorflow::Env::Default()->DeleteFile(result_path).IgnoreError();
320   });
321 
322   // Invoke fatbinary and collect its output.
323   tensorflow::SubProcess fatbinary;
324   std::vector<std::string> fatbinary_args = {
325       fatbinary_path, "--64",           "--cmdline=--compile-only",
326       "--link",       "--compress-all", absl::StrCat("--create=", result_path)};
327   assert(images.size() == image_paths.size());
328   for (int i = 0; i < images.size(); i++) {
329     fatbinary_args.push_back(absl::StrFormat(
330         "--image=profile=%s,file=%s", images[i].profile, image_paths[i]));
331   }
332   if (VLOG_IS_ON(3)) {
333     VLOG(3) << absl::StrJoin(fatbinary_args, " ");
334   }
335   fatbinary.SetProgram(fatbinary_path, fatbinary_args);
336   fatbinary.SetChannelAction(tensorflow::CHAN_STDERR, tensorflow::ACTION_PIPE);
337   if (!fatbinary.Start()) {
338     return port::InternalError("Failed to launch fatbinary.");
339   }
340   std::string stderr_output;
341   int exit_status = fatbinary.Communicate(
342       /*stdin_input=*/nullptr, /*stdout_output=*/nullptr, &stderr_output);
343   if (exit_status != 0) {
344     return port::InternalError(absl::StrFormat(
345         "fatbinary exited with non-zero error code %d, output: %s", exit_status,
346         stderr_output));
347   }
348   if (!stderr_output.empty()) {
349     VLOG(2) << stderr_output;
350   }
351 
352   // Read in the result and return it as a byte vector.
353   std::string result_blob;
354   TF_RETURN_IF_ERROR(tensorflow::ReadFileToString(tensorflow::Env::Default(),
355                                                   result_path, &result_blob));
356   return std::vector<uint8>(result_blob.begin(), result_blob.end());
357 }
358 
findRocmExecutable(const std::string & binary_relative_path,const std::string & rocm_root_dir)359 static std::string findRocmExecutable(const std::string& binary_relative_path,
360                                       const std::string& rocm_root_dir) {
361   auto env = tensorflow::Env::Default();
362   std::string binary_path =
363       tensorflow::io::JoinPath(rocm_root_dir, binary_relative_path);
364   VLOG(2) << "Looking for " << binary_relative_path << " at " << rocm_root_dir;
365   if (!env->FileExists(binary_path).ok()) {
366     binary_path = absl::StrCat("<", binary_path, " - NOT FOUND>");
367   }
368   return binary_path;
369 }
370 
BundleGpuAsm(std::vector<HsacoImage> images,const std::string rocm_root_dir)371 port::StatusOr<std::vector<uint8>> BundleGpuAsm(
372     std::vector<HsacoImage> images, const std::string rocm_root_dir) {
373   std::string clang_offload_bundler_path =
374       findRocmExecutable("llvm/bin/clang-offload-bundler", rocm_root_dir);
375 
376   // Initialise the "--inputs" / "--targets" arguments for the
377   // clang-offload-bundler with a dummy file / host target triple...
378   // clang-offload-bundler requires 1 and only 1 host target triple
379   std::ostringstream inputs_list;
380   std::ostringstream targets_list;
381 
382   inputs_list << "/dev/null";
383   targets_list << "host-x86_64-unknown-linux";
384 
385   // Write images to temporary files.
386   std::vector<std::string> image_paths;
387   auto env = tensorflow::Env::Default();
388   for (const HsacoImage& img : images) {
389     std::string img_path;
390     if (!env->LocalTempFilename(&img_path)) {
391       return port::InternalError(
392           "Could not get temporary filenames for images.");
393     }
394     TF_RETURN_IF_ERROR(tensorflow::WriteStringToFile(
395         env, img_path, std::string(img.bytes.begin(), img.bytes.end())));
396     VLOG(2) << "image written to " << img_path;
397     inputs_list << "," << img_path;
398     targets_list << ",hip-amdgcn-amd-amdhsa-" << img.gfx_arch;
399     image_paths.push_back(std::move(img_path));
400   }
401   auto image_files_cleaner = tensorflow::gtl::MakeCleanup([&image_paths] {
402     for (const auto& path : image_paths) {
403       TF_CHECK_OK(tensorflow::Env::Default()->DeleteFile(path));
404     }
405   });
406 
407   // Prepare temorary result file.
408   std::string result_path;
409   if (!env->LocalTempFilename(&result_path)) {
410     return port::InternalError(
411         "Could not get temporary filename for fatbin result.");
412   }
413   auto result_file_cleaner = tensorflow::gtl::MakeCleanup([&result_path] {
414     // This file may never be created, so the failure to delete it should not
415     // propagate to TF.
416     tensorflow::Env::Default()->DeleteFile(result_path).IgnoreError();
417   });
418 
419   // Invoke clang_offload_bundler and collect its output.
420   tensorflow::SubProcess clang_offload_bundler;
421   std::vector<std::string> clang_offload_bundler_args = {
422       clang_offload_bundler_path, absl::StrCat("--inputs=", inputs_list.str()),
423       absl::StrCat("--targets=", targets_list.str()), "--type=o",
424       absl::StrCat("--outputs=", result_path)};
425   if (VLOG_IS_ON(3)) {
426     VLOG(3) << absl::StrJoin(clang_offload_bundler_args, " ");
427   }
428   clang_offload_bundler.SetProgram(clang_offload_bundler_path,
429                                    clang_offload_bundler_args);
430   clang_offload_bundler.SetChannelAction(tensorflow::CHAN_STDERR,
431                                          tensorflow::ACTION_PIPE);
432   if (!clang_offload_bundler.Start()) {
433     return port::InternalError("Failed to launch clang_offload_bundler.");
434   }
435   std::string stderr_output;
436   int exit_status = clang_offload_bundler.Communicate(
437       /*stdin_input=*/nullptr, /*stdout_output=*/nullptr, &stderr_output);
438   if (exit_status != 0) {
439     return port::InternalError(absl::StrFormat(
440         "clang_offload_bundler exited with non-zero error code %d, output: %s",
441         exit_status, stderr_output));
442   }
443   if (!stderr_output.empty()) {
444     VLOG(2) << stderr_output;
445   }
446 
447   // Read in the result and return it as a byte vector.
448   std::string result_blob;
449   TF_RETURN_IF_ERROR(tensorflow::ReadFileToString(tensorflow::Env::Default(),
450                                                   result_path, &result_blob));
451   return std::vector<uint8>(result_blob.begin(), result_blob.end());
452 }
453 
454 }  // namespace stream_executor
455